From fe5720e6846d5ee3b5e6ef61bca18f49f04c5735 Mon Sep 17 00:00:00 2001 From: Yanglei Zou Date: Tue, 29 Oct 2024 14:20:49 +0800 Subject: [PATCH 001/254] Add ggml-openvino base files --- ggml/include/ggml-openvino.h | 45 ++++++++++++++++++++++++++++++++++++ ggml/src/ggml-openvino.cpp | 23 ++++++++++++++++++ 2 files changed, 68 insertions(+) create mode 100644 ggml/include/ggml-openvino.h create mode 100644 ggml/src/ggml-openvino.cpp diff --git a/ggml/include/ggml-openvino.h b/ggml/include/ggml-openvino.h new file mode 100644 index 0000000000..e0229cf18c --- /dev/null +++ b/ggml/include/ggml-openvino.h @@ -0,0 +1,45 @@ +#pragma once + +#include "ggml-backend.h" +#include "ggml.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// backend API +GGML_API ggml_backend_t ggml_backend_openvino_init(int device); + +GGML_API bool ggml_backend_is_openvino(ggml_backend_t backend); + +// device buffer +GGML_API ggml_backend_buffer_type_t +ggml_backend_openvino_buffer_type(int device); + +// split tensor buffer that splits matrices by rows across multiple devices +GGML_API ggml_backend_buffer_type_t +ggml_backend_openvino_split_buffer_type(const float *tensor_split); + +// pinned host buffer for use with the CPU backend for faster copies between CPU +// and GPU +GGML_API ggml_backend_buffer_type_t +ggml_backend_openvino_host_buffer_type(void); + +// GGML_API int ggml_backend_openvino_get_device_count(void); +// GGML_API void ggml_backend_openvino_get_device_description(int device, +// char *description, +// size_t +// description_size); +// GGML_API void ggml_backend_openvino_get_device_memory(int device, size_t +// *free, +// size_t *total); + +// GGML_API bool ggml_backend_openvino_register_host_buffer(void *buffer, size_t +// size); GGML_API void ggml_backend_openvino_unregister_host_buffer(void +// *buffer); + +GGML_API ggml_backend_reg_t ggml_backend_openvino_reg(void); + +#ifdef __cplusplus +} +#endif diff --git a/ggml/src/ggml-openvino.cpp b/ggml/src/ggml-openvino.cpp new file mode 100644 index 0000000000..177e51458d --- /dev/null +++ b/ggml/src/ggml-openvino.cpp @@ -0,0 +1,23 @@ +#include "ggml-openvino.h" +#include "ggml-backend-impl.h" +#include "ggml-impl.h" + +// backend API +GGML_API ggml_backend_t ggml_backend_openvino_init(int device) {} + +GGML_API bool ggml_backend_is_openvino(ggml_backend_t backend) {} + +// device buffer +GGML_API ggml_backend_buffer_type_t +ggml_backend_openvino_buffer_type(int device) {} + +// split tensor buffer that splits matrices by rows across multiple devices +GGML_API ggml_backend_buffer_type_t +ggml_backend_openvino_split_buffer_type(const float *tensor_split) {} + +// pinned host buffer for use with the CPU backend for faster copies between CPU +// and GPU +GGML_API ggml_backend_buffer_type_t +ggml_backend_openvino_host_buffer_type(void) {} + +GGML_API ggml_backend_reg_t ggml_backend_openvino_reg(void) {} From 5294402b50cb5d8d30639acf0c8a6ad72bfcaf1d Mon Sep 17 00:00:00 2001 From: zhanmyz Date: Wed, 13 Nov 2024 13:32:44 +0800 Subject: [PATCH 002/254] add openvino as optional backend for Llama.cpp ggml --- ggml/include/ggml-openvino.h | 30 ++- ggml/src/ggml-openvino.cpp | 450 ++++++++++++++++++++++++++++++++++- 2 files changed, 471 insertions(+), 9 deletions(-) diff --git a/ggml/include/ggml-openvino.h b/ggml/include/ggml-openvino.h index e0229cf18c..9172414c29 100644 --- a/ggml/include/ggml-openvino.h +++ b/ggml/include/ggml-openvino.h @@ -1,12 +1,18 @@ #pragma once -#include "ggml-backend.h" #include "ggml.h" +#include "ggml-backend.h" + +#include +#include 
#ifdef __cplusplus extern "C" { #endif +#define GGML_OPENVINO_NAME "OPENVINO" +#define GGML_OPENVINO_MAX_DEVICES 16 + // backend API GGML_API ggml_backend_t ggml_backend_openvino_init(int device); @@ -25,7 +31,7 @@ ggml_backend_openvino_split_buffer_type(const float *tensor_split); GGML_API ggml_backend_buffer_type_t ggml_backend_openvino_host_buffer_type(void); -// GGML_API int ggml_backend_openvino_get_device_count(void); +GGML_API int ggml_backend_openvino_get_device_count(void); // GGML_API void ggml_backend_openvino_get_device_description(int device, // char *description, // size_t @@ -40,6 +46,26 @@ ggml_backend_openvino_host_buffer_type(void); GGML_API ggml_backend_reg_t ggml_backend_openvino_reg(void); +struct ggml_openvino_device_info { + int device_count; + + struct openvino_device_info { + int cc; // compute capability + int nsm; // number of streaming multiprocessors + size_t smpb; // max. shared memory per block + size_t smpbo; // max. shared memory per block (with opt-in) + bool vmm; // virtual memory support + size_t vmm_granularity; // granularity of virtual memory + size_t total_vram; + }; + + openvino_device_info devices[GGML_OPENVINO_MAX_DEVICES] = {}; + + std::array default_tensor_split = {}; +}; + +const ggml_openvino_device_info & ggml_openvino_info(); + #ifdef __cplusplus } #endif diff --git a/ggml/src/ggml-openvino.cpp b/ggml/src/ggml-openvino.cpp index 177e51458d..87047a2f30 100644 --- a/ggml/src/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino.cpp @@ -2,22 +2,458 @@ #include "ggml-backend-impl.h" #include "ggml-impl.h" -// backend API -GGML_API ggml_backend_t ggml_backend_openvino_init(int device) {} +#include +#include +#include +#include +#include +#include +#include -GGML_API bool ggml_backend_is_openvino(ggml_backend_t backend) {} +struct ggml_backend_openvino_context { + int device; + std::string name; + std::string description; +}; + +static void ggml_backend_openvino_free(ggml_backend_t backend) { + ggml_backend_openvino_context * ctx = (ggml_backend_openvino_context *)backend->context; + delete ctx; + delete backend; +} + +static const char * ggml_backend_openvino_get_name(ggml_backend_t backend) { + return GGML_OPENVINO_NAME; + GGML_UNUSED(backend); +} + +static ggml_backend_buffer_type_t ggml_backend_openvino_get_default_buffer_type(ggml_backend_t backend) { + return ggml_backend_cpu_buffer_type(); + GGML_UNUSED(backend); +} + +static void ggml_backend_openvino_add(ggml_backend_openvino_context & ctx, ggml_tensor * dst) { + // Placeholder for OpenVINO add operation + GGML_ASSERT(ctx.device != 0); + GGML_ASSERT(dst->data != nullptr); +} + +static void test_op_for_NONE() { + GGML_LOG_DEBUG("...test_op_for_NONE... 
\n"); +} + +static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { + // TODO + ggml_backend_openvino_context * ctx = (ggml_backend_openvino_context *)backend->context; + + for (int i = 0; i < cgraph->n_nodes; i++) { + struct ggml_tensor * node = cgraph->nodes[i]; + + switch (node->op) { + case GGML_OP_ADD: + // TODO + ggml_backend_openvino_add(*ctx, node); + break; + case GGML_OP_MUL_MAT: + case GGML_OP_OUT_PROD: + break; + case GGML_OP_NONE: + test_op_for_NONE(); + case GGML_OP_RESHAPE: + case GGML_OP_VIEW: + case GGML_OP_PERMUTE: + case GGML_OP_TRANSPOSE: + break; + default: + GGML_ABORT("%s: unsupported op %s\n", __func__, ggml_op_desc(node)); + } + } + + return GGML_STATUS_SUCCESS; + + GGML_UNUSED(backend); +} + +static const ggml_backend_i ggml_backend_openvino_interface = { + /* .get_name = */ ggml_backend_openvino_get_name, + /* .free = */ ggml_backend_openvino_free, + /* .get_default_buffer_type = */ ggml_backend_openvino_get_default_buffer_type, + /* .set_tensor_async = */ NULL, + /* .get_tensor_async = */ NULL, + /* .cpy_tensor_async = */ NULL, + /* .synchronize = */ NULL, + /* .graph_plan_create = */ NULL, + /* .graph_plan_free = */ NULL, + /* .graph_plan_update = */ NULL, + /* .graph_plan_compute = */ NULL, + /* .graph_compute = */ ggml_backend_openvino_graph_compute, + /* .supports_op = */ NULL, + /* .supports_buft = */ NULL, + /* .offload_op = */ NULL, + /* .event_record = */ NULL, + /* .event_wait = */ NULL, +}; + +int ggml_backend_openvino_get_device_count() { + return ggml_openvino_info().device_count; +} + +static ggml_guid_t ggml_backend_openvino_guid(void) { + static ggml_guid guid = { 0x12, 0xa8, 0xae, 0xf4, 0xc0, 0x1e, 0x61, 0x97, 0x8f, 0xeb, 0x33, 0x04, 0xa1, 0x33, 0x51, 0x2d }; + return &guid; +} + +// backend API +GGML_API ggml_backend_t ggml_backend_openvino_init(int device) { + if (device < 0 || device >= ggml_backend_openvino_get_device_count()) { + GGML_LOG_ERROR("%s: invalid device %d\n", __func__, device); + return nullptr; + } + + ggml_backend_openvino_context * ctx = new ggml_backend_openvino_context; + if (ctx == nullptr) { + GGML_LOG_ERROR("%s: failed to allocate context\n", __func__); + return nullptr; + } + + ggml_backend_t openvino_backend = new ggml_backend { + /* .guid = */ ggml_backend_openvino_guid(), + /* .interface = */ ggml_backend_openvino_interface, + /* .device = */ ggml_backend_reg_dev_get(ggml_backend_openvino_reg(), device), + /* .context = */ ctx, + }; + + return openvino_backend; +} + +GGML_API bool ggml_backend_is_openvino(ggml_backend_t backend) { + GGML_ASSERT(backend->context != nullptr); + return true; +} // device buffer GGML_API ggml_backend_buffer_type_t -ggml_backend_openvino_buffer_type(int device) {} +ggml_backend_openvino_buffer_type(int device) { + GGML_ASSERT(device >= 0); + return nullptr; +} // split tensor buffer that splits matrices by rows across multiple devices GGML_API ggml_backend_buffer_type_t -ggml_backend_openvino_split_buffer_type(const float *tensor_split) {} +ggml_backend_openvino_split_buffer_type(const float *tensor_split) { + GGML_ASSERT(tensor_split != nullptr); + return nullptr; +} // pinned host buffer for use with the CPU backend for faster copies between CPU // and GPU GGML_API ggml_backend_buffer_type_t -ggml_backend_openvino_host_buffer_type(void) {} +ggml_backend_openvino_host_buffer_type(void) { return nullptr;} -GGML_API ggml_backend_reg_t ggml_backend_openvino_reg(void) {} + +struct ggml_backend_openvino_buffer_type_context { + int 
device; + std::string name; +}; + +static const char * ggml_backend_openvino_buffer_type_get_name(ggml_backend_buffer_type_t buft) { + ggml_backend_openvino_buffer_type_context * ctx = (ggml_backend_openvino_buffer_type_context *)buft->context; + + return ctx->name.c_str(); +} +static bool ggml_backend_buft_is_openvino(ggml_backend_buffer_type_t buft) { + return buft->iface.get_name == ggml_backend_openvino_buffer_type_get_name; +} + + +static const char * ggml_backend_openvino_split_buffer_type_get_name(ggml_backend_buffer_type_t buft) { + return GGML_OPENVINO_NAME "_Split"; + + GGML_UNUSED(buft); +} + +static bool ggml_backend_buft_is_openvino_split(ggml_backend_buffer_type_t buft) { + return buft->iface.get_name == ggml_backend_openvino_split_buffer_type_get_name; +} + +struct ggml_backend_openvino_device_context { + int device; + std::string name; + std::string description; +}; + +static const char * ggml_backend_openvino_device_get_name(ggml_backend_dev_t dev) { + ggml_backend_openvino_device_context * ctx = (ggml_backend_openvino_device_context *)dev->context; + return ctx->name.c_str(); +} + +static const char * ggml_backend_openvino_device_get_description(ggml_backend_dev_t dev) { + ggml_backend_openvino_device_context * ctx = (ggml_backend_openvino_device_context *)dev->context; + return ctx->description.c_str(); +} + +// TODO +static void ggml_backend_openvino_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) { + GGML_ASSERT(dev->context != nullptr); + GGML_ASSERT(free != nullptr); + GGML_ASSERT(total != nullptr); + ggml_backend_openvino_device_context * ctx = (ggml_backend_openvino_device_context *)dev->context; + // Placeholder + GGML_ASSERT(ctx->device >= 0); + // ggml_openvino_set_device(ctx->device); +} + +static enum ggml_backend_dev_type ggml_backend_openvino_device_get_type(ggml_backend_dev_t dev) { + GGML_UNUSED(dev); + return GGML_BACKEND_DEVICE_TYPE_CPU; + // return GGML_BACKEND_DEVICE_TYPE_GPU_FULL; +} + +static void ggml_backend_openvino_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) { + props->name = ggml_backend_openvino_device_get_name(dev); + props->description = ggml_backend_openvino_device_get_description(dev); + props->type = ggml_backend_openvino_device_get_type(dev); + ggml_backend_openvino_device_get_memory(dev, &props->memory_free, &props->memory_total); + + bool host_buffer = getenv("GGML_OPENVINO_NO_PINNED") == nullptr; +#ifdef GGML_OPENVINO_NO_PEER_COPY + bool events = false; +#else + bool events = true; +#endif + + props->caps = { + /* .async = */ true, + /* .host_buffer = */ host_buffer, + /* .buffer_from_host_ptr = */ false, + /* .events = */ events, + }; +} + +static ggml_backend_t ggml_backend_openvino_device_init(ggml_backend_dev_t dev, const char * params) { + GGML_UNUSED(params); + ggml_backend_openvino_device_context * ctx = (ggml_backend_openvino_device_context *)dev->context; + return ggml_backend_openvino_init(ctx->device); +} + +static ggml_backend_buffer_type_t ggml_backend_openvino_device_get_buffer_type(ggml_backend_dev_t dev) { + ggml_backend_openvino_device_context * ctx = (ggml_backend_openvino_device_context *)dev->context; + return ggml_backend_openvino_buffer_type(ctx->device); +} + +static ggml_backend_buffer_type_t ggml_backend_openvino_device_get_host_buffer_type(ggml_backend_dev_t dev) { + GGML_UNUSED(dev); + return ggml_backend_openvino_host_buffer_type(); +} + +static ggml_backend_buffer_t ggml_backend_openvino_device_buffer_from_ptr(ggml_backend_dev_t dev, void * ptr, size_t 
size, size_t max_tensor_size) { + GGML_UNUSED(dev); + GGML_UNUSED(ptr); + GGML_UNUSED(size); + GGML_UNUSED(max_tensor_size); + return nullptr; +} + +static ggml_backend_buffer_t ggml_backend_openvino_device_buffer_from_host_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) { + GGML_UNUSED(dev); + GGML_UNUSED(ptr); + GGML_UNUSED(size); + GGML_UNUSED(max_tensor_size); + return nullptr; +} + +static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) { + GGML_ASSERT(dev->reg != nullptr); + // ggml_backend_openvino_device_context * dev_ctx = (ggml_backend_openvino_device_context *) dev->context; + + switch (op->op) { + case GGML_OP_UNARY: + return false; + case GGML_OP_NONE: + return true; + case GGML_OP_RESHAPE: + case GGML_OP_VIEW: + case GGML_OP_PERMUTE: + case GGML_OP_TRANSPOSE: + case GGML_OP_NORM: + return false; + case GGML_OP_ADD: + { + ov::op::v1::Add add; + //add.evaluate(op->outputs[0], op->inputs[1]); + return false; + } + case GGML_OP_ADD1: + case GGML_OP_SUB: + { + ov::op::v1::Subtract sub; + //sub.evaluate(TensorVector& outputs, const TensorVector& inputs); + return false; + } + case GGML_OP_MUL: + case GGML_OP_DIV: + case GGML_OP_RMS_NORM: + case GGML_OP_SCALE: + case GGML_OP_SQR: + case GGML_OP_SQRT: + case GGML_OP_SIN: + case GGML_OP_COS: + case GGML_OP_IM2COL: + case GGML_OP_POOL_2D: + case GGML_OP_SUM: + case GGML_OP_SUM_ROWS: + case GGML_OP_ARGSORT: + case GGML_OP_ACC: + case GGML_OP_GROUP_NORM: + case GGML_OP_UPSCALE: + case GGML_OP_PAD: + case GGML_OP_ARANGE: + case GGML_OP_TIMESTEP_EMBEDDING: + case GGML_OP_LEAKY_RELU: + case GGML_OP_CROSS_ENTROPY_LOSS: + case GGML_OP_CROSS_ENTROPY_LOSS_BACK: + case GGML_OP_OPT_STEP_ADAMW: + return false; + default: + return false; + } +} + +static bool ggml_backend_openvino_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { + return ggml_backend_buft_is_host(buft); + GGML_UNUSED(dev); +} + +static const struct ggml_backend_device_i ggml_backend_openvino_device_interface = { + /* .get_name = */ ggml_backend_openvino_device_get_name, + /* .get_description = */ ggml_backend_openvino_device_get_description, + /* .get_memory = */ ggml_backend_openvino_device_get_memory, + /* .get_type = */ ggml_backend_openvino_device_get_type, + /* .get_props = */ ggml_backend_openvino_device_get_props, + /* .init_backend = */ ggml_backend_openvino_device_init, + /* .get_buffer_type = */ ggml_backend_openvino_device_get_buffer_type, + /* .get_host_buffer_type = */ NULL, + /* .buffer_from_host_ptr = */ ggml_backend_openvino_device_buffer_from_ptr, + /* .supports_op = */ ggml_backend_openvino_device_supports_op, + /* .supports_buft = */ ggml_backend_openvino_device_supports_buft, + /* .offload_op = */ NULL, + /* .event_new = */ NULL, + /* .event_free = */ NULL, + /* .event_synchronize = */ NULL, +}; + +struct ggml_backend_openvino_reg_context { + std::vector devices; +}; + +static const char * ggml_backend_openvino_reg_get_name(ggml_backend_reg_t reg) { + return GGML_OPENVINO_NAME; + GGML_UNUSED(reg); +} + +static size_t ggml_backend_openvino_reg_get_device_count(ggml_backend_reg_t reg) { + return ggml_openvino_info().device_count; + GGML_UNUSED(reg); + + // TODO + ggml_backend_openvino_reg_context * ctx = (ggml_backend_openvino_reg_context *)reg->context; + + return ctx->devices.size(); +} + +static ggml_backend_dev_t ggml_backend_openvino_reg_get_device(ggml_backend_reg_t reg, size_t index) { + ggml_backend_openvino_reg_context * ctx = 
(ggml_backend_openvino_reg_context *)reg->context;
+    GGML_ASSERT(index < ctx->devices.size());
+    return ctx->devices[index];
+    // GGML_ASSERT(index == 0);
+
+    // static ggml_backend_device ggml_backend_openvino_device = {
+    //     /* .iface   = */ ggml_backend_openvino_device_interface,
+    //     /* .reg     = */ reg,
+    //     /* .context = */ nullptr,
+    // };
+
+    // return &ggml_backend_openvino_device;
+
+    // GGML_UNUSED(reg);
+    // GGML_UNUSED(index);
+}
+
+static void * ggml_backend_openvino_get_proc_address(ggml_backend_reg_t reg, const char * name) {
+    GGML_UNUSED(reg);
+    if (strcmp(name, "ggml_backend_split_buffer_type") == 0) {
+        return (void *)ggml_backend_openvino_split_buffer_type;
+    }
+    // if (strcmp(name, "ggml_backend_register_host_buffer") == 0) {
+    //     return (void *)ggml_backend_openvino_register_host_buffer;
+    // }
+    // if (strcmp(name, "ggml_backend_unregister_host_buffer") == 0) {
+    //     return (void *)ggml_backend_openvino_unregister_host_buffer;
+    // }
+    return nullptr;
+}
+
+static const struct ggml_backend_reg_i ggml_backend_openvino_reg_interface = {
+    /* .get_name         = */ ggml_backend_openvino_reg_get_name,
+    /* .get_device_count = */ ggml_backend_openvino_reg_get_device_count,
+    /* .get_device       = */ ggml_backend_openvino_reg_get_device,
+    /* .get_proc_address = */ ggml_backend_openvino_get_proc_address,
+};
+
+static int get_openvino_device_count() {
+    ov::Core core;
+    auto devices = core.get_available_devices();
+    // return devices.size();
+    return 1;
+}
+
+static ggml_openvino_device_info ggml_openvino_init() {
+    ggml_openvino_device_info info = {};
+    // TODO
+    info.device_count = get_openvino_device_count();
+    return info;
+}
+
+const ggml_openvino_device_info & ggml_openvino_info() {
+    static ggml_openvino_device_info info = ggml_openvino_init();
+    return info;
+}
+
+GGML_API ggml_backend_reg_t ggml_backend_openvino_reg(void) {
+    static ggml_backend_reg reg;
+
+    static bool initialized = false;
+    {
+        static std::mutex mutex;
+        std::lock_guard<std::mutex> lock(mutex);
+        if (!initialized) {
+            ggml_backend_openvino_reg_context * ctx = new ggml_backend_openvino_reg_context;
+
+            // GGML_LOG_DEBUG("ggml_openvino_info().device_count = %d \n", ggml_openvino_info().device_count);
+            for (int i = 0; i < ggml_openvino_info().device_count; i++) {
+                ggml_backend_openvino_device_context * dev_ctx = new ggml_backend_openvino_device_context;
+                dev_ctx->device = i;
+                dev_ctx->name = GGML_OPENVINO_NAME + std::to_string(i);
+
+                // ggml_openvino_set_device(i);
+                dev_ctx->description = ov::get_openvino_version().description;
+
+                ggml_backend_dev_t dev = new ggml_backend_device {
+                    /* .interface = */ ggml_backend_openvino_device_interface,
+                    /* .reg       = */ &reg,
+                    /* .context   = */ dev_ctx
+                };
+                ctx->devices.push_back(dev);
+            }
+
+            reg = ggml_backend_reg {
+                /* .interface = */ ggml_backend_openvino_reg_interface,
+                /* .context   = */ ctx
+            };
+        }
+
+        initialized = true;
+    }
+
+    return &reg;
+}

From 9b9d51dddf8df8d62210b7ddfa4179a8b7eb8607 Mon Sep 17 00:00:00 2001
From: zhanmyz
Date: Tue, 19 Nov 2024 15:53:54 +0800
Subject: [PATCH 003/254] * Configure the device (default: CPU) that OpenVINO
 uses to compile the model * Add the OpenVINO ADD operator to Llama.cpp; the
 output is somewhat abnormal and needs further debugging.
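For context, the pattern this patch builds on is OpenVINO's standard build-compile-infer flow: construct an `ov::Model` out of `Parameter` and op nodes, compile it for a target device string ("CPU", "GPU", or "NPU"), then push tensors through an `ov::InferRequest`. A minimal standalone sketch of that flow, independent of ggml (the shapes and values here are illustrative, not taken from the patch):

```cpp
#include <openvino/openvino.hpp>
#include <iostream>

int main() {
    ov::Core core;

    // Build a tiny graph: out = a + b, both 2x2 f32 inputs.
    auto a   = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::Shape{2, 2});
    auto b   = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::Shape{2, 2});
    auto add = std::make_shared<ov::op::v1::Add>(a, b);
    auto model = std::make_shared<ov::Model>(add, ov::ParameterVector{a, b});

    // The device is chosen at compile time; this is the knob the patch's
    // GGML_OPENVINO_GPU / GGML_OPENVINO_NPU #ifdefs switch on.
    ov::CompiledModel compiled = core.compile_model(model, "CPU");
    ov::InferRequest req = compiled.create_infer_request();

    // Wrap host buffers without copying, run, and read back the result.
    float x[4] = {1, 2, 3, 4}, y[4] = {10, 20, 30, 40};
    req.set_input_tensor(0, ov::Tensor(ov::element::f32, {2, 2}, x));
    req.set_input_tensor(1, ov::Tensor(ov::element::f32, {2, 2}, y));
    req.infer();

    ov::Tensor out = req.get_output_tensor();
    const float * p = out.data<float>();
    for (int i = 0; i < 4; ++i) std::cout << p[i] << " ";
    std::cout << "\n";  // expected: 11 22 33 44
}
```

Note that wrapping `src->data` in an `ov::Tensor` (as the patch below does) avoids one copy on the input side, at the cost of requiring the ggml buffers to be contiguous.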
---
 ggml/src/ggml-openvino.cpp | 150 +++++++++++++++++++++++++++++++++++--
 1 file changed, 144 insertions(+), 6 deletions(-)

diff --git a/ggml/src/ggml-openvino.cpp b/ggml/src/ggml-openvino.cpp
index 87047a2f30..4b864a0b6d 100644
--- a/ggml/src/ggml-openvino.cpp
+++ b/ggml/src/ggml-openvino.cpp
@@ -10,10 +10,29 @@
 #include
 #include
 
+#define GGML_OPENVINO_MAX_STREAMS 8
+
 struct ggml_backend_openvino_context {
-    int device;
-    std::string name;
-    std::string description;
+    int device;                             // the device ID currently in use
+    std::string name;                       // context name
+    std::string description;                // context description
+
+    // OpenVINO core components
+    ov::Core core;                          // OpenVINO core interface
+    std::shared_ptr<ov::CompiledModel> model;  // compiled model
+    ov::InferRequest infer_request;         // inference request
+
+    // OpenVINO multi-stream support
+    static const int MAX_STREAMS = 8;       // the maximum number of streams
+    std::vector<ov::InferRequest> streams;  // used to support multi-stream inference
+    int current_stream;                     // the currently active stream index
+
+    // state management
+    bool is_initialized;                    // whether the context has been initialized
+
+    ggml_backend_openvino_context()
+        : device(0), name("OpenVINO"), description("OpenVINO Backend Context"),
+          current_stream(0), is_initialized(false) {}
 };
 
 static void ggml_backend_openvino_free(ggml_backend_t backend) {
@@ -32,10 +51,129 @@ static ggml_backend_buffer_type_t ggml_backend_openvino_get_default_buffer_type(
     GGML_UNUSED(backend);
 }
 
+static void ggml_backend_openvino_add_forward(ggml_backend_openvino_context & ctx, ggml_tensor * dst) {
+    // Step 1: get the input tensors src0 and src1
+    const ggml_tensor *src0 = dst->src[0];
+    const ggml_tensor *src1 = dst->src[1];
+
+    if (src0 == nullptr || src1 == nullptr) {
+        std::cerr << "Error: src0 or src1 is null." << std::endl;
+        return;
+    }
+
+    // Step 2: check that the input tensor types and shapes match
+    if (src0->type != GGML_TYPE_F32 || src1->type != GGML_TYPE_F32) {
+        std::cerr << "Error: Unsupported tensor type. Only GGML_TYPE_F32 is supported for OpenVINO." << std::endl;
+        return;
+    }
+    if (src0->ne[0] != src1->ne[0] || src0->ne[1] != src1->ne[1]) {
+        std::cerr << "Error: src0 and src1 shapes do not match." << std::endl;
+        return;
+    }
+
+    // Step 3: initialize the OpenVINO model and streams (only done on the first call)
+    if (!ctx.is_initialized) {
+        try {
+            // define the input tensor shape
+            ov::Shape input_shape = {static_cast<size_t>(src0->ne[0]), static_cast<size_t>(src0->ne[1])};
+
+            // create the OpenVINO input nodes
+            auto input0 = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, input_shape);
+            auto input1 = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, input_shape);
+
+            // define the add operation
+            auto add_node = std::make_shared<ov::op::v1::Add>(input0, input1);
+
+            // create the model
+            auto model = std::make_shared<ov::Model>(add_node, ov::ParameterVector{input0, input1});
+
+            // compile the model and store it in the context
+#ifdef GGML_OPENVINO_GPU
+            ctx.model = std::make_shared<ov::CompiledModel>(ctx.core.compile_model(model, "GPU"));
+#elif GGML_OPENVINO_NPU
+            ctx.model = std::make_shared<ov::CompiledModel>(ctx.core.compile_model(model, "NPU"));
+#else
+            ctx.model = std::make_shared<ov::CompiledModel>(ctx.core.compile_model(model, "CPU"));
+#endif
+            // initialize the infer request
+            ctx.infer_request = ctx.model->create_infer_request();
+            ctx.is_initialized = true;
+
+            // std::cout << "OpenVINO add model initialized successfully."
<< std::endl; + } catch (const std::exception &e) { + std::cerr << "Error initializing OpenVINO model: " << e.what() << std::endl; + return; + } + } + + // Step 4: set input data, copy src0 and src1 data to OpenVINO input tensors + auto input_tensor0 = ctx.infer_request.get_tensor(ctx.model->input(0)); + auto input_tensor1 = ctx.infer_request.get_tensor(ctx.model->input(1)); + + // Note: OpenVINO Tensor data is contiguous, make sure src0 and src1 data is contiguous. + std::memcpy(input_tensor0.data(), src0->data, src0->nb[0] * src0->ne[0]); + std::memcpy(input_tensor1.data(), src1->data, src1->nb[0] * src1->ne[0]); + + // Step 5: execute inference + ctx.infer_request.infer(); + + // Step 6: get output data + ov::Tensor output_tensor = ctx.infer_request.get_tensor(ctx.model->output(0)); + + // Allocate memory for dst->data if not already allocated + if (dst->data == nullptr) { + dst->data = malloc(dst->nb[0] * dst->ne[0]); + if (dst->data == nullptr) { + std::cerr << "Error: Failed to allocate memory for dst->data." << std::endl; + return; + } + } + // Copy output data to dst + std::memcpy(dst->data, output_tensor.data(), dst->nb[0] * dst->ne[0]); + + // // Print results (optional, for debugging) + // float* dst_data = static_cast(dst->data); + // std::cout << "Output data:"; + // for (int i = 0; i < std::min(10, static_cast(dst->ne[0])); ++i) { + // std::cout << dst_data[i] << " "; + // } + // std::cout << std::endl; +} + static void ggml_backend_openvino_add(ggml_backend_openvino_context & ctx, ggml_tensor * dst) { // Placeholder for OpenVINO add operation - GGML_ASSERT(ctx.device != 0); + // GGML_ASSERT(ctx.device != 0); GGML_ASSERT(dst->data != nullptr); + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + + switch (src0->type) { + case GGML_TYPE_F16: + { + if (src1->type == GGML_TYPE_F16) { + // ggml_backend_openvino_add_forward(ctx, dst, src0, src1); + } else if (src1->type == GGML_TYPE_F32) { + // ggml_compute_forward_add_f16_f32(params, dst); + } else { + GGML_ABORT("fatal error"); + } + } break; + case GGML_TYPE_F32: + { + if (src1->type == GGML_TYPE_F32) { + { + ggml_backend_openvino_add_forward(ctx, dst); + } + } + else { + GGML_ABORT("fatal error"); + } + } break; + default: + GGML_ABORT("%s: unsupported type %d\n", __func__, src1->type); + } + } static void test_op_for_NONE() { @@ -270,7 +408,7 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con case GGML_OP_UNARY: return false; case GGML_OP_NONE: - return true; + return false; case GGML_OP_RESHAPE: case GGML_OP_VIEW: case GGML_OP_PERMUTE: @@ -281,7 +419,7 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con { ov::op::v1::Add add; //add.evaluate(op->outputs[0], op->inputs[1]); - return false; + return true; } case GGML_OP_ADD1: case GGML_OP_SUB: From faa4a7de76ab9daa32b755dce18ea3aadc54edc8 Mon Sep 17 00:00:00 2001 From: zhanmyz Date: Thu, 21 Nov 2024 18:03:22 +0800 Subject: [PATCH 004/254] Solve the issue of abnormal model output caused by using OpenVINO ADD operator --- ggml/src/ggml-openvino.cpp | 159 ++++++++++++------------------------- 1 file changed, 52 insertions(+), 107 deletions(-) diff --git a/ggml/src/ggml-openvino.cpp b/ggml/src/ggml-openvino.cpp index 4b864a0b6d..2cb9dfa7d3 100644 --- a/ggml/src/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino.cpp @@ -51,10 +51,18 @@ static ggml_backend_buffer_type_t ggml_backend_openvino_get_default_buffer_type( GGML_UNUSED(backend); } -static void 
ggml_backend_openvino_add_forward(ggml_backend_openvino_context & ctx, ggml_tensor * dst) {
+static void ggml_backend_openvino_add_forward(ggml_tensor * dst) {
     // Step 1: get the input tensors src0 and src1
-    const ggml_tensor *src0 = dst->src[0];
-    const ggml_tensor *src1 = dst->src[1];
+    const struct ggml_tensor *src0 = dst->src[0];
+    const struct ggml_tensor *src1 = dst->src[1];
+
+    ov::Core core;
+
+    // set the shape and stride of dst
+    dst->ne[0] = src0->ne[0];
+    dst->ne[1] = src0->ne[1];
+    dst->nb[0] = src0->nb[0];
+    dst->nb[1] = src0->nb[1];
 
     if (src0 == nullptr || src1 == nullptr) {
         std::cerr << "Error: src0 or src1 is null." << std::endl;
@@ -71,76 +79,61 @@ static void ggml_backend_openvino_add_forward(ggml_backend_openvino_context & ct
         return;
     }
 
-    // Step 3: initialize the OpenVINO model and streams (only done on the first call)
-    if (!ctx.is_initialized) {
-        try {
-            // define the input tensor shape
-            ov::Shape input_shape = {static_cast<size_t>(src0->ne[0]), static_cast<size_t>(src0->ne[1])};
+    ov::Tensor input0 = ov::Tensor(ov::element::f32, {static_cast<size_t>(src0->ne[0]), static_cast<size_t>(src0->ne[1])}, src0->data);
+    ov::Tensor input1 = ov::Tensor(ov::element::f32, {static_cast<size_t>(src1->ne[0]), static_cast<size_t>(src1->ne[1])}, src1->data);
 
-            // create the OpenVINO input nodes
-            auto input0 = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, input_shape);
-            auto input1 = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, input_shape);
+    auto input0_param = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::Shape{static_cast<size_t>(src0->ne[0]), static_cast<size_t>(src0->ne[1])});
+    auto input1_param = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::Shape{static_cast<size_t>(src0->ne[0]), static_cast<size_t>(src0->ne[1])});
+    auto add = std::make_shared<ov::op::v1::Add>(input0_param, input1_param);
+    auto function = std::make_shared<ov::Model>(add, ov::ParameterVector{input0_param, input1_param});
 
-            // define the add operation
-            auto add_node = std::make_shared<ov::op::v1::Add>(input0, input1);
-
-            // create the model
-            auto model = std::make_shared<ov::Model>(add_node, ov::ParameterVector{input0, input1});
-
-            // compile the model and store it in the context
+    // compile the model and store it in the context
 #ifdef GGML_OPENVINO_GPU
-            ctx.model = std::make_shared<ov::CompiledModel>(ctx.core.compile_model(model, "GPU"));
+    auto compiled_model = core.compile_model(function, "GPU");
 #elif GGML_OPENVINO_NPU
-            ctx.model = std::make_shared<ov::CompiledModel>(ctx.core.compile_model(model, "NPU"));
+    auto compiled_model = core.compile_model(function, "NPU");
 #else
-            ctx.model = std::make_shared<ov::CompiledModel>(ctx.core.compile_model(model, "CPU"));
+    auto compiled_model = core.compile_model(function, "CPU");
 #endif
-            // initialize the infer request
-            ctx.infer_request = ctx.model->create_infer_request();
-            ctx.is_initialized = true;
-
-            // std::cout << "OpenVINO add model initialized successfully." << std::endl;
-        } catch (const std::exception &e) {
-            std::cerr << "Error initializing OpenVINO model: " << e.what() << std::endl;
-            return;
-        }
-    }
+    // initialize the infer request
+    auto infer_request = compiled_model.create_infer_request();
 
     // Step 4: set input data, copy src0 and src1 data to OpenVINO input tensors
-    auto input_tensor0 = ctx.infer_request.get_tensor(ctx.model->input(0));
-    auto input_tensor1 = ctx.infer_request.get_tensor(ctx.model->input(1));
-
-    // Note: OpenVINO Tensor data is contiguous, make sure src0 and src1 data is contiguous.
- std::memcpy(input_tensor0.data(), src0->data, src0->nb[0] * src0->ne[0]); - std::memcpy(input_tensor1.data(), src1->data, src1->nb[0] * src1->ne[0]); + infer_request.set_tensor(input0_param, input0); + infer_request.set_tensor(input1_param, input1); // Step 5: execute inference - ctx.infer_request.infer(); + infer_request.infer(); // Step 6: get output data - ov::Tensor output_tensor = ctx.infer_request.get_tensor(ctx.model->output(0)); + ov::Tensor output = infer_request.get_tensor(compiled_model.output()); - // Allocate memory for dst->data if not already allocated - if (dst->data == nullptr) { - dst->data = malloc(dst->nb[0] * dst->ne[0]); - if (dst->data == nullptr) { - std::cerr << "Error: Failed to allocate memory for dst->data." << std::endl; - return; - } - } - // Copy output data to dst - std::memcpy(dst->data, output_tensor.data(), dst->nb[0] * dst->ne[0]); - - // // Print results (optional, for debugging) - // float* dst_data = static_cast(dst->data); - // std::cout << "Output data:"; - // for (int i = 0; i < std::min(10, static_cast(dst->ne[0])); ++i) { - // std::cout << dst_data[i] << " "; + // // Allocate memory for dst->data if not already allocated + // if (dst->data == nullptr) { + // dst->data = malloc(dst->nb[0] * dst->ne[0]); + // if (dst->data == nullptr) { + // std::cerr << "Error: Failed to allocate memory for dst->data." << std::endl; + // return; + // } // } - // std::cout << std::endl; + + std::memcpy(dst->data, output.data(), output.get_byte_size()); + + if (dst->ne[0] != src0->ne[0] || dst->ne[1] != src0->ne[1]) { + std::cerr << "Error: dst tensor shape does not match input tensor shape." << std::endl; + return; + } + + // float* dst_data1 = (float*)(dst->data); + // printf("Output data:");; + // for (int i = 0; i < (10 < (int)(dst->ne[0]) ? 
10 : (int)(dst->ne[0])); ++i) { + // printf("%f ", dst_data1[i]); + // } + // printf("\n"); + // fflush(stdout); } -static void ggml_backend_openvino_add(ggml_backend_openvino_context & ctx, ggml_tensor * dst) { +static void ggml_backend_openvino_add(ggml_tensor * dst) { // Placeholder for OpenVINO add operation // GGML_ASSERT(ctx.device != 0); GGML_ASSERT(dst->data != nullptr); @@ -163,7 +156,7 @@ static void ggml_backend_openvino_add(ggml_backend_openvino_context & ctx, ggml_ { if (src1->type == GGML_TYPE_F32) { { - ggml_backend_openvino_add_forward(ctx, dst); + ggml_backend_openvino_add_forward(dst); } } else { @@ -181,16 +174,13 @@ static void test_op_for_NONE() { } static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { - // TODO - ggml_backend_openvino_context * ctx = (ggml_backend_openvino_context *)backend->context; - for (int i = 0; i < cgraph->n_nodes; i++) { struct ggml_tensor * node = cgraph->nodes[i]; switch (node->op) { case GGML_OP_ADD: // TODO - ggml_backend_openvino_add(*ctx, node); + ggml_backend_openvino_add(node); break; case GGML_OP_MUL_MAT: case GGML_OP_OUT_PROD: @@ -405,53 +395,8 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con // ggml_backend_openvino_device_context * dev_ctx = (ggml_backend_openvino_device_context *) dev->context; switch (op->op) { - case GGML_OP_UNARY: - return false; - case GGML_OP_NONE: - return false; - case GGML_OP_RESHAPE: - case GGML_OP_VIEW: - case GGML_OP_PERMUTE: - case GGML_OP_TRANSPOSE: - case GGML_OP_NORM: - return false; case GGML_OP_ADD: - { - ov::op::v1::Add add; - //add.evaluate(op->outputs[0], op->inputs[1]); return true; - } - case GGML_OP_ADD1: - case GGML_OP_SUB: - { - ov::op::v1::Subtract sub; - //sub.evaluate(TensorVector& outputs, const TensorVector& inputs); - return false; - } - case GGML_OP_MUL: - case GGML_OP_DIV: - case GGML_OP_RMS_NORM: - case GGML_OP_SCALE: - case GGML_OP_SQR: - case GGML_OP_SQRT: - case GGML_OP_SIN: - case GGML_OP_COS: - case GGML_OP_IM2COL: - case GGML_OP_POOL_2D: - case GGML_OP_SUM: - case GGML_OP_SUM_ROWS: - case GGML_OP_ARGSORT: - case GGML_OP_ACC: - case GGML_OP_GROUP_NORM: - case GGML_OP_UPSCALE: - case GGML_OP_PAD: - case GGML_OP_ARANGE: - case GGML_OP_TIMESTEP_EMBEDDING: - case GGML_OP_LEAKY_RELU: - case GGML_OP_CROSS_ENTROPY_LOSS: - case GGML_OP_CROSS_ENTROPY_LOSS_BACK: - case GGML_OP_OPT_STEP_ADAMW: - return false; default: return false; } From adc2c70f440bc6c698f0c2202e1683a365143dbb Mon Sep 17 00:00:00 2001 From: zhanmyz Date: Mon, 2 Dec 2024 10:18:54 +0800 Subject: [PATCH 005/254] Add OpenVINO MUL operator to GGML of Llama.cpp. 
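
The MUL path in the diff below leans on OpenVINO's elementwise broadcasting instead of materializing the smaller operand. A minimal sketch of just that piece, assuming the default numpy-style auto-broadcast of `ov::op::v1::Multiply` (shapes and the helper name are illustrative):

```cpp
#include <openvino/openvino.hpp>
#include <memory>

// Builds a model that multiplies a [rows, cols] activation by a [1, cols] row
// vector. ov::op::v1::Multiply defaults to numpy-style auto-broadcast, so the
// [1, cols] input is stretched across all rows without an explicit tile/copy.
static std::shared_ptr<ov::Model> make_broadcast_mul(size_t rows, size_t cols) {
    auto x   = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::Shape{rows, cols});
    auto w   = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::Shape{1, cols});
    auto mul = std::make_shared<ov::op::v1::Multiply>(x, w);
    return std::make_shared<ov::Model>(mul, ov::ParameterVector{x, w});
}
```

This is why the patch can feed shape1 = [1, 3072] against shape0 = [7, 3072] directly: the broadcast happens inside the compiled model.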
---
 ggml/src/ggml-openvino.cpp | 94 ++++++++++++++++++++++++++++++++------
 1 file changed, 81 insertions(+), 13 deletions(-)

diff --git a/ggml/src/ggml-openvino.cpp b/ggml/src/ggml-openvino.cpp
index 2cb9dfa7d3..788c2cb122 100644
--- a/ggml/src/ggml-openvino.cpp
+++ b/ggml/src/ggml-openvino.cpp
@@ -9,6 +9,7 @@
 #include
 #include
 #include
+#include
 
 #define GGML_OPENVINO_MAX_STREAMS 8
 
@@ -133,6 +134,42 @@ static void ggml_backend_openvino_add_forward(ggml_tensor * dst) {
     // fflush(stdout);
 }
 
+static void ggml_backend_openvino_mul_forward(ggml_tensor * dst) {
+    struct ggml_tensor *src0 = dst->src[0];
+    struct ggml_tensor *src1 = dst->src[1];
+
+    ov::Core core;
+
+    // define the shapes
+    ov::Shape shape0 = {static_cast<size_t>(src0->ne[1]), static_cast<size_t>(src0->ne[0])};  // for example: [7, 3072]
+    ov::Shape shape1 = {static_cast<size_t>(src1->ne[1]), static_cast<size_t>(src1->ne[0])};  // for example: [1, 3072] -> broadcast to [7, 3072]
+
+    // create the OpenVINO tensors (src0 and src1)
+    ov::Tensor tensor0(ov::element::f32, shape0, src0->data);
+    ov::Tensor tensor1(ov::element::f32, shape1, src1->data);
+
+    // define the input parameters
+    auto input0 = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, shape0);
+    auto input1 = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, shape1);
+
+    // create a multiply operation that uses broadcasting
+    auto multiply = std::make_shared<ov::op::v1::Multiply>(input0, input1);
+
+    // create the model
+    auto model = std::make_shared<ov::Model>(multiply, ov::ParameterVector{input0, input1});
+    ov::CompiledModel compiled_model = core.compile_model(model, "CPU");
+
+    ov::InferRequest infer_request = compiled_model.create_infer_request();
+    infer_request.set_tensor(input0, tensor0);
+    infer_request.set_tensor(input1, tensor1);
+
+    infer_request.infer();
+
+    // get the output tensor and copy it back to dst->data
+    ov::Tensor output_tensor = infer_request.get_output_tensor();
+    std::memcpy(dst->data, output_tensor.data(), src0->ne[0] * src0->ne[1] * sizeof(float));
+}
+
 static void ggml_backend_openvino_add(ggml_tensor * dst) {
     // Placeholder for OpenVINO add operation
     // GGML_ASSERT(ctx.device != 0);
@@ -169,28 +206,49 @@ static void ggml_backend_openvino_add(ggml_tensor * dst) {
 
 }
 
-static void test_op_for_NONE() {
-    GGML_LOG_DEBUG("...test_op_for_NONE... 
\n"); +static void ggml_backend_openvino_mul(ggml_tensor * dst) { + GGML_ASSERT(dst->data != nullptr); + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + + GGML_ASSERT(src1->type == GGML_TYPE_F32 && "only f32 src1 supported for now"); + + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_backend_openvino_mul_forward(dst); + } break; + default: + { + GGML_ABORT("fatal error"); + } + } } static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { for (int i = 0; i < cgraph->n_nodes; i++) { struct ggml_tensor * node = cgraph->nodes[i]; + if (node->op == GGML_OP_NONE || ggml_is_empty(node)) { + return GGML_STATUS_SUCCESS; + } + switch (node->op) { - case GGML_OP_ADD: - // TODO - ggml_backend_openvino_add(node); - break; - case GGML_OP_MUL_MAT: - case GGML_OP_OUT_PROD: - break; - case GGML_OP_NONE: - test_op_for_NONE(); - case GGML_OP_RESHAPE: - case GGML_OP_VIEW: case GGML_OP_PERMUTE: + case GGML_OP_RESHAPE: case GGML_OP_TRANSPOSE: + case GGML_OP_VIEW: + break; + case GGML_OP_ADD: + { + ggml_backend_openvino_add(node); + } break; + case GGML_OP_MUL: + { + ggml_backend_openvino_mul(node); + } break; + case GGML_OP_MUL_MAT: break; default: GGML_ABORT("%s: unsupported op %s\n", __func__, ggml_op_desc(node)); @@ -395,8 +453,18 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con // ggml_backend_openvino_device_context * dev_ctx = (ggml_backend_openvino_device_context *) dev->context; switch (op->op) { + case GGML_OP_NONE: + case GGML_OP_PERMUTE: + case GGML_OP_RESHAPE: + case GGML_OP_TRANSPOSE: + case GGML_OP_VIEW: + return true; case GGML_OP_ADD: return true; + case GGML_OP_MUL: + return true; + case GGML_OP_MUL_MAT: + return false; default: return false; } From 0a81aa19f73e616a16a81a0ede98776e269844ee Mon Sep 17 00:00:00 2001 From: zhanmyz Date: Mon, 2 Dec 2024 10:39:36 +0800 Subject: [PATCH 006/254] Add compile options --- ggml/src/ggml-openvino.cpp | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/ggml/src/ggml-openvino.cpp b/ggml/src/ggml-openvino.cpp index 788c2cb122..370c0c5d98 100644 --- a/ggml/src/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino.cpp @@ -86,15 +86,15 @@ static void ggml_backend_openvino_add_forward(ggml_tensor * dst) { auto input0_param = std::make_shared(ov::element::f32, ov::Shape{static_cast(src0->ne[0]), static_cast(src0->ne[1])}); auto input1_param = std::make_shared(ov::element::f32, ov::Shape{static_cast(src0->ne[0]), static_cast(src0->ne[1])}); auto add = std::make_shared(input0_param, input1_param); - auto function = std::make_shared(add, ov::ParameterVector{input0_param, input1_param}); + auto model = std::make_shared(add, ov::ParameterVector{input0_param, input1_param}); // compile model and store in context #ifdef GGML_OPENVINO_GPU - auto compiled_model = core.compile_model(function, "GPU"); + auto compiled_model = core.compile_model(model, "GPU"); #elif GGML_OPENVINO_NPU - auto compiled_model = core.compile_model(function, "NPU"); + auto compiled_model = core.compile_model(model, "NPU"); #else - auto compiled_model = core.compile_model(function, "CPU"); + auto compiled_model = core.compile_model(model, "CPU"); #endif // initialize infer request auto infer_request = compiled_model.create_infer_request(); @@ -157,7 +157,14 @@ static void ggml_backend_openvino_mul_forward(ggml_tensor * dst) { // create model auto model = std::make_shared(multiply, ov::ParameterVector{input0, input1}); + // 
compile model and store in context +#ifdef GGML_OPENVINO_GPU + ov::CompiledModel compiled_model = core.compile_model(model, "GPU"); +#elif GGML_OPENVINO_NPU + ov::CompiledModel compiled_model = core.compile_model(model, "NPU"); +#else ov::CompiledModel compiled_model = core.compile_model(model, "CPU"); +#endif ov::InferRequest infer_request = compiled_model.create_infer_request(); infer_request.set_tensor(input0, tensor0); From 77d68146a89625cacb2e50356eb6eef0e7e54400 Mon Sep 17 00:00:00 2001 From: zhanmyz Date: Wed, 4 Dec 2024 14:09:13 +0800 Subject: [PATCH 007/254] add OpenVINO frontend convert process steps --- ggml/src/ggml-openvino.cpp | 53 ++--- ggml/src/ggml-openvino/README.md | 30 +++ ggml/src/ggml-openvino/decoder.h | 54 +++++ ggml/src/ggml-openvino/ggml-decoder.cpp | 203 ++++++++++++++++++ ggml/src/ggml-openvino/ggml-decoder.h | 69 ++++++ .../src/ggml-openvino/ggml-graph-iterator.cpp | 96 +++++++++ ggml/src/ggml-openvino/ggml-graph-iterator.h | 61 ++++++ ggml/src/ggml-openvino/graph_iterator.h | 43 ++++ ggml/src/ggml-openvino/utils.cpp | 108 ++++++++++ ggml/src/ggml-openvino/utils.h | 6 + 10 files changed, 698 insertions(+), 25 deletions(-) create mode 100644 ggml/src/ggml-openvino/README.md create mode 100644 ggml/src/ggml-openvino/decoder.h create mode 100644 ggml/src/ggml-openvino/ggml-decoder.cpp create mode 100644 ggml/src/ggml-openvino/ggml-decoder.h create mode 100644 ggml/src/ggml-openvino/ggml-graph-iterator.cpp create mode 100644 ggml/src/ggml-openvino/ggml-graph-iterator.h create mode 100644 ggml/src/ggml-openvino/graph_iterator.h create mode 100644 ggml/src/ggml-openvino/utils.cpp create mode 100644 ggml/src/ggml-openvino/utils.h diff --git a/ggml/src/ggml-openvino.cpp b/ggml/src/ggml-openvino.cpp index 370c0c5d98..34d692a8cf 100644 --- a/ggml/src/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino.cpp @@ -1,6 +1,7 @@ #include "ggml-openvino.h" #include "ggml-backend-impl.h" #include "ggml-impl.h" +#include "ggml-openvino/utils.h" #include #include @@ -234,33 +235,35 @@ static void ggml_backend_openvino_mul(ggml_tensor * dst) { } static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { - for (int i = 0; i < cgraph->n_nodes; i++) { - struct ggml_tensor * node = cgraph->nodes[i]; + // for (int i = 0; i < cgraph->n_nodes; i++) { + // struct ggml_tensor * node = cgraph->nodes[i]; - if (node->op == GGML_OP_NONE || ggml_is_empty(node)) { - return GGML_STATUS_SUCCESS; - } + // if (node->op == GGML_OP_NONE || ggml_is_empty(node)) { + // return GGML_STATUS_SUCCESS; + // } - switch (node->op) { - case GGML_OP_PERMUTE: - case GGML_OP_RESHAPE: - case GGML_OP_TRANSPOSE: - case GGML_OP_VIEW: - break; - case GGML_OP_ADD: - { - ggml_backend_openvino_add(node); - } break; - case GGML_OP_MUL: - { - ggml_backend_openvino_mul(node); - } break; - case GGML_OP_MUL_MAT: - break; - default: - GGML_ABORT("%s: unsupported op %s\n", __func__, ggml_op_desc(node)); - } - } + // switch (node->op) { + // case GGML_OP_PERMUTE: + // case GGML_OP_RESHAPE: + // case GGML_OP_TRANSPOSE: + // case GGML_OP_VIEW: + // break; + // case GGML_OP_ADD: + // { + // ggml_backend_openvino_add(node); + // } break; + // case GGML_OP_MUL: + // { + // ggml_backend_openvino_mul(node); + // } break; + // case GGML_OP_MUL_MAT: + // break; + // default: + // GGML_ABORT("%s: unsupported op %s\n", __func__, ggml_op_desc(node)); + // } + // } + + openvino_frontend_compute(backend, cgraph); return GGML_STATUS_SUCCESS; diff --git a/ggml/src/ggml-openvino/README.md 
b/ggml/src/ggml-openvino/README.md new file mode 100644 index 0000000000..46c2adb438 --- /dev/null +++ b/ggml/src/ggml-openvino/README.md @@ -0,0 +1,30 @@ +# Instructions to Modify and Build ggml with OpenVINO + +## Step 1: Modify the Source Code + +In order to change the frontend `.so` path to the path to `.so` file, you need to add path to the `.so` file in cmake compiler option: +1. Open a terminal and navigate to the root directory of this repo. +2. Run the following commands to configure: + ```sh + mkdir build + cmake -B build -DGGML_OV_FRONTEND="${openvino_repo_dir}/bin/intel64/Release/libopenvino_ggml_frontend.so" + ``` +Where GGML_OV_FRONTEND should point to the path to `libopenvino_ggml_frontend.so` file. + +## Step 2: Build the Project + +After modifying the source code, you need to build the project using CMake. Follow these steps: + +1. (Optional) Enable debug option for ggml-openvino, this will output dump of subgraph sent to OpenVINO, information after convert ggml_cgraph to GraphIterator, and calculation input value/output value of each OP: + ```sh + cmake -B build -DGGML_OPENVINO_DEBUG=ON + ``` + +2. Run the following commands to configure and build the project: + ```sh + cmake -B build -DGGML_OPENVINO=ON + cmake --build build -j + ``` + +This will configure the project with OpenVINO support and build it using multiple cores for faster compilation. + diff --git a/ggml/src/ggml-openvino/decoder.h b/ggml/src/ggml-openvino/decoder.h new file mode 100644 index 0000000000..d2ef7587b8 --- /dev/null +++ b/ggml/src/ggml-openvino/decoder.h @@ -0,0 +1,54 @@ +#pragma once + +#include "openvino/core/node.hpp" +#include "openvino/frontend/decoder.hpp" + +namespace ov { +namespace frontend { +namespace ggml { + +// TODO: Directly include from openvino +class GgmlDecoder : public DecoderBase { +public: + virtual ov::Any get_attribute(const std::string& name) const = 0; + + virtual PartialShape get_input_shape(size_t index) const = 0; + + virtual element::Type get_input_type(size_t index) const = 0; + + virtual size_t get_input_size() const = 0; + + virtual void get_input_node(size_t input_port_idx, + std::string& producer_name, + std::string& producer_output_port_name, + size_t& producer_output_port_index) const = 0; + + virtual bool is_graph_input(size_t index) const = 0; + + virtual std::string& get_input_name(size_t index) const = 0; + + virtual PartialShape get_output_shape(size_t index) const = 0; + + virtual element::Type get_output_type(size_t index) const = 0; + + virtual size_t get_output_size() const = 0; + + virtual bool is_graph_output(size_t index) const = 0; + + virtual int32_t* get_output_op_params(size_t index) const = 0; + + virtual std::string& get_output_name(size_t index) const = 0; + + virtual const std::string& get_op_type() const = 0; + + virtual const std::string& get_op_name() const = 0; + + // virtual const std::vector& outputs() const = 0; + + // virtual size_t output(size_t index) const = 0; + +}; + +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp new file mode 100644 index 0000000000..4d82c756cd --- /dev/null +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -0,0 +1,203 @@ +#include "ggml-decoder.h" +#include +#include + +GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor * node, struct ggml_cgraph * cgraph) + :m_cgraph(cgraph), + m_node(node), + m_op_name(std::string(m_node->name)) { + switch (m_node->op) { + // Unary OPs + case GGML_OP_UNARY: + case 
GGML_OP_RESHAPE: + case GGML_OP_VIEW: + { + m_inputs.push_back(m_node->src[0]); + m_outputs.push_back(m_node); + #ifdef GGML_OPENVINO_DEBUG + GGML_LOG_INFO("Decoder input 0: %f \n", *(float*)(m_node->src[0]->data)); + #endif + break; + } + // SCALE + case GGML_OP_SCALE: + { + m_inputs.push_back(m_node->src[0]); + m_outputs.push_back(m_node); + #ifdef GGML_OPENVINO_DEBUG + float v; + memcpy(&v, m_node->op_params, sizeof(float)); + GGML_LOG_INFO("Decoder input 0: %f \n", *(float*)(m_node->src[0]->data)); + GGML_LOG_INFO("Scale: %f \n", v); + #endif + break; + } + // OPs with 2 inputs + case GGML_OP_ADD: + case GGML_OP_DIV: + case GGML_OP_MUL: + case GGML_OP_MUL_MAT: + case GGML_OP_SUB: + case GGML_OP_GET_ROWS: + { + m_inputs.push_back(m_node->src[0]); + m_inputs.push_back(m_node->src[1]); + m_outputs.push_back(m_node); + #ifdef GGML_OPENVINO_DEBUG + GGML_LOG_INFO("Decoder input 0: %f \n", *(float*)(m_node->src[0]->data)); + GGML_LOG_INFO("Decoder input 1: %f \n", *(float*)(m_node->src[1]->data)); + #endif + break; + } + default: + break; + } +} + +ov::PartialShape GgmlOvDecoder::get_input_shape(size_t index) const { + ov::PartialShape input_shape; + // Use input_node->ne + ggml_tensor * node = m_inputs[index]; + std::vector shape; + // GGML_MAX_DIMS + // for (int i = 0; i < GGML_MAX_DIMS; ++i) { + for (int i = GGML_MAX_DIMS - 2; i >= 0 ; --i) { + if (node->ne[i] == 0) { + return input_shape; + } + shape.push_back(static_cast(node->ne[i])); + } + input_shape = ov::PartialShape(shape); + return input_shape; +} + +ov::element::Type GgmlOvDecoder::get_input_type(size_t index) const { + ov::element::Type type = ov::element::dynamic; + // GGML_LOG_DEBUG("%d\n", m_inputs[index]->type); + switch (m_inputs[index]->type) { + case GGML_TYPE_F32: + type = ov::element::f32; + break; + case GGML_TYPE_F16: + type = ov::element::f16; + break; + case GGML_TYPE_I64: + type = ov::element::i64; + break; + case GGML_TYPE_I32: + type = ov::element::i32; + break; + default: + break; + } + return type; +} + +size_t GgmlOvDecoder::get_input_size() const { + return m_inputs.size(); +} + +bool GgmlOvDecoder::is_graph_input(size_t index) const { + if (m_inputs[index]->flags & GGML_TENSOR_FLAG_INPUT ) { + return true; + } + return false; +} + +std::string& GgmlOvDecoder::get_input_name(size_t index) const { + m_name = std::string(m_inputs[index]->name); + return m_name; +} + +ov::PartialShape GgmlOvDecoder::get_output_shape(size_t index) const { + ov::PartialShape output_shape; + // Use input_node->ne + ggml_tensor * node = m_outputs[index]; + std::vector shape; + // GGML_MAX_DIMS + // for (int i = 0; i < GGML_MAX_DIMS; ++i) { + for (int i = GGML_MAX_DIMS - 2; i >= 0 ; --i) { + if (node->ne[i] == 0 ) { + // empty if any dimension has no elements + return output_shape; + } + shape.push_back(static_cast(node->ne[i])); + } + output_shape = ov::PartialShape(shape); + return output_shape; +} + +ov::element::Type GgmlOvDecoder::get_output_type(size_t index) const { + // TODO: Change to Output + ov::element::Type type = ov::element::dynamic; + // GGML_LOG_DEBUG("%d\n", m_outputs[index]->type); + switch (m_outputs[index]->type) { + case GGML_TYPE_F32: + type = ov::element::f32; + break; + case GGML_TYPE_F16: + type = ov::element::f16; + break; + case GGML_TYPE_I64: + type = ov::element::i64; + break; + case GGML_TYPE_I32: + type = ov::element::i32; + break; + default: + break; + } + return type; +} + +bool GgmlOvDecoder::is_graph_output(size_t index) const { + if (m_outputs[index]->flags & GGML_TENSOR_FLAG_OUTPUT) { + return 
true; + } + return false; +} + +int32_t* GgmlOvDecoder::get_output_op_params(size_t index) const{ + return m_outputs[index]->op_params; +} + +size_t GgmlOvDecoder::get_output_size() const { + return m_outputs.size(); +} + +std::string& GgmlOvDecoder::get_output_name(size_t index) const { + m_name = std::string(m_outputs[index]->name); + return m_name; +} + +const std::string& GgmlOvDecoder::get_op_name() const { + return m_op_name; +} + +const std::string& GgmlOvDecoder::get_op_type() const { + static const std::map opTypeMap = { + {GGML_OP_ACC, "GGML_OP_ACC"}, + {GGML_OP_ADD, "GGML_OP_ADD"}, + {GGML_OP_ADD1, "GGML_OP_ADD1"}, + {GGML_OP_DIV, "GGML_OP_DIV"}, + {GGML_OP_DUP, "GGML_OP_DUP"}, + {GGML_OP_GET_ROWS, "GGML_OP_GET_ROWS"}, + {GGML_OP_MUL, "GGML_OP_MUL"}, + {GGML_OP_MUL_MAT, "GGML_OP_MUL_MAT"}, + {GGML_OP_PERMUTE, "GGML_OP_PERMUTE"}, + {GGML_OP_RESHAPE, "GGML_OP_RESHAPE"}, + {GGML_OP_SCALE, "GGML_OP_SCALE"}, + {GGML_OP_SUB, "GGML_OP_SUB"}, + {GGML_OP_UNARY, "GGML_OP_UNARY"}, + {GGML_OP_VIEW, "GGML_OP_VIEW"} + }; + auto it = opTypeMap.find(m_node->op); + if (it != opTypeMap.end()) { + return it->second; + } else { + static const std::string unknown_op = "UNKNOWN_OP"; + return unknown_op; + } + // static std::string op_type = ggml_op_name(m_node->op); + // return op_type; +} diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h new file mode 100644 index 0000000000..3048e2e7e9 --- /dev/null +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -0,0 +1,69 @@ +#pragma once + +#include "decoder.h" +#include "ggml.h" + +class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { +public: + using ov::frontend::ggml::GgmlDecoder::GgmlDecoder; + GgmlOvDecoder(struct ggml_tensor * node, struct ggml_cgraph * cgraph); + + virtual ov::Any get_attribute(const std::string& name) const override { + return nullptr; + GGML_UNUSED(name); + } + + virtual ov::PartialShape get_input_shape(size_t index) const override; + + virtual ov::element::Type get_input_type(size_t index) const override; + + virtual size_t get_input_size() const override; + + virtual void get_input_node(size_t input_port_idx, + std::string& producer_name, + std::string& producer_output_port_name, + size_t& producer_output_port_index) const override { + GGML_UNUSED(input_port_idx); + GGML_UNUSED(producer_name); + GGML_UNUSED(producer_output_port_name); + GGML_UNUSED(producer_output_port_index); + } + + virtual bool is_graph_input(size_t index) const override; + + virtual std::string& get_input_name(size_t index) const override; + + virtual ov::PartialShape get_output_shape(size_t index) const override; + + virtual ov::element::Type get_output_type(size_t index) const override; + + virtual size_t get_output_size() const override; + + virtual bool is_graph_output(size_t index) const override; + + virtual int32_t* get_output_op_params(size_t index) const override; + + virtual std::string& get_output_name(size_t index) const override; + + virtual const std::string& get_op_type() const override; + + virtual const std::string& get_op_name() const override; + + const ggml_tensor* get_input_ggml_tensor(size_t index) const { + return m_inputs[index]; + } + + // virtual const std::vector& outputs() const override; + + // virtual size_t output(size_t index) const override; + +private: + size_t m_index; + struct ggml_cgraph * m_cgraph; + std::vector m_inputs; + std::vector m_outputs; + ggml_tensor * m_node; + const std::string m_op_name; + mutable std::string m_name; +}; + diff --git 
a/ggml/src/ggml-openvino/ggml-graph-iterator.cpp b/ggml/src/ggml-openvino/ggml-graph-iterator.cpp new file mode 100644 index 0000000000..17a9b7ecfe --- /dev/null +++ b/ggml/src/ggml-openvino/ggml-graph-iterator.cpp @@ -0,0 +1,96 @@ +#include "ggml-graph-iterator.h" +#include +#include + +namespace ov { +namespace frontend { +namespace tensorflow { +namespace ggml { + +GgmlOvGraphIterator::GgmlOvGraphIterator(struct ggml_cgraph * cgraph) + :m_cgraph(cgraph) { + initialize_decoders(); + #ifdef GGML_OPENVINO_DEBUG + dump_graph_iterator(); + #endif +} + + void GgmlOvGraphIterator::initialize_decoders() { + auto nodes_size = m_cgraph->n_nodes; + // Initialize decoder for each node + // m_decoders.resize(static_cast<size_t>(nodes_size)); + + for (int i = 0; i < nodes_size; ++i) { + // Skip View Op + if (m_cgraph->nodes[i] ->op == GGML_OP_VIEW || m_cgraph->nodes[i] ->op == GGML_OP_PERMUTE) { + continue; + } + auto decoder = std::make_shared<GgmlOvDecoder>(m_cgraph->nodes[i], m_cgraph); + m_decoders.push_back(decoder); + for (size_t inp = 0; inp < decoder->get_input_size(); ++inp) { + // if (i == 0 || decoder->is_graph_input(inp)) { + m_input_names.push_back(decoder->get_input_name(inp)); + // } + } + for (size_t inp = 0; inp < decoder->get_output_size(); ++inp) { + if (i == nodes_size - 1 || decoder->is_graph_output(inp)) { + m_output_names.push_back(decoder->get_output_name(inp)); + } + } + } + +} + +void GgmlOvGraphIterator::reset() { + node_index = 0; +} + +size_t GgmlOvGraphIterator::size() const { + return m_decoders.size(); +} + +void GgmlOvGraphIterator::next() { + node_index++; +} + +bool GgmlOvGraphIterator::is_end() const { + return node_index >= m_decoders.size(); +} + +std::shared_ptr<DecoderBase> GgmlOvGraphIterator::get_decoder() const { + return m_decoders[node_index]; +} + +std::vector<std::string> GgmlOvGraphIterator::get_input_names() const { + return m_input_names; +} + +std::vector<std::string> GgmlOvGraphIterator::get_output_names() const { + return m_output_names; +} + +void GgmlOvGraphIterator::dump_graph_iterator() const { + for (size_t i = 0; i < m_decoders.size(); ++i) { + GGML_LOG_INFO("OP %zu: %s\n", i, m_decoders[i]->get_op_name().c_str()); + for (size_t inp = 0; inp < m_decoders[i]->get_input_size(); ++inp) { + ov::PartialShape pshape = std::dynamic_pointer_cast<GgmlOvDecoder>(m_decoders[i])->get_input_shape(inp); + ov::element::Type ptype = std::dynamic_pointer_cast<GgmlOvDecoder>(m_decoders[i])->get_input_type(inp); + GGML_LOG_INFO("Input name: %s\n", std::dynamic_pointer_cast<GgmlOvDecoder>(m_decoders[i])->get_input_name(inp).c_str()); + GGML_LOG_INFO("Input shape: %s\n", pshape.to_string().c_str()); + GGML_LOG_INFO("Input type: %s\n", ptype.to_string().c_str()); + } + for (size_t outp = 0; outp < std::dynamic_pointer_cast<GgmlOvDecoder>(m_decoders[i])->get_output_size(); ++outp) { + ov::PartialShape pshape = std::dynamic_pointer_cast<GgmlOvDecoder>(m_decoders[i])->get_output_shape(outp); + ov::element::Type ptype = std::dynamic_pointer_cast<GgmlOvDecoder>(m_decoders[i])->get_output_type(outp); + GGML_LOG_INFO("Output name: %s\n", std::dynamic_pointer_cast<GgmlOvDecoder>(m_decoders[i])->get_output_name(outp).c_str()); + GGML_LOG_INFO("Output shape: %s\n", pshape.to_string().c_str()); + GGML_LOG_INFO("Output type: %s\n", ptype.to_string().c_str()); + + } + } +} + +} +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/ggml-graph-iterator.h b/ggml/src/ggml-openvino/ggml-graph-iterator.h new file mode 100644 index 0000000000..305afb5c98 --- /dev/null +++ b/ggml/src/ggml-openvino/ggml-graph-iterator.h @@ -0,0 +1,61 @@ +#pragma once + +#include "graph_iterator.h" +#include "ggml-decoder.h" +#include + +// To remove tensorflow +namespace ov { +namespace frontend { +namespace tensorflow { +namespace ggml { + +class GgmlOvGraphIterator : public GgmlGraphIterator { + +protected: + void initialize_decoders(); + +public: + using Ptr = std::shared_ptr<GgmlOvGraphIterator>; + GgmlOvGraphIterator(struct ggml_cgraph * cgraph); + + /// \brief Get the number of operation nodes in the graph + virtual size_t size() const override; + + /// \brief Set iterator to the start position + virtual void reset() override; + + /// \brief Move to the next node in the graph + virtual void next() override; + + /// \brief Returns true if iterator goes out of the range of available nodes + virtual bool is_end() const override; + + /// \brief Return a pointer to a decoder of the current node + virtual std::shared_ptr<DecoderBase> get_decoder() const override; + + virtual std::shared_ptr<GraphIterator> get_body_graph_iterator(const std::string& func_name) const override { + return nullptr; + GGML_UNUSED(func_name); + } + + /// \brief Returns a vector of input names in the original order + virtual std::vector<std::string> get_input_names() const override; + + /// \brief Returns a vector of output names in the original order + virtual std::vector<std::string> get_output_names() const override; + + virtual void dump_graph_iterator() const; + +private: + struct ggml_cgraph * m_cgraph; + size_t node_index = 0; + std::vector<std::shared_ptr<GgmlOvDecoder>> m_decoders; + std::vector<std::string> m_input_names; + std::vector<std::string> m_output_names; +}; + +} +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/graph_iterator.h b/ggml/src/ggml-openvino/graph_iterator.h new file mode 100644 index 0000000000..e0b475e445 --- /dev/null +++ b/ggml/src/ggml-openvino/graph_iterator.h @@ -0,0 +1,43 @@ +#pragma once + +#include "openvino/frontend/graph_iterator.hpp" + +namespace ov { +namespace frontend { +namespace tensorflow { // To be Removed +namespace ggml { + +// TODO: Directly include from openvino +class GgmlGraphIterator : public GraphIterator { +public: + + virtual size_t size() const = 0; + + virtual void reset() = 0; + + virtual void next() = 0; + + virtual bool is_end() const = 0; + + virtual std::shared_ptr<DecoderBase> get_decoder() const = 0; + + virtual std::vector<std::string> get_input_names() const = 0; + + virtual std::vector<std::string> get_output_names() const = 0; + + virtual std::shared_ptr<GraphIterator> get_body_graph_iterator(const std::string& func_name) const = 0; + + virtual std::map<std::string, std::string> get_input_names_map() const { + return {}; + } + + virtual std::map<std::string, std::string> get_output_names_map() const { + return {}; + } + +}; + +} +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp new file mode 100644 index 0000000000..905e2f4197 --- /dev/null +++ b/ggml/src/ggml-openvino/utils.cpp @@ -0,0 +1,108 @@ +#include "utils.h" +#include "ggml-backend-impl.h" +#include +#include + +using ov::frontend::tensorflow::ggml::GgmlOvGraphIterator; + +std::shared_ptr<GgmlOvGraphIterator> get_ggml_graph_iterator(struct ggml_cgraph * cgraph) { + return std::make_shared<GgmlOvGraphIterator>(cgraph); +} + +std::map<std::string, ov::Tensor> get_ggml_graph_input_tensors(std::shared_ptr<GgmlOvGraphIterator> ggml_graph_iterator) { + std::map<std::string, ov::Tensor> input_tensors; + auto input_names = ggml_graph_iterator->get_input_names(); + ggml_graph_iterator->reset(); + for (; !ggml_graph_iterator->is_end(); ggml_graph_iterator->next()) { + auto decoder = std::dynamic_pointer_cast<GgmlOvDecoder>(ggml_graph_iterator->get_decoder()); + for (size_t inp = 0; inp < decoder->get_input_size(); ++inp) { + if (std::find(input_names.begin(), input_names.end(), decoder->get_input_name(inp)) != 
input_names.end()) { + auto input_data = decoder->get_input_ggml_tensor(inp)->data; + ov::Tensor input_tensor = ov::Tensor(decoder->get_input_type(inp), decoder->get_input_shape(inp).to_shape(), input_data); + input_tensors[decoder->get_input_name(inp)] = input_tensor; + } + } + } + return input_tensors; +} + +static ov::frontend::FrontEnd::Ptr get_ggml_frontend() { + ov::frontend::FrontEnd::Ptr front_end = nullptr; + auto fem = ov::frontend::FrontEndManager(); + std::string fe_so_path; +#ifdef GGML_OV_FRONTEND + fe_so_path = GGML_OV_FRONTEND; +#endif + fem.register_front_end("ggml", fe_so_path); + front_end = fem.load_by_framework("ggml"); + return front_end; +} + +enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { + ov::Core core; + auto devices = core.get_available_devices(); + // Get GGML Frontend + auto front_end = get_ggml_frontend(); + if (!front_end) { + GGML_LOG_ERROR("GGML FrontEnd is not initialized \n"); + return GGML_STATUS_FAILED; + } else { + #ifdef GGML_OPENVINO_DEBUG + GGML_LOG_INFO("GGML FrontEnd is initialized \n"); + #endif + } + + auto ggml_graph_iterator = get_ggml_graph_iterator(cgraph); + std::shared_ptr<GgmlOvGraphIterator> graph_iterator = ggml_graph_iterator; + + // Load GraphIterator -> InputModel + ov::frontend::InputModel::Ptr input_model = front_end->load(graph_iterator); + if (!input_model) { + GGML_LOG_ERROR("Input Model is not loaded \n"); + return GGML_STATUS_FAILED; + } else { + #ifdef GGML_OPENVINO_DEBUG + GGML_LOG_INFO("Input Model loaded \n"); + #endif + } + + // Convert InputModel -> ov::Model + std::shared_ptr<ov::Model> model = front_end->convert(input_model); + if (!model) { + GGML_LOG_ERROR("Model is not converted \n"); + return GGML_STATUS_FAILED; + } else { + #ifdef GGML_OPENVINO_DEBUG + GGML_LOG_INFO("Model converted \n"); + #endif + } + + + // Loading a model to the device + ov::CompiledModel compiled_model = core.compile_model(model); + + // Create infer request + ov::InferRequest infer_request = compiled_model.create_infer_request(); + + // Get input tensor + auto input_names = ggml_graph_iterator->get_input_names(); + auto input_tensors = get_ggml_graph_input_tensors(ggml_graph_iterator); + + // Set input tensor + for (size_t i = 0; i < input_names.size(); i++) { + infer_request.set_input_tensor(i, input_tensors[input_names[i]]); + } + + infer_request.infer(); + + ov::Tensor output_tensor = infer_request.get_output_tensor(); + // Put data in output tensor to the last node -> data in cgraph + // Get output type + ggml_tensor* dst = cgraph->nodes[cgraph->n_nodes - 1]; + std::memcpy(dst->data, output_tensor.data(), output_tensor.get_byte_size()); + #ifdef GGML_OPENVINO_DEBUG + GGML_LOG_INFO("Output: %f\n", *output_tensor.data<float>()); + #endif + + return GGML_STATUS_SUCCESS; + GGML_UNUSED(backend); +} diff --git a/ggml/src/ggml-openvino/utils.h b/ggml/src/ggml-openvino/utils.h new file mode 100644 index 0000000000..15dd46ed4e --- /dev/null +++ b/ggml/src/ggml-openvino/utils.h @@ -0,0 +1,6 @@ +#include "ggml-graph-iterator.h" +#include "ggml-backend-impl.h" + +std::shared_ptr<ov::frontend::tensorflow::ggml::GgmlOvGraphIterator> get_ggml_graph_iterator(struct ggml_cgraph * cgraph); + +enum ggml_status openvino_frontend_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph); From ee31dc1c1b60eecdffdf94fa9a6b4a30a9c07c36 Mon Sep 17 00:00:00 2001 From: zhanmyz Date: Thu, 5 Dec 2024 16:58:36 +0800 Subject: [PATCH 008/254] add get openvino available ops function --- ggml/src/ggml-openvino.cpp | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/ggml/src/ggml-openvino.cpp b/ggml/src/ggml-openvino.cpp 
index 34d692a8cf..c25a927c30 100644 --- a/ggml/src/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino.cpp @@ -458,6 +458,17 @@ static ggml_backend_buffer_t ggml_backend_openvino_device_buffer_from_host_ptr(g return nullptr; } +std::set get_openvino_available_opsets() { + ov::Core core; + std::set unique_ops; + for (const auto& opset : ov::get_available_opsets()) { + for (const auto& op : opset.second().get_type_info_set()) { + unique_ops.insert(op.name).second; + } + } + return unique_ops; +} + static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) { GGML_ASSERT(dev->reg != nullptr); // ggml_backend_openvino_device_context * dev_ctx = (ggml_backend_openvino_device_context *) dev->context; From 171c4681f44c4e59f30b388566498e6052e82983 Mon Sep 17 00:00:00 2001 From: yumengbo Date: Sat, 16 Nov 2024 12:52:19 +0800 Subject: [PATCH 009/254] Add PoC of integration of openvino frontend. Main changes: ggml-ov-frontend-utils, GraphIterator, Decoder --- ggml/src/ggml-openvino.cpp | 2 +- .../ggml-openvino/ggml-ov-frontend-utils.cpp | 54 +++++++++++++++++++ .../ggml-openvino/ggml-ov-frontend-utils.h | 6 +++ 3 files changed, 61 insertions(+), 1 deletion(-) create mode 100644 ggml/src/ggml-openvino/ggml-ov-frontend-utils.cpp create mode 100644 ggml/src/ggml-openvino/ggml-ov-frontend-utils.h diff --git a/ggml/src/ggml-openvino.cpp b/ggml/src/ggml-openvino.cpp index c25a927c30..c33e3f2be0 100644 --- a/ggml/src/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino.cpp @@ -487,7 +487,7 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con case GGML_OP_MUL_MAT: return false; default: - return false; + return true; } } diff --git a/ggml/src/ggml-openvino/ggml-ov-frontend-utils.cpp b/ggml/src/ggml-openvino/ggml-ov-frontend-utils.cpp new file mode 100644 index 0000000000..f1b865aacf --- /dev/null +++ b/ggml/src/ggml-openvino/ggml-ov-frontend-utils.cpp @@ -0,0 +1,54 @@ +#include "ggml-ov-frontend-utils.h" +#include "ggml-backend-impl.h" +#include + +using ov::frontend::tensorflow::ggml::GgmlOvGraphIterator; + +std::shared_ptr get_ggml_graph_iterator(struct ggml_cgraph * cgraph) { + return std::make_shared(cgraph); +} + +static ov::frontend::FrontEnd::Ptr get_ggml_frontend() { + ov::frontend::FrontEnd::Ptr front_end = nullptr; + auto fem = ov::frontend::FrontEndManager(); + std::string fe_so_path = "/home/yumeng/Code/ov-ggml-frontend/openvino/bin/intel64/Release/libopenvino_ggml_frontend.so"; + fem.register_front_end("ggml", fe_so_path); + front_end = fem.load_by_framework("ggml"); + return front_end; +} + +enum ggml_status openvino_frontend_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph) { + // Get GGML Frontend + auto front_end = get_ggml_frontend(); + if (!front_end) { + GGML_LOG_ERROR("GGML FrontEnd is not initialized \n"); + return GGML_STATUS_FAILED; + } else { + GGML_LOG_ERROR("GGML FrontEnd is initialized \n"); + } + + auto ggml_graph_iterator = get_ggml_graph_iterator(cgraph); + std::shared_ptr graph_iterator = ggml_graph_iterator; + GGML_LOG_ERROR("Decoder count in current GraphIterator: "); + GGML_LOG_ERROR(std::to_string(graph_iterator->size()).c_str()); + + // Load GraphIterator -> InputModel + ov::frontend::InputModel::Ptr input_model = front_end->load(graph_iterator); + if (!input_model) { + GGML_LOG_ERROR("\nInput Model is not loaded \n"); + return GGML_STATUS_FAILED; + } else { + GGML_LOG_ERROR("\nInput Model loaded \n"); + } + + // TODO: Convert InputModel -> ov::Model + // std::shared_ptr model = 
front_end->convert(input_model); + // if (!model) { + // GGML_LOG_ERROR("Model is not converted"); + // } + + // TODO: Compute + + return GGML_STATUS_SUCCESS; + GGML_UNUSED(backend); +} diff --git a/ggml/src/ggml-openvino/ggml-ov-frontend-utils.h b/ggml/src/ggml-openvino/ggml-ov-frontend-utils.h new file mode 100644 index 0000000000..15dd46ed4e --- /dev/null +++ b/ggml/src/ggml-openvino/ggml-ov-frontend-utils.h @@ -0,0 +1,6 @@ +#include "ggml-graph-iterator.h" +#include "ggml-backend-impl.h" + +std::shared_ptr get_ggml_graph_iterator(struct ggml_cgraph * cgraph); + +enum ggml_status openvino_frontend_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph); From 34e826ac144586e41c1e2a25f50517361c4cadcf Mon Sep 17 00:00:00 2001 From: yumengbo Date: Tue, 19 Nov 2024 10:25:31 +0800 Subject: [PATCH 010/254] Implement GgmlOvDecoder. Add dump functions. --- ggml/src/ggml-openvino/decoder.h | 2 ++ ggml/src/ggml-openvino/ggml-decoder.h | 2 ++ ggml/src/ggml-openvino/ggml-ov-frontend-utils.cpp | 3 +-- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-openvino/decoder.h b/ggml/src/ggml-openvino/decoder.h index d2ef7587b8..e047235d88 100644 --- a/ggml/src/ggml-openvino/decoder.h +++ b/ggml/src/ggml-openvino/decoder.h @@ -39,6 +39,8 @@ public: virtual std::string& get_output_name(size_t index) const = 0; + virtual size_t get_output_size() const = 0; + virtual const std::string& get_op_type() const = 0; virtual const std::string& get_op_name() const = 0; diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index 3048e2e7e9..96398d3f83 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -45,6 +45,8 @@ public: virtual std::string& get_output_name(size_t index) const override; + size_t get_output_size() const override; + virtual const std::string& get_op_type() const override; virtual const std::string& get_op_name() const override; diff --git a/ggml/src/ggml-openvino/ggml-ov-frontend-utils.cpp b/ggml/src/ggml-openvino/ggml-ov-frontend-utils.cpp index f1b865aacf..fd5921b476 100644 --- a/ggml/src/ggml-openvino/ggml-ov-frontend-utils.cpp +++ b/ggml/src/ggml-openvino/ggml-ov-frontend-utils.cpp @@ -29,8 +29,7 @@ enum ggml_status openvino_frontend_compute (ggml_backend_t backend, struct ggml_ auto ggml_graph_iterator = get_ggml_graph_iterator(cgraph); std::shared_ptr graph_iterator = ggml_graph_iterator; - GGML_LOG_ERROR("Decoder count in current GraphIterator: "); - GGML_LOG_ERROR(std::to_string(graph_iterator->size()).c_str()); + GGML_LOG_ERROR("Decoder count in current GraphIterator: %s\n", std::to_string(graph_iterator->size()).c_str()); // Load GraphIterator -> InputModel ov::frontend::InputModel::Ptr input_model = front_end->load(graph_iterator); From 9b7b63d12c49a5d74c92c173bbe010c656abbd85 Mon Sep 17 00:00:00 2001 From: yumengbo Date: Fri, 22 Nov 2024 13:10:14 +0800 Subject: [PATCH 011/254] Convert subgraph with add, sub, mul, div op to ov model and do infer on openvino device --- ggml/src/ggml-openvino.cpp | 3 +- ggml/src/ggml-openvino/decoder.h | 4 + ggml/src/ggml-openvino/ggml-decoder.h | 6 +- .../ggml-openvino/ggml-ov-frontend-utils.cpp | 73 ++++++++++++++++--- 4 files changed, 75 insertions(+), 11 deletions(-) diff --git a/ggml/src/ggml-openvino.cpp b/ggml/src/ggml-openvino.cpp index c33e3f2be0..ea12c05ac7 100644 --- a/ggml/src/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino.cpp @@ -268,6 +268,7 @@ static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backe return 
GGML_STATUS_SUCCESS; GGML_UNUSED(backend); + GGML_UNUSED(ctx); } static const ggml_backend_i ggml_backend_openvino_interface = { @@ -487,7 +488,7 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con case GGML_OP_MUL_MAT: return false; default: - return true; + return false; } } diff --git a/ggml/src/ggml-openvino/decoder.h b/ggml/src/ggml-openvino/decoder.h index e047235d88..be943716f2 100644 --- a/ggml/src/ggml-openvino/decoder.h +++ b/ggml/src/ggml-openvino/decoder.h @@ -41,6 +41,10 @@ public: virtual size_t get_output_size() const = 0; + virtual bool is_graph_output(size_t index) const = 0; + + virtual std::string& get_output_name(size_t index) const = 0; + virtual const std::string& get_op_type() const = 0; virtual const std::string& get_op_name() const = 0; diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index 96398d3f83..1eaba59426 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -45,7 +45,11 @@ public: virtual std::string& get_output_name(size_t index) const override; - size_t get_output_size() const override; + virtual size_t get_output_size() const override; + + virtual bool is_graph_output(size_t index) const override; + + virtual std::string& get_output_name(size_t index) const override; virtual const std::string& get_op_type() const override; diff --git a/ggml/src/ggml-openvino/ggml-ov-frontend-utils.cpp b/ggml/src/ggml-openvino/ggml-ov-frontend-utils.cpp index fd5921b476..10107cbfd0 100644 --- a/ggml/src/ggml-openvino/ggml-ov-frontend-utils.cpp +++ b/ggml/src/ggml-openvino/ggml-ov-frontend-utils.cpp @@ -1,6 +1,7 @@ #include "ggml-ov-frontend-utils.h" #include "ggml-backend-impl.h" #include +#include using ov::frontend::tensorflow::ggml::GgmlOvGraphIterator; @@ -8,9 +9,27 @@ std::shared_ptr get_ggml_graph_iterator(struct ggml_cgraph return std::make_shared(cgraph); } +std::vector get_ggml_graph_input_tensors(std::shared_ptr ggml_graph_iterator) { + std::vector input_tensors; + auto input_names = ggml_graph_iterator->get_input_names(); + ggml_graph_iterator->reset(); + for (; !ggml_graph_iterator->is_end(); ggml_graph_iterator->next()) { + auto decoder = std::dynamic_pointer_cast(ggml_graph_iterator->get_decoder()); + for (size_t inp = 0; inp < decoder->get_input_size(); ++inp) { + if (std::find(input_names.begin(), input_names.end(), decoder->get_input_name(inp)) != input_names.end()) { + auto input_data = decoder->get_input_ggml_tensor(inp)->data; + ov::Tensor input_tensor = ov::Tensor(decoder->get_input_type(inp), decoder->get_input_shape(inp).to_shape(), input_data); + input_tensors.push_back(input_tensor); + } + } + } + return input_tensors; +} + static ov::frontend::FrontEnd::Ptr get_ggml_frontend() { ov::frontend::FrontEnd::Ptr front_end = nullptr; auto fem = ov::frontend::FrontEndManager(); + // std::string fe_so_path = "/home/yumeng/Code/test/openvino/bin/intel64/Release/libopenvino_ggml_frontend.so"; std::string fe_so_path = "/home/yumeng/Code/ov-ggml-frontend/openvino/bin/intel64/Release/libopenvino_ggml_frontend.so"; fem.register_front_end("ggml", fe_so_path); front_end = fem.load_by_framework("ggml"); @@ -18,36 +37,72 @@ static ov::frontend::FrontEnd::Ptr get_ggml_frontend() { } enum ggml_status openvino_frontend_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph) { + ov::Core core; + auto devices = core.get_available_devices(); + #ifdef GGML_OPENVINO_DEBUG + GGML_LOG_INFO("Device numbers: %d\n", devices.size()); + #endif // Get GGML 
Frontend auto front_end = get_ggml_frontend(); if (!front_end) { GGML_LOG_ERROR("GGML FrontEnd is not initialized \n"); return GGML_STATUS_FAILED; } else { - GGML_LOG_ERROR("GGML FrontEnd is initialized \n"); + #ifdef GGML_OPENVINO_DEBUG + GGML_LOG_INFO("GGML FrontEnd is initialized \n"); + #endif } auto ggml_graph_iterator = get_ggml_graph_iterator(cgraph); std::shared_ptr graph_iterator = ggml_graph_iterator; - GGML_LOG_ERROR("Decoder count in current GraphIterator: %s\n", std::to_string(graph_iterator->size()).c_str()); // Load GraphIterator -> InputModel ov::frontend::InputModel::Ptr input_model = front_end->load(graph_iterator); if (!input_model) { - GGML_LOG_ERROR("\nInput Model is not loaded \n"); + GGML_LOG_ERROR("Input Model is not loaded \n"); return GGML_STATUS_FAILED; } else { - GGML_LOG_ERROR("\nInput Model loaded \n"); + #ifdef GGML_OPENVINO_DEBUG + GGML_LOG_INFO("Input Model loaded \n"); + #endif } // TODO: Convert InputModel -> ov::Model - // std::shared_ptr model = front_end->convert(input_model); - // if (!model) { - // GGML_LOG_ERROR("Model is not converted"); - // } + std::shared_ptr model = front_end->convert(input_model); + if (!model) { + GGML_LOG_ERROR("Model is not converted \n"); + } else { + #ifdef GGML_OPENVINO_DEBUG + GGML_LOG_INFO("Model converted \n"); + #endif + } - // TODO: Compute + // Loading a model to the device + ov::CompiledModel compiled_model = core.compile_model(model); + + // Create infer request + ov::InferRequest infer_request = compiled_model.create_infer_request(); + + // Get input tensor + auto input_tensor = get_ggml_graph_input_tensors(ggml_graph_iterator); + + // Set input tensor + for (size_t i = 0; i < input_tensor.size(); i++) { + infer_request.set_input_tensor(i, input_tensor[i]); + } + + infer_request.infer(); + + ov::Tensor output_tensor = infer_request.get_output_tensor(); + // Put data in output tensor to the last node -> data in cgraph + // Get output type + ggml_tensor* dst = cgraph->nodes[cgraph->n_nodes - 1]; + std::memcpy(dst->data, output_tensor.data(), output_tensor.get_byte_size()); + #ifdef GGML_OPENVINO_DEBUG + GGML_LOG_INFO("%f\n", *output_tensor.data()); + #endif + return GGML_STATUS_SUCCESS; GGML_UNUSED(backend); } From 31bd816426d42a27896d8ec66689852707e24c4f Mon Sep 17 00:00:00 2001 From: yumengbo Date: Sat, 23 Nov 2024 06:03:08 +0800 Subject: [PATCH 012/254] Add GGML_OV_FRONTEND option. Add readme. 
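
The frontend library path is no longer hardcoded: get_ggml_frontend() now reads it from the GGML_OV_FRONTEND compile definition, so each developer can point the build at their own libopenvino_ggml_frontend.so. A minimal sketch of the resulting lookup, assuming the define carries the full path to the .so and that an empty path makes the frontend fail to load (the caller already checks this via if (!front_end)):

    // fe_so_path stays empty unless the build passes
    // -DGGML_OV_FRONTEND="/path/to/libopenvino_ggml_frontend.so"
    std::string fe_so_path;
    #ifdef GGML_OV_FRONTEND
    fe_so_path = GGML_OV_FRONTEND;
    #endif
    ov::frontend::FrontEndManager fem;
    fem.register_front_end("ggml", fe_so_path);
    auto front_end = fem.load_by_framework("ggml"); // null on failure, reported as GGML_STATUS_FAILED
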
--- .../ggml-openvino/ggml-ov-frontend-utils.cpp | 108 ------------------ .../ggml-openvino/ggml-ov-frontend-utils.h | 6 - 2 files changed, 114 deletions(-) delete mode 100644 ggml/src/ggml-openvino/ggml-ov-frontend-utils.cpp delete mode 100644 ggml/src/ggml-openvino/ggml-ov-frontend-utils.h diff --git a/ggml/src/ggml-openvino/ggml-ov-frontend-utils.cpp b/ggml/src/ggml-openvino/ggml-ov-frontend-utils.cpp deleted file mode 100644 index 10107cbfd0..0000000000 --- a/ggml/src/ggml-openvino/ggml-ov-frontend-utils.cpp +++ /dev/null @@ -1,108 +0,0 @@ -#include "ggml-ov-frontend-utils.h" -#include "ggml-backend-impl.h" -#include -#include - -using ov::frontend::tensorflow::ggml::GgmlOvGraphIterator; - -std::shared_ptr get_ggml_graph_iterator(struct ggml_cgraph * cgraph) { - return std::make_shared(cgraph); -} - -std::vector get_ggml_graph_input_tensors(std::shared_ptr ggml_graph_iterator) { - std::vector input_tensors; - auto input_names = ggml_graph_iterator->get_input_names(); - ggml_graph_iterator->reset(); - for (; !ggml_graph_iterator->is_end(); ggml_graph_iterator->next()) { - auto decoder = std::dynamic_pointer_cast(ggml_graph_iterator->get_decoder()); - for (size_t inp = 0; inp < decoder->get_input_size(); ++inp) { - if (std::find(input_names.begin(), input_names.end(), decoder->get_input_name(inp)) != input_names.end()) { - auto input_data = decoder->get_input_ggml_tensor(inp)->data; - ov::Tensor input_tensor = ov::Tensor(decoder->get_input_type(inp), decoder->get_input_shape(inp).to_shape(), input_data); - input_tensors.push_back(input_tensor); - } - } - } - return input_tensors; -} - -static ov::frontend::FrontEnd::Ptr get_ggml_frontend() { - ov::frontend::FrontEnd::Ptr front_end = nullptr; - auto fem = ov::frontend::FrontEndManager(); - // std::string fe_so_path = "/home/yumeng/Code/test/openvino/bin/intel64/Release/libopenvino_ggml_frontend.so"; - std::string fe_so_path = "/home/yumeng/Code/ov-ggml-frontend/openvino/bin/intel64/Release/libopenvino_ggml_frontend.so"; - fem.register_front_end("ggml", fe_so_path); - front_end = fem.load_by_framework("ggml"); - return front_end; -} - -enum ggml_status openvino_frontend_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph) { - ov::Core core; - auto devices = core.get_available_devices(); - #ifdef GGML_OPENVINO_DEBUG - GGML_LOG_INFO("Device numbers: %d\n", devices.size()); - #endif - // Get GGML Frontend - auto front_end = get_ggml_frontend(); - if (!front_end) { - GGML_LOG_ERROR("GGML FrontEnd is not initialized \n"); - return GGML_STATUS_FAILED; - } else { - #ifdef GGML_OPENVINO_DEBUG - GGML_LOG_INFO("GGML FrontEnd is initialized \n"); - #endif - } - - auto ggml_graph_iterator = get_ggml_graph_iterator(cgraph); - std::shared_ptr graph_iterator = ggml_graph_iterator; - - // Load GraphIterator -> InputModel - ov::frontend::InputModel::Ptr input_model = front_end->load(graph_iterator); - if (!input_model) { - GGML_LOG_ERROR("Input Model is not loaded \n"); - return GGML_STATUS_FAILED; - } else { - #ifdef GGML_OPENVINO_DEBUG - GGML_LOG_INFO("Input Model loaded \n"); - #endif - } - - // TODO: Convert InputModel -> ov::Model - std::shared_ptr model = front_end->convert(input_model); - if (!model) { - GGML_LOG_ERROR("Model is not converted \n"); - } else { - #ifdef GGML_OPENVINO_DEBUG - GGML_LOG_INFO("Model converted \n"); - #endif - } - - - // Loading a model to the device - ov::CompiledModel compiled_model = core.compile_model(model); - - // Create infer request - ov::InferRequest infer_request = compiled_model.create_infer_request(); 
- - // Get input tensor - auto input_tensor = get_ggml_graph_input_tensors(ggml_graph_iterator); - - // Set input tensor - for (size_t i = 0; i < input_tensor.size(); i++) { - infer_request.set_input_tensor(i, input_tensor[i]); - } - - infer_request.infer(); - - ov::Tensor output_tensor = infer_request.get_output_tensor(); - // Put data in output tensor to the last node -> data in cgraph - // Get output type - ggml_tensor* dst = cgraph->nodes[cgraph->n_nodes - 1]; - std::memcpy(dst->data, output_tensor.data(), output_tensor.get_byte_size()); - #ifdef GGML_OPENVINO_DEBUG - GGML_LOG_INFO("%f\n", *output_tensor.data()); - #endif - - return GGML_STATUS_SUCCESS; - GGML_UNUSED(backend); -} diff --git a/ggml/src/ggml-openvino/ggml-ov-frontend-utils.h b/ggml/src/ggml-openvino/ggml-ov-frontend-utils.h deleted file mode 100644 index 15dd46ed4e..0000000000 --- a/ggml/src/ggml-openvino/ggml-ov-frontend-utils.h +++ /dev/null @@ -1,6 +0,0 @@ -#include "ggml-graph-iterator.h" -#include "ggml-backend-impl.h" - -std::shared_ptr get_ggml_graph_iterator(struct ggml_cgraph * cgraph); - -enum ggml_status openvino_frontend_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph); From 5b46dc23be800ce6794655dce1b2da06e0cc2d9f Mon Sep 17 00:00:00 2001 From: yumengbo Date: Fri, 6 Dec 2024 07:37:58 +0800 Subject: [PATCH 013/254] Change output for infer request to set output tensor. Support scale, view op. --- ggml/src/ggml-openvino/ggml-decoder.cpp | 43 ++++++++++++------- ggml/src/ggml-openvino/ggml-decoder.h | 4 ++ .../src/ggml-openvino/ggml-graph-iterator.cpp | 27 ++++++------ ggml/src/ggml-openvino/utils.cpp | 41 ++++++++++++++---- 4 files changed, 78 insertions(+), 37 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 4d82c756cd..b367987372 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -10,13 +10,21 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor * node, struct ggml_cgraph * cgr // Unary OPs case GGML_OP_UNARY: case GGML_OP_RESHAPE: - case GGML_OP_VIEW: + case GGML_OP_TRANSPOSE: + case GGML_OP_PERMUTE: + case GGML_OP_CONT: + case GGML_OP_CPY: + case GGML_OP_RMS_NORM: { m_inputs.push_back(m_node->src[0]); m_outputs.push_back(m_node); - #ifdef GGML_OPENVINO_DEBUG - GGML_LOG_INFO("Decoder input 0: %f \n", *(float*)(m_node->src[0]->data)); - #endif + break; + } + // For view, input is m_node itself + case GGML_OP_VIEW: + { + m_inputs.push_back(m_node); + m_outputs.push_back(m_node); break; } // SCALE @@ -24,12 +32,6 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor * node, struct ggml_cgraph * cgr { m_inputs.push_back(m_node->src[0]); m_outputs.push_back(m_node); - #ifdef GGML_OPENVINO_DEBUG - float v; - memcpy(&v, m_node->op_params, sizeof(float)); - GGML_LOG_INFO("Decoder input 0: %f \n", *(float*)(m_node->src[0]->data)); - GGML_LOG_INFO("Scale: %f \n", v); - #endif break; } // OPs with 2 inputs @@ -39,14 +41,20 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor * node, struct ggml_cgraph * cgr case GGML_OP_MUL_MAT: case GGML_OP_SUB: case GGML_OP_GET_ROWS: + case GGML_OP_SOFT_MAX: { m_inputs.push_back(m_node->src[0]); m_inputs.push_back(m_node->src[1]); m_outputs.push_back(m_node); - #ifdef GGML_OPENVINO_DEBUG - GGML_LOG_INFO("Decoder input 0: %f \n", *(float*)(m_node->src[0]->data)); - GGML_LOG_INFO("Decoder input 1: %f \n", *(float*)(m_node->src[1]->data)); - #endif + break; + } + // OPs with 3 inputs: + case GGML_OP_ROPE: + { + m_inputs.push_back(m_node->src[0]); + 
m_inputs.push_back(m_node->src[1]); + m_inputs.push_back(m_node->src[2]); // ??? + m_outputs.push_back(m_node); break; } default: @@ -130,7 +138,6 @@ ov::PartialShape GgmlOvDecoder::get_output_shape(size_t index) const { ov::element::Type GgmlOvDecoder::get_output_type(size_t index) const { // TODO: Change to Output ov::element::Type type = ov::element::dynamic; - // GGML_LOG_DEBUG("%d\n", m_outputs[index]->type); switch (m_outputs[index]->type) { case GGML_TYPE_F32: type = ov::element::f32; @@ -179,6 +186,8 @@ const std::string& GgmlOvDecoder::get_op_type() const { {GGML_OP_ACC, "GGML_OP_ACC"}, {GGML_OP_ADD, "GGML_OP_ADD"}, {GGML_OP_ADD1, "GGML_OP_ADD1"}, + {GGML_OP_CONT, "GGML_OP_CONT"}, + {GGML_OP_CPY, "GGML_OP_CPY"}, {GGML_OP_DIV, "GGML_OP_DIV"}, {GGML_OP_DUP, "GGML_OP_DUP"}, {GGML_OP_GET_ROWS, "GGML_OP_GET_ROWS"}, @@ -186,8 +195,12 @@ const std::string& GgmlOvDecoder::get_op_type() const { {GGML_OP_MUL_MAT, "GGML_OP_MUL_MAT"}, {GGML_OP_PERMUTE, "GGML_OP_PERMUTE"}, {GGML_OP_RESHAPE, "GGML_OP_RESHAPE"}, + {GGML_OP_RMS_NORM, "GGML_OP_RMS_NORM"}, + {GGML_OP_ROPE, "GGML_OP_ROPE"}, {GGML_OP_SCALE, "GGML_OP_SCALE"}, + {GGML_OP_SOFT_MAX, "GGML_OP_SOFT_MAX"}, {GGML_OP_SUB, "GGML_OP_SUB"}, + {GGML_OP_TRANSPOSE, "GGML_OP_TRANSPOSE"}, {GGML_OP_UNARY, "GGML_OP_UNARY"}, {GGML_OP_VIEW, "GGML_OP_VIEW"} }; diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index 1eaba59426..ceae589ed4 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -59,6 +59,10 @@ public: return m_inputs[index]; } + const ggml_tensor* get_output_ggml_tensor(size_t index) const { + return m_outputs[index]; + } + // virtual const std::vector& outputs() const override; // virtual size_t output(size_t index) const override; diff --git a/ggml/src/ggml-openvino/ggml-graph-iterator.cpp b/ggml/src/ggml-openvino/ggml-graph-iterator.cpp index 17a9b7ecfe..44e119a1ac 100644 --- a/ggml/src/ggml-openvino/ggml-graph-iterator.cpp +++ b/ggml/src/ggml-openvino/ggml-graph-iterator.cpp @@ -15,16 +15,17 @@ GgmlOvGraphIterator::GgmlOvGraphIterator(struct ggml_cgraph * cgraph) #endif } - void GgmlOvGraphIterator::initialize_decoders() { +void GgmlOvGraphIterator::initialize_decoders() { auto nodes_size = m_cgraph->n_nodes; // Initialize decoder for each node // m_decoders.resize(static_cast(nodes_size)); for (int i = 0; i < nodes_size; ++i) { // Skip View Op - if (m_cgraph->nodes[i] ->op == GGML_OP_VIEW || m_cgraph->nodes[i] ->op == GGML_OP_PERMUTE) { - continue; - } + // if (m_cgraph->nodes[i] ->op == GGML_OP_PERMUTE + // || m_cgraph->nodes[i] ->op == GGML_OP_CPY ) { + // continue; + // } auto decoder = std::make_shared(m_cgraph->nodes[i], m_cgraph); m_decoders.push_back(decoder); for (size_t inp = 0; inp < decoder->get_input_size(); ++inp) { @@ -33,9 +34,9 @@ GgmlOvGraphIterator::GgmlOvGraphIterator(struct ggml_cgraph * cgraph) // } } for (size_t inp = 0; inp < decoder->get_output_size(); ++inp) { - if (i == nodes_size - 1 || decoder->is_graph_output(inp)) { + // if (i == nodes_size - 1 || decoder->is_graph_output(inp)) { m_output_names.push_back(decoder->get_output_name(inp)); - } + // } } } @@ -71,20 +72,20 @@ std::vector GgmlOvGraphIterator::get_output_names() const { void GgmlOvGraphIterator::dump_graph_iterator() const { for (size_t i = 0; i < m_decoders.size(); ++i) { - GGML_LOG_INFO("OP %zu: %s\n", i, m_decoders[i]->get_op_name().c_str()); + GGML_LOG_INFO("\nOP %zu: %s\n", i, m_decoders[i]->get_op_name().c_str()); for (size_t inp = 0; inp < 
m_decoders[i]->get_input_size(); ++inp) { ov::PartialShape pshape = std::dynamic_pointer_cast(m_decoders[i])->get_input_shape(inp); ov::element::Type ptype = std::dynamic_pointer_cast(m_decoders[i])->get_input_type(inp); - GGML_LOG_INFO("Input name: %s\n", std::dynamic_pointer_cast(m_decoders[i])->get_input_name(inp).c_str()); - GGML_LOG_INFO("Input shape: %s\n", pshape.to_string().c_str()); - GGML_LOG_INFO("Input type: %s\n", ptype.to_string().c_str()); + GGML_LOG_INFO("- Input name: %s\n", std::dynamic_pointer_cast(m_decoders[i])->get_input_name(inp).c_str()); + GGML_LOG_INFO(" Input shape: %s\n", pshape.to_string().c_str()); + GGML_LOG_INFO(" Input type: %s\n", ptype.to_string().c_str()); } for (size_t outp = 0; outp < std::dynamic_pointer_cast(m_decoders[i])->get_output_size(); ++outp) { ov::PartialShape pshape = std::dynamic_pointer_cast(m_decoders[i])->get_output_shape(outp); ov::element::Type ptype = std::dynamic_pointer_cast(m_decoders[i])->get_output_type(outp); - GGML_LOG_INFO("Output name: %s\n", std::dynamic_pointer_cast(m_decoders[i])->get_output_name(outp).c_str()); - GGML_LOG_INFO("Output shape: %s\n", pshape.to_string().c_str()); - GGML_LOG_INFO("Output type: %s\n", ptype.to_string().c_str()); + GGML_LOG_INFO("- Output name: %s\n", std::dynamic_pointer_cast(m_decoders[i])->get_output_name(outp).c_str()); + GGML_LOG_INFO(" Output shape: %s\n", pshape.to_string().c_str()); + GGML_LOG_INFO(" Output type: %s\n", ptype.to_string().c_str()); } } diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 905e2f4197..db52b1f81d 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -18,6 +18,9 @@ std::map get_ggml_graph_input_tensors(std::shared_ptrget_input_size(); ++inp) { if (std::find(input_names.begin(), input_names.end(), decoder->get_input_name(inp)) != input_names.end()) { auto input_data = decoder->get_input_ggml_tensor(inp)->data; + #ifdef GGML_OPENVINO_DEBUG + printf("Subgraph input %d: %g\n", inp, *(double*)(input_data)); + #endif ov::Tensor input_tensor = ov::Tensor(decoder->get_input_type(inp), decoder->get_input_shape(inp).to_shape(), input_data); input_tensors[decoder->get_input_name(inp)] = input_tensor; } @@ -26,6 +29,27 @@ std::map get_ggml_graph_input_tensors(std::shared_ptr get_ggml_graph_output_tensors(std::shared_ptr ggml_graph_iterator) { + std::map output_tensors; + auto output_names = ggml_graph_iterator->get_output_names(); + ggml_graph_iterator->reset(); + for (; !ggml_graph_iterator->is_end(); ggml_graph_iterator->next()) { + auto decoder = std::dynamic_pointer_cast(ggml_graph_iterator->get_decoder()); + for (size_t inp = 0; inp < decoder->get_output_size(); ++inp) { + if (std::find(output_names.begin(), output_names.end(), decoder->get_output_name(inp)) != output_names.end()) { + auto output_data = decoder->get_output_ggml_tensor(inp)->data; + #ifdef GGML_OPENVINO_DEBUG + printf("Output %d: %g\n", inp, *(double*)(output_data)); + #endif + ov::Tensor output_tensor = ov::Tensor(decoder->get_output_type(inp), decoder->get_output_shape(inp).to_shape(), output_data); + output_tensors[decoder->get_output_name(inp)] = output_tensor; + } + } + } + return output_tensors; +} + + static ov::frontend::FrontEnd::Ptr get_ggml_frontend() { ov::frontend::FrontEnd::Ptr front_end = nullptr; auto fem = ov::frontend::FrontEndManager(); @@ -92,16 +116,15 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c infer_request.set_input_tensor(i, input_tensors[input_names[i]]); } - 
infer_request.infer(); + // Set output tensor - ov::Tensor output_tensor = infer_request.get_output_tensor(); - // Put data in output tensor to the last node -> data in cgraph - // Get output type - ggml_tensor* dst = cgraph->nodes[cgraph->n_nodes - 1]; - std::memcpy(dst->data, output_tensor.data(), output_tensor.get_byte_size()); - #ifdef GGML_OPENVINO_DEBUG - GGML_LOG_INFO("Output: %f\n", *output_tensor.data<float>()); - #endif + auto output_names = ggml_graph_iterator->get_output_names(); + auto output_tensors = get_ggml_graph_output_tensors(ggml_graph_iterator); + for (size_t i = 0; i < output_names.size(); i++) { + infer_request.set_output_tensor(i, output_tensors[output_names[i]]); + } + + infer_request.infer(); return GGML_STATUS_SUCCESS; GGML_UNUSED(backend); From 49804f43fcf2cdfb5d0424dd334a13240b3b11b2 Mon Sep 17 00:00:00 2001 From: zhanmyz Date: Mon, 9 Dec 2024 10:09:13 +0800 Subject: [PATCH 014/254] Add a GET_ROWS operator implemented with OpenVINO to the ggml backend of llama.cpp --- ggml/src/ggml-openvino.cpp | 146 ++++++++++++++++++++++++++++++------- 1 file changed, 120 insertions(+), 26 deletions(-) diff --git a/ggml/src/ggml-openvino.cpp b/ggml/src/ggml-openvino.cpp index ea12c05ac7..0a1e969c9f 100644 --- a/ggml/src/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino.cpp @@ -234,36 +234,130 @@ static void ggml_backend_openvino_mul(ggml_tensor * dst) { } } +void ggml_compute_forward_get_rows_f16(struct ggml_tensor *dst) { + const struct ggml_tensor *src0 = dst->src[0]; + const struct ggml_tensor *src1 = dst->src[1]; + + ov::Core core; + + ov::Shape shape0 = {static_cast<size_t>(src0->ne[1]), static_cast<size_t>(src0->ne[0])}; // [3072, 7] + ov::Shape shape1 = {static_cast<size_t>(src1->ne[0])}; // [7] + + ov::Tensor tensor0(ov::element::f16, shape0, src0->data); + ov::Tensor tensor1(ov::element::i32, shape1, src1->data); + + auto input0 = std::make_shared<ov::op::v0::Parameter>(ov::element::f16, shape0); + auto input1 = std::make_shared<ov::op::v0::Parameter>(ov::element::i32, shape1); + + auto gather = std::make_shared<ov::op::v8::Gather>(input0, input1, ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {0})); + + auto model = std::make_shared<ov::Model>(gather, ov::ParameterVector{input0, input1}); + ov::CompiledModel compiled_model = core.compile_model(model, "CPU"); + + ov::InferRequest infer_request = compiled_model.create_infer_request(); + infer_request.set_tensor(input0, tensor0); + infer_request.set_tensor(input1, tensor1); + + infer_request.infer(); + + ov::Tensor output_tensor = infer_request.get_output_tensor(); + // Convert output tensor data type from f16 to f32 + ov::Tensor output_tensor_f32 = ov::Tensor(ov::element::f32, output_tensor.get_shape()); + for (size_t i = 0; i < output_tensor.get_size(); ++i) { + output_tensor_f32.data<float>()[i] = static_cast<float>(output_tensor.data<ov::float16>()[i]); + } + + // Copy the converted data to dst->data + std::memcpy(dst->data, output_tensor_f32.data(), output_tensor_f32.get_byte_size()); +} + +void ggml_compute_forward_get_rows_f32(struct ggml_tensor *dst) { + const struct ggml_tensor *src0 = dst->src[0]; + const struct ggml_tensor *src1 = dst->src[1]; + + ov::Core core; + + ov::Shape shape0 = {static_cast<size_t>(src0->ne[1]), static_cast<size_t>(src0->ne[0])}; // [3072, 7] + ov::Shape shape1 = {static_cast<size_t>(src1->ne[0])}; // [7] + + ov::Tensor tensor0(ov::element::f32, shape0, src0->data); + ov::Tensor tensor1(ov::element::i32, shape1, src1->data); + + auto input0 = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, shape0); + auto input1 = std::make_shared<ov::op::v0::Parameter>(ov::element::i32, shape1); + + auto gather = std::make_shared<ov::op::v8::Gather>(input0, input1, ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {0})); + + auto model = std::make_shared<ov::Model>(gather, ov::ParameterVector{input0, input1}); + ov::CompiledModel compiled_model = core.compile_model(model, "CPU"); + + ov::InferRequest infer_request = compiled_model.create_infer_request(); + infer_request.set_tensor(input0, tensor0); + infer_request.set_tensor(input1, tensor1); + + infer_request.infer(); + + ov::Tensor output_tensor = infer_request.get_output_tensor(); + + // Copy the converted data to dst->data + std::memcpy(dst->data, output_tensor.data(), output_tensor.get_byte_size()); +} + +void ggml_compute_forward_get_rows(struct ggml_tensor *dst) { + const struct ggml_tensor *src0 = dst->src[0]; + const struct ggml_tensor *src1 = dst->src[1]; + + switch (src0->type) { + case GGML_TYPE_F16: + { + ggml_compute_forward_get_rows_f16(dst); + } break; + case GGML_TYPE_F32: + { + ggml_compute_forward_get_rows_f32(dst); + } break; + default: + { + GGML_ABORT("fatal error"); + } + } + +} + static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { - // for (int i = 0; i < cgraph->n_nodes; i++) { - // struct ggml_tensor * node = cgraph->nodes[i]; + for (int i = 0; i < cgraph->n_nodes; i++) { + struct ggml_tensor * node = cgraph->nodes[i]; - // if (node->op == GGML_OP_NONE || ggml_is_empty(node)) { - // return GGML_STATUS_SUCCESS; - // } + if (node->op == GGML_OP_NONE || ggml_is_empty(node)) { + return GGML_STATUS_SUCCESS; + } - // switch (node->op) { - // case GGML_OP_PERMUTE: - // case GGML_OP_RESHAPE: - // case GGML_OP_TRANSPOSE: - // case GGML_OP_VIEW: - // break; - // case GGML_OP_ADD: - // { - // ggml_backend_openvino_add(node); - // } break; - // case GGML_OP_MUL: - // { - // ggml_backend_openvino_mul(node); - // } break; - // case GGML_OP_MUL_MAT: - // break; - // default: - // GGML_ABORT("%s: unsupported op %s\n", __func__, ggml_op_desc(node)); - // } - // } + switch (node->op) { + case GGML_OP_PERMUTE: + case GGML_OP_RESHAPE: + case GGML_OP_TRANSPOSE: + case GGML_OP_VIEW: + break; + case GGML_OP_ADD: + { + ggml_backend_openvino_add(node); + } break; + case GGML_OP_MUL: + { + ggml_backend_openvino_mul(node); + } break; + case GGML_OP_MUL_MAT: + break; + case GGML_OP_GET_ROWS: + { + ggml_compute_forward_get_rows(node); + } break; + default: + GGML_ABORT("%s: unsupported op %s\n", __func__, ggml_op_desc(node)); + } + } - openvino_frontend_compute(backend, cgraph); + // openvino_frontend_compute(backend, cgraph); return GGML_STATUS_SUCCESS; From 80c330a469505c57d5a3b6e15a2de8e9bf4acdd4 Mon Sep 17 00:00:00 2001 From: zhanmyz Date: Tue, 10 Dec 2024 18:26:55 +0800 Subject: [PATCH 015/254] Update build.md and add operation mapping (GGML to OpenVINO) --- ggml/src/ggml-openvino.cpp | 118 ++++++++++++++++++++++++++----------- 1 file changed, 83 insertions(+), 35 deletions(-) diff --git a/ggml/src/ggml-openvino.cpp b/ggml/src/ggml-openvino.cpp index 0a1e969c9f..efbff646e3 100644 --- a/ggml/src/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino.cpp @@ -325,39 +325,7 @@ static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { - for (int i = 0; i < cgraph->n_nodes; i++) { - struct ggml_tensor * node = cgraph->nodes[i]; - - if (node->op == GGML_OP_NONE || ggml_is_empty(node)) { - return GGML_STATUS_SUCCESS; - } - - switch (node->op) { - case GGML_OP_PERMUTE: - case GGML_OP_RESHAPE: - case GGML_OP_TRANSPOSE: - case GGML_OP_VIEW: - break; - case GGML_OP_ADD: - { - 
ggml_backend_openvino_add(node); - } break; - case GGML_OP_MUL: - { - ggml_backend_openvino_mul(node); - } break; - case GGML_OP_MUL_MAT: - break; - case GGML_OP_GET_ROWS: - { - ggml_compute_forward_get_rows(node); - } break; - default: - GGML_ABORT("%s: unsupported op %s\n", __func__, ggml_op_desc(node)); - } - } - - // openvino_frontend_compute(backend, cgraph); + openvino_frontend_compute(backend, cgraph); return GGML_STATUS_SUCCESS; @@ -558,7 +526,7 @@ std::set<std::string> get_openvino_available_opsets() { std::set<std::string> unique_ops; for (const auto& opset : ov::get_available_opsets()) { for (const auto& op : opset.second().get_type_info_set()) { - unique_ops.insert(op.name).second; + unique_ops.insert(op.name); } } return unique_ops; @@ -566,8 +534,12 @@ std::set<std::string> get_openvino_available_opsets() { static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) { GGML_ASSERT(dev->reg != nullptr); - // ggml_backend_openvino_device_context * dev_ctx = (ggml_backend_openvino_device_context *) dev->context; +#ifdef OPENVINO_OP_DEBUG +static const std::set<std::string>& openvino_ops = []() -> const std::set<std::string>& { + static const std::set<std::string> ops = get_openvino_available_opsets(); + return ops; + }(); switch (op->op) { case GGML_OP_NONE: case GGML_OP_PERMUTE: case GGML_OP_RESHAPE: case GGML_OP_TRANSPOSE: case GGML_OP_VIEW: case GGML_OP_ADD: case GGML_OP_MUL: case GGML_OP_MUL_MAT: return false; default: return false; } +#else + static const std::set<std::string>& openvino_ops = []() -> const std::set<std::string>& { + static const std::set<std::string> ops = get_openvino_available_opsets(); + return ops; + }(); + + static const std::map<ggml_op, std::vector<std::string>> op_mapping = { + {GGML_OP_ACC, {"Add"}}, + {GGML_OP_ADD, {"Add"}}, + {GGML_OP_ADD1, {"Add"}}, + {GGML_OP_ADD_REL_POS, {"Add", "MatMul", "Reshape"}}, + {GGML_OP_ARANGE, {"Range"}}, + {GGML_OP_ARGMAX, {"TopK"}}, + {GGML_OP_ARGSORT, {"TopK"}}, + {GGML_OP_CLAMP, {"Clamp"}}, + {GGML_OP_CONCAT, {"Concat"}}, + {GGML_OP_CONV_TRANSPOSE_1D, {"ConvolutionBackpropData"}}, + {GGML_OP_CONV_TRANSPOSE_2D, {"ConvolutionBackpropData"}}, + {GGML_OP_COS, {"Cos"}}, + {GGML_OP_CROSS_ENTROPY_LOSS, {"Softmax", "Log", "Multiply", "ReduceSum", "Negative"}}, + {GGML_OP_DIAG, {"Eye", "Multiply"}}, + {GGML_OP_DIAG_MASK_INF, {"Eye", "Multiply", "Select", "Broadcast"}}, + {GGML_OP_DIAG_MASK_ZERO, {"Eye", "Multiply", "Select", "Broadcast"}}, + {GGML_OP_DIV, {"Divide"}}, + {GGML_OP_FLASH_ATTN_EXT, {"ScaledDotProductAttention"}}, + {GGML_OP_GET_ROWS, {"Gather"}}, + {GGML_OP_GROUP_NORM, {"GroupNormalization"}}, + {GGML_OP_IM2COL, {"Custom", "Reshape", "Transpose"}}, + {GGML_OP_LEAKY_RELU, {"PReLU"}}, + {GGML_OP_LOG, {"Log"}}, + {GGML_OP_MEAN, {"ReduceMean"}}, + {GGML_OP_MUL, {"Multiply"}}, + {GGML_OP_MUL_MAT, {"MatMul"}}, + {GGML_OP_MUL_MAT_ID, {"MatMul", "Identity"}}, + {GGML_OP_NORM, {"NormalizeL2"}}, + {GGML_OP_OUT_PROD, {"MatMul", "Reshape"}}, + {GGML_OP_PAD, {"Pad"}}, + {GGML_OP_PERMUTE, {"Transpose"}}, + {GGML_OP_POOL_1D, {"AvgPool", "MaxPool"}}, + {GGML_OP_POOL_2D, {"AvgPool", "MaxPool"}}, + {GGML_OP_REPEAT, {"Tile"}}, + {GGML_OP_RESHAPE, {"Reshape"}}, + {GGML_OP_RMS_NORM, {"Custom"}}, + {GGML_OP_ROPE, {"Custom"}}, + {GGML_OP_SCALE, {"Multiply", "Constant"}}, + {GGML_OP_SET, {"Assign"}}, + {GGML_OP_SIN, {"Sin"}}, + {GGML_OP_SOFT_MAX, {"Softmax"}}, + {GGML_OP_SQR, {"Power"}}, + {GGML_OP_SQRT, {"Sqrt"}}, + {GGML_OP_SSM_CONV, {"Custom"}}, + {GGML_OP_SSM_SCAN, {"Custom"}}, + {GGML_OP_SUB, {"Subtract"}}, + {GGML_OP_SUM, {"ReduceSum"}}, + {GGML_OP_SUM_ROWS, {"ReduceSum", "Squeeze", "Unsqueeze"}}, + {GGML_OP_TIMESTEP_EMBEDDING, {"Range", "Power", "Multiply", "Sin", "Cos", "Concat"}}, + {GGML_OP_TRANSPOSE, {"Transpose"}}, + {GGML_OP_UPSCALE, {"Interpolate"}}, + {GGML_OP_VIEW, {"Reshape"}}, + {GGML_OP_WIN_PART, {"StridedSlice", "Concat", "Reshape", "Custom"}}, + {GGML_OP_WIN_UNPART, {"Reshape", "Transpose", "Custom"}}, + }; + + auto it = op_mapping.find(op->op); + if (it == op_mapping.end()) { + return false; + } + + for (const std::string& op_name : it->second) { + if (openvino_ops.count(op_name) == 0) { + return false; + } + } + + return true; +#endif } static bool ggml_backend_openvino_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { From 8c5a609f8da7ca6267796c8ef38f01fe4960e198 Mon Sep 17 00:00:00 2001 From: zhanmyz Date: Mon, 16 Dec 2024 11:13:45 +0800 Subject: [PATCH 016/254] Add the rms_norm operator implemented using OpenVINO to the GGML backend of llama.cpp --- ggml/src/ggml-openvino.cpp | 91 +++++++++++++++++++++++++++++++++++++- 1 file changed, 90 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-openvino.cpp b/ggml/src/ggml-openvino.cpp index efbff646e3..b6f01fdb45 100644 --- a/ggml/src/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino.cpp @@ -324,6 +324,95 @@ void ggml_compute_forward_get_rows(struct ggml_tensor *dst) { } +void ggml_backend_openvino_rms_norm_f32(ggml_tensor *dst) { + const struct ggml_tensor *src0 = dst->src[0]; + assert(src0 != nullptr); + + GGML_ASSERT(ggml_are_same_shape(src0, dst)); + GGML_ASSERT(src0->nb[0] == sizeof(float)); + + const int64_t ne0 = src0->ne[0]; + const int64_t ne1 = src0->ne[1]; + const int64_t ne2 = src0->ne[2]; + const int64_t ne3 = src0->ne[3]; + + const size_t input_size = ne0 * ne1 * ne2 * ne3; + + const float *src_data = static_cast<const float *>(src0->data); + float *dst_data = static_cast<float *>(dst->data); + assert(dst_data != nullptr); + + ov::Core core; + + ov::Shape input_shape = {static_cast<size_t>(ne3), static_cast<size_t>(ne2), + static_cast<size_t>(ne1), static_cast<size_t>(ne0)}; + ov::Tensor input_tensor(ov::element::f32, input_shape, const_cast<float *>(src_data)); + + auto input_param = std::make_shared<ov::op::v0::Parameter>( + input_tensor.get_element_type(), + input_tensor.get_shape() + ); + assert(input_param != nullptr && "Input parameter creation failed!"); + + auto square = std::make_shared<ov::op::v1::Multiply>(input_param, input_param); + auto reduce_sum = std::make_shared<ov::op::v1::ReduceSum>( + square, + ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {3}), + true + ); + + auto mean = std::make_shared<ov::op::v1::Divide>( + reduce_sum, + ov::op::v0::Constant::create(ov::element::f32, ov::Shape{}, {static_cast<float>(ne0)}) + ); + + float eps; + memcpy(&eps, dst->op_params, sizeof(float)); + auto rms = std::make_shared<ov::op::v0::Sqrt>( + std::make_shared<ov::op::v1::Add>( + mean, + ov::op::v0::Constant::create(ov::element::f32, ov::Shape{}, {eps}) + ) + ); + + auto scale = std::make_shared<ov::op::v1::Divide>( + ov::op::v0::Constant::create(ov::element::f32, ov::Shape{}, {1.0f}), + rms + ); + + auto normalized_input = std::make_shared<ov::op::v1::Multiply>(input_param, scale); + + ov::ParameterVector parameters = {input_param}; + auto function = std::make_shared<ov::Model>(ov::NodeVector{normalized_input}, parameters); + + auto compiled_model = core.compile_model(function, "CPU"); + + auto infer_request = compiled_model.create_infer_request(); + + infer_request.set_input_tensor(0, input_tensor); + + infer_request.infer(); + + auto output_tensor = infer_request.get_output_tensor(); + assert(output_tensor.get_size() == input_size); + + std::memcpy(dst_data, output_tensor.data<float>(), input_size * sizeof(float)); +} + +void ggml_backend_openvino_rms_norm(ggml_tensor * dst) { + const struct ggml_tensor * src0 = dst->src[0]; + switch (src0->type) { + case GGML_TYPE_F32: 
+ { + ggml_backend_openvino_rms_norm_f32(dst); + } break; + default: + { + GGML_ABORT("fatal error"); + } + } +} + static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { openvino_frontend_compute(backend, cgraph); @@ -598,7 +687,7 @@ static const std::set<std::string>& openvino_ops = []() -> const std::set From 31bd816426d42a27896d8ec66689852707e24c4f Mon Sep 17 00:00:00 2001 From: yumengbo Date: Thu, 12 Dec 2024 13:13:31 +0800 Subject: [PATCH 017/254] Fix issue for output memory copy of infer request --- .../src/ggml-openvino/ggml-graph-iterator.cpp | 16 +++++------- ggml/src/ggml-openvino/utils.cpp | 26 +++++++++---------- 2 files changed, 19 insertions(+), 23 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-graph-iterator.cpp b/ggml/src/ggml-openvino/ggml-graph-iterator.cpp index 44e119a1ac..5c06179023 100644 --- a/ggml/src/ggml-openvino/ggml-graph-iterator.cpp +++ b/ggml/src/ggml-openvino/ggml-graph-iterator.cpp @@ -21,22 +21,20 @@ void GgmlOvGraphIterator::initialize_decoders() { // m_decoders.resize(static_cast<size_t>(nodes_size)); for (int i = 0; i < nodes_size; ++i) { - // Skip View Op - // if (m_cgraph->nodes[i] ->op == GGML_OP_PERMUTE - // || m_cgraph->nodes[i] ->op == GGML_OP_CPY ) { - // continue; - // } auto decoder = std::make_shared<GgmlOvDecoder>(m_cgraph->nodes[i], m_cgraph); m_decoders.push_back(decoder); for (size_t inp = 0; inp < decoder->get_input_size(); ++inp) { - // if (i == 0 || decoder->is_graph_input(inp)) { + // Skip duplicate input name + if (std::find(m_input_names.begin(), m_input_names.end(), decoder->get_input_name(inp)) == m_input_names.end()) { m_input_names.push_back(decoder->get_input_name(inp)); - // } + } } for (size_t inp = 0; inp < decoder->get_output_size(); ++inp) { - // if (i == nodes_size - 1 || decoder->is_graph_output(inp)) { + // Skip duplicate output name + auto output_name = decoder->get_output_name(inp); + if (std::find(m_output_names.begin(), m_output_names.end(), output_name) == m_output_names.end()) { m_output_names.push_back(decoder->get_output_name(inp)); - // } + } } } diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index db52b1f81d..2dfe837cbd 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -29,8 +29,8 @@ std::map<std::string, ov::Tensor> get_ggml_graph_input_tensors(std::shared_ptr<GgmlOvGraphIterator> ggml_graph_iterator) { return input_tensors; } -std::map<std::string, ov::Tensor> get_ggml_graph_output_tensors(std::shared_ptr<GgmlOvGraphIterator> ggml_graph_iterator) { - std::map<std::string, ov::Tensor> output_tensors; +std::map<std::string, void *> get_ggml_graph_output_dst(std::shared_ptr<GgmlOvGraphIterator> ggml_graph_iterator) { + std::map<std::string, void *> output_tensors; auto output_names = ggml_graph_iterator->get_output_names(); ggml_graph_iterator->reset(); for (; !ggml_graph_iterator->is_end(); ggml_graph_iterator->next()) { @@ -41,8 +41,7 @@ std::map<std::string, ov::Tensor> get_ggml_graph_output_tensors(std::shared_ptr< #ifdef GGML_OPENVINO_DEBUG printf("Output %d: %g\n", inp, *(double*)(output_data)); #endif - ov::Tensor output_tensor = ov::Tensor(decoder->get_output_type(inp), decoder->get_output_shape(inp).to_shape(), output_data); + output_tensors[decoder->get_output_name(inp)] = output_data; } } } @@ -100,7 +99,6 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c #endif } - // Loading a model to the device ov::CompiledModel compiled_model = core.compile_model(model); @@ -113,18 +111,18 @@ enum ggml_status openvino_frontend_compute(...) // Set input tensor for (size_t i = 0; i < input_names.size(); i++) { - infer_request.set_input_tensor(i, input_tensors[input_names[i]]); - } - - // Set output tensor - - auto 
output_names = ggml_graph_iterator->get_output_names(); - auto output_tensors = get_ggml_graph_output_tensors(ggml_graph_iterator); - for (size_t i = 0; i < output_names.size(); i++) { - infer_request.set_output_tensor(i, output_tensors[output_names[i]]); + infer_request.set_input_tensor(i, input_tensors[input_names[i]]); } infer_request.infer(); + + // Set dst data for outputs + auto output_names = ggml_graph_iterator->get_output_names(); + auto output_tensors = get_ggml_graph_output_dst(ggml_graph_iterator); + for (size_t i = 0; i < output_names.size(); i++) { + auto output_tensor = infer_request.get_output_tensor(i); + std::memcpy(output_tensors[output_names[i]], output_tensor.data(), output_tensor.get_byte_size()); + } return GGML_STATUS_SUCCESS; GGML_UNUSED(backend); From b100f89bad810c6a1e50c341c091cd3cd124711f Mon Sep 17 00:00:00 2001 From: yumengbo Date: Fri, 13 Dec 2024 07:28:28 +0800 Subject: [PATCH 018/254] Change to an implementation following the PyTorch frontend --- ggml/src/ggml-openvino/decoder.h | 2 ++ ggml/src/ggml-openvino/ggml-decoder.h | 2 ++ ggml/src/ggml-openvino/ggml-ov-frontend-utils.cpp | 3 +-- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-openvino/decoder.h b/ggml/src/ggml-openvino/decoder.h index d2ef7587b8..e047235d88 100644 --- a/ggml/src/ggml-openvino/decoder.h +++ b/ggml/src/ggml-openvino/decoder.h @@ -12,9 +12,9 @@ class GgmlDecoder : public DecoderBase { public: virtual ov::Any get_attribute(const std::string& name) const = 0; - virtual PartialShape get_input_shape(size_t index) const = 0; + virtual PartialShape get_input_shape(const std::string& name) const = 0; - virtual element::Type get_input_type(size_t index) const = 0; + virtual element::Type get_input_type(const std::string& name) const = 0; virtual size_t get_input_size() const = 0; @@ -23,19 +23,15 @@ public: std::string& producer_output_port_name, size_t& producer_output_port_index) const = 0; - virtual bool is_graph_input(size_t index) const = 0; - virtual std::string& get_input_name(size_t index) const = 0; + virtual std::vector<std::string> get_input_names() const = 0; - virtual PartialShape get_output_shape(size_t index) const = 0; + virtual PartialShape get_output_shape(const std::string& name) const = 0; - virtual element::Type get_output_type(size_t index) const = 0; + virtual element::Type get_output_type(const std::string& name) const = 0; - virtual size_t get_output_size() const = 0; + virtual int32_t* get_output_op_params(const std::string& name) const = 0; - virtual bool is_graph_output(size_t index) const = 0; - - virtual int32_t* get_output_op_params(size_t index) const = 0; virtual std::string& get_output_name(size_t index) const = 0; @@ -49,6 +45,8 @@ public: virtual const std::string& get_op_name() const = 0; + virtual void visit_subgraph(std::function<void(std::shared_ptr<GgmlDecoder>)> node_visitor) const = 0; + // virtual const std::vector<size_t>& outputs() const = 0; // virtual size_t output(size_t index) const = 0; diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index b367987372..ab4b0995a5 100644 
--- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -2,11 +2,8 @@ #include #include -GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor * node, struct ggml_cgraph * cgraph) - :m_cgraph(cgraph), - m_node(node), - m_op_name(std::string(m_node->name)) { - switch (m_node->op) { +void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::map<std::string, ggml_tensor *>& inputs, std::map<std::string, ggml_tensor *>& outputs) { + switch (node->op) { // Unary OPs case GGML_OP_UNARY: case GGML_OP_RESHAPE: @@ -16,22 +13,26 @@ case GGML_OP_CPY: case GGML_OP_RMS_NORM: { - m_inputs.push_back(m_node->src[0]); - m_outputs.push_back(m_node); + inputs[node->src[0]->name] = node->src[0]; + outputs[node->name] = node; + m_input_names.push_back(node->src[0]->name); + m_output_names.push_back(node->name); break; } - // For view, input is m_node itself + // For view, input is node itself case GGML_OP_VIEW: { - m_inputs.push_back(m_node); - m_outputs.push_back(m_node); + inputs[node->src[0]->name] = node; + outputs[node->name] = node; break; } // SCALE case GGML_OP_SCALE: { - m_inputs.push_back(m_node->src[0]); - m_outputs.push_back(m_node); + inputs[node->src[0]->name] = node->src[0]; + outputs[node->name] = node; + m_input_names.push_back(node->name); + m_output_names.push_back(node->name); break; } // OPs with 2 inputs @@ -39,14 +41,20 @@ case GGML_OP_MUL_MAT: case GGML_OP_SUB: case GGML_OP_GET_ROWS: case GGML_OP_SOFT_MAX: { - m_inputs.push_back(m_node->src[0]); - m_inputs.push_back(m_node->src[1]); - m_outputs.push_back(m_node); + inputs[node->src[0]->name] = node->src[0]; + inputs[node->src[1]->name] = node->src[1]; + outputs[node->name] = node; + m_input_names.push_back(node->src[0]->name); + m_input_names.push_back(node->src[1]->name); + m_output_names.push_back(node->name); break; } // OPs with 3 inputs: case GGML_OP_ROPE: { - m_inputs.push_back(m_node->src[0]); - m_inputs.push_back(m_node->src[1]); - m_inputs.push_back(m_node->src[2]); // ??? - m_outputs.push_back(m_node); + inputs[node->src[0]->name] = node->src[0]; + inputs[node->src[1]->name] = node->src[1]; + inputs[node->src[2]->name] = node->src[2]; + outputs[node->name] = node; + m_input_names.push_back(node->src[0]->name); + m_input_names.push_back(node->src[1]->name); + m_input_names.push_back(node->src[2]->name); + m_output_names.push_back(node->name); break; } default: @@ -62,13 +70,33 @@ } } +GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor * node, struct ggml_cgraph * cgraph) +  :m_cgraph(cgraph), + m_node(node), + m_op_name(m_node ? 
+    m_inputs.clear();
+    m_outputs.clear();
+    m_input_names.clear();
+    m_output_names.clear();
+    // If first init
+    if (m_node) {
+        set_input_output(m_node, m_inputs, m_outputs);
+    } else {
+        for (int node_n = 0; node_n < m_cgraph->n_nodes; node_n++) {
+            auto cur_node = m_cgraph->nodes[node_n];
+            m_nodes.push_back(cur_node);
+            // Init model input and output
+            set_input_output(cur_node, m_inputs, m_outputs);
+        }
+    }
+}
+
+ov::PartialShape GgmlOvDecoder::get_input_shape(const std::string& name) const {
     ov::PartialShape input_shape;
     // Use input_node->ne
-    ggml_tensor * node = m_inputs[index];
+    ggml_tensor * node = m_inputs.at(name);
     std::vector<int64_t> shape;
-    // GGML_MAX_DIMS
-    // for (int i = 0; i < GGML_MAX_DIMS; ++i) {
+    for (int i = GGML_MAX_DIMS - 2; i >= 0 ; --i) {
         if (node->ne[i] == 0) {
             return input_shape;
@@ -79,10 +107,9 @@
     return input_shape;
 }

-ov::element::Type GgmlOvDecoder::get_input_type(size_t index) const {
+ov::element::Type GgmlOvDecoder::get_input_type(const std::string& name) const {
     ov::element::Type type = ov::element::dynamic;
-    // GGML_LOG_DEBUG("%d\n", m_inputs[index]->type);
-    switch (m_inputs[index]->type) {
+    switch (m_inputs.at(name)->type) {
         case GGML_TYPE_F32:
             type = ov::element::f32;
             break;
@@ -102,28 +129,24 @@
 }

 size_t GgmlOvDecoder::get_input_size() const {
-    return m_inputs.size();
-}
-
-bool GgmlOvDecoder::is_graph_input(size_t index) const {
-    if (m_inputs[index]->flags & GGML_TENSOR_FLAG_INPUT ) {
-        return true;
-    }
-    return false;
+    return m_input_names.size();
 }

 std::string& GgmlOvDecoder::get_input_name(size_t index) const {
-    m_name = std::string(m_inputs[index]->name);
+    m_name = m_input_names[index];
     return m_name;
 }

+std::vector<std::string> GgmlOvDecoder::get_input_names() const {
+    return m_input_names;
+}
+
+ov::PartialShape GgmlOvDecoder::get_output_shape(const std::string& name) const {
     ov::PartialShape output_shape;
     // Use input_node->ne
-    ggml_tensor * node = m_outputs[index];
+    ggml_tensor * node = m_outputs.at(name);
     std::vector<int64_t> shape;
-    // GGML_MAX_DIMS
-    // for (int i = 0; i < GGML_MAX_DIMS; ++i) {
+    for (int i = GGML_MAX_DIMS - 2; i >= 0 ; --i) {
         if (node->ne[i] == 0 ) {
             // empty if any dimension has no elements
@@ -135,10 +158,10 @@
     return output_shape;
 }

-ov::element::Type GgmlOvDecoder::get_output_type(size_t index) const {
+ov::element::Type GgmlOvDecoder::get_output_type(const std::string& name) const {
     // TODO: Change to Output
     ov::element::Type type = ov::element::dynamic;
-    switch (m_outputs[index]->type) {
+    switch (m_outputs.at(name)->type) {
         case GGML_TYPE_F32:
             type = ov::element::f32;
             break;
@@ -157,30 +180,31 @@
     return type;
 }

-bool GgmlOvDecoder::is_graph_output(size_t index) const {
-    if (m_outputs[index]->flags & GGML_TENSOR_FLAG_OUTPUT) {
-        return true;
-    }
-    return false;
-}
-
-int32_t* GgmlOvDecoder::get_output_op_params(size_t index) const{
-    return m_outputs[index]->op_params;
-}
-
-size_t GgmlOvDecoder::get_output_size() const {
-    return m_outputs.size();
+int32_t* GgmlOvDecoder::get_output_op_params(const std::string& name) const{
+    return m_outputs.at(name)->op_params;
 }

 std::string& GgmlOvDecoder::get_output_name(size_t index) const {
-    m_name = std::string(m_outputs[index]->name);
+    m_name = std::string(m_output_names[index]);
     return m_name;
 }

+std::vector<std::string> GgmlOvDecoder::get_output_names() const {
+    return m_output_names;
+}
+
 const std::string& GgmlOvDecoder::get_op_name() const {
     return m_op_name;
 }

+void GgmlOvDecoder::visit_subgraph(std::function<void(std::shared_ptr<ov::frontend::ggml::GgmlDecoder>)> node_visitor) const {
+    for (const auto& node : m_nodes) {
+        auto decoder = std::make_shared<GgmlOvDecoder>(node, m_cgraph);
+        // m_decoders.push_back(decoder);
+        node_visitor(decoder);
+    }
+}
+
 const std::string& GgmlOvDecoder::get_op_type() const {
     static const std::map<ggml_op, std::string> opTypeMap = {
         {GGML_OP_ACC, "GGML_OP_ACC"},

diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h
index ceae589ed4..56bb3f889f 100644
--- a/ggml/src/ggml-openvino/ggml-decoder.h
+++ b/ggml/src/ggml-openvino/ggml-decoder.h
@@ -6,6 +6,7 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {
 public:
     using ov::frontend::ggml::GgmlDecoder::GgmlDecoder;
+
     GgmlOvDecoder(struct ggml_tensor * node, struct ggml_cgraph * cgraph);

     virtual ov::Any get_attribute(const std::string& name) const override {
@@ -13,9 +14,9 @@ public:
         GGML_UNUSED(name);
     }

-    virtual ov::PartialShape get_input_shape(size_t index) const override;
+    virtual ov::PartialShape get_input_shape(const std::string& name) const override;

-    virtual ov::element::Type get_input_type(size_t index) const override;
+    virtual ov::element::Type get_input_type(const std::string& name) const override;

     virtual size_t get_input_size() const override;

@@ -29,19 +30,15 @@ public:
         GGML_UNUSED(producer_output_port_index);
     }

-    virtual bool is_graph_input(size_t index) const override;
-
     virtual std::string& get_input_name(size_t index) const override;

-    virtual ov::PartialShape get_output_shape(size_t index) const override;
+    virtual std::vector<std::string> get_input_names() const override;

-    virtual ov::element::Type get_output_type(size_t index) const override;
+    virtual ov::PartialShape get_output_shape(const std::string& name) const override;

-    virtual size_t get_output_size() const override;
+    virtual ov::element::Type get_output_type(const std::string& name) const override;

-    virtual bool is_graph_output(size_t index) const override;
-
-    virtual int32_t* get_output_op_params(size_t index) const override;
+    virtual int32_t* get_output_op_params(const std::string& name) const override;

     virtual std::string& get_output_name(size_t index) const override;

@@ -55,24 +52,27 @@ public:

     virtual const std::string& get_op_name() const override;

-    const ggml_tensor* get_input_ggml_tensor(size_t index) const {
-        return m_inputs[index];
+    virtual void visit_subgraph(std::function<void(std::shared_ptr<ov::frontend::ggml::GgmlDecoder>)> node_visitor) const override;
+
+    const ggml_tensor* get_input_ggml_tensor(std::string& name) const {
+        return m_inputs.at(name);
     }

-    const ggml_tensor* get_output_ggml_tensor(size_t index) const {
-        return m_outputs[index];
+    const ggml_tensor* get_output_ggml_tensor(std::string& name) const {
+        return m_outputs.at(name);
     }

-    // virtual const std::vector& outputs() const override;
-
-    // virtual size_t output(size_t index) const override;
-
 private:
-    size_t m_index;
+    void set_input_output(ggml_tensor* node, std::map<std::string, ggml_tensor*>& inputs, std::map<std::string, ggml_tensor*>& outputs);
+
     struct ggml_cgraph * m_cgraph;
-    std::vector<ggml_tensor*> m_inputs;
-    std::vector<ggml_tensor*> m_outputs;
-    ggml_tensor * m_node;
+    std::map<std::string, ggml_tensor*> m_inputs;
+    std::vector<std::string> m_input_names;
+    std::map<std::string, ggml_tensor*> m_outputs;
+    std::vector<std::string> m_output_names;
+    ggml_tensor* m_node;
+    std::vector<ggml_tensor*> m_nodes;
+    std::vector<std::shared_ptr<GgmlOvDecoder>> m_decoders;
     const std::string m_op_name;
     mutable std::string m_name;
 };
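
The refactor above lets one decoder wrap either a single node or, when constructed with node == nullptr, the whole cgraph. A minimal consumer sketch of the name-keyed interface (illustrative only, not code from this series; it assumes the headers above are included):

    #include <iostream>
    #include <memory>

    // Walk every node decoder of a graph-level GgmlOvDecoder and print each
    // node's op together with its inputs, resolved by tensor name.
    void dump_ggml_graph(const std::shared_ptr<GgmlOvDecoder>& graph_decoder) {
        graph_decoder->visit_subgraph([](std::shared_ptr<ov::frontend::ggml::GgmlDecoder> node) {
            std::cout << node->get_op_type() << " '" << node->get_op_name() << "'\n";
            for (const auto& name : node->get_input_names()) {
                std::cout << "  input " << name
                          << " shape=" << node->get_input_shape(name)
                          << " type=" << node->get_input_type(name) << '\n';
            }
            std::cout << "  output " << node->get_output_name(0) << '\n';
        });
    }

This is the same traversal the OpenVINO frontend performs when it converts the subgraph: names, not indices, are the stable keys once tensors appear in several nodes.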
diff --git a/ggml/src/ggml-openvino/ggml-graph-iterator.cpp b/ggml/src/ggml-openvino/ggml-graph-iterator.cpp deleted file mode 100644 index 5c06179023..0000000000 --- a/ggml/src/ggml-openvino/ggml-graph-iterator.cpp +++ /dev/null @@ -1,95 +0,0 @@ -#include "ggml-graph-iterator.h" -#include -#include - -namespace ov { -namespace frontend { -namespace tensorflow { -namespace ggml { - -GgmlOvGraphIterator::GgmlOvGraphIterator(struct ggml_cgraph * cgraph) - :m_cgraph(cgraph) { - initialize_decoders(); - #ifdef GGML_OPENVINO_DEBUG - dump_graph_iterator(); - #endif -} - -void GgmlOvGraphIterator::initialize_decoders() { - auto nodes_size = m_cgraph->n_nodes; - // Initialize decoder for each node - // m_decoders.resize(static_cast(nodes_size)); - - for (int i = 0; i < nodes_size; ++i) { - auto decoder = std::make_shared(m_cgraph->nodes[i], m_cgraph); - m_decoders.push_back(decoder); - for (size_t inp = 0; inp < decoder->get_input_size(); ++inp) { - // Skip duplicate input name - if (std::find(m_input_names.begin(), m_input_names.end(), decoder->get_input_name(inp)) == m_input_names.end()) { - m_input_names.push_back(decoder->get_input_name(inp)); - } - } - for (size_t inp = 0; inp < decoder->get_output_size(); ++inp) { - // Skip duplicate output name - auto output_name = decoder->get_output_name(inp); - if (std::find(m_output_names.begin(), m_output_names.end(), output_name) == m_output_names.end()) { - m_output_names.push_back(decoder->get_output_name(inp)); - } - } - } - -} - -void GgmlOvGraphIterator::reset() { - node_index = 0; - } - -size_t GgmlOvGraphIterator::size() const { - return m_decoders.size(); -} - -void GgmlOvGraphIterator::next() { - node_index++; -} - -bool GgmlOvGraphIterator::is_end() const { - return node_index >= m_decoders.size(); -} - -std::shared_ptr GgmlOvGraphIterator::get_decoder() const { - return m_decoders[node_index]; -} - -std::vector GgmlOvGraphIterator::get_input_names() const { - return m_input_names; -} - -std::vector GgmlOvGraphIterator::get_output_names() const { - return m_output_names; -} - -void GgmlOvGraphIterator::dump_graph_iterator() const { - for (size_t i = 0; i < m_decoders.size(); ++i) { - GGML_LOG_INFO("\nOP %zu: %s\n", i, m_decoders[i]->get_op_name().c_str()); - for (size_t inp = 0; inp < m_decoders[i]->get_input_size(); ++inp) { - ov::PartialShape pshape = std::dynamic_pointer_cast(m_decoders[i])->get_input_shape(inp); - ov::element::Type ptype = std::dynamic_pointer_cast(m_decoders[i])->get_input_type(inp); - GGML_LOG_INFO("- Input name: %s\n", std::dynamic_pointer_cast(m_decoders[i])->get_input_name(inp).c_str()); - GGML_LOG_INFO(" Input shape: %s\n", pshape.to_string().c_str()); - GGML_LOG_INFO(" Input type: %s\n", ptype.to_string().c_str()); - } - for (size_t outp = 0; outp < std::dynamic_pointer_cast(m_decoders[i])->get_output_size(); ++outp) { - ov::PartialShape pshape = std::dynamic_pointer_cast(m_decoders[i])->get_output_shape(outp); - ov::element::Type ptype = std::dynamic_pointer_cast(m_decoders[i])->get_output_type(outp); - GGML_LOG_INFO("- Output name: %s\n", std::dynamic_pointer_cast(m_decoders[i])->get_output_name(outp).c_str()); - GGML_LOG_INFO(" Output shape: %s\n", pshape.to_string().c_str()); - GGML_LOG_INFO(" Output type: %s\n", ptype.to_string().c_str()); - - } - } -} - -} -} // namespace ggml -} // namespace frontend -} // namespace ov diff --git a/ggml/src/ggml-openvino/ggml-graph-iterator.h b/ggml/src/ggml-openvino/ggml-graph-iterator.h deleted file mode 100644 index 305afb5c98..0000000000 --- 
a/ggml/src/ggml-openvino/ggml-graph-iterator.h +++ /dev/null @@ -1,61 +0,0 @@ -#pragma once - -#include "graph_iterator.h" -#include "ggml-decoder.h" -#include - -// To remove tensorflow -namespace ov { -namespace frontend { -namespace tensorflow { -namespace ggml { - -class GgmlOvGraphIterator : public GgmlGraphIterator { - -protected: - void initialize_decoders(); - -public: - using Ptr = std::shared_ptr; - GgmlOvGraphIterator(struct ggml_cgraph * cgraph); - - /// \brief Get a number of operation nodes in the sgraph - virtual size_t size() const override; - - /// \brief Set iterator to the start position - virtual void reset() override; - - /// \brief Move to the next node in the graph - virtual void next() override; - - /// \brief Returns true if iterator goes out of the range of available nodes - virtual bool is_end() const override; - - /// \brief Return a pointer to a decoder of the current node - virtual std::shared_ptr get_decoder() const override; - - virtual std::shared_ptr get_body_graph_iterator(const std::string& func_name) const override { - return nullptr; - GGML_UNUSED(func_name); - } - - /// \brief Returns a vector of input names in the original order - virtual std::vector get_input_names() const override; - - /// \brief Returns a vector of output names in the original order - virtual std::vector get_output_names() const override; - - virtual void dump_graph_iterator() const; - -private: - struct ggml_cgraph * m_cgraph; - size_t node_index = 0; - std::vector> m_decoders; - std::vector m_input_names; - std::vector m_output_names; -}; - -} -} // namespace ggml -} // namespace frontend -} // namespace ov diff --git a/ggml/src/ggml-openvino/graph_iterator.h b/ggml/src/ggml-openvino/graph_iterator.h deleted file mode 100644 index e0b475e445..0000000000 --- a/ggml/src/ggml-openvino/graph_iterator.h +++ /dev/null @@ -1,43 +0,0 @@ -#pragma once - -#include "openvino/frontend/graph_iterator.hpp" - -namespace ov { -namespace frontend { -namespace tensorflow { // To be Removed -namespace ggml { - -// TODO: Directly include from openvino -class GgmlGraphIterator : public GraphIterator { -public: - - virtual size_t size() const = 0; - - virtual void reset() = 0; - - virtual void next() = 0; - - virtual bool is_end() const = 0; - - virtual std::shared_ptr get_decoder() const = 0; - - virtual std::vector get_input_names() const = 0; - - virtual std::vector get_output_names() const = 0; - - virtual std::shared_ptr get_body_graph_iterator(const std::string& func_name) const = 0; - - virtual std::map get_input_names_map() const { - return {}; - } - - virtual std::map get_output_names_map() const { - return {}; - } - -}; - -} -} // namespace ggml -} // namespace frontend -} // namespace ov diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 2dfe837cbd..2436f86feb 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -1,49 +1,40 @@ #include "utils.h" +#include "ggml-impl.h" #include "ggml-backend-impl.h" #include #include -using ov::frontend::tensorflow::ggml::GgmlOvGraphIterator; +using ov::frontend::ggml::GgmlDecoder; -std::shared_ptr get_ggml_graph_iterator(struct ggml_cgraph * cgraph) { - return std::make_shared(cgraph); +std::shared_ptr get_ggml_decoder(struct ggml_cgraph * cgraph) { + return std::make_shared(nullptr, cgraph); } -std::map get_ggml_graph_input_tensors(std::shared_ptr ggml_graph_iterator) { +std::map get_ggml_graph_input_tensors(std::shared_ptr ggml_decoder) { std::map input_tensors; - auto input_names = 
ggml_graph_iterator->get_input_names(); - ggml_graph_iterator->reset(); - for (; !ggml_graph_iterator->is_end(); ggml_graph_iterator->next()) { - auto decoder = std::dynamic_pointer_cast(ggml_graph_iterator->get_decoder()); - for (size_t inp = 0; inp < decoder->get_input_size(); ++inp) { - if (std::find(input_names.begin(), input_names.end(), decoder->get_input_name(inp)) != input_names.end()) { - auto input_data = decoder->get_input_ggml_tensor(inp)->data; - #ifdef GGML_OPENVINO_DEBUG - printf("Subgraph input %d: %g\n", inp, *(double*)(input_data)); - #endif - ov::Tensor input_tensor = ov::Tensor(decoder->get_input_type(inp), decoder->get_input_shape(inp).to_shape(), input_data); - input_tensors[decoder->get_input_name(inp)] = input_tensor; - } - } + auto input_names = ggml_decoder->get_input_names(); + for (size_t inp = 0; inp < input_names.size(); ++inp) { + auto name = input_names[inp]; + auto input_data = ggml_decoder->get_input_ggml_tensor(name)->data; + #ifdef GGML_OPENVINO_DEBUG + printf("Subgraph input %d: %g\n", inp, *(double*)(input_data)); + #endif + ov::Tensor input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), ggml_decoder->get_input_shape(name).to_shape(), input_data); + input_tensors[name] = input_tensor; } return input_tensors; } -std::map get_ggml_graph_output_dst(std::shared_ptr ggml_graph_iterator) { +std::map get_ggml_graph_output_dst(std::shared_ptr ggml_decoder) { std::map output_tensors; - auto output_names = ggml_graph_iterator->get_output_names(); - ggml_graph_iterator->reset(); - for (; !ggml_graph_iterator->is_end(); ggml_graph_iterator->next()) { - auto decoder = std::dynamic_pointer_cast(ggml_graph_iterator->get_decoder()); - for (size_t inp = 0; inp < decoder->get_output_size(); ++inp) { - if (std::find(output_names.begin(), output_names.end(), decoder->get_output_name(inp)) != output_names.end()) { - auto output_data = decoder->get_output_ggml_tensor(inp)->data; - #ifdef GGML_OPENVINO_DEBUG - printf("Output %d: %g\n", inp, *(double*)(output_data)); - #endif - output_tensors[decoder->get_output_name(inp)] = output_data; - } - } + auto output_names = ggml_decoder->get_output_names(); + for (size_t inp = 0; inp < output_names.size(); ++inp) { + auto name = output_names[inp]; + auto output_data = ggml_decoder->get_output_ggml_tensor(name)->data; + #ifdef GGML_OPENVINO_DEBUG + printf("Output %d: %g\n", inp, *(double*)(output_data)); + #endif + output_tensors[name] = output_data; } return output_tensors; } @@ -74,12 +65,10 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c GGML_LOG_INFO("GGML FrontEnd is initialized \n"); #endif } - - auto ggml_graph_iterator = get_ggml_graph_iterator(cgraph); - std::shared_ptr graph_iterator = ggml_graph_iterator; - + auto ggml_decoder = get_ggml_decoder(cgraph); + std::shared_ptr graph_decoder = ggml_decoder; // Load GraphIterator -> InputModel - ov::frontend::InputModel::Ptr input_model = front_end->load(graph_iterator); + ov::frontend::InputModel::Ptr input_model = front_end->load(graph_decoder); if (!input_model) { GGML_LOG_ERROR("Input Model is not loaded \n"); return GGML_STATUS_FAILED; @@ -106,8 +95,8 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c ov::InferRequest infer_request = compiled_model.create_infer_request(); // Get input tensor - auto input_names = ggml_graph_iterator->get_input_names(); - auto input_tensors = get_ggml_graph_input_tensors(ggml_graph_iterator); + auto input_names = ggml_decoder->get_input_names(); + auto input_tensors 
= get_ggml_graph_input_tensors(ggml_decoder); // Set input tensor for (size_t i = 0; i < input_names.size(); i++) { @@ -117,11 +106,14 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c infer_request.infer(); // Set dst data for outputs - auto output_names = ggml_graph_iterator->get_output_names(); - auto output_tensors = get_ggml_graph_output_dst(ggml_graph_iterator); + auto output_names = ggml_decoder->get_output_names(); + auto output_tensors = get_ggml_graph_output_dst(ggml_decoder); for (size_t i = 0; i < output_names.size(); i++) { auto output_tensor = infer_request.get_output_tensor(i); std::memcpy(output_tensors[output_names[i]], output_tensor.data(), output_tensor.get_byte_size()); + #ifdef GGML_OPENVINO_DEBUG + printf("Output %s after: %g\n", output_names[i], *(double*)(output_tensor.data())); + #endif } return GGML_STATUS_SUCCESS; diff --git a/ggml/src/ggml-openvino/utils.h b/ggml/src/ggml-openvino/utils.h index 15dd46ed4e..7ec633beda 100644 --- a/ggml/src/ggml-openvino/utils.h +++ b/ggml/src/ggml-openvino/utils.h @@ -1,6 +1,4 @@ -#include "ggml-graph-iterator.h" +#include "ggml-decoder.h" #include "ggml-backend-impl.h" -std::shared_ptr get_ggml_graph_iterator(struct ggml_cgraph * cgraph); - enum ggml_status openvino_frontend_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph); From 590f587b27b1709b4451227c79926f85118d93fc Mon Sep 17 00:00:00 2001 From: yumengbo Date: Wed, 18 Dec 2024 03:04:49 +0800 Subject: [PATCH 019/254] Add support for UNARY SILU op . Fix pytorch impl bugs. --- ggml/src/ggml-openvino.cpp | 7 +++++ ggml/src/ggml-openvino/ggml-decoder.cpp | 36 ++++++++++++++++++++----- ggml/src/ggml-openvino/utils.cpp | 2 +- 3 files changed, 37 insertions(+), 8 deletions(-) diff --git a/ggml/src/ggml-openvino.cpp b/ggml/src/ggml-openvino.cpp index b6f01fdb45..1fede40c4a 100644 --- a/ggml/src/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino.cpp @@ -642,6 +642,13 @@ static const std::set& openvino_ops = []() -> const std::setsrc[0]->name] = node; + inputs[node->name] = node; outputs[node->name] = node; + m_input_names.push_back(node->name); + m_output_names.push_back(node->name); break; } // SCALE @@ -228,13 +230,33 @@ const std::string& GgmlOvDecoder::get_op_type() const { {GGML_OP_UNARY, "GGML_OP_UNARY"}, {GGML_OP_VIEW, "GGML_OP_VIEW"} }; + static const std::map unaryOpTypeMap = { + {GGML_UNARY_OP_ABS, "GGML_UNARY_OP_ABS"}, + {GGML_UNARY_OP_SGN, "GGML_UNARY_OP_SGN"}, + {GGML_UNARY_OP_NEG, "GGML_UNARY_OP_NEG"}, + {GGML_UNARY_OP_STEP, "GGML_UNARY_OP_STEP"}, + {GGML_UNARY_OP_TANH, "GGML_UNARY_OP_TANH"}, + {GGML_UNARY_OP_ELU, "GGML_UNARY_OP_ELU"}, + {GGML_UNARY_OP_RELU, "GGML_UNARY_OP_RELU"}, + {GGML_UNARY_OP_SIGMOID, "GGML_UNARY_OP_SIGMOID"}, + {GGML_UNARY_OP_GELU, "GGML_UNARY_OP_GELU"}, + {GGML_UNARY_OP_GELU_QUICK, "GGML_UNARY_OP_GELU_QUICK"}, + {GGML_UNARY_OP_SILU, "GGML_UNARY_OP_SILU"}, + {GGML_UNARY_OP_HARDSWISH, "GGML_UNARY_OP_HARDSWISH"}, + {GGML_UNARY_OP_HARDSIGMOID, "GGML_UNARY_OP_HARDSIGMOID"}, + {GGML_UNARY_OP_EXP, "GGML_UNARY_OP_EXP"}, + {GGML_UNARY_OP_COUNT, "GGML_UNARY_OP_COUNT"} + }; auto it = opTypeMap.find(m_node->op); if (it != opTypeMap.end()) { + if (it->first == GGML_OP_UNARY) { + auto unary_it = unaryOpTypeMap.find(ggml_get_unary_op(m_node)); + if (unary_it != unaryOpTypeMap.end()) { + return unary_it->second; + } + } return it->second; - } else { - static const std::string unknown_op = "UNKNOWN_OP"; - return unknown_op; - } - // static std::string op_type = ggml_op_name(m_node->op); - // return op_type; + } + static 
const std::string unknown_op = "UNKNOWN_OP"; + return unknown_op; } diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 2436f86feb..3bc5779b49 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -112,7 +112,7 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c auto output_tensor = infer_request.get_output_tensor(i); std::memcpy(output_tensors[output_names[i]], output_tensor.data(), output_tensor.get_byte_size()); #ifdef GGML_OPENVINO_DEBUG - printf("Output %s after: %g\n", output_names[i], *(double*)(output_tensor.data())); + printf("Output %s after: %g\n", output_names[i].c_str(), *(double*)(output_tensor.data())); #endif } From d218c61e6d33bd06f0ddb4c98a0b688f64c864bd Mon Sep 17 00:00:00 2001 From: yumengbo Date: Thu, 19 Dec 2024 03:37:38 +0800 Subject: [PATCH 020/254] Support Softmax op --- ggml/src/ggml-openvino.cpp | 17 +++++++++++++++++ ggml/src/ggml-openvino/ggml-decoder.cpp | 6 ++++-- 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-openvino.cpp b/ggml/src/ggml-openvino.cpp index 1fede40c4a..771ca86d02 100644 --- a/ggml/src/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino.cpp @@ -642,11 +642,28 @@ static const std::set& openvino_ops = []() -> const std::setsrc[0]->name] = node->src[0]; - inputs[node->src[1]->name] = node->src[1]; outputs[node->name] = node; m_input_names.push_back(node->src[0]->name); - m_input_names.push_back(node->src[1]->name); m_output_names.push_back(node->name); + if (node->src[1]) { + inputs[node->src[1]->name] = node->src[1]; + m_input_names.push_back(node->src[1]->name); + } break; } // OPs with 3 inputs: From 8aba03bac66a32f1558469391da0fcd091aacdd4 Mon Sep 17 00:00:00 2001 From: yumengbo Date: Thu, 19 Dec 2024 03:39:05 +0800 Subject: [PATCH 021/254] Support Softmax op --- ggml/src/ggml-openvino.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/ggml/src/ggml-openvino.cpp b/ggml/src/ggml-openvino.cpp index 771ca86d02..797ceb74ba 100644 --- a/ggml/src/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino.cpp @@ -642,8 +642,6 @@ static const std::set& openvino_ops = []() -> const std::set Date: Sat, 21 Dec 2024 08:27:12 +0800 Subject: [PATCH 022/254] Support ROPE op. 
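
In GGML, a ROPE node carries the activations in src[0], the int32 position ids in src[1], and optional frequency factors in src[2], which may be NULL for models that do not use them; the hunk below therefore registers the third input only when it is present. A conversion-side consumer would branch the same way. A hedged sketch (the `decoder` and `context` objects and the `get_input` call are illustrative assumptions, not code from this series):

    // Sketch: a ROPE converter checking for the optional frequency factors.
    auto input_names = decoder->get_input_names();
    const bool has_freq_factors = input_names.size() > 2;  // src[2] was non-NULL
    if (has_freq_factors) {
        auto freq_factors = context.get_input(2);  // hypothetical frontend context
        // scale the inverse frequencies with freq_factors before building sin/cos
    }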
---
 ggml/src/ggml-openvino/ggml-decoder.cpp | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp
index ee156bb995..4f351266c6 100644
--- a/ggml/src/ggml-openvino/ggml-decoder.cpp
+++ b/ggml/src/ggml-openvino/ggml-decoder.cpp
@@ -61,12 +61,14 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::map<std::string, ggml_tensor*>& inputs, std::map<std::string, ggml_tensor*>& outputs) {
         {
             inputs[node->src[0]->name] = node->src[0];
             inputs[node->src[1]->name] = node->src[1];
-            inputs[node->src[2]->name] = node->src[2];
-            outputs[node->name] = node;
             m_input_names.push_back(node->src[0]->name);
             m_input_names.push_back(node->src[1]->name);
-            m_input_names.push_back(node->src[2]->name);
+            outputs[node->name] = node;
             m_output_names.push_back(node->name);
+            if (node->src[2]) {
+                inputs[node->src[2]->name] = node->src[2];
+                m_input_names.push_back(node->src[2]->name);
+            }
             break;
         }
         default:
@@ -92,6 +94,9 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor * node, struct ggml_cgraph * cgraph)
             // Init model input and output
             set_input_output(cur_node, m_inputs, m_outputs);
         }
+        #ifdef GGML_OPENVINO_DEBUG
+        ggml_graph_print(m_cgraph);
+        #endif
     }
 }

From 0f7d07de7d98a514013c8adaf7d328edc60f7b09 Mon Sep 17 00:00:00 2001
From: zhanmyz
Date: Thu, 19 Dec 2024 15:43:39 +0800
Subject: [PATCH 023/254] Add support for RMS_NORM OP

---
 ggml/src/ggml-openvino.cpp | 31 ++++++++++++++++++++++-------
 1 file changed, 24 insertions(+), 7 deletions(-)

diff --git a/ggml/src/ggml-openvino.cpp b/ggml/src/ggml-openvino.cpp
index 797ceb74ba..f8389f06b5 100644
--- a/ggml/src/ggml-openvino.cpp
+++ b/ggml/src/ggml-openvino.cpp
@@ -334,9 +334,8 @@ void ggml_backend_openvino_rms_norm_f32(ggml_tensor *dst) {
     const int64_t ne0 = src0->ne[0];
     const int64_t ne1 = src0->ne[1];
     const int64_t ne2 = src0->ne[2];
-    const int64_t ne3 = src0->ne[3];

-    const size_t input_size = ne0 * ne1 * ne2 * ne3;
+    const size_t input_size = ne0 * ne1 * ne2;
     const float *src_data = static_cast<const float *>(src0->data);
     float *dst_data = static_cast<float *>(dst->data);

@@ -344,8 +343,7 @@ void ggml_backend_openvino_rms_norm_f32(ggml_tensor *dst) {

     ov::Core core;

-    ov::Shape input_shape = {static_cast<size_t>(ne3), static_cast<size_t>(ne2),
-                             static_cast<size_t>(ne1), static_cast<size_t>(ne0)};
+    ov::Shape input_shape = {static_cast<size_t>(ne2), static_cast<size_t>(ne1), static_cast<size_t>(ne0)};
     ov::Tensor input_tensor(ov::element::f32, input_shape, const_cast<float *>(src_data));

     auto input_param = std::make_shared<ov::op::v0::Parameter>(
@@ -357,7 +355,7 @@ void ggml_backend_openvino_rms_norm_f32(ggml_tensor *dst) {
     auto square = std::make_shared<ov::op::v1::Multiply>(input_param, input_param);
     auto reduce_sum = std::make_shared<ov::op::v1::ReduceSum>(
         square,
-        ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {3}),
+        ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {2}),
         true
     );

@@ -383,9 +381,16 @@ void ggml_backend_openvino_rms_norm_f32(ggml_tensor *dst) {
     auto normalized_input = std::make_shared<ov::op::v1::Multiply>(input_param, scale);

     ov::ParameterVector parameters = {input_param};
-    auto function = std::make_shared<ov::Model>(ov::NodeVector{normalized_input}, parameters);
+    auto model = std::make_shared<ov::Model>(ov::NodeVector{normalized_input}, parameters);

-    auto compiled_model = core.compile_model(function, "CPU");
+    // static bool model_saved = false;
+    // if (!model_saved) {
+    //     std::cout << "\n rms model saved" << std::endl;
+    //     ov::save_model(model, "//rms_norm_model.xml");
+    //     model_saved = true;
+    // }
+
+    auto compiled_model = core.compile_model(model, "CPU");

     auto infer_request = compiled_model.create_infer_request();

@@ -416,6 +421,18 @@ void ggml_backend_openvino_rms_norm(ggml_tensor * dst)
{ static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { openvino_frontend_compute(backend, cgraph); + // for (int i = 0; i < cgraph->n_nodes; i++) { + // struct ggml_tensor * node = cgraph->nodes[i]; + + // switch (node->op) { + // case GGML_OP_RMS_NORM: + // ggml_backend_openvino_rms_norm(node); + // break; + // default: + // GGML_ABORT("%s: unsupported op %s\n", __func__, ggml_op_desc(node)); + // } + // } + return GGML_STATUS_SUCCESS; GGML_UNUSED(backend); From 2b04bd43be03b320d76b0e435290467326c636ef Mon Sep 17 00:00:00 2001 From: zhanmyz Date: Wed, 15 Jan 2025 00:37:49 +0800 Subject: [PATCH 024/254] Add MUL_MAT,CPY,CONT as operators implemented in OpenVINO for GGML backend --- ggml/src/ggml-openvino.cpp | 430 +++++++++++++++++++++++- ggml/src/ggml-openvino/ggml-decoder.cpp | 5 +- ggml/src/ggml-openvino/ggml-decoder.h | 2 +- ggml/src/ggml-openvino/utils.cpp | 8 +- ggml/src/ggml-openvino/utils.h | 2 +- 5 files changed, 427 insertions(+), 20 deletions(-) diff --git a/ggml/src/ggml-openvino.cpp b/ggml/src/ggml-openvino.cpp index f8389f06b5..07aff4b72e 100644 --- a/ggml/src/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino.cpp @@ -1,6 +1,7 @@ -#include "ggml-openvino.h" #include "ggml-backend-impl.h" +#include "ggml-cpu-impl.h" #include "ggml-impl.h" +#include "ggml-openvino.h" #include "ggml-openvino/utils.h" #include @@ -418,20 +419,425 @@ void ggml_backend_openvino_rms_norm(ggml_tensor * dst) { } } + +void ggml_backend_openvino_mul_mat(struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + + GGML_TENSOR_BINARY_OP_LOCALS + + const int ith = 0; + const int nth = 1; + + const enum ggml_type type = src0->type; + const auto *type_traits = ggml_get_type_traits(type); + + enum ggml_type const vec_dot_type = type_traits->vec_dot_type; + ggml_from_float_t const from_float = type_traits->from_float; + ggml_from_float_to_mat_t const from_float_to_mat = type_traits->from_float_to_mat; + int64_t const vec_dot_num_rows = type_traits->nrows; + int64_t const matmul_num_cols = type_traits->ncols; + int64_t const blck_size_interleave = type_traits->blck_size_interleave; + ggml_gemv_t const gemv = type_traits->gemv; + ggml_gemm_t const gemm = type_traits->gemm; + + GGML_ASSERT(ne0 == ne01); + GGML_ASSERT(ne1 == ne11); + GGML_ASSERT(ne2 == ne12); + GGML_ASSERT(ne3 == ne13); + + // we don't support permuted src0 or src1 + GGML_ASSERT(nb00 == ggml_type_size(type)); + GGML_ASSERT(nb10 == ggml_type_size(src1->type)); + + // dst cannot be transposed or permuted + GGML_ASSERT(nb0 == sizeof(float)); + GGML_ASSERT(nb0 <= nb1); + GGML_ASSERT(nb1 <= nb2); + GGML_ASSERT(nb2 <= nb3); + + // src1->type = GGML_TYPE_F32, vec_dot_type = GGML_TYPE_F16 + // The main function of this code is to convert the data of src1 from GGML_TYPE_F32 type to vec_dot_type (i.e. GGML_TYPE_F16) and store the result in params->wdata. + // The code processes data of different dimensions through multiple loops and conditional judgments and uses different conversion functions to complete data conversion. 
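// Illustrative sizing note (assumed example shapes, not taken from the patch):
// for a non-quantized type, ggml_row_size(t, n) is simply n * ggml_type_size(t),
// so with vec_dot_type == GGML_TYPE_F16 the scratch buffer below holds
// ne13 * ne12 * ne11 converted rows of ne10 half floats. For ne10=96, ne11=7,
// ne12=32, ne13=1 (the shapes quoted elsewhere in this series) that is
// 1 * 32 * 7 * (96 * 2) = 43008 bytes.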
+ std::unique_ptr wdata(new char[ne13 * ggml_row_size(vec_dot_type, ne10) * ne11 * ne12]); + if (src1->type != vec_dot_type) { + const size_t nbw1 = ggml_row_size(vec_dot_type, ne10); + const size_t nbw2 = nbw1*ne11; + const size_t nbw3 = nbw2*ne12; + + GGML_ASSERT(src1->type == GGML_TYPE_F32); + + for (int64_t i13 = 0; i13 < ne13; ++i13) { + for (int64_t i12 = 0; i12 < ne12; ++i12) { + for (int64_t i11 = ith; i11 < ne11; i11 += nth) { + from_float((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), + (void *) (wdata.get() + i13*nbw3 + i12*nbw2 + i11*nbw1), + ne10); + } + } + } + } + + // This is the size of the first dimension of the result, so we can iterate that way. (see the ASSERT above, these are the same numbers) + const int64_t nr0 = ne0; + + // This is the size of the rest of the dimensions of the result + const int64_t nr1 = ne1 * ne2 * ne3; + + // dot kernels can handle 1 row and col at a time, but mmla kernels can process 2 rows and cols + int64_t num_rows_per_vec_dot = vec_dot_num_rows; + // TODO: currently the mmla kernels support only even numbered rows/cols. + // this check can be removed once they are extended to support odd numbered rows/cols too + if ((nr0 % 2 != 0) || (ne11 % 2 != 0)) { + num_rows_per_vec_dot = 1; + } + + // Now select a reasonable chunk size. + int chunk_size = 16; + + // We need to step up the size if it's small + if (nr0 == 1 || nr1 == 1) { + chunk_size = 64; + } + + // distribute the work across the inner or outer loop based on which one is larger + // The number of chunks in the 0/1 dim. + // CEIL(nr0/chunk_size) + int64_t nchunk0 = (nr0 + chunk_size - 1) / chunk_size; + int64_t nchunk1 = (nr1 + chunk_size - 1) / chunk_size; + + // The number of elements in each chunk + const int64_t dr0 = (nr0 + nchunk0 - 1) / nchunk0; + const int64_t dr1 = (nr1 + nchunk1 - 1) / nchunk1; + + // The first chunk comes from our thread_id, the rest will get auto-assigned. + int current_chunk = ith; + + while (current_chunk < nchunk0 * nchunk1) { + const int64_t ith0 = current_chunk % nchunk0; + const int64_t ith1 = current_chunk / nchunk0; + + const int64_t ir0_start = dr0 * ith0; + const int64_t ir0_end = MIN(ir0_start + dr0, nr0); + + const int64_t ir1_start = dr1 * ith1; + const int64_t ir1_end = MIN(ir1_start + dr1, nr1); + + const bool src1_cont = ggml_is_contiguous(src1); + + ggml_vec_dot_t const vec_dot = type_traits->vec_dot; + enum ggml_type const vec_dot_type = type_traits->vec_dot_type; + + // broadcast factors + const int64_t r2 = ne12 / ne02; + const int64_t r3 = ne13 / ne03; + + // threads with no work simply yield (not sure if it helps) + if (ir0_start >= ir0_end || ir1_start >= ir1_end) { + return; + } + + // const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata; + const size_t row_size = ggml_row_size(vec_dot_type, ne10); + + assert(ne12 % ne02 == 0); + assert(ne13 % ne03 == 0); + + // block-tiling attempt + const int64_t blck_0 = 16; + const int64_t blck_1 = 16; + + const size_t src1_col_stride = src1_cont || src1->type != vec_dot_type ? 
row_size : nb11; + + // attempt to reduce false-sharing (does not seem to make a difference) + // 16 * 2, accounting for mmla kernels + float tmp[32]; + + for (int64_t iir1 = ir1_start; iir1 < ir1_end; iir1 += blck_1) { + for (int64_t iir0 = ir0_start; iir0 < ir0_end; iir0 += blck_0) { + for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir1_end; ir1 += num_rows_per_vec_dot) { + const int64_t i13 = (ir1 / (ne12 * ne1)); + const int64_t i12 = (ir1 - i13 * ne12 * ne1) / ne1; + const int64_t i11 = (ir1 - i13 * ne12 * ne1 - i12 * ne1); + + // broadcast src0 into src1 + const int64_t i03 = i13 / r3; + const int64_t i02 = i12 / r2; + + const int64_t i1 = i11; + const int64_t i2 = i12; + const int64_t i3 = i13; + + const char * src0_row = (const char*)src0->data + (0 + i02 * nb02 + i03 * nb03); + + // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides + // if it is, then we have either copied the data to params->wdata and made it contiguous or we are using + // the original src1 data pointer, so we should index using the indices directly + const char * src1_col = (const char*)wdata.get() + + (src1_cont || src1->type != vec_dot_type + ? (i11 + i12 * ne11 + i13 * ne12 * ne11) * row_size + : (i11 * nb11 + i12 * nb12 + i13 * nb13)); + float * dst_col = (float*)((char*)dst->data + (i1 * nb1 + i2 * nb2 + i3 * nb3)); + + for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ir0 += num_rows_per_vec_dot) { + vec_dot(ne00, &tmp[ir0 - iir0], + (num_rows_per_vec_dot > 1 ? 16 : 0), + src0_row + ir0 * nb01, + (num_rows_per_vec_dot > 1 ? nb01 : 0), + src1_col, + (num_rows_per_vec_dot > 1 ? src1_col_stride : 0), + num_rows_per_vec_dot); + } + + for (int cn = 0; cn < num_rows_per_vec_dot; ++cn) { + memcpy(&dst_col[iir0 + cn * nb1 / nb0], tmp + (cn * 16), (MIN(iir0 + blck_0, ir0_end) - iir0) * sizeof(float)); + } + } + } + } + + if (nth >= nchunk0 * nchunk1) { + break; + } + + // current_chunk = atomic_fetch_add_explicit(¶ms->threadpool->current_chunk, 1, memory_order_relaxed); + current_chunk++; + } +} + +void ggml_backend_openvino_reshape(ggml_tensor *dst) { + + GGML_UNUSED(dst); +} + +void ggml_backend_openvino_view(ggml_tensor *dst) { + + GGML_UNUSED(dst); +} + +void ggml_backend_openvino_dup_bytes(struct ggml_tensor *dst) { + const struct ggml_tensor *src0 = dst->src[0]; + + // Validate tensor properties + GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0)); + GGML_ASSERT(src0->type == dst->type); + + // Determine tensor properties + const size_t element_size = ggml_type_size(src0->type); + + // Case 1: Both tensors are contiguous + if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst)) { + // OpenVINO tensors for src and dst + // Source is 1D since it's contiguous + ov::Tensor src_tensor(ov::element::f32, {src0->ne[0]}, src0->data); + // // Destination is 1D since it's contiguous + ov::Tensor dst_tensor(ov::element::f32, {dst->ne[0]}, dst->data); + + // Perform the memory copy row by row + size_t row_size = dst->nb[0]; // Size of one row in destination + size_t src_stride = src0->nb[0]; // Stride for source tensor + + for (size_t i = 0; i < dst->ne[0]; ++i) { + std::memcpy((char *)dst_tensor.data()+i*row_size, (char *)src_tensor.data()+i*src_stride, row_size); + } + return; + } + + // Case 2: Compatible types, dimensions, and strides + const size_t ne00 = src0->ne[0]; + const size_t ne01 = src0->ne[1]; + const size_t nb00 = src0->nb[0]; + const size_t nb01 = src0->nb[1]; + const size_t nb0 = dst->nb[0]; + + if (src0->type == dst->type && ne00 
== dst->ne[0] && nb00 == element_size && nb0 == element_size) { + for (size_t i01 = 0; i01 < ne01; ++i01) { + const char *src_row = reinterpret_cast(src0->data) + i01 * nb01; + char *dst_row = reinterpret_cast(dst->data) + i01 * dst->nb[1]; + + ov::Tensor src_row_tensor(ov::element::f32, {ne00}, const_cast(reinterpret_cast(src_row))); + ov::Tensor dst_row_tensor(ov::element::f32, {ne00}, reinterpret_cast(dst_row)); + + std::memcpy(dst_row_tensor.data(), src_row_tensor.data(), ne00 * sizeof(float)); + } + return; + } + + // Case 3: Non-contiguous source, contiguous destination + const int64_t ne02 = src0->ne[2]; + const int64_t ne03 = src0->ne[3]; + const int64_t nb02 = src0->nb[2]; + const int64_t nb03 = src0->nb[3]; + + // dst->ne =[3072,7,1,1], dst->nb =[4,12288,86016,86016], dst->type=GGML_TYPE_F32 + // dst->src[0]->ne=[96,32,7,1], dst->src[0]->nb=[4,2688,384,86016], dst->src[0]->type=GGML_TYPE_F32 + if (ggml_is_contiguous(dst)) { + const size_t rs = ne00 * element_size; // Row size in bytes for dst + + // Create OpenVINO tensors for source and destination + // The tensors are reshaped to a 2D structure (num_rows x ne00) for easier iteration and compatibility with the simplified loop. + ov::Tensor src_tensor(ov::element::f32, ov::Shape{ne03 * ne02 * ne01, ne00}, src0->data); + ov::Tensor dst_tensor(ov::element::f32, ov::Shape{ne03 * ne02 * ne01, ne00}, dst->data); + + // Perform the copy in a single loop + const size_t num_rows = ne03 * ne02 * ne01; + for (size_t row = 0; row < num_rows; ++row) { + // Calculate the source row pointer based on original strides + // The source row pointer is calculated based on the combined index row and the strides nb03, nb02, and nb01. + const char* src0_ptr = (char*)src_tensor.data() + + // Calculates which block of the i03 dimension the current row belongs to + (row / (ne02 * ne01)) * nb03 + // 0 + // Calculates which block of the i02 dimension the current row belongs to within the current i03 block. + ((row / ne01) % ne02) * nb02 + // 0, 0,......, 0,384, 384,......, 384,768,......, 2304 + // Calculates the position within the current i02 block in terms of the i01 index. + (row % ne01) * nb01; // 0,2688,......,83328, 0, 2688,......,83328, 0,......, 83328 + + // Destination row pointer is linear + // Since dst is contiguous, its rows are accessed linearly using a single stride rs, simplifying the destination pointer calculation. + char* dst_ptr = (char*)dst_tensor.data() + row * rs; + + // Copy row + std::memcpy(dst_ptr, src0_ptr, rs); + } + return; + } + std::cout << "Duplication of bytes completed successfully." 
<< std::endl; +} + +static void ggml_backend_openvino_transpose(ggml_tensor *dst) { + // NOP + GGML_UNUSED(dst); +} + +static void ggml_backend_openvino_permute(const struct ggml_tensor * dst) { + // NOP + GGML_UNUSED(dst); +} + +void ggml_backend_openvino_cpy(struct ggml_tensor *dst) { + const struct ggml_tensor *src0 = dst->src[0]; + assert(src0 != nullptr); + assert(ggml_nelements(dst) == ggml_nelements(src0)); + + // Extract shapes + ov::Shape src_shape(src0->ne, src0->ne + 4); + ov::Shape dst_shape(dst->ne, dst->ne + 4); + + // Initialize OpenVINO core + ov::Core core; + + // Create OpenVINO parameter for the source tensor + auto src_input = std::make_shared(ov::element::f32, src_shape); + + std::shared_ptr model; + if (ggml_is_contiguous(dst)) { + // Contiguous Case: Flatten src and reshape to dst shape + ov::Shape flattened_shape = {ggml_nelements(src0)}; + auto flatten = std::make_shared( + src_input, ov::op::v0::Constant::create(ov::element::i64, {1}, flattened_shape), false); + + auto reshape_to_dst = std::make_shared( + flatten, ov::op::v0::Constant::create(ov::element::i64, {4}, dst_shape), false); + + auto dst_output = std::make_shared(reshape_to_dst, ov::element::f16); + + model = std::make_shared( + ov::ResultVector{std::make_shared(dst_output)}, + ov::ParameterVector{src_input}, + "ContiguousCopy"); + // Compile and execute the model + auto compiled_model = core.compile_model(model, "CPU"); + + ov::Tensor src_tensor(ov::element::f32, src_shape, src0->data); + ov::Tensor dst_tensor(ov::element::f16, dst_shape, dst->data); + + auto infer_request = compiled_model.create_infer_request(); + infer_request.set_input_tensor(0, src_tensor); + infer_request.set_output_tensor(0, dst_tensor); + infer_request.infer(); + } else { + // Non-contiguous case: element-wise copy + for (int64_t i03 = 0; i03 < dst->ne[3]; ++i03) { + for (int64_t i02 = 0; i02 < dst->ne[2]; ++i02) { + for (int64_t i01 = 0; i01 < dst->ne[1]; ++i01) { + for (int64_t i00 = 0; i00 < dst->ne[0]; ++i00) { + const char *src_ptr = static_cast(src0->data) + + i00 * src0->nb[0] + i01 * src0->nb[1] + + i02 * src0->nb[2] + i03 * src0->nb[3]; + + char *dst_ptr = static_cast(dst->data) + + i00 * dst->nb[0] + i01 * dst->nb[1] + + i02 * dst->nb[2] + i03 * dst->nb[3]; + + *(ggml_fp16_t *)dst_ptr = GGML_FP32_TO_FP16(*(const float *)src_ptr); + } + } + } + } + } +} + static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { - openvino_frontend_compute(backend, cgraph); + // Find the indices of GGML_OP_CONT, GGML_OP_CPY nodes, GGML_OP_MUL_MAT and so on. 
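// Dispatch overview: ops with a native handler below (CONT, CPY, MUL_MAT, ...)
// run one node at a time, while every maximal run of the remaining nodes is
// compiled and executed as a single OpenVINO subgraph through
// openvino_frontend_compute(backend, cgraph, start_index, end_index), e.g.
//
//    CONT | ADD MUL RMS_NORM | MUL_MAT | SOFT_MAX ROPE
//   native|  frontend range  |  native |  frontend range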
+    std::vector<int> cont_indices;
+    std::vector<int> reshape_indices;
+    std::vector<int> view_indices;
+
+    std::vector<int> cpy_indices;
+    std::vector<int> transpose_indices;
+    std::vector<int> permute_indices;
+
+    std::vector<int> mul_mat_indices;
+
+    for (int i = 0; i < cgraph->n_nodes; i++) {
+        if (cgraph->nodes[i]->op == GGML_OP_CONT) {
+            cont_indices.push_back(i);
+        } else if (cgraph->nodes[i]->op == GGML_OP_RESHAPE) {
+            reshape_indices.push_back(i);
+        } else if (cgraph->nodes[i]->op == GGML_OP_VIEW) {
+            view_indices.push_back(i);
+        } else if (cgraph->nodes[i]->op == GGML_OP_CPY) {
+            cpy_indices.push_back(i);
+        } else if (cgraph->nodes[i]->op == GGML_OP_TRANSPOSE) {
+            transpose_indices.push_back(i);
+        } else if (cgraph->nodes[i]->op == GGML_OP_PERMUTE) {
+            permute_indices.push_back(i);
+        } else if (cgraph->nodes[i]->op == GGML_OP_MUL_MAT) {
+            mul_mat_indices.push_back(i);
+        }
+    }
+
+    // Process nodes in order
+    for (int i = 0; i < cgraph->n_nodes; i++) {
+        if (std::find(cont_indices.begin(), cont_indices.end(), i) != cont_indices.end()) {
+            ggml_backend_openvino_dup_bytes(cgraph->nodes[i]);
+        } else if (std::find(reshape_indices.begin(), reshape_indices.end(), i) != reshape_indices.end()) {
+            ggml_backend_openvino_reshape(cgraph->nodes[i]);
+        } else if (std::find(view_indices.begin(), view_indices.end(), i) != view_indices.end()) {
+            ggml_backend_openvino_view(cgraph->nodes[i]);
+        } else if (std::find(cpy_indices.begin(), cpy_indices.end(), i) != cpy_indices.end()) {
+            ggml_backend_openvino_cpy(cgraph->nodes[i]);
+        } else if (std::find(transpose_indices.begin(), transpose_indices.end(), i) != transpose_indices.end()) {
+            ggml_backend_openvino_transpose(cgraph->nodes[i]);
+        } else if (std::find(permute_indices.begin(), permute_indices.end(), i) != permute_indices.end()) {
+            ggml_backend_openvino_permute(cgraph->nodes[i]);
+        } else if (std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) != mul_mat_indices.end()) {
+            ggml_backend_openvino_mul_mat(cgraph->nodes[i]);
+        } else {
+            // Process a range of nodes with openvino_frontend_compute
+            int start_index = i;
+            while (i < cgraph->n_nodes &&
+                   std::find(cont_indices.begin(), cont_indices.end(), i) == cont_indices.end() &&
+                   std::find(cpy_indices.begin(), cpy_indices.end(), i) == cpy_indices.end() &&
+                   std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) == mul_mat_indices.end()) {
+                i++;
+            }
+            if (start_index < i) {
+                openvino_frontend_compute(backend, cgraph, start_index, --i);
+            }
+        }
+    }

     return GGML_STATUS_SUCCESS;

diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp
index 4f351266c6..172c72ff50 100644
--- a/ggml/src/ggml-openvino/ggml-decoder.cpp
+++ b/ggml/src/ggml-openvino/ggml-decoder.cpp
@@ -76,7 +76,7 @@
-GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor * node, struct ggml_cgraph * cgraph)
+GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor * node, struct ggml_cgraph * cgraph, const int32_t start_index, const int32_t end_index)
     :m_cgraph(cgraph),
      m_node(node),
      m_op_name(m_node ? std::string(m_node->name) : "NONE_OP") {
@@ -88,7 +88,8 @@
     if (m_node) {
         set_input_output(m_node, m_inputs, m_outputs);
     } else {
-        for (int node_n = 0; node_n < m_cgraph->n_nodes; node_n++) {
+        // for (int node_n = 0; node_n < m_cgraph->n_nodes; node_n++) {
+        for (int node_n = start_index; node_n <= end_index; node_n++) {
             auto cur_node = m_cgraph->nodes[node_n];
m_nodes.push_back(cur_node); // Init model input and output diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index 56bb3f889f..2bb2f585f1 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -7,7 +7,7 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { public: using ov::frontend::ggml::GgmlDecoder::GgmlDecoder; - GgmlOvDecoder(struct ggml_tensor * node, struct ggml_cgraph * cgraph); + GgmlOvDecoder(struct ggml_tensor * node, struct ggml_cgraph * cgraph, const int32_t start_index=0, const int32_t end_index=0); virtual ov::Any get_attribute(const std::string& name) const override { return nullptr; diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 3bc5779b49..84c9001c5c 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -6,8 +6,8 @@ using ov::frontend::ggml::GgmlDecoder; -std::shared_ptr get_ggml_decoder(struct ggml_cgraph * cgraph) { - return std::make_shared(nullptr, cgraph); +std::shared_ptr get_ggml_decoder(struct ggml_cgraph * cgraph, const int32_t start_index, const int32_t end_index) { + return std::make_shared(nullptr, cgraph, start_index, end_index); } std::map get_ggml_graph_input_tensors(std::shared_ptr ggml_decoder) { @@ -52,7 +52,7 @@ static ov::frontend::FrontEnd::Ptr get_ggml_frontend() { return front_end; } -enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { +enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph, const int32_t start_index, const int32_t end_index) { ov::Core core; auto devices = core.get_available_devices(); // Get GGML Frontend @@ -65,7 +65,7 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c GGML_LOG_INFO("GGML FrontEnd is initialized \n"); #endif } - auto ggml_decoder = get_ggml_decoder(cgraph); + auto ggml_decoder = get_ggml_decoder(cgraph, start_index, end_index); std::shared_ptr graph_decoder = ggml_decoder; // Load GraphIterator -> InputModel ov::frontend::InputModel::Ptr input_model = front_end->load(graph_decoder); diff --git a/ggml/src/ggml-openvino/utils.h b/ggml/src/ggml-openvino/utils.h index 7ec633beda..fc5268d98a 100644 --- a/ggml/src/ggml-openvino/utils.h +++ b/ggml/src/ggml-openvino/utils.h @@ -1,4 +1,4 @@ #include "ggml-decoder.h" #include "ggml-backend-impl.h" -enum ggml_status openvino_frontend_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph); +enum ggml_status openvino_frontend_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph, const int32_t start_index=0, const int32_t end_index=0); From cb2729bc4a0f5959e19cc140e521809e19ceff90 Mon Sep 17 00:00:00 2001 From: zhanmyz Date: Wed, 22 Jan 2025 15:22:56 +0800 Subject: [PATCH 025/254] Move CPY from GGML OV Backend to OV Frontend --- ggml/src/ggml-openvino.cpp | 7 +- ggml/src/ggml-openvino/decoder.h | 2 + ggml/src/ggml-openvino/ggml-decoder.cpp | 100 +++++++++++++++++++++++- ggml/src/ggml-openvino/ggml-decoder.h | 4 + 4 files changed, 107 insertions(+), 6 deletions(-) diff --git a/ggml/src/ggml-openvino.cpp b/ggml/src/ggml-openvino.cpp index 07aff4b72e..444ccdf366 100644 --- a/ggml/src/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino.cpp @@ -815,9 +815,9 @@ static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backe } else if (std::find(reshape_indices.begin(), reshape_indices.end(), i) != reshape_indices.end()) { ggml_backend_openvino_reshape(cgraph->nodes[i]); } 
else if (std::find(view_indices.begin(), view_indices.end(), i) != view_indices.end()) { - ggml_backend_openvino_view(cgraph->nodes[i]); - } else if (std::find(cpy_indices.begin(), cpy_indices.end(), i) != cpy_indices.end()) { - ggml_backend_openvino_cpy(cgraph->nodes[i]); + ggml_backend_openvino_view(cgraph->nodes[i]); + // } else if (std::find(cpy_indices.begin(), cpy_indices.end(), i) != cpy_indices.end()) { + // ggml_backend_openvino_cpy(cgraph->nodes[i]); } else if (std::find(transpose_indices.begin(), transpose_indices.end(), i) != transpose_indices.end()) { ggml_backend_openvino_transpose(cgraph->nodes[i]); } else if (std::find(permute_indices.begin(), permute_indices.end(), i) != permute_indices.end()) { @@ -829,7 +829,6 @@ static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backe int start_index = i; while (i < cgraph->n_nodes && std::find(cont_indices.begin(), cont_indices.end(), i) == cont_indices.end() && - std::find(cpy_indices.begin(), cpy_indices.end(), i) == cpy_indices.end() && std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) == mul_mat_indices.end()) { i++; } diff --git a/ggml/src/ggml-openvino/decoder.h b/ggml/src/ggml-openvino/decoder.h index c7f1bbd725..56f2ddcc80 100644 --- a/ggml/src/ggml-openvino/decoder.h +++ b/ggml/src/ggml-openvino/decoder.h @@ -51,6 +51,8 @@ public: // virtual size_t output(size_t index) const = 0; + virtual bool check_if_continuous() const = 0; + }; } // namespace ggml diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 172c72ff50..355a95d978 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -1,6 +1,7 @@ #include "ggml-decoder.h" #include #include +#include void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::map& inputs, std::map& outputs) { switch (node->op) { @@ -9,8 +10,6 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapsrc[0]->name] = node->src[0]; @@ -19,6 +18,103 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapname); break; } + case GGML_OP_CONT: + { + if (ggml_is_contiguous(node->src[0]) && ggml_is_contiguous(node)) { + inputs[node->src[0]->name] = node->src[0]; + outputs[node->name] = node; + m_input_names.push_back(node->src[0]->name); + m_output_names.push_back(node->name); + m_continuous = true; + break; + } + + if (node->src[0]->type == node->type && node->src[0]->ne[0] == node->ne[0] && + node->src[0]->nb[0] == ggml_type_size(node->src[0]->type) && node->nb[0] == ggml_type_size(node->src[0]->type)) { + + for (size_t i01 = 0; i01 < node->src[0]->ne[1]; ++i01) { + const char *src_row = reinterpret_cast(node->src[0]->data) + i01 * node->src[0]->nb[1]; + char *dst_row = reinterpret_cast(node->data) + i01 * node->nb[1]; + std::memcpy(dst_row, src_row, node->src[0]->ne[0] * ggml_type_size(node->src[0]->type)); + } + + inputs[node->name] = node; + outputs[node->name] = node; + m_input_names.push_back(node->name); + m_output_names.push_back(node->name); + m_continuous = false; + break; + } + + // if (ggml_is_contiguous(node)) { + const size_t rs = node->src[0]->ne[0] * ggml_type_size(node->src[0]->type); // Row size in bytes for dst + + // Create OpenVINO tensors for source and destination + // The tensors are reshaped to a 2D structure (num_rows x ne00) for easier iteration and compatibility with the simplified loop. 
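// Recap of the ggml stride convention this copy relies on: for a float
// tensor, nb[0] is the element size in bytes and nb[i] is the byte stride
// along dimension i, so a source row lives at i03*nb[3] + i02*nb[2] + i01*nb[1].
// With the example quoted later in this series, ne=[96,32,7,1] and
// nb=[4,2688,384,86016] describe a permuted view: consecutive i1 rows sit
// 2688 bytes apart, yet each carries only 96 * 4 = 384 bytes of valid data.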
+ ov::Tensor src_tensor(ov::element::f32, + ov::Shape{node->src[0]->ne[3] * node->src[0]->ne[2] * node->src[0]->ne[1], node->src[0]->ne[0]}, + node->src[0]->data); + ov::Tensor dst_tensor(ov::element::f32, + ov::Shape{node->src[0]->ne[3] * node->src[0]->ne[2] * node->src[0]->ne[1], node->src[0]->ne[0]}, + node->data); + + // Perform the copy in a single loop + const size_t num_rows = node->src[0]->ne[3] * node->src[0]->ne[2] * node->src[0]->ne[1]; + for (size_t row = 0; row < num_rows; ++row) { + // Calculate the source row pointer based on original strides + // The source row pointer is calculated based on the combined index row and the strides nb03, nb02, and nb01. + const char* src0_ptr = (char*)src_tensor.data() + + // Calculates which block of the i03 dimension the current row belongs to + (row / (node->src[0]->ne[2] * node->src[0]->ne[1])) * node->src[0]->nb[3] + // 0 + // Calculates which block of the i02 dimension the current row belongs to within the current i03 block. + ((row / node->src[0]->ne[1]) % node->src[0]->ne[2]) * node->src[0]->nb[2] + // 0, 0,......, 0,384, 384,......, 384,768,......, 2304 + // Calculates the position within the current i02 block in terms of the i01 index. + (row % node->src[0]->ne[1]) * node->src[0]->nb[1]; // 0,2688,......,83328, 0, 2688,......,83328, 0,......, 83328 + + // Destination row pointer is linear + // Since dst is contiguous, its rows are accessed linearly using a single stride rs, simplifying the destination pointer calculation. + char* dst_ptr = (char*)dst_tensor.data() + row * rs; + + // Copy row + std::memcpy(dst_ptr, src0_ptr, rs); + } + + inputs[node->name] = node; + outputs[node->name] = node; + m_input_names.push_back(node->name); + m_output_names.push_back(node->name); + m_continuous = false; + break; + //} + } + case GGML_OP_CPY: + { + if (ggml_is_contiguous(node)) { + inputs[node->src[0]->name] = node->src[0]; + outputs[node->name] = node; + m_input_names.push_back(node->src[0]->name); + m_output_names.push_back(node->name); + m_continuous = true; + break; + } else { + for (int64_t i1 = 0; i1 < node->ne[1]; ++i1) { // ne[1] = 3072 + for (int64_t i0 = 0; i0 < node->ne[0]; ++i0) { // ne[0] = 7 + int64_t src_index = i0 * node->src[0]->nb[0] / sizeof(float) + // stride in nb[0] + i1 * node->src[0]->nb[1] / sizeof(float); // stride in nb[1] + char *dst_ptr = static_cast(node->data) + + i0 * node->nb[0] + i1 * node->nb[1]; + *(ggml_fp16_t *)dst_ptr = GGML_FP32_TO_FP16(((float*)node->src[0]->data)[src_index]); + } + } + // inputs[node->src[0]->name] = node->src[0]; + inputs[node->name] = node; + outputs[node->name] = node; + m_input_names.push_back(node->name); + m_output_names.push_back(node->name); + m_continuous = false; + break; + } + } // For view, input is node itself case GGML_OP_VIEW: { diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index 2bb2f585f1..2afde161ee 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -62,6 +62,9 @@ public: return m_outputs.at(name); } + virtual bool check_if_continuous() const override { + return m_continuous; + } private: void set_input_output(ggml_tensor* node, std::map& inputs, std::map& outputs); @@ -75,5 +78,6 @@ private: std::vector> m_decoders; const std::string m_op_name; mutable std::string m_name; + bool m_continuous; }; From 8484769981690c497452913680a5ac442ceed1fd Mon Sep 17 00:00:00 2001 From: zhanmyz Date: Tue, 18 Feb 2025 14:11:07 +0800 Subject: [PATCH 026/254] add implementation of MUL_MAT, CPY, 
CONT of GGML ops using OV ops

---
 ggml/src/ggml-openvino.cpp              | 609 +++++++++++++++++-------
 ggml/src/ggml-openvino/ggml-decoder.cpp |   1 +
 ggml/src/ggml-openvino/ggml-decoder.h   |  10 +
 ggml/src/ggml-openvino/utils.cpp        |   1 +
 4 files changed, 443 insertions(+), 178 deletions(-)

diff --git a/ggml/src/ggml-openvino.cpp b/ggml/src/ggml-openvino.cpp
index 444ccdf366..99a32b1dfd 100644
--- a/ggml/src/ggml-openvino.cpp
+++ b/ggml/src/ggml-openvino.cpp
@@ -419,191 +419,200 @@ void ggml_backend_openvino_rms_norm(ggml_tensor * dst) {
     }
 }

+// Extracting valid shapes
+std::vector<int64_t> get_effective_shape(const ggml_tensor * t) {
+    std::vector<int64_t> shape;
+    for (int i = 2; i >= 0; i--) {
+        if (t->ne[i] != 1 || t->ne[2] != 1)
+            shape.push_back(t->ne[i]);
+    }
+    return shape;
+}
+
+/*
+* Construct an index vector for Gather to extract non-contiguous data.
+* Parameters:
+* - valid_cols: number of valid columns per row (e.g., for src0, valid columns = 96)
+* - num_rows: number of rows in each batch (e.g., src0: 32 rows per batch)
+* - batch: number of batches (e.g., 32)
+* - row_stride: physical row length (in elements), e.g., src0: nb[1]/(element_size) = 6144/2 = 3072
+* - batch_stride: physical batch stride (in elements), e.g., src0: nb[2]/(element_size) = 192/2 = 96
+*/
+std::vector<int64_t> build_indices(int valid_cols, int num_rows, int batch, int row_stride, int batch_stride) {
+    std::vector<int64_t> indices;
+    indices.reserve(valid_cols * num_rows * batch);
+    for (int b = 0; b < batch; b++) {
+        for (int r = 0; r < num_rows; r++) {
+            for (int c = 0; c < valid_cols; c++) {
+                // physical index = b * batch_stride + r * row_stride + c
+                indices.push_back(b * batch_stride + r * row_stride + c);
+            }
+        }
+    }
+    return indices;
+}

 void ggml_backend_openvino_mul_mat(struct ggml_tensor * dst) {
+    assert(dst && dst->src[0] && dst->src[1]);
+    const ggml_tensor * src0 = dst->src[0]; // src0 type F16
+    const ggml_tensor * src1 = dst->src[1]; // src1 type F32

-    const struct ggml_tensor * src0 = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
+    if(!ggml_is_contiguous(src1) || dst->src[1]->ne[0] * dst->src[1]->nb[0] != dst->src[1]->nb[1]) {
+        int valid_cols_src0 = dst->src[0]->ne[0];
+        int num_rows_src0 = dst->src[0]->ne[1];
+        int batch_src0 = dst->src[0]->ne[2];
+        int valid_cols_src1 = dst->src[1]->ne[0];
+        int num_rows_src1 = dst->src[1]->ne[1];
+        int batch_src1 = dst->src[1]->ne[2];
+        int row_stride_src0 = dst->src[0]->nb[1] / dst->src[0]->nb[0];
+        int batch_stride_src0 = dst->src[0]->nb[2] / dst->src[0]->nb[0];

-    GGML_TENSOR_BINARY_OP_LOCALS
+        int row_stride_src1 = dst->src[1]->nb[1] / dst->src[1]->nb[0];
+        int batch_stride_src1 = dst->src[1]->nb[2] / dst->src[1]->nb[0];

-    const int ith = 0;
-    const int nth = 1;
+        std::vector<int64_t> indices_src0 = build_indices(valid_cols_src0, num_rows_src0, batch_src0, row_stride_src0, batch_stride_src0);
+        std::vector<int64_t> indices_src1 = build_indices(valid_cols_src1, num_rows_src1, batch_src1, row_stride_src1, batch_stride_src1);

-    const enum ggml_type type = src0->type;
-    const auto *type_traits = ggml_get_type_traits(type);
+        // Total number of elements
+        size_t total_src0 = indices_src0.size(); // = 96 * 32 * 32
+        size_t total_src1 = indices_src1.size(); // = 96 * 7 * 32

-    enum ggml_type const vec_dot_type = type_traits->vec_dot_type;
-    ggml_from_float_t const from_float = type_traits->from_float;
-    ggml_from_float_to_mat_t const from_float_to_mat = type_traits->from_float_to_mat;
-    int64_t const vec_dot_num_rows = type_traits->nrows;
-    int64_t const matmul_num_cols = type_traits->ncols;
-    int64_t const blck_size_interleave = type_traits->blck_size_interleave;
blck_size_interleave = type_traits->blck_size_interleave; - ggml_gemv_t const gemv = type_traits->gemv; - ggml_gemm_t const gemm = type_traits->gemm; + // Treat src0->data and src1->data as 1D tensors + // Note: The total length of physical data should be enough to cover the last valid element index + 1. + // flat shapes: + ov::Shape flat_shape_src0 = { total_src0 }; + ov::Shape flat_shape_src1 = { total_src1 }; - GGML_ASSERT(ne0 == ne01); - GGML_ASSERT(ne1 == ne11); - GGML_ASSERT(ne2 == ne12); - GGML_ASSERT(ne3 == ne13); + // Create a Parameter node for collecting non-continuous data + auto param_src0 = std::make_shared(ov::element::f16, flat_shape_src0); + auto param_src1 = std::make_shared(ov::element::f32, flat_shape_src1); - // we don't support permuted src0 or src1 - GGML_ASSERT(nb00 == ggml_type_size(type)); - GGML_ASSERT(nb10 == ggml_type_size(src1->type)); + // Create an index Constant node + auto indices_const_src0 = ov::op::v0::Constant::create(ov::element::i64, flat_shape_src0, indices_src0); + auto indices_const_src1 = ov::op::v0::Constant::create(ov::element::i64, flat_shape_src1, indices_src1); - // dst cannot be transposed or permuted - GGML_ASSERT(nb0 == sizeof(float)); - GGML_ASSERT(nb0 <= nb1); - GGML_ASSERT(nb1 <= nb2); - GGML_ASSERT(nb2 <= nb3); + // Use the Gather operator to collect valid data + // axis = 0 + auto axis_const = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); + auto gathered_src0 = std::make_shared(param_src0, indices_const_src0, axis_const); + auto gathered_src1 = std::make_shared(param_src1, indices_const_src1, axis_const); - // src1->type = GGML_TYPE_F32, vec_dot_type = GGML_TYPE_F16 - // The main function of this code is to convert the data of src1 from GGML_TYPE_F32 type to vec_dot_type (i.e. GGML_TYPE_F16) and store the result in params->wdata. - // The code processes data of different dimensions through multiple loops and conditional judgments and uses different conversion functions to complete data conversion. - std::unique_ptr wdata(new char[ne13 * ggml_row_size(vec_dot_type, ne10) * ne11 * ne12]); - if (src1->type != vec_dot_type) { - const size_t nbw1 = ggml_row_size(vec_dot_type, ne10); - const size_t nbw2 = nbw1*ne11; - const size_t nbw3 = nbw2*ne12; + // Reshape to batched form: + // For src0: valid matrix size for each batch [num_rows_src0, valid_cols_src0] = [32,96], total batches = 32, + // Therefore, reshape to 3D Tensor: shape = [32, 32, 96] where first dimension is batch. + std::vector shape_src0_cont = { batch_src0, num_rows_src0, valid_cols_src0 }; + auto reshape_src0 = std::make_shared( + gathered_src0, + ov::op::v0::Constant::create(ov::element::i64, { shape_src0_cont.size() }, shape_src0_cont), + false); + // For src1: valid matrix size for each batch [num_rows_src1, valid_cols_src1] = [7,96], batch = 32, + // Reshape to 3D Tensor: shape = [32, 7, 96]. 
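+        // (Illustrative sanity check, assuming the example shapes quoted in these comments:
+        //  after Gather, src1 is a flat run of 32 * 7 * 96 elements, so this Reshape only
+        //  reinterprets that run as [batch, rows, cols]; the batched MatMul below then
+        //  contracts the shared 96-sized axis of the two reshaped operands.)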
+ std::vector shape_src1_cont = { batch_src1, num_rows_src1, valid_cols_src1 }; + auto reshape_src1 = std::make_shared( + gathered_src1, + ov::op::v0::Constant::create(ov::element::i64, { shape_src1_cont.size() }, shape_src1_cont), + false); - GGML_ASSERT(src1->type == GGML_TYPE_F32); + // For src0, first Convert from F16 to F32 + auto src0_f32 = std::make_shared(reshape_src0, ov::element::f32); - for (int64_t i13 = 0; i13 < ne13; ++i13) { - for (int64_t i12 = 0; i12 < ne12; ++i12) { - for (int64_t i11 = ith; i11 < ne11; i11 += nth) { - from_float((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), - (void *) (wdata.get() + i13*nbw3 + i12*nbw2 + i11*nbw1), - ne10); - } - } - } + // Use Batched Transpose: swap the last two dimensions, dimension order [0, 2, 1] + auto transpose_order = ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector{0, 2, 1}); + auto src0_transposed = std::make_shared(src0_f32, transpose_order); + + auto A = src0_transposed; + auto B = reshape_src1; + + auto batched_matmul = std::make_shared(B, A, false, false); + // batched_matmul output: shape = [32,7,32] + + std::vector full_dst_shape = { dst->ne[2], dst->ne[1], dst->ne[0]}; + auto final_shape_const = ov::op::v0::Constant::create(ov::element::i64, { full_dst_shape.size() }, full_dst_shape); + + auto model = std::make_shared(ov::NodeVector{ batched_matmul }, ov::ParameterVector{param_src1, param_src0}); + + ov::Core core; + auto compiled_model = core.compile_model(model, "CPU"); + auto infer_request = compiled_model.create_infer_request(); + + // Construct input Tensors: treat src0->data and src1->data as 1D flat data respectively + ov::Tensor tensor_src0(ov::element::f16, flat_shape_src0, src0->data); + ov::Tensor tensor_src1(ov::element::f32, flat_shape_src1, src1->data); + infer_request.set_input_tensor(0, tensor_src1); + infer_request.set_input_tensor(1, tensor_src0); + + ov::Tensor tensor_dst(ov::element::f32, ov::Shape(full_dst_shape.begin(), full_dst_shape.end()), dst->data); + infer_request.set_output_tensor(0, tensor_dst); + + infer_request.infer(); + return ; } - // This is the size of the first dimension of the result, so we can iterate that way. (see the ASSERT above, these are the same numbers) - const int64_t nr0 = ne0; + const int64_t ne0 = dst->ne[0]; + const int64_t ne1 = dst->ne[1]; - // This is the size of the rest of the dimensions of the result - const int64_t nr1 = ne1 * ne2 * ne3; + // Valid shape + std::vector eff_shape_src0 = get_effective_shape(src0); + std::vector eff_shape_src1 = get_effective_shape(src1); + std::vector eff_shape_dst = get_effective_shape(dst); - // dot kernels can handle 1 row and col at a time, but mmla kernels can process 2 rows and cols - int64_t num_rows_per_vec_dot = vec_dot_num_rows; - // TODO: currently the mmla kernels support only even numbered rows/cols. 
- // this check can be removed once they are extended to support odd numbered rows/cols too - if ((nr0 % 2 != 0) || (ne11 % 2 != 0)) { - num_rows_per_vec_dot = 1; + // Determine whether it is batched (effective rank==3) or two-dimensional (rank==2) or one-dimensional (rank==1) + int rank = static_cast(eff_shape_dst.size()); + if (rank != 1 && rank != 2 && rank != 3) + throw std::runtime_error("Only rank 1, 2 or 3 supported"); + + // Total number of flattened elements + size_t total_src0 = 1; for (auto d : eff_shape_src0) total_src0 *= d; + size_t total_src1 = 1; for (auto d : eff_shape_src1) total_src1 *= d; + + ov::Shape flat_shape_src0 = { total_src0 }; + ov::Shape flat_shape_src1 = { total_src1 }; + + auto param_flat_src0 = std::make_shared(ov::element::f16, flat_shape_src0); + auto param_flat_src1 = std::make_shared(ov::element::f32, flat_shape_src1); + + auto reshape_src0 = std::make_shared( + param_flat_src0, + ov::op::v0::Constant::create(ov::element::i64, { eff_shape_src0.size() }, eff_shape_src0), + false); + auto reshape_src1 = std::make_shared( + param_flat_src1, + ov::op::v0::Constant::create(ov::element::i64, { eff_shape_src1.size() }, eff_shape_src1), + false); + + // Convert src0: F16 -> F32 + auto src0_f32 = std::make_shared(reshape_src0, ov::element::f32); + + // Transpose src0_f32: + // For the 2D case, the shape of reshape_src0 is [3072,9216], and after transposition, it is [9216,3072]. + // For the batched case, assuming the shape is [M, K, Batch], batch-wise transposition is required: use order [0, 2, 1]. + ov::Output A_for_mul; + if (rank == 1) { + auto trans_order = ov::op::v0::Constant::create(ov::element::i64, {2}, std::vector{1, 0}); + A_for_mul = std::make_shared(src0_f32, trans_order); + } else if (rank == 2) { + auto trans_order = ov::op::v0::Constant::create(ov::element::i64, {2}, std::vector{1, 0}); + A_for_mul = std::make_shared(src0_f32, trans_order); + } else { // rank == 3 + auto trans_order = ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector{0, 2, 1}); + A_for_mul = std::make_shared(src0_f32, trans_order); } - // Now select a reasonable chunk size. - int chunk_size = 16; + ov::Core core; + ov::Tensor tensor_src0{ov::element::f16, flat_shape_src0, (void *)src0->data}; + ov::Tensor tensor_src1{ov::element::f32, flat_shape_src1, (void *)src1->data}; + ov::Tensor tensor_dst(ov::element::f32, ov::Shape(eff_shape_dst.begin(), eff_shape_dst.end()), dst->data); - // We need to step up the size if it's small - if (nr0 == 1 || nr1 == 1) { - chunk_size = 64; - } + std::shared_ptr matmul = std::make_shared(reshape_src1, A_for_mul, false, false); + auto model = std::make_shared(ov::NodeVector{matmul}, ov::ParameterVector{param_flat_src1, param_flat_src0}); - // distribute the work across the inner or outer loop based on which one is larger - // The number of chunks in the 0/1 dim. - // CEIL(nr0/chunk_size) - int64_t nchunk0 = (nr0 + chunk_size - 1) / chunk_size; - int64_t nchunk1 = (nr1 + chunk_size - 1) / chunk_size; + auto compiled_model = core.compile_model(model, "CPU"); + auto infer_request = compiled_model.create_infer_request(); - // The number of elements in each chunk - const int64_t dr0 = (nr0 + nchunk0 - 1) / nchunk0; - const int64_t dr1 = (nr1 + nchunk1 - 1) / nchunk1; - - // The first chunk comes from our thread_id, the rest will get auto-assigned. 
- int current_chunk = ith; - - while (current_chunk < nchunk0 * nchunk1) { - const int64_t ith0 = current_chunk % nchunk0; - const int64_t ith1 = current_chunk / nchunk0; - - const int64_t ir0_start = dr0 * ith0; - const int64_t ir0_end = MIN(ir0_start + dr0, nr0); - - const int64_t ir1_start = dr1 * ith1; - const int64_t ir1_end = MIN(ir1_start + dr1, nr1); - - const bool src1_cont = ggml_is_contiguous(src1); - - ggml_vec_dot_t const vec_dot = type_traits->vec_dot; - enum ggml_type const vec_dot_type = type_traits->vec_dot_type; - - // broadcast factors - const int64_t r2 = ne12 / ne02; - const int64_t r3 = ne13 / ne03; - - // threads with no work simply yield (not sure if it helps) - if (ir0_start >= ir0_end || ir1_start >= ir1_end) { - return; - } - - // const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata; - const size_t row_size = ggml_row_size(vec_dot_type, ne10); - - assert(ne12 % ne02 == 0); - assert(ne13 % ne03 == 0); - - // block-tiling attempt - const int64_t blck_0 = 16; - const int64_t blck_1 = 16; - - const size_t src1_col_stride = src1_cont || src1->type != vec_dot_type ? row_size : nb11; - - // attempt to reduce false-sharing (does not seem to make a difference) - // 16 * 2, accounting for mmla kernels - float tmp[32]; - - for (int64_t iir1 = ir1_start; iir1 < ir1_end; iir1 += blck_1) { - for (int64_t iir0 = ir0_start; iir0 < ir0_end; iir0 += blck_0) { - for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir1_end; ir1 += num_rows_per_vec_dot) { - const int64_t i13 = (ir1 / (ne12 * ne1)); - const int64_t i12 = (ir1 - i13 * ne12 * ne1) / ne1; - const int64_t i11 = (ir1 - i13 * ne12 * ne1 - i12 * ne1); - - // broadcast src0 into src1 - const int64_t i03 = i13 / r3; - const int64_t i02 = i12 / r2; - - const int64_t i1 = i11; - const int64_t i2 = i12; - const int64_t i3 = i13; - - const char * src0_row = (const char*)src0->data + (0 + i02 * nb02 + i03 * nb03); - - // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides - // if it is, then we have either copied the data to params->wdata and made it contiguous or we are using - // the original src1 data pointer, so we should index using the indices directly - const char * src1_col = (const char*)wdata.get() + - (src1_cont || src1->type != vec_dot_type - ? (i11 + i12 * ne11 + i13 * ne12 * ne11) * row_size - : (i11 * nb11 + i12 * nb12 + i13 * nb13)); - float * dst_col = (float*)((char*)dst->data + (i1 * nb1 + i2 * nb2 + i3 * nb3)); - - for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ir0 += num_rows_per_vec_dot) { - vec_dot(ne00, &tmp[ir0 - iir0], - (num_rows_per_vec_dot > 1 ? 16 : 0), - src0_row + ir0 * nb01, - (num_rows_per_vec_dot > 1 ? nb01 : 0), - src1_col, - (num_rows_per_vec_dot > 1 ? 
src1_col_stride : 0), - num_rows_per_vec_dot); - } - - for (int cn = 0; cn < num_rows_per_vec_dot; ++cn) { - memcpy(&dst_col[iir0 + cn * nb1 / nb0], tmp + (cn * 16), (MIN(iir0 + blck_0, ir0_end) - iir0) * sizeof(float)); - } - } - } - } - - if (nth >= nchunk0 * nchunk1) { - break; - } - - // current_chunk = atomic_fetch_add_explicit(¶ms->threadpool->current_chunk, 1, memory_order_relaxed); - current_chunk++; - } + infer_request.set_input_tensor(0, tensor_src1); + infer_request.set_input_tensor(1, tensor_src0); + infer_request.set_output_tensor(0, tensor_dst); + infer_request.infer(); } void ggml_backend_openvino_reshape(ggml_tensor *dst) { @@ -628,19 +637,45 @@ void ggml_backend_openvino_dup_bytes(struct ggml_tensor *dst) { // Case 1: Both tensors are contiguous if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst)) { - // OpenVINO tensors for src and dst - // Source is 1D since it's contiguous - ov::Tensor src_tensor(ov::element::f32, {src0->ne[0]}, src0->data); - // // Destination is 1D since it's contiguous - ov::Tensor dst_tensor(ov::element::f32, {dst->ne[0]}, dst->data); + ov::Shape flat_shape = { static_cast(ggml_nelements(dst)) }; - // Perform the memory copy row by row - size_t row_size = dst->nb[0]; // Size of one row in destination - size_t src_stride = src0->nb[0]; // Stride for source tensor + // Construct the logical shape of the target tensor + ov::Shape dst_shape = { + static_cast(dst->ne[2]), + static_cast(dst->ne[1]), + static_cast(dst->ne[0]) + }; - for (size_t i = 0; i < dst->ne[0]; ++i) { - std::memcpy((char *)dst_tensor.data()+i*row_size, (char *)src_tensor.data()+i*src_stride, row_size); - } + // --- Construct the OpenVINO computation graph --- + // 1. Define input parameter, type f32, shape flat_shape: [8192] + auto input_param = std::make_shared(ov::element::f32, flat_shape); + + // 2. Create a Constant node to represent the new shape of the target Reshape(dst_shape) + // Note: dst_shape needs to be converted to an int64_t array + std::vector dst_shape_vec(dst_shape.begin(), dst_shape.end()); + auto reshape_const = ov::op::v0::Constant::create(ov::element::i64, { dst_shape_vec.size() }, dst_shape_vec); + + // 3. Use the Reshape operator to reshape the input tensor to the target shape(dst_shape) + auto reshape_op = std::make_shared(input_param, reshape_const, false); + + // 4. 
Construct the model, whose output is the result of reshape_op + auto model = std::make_shared(ov::OutputVector{ reshape_op }, ov::ParameterVector{ input_param }); + + // --- Compile and execute --- + ov::Core core; + auto compiled_model = core.compile_model(model, "CPU"); + auto infer_request = compiled_model.create_infer_request(); + + // Construct input Tensor: directly wrap src0->data, shape is flat_shape[8192] + ov::Tensor input_tensor(ov::element::f32, flat_shape, src0->data); + infer_request.set_input_tensor(0, input_tensor); + + // Construct output Tensor: dst->data, shape is dst_shape: [1,1,8192] + ov::Tensor output_tensor(ov::element::f32, dst_shape, dst->data); + infer_request.set_output_tensor(0, output_tensor); + + // Execute inference, the computation graph flattens the data of src0 and reshapes it to the shape of dst->ne, and writes it directly to dst->data + infer_request.infer(); return; } @@ -652,6 +687,70 @@ void ggml_backend_openvino_dup_bytes(struct ggml_tensor *dst) { const size_t nb0 = dst->nb[0]; if (src0->type == dst->type && ne00 == dst->ne[0] && nb00 == element_size && nb0 == element_size) { + // Assume that the data type is f32 and each element is 4 bytes + const size_t element_size = ggml_type_size(src0->type); // 4 bytes + + // Logically, the number of valid elements per row is 3072 (src0->ne[0]), and the number of rows is 7 (src0->ne[1]) + size_t valid_elems = static_cast(src0->ne[0]); // 3072 + size_t num_rows = static_cast(src0->ne[1]); // 7 + + // Number of floats physically stored per row = nb[1] / element_size = 36864/4 = 9216 + size_t phys_stride = static_cast(src0->nb[1]) / element_size; // 9216 + + // Total number of physical elements = (num_rows - 1)*phys_stride + valid_elems + size_t total_phys = (num_rows - 1) * phys_stride + valid_elems; // 6*9216 + 3072 = 58368 + // size_t total_phys = num_rows * phys_stride; + + // 1. Wrap src0->data into a 1D tensor with shape [58368] + ov::Shape flat_input_shape = { total_phys }; + auto flat_input_param = std::make_shared(ov::element::f32, flat_input_shape); + + // 2. Construct index tensor idx with shape [3072,7] + // For each logical position (i,j) (i in [0,3072), j in [0,7)), calculate index = j*phys_stride + i. + std::vector indices; + indices.reserve(valid_elems * num_rows); + for (size_t j = 0; j < num_rows; j++) { + for (size_t i = 0; i < valid_elems; i++) { + indices.push_back(static_cast(j * phys_stride + i)); + } + } + ov::Shape indices_shape = { valid_elems, num_rows }; // [3072,7] + auto indices_const = ov::op::v0::Constant::create(ov::element::i64, indices_shape, indices); + + // 3. Use the Gather operator (axis=0) to collect valid data + // Note: The third parameter is axis, and a value of 0 means collecting data from the 1D input according to the index + auto axis_const = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); + auto gathered = std::make_shared(flat_input_param, indices_const, axis_const); + // The shape of gathered should be [3072,7] + + // 4. Reshape gathered into a 4D tensor [3072,7,1,1] + auto reshape_const = ov::op::v0::Constant::create( + ov::element::i64, {4}, std::vector{ static_cast(valid_elems), static_cast(num_rows), 1, 1 } + ); + auto reshaped = std::make_shared(gathered, reshape_const, false); + // The reshaped shape is [3072,7,1,1] + + // 5. 
Construct the model and output it as reshaped + auto model = std::make_shared(ov::OutputVector{reshaped}, ov::ParameterVector{flat_input_param}); + + // --- Compile and execute --- + ov::Core core; + auto compiled_model = core.compile_model(model, "CPU"); + auto infer_request = compiled_model.create_infer_request(); + + // Construct input Tensor: directly wrap src0->data, shape is flat_input_shape = [58368] + ov::Tensor input_tensor(ov::element::f32, flat_input_shape, src0->data); + infer_request.set_input_tensor(0, input_tensor); + + // Construct output Tensor: dst is continuous storage, and its logical shape is [3072,7,1,1] + ov::Shape output_shape = { valid_elems, num_rows, 1, 1 }; + ov::Tensor output_tensor(ov::element::f32, output_shape, dst->data); + infer_request.set_output_tensor(0, output_tensor); + + // Execute inference. The computation graph uses Gather to collect the first 3072 valid elements of each row of src0, + // and reshape them to [3072,7,1,1] and write them directly to dst->data + infer_request.infer(); + /* for (size_t i01 = 0; i01 < ne01; ++i01) { const char *src_row = reinterpret_cast(src0->data) + i01 * nb01; char *dst_row = reinterpret_cast(dst->data) + i01 * dst->nb[1]; @@ -660,7 +759,7 @@ void ggml_backend_openvino_dup_bytes(struct ggml_tensor *dst) { ov::Tensor dst_row_tensor(ov::element::f32, {ne00}, reinterpret_cast(dst_row)); std::memcpy(dst_row_tensor.data(), src_row_tensor.data(), ne00 * sizeof(float)); - } + }*/ return; } @@ -673,6 +772,72 @@ void ggml_backend_openvino_dup_bytes(struct ggml_tensor *dst) { // dst->ne =[3072,7,1,1], dst->nb =[4,12288,86016,86016], dst->type=GGML_TYPE_F32 // dst->src[0]->ne=[96,32,7,1], dst->src[0]->nb=[4,2688,384,86016], dst->src[0]->type=GGML_TYPE_F32 if (ggml_is_contiguous(dst)) { + size_t valid_i = static_cast(src0->ne[0]); // 96 + size_t valid_j = static_cast(src0->ne[1]); // 32 + size_t valid_k = static_cast(src0->ne[2]); // 7 + + // Output the logical shape of dst: dst->ne = [3072, 7, 1, 1] + // 3072 = 32 * 96, 7 is consistent with src0->ne[2] + size_t total_valid = valid_i * valid_j * valid_k; // 96 * 32 * 7 = 21504 + + // Physics step length: + size_t stride_j = static_cast(src0->nb[1]) / ggml_type_size(src0->type); // 2688/4 = 672 + size_t stride_k = static_cast(src0->nb[2]) / ggml_type_size(src0->type); // 384/4 = 96 + + // Construct index array, output order: for k in [0,6], for j in [0,31], for i in [0,95]: + // desired input index = j * stride_j + k * stride_k + i + std::vector indices; + indices.reserve(total_valid); + for (size_t k = 0; k < valid_k; k++) { + for (size_t j = 0; j < valid_j; j++) { + for (size_t i = 0; i < valid_i; i++) { + int64_t idx = static_cast(j * stride_j + k * stride_k + i); + indices.push_back(idx); + } + } + } + // The size of indices should be 21504 + + // 1. Construct input: treat src0->data as a 1D tensor. The valid range is 0~21503. + ov::Shape flat_input_shape = { total_valid }; + auto input_param = std::make_shared(ov::element::f32, flat_input_shape); + + // 2. Construct index constant: 1D tensor, shape [21504] + ov::Shape indices_shape = { total_valid }; + auto indices_const = ov::op::v0::Constant::create(ov::element::i64, indices_shape, indices); + + // 3. Set axis=0 (collect data from 1D input) + auto axis_const = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); + + // 4. 
Use the Gather operator (OpenVINO v8 Gather is used here) to collect valid data + auto gathered = std::make_shared(input_param, indices_const, axis_const); + // gathered has a shape of [21504] + + // 5. Reshape gathered to [3072,7,1,1], because 3072*7 = 21504 + ov::Shape target_shape = { static_cast(dst->ne[0]), static_cast(dst->ne[1]), 1, 1 }; // [3072,7,1,1] + auto reshape_const = ov::op::v0::Constant::create(ov::element::i64, {4}, + std::vector{ static_cast(dst->ne[0]), static_cast(dst->ne[1]), 1, 1 }); + auto reshaped = std::make_shared(gathered, reshape_const, false); + + // 6. Construct model + auto model = std::make_shared(ov::OutputVector{reshaped}, ov::ParameterVector{input_param}); + + // --- Compile and execute --- + ov::Core core; + auto compiled_model = core.compile_model(model, "CPU"); + auto infer_request = compiled_model.create_infer_request(); + + // Construct input Tensor: directly wrap src0->data. Note: src0->data is regarded as a one-dimensional array according to the physical valid area, flat_input_shape: [21504] + ov::Tensor input_tensor(ov::element::f32, flat_input_shape, src0->data); + infer_request.set_input_tensor(0, input_tensor); + + // Construct output Tensor: dst->data is stored continuously, with shape target_shape: [3072,7,1,1] + ov::Tensor output_tensor(ov::element::f32, target_shape, dst->data); + infer_request.set_output_tensor(0, output_tensor); + + // Execute reasoning: The computation graph uses Gather+Reshape to collect each valid element of src0 in a predetermined order and write it directly to dst->data + infer_request.infer(); + /* const size_t rs = ne00 * element_size; // Row size in bytes for dst // Create OpenVINO tensors for source and destination @@ -699,7 +864,7 @@ void ggml_backend_openvino_dup_bytes(struct ggml_tensor *dst) { // Copy row std::memcpy(dst_ptr, src0_ptr, rs); - } + }*/ return; } std::cout << "Duplication of bytes completed successfully." << std::endl; @@ -746,7 +911,7 @@ void ggml_backend_openvino_cpy(struct ggml_tensor *dst) { ov::ResultVector{std::make_shared(dst_output)}, ov::ParameterVector{src_input}, "ContiguousCopy"); - // Compile and execute the model + // Compile and execute the model auto compiled_model = core.compile_model(model, "CPU"); ov::Tensor src_tensor(ov::element::f32, src_shape, src0->data); @@ -757,6 +922,93 @@ void ggml_backend_openvino_cpy(struct ggml_tensor *dst) { infer_request.set_output_tensor(0, dst_tensor); infer_request.infer(); } else { + // In this example, the logical shape is [7,3072,1,1]. + // Here we assume that the number of "rows" is 3072 and the number of "columns" is 7. + const size_t num_cols = static_cast(dst->ne[0]); // 7 + const size_t num_rows = static_cast(dst->ne[1]); // 3072 + const size_t total_elems = num_cols * num_rows; // 7 * 3072 = 21504 + + // For src0: + // src0->nb[0] = 12288, so the stride along logical dimension 0 = 12288/4 = 3072 (f32) + // const size_t src_stride0 = 12288 / ggml_type_size(src0->type); // 3072 + const size_t src_stride0 = src0->nb[0] / ggml_type_size(src0->type); // 3072 + + // Construct index array (length 21504), in flat output order (row-first, row length = 7): + // For output flat index n, set: + // r = n / 7, c = n % 7. + // Valid data index corresponding to src0 = c * src_stride0 + r. 
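+        // (Worked example, assuming the num_cols = 7 and src_stride0 = 3072 values above:
+        //  n = 0 maps to (r=0, c=0), source index 0; n = 1 maps to (r=0, c=1), source index 3072;
+        //  n = 7 maps to (r=1, c=0), source index 1. Consecutive outputs within one row thus
+        //  stride through src0 by 3072 elements, materializing the transposed view contiguously.)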
+ std::vector indices; + indices.reserve(total_elems); + for (size_t n = 0; n < total_elems; n++) { + size_t r = n / num_cols; // r in [0,3072) + size_t c = n % num_cols; // c in [0,7) + int64_t idx = static_cast(c * src_stride0 + r); + indices.push_back(idx); + } + + // --- Construct OpenVINO calculation graph --- + // 1. Encapsulate src0->data into 1D input Tensor with shape [21504] + ov::Shape flat_shape = { total_elems }; + auto input_param = std::make_shared(ov::element::f32, flat_shape); + + // 2. Constructs an index constant with a shape of [21504] + auto indices_const = ov::op::v0::Constant::create(ov::element::i64, flat_shape, indices); + + // 3. Construct axis constant, axis = 0 + auto axis_const = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); + + // 4. Use the Gather operator to collect valid data. The result shape is [21504], type f32 + auto gathered = std::make_shared(input_param, indices_const, axis_const); + + // 5. Convert data types: f32 to f16 + auto converted = std::make_shared(gathered, ov::element::f16); + + // 6. Reshape into a 2D tensor with shape [num_rows, num_cols] = [3072,7]. + // Note: row-first arrangement is used here, that is, the 0th dimension represents rows (3072 rows) and the 1st dimension represents columns (7 consecutive elements) + std::vector new_shape = { static_cast(num_rows), static_cast(num_cols) }; + auto reshape_const = ov::op::v0::Constant::create(ov::element::i64, {2}, new_shape); + auto reshaped = std::make_shared(converted, reshape_const, false); + + // 7. To keep consistent with the logical shape of dst [7,3072,1,1] (note: the order of ne arrays in ggml may be different from the intuitive), + // Here we finally need to get a flat continuous result with row-first arrangement of [3072,7] (i.e., 7 consecutive elements per row). + // If you need to expand to 4D, you can further reshape, but here we only focus on two-dimensional valid data. + // Let output_shape = [num_rows, num_cols] = [3072,7] + + // 8. Construct model: input is input_param, output is reshaped + auto model = std::make_shared(ov::OutputVector{ reshaped }, ov::ParameterVector{ input_param }); + + ov::Core core; + auto compiled_model = core.compile_model(model, "CPU"); + auto infer_request = compiled_model.create_infer_request(); + + // 9. Construct input Tensor: directly wrap src0->data, shape is flat_shape, type f32 + ov::Tensor input_tensor(ov::element::f32, flat_shape, src0->data); + infer_request.set_input_tensor(0, input_tensor); + + // 10. Since dst is non-contiguous (row spacing is dst->nb[1] = 64 bytes), + // We let the model output to a temporary continuous buffer and then copy it row by row to dst->data. + ov::Shape contig_output_shape = { num_rows, num_cols }; // [3072,7] + // Allocate a temporary buffer (to store f16 data, number of elements = 3072*7) + std::vector temp_output(total_elems); + ov::Tensor output_tensor_contig(ov::element::f16, contig_output_shape, temp_output.data()); + infer_request.set_output_tensor(0, output_tensor_contig); + + // 11. Execute inference, the computation graph will collect, convert, and reshape to obtain a continuous f16 result + infer_request.infer(); + + // 12. Copy temporary output to dst->data by line, considering the non-continuous storage of dst (each line is separated by dst->nb[1] bytes) + // Each line of valid data is num_cols * sizeof(f16) = 7 * 2 = 14 bytes. 
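+        // (A sketch of the loop below, assuming the example strides quoted above: each
+        //  iteration copies the 14 payload bytes of one row into dst and then advances by
+        //  dst->nb[1] = 64 bytes, so the padding between dst rows is left untouched.)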
+ uint8_t *dst_ptr = static_cast(dst->data); + size_t dst_row_stride = static_cast(dst->nb[1]); // 64 bytes per row + size_t row_bytes = num_cols * ggml_type_size(dst->type); // 7 * 2 = 14 bytes + for (size_t r = 0; r < num_rows; r++) { + // Temporary output is a continuous two-dimensional array, offset = r * num_cols + uint8_t *src_row_ptr = reinterpret_cast(temp_output.data()) + r * row_bytes; + // Copy row_bytes to the starting address of the dst row + std::memcpy(dst_ptr + r * dst_row_stride, src_row_ptr, row_bytes); + } + + /** // Non-contiguous case: element-wise copy for (int64_t i03 = 0; i03 < dst->ne[3]; ++i03) { for (int64_t i02 = 0; i02 < dst->ne[2]; ++i02) { @@ -774,7 +1026,7 @@ void ggml_backend_openvino_cpy(struct ggml_tensor *dst) { } } } - } + }*/ } } @@ -828,6 +1080,7 @@ static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backe // Process a range of nodes with openvino_frontend_compute int start_index = i; while (i < cgraph->n_nodes && + // std::find(cpy_indices.begin(), cpy_indices.end(), i) == cpy_indices.end() && std::find(cont_indices.begin(), cont_indices.end(), i) == cont_indices.end() && std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) == mul_mat_indices.end()) { i++; diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 355a95d978..945b5cbf7a 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -4,6 +4,7 @@ #include void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::map& inputs, std::map& outputs) { + m_node_op_name[node->name] = ggml_op_name(node->op); switch (node->op) { // Unary OPs case GGML_OP_UNARY: diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index 2afde161ee..f4b91f9251 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -65,6 +65,15 @@ public: virtual bool check_if_continuous() const override { return m_continuous; } + + virtual const std::string& get_node_op_name(const std::string& name) const { + auto it = m_node_op_name.find(name); + if (it != m_node_op_name.end()) { + return it->second; + } + return ""; + } + private: void set_input_output(ggml_tensor* node, std::map& inputs, std::map& outputs); @@ -79,5 +88,6 @@ private: const std::string m_op_name; mutable std::string m_name; bool m_continuous; + std::map m_node_op_name; }; diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 84c9001c5c..88d603b4ae 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -109,6 +109,7 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c auto output_names = ggml_decoder->get_output_names(); auto output_tensors = get_ggml_graph_output_dst(ggml_decoder); for (size_t i = 0; i < output_names.size(); i++) { + // std::string op_name = ggml_decoder->get_node_op_name(output_names[i]); auto output_tensor = infer_request.get_output_tensor(i); std::memcpy(output_tensors[output_names[i]], output_tensor.data(), output_tensor.get_byte_size()); #ifdef GGML_OPENVINO_DEBUG From 57582fda39208b9d3e5324102f5ddaf513f722ef Mon Sep 17 00:00:00 2001 From: zhanmyz Date: Wed, 19 Feb 2025 17:51:07 +0800 Subject: [PATCH 027/254] add implementation of CPY when the output tensor is non-contiguous --- ggml/src/ggml-openvino.cpp | 147 ++++++++++++------------------------- 1 file changed, 48 insertions(+), 99 deletions(-) diff --git a/ggml/src/ggml-openvino.cpp 
b/ggml/src/ggml-openvino.cpp index 99a32b1dfd..dc45f0fe6d 100644 --- a/ggml/src/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino.cpp @@ -529,7 +529,7 @@ void ggml_backend_openvino_mul_mat(struct ggml_tensor * dst) { std::vector full_dst_shape = { dst->ne[2], dst->ne[1], dst->ne[0]}; auto final_shape_const = ov::op::v0::Constant::create(ov::element::i64, { full_dst_shape.size() }, full_dst_shape); - auto model = std::make_shared(ov::NodeVector{ batched_matmul }, ov::ParameterVector{param_src1, param_src0}); + auto model = std::make_shared(ov::NodeVector{ batched_matmul }, ov::ParameterVector{param_src0, param_src1}); ov::Core core; auto compiled_model = core.compile_model(model, "CPU"); @@ -538,8 +538,8 @@ void ggml_backend_openvino_mul_mat(struct ggml_tensor * dst) { // Construct input Tensors: treat src0->data and src1->data as 1D flat data respectively ov::Tensor tensor_src0(ov::element::f16, flat_shape_src0, src0->data); ov::Tensor tensor_src1(ov::element::f32, flat_shape_src1, src1->data); - infer_request.set_input_tensor(0, tensor_src1); - infer_request.set_input_tensor(1, tensor_src0); + infer_request.set_input_tensor(0, tensor_src0); + infer_request.set_input_tensor(1, tensor_src1); ov::Tensor tensor_dst(ov::element::f32, ov::Shape(full_dst_shape.begin(), full_dst_shape.end()), dst->data); infer_request.set_output_tensor(0, tensor_dst); @@ -548,9 +548,6 @@ void ggml_backend_openvino_mul_mat(struct ggml_tensor * dst) { return ; } - const int64_t ne0 = dst->ne[0]; - const int64_t ne1 = dst->ne[1]; - // Valid shape std::vector eff_shape_src0 = get_effective_shape(src0); std::vector eff_shape_src1 = get_effective_shape(src1); @@ -604,13 +601,13 @@ void ggml_backend_openvino_mul_mat(struct ggml_tensor * dst) { ov::Tensor tensor_dst(ov::element::f32, ov::Shape(eff_shape_dst.begin(), eff_shape_dst.end()), dst->data); std::shared_ptr matmul = std::make_shared(reshape_src1, A_for_mul, false, false); - auto model = std::make_shared(ov::NodeVector{matmul}, ov::ParameterVector{param_flat_src1, param_flat_src0}); + auto model = std::make_shared(ov::NodeVector{matmul}, ov::ParameterVector{param_flat_src0, param_flat_src1}); auto compiled_model = core.compile_model(model, "CPU"); auto infer_request = compiled_model.create_infer_request(); - infer_request.set_input_tensor(0, tensor_src1); - infer_request.set_input_tensor(1, tensor_src0); + infer_request.set_input_tensor(0, tensor_src0); + infer_request.set_input_tensor(1, tensor_src1); infer_request.set_output_tensor(0, tensor_dst); infer_request.infer(); } @@ -922,111 +919,63 @@ void ggml_backend_openvino_cpy(struct ggml_tensor *dst) { infer_request.set_output_tensor(0, dst_tensor); infer_request.infer(); } else { - // In this example, the logical shape is [7,3072,1,1]. - // Here we assume that the number of "rows" is 3072 and the number of "columns" is 7. - const size_t num_cols = static_cast(dst->ne[0]); // 7 - const size_t num_rows = static_cast(dst->ne[1]); // 3072 - const size_t total_elems = num_cols * num_rows; // 7 * 3072 = 21504 - - // For src0: - // src0->nb[0] = 12288, so the stride along logical dimension 0 = 12288/4 = 3072 (f32) - // const size_t src_stride0 = 12288 / ggml_type_size(src0->type); // 3072 - const size_t src_stride0 = src0->nb[0] / ggml_type_size(src0->type); // 3072 - - // Construct index array (length 21504), in flat output order (row-first, row length = 7): - // For output flat index n, set: - // r = n / 7, c = n % 7. - // Valid data index corresponding to src0 = c * src_stride0 + r. 
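-        // (Worked example, assuming the num_cols = 7 and src_stride0 = 3072 values above:
-        //  n = 0 maps to (r=0, c=0), source index 0; n = 1 maps to (r=0, c=1), source index 3072;
-        //  n = 7 maps to (r=1, c=0), source index 1. Consecutive outputs within one row thus
-        //  stride through src0 by 3072 elements, materializing the transposed view contiguously.)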
-        std::vector indices;
-        indices.reserve(total_elems);
-        for (size_t n = 0; n < total_elems; n++) {
-            size_t r = n / num_cols;   // r in [0,3072)
-            size_t c = n % num_cols;   // c in [0,7)
-            int64_t idx = static_cast(c * src_stride0 + r);
-            indices.push_back(idx);
+        std::vector gather_idx;
+        for (int row = 0; row < dst->src[0]->ne[1]; row++) {
+            for (int col = 0; col < dst->src[0]->ne[0]; col++) {
+                gather_idx.push_back((row*dst->src[0]->nb[1]+col*dst->src[0]->nb[0])/4);
+            }
         }
+        size_t N = gather_idx.size();
+        ov::Shape gather_idx_shape = {N, 1};
+        std::vector scatter_idx;
+        for (int row = 0; row < dst->ne[1]; row++) {
+            for (int col = 0; col < dst->ne[0]; col++) {
+                scatter_idx.push_back(row * dst->nb[1] / 2 + col);
+            }
+        }
+        ov::Shape scatter_idx_shape = {N, 1};
 
-        // --- Construct OpenVINO calculation graph ---
-        // 1. Encapsulate src0->data into 1D input Tensor with shape [21504]
-        ov::Shape flat_shape = { total_elems };
-        auto input_param = std::make_shared(ov::element::f32, flat_shape);
+        // param_src0 shape => 1D, rank=1, size is large enough. For example, row*col = 21504 + some padding, e.g. 80000
+        // ov::Shape flat_src0_shape = {80000};
+        ov::Shape flat_src0_shape = {dst->src[0]->nb[2]};
+        auto param_src0 = std::make_shared(ov::element::f32, flat_src0_shape);
 
-        // 2. Constructs an index constant with a shape of [21504]
-        auto indices_const = ov::op::v0::Constant::create(ov::element::i64, flat_shape, indices);
+        auto gather_indices_const = ov::op::v0::Constant::create(ov::element::i64, gather_idx_shape, gather_idx);
+        auto gather_axis_const = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
+        auto gathered = std::make_shared(
+            param_src0, gather_indices_const, gather_axis_const);
 
-        // 3. Construct axis constant, axis = 0
-        auto axis_const = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
-
-        // 4. Use the Gather operator to collect valid data. The result shape is [21504], type f32
-        auto gathered = std::make_shared(input_param, indices_const, axis_const);
-
-        // 5. Convert data types: f32 to f16
         auto converted = std::make_shared(gathered, ov::element::f16);
 
-        // 6. Reshape into a 2D tensor with shape [num_rows, num_cols] = [3072,7].
-        // Note: row-first arrangement is used here, that is, the 0th dimension represents rows (3072 rows) and the 1st dimension represents columns (7 consecutive elements)
-        std::vector new_shape = { static_cast(num_rows), static_cast(num_cols) };
-        auto reshape_const = ov::op::v0::Constant::create(ov::element::i64, {2}, new_shape);
-        auto reshaped = std::make_shared(converted, reshape_const, false);
+        // param_dst_base shape => 1D, rank=1, large enough, e.g. row=3072 => i up to 3071 => offset i*64=196544 + j*2, e.g. 200000
+        // ov::Shape flat_dst_shape = {200000, 1};
+        ov::Shape flat_dst_shape = {dst->nb[2], 1};
+        auto param_dst_base = std::make_shared(ov::element::f16, flat_dst_shape);
 
-        // 7. To keep consistent with the logical shape of dst [7,3072,1,1] (note: the order of ne arrays in ggml may be different from the intuitive),
-        // Here we finally need to get a flat continuous result with row-first arrangement of [3072,7] (i.e., 7 consecutive elements per row).
-        // If you need to expand to 4D, you can further reshape, but here we only focus on two-dimensional valid data.
-        // Let output_shape = [num_rows, num_cols] = [3072,7]
+        auto scatter_indices_const = ov::op::v0::Constant::create(ov::element::i64, scatter_idx_shape, scatter_idx);
 
-        // 8. 
Construct model: input is input_param, output is reshaped - auto model = std::make_shared(ov::OutputVector{ reshaped }, ov::ParameterVector{ input_param }); + // ScatterNDUpdate( base, scatter_indices, updates ) + // scatter_indices last dimension = 1 => each index is 1D coordinate + auto scatter = std::make_shared( + param_dst_base, scatter_indices_const, converted + ); + + ov::ParameterVector params = { param_src0, param_dst_base }; + auto model = std::make_shared(ov::OutputVector{ scatter }, params); - ov::Core core; auto compiled_model = core.compile_model(model, "CPU"); auto infer_request = compiled_model.create_infer_request(); - // 9. Construct input Tensor: directly wrap src0->data, shape is flat_shape, type f32 - ov::Tensor input_tensor(ov::element::f32, flat_shape, src0->data); - infer_request.set_input_tensor(0, input_tensor); + ov::Tensor tensor_src0(ov::element::f32, flat_src0_shape, src0->data); + ov::Tensor tensor_dst_base(ov::element::f16, flat_dst_shape, dst->data); - // 10. Since dst is non-contiguous (row spacing is dst->nb[1] = 64 bytes), - // We let the model output to a temporary continuous buffer and then copy it row by row to dst->data. - ov::Shape contig_output_shape = { num_rows, num_cols }; // [3072,7] - // Allocate a temporary buffer (to store f16 data, number of elements = 3072*7) - std::vector temp_output(total_elems); - ov::Tensor output_tensor_contig(ov::element::f16, contig_output_shape, temp_output.data()); - infer_request.set_output_tensor(0, output_tensor_contig); + infer_request.set_input_tensor(0, tensor_src0); + infer_request.set_input_tensor(1, tensor_dst_base); + + ov::Tensor out_tensor(ov::element::f16, flat_dst_shape, dst->data); + infer_request.set_output_tensor(0, out_tensor); - // 11. Execute inference, the computation graph will collect, convert, and reshape to obtain a continuous f16 result infer_request.infer(); - - // 12. Copy temporary output to dst->data by line, considering the non-continuous storage of dst (each line is separated by dst->nb[1] bytes) - // Each line of valid data is num_cols * sizeof(f16) = 7 * 2 = 14 bytes. 
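-        // (A sketch of the loop below, assuming the example strides quoted above: each
-        //  iteration copies the 14 payload bytes of one row into dst and then advances by
-        //  dst->nb[1] = 64 bytes, so the padding between dst rows is left untouched.)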
- uint8_t *dst_ptr = static_cast(dst->data); - size_t dst_row_stride = static_cast(dst->nb[1]); // 64 bytes per row - size_t row_bytes = num_cols * ggml_type_size(dst->type); // 7 * 2 = 14 bytes - for (size_t r = 0; r < num_rows; r++) { - // Temporary output is a continuous two-dimensional array, offset = r * num_cols - uint8_t *src_row_ptr = reinterpret_cast(temp_output.data()) + r * row_bytes; - // Copy row_bytes to the starting address of the dst row - std::memcpy(dst_ptr + r * dst_row_stride, src_row_ptr, row_bytes); - } - - /** - // Non-contiguous case: element-wise copy - for (int64_t i03 = 0; i03 < dst->ne[3]; ++i03) { - for (int64_t i02 = 0; i02 < dst->ne[2]; ++i02) { - for (int64_t i01 = 0; i01 < dst->ne[1]; ++i01) { - for (int64_t i00 = 0; i00 < dst->ne[0]; ++i00) { - const char *src_ptr = static_cast(src0->data) + - i00 * src0->nb[0] + i01 * src0->nb[1] + - i02 * src0->nb[2] + i03 * src0->nb[3]; - - char *dst_ptr = static_cast(dst->data) + - i00 * dst->nb[0] + i01 * dst->nb[1] + - i02 * dst->nb[2] + i03 * dst->nb[3]; - - *(ggml_fp16_t *)dst_ptr = GGML_FP32_TO_FP16(*(const float *)src_ptr); - } - } - } - }*/ } } From afb8594194b7bb77cdc6dc5d5ee632ed54047780 Mon Sep 17 00:00:00 2001 From: zhanmyz Date: Tue, 25 Feb 2025 12:43:12 +0800 Subject: [PATCH 028/254] add tmp source code files --- examples/simple/simple.cpp | 2 +- ggml/src/ggml-openvino.cpp | 63 ++--- ggml/src/ggml-openvino/decoder.h | 15 ++ ggml/src/ggml-openvino/ggml-decoder.cpp | 290 ++++++++++++++++++------ ggml/src/ggml-openvino/ggml-decoder.h | 17 +- ggml/src/ggml-openvino/utils.cpp | 50 +++- setup.sh | 2 + 7 files changed, 321 insertions(+), 118 deletions(-) create mode 100755 setup.sh diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp index d09771d104..9e6c678e83 100644 --- a/examples/simple/simple.cpp +++ b/examples/simple/simple.cpp @@ -140,7 +140,7 @@ int main(int argc, char ** argv) { std::string s(buf, n); printf("%s", s.c_str()); } - + printf("\n"); // prepare a batch for the prompt llama_batch batch = llama_batch_get_one(prompt_tokens.data(), prompt_tokens.size()); diff --git a/ggml/src/ggml-openvino.cpp b/ggml/src/ggml-openvino.cpp index dc45f0fe6d..2e20e8e39b 100644 --- a/ggml/src/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino.cpp @@ -685,8 +685,6 @@ void ggml_backend_openvino_dup_bytes(struct ggml_tensor *dst) { if (src0->type == dst->type && ne00 == dst->ne[0] && nb00 == element_size && nb0 == element_size) { // Assume that the data type is f32 and each element is 4 bytes - const size_t element_size = ggml_type_size(src0->type); // 4 bytes - // Logically, the number of valid elements per row is 3072 (src0->ne[0]), and the number of rows is 7 (src0->ne[1]) size_t valid_elems = static_cast(src0->ne[0]); // 3072 size_t num_rows = static_cast(src0->ne[1]); // 7 @@ -740,7 +738,10 @@ void ggml_backend_openvino_dup_bytes(struct ggml_tensor *dst) { infer_request.set_input_tensor(0, input_tensor); // Construct output Tensor: dst is continuous storage, and its logical shape is [3072,7,1,1] - ov::Shape output_shape = { valid_elems, num_rows, 1, 1 }; + ov::Shape output_shape = { static_cast(dst->ne[0]), + static_cast(dst->ne[1]), + static_cast(dst->ne[2]), + static_cast(dst->ne[3])}; ov::Tensor output_tensor(ov::element::f32, output_shape, dst->data); infer_request.set_output_tensor(0, output_tensor); @@ -811,7 +812,10 @@ void ggml_backend_openvino_dup_bytes(struct ggml_tensor *dst) { // gathered has a shape of [21504] // 5. 
Reshape gathered to [3072,7,1,1], because 3072*7 = 21504 - ov::Shape target_shape = { static_cast(dst->ne[0]), static_cast(dst->ne[1]), 1, 1 }; // [3072,7,1,1] + ov::Shape target_shape = { static_cast(dst->ne[0]), + static_cast(dst->ne[1]), + static_cast(dst->ne[2]), + static_cast(dst->ne[3])}; // [3072,7,1,1] auto reshape_const = ov::op::v0::Constant::create(ov::element::i64, {4}, std::vector{ static_cast(dst->ne[0]), static_cast(dst->ne[1]), 1, 1 }); auto reshaped = std::make_shared(gathered, reshape_const, false); @@ -834,34 +838,6 @@ void ggml_backend_openvino_dup_bytes(struct ggml_tensor *dst) { // Execute reasoning: The computation graph uses Gather+Reshape to collect each valid element of src0 in a predetermined order and write it directly to dst->data infer_request.infer(); - /* - const size_t rs = ne00 * element_size; // Row size in bytes for dst - - // Create OpenVINO tensors for source and destination - // The tensors are reshaped to a 2D structure (num_rows x ne00) for easier iteration and compatibility with the simplified loop. - ov::Tensor src_tensor(ov::element::f32, ov::Shape{ne03 * ne02 * ne01, ne00}, src0->data); - ov::Tensor dst_tensor(ov::element::f32, ov::Shape{ne03 * ne02 * ne01, ne00}, dst->data); - - // Perform the copy in a single loop - const size_t num_rows = ne03 * ne02 * ne01; - for (size_t row = 0; row < num_rows; ++row) { - // Calculate the source row pointer based on original strides - // The source row pointer is calculated based on the combined index row and the strides nb03, nb02, and nb01. - const char* src0_ptr = (char*)src_tensor.data() + - // Calculates which block of the i03 dimension the current row belongs to - (row / (ne02 * ne01)) * nb03 + // 0 - // Calculates which block of the i02 dimension the current row belongs to within the current i03 block. - ((row / ne01) % ne02) * nb02 + // 0, 0,......, 0,384, 384,......, 384,768,......, 2304 - // Calculates the position within the current i02 block in terms of the i01 index. - (row % ne01) * nb01; // 0,2688,......,83328, 0, 2688,......,83328, 0,......, 83328 - - // Destination row pointer is linear - // Since dst is contiguous, its rows are accessed linearly using a single stride rs, simplifying the destination pointer calculation. - char* dst_ptr = (char*)dst_tensor.data() + row * rs; - - // Copy row - std::memcpy(dst_ptr, src0_ptr, rs); - }*/ return; } std::cout << "Duplication of bytes completed successfully." 
<< std::endl; @@ -939,6 +915,7 @@ void ggml_backend_openvino_cpy(struct ggml_tensor *dst) { // ov::Shape flat_src0_shape = {80000}; ov::Shape flat_src0_shape = {dst->src[0]->nb[2]}; auto param_src0 = std::make_shared(ov::element::f32, flat_src0_shape); + // auto param_src00 = std::make_shared(ov::element::f32, flat_src0_shape); auto gather_indices_const = ov::op::v0::Constant::create(ov::element::i64, gather_idx_shape, gather_idx); auto gather_axis_const = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); @@ -951,6 +928,7 @@ void ggml_backend_openvino_cpy(struct ggml_tensor *dst) { // ov::Shape flat_dst_shape = {200000, 1}; ov::Shape flat_dst_shape = {dst->nb[2], 1}; auto param_dst_base = std::make_shared(ov::element::f16, flat_dst_shape); + // auto param_dst_base11 = std::make_shared(ov::element::f16, flat_dst_shape); auto scatter_indices_const = ov::op::v0::Constant::create(ov::element::i64, scatter_idx_shape, scatter_idx); @@ -961,6 +939,8 @@ void ggml_backend_openvino_cpy(struct ggml_tensor *dst) { ); ov::ParameterVector params = { param_src0, param_dst_base }; + // ov::ParameterVector params = { param_src0}; + // ov::ParameterVector params = { param_src00, param_dst_base11}; auto model = std::make_shared(ov::OutputVector{ scatter }, params); auto compiled_model = core.compile_model(model, "CPU"); @@ -1009,16 +989,17 @@ static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backe } } + // openvino_frontend_compute(backend, cgraph); // Process nodes in order for (int i = 0; i < cgraph->n_nodes; i++) { - if (std::find(cont_indices.begin(), cont_indices.end(), i) != cont_indices.end()) { - ggml_backend_openvino_dup_bytes(cgraph->nodes[i]); - } else if (std::find(reshape_indices.begin(), reshape_indices.end(), i) != reshape_indices.end()) { + if (std::find(reshape_indices.begin(), reshape_indices.end(), i) != reshape_indices.end()) { ggml_backend_openvino_reshape(cgraph->nodes[i]); + // } else if (std::find(cont_indices.begin(), cont_indices.end(), i) != cont_indices.end()) { + // ggml_backend_openvino_dup_bytes(cgraph->nodes[i]); } else if (std::find(view_indices.begin(), view_indices.end(), i) != view_indices.end()) { - ggml_backend_openvino_view(cgraph->nodes[i]); - // } else if (std::find(cpy_indices.begin(), cpy_indices.end(), i) != cpy_indices.end()) { - // ggml_backend_openvino_cpy(cgraph->nodes[i]); + ggml_backend_openvino_view(cgraph->nodes[i]); + } else if (std::find(cpy_indices.begin(), cpy_indices.end(), i) != cpy_indices.end()) { + ggml_backend_openvino_cpy(cgraph->nodes[i]); } else if (std::find(transpose_indices.begin(), transpose_indices.end(), i) != transpose_indices.end()) { ggml_backend_openvino_transpose(cgraph->nodes[i]); } else if (std::find(permute_indices.begin(), permute_indices.end(), i) != permute_indices.end()) { @@ -1029,8 +1010,8 @@ static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backe // Process a range of nodes with openvino_frontend_compute int start_index = i; while (i < cgraph->n_nodes && - // std::find(cpy_indices.begin(), cpy_indices.end(), i) == cpy_indices.end() && - std::find(cont_indices.begin(), cont_indices.end(), i) == cont_indices.end() && + std::find(cpy_indices.begin(), cpy_indices.end(), i) == cpy_indices.end() && + //std::find(cont_indices.begin(), cont_indices.end(), i) == cont_indices.end() && std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) == mul_mat_indices.end()) { i++; } @@ -1270,7 +1251,7 @@ static const std::set& openvino_ops = []() -> const std::set shape; + 
std::vector stride; +}; // TODO: Directly include from openvino class GgmlDecoder : public DecoderBase { public: @@ -14,6 +21,8 @@ public: virtual PartialShape get_input_shape(const std::string& name) const = 0; + virtual std::vector get_input_stride(const std::string& name) const = 0; + virtual element::Type get_input_type(const std::string& name) const = 0; virtual size_t get_input_size() const = 0; @@ -27,6 +36,10 @@ public: virtual std::vector get_input_names() const = 0; + virtual const std::string& get_node_op_name(const std::string& name) const = 0; + + // virtual const struct tensor_info get_node_op_info(const std::string& name) const = 0; + virtual PartialShape get_output_shape(const std::string& name) const = 0; virtual element::Type get_output_type(const std::string& name) const = 0; @@ -53,6 +66,8 @@ public: virtual bool check_if_continuous() const = 0; + virtual const std::vector>& get_params() const = 0; + }; } // namespace ggml diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 945b5cbf7a..a412f8b75a 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -2,9 +2,13 @@ #include #include #include +#include +#include void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::map& inputs, std::map& outputs) { m_node_op_name[node->name] = ggml_op_name(node->op); + std::string src0_name = std::string(node->src[0]->name) + "_" + std::to_string(node->src[0]->view_offs) + "_input_" + ggml_op_name(node->src[0]->op); + std::string node_name = std::string(node->name) + "_" + std::to_string(node->view_offs) + "_output_" + ggml_op_name(node->op); switch (node->op) { // Unary OPs case GGML_OP_UNARY: @@ -16,6 +20,7 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapsrc[0]->name] = node->src[0]; outputs[node->name] = node; m_input_names.push_back(node->src[0]->name); + m_node_op_name[node->src[0]->name] = ggml_op_name(node->op); m_output_names.push_back(node->name); break; } @@ -25,76 +30,73 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapsrc[0]->name] = node->src[0]; outputs[node->name] = node; m_input_names.push_back(node->src[0]->name); + m_node_op_name[node->src[0]->name] = ggml_op_name(node->op); m_output_names.push_back(node->name); m_continuous = true; + + ov::Shape flat_shape = { static_cast(ggml_nelements(node)) }; + auto input_param = std::make_shared(ov::element::f32, flat_shape); + m_params.push_back(input_param); + break; } if (node->src[0]->type == node->type && node->src[0]->ne[0] == node->ne[0] && - node->src[0]->nb[0] == ggml_type_size(node->src[0]->type) && node->nb[0] == ggml_type_size(node->src[0]->type)) { + node->src[0]->nb[0] == ggml_type_size(node->src[0]->type) && + node->nb[0] == ggml_type_size(node->src[0]->type)) { - for (size_t i01 = 0; i01 < node->src[0]->ne[1]; ++i01) { - const char *src_row = reinterpret_cast(node->src[0]->data) + i01 * node->src[0]->nb[1]; - char *dst_row = reinterpret_cast(node->data) + i01 * node->nb[1]; - std::memcpy(dst_row, src_row, node->src[0]->ne[0] * ggml_type_size(node->src[0]->type)); - } + // for (size_t i01 = 0; i01 < node->src[0]->ne[1]; ++i01) { + // const char *src_row = reinterpret_cast(node->src[0]->data) + i01 * node->src[0]->nb[1]; + // char *dst_row = reinterpret_cast(node->data) + i01 * node->nb[1]; + // std::memcpy(dst_row, src_row, node->src[0]->ne[0] * ggml_type_size(node->src[0]->type)); + // } - inputs[node->name] = node; + inputs[node->src[0]->name] = node->src[0]; 
outputs[node->name] = node; - m_input_names.push_back(node->name); + m_input_names.push_back(node->src[0]->name); + m_node_op_name[node->src[0]->name] = ggml_op_name(node->op); m_output_names.push_back(node->name); + + const size_t element_size = ggml_type_size(node->src[0]->type); + size_t valid_elems = static_cast(node->src[0]->ne[0]); // 3072 + size_t num_rows = static_cast(node->src[0]->ne[1]); // 7 + size_t phys_stride = static_cast(node->src[0]->nb[1]) / element_size; // 9216 + size_t total_phys = (num_rows - 1) * phys_stride + valid_elems; // 6*9216 + 3072 = 58368 + ov::Shape flat_input_shape = { total_phys }; + auto flat_input_param = std::make_shared(ov::element::f32, flat_input_shape); + m_params.push_back(flat_input_param); + m_continuous = false; break; } - // if (ggml_is_contiguous(node)) { - const size_t rs = node->src[0]->ne[0] * ggml_type_size(node->src[0]->type); // Row size in bytes for dst - - // Create OpenVINO tensors for source and destination - // The tensors are reshaped to a 2D structure (num_rows x ne00) for easier iteration and compatibility with the simplified loop. - ov::Tensor src_tensor(ov::element::f32, - ov::Shape{node->src[0]->ne[3] * node->src[0]->ne[2] * node->src[0]->ne[1], node->src[0]->ne[0]}, - node->src[0]->data); - ov::Tensor dst_tensor(ov::element::f32, - ov::Shape{node->src[0]->ne[3] * node->src[0]->ne[2] * node->src[0]->ne[1], node->src[0]->ne[0]}, - node->data); - - // Perform the copy in a single loop - const size_t num_rows = node->src[0]->ne[3] * node->src[0]->ne[2] * node->src[0]->ne[1]; - for (size_t row = 0; row < num_rows; ++row) { - // Calculate the source row pointer based on original strides - // The source row pointer is calculated based on the combined index row and the strides nb03, nb02, and nb01. - const char* src0_ptr = (char*)src_tensor.data() + - // Calculates which block of the i03 dimension the current row belongs to - (row / (node->src[0]->ne[2] * node->src[0]->ne[1])) * node->src[0]->nb[3] + // 0 - // Calculates which block of the i02 dimension the current row belongs to within the current i03 block. - ((row / node->src[0]->ne[1]) % node->src[0]->ne[2]) * node->src[0]->nb[2] + // 0, 0,......, 0,384, 384,......, 384,768,......, 2304 - // Calculates the position within the current i02 block in terms of the i01 index. - (row % node->src[0]->ne[1]) * node->src[0]->nb[1]; // 0,2688,......,83328, 0, 2688,......,83328, 0,......, 83328 - - // Destination row pointer is linear - // Since dst is contiguous, its rows are accessed linearly using a single stride rs, simplifying the destination pointer calculation. 
- char* dst_ptr = (char*)dst_tensor.data() + row * rs; - - // Copy row - std::memcpy(dst_ptr, src0_ptr, rs); - } - - inputs[node->name] = node; - outputs[node->name] = node; - m_input_names.push_back(node->name); - m_output_names.push_back(node->name); - m_continuous = false; - break; - //} - } - case GGML_OP_CPY: - { if (ggml_is_contiguous(node)) { inputs[node->src[0]->name] = node->src[0]; outputs[node->name] = node; m_input_names.push_back(node->src[0]->name); + m_node_op_name[node->src[0]->name] = ggml_op_name(node->op); m_output_names.push_back(node->name); + + size_t valid_i = static_cast(node->src[0]->ne[0]); // 96 + size_t valid_j = static_cast(node->src[0]->ne[1]); // 32 + size_t valid_k = static_cast(node->src[0]->ne[2]); // 7 + size_t total_valid = valid_i * valid_j * valid_k; // 96 * 32 * 7 = 21504 + ov::Shape flat_input_shape = { total_valid }; + auto input_param = std::make_shared(ov::element::f32, flat_input_shape); + m_params.push_back(input_param); + + m_continuous = false; + break; + } + } + case GGML_OP_CPY: + { + if (ggml_is_contiguous(node)) { + inputs[src0_name] = node->src[0]; + outputs[node_name] = node; + m_input_names.push_back(src0_name); + m_node_op_name[src0_name] = ggml_op_name(node->op); + m_output_names.push_back(node_name); m_continuous = true; break; } else { @@ -108,12 +110,40 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapsrc[0]->name] = node->src[0]; - inputs[node->name] = node; - outputs[node->name] = node; - m_input_names.push_back(node->name); - m_output_names.push_back(node->name); + inputs[node_name] = node; + outputs[node_name] = node; + m_input_names.push_back(node_name); + m_node_op_name[node_name] = ggml_op_name(node->op); + m_output_names.push_back(node_name); m_continuous = false; break; + + // inputs[node->src[0]->name] = node->src[0]; + // std::string temp_name = node->src[0]->name + std::string("_cpy_tmp"); + // inputs[temp_name] = node; + + // outputs[node->name] = node; + // m_input_names.push_back(node->src[0]->name); + // m_input_names.push_back(temp_name); + // m_node_op_name[node->src[0]->name] = ggml_op_name(node->op); + // m_node_op_name[temp_name] = ggml_op_name(node->op); + + // m_output_names.push_back(node->name); + + // ov::Shape flat_src0_shape = {80000}; + // auto param_src0 = std::make_shared(ov::element::f32, flat_src0_shape); + // m_params.push_back(param_src0); + + // std::cout << "decoder ADDR-0: " << param_src0.get() << std::endl; + + // ov::Shape flat_dst_shape = {200000, 1}; + // auto param_dst_base = std::make_shared(ov::element::f16, flat_dst_shape); + // m_params.push_back(param_dst_base); + + // std::cout << "decoder ADDR-1: " << param_dst_base.get() << std::endl; + + // m_continuous = false; + // break; } } // For view, input is node itself @@ -122,49 +152,76 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapname] = node; outputs[node->name] = node; m_input_names.push_back(node->name); + m_node_op_name[node->name] = ggml_op_name(node->op); m_output_names.push_back(node->name); break; } // SCALE case GGML_OP_SCALE: { - inputs[node->src[0]->name] = node->src[0]; - outputs[node->name] = node; - m_input_names.push_back(node->name); - m_output_names.push_back(node->name); + inputs[src0_name] = node->src[0]; + outputs[node_name] = node; + m_input_names.push_back(node_name); + // m_node_op_name[node_name] = ggml_op_name(node->op); + m_output_names.push_back(node_name); + break; + } + case GGML_OP_MUL_MAT: + { + std::string src1_name = std::string(node->src[1]->name) + "_" + 
std::to_string(node->src[1]->view_offs) + "_input_" + ggml_op_name(node->src[1]->op);
+        if (!ggml_is_contiguous(node->src[1]) || node->src[1]->ne[0] * node->src[1]->nb[0] != node->src[1]->nb[1]) {
+            m_continuous = false;
+        } else {
+            m_continuous = true;
+        }
+        inputs[src0_name] = node->src[0];
+        inputs[src1_name] = node->src[1];
+        outputs[node_name] = node;
+        m_input_names.push_back(src0_name);
+        m_node_op_name[src0_name] = ggml_op_name(node->op);
+        m_input_names.push_back(src1_name);
+        m_node_op_name[src1_name] = ggml_op_name(node->op);
+        m_output_names.push_back(node_name);
         break;
     }
     // OPs with 2 inputs
     case GGML_OP_ADD:
     case GGML_OP_DIV:
     case GGML_OP_MUL:
-    case GGML_OP_MUL_MAT:
     case GGML_OP_SUB:
     case GGML_OP_GET_ROWS:
     case GGML_OP_SOFT_MAX:
     {
-        inputs[node->src[0]->name] = node->src[0];
-        outputs[node->name] = node;
-        m_input_names.push_back(node->src[0]->name);
-        m_output_names.push_back(node->name);
+        inputs[src0_name] = node->src[0];
+        outputs[node_name] = node;
+        m_input_names.push_back(src0_name);
+        m_node_op_name[src0_name] = ggml_op_name(node->op);
+        m_output_names.push_back(node_name);
         if (node->src[1]) {
-            inputs[node->src[1]->name] = node->src[1];
-            m_input_names.push_back(node->src[1]->name);
+            std::string src1_name = std::string(node->src[1]->name) + "_" + std::to_string(node->src[1]->view_offs) + "_input_" + ggml_op_name(node->src[1]->op);
+            inputs[src1_name] = node->src[1];
+            m_node_op_name[src1_name] = ggml_op_name(node->op);
+            m_input_names.push_back(src1_name);
         }
         break;
     }
     // OPs with 3 inputs:
     case GGML_OP_ROPE:
     {
+        std::string src1_name = std::string(node->src[1]->name) + "_" + std::to_string(node->src[1]->view_offs) + "_input_" + ggml_op_name(node->src[1]->op);
         inputs[node->src[0]->name] = node->src[0];
         inputs[node->src[1]->name] = node->src[1];
         m_input_names.push_back(node->src[0]->name);
+        m_node_op_name[node->src[0]->name] = ggml_op_name(node->op);
         m_input_names.push_back(node->src[1]->name);
+        m_node_op_name[node->src[1]->name] = ggml_op_name(node->op);
         outputs[node->name] = node;
         m_output_names.push_back(node->name);
         if (node->src[2]) {
+            std::string src2_name = std::string(node->src[2]->name) + "_" + std::to_string(node->src[2]->view_offs) + "_input_" + ggml_op_name(node->src[2]->op);
             inputs[node->src[2]->name] = node->src[2];
             m_input_names.push_back(node->src[2]->name);
+            m_node_op_name[node->src[2]->name] = ggml_op_name(node->op);
         }
         break;
     }
@@ -173,6 +230,77 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapn_nodes; i++) {
+        struct ggml_tensor * node = cgraph->nodes[i];
+
+        file << " - " << std::setw(3) << i << ": [ "
+             << std::setw(5) << node->ne[0] << ", "
+             << std::setw(5) << node->ne[1] << ", "
+             << std::setw(5) << node->ne[2] << "] "
+             << std::left << std::setw(16) << ggml_op_name(node->op) << std::right << " "
+             << " " << node->name
+             << ((node->flags & GGML_TENSOR_FLAG_PARAM) ? "x" : node->grad ? "g" : " ") << "\n";
+
+        if (node->src[0]) {
+            file << std::setw(10) << "  [ "
+                 << std::setw(5) << node->src[0]->ne[0] << ", "
+                 << std::setw(5) << node->src[0]->ne[1] << ", "
+                 << std::setw(5) << node->src[0]->ne[2] << "] "
+                 << std::setw(12)
+                 << "0: " << ggml_op_name(node->src[0]->op) << " ";
+            // // Custom logic to handle '\000'
+            // const char* name_ptr = node->src[0]->name;
+            // while (*name_ptr != '\0' || *(name_ptr + 1) != '\0' || *(name_ptr + 2) != '\0') {
+            //     file << *name_ptr;
+            //     name_ptr++;
+            // }
+            file << node->src[0]->name;
+            file << "\n";
+        }
+        if (node->src[1]) {
+            file << std::setw(10) << "  [ "
+                 << std::setw(5) << node->src[1]->ne[0] << ", "
+                 << std::setw(5) << node->src[1]->ne[1] << ", "
+                 << std::setw(5) << node->src[1]->ne[2] << "] "
+                 << std::setw(12)
+                 << "1: " << ggml_op_name(node->src[1]->op) << " ";
+            // // Custom logic to handle '\000'
+            // const char* name_ptr = node->src[1]->name;
+            // while (*name_ptr != '\0' || *(name_ptr + 1) != '\0' || *(name_ptr + 2) != '\0') {
+            //     file << *name_ptr;
+            //     name_ptr++;
+            // }
+            file << node->src[1]->name;
+            file << "\n";
+        }
+    }
+
+    file << "n_leafs = " << cgraph->n_leafs << "\n";
+    for (int i = 0; i < cgraph->n_leafs; i++) {
+        struct ggml_tensor * node = cgraph->leafs[i];
+
+        file << " - " << std::setw(3) << i << ": [ "
+             << std::setw(5) << node->ne[0] << ", "
+             << std::setw(5) << node->ne[1] << "] "
+             << std::setw(8) << ggml_op_name(node->op) << " "
+             << std::setw(16) << ggml_get_name(node) << "\n";
+    }
+
+    file << "========================================\n";
+
+    file.close();
+}
+
 GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor * node, struct ggml_cgraph * cgraph, const int32_t start_index, const int32_t end_index)
     :m_cgraph(cgraph),
     m_node(node),
@@ -193,7 +321,7 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor * node, struct ggml_cgraph * cgr
         set_input_output(cur_node, m_inputs, m_outputs);
     }
 #ifdef GGML_OPENVINO_DEBUG
-    ggml_graph_print(m_cgraph);
+    ggml_graph_op_print(m_cgraph);
 #endif
     }
 }
@@ -204,6 +332,13 @@ ov::PartialShape GgmlOvDecoder::get_input_shape(const std::string& name) const {
     ggml_tensor * node = m_inputs.at(name);
     std::vector shape;
 
+    // [TODO]: if the op is MUL_MAT, flatten the input shape to 1-D here
+    if(m_node_op_name.at(name) == "MUL_MAT") {
+        shape.push_back(static_cast(node->ne[0] * node->ne[1] * node->ne[2]));
+        input_shape = ov::PartialShape(shape);
+        return input_shape;
+    }
+
     for (int i = GGML_MAX_DIMS - 2; i >= 0 ; --i) {
         if (node->ne[i] == 0) {
             return input_shape;
@@ -214,6 +349,15 @@ ov::PartialShape GgmlOvDecoder::get_input_shape(const std::string& name) const {
     return input_shape;
 }
 
+std::vector GgmlOvDecoder::get_input_stride(const std::string& name) const {
+    std::vector stride;
+    ggml_tensor * node = m_inputs.at(name);
+    for (int i = GGML_MAX_DIMS - 2; i >= 0 ; --i) {
+        stride.push_back(static_cast(node->nb[i]));
+    }
+    return stride;
+}
+
 ov::element::Type GgmlOvDecoder::get_input_type(const std::string& name) const {
     ov::element::Type type = ov::element::dynamic;
     switch (m_inputs.at(name)->type) {
@@ -248,6 +392,18 @@ std::vector GgmlOvDecoder::get_input_names() const {
     return m_input_names;
 }
 
+const std::string& GgmlOvDecoder::get_node_op_name(const std::string& name) const {
+    auto it = m_node_op_name.find(name);
+    if (it != m_node_op_name.end()) {
+        return it->second;
+    }
+    return "";
+}
+
+const std::vector>& GgmlOvDecoder::get_params() const {
+    return m_params;
+}
+
 ov::PartialShape GgmlOvDecoder::get_output_shape(const std::string& name) const {
     ov::PartialShape output_shape;
     // Use input_node->ne
diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index f4b91f9251..0921fd8bb5 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -2,6 +2,7 @@ #include "decoder.h" #include "ggml.h" +#include "openvino/op/parameter.hpp" class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { public: @@ -16,6 +17,8 @@ public: virtual ov::PartialShape get_input_shape(const std::string& name) const override; + virtual std::vector get_input_stride(const std::string& name) const override; + virtual ov::element::Type get_input_type(const std::string& name) const override; virtual size_t get_input_size() const override; @@ -66,13 +69,10 @@ public: return m_continuous; } - virtual const std::string& get_node_op_name(const std::string& name) const { - auto it = m_node_op_name.find(name); - if (it != m_node_op_name.end()) { - return it->second; - } - return ""; - } + virtual const std::string& get_node_op_name(const std::string& name) const override; + // virtual const std::string& get_node_op_info(const std::string& name) const override; + + virtual const std::vector>& get_params() const override; private: void set_input_output(ggml_tensor* node, std::map& inputs, std::map& outputs); @@ -85,9 +85,10 @@ private: ggml_tensor* m_node; std::vector m_nodes; std::vector> m_decoders; - const std::string m_op_name; + std::string m_op_name; mutable std::string m_name; bool m_continuous; std::map m_node_op_name; + std::vector> m_params; }; diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 88d603b4ae..8fa1f99a01 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -13,13 +13,58 @@ std::shared_ptr get_ggml_decoder(struct ggml_cgraph * cgraph, con std::map get_ggml_graph_input_tensors(std::shared_ptr ggml_decoder) { std::map input_tensors; auto input_names = ggml_decoder->get_input_names(); + // auto node_name = ggml_decoder->get_op_name(); for (size_t inp = 0; inp < input_names.size(); ++inp) { auto name = input_names[inp]; + auto node_op_name = ggml_decoder->get_node_op_name(name); auto input_data = ggml_decoder->get_input_ggml_tensor(name)->data; #ifdef GGML_OPENVINO_DEBUG printf("Subgraph input %d: %g\n", inp, *(double*)(input_data)); #endif - ov::Tensor input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), ggml_decoder->get_input_shape(name).to_shape(), input_data); + ov::Tensor input_tensor; + auto input_shape = ggml_decoder->get_input_shape(name).to_shape(); + // if (node_op_name == "CPY" && (input_shape[0] != 7)) { + // input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), {80000}, input_data); + + // } else if (node_op_name == "CONT" || node_op_name == "MUL_MAT") { + // // auto input_shape = ggml_decoder->get_input_shape(name).to_shape(); + // // size_t total_size = 1; + // // for (auto dim : input_shape) { + // // total_size *= dim; + // // } + // // ov::Shape new_shape = {total_size}; + // input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), {ggml_decoder->get_input_shape(name).to_shape()[0]}, input_data); + // } else { + if (node_op_name == "CONT" && ggml_decoder->check_if_continuous()) { + ov::Shape flat_shape = { ggml_decoder->get_input_shape(name).to_shape()[0] * + ggml_decoder->get_input_shape(name).to_shape()[1] * + ggml_decoder->get_input_shape(name).to_shape()[2] }; + input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), flat_shape, input_data); + } else if ( node_op_name == "CONT" && + 
!ggml_decoder->check_if_continuous() &&
+                    input_shape[0] == 1) {
+            size_t valid_elems = static_cast(ggml_decoder->get_input_shape(name).to_shape()[2]); // 3072
+            size_t num_rows = static_cast(ggml_decoder->get_input_shape(name).to_shape()[1]); // 7
+            ov::element::Type input_type = ggml_decoder->get_input_type(name);
+            size_t element_size = input_type.size();
+            std::vector strides = ggml_decoder->get_input_stride(name);
+            size_t phys_stride = static_cast(strides[1]) / element_size;
+            size_t total_phys = (num_rows - 1) * phys_stride + valid_elems;
+            ov::Shape flat_input_shape = { total_phys };
+            input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), flat_input_shape, input_data);
+        } else if (node_op_name == "CONT") {
+            size_t valid_i = static_cast(ggml_decoder->get_input_shape(name).to_shape()[2]); // 96
+            size_t valid_j = static_cast(ggml_decoder->get_input_shape(name).to_shape()[1]); // 32
+            size_t valid_k = static_cast(ggml_decoder->get_input_shape(name).to_shape()[0]); // 7
+            size_t total_valid = valid_i * valid_j * valid_k; // 96 * 32 * 7 = 21504
+            ov::Shape flat_input_shape = { total_valid };
+            input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), flat_input_shape, input_data);
+        } else {
+            input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), ggml_decoder->get_input_shape(name).to_shape(), input_data);
+        }
+        // input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), ggml_decoder->get_input_shape(name).to_shape(), input_data);
+        // }
+
         input_tensors[name] = input_tensor;
     }
     return input_tensors;
@@ -80,6 +125,8 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
     // Convert InputModel -> ov::Model
     std::shared_ptr model = front_end->convert(input_model);
 
+    ov::save_model(model, "/home/user/zhan/merge_git_commits/llama.cpp-ov/001_model.xml");
+
     if (!model) {
         GGML_LOG_ERROR("Model is not converted \n");
     } else {
@@ -90,6 +137,7 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
 
     // Loading a model to the device
     ov::CompiledModel compiled_model = core.compile_model(model);
+    ov::save_model(compiled_model.get_runtime_model(), "/home/user/zhan/merge_git_commits/llama.cpp-ov/001_compile_model.xml");
 
     // Create infer request
     ov::InferRequest infer_request = compiled_model.create_infer_request();
diff --git a/setup.sh b/setup.sh
new file mode 100755
index 0000000000..697639dd14
--- /dev/null
+++ b/setup.sh
@@ -0,0 +1,2 @@
+cmake --build build --parallel $(nproc)
+

From 081b52667bd7de113b781c55165641c339239e29 Mon Sep 17 00:00:00 2001
From: zhanmyz
Date: Tue, 25 Feb 2025 17:29:43 +0800
Subject: [PATCH 029/254] Execute single CONT operator is OK

---
 ggml/src/ggml-openvino.cpp              |   8 +-
 ggml/src/ggml-openvino/decoder.h        |   2 +
 ggml/src/ggml-openvino/ggml-decoder.cpp | 129 +++++++++++++-----------
 ggml/src/ggml-openvino/ggml-decoder.h   |   2 +
 4 files changed, 78 insertions(+), 63 deletions(-)

diff --git a/ggml/src/ggml-openvino.cpp b/ggml/src/ggml-openvino.cpp
index 2e20e8e39b..e1c294a1d9 100644
--- a/ggml/src/ggml-openvino.cpp
+++ b/ggml/src/ggml-openvino.cpp
@@ -998,8 +998,8 @@ static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backe
             // ggml_backend_openvino_dup_bytes(cgraph->nodes[i]);
         } else if (std::find(view_indices.begin(), view_indices.end(), i) != view_indices.end()) {
             ggml_backend_openvino_view(cgraph->nodes[i]);
-        } else if (std::find(cpy_indices.begin(), cpy_indices.end(), i) != cpy_indices.end()) {
-            ggml_backend_openvino_cpy(cgraph->nodes[i]);
+        // } else if (std::find(cpy_indices.begin(), cpy_indices.end(), i) != cpy_indices.end()) {
+        //     ggml_backend_openvino_cpy(cgraph->nodes[i]);
         } else if (std::find(transpose_indices.begin(), transpose_indices.end(), i) != transpose_indices.end()) {
             ggml_backend_openvino_transpose(cgraph->nodes[i]);
         } else if (std::find(permute_indices.begin(), permute_indices.end(), i) != permute_indices.end()) {
             ggml_backend_openvino_permute(cgraph->nodes[i]);
@@ -1010,8 +1010,8 @@ static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backe
             // Process a range of nodes with openvino_frontend_compute
             int start_index = i;
             while (i < cgraph->n_nodes &&
-                   std::find(cpy_indices.begin(), cpy_indices.end(), i) == cpy_indices.end() &&
-                   //std::find(cont_indices.begin(), cont_indices.end(), i) == cont_indices.end() &&
+                   // std::find(cpy_indices.begin(), cpy_indices.end(), i) == cpy_indices.end() &&
+                   // std::find(cont_indices.begin(), cont_indices.end(), i) == cont_indices.end() &&
                    std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) == mul_mat_indices.end()) {
                 i++;
             }
diff --git a/ggml/src/ggml-openvino/decoder.h b/ggml/src/ggml-openvino/decoder.h
index ef18c12144..9a884a3374 100644
--- a/ggml/src/ggml-openvino/decoder.h
+++ b/ggml/src/ggml-openvino/decoder.h
@@ -42,6 +42,8 @@ public:
 
     virtual PartialShape get_output_shape(const std::string& name) const = 0;
 
+    virtual std::vector get_output_stride(const std::string& name) const = 0;
+
     virtual element::Type get_output_type(const std::string& name) const = 0;
 
     virtual int32_t* get_output_op_params(const std::string& name) const = 0;
diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp
index a412f8b75a..6a249c103f 100644
--- a/ggml/src/ggml-openvino/ggml-decoder.cpp
+++ b/ggml/src/ggml-openvino/ggml-decoder.cpp
@@ -7,8 +7,11 @@
 void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::map& inputs, std::map& outputs) {
     m_node_op_name[node->name] = ggml_op_name(node->op);
 
-    std::string src0_name = std::string(node->src[0]->name) + "_" + std::to_string(node->src[0]->view_offs) + "_input_" + ggml_op_name(node->src[0]->op);
-    std::string node_name = std::string(node->name) + "_" + std::to_string(node->view_offs) + "_output_" + ggml_op_name(node->op);
+    // Execute single CONT operator is OK
+    std::string src0_name = std::string(node->src[0]->name) + "_" + std::to_string(node->src[0]->view_offs) + "_" + ggml_op_name(node->src[0]->op);
+    std::string node_name = std::string(node->name) + "_" + std::to_string(node->view_offs) + "_" + ggml_op_name(node->op);
+    // std::string src0_name = std::string(node->src[0]->name) + "_" + std::to_string(node->src[0]->view_offs);
+    // std::string node_name = std::string(node->name) + "_" + std::to_string(node->view_offs);
     switch (node->op) {
     // Unary OPs
     case GGML_OP_UNARY:
@@ -17,21 +20,21 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapsrc[0]->name] = node->src[0];
-        outputs[node->name] = node;
-        m_input_names.push_back(node->src[0]->name);
-        m_node_op_name[node->src[0]->name] = ggml_op_name(node->op);
-        m_output_names.push_back(node->name);
+        inputs[src0_name] = node->src[0];
+        outputs[node_name] = node;
+        m_input_names.push_back(src0_name);
+        m_node_op_name[src0_name] = ggml_op_name(node->op);
+        m_output_names.push_back(node_name);
         break;
     }
     case GGML_OP_CONT:
     {
         if (ggml_is_contiguous(node->src[0]) && ggml_is_contiguous(node)) {
-            inputs[node->src[0]->name] = node->src[0];
-            outputs[node->name] = node;
-            m_input_names.push_back(node->src[0]->name);
-            m_node_op_name[node->src[0]->name] = ggml_op_name(node->op);
-
m_output_names.push_back(node->name); + inputs[src0_name] = node->src[0]; + outputs[node_name] = node; + m_input_names.push_back(src0_name); + m_node_op_name[src0_name] = ggml_op_name(node->op); + m_output_names.push_back(node_name); m_continuous = true; ov::Shape flat_shape = { static_cast(ggml_nelements(node)) }; @@ -51,11 +54,11 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapsrc[0]->ne[0] * ggml_type_size(node->src[0]->type)); // } - inputs[node->src[0]->name] = node->src[0]; - outputs[node->name] = node; - m_input_names.push_back(node->src[0]->name); - m_node_op_name[node->src[0]->name] = ggml_op_name(node->op); - m_output_names.push_back(node->name); + inputs[src0_name] = node->src[0]; + outputs[node_name] = node; + m_input_names.push_back(src0_name); + m_node_op_name[src0_name] = ggml_op_name(node->op); + m_output_names.push_back(node_name); const size_t element_size = ggml_type_size(node->src[0]->type); size_t valid_elems = static_cast(node->src[0]->ne[0]); // 3072 @@ -71,11 +74,11 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapsrc[0]->name] = node->src[0]; - outputs[node->name] = node; - m_input_names.push_back(node->src[0]->name); - m_node_op_name[node->src[0]->name] = ggml_op_name(node->op); - m_output_names.push_back(node->name); + inputs[src0_name] = node->src[0]; + outputs[node_name] = node; + m_input_names.push_back(src0_name); + m_node_op_name[src0_name] = ggml_op_name(node->op); + m_output_names.push_back(node_name); size_t valid_i = static_cast(node->src[0]->ne[0]); // 96 size_t valid_j = static_cast(node->src[0]->ne[1]); // 32 @@ -98,6 +101,10 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapop); m_output_names.push_back(node_name); m_continuous = true; + + ov::Shape src_shape(node->src[0]->ne, node->src[0]->ne + 4); + auto input_param = std::make_shared(ov::element::f32, src_shape); + m_params.push_back(input_param); break; } else { for (int64_t i1 = 0; i1 < node->ne[1]; ++i1) { // ne[1] = 3072 @@ -118,57 +125,52 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapsrc[0]->name] = node->src[0]; - // std::string temp_name = node->src[0]->name + std::string("_cpy_tmp"); + // inputs[src0_name] = node->src[0]; + // std::string temp_name = src0_name + std::string("_cpy_tmp"); // inputs[temp_name] = node; - // outputs[node->name] = node; - // m_input_names.push_back(node->src[0]->name); + // outputs[node_name] = node; + // m_input_names.push_back(src0_name); // m_input_names.push_back(temp_name); - // m_node_op_name[node->src[0]->name] = ggml_op_name(node->op); + // m_node_op_name[src0_name] = ggml_op_name(node->op); // m_node_op_name[temp_name] = ggml_op_name(node->op); + // m_output_names.push_back(node_name); + // m_continuous = false; - // m_output_names.push_back(node->name); - - // ov::Shape flat_src0_shape = {80000}; + // ov::Shape flat_src0_shape = {node->src[0]->nb[2]}; // auto param_src0 = std::make_shared(ov::element::f32, flat_src0_shape); // m_params.push_back(param_src0); - // std::cout << "decoder ADDR-0: " << param_src0.get() << std::endl; - - // ov::Shape flat_dst_shape = {200000, 1}; + // ov::Shape flat_dst_shape = {node->nb[2], 1}; // auto param_dst_base = std::make_shared(ov::element::f16, flat_dst_shape); // m_params.push_back(param_dst_base); - // std::cout << "decoder ADDR-1: " << param_dst_base.get() << std::endl; - - // m_continuous = false; - // break; + break; } } // For view, input is node itself case GGML_OP_VIEW: { - inputs[node->name] = node; - outputs[node->name] = node; - 
m_input_names.push_back(node->name); - m_node_op_name[node->name] = ggml_op_name(node->op); - m_output_names.push_back(node->name); + inputs[node_name] = node; + outputs[node_name] = node; + m_input_names.push_back(node_name); + m_node_op_name[node_name] = ggml_op_name(node->op); + m_output_names.push_back(node_name); break; } // SCALE case GGML_OP_SCALE: { - inputs[src0_name] = node->src[0]; + inputs[node_name] = node->src[0]; outputs[node_name] = node; m_input_names.push_back(node_name); - // m_node_op_name[node_name] = ggml_op_name(node->op); + m_node_op_name[node_name] = ggml_op_name(node->op); m_output_names.push_back(node_name); break; } case GGML_OP_MUL_MAT: { - std::string src1_name = std::string(node->src[1]->name) + "_" + std::to_string(node->src[1]->view_offs) + "_input_" + ggml_op_name(node->src[1]->op); + std::string src1_name = std::string(node->src[1]->name) + "_" + std::to_string(node->src[1]->view_offs); if (!ggml_is_contiguous(node->src[1]) || node->src[1]->ne[0] * node->src[1]->nb[0] != node->src[1]->nb[1]) { m_continuous = false; } else { @@ -198,7 +200,7 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapop); m_output_names.push_back(node_name); if (node->src[1]) { - std::string src1_name = std::string(node->src[1]->name) + "_" + std::to_string(node->src[1]->view_offs) + "_input_" + ggml_op_name(node->src[1]->op); + std::string src1_name = std::string(node->src[1]->name) + "_" + std::to_string(node->src[1]->view_offs); inputs[src1_name] = node->src[1]; m_node_op_name[src1_name] = ggml_op_name(node->op); m_input_names.push_back(src1_name); @@ -208,20 +210,20 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapsrc[1]->name) + "_" + std::to_string(node->src[1]->view_offs) + "_input_" + ggml_op_name(node->src[1]->op); - inputs[node->src[0]->name] = node->src[0]; - inputs[node->src[1]->name] = node->src[1]; - m_input_names.push_back(node->src[0]->name); - m_node_op_name[node->src[0]->name] = ggml_op_name(node->op); - m_input_names.push_back(node->src[1]->name); - m_node_op_name[node->src[1]->name] = ggml_op_name(node->op); - outputs[node->name] = node; - m_output_names.push_back(node->name); + std::string src1_name = std::string(node->src[1]->name) + "_" + std::to_string(node->src[1]->view_offs); + inputs[src0_name] = node->src[0]; + inputs[src1_name] = node->src[1]; + m_input_names.push_back(src0_name); + m_node_op_name[src0_name] = ggml_op_name(node->op); + m_input_names.push_back(src1_name); + m_node_op_name[src1_name] = ggml_op_name(node->op); + outputs[node_name] = node; + m_output_names.push_back(node_name); if (node->src[2]) { - std::string src2_name = std::string(node->src[2]->name) + "_" + std::to_string(node->src[2]->view_offs) + "_input_" + ggml_op_name(node->src[2]->op); - inputs[node->src[2]->name] = node->src[2]; - m_input_names.push_back(node->src[2]->name); - m_node_op_name[node->src[2]->name] = ggml_op_name(node->op); + std::string src2_name = std::string(node->src[2]->name) + "_" + std::to_string(node->src[2]->view_offs); + inputs[src2_name] = node->src[2]; + m_input_names.push_back(src2_name); + m_node_op_name[src2_name] = ggml_op_name(node->op); } break; } @@ -358,6 +360,15 @@ std::vector GgmlOvDecoder::get_input_stride(const std::string& name) con return stride; } +std::vector GgmlOvDecoder::get_output_stride(const std::string& name) const { + std::vector stride; + ggml_tensor * node = m_outputs.at(name); + for (int i = GGML_MAX_DIMS - 2; i >= 0 ; --i) { + stride.push_back(static_cast(node->nb[i])); + } + return stride; +} 
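The strides collected by get_input_stride/get_output_stride above are raw ggml byte strides (nb[]), which the tensor-wrapping code later divides by the element size before treating them as element offsets. A small self-contained sketch of that conversion, assuming ggml's nb[] convention; the helper name is illustrative and not part of the patch:

    #include <cstddef>
    #include <vector>

    // ggml stores byte strides in nb[]; OpenVINO shapes count elements, so a
    // consumer divides by the element size first (f32 here => 4 bytes).
    std::vector<size_t> to_element_strides(const std::vector<size_t> & byte_strides, size_t element_size) {
        std::vector<size_t> out;
        out.reserve(byte_strides.size());
        for (size_t nb : byte_strides) {
            out.push_back(nb / element_size); // e.g. 36864 bytes -> 9216 floats per row
        }
        return out;
    }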
+ ov::element::Type GgmlOvDecoder::get_input_type(const std::string& name) const { ov::element::Type type = ov::element::dynamic; switch (m_inputs.at(name)->type) { diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index 0921fd8bb5..98c418dd6a 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -39,6 +39,8 @@ public: virtual ov::PartialShape get_output_shape(const std::string& name) const override; + virtual std::vector get_output_stride(const std::string& name) const override; + virtual ov::element::Type get_output_type(const std::string& name) const override; virtual int32_t* get_output_op_params(const std::string& name) const override; From 901f7347ff3517e0436e815b6adf2cc271930369 Mon Sep 17 00:00:00 2001 From: zhanmyz Date: Sat, 1 Mar 2025 22:18:43 +0800 Subject: [PATCH 030/254] Execute CONT & VIEW operators in OV Frontend is OK --- ggml/src/ggml-openvino.cpp | 69 ++++++++++++++++++------- ggml/src/ggml-openvino/ggml-decoder.cpp | 53 +++++++++++-------- ggml/src/ggml-openvino/utils.cpp | 20 +++---- 3 files changed, 91 insertions(+), 51 deletions(-) diff --git a/ggml/src/ggml-openvino.cpp b/ggml/src/ggml-openvino.cpp index e1c294a1d9..35f04f32c3 100644 --- a/ggml/src/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino.cpp @@ -482,6 +482,9 @@ void ggml_backend_openvino_mul_mat(struct ggml_tensor * dst) { // flat shapes: ov::Shape flat_shape_src0 = { total_src0 }; ov::Shape flat_shape_src1 = { total_src1 }; + // Same as above + // ov::Shape flat_shape_src0 = { ggml_nelements(src0) }; + // ov::Shape flat_shape_src1 = { ggml_nelements(src1) }; // Create a Parameter node for collecting non-continuous data auto param_src0 = std::make_shared(ov::element::f16, flat_shape_src0); @@ -526,9 +529,6 @@ void ggml_backend_openvino_mul_mat(struct ggml_tensor * dst) { auto batched_matmul = std::make_shared(B, A, false, false); // batched_matmul output: shape = [32,7,32] - std::vector full_dst_shape = { dst->ne[2], dst->ne[1], dst->ne[0]}; - auto final_shape_const = ov::op::v0::Constant::create(ov::element::i64, { full_dst_shape.size() }, full_dst_shape); - auto model = std::make_shared(ov::NodeVector{ batched_matmul }, ov::ParameterVector{param_src0, param_src1}); ov::Core core; @@ -541,7 +541,7 @@ void ggml_backend_openvino_mul_mat(struct ggml_tensor * dst) { infer_request.set_input_tensor(0, tensor_src0); infer_request.set_input_tensor(1, tensor_src1); - ov::Tensor tensor_dst(ov::element::f32, ov::Shape(full_dst_shape.begin(), full_dst_shape.end()), dst->data); + ov::Tensor tensor_dst(ov::element::f32, { dst->ne[0], dst->ne[1], dst->ne[2]}, dst->data); infer_request.set_output_tensor(0, tensor_dst); infer_request.infer(); @@ -564,6 +564,9 @@ void ggml_backend_openvino_mul_mat(struct ggml_tensor * dst) { ov::Shape flat_shape_src0 = { total_src0 }; ov::Shape flat_shape_src1 = { total_src1 }; + // Same as above + // ov::Shape flat_shape_src0 = { ggml_nelements(src0) }; + // ov::Shape flat_shape_src1 = { ggml_nelements(src1) }; auto param_flat_src0 = std::make_shared(ov::element::f16, flat_shape_src0); auto param_flat_src1 = std::make_shared(ov::element::f32, flat_shape_src1); @@ -602,6 +605,7 @@ void ggml_backend_openvino_mul_mat(struct ggml_tensor * dst) { std::shared_ptr matmul = std::make_shared(reshape_src1, A_for_mul, false, false); auto model = std::make_shared(ov::NodeVector{matmul}, ov::ParameterVector{param_flat_src0, param_flat_src1}); + // ov::save_model(model, 
"/home/user/zhan/merge_git_commits/llama.cpp-ov/002_backend_mulmat_model.xml"); auto compiled_model = core.compile_model(model, "CPU"); auto infer_request = compiled_model.create_infer_request(); @@ -618,8 +622,35 @@ void ggml_backend_openvino_reshape(ggml_tensor *dst) { } void ggml_backend_openvino_view(ggml_tensor *dst) { + ov::Core core; + ov::Shape tensor_shape{static_cast(dst->ne[3]), static_cast(dst->ne[2]), static_cast(dst->ne[1]), static_cast(dst->ne[0])}; - GGML_UNUSED(dst); + // auto param = std::make_shared(ov::element::f32, tensor_shape); + auto param = std::make_shared(ov::element::f16, tensor_shape); + + auto reshaped = std::make_shared(param, + ov::op::v0::Constant::create(ov::element::i64, { tensor_shape.size() }, tensor_shape), + false); + + auto model = std::make_shared(ov::NodeVector{reshaped}, ov::ParameterVector{param}); + // ov::save_model(model, "/home/user/zhan/merge_git_commits/llama.cpp-ov/003_backend_view_model.xml"); + + auto compiled_model = core.compile_model(model, "CPU"); + + ov::InferRequest infer_request = compiled_model.create_infer_request(); + + // ov::Tensor input_tensor(ov::element::f32, tensor_shape, dst->data); + ov::Tensor input_tensor(ov::element::f16, tensor_shape, dst->data); + // infer_request.set_tensor(param, input_tensor); + infer_request.set_input_tensor(0, input_tensor); + + // ov::Tensor output_tensor(ov::element::f32, tensor_shape, dst->data); + ov::Tensor output_tensor(ov::element::f16, tensor_shape, dst->data); + infer_request.set_output_tensor(0, output_tensor); + + infer_request.infer(); + // auto output_tensor = infer_request.get_output_tensor(0); + // dst->data = output_tensor.data(); } void ggml_backend_openvino_dup_bytes(struct ggml_tensor *dst) { @@ -992,31 +1023,33 @@ static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backe // openvino_frontend_compute(backend, cgraph); // Process nodes in order for (int i = 0; i < cgraph->n_nodes; i++) { - if (std::find(reshape_indices.begin(), reshape_indices.end(), i) != reshape_indices.end()) { - ggml_backend_openvino_reshape(cgraph->nodes[i]); + if (std::find(permute_indices.begin(), permute_indices.end(), i) != permute_indices.end()) { + ggml_backend_openvino_permute(cgraph->nodes[i]); // } else if (std::find(cont_indices.begin(), cont_indices.end(), i) != cont_indices.end()) { // ggml_backend_openvino_dup_bytes(cgraph->nodes[i]); - } else if (std::find(view_indices.begin(), view_indices.end(), i) != view_indices.end()) { - ggml_backend_openvino_view(cgraph->nodes[i]); + // } else if (std::find(view_indices.begin(), view_indices.end(), i) != view_indices.end()) { + // ggml_backend_openvino_view(cgraph->nodes[i]); // } else if (std::find(cpy_indices.begin(), cpy_indices.end(), i) != cpy_indices.end()) { // ggml_backend_openvino_cpy(cgraph->nodes[i]); } else if (std::find(transpose_indices.begin(), transpose_indices.end(), i) != transpose_indices.end()) { ggml_backend_openvino_transpose(cgraph->nodes[i]); - } else if (std::find(permute_indices.begin(), permute_indices.end(), i) != permute_indices.end()) { - ggml_backend_openvino_permute(cgraph->nodes[i]); - } else if (std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) != mul_mat_indices.end()) { - ggml_backend_openvino_mul_mat(cgraph->nodes[i]); + } else if (std::find(reshape_indices.begin(), reshape_indices.end(), i) != reshape_indices.end()) { + ggml_backend_openvino_reshape(cgraph->nodes[i]); + // } else if (std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) != mul_mat_indices.end()) { + // 
ggml_backend_openvino_mul_mat(cgraph->nodes[i]); } else { // Process a range of nodes with openvino_frontend_compute int start_index = i; - while (i < cgraph->n_nodes && - // std::find(cpy_indices.begin(), cpy_indices.end(), i) == cpy_indices.end() && - // std::find(cont_indices.begin(), cont_indices.end(), i) == cont_indices.end() && - std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) == mul_mat_indices.end()) { + while (i < cgraph->n_nodes + // && std::find(view_indices.begin(), view_indices.end(), i) == view_indices.end() + // && std::find(cpy_indices.begin(), cpy_indices.end(), i) == cpy_indices.end() + // && std::find(cont_indices.begin(), cont_indices.end(), i) == cont_indices.end() + // && std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) == mul_mat_indices.end() + ) { i++; } if (start_index < i) { - openvino_frontend_compute(backend, cgraph, start_index, --i); + openvino_frontend_compute(backend, cgraph, start_index, --i); } } } diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 6a249c103f..fab8d4aed6 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -6,12 +6,20 @@ #include void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::map& inputs, std::map& outputs) { - m_node_op_name[node->name] = ggml_op_name(node->op); + // m_node_op_name[node->name] = ggml_op_name(node->op); + + // std::string src0_name = std::string(node->src[0]->name) + "_" + std::to_string(node->src[0]->view_offs) + "_input_" + ggml_op_name(node->src[0]->op); + // std::string node_name = std::string(node->name) + "_" + std::to_string(node->view_offs) + "_output_" + ggml_op_name(node->op); + // Execute singel CONT operator is OK - std::string src0_name = std::string(node->src[0]->name) + "_" + std::to_string(node->src[0]->view_offs) + "_" + ggml_op_name(node->src[0]->op); - std::string node_name = std::string(node->name) + "_" + std::to_string(node->view_offs) + "_" + ggml_op_name(node->op); + // std::string src0_name = std::string(node->src[0]->name) + "_" + std::to_string(node->src[0]->view_offs) + "_" + ggml_op_name(node->src[0]->op); + // std::string node_name = std::string(node->name) + "_" + std::to_string(node->view_offs) + "_" + ggml_op_name(node->op); + // std::string src0_name = std::string(node->src[0]->name) + "_" + std::to_string(node->src[0]->view_offs); // std::string node_name = std::string(node->name) + "_" + std::to_string(node->view_offs); + + std::string src0_name = std::string(node->src[0]->name); + std::string node_name = std::string(node->name); switch (node->op) { // Unary OPs case GGML_OP_UNARY: @@ -151,6 +159,7 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapname) + "_" + std::to_string(node->view_offs) + "_output_" + ggml_op_name(node->op); inputs[node_name] = node; outputs[node_name] = node; m_input_names.push_back(node_name); @@ -161,21 +170,29 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapsrc[0]; + inputs[src0_name] = node->src[0]; outputs[node_name] = node; - m_input_names.push_back(node_name); - m_node_op_name[node_name] = ggml_op_name(node->op); + m_input_names.push_back(src0_name); + m_node_op_name[src0_name] = ggml_op_name(node->op); m_output_names.push_back(node_name); break; } case GGML_OP_MUL_MAT: { - std::string src1_name = std::string(node->src[1]->name) + "_" + std::to_string(node->src[1]->view_offs); + ov::Shape flat_shape_src0 = { node->src[0]->ne[0]*node->src[0]->ne[1]*node->src[0]->ne[2] }; + ov::Shape 
flat_shape_src1 = { node->src[1]->ne[0]*node->src[1]->ne[1]*node->src[1]->ne[2] }; + auto param_src0 = std::make_shared(ov::element::f16, flat_shape_src0); + auto param_src1 = std::make_shared(ov::element::f32, flat_shape_src1); + m_params.push_back(param_src0); + m_params.push_back(param_src1); if (!ggml_is_contiguous(node->src[1]) || node->src[1]->ne[0] * node->src[1]->nb[0] != node->src[1]->nb[1]) { m_continuous = false; } else { m_continuous = true; } + // std::string src1_name = std::string(node->src[1]->name) + "_" + std::to_string(node->src[1]->view_offs) + "_input_" + ggml_op_name(node->src[1]->op); + // std::string src1_name = std::string(node->src[1]->name) + "_" + std::to_string(node->src[1]->view_offs); + std::string src1_name = std::string(node->src[1]->name); inputs[src0_name] = node->src[0]; inputs[src1_name] = node->src[1]; outputs[node_name] = node; @@ -200,7 +217,8 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapop); m_output_names.push_back(node_name); if (node->src[1]) { - std::string src1_name = std::string(node->src[1]->name) + "_" + std::to_string(node->src[1]->view_offs); + // std::string src1_name = std::string(node->src[1]->name) + "_" + std::to_string(node->src[1]->view_offs); + std::string src1_name = std::string(node->src[1]->name); inputs[src1_name] = node->src[1]; m_node_op_name[src1_name] = ggml_op_name(node->op); m_input_names.push_back(src1_name); @@ -210,7 +228,8 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapsrc[1]->name) + "_" + std::to_string(node->src[1]->view_offs); + // std::string src1_name = std::string(node->src[1]->name) + "_" + std::to_string(node->src[1]->view_offs); + std::string src1_name = std::string(node->src[1]->name); inputs[src0_name] = node->src[0]; inputs[src1_name] = node->src[1]; m_input_names.push_back(src0_name); @@ -220,7 +239,8 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapsrc[2]) { - std::string src2_name = std::string(node->src[2]->name) + "_" + std::to_string(node->src[2]->view_offs); + // std::string src2_name = std::string(node->src[2]->name) + "_" + std::to_string(node->src[2]->view_offs); + std::string src2_name = std::string(node->src[2]->name); inputs[src2_name] = node->src[2]; m_input_names.push_back(src2_name); m_node_op_name[src2_name] = ggml_op_name(node->op); @@ -334,13 +354,6 @@ ov::PartialShape GgmlOvDecoder::get_input_shape(const std::string& name) const { ggml_tensor * node = m_inputs.at(name); std::vector shape; - // [TODO], 在这里判断如果是MUL_MAT就设置shape为一维 - if(m_node_op_name.at(name) == "MUL_MAT") { - shape.push_back(static_cast(node->ne[0] * node->ne[1] * node->ne[2])); - input_shape = ov::PartialShape(shape); - return input_shape; - } - for (int i = GGML_MAX_DIMS - 2; i >= 0 ; --i) { if (node->ne[i] == 0) { return input_shape; @@ -405,10 +418,8 @@ std::vector GgmlOvDecoder::get_input_names() const { const std::string& GgmlOvDecoder::get_node_op_name(const std::string& name) const { auto it = m_node_op_name.find(name); - if (it != m_node_op_name.end()) { - return it->second; - } - return ""; + static const std::string empty_str; + return (it != m_node_op_name.end()) ? 
it->second : empty_str; } const std::vector>& GgmlOvDecoder::get_params() const { diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 8fa1f99a01..21edad596b 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -26,18 +26,9 @@ std::map get_ggml_graph_input_tensors(std::shared_ptrget_input_type(name), {80000}, input_data); - // } else if (node_op_name == "CONT" || node_op_name == "MUL_MAT") { - // // auto input_shape = ggml_decoder->get_input_shape(name).to_shape(); - // // size_t total_size = 1; - // // for (auto dim : input_shape) { - // // total_size *= dim; - // // } - // // ov::Shape new_shape = {total_size}; - // input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), {ggml_decoder->get_input_shape(name).to_shape()[0]}, input_data); - // } else { if (node_op_name == "CONT" && ggml_decoder->check_if_continuous()) { - ov::Shape flat_shape = { ggml_decoder->get_input_shape(name).to_shape()[0] * - ggml_decoder->get_input_shape(name).to_shape()[1] * + ov::Shape flat_shape = { ggml_decoder->get_input_shape(name).to_shape()[0] * + ggml_decoder->get_input_shape(name).to_shape()[1] * ggml_decoder->get_input_shape(name).to_shape()[2] }; input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), flat_shape, input_data); } else if ( node_op_name == "CONT" && @@ -59,6 +50,11 @@ std::map get_ggml_graph_input_tensors(std::shared_ptrget_input_type(name), flat_input_shape, input_data); + } else if (node_op_name == "MUL_MAT") { + ov::Shape flat_shape = { ggml_decoder->get_input_shape(name).to_shape()[0] * + ggml_decoder->get_input_shape(name).to_shape()[1] * + ggml_decoder->get_input_shape(name).to_shape()[2] }; + input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), flat_shape, input_data); } else { input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), ggml_decoder->get_input_shape(name).to_shape(), input_data); } @@ -125,7 +121,7 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c // Convert InputModel -> ov::Model std::shared_ptr model = front_end->convert(input_model); - ov::save_model(model, "/home/user/zhan/merge_git_commits/llama.cpp-ov/001_model.xml"); + // ov::save_model(model, "/home/user/zhan/merge_git_commits/llama.cpp-ov/001_model.xml"); if (!model) { GGML_LOG_ERROR("Model is not converted \n"); From 95ae982d590e8844517f1ffed910a7642150732f Mon Sep 17 00:00:00 2001 From: zhanmyz Date: Tue, 4 Mar 2025 00:05:00 +0800 Subject: [PATCH 031/254] OV Frontend supports GET_ROWS/RMS_NORM/MUL/MUL_MAT graph conversion of consecutive OPs --- ggml/src/ggml-openvino.cpp | 64 +++++++++++++------------ ggml/src/ggml-openvino/ggml-decoder.cpp | 46 +++++++++++++----- ggml/src/ggml-openvino/utils.cpp | 11 ++++- 3 files changed, 78 insertions(+), 43 deletions(-) diff --git a/ggml/src/ggml-openvino.cpp b/ggml/src/ggml-openvino.cpp index 35f04f32c3..883e43365f 100644 --- a/ggml/src/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino.cpp @@ -1020,39 +1020,41 @@ static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backe } } + int end_node = cgraph->n_nodes - 1; + openvino_frontend_compute(backend, cgraph, 0, end_node); // openvino_frontend_compute(backend, cgraph); // Process nodes in order - for (int i = 0; i < cgraph->n_nodes; i++) { - if (std::find(permute_indices.begin(), permute_indices.end(), i) != permute_indices.end()) { - ggml_backend_openvino_permute(cgraph->nodes[i]); - // } else if (std::find(cont_indices.begin(), cont_indices.end(), i) != cont_indices.end()) { 
- // ggml_backend_openvino_dup_bytes(cgraph->nodes[i]); - // } else if (std::find(view_indices.begin(), view_indices.end(), i) != view_indices.end()) { - // ggml_backend_openvino_view(cgraph->nodes[i]); - // } else if (std::find(cpy_indices.begin(), cpy_indices.end(), i) != cpy_indices.end()) { - // ggml_backend_openvino_cpy(cgraph->nodes[i]); - } else if (std::find(transpose_indices.begin(), transpose_indices.end(), i) != transpose_indices.end()) { - ggml_backend_openvino_transpose(cgraph->nodes[i]); - } else if (std::find(reshape_indices.begin(), reshape_indices.end(), i) != reshape_indices.end()) { - ggml_backend_openvino_reshape(cgraph->nodes[i]); - // } else if (std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) != mul_mat_indices.end()) { - // ggml_backend_openvino_mul_mat(cgraph->nodes[i]); - } else { - // Process a range of nodes with openvino_frontend_compute - int start_index = i; - while (i < cgraph->n_nodes - // && std::find(view_indices.begin(), view_indices.end(), i) == view_indices.end() - // && std::find(cpy_indices.begin(), cpy_indices.end(), i) == cpy_indices.end() - // && std::find(cont_indices.begin(), cont_indices.end(), i) == cont_indices.end() - // && std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) == mul_mat_indices.end() - ) { - i++; - } - if (start_index < i) { - openvino_frontend_compute(backend, cgraph, start_index, --i); - } - } - } + // for (int i = 0; i < cgraph->n_nodes; i++) { + // if (std::find(permute_indices.begin(), permute_indices.end(), i) != permute_indices.end()) { + // ggml_backend_openvino_permute(cgraph->nodes[i]); + // // } else if (std::find(cont_indices.begin(), cont_indices.end(), i) != cont_indices.end()) { + // // ggml_backend_openvino_dup_bytes(cgraph->nodes[i]); + // // } else if (std::find(view_indices.begin(), view_indices.end(), i) != view_indices.end()) { + // // ggml_backend_openvino_view(cgraph->nodes[i]); + // // } else if (std::find(cpy_indices.begin(), cpy_indices.end(), i) != cpy_indices.end()) { + // // ggml_backend_openvino_cpy(cgraph->nodes[i]); + // // } else if (std::find(transpose_indices.begin(), transpose_indices.end(), i) != transpose_indices.end()) { + // // ggml_backend_openvino_transpose(cgraph->nodes[i]); + // // } else if (std::find(reshape_indices.begin(), reshape_indices.end(), i) != reshape_indices.end()) { + // // ggml_backend_openvino_reshape(cgraph->nodes[i]); + // // } else if (std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) != mul_mat_indices.end()) { + // // ggml_backend_openvino_mul_mat(cgraph->nodes[i]); + // } else { + // // Process a range of nodes with openvino_frontend_compute + // int start_index = i; + // while (i < cgraph->n_nodes + // // && std::find(view_indices.begin(), view_indices.end(), i) == view_indices.end() + // // && std::find(cpy_indices.begin(), cpy_indices.end(), i) == cpy_indices.end() + // // && std::find(cont_indices.begin(), cont_indices.end(), i) == cont_indices.end() + // // && std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) == mul_mat_indices.end() + // ) { + // i++; + // } + // if (start_index < i) { + // openvino_frontend_compute(backend, cgraph, start_index, --i); + // } + // } + // } return GGML_STATUS_SUCCESS; diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index fab8d4aed6..90755ec9a6 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -20,6 +20,7 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapsrc[0]->name); 
std::string node_name = std::string(node->name); + switch (node->op) { // Unary OPs case GGML_OP_UNARY: @@ -110,7 +111,7 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapsrc[0]->ne, node->src[0]->ne + 4); + ov::Shape src_shape(node->src[0]->ne, node->src[0]->ne + 3); auto input_param = std::make_shared(ov::element::f32, src_shape); m_params.push_back(input_param); break; @@ -217,6 +218,7 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapop); m_output_names.push_back(node_name); if (node->src[1]) { + // std::string src1_name = std::string(node->src[1]->name) + "_" + std::to_string(node->src[1]->view_offs) + "_input_" + ggml_op_name(node->src[1]->op); // std::string src1_name = std::string(node->src[1]->name) + "_" + std::to_string(node->src[1]->view_offs); std::string src1_name = std::string(node->src[1]->name); inputs[src1_name] = node->src[1]; @@ -228,6 +230,7 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapsrc[1]->name) + "_" + std::to_string(node->src[1]->view_offs) + "_input_" + ggml_op_name(node->src[1]->op); // std::string src1_name = std::string(node->src[1]->name) + "_" + std::to_string(node->src[1]->view_offs); std::string src1_name = std::string(node->src[1]->name); inputs[src0_name] = node->src[0]; @@ -239,6 +242,7 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapsrc[2]) { + // std::string src2_name = std::string(node->src[2]->name) + "_" + std::to_string(node->src[2]->view_offs) + "_input_" + ggml_op_name(node->src[2]->op); // std::string src2_name = std::string(node->src[2]->name) + "_" + std::to_string(node->src[2]->view_offs); std::string src2_name = std::string(node->src[2]->name); inputs[src2_name] = node->src[2]; @@ -253,7 +257,7 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapn_nodes; i++) { struct ggml_tensor * node = cgraph->nodes[i]; @@ -269,9 +280,14 @@ void ggml_graph_op_print(const struct ggml_cgraph * cgraph) { << std::setw(5) << node->ne[0] << ", " << std::setw(5) << node->ne[1] << ", " << std::setw(5) << node->ne[2] << "] " - << std::left << std::setw(16) << ggml_op_name(node->op) << std::right << " " - << " " << node->name - << ((node->flags & GGML_TENSOR_FLAG_PARAM) ? "x" : node->grad ? "g" : " ") << "\n"; + << std::left << std::setw(20) << ggml_op_name(node->op) << std::right << " " + << std::left << std::setw(44) << node->name << std::right + << ((node->flags & GGML_TENSOR_FLAG_PARAM) ? "x" : node->grad ? 
"g" : " ") + << std::setw(2) << "[ " + << std::setw(0) << node->nb[0] << ", " + << std::setw(5) << node->nb[1] << ", " + << std::setw(5) << node->nb[2] << "] " + << "\n"; if (node->src[0]) { file << std::setw(10) << " [ " @@ -279,15 +295,19 @@ void ggml_graph_op_print(const struct ggml_cgraph * cgraph) { << std::setw(5) << node->src[0]->ne[1] << ", " << std::setw(5) << node->src[0]->ne[2] << "] " << std::setw(12) - << "0: " << ggml_op_name(node->src[0]->op) << " "; + << "0: " << std::left << std::setw(12) << ggml_op_name(node->src[0]->op) << std::right; // // Custom logic to handle '\000' // const char* name_ptr = node->src[0]->name; // while (*name_ptr != '\0' || *(name_ptr + 1) != '\0' || *(name_ptr + 2) != '\0') { // file << *name_ptr; // name_ptr++; // } - file << node->src[0]->name; - file << "\n"; + file << std::left << std::setw(30) << node->src[0]->name << std::right + << std::setw(16) << "[ " + << std::setw(0) << node->src[0]->nb[0] << ", " + << std::setw(5) << node->src[0]->nb[1] << ", " + << std::setw(5) << node->src[0]->nb[2] << "] " + << "\n"; } if (node->src[1]) { file << std::setw(10) << " [ " @@ -295,15 +315,19 @@ void ggml_graph_op_print(const struct ggml_cgraph * cgraph) { << std::setw(5) << node->src[1]->ne[1] << ", " << std::setw(5) << node->src[1]->ne[2] << "] " << std::setw(12) - << "1: " << ggml_op_name(node->src[1]->op) << " "; + << "1: " << std::left << std::setw(12) << ggml_op_name(node->src[1]->op) << std::right; // // Custom logic to handle '\000' // const char* name_ptr = node->src[1]->name; // while (*name_ptr != '\0' || *(name_ptr + 1) != '\0' || *(name_ptr + 2) != '\0') { // file << *name_ptr; // name_ptr++; // } - file << node->src[1]->name; - file << "\n"; + file << std::left << std::setw(30) << node->src[1]->name << std::right + << std::setw(16) << "[ " + << std::setw(0) << node->src[1]->nb[0] << ", " + << std::setw(5) << node->src[1]->nb[1] << ", " + << std::setw(5) << node->src[1]->nb[2] << "] " + << "\n"; } } diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 21edad596b..4b25c13689 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -121,7 +121,7 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c // Convert InputModel -> ov::Model std::shared_ptr model = front_end->convert(input_model); - // ov::save_model(model, "/home/user/zhan/merge_git_commits/llama.cpp-ov/001_model.xml"); + ov::save_model(model, "/home/user/zhan/merge_git_commits/llama.cpp-ov/001_model.xml"); if (!model) { GGML_LOG_ERROR("Model is not converted \n"); @@ -145,6 +145,14 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c // Set input tensor for (size_t i = 0; i < input_names.size(); i++) { infer_request.set_input_tensor(i, input_tensors[input_names[i]]); + + // auto input_tensor = infer_request.get_input_tensor(i); + // auto input_shape = input_tensor.get_shape(); + // std::cout << "Input tensor " << i << " shape: "; + // for (const auto& dim : input_shape) { + // std::cout << dim << " "; + // } + // std::cout << std::endl; } infer_request.infer(); @@ -155,6 +163,7 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c for (size_t i = 0; i < output_names.size(); i++) { // std::string op_name = ggml_decoder->get_node_op_name(output_names[i]); auto output_tensor = infer_request.get_output_tensor(i); + // output_tensor.get_shape(); std::memcpy(output_tensors[output_names[i]], output_tensor.data(), 
output_tensor.get_byte_size()); #ifdef GGML_OPENVINO_DEBUG printf("Output %s after: %g\n", output_names[i].c_str(), *(double*)(output_tensor.data())); From 9a7b7d8d6de4512daddcb81d95f4f1ca50a83a47 Mon Sep 17 00:00:00 2001 From: zhanmyz Date: Wed, 5 Mar 2025 18:50:18 +0800 Subject: [PATCH 032/254] OV Frontend supports GET_ROWS/RMS_NORM/MUL/MUL_MAT/ROPE/SCALE/SOFTMAX/ADD adjacent op graph conversion --- ggml/src/ggml-openvino.cpp | 1 - ggml/src/ggml-openvino/decoder.h | 2 ++ ggml/src/ggml-openvino/ggml-decoder.cpp | 38 +++++++++++++++++++++++-- ggml/src/ggml-openvino/ggml-decoder.h | 3 +- ggml/src/ggml-openvino/utils.cpp | 17 ++++++----- 5 files changed, 49 insertions(+), 12 deletions(-) diff --git a/ggml/src/ggml-openvino.cpp b/ggml/src/ggml-openvino.cpp index 883e43365f..8cc4de05b1 100644 --- a/ggml/src/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino.cpp @@ -1279,7 +1279,6 @@ static const std::set& openvino_ops = []() -> const std::setop); + m_op_node_name.emplace_back(src0_name, ggml_op_name(node->op)); m_output_names.push_back(node_name); break; } @@ -43,6 +44,7 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapop); + m_op_node_name.emplace_back(src0_name, ggml_op_name(node->op)); m_output_names.push_back(node_name); m_continuous = true; @@ -67,13 +69,15 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapop); + m_op_node_name.emplace_back(src0_name, ggml_op_name(node->op)); m_output_names.push_back(node_name); const size_t element_size = ggml_type_size(node->src[0]->type); size_t valid_elems = static_cast(node->src[0]->ne[0]); // 3072 size_t num_rows = static_cast(node->src[0]->ne[1]); // 7 size_t phys_stride = static_cast(node->src[0]->nb[1]) / element_size; // 9216 - size_t total_phys = (num_rows - 1) * phys_stride + valid_elems; // 6*9216 + 3072 = 58368 + // size_t total_phys = (num_rows - 1) * phys_stride + valid_elems; // 6*9216 + 3072 = 58368 + size_t total_phys = num_rows * phys_stride; // 7 * 9216 = 64512 ov::Shape flat_input_shape = { total_phys }; auto flat_input_param = std::make_shared(ov::element::f32, flat_input_shape); m_params.push_back(flat_input_param); @@ -87,6 +91,7 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapop); + m_op_node_name.emplace_back(src0_name, ggml_op_name(node->op)); m_output_names.push_back(node_name); size_t valid_i = static_cast(node->src[0]->ne[0]); // 96 @@ -108,6 +113,7 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapop); + m_op_node_name.emplace_back(src0_name, ggml_op_name(node->op)); m_output_names.push_back(node_name); m_continuous = true; @@ -130,6 +136,7 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapop); + m_op_node_name.emplace_back(src0_name, ggml_op_name(node->op)); m_output_names.push_back(node_name); m_continuous = false; break; @@ -161,10 +168,12 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapname) + "_" + std::to_string(node->view_offs) + "_output_" + ggml_op_name(node->op); + // std::string node_name = std::string(node->name) + "_" + std::to_string(node->view_offs); inputs[node_name] = node; outputs[node_name] = node; m_input_names.push_back(node_name); m_node_op_name[node_name] = ggml_op_name(node->op); + m_op_node_name.emplace_back(node_name, ggml_op_name(node->op)); m_output_names.push_back(node_name); break; } @@ -175,6 +184,7 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapop); + m_op_node_name.emplace_back(src0_name, ggml_op_name(node->op)); m_output_names.push_back(node_name); break; } @@ -199,8 
+209,10 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapop); + m_op_node_name.emplace_back(src0_name, ggml_op_name(node->op)); m_input_names.push_back(src1_name); m_node_op_name[src1_name] = ggml_op_name(node->op); + m_op_node_name.emplace_back(src1_name, ggml_op_name(node->op)); m_output_names.push_back(node_name); break; } @@ -216,6 +228,7 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapop); + m_op_node_name.emplace_back(src0_name, ggml_op_name(node->op)); m_output_names.push_back(node_name); if (node->src[1]) { // std::string src1_name = std::string(node->src[1]->name) + "_" + std::to_string(node->src[1]->view_offs) + "_input_" + ggml_op_name(node->src[1]->op); @@ -223,6 +236,7 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapsrc[1]->name); inputs[src1_name] = node->src[1]; m_node_op_name[src1_name] = ggml_op_name(node->op); + m_op_node_name.emplace_back(src1_name, ggml_op_name(node->op)); m_input_names.push_back(src1_name); } break; @@ -237,8 +251,10 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapsrc[1]; m_input_names.push_back(src0_name); m_node_op_name[src0_name] = ggml_op_name(node->op); + m_op_node_name.emplace_back(src0_name, ggml_op_name(node->op)); m_input_names.push_back(src1_name); m_node_op_name[src1_name] = ggml_op_name(node->op); + m_op_node_name.emplace_back(src1_name, ggml_op_name(node->op)); outputs[node_name] = node; m_output_names.push_back(node_name); if (node->src[2]) { @@ -248,6 +264,7 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapsrc[2]; m_input_names.push_back(src2_name); m_node_op_name[src2_name] = ggml_op_name(node->op); + m_op_node_name.emplace_back(src2_name, ggml_op_name(node->op)); } break; } @@ -359,8 +376,8 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor * node, struct ggml_cgraph * cgr if (m_node) { set_input_output(m_node, m_inputs, m_outputs); } else { - // for (int node_n = 0; node_n < m_cgraph->n_nodes; node_n++) { - for (int node_n = start_index; node_n <= end_index; node_n++) { + for (int node_n = 0; node_n < m_cgraph->n_nodes; node_n++) { + // for (int node_n = start_index; node_n <= end_index; node_n++) { auto cur_node = m_cgraph->nodes[node_n]; m_nodes.push_back(cur_node); // Init model input and output @@ -446,6 +463,21 @@ const std::string& GgmlOvDecoder::get_node_op_name(const std::string& name) cons return (it != m_node_op_name.end()) ? 
it->second : empty_str; } +std::string& GgmlOvDecoder::get_op_node_name(const std::string& key_name, const int index) { + if (index == -1) { + for (size_t i = 0; i < m_op_node_name.size(); ++i) { + if (m_op_node_name[i].first == key_name) { + return m_op_node_name[i].second; + } + } + } else { + return m_op_node_name[index].second; + } + + static std::string empty_string = ""; + return empty_string; // empty string +} + const std::vector>& GgmlOvDecoder::get_params() const { return m_params; } diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index 98c418dd6a..238f1d79b4 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -72,7 +72,7 @@ public: } virtual const std::string& get_node_op_name(const std::string& name) const override; - // virtual const std::string& get_node_op_info(const std::string& name) const override; + std::string& get_op_node_name(const std::string& key_name, const int index) override; virtual const std::vector>& get_params() const override; @@ -92,5 +92,6 @@ private: bool m_continuous; std::map m_node_op_name; std::vector> m_params; + std::vector> m_op_node_name; }; diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 4b25c13689..8f27bbc97d 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -14,9 +14,11 @@ std::map get_ggml_graph_input_tensors(std::shared_ptr input_tensors; auto input_names = ggml_decoder->get_input_names(); // auto node_name = ggml_decoder->get_op_name(); + size_t iter = 0; for (size_t inp = 0; inp < input_names.size(); ++inp) { auto name = input_names[inp]; - auto node_op_name = ggml_decoder->get_node_op_name(name); + std::string op_node_name = ggml_decoder->get_op_node_name(name, iter++); + // auto node_op_name = ggml_decoder->get_node_op_name(name); auto input_data = ggml_decoder->get_input_ggml_tensor(name)->data; #ifdef GGML_OPENVINO_DEBUG printf("Subgraph input %d: %g\n", inp, *(double*)(input_data)); @@ -26,12 +28,12 @@ std::map get_ggml_graph_input_tensors(std::shared_ptrget_input_type(name), {80000}, input_data); - if (node_op_name == "CONT" && ggml_decoder->check_if_continuous()) { + if (op_node_name == "CONT" && ggml_decoder->check_if_continuous()) { ov::Shape flat_shape = { ggml_decoder->get_input_shape(name).to_shape()[0] * ggml_decoder->get_input_shape(name).to_shape()[1] * ggml_decoder->get_input_shape(name).to_shape()[2] }; input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), flat_shape, input_data); - } else if ( node_op_name == "CONT" && + } else if ( op_node_name == "CONT" && !ggml_decoder->check_if_continuous() && input_shape[0] == 1) { size_t valid_elems = static_cast(ggml_decoder->get_input_shape(name).to_shape()[2]); // 3072 @@ -40,17 +42,18 @@ std::map get_ggml_graph_input_tensors(std::shared_ptr strides = ggml_decoder->get_input_stride(name); size_t phys_stride = static_cast(strides[1]) / element_size; - size_t total_phys = (num_rows - 1) * phys_stride + valid_elems; + // size_t total_phys = (num_rows - 1) * phys_stride + valid_elems; + size_t total_phys = num_rows* phys_stride; ov::Shape flat_input_shape = { total_phys }; input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), flat_input_shape, input_data); - } else if (node_op_name == "CONT") { + } else if (op_node_name == "CONT") { size_t valid_i = static_cast(ggml_decoder->get_input_shape(name).to_shape()[2]); // 96 size_t valid_j = static_cast(ggml_decoder->get_input_shape(name).to_shape()[1]); // 32 
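// Note (illustrative, based on the decoder change above): for this
// non-contiguous CONT case the decoder registers a flat f32 Parameter of
// total_valid elements, so the tensor assembled here must expose the same
// element count; with the example sizes in these comments that is
// 96 * 32 * 7 = 21504 floats.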
size_t valid_k = static_cast(ggml_decoder->get_input_shape(name).to_shape()[0]); // 7 size_t total_valid = valid_i * valid_j * valid_k; // 96 * 32 * 7 = 21504 ov::Shape flat_input_shape = { total_valid }; input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), flat_input_shape, input_data); - } else if (node_op_name == "MUL_MAT") { + } else if (op_node_name == "MUL_MAT") { ov::Shape flat_shape = { ggml_decoder->get_input_shape(name).to_shape()[0] * ggml_decoder->get_input_shape(name).to_shape()[1] * ggml_decoder->get_input_shape(name).to_shape()[2] }; @@ -144,7 +147,7 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c // Set input tensor for (size_t i = 0; i < input_names.size(); i++) { - infer_request.set_input_tensor(i, input_tensors[input_names[i]]); + infer_request.set_input_tensor(i, input_tensors[input_names[i]]); // auto input_tensor = infer_request.get_input_tensor(i); // auto input_shape = input_tensor.get_shape(); From f98d215162ea0ef18d44382fa3e2cce91c95ccae Mon Sep 17 00:00:00 2001 From: zhanmyz Date: Wed, 5 Mar 2025 23:07:22 +0800 Subject: [PATCH 033/254] Change the input parameter shape of CONT operator --- ggml/src/ggml-openvino.cpp | 228 +++++++++++++++---------------------- 1 file changed, 89 insertions(+), 139 deletions(-) diff --git a/ggml/src/ggml-openvino.cpp b/ggml/src/ggml-openvino.cpp index 8cc4de05b1..034bd698c3 100644 --- a/ggml/src/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino.cpp @@ -665,44 +665,46 @@ void ggml_backend_openvino_dup_bytes(struct ggml_tensor *dst) { // Case 1: Both tensors are contiguous if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst)) { - ov::Shape flat_shape = { static_cast(ggml_nelements(dst)) }; + ov::Shape input_shape = { + static_cast(src0->ne[0]), + static_cast(src0->ne[1]), + static_cast(src0->ne[2]), + static_cast(src0->ne[3]) + }; + size_t num_elements = 1; + for (auto d : input_shape) { + num_elements *= d; + } + ov::Shape flat_shape = { num_elements }; - // Construct the logical shape of the target tensor ov::Shape dst_shape = { static_cast(dst->ne[2]), static_cast(dst->ne[1]), static_cast(dst->ne[0]) }; - // --- Construct the OpenVINO computation graph --- - // 1. Define input parameter, type f32, shape flat_shape: [8192] - auto input_param = std::make_shared(ov::element::f32, flat_shape); + auto input_param = std::make_shared(ov::element::f32, input_shape); + + std::vector flat_shape_vec(flat_shape.begin(), flat_shape.end()); + auto flat_reshape_const = ov::op::v0::Constant::create(ov::element::i64, { flat_shape_vec.size() }, flat_shape_vec); + auto flat_reshape = std::make_shared(input_param, flat_reshape_const, false); - // 2. Create a Constant node to represent the new shape of the target Reshape(dst_shape) - // Note: dst_shape needs to be converted to an int64_t array std::vector dst_shape_vec(dst_shape.begin(), dst_shape.end()); - auto reshape_const = ov::op::v0::Constant::create(ov::element::i64, { dst_shape_vec.size() }, dst_shape_vec); + auto dst_reshape_const = ov::op::v0::Constant::create(ov::element::i64, { dst_shape_vec.size() }, dst_shape_vec); + auto final_reshape = std::make_shared(flat_reshape, dst_reshape_const, false); - // 3. Use the Reshape operator to reshape the input tensor to the target shape(dst_shape) - auto reshape_op = std::make_shared(input_param, reshape_const, false); + auto model = std::make_shared(ov::OutputVector{ final_reshape }, ov::ParameterVector{ input_param }); - // 4. 
Construct the model, whose output is the result of reshape_op - auto model = std::make_shared(ov::OutputVector{ reshape_op }, ov::ParameterVector{ input_param }); - - // --- Compile and execute --- ov::Core core; auto compiled_model = core.compile_model(model, "CPU"); auto infer_request = compiled_model.create_infer_request(); - // Construct input Tensor: directly wrap src0->data, shape is flat_shape[8192] - ov::Tensor input_tensor(ov::element::f32, flat_shape, src0->data); + ov::Tensor input_tensor(ov::element::f32, input_shape, src0->data); infer_request.set_input_tensor(0, input_tensor); - // Construct output Tensor: dst->data, shape is dst_shape: [1,1,8192] ov::Tensor output_tensor(ov::element::f32, dst_shape, dst->data); infer_request.set_output_tensor(0, output_tensor); - // Execute inference, the computation graph flattens the data of src0 and reshapes it to the shape of dst->ne, and writes it directly to dst->data infer_request.infer(); return; } @@ -715,69 +717,42 @@ void ggml_backend_openvino_dup_bytes(struct ggml_tensor *dst) { const size_t nb0 = dst->nb[0]; if (src0->type == dst->type && ne00 == dst->ne[0] && nb00 == element_size && nb0 == element_size) { - // Assume that the data type is f32 and each element is 4 bytes - // Logically, the number of valid elements per row is 3072 (src0->ne[0]), and the number of rows is 7 (src0->ne[1]) - size_t valid_elems = static_cast(src0->ne[0]); // 3072 - size_t num_rows = static_cast(src0->ne[1]); // 7 + const size_t valid_elems = static_cast(src0->ne[0]); + const size_t num_rows = static_cast(src0->ne[1]); + const size_t dim2 = static_cast(src0->ne[2]); + const size_t dim3 = static_cast(src0->ne[3]); - // Number of floats physically stored per row = nb[1] / element_size = 36864/4 = 9216 - size_t phys_stride = static_cast(src0->nb[1]) / element_size; // 9216 + size_t phys_stride = static_cast(src0->nb[1]) / element_size; + size_t total_logical = valid_elems * num_rows * dim2 * dim3; - // Total number of physical elements = (num_rows - 1)*phys_stride + valid_elems - size_t total_phys = (num_rows - 1) * phys_stride + valid_elems; // 6*9216 + 3072 = 58368 - // size_t total_phys = num_rows * phys_stride; + std::vector contiguous_data(total_logical); - // 1. Wrap src0->data into a 1D tensor with shape [58368] - ov::Shape flat_input_shape = { total_phys }; - auto flat_input_param = std::make_shared(ov::element::f32, flat_input_shape); - - // 2. Construct index tensor idx with shape [3072,7] - // For each logical position (i,j) (i in [0,3072), j in [0,7)), calculate index = j*phys_stride + i. - std::vector indices; - indices.reserve(valid_elems * num_rows); for (size_t j = 0; j < num_rows; j++) { - for (size_t i = 0; i < valid_elems; i++) { - indices.push_back(static_cast(j * phys_stride + i)); - } + const float *src_row = reinterpret_cast(src0->data) + j * phys_stride; + float *dst_row = contiguous_data.data() + j * valid_elems; + std::copy(src_row, src_row + valid_elems, dst_row); } - ov::Shape indices_shape = { valid_elems, num_rows }; // [3072,7] - auto indices_const = ov::op::v0::Constant::create(ov::element::i64, indices_shape, indices); - // 3. 
Use the Gather operator (axis=0) to collect valid data - // Note: The third parameter is axis, and a value of 0 means collecting data from the 1D input according to the index - auto axis_const = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); - auto gathered = std::make_shared(flat_input_param, indices_const, axis_const); - // The shape of gathered should be [3072,7] + ov::Shape logical_shape = { valid_elems, num_rows, dim2, dim3 }; + auto input_param = std::make_shared(ov::element::f32, logical_shape); + auto identity_const = ov::op::v0::Constant::create(ov::element::i64, + { logical_shape.size() }, + std::vector(logical_shape.begin(), logical_shape.end())); + auto identity_op = std::make_shared(input_param, identity_const, false); - // 4. Reshape gathered into a 4D tensor [3072,7,1,1] - auto reshape_const = ov::op::v0::Constant::create( - ov::element::i64, {4}, std::vector{ static_cast(valid_elems), static_cast(num_rows), 1, 1 } - ); - auto reshaped = std::make_shared(gathered, reshape_const, false); - // The reshaped shape is [3072,7,1,1] + auto model = std::make_shared(ov::OutputVector{identity_op}, + ov::ParameterVector{input_param}); - // 5. Construct the model and output it as reshaped - auto model = std::make_shared(ov::OutputVector{reshaped}, ov::ParameterVector{flat_input_param}); - - // --- Compile and execute --- ov::Core core; auto compiled_model = core.compile_model(model, "CPU"); auto infer_request = compiled_model.create_infer_request(); - // Construct input Tensor: directly wrap src0->data, shape is flat_input_shape = [58368] - ov::Tensor input_tensor(ov::element::f32, flat_input_shape, src0->data); + ov::Tensor input_tensor(ov::element::f32, logical_shape, contiguous_data.data()); infer_request.set_input_tensor(0, input_tensor); - // Construct output Tensor: dst is continuous storage, and its logical shape is [3072,7,1,1] - ov::Shape output_shape = { static_cast(dst->ne[0]), - static_cast(dst->ne[1]), - static_cast(dst->ne[2]), - static_cast(dst->ne[3])}; - ov::Tensor output_tensor(ov::element::f32, output_shape, dst->data); + ov::Tensor output_tensor(ov::element::f32, logical_shape, dst->data); infer_request.set_output_tensor(0, output_tensor); - // Execute inference. 
The computation graph uses Gather to collect the first 3072 valid elements of each row of src0, - // and reshape them to [3072,7,1,1] and write them directly to dst->data infer_request.infer(); /* for (size_t i01 = 0; i01 < ne01; ++i01) { @@ -804,74 +779,48 @@ void ggml_backend_openvino_dup_bytes(struct ggml_tensor *dst) { size_t valid_i = static_cast(src0->ne[0]); // 96 size_t valid_j = static_cast(src0->ne[1]); // 32 size_t valid_k = static_cast(src0->ne[2]); // 7 + size_t valid_l = static_cast(src0->ne[3]); // 1 - // Output the logical shape of dst: dst->ne = [3072, 7, 1, 1] - // 3072 = 32 * 96, 7 is consistent with src0->ne[2] size_t total_valid = valid_i * valid_j * valid_k; // 96 * 32 * 7 = 21504 + size_t stride_j = static_cast(src0->nb[1]) / element_size; // 672 + size_t stride_k = static_cast(src0->nb[2]) / element_size; // 96 - // Physics step length: - size_t stride_j = static_cast(src0->nb[1]) / ggml_type_size(src0->type); // 2688/4 = 672 - size_t stride_k = static_cast(src0->nb[2]) / ggml_type_size(src0->type); // 384/4 = 96 - - // Construct index array, output order: for k in [0,6], for j in [0,31], for i in [0,95]: - // desired input index = j * stride_j + k * stride_k + i - std::vector indices; - indices.reserve(total_valid); + std::vector contiguous_data(total_valid); + const float *src_data = reinterpret_cast(src0->data); for (size_t k = 0; k < valid_k; k++) { for (size_t j = 0; j < valid_j; j++) { for (size_t i = 0; i < valid_i; i++) { - int64_t idx = static_cast(j * stride_j + k * stride_k + i); - indices.push_back(idx); + size_t out_index = k * (valid_i * valid_j) + j * valid_i + i; + size_t src_index = j * stride_j + k * stride_k + i; + contiguous_data[out_index] = src_data[src_index]; } } } - // The size of indices should be 21504 - // 1. Construct input: treat src0->data as a 1D tensor. The valid range is 0~21503. - ov::Shape flat_input_shape = { total_valid }; - auto input_param = std::make_shared(ov::element::f32, flat_input_shape); + ov::Shape input_shape = { dst->src[0]->ne[0], dst->src[0]->ne[1], dst->src[0]->ne[2] }; + auto input_param = std::make_shared(ov::element::f32, input_shape); - // 2. Construct index constant: 1D tensor, shape [21504] - ov::Shape indices_shape = { total_valid }; - auto indices_const = ov::op::v0::Constant::create(ov::element::i64, indices_shape, indices); + ov::Shape target_shape = { dst->ne[0], dst->ne[1], dst->ne[2] }; + std::vector target_shape_vec = { static_cast(dst->ne[0]), + static_cast(dst->ne[1]), dst->ne[2]}; + auto reshape_const = ov::op::v0::Constant::create(ov::element::i64, {3}, target_shape_vec); + auto reshaped = std::make_shared(input_param, reshape_const, false); - // 3. Set axis=0 (collect data from 1D input) - auto axis_const = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); - - // 4. Use the Gather operator (OpenVINO v8 Gather is used here) to collect valid data - auto gathered = std::make_shared(input_param, indices_const, axis_const); - // gathered has a shape of [21504] - - // 5. Reshape gathered to [3072,7,1,1], because 3072*7 = 21504 - ov::Shape target_shape = { static_cast(dst->ne[0]), - static_cast(dst->ne[1]), - static_cast(dst->ne[2]), - static_cast(dst->ne[3])}; // [3072,7,1,1] - auto reshape_const = ov::op::v0::Constant::create(ov::element::i64, {4}, - std::vector{ static_cast(dst->ne[0]), static_cast(dst->ne[1]), 1, 1 }); - auto reshaped = std::make_shared(gathered, reshape_const, false); - - // 6. 
Construct model auto model = std::make_shared(ov::OutputVector{reshaped}, ov::ParameterVector{input_param}); - // --- Compile and execute --- ov::Core core; auto compiled_model = core.compile_model(model, "CPU"); auto infer_request = compiled_model.create_infer_request(); - // Construct input Tensor: directly wrap src0->data. Note: src0->data is regarded as a one-dimensional array according to the physical valid area, flat_input_shape: [21504] - ov::Tensor input_tensor(ov::element::f32, flat_input_shape, src0->data); + ov::Tensor input_tensor(ov::element::f32, input_shape, contiguous_data.data()); infer_request.set_input_tensor(0, input_tensor); - // Construct output Tensor: dst->data is stored continuously, with shape target_shape: [3072,7,1,1] ov::Tensor output_tensor(ov::element::f32, target_shape, dst->data); infer_request.set_output_tensor(0, output_tensor); - // Execute reasoning: The computation graph uses Gather+Reshape to collect each valid element of src0 in a predetermined order and write it directly to dst->data infer_request.infer(); return; } - std::cout << "Duplication of bytes completed successfully." << std::endl; } static void ggml_backend_openvino_transpose(ggml_tensor *dst) { @@ -1021,40 +970,40 @@ static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backe } int end_node = cgraph->n_nodes - 1; - openvino_frontend_compute(backend, cgraph, 0, end_node); + // openvino_frontend_compute(backend, cgraph, 0, end_node); // openvino_frontend_compute(backend, cgraph); // Process nodes in order - // for (int i = 0; i < cgraph->n_nodes; i++) { - // if (std::find(permute_indices.begin(), permute_indices.end(), i) != permute_indices.end()) { - // ggml_backend_openvino_permute(cgraph->nodes[i]); - // // } else if (std::find(cont_indices.begin(), cont_indices.end(), i) != cont_indices.end()) { - // // ggml_backend_openvino_dup_bytes(cgraph->nodes[i]); - // // } else if (std::find(view_indices.begin(), view_indices.end(), i) != view_indices.end()) { - // // ggml_backend_openvino_view(cgraph->nodes[i]); - // // } else if (std::find(cpy_indices.begin(), cpy_indices.end(), i) != cpy_indices.end()) { - // // ggml_backend_openvino_cpy(cgraph->nodes[i]); - // // } else if (std::find(transpose_indices.begin(), transpose_indices.end(), i) != transpose_indices.end()) { - // // ggml_backend_openvino_transpose(cgraph->nodes[i]); - // // } else if (std::find(reshape_indices.begin(), reshape_indices.end(), i) != reshape_indices.end()) { - // // ggml_backend_openvino_reshape(cgraph->nodes[i]); - // // } else if (std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) != mul_mat_indices.end()) { - // // ggml_backend_openvino_mul_mat(cgraph->nodes[i]); - // } else { - // // Process a range of nodes with openvino_frontend_compute - // int start_index = i; - // while (i < cgraph->n_nodes - // // && std::find(view_indices.begin(), view_indices.end(), i) == view_indices.end() - // // && std::find(cpy_indices.begin(), cpy_indices.end(), i) == cpy_indices.end() - // // && std::find(cont_indices.begin(), cont_indices.end(), i) == cont_indices.end() - // // && std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) == mul_mat_indices.end() - // ) { - // i++; - // } - // if (start_index < i) { - // openvino_frontend_compute(backend, cgraph, start_index, --i); - // } - // } - // } + for (int i = 0; i < cgraph->n_nodes; i++) { + if (std::find(permute_indices.begin(), permute_indices.end(), i) != permute_indices.end()) { + ggml_backend_openvino_permute(cgraph->nodes[i]); + } else if 
(std::find(cont_indices.begin(), cont_indices.end(), i) != cont_indices.end()) { + ggml_backend_openvino_dup_bytes(cgraph->nodes[i]); + } else if (std::find(view_indices.begin(), view_indices.end(), i) != view_indices.end()) { + ggml_backend_openvino_view(cgraph->nodes[i]); + // } else if (std::find(cpy_indices.begin(), cpy_indices.end(), i) != cpy_indices.end()) { + // ggml_backend_openvino_cpy(cgraph->nodes[i]); + // } else if (std::find(transpose_indices.begin(), transpose_indices.end(), i) != transpose_indices.end()) { + // ggml_backend_openvino_transpose(cgraph->nodes[i]); + } else if (std::find(reshape_indices.begin(), reshape_indices.end(), i) != reshape_indices.end()) { + ggml_backend_openvino_reshape(cgraph->nodes[i]); + // } else if (std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) != mul_mat_indices.end()) { + // ggml_backend_openvino_mul_mat(cgraph->nodes[i]); + } else { + // Process a range of nodes with openvino_frontend_compute + int start_index = i; + while (i < cgraph->n_nodes + && std::find(view_indices.begin(), view_indices.end(), i) == view_indices.end() + // && std::find(cpy_indices.begin(), cpy_indices.end(), i) == cpy_indices.end() + && std::find(cont_indices.begin(), cont_indices.end(), i) == cont_indices.end() + // && std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) == mul_mat_indices.end() + ) { + i++; + } + if (start_index < i) { + openvino_frontend_compute(backend, cgraph, start_index, --i); + } + } + } return GGML_STATUS_SUCCESS; @@ -1522,3 +1471,4 @@ GGML_API ggml_backend_reg_t ggml_backend_openvino_reg(void) { return &reg; } + From f37fa21a5cf7b6196a0384b052cee22fb28d2a22 Mon Sep 17 00:00:00 2001 From: zhanmyz Date: Thu, 6 Mar 2025 01:38:01 +0800 Subject: [PATCH 034/254] Change the input and output node shape of MUL_MAT operator --- ggml/src/ggml-openvino.cpp | 201 ++++++++++++++++++++----------- 1 file changed, 111 insertions(+), 90 deletions(-) diff --git a/ggml/src/ggml-openvino.cpp b/ggml/src/ggml-openvino.cpp index 034bd698c3..afd616a338 100644 --- a/ggml/src/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino.cpp @@ -458,68 +458,72 @@ void ggml_backend_openvino_mul_mat(struct ggml_tensor * dst) { const ggml_tensor * src1 = dst->src[1]; // src1 type F32 if(!ggml_is_contiguous(src1) || dst->src[1]->ne[0] * dst->src[1]->nb[0] != dst->src[1]->nb[1]) { - int valid_cols_src0 = dst->src[0]->ne[0]; - int num_rows_src0 = dst->src[0]->ne[1]; - int batch_src0 = dst->src[0]->ne[2]; - int valid_cols_src1 = dst->src[1]->ne[0]; - int num_rows_src1 = dst->src[1]->ne[1]; - int batch_src1 = dst->src[1]->ne[2]; - int row_stride_src0 = dst->src[0]->nb[1] / dst->src[0]->nb[0]; - int batch_stride_src0 = dst->src[0]->nb[2] / dst->src[0]->nb[0]; + int valid_cols_src0 = src0->ne[0]; // 96 + int num_rows_src0 = src0->ne[1]; // 32 + int batch_src0 = src0->ne[2]; // 32 - int row_stride_src1 = dst->src[1]->nb[1] / dst->src[1]->nb[0]; - int batch_stride_src1 = dst->src[1]->nb[2] / dst->src[1]->nb[0]; + int valid_cols_src1 = src1->ne[0]; // 96 + int num_rows_src1 = src1->ne[1]; // 7 + int batch_src1 = src1->ne[2]; // 32 + + // For src0: row_stride = nb[1] / nb[0] + int row_stride_src0 = src0->nb[1] / src0->nb[0]; // 6144 / 2 = 3072 + int batch_stride_src0 = src0->nb[2] / src0->nb[0]; // 192 / 2 = 96 + + // For src1: row_stride = nb[1] / nb[0] + int row_stride_src1 = src1->nb[1] / src1->nb[0]; // 12288 / 4 = 3072 + int batch_stride_src1 = src1->nb[2] / src1->nb[0]; // 384 / 4 = 96 std::vector indices_src0 = build_indices(valid_cols_src0, num_rows_src0, batch_src0,
row_stride_src0, batch_stride_src0); std::vector indices_src1 = build_indices(valid_cols_src1, num_rows_src1, batch_src1, row_stride_src1, batch_stride_src1); - // Total number of elements size_t total_src0 = indices_src0.size(); // = 96 * 32 * 32 size_t total_src1 = indices_src1.size(); // = 96 * 7 * 32 - // Treat src0->data and src1->data as 1D tensors - // Note: The total length of physical data should be enough to cover the last valid element index + 1. - // flat shapes: + ov::Shape orig_shape_src0 = { static_cast(src0->ne[0]), + static_cast(src0->ne[1]), + static_cast(src0->ne[2]), + static_cast(src0->ne[3]) }; + ov::Shape orig_shape_src1 = { static_cast(src1->ne[0]), + static_cast(src1->ne[1]), + static_cast(src1->ne[2]), + static_cast(src1->ne[3]) }; + + auto param_src0 = std::make_shared(ov::element::f16, orig_shape_src0); + auto param_src1 = std::make_shared(ov::element::f32, orig_shape_src1); + ov::Shape flat_shape_src0 = { total_src0 }; ov::Shape flat_shape_src1 = { total_src1 }; - // Same as above - // ov::Shape flat_shape_src0 = { ggml_nelements(src0) }; - // ov::Shape flat_shape_src1 = { ggml_nelements(src1) }; - // Create a Parameter node for collecting non-continuous data - auto param_src0 = std::make_shared(ov::element::f16, flat_shape_src0); - auto param_src1 = std::make_shared(ov::element::f32, flat_shape_src1); + auto flatten_src0 = std::make_shared( + param_src0, + ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector{ static_cast(total_src0) }), + false); + auto flatten_src1 = std::make_shared( + param_src1, + ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector{ static_cast(total_src1) }), + false); - // Create an index Constant node auto indices_const_src0 = ov::op::v0::Constant::create(ov::element::i64, flat_shape_src0, indices_src0); auto indices_const_src1 = ov::op::v0::Constant::create(ov::element::i64, flat_shape_src1, indices_src1); - - // Use the Gather operator to collect valid data - // axis = 0 auto axis_const = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); - auto gathered_src0 = std::make_shared(param_src0, indices_const_src0, axis_const); - auto gathered_src1 = std::make_shared(param_src1, indices_const_src1, axis_const); - // Reshape to batched form: - // For src0: valid matrix size for each batch [num_rows_src0, valid_cols_src0] = [32,96], total batches = 32, - // Therefore, reshape to 3D Tensor: shape = [32, 32, 96] where first dimension is batch. + auto gathered_src0 = std::make_shared(flatten_src0, indices_const_src0, axis_const); + auto gathered_src1 = std::make_shared(flatten_src1, indices_const_src1, axis_const); + std::vector shape_src0_cont = { batch_src0, num_rows_src0, valid_cols_src0 }; auto reshape_src0 = std::make_shared( gathered_src0, ov::op::v0::Constant::create(ov::element::i64, { shape_src0_cont.size() }, shape_src0_cont), false); - // For src1: valid matrix size for each batch [num_rows_src1, valid_cols_src1] = [7,96], batch = 32, - // Reshape to 3D Tensor: shape = [32, 7, 96]. 
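// Shape walk-through with the example sizes in the comments: gathered src0
// reshapes to [32, 32, 96] and, after the {0, 2, 1} transpose, becomes
// [32, 96, 32]; gathered src1 reshapes to [32, 7, 96]; the batched
// MatMul(src1, src0_transposed) therefore yields [32, 7, 32] per batch,
// matching the dst logical extents ne = [32, 7, 32].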
+ std::vector shape_src1_cont = { batch_src1, num_rows_src1, valid_cols_src1 }; auto reshape_src1 = std::make_shared( gathered_src1, ov::op::v0::Constant::create(ov::element::i64, { shape_src1_cont.size() }, shape_src1_cont), false); - // For src0, first Convert from F16 to F32 auto src0_f32 = std::make_shared(reshape_src0, ov::element::f32); - - // Use Batched Transpose: swap the last two dimensions, dimension order [0, 2, 1] auto transpose_order = ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector{0, 2, 1}); auto src0_transposed = std::make_shared(src0_f32, transpose_order); @@ -527,89 +531,105 @@ void ggml_backend_openvino_mul_mat(struct ggml_tensor * dst) { auto B = reshape_src1; auto batched_matmul = std::make_shared(B, A, false, false); - // batched_matmul output: shape = [32,7,32] + auto model = std::make_shared(ov::NodeVector{ batched_matmul }, + ov::ParameterVector{ param_src0, param_src1 }); - auto model = std::make_shared(ov::NodeVector{ batched_matmul }, ov::ParameterVector{param_src0, param_src1}); + ov::Tensor tensor_src0{ ov::element::f16, orig_shape_src0, src0->data }; + ov::Tensor tensor_src1{ ov::element::f32, orig_shape_src1, src1->data }; + ov::Shape output_shape = { static_cast(dst->ne[0]), + static_cast(dst->ne[1]), + static_cast(dst->ne[2]) }; + ov::Tensor tensor_dst(ov::element::f32, output_shape, dst->data); ov::Core core; auto compiled_model = core.compile_model(model, "CPU"); auto infer_request = compiled_model.create_infer_request(); - - // Construct input Tensors: treat src0->data and src1->data as 1D flat data respectively - ov::Tensor tensor_src0(ov::element::f16, flat_shape_src0, src0->data); - ov::Tensor tensor_src1(ov::element::f32, flat_shape_src1, src1->data); infer_request.set_input_tensor(0, tensor_src0); infer_request.set_input_tensor(1, tensor_src1); - - ov::Tensor tensor_dst(ov::element::f32, { dst->ne[0], dst->ne[1], dst->ne[2]}, dst->data); infer_request.set_output_tensor(0, tensor_dst); - infer_request.infer(); return ; } - // Valid shape + int rank = 0; + if (dst->ne[2] == 1 && dst->ne[3] == 1) { + rank = 2; + } else if (dst->ne[3] == 1) { + rank = 3; + } else { + throw std::runtime_error("Only rank 2 or rank 3 are supported in this implementation."); + } + std::vector eff_shape_src0 = get_effective_shape(src0); std::vector eff_shape_src1 = get_effective_shape(src1); std::vector eff_shape_dst = get_effective_shape(dst); - // Determine whether it is batched (effective rank==3) or two-dimensional (rank==2) or one-dimensional (rank==1) - int rank = static_cast(eff_shape_dst.size()); - if (rank != 1 && rank != 2 && rank != 3) - throw std::runtime_error("Only rank 1, 2 or 3 supported"); + ov::Shape orig_shape_src0 = { static_cast(src0->ne[0]), + static_cast(src0->ne[1]), + static_cast(src0->ne[2]), + static_cast(src0->ne[3]) }; + ov::Shape orig_shape_src1 = { static_cast(src1->ne[0]), + static_cast(src1->ne[1]), + static_cast(src1->ne[2]), + static_cast(src1->ne[3]) }; - // Total number of flattened elements - size_t total_src0 = 1; for (auto d : eff_shape_src0) total_src0 *= d; - size_t total_src1 = 1; for (auto d : eff_shape_src1) total_src1 *= d; - - ov::Shape flat_shape_src0 = { total_src0 }; - ov::Shape flat_shape_src1 = { total_src1 }; - // Same as above - // ov::Shape flat_shape_src0 = { ggml_nelements(src0) }; - // ov::Shape flat_shape_src1 = { ggml_nelements(src1) }; - - auto param_flat_src0 = std::make_shared(ov::element::f16, flat_shape_src0); - auto param_flat_src1 = std::make_shared(ov::element::f32, flat_shape_src1); + auto 
param_src0 = std::make_shared(ov::element::f16, orig_shape_src0); + auto param_src1 = std::make_shared(ov::element::f32, orig_shape_src1); auto reshape_src0 = std::make_shared( - param_flat_src0, + param_src0, ov::op::v0::Constant::create(ov::element::i64, { eff_shape_src0.size() }, eff_shape_src0), false); auto reshape_src1 = std::make_shared( - param_flat_src1, + param_src1, ov::op::v0::Constant::create(ov::element::i64, { eff_shape_src1.size() }, eff_shape_src1), false); - // Convert src0: F16 -> F32 auto src0_f32 = std::make_shared(reshape_src0, ov::element::f32); - // Transpose src0_f32: - // For the 2D case, the shape of reshape_src0 is [3072,9216], and after transposition, it is [9216,3072]. - // For the batched case, assuming the shape is [M, K, Batch], batch-wise transposition is required: use order [0, 2, 1]. ov::Output A_for_mul; - if (rank == 1) { - auto trans_order = ov::op::v0::Constant::create(ov::element::i64, {2}, std::vector{1, 0}); + if (rank == 2) { + auto trans_order = ov::op::v0::Constant::create(ov::element::i64, { 2 }, std::vector{1, 0}); A_for_mul = std::make_shared(src0_f32, trans_order); - } else if (rank == 2) { - auto trans_order = ov::op::v0::Constant::create(ov::element::i64, {2}, std::vector{1, 0}); - A_for_mul = std::make_shared(src0_f32, trans_order); - } else { // rank == 3 - auto trans_order = ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector{0, 2, 1}); + } else if (rank == 3) { + auto trans_order = ov::op::v0::Constant::create(ov::element::i64, { 3 }, std::vector{0, 2, 1}); A_for_mul = std::make_shared(src0_f32, trans_order); + } else { + A_for_mul = src0_f32; } + auto matmul = std::make_shared(reshape_src1, A_for_mul, false, false); + + auto matmul_output_shape = matmul->get_output_shape(0); + std::vector final_output_shape; + if (matmul_output_shape.size() == 1) { + final_output_shape = { 1, 1, static_cast(matmul_output_shape[0]) }; + } else if (matmul_output_shape.size() == 2) { + final_output_shape = { 1, static_cast(matmul_output_shape[0]), static_cast(matmul_output_shape[1]) }; + } else { + final_output_shape = { static_cast(matmul_output_shape[0]), static_cast(matmul_output_shape[1]), static_cast(matmul_output_shape[2]) }; + } + + auto reshape_output = std::make_shared( + matmul, + ov::op::v0::Constant::create(ov::element::i64, {3}, final_output_shape), + false + ); + + auto model = std::make_shared(ov::NodeVector{ reshape_output }, + ov::ParameterVector{ param_src0, param_src1 }); + + ov::Tensor tensor_src0{ ov::element::f16, orig_shape_src0, (void *)src0->data }; + ov::Tensor tensor_src1{ ov::element::f32, orig_shape_src1, (void *)src1->data }; + + ov::Shape output_shape = { static_cast(dst->ne[2]), + static_cast(dst->ne[1]), + static_cast(dst->ne[0]) }; + ov::Tensor tensor_dst(ov::element::f32, output_shape, dst->data); + ov::Core core; - ov::Tensor tensor_src0{ov::element::f16, flat_shape_src0, (void *)src0->data}; - ov::Tensor tensor_src1{ov::element::f32, flat_shape_src1, (void *)src1->data}; - ov::Tensor tensor_dst(ov::element::f32, ov::Shape(eff_shape_dst.begin(), eff_shape_dst.end()), dst->data); - - std::shared_ptr matmul = std::make_shared(reshape_src1, A_for_mul, false, false); - auto model = std::make_shared(ov::NodeVector{matmul}, ov::ParameterVector{param_flat_src0, param_flat_src1}); - // ov::save_model(model, "/home/user/zhan/merge_git_commits/llama.cpp-ov/002_backend_mulmat_model.xml"); - auto compiled_model = core.compile_model(model, "CPU"); auto infer_request = compiled_model.create_infer_request(); - 
infer_request.set_input_tensor(0, tensor_src0); infer_request.set_input_tensor(1, tensor_src1); infer_request.set_output_tensor(0, tensor_dst); @@ -980,22 +1000,22 @@ static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backe ggml_backend_openvino_dup_bytes(cgraph->nodes[i]); } else if (std::find(view_indices.begin(), view_indices.end(), i) != view_indices.end()) { ggml_backend_openvino_view(cgraph->nodes[i]); - // } else if (std::find(cpy_indices.begin(), cpy_indices.end(), i) != cpy_indices.end()) { - // ggml_backend_openvino_cpy(cgraph->nodes[i]); - // } else if (std::find(transpose_indices.begin(), transpose_indices.end(), i) != transpose_indices.end()) { - // ggml_backend_openvino_transpose(cgraph->nodes[i]); + } else if (std::find(cpy_indices.begin(), cpy_indices.end(), i) != cpy_indices.end()) { + ggml_backend_openvino_cpy(cgraph->nodes[i]); + } else if (std::find(transpose_indices.begin(), transpose_indices.end(), i) != transpose_indices.end()) { + ggml_backend_openvino_transpose(cgraph->nodes[i]); } else if (std::find(reshape_indices.begin(), reshape_indices.end(), i) != reshape_indices.end()) { ggml_backend_openvino_reshape(cgraph->nodes[i]); - // } else if (std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) != mul_mat_indices.end()) { - // ggml_backend_openvino_mul_mat(cgraph->nodes[i]); + } else if (std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) != mul_mat_indices.end()) { + ggml_backend_openvino_mul_mat(cgraph->nodes[i]); } else { // Process a range of nodes with openvino_frontend_compute int start_index = i; while (i < cgraph->n_nodes && std::find(view_indices.begin(), view_indices.end(), i) == view_indices.end() - // && std::find(cpy_indices.begin(), cpy_indices.end(), i) == cpy_indices.end() + && std::find(cpy_indices.begin(), cpy_indices.end(), i) == cpy_indices.end() && std::find(cont_indices.begin(), cont_indices.end(), i) == cont_indices.end() - // && std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) == mul_mat_indices.end() + && std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) == mul_mat_indices.end() ) { i++; } @@ -1228,6 +1248,7 @@ static const std::set& openvino_ops = []() -> const std::set Date: Thu, 6 Mar 2025 01:49:14 +0800 Subject: [PATCH 035/254] Change the input and output node shape of MUL_MAT operator --- ggml/src/ggml-openvino.cpp | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-openvino.cpp b/ggml/src/ggml-openvino.cpp index afd616a338..c45f778e80 100644 --- a/ggml/src/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino.cpp @@ -531,14 +531,25 @@ void ggml_backend_openvino_mul_mat(struct ggml_tensor * dst) { auto B = reshape_src1; auto batched_matmul = std::make_shared(B, A, false, false); - auto model = std::make_shared(ov::NodeVector{ batched_matmul }, + + std::vector final_output_shape = {static_cast(dst->ne[2]), + static_cast(dst->ne[1]), + static_cast(dst->ne[0])}; + + auto reshape_output = std::make_shared( + batched_matmul, + ov::op::v0::Constant::create(ov::element::i64, {3}, final_output_shape), + false + ); + + auto model = std::make_shared(ov::NodeVector{ reshape_output }, ov::ParameterVector{ param_src0, param_src1 }); ov::Tensor tensor_src0{ ov::element::f16, orig_shape_src0, src0->data }; ov::Tensor tensor_src1{ ov::element::f32, orig_shape_src1, src1->data }; - ov::Shape output_shape = { static_cast(dst->ne[0]), + ov::Shape output_shape = { static_cast(dst->ne[2]), static_cast(dst->ne[1]), +
static_cast(dst->ne[0]) }; ov::Tensor tensor_dst(ov::element::f32, output_shape, dst->data); ov::Core core; From d05c458421fba5471da46fed9f8ca8522f9e2fd6 Mon Sep 17 00:00:00 2001 From: zhanmyz Date: Thu, 6 Mar 2025 10:22:20 +0800 Subject: [PATCH 036/254] change CONT and MULMAT input node shape --- ggml/src/ggml-openvino.cpp | 32 +++++++++++++++----------------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/ggml/src/ggml-openvino.cpp b/ggml/src/ggml-openvino.cpp index c45f778e80..109003d686 100644 --- a/ggml/src/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino.cpp @@ -482,12 +482,10 @@ void ggml_backend_openvino_mul_mat(struct ggml_tensor * dst) { ov::Shape orig_shape_src0 = { static_cast(src0->ne[0]), static_cast(src0->ne[1]), - static_cast(src0->ne[2]), - static_cast(src0->ne[3]) }; + static_cast(src0->ne[2])}; ov::Shape orig_shape_src1 = { static_cast(src1->ne[0]), static_cast(src1->ne[1]), - static_cast(src1->ne[2]), - static_cast(src1->ne[3]) }; + static_cast(src1->ne[2])}; auto param_src0 = std::make_shared(ov::element::f16, orig_shape_src0); auto param_src1 = std::make_shared(ov::element::f32, orig_shape_src1); @@ -577,13 +575,10 @@ void ggml_backend_openvino_mul_mat(struct ggml_tensor * dst) { ov::Shape orig_shape_src0 = { static_cast(src0->ne[0]), static_cast(src0->ne[1]), - static_cast(src0->ne[2]), - static_cast(src0->ne[3]) }; + static_cast(src0->ne[2])}; ov::Shape orig_shape_src1 = { static_cast(src1->ne[0]), static_cast(src1->ne[1]), - static_cast(src1->ne[2]), - static_cast(src1->ne[3]) }; - + static_cast(src1->ne[2])}; auto param_src0 = std::make_shared(ov::element::f16, orig_shape_src0); auto param_src1 = std::make_shared(ov::element::f32, orig_shape_src1); @@ -697,10 +692,9 @@ void ggml_backend_openvino_dup_bytes(struct ggml_tensor *dst) { // Case 1: Both tensors are contiguous if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst)) { ov::Shape input_shape = { - static_cast(src0->ne[0]), - static_cast(src0->ne[1]), static_cast(src0->ne[2]), - static_cast(src0->ne[3]) + static_cast(src0->ne[1]), + static_cast(src0->ne[0]) }; size_t num_elements = 1; for (auto d : input_shape) { @@ -764,7 +758,7 @@ void ggml_backend_openvino_dup_bytes(struct ggml_tensor *dst) { std::copy(src_row, src_row + valid_elems, dst_row); } - ov::Shape logical_shape = { valid_elems, num_rows, dim2, dim3 }; + ov::Shape logical_shape = { dim2, num_rows, valid_elems}; auto input_param = std::make_shared(ov::element::f32, logical_shape); auto identity_const = ov::op::v0::Constant::create(ov::element::i64, { logical_shape.size() }, @@ -828,12 +822,16 @@ void ggml_backend_openvino_dup_bytes(struct ggml_tensor *dst) { } } - ov::Shape input_shape = { dst->src[0]->ne[0], dst->src[0]->ne[1], dst->src[0]->ne[2] }; + // ov::Shape input_shape = { dst->src[0]->ne[0], dst->src[0]->ne[1], dst->src[0]->ne[2] }; + ov::Shape input_shape = { dst->src[0]->ne[2], dst->src[0]->ne[1], dst->src[0]->ne[0]}; auto input_param = std::make_shared(ov::element::f32, input_shape); - ov::Shape target_shape = { dst->ne[0], dst->ne[1], dst->ne[2] }; - std::vector target_shape_vec = { static_cast(dst->ne[0]), - static_cast(dst->ne[1]), dst->ne[2]}; + // ov::Shape target_shape = { dst->ne[0], dst->ne[1], dst->ne[2] }; + // std::vector target_shape_vec = { static_cast(dst->ne[0]), + // static_cast(dst->ne[1]), dst->ne[2]}; + ov::Shape target_shape = { dst->ne[2], dst->ne[1], dst->ne[0] }; + std::vector target_shape_vec = { static_cast(dst->ne[2]), + static_cast(dst->ne[1]), dst->ne[0]}; auto reshape_const = 
ov::op::v0::Constant::create(ov::element::i64, {3}, target_shape_vec); auto reshaped = std::make_shared(input_param, reshape_const, false); From e08a7fda334f128fb684bea7aeedf84e9a38a433 Mon Sep 17 00:00:00 2001 From: zhanmyz Date: Thu, 6 Mar 2025 13:51:34 +0800 Subject: [PATCH 037/254] All adjacent ops can be converted, but the calculation result is wrong and needs debugging --- ggml/src/ggml-openvino.cpp | 87 ++++++++++++------------- ggml/src/ggml-openvino/ggml-decoder.cpp | 50 +++++++------- ggml/src/ggml-openvino/utils.cpp | 74 +++++++++++++-------- 3 files changed, 114 insertions(+), 97 deletions(-) diff --git a/ggml/src/ggml-openvino.cpp b/ggml/src/ggml-openvino.cpp index 109003d686..230edded11 100644 --- a/ggml/src/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino.cpp @@ -480,12 +480,12 @@ void ggml_backend_openvino_mul_mat(struct ggml_tensor * dst) { size_t total_src0 = indices_src0.size(); // = 96 * 32 * 32 size_t total_src1 = indices_src1.size(); // = 96 * 7 * 32 - ov::Shape orig_shape_src0 = { static_cast(src0->ne[0]), + ov::Shape orig_shape_src0 = { static_cast(src0->ne[2]), static_cast(src0->ne[1]), - static_cast(src0->ne[2])}; + static_cast(src0->ne[0])}; - ov::Shape orig_shape_src1 = { static_cast(src1->ne[0]), + ov::Shape orig_shape_src1 = { static_cast(src1->ne[2]), static_cast(src1->ne[1]), - static_cast(src1->ne[2])}; + static_cast(src1->ne[0])}; auto param_src0 = std::make_shared(ov::element::f16, orig_shape_src0); auto param_src1 = std::make_shared(ov::element::f32, orig_shape_src1); @@ -573,12 +573,12 @@ void ggml_backend_openvino_mul_mat(struct ggml_tensor * dst) { std::vector eff_shape_src1 = get_effective_shape(src1); std::vector eff_shape_dst = get_effective_shape(dst); - ov::Shape orig_shape_src0 = { static_cast(src0->ne[0]), - static_cast(src0->ne[1]), - static_cast(src0->ne[2])}; - ov::Shape orig_shape_src1 = { static_cast(src1->ne[0]), - static_cast(src1->ne[1]), - static_cast(src1->ne[2])}; + ov::Shape orig_shape_src0 = { static_cast(src0->ne[2]), + static_cast(src0->ne[1]), + static_cast(src0->ne[0])}; + ov::Shape orig_shape_src1 = { static_cast(src1->ne[2]), + static_cast(src1->ne[1]), + static_cast(src1->ne[0])}; auto param_src0 = std::make_shared(ov::element::f16, orig_shape_src0); auto param_src1 = std::make_shared(ov::element::f32, orig_shape_src1); @@ -999,40 +999,40 @@ static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backe } int end_node = cgraph->n_nodes - 1; - // openvino_frontend_compute(backend, cgraph, 0, end_node); + openvino_frontend_compute(backend, cgraph, 0, end_node); // openvino_frontend_compute(backend, cgraph); // Process nodes in order - for (int i = 0; i < cgraph->n_nodes; i++) { - if (std::find(permute_indices.begin(), permute_indices.end(), i) != permute_indices.end()) { - ggml_backend_openvino_permute(cgraph->nodes[i]); - } else if (std::find(cont_indices.begin(), cont_indices.end(), i) != cont_indices.end()) { - ggml_backend_openvino_dup_bytes(cgraph->nodes[i]); - } else if (std::find(view_indices.begin(), view_indices.end(), i) != view_indices.end()) { - ggml_backend_openvino_view(cgraph->nodes[i]); - } else if (std::find(cpy_indices.begin(), cpy_indices.end(), i) != cpy_indices.end()) { - ggml_backend_openvino_cpy(cgraph->nodes[i]); - } else if (std::find(transpose_indices.begin(), transpose_indices.end(), i) != transpose_indices.end()) { - ggml_backend_openvino_transpose(cgraph->nodes[i]); - } else if (std::find(reshape_indices.begin(), reshape_indices.end(), i) != reshape_indices.end()) {
ggml_backend_openvino_reshape(cgraph->nodes[i]); - } else if (std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) != mul_mat_indices.end()) { - ggml_backend_openvino_mul_mat(cgraph->nodes[i]); - } else { - // Process a range of nodes with openvino_frontend_compute - int start_index = i; - while (i < cgraph->n_nodes - && std::find(view_indices.begin(), view_indices.end(), i) == view_indices.end() - && std::find(cpy_indices.begin(), cpy_indices.end(), i) == cpy_indices.end() - && std::find(cont_indices.begin(), cont_indices.end(), i) == cont_indices.end() - && std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) == mul_mat_indices.end() - ) { - i++; - } - if (start_index < i) { - openvino_frontend_compute(backend, cgraph, start_index, --i); - } - } - } + // for (int i = 0; i < cgraph->n_nodes; i++) { + // if (std::find(permute_indices.begin(), permute_indices.end(), i) != permute_indices.end()) { + // ggml_backend_openvino_permute(cgraph->nodes[i]); + // } else if (std::find(cont_indices.begin(), cont_indices.end(), i) != cont_indices.end()) { + // ggml_backend_openvino_dup_bytes(cgraph->nodes[i]); + // } else if (std::find(view_indices.begin(), view_indices.end(), i) != view_indices.end()) { + // ggml_backend_openvino_view(cgraph->nodes[i]); + // } else if (std::find(cpy_indices.begin(), cpy_indices.end(), i) != cpy_indices.end()) { + // ggml_backend_openvino_cpy(cgraph->nodes[i]); + // } else if (std::find(transpose_indices.begin(), transpose_indices.end(), i) != transpose_indices.end()) { + // ggml_backend_openvino_transpose(cgraph->nodes[i]); + // } else if (std::find(reshape_indices.begin(), reshape_indices.end(), i) != reshape_indices.end()) { + // ggml_backend_openvino_reshape(cgraph->nodes[i]); + // } else if (std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) != mul_mat_indices.end()) { + // ggml_backend_openvino_mul_mat(cgraph->nodes[i]); + // } else { + // // Process a range of nodes with openvino_frontend_compute + // int start_index = i; + // while (i < cgraph->n_nodes + // && std::find(view_indices.begin(), view_indices.end(), i) == view_indices.end() + // && std::find(cpy_indices.begin(), cpy_indices.end(), i) == cpy_indices.end() + // && std::find(cont_indices.begin(), cont_indices.end(), i) == cont_indices.end() + // && std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) == mul_mat_indices.end() + // ) { + // i++; + // } + // if (start_index < i) { + // openvino_frontend_compute(backend, cgraph, start_index, --i); + // } + // } + // } return GGML_STATUS_SUCCESS; @@ -1257,14 +1257,13 @@ static const std::set& openvino_ops = []() -> const std::set(ggml_nelements(node)) }; - auto input_param = std::make_shared(ov::element::f32, flat_shape); - m_params.push_back(input_param); + // ov::Shape flat_shape = { static_cast(ggml_nelements(node)) }; + // auto input_param = std::make_shared(ov::element::f32, flat_shape); + // m_params.push_back(input_param); break; } @@ -72,15 +72,15 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapop)); m_output_names.push_back(node_name); - const size_t element_size = ggml_type_size(node->src[0]->type); - size_t valid_elems = static_cast(node->src[0]->ne[0]); // 3072 - size_t num_rows = static_cast(node->src[0]->ne[1]); // 7 - size_t phys_stride = static_cast(node->src[0]->nb[1]) / element_size; // 9216 - // size_t total_phys = (num_rows - 1) * phys_stride + valid_elems; // 6*9216 + 3072 = 58368 - size_t total_phys = num_rows * phys_stride; // 7 * 9216 = 64512 - ov::Shape flat_input_shape = { 
total_phys }; - auto flat_input_param = std::make_shared(ov::element::f32, flat_input_shape); - m_params.push_back(flat_input_param); + // const size_t element_size = ggml_type_size(node->src[0]->type); + // size_t valid_elems = static_cast(node->src[0]->ne[0]); // 3072 + // size_t num_rows = static_cast(node->src[0]->ne[1]); // 7 + // size_t phys_stride = static_cast(node->src[0]->nb[1]) / element_size; // 9216 + // // size_t total_phys = (num_rows - 1) * phys_stride + valid_elems; // 6*9216 + 3072 = 58368 + // size_t total_phys = num_rows * phys_stride; // 7 * 9216 = 64512 + // ov::Shape flat_input_shape = { total_phys }; + // auto flat_input_param = std::make_shared(ov::element::f32, flat_input_shape); + // m_params.push_back(flat_input_param); m_continuous = false; break; @@ -94,13 +94,13 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapop)); m_output_names.push_back(node_name); - size_t valid_i = static_cast(node->src[0]->ne[0]); // 96 - size_t valid_j = static_cast(node->src[0]->ne[1]); // 32 - size_t valid_k = static_cast(node->src[0]->ne[2]); // 7 - size_t total_valid = valid_i * valid_j * valid_k; // 96 * 32 * 7 = 21504 - ov::Shape flat_input_shape = { total_valid }; - auto input_param = std::make_shared(ov::element::f32, flat_input_shape); - m_params.push_back(input_param); + // size_t valid_i = static_cast(node->src[0]->ne[0]); // 96 + // size_t valid_j = static_cast(node->src[0]->ne[1]); // 32 + // size_t valid_k = static_cast(node->src[0]->ne[2]); // 7 + // size_t total_valid = valid_i * valid_j * valid_k; // 96 * 32 * 7 = 21504 + // ov::Shape flat_input_shape = { total_valid }; + // auto input_param = std::make_shared(ov::element::f32, flat_input_shape); + // m_params.push_back(input_param); m_continuous = false; break; @@ -190,12 +190,12 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapsrc[0]->ne[0]*node->src[0]->ne[1]*node->src[0]->ne[2] }; - ov::Shape flat_shape_src1 = { node->src[1]->ne[0]*node->src[1]->ne[1]*node->src[1]->ne[2] }; - auto param_src0 = std::make_shared(ov::element::f16, flat_shape_src0); - auto param_src1 = std::make_shared(ov::element::f32, flat_shape_src1); - m_params.push_back(param_src0); - m_params.push_back(param_src1); + // ov::Shape flat_shape_src0 = { node->src[0]->ne[0]*node->src[0]->ne[1]*node->src[0]->ne[2] }; + // ov::Shape flat_shape_src1 = { node->src[1]->ne[0]*node->src[1]->ne[1]*node->src[1]->ne[2] }; + // auto param_src0 = std::make_shared(ov::element::f16, flat_shape_src0); + // auto param_src1 = std::make_shared(ov::element::f32, flat_shape_src1); + // m_params.push_back(param_src0); + // m_params.push_back(param_src1); if (!ggml_is_contiguous(node->src[1]) || node->src[1]->ne[0] * node->src[1]->nb[0] != node->src[1]->nb[1]) { m_continuous = false; } else { diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 8f27bbc97d..a0234ebd30 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -14,12 +14,15 @@ std::map get_ggml_graph_input_tensors(std::shared_ptr input_tensors; auto input_names = ggml_decoder->get_input_names(); // auto node_name = ggml_decoder->get_op_name(); - size_t iter = 0; + size_t op_iter = 0; for (size_t inp = 0; inp < input_names.size(); ++inp) { auto name = input_names[inp]; - std::string op_node_name = ggml_decoder->get_op_node_name(name, iter++); + std::string op_node_name = ggml_decoder->get_op_node_name(name, op_iter++); // auto node_op_name = ggml_decoder->get_node_op_name(name); + ov::element::Type input_type = 
ggml_decoder->get_input_type(name); + size_t element_size = input_type.size(); auto input_data = ggml_decoder->get_input_ggml_tensor(name)->data; + std::vector input_stride = ggml_decoder->get_input_stride(name); #ifdef GGML_OPENVINO_DEBUG printf("Subgraph input %d: %g\n", inp, *(double*)(input_data)); #endif @@ -28,36 +31,51 @@ std::map get_ggml_graph_input_tensors(std::shared_ptrget_input_type(name), {80000}, input_data); - if (op_node_name == "CONT" && ggml_decoder->check_if_continuous()) { - ov::Shape flat_shape = { ggml_decoder->get_input_shape(name).to_shape()[0] * - ggml_decoder->get_input_shape(name).to_shape()[1] * - ggml_decoder->get_input_shape(name).to_shape()[2] }; - input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), flat_shape, input_data); - } else if ( op_node_name == "CONT" && - !ggml_decoder->check_if_continuous() && - input_shape[0] == 1) { - size_t valid_elems = static_cast(ggml_decoder->get_input_shape(name).to_shape()[2]); // 3072 - size_t num_rows = static_cast(ggml_decoder->get_input_shape(name).to_shape()[1]); // 7 - ov::element::Type input_type = ggml_decoder->get_input_type(name); - size_t element_size = input_type.size(); - std::vector strides = ggml_decoder->get_input_stride(name); - size_t phys_stride = static_cast(strides[1]) / element_size; - // size_t total_phys = (num_rows - 1) * phys_stride + valid_elems; - size_t total_phys = num_rows* phys_stride; - ov::Shape flat_input_shape = { total_phys }; - input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), flat_input_shape, input_data); - } else if (op_node_name == "CONT") { + if (op_node_name == "CONT" && !ggml_decoder->check_if_continuous() && input_shape[0] == 1) { + const size_t valid_elems = static_cast(ggml_decoder->get_input_shape(name).to_shape()[2]); + const size_t num_rows = static_cast(ggml_decoder->get_input_shape(name).to_shape()[1]); + const size_t dim2 = static_cast(ggml_decoder->get_input_shape(name).to_shape()[0]); + size_t phys_stride = static_cast(input_stride[1]) / element_size; + size_t total_logical = valid_elems * num_rows * dim2; + + std::vector contiguous_data(total_logical); + + for (size_t j = 0; j < num_rows; j++) { + const float *src_row = reinterpret_cast(input_data) + j * phys_stride; + float *dst_row = contiguous_data.data() + j * valid_elems; + std::copy(src_row, src_row + valid_elems, dst_row); + } + input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), + ggml_decoder->get_input_shape(name).to_shape(), + contiguous_data.data()); + } else if (op_node_name == "CONT" && !ggml_decoder->check_if_continuous()){ size_t valid_i = static_cast(ggml_decoder->get_input_shape(name).to_shape()[2]); // 96 size_t valid_j = static_cast(ggml_decoder->get_input_shape(name).to_shape()[1]); // 32 size_t valid_k = static_cast(ggml_decoder->get_input_shape(name).to_shape()[0]); // 7 + size_t total_valid = valid_i * valid_j * valid_k; // 96 * 32 * 7 = 21504 - ov::Shape flat_input_shape = { total_valid }; - input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), flat_input_shape, input_data); - } else if (op_node_name == "MUL_MAT") { - ov::Shape flat_shape = { ggml_decoder->get_input_shape(name).to_shape()[0] * - ggml_decoder->get_input_shape(name).to_shape()[1] * - ggml_decoder->get_input_shape(name).to_shape()[2] }; - input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), flat_shape, input_data); + size_t stride_j = static_cast(input_stride[1]) / element_size; // 672 + size_t stride_k = static_cast(input_stride[0]) / element_size; // 96 + + std::vector 
contiguous_data(total_valid); + const float *src_data = reinterpret_cast(input_data); + for (size_t k = 0; k < valid_k; k++) { + for (size_t j = 0; j < valid_j; j++) { + for (size_t i = 0; i < valid_i; i++) { + size_t out_index = k * (valid_i * valid_j) + j * valid_i + i; + size_t src_index = j * stride_j + k * stride_k + i; + contiguous_data[out_index] = src_data[src_index]; + } + } + } + input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), + ggml_decoder->get_input_shape(name).to_shape(), + contiguous_data.data()); + // } else if (op_node_name == "MUL_MAT") { + // ov::Shape flat_shape = { ggml_decoder->get_input_shape(name).to_shape()[0] * + // ggml_decoder->get_input_shape(name).to_shape()[1] * + // ggml_decoder->get_input_shape(name).to_shape()[2] }; + // input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), flat_shape, input_data); } else { input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), ggml_decoder->get_input_shape(name).to_shape(), input_data); } From cff473a9e20df37c9fc32c30009b9abfe12ed948 Mon Sep 17 00:00:00 2001 From: zhanmyz Date: Sun, 9 Mar 2025 23:35:18 +0800 Subject: [PATCH 038/254] 1. All operators implemented using OpenVINO can be successfully executed individually. 2. The VIEW op output tensor shape is not the same as the CONT (non-contiguous) input tensor shape. 3. CPY (non-contiguous) can't be implemented with the original input/output tensor shape and data (the original shape needs to be changed when creating the input/output tensors). Currently, the VIEW op is executed in the ggml backend and the others are executed in the OpenVINO Frontend. --- ggml/src/ggml-openvino.cpp | 191 ++++++++++++------------ ggml/src/ggml-openvino/ggml-decoder.cpp | 88 ++++------- ggml/src/ggml-openvino/utils.cpp | 76 +++++------- ggml/src/ggml-openvino/utils.h | 2 +- 4 files changed, 140 insertions(+), 217 deletions(-) diff --git a/ggml/src/ggml-openvino.cpp b/ggml/src/ggml-openvino.cpp index 230edded11..082ab27458 100644 --- a/ggml/src/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino.cpp @@ -537,8 +537,7 @@ void ggml_backend_openvino_mul_mat(struct ggml_tensor * dst) { auto reshape_output = std::make_shared( batched_matmul, ov::op::v0::Constant::create(ov::element::i64, {3}, final_output_shape), - false - ); + false); auto model = std::make_shared(ov::NodeVector{ reshape_output }, ov::ParameterVector{ param_src0, param_src1 }); @@ -659,6 +658,7 @@ void ggml_backend_openvino_view(ggml_tensor *dst) { false); auto model = std::make_shared(ov::NodeVector{reshaped}, ov::ParameterVector{param}); + // auto model = std::make_shared(ov::NodeVector{param}, ov::ParameterVector{param}); // ov::save_model(model, "/home/user/zhan/merge_git_commits/llama.cpp-ov/003_backend_view_model.xml"); auto compiled_model = core.compile_model(model, "CPU"); @@ -742,106 +742,91 @@ void ggml_backend_openvino_dup_bytes(struct ggml_tensor *dst) { const size_t nb0 = dst->nb[0]; if (src0->type == dst->type && ne00 == dst->ne[0] && nb00 == element_size && nb0 == element_size) { - const size_t valid_elems = static_cast(src0->ne[0]); - const size_t num_rows = static_cast(src0->ne[1]); - const size_t dim2 = static_cast(src0->ne[2]); - const size_t dim3 = static_cast(src0->ne[3]); + const size_t valid_elems = static_cast(src0->ne[0]); // 3072 + const size_t num_rows = static_cast(src0->ne[1]); // 7 + const size_t dim2 = static_cast(src0->ne[2]); // 1 - size_t phys_stride = static_cast(src0->nb[1]) / element_size; - size_t total_logical = valid_elems * num_rows * dim2 * dim3; + size_t phys_stride = static_cast(src0->nb[1]) / element_size; // 9216
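// This revision drops the host-side repacking into contiguous_data below:
// the padded rows ({1, 7, 9216} in the example) are handed to the model
// unchanged, and a StridedSlice inside the graph trims each row back to its
// 3072 valid leading elements.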
- std::vector contiguous_data(total_logical); + ov::Shape input_shape = { dim2, num_rows, phys_stride }; // e.g. {1, 7, 9216 } + ov::Shape logical_shape = { dim2, num_rows, valid_elems }; // {1, 7, 3072} - for (size_t j = 0; j < num_rows; j++) { - const float *src_row = reinterpret_cast(src0->data) + j * phys_stride; - float *dst_row = contiguous_data.data() + j * valid_elems; - std::copy(src_row, src_row + valid_elems, dst_row); - } + auto input_param = std::make_shared(ov::element::f32, input_shape); - ov::Shape logical_shape = { dim2, num_rows, valid_elems}; - auto input_param = std::make_shared(ov::element::f32, logical_shape); - auto identity_const = ov::op::v0::Constant::create(ov::element::i64, - { logical_shape.size() }, - std::vector(logical_shape.begin(), logical_shape.end())); - auto identity_op = std::make_shared(input_param, identity_const, false); + std::vector begin = { 0, 0, 0 }; + std::vector end = { static_cast(dim2), + static_cast(num_rows), + static_cast(valid_elems) }; + std::vector strides = { 1, 1, 1 }; - auto model = std::make_shared(ov::OutputVector{identity_op}, - ov::ParameterVector{input_param}); + auto begin_const = ov::op::v0::Constant::create(ov::element::i64, { begin.size() }, begin); + auto end_const = ov::op::v0::Constant::create(ov::element::i64, { end.size() }, end); + auto strides_const = ov::op::v0::Constant::create(ov::element::i64, { strides.size() }, strides); + + std::vector begin_mask = {0, 0, 0}; + std::vector end_mask = {0, 0, 0}; + auto slice = std::make_shared( + input_param, + begin_const, + end_const, + strides_const, + begin_mask, + end_mask + ); + + auto model = std::make_shared(ov::OutputVector{ slice }, + ov::ParameterVector{ input_param }); ov::Core core; auto compiled_model = core.compile_model(model, "CPU"); auto infer_request = compiled_model.create_infer_request(); - ov::Tensor input_tensor(ov::element::f32, logical_shape, contiguous_data.data()); + //[NOTE]: input_shape should be {1, 7, 9216}, not the original shape of src0.
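+ // Editorial sketch, not in the patch: once infer_request.infer() below has run,
+ // the StridedSlice output in dst->data should match what the removed host-side
+ // row copy produced; a check along these lines (requires <cassert> and
+ // <algorithm>) can be used to verify the equivalence:
+ // for (size_t j = 0; j < num_rows; j++) {
+ //     const float *src_row = reinterpret_cast<const float *>(src0->data) + j * phys_stride;
+ //     const float *out_row = reinterpret_cast<const float *>(dst->data) + j * valid_elems;
+ //     assert(std::equal(src_row, src_row + valid_elems, out_row));
+ // }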
+ ov::Tensor input_tensor(ov::element::f32, input_shape, src0->data); infer_request.set_input_tensor(0, input_tensor); ov::Tensor output_tensor(ov::element::f32, logical_shape, dst->data); infer_request.set_output_tensor(0, output_tensor); infer_request.infer(); - /* - for (size_t i01 = 0; i01 < ne01; ++i01) { - const char *src_row = reinterpret_cast(src0->data) + i01 * nb01; - char *dst_row = reinterpret_cast(dst->data) + i01 * dst->nb[1]; - - ov::Tensor src_row_tensor(ov::element::f32, {ne00}, const_cast(reinterpret_cast(src_row))); - ov::Tensor dst_row_tensor(ov::element::f32, {ne00}, reinterpret_cast(dst_row)); - - std::memcpy(dst_row_tensor.data(), src_row_tensor.data(), ne00 * sizeof(float)); - }*/ return; } // Case 3: Non-contiguous source, contiguous destination - const int64_t ne02 = src0->ne[2]; - const int64_t ne03 = src0->ne[3]; - const int64_t nb02 = src0->nb[2]; - const int64_t nb03 = src0->nb[3]; - // dst->ne =[3072,7,1,1], dst->nb =[4,12288,86016,86016], dst->type=GGML_TYPE_F32 // dst->src[0]->ne=[96,32,7,1], dst->src[0]->nb=[4,2688,384,86016], dst->src[0]->type=GGML_TYPE_F32 if (ggml_is_contiguous(dst)) { size_t valid_i = static_cast(src0->ne[0]); // 96 size_t valid_j = static_cast(src0->ne[1]); // 32 size_t valid_k = static_cast(src0->ne[2]); // 7 - size_t valid_l = static_cast(src0->ne[3]); // 1 - size_t total_valid = valid_i * valid_j * valid_k; // 96 * 32 * 7 = 21504 - size_t stride_j = static_cast(src0->nb[1]) / element_size; // 672 - size_t stride_k = static_cast(src0->nb[2]) / element_size; // 96 + ov::Shape src_shape = { valid_k, valid_j, valid_i }; // {7, 32, 96}; + auto src_param = std::make_shared(ov::element::f32, src_shape); - std::vector contiguous_data(total_valid); - const float *src_data = reinterpret_cast(src0->data); - for (size_t k = 0; k < valid_k; k++) { - for (size_t j = 0; j < valid_j; j++) { - for (size_t i = 0; i < valid_i; i++) { - size_t out_index = k * (valid_i * valid_j) + j * valid_i + i; - size_t src_index = j * stride_j + k * stride_k + i; - contiguous_data[out_index] = src_data[src_index]; - } - } - } + ov::Shape input_shape = { valid_j, valid_k, valid_i }; // {32, 7, 96} + auto tmp_param = ov::op::v0::Constant::create(ov::element::i64, { input_shape.size() }, input_shape); + auto input_param = std::make_shared(src_param, tmp_param, false); - // ov::Shape input_shape = { dst->src[0]->ne[0], dst->src[0]->ne[1], dst->src[0]->ne[2] }; - ov::Shape input_shape = { dst->src[0]->ne[2], dst->src[0]->ne[1], dst->src[0]->ne[0]}; - auto input_param = std::make_shared(ov::element::f32, input_shape); + // Add a Transpose node to turn {32,7,96} into {7,32,96}, restoring the logical order + // Swap dimensions 0 and 1 here, i.e. permutation = {1, 0, 2} + std::vector order = {1, 0, 2}; + auto order_const = ov::op::v0::Constant::create(ov::element::i64, {order.size()}, order); + auto transpose = std::make_shared(input_param, order_const); - // ov::Shape target_shape = { dst->ne[0], dst->ne[1], dst->ne[2] }; - // std::vector target_shape_vec = { static_cast(dst->ne[0]), - // static_cast(dst->ne[1]), dst->ne[2]}; ov::Shape target_shape = { dst->ne[2], dst->ne[1], dst->ne[0] }; // {1, 7, 3072} std::vector target_shape_vec = { static_cast(dst->ne[2]), - static_cast(dst->ne[1]), dst->ne[0]}; - auto reshape_const = ov::op::v0::Constant::create(ov::element::i64, {3}, target_shape_vec); - auto reshaped = std::make_shared(input_param, reshape_const, false); - - auto model = std::make_shared(ov::OutputVector{reshaped},
ov::ParameterVector{input_param}); + static_cast(dst->ne[1]), + static_cast(dst->ne[0]) }; + auto reshape_const = ov::op::v0::Constant::create(ov::element::i64, { target_shape_vec.size() }, target_shape_vec); + auto reshaped = std::make_shared(transpose, reshape_const, false); + auto model = std::make_shared(ov::OutputVector{ reshaped }, + ov::ParameterVector{ src_param }); ov::Core core; auto compiled_model = core.compile_model(model, "CPU"); auto infer_request = compiled_model.create_infer_request(); - ov::Tensor input_tensor(ov::element::f32, input_shape, contiguous_data.data()); + ov::Tensor input_tensor(ov::element::f32, src_shape, src0->data); infer_request.set_input_tensor(0, input_tensor); ov::Tensor output_tensor(ov::element::f32, target_shape, dst->data); @@ -998,40 +983,48 @@ static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backe } } - int end_node = cgraph->n_nodes - 1; - openvino_frontend_compute(backend, cgraph, 0, end_node); - // openvino_frontend_compute(backend, cgraph); + // Process nodes in order - // for (int i = 0; i < cgraph->n_nodes; i++) { - // if (std::find(permute_indices.begin(), permute_indices.end(), i) != permute_indices.end()) { - // ggml_backend_openvino_permute(cgraph->nodes[i]); - // } else if (std::find(cont_indices.begin(), cont_indices.end(), i) != cont_indices.end()) { - // ggml_backend_openvino_dup_bytes(cgraph->nodes[i]); - // } else if (std::find(view_indices.begin(), view_indices.end(), i) != view_indices.end()) { - // ggml_backend_openvino_view(cgraph->nodes[i]); - // } else if (std::find(cpy_indices.begin(), cpy_indices.end(), i) != cpy_indices.end()) { - // ggml_backend_openvino_cpy(cgraph->nodes[i]); - // } else if (std::find(transpose_indices.begin(), transpose_indices.end(), i) != transpose_indices.end()) { - // ggml_backend_openvino_transpose(cgraph->nodes[i]); - // } else if (std::find(reshape_indices.begin(), reshape_indices.end(), i) != reshape_indices.end()) { - // ggml_backend_openvino_reshape(cgraph->nodes[i]); - // } else if (std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) != mul_mat_indices.end()) { - // ggml_backend_openvino_mul_mat(cgraph->nodes[i]); - // } else { - // // Process a range of nodes with openvino_frontend_compute - // int start_index = i; - // while (i < cgraph->n_nodes - // && std::find(view_indices.begin(), view_indices.end(), i) == view_indices.end() - // && std::find(cpy_indices.begin(), cpy_indices.end(), i) == cpy_indices.end() - // && std::find(cont_indices.begin(), cont_indices.end(), i) == cont_indices.end() - // && std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) == mul_mat_indices.end() - // ) { - // i++; - // } - // if (start_index < i) { - // openvino_frontend_compute(backend, cgraph, start_index, --i); - // } - // } + + // if (cgraph->nodes[0]->ne[1] == 1) { + // bool prompt_process_flag = false; + // int end_node = cgraph->n_nodes - 1; + // openvino_frontend_compute(backend, cgraph, 0, end_node, prompt_process_flag); + // } else { + + for (int i = 0; i < cgraph->n_nodes; i++) { + if (std::find(permute_indices.begin(), permute_indices.end(), i) != permute_indices.end()) { + // ggml_backend_openvino_permute(cgraph->nodes[i]); + // } else if (std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) != mul_mat_indices.end()) { + // ggml_backend_openvino_mul_mat(cgraph->nodes[i]); + } else if (std::find(view_indices.begin(), view_indices.end(), i) != view_indices.end()) { + ggml_backend_openvino_view(cgraph->nodes[i]); + // } else if 
(std::find(cont_indices.begin(), cont_indices.end(), i) != cont_indices.end()) { + // ggml_backend_openvino_dup_bytes(cgraph->nodes[i]); + // } else if (std::find(transpose_indices.begin(), transpose_indices.end(), i) != transpose_indices.end()) { + // ggml_backend_openvino_transpose(cgraph->nodes[i]); + // } else if (std::find(reshape_indices.begin(), reshape_indices.end(), i) != reshape_indices.end()) { + // ggml_backend_openvino_reshape(cgraph->nodes[i]); + // } else if (std::find(cpy_indices.begin(), cpy_indices.end(), i) != cpy_indices.end()) { + // ggml_backend_openvino_cpy(cgraph->nodes[i]); + } else { + // Process a range of nodes with openvino_frontend_compute + int start_index = i; + while (i < cgraph->n_nodes + // && std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) == mul_mat_indices.end() + && std::find(view_indices.begin(), view_indices.end(), i) == view_indices.end() + // && std::find(cont_indices.begin(), cont_indices.end(), i) == cont_indices.end() + // && std::find(reshape_indices.begin(), reshape_indices.end(), i) == reshape_indices.end() + // && std::find(cpy_indices.begin(), cpy_indices.end(), i) == cpy_indices.end() + ) { + i++; + } + if (start_index < i) { + openvino_frontend_compute(backend, cgraph, start_index, --i); + } + } + } + // } return GGML_STATUS_SUCCESS; diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 90bfdcd103..2b04cd632a 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -46,12 +46,14 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapop); m_op_node_name.emplace_back(src0_name, ggml_op_name(node->op)); m_output_names.push_back(node_name); + + ov::Shape input_shape = { static_cast(node->src[0]->ne[2]), + static_cast(node->src[0]->ne[1]), + static_cast(node->src[0]->ne[0])}; + auto input_param = std::make_shared(ov::element::f32, input_shape); + m_params.push_back(input_param); + m_continuous = true; - - // ov::Shape flat_shape = { static_cast(ggml_nelements(node)) }; - // auto input_param = std::make_shared(ov::element::f32, flat_shape); - // m_params.push_back(input_param); - break; } @@ -59,12 +61,6 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapsrc[0]->nb[0] == ggml_type_size(node->src[0]->type) && node->nb[0] == ggml_type_size(node->src[0]->type)) { - // for (size_t i01 = 0; i01 < node->src[0]->ne[1]; ++i01) { - // const char *src_row = reinterpret_cast(node->src[0]->data) + i01 * node->src[0]->nb[1]; - // char *dst_row = reinterpret_cast(node->data) + i01 * node->nb[1]; - // std::memcpy(dst_row, src_row, node->src[0]->ne[0] * ggml_type_size(node->src[0]->type)); - // } - inputs[src0_name] = node->src[0]; outputs[node_name] = node; m_input_names.push_back(src0_name); @@ -72,15 +68,16 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapop)); m_output_names.push_back(node_name); - // const size_t element_size = ggml_type_size(node->src[0]->type); - // size_t valid_elems = static_cast(node->src[0]->ne[0]); // 3072 - // size_t num_rows = static_cast(node->src[0]->ne[1]); // 7 - // size_t phys_stride = static_cast(node->src[0]->nb[1]) / element_size; // 9216 - // // size_t total_phys = (num_rows - 1) * phys_stride + valid_elems; // 6*9216 + 3072 = 58368 - // size_t total_phys = num_rows * phys_stride; // 7 * 9216 = 64512 - // ov::Shape flat_input_shape = { total_phys }; - // auto flat_input_param = std::make_shared(ov::element::f32, flat_input_shape); - // 
m_params.push_back(flat_input_param); + const size_t element_size = ggml_type_size(node->src[0]->type); + size_t valid_elems = static_cast(node->src[0]->ne[0]); // 3072 + size_t num_rows = static_cast(node->src[0]->ne[1]); // 7 + size_t dim2 = static_cast(node->src[0]->ne[2]); // 1 + size_t phys_stride = static_cast(node->src[0]->nb[1]) / element_size; // 9216 + // size_t total_phys = (num_rows - 1) * phys_stride + valid_elems; // 6*9216 + 3072 = 58368 + size_t total_phys = num_rows * phys_stride; // 7 * 9216 = 64512 + ov::Shape input_shape = { dim2, num_rows, phys_stride }; + auto input_param = std::make_shared(ov::element::f32, input_shape); + m_params.push_back(input_param); m_continuous = false; break; @@ -94,13 +91,11 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapop)); m_output_names.push_back(node_name); - // size_t valid_i = static_cast(node->src[0]->ne[0]); // 96 - // size_t valid_j = static_cast(node->src[0]->ne[1]); // 32 - // size_t valid_k = static_cast(node->src[0]->ne[2]); // 7 - // size_t total_valid = valid_i * valid_j * valid_k; // 96 * 32 * 7 = 21504 - // ov::Shape flat_input_shape = { total_valid }; - // auto input_param = std::make_shared(ov::element::f32, flat_input_shape); - // m_params.push_back(input_param); + ov::Shape input_shape = { static_cast(node->src[0]->ne[2]), + static_cast(node->src[0]->ne[1]), + static_cast(node->src[0]->ne[0])}; + auto input_param = std::make_shared(ov::element::f32, input_shape); + m_params.push_back(input_param); m_continuous = false; break; @@ -117,9 +112,9 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapsrc[0]->ne, node->src[0]->ne + 3); - auto input_param = std::make_shared(ov::element::f32, src_shape); - m_params.push_back(input_param); + // ov::Shape src_shape(node->src[0]->ne, node->src[0]->ne + 3); + // auto input_param = std::make_shared(ov::element::f32, src_shape); + // m_params.push_back(input_param); break; } else { for (int64_t i1 = 0; i1 < node->ne[1]; ++i1) { // ne[1] = 3072 @@ -139,27 +134,6 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapop)); m_output_names.push_back(node_name); m_continuous = false; - break; - - // inputs[src0_name] = node->src[0]; - // std::string temp_name = src0_name + std::string("_cpy_tmp"); - // inputs[temp_name] = node; - - // outputs[node_name] = node; - // m_input_names.push_back(src0_name); - // m_input_names.push_back(temp_name); - // m_node_op_name[src0_name] = ggml_op_name(node->op); - // m_node_op_name[temp_name] = ggml_op_name(node->op); - // m_output_names.push_back(node_name); - // m_continuous = false; - - // ov::Shape flat_src0_shape = {node->src[0]->nb[2]}; - // auto param_src0 = std::make_shared(ov::element::f32, flat_src0_shape); - // m_params.push_back(param_src0); - - // ov::Shape flat_dst_shape = {node->nb[2], 1}; - // auto param_dst_base = std::make_shared(ov::element::f16, flat_dst_shape); - // m_params.push_back(param_dst_base); break; } @@ -167,8 +141,6 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapname) + "_" + std::to_string(node->view_offs) + "_output_" + ggml_op_name(node->op); - // std::string node_name = std::string(node->name) + "_" + std::to_string(node->view_offs); inputs[node_name] = node; outputs[node_name] = node; m_input_names.push_back(node_name); @@ -190,12 +162,6 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapsrc[0]->ne[0]*node->src[0]->ne[1]*node->src[0]->ne[2] }; - // ov::Shape flat_shape_src1 = { node->src[1]->ne[0]*node->src[1]->ne[1]*node->src[1]->ne[2] 
}; - // auto param_src0 = std::make_shared(ov::element::f16, flat_shape_src0); - // auto param_src1 = std::make_shared(ov::element::f32, flat_shape_src1); - // m_params.push_back(param_src0); - // m_params.push_back(param_src1); if (!ggml_is_contiguous(node->src[1]) || node->src[1]->ne[0] * node->src[1]->nb[0] != node->src[1]->nb[1]) { m_continuous = false; } else { @@ -376,8 +342,8 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor * node, struct ggml_cgraph * cgr if (m_node) { set_input_output(m_node, m_inputs, m_outputs); } else { - for (int node_n = 0; node_n < m_cgraph->n_nodes; node_n++) { - // for (int node_n = start_index; node_n <= end_index; node_n++) { + // for (int node_n = 0; node_n < m_cgraph->n_nodes; node_n++) { + for (int node_n = start_index; node_n <= end_index; node_n++) { auto cur_node = m_cgraph->nodes[node_n]; m_nodes.push_back(cur_node); // Init model input and output diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index a0234ebd30..c44aa2568b 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -10,8 +10,10 @@ std::shared_ptr get_ggml_decoder(struct ggml_cgraph * cgraph, con return std::make_shared(nullptr, cgraph, start_index, end_index); } -std::map get_ggml_graph_input_tensors(std::shared_ptr ggml_decoder) { - std::map input_tensors; +// std::map get_ggml_graph_input_tensors(std::shared_ptr ggml_decoder) { +std::vector> get_ggml_graph_input_tensors(std::shared_ptr ggml_decoder, bool flag) { + // std::map input_tensors; + std::vector> input_tensors; auto input_names = ggml_decoder->get_input_names(); // auto node_name = ggml_decoder->get_op_name(); size_t op_iter = 0; @@ -19,10 +21,7 @@ std::map get_ggml_graph_input_tensors(std::shared_ptrget_op_node_name(name, op_iter++); // auto node_op_name = ggml_decoder->get_node_op_name(name); - ov::element::Type input_type = ggml_decoder->get_input_type(name); - size_t element_size = input_type.size(); auto input_data = ggml_decoder->get_input_ggml_tensor(name)->data; - std::vector input_stride = ggml_decoder->get_input_stride(name); #ifdef GGML_OPENVINO_DEBUG printf("Subgraph input %d: %g\n", inp, *(double*)(input_data)); #endif @@ -31,58 +30,22 @@ std::map get_ggml_graph_input_tensors(std::shared_ptrget_input_type(name), {80000}, input_data); - if (op_node_name == "CONT" && !ggml_decoder->check_if_continuous() && input_shape[0] == 1) { - const size_t valid_elems = static_cast(ggml_decoder->get_input_shape(name).to_shape()[2]); + if (flag & op_node_name == "CONT" && input_shape[0] == 1 && input_shape[1] != 1) { + std::vector input_stride = ggml_decoder->get_input_stride(name); + ov::element::Type input_type = ggml_decoder->get_input_type(name); + size_t element_size = input_type.size(); + // const size_t valid_elems = static_cast(ggml_decoder->get_input_shape(name).to_shape()[2]); const size_t num_rows = static_cast(ggml_decoder->get_input_shape(name).to_shape()[1]); const size_t dim2 = static_cast(ggml_decoder->get_input_shape(name).to_shape()[0]); size_t phys_stride = static_cast(input_stride[1]) / element_size; - size_t total_logical = valid_elems * num_rows * dim2; - - std::vector contiguous_data(total_logical); - - for (size_t j = 0; j < num_rows; j++) { - const float *src_row = reinterpret_cast(input_data) + j * phys_stride; - float *dst_row = contiguous_data.data() + j * valid_elems; - std::copy(src_row, src_row + valid_elems, dst_row); - } - input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), - 
ggml_decoder->get_input_shape(name).to_shape(), - contiguous_data.data()); - } else if (op_node_name == "CONT" && !ggml_decoder->check_if_continuous()){ - size_t valid_i = static_cast(ggml_decoder->get_input_shape(name).to_shape()[2]); // 96 - size_t valid_j = static_cast(ggml_decoder->get_input_shape(name).to_shape()[1]); // 32 - size_t valid_k = static_cast(ggml_decoder->get_input_shape(name).to_shape()[0]); // 7 - - size_t total_valid = valid_i * valid_j * valid_k; // 96 * 32 * 7 = 21504 - size_t stride_j = static_cast(input_stride[1]) / element_size; // 672 - size_t stride_k = static_cast(input_stride[0]) / element_size; // 96 - - std::vector contiguous_data(total_valid); - const float *src_data = reinterpret_cast(input_data); - for (size_t k = 0; k < valid_k; k++) { - for (size_t j = 0; j < valid_j; j++) { - for (size_t i = 0; i < valid_i; i++) { - size_t out_index = k * (valid_i * valid_j) + j * valid_i + i; - size_t src_index = j * stride_j + k * stride_k + i; - contiguous_data[out_index] = src_data[src_index]; - } - } - } - input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), - ggml_decoder->get_input_shape(name).to_shape(), - contiguous_data.data()); - // } else if (op_node_name == "MUL_MAT") { - // ov::Shape flat_shape = { ggml_decoder->get_input_shape(name).to_shape()[0] * - // ggml_decoder->get_input_shape(name).to_shape()[1] * - // ggml_decoder->get_input_shape(name).to_shape()[2] }; - // input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), flat_shape, input_data); + ov::Shape input_shape = { dim2, num_rows, phys_stride }; // {1, 7, 9216 } + input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), input_shape, input_data); } else { input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), ggml_decoder->get_input_shape(name).to_shape(), input_data); } - // input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), ggml_decoder->get_input_shape(name).to_shape(), input_data); - // } - input_tensors[name] = input_tensor; + // input_tensors[name] = input_tensor; + input_tensors.emplace_back(name, input_tensor); } return input_tensors; } @@ -114,11 +77,11 @@ static ov::frontend::FrontEnd::Ptr get_ggml_frontend() { return front_end; } -enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph, const int32_t start_index, const int32_t end_index) { - ov::Core core; - auto devices = core.get_available_devices(); +enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph, const int32_t start_index, const int32_t end_index, bool flag) { + static ov::Core core; + // auto devices = core.get_available_devices(); // Get GGML Frontend - auto front_end = get_ggml_frontend(); + static auto front_end = get_ggml_frontend(); if (!front_end) { GGML_LOG_ERROR("GGML FrontEnd is not initialized \n"); return GGML_STATUS_FAILED; @@ -161,11 +124,12 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c // Get input tensor auto input_names = ggml_decoder->get_input_names(); - auto input_tensors = get_ggml_graph_input_tensors(ggml_decoder); + auto input_tensors = get_ggml_graph_input_tensors(ggml_decoder, flag); // Set input tensor for (size_t i = 0; i < input_names.size(); i++) { - infer_request.set_input_tensor(i, input_tensors[input_names[i]]); + // infer_request.set_input_tensor(i, input_tensors[input_names[i]]); + infer_request.set_input_tensor(i, input_tensors.at(i).second); // auto input_tensor = infer_request.get_input_tensor(i); // auto input_shape = 
input_tensor.get_shape(); diff --git a/ggml/src/ggml-openvino/utils.h b/ggml/src/ggml-openvino/utils.h index fc5268d98a..7806c418cb 100644 --- a/ggml/src/ggml-openvino/utils.h +++ b/ggml/src/ggml-openvino/utils.h @@ -1,4 +1,4 @@ #include "ggml-decoder.h" #include "ggml-backend-impl.h" -enum ggml_status openvino_frontend_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph, const int32_t start_index=0, const int32_t end_index=0); +enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph, const int32_t start_index=0, const int32_t end_index=0, bool flag = true); From 467a5ddf04b68cad37ce90dd7901e4c0af48c2fa Mon Sep 17 00:00:00 2001 From: zhanmyz Date: Tue, 11 Mar 2025 10:32:50 +0800 Subject: [PATCH 039/254] 1. Update the implementation of CPY node when it's non-contiguous 2. Remove duplicate get node operation function --- ggml/src/ggml-openvino.cpp | 120 ++++++++++++++---------- ggml/src/ggml-openvino/decoder.h | 2 - ggml/src/ggml-openvino/ggml-decoder.cpp | 86 ++++++----------- ggml/src/ggml-openvino/ggml-decoder.h | 2 - ggml/src/ggml-openvino/utils.cpp | 21 ++++- 5 files changed, 116 insertions(+), 115 deletions(-) diff --git a/ggml/src/ggml-openvino.cpp b/ggml/src/ggml-openvino.cpp index 082ab27458..679b030dfa 100644 --- a/ggml/src/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino.cpp @@ -849,6 +849,7 @@ static void ggml_backend_openvino_permute(const struct ggml_tensor * dst) { void ggml_backend_openvino_cpy(struct ggml_tensor *dst) { const struct ggml_tensor *src0 = dst->src[0]; + const struct ggml_tensor *src1 = dst->src[1]; assert(src0 != nullptr); assert(ggml_nelements(dst) == ggml_nelements(src0)); @@ -889,64 +890,81 @@ void ggml_backend_openvino_cpy(struct ggml_tensor *dst) { infer_request.set_output_tensor(0, dst_tensor); infer_request.infer(); } else { - std::vector gather_idx; - for (int row = 0; row < dst->src[0]->ne[1]; row++) { - for (int col = 0; col < dst->src[0]->ne[0]; col++) { - gather_idx.push_back((row*dst->src[0]->nb[1]+col*dst->src[0]->nb[0])/4); - } - } - size_t N = gather_idx.size(); - ov::Shape gather_idx_shape = {N, 1}; - std::vector scatter_idx; - for (int row = 0; row < dst->ne[1]; row++) { - for (int col = 0; col < dst->ne[0]; col++) { - scatter_idx.push_back(row * dst->nb[1] / 2 + col); - } - } - ov::Shape scatter_idx_shape = {N, 1}; + int src0_elem_size = ggml_type_size(src0->type); + int src1_elem_size = ggml_type_size(src1->type); - // param_src0 shape => 1D, rank=1, size is large enough. For example, row*col= 21504 + some padding, e.g. 
80000 - // ov::Shape flat_src0_shape = {80000}; - ov::Shape flat_src0_shape = {dst->src[0]->nb[2]}; - auto param_src0 = std::make_shared(ov::element::f32, flat_src0_shape); - // auto param_src00 = std::make_shared(ov::element::f32, flat_src0_shape); + int src0_logical_cols = src0->ne[0]; + int src0_logical_rows = src0->ne[1]; + int src1_logical_cols = src1->ne[0]; + int src1_logical_rows = src1->ne[1]; + + int src0_phys_cols = src0->nb[0] / src0_elem_size; + int src0_phys_rows = src0_logical_rows; + + int src1_phys_cols = src1->nb[1] / src1_elem_size; + int src1_phys_rows = src1_logical_rows; + + ov::Shape src0_phys_shape = {1, static_cast(src0_phys_rows), static_cast(src0_phys_cols) }; + ov::Shape src1_phys_shape = {1, static_cast(src1_phys_rows), static_cast(src1_phys_cols) }; + + size_t logical_elems = static_cast(src0_logical_cols * src0_logical_rows); + size_t src_flat_size = 1 * src0_phys_cols * src0_phys_rows; + size_t dst_flat_size = 1 * src1_phys_rows * src1_phys_cols; + + ov::Core core; + + std::vector gather_idx; + gather_idx.reserve(logical_elems); + for (int row = 0; row < src0_logical_rows; row++) { + for (int col = 0; col < src0_logical_cols; col++) { + gather_idx.push_back(static_cast(row + col * src0_phys_rows)); + } + } + ov::Shape gather_idx_shape = { logical_elems }; + + std::vector scatter_idx; + scatter_idx.reserve(logical_elems); + for (int row = 0; row < src1_logical_rows; row++) { + for (int col = 0; col < src1_logical_cols; col++) { + scatter_idx.push_back(static_cast(row * src1_phys_cols + col)); + } + } + ov::Shape scatter_idx_shape = { logical_elems, 1 }; + + auto param_src0 = std::make_shared(ov::element::f32, src0_phys_shape); + auto param_src1 = std::make_shared(ov::element::f16, src1_phys_shape); + + auto src_flat_shape_const = ov::op::v0::Constant::create(ov::element::i64, {1}, + { static_cast(src_flat_size) }); + auto reshape_src = std::make_shared(param_src0, src_flat_shape_const, false); + auto dst_flat_shape_const = ov::op::v0::Constant::create(ov::element::i64, {1}, + { static_cast(dst_flat_size) }); + auto reshape_dst = std::make_shared(param_src1, dst_flat_shape_const, false); auto gather_indices_const = ov::op::v0::Constant::create(ov::element::i64, gather_idx_shape, gather_idx); - auto gather_axis_const = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); - auto gathered = std::make_shared( - param_src0, gather_indices_const, gather_axis_const); - + auto axis_const = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); + auto gathered = std::make_shared(reshape_src, gather_indices_const, axis_const); auto converted = std::make_shared(gathered, ov::element::f16); - // param_dst_base shape => 1D, rank=1, size够大, e.g. 
row=3072 => i up to 3071 => offset i*64=196544 + j*2, e.g.200000 - // ov::Shape flat_dst_shape = {200000, 1}; - ov::Shape flat_dst_shape = {dst->nb[2], 1}; - auto param_dst_base = std::make_shared(ov::element::f16, flat_dst_shape); - // auto param_dst_base11 = std::make_shared(ov::element::f16, flat_dst_shape); - auto scatter_indices_const = ov::op::v0::Constant::create(ov::element::i64, scatter_idx_shape, scatter_idx); + auto scatter = std::make_shared(reshape_dst, scatter_indices_const, converted); - // ScatterNDUpdate( base, scatter_indices, updates ) - // scatter_indices last dimension = 1 => each index is 1D coordinate - auto scatter = std::make_shared( - param_dst_base, scatter_indices_const, converted - ); - - ov::ParameterVector params = { param_src0, param_dst_base }; - // ov::ParameterVector params = { param_src0}; - // ov::ParameterVector params = { param_src00, param_dst_base11}; - auto model = std::make_shared(ov::OutputVector{ scatter }, params); + std::vector dst_phys_shape_vec = {1, static_cast(src1_phys_rows), + static_cast(src1_phys_cols) }; + auto dst_phys_shape_const = ov::op::v0::Constant::create(ov::element::i64, {3}, dst_phys_shape_vec); + auto final_output = std::make_shared(scatter, dst_phys_shape_const, false); + ov::ParameterVector params = { param_src0, param_src1 }; + auto model = std::make_shared(ov::OutputVector{ final_output }, params); auto compiled_model = core.compile_model(model, "CPU"); auto infer_request = compiled_model.create_infer_request(); - ov::Tensor tensor_src0(ov::element::f32, flat_src0_shape, src0->data); - ov::Tensor tensor_dst_base(ov::element::f16, flat_dst_shape, dst->data); + ov::Tensor tensor_src(ov::element::f32, src0_phys_shape, src0->data); + ov::Tensor tensor_dst(ov::element::f16, src1_phys_shape, src1->data); + infer_request.set_input_tensor(0, tensor_src); + infer_request.set_input_tensor(1, tensor_dst); - infer_request.set_input_tensor(0, tensor_src0); - infer_request.set_input_tensor(1, tensor_dst_base); - - ov::Tensor out_tensor(ov::element::f16, flat_dst_shape, dst->data); + ov::Tensor out_tensor(ov::element::f16, src1_phys_shape, dst->data); infer_request.set_output_tensor(0, out_tensor); infer_request.infer(); @@ -986,15 +1004,17 @@ static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backe // Process nodes in order - // if (cgraph->nodes[0]->ne[1] == 1) { - // bool prompt_process_flag = false; + bool prompt_process_flag = true; + if (cgraph->nodes[0]->ne[1] == 1) { + prompt_process_flag = false; + } // int end_node = cgraph->n_nodes - 1; // openvino_frontend_compute(backend, cgraph, 0, end_node, prompt_process_flag); // } else { for (int i = 0; i < cgraph->n_nodes; i++) { if (std::find(permute_indices.begin(), permute_indices.end(), i) != permute_indices.end()) { - // ggml_backend_openvino_permute(cgraph->nodes[i]); + ggml_backend_openvino_permute(cgraph->nodes[i]); // } else if (std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) != mul_mat_indices.end()) { // ggml_backend_openvino_mul_mat(cgraph->nodes[i]); } else if (std::find(view_indices.begin(), view_indices.end(), i) != view_indices.end()) { @@ -1020,7 +1040,7 @@ static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backe i++; } if (start_index < i) { - openvino_frontend_compute(backend, cgraph, start_index, --i); + openvino_frontend_compute(backend, cgraph, start_index, --i, prompt_process_flag); } } } diff --git a/ggml/src/ggml-openvino/decoder.h b/ggml/src/ggml-openvino/decoder.h index 729946ac39..584f16986c 
100644 --- a/ggml/src/ggml-openvino/decoder.h +++ b/ggml/src/ggml-openvino/decoder.h @@ -36,8 +36,6 @@ public: virtual std::vector get_input_names() const = 0; - virtual const std::string& get_node_op_name(const std::string& name) const = 0; - virtual std::string& get_op_node_name(const std::string& name, const int index = -1) = 0; // virtual const struct tensor_info get_node_op_info(const std::string& name) const = 0; diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 2b04cd632a..218c53f09f 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -6,18 +6,6 @@ #include void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::map& inputs, std::map& outputs) { - // m_node_op_name[node->name] = ggml_op_name(node->op); - - // std::string src0_name = std::string(node->src[0]->name) + "_" + std::to_string(node->src[0]->view_offs) + "_input_" + ggml_op_name(node->src[0]->op); - // std::string node_name = std::string(node->name) + "_" + std::to_string(node->view_offs) + "_output_" + ggml_op_name(node->op); - - // Execute singel CONT operator is OK - // std::string src0_name = std::string(node->src[0]->name) + "_" + std::to_string(node->src[0]->view_offs) + "_" + ggml_op_name(node->src[0]->op); - // std::string node_name = std::string(node->name) + "_" + std::to_string(node->view_offs) + "_" + ggml_op_name(node->op); - - // std::string src0_name = std::string(node->src[0]->name) + "_" + std::to_string(node->src[0]->view_offs); - // std::string node_name = std::string(node->name) + "_" + std::to_string(node->view_offs); - std::string src0_name = std::string(node->src[0]->name); std::string node_name = std::string(node->name); @@ -32,7 +20,6 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapsrc[0]; outputs[node_name] = node; m_input_names.push_back(src0_name); - m_node_op_name[src0_name] = ggml_op_name(node->op); m_op_node_name.emplace_back(src0_name, ggml_op_name(node->op)); m_output_names.push_back(node_name); break; @@ -43,7 +30,6 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapsrc[0]; outputs[node_name] = node; m_input_names.push_back(src0_name); - m_node_op_name[src0_name] = ggml_op_name(node->op); m_op_node_name.emplace_back(src0_name, ggml_op_name(node->op)); m_output_names.push_back(node_name); @@ -64,7 +50,6 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapsrc[0]; outputs[node_name] = node; m_input_names.push_back(src0_name); - m_node_op_name[src0_name] = ggml_op_name(node->op); m_op_node_name.emplace_back(src0_name, ggml_op_name(node->op)); m_output_names.push_back(node_name); @@ -87,7 +72,6 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapsrc[0]; outputs[node_name] = node; m_input_names.push_back(src0_name); - m_node_op_name[src0_name] = ggml_op_name(node->op); m_op_node_name.emplace_back(src0_name, ggml_op_name(node->op)); m_output_names.push_back(node_name); @@ -107,32 +91,45 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapsrc[0]; outputs[node_name] = node; m_input_names.push_back(src0_name); - m_node_op_name[src0_name] = ggml_op_name(node->op); m_op_node_name.emplace_back(src0_name, ggml_op_name(node->op)); m_output_names.push_back(node_name); m_continuous = true; - // ov::Shape src_shape(node->src[0]->ne, node->src[0]->ne + 3); - // auto input_param = std::make_shared(ov::element::f32, src_shape); - // m_params.push_back(input_param); + ov::Shape input_shape = { static_cast(node->src[0]->ne[2]), + 
static_cast(node->src[0]->ne[1]), + static_cast(node->src[0]->ne[0])}; + auto input_param = std::make_shared(ov::element::f32, input_shape); + m_params.push_back(input_param); break; } else { - for (int64_t i1 = 0; i1 < node->ne[1]; ++i1) { // ne[1] = 3072 - for (int64_t i0 = 0; i0 < node->ne[0]; ++i0) { // ne[0] = 7 - int64_t src_index = i0 * node->src[0]->nb[0] / sizeof(float) + // stride in nb[0] - i1 * node->src[0]->nb[1] / sizeof(float); // stride in nb[1] - char *dst_ptr = static_cast(node->data) + - i0 * node->nb[0] + i1 * node->nb[1]; - *(ggml_fp16_t *)dst_ptr = GGML_FP32_TO_FP16(((float*)node->src[0]->data)[src_index]); - } - } - // inputs[node->src[0]->name] = node->src[0]; - inputs[node_name] = node; + std::string src1_name = std::string(node->src[1]->name); + inputs[src0_name] = node->src[0]; + inputs[src1_name] = node->src[1]; outputs[node_name] = node; - m_input_names.push_back(node_name); - m_node_op_name[node_name] = ggml_op_name(node->op); + m_input_names.push_back(src0_name); + m_input_names.push_back(src1_name); m_op_node_name.emplace_back(src0_name, ggml_op_name(node->op)); + m_op_node_name.emplace_back(src1_name, ggml_op_name(node->op)); m_output_names.push_back(node_name); + + int src0_elem_size = ggml_type_size(node->src[0]->type); + int src1_elem_size = ggml_type_size(node->src[1]->type); + + int src0_logical_rows = node->src[0]->ne[1]; + int src1_logical_rows = node->src[1]->ne[1]; + + int src0_phys_cols = node->src[0]->nb[0] / src0_elem_size; + int src0_phys_rows = src0_logical_rows; + + int src1_phys_cols = node->src[1]->nb[1] / src1_elem_size; + int src1_phys_rows = src1_logical_rows; + ov::Shape src0_phys_shape = {1, static_cast(src0_phys_rows), static_cast(src0_phys_cols) }; + ov::Shape src1_phys_shape = {1, static_cast(src1_phys_rows), static_cast(src1_phys_cols) }; + auto input0_param = std::make_shared(ov::element::f32, src0_phys_shape); + auto input1_param = std::make_shared(ov::element::f16, src1_phys_shape); + m_params.push_back(input0_param); + m_params.push_back(input1_param); + m_continuous = false; break; @@ -144,7 +141,6 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapop); m_op_node_name.emplace_back(node_name, ggml_op_name(node->op)); m_output_names.push_back(node_name); break; @@ -155,7 +151,6 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapsrc[0]; outputs[node_name] = node; m_input_names.push_back(src0_name); - m_node_op_name[src0_name] = ggml_op_name(node->op); m_op_node_name.emplace_back(src0_name, ggml_op_name(node->op)); m_output_names.push_back(node_name); break; @@ -167,17 +162,13 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapsrc[1]->name) + "_" + std::to_string(node->src[1]->view_offs) + "_input_" + ggml_op_name(node->src[1]->op); - // std::string src1_name = std::string(node->src[1]->name) + "_" + std::to_string(node->src[1]->view_offs); std::string src1_name = std::string(node->src[1]->name); inputs[src0_name] = node->src[0]; inputs[src1_name] = node->src[1]; outputs[node_name] = node; m_input_names.push_back(src0_name); - m_node_op_name[src0_name] = ggml_op_name(node->op); m_op_node_name.emplace_back(src0_name, ggml_op_name(node->op)); m_input_names.push_back(src1_name); - m_node_op_name[src1_name] = ggml_op_name(node->op); m_op_node_name.emplace_back(src1_name, ggml_op_name(node->op)); m_output_names.push_back(node_name); break; @@ -193,15 +184,11 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapsrc[0]; outputs[node_name] = node; 
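// Note (editorial): every case in this switch follows the same registration
// pattern: record the ggml tensors behind the subgraph inputs/outputs plus the
// owning op name, so get_ggml_graph_input_tensors() in utils.cpp can later wrap
// the same buffers in ov::Tensor without copying.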
m_input_names.push_back(src0_name); - m_node_op_name[src0_name] = ggml_op_name(node->op); m_op_node_name.emplace_back(src0_name, ggml_op_name(node->op)); m_output_names.push_back(node_name); if (node->src[1]) { - // std::string src1_name = std::string(node->src[1]->name) + "_" + std::to_string(node->src[1]->view_offs) + "_input_" + ggml_op_name(node->src[1]->op); - // std::string src1_name = std::string(node->src[1]->name) + "_" + std::to_string(node->src[1]->view_offs); std::string src1_name = std::string(node->src[1]->name); inputs[src1_name] = node->src[1]; - m_node_op_name[src1_name] = ggml_op_name(node->op); m_op_node_name.emplace_back(src1_name, ggml_op_name(node->op)); m_input_names.push_back(src1_name); } @@ -210,26 +197,19 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapsrc[1]->name) + "_" + std::to_string(node->src[1]->view_offs) + "_input_" + ggml_op_name(node->src[1]->op); - // std::string src1_name = std::string(node->src[1]->name) + "_" + std::to_string(node->src[1]->view_offs); std::string src1_name = std::string(node->src[1]->name); inputs[src0_name] = node->src[0]; inputs[src1_name] = node->src[1]; m_input_names.push_back(src0_name); - m_node_op_name[src0_name] = ggml_op_name(node->op); m_op_node_name.emplace_back(src0_name, ggml_op_name(node->op)); m_input_names.push_back(src1_name); - m_node_op_name[src1_name] = ggml_op_name(node->op); m_op_node_name.emplace_back(src1_name, ggml_op_name(node->op)); outputs[node_name] = node; m_output_names.push_back(node_name); if (node->src[2]) { - // std::string src2_name = std::string(node->src[2]->name) + "_" + std::to_string(node->src[2]->view_offs) + "_input_" + ggml_op_name(node->src[2]->op); - // std::string src2_name = std::string(node->src[2]->name) + "_" + std::to_string(node->src[2]->view_offs); std::string src2_name = std::string(node->src[2]->name); inputs[src2_name] = node->src[2]; m_input_names.push_back(src2_name); - m_node_op_name[src2_name] = ggml_op_name(node->op); m_op_node_name.emplace_back(src2_name, ggml_op_name(node->op)); } break; @@ -423,12 +403,6 @@ std::vector GgmlOvDecoder::get_input_names() const { return m_input_names; } -const std::string& GgmlOvDecoder::get_node_op_name(const std::string& name) const { - auto it = m_node_op_name.find(name); - static const std::string empty_str; - return (it != m_node_op_name.end()) ? 
it->second : empty_str; -} - std::string& GgmlOvDecoder::get_op_node_name(const std::string& key_name, const int index) { if (index == -1) { for (size_t i = 0; i < m_op_node_name.size(); ++i) { diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index 238f1d79b4..fc1d878409 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -71,7 +71,6 @@ public: return m_continuous; } - virtual const std::string& get_node_op_name(const std::string& name) const override; std::string& get_op_node_name(const std::string& key_name, const int index) override; virtual const std::vector>& get_params() const override; @@ -90,7 +89,6 @@ private: std::string m_op_name; mutable std::string m_name; bool m_continuous; - std::map m_node_op_name; std::vector> m_params; std::vector> m_op_node_name; }; diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index c44aa2568b..a0adc917e7 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -22,24 +22,35 @@ std::vector> get_ggml_graph_input_tensors(std std::string op_node_name = ggml_decoder->get_op_node_name(name, op_iter++); // auto node_op_name = ggml_decoder->get_node_op_name(name); auto input_data = ggml_decoder->get_input_ggml_tensor(name)->data; + auto check_if_contiguous = ggml_is_contiguous(ggml_decoder->get_input_ggml_tensor(name)); #ifdef GGML_OPENVINO_DEBUG printf("Subgraph input %d: %g\n", inp, *(double*)(input_data)); #endif ov::Tensor input_tensor; auto input_shape = ggml_decoder->get_input_shape(name).to_shape(); - // if (node_op_name == "CPY" && (input_shape[0] != 7)) { - // input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), {80000}, input_data); if (flag & op_node_name == "CONT" && input_shape[0] == 1 && input_shape[1] != 1) { std::vector input_stride = ggml_decoder->get_input_stride(name); ov::element::Type input_type = ggml_decoder->get_input_type(name); size_t element_size = input_type.size(); - // const size_t valid_elems = static_cast(ggml_decoder->get_input_shape(name).to_shape()[2]); const size_t num_rows = static_cast(ggml_decoder->get_input_shape(name).to_shape()[1]); const size_t dim2 = static_cast(ggml_decoder->get_input_shape(name).to_shape()[0]); size_t phys_stride = static_cast(input_stride[1]) / element_size; ov::Shape input_shape = { dim2, num_rows, phys_stride }; // {1, 7, 9216 } input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), input_shape, input_data); + } else if (op_node_name == "CPY" && (!check_if_contiguous || input_shape[2] == 1)) { //[TODO]: Temporarily determine whether the node corresponding to the input tensor of the Phi-3 model CPY is continuous + std::vector input_stride = ggml_decoder->get_input_stride(name); + ov::element::Type input_type = ggml_decoder->get_input_type(name); + size_t element_size = input_type.size(); + ov::Shape phys_shape; + static int iter = 0; + if (iter++ % 2 == 0) { + phys_shape = {1, input_shape[1], input_stride[2] / element_size}; + input_tensor = ov::Tensor(ov::element::f32, phys_shape, input_data); + } else { + phys_shape = {1, input_shape[1], input_stride[1] / element_size}; + input_tensor = ov::Tensor(ov::element::f16, phys_shape, input_data); + } } else { input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), ggml_decoder->get_input_shape(name).to_shape(), input_data); } @@ -105,7 +116,7 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c // Convert InputModel -> ov::Model std::shared_ptr 
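+ // The static `iter` above assumes the two CPY inputs of each node always
+ // arrive strictly alternating (f32 src0, then f16 src1). An editorial sketch,
+ // not in the patch, of a less stateful variant keyed on the declared element
+ // type instead:
+ // if (ggml_decoder->get_input_type(name) == ov::element::f32) {
+ //     input_tensor = ov::Tensor(ov::element::f32, {1, input_shape[1], input_stride[2] / element_size}, input_data);
+ // } else {
+ //     input_tensor = ov::Tensor(ov::element::f16, {1, input_shape[1], input_stride[1] / element_size}, input_data);
+ // }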
model = front_end->convert(input_model); - ov::save_model(model, "/home/user/zhan/merge_git_commits/llama.cpp-ov/001_model.xml"); + // ov::save_model(model, "/home/user/zhan/merge_git_commits/llama.cpp-ov/001_model.xml"); if (!model) { GGML_LOG_ERROR("Model is not converted \n"); @@ -117,7 +128,7 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c // Loading a model to the device ov::CompiledModel compiled_model = core.compile_model(model); - ov::save_model(compiled_model.get_runtime_model(), "/home/user/zhan/merge_git_commits/llama.cpp-ov/001_compile_model.xml"); + // ov::save_model(compiled_model.get_runtime_model(), "/home/user/zhan/merge_git_commits/llama.cpp-ov/001_compile_model.xml"); // Create infer request ov::InferRequest infer_request = compiled_model.create_infer_request(); From b14b49d5f6aaa704835d4f5eb2d8060dbf5d232f Mon Sep 17 00:00:00 2001 From: zhanmyz Date: Tue, 11 Mar 2025 15:16:40 +0800 Subject: [PATCH 040/254] Minor Update --- ggml/src/ggml-openvino.cpp | 12 ++++++------ ggml/src/ggml-openvino/ggml-decoder.cpp | 20 +++++++++++++------- 2 files changed, 19 insertions(+), 13 deletions(-) diff --git a/ggml/src/ggml-openvino.cpp b/ggml/src/ggml-openvino.cpp index 679b030dfa..4608019d9f 100644 --- a/ggml/src/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino.cpp @@ -813,7 +813,7 @@ void ggml_backend_openvino_dup_bytes(struct ggml_tensor *dst) { auto order_const = ov::op::v0::Constant::create(ov::element::i64, {order.size()}, order); auto transpose = std::make_shared(input_param, order_const); - ov::Shape target_shape = { dst->ne[2], dst->ne[1], dst->ne[0] }; // {1, 7, 3072} + ov::Shape target_shape = { static_cast(dst->ne[2]), static_cast(dst->ne[1]), static_cast(dst->ne[0]) }; // {1, 7, 3072} std::vector target_shape_vec = { static_cast(dst->ne[2]), static_cast(dst->ne[1]), static_cast(dst->ne[0]) }; @@ -866,7 +866,7 @@ void ggml_backend_openvino_cpy(struct ggml_tensor *dst) { std::shared_ptr model; if (ggml_is_contiguous(dst)) { // Contiguous Case: Flatten src and reshape to dst shape - ov::Shape flattened_shape = {ggml_nelements(src0)}; + ov::Shape flattened_shape = {static_cast(ggml_nelements(src0))}; auto flatten = std::make_shared( src_input, ov::op::v0::Constant::create(ov::element::i64, {1}, flattened_shape), false); @@ -1013,12 +1013,12 @@ static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backe // } else { for (int i = 0; i < cgraph->n_nodes; i++) { - if (std::find(permute_indices.begin(), permute_indices.end(), i) != permute_indices.end()) { - ggml_backend_openvino_permute(cgraph->nodes[i]); + if (std::find(view_indices.begin(), view_indices.end(), i) != view_indices.end()) { + ggml_backend_openvino_view(cgraph->nodes[i]); // } else if (std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) != mul_mat_indices.end()) { // ggml_backend_openvino_mul_mat(cgraph->nodes[i]); - } else if (std::find(view_indices.begin(), view_indices.end(), i) != view_indices.end()) { - ggml_backend_openvino_view(cgraph->nodes[i]); + // } else if (std::find(permute_indices.begin(), permute_indices.end(), i) != permute_indices.end()) { + // ggml_backend_openvino_permute(cgraph->nodes[i]); // } else if (std::find(cont_indices.begin(), cont_indices.end(), i) != cont_indices.end()) { // ggml_backend_openvino_dup_bytes(cgraph->nodes[i]); // } else if (std::find(transpose_indices.begin(), transpose_indices.end(), i) != transpose_indices.end()) { diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp 
b/ggml/src/ggml-openvino/ggml-decoder.cpp index 218c53f09f..55a82b0580 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -231,7 +231,7 @@ void ggml_graph_op_print(const struct ggml_cgraph * cgraph) { file << "n_nodes = " << cgraph->n_nodes << "\n"; file << " " << std::setw(3) << "nodes" << std::setw(15) << "shape" - << std::setw(16) << "op" + << std::setw(20) << "op" << std::setw(20) << "name" << std::setw(3) << " " << std::setw(50) << "stride" @@ -242,21 +242,24 @@ void ggml_graph_op_print(const struct ggml_cgraph * cgraph) { file << " - " << std::setw(3) << i << ": [ " << std::setw(5) << node->ne[0] << ", " << std::setw(5) << node->ne[1] << ", " - << std::setw(5) << node->ne[2] << "] " + << std::setw(5) << node->ne[2] << ", " + << std::setw(5) << node->ne[3] << "] " << std::left << std::setw(20) << ggml_op_name(node->op) << std::right << " " << std::left << std::setw(44) << node->name << std::right << ((node->flags & GGML_TENSOR_FLAG_PARAM) ? "x" : node->grad ? "g" : " ") << std::setw(2) << "[ " << std::setw(0) << node->nb[0] << ", " << std::setw(5) << node->nb[1] << ", " - << std::setw(5) << node->nb[2] << "] " + << std::setw(5) << node->nb[2] << ", " + << std::setw(5) << node->nb[3] << "] " << "\n"; if (node->src[0]) { file << std::setw(10) << " [ " << std::setw(5) << node->src[0]->ne[0] << ", " << std::setw(5) << node->src[0]->ne[1] << ", " - << std::setw(5) << node->src[0]->ne[2] << "] " + << std::setw(5) << node->src[0]->ne[2] << ", " + << std::setw(5) << node->src[0]->ne[3] << "] " << std::setw(12) << "0: " << std::left << std::setw(12) << ggml_op_name(node->src[0]->op) << std::right; // // Custom logic to handle '\000' @@ -269,14 +272,16 @@ void ggml_graph_op_print(const struct ggml_cgraph * cgraph) { << std::setw(16) << "[ " << std::setw(0) << node->src[0]->nb[0] << ", " << std::setw(5) << node->src[0]->nb[1] << ", " - << std::setw(5) << node->src[0]->nb[2] << "] " + << std::setw(5) << node->src[0]->nb[2] << ", " + << std::setw(5) << node->src[0]->nb[3] << "] " << "\n"; } if (node->src[1]) { file << std::setw(10) << " [ " << std::setw(5) << node->src[1]->ne[0] << ", " << std::setw(5) << node->src[1]->ne[1] << ", " - << std::setw(5) << node->src[1]->ne[2] << "] " + << std::setw(5) << node->src[1]->ne[2] << ", " + << std::setw(5) << node->src[1]->ne[3] << "] " << std::setw(12) << "1: " << std::left << std::setw(12) << ggml_op_name(node->src[1]->op) << std::right; // // Custom logic to handle '\000' @@ -289,7 +294,8 @@ void ggml_graph_op_print(const struct ggml_cgraph * cgraph) { << std::setw(16) << "[ " << std::setw(0) << node->src[1]->nb[0] << ", " << std::setw(5) << node->src[1]->nb[1] << ", " - << std::setw(5) << node->src[1]->nb[2] << "] " + << std::setw(5) << node->src[1]->nb[2] << ", " + << std::setw(5) << node->src[1]->nb[3] << "] " << "\n"; } } From 19ec9b6bf59bc5617ebfa39d835a1b4b047fe1e0 Mon Sep 17 00:00:00 2001 From: zhanmyz Date: Wed, 12 Mar 2025 21:43:23 +0800 Subject: [PATCH 041/254] Try to add VIEW node to OV Frontend and have some issues that need to be dealt with --- ggml/src/ggml-openvino.cpp | 236 ++++++++++++++++++++---- ggml/src/ggml-openvino/decoder.h | 2 + ggml/src/ggml-openvino/ggml-decoder.cpp | 27 ++- ggml/src/ggml-openvino/ggml-decoder.h | 2 + 4 files changed, 232 insertions(+), 35 deletions(-) diff --git a/ggml/src/ggml-openvino.cpp b/ggml/src/ggml-openvino.cpp index 4608019d9f..d2a21511dd 100644 --- a/ggml/src/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino.cpp @@ -647,36 +647,169 @@ void 
ggml_backend_openvino_reshape(ggml_tensor *dst) { } void ggml_backend_openvino_view(ggml_tensor *dst) { + + /* + // Case 1: Set the output tensor shape to the same shape as the input tensor [1, 7, 9216], for the next CONT node operator + if (dst->ne[0] > dst->ne[1] && (dst->ne[0] * dst->nb[0] != dst->nb[1]) && dst->ne[2] == 1) { + // if (dst->view_offs == 0) { + // return; + // } + ov::Core core; + ov::Shape input_shape{ static_cast(dst->src[0]->ne[2]), static_cast(dst->src[0]->ne[1]), static_cast(dst->src[0]->ne[0])}; + ov::Shape out_shape{ static_cast(dst->ne[2]), static_cast(dst->ne[1]), static_cast(dst->ne[0])}; + + auto input_param = std::make_shared(ov::element::f32, input_shape); + + // auto new_shape_node = ov::op::v0::Constant::create(ov::element::i64, + // ov::Shape{input_shape.size()}, + // std::vector(input_shape.begin(), input_shape.end())); + // auto res = std::make_shared(input_param, new_shape_node, false); + + int64_t split_addr = dst->view_offs / dst->nb[0]; + std::vector begin = { 0, 0, split_addr }; + std::vector end = { static_cast(dst->src[0]->ne[2]), + static_cast(dst->src[0]->ne[1]), + split_addr + static_cast(dst->ne[0]) }; + std::vector strides = { 1, 1, 1 }; + + auto begin_const = ov::op::v0::Constant::create(ov::element::i64, { begin.size() }, begin); + auto end_const = ov::op::v0::Constant::create(ov::element::i64, { end.size() }, end); + auto strides_const = ov::op::v0::Constant::create(ov::element::i64, { strides.size() }, strides); + + std::vector begin_mask = {0, 0, 0}; + std::vector end_mask = {0, 0, 0}; + auto slice = std::make_shared( + input_param, + begin_const, + end_const, + strides_const, + begin_mask, + end_mask + ); + + auto model = std::make_shared(ov::OutputVector{ slice }, + ov::ParameterVector{ input_param }); + + auto compiled_model = core.compile_model(model, "CPU"); + + ov::InferRequest infer_request = compiled_model.create_infer_request(); + + ov::Tensor input_tensor(ov::element::f32, input_shape, dst->src[0]->data); + infer_request.set_input_tensor(0, input_tensor); + + ov::Tensor output_tensor(ov::element::f32, out_shape, dst->data); + infer_request.set_output_tensor(0, output_tensor); + + infer_request.infer(); + } + */ + + + /* + // Case 2: Slice contiguous input tensor [98304, 1, 1] to a contiguous output tensor [ 21504, 1, 1] + if (ggml_is_contiguous(dst) && dst->ne[1] == 1 && (dst->ne[0] * dst->nb[0] == dst->nb[1])) { + ov::Core core; + ov::Shape input_shape = { static_cast(dst->src[0]->ne[2]), + static_cast(dst->src[0]->ne[1]), + static_cast(dst->src[0]->ne[0])}; + ov::Shape output_shape = { static_cast(dst->ne[2]), + static_cast(dst->ne[1]), + static_cast(dst->ne[0])}; + auto input_param = std::make_shared(ov::element::f16, input_shape); + + + std::vector begin = { 0, 0, 0 }; + std::vector end = { static_cast(dst->ne[2]), + static_cast(dst->ne[1]), + static_cast(dst->ne[0]) }; + std::vector strides = { 1, 1, 1 }; + + auto begin_const = ov::op::v0::Constant::create(ov::element::i64, { begin.size() }, begin); + auto end_const = ov::op::v0::Constant::create(ov::element::i64, { end.size() }, end); + auto strides_const = ov::op::v0::Constant::create(ov::element::i64, { strides.size() }, strides); + + std::vector begin_mask = {0, 0, 0}; + std::vector end_mask = {0, 0, 0}; + auto slice = std::make_shared( + input_param, + begin_const, + end_const, + strides_const, + begin_mask, + end_mask + ); + + std::shared_ptr model = std::make_shared(ov::OutputVector{ slice }, + ov::ParameterVector{ input_param }); + + auto compiled_model =
core.compile_model(model, "CPU"); + ov::InferRequest infer_request = compiled_model.create_infer_request(); + + ov::Tensor input_tensor(ov::element::f16, input_shape, dst->src[0]->data); + ov::Tensor output_tensor(ov::element::f16, output_shape, dst->data); + infer_request.set_input_tensor(0, input_tensor); + infer_request.set_output_tensor(0, output_tensor); + + infer_request.infer(); + } + */ + + /* + // Case 3: Reshape the input tensor [1, 1, 98304] to output tensor [1, 3072, 32](Physical shape) + if (dst->ne[0] < dst->ne[1] && dst->ne[2] == 1) { + ov::Core core; + ov::Shape input_shape = { static_cast(dst->src[0]->ne[2]), + static_cast(dst->src[0]->ne[1]), + static_cast(dst->src[0]->ne[0])}; + ov::Shape output_shape = { static_cast(dst->nb[2]), + static_cast(dst->ne[1]), + static_cast(dst->nb[1] / dst->nb[0])}; + auto input_param = std::make_shared(ov::element::f16, input_shape); + + auto new_shape_node = ov::op::v0::Constant::create(ov::element::i64, + ov::Shape{output_shape.size()}, + std::vector(output_shape.begin(), output_shape.end())); + auto res = std::make_shared(input_param, new_shape_node, false); + + std::shared_ptr model = std::make_shared(ov::OutputVector{res}, + ov::ParameterVector{input_param}); + auto compiled_model = core.compile_model(model, "CPU"); + ov::InferRequest infer_request = compiled_model.create_infer_request(); + + ov::Tensor input_tensor(ov::element::f16, input_shape, dst->src[0]->data); + ov::Tensor output_tensor(ov::element::f16, output_shape, dst->data); + infer_request.set_input_tensor(0, input_tensor); + infer_request.set_output_tensor(0, output_tensor); + + infer_request.infer(); + } + */ + + /* + // Case 4: + if (dst->ne[0] != 1 && dst->ne[1] != 1 && dst->ne[2] !=1) { + + } + */ + ov::Core core; - ov::Shape tensor_shape{static_cast(dst->ne[3]), static_cast(dst->ne[2]), static_cast(dst->ne[1]), static_cast(dst->ne[0])}; - - // auto param = std::make_shared(ov::element::f32, tensor_shape); - auto param = std::make_shared(ov::element::f16, tensor_shape); - - auto reshaped = std::make_shared(param, - ov::op::v0::Constant::create(ov::element::i64, { tensor_shape.size() }, tensor_shape), - false); - - auto model = std::make_shared(ov::NodeVector{reshaped}, ov::ParameterVector{param}); - // auto model = std::make_shared(ov::NodeVector{param}, ov::ParameterVector{param}); - // ov::save_model(model, "/home/user/zhan/merge_git_commits/llama.cpp-ov/003_backend_view_model.xml"); + ov::Shape input_shape{static_cast(dst->src[0]->ne[2]), static_cast(dst->src[0]->ne[1]), static_cast(dst->src[0]->ne[0])}; + // ov::Shape output_shape{static_cast(dst->ne[2]), static_cast(dst->ne[1]), static_cast(dst->ne[0])}; + auto input_param = std::make_shared(ov::element::f32, input_shape); + std::shared_ptr model = std::make_shared(ov::OutputVector{input_param}, + ov::ParameterVector{input_param}); auto compiled_model = core.compile_model(model, "CPU"); - ov::InferRequest infer_request = compiled_model.create_infer_request(); - // ov::Tensor input_tensor(ov::element::f32, tensor_shape, dst->data); - ov::Tensor input_tensor(ov::element::f16, tensor_shape, dst->data); - // infer_request.set_tensor(param, input_tensor); + ov::Tensor input_tensor(ov::element::f32, input_shape, dst->src[0]->data); + // ov::Tensor output_tensor(ov::element::f32, input_shape, dst->data); infer_request.set_input_tensor(0, input_tensor); - - // ov::Tensor output_tensor(ov::element::f32, tensor_shape, dst->data); - ov::Tensor output_tensor(ov::element::f16, tensor_shape, dst->data); - 
infer_request.set_output_tensor(0, output_tensor);
+    // infer_request.set_output_tensor(0, output_tensor);
 
     infer_request.infer();
-    // auto output_tensor = infer_request.get_output_tensor(0);
-    // dst->data = output_tensor.data();
+
+    GGML_UNUSED(dst);
 }
 
 void ggml_backend_openvino_dup_bytes(struct ggml_tensor *dst) {
@@ -747,12 +880,20 @@ void ggml_backend_openvino_dup_bytes(struct ggml_tensor *dst) {
     const size_t dim2 = static_cast(src0->ne[2]); // 1
 
     size_t phys_stride = static_cast(src0->nb[1]) / element_size; // 9216
+    // size_t phys_stride = static_cast(src0->ne[0]); // 3072
     ov::Shape input_shape = { dim2, num_rows, phys_stride }; // e.g. {1, 7, 9216 }
     ov::Shape logical_shape = { dim2, num_rows, valid_elems }; // {1, 7, 3072}
+    // std::cout << "CONT input shape: " << input_shape << std::endl;
 
     auto input_param = std::make_shared(ov::element::f32, input_shape);
 
+    // int64_t split_addr = dst->src[0]->view_offs / dst->src[0]->nb[0];
+    // std::vector begin = { 0, 0, split_addr };
+    // std::vector end = { static_cast(dim2),
+    //                     static_cast(num_rows),
+    //                     split_addr + static_cast(valid_elems) };
+
     std::vector begin = { 0, 0, 0 };
     std::vector end = { static_cast(dim2),
                         static_cast(num_rows),
@@ -838,6 +979,35 @@ void ggml_backend_openvino_dup_bytes(struct ggml_tensor *dst) {
 }
 
 static void ggml_backend_openvino_transpose(ggml_tensor *dst) {
+    ov::Core core;
+    ov::Shape input_shape{static_cast(dst->src[0]->ne[2]), static_cast(dst->src[0]->ne[1]), static_cast(dst->src[0]->ne[0])};
+    ov::Shape output_shape{static_cast(dst->ne[2]), static_cast(dst->ne[1]), static_cast(dst->ne[0])};
+    auto input_param = std::make_shared(ov::element::f32, input_shape);
+
+    //auto res = std::make_shared(input_param, ov::op::v0::Constant::create(ov::element::i64, {3}, {0, 2, 1}));
+
+
+
+    auto new_shape_node = ov::op::v0::Constant::create(ov::element::i64,
+                                                       ov::Shape{output_shape.size()},
+                                                       std::vector(output_shape.begin(), output_shape.end()));
+    auto res = std::make_shared(input_param, new_shape_node, false);
+
+
+
+
+    std::shared_ptr model = std::make_shared(ov::OutputVector{res},
+                                             ov::ParameterVector{input_param});
+    auto compiled_model = core.compile_model(model, "CPU");
+    ov::InferRequest infer_request = compiled_model.create_infer_request();
+
+    ov::Tensor input_tensor(ov::element::f32, input_shape, dst->src[0]->data);
+    ov::Tensor output_tensor(ov::element::f32, output_shape, dst->data);
+    infer_request.set_input_tensor(0, input_tensor);
+    infer_request.set_output_tensor(0, output_tensor);
+
+    infer_request.infer();
+
     // NOP
     GGML_UNUSED(dst);
 }
@@ -1013,29 +1183,31 @@ static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backe
     // } else {
 
     for (int i = 0; i < cgraph->n_nodes; i++) {
-        if (std::find(view_indices.begin(), view_indices.end(), i) != view_indices.end()) {
-            ggml_backend_openvino_view(cgraph->nodes[i]);
+        if (std::find(permute_indices.begin(), permute_indices.end(), i) != permute_indices.end()) {
+            ggml_backend_openvino_permute(cgraph->nodes[i]);
         // } else if (std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) != mul_mat_indices.end()) {
         //     ggml_backend_openvino_mul_mat(cgraph->nodes[i]);
-        // } else if (std::find(permute_indices.begin(), permute_indices.end(), i) != permute_indices.end()) {
-        //     ggml_backend_openvino_permute(cgraph->nodes[i]);
+        // } else if (std::find(view_indices.begin(), view_indices.end(), i) != view_indices.end()) {
+        //     ggml_backend_openvino_view(cgraph->nodes[i]);
        // } else if (std::find(cont_indices.begin(), cont_indices.end(), i) !=
cont_indices.end()) { // ggml_backend_openvino_dup_bytes(cgraph->nodes[i]); - // } else if (std::find(transpose_indices.begin(), transpose_indices.end(), i) != transpose_indices.end()) { - // ggml_backend_openvino_transpose(cgraph->nodes[i]); + } else if (std::find(transpose_indices.begin(), transpose_indices.end(), i) != transpose_indices.end()) { + ggml_backend_openvino_transpose(cgraph->nodes[i]); // } else if (std::find(reshape_indices.begin(), reshape_indices.end(), i) != reshape_indices.end()) { // ggml_backend_openvino_reshape(cgraph->nodes[i]); - // } else if (std::find(cpy_indices.begin(), cpy_indices.end(), i) != cpy_indices.end()) { - // ggml_backend_openvino_cpy(cgraph->nodes[i]); + } else if (std::find(cpy_indices.begin(), cpy_indices.end(), i) != cpy_indices.end()) { + ggml_backend_openvino_cpy(cgraph->nodes[i]); } else { // Process a range of nodes with openvino_frontend_compute int start_index = i; while (i < cgraph->n_nodes + && std::find(permute_indices.begin(), permute_indices.end(), i) == permute_indices.end() // && std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) == mul_mat_indices.end() - && std::find(view_indices.begin(), view_indices.end(), i) == view_indices.end() + // && std::find(view_indices.begin(), view_indices.end(), i) == view_indices.end() // && std::find(cont_indices.begin(), cont_indices.end(), i) == cont_indices.end() // && std::find(reshape_indices.begin(), reshape_indices.end(), i) == reshape_indices.end() - // && std::find(cpy_indices.begin(), cpy_indices.end(), i) == cpy_indices.end() + && std::find(transpose_indices.begin(), transpose_indices.end(), i) == transpose_indices.end() + && std::find(cpy_indices.begin(), cpy_indices.end(), i) == cpy_indices.end() ) { i++; } diff --git a/ggml/src/ggml-openvino/decoder.h b/ggml/src/ggml-openvino/decoder.h index 584f16986c..e287f31e23 100644 --- a/ggml/src/ggml-openvino/decoder.h +++ b/ggml/src/ggml-openvino/decoder.h @@ -46,6 +46,8 @@ public: virtual element::Type get_output_type(const std::string& name) const = 0; + virtual int32_t* get_input_op_params(const std::string& name) const = 0; + virtual int32_t* get_output_op_params(const std::string& name) const = 0; virtual std::string& get_output_name(size_t index) const = 0; diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 55a82b0580..4483241481 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -138,11 +138,28 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapsrc[0]; outputs[node_name] = node; - m_input_names.push_back(node_name); - m_op_node_name.emplace_back(node_name, ggml_op_name(node->op)); + m_input_names.push_back(src0_name); + m_op_node_name.emplace_back(src0_name, ggml_op_name(node->op)); m_output_names.push_back(node_name); + + // ov::Shape input_shape = { static_cast(node->src[0]->ne[2]), + // static_cast(node->src[0]->ne[1]), + // static_cast(node->src[0]->ne[0])}; + // auto input_param = std::make_shared(ov::element::f32, input_shape); + // m_params.push_back(input_param); + + // if (node->ne[0] > node->ne[1] && (node->ne[0] * node->nb[0] != node->nb[1]) && node->ne[2] == 1) { + // m_continuous = false; + // } else { + // m_continuous = true; + + // } + // m_continuous = false; + + // [TODO]: multiple cases + break; } // SCALE @@ -467,6 +484,10 @@ ov::element::Type GgmlOvDecoder::get_output_type(const std::string& name) const return type; } +int32_t* GgmlOvDecoder::get_input_op_params(const std::string& name) const{ 
+    return m_inputs.at(name)->op_params;
+}
+
 int32_t* GgmlOvDecoder::get_output_op_params(const std::string& name) const{
     return m_outputs.at(name)->op_params;
 }
 
diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h
index fc1d878409..eac045d158 100644
--- a/ggml/src/ggml-openvino/ggml-decoder.h
+++ b/ggml/src/ggml-openvino/ggml-decoder.h
@@ -43,6 +43,8 @@ public:
 
     virtual ov::element::Type get_output_type(const std::string& name) const override;
 
+    virtual int32_t* get_input_op_params(const std::string& name) const override;
+
     virtual int32_t* get_output_op_params(const std::string& name) const override;
 
     virtual std::string& get_output_name(size_t index) const override;
 
From b02265a5072119cdbdb7ded26a7bb2e8dc26f273 Mon Sep 17 00:00:00 2001
From: zhanmyz
Date: Sat, 15 Mar 2025 19:32:40 +0800
Subject: [PATCH 042/254] 1. In the prompt-processing and first-token
 prediction stage, the PERMUTE node needs to be integrated into the OV
 Frontend. 2. In the latest-token prediction stage, the VIEW, CONT, and
 RESHAPE nodes need to be integrated into the OV Frontend.

---
 ggml/src/ggml-openvino.cpp              | 242 ++++--------------------
 ggml/src/ggml-openvino/ggml-decoder.cpp |  40 ++--
 ggml/src/ggml-openvino/utils.cpp        |  25 ++-
 3 files changed, 83 insertions(+), 224 deletions(-)

diff --git a/ggml/src/ggml-openvino.cpp b/ggml/src/ggml-openvino.cpp
index d2a21511dd..fd24356412 100644
--- a/ggml/src/ggml-openvino.cpp
+++ b/ggml/src/ggml-openvino.cpp
@@ -647,168 +647,6 @@ void ggml_backend_openvino_reshape(ggml_tensor *dst) {
 }
 
 void ggml_backend_openvino_view(ggml_tensor *dst) {
-
-    /*
-    // Case 1: Set the output tensor shape as the same shape of the input tensor [1, 7, 9216], for next CONT node operator
-    if (dst->ne[0] > dst->ne[1] && (dst->ne[0] * dst->nb[0] != dst->nb[1]) && dst->ne[2] == 1) {
-        // if (dst->view_offs == 0) {
-        //     return;
-        // }
-        ov::Core core;
-        ov::Shape input_shape{ static_cast(dst->src[0]->ne[2]), static_cast(dst->src[0]->ne[1]), static_cast(dst->src[0]->ne[0])};
-        ov::Shape out_shape{ static_cast(dst->ne[2]), static_cast(dst->ne[1]), static_cast(dst->ne[0])};
-
-        auto input_param = std::make_shared(ov::element::f32, input_shape);
-
-        // auto new_shape_node = ov::op::v0::Constant::create(ov::element::i64,
-        //                                                    ov::Shape{input_shape.size()},
-        //                                                    std::vector(input_shape.begin(), input_shape.end()));
-        // auto res = std::make_shared(input_param, new_shape_node, false);
-
-        int64_t split_addr = dst->view_offs / dst->nb[0];
-        std::vector begin = { 0, 0, split_addr };
-        std::vector end = { static_cast(dst->src[0]->ne[2]),
-                            static_cast(dst->src[0]->ne[1]),
-                            split_addr + static_cast(dst->ne[0]) };
-        std::vector strides = { 1, 1, 1 };
-
-        auto begin_const = ov::op::v0::Constant::create(ov::element::i64, { begin.size() }, begin);
-        auto end_const = ov::op::v0::Constant::create(ov::element::i64, { end.size() }, end);
-        auto strides_const = ov::op::v0::Constant::create(ov::element::i64, { strides.size() }, strides);
-
-        std::vector begin_mask = {0, 0, 0};
-        std::vector end_mask = {0, 0, 0};
-        auto slice = std::make_shared(
-            input_param,
-            begin_const,
-            end_const,
-            strides_const,
-            begin_mask,
-            end_mask
-        );
-
-        auto model = std::make_shared(ov::OutputVector{ slice },
-                                      ov::ParameterVector{ input_param });
-
-        auto compiled_model = core.compile_model(model, "CPU");
-
-        ov::InferRequest infer_request = compiled_model.create_infer_request();
-
-        ov::Tensor input_tensor(ov::element::f32, input_shape, dst->src[0]->data);
-        infer_request.set_input_tensor(0, input_tensor);
-
-        ov::Tensor
output_tensor(ov::element::f32, out_shape, dst->data);
-        infer_request.set_output_tensor(0, output_tensor);
-
-        infer_request.infer();
-    }
-    */
-
-
-    /*
-    // Case 2: Slice contiguous input tensor [98304, 1, 1] to contiguous output tensor [ 21504, 1, 1]
-    if (ggml_is_contiguous(dst) && dst->ne[1] == 1 && (dst->ne[0] * dst->nb[0] == dst->nb[1])) {
-        ov::Core core;
-        ov::Shape input_shape = { static_cast(dst->src[0]->ne[2]),
-                                  static_cast(dst->src[0]->ne[1]),
-                                  static_cast(dst->src[0]->ne[0])};
-        ov::Shape output_shape = { static_cast(dst->ne[2]),
-                                   static_cast(dst->ne[1]),
-                                   static_cast(dst->ne[0])};
-        auto input_param = std::make_shared(ov::element::f16, input_shape);
-
-
-        std::vector begin = { 0, 0, 0 };
-        std::vector end = { static_cast(dst->ne[2]),
-                            static_cast(dst->ne[1]),
-                            static_cast(dst->ne[0]) };
-        std::vector strides = { 1, 1, 1 };
-
-        auto begin_const = ov::op::v0::Constant::create(ov::element::i64, { begin.size() }, begin);
-        auto end_const = ov::op::v0::Constant::create(ov::element::i64, { end.size() }, end);
-        auto strides_const = ov::op::v0::Constant::create(ov::element::i64, { strides.size() }, strides);
-
-        std::vector begin_mask = {0, 0, 0};
-        std::vector end_mask = {0, 0, 0};
-        auto slice = std::make_shared(
-            input_param,
-            begin_const,
-            end_const,
-            strides_const,
-            begin_mask,
-            end_mask
-        );
-
-        std::shared_ptr model = std::make_shared(ov::OutputVector{ slice },
-                                                 ov::ParameterVector{ input_param });
-
-        auto compiled_model = core.compile_model(model, "CPU");
-        ov::InferRequest infer_request = compiled_model.create_infer_request();
-
-        ov::Tensor input_tensor(ov::element::f16, input_shape, dst->src[0]->data);
-        ov::Tensor output_tensor(ov::element::f16, output_shape, dst->data);
-        infer_request.set_input_tensor(0, input_tensor);
-        infer_request.set_output_tensor(0, output_tensor);
-
-        infer_request.infer();
-    }
-    */
-
-    /*
-    // Case 3: Reshape the input tensor [1, 1, 98304] to output tensor [1, 3072, 32](Physical shape)
-    if (dst->ne[0] < dst->ne[1] && dst->ne[2] == 1) {
-        ov::Core core;
-        ov::Shape input_shape = { static_cast(dst->src[0]->ne[2]),
-                                  static_cast(dst->src[0]->ne[1]),
-                                  static_cast(dst->src[0]->ne[0])};
-        ov::Shape output_shape = { static_cast(dst->nb[2]),
-                                   static_cast(dst->ne[1]),
-                                   static_cast(dst->nb[1] / dst->nb[0])};
-        auto input_param = std::make_shared(ov::element::f16, input_shape);
-
-        auto new_shape_node = ov::op::v0::Constant::create(ov::element::i64,
-                                                           ov::Shape{output_shape.size()},
-                                                           std::vector(output_shape.begin(), output_shape.end()));
-        auto res = std::make_shared(input_param, new_shape_node, false);
-
-        std::shared_ptr model = std::make_shared(ov::OutputVector{res},
-                                                 ov::ParameterVector{input_param});
-        auto compiled_model = core.compile_model(model, "CPU");
-        ov::InferRequest infer_request = compiled_model.create_infer_request();
-
-        ov::Tensor input_tensor(ov::element::f16, input_shape, dst->src[0]->data);
-        ov::Tensor output_tensor(ov::element::f16, output_shape, dst->data);
-        infer_request.set_input_tensor(0, input_tensor);
-        infer_request.set_output_tensor(0, output_tensor);
-
-        infer_request.infer();
-    }
-    */
-
-    /*
-    // Case 4:
-    if (dst->ne[0] != 1 && dst->ne[1] != 1 && dst->ne[2] !=1) {
-
-    }
-    */
-
-    ov::Core core;
-    ov::Shape input_shape{static_cast(dst->src[0]->ne[2]), static_cast(dst->src[0]->ne[1]), static_cast(dst->src[0]->ne[0])};
-    // ov::Shape output_shape{static_cast(dst->ne[2]), static_cast(dst->ne[1]), static_cast(dst->ne[0])};
-    auto input_param = std::make_shared(ov::element::f32, input_shape);
-
-
std::shared_ptr model = std::make_shared(ov::OutputVector{input_param}, - ov::ParameterVector{input_param}); - auto compiled_model = core.compile_model(model, "CPU"); - ov::InferRequest infer_request = compiled_model.create_infer_request(); - - ov::Tensor input_tensor(ov::element::f32, input_shape, dst->src[0]->data); - // ov::Tensor output_tensor(ov::element::f32, input_shape, dst->data); - infer_request.set_input_tensor(0, input_tensor); - // infer_request.set_output_tensor(0, output_tensor); - - infer_request.infer(); - GGML_UNUSED(dst); } @@ -823,7 +661,7 @@ void ggml_backend_openvino_dup_bytes(struct ggml_tensor *dst) { const size_t element_size = ggml_type_size(src0->type); // Case 1: Both tensors are contiguous - if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst)) { + if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && (src0->ne[0] * element_size == src0->nb[1])) { ov::Shape input_shape = { static_cast(src0->ne[2]), static_cast(src0->ne[1]), @@ -1152,6 +990,7 @@ static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backe std::vector permute_indices; std::vector mul_mat_indices; + std::vector add_indices; for (int i = 0; i < cgraph->n_nodes; i++) { if (cgraph->nodes[i]->op == GGML_OP_CONT) { @@ -1168,6 +1007,8 @@ static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backe permute_indices.push_back(i); } else if (cgraph->nodes[i]->op == GGML_OP_MUL_MAT) { mul_mat_indices.push_back(i); + } else if (cgraph->nodes[i]->op == GGML_OP_ADD) { + add_indices.push_back(i); } } @@ -1177,48 +1018,49 @@ static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backe bool prompt_process_flag = true; if (cgraph->nodes[0]->ne[1] == 1) { prompt_process_flag = false; - } - // int end_node = cgraph->n_nodes - 1; - // openvino_frontend_compute(backend, cgraph, 0, end_node, prompt_process_flag); - // } else { - - for (int i = 0; i < cgraph->n_nodes; i++) { - if (std::find(permute_indices.begin(), permute_indices.end(), i) != permute_indices.end()) { - ggml_backend_openvino_permute(cgraph->nodes[i]); - // } else if (std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) != mul_mat_indices.end()) { - // ggml_backend_openvino_mul_mat(cgraph->nodes[i]); - // } else if (std::find(view_indices.begin(), view_indices.end(), i) != view_indices.end()) { - // ggml_backend_openvino_view(cgraph->nodes[i]); - // } else if (std::find(cont_indices.begin(), cont_indices.end(), i) != cont_indices.end()) { - // ggml_backend_openvino_dup_bytes(cgraph->nodes[i]); - } else if (std::find(transpose_indices.begin(), transpose_indices.end(), i) != transpose_indices.end()) { - ggml_backend_openvino_transpose(cgraph->nodes[i]); - // } else if (std::find(reshape_indices.begin(), reshape_indices.end(), i) != reshape_indices.end()) { - // ggml_backend_openvino_reshape(cgraph->nodes[i]); - } else if (std::find(cpy_indices.begin(), cpy_indices.end(), i) != cpy_indices.end()) { - ggml_backend_openvino_cpy(cgraph->nodes[i]); - } else { - // Process a range of nodes with openvino_frontend_compute - int start_index = i; - while (i < cgraph->n_nodes - && std::find(permute_indices.begin(), permute_indices.end(), i) == permute_indices.end() - // && std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) == mul_mat_indices.end() - // && std::find(view_indices.begin(), view_indices.end(), i) == view_indices.end() - // && std::find(cont_indices.begin(), cont_indices.end(), i) == cont_indices.end() - // && std::find(reshape_indices.begin(), reshape_indices.end(), i) 
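Both branches of the loop added above follow the same partitioning pattern, so a condensed sketch may make the control flow easier to follow. Here handled_natively() and run_native_op() are stand-ins for the chains of std::find checks against the per-op index vectors and the corresponding ggml_backend_openvino_* calls; they are not real functions in the backend:

// Condensed sketch of the node-partitioning loop in graph_compute.
for (int i = 0; i < cgraph->n_nodes; i++) {
    if (handled_natively(i)) {
        run_native_op(cgraph->nodes[i]);   // e.g. ggml_backend_openvino_view
    } else {
        int start_index = i;
        while (i < cgraph->n_nodes && !handled_natively(i)) {
            i++;                           // extend the range as far as possible
        }
        if (start_index < i) {
            // hand the contiguous range [start_index, i - 1] to the OV frontend
            openvino_frontend_compute(backend, cgraph, start_index, --i, prompt_process_flag);
        }
    }
}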
== reshape_indices.end() - && std::find(transpose_indices.begin(), transpose_indices.end(), i) == transpose_indices.end() - && std::find(cpy_indices.begin(), cpy_indices.end(), i) == cpy_indices.end() - ) { - i++; + // int end_node = cgraph->n_nodes - 1; + // openvino_frontend_compute(backend, cgraph, 0, end_node, prompt_process_flag); + for (int i = 0; i < cgraph->n_nodes; i++) { + if (std::find(view_indices.begin(), view_indices.end(), i) != view_indices.end()) { + ggml_backend_openvino_view(cgraph->nodes[i]); + } else if (std::find(cont_indices.begin(), cont_indices.end(), i) != cont_indices.end()) { + ggml_backend_openvino_dup_bytes(cgraph->nodes[i]); + } else if (std::find(reshape_indices.begin(), reshape_indices.end(), i) != reshape_indices.end()) { + ggml_backend_openvino_reshape(cgraph->nodes[i]); + } else { + // Process a range of nodes with openvino_frontend_compute + int start_index = i; + while (i < cgraph->n_nodes + && std::find(view_indices.begin(), view_indices.end(), i) == view_indices.end() + && std::find(cont_indices.begin(), cont_indices.end(), i) == cont_indices.end() + && std::find(reshape_indices.begin(), reshape_indices.end(), i) == reshape_indices.end() + ) { + i++; + } + if (start_index < i) { + openvino_frontend_compute(backend, cgraph, start_index, --i, prompt_process_flag); + } } - if (start_index < i) { - openvino_frontend_compute(backend, cgraph, start_index, --i, prompt_process_flag); + } + } else { + for (int i = 0; i < cgraph->n_nodes; i++) { + if (std::find(permute_indices.begin(), permute_indices.end(), i) != permute_indices.end()) { + ggml_backend_openvino_permute(cgraph->nodes[i]); + } else { + // Process a range of nodes with openvino_frontend_compute + int start_index = i; + while (i < cgraph->n_nodes + && std::find(permute_indices.begin(), permute_indices.end(), i) == permute_indices.end() + ) { + i++; + } + if (start_index < i) { + openvino_frontend_compute(backend, cgraph, start_index, --i, prompt_process_flag); + } } } } - // } - return GGML_STATUS_SUCCESS; GGML_UNUSED(backend); diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 4483241481..d91338127a 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -26,7 +26,9 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapsrc[0]) && ggml_is_contiguous(node)) { + if (ggml_is_contiguous(node->src[0]) + && ggml_is_contiguous(node) + && (node->src[0]->ne[0] * node->src[0]->nb[0] == node->src[0]->nb[1])) { inputs[src0_name] = node->src[0]; outputs[node_name] = node; m_input_names.push_back(src0_name); @@ -112,22 +114,31 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapop)); m_output_names.push_back(node_name); - int src0_elem_size = ggml_type_size(node->src[0]->type); - int src1_elem_size = ggml_type_size(node->src[1]->type); + // int src0_elem_size = ggml_type_size(node->src[0]->type); + // int src1_elem_size = ggml_type_size(node->src[1]->type); - int src0_logical_rows = node->src[0]->ne[1]; - int src1_logical_rows = node->src[1]->ne[1]; + // int src0_logical_rows = node->src[0]->ne[1]; + // int src1_logical_rows = node->src[1]->ne[1]; - int src0_phys_cols = node->src[0]->nb[0] / src0_elem_size; - int src0_phys_rows = src0_logical_rows; + // int src0_phys_cols = node->src[0]->nb[0] / src0_elem_size; + // int src0_phys_rows = src0_logical_rows; - int src1_phys_cols = node->src[1]->nb[1] / src1_elem_size; - int src1_phys_rows = src1_logical_rows; - ov::Shape src0_phys_shape = 
{1, static_cast(src0_phys_rows), static_cast(src0_phys_cols) }; - ov::Shape src1_phys_shape = {1, static_cast(src1_phys_rows), static_cast(src1_phys_cols) }; - auto input0_param = std::make_shared(ov::element::f32, src0_phys_shape); - auto input1_param = std::make_shared(ov::element::f16, src1_phys_shape); + // int src1_phys_cols = node->src[1]->nb[1] / src1_elem_size; + // int src1_phys_rows = src1_logical_rows; + // ov::Shape src0_phys_shape = {1, static_cast(src0_phys_rows), static_cast(src0_phys_cols) }; + // ov::Shape src1_phys_shape = {1, static_cast(src1_phys_rows), static_cast(src1_phys_cols) }; + // auto input0_param = std::make_shared(ov::element::f32, src0_phys_shape); + // auto input1_param = std::make_shared(ov::element::f16, src1_phys_shape); + // m_params.push_back(input0_param); + // m_params.push_back(input1_param); + + ov::Shape input0_shape = { static_cast(node->src[0]->ne[2]), + static_cast(node->src[0]->ne[1]), + static_cast(node->src[0]->ne[0])}; + auto input0_param = std::make_shared(ov::element::f32, input0_shape); m_params.push_back(input0_param); + ov::Shape input1_shape = { 1, 1, static_cast(node->src[1]->nb[2] / node->src[1]->nb[0])}; + auto input1_param = std::make_shared(ov::element::f16, input1_shape); m_params.push_back(input1_param); m_continuous = false; @@ -147,7 +158,8 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::map(node->src[0]->ne[2]), // static_cast(node->src[0]->ne[1]), // static_cast(node->src[0]->ne[0])}; - // auto input_param = std::make_shared(ov::element::f32, input_shape); + // auto type = get_input_type(src0_name); + // auto input_param = std::make_shared(type, input_shape); // m_params.push_back(input_param); // if (node->ne[0] > node->ne[1] && (node->ne[0] * node->nb[0] != node->nb[1]) && node->ne[2] == 1) { diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index a0adc917e7..b8315a0013 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -27,12 +27,12 @@ std::vector> get_ggml_graph_input_tensors(std printf("Subgraph input %d: %g\n", inp, *(double*)(input_data)); #endif ov::Tensor input_tensor; - auto input_shape = ggml_decoder->get_input_shape(name).to_shape(); + ov::Shape input_shape = ggml_decoder->get_input_shape(name).to_shape(); - if (flag & op_node_name == "CONT" && input_shape[0] == 1 && input_shape[1] != 1) { - std::vector input_stride = ggml_decoder->get_input_stride(name); - ov::element::Type input_type = ggml_decoder->get_input_type(name); - size_t element_size = input_type.size(); + ov::element::Type input_type = ggml_decoder->get_input_type(name); + size_t element_size = input_type.size(); + std::vector input_stride = ggml_decoder->get_input_stride(name); + if (op_node_name == "CONT" && input_shape[0] == 1 && (input_shape[1] != 1 && flag || input_shape[2]*element_size!=input_stride[1])) { const size_t num_rows = static_cast(ggml_decoder->get_input_shape(name).to_shape()[1]); const size_t dim2 = static_cast(ggml_decoder->get_input_shape(name).to_shape()[0]); size_t phys_stride = static_cast(input_stride[1]) / element_size; @@ -42,14 +42,14 @@ std::vector> get_ggml_graph_input_tensors(std std::vector input_stride = ggml_decoder->get_input_stride(name); ov::element::Type input_type = ggml_decoder->get_input_type(name); size_t element_size = input_type.size(); - ov::Shape phys_shape; + // ov::Shape phys_shape; static int iter = 0; if (iter++ % 2 == 0) { - phys_shape = {1, input_shape[1], input_stride[2] / element_size}; - input_tensor = 
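The CONT input handling above rebuilds a tensor's physical shape from its byte strides before wrapping the raw data in an ov::Tensor. A small sketch of the idea, using the Phi-3 values quoted in the comments ({1, 7, 3072} logical rows stored with a 9216-element row stride); the helper name and its arguments are illustrative only:

#include <openvino/openvino.hpp>

// Sketch: wrap a non-contiguous ggml row view, logically {1, 7, 3072} f32,
// whose consecutive rows are nb[1] bytes apart, using its physical shape.
ov::Tensor wrap_padded_rows(void * src_data, size_t nb1_bytes) {
    const size_t element_size = sizeof(float);
    const size_t phys_stride  = nb1_bytes / element_size;  // e.g. 9216
    ov::Shape physical_shape  = { 1, 7, phys_stride };     // what memory actually holds
    // Wrapping the raw pointer with the physical shape is zero-copy; a
    // StridedSlice inside the model then extracts the logical {1, 7, 3072}.
    return ov::Tensor(ov::element::f32, physical_shape, src_data);
}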
ov::Tensor(ov::element::f32, phys_shape, input_data); + // phys_shape = {1, input_shape[1], input_stride[2] / element_size}; + input_tensor = ov::Tensor(ov::element::f32, input_shape, input_data); } else { - phys_shape = {1, input_shape[1], input_stride[1] / element_size}; - input_tensor = ov::Tensor(ov::element::f16, phys_shape, input_data); + ov::Shape flat_shape = {1, 1, input_stride[0] / element_size}; + input_tensor = ov::Tensor(ov::element::f16, flat_shape, input_data); } } else { input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), ggml_decoder->get_input_shape(name).to_shape(), input_data); @@ -161,6 +161,11 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c auto output_tensor = infer_request.get_output_tensor(i); // output_tensor.get_shape(); std::memcpy(output_tensors[output_names[i]], output_tensor.data(), output_tensor.get_byte_size()); + // std::cout << std::left << "[ " << std::setw(2) << i << " ]: " + // << "output_names: " << std::setw(20) << output_names[i] + // << " output data: " << std::setw(15) << ((float*)output_tensor.data())[0] + // << std::setw(15) << ((float*)output_tensor.data())[1] << std::right + // << std::endl; #ifdef GGML_OPENVINO_DEBUG printf("Output %s after: %g\n", output_names[i].c_str(), *(double*)(output_tensor.data())); #endif From 8020138406faac0dbf22c73543f4d060d06f08cc Mon Sep 17 00:00:00 2001 From: zhanmyz Date: Mon, 17 Mar 2025 17:00:43 +0800 Subject: [PATCH 043/254] add debug info --- ggml/src/ggml-openvino.cpp | 35 ++++++++++++++++++++++++++------ ggml/src/ggml-openvino/utils.cpp | 12 +++++++++-- 2 files changed, 39 insertions(+), 8 deletions(-) diff --git a/ggml/src/ggml-openvino.cpp b/ggml/src/ggml-openvino.cpp index fd24356412..2c83edaeb5 100644 --- a/ggml/src/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino.cpp @@ -419,6 +419,11 @@ void ggml_backend_openvino_rms_norm(ggml_tensor * dst) { } } +static void ggml_backend_openvino_permute(const struct ggml_tensor * dst) { + // NOP + GGML_UNUSED(dst); +} + // Extracting valid shapes std::vector get_effective_shape(const ggml_tensor * t) { std::vector shape; @@ -850,11 +855,6 @@ static void ggml_backend_openvino_transpose(ggml_tensor *dst) { GGML_UNUSED(dst); } -static void ggml_backend_openvino_permute(const struct ggml_tensor * dst) { - // NOP - GGML_UNUSED(dst); -} - void ggml_backend_openvino_cpy(struct ggml_tensor *dst) { const struct ggml_tensor *src0 = dst->src[0]; const struct ggml_tensor *src1 = dst->src[1]; @@ -984,6 +984,7 @@ static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backe std::vector cont_indices; std::vector reshape_indices; std::vector view_indices; + std::vector view_indices_prompt; std::vector cpy_indices; std::vector transpose_indices; @@ -997,8 +998,12 @@ static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backe cont_indices.push_back(i); } else if (cgraph->nodes[i]->op == GGML_OP_RESHAPE) { reshape_indices.push_back(i); + // } else if (cgraph->nodes[i]->op == GGML_OP_VIEW) { } else if (cgraph->nodes[i]->op == GGML_OP_VIEW) { view_indices.push_back(i); + if (cgraph->nodes[i]->ne[0] == 96) { + view_indices_prompt.push_back(i); + } } else if (cgraph->nodes[i]->op == GGML_OP_CPY) { cpy_indices.push_back(i); } else if (cgraph->nodes[i]->op == GGML_OP_TRANSPOSE) { @@ -1043,14 +1048,32 @@ static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backe } } } else { + // int end_node = cgraph->n_nodes - 1; + // openvino_frontend_compute(backend, cgraph, 0, end_node, 
prompt_process_flag); for (int i = 0; i < cgraph->n_nodes; i++) { - if (std::find(permute_indices.begin(), permute_indices.end(), i) != permute_indices.end()) { + if (std::find(add_indices.begin(), add_indices.end(), i) != add_indices.end()) { + ggml_backend_openvino_add_forward(cgraph->nodes[i]); + } else if (std::find(permute_indices.begin(), permute_indices.end(), i) != permute_indices.end()) { ggml_backend_openvino_permute(cgraph->nodes[i]); + // } else if (std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) != mul_mat_indices.end()) { + // ggml_backend_openvino_mul_mat(cgraph->nodes[i]); + // } else if (std::find(view_indices_prompt.begin(), view_indices_prompt.end(), i) != view_indices_prompt.end()) { + // ggml_backend_openvino_view(cgraph->nodes[i]); + // } else if (std::find(cont_indices.begin(), cont_indices.end(), i) != cont_indices.end()) { + // ggml_backend_openvino_dup_bytes(cgraph->nodes[i]); + // } else if (std::find(reshape_indices.begin(), reshape_indices.end(), i) != reshape_indices.end()) { + // ggml_backend_openvino_reshape(cgraph->nodes[i]); } else { // Process a range of nodes with openvino_frontend_compute int start_index = i; while (i < cgraph->n_nodes + && std::find(add_indices.begin(), add_indices.end(), i) == add_indices.end() && std::find(permute_indices.begin(), permute_indices.end(), i) == permute_indices.end() + // && std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) == mul_mat_indices.end() + // && std::find(view_indices.begin(), view_indices.end(), i) == view_indices.end() + // && (std::find(view_indices_prompt.begin(), view_indices_prompt.end(), i) == view_indices_prompt.end()) + // && std::find(cont_indices.begin(), cont_indices.end(), i) == cont_indices.end() + // && std::find(reshape_indices.begin(), reshape_indices.end(), i) == reshape_indices.end() ) { i++; } diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index b8315a0013..3909afbe2d 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -161,10 +161,18 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c auto output_tensor = infer_request.get_output_tensor(i); // output_tensor.get_shape(); std::memcpy(output_tensors[output_names[i]], output_tensor.data(), output_tensor.get_byte_size()); + auto tensor = ggml_decoder->get_output_ggml_tensor(output_names[i]); // std::cout << std::left << "[ " << std::setw(2) << i << " ]: " // << "output_names: " << std::setw(20) << output_names[i] - // << " output data: " << std::setw(15) << ((float*)output_tensor.data())[0] - // << std::setw(15) << ((float*)output_tensor.data())[1] << std::right + // << ", shape: " << std::setw(4) << tensor->ne[0] << " " << std::setw(4) << tensor->ne[1] << " " << tensor->ne[2] + // << ", address: " + // << std::setw(15) << tensor->data << " " + // << std::setw(15) << ((float*)output_tensor.data())[0] + // << std::setw(15) << ((float*)output_tensor.data())[1] + // << ", ne[0]: " + // << std::setw(15) << ((float*)output_tensor.data())[tensor->ne[0]] << std::right + // << std::setw(15) << ((float*)output_tensor.data())[tensor->ne[0] + 1] << std::right + // << std::right // << std::endl; #ifdef GGML_OPENVINO_DEBUG printf("Output %s after: %g\n", output_names[i].c_str(), *(double*)(output_tensor.data())); From 8ae700ae11345a1d8aa0c600ca639c4c8839da13 Mon Sep 17 00:00:00 2001 From: zhanmyz Date: Wed, 26 Mar 2025 16:31:52 +0800 Subject: [PATCH 044/254] Process Prompt and predict first token is OK --- ggml/src/ggml-openvino.cpp 
| 146 ++++++++++++----- ggml/src/ggml-openvino/ggml-decoder.cpp | 88 ++++++---- ggml/src/ggml-openvino/utils.cpp | 208 ++++++++++++++++++++---- 3 files changed, 340 insertions(+), 102 deletions(-) diff --git a/ggml/src/ggml-openvino.cpp b/ggml/src/ggml-openvino.cpp index 2c83edaeb5..a508aeea40 100644 --- a/ggml/src/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino.cpp @@ -652,6 +652,7 @@ void ggml_backend_openvino_reshape(ggml_tensor *dst) { } void ggml_backend_openvino_view(ggml_tensor *dst) { + GGML_UNUSED(dst); } @@ -985,8 +986,11 @@ static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backe std::vector reshape_indices; std::vector view_indices; std::vector view_indices_prompt; + std::vector view_split; std::vector cpy_indices; + std::vector cpy_split_16; + std::vector cpy_split_19; std::vector transpose_indices; std::vector permute_indices; @@ -1000,12 +1004,23 @@ static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backe reshape_indices.push_back(i); // } else if (cgraph->nodes[i]->op == GGML_OP_VIEW) { } else if (cgraph->nodes[i]->op == GGML_OP_VIEW) { + // if (cgraph->nodes[i]->src[0]->ne[0] == 98304) + // continue; view_indices.push_back(i); - if (cgraph->nodes[i]->ne[0] == 96) { + if (cgraph->nodes[i]->ne[0] == 32) { view_indices_prompt.push_back(i); } + if (i == 18) { + view_split.push_back(i); + } } else if (cgraph->nodes[i]->op == GGML_OP_CPY) { cpy_indices.push_back(i); + if (i == 16) { + cpy_split_16.push_back(i); + } + if (i == 19) { + cpy_split_19.push_back(i); + } } else if (cgraph->nodes[i]->op == GGML_OP_TRANSPOSE) { transpose_indices.push_back(i); } else if (cgraph->nodes[i]->op == GGML_OP_PERMUTE) { @@ -1023,10 +1038,18 @@ static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backe bool prompt_process_flag = true; if (cgraph->nodes[0]->ne[1] == 1) { prompt_process_flag = false; - // int end_node = cgraph->n_nodes - 1; - // openvino_frontend_compute(backend, cgraph, 0, end_node, prompt_process_flag); for (int i = 0; i < cgraph->n_nodes; i++) { - if (std::find(view_indices.begin(), view_indices.end(), i) != view_indices.end()) { + if (std::find(add_indices.begin(), add_indices.end(), i) != add_indices.end()) { + ggml_backend_openvino_add_forward(cgraph->nodes[i]); + } else if (std::find(transpose_indices.begin(), transpose_indices.end(), i) != transpose_indices.end()) { + ggml_backend_openvino_transpose(cgraph->nodes[i]); + } else if (std::find(cpy_indices.begin(), cpy_indices.end(), i) != cpy_indices.end()) { + ggml_backend_openvino_cpy(cgraph->nodes[i]); + } else if (std::find(permute_indices.begin(), permute_indices.end(), i) != permute_indices.end()) { + ggml_backend_openvino_permute(cgraph->nodes[i]); + // } else if (std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) != mul_mat_indices.end()) { + // ggml_backend_openvino_mul_mat(cgraph->nodes[i]); + } else if (std::find(view_indices.begin(), view_indices.end(), i) != view_indices.end()) { ggml_backend_openvino_view(cgraph->nodes[i]); } else if (std::find(cont_indices.begin(), cont_indices.end(), i) != cont_indices.end()) { ggml_backend_openvino_dup_bytes(cgraph->nodes[i]); @@ -1036,6 +1059,11 @@ static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backe // Process a range of nodes with openvino_frontend_compute int start_index = i; while (i < cgraph->n_nodes + && std::find(add_indices.begin(), add_indices.end(), i) == add_indices.end() + && std::find(transpose_indices.begin(), transpose_indices.end(), i) == 
transpose_indices.end() + && std::find(cpy_indices.begin(), cpy_indices.end(), i) == cpy_indices.end() + && std::find(permute_indices.begin(), permute_indices.end(), i) == permute_indices.end() + // && std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) == mul_mat_indices.end() && std::find(view_indices.begin(), view_indices.end(), i) == view_indices.end() && std::find(cont_indices.begin(), cont_indices.end(), i) == cont_indices.end() && std::find(reshape_indices.begin(), reshape_indices.end(), i) == reshape_indices.end() @@ -1047,41 +1075,85 @@ static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backe } } } - } else { // int end_node = cgraph->n_nodes - 1; // openvino_frontend_compute(backend, cgraph, 0, end_node, prompt_process_flag); - for (int i = 0; i < cgraph->n_nodes; i++) { - if (std::find(add_indices.begin(), add_indices.end(), i) != add_indices.end()) { - ggml_backend_openvino_add_forward(cgraph->nodes[i]); - } else if (std::find(permute_indices.begin(), permute_indices.end(), i) != permute_indices.end()) { - ggml_backend_openvino_permute(cgraph->nodes[i]); - // } else if (std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) != mul_mat_indices.end()) { - // ggml_backend_openvino_mul_mat(cgraph->nodes[i]); - // } else if (std::find(view_indices_prompt.begin(), view_indices_prompt.end(), i) != view_indices_prompt.end()) { - // ggml_backend_openvino_view(cgraph->nodes[i]); - // } else if (std::find(cont_indices.begin(), cont_indices.end(), i) != cont_indices.end()) { - // ggml_backend_openvino_dup_bytes(cgraph->nodes[i]); - // } else if (std::find(reshape_indices.begin(), reshape_indices.end(), i) != reshape_indices.end()) { - // ggml_backend_openvino_reshape(cgraph->nodes[i]); - } else { - // Process a range of nodes with openvino_frontend_compute - int start_index = i; - while (i < cgraph->n_nodes - && std::find(add_indices.begin(), add_indices.end(), i) == add_indices.end() - && std::find(permute_indices.begin(), permute_indices.end(), i) == permute_indices.end() - // && std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) == mul_mat_indices.end() - // && std::find(view_indices.begin(), view_indices.end(), i) == view_indices.end() - // && (std::find(view_indices_prompt.begin(), view_indices_prompt.end(), i) == view_indices_prompt.end()) - // && std::find(cont_indices.begin(), cont_indices.end(), i) == cont_indices.end() - // && std::find(reshape_indices.begin(), reshape_indices.end(), i) == reshape_indices.end() - ) { - i++; - } - if (start_index < i) { - openvino_frontend_compute(backend, cgraph, start_index, --i, prompt_process_flag); - } - } - } + // for (int i = 0; i < cgraph->n_nodes; i++) { + // // if (std::find(view_indices_prompt.begin(), view_indices_prompt.end(), i) != view_indices_prompt.end()) { + // // ggml_backend_openvino_view(cgraph->nodes[i]); + // if (std::find(view_indices.begin(), view_indices.end(), i) != view_indices.end()) { + // ggml_backend_openvino_view(cgraph->nodes[i]); + // } else if (std::find(cont_indices.begin(), cont_indices.end(), i) != cont_indices.end()) { + // ggml_backend_openvino_dup_bytes(cgraph->nodes[i]); + // } else if (std::find(reshape_indices.begin(), reshape_indices.end(), i) != reshape_indices.end()) { + // ggml_backend_openvino_reshape(cgraph->nodes[i]); + // } else { + // // Process a range of nodes with openvino_frontend_compute + // int start_index = i; + // while (i < cgraph->n_nodes + // // && (std::find(view_indices_prompt.begin(), view_indices_prompt.end(), i) == 
view_indices_prompt.end()) + // && std::find(view_indices.begin(), view_indices.end(), i) == view_indices.end() + // && std::find(cont_indices.begin(), cont_indices.end(), i) == cont_indices.end() + // && std::find(reshape_indices.begin(), reshape_indices.end(), i) == reshape_indices.end() + // ) { + // i++; + // } + // if (start_index < i) { + // openvino_frontend_compute(backend, cgraph, start_index, --i, prompt_process_flag); + // } + // } + // } + } else { + int end_node = cgraph->n_nodes - 1; + openvino_frontend_compute(backend, cgraph, 0, end_node, prompt_process_flag); + // for (int i = 0; i < cgraph->n_nodes; i++) { + // if (std::find(add_indices.begin(), add_indices.end(), i) != add_indices.end()) { + // ggml_backend_openvino_add_forward(cgraph->nodes[i]); + // // } else if (std::find(transpose_indices.begin(), transpose_indices.end(), i) != transpose_indices.end()) { + // // ggml_backend_openvino_transpose(cgraph->nodes[i]); + // // } else if (std::find(cpy_indices.begin(), cpy_indices.end(), i) != cpy_indices.end()) { + // // ggml_backend_openvino_cpy(cgraph->nodes[i]); + // // } else if (std::find(permute_indices.begin(), permute_indices.end(), i) != permute_indices.end()) { + // // ggml_backend_openvino_permute(cgraph->nodes[i]); + // // } else if (std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) != mul_mat_indices.end()) { + // // ggml_backend_openvino_mul_mat(cgraph->nodes[i]); + // // } else if (std::find(view_indices_prompt.begin(), view_indices_prompt.end(), i) != view_indices_prompt.end()) { + // // ggml_backend_openvino_view(cgraph->nodes[i]); + // // } else if (std::find(view_indices.begin(), view_indices.end(), i) != view_indices.end()) { + // // ggml_backend_openvino_view(cgraph->nodes[i]); + // // } else if (std::find(cont_indices.begin(), cont_indices.end(), i) != cont_indices.end()) { + // // ggml_backend_openvino_dup_bytes(cgraph->nodes[i]); + // // } else if (std::find(reshape_indices.begin(), reshape_indices.end(), i) != reshape_indices.end()) { + // // ggml_backend_openvino_reshape(cgraph->nodes[i]); + // // }else if (std::find(view_split.begin(), view_split.end(), i) != view_split.end()) { + // // ggml_backend_openvino_view(cgraph->nodes[i]); + // // }else if (std::find(cpy_split_16.begin(), cpy_split_16.end(), i) != cpy_split_16.end()) { + // // ggml_backend_openvino_cpy(cgraph->nodes[i]); + // // }else if (std::find(cpy_split_19.begin(), cpy_split_19.end(), i) != cpy_split_19.end()) { + // // ggml_backend_openvino_cpy(cgraph->nodes[i]); + // } else { + // // Process a range of nodes with openvino_frontend_compute + // int start_index = i; + // while (i < cgraph->n_nodes + // && std::find(add_indices.begin(), add_indices.end(), i) == add_indices.end() + // // && std::find(transpose_indices.begin(), transpose_indices.end(), i) == transpose_indices.end() + // // && std::find(cpy_indices.begin(), cpy_indices.end(), i) == cpy_indices.end() + // // && std::find(permute_indices.begin(), permute_indices.end(), i) == permute_indices.end() + // // && std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) == mul_mat_indices.end() + // // && (std::find(view_indices_prompt.begin(), view_indices_prompt.end(), i) == view_indices_prompt.end()) + // // && std::find(view_indices.begin(), view_indices.end(), i) == view_indices.end() + // // && std::find(cont_indices.begin(), cont_indices.end(), i) == cont_indices.end() + // // && std::find(reshape_indices.begin(), reshape_indices.end(), i) == reshape_indices.end() + // // && std::find(view_split.begin(), 
view_split.end(), i) == view_split.end() + // // && std::find(cpy_split_16.begin(), cpy_split_16.end(), i) == cpy_split_16.end() + // // && std::find(cpy_split_19.begin(), cpy_split_19.end(), i) == cpy_split_19.end() + // ) { + // i++; + // } + // if (start_index < i) { + // openvino_frontend_compute(backend, cgraph, start_index, --i, prompt_process_flag); + // } + // } + // } } return GGML_STATUS_SUCCESS; diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index d91338127a..4ec1be7b4d 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -90,47 +90,49 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapsrc[0]; - outputs[node_name] = node; - m_input_names.push_back(src0_name); - m_op_node_name.emplace_back(src0_name, ggml_op_name(node->op)); - m_output_names.push_back(node_name); - m_continuous = true; - - ov::Shape input_shape = { static_cast(node->src[0]->ne[2]), - static_cast(node->src[0]->ne[1]), - static_cast(node->src[0]->ne[0])}; - auto input_param = std::make_shared(ov::element::f32, input_shape); - m_params.push_back(input_param); - break; - } else { std::string src1_name = std::string(node->src[1]->name); inputs[src0_name] = node->src[0]; + // inputs[src1_name] = node->src[1]; + // outputs[node_name] = node; + src1_name = std::string(node->src[1]->view_src->name); inputs[src1_name] = node->src[1]; + node_name = std::string(node->view_src->name); outputs[node_name] = node; m_input_names.push_back(src0_name); m_input_names.push_back(src1_name); m_op_node_name.emplace_back(src0_name, ggml_op_name(node->op)); m_op_node_name.emplace_back(src1_name, ggml_op_name(node->op)); m_output_names.push_back(node_name); + m_continuous = true; - // int src0_elem_size = ggml_type_size(node->src[0]->type); - // int src1_elem_size = ggml_type_size(node->src[1]->type); - - // int src0_logical_rows = node->src[0]->ne[1]; - // int src1_logical_rows = node->src[1]->ne[1]; - - // int src0_phys_cols = node->src[0]->nb[0] / src0_elem_size; - // int src0_phys_rows = src0_logical_rows; - - // int src1_phys_cols = node->src[1]->nb[1] / src1_elem_size; - // int src1_phys_rows = src1_logical_rows; - // ov::Shape src0_phys_shape = {1, static_cast(src0_phys_rows), static_cast(src0_phys_cols) }; - // ov::Shape src1_phys_shape = {1, static_cast(src1_phys_rows), static_cast(src1_phys_cols) }; - // auto input0_param = std::make_shared(ov::element::f32, src0_phys_shape); - // auto input1_param = std::make_shared(ov::element::f16, src1_phys_shape); - // m_params.push_back(input0_param); - // m_params.push_back(input1_param); + ov::Shape input1_shape = { static_cast(node->src[0]->ne[2]), + static_cast(node->src[0]->ne[1]), + static_cast(node->src[0]->ne[0])}; + auto input1_param = std::make_shared(ov::element::f32, input1_shape); + m_params.push_back(input1_param); + // ov::Shape input2_shape = { static_cast(node->src[1]->ne[2]), + // static_cast(node->src[1]->ne[1]), + // static_cast(node->src[1]->ne[0])}; + ov::Shape input2_shape = { static_cast(node->src[1]->ne[2]), + static_cast(node->src[1]->ne[1]), + static_cast(node->src[1]->view_src->ne[0])}; + auto input2_param = std::make_shared(ov::element::f16, input2_shape); + m_params.push_back(input2_param); + break; + } else { + std::string src1_name = std::string(node->src[1]->name); + inputs[src0_name] = node->src[0]; + // inputs[src1_name] = node->src[1]; + // outputs[node_name] = node; + src1_name = std::string(node->src[1]->view_src->name); + inputs[src1_name] = 
node->src[1]; + node_name = std::string(node->view_src->name); + outputs[node_name] = node; + m_input_names.push_back(src0_name); + m_input_names.push_back(src1_name); + m_op_node_name.emplace_back(src0_name, ggml_op_name(node->op)); + m_op_node_name.emplace_back(src1_name, ggml_op_name(node->op)); + m_output_names.push_back(node_name); ov::Shape input0_shape = { static_cast(node->src[0]->ne[2]), static_cast(node->src[0]->ne[1]), @@ -150,6 +152,15 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapsrc[0]; + // if (node->ne[0] == 21504 || node->ne[0] == 7 + // || node->ne[0] == 3072 && node->src[0]->ne[0] == 98304 + // || node->ne[0] == 1 && node->src[0]->ne[0] == 98304) { + // // if (node->ne[0] == 21504 || node->ne[0] == 7) { + // node_name = std::string(node->view_src->name); + // outputs[node_name] = node; + // } else { + // outputs[node_name] = node; + // } outputs[node_name] = node; m_input_names.push_back(src0_name); m_op_node_name.emplace_back(src0_name, ggml_op_name(node->op)); @@ -193,6 +204,11 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapsrc[1]->name); inputs[src0_name] = node->src[0]; + // if (node->ne[0] == 32 &&node->src[0]->type == GGML_TYPE_I32) { + // static_cast(inputs[src0_name]->data)[0] = 1; + // } else if (node->ne[0] == 32 && node->src[0]->type == GGML_TYPE_F16) { + // static_cast(inputs[src0_name]->data)[0] = static_cast(1); + // } inputs[src1_name] = node->src[1]; outputs[node_name] = node; m_input_names.push_back(src0_name); @@ -346,13 +362,17 @@ void ggml_graph_op_print(const struct ggml_cgraph * cgraph) { } GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor * node, struct ggml_cgraph * cgraph, const int32_t start_index, const int32_t end_index) - :m_cgraph(cgraph), - m_node(node), - m_op_name(m_node ? std::string(m_node->name) : "NONE_OP") { + :m_cgraph(cgraph), + m_node(node), + m_op_name(m_node ? 
std::string(m_node->name) : "NONE_OP") { m_inputs.clear(); m_outputs.clear(); m_input_names.clear(); m_output_names.clear(); + m_params.clear(); + m_op_node_name.clear(); + m_decoders.clear(); + // If first init if (m_node) { set_input_output(m_node, m_inputs, m_outputs); diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 3909afbe2d..53fecd3b23 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -3,6 +3,7 @@ #include "ggml-backend-impl.h" #include #include +#include using ov::frontend::ggml::GgmlDecoder; @@ -32,32 +33,70 @@ std::vector> get_ggml_graph_input_tensors(std ov::element::Type input_type = ggml_decoder->get_input_type(name); size_t element_size = input_type.size(); std::vector input_stride = ggml_decoder->get_input_stride(name); - if (op_node_name == "CONT" && input_shape[0] == 1 && (input_shape[1] != 1 && flag || input_shape[2]*element_size!=input_stride[1])) { + if (op_node_name == "CONT" && input_shape[0] == 1 // Except for the kqv_merge node + && (input_shape[1] != 1 && flag || input_shape[2]*element_size!=input_stride[1]) + ) { const size_t num_rows = static_cast(ggml_decoder->get_input_shape(name).to_shape()[1]); const size_t dim2 = static_cast(ggml_decoder->get_input_shape(name).to_shape()[0]); size_t phys_stride = static_cast(input_stride[1]) / element_size; ov::Shape input_shape = { dim2, num_rows, phys_stride }; // {1, 7, 9216 } + // if (!flag) { + // std::cout << "CONT input shape: " << input_shape << std::endl; + // } input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), input_shape, input_data); - } else if (op_node_name == "CPY" && (!check_if_contiguous || input_shape[2] == 1)) { //[TODO]: Temporarily determine whether the node corresponding to the input tensor of the Phi-3 model CPY is continuous - std::vector input_stride = ggml_decoder->get_input_stride(name); - ov::element::Type input_type = ggml_decoder->get_input_type(name); - size_t element_size = input_type.size(); - // ov::Shape phys_shape; - static int iter = 0; - if (iter++ % 2 == 0) { - // phys_shape = {1, input_shape[1], input_stride[2] / element_size}; - input_tensor = ov::Tensor(ov::element::f32, input_shape, input_data); - } else { - ov::Shape flat_shape = {1, 1, input_stride[0] / element_size}; - input_tensor = ov::Tensor(ov::element::f16, flat_shape, input_data); - } + // } else if (op_node_name == "CPY" && (!check_if_contiguous || input_shape[2] == 1)) { //[TODO]: Temporarily determine whether the node corresponding to the input tensor of the Phi-3 model CPY is continuous + // } else if (op_node_name == "CPY") { + // std::vector input_stride = ggml_decoder->get_input_stride(name); + // ov::element::Type input_type = ggml_decoder->get_input_type(name); + // size_t element_size = input_type.size(); + // // ov::Shape phys_shape; + // static int iter = 0; + // if (iter++ % 2 == 0) { + // // phys_shape = {1, input_shape[1], input_stride[2] / element_size}; + // input_tensor = ov::Tensor(ov::element::f32, input_shape, input_data); + // } else { + // ov::Shape flat_shape = {1, 1, input_stride[0] / element_size}; + // input_tensor = ov::Tensor(ov::element::f16, flat_shape, input_data); + // } } else { input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), ggml_decoder->get_input_shape(name).to_shape(), input_data); + // if(!flag) { + // std::cout << std::left << "[ " << std::setw(2) << inp << " ]: " + // << "Input Name: " << std::setw(20) << name + // << "Type: " << std::setw(5) << ggml_decoder->get_input_type(name) + 
// << "OP: " << std::setw(10) << op_node_name + // << "CONT: " << check_if_contiguous + // << ", shape: " << std::setw(4) << input_tensor.get_shape()[0] << " " << std::setw(4) << input_tensor.get_shape()[1] << " " << input_tensor.get_shape()[2] + // << ", address: " + // << std::setw(15) << input_tensor.data() << " " + // << std::setw(15) << ((float*)input_tensor.data())[0] + // << std::setw(15) << ((float*)input_tensor.data())[1] + // << ", ne[0]: " + // << std::setw(15) << ((float*)input_tensor.data())[input_tensor.get_shape()[0]] << std::right + // << std::setw(15) << ((float*)input_tensor.data())[input_tensor.get_shape()[0] + 1] << std::right + // << std::right + // << std::endl; + // } + // if (op_node_name == "MUL_MAT") { + // std::cout << std::left << "[ " << std::setw(2) << inp << " ]: " + // << "Input MUL_MAT name: " << std::setw(20) << name + // << ", shape: " << std::setw(4) << input_tensor.get_shape()[0] << " " << std::setw(4) << input_tensor.get_shape()[1] << " " << input_tensor.get_shape()[2] + // << ", address: " + // << std::setw(15) << input_tensor.data() << " " + // << std::setw(15) << ((float*)input_tensor.data())[0] + // << std::setw(15) << ((float*)input_tensor.data())[1] + // << ", ne[0]: " + // << std::setw(15) << ((float*)input_tensor.data())[input_tensor.get_shape()[0]] << std::right + // << std::setw(15) << ((float*)input_tensor.data())[input_tensor.get_shape()[0] + 1] << std::right + // << std::right + // << std::endl; + // } } // input_tensors[name] = input_tensor; input_tensors.emplace_back(name, input_tensor); } + // std::cout << "input_names.size(): " << input_names.size() << std::endl; return input_tensors; } @@ -117,7 +156,13 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c // Convert InputModel -> ov::Model std::shared_ptr model = front_end->convert(input_model); // ov::save_model(model, "/home/user/zhan/merge_git_commits/llama.cpp-ov/001_model.xml"); - + + // auto cloned_model = model->clone(); + // std::string model_dir = "/home/user/zhan/merge_git_commits/llama.cpp-ov"; + // auto path_base = model_dir + "/" + cloned_model->get_name(); + // // ov::pass::VisualizeTree(path_base + ".svg").run_on_model(cloned_model); + // ov::serialize(cloned_model, path_base + ".xml", path_base + ".bin"); + if (!model) { GGML_LOG_ERROR("Model is not converted \n"); } else { @@ -126,9 +171,14 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c #endif } + // model = core.read_model("/home/user/zhan/merge_git_commits/llama.cpp-ov/replaceWithInputLayer_000_model.xml"); // Loading a model to the device + // std::cout << "Compile ..." << std::endl; ov::CompiledModel compiled_model = core.compile_model(model); // ov::save_model(compiled_model.get_runtime_model(), "/home/user/zhan/merge_git_commits/llama.cpp-ov/001_compile_model.xml"); + // std::ofstream output_file("/home/user/zhan/merge_git_commits/llama.cpp-ov/000_compile_model.xml"); + // compiled_model.export_model(output_file); + // output_file.close(); // Create infer request ov::InferRequest infer_request = compiled_model.create_infer_request(); @@ -151,34 +201,130 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c // std::cout << std::endl; } + // std::cout << "Infer ..." 
<< std::endl; infer_request.infer(); // Set dst data for outputs auto output_names = ggml_decoder->get_output_names(); auto output_tensors = get_ggml_graph_output_dst(ggml_decoder); for (size_t i = 0; i < output_names.size(); i++) { - // std::string op_name = ggml_decoder->get_node_op_name(output_names[i]); auto output_tensor = infer_request.get_output_tensor(i); - // output_tensor.get_shape(); std::memcpy(output_tensors[output_names[i]], output_tensor.data(), output_tensor.get_byte_size()); - auto tensor = ggml_decoder->get_output_ggml_tensor(output_names[i]); - // std::cout << std::left << "[ " << std::setw(2) << i << " ]: " - // << "output_names: " << std::setw(20) << output_names[i] - // << ", shape: " << std::setw(4) << tensor->ne[0] << " " << std::setw(4) << tensor->ne[1] << " " << tensor->ne[2] - // << ", address: " - // << std::setw(15) << tensor->data << " " - // << std::setw(15) << ((float*)output_tensor.data())[0] - // << std::setw(15) << ((float*)output_tensor.data())[1] - // << ", ne[0]: " - // << std::setw(15) << ((float*)output_tensor.data())[tensor->ne[0]] << std::right - // << std::setw(15) << ((float*)output_tensor.data())[tensor->ne[0] + 1] << std::right - // << std::right - // << std::endl; + // if(!flag) { + // auto tensor = ggml_decoder->get_output_ggml_tensor(output_names[i]); + // std::cout << std::left << "[ " << std::setw(2) << i << " ]: " + // << "output_names: " << std::setw(20) << output_names[i] + // << ", shape: " << std::setw(4) << tensor->ne[0] << " " << std::setw(4) << tensor->ne[1] << " " << tensor->ne[2] + // << ", address: " + // << std::setw(15) << tensor->data << " " + // << std::setw(15) << static_cast(((float*)output_tensor.data())[0]) + // << std::setw(15) << static_cast(((float*)output_tensor.data())[1]) + // << ", ne[0]: " + // << std::setw(15) << static_cast(((float*)output_tensor.data())[tensor->ne[0]]) << std::right + // << std::setw(15) << static_cast(((float*)output_tensor.data())[tensor->ne[0] + 1]) << std::right + // << std::right + // << std::endl; + // if (i == 19) { + // auto output_tensor_18 = infer_request.get_output_tensor(18); + // auto tensor = ggml_decoder->get_output_ggml_tensor(output_names[18]); + // std::cout << std::left << " " << std::setw(2) << 18 << " : " + // << "output_names: " << std::setw(20) << output_names[18] + // << ", shape: " << std::setw(4) << tensor->ne[0] << " " << std::setw(4) << tensor->ne[1] << " " << tensor->ne[2] + // << ", address: " + // << std::setw(15) << tensor->data << " " + // << std::setw(15) << static_cast(((float*)output_tensor_18.data())[0]) + // << std::setw(15) << static_cast(((float*)output_tensor_18.data())[1]) + // << ", ne[0]: " + // << std::setw(15) << static_cast(((float*)output_tensor_18.data())[tensor->ne[0]]) << std::right + // << std::setw(15) << static_cast(((float*)output_tensor_18.data())[tensor->ne[0] + 1]) << std::right + // << std::right + // << std::endl; + // } + // if(i == 23) { + // auto output_tensor_15 = infer_request.get_output_tensor(15); + // auto tensor = ggml_decoder->get_output_ggml_tensor(output_names[15]); + // std::cout << std::left << " " << std::setw(2) << 15 << " : " + // << "output_names: " << std::setw(20) << output_names[15] + // << ", shape: " << std::setw(4) << tensor->ne[0] << " " << std::setw(4) << tensor->ne[1] << " " << tensor->ne[2] + // << ", address: " + // << std::setw(15) << tensor->data << " " + // << std::setw(15) << static_cast(((float*)output_tensor_15.data())[0]) + // << std::setw(15) << static_cast(((float*)output_tensor_15.data())[1]) + 
// << ", ne[0]: " + // << std::setw(15) << static_cast(((float*)output_tensor_15.data())[tensor->ne[0]]) << std::right + // << std::setw(15) << static_cast(((float*)output_tensor_15.data())[tensor->ne[0] + 1]) << std::right + // << std::right + // << std::endl; + // auto cache_k_l0_20 = ggml_decoder->get_input_names()[20]; + // // auto input_data = ggml_decoder->get_input_ggml_tensor(cache_k_l0_20)->data; + // auto input_tensor = input_tensors.at(20).second; + // std::cout << std::left << " " << std::setw(2) << 20 << " : " + // << "Input Name: " << std::setw(20) << cache_k_l0_20 + // << ", shape: " << std::setw(4) << input_tensor.get_shape()[0] << " " << std::setw(4) << input_tensor.get_shape()[1] << " " << input_tensor.get_shape()[2] + // << ", address: " + // << std::setw(15) << input_tensor.data() << " " + // << std::setw(15) << ((float*)input_tensor.data())[0] + // << std::setw(15) << ((float*)input_tensor.data())[1] + // << ", ne[0]: " + // << std::setw(15) << ((float*)input_tensor.data())[input_tensor.get_shape()[0]] << std::right + // << std::setw(15) << ((float*)input_tensor.data())[input_tensor.get_shape()[0] + 1] << std::right + // << std::right + // << std::endl; + + // auto cache_k_l0_27 = ggml_decoder->get_input_names()[27]; + // // auto input_data = ggml_decoder->get_input_ggml_tensor(cache_k_l0_20)->data; + // auto input_tensor_27 = input_tensors.at(27).second; + // std::cout << std::left << " " << std::setw(2) << 27 << " : " + // << "Input Name: " << std::setw(20) << cache_k_l0_27 + // << ", shape: " << std::setw(4) << input_tensor_27.get_shape()[0] << " " << std::setw(4) << input_tensor_27.get_shape()[1] << " " << input_tensor_27.get_shape()[2] + // << ", address: " + // << std::setw(15) << input_tensor_27.data() << " " + // << std::setw(15) << ((float*)input_tensor_27.data())[0] + // << std::setw(15) << ((float*)input_tensor_27.data())[1] + // << ", ne[0]: " + // << std::setw(15) << ((float*)input_tensor_27.data())[input_tensor_27.get_shape()[0]] << std::right + // << std::setw(15) << ((float*)input_tensor_27.data())[input_tensor_27.get_shape()[0] + 1] << std::right + // << std::right + // << std::endl; + + // auto cache_k_l0_29 = ggml_decoder->get_input_names()[29]; + // // auto input_data = ggml_decoder->get_input_ggml_tensor(cache_k_l0_20)->data; + // auto input_tensor_29 = input_tensors.at(29).second; + // std::cout << std::left << " " << std::setw(2) << 29 << " : " + // << "Input Name: " << std::setw(20) << cache_k_l0_29 + // << ", shape: " << std::setw(4) << input_tensor_29.get_shape()[0] << " " << std::setw(4) << input_tensor_29.get_shape()[1] << " " << input_tensor_29.get_shape()[2] + // << ", address: " + // << std::setw(15) << input_tensor_29.data() << " " + // << std::setw(15) << ((float*)input_tensor_29.data())[0] + // << std::setw(15) << ((float*)input_tensor_29.data())[1] + // << ", ne[0]: " + // << std::setw(15) << ((float*)input_tensor_29.data())[input_tensor_29.get_shape()[0]] << std::right + // << std::setw(15) << ((float*)input_tensor_29.data())[input_tensor_29.get_shape()[0] + 1] << std::right + // << std::right + // << std::endl; + + // auto cache_k_l0_30 = ggml_decoder->get_input_names()[30]; + // // auto input_data = ggml_decoder->get_input_ggml_tensor(cache_k_l0_20)->data; + // auto input_tensor_30 = input_tensors.at(30).second; + // std::cout << std::left << " " << std::setw(2) << 30 << " : " + // << "Input Name: " << std::setw(20) << cache_k_l0_30 + // << ", shape: " << std::setw(4) << input_tensor_30.get_shape()[0] << " " << std::setw(4) << 
input_tensor_30.get_shape()[1] << " " << input_tensor_30.get_shape()[2] + // << ", address: " + // << std::setw(15) << input_tensor_30.data() << " " + // << std::setw(15) << ((float*)input_tensor_30.data())[0] + // << std::setw(15) << ((float*)input_tensor_30.data())[1] + // << ", ne[0]: " + // << std::setw(15) << ((float*)input_tensor_30.data())[input_tensor_30.get_shape()[0]] << std::right + // << std::setw(15) << ((float*)input_tensor_30.data())[input_tensor_30.get_shape()[0] + 1] << std::right + // << std::right + // << std::endl; + // } + // } #ifdef GGML_OPENVINO_DEBUG printf("Output %s after: %g\n", output_names[i].c_str(), *(double*)(output_tensor.data())); #endif } - + return GGML_STATUS_SUCCESS; GGML_UNUSED(backend); } From eac9a99530a5c4a36f3e8349698eb5d7ec8df590 Mon Sep 17 00:00:00 2001 From: zhanmyz Date: Mon, 31 Mar 2025 10:41:04 +0800 Subject: [PATCH 045/254] =?UTF-8?q?1.=20Solve=20the=20AC=20issue=20of=20Pe?= =?UTF-8?q?rmute+VIEW=20and=20MULMAL=20issue=20in=20the=20phase=20of=20?= =?UTF-8?q?=E2=80=9C1.=20Process=20Prompt=20and=20predict=20the=20first=20?= =?UTF-8?q?token=E2=80=9D.=202.=20There=20is=20still=20an=20AC=20issue=20i?= =?UTF-8?q?n=20the=20"2.=20Predict=20the=20subsequent=20tokens=20phase"=20?= =?UTF-8?q?and=20it=20is=20being=20debugged.=20=20=20=20A=20deviation=20ha?= =?UTF-8?q?s=20been=20detected=20in=20the=20computation=20of=20OpenVINO's?= =?UTF-8?q?=20CPY=20Node=20at=20stage=202,=20and=20it=20is=20currently=20b?= =?UTF-8?q?eing=20fixed.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ggml/src/ggml-openvino.cpp | 140 +++++++----------------- ggml/src/ggml-openvino/ggml-decoder.cpp | 7 ++ ggml/src/ggml-openvino/utils.cpp | 43 ++++---- 3 files changed, 70 insertions(+), 120 deletions(-) diff --git a/ggml/src/ggml-openvino.cpp b/ggml/src/ggml-openvino.cpp index a508aeea40..2279df1d6d 100644 --- a/ggml/src/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino.cpp @@ -823,34 +823,34 @@ void ggml_backend_openvino_dup_bytes(struct ggml_tensor *dst) { } static void ggml_backend_openvino_transpose(ggml_tensor *dst) { - ov::Core core; - ov::Shape input_shape{static_cast(dst->src[0]->ne[2]), static_cast(dst->src[0]->ne[1]), static_cast(dst->src[0]->ne[0])}; - ov::Shape output_shape{static_cast(dst->ne[2]), static_cast(dst->ne[1]), static_cast(dst->ne[0])}; - auto input_param = std::make_shared(ov::element::f32, input_shape); + // ov::Core core; + // ov::Shape input_shape{static_cast(dst->src[0]->ne[2]), static_cast(dst->src[0]->ne[1]), static_cast(dst->src[0]->ne[0])}; + // ov::Shape output_shape{static_cast(dst->ne[2]), static_cast(dst->ne[1]), static_cast(dst->ne[0])}; + // auto input_param = std::make_shared(ov::element::f32, input_shape); - //auto res = std::make_shared(input_param, ov::op::v0::Constant::create(ov::element::i64, {3}, {0, 2, 1})); + // //auto res = std::make_shared(input_param, ov::op::v0::Constant::create(ov::element::i64, {3}, {0, 2, 1})); - auto new_shape_node = ov::op::v0::Constant::create(ov::element::i64, - ov::Shape{output_shape.size()}, - std::vector(output_shape.begin(), output_shape.end())); - auto res = std::make_shared(input_param, new_shape_node, false); + // auto new_shape_node = ov::op::v0::Constant::create(ov::element::i64, + // ov::Shape{output_shape.size()}, + // std::vector(output_shape.begin(), output_shape.end())); + // auto res = std::make_shared(input_param, new_shape_node, false); - std::shared_ptr model = std::make_shared(ov::OutputVector{res}, - ov::ParameterVector{input_param}); - auto 
---
 ggml/src/ggml-openvino.cpp              | 140 +++++-----------------
 ggml/src/ggml-openvino/ggml-decoder.cpp |   7 ++
 ggml/src/ggml-openvino/utils.cpp        |  43 ++++----
 3 files changed, 70 insertions(+), 120 deletions(-)

diff --git a/ggml/src/ggml-openvino.cpp b/ggml/src/ggml-openvino.cpp
index a508aeea40..2279df1d6d 100644
--- a/ggml/src/ggml-openvino.cpp
+++ b/ggml/src/ggml-openvino.cpp
@@ -823,34 +823,34 @@ void ggml_backend_openvino_dup_bytes(struct ggml_tensor *dst) {
 }

 static void ggml_backend_openvino_transpose(ggml_tensor *dst) {
-    ov::Core core;
-    ov::Shape input_shape{static_cast<size_t>(dst->src[0]->ne[2]), static_cast<size_t>(dst->src[0]->ne[1]), static_cast<size_t>(dst->src[0]->ne[0])};
-    ov::Shape output_shape{static_cast<size_t>(dst->ne[2]), static_cast<size_t>(dst->ne[1]), static_cast<size_t>(dst->ne[0])};
-    auto input_param = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, input_shape);
+    // ov::Core core;
+    // ov::Shape input_shape{static_cast<size_t>(dst->src[0]->ne[2]), static_cast<size_t>(dst->src[0]->ne[1]), static_cast<size_t>(dst->src[0]->ne[0])};
+    // ov::Shape output_shape{static_cast<size_t>(dst->ne[2]), static_cast<size_t>(dst->ne[1]), static_cast<size_t>(dst->ne[0])};
+    // auto input_param = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, input_shape);

-    //auto res = std::make_shared<ov::op::v1::Transpose>(input_param, ov::op::v0::Constant::create(ov::element::i64, {3}, {0, 2, 1}));
+    // //auto res = std::make_shared<ov::op::v1::Transpose>(input_param, ov::op::v0::Constant::create(ov::element::i64, {3}, {0, 2, 1}));

-    auto new_shape_node = ov::op::v0::Constant::create(ov::element::i64,
-                                                       ov::Shape{output_shape.size()},
-                                                       std::vector<int64_t>(output_shape.begin(), output_shape.end()));
-    auto res = std::make_shared<ov::op::v1::Reshape>(input_param, new_shape_node, false);
+    // auto new_shape_node = ov::op::v0::Constant::create(ov::element::i64,
+    //                                                    ov::Shape{output_shape.size()},
+    //                                                    std::vector<int64_t>(output_shape.begin(), output_shape.end()));
+    // auto res = std::make_shared<ov::op::v1::Reshape>(input_param, new_shape_node, false);

-    std::shared_ptr<ov::Model> model = std::make_shared<ov::Model>(ov::OutputVector{res},
-                                                                   ov::ParameterVector{input_param});
-    auto compiled_model = core.compile_model(model, "CPU");
-    ov::InferRequest infer_request = compiled_model.create_infer_request();
+    // std::shared_ptr<ov::Model> model = std::make_shared<ov::Model>(ov::OutputVector{res},
+    //                                                                ov::ParameterVector{input_param});
+    // auto compiled_model = core.compile_model(model, "CPU");
+    // ov::InferRequest infer_request = compiled_model.create_infer_request();

-    ov::Tensor input_tensor(ov::element::f32, input_shape, dst->src[0]->data);
-    ov::Tensor output_tensor(ov::element::f32, output_shape, dst->data);
-    infer_request.set_input_tensor(0, input_tensor);
-    infer_request.set_output_tensor(0, output_tensor);
+    // ov::Tensor input_tensor(ov::element::f32, input_shape, dst->src[0]->data);
+    // ov::Tensor output_tensor(ov::element::f32, output_shape, dst->data);
+    // infer_request.set_input_tensor(0, input_tensor);
+    // infer_request.set_output_tensor(0, output_tensor);

-    infer_request.infer();
+    // infer_request.infer();

     // NOP
     GGML_UNUSED(dst);
@@ -1004,7 +1004,7 @@ static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backe
             reshape_indices.push_back(i);
             // } else if (cgraph->nodes[i]->op == GGML_OP_VIEW) {
         } else if (cgraph->nodes[i]->op == GGML_OP_VIEW) {
-            // if (cgraph->nodes[i]->src[0]->ne[0] == 98304)
+            // if (cgraph->nodes[i]->src[0]->ne[0] == 98304 && (cgraph->nodes[i]->ne[0] == 3072 || cgraph->nodes[i]->ne[0] == 1))
             //     continue;
             view_indices.push_back(i);
             if (cgraph->nodes[i]->ne[0] == 32) {
@@ -1045,16 +1045,25 @@ static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backe
                 ggml_backend_openvino_transpose(cgraph->nodes[i]);
             } else if (std::find(cpy_indices.begin(), cpy_indices.end(), i) != cpy_indices.end()) {
                 ggml_backend_openvino_cpy(cgraph->nodes[i]);
-            } else if (std::find(permute_indices.begin(), permute_indices.end(), i) != permute_indices.end()) {
-                ggml_backend_openvino_permute(cgraph->nodes[i]);
-            // } else if (std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) != mul_mat_indices.end()) {
-            //     ggml_backend_openvino_mul_mat(cgraph->nodes[i]);
+            // } else if (std::find(permute_indices.begin(), permute_indices.end(), i) != permute_indices.end()) {
+            //     ggml_backend_openvino_permute(cgraph->nodes[i]);
             } else if (std::find(view_indices.begin(), view_indices.end(), i) != view_indices.end()) {
                 ggml_backend_openvino_view(cgraph->nodes[i]);
             } else if (std::find(cont_indices.begin(), cont_indices.end(), i) != cont_indices.end()) {
                 ggml_backend_openvino_dup_bytes(cgraph->nodes[i]);
             } else if (std::find(reshape_indices.begin(), reshape_indices.end(), i) != reshape_indices.end()) {
                 ggml_backend_openvino_reshape(cgraph->nodes[i]);
+
+            // } else if (std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) != mul_mat_indices.end()) {
+            //     ggml_backend_openvino_mul_mat(cgraph->nodes[i]);
+            // } else if (std::find(view_indices_prompt.begin(), view_indices_prompt.end(), i) != view_indices_prompt.end()) {
+            //     ggml_backend_openvino_view(cgraph->nodes[i]);
+            // }else if (std::find(view_split.begin(), view_split.end(), i) != view_split.end()) {
+            //     ggml_backend_openvino_view(cgraph->nodes[i]);
+            // }else if (std::find(cpy_split_16.begin(), cpy_split_16.end(), i) != cpy_split_16.end()) {
+            //     ggml_backend_openvino_cpy(cgraph->nodes[i]);
+            // }else if (std::find(cpy_split_19.begin(), cpy_split_19.end(), i) != cpy_split_19.end()) {
+            //     ggml_backend_openvino_cpy(cgraph->nodes[i]);
             } else {
                 // Process a range of nodes with openvino_frontend_compute
                 int start_index = i;
@@ -1062,11 +1071,16 @@ static enum ggml_status
ggml_backend_openvino_graph_compute(ggml_backend_t backe && std::find(add_indices.begin(), add_indices.end(), i) == add_indices.end() && std::find(transpose_indices.begin(), transpose_indices.end(), i) == transpose_indices.end() && std::find(cpy_indices.begin(), cpy_indices.end(), i) == cpy_indices.end() - && std::find(permute_indices.begin(), permute_indices.end(), i) == permute_indices.end() - // && std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) == mul_mat_indices.end() + // && std::find(permute_indices.begin(), permute_indices.end(), i) == permute_indices.end() && std::find(view_indices.begin(), view_indices.end(), i) == view_indices.end() && std::find(cont_indices.begin(), cont_indices.end(), i) == cont_indices.end() && std::find(reshape_indices.begin(), reshape_indices.end(), i) == reshape_indices.end() + + // && std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) == mul_mat_indices.end() + // && (std::find(view_indices_prompt.begin(), view_indices_prompt.end(), i) == view_indices_prompt.end()) + // && std::find(view_split.begin(), view_split.end(), i) == view_split.end() + // && std::find(cpy_split_16.begin(), cpy_split_16.end(), i) == cpy_split_16.end() + // && std::find(cpy_split_19.begin(), cpy_split_19.end(), i) == cpy_split_19.end() ) { i++; } @@ -1075,85 +1089,9 @@ static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backe } } } - // int end_node = cgraph->n_nodes - 1; - // openvino_frontend_compute(backend, cgraph, 0, end_node, prompt_process_flag); - // for (int i = 0; i < cgraph->n_nodes; i++) { - // // if (std::find(view_indices_prompt.begin(), view_indices_prompt.end(), i) != view_indices_prompt.end()) { - // // ggml_backend_openvino_view(cgraph->nodes[i]); - // if (std::find(view_indices.begin(), view_indices.end(), i) != view_indices.end()) { - // ggml_backend_openvino_view(cgraph->nodes[i]); - // } else if (std::find(cont_indices.begin(), cont_indices.end(), i) != cont_indices.end()) { - // ggml_backend_openvino_dup_bytes(cgraph->nodes[i]); - // } else if (std::find(reshape_indices.begin(), reshape_indices.end(), i) != reshape_indices.end()) { - // ggml_backend_openvino_reshape(cgraph->nodes[i]); - // } else { - // // Process a range of nodes with openvino_frontend_compute - // int start_index = i; - // while (i < cgraph->n_nodes - // // && (std::find(view_indices_prompt.begin(), view_indices_prompt.end(), i) == view_indices_prompt.end()) - // && std::find(view_indices.begin(), view_indices.end(), i) == view_indices.end() - // && std::find(cont_indices.begin(), cont_indices.end(), i) == cont_indices.end() - // && std::find(reshape_indices.begin(), reshape_indices.end(), i) == reshape_indices.end() - // ) { - // i++; - // } - // if (start_index < i) { - // openvino_frontend_compute(backend, cgraph, start_index, --i, prompt_process_flag); - // } - // } - // } } else { int end_node = cgraph->n_nodes - 1; openvino_frontend_compute(backend, cgraph, 0, end_node, prompt_process_flag); - // for (int i = 0; i < cgraph->n_nodes; i++) { - // if (std::find(add_indices.begin(), add_indices.end(), i) != add_indices.end()) { - // ggml_backend_openvino_add_forward(cgraph->nodes[i]); - // // } else if (std::find(transpose_indices.begin(), transpose_indices.end(), i) != transpose_indices.end()) { - // // ggml_backend_openvino_transpose(cgraph->nodes[i]); - // // } else if (std::find(cpy_indices.begin(), cpy_indices.end(), i) != cpy_indices.end()) { - // // ggml_backend_openvino_cpy(cgraph->nodes[i]); - // // } else if 
(std::find(permute_indices.begin(), permute_indices.end(), i) != permute_indices.end()) { - // // ggml_backend_openvino_permute(cgraph->nodes[i]); - // // } else if (std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) != mul_mat_indices.end()) { - // // ggml_backend_openvino_mul_mat(cgraph->nodes[i]); - // // } else if (std::find(view_indices_prompt.begin(), view_indices_prompt.end(), i) != view_indices_prompt.end()) { - // // ggml_backend_openvino_view(cgraph->nodes[i]); - // // } else if (std::find(view_indices.begin(), view_indices.end(), i) != view_indices.end()) { - // // ggml_backend_openvino_view(cgraph->nodes[i]); - // // } else if (std::find(cont_indices.begin(), cont_indices.end(), i) != cont_indices.end()) { - // // ggml_backend_openvino_dup_bytes(cgraph->nodes[i]); - // // } else if (std::find(reshape_indices.begin(), reshape_indices.end(), i) != reshape_indices.end()) { - // // ggml_backend_openvino_reshape(cgraph->nodes[i]); - // // }else if (std::find(view_split.begin(), view_split.end(), i) != view_split.end()) { - // // ggml_backend_openvino_view(cgraph->nodes[i]); - // // }else if (std::find(cpy_split_16.begin(), cpy_split_16.end(), i) != cpy_split_16.end()) { - // // ggml_backend_openvino_cpy(cgraph->nodes[i]); - // // }else if (std::find(cpy_split_19.begin(), cpy_split_19.end(), i) != cpy_split_19.end()) { - // // ggml_backend_openvino_cpy(cgraph->nodes[i]); - // } else { - // // Process a range of nodes with openvino_frontend_compute - // int start_index = i; - // while (i < cgraph->n_nodes - // && std::find(add_indices.begin(), add_indices.end(), i) == add_indices.end() - // // && std::find(transpose_indices.begin(), transpose_indices.end(), i) == transpose_indices.end() - // // && std::find(cpy_indices.begin(), cpy_indices.end(), i) == cpy_indices.end() - // // && std::find(permute_indices.begin(), permute_indices.end(), i) == permute_indices.end() - // // && std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) == mul_mat_indices.end() - // // && (std::find(view_indices_prompt.begin(), view_indices_prompt.end(), i) == view_indices_prompt.end()) - // // && std::find(view_indices.begin(), view_indices.end(), i) == view_indices.end() - // // && std::find(cont_indices.begin(), cont_indices.end(), i) == cont_indices.end() - // // && std::find(reshape_indices.begin(), reshape_indices.end(), i) == reshape_indices.end() - // // && std::find(view_split.begin(), view_split.end(), i) == view_split.end() - // // && std::find(cpy_split_16.begin(), cpy_split_16.end(), i) == cpy_split_16.end() - // // && std::find(cpy_split_19.begin(), cpy_split_19.end(), i) == cpy_split_19.end() - // ) { - // i++; - // } - // if (start_index < i) { - // openvino_frontend_compute(backend, cgraph, start_index, --i, prompt_process_flag); - // } - // } - // } } return GGML_STATUS_SUCCESS; diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 4ec1be7b4d..ec827e8006 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -161,6 +161,13 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapne[0] == 3072 && node->ne[1] == 1 && node->ne[2] == 1) { + // outputs[src0_name] = node; + // m_output_names.push_back(src0_name); + // } else { + // outputs[node_name] = node; + // m_output_names.push_back(node_name); + // } outputs[node_name] = node; m_input_names.push_back(src0_name); m_op_node_name.emplace_back(src0_name, ggml_op_name(node->op)); diff --git a/ggml/src/ggml-openvino/utils.cpp 
b/ggml/src/ggml-openvino/utils.cpp index 53fecd3b23..642f2b6662 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -44,24 +44,8 @@ std::vector> get_ggml_graph_input_tensors(std // std::cout << "CONT input shape: " << input_shape << std::endl; // } input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), input_shape, input_data); - // } else if (op_node_name == "CPY" && (!check_if_contiguous || input_shape[2] == 1)) { //[TODO]: Temporarily determine whether the node corresponding to the input tensor of the Phi-3 model CPY is continuous - // } else if (op_node_name == "CPY") { - // std::vector input_stride = ggml_decoder->get_input_stride(name); - // ov::element::Type input_type = ggml_decoder->get_input_type(name); - // size_t element_size = input_type.size(); - // // ov::Shape phys_shape; - // static int iter = 0; - // if (iter++ % 2 == 0) { - // // phys_shape = {1, input_shape[1], input_stride[2] / element_size}; - // input_tensor = ov::Tensor(ov::element::f32, input_shape, input_data); - // } else { - // ov::Shape flat_shape = {1, 1, input_stride[0] / element_size}; - // input_tensor = ov::Tensor(ov::element::f16, flat_shape, input_data); - // } - } else { - input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), ggml_decoder->get_input_shape(name).to_shape(), input_data); // if(!flag) { - // std::cout << std::left << "[ " << std::setw(2) << inp << " ]: " + // std::cout << std::left << "*[" << std::setw(2) << inp << "]*: " // << "Input Name: " << std::setw(20) << name // << "Type: " << std::setw(5) << ggml_decoder->get_input_type(name) // << "OP: " << std::setw(10) << op_node_name @@ -77,14 +61,21 @@ std::vector> get_ggml_graph_input_tensors(std // << std::right // << std::endl; // } - // if (op_node_name == "MUL_MAT") { + } else { + input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), ggml_decoder->get_input_shape(name).to_shape(), input_data); + // if(!flag) { // std::cout << std::left << "[ " << std::setw(2) << inp << " ]: " - // << "Input MUL_MAT name: " << std::setw(20) << name + // << "Input Name: " << std::setw(20) << name + // << "Type: " << std::setw(5) << ggml_decoder->get_input_type(name) + // << "OP: " << std::setw(10) << op_node_name + // << "CONT: " << check_if_contiguous // << ", shape: " << std::setw(4) << input_tensor.get_shape()[0] << " " << std::setw(4) << input_tensor.get_shape()[1] << " " << input_tensor.get_shape()[2] // << ", address: " // << std::setw(15) << input_tensor.data() << " " // << std::setw(15) << ((float*)input_tensor.data())[0] // << std::setw(15) << ((float*)input_tensor.data())[1] + // << ", ne[0]-1: " + // << std::setw(15) << ((float*)input_tensor.data())[input_tensor.get_shape()[0]-1] // << ", ne[0]: " // << std::setw(15) << ((float*)input_tensor.data())[input_tensor.get_shape()[0]] << std::right // << std::setw(15) << ((float*)input_tensor.data())[input_tensor.get_shape()[0] + 1] << std::right @@ -219,6 +210,8 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c // << std::setw(15) << tensor->data << " " // << std::setw(15) << static_cast(((float*)output_tensor.data())[0]) // << std::setw(15) << static_cast(((float*)output_tensor.data())[1]) + // << ", ne[0]-1: " + // << std::setw(15) << static_cast(((float*)output_tensor.data())[tensor->ne[0] - 1]) // << ", ne[0]: " // << std::setw(15) << static_cast(((float*)output_tensor.data())[tensor->ne[0]]) << std::right // << std::setw(15) << static_cast(((float*)output_tensor.data())[tensor->ne[0] + 1]) << std::right 
@@ -234,6 +227,8 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c // << std::setw(15) << tensor->data << " " // << std::setw(15) << static_cast(((float*)output_tensor_18.data())[0]) // << std::setw(15) << static_cast(((float*)output_tensor_18.data())[1]) + // << ", ne[0]-1: " + // << std::setw(15) << static_cast(((float*)output_tensor_18.data())[tensor->ne[0] - 1]) // << ", ne[0]: " // << std::setw(15) << static_cast(((float*)output_tensor_18.data())[tensor->ne[0]]) << std::right // << std::setw(15) << static_cast(((float*)output_tensor_18.data())[tensor->ne[0] + 1]) << std::right @@ -250,6 +245,8 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c // << std::setw(15) << tensor->data << " " // << std::setw(15) << static_cast(((float*)output_tensor_15.data())[0]) // << std::setw(15) << static_cast(((float*)output_tensor_15.data())[1]) + // << ", ne[0]-1: " + // << std::setw(15) << static_cast(((float*)output_tensor_15.data())[tensor->ne[0] - 1]) // << ", ne[0]: " // << std::setw(15) << static_cast(((float*)output_tensor_15.data())[tensor->ne[0]]) << std::right // << std::setw(15) << static_cast(((float*)output_tensor_15.data())[tensor->ne[0] + 1]) << std::right @@ -265,6 +262,8 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c // << std::setw(15) << input_tensor.data() << " " // << std::setw(15) << ((float*)input_tensor.data())[0] // << std::setw(15) << ((float*)input_tensor.data())[1] + // << ", ne[0]-1: " + // << std::setw(15) << ((float*)input_tensor.data())[input_tensor.get_shape()[0] - 1] // << ", ne[0]: " // << std::setw(15) << ((float*)input_tensor.data())[input_tensor.get_shape()[0]] << std::right // << std::setw(15) << ((float*)input_tensor.data())[input_tensor.get_shape()[0] + 1] << std::right @@ -281,6 +280,8 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c // << std::setw(15) << input_tensor_27.data() << " " // << std::setw(15) << ((float*)input_tensor_27.data())[0] // << std::setw(15) << ((float*)input_tensor_27.data())[1] + // << ", ne[0]-1: " + // << std::setw(15) << ((float*)input_tensor_27.data())[input_tensor_27.get_shape()[0] - 1] // << ", ne[0]: " // << std::setw(15) << ((float*)input_tensor_27.data())[input_tensor_27.get_shape()[0]] << std::right // << std::setw(15) << ((float*)input_tensor_27.data())[input_tensor_27.get_shape()[0] + 1] << std::right @@ -297,6 +298,8 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c // << std::setw(15) << input_tensor_29.data() << " " // << std::setw(15) << ((float*)input_tensor_29.data())[0] // << std::setw(15) << ((float*)input_tensor_29.data())[1] + // << ", ne[0]-1: " + // << std::setw(15) << ((float*)input_tensor_29.data())[input_tensor_29.get_shape()[0] - 1] // << ", ne[0]: " // << std::setw(15) << ((float*)input_tensor_29.data())[input_tensor_29.get_shape()[0]] << std::right // << std::setw(15) << ((float*)input_tensor_29.data())[input_tensor_29.get_shape()[0] + 1] << std::right @@ -313,6 +316,8 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c // << std::setw(15) << input_tensor_30.data() << " " // << std::setw(15) << ((float*)input_tensor_30.data())[0] // << std::setw(15) << ((float*)input_tensor_30.data())[1] + // << ", ne[0]-1: " + // << std::setw(15) << ((float*)input_tensor_30.data())[input_tensor_30.get_shape()[0] - 1] // << ", ne[0]: " // << std::setw(15) << ((float*)input_tensor_30.data())[input_tensor_30.get_shape()[0]] 
<< std::right
 //                << std::setw(15) << ((float*)input_tensor_30.data())[input_tensor_30.get_shape()[0] + 1] << std::right

From 84be5c6f15f51578017817401381b7e0a2481e7c Mon Sep 17 00:00:00 2001
From: zhanmyz
Date: Mon, 31 Mar 2025 20:09:40 +0800
Subject: [PATCH 046/254] 1. Delete some comments. 2. "Process prompt and
 predict the first token" now works.
---
 ggml/src/ggml-openvino.cpp              |  20 ---
 ggml/src/ggml-openvino/ggml-decoder.cpp |  46 ------
 ggml/src/ggml-openvino/utils.cpp        | 190 ------------------------
 3 files changed, 256 deletions(-)

diff --git a/ggml/src/ggml-openvino.cpp b/ggml/src/ggml-openvino.cpp
index 2279df1d6d..b9f1b89722 100644
--- a/ggml/src/ggml-openvino.cpp
+++ b/ggml/src/ggml-openvino.cpp
@@ -1045,25 +1045,12 @@ static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backe
                 ggml_backend_openvino_transpose(cgraph->nodes[i]);
             } else if (std::find(cpy_indices.begin(), cpy_indices.end(), i) != cpy_indices.end()) {
                 ggml_backend_openvino_cpy(cgraph->nodes[i]);
-            // } else if (std::find(permute_indices.begin(), permute_indices.end(), i) != permute_indices.end()) {
-            //     ggml_backend_openvino_permute(cgraph->nodes[i]);
             } else if (std::find(view_indices.begin(), view_indices.end(), i) != view_indices.end()) {
                 ggml_backend_openvino_view(cgraph->nodes[i]);
             } else if (std::find(cont_indices.begin(), cont_indices.end(), i) != cont_indices.end()) {
                 ggml_backend_openvino_dup_bytes(cgraph->nodes[i]);
             } else if (std::find(reshape_indices.begin(), reshape_indices.end(), i) != reshape_indices.end()) {
                 ggml_backend_openvino_reshape(cgraph->nodes[i]);
-
-            // } else if (std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) != mul_mat_indices.end()) {
-            //     ggml_backend_openvino_mul_mat(cgraph->nodes[i]);
-            // } else if (std::find(view_indices_prompt.begin(), view_indices_prompt.end(), i) != view_indices_prompt.end()) {
-            //     ggml_backend_openvino_view(cgraph->nodes[i]);
-            // }else if (std::find(view_split.begin(), view_split.end(), i) != view_split.end()) {
-            //     ggml_backend_openvino_view(cgraph->nodes[i]);
-            // }else if (std::find(cpy_split_16.begin(), cpy_split_16.end(), i) != cpy_split_16.end()) {
-            //     ggml_backend_openvino_cpy(cgraph->nodes[i]);
-            // }else if (std::find(cpy_split_19.begin(), cpy_split_19.end(), i) != cpy_split_19.end()) {
-            //     ggml_backend_openvino_cpy(cgraph->nodes[i]);
             } else {
                 // Process a range of nodes with openvino_frontend_compute
                 int start_index = i;
@@ -1071,16 +1058,9 @@ static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backe
                     && std::find(add_indices.begin(), add_indices.end(), i) == add_indices.end()
                     && std::find(transpose_indices.begin(), transpose_indices.end(), i) == transpose_indices.end()
                     && std::find(cpy_indices.begin(), cpy_indices.end(), i) == cpy_indices.end()
-                    // && std::find(permute_indices.begin(), permute_indices.end(), i) == permute_indices.end()
                     && std::find(view_indices.begin(), view_indices.end(), i) == view_indices.end()
                     && std::find(cont_indices.begin(), cont_indices.end(), i) == cont_indices.end()
                     && std::find(reshape_indices.begin(), reshape_indices.end(), i) == reshape_indices.end()
-
-                    // && std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) == mul_mat_indices.end()
-                    // && (std::find(view_indices_prompt.begin(), view_indices_prompt.end(), i) == view_indices_prompt.end())
-                    // && std::find(view_split.begin(), view_split.end(), i) == view_split.end()
-                    // && std::find(cpy_split_16.begin(), cpy_split_16.end(), i) == cpy_split_16.end()
-                    // && std::find(cpy_split_19.begin(), cpy_split_19.end(), i) == cpy_split_19.end()
                 ) {
                     i++;
                 }
diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp
index ec827e8006..3b396c05f7 100644
--- a/ggml/src/ggml-openvino/ggml-decoder.cpp
+++ b/ggml/src/ggml-openvino/ggml-decoder.cpp
@@ -92,8 +92,6 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::map<std::string,
             std::string src1_name = std::string(node->src[1]->name);
             inputs[src0_name] = node->src[0];
-            // inputs[src1_name] = node->src[1];
-            // outputs[node_name] = node;
             src1_name = std::string(node->src[1]->view_src->name);
             inputs[src1_name] = node->src[1];
             node_name = std::string(node->view_src->name);
@@ -110,9 +108,6 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::map<std::string,
                                          static_cast<size_t>(node->src[0]->ne[0])};
             auto input1_param = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, input1_shape);
             m_params.push_back(input1_param);
-            // ov::Shape input2_shape = { static_cast<size_t>(node->src[1]->ne[2]),
-            //                            static_cast<size_t>(node->src[1]->ne[1]),
-            //                            static_cast<size_t>(node->src[1]->ne[0])};
             ov::Shape input2_shape = { static_cast<size_t>(node->src[1]->ne[2]),
                                        static_cast<size_t>(node->src[1]->ne[1]),
                                        static_cast<size_t>(node->src[1]->view_src->ne[0])};
@@ -122,8 +117,6 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::map<std::string,
             std::string src1_name = std::string(node->src[1]->name);
             inputs[src0_name] = node->src[0];
-            // inputs[src1_name] = node->src[1];
-            // outputs[node_name] = node;
             src1_name = std::string(node->src[1]->view_src->name);
             inputs[src1_name] = node->src[1];
             node_name = std::string(node->view_src->name);
@@ -152,44 +145,10 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::map<std::string,
             inputs[src0_name] = node->src[0];
-            // if (node->ne[0] == 21504 || node->ne[0] == 7
-            //     || node->ne[0] == 3072 && node->src[0]->ne[0] == 98304
-            //     || node->ne[0] == 1 && node->src[0]->ne[0] == 98304) {
-            // // if (node->ne[0] == 21504 || node->ne[0] == 7) {
-            //     node_name = std::string(node->view_src->name);
-            //     outputs[node_name] = node;
-            // } else {
-            //     outputs[node_name] = node;
-            // }
-            // if (node->ne[0] == 3072 && node->ne[1] == 1 && node->ne[2] == 1) {
-            //     outputs[src0_name] = node;
-            //     m_output_names.push_back(src0_name);
-            // } else {
-            //     outputs[node_name] = node;
-            //     m_output_names.push_back(node_name);
-            // }
             outputs[node_name] = node;
             m_input_names.push_back(src0_name);
             m_op_node_name.emplace_back(src0_name, ggml_op_name(node->op));
             m_output_names.push_back(node_name);
-
-            // ov::Shape input_shape = { static_cast<size_t>(node->src[0]->ne[2]),
-            //                           static_cast<size_t>(node->src[0]->ne[1]),
-            //                           static_cast<size_t>(node->src[0]->ne[0])};
-            // auto type = get_input_type(src0_name);
-            // auto input_param = std::make_shared<ov::op::v0::Parameter>(type, input_shape);
-            // m_params.push_back(input_param);
-
-            // if (node->ne[0] > node->ne[1] && (node->ne[0] * node->nb[0] != node->nb[1]) && node->ne[2] == 1) {
-            //     m_continuous = false;
-            // } else {
-            //     m_continuous = true;
-
-            // }
-            // m_continuous = false;
-
-            // [TODO]: multiple cases
-
             break;
         }
         // SCALE
@@ -211,11 +170,6 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::map<std::string,
             std::string src1_name = std::string(node->src[1]->name);
             inputs[src0_name] = node->src[0];
-            // if (node->ne[0] == 32 && node->src[0]->type == GGML_TYPE_I32) {
-            //     static_cast<int32_t*>(inputs[src0_name]->data)[0] = 1;
-            // } else if (node->ne[0] == 32 && node->src[0]->type == GGML_TYPE_F16) {
-            //     static_cast<ggml_fp16_t*>(inputs[src0_name]->data)[0] = static_cast<ggml_fp16_t>(1);
-            // }
             inputs[src1_name] = node->src[1];
             outputs[node_name] = node;
             m_input_names.push_back(src0_name);
diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp
index 642f2b6662..736c7f690b 100644
--- a/ggml/src/ggml-openvino/utils.cpp
+++ b/ggml/src/ggml-openvino/utils.cpp
@@ -11,12 +11,9 @@
std::shared_ptr get_ggml_decoder(struct ggml_cgraph * cgraph, con return std::make_shared(nullptr, cgraph, start_index, end_index); } -// std::map get_ggml_graph_input_tensors(std::shared_ptr ggml_decoder) { std::vector> get_ggml_graph_input_tensors(std::shared_ptr ggml_decoder, bool flag) { - // std::map input_tensors; std::vector> input_tensors; auto input_names = ggml_decoder->get_input_names(); - // auto node_name = ggml_decoder->get_op_name(); size_t op_iter = 0; for (size_t inp = 0; inp < input_names.size(); ++inp) { auto name = input_names[inp]; @@ -40,48 +37,9 @@ std::vector> get_ggml_graph_input_tensors(std const size_t dim2 = static_cast(ggml_decoder->get_input_shape(name).to_shape()[0]); size_t phys_stride = static_cast(input_stride[1]) / element_size; ov::Shape input_shape = { dim2, num_rows, phys_stride }; // {1, 7, 9216 } - // if (!flag) { - // std::cout << "CONT input shape: " << input_shape << std::endl; - // } input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), input_shape, input_data); - // if(!flag) { - // std::cout << std::left << "*[" << std::setw(2) << inp << "]*: " - // << "Input Name: " << std::setw(20) << name - // << "Type: " << std::setw(5) << ggml_decoder->get_input_type(name) - // << "OP: " << std::setw(10) << op_node_name - // << "CONT: " << check_if_contiguous - // << ", shape: " << std::setw(4) << input_tensor.get_shape()[0] << " " << std::setw(4) << input_tensor.get_shape()[1] << " " << input_tensor.get_shape()[2] - // << ", address: " - // << std::setw(15) << input_tensor.data() << " " - // << std::setw(15) << ((float*)input_tensor.data())[0] - // << std::setw(15) << ((float*)input_tensor.data())[1] - // << ", ne[0]: " - // << std::setw(15) << ((float*)input_tensor.data())[input_tensor.get_shape()[0]] << std::right - // << std::setw(15) << ((float*)input_tensor.data())[input_tensor.get_shape()[0] + 1] << std::right - // << std::right - // << std::endl; - // } } else { input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), ggml_decoder->get_input_shape(name).to_shape(), input_data); - // if(!flag) { - // std::cout << std::left << "[ " << std::setw(2) << inp << " ]: " - // << "Input Name: " << std::setw(20) << name - // << "Type: " << std::setw(5) << ggml_decoder->get_input_type(name) - // << "OP: " << std::setw(10) << op_node_name - // << "CONT: " << check_if_contiguous - // << ", shape: " << std::setw(4) << input_tensor.get_shape()[0] << " " << std::setw(4) << input_tensor.get_shape()[1] << " " << input_tensor.get_shape()[2] - // << ", address: " - // << std::setw(15) << input_tensor.data() << " " - // << std::setw(15) << ((float*)input_tensor.data())[0] - // << std::setw(15) << ((float*)input_tensor.data())[1] - // << ", ne[0]-1: " - // << std::setw(15) << ((float*)input_tensor.data())[input_tensor.get_shape()[0]-1] - // << ", ne[0]: " - // << std::setw(15) << ((float*)input_tensor.data())[input_tensor.get_shape()[0]] << std::right - // << std::setw(15) << ((float*)input_tensor.data())[input_tensor.get_shape()[0] + 1] << std::right - // << std::right - // << std::endl; - // } } // input_tensors[name] = input_tensor; @@ -146,13 +104,6 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c // Convert InputModel -> ov::Model std::shared_ptr model = front_end->convert(input_model); - // ov::save_model(model, "/home/user/zhan/merge_git_commits/llama.cpp-ov/001_model.xml"); - - // auto cloned_model = model->clone(); - // std::string model_dir = "/home/user/zhan/merge_git_commits/llama.cpp-ov"; - // auto path_base = 
model_dir + "/" + cloned_model->get_name(); - // // ov::pass::VisualizeTree(path_base + ".svg").run_on_model(cloned_model); - // ov::serialize(cloned_model, path_base + ".xml", path_base + ".bin"); if (!model) { GGML_LOG_ERROR("Model is not converted \n"); @@ -162,14 +113,7 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c #endif } - // model = core.read_model("/home/user/zhan/merge_git_commits/llama.cpp-ov/replaceWithInputLayer_000_model.xml"); - // Loading a model to the device - // std::cout << "Compile ..." << std::endl; ov::CompiledModel compiled_model = core.compile_model(model); - // ov::save_model(compiled_model.get_runtime_model(), "/home/user/zhan/merge_git_commits/llama.cpp-ov/001_compile_model.xml"); - // std::ofstream output_file("/home/user/zhan/merge_git_commits/llama.cpp-ov/000_compile_model.xml"); - // compiled_model.export_model(output_file); - // output_file.close(); // Create infer request ov::InferRequest infer_request = compiled_model.create_infer_request(); @@ -180,19 +124,9 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c // Set input tensor for (size_t i = 0; i < input_names.size(); i++) { - // infer_request.set_input_tensor(i, input_tensors[input_names[i]]); infer_request.set_input_tensor(i, input_tensors.at(i).second); - - // auto input_tensor = infer_request.get_input_tensor(i); - // auto input_shape = input_tensor.get_shape(); - // std::cout << "Input tensor " << i << " shape: "; - // for (const auto& dim : input_shape) { - // std::cout << dim << " "; - // } - // std::cout << std::endl; } - // std::cout << "Infer ..." << std::endl; infer_request.infer(); // Set dst data for outputs @@ -201,130 +135,6 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c for (size_t i = 0; i < output_names.size(); i++) { auto output_tensor = infer_request.get_output_tensor(i); std::memcpy(output_tensors[output_names[i]], output_tensor.data(), output_tensor.get_byte_size()); - // if(!flag) { - // auto tensor = ggml_decoder->get_output_ggml_tensor(output_names[i]); - // std::cout << std::left << "[ " << std::setw(2) << i << " ]: " - // << "output_names: " << std::setw(20) << output_names[i] - // << ", shape: " << std::setw(4) << tensor->ne[0] << " " << std::setw(4) << tensor->ne[1] << " " << tensor->ne[2] - // << ", address: " - // << std::setw(15) << tensor->data << " " - // << std::setw(15) << static_cast(((float*)output_tensor.data())[0]) - // << std::setw(15) << static_cast(((float*)output_tensor.data())[1]) - // << ", ne[0]-1: " - // << std::setw(15) << static_cast(((float*)output_tensor.data())[tensor->ne[0] - 1]) - // << ", ne[0]: " - // << std::setw(15) << static_cast(((float*)output_tensor.data())[tensor->ne[0]]) << std::right - // << std::setw(15) << static_cast(((float*)output_tensor.data())[tensor->ne[0] + 1]) << std::right - // << std::right - // << std::endl; - // if (i == 19) { - // auto output_tensor_18 = infer_request.get_output_tensor(18); - // auto tensor = ggml_decoder->get_output_ggml_tensor(output_names[18]); - // std::cout << std::left << " " << std::setw(2) << 18 << " : " - // << "output_names: " << std::setw(20) << output_names[18] - // << ", shape: " << std::setw(4) << tensor->ne[0] << " " << std::setw(4) << tensor->ne[1] << " " << tensor->ne[2] - // << ", address: " - // << std::setw(15) << tensor->data << " " - // << std::setw(15) << static_cast(((float*)output_tensor_18.data())[0]) - // << std::setw(15) << static_cast(((float*)output_tensor_18.data())[1]) 
- // << ", ne[0]-1: " - // << std::setw(15) << static_cast(((float*)output_tensor_18.data())[tensor->ne[0] - 1]) - // << ", ne[0]: " - // << std::setw(15) << static_cast(((float*)output_tensor_18.data())[tensor->ne[0]]) << std::right - // << std::setw(15) << static_cast(((float*)output_tensor_18.data())[tensor->ne[0] + 1]) << std::right - // << std::right - // << std::endl; - // } - // if(i == 23) { - // auto output_tensor_15 = infer_request.get_output_tensor(15); - // auto tensor = ggml_decoder->get_output_ggml_tensor(output_names[15]); - // std::cout << std::left << " " << std::setw(2) << 15 << " : " - // << "output_names: " << std::setw(20) << output_names[15] - // << ", shape: " << std::setw(4) << tensor->ne[0] << " " << std::setw(4) << tensor->ne[1] << " " << tensor->ne[2] - // << ", address: " - // << std::setw(15) << tensor->data << " " - // << std::setw(15) << static_cast(((float*)output_tensor_15.data())[0]) - // << std::setw(15) << static_cast(((float*)output_tensor_15.data())[1]) - // << ", ne[0]-1: " - // << std::setw(15) << static_cast(((float*)output_tensor_15.data())[tensor->ne[0] - 1]) - // << ", ne[0]: " - // << std::setw(15) << static_cast(((float*)output_tensor_15.data())[tensor->ne[0]]) << std::right - // << std::setw(15) << static_cast(((float*)output_tensor_15.data())[tensor->ne[0] + 1]) << std::right - // << std::right - // << std::endl; - // auto cache_k_l0_20 = ggml_decoder->get_input_names()[20]; - // // auto input_data = ggml_decoder->get_input_ggml_tensor(cache_k_l0_20)->data; - // auto input_tensor = input_tensors.at(20).second; - // std::cout << std::left << " " << std::setw(2) << 20 << " : " - // << "Input Name: " << std::setw(20) << cache_k_l0_20 - // << ", shape: " << std::setw(4) << input_tensor.get_shape()[0] << " " << std::setw(4) << input_tensor.get_shape()[1] << " " << input_tensor.get_shape()[2] - // << ", address: " - // << std::setw(15) << input_tensor.data() << " " - // << std::setw(15) << ((float*)input_tensor.data())[0] - // << std::setw(15) << ((float*)input_tensor.data())[1] - // << ", ne[0]-1: " - // << std::setw(15) << ((float*)input_tensor.data())[input_tensor.get_shape()[0] - 1] - // << ", ne[0]: " - // << std::setw(15) << ((float*)input_tensor.data())[input_tensor.get_shape()[0]] << std::right - // << std::setw(15) << ((float*)input_tensor.data())[input_tensor.get_shape()[0] + 1] << std::right - // << std::right - // << std::endl; - - // auto cache_k_l0_27 = ggml_decoder->get_input_names()[27]; - // // auto input_data = ggml_decoder->get_input_ggml_tensor(cache_k_l0_20)->data; - // auto input_tensor_27 = input_tensors.at(27).second; - // std::cout << std::left << " " << std::setw(2) << 27 << " : " - // << "Input Name: " << std::setw(20) << cache_k_l0_27 - // << ", shape: " << std::setw(4) << input_tensor_27.get_shape()[0] << " " << std::setw(4) << input_tensor_27.get_shape()[1] << " " << input_tensor_27.get_shape()[2] - // << ", address: " - // << std::setw(15) << input_tensor_27.data() << " " - // << std::setw(15) << ((float*)input_tensor_27.data())[0] - // << std::setw(15) << ((float*)input_tensor_27.data())[1] - // << ", ne[0]-1: " - // << std::setw(15) << ((float*)input_tensor_27.data())[input_tensor_27.get_shape()[0] - 1] - // << ", ne[0]: " - // << std::setw(15) << ((float*)input_tensor_27.data())[input_tensor_27.get_shape()[0]] << std::right - // << std::setw(15) << ((float*)input_tensor_27.data())[input_tensor_27.get_shape()[0] + 1] << std::right - // << std::right - // << std::endl; - - // auto cache_k_l0_29 = 
ggml_decoder->get_input_names()[29]; - // // auto input_data = ggml_decoder->get_input_ggml_tensor(cache_k_l0_20)->data; - // auto input_tensor_29 = input_tensors.at(29).second; - // std::cout << std::left << " " << std::setw(2) << 29 << " : " - // << "Input Name: " << std::setw(20) << cache_k_l0_29 - // << ", shape: " << std::setw(4) << input_tensor_29.get_shape()[0] << " " << std::setw(4) << input_tensor_29.get_shape()[1] << " " << input_tensor_29.get_shape()[2] - // << ", address: " - // << std::setw(15) << input_tensor_29.data() << " " - // << std::setw(15) << ((float*)input_tensor_29.data())[0] - // << std::setw(15) << ((float*)input_tensor_29.data())[1] - // << ", ne[0]-1: " - // << std::setw(15) << ((float*)input_tensor_29.data())[input_tensor_29.get_shape()[0] - 1] - // << ", ne[0]: " - // << std::setw(15) << ((float*)input_tensor_29.data())[input_tensor_29.get_shape()[0]] << std::right - // << std::setw(15) << ((float*)input_tensor_29.data())[input_tensor_29.get_shape()[0] + 1] << std::right - // << std::right - // << std::endl; - - // auto cache_k_l0_30 = ggml_decoder->get_input_names()[30]; - // // auto input_data = ggml_decoder->get_input_ggml_tensor(cache_k_l0_20)->data; - // auto input_tensor_30 = input_tensors.at(30).second; - // std::cout << std::left << " " << std::setw(2) << 30 << " : " - // << "Input Name: " << std::setw(20) << cache_k_l0_30 - // << ", shape: " << std::setw(4) << input_tensor_30.get_shape()[0] << " " << std::setw(4) << input_tensor_30.get_shape()[1] << " " << input_tensor_30.get_shape()[2] - // << ", address: " - // << std::setw(15) << input_tensor_30.data() << " " - // << std::setw(15) << ((float*)input_tensor_30.data())[0] - // << std::setw(15) << ((float*)input_tensor_30.data())[1] - // << ", ne[0]-1: " - // << std::setw(15) << ((float*)input_tensor_30.data())[input_tensor_30.get_shape()[0] - 1] - // << ", ne[0]: " - // << std::setw(15) << ((float*)input_tensor_30.data())[input_tensor_30.get_shape()[0]] << std::right - // << std::setw(15) << ((float*)input_tensor_30.data())[input_tensor_30.get_shape()[0] + 1] << std::right - // << std::right - // << std::endl; - // } - // } #ifdef GGML_OPENVINO_DEBUG printf("Output %s after: %g\n", output_names[i].c_str(), *(double*)(output_tensor.data())); #endif From 651b2c06cb593d62d4a4a925a492be27063a1cc0 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Mon, 14 Apr 2025 18:04:03 +0800 Subject: [PATCH 047/254] * Use find_package in CMake to configure OpenVINO * Remove OPENVINO_OP_DEBUG * Simplify set_input_output in decoder * Fix CPY in set_input_output * Use params from converted ov model in setting input --- ggml/src/ggml-openvino.cpp | 28 ++- ggml/src/ggml-openvino/ggml-decoder.cpp | 274 +++++------------------- ggml/src/ggml-openvino/utils.cpp | 55 +++-- 3 files changed, 114 insertions(+), 243 deletions(-) diff --git a/ggml/src/ggml-openvino.cpp b/ggml/src/ggml-openvino.cpp index b9f1b89722..762ed786a9 100644 --- a/ggml/src/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino.cpp @@ -3,6 +3,7 @@ #include "ggml-impl.h" #include "ggml-openvino.h" #include "ggml-openvino/utils.h" +#include "ggml.h" #include #include @@ -1367,7 +1368,7 @@ static const std::set& openvino_ops = []() -> const std::set& openvino_ops = []() -> const std::setop); - if (it == op_mapping.end()) { - return false; + static const std::map> op_mapping_unary = { + {GGML_UNARY_OP_SILU, {"Sigmoid", "Multiply"}}, + }; + + std::vector mapped_ops; + if (op->op == GGML_OP_UNARY) { + auto it = op_mapping_unary.find(ggml_get_unary_op(op)); + if (it == 
op_mapping_unary.end()) { + return false; + } + mapped_ops = it->second; + } else { + auto it = op_mapping.find(op->op); + if (it == op_mapping.end()) { + return false; + } + mapped_ops = it->second; } - for (const std::string& op_name : it->second) { + for (const std::string& op_name : mapped_ops) { if (openvino_ops.count(op_name) == 0) { return false; } } return true; -#endif } static bool ggml_backend_openvino_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 3b396c05f7..d7895c3d7f 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -6,222 +6,66 @@ #include void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::map& inputs, std::map& outputs) { + std::string node_name; + if (node->op == GGML_OP_CPY) { + // CPY updates the input tensor in place. For later ov op that uses the + // input tensor of CPY, we need to make sure they get the updated tensor + // by putting the src tensor name in the tensor_map in + // /src/frontends/ggml/src/translate_session.cpp + node_name = std::string(node->view_src->name); + } else { + node_name = std::string(node->name); + } + std::string src0_name = std::string(node->src[0]->name); - std::string node_name = std::string(node->name); + inputs[src0_name] = node->src[0]; + outputs[node_name] = node; + m_input_names.push_back(src0_name); + m_op_node_name.emplace_back(src0_name, ggml_op_name(node->op)); + if (node->op == GGML_OP_CPY && node->view_src) { + m_output_names.push_back(node->view_src->name); + } else { + m_output_names.push_back(node_name); + } + + if (node->src[1]) { + std::string src1_name = std::string(node->src[1]->name); + inputs[src1_name] = node->src[1]; + m_input_names.push_back(src1_name); + m_op_node_name.emplace_back(src1_name, ggml_op_name(node->op)); + } + if (node->src[2]) { + std::string src2_name = std::string(node->src[2]->name); + inputs[src2_name] = node->src[2]; + m_input_names.push_back(src2_name); + m_op_node_name.emplace_back(src2_name, ggml_op_name(node->op)); + } switch (node->op) { - // Unary OPs - case GGML_OP_UNARY: - case GGML_OP_RESHAPE: - case GGML_OP_TRANSPOSE: - case GGML_OP_PERMUTE: - case GGML_OP_RMS_NORM: - { - inputs[src0_name] = node->src[0]; - outputs[node_name] = node; - m_input_names.push_back(src0_name); - m_op_node_name.emplace_back(src0_name, ggml_op_name(node->op)); - m_output_names.push_back(node_name); - break; + case GGML_OP_CONT: { + if (ggml_is_contiguous(node->src[0]) && ggml_is_contiguous(node) && + (node->src[0]->ne[0] * node->src[0]->nb[0] == node->src[0]->nb[1])) { + m_continuous = true; + } else { + m_continuous = false; } - case GGML_OP_CONT: - { - if (ggml_is_contiguous(node->src[0]) - && ggml_is_contiguous(node) - && (node->src[0]->ne[0] * node->src[0]->nb[0] == node->src[0]->nb[1])) { - inputs[src0_name] = node->src[0]; - outputs[node_name] = node; - m_input_names.push_back(src0_name); - m_op_node_name.emplace_back(src0_name, ggml_op_name(node->op)); - m_output_names.push_back(node_name); - - ov::Shape input_shape = { static_cast(node->src[0]->ne[2]), - static_cast(node->src[0]->ne[1]), - static_cast(node->src[0]->ne[0])}; - auto input_param = std::make_shared(ov::element::f32, input_shape); - m_params.push_back(input_param); - - m_continuous = true; - break; - } - - if (node->src[0]->type == node->type && node->src[0]->ne[0] == node->ne[0] && - node->src[0]->nb[0] == ggml_type_size(node->src[0]->type) && - 
node->nb[0] == ggml_type_size(node->src[0]->type)) { - - inputs[src0_name] = node->src[0]; - outputs[node_name] = node; - m_input_names.push_back(src0_name); - m_op_node_name.emplace_back(src0_name, ggml_op_name(node->op)); - m_output_names.push_back(node_name); - - const size_t element_size = ggml_type_size(node->src[0]->type); - size_t valid_elems = static_cast(node->src[0]->ne[0]); // 3072 - size_t num_rows = static_cast(node->src[0]->ne[1]); // 7 - size_t dim2 = static_cast(node->src[0]->ne[2]); // 1 - size_t phys_stride = static_cast(node->src[0]->nb[1]) / element_size; // 9216 - // size_t total_phys = (num_rows - 1) * phys_stride + valid_elems; // 6*9216 + 3072 = 58368 - size_t total_phys = num_rows * phys_stride; // 7 * 9216 = 64512 - ov::Shape input_shape = { dim2, num_rows, phys_stride }; - auto input_param = std::make_shared(ov::element::f32, input_shape); - m_params.push_back(input_param); - - m_continuous = false; - break; - } - - if (ggml_is_contiguous(node)) { - inputs[src0_name] = node->src[0]; - outputs[node_name] = node; - m_input_names.push_back(src0_name); - m_op_node_name.emplace_back(src0_name, ggml_op_name(node->op)); - m_output_names.push_back(node_name); - - ov::Shape input_shape = { static_cast(node->src[0]->ne[2]), - static_cast(node->src[0]->ne[1]), - static_cast(node->src[0]->ne[0])}; - auto input_param = std::make_shared(ov::element::f32, input_shape); - m_params.push_back(input_param); - - m_continuous = false; - break; - } + break; + } + case GGML_OP_CPY: { + m_continuous = ggml_is_contiguous(node); + break; + } + case GGML_OP_MUL_MAT: { + if (!ggml_is_contiguous(node->src[1]) || + node->src[1]->ne[0] * node->src[1]->nb[0] != node->src[1]->nb[1]) { + m_continuous = false; + } else { + m_continuous = true; } - case GGML_OP_CPY: - { - if (ggml_is_contiguous(node)) { - std::string src1_name = std::string(node->src[1]->name); - inputs[src0_name] = node->src[0]; - src1_name = std::string(node->src[1]->view_src->name); - inputs[src1_name] = node->src[1]; - node_name = std::string(node->view_src->name); - outputs[node_name] = node; - m_input_names.push_back(src0_name); - m_input_names.push_back(src1_name); - m_op_node_name.emplace_back(src0_name, ggml_op_name(node->op)); - m_op_node_name.emplace_back(src1_name, ggml_op_name(node->op)); - m_output_names.push_back(node_name); - m_continuous = true; - - ov::Shape input1_shape = { static_cast(node->src[0]->ne[2]), - static_cast(node->src[0]->ne[1]), - static_cast(node->src[0]->ne[0])}; - auto input1_param = std::make_shared(ov::element::f32, input1_shape); - m_params.push_back(input1_param); - ov::Shape input2_shape = { static_cast(node->src[1]->ne[2]), - static_cast(node->src[1]->ne[1]), - static_cast(node->src[1]->view_src->ne[0])}; - auto input2_param = std::make_shared(ov::element::f16, input2_shape); - m_params.push_back(input2_param); - break; - } else { - std::string src1_name = std::string(node->src[1]->name); - inputs[src0_name] = node->src[0]; - src1_name = std::string(node->src[1]->view_src->name); - inputs[src1_name] = node->src[1]; - node_name = std::string(node->view_src->name); - outputs[node_name] = node; - m_input_names.push_back(src0_name); - m_input_names.push_back(src1_name); - m_op_node_name.emplace_back(src0_name, ggml_op_name(node->op)); - m_op_node_name.emplace_back(src1_name, ggml_op_name(node->op)); - m_output_names.push_back(node_name); - - ov::Shape input0_shape = { static_cast(node->src[0]->ne[2]), - static_cast(node->src[0]->ne[1]), - static_cast(node->src[0]->ne[0])}; - auto input0_param = 
std::make_shared(ov::element::f32, input0_shape); - m_params.push_back(input0_param); - ov::Shape input1_shape = { 1, 1, static_cast(node->src[1]->nb[2] / node->src[1]->nb[0])}; - auto input1_param = std::make_shared(ov::element::f16, input1_shape); - m_params.push_back(input1_param); - - m_continuous = false; - - break; - } - } - // For view, input is node itself - case GGML_OP_VIEW: - { - inputs[src0_name] = node->src[0]; - outputs[node_name] = node; - m_input_names.push_back(src0_name); - m_op_node_name.emplace_back(src0_name, ggml_op_name(node->op)); - m_output_names.push_back(node_name); - break; - } - // SCALE - case GGML_OP_SCALE: - { - inputs[src0_name] = node->src[0]; - outputs[node_name] = node; - m_input_names.push_back(src0_name); - m_op_node_name.emplace_back(src0_name, ggml_op_name(node->op)); - m_output_names.push_back(node_name); - break; - } - case GGML_OP_MUL_MAT: - { - if (!ggml_is_contiguous(node->src[1]) || node->src[1]->ne[0] * node->src[1]->nb[0] != node->src[1]->nb[1]) { - m_continuous = false; - } else { - m_continuous = true; - } - std::string src1_name = std::string(node->src[1]->name); - inputs[src0_name] = node->src[0]; - inputs[src1_name] = node->src[1]; - outputs[node_name] = node; - m_input_names.push_back(src0_name); - m_op_node_name.emplace_back(src0_name, ggml_op_name(node->op)); - m_input_names.push_back(src1_name); - m_op_node_name.emplace_back(src1_name, ggml_op_name(node->op)); - m_output_names.push_back(node_name); - break; - } - // OPs with 2 inputs - case GGML_OP_ADD: - case GGML_OP_DIV: - case GGML_OP_MUL: - case GGML_OP_SUB: - case GGML_OP_GET_ROWS: - case GGML_OP_SOFT_MAX: - { - inputs[src0_name] = node->src[0]; - outputs[node_name] = node; - m_input_names.push_back(src0_name); - m_op_node_name.emplace_back(src0_name, ggml_op_name(node->op)); - m_output_names.push_back(node_name); - if (node->src[1]) { - std::string src1_name = std::string(node->src[1]->name); - inputs[src1_name] = node->src[1]; - m_op_node_name.emplace_back(src1_name, ggml_op_name(node->op)); - m_input_names.push_back(src1_name); - } - break; - } - // OPs with 3 inputs: - case GGML_OP_ROPE: - { - std::string src1_name = std::string(node->src[1]->name); - inputs[src0_name] = node->src[0]; - inputs[src1_name] = node->src[1]; - m_input_names.push_back(src0_name); - m_op_node_name.emplace_back(src0_name, ggml_op_name(node->op)); - m_input_names.push_back(src1_name); - m_op_node_name.emplace_back(src1_name, ggml_op_name(node->op)); - outputs[node_name] = node; - m_output_names.push_back(node_name); - if (node->src[2]) { - std::string src2_name = std::string(node->src[2]->name); - inputs[src2_name] = node->src[2]; - m_input_names.push_back(src2_name); - m_op_node_name.emplace_back(src2_name, ggml_op_name(node->op)); - } - break; - } - default: - break; + break; + } + default: + break; } } @@ -334,7 +178,6 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor * node, struct ggml_cgraph * cgr m_op_node_name.clear(); m_decoders.clear(); - // If first init if (m_node) { set_input_output(m_node, m_inputs, m_outputs); } else { @@ -353,7 +196,7 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor * node, struct ggml_cgraph * cgr ov::PartialShape GgmlOvDecoder::get_input_shape(const std::string& name) const { ov::PartialShape input_shape; - // Use input_node->ne + // Use input_node->ne ggml_tensor * node = m_inputs.at(name); std::vector shape; @@ -440,7 +283,6 @@ const std::vector>& GgmlOvDecoder::get_pa ov::PartialShape GgmlOvDecoder::get_output_shape(const std::string& name) const { 
ov::PartialShape output_shape; - // Use input_node->ne ggml_tensor * node = m_outputs.at(name); std::vector shape; @@ -552,10 +394,10 @@ const std::string& GgmlOvDecoder::get_op_type() const { auto unary_it = unaryOpTypeMap.find(ggml_get_unary_op(m_node)); if (unary_it != unaryOpTypeMap.end()) { return unary_it->second; - } + } } return it->second; - } + } static const std::string unknown_op = "UNKNOWN_OP"; return unknown_op; } diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 736c7f690b..f4d9c7705a 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -1,9 +1,11 @@ #include "utils.h" -#include "ggml-impl.h" #include "ggml-backend-impl.h" +#include "ggml-impl.h" +#include +#include +#include #include #include -#include using ov::frontend::ggml::GgmlDecoder; @@ -20,27 +22,14 @@ std::vector> get_ggml_graph_input_tensors(std std::string op_node_name = ggml_decoder->get_op_node_name(name, op_iter++); // auto node_op_name = ggml_decoder->get_node_op_name(name); auto input_data = ggml_decoder->get_input_ggml_tensor(name)->data; - auto check_if_contiguous = ggml_is_contiguous(ggml_decoder->get_input_ggml_tensor(name)); #ifdef GGML_OPENVINO_DEBUG printf("Subgraph input %d: %g\n", inp, *(double*)(input_data)); #endif ov::Tensor input_tensor; ov::Shape input_shape = ggml_decoder->get_input_shape(name).to_shape(); - ov::element::Type input_type = ggml_decoder->get_input_type(name); - size_t element_size = input_type.size(); std::vector input_stride = ggml_decoder->get_input_stride(name); - if (op_node_name == "CONT" && input_shape[0] == 1 // Except for the kqv_merge node - && (input_shape[1] != 1 && flag || input_shape[2]*element_size!=input_stride[1]) - ) { - const size_t num_rows = static_cast(ggml_decoder->get_input_shape(name).to_shape()[1]); - const size_t dim2 = static_cast(ggml_decoder->get_input_shape(name).to_shape()[0]); - size_t phys_stride = static_cast(input_stride[1]) / element_size; - ov::Shape input_shape = { dim2, num_rows, phys_stride }; // {1, 7, 9216 } - input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), input_shape, input_data); - } else { - input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), ggml_decoder->get_input_shape(name).to_shape(), input_data); - } + input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), input_shape, input_data); // input_tensors[name] = input_tensor; input_tensors.emplace_back(name, input_tensor); @@ -49,6 +38,18 @@ std::vector> get_ggml_graph_input_tensors(std return input_tensors; } +ov::Tensor get_ggml_graph_input_tensor(std::shared_ptr ggml_decoder, std::string& name) { + auto input_data = ggml_decoder->get_input_ggml_tensor(name)->data; + #ifdef GGML_OPENVINO_DEBUG + printf("Subgraph input %s: %g\n", name.c_str(), *(double*)(input_data)); + #endif + ov::Tensor input_tensor; + ov::Shape input_shape = ggml_decoder->get_input_shape(name).to_shape(); + std::vector input_stride = ggml_decoder->get_input_stride(name); + input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), input_shape, input_data); + return input_tensor; +} + std::map get_ggml_graph_output_dst(std::shared_ptr ggml_decoder) { std::map output_tensors; auto output_names = ggml_decoder->get_output_names(); @@ -79,7 +80,7 @@ static ov::frontend::FrontEnd::Ptr get_ggml_frontend() { enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph, const int32_t start_index, const int32_t end_index, bool flag) { static ov::Core core; // auto devices = 
core.get_available_devices(); - // Get GGML Frontend + // Get GGML Frontend static auto front_end = get_ggml_frontend(); if (!front_end) { GGML_LOG_ERROR("GGML FrontEnd is not initialized \n"); @@ -102,9 +103,17 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c #endif } - // Convert InputModel -> ov::Model + // Convert InputModel -> ov::Model std::shared_ptr model = front_end->convert(input_model); + if (getenv("OPENVINO_DUMP_GRAPH")) { + char timestamped_filename[64]; + auto timestamp = (long long)ggml_time_us(); + snprintf(timestamped_filename, sizeof(timestamped_filename), + "model_%lld.xml", timestamp); + ov::serialize(model, timestamped_filename); + } + if (!model) { GGML_LOG_ERROR("Model is not converted \n"); } else { @@ -122,10 +131,14 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c auto input_names = ggml_decoder->get_input_names(); auto input_tensors = get_ggml_graph_input_tensors(ggml_decoder, flag); - // Set input tensor - for (size_t i = 0; i < input_names.size(); i++) { - infer_request.set_input_tensor(i, input_tensors.at(i).second); + auto ov_params = model->get_parameters(); + for (size_t i = 0; i < ov_params.size(); i++) { + auto param_name = ov_params[i]->get_friendly_name(); + infer_request.set_input_tensor(i, get_ggml_graph_input_tensor(ggml_decoder, param_name)); } + // for (size_t i = 0; i < input_names.size(); i++) { + // infer_request.set_input_tensor(i, input_tensors.at(i).second); + // } infer_request.infer(); From 91d2a195b56dd4967846951cf1dbaf646576438b Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Tue, 15 Apr 2025 14:34:00 +0800 Subject: [PATCH 048/254] change op mappings to list in openvino_supports_op --- ggml/src/ggml-openvino.cpp | 96 +++----------------------------- ggml/src/ggml-openvino/utils.cpp | 21 +++---- ggml/src/ggml-openvino/utils.h | 2 +- 3 files changed, 17 insertions(+), 102 deletions(-) diff --git a/ggml/src/ggml-openvino.cpp b/ggml/src/ggml-openvino.cpp index 762ed786a9..5ea2351e06 100644 --- a/ggml/src/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino.cpp @@ -1036,9 +1036,7 @@ static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backe // Process nodes in order - bool prompt_process_flag = true; if (cgraph->nodes[0]->ne[1] == 1) { - prompt_process_flag = false; for (int i = 0; i < cgraph->n_nodes; i++) { if (std::find(add_indices.begin(), add_indices.end(), i) != add_indices.end()) { ggml_backend_openvino_add_forward(cgraph->nodes[i]); @@ -1066,13 +1064,13 @@ static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backe i++; } if (start_index < i) { - openvino_frontend_compute(backend, cgraph, start_index, --i, prompt_process_flag); + openvino_frontend_compute(backend, cgraph, start_index, --i); } } } } else { int end_node = cgraph->n_nodes - 1; - openvino_frontend_compute(backend, cgraph, 0, end_node, prompt_process_flag); + openvino_frontend_compute(backend, cgraph, 0, end_node); } return GGML_STATUS_SUCCESS; @@ -1331,91 +1329,11 @@ static const std::set& openvino_ops = []() -> const std::set> op_mapping = { - {GGML_OP_ACC, {"Add"}}, - {GGML_OP_ADD, {"Add"}}, - {GGML_OP_ADD1, {"Add"}}, - {GGML_OP_ADD_REL_POS, {"Add", "MatMul", "Reshape"}}, - {GGML_OP_ARANGE, {"Range"}}, - {GGML_OP_ARGMAX, {"TopK"}}, - {GGML_OP_ARGSORT, {"TopK"}}, - {GGML_OP_CLAMP, {"Clamp"}}, - {GGML_OP_CONCAT, {"Concat"}}, - {GGML_OP_CONV_TRANSPOSE_1D, {"ConvolutionBackpropData"}}, - {GGML_OP_CONV_TRANSPOSE_2D, {"ConvolutionBackpropData"}}, - {GGML_OP_COS, 
{"Cos"}}, - {GGML_OP_CROSS_ENTROPY_LOSS, {"Softmax", "Log", "Multiply", "ReduceSum", "Negative"}}, - {GGML_OP_DIAG, {"Eye", "Multiply"}}, - {GGML_OP_DIAG_MASK_INF, {"Eye", "Multiply", "Select", "Broadcast"}}, - {GGML_OP_DIAG_MASK_ZERO, {"Eye", "Multiply", "Select", "Broadcast"}}, - {GGML_OP_DIV, {"Divide"}}, - {GGML_OP_FLASH_ATTN_EXT, {"ScaledDotProductAttention"}}, - {GGML_OP_GET_ROWS, {"Gather"}}, - {GGML_OP_GROUP_NORM, {"GroupNormalization"}}, - {GGML_OP_IM2COL, {"Custom", "Reshape", "Transpose"}}, - {GGML_OP_LEAKY_RELU, {"PReLU"}}, - {GGML_OP_LOG, {"Log"}}, - {GGML_OP_MEAN, {"ReduceMean"}}, - {GGML_OP_MUL, {"Multiply"}}, - {GGML_OP_MUL_MAT, {"MatMul"}}, - {GGML_OP_MUL_MAT_ID, {"MatMul", "Identity"}}, - {GGML_OP_NORM, {"NormalizeL2"}}, - {GGML_OP_OUT_PROD, {"MatMul", "Reshape"}}, - {GGML_OP_PAD, {"Pad"}}, - {GGML_OP_PERMUTE, {"Transpose"}}, - {GGML_OP_POOL_1D, {"AvgPool", "MaxPool"}}, - {GGML_OP_POOL_2D, {"AvgPool", "MaxPool"}}, - {GGML_OP_REPEAT, {"Tile"}}, - {GGML_OP_RESHAPE, {"Reshape"}}, - {GGML_OP_RMS_NORM, {"Multiply", "Divide", "Sqrt"}}, - {GGML_OP_ROPE, {"Sin", "Cos", "Multiply", "Add", "Subtract", "Split", "StridedSlice", "Concat"}}, - {GGML_OP_SCALE, {"Multiply", "Constant"}}, - {GGML_OP_SET, {"Assign"}}, - {GGML_OP_SIN, {"Sin"}}, - {GGML_OP_SOFT_MAX, {"Softmax"}}, - {GGML_OP_SQR, {"Power"}}, - {GGML_OP_SQRT, {"Sqrt"}}, - {GGML_OP_SSM_CONV, {"Custom"}}, - {GGML_OP_SSM_SCAN, {"Custom"}}, - {GGML_OP_SUB, {"Subtract"}}, - {GGML_OP_SUM, {"ReduceSum"}}, - {GGML_OP_SUM_ROWS, {"ReduceSum", "Squeeze", "Unsqueeze"}}, - {GGML_OP_TIMESTEP_EMBEDDING, {"Range", "Power", "Multiply", "Sin", "Cos", "Concat"}}, - {GGML_OP_TRANSPOSE, {"Transpose"}}, - {GGML_OP_UPSCALE, {"Interpolate"}}, - {GGML_OP_VIEW, {"Reshape"}}, - {GGML_OP_CONT, {"Reshape", "StridedSlice"}}, - {GGML_OP_CPY, {"Reshape", "ScatterNDUpdate"}}, - {GGML_OP_WIN_PART, {"StridedSlice", "Concat", "Reshape", "Custom"}}, - {GGML_OP_WIN_UNPART, {"Reshape", "Transpose", "Custom"}}, - }; - - static const std::map> op_mapping_unary = { - {GGML_UNARY_OP_SILU, {"Sigmoid", "Multiply"}}, - }; - - std::vector mapped_ops; - if (op->op == GGML_OP_UNARY) { - auto it = op_mapping_unary.find(ggml_get_unary_op(op)); - if (it == op_mapping_unary.end()) { - return false; - } - mapped_ops = it->second; - } else { - auto it = op_mapping.find(op->op); - if (it == op_mapping.end()) { - return false; - } - mapped_ops = it->second; - } - - for (const std::string& op_name : mapped_ops) { - if (openvino_ops.count(op_name) == 0) { - return false; - } - } - - return true; + if (op->op == GGML_OP_UNARY) { + return supported_unary_ops.find(ggml_get_unary_op(op)) != + supported_unary_ops.end(); + } + return supported_ops.find(op->op) != supported_ops.end(); } static bool ggml_backend_openvino_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index f4d9c7705a..c32ad65842 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -1,6 +1,7 @@ #include "utils.h" #include "ggml-backend-impl.h" #include "ggml-impl.h" +#include "ggml.h" #include #include #include @@ -13,7 +14,7 @@ std::shared_ptr get_ggml_decoder(struct ggml_cgraph * cgraph, con return std::make_shared(nullptr, cgraph, start_index, end_index); } -std::vector> get_ggml_graph_input_tensors(std::shared_ptr ggml_decoder, bool flag) { +std::vector> get_ggml_graph_input_tensors(std::shared_ptr ggml_decoder) { std::vector> input_tensors; auto input_names = 
ggml_decoder->get_input_names(); size_t op_iter = 0; @@ -77,10 +78,13 @@ static ov::frontend::FrontEnd::Ptr get_ggml_frontend() { return front_end; } -enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph, const int32_t start_index, const int32_t end_index, bool flag) { +enum ggml_status openvino_frontend_compute(ggml_backend_t backend, + struct ggml_cgraph *cgraph, + const int32_t start_index, + const int32_t end_index) { static ov::Core core; + // auto devices = core.get_available_devices(); - // Get GGML Frontend static auto front_end = get_ggml_frontend(); if (!front_end) { GGML_LOG_ERROR("GGML FrontEnd is not initialized \n"); @@ -90,6 +94,7 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c GGML_LOG_INFO("GGML FrontEnd is initialized \n"); #endif } + auto ggml_decoder = get_ggml_decoder(cgraph, start_index, end_index); std::shared_ptr graph_decoder = ggml_decoder; // Load GraphIterator -> InputModel @@ -123,26 +128,18 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c } ov::CompiledModel compiled_model = core.compile_model(model); - - // Create infer request ov::InferRequest infer_request = compiled_model.create_infer_request(); - // Get input tensor auto input_names = ggml_decoder->get_input_names(); - auto input_tensors = get_ggml_graph_input_tensors(ggml_decoder, flag); - + auto input_tensors = get_ggml_graph_input_tensors(ggml_decoder); auto ov_params = model->get_parameters(); for (size_t i = 0; i < ov_params.size(); i++) { auto param_name = ov_params[i]->get_friendly_name(); infer_request.set_input_tensor(i, get_ggml_graph_input_tensor(ggml_decoder, param_name)); } - // for (size_t i = 0; i < input_names.size(); i++) { - // infer_request.set_input_tensor(i, input_tensors.at(i).second); - // } infer_request.infer(); - // Set dst data for outputs auto output_names = ggml_decoder->get_output_names(); auto output_tensors = get_ggml_graph_output_dst(ggml_decoder); for (size_t i = 0; i < output_names.size(); i++) { diff --git a/ggml/src/ggml-openvino/utils.h b/ggml/src/ggml-openvino/utils.h index 7806c418cb..0f5617ab4b 100644 --- a/ggml/src/ggml-openvino/utils.h +++ b/ggml/src/ggml-openvino/utils.h @@ -1,4 +1,4 @@ #include "ggml-decoder.h" #include "ggml-backend-impl.h" -enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph, const int32_t start_index=0, const int32_t end_index=0, bool flag = true); +enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph, const int32_t start_index=0, const int32_t end_index=0); From 8d263bd6a520d4b01bc382fe66f099ddd7e9e70e Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Tue, 15 Apr 2025 19:43:29 +0800 Subject: [PATCH 049/254] 2nd+ token correct by fix CPY in OV, remove single op backend compute code --- ggml/src/ggml-openvino.cpp | 29 +++++++---------------------- 1 file changed, 7 insertions(+), 22 deletions(-) diff --git a/ggml/src/ggml-openvino.cpp b/ggml/src/ggml-openvino.cpp index 5ea2351e06..efb8ff12bc 100644 --- a/ggml/src/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino.cpp @@ -1,18 +1,14 @@ #include "ggml-backend-impl.h" -#include "ggml-cpu-impl.h" #include "ggml-impl.h" #include "ggml-openvino.h" #include "ggml-openvino/utils.h" #include "ggml.h" -#include #include -#include #include -#include -#include -#include -#include +#include +#include +#include #define GGML_OPENVINO_MAX_STREAMS 8 @@ -55,10 +51,10 @@ static ggml_backend_buffer_type_t 
ggml_backend_openvino_get_default_buffer_type( GGML_UNUSED(backend); } -static void ggml_backend_openvino_add_forward(ggml_tensor * dst) { - // Step 1: get the input tensor src0 和 src1 - const struct ggml_tensor *src0 = dst->src[0]; - const struct ggml_tensor *src1 = dst->src[1]; +static enum ggml_status +ggml_backend_openvino_graph_compute(ggml_backend_t backend, struct ggml_cgraph *cgraph) { + int end_node = cgraph->n_nodes - 1; + openvino_frontend_compute(backend, cgraph, 0, end_node); ov::Core core; @@ -1267,17 +1263,6 @@ static ggml_backend_buffer_t ggml_backend_openvino_device_buffer_from_host_ptr(g return nullptr; } -std::set get_openvino_available_opsets() { - ov::Core core; - std::set unique_ops; - for (const auto& opset : ov::get_available_opsets()) { - for (const auto& op : opset.second().get_type_info_set()) { - unique_ops.insert(op.name); - } - } - return unique_ops; -} - static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) { GGML_ASSERT(dev->reg != nullptr); From 8b408869ae97ebdbf5c55fb9e5dd7feac3446087 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Thu, 17 Apr 2025 17:42:44 +0800 Subject: [PATCH 050/254] Arbitrary token len (>32) work; Fix bug in mulmat --- ggml/src/ggml-openvino/ggml-decoder.cpp | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index d7895c3d7f..b1fc8ec67e 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -56,13 +56,8 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapsrc[1]) || - node->src[1]->ne[0] * node->src[1]->nb[0] != node->src[1]->nb[1]) { - m_continuous = false; - } else { - m_continuous = true; - } - break; + m_continuous = node->src[0]->view_src == nullptr; + break; } default: break; From 6ed44a3dffee5bf4c4aa49966b99ec13a5f65fb2 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Mon, 21 Apr 2025 15:14:43 +0800 Subject: [PATCH 051/254] FEAT: do PERMUTE eagerly --- ggml/src/ggml-openvino/ggml-decoder.cpp | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index b1fc8ec67e..c639d630f3 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -43,12 +43,8 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapop) { case GGML_OP_CONT: { - if (ggml_is_contiguous(node->src[0]) && ggml_is_contiguous(node) && - (node->src[0]->ne[0] * node->src[0]->nb[0] == node->src[0]->nb[1])) { - m_continuous = true; - } else { - m_continuous = false; - } + // Currently only two cases, either the input comes from a VIEW which is subtensor or from a PERMUTE + m_continuous = ggml_nelements(node->src[0]) == ggml_nelements(node->src[0]->view_src); break; } case GGML_OP_CPY: { @@ -183,9 +179,9 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor * node, struct ggml_cgraph * cgr // Init model input and output set_input_output(cur_node, m_inputs, m_outputs); } - #ifdef GGML_OPENVINO_DEBUG - ggml_graph_op_print(m_cgraph); - #endif + if (getenv("GGML_OPENVINO_DEBUG")) { + ggml_graph_op_print(m_cgraph); + } } } From 0c7b026ecc486e823cec2e0931fbc1ff7ab0174e Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Tue, 22 Apr 2025 19:03:12 +0800 Subject: [PATCH 052/254] FEAT: Add interleaved mode for ROPE --- ggml/src/ggml-openvino/ggml-decoder.cpp | 28 ++++++++++++++----------- 1 file changed, 16 
insertions(+), 12 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index c639d630f3..2dbde9ea5a 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -103,12 +103,6 @@ void ggml_graph_op_print(const struct ggml_cgraph * cgraph) { << std::setw(5) << node->src[0]->ne[3] << "] " << std::setw(12) << "0: " << std::left << std::setw(12) << ggml_op_name(node->src[0]->op) << std::right; - // // Custom logic to handle '\000' - // const char* name_ptr = node->src[0]->name; - // while (*name_ptr != '\0' || *(name_ptr + 1) != '\0' || *(name_ptr + 2) != '\0') { - // file << *name_ptr; - // name_ptr++; - // } file << std::left << std::setw(30) << node->src[0]->name << std::right << std::setw(16) << "[ " << std::setw(0) << node->src[0]->nb[0] << ", " @@ -125,12 +119,6 @@ void ggml_graph_op_print(const struct ggml_cgraph * cgraph) { << std::setw(5) << node->src[1]->ne[3] << "] " << std::setw(12) << "1: " << std::left << std::setw(12) << ggml_op_name(node->src[1]->op) << std::right; - // // Custom logic to handle '\000' - // const char* name_ptr = node->src[1]->name; - // while (*name_ptr != '\0' || *(name_ptr + 1) != '\0' || *(name_ptr + 2) != '\0') { - // file << *name_ptr; - // name_ptr++; - // } file << std::left << std::setw(30) << node->src[1]->name << std::right << std::setw(16) << "[ " << std::setw(0) << node->src[1]->nb[0] << ", " @@ -139,6 +127,22 @@ void ggml_graph_op_print(const struct ggml_cgraph * cgraph) { << std::setw(5) << node->src[1]->nb[3] << "] " << "\n"; } + if (node->src[2]) { + file << std::setw(10) << " [ " + << std::setw(5) << node->src[2]->ne[0] << ", " + << std::setw(5) << node->src[2]->ne[1] << ", " + << std::setw(5) << node->src[2]->ne[2] << ", " + << std::setw(5) << node->src[2]->ne[3] << "] " + << std::setw(12) + << "2: " << std::left << std::setw(12) << ggml_op_name(node->src[1]->op) << std::right; + file << std::left << std::setw(30) << node->src[2]->name << std::right + << std::setw(16) << "[ " + << std::setw(0) << node->src[2]->nb[0] << ", " + << std::setw(5) << node->src[2]->nb[1] << ", " + << std::setw(5) << node->src[2]->nb[2] << ", " + << std::setw(5) << node->src[2]->nb[3] << "] " + << "\n"; + } } file << "n_leafs = " << cgraph->n_leafs << "\n"; From c04966cda6b756470dee7e2809c23baf0840cf69 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Mon, 28 Apr 2025 12:00:13 +0800 Subject: [PATCH 053/254] REFACTOR: support weigts as constant --- ggml/src/ggml-openvino.cpp | 3 +- ggml/src/ggml-openvino/decoder.h | 22 +- ggml/src/ggml-openvino/ggml-decoder.cpp | 397 +++++++++++++----------- ggml/src/ggml-openvino/ggml-decoder.h | 34 +- ggml/src/ggml-openvino/utils.cpp | 154 ++++----- ggml/src/ggml-openvino/utils.h | 2 +- 6 files changed, 334 insertions(+), 278 deletions(-) diff --git a/ggml/src/ggml-openvino.cpp b/ggml/src/ggml-openvino.cpp index efb8ff12bc..5221a1ff8b 100644 --- a/ggml/src/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino.cpp @@ -53,8 +53,7 @@ static ggml_backend_buffer_type_t ggml_backend_openvino_get_default_buffer_type( static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backend, struct ggml_cgraph *cgraph) { - int end_node = cgraph->n_nodes - 1; - openvino_frontend_compute(backend, cgraph, 0, end_node); + openvino_frontend_compute(backend, cgraph); ov::Core core; diff --git a/ggml/src/ggml-openvino/decoder.h b/ggml/src/ggml-openvino/decoder.h index e287f31e23..c0641e2662 100644 --- a/ggml/src/ggml-openvino/decoder.h +++ 
b/ggml/src/ggml-openvino/decoder.h @@ -1,19 +1,14 @@ #pragma once +#include + #include "openvino/core/node.hpp" #include "openvino/frontend/decoder.hpp" -#include "openvino/op/parameter.hpp" namespace ov { namespace frontend { namespace ggml { -// 定义 tensor_info 结构体 -struct tensor_info { - - std::vector shape; - std::vector stride; -}; // TODO: Directly include from openvino class GgmlDecoder : public DecoderBase { public: @@ -36,10 +31,6 @@ public: virtual std::vector get_input_names() const = 0; - virtual std::string& get_op_node_name(const std::string& name, const int index = -1) = 0; - - // virtual const struct tensor_info get_node_op_info(const std::string& name) const = 0; - virtual PartialShape get_output_shape(const std::string& name) const = 0; virtual std::vector get_output_stride(const std::string& name) const = 0; @@ -64,14 +55,11 @@ public: virtual void visit_subgraph(std::function)> node_visitor) const = 0; - // virtual const std::vector& outputs() const = 0; - - // virtual size_t output(size_t index) const = 0; - virtual bool check_if_continuous() const = 0; - virtual const std::vector>& get_params() const = 0; - + virtual const std::unordered_map>& get_model_inputs() const = 0; + virtual const std::unordered_map>& get_model_weights() const = 0; + virtual const std::vector& get_model_output_names() const = 0; }; } // namespace ggml diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 2dbde9ea5a..05947ff579 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -1,11 +1,62 @@ #include "ggml-decoder.h" -#include -#include -#include -#include -#include -void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::map& inputs, std::map& outputs) { +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ggml-backend-impl.h" +#include "ggml-backend.h" + +GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgraph) + : m_cgraph(cgraph), + m_node(node), + m_op_name(m_node ? std::string(m_node->name) : "NONE_OP") { + if (m_node) { + set_input_output(m_node); + } else { + // std::map> address_map; + // for (int node_n = start_index; node_n <= end_index; node_n++) { + // auto node = cgraph->nodes[node_n]; + // if (node->data) { + // auto it = address_map.find(node->data); + // if (it == address_map.end()) { + // address_map[node->data] = std::vector(); + // } + // address_map[node->data].push_back(node->name); + // } + // } + // for (const auto& pair : address_map) { + // std::cout << "Address: " << pair.first << " -> "; + // for (const auto& name : pair.second) { + // std::cout << name << " ;"; + // } + // std::cout << std::endl; + // } + + for (int node_n = 0; node_n < m_cgraph->n_nodes; node_n++) { + auto* cur_node = m_cgraph->nodes[node_n]; + m_nodes.push_back(cur_node); + // Init model input and output + set_input_output(cur_node); + } + if (getenv("GGML_OPENVINO_DUMP_CGRAPH")) { + dump_cgraph(m_cgraph); + } + } +} + +// Called in GgmlOvDecoder constructor. Two cases: 1. constructing a decoder for the whole graph; +// 2. constructing a decoder for a node. +void GgmlOvDecoder::set_input_output(ggml_tensor* node) { std::string node_name; if (node->op == GGML_OP_CPY) { // CPY updates the input tensor in place. 
For later ov op that uses the @@ -17,51 +68,130 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapname); } - std::string src0_name = std::string(node->src[0]->name); - inputs[src0_name] = node->src[0]; - outputs[node_name] = node; - m_input_names.push_back(src0_name); - m_op_node_name.emplace_back(src0_name, ggml_op_name(node->op)); - if (node->op == GGML_OP_CPY && node->view_src) { - m_output_names.push_back(node->view_src->name); - } else { - m_output_names.push_back(node_name); + m_output_names.push_back(node_name); + m_outputs[node_name] = node; + + for (int i = 0; i < GGML_MAX_SRC; i++) { + auto* src = node->src[i]; + if (src == nullptr) { + continue; + } + std::string src_name = std::string(src->name); + m_input_names.push_back(src_name); + m_inputs[src_name] = src; + m_op_node_name.emplace_back(src_name, ggml_op_name(node->op)); + + // If called for the whole graph, create constant nodes for weights and param nodes for inputs + if (!m_node && !src->view_src) { + ggml_backend_buffer* buffer = src->buffer; + + if (buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) { + bool weight_as_input = getenv("GGML_OPENVINO_WEIGHT_AS_INPUT"); + auto& weights_map = weight_as_input ? m_model_inputs : m_model_weights; + if (weights_map.find(src_name) != weights_map.end()) { + continue; + } + + std::shared_ptr weight_node = + weight_as_input + ? std::make_shared(get_ov_type(src), ov::Shape{get_shape(src)}) + : create_weight_node(src); + weight_node->set_friendly_name(src_name); + weights_map[src_name] = weight_node; + + } else if (buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY || src->flags & GGML_TENSOR_FLAG_INPUT) { + // GGML_BACKEND_BUFFER_USAGE_ANY are kv caches + if (buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY) { + assert(src_name.find("cache_k") == 0 || src_name.find("cache_v") == 0); + } + if (m_model_inputs.find(src_name) != m_model_inputs.end()) { + continue; + } + auto param_node = std::make_shared(get_ov_type(src), ov::Shape{get_shape(src)}); + param_node->set_friendly_name(src_name); + m_model_inputs[src_name] = param_node; + } + } } - if (node->src[1]) { - std::string src1_name = std::string(node->src[1]->name); - inputs[src1_name] = node->src[1]; - m_input_names.push_back(src1_name); - m_op_node_name.emplace_back(src1_name, ggml_op_name(node->op)); - } - if (node->src[2]) { - std::string src2_name = std::string(node->src[2]->name); - inputs[src2_name] = node->src[2]; - m_input_names.push_back(src2_name); - m_op_node_name.emplace_back(src2_name, ggml_op_name(node->op)); + if (!m_node) { + // Workaround: the final tensor "result_output" does not have GGML_TENSOR_FLAG_OUTPUT flag set in cgraph + if (node->buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY || node->flags & GGML_TENSOR_FLAG_OUTPUT || + std::string(node->name).find("result") == 0) { + auto name = node->view_src ? 
std::string(node->view_src->name) : std::string(node->name); + if (node->buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY) { + assert(name.find("cache_k") == 0 || name.find("cache_v") == 0); + } + auto it = std::find(m_model_output_names.begin(), m_model_output_names.end(), name); + if (it == m_model_output_names.end()) { + m_model_output_names.push_back(name); + } + } } - switch (node->op) { - case GGML_OP_CONT: { - // Currently only two cases, either the input comes from a VIEW which is subtensor or from a PERMUTE - m_continuous = ggml_nelements(node->src[0]) == ggml_nelements(node->src[0]->view_src); - break; - } - case GGML_OP_CPY: { - m_continuous = ggml_is_contiguous(node); - break; - } - case GGML_OP_MUL_MAT: { - m_continuous = node->src[0]->view_src == nullptr; - break; - } - default: - break; + if (m_node) { + switch (node->op) { + case GGML_OP_CONT: { + // Currently only two cases, either the input comes from a VIEW which is subtensor or from a PERMUTE + m_continuous = ggml_nelements(node->src[0]) == ggml_nelements(node->src[0]->view_src); + break; + } + case GGML_OP_CPY: { + m_continuous = ggml_is_contiguous(node); + break; + } + case GGML_OP_MUL_MAT: { + m_continuous = node->src[0]->view_src == nullptr; + break; + } + default: + break; + } } } -void ggml_graph_op_print(const struct ggml_cgraph * cgraph) { - std::ofstream file("01_nodes.txt"); +std::shared_ptr GgmlOvDecoder::create_weight_node(ggml_tensor* tensor) { + std::shared_ptr weight_node; + auto node_type = get_ov_type(tensor); + auto node_shape = get_shape(tensor); + auto ne_total = ggml_nelements(tensor); + switch (tensor->type) { + case GGML_TYPE_I32: { + const auto* ptr = reinterpret_cast(tensor->data); + std::vector data(ptr, ptr + ne_total); + weight_node = std::make_shared(node_type, node_shape, data); + break; + } + case GGML_TYPE_I64: { + const auto* ptr = reinterpret_cast(tensor->data); + std::vector data(ptr, ptr + ne_total); + weight_node = std::make_shared(node_type, node_shape, data); + break; + } + case GGML_TYPE_F32: { + const auto* ptr = reinterpret_cast(tensor->data); + std::vector data(ptr, ptr + ne_total); + weight_node = std::make_shared(node_type, node_shape, data); + break; + } + case GGML_TYPE_F16: { + const auto* ptr = reinterpret_cast(tensor->data); + std::vector data_f16; + data_f16.reserve(ne_total); + for (int i = 0; i < ne_total; ++i) { + data_f16.push_back(ov::float16::from_bits(ptr[i])); + } + weight_node = std::make_shared(node_type, node_shape, data_f16); + break; + } + default: + throw std::invalid_argument("Unsupported tensor type"); + } + return weight_node; +} + +void GgmlOvDecoder::dump_cgraph(const struct ggml_cgraph* cgraph) { + std::ofstream file("cgraph.txt"); if (!file.is_open()) { std::cerr << "Failed to open file" << std::endl; return; @@ -160,88 +290,53 @@ void ggml_graph_op_print(const struct ggml_cgraph * cgraph) { file.close(); } - -GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor * node, struct ggml_cgraph * cgraph, const int32_t start_index, const int32_t end_index) - :m_cgraph(cgraph), - m_node(node), - m_op_name(m_node ? 
std::string(m_node->name) : "NONE_OP") { - m_inputs.clear(); - m_outputs.clear(); - m_input_names.clear(); - m_output_names.clear(); - m_params.clear(); - m_op_node_name.clear(); - m_decoders.clear(); - - if (m_node) { - set_input_output(m_node, m_inputs, m_outputs); - } else { - // for (int node_n = 0; node_n < m_cgraph->n_nodes; node_n++) { - for (int node_n = start_index; node_n <= end_index; node_n++) { - auto cur_node = m_cgraph->nodes[node_n]; - m_nodes.push_back(cur_node); - // Init model input and output - set_input_output(cur_node, m_inputs, m_outputs); - } - if (getenv("GGML_OPENVINO_DEBUG")) { - ggml_graph_op_print(m_cgraph); - } +std::vector GgmlOvDecoder::get_shape(const ggml_tensor* tensor) { + std::vector shape; + for (int i = GGML_MAX_DIMS - 2; i >= 0; --i) { + shape.push_back(static_cast(tensor->ne[i])); } + return shape; +} + +std::vector GgmlOvDecoder::get_stride(const ggml_tensor* tensor) { + std::vector stride; + for (int i = GGML_MAX_DIMS - 2; i >= 0 ; --i) { + stride.push_back(static_cast(tensor->nb[i])); + } + return stride; +} + +ov::element::Type GgmlOvDecoder::get_ov_type(const ggml_tensor* tensor) { + ov::element::Type type = ov::element::dynamic; + switch (tensor->type) { + case GGML_TYPE_F32: + type = ov::element::f32; + break; + case GGML_TYPE_F16: + type = ov::element::f16; + break; + case GGML_TYPE_I64: + type = ov::element::i64; + break; + case GGML_TYPE_I32: + type = ov::element::i32; + break; + default: + break; + } + return type; } ov::PartialShape GgmlOvDecoder::get_input_shape(const std::string& name) const { - ov::PartialShape input_shape; - // Use input_node->ne - ggml_tensor * node = m_inputs.at(name); - std::vector shape; - - for (int i = GGML_MAX_DIMS - 2; i >= 0 ; --i) { - if (node->ne[i] == 0) { - return input_shape; - } - shape.push_back(static_cast(node->ne[i])); - } - input_shape = ov::PartialShape(shape); - return input_shape; + return ov::PartialShape(get_shape(m_inputs.at(name))); } std::vector GgmlOvDecoder::get_input_stride(const std::string& name) const { - std::vector stride; - ggml_tensor * node = m_inputs.at(name); - for (int i = GGML_MAX_DIMS - 2; i >= 0 ; --i) { - stride.push_back(static_cast(node->nb[i])); - } - return stride; -} - -std::vector GgmlOvDecoder::get_output_stride(const std::string& name) const { - std::vector stride; - ggml_tensor * node = m_outputs.at(name); - for (int i = GGML_MAX_DIMS - 2; i >= 0 ; --i) { - stride.push_back(static_cast(node->nb[i])); - } - return stride; + return get_stride(m_inputs.at(name)); } ov::element::Type GgmlOvDecoder::get_input_type(const std::string& name) const { - ov::element::Type type = ov::element::dynamic; - switch (m_inputs.at(name)->type) { - case GGML_TYPE_F32: - type = ov::element::f32; - break; - case GGML_TYPE_F16: - type = ov::element::f16; - break; - case GGML_TYPE_I64: - type = ov::element::i64; - break; - case GGML_TYPE_I32: - type = ov::element::i32; - break; - default: - break; - } - return type; + return get_ov_type(m_inputs.at(name)); } size_t GgmlOvDecoder::get_input_size() const { @@ -257,69 +352,16 @@ std::vector GgmlOvDecoder::get_input_names() const { return m_input_names; } -std::string& GgmlOvDecoder::get_op_node_name(const std::string& key_name, const int index) { - if (index == -1) { - for (size_t i = 0; i < m_op_node_name.size(); ++i) { - if (m_op_node_name[i].first == key_name) { - return m_op_node_name[i].second; - } - } - } else { - return m_op_node_name[index].second; - } - - static std::string empty_string = ""; - return empty_string; // empty string 
-} - -const std::vector>& GgmlOvDecoder::get_params() const { - return m_params; +std::vector GgmlOvDecoder::get_output_stride(const std::string& name) const { + return get_stride(m_outputs.at(name)); } ov::PartialShape GgmlOvDecoder::get_output_shape(const std::string& name) const { - ov::PartialShape output_shape; - ggml_tensor * node = m_outputs.at(name); - std::vector shape; - - for (int i = GGML_MAX_DIMS - 2; i >= 0 ; --i) { - if (node->ne[i] == 0 ) { - // empty if any dimension has no elements - return output_shape; - } - shape.push_back(static_cast(node->ne[i])); - } - output_shape = ov::PartialShape(shape); - return output_shape; + return ov::PartialShape(get_shape(m_outputs.at(name))); } ov::element::Type GgmlOvDecoder::get_output_type(const std::string& name) const { - // TODO: Change to Output - ov::element::Type type = ov::element::dynamic; - switch (m_outputs.at(name)->type) { - case GGML_TYPE_F32: - type = ov::element::f32; - break; - case GGML_TYPE_F16: - type = ov::element::f16; - break; - case GGML_TYPE_I64: - type = ov::element::i64; - break; - case GGML_TYPE_I32: - type = ov::element::i32; - break; - default: - break; - } - return type; -} - -int32_t* GgmlOvDecoder::get_input_op_params(const std::string& name) const{ - return m_inputs.at(name)->op_params; -} - -int32_t* GgmlOvDecoder::get_output_op_params(const std::string& name) const{ - return m_outputs.at(name)->op_params; + return get_ov_type(m_outputs.at(name)); } std::string& GgmlOvDecoder::get_output_name(size_t index) const { @@ -335,10 +377,17 @@ const std::string& GgmlOvDecoder::get_op_name() const { return m_op_name; } +int32_t* GgmlOvDecoder::get_input_op_params(const std::string& name) const { + return m_inputs.at(name)->op_params; +} + +int32_t* GgmlOvDecoder::get_output_op_params(const std::string& name) const { + return m_outputs.at(name)->op_params; +} + void GgmlOvDecoder::visit_subgraph(std::function)> node_visitor) const { for (const auto& node : m_nodes) { auto decoder = std::make_shared(node, m_cgraph); - // m_decoders.push_back(decoder); node_visitor(decoder); } } diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index eac045d158..2182ad624d 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -1,14 +1,17 @@ #pragma once +#include +#include +#include + #include "decoder.h" #include "ggml.h" -#include "openvino/op/parameter.hpp" class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { public: using ov::frontend::ggml::GgmlDecoder::GgmlDecoder; - GgmlOvDecoder(struct ggml_tensor * node, struct ggml_cgraph * cgraph, const int32_t start_index=0, const int32_t end_index=0); + GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgraph); virtual ov::Any get_attribute(const std::string& name) const override { return nullptr; @@ -73,12 +76,23 @@ public: return m_continuous; } - std::string& get_op_node_name(const std::string& key_name, const int index) override; - - virtual const std::vector>& get_params() const override; + virtual const std::unordered_map>& get_model_inputs() const override { + return m_model_inputs; + } + virtual const std::unordered_map>& get_model_weights() const override { + return m_model_weights; + } + virtual const std::vector& get_model_output_names() const override { + return m_model_output_names; + } private: - void set_input_output(ggml_tensor* node, std::map& inputs, std::map& outputs); + void set_input_output(ggml_tensor* node); + static void dump_cgraph(const struct 
ggml_cgraph* cgraph); + static std::vector get_shape(const ggml_tensor* tensor); + static std::vector get_stride(const ggml_tensor* tensor); + static ov::element::Type get_ov_type(const ggml_tensor* tensor); + static std::shared_ptr create_weight_node(ggml_tensor* tensor); struct ggml_cgraph * m_cgraph; std::map m_inputs; @@ -86,12 +100,12 @@ private: std::map m_outputs; std::vector m_output_names; ggml_tensor* m_node; - std::vector m_nodes; - std::vector> m_decoders; + std::vector m_nodes; std::string m_op_name; mutable std::string m_name; bool m_continuous; - std::vector> m_params; std::vector> m_op_node_name; + std::unordered_map> m_model_inputs; + std::unordered_map> m_model_weights; + std::vector m_model_output_names; }; - diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index c32ad65842..7937d5793a 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -1,49 +1,22 @@ #include "utils.h" -#include "ggml-backend-impl.h" -#include "ggml-impl.h" -#include "ggml.h" + +#include +#include #include -#include #include +#include #include #include -using ov::frontend::ggml::GgmlDecoder; +#include "ggml-impl.h" +#include "ggml.h" -std::shared_ptr get_ggml_decoder(struct ggml_cgraph * cgraph, const int32_t start_index, const int32_t end_index) { - return std::make_shared(nullptr, cgraph, start_index, end_index); -} - -std::vector> get_ggml_graph_input_tensors(std::shared_ptr ggml_decoder) { - std::vector> input_tensors; - auto input_names = ggml_decoder->get_input_names(); - size_t op_iter = 0; - for (size_t inp = 0; inp < input_names.size(); ++inp) { - auto name = input_names[inp]; - std::string op_node_name = ggml_decoder->get_op_node_name(name, op_iter++); - // auto node_op_name = ggml_decoder->get_node_op_name(name); - auto input_data = ggml_decoder->get_input_ggml_tensor(name)->data; - #ifdef GGML_OPENVINO_DEBUG - printf("Subgraph input %d: %g\n", inp, *(double*)(input_data)); - #endif - ov::Tensor input_tensor; - ov::Shape input_shape = ggml_decoder->get_input_shape(name).to_shape(); - - std::vector input_stride = ggml_decoder->get_input_stride(name); - input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), input_shape, input_data); - - // input_tensors[name] = input_tensor; - input_tensors.emplace_back(name, input_tensor); - } - // std::cout << "input_names.size(): " << input_names.size() << std::endl; - return input_tensors; +std::shared_ptr get_ggml_decoder(struct ggml_cgraph* cgraph) { + return std::make_shared(nullptr, cgraph); } ov::Tensor get_ggml_graph_input_tensor(std::shared_ptr ggml_decoder, std::string& name) { - auto input_data = ggml_decoder->get_input_ggml_tensor(name)->data; - #ifdef GGML_OPENVINO_DEBUG - printf("Subgraph input %s: %g\n", name.c_str(), *(double*)(input_data)); - #endif + auto* input_data = ggml_decoder->get_input_ggml_tensor(name)->data; ov::Tensor input_tensor; ov::Shape input_shape = ggml_decoder->get_input_shape(name).to_shape(); std::vector input_stride = ggml_decoder->get_input_stride(name); @@ -53,19 +26,16 @@ ov::Tensor get_ggml_graph_input_tensor(std::shared_ptr ggml_decod std::map get_ggml_graph_output_dst(std::shared_ptr ggml_decoder) { std::map output_tensors; - auto output_names = ggml_decoder->get_output_names(); + auto output_names = ggml_decoder->get_model_output_names(); for (size_t inp = 0; inp < output_names.size(); ++inp) { auto name = output_names[inp]; - auto output_data = ggml_decoder->get_output_ggml_tensor(name)->data; - #ifdef GGML_OPENVINO_DEBUG - printf("Output 
%d: %g\n", inp, *(double*)(output_data)); - #endif + const auto* tensor = ggml_decoder->get_output_ggml_tensor(name); + auto* output_data = tensor->view_src ? tensor->view_src->data : tensor->data; output_tensors[name] = output_data; } return output_tensors; } - static ov::frontend::FrontEnd::Ptr get_ggml_frontend() { ov::frontend::FrontEnd::Ptr front_end = nullptr; auto fem = ov::frontend::FrontEndManager(); @@ -78,10 +48,9 @@ static ov::frontend::FrontEnd::Ptr get_ggml_frontend() { return front_end; } -enum ggml_status openvino_frontend_compute(ggml_backend_t backend, - struct ggml_cgraph *cgraph, - const int32_t start_index, - const int32_t end_index) { +enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_cgraph* cgraph) { + auto start_time = ggml_time_us(); + static ov::Core core; // auto devices = core.get_available_devices(); @@ -89,65 +58,102 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, if (!front_end) { GGML_LOG_ERROR("GGML FrontEnd is not initialized \n"); return GGML_STATUS_FAILED; - } else { - #ifdef GGML_OPENVINO_DEBUG - GGML_LOG_INFO("GGML FrontEnd is initialized \n"); - #endif } - auto ggml_decoder = get_ggml_decoder(cgraph, start_index, end_index); + auto ggml_decoder = get_ggml_decoder(cgraph); std::shared_ptr graph_decoder = ggml_decoder; - // Load GraphIterator -> InputModel ov::frontend::InputModel::Ptr input_model = front_end->load(graph_decoder); if (!input_model) { GGML_LOG_ERROR("Input Model is not loaded \n"); return GGML_STATUS_FAILED; - } else { - #ifdef GGML_OPENVINO_DEBUG - GGML_LOG_INFO("Input Model loaded \n"); - #endif } - // Convert InputModel -> ov::Model std::shared_ptr model = front_end->convert(input_model); + auto conversion_end_time = ggml_time_us(); - if (getenv("OPENVINO_DUMP_GRAPH")) { - char timestamped_filename[64]; - auto timestamp = (long long)ggml_time_us(); - snprintf(timestamped_filename, sizeof(timestamped_filename), - "model_%lld.xml", timestamp); - ov::serialize(model, timestamped_filename); + if (getenv("GGML_OPENVINO_DUMP_IR")) { + char timestamped_filename[64]; + auto timestamp = (long long)ggml_time_us(); + snprintf(timestamped_filename, sizeof(timestamped_filename), "model_%lld.xml", timestamp); + ov::serialize(model, timestamped_filename); } if (!model) { GGML_LOG_ERROR("Model is not converted \n"); - } else { - #ifdef GGML_OPENVINO_DEBUG - GGML_LOG_INFO("Model converted \n"); - #endif } - ov::CompiledModel compiled_model = core.compile_model(model); + ov::CompiledModel compiled_model = + core.compile_model(model, "CPU", ov::device::properties("CPU", ov::cache_dir("/tmp/ov_cache"))); + auto compile_end_time = ggml_time_us(); + ov::InferRequest infer_request = compiled_model.create_infer_request(); + auto infer_request_start_time = ggml_time_us(); auto input_names = ggml_decoder->get_input_names(); - auto input_tensors = get_ggml_graph_input_tensors(ggml_decoder); auto ov_params = model->get_parameters(); for (size_t i = 0; i < ov_params.size(); i++) { auto param_name = ov_params[i]->get_friendly_name(); - infer_request.set_input_tensor(i, get_ggml_graph_input_tensor(ggml_decoder, param_name)); + auto input_tensor = get_ggml_graph_input_tensor(ggml_decoder, param_name); + + if (getenv("GGML_OPENVINO_DEBUG_INPUT")) { + std::cout << "Input name: " << param_name << ", Input shape: " << input_tensor.get_shape() + << ", Address: " << input_tensor.data() << std::endl; + switch (input_tensor.get_element_type()) { + case ov::element::f32: + std::cout << *(float*)(input_tensor.data()) << 
std::endl; + break; + case ov::element::f16: + std::cout << ov::float16::from_bits(*(uint16_t*)(input_tensor.data())) << std::endl; + break; + case ov::element::i32: + std::cout << *(int32_t*)(input_tensor.data()) << std::endl; + break; + case ov::element::i64: + std::cout << *(int64_t*)(input_tensor.data()) << std::endl; + break; + default: + break; + } + } + infer_request.set_input_tensor(i, input_tensor); } + auto input_end_time = ggml_time_us(); infer_request.infer(); + auto infer_end_time = ggml_time_us(); - auto output_names = ggml_decoder->get_output_names(); + auto output_names = ggml_decoder->get_model_output_names(); auto output_tensors = get_ggml_graph_output_dst(ggml_decoder); for (size_t i = 0; i < output_names.size(); i++) { auto output_tensor = infer_request.get_output_tensor(i); std::memcpy(output_tensors[output_names[i]], output_tensor.data(), output_tensor.get_byte_size()); - #ifdef GGML_OPENVINO_DEBUG - printf("Output %s after: %g\n", output_names[i].c_str(), *(double*)(output_tensor.data())); - #endif + + if (getenv("GGML_OPENVINO_DEBUG_OUTPUT")) { + std::cout << "Output name: " << output_names[i] << ", Output shape: " << output_tensor.get_shape() + << ", Address: " << output_tensors[output_names[i]] << std::endl; + switch (output_tensor.get_element_type()) { + case ov::element::f32: + std::cout << *(float*)(output_tensors[output_names[i]]) << std::endl; + break; + case ov::element::f16: + std::cout << ov::float16::from_bits(*(uint16_t*)(output_tensors[output_names[i]])) << std::endl; + break; + default: + break; + } + } + } + auto end_time = ggml_time_us(); + + if (getenv("GGML_OPENVINO_PROFILING")) { + GGML_LOG_INFO("GGML OpenVINO Backend: \n"); + GGML_LOG_INFO(" - Graph conversion Time: %ld ms \n", (conversion_end_time - start_time) / 1000); + GGML_LOG_INFO(" - Graph compile Time: %ld ms \n", (compile_end_time - conversion_end_time) / 1000); + GGML_LOG_INFO(" - Graph InferRequest created Time: %ld ms \n", + (infer_request_start_time - compile_end_time) / 1000); + GGML_LOG_INFO(" - Graph Input Time: %ld ms \n", (input_end_time - infer_request_start_time) / 1000); + GGML_LOG_INFO(" - Graph Inference Time: %ld ms \n", (infer_end_time - input_end_time) / 1000); + GGML_LOG_INFO(" - Graph Output Time: %ld ms \n", (end_time - infer_end_time) / 1000); } return GGML_STATUS_SUCCESS; diff --git a/ggml/src/ggml-openvino/utils.h b/ggml/src/ggml-openvino/utils.h index 0f5617ab4b..b4174c9f21 100644 --- a/ggml/src/ggml-openvino/utils.h +++ b/ggml/src/ggml-openvino/utils.h @@ -1,4 +1,4 @@ #include "ggml-decoder.h" #include "ggml-backend-impl.h" -enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph, const int32_t start_index=0, const int32_t end_index=0); +enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_cgraph* cgraph); From 96ba47dd43b6b19b4b4689fed6b83f9ce61d0702 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Mon, 28 Apr 2025 17:03:21 +0800 Subject: [PATCH 054/254] STYLE: minor refactor --- ggml/src/ggml-openvino/ggml-decoder.cpp | 67 +++++++------------------ 1 file changed, 19 insertions(+), 48 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 05947ff579..6b20159720 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -199,6 +199,7 @@ void GgmlOvDecoder::dump_cgraph(const struct ggml_cgraph* cgraph) { file << "=== GRAPH ===\n"; + // clang-format off file << "n_nodes = " << cgraph->n_nodes << 
"\n"; file << " " << std::setw(3) << "nodes" << std::setw(15) << "shape" @@ -225,53 +226,23 @@ void GgmlOvDecoder::dump_cgraph(const struct ggml_cgraph* cgraph) { << std::setw(5) << node->nb[3] << "] " << "\n"; - if (node->src[0]) { - file << std::setw(10) << " [ " - << std::setw(5) << node->src[0]->ne[0] << ", " - << std::setw(5) << node->src[0]->ne[1] << ", " - << std::setw(5) << node->src[0]->ne[2] << ", " - << std::setw(5) << node->src[0]->ne[3] << "] " - << std::setw(12) - << "0: " << std::left << std::setw(12) << ggml_op_name(node->src[0]->op) << std::right; - file << std::left << std::setw(30) << node->src[0]->name << std::right - << std::setw(16) << "[ " - << std::setw(0) << node->src[0]->nb[0] << ", " - << std::setw(5) << node->src[0]->nb[1] << ", " - << std::setw(5) << node->src[0]->nb[2] << ", " - << std::setw(5) << node->src[0]->nb[3] << "] " - << "\n"; - } - if (node->src[1]) { - file << std::setw(10) << " [ " - << std::setw(5) << node->src[1]->ne[0] << ", " - << std::setw(5) << node->src[1]->ne[1] << ", " - << std::setw(5) << node->src[1]->ne[2] << ", " - << std::setw(5) << node->src[1]->ne[3] << "] " - << std::setw(12) - << "1: " << std::left << std::setw(12) << ggml_op_name(node->src[1]->op) << std::right; - file << std::left << std::setw(30) << node->src[1]->name << std::right - << std::setw(16) << "[ " - << std::setw(0) << node->src[1]->nb[0] << ", " - << std::setw(5) << node->src[1]->nb[1] << ", " - << std::setw(5) << node->src[1]->nb[2] << ", " - << std::setw(5) << node->src[1]->nb[3] << "] " - << "\n"; - } - if (node->src[2]) { - file << std::setw(10) << " [ " - << std::setw(5) << node->src[2]->ne[0] << ", " - << std::setw(5) << node->src[2]->ne[1] << ", " - << std::setw(5) << node->src[2]->ne[2] << ", " - << std::setw(5) << node->src[2]->ne[3] << "] " - << std::setw(12) - << "2: " << std::left << std::setw(12) << ggml_op_name(node->src[1]->op) << std::right; - file << std::left << std::setw(30) << node->src[2]->name << std::right - << std::setw(16) << "[ " - << std::setw(0) << node->src[2]->nb[0] << ", " - << std::setw(5) << node->src[2]->nb[1] << ", " - << std::setw(5) << node->src[2]->nb[2] << ", " - << std::setw(5) << node->src[2]->nb[3] << "] " - << "\n"; + for (int i = 0; i < GGML_MAX_SRC; i++) { + if (auto* src = node->src[i]) { + file << std::setw(10) << " [ " + << std::setw(5) << src->ne[0] << ", " + << std::setw(5) << src->ne[1] << ", " + << std::setw(5) << src->ne[2] << ", " + << std::setw(5) << src->ne[3] << "] " + << std::setw(12) + << i << ": " << std::left << std::setw(12) << ggml_op_name(src->op) << std::right; + file << std::left << std::setw(30) << src->name << std::right + << std::setw(16) << "[ " + << std::setw(0) << src->nb[0] << ", " + << std::setw(5) << src->nb[1] << ", " + << std::setw(5) << src->nb[2] << ", " + << std::setw(5) << src->nb[3] << "] " + << "\n"; + } } } @@ -285,7 +256,7 @@ void GgmlOvDecoder::dump_cgraph(const struct ggml_cgraph* cgraph) { << std::setw(8) << ggml_op_name(node->op) << " " << std::setw(16) << ggml_get_name(node) << "\n"; } - + // clang-format on file << "========================================\n"; file.close(); From d3bdca25bd4d97ccf65d442c3185668414228d4d Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Mon, 28 Apr 2025 17:04:44 +0800 Subject: [PATCH 055/254] PERF: share const nodes for weights for diff infer --- ggml/src/ggml-openvino/ggml-decoder.cpp | 55 ++++++++++++++----------- ggml/src/ggml-openvino/ggml-decoder.h | 2 +- 2 files changed, 32 insertions(+), 25 deletions(-) diff --git 
a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 6b20159720..d42aaf4664 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -12,6 +12,8 @@ #include #include #include +#include +#include #include "ggml-backend-impl.h" #include "ggml-backend.h" @@ -20,34 +22,16 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgrap : m_cgraph(cgraph), m_node(node), m_op_name(m_node ? std::string(m_node->name) : "NONE_OP") { + static std::unordered_map> model_weights; if (m_node) { - set_input_output(m_node); + set_input_output(m_node, model_weights); } else { - // std::map> address_map; - // for (int node_n = start_index; node_n <= end_index; node_n++) { - // auto node = cgraph->nodes[node_n]; - // if (node->data) { - // auto it = address_map.find(node->data); - // if (it == address_map.end()) { - // address_map[node->data] = std::vector(); - // } - // address_map[node->data].push_back(node->name); - // } - // } - // for (const auto& pair : address_map) { - // std::cout << "Address: " << pair.first << " -> "; - // for (const auto& name : pair.second) { - // std::cout << name << " ;"; - // } - // std::cout << std::endl; - // } - for (int node_n = 0; node_n < m_cgraph->n_nodes; node_n++) { auto* cur_node = m_cgraph->nodes[node_n]; m_nodes.push_back(cur_node); - // Init model input and output - set_input_output(cur_node); + set_input_output(cur_node, model_weights); } + m_model_weights = model_weights; if (getenv("GGML_OPENVINO_DUMP_CGRAPH")) { dump_cgraph(m_cgraph); } @@ -56,7 +40,8 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgrap // Called in GgmlOvDecoder constructor. Two cases: 1. constructing a decoder for the whole graph; // 2. constructing a decoder for a node. -void GgmlOvDecoder::set_input_output(ggml_tensor* node) { +void GgmlOvDecoder::set_input_output(ggml_tensor* node, + std::unordered_map>& model_weights) { std::string node_name; if (node->op == GGML_OP_CPY) { // CPY updates the input tensor in place. For later ov op that uses the @@ -87,7 +72,7 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node) { if (buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) { bool weight_as_input = getenv("GGML_OPENVINO_WEIGHT_AS_INPUT"); - auto& weights_map = weight_as_input ? m_model_inputs : m_model_weights; + auto& weights_map = weight_as_input ? 
m_model_inputs : model_weights; if (weights_map.find(src_name) != weights_map.end()) { continue; } @@ -261,6 +246,28 @@ void GgmlOvDecoder::dump_cgraph(const struct ggml_cgraph* cgraph) { file.close(); } + +void print_tensor_address_map(const struct ggml_cgraph* cgraph) { + std::map> address_map; + for (int node_n = 0; node_n <= cgraph->n_nodes; node_n++) { + auto* node = cgraph->nodes[node_n]; + if (node->data) { + auto it = address_map.find(node->data); + if (it == address_map.end()) { + address_map[node->data] = std::vector(); + } + address_map[node->data].push_back(node->name); + } + } + for (const auto& pair : address_map) { + std::cout << "Address: " << pair.first << std::endl; + for (const auto& name : pair.second) { + std::cout << name << " ; "; + } + std::cout << std::endl << std::endl; + } +} + std::vector GgmlOvDecoder::get_shape(const ggml_tensor* tensor) { std::vector shape; for (int i = GGML_MAX_DIMS - 2; i >= 0; --i) { diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index 2182ad624d..a71c5e4e1f 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -87,7 +87,7 @@ public: } private: - void set_input_output(ggml_tensor* node); + void set_input_output(ggml_tensor* node, std::unordered_map>& model_weights); static void dump_cgraph(const struct ggml_cgraph* cgraph); static std::vector get_shape(const ggml_tensor* tensor); static std::vector get_stride(const ggml_tensor* tensor); From 0a8cc9ab033fee537fb1e2298bbf23ede271ff48 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Tue, 29 Apr 2025 14:31:35 +0800 Subject: [PATCH 056/254] BUILD: update build doc, add cmake preset, add CACHE_DIR env var --- CMakePresets.json | 20 ++++++++++++++++++++ ggml/src/ggml-openvino/utils.cpp | 8 +++++--- 2 files changed, 25 insertions(+), 3 deletions(-) diff --git a/CMakePresets.json b/CMakePresets.json index b5afeb3c0f..392c357f37 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -1,6 +1,26 @@ { "version": 4, "configurePresets": [ + { + "name": "ReleaseOV", + "generator": "Ninja", + "binaryDir": "${sourceDir}/build/${presetName}", + "installDir": "${sourceDir}/build/install/${presetName}", + "cacheVariables": { + "CMAKE_BUILD_TYPE": "Release", + "GGML_OPENVINO": true, + "OpenVINO_DIR": "$env{OPENVINO_LLAMA_PATH}/build/Release" + } + }, + { + "name": "ReleaseCPU", + "generator": "Ninja", + "binaryDir": "${sourceDir}/build/${presetName}", + "installDir": "${sourceDir}/build/install/${presetName}", + "cacheVariables": { + "CMAKE_BUILD_TYPE": "Release" + } + }, { "name": "base", "hidden": true, diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 7937d5793a..5feb67d681 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -37,7 +37,6 @@ std::map get_ggml_graph_output_dst(std::shared_ptr Date: Wed, 30 Apr 2025 13:40:43 +0800 Subject: [PATCH 057/254] FEAT: improve debug capability --- ggml/src/ggml-openvino/decoder.h | 6 +++--- ggml/src/ggml-openvino/ggml-decoder.cpp | 21 ++++++++++++++++----- ggml/src/ggml-openvino/ggml-decoder.h | 14 ++++++++------ ggml/src/ggml-openvino/utils.cpp | 15 +++++++++++++-- ggml/src/ggml-openvino/utils.h | 2 ++ 5 files changed, 42 insertions(+), 16 deletions(-) diff --git a/ggml/src/ggml-openvino/decoder.h b/ggml/src/ggml-openvino/decoder.h index c0641e2662..b0775d43aa 100644 --- a/ggml/src/ggml-openvino/decoder.h +++ b/ggml/src/ggml-openvino/decoder.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include 
"openvino/core/node.hpp" #include "openvino/frontend/decoder.hpp" @@ -57,8 +57,8 @@ public: virtual bool check_if_continuous() const = 0; - virtual const std::unordered_map>& get_model_inputs() const = 0; - virtual const std::unordered_map>& get_model_weights() const = 0; + virtual const std::map>& get_model_inputs() const = 0; + virtual const std::map>& get_model_weights() const = 0; virtual const std::vector& get_model_output_names() const = 0; }; diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index d42aaf4664..44b46f2c63 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -8,12 +8,14 @@ #include #include #include +#include #include #include #include #include +#include +#include #include -#include #include "ggml-backend-impl.h" #include "ggml-backend.h" @@ -22,16 +24,24 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgrap : m_cgraph(cgraph), m_node(node), m_op_name(m_node ? std::string(m_node->name) : "NONE_OP") { - static std::unordered_map> model_weights; + static std::map> model_weights; + if (m_node) { set_input_output(m_node, model_weights); } else { + static bool printed = false; + if (!printed && getenv("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS")) { + print_tensor_address_map(m_cgraph); + printed = true; + } + for (int node_n = 0; node_n < m_cgraph->n_nodes; node_n++) { auto* cur_node = m_cgraph->nodes[node_n]; m_nodes.push_back(cur_node); set_input_output(cur_node, model_weights); } m_model_weights = model_weights; + if (getenv("GGML_OPENVINO_DUMP_CGRAPH")) { dump_cgraph(m_cgraph); } @@ -41,7 +51,7 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgrap // Called in GgmlOvDecoder constructor. Two cases: 1. constructing a decoder for the whole graph; // 2. constructing a decoder for a node. void GgmlOvDecoder::set_input_output(ggml_tensor* node, - std::unordered_map>& model_weights) { + std::map>& model_weights) { std::string node_name; if (node->op == GGML_OP_CPY) { // CPY updates the input tensor in place. For later ov op that uses the @@ -100,9 +110,10 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, } if (!m_node) { + static std::set debug_output_names = {}; // Workaround: the final tensor "result_output" does not have GGML_TENSOR_FLAG_OUTPUT flag set in cgraph if (node->buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY || node->flags & GGML_TENSOR_FLAG_OUTPUT || - std::string(node->name).find("result") == 0) { + std::string(node->name).find("result") == 0 || debug_output_names.count(node->name)) { auto name = node->view_src ? 
std::string(node->view_src->name) : std::string(node->name); if (node->buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY) { assert(name.find("cache_k") == 0 || name.find("cache_v") == 0); @@ -249,7 +260,7 @@ void GgmlOvDecoder::dump_cgraph(const struct ggml_cgraph* cgraph) { void print_tensor_address_map(const struct ggml_cgraph* cgraph) { std::map> address_map; - for (int node_n = 0; node_n <= cgraph->n_nodes; node_n++) { + for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) { auto* node = cgraph->nodes[node_n]; if (node->data) { auto it = address_map.find(node->data); diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index a71c5e4e1f..c4f7612d76 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -1,7 +1,7 @@ #pragma once +#include #include -#include #include #include "decoder.h" @@ -76,10 +76,10 @@ public: return m_continuous; } - virtual const std::unordered_map>& get_model_inputs() const override { + virtual const std::map>& get_model_inputs() const override { return m_model_inputs; } - virtual const std::unordered_map>& get_model_weights() const override { + virtual const std::map>& get_model_weights() const override { return m_model_weights; } virtual const std::vector& get_model_output_names() const override { @@ -87,7 +87,7 @@ public: } private: - void set_input_output(ggml_tensor* node, std::unordered_map>& model_weights); + void set_input_output(ggml_tensor* node, std::map>& model_weights); static void dump_cgraph(const struct ggml_cgraph* cgraph); static std::vector get_shape(const ggml_tensor* tensor); static std::vector get_stride(const ggml_tensor* tensor); @@ -105,7 +105,9 @@ private: mutable std::string m_name; bool m_continuous; std::vector> m_op_node_name; - std::unordered_map> m_model_inputs; - std::unordered_map> m_model_weights; + std::map> m_model_inputs; + std::map> m_model_weights; std::vector m_model_output_names; }; + +void print_tensor_address_map(const struct ggml_cgraph* cgraph); diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 5feb67d681..32fa7cf481 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -135,10 +135,12 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c << ", Address: " << output_tensors[output_names[i]] << std::endl; switch (output_tensor.get_element_type()) { case ov::element::f32: - std::cout << *(float*)(output_tensors[output_names[i]]) << std::endl; + std::cout << *(float*)(output_tensor.data()) << std::endl; + std::cout << checksum(output_tensor.data(), output_tensor.get_byte_size()) << std::endl; break; case ov::element::f16: - std::cout << ov::float16::from_bits(*(uint16_t*)(output_tensors[output_names[i]])) << std::endl; + std::cout << ov::float16::from_bits(*(uint16_t*)(output_tensor.data())) << std::endl; + std::cout << checksum(output_tensor.data(), output_tensor.get_byte_size()) << std::endl; break; default: break; @@ -161,3 +163,12 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c return GGML_STATUS_SUCCESS; GGML_UNUSED(backend); } + +size_t checksum(const void* data, size_t size) { + const uint8_t* bytes = static_cast(data); + size_t sum = 0; + for (size_t i = 0; i < size; ++i) { + sum += bytes[i]; + } + return sum; +} diff --git a/ggml/src/ggml-openvino/utils.h b/ggml/src/ggml-openvino/utils.h index b4174c9f21..4458e71f54 100644 --- a/ggml/src/ggml-openvino/utils.h +++ b/ggml/src/ggml-openvino/utils.h @@ 
-2,3 +2,5 @@ #include "ggml-backend-impl.h" enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_cgraph* cgraph); + +size_t checksum(const void* data, size_t size); From a8e5efa44e60faf9b08f3e0826ffc29da4746a5e Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Thu, 8 May 2025 16:07:14 +0800 Subject: [PATCH 058/254] PERF: compile once (dynamic graph + cache) --- ggml/src/ggml-openvino/decoder.h | 1 + ggml/src/ggml-openvino/ggml-decoder.cpp | 67 +++++++++- ggml/src/ggml-openvino/ggml-decoder.h | 13 ++ ggml/src/ggml-openvino/utils.cpp | 157 ++++++++++++++---------- ggml/src/ggml-openvino/utils.h | 6 + 5 files changed, 181 insertions(+), 63 deletions(-) diff --git a/ggml/src/ggml-openvino/decoder.h b/ggml/src/ggml-openvino/decoder.h index b0775d43aa..790ed2e88d 100644 --- a/ggml/src/ggml-openvino/decoder.h +++ b/ggml/src/ggml-openvino/decoder.h @@ -58,6 +58,7 @@ public: virtual bool check_if_continuous() const = 0; virtual const std::map>& get_model_inputs() const = 0; + virtual const std::map>& get_model_extra_inputs() const = 0; virtual const std::map>& get_model_weights() const = 0; virtual const std::vector& get_model_output_names() const = 0; }; diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 44b46f2c63..372f880b1d 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -10,9 +10,11 @@ #include #include #include +#include #include #include #include +#include #include #include #include @@ -35,6 +37,7 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgrap printed = true; } + set_max_token_len(); for (int node_n = 0; node_n < m_cgraph->n_nodes; node_n++) { auto* cur_node = m_cgraph->nodes[node_n]; m_nodes.push_back(cur_node); @@ -42,6 +45,8 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgrap } m_model_weights = model_weights; + add_extra_inputs(); + if (getenv("GGML_OPENVINO_DUMP_CGRAPH")) { dump_cgraph(m_cgraph); } @@ -102,7 +107,16 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, if (m_model_inputs.find(src_name) != m_model_inputs.end()) { continue; } - auto param_node = std::make_shared(get_ov_type(src), ov::Shape{get_shape(src)}); + ov::PartialShape input_shape; + if (std::string(src->name) == "inp_tokens" || std::string(src->name) == "inp_pos") { + input_shape = ov::PartialShape{1, 1, ov::Dimension(1, m_max_token_len)}; + } else if (std::string(src->name).find("KQ_mask") == 0) { + input_shape = + ov::PartialShape{1, ov::Dimension(1, m_max_token_len), ov::Dimension(1, m_max_token_len)}; + } else { + input_shape = ov::Shape{get_shape(src)}; + } + auto param_node = std::make_shared(get_ov_type(src), input_shape); param_node->set_friendly_name(src_name); m_model_inputs[src_name] = param_node; } @@ -146,6 +160,57 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, } } +void GgmlOvDecoder::set_max_token_len() { + for (int i = 0; i < m_cgraph->n_nodes; i++) { + auto* node = m_cgraph->nodes[i]; + if (std::string(node->name) == "v-0") { + auto* cache_v = node->src[0]; + m_max_token_len = cache_v->ne[0] / node->ne[1] / node->ne[2]; + break; + } + } +} + +void GgmlOvDecoder::add_extra_inputs() { + int64_t past_token_len; + int64_t attention_size; + + for (const auto& node : m_nodes) { + if (node->op == GGML_OP_CPY && ggml_is_contiguous(node)) { + assert(std::string(node->view_src->name).find("cache_k") == 0); + int64_t head_size = node->src[0]->ne[0]; + int64_t num_heads = node->src[0]->ne[1]; + 
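+            // Note: node->src[1] is the view into cache_k; for a GGML view, op_params[0] holds the view's byte offset, so dividing it by the element size (nb[0]) and by the per-token element count (head_size * num_heads) yields the number of tokens already stored in the cache.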
past_token_len = (int64_t)(node->src[1]->op_params[0] / node->src[1]->nb[0] / head_size / num_heads); + + std::string name = "past_token_len"; + auto param_node = std::make_shared(ov::element::i64, ov::Shape{}); + param_node->set_friendly_name(name); + m_model_extra_inputs[name] = param_node; + + auto tensor = std::make_shared(ov::element::i64, ov::Shape{}); + *tensor->data() = past_token_len; + m_model_extra_input_values[name] = tensor; + break; + } + } + for (const auto& node : m_nodes) { + if (node->src[1] && std::string(node->src[1]->name).find("inp_tokens") == 0) { + int64_t total_token_len = node->src[1]->ne[0] + past_token_len; + attention_size = (total_token_len + 31) / 32 * 32; + + std::string name = "attention_size"; + auto param_node = std::make_shared(ov::element::i64, ov::Shape{1}); + param_node->set_friendly_name(name); + m_model_extra_inputs[name] = param_node; + + auto tensor = std::make_shared(ov::element::i64, ov::Shape{1}); + *tensor->data() = attention_size; + m_model_extra_input_values[name] = tensor; + break; + } + } +} + std::shared_ptr GgmlOvDecoder::create_weight_node(ggml_tensor* tensor) { std::shared_ptr weight_node; auto node_type = get_ov_type(tensor); diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index c4f7612d76..22ff9d85f7 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -1,5 +1,6 @@ #pragma once +#include #include #include #include @@ -79,6 +80,12 @@ public: virtual const std::map>& get_model_inputs() const override { return m_model_inputs; } + virtual const std::map>& get_model_extra_inputs() const override { + return m_model_extra_inputs; + } + virtual const std::map>& get_model_extra_input_values() const { + return m_model_extra_input_values; + } virtual const std::map>& get_model_weights() const override { return m_model_weights; } @@ -88,12 +95,16 @@ public: private: void set_input_output(ggml_tensor* node, std::map>& model_weights); + void add_extra_inputs(); static void dump_cgraph(const struct ggml_cgraph* cgraph); static std::vector get_shape(const ggml_tensor* tensor); static std::vector get_stride(const ggml_tensor* tensor); static ov::element::Type get_ov_type(const ggml_tensor* tensor); static std::shared_ptr create_weight_node(ggml_tensor* tensor); + void set_max_token_len(); + int64_t m_max_token_len; + struct ggml_cgraph * m_cgraph; std::map m_inputs; std::vector m_input_names; @@ -106,6 +117,8 @@ private: bool m_continuous; std::vector> m_op_node_name; std::map> m_model_inputs; + std::map> m_model_extra_inputs; + std::map> m_model_extra_input_values; std::map> m_model_weights; std::vector m_model_output_names; }; diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 32fa7cf481..6166161c41 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -3,10 +3,14 @@ #include #include #include +#include #include #include #include #include +#include +#include +#include #include "ggml-impl.h" #include "ggml.h" @@ -63,61 +67,65 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c return GGML_STATUS_FAILED; } + using CachedItem = std::pair, ov::CompiledModel>; + static std::unordered_map compiled_cache; + + std::shared_ptr model; + ov::CompiledModel compiled_model; + int64_t conversion_end_time; + int64_t compile_end_time; + auto ggml_decoder = get_ggml_decoder(cgraph); - std::shared_ptr graph_decoder = ggml_decoder; - ov::frontend::InputModel::Ptr input_model = 
front_end->load(graph_decoder); - if (!input_model) { - GGML_LOG_ERROR("Input Model is not loaded \n"); - return GGML_STATUS_FAILED; + auto it = compiled_cache.find(cgraph); + if (it != compiled_cache.end()) { + model = it->second.first; + conversion_end_time = ggml_time_us(); + + compiled_model = it->second.second; + compile_end_time = ggml_time_us(); + } else { + std::shared_ptr graph_decoder = ggml_decoder; + ov::frontend::InputModel::Ptr input_model = front_end->load(graph_decoder); + if (!input_model) { + GGML_LOG_ERROR("Input Model is not loaded \n"); + return GGML_STATUS_FAILED; + } + + model = front_end->convert(input_model); + conversion_end_time = ggml_time_us(); + + if (getenv("GGML_OPENVINO_DUMP_IR")) { + char timestamped_filename[64]; + auto timestamp = (long long)ggml_time_us(); + snprintf(timestamped_filename, sizeof(timestamped_filename), "model_%lld.xml", timestamp); + ov::serialize(model, timestamped_filename); + } + + if (!model) { + GGML_LOG_ERROR("Model is not converted \n"); + } + compiled_model = core.compile_model(model, "CPU"); + compile_end_time = ggml_time_us(); + + compiled_cache[cgraph] = std::make_pair(model, compiled_model); } - std::shared_ptr model = front_end->convert(input_model); - auto conversion_end_time = ggml_time_us(); - - if (getenv("GGML_OPENVINO_DUMP_IR")) { - char timestamped_filename[64]; - auto timestamp = (long long)ggml_time_us(); - snprintf(timestamped_filename, sizeof(timestamped_filename), "model_%lld.xml", timestamp); - ov::serialize(model, timestamped_filename); - } - - if (!model) { - GGML_LOG_ERROR("Model is not converted \n"); - } - - ov::CompiledModel compiled_model = core.compile_model(model, "CPU"); - auto compile_end_time = ggml_time_us(); - ov::InferRequest infer_request = compiled_model.create_infer_request(); - auto infer_request_start_time = ggml_time_us(); - auto input_names = ggml_decoder->get_input_names(); auto ov_params = model->get_parameters(); for (size_t i = 0; i < ov_params.size(); i++) { auto param_name = ov_params[i]->get_friendly_name(); - auto input_tensor = get_ggml_graph_input_tensor(ggml_decoder, param_name); - - if (getenv("GGML_OPENVINO_DEBUG_INPUT")) { - std::cout << "Input name: " << param_name << ", Input shape: " << input_tensor.get_shape() - << ", Address: " << input_tensor.data() << std::endl; - switch (input_tensor.get_element_type()) { - case ov::element::f32: - std::cout << *(float*)(input_tensor.data()) << std::endl; - break; - case ov::element::f16: - std::cout << ov::float16::from_bits(*(uint16_t*)(input_tensor.data())) << std::endl; - break; - case ov::element::i32: - std::cout << *(int32_t*)(input_tensor.data()) << std::endl; - break; - case ov::element::i64: - std::cout << *(int64_t*)(input_tensor.data()) << std::endl; - break; - default: - break; - } + ov::Tensor input_tensor; + if (ggml_decoder->get_model_extra_inputs().find(param_name) != ggml_decoder->get_model_extra_inputs().end()) { + input_tensor = *ggml_decoder->get_model_extra_input_values().at(param_name); + } else { + input_tensor = get_ggml_graph_input_tensor(ggml_decoder, param_name); } infer_request.set_input_tensor(i, input_tensor); + + if (getenv("GGML_OPENVINO_DEBUG_INPUT")) { + print_input_tensor_info(param_name, input_tensor); + } } auto input_end_time = ggml_time_us(); @@ -131,20 +139,7 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c std::memcpy(output_tensors[output_names[i]], output_tensor.data(), output_tensor.get_byte_size()); if (getenv("GGML_OPENVINO_DEBUG_OUTPUT")) { - std::cout 
<< "Output name: " << output_names[i] << ", Output shape: " << output_tensor.get_shape() - << ", Address: " << output_tensors[output_names[i]] << std::endl; - switch (output_tensor.get_element_type()) { - case ov::element::f32: - std::cout << *(float*)(output_tensor.data()) << std::endl; - std::cout << checksum(output_tensor.data(), output_tensor.get_byte_size()) << std::endl; - break; - case ov::element::f16: - std::cout << ov::float16::from_bits(*(uint16_t*)(output_tensor.data())) << std::endl; - std::cout << checksum(output_tensor.data(), output_tensor.get_byte_size()) << std::endl; - break; - default: - break; - } + print_output_tensor_info(output_names[i], output_tensor, output_tensors); } } auto end_time = ggml_time_us(); @@ -153,9 +148,7 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c GGML_LOG_INFO("GGML OpenVINO Backend: \n"); GGML_LOG_INFO(" - Graph conversion Time: %ld ms \n", (conversion_end_time - start_time) / 1000); GGML_LOG_INFO(" - Graph compile Time: %ld ms \n", (compile_end_time - conversion_end_time) / 1000); - GGML_LOG_INFO(" - Graph InferRequest created Time: %ld ms \n", - (infer_request_start_time - compile_end_time) / 1000); - GGML_LOG_INFO(" - Graph Input Time: %ld ms \n", (input_end_time - infer_request_start_time) / 1000); + GGML_LOG_INFO(" - Graph Input Time: %ld ms \n", (input_end_time - compile_end_time) / 1000); GGML_LOG_INFO(" - Graph Inference Time: %ld ms \n", (infer_end_time - input_end_time) / 1000); GGML_LOG_INFO(" - Graph Output Time: %ld ms \n", (end_time - infer_end_time) / 1000); } @@ -172,3 +165,43 @@ size_t checksum(const void* data, size_t size) { } return sum; } + +void print_input_tensor_info(const std::string& name, const ov::Tensor& tensor) { + std::cout << "Input name: " << name << ", Input shape: " << tensor.get_shape() << ", Address: " << tensor.data() + << std::endl; + switch (tensor.get_element_type()) { + case ov::element::f32: + std::cout << *(float*)(tensor.data()) << std::endl; + break; + case ov::element::f16: + std::cout << ov::float16::from_bits(*(uint16_t*)(tensor.data())) << std::endl; + break; + case ov::element::i32: + std::cout << *(int32_t*)(tensor.data()) << std::endl; + break; + case ov::element::i64: + std::cout << *(int64_t*)(tensor.data()) << std::endl; + break; + default: + break; + } +} + +void print_output_tensor_info(const std::string& name, + const ov::Tensor& tensor, + std::map& output_dst) { + std::cout << "Output name: " << name << ", Output shape: " << tensor.get_shape() + << ", Address: " << output_dst[name] << std::endl; + switch (tensor.get_element_type()) { + case ov::element::f32: + std::cout << *(float*)(tensor.data()) << std::endl; + std::cout << checksum(tensor.data(), tensor.get_byte_size()) << std::endl; + break; + case ov::element::f16: + std::cout << ov::float16::from_bits(*(uint16_t*)(tensor.data())) << std::endl; + std::cout << checksum(tensor.data(), tensor.get_byte_size()) << std::endl; + break; + default: + break; + } +} diff --git a/ggml/src/ggml-openvino/utils.h b/ggml/src/ggml-openvino/utils.h index 4458e71f54..96b07008ec 100644 --- a/ggml/src/ggml-openvino/utils.h +++ b/ggml/src/ggml-openvino/utils.h @@ -4,3 +4,9 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_cgraph* cgraph); size_t checksum(const void* data, size_t size); + +void print_input_tensor_info(const std::string& name, const ov::Tensor& tensor); + +void print_output_tensor_info(const std::string& name, + const ov::Tensor& tensor, + std::map& output_dst); From 
ffabe95e2a9ca4c73a718e31ff2c085eea967554 Mon Sep 17 00:00:00 2001 From: Viraj Wadhwa Date: Fri, 9 May 2025 11:37:10 -0700 Subject: [PATCH 059/254] Rebase - Bring up to date and fix build process --- docs/build.md | 61 ++ ggml/CMakeLists.txt | 5 + ggml/src/CMakeLists.txt | 1 + ggml/src/ggml-backend-reg.cpp | 7 + ggml/src/ggml-openvino.cpp | 1074 +---------------------- ggml/src/ggml-openvino/CMakeLists.txt | 42 + ggml/src/ggml-openvino/decoder.h | 13 +- ggml/src/ggml-openvino/ggml-decoder.cpp | 38 +- ggml/src/ggml-openvino/ggml-decoder.h | 14 +- ggml/src/ggml-openvino/utils.cpp | 9 +- ggml/src/ggml-openvino/utils.h | 4 +- 11 files changed, 152 insertions(+), 1116 deletions(-) create mode 100644 ggml/src/ggml-openvino/CMakeLists.txt diff --git a/docs/build.md b/docs/build.md index fce9361b2d..3079a91211 100644 --- a/docs/build.md +++ b/docs/build.md @@ -681,6 +681,67 @@ Follow the instructions [here](https://dawn.googlesource.com/dawn/+/refs/heads/m To read documentation for how to build on IBM Z & LinuxONE, [click here](./build-s390x.md) +## OPENVINO + +### Build openvino-llama + + ```bash + git lfs install --skip-smudge + git clone https://github.com/intel-sandbox/openvino-llama.git -b dev_ggml_frontend + cd openvino-llama + git submodule update --init --recursive + + export OPENVINO_LLAMA_PATH=$(pwd) + + cmake --preset Release + cmake --build build/Release + ``` + +### Build llama.cpp-ov + + ```bash + git clone https://github.com/intel-sandbox/llama.cpp-ov.git -b dev_backend_openvino + cd llama.cpp-ov + + cmake --preset ReleaseOV + cmake --build build/ReleaseOV + ``` + +Download the test model file [Phi-3-mini-4k-instruct-fp16.gguf](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf) from the Hugging Face website. + ```bash + wget https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf/resolve/main/Phi-3-mini-4k-instruct-fp16.gguf?download=true -O ~/models/Phi-3-mini-4k-instruct-gguf/Phi-3-mini-4k-instruct-fp16.gguf + ``` + +Execute the following command to run a quick test: + ```bash + export GGML_OPENVINO_CACHE_DIR=/tmp/ov_cache + # Currently GGML_OPENVINO_WEIGHT_AS_INPUT has better performance + export GGML_OPENVINO_WEIGHT_AS_INPUT=1 + ./build/ReleaseOV/bin/llama-simple -m ~/models/Phi-3-mini-4k-instruct-gguf/Phi-3-mini-4k-instruct-fp16.gguf -n 10 "Hello, my name is " + ``` + +Environment variables: +- GGML_OPENVINO_WEIGHT_AS_INPUT: + Pass the weights as inputs to the OpenVINO model instead of creating Constant nodes for them. +- GGML_OPENVINO_CACHE_DIR: + If set, model caching in OpenVINO will be used. +- GGML_OPENVINO_DUMP_CGRAPH: + Dump the compute graph to "cgraph.txt". Note that the compute graph is different for every token, so each later cgraph overwrites the previous one. +- GGML_OPENVINO_PROFILING: + Print the time taken for each phase in the OpenVINO backend. +- GGML_OPENVINO_DUMP_IR: + Dump the converted OpenVINO IR. The filenames are timestamps. +- GGML_OPENVINO_DEBUG_INPUT: + Print the name, shape, address and first value of each model input tensor. +- GGML_OPENVINO_DEBUG_OUTPUT: + Print the name, shape, address, first value and checksum of each model output tensor. + +To use Llama.cpp's built-in CPU backend: +```bash +cmake --preset ReleaseCPU +cmake --build build/ReleaseCPU + +./build/ReleaseCPU/bin/llama-simple -m ~/models/Phi-3-mini-4k-instruct-gguf/Phi-3-mini-4k-instruct-fp16.gguf -n 10 "Hello, my name is " +``` + ## Notes about GPU-accelerated backends The GPU may still be used to accelerate some parts of the computation even when using the `-ngl 0` option. You can fully disable GPU acceleration by using `--device none`.
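For reference, the debugging variables above can be combined in a single run. A minimal sketch, assuming the model downloaded in the build steps above (every switch is optional and independent):

```bash
# Illustrative combined debug run; all variables used here are documented above.
export GGML_OPENVINO_CACHE_DIR=/tmp/ov_cache   # enable OpenVINO model caching
export GGML_OPENVINO_WEIGHT_AS_INPUT=1         # pass weights as model inputs
export GGML_OPENVINO_DUMP_CGRAPH=1             # write cgraph.txt (overwritten per token)
export GGML_OPENVINO_DUMP_IR=1                 # dump timestamped IR files
export GGML_OPENVINO_PROFILING=1               # print per-phase timings
export GGML_OPENVINO_DEBUG_INPUT=1             # print input tensor info
export GGML_OPENVINO_DEBUG_OUTPUT=1            # print output tensor info and checksums
./build/ReleaseOV/bin/llama-simple \
    -m ~/models/Phi-3-mini-4k-instruct-gguf/Phi-3-mini-4k-instruct-fp16.gguf \
    -n 10 "Hello, my name is "
```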
diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index 0176ca1ce9..2fa05ab90c 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -246,6 +246,10 @@ set (GGML_SYCL_TARGET "INTEL" CACHE STRING set (GGML_SYCL_DEVICE_ARCH "" CACHE STRING "ggml: sycl device architecture") +option(GGML_OPENVINO "ggml: use OPENVINO" OFF) +option(GGML_OPENVINO_DEBUG "ggml: enable OPENVINO debugging" OFF) +option(GGML_OV_FRONTEND "ggml: OPENVINO frontend path" ON) + option(GGML_OPENCL "ggml: use OpenCL" OFF) option(GGML_OPENCL_PROFILING "ggml: use OpenCL profiling (increases overhead)" OFF) option(GGML_OPENCL_EMBED_KERNELS "ggml: embed kernels" ON) @@ -324,6 +328,7 @@ set(GGML_PUBLIC_HEADERS include/ggml-vulkan.h include/ggml-webgpu.h include/ggml-zendnn.h + include/ggml-openvino.h include/gguf.h) set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}") diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index 6192a87046..1758050bae 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -458,6 +458,7 @@ ggml_add_backend(zDNN) ggml_add_backend(OpenCL) ggml_add_backend(Hexagon) ggml_add_backend(ZenDNN) +ggml_add_backend(OPENVINO) foreach (target ggml-base ggml) target_include_directories(${target} PUBLIC $ $) diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp index 4181a714ad..1f8ae17363 100644 --- a/ggml/src/ggml-backend-reg.cpp +++ b/ggml/src/ggml-backend-reg.cpp @@ -77,6 +77,10 @@ #include "ggml-zendnn.h" #endif +#ifdef GGML_USE_OPENVINO +#include "ggml-openvino.h" +#endif + // disable C++17 deprecation warning for std::codecvt_utf8 #if defined(__clang__) # pragma clang diagnostic push @@ -222,6 +226,9 @@ struct ggml_backend_registry { #ifdef GGML_USE_RPC register_backend(ggml_backend_rpc_reg()); #endif +#ifdef GGML_USE_OPENVINO + register_backend(ggml_backend_openvino_reg()); +#endif #ifdef GGML_USE_CPU register_backend(ggml_backend_cpu_reg()); #endif diff --git a/ggml/src/ggml-openvino.cpp b/ggml/src/ggml-openvino.cpp index 5221a1ff8b..f5d5c7ed67 100644 --- a/ggml/src/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino.cpp @@ -55,1023 +55,8 @@ static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backend, struct ggml_cgraph *cgraph) { openvino_frontend_compute(backend, cgraph); - ov::Core core; - - // set the shape and stride of dst - dst->ne[0] = src0->ne[0]; - dst->ne[1] = src0->ne[1]; - dst->nb[0] = src0->nb[0]; - dst->nb[1] = src0->nb[1]; - - if (src0 == nullptr || src1 == nullptr) { - std::cerr << "Error: src0 or src1 is null." << std::endl; - return; - } - - // Step 2: Check that the input tensor types and shapes match - if (src0->type != GGML_TYPE_F32 || src1->type != GGML_TYPE_F32) { - std::cerr << "Error: Unsupported tensor type. Only GGML_TYPE_F32 is supported for OpenVINO." << std::endl; - return; - } - if (src0->ne[0] != src1->ne[0] || src0->ne[1] != src1->ne[1]) { - std::cerr << "Error: src0 and src1 shapes do not match." 
<< std::endl; - return; - } - - ov::Tensor input0 = ov::Tensor(ov::element::f32, {static_cast(src0->ne[0]), static_cast(src0->ne[1])}, src0->data); - ov::Tensor input1 = ov::Tensor(ov::element::f32, {static_cast(src1->ne[0]), static_cast(src1->ne[1])}, src1->data); - - auto input0_param = std::make_shared(ov::element::f32, ov::Shape{static_cast(src0->ne[0]), static_cast(src0->ne[1])}); - auto input1_param = std::make_shared(ov::element::f32, ov::Shape{static_cast(src0->ne[0]), static_cast(src0->ne[1])}); - auto add = std::make_shared(input0_param, input1_param); - auto model = std::make_shared(add, ov::ParameterVector{input0_param, input1_param}); - - // compile model and store in context -#ifdef GGML_OPENVINO_GPU - auto compiled_model = core.compile_model(model, "GPU"); -#elif GGML_OPENVINO_NPU - auto compiled_model = core.compile_model(model, "NPU"); -#else - auto compiled_model = core.compile_model(model, "CPU"); -#endif - // initialize infer request - auto infer_request = compiled_model.create_infer_request(); - - // Step 4: set input data, copy src0 and src1 data to OpenVINO input tensors - infer_request.set_tensor(input0_param, input0); - infer_request.set_tensor(input1_param, input1); - - // Step 5: execute inference - infer_request.infer(); - - // Step 6: get output data - ov::Tensor output = infer_request.get_tensor(compiled_model.output()); - - // // Allocate memory for dst->data if not already allocated - // if (dst->data == nullptr) { - // dst->data = malloc(dst->nb[0] * dst->ne[0]); - // if (dst->data == nullptr) { - // std::cerr << "Error: Failed to allocate memory for dst->data." << std::endl; - // return; - // } - // } - - std::memcpy(dst->data, output.data(), output.get_byte_size()); - - if (dst->ne[0] != src0->ne[0] || dst->ne[1] != src0->ne[1]) { - std::cerr << "Error: dst tensor shape does not match input tensor shape." << std::endl; - return; - } - - // float* dst_data1 = (float*)(dst->data); - // printf("Output data:");; - // for (int i = 0; i < (10 < (int)(dst->ne[0]) ? 
10 : (int)(dst->ne[0])); ++i) { - // printf("%f ", dst_data1[i]); - // } - // printf("\n"); - // fflush(stdout); -} - -static void ggml_backend_openvino_mul_forward(ggml_tensor * dst) { - struct ggml_tensor *src0 = dst->src[0]; - struct ggml_tensor *src1 = dst->src[1]; - - ov::Core core; - - // define shape - ov::Shape shape0 = {static_cast(src0->ne[1]), static_cast(src0->ne[0])}; // For Example: [7, 3072] - ov::Shape shape1 = {static_cast(src1->ne[1]), static_cast(src1->ne[0])}; // For Example: [1, 3072] -> broadcast to [7, 3072] - - // create OpenVINO tensor (src0 and src1) - ov::Tensor tensor0(ov::element::f32, shape0, src0->data); - ov::Tensor tensor1(ov::element::f32, shape1, src1->data); - - // define input parameters - auto input0 = std::make_shared(ov::element::f32, shape0); - auto input1 = std::make_shared(ov::element::f32, shape1); - - // create a multiply operation using broadcasting - auto multiply = std::make_shared(input0, input1); - - // create model - auto model = std::make_shared(multiply, ov::ParameterVector{input0, input1}); - // compile model and store in context -#ifdef GGML_OPENVINO_GPU - ov::CompiledModel compiled_model = core.compile_model(model, "GPU"); -#elif GGML_OPENVINO_NPU - ov::CompiledModel compiled_model = core.compile_model(model, "NPU"); -#else - ov::CompiledModel compiled_model = core.compile_model(model, "CPU"); -#endif - - ov::InferRequest infer_request = compiled_model.create_infer_request(); - infer_request.set_tensor(input0, tensor0); - infer_request.set_tensor(input1, tensor1); - - infer_request.infer(); - - // get output tensor and copy it back to dst->data - ov::Tensor output_tensor = infer_request.get_output_tensor(); - std::memcpy(dst->data, output_tensor.data(), src0->ne[0] * src0->ne[1] * sizeof(float)); -} - -static void ggml_backend_openvino_add(ggml_tensor * dst) { - // Placeholder for OpenVINO add operation - // GGML_ASSERT(ctx.device != 0); - GGML_ASSERT(dst->data != nullptr); - - const struct ggml_tensor * src0 = dst->src[0]; - const struct ggml_tensor * src1 = dst->src[1]; - - switch (src0->type) { - case GGML_TYPE_F16: - { - if (src1->type == GGML_TYPE_F16) { - // ggml_backend_openvino_add_forward(ctx, dst, src0, src1); - } else if (src1->type == GGML_TYPE_F32) { - // ggml_compute_forward_add_f16_f32(params, dst); - } else { - GGML_ABORT("fatal error"); - } - } break; - case GGML_TYPE_F32: - { - if (src1->type == GGML_TYPE_F32) { - { - ggml_backend_openvino_add_forward(dst); - } - } - else { - GGML_ABORT("fatal error"); - } - } break; - default: - GGML_ABORT("%s: unsupported type %d\n", __func__, src1->type); - } - -} - -static void ggml_backend_openvino_mul(ggml_tensor * dst) { - GGML_ASSERT(dst->data != nullptr); - - const struct ggml_tensor * src0 = dst->src[0]; - const struct ggml_tensor * src1 = dst->src[1]; - - GGML_ASSERT(src1->type == GGML_TYPE_F32 && "only f32 src1 supported for now"); - - switch (src0->type) { - case GGML_TYPE_F32: - { - ggml_backend_openvino_mul_forward(dst); - } break; - default: - { - GGML_ABORT("fatal error"); - } - } -} - -void ggml_compute_forward_get_rows_f16(struct ggml_tensor *dst) { - const struct ggml_tensor *src0 = dst->src[0]; - const struct ggml_tensor *src1 = dst->src[1]; - - ov::Core core; - - ov::Shape shape0 = {static_cast(src0->ne[1]), static_cast(src0->ne[0])}; // [3072, 7] - ov::Shape shape1 = {static_cast(src1->ne[0])}; // [7] - - ov::Tensor tensor0(ov::element::f16, shape0, src0->data); - ov::Tensor tensor1(ov::element::i32, shape1, src1->data); - - auto input0 = 
std::make_shared(ov::element::f16, shape0); - auto input1 = std::make_shared(ov::element::i32, shape1); - - auto gather = std::make_shared(input0, input1, ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {0})); - - auto model = std::make_shared(gather, ov::ParameterVector{input0, input1}); - ov::CompiledModel compiled_model = core.compile_model(model, "CPU"); - - ov::InferRequest infer_request = compiled_model.create_infer_request(); - infer_request.set_tensor(input0, tensor0); - infer_request.set_tensor(input1, tensor1); - - infer_request.infer(); - - ov::Tensor output_tensor = infer_request.get_output_tensor(); - // Convert output tensor data type from f16 to f32 - ov::Tensor output_tensor_f32 = ov::Tensor(ov::element::f32, output_tensor.get_shape()); - for (size_t i = 0; i < output_tensor.get_size(); ++i) { - output_tensor_f32.data()[i] = static_cast(output_tensor.data()[i]); - } - - // Copy the converted data to dst->data - std::memcpy(dst->data, output_tensor_f32.data(), output_tensor_f32.get_byte_size()); -} - -void ggml_compute_forward_get_rows_f32(struct ggml_tensor *dst) { - const struct ggml_tensor *src0 = dst->src[0]; - const struct ggml_tensor *src1 = dst->src[1]; - - ov::Core core; - - ov::Shape shape0 = {static_cast(src0->ne[1]), static_cast(src0->ne[0])}; // [3072, 7] - ov::Shape shape1 = {static_cast(src1->ne[0])}; // [7] - - ov::Tensor tensor0(ov::element::f32, shape0, src0->data); - ov::Tensor tensor1(ov::element::i32, shape1, src1->data); - - auto input0 = std::make_shared(ov::element::f32, shape0); - auto input1 = std::make_shared(ov::element::i32, shape1); - - auto gather = std::make_shared(input0, input1, ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {0})); - - auto model = std::make_shared(gather, ov::ParameterVector{input0, input1}); - ov::CompiledModel compiled_model = core.compile_model(model, "CPU"); - - ov::InferRequest infer_request = compiled_model.create_infer_request(); - infer_request.set_tensor(input0, tensor0); - infer_request.set_tensor(input1, tensor1); - - infer_request.infer(); - - ov::Tensor output_tensor = infer_request.get_output_tensor(); - - // Copy the converted data to dst->data - std::memcpy(dst->data, output_tensor.data(), output_tensor.get_byte_size()); -} - -void ggml_compute_forward_get_rows(struct ggml_tensor *dst) { - const struct ggml_tensor *src0 = dst->src[0]; - const struct ggml_tensor *src1 = dst->src[1]; - - switch (src0->type) { - case GGML_TYPE_F16: - { - ggml_compute_forward_get_rows_f16(dst); - } break; - case GGML_TYPE_F32: - { - ggml_compute_forward_get_rows_f32(dst); - } break; - default: - { - GGML_ABORT("fatal error"); - } - } - -} - -void ggml_backend_openvino_rms_norm_f32(ggml_tensor *dst) { - const struct ggml_tensor *src0 = dst->src[0]; - assert(src0 != nullptr); - - GGML_ASSERT(ggml_are_same_shape(src0, dst)); - GGML_ASSERT(src0->nb[0] == sizeof(float)); - - const int64_t ne0 = src0->ne[0]; - const int64_t ne1 = src0->ne[1]; - const int64_t ne2 = src0->ne[2]; - - const size_t input_size = ne0 * ne1 * ne2; - - const float *src_data = static_cast(src0->data); - float *dst_data = static_cast(dst->data); - assert(dst_data != nullptr); - - ov::Core core; - - ov::Shape input_shape = {static_cast(ne2), static_cast(ne1), static_cast(ne0)}; - ov::Tensor input_tensor(ov::element::f32, input_shape, const_cast(src_data)); - - auto input_param = std::make_shared( - input_tensor.get_element_type(), - input_tensor.get_shape() - ); - assert(input_param != nullptr && "Input parameter creation failed!"); - - 
auto square = std::make_shared(input_param, input_param); - auto reduce_sum = std::make_shared( - square, - ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {2}), - true - ); - - auto mean = std::make_shared( - reduce_sum, - ov::op::v0::Constant::create(ov::element::f32, ov::Shape{}, {static_cast(ne0)}) - ); - - float eps; - memcpy(&eps, dst->op_params, sizeof(float)); - auto rms = std::make_shared( - std::make_shared( - mean, - ov::op::v0::Constant::create(ov::element::f32, ov::Shape{}, {eps}) - ) - ); - - auto scale = std::make_shared( - ov::op::v0::Constant::create(ov::element::f32, ov::Shape{}, {1.0f}), - rms - ); - - auto normalized_input = std::make_shared(input_param, scale); - - ov::ParameterVector parameters = {input_param}; - auto model = std::make_shared(ov::NodeVector{normalized_input}, parameters); - - // static bool model_saved = false; - // if (!model_saved) { - // std::cout << "\n rms model saved" << std::endl; - // ov::save_model(model, "//rms_norm_model.xml"); - // model_saved = true; - // } - - auto compiled_model = core.compile_model(model, "CPU"); - - auto infer_request = compiled_model.create_infer_request(); - - infer_request.set_input_tensor(0, input_tensor); - - infer_request.infer(); - - auto output_tensor = infer_request.get_output_tensor(); - assert(output_tensor.get_size() == input_size); - - std::memcpy(dst_data, output_tensor.data(), input_size * sizeof(float)); -} - -void ggml_backend_openvino_rms_norm(ggml_tensor * dst) { - const struct ggml_tensor * src0 = dst->src[0]; - switch (src0->type) { - case GGML_TYPE_F32: - { - ggml_backend_openvino_rms_norm_f32(dst); - } break; - default: - { - GGML_ABORT("fatal error"); - } - } -} - -static void ggml_backend_openvino_permute(const struct ggml_tensor * dst) { - // NOP - GGML_UNUSED(dst); -} - -// Extracting valid shapes -std::vector get_effective_shape(const ggml_tensor * t) { - std::vector shape; - for (int i = 2; i >= 0; i--) { - if (t->ne[i] != 1 || t->ne[2] != 1) - shape.push_back(t->ne[i]); - } - return shape; -} - -/* -* Construct an index vector for Gather to extract non-contiguous data. 
-* Parameters: -* - valid_cols: number of valid columns per row (e.g., for src0, valid columns = 96) -* - num_rows: number of rows in each batch (e.g., src0: 32 rows per batch) -* - batch: number of batches (e.g., 32) -* - row_stride: physical row length (in elements), e.g., src0: nb[1]/(element_size) = 6144/2 = 3072 -* - batch_stride: physical batch stride (in elements), e.g., src0: nb[2]/(element_size) = 192/2 = 96 -*/ -std::vector build_indices(int valid_cols, int num_rows, int batch, int row_stride, int batch_stride) { - std::vector indices; - indices.reserve(valid_cols * num_rows * batch); - for (int b = 0; b < batch; b++) { - for (int r = 0; r < num_rows; r++) { - for (int c = 0; c < valid_cols; c++) { - // physical index = b * batch_stride + r * row_stride + c - indices.push_back(b * batch_stride + r * row_stride + c); - } - } - } - return indices; -} - -void ggml_backend_openvino_mul_mat(struct ggml_tensor * dst) { - assert(dst && dst->src[0] && dst->src[1]); - const ggml_tensor * src0 = dst->src[0]; // src0 type F16 - const ggml_tensor * src1 = dst->src[1]; // src1 type F32 - - if(!ggml_is_contiguous(src1) || dst->src[1]->ne[0] * dst->src[1]->nb[0] != dst->src[1]->nb[1]) { - int valid_cols_src0 = src0->ne[0]; // 96 - int num_rows_src0 = src0->ne[1]; // 32 - int batch_src0 = src0->ne[2]; // 32 - - int valid_cols_src1 = src1->ne[0]; // 96 - int num_rows_src1 = src1->ne[1]; // 7 - int batch_src1 = src1->ne[2]; // 32 - - // For src0: row_stride = nb[1] / nb[0] - int row_stride_src0 = src0->nb[1] / src0->nb[0]; // 6144 / 2 = 3072 - int batch_stride_src0 = src0->nb[2] / src0->nb[0]; // 192 / 2 = 96 - - // For src1: row_stride = nb[1] / nb[0] - int row_stride_src1 = src1->nb[1] / src1->nb[0]; // 12288 / 4 = 3072 - int batch_stride_src1 = src1->nb[2] / src1->nb[0]; // 384 / 4 = 96 - - std::vector indices_src0 = build_indices(valid_cols_src0, num_rows_src0, batch_src0, row_stride_src0, batch_stride_src0); - std::vector indices_src1 = build_indices(valid_cols_src1, num_rows_src1, batch_src1, row_stride_src1, batch_stride_src1); - - size_t total_src0 = indices_src0.size(); // = 96 * 32 * 32 - size_t total_src1 = indices_src1.size(); // = 96 * 7 * 32 - - ov::Shape orig_shape_src0 = { static_cast(src0->ne[2]), - static_cast(src0->ne[1]), - static_cast(src0->ne[0])}; - ov::Shape orig_shape_src1 = { static_cast(src1->ne[2]), - static_cast(src1->ne[1]), - static_cast(src1->ne[0])}; - - auto param_src0 = std::make_shared(ov::element::f16, orig_shape_src0); - auto param_src1 = std::make_shared(ov::element::f32, orig_shape_src1); - - ov::Shape flat_shape_src0 = { total_src0 }; - ov::Shape flat_shape_src1 = { total_src1 }; - - auto flatten_src0 = std::make_shared( - param_src0, - ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector{ static_cast(total_src0) }), - false); - auto flatten_src1 = std::make_shared( - param_src1, - ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector{ static_cast(total_src1) }), - false); - - auto indices_const_src0 = ov::op::v0::Constant::create(ov::element::i64, flat_shape_src0, indices_src0); - auto indices_const_src1 = ov::op::v0::Constant::create(ov::element::i64, flat_shape_src1, indices_src1); - auto axis_const = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); - - auto gathered_src0 = std::make_shared(flatten_src0, indices_const_src0, axis_const); - auto gathered_src1 = std::make_shared(flatten_src1, indices_const_src1, axis_const); - - std::vector shape_src0_cont = { batch_src0, num_rows_src0, valid_cols_src0 }; - auto reshape_src0 =
std::make_shared( - gathered_src0, - ov::op::v0::Constant::create(ov::element::i64, { shape_src0_cont.size() }, shape_src0_cont), - false); - - std::vector shape_src1_cont = { batch_src1, num_rows_src1, valid_cols_src1 }; - auto reshape_src1 = std::make_shared( - gathered_src1, - ov::op::v0::Constant::create(ov::element::i64, { shape_src1_cont.size() }, shape_src1_cont), - false); - - auto src0_f32 = std::make_shared(reshape_src0, ov::element::f32); - auto transpose_order = ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector{0, 2, 1}); - auto src0_transposed = std::make_shared(src0_f32, transpose_order); - - auto A = src0_transposed; - auto B = reshape_src1; - - auto batched_matmul = std::make_shared(B, A, false, false); - - std::vector final_output_shape = {static_cast(dst->ne[2]), - static_cast(dst->ne[1]), - static_cast(dst->ne[0])}; - - auto reshape_output = std::make_shared( - batched_matmul, - ov::op::v0::Constant::create(ov::element::i64, {3}, final_output_shape), - false); - - auto model = std::make_shared(ov::NodeVector{ reshape_output }, - ov::ParameterVector{ param_src0, param_src1 }); - - ov::Tensor tensor_src0{ ov::element::f16, orig_shape_src0, src0->data }; - ov::Tensor tensor_src1{ ov::element::f32, orig_shape_src1, src1->data }; - ov::Shape output_shape = { static_cast(dst->ne[2]), - static_cast(dst->ne[1]), - static_cast(dst->ne[0]) }; - ov::Tensor tensor_dst(ov::element::f32, output_shape, dst->data); - - ov::Core core; - auto compiled_model = core.compile_model(model, "CPU"); - auto infer_request = compiled_model.create_infer_request(); - infer_request.set_input_tensor(0, tensor_src0); - infer_request.set_input_tensor(1, tensor_src1); - infer_request.set_output_tensor(0, tensor_dst); - infer_request.infer(); - return ; - } - - int rank = 0; - if (dst->ne[2] == 1 && dst->ne[3] == 1) { - rank = 2; - } else if (dst->ne[3] == 1) { - rank = 3; - } else { - throw std::runtime_error("Only rank 2 or rank 3 are supported in this implementation."); - } - - std::vector eff_shape_src0 = get_effective_shape(src0); - std::vector eff_shape_src1 = get_effective_shape(src1); - std::vector eff_shape_dst = get_effective_shape(dst); - - ov::Shape orig_shape_src0 = { static_cast(src0->ne[2]), - static_cast(src0->ne[1]), - static_cast(src0->ne[0])}; - ov::Shape orig_shape_src1 = { static_cast(src1->ne[2]), - static_cast(src1->ne[1]), - static_cast(src1->ne[0])}; - auto param_src0 = std::make_shared(ov::element::f16, orig_shape_src0); - auto param_src1 = std::make_shared(ov::element::f32, orig_shape_src1); - - auto reshape_src0 = std::make_shared( - param_src0, - ov::op::v0::Constant::create(ov::element::i64, { eff_shape_src0.size() }, eff_shape_src0), - false); - auto reshape_src1 = std::make_shared( - param_src1, - ov::op::v0::Constant::create(ov::element::i64, { eff_shape_src1.size() }, eff_shape_src1), - false); - - auto src0_f32 = std::make_shared(reshape_src0, ov::element::f32); - - ov::Output A_for_mul; - if (rank == 2) { - auto trans_order = ov::op::v0::Constant::create(ov::element::i64, { 2 }, std::vector{1, 0}); - A_for_mul = std::make_shared(src0_f32, trans_order); - } else if (rank == 3) { - auto trans_order = ov::op::v0::Constant::create(ov::element::i64, { 3 }, std::vector{0, 2, 1}); - A_for_mul = std::make_shared(src0_f32, trans_order); - } else { - A_for_mul = src0_f32; - } - - auto matmul = std::make_shared(reshape_src1, A_for_mul, false, false); - - auto matmul_output_shape = matmul->get_output_shape(0); - std::vector final_output_shape; - if 
(matmul_output_shape.size() == 1) { - final_output_shape = { 1, 1, static_cast(matmul_output_shape[0]) }; - } else if (matmul_output_shape.size() == 2) { - final_output_shape = { 1, static_cast(matmul_output_shape[0]), static_cast(matmul_output_shape[1]) }; - } else { - final_output_shape = { static_cast(matmul_output_shape[0]), static_cast(matmul_output_shape[1]), static_cast(matmul_output_shape[2]) }; - } - - auto reshape_output = std::make_shared( - matmul, - ov::op::v0::Constant::create(ov::element::i64, {3}, final_output_shape), - false - ); - - auto model = std::make_shared(ov::NodeVector{ reshape_output }, - ov::ParameterVector{ param_src0, param_src1 }); - - ov::Tensor tensor_src0{ ov::element::f16, orig_shape_src0, (void *)src0->data }; - ov::Tensor tensor_src1{ ov::element::f32, orig_shape_src1, (void *)src1->data }; - - ov::Shape output_shape = { static_cast(dst->ne[2]), - static_cast(dst->ne[1]), - static_cast(dst->ne[0]) }; - ov::Tensor tensor_dst(ov::element::f32, output_shape, dst->data); - - ov::Core core; - auto compiled_model = core.compile_model(model, "CPU"); - auto infer_request = compiled_model.create_infer_request(); - infer_request.set_input_tensor(0, tensor_src0); - infer_request.set_input_tensor(1, tensor_src1); - infer_request.set_output_tensor(0, tensor_dst); - infer_request.infer(); -} - -void ggml_backend_openvino_reshape(ggml_tensor *dst) { - - GGML_UNUSED(dst); -} - -void ggml_backend_openvino_view(ggml_tensor *dst) { - - GGML_UNUSED(dst); -} - -void ggml_backend_openvino_dup_bytes(struct ggml_tensor *dst) { - const struct ggml_tensor *src0 = dst->src[0]; - - // Validate tensor properties - GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0)); - GGML_ASSERT(src0->type == dst->type); - - // Determine tensor properties - const size_t element_size = ggml_type_size(src0->type); - - // Case 1: Both tensors are contiguous - if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && (src0->ne[0] * element_size == src0->nb[1])) { - ov::Shape input_shape = { - static_cast(src0->ne[2]), - static_cast(src0->ne[1]), - static_cast(src0->ne[0]) - }; - size_t num_elements = 1; - for (auto d : input_shape) { - num_elements *= d; - } - ov::Shape flat_shape = { num_elements }; - - ov::Shape dst_shape = { - static_cast(dst->ne[2]), - static_cast(dst->ne[1]), - static_cast(dst->ne[0]) - }; - - auto input_param = std::make_shared(ov::element::f32, input_shape); - - std::vector flat_shape_vec(flat_shape.begin(), flat_shape.end()); - auto flat_reshape_const = ov::op::v0::Constant::create(ov::element::i64, { flat_shape_vec.size() }, flat_shape_vec); - auto flat_reshape = std::make_shared(input_param, flat_reshape_const, false); - - std::vector dst_shape_vec(dst_shape.begin(), dst_shape.end()); - auto dst_reshape_const = ov::op::v0::Constant::create(ov::element::i64, { dst_shape_vec.size() }, dst_shape_vec); - auto final_reshape = std::make_shared(flat_reshape, dst_reshape_const, false); - - auto model = std::make_shared(ov::OutputVector{ final_reshape }, ov::ParameterVector{ input_param }); - - ov::Core core; - auto compiled_model = core.compile_model(model, "CPU"); - auto infer_request = compiled_model.create_infer_request(); - - ov::Tensor input_tensor(ov::element::f32, input_shape, src0->data); - infer_request.set_input_tensor(0, input_tensor); - - ov::Tensor output_tensor(ov::element::f32, dst_shape, dst->data); - infer_request.set_output_tensor(0, output_tensor); - - infer_request.infer(); - return; - } - - // Case 2: Compatible types, dimensions, and strides - const 
size_t ne00 = src0->ne[0]; - const size_t ne01 = src0->ne[1]; - const size_t nb00 = src0->nb[0]; - const size_t nb01 = src0->nb[1]; - const size_t nb0 = dst->nb[0]; - - if (src0->type == dst->type && ne00 == dst->ne[0] && nb00 == element_size && nb0 == element_size) { - const size_t valid_elems = static_cast(src0->ne[0]); // 3072 - const size_t num_rows = static_cast(src0->ne[1]); // 7 - const size_t dim2 = static_cast(src0->ne[2]); // 1 - - size_t phys_stride = static_cast(src0->nb[1]) / element_size; // 9216 - // size_t phys_stride = static_cast(src0->ne[0]); // 3072 - - ov::Shape input_shape = { dim2, num_rows, phys_stride }; // e.g. {1, 7, 9216} - ov::Shape logical_shape = { dim2, num_rows, valid_elems }; // {1, 7, 3072} - - // std::cout << "CONT input shape: " << input_shape << std::endl; - auto input_param = std::make_shared(ov::element::f32, input_shape); - - // int64_t split_addr = dst->src[0]->view_offs / dst->src[0]->nb[0]; - // std::vector begin = { 0, 0, split_addr }; - // std::vector end = { static_cast(dim2), - // static_cast(num_rows), - // split_addr + static_cast(valid_elems) }; - - std::vector begin = { 0, 0, 0 }; - std::vector end = { static_cast(dim2), - static_cast(num_rows), - static_cast(valid_elems) }; - std::vector strides = { 1, 1, 1 }; - - auto begin_const = ov::op::v0::Constant::create(ov::element::i64, { begin.size() }, begin); - auto end_const = ov::op::v0::Constant::create(ov::element::i64, { end.size() }, end); - auto strides_const = ov::op::v0::Constant::create(ov::element::i64, { strides.size() }, strides); - - std::vector begin_mask = {0, 0, 0}; - std::vector end_mask = {0, 0, 0}; - auto slice = std::make_shared( - input_param, - begin_const, - end_const, - strides_const, - begin_mask, - end_mask - ); - - auto model = std::make_shared(ov::OutputVector{ slice }, - ov::ParameterVector{ input_param }); - - ov::Core core; - auto compiled_model = core.compile_model(model, "CPU"); - auto infer_request = compiled_model.create_infer_request(); - - //[NOTE]: input_shape should be {1, 7, 9216}, not the original shape of src0.
- ov::Tensor input_tensor(ov::element::f32, input_shape, src0->data); - infer_request.set_input_tensor(0, input_tensor); - - ov::Tensor output_tensor(ov::element::f32, logical_shape, dst->data); - infer_request.set_output_tensor(0, output_tensor); - - infer_request.infer(); - return; - } - - // Case 3: Non-contiguous source, contiguous destination - // dst->ne =[3072,7,1,1], dst->nb =[4,12288,86016,86016], dst->type=GGML_TYPE_F32 - // dst->src[0]->ne=[96,32,7,1], dst->src[0]->nb=[4,2688,384,86016], dst->src[0]->type=GGML_TYPE_F32 - if (ggml_is_contiguous(dst)) { - size_t valid_i = static_cast(src0->ne[0]); // 96 - size_t valid_j = static_cast(src0->ne[1]); // 32 - size_t valid_k = static_cast(src0->ne[2]); // 7 - - ov::Shape src_shape = { valid_k, valid_j, valid_i }; // {7, 32, 96}; - auto src_param = std::make_shared(ov::element::f32, src_shape); - - ov::Shape input_shape = { valid_j, valid_k, valid_i }; // {32, 7, 96} - auto tmp_param = ov::op::v0::Constant::create(ov::element::i64, { input_shape.size() }, input_shape); - auto input_param = std::make_shared(src_param, tmp_param, false); - - // Add a Transpose node to turn {32,7,96} back into {7,32,96}, restoring the logical order - // by swapping dims 0 and 1, i.e. permutation = {1, 0, 2} - std::vector order = {1, 0, 2}; - auto order_const = ov::op::v0::Constant::create(ov::element::i64, {order.size()}, order); - auto transpose = std::make_shared(input_param, order_const); - - ov::Shape target_shape = { static_cast(dst->ne[2]), static_cast(dst->ne[1]), static_cast(dst->ne[0]) }; // {1, 7, 3072} - std::vector target_shape_vec = { static_cast(dst->ne[2]), - static_cast(dst->ne[1]), - static_cast(dst->ne[0]) }; - auto reshape_const = ov::op::v0::Constant::create(ov::element::i64, { target_shape_vec.size() }, target_shape_vec); - auto reshaped = std::make_shared(transpose, reshape_const, false); - - auto model = std::make_shared(ov::OutputVector{ reshaped }, - ov::ParameterVector{ src_param }); - ov::Core core; - auto compiled_model = core.compile_model(model, "CPU"); - auto infer_request = compiled_model.create_infer_request(); - - ov::Tensor input_tensor(ov::element::f32, src_shape, src0->data); - infer_request.set_input_tensor(0, input_tensor); - - ov::Tensor output_tensor(ov::element::f32, target_shape, dst->data); - infer_request.set_output_tensor(0, output_tensor); - - infer_request.infer(); - return; - } -} - -static void ggml_backend_openvino_transpose(ggml_tensor *dst) { - // ov::Core core; - // ov::Shape input_shape{static_cast(dst->src[0]->ne[2]), static_cast(dst->src[0]->ne[1]), static_cast(dst->src[0]->ne[0])}; - // ov::Shape output_shape{static_cast(dst->ne[2]), static_cast(dst->ne[1]), static_cast(dst->ne[0])}; - // auto input_param = std::make_shared(ov::element::f32, input_shape); - - // //auto res = std::make_shared(input_param, ov::op::v0::Constant::create(ov::element::i64, {3}, {0, 2, 1})); - - - - // auto new_shape_node = ov::op::v0::Constant::create(ov::element::i64, - // ov::Shape{output_shape.size()}, - // std::vector(output_shape.begin(), output_shape.end())); - // auto res = std::make_shared(input_param, new_shape_node, false); - - - - - // std::shared_ptr model = std::make_shared(ov::OutputVector{res}, - // ov::ParameterVector{input_param}); - // auto compiled_model = core.compile_model(model, "CPU"); - // ov::InferRequest infer_request = compiled_model.create_infer_request(); - - // ov::Tensor input_tensor(ov::element::f32, input_shape, dst->src[0]->data); - // ov::Tensor output_tensor(ov::element::f32, output_shape, dst->data); - //
infer_request.set_input_tensor(0, input_tensor); - // infer_request.set_output_tensor(0, output_tensor); - - // infer_request.infer(); - - // NOP - GGML_UNUSED(dst); -} - -void ggml_backend_openvino_cpy(struct ggml_tensor *dst) { - const struct ggml_tensor *src0 = dst->src[0]; - const struct ggml_tensor *src1 = dst->src[1]; - assert(src0 != nullptr); - assert(ggml_nelements(dst) == ggml_nelements(src0)); - - // Extract shapes - ov::Shape src_shape(src0->ne, src0->ne + 4); - ov::Shape dst_shape(dst->ne, dst->ne + 4); - - // Initialize OpenVINO core - ov::Core core; - - // Create OpenVINO parameter for the source tensor - auto src_input = std::make_shared(ov::element::f32, src_shape); - - std::shared_ptr model; - if (ggml_is_contiguous(dst)) { - // Contiguous Case: Flatten src and reshape to dst shape - ov::Shape flattened_shape = {static_cast(ggml_nelements(src0))}; - auto flatten = std::make_shared( - src_input, ov::op::v0::Constant::create(ov::element::i64, {1}, flattened_shape), false); - - auto reshape_to_dst = std::make_shared( - flatten, ov::op::v0::Constant::create(ov::element::i64, {4}, dst_shape), false); - - auto dst_output = std::make_shared(reshape_to_dst, ov::element::f16); - - model = std::make_shared( - ov::ResultVector{std::make_shared(dst_output)}, - ov::ParameterVector{src_input}, - "ContiguousCopy"); - // Compile and execute the model - auto compiled_model = core.compile_model(model, "CPU"); - - ov::Tensor src_tensor(ov::element::f32, src_shape, src0->data); - ov::Tensor dst_tensor(ov::element::f16, dst_shape, dst->data); - - auto infer_request = compiled_model.create_infer_request(); - infer_request.set_input_tensor(0, src_tensor); - infer_request.set_output_tensor(0, dst_tensor); - infer_request.infer(); - } else { - int src0_elem_size = ggml_type_size(src0->type); - int src1_elem_size = ggml_type_size(src1->type); - - int src0_logical_cols = src0->ne[0]; - int src0_logical_rows = src0->ne[1]; - int src1_logical_cols = src1->ne[0]; - int src1_logical_rows = src1->ne[1]; - - int src0_phys_cols = src0->nb[0] / src0_elem_size; - int src0_phys_rows = src0_logical_rows; - - int src1_phys_cols = src1->nb[1] / src1_elem_size; - int src1_phys_rows = src1_logical_rows; - - ov::Shape src0_phys_shape = {1, static_cast(src0_phys_rows), static_cast(src0_phys_cols) }; - ov::Shape src1_phys_shape = {1, static_cast(src1_phys_rows), static_cast(src1_phys_cols) }; - - size_t logical_elems = static_cast(src0_logical_cols * src0_logical_rows); - size_t src_flat_size = 1 * src0_phys_cols * src0_phys_rows; - size_t dst_flat_size = 1 * src1_phys_rows * src1_phys_cols; - - ov::Core core; - - std::vector gather_idx; - gather_idx.reserve(logical_elems); - for (int row = 0; row < src0_logical_rows; row++) { - for (int col = 0; col < src0_logical_cols; col++) { - gather_idx.push_back(static_cast(row + col * src0_phys_rows)); - } - } - ov::Shape gather_idx_shape = { logical_elems }; - - std::vector scatter_idx; - scatter_idx.reserve(logical_elems); - for (int row = 0; row < src1_logical_rows; row++) { - for (int col = 0; col < src1_logical_cols; col++) { - scatter_idx.push_back(static_cast(row * src1_phys_cols + col)); - } - } - ov::Shape scatter_idx_shape = { logical_elems, 1 }; - - auto param_src0 = std::make_shared(ov::element::f32, src0_phys_shape); - auto param_src1 = std::make_shared(ov::element::f16, src1_phys_shape); - - auto src_flat_shape_const = ov::op::v0::Constant::create(ov::element::i64, {1}, - { static_cast(src_flat_size) }); - auto reshape_src = std::make_shared(param_src0, 
src_flat_shape_const, false); - auto dst_flat_shape_const = ov::op::v0::Constant::create(ov::element::i64, {1}, - { static_cast(dst_flat_size) }); - auto reshape_dst = std::make_shared(param_src1, dst_flat_shape_const, false); - - auto gather_indices_const = ov::op::v0::Constant::create(ov::element::i64, gather_idx_shape, gather_idx); - auto axis_const = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); - auto gathered = std::make_shared(reshape_src, gather_indices_const, axis_const); - auto converted = std::make_shared(gathered, ov::element::f16); - - auto scatter_indices_const = ov::op::v0::Constant::create(ov::element::i64, scatter_idx_shape, scatter_idx); - auto scatter = std::make_shared(reshape_dst, scatter_indices_const, converted); - - std::vector dst_phys_shape_vec = {1, static_cast(src1_phys_rows), - static_cast(src1_phys_cols) }; - auto dst_phys_shape_const = ov::op::v0::Constant::create(ov::element::i64, {3}, dst_phys_shape_vec); - auto final_output = std::make_shared(scatter, dst_phys_shape_const, false); - - ov::ParameterVector params = { param_src0, param_src1 }; - auto model = std::make_shared(ov::OutputVector{ final_output }, params); - auto compiled_model = core.compile_model(model, "CPU"); - auto infer_request = compiled_model.create_infer_request(); - - ov::Tensor tensor_src(ov::element::f32, src0_phys_shape, src0->data); - ov::Tensor tensor_dst(ov::element::f16, src1_phys_shape, src1->data); - infer_request.set_input_tensor(0, tensor_src); - infer_request.set_input_tensor(1, tensor_dst); - - ov::Tensor out_tensor(ov::element::f16, src1_phys_shape, dst->data); - infer_request.set_output_tensor(0, out_tensor); - - infer_request.infer(); - } -} - -static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { - // Find the indices of GGML_OP_CONT, GGML_OP_CPY nodes, GGML_OP_MUL_MAT and so on. 
- std::vector cont_indices; - std::vector reshape_indices; - std::vector view_indices; - std::vector view_indices_prompt; - std::vector view_split; - - std::vector cpy_indices; - std::vector cpy_split_16; - std::vector cpy_split_19; - std::vector transpose_indices; - std::vector permute_indices; - - std::vector mul_mat_indices; - std::vector add_indices; - - for (int i = 0; i < cgraph->n_nodes; i++) { - if (cgraph->nodes[i]->op == GGML_OP_CONT) { - cont_indices.push_back(i); - } else if (cgraph->nodes[i]->op == GGML_OP_RESHAPE) { - reshape_indices.push_back(i); - // } else if (cgraph->nodes[i]->op == GGML_OP_VIEW) { - } else if (cgraph->nodes[i]->op == GGML_OP_VIEW) { - // if (cgraph->nodes[i]->src[0]->ne[0] == 98304 && (cgraph->nodes[i]->ne[0] == 3072 || cgraph->nodes[i]->ne[0] == 1)) - // continue; - view_indices.push_back(i); - if (cgraph->nodes[i]->ne[0] == 32) { - view_indices_prompt.push_back(i); - } - if (i == 18) { - view_split.push_back(i); - } - } else if (cgraph->nodes[i]->op == GGML_OP_CPY) { - cpy_indices.push_back(i); - if (i == 16) { - cpy_split_16.push_back(i); - } - if (i == 19) { - cpy_split_19.push_back(i); - } - } else if (cgraph->nodes[i]->op == GGML_OP_TRANSPOSE) { - transpose_indices.push_back(i); - } else if (cgraph->nodes[i]->op == GGML_OP_PERMUTE) { - permute_indices.push_back(i); - } else if (cgraph->nodes[i]->op == GGML_OP_MUL_MAT) { - mul_mat_indices.push_back(i); - } else if (cgraph->nodes[i]->op == GGML_OP_ADD) { - add_indices.push_back(i); - } - } - - - // Process nodes in order - - if (cgraph->nodes[0]->ne[1] == 1) { - for (int i = 0; i < cgraph->n_nodes; i++) { - if (std::find(add_indices.begin(), add_indices.end(), i) != add_indices.end()) { - ggml_backend_openvino_add_forward(cgraph->nodes[i]); - } else if (std::find(transpose_indices.begin(), transpose_indices.end(), i) != transpose_indices.end()) { - ggml_backend_openvino_transpose(cgraph->nodes[i]); - } else if (std::find(cpy_indices.begin(), cpy_indices.end(), i) != cpy_indices.end()) { - ggml_backend_openvino_cpy(cgraph->nodes[i]); - } else if (std::find(view_indices.begin(), view_indices.end(), i) != view_indices.end()) { - ggml_backend_openvino_view(cgraph->nodes[i]); - } else if (std::find(cont_indices.begin(), cont_indices.end(), i) != cont_indices.end()) { - ggml_backend_openvino_dup_bytes(cgraph->nodes[i]); - } else if (std::find(reshape_indices.begin(), reshape_indices.end(), i) != reshape_indices.end()) { - ggml_backend_openvino_reshape(cgraph->nodes[i]); - } else { - // Process a range of nodes with openvino_frontend_compute - int start_index = i; - while (i < cgraph->n_nodes - && std::find(add_indices.begin(), add_indices.end(), i) == add_indices.end() - && std::find(transpose_indices.begin(), transpose_indices.end(), i) == transpose_indices.end() - && std::find(cpy_indices.begin(), cpy_indices.end(), i) == cpy_indices.end() - && std::find(view_indices.begin(), view_indices.end(), i) == view_indices.end() - && std::find(cont_indices.begin(), cont_indices.end(), i) == cont_indices.end() - && std::find(reshape_indices.begin(), reshape_indices.end(), i) == reshape_indices.end() - ) { - i++; - } - if (start_index < i) { - openvino_frontend_compute(backend, cgraph, start_index, --i); - } - } - } - } else { - int end_node = cgraph->n_nodes - 1; - openvino_frontend_compute(backend, cgraph, 0, end_node); - } - return GGML_STATUS_SUCCESS; - GGML_UNUSED(backend); - GGML_UNUSED(ctx); } static const ggml_backend_i ggml_backend_openvino_interface = { @@ -1265,53 +250,15 @@ static ggml_backend_buffer_t 
ggml_backend_openvino_device_buffer_from_host_ptr(g static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) { GGML_ASSERT(dev->reg != nullptr); -#ifdef OPENVINO_OP_DEBUG -static const std::set& openvino_ops = []() -> const std::set& { - static const std::set ops = get_openvino_available_opsets(); - return ops; - }(); - switch (op->op) { - case GGML_OP_NONE: - case GGML_OP_PERMUTE: - case GGML_OP_RESHAPE: - case GGML_OP_TRANSPOSE: - case GGML_OP_VIEW: - return true; - case GGML_OP_ADD: - return true; - case GGML_OP_MUL: - case GGML_OP_MUL_MAT: - return false; - case GGML_OP_UNARY: - switch (ggml_get_unary_op(op)) - { - case GGML_UNARY_OP_SILU: - return true; - case GGML_UNARY_OP_ABS: - case GGML_UNARY_OP_SGN: - case GGML_UNARY_OP_NEG: - case GGML_UNARY_OP_STEP: - case GGML_UNARY_OP_TANH: - case GGML_UNARY_OP_ELU: - case GGML_UNARY_OP_RELU: - case GGML_UNARY_OP_SIGMOID: - case GGML_UNARY_OP_GELU: - case GGML_UNARY_OP_GELU_QUICK: - case GGML_UNARY_OP_HARDSWISH: - case GGML_UNARY_OP_HARDSIGMOID: - case GGML_UNARY_OP_EXP: - case GGML_UNARY_OP_COUNT: - return false; - } - return false; - default: - return false; - } -#else - static const std::set& openvino_ops = []() -> const std::set& { - static const std::set ops = get_openvino_available_opsets(); - return ops; - }(); + static const std::set supported_ops{ + GGML_OP_ADD, GGML_OP_MUL, GGML_OP_MUL_MAT, GGML_OP_VIEW, + GGML_OP_CONT, GGML_OP_CPY, GGML_OP_RESHAPE, GGML_OP_PERMUTE, + GGML_OP_TRANSPOSE, GGML_OP_GET_ROWS, GGML_OP_ROPE, GGML_OP_RMS_NORM, + GGML_OP_SCALE, GGML_OP_SOFT_MAX, + }; + static const std::set supported_unary_ops{ + GGML_UNARY_OP_SILU, + }; if (op->op == GGML_OP_UNARY) { return supported_unary_ops.find(ggml_get_unary_op(op)) != @@ -1457,5 +404,4 @@ GGML_API ggml_backend_reg_t ggml_backend_openvino_reg(void) { } return ® -} - +} \ No newline at end of file diff --git a/ggml/src/ggml-openvino/CMakeLists.txt b/ggml/src/ggml-openvino/CMakeLists.txt new file mode 100644 index 0000000000..75b1144843 --- /dev/null +++ b/ggml/src/ggml-openvino/CMakeLists.txt @@ -0,0 +1,42 @@ +find_package(OpenVINO REQUIRED) +list(APPEND GGML_EXTRA_LIBS_PRIVATE openvino::runtime) + +# Set header and libs +file(GLOB GGML_HEADERS_OPENVINO "ggml-openvino/*.h") +list(APPEND GGML_HEADERS_OPENVINO "../include/ggml-openvino.h") +file(GLOB GGML_SOURCES_OPENVINO "ggml-openvino/*.cpp") +list(APPEND GGML_SOURCES_OPENVINO "ggml-openvino.cpp") + +list(APPEND GGML_CDEF_PUBLIC GGML_USE_OPENVINO) + +if (OPENVINO_DEVICE) + if (OPENVINO_DEVICE STREQUAL "GPU") + add_compile_definitions(GGML_OPENVINO_GPU) + elseif (OPENVINO_DEVICE STREQUAL "NPU") + add_compile_definitions(GGML_OPENVINO_NPU) + endif() +endif() + +if(NOT DEFINED GGML_OV_FRONTEND) + set(GGML_OV_FRONTEND OpenVINO_DIR) +endif() +add_definitions(-DGGML_OV_FRONTEND="${GGML_OV_FRONTEND}") + +if (OpenVINO_DIR) + if (GGML_OPENVINO) + if (NOT UNIX) + set(GGML_OPENVINO OFF) + message(WARNING "OpenVINO: OpenVINO toolkit supports unix but not ${CMAKE_SYSTEM_NAME}. Turning off GGML_OPENVINO") + endif() + endif() + + if (GGML_OPENVINO) + if (CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64") + elseif (CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" OR CMAKE_SYSTEM_PROCESSOR STREQUAL "amd64") + else() + set(GGML_OPENVINO OFF) + message(WARNING "OpenVINO: OpenVINO toolkit supports x86-64 and arm64 but not ${CMAKE_SYSTEM_PROCESSOR}. 
Turning off GGML_OPENVINO") + endif() + endif() + +endif() diff --git a/ggml/src/ggml-openvino/decoder.h b/ggml/src/ggml-openvino/decoder.h index 790ed2e88d..3404e7c211 100644 --- a/ggml/src/ggml-openvino/decoder.h +++ b/ggml/src/ggml-openvino/decoder.h @@ -1,9 +1,8 @@ #pragma once #include - -#include "openvino/core/node.hpp" -#include "openvino/frontend/decoder.hpp" +#include +#include namespace ov { namespace frontend { @@ -43,11 +42,7 @@ public: virtual std::string& get_output_name(size_t index) const = 0; - virtual size_t get_output_size() const = 0; - - virtual bool is_graph_output(size_t index) const = 0; - - virtual std::string& get_output_name(size_t index) const = 0; + virtual std::vector get_output_names() const = 0; virtual const std::string& get_op_type() const = 0; @@ -65,4 +60,4 @@ public: } // namespace ggml } // namespace frontend -} // namespace ov +} // namespace ov \ No newline at end of file diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 372f880b1d..28409186f8 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -354,7 +354,7 @@ std::vector GgmlOvDecoder::get_shape(const ggml_tensor* tensor) { std::vector GgmlOvDecoder::get_stride(const ggml_tensor* tensor) { std::vector stride; - for (int i = GGML_MAX_DIMS - 2; i >= 0 ; --i) { + for (int i = GGML_MAX_DIMS - 2; i >= 0; --i) { stride.push_back(static_cast(tensor->nb[i])); } return stride; @@ -448,27 +448,16 @@ void GgmlOvDecoder::visit_subgraph(std::function opTypeMap = { - {GGML_OP_ACC, "GGML_OP_ACC"}, - {GGML_OP_ADD, "GGML_OP_ADD"}, - {GGML_OP_ADD1, "GGML_OP_ADD1"}, - {GGML_OP_CONT, "GGML_OP_CONT"}, - {GGML_OP_CPY, "GGML_OP_CPY"}, - {GGML_OP_DIV, "GGML_OP_DIV"}, - {GGML_OP_DUP, "GGML_OP_DUP"}, - {GGML_OP_GET_ROWS, "GGML_OP_GET_ROWS"}, - {GGML_OP_MUL, "GGML_OP_MUL"}, - {GGML_OP_MUL_MAT, "GGML_OP_MUL_MAT"}, - {GGML_OP_PERMUTE, "GGML_OP_PERMUTE"}, - {GGML_OP_RESHAPE, "GGML_OP_RESHAPE"}, - {GGML_OP_RMS_NORM, "GGML_OP_RMS_NORM"}, - {GGML_OP_ROPE, "GGML_OP_ROPE"}, - {GGML_OP_SCALE, "GGML_OP_SCALE"}, - {GGML_OP_SOFT_MAX, "GGML_OP_SOFT_MAX"}, - {GGML_OP_SUB, "GGML_OP_SUB"}, - {GGML_OP_TRANSPOSE, "GGML_OP_TRANSPOSE"}, - {GGML_OP_UNARY, "GGML_OP_UNARY"}, - {GGML_OP_VIEW, "GGML_OP_VIEW"} - }; + {GGML_OP_ACC, "GGML_OP_ACC"}, {GGML_OP_ADD, "GGML_OP_ADD"}, + {GGML_OP_ADD1, "GGML_OP_ADD1"}, {GGML_OP_CONT, "GGML_OP_CONT"}, + {GGML_OP_CPY, "GGML_OP_CPY"}, {GGML_OP_DIV, "GGML_OP_DIV"}, + {GGML_OP_DUP, "GGML_OP_DUP"}, {GGML_OP_GET_ROWS, "GGML_OP_GET_ROWS"}, + {GGML_OP_MUL, "GGML_OP_MUL"}, {GGML_OP_MUL_MAT, "GGML_OP_MUL_MAT"}, + {GGML_OP_PERMUTE, "GGML_OP_PERMUTE"}, {GGML_OP_RESHAPE, "GGML_OP_RESHAPE"}, + {GGML_OP_RMS_NORM, "GGML_OP_RMS_NORM"}, {GGML_OP_ROPE, "GGML_OP_ROPE"}, + {GGML_OP_SCALE, "GGML_OP_SCALE"}, {GGML_OP_SOFT_MAX, "GGML_OP_SOFT_MAX"}, + {GGML_OP_SUB, "GGML_OP_SUB"}, {GGML_OP_TRANSPOSE, "GGML_OP_TRANSPOSE"}, + {GGML_OP_UNARY, "GGML_OP_UNARY"}, {GGML_OP_VIEW, "GGML_OP_VIEW"}}; static const std::map unaryOpTypeMap = { {GGML_UNARY_OP_ABS, "GGML_UNARY_OP_ABS"}, {GGML_UNARY_OP_SGN, "GGML_UNARY_OP_SGN"}, @@ -484,8 +473,7 @@ const std::string& GgmlOvDecoder::get_op_type() const { {GGML_UNARY_OP_HARDSWISH, "GGML_UNARY_OP_HARDSWISH"}, {GGML_UNARY_OP_HARDSIGMOID, "GGML_UNARY_OP_HARDSIGMOID"}, {GGML_UNARY_OP_EXP, "GGML_UNARY_OP_EXP"}, - {GGML_UNARY_OP_COUNT, "GGML_UNARY_OP_COUNT"} - }; + {GGML_UNARY_OP_COUNT, "GGML_UNARY_OP_COUNT"}}; auto it = opTypeMap.find(m_node->op); if (it != opTypeMap.end()) { if (it->first == GGML_OP_UNARY) { 
@@ -498,4 +486,4 @@ const std::string& GgmlOvDecoder::get_op_type() const { } static const std::string unknown_op = "UNKNOWN_OP"; return unknown_op; -} +} \ No newline at end of file diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index 22ff9d85f7..a0f6cbea30 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -53,11 +53,7 @@ public: virtual std::string& get_output_name(size_t index) const override; - virtual size_t get_output_size() const override; - - virtual bool is_graph_output(size_t index) const override; - - virtual std::string& get_output_name(size_t index) const override; + virtual std::vector get_output_names() const override; virtual const std::string& get_op_type() const override; @@ -105,10 +101,10 @@ private: void set_max_token_len(); int64_t m_max_token_len; - struct ggml_cgraph * m_cgraph; - std::map m_inputs; + struct ggml_cgraph* m_cgraph; + std::map m_inputs; std::vector m_input_names; - std::map m_outputs; + std::map m_outputs; std::vector m_output_names; ggml_tensor* m_node; std::vector m_nodes; @@ -123,4 +119,4 @@ private: std::vector m_model_output_names; }; -void print_tensor_address_map(const struct ggml_cgraph* cgraph); +void print_tensor_address_map(const struct ggml_cgraph* cgraph); \ No newline at end of file diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 6166161c41..f36700d5ec 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -42,12 +42,7 @@ std::map get_ggml_graph_output_dst(std::shared_ptr& output_dst); + std::map& output_dst); \ No newline at end of file From 4c905b2b2583635881f91d08e78d8fb19c39ffb3 Mon Sep 17 00:00:00 2001 From: Zijun Yu Date: Tue, 13 May 2025 14:31:23 +0800 Subject: [PATCH 060/254] fix build error --- ggml/include/ggml-openvino.h | 32 +++++------- ggml/src/ggml-openvino/CMakeLists.txt | 49 +++++-------------- ggml/src/ggml-openvino/ggml-decoder.cpp | 6 +-- .../src/{ => ggml-openvino}/ggml-openvino.cpp | 31 +++++------- 4 files changed, 40 insertions(+), 78 deletions(-) rename ggml/src/{ => ggml-openvino}/ggml-openvino.cpp (94%) diff --git a/ggml/include/ggml-openvino.h b/ggml/include/ggml-openvino.h index 9172414c29..151c48d40d 100644 --- a/ggml/include/ggml-openvino.h +++ b/ggml/include/ggml-openvino.h @@ -14,37 +14,29 @@ extern "C" { #define GGML_OPENVINO_MAX_DEVICES 16 // backend API -GGML_API ggml_backend_t ggml_backend_openvino_init(int device); +GGML_BACKEND_API ggml_backend_t ggml_backend_openvino_init(int device); -GGML_API bool ggml_backend_is_openvino(ggml_backend_t backend); +GGML_BACKEND_API bool ggml_backend_is_openvino(ggml_backend_t backend); // device buffer -GGML_API ggml_backend_buffer_type_t -ggml_backend_openvino_buffer_type(int device); +GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_openvino_buffer_type(int device); // split tensor buffer that splits matrices by rows across multiple devices -GGML_API ggml_backend_buffer_type_t -ggml_backend_openvino_split_buffer_type(const float *tensor_split); +GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_openvino_split_buffer_type(const float * tensor_split); // pinned host buffer for use with the CPU backend for faster copies between CPU // and GPU -GGML_API ggml_backend_buffer_type_t -ggml_backend_openvino_host_buffer_type(void); +GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_openvino_host_buffer_type(void); -GGML_API int ggml_backend_openvino_get_device_count(void); -// GGML_API 
void ggml_backend_openvino_get_device_description(int device, -// char *description, -// size_t -// description_size); -// GGML_API void ggml_backend_openvino_get_device_memory(int device, size_t -// *free, -// size_t *total); +GGML_BACKEND_API int ggml_backend_openvino_get_device_count(void); +// GGML_BACKEND_API void ggml_backend_openvino_get_device_description(int device, char * description, +// size_t description_size); +// GGML_BACKEND_API void ggml_backend_openvino_get_device_memory(int device, size_t * free, size_t * total); -// GGML_API bool ggml_backend_openvino_register_host_buffer(void *buffer, size_t -// size); GGML_API void ggml_backend_openvino_unregister_host_buffer(void -// *buffer); +// GGML_BACKEND_API bool ggml_backend_openvino_register_host_buffer(void * buffer, size_t size); +// GGML_BACKEND_API void ggml_backend_openvino_unregister_host_buffer(void * buffer); -GGML_API ggml_backend_reg_t ggml_backend_openvino_reg(void); +GGML_BACKEND_API ggml_backend_reg_t ggml_backend_openvino_reg(void); struct ggml_openvino_device_info { int device_count; diff --git a/ggml/src/ggml-openvino/CMakeLists.txt b/ggml/src/ggml-openvino/CMakeLists.txt index 75b1144843..08712c1527 100644 --- a/ggml/src/ggml-openvino/CMakeLists.txt +++ b/ggml/src/ggml-openvino/CMakeLists.txt @@ -1,42 +1,19 @@ find_package(OpenVINO REQUIRED) -list(APPEND GGML_EXTRA_LIBS_PRIVATE openvino::runtime) -# Set header and libs -file(GLOB GGML_HEADERS_OPENVINO "ggml-openvino/*.h") -list(APPEND GGML_HEADERS_OPENVINO "../include/ggml-openvino.h") -file(GLOB GGML_SOURCES_OPENVINO "ggml-openvino/*.cpp") -list(APPEND GGML_SOURCES_OPENVINO "ggml-openvino.cpp") +file(GLOB_RECURSE GGML_HEADERS_OPENVINO "*.h" "*.hpp") +file(GLOB_RECURSE GGML_SOURCES_OPENVINO "*.cpp") -list(APPEND GGML_CDEF_PUBLIC GGML_USE_OPENVINO) +ggml_add_backend_library(ggml-openvino + ${GGML_SOURCES_OPENVINO} + ${GGML_HEADERS_OPENVINO} +) -if (OPENVINO_DEVICE) - if (OPENVINO_DEVICE STREQUAL "GPU") - add_compile_definitions(GGML_OPENVINO_GPU) - elseif (OPENVINO_DEVICE STREQUAL "NPU") - add_compile_definitions(GGML_OPENVINO_NPU) +target_link_libraries(ggml-openvino PRIVATE openvino::runtime) + +if (GGML_OPENVINO) + if (CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64") + elseif (CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" OR CMAKE_SYSTEM_PROCESSOR STREQUAL "amd64") + else() + message(FATAL_ERROR "OpenVINO: OpenVINO toolkit supports x86-64 and arm64 but not ${CMAKE_SYSTEM_PROCESSOR}") endif() endif() - -if(NOT DEFINED GGML_OV_FRONTEND) - set(GGML_OV_FRONTEND OpenVINO_DIR) -endif() -add_definitions(-DGGML_OV_FRONTEND="${GGML_OV_FRONTEND}") - -if (OpenVINO_DIR) - if (GGML_OPENVINO) - if (NOT UNIX) - set(GGML_OPENVINO OFF) - message(WARNING "OpenVINO: OpenVINO toolkit supports unix but not ${CMAKE_SYSTEM_NAME}. Turning off GGML_OPENVINO") - endif() - endif() - - if (GGML_OPENVINO) - if (CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64") - elseif (CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" OR CMAKE_SYSTEM_PROCESSOR STREQUAL "amd64") - else() - set(GGML_OPENVINO OFF) - message(WARNING "OpenVINO: OpenVINO toolkit supports x86-64 and arm64 but not ${CMAKE_SYSTEM_PROCESSOR}. 
Turning off GGML_OPENVINO") - endif() - endif() - -endif() diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 28409186f8..43869ec228 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -278,8 +279,7 @@ void GgmlOvDecoder::dump_cgraph(const struct ggml_cgraph* cgraph) { << std::setw(5) << node->ne[2] << ", " << std::setw(5) << node->ne[3] << "] " << std::left << std::setw(20) << ggml_op_name(node->op) << std::right << " " - << std::left << std::setw(44) << node->name << std::right - << ((node->flags & GGML_TENSOR_FLAG_PARAM) ? "x" : node->grad ? "g" : " ") + << std::left << std::setw(45) << node->name << std::right << std::setw(2) << "[ " << std::setw(0) << node->nb[0] << ", " << std::setw(5) << node->nb[1] << ", " @@ -486,4 +486,4 @@ const std::string& GgmlOvDecoder::get_op_type() const { } static const std::string unknown_op = "UNKNOWN_OP"; return unknown_op; -} \ No newline at end of file +} diff --git a/ggml/src/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp similarity index 94% rename from ggml/src/ggml-openvino.cpp rename to ggml/src/ggml-openvino/ggml-openvino.cpp index f5d5c7ed67..01fccea47a 100644 --- a/ggml/src/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino.cpp @@ -62,7 +62,6 @@ ggml_backend_openvino_graph_compute(ggml_backend_t backend, struct ggml_cgraph * static const ggml_backend_i ggml_backend_openvino_interface = { /* .get_name = */ ggml_backend_openvino_get_name, /* .free = */ ggml_backend_openvino_free, - /* .get_default_buffer_type = */ ggml_backend_openvino_get_default_buffer_type, /* .set_tensor_async = */ NULL, /* .get_tensor_async = */ NULL, /* .cpy_tensor_async = */ NULL, @@ -72,9 +71,6 @@ static const ggml_backend_i ggml_backend_openvino_interface = { /* .graph_plan_update = */ NULL, /* .graph_plan_compute = */ NULL, /* .graph_compute = */ ggml_backend_openvino_graph_compute, - /* .supports_op = */ NULL, - /* .supports_buft = */ NULL, - /* .offload_op = */ NULL, /* .event_record = */ NULL, /* .event_wait = */ NULL, }; @@ -89,7 +85,7 @@ static ggml_guid_t ggml_backend_openvino_guid(void) { } // backend API -GGML_API ggml_backend_t ggml_backend_openvino_init(int device) { +GGML_BACKEND_API ggml_backend_t ggml_backend_openvino_init(int device) { if (device < 0 || device >= ggml_backend_openvino_get_device_count()) { GGML_LOG_ERROR("%s: invalid device %d\n", __func__, device); return nullptr; @@ -111,30 +107,28 @@ GGML_API ggml_backend_t ggml_backend_openvino_init(int device) { return openvino_backend; } -GGML_API bool ggml_backend_is_openvino(ggml_backend_t backend) { +GGML_BACKEND_API bool ggml_backend_is_openvino(ggml_backend_t backend) { GGML_ASSERT(backend->context != nullptr); return true; } // device buffer -GGML_API ggml_backend_buffer_type_t -ggml_backend_openvino_buffer_type(int device) { +GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_openvino_buffer_type(int device) { GGML_ASSERT(device >= 0); return nullptr; } // split tensor buffer that splits matrices by rows across multiple devices -GGML_API ggml_backend_buffer_type_t -ggml_backend_openvino_split_buffer_type(const float *tensor_split) { +GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_openvino_split_buffer_type(const float * tensor_split) { GGML_ASSERT(tensor_split != nullptr); return nullptr; } // pinned host buffer for use with the CPU backend for faster copies between CPU // 
and GPU -GGML_API ggml_backend_buffer_type_t -ggml_backend_openvino_host_buffer_type(void) { return nullptr;} - +GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_openvino_host_buffer_type(void) { + return nullptr; +} struct ggml_backend_openvino_buffer_type_context { int device; @@ -367,7 +361,7 @@ const ggml_openvino_device_info & ggml_openvino_info() { return info; } -GGML_API ggml_backend_reg_t ggml_backend_openvino_reg(void) { +GGML_BACKEND_API ggml_backend_reg_t ggml_backend_openvino_reg(void) { static ggml_backend_reg reg; static bool initialized = false; @@ -394,14 +388,13 @@ GGML_API ggml_backend_reg_t ggml_backend_openvino_reg(void) { ctx->devices.push_back(dev); } - reg = ggml_backend_reg { - /* .interface = */ ggml_backend_openvino_reg_interface, - /* .context = */ ctx - }; + reg = ggml_backend_reg{ /* .api_version = */ GGML_BACKEND_API_VERSION, + /* .iface = */ ggml_backend_openvino_reg_interface, + /* .context = */ ctx }; } initialized = true; } return ® -} \ No newline at end of file +} From a0b30529bf0675c875adb00cd001756a3f86cce5 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Tue, 13 May 2025 17:45:47 +0800 Subject: [PATCH 061/254] FIX: backend buffer type issue --- ggml/src/ggml-backend-reg.cpp | 1 + ggml/src/ggml-openvino/ggml-openvino.cpp | 15 ++++----------- 2 files changed, 5 insertions(+), 11 deletions(-) diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp index 1f8ae17363..15286b6aa6 100644 --- a/ggml/src/ggml-backend-reg.cpp +++ b/ggml/src/ggml-backend-reg.cpp @@ -630,6 +630,7 @@ void ggml_backend_load_all_from_path(const char * dir_path) { ggml_backend_load_best("opencl", silent, dir_path); ggml_backend_load_best("hexagon", silent, dir_path); ggml_backend_load_best("musa", silent, dir_path); + ggml_backend_load_best("openvino", silent, dir_path); ggml_backend_load_best("cpu", silent, dir_path); // check the environment variable GGML_BACKEND_PATH to load an out-of-tree backend const char * backend_path = std::getenv("GGML_BACKEND_PATH"); diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp index 01fccea47a..19e4ed5b77 100644 --- a/ggml/src/ggml-openvino/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino.cpp @@ -46,17 +46,11 @@ static const char * ggml_backend_openvino_get_name(ggml_backend_t backend) { GGML_UNUSED(backend); } -static ggml_backend_buffer_type_t ggml_backend_openvino_get_default_buffer_type(ggml_backend_t backend) { - return ggml_backend_cpu_buffer_type(); - GGML_UNUSED(backend); -} - static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backend, struct ggml_cgraph *cgraph) { openvino_frontend_compute(backend, cgraph); return GGML_STATUS_SUCCESS; - GGML_UNUSED(backend); } static const ggml_backend_i ggml_backend_openvino_interface = { @@ -108,14 +102,14 @@ GGML_BACKEND_API ggml_backend_t ggml_backend_openvino_init(int device) { } GGML_BACKEND_API bool ggml_backend_is_openvino(ggml_backend_t backend) { - GGML_ASSERT(backend->context != nullptr); - return true; + return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_openvino_guid()); } // device buffer GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_openvino_buffer_type(int device) { GGML_ASSERT(device >= 0); - return nullptr; + return ggml_backend_cpu_buffer_type(); + GGML_UNUSED(device); } // split tensor buffer that splits matrices by rows across multiple devices @@ -184,8 +178,7 @@ static void ggml_backend_openvino_device_get_memory(ggml_backend_dev_t dev, size 
 static enum ggml_backend_dev_type ggml_backend_openvino_device_get_type(ggml_backend_dev_t dev) {
     GGML_UNUSED(dev);
-    return GGML_BACKEND_DEVICE_TYPE_CPU;
-    // return GGML_BACKEND_DEVICE_TYPE_GPU_FULL;
+    return GGML_BACKEND_DEVICE_TYPE_ACCEL;
 }

 static void ggml_backend_openvino_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {

From f15a2cc0571e99a8e922bf1c45f4cc32b0282517 Mon Sep 17 00:00:00 2001
From: "Yu, Zijun"
Date: Fri, 9 May 2025 13:07:27 +0800
Subject: [PATCH 062/254] STYLE: clang-format

---
 ggml/src/ggml-openvino/README.md | 30 ------------------------------
 1 file changed, 30 deletions(-)
 delete mode 100644 ggml/src/ggml-openvino/README.md

diff --git a/ggml/src/ggml-openvino/README.md b/ggml/src/ggml-openvino/README.md
deleted file mode 100644
index 46c2adb438..0000000000
--- a/ggml/src/ggml-openvino/README.md
+++ /dev/null
@@ -1,30 +0,0 @@
-# Instructions to Modify and Build ggml with OpenVINO
-
-## Step 1: Modify the Source Code
-
-To point the build at the OpenVINO GGML frontend `.so`, pass the path to the `.so` file as a CMake option:
-1. Open a terminal and navigate to the root directory of this repo.
-2. Run the following commands to configure:
-   ```sh
-   mkdir build
-   cmake -B build -DGGML_OV_FRONTEND="${openvino_repo_dir}/bin/intel64/Release/libopenvino_ggml_frontend.so"
-   ```
-Here GGML_OV_FRONTEND should point to the `libopenvino_ggml_frontend.so` file.
-
-## Step 2: Build the Project
-
-After modifying the source code, build the project using CMake. Follow these steps:
-
-1. (Optional) Enable the debug option for ggml-openvino; this dumps the subgraph sent to OpenVINO, the state after converting the ggml_cgraph to a GraphIterator, and the input/output values of each OP:
-   ```sh
-   cmake -B build -DGGML_OPENVINO_DEBUG=ON
-   ```
-
-2. Run the following commands to configure and build the project:
-   ```sh
-   cmake -B build -DGGML_OPENVINO=ON
-   cmake --build build -j
-   ```
-
-This will configure the project with OpenVINO support and build it using multiple cores for faster compilation.
-

From 0d009fe61a718942f9184c32594fb6ae66bca30a Mon Sep 17 00:00:00 2001
From: "Yu, Zijun"
Date: Fri, 9 May 2025 13:04:20 +0800
Subject: [PATCH 063/254] FEAT: Add all conversion code from ov side

---
 docs/build.md                                 |   6 +-
 ggml/src/ggml-openvino/ggml-decoder.h         |   2 +-
 .../{decoder.h => openvino/decoder.hpp}       |   1 -
 ggml/src/ggml-openvino/openvino/frontend.cpp  |  27 +++
 ggml/src/ggml-openvino/openvino/frontend.hpp  |  23 +++
 .../ggml-openvino/openvino/input_model.cpp    |  17 ++
 .../ggml-openvino/openvino/input_model.hpp    |  29 +++
 .../ggml-openvino/openvino/node_context.hpp   | 100 ++++++++++
 ggml/src/ggml-openvino/openvino/op/add.cpp    |  23 +++
 ggml/src/ggml-openvino/openvino/op/cont.cpp   |  56 ++++++
 ggml/src/ggml-openvino/openvino/op/cpy.cpp    | 106 +++++++++++
 .../ggml-openvino/openvino/op/get_rows.cpp    |  40 ++++
 ggml/src/ggml-openvino/openvino/op/mul.cpp    |  28 +++
 ggml/src/ggml-openvino/openvino/op/mulmat.cpp | 127 +++++++++++++
 .../src/ggml-openvino/openvino/op/permute.cpp |  22 +++
 .../src/ggml-openvino/openvino/op/reshape.cpp |  35 ++++
 .../ggml-openvino/openvino/op/rms_norm.cpp    |  47 +++++
 ggml/src/ggml-openvino/openvino/op/rope.cpp   | 171 ++++++++++++++++++
 ggml/src/ggml-openvino/openvino/op/scale.cpp  |  31 ++++
 .../ggml-openvino/openvino/op/soft_max.cpp    |  88 +++++++++
 .../ggml-openvino/openvino/op/transpose.cpp   |  23 +++
 ggml/src/ggml-openvino/openvino/op/unary.cpp  |  24 +++
 .../ggml-openvino/openvino/op/unary_silu.cpp  |  29 +++
 ggml/src/ggml-openvino/openvino/op/view.cpp   |  26 +++
 ggml/src/ggml-openvino/openvino/op_table.cpp  |  64 +++++++
 ggml/src/ggml-openvino/openvino/op_table.hpp  |  13 ++
 .../openvino/translate_session.cpp            | 145 +++++++++++++++
 .../openvino/translate_session.hpp            |  27 +++
 ggml/src/ggml-openvino/openvino/utils.cpp     |  52 ++++++
 ggml/src/ggml-openvino/openvino/utils.hpp     |  68 +++++++
 ggml/src/ggml-openvino/utils.cpp              |  30 +--
 31 files changed, 1465 insertions(+), 15 deletions(-)
 create mode 100644 ggml/src/ggml-openvino/openvino/frontend.cpp
 create mode 100644 ggml/src/ggml-openvino/openvino/frontend.hpp
 create mode 100644 ggml/src/ggml-openvino/openvino/input_model.cpp
 create mode 100644 ggml/src/ggml-openvino/openvino/input_model.hpp
 create mode 100644 ggml/src/ggml-openvino/openvino/node_context.hpp
 create mode 100644 ggml/src/ggml-openvino/openvino/op/add.cpp
 create mode 100644 ggml/src/ggml-openvino/openvino/op/cont.cpp
 create mode 100644 ggml/src/ggml-openvino/openvino/op/cpy.cpp
 create mode 100644 ggml/src/ggml-openvino/openvino/op/get_rows.cpp
 create mode 100644 ggml/src/ggml-openvino/openvino/op/mul.cpp
 create mode 100644 ggml/src/ggml-openvino/openvino/op/mulmat.cpp
 create mode 100644 ggml/src/ggml-openvino/openvino/op/permute.cpp
 create mode 100644 ggml/src/ggml-openvino/openvino/op/reshape.cpp
 create mode 100644 ggml/src/ggml-openvino/openvino/op/rms_norm.cpp
 create mode 100644 ggml/src/ggml-openvino/openvino/op/rope.cpp
 create mode 100644 ggml/src/ggml-openvino/openvino/op/scale.cpp
 create mode 100644 ggml/src/ggml-openvino/openvino/op/soft_max.cpp
 create mode 100644 ggml/src/ggml-openvino/openvino/op/transpose.cpp
 create mode 100644 ggml/src/ggml-openvino/openvino/op/unary.cpp
 create mode 100644 ggml/src/ggml-openvino/openvino/op/unary_silu.cpp
 create mode 100644 ggml/src/ggml-openvino/openvino/op/view.cpp
 create mode 100644 ggml/src/ggml-openvino/openvino/op_table.cpp
 create mode 100644 ggml/src/ggml-openvino/openvino/op_table.hpp
 create mode 100644 ggml/src/ggml-openvino/openvino/translate_session.cpp
 create mode 
100644 ggml/src/ggml-openvino/openvino/translate_session.hpp create mode 100644 ggml/src/ggml-openvino/openvino/utils.cpp create mode 100644 ggml/src/ggml-openvino/openvino/utils.hpp diff --git a/docs/build.md b/docs/build.md index 3079a91211..bb7c4137a5 100644 --- a/docs/build.md +++ b/docs/build.md @@ -692,7 +692,11 @@ To read documentation for how to build on IBM Z & LinuxONE, [click here](./build git submodule update --init --recursive export OPENVINO_LLAMA_PATH=$(pwd) + ``` + Before building, change "ENABLE_OV_GGML_FRONTEND" from true to false in the CMakePresets.json file since we already have the code from the ov side in this branch of llama.cpp (`full_backend`). You could also build the master branch of ov instead. + + ``` cmake --preset Release cmake --build build/Release ``` @@ -700,7 +704,7 @@ To read documentation for how to build on IBM Z & LinuxONE, [click here](./build ### Build llama.cpp-ov ```bash - git clone https://github.com/intel-sandbox/llama.cpp-ov.git -b dev_backend_openvino + git clone https://github.com/intel-sandbox/llama.cpp-ov.git -b full_backend cd llama.cpp-ov cmake --preset ReleaseOV diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index a0f6cbea30..959e00b65d 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -5,8 +5,8 @@ #include #include -#include "decoder.h" #include "ggml.h" +#include "openvino/decoder.hpp" class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { public: diff --git a/ggml/src/ggml-openvino/decoder.h b/ggml/src/ggml-openvino/openvino/decoder.hpp similarity index 97% rename from ggml/src/ggml-openvino/decoder.h rename to ggml/src/ggml-openvino/openvino/decoder.hpp index 3404e7c211..3987760a29 100644 --- a/ggml/src/ggml-openvino/decoder.h +++ b/ggml/src/ggml-openvino/openvino/decoder.hpp @@ -8,7 +8,6 @@ namespace ov { namespace frontend { namespace ggml { -// TODO: Directly include from openvino class GgmlDecoder : public DecoderBase { public: virtual ov::Any get_attribute(const std::string& name) const = 0; diff --git a/ggml/src/ggml-openvino/openvino/frontend.cpp b/ggml/src/ggml-openvino/openvino/frontend.cpp new file mode 100644 index 0000000000..ff7f0e8392 --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/frontend.cpp @@ -0,0 +1,27 @@ +#include "frontend.hpp" + +#include "input_model.hpp" +#include "op_table.hpp" +#include "translate_session.hpp" + +namespace ov { +namespace frontend { +namespace ggml { + +FrontEnd::FrontEnd() {} + +std::shared_ptr FrontEnd::convert(const InputModel::Ptr& model) { + auto ggml_model = std::dynamic_pointer_cast(model); + FRONT_END_GENERAL_CHECK(ggml_model, "Invalid input model"); + std::shared_ptr converted_model; + const auto& supported_ops = get_supported_ops(); + { + TranslateSession translate_session(model, supported_ops); + converted_model = translate_session.get_converted_model(); + } + return converted_model; +} + +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/frontend.hpp b/ggml/src/ggml-openvino/openvino/frontend.hpp new file mode 100644 index 0000000000..5cc7ff1773 --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/frontend.hpp @@ -0,0 +1,23 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include + +namespace ov { +namespace frontend { +namespace ggml { + +class FrontEnd { +public: + using Ptr = std::shared_ptr; + FrontEnd(); + + static std::shared_ptr convert(const 
InputModel::Ptr& model); +}; + +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/input_model.cpp b/ggml/src/ggml-openvino/openvino/input_model.cpp new file mode 100644 index 0000000000..5fb16ea2db --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/input_model.cpp @@ -0,0 +1,17 @@ +#include "input_model.hpp" + +#include "decoder.hpp" + +namespace ov { +namespace frontend { +namespace ggml { + +InputModel::InputModel(const std::shared_ptr& gdecoder) : m_decoder(gdecoder) {} + +const std::shared_ptr& InputModel::get_model_decoder() const { + return m_decoder; +} + +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/input_model.hpp b/ggml/src/ggml-openvino/openvino/input_model.hpp new file mode 100644 index 0000000000..9bc9a28e9a --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/input_model.hpp @@ -0,0 +1,29 @@ +#pragma once + +#include + +#include "decoder.hpp" + +namespace ov { +namespace frontend { +namespace ggml { + +class FrontEnd; +class GgmlDecoder; +using ov::frontend::ggml::GgmlDecoder; + +class InputModel : public ov::frontend::InputModel { + friend class ::ov::frontend::ggml::FrontEnd; + +public: + explicit InputModel(const std::shared_ptr& gdecoder); + + const std::shared_ptr& get_model_decoder() const; + +private: + std::shared_ptr m_decoder; +}; + +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/node_context.hpp b/ggml/src/ggml-openvino/openvino/node_context.hpp new file mode 100644 index 0000000000..bac135270d --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/node_context.hpp @@ -0,0 +1,100 @@ +#pragma once + +#include + +#include "decoder.hpp" + +namespace ov { +namespace frontend { +namespace ggml { + +class TranslateSession; + +typedef std::map> TensorMap; + +class NodeContext : public frontend::NodeContext { +public: + NodeContext(const std::shared_ptr& decoder, + std::shared_ptr& tensor_map, + TranslateSession* translate_session = nullptr) + : ov::frontend::NodeContext(decoder->get_op_type()), + m_decoder(decoder), + m_tensor_map(tensor_map), + m_translate_session(translate_session) { + m_input_names = decoder->get_input_names(); + m_output_names = decoder->get_output_names(); + } + + TranslateSession* get_translate_session() const { + return m_translate_session; + } + + size_t get_input_size() const override { + return m_decoder->get_input_size(); + } + + Any get_input_type(size_t index) const { + return m_decoder->get_input_type(m_input_names[index]); + } + + PartialShape get_input_shape(size_t index) const { + return m_decoder->get_input_shape(m_input_names[index]); + } + + std::vector get_input_stride(size_t index) const { + return m_decoder->get_input_stride(m_input_names[index]); + } + + PartialShape get_output_shape(size_t index) const { + return m_decoder->get_output_shape(m_output_names[index]); + } + + std::vector get_output_stride(size_t index) const { + return m_decoder->get_output_stride(m_output_names[index]); + } + + int32_t* get_input_op_params(size_t index) const { + return m_decoder->get_input_op_params(m_input_names[index]); + } + + int32_t* get_output_op_params(size_t index) const { + return m_decoder->get_output_op_params(m_output_names[index]); + } + + ov::element::Type get_output_type(size_t index) const { + return m_decoder->get_output_type(m_output_names[index]); + } + + Output get_input(int idx) const override { + return 
m_tensor_map->at(m_decoder->get_input_name(idx)); + } + + Output get_input(const std::string& name) const override { + return m_tensor_map->at(name); + } + + const std::string& get_name() const override { + return m_decoder->get_op_name(); + } + + ov::Any get_attribute_as_any(const std::string& name) const override { + return m_decoder->get_attribute(name); + } + + bool check_if_continuous() const { + return m_decoder->check_if_continuous(); + } + +private: + std::shared_ptr m_decoder; + std::shared_ptr& m_tensor_map; + TranslateSession* m_translate_session; + std::vector m_input_names; + std::vector m_output_names; +}; + +using CreatorFunction = std::function; + +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/op/add.cpp b/ggml/src/ggml-openvino/openvino/op/add.cpp new file mode 100644 index 0000000000..c218cf34de --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/op/add.cpp @@ -0,0 +1,23 @@ +#include "openvino/op/add.hpp" + +#include "../node_context.hpp" +#include "../utils.hpp" + +namespace ov { +namespace frontend { +namespace ggml { +namespace op { + +OutputVector translate_add(const NodeContext& context) { + num_inputs_check(context, 2, 2); + + auto lhs = context.get_input(0); + auto rhs = context.get_input(1); + auto add = std::make_shared(lhs, rhs); + return {add}; +}; + +} // namespace op +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/op/cont.cpp b/ggml/src/ggml-openvino/openvino/op/cont.cpp new file mode 100644 index 0000000000..2ebc890fda --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/op/cont.cpp @@ -0,0 +1,56 @@ + +#include +#include +#include +#include + +#include "../node_context.hpp" +#include "../utils.hpp" +#include "openvino/op/reshape.hpp" +#include "openvino/op/slice.hpp" + +namespace ov { +namespace frontend { +namespace ggml { +namespace op { + +OutputVector translate_cont(const NodeContext& context) { + num_inputs_check(context, 1, 1); + + auto src_shape = context.get_input_shape(0).to_shape(); + auto dst_shape = context.get_output_shape(0).to_shape(); + + bool continuous = context.check_if_continuous(); + if (continuous) { + // The input comes from a PERMUTE + dst_shape[1] = -1; + auto result = std::make_shared( + context.get_input(0), + ov::op::v0::Constant::create(ov::element::i64, {dst_shape.size()}, dst_shape), + false); + + return {result}; + } else { + // The input comes from a VIEW + // Currently all cases are slicing at lowest dim + int32_t* op_params = context.get_input_op_params(0); + auto output_stride = context.get_output_stride(0); + + int64_t split_addr = op_params[0] / output_stride[2]; + std::vector begin = {0, 0, split_addr}; + std::vector end = {(int64_t)src_shape[0], INT_MAX, split_addr + (int64_t)src_shape[2]}; + std::vector strides = {1, 1, 1}; + + auto begin_const = ov::op::v0::Constant::create(ov::element::i64, {begin.size()}, begin); + auto end_const = ov::op::v0::Constant::create(ov::element::i64, {end.size()}, end); + auto strides_const = ov::op::v0::Constant::create(ov::element::i64, {strides.size()}, strides); + auto slice = std::make_shared(context.get_input(0), begin_const, end_const, strides_const); + + return {slice}; + } +}; + +} // namespace op +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/op/cpy.cpp b/ggml/src/ggml-openvino/openvino/op/cpy.cpp new file mode 100644 index 0000000000..b4f4d59408 --- /dev/null +++ 
b/ggml/src/ggml-openvino/openvino/op/cpy.cpp @@ -0,0 +1,106 @@ +#include +#include +#include + +#include "../node_context.hpp" +#include "../utils.hpp" +#include "openvino/core/node.hpp" +#include "openvino/core/node_output.hpp" +#include "openvino/op/add.hpp" +#include "openvino/op/constant.hpp" +#include "openvino/op/convert_like.hpp" +#include "openvino/op/range.hpp" +#include "openvino/op/reshape.hpp" +#include "openvino/op/scatter_nd_update.hpp" +#include "openvino/op/transpose.hpp" +#include "openvino/op/unsqueeze.hpp" + +namespace ov { +namespace frontend { +namespace ggml { +namespace op { + +OutputVector translate_cpy(const NodeContext& context) { + num_inputs_check(context, 2, 2); + auto src0 = context.get_input(0); + auto src1 = context.get_input(1); + auto past_token_len = context.get_input("past_token_len"); + + auto src0_shape = context.get_input_shape(0).to_shape(); + auto output_shape = context.get_output_shape(0).to_shape(); + bool continuous = context.check_if_continuous(); + + std::vector input0_strides = context.get_input_stride(0); + std::vector output_strides = context.get_output_stride(0); + + auto one = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {1}); + + src0 = std::make_shared(src0, src1); + if (continuous) { + // Write K to cache_k + int64_t head_size = src0_shape[2]; + int64_t num_heads = src0_shape[1]; + + auto reshaped_src1_shape = + ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector{-1, num_heads, head_size}); + auto reshaped_src1 = std::make_shared(src1, reshaped_src1_shape, false); + + auto token_len = get_dimensions(src0.get_node_shared_ptr(), {0}); + token_len = std::make_shared(token_len, + ov::op::v0::Constant::create(ov::element::i64, {0}, {}), + false); + auto total_token_len = std::make_shared(past_token_len, token_len); + std::shared_ptr indices = + std::make_shared(past_token_len, total_token_len, one, ov::element::i64); + indices = std::make_shared( + indices, + ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector{1})); + + auto res = std::make_shared(reshaped_src1, indices, src0); + return {res}; + } else { + // Write V to cache_v + int64_t total_head_size = src0_shape[1]; + + auto reshaped_src0 = std::make_shared( + src0, + ov::op::v0::Constant::create(ov::element::i64, {2}, std::vector{total_head_size, -1}), + false); + auto transposed_src0 = + std::make_shared(reshaped_src0, + ov::op::v0::Constant::create(ov::element::i64, {2}, {1, 0})); + + auto reshaped_src1 = std::make_shared( + src1, + ov::op::v0::Constant::create(ov::element::i64, {2}, std::vector{total_head_size, -1}), + false); + auto transposed_src1 = + std::make_shared(reshaped_src1, + ov::op::v0::Constant::create(ov::element::i64, {2}, {1, 0})); + + auto token_len = get_dimensions(src0.get_node_shared_ptr(), {2}); + token_len = std::make_shared(token_len, + ov::op::v0::Constant::create(ov::element::i64, {0}, {}), + false); + auto total_token_len = std::make_shared(past_token_len, token_len); + std::shared_ptr indices = + std::make_shared(past_token_len, total_token_len, one, ov::element::i64); + indices = std::make_shared( + indices, + ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector{1})); + + auto res = std::make_shared(transposed_src1, indices, transposed_src0); + auto transposed_res = + std::make_shared(res, ov::op::v0::Constant::create(ov::element::i64, {2}, {1, 0})); + auto reshaped_res = std::make_shared( + transposed_res, + ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector{1, total_head_size, -1}), + false); 
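+        // The ScatterNDUpdate above writes one row per new token into the transposed
+        // V cache at rows [past_token_len, past_token_len + token_len); transposing and
+        // reshaping back restores the {1, total_head_size, -1} layout of cache_v.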
+ return {reshaped_res}; + } +}; + +} // namespace op +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/op/get_rows.cpp b/ggml/src/ggml-openvino/openvino/op/get_rows.cpp new file mode 100644 index 0000000000..edb25d9124 --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/op/get_rows.cpp @@ -0,0 +1,40 @@ +#include +#include + +#include "../node_context.hpp" +#include "../utils.hpp" +#include "openvino/core/node.hpp" +#include "openvino/core/node_output.hpp" +#include "openvino/op/constant.hpp" +#include "openvino/op/convert.hpp" +#include "openvino/op/gather.hpp" +#include "openvino/op/reshape.hpp" + +namespace ov { +namespace frontend { +namespace ggml { +namespace op { + +OutputVector translate_get_rows(const NodeContext& context) { + num_inputs_check(context, 2, 2); + + auto data_node = context.get_input(0); + auto indices_node = context.get_input(1); + + auto indices_shape = get_dimensions(indices_node.get_node_shared_ptr(), {2}); + Output indice_reshaped = std::make_shared(indices_node, indices_shape, false); + + auto axis_node = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{}, {1}); + + Output res = std::make_shared(data_node, indice_reshaped, axis_node); + if (res.get_element_type() != context.get_output_type(0)) { + res = std::make_shared(res, context.get_output_type(0)); + } + + return {res}; +}; + +} // namespace op +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/op/mul.cpp b/ggml/src/ggml-openvino/openvino/op/mul.cpp new file mode 100644 index 0000000000..1b1c69f7df --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/op/mul.cpp @@ -0,0 +1,28 @@ +#include +#include + +#include "../node_context.hpp" +#include "../utils.hpp" +#include "openvino/core/node.hpp" +#include "openvino/core/node_output.hpp" +#include "openvino/op/broadcast.hpp" +#include "openvino/op/constant.hpp" +#include "openvino/op/multiply.hpp" +#include "openvino/op/reshape.hpp" + +namespace ov { +namespace frontend { +namespace ggml { +namespace op { + +OutputVector translate_mul(const NodeContext& context) { + num_inputs_check(context, 2, 2); + + auto res = std::make_shared(context.get_input(0), context.get_input(1)); + return {res}; +}; + +} // namespace op +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp new file mode 100644 index 0000000000..e00435ef81 --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp @@ -0,0 +1,127 @@ +#include +#include +#include +#include + +#include "../node_context.hpp" +#include "../utils.hpp" +#include "openvino/core/node.hpp" +#include "openvino/core/node_output.hpp" +#include "openvino/op/concat.hpp" +#include "openvino/op/constant.hpp" +#include "openvino/op/convert_like.hpp" +#include "openvino/op/matmul.hpp" +#include "openvino/op/reshape.hpp" +#include "openvino/op/slice.hpp" +#include "openvino/op/transpose.hpp" + +namespace ov { +namespace frontend { +namespace ggml { +namespace op { + +OutputVector translate_mulmat(const NodeContext& context) { + num_inputs_check(context, 2, 2); + + bool continuous = context.check_if_continuous(); + if (continuous) { + auto src1 = context.get_input(1); + auto src0_converted = std::make_shared(context.get_input(0), src1); + auto result = std::make_shared(src1, src0_converted, false, true); + return {result}; + } else { + /* + Two cases here: + - 21: [ 96, 32, 32, 1] VIEW k-0 [ 2, 
6144, 192, 6144] + [ 196608, 1, 1, 1] 0: NONE cache_k_l0 [ 2, 393216, 393216, 393216] + - 22: [ 96, 7, 32, 1] PERMUTE q-0 [ 4, 12288, 384, 86016] + [ 96, 32, 7, 1] 0: SCALE Qcur-0 [ 4, 384, 12288, 86016] + - 23: [ 32, 7, 32, 1] MUL_MAT kq-0 [ 4, 128, 896, 28672] + [ 96, 32, 32, 1] 0: VIEW k-0 [ 2, 6144, 192, 6144] + [ 96, 7, 32, 1] 1: PERMUTE q-0 [ 4, 12288, 384, 86016] + + - 20: [ 32, 96, 32, 1] VIEW v-0 [ 2, 128, 12288, 393216] + [ 196608, 1, 1, 1] 0: NONE cache_v_l0 [ 2, 393216, 393216, 393216] + - 25: [ 96, 7, 32, 1] MUL_MAT kqv-0 [ 4, 384, 2688, 86016] + [ 32, 96, 32, 1] 0: VIEW v-0 [ 2, 128, 12288, 393216] + [ 32, 7, 32, 1] 1: SOFT_MAX kq_soft_max_ext-0 [ 4, 128, 896, 28672] + + For case 1, for src0, Reshape + Slice + Transpose + For case 2, for src0, Reshape + Slice + */ + ov::Output A; + ov::Output B; + + auto attention_size = context.get_input("attention_size"); + + auto src0 = context.get_input(0); + auto src0_shape = context.get_input_shape(0).to_shape(); + auto src0_stride = context.get_input_stride(0); + auto permuted = is_permuted(src0_stride); + auto token_dim = permuted ? 0 : 2; + + auto src0_perm = argsort_descend(src0_stride); + auto src0_original_shape_ = permute(src0_shape, src0_perm); + std::vector src0_original_shape(src0_original_shape_.begin(), src0_original_shape_.end()); + src0_original_shape[token_dim] = -1; + + auto src0_slice_shape = src0_original_shape; + src0_slice_shape.erase(src0_slice_shape.begin() + token_dim); + + auto src0_reshape_shape = + ov::op::v0::Constant::create(ov::element::i64, {src0_original_shape.size()}, src0_original_shape); + auto src0_reshape = std::make_shared(src0, src0_reshape_shape, false); + + std::shared_ptr slice_end; + if (permuted) { + slice_end = std::make_shared( + ov::OutputVector{attention_size, ov::op::v0::Constant::create(ov::element::i64, {2}, src0_slice_shape)}, + 0); + } else { + slice_end = std::make_shared( + ov::OutputVector{ov::op::v0::Constant::create(ov::element::i64, {2}, src0_slice_shape), attention_size}, + 0); + } + auto slice_start = ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector(3, 0)); + auto slice_step = ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector(3, 1)); + auto src0_slice = std::make_shared(src0_reshape, slice_start, slice_end, slice_step); + + if (permuted) { + B = std::make_shared( + src0_slice, + ov::op::v0::Constant::create(ov::element::i64, {src0_perm.size()}, src0_perm)); + } else { + B = src0_slice; + } + + A = context.get_input(1); + B = std::make_shared(B, A); + + int64_t num_heads = context.get_input_shape(1).to_shape()[0]; + int64_t num_heads_kv = src0_shape[0]; + int64_t kv_num_heads_factor = num_heads / num_heads_kv; + if (kv_num_heads_factor > 1) { + auto num_heads_node = ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector{num_heads}); + auto num_heads_kv_node = + ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector{num_heads_kv}); + auto B_shape_last_two = get_dimensions(B.get_node_shared_ptr(), {1, 2}); + + auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); + std::shared_ptr new_B_shape = + std::make_shared(ov::OutputVector{num_heads_kv_node, one, B_shape_last_two}, 0); + B = std::make_shared(B, new_B_shape, false); + + B = std::make_shared(ov::OutputVector(kv_num_heads_factor, B), 1); + new_B_shape = std::make_shared(ov::OutputVector{num_heads_node, B_shape_last_two}, 0); + B = std::make_shared(B, new_B_shape, false); + } + + auto result = std::make_shared(A, B, false, true); + return {result}; + } +}; + +} // 
namespace op
+} // namespace ggml
+} // namespace frontend
+} // namespace ov
diff --git a/ggml/src/ggml-openvino/openvino/op/permute.cpp b/ggml/src/ggml-openvino/openvino/op/permute.cpp
new file mode 100644
index 0000000000..42472f18cc
--- /dev/null
+++ b/ggml/src/ggml-openvino/openvino/op/permute.cpp
@@ -0,0 +1,22 @@
+#include "../node_context.hpp"
+#include "../utils.hpp"
+#include "openvino/op/constant.hpp"
+#include "openvino/op/transpose.hpp"
+
+namespace ov {
+namespace frontend {
+namespace ggml {
+namespace op {
+OutputVector translate_permute(const NodeContext& context) {
+    num_inputs_check(context, 1, 1);
+
+    // TODO: make this more general
+    auto res = std::make_shared<ov::op::v1::Transpose>(context.get_input(0),
+                                                       ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 0, 2}));
+
+    return {res};
+};
+} // namespace op
+} // namespace ggml
+} // namespace frontend
+} // namespace ov
diff --git a/ggml/src/ggml-openvino/openvino/op/reshape.cpp b/ggml/src/ggml-openvino/openvino/op/reshape.cpp
new file mode 100644
index 0000000000..ca18b72c42
--- /dev/null
+++ b/ggml/src/ggml-openvino/openvino/op/reshape.cpp
@@ -0,0 +1,35 @@
+#include "openvino/op/reshape.hpp"
+
+#include
+#include
+
+#include "../node_context.hpp"
+#include "../utils.hpp"
+#include "openvino/core/node.hpp"
+#include "openvino/core/node_output.hpp"
+#include "openvino/op/constant.hpp"
+
+namespace ov {
+namespace frontend {
+namespace ggml {
+namespace op {
+
+OutputVector translate_reshape(const NodeContext& context) {
+    num_inputs_check(context, 1, 1);
+    if (context.get_input_shape(0) == context.get_output_shape(0)) {
+        return {context.get_input(0)};
+    }
+
+    auto output_shape = context.get_output_shape(0).to_shape();
+    auto new_shape_node =
+        ov::op::v0::Constant::create(ov::element::i64,
+                                     {3},
+                                     std::vector<int64_t>{-1, (int64_t)output_shape[1], (int64_t)output_shape[2]});
+    Output<Node> res = std::make_shared<ov::op::v1::Reshape>(context.get_input(0), new_shape_node, false);
+    return {res};
+};
+
+} // namespace op
+} // namespace ggml
+} // namespace frontend
+} // namespace ov
diff --git a/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp b/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp
new file mode 100644
index 0000000000..7b9783e8c9
--- /dev/null
+++ b/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp
@@ -0,0 +1,47 @@
+#include "../node_context.hpp"
+#include "../utils.hpp"
+#include "openvino/op/add.hpp"
+#include "openvino/op/constant.hpp"
+#include "openvino/op/divide.hpp"
+#include "openvino/op/multiply.hpp"
+#include "openvino/op/reduce_sum.hpp"
+#include "openvino/op/sqrt.hpp"
+
+namespace ov {
+namespace frontend {
+namespace ggml {
+namespace op {
+
+OutputVector translate_rms_norm(const NodeContext& context) {
+    num_inputs_check(context, 1, 1);
+
+    ov::Shape input_shape = context.get_input_shape(0).to_shape();
+    auto input_node = context.get_input(0);
+    auto square = std::make_shared<ov::op::v1::Multiply>(input_node, input_node);
+
+    auto reduce_sum =
+        std::make_shared<ov::op::v1::ReduceSum>(square,
+                                                ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {2}),
+                                                true);
+
+    auto mean = std::make_shared<ov::op::v1::Divide>(
+        reduce_sum,
+        ov::op::v0::Constant::create(ov::element::f32, ov::Shape{}, {static_cast<float>(input_shape[2])}));
+
+    float eps;
+    memcpy(&eps, context.get_output_op_params(0), sizeof(float));
+    auto rms = std::make_shared<ov::op::v0::Sqrt>(
+        std::make_shared<ov::op::v1::Add>(mean, ov::op::v0::Constant::create(ov::element::f32, ov::Shape{}, {eps})));
+
+    auto scale =
+        std::make_shared<ov::op::v1::Divide>(ov::op::v0::Constant::create(ov::element::f32, ov::Shape{}, {1.0f}), rms);
+
+    auto res = std::make_shared<ov::op::v1::Multiply>(input_node, scale);
+
+    return {res};
+};
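+// For reference, the graph built above computes ggml's RMS norm over the last axis:
+//   y = x / sqrt(mean(x^2) + eps)
+// where eps is read from the op params of the destination tensor.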
+} // namespace op +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/op/rope.cpp b/ggml/src/ggml-openvino/openvino/op/rope.cpp new file mode 100644 index 0000000000..d5083ae14b --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/op/rope.cpp @@ -0,0 +1,171 @@ + +#include +#include +#include + +#include "../node_context.hpp" +#include "../utils.hpp" +#include "openvino/core/node.hpp" +#include "openvino/core/node_output.hpp" +#include "openvino/op/add.hpp" +#include "openvino/op/broadcast.hpp" +#include "openvino/op/concat.hpp" +#include "openvino/op/constant.hpp" +#include "openvino/op/convert.hpp" +#include "openvino/op/cos.hpp" +#include "openvino/op/divide.hpp" +#include "openvino/op/multiply.hpp" +#include "openvino/op/reshape.hpp" +#include "openvino/op/shape_of.hpp" +#include "openvino/op/sin.hpp" +#include "openvino/op/slice.hpp" +#include "openvino/op/split.hpp" +#include "openvino/op/subtract.hpp" +#include "openvino/op/transpose.hpp" + +#define GGML_ROPE_TYPE_NEOX 2 + +#define MIN(a, b) ((a) < (b) ? (a) : (b)) +#define MAX(a, b) ((a) > (b) ? (a) : (b)) + +namespace ov { +namespace frontend { +namespace ggml { +namespace op { + +static float ggml_rope_yarn_corr_dim(int n_dims, int n_ctx_orig, float n_rot, float base) { + return n_dims * logf(n_ctx_orig / (n_rot * 2 * (float)M_PI)) / (2 * logf(base)); +} + +void ggml_rope_yarn_corr_dims(int n_dims, + int n_ctx_orig, + float freq_base, + float beta_fast, + float beta_slow, + float dims[2]) { + float start = floorf(ggml_rope_yarn_corr_dim(n_dims, n_ctx_orig, beta_fast, freq_base)); + float end = ceilf(ggml_rope_yarn_corr_dim(n_dims, n_ctx_orig, beta_slow, freq_base)); + dims[0] = MAX(0, start); + dims[1] = MIN(n_dims - 1, end); +} + +OutputVector translate_rope(const NodeContext& context) { + num_inputs_check(context, 2, 3); + + auto data_node = context.get_input(0); + auto pos_node = context.get_input(1); + pos_node = std::make_shared(pos_node, ov::element::f32); + + auto permutation_node = + std::make_shared(ov::element::i64, ov::Shape{3}, std::vector{2, 1, 0}); + Output pos_node_reshaped = std::make_shared(pos_node, permutation_node); + + auto output_shape = context.get_output_shape(0); + + float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow; + int32_t* op_params = context.get_output_op_params(0); + const int n_dims = op_params[1]; + const int mode = op_params[2]; + const int n_ctx_orig = op_params[4]; + memcpy(&freq_base, op_params + 5, sizeof(float)); + memcpy(&freq_scale, op_params + 6, sizeof(float)); + memcpy(&ext_factor, op_params + 7, sizeof(float)); + memcpy(&attn_factor, op_params + 8, sizeof(float)); + memcpy(&beta_fast, op_params + 9, sizeof(float)); + memcpy(&beta_slow, op_params + 10, sizeof(float)); + + const float theta_scale = powf(freq_base, -2.0f / n_dims); + + // TODO: corr_dims is not used in the current implementation + float corr_dims[2]; + ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims); + + const bool is_neox = mode & GGML_ROPE_TYPE_NEOX; + + // TODO: GGML_OP_ROPE_BACK -> false + bool forward = true; + const float sin_sign = forward ? 
1.0f : -1.0f; + + const int64_t ne0 = output_shape[2].get_length(); + std::vector factor(ne0 / 2); + factor[0] = freq_scale; + for (int64_t i = 1; i < ne0 / 2; i++) { + factor[i] = theta_scale * factor[i - 1]; + } + + Output factor_node = + std::make_shared(ov::element::f32, ov::Shape{factor.size()}, factor); + if (context.get_input_size() == 3) { + auto freq_factors_node = context.get_input(2); + factor_node = std::make_shared(factor_node, freq_factors_node); + } + + auto half_last_dim = ov::op::v0::Constant::create(ov::element::i64, Shape{1}, {output_shape[2].get_length() / 2}); + Output input_shape_node = std::make_shared( + OutputVector{get_dimensions(data_node.get_node_shared_ptr(), {0, 1}), half_last_dim}, + 0); + Output factor_broadcasted_node = std::make_shared(factor_node, input_shape_node); + + Output cos_factor_broadcasted_node = std::make_shared( + std::make_shared(factor_broadcasted_node, pos_node_reshaped)); + Output sin_factor_broadcasted_node = std::make_shared( + std::make_shared(factor_broadcasted_node, pos_node_reshaped)); + + float mscale = attn_factor; + Output mscale_node = + std::make_shared(ov::element::f32, ov::Shape{}, std::vector{mscale}); + Output mscale_sin_sign_node = + std::make_shared(ov::element::f32, ov::Shape{}, std::vector{mscale * sin_sign}); + Output cos_theta_node = std::make_shared(cos_factor_broadcasted_node, mscale_node); + Output sin_theta_node = std::make_shared(sin_factor_broadcasted_node, mscale_node); + + if (!is_neox) { + auto input_shape = context.get_input_shape(0); + + auto begin_even = ov::op::v0::Constant::create(ov::element::i64, Shape{3}, {0, 0, 0}); + auto begin_odd = ov::op::v0::Constant::create(ov::element::i64, Shape{3}, {0, 0, 1}); + auto end = std::make_shared(data_node); + auto stride = ov::op::v0::Constant::create(ov::element::i64, Shape{3}, {1, 1, 2}); + auto even_slice = std::make_shared(data_node, begin_even, end, stride); + auto odd_slice = std::make_shared(data_node, begin_odd, end, stride); + + auto first_half = + std::make_shared(std::make_shared(even_slice, cos_theta_node), + std::make_shared(odd_slice, sin_theta_node)); + auto second_half = + std::make_shared(std::make_shared(even_slice, sin_theta_node), + std::make_shared(odd_slice, cos_theta_node)); + + auto stack = std::make_shared(OutputVector{first_half, second_half}, 2); + auto shape_const = ov::op::v0::Constant::create( + ov::element::i64, + Shape{3}, + std::vector{-1, input_shape[1].get_length(), input_shape[2].get_length()}); + auto reshaped = std::make_shared(stack, shape_const, false); + + return {reshaped}; + } else { + auto slice_node = + std::make_shared(data_node, + ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {2}), + 2); + Output slice_data_node_0 = slice_node->outputs()[0]; + Output slice_data_node_1 = slice_node->outputs()[1]; + + auto first_half_node = std::make_shared( + std::make_shared(slice_data_node_0, cos_theta_node), + std::make_shared(slice_data_node_1, sin_theta_node)); + + auto second_half_node = std::make_shared( + std::make_shared(slice_data_node_0, sin_theta_node), + std::make_shared(slice_data_node_1, cos_theta_node)); + + auto res_node = std::make_shared(ov::OutputVector{first_half_node, second_half_node}, 2); + return {res_node}; + } +}; + +} // namespace op +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/op/scale.cpp b/ggml/src/ggml-openvino/openvino/op/scale.cpp new file mode 100644 index 0000000000..392bfc1ed4 --- /dev/null +++ 
b/ggml/src/ggml-openvino/openvino/op/scale.cpp @@ -0,0 +1,31 @@ +#include +#include + +#include "../node_context.hpp" +#include "../utils.hpp" +#include "openvino/core/node.hpp" +#include "openvino/core/node_output.hpp" +#include "openvino/op/constant.hpp" +#include "openvino/op/multiply.hpp" + +namespace ov { +namespace frontend { +namespace ggml { +namespace op { + +OutputVector translate_scale(const NodeContext& context) { + num_inputs_check(context, 1, 1); + + float scale; + memcpy(&scale, context.get_output_op_params(0), sizeof(float)); + auto scale_node = std::make_shared(ov::element::f32, ov::Shape{}, std::vector{scale}); + + auto res = std::make_shared(context.get_input(0), scale_node); + + return {res}; +}; + +} // namespace op +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/op/soft_max.cpp b/ggml/src/ggml-openvino/openvino/op/soft_max.cpp new file mode 100644 index 0000000000..27c7cefef0 --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/op/soft_max.cpp @@ -0,0 +1,88 @@ + +#include +#include +#include + +#include "../node_context.hpp" +#include "../utils.hpp" +#include "openvino/core/node.hpp" +#include "openvino/core/node_output.hpp" +#include "openvino/op/add.hpp" +#include "openvino/op/concat.hpp" +#include "openvino/op/constant.hpp" +#include "openvino/op/convert.hpp" +#include "openvino/op/multiply.hpp" +#include "openvino/op/slice.hpp" +#include "openvino/op/softmax.hpp" + +namespace ov { +namespace frontend { +namespace ggml { +namespace op { + +OutputVector translate_soft_max(const NodeContext& context) { + num_inputs_check(context, 1, 2); + + auto input_node = context.get_input(0); + + float scale = 1.0f; + float max_bias = 0.0f; + auto op_params = context.get_output_op_params(0); + memcpy(&scale, (float*)op_params + 0, sizeof(float)); + memcpy(&max_bias, (float*)op_params + 1, sizeof(float)); + + const uint32_t n_head = context.get_input_shape(0)[0].get_length(); + const uint32_t n_head_log2 = 1u << (uint32_t)floor(log2(n_head)); + + // const float m0 = powf(2.0f, -(max_bias ) / n_head_log2); + // const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2); + const float slope = (max_bias > 0.0f) ? 1.0f : 1.0f; + // const float slope = (max_bias > 0.0f) ? h < n_head_log2 ? 
powf(m0, h + 1) : powf(m1, 2*(h - n_head_log2) + 1) + // : 1.0f; + + if (scale != 1.0f) { + auto scale_node = + std::make_shared(ov::element::f32, ov::Shape{}, std::vector{scale}); + input_node = std::make_shared(input_node, scale_node); + } + + if (context.get_input_size() == 2) { + // Calculate mask then softmax + auto mask_node = context.get_input(1); + ov::element::Type mask_type = (context.get_input_type(1)).as(); + if (mask_type == ov::element::f16) { + // Convert f16 to f32 + mask_node = std::make_shared(mask_node, ov::element::f32); + } + + // Stride slice mask node + Output mask_begin_node = ov::op::v0::Constant::create(ov::element::i64, Shape{3}, {0, 0, 0}); + auto one = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {1}); + auto input_last_two_dim = get_dimensions(input_node.get_node_shared_ptr(), {1, 2}); + auto mask_slice_shape = std::make_shared(ov::NodeVector{one, input_last_two_dim}, 0); + Output mask_stride_node = ov::op::v0::Constant::create(ov::element::i64, Shape{3}, {1, 1, 1}); + auto mask_node_sliced = + std::make_shared(mask_node, mask_begin_node, mask_slice_shape, mask_stride_node); + + // slope * mask + auto slope_node = + std::make_shared(ov::element::f32, ov::Shape{}, std::vector{slope}); + auto slope_mask_node = std::make_shared(mask_node_sliced, slope_node); + + // input + slope * mask + auto input_slope_mask_node = std::make_shared(input_node, slope_mask_node); + + // Calculate softmax + auto res = std::make_shared(input_slope_mask_node, 2); + return {res}; + } else { + // Directly softmax + auto res = std::make_shared(input_node, 0); + return {res}; + } +}; + +} // namespace op +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/op/transpose.cpp b/ggml/src/ggml-openvino/openvino/op/transpose.cpp new file mode 100644 index 0000000000..f7408f40d4 --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/op/transpose.cpp @@ -0,0 +1,23 @@ +#include "openvino/op/transpose.hpp" + +#include "../node_context.hpp" +#include "../utils.hpp" + +namespace ov { +namespace frontend { +namespace ggml { +namespace op { + +OutputVector translate_transpose(const NodeContext& context) { + num_inputs_check(context, 1, 1); + + auto perm = argsort_descend(context.get_output_stride(0)); + auto res = std::make_shared(context.get_input(0), + ov::op::v0::Constant::create(ov::element::i64, {3}, perm)); + return {res}; +}; + +} // namespace op +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/op/unary.cpp b/ggml/src/ggml-openvino/openvino/op/unary.cpp new file mode 100644 index 0000000000..391e0a7599 --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/op/unary.cpp @@ -0,0 +1,24 @@ + +#include +#include + +#include "../node_context.hpp" +#include "../utils.hpp" +#include "openvino/core/node.hpp" +#include "openvino/core/node_output.hpp" + +namespace ov { +namespace frontend { +namespace ggml { +namespace op { + +OutputVector translate_unary(const NodeContext& context) { + num_inputs_check(context, 1, 1); + + return {context.get_input(0)}; +}; + +} // namespace op +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/op/unary_silu.cpp b/ggml/src/ggml-openvino/openvino/op/unary_silu.cpp new file mode 100644 index 0000000000..2a90a79475 --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/op/unary_silu.cpp @@ -0,0 +1,29 @@ +#include +#include + +#include "../node_context.hpp" +#include "../utils.hpp" +#include 
"openvino/core/node.hpp" +#include "openvino/core/node_output.hpp" +#include "openvino/op/multiply.hpp" +#include "openvino/op/sigmoid.hpp" + +namespace ov { +namespace frontend { +namespace ggml { +namespace op { + +OutputVector translate_unary_silu(const NodeContext& context) { + num_inputs_check(context, 1, 1); + + auto input = context.get_input(0); + auto sigmoid = std::make_shared(input); + auto res = std::make_shared(input, sigmoid); + + return {res}; +}; + +} // namespace op +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/op/view.cpp b/ggml/src/ggml-openvino/openvino/op/view.cpp new file mode 100644 index 0000000000..aaf117b662 --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/op/view.cpp @@ -0,0 +1,26 @@ +#include +#include + +#include "../utils.hpp" +#include "openvino/core/node.hpp" +#include "openvino/core/node_output.hpp" +#include "openvino/op/constant.hpp" +#include "openvino/op/convert.hpp" +#include "openvino/op/reshape.hpp" +#include "openvino/op/strided_slice.hpp" + +namespace ov { +namespace frontend { +namespace ggml { +namespace op { + +OutputVector translate_view(const NodeContext& context) { + num_inputs_check(context, 1, 1); + + return {context.get_input(0)}; +}; + +} // namespace op +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/op_table.cpp b/ggml/src/ggml-openvino/openvino/op_table.cpp new file mode 100644 index 0000000000..af51bb157e --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/op_table.cpp @@ -0,0 +1,64 @@ +#include "op_table.hpp" + +#include +#include +#include +#include +#include +#include + +#include "utils.hpp" + +using namespace ov::op; +namespace ov { +namespace frontend { +namespace ggml { + +namespace op { + +#define GGML_OP_CONVERTER(op) OutputVector op(const NodeContext& node) + +GGML_OP_CONVERTER(translate_add); +GGML_OP_CONVERTER(translate_cont); +GGML_OP_CONVERTER(translate_cpy); +GGML_OP_CONVERTER(translate_get_rows); +GGML_OP_CONVERTER(translate_mul); +GGML_OP_CONVERTER(translate_mulmat); +GGML_OP_CONVERTER(translate_permute); +GGML_OP_CONVERTER(translate_reshape); +GGML_OP_CONVERTER(translate_rms_norm); +GGML_OP_CONVERTER(translate_rope); +GGML_OP_CONVERTER(translate_scale); +GGML_OP_CONVERTER(translate_unary_silu); +GGML_OP_CONVERTER(translate_soft_max); +GGML_OP_CONVERTER(translate_transpose); +GGML_OP_CONVERTER(translate_unary); +GGML_OP_CONVERTER(translate_view); + +} // namespace op + +const std::unordered_map get_supported_ops() { + return {{"GGML_OP_ADD", op::translate_1to1_match_2_inputs}, + {"GGML_OP_ADD1", op::translate_1to1_match_2_inputs}, + {"GGML_OP_CONT", op::translate_cont}, + {"GGML_OP_CPY", op::translate_cpy}, + {"GGML_OP_DIV", op::translate_1to1_match_2_inputs}, + {"GGML_OP_GET_ROWS", op::translate_get_rows}, + // {"GGML_OP_MUL", op::translate_1to1_match_2_inputs}, + {"GGML_OP_MUL", op::translate_mul}, + {"GGML_OP_MUL_MAT", op::translate_mulmat}, + {"GGML_OP_PERMUTE", op::translate_permute}, + {"GGML_OP_RESHAPE", op::translate_reshape}, + {"GGML_OP_RMS_NORM", op::translate_rms_norm}, + {"GGML_OP_ROPE", op::translate_rope}, + {"GGML_OP_SCALE", op::translate_scale}, + {"GGML_OP_SOFT_MAX", op::translate_soft_max}, + {"GGML_OP_SUB", op::translate_1to1_match_2_inputs}, + {"GGML_OP_TRANSPOSE", op::translate_transpose}, + {"GGML_UNARY_OP_SILU", op::translate_unary_silu}, + {"GGML_OP_VIEW", op::translate_view}}; +}; + +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git 
a/ggml/src/ggml-openvino/openvino/op_table.hpp b/ggml/src/ggml-openvino/openvino/op_table.hpp new file mode 100644 index 0000000000..c83aaa199f --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/op_table.hpp @@ -0,0 +1,13 @@ +#pragma once + +#include "node_context.hpp" + +namespace ov { +namespace frontend { +namespace ggml { + +const std::unordered_map get_supported_ops(); + +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/translate_session.cpp b/ggml/src/ggml-openvino/openvino/translate_session.cpp new file mode 100644 index 0000000000..f5b14d3a0f --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/translate_session.cpp @@ -0,0 +1,145 @@ +#include "translate_session.hpp" + +#include +#include + +#include "input_model.hpp" + +namespace ov { +namespace frontend { +namespace ggml { + +using namespace ov::op; + +TranslateSession::TranslateSession(const frontend::InputModel::Ptr& input_model, + const std::unordered_map& translator_map) + : m_input_model(input_model), + m_translator_map(translator_map), + m_ov_model(nullptr) {} + +std::shared_ptr TranslateSession::get_converted_model() { + if (m_ov_model) { + return m_ov_model; + } + m_ov_model = translate_graph(m_input_model); + // print_model_topology(); + return m_ov_model; +} + +void TranslateSession::print_model_topology() { + try { + std::ofstream outfile("model_topology.txt", std::ios::out | std::ios::app); + if (!outfile.is_open()) { + throw std::runtime_error("Failed to open file for writing model topology."); + } + + outfile << "============ Model ============" << std::endl; + for (const auto& op : m_ov_model->get_ordered_ops()) { + outfile << "Operation: " << op->get_friendly_name() << std::endl; + outfile << " Inputs:" << std::endl; + for (const auto& input : op->inputs()) { + outfile << " " << input.get_node()->get_friendly_name() << " -> " << input.get_element_type() << " " + << input.get_shape() << std::endl; + } + outfile << " Outputs:" << std::endl; + for (const auto& output : op->outputs()) { + outfile << " " << output.get_node()->get_friendly_name() << " -> " << output.get_element_type() + << " " << output.get_shape() << std::endl; + } + outfile << std::endl; + } + outfile << "===============================" << std::endl; + outfile.close(); + } catch (const std::exception& ex) { + std::cout << ex.what() << std::endl; + } +} + +std::shared_ptr TranslateSession::translate_graph(const frontend::InputModel::Ptr& input_model) { + ov::ParameterVector params; + ov::ResultVector results; + auto tensor_map = std::make_shared(); + std::shared_ptr resulting_model; + + const auto& ggml_model = std::dynamic_pointer_cast(input_model); + std::shared_ptr ggml_model_decoder = ggml_model->get_model_decoder(); + + FRONT_END_GENERAL_CHECK(ggml_model, "nullptr for InputModel is given for translation into OV Model"); + const auto& model_inputs = ggml_model->get_inputs(); + const auto& model_outputs = ggml_model->get_outputs(); + + for (const auto& it : ggml_model_decoder->get_model_inputs()) { + params.push_back(std::dynamic_pointer_cast(it.second)); + (*tensor_map)[it.first] = it.second; + } + + for (const auto& it : ggml_model_decoder->get_model_extra_inputs()) { + params.push_back(std::dynamic_pointer_cast(it.second)); + (*tensor_map)[it.first] = it.second; + } + + for (const auto& it : ggml_model_decoder->get_model_weights()) { + (*tensor_map)[it.first] = it.second; + } + + auto node_visitor = [&](std::shared_ptr node) { + auto operation_type = node->get_op_type(); + 
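/* dispatch: look up the translator registered for this ggml op type and let it emit the corresponding OpenVINO nodes */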
ov::OutputVector converted_outputs; + auto it = m_translator_map.find(operation_type); + if (it != m_translator_map.end()) { + try { + NodeContext node_context(node, tensor_map, this); + converted_outputs = it->second(node_context); + } catch (const std::exception& ex) { + std::cout << ex.what() << std::endl; + } + } else { + // TODO + } + + const auto& node_output_names = node->get_output_names(); + FRONT_END_OP_CONVERSION_CHECK(node_output_names.size() == converted_outputs.size(), + "Number of ", + operation_type, + " outputs greater than number of converted outputs, which are ", + node_output_names.size(), + " and ", + converted_outputs.size(), + " respectively."); + + for (size_t i = 0; i < node_output_names.size(); ++i) { + auto output_name = node_output_names[i]; + if (i < converted_outputs.size() && converted_outputs[i].get_node_shared_ptr() != nullptr) { + (*tensor_map)[output_name] = converted_outputs[i]; + } + } + }; + + ggml_model_decoder->visit_subgraph(node_visitor); + + for (const auto& name : ggml_model_decoder->get_model_output_names()) { + FRONT_END_GENERAL_CHECK(tensor_map->find(name) != tensor_map->end(), + "Output name not found in tensor map: ", + name); + auto result = std::make_shared(tensor_map->at(name)); + // result->set_friendly_name(it); + results.push_back(result); + } + + ov::ParameterVector used_params; + for (const auto& param : params) { + if (!param->output(0).get_target_inputs().empty()) { + used_params.push_back(param); + } + } + if (auto diff = params.size() - used_params.size()) { + std::cout << diff << " parameters are not used in the model." << std::endl; + } + resulting_model = std::make_shared(results, used_params); + + return resulting_model; +} + +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/translate_session.hpp b/ggml/src/ggml-openvino/openvino/translate_session.hpp new file mode 100644 index 0000000000..5c7a9d464d --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/translate_session.hpp @@ -0,0 +1,27 @@ +#pragma once + +#include "input_model.hpp" +#include "node_context.hpp" + +namespace ov { +namespace frontend { +namespace ggml { + +class TranslateSession { +public: + TranslateSession(const frontend::InputModel::Ptr& input_model, + const std::unordered_map& translator_map); + + std::shared_ptr get_converted_model(); + std::shared_ptr translate_graph(const frontend::InputModel::Ptr& input_model); + +private: + void print_model_topology(); + const frontend::InputModel::Ptr m_input_model; + const std::unordered_map& m_translator_map; + std::shared_ptr m_ov_model; +}; + +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/utils.cpp b/ggml/src/ggml-openvino/openvino/utils.cpp new file mode 100644 index 0000000000..ff16e9d4ae --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/utils.cpp @@ -0,0 +1,52 @@ +#include "utils.hpp" + +#include +#include +#include +#include +#include + +namespace ov { +namespace frontend { +namespace ggml { + +std::string getCurrentTime() { + std::time_t now = std::time(nullptr); + char buf[100]; + std::strftime(buf, sizeof(buf), "%Y-%m-%d %H:%M:%S", std::localtime(&now)); + return buf; +} + +void num_inputs_check(const NodeContext& context, size_t min_inputs, size_t max_inputs) { + auto input_size = context.get_input_size(); + FRONT_END_OP_CONVERSION_CHECK(input_size >= min_inputs, "Got less inputs than expected"); + FRONT_END_OP_CONVERSION_CHECK(input_size <= max_inputs, "Got more inputs than 
expected"); +} + +int non_cont_dim(std::vector ne, std::vector nb) { + int dim = nb.size() - 1; + size_t bytes = nb[dim]; + for (int i = dim; i > 0; i--) { + bytes *= ne[i]; + if (bytes != nb[i - 1]) { + return i; + } + } + return 0; +} + +std::shared_ptr get_dimensions(const std::shared_ptr& shape, + const std::vector& dims) { + using namespace ov::op; + const auto zero = v0::Constant::create(ov::element::i32, ov::Shape{}, {0}); + const auto dims_const = v0::Constant::create(ov::element::i32, ov::Shape{dims.size()}, dims); + return std::make_shared(shape, dims_const, zero); +} + +std::shared_ptr get_dimensions(const std::shared_ptr& node, const std::vector& dims) { + return get_dimensions(std::make_shared(node), dims); +} + +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/utils.hpp b/ggml/src/ggml-openvino/openvino/utils.hpp new file mode 100644 index 0000000000..6e106fa932 --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/utils.hpp @@ -0,0 +1,68 @@ +#pragma once + +#include + +#include "node_context.hpp" + +namespace ov { +namespace frontend { +namespace ggml { + +void dump_ov_model(const std::shared_ptr model); + +void num_inputs_check(const NodeContext& context, size_t min_inputs, size_t max_inputs); + +int non_cont_dim(std::vector ne, std::vector nb); + +template +std::vector argsort_descend(const std::vector& v) { + std::vector idx(v.size()); + std::iota(idx.begin(), idx.end(), 0); + std::sort(idx.begin(), idx.end(), [&v](int i1, int i2) { + return v[i1] > v[i2]; + }); + return idx; +} + +template +std::vector sorted_descend(std::vector v) { + std::sort(v.begin(), v.end(), [](T a, T b) { + return a > b; + }); + return v; +} + +template +bool is_permuted(const std::vector& strides) { + for (size_t i = 0; i < strides.size() - 1; ++i) { + if (strides[i] < strides[i + 1]) { + return true; + } + } + return false; +} + +template +std::vector permute(const std::vector& x, const std::vector& perm) { + std::vector result; + result.reserve(perm.size()); + for (int i : perm) { + result.push_back(x[i]); + } + return result; +} + +std::shared_ptr get_dimensions(const std::shared_ptr& shape, const std::vector& dims); +std::shared_ptr get_dimensions(const std::shared_ptr& node, const std::vector& dims); + +namespace op { +template +OutputVector translate_1to1_match_2_inputs(const NodeContext& context) { + num_inputs_check(context, 2, 2); + return {std::make_shared(context.get_input(0), context.get_input(1))}; +} +} // namespace op + +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index f36700d5ec..34bcfc54a7 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -14,6 +14,8 @@ #include "ggml-impl.h" #include "ggml.h" +#include "openvino/frontend.hpp" +#include "openvino/input_model.hpp" std::shared_ptr get_ggml_decoder(struct ggml_cgraph* cgraph) { return std::make_shared(nullptr, cgraph); @@ -56,11 +58,11 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c } // auto devices = core.get_available_devices(); - static auto front_end = get_ggml_frontend(); - if (!front_end) { - GGML_LOG_ERROR("GGML FrontEnd is not initialized \n"); - return GGML_STATUS_FAILED; - } + // static auto front_end = get_ggml_frontend(); + // if (!front_end) { + // GGML_LOG_ERROR("GGML FrontEnd is not initialized \n"); + // return GGML_STATUS_FAILED; + // } using CachedItem = std::pair, 
ov::CompiledModel>; static std::unordered_map compiled_cache; @@ -79,14 +81,18 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c compiled_model = it->second.second; compile_end_time = ggml_time_us(); } else { - std::shared_ptr graph_decoder = ggml_decoder; - ov::frontend::InputModel::Ptr input_model = front_end->load(graph_decoder); - if (!input_model) { - GGML_LOG_ERROR("Input Model is not loaded \n"); - return GGML_STATUS_FAILED; - } + // std::shared_ptr graph_decoder = ggml_decoder; + // ov::frontend::InputModel::Ptr input_model = front_end->load(graph_decoder); + // if (!input_model) { + // GGML_LOG_ERROR("Input Model is not loaded \n"); + // return GGML_STATUS_FAILED; + // } + + // model = front_end->convert(input_model); + + ov::frontend::InputModel::Ptr input_model = std::make_shared(ggml_decoder); + model = ov::frontend::ggml::FrontEnd::convert(input_model); - model = front_end->convert(input_model); conversion_end_time = ggml_time_us(); if (getenv("GGML_OPENVINO_DUMP_IR")) { From cdf5370cb5e6d5c86628e8bcd862f78d4b8771ff Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Tue, 13 May 2025 08:42:54 +0800 Subject: [PATCH 064/254] PERF: favor low precision matmul --- .../ggml-openvino/openvino/node_context.hpp | 2 +- ggml/src/ggml-openvino/openvino/op/mulmat.cpp | 35 ++++++++++--------- .../ggml-openvino/openvino/op/soft_max.cpp | 4 +-- 3 files changed, 21 insertions(+), 20 deletions(-) diff --git a/ggml/src/ggml-openvino/openvino/node_context.hpp b/ggml/src/ggml-openvino/openvino/node_context.hpp index bac135270d..e934e2ac36 100644 --- a/ggml/src/ggml-openvino/openvino/node_context.hpp +++ b/ggml/src/ggml-openvino/openvino/node_context.hpp @@ -33,7 +33,7 @@ public: return m_decoder->get_input_size(); } - Any get_input_type(size_t index) const { + ov::element::Type get_input_type(size_t index) const { return m_decoder->get_input_type(m_input_names[index]); } diff --git a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp index e00435ef81..3e9c5c5083 100644 --- a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp +++ b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp @@ -1,19 +1,18 @@ -#include #include #include +#include +#include +#include +#include +#include +#include +#include +#include +#include #include #include "../node_context.hpp" #include "../utils.hpp" -#include "openvino/core/node.hpp" -#include "openvino/core/node_output.hpp" -#include "openvino/op/concat.hpp" -#include "openvino/op/constant.hpp" -#include "openvino/op/convert_like.hpp" -#include "openvino/op/matmul.hpp" -#include "openvino/op/reshape.hpp" -#include "openvino/op/slice.hpp" -#include "openvino/op/transpose.hpp" namespace ov { namespace frontend { @@ -25,9 +24,10 @@ OutputVector translate_mulmat(const NodeContext& context) { bool continuous = context.check_if_continuous(); if (continuous) { - auto src1 = context.get_input(1); - auto src0_converted = std::make_shared(context.get_input(0), src1); - auto result = std::make_shared(src1, src0_converted, false, true); + auto src0 = context.get_input(0); + auto src1 = std::make_shared(context.get_input(1), context.get_input_type(0)); + auto result_lp = std::make_shared(src1, src0, false, true); + auto result = std::make_shared(result_lp, context.get_output_type(0)); return {result}; } else { /* @@ -94,8 +94,7 @@ OutputVector translate_mulmat(const NodeContext& context) { B = src0_slice; } - A = context.get_input(1); - B = std::make_shared(B, A); + A = std::make_shared(context.get_input(1), 
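/* cast the activation down to src0's element type so the matmul itself runs in low precision; the result is converted back to the output type below */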
context.get_input_type(0)); int64_t num_heads = context.get_input_shape(1).to_shape()[0]; int64_t num_heads_kv = src0_shape[0]; @@ -116,10 +115,12 @@ OutputVector translate_mulmat(const NodeContext& context) { B = std::make_shared(B, new_B_shape, false); } - auto result = std::make_shared(A, B, false, true); + auto result_lp = std::make_shared(A, B, false, true); + auto result = std::make_shared(result_lp, context.get_output_type(0)); + return {result}; } -}; +} } // namespace op } // namespace ggml diff --git a/ggml/src/ggml-openvino/openvino/op/soft_max.cpp b/ggml/src/ggml-openvino/openvino/op/soft_max.cpp index 27c7cefef0..cdb59f47d9 100644 --- a/ggml/src/ggml-openvino/openvino/op/soft_max.cpp +++ b/ggml/src/ggml-openvino/openvino/op/soft_max.cpp @@ -49,7 +49,7 @@ OutputVector translate_soft_max(const NodeContext& context) { if (context.get_input_size() == 2) { // Calculate mask then softmax auto mask_node = context.get_input(1); - ov::element::Type mask_type = (context.get_input_type(1)).as(); + ov::element::Type mask_type = context.get_input_type(1); if (mask_type == ov::element::f16) { // Convert f16 to f32 mask_node = std::make_shared(mask_node, ov::element::f32); @@ -80,7 +80,7 @@ OutputVector translate_soft_max(const NodeContext& context) { auto res = std::make_shared(input_node, 0); return {res}; } -}; +} } // namespace op } // namespace ggml From 0d505b4e560735568b8511aad4ea0b4644d6572d Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Tue, 13 May 2025 10:34:51 +0800 Subject: [PATCH 065/254] STYLE and minor REFACTOR --- ggml/src/ggml-openvino/openvino/op/add.cpp | 4 +- ggml/src/ggml-openvino/openvino/op/cont.cpp | 6 +-- ggml/src/ggml-openvino/openvino/op/cpy.cpp | 22 ++++---- .../ggml-openvino/openvino/op/get_rows.cpp | 15 +++--- ggml/src/ggml-openvino/openvino/op/mul.cpp | 11 +--- .../src/ggml-openvino/openvino/op/permute.cpp | 14 ++--- .../src/ggml-openvino/openvino/op/reshape.cpp | 11 ++-- .../ggml-openvino/openvino/op/rms_norm.cpp | 15 +++--- ggml/src/ggml-openvino/openvino/op/rope.cpp | 36 ++++++------- ggml/src/ggml-openvino/openvino/op/scale.cpp | 9 ++-- .../ggml-openvino/openvino/op/soft_max.cpp | 18 +++---- .../ggml-openvino/openvino/op/transpose.cpp | 4 +- ggml/src/ggml-openvino/openvino/op/unary.cpp | 24 --------- .../ggml-openvino/openvino/op/unary_silu.cpp | 11 ++-- ggml/src/ggml-openvino/openvino/op/view.cpp | 11 +--- ggml/src/ggml-openvino/openvino/op_table.cpp | 2 +- ggml/src/ggml-openvino/openvino/op_table.hpp | 2 +- .../openvino/translate_session.cpp | 51 +++---------------- ggml/src/ggml-openvino/utils.cpp | 6 ++- 19 files changed, 97 insertions(+), 175 deletions(-) delete mode 100644 ggml/src/ggml-openvino/openvino/op/unary.cpp diff --git a/ggml/src/ggml-openvino/openvino/op/add.cpp b/ggml/src/ggml-openvino/openvino/op/add.cpp index c218cf34de..18bc463fb9 100644 --- a/ggml/src/ggml-openvino/openvino/op/add.cpp +++ b/ggml/src/ggml-openvino/openvino/op/add.cpp @@ -1,4 +1,4 @@ -#include "openvino/op/add.hpp" +#include #include "../node_context.hpp" #include "../utils.hpp" @@ -15,7 +15,7 @@ OutputVector translate_add(const NodeContext& context) { auto rhs = context.get_input(1); auto add = std::make_shared(lhs, rhs); return {add}; -}; +} } // namespace op } // namespace ggml diff --git a/ggml/src/ggml-openvino/openvino/op/cont.cpp b/ggml/src/ggml-openvino/openvino/op/cont.cpp index 2ebc890fda..e8e9bf0a4e 100644 --- a/ggml/src/ggml-openvino/openvino/op/cont.cpp +++ b/ggml/src/ggml-openvino/openvino/op/cont.cpp @@ -2,12 +2,12 @@ #include #include #include +#include 
+#include #include #include "../node_context.hpp" #include "../utils.hpp" -#include "openvino/op/reshape.hpp" -#include "openvino/op/slice.hpp" namespace ov { namespace frontend { @@ -48,7 +48,7 @@ OutputVector translate_cont(const NodeContext& context) { return {slice}; } -}; +} } // namespace op } // namespace ggml diff --git a/ggml/src/ggml-openvino/openvino/op/cpy.cpp b/ggml/src/ggml-openvino/openvino/op/cpy.cpp index b4f4d59408..2808d3ee91 100644 --- a/ggml/src/ggml-openvino/openvino/op/cpy.cpp +++ b/ggml/src/ggml-openvino/openvino/op/cpy.cpp @@ -1,19 +1,19 @@ #include #include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include #include #include "../node_context.hpp" #include "../utils.hpp" -#include "openvino/core/node.hpp" -#include "openvino/core/node_output.hpp" -#include "openvino/op/add.hpp" -#include "openvino/op/constant.hpp" -#include "openvino/op/convert_like.hpp" -#include "openvino/op/range.hpp" -#include "openvino/op/reshape.hpp" -#include "openvino/op/scatter_nd_update.hpp" -#include "openvino/op/transpose.hpp" -#include "openvino/op/unsqueeze.hpp" namespace ov { namespace frontend { @@ -98,7 +98,7 @@ OutputVector translate_cpy(const NodeContext& context) { false); return {reshaped_res}; } -}; +} } // namespace op } // namespace ggml diff --git a/ggml/src/ggml-openvino/openvino/op/get_rows.cpp b/ggml/src/ggml-openvino/openvino/op/get_rows.cpp index edb25d9124..64fc57bd88 100644 --- a/ggml/src/ggml-openvino/openvino/op/get_rows.cpp +++ b/ggml/src/ggml-openvino/openvino/op/get_rows.cpp @@ -1,14 +1,13 @@ -#include +#include +#include +#include +#include +#include +#include #include #include "../node_context.hpp" #include "../utils.hpp" -#include "openvino/core/node.hpp" -#include "openvino/core/node_output.hpp" -#include "openvino/op/constant.hpp" -#include "openvino/op/convert.hpp" -#include "openvino/op/gather.hpp" -#include "openvino/op/reshape.hpp" namespace ov { namespace frontend { @@ -32,7 +31,7 @@ OutputVector translate_get_rows(const NodeContext& context) { } return {res}; -}; +} } // namespace op } // namespace ggml diff --git a/ggml/src/ggml-openvino/openvino/op/mul.cpp b/ggml/src/ggml-openvino/openvino/op/mul.cpp index 1b1c69f7df..14473f4e27 100644 --- a/ggml/src/ggml-openvino/openvino/op/mul.cpp +++ b/ggml/src/ggml-openvino/openvino/op/mul.cpp @@ -1,14 +1,7 @@ -#include -#include +#include #include "../node_context.hpp" #include "../utils.hpp" -#include "openvino/core/node.hpp" -#include "openvino/core/node_output.hpp" -#include "openvino/op/broadcast.hpp" -#include "openvino/op/constant.hpp" -#include "openvino/op/multiply.hpp" -#include "openvino/op/reshape.hpp" namespace ov { namespace frontend { @@ -20,7 +13,7 @@ OutputVector translate_mul(const NodeContext& context) { auto res = std::make_shared(context.get_input(0), context.get_input(1)); return {res}; -}; +} } // namespace op } // namespace ggml diff --git a/ggml/src/ggml-openvino/openvino/op/permute.cpp b/ggml/src/ggml-openvino/openvino/op/permute.cpp index 42472f18cc..478c9430f0 100644 --- a/ggml/src/ggml-openvino/openvino/op/permute.cpp +++ b/ggml/src/ggml-openvino/openvino/op/permute.cpp @@ -1,21 +1,23 @@ +#include +#include + #include "../node_context.hpp" #include "../utils.hpp" -#include "openvino/op/constant.hpp" -#include "openvino/op/transpose.hpp" namespace ov { namespace frontend { namespace ggml { namespace op { + OutputVector translate_permute(const NodeContext& context) { num_inputs_check(context, 1, 1); - // TODO: make this more general + 
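/* the permutation is recovered from the output strides: axes sorted by descending stride give the order Transpose must apply */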
auto perm = argsort_descend(context.get_output_stride(0)); auto res = std::make_shared(context.get_input(0), - ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 0, 2})); - + ov::op::v0::Constant::create(ov::element::i64, {3}, perm)); return {res}; -}; +} + } // namespace op } // namespace ggml } // namespace frontend diff --git a/ggml/src/ggml-openvino/openvino/op/reshape.cpp b/ggml/src/ggml-openvino/openvino/op/reshape.cpp index ca18b72c42..06b2bd339e 100644 --- a/ggml/src/ggml-openvino/openvino/op/reshape.cpp +++ b/ggml/src/ggml-openvino/openvino/op/reshape.cpp @@ -1,13 +1,12 @@ -#include "openvino/op/reshape.hpp" - #include +#include +#include +#include +#include #include #include "../node_context.hpp" #include "../utils.hpp" -#include "openvino/core/node.hpp" -#include "openvino/core/node_output.hpp" -#include "openvino/op/constant.hpp" namespace ov { namespace frontend { @@ -27,7 +26,7 @@ OutputVector translate_reshape(const NodeContext& context) { std::vector{-1, (int64_t)output_shape[1], (int64_t)output_shape[2]}); Output res = std::make_shared(context.get_input(0), new_shape_node, false); return {res}; -}; +} } // namespace op } // namespace ggml diff --git a/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp b/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp index 7b9783e8c9..a91fffb72d 100644 --- a/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp +++ b/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp @@ -1,11 +1,12 @@ +#include +#include +#include +#include +#include +#include + #include "../node_context.hpp" #include "../utils.hpp" -#include "openvino/op/add.hpp" -#include "openvino/op/constant.hpp" -#include "openvino/op/divide.hpp" -#include "openvino/op/multiply.hpp" -#include "openvino/op/reduce_sum.hpp" -#include "openvino/op/sqrt.hpp" namespace ov { namespace frontend { @@ -39,7 +40,7 @@ OutputVector translate_rms_norm(const NodeContext& context) { auto res = std::make_shared(input_node, scale); return {res}; -}; +} } // namespace op } // namespace ggml diff --git a/ggml/src/ggml-openvino/openvino/op/rope.cpp b/ggml/src/ggml-openvino/openvino/op/rope.cpp index d5083ae14b..aad156082e 100644 --- a/ggml/src/ggml-openvino/openvino/op/rope.cpp +++ b/ggml/src/ggml-openvino/openvino/op/rope.cpp @@ -1,27 +1,27 @@ #include #include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include #include #include "../node_context.hpp" #include "../utils.hpp" -#include "openvino/core/node.hpp" -#include "openvino/core/node_output.hpp" -#include "openvino/op/add.hpp" -#include "openvino/op/broadcast.hpp" -#include "openvino/op/concat.hpp" -#include "openvino/op/constant.hpp" -#include "openvino/op/convert.hpp" -#include "openvino/op/cos.hpp" -#include "openvino/op/divide.hpp" -#include "openvino/op/multiply.hpp" -#include "openvino/op/reshape.hpp" -#include "openvino/op/shape_of.hpp" -#include "openvino/op/sin.hpp" -#include "openvino/op/slice.hpp" -#include "openvino/op/split.hpp" -#include "openvino/op/subtract.hpp" -#include "openvino/op/transpose.hpp" #define GGML_ROPE_TYPE_NEOX 2 @@ -163,7 +163,7 @@ OutputVector translate_rope(const NodeContext& context) { auto res_node = std::make_shared(ov::OutputVector{first_half_node, second_half_node}, 2); return {res_node}; } -}; +} } // namespace op } // namespace ggml diff --git a/ggml/src/ggml-openvino/openvino/op/scale.cpp b/ggml/src/ggml-openvino/openvino/op/scale.cpp index 392bfc1ed4..b393dd8aa2 100644 --- 
a/ggml/src/ggml-openvino/openvino/op/scale.cpp +++ b/ggml/src/ggml-openvino/openvino/op/scale.cpp @@ -1,12 +1,9 @@ -#include +#include +#include #include #include "../node_context.hpp" #include "../utils.hpp" -#include "openvino/core/node.hpp" -#include "openvino/core/node_output.hpp" -#include "openvino/op/constant.hpp" -#include "openvino/op/multiply.hpp" namespace ov { namespace frontend { @@ -23,7 +20,7 @@ OutputVector translate_scale(const NodeContext& context) { auto res = std::make_shared(context.get_input(0), scale_node); return {res}; -}; +} } // namespace op } // namespace ggml diff --git a/ggml/src/ggml-openvino/openvino/op/soft_max.cpp b/ggml/src/ggml-openvino/openvino/op/soft_max.cpp index cdb59f47d9..549c35a9b6 100644 --- a/ggml/src/ggml-openvino/openvino/op/soft_max.cpp +++ b/ggml/src/ggml-openvino/openvino/op/soft_max.cpp @@ -1,19 +1,19 @@ #include #include +#include +#include +#include +#include +#include +#include +#include +#include +#include #include #include "../node_context.hpp" #include "../utils.hpp" -#include "openvino/core/node.hpp" -#include "openvino/core/node_output.hpp" -#include "openvino/op/add.hpp" -#include "openvino/op/concat.hpp" -#include "openvino/op/constant.hpp" -#include "openvino/op/convert.hpp" -#include "openvino/op/multiply.hpp" -#include "openvino/op/slice.hpp" -#include "openvino/op/softmax.hpp" namespace ov { namespace frontend { diff --git a/ggml/src/ggml-openvino/openvino/op/transpose.cpp b/ggml/src/ggml-openvino/openvino/op/transpose.cpp index f7408f40d4..7d33ca9d61 100644 --- a/ggml/src/ggml-openvino/openvino/op/transpose.cpp +++ b/ggml/src/ggml-openvino/openvino/op/transpose.cpp @@ -1,4 +1,4 @@ -#include "openvino/op/transpose.hpp" +#include #include "../node_context.hpp" #include "../utils.hpp" @@ -15,7 +15,7 @@ OutputVector translate_transpose(const NodeContext& context) { auto res = std::make_shared(context.get_input(0), ov::op::v0::Constant::create(ov::element::i64, {3}, perm)); return {res}; -}; +} } // namespace op } // namespace ggml diff --git a/ggml/src/ggml-openvino/openvino/op/unary.cpp b/ggml/src/ggml-openvino/openvino/op/unary.cpp deleted file mode 100644 index 391e0a7599..0000000000 --- a/ggml/src/ggml-openvino/openvino/op/unary.cpp +++ /dev/null @@ -1,24 +0,0 @@ - -#include -#include - -#include "../node_context.hpp" -#include "../utils.hpp" -#include "openvino/core/node.hpp" -#include "openvino/core/node_output.hpp" - -namespace ov { -namespace frontend { -namespace ggml { -namespace op { - -OutputVector translate_unary(const NodeContext& context) { - num_inputs_check(context, 1, 1); - - return {context.get_input(0)}; -}; - -} // namespace op -} // namespace ggml -} // namespace frontend -} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/op/unary_silu.cpp b/ggml/src/ggml-openvino/openvino/op/unary_silu.cpp index 2a90a79475..1c396e6aaf 100644 --- a/ggml/src/ggml-openvino/openvino/op/unary_silu.cpp +++ b/ggml/src/ggml-openvino/openvino/op/unary_silu.cpp @@ -1,12 +1,9 @@ -#include -#include +#include +#include +#include #include "../node_context.hpp" #include "../utils.hpp" -#include "openvino/core/node.hpp" -#include "openvino/core/node_output.hpp" -#include "openvino/op/multiply.hpp" -#include "openvino/op/sigmoid.hpp" namespace ov { namespace frontend { @@ -21,7 +18,7 @@ OutputVector translate_unary_silu(const NodeContext& context) { auto res = std::make_shared(input, sigmoid); return {res}; -}; +} } // namespace op } // namespace ggml diff --git a/ggml/src/ggml-openvino/openvino/op/view.cpp 
b/ggml/src/ggml-openvino/openvino/op/view.cpp index aaf117b662..fcfb9f732c 100644 --- a/ggml/src/ggml-openvino/openvino/op/view.cpp +++ b/ggml/src/ggml-openvino/openvino/op/view.cpp @@ -1,13 +1,4 @@ -#include -#include - #include "../utils.hpp" -#include "openvino/core/node.hpp" -#include "openvino/core/node_output.hpp" -#include "openvino/op/constant.hpp" -#include "openvino/op/convert.hpp" -#include "openvino/op/reshape.hpp" -#include "openvino/op/strided_slice.hpp" namespace ov { namespace frontend { @@ -18,7 +9,7 @@ OutputVector translate_view(const NodeContext& context) { num_inputs_check(context, 1, 1); return {context.get_input(0)}; -}; +} } // namespace op } // namespace ggml diff --git a/ggml/src/ggml-openvino/openvino/op_table.cpp b/ggml/src/ggml-openvino/openvino/op_table.cpp index af51bb157e..d588b2bff0 100644 --- a/ggml/src/ggml-openvino/openvino/op_table.cpp +++ b/ggml/src/ggml-openvino/openvino/op_table.cpp @@ -37,7 +37,7 @@ GGML_OP_CONVERTER(translate_view); } // namespace op -const std::unordered_map get_supported_ops() { +std::unordered_map get_supported_ops() { return {{"GGML_OP_ADD", op::translate_1to1_match_2_inputs}, {"GGML_OP_ADD1", op::translate_1to1_match_2_inputs}, {"GGML_OP_CONT", op::translate_cont}, diff --git a/ggml/src/ggml-openvino/openvino/op_table.hpp b/ggml/src/ggml-openvino/openvino/op_table.hpp index c83aaa199f..1a71a06c18 100644 --- a/ggml/src/ggml-openvino/openvino/op_table.hpp +++ b/ggml/src/ggml-openvino/openvino/op_table.hpp @@ -6,7 +6,7 @@ namespace ov { namespace frontend { namespace ggml { -const std::unordered_map get_supported_ops(); +std::unordered_map get_supported_ops(); } // namespace ggml } // namespace frontend diff --git a/ggml/src/ggml-openvino/openvino/translate_session.cpp b/ggml/src/ggml-openvino/openvino/translate_session.cpp index f5b14d3a0f..012e9178c6 100644 --- a/ggml/src/ggml-openvino/openvino/translate_session.cpp +++ b/ggml/src/ggml-openvino/openvino/translate_session.cpp @@ -1,8 +1,5 @@ #include "translate_session.hpp" -#include -#include - #include "input_model.hpp" namespace ov { @@ -22,39 +19,9 @@ std::shared_ptr TranslateSession::get_converted_model() { return m_ov_model; } m_ov_model = translate_graph(m_input_model); - // print_model_topology(); return m_ov_model; } -void TranslateSession::print_model_topology() { - try { - std::ofstream outfile("model_topology.txt", std::ios::out | std::ios::app); - if (!outfile.is_open()) { - throw std::runtime_error("Failed to open file for writing model topology."); - } - - outfile << "============ Model ============" << std::endl; - for (const auto& op : m_ov_model->get_ordered_ops()) { - outfile << "Operation: " << op->get_friendly_name() << std::endl; - outfile << " Inputs:" << std::endl; - for (const auto& input : op->inputs()) { - outfile << " " << input.get_node()->get_friendly_name() << " -> " << input.get_element_type() << " " - << input.get_shape() << std::endl; - } - outfile << " Outputs:" << std::endl; - for (const auto& output : op->outputs()) { - outfile << " " << output.get_node()->get_friendly_name() << " -> " << output.get_element_type() - << " " << output.get_shape() << std::endl; - } - outfile << std::endl; - } - outfile << "===============================" << std::endl; - outfile.close(); - } catch (const std::exception& ex) { - std::cout << ex.what() << std::endl; - } -} - std::shared_ptr TranslateSession::translate_graph(const frontend::InputModel::Ptr& input_model) { ov::ParameterVector params; ov::ResultVector results; @@ -86,16 +53,12 @@ std::shared_ptr 
TranslateSession::translate_graph(const frontend::InputMo auto operation_type = node->get_op_type(); ov::OutputVector converted_outputs; auto it = m_translator_map.find(operation_type); - if (it != m_translator_map.end()) { - try { - NodeContext node_context(node, tensor_map, this); - converted_outputs = it->second(node_context); - } catch (const std::exception& ex) { - std::cout << ex.what() << std::endl; - } - } else { - // TODO - } + FRONT_END_OP_CONVERSION_CHECK(it != m_translator_map.end(), + "Translation for operation type ", + operation_type, + " is not implemented."); + NodeContext node_context(node, tensor_map, this); + converted_outputs = it->second(node_context); const auto& node_output_names = node->get_output_names(); FRONT_END_OP_CONVERSION_CHECK(node_output_names.size() == converted_outputs.size(), @@ -122,7 +85,7 @@ std::shared_ptr TranslateSession::translate_graph(const frontend::InputMo "Output name not found in tensor map: ", name); auto result = std::make_shared(tensor_map->at(name)); - // result->set_friendly_name(it); + result->set_friendly_name(name); results.push_back(result); } diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 34bcfc54a7..09bf0d0ac5 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -69,10 +69,13 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c std::shared_ptr model; ov::CompiledModel compiled_model; + int64_t decoder_end_time; int64_t conversion_end_time; int64_t compile_end_time; auto ggml_decoder = get_ggml_decoder(cgraph); + decoder_end_time = ggml_time_us(); + auto it = compiled_cache.find(cgraph); if (it != compiled_cache.end()) { model = it->second.first; @@ -147,7 +150,8 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c if (getenv("GGML_OPENVINO_PROFILING")) { GGML_LOG_INFO("GGML OpenVINO Backend: \n"); - GGML_LOG_INFO(" - Graph conversion Time: %ld ms \n", (conversion_end_time - start_time) / 1000); + GGML_LOG_INFO(" - Graph decoder Time: %ld ms \n", (decoder_end_time - start_time) / 1000); + GGML_LOG_INFO(" - Graph conversion Time: %ld ms \n", (conversion_end_time - decoder_end_time) / 1000); GGML_LOG_INFO(" - Graph compile Time: %ld ms \n", (compile_end_time - conversion_end_time) / 1000); GGML_LOG_INFO(" - Graph Input Time: %ld ms \n", (input_end_time - compile_end_time) / 1000); GGML_LOG_INFO(" - Graph Inference Time: %ld ms \n", (infer_end_time - input_end_time) / 1000); From 041d220dfaa642c1ecbf350f9f658b4649634bbd Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Wed, 14 May 2025 14:06:15 +0800 Subject: [PATCH 066/254] FIX: Re-add tensor names in cgraph, Add another case for RESHAPE --- ggml/src/ggml-openvino/ggml-decoder.cpp | 39 +++++++++++++++---- ggml/src/ggml-openvino/ggml-decoder.h | 8 ++-- ggml/src/ggml-openvino/openvino/decoder.hpp | 4 +- .../ggml-openvino/openvino/node_context.hpp | 4 +- ggml/src/ggml-openvino/openvino/op/cont.cpp | 6 ++- ggml/src/ggml-openvino/openvino/op/cpy.cpp | 7 +++- ggml/src/ggml-openvino/openvino/op/mulmat.cpp | 6 ++- .../src/ggml-openvino/openvino/op/reshape.cpp | 21 ++++++++-- .../openvino/translate_session.cpp | 4 -- src/llama-graph.cpp | 12 ++++-- 10 files changed, 77 insertions(+), 34 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 43869ec228..0d612c1819 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -38,6 +38,10 @@ 
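/* when flash attention is enabled the mask is cast to f16; otherwise the f32 mask tensor is used as-is */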
GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgrap printed = true; } + if (getenv("GGML_OPENVINO_DUMP_CGRAPH")) { + dump_cgraph(m_cgraph); + } + set_max_token_len(); for (int node_n = 0; node_n < m_cgraph->n_nodes; node_n++) { auto* cur_node = m_cgraph->nodes[node_n]; @@ -47,10 +51,6 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgrap m_model_weights = model_weights; add_extra_inputs(); - - if (getenv("GGML_OPENVINO_DUMP_CGRAPH")) { - dump_cgraph(m_cgraph); - } } } @@ -142,17 +142,40 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, if (m_node) { switch (node->op) { + case GGML_OP_RESHAPE: { + if (node->ne[0] * node->ne[1] == node->src[0]->ne[0]) { + m_op_case = 1; + } else if (node->src[0]->ne[0] * node->src[0]->ne[1] == node->ne[0]) { + m_op_case = 2; + } + break; + } case GGML_OP_CONT: { - // Currently only two cases, either the input comes from a VIEW which is subtensor or from a PERMUTE - m_continuous = ggml_nelements(node->src[0]) == ggml_nelements(node->src[0]->view_src); + if (ggml_nelements(node->src[0]) == ggml_nelements(node->src[0]->view_src)) { + // The input comes from a PERMUTE + m_op_case = 1; + } else { + // The input comes from a VIEW which is subtensor + m_op_case = 2; + } break; } case GGML_OP_CPY: { - m_continuous = ggml_is_contiguous(node); + if (ggml_is_contiguous(node)) { + // Write K to cache_k + m_op_case = 1; + } else { + // Write V to cache_v + m_op_case = 2; + } break; } case GGML_OP_MUL_MAT: { - m_continuous = node->src[0]->view_src == nullptr; + if (node->src[0]->view_src == nullptr) { + m_op_case = 1; + } else { + m_op_case = 2; + } break; } default: diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index 959e00b65d..b8cc4c4cdf 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -69,8 +69,8 @@ public: return m_outputs.at(name); } - virtual bool check_if_continuous() const override { - return m_continuous; + virtual int get_op_case() const override { + return m_op_case; } virtual const std::map>& get_model_inputs() const override { @@ -110,7 +110,7 @@ private: std::vector m_nodes; std::string m_op_name; mutable std::string m_name; - bool m_continuous; + int m_op_case; std::vector> m_op_node_name; std::map> m_model_inputs; std::map> m_model_extra_inputs; @@ -119,4 +119,4 @@ private: std::vector m_model_output_names; }; -void print_tensor_address_map(const struct ggml_cgraph* cgraph); \ No newline at end of file +void print_tensor_address_map(const struct ggml_cgraph* cgraph); diff --git a/ggml/src/ggml-openvino/openvino/decoder.hpp b/ggml/src/ggml-openvino/openvino/decoder.hpp index 3987760a29..b3cf75817f 100644 --- a/ggml/src/ggml-openvino/openvino/decoder.hpp +++ b/ggml/src/ggml-openvino/openvino/decoder.hpp @@ -49,7 +49,7 @@ public: virtual void visit_subgraph(std::function)> node_visitor) const = 0; - virtual bool check_if_continuous() const = 0; + virtual int get_op_case() const = 0; virtual const std::map>& get_model_inputs() const = 0; virtual const std::map>& get_model_extra_inputs() const = 0; @@ -59,4 +59,4 @@ public: } // namespace ggml } // namespace frontend -} // namespace ov \ No newline at end of file +} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/node_context.hpp b/ggml/src/ggml-openvino/openvino/node_context.hpp index e934e2ac36..44f55222e3 100644 --- a/ggml/src/ggml-openvino/openvino/node_context.hpp +++ b/ggml/src/ggml-openvino/openvino/node_context.hpp @@ -81,8 +81,8 
@@ public: return m_decoder->get_attribute(name); } - bool check_if_continuous() const { - return m_decoder->check_if_continuous(); + int get_op_case() const { + return m_decoder->get_op_case(); } private: diff --git a/ggml/src/ggml-openvino/openvino/op/cont.cpp b/ggml/src/ggml-openvino/openvino/op/cont.cpp index e8e9bf0a4e..a052bf06ca 100644 --- a/ggml/src/ggml-openvino/openvino/op/cont.cpp +++ b/ggml/src/ggml-openvino/openvino/op/cont.cpp @@ -17,11 +17,13 @@ namespace op { OutputVector translate_cont(const NodeContext& context) { num_inputs_check(context, 1, 1); + int op_case = context.get_op_case(); + FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2, "Unsupported CONT case"); + auto src_shape = context.get_input_shape(0).to_shape(); auto dst_shape = context.get_output_shape(0).to_shape(); - bool continuous = context.check_if_continuous(); - if (continuous) { + if (op_case == 1) { // The input comes from a PERMUTE dst_shape[1] = -1; auto result = std::make_shared( diff --git a/ggml/src/ggml-openvino/openvino/op/cpy.cpp b/ggml/src/ggml-openvino/openvino/op/cpy.cpp index 2808d3ee91..4ab1502f81 100644 --- a/ggml/src/ggml-openvino/openvino/op/cpy.cpp +++ b/ggml/src/ggml-openvino/openvino/op/cpy.cpp @@ -22,13 +22,16 @@ namespace op { OutputVector translate_cpy(const NodeContext& context) { num_inputs_check(context, 2, 2); + + int op_case = context.get_op_case(); + FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2, "Unsupported CPY case"); + auto src0 = context.get_input(0); auto src1 = context.get_input(1); auto past_token_len = context.get_input("past_token_len"); auto src0_shape = context.get_input_shape(0).to_shape(); auto output_shape = context.get_output_shape(0).to_shape(); - bool continuous = context.check_if_continuous(); std::vector input0_strides = context.get_input_stride(0); std::vector output_strides = context.get_output_stride(0); @@ -36,7 +39,7 @@ OutputVector translate_cpy(const NodeContext& context) { auto one = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {1}); src0 = std::make_shared(src0, src1); - if (continuous) { + if (op_case == 1) { // Write K to cache_k int64_t head_size = src0_shape[2]; int64_t num_heads = src0_shape[1]; diff --git a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp index 3e9c5c5083..5673551f70 100644 --- a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp +++ b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp @@ -22,8 +22,10 @@ namespace op { OutputVector translate_mulmat(const NodeContext& context) { num_inputs_check(context, 2, 2); - bool continuous = context.check_if_continuous(); - if (continuous) { + int op_case = context.get_op_case(); + FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2, "Unsupported MULMAT case"); + + if (op_case == 1) { auto src0 = context.get_input(0); auto src1 = std::make_shared(context.get_input(1), context.get_input_type(0)); auto result_lp = std::make_shared(src1, src0, false, true); diff --git a/ggml/src/ggml-openvino/openvino/op/reshape.cpp b/ggml/src/ggml-openvino/openvino/op/reshape.cpp index 06b2bd339e..f6586d674c 100644 --- a/ggml/src/ggml-openvino/openvino/op/reshape.cpp +++ b/ggml/src/ggml-openvino/openvino/op/reshape.cpp @@ -1,6 +1,8 @@ #include +#include #include #include +#include #include #include #include @@ -19,11 +21,22 @@ OutputVector translate_reshape(const NodeContext& context) { return {context.get_input(0)}; } + int op_case = context.get_op_case(); + FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2, "Unsupported 
RESHAPE case"); + auto output_shape = context.get_output_shape(0).to_shape(); - auto new_shape_node = - ov::op::v0::Constant::create(ov::element::i64, - {3}, - std::vector{-1, (int64_t)output_shape[1], (int64_t)output_shape[2]}); + std::shared_ptr new_shape_node; + if (op_case == 1) { + new_shape_node = + ov::op::v0::Constant::create(ov::element::i64, + {3}, + std::vector{-1, (int64_t)output_shape[1], (int64_t)output_shape[2]}); + } else { + new_shape_node = + ov::op::v0::Constant::create(ov::element::i64, + {3}, + std::vector{(int64_t)output_shape[0], -1, (int64_t)output_shape[2]}); + } Output res = std::make_shared(context.get_input(0), new_shape_node, false); return {res}; } diff --git a/ggml/src/ggml-openvino/openvino/translate_session.cpp b/ggml/src/ggml-openvino/openvino/translate_session.cpp index 012e9178c6..910a0d8336 100644 --- a/ggml/src/ggml-openvino/openvino/translate_session.cpp +++ b/ggml/src/ggml-openvino/openvino/translate_session.cpp @@ -31,10 +31,6 @@ std::shared_ptr TranslateSession::translate_graph(const frontend::InputMo const auto& ggml_model = std::dynamic_pointer_cast(input_model); std::shared_ptr ggml_model_decoder = ggml_model->get_model_decoder(); - FRONT_END_GENERAL_CHECK(ggml_model, "nullptr for InputModel is given for translation into OV Model"); - const auto& model_inputs = ggml_model->get_inputs(); - const auto& model_outputs = ggml_model->get_outputs(); - for (const auto& it : ggml_model_decoder->get_model_inputs()) { params.push_back(std::dynamic_pointer_cast(it.second)); (*tensor_map)[it.first] = it.second; diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index 944c7e53bd..d4a25ab59b 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -1275,7 +1275,7 @@ ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const { if (ubatch.token) { inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens); - //cb(inp->tokens, "inp_tokens", -1); + cb(inp->tokens, "inp_tokens", -1); ggml_set_input(inp->tokens); res->t_tokens = inp->tokens; @@ -1327,6 +1327,7 @@ ggml_tensor * llm_graph_context::build_inp_pos() const { auto & cur = inp->pos; cur = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, (int64_t)n_tokens*hparams.n_pos_per_embd()); + cb(cur, "inp_pos", -1); ggml_set_input(cur); res->add_input(std::move(inp)); @@ -1362,6 +1363,7 @@ ggml_tensor * llm_graph_context::build_inp_out_ids() const { auto & cur = inp->out_ids; cur = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_outputs); + cb(cur, "inp_out_ids", -1); ggml_set_input(cur); res->add_input(std::move(inp)); @@ -1603,6 +1605,7 @@ llm_graph_input_attn_no_cache * llm_graph_context::build_attn_inp_no_cache() con // note: there is no KV cache, so the number of KV values is equal to the number of tokens in the batch inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens, 1, 1); + cb(inp->self_kq_mask, "KQ_mask", -1); ggml_set_input(inp->self_kq_mask); inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask; @@ -1661,7 +1664,7 @@ ggml_tensor * llm_graph_context::build_attn( } if (wo_b) { - //cb(cur, "kqv_wo", il); + cb(cur, "kqv_wo", il); } if (wo_b) { @@ -1691,6 +1694,7 @@ static std::unique_ptr build_attn_inp_kv_impl( inp->self_v_idxs = mctx_cur->build_input_v_idxs(ctx0, ubatch); inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream); + cb(inp->self_kq_mask, "KQ_mask", -1); ggml_set_input(inp->self_kq_mask); inp->self_kq_mask_cnv = cparams.flash_attn ? 
ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask; @@ -1818,7 +1822,7 @@ ggml_tensor * llm_graph_context::build_attn( } if (wo_b) { - //cb(cur, "kqv_wo", il); + cb(cur, "kqv_wo", il); } if (wo_b) { @@ -1873,7 +1877,7 @@ ggml_tensor * llm_graph_context::build_attn( } if (wo_b) { - //cb(cur, "kqv_wo", il); + cb(cur, "kqv_wo", il); } if (wo_b) { From c57f61494a42516759f327057b34d03a254b08a5 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Wed, 14 May 2025 17:48:20 +0800 Subject: [PATCH 067/254] FIX: input shape of KQ_mask --- ggml/src/ggml-openvino/ggml-decoder.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 0d612c1819..fd56900728 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -112,8 +112,8 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, if (std::string(src->name) == "inp_tokens" || std::string(src->name) == "inp_pos") { input_shape = ov::PartialShape{1, 1, ov::Dimension(1, m_max_token_len)}; } else if (std::string(src->name).find("KQ_mask") == 0) { - input_shape = - ov::PartialShape{1, ov::Dimension(1, m_max_token_len), ov::Dimension(1, m_max_token_len)}; + auto max_token_len = GGML_PAD(m_max_token_len, GGML_KQ_MASK_PAD); + input_shape = ov::PartialShape{1, ov::Dimension(1, max_token_len), ov::Dimension(1, max_token_len)}; } else { input_shape = ov::Shape{get_shape(src)}; } @@ -187,9 +187,9 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, void GgmlOvDecoder::set_max_token_len() { for (int i = 0; i < m_cgraph->n_nodes; i++) { auto* node = m_cgraph->nodes[i]; - if (std::string(node->name) == "v-0") { - auto* cache_v = node->src[0]; - m_max_token_len = cache_v->ne[0] / node->ne[1] / node->ne[2]; + if (std::string(node->name) == "k-0") { + auto* cache_k = node->src[0]; + m_max_token_len = cache_k->ne[0] / node->ne[0] / node->ne[1]; break; } } From a30dc6e7267688c5fbc37c645d4b45ab88e1e4fa Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Wed, 14 May 2025 17:48:56 +0800 Subject: [PATCH 068/254] PERF: add weight constant in parallel --- ggml/src/ggml-openvino/ggml-decoder.cpp | 45 +++++++++++++++++++++++++ ggml/src/ggml-openvino/ggml-decoder.h | 2 ++ 2 files changed, 47 insertions(+) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index fd56900728..a8e1ad5556 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -3,9 +3,11 @@ #include #include +#include #include #include #include +#include #include #include #include @@ -42,6 +44,12 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgrap dump_cgraph(m_cgraph); } + static bool weight_created = false; + if (!getenv("GGML_OPENVINO_WEIGHT_AS_INPUT") && !weight_created) { + add_weight_const_parallel(model_weights); + weight_created = true; + } + set_max_token_len(); for (int node_n = 0; node_n < m_cgraph->n_nodes; node_n++) { auto* cur_node = m_cgraph->nodes[node_n]; @@ -235,6 +243,43 @@ void GgmlOvDecoder::add_extra_inputs() { } } +void GgmlOvDecoder::add_weight_const_parallel(std::map>& model_weights) { + static std::mutex weights_mutex; + auto* nodes = m_cgraph->nodes; + auto n_nodes = m_cgraph->n_nodes; + std::for_each(std::execution::par, nodes, nodes + n_nodes, [&](ggml_tensor* node) { + for (int i = 0; i < GGML_MAX_SRC; i++) { + auto* src = node->src[i]; + if (src == nullptr) { + continue; + } + + std::string 
src_name(src->name); + if (!src->view_src) { + ggml_backend_buffer* buffer = src->buffer; + if (buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) { + bool should_create = false; + { + std::lock_guard lock(weights_mutex); + if (model_weights.find(src_name) == model_weights.end()) { + model_weights[src_name] = nullptr; + should_create = true; + } + } + if (should_create) { + auto weight_node = create_weight_node(src); + weight_node->set_friendly_name(src_name); + { + std::lock_guard lock(weights_mutex); + model_weights[src_name] = weight_node; + } + } + } + } + } + }); +} + std::shared_ptr GgmlOvDecoder::create_weight_node(ggml_tensor* tensor) { std::shared_ptr weight_node; auto node_type = get_ov_type(tensor); diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index b8cc4c4cdf..4d4a928121 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -101,6 +101,8 @@ private: void set_max_token_len(); int64_t m_max_token_len; + void add_weight_const_parallel(std::map>& model_weights); + struct ggml_cgraph* m_cgraph; std::map m_inputs; std::vector m_input_names; From 8ac5c225aa26a4a58a9d296842524be2f3e756a5 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Fri, 16 May 2025 10:12:22 +0800 Subject: [PATCH 069/254] FIX: set_max_token_len --- ggml/src/ggml-openvino/ggml-decoder.cpp | 5 +++-- ggml/src/ggml-openvino/utils.cpp | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index a8e1ad5556..e6474d6def 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -44,13 +44,14 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgrap dump_cgraph(m_cgraph); } + set_max_token_len(); + static bool weight_created = false; if (!getenv("GGML_OPENVINO_WEIGHT_AS_INPUT") && !weight_created) { add_weight_const_parallel(model_weights); weight_created = true; } - set_max_token_len(); for (int node_n = 0; node_n < m_cgraph->n_nodes; node_n++) { auto* cur_node = m_cgraph->nodes[node_n]; m_nodes.push_back(cur_node); @@ -197,7 +198,7 @@ void GgmlOvDecoder::set_max_token_len() { auto* node = m_cgraph->nodes[i]; if (std::string(node->name) == "k-0") { auto* cache_k = node->src[0]; - m_max_token_len = cache_k->ne[0] / node->ne[0] / node->ne[1]; + m_max_token_len = cache_k->ne[0] / node->ne[0] / node->ne[2]; break; } } diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 09bf0d0ac5..040ca1961e 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -209,4 +209,4 @@ void print_output_tensor_info(const std::string& name, default: break; } -} \ No newline at end of file +} From d7cc80229259279b1474f9b66a85489232360728 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Fri, 16 May 2025 10:14:05 +0800 Subject: [PATCH 070/254] PERF: use Slice+Concat in writing cache_v --- ggml/src/ggml-openvino/openvino/op/cpy.cpp | 62 +++++++++++----------- 1 file changed, 32 insertions(+), 30 deletions(-) diff --git a/ggml/src/ggml-openvino/openvino/op/cpy.cpp b/ggml/src/ggml-openvino/openvino/op/cpy.cpp index 4ab1502f81..0c4a3d1558 100644 --- a/ggml/src/ggml-openvino/openvino/op/cpy.cpp +++ b/ggml/src/ggml-openvino/openvino/op/cpy.cpp @@ -1,13 +1,17 @@ +#include #include #include #include #include +#include #include +#include #include #include #include #include #include +#include #include #include #include @@ -64,42 +68,40 @@ 
OutputVector translate_cpy(const NodeContext& context) { } else { // Write V to cache_v int64_t total_head_size = src0_shape[1]; + auto total_head_size_node = ov::op::v0::Constant::create(ov::element::i64, {1}, {total_head_size}); - auto reshaped_src0 = std::make_shared( - src0, - ov::op::v0::Constant::create(ov::element::i64, {2}, std::vector{total_head_size, -1}), - false); - auto transposed_src0 = - std::make_shared(reshaped_src0, - ov::op::v0::Constant::create(ov::element::i64, {2}, {1, 0})); + auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); + auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); + + auto token_len = get_dimensions(src0.get_node_shared_ptr(), {2}); + past_token_len = std::make_shared(past_token_len, zero); + auto total_token_len = std::make_shared(past_token_len, token_len); auto reshaped_src1 = std::make_shared( src1, - ov::op::v0::Constant::create(ov::element::i64, {2}, std::vector{total_head_size, -1}), - false); - auto transposed_src1 = - std::make_shared(reshaped_src1, - ov::op::v0::Constant::create(ov::element::i64, {2}, {1, 0})); - - auto token_len = get_dimensions(src0.get_node_shared_ptr(), {2}); - token_len = std::make_shared(token_len, - ov::op::v0::Constant::create(ov::element::i64, {0}, {}), - false); - auto total_token_len = std::make_shared(past_token_len, token_len); - std::shared_ptr indices = - std::make_shared(past_token_len, total_token_len, one, ov::element::i64); - indices = std::make_shared( - indices, - ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector{1})); - - auto res = std::make_shared(transposed_src1, indices, transposed_src0); - auto transposed_res = - std::make_shared(res, ov::op::v0::Constant::create(ov::element::i64, {2}, {1, 0})); - auto reshaped_res = std::make_shared( - transposed_res, ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector{1, total_head_size, -1}), false); - return {reshaped_res}; + + auto src1_left = std::make_shared( + reshaped_src1, + ov::op::v0::Constant::create(ov::element::i64, {3}, {0, 0, 0}), + std::make_shared(ov::OutputVector{one, total_head_size_node, past_token_len}, 0), + ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 1, 1})); + + auto src1_right = std::make_shared( + reshaped_src1, + std::make_shared(ov::OutputVector{zero, zero, total_token_len}, 0), + ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector{1, total_head_size, INT_MAX}), + ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 1, 1})); + + auto reshaped_src0 = std::make_shared( + src0, + ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector{1, total_head_size, -1}), + false); + + auto res = std::make_shared(ov::OutputVector{src1_left, reshaped_src0, src1_right}, 2); + + return {res}; } } From fd324366d04e1cd39ae7b39646751dfabaadd925 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Tue, 20 May 2025 10:38:15 +0800 Subject: [PATCH 071/254] Update build doc --- docs/build.md | 47 +++++++++++++++++++++-------------------------- 1 file changed, 21 insertions(+), 26 deletions(-) diff --git a/docs/build.md b/docs/build.md index bb7c4137a5..a6028035c5 100644 --- a/docs/build.md +++ b/docs/build.md @@ -683,33 +683,30 @@ To read documentation for how to build on IBM Z & LinuxONE, [click here](./build ## OPENVINO -### Build openvino-llama +### Build openvino - ```bash - git lfs install --skip-smudge - git clone https://github.com/intel-sandbox/openvino-llama.git -b dev_ggml_frontend - cd openvino-llama - git submodule update --init --recursive +```bash +git 
clone https://github.com/openvinotoolkit/openvino.git
+cd openvino
+git submodule update --init --recursive
+export OPENVINO_DIR=$(pwd)
-  export OPENVINO_LLAMA_PATH=$(pwd)
-  ```
+sudo ./install_build_dependencies.sh
-  Before building, change "ENABLE_OV_GGML_FRONTEND" from true to false in the CMakePresets.json file since we already have the code from the ov side in this branch of llama.cpp (`full_backend`). You could also build the master branch of ov instead.
-
-  ```
-  cmake --preset Release
-  cmake --build build/Release
-  ```
+mkdir -p build/Release && cd build/Release
+cmake -DCMAKE_BUILD_TYPE=Release -DENABLE_DEBUG_CAPS=ON ../..
+```

 ### Build llama.cpp-ov

-  ```bash
-  git clone https://github.com/intel-sandbox/llama.cpp-ov.git -b full_backend
-  cd llama.cpp-ov
+```bash
+git clone https://github.com/intel-sandbox/llama.cpp-ov.git
+cd llama.cpp-ov
+git switch dev_backend_openvino

-  cmake --preset ReleaseOV
-  cmake --build build/ReleaseOV
-  ```
+cmake --preset ReleaseOV
+cmake --build build/ReleaseOV
+```

 Download the test model file [Phi-3-mini-4k-instruct-fp16.gguf](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf) from the Hugging Face website.
 ``` bash
 wget https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf/resolve/main/Phi-3-mini-4k-instruct-fp16.gguf?download=true -O ~/models/Phi-3-mini-4k-instruct-gguf/Phi-3-mini-4k-instruct-fp16.gguf
 ```

 Execute the following command to test.
-  ```bash
-  export GGML_OPENVINO_CACHE_DIR=/tmp/ov_cache
-  # Currently GGML_OPENVINO_WEIGHT_AS_INPUT has better performance
-  export GGML_OPENVINO_WEIGHT_AS_INPUT=1
-  ./build/ReleaseOV/bin/llama-simple -m ~/models/Phi-3-mini-4k-instruct-gguf/Phi-3-mini-4k-instruct-fp16.gguf -n 10 "Hello, my name is "
-  ```
+```bash
+export GGML_OPENVINO_CACHE_DIR=/tmp/ov_cache
+./build/ReleaseOV/bin/llama-simple -m ~/models/Phi-3-mini-4k-instruct-gguf/Phi-3-mini-4k-instruct-fp16.gguf -n 10 "Hello, my name is "
+```

 Environment variables:
 - GGML_OPENVINO_WEIGHT_AS_INPUT:
   Pass the weights as input to the OpenVINO model instead of creating Constant nodes for them.
 - GGML_OPENVINO_CACHE_DIR:
   If set, model caching in OpenVINO will be used.
 - GGML_OPENVINO_DUMP_CGRAPH:
   Dump the compute graph to "cgraph.txt". Note that the compute graph is different for every token, so the later cgraph will overwrite the previous one.
 - GGML_OPENVINO_PROFILING:
   Print the time taken for each phase in the OpenVINO backend.
 - GGML_OPENVINO_DUMP_IR:
   Dump the converted OpenVINO IR. The filenames are timestamps.
return {add}; + auto res = std::make_shared(context.get_input(0), context.get_input(1)); + + return rename_outputs_with_suffix({res}, context.get_name()); } } // namespace op diff --git a/ggml/src/ggml-openvino/openvino/op/cont.cpp b/ggml/src/ggml-openvino/openvino/op/cont.cpp index a052bf06ca..7cdfba051e 100644 --- a/ggml/src/ggml-openvino/openvino/op/cont.cpp +++ b/ggml/src/ggml-openvino/openvino/op/cont.cpp @@ -22,16 +22,15 @@ OutputVector translate_cont(const NodeContext& context) { auto src_shape = context.get_input_shape(0).to_shape(); auto dst_shape = context.get_output_shape(0).to_shape(); + ov::Output res; if (op_case == 1) { // The input comes from a PERMUTE dst_shape[1] = -1; - auto result = std::make_shared( + res = std::make_shared( context.get_input(0), ov::op::v0::Constant::create(ov::element::i64, {dst_shape.size()}, dst_shape), false); - - return {result}; } else { // The input comes from a VIEW // Currently all cases are slicing at lowest dim @@ -43,13 +42,13 @@ OutputVector translate_cont(const NodeContext& context) { std::vector end = {(int64_t)src_shape[0], INT_MAX, split_addr + (int64_t)src_shape[2]}; std::vector strides = {1, 1, 1}; - auto begin_const = ov::op::v0::Constant::create(ov::element::i64, {begin.size()}, begin); + auto begin_const = ov::op::v0::Constant::create(element::i64, {begin.size()}, begin); auto end_const = ov::op::v0::Constant::create(ov::element::i64, {end.size()}, end); auto strides_const = ov::op::v0::Constant::create(ov::element::i64, {strides.size()}, strides); - auto slice = std::make_shared(context.get_input(0), begin_const, end_const, strides_const); - - return {slice}; + res = std::make_shared(context.get_input(0), begin_const, end_const, strides_const); } + + return rename_outputs_with_suffix({res}, context.get_name()); } } // namespace op diff --git a/ggml/src/ggml-openvino/openvino/op/cpy.cpp b/ggml/src/ggml-openvino/openvino/op/cpy.cpp index 0c4a3d1558..7cdeddce38 100644 --- a/ggml/src/ggml-openvino/openvino/op/cpy.cpp +++ b/ggml/src/ggml-openvino/openvino/op/cpy.cpp @@ -33,6 +33,7 @@ OutputVector translate_cpy(const NodeContext& context) { auto src0 = context.get_input(0); auto src1 = context.get_input(1); auto past_token_len = context.get_input("past_token_len"); + ov::Output res; auto src0_shape = context.get_input_shape(0).to_shape(); auto output_shape = context.get_output_shape(0).to_shape(); @@ -63,8 +64,7 @@ OutputVector translate_cpy(const NodeContext& context) { indices, ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector{1})); - auto res = std::make_shared(reshaped_src1, indices, src0); - return {res}; + res = std::make_shared(reshaped_src1, indices, src0); } else { // Write V to cache_v int64_t total_head_size = src0_shape[1]; @@ -99,10 +99,10 @@ OutputVector translate_cpy(const NodeContext& context) { ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector{1, total_head_size, -1}), false); - auto res = std::make_shared(ov::OutputVector{src1_left, reshaped_src0, src1_right}, 2); - - return {res}; + res = std::make_shared(ov::OutputVector{src1_left, reshaped_src0, src1_right}, 2); } + + return rename_outputs_with_suffix({res}, context.get_name()); } } // namespace op diff --git a/ggml/src/ggml-openvino/openvino/op/get_rows.cpp b/ggml/src/ggml-openvino/openvino/op/get_rows.cpp index 64fc57bd88..ca36548d9f 100644 --- a/ggml/src/ggml-openvino/openvino/op/get_rows.cpp +++ b/ggml/src/ggml-openvino/openvino/op/get_rows.cpp @@ -30,7 +30,7 @@ OutputVector translate_get_rows(const NodeContext& context) { res = 
std::make_shared(res, context.get_output_type(0)); } - return {res}; + return rename_outputs_with_suffix({res}, context.get_name()); } } // namespace op diff --git a/ggml/src/ggml-openvino/openvino/op/mul.cpp b/ggml/src/ggml-openvino/openvino/op/mul.cpp index 14473f4e27..40caf4331e 100644 --- a/ggml/src/ggml-openvino/openvino/op/mul.cpp +++ b/ggml/src/ggml-openvino/openvino/op/mul.cpp @@ -12,7 +12,7 @@ OutputVector translate_mul(const NodeContext& context) { num_inputs_check(context, 2, 2); auto res = std::make_shared(context.get_input(0), context.get_input(1)); - return {res}; + return rename_outputs_with_suffix({res}, context.get_name()); } } // namespace op diff --git a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp index 5673551f70..06e7d9ece0 100644 --- a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp +++ b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp @@ -25,12 +25,13 @@ OutputVector translate_mulmat(const NodeContext& context) { int op_case = context.get_op_case(); FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2, "Unsupported MULMAT case"); + ov::Output res; + if (op_case == 1) { auto src0 = context.get_input(0); auto src1 = std::make_shared(context.get_input(1), context.get_input_type(0)); auto result_lp = std::make_shared(src1, src0, false, true); - auto result = std::make_shared(result_lp, context.get_output_type(0)); - return {result}; + res = std::make_shared(result_lp, context.get_output_type(0)); } else { /* Two cases here: @@ -118,10 +119,10 @@ OutputVector translate_mulmat(const NodeContext& context) { } auto result_lp = std::make_shared(A, B, false, true); - auto result = std::make_shared(result_lp, context.get_output_type(0)); - - return {result}; + res = std::make_shared(result_lp, context.get_output_type(0)); } + + return rename_outputs_with_suffix({res}, context.get_name()); } } // namespace op diff --git a/ggml/src/ggml-openvino/openvino/op/permute.cpp b/ggml/src/ggml-openvino/openvino/op/permute.cpp index 478c9430f0..649cf8f3e1 100644 --- a/ggml/src/ggml-openvino/openvino/op/permute.cpp +++ b/ggml/src/ggml-openvino/openvino/op/permute.cpp @@ -15,7 +15,7 @@ OutputVector translate_permute(const NodeContext& context) { auto perm = argsort_descend(context.get_output_stride(0)); auto res = std::make_shared(context.get_input(0), ov::op::v0::Constant::create(ov::element::i64, {3}, perm)); - return {res}; + return rename_outputs_with_suffix({res}, context.get_name()); } } // namespace op diff --git a/ggml/src/ggml-openvino/openvino/op/reshape.cpp b/ggml/src/ggml-openvino/openvino/op/reshape.cpp index f6586d674c..49551eb815 100644 --- a/ggml/src/ggml-openvino/openvino/op/reshape.cpp +++ b/ggml/src/ggml-openvino/openvino/op/reshape.cpp @@ -37,8 +37,8 @@ OutputVector translate_reshape(const NodeContext& context) { {3}, std::vector{(int64_t)output_shape[0], -1, (int64_t)output_shape[2]}); } - Output res = std::make_shared(context.get_input(0), new_shape_node, false); - return {res}; + auto res = std::make_shared(context.get_input(0), new_shape_node, false); + return rename_outputs_with_suffix({res}, context.get_name()); } } // namespace op diff --git a/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp b/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp index a91fffb72d..7b8b582dac 100644 --- a/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp +++ b/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp @@ -39,7 +39,7 @@ OutputVector translate_rms_norm(const NodeContext& context) { auto res = std::make_shared(input_node, scale); - return 
{res}; + return rename_outputs_with_suffix({res}, context.get_name()); } } // namespace op diff --git a/ggml/src/ggml-openvino/openvino/op/rope.cpp b/ggml/src/ggml-openvino/openvino/op/rope.cpp index aad156082e..94810e549d 100644 --- a/ggml/src/ggml-openvino/openvino/op/rope.cpp +++ b/ggml/src/ggml-openvino/openvino/op/rope.cpp @@ -52,6 +52,8 @@ void ggml_rope_yarn_corr_dims(int n_dims, OutputVector translate_rope(const NodeContext& context) { num_inputs_check(context, 2, 3); + ov::Output res; + auto data_node = context.get_input(0); auto pos_node = context.get_input(1); pos_node = std::make_shared(pos_node, ov::element::f32); @@ -141,9 +143,7 @@ OutputVector translate_rope(const NodeContext& context) { ov::element::i64, Shape{3}, std::vector{-1, input_shape[1].get_length(), input_shape[2].get_length()}); - auto reshaped = std::make_shared(stack, shape_const, false); - - return {reshaped}; + res = std::make_shared(stack, shape_const, false); } else { auto slice_node = std::make_shared(data_node, @@ -160,9 +160,10 @@ OutputVector translate_rope(const NodeContext& context) { std::make_shared(slice_data_node_0, sin_theta_node), std::make_shared(slice_data_node_1, cos_theta_node)); - auto res_node = std::make_shared(ov::OutputVector{first_half_node, second_half_node}, 2); - return {res_node}; + res = std::make_shared(ov::OutputVector{first_half_node, second_half_node}, 2); } + + return rename_outputs_with_suffix({res}, context.get_name()); } } // namespace op diff --git a/ggml/src/ggml-openvino/openvino/op/scale.cpp b/ggml/src/ggml-openvino/openvino/op/scale.cpp index b393dd8aa2..8f0999432c 100644 --- a/ggml/src/ggml-openvino/openvino/op/scale.cpp +++ b/ggml/src/ggml-openvino/openvino/op/scale.cpp @@ -19,7 +19,7 @@ OutputVector translate_scale(const NodeContext& context) { auto res = std::make_shared(context.get_input(0), scale_node); - return {res}; + return rename_outputs_with_suffix({res}, context.get_name()); } } // namespace op diff --git a/ggml/src/ggml-openvino/openvino/op/soft_max.cpp b/ggml/src/ggml-openvino/openvino/op/soft_max.cpp index 549c35a9b6..bb6b002395 100644 --- a/ggml/src/ggml-openvino/openvino/op/soft_max.cpp +++ b/ggml/src/ggml-openvino/openvino/op/soft_max.cpp @@ -24,6 +24,7 @@ OutputVector translate_soft_max(const NodeContext& context) { num_inputs_check(context, 1, 2); auto input_node = context.get_input(0); + ov::Output res; float scale = 1.0f; float max_bias = 0.0f; @@ -56,13 +57,13 @@ OutputVector translate_soft_max(const NodeContext& context) { } // Stride slice mask node - Output mask_begin_node = ov::op::v0::Constant::create(ov::element::i64, Shape{3}, {0, 0, 0}); + Output slice_start = ov::op::v0::Constant::create(ov::element::i64, Shape{3}, {0, 0, 0}); auto one = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {1}); - auto input_last_two_dim = get_dimensions(input_node.get_node_shared_ptr(), {1, 2}); - auto mask_slice_shape = std::make_shared(ov::NodeVector{one, input_last_two_dim}, 0); - Output mask_stride_node = ov::op::v0::Constant::create(ov::element::i64, Shape{3}, {1, 1, 1}); - auto mask_node_sliced = - std::make_shared(mask_node, mask_begin_node, mask_slice_shape, mask_stride_node); + auto token_len = get_dimensions(input_node.get_node_shared_ptr(), {1}); + auto total_token_len = get_dimensions(mask_node.get_node_shared_ptr(), {2}); + auto slice_end = std::make_shared(ov::NodeVector{one, token_len, total_token_len}, 0); + Output slice_stride = ov::op::v0::Constant::create(ov::element::i64, Shape{3}, {1, 1, 1}); + auto mask_node_sliced = 
std::make_shared(mask_node, slice_start, slice_end, slice_stride); // slope * mask auto slope_node = @@ -73,13 +74,13 @@ OutputVector translate_soft_max(const NodeContext& context) { auto input_slope_mask_node = std::make_shared(input_node, slope_mask_node); // Calculate softmax - auto res = std::make_shared(input_slope_mask_node, 2); - return {res}; + res = std::make_shared(input_slope_mask_node, 2); } else { // Directly softmax - auto res = std::make_shared(input_node, 0); - return {res}; + res = std::make_shared(input_node, 0); } + + return rename_outputs_with_suffix({res}, context.get_name()); } } // namespace op diff --git a/ggml/src/ggml-openvino/openvino/op/transpose.cpp b/ggml/src/ggml-openvino/openvino/op/transpose.cpp index 7d33ca9d61..99178a1944 100644 --- a/ggml/src/ggml-openvino/openvino/op/transpose.cpp +++ b/ggml/src/ggml-openvino/openvino/op/transpose.cpp @@ -14,7 +14,7 @@ OutputVector translate_transpose(const NodeContext& context) { auto perm = argsort_descend(context.get_output_stride(0)); auto res = std::make_shared(context.get_input(0), ov::op::v0::Constant::create(ov::element::i64, {3}, perm)); - return {res}; + return rename_outputs_with_suffix({res}, context.get_name()); } } // namespace op diff --git a/ggml/src/ggml-openvino/openvino/op/unary_silu.cpp b/ggml/src/ggml-openvino/openvino/op/unary_silu.cpp index 1c396e6aaf..6c73653ca4 100644 --- a/ggml/src/ggml-openvino/openvino/op/unary_silu.cpp +++ b/ggml/src/ggml-openvino/openvino/op/unary_silu.cpp @@ -17,7 +17,7 @@ OutputVector translate_unary_silu(const NodeContext& context) { auto sigmoid = std::make_shared(input); auto res = std::make_shared(input, sigmoid); - return {res}; + return rename_outputs_with_suffix({res}, context.get_name()); } } // namespace op diff --git a/ggml/src/ggml-openvino/openvino/translate_session.cpp b/ggml/src/ggml-openvino/openvino/translate_session.cpp index 910a0d8336..8eda23c1c5 100644 --- a/ggml/src/ggml-openvino/openvino/translate_session.cpp +++ b/ggml/src/ggml-openvino/openvino/translate_session.cpp @@ -1,5 +1,8 @@ #include "translate_session.hpp" +#include +#include + #include "input_model.hpp" namespace ov { @@ -91,11 +94,18 @@ std::shared_ptr TranslateSession::translate_graph(const frontend::InputMo used_params.push_back(param); } } - if (auto diff = params.size() - used_params.size()) { - std::cout << diff << " parameters are not used in the model." << std::endl; + if (getenv("GGML_OPENVINO_PROFILING")) { + if (auto diff = params.size() - used_params.size()) { + std::cout << diff << " parameters are not used in the model." 
<< std::endl;
+        }
     }
     resulting_model = std::make_shared(results, used_params);
+    ov::pass::Manager manager;
+    manager.set_per_pass_validation(true);
+    manager.register_pass();
+    manager.run_passes(resulting_model);
+
     return resulting_model;
 }

diff --git a/ggml/src/ggml-openvino/openvino/utils.cpp b/ggml/src/ggml-openvino/openvino/utils.cpp
index ff16e9d4ae..69e26f05ca 100644
--- a/ggml/src/ggml-openvino/openvino/utils.cpp
+++ b/ggml/src/ggml-openvino/openvino/utils.cpp
@@ -47,6 +47,17 @@ std::shared_ptr get_dimensions(const std::shared_ptr& node,
     return get_dimensions(std::make_shared(node), dims);
 }

+OutputVector rename_outputs_with_suffix(const OutputVector& outputs, const std::string& suffix) {
+    for (const auto& output : outputs) {
+        auto node = output.get_node_shared_ptr();
+        std::string name = node->get_friendly_name();
+        name += "_";
+        name += suffix;
+        node->set_friendly_name(name);
+    }
+    return outputs;
+}
+
 } // namespace ggml
 } // namespace frontend
 } // namespace ov
diff --git a/ggml/src/ggml-openvino/openvino/utils.hpp b/ggml/src/ggml-openvino/openvino/utils.hpp
index 6e106fa932..e0fe250789 100644
--- a/ggml/src/ggml-openvino/openvino/utils.hpp
+++ b/ggml/src/ggml-openvino/openvino/utils.hpp
@@ -55,6 +55,8 @@ std::vector permute(const std::vector& x, const std::vector& perm) {
 std::shared_ptr get_dimensions(const std::shared_ptr& shape, const std::vector& dims);
 std::shared_ptr get_dimensions(const std::shared_ptr& node, const std::vector& dims);

+OutputVector rename_outputs_with_suffix(const OutputVector& outputs, const std::string& suffix);
+
 namespace op {
 template
 OutputVector translate_1to1_match_2_inputs(const NodeContext& context) {

From 3051d5ae07f24a2eb69cd81db72208d1e83fe25a Mon Sep 17 00:00:00 2001
From: Ravi Panchumarthy
Date: Wed, 28 May 2025 18:32:18 -0700
Subject: [PATCH 073/254] Update openvino build instructions

---
 docs/build.md | 131 ++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 95 insertions(+), 36 deletions(-)

diff --git a/docs/build.md b/docs/build.md
index a6028035c5..d2dea5a572 100644
--- a/docs/build.md
+++ b/docs/build.md
@@ -13,6 +13,21 @@ cd llama.cpp

 The following sections describe how to build with different backends and options.

+* [CPU Build](#cpu-build)
+* [BLAS Build](#blas-build)
+* [Metal Build](#metal-build)
+* [SYCL](#sycl)
+* [CUDA](#cuda)
+* [MUSA](#musa)
+* [HIP](#hip)
+* [Vulkan](#vulkan)
+* [CANN](#cann)
+* [Arm® KleidiAI™](#arm-kleidiai)
+* [OpenCL](#opencl)
+* [Android](#android-1)
+* [OPENVINO](#openvino)
+* [Notes about GPU-accelerated backends](#notes-about-gpu-accelerated-backends)
+
 ## CPU Build

 Build llama.cpp using `CMake`:
@@ -683,62 +698,106 @@ To read documentation for how to build on IBM Z & LinuxONE, [click here](./build

 ## OPENVINO

-### Build openvino
+[OpenVINO](https://docs.openvino.ai/2025/index.html) is an open-source toolkit for optimizing and deploying performant AI inference, specifically designed for Intel hardware including CPUs, GPUs, and NPUs in the cloud, on-prem, and on the edge alike. The OpenVINO backend enhances performance by leveraging hardware-specific optimizations and can be enabled for use with llama.cpp.
+Follow the instructions below to install the OpenVINO Runtime and build llama.cpp with OpenVINO support.
+
+### 1. Install OpenVINO Runtime
+
+- Follow the guide to install OpenVINO Runtime from an archive file: **[Install OpenVINO™ Runtime on Linux from an Archive File.](https://docs.openvino.ai/2025/get-started/install-openvino/install-openvino-archive-linux.html)**
+
+- After installation, make sure to [source the environment setup script](https://docs.openvino.ai/2025/get-started/install-openvino/install-openvino-archive-linux.html#step-2-configure-the-environment):

 ```bash
-git clone https://github.com/openvinotoolkit/openvino.git
-cd openvino
-git submodule update --init --recursive
-export OPENVINO_DIR=$(pwd)
-
-sudo ./install_build_dependencies.sh
-
-mkdir -p build/Release && cd build/Release
-cmake -DCMAKE_BUILD_TYPE=Release -DENABLE_DEBUG_CAPS=ON ../..
+source /opt/intel/openvino_2025.1.0/setupvars.sh
+```
+- Verify OpenVINO is initialized properly:
+```bash
+echo $OpenVINO_DIR
 ```

-### Build llama.cpp-ov
+### 2. Build llama.cpp with OpenVINO Backend
+
+Clone the OpenVINO-enabled llama.cpp fork and build it:

 ```bash
-git clone https://github.com/intel-sandbox/llama.cpp-ov.git
-cd llama.cpp-ov
+git clone https://github.com/ravi9/llama.cpp.git
+cd llama.cpp
 git switch dev_backend_openvino

+# Build with OpenVINO support
 cmake --preset ReleaseOV
-cmake --build build/ReleaseOV
+cmake --build build/ReleaseOV --parallel
+
 ```

-Download the test model file [Phi-3-mini-4k-instruct-fp16.gguf](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf) from the Hugging Face website.
- ``` bash
- wget https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf/resolve/main/Phi-3-mini-4k-instruct-fp16.gguf?download=true -O ~/models/Phi-3-mini-4k-instruct-gguf/Phi-3-mini-4k-instruct-fp16.gguf
- ```
+### 3. Download Sample Model
+
+Download the Phi-3 mini model for testing:
+
+```bash
+# Create models directory
+mkdir -p ~/models/Phi-3-mini-4k-instruct-gguf
+
+# Download model file
+wget https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf/resolve/main/Phi-3-mini-4k-instruct-fp16.gguf \
+  -O ~/models/Phi-3-mini-4k-instruct-gguf/Phi-3-mini-4k-instruct-fp16.gguf
+
+```
+
+### 4. Run Inference with OpenVINO Backend
+
+When using the OpenVINO backend, the first inference token may have slightly higher latency due to on-the-fly conversion to the OpenVINO graph. Subsequent tokens and runs will be faster.

-Execute the following command to test.
 ```bash
 export GGML_OPENVINO_CACHE_DIR=/tmp/ov_cache
+
+./build/ReleaseOV/bin/llama-simple \
+  -m ~/models/Phi-3-mini-4k-instruct-gguf/Phi-3-mini-4k-instruct-fp16.gguf \
+  -n 50 \
+  "Hello, my name is "
+
 ```
-Environment variables:
-- GGML_OPENVINO_WEIGHT_AS_INPUT:
-  Pass the weights as input to the OpenVINO model instead of creating Constant nodes for them.
-- GGML_OPENVINO_CACHE_DIR:
-  If set, model caching in OpenVINO will be used.
-- GGML_OPENVINO_DUMP_CGRAPH:
-  Dump the compute graph to "cgraph.txt". Note that the compute graph is different for every token, so the later cgraph will overwrite the previous one.
-- GGML_OPENVINO_PROFILING:
-  Print the time taken for each phase in the OpenVINO backend.
-- GGML_OPENVINO_DUMP_IR:
-  Dump the converted OpenVINO IR. The filenames are timestamps.
-- GGML_OPENVINO_DEBUG_INPUT
-- GGML_OPENVINO_DEBUG_OUTPUT
+
+### Using Llama.cpp's Built-in CPU Backend (for Comparison)
+
+To compare performance with the default CPU backend:

-To use Llama.cpp's builtin CPU backend:
 ```bash
+# Build CPU-only version
 cmake --preset ReleaseCPU
-cmake --build build/ReleaseCPU
+cmake --build build/ReleaseCPU --parallel
+
+# Run with Default CPU backend
+./build/ReleaseCPU/bin/llama-simple \
+  -m ~/models/Phi-3-mini-4k-instruct-gguf/Phi-3-mini-4k-instruct-fp16.gguf \
+  -n 50 \
+  "Hello, my name is "
+
+```
+
+### Configuration Options
+
+Control OpenVINO behavior using these environment variables:
+
+- **`GGML_OPENVINO_CACHE_DIR`**: Directory for model caching (recommended: `/tmp/ov_cache`). If set, enables model caching in OpenVINO.
+- **`GGML_OPENVINO_WEIGHT_AS_INPUT`**: Pass the weights as input to the OpenVINO model instead of creating Constant nodes for them.
+- **`GGML_OPENVINO_PROFILING`**: Enable execution time profiling
+- **`GGML_OPENVINO_DUMP_CGRAPH`**: Save compute graph to `cgraph.txt`
+- **`GGML_OPENVINO_DUMP_IR`**: Export OpenVINO IR files with timestamps
+- **`GGML_OPENVINO_DEBUG_INPUT`**: Enable input debugging
+- **`GGML_OPENVINO_DEBUG_OUTPUT`**: Enable output debugging
+
+### Example with Profiling
+
+```bash
+export GGML_OPENVINO_CACHE_DIR=/tmp/ov_cache
+export GGML_OPENVINO_PROFILING=1
+
+./build/ReleaseOV/bin/llama-simple \
+  -m ~/models/Phi-3-mini-4k-instruct-gguf/Phi-3-mini-4k-instruct-fp16.gguf \
+  -n 50 \
+  "Hello, my name is "
+
 ```

 ## Notes about GPU-accelerated backends

From 7fec22333410e9b7b04ed33e4b104ec533ac8a4f Mon Sep 17 00:00:00 2001
From: "Yu, Zijun"
Date: Tue, 27 May 2025 16:51:14 +0800
Subject: [PATCH 074/254] Add initial NPU support

---
 ggml/src/ggml-openvino/ggml-decoder.cpp       |  54 ++++-----
 ggml/src/ggml-openvino/ggml-decoder.h         |  13 ++-
 ggml/src/ggml-openvino/openvino/decoder.hpp   |   2 +
 .../ggml-openvino/openvino/node_context.hpp   |   3 +
 ggml/src/ggml-openvino/openvino/op/cpy.cpp    | 106 ++++++++++++++----
 ggml/src/ggml-openvino/openvino/op/mulmat.cpp |   8 +-
 .../ggml-openvino/openvino/op/rms_norm.cpp    |  23 ++--
 ggml/src/ggml-openvino/openvino/op/rope.cpp   |   5 +-
 ggml/src/ggml-openvino/utils.cpp              |  86 +++++++++-----
 9 files changed, 201 insertions(+), 99 deletions(-)

diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp
index e6474d6def..7bb092a65c 100644
--- a/ggml/src/ggml-openvino/ggml-decoder.cpp
+++ b/ggml/src/ggml-openvino/ggml-decoder.cpp
@@ -14,6 +14,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -25,14 +26,16 @@
 #include "ggml-backend-impl.h"
 #include "ggml-backend.h"

-GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgraph)
+GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgraph, bool is_static, bool is_first_token)
     : m_cgraph(cgraph),
       m_node(node),
-      m_op_name(m_node ?
std::string(m_node->name) : "NONE_OP"), + m_is_static(is_static), + m_is_first_token(is_first_token) { static std::map> model_weights; if (m_node) { - set_input_output(m_node, model_weights); + set_input_output(m_node); } else { static bool printed = false; if (!printed && getenv("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS")) { @@ -47,7 +50,7 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgrap set_max_token_len(); static bool weight_created = false; - if (!getenv("GGML_OPENVINO_WEIGHT_AS_INPUT") && !weight_created) { + if (!weight_created) { add_weight_const_parallel(model_weights); weight_created = true; } @@ -55,7 +58,7 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgrap for (int node_n = 0; node_n < m_cgraph->n_nodes; node_n++) { auto* cur_node = m_cgraph->nodes[node_n]; m_nodes.push_back(cur_node); - set_input_output(cur_node, model_weights); + set_input_output(cur_node); } m_model_weights = model_weights; @@ -65,8 +68,7 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgrap // Called in GgmlOvDecoder constructor. Two cases: 1. constructing a decoder for the whole graph; // 2. constructing a decoder for a node. -void GgmlOvDecoder::set_input_output(ggml_tensor* node, - std::map>& model_weights) { +void GgmlOvDecoder::set_input_output(ggml_tensor* node) { std::string node_name; if (node->op == GGML_OP_CPY) { // CPY updates the input tensor in place. For later ov op that uses the @@ -95,21 +97,7 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, if (!m_node && !src->view_src) { ggml_backend_buffer* buffer = src->buffer; - if (buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) { - bool weight_as_input = getenv("GGML_OPENVINO_WEIGHT_AS_INPUT"); - auto& weights_map = weight_as_input ? m_model_inputs : model_weights; - if (weights_map.find(src_name) != weights_map.end()) { - continue; - } - - std::shared_ptr weight_node = - weight_as_input - ? 
std::make_shared(get_ov_type(src), ov::Shape{get_shape(src)}) - : create_weight_node(src); - weight_node->set_friendly_name(src_name); - weights_map[src_name] = weight_node; - - } else if (buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY || src->flags & GGML_TENSOR_FLAG_INPUT) { + if (buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY || src->flags & GGML_TENSOR_FLAG_INPUT) { // GGML_BACKEND_BUFFER_USAGE_ANY are kv caches if (buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY) { assert(src_name.find("cache_k") == 0 || src_name.find("cache_v") == 0); @@ -119,10 +107,24 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, } ov::PartialShape input_shape; if (std::string(src->name) == "inp_tokens" || std::string(src->name) == "inp_pos") { - input_shape = ov::PartialShape{1, 1, ov::Dimension(1, m_max_token_len)}; + if (m_is_static) { + input_shape = ov::PartialShape(get_shape(src)); + // if (m_is_first_token) { + // input_shape = ov::PartialShape{1, 1, m_max_token_len}; + // } else { + // input_shape = ov::PartialShape{1, 1, 1}; + // } + } else { + input_shape = ov::PartialShape{1, 1, ov::Dimension(1, m_max_token_len)}; + } } else if (std::string(src->name).find("KQ_mask") == 0) { - auto max_token_len = GGML_PAD(m_max_token_len, GGML_KQ_MASK_PAD); - input_shape = ov::PartialShape{1, ov::Dimension(1, max_token_len), ov::Dimension(1, max_token_len)}; + if (m_is_static) { + input_shape = ov::PartialShape(get_shape(src)); + } else { + auto max_token_len = GGML_PAD(m_max_token_len, GGML_KQ_MASK_PAD); + input_shape = + ov::PartialShape{1, ov::Dimension(1, max_token_len), ov::Dimension(1, max_token_len)}; + } } else { input_shape = ov::Shape{get_shape(src)}; } @@ -510,7 +512,7 @@ int32_t* GgmlOvDecoder::get_output_op_params(const std::string& name) const { void GgmlOvDecoder::visit_subgraph(std::function)> node_visitor) const { for (const auto& node : m_nodes) { - auto decoder = std::make_shared(node, m_cgraph); + auto decoder = std::make_shared(node, m_cgraph, m_is_static, m_is_first_token); node_visitor(decoder); } } diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index 4d4a928121..b372cc8040 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -12,7 +12,7 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { public: using ov::frontend::ggml::GgmlDecoder::GgmlDecoder; - GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgraph); + GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgraph, bool is_static, bool is_first_token); virtual ov::Any get_attribute(const std::string& name) const override { return nullptr; @@ -89,8 +89,15 @@ public: return m_model_output_names; } + virtual bool is_static() const override { + return m_is_static; + } + virtual bool is_first_token() const { + return m_is_first_token; + } + private: - void set_input_output(ggml_tensor* node, std::map>& model_weights); + void set_input_output(ggml_tensor* node); void add_extra_inputs(); static void dump_cgraph(const struct ggml_cgraph* cgraph); static std::vector get_shape(const ggml_tensor* tensor); @@ -119,6 +126,8 @@ private: std::map> m_model_extra_input_values; std::map> m_model_weights; std::vector m_model_output_names; + bool m_is_static; + bool m_is_first_token; }; void print_tensor_address_map(const struct ggml_cgraph* cgraph); diff --git a/ggml/src/ggml-openvino/openvino/decoder.hpp b/ggml/src/ggml-openvino/openvino/decoder.hpp index b3cf75817f..a0b9509336 100644 --- 
a/ggml/src/ggml-openvino/openvino/decoder.hpp +++ b/ggml/src/ggml-openvino/openvino/decoder.hpp @@ -55,6 +55,8 @@ public: virtual const std::map>& get_model_extra_inputs() const = 0; virtual const std::map>& get_model_weights() const = 0; virtual const std::vector& get_model_output_names() const = 0; + + virtual bool is_static() const = 0; }; } // namespace ggml diff --git a/ggml/src/ggml-openvino/openvino/node_context.hpp b/ggml/src/ggml-openvino/openvino/node_context.hpp index 44f55222e3..f5940585a6 100644 --- a/ggml/src/ggml-openvino/openvino/node_context.hpp +++ b/ggml/src/ggml-openvino/openvino/node_context.hpp @@ -84,6 +84,9 @@ public: int get_op_case() const { return m_decoder->get_op_case(); } + bool is_static() const { + return m_decoder->is_static(); + } private: std::shared_ptr m_decoder; diff --git a/ggml/src/ggml-openvino/openvino/op/cpy.cpp b/ggml/src/ggml-openvino/openvino/op/cpy.cpp index 7cdeddce38..fe755a5f64 100644 --- a/ggml/src/ggml-openvino/openvino/op/cpy.cpp +++ b/ggml/src/ggml-openvino/openvino/op/cpy.cpp @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -12,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -57,6 +59,13 @@ OutputVector translate_cpy(const NodeContext& context) { token_len = std::make_shared(token_len, ov::op::v0::Constant::create(ov::element::i64, {0}, {}), false); + + if (context.is_static()) { + int32_t* op_params = context.get_input_op_params(1); + int64_t past_token_len_val = op_params[0] / context.get_input_stride(1)[2] / num_heads / head_size; + past_token_len = ov::op::v0::Constant::create(ov::element::i64, {}, {past_token_len_val}); + } + auto total_token_len = std::make_shared(past_token_len, token_len); std::shared_ptr indices = std::make_shared(past_token_len, total_token_len, one, ov::element::i64); @@ -67,39 +76,88 @@ OutputVector translate_cpy(const NodeContext& context) { res = std::make_shared(reshaped_src1, indices, src0); } else { // Write V to cache_v - int64_t total_head_size = src0_shape[1]; - auto total_head_size_node = ov::op::v0::Constant::create(ov::element::i64, {1}, {total_head_size}); - auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); + auto two = ov::op::v0::Constant::create(ov::element::i64, {1}, {2}); + + auto zero_scalar = ov::op::v0::Constant::create(ov::element::i64, {}, {0}); + auto one_scalar = ov::op::v0::Constant::create(ov::element::i64, {}, {1}); + + int64_t total_head_size = src0_shape[1]; + auto total_head_size_node = ov::op::v0::Constant::create(ov::element::i64, {1}, {total_head_size}); + auto total_head_size_scalar = std::make_shared(total_head_size_node, zero); auto token_len = get_dimensions(src0.get_node_shared_ptr(), {2}); - past_token_len = std::make_shared(past_token_len, zero); - auto total_token_len = std::make_shared(past_token_len, token_len); + auto token_len_scalar = std::make_shared(token_len, zero); + if (context.is_static()) { + int32_t* op_params = context.get_input_op_params(1); + int64_t past_token_len_val = op_params[0] / context.get_input_stride(1)[2]; + past_token_len = ov::op::v0::Constant::create(ov::element::i64, {}, {past_token_len_val}); + } + auto total_token_len_scalar = std::make_shared(past_token_len, token_len_scalar); + // auto reshaped_src1 = std::make_shared( + // src1, + // ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector{1, total_head_size, -1}), + // false); + + // auto src1_left = std::make_shared( + // 
reshaped_src1, + // ov::op::v0::Constant::create(ov::element::i64, {3}, {0, 0, 0}), + // std::make_shared(ov::OutputVector{one, total_head_size_node, past_token_len}, 0), + // ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 1, 1})); + + // auto src1_right = std::make_shared( + // reshaped_src1, + // std::make_shared(ov::OutputVector{zero, zero, total_token_len}, 0), + // ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector{1, total_head_size, INT_MAX}), + // ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 1, 1})); + + // auto reshaped_src0 = std::make_shared( + // src0, + // ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector{1, total_head_size, -1}), + // false); + + // auto res = std::make_shared(ov::OutputVector{src1_left, reshaped_src0, src1_right}, 2); + + // 1D tensor of shape [total_head_size], values starting from 0 + auto range_row = + std::make_shared(zero_scalar, total_head_size_scalar, one_scalar, ov::element::i64); + auto range_row_reshaped = + std::make_shared(range_row, + ov::op::v0::Constant::create(ov::element::i64, {2}, {1, 2})); + auto row_indices = std::make_shared( + range_row_reshaped, + std::make_shared(ov::OutputVector{total_head_size_node, token_len, one}, 0)); + + // 1D tensor of shape [token_len], values starting from past_token_len + auto range_col = + std::make_shared(past_token_len, total_token_len_scalar, one_scalar, element::i64); + auto range_col_reshaped = + std::make_shared(range_col, + ov::op::v0::Constant::create(ov::element::i64, {2}, {0, 2})); + auto col_indices = std::make_shared( + range_col_reshaped, + std::make_shared(ov::OutputVector{total_head_size_node, token_len, one}, 0)); + + // Stack row_indices and col_indices along last axis: [total_head_size, token_len, 2] + auto indices = std::make_shared(OutputVector{row_indices, col_indices}, 2); + auto indices_final = std::make_shared( + indices, + ov::op::v0::Constant::create(ov::element::i64, {2}, std::vector{-1, 2}), + false); + + auto flattend_src0 = + std::make_shared(src0, + ov::op::v0::Constant::create(element::i64, Shape{1}, {-1}), + false); auto reshaped_src1 = std::make_shared( src1, - ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector{1, total_head_size, -1}), + ov::op::v0::Constant::create(ov::element::i64, {2}, std::vector{total_head_size, -1}), false); - auto src1_left = std::make_shared( - reshaped_src1, - ov::op::v0::Constant::create(ov::element::i64, {3}, {0, 0, 0}), - std::make_shared(ov::OutputVector{one, total_head_size_node, past_token_len}, 0), - ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 1, 1})); - - auto src1_right = std::make_shared( - reshaped_src1, - std::make_shared(ov::OutputVector{zero, zero, total_token_len}, 0), - ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector{1, total_head_size, INT_MAX}), - ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 1, 1})); - - auto reshaped_src0 = std::make_shared( - src0, - ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector{1, total_head_size, -1}), - false); - - res = std::make_shared(ov::OutputVector{src1_left, reshaped_src0, src1_right}, 2); + auto updated = std::make_shared(reshaped_src1, indices_final, flattend_src0); + res = std::make_shared(updated, zero); } return rename_outputs_with_suffix({res}, context.get_name()); diff --git a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp index 06e7d9ece0..20ad5683b8 100644 --- a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp +++ 
b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp @@ -55,17 +55,21 @@ OutputVector translate_mulmat(const NodeContext& context) { ov::Output A; ov::Output B; - auto attention_size = context.get_input("attention_size"); - auto src0 = context.get_input(0); auto src0_shape = context.get_input_shape(0).to_shape(); auto src0_stride = context.get_input_stride(0); auto permuted = is_permuted(src0_stride); auto token_dim = permuted ? 0 : 2; + auto attention_size = context.get_input("attention_size"); + auto src0_perm = argsort_descend(src0_stride); auto src0_original_shape_ = permute(src0_shape, src0_perm); std::vector src0_original_shape(src0_original_shape_.begin(), src0_original_shape_.end()); + + if (context.is_static()) { + attention_size = ov::op::v0::Constant::create(ov::element::i64, {1}, {src0_original_shape[token_dim]}); + } src0_original_shape[token_dim] = -1; auto src0_slice_shape = src0_original_shape; diff --git a/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp b/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp index 7b8b582dac..4b230ad630 100644 --- a/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp +++ b/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp @@ -1,8 +1,9 @@ +#include #include #include #include #include -#include +#include #include #include "../node_context.hpp" @@ -16,28 +17,24 @@ namespace op { OutputVector translate_rms_norm(const NodeContext& context) { num_inputs_check(context, 1, 1); - ov::Shape input_shape = context.get_input_shape(0).to_shape(); auto input_node = context.get_input(0); auto square = std::make_shared(input_node, input_node); - auto reduce_sum = - std::make_shared(square, - ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {2}), - true); - - auto mean = std::make_shared( - reduce_sum, - ov::op::v0::Constant::create(ov::element::f32, ov::Shape{}, {static_cast(input_shape[2])})); + auto mean = + std::make_shared(square, + ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {2}), + true); float eps; memcpy(&eps, context.get_output_op_params(0), sizeof(float)); + auto rms = std::make_shared( std::make_shared(mean, ov::op::v0::Constant::create(ov::element::f32, ov::Shape{}, {eps}))); - auto scale = - std::make_shared(ov::op::v0::Constant::create(ov::element::f32, ov::Shape{}, {1.0f}), rms); + auto reciprocal = + std::make_shared(ov::op::v0::Constant::create(ov::element::f32, ov::Shape{1}, {1.0f}), rms); - auto res = std::make_shared(input_node, scale); + auto res = std::make_shared(input_node, reciprocal); return rename_outputs_with_suffix({res}, context.get_name()); } diff --git a/ggml/src/ggml-openvino/openvino/op/rope.cpp b/ggml/src/ggml-openvino/openvino/op/rope.cpp index 94810e549d..b47b8a6a54 100644 --- a/ggml/src/ggml-openvino/openvino/op/rope.cpp +++ b/ggml/src/ggml-openvino/openvino/op/rope.cpp @@ -1,4 +1,3 @@ - #include #include #include @@ -23,6 +22,10 @@ #include "../node_context.hpp" #include "../utils.hpp" +#ifndef M_PI +# define M_PI 3.14159265358979323846 +#endif + #define GGML_ROPE_TYPE_NEOX 2 #define MIN(a, b) ((a) < (b) ? 
(a) : (b)) diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 040ca1961e..65a609f1d7 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -4,11 +4,13 @@ #include #include #include +#include #include #include #include #include #include +#include #include #include @@ -17,8 +19,8 @@ #include "openvino/frontend.hpp" #include "openvino/input_model.hpp" -std::shared_ptr get_ggml_decoder(struct ggml_cgraph* cgraph) { - return std::make_shared(nullptr, cgraph); +std::shared_ptr get_ggml_decoder(struct ggml_cgraph* cgraph, bool is_static, bool is_first_token) { + return std::make_shared(nullptr, cgraph, is_static, is_first_token); } ov::Tensor get_ggml_graph_input_tensor(std::shared_ptr ggml_decoder, std::string& name) { @@ -49,50 +51,63 @@ static ov::frontend::FrontEnd::Ptr get_ggml_frontend() { } enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_cgraph* cgraph) { + static ov::Core core; + static bool is_first_token = true; + + static std::string device = getenv("GGML_OPENVINO_DEVICE") ? getenv("GGML_OPENVINO_DEVICE") : ""; + if (device.empty()) { + // Prefer GPU over CPU + for (const auto& dev : core.get_available_devices()) { + device = dev; + if (device == "GPU") + break; + } + } + + bool is_static = device == "NPU" ? true : false; + ov::AnyMap config; + if (is_static) { + config = { + {"NPU_COMPILATION_MODE_PARAMS", "compute-layers-with-higher-precision=ReduceMean"}, + {"NPU_USE_NPUW", "YES"}, + {"NPUW_DEVICES", "NPU"}, + {"NPUW_FOLD", "YES"}, + // {"NPU_COMPILER_TYPE", "MLIR"}, + }; + } + auto start_time = ggml_time_us(); - static ov::Core core; auto* cache_dir = getenv("GGML_OPENVINO_CACHE_DIR"); - if (cache_dir) { + if (cache_dir && !is_static) { core.set_property(ov::cache_dir(cache_dir)); } - // auto devices = core.get_available_devices(); - // static auto front_end = get_ggml_frontend(); - // if (!front_end) { - // GGML_LOG_ERROR("GGML FrontEnd is not initialized \n"); - // return GGML_STATUS_FAILED; - // } - - using CachedItem = std::pair, ov::CompiledModel>; + // For CPU and GPU, there is only one compiled model, so only use the first element of the pair + // For NPU, there are prefill model and kvcache model (This is the ideal approach, but not implemented yet, + // currently recompile for every token) + using CachedItem = std::pair, std::pair>; static std::unordered_map compiled_cache; std::shared_ptr model; - ov::CompiledModel compiled_model; + ov::CompiledModel compiled_model_prefill; + ov::CompiledModel compiled_model_kvcache; int64_t decoder_end_time; int64_t conversion_end_time; int64_t compile_end_time; - auto ggml_decoder = get_ggml_decoder(cgraph); + auto ggml_decoder = get_ggml_decoder(cgraph, is_static, is_first_token); decoder_end_time = ggml_time_us(); auto it = compiled_cache.find(cgraph); - if (it != compiled_cache.end()) { + if (it != compiled_cache.end() && !is_static) { model = it->second.first; conversion_end_time = ggml_time_us(); - compiled_model = it->second.second; + compiled_model_prefill = it->second.second.first; + compiled_model_kvcache = it->second.second.second; compile_end_time = ggml_time_us(); } else { - // std::shared_ptr graph_decoder = ggml_decoder; - // ov::frontend::InputModel::Ptr input_model = front_end->load(graph_decoder); - // if (!input_model) { - // GGML_LOG_ERROR("Input Model is not loaded \n"); - // return GGML_STATUS_FAILED; - // } - - // model = front_end->convert(input_model); - ov::frontend::InputModel::Ptr input_model = 
std::make_shared(ggml_decoder); model = ov::frontend::ggml::FrontEnd::convert(input_model); @@ -105,16 +120,23 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c ov::serialize(model, timestamped_filename); } - if (!model) { - GGML_LOG_ERROR("Model is not converted \n"); - } - compiled_model = core.compile_model(model, "CPU"); + compiled_model_prefill = core.compile_model(model, device, config); compile_end_time = ggml_time_us(); - compiled_cache[cgraph] = std::make_pair(model, compiled_model); + compiled_cache[cgraph] = std::make_pair(model, std::make_pair(compiled_model_prefill, compiled_model_kvcache)); } - ov::InferRequest infer_request = compiled_model.create_infer_request(); + ov::InferRequest infer_request; + if (!is_static) { + infer_request = compiled_model_prefill.create_infer_request(); + } else { + infer_request = compiled_model_prefill.create_infer_request(); + // if (is_first_token) { + // infer_request = compiled_model_prefill.create_infer_request(); + // } else { + // infer_request = compiled_model_kvcache.create_infer_request(); + // } + } auto ov_params = model->get_parameters(); for (size_t i = 0; i < ov_params.size(); i++) { @@ -148,6 +170,8 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c } auto end_time = ggml_time_us(); + is_first_token = false; + if (getenv("GGML_OPENVINO_PROFILING")) { GGML_LOG_INFO("GGML OpenVINO Backend: \n"); GGML_LOG_INFO(" - Graph decoder Time: %ld ms \n", (decoder_end_time - start_time) / 1000); From 34531abce46b2aa3017bc66c14f7c87e8eca4c05 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Thu, 29 May 2025 17:53:00 +0800 Subject: [PATCH 075/254] draft NPU support version 2: prefill + kvcache --- ggml/src/ggml-openvino/ggml-decoder.cpp | 27 +-- ggml/src/ggml-openvino/ggml-decoder.h | 7 +- ggml/src/ggml-openvino/openvino/decoder.hpp | 3 + .../ggml-openvino/openvino/node_context.hpp | 7 + ggml/src/ggml-openvino/openvino/op/cpy.cpp | 90 ++++------ ggml/src/ggml-openvino/utils.cpp | 165 +++++++++++++----- ggml/src/ggml-openvino/utils.h | 27 ++- 7 files changed, 212 insertions(+), 114 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 7bb092a65c..29be4dbae8 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -108,22 +108,25 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node) { ov::PartialShape input_shape; if (std::string(src->name) == "inp_tokens" || std::string(src->name) == "inp_pos") { if (m_is_static) { - input_shape = ov::PartialShape(get_shape(src)); - // if (m_is_first_token) { - // input_shape = ov::PartialShape{1, 1, m_max_token_len}; - // } else { - // input_shape = ov::PartialShape{1, 1, 1}; - // } + if (m_is_first_token) { + input_shape = ov::PartialShape{1, 1, m_max_token_len}; + } else { + input_shape = ov::PartialShape{1, 1, 1}; + } } else { input_shape = ov::PartialShape{1, 1, ov::Dimension(1, m_max_token_len)}; } - } else if (std::string(src->name).find("KQ_mask") == 0) { + } else if (std::string(src->name) == "KQ_mask") { if (m_is_static) { - input_shape = ov::PartialShape(get_shape(src)); + if (m_is_first_token) { + input_shape = ov::PartialShape{1, m_max_token_len, m_max_token_len}; + } else { + input_shape = ov::PartialShape{1, 1, m_max_token_len}; + } } else { - auto max_token_len = GGML_PAD(m_max_token_len, GGML_KQ_MASK_PAD); + auto max_mask_size = GGML_PAD(m_max_token_len, GGML_KQ_MASK_PAD); input_shape = - ov::PartialShape{1, 
-                        ov::PartialShape{1, ov::Dimension(1, max_token_len), ov::Dimension(1, max_token_len)};
+                        ov::PartialShape{1, ov::Dimension(1, max_mask_size), ov::Dimension(1, max_mask_size)};
                 }
             } else {
                 input_shape = ov::Shape{get_shape(src)};
             }
@@ -208,6 +211,7 @@ void GgmlOvDecoder::set_max_token_len() {

 void GgmlOvDecoder::add_extra_inputs() {
     int64_t past_token_len;
+    // attention_size not used for NPU
     int64_t attention_size;

     for (const auto& node : m_nodes) {
@@ -231,8 +235,7 @@ void GgmlOvDecoder::add_extra_inputs() {
     for (const auto& node : m_nodes) {
         if (node->src[1] && std::string(node->src[1]->name).find("inp_tokens") == 0) {
             int64_t total_token_len = node->src[1]->ne[0] + past_token_len;
-            attention_size = (total_token_len + 31) / 32 * 32;
-
+            attention_size = GGML_PAD(total_token_len, 32);
             std::string name = "attention_size";
             auto param_node = std::make_shared(ov::element::i64, ov::Shape{1});
             param_node->set_friendly_name(name);

diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h
index b372cc8040..2c89d06267 100644
--- a/ggml/src/ggml-openvino/ggml-decoder.h
+++ b/ggml/src/ggml-openvino/ggml-decoder.h
@@ -92,9 +92,12 @@ public:
     virtual bool is_static() const override {
         return m_is_static;
     }
-    virtual bool is_first_token() const {
+    virtual bool is_first_token() const override {
         return m_is_first_token;
     }
+    virtual int get_max_token_len() const override {
+        return m_max_token_len;
+    }

 private:
     void set_input_output(ggml_tensor* node);
@@ -106,7 +109,7 @@ private:
     static std::shared_ptr create_weight_node(ggml_tensor* tensor);

     void set_max_token_len();
-    int64_t m_max_token_len;
+    int m_max_token_len;

     void add_weight_const_parallel(std::map>& model_weights);

diff --git a/ggml/src/ggml-openvino/openvino/decoder.hpp b/ggml/src/ggml-openvino/openvino/decoder.hpp
index a0b9509336..6212568399 100644
--- a/ggml/src/ggml-openvino/openvino/decoder.hpp
+++ b/ggml/src/ggml-openvino/openvino/decoder.hpp
@@ -1,5 +1,6 @@
 #pragma once

+#include
 #include
 #include
 #include
@@ -57,6 +58,8 @@ public:
     virtual const std::vector& get_model_output_names() const = 0;

     virtual bool is_static() const = 0;
+    virtual bool is_first_token() const = 0;
+    virtual int get_max_token_len() const = 0;
 };

 } // namespace ggml

diff --git a/ggml/src/ggml-openvino/openvino/node_context.hpp b/ggml/src/ggml-openvino/openvino/node_context.hpp
index f5940585a6..f4e7c4e31f 100644
--- a/ggml/src/ggml-openvino/openvino/node_context.hpp
+++ b/ggml/src/ggml-openvino/openvino/node_context.hpp
@@ -1,5 +1,6 @@
 #pragma once

+#include
 #include

 #include "decoder.hpp"
@@ -87,6 +88,12 @@ public:
     bool is_static() const {
         return m_decoder->is_static();
     }
+    bool is_first_token() const {
+        return m_decoder->is_first_token();
+    }
+    int get_max_token_len() const {
+        return m_decoder->get_max_token_len();
+    }

 private:
     std::shared_ptr m_decoder;

diff --git a/ggml/src/ggml-openvino/openvino/op/cpy.cpp b/ggml/src/ggml-openvino/openvino/op/cpy.cpp
index fe755a5f64..75dd0e7d83 100644
--- a/ggml/src/ggml-openvino/openvino/op/cpy.cpp
+++ b/ggml/src/ggml-openvino/openvino/op/cpy.cpp
@@ -8,7 +8,7 @@
 #include
 #include
 #include
-#include
+#include
 #include
 #include
 #include
@@ -34,18 +34,26 @@ OutputVector translate_cpy(const NodeContext& context) {
     auto src0 = context.get_input(0);
     auto src1 = context.get_input(1);

-    auto past_token_len = context.get_input("past_token_len");
+    auto past_token_len_scalar = context.get_input("past_token_len");
+
+    src0 = std::make_shared(src0, context.get_input_type(1));

     ov::Output res;
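+    // On NPU prefill (static shapes, first token) past_token_len is 0 and the
+    // incoming K/V already spans the whole padded cache, so the copy below
+    // degenerates to forwarding src0 unchanged.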
+    if (context.is_static() && context.is_first_token()) {
+        res = src0;
+        return rename_outputs_with_suffix({res}, context.get_name());
+    }
+
     auto src0_shape = context.get_input_shape(0).to_shape();
     auto output_shape = context.get_output_shape(0).to_shape();
     std::vector input0_strides = context.get_input_stride(0);
     std::vector output_strides = context.get_output_stride(0);

-    auto one = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {1});
+    auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
+    auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
+    auto one_scalar = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {1});

-    src0 = std::make_shared(src0, src1);
     if (op_case == 1) {
         // Write K to cache_k
         int64_t head_size = src0_shape[2];
@@ -56,32 +64,29 @@ OutputVector translate_cpy(const NodeContext& context) {
         auto reshaped_src1 = std::make_shared(src1, reshaped_src1_shape, false);

         auto token_len = get_dimensions(src0.get_node_shared_ptr(), {0});
-        token_len = std::make_shared(token_len,
-                                     ov::op::v0::Constant::create(ov::element::i64, {0}, {}),
-                                     false);
+        auto token_len_scalar = std::make_shared(token_len, zero);

+        std::shared_ptr indices;
         if (context.is_static()) {
-            int32_t* op_params = context.get_input_op_params(1);
-            int64_t past_token_len_val = op_params[0] / context.get_input_stride(1)[2] / num_heads / head_size;
-            past_token_len = ov::op::v0::Constant::create(ov::element::i64, {}, {past_token_len_val});
+            indices = past_token_len_scalar.get_node_shared_ptr();
+            indices = std::make_shared(
+                indices,
+                ov::op::v0::Constant::create(ov::element::i64, {2}, std::vector{0, 1}));
+        } else {
+            auto total_token_len_scalar = std::make_shared(past_token_len_scalar, token_len_scalar);
+            indices = std::make_shared(past_token_len_scalar,
+                                       total_token_len_scalar,
+                                       one_scalar,
+                                       ov::element::i64);
+            indices = std::make_shared(indices, one);
         }
-        auto total_token_len = std::make_shared(past_token_len, token_len);
-        std::shared_ptr indices =
-            std::make_shared(past_token_len, total_token_len, one, ov::element::i64);
-        indices = std::make_shared(
-            indices,
-            ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector{1}));

         res = std::make_shared(reshaped_src1, indices, src0);
     } else {
         // Write V to cache_v
-        auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
         auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
         auto two = ov::op::v0::Constant::create(ov::element::i64, {1}, {2});
-        auto zero_scalar = ov::op::v0::Constant::create(ov::element::i64, {}, {0});
-        auto one_scalar = ov::op::v0::Constant::create(ov::element::i64, {}, {1});

         int64_t total_head_size = src0_shape[1];
         auto total_head_size_node = ov::op::v0::Constant::create(ov::element::i64, {1}, {total_head_size});
@@ -89,36 +94,6 @@ OutputVector translate_cpy(const NodeContext& context) {
         auto token_len = get_dimensions(src0.get_node_shared_ptr(), {2});
         auto token_len_scalar = std::make_shared(token_len, zero);

-        if (context.is_static()) {
-            int32_t* op_params = context.get_input_op_params(1);
-            int64_t past_token_len_val = op_params[0] / context.get_input_stride(1)[2];
-            past_token_len = ov::op::v0::Constant::create(ov::element::i64, {}, {past_token_len_val});
-        }
-        auto total_token_len_scalar = std::make_shared(past_token_len, token_len_scalar);
-
-        // auto reshaped_src1 = std::make_shared(
-        //     src1,
-        //     ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector{1, total_head_size, -1}),
-        //     false);
-
-        // auto src1_left = std::make_shared(
-        //     reshaped_src1,
-        //     ov::op::v0::Constant::create(ov::element::i64, {3}, {0, 0, 0}),
-        //     std::make_shared(ov::OutputVector{one, total_head_size_node, past_token_len}, 0),
-        //     ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 1, 1}));
-
-        // auto src1_right = std::make_shared(
-        //     reshaped_src1,
-        //     std::make_shared(ov::OutputVector{zero, zero, total_token_len}, 0),
-        //     ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector{1, total_head_size, INT_MAX}),
-        //     ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 1, 1}));
-
-        // auto reshaped_src0 = std::make_shared(
-        //     src0,
-        //     ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector{1, total_head_size, -1}),
-        //     false);
-
-        // auto res = std::make_shared(ov::OutputVector{src1_left, reshaped_src0, src1_right}, 2);

         // 1D tensor of shape [total_head_size], values starting from 0
         auto range_row =
@@ -131,8 +106,19 @@ OutputVector translate_cpy(const NodeContext& context) {
             std::make_shared(ov::OutputVector{total_head_size_node, token_len, one}, 0));

         // 1D tensor of shape [token_len], values starting from past_token_len
-        auto range_col =
-            std::make_shared(past_token_len, total_token_len_scalar, one_scalar, element::i64);
+        std::shared_ptr range_col;
+        if (context.is_static()) {
+            range_col = past_token_len_scalar.get_node_shared_ptr();
+            range_col = std::make_shared(
+                range_col,
+                ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector{0}));
+        } else {
+            auto total_token_len_scalar = std::make_shared(past_token_len_scalar, token_len_scalar);
+            range_col = std::make_shared(past_token_len_scalar,
+                                         total_token_len_scalar,
+                                         one_scalar,
+                                         ov::element::i64);
+        }

         auto range_col_reshaped =
             std::make_shared(range_col, ov::op::v0::Constant::create(ov::element::i64, {2}, {0, 2}));

diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp
index 65a609f1d7..3e49081515 100644
--- a/ggml/src/ggml-openvino/utils.cpp
+++ b/ggml/src/ggml-openvino/utils.cpp
@@ -1,5 +1,7 @@
 #include "utils.h"

+#include
+#include
 #include
 #include
 #include
@@ -13,6 +15,7 @@
 #include
 #include
 #include
+#include

 #include "ggml-impl.h"
 #include "ggml.h"
@@ -52,7 +55,6 @@ static ov::frontend::FrontEnd::Ptr get_ggml_frontend() {

 enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_cgraph* cgraph) {
     static ov::Core core;
-    static bool is_first_token = true;

     static std::string device = getenv("GGML_OPENVINO_DEVICE") ? getenv("GGML_OPENVINO_DEVICE") : "";
     if (device.empty()) {
@@ -66,12 +68,16 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
     bool is_static = device == "NPU" ? true : false;
     ov::AnyMap config;
-    if (is_static) {
+    if (device == "NPU") {
         config = {
             {"NPU_COMPILATION_MODE_PARAMS", "compute-layers-with-higher-precision=ReduceMean"},
             {"NPU_USE_NPUW", "YES"},
             {"NPUW_DEVICES", "NPU"},
             {"NPUW_FOLD", "YES"},
+            {"NPUW_DQ", "YES"},
+            {"NPUW_FUNCALL_ASYNC", "YES"},
+            {"NPUW_HOST_GATHER", "YES"},
+            {"NPUW_WEIGHTS_BANK", "shared"},
             // {"NPU_COMPILER_TYPE", "MLIR"},
         };
     }
@@ -83,69 +89,128 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
         core.set_property(ov::cache_dir(cache_dir));
     }

-    // For CPU and GPU, there is only one compiled model, so only use the first element of the pair
-    // For NPU, there are prefill model and kvcache model (This is the ideal approach, but not implemented yet,
-    // currently recompile for every token)
-    using CachedItem = std::pair, std::pair>;
-    static std::unordered_map compiled_cache;
+    // CPU and GPU will only use cache_prefill
+    using CachedItem = std::pair, ov::CompiledModel>;
+    static std::unordered_map compiled_cache_prefill;
+    static std::unordered_map compiled_cache_kvcache;

+    std::shared_ptr ggml_decoder;
     std::shared_ptr model;
-    ov::CompiledModel compiled_model_prefill;
-    ov::CompiledModel compiled_model_kvcache;
+    ov::CompiledModel compiled_model;
+
     int64_t decoder_end_time;
     int64_t conversion_end_time;
     int64_t compile_end_time;

-    auto ggml_decoder = get_ggml_decoder(cgraph, is_static, is_first_token);
-    decoder_end_time = ggml_time_us();
+    auto it = compiled_cache_prefill.find(cgraph);
+    bool is_first_token = it == compiled_cache_prefill.end();
+    if (!is_first_token) {
+        ggml_decoder = get_ggml_decoder(cgraph, is_static, false);
+        decoder_end_time = ggml_time_us();

-    auto it = compiled_cache.find(cgraph);
-    if (it != compiled_cache.end() && !is_static) {
-        model = it->second.first;
-        conversion_end_time = ggml_time_us();
-
-        compiled_model_prefill = it->second.second.first;
-        compiled_model_kvcache = it->second.second.second;
-        compile_end_time = ggml_time_us();
-    } else {
+        if (is_static) {
+            model = compiled_cache_kvcache[cgraph].first;
+            compiled_model = compiled_cache_kvcache[cgraph].second;
+        } else {
+            model = it->second.first;
+            compiled_model = it->second.second;
         }
+        conversion_end_time = ggml_time_us();
+        compile_end_time = conversion_end_time;
     } else {
+        if (is_static) {
+            ggml_decoder = get_ggml_decoder(cgraph, is_static, true);
+            auto ggml_decoder_kvcache = get_ggml_decoder(cgraph, is_static, false);
+            decoder_end_time = ggml_time_us();
+
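+            // Two decoders are built for the NPU path: one with prefill shapes
+            // (max_token_len-sized inputs) and one with single-token kvcache
+            // shapes, so the device gets two statically-shaped models instead
+            // of one dynamic model.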
+            auto input_model = std::make_shared(ggml_decoder);
+            auto input_model_kvcache = std::make_shared(ggml_decoder_kvcache);
+
+            model = ov::frontend::ggml::FrontEnd::convert(input_model);
+            auto model_kvcache = ov::frontend::ggml::FrontEnd::convert(input_model_kvcache);
+            conversion_end_time = ggml_time_us();
+
+            compiled_model = core.compile_model(model, device, config);
+            auto compiled_model_kvcache = core.compile_model(model_kvcache, device, config);
+            compile_end_time = ggml_time_us();
+
+            compiled_cache_prefill[cgraph] = std::make_pair(model, compiled_model);
+            compiled_cache_kvcache[cgraph] = std::make_pair(model_kvcache, compiled_model_kvcache);
+
+            if (getenv("GGML_OPENVINO_DUMP_IR")) {
+                char timestamped_filename[64];
+                auto timestamp = (long long)ggml_time_us();
+                snprintf(timestamped_filename, sizeof(timestamped_filename), "model_prefill_%lld.xml", timestamp);
+                ov::serialize(model, timestamped_filename);
+                snprintf(timestamped_filename, sizeof(timestamped_filename), "model_kvcache_%lld.xml", timestamp);
+                ov::serialize(model_kvcache, timestamped_filename);
+            }
+        } else {
+            ggml_decoder = get_ggml_decoder(cgraph, is_static, true);
+            decoder_end_time = ggml_time_us();
+
+            auto input_model = std::make_shared(ggml_decoder);
+            model = ov::frontend::ggml::FrontEnd::convert(input_model);
+            conversion_end_time = ggml_time_us();
+
+            compiled_model = core.compile_model(model, device, config);
+            compile_end_time = ggml_time_us();
+            compiled_cache_prefill[cgraph] = std::make_pair(model, compiled_model);
+
+            if (getenv("GGML_OPENVINO_DUMP_IR")) {
+                char timestamped_filename[64];
+                auto timestamp = (long long)ggml_time_us();
+                snprintf(timestamped_filename, sizeof(timestamped_filename), "model_%lld.xml", timestamp);
+                ov::serialize(model, timestamped_filename);
+            }
+        }
     }

+    auto infer_request = compiled_model.create_infer_request();
     auto ov_params = model->get_parameters();
     for (size_t i = 0; i < ov_params.size(); i++) {
         auto param_name = ov_params[i]->get_friendly_name();
         ov::Tensor input_tensor;
+
         if (ggml_decoder->get_model_extra_inputs().find(param_name) != ggml_decoder->get_model_extra_inputs().end()) {
             input_tensor = *ggml_decoder->get_model_extra_input_values().at(param_name);
-        } else {
+
+        } else if (!is_static) {
             input_tensor = get_ggml_graph_input_tensor(ggml_decoder, param_name);
+
+        } else {
+            if (param_name == "inp_tokens" || param_name == "inp_pos") {
+                if (is_first_token) {
+                    size_t max_token_len = ggml_decoder->get_max_token_len();
+                    const auto* input_tensor_ggml = ggml_decoder->get_input_ggml_tensor(param_name);
+                    std::vector padded_data = pad_input(input_tensor_ggml, 1, max_token_len, 0);
+                    input_tensor = ov::Tensor(ov::element::i32, ov::Shape{1, 1, max_token_len});
+                    auto* data_ptr = input_tensor.data();
+                    std::copy(padded_data.begin(), padded_data.end(), data_ptr);
+                } else {
+                    input_tensor = get_ggml_graph_input_tensor(ggml_decoder, param_name);
+                }

+            } else if (param_name == "KQ_mask") {
+                size_t max_token_len = ggml_decoder->get_max_token_len();
+                const auto* input_tensor_ggml = ggml_decoder->get_input_ggml_tensor(param_name);
+                if (is_first_token) {
+                    std::vector padded_data =
+                        pad_input(input_tensor_ggml, max_token_len, max_token_len, -INFINITY);
+                    set_zero_diagonal(padded_data, max_token_len);
+                    input_tensor = ov::Tensor(ov::element::f32, ov::Shape{1, max_token_len, max_token_len});
+                    auto* data_ptr = input_tensor.data();
+                    std::copy(padded_data.begin(), padded_data.end(), data_ptr);
+                } else {
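+                    // Decode step: the mask for the single new token is one row,
+                    // padded to max_token_len with -INFINITY.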
+                    std::vector padded_data = pad_input(input_tensor_ggml, 1, max_token_len, -INFINITY);
+                    input_tensor = ov::Tensor(ov::element::f32, ov::Shape{1, 1, max_token_len});
+                    auto* data_ptr = input_tensor.data();
+                    std::copy(padded_data.begin(), padded_data.end(), data_ptr);
+                }

+            } else {
+                input_tensor = get_ggml_graph_input_tensor(ggml_decoder, param_name);
+            }
         }
         infer_request.set_input_tensor(i, input_tensor);
@@ -234,3 +299,9 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
         break;
     }
 }
+
+void set_zero_diagonal(std::vector& matrix, size_t dim) {
+    for (size_t i = 0; i < dim; ++i) {
+        matrix[i * dim + i] = 0.0f;
+    }
+}

diff --git a/ggml/src/ggml-openvino/utils.h b/ggml/src/ggml-openvino/utils.h
index 88c182d9ed..000c2b87c1 100644
--- a/ggml/src/ggml-openvino/utils.h
+++ b/ggml/src/ggml-openvino/utils.h
@@ -1,12 +1,37 @@
+#include
+
 #include "ggml-backend-impl.h"
 #include "ggml-decoder.h"

 enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_cgraph* cgraph);

+std::shared_ptr get_ggml_decoder(struct ggml_cgraph* cgraph, bool is_static, bool is_first_token);
+
+ov::Tensor get_ggml_graph_input_tensor(std::shared_ptr ggml_decoder, std::string& name);
+
+std::map get_ggml_graph_output_dst(std::shared_ptr ggml_decoder);
+
 size_t checksum(const void* data, size_t size);

 void print_input_tensor_info(const std::string& name, const ov::Tensor& tensor);

 void print_output_tensor_info(const std::string& name,
                               const ov::Tensor& tensor,
-                              std::map& output_dst);
\ No newline at end of file
+                              std::map& output_dst);
+
+template
+std::vector pad_input(const ggml_tensor* tensor, size_t padded_rows, size_t padded_cols, T pad_value) {
+    std::vector padded_data(padded_rows * padded_cols, pad_value);
+    size_t rows = tensor->ne[1];
+    size_t cols = tensor->ne[0];
+    T* data = static_cast(tensor->data);
+
+    for (size_t i = 0; i < std::min(rows, padded_rows); ++i) {
+        for (size_t j = 0; j < std::min(cols, padded_cols); ++j) {
+            padded_data[i * padded_cols + j] = data[i * cols + j];
+        }
+    }
+    return padded_data;
+}
+
+void set_zero_diagonal(std::vector& matrix, size_t dim);

From d9ca8f5dbef15981d563e59e497317f19ba5364e Mon Sep 17 00:00:00 2001
From: "Yu, Zijun"
Date: Tue, 3 Jun 2025 14:22:51 +0800
Subject: [PATCH 076/254] NPU support version 2: prefill + kvcache

---
 ggml/src/ggml-openvino/ggml-decoder.cpp       |  4 +-
 ggml/src/ggml-openvino/openvino/op/cpy.cpp    | 16 +++---
 ggml/src/ggml-openvino/openvino/op/mulmat.cpp |  3 +-
 ggml/src/ggml-openvino/utils.cpp              | 54 +++++++++++++------
 ggml/src/ggml-openvino/utils.h                |  3 ++
 5 files changed, 52 insertions(+), 28 deletions(-)

diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp
index 29be4dbae8..66f82773e3 100644
--- a/ggml/src/ggml-openvino/ggml-decoder.cpp
+++ b/ggml/src/ggml-openvino/ggml-decoder.cpp
@@ -222,11 +222,11 @@ void GgmlOvDecoder::add_extra_inputs() {
             past_token_len = (int64_t)(node->src[1]->op_params[0] / node->src[1]->nb[0] / head_size / num_heads);

             std::string name = "past_token_len";
-            auto param_node = std::make_shared(ov::element::i64, ov::Shape{});
+            auto param_node = std::make_shared(ov::element::i64, ov::Shape{1});
             param_node->set_friendly_name(name);
             m_model_extra_inputs[name] = param_node;

-            auto tensor = std::make_shared(ov::element::i64, ov::Shape{});
+            auto tensor = std::make_shared(ov::element::i64, ov::Shape{1});
             *tensor->data() = past_token_len;
             m_model_extra_input_values[name] = tensor;
             break;

diff --git a/ggml/src/ggml-openvino/openvino/op/cpy.cpp b/ggml/src/ggml-openvino/openvino/op/cpy.cpp
index 75dd0e7d83..4973645024 100644
--- a/ggml/src/ggml-openvino/openvino/op/cpy.cpp
+++ b/ggml/src/ggml-openvino/openvino/op/cpy.cpp
@@ -34,7 +34,7 @@ OutputVector translate_cpy(const NodeContext& context) {
     auto src0 = context.get_input(0);
     auto src1 = context.get_input(1);

-    auto past_token_len_scalar = context.get_input("past_token_len");
+    auto past_token_len = context.get_input("past_token_len");

     src0 = std::make_shared(src0, context.get_input_type(1));
     ov::Output res;
@@ -68,18 +68,16 @@ OutputVector translate_cpy(const NodeContext& context) {
         std::shared_ptr indices;
         if (context.is_static()) {
-            indices = past_token_len_scalar.get_node_shared_ptr();
-            indices = std::make_shared(
-                indices,
-                ov::op::v0::Constant::create(ov::element::i64, {2}, std::vector{0, 1}));
+            indices = past_token_len.get_node_shared_ptr();
         } else {
+            auto past_token_len_scalar = std::make_shared(past_token_len, zero);
             auto total_token_len_scalar = std::make_shared(past_token_len_scalar, token_len_scalar);
             indices = std::make_shared(past_token_len_scalar,
                                        total_token_len_scalar,
                                        one_scalar,
                                        ov::element::i64);
-            indices = std::make_shared(indices, one);
         }
+        indices = std::make_shared(indices, one);

         res = std::make_shared(reshaped_src1, indices, src0);
     } else {
         // Write V to cache_v
@@ -108,11 +106,9 @@ OutputVector translate_cpy(const NodeContext& context) {
         // 1D tensor of shape [token_len], values starting from past_token_len
         std::shared_ptr range_col;
         if (context.is_static()) {
-            range_col = past_token_len_scalar.get_node_shared_ptr();
-            range_col = std::make_shared(
-                range_col,
-                ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector{0}));
+            range_col = past_token_len.get_node_shared_ptr();
         } else {
+            auto past_token_len_scalar = std::make_shared(past_token_len, zero);
             auto total_token_len_scalar = std::make_shared(past_token_len_scalar, token_len_scalar);
             range_col = std::make_shared(past_token_len_scalar,
                                          total_token_len_scalar,

diff --git a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp
index 20ad5683b8..0d3190f6c1 100644
--- a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp
+++ b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp
@@ -1,3 +1,4 @@
+#include
 #include
 #include
 #include
@@ -68,7 +69,7 @@ OutputVector translate_mulmat(const NodeContext& context) {
     std::vector src0_original_shape(src0_original_shape_.begin(), src0_original_shape_.end());

     if (context.is_static()) {
-        attention_size = ov::op::v0::Constant::create(ov::element::i64, {1}, {src0_original_shape[token_dim]});
+        attention_size = ov::op::v0::Constant::create(ov::element::i64, {1}, {INT_MAX});
     }
     src0_original_shape[token_dim] = -1;

diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp
index 3e49081515..fe46b8a794 100644
--- a/ggml/src/ggml-openvino/utils.cpp
+++ b/ggml/src/ggml-openvino/utils.cpp
@@ -1,6 +1,7 @@
 #include "utils.h"

 #include
+#include
 #include
 #include
 #include
@@ -70,15 +71,17 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
     ov::AnyMap config;
     if (device == "NPU") {
         config = {
-            {"NPU_COMPILATION_MODE_PARAMS", "compute-layers-with-higher-precision=ReduceMean"},
-            {"NPU_USE_NPUW", "YES"},
-            {"NPUW_DEVICES", "NPU"},
-            {"NPUW_FOLD", "YES"},
-            {"NPUW_DQ", "YES"},
-            {"NPUW_FUNCALL_ASYNC", "YES"},
-            {"NPUW_HOST_GATHER", "YES"},
-            {"NPUW_WEIGHTS_BANK", "shared"},
-            // {"NPU_COMPILER_TYPE", "MLIR"},
+            { "NPU_COMPILATION_MODE_PARAMS", "compute-layers-with-higher-precision=ReduceMean" },
+            { "NPU_USE_NPUW", "YES" },
+            { "NPUW_DEVICES", "NPU" },
+            { "NPUW_FOLD", "YES" },
"NPUW_HOST_GATHER", "YES" }, + { "NPUW_DQ", "YES" }, + { "NPUW_FUNCALL_ASYNC", "YES" }, + { "NPUW_WEIGHTS_BANK", "shared" }, + // Option 'CACHE_DIR' is not supported with MLIR compiler type + // {"NPUW_CACHE_DIR", getenv("GGML_OPENVINO_CACHE_DIR") ? getenv("GGML_OPENVINO_CACHE_DIR") : ""}, + { "NPU_COMPILER_TYPE", "MLIR" }, }; } @@ -102,15 +105,21 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c int64_t conversion_end_time; int64_t compile_end_time; + bool is_first_token = is_prefill(cgraph); + auto it = compiled_cache_prefill.find(cgraph); - bool is_first_token = it == compiled_cache_prefill.end(); - if (!is_first_token) { + if (it != compiled_cache_prefill.end()) { ggml_decoder = get_ggml_decoder(cgraph, is_static, false); decoder_end_time = ggml_time_us(); if (is_static) { - model = compiled_cache_kvcache[cgraph].first; - compiled_model = compiled_cache_kvcache[cgraph].second; + if (is_first_token) { + model = compiled_cache_prefill[cgraph].first; + compiled_model = compiled_cache_prefill[cgraph].second; + } else { + model = compiled_cache_kvcache[cgraph].first; + compiled_model = compiled_cache_kvcache[cgraph].second; + } } else { model = it->second.first; compiled_model = it->second.second; @@ -235,8 +244,6 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c } auto end_time = ggml_time_us(); - is_first_token = false; - if (getenv("GGML_OPENVINO_PROFILING")) { GGML_LOG_INFO("GGML OpenVINO Backend: \n"); GGML_LOG_INFO(" - Graph decoder Time: %ld ms \n", (decoder_end_time - start_time) / 1000); @@ -305,3 +312,20 @@ void set_zero_diagonal(std::vector& matrix, size_t dim) { matrix[i * dim + i] = 0.0f; } } + +bool is_prefill(struct ggml_cgraph * cgraph) { + for (int i = 0; i < cgraph->n_nodes; ++i) { + auto * op = cgraph->nodes[i]; + for (int j = 0; j < GGML_MAX_SRC; ++j) { + auto* src = op->src[j]; + if (src == nullptr) { + break; + } + if (std::string(src->name) == "inp_tokens") { + return src->ne[0] != 1; + } + } + } + GGML_LOG_ERROR("is_prefill: inp_tokens not found in cgraph"); + throw std::runtime_error("is_prefill: inp_tokens not found in cgraph"); +} diff --git a/ggml/src/ggml-openvino/utils.h b/ggml/src/ggml-openvino/utils.h index 000c2b87c1..2427b0b1ce 100644 --- a/ggml/src/ggml-openvino/utils.h +++ b/ggml/src/ggml-openvino/utils.h @@ -2,6 +2,7 @@ #include "ggml-backend-impl.h" #include "ggml-decoder.h" +#include "ggml-impl.h" enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_cgraph* cgraph); @@ -35,3 +36,5 @@ std::vector pad_input(const ggml_tensor* tensor, size_t padded_rows, size_t p } void set_zero_diagonal(std::vector& matrix, size_t dim); + +bool is_prefill(struct ggml_cgraph * cgraph); From f7ad77930e949694d3509d5bdc0c9debf911a8c2 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Wed, 4 Jun 2025 17:22:50 +0800 Subject: [PATCH 077/254] Change due to ggml cgraph changes, not correct yet --- ggml/src/ggml-openvino/ggml-decoder.cpp | 10 ++++++++++ ggml/src/ggml-openvino/openvino/op/mulmat.cpp | 9 ++++----- ggml/src/ggml-openvino/openvino/op/permute.cpp | 17 +++++++++++++---- 3 files changed, 27 insertions(+), 9 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 66f82773e3..2a95c894f4 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -187,6 +187,16 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node) { case GGML_OP_MUL_MAT: { if (node->src[0]->view_src == nullptr) { 
                m_op_case = 1;
+            } else if (std::string(node->src[0]->name).find("cache_k") == 0) {
+                m_op_case = 2;
+            } else if (std::string(node->src[0]->name).find("cache_v") == 0) {
+                m_op_case = 3;
+            }
+            break;
+        }
+        case GGML_OP_PERMUTE: {
+            if (ggml_is_contiguous(node->src[0])) {
+                m_op_case = 1;
             } else {
                 m_op_case = 2;
             }
             break;
         }

diff --git a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp
index 0d3190f6c1..728ee5cb5f 100644
--- a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp
+++ b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp
@@ -24,7 +24,7 @@ OutputVector translate_mulmat(const NodeContext& context) {
     num_inputs_check(context, 2, 2);

     int op_case = context.get_op_case();
-    FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2, "Unsupported MULMAT case");
+    FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2 || op_case == 3, "Unsupported MULMAT case");

     ov::Output res;
@@ -59,8 +59,7 @@ OutputVector translate_mulmat(const NodeContext& context) {
         auto src0 = context.get_input(0);
         auto src0_shape = context.get_input_shape(0).to_shape();
         auto src0_stride = context.get_input_stride(0);
-        auto permuted = is_permuted(src0_stride);
-        auto token_dim = permuted ? 0 : 2;
+        auto token_dim = op_case == 2 ? 0 : 2;

         auto attention_size = context.get_input("attention_size");
@@ -81,7 +80,7 @@ OutputVector translate_mulmat(const NodeContext& context) {
         auto src0_reshape = std::make_shared(src0, src0_reshape_shape, false);

         std::shared_ptr slice_end;
-        if (permuted) {
+        if (op_case == 2) {
             slice_end = std::make_shared(
                 ov::OutputVector{attention_size, ov::op::v0::Constant::create(ov::element::i64, {2}, src0_slice_shape)},
                 0);
@@ -94,7 +93,7 @@ OutputVector translate_mulmat(const NodeContext& context) {
         auto slice_step = ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector(3, 1));
         auto src0_slice = std::make_shared(src0_reshape, slice_start, slice_end, slice_step);

-        if (permuted) {
+        if (op_case == 2) {
             B = std::make_shared(
                 src0_slice,
                 ov::op::v0::Constant::create(ov::element::i64, {src0_perm.size()}, src0_perm));

diff --git a/ggml/src/ggml-openvino/openvino/op/permute.cpp b/ggml/src/ggml-openvino/openvino/op/permute.cpp
index 649cf8f3e1..8e91b61201 100644
--- a/ggml/src/ggml-openvino/openvino/op/permute.cpp
+++ b/ggml/src/ggml-openvino/openvino/op/permute.cpp
@@ -12,10 +12,19 @@ namespace op {
 OutputVector translate_permute(const NodeContext& context) {
     num_inputs_check(context, 1, 1);

-    auto perm = argsort_descend(context.get_output_stride(0));
-    auto res = std::make_shared(context.get_input(0),
-                                ov::op::v0::Constant::create(ov::element::i64, {3}, perm));
-    return rename_outputs_with_suffix({res}, context.get_name());
+    int op_case = context.get_op_case();
+    FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2, "Unsupported CONT case");
+    ov::Output res;
+
+    if (op_case == 1) {
+        auto perm = argsort_descend(context.get_output_stride(0));
+        auto res = std::make_shared(context.get_input(0),
+                                    ov::op::v0::Constant::create(ov::element::i64, {3}, perm));
+        return rename_outputs_with_suffix({res}, context.get_name());
+    } else {
+        auto res = context.get_input(0);
+        return {res};
+    }
 }

 } // namespace op

From 592d7f8bbb537060ef8861328ebf8e10f804f60c Mon Sep 17 00:00:00 2001
From: "Yu, Zijun"
Date: Mon, 16 Jun 2025 11:46:40 +0800
Subject: [PATCH 078/254] Change due to ggml cgraph changes, llama-3.2 CPU work

---
 ggml/src/ggml-openvino/ggml-decoder.cpp       |  9 ++-
 ggml/src/ggml-openvino/openvino/op/mulmat.cpp | 69 +-------
 .../src/ggml-openvino/openvino/op/permute.cpp | 53 +++++++++++++-
 ggml/src/ggml-openvino/utils.cpp              |  1 +
 4 files changed, 60 insertions(+), 72 deletions(-)

diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp
index 2a95c894f4..7b4456c8d0 100644
--- a/ggml/src/ggml-openvino/ggml-decoder.cpp
+++ b/ggml/src/ggml-openvino/ggml-decoder.cpp
@@ -195,10 +195,15 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node) {
         case GGML_OP_PERMUTE: {
-            if (ggml_is_contiguous(node->src[0])) {
+            if (node->src[0]->view_src == nullptr) {
+                // Permute Qcur
                 m_op_case = 1;
-            } else {
+            } else if (ggml_is_contiguous(node->src[0])) {
+                // Permute cache_k (view)
                 m_op_case = 2;
+            } else {
+                // Permute cache_v (view)
+                m_op_case = 3;
             }
             break;
         }

diff --git a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp
index 728ee5cb5f..b94f327a1f 100644
--- a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp
+++ b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp
@@ -34,75 +34,10 @@ OutputVector translate_mulmat(const NodeContext& context) {
         auto result_lp = std::make_shared(src1, src0, false, true);
         res = std::make_shared(result_lp, context.get_output_type(0));
     } else {
-        /*
-        Two cases here:
-        21: [    96,    32,    32,     1] VIEW k-0 [     2,  6144,   192,  6144]
-            [196608,     1,     1,     1] 0: NONE cache_k_l0 [     2, 393216, 393216, 393216]
-        22: [    96,     7,    32,     1] PERMUTE q-0 [     4, 12288,   384, 86016]
-            [    96,    32,     7,     1] 0: SCALE Qcur-0 [     4,   384, 12288, 86016]
-        23: [    32,     7,    32,     1] MUL_MAT kq-0 [     4,   128,   896, 28672]
-            [    96,    32,    32,     1] 0: VIEW k-0 [     2,  6144,   192,  6144]
-            [    96,     7,    32,     1] 1: PERMUTE q-0 [     4, 12288,   384, 86016]
+        ov::Output B = context.get_input(0);
+        ov::Output A = std::make_shared(context.get_input(1), context.get_input_type(0));
-        20: [    32,    96,    32,     1] VIEW v-0 [     2,   128, 12288, 393216]
-            [196608,     1,     1,     1] 0: NONE cache_v_l0 [     2, 393216, 393216, 393216]
-        25: [    96,     7,    32,     1] MUL_MAT kqv-0 [     4,   384,  2688, 86016]
-            [    32,    96,    32,     1] 0: VIEW v-0 [     2,   128, 12288, 393216]
-            [    32,     7,    32,     1] 1: SOFT_MAX kq_soft_max_ext-0 [     4,   128,   896, 28672]
-
-        For case 1, for src0, Reshape + Slice + Transpose
-        For case 2, for src0, Reshape + Slice
-        */
-        ov::Output A;
-        ov::Output B;
-
-        auto src0 = context.get_input(0);
         auto src0_shape = context.get_input_shape(0).to_shape();
-        auto src0_stride = context.get_input_stride(0);
-        auto token_dim = op_case == 2 ? 0 : 2;
-
-        auto attention_size = context.get_input("attention_size");
-
-        auto src0_perm = argsort_descend(src0_stride);
-        auto src0_original_shape_ = permute(src0_shape, src0_perm);
-        std::vector src0_original_shape(src0_original_shape_.begin(), src0_original_shape_.end());
-
-        if (context.is_static()) {
-            attention_size = ov::op::v0::Constant::create(ov::element::i64, {1}, {INT_MAX});
-        }
-        src0_original_shape[token_dim] = -1;
-
-        auto src0_slice_shape = src0_original_shape;
-        src0_slice_shape.erase(src0_slice_shape.begin() + token_dim);
-
-        auto src0_reshape_shape =
-            ov::op::v0::Constant::create(ov::element::i64, {src0_original_shape.size()}, src0_original_shape);
-        auto src0_reshape = std::make_shared(src0, src0_reshape_shape, false);
-
-        std::shared_ptr slice_end;
-        if (op_case == 2) {
-            slice_end = std::make_shared(
-                ov::OutputVector{attention_size, ov::op::v0::Constant::create(ov::element::i64, {2}, src0_slice_shape)},
-                0);
-        } else {
-            slice_end = std::make_shared(
-                ov::OutputVector{ov::op::v0::Constant::create(ov::element::i64, {2}, src0_slice_shape), attention_size},
-                0);
-        }
-        auto slice_start = ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector(3, 0));
-        auto slice_step = ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector(3, 1));
-        auto src0_slice = std::make_shared(src0_reshape, slice_start, slice_end, slice_step);
-
-        if (op_case == 2) {
-            B = std::make_shared(
-                src0_slice,
-                ov::op::v0::Constant::create(ov::element::i64, {src0_perm.size()}, src0_perm));
-        } else {
-            B = src0_slice;
-        }
-
-        A = std::make_shared(context.get_input(1), context.get_input_type(0));
-
         int64_t num_heads = context.get_input_shape(1).to_shape()[0];
         int64_t num_heads_kv = src0_shape[0];
         int64_t kv_num_heads_factor = num_heads / num_heads_kv;

diff --git a/ggml/src/ggml-openvino/openvino/op/permute.cpp b/ggml/src/ggml-openvino/openvino/op/permute.cpp
index 8e91b61201..8b246f75cd 100644
--- a/ggml/src/ggml-openvino/openvino/op/permute.cpp
+++ b/ggml/src/ggml-openvino/openvino/op/permute.cpp
@@ -1,4 +1,11 @@
+#include
+#include
+#include
+#include
+#include
 #include
+#include
+#include
 #include

 #include "../node_context.hpp"
@@ -13,7 +20,7 @@ OutputVector translate_permute(const NodeContext& context) {
     num_inputs_check(context, 1, 1);

     int op_case = context.get_op_case();
-    FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2, "Unsupported CONT case");
+    FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2 || op_case == 3, "Unsupported CONT case");
     ov::Output res;
@@ -22,8 +29,48 @@ OutputVector translate_permute(const NodeContext& context) {
         return rename_outputs_with_suffix({res}, context.get_name());
     } else {
-        auto res = context.get_input(0);
-        return {res};
+        auto src = context.get_input(0);
+        auto attention_size = context.get_input("attention_size");
+        if (context.is_static()) {
+            attention_size = ov::op::v0::Constant::create(ov::element::i64, {1}, {INT_MAX});
+        }
+
+        auto src_shape_ = context.get_input_shape(0).to_shape();
+        std::vector src_shape(src_shape_.begin(), src_shape_.end());
+
+        std::shared_ptr src_reshaped;
+        if (op_case == 2) {
+            src_reshaped = std::make_shared(
+                src,
+                ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector{-1, src_shape[1], src_shape[2]}),
+                false);
+        } else {
+            src_reshaped = std::make_shared(
+                src,
+                ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector{src_shape[1], src_shape[0], -1}),
+                false);
+        }
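+        // Slice the KV-cache view down to the first attention_size tokens before
+        // permuting; with static shapes (NPU) attention_size is INT_MAX above, so
+        // the slice keeps the whole padded cache.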
+        auto slice_start = ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector(3, 0));
+        auto slice_step = ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector(3, 1));
+        std::shared_ptr slice_end;
+        if (op_case == 2) {
+            slice_end = std::make_shared(
+                ov::OutputVector{attention_size, ov::op::v0::Constant::create(ov::element::i64, {2}, {src_shape[1], src_shape[2]})},
+                0);
+        } else {
+            slice_end = std::make_shared(
+                ov::OutputVector{ov::op::v0::Constant::create(ov::element::i64, {2}, {src_shape[1], src_shape[0]}), attention_size},
+                0);
+        }
+        auto src_slice = std::make_shared(src_reshaped, slice_start, slice_end, slice_step);
+
+        if (op_case == 2) {
+            res = std::make_shared(src_slice, ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 0, 2}));
+        } else {
+            res = src_slice;
+        }
+        return rename_outputs_with_suffix({res}, context.get_name());
     }
 }

diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp
index fe46b8a794..44356209ce 100644
--- a/ggml/src/ggml-openvino/utils.cpp
+++ b/ggml/src/ggml-openvino/utils.cpp
@@ -262,6 +262,7 @@ size_t checksum(const void* data, size_t size) {
     const uint8_t* bytes = static_cast(data);
     size_t sum = 0;
     for (size_t i = 0; i < size; ++i) {
+        sum += (uint8_t)i;
         sum += bytes[i];
     }
     return sum;

From e27738a987f672f837a873958c68e6353455eafd Mon Sep 17 00:00:00 2001
From: "Yu, Zijun"
Date: Mon, 16 Jun 2025 13:19:51 +0800
Subject: [PATCH 079/254] Add AMD64 to CMakeLists

---
 ggml/src/ggml-openvino/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml/src/ggml-openvino/CMakeLists.txt b/ggml/src/ggml-openvino/CMakeLists.txt
index 08712c1527..216aa756a7 100644
--- a/ggml/src/ggml-openvino/CMakeLists.txt
+++ b/ggml/src/ggml-openvino/CMakeLists.txt
@@ -12,7 +12,7 @@ target_link_libraries(ggml-openvino PRIVATE openvino::runtime)

 if (GGML_OPENVINO)
     if (CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64")
-    elseif (CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" OR CMAKE_SYSTEM_PROCESSOR STREQUAL "amd64")
+    elseif (CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" OR CMAKE_SYSTEM_PROCESSOR STREQUAL "amd64" OR CMAKE_SYSTEM_PROCESSOR STREQUAL "AMD64")
     else()
         message(FATAL_ERROR "OpenVINO: OpenVINO toolkit supports x86-64 and arm64 but not ${CMAKE_SYSTEM_PROCESSOR}")
     endif()

From 42d4240937f234610c0873b8cf3d77095a036b9e Mon Sep 17 00:00:00 2001
From: "Yu, Zijun"
Date: Mon, 16 Jun 2025 13:20:11 +0800
Subject: [PATCH 080/254] Change due to ggml cgraph changes, all device work

---
 ggml/src/ggml-openvino/ggml-decoder.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp
index 7b4456c8d0..7b62f4487c 100644
--- a/ggml/src/ggml-openvino/ggml-decoder.cpp
+++ b/ggml/src/ggml-openvino/ggml-decoder.cpp
@@ -216,9 +216,9 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node) {
 void GgmlOvDecoder::set_max_token_len() {
     for (int i = 0; i < m_cgraph->n_nodes; i++) {
         auto* node = m_cgraph->nodes[i];
-        if (std::string(node->name) == "k-0") {
+        if (std::string(node->name) == "cache_k_l0 (view)") {
             auto* cache_k = node->src[0];
-            m_max_token_len = cache_k->ne[0] / node->ne[0] / node->ne[2];
+            m_max_token_len = cache_k->ne[1];
             break;
         }
     }

From 593484ce9f5075022e8872dc8e184f52406f29a9 Mon Sep 17 00:00:00 2001
From: "Yu, Zijun"
Date: Fri, 20 Jun 2025 16:41:42 +0800
Subject: [PATCH 081/254] Refactor: clean, fix warning

---
 examples/simple/simple.cpp                    |   2 +-
 ggml/CMakeLists.txt                           |   2 -
 ggml/src/ggml-openvino/.clang-format          |   4 +
 ggml/src/ggml-openvino/ggml-decoder.cpp       |
   7 +-
 ggml/src/ggml-openvino/ggml-decoder.h         |   4 +-
 ggml/src/ggml-openvino/openvino/op/add.cpp    |  22 --
 ggml/src/ggml-openvino/openvino/op/cont.cpp   |   1 +
 ggml/src/ggml-openvino/openvino/op/cpy.cpp    |   1 +
 .../ggml-openvino/openvino/op/get_rows.cpp    |   1 +
 ggml/src/ggml-openvino/openvino/op/mul.cpp    |  21 --
 ggml/src/ggml-openvino/openvino/op/mulmat.cpp |   1 +
 .../src/ggml-openvino/openvino/op/permute.cpp |   8 +-
 .../src/ggml-openvino/openvino/op/reshape.cpp |   1 +
 .../ggml-openvino/openvino/op/rms_norm.cpp    |   1 +
 ggml/src/ggml-openvino/openvino/op/rope.cpp   |  20 +-
 ggml/src/ggml-openvino/openvino/op/scale.cpp  |   1 +
 .../ggml-openvino/openvino/op/soft_max.cpp    |  11 +-
 .../ggml-openvino/openvino/op/transpose.cpp   |   1 +
 .../ggml-openvino/openvino/op/unary_silu.cpp  |   1 +
 ggml/src/ggml-openvino/openvino/op/view.cpp   |   1 +
 ggml/src/ggml-openvino/openvino/op_table.cpp  |  64 ++----
 ggml/src/ggml-openvino/openvino/op_table.hpp  |  23 ++
 ggml/src/ggml-openvino/openvino/utils.hpp     |  10 +-
 ggml/src/ggml-openvino/utils.cpp              | 196 ++++++++++--------
 ggml/src/ggml-openvino/utils.h                |   6 +-
 setup.sh                                      |   2 -
 26 files changed, 213 insertions(+), 199 deletions(-)
 create mode 100644 ggml/src/ggml-openvino/.clang-format
 delete mode 100644 ggml/src/ggml-openvino/openvino/op/add.cpp
 delete mode 100644 ggml/src/ggml-openvino/openvino/op/mul.cpp
 delete mode 100755 setup.sh

diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp
index 9e6c678e83..d09771d104 100644
--- a/examples/simple/simple.cpp
+++ b/examples/simple/simple.cpp
@@ -140,7 +140,7 @@ int main(int argc, char ** argv) {
         std::string s(buf, n);
         printf("%s", s.c_str());
     }
-    printf("\n");
+

     // prepare a batch for the prompt
     llama_batch batch = llama_batch_get_one(prompt_tokens.data(), prompt_tokens.size());

diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt
index 2fa05ab90c..4c2d79a723 100644
--- a/ggml/CMakeLists.txt
+++ b/ggml/CMakeLists.txt
@@ -247,8 +247,6 @@ set (GGML_SYCL_DEVICE_ARCH "" CACHE STRING "ggml: sycl device architecture")

 option(GGML_OPENVINO "ggml: use OPENVINO" OFF)
-option(GGML_OPENVINO_DEBUG "ggml: enable OPENVINO debugging" OFF)
-option(GGML_OV_FRONTEND "ggml: OPENVINO frontend path" ON)

 option(GGML_OPENCL "ggml: use OpenCL" OFF)
 option(GGML_OPENCL_PROFILING "ggml: use OpenCL profiling (increases overhead)" OFF)

diff --git a/ggml/src/ggml-openvino/.clang-format b/ggml/src/ggml-openvino/.clang-format
new file mode 100644
index 0000000000..8491f4e5c6
--- /dev/null
+++ b/ggml/src/ggml-openvino/.clang-format
@@ -0,0 +1,4 @@
+---
+AlignConsecutiveAssignments: false
+AlignConsecutiveDeclarations: false
+ReferenceAlignment: Left

diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp
index 7b62f4487c..04f68a4950 100644
--- a/ggml/src/ggml-openvino/ggml-decoder.cpp
+++ b/ggml/src/ggml-openvino/ggml-decoder.cpp
@@ -225,9 +225,9 @@ void GgmlOvDecoder::set_max_token_len() {
 }

 void GgmlOvDecoder::add_extra_inputs() {
-    int64_t past_token_len;
+    int64_t past_token_len = -1;
     // attention_size not used for NPU
-    int64_t attention_size;
+    int64_t attention_size = -1;

     for (const auto& node : m_nodes) {
         if (node->op == GGML_OP_CPY && ggml_is_contiguous(node)) {
@@ -247,6 +247,9 @@ void GgmlOvDecoder::add_extra_inputs() {
             break;
         }
     }
+    if (past_token_len == -1) {
+        throw std::runtime_error("Failed to find input \"cache_k\" in the graph");
+    }
     for (const auto& node : m_nodes) {
         if (node->src[1] && std::string(node->src[1]->name).find("inp_tokens") == 0) {
             int64_t total_token_len = node->src[1]->ne[0] + past_token_len;

diff
 --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h
index 2c89d06267..b6b13d1f11 100644
--- a/ggml/src/ggml-openvino/ggml-decoder.h
+++ b/ggml/src/ggml-openvino/ggml-decoder.h
@@ -61,11 +61,11 @@ public:
     virtual void visit_subgraph(std::function)> node_visitor) const override;

-    const ggml_tensor* get_input_ggml_tensor(std::string& name) const {
+    const ggml_tensor* get_input_ggml_tensor(const std::string& name) const {
         return m_inputs.at(name);
     }

-    const ggml_tensor* get_output_ggml_tensor(std::string& name) const {
+    const ggml_tensor* get_output_ggml_tensor(const std::string& name) const {
         return m_outputs.at(name);
     }

diff --git a/ggml/src/ggml-openvino/openvino/op/add.cpp b/ggml/src/ggml-openvino/openvino/op/add.cpp
deleted file mode 100644
index 5a75ff2148..0000000000
--- a/ggml/src/ggml-openvino/openvino/op/add.cpp
+++ /dev/null
@@ -1,22 +0,0 @@
-#include
-
-#include "../node_context.hpp"
-#include "../utils.hpp"
-
-namespace ov {
-namespace frontend {
-namespace ggml {
-namespace op {
-
-OutputVector translate_add(const NodeContext& context) {
-    num_inputs_check(context, 2, 2);
-
-    auto res = std::make_shared(context.get_input(0), context.get_input(1));
-
-    return rename_outputs_with_suffix({res}, context.get_name());
-}
-
-} // namespace op
-} // namespace ggml
-} // namespace frontend
-} // namespace ov

diff --git a/ggml/src/ggml-openvino/openvino/op/cont.cpp b/ggml/src/ggml-openvino/openvino/op/cont.cpp
index 7cdfba051e..5c6953caff 100644
--- a/ggml/src/ggml-openvino/openvino/op/cont.cpp
+++ b/ggml/src/ggml-openvino/openvino/op/cont.cpp
@@ -7,6 +7,7 @@
 #include

 #include "../node_context.hpp"
+#include "../op_table.hpp"
 #include "../utils.hpp"

 namespace ov {

diff --git a/ggml/src/ggml-openvino/openvino/op/cpy.cpp b/ggml/src/ggml-openvino/openvino/op/cpy.cpp
index 4973645024..d27f4babb4 100644
--- a/ggml/src/ggml-openvino/openvino/op/cpy.cpp
+++ b/ggml/src/ggml-openvino/openvino/op/cpy.cpp
@@ -19,6 +19,7 @@
 #include

 #include "../node_context.hpp"
+#include "../op_table.hpp"
 #include "../utils.hpp"

 namespace ov {

diff --git a/ggml/src/ggml-openvino/openvino/op/get_rows.cpp b/ggml/src/ggml-openvino/openvino/op/get_rows.cpp
index ca36548d9f..9ed5f4deaf 100644
--- a/ggml/src/ggml-openvino/openvino/op/get_rows.cpp
+++ b/ggml/src/ggml-openvino/openvino/op/get_rows.cpp
@@ -7,6 +7,7 @@
 #include

 #include "../node_context.hpp"
+#include "../op_table.hpp"
 #include "../utils.hpp"

 namespace ov {

diff --git a/ggml/src/ggml-openvino/openvino/op/mul.cpp b/ggml/src/ggml-openvino/openvino/op/mul.cpp
deleted file mode 100644
index 40caf4331e..0000000000
--- a/ggml/src/ggml-openvino/openvino/op/mul.cpp
+++ /dev/null
@@ -1,21 +0,0 @@
-#include
-
-#include "../node_context.hpp"
-#include "../utils.hpp"
-
-namespace ov {
-namespace frontend {
-namespace ggml {
-namespace op {
-
-OutputVector translate_mul(const NodeContext& context) {
-    num_inputs_check(context, 2, 2);
-
-    auto res = std::make_shared(context.get_input(0), context.get_input(1));
-    return rename_outputs_with_suffix({res}, context.get_name());
-}
-
-} // namespace op
-} // namespace ggml
-} // namespace frontend
-} // namespace ov

diff --git a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp
index b94f327a1f..d5a6ba2f03 100644
--- a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp
+++ b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp
@@ -13,6 +13,7 @@
 #include

 #include "../node_context.hpp"
+#include "../op_table.hpp"
 #include "../utils.hpp"

 namespace ov {

diff --git
 a/ggml/src/ggml-openvino/openvino/op/permute.cpp b/ggml/src/ggml-openvino/openvino/op/permute.cpp
index 8b246f75cd..09d15da427 100644
--- a/ggml/src/ggml-openvino/openvino/op/permute.cpp
+++ b/ggml/src/ggml-openvino/openvino/op/permute.cpp
@@ -9,6 +9,7 @@
 #include

 #include "../node_context.hpp"
+#include "../op_table.hpp"
 #include "../utils.hpp"

 namespace ov {
@@ -25,9 +26,8 @@ OutputVector translate_permute(const NodeContext& context) {
     if (op_case == 1) {
         auto perm = argsort_descend(context.get_output_stride(0));
-        auto res = std::make_shared(context.get_input(0),
-                                    ov::op::v0::Constant::create(ov::element::i64, {3}, perm));
-        return rename_outputs_with_suffix({res}, context.get_name());
+        res = std::make_shared(context.get_input(0),
+                               ov::op::v0::Constant::create(ov::element::i64, { 3 }, perm));
     } else {
         auto src = context.get_input(0);
         auto attention_size = context.get_input("attention_size");
@@ -70,8 +70,8 @@ OutputVector translate_permute(const NodeContext& context) {
         } else {
             res = src_slice;
         }
-        return rename_outputs_with_suffix({res}, context.get_name());
     }
+    return rename_outputs_with_suffix({ res }, context.get_name());
 }

 } // namespace op

diff --git a/ggml/src/ggml-openvino/openvino/op/reshape.cpp b/ggml/src/ggml-openvino/openvino/op/reshape.cpp
index 49551eb815..3a695683bf 100644
--- a/ggml/src/ggml-openvino/openvino/op/reshape.cpp
+++ b/ggml/src/ggml-openvino/openvino/op/reshape.cpp
@@ -8,6 +8,7 @@
 #include

 #include "../node_context.hpp"
+#include "../op_table.hpp"
 #include "../utils.hpp"

 namespace ov {

diff --git a/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp b/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp
index 4b230ad630..211692a3c7 100644
--- a/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp
+++ b/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp
@@ -7,6 +7,7 @@
 #include

 #include "../node_context.hpp"
+#include "../op_table.hpp"
 #include "../utils.hpp"

 namespace ov {

diff --git a/ggml/src/ggml-openvino/openvino/op/rope.cpp b/ggml/src/ggml-openvino/openvino/op/rope.cpp
index b47b8a6a54..78523e5781 100644
--- a/ggml/src/ggml-openvino/openvino/op/rope.cpp
+++ b/ggml/src/ggml-openvino/openvino/op/rope.cpp
@@ -20,6 +20,7 @@
 #include

 #include "../node_context.hpp"
+#include "../op_table.hpp"
 #include "../utils.hpp"

 #ifndef M_PI
@@ -36,21 +37,19 @@ namespace frontend {
 namespace ggml {
 namespace op {

-static float ggml_rope_yarn_corr_dim(int n_dims, int n_ctx_orig, float n_rot, float base) {
-    return n_dims * logf(n_ctx_orig / (n_rot * 2 * (float)M_PI)) / (2 * logf(base));
+namespace {
+float ggml_rope_yarn_corr_dim(int n_dims, int n_ctx_orig, float n_rot, float base) {
+    return n_dims * logf(n_ctx_orig / (n_rot * 2 * (float) M_PI)) / (2 * logf(base));
 }

-void ggml_rope_yarn_corr_dims(int n_dims,
-                              int n_ctx_orig,
-                              float freq_base,
-                              float beta_fast,
-                              float beta_slow,
+void ggml_rope_yarn_corr_dims(int n_dims, int n_ctx_orig, float freq_base, float beta_fast, float beta_slow,
                               float dims[2]) {
     float start = floorf(ggml_rope_yarn_corr_dim(n_dims, n_ctx_orig, beta_fast, freq_base));
     float end = ceilf(ggml_rope_yarn_corr_dim(n_dims, n_ctx_orig, beta_slow, freq_base));
     dims[0] = MAX(0, start);
     dims[1] = MIN(n_dims - 1, end);
 }
+}  // namespace

 OutputVector translate_rope(const NodeContext& context) {
     num_inputs_check(context, 2, 3);
@@ -67,7 +66,12 @@ OutputVector translate_rope(const NodeContext& context) {

     auto output_shape = context.get_output_shape(0);

-    float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
+    float freq_base;
+    float freq_scale;
+    float ext_factor;
+    float attn_factor;
+    float beta_fast;
+    float beta_slow;
     int32_t* op_params = context.get_output_op_params(0);
     const int n_dims = op_params[1];
     const int mode = op_params[2];

diff --git a/ggml/src/ggml-openvino/openvino/op/scale.cpp b/ggml/src/ggml-openvino/openvino/op/scale.cpp
index 8f0999432c..783440ebd9 100644
--- a/ggml/src/ggml-openvino/openvino/op/scale.cpp
+++ b/ggml/src/ggml-openvino/openvino/op/scale.cpp
@@ -3,6 +3,7 @@
 #include

 #include "../node_context.hpp"
+#include "../op_table.hpp"
 #include "../utils.hpp"

 namespace ov {

diff --git a/ggml/src/ggml-openvino/openvino/op/soft_max.cpp b/ggml/src/ggml-openvino/openvino/op/soft_max.cpp
index bb6b002395..aeca9b3be5 100644
--- a/ggml/src/ggml-openvino/openvino/op/soft_max.cpp
+++ b/ggml/src/ggml-openvino/openvino/op/soft_max.cpp
@@ -1,5 +1,3 @@
-
-#include
 #include
 #include
 #include
@@ -13,6 +11,7 @@
 #include

 #include "../node_context.hpp"
+#include "../op_table.hpp"
 #include "../utils.hpp"

 namespace ov {
@@ -28,18 +27,18 @@ OutputVector translate_soft_max(const NodeContext& context) {
     float scale = 1.0f;
     float max_bias = 0.0f;
-    auto op_params = context.get_output_op_params(0);
+    auto * op_params = context.get_output_op_params(0);
     memcpy(&scale, (float*)op_params + 0, sizeof(float));
     memcpy(&max_bias, (float*)op_params + 1, sizeof(float));

-    const uint32_t n_head = context.get_input_shape(0)[0].get_length();
-    const uint32_t n_head_log2 = 1u << (uint32_t)floor(log2(n_head));
+    // const uint32_t n_head = context.get_input_shape(0)[0].get_length();
+    // const uint32_t n_head_log2 = 1u << (uint32_t)floor(log2(n_head));

     // const float m0 = powf(2.0f, -(max_bias       ) / n_head_log2);
     // const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);

-    const float slope = (max_bias > 0.0f) ? 1.0f : 1.0f;
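+    // NOTE: per-head ALiBi slopes (the commented-out m0/m1 math above) are not
+    // wired up here; the slope is fixed to 1.0 below.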
     // const float slope = (max_bias > 0.0f) ? h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2*(h - n_head_log2) + 1)
     //                                       : 1.0f;
+    const float slope = 1.0;

     if (scale != 1.0f) {
         auto scale_node =

diff --git a/ggml/src/ggml-openvino/openvino/op/transpose.cpp b/ggml/src/ggml-openvino/openvino/op/transpose.cpp
index 99178a1944..b35f1fb861 100644
--- a/ggml/src/ggml-openvino/openvino/op/transpose.cpp
+++ b/ggml/src/ggml-openvino/openvino/op/transpose.cpp
@@ -1,6 +1,7 @@
 #include

 #include "../node_context.hpp"
+#include "../op_table.hpp"
 #include "../utils.hpp"

 namespace ov {

diff --git a/ggml/src/ggml-openvino/openvino/op/unary_silu.cpp b/ggml/src/ggml-openvino/openvino/op/unary_silu.cpp
index 6c73653ca4..2b27c0be12 100644
--- a/ggml/src/ggml-openvino/openvino/op/unary_silu.cpp
+++ b/ggml/src/ggml-openvino/openvino/op/unary_silu.cpp
@@ -3,6 +3,7 @@
 #include

 #include "../node_context.hpp"
+#include "../op_table.hpp"
 #include "../utils.hpp"

 namespace ov {

diff --git a/ggml/src/ggml-openvino/openvino/op/view.cpp b/ggml/src/ggml-openvino/openvino/op/view.cpp
index fcfb9f732c..58143e667c 100644
--- a/ggml/src/ggml-openvino/openvino/op/view.cpp
+++ b/ggml/src/ggml-openvino/openvino/op/view.cpp
@@ -1,3 +1,4 @@
+#include "../op_table.hpp"
 #include "../utils.hpp"

 namespace ov {

diff --git a/ggml/src/ggml-openvino/openvino/op_table.cpp b/ggml/src/ggml-openvino/openvino/op_table.cpp
index d588b2bff0..11d1c773c3 100644
--- a/ggml/src/ggml-openvino/openvino/op_table.cpp
+++ b/ggml/src/ggml-openvino/openvino/op_table.cpp
@@ -9,55 +9,31 @@

 #include "utils.hpp"

-using namespace ov::op;
 namespace ov {
 namespace frontend {
 namespace ggml {

-namespace op {
-
-#define GGML_OP_CONVERTER(op) OutputVector op(const NodeContext& node)
-
-GGML_OP_CONVERTER(translate_add);
-GGML_OP_CONVERTER(translate_cont);
-GGML_OP_CONVERTER(translate_cpy);
-GGML_OP_CONVERTER(translate_get_rows);
-GGML_OP_CONVERTER(translate_mul);
-GGML_OP_CONVERTER(translate_mulmat);
-GGML_OP_CONVERTER(translate_permute);
-GGML_OP_CONVERTER(translate_reshape);
-GGML_OP_CONVERTER(translate_rms_norm);
-GGML_OP_CONVERTER(translate_rope);
-GGML_OP_CONVERTER(translate_scale);
-GGML_OP_CONVERTER(translate_unary_silu);
-GGML_OP_CONVERTER(translate_soft_max);
-GGML_OP_CONVERTER(translate_transpose);
-GGML_OP_CONVERTER(translate_unary);
-GGML_OP_CONVERTER(translate_view);
-
-} // namespace op
-
 std::unordered_map get_supported_ops() {
-    return {{"GGML_OP_ADD", op::translate_1to1_match_2_inputs},
-            {"GGML_OP_ADD1", op::translate_1to1_match_2_inputs},
-            {"GGML_OP_CONT", op::translate_cont},
-            {"GGML_OP_CPY", op::translate_cpy},
-            {"GGML_OP_DIV", op::translate_1to1_match_2_inputs},
-            {"GGML_OP_GET_ROWS", op::translate_get_rows},
-            // {"GGML_OP_MUL", op::translate_1to1_match_2_inputs},
-            {"GGML_OP_MUL", op::translate_mul},
-            {"GGML_OP_MUL_MAT", op::translate_mulmat},
-            {"GGML_OP_PERMUTE", op::translate_permute},
-            {"GGML_OP_RESHAPE", op::translate_reshape},
-            {"GGML_OP_RMS_NORM", op::translate_rms_norm},
-            {"GGML_OP_ROPE", op::translate_rope},
-            {"GGML_OP_SCALE", op::translate_scale},
-            {"GGML_OP_SOFT_MAX", op::translate_soft_max},
-            {"GGML_OP_SUB", op::translate_1to1_match_2_inputs},
-            {"GGML_OP_TRANSPOSE", op::translate_transpose},
-            {"GGML_UNARY_OP_SILU", op::translate_unary_silu},
-            {"GGML_OP_VIEW", op::translate_view}};
-};
+    using namespace ov::op;
+    return {{"GGML_OP_ADD", op::translate_1to1_match_2_inputs},
+            {"GGML_OP_ADD1", op::translate_1to1_match_2_inputs},
+            {"GGML_OP_CONT", op::translate_cont},
+            {"GGML_OP_CPY", op::translate_cpy},
+            {"GGML_OP_DIV", op::translate_1to1_match_2_inputs},
+            {"GGML_OP_GET_ROWS",
 op::translate_get_rows},
+            {"GGML_OP_MUL", op::translate_1to1_match_2_inputs},
+            {"GGML_OP_MUL_MAT", op::translate_mulmat},
+            {"GGML_OP_PERMUTE", op::translate_permute},
+            {"GGML_OP_RESHAPE", op::translate_reshape},
+            {"GGML_OP_RMS_NORM", op::translate_rms_norm},
+            {"GGML_OP_ROPE", op::translate_rope},
+            {"GGML_OP_SCALE", op::translate_scale},
+            {"GGML_OP_SOFT_MAX", op::translate_soft_max},
+            {"GGML_OP_SUB", op::translate_1to1_match_2_inputs},
+            {"GGML_OP_TRANSPOSE", op::translate_transpose},
+            {"GGML_UNARY_OP_SILU", op::translate_unary_silu},
+            {"GGML_OP_VIEW", op::translate_view}};
+}

 } // namespace ggml
 } // namespace frontend

diff --git a/ggml/src/ggml-openvino/openvino/op_table.hpp b/ggml/src/ggml-openvino/openvino/op_table.hpp
index 1a71a06c18..d576c2a135 100644
--- a/ggml/src/ggml-openvino/openvino/op_table.hpp
+++ b/ggml/src/ggml-openvino/openvino/op_table.hpp
@@ -6,6 +6,29 @@
 namespace ov {
 namespace frontend {
 namespace ggml {

+namespace op {
+
+#define GGML_OP_CONVERTER(op) OutputVector op(const NodeContext& context)
+
+GGML_OP_CONVERTER(translate_add);
+GGML_OP_CONVERTER(translate_cont);
+GGML_OP_CONVERTER(translate_cpy);
+GGML_OP_CONVERTER(translate_get_rows);
+GGML_OP_CONVERTER(translate_mul);
+GGML_OP_CONVERTER(translate_mulmat);
+GGML_OP_CONVERTER(translate_permute);
+GGML_OP_CONVERTER(translate_reshape);
+GGML_OP_CONVERTER(translate_rms_norm);
+GGML_OP_CONVERTER(translate_rope);
+GGML_OP_CONVERTER(translate_scale);
+GGML_OP_CONVERTER(translate_unary_silu);
+GGML_OP_CONVERTER(translate_soft_max);
+GGML_OP_CONVERTER(translate_transpose);
+GGML_OP_CONVERTER(translate_unary);
+GGML_OP_CONVERTER(translate_view);
+
+} // namespace op
+
 std::unordered_map get_supported_ops();

 } // namespace ggml

diff --git a/ggml/src/ggml-openvino/openvino/utils.hpp b/ggml/src/ggml-openvino/openvino/utils.hpp
index e0fe250789..1896f81427 100644
--- a/ggml/src/ggml-openvino/openvino/utils.hpp
+++ b/ggml/src/ggml-openvino/openvino/utils.hpp
@@ -8,7 +8,9 @@
 namespace ov {
 namespace frontend {
 namespace ggml {

-void dump_ov_model(const std::shared_ptr model);
+std::string getCurrentTime();
+
+void dump_ov_model(std::shared_ptr model);

 void num_inputs_check(const NodeContext& context, size_t min_inputs, size_t max_inputs);
@@ -52,7 +54,8 @@ std::vector permute(const std::vector& x, const std::vector& perm) {
     return result;
 }

-std::shared_ptr get_dimensions(const std::shared_ptr& shape, const std::vector& dims);
+std::shared_ptr get_dimensions(const std::shared_ptr& shape,
+                               const std::vector& dims);
 std::shared_ptr get_dimensions(const std::shared_ptr& node, const std::vector& dims);

 OutputVector rename_outputs_with_suffix(const OutputVector& outputs, const std::string& suffix);
@@ -61,7 +64,8 @@ namespace op {
 template
 OutputVector translate_1to1_match_2_inputs(const NodeContext& context) {
     num_inputs_check(context, 2, 2);
-    return {std::make_shared(context.get_input(0), context.get_input(1))};
+    auto res = std::make_shared(context.get_input(0), context.get_input(1));
+    return rename_outputs_with_suffix({ res }, context.get_name());
 }

 } // namespace op

diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp
index 44356209ce..ebcf8fdd75 100644
--- a/ggml/src/ggml-openvino/utils.cpp
+++ b/ggml/src/ggml-openvino/utils.cpp
@@ -27,13 +27,15 @@ std::shared_ptr get_ggml_decoder(struct ggml_cgraph* cgraph, bool
     return std::make_shared(nullptr, cgraph, is_static, is_first_token);
 }

-ov::Tensor get_ggml_graph_input_tensor(std::shared_ptr ggml_decoder, std::string& name) {
+ov::Tensor convert_ggml_input_to_ov(std::shared_ptr ggml_decoder,
+                                    const std::string& name) {
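+    // Wrap the ggml tensor's existing buffer in an ov::Tensor (no copy);
+    // shape and element type come from the decoder.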
input_data = ggml_decoder->get_input_ggml_tensor(name)->data; - ov::Tensor input_tensor; - ov::Shape input_shape = ggml_decoder->get_input_shape(name).to_shape(); - std::vector input_stride = ggml_decoder->get_input_stride(name); - input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), input_shape, input_data); - return input_tensor; +ov::Tensor convert_ggml_input_to_ov(std::shared_ptr ggml_decoder, + const std::string& name) { + auto *input_data = ggml_decoder->get_input_ggml_tensor(name)->data; + ov::Tensor input_tensor; + ov::Shape input_shape = ggml_decoder->get_input_shape(name).to_shape(); + std::vector input_stride = ggml_decoder->get_input_stride(name); + input_tensor = + ov::Tensor(ggml_decoder->get_input_type(name), input_shape, input_data); + return input_tensor; } std::map get_ggml_graph_output_dst(std::shared_ptr ggml_decoder) { @@ -59,30 +61,21 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c static std::string device = getenv("GGML_OPENVINO_DEVICE") ? getenv("GGML_OPENVINO_DEVICE") : ""; if (device.empty()) { - // Prefer GPU over CPU - for (const auto& dev : core.get_available_devices()) { - device = dev; - if (device == "GPU") - break; + const std::vector preferred_device = {"GPU", "CPU", "NPU"}; + const auto available_devices = core.get_available_devices(); + for (const auto& dev : preferred_device) { + if (std::find(available_devices.begin(), available_devices.end(), + dev) != available_devices.end()) { + device = dev; + break; } + } } bool is_static = device == "NPU" ? true : false; ov::AnyMap config; if (device == "NPU") { - config = { - { "NPU_COMPILATION_MODE_PARAMS", "compute-layers-with-higher-precision=ReduceMean" }, - { "NPU_USE_NPUW", "YES" }, - { "NPUW_DEVICES", "NPU" }, - { "NPUW_FOLD", "YES" }, - { "NPUW_HOST_GATHER", "YES" }, - { "NPUW_DQ", "YES" }, - { "NPUW_FUNCALL_ASYNC", "YES" }, - { "NPUW_WEIGHTS_BANK", "shared" }, - // Option 'CACHE_DIR' is not supported with MLIR compiler type - // {"NPUW_CACHE_DIR", getenv("GGML_OPENVINO_CACHE_DIR") ? 
getenv("GGML_OPENVINO_CACHE_DIR") : ""}, - { "NPU_COMPILER_TYPE", "MLIR" }, - }; + config = get_npu_config(); } auto start_time = ggml_time_us(); @@ -179,48 +172,7 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c auto ov_params = model->get_parameters(); for (size_t i = 0; i < ov_params.size(); i++) { auto param_name = ov_params[i]->get_friendly_name(); - ov::Tensor input_tensor; - - if (ggml_decoder->get_model_extra_inputs().find(param_name) != ggml_decoder->get_model_extra_inputs().end()) { - input_tensor = *ggml_decoder->get_model_extra_input_values().at(param_name); - - } else if (!is_static) { - input_tensor = get_ggml_graph_input_tensor(ggml_decoder, param_name); - - } else { - if (param_name == "inp_tokens" || param_name == "inp_pos") { - if (is_first_token) { - size_t max_token_len = ggml_decoder->get_max_token_len(); - const auto* input_tensor_ggml = ggml_decoder->get_input_ggml_tensor(param_name); - std::vector padded_data = pad_input(input_tensor_ggml, 1, max_token_len, 0); - input_tensor = ov::Tensor(ov::element::i32, ov::Shape{1, 1, max_token_len}); - auto* data_ptr = input_tensor.data(); - std::copy(padded_data.begin(), padded_data.end(), data_ptr); - } else { - input_tensor = get_ggml_graph_input_tensor(ggml_decoder, param_name); - } - - } else if (param_name == "KQ_mask") { - size_t max_token_len = ggml_decoder->get_max_token_len(); - const auto* input_tensor_ggml = ggml_decoder->get_input_ggml_tensor(param_name); - if (is_first_token) { - std::vector padded_data = - pad_input(input_tensor_ggml, max_token_len, max_token_len, -INFINITY); - set_zero_diagonal(padded_data, max_token_len); - input_tensor = ov::Tensor(ov::element::f32, ov::Shape{1, max_token_len, max_token_len}); - auto* data_ptr = input_tensor.data(); - std::copy(padded_data.begin(), padded_data.end(), data_ptr); - } else { - std::vector padded_data = pad_input(input_tensor_ggml, 1, max_token_len, -INFINITY); - input_tensor = ov::Tensor(ov::element::f32, ov::Shape{1, 1, max_token_len}); - auto* data_ptr = input_tensor.data(); - std::copy(padded_data.begin(), padded_data.end(), data_ptr); - } - - } else { - input_tensor = get_ggml_graph_input_tensor(ggml_decoder, param_name); - } - } + auto input_tensor = get_ov_input_tensor(ggml_decoder, param_name); infer_request.set_input_tensor(i, input_tensor); if (getenv("GGML_OPENVINO_DEBUG_INPUT")) { @@ -258,6 +210,80 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c GGML_UNUSED(backend); } +ov::AnyMap get_npu_config() { + ov::AnyMap config = { + { "NPU_COMPILATION_MODE_PARAMS", "compute-layers-with-higher-precision=ReduceMean" }, + { "NPU_USE_NPUW", "YES" }, + { "NPUW_DEVICES", "NPU" }, + { "NPUW_FOLD", "YES" }, + { "NPUW_HOST_GATHER", "YES" }, + { "NPUW_DQ", "YES" }, + { "NPUW_FUNCALL_ASYNC", "YES" }, + { "NPUW_WEIGHTS_BANK", "shared" }, + // Option 'CACHE_DIR' is not supported with MLIR compiler type + // {"NPUW_CACHE_DIR", getenv("GGML_OPENVINO_CACHE_DIR") ? 
getenv("GGML_OPENVINO_CACHE_DIR") : ""}, + { "NPU_COMPILER_TYPE", "MLIR" }, + }; + return config; +} + +ov::Tensor get_ov_input_tensor(std::shared_ptr ggml_decoder, + const std::string& param_name) { + bool is_static = ggml_decoder->is_static(); + bool is_first_token = ggml_decoder->is_first_token(); + + ov::Tensor input_tensor; + if (ggml_decoder->get_model_extra_inputs().find(param_name) != + ggml_decoder->get_model_extra_inputs().end()) { + input_tensor = *ggml_decoder->get_model_extra_input_values().at(param_name); + + } else if (!is_static) { + input_tensor = convert_ggml_input_to_ov(ggml_decoder, param_name); + + } else { + if (param_name == "inp_tokens" || param_name == "inp_pos") { + if (is_first_token) { + size_t max_token_len = ggml_decoder->get_max_token_len(); + const auto *input_tensor_ggml = + ggml_decoder->get_input_ggml_tensor(param_name); + std::vector padded_data = + pad_input(input_tensor_ggml, 1, max_token_len, 0); + input_tensor = + ov::Tensor(ov::element::i32, ov::Shape{1, 1, max_token_len}); + auto *data_ptr = input_tensor.data(); + std::copy(padded_data.begin(), padded_data.end(), data_ptr); + } else { + input_tensor = convert_ggml_input_to_ov(ggml_decoder, param_name); + } + + } else if (param_name == "KQ_mask") { + size_t max_token_len = ggml_decoder->get_max_token_len(); + const auto *input_tensor_ggml = + ggml_decoder->get_input_ggml_tensor(param_name); + if (is_first_token) { + std::vector padded_data = pad_input( + input_tensor_ggml, max_token_len, max_token_len, -INFINITY); + set_zero_diagonal(padded_data, max_token_len); + input_tensor = ov::Tensor(ov::element::f32, + ov::Shape{1, max_token_len, max_token_len}); + auto *data_ptr = input_tensor.data(); + std::copy(padded_data.begin(), padded_data.end(), data_ptr); + } else { + std::vector padded_data = + pad_input(input_tensor_ggml, 1, max_token_len, -INFINITY); + input_tensor = + ov::Tensor(ov::element::f32, ov::Shape{1, 1, max_token_len}); + auto *data_ptr = input_tensor.data(); + std::copy(padded_data.begin(), padded_data.end(), data_ptr); + } + + } else { + input_tensor = convert_ggml_input_to_ov(ggml_decoder, param_name); + } + } + return input_tensor; +} + size_t checksum(const void* data, size_t size) { const uint8_t* bytes = static_cast(data); size_t sum = 0; @@ -268,22 +294,27 @@ size_t checksum(const void* data, size_t size) { return sum; } +// Suppress deprecation warning for ov::Tensor::data() +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" + void print_input_tensor_info(const std::string& name, const ov::Tensor& tensor) { std::cout << "Input name: " << name << ", Input shape: " << tensor.get_shape() << ", Address: " << tensor.data() << std::endl; switch (tensor.get_element_type()) { case ov::element::f32: - std::cout << *(float*)(tensor.data()) << std::endl; - break; + std::cout << *(tensor.data()) << std::endl; + break; case ov::element::f16: - std::cout << ov::float16::from_bits(*(uint16_t*)(tensor.data())) << std::endl; - break; + std::cout << ov::float16::from_bits(*(tensor.data())) + << std::endl; + break; case ov::element::i32: - std::cout << *(int32_t*)(tensor.data()) << std::endl; - break; + std::cout << *(tensor.data()) << std::endl; + break; case ov::element::i64: - std::cout << *(int64_t*)(tensor.data()) << std::endl; - break; + std::cout << *(tensor.data()) << std::endl; + break; default: break; } @@ -296,18 +327,21 @@ void print_output_tensor_info(const std::string& name, << ", Address: " << output_dst[name] << std::endl; switch 
(tensor.get_element_type()) { case ov::element::f32: - std::cout << *(float*)(tensor.data()) << std::endl; - std::cout << checksum(tensor.data(), tensor.get_byte_size()) << std::endl; - break; + std::cout << *(tensor.data()) << std::endl; + std::cout << checksum(tensor.data(), tensor.get_byte_size()) << std::endl; + break; case ov::element::f16: - std::cout << ov::float16::from_bits(*(uint16_t*)(tensor.data())) << std::endl; - std::cout << checksum(tensor.data(), tensor.get_byte_size()) << std::endl; - break; + std::cout << ov::float16::from_bits(*(tensor.data())) + << std::endl; + std::cout << checksum(tensor.data(), tensor.get_byte_size()) << std::endl; + break; default: break; } } +#pragma GCC diagnostic pop + void set_zero_diagonal(std::vector& matrix, size_t dim) { for (size_t i = 0; i < dim; ++i) { matrix[i * dim + i] = 0.0f; diff --git a/ggml/src/ggml-openvino/utils.h b/ggml/src/ggml-openvino/utils.h index 2427b0b1ce..1d23e28522 100644 --- a/ggml/src/ggml-openvino/utils.h +++ b/ggml/src/ggml-openvino/utils.h @@ -8,7 +8,7 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c std::shared_ptr get_ggml_decoder(struct ggml_cgraph* cgraph, bool is_static, bool is_first_token); -ov::Tensor get_ggml_graph_input_tensor(std::shared_ptr ggml_decoder, std::string& name); +ov::Tensor convert_ggml_input_to_ov(std::shared_ptr ggml_decoder, const std::string& name); std::map get_ggml_graph_output_dst(std::shared_ptr ggml_decoder); @@ -38,3 +38,7 @@ std::vector pad_input(const ggml_tensor* tensor, size_t padded_rows, size_t p void set_zero_diagonal(std::vector& matrix, size_t dim); bool is_prefill(struct ggml_cgraph * cgraph); + +ov::AnyMap get_npu_config(); + +ov::Tensor get_ov_input_tensor(std::shared_ptr ggml_decoder, const std::string& param_name); diff --git a/setup.sh b/setup.sh deleted file mode 100755 index 697639dd14..0000000000 --- a/setup.sh +++ /dev/null @@ -1,2 +0,0 @@ -cmake --build build --parallel $(nproc) - From 8afee795ad8d46c9f491859937584ce87ed04120 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Mon, 23 Jun 2025 11:56:36 +0800 Subject: [PATCH 082/254] Update clang-format --- ggml/src/ggml-openvino/.clang-format | 157 +++++++++++++++ ggml/src/ggml-openvino/openvino/op_table.cpp | 40 ++-- ggml/src/ggml-openvino/utils.cpp | 189 +++++++++---------- 3 files changed, 265 insertions(+), 121 deletions(-) diff --git a/ggml/src/ggml-openvino/.clang-format b/ggml/src/ggml-openvino/.clang-format index 8491f4e5c6..9382a117b8 100644 --- a/ggml/src/ggml-openvino/.clang-format +++ b/ggml/src/ggml-openvino/.clang-format @@ -1,4 +1,161 @@ --- +# Override root .clang-format AlignConsecutiveAssignments: false AlignConsecutiveDeclarations: false ReferenceAlignment: Left +PointerAlignment: Left + +Language: Cpp +AlignAfterOpenBracket: Align +AlignArrayOfStructures: Left +AlignConsecutiveBitFields: AcrossComments +AlignConsecutiveMacros: AcrossComments +# AlignConsecutiveShortCaseStatements: AcrossComments +AlignEscapedNewlines: Left # LeftWithLastLine +AlignOperands: Align +AlignTrailingComments: + Kind: Always + OverEmptyLines: 1 +AllowAllArgumentsOnNextLine: true +AllowAllParametersOfDeclarationOnNextLine: false +# AllowBreakBeforeNoexceptSpecifier: OnlyWithParen +AllowShortBlocksOnASingleLine: Never +AllowShortCaseLabelsOnASingleLine: false +AllowShortFunctionsOnASingleLine: Inline +AllowShortIfStatementsOnASingleLine: Never +AllowShortLambdasOnASingleLine: Inline +AllowShortLoopsOnASingleLine: false +AlwaysBreakBeforeMultilineStrings: true +BinPackArguments: true 
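+# With BinPackArguments enabled, as many call arguments as fit within ColumnLimit share a line before wrapping, e.g. a sketch of the resulting style (not code from this repo): +#   do_work(alpha, beta, gamma, +#           delta); +# instead of one argument per line.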
+BinPackParameters: true # OnePerLine +BitFieldColonSpacing: Both +BreakBeforeBraces: Custom # Attach +BraceWrapping: + AfterCaseLabel: true + AfterClass: false + AfterControlStatement: false + AfterEnum: false + AfterFunction: false + AfterNamespace: false + AfterObjCDeclaration: false + AfterStruct: false + AfterUnion: false + AfterExternBlock: false + BeforeCatch: false + BeforeElse: false + BeforeLambdaBody: false + BeforeWhile: false + IndentBraces: false + SplitEmptyFunction: false + SplitEmptyRecord: false + SplitEmptyNamespace: false +# BreakAdjacentStringLiterals: true +BreakAfterAttributes: Never +BreakBeforeBinaryOperators: None +BreakBeforeInlineASMColon: OnlyMultiline +BreakBeforeTernaryOperators: false +# BreakBinaryOperations: Never +BreakConstructorInitializers: AfterColon +# BreakFunctionDefinitionParameters: false +BreakInheritanceList: AfterComma +BreakStringLiterals: true +# BreakTemplateDeclarations: Yes +ColumnLimit: 120 +CommentPragmas: '^ IWYU pragma:' +CompactNamespaces: false +ConstructorInitializerIndentWidth: 4 +ContinuationIndentWidth: 4 +Cpp11BracedListStyle: false +DerivePointerAlignment: false +DisableFormat: false +EmptyLineBeforeAccessModifier: Leave +EmptyLineAfterAccessModifier: Never +ExperimentalAutoDetectBinPacking: false +FixNamespaceComments: true +IncludeBlocks: Regroup +IncludeCategories: + - Regex: '^<.*\.h>' + Priority: 1 + SortPriority: 0 + - Regex: '^<.*' + Priority: 2 + SortPriority: 0 + - Regex: '.*' + Priority: 3 + SortPriority: 0 +IncludeIsMainRegex: '([-_](test|unittest))?$' +IncludeIsMainSourceRegex: '' +IndentAccessModifiers: false +IndentCaseBlocks: true +IndentCaseLabels: true +IndentExternBlock: NoIndent +IndentGotoLabels: false +IndentPPDirectives: AfterHash +IndentWidth: 4 +IndentWrappedFunctionNames: false +InsertBraces: true # NOTE: may lead to incorrect formatting +InsertNewlineAtEOF: true +JavaScriptQuotes: Leave +JavaScriptWrapImports: true +KeepEmptyLinesAtTheStartOfBlocks: false +LambdaBodyIndentation: Signature +LineEnding: LF +MacroBlockBegin: '' +MacroBlockEnd: '' +MaxEmptyLinesToKeep: 1 +NamespaceIndentation: None +ObjCBinPackProtocolList: Auto +ObjCBlockIndentWidth: 4 +ObjCSpaceAfterProperty: true +ObjCSpaceBeforeProtocolList: true +PPIndentWidth: -1 +PackConstructorInitializers: CurrentLine +PenaltyBreakAssignment: 2 +PenaltyBreakBeforeFirstCallParameter: 1 +PenaltyBreakComment: 300 +PenaltyBreakFirstLessLess: 120 +PenaltyBreakString: 1000 +PenaltyBreakTemplateDeclaration: 10 +PenaltyExcessCharacter: 1000000 +PenaltyReturnTypeOnItsOwnLine: 200 +QualifierAlignment: Left +#QualifierOrder: ['static', 'inline', 'friend', 'constexpr', 'const', 'volatile', 'type', 'restrict'] +RawStringFormats: + - Language: Cpp + Delimiters: + - cc + - CC + - cpp + - Cpp + - CPP + - 'c++' + - 'C++' + CanonicalDelimiter: '' +ReflowComments: false # IndentOnly +SeparateDefinitionBlocks: Always +SortIncludes: CaseInsensitive +SortUsingDeclarations: LexicographicNumeric +SpaceAfterCStyleCast: true +SpaceAfterLogicalNot: false +SpaceAfterTemplateKeyword: true +SpaceBeforeAssignmentOperators: true +SpaceBeforeCpp11BracedList: false +SpaceBeforeCtorInitializerColon: true +SpaceBeforeInheritanceColon: true +SpaceBeforeParens: ControlStatements +SpaceBeforeRangeBasedForLoopColon: true +SpaceInEmptyBlock: false +SpaceInEmptyParentheses: false +SpacesBeforeTrailingComments: 2 +SpacesInAngles: Never +SpacesInLineCommentPrefix: + Minimum: 1 + Maximum: -1 +SpacesInParentheses: false +SpacesInSquareBrackets: false +SpaceBeforeSquareBrackets: false 
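+# To re-format this subtree by hand, a sketch (clang-format resolves the nearest .clang-format up the directory tree, so this file overrides the repository root config here): +#   find ggml/src/ggml-openvino \( -name '*.cpp' -o -name '*.hpp' -o -name '*.h' \) | xargs clang-format -i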
+Standard: c++17 +TabWidth: 4 +UseTab: Never +WhitespaceSensitiveMacros: ['STRINGIZE'] +... diff --git a/ggml/src/ggml-openvino/openvino/op_table.cpp b/ggml/src/ggml-openvino/openvino/op_table.cpp index 11d1c773c3..bf7d54d9a1 100644 --- a/ggml/src/ggml-openvino/openvino/op_table.cpp +++ b/ggml/src/ggml-openvino/openvino/op_table.cpp @@ -14,25 +14,27 @@ namespace frontend { namespace ggml { std::unordered_map get_supported_ops() { - using namespace ov::op; - return {{"GGML_OP_ADD", op::translate_1to1_match_2_inputs}, - {"GGML_OP_ADD1", op::translate_1to1_match_2_inputs}, - {"GGML_OP_CONT", op::translate_cont}, - {"GGML_OP_CPY", op::translate_cpy}, - {"GGML_OP_DIV", op::translate_1to1_match_2_inputs}, - {"GGML_OP_GET_ROWS", op::translate_get_rows}, - {"GGML_OP_MUL", op::translate_1to1_match_2_inputs}, - {"GGML_OP_MUL_MAT", op::translate_mulmat}, - {"GGML_OP_PERMUTE", op::translate_permute}, - {"GGML_OP_RESHAPE", op::translate_reshape}, - {"GGML_OP_RMS_NORM", op::translate_rms_norm}, - {"GGML_OP_ROPE", op::translate_rope}, - {"GGML_OP_SCALE", op::translate_scale}, - {"GGML_OP_SOFT_MAX", op::translate_soft_max}, - {"GGML_OP_SUB", op::translate_1to1_match_2_inputs}, - {"GGML_OP_TRANSPOSE", op::translate_transpose}, - {"GGML_UNARY_OP_SILU", op::translate_unary_silu}, - {"GGML_OP_VIEW", op::translate_view}}; + using namespace ov::op; + return { + { "GGML_OP_ADD", op::translate_1to1_match_2_inputs }, + { "GGML_OP_ADD1", op::translate_1to1_match_2_inputs }, + { "GGML_OP_CONT", op::translate_cont }, + { "GGML_OP_CPY", op::translate_cpy }, + { "GGML_OP_DIV", op::translate_1to1_match_2_inputs }, + { "GGML_OP_GET_ROWS", op::translate_get_rows }, + { "GGML_OP_MUL", op::translate_1to1_match_2_inputs }, + { "GGML_OP_MUL_MAT", op::translate_mulmat }, + { "GGML_OP_PERMUTE", op::translate_permute }, + { "GGML_OP_RESHAPE", op::translate_reshape }, + { "GGML_OP_RMS_NORM", op::translate_rms_norm }, + { "GGML_OP_ROPE", op::translate_rope }, + { "GGML_OP_SCALE", op::translate_scale }, + { "GGML_OP_SOFT_MAX", op::translate_soft_max }, + { "GGML_OP_SUB", op::translate_1to1_match_2_inputs }, + { "GGML_OP_TRANSPOSE", op::translate_transpose }, + { "GGML_UNARY_OP_SILU", op::translate_unary_silu }, + { "GGML_OP_VIEW", op::translate_view } + }; } } // namespace ggml diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index ebcf8fdd75..d20e671064 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -27,15 +27,13 @@ std::shared_ptr get_ggml_decoder(struct ggml_cgraph* cgraph, bool return std::make_shared(nullptr, cgraph, is_static, is_first_token); } -ov::Tensor convert_ggml_input_to_ov(std::shared_ptr ggml_decoder, - const std::string& name) { - auto *input_data = ggml_decoder->get_input_ggml_tensor(name)->data; - ov::Tensor input_tensor; - ov::Shape input_shape = ggml_decoder->get_input_shape(name).to_shape(); - std::vector input_stride = ggml_decoder->get_input_stride(name); - input_tensor = - ov::Tensor(ggml_decoder->get_input_type(name), input_shape, input_data); - return input_tensor; +ov::Tensor convert_ggml_input_to_ov(std::shared_ptr ggml_decoder, const std::string& name) { + auto* input_data = ggml_decoder->get_input_ggml_tensor(name)->data; + ov::Tensor input_tensor; + ov::Shape input_shape = ggml_decoder->get_input_shape(name).to_shape(); + std::vector input_stride = ggml_decoder->get_input_stride(name); + input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), input_shape, input_data); + return input_tensor; } std::map 
get_ggml_graph_output_dst(std::shared_ptr ggml_decoder) { @@ -61,21 +59,20 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c static std::string device = getenv("GGML_OPENVINO_DEVICE") ? getenv("GGML_OPENVINO_DEVICE") : ""; if (device.empty()) { - const std::vector preferred_device = {"GPU", "CPU", "NPU"}; - const auto available_devices = core.get_available_devices(); - for (const auto& dev : preferred_device) { - if (std::find(available_devices.begin(), available_devices.end(), - dev) != available_devices.end()) { - device = dev; - break; + const std::vector preferred_device = { "GPU", "CPU", "NPU" }; + const auto available_devices = core.get_available_devices(); + for (const auto& dev : preferred_device) { + if (std::find(available_devices.begin(), available_devices.end(), dev) != available_devices.end()) { + device = dev; + break; + } } - } } bool is_static = device == "NPU" ? true : false; ov::AnyMap config; if (device == "NPU") { - config = get_npu_config(); + config = get_npu_config(); } auto start_time = ggml_time_us(); @@ -107,10 +104,10 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c if (is_static) { if (is_first_token) { - model = compiled_cache_prefill[cgraph].first; + model = compiled_cache_prefill[cgraph].first; compiled_model = compiled_cache_prefill[cgraph].second; } else { - model = compiled_cache_kvcache[cgraph].first; + model = compiled_cache_kvcache[cgraph].first; compiled_model = compiled_cache_kvcache[cgraph].second; } } else { @@ -141,7 +138,7 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c if (getenv("GGML_OPENVINO_DUMP_IR")) { char timestamped_filename[64]; - auto timestamp = (long long)ggml_time_us(); + auto timestamp = (long long) ggml_time_us(); snprintf(timestamped_filename, sizeof(timestamped_filename), "model_prefill_%lld.xml", timestamp); ov::serialize(model, timestamped_filename); snprintf(timestamped_filename, sizeof(timestamped_filename), "model_kvcache_%lld.xml", timestamp); @@ -161,7 +158,7 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c if (getenv("GGML_OPENVINO_DUMP_IR")) { char timestamped_filename[64]; - auto timestamp = (long long)ggml_time_us(); + auto timestamp = (long long) ggml_time_us(); snprintf(timestamped_filename, sizeof(timestamped_filename), "model_%lld.xml", timestamp); ov::serialize(model, timestamped_filename); } @@ -227,68 +224,59 @@ ov::AnyMap get_npu_config() { return config; } -ov::Tensor get_ov_input_tensor(std::shared_ptr ggml_decoder, - const std::string& param_name) { - bool is_static = ggml_decoder->is_static(); - bool is_first_token = ggml_decoder->is_first_token(); +ov::Tensor get_ov_input_tensor(std::shared_ptr ggml_decoder, const std::string& param_name) { + bool is_static = ggml_decoder->is_static(); + bool is_first_token = ggml_decoder->is_first_token(); - ov::Tensor input_tensor; - if (ggml_decoder->get_model_extra_inputs().find(param_name) != - ggml_decoder->get_model_extra_inputs().end()) { - input_tensor = *ggml_decoder->get_model_extra_input_values().at(param_name); + ov::Tensor input_tensor; + if (ggml_decoder->get_model_extra_inputs().find(param_name) != ggml_decoder->get_model_extra_inputs().end()) { + input_tensor = *ggml_decoder->get_model_extra_input_values().at(param_name); - } else if (!is_static) { - input_tensor = convert_ggml_input_to_ov(ggml_decoder, param_name); - - } else { - if (param_name == "inp_tokens" || param_name == "inp_pos") { - if (is_first_token) { - 
size_t max_token_len = ggml_decoder->get_max_token_len(); - const auto *input_tensor_ggml = - ggml_decoder->get_input_ggml_tensor(param_name); - std::vector padded_data = - pad_input(input_tensor_ggml, 1, max_token_len, 0); - input_tensor = - ov::Tensor(ov::element::i32, ov::Shape{1, 1, max_token_len}); - auto *data_ptr = input_tensor.data(); - std::copy(padded_data.begin(), padded_data.end(), data_ptr); - } else { + } else if (!is_static) { input_tensor = convert_ggml_input_to_ov(ggml_decoder, param_name); - } - - } else if (param_name == "KQ_mask") { - size_t max_token_len = ggml_decoder->get_max_token_len(); - const auto *input_tensor_ggml = - ggml_decoder->get_input_ggml_tensor(param_name); - if (is_first_token) { - std::vector padded_data = pad_input( - input_tensor_ggml, max_token_len, max_token_len, -INFINITY); - set_zero_diagonal(padded_data, max_token_len); - input_tensor = ov::Tensor(ov::element::f32, - ov::Shape{1, max_token_len, max_token_len}); - auto *data_ptr = input_tensor.data(); - std::copy(padded_data.begin(), padded_data.end(), data_ptr); - } else { - std::vector padded_data = - pad_input(input_tensor_ggml, 1, max_token_len, -INFINITY); - input_tensor = - ov::Tensor(ov::element::f32, ov::Shape{1, 1, max_token_len}); - auto *data_ptr = input_tensor.data(); - std::copy(padded_data.begin(), padded_data.end(), data_ptr); - } } else { - input_tensor = convert_ggml_input_to_ov(ggml_decoder, param_name); + if (param_name == "inp_tokens" || param_name == "inp_pos") { + if (is_first_token) { + size_t max_token_len = ggml_decoder->get_max_token_len(); + const auto* input_tensor_ggml = ggml_decoder->get_input_ggml_tensor(param_name); + std::vector padded_data = pad_input(input_tensor_ggml, 1, max_token_len, 0); + input_tensor = ov::Tensor(ov::element::i32, ov::Shape{ 1, 1, max_token_len }); + auto* data_ptr = input_tensor.data(); + std::copy(padded_data.begin(), padded_data.end(), data_ptr); + } else { + input_tensor = convert_ggml_input_to_ov(ggml_decoder, param_name); + } + + } else if (param_name == "KQ_mask") { + size_t max_token_len = ggml_decoder->get_max_token_len(); + const auto* input_tensor_ggml = ggml_decoder->get_input_ggml_tensor(param_name); + if (is_first_token) { + std::vector padded_data = + pad_input(input_tensor_ggml, max_token_len, max_token_len, -INFINITY); + set_zero_diagonal(padded_data, max_token_len); + input_tensor = ov::Tensor(ov::element::f32, ov::Shape{ 1, max_token_len, max_token_len }); + auto* data_ptr = input_tensor.data(); + std::copy(padded_data.begin(), padded_data.end(), data_ptr); + } else { + std::vector padded_data = pad_input(input_tensor_ggml, 1, max_token_len, -INFINITY); + input_tensor = ov::Tensor(ov::element::f32, ov::Shape{ 1, 1, max_token_len }); + auto* data_ptr = input_tensor.data(); + std::copy(padded_data.begin(), padded_data.end(), data_ptr); + } + + } else { + input_tensor = convert_ggml_input_to_ov(ggml_decoder, param_name); + } } - } - return input_tensor; + return input_tensor; } size_t checksum(const void* data, size_t size) { const uint8_t* bytes = static_cast(data); size_t sum = 0; for (size_t i = 0; i < size; ++i) { - sum += (uint8_t)i; + sum += (uint8_t) i; sum += bytes[i]; } return sum; @@ -302,41 +290,38 @@ void print_input_tensor_info(const std::string& name, const ov::Tensor& tensor) std::cout << "Input name: " << name << ", Input shape: " << tensor.get_shape() << ", Address: " << tensor.data() << std::endl; switch (tensor.get_element_type()) { - case ov::element::f32: - std::cout << *(tensor.data()) << std::endl; - 
break; - case ov::element::f16: - std::cout << ov::float16::from_bits(*(tensor.data<uint16_t>())) - << std::endl; - break; - case ov::element::i32: - std::cout << *(tensor.data<int32_t>()) << std::endl; - break; - case ov::element::i64: - std::cout << *(tensor.data<int64_t>()) << std::endl; - break; - default: - break; + case ov::element::f32: + std::cout << *(tensor.data<float>()) << std::endl; + break; + case ov::element::f16: + std::cout << ov::float16::from_bits(*(tensor.data<uint16_t>())) << std::endl; + break; + case ov::element::i32: + std::cout << *(tensor.data<int32_t>()) << std::endl; + break; + case ov::element::i64: + std::cout << *(tensor.data<int64_t>()) << std::endl; + break; + default: + break; } } -void print_output_tensor_info(const std::string& name, - const ov::Tensor& tensor, +void print_output_tensor_info(const std::string& name, const ov::Tensor& tensor, std::map<std::string, void*>& output_dst) { std::cout << "Output name: " << name << ", Output shape: " << tensor.get_shape() << ", Address: " << output_dst[name] << std::endl; switch (tensor.get_element_type()) { - case ov::element::f32: - std::cout << *(tensor.data<float>()) << std::endl; - std::cout << checksum(tensor.data(), tensor.get_byte_size()) << std::endl; - break; - case ov::element::f16: - std::cout << ov::float16::from_bits(*(tensor.data<uint16_t>())) - << std::endl; - std::cout << checksum(tensor.data(), tensor.get_byte_size()) << std::endl; - break; - default: - break; + case ov::element::f32: + std::cout << *(tensor.data<float>()) << std::endl; + std::cout << checksum(tensor.data(), tensor.get_byte_size()) << std::endl; + break; + case ov::element::f16: + std::cout << ov::float16::from_bits(*(tensor.data<uint16_t>())) << std::endl; + std::cout << checksum(tensor.data(), tensor.get_byte_size()) << std::endl; + break; + default: + break; } } @@ -348,9 +333,9 @@ void set_zero_diagonal(std::vector<float>& matrix, size_t dim) { } } -bool is_prefill(struct ggml_cgraph * cgraph) { +bool is_prefill(struct ggml_cgraph* cgraph) { for (int i = 0; i < cgraph->n_nodes; ++i) { - auto * op = cgraph->nodes[i]; + auto* op = cgraph->nodes[i]; for (int j = 0; j < GGML_MAX_SRC; ++j) { auto* src = op->src[j]; if (src == nullptr) { From 4c582ac7a313a27639ab1b06d590b4b80b565864 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Thu, 26 Jun 2025 13:54:06 +0800 Subject: [PATCH 083/254] Stateful transformation for CPU and GPU --- ggml/src/ggml-openvino/ggml-decoder.cpp | 104 +++++++++++------- ggml/src/ggml-openvino/ggml-decoder.h | 40 ++++--- ggml/src/ggml-openvino/openvino/decoder.hpp | 6 + ggml/src/ggml-openvino/openvino/op/cpy.cpp | 13 +-- .../openvino/translate_session.cpp | 69 +++++++++--- .../openvino/translate_session.hpp | 2 +- ggml/src/ggml-openvino/utils.cpp | 102 ++++++++++------- 7 files changed, 216 insertions(+), 120 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 04f68a4950..e30f026e36 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -26,12 +26,13 @@ #include "ggml-backend-impl.h" #include "ggml-backend.h" -GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgraph, bool is_static, bool is_first_token) - : m_cgraph(cgraph), - m_node(node), - m_op_name(m_node ? std::string(m_node->name) : "NONE_OP"), - m_is_static(is_static), - m_is_first_token(is_first_token) { +GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* m_cgraph, bool is_static, + bool is_first_token) : + m_cgraph(m_cgraph), + m_node(node), + m_op_name(m_node ? 
std::string(m_node->name) : "NONE_OP"), + m_is_static(is_static), + m_is_first_token(is_first_token) { static std::map> model_weights; if (m_node) { @@ -44,10 +45,11 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgrap } if (getenv("GGML_OPENVINO_DUMP_CGRAPH")) { - dump_cgraph(m_cgraph); + std::string filename = "cgraph.txt"; + dump_cgraph(m_cgraph, filename); } - set_max_token_len(); + set_llm_params(); static bool weight_created = false; if (!weight_created) { @@ -105,33 +107,7 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node) { if (m_model_inputs.find(src_name) != m_model_inputs.end()) { continue; } - ov::PartialShape input_shape; - if (std::string(src->name) == "inp_tokens" || std::string(src->name) == "inp_pos") { - if (m_is_static) { - if (m_is_first_token) { - input_shape = ov::PartialShape{1, 1, m_max_token_len}; - } else { - input_shape = ov::PartialShape{1, 1, 1}; - } - } else { - input_shape = ov::PartialShape{1, 1, ov::Dimension(1, m_max_token_len)}; - } - } else if (std::string(src->name) == "KQ_mask") { - if (m_is_static) { - if (m_is_first_token) { - input_shape = ov::PartialShape{1, m_max_token_len, m_max_token_len}; - } else { - input_shape = ov::PartialShape{1, 1, m_max_token_len}; - } - } else { - auto max_mask_size = GGML_PAD(m_max_token_len, GGML_KQ_MASK_PAD); - input_shape = - ov::PartialShape{1, ov::Dimension(1, max_mask_size), ov::Dimension(1, max_mask_size)}; - } - } else { - input_shape = ov::Shape{get_shape(src)}; - } - auto param_node = std::make_shared(get_ov_type(src), input_shape); + auto param_node = std::make_shared(get_ov_type(src), get_graph_input_shape(src)); param_node->set_friendly_name(src_name); m_model_inputs[src_name] = param_node; } @@ -150,6 +126,7 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node) { auto it = std::find(m_model_output_names.begin(), m_model_output_names.end(), name); if (it == m_model_output_names.end()) { m_model_output_names.push_back(name); + m_kv_names.push_back(name); } } } @@ -213,17 +190,54 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node) { } } -void GgmlOvDecoder::set_max_token_len() { +void GgmlOvDecoder::set_llm_params() { for (int i = 0; i < m_cgraph->n_nodes; i++) { auto* node = m_cgraph->nodes[i]; - if (std::string(node->name) == "cache_k_l0 (view)") { + if (node->op == GGML_OP_VIEW && std::string(node->name) == "cache_k_l0 (view)") { auto* cache_k = node->src[0]; m_max_token_len = cache_k->ne[1]; - break; + } else if (node->op == GGML_OP_ROPE && std::string(node->name) == "Qcur-0") { + m_head_size = node->ne[0]; + m_num_heads = node->ne[1]; + } else if (node->op == GGML_OP_ROPE && std::string(node->name) == "Kcur-0") { + m_num_heads_kv = node->ne[1]; } } } +ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor* src) const { + ov::PartialShape input_shape; + if (std::string(src->name) == "inp_tokens" || std::string(src->name) == "inp_pos") { + if (m_is_static) { + if (m_is_first_token) { + input_shape = ov::PartialShape{ 1, 1, m_max_token_len }; + } else { + input_shape = ov::PartialShape{ 1, 1, 1 }; + } + } else { + input_shape = ov::PartialShape{ 1, 1, ov::Dimension(1, m_max_token_len) }; + } + } else if (std::string(src->name) == "KQ_mask") { + if (m_is_static) { + if (m_is_first_token) { + input_shape = ov::PartialShape{ 1, m_max_token_len, m_max_token_len }; + } else { + input_shape = ov::PartialShape{ 1, 1, m_max_token_len }; + } + } else { + auto max_mask_size = GGML_PAD(m_max_token_len, GGML_KQ_MASK_PAD); + input_shape = 
ov::PartialShape{ 1, ov::Dimension(1, max_mask_size), ov::Dimension(1, max_mask_size) }; + } + } else if (std::string(src->name).find("cache_k") == 0) { + input_shape = ov::PartialShape{ m_max_token_len, m_num_heads_kv, m_head_size }; + } else if (std::string(src->name).find("cache_v") == 0) { + input_shape = ov::PartialShape{ m_num_heads_kv, m_head_size, m_max_token_len }; + } else { + input_shape = ov::PartialShape{ get_shape(src) }; + } + return input_shape; +} + void GgmlOvDecoder::add_extra_inputs() { int64_t past_token_len = -1; // attention_size not used for NPU @@ -267,6 +281,16 @@ void GgmlOvDecoder::add_extra_inputs() { } } +std::map GgmlOvDecoder::get_kv_param_res_names() const { + std::map kv_param_res_names; + for (const auto& name : m_kv_names) { + if (name.find("cache_k") == 0 || name.find("cache_v") == 0) { + kv_param_res_names[name] = name; + } + } + return kv_param_res_names; +} + void GgmlOvDecoder::add_weight_const_parallel(std::map>& model_weights) { static std::mutex weights_mutex; auto* nodes = m_cgraph->nodes; @@ -344,8 +368,8 @@ std::shared_ptr GgmlOvDecoder::create_weight_node(ggml_tensor* tensor) return weight_node; } -void GgmlOvDecoder::dump_cgraph(const struct ggml_cgraph* cgraph) { - std::ofstream file("cgraph.txt"); +void GgmlOvDecoder::dump_cgraph(const struct ggml_cgraph* cgraph, std::string& filename) { + std::ofstream file(filename); if (!file.is_open()) { std::cerr << "Failed to open file" << std::endl; return; diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index b6b13d1f11..6d3f24b093 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -3,6 +3,7 @@ #include #include #include +#include #include #include "ggml.h" @@ -89,28 +90,34 @@ public: return m_model_output_names; } - virtual bool is_static() const override { - return m_is_static; - } - virtual bool is_first_token() const override { - return m_is_first_token; - } - virtual int get_max_token_len() const override { - return m_max_token_len; - } + virtual int get_max_token_len() const override { return m_max_token_len; } + + virtual int get_num_heads() const override { return m_num_heads; } + + virtual int get_num_heads_kv() const override { return m_num_heads_kv; } + + virtual int get_head_size() const override { return m_head_size; } + + virtual std::map get_kv_param_res_names() const override; + + virtual bool is_static() const override { return m_is_static; } + + virtual bool is_first_token() const override { return m_is_first_token; } + + ov::PartialShape get_graph_input_shape(const ggml_tensor* src) const; private: void set_input_output(ggml_tensor* node); void add_extra_inputs(); - static void dump_cgraph(const struct ggml_cgraph* cgraph); + static void dump_cgraph(const struct ggml_cgraph* cgraph, std::string& filename); static std::vector get_shape(const ggml_tensor* tensor); static std::vector get_stride(const ggml_tensor* tensor); static ov::element::Type get_ov_type(const ggml_tensor* tensor); + + // set max_token_len, num_heads, etc + void set_llm_params(); + static std::shared_ptr create_weight_node(ggml_tensor* tensor); - - void set_max_token_len(); - int m_max_token_len; - void add_weight_const_parallel(std::map>& model_weights); struct ggml_cgraph* m_cgraph; @@ -129,6 +136,11 @@ private: std::map> m_model_extra_input_values; std::map> m_model_weights; std::vector m_model_output_names; + int m_max_token_len; + int m_num_heads; + int m_num_heads_kv; + int m_head_size; + std::vector m_kv_names; bool 
m_is_static; bool m_is_first_token; }; diff --git a/ggml/src/ggml-openvino/openvino/decoder.hpp b/ggml/src/ggml-openvino/openvino/decoder.hpp index 6212568399..3105d0f16f 100644 --- a/ggml/src/ggml-openvino/openvino/decoder.hpp +++ b/ggml/src/ggml-openvino/openvino/decoder.hpp @@ -4,6 +4,7 @@ #include #include #include +#include namespace ov { namespace frontend { @@ -57,6 +58,11 @@ public: virtual const std::map>& get_model_weights() const = 0; virtual const std::vector& get_model_output_names() const = 0; + virtual int get_num_heads() const = 0; + virtual int get_num_heads_kv() const = 0; + virtual int get_head_size() const = 0; + virtual std::map get_kv_param_res_names() const = 0; + virtual bool is_static() const = 0; virtual bool is_first_token() const = 0; virtual int get_max_token_len() const = 0; diff --git a/ggml/src/ggml-openvino/openvino/op/cpy.cpp b/ggml/src/ggml-openvino/openvino/op/cpy.cpp index d27f4babb4..b183b97f23 100644 --- a/ggml/src/ggml-openvino/openvino/op/cpy.cpp +++ b/ggml/src/ggml-openvino/openvino/op/cpy.cpp @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -57,13 +58,6 @@ OutputVector translate_cpy(const NodeContext& context) { if (op_case == 1) { // Write K to cache_k - int64_t head_size = src0_shape[2]; - int64_t num_heads = src0_shape[1]; - - auto reshaped_src1_shape = - ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector{-1, num_heads, head_size}); - auto reshaped_src1 = std::make_shared(src1, reshaped_src1_shape, false); - auto token_len = get_dimensions(src0.get_node_shared_ptr(), {0}); auto token_len_scalar = std::make_shared(token_len, zero); @@ -80,7 +74,8 @@ OutputVector translate_cpy(const NodeContext& context) { } indices = std::make_shared(indices, one); - res = std::make_shared(reshaped_src1, indices, src0); + auto updated = std::make_shared(src1, indices, src0); + res = std::make_shared(updated, std::make_shared(src1), false); } else { // Write V to cache_v auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); @@ -140,7 +135,7 @@ OutputVector translate_cpy(const NodeContext& context) { false); auto updated = std::make_shared(reshaped_src1, indices_final, flattend_src0); - res = std::make_shared(updated, zero); + res = std::make_shared(updated, std::make_shared(src1), false); } return rename_outputs_with_suffix({res}, context.get_name()); diff --git a/ggml/src/ggml-openvino/openvino/translate_session.cpp b/ggml/src/ggml-openvino/openvino/translate_session.cpp index 8eda23c1c5..3bf0403a64 100644 --- a/ggml/src/ggml-openvino/openvino/translate_session.cpp +++ b/ggml/src/ggml-openvino/openvino/translate_session.cpp @@ -1,7 +1,12 @@ #include "translate_session.hpp" #include +#include +#include +#include +#include #include +#include #include "input_model.hpp" @@ -11,6 +16,41 @@ namespace ggml { using namespace ov::op; +namespace { +ov::pass::MakeStateful::ParamResPairs get_kv_param_res_pairs( + const std::shared_ptr& model, const std::map& kv_param_res_names) { + ov::pass::MakeStateful::ParamResPairs pairs; + const auto& params = model->get_parameters(); + const auto& results = model->get_results(); + + for (const auto& param_res : kv_param_res_names) { + const auto& param_name = param_res.first; + const auto& res_name = param_res.second; + + auto param_it = std::find_if(params.begin(), params.end(), [&](const std::shared_ptr& node) { + return node->get_friendly_name() == param_name; + }); + + OPENVINO_ASSERT(param_it != params.end(), "The tensor name ", param_name, + " is not associated with 
any of " + "Parameters in the network."); + + auto res_it = std::find_if(results.begin(), results.end(), [&](const std::shared_ptr& node) { + return node->get_friendly_name() == res_name; + }); + + OPENVINO_ASSERT(res_it != results.end(), "The tensor name ", res_name, + " is not associated with any of " + "Results in the network."); + + std::shared_ptr param = *param_it; + std::shared_ptr res = *res_it; + pairs.emplace_back(param, res); + } + return pairs; +} +} // namespace + TranslateSession::TranslateSession(const frontend::InputModel::Ptr& input_model, const std::unordered_map& translator_map) : m_input_model(input_model), @@ -88,25 +128,26 @@ std::shared_ptr TranslateSession::translate_graph(const frontend::InputMo results.push_back(result); } - ov::ParameterVector used_params; - for (const auto& param : params) { - if (!param->output(0).get_target_inputs().empty()) { - used_params.push_back(param); - } - } - if (getenv("GGML_OPENVINO_PROFILING")) { - if (auto diff = params.size() - used_params.size()) { - std::cout << diff << " parameters are not used in the model." << std::endl; - } - } - resulting_model = std::make_shared(results, used_params); + resulting_model = std::make_shared(results, params); + + apply_transformations(resulting_model); + return resulting_model; +} + +void TranslateSession::apply_transformations(const std::shared_ptr& model) { + auto ggml_model_decoder = std::dynamic_pointer_cast(m_input_model)->get_model_decoder(); ov::pass::Manager manager; manager.set_per_pass_validation(true); manager.register_pass(); - manager.run_passes(resulting_model); - return resulting_model; + if (!ggml_model_decoder->is_static()) { + const auto kv_param_res_names = ggml_model_decoder->get_kv_param_res_names(); + const auto kv_param_res_pairs = get_kv_param_res_pairs(model, kv_param_res_names); + manager.register_pass(kv_param_res_pairs); + } + + manager.run_passes(model); } } // namespace ggml diff --git a/ggml/src/ggml-openvino/openvino/translate_session.hpp b/ggml/src/ggml-openvino/openvino/translate_session.hpp index 5c7a9d464d..9167b55fe5 100644 --- a/ggml/src/ggml-openvino/openvino/translate_session.hpp +++ b/ggml/src/ggml-openvino/openvino/translate_session.hpp @@ -16,7 +16,7 @@ public: std::shared_ptr translate_graph(const frontend::InputModel::Ptr& input_model); private: - void print_model_topology(); + void apply_transformations(const std::shared_ptr& model); const frontend::InputModel::Ptr m_input_model; const std::unordered_map& m_translator_map; std::shared_ptr m_ov_model; diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index d20e671064..2620fa5615 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -9,10 +9,13 @@ #include #include #include +#include #include #include +#include #include #include +#include #include #include #include @@ -28,11 +31,15 @@ std::shared_ptr get_ggml_decoder(struct ggml_cgraph* cgraph, bool } ov::Tensor convert_ggml_input_to_ov(std::shared_ptr ggml_decoder, const std::string& name) { - auto* input_data = ggml_decoder->get_input_ggml_tensor(name)->data; - ov::Tensor input_tensor; - ov::Shape input_shape = ggml_decoder->get_input_shape(name).to_shape(); - std::vector input_stride = ggml_decoder->get_input_stride(name); - input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), input_shape, input_data); + const auto* ggml_tensor = ggml_decoder->get_input_ggml_tensor(name); + auto* input_data = ggml_tensor->data; + ov::Shape input_shape; + if (name.find("cache_k") == 0 || 
name.find("cache_v") == 0) { + input_shape = ggml_decoder->get_graph_input_shape(ggml_tensor).to_shape(); + } else { + input_shape = ggml_decoder->get_input_shape(name).to_shape(); + } + auto input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), input_shape, input_data); return input_tensor; } @@ -82,41 +89,37 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c core.set_property(ov::cache_dir(cache_dir)); } - // CPU and GPU will only use cache_prefill - using CachedItem = std::pair, ov::CompiledModel>; - static std::unordered_map compiled_cache_prefill; - static std::unordered_map compiled_cache_kvcache; + static std::unordered_map> infer_request_cache; + static std::unordered_map> ov_input_names_cache; + static std::unordered_map> ov_output_names_cache; + // For NPU, store the kvcache model, since we cannot create two infer_request + static std::unordered_map compiled_model_cache; std::shared_ptr ggml_decoder; - std::shared_ptr model; - ov::CompiledModel compiled_model; + ov::InferRequest infer_request; int64_t decoder_end_time; int64_t conversion_end_time; int64_t compile_end_time; - bool is_first_token = is_prefill(cgraph); - - auto it = compiled_cache_prefill.find(cgraph); - if (it != compiled_cache_prefill.end()) { + auto it = infer_request_cache.find(cgraph); + if (it != infer_request_cache.end()) { ggml_decoder = get_ggml_decoder(cgraph, is_static, false); decoder_end_time = ggml_time_us(); - if (is_static) { - if (is_first_token) { - model = compiled_cache_prefill[cgraph].first; - compiled_model = compiled_cache_prefill[cgraph].second; - } else { - model = compiled_cache_kvcache[cgraph].first; - compiled_model = compiled_cache_kvcache[cgraph].second; - } - } else { - model = it->second.first; - compiled_model = it->second.second; + // For NPU for the first time we call kvcache modle, pop the compiled kvcache model from cache + if (is_static && compiled_model_cache.find(cgraph) != compiled_model_cache.end()) { + infer_request_cache[cgraph] = + std::make_shared(compiled_model_cache[cgraph].create_infer_request()); + compiled_model_cache.erase(cgraph); } + infer_request = *infer_request_cache[cgraph]; + conversion_end_time = ggml_time_us(); compile_end_time = conversion_end_time; } else { + std::shared_ptr model; + if (is_static) { ggml_decoder = get_ggml_decoder(cgraph, is_static, true); auto ggml_decoder_kvcache = get_ggml_decoder(cgraph, is_static, false); @@ -129,12 +132,14 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c auto model_kvcache = ov::frontend::ggml::FrontEnd::convert(input_model_kvcache); conversion_end_time = ggml_time_us(); - compiled_model = core.compile_model(model, device, config); + auto compiled_model = core.compile_model(model, device, config); auto compiled_model_kvcache = core.compile_model(model_kvcache, device, config); + compiled_model_cache[cgraph] = compiled_model_kvcache; compile_end_time = ggml_time_us(); - compiled_cache_prefill[cgraph] = std::make_pair(model, compiled_model); - compiled_cache_kvcache[cgraph] = std::make_pair(model_kvcache, compiled_model_kvcache); + infer_request_cache[cgraph] = std::make_shared(compiled_model.create_infer_request()); + infer_request = *infer_request_cache[cgraph]; + compiled_model_cache[cgraph] = compiled_model_kvcache; if (getenv("GGML_OPENVINO_DUMP_IR")) { char timestamped_filename[64]; @@ -152,9 +157,10 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c model = 
ov::frontend::ggml::FrontEnd::convert(input_model); conversion_end_time = ggml_time_us(); - compiled_model = core.compile_model(model, device, config); + auto compiled_model = core.compile_model(model, device, config); compile_end_time = ggml_time_us(); - compiled_cache_prefill[cgraph] = std::make_pair(model, compiled_model); + infer_request_cache[cgraph] = std::make_shared(compiled_model.create_infer_request()); + infer_request = *infer_request_cache[cgraph]; if (getenv("GGML_OPENVINO_DUMP_IR")) { char timestamped_filename[64]; @@ -163,12 +169,23 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c ov::serialize(model, timestamped_filename); } } - } - auto infer_request = compiled_model.create_infer_request(); - auto ov_params = model->get_parameters(); - for (size_t i = 0; i < ov_params.size(); i++) { - auto param_name = ov_params[i]->get_friendly_name(); + std::vector ov_input_names; + std::vector ov_output_names; + for (const auto& ov_param : model->get_parameters()) { + ov_input_names.push_back(ov_param->get_friendly_name()); + } + for (const auto& ov_output : model->get_results()) { + ov_output_names.push_back(ov_output->get_friendly_name()); + } + ov_input_names_cache[cgraph] = ov_input_names; + ov_output_names_cache[cgraph] = ov_output_names; + } + + auto ov_input_names = ov_input_names_cache[cgraph]; + auto ov_output_names = ov_output_names_cache[cgraph]; + for (size_t i = 0; i < ov_input_names.size(); i++) { + auto param_name = ov_input_names[i]; auto input_tensor = get_ov_input_tensor(ggml_decoder, param_name); infer_request.set_input_tensor(i, input_tensor); @@ -181,14 +198,15 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c infer_request.infer(); auto infer_end_time = ggml_time_us(); - auto output_names = ggml_decoder->get_model_output_names(); - auto output_tensors = get_ggml_graph_output_dst(ggml_decoder); - for (size_t i = 0; i < output_names.size(); i++) { - auto output_tensor = infer_request.get_output_tensor(i); - std::memcpy(output_tensors[output_names[i]], output_tensor.data(), output_tensor.get_byte_size()); + auto gguf_tensor_addrs = get_ggml_graph_output_dst(ggml_decoder); + for (size_t i = 0; i < ov_output_names.size(); i++) { + auto result_name = ov_output_names[i]; + const auto output_tensor = infer_request.get_output_tensor(i); + + std::memcpy(gguf_tensor_addrs[result_name], output_tensor.data(), output_tensor.get_byte_size()); if (getenv("GGML_OPENVINO_DEBUG_OUTPUT")) { - print_output_tensor_info(output_names[i], output_tensor, output_tensors); + print_output_tensor_info(result_name, output_tensor, gguf_tensor_addrs); } } auto end_time = ggml_time_us(); From 73ee84fffee5429724c9c3e1d2fec540578471d1 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Thu, 3 Jul 2025 11:03:40 +0800 Subject: [PATCH 084/254] Add SwiGLU --- ggml/src/ggml-openvino/.clang-format | 2 +- ggml/src/ggml-openvino/ggml-decoder.cpp | 87 +++++++++++-------- ggml/src/ggml-openvino/ggml-openvino.cpp | 36 +++++--- .../ggml-openvino/openvino/op/glu_swiglu.cpp | 29 +++++++ ggml/src/ggml-openvino/openvino/op_table.cpp | 37 ++++---- ggml/src/ggml-openvino/openvino/op_table.hpp | 2 +- 6 files changed, 123 insertions(+), 70 deletions(-) create mode 100644 ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp diff --git a/ggml/src/ggml-openvino/.clang-format b/ggml/src/ggml-openvino/.clang-format index 9382a117b8..6d77ecea3c 100644 --- a/ggml/src/ggml-openvino/.clang-format +++ b/ggml/src/ggml-openvino/.clang-format @@ -4,6 +4,7 @@ 
AlignConsecutiveAssignments: false AlignConsecutiveDeclarations: false ReferenceAlignment: Left PointerAlignment: Left +Cpp11BracedListStyle: true Language: Cpp AlignAfterOpenBracket: Align @@ -65,7 +66,6 @@ CommentPragmas: '^ IWYU pragma:' CompactNamespaces: false ConstructorInitializerIndentWidth: 4 ContinuationIndentWidth: 4 -Cpp11BracedListStyle: false DerivePointerAlignment: false DisableFormat: false EmptyLineBeforeAccessModifier: Leave diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index e30f026e36..61c0fe4833 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -563,43 +563,58 @@ void GgmlOvDecoder::visit_subgraph(std::function opTypeMap = { - {GGML_OP_ACC, "GGML_OP_ACC"}, {GGML_OP_ADD, "GGML_OP_ADD"}, - {GGML_OP_ADD1, "GGML_OP_ADD1"}, {GGML_OP_CONT, "GGML_OP_CONT"}, - {GGML_OP_CPY, "GGML_OP_CPY"}, {GGML_OP_DIV, "GGML_OP_DIV"}, - {GGML_OP_DUP, "GGML_OP_DUP"}, {GGML_OP_GET_ROWS, "GGML_OP_GET_ROWS"}, - {GGML_OP_MUL, "GGML_OP_MUL"}, {GGML_OP_MUL_MAT, "GGML_OP_MUL_MAT"}, - {GGML_OP_PERMUTE, "GGML_OP_PERMUTE"}, {GGML_OP_RESHAPE, "GGML_OP_RESHAPE"}, - {GGML_OP_RMS_NORM, "GGML_OP_RMS_NORM"}, {GGML_OP_ROPE, "GGML_OP_ROPE"}, - {GGML_OP_SCALE, "GGML_OP_SCALE"}, {GGML_OP_SOFT_MAX, "GGML_OP_SOFT_MAX"}, - {GGML_OP_SUB, "GGML_OP_SUB"}, {GGML_OP_TRANSPOSE, "GGML_OP_TRANSPOSE"}, - {GGML_OP_UNARY, "GGML_OP_UNARY"}, {GGML_OP_VIEW, "GGML_OP_VIEW"}}; - static const std::map unaryOpTypeMap = { - {GGML_UNARY_OP_ABS, "GGML_UNARY_OP_ABS"}, - {GGML_UNARY_OP_SGN, "GGML_UNARY_OP_SGN"}, - {GGML_UNARY_OP_NEG, "GGML_UNARY_OP_NEG"}, - {GGML_UNARY_OP_STEP, "GGML_UNARY_OP_STEP"}, - {GGML_UNARY_OP_TANH, "GGML_UNARY_OP_TANH"}, - {GGML_UNARY_OP_ELU, "GGML_UNARY_OP_ELU"}, - {GGML_UNARY_OP_RELU, "GGML_UNARY_OP_RELU"}, - {GGML_UNARY_OP_SIGMOID, "GGML_UNARY_OP_SIGMOID"}, - {GGML_UNARY_OP_GELU, "GGML_UNARY_OP_GELU"}, - {GGML_UNARY_OP_GELU_QUICK, "GGML_UNARY_OP_GELU_QUICK"}, - {GGML_UNARY_OP_SILU, "GGML_UNARY_OP_SILU"}, - {GGML_UNARY_OP_HARDSWISH, "GGML_UNARY_OP_HARDSWISH"}, + static const std::map ops = { + {GGML_OP_ACC, "GGML_OP_ACC" }, + {GGML_OP_ADD, "GGML_OP_ADD" }, + {GGML_OP_ADD1, "GGML_OP_ADD1" }, + {GGML_OP_CONT, "GGML_OP_CONT" }, + {GGML_OP_CPY, "GGML_OP_CPY" }, + {GGML_OP_DIV, "GGML_OP_DIV" }, + {GGML_OP_DUP, "GGML_OP_DUP" }, + {GGML_OP_GET_ROWS, "GGML_OP_GET_ROWS" }, + {GGML_OP_MUL, "GGML_OP_MUL" }, + {GGML_OP_MUL_MAT, "GGML_OP_MUL_MAT" }, + {GGML_OP_PERMUTE, "GGML_OP_PERMUTE" }, + {GGML_OP_RESHAPE, "GGML_OP_RESHAPE" }, + {GGML_OP_RMS_NORM, "GGML_OP_RMS_NORM" }, + {GGML_OP_ROPE, "GGML_OP_ROPE" }, + {GGML_OP_SCALE, "GGML_OP_SCALE" }, + {GGML_OP_SOFT_MAX, "GGML_OP_SOFT_MAX" }, + {GGML_OP_SUB, "GGML_OP_SUB" }, + {GGML_OP_TRANSPOSE, "GGML_OP_TRANSPOSE"}, + {GGML_OP_VIEW, "GGML_OP_VIEW" } + }; + static const std::map unary_ops = { + {GGML_UNARY_OP_ABS, "GGML_UNARY_OP_ABS" }, + {GGML_UNARY_OP_SGN, "GGML_UNARY_OP_SGN" }, + {GGML_UNARY_OP_NEG, "GGML_UNARY_OP_NEG" }, + {GGML_UNARY_OP_STEP, "GGML_UNARY_OP_STEP" }, + {GGML_UNARY_OP_TANH, "GGML_UNARY_OP_TANH" }, + {GGML_UNARY_OP_ELU, "GGML_UNARY_OP_ELU" }, + {GGML_UNARY_OP_RELU, "GGML_UNARY_OP_RELU" }, + {GGML_UNARY_OP_SIGMOID, "GGML_UNARY_OP_SIGMOID" }, + {GGML_UNARY_OP_GELU, "GGML_UNARY_OP_GELU" }, + {GGML_UNARY_OP_GELU_QUICK, "GGML_UNARY_OP_GELU_QUICK" }, + {GGML_UNARY_OP_SILU, "GGML_UNARY_OP_SILU" }, + {GGML_UNARY_OP_HARDSWISH, "GGML_UNARY_OP_HARDSWISH" }, {GGML_UNARY_OP_HARDSIGMOID, "GGML_UNARY_OP_HARDSIGMOID"}, - {GGML_UNARY_OP_EXP, "GGML_UNARY_OP_EXP"}, - 
{GGML_UNARY_OP_COUNT, "GGML_UNARY_OP_COUNT"}}; - auto it = opTypeMap.find(m_node->op); - if (it != opTypeMap.end()) { - if (it->first == GGML_OP_UNARY) { - auto unary_it = unaryOpTypeMap.find(ggml_get_unary_op(m_node)); - if (unary_it != unaryOpTypeMap.end()) { - return unary_it->second; - } - } - return it->second; + {GGML_UNARY_OP_EXP, "GGML_UNARY_OP_EXP" }, + {GGML_UNARY_OP_COUNT, "GGML_UNARY_OP_COUNT" } + }; + static const std::map glu_ops = { + {GGML_GLU_OP_SWIGLU, "GGML_GLU_OP_SWIGLU"}, + {GGML_GLU_OP_GEGLU, "GGML_GLU_OP_GEGLU" }, + {GGML_GLU_OP_REGLU, "GGML_GLU_OP_REGLU" } + }; + + switch (m_node->op) { + case GGML_OP_UNARY: + return unary_ops.at(ggml_get_unary_op(m_node)); + case GGML_OP_GLU: + return glu_ops.at(ggml_get_glu_op(m_node)); + default: + return ops.at(m_node->op); } - static const std::string unknown_op = "UNKNOWN_OP"; + static const std::string unknown_op = "UNKNOWN_GGML_OP"; return unknown_op; } diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp index 19e4ed5b77..167453b215 100644 --- a/ggml/src/ggml-openvino/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino.cpp @@ -237,21 +237,29 @@ static ggml_backend_buffer_t ggml_backend_openvino_device_buffer_from_host_ptr(g static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) { GGML_ASSERT(dev->reg != nullptr); - static const std::set supported_ops{ - GGML_OP_ADD, GGML_OP_MUL, GGML_OP_MUL_MAT, GGML_OP_VIEW, - GGML_OP_CONT, GGML_OP_CPY, GGML_OP_RESHAPE, GGML_OP_PERMUTE, - GGML_OP_TRANSPOSE, GGML_OP_GET_ROWS, GGML_OP_ROPE, GGML_OP_RMS_NORM, - GGML_OP_SCALE, GGML_OP_SOFT_MAX, - }; - static const std::set supported_unary_ops{ - GGML_UNARY_OP_SILU, - }; + static const std::set supported_ops{GGML_OP_NONE, GGML_OP_ADD, GGML_OP_MUL, GGML_OP_MUL_MAT, + GGML_OP_VIEW, GGML_OP_CONT, GGML_OP_CPY, GGML_OP_RESHAPE, + GGML_OP_PERMUTE, GGML_OP_TRANSPOSE, GGML_OP_GET_ROWS, GGML_OP_ROPE, + GGML_OP_RMS_NORM, GGML_OP_SCALE, GGML_OP_SOFT_MAX}; + static const std::set supported_unary_ops{ + GGML_UNARY_OP_SILU, + }; + static const std::set supported_glu_ops{ + GGML_GLU_OP_SWIGLU, + }; - if (op->op == GGML_OP_UNARY) { - return supported_unary_ops.find(ggml_get_unary_op(op)) != - supported_unary_ops.end(); - } - return supported_ops.find(op->op) != supported_ops.end(); + auto res = false; + switch (op->op) { + case GGML_OP_UNARY: + res = supported_unary_ops.find(ggml_get_unary_op(op)) != supported_unary_ops.end(); + break; + case GGML_OP_GLU: + res = supported_glu_ops.find(ggml_get_glu_op(op)) != supported_glu_ops.end(); + break; + default: + res = supported_ops.find(op->op) != supported_ops.end(); + } + return res; } static bool ggml_backend_openvino_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { diff --git a/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp b/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp new file mode 100644 index 0000000000..28013fbaa0 --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp @@ -0,0 +1,29 @@ +#include +#include +#include + +#include "../node_context.hpp" +#include "../op_table.hpp" +#include "../utils.hpp" + +namespace ov { +namespace frontend { +namespace ggml { +namespace op { + +OutputVector translate_glu_swiglu(const NodeContext& context) { + num_inputs_check(context, 2, 2); + + auto src1 = context.get_input(0); + auto src2 = context.get_input(1); + auto sigmoid = std::make_shared(src1); + auto silu = std::make_shared(src1, sigmoid); + auto res = 
diff --git a/ggml/src/ggml-openvino/openvino/op_table.cpp b/ggml/src/ggml-openvino/openvino/op_table.cpp
index bf7d54d9a1..a99450ea95 100644
--- a/ggml/src/ggml-openvino/openvino/op_table.cpp
+++ b/ggml/src/ggml-openvino/openvino/op_table.cpp
@@ -16,24 +16,25 @@ namespace ggml {
 std::unordered_map<std::string, CreatorFunction> get_supported_ops() {
     using namespace ov::op;
     return {
-        { "GGML_OP_ADD", op::translate_1to1_match_2_inputs<v1::Add> },
-        { "GGML_OP_ADD1", op::translate_1to1_match_2_inputs<v1::Add> },
-        { "GGML_OP_CONT", op::translate_cont },
-        { "GGML_OP_CPY", op::translate_cpy },
-        { "GGML_OP_DIV", op::translate_1to1_match_2_inputs<v1::Divide> },
-        { "GGML_OP_GET_ROWS", op::translate_get_rows },
-        { "GGML_OP_MUL", op::translate_1to1_match_2_inputs<v1::Multiply> },
-        { "GGML_OP_MUL_MAT", op::translate_mulmat },
-        { "GGML_OP_PERMUTE", op::translate_permute },
-        { "GGML_OP_RESHAPE", op::translate_reshape },
-        { "GGML_OP_RMS_NORM", op::translate_rms_norm },
-        { "GGML_OP_ROPE", op::translate_rope },
-        { "GGML_OP_SCALE", op::translate_scale },
-        { "GGML_OP_SOFT_MAX", op::translate_soft_max },
-        { "GGML_OP_SUB", op::translate_1to1_match_2_inputs<v1::Subtract> },
-        { "GGML_OP_TRANSPOSE", op::translate_transpose },
-        { "GGML_UNARY_OP_SILU", op::translate_unary_silu },
-        { "GGML_OP_VIEW", op::translate_view }
+        {"GGML_OP_ADD",        op::translate_1to1_match_2_inputs<v1::Add>     },
+        {"GGML_OP_ADD1",       op::translate_1to1_match_2_inputs<v1::Add>     },
+        {"GGML_OP_CONT",       op::translate_cont                             },
+        {"GGML_OP_CPY",        op::translate_cpy                              },
+        {"GGML_OP_DIV",        op::translate_1to1_match_2_inputs<v1::Divide>  },
+        {"GGML_OP_GET_ROWS",   op::translate_get_rows                         },
+        {"GGML_OP_MUL",        op::translate_1to1_match_2_inputs<v1::Multiply>},
+        {"GGML_OP_MUL_MAT",    op::translate_mulmat                           },
+        {"GGML_OP_PERMUTE",    op::translate_permute                          },
+        {"GGML_OP_RESHAPE",    op::translate_reshape                          },
+        {"GGML_OP_RMS_NORM",   op::translate_rms_norm                         },
+        {"GGML_OP_ROPE",       op::translate_rope                             },
+        {"GGML_OP_SCALE",      op::translate_scale                            },
+        {"GGML_OP_SOFT_MAX",   op::translate_soft_max                         },
+        {"GGML_OP_SUB",        op::translate_1to1_match_2_inputs<v1::Subtract>},
+        {"GGML_OP_TRANSPOSE",  op::translate_transpose                        },
+        {"GGML_UNARY_OP_SILU", op::translate_unary_silu                       },
+        {"GGML_OP_VIEW",       op::translate_view                             },
+        {"GGML_GLU_OP_SWIGLU", op::translate_glu_swiglu                       },
     };
 }
diff --git a/ggml/src/ggml-openvino/openvino/op_table.hpp b/ggml/src/ggml-openvino/openvino/op_table.hpp
index d576c2a135..9b141d6d20 100644
--- a/ggml/src/ggml-openvino/openvino/op_table.hpp
+++ b/ggml/src/ggml-openvino/openvino/op_table.hpp
@@ -24,8 +24,8 @@ GGML_OP_CONVERTER(translate_scale);
 GGML_OP_CONVERTER(translate_unary_silu);
 GGML_OP_CONVERTER(translate_soft_max);
 GGML_OP_CONVERTER(translate_transpose);
-GGML_OP_CONVERTER(translate_unary);
 GGML_OP_CONVERTER(translate_view);
+GGML_OP_CONVERTER(translate_glu_swiglu);
 
 }  // namespace op

From ebc4fc9f95d383520e8565340cb19e303076e008 Mon Sep 17 00:00:00 2001
From: "Yu, Zijun"
Date: Thu, 3 Jul 2025 13:22:39 +0800
Subject: [PATCH 085/254] Fuse to SDPA

---
 ggml/src/ggml-openvino/ggml-decoder.cpp       | 48 ++++++-----
 ggml/src/ggml-openvino/ggml-decoder.h         | 10 +--
 ggml/src/ggml-openvino/openvino/decoder.hpp   |  2 +-
 .../ggml-openvino/openvino/node_context.hpp   | 13 ++-
 ggml/src/ggml-openvino/openvino/op/mulmat.cpp |  5 +-
 .../src/ggml-openvino/openvino/op/permute.cpp | 21 +++--
 .../ggml-openvino/openvino/op/soft_max.cpp    | 80 ++++++++++---------
 .../openvino/pass/fuse_to_sdpa.cpp            | 61 ++++++++++++++
 .../openvino/pass/fuse_to_sdpa.hpp            | 17 ++++
 .../openvino/translate_session.cpp            |  3 +
 ggml/src/ggml-openvino/openvino/utils.hpp     |  2 +-
 ggml/src/ggml-openvino/utils.cpp              | 20 ++---
 12 files changed, 189 insertions(+), 93 deletions(-)
 create mode 100644 ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.cpp
 create mode 100644 ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.hpp

diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp
index 61c0fe4833..4a45aa2140 100644
--- a/ggml/src/ggml-openvino/ggml-decoder.cpp
+++ b/ggml/src/ggml-openvino/ggml-decoder.cpp
@@ -26,27 +26,36 @@
 #include "ggml-backend-impl.h"
 #include "ggml-backend.h"
 
-GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* m_cgraph, bool is_static,
+GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgraph, bool is_static, bool is_first_token,
+                             int context_size, int num_heads, int num_heads_kv, int head_size) :
+    GgmlOvDecoder::GgmlOvDecoder(node, cgraph, is_static, is_first_token) {
+    m_context_size = context_size;
+    m_num_heads = num_heads;
+    m_num_heads_kv = num_heads_kv;
+    m_head_size = head_size;
+}
+
+GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgraph, bool is_static,
                              bool is_first_token) :
-    m_cgraph(m_cgraph),
+    m_cgraph(cgraph),
     m_node(node),
     m_op_name(m_node ? std::string(m_node->name) : "NONE_OP"),
     m_is_static(is_static),
     m_is_first_token(is_first_token) {
+    // TODO avoid static
     static std::map<std::string, std::shared_ptr<ov::Node>> model_weights;
-
     if (m_node) {
         set_input_output(m_node);
     } else {
         static bool printed = false;
         if (!printed && getenv("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS")) {
-            print_tensor_address_map(m_cgraph);
+            print_tensor_address_map(cgraph);
             printed = true;
         }
         if (getenv("GGML_OPENVINO_DUMP_CGRAPH")) {
             std::string filename = "cgraph.txt";
-            dump_cgraph(m_cgraph, filename);
+            dump_cgraph(cgraph, filename);
         }
 
         set_llm_params();
@@ -57,8 +66,8 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* m_cgr
             weight_created = true;
         }
 
-        for (int node_n = 0; node_n < m_cgraph->n_nodes; node_n++) {
-            auto* cur_node = m_cgraph->nodes[node_n];
+        for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) {
+            auto* cur_node = cgraph->nodes[node_n];
             m_nodes.push_back(cur_node);
             set_input_output(cur_node);
         }
@@ -195,7 +204,7 @@ void GgmlOvDecoder::set_llm_params() {
         auto* node = m_cgraph->nodes[i];
         if (node->op == GGML_OP_VIEW && std::string(node->name) == "cache_k_l0 (view)") {
             auto* cache_k = node->src[0];
-            m_max_token_len = cache_k->ne[1];
+            m_context_size = cache_k->ne[1];
         } else if (node->op == GGML_OP_ROPE && std::string(node->name) == "Qcur-0") {
             m_head_size = node->ne[0];
             m_num_heads = node->ne[1];
@@ -210,30 +219,30 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor* src) co
     if (std::string(src->name) == "inp_tokens" || std::string(src->name) == "inp_pos") {
         if (m_is_static) {
             if (m_is_first_token) {
-                input_shape = ov::PartialShape{ 1, 1, m_max_token_len };
+                input_shape = ov::PartialShape{1, 1, m_context_size};
             } else {
-                input_shape = ov::PartialShape{ 1, 1, 1 };
+                input_shape = ov::PartialShape{1, 1, 1};
             }
         } else {
-            input_shape = ov::PartialShape{ 1, 1, ov::Dimension(1, m_max_token_len) };
+            input_shape = ov::PartialShape{1, 1, ov::Dimension(1, m_context_size)};
         }
     } else if (std::string(src->name) == "KQ_mask") {
         if (m_is_static) {
             if (m_is_first_token) {
-                input_shape = ov::PartialShape{ 1, m_max_token_len, m_max_token_len };
+                input_shape = ov::PartialShape{1, m_context_size, m_context_size};
             } else {
-                input_shape = ov::PartialShape{ 1, 1, m_max_token_len };
+                input_shape = ov::PartialShape{1, 1, m_context_size};
            }
        } else {
-            auto max_mask_size = GGML_PAD(m_max_token_len, GGML_KQ_MASK_PAD);
-            input_shape = ov::PartialShape{ 1, ov::Dimension(1, max_mask_size), ov::Dimension(1, max_mask_size) };
+            auto max_mask_size = GGML_PAD(m_context_size, GGML_KQ_MASK_PAD);
+            input_shape = ov::PartialShape{1, ov::Dimension(1, max_mask_size), ov::Dimension(1, max_mask_size)};
        }
     } else if (std::string(src->name).find("cache_k") == 0) {
-        input_shape = ov::PartialShape{ m_max_token_len, m_num_heads_kv, m_head_size };
+        input_shape = ov::PartialShape{m_context_size, m_num_heads_kv, m_head_size};
     } else if (std::string(src->name).find("cache_v") == 0) {
-        input_shape = ov::PartialShape{ m_num_heads_kv, m_head_size, m_max_token_len };
+        input_shape = ov::PartialShape{m_num_heads_kv, m_head_size, m_context_size};
     } else {
-        input_shape = ov::PartialShape{ get_shape(src) };
+        input_shape = ov::PartialShape{get_shape(src)};
     }
     return input_shape;
 }
@@ -557,7 +566,8 @@ int32_t* GgmlOvDecoder::get_output_op_params(const std::string& name) const {
 
 void GgmlOvDecoder::visit_subgraph(std::function<void(std::shared_ptr<GgmlDecoder>)> node_visitor) const {
     for (const auto& node : m_nodes) {
-        auto decoder = std::make_shared<GgmlOvDecoder>(node, m_cgraph, m_is_static, m_is_first_token);
+        auto decoder = std::make_shared<GgmlOvDecoder>(node, m_cgraph, m_is_static, m_is_first_token, m_context_size,
+                                                       m_num_heads, m_num_heads_kv, m_head_size);
         node_visitor(decoder);
     }
 }
diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h
index 6d3f24b093..171300b406 100644
--- a/ggml/src/ggml-openvino/ggml-decoder.h
+++ b/ggml/src/ggml-openvino/ggml-decoder.h
@@ -11,9 +11,9 @@
 
 class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {
 public:
-    using ov::frontend::ggml::GgmlDecoder::GgmlDecoder;
-
     GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgraph, bool is_static, bool is_first_token);
+    GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgraph, bool is_static, bool is_first_token,
+                  int context_size, int num_heads, int num_heads_kv, int head_size);
 
     virtual ov::Any get_attribute(const std::string& name) const override {
         return nullptr;
@@ -90,7 +90,7 @@ public:
         return m_model_output_names;
     }
 
-    virtual int get_max_token_len() const override { return m_max_token_len; }
+    virtual int get_context_size() const override { return m_context_size; }
 
     virtual int get_num_heads() const override { return m_num_heads; }
 
@@ -114,7 +114,7 @@ private:
     static std::vector<size_t> get_stride(const ggml_tensor* tensor);
     static ov::element::Type get_ov_type(const ggml_tensor* tensor);
 
-    // set max_token_len, num_heads, etc
+    // set context_size, num_heads, etc
     void set_llm_params();
 
     static std::shared_ptr<ov::Node> create_weight_node(ggml_tensor* tensor);
@@ -136,7 +136,7 @@ private:
     std::map<std::string, std::shared_ptr<ov::Tensor>> m_model_extra_input_values;
     std::map<std::string, std::shared_ptr<ov::Node>> m_model_weights;
     std::vector<std::string> m_model_output_names;
-    int m_max_token_len;
+    int m_context_size;
     int m_num_heads;
     int m_num_heads_kv;
     int m_head_size;
diff --git a/ggml/src/ggml-openvino/openvino/decoder.hpp b/ggml/src/ggml-openvino/openvino/decoder.hpp
index 3105d0f16f..8d2e06c0e5 100644
--- a/ggml/src/ggml-openvino/openvino/decoder.hpp
+++ b/ggml/src/ggml-openvino/openvino/decoder.hpp
@@ -65,7 +65,7 @@ public:
     virtual bool is_static() const = 0;
     virtual bool is_first_token() const = 0;
-    virtual int get_max_token_len() const = 0;
+    virtual int get_context_size() const = 0;
 };
 
 }  // namespace ggml
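[Editor's note] With the hyper-parameters carried on the decoder, every graph input gets a fixed or bounded shape: cache_k is [context_size, num_heads_kv, head_size], cache_v is [num_heads_kv, head_size, context_size], and the dynamic-path KQ_mask is bounded by GGML_PAD(context_size, GGML_KQ_MASK_PAD). For a concrete feel, a sketch with illustrative numbers (a Llama-style config, not taken from the patch):

    #include <openvino/core/partial_shape.hpp>

    // context_size = 4096, num_heads_kv = 8, head_size = 128 (assumed values)
    ov::PartialShape cache_k_shape{4096, 8, 128};  // [S, N_kv, H]
    ov::PartialShape cache_v_shape{8, 128, 4096};  // [N_kv, H, S]
    ov::PartialShape kq_mask_shape{1, ov::Dimension(1, 4096), ov::Dimension(1, 4096)};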
diff --git a/ggml/src/ggml-openvino/openvino/node_context.hpp b/ggml/src/ggml-openvino/openvino/node_context.hpp
index f4e7c4e31f..62aa7d1fc5 100644
--- a/ggml/src/ggml-openvino/openvino/node_context.hpp
+++ b/ggml/src/ggml-openvino/openvino/node_context.hpp
@@ -91,11 +91,16 @@ public:
     bool is_first_token() const {
         return m_decoder->is_first_token();
     }
-    int get_max_token_len() const {
-        return m_decoder->get_max_token_len();
-    }
 
-private:
+    int get_num_heads() const { return m_decoder->get_num_heads(); }
+
+    int get_num_heads_kv() const { return m_decoder->get_num_heads_kv(); }
+
+    int get_head_size() const { return m_decoder->get_head_size(); }
+
+    int get_context_size() const { return m_decoder->get_context_size(); }
+
+  private:
     std::shared_ptr<GgmlDecoder> m_decoder;
     std::shared_ptr<TensorMap>& m_tensor_map;
     TranslateSession* m_translate_session;
diff --git a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp
index d5a6ba2f03..cd027d2894 100644
--- a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp
+++ b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp
@@ -38,9 +38,8 @@ OutputVector translate_mulmat(const NodeContext& context) {
     ov::Output<Node> B = context.get_input(0);
     ov::Output<Node> A = std::make_shared<ov::op::v0::Convert>(context.get_input(1), context.get_input_type(0));
 
-    auto src0_shape = context.get_input_shape(0).to_shape();
-    int64_t num_heads = context.get_input_shape(1).to_shape()[0];
-    int64_t num_heads_kv = src0_shape[0];
+    int64_t num_heads = context.get_num_heads();
+    int64_t num_heads_kv = context.get_num_heads_kv();
     int64_t kv_num_heads_factor = num_heads / num_heads_kv;
     if (kv_num_heads_factor > 1) {
         auto num_heads_node = ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector<int64_t>{num_heads});
diff --git a/ggml/src/ggml-openvino/openvino/op/permute.cpp b/ggml/src/ggml-openvino/openvino/op/permute.cpp
index 09d15da427..978b5377fb 100644
--- a/ggml/src/ggml-openvino/openvino/op/permute.cpp
+++ b/ggml/src/ggml-openvino/openvino/op/permute.cpp
@@ -27,7 +27,7 @@ OutputVector translate_permute(const NodeContext& context) {
     if (op_case == 1) {
         auto perm = argsort_descend(context.get_output_stride(0));
         res = std::make_shared<ov::op::v1::Transpose>(context.get_input(0),
-                                                      ov::op::v0::Constant::create(ov::element::i64, { 3 }, perm));
+                                                      ov::op::v0::Constant::create(ov::element::i64, {3}, perm));
     } else {
         auto src = context.get_input(0);
         auto attention_size = context.get_input("attention_size");
@@ -51,19 +51,16 @@ OutputVector translate_permute(const NodeContext& context) {
                 false);
         }
 
-        auto slice_start = ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector<int64_t>(3, 0));
-        auto slice_step = ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector<int64_t>(3, 1));
-        std::shared_ptr<ov::Node> slice_end;
+        auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
+        auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
+        auto two = ov::op::v0::Constant::create(ov::element::i64, {1}, {2});
+        std::shared_ptr<ov::Node> slice_axis;
         if (op_case == 2) {
-            slice_end = std::make_shared<ov::op::v0::Concat>(
-                ov::OutputVector{attention_size, ov::op::v0::Constant::create(ov::element::i64, {2}, {src_shape[1], src_shape[2]})},
-                0);
+            slice_axis = zero;
         } else {
-            slice_end = std::make_shared<ov::op::v0::Concat>(
-                ov::OutputVector{ov::op::v0::Constant::create(ov::element::i64, {2}, {src_shape[1], src_shape[0]}), attention_size},
-                0);
+            slice_axis = two;
         }
-        auto src_slice = std::make_shared<ov::op::v8::Slice>(src_reshaped, slice_start, slice_end, slice_step);
+        auto src_slice = std::make_shared<ov::op::v8::Slice>(src_reshaped, zero, attention_size, one, slice_axis);
 
         if (op_case == 2) {
             res = std::make_shared<ov::op::v1::Transpose>(src_slice,
                                                           ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 0, 2}));
@@ -71,7 +68,7 @@ OutputVector translate_permute(const NodeContext& context) {
             res = src_slice;
         }
     }
-    return rename_outputs_with_suffix({ res }, context.get_name());
+    return rename_outputs_with_suffix({res}, context.get_name());
 }
 
 }  // namespace op
diff --git a/ggml/src/ggml-openvino/openvino/op/soft_max.cpp b/ggml/src/ggml-openvino/openvino/op/soft_max.cpp
index aeca9b3be5..81d43c37fe 100644
--- a/ggml/src/ggml-openvino/openvino/op/soft_max.cpp
+++ b/ggml/src/ggml-openvino/openvino/op/soft_max.cpp
@@ -1,3 +1,5 @@
+#include
+#include
 #include
 #include
 #include
@@ -5,6 +7,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -22,62 +25,61 @@ namespace op {
 
 OutputVector translate_soft_max(const NodeContext& context) {
     num_inputs_check(context, 1, 2);
 
-    auto input_node = context.get_input(0);
+    auto input_node = context.get_input(0).get_node_shared_ptr();
 
     ov::Output<Node> res;
 
     float scale = 1.0f;
     float max_bias = 0.0f;
-    auto * op_params = context.get_output_op_params(0);
-    memcpy(&scale, (float*)op_params + 0, sizeof(float));
-    memcpy(&max_bias, (float*)op_params + 1, sizeof(float));
+    auto* op_params = context.get_output_op_params(0);
+    memcpy(&scale, (float*) op_params + 0, sizeof(float));
+    memcpy(&max_bias, (float*) op_params + 1, sizeof(float));
 
+    const uint32_t h = context.get_head_size();
-    // const uint32_t n_head = context.get_input_shape(0)[0].get_length();
-    // const uint32_t n_head_log2 = 1u << (uint32_t)floor(log2(n_head));
+    const uint32_t n_head = context.get_input_shape(0)[0].get_length();
+    const uint32_t n_head_log2 = 1u << (uint32_t) floor(log2(n_head));
 
-    // const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
-    // const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
-    // const float slope = (max_bias > 0.0f) ? h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2*(h - n_head_log2) + 1)
-    //                                       : 1.0f;
-    const float slope = 1.0;
+    const float m0 = powf(2.0f, -(max_bias) / n_head_log2);
+    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
+    const float slope =
+        (max_bias > 0.0f) ? h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2 * (h - n_head_log2) + 1) : 1.0f;
 
+    std::shared_ptr<ov::Node> scaled_input;
     if (scale != 1.0f) {
         auto scale_node =
             std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{}, std::vector<float>{scale});
-        input_node = std::make_shared<ov::op::v1::Multiply>(input_node, scale_node);
+        scaled_input = std::make_shared<ov::op::v1::Multiply>(input_node, scale_node);
     }
 
-    if (context.get_input_size() == 2) {
-        // Calculate mask then softmax
-        auto mask_node = context.get_input(1);
-        ov::element::Type mask_type = context.get_input_type(1);
-        if (mask_type == ov::element::f16) {
-            // Convert f16 to f32
-            mask_node = std::make_shared<ov::op::v0::Convert>(mask_node, ov::element::f32);
-        }
+    auto mask_node = context.get_input(1);
 
-        // Stride slice mask node
-        Output<Node> slice_start = ov::op::v0::Constant::create(ov::element::i64, Shape{3}, {0, 0, 0});
-        auto one = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {1});
-        auto token_len = get_dimensions(input_node.get_node_shared_ptr(), {1});
-        auto total_token_len = get_dimensions(mask_node.get_node_shared_ptr(), {2});
-        auto slice_end = std::make_shared<ov::op::v0::Concat>(ov::NodeVector{one, token_len, total_token_len}, 0);
-        Output<Node> slice_stride = ov::op::v0::Constant::create(ov::element::i64, Shape{3}, {1, 1, 1});
-        auto mask_node_sliced = std::make_shared<ov::op::v8::Slice>(mask_node, slice_start, slice_end, slice_stride);
+    // Use Q-cur to retrieve the token length, so that the translation of SOFT_MAX
+    // does not depend on the result of the QK MatMul, so that QK matmul + softmax + qkv matmul
+    // can be fused into SDPA.
+    if (input_node->get_type_info() != ov::op::v0::Convert::get_type_info_static()) {
+        throw std::runtime_error("Input of SOFT_MAX should be MatMul of qk followed by a Convert");
+    }
+    auto qk = input_node->get_input_node_shared_ptr(0);
+    if (qk->get_type_info() != ov::op::v0::MatMul::get_type_info_static()) {
+        throw std::runtime_error("Input of SOFT_MAX should be MatMul of qk followed by a Convert");
+    }
+    auto token_len = get_dimensions(qk->get_input_node_shared_ptr(0), {1});
 
-        // slope * mask
+    auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
+    auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
+    auto mask_node_sliced = std::make_shared<ov::op::v8::Slice>(mask_node, zero, token_len, one, one);
+
+    Output<Node> slope_mask;
+    if (slope != 1.0f) {
         auto slope_node = std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{}, std::vector<float>{slope});
-        auto slope_mask_node = std::make_shared<ov::op::v1::Multiply>(mask_node_sliced, slope_node);
-
-        // input + slope * mask
-        auto input_slope_mask_node = std::make_shared<ov::op::v1::Add>(input_node, slope_mask_node);
-
-        // Calculate softmax
-        res = std::make_shared<ov::op::v8::Softmax>(input_slope_mask_node, 2);
-    } else {
-        // Directly softmax
-        res = std::make_shared<ov::op::v8::Softmax>(input_node, 0);
+        slope_mask = std::make_shared<ov::op::v1::Multiply>(mask_node_sliced, slope_node);
+        throw std::runtime_error("Slope != 1.0f in softmax has not been tested, verify it before use.");
     }
+    slope_mask = mask_node_sliced;
+
+    auto input_slope_mask_node = std::make_shared<ov::op::v1::Add>(scaled_input, slope_mask);
+
+    res = std::make_shared<ov::op::v8::Softmax>(input_slope_mask_node, 2);
 
     return rename_outputs_with_suffix({res}, context.get_name());
 }
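[Editor's note] The rewritten translation deliberately emits the exact chain MatMul(Q, K) -> Convert(f32) -> Multiply(scale) -> Add(mask) -> Softmax -> Convert(f16) -> MatMul(., V) that the FuseToSDPA pass below pattern-matches. A hedged sketch of the equivalent computation, f32 throughout for brevity (in the real pass the mask and scale are converted to f16; names and shapes here are illustrative):

    #include <openvino/op/add.hpp>
    #include <openvino/op/constant.hpp>
    #include <openvino/op/matmul.hpp>
    #include <openvino/op/multiply.hpp>
    #include <openvino/op/softmax.hpp>

    // H = heads, T = new tokens, S = total tokens, D = head size.
    std::shared_ptr<ov::Node> attention_chain(const ov::Output<ov::Node>& q,     // [H, T, D]
                                              const ov::Output<ov::Node>& k,     // [H, S, D]
                                              const ov::Output<ov::Node>& v_t,   // [H, S, D], V pre-transposed
                                              const ov::Output<ov::Node>& mask,  // [1, T, S]
                                              float scale) {
        auto s      = ov::op::v0::Constant::create(ov::element::f32, {}, {scale});
        auto qk     = std::make_shared<ov::op::v0::MatMul>(q, k, false, true);  // q @ k^T -> [H, T, S]
        auto scaled = std::make_shared<ov::op::v1::Multiply>(qk, s);            // * 1/sqrt(D)
        auto masked = std::make_shared<ov::op::v1::Add>(scaled, mask);          // + KQ mask
        auto attn   = std::make_shared<ov::op::v8::Softmax>(masked, 2);         // softmax over S
        return std::make_shared<ov::op::v0::MatMul>(attn, v_t);                 // -> [H, T, D]
    }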
diff --git a/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.cpp b/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.cpp
new file mode 100644
index 0000000000..1b7ac60271
--- /dev/null
+++ b/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.cpp
@@ -0,0 +1,61 @@
+#include "fuse_to_sdpa.hpp"
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace ov {
+namespace frontend {
+namespace ggml {
+namespace pass {
+
+FuseToSDPA::FuseToSDPA() {
+    const auto m_k = ov::pass::pattern::any_input();
+    const auto m_q = ov::pass::pattern::any_input();
+    const auto m_qk = ov::pass::pattern::wrap_type<ov::op::v0::MatMul>({m_q, m_k});
+    const auto m_qk_f32 = ov::pass::pattern::wrap_type<ov::op::v0::Convert>({m_qk});
+    const auto m_scale = ov::pass::pattern::any_input();
+    const auto m_scaled_qk = ov::pass::pattern::wrap_type<ov::op::v1::Multiply>({m_qk_f32, m_scale});
+    const auto m_mask = ov::pass::pattern::any_input();
+    const auto m_masked_qk = ov::pass::pattern::wrap_type<ov::op::v1::Add>({m_scaled_qk, m_mask});
+    const auto m_softmax_qk = ov::pass::pattern::wrap_type<ov::op::v8::Softmax>({m_masked_qk});
+    const auto m_softmax_qk_f16 = ov::pass::pattern::wrap_type<ov::op::v0::Convert>({m_softmax_qk});
+    const auto m_v = ov::pass::pattern::any_input();
+    const auto m_qkv = ov::pass::pattern::wrap_type<ov::op::v0::MatMul>({m_softmax_qk_f16, m_v});
+
+    const auto callback = [=](ov::pass::pattern::Matcher& m) {
+        auto& pattern_to_output = m.get_pattern_value_map();
+        auto k = pattern_to_output[m_k];
+        auto q = pattern_to_output[m_q];
+        auto v = pattern_to_output[m_v];
+        auto mask = pattern_to_output[m_mask];
+        auto scale = pattern_to_output[m_scale];
+
+        auto v_trans =
+            register_new_node<ov::op::v1::Transpose>(v, ov::op::v0::Constant::create(ov::element::i64, {3}, {0, 2, 1}));
+        auto mask_f16 = register_new_node<ov::op::v0::Convert>(mask, ov::element::f16);
+        auto scale_f16 = register_new_node<ov::op::v0::Convert>(scale, ov::element::f16);
+        auto sdpa = std::make_shared<ov::op::v13::ScaledDotProductAttention>(q, k, v_trans, mask_f16, scale_f16, false);
+
+        ov::replace_node(m.get_match_root(), sdpa);
+        ov::copy_runtime_info(m.get_matched_nodes(), sdpa);
+
+        return true;
+    };
+    register_matcher(std::make_shared<ov::pass::pattern::Matcher>(m_qkv, "ov::frontend::ggml::pass::FuseToSDPA"),
+                     callback);
+}
+
+}  // namespace pass
+}  // namespace ggml
+}  // namespace frontend
+}  // namespace ov
diff --git a/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.hpp b/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.hpp
new file mode 100644
index 0000000000..8b5164d232
--- /dev/null
+++ b/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.hpp
@@ -0,0 +1,17 @@
+#include "openvino/pass/matcher_pass.hpp"
+
+namespace ov {
+namespace frontend {
+namespace ggml {
+namespace pass {
+
+class FuseToSDPA : public ov::pass::MatcherPass {
+public:
+    OPENVINO_MATCHER_PASS_RTTI("ov::frontend::ggml::pass::FuseToSDPA")
+    FuseToSDPA();
+};
+
+}  // namespace pass
+}  // namespace ggml
+}  // namespace frontend
+}  // namespace ov
diff --git a/ggml/src/ggml-openvino/openvino/translate_session.cpp b/ggml/src/ggml-openvino/openvino/translate_session.cpp
index 3bf0403a64..1f311b4a40 100644
--- a/ggml/src/ggml-openvino/openvino/translate_session.cpp
+++ b/ggml/src/ggml-openvino/openvino/translate_session.cpp
@@ -9,6 +9,7 @@
 #include
 
 #include "input_model.hpp"
+#include "pass/fuse_to_sdpa.hpp"
 
 namespace ov {
 namespace frontend {
@@ -145,6 +146,8 @@ void TranslateSession::apply_transformations(const std::shared_ptr<ov::Model>& model
         const auto kv_param_res_names = ggml_model_decoder->get_kv_param_res_names();
         const auto kv_param_res_pairs = get_kv_param_res_pairs(model, kv_param_res_names);
         manager.register_pass<ov::pass::MakeStateful>(kv_param_res_pairs);
+
+        manager.register_pass<pass::FuseToSDPA>();
     }
 
     manager.run_passes(model);
diff --git a/ggml/src/ggml-openvino/openvino/utils.hpp b/ggml/src/ggml-openvino/openvino/utils.hpp
index 1896f81427..b54b2b92c9 100644
--- a/ggml/src/ggml-openvino/openvino/utils.hpp
+++ b/ggml/src/ggml-openvino/openvino/utils.hpp
@@ -65,7 +65,7 @@ template <typename T>
 OutputVector translate_1to1_match_2_inputs(const NodeContext& context) {
     num_inputs_check(context, 2, 2);
     auto res = std::make_shared<T>(context.get_input(0), context.get_input(1));
-    return rename_outputs_with_suffix({ res }, context.get_name());
+    return rename_outputs_with_suffix({res}, context.get_name());
 }
 
 }  // namespace op
diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp
index 2620fa5615..2c4f0afe58 100644
--- a/ggml/src/ggml-openvino/utils.cpp
+++ b/ggml/src/ggml-openvino/utils.cpp
@@ -17,6 +17,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -88,6 +89,7 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
     if (cache_dir && !is_static) {
         core.set_property(ov::cache_dir(cache_dir));
     }
+    // core.set_property(ov::enable_profiling(true));
 
     static std::unordered_map<struct ggml_cgraph*, std::shared_ptr<ov::InferRequest>> infer_request_cache;
     static std::unordered_map<struct ggml_cgraph*, std::vector<std::string>> ov_input_names_cache;
@@ -256,10 +258,10 @@ ov::Tensor get_ov_input_tensor(std::shared_ptr<GgmlOvDecoder> ggml_decoder, cons
     } else {
         if (param_name == "inp_tokens" || param_name == "inp_pos") {
             if (is_first_token) {
-                size_t max_token_len = ggml_decoder->get_max_token_len();
+                size_t context_size = ggml_decoder->get_context_size();
                 const auto* input_tensor_ggml = ggml_decoder->get_input_ggml_tensor(param_name);
-                std::vector<int32_t> padded_data = pad_input<int32_t>(input_tensor_ggml, 1, max_token_len, 0);
-                input_tensor = ov::Tensor(ov::element::i32, ov::Shape{ 1, 1, max_token_len });
+                std::vector<int32_t> padded_data = pad_input<int32_t>(input_tensor_ggml, 1, context_size, 0);
+                input_tensor = ov::Tensor(ov::element::i32, ov::Shape{1, 1, context_size});
                 auto* data_ptr = input_tensor.data<int32_t>();
                 std::copy(padded_data.begin(), padded_data.end(), data_ptr);
             } else {
@@ -267,18 +269,18 @@ ov::Tensor get_ov_input_tensor(std::shared_ptr<GgmlOvDecoder> ggml_decoder, cons
             }
 
         } else if (param_name == "KQ_mask") {
-            size_t max_token_len = ggml_decoder->get_max_token_len();
+            size_t context_size = ggml_decoder->get_context_size();
             const auto* input_tensor_ggml = ggml_decoder->get_input_ggml_tensor(param_name);
             if (is_first_token) {
                 std::vector<float> padded_data =
-                    pad_input<float>(input_tensor_ggml, max_token_len, max_token_len, -INFINITY);
-                set_zero_diagonal(padded_data, max_token_len);
-                input_tensor = ov::Tensor(ov::element::f32, ov::Shape{ 1, max_token_len, max_token_len });
+                    pad_input<float>(input_tensor_ggml, context_size, context_size, -INFINITY);
+                set_zero_diagonal(padded_data, context_size);
+                input_tensor = ov::Tensor(ov::element::f32, ov::Shape{1, context_size, context_size});
                 auto* data_ptr = input_tensor.data<float>();
                 std::copy(padded_data.begin(), padded_data.end(), data_ptr);
             } else {
-                std::vector<float> padded_data = pad_input<float>(input_tensor_ggml, 1, max_token_len, -INFINITY);
-                input_tensor = ov::Tensor(ov::element::f32, ov::Shape{ 1, 1, max_token_len });
+                std::vector<float> padded_data = pad_input<float>(input_tensor_ggml, 1, context_size, -INFINITY);
+                input_tensor = ov::Tensor(ov::element::f32, ov::Shape{1, 1, context_size});
                 auto* data_ptr = input_tensor.data<float>();
                 std::copy(padded_data.begin(), padded_data.end(), data_ptr);
             }
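[Editor's note] On the static (NPU) path the runtime pads every prompt to the fixed context size; pad_input above fills the tail of each row with a given value (0 for token ids, -INFINITY for the mask). pad_input itself is the backend's helper and is not shown in this patch; a hedged sketch of the row-padding shape of the computation:

    #include <algorithm>
    #include <cstddef>
    #include <vector>

    // Illustrative only: pad a row-major [rows, valid_cols] buffer out to
    // [rows, padded_cols], filling the new tail of each row with `pad`.
    template <typename T>
    std::vector<T> pad_rows(const T* src, size_t rows, size_t valid_cols, size_t padded_cols, T pad) {
        std::vector<T> out(rows * padded_cols, pad);
        for (size_t r = 0; r < rows; ++r) {
            std::copy(src + r * valid_cols, src + (r + 1) * valid_cols, out.begin() + r * padded_cols);
        }
        return out;
    }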
From bf5414c95e0b9f83affc215cf8c8a6708375fc59 Mon Sep 17 00:00:00 2001
From: "Yu, Zijun"
Date: Fri, 4 Jul 2025 14:38:15 +0800
Subject: [PATCH 086/254] Replace Concat with Broadcast in MulMat for GQA

---
 ggml/src/ggml-openvino/ggml-decoder.cpp       |  3 +++
 ggml/src/ggml-openvino/openvino/op/mulmat.cpp | 20 ++++++++++++-------
 2 files changed, 16 insertions(+), 7 deletions(-)

diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp
index 4a45aa2140..b731b26a9a 100644
--- a/ggml/src/ggml-openvino/ggml-decoder.cpp
+++ b/ggml/src/ggml-openvino/ggml-decoder.cpp
@@ -118,6 +118,7 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node) {
             }
             auto param_node = std::make_shared<ov::op::v0::Parameter>(get_ov_type(src), get_graph_input_shape(src));
             param_node->set_friendly_name(src_name);
+            param_node->output(0).get_tensor().set_names({src_name});
             m_model_inputs[src_name] = param_node;
         }
     }
@@ -262,6 +263,7 @@ void GgmlOvDecoder::add_extra_inputs() {
         std::string name = "past_token_len";
         auto param_node = std::make_shared<ov::op::v0::Parameter>(ov::element::i64, ov::Shape{1});
         param_node->set_friendly_name(name);
+        param_node->output(0).get_tensor().set_names({name});
         m_model_extra_inputs[name] = param_node;
 
         auto tensor = std::make_shared<ov::Tensor>(ov::element::i64, ov::Shape{1});
@@ -280,6 +282,7 @@ void GgmlOvDecoder::add_extra_inputs() {
         std::string name = "attention_size";
         auto param_node = std::make_shared<ov::op::v0::Parameter>(ov::element::i64, ov::Shape{1});
         param_node->set_friendly_name(name);
+        param_node->output(0).get_tensor().set_names({name});
         m_model_extra_inputs[name] = param_node;
 
         auto tensor = std::make_shared<ov::Tensor>(ov::element::i64, ov::Shape{1});
diff --git a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp
index cd027d2894..1394989395 100644
--- a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp
+++ b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp
@@ -3,6 +3,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -10,6 +11,7 @@
 #include
 #include
 #include
+#include
 #include
 
 #include "../node_context.hpp"
@@ -45,16 +47,20 @@ OutputVector translate_mulmat(const NodeContext& context) {
         auto num_heads_node = ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector<int64_t>{num_heads});
         auto num_heads_kv_node =
             ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector<int64_t>{num_heads_kv});
+        auto factor_node =
+            ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector<int64_t>{kv_num_heads_factor});
 
         auto B_shape_last_two = get_dimensions(B.get_node_shared_ptr(), {1, 2});
-        auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
-        std::shared_ptr<ov::Node> new_B_shape =
-            std::make_shared<ov::op::v0::Concat>(ov::OutputVector{num_heads_kv_node, one, B_shape_last_two}, 0);
-        B = std::make_shared<ov::op::v1::Reshape>(B, new_B_shape, false);
+        auto unsqueeze_axes = ov::op::v0::Constant::create(ov::element::i64, Shape{}, {1});
+        auto B_unsqueezed = std::make_shared<ov::op::v0::Unsqueeze>(B, unsqueeze_axes);
 
-        B = std::make_shared<ov::op::v0::Concat>(ov::OutputVector(kv_num_heads_factor, B), 1);
-        new_B_shape = std::make_shared<ov::op::v0::Concat>(ov::OutputVector{num_heads_node, B_shape_last_two}, 0);
-        B = std::make_shared<ov::op::v1::Reshape>(B, new_B_shape, false);
+        auto broadcast_shape = std::make_shared<ov::op::v0::Concat>(
+            ov::OutputVector{num_heads_kv_node, factor_node, B_shape_last_two}, 0);
+        auto B_broadcasted = std::make_shared<ov::op::v3::Broadcast>(B_unsqueezed, broadcast_shape);
+
+        auto new_B_shape =
+            std::make_shared<ov::op::v0::Concat>(ov::OutputVector{num_heads_node, B_shape_last_two}, 0);
+        B = std::make_shared<ov::op::v1::Reshape>(B_broadcasted, new_B_shape, false);
     }
 
     auto result_lp = std::make_shared<ov::op::v0::MatMul>(A, B, false, true);
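[Editor's note] The Unsqueeze -> Broadcast -> Reshape chain is the usual way to repeat KV heads for grouped-query attention without materializing a Concat: with, say, num_heads = 32 and num_heads_kv = 8, a [8, S, D] tensor becomes [8, 1, S, D] -> [8, 4, S, D] -> [32, S, D]. A minimal sketch with assumed sizes (S = 128, D = 64):

    #include <openvino/op/broadcast.hpp>
    #include <openvino/op/constant.hpp>
    #include <openvino/op/parameter.hpp>
    #include <openvino/op/reshape.hpp>
    #include <openvino/op/unsqueeze.hpp>

    // Expand 8 KV heads to 32 query heads via broadcasting.
    auto k = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::Shape{8, 128, 64});
    auto axes = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {1});
    auto k4 = std::make_shared<ov::op::v0::Unsqueeze>(k, axes);                      // [8, 1, 128, 64]
    auto target = ov::op::v0::Constant::create(ov::element::i64, {4}, {8, 4, 128, 64});
    auto krep = std::make_shared<ov::op::v3::Broadcast>(k4, target);                 // [8, 4, 128, 64]
    auto flat = ov::op::v0::Constant::create(ov::element::i64, {3}, {32, 128, 64});
    auto kq = std::make_shared<ov::op::v1::Reshape>(krep, flat, false);              // [32, 128, 64]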
From acf358d1ce321039d1452f9b6a853d1c54eb6540 Mon Sep 17 00:00:00 2001
From: "Yu, Zijun"
Date: Sun, 6 Jul 2025 21:59:30 +0800
Subject: [PATCH 087/254] Pull out indices creation for kv cache update

---
 .../ggml-openvino/openvino/node_context.hpp   |  3 +
 ggml/src/ggml-openvino/openvino/op/cpy.cpp    | 86 ++----------------
 .../openvino/translate_session.cpp            | 87 +++++++++++++++++++
 3 files changed, 99 insertions(+), 77 deletions(-)

diff --git a/ggml/src/ggml-openvino/openvino/node_context.hpp b/ggml/src/ggml-openvino/openvino/node_context.hpp
index 62aa7d1fc5..b5f0f37406 100644
--- a/ggml/src/ggml-openvino/openvino/node_context.hpp
+++ b/ggml/src/ggml-openvino/openvino/node_context.hpp
@@ -71,6 +71,9 @@ public:
     }
 
     Output<Node> get_input(const std::string& name) const override {
+        if (m_tensor_map->find(name) == m_tensor_map->end()) {
+            throw std::runtime_error("'" + name + "' not found in tensor map.");
+        }
         return m_tensor_map->at(name);
     }
 
diff --git a/ggml/src/ggml-openvino/openvino/op/cpy.cpp b/ggml/src/ggml-openvino/openvino/op/cpy.cpp
index b183b97f23..a70c62d9a8 100644
--- a/ggml/src/ggml-openvino/openvino/op/cpy.cpp
+++ b/ggml/src/ggml-openvino/openvino/op/cpy.cpp
@@ -4,19 +4,11 @@
 #include
 #include
 #include
-#include
-#include
-#include
 #include
 #include
-#include
 #include
 #include
-#include
-#include
 #include
-#include
-#include
 #include
 #include
 
 #include "../node_context.hpp"
@@ -36,8 +28,13 @@ OutputVector translate_cpy(const NodeContext& context) {
 
     auto src0 = context.get_input(0);
     auto src1 = context.get_input(1);
+    auto token_len = context.get_input("token_len");
     auto past_token_len = context.get_input("past_token_len");
 
+    auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
+    auto token_len_scalar = std::make_shared<ov::op::v0::Squeeze>(token_len, zero);
+    auto past_token_len_scalar = std::make_shared<ov::op::v0::Squeeze>(past_token_len, zero);
+
     src0 = std::make_shared<ov::op::v0::Convert>(src0, context.get_input_type(1));
     ov::Output<Node> res;
@@ -52,89 +49,24 @@ OutputVector translate_cpy(const NodeContext& context) {
     std::vector<size_t> input0_strides = context.get_input_stride(0);
     std::vector<size_t> output_strides = context.get_output_stride(0);
 
-    auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
-    auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
-    auto one_scalar = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {1});
-
     if (op_case == 1) {
         // Write K to cache_k
-        auto token_len = get_dimensions(src0.get_node_shared_ptr(), {0});
-        auto token_len_scalar = std::make_shared<ov::op::v0::Squeeze>(token_len, zero);
-
-        std::shared_ptr<ov::Node> indices;
-        if (context.is_static()) {
-            indices = past_token_len.get_node_shared_ptr();
-        } else {
-            auto past_token_len_scalar = std::make_shared<ov::op::v0::Squeeze>(past_token_len, zero);
-            auto total_token_len_scalar = std::make_shared<ov::op::v1::Add>(past_token_len_scalar, token_len_scalar);
-            indices = std::make_shared<ov::op::v4::Range>(past_token_len_scalar,
-                                                          total_token_len_scalar,
-                                                          one_scalar,
-                                                          ov::element::i64);
-        }
-        indices = std::make_shared<ov::op::v0::Unsqueeze>(indices, one);
-
+        auto indices = context.get_input("update_indices_k");
         auto updated = std::make_shared<ov::op::v3::ScatterNDUpdate>(src1, indices, src0);
         res = std::make_shared<ov::op::v1::Reshape>(updated, std::make_shared<ov::op::v3::ShapeOf>(src1), false);
     } else {
         // Write V to cache_v
-        auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
-        auto two = ov::op::v0::Constant::create(ov::element::i64, {1}, {2});
-        auto zero_scalar = ov::op::v0::Constant::create(ov::element::i64, {}, {0});
-
-        int64_t total_head_size = src0_shape[1];
-        auto total_head_size_node = ov::op::v0::Constant::create(ov::element::i64, {1}, {total_head_size});
-        auto total_head_size_scalar = std::make_shared<ov::op::v0::Squeeze>(total_head_size_node, zero);
-
-        auto token_len = get_dimensions(src0.get_node_shared_ptr(), {2});
-        auto token_len_scalar = std::make_shared<ov::op::v0::Squeeze>(token_len, zero);
-
-        // 1D tensor of shape [total_head_size], values starting from 0
-        auto range_row =
-            std::make_shared<ov::op::v4::Range>(zero_scalar, total_head_size_scalar, one_scalar, ov::element::i64);
-        auto range_row_reshaped =
-            std::make_shared<ov::op::v0::Unsqueeze>(range_row,
-                                                    ov::op::v0::Constant::create(ov::element::i64, {2}, {1, 2}));
-        auto row_indices = std::make_shared<ov::op::v3::Broadcast>(
-            range_row_reshaped,
-            std::make_shared<ov::op::v0::Concat>(ov::OutputVector{total_head_size_node, token_len, one}, 0));
-
-        // 1D tensor of shape [token_len], values starting from past_token_len
-        std::shared_ptr<ov::Node> range_col;
-        if (context.is_static()) {
-            range_col = past_token_len.get_node_shared_ptr();
-        } else {
-            auto past_token_len_scalar = std::make_shared<ov::op::v0::Squeeze>(past_token_len, zero);
-            auto total_token_len_scalar = std::make_shared<ov::op::v1::Add>(past_token_len_scalar, token_len_scalar);
-            range_col = std::make_shared<ov::op::v4::Range>(past_token_len_scalar,
-                                                            total_token_len_scalar,
-                                                            one_scalar,
-                                                            ov::element::i64);
-        }
-        auto range_col_reshaped =
-            std::make_shared<ov::op::v0::Unsqueeze>(range_col,
-                                                    ov::op::v0::Constant::create(ov::element::i64, {2}, {0, 2}));
-        auto col_indices = std::make_shared<ov::op::v3::Broadcast>(
-            range_col_reshaped,
-            std::make_shared<ov::op::v0::Concat>(ov::OutputVector{total_head_size_node, token_len, one}, 0));
-
-        // Stack row_indices and col_indices along last axis: [total_head_size, token_len, 2]
-        auto indices = std::make_shared<ov::op::v0::Concat>(OutputVector{row_indices, col_indices}, 2);
-        auto indices_final = std::make_shared<ov::op::v1::Reshape>(
-            indices,
-            ov::op::v0::Constant::create(ov::element::i64, {2}, std::vector<int64_t>{-1, 2}),
-            false);
-
         auto flattend_src0 =
             std::make_shared<ov::op::v1::Reshape>(src0,
                                                   ov::op::v0::Constant::create(element::i64, Shape{1}, {-1}),
                                                   false);
+        int64_t total_head_size = src0_shape[1];
         auto reshaped_src1 = std::make_shared<ov::op::v1::Reshape>(
             src1,
             ov::op::v0::Constant::create(ov::element::i64, {2}, std::vector<int64_t>{total_head_size, -1}),
             false);
-
-        auto updated = std::make_shared<ov::op::v3::ScatterNDUpdate>(reshaped_src1, indices_final, flattend_src0);
+        auto indices = context.get_input("update_indices_v");
+        auto updated = std::make_shared<ov::op::v3::ScatterNDUpdate>(reshaped_src1, indices, flattend_src0);
         res = std::make_shared<ov::op::v1::Reshape>(updated, std::make_shared<ov::op::v3::ShapeOf>(src1), false);
     }
 
diff --git a/ggml/src/ggml-openvino/openvino/translate_session.cpp b/ggml/src/ggml-openvino/openvino/translate_session.cpp
index 1f311b4a40..31325a0c11 100644
--- a/ggml/src/ggml-openvino/openvino/translate_session.cpp
+++ b/ggml/src/ggml-openvino/openvino/translate_session.cpp
@@ -3,11 +3,20 @@
 #include
 #include
 #include
+#include
+#include
+#include
 #include
+#include
+#include
 #include
+#include
+#include
 #include
 #include
 
+#include "ggml-openvino/openvino/node_context.hpp"
+#include "ggml-openvino/openvino/utils.hpp"
 #include "input_model.hpp"
 #include "pass/fuse_to_sdpa.hpp"
 
 namespace ov {
 namespace frontend {
@@ -50,6 +59,83 @@ ov::pass::MakeStateful::ParamResPairs get_kv_param_res_pairs(
     }
     return pairs;
 }
+
+void add_token_len(TensorMap& tensor_map) {
+    auto inp_tokens = tensor_map.at("inp_tokens").get_node_shared_ptr();
+    auto token_len = get_dimensions(inp_tokens, {2});
+    token_len->set_friendly_name("token_len");
+    tensor_map.insert({"token_len", token_len->output(0)});
+}
+
+void add_kv_update_indices(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) {
+    // cache_k layout: [S, N, H] (seq, num_heads, head_size)
+    // cache_v layout: [N, H, S] (num_heads, head_size, seq)
+    // When writing to cache_v, cache should be reshaped to [N*H, S] and v-curr should be flattened
+    auto inp_pos = tensor_map.at("inp_pos").get_node_shared_ptr();
+    auto past_token_len = tensor_map.at("past_token_len").get_node_shared_ptr();
+    auto token_len = tensor_map.at("token_len").get_node_shared_ptr();
+
+    std::shared_ptr<ov::Node> update_indices_k;
+    std::shared_ptr<ov::Node> update_indices_v;
+
+    auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
+    auto zero_scalar = ov::op::v0::Constant::create(ov::element::i64, {}, {0});
+    auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
+    auto one_scalar = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {1});
+    auto two = ov::op::v0::Constant::create(ov::element::i64, {1}, {2});
+
+    if (ggml_model_decoder.is_static()) {
+        update_indices_k = past_token_len;
+    } else {
+        update_indices_k =
+            std::make_shared<ov::op::v0::Squeeze>(inp_pos, ov::op::v0::Constant::create(ov::element::i64, {2}, {0, 1}));
+    }
+    update_indices_k = std::make_shared<ov::op::v0::Unsqueeze>(update_indices_k, one);
+    update_indices_k->set_friendly_name("update_indices_k");
+    tensor_map.insert({"update_indices_k", update_indices_k->output(0)});
+
+    auto total_head_size = ggml_model_decoder.get_num_heads_kv() * ggml_model_decoder.get_head_size();
+    auto total_head_size_node = ov::op::v0::Constant::create(ov::element::i64, {1}, {total_head_size});
+    auto total_head_size_scalar = std::make_shared<ov::op::v0::Squeeze>(total_head_size_node, zero);
+
+    // 1D tensor of shape [total_head_size], values starting from 0
+    auto range_row =
+        std::make_shared<ov::op::v4::Range>(zero_scalar, total_head_size_scalar, one_scalar, ov::element::i32);
+    auto range_row_reshaped =
+        std::make_shared<ov::op::v0::Unsqueeze>(range_row, ov::op::v0::Constant::create(ov::element::i64, {2}, {1, 2}));
+    auto row_indices = std::make_shared<ov::op::v3::Broadcast>(
+        range_row_reshaped,
+        std::make_shared<ov::op::v0::Concat>(ov::OutputVector{total_head_size_node, token_len, one}, 0));
+
+    // 1D tensor of shape [token_len], values starting from past_token_len
+    std::shared_ptr<ov::Node> range_col;
+    if (ggml_model_decoder.is_static()) {
+        // aka inp_pos
+        range_col = past_token_len;
+    } else {
+        range_col =
+            std::make_shared<ov::op::v0::Squeeze>(inp_pos, ov::op::v0::Constant::create(ov::element::i64, {2}, {0, 1}));
+    }
+    auto range_col_reshaped =
+        std::make_shared<ov::op::v0::Unsqueeze>(range_col, ov::op::v0::Constant::create(ov::element::i64, {2}, {0, 2}));
+    auto col_indices = std::make_shared<ov::op::v3::Broadcast>(
+        range_col_reshaped,
+        std::make_shared<ov::op::v0::Concat>(ov::OutputVector{total_head_size_node, token_len, one}, 0));
+
+    // Stack row_indices and col_indices along last axis: [total_head_size, token_len, 2]
+    auto indices = std::make_shared<ov::op::v0::Concat>(OutputVector{row_indices, col_indices}, 2);
+    update_indices_v = std::make_shared<ov::op::v1::Reshape>(
+        indices, ov::op::v0::Constant::create(ov::element::i64, {2}, std::vector<int64_t>{-1, 2}), false);
+    update_indices_v->set_friendly_name("update_indices_v");
+    tensor_map.insert({"update_indices_v", update_indices_v->output(0)});
+}
+
+// Create common patterns
+void preprocess(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) {
+    add_token_len(tensor_map);
+    add_kv_update_indices(tensor_map, ggml_model_decoder);
+}
+
 }  // namespace
 
 TranslateSession::TranslateSession(const frontend::InputModel::Ptr& input_model,
@@ -118,6 +204,7 @@ std::shared_ptr<Model> TranslateSession::translate_graph(const frontend::InputMo
         }
     };
 
+    preprocess(*tensor_map, *ggml_model_decoder);
     ggml_model_decoder->visit_subgraph(node_visitor);
 
     for (const auto& name : ggml_model_decoder->get_model_output_names()) {

From 0fa7a5efef6a8a9a5fa41d3247c811c67dabe45c Mon Sep 17 00:00:00 2001
From: "Yu, Zijun"
Date: Wed, 9 Jul 2025 10:15:17 +0800
Subject: [PATCH 088/254] Refactor: remove past_token_len from extra_inputs

---
 ggml/src/ggml-openvino/ggml-decoder.cpp       | 14 ++--------
 ggml/src/ggml-openvino/openvino/op/cpy.cpp    | 13 +---------
 .../openvino/translate_session.cpp            | 22 +++++--------------
 3 files changed, 8 insertions(+), 41 deletions(-)

diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp
index b731b26a9a..19152a5e6d 100644
--- a/ggml/src/ggml-openvino/ggml-decoder.cpp
+++ b/ggml/src/ggml-openvino/ggml-decoder.cpp
@@ -249,26 +249,16 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor* src) co
 }
 
 void GgmlOvDecoder::add_extra_inputs() {
-    int64_t past_token_len = -1;
     // attention_size not used for NPU
     int64_t attention_size = -1;
+    int64_t past_token_len = -1;
 
     for (const auto& node : m_nodes) {
         if (node->op == GGML_OP_CPY && ggml_is_contiguous(node)) {
             assert(std::string(node->view_src->name).find("cache_k") == 0);
             int64_t head_size = node->src[0]->ne[0];
             int64_t num_heads = node->src[0]->ne[1];
-            past_token_len = (int64_t)(node->src[1]->op_params[0] / node->src[1]->nb[0] / head_size / num_heads);
-
-            std::string name = "past_token_len";
-            auto param_node = std::make_shared<ov::op::v0::Parameter>(ov::element::i64, ov::Shape{1});
-            param_node->set_friendly_name(name);
-            param_node->output(0).get_tensor().set_names({name});
-            m_model_extra_inputs[name] = param_node;
-
-            auto tensor = std::make_shared<ov::Tensor>(ov::element::i64, ov::Shape{1});
-            *tensor->data<int64_t>() = past_token_len;
-            m_model_extra_input_values[name] = tensor;
+            past_token_len = (int64_t) (node->src[1]->op_params[0] / node->src[1]->nb[0] / head_size / num_heads);
             break;
         }
     }
diff --git a/ggml/src/ggml-openvino/openvino/op/cpy.cpp b/ggml/src/ggml-openvino/openvino/op/cpy.cpp
index a70c62d9a8..e85094bb18 100644
--- a/ggml/src/ggml-openvino/openvino/op/cpy.cpp
+++ b/ggml/src/ggml-openvino/openvino/op/cpy.cpp
@@ -28,12 +28,6 @@ OutputVector translate_cpy(const NodeContext& context) {
 
     auto src0 = context.get_input(0);
     auto src1 = context.get_input(1);
-    auto token_len = context.get_input("token_len");
-    auto past_token_len = context.get_input("past_token_len");
-
-    auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
-    auto token_len_scalar = std::make_shared<ov::op::v0::Squeeze>(token_len, zero);
-    auto past_token_len_scalar = std::make_shared<ov::op::v0::Squeeze>(past_token_len, zero);
 
     src0 = std::make_shared<ov::op::v0::Convert>(src0, context.get_input_type(1));
 
     ov::Output<Node> res;
@@ -43,12 +37,6 @@ OutputVector translate_cpy(const NodeContext& context) {
         return rename_outputs_with_suffix({res}, context.get_name());
     }
 
-    auto src0_shape = context.get_input_shape(0).to_shape();
-    auto output_shape = context.get_output_shape(0).to_shape();
-
-    std::vector<size_t> input0_strides = context.get_input_stride(0);
-    std::vector<size_t> output_strides = context.get_output_stride(0);
-
     if (op_case == 1) {
         // Write K to cache_k
         auto indices = context.get_input("update_indices_k");
@@ -60,6 +48,7 @@ OutputVector translate_cpy(const NodeContext& context) {
             std::make_shared<ov::op::v1::Reshape>(src0,
                                                   ov::op::v0::Constant::create(element::i64, Shape{1}, {-1}),
                                                   false);
+        auto src0_shape = context.get_input_shape(0).to_shape();
         int64_t total_head_size = src0_shape[1];
         auto reshaped_src1 = std::make_shared<ov::op::v1::Reshape>(
             src1,
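[Editor's note] The two index tensors built in translate_session drive the ScatterNDUpdate cache writes: update_indices_k holds the absolute positions being written (shape [T, 1], indexing dim 0 of cache_k [S, N, H]), while update_indices_v holds (row, col) pairs of shape [N*H*T, 2] into the [N*H, S]-reshaped cache_v. A toy host-side rendering of the V-cache index layout (hypothetical sizes, for intuition only):

    #include <array>
    #include <cstdint>
    #include <vector>

    // E.g. total_head_size = 2 and new tokens at positions {5, 6} yield
    // pairs (0,5), (0,6), (1,5), (1,6) -- one scatter target per element row.
    std::vector<std::array<int64_t, 2>> v_update_indices(int64_t total_head_size,
                                                         const std::vector<int64_t>& positions) {
        std::vector<std::array<int64_t, 2>> idx;
        for (int64_t row = 0; row < total_head_size; ++row) {
            for (int64_t col : positions) {
                idx.push_back({row, col});
            }
        }
        return idx;
    }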
diff --git a/ggml/src/ggml-openvino/openvino/translate_session.cpp b/ggml/src/ggml-openvino/openvino/translate_session.cpp
index 31325a0c11..9580586684 100644
--- a/ggml/src/ggml-openvino/openvino/translate_session.cpp
+++ b/ggml/src/ggml-openvino/openvino/translate_session.cpp
@@ -72,7 +72,6 @@ void add_kv_update_indices(TensorMap& tensor_map, GgmlDecoder& ggml_model_decode
     // cache_v layout: [N, H, S] (num_heads, head_size, seq)
     // When writing to cache_v, cache should be reshaped to [N*H, S] and v-curr should be flattened
     auto inp_pos = tensor_map.at("inp_pos").get_node_shared_ptr();
-    auto past_token_len = tensor_map.at("past_token_len").get_node_shared_ptr();
     auto token_len = tensor_map.at("token_len").get_node_shared_ptr();
 
     std::shared_ptr<ov::Node> update_indices_k;
@@ -84,12 +83,8 @@ void add_kv_update_indices(TensorMap& tensor_map, GgmlDecoder& ggml_model_decode
     auto one_scalar = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {1});
     auto two = ov::op::v0::Constant::create(ov::element::i64, {1}, {2});
 
-    if (ggml_model_decoder.is_static()) {
-        update_indices_k = past_token_len;
-    } else {
-        update_indices_k =
-            std::make_shared<ov::op::v0::Squeeze>(inp_pos, ov::op::v0::Constant::create(ov::element::i64, {2}, {0, 1}));
-    }
+    update_indices_k =
+        std::make_shared<ov::op::v0::Squeeze>(inp_pos, ov::op::v0::Constant::create(ov::element::i64, {2}, {0, 1}));
     update_indices_k = std::make_shared<ov::op::v0::Unsqueeze>(update_indices_k, one);
     update_indices_k->set_friendly_name("update_indices_k");
     tensor_map.insert({"update_indices_k", update_indices_k->output(0)});
@@ -108,14 +103,8 @@ void add_kv_update_indices(TensorMap& tensor_map, GgmlDecoder& ggml_model_decode
         std::make_shared<ov::op::v0::Concat>(ov::OutputVector{total_head_size_node, token_len, one}, 0));
 
     // 1D tensor of shape [token_len], values starting from past_token_len
-    std::shared_ptr<ov::Node> range_col;
-    if (ggml_model_decoder.is_static()) {
-        // aka inp_pos
-        range_col = past_token_len;
-    } else {
-        range_col =
-            std::make_shared<ov::op::v0::Squeeze>(inp_pos, ov::op::v0::Constant::create(ov::element::i64, {2}, {0, 1}));
-    }
+    auto range_col =
+        std::make_shared<ov::op::v0::Squeeze>(inp_pos, ov::op::v0::Constant::create(ov::element::i64, {2}, {0, 1}));
     auto range_col_reshaped =
         std::make_shared<ov::op::v0::Unsqueeze>(range_col, ov::op::v0::Constant::create(ov::element::i64, {2}, {0, 2}));
     auto col_indices = std::make_shared<ov::op::v3::Broadcast>(
@@ -233,10 +222,9 @@ void TranslateSession::apply_transformations(const std::shared_ptr<ov::Model>& model
         const auto kv_param_res_names = ggml_model_decoder->get_kv_param_res_names();
         const auto kv_param_res_pairs = get_kv_param_res_pairs(model, kv_param_res_names);
         manager.register_pass<ov::pass::MakeStateful>(kv_param_res_pairs);
-
-        manager.register_pass<pass::FuseToSDPA>();
     }
 
+    manager.register_pass<pass::FuseToSDPA>();
     manager.run_passes(model);
 }

From 3533c14cf6bee6b65e9bcf42a777235cba1fcec1 Mon Sep 17 00:00:00 2001
From: "Yu, Zijun"
Date: Wed, 9 Jul 2025 10:16:06 +0800
Subject: [PATCH 089/254] Fix Phi3 SwiGLU and SoftMax

---
 .../ggml-openvino/openvino/op/glu_swiglu.cpp  | 27 ++++++++++++++-----
 .../ggml-openvino/openvino/op/soft_max.cpp    |  8 ++----
 2 files changed, 23 insertions(+), 12 deletions(-)

diff --git a/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp b/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp
index 28013fbaa0..138ef65090 100644
--- a/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp
+++ b/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp
@@ -1,6 +1,11 @@
+#include
+#include
 #include
+#include
 #include
 #include
+#include
+#include
 
 #include "../node_context.hpp"
 #include "../op_table.hpp"
@@ -12,13 +17,23 @@ namespace ggml {
 namespace op {
 
 OutputVector translate_glu_swiglu(const NodeContext& context) {
-    num_inputs_check(context, 2, 2);
+    num_inputs_check(context, 1, 2);
 
-    auto src1 = context.get_input(0);
-    auto src2 = context.get_input(1);
-    auto sigmoid = std::make_shared<ov::op::v0::Sigmoid>(src1);
-    auto silu = std::make_shared<ov::op::v1::Multiply>(src1, sigmoid);
-    auto res = std::make_shared<ov::op::v1::Multiply>(silu, src2);
+    ov::Output<Node> src0;
+    ov::Output<Node> src1;
+    if (context.get_input_size() == 2) {
+        src0 = context.get_input(0);
+        src1 = context.get_input(1);
+    } else {
+        auto combined = context.get_input(0);
+        auto split_axis = ov::op::v0::Constant::create(ov::element::i64, {}, {2});
+        auto split = std::make_shared<ov::op::v1::Split>(combined, split_axis, 2);
+        src0 = split->output(0);
+        src1 = split->output(1);
+    }
+    auto sigmoid = std::make_shared<ov::op::v0::Sigmoid>(src0);
+    auto silu = std::make_shared<ov::op::v1::Multiply>(src0, sigmoid);
+    auto res = std::make_shared<ov::op::v1::Multiply>(silu, src1);
 
     return rename_outputs_with_suffix({res}, context.get_name());
 }
diff --git a/ggml/src/ggml-openvino/openvino/op/soft_max.cpp b/ggml/src/ggml-openvino/openvino/op/soft_max.cpp
index 81d43c37fe..d59f4499a3 100644
--- a/ggml/src/ggml-openvino/openvino/op/soft_max.cpp
+++ b/ggml/src/ggml-openvino/openvino/op/soft_max.cpp
@@ -43,12 +43,8 @@ OutputVector translate_soft_max(const NodeContext& context) {
     const float slope =
         (max_bias > 0.0f) ? h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2 * (h - n_head_log2) + 1) : 1.0f;
 
-    std::shared_ptr<ov::Node> scaled_input;
-    if (scale != 1.0f) {
-        auto scale_node =
-            std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{}, std::vector<float>{scale});
-        scaled_input = std::make_shared<ov::op::v1::Multiply>(input_node, scale_node);
-    }
+    auto scale_node = std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{}, std::vector<float>{scale});
+    auto scaled_input = std::make_shared<ov::op::v1::Multiply>(input_node, scale_node);
 
     auto mask_node = context.get_input(1);
 
From a80da6944832c0fd609382cb822135a3a7ff44f3 Mon Sep 17 00:00:00 2001
From: "Yu, Zijun"
Date: Wed, 9 Jul 2025 15:14:10 +0800
Subject: [PATCH 090/254] Pull out sin cos from rope

---
 ggml/src/ggml-openvino/ggml-decoder.cpp       |   1 +
 ggml/src/ggml-openvino/ggml-decoder.h         |   3 +
 ggml/src/ggml-openvino/openvino/decoder.hpp   |   1 +
 ggml/src/ggml-openvino/openvino/op/rope.cpp   | 116 ++----------------
 .../openvino/translate_session.cpp            |  92 ++++++++++++++
 5 files changed, 106 insertions(+), 107 deletions(-)

diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp
index 19152a5e6d..ae4beca23e 100644
--- a/ggml/src/ggml-openvino/ggml-decoder.cpp
+++ b/ggml/src/ggml-openvino/ggml-decoder.cpp
@@ -209,6 +209,7 @@ void GgmlOvDecoder::set_llm_params() {
         } else if (node->op == GGML_OP_ROPE && std::string(node->name) == "Qcur-0") {
             m_head_size = node->ne[0];
             m_num_heads = node->ne[1];
+            m_rope_params = node->op_params;
         } else if (node->op == GGML_OP_ROPE && std::string(node->name) == "Kcur-0") {
             m_num_heads_kv = node->ne[1];
         }
diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h
index 171300b406..8b507438c5 100644
--- a/ggml/src/ggml-openvino/ggml-decoder.h
+++ b/ggml/src/ggml-openvino/ggml-decoder.h
@@ -98,6 +98,8 @@ public:
 
     virtual int get_head_size() const override { return m_head_size; }
 
+    virtual int32_t* get_rope_params() const override { return m_rope_params; }
+
     virtual std::map<std::string, std::string> get_kv_param_res_names() const override;
 
     virtual bool is_static() const override { return m_is_static; }
@@ -140,6 +142,7 @@ private:
     int m_num_heads;
     int m_num_heads_kv;
     int m_head_size;
+    int32_t* m_rope_params;
     std::vector<std::string> m_kv_names;
     bool m_is_static;
     bool m_is_first_token;
diff --git a/ggml/src/ggml-openvino/openvino/decoder.hpp b/ggml/src/ggml-openvino/openvino/decoder.hpp
index 8d2e06c0e5..a3387ba394 100644
--- a/ggml/src/ggml-openvino/openvino/decoder.hpp
+++ b/ggml/src/ggml-openvino/openvino/decoder.hpp
@@ -61,6 +61,7 @@ public:
     virtual int get_num_heads() const = 0;
     virtual int get_num_heads_kv() const = 0;
     virtual int get_head_size() const = 0;
+    virtual int32_t* get_rope_params() const = 0;
 
     virtual std::map<std::string, std::string> get_kv_param_res_names() const = 0;
 
     virtual bool is_static() const = 0;
diff --git a/ggml/src/ggml-openvino/openvino/op/rope.cpp b/ggml/src/ggml-openvino/openvino/op/rope.cpp
index 78523e5781..f5736fefc8 100644
--- a/ggml/src/ggml-openvino/openvino/op/rope.cpp
+++ b/ggml/src/ggml-openvino/openvino/op/rope.cpp
@@ -3,131 +3,39 @@
 #include
 #include
 #include
-#include
 #include
 #include
-#include
-#include
-#include
 #include
 #include
 #include
-#include
 #include
 #include
 #include
-#include
 #include
 
 #include "../node_context.hpp"
 #include "../op_table.hpp"
 #include "../utils.hpp"
 
-#ifndef M_PI
-#    define M_PI 3.14159265358979323846
-#endif
-
-#define GGML_ROPE_TYPE_NEOX 2
-
-#define MIN(a, b) ((a) < (b) ? (a) : (b))
-#define MAX(a, b) ((a) > (b) ? (a) : (b))
-
 namespace ov {
 namespace frontend {
 namespace ggml {
 namespace op {
 
-namespace {
-float ggml_rope_yarn_corr_dim(int n_dims, int n_ctx_orig, float n_rot, float base) {
-    return n_dims * logf(n_ctx_orig / (n_rot * 2 * (float) M_PI)) / (2 * logf(base));
-}
-
-void ggml_rope_yarn_corr_dims(int n_dims, int n_ctx_orig, float freq_base, float beta_fast, float beta_slow,
-                              float dims[2]) {
-    float start = floorf(ggml_rope_yarn_corr_dim(n_dims, n_ctx_orig, beta_fast, freq_base));
-    float end = ceilf(ggml_rope_yarn_corr_dim(n_dims, n_ctx_orig, beta_slow, freq_base));
-    dims[0] = MAX(0, start);
-    dims[1] = MIN(n_dims - 1, end);
-}
-}  // namespace
-
 OutputVector translate_rope(const NodeContext& context) {
     num_inputs_check(context, 2, 3);
 
     ov::Output<Node> res;
 
-    auto data_node = context.get_input(0);
-    auto pos_node = context.get_input(1);
-    pos_node = std::make_shared<ov::op::v0::Convert>(pos_node, ov::element::f32);
+    auto data_node = context.get_input(0).get_node_shared_ptr();
+    auto cos_theta_node = context.get_input("rope_cos");
+    auto sin_theta_node = context.get_input("rope_sin");
 
-    auto permutation_node =
-        std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{3}, std::vector<int64_t>{2, 1, 0});
-    Output<Node> pos_node_reshaped = std::make_shared<ov::op::v1::Transpose>(pos_node, permutation_node);
-
-    auto output_shape = context.get_output_shape(0);
-
-    float freq_base;
-    float freq_scale;
-    float ext_factor;
-    float attn_factor;
-    float beta_fast;
-    float beta_slow;
     int32_t* op_params = context.get_output_op_params(0);
-
-    const int n_dims = op_params[1];
     const int mode = op_params[2];
-    const int n_ctx_orig = op_params[4];
-    memcpy(&freq_base, op_params + 5, sizeof(float));
-    memcpy(&freq_scale, op_params + 6, sizeof(float));
-    memcpy(&ext_factor, op_params + 7, sizeof(float));
-    memcpy(&attn_factor, op_params + 8, sizeof(float));
-    memcpy(&beta_fast, op_params + 9, sizeof(float));
-    memcpy(&beta_slow, op_params + 10, sizeof(float));
-
-    const float theta_scale = powf(freq_base, -2.0f / n_dims);
-
-    // TODO: corr_dims is not used in the current implementation
-    float corr_dims[2];
-    ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);
-
+    constexpr int GGML_ROPE_TYPE_NEOX = 2;
     const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
 
-    // TODO: GGML_OP_ROPE_BACK -> false
-    bool forward = true;
-    const float sin_sign = forward ? 1.0f : -1.0f;
-
-    const int64_t ne0 = output_shape[2].get_length();
-    std::vector<float> factor(ne0 / 2);
-    factor[0] = freq_scale;
-    for (int64_t i = 1; i < ne0 / 2; i++) {
-        factor[i] = theta_scale * factor[i - 1];
-    }
-
-    Output<Node> factor_node =
-        std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{factor.size()}, factor);
-    if (context.get_input_size() == 3) {
-        auto freq_factors_node = context.get_input(2);
-        factor_node = std::make_shared<ov::op::v1::Multiply>(factor_node, freq_factors_node);
-    }
-
-    auto half_last_dim = ov::op::v0::Constant::create(ov::element::i64, Shape{1}, {output_shape[2].get_length() / 2});
-    Output<Node> input_shape_node = std::make_shared<ov::op::v0::Concat>(
-        OutputVector{get_dimensions(data_node.get_node_shared_ptr(), {0, 1}), half_last_dim},
-        0);
-    Output<Node> factor_broadcasted_node = std::make_shared<ov::op::v3::Broadcast>(factor_node, input_shape_node);
-
-    Output<Node> cos_factor_broadcasted_node = std::make_shared<ov::op::v0::Cos>(
-        std::make_shared<ov::op::v1::Multiply>(factor_broadcasted_node, pos_node_reshaped));
-    Output<Node> sin_factor_broadcasted_node = std::make_shared<ov::op::v0::Sin>(
-        std::make_shared<ov::op::v1::Multiply>(factor_broadcasted_node, pos_node_reshaped));
-
-    float mscale = attn_factor;
-    Output<Node> mscale_node =
-        std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{}, std::vector<float>{mscale});
-    Output<Node> mscale_sin_sign_node =
-        std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{}, std::vector<float>{mscale * sin_sign});
-    Output<Node> cos_theta_node = std::make_shared<ov::op::v1::Multiply>(cos_factor_broadcasted_node, mscale_node);
-    Output<Node> sin_theta_node = std::make_shared<ov::op::v1::Multiply>(sin_factor_broadcasted_node, mscale_node);
-
     if (!is_neox) {
         auto input_shape = context.get_input_shape(0);
 
@@ -146,18 +54,12 @@ OutputVector translate_rope(const NodeContext& context) {
             std::make_shared<ov::op::v1::Multiply>(odd_slice, cos_theta_node));
 
         auto stack = std::make_shared<ov::op::v0::Concat>(OutputVector{first_half, second_half}, 2);
-        auto shape_const = ov::op::v0::Constant::create(
-            ov::element::i64,
-            Shape{3},
-            std::vector<int64_t>{-1, input_shape[1].get_length(), input_shape[2].get_length()});
-        res = std::make_shared<ov::op::v1::Reshape>(stack, shape_const, false);
+        res = std::make_shared<ov::op::v1::Reshape>(stack, std::make_shared<ov::op::v3::ShapeOf>(data_node), false);
     } else {
-        auto slice_node =
-            std::make_shared<ov::op::v1::Split>(data_node,
-                                                ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {2}),
-                                                2);
-        Output<Node> slice_data_node_0 = slice_node->outputs()[0];
-        Output<Node> slice_data_node_1 = slice_node->outputs()[1];
+        auto data_split = std::make_shared<ov::op::v1::Split>(
+            data_node, ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {2}), 2);
+        Output<Node> slice_data_node_0 = data_split->outputs()[0];
+        Output<Node> slice_data_node_1 = data_split->outputs()[1];
 
         auto first_half_node = std::make_shared<ov::op::v1::Add>(
             std::make_shared<ov::op::v1::Multiply>(slice_data_node_0, cos_theta_node),
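[Editor's note] Patch 090 hoists the rotary table out of every ROPE node: for pair index i and position p, the rotation angle is theta(p, i) = p * freq_scale * theta_scale^i with theta_scale = freq_base^(-2 / n_dims), and the cos/sin tables are built once per graph and shared by all heads. A worked sketch of the same table on the host (illustrative only; YaRN ext_factor corrections and attn_factor scaling are ignored here):

    #include <cmath>
    #include <vector>

    // cos/sin for one position, matching the factor recurrence in the diff:
    // factor[0] = freq_scale; factor[i] = theta_scale * factor[i - 1].
    void rope_tables(float pos, int n_dims, float freq_base, float freq_scale,
                     std::vector<float>& cos_out, std::vector<float>& sin_out) {
        const float theta_scale = std::pow(freq_base, -2.0f / n_dims);
        float factor = freq_scale;
        for (int i = 0; i < n_dims / 2; ++i) {
            cos_out.push_back(std::cos(pos * factor));
            sin_out.push_back(std::sin(pos * factor));
            factor *= theta_scale;
        }
    }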
ggml_rope_yarn_corr_dims(int n_dims, int n_ctx_orig, float freq_base, float beta_fast, float beta_slow, + float dims[2]) { + float start = floorf(ggml_rope_yarn_corr_dim(n_dims, n_ctx_orig, beta_fast, freq_base)); + float end = ceilf(ggml_rope_yarn_corr_dim(n_dims, n_ctx_orig, beta_slow, freq_base)); + dims[0] = std::max(0.0f, start); + dims[1] = std::min(static_cast(n_dims - 1), end); +} + +void add_rope_sin_cos(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) { + int32_t* rope_params = ggml_model_decoder.get_rope_params(); + auto inp_pos = tensor_map.at("inp_pos").get_node_shared_ptr(); + std::shared_ptr rope_freqs_weight; + + inp_pos = std::make_shared(inp_pos, ov::element::f32); + auto pos_perm = + std::make_shared(ov::element::i64, ov::Shape{3}, std::vector{2, 1, 0}); + inp_pos = std::make_shared(inp_pos, pos_perm); + if (tensor_map.find("rope_freqs_weight") != tensor_map.end()) { + rope_freqs_weight = tensor_map.at("rope_freqs.weight").get_node_shared_ptr(); + } + + float freq_base; + float freq_scale; + float ext_factor; + float attn_factor; + float beta_fast; + float beta_slow; + const int n_dims = rope_params[1]; + const int n_ctx_orig = rope_params[4]; + memcpy(&freq_base, rope_params + 5, sizeof(float)); + memcpy(&freq_scale, rope_params + 6, sizeof(float)); + memcpy(&ext_factor, rope_params + 7, sizeof(float)); + memcpy(&attn_factor, rope_params + 8, sizeof(float)); + memcpy(&beta_fast, rope_params + 9, sizeof(float)); + memcpy(&beta_slow, rope_params + 10, sizeof(float)); + + const float theta_scale = powf(freq_base, -2.0f / n_dims); + + // TODO: corr_dims is not used in the current implementation + float corr_dims[2]; + ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims); + + // TODO: GGML_OP_ROPE_BACK -> false + // bool forward = true; + // const float sin_sign = forward ? 
1.0f : -1.0f; + + const int64_t half_head_size = ggml_model_decoder.get_head_size() / 2; + std::vector factor(half_head_size); + factor[0] = freq_scale; + for (int64_t i = 1; i < half_head_size; i++) { + factor[i] = theta_scale * factor[i - 1]; + } + + Output factor_node = + std::make_shared(ov::element::f32, ov::Shape{1, 1, factor.size()}, factor); + if (rope_freqs_weight) { + factor_node = std::make_shared(factor_node, rope_freqs_weight); + } + + auto half_head_size_node = ov::op::v0::Constant::create(ov::element::i64, Shape{1}, {half_head_size}); + Output cos_factor = + std::make_shared(std::make_shared(factor_node, inp_pos)); + Output sin_factor = + std::make_shared(std::make_shared(factor_node, inp_pos)); + + float mscale = attn_factor; + Output mscale_node = + std::make_shared(ov::element::f32, ov::Shape{}, std::vector{mscale}); + + auto cos_theta = std::make_shared(cos_factor, mscale_node); + auto sin_theta = std::make_shared(sin_factor, mscale_node); + cos_theta->set_friendly_name("rope_cos"); + sin_theta->set_friendly_name("rope_sin"); + tensor_map.insert({"rope_cos", cos_theta->output(0)}); + tensor_map.insert({"rope_sin", sin_theta->output(0)}); +} + // Create common patterns void preprocess(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) { add_token_len(tensor_map); add_kv_update_indices(tensor_map, ggml_model_decoder); + add_rope_sin_cos(tensor_map, ggml_model_decoder); } } // namespace From f3c05190962cdf60e4e4d7311ef53dfe1c97b7fa Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Fri, 11 Jul 2025 15:44:19 +0800 Subject: [PATCH 091/254] Reduce memory: free ov weights node after graph conversion --- ggml/src/ggml-openvino/ggml-decoder.cpp | 16 +++++----------- ggml/src/ggml-openvino/ggml-decoder.h | 2 ++ ggml/src/ggml-openvino/utils.cpp | 4 +--- 3 files changed, 8 insertions(+), 14 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index ae4beca23e..20d8c1b7fe 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -42,28 +42,23 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgrap m_op_name(m_node ? 
std::string(m_node->name) : "NONE_OP"), m_is_static(is_static), m_is_first_token(is_first_token) { - // TODO avoid static - static std::map> model_weights; if (m_node) { set_input_output(m_node); } else { - static bool printed = false; - if (!printed && getenv("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS")) { + if (is_first_token && getenv("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS")) { print_tensor_address_map(cgraph); - printed = true; } if (getenv("GGML_OPENVINO_DUMP_CGRAPH")) { - std::string filename = "cgraph.txt"; + auto timestamp = (long long) ggml_time_us(); + std::string filename = "cgraph_" + std::to_string(timestamp) + ".txt"; dump_cgraph(cgraph, filename); } set_llm_params(); - static bool weight_created = false; - if (!weight_created) { - add_weight_const_parallel(model_weights); - weight_created = true; + if (is_first_token) { + add_weight_const_parallel(m_model_weights); } for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) { @@ -71,7 +66,6 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgrap m_nodes.push_back(cur_node); set_input_output(cur_node); } - m_model_weights = model_weights; add_extra_inputs(); } diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index 8b507438c5..428edef3ae 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -108,6 +108,8 @@ public: ov::PartialShape get_graph_input_shape(const ggml_tensor* src) const; + void clear_model_weights() { m_model_weights.clear(); } + private: void set_input_output(ggml_tensor* node); void add_extra_inputs(); diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 2c4f0afe58..e5a4401fec 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -9,10 +9,8 @@ #include #include #include -#include #include #include -#include #include #include #include @@ -89,7 +87,6 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c if (cache_dir && !is_static) { core.set_property(ov::cache_dir(cache_dir)); } - // core.set_property(ov::enable_profiling(true)); static std::unordered_map> infer_request_cache; static std::unordered_map> ov_input_names_cache; @@ -157,6 +154,7 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c auto input_model = std::make_shared(ggml_decoder); model = ov::frontend::ggml::FrontEnd::convert(input_model); + ggml_decoder->clear_model_weights(); conversion_end_time = ggml_time_us(); auto compiled_model = core.compile_model(model, device, config); From d61f83c9b741cc11b86dbab43359d1e1b419c51e Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Thu, 17 Jul 2025 13:43:33 +0800 Subject: [PATCH 092/254] Fix CPY due to cgraph change --- ggml/src/ggml-openvino/openvino/op/cpy.cpp | 5 +++++ src/llama-graph.cpp | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-openvino/openvino/op/cpy.cpp b/ggml/src/ggml-openvino/openvino/op/cpy.cpp index e85094bb18..553f3c7966 100644 --- a/ggml/src/ggml-openvino/openvino/op/cpy.cpp +++ b/ggml/src/ggml-openvino/openvino/op/cpy.cpp @@ -39,6 +39,11 @@ OutputVector translate_cpy(const NodeContext& context) { if (op_case == 1) { // Write K to cache_k + int64_t head_size = context.get_head_size(); + int64_t num_heads_kv = context.get_num_heads_kv(); + auto src0_reshape_shape = + ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector{-1, num_heads_kv, head_size}); + src0 = std::make_shared(src0, src0_reshape_shape, 
false); auto indices = context.get_input("update_indices_k"); auto updated = std::make_shared(src1, indices, src0); res = std::make_shared(updated, std::make_shared(src1), false); diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index d4a25ab59b..e9fbff5995 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -1694,7 +1694,7 @@ static std::unique_ptr build_attn_inp_kv_impl( inp->self_v_idxs = mctx_cur->build_input_v_idxs(ctx0, ubatch); inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream); - cb(inp->self_kq_mask, "KQ_mask", -1); + ggml_set_name(inp->self_kq_mask, "KQ_mask"); ggml_set_input(inp->self_kq_mask); inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask; From ea75772e482f4cc90a651280bf243213d517d4c9 Mon Sep 17 00:00:00 2001 From: ravi9 Date: Thu, 17 Jul 2025 17:51:10 -0700 Subject: [PATCH 093/254] Added OpenVINO CI/CD. Updated docs --- .devops/openvino.Dockerfile | 134 ++++++++++++++++++++++++++++++++++ .github/workflows/build.yml | 39 ++++++++++ .github/workflows/docker.yml | 1 + .github/workflows/release.yml | 57 +++++++++++++++ ci/run.sh | 12 +++ docs/build.md | 110 ++++++++++++++++++---------- 6 files changed, 314 insertions(+), 39 deletions(-) create mode 100644 .devops/openvino.Dockerfile diff --git a/.devops/openvino.Dockerfile b/.devops/openvino.Dockerfile new file mode 100644 index 0000000000..16924e3937 --- /dev/null +++ b/.devops/openvino.Dockerfile @@ -0,0 +1,134 @@ +ARG OPENVINO_VERSION_MAJOR=2025.2 +ARG OPENVINO_VERSION_FULL=2025.2.0.19140.c01cd93e24d +ARG UBUNTU_VERSION=24.04 + +# Optional proxy build arguments - empty by default +ARG http_proxy= +ARG https_proxy= + +## Build Image +FROM ubuntu:${UBUNTU_VERSION} AS build + +# Pass proxy args to build stage +ARG http_proxy +ARG https_proxy + +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + ca-certificates \ + gnupg \ + wget \ + git \ + cmake \ + ninja-build \ + build-essential \ + libtbb12 \ + libcurl4-openssl-dev && \ + rm -rf /var/lib/apt/lists/* + +# Install OpenVINO for Ubuntu 24.04 +ARG OPENVINO_VERSION_MAJOR +ARG OPENVINO_VERSION_FULL +RUN mkdir -p /opt/intel && \ + wget https://storage.openvinotoolkit.org/repositories/openvino/packages/${OPENVINO_VERSION_MAJOR}/linux/openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64.tgz && \ + tar -xf openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64.tgz && \ + mv openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64 /opt/intel/openvino_${OPENVINO_VERSION_MAJOR} && \ + cd /opt/intel/openvino_${OPENVINO_VERSION_MAJOR} && \ + echo "Y" | ./install_dependencies/install_openvino_dependencies.sh && \ + cd - && \ + ln -s /opt/intel/openvino_${OPENVINO_VERSION_MAJOR} /opt/intel/openvino + +ENV OpenVINO_DIR=/opt/intel/openvino + +WORKDIR /app + +COPY . . 
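As a side note on the CPY fix above (patch 092): the reshape-then-scatter it builds is just a row-granular write of the new K values into the cache at the offset of the already-processed tokens. Below is a minimal plain-C++ sketch of that semantics, assuming a flat [context, n_heads_kv, head_size] cache layout (the helper name `write_k_rows` is illustrative, not code from this patch):

```cpp
// Sketch: write n_tokens worth of new K values into cache_k starting at
// row past_tokens. update_indices_k in the translated graph plays the
// role of (past_tokens + t) here.
#include <cstddef>
#include <vector>

void write_k_rows(std::vector<float>& cache_k,      // [context][n_heads_kv][head_size], flattened
                  const std::vector<float>& k_new,  // n_tokens * n_heads_kv * head_size values
                  std::size_t past_tokens, std::size_t n_tokens,
                  std::size_t n_heads_kv, std::size_t head_size) {
    const std::size_t row = n_heads_kv * head_size;  // one token's worth of K values
    for (std::size_t t = 0; t < n_tokens; ++t) {
        const std::size_t dst = (past_tokens + t) * row;
        for (std::size_t i = 0; i < row; ++i) {
            cache_k[dst + i] = k_new[t * row + i];
        }
    }
}
```

The {-1, num_heads_kv, head_size} reshape added in translate_cpy appears to exist so that the incoming flat K tensor lines up with this per-token row granularity before the scatter update.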
+ +# Build Stage +RUN bash -c "source ${OpenVINO_DIR}/setupvars.sh && \ + cmake -B build/ReleaseOV -G Ninja \ + -DCMAKE_BUILD_TYPE=Release \ + -DGGML_OPENVINO=ON && \ + cmake --build build/ReleaseOV -j$(nproc)" + +# Copy all necessary libraries +RUN mkdir -p /app/lib && \ + find build/ReleaseOV -name '*.so*' -exec cp {} /app/lib \; && \ + find ${OpenVINO_DIR}/runtime/lib/intel64 -name '*.so*' -exec cp -P {} /app/lib \; 2>/dev/null || \ + find ${OpenVINO_DIR}/lib/intel64 -name '*.so*' -exec cp -P {} /app/lib \; + +# Create runtime directories and copy binaries +RUN mkdir -p /app/full \ + && cp build/ReleaseOV/bin/* /app/full/ \ + && cp *.py /app/full \ + && cp -r gguf-py /app/full \ + && cp -r requirements /app/full \ + && cp requirements.txt /app/full \ + && cp .devops/tools.sh /app/full/tools.sh + +## Base Runtime Image +FROM ubuntu:${UBUNTU_VERSION} AS base + +# Pass proxy args to runtime stage +ARG http_proxy +ARG https_proxy + +RUN apt-get update \ + && apt-get install -y libgomp1 libtbb12 curl\ + && apt autoremove -y \ + && apt clean -y \ + && rm -rf /tmp/* /var/tmp/* \ + && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \ + && find /var/cache -type f -delete + +COPY --from=build /app/lib/ /app/ + +### Full (all binaries) +FROM base AS full + +ARG http_proxy +ARG https_proxy + +COPY --from=build /app/full /app/ + +WORKDIR /app + +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + git \ + python3 \ + python3-venv \ + python3-pip && \ + python3 -m venv /ov-venv && \ + /ov-venv/bin/pip install --no-cache-dir --upgrade pip setuptools wheel && \ + /ov-venv/bin/pip install --no-cache-dir -r requirements.txt && \ + apt-get autoremove -y && \ + apt-get clean && \ + rm -rf /tmp/* /var/tmp/* && \ + find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete && \ + find /var/cache -type f -delete + +ENTRYPOINT ["/bin/bash", "-c", "source /ov-venv/bin/activate && exec /app/tools.sh \"$@\"", "--"] + + +### Light, CLI only +FROM base AS light + +COPY --from=build /app/full/llama-cli /app/ + +WORKDIR /app + +ENTRYPOINT [ "/app/llama-cli" ] + +### Server, Server only +FROM base AS server + +ENV LLAMA_ARG_HOST=0.0.0.0 + +COPY --from=build /app/full/llama-server /app/ + +WORKDIR /app + +HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ] + +ENTRYPOINT [ "/app/llama-server" ] \ No newline at end of file diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index e3b120fcda..3692a0a69b 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -737,6 +737,45 @@ jobs: -DGGML_SYCL_F16=ON cmake --build build --config Release -j $(nproc) + ubuntu-24-cmake-openvino: + runs-on: ubuntu-24.04 + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v4 + + - name: ccache + uses: hendrikmuhs/ccache-action@v1.2.16 + with: + key: ubuntu-24-cmake-openvino-no-preset-v1 + evict-old-files: 1d + + - name: Dependencies + id: depends + run: | + export OPENVINO_VERSION_MAJOR=2025.2 + export OPENVINO_VERSION_FULL=2025.2.0.19140.c01cd93e24d + sudo apt-get update + sudo apt-get install -y build-essential libcurl4-openssl-dev libtbb12 cmake ninja-build python3-pip curl wget tar + sudo mkdir -p /opt/intel + wget -O openvino_${OPENVINO_VERSION_MAJOR}.tgz https://storage.openvinotoolkit.org/repositories/openvino/packages/${OPENVINO_VERSION_MAJOR}/linux/openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64.tgz + tar -xf openvino_${OPENVINO_VERSION_MAJOR}.tgz + sudo mv 
openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64 /opt/intel/openvino_${OPENVINO_VERSION_MAJOR} + rm openvino_${OPENVINO_VERSION_MAJOR}.tgz + cd /opt/intel/openvino_${OPENVINO_VERSION_MAJOR} + echo "Y" | sudo -E ./install_dependencies/install_openvino_dependencies.sh && cd - + sudo ln -s /opt/intel/openvino_${OPENVINO_VERSION_MAJOR} /opt/intel/openvino + + - name: Build + id: cmake_build + run: | + source /opt/intel/openvino/setupvars.sh + cmake -B build/ReleaseOV -G Ninja \ + -DCMAKE_BUILD_TYPE=Release \ + -DGGML_OPENVINO=ON + cmake --build build/ReleaseOV --config Release -j $(nproc) + build-linux-cross: uses: ./.github/workflows/build-linux-cross.yml diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index d9fe0686d3..22e236296f 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -47,6 +47,7 @@ jobs: - { tag: "vulkan", dockerfile: ".devops/vulkan.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-22.04" } - { tag: "s390x", dockerfile: ".devops/s390x.Dockerfile", platforms: "linux/s390x", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-22.04-s390x" } - { tag: "rocm", dockerfile: ".devops/rocm.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true, runs_on: "ubuntu-22.04" } + - { tag: "openvino", dockerfile: ".devops/openvino.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false } steps: - name: Check out the repo uses: actions/checkout@v4 diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 272701fb9e..2f67885ac7 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -231,6 +231,63 @@ jobs: path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.tar.gz name: llama-bin-ubuntu-vulkan-x64.tar.gz + ubuntu-24-openvino: + runs-on: ubuntu-24.04 + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: ccache + uses: hendrikmuhs/ccache-action@v1.2.16 + with: + key: ubuntu-24-cmake-openvino-release-no-preset-v1 + evict-old-files: 1d + + - name: Dependencies + id: depends + run: | + export OPENVINO_VERSION_MAJOR=2025.2 + export OPENVINO_VERSION_FULL=2025.2.0.19140.c01cd93e24d + sudo apt-get update + sudo apt-get install -y build-essential libcurl4-openssl-dev libtbb12 cmake ninja-build python3-pip curl wget tar + sudo mkdir -p /opt/intel + wget -O openvino_${OPENVINO_VERSION_MAJOR}.tgz https://storage.openvinotoolkit.org/repositories/openvino/packages/${OPENVINO_VERSION_MAJOR}/linux/openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64.tgz + tar -xf openvino_${OPENVINO_VERSION_MAJOR}.tgz + sudo mv openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64 /opt/intel/openvino_${OPENVINO_VERSION_MAJOR} + rm openvino_${OPENVINO_VERSION_MAJOR}.tgz + cd /opt/intel/openvino_${OPENVINO_VERSION_MAJOR} + echo "Y" | sudo -E ./install_dependencies/install_openvino_dependencies.sh && cd - + sudo ln -s /opt/intel/openvino_${OPENVINO_VERSION_MAJOR} /opt/intel/openvino + + - name: Build + id: cmake_build + run: | + source /opt/intel/openvino/setupvars.sh + cmake -B build/ReleaseOV -G Ninja \ + -DCMAKE_BUILD_TYPE=Release \ + -DGGML_OPENVINO=ON + cmake --build build/ReleaseOV --config Release -j $(nproc) + + - name: Determine tag name + id: tag + uses: ./.github/actions/get-tag-name + + - name: Pack artifacts + id: pack_artifacts + run: | + cp LICENSE 
./build/ReleaseOV/bin/ + zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-openvino-x64.zip ./build/ReleaseOV/bin/* + + - name: Upload artifacts + uses: actions/upload-artifact@v4 + with: + path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-openvino-x64.zip + name: llama-bin-ubuntu-openvino-x64.zip + windows-cpu: runs-on: windows-2025 diff --git a/ci/run.sh b/ci/run.sh index 6ca6ea5669..ea15ce49b1 100755 --- a/ci/run.sh +++ b/ci/run.sh @@ -25,6 +25,9 @@ # # with KLEIDIAI support # GG_BUILD_KLEIDIAI=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt # +# # with OPENVINO support +# GG_BUILD_OPENVINO=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt +# if [ -z "$2" ]; then echo "usage: $0 " @@ -165,6 +168,15 @@ if [ -n "${GG_BUILD_KLEIDIAI}" ]; then -DBUILD_SHARED_LIBS=OFF" fi +if [ ! -z ${GG_BUILD_OPENVINO} ]; then + if [ -z ${OpenVINO_DIR} ]; then + echo "OpenVINO_DIR not found, please install OpenVINO via archives and enable it by:" + echo "source /opt/intel/openvino/setupvars.sh" + exit 1 + fi + CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_OPENVINO=ON" +fi + ## helpers # download a file if it does not exist or if it is outdated diff --git a/docs/build.md b/docs/build.md index d2dea5a572..d40d257f59 100644 --- a/docs/build.md +++ b/docs/build.md @@ -25,7 +25,7 @@ The following sections describe how to build with different backends and options * [Arm® KleidiAI™](#arm-kleidiai) * [OpenCL](#opencl) * [Android](#android-1) -* [OPENVINO](#openvino) +* [OpenVINO](#openvino) * [Notes about GPU-accelerated backends](#notes-about-gpu-accelerated-backends) ## CPU Build @@ -696,20 +696,48 @@ Follow the instructions [here](https://dawn.googlesource.com/dawn/+/refs/heads/m To read documentation for how to build on IBM Z & LinuxONE, [click here](./build-s390x.md) -## OPENVINO +## OpenVINO -[OpenVINO](https://docs.openvino.ai/2025/index.html) is a open-source toolkit for optimizing and deploying performant AI inference, specifically designed for Intel hardware including CPUs, GPUs, and NPUs in the cloud, on-prem, and on the edge alike. The OpenVINO backend enhances performance by leveraging hardware-specific optimizations and can be enabled for use with llama.cpp. +[OpenVINO](https://docs.openvino.ai/2025/index.html) is an open-source toolkit for optimizing and deploying high-performance AI inference, specifically designed for Intel hardware, including CPUs, GPUs, and NPUs, in the cloud, on-premises, and on the edge. +The OpenVINO backend enhances performance by leveraging hardware-specific optimizations and can be enabled for use with llama.cpp. Follow the instructions below to install OpenVINO runtime and build llama.cpp with OpenVINO support. +### Prerequisites + +- Linux or Windows system with Intel hardware (CPU, GPU, or NPU) +- **For Intel GPU or NPU Usage**: Install the appropriate hardware drivers for your Intel GPU or NPU. For detailed instructions, see: [Additional Configurations for Hardware Acceleration](https://docs.openvino.ai/2025/get-started/install-openvino/configurations.html). +- Git, CMake, and Ninja software tools are needed for building +```bash + sudo apt-get update + sudo apt-get install -y build-essential libcurl4-openssl-dev libtbb12 cmake ninja-build python3-pip curl wget tar +``` + ### 1. 
Install OpenVINO Runtime - Follow the guide to install OpenVINO Runtime from an archive file: **[Install OpenVINO™ Runtime on Linux from an Archive File.](https://docs.openvino.ai/2025/get-started/install-openvino/install-openvino-archive-linux.html)** -- After installation, make sure to [source the environment setup script](https://docs.openvino.ai/2025/get-started/install-openvino/install-openvino-archive-linux.html#step-2-configure-the-environment): +
+<details>
+<summary>📦 Click to expand OpenVINO 2025.2 installation commands</summary>
+

```bash
-source /opt/intel/openvino_2025.1.0/setupvars.sh
+export OPENVINO_VERSION_MAJOR=2025.2
+export OPENVINO_VERSION_FULL=2025.2.0.19140.c01cd93e24d
+sudo apt-get update
+sudo apt-get install -y build-essential libcurl4-openssl-dev libtbb12 cmake ninja-build python3-pip curl wget tar
+sudo mkdir -p /opt/intel
+wget -O openvino_${OPENVINO_VERSION_MAJOR}.tgz https://storage.openvinotoolkit.org/repositories/openvino/packages/${OPENVINO_VERSION_MAJOR}/linux/openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64.tgz
+tar -xf openvino_${OPENVINO_VERSION_MAJOR}.tgz
+sudo mv openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64 /opt/intel/openvino_${OPENVINO_VERSION_MAJOR}
+rm openvino_${OPENVINO_VERSION_MAJOR}.tgz
+cd /opt/intel/openvino_${OPENVINO_VERSION_MAJOR}
+echo "Y" | sudo -E ./install_dependencies/install_openvino_dependencies.sh && cd -
+sudo ln -s /opt/intel/openvino_${OPENVINO_VERSION_MAJOR} /opt/intel/openvino
+source /opt/intel/openvino/setupvars.sh
```
+
+</details>
+ - Verify OpenVINO is initialized properly ```bash echo $OpenVINO_DIR @@ -725,23 +753,26 @@ cd llama.cpp git switch dev_backend_openvino # Build with OpenVINO support -cmake --preset ReleaseOV -cmake --build build/ReleaseOV --parallel - +source /opt/intel/openvino/setupvars.sh +cmake -B build/ReleaseOV -G Ninja -DCMAKE_BUILD_TYPE=Release -DGGML_OPENVINO=ON +cmake --build build/ReleaseOV --config Release -j $(nproc) ``` ### 3. Download Sample Model -Download the Phi-3 mini model for testing: +Download models for testing: ```bash # Create models directory -mkdir -p ~/models/Phi-3-mini-4k-instruct-gguf +mkdir -p ~/models/ -# Download model file +# Download model file: Llama-3.2-1B-Instruct.fp16.gguf +wget https://huggingface.co/MaziyarPanahi/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct.fp16.gguf \ + -O ~/models/Llama-3.2-1B-Instruct.fp16.gguf + +# Download model file: Phi-3-mini-4k-instruct-fp16.gguf wget https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf/resolve/main/Phi-3-mini-4k-instruct-fp16.gguf \ - -O ~/models/Phi-3-mini-4k-instruct-gguf/Phi-3-mini-4k-instruct-fp16.gguf - + -O ~/models/Phi-3-mini-4k-instruct-fp16.gguf ``` ### 4. Run inference with OpenVINO backend: @@ -750,28 +781,19 @@ When using the OpenVINO backend, the first inference token may have slightly hig ```bash export GGML_OPENVINO_CACHE_DIR=/tmp/ov_cache +# Default device is GPU. +# If not set, automatically selects the first available device in priority order: GPU, CPU, NPU. +export GGML_OPENVINO_DEVICE=GPU -./build/ReleaseOV/bin/llama-simple \ - -m ~/models/Phi-3-mini-4k-instruct-gguf/Phi-3-mini-4k-instruct-fp16.gguf \ - -n 50 \ - "Hello, my name is " +./build/ReleaseOV/bin/llama-simple -m ~/models/Llama-3.2-1B-Instruct.fp16.gguf -n 50 "The story of AI is " ``` -### Using Llama.cpp's Built-in CPU Backend (for Comparison) - -To compare performance with the deafult CPU backend: - +To run in chat mode: ```bash -# Build CPU-only version -cmake --preset ReleaseCPU -cmake --build build/ReleaseCPU --parallel +export GGML_OPENVINO_CACHE_DIR=/tmp/ov_cache -# Run with Default CPU backend -./build/ReleaseCPU/bin/llama-simple \ - -m ~/models/Phi-3-mini-4k-instruct-gguf/Phi-3-mini-4k-instruct-fp16.gguf \ - -n 50 \ - "Hello, my name is " +./build/ReleaseOV/bin/llama-cli -m ~/models/Llama-3.2-1B-Instruct.fp16.gguf -n 50 "The story of AI is " ``` @@ -779,13 +801,14 @@ cmake --build build/ReleaseCPU --parallel Control OpenVINO behavior using these environment variables: -- **`GGML_OPENVINO_CACHE_DIR`**: Directory for model caching (recommended: `/tmp/ov_cache`). If set, enables model caching in OpenVINO. +- **`GGML_OPENVINO_DEVICE`**: Specify the target device for OpenVINO inference. If not set, automatically selects the first available device in priority order: GPU, CPU, NPU. When set to `NPU` to use Intel NPUs, it enables static compilation mode for optimal performance. +- **`GGML_OPENVINO_CACHE_DIR`**: Directory for model caching (recommended: `/tmp/ov_cache`). If set, enables model caching in OpenVINO. Note: Not supported when using NPU devices yet. - **`GGML_OPENVINO_WEIGHT_AS_INPUT`**: Pass the weights as input to the OpenVINO model instead of creating Constant nodes for them. 
-- **`GGML_OPENVINO_PROFILING`**: Enable execution time profiling -- **`GGML_OPENVINO_DUMP_CGRAPH`**: Save compute graph to `cgraph.txt` -- **`GGML_OPENVINO_DUMP_IR`**: Export OpenVINO IR files with timestamps -- **`GGML_OPENVINO_DEBUG_INPUT`**: Enable input debugging -- **`GGML_OPENVINO_DEBUG_OUTPUT`**: Enable output debugging +- **`GGML_OPENVINO_PROFILING`**: Enable execution time profiling. +- **`GGML_OPENVINO_DUMP_CGRAPH`**: Save compute graph to `cgraph.txt`. +- **`GGML_OPENVINO_DUMP_IR`**: Export OpenVINO IR files with timestamps. +- **`GGML_OPENVINO_DEBUG_INPUT`**: Enable input debugging. +- **`GGML_OPENVINO_DEBUG_OUTPUT`**: Enable output debugging. ### Example with Profiling @@ -793,11 +816,20 @@ Control OpenVINO behavior using these environment variables: export GGML_OPENVINO_CACHE_DIR=/tmp/ov_cache export GGML_OPENVINO_PROFILING=1 -./build/ReleaseOV/bin/llama-simple \ - -m ~/models/Phi-3-mini-4k-instruct-gguf/Phi-3-mini-4k-instruct-fp16.gguf \ - -n 50 \ - "Hello, my name is " +./build/ReleaseOV/bin/llama-simple -m ~/models/Llama-3.2-1B-Instruct.fp16.gguf -n 50 "The story of AI is " +``` +### Using Llama.cpp's Built-in CPU Backend (for Comparison) + +To compare performance with the default CPU backend: + +```bash +# Build CPU-only version +cmake --preset ReleaseCPU +cmake --build build/ReleaseCPU --parallel + +# Run with the default CPU backend +./build/ReleaseCPU/bin/llama-simple -m ~/models/Llama-3.2-1B-Instruct.fp16.gguf -n 50 "The story of AI is " ``` ## Notes about GPU-accelerated backends From 1ed49bbfaf27fcd0b850e7aac943f4634711b286 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Wed, 23 Jul 2025 11:19:56 +0800 Subject: [PATCH 094/254] Fix llama-cli --- ggml/src/ggml-openvino/ggml-decoder.cpp | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 20d8c1b7fe..a94a7ddf9c 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -244,22 +244,36 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor* src) co } void GgmlOvDecoder::add_extra_inputs() { - // attention_size not used for NPU + // Extra inputs: + // 1. `attention_size`, used in matmul's in the attention block. The shape of those matmul's are 32 aligned, + // see llama_kv_cache_unified::get_n_kv and llama_kv_cache_unified::get_padding. 
+ // Not used for NPU int64_t attention_size = -1; int64_t past_token_len = -1; + int64_t past_token_len_from_inp_pos = -1; for (const auto& node : m_nodes) { + if (node->op == GGML_OP_ROPE && std::string(node->src[1]->name) == "inp_pos") { + if (node->src[1]->type != GGML_TYPE_I32) { + throw std::runtime_error("Expected cgraph input `inp_pos` to be of type GGML_TYPE_I32"); + } + past_token_len_from_inp_pos = ((int32_t*) (node->src[1]->data))[0]; + } if (node->op == GGML_OP_CPY && ggml_is_contiguous(node)) { assert(std::string(node->view_src->name).find("cache_k") == 0); - int64_t head_size = node->src[0]->ne[0]; - int64_t num_heads = node->src[0]->ne[1]; - past_token_len = (int64_t) (node->src[1]->op_params[0] / node->src[1]->nb[0] / head_size / num_heads); + past_token_len = + (int64_t) (node->src[1]->op_params[0] / node->src[1]->nb[0] / m_head_size / m_num_heads_kv); break; } } if (past_token_len == -1) { throw std::runtime_error("Failed to find input \"cache_k\" in the graph"); } + if (past_token_len != past_token_len_from_inp_pos) { + throw std::runtime_error("Mismatch between past_token_len from cache_k and inp_pos: " + + std::to_string(past_token_len) + " vs " + std::to_string(past_token_len_from_inp_pos)); + } + for (const auto& node : m_nodes) { if (node->src[1] && std::string(node->src[1]->name).find("inp_tokens") == 0) { int64_t total_token_len = node->src[1]->ne[0] + past_token_len; From 44f4cf34b12880a4e9e6e6ebc7a53fc0e2cc383c Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Mon, 21 Jul 2025 21:52:39 +0800 Subject: [PATCH 095/254] Fix Phi3 ROPE; Add test-backend-ops --- ggml/src/ggml-openvino/.clang-format | 26 +--- ggml/src/ggml-openvino/ggml-decoder.cpp | 77 ++++++++-- ggml/src/ggml-openvino/ggml-decoder.h | 10 +- ggml/src/ggml-openvino/ggml-openvino.cpp | 142 ++++++++++++++++-- ggml/src/ggml-openvino/openvino/frontend.cpp | 4 +- ggml/src/ggml-openvino/openvino/frontend.hpp | 2 +- .../ggml-openvino/openvino/node_context.hpp | 4 + ggml/src/ggml-openvino/openvino/op/cont.cpp | 14 +- .../ggml-openvino/openvino/op/get_rows.cpp | 31 +++- ggml/src/ggml-openvino/openvino/op/mulmat.cpp | 60 ++++---- ggml/src/ggml-openvino/openvino/op/rope.cpp | 66 +++++--- .../ggml-openvino/openvino/op/soft_max.cpp | 33 ++-- .../openvino/translate_session.cpp | 83 +++------- .../openvino/translate_session.hpp | 3 +- ggml/src/ggml-openvino/openvino/utils.cpp | 139 +++++++++++++++++ ggml/src/ggml-openvino/openvino/utils.hpp | 10 ++ ggml/src/ggml-openvino/utils.cpp | 44 ++++++ ggml/src/ggml-openvino/utils.h | 4 + 18 files changed, 550 insertions(+), 202 deletions(-) diff --git a/ggml/src/ggml-openvino/.clang-format b/ggml/src/ggml-openvino/.clang-format index 6d77ecea3c..d631bc6c01 100644 --- a/ggml/src/ggml-openvino/.clang-format +++ b/ggml/src/ggml-openvino/.clang-format @@ -5,6 +5,10 @@ AlignConsecutiveDeclarations: false ReferenceAlignment: Left PointerAlignment: Left Cpp11BracedListStyle: true +AccessModifierOffset: -4 +BinPackArguments: false +BinPackParameters: false +BreakBeforeBraces: Attach Language: Cpp AlignAfterOpenBracket: Align @@ -27,29 +31,7 @@ AllowShortIfStatementsOnASingleLine: Never AllowShortLambdasOnASingleLine: Inline AllowShortLoopsOnASingleLine: false AlwaysBreakBeforeMultilineStrings: true -BinPackArguments: true -BinPackParameters: true # OnePerLine BitFieldColonSpacing: Both -BreakBeforeBraces: Custom # Attach -BraceWrapping: - AfterCaseLabel: true - AfterClass: false - AfterControlStatement: false - AfterEnum: false - AfterFunction: false - AfterNamespace: false - 
AfterObjCDeclaration: false - AfterStruct: false - AfterUnion: false - AfterExternBlock: false - BeforeCatch: false - BeforeElse: false - BeforeLambdaBody: false - BeforeWhile: false - IndentBraces: false - SplitEmptyFunction: false - SplitEmptyRecord: false - SplitEmptyNamespace: false # BreakAdjacentStringLiterals: true BreakAfterAttributes: Never BreakBeforeBinaryOperators: None diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index a94a7ddf9c..8ce9354c69 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -5,6 +5,7 @@ #include #include +#include #include #include #include @@ -15,6 +16,8 @@ #include #include #include +#include +#include #include #include #include @@ -71,9 +74,19 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgrap } } +GgmlOvDecoder::GgmlOvDecoder(struct ggml_cgraph* cgraph) { + m_cgraph = cgraph; + for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) { + auto* cur_node = cgraph->nodes[node_n]; + m_nodes.push_back(cur_node); + set_input_output(cur_node, true); + } +} + // Called in GgmlOvDecoder constructor. Two cases: 1. constructing a decoder for the whole graph; -// 2. constructing a decoder for a node. -void GgmlOvDecoder::set_input_output(ggml_tensor* node) { +// 2. constructing a decoder for a node; +// 3. constructing a decoder for the whole graph naively (op test case) +void GgmlOvDecoder::set_input_output(ggml_tensor* node, bool naive) { std::string node_name; if (node->op == GGML_OP_CPY) { // CPY updates the input tensor in place. For later ov op that uses the @@ -98,8 +111,14 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node) { m_inputs[src_name] = src; m_op_node_name.emplace_back(src_name, ggml_op_name(node->op)); - // If called for the whole graph, create constant nodes for weights and param nodes for inputs - if (!m_node && !src->view_src) { + // Add model inputs and weights constants, if called for the whole graph + if (naive) { + auto param_node = std::make_shared(get_ov_type(src), get_graph_input_shape(src)); + param_node->set_friendly_name(src_name); + param_node->output(0).get_tensor().set_names({src_name}); + m_model_inputs[src_name] = param_node; + + } else if (!m_node && !src->view_src) { ggml_backend_buffer* buffer = src->buffer; if (buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY || src->flags & GGML_TENSOR_FLAG_INPUT) { @@ -118,7 +137,10 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node) { } } - if (!m_node) { + // Add model outputs, if called for the whole graph + if (naive) { + m_model_output_names.push_back(node->name); + } else if (!m_node) { static std::set debug_output_names = {}; // Workaround: the final tensor "result_output" does not have GGML_TENSOR_FLAG_OUTPUT flag set in cgraph if (node->buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY || node->flags & GGML_TENSOR_FLAG_OUTPUT || @@ -164,17 +186,7 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node) { m_op_case = 2; } break; - } - case GGML_OP_MUL_MAT: { - if (node->src[0]->view_src == nullptr) { - m_op_case = 1; - } else if (std::string(node->src[0]->name).find("cache_k") == 0) { - m_op_case = 2; - } else if (std::string(node->src[0]->name).find("cache_v") == 0) { - m_op_case = 3; } - break; - } case GGML_OP_PERMUTE: { if (node->src[0]->view_src == nullptr) { // Permute Qcur @@ -188,6 +200,23 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node) { } break; } + case GGML_OP_GET_ROWS: + { + if (node->src[1]->op == GGML_OP_VIEW) { + 
m_op_case = 2; + } else { + m_op_case = 1; + } + break; + } + case GGML_OP_ROPE: + { + if (node->src[0]->op == GGML_OP_VIEW) { + m_op_case = 2; + } else { + m_op_case = 1; + } + } default: break; } @@ -237,6 +266,9 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor* src) co input_shape = ov::PartialShape{m_context_size, m_num_heads_kv, m_head_size}; } else if (std::string(src->name).find("cache_v") == 0) { input_shape = ov::PartialShape{m_num_heads_kv, m_head_size, m_context_size}; + } else if (src->op == GGML_OP_VIEW) { + // This case is added to make test-backend-ops work + input_shape = ov::PartialShape{get_shape(src->view_src)}; } else { input_shape = ov::PartialShape{get_shape(src)}; } @@ -373,6 +405,17 @@ std::shared_ptr GgmlOvDecoder::create_weight_node(ggml_tensor* tensor) weight_node = std::make_shared(node_type, node_shape, data_f16); break; } + case GGML_TYPE_BF16: + { + const auto* ptr = reinterpret_cast(tensor->data); + std::vector data_bf16; + data_bf16.reserve(ne_total); + for (int i = 0; i < ne_total; ++i) { + data_bf16.push_back(ov::bfloat16::from_bits(ptr[i])); + } + weight_node = std::make_shared(node_type, node_shape, data_bf16); + break; + } default: throw std::invalid_argument("Unsupported tensor type"); } @@ -496,6 +539,9 @@ ov::element::Type GgmlOvDecoder::get_ov_type(const ggml_tensor* tensor) { case GGML_TYPE_F16: type = ov::element::f16; break; + case GGML_TYPE_BF16: + type = ov::element::bf16; + break; case GGML_TYPE_I64: type = ov::element::i64; break; @@ -576,6 +622,7 @@ void GgmlOvDecoder::visit_subgraph(std::function ops = { + {GGML_OP_NONE, "GGML_OP_NONE" }, {GGML_OP_ACC, "GGML_OP_ACC" }, {GGML_OP_ADD, "GGML_OP_ADD" }, {GGML_OP_ADD1, "GGML_OP_ADD1" }, diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index 428edef3ae..f4fe9c402d 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -15,6 +15,8 @@ public: GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgraph, bool is_static, bool is_first_token, int context_size, int num_heads, int num_heads_kv, int head_size); + // Naive decoder + GgmlOvDecoder(struct ggml_cgraph* cgraph); virtual ov::Any get_attribute(const std::string& name) const override { return nullptr; GGML_UNUSED(name); @@ -111,7 +113,7 @@ public: void clear_model_weights() { m_model_weights.clear(); } private: - void set_input_output(ggml_tensor* node); + void set_input_output(ggml_tensor* node, bool naive = false); void add_extra_inputs(); static void dump_cgraph(const struct ggml_cgraph* cgraph, std::string& filename); static std::vector get_shape(const ggml_tensor* tensor); @@ -124,13 +126,13 @@ private: static std::shared_ptr create_weight_node(ggml_tensor* tensor); void add_weight_const_parallel(std::map>& model_weights); - struct ggml_cgraph* m_cgraph; + struct ggml_cgraph* m_cgraph = nullptr; + ggml_tensor* m_node = nullptr; + std::vector m_nodes; std::map m_inputs; std::vector m_input_names; std::map m_outputs; std::vector m_output_names; - ggml_tensor* m_node; - std::vector m_nodes; std::string m_op_name; mutable std::string m_name; int m_op_case; diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp index 167453b215..2bc9d5199c 100644 --- a/ggml/src/ggml-openvino/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino.cpp @@ -1,15 +1,17 @@ -#include "ggml-backend-impl.h" -#include "ggml-impl.h" #include "ggml-openvino.h" -#include "ggml-openvino/utils.h" -#include 
"ggml.h" +#include #include #include #include #include #include +#include "ggml-backend-impl.h" +#include "ggml-impl.h" +#include "ggml-openvino/utils.h" +#include "ggml.h" + #define GGML_OPENVINO_MAX_STREAMS 8 struct ggml_backend_openvino_context { @@ -234,9 +236,85 @@ static ggml_backend_buffer_t ggml_backend_openvino_device_buffer_from_host_ptr(g return nullptr; } +static bool is_op_unsupported_case(const ggml_tensor* op) { + if (op->op == GGML_OP_SOFT_MAX) { + float scale = 1.0f; + float max_bias = 0.0f; + const auto* op_params = op->op_params; + memcpy(&scale, (const float*) op_params + 0, sizeof(float)); + memcpy(&max_bias, (const float*) op_params + 1, sizeof(float)); + const uint32_t h = op->src[0]->ne[2]; + const uint32_t n_head = op->src[0]->ne[0]; + const uint32_t n_head_log2 = 1u << (uint32_t) floor(log2(n_head)); + + const float m0 = powf(2.0f, -(max_bias) / n_head_log2); + const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2); + const float slope = + (max_bias > 0.0f) ? h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2 * (h - n_head_log2) + 1) : 1.0f; + + if (slope != 1.0f) { + GGML_LOG_WARN("OpenVINO backend does not support SOFT_MAX with slope != 1.0f\n"); + return true; + } + } + + if (op->op == GGML_OP_MUL_MAT) { + if ((op->src[0]->view_src && op->src[0]->op != GGML_OP_PERMUTE) || + (op->src[1]->view_src && op->src[1]->op != GGML_OP_PERMUTE)) { + GGML_LOG_WARN("OpenVINO backend does not support MUL_MAT with view_src tensors that are not PERMUTE\n"); + return true; + } + } + + if (op->op == GGML_OP_ROPE) { + const int32_t* op_params = op->op_params; + const int n_dims = op_params[1]; + const int mode = op_params[2]; + if (mode == GGML_ROPE_TYPE_MROPE || mode == GGML_ROPE_TYPE_VISION) { + GGML_LOG_WARN("OpenVINO backend does not support ROPE with mode %d\n", mode); + return true; + } + if (n_dims != op->src[0]->ne[0]) { + GGML_LOG_WARN("OpenVINO backend does not support ROPE with n_dims %d != src[0]->ne[0] %ld\n", + n_dims, + op->src[0]->ne[0]); + return true; + } + if (op->type != GGML_TYPE_F32) { + GGML_LOG_WARN("OpenVINO backend does not support ROPE with type %s\n", ggml_type_name(op->type)); + return true; + } + float freq_scale; + memcpy(&freq_scale, op_params + 6, sizeof(float)); + if (freq_scale != 1.0f) { + GGML_LOG_WARN("OpenVINO backend does not support ROPE with freq_scale %f != 1.0f\n", freq_scale); + return true; + } + float ext_factor; + memcpy(&ext_factor, op_params + 7, sizeof(float)); + if (ext_factor != 0.0f) { + GGML_LOG_WARN("OpenVINO backend does not support ROPE with ext_factor %f != 0.0f\n", ext_factor); + return true; + } + if (op->src[0]->op == GGML_OP_VIEW) { + if (op->src[0]->view_src->ne[1] != op->src[0]->ne[2]) { + GGML_LOG_WARN( + "OpenVINO backend does not support ROPE with src[0]->view_src->ne[1] %ld != src[0]->ne[2] %ld\n", + op->src[0]->view_src->ne[1], + op->src[0]->ne[2]); + return true; + } + } + } + return false; +} + static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) { GGML_ASSERT(dev->reg != nullptr); + static const std::set supported_types{ + GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_BF16, GGML_TYPE_I64, GGML_TYPE_I32}; + static const std::set supported_ops{GGML_OP_NONE, GGML_OP_ADD, GGML_OP_MUL, GGML_OP_MUL_MAT, GGML_OP_VIEW, GGML_OP_CONT, GGML_OP_CPY, GGML_OP_RESHAPE, GGML_OP_PERMUTE, GGML_OP_TRANSPOSE, GGML_OP_GET_ROWS, GGML_OP_ROPE, @@ -248,18 +326,60 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con GGML_GLU_OP_SWIGLU, }; - auto res = false; 
switch (op->op) { case GGML_OP_UNARY: - res = supported_unary_ops.find(ggml_get_unary_op(op)) != supported_unary_ops.end(); - break; + { + auto supported = supported_unary_ops.find(ggml_get_unary_op(op)) != supported_unary_ops.end(); + if (!supported) { + GGML_LOG_WARN("OpenVINO backend does not support unary op %s\n", + ggml_unary_op_name(ggml_get_unary_op(op))); + return false; + } + break; + } case GGML_OP_GLU: - res = supported_glu_ops.find(ggml_get_glu_op(op)) != supported_glu_ops.end(); - break; + { + auto supported = supported_glu_ops.find(ggml_get_glu_op(op)) != supported_glu_ops.end(); + if (!supported) { + GGML_LOG_WARN("OpenVINO backend does not support GLU op %s\n", + ggml_glu_op_name(ggml_get_glu_op(op))); + return false; + } + break; + } default: - res = supported_ops.find(op->op) != supported_ops.end(); + { + auto supported = supported_ops.find(op->op) != supported_ops.end(); + if (!supported) { + GGML_LOG_WARN("OpenVINO backend does not support op %s\n", ggml_op_name(op->op)); + return false; + } + } } - return res; + + if (supported_types.find(op->type) == supported_types.end()) { + GGML_LOG_WARN("OpenVINO backend does not support tensor type %s\n", ggml_type_name(op->type)); + return false; + } + if (op->ne[3] != 1) { + GGML_LOG_WARN("OpenVINO backend does not support tensors with ne[3] != 1\n"); + return false; + } + for (int i = 0; i < GGML_MAX_SRC; i++) { + if (supported_types.find(op->type) == supported_types.end()) { + GGML_LOG_WARN("OpenVINO backend does not support tensor type %s\n", ggml_type_name(op->type)); + return false; + } + if (op->src[i] != nullptr && op->src[i]->ne[3] != 1) { + GGML_LOG_WARN("OpenVINO backend does not support tensors with ne[3] != 1\n"); + return false; + } + } + + if (is_op_unsupported_case(op)) { + return false; + } + return true; } static bool ggml_backend_openvino_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { diff --git a/ggml/src/ggml-openvino/openvino/frontend.cpp b/ggml/src/ggml-openvino/openvino/frontend.cpp index ff7f0e8392..dbdae1ed45 100644 --- a/ggml/src/ggml-openvino/openvino/frontend.cpp +++ b/ggml/src/ggml-openvino/openvino/frontend.cpp @@ -10,13 +10,13 @@ namespace ggml { FrontEnd::FrontEnd() {} -std::shared_ptr FrontEnd::convert(const InputModel::Ptr& model) { +std::shared_ptr FrontEnd::convert(const InputModel::Ptr& model, bool naive) { auto ggml_model = std::dynamic_pointer_cast(model); FRONT_END_GENERAL_CHECK(ggml_model, "Invalid input model"); std::shared_ptr converted_model; const auto& supported_ops = get_supported_ops(); { - TranslateSession translate_session(model, supported_ops); + TranslateSession translate_session(model, supported_ops, naive); converted_model = translate_session.get_converted_model(); } return converted_model; diff --git a/ggml/src/ggml-openvino/openvino/frontend.hpp b/ggml/src/ggml-openvino/openvino/frontend.hpp index 5cc7ff1773..f1c6f0c3e3 100644 --- a/ggml/src/ggml-openvino/openvino/frontend.hpp +++ b/ggml/src/ggml-openvino/openvino/frontend.hpp @@ -15,7 +15,7 @@ public: using Ptr = std::shared_ptr; FrontEnd(); - static std::shared_ptr convert(const InputModel::Ptr& model); + static std::shared_ptr convert(const InputModel::Ptr& model, bool naive = false); }; } // namespace ggml diff --git a/ggml/src/ggml-openvino/openvino/node_context.hpp b/ggml/src/ggml-openvino/openvino/node_context.hpp index b5f0f37406..ceba642275 100644 --- a/ggml/src/ggml-openvino/openvino/node_context.hpp +++ b/ggml/src/ggml-openvino/openvino/node_context.hpp @@ -77,6 +77,10 @@ 
public: return m_tensor_map->at(name); } + bool has_input(const std::string& name) const { + return m_tensor_map->find(name) != m_tensor_map->end(); + } + const std::string& get_name() const override { return m_decoder->get_op_name(); } diff --git a/ggml/src/ggml-openvino/openvino/op/cont.cpp b/ggml/src/ggml-openvino/openvino/op/cont.cpp index 5c6953caff..f83c0e62df 100644 --- a/ggml/src/ggml-openvino/openvino/op/cont.cpp +++ b/ggml/src/ggml-openvino/openvino/op/cont.cpp @@ -34,19 +34,7 @@ OutputVector translate_cont(const NodeContext& context) { false); } else { // The input comes from a VIEW - // Currently all cases are slicing at lowest dim - int32_t* op_params = context.get_input_op_params(0); - auto output_stride = context.get_output_stride(0); - - int64_t split_addr = op_params[0] / output_stride[2]; - std::vector begin = {0, 0, split_addr}; - std::vector end = {(int64_t)src_shape[0], INT_MAX, split_addr + (int64_t)src_shape[2]}; - std::vector strides = {1, 1, 1}; - - auto begin_const = ov::op::v0::Constant::create(element::i64, {begin.size()}, begin); - auto end_const = ov::op::v0::Constant::create(ov::element::i64, {end.size()}, end); - auto strides_const = ov::op::v0::Constant::create(ov::element::i64, {strides.size()}, strides); - res = std::make_shared(context.get_input(0), begin_const, end_const, strides_const); + res = process_view_input(context, 0); } return rename_outputs_with_suffix({res}, context.get_name()); diff --git a/ggml/src/ggml-openvino/openvino/op/get_rows.cpp b/ggml/src/ggml-openvino/openvino/op/get_rows.cpp index 9ed5f4deaf..c97bbbf5a3 100644 --- a/ggml/src/ggml-openvino/openvino/op/get_rows.cpp +++ b/ggml/src/ggml-openvino/openvino/op/get_rows.cpp @@ -1,10 +1,12 @@ +#include #include #include #include #include #include #include -#include +#include +#include #include "../node_context.hpp" #include "../op_table.hpp" @@ -18,19 +20,32 @@ namespace op { OutputVector translate_get_rows(const NodeContext& context) { num_inputs_check(context, 2, 2); - auto data_node = context.get_input(0); - auto indices_node = context.get_input(1); + int op_case = context.get_op_case(); + FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2, "Unsupported CONT case"); - auto indices_shape = get_dimensions(indices_node.get_node_shared_ptr(), {2}); - Output indice_reshaped = std::make_shared(indices_node, indices_shape, false); + Output res; + auto data = context.get_input(0); + auto indices = context.get_input(1); - auto axis_node = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{}, {1}); + if (op_case == 2) { + // The input comes from a VIEW + indices = process_view_input(context, 1); + } + + auto axis = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{}, {1}); + if (indices.get_partial_shape()[1].get_length() == 1) { + indices = + std::make_shared(indices, ov::op::v0::Constant::create(ov::element::i64, {2}, {0, 1})); + res = std::make_shared(data, indices, axis); + } else { + indices = + std::make_shared(indices, ov::op::v0::Constant::create(ov::element::i64, {1}, {0})); + res = std::make_shared(data, indices, axis, 1); + } - Output res = std::make_shared(data_node, indice_reshaped, axis_node); if (res.get_element_type() != context.get_output_type(0)) { res = std::make_shared(res, context.get_output_type(0)); } - return rename_outputs_with_suffix({res}, context.get_name()); } diff --git a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp index 1394989395..52d1e575db 100644 --- 
a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp +++ b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp @@ -26,48 +26,46 @@ namespace op { OutputVector translate_mulmat(const NodeContext& context) { num_inputs_check(context, 2, 2); - int op_case = context.get_op_case(); - FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2 || op_case == 3, "Unsupported MULMAT case"); - ov::Output res; + ov::Output B = context.get_input(0); + ov::Output A = std::make_shared(context.get_input(1), context.get_input_type(0)); - if (op_case == 1) { - auto src0 = context.get_input(0); - auto src1 = std::make_shared(context.get_input(1), context.get_input_type(0)); - auto result_lp = std::make_shared(src1, src0, false, true); - res = std::make_shared(result_lp, context.get_output_type(0)); - } else { - ov::Output B = context.get_input(0); - ov::Output A = std::make_shared(context.get_input(1), context.get_input_type(0)); + auto B_shape = context.get_input_shape(0).to_shape(); + auto A_shape = context.get_input_shape(1).to_shape(); + int64_t A_batch = A_shape[0]; + int64_t B_batch = B_shape[0]; + auto A_batch_larger = A_batch > B_batch; + Output Z = A_batch_larger ? B : A; + int64_t factor = A_batch_larger ? A_batch / B_batch : B_batch / A_batch; + if (factor > 1) { + auto A_batch_node = ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector{A_batch}); + auto B_batch_node = ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector{B_batch}); + auto factor_node = ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector{factor}); - int64_t num_heads = context.get_num_heads(); - int64_t num_heads_kv = context.get_num_heads_kv(); - int64_t kv_num_heads_factor = num_heads / num_heads_kv; - if (kv_num_heads_factor > 1) { - auto num_heads_node = ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector{num_heads}); - auto num_heads_kv_node = - ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector{num_heads_kv}); - auto factor_node = - ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector{kv_num_heads_factor}); - auto B_shape_last_two = get_dimensions(B.get_node_shared_ptr(), {1, 2}); + auto Z_last_two_dim = get_dimensions(Z.get_node_shared_ptr(), {1, 2}); - auto unsqueeze_axes = ov::op::v0::Constant::create(ov::element::i64, Shape{}, {1}); - auto B_unsqueezed = std::make_shared(B, unsqueeze_axes); + auto unsqueeze_axes = ov::op::v0::Constant::create(ov::element::i64, Shape{}, {1}); + auto Z_unsqueezed = std::make_shared(Z, unsqueeze_axes); - auto broadcast_shape = std::make_shared( - ov::OutputVector{num_heads_kv_node, factor_node, B_shape_last_two}, 0); - auto B_broadcasted = std::make_shared(B_unsqueezed, broadcast_shape); + Output batch_small = A_batch_larger ? B_batch_node : A_batch_node; + Output batch_large = A_batch_larger ? 
A_batch_node : B_batch_node; + auto broadcast_shape = + std::make_shared(ov::OutputVector{batch_small, factor_node, Z_last_two_dim}, 0); + auto Z_broadcasted = std::make_shared(Z_unsqueezed, broadcast_shape); - auto new_B_shape = - std::make_shared(ov::OutputVector{num_heads_node, B_shape_last_two}, 0); - B = std::make_shared(B_broadcasted, new_B_shape, false); + auto new_Z_shape = std::make_shared(ov::OutputVector{batch_large, Z_last_two_dim}, 0); + Z = std::make_shared(Z_broadcasted, new_Z_shape, false); + } + if (A_batch_larger) { + B = Z; + } else { + A = Z; } auto result_lp = std::make_shared(A, B, false, true); res = std::make_shared(result_lp, context.get_output_type(0)); - } - return rename_outputs_with_suffix({res}, context.get_name()); + return rename_outputs_with_suffix({res}, context.get_name()); } } // namespace op diff --git a/ggml/src/ggml-openvino/openvino/op/rope.cpp b/ggml/src/ggml-openvino/openvino/op/rope.cpp index f5736fefc8..7951a1e012 100644 --- a/ggml/src/ggml-openvino/openvino/op/rope.cpp +++ b/ggml/src/ggml-openvino/openvino/op/rope.cpp @@ -11,6 +11,7 @@ #include #include #include +#include #include #include "../node_context.hpp" @@ -25,37 +26,66 @@ namespace op { OutputVector translate_rope(const NodeContext& context) { num_inputs_check(context, 2, 3); + int op_case = context.get_op_case(); + FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2, "Unsupported CONT case"); + ov::Output res; auto data_node = context.get_input(0).get_node_shared_ptr(); - auto cos_theta_node = context.get_input("rope_cos"); - auto sin_theta_node = context.get_input("rope_sin"); - + auto output_shape = context.get_output_shape(0).to_shape(); int32_t* op_params = context.get_output_op_params(0); + + Output cos_theta_node; + Output sin_theta_node; + if (context.has_input("rope_cos")) { + cos_theta_node = context.get_input("rope_cos"); + sin_theta_node = context.get_input("rope_sin"); + } else { + auto inp_pos = context.get_input(1).get_node_shared_ptr(); + std::shared_ptr rope_freqs_weight; + if (context.get_input_size() == 3) { + rope_freqs_weight = context.get_input(2).get_node_shared_ptr(); + } + auto sin_cos = make_sin_cos(op_params, inp_pos, rope_freqs_weight); + sin_theta_node = sin_cos.first; + cos_theta_node = sin_cos.second; + } + + if (op_case == 2) { + // The input comes from a VIEW + int slice_len = output_shape[1] * output_shape[2]; + data_node = process_view_input(context, 0, slice_len).get_node_shared_ptr(); + auto data_shape = ov::op::v0::Constant::create( + ov::element::i64, {3}, std::vector{-1, (int64_t) output_shape[1], (int64_t) output_shape[2]}); + data_node = std::make_shared(data_node, data_shape, false); + } + const int mode = op_params[2]; - constexpr int GGML_ROPE_TYPE_NEOX = 2; - const bool is_neox = mode & GGML_ROPE_TYPE_NEOX; + constexpr int ROPE_TYPE_NEOX = 2; + constexpr int ROPE_TYPE_NORM = 0; - if (!is_neox) { - auto input_shape = context.get_input_shape(0); + if (mode == ROPE_TYPE_NORM) { + auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); + auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); + auto two = ov::op::v0::Constant::create(ov::element::i64, {1}, {2}); + auto end = ov::op::v0::Constant::create(ov::element::i64, {1}, {output_shape[2]}); + auto even_slice = std::make_shared(data_node, zero, end, two, two); + auto odd_slice = std::make_shared(data_node, one, end, two, two); - auto begin_even = ov::op::v0::Constant::create(ov::element::i64, Shape{3}, {0, 0, 0}); - auto begin_odd = 
ov::op::v0::Constant::create(ov::element::i64, Shape{3}, {0, 0, 1}); - auto end = std::make_shared(data_node); - auto stride = ov::op::v0::Constant::create(ov::element::i64, Shape{3}, {1, 1, 2}); - auto even_slice = std::make_shared(data_node, begin_even, end, stride); - auto odd_slice = std::make_shared(data_node, begin_odd, end, stride); - - auto first_half = + Output first_half = std::make_shared(std::make_shared(even_slice, cos_theta_node), std::make_shared(odd_slice, sin_theta_node)); - auto second_half = + Output second_half = std::make_shared(std::make_shared(even_slice, sin_theta_node), std::make_shared(odd_slice, cos_theta_node)); - auto stack = std::make_shared(OutputVector{first_half, second_half}, 2); + first_half = std::make_shared(first_half, + ov::op::v0::Constant::create(ov::element::i64, {1}, {3})); + second_half = std::make_shared(second_half, + ov::op::v0::Constant::create(ov::element::i64, {1}, {3})); + auto stack = std::make_shared(OutputVector{first_half, second_half}, 3); res = std::make_shared(stack, std::make_shared(data_node), false); - } else { + } else if (mode == ROPE_TYPE_NEOX) { auto data_split = std::make_shared( data_node, ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {2}), 2); Output slice_data_node_0 = data_split->outputs()[0]; diff --git a/ggml/src/ggml-openvino/openvino/op/soft_max.cpp b/ggml/src/ggml-openvino/openvino/op/soft_max.cpp index d59f4499a3..001a62be8b 100644 --- a/ggml/src/ggml-openvino/openvino/op/soft_max.cpp +++ b/ggml/src/ggml-openvino/openvino/op/soft_max.cpp @@ -33,9 +33,9 @@ OutputVector translate_soft_max(const NodeContext& context) { auto* op_params = context.get_output_op_params(0); memcpy(&scale, (float*) op_params + 0, sizeof(float)); memcpy(&max_bias, (float*) op_params + 1, sizeof(float)); - const uint32_t h = context.get_head_size(); - - const uint32_t n_head = context.get_input_shape(0)[0].get_length(); + auto src0_shape = context.get_input_shape(0).get_shape(); + const uint32_t h = src0_shape[2]; + const uint32_t n_head = src0_shape[0]; const uint32_t n_head_log2 = 1u << (uint32_t) floor(log2(n_head)); const float m0 = powf(2.0f, -(max_bias) / n_head_log2); @@ -46,23 +46,30 @@ OutputVector translate_soft_max(const NodeContext& context) { auto scale_node = std::make_shared(ov::element::f32, ov::Shape{}, std::vector{scale}); auto scaled_input = std::make_shared(input_node, scale_node); + if (context.get_input_size() < 2) { + res = std::make_shared(scaled_input, 2); + return rename_outputs_with_suffix({res}, context.get_name()); + } + auto mask_node = context.get_input(1); - // Use Q-cur to retrieve the token length, so that the translation of SOFT_MAX + std::shared_ptr token_len = get_dimensions(input_node, {1}); + // Try using Q-cur to retrieve the token length, so that the translation of SOFT_MAX // does not depend on the result of the QK MatMul, so that QK matmul + softmax + qkv matmul // can be fused into SDPA. 
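For context on this soft_max rework: per output row the translation computes softmax(logits * scale + mask), with the mask sliced down to the current token length so that QK matmul + softmax + QKV matmul can still fuse into SDPA. A plain-C++ sketch of the per-row arithmetic (`soft_max_row` is an illustrative name; the real computation is built as OpenVINO graph ops):

```cpp
// One row of the masked, scaled softmax: y = softmax(x * scale + mask).
// Subtracting the row max is the usual numerical-stability trick.
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

void soft_max_row(std::vector<float>& logits,      // one row of QK, length n_kv
                  const std::vector<float>& mask,  // same length; -inf masks a slot
                  float scale) {
    float max_v = -INFINITY;
    for (std::size_t i = 0; i < logits.size(); ++i) {
        logits[i] = logits[i] * scale + mask[i];
        max_v = std::max(max_v, logits[i]);
    }
    float sum = 0.0f;
    for (float& v : logits) {
        v = std::exp(v - max_v);
        sum += v;
    }
    for (float& v : logits) {
        v /= sum;
    }
}
```

The ALiBi slope term is omitted here since, per the support check earlier in this series, the backend only accepts slope == 1.0f.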
- if (input_node->get_type_info() != ov::op::v0::Convert::get_type_info_static()) { - throw std::runtime_error("Input of SOFT_MAX should be MatMul of qk followed by a Convert"); + if (input_node->get_type_info() == ov::op::v0::Convert::get_type_info_static()) { + auto qk = input_node->get_input_node_shared_ptr(0); + if (qk->get_type_info() == ov::op::v0::MatMul::get_type_info_static()) { + token_len = get_dimensions(qk->get_input_node_shared_ptr(0), {1}); + } } - auto qk = input_node->get_input_node_shared_ptr(0); - if (qk->get_type_info() != ov::op::v0::MatMul::get_type_info_static()) { - throw std::runtime_error("Input of SOFT_MAX should be MatMul of qk followed by a Convert"); - } - auto token_len = get_dimensions(qk->get_input_node_shared_ptr(0), {1}); - auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); - auto mask_node_sliced = std::make_shared(mask_node, zero, token_len, one, one); + std::shared_ptr mask_node_sliced = + std::make_shared(mask_node, zero, token_len, one, one); + if (mask_node_sliced->get_element_type() != context.get_output_type(0)) { + mask_node_sliced = std::make_shared(mask_node_sliced, context.get_output_type(0)); + } Output slope_mask; if (slope != 1.0f) { diff --git a/ggml/src/ggml-openvino/openvino/translate_session.cpp b/ggml/src/ggml-openvino/openvino/translate_session.cpp index d122497e63..129c3592c9 100644 --- a/ggml/src/ggml-openvino/openvino/translate_session.cpp +++ b/ggml/src/ggml-openvino/openvino/translate_session.cpp @@ -145,69 +145,18 @@ void add_rope_sin_cos(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) { int32_t* rope_params = ggml_model_decoder.get_rope_params(); auto inp_pos = tensor_map.at("inp_pos").get_node_shared_ptr(); std::shared_ptr rope_freqs_weight; - - inp_pos = std::make_shared(inp_pos, ov::element::f32); - auto pos_perm = - std::make_shared(ov::element::i64, ov::Shape{3}, std::vector{2, 1, 0}); - inp_pos = std::make_shared(inp_pos, pos_perm); if (tensor_map.find("rope_freqs_weight") != tensor_map.end()) { rope_freqs_weight = tensor_map.at("rope_freqs.weight").get_node_shared_ptr(); } - float freq_base; - float freq_scale; - float ext_factor; - float attn_factor; - float beta_fast; - float beta_slow; - const int n_dims = rope_params[1]; - const int n_ctx_orig = rope_params[4]; - memcpy(&freq_base, rope_params + 5, sizeof(float)); - memcpy(&freq_scale, rope_params + 6, sizeof(float)); - memcpy(&ext_factor, rope_params + 7, sizeof(float)); - memcpy(&attn_factor, rope_params + 8, sizeof(float)); - memcpy(&beta_fast, rope_params + 9, sizeof(float)); - memcpy(&beta_slow, rope_params + 10, sizeof(float)); + auto sin_cos = make_sin_cos(rope_params, inp_pos, rope_freqs_weight); + auto sin_theta = sin_cos.first; + auto cos_theta = sin_cos.second; - const float theta_scale = powf(freq_base, -2.0f / n_dims); - - // TODO: corr_dims is not used in the current implementation - float corr_dims[2]; - ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims); - - // TODO: GGML_OP_ROPE_BACK -> false - // bool forward = true; - // const float sin_sign = forward ? 
1.0f : -1.0f; - - const int64_t half_head_size = ggml_model_decoder.get_head_size() / 2; - std::vector factor(half_head_size); - factor[0] = freq_scale; - for (int64_t i = 1; i < half_head_size; i++) { - factor[i] = theta_scale * factor[i - 1]; - } - - Output factor_node = - std::make_shared(ov::element::f32, ov::Shape{1, 1, factor.size()}, factor); - if (rope_freqs_weight) { - factor_node = std::make_shared(factor_node, rope_freqs_weight); - } - - auto half_head_size_node = ov::op::v0::Constant::create(ov::element::i64, Shape{1}, {half_head_size}); - Output cos_factor = - std::make_shared(std::make_shared(factor_node, inp_pos)); - Output sin_factor = - std::make_shared(std::make_shared(factor_node, inp_pos)); - - float mscale = attn_factor; - Output mscale_node = - std::make_shared(ov::element::f32, ov::Shape{}, std::vector{mscale}); - - auto cos_theta = std::make_shared(cos_factor, mscale_node); - auto sin_theta = std::make_shared(sin_factor, mscale_node); - cos_theta->set_friendly_name("rope_cos"); - sin_theta->set_friendly_name("rope_sin"); - tensor_map.insert({"rope_cos", cos_theta->output(0)}); - tensor_map.insert({"rope_sin", sin_theta->output(0)}); + cos_theta.get_node_shared_ptr()->set_friendly_name("rope_cos"); + sin_theta.get_node_shared_ptr()->set_friendly_name("rope_sin"); + tensor_map.insert({"rope_cos", cos_theta}); + tensor_map.insert({"rope_sin", sin_theta}); } // Create common patterns @@ -220,10 +169,12 @@ void preprocess(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) { } // namespace TranslateSession::TranslateSession(const frontend::InputModel::Ptr& input_model, - const std::unordered_map& translator_map) - : m_input_model(input_model), - m_translator_map(translator_map), - m_ov_model(nullptr) {} + const std::unordered_map& translator_map, + bool naive) : + m_input_model(input_model), + m_translator_map(translator_map), + m_ov_model(nullptr), + m_naive(naive) {} std::shared_ptr TranslateSession::get_converted_model() { if (m_ov_model) { @@ -258,6 +209,10 @@ std::shared_ptr TranslateSession::translate_graph(const frontend::InputMo auto node_visitor = [&](std::shared_ptr node) { auto operation_type = node->get_op_type(); + if (operation_type == "GGML_OP_NONE") { + return; + } + ov::OutputVector converted_outputs; auto it = m_translator_map.find(operation_type); FRONT_END_OP_CONVERSION_CHECK(it != m_translator_map.end(), @@ -285,7 +240,9 @@ std::shared_ptr TranslateSession::translate_graph(const frontend::InputMo } }; - preprocess(*tensor_map, *ggml_model_decoder); + if (!m_naive) { + preprocess(*tensor_map, *ggml_model_decoder); + } ggml_model_decoder->visit_subgraph(node_visitor); for (const auto& name : ggml_model_decoder->get_model_output_names()) { diff --git a/ggml/src/ggml-openvino/openvino/translate_session.hpp b/ggml/src/ggml-openvino/openvino/translate_session.hpp index 9167b55fe5..9eea5fd11c 100644 --- a/ggml/src/ggml-openvino/openvino/translate_session.hpp +++ b/ggml/src/ggml-openvino/openvino/translate_session.hpp @@ -10,7 +10,7 @@ namespace ggml { class TranslateSession { public: TranslateSession(const frontend::InputModel::Ptr& input_model, - const std::unordered_map& translator_map); + const std::unordered_map& translator_map, bool naive = false); std::shared_ptr get_converted_model(); std::shared_ptr translate_graph(const frontend::InputModel::Ptr& input_model); @@ -20,6 +20,7 @@ private: const frontend::InputModel::Ptr m_input_model; const std::unordered_map& m_translator_map; std::shared_ptr m_ov_model; + bool m_naive; }; } // namespace ggml 
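The `naive` flag threaded through the constructor above lets one translation pipeline serve both full decoder cgraphs and the single-operator graphs used for testing: when it is set, `translate_graph()` runs without `preprocess()`, so no shared rope_cos/rope_sin tables or other common patterns are synthesized. A minimal sketch of the intended call path follows; the driver function is illustrative only, and the translator map's value type (written as `CreatorFunction`) is an assumption, since this patch text elides template arguments:

    // Sketch only: route single-node graphs through the naive translation path.
    // Assumption: the translator map is keyed by the GGML op-type string and its
    // value type is the frontend's CreatorFunction.
    #include <memory>
    #include <string>
    #include <unordered_map>
    #include "translate_session.hpp"

    std::shared_ptr<ov::Model> convert_cgraph(
        const ov::frontend::InputModel::Ptr& input_model,
        const std::unordered_map<std::string, ov::frontend::ggml::CreatorFunction>& translators,
        bool single_node_graph) {
        // naive == true: translate_graph() converts the graph exactly as given,
        // with no KV-cache or RoPE preprocessing applied.
        ov::frontend::ggml::TranslateSession session(input_model, translators,
                                                     /*naive=*/single_node_graph);
        return session.get_converted_model();
    }

The `naive_compute()` helper added to ggml-openvino/utils.cpp further down in this patch follows the same shape, then copies each OpenVINO output back into the destination ggml tensors.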
diff --git a/ggml/src/ggml-openvino/openvino/utils.cpp b/ggml/src/ggml-openvino/openvino/utils.cpp index 69e26f05ca..9634900753 100644 --- a/ggml/src/ggml-openvino/openvino/utils.cpp +++ b/ggml/src/ggml-openvino/openvino/utils.cpp @@ -1,9 +1,20 @@ #include "utils.hpp" +#include #include #include +#include +#include +#include +#include +#include #include +#include +#include #include +#include +#include +#include #include namespace ov { @@ -58,6 +69,134 @@ OutputVector rename_outputs_with_suffix(const OutputVector& outputs, const std:: return outputs; } +namespace { +ov::Output rope_yarn_ramp_mix(int n_dims, const float corr_dims[2], float ext_factor) { + int half_n_dims = n_dims / 2; + std::vector dim_ids_vec(half_n_dims); + std::iota(dim_ids_vec.begin(), dim_ids_vec.end(), 0); + auto dim_ids = ov::op::v0::Constant::create(ov::element::f32, Shape{1, 1, (size_t) half_n_dims}, dim_ids_vec); + auto corr_low = ov::op::v0::Constant::create(ov::element::f32, Shape{1, 1, 1}, {corr_dims[0]}); + auto corr_high = ov::op::v0::Constant::create(ov::element::f32, Shape{1, 1, 1}, {corr_dims[1]}); + auto denom = + std::make_shared(std::make_shared(corr_high, corr_low), + ov::op::v0::Constant::create(ov::element::f32, Shape{1, 1, 1}, {0.001f})); + auto ramp_y = + std::make_shared(std::make_shared(dim_ids, corr_low), denom); + auto ramp_clamped = std::make_shared(ramp_y, 0.0f, 1.0f); + auto ext_factor_node = ov::op::v0::Constant::create(ov::element::f32, Shape{}, {ext_factor}); + auto ramp_mix = std::make_shared(ramp_clamped, ext_factor_node); + return ramp_mix; +} + +float ggml_rope_yarn_corr_dim(int n_dims, int n_ctx_orig, float n_rot, float base) { +#ifndef M_PI +# define M_PI 3.14159265358979323846 +#endif + return n_dims * logf(n_ctx_orig / (n_rot * 2 * (float) M_PI)) / (2 * logf(base)); +} + +void ggml_rope_yarn_corr_dims(int n_dims, + int n_ctx_orig, + float freq_base, + float beta_fast, + float beta_slow, + float dims[2]) { + float start = floorf(ggml_rope_yarn_corr_dim(n_dims, n_ctx_orig, beta_fast, freq_base)); + float end = ceilf(ggml_rope_yarn_corr_dim(n_dims, n_ctx_orig, beta_slow, freq_base)); + dims[0] = std::max(0.0f, start); + dims[1] = std::min(static_cast(n_dims - 1), end); +} +} // namespace + +std::pair, ov::Output> make_sin_cos(int32_t* rope_params, + std::shared_ptr inp_pos, + std::shared_ptr rope_freqs_weight) { + inp_pos = std::make_shared(inp_pos, ov::element::f32); + auto pos_perm = + std::make_shared(ov::element::i64, ov::Shape{3}, std::vector{2, 1, 0}); + inp_pos = std::make_shared(inp_pos, pos_perm); + + float freq_base; + float freq_scale; + float ext_factor; + float attn_factor; + float beta_fast; + float beta_slow; + const int n_dims = rope_params[1]; + const int n_ctx_orig = rope_params[4]; + memcpy(&freq_base, rope_params + 5, sizeof(float)); + memcpy(&freq_scale, rope_params + 6, sizeof(float)); + memcpy(&ext_factor, rope_params + 7, sizeof(float)); + memcpy(&attn_factor, rope_params + 8, sizeof(float)); + memcpy(&beta_fast, rope_params + 9, sizeof(float)); + memcpy(&beta_slow, rope_params + 10, sizeof(float)); + + const float theta_scale = powf(freq_base, -2.0f / n_dims); + + float corr_dims[2]; + ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims); + + std::vector factor(n_dims / 2); + factor[0] = freq_scale; + for (size_t i = 1; i < factor.size(); i++) { + factor[i] = theta_scale * factor[i - 1]; + } + + Output freq_factors = + std::make_shared(ov::element::f32, ov::Shape{1, 1, factor.size()}, factor); + if (rope_freqs_weight) { + 
freq_factors = std::make_shared(freq_factors, rope_freqs_weight); + } + + auto theta_extrap = std::make_shared(freq_factors, inp_pos); + auto theta_interp = std::make_shared( + theta_extrap, ov::op::v0::Constant::create(ov::element::f32, {1}, {freq_scale})); + + Output theta; + float mscale = attn_factor; + if (ext_factor == 0.0f) { + theta = theta_interp; + } else { + auto ramp_mix = rope_yarn_ramp_mix(n_dims, corr_dims, ext_factor); + auto one = ov::op::v0::Constant::create(ov::element::f32, Shape{1, 1, 1}, {1.0f}); + auto one_minus_ramp = std::make_shared(one, ramp_mix); + + theta = std::make_shared(std::make_shared(theta_interp, one_minus_ramp), + std::make_shared(theta_extrap, ramp_mix)); + mscale *= (1.0f + 0.1f * std::log(1.0f / freq_scale)); + } + + Output cos_theta = std::make_shared(theta); + Output sin_theta = std::make_shared(theta); + + auto mscale_node = ov::op::v0::Constant::create(ov::element::f32, Shape{}, {mscale}); + + cos_theta = std::make_shared(cos_theta, mscale_node); + sin_theta = std::make_shared(sin_theta, mscale_node); + return std::make_pair(sin_theta, cos_theta); +} + +ov::Output process_view_input(const NodeContext& context, int input_index, int slice_len) { + // Only works for VIEW operations that slice at the lowest dimension + // If the VIEW also reshape the result, `slice_len` should be provided + auto input = context.get_input(input_index); + int32_t* op_params = context.get_input_op_params(input_index); + auto src1_stride = context.get_input_stride(input_index); + + int64_t split_addr = op_params[0] / src1_stride[2]; + if (slice_len == 0) { + slice_len = context.get_input_shape(input_index)[2].get_length(); + } + int64_t slice_end = split_addr + slice_len; + + auto begin = ov::op::v0::Constant::create(ov::element::i64, {1}, {split_addr}); + auto end = ov::op::v0::Constant::create(ov::element::i64, {1}, {slice_end}); + auto stride = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); + auto axes = ov::op::v0::Constant::create(ov::element::i64, {1}, {2}); + auto sliced = std::make_shared(input, begin, end, stride, axes); + return sliced; +} + } // namespace ggml } // namespace frontend } // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/utils.hpp b/ggml/src/ggml-openvino/openvino/utils.hpp index b54b2b92c9..6c6d2ae8d4 100644 --- a/ggml/src/ggml-openvino/openvino/utils.hpp +++ b/ggml/src/ggml-openvino/openvino/utils.hpp @@ -1,6 +1,10 @@ #pragma once +#include +#include #include +#include +#include #include "node_context.hpp" @@ -60,6 +64,12 @@ std::shared_ptr get_dimensions(const std::shared_ptr& node, OutputVector rename_outputs_with_suffix(const OutputVector& outputs, const std::string& suffix); +std::pair, ov::Output> make_sin_cos(int32_t* rope_params, + std::shared_ptr inp_pos, + std::shared_ptr rope_freqs_weight = nullptr); + +ov::Output process_view_input(const NodeContext& context, int input_index, int slice_len = 0); + namespace op { template OutputVector translate_1to1_match_2_inputs(const NodeContext& context) { diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index e5a4401fec..fcfd3639a7 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -21,6 +21,7 @@ #include #include "ggml-impl.h" +#include "ggml-openvino/ggml-decoder.h" #include "ggml.h" #include "openvino/frontend.hpp" #include "openvino/input_model.hpp" @@ -35,6 +36,9 @@ ov::Tensor convert_ggml_input_to_ov(std::shared_ptr ggml_decoder, ov::Shape input_shape; if (name.find("cache_k") == 0 || 
name.find("cache_v") == 0) { input_shape = ggml_decoder->get_graph_input_shape(ggml_tensor).to_shape(); + } else if (ggml_tensor->op == GGML_OP_VIEW) { + // This case is added to make test-backend-ops work + input_shape = ggml_decoder->get_graph_input_shape(ggml_tensor->view_src).to_shape(); } else { input_shape = ggml_decoder->get_input_shape(name).to_shape(); } @@ -81,6 +85,10 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c config = get_npu_config(); } + if (cgraph->n_nodes == 1) { + return naive_compute(cgraph, core, device, config); + } + auto start_time = ggml_time_us(); auto* cache_dir = getenv("GGML_OPENVINO_CACHE_DIR"); @@ -242,6 +250,42 @@ ov::AnyMap get_npu_config() { return config; } +enum ggml_status naive_compute(struct ggml_cgraph* cgraph, + ov::Core& core, + const std::string& device, + const ov::AnyMap& config) { + if (cgraph->nodes[0]->op == GGML_OP_NONE) { + return GGML_STATUS_SUCCESS; + } + + auto decoder = std::make_shared(cgraph); + auto input_model = std::make_shared(decoder); + auto naive = true; + auto model = ov::frontend::ggml::FrontEnd::convert(input_model, naive); + auto infer_request = core.compile_model(model, device, config).create_infer_request(); + + ov::serialize(model, "IR.xml"); + + auto ov_params = model->get_parameters(); + for (size_t i = 0; i < ov_params.size(); i++) { + auto param_name = ov_params[i]->get_friendly_name(); + auto input_tensor = get_ov_input_tensor(decoder, param_name); + infer_request.set_input_tensor(i, input_tensor); + } + + infer_request.infer(); + + auto gguf_tensor_addrs = get_ggml_graph_output_dst(decoder); + auto ov_results = model->get_results(); + for (size_t i = 0; i < ov_results.size(); i++) { + auto result_name = ov_results[i]->get_friendly_name(); + const auto output_tensor = infer_request.get_output_tensor(i); + + std::memcpy(gguf_tensor_addrs[result_name], output_tensor.data(), output_tensor.get_byte_size()); + } + return GGML_STATUS_SUCCESS; +} + ov::Tensor get_ov_input_tensor(std::shared_ptr ggml_decoder, const std::string& param_name) { bool is_static = ggml_decoder->is_static(); bool is_first_token = ggml_decoder->is_first_token(); diff --git a/ggml/src/ggml-openvino/utils.h b/ggml/src/ggml-openvino/utils.h index 1d23e28522..367b2829be 100644 --- a/ggml/src/ggml-openvino/utils.h +++ b/ggml/src/ggml-openvino/utils.h @@ -1,4 +1,5 @@ #include +#include #include "ggml-backend-impl.h" #include "ggml-decoder.h" @@ -42,3 +43,6 @@ bool is_prefill(struct ggml_cgraph * cgraph); ov::AnyMap get_npu_config(); ov::Tensor get_ov_input_tensor(std::shared_ptr ggml_decoder, const std::string& param_name); + +enum ggml_status naive_compute(struct ggml_cgraph* cgraph, ov::Core& core, const std::string& device, + const ov::AnyMap& config); From 6dc4b90635674e3a19402acb6828b90efdcc5a4a Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Wed, 23 Jul 2025 15:37:58 +0800 Subject: [PATCH 096/254] Fix NPU --- ggml/src/ggml-openvino/.clang-format | 2 +- ggml/src/ggml-openvino/ggml-decoder.cpp | 76 ++++++++++++------------- ggml/src/ggml-openvino/ggml-decoder.h | 14 +++-- ggml/src/ggml-openvino/utils.cpp | 16 +++--- 4 files changed, 56 insertions(+), 52 deletions(-) diff --git a/ggml/src/ggml-openvino/.clang-format b/ggml/src/ggml-openvino/.clang-format index d631bc6c01..18280772b6 100644 --- a/ggml/src/ggml-openvino/.clang-format +++ b/ggml/src/ggml-openvino/.clang-format @@ -7,7 +7,6 @@ PointerAlignment: Left Cpp11BracedListStyle: true AccessModifierOffset: -4 BinPackArguments: false -BinPackParameters: false 
BreakBeforeBraces: Attach Language: Cpp @@ -31,6 +30,7 @@ AllowShortIfStatementsOnASingleLine: Never AllowShortLambdasOnASingleLine: Inline AllowShortLoopsOnASingleLine: false AlwaysBreakBeforeMultilineStrings: true +BinPackParameters: true BitFieldColonSpacing: Both # BreakAdjacentStringLiterals: true BreakAfterAttributes: Never diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 8ce9354c69..b233ff8ebd 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -31,47 +31,45 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgraph, bool is_static, bool is_first_token, int context_size, int num_heads, int num_heads_kv, int head_size) : - GgmlOvDecoder::GgmlOvDecoder(node, cgraph, is_static, is_first_token) { - m_context_size = context_size; - m_num_heads = num_heads; - m_num_heads_kv = num_heads_kv; - m_head_size = head_size; -} - -GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgraph, bool is_static, - bool is_first_token) : m_cgraph(cgraph), m_node(node), - m_op_name(m_node ? std::string(m_node->name) : "NONE_OP"), + m_op_name(std::string(node->name)), + m_context_size(context_size), + m_num_heads(num_heads), + m_num_heads_kv(num_heads_kv), + m_head_size(head_size), m_is_static(is_static), m_is_first_token(is_first_token) { - if (m_node) { - set_input_output(m_node); - } else { - if (is_first_token && getenv("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS")) { - print_tensor_address_map(cgraph); - } + set_input_output(node); +} - if (getenv("GGML_OPENVINO_DUMP_CGRAPH")) { - auto timestamp = (long long) ggml_time_us(); - std::string filename = "cgraph_" + std::to_string(timestamp) + ".txt"; - dump_cgraph(cgraph, filename); - } - - set_llm_params(); - - if (is_first_token) { - add_weight_const_parallel(m_model_weights); - } - - for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) { - auto* cur_node = cgraph->nodes[node_n]; - m_nodes.push_back(cur_node); - set_input_output(cur_node); - } - - add_extra_inputs(); +GgmlOvDecoder::GgmlOvDecoder(struct ggml_cgraph* cgraph, + std::map>& model_weights, bool is_static, + bool is_first_token) : + m_cgraph(cgraph), + m_op_name(m_node ? 
std::string(m_node->name) : ""), + m_model_weights(model_weights), + m_is_static(is_static), + m_is_first_token(is_first_token) { + if (is_first_token && getenv("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS")) { + print_tensor_address_map(cgraph); } + + if (getenv("GGML_OPENVINO_DUMP_CGRAPH")) { + auto timestamp = (long long) ggml_time_us(); + std::string filename = "cgraph_" + std::to_string(timestamp) + ".txt"; + dump_cgraph(cgraph, filename); + } + + set_llm_params(); + + for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) { + auto* cur_node = cgraph->nodes[node_n]; + m_nodes.push_back(cur_node); + set_input_output(cur_node); + } + + add_extra_inputs(); } GgmlOvDecoder::GgmlOvDecoder(struct ggml_cgraph* cgraph) { @@ -334,10 +332,11 @@ std::map GgmlOvDecoder::get_kv_param_res_names() const return kv_param_res_names; } -void GgmlOvDecoder::add_weight_const_parallel(std::map>& model_weights) { +std::map> GgmlOvDecoder::create_weight_nodes(struct ggml_cgraph* cgraph) { + std::map> model_weights; static std::mutex weights_mutex; - auto* nodes = m_cgraph->nodes; - auto n_nodes = m_cgraph->n_nodes; + auto* nodes = cgraph->nodes; + auto n_nodes = cgraph->n_nodes; std::for_each(std::execution::par, nodes, nodes + n_nodes, [&](ggml_tensor* node) { for (int i = 0; i < GGML_MAX_SRC; i++) { auto* src = node->src[i]; @@ -369,6 +368,7 @@ void GgmlOvDecoder::add_weight_const_parallel(std::map GgmlOvDecoder::create_weight_node(ggml_tensor* tensor) { diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index f4fe9c402d..78422afaf7 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -11,12 +11,17 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { public: - GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgraph, bool is_static, bool is_first_token); + // Graph decoder + GgmlOvDecoder(struct ggml_cgraph* cgraph, std::map>& model_weights, + bool is_static, bool is_first_token); + + // Node decoder, called in GgmlOvDecoder::visit_subgraph GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgraph, bool is_static, bool is_first_token, int context_size, int num_heads, int num_heads_kv, int head_size); - // Naive decoder + // Naive graph decoder GgmlOvDecoder(struct ggml_cgraph* cgraph); + virtual ov::Any get_attribute(const std::string& name) const override { return nullptr; GGML_UNUSED(name); @@ -110,6 +115,8 @@ public: ov::PartialShape get_graph_input_shape(const ggml_tensor* src) const; + static std::shared_ptr create_weight_node(ggml_tensor* tensor); + static std::map> create_weight_nodes(struct ggml_cgraph* cgraph); void clear_model_weights() { m_model_weights.clear(); } private: @@ -123,9 +130,6 @@ private: // set context_size, num_heads, etc void set_llm_params(); - static std::shared_ptr create_weight_node(ggml_tensor* tensor); - void add_weight_const_parallel(std::map>& model_weights); - struct ggml_cgraph* m_cgraph = nullptr; ggml_tensor* m_node = nullptr; std::vector m_nodes; diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index fcfd3639a7..be06c54e8b 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -26,10 +26,6 @@ #include "openvino/frontend.hpp" #include "openvino/input_model.hpp" -std::shared_ptr get_ggml_decoder(struct ggml_cgraph* cgraph, bool is_static, bool is_first_token) { - return std::make_shared(nullptr, cgraph, is_static, is_first_token); -} - ov::Tensor convert_ggml_input_to_ov(std::shared_ptr 
ggml_decoder, const std::string& name) { const auto* ggml_tensor = ggml_decoder->get_input_ggml_tensor(name); auto* input_data = ggml_tensor->data; @@ -111,7 +107,8 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c auto it = infer_request_cache.find(cgraph); if (it != infer_request_cache.end()) { - ggml_decoder = get_ggml_decoder(cgraph, is_static, false); + std::map> model_weights; + ggml_decoder = std::make_shared(cgraph, model_weights, is_static, false); decoder_end_time = ggml_time_us(); // For NPU for the first time we call kvcache modle, pop the compiled kvcache model from cache @@ -126,17 +123,20 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c compile_end_time = conversion_end_time; } else { std::shared_ptr model; + auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph); if (is_static) { - ggml_decoder = get_ggml_decoder(cgraph, is_static, true); - auto ggml_decoder_kvcache = get_ggml_decoder(cgraph, is_static, false); + ggml_decoder = std::make_shared(cgraph, model_weights, is_static, true); + auto ggml_decoder_kvcache = std::make_shared(cgraph, model_weights, is_static, false); decoder_end_time = ggml_time_us(); auto input_model = std::make_shared(ggml_decoder); auto input_model_kvcache = std::make_shared(ggml_decoder_kvcache); model = ov::frontend::ggml::FrontEnd::convert(input_model); + ggml_decoder->clear_model_weights(); auto model_kvcache = ov::frontend::ggml::FrontEnd::convert(input_model_kvcache); + ggml_decoder_kvcache->clear_model_weights(); conversion_end_time = ggml_time_us(); auto compiled_model = core.compile_model(model, device, config); @@ -157,7 +157,7 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c ov::serialize(model_kvcache, timestamped_filename); } } else { - ggml_decoder = get_ggml_decoder(cgraph, is_static, true); + ggml_decoder = std::make_shared(cgraph, model_weights, is_static, true); decoder_end_time = ggml_time_us(); auto input_model = std::make_shared(ggml_decoder); From 75eec6265f3b44f43d6b7b46def367bf86513a10 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Thu, 24 Jul 2025 11:56:25 +0800 Subject: [PATCH 097/254] Fix llama-bench; Clang-format --- ggml/src/ggml-openvino/.clang-format | 4 +- ggml/src/ggml-openvino/ggml-decoder.cpp | 67 +++++++++++------------- ggml/src/ggml-openvino/ggml-openvino.cpp | 53 +++++++++---------- 3 files changed, 58 insertions(+), 66 deletions(-) diff --git a/ggml/src/ggml-openvino/.clang-format b/ggml/src/ggml-openvino/.clang-format index 18280772b6..63dc2c472a 100644 --- a/ggml/src/ggml-openvino/.clang-format +++ b/ggml/src/ggml-openvino/.clang-format @@ -8,6 +8,8 @@ Cpp11BracedListStyle: true AccessModifierOffset: -4 BinPackArguments: false BreakBeforeBraces: Attach +IndentCaseBlocks: false +IndentCaseLabels: false Language: Cpp AlignAfterOpenBracket: Align @@ -68,8 +70,6 @@ IncludeCategories: IncludeIsMainRegex: '([-_](test|unittest))?$' IncludeIsMainSourceRegex: '' IndentAccessModifiers: false -IndentCaseBlocks: true -IndentCaseLabels: true IndentExternBlock: NoIndent IndentGotoLabels: false IndentPPDirectives: AfterHash diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index b233ff8ebd..3dc2a3eeac 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -176,7 +176,7 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, bool naive) { break; } case GGML_OP_CPY: { - if (ggml_is_contiguous(node)) { + if 
(std::string(node->src[1]->name).find("cache_k") == 0) { // Write K to cache_k m_op_case = 1; } else { @@ -184,7 +184,7 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, bool naive) { m_op_case = 2; } break; - } + } case GGML_OP_PERMUTE: { if (node->src[0]->view_src == nullptr) { // Permute Qcur @@ -198,23 +198,21 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, bool naive) { } break; } - case GGML_OP_GET_ROWS: - { - if (node->src[1]->op == GGML_OP_VIEW) { - m_op_case = 2; - } else { - m_op_case = 1; - } - break; + case GGML_OP_GET_ROWS: { + if (node->src[1]->op == GGML_OP_VIEW) { + m_op_case = 2; + } else { + m_op_case = 1; } - case GGML_OP_ROPE: - { - if (node->src[0]->op == GGML_OP_VIEW) { - m_op_case = 2; - } else { - m_op_case = 1; - } + break; + } + case GGML_OP_ROPE: { + if (node->src[0]->op == GGML_OP_VIEW) { + m_op_case = 2; + } else { + m_op_case = 1; } + } default: break; } @@ -405,17 +403,16 @@ std::shared_ptr GgmlOvDecoder::create_weight_node(ggml_tensor* tensor) weight_node = std::make_shared(node_type, node_shape, data_f16); break; } - case GGML_TYPE_BF16: - { - const auto* ptr = reinterpret_cast(tensor->data); - std::vector data_bf16; - data_bf16.reserve(ne_total); - for (int i = 0; i < ne_total; ++i) { - data_bf16.push_back(ov::bfloat16::from_bits(ptr[i])); - } - weight_node = std::make_shared(node_type, node_shape, data_bf16); - break; + case GGML_TYPE_BF16: { + const auto* ptr = reinterpret_cast(tensor->data); + std::vector data_bf16; + data_bf16.reserve(ne_total); + for (int i = 0; i < ne_total; ++i) { + data_bf16.push_back(ov::bfloat16::from_bits(ptr[i])); } + weight_node = std::make_shared(node_type, node_shape, data_bf16); + break; + } default: throw std::invalid_argument("Unsupported tensor type"); } @@ -614,8 +611,8 @@ int32_t* GgmlOvDecoder::get_output_op_params(const std::string& name) const { void GgmlOvDecoder::visit_subgraph(std::function)> node_visitor) const { for (const auto& node : m_nodes) { - auto decoder = std::make_shared(node, m_cgraph, m_is_static, m_is_first_token, m_context_size, - m_num_heads, m_num_heads_kv, m_head_size); + auto decoder = std::make_shared( + node, m_cgraph, m_is_static, m_is_first_token, m_context_size, m_num_heads, m_num_heads_kv, m_head_size); node_visitor(decoder); } } @@ -667,12 +664,12 @@ const std::string& GgmlOvDecoder::get_op_type() const { }; switch (m_node->op) { - case GGML_OP_UNARY: - return unary_ops.at(ggml_get_unary_op(m_node)); - case GGML_OP_GLU: - return glu_ops.at(ggml_get_glu_op(m_node)); - default: - return ops.at(m_node->op); + case GGML_OP_UNARY: + return unary_ops.at(ggml_get_unary_op(m_node)); + case GGML_OP_GLU: + return glu_ops.at(ggml_get_glu_op(m_node)); + default: + return ops.at(m_node->op); } static const std::string unknown_op = "UNKNOWN_GGML_OP"; return unknown_op; diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp index 2bc9d5199c..7edd4667d9 100644 --- a/ggml/src/ggml-openvino/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino.cpp @@ -309,7 +309,7 @@ static bool is_op_unsupported_case(const ggml_tensor* op) { return false; } -static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) { +static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor* op) { GGML_ASSERT(dev->reg != nullptr); static const std::set supported_types{ @@ -327,34 +327,29 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con }; switch (op->op) { 
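+    // UNARY and GLU ops carry their concrete kind in the tensor itself
+    // (ggml_get_unary_op / ggml_get_glu_op), so they are validated against
+    // dedicated tables; every other op is looked up in supported_ops, and the
+    // operand data types are checked against supported_types afterwards.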
- case GGML_OP_UNARY: - { - auto supported = supported_unary_ops.find(ggml_get_unary_op(op)) != supported_unary_ops.end(); - if (!supported) { - GGML_LOG_WARN("OpenVINO backend does not support unary op %s\n", - ggml_unary_op_name(ggml_get_unary_op(op))); - return false; - } - break; - } - case GGML_OP_GLU: - { - auto supported = supported_glu_ops.find(ggml_get_glu_op(op)) != supported_glu_ops.end(); - if (!supported) { - GGML_LOG_WARN("OpenVINO backend does not support GLU op %s\n", - ggml_glu_op_name(ggml_get_glu_op(op))); - return false; - } - break; - } - default: - { - auto supported = supported_ops.find(op->op) != supported_ops.end(); - if (!supported) { - GGML_LOG_WARN("OpenVINO backend does not support op %s\n", ggml_op_name(op->op)); - return false; - } - } + case GGML_OP_UNARY: { + auto supported = supported_unary_ops.find(ggml_get_unary_op(op)) != supported_unary_ops.end(); + if (!supported) { + GGML_LOG_WARN("OpenVINO backend does not support unary op %s\n", ggml_unary_op_name(ggml_get_unary_op(op))); + return false; + } + break; + } + case GGML_OP_GLU: { + auto supported = supported_glu_ops.find(ggml_get_glu_op(op)) != supported_glu_ops.end(); + if (!supported) { + GGML_LOG_WARN("OpenVINO backend does not support GLU op %s\n", ggml_glu_op_name(ggml_get_glu_op(op))); + return false; + } + break; + } + default: { + auto supported = supported_ops.find(op->op) != supported_ops.end(); + if (!supported) { + GGML_LOG_WARN("OpenVINO backend does not support op %s\n", ggml_op_name(op->op)); + return false; + } + } } if (supported_types.find(op->type) == supported_types.end()) { From 4e7f04a307158d882fd8f5e63bc8c8f7f3bd885c Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Thu, 24 Jul 2025 17:44:32 +0800 Subject: [PATCH 098/254] Fix llama-perplexity --- ggml/src/ggml-openvino/ggml-decoder.cpp | 65 ++++++++++++------- .../openvino/translate_session.cpp | 53 +++++++-------- ggml/src/ggml-openvino/utils.cpp | 9 ++- 3 files changed, 70 insertions(+), 57 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 3dc2a3eeac..b43f45dbbd 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -236,8 +236,9 @@ void GgmlOvDecoder::set_llm_params() { } ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor* src) const { + auto name = std::string(src->name); ov::PartialShape input_shape; - if (std::string(src->name) == "inp_tokens" || std::string(src->name) == "inp_pos") { + if (name == "inp_tokens" || name == "inp_pos") { if (m_is_static) { if (m_is_first_token) { input_shape = ov::PartialShape{1, 1, m_context_size}; @@ -247,7 +248,9 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor* src) co } else { input_shape = ov::PartialShape{1, 1, ov::Dimension(1, m_context_size)}; } - } else if (std::string(src->name) == "KQ_mask") { + } else if (name == "inp_out_ids" && !m_is_static) { + input_shape = ov::PartialShape{1, 1, ov::Dimension(1, m_context_size)}; + } else if (name == "KQ_mask") { if (m_is_static) { if (m_is_first_token) { input_shape = ov::PartialShape{1, m_context_size, m_context_size}; @@ -258,9 +261,9 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor* src) co auto max_mask_size = GGML_PAD(m_context_size, GGML_KQ_MASK_PAD); input_shape = ov::PartialShape{1, ov::Dimension(1, max_mask_size), ov::Dimension(1, max_mask_size)}; } - } else if (std::string(src->name).find("cache_k") == 0) { + } else if (name.find("cache_k") == 0) { 
input_shape = ov::PartialShape{m_context_size, m_num_heads_kv, m_head_size}; - } else if (std::string(src->name).find("cache_v") == 0) { + } else if (name.find("cache_v") == 0) { input_shape = ov::PartialShape{m_num_heads_kv, m_head_size, m_context_size}; } else if (src->op == GGML_OP_VIEW) { // This case is added to make test-backend-ops work @@ -273,18 +276,22 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor* src) co void GgmlOvDecoder::add_extra_inputs() { // Extra inputs: - // 1. `attention_size`, used in matmul's in the attention block. The shape of those matmul's are 32 aligned, - // see llama_kv_cache_unified::get_n_kv and llama_kv_cache_unified::get_padding. - // Not used for NPU + // 1. `past_token_len`, used to create indices for updating kv cache. Usually equal to inp_pos[0], except for + // llama-perplexity. + // 2. `attention_size`, used in matmul's in the attention block. The shape of those matmul's are 32 aligned, + // see llama_kv_cache_unified::get_n_kv and llama_kv_cache_unified::get_padding. + // Not used for NPU + int64_t past_token_len = -1; int64_t attention_size = -1; - int64_t past_token_len = -1; + int64_t token_len = -1; int64_t past_token_len_from_inp_pos = -1; for (const auto& node : m_nodes) { if (node->op == GGML_OP_ROPE && std::string(node->src[1]->name) == "inp_pos") { if (node->src[1]->type != GGML_TYPE_I32) { throw std::runtime_error("Expected cgraph input `inp_pos` to be of type GGML_TYPE_I32"); } + token_len = node->src[1]->ne[0]; past_token_len_from_inp_pos = ((int32_t*) (node->src[1]->data))[0]; } if (node->op == GGML_OP_CPY && ggml_is_contiguous(node)) { @@ -294,29 +301,39 @@ void GgmlOvDecoder::add_extra_inputs() { break; } } + if (past_token_len == -1) { throw std::runtime_error("Failed to find input \"cache_k\" in the graph"); } if (past_token_len != past_token_len_from_inp_pos) { - throw std::runtime_error("Mismatch between past_token_len from cache_k and inp_pos: " + - std::to_string(past_token_len) + " vs " + std::to_string(past_token_len_from_inp_pos)); + GGML_LOG_DEBUG("Mismatch between past_token_len from cache_k and inp_pos: %ld vs %ld\n", + past_token_len, + past_token_len_from_inp_pos); } - for (const auto& node : m_nodes) { - if (node->src[1] && std::string(node->src[1]->name).find("inp_tokens") == 0) { - int64_t total_token_len = node->src[1]->ne[0] + past_token_len; - attention_size = GGML_PAD(total_token_len, 32); - std::string name = "attention_size"; - auto param_node = std::make_shared(ov::element::i64, ov::Shape{1}); - param_node->set_friendly_name(name); - param_node->output(0).get_tensor().set_names({name}); - m_model_extra_inputs[name] = param_node; + { + std::string name = "past_token_len"; + auto param_node = std::make_shared(ov::element::i64, ov::Shape{1}); + param_node->set_friendly_name(name); + param_node->output(0).get_tensor().set_names({name}); + m_model_extra_inputs[name] = param_node; - auto tensor = std::make_shared(ov::element::i64, ov::Shape{1}); - *tensor->data() = attention_size; - m_model_extra_input_values[name] = tensor; - break; - } + auto tensor = std::make_shared(ov::element::i64, ov::Shape{1}); + *tensor->data() = past_token_len; + m_model_extra_input_values[name] = tensor; + } + { + int64_t total_token_len = token_len + past_token_len; + attention_size = GGML_PAD(total_token_len, 32); + std::string name = "attention_size"; + auto param_node = std::make_shared(ov::element::i64, ov::Shape{1}); + param_node->set_friendly_name(name); + 
param_node->output(0).get_tensor().set_names({name}); + m_model_extra_inputs[name] = param_node; + + auto tensor = std::make_shared(ov::element::i64, ov::Shape{1}); + *tensor->data() = attention_size; + m_model_extra_input_values[name] = tensor; } } diff --git a/ggml/src/ggml-openvino/openvino/translate_session.cpp b/ggml/src/ggml-openvino/openvino/translate_session.cpp index 129c3592c9..83581ec5a8 100644 --- a/ggml/src/ggml-openvino/openvino/translate_session.cpp +++ b/ggml/src/ggml-openvino/openvino/translate_session.cpp @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -78,11 +79,11 @@ void add_kv_update_indices(TensorMap& tensor_map, GgmlDecoder& ggml_model_decode // cache_k layout: [S, N, H] (seq, num_heads, head_size) // cache_v layout: [N, H, S] (num_heads, head_size, seq) // When writing to cache_v, cache should be reshaped to [N*H, S] and v-curr should be flattened - auto inp_pos = tensor_map.at("inp_pos").get_node_shared_ptr(); + auto past_token_len = tensor_map.at("past_token_len").get_node_shared_ptr(); auto token_len = tensor_map.at("token_len").get_node_shared_ptr(); - std::shared_ptr update_indices_k; - std::shared_ptr update_indices_v; + Output update_indices_k; + Output update_indices_v; auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); auto zero_scalar = ov::op::v0::Constant::create(ov::element::i64, {}, {0}); @@ -90,11 +91,19 @@ void add_kv_update_indices(TensorMap& tensor_map, GgmlDecoder& ggml_model_decode auto one_scalar = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {1}); auto two = ov::op::v0::Constant::create(ov::element::i64, {1}, {2}); - update_indices_k = - std::make_shared(inp_pos, ov::op::v0::Constant::create(ov::element::i64, {2}, {0, 1})); - update_indices_k = std::make_shared(update_indices_k, one); - update_indices_k->set_friendly_name("update_indices_k"); - tensor_map.insert({"update_indices_k", update_indices_k->output(0)}); + auto past_token_len_scalar = std::make_shared(past_token_len, zero); + auto token_len_scalar = std::make_shared(token_len, zero); + auto total_token_len_scalar = std::make_shared(past_token_len_scalar, token_len_scalar); + + Output update_indices = std::make_shared( + past_token_len_scalar, total_token_len_scalar, one_scalar, ov::element::i64); + if (ggml_model_decoder.is_static()) { + update_indices = past_token_len; + } + + update_indices_k = std::make_shared(update_indices, one); + update_indices_k.get_node_shared_ptr()->set_friendly_name("update_indices_k"); + tensor_map.insert({"update_indices_k", update_indices_k}); auto total_head_size = ggml_model_decoder.get_num_heads_kv() * ggml_model_decoder.get_head_size(); auto total_head_size_node = ov::op::v0::Constant::create(ov::element::i64, {1}, {total_head_size}); @@ -102,7 +111,7 @@ void add_kv_update_indices(TensorMap& tensor_map, GgmlDecoder& ggml_model_decode // 1D tensor of shape [total_head_size], values starting from 0 auto range_row = - std::make_shared(zero_scalar, total_head_size_scalar, one_scalar, ov::element::i32); + std::make_shared(zero_scalar, total_head_size_scalar, one_scalar, ov::element::i64); auto range_row_reshaped = std::make_shared(range_row, ov::op::v0::Constant::create(ov::element::i64, {2}, {1, 2})); auto row_indices = std::make_shared( @@ -110,8 +119,7 @@ void add_kv_update_indices(TensorMap& tensor_map, GgmlDecoder& ggml_model_decode std::make_shared(ov::OutputVector{total_head_size_node, token_len, one}, 0)); // 1D tensor of shape [token_len], values starting from past_token_len - 
auto range_col = - std::make_shared(inp_pos, ov::op::v0::Constant::create(ov::element::i64, {2}, {0, 1})); + auto range_col = update_indices; auto range_col_reshaped = std::make_shared(range_col, ov::op::v0::Constant::create(ov::element::i64, {2}, {0, 2})); auto col_indices = std::make_shared( @@ -119,26 +127,11 @@ void add_kv_update_indices(TensorMap& tensor_map, GgmlDecoder& ggml_model_decode std::make_shared(ov::OutputVector{total_head_size_node, token_len, one}, 0)); // Stack row_indices and col_indices along last axis: [total_head_size, token_len, 2] - auto indices = std::make_shared(OutputVector{row_indices, col_indices}, 2); + update_indices_v = std::make_shared(OutputVector{row_indices, col_indices}, 2); update_indices_v = std::make_shared( - indices, ov::op::v0::Constant::create(ov::element::i64, {2}, std::vector{-1, 2}), false); - update_indices_v->set_friendly_name("update_indices_v"); - tensor_map.insert({"update_indices_v", update_indices_v->output(0)}); -} - -float ggml_rope_yarn_corr_dim(int n_dims, int n_ctx_orig, float n_rot, float base) { -#ifndef M_PI -# define M_PI 3.14159265358979323846 -#endif - return n_dims * logf(n_ctx_orig / (n_rot * 2 * (float) M_PI)) / (2 * logf(base)); -} - -void ggml_rope_yarn_corr_dims(int n_dims, int n_ctx_orig, float freq_base, float beta_fast, float beta_slow, - float dims[2]) { - float start = floorf(ggml_rope_yarn_corr_dim(n_dims, n_ctx_orig, beta_fast, freq_base)); - float end = ceilf(ggml_rope_yarn_corr_dim(n_dims, n_ctx_orig, beta_slow, freq_base)); - dims[0] = std::max(0.0f, start); - dims[1] = std::min(static_cast(n_dims - 1), end); + update_indices_v, ov::op::v0::Constant::create(ov::element::i64, {2}, std::vector{-1, 2}), false); + update_indices_v.get_node_shared_ptr()->set_friendly_name("update_indices_v"); + tensor_map.insert({"update_indices_v", update_indices_v}); } void add_rope_sin_cos(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) { diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index be06c54e8b..45ed73499f 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -356,10 +356,13 @@ void print_input_tensor_info(const std::string& name, const ov::Tensor& tensor) std::cout << *(tensor.data()) << std::endl; break; case ov::element::f16: - std::cout << ov::float16::from_bits(*(tensor.data())) << std::endl; + std::cout << *(tensor.data()) << std::endl; break; case ov::element::i32: - std::cout << *(tensor.data()) << std::endl; + for (size_t i = 0; i < tensor.get_size(); ++i) { + std::cout << tensor.data()[i] << " "; + } + std::cout << std::endl; break; case ov::element::i64: std::cout << *(tensor.data()) << std::endl; @@ -379,7 +382,7 @@ void print_output_tensor_info(const std::string& name, const ov::Tensor& tensor, std::cout << checksum(tensor.data(), tensor.get_byte_size()) << std::endl; break; case ov::element::f16: - std::cout << ov::float16::from_bits(*(tensor.data())) << std::endl; + std::cout << *(tensor.data()) << std::endl; std::cout << checksum(tensor.data(), tensor.get_byte_size()) << std::endl; break; default: From 9cf56d6837a7d387b85b8b981916b8ba288ede71 Mon Sep 17 00:00:00 2001 From: Cavus Mustafa Date: Mon, 28 Jul 2025 17:14:20 -0700 Subject: [PATCH 099/254] temp. 
changes for mark decomp --- ggml/src/ggml-openvino/openvino/op/mulmat.cpp | 12 +++++++++++- .../src/ggml-openvino/openvino/translate_session.cpp | 2 ++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp index 52d1e575db..aa230550a4 100644 --- a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp +++ b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp @@ -28,7 +28,17 @@ OutputVector translate_mulmat(const NodeContext& context) { ov::Output res; ov::Output B = context.get_input(0); - ov::Output A = std::make_shared(context.get_input(1), context.get_input_type(0)); + ov::Output A = context.get_input(1); + if (context.get_op_case() == 1) { + if (context.get_input_type(0) == ov::element::f16) { + B = std::make_shared(context.get_input(0), ov::element::f32); + } + if (context.get_input_type(1) == ov::element::f16) { + A = std::make_shared(context.get_input(1), ov::element::f32); + } + } else { + A = std::make_shared(context.get_input(1), context.get_input_type(0)); + } auto B_shape = context.get_input_shape(0).to_shape(); auto A_shape = context.get_input_shape(1).to_shape(); diff --git a/ggml/src/ggml-openvino/openvino/translate_session.cpp b/ggml/src/ggml-openvino/openvino/translate_session.cpp index 83581ec5a8..563613aa7f 100644 --- a/ggml/src/ggml-openvino/openvino/translate_session.cpp +++ b/ggml/src/ggml-openvino/openvino/translate_session.cpp @@ -22,6 +22,7 @@ #include #include #include +#include #include "ggml-openvino/openvino/node_context.hpp" #include "ggml-openvino/openvino/utils.hpp" @@ -258,6 +259,7 @@ void TranslateSession::apply_transformations(const std::shared_ptr& model ov::pass::Manager manager; manager.set_per_pass_validation(true); + manager.register_pass(); manager.register_pass(); if (!ggml_model_decoder->is_static()) { From 01cdf4a9cc685fe7c5f1a64b20fd7c02e8383083 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Tue, 29 Jul 2025 14:07:03 +0800 Subject: [PATCH 100/254] matmul in fp32 --- ggml/src/ggml-openvino/ggml-decoder.cpp | 1 + ggml/src/ggml-openvino/ggml-decoder.h | 2 +- ggml/src/ggml-openvino/openvino/op/mulmat.cpp | 14 ++------- .../ggml-openvino/openvino/op/soft_max.cpp | 7 ++--- .../openvino/pass/fuse_to_sdpa.cpp | 11 +++---- .../openvino/translate_session.cpp | 29 ++++++++++--------- .../openvino/translate_session.hpp | 2 +- 7 files changed, 28 insertions(+), 38 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index b43f45dbbd..f7846382b9 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -212,6 +212,7 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, bool naive) { } else { m_op_case = 1; } + break; } default: break; diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index 78422afaf7..c1970af53a 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -139,7 +139,7 @@ private: std::vector m_output_names; std::string m_op_name; mutable std::string m_name; - int m_op_case; + int m_op_case = 0; std::vector> m_op_node_name; std::map> m_model_inputs; std::map> m_model_extra_inputs; diff --git a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp index aa230550a4..57fd476f0a 100644 --- a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp +++ b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp @@ -29,15 +29,8 @@ OutputVector translate_mulmat(const 
NodeContext& context) { ov::Output res; ov::Output B = context.get_input(0); ov::Output A = context.get_input(1); - if (context.get_op_case() == 1) { - if (context.get_input_type(0) == ov::element::f16) { - B = std::make_shared(context.get_input(0), ov::element::f32); - } - if (context.get_input_type(1) == ov::element::f16) { - A = std::make_shared(context.get_input(1), ov::element::f32); - } - } else { - A = std::make_shared(context.get_input(1), context.get_input_type(0)); + if (context.get_input_type(0) != context.get_input_type(1)) { + B = std::make_shared(context.get_input(0), context.get_input_type(1)); } auto B_shape = context.get_input_shape(0).to_shape(); @@ -72,8 +65,7 @@ OutputVector translate_mulmat(const NodeContext& context) { A = Z; } - auto result_lp = std::make_shared(A, B, false, true); - res = std::make_shared(result_lp, context.get_output_type(0)); + res = std::make_shared(A, B, false, true); return rename_outputs_with_suffix({res}, context.get_name()); } diff --git a/ggml/src/ggml-openvino/openvino/op/soft_max.cpp b/ggml/src/ggml-openvino/openvino/op/soft_max.cpp index 001a62be8b..401acaf865 100644 --- a/ggml/src/ggml-openvino/openvino/op/soft_max.cpp +++ b/ggml/src/ggml-openvino/openvino/op/soft_max.cpp @@ -57,11 +57,8 @@ OutputVector translate_soft_max(const NodeContext& context) { // Try using Q-cur to retrieve the token length, so that the translation of SOFT_MAX // does not depend on the result of the QK MatMul, so that QK matmul + softmax + qkv matmul // can be fused into SDPA. - if (input_node->get_type_info() == ov::op::v0::Convert::get_type_info_static()) { - auto qk = input_node->get_input_node_shared_ptr(0); - if (qk->get_type_info() == ov::op::v0::MatMul::get_type_info_static()) { - token_len = get_dimensions(qk->get_input_node_shared_ptr(0), {1}); - } + if (input_node->get_type_info() == ov::op::v0::MatMul::get_type_info_static()) { + token_len = get_dimensions(input_node->get_input_node_shared_ptr(0), {1}); } auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); diff --git a/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.cpp b/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.cpp index 1b7ac60271..aa6e28b627 100644 --- a/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.cpp +++ b/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.cpp @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -22,15 +23,13 @@ FuseToSDPA::FuseToSDPA() { const auto m_k = ov::pass::pattern::any_input(); const auto m_q = ov::pass::pattern::any_input(); const auto m_qk = ov::pass::pattern::wrap_type({m_q, m_k}); - const auto m_qk_f32 = ov::pass::pattern::wrap_type({m_qk}); const auto m_scale = ov::pass::pattern::any_input(); - const auto m_scaled_qk = ov::pass::pattern::wrap_type({m_qk_f32, m_scale}); + const auto m_scaled_qk = ov::pass::pattern::wrap_type({m_qk, m_scale}); const auto m_mask = ov::pass::pattern::any_input(); const auto m_masked_qk = ov::pass::pattern::wrap_type({m_scaled_qk, m_mask}); const auto m_softmax_qk = ov::pass::pattern::wrap_type({m_masked_qk}); - const auto m_softmax_qk_f16 = ov::pass::pattern::wrap_type({m_softmax_qk}); const auto m_v = ov::pass::pattern::any_input(); - const auto m_qkv = ov::pass::pattern::wrap_type({m_softmax_qk_f16, m_v}); + const auto m_qkv = ov::pass::pattern::wrap_type({m_softmax_qk, m_v}); const auto callback = [=](ov::pass::pattern::Matcher& m) { auto& pattern_to_output = m.get_pattern_value_map(); @@ -42,9 +41,7 @@ 
FuseToSDPA::FuseToSDPA() { auto v_trans = register_new_node(v, ov::op::v0::Constant::create(ov::element::i64, {3}, {0, 2, 1})); - auto mask_f16 = register_new_node(mask, ov::element::f16); - auto scale_f16 = register_new_node(scale, ov::element::f16); - auto sdpa = std::make_shared(q, k, v_trans, mask_f16, scale_f16, false); + auto sdpa = std::make_shared(q, k, v_trans, mask, scale, false); ov::replace_node(m.get_match_root(), sdpa); ov::copy_runtime_info(m.get_matched_nodes(), sdpa); diff --git a/ggml/src/ggml-openvino/openvino/translate_session.cpp b/ggml/src/ggml-openvino/openvino/translate_session.cpp index 563613aa7f..c4fe8c88ee 100644 --- a/ggml/src/ggml-openvino/openvino/translate_session.cpp +++ b/ggml/src/ggml-openvino/openvino/translate_session.cpp @@ -22,7 +22,7 @@ #include #include #include -#include +#include #include "ggml-openvino/openvino/node_context.hpp" #include "ggml-openvino/openvino/utils.hpp" @@ -254,22 +254,25 @@ std::shared_ptr TranslateSession::translate_graph(const frontend::InputMo return resulting_model; } -void TranslateSession::apply_transformations(const std::shared_ptr& model) { +std::shared_ptr TranslateSession::apply_transformations(std::shared_ptr model) { auto ggml_model_decoder = std::dynamic_pointer_cast(m_input_model)->get_model_decoder(); + { + ov::pass::Manager manager; + manager.set_per_pass_validation(true); - ov::pass::Manager manager; - manager.set_per_pass_validation(true); - manager.register_pass(); - manager.register_pass(); + if (!ggml_model_decoder->is_static()) { + const auto kv_param_res_names = ggml_model_decoder->get_kv_param_res_names(); + const auto kv_param_res_pairs = get_kv_param_res_pairs(model, kv_param_res_names); + manager.register_pass(kv_param_res_pairs); + } - if (!ggml_model_decoder->is_static()) { - const auto kv_param_res_names = ggml_model_decoder->get_kv_param_res_names(); - const auto kv_param_res_pairs = get_kv_param_res_pairs(model, kv_param_res_names); - manager.register_pass(kv_param_res_pairs); + // SDPA is even worse on performance + // manager.register_pass(); + manager.run_passes(model); } - - manager.register_pass(); - manager.run_passes(model); + auto preprocessor = ov::preprocess::PrePostProcessor(model); + model = preprocessor.build(); + return model; } } // namespace ggml diff --git a/ggml/src/ggml-openvino/openvino/translate_session.hpp b/ggml/src/ggml-openvino/openvino/translate_session.hpp index 9eea5fd11c..7072d4a9e8 100644 --- a/ggml/src/ggml-openvino/openvino/translate_session.hpp +++ b/ggml/src/ggml-openvino/openvino/translate_session.hpp @@ -16,7 +16,7 @@ public: std::shared_ptr translate_graph(const frontend::InputModel::Ptr& input_model); private: - void apply_transformations(const std::shared_ptr& model); + std::shared_ptr apply_transformations(std::shared_ptr model); const frontend::InputModel::Ptr m_input_model; const std::unordered_map& m_translator_map; std::shared_ptr m_ov_model; From e2fdc1b9884b99379d5d9f7bcbea0c6311ef3165 Mon Sep 17 00:00:00 2001 From: Cavus Mustafa Date: Tue, 29 Jul 2025 17:55:15 -0700 Subject: [PATCH 101/254] mulmat input conversion fix --- ggml/src/ggml-openvino/openvino/op/mulmat.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp index 57fd476f0a..6905777a09 100644 --- a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp +++ b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp @@ -12,6 +12,7 @@ #include #include #include +#include #include #include 
"../node_context.hpp" @@ -29,8 +30,10 @@ OutputVector translate_mulmat(const NodeContext& context) { ov::Output res; ov::Output B = context.get_input(0); ov::Output A = context.get_input(1); - if (context.get_input_type(0) != context.get_input_type(1)) { + if (ov::op::util::is_constant(B.get_node()) && context.get_input_type(0) != context.get_input_type(1)) { B = std::make_shared(context.get_input(0), context.get_input_type(1)); + } else if (context.get_input_type(0) != context.get_input_type(1)) { + A = std::make_shared(context.get_input(1), context.get_input_type(0)); } auto B_shape = context.get_input_shape(0).to_shape(); From 93b2d09a2dd9998c2bdbe965bc5ce3f97f158508 Mon Sep 17 00:00:00 2001 From: Cavus Mustafa Date: Tue, 29 Jul 2025 18:17:14 -0700 Subject: [PATCH 102/254] mulmat type conversion update --- ggml/src/ggml-openvino/openvino/op/mulmat.cpp | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp index 6905777a09..9148a27517 100644 --- a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp +++ b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp @@ -30,10 +30,13 @@ OutputVector translate_mulmat(const NodeContext& context) { ov::Output res; ov::Output B = context.get_input(0); ov::Output A = context.get_input(1); + + bool convert_out_type = false; if (ov::op::util::is_constant(B.get_node()) && context.get_input_type(0) != context.get_input_type(1)) { B = std::make_shared(context.get_input(0), context.get_input_type(1)); } else if (context.get_input_type(0) != context.get_input_type(1)) { A = std::make_shared(context.get_input(1), context.get_input_type(0)); + convert_out_type = true; } auto B_shape = context.get_input_shape(0).to_shape(); @@ -68,7 +71,12 @@ OutputVector translate_mulmat(const NodeContext& context) { A = Z; } - res = std::make_shared(A, B, false, true); + if (convert_out_type) { + auto result_lp = std::make_shared(A, B, false, true); + res = std::make_shared(result_lp, context.get_output_type(0)); + } else { + res = std::make_shared(A, B, false, true); + } return rename_outputs_with_suffix({res}, context.get_name()); } From 1a19566b23b097a84f30ccd86319eca72e8bcff6 Mon Sep 17 00:00:00 2001 From: Cavus Mustafa Date: Tue, 29 Jul 2025 21:37:57 -0700 Subject: [PATCH 103/254] add mark decomp pass --- ...decompression_convert_constant_folding.hpp | 29 +++++++++++++++++++ .../openvino/translate_session.cpp | 5 +++- 2 files changed, 33 insertions(+), 1 deletion(-) create mode 100644 ggml/src/ggml-openvino/openvino/pass/mark_decompression_convert_constant_folding.hpp diff --git a/ggml/src/ggml-openvino/openvino/pass/mark_decompression_convert_constant_folding.hpp b/ggml/src/ggml-openvino/openvino/pass/mark_decompression_convert_constant_folding.hpp new file mode 100644 index 0000000000..163422bf33 --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/pass/mark_decompression_convert_constant_folding.hpp @@ -0,0 +1,29 @@ +#pragma once + +#include "mark_decompression_convert_constant_folding.hpp" +#include "openvino/pass/matcher_pass.hpp" +#include "openvino/core/visibility.hpp" + +#ifdef OPENVINO_STATIC_LIBRARY +# define TRANSFORMATIONS_API +#else +# ifdef IMPLEMENT_OPENVINO_API +# define TRANSFORMATIONS_API OPENVINO_CORE_EXPORTS +# else +# define TRANSFORMATIONS_API OPENVINO_CORE_IMPORTS +# endif // IMPLEMENT_OPENVINO_API +#endif // OPENVINO_STATIC_LIBRARY + +namespace ov { +namespace pass { + +class TRANSFORMATIONS_API MarkCompressedFloatConstants; + +} // namespace pass +} 
// namespace ov + +class ov::pass::MarkCompressedFloatConstants : public MatcherPass { +public: + OPENVINO_MATCHER_PASS_RTTI("MarkCompressedFloatConstants"); + MarkCompressedFloatConstants(); +}; diff --git a/ggml/src/ggml-openvino/openvino/translate_session.cpp b/ggml/src/ggml-openvino/openvino/translate_session.cpp index c4fe8c88ee..ed7db61414 100644 --- a/ggml/src/ggml-openvino/openvino/translate_session.cpp +++ b/ggml/src/ggml-openvino/openvino/translate_session.cpp @@ -28,6 +28,7 @@ #include "ggml-openvino/openvino/utils.hpp" #include "input_model.hpp" #include "pass/fuse_to_sdpa.hpp" +#include "pass/mark_decompression_convert_constant_folding.hpp" namespace ov { namespace frontend { @@ -259,6 +260,8 @@ std::shared_ptr TranslateSession::apply_transformations(std::shared_ptr(); + manager.register_pass(); if (!ggml_model_decoder->is_static()) { const auto kv_param_res_names = ggml_model_decoder->get_kv_param_res_names(); @@ -267,7 +270,7 @@ std::shared_ptr TranslateSession::apply_transformations(std::shared_ptr(); + manager.register_pass(); manager.run_passes(model); } auto preprocessor = ov::preprocess::PrePostProcessor(model); From 43489bbfaac116b2f4b6caa9448988e4343cc98d Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Wed, 30 Jul 2025 22:55:41 +0800 Subject: [PATCH 104/254] Revert changes in fuse_to_sdpa --- ggml/src/ggml-openvino/openvino/op/soft_max.cpp | 8 +------- ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.cpp | 11 +++++++---- ggml/src/ggml-openvino/openvino/translate_session.cpp | 4 ---- 3 files changed, 8 insertions(+), 15 deletions(-) diff --git a/ggml/src/ggml-openvino/openvino/op/soft_max.cpp b/ggml/src/ggml-openvino/openvino/op/soft_max.cpp index 401acaf865..046cb93c8b 100644 --- a/ggml/src/ggml-openvino/openvino/op/soft_max.cpp +++ b/ggml/src/ggml-openvino/openvino/op/soft_max.cpp @@ -53,13 +53,7 @@ OutputVector translate_soft_max(const NodeContext& context) { auto mask_node = context.get_input(1); - std::shared_ptr token_len = get_dimensions(input_node, {1}); - // Try using Q-cur to retrieve the token length, so that the translation of SOFT_MAX - // does not depend on the result of the QK MatMul, so that QK matmul + softmax + qkv matmul - // can be fused into SDPA. 
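The `get_dimensions` helper referenced in these hunks resolves a single runtime dimension of a node's shape. A minimal sketch of such a helper is shown below, assuming the usual `ShapeOf`-plus-`Gather` construction; the function name and signature here are illustrative, and the actual utility in `utils.cpp` may differ:

```cpp
// Illustrative sketch only -- not the repo's get_dimensions implementation.
// Extracts the dimensions at `indices` of a node's runtime shape as an i64 tensor.
#include <memory>
#include <vector>
#include <openvino/op/constant.hpp>
#include <openvino/op/gather.hpp>
#include <openvino/op/shape_of.hpp>

std::shared_ptr<ov::Node> get_dims(const ov::Output<ov::Node>& node,
                                   const std::vector<int64_t>& indices) {
    // ShapeOf yields the full runtime shape as a 1-D i64 tensor.
    auto shape = std::make_shared<ov::op::v3::ShapeOf>(node, ov::element::i64);
    // Gather picks out only the requested axes, e.g. {1} for the token dimension.
    auto idx  = ov::op::v0::Constant::create(ov::element::i64, {indices.size()}, indices);
    auto axis = ov::op::v0::Constant::create(ov::element::i64, {}, {0});
    return std::make_shared<ov::op::v8::Gather>(shape, idx, axis);
}
```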
- if (input_node->get_type_info() == ov::op::v0::MatMul::get_type_info_static()) { - token_len = get_dimensions(input_node->get_input_node_shared_ptr(0), {1}); - } + auto token_len = context.get_input("token_len"); auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); std::shared_ptr mask_node_sliced = diff --git a/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.cpp b/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.cpp index aa6e28b627..1b7ac60271 100644 --- a/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.cpp +++ b/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.cpp @@ -10,7 +10,6 @@ #include #include #include -#include #include #include @@ -23,13 +22,15 @@ FuseToSDPA::FuseToSDPA() { const auto m_k = ov::pass::pattern::any_input(); const auto m_q = ov::pass::pattern::any_input(); const auto m_qk = ov::pass::pattern::wrap_type({m_q, m_k}); + const auto m_qk_f32 = ov::pass::pattern::wrap_type({m_qk}); const auto m_scale = ov::pass::pattern::any_input(); - const auto m_scaled_qk = ov::pass::pattern::wrap_type({m_qk, m_scale}); + const auto m_scaled_qk = ov::pass::pattern::wrap_type({m_qk_f32, m_scale}); const auto m_mask = ov::pass::pattern::any_input(); const auto m_masked_qk = ov::pass::pattern::wrap_type({m_scaled_qk, m_mask}); const auto m_softmax_qk = ov::pass::pattern::wrap_type({m_masked_qk}); + const auto m_softmax_qk_f16 = ov::pass::pattern::wrap_type({m_softmax_qk}); const auto m_v = ov::pass::pattern::any_input(); - const auto m_qkv = ov::pass::pattern::wrap_type({m_softmax_qk, m_v}); + const auto m_qkv = ov::pass::pattern::wrap_type({m_softmax_qk_f16, m_v}); const auto callback = [=](ov::pass::pattern::Matcher& m) { auto& pattern_to_output = m.get_pattern_value_map(); @@ -41,7 +42,9 @@ FuseToSDPA::FuseToSDPA() { auto v_trans = register_new_node(v, ov::op::v0::Constant::create(ov::element::i64, {3}, {0, 2, 1})); - auto sdpa = std::make_shared(q, k, v_trans, mask, scale, false); + auto mask_f16 = register_new_node(mask, ov::element::f16); + auto scale_f16 = register_new_node(scale, ov::element::f16); + auto sdpa = std::make_shared(q, k, v_trans, mask_f16, scale_f16, false); ov::replace_node(m.get_match_root(), sdpa); ov::copy_runtime_info(m.get_matched_nodes(), sdpa); diff --git a/ggml/src/ggml-openvino/openvino/translate_session.cpp b/ggml/src/ggml-openvino/openvino/translate_session.cpp index ed7db61414..daef12fb90 100644 --- a/ggml/src/ggml-openvino/openvino/translate_session.cpp +++ b/ggml/src/ggml-openvino/openvino/translate_session.cpp @@ -22,7 +22,6 @@ #include #include #include -#include #include "ggml-openvino/openvino/node_context.hpp" #include "ggml-openvino/openvino/utils.hpp" @@ -269,12 +268,9 @@ std::shared_ptr TranslateSession::apply_transformations(std::shared_ptr(kv_param_res_pairs); } - // SDPA is even worse on performance manager.register_pass(); manager.run_passes(model); } - auto preprocessor = ov::preprocess::PrePostProcessor(model); - model = preprocessor.build(); return model; } From 2f99135ccca8bf151034625106cf016f882e07d4 Mon Sep 17 00:00:00 2001 From: Ravi Panchumarthy Date: Wed, 30 Jul 2025 19:34:10 -0700 Subject: [PATCH 105/254] Update build.md --- docs/build.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/build.md b/docs/build.md index d40d257f59..1424a06508 100644 --- a/docs/build.md +++ b/docs/build.md @@ -707,7 +707,7 @@ Follow the instructions below to install OpenVINO runtime and build llama.cpp wi - Linux or Windows 
system with Intel hardware (CPU, GPU, or NPU) - **For Intel GPU or NPU Usage**: Install the appropriate hardware drivers for your Intel GPU or NPU. For detailed instructions, see: [Additional Configurations for Hardware Acceleration](https://docs.openvino.ai/2025/get-started/install-openvino/configurations.html). -- Git, CMake, and Ninja software tools are needed for building +- Git, CMake, and Ninja software tools are needed for building. ```bash sudo apt-get update sudo apt-get install -y build-essential libcurl4-openssl-dev libtbb12 cmake ninja-build python3-pip curl wget tar ``` ### 1. Install OpenVINO Runtime -- Follow the guide to install OpenVINO Runtime from an archive file: **[Install OpenVINO™ Runtime on Linux from an Archive File.](https://docs.openvino.ai/2025/get-started/install-openvino/install-openvino-archive-linux.html)** +- Follow the guide to install OpenVINO Runtime from an archive file: [Linux](https://docs.openvino.ai/2025/get-started/install-openvino/install-openvino-archive-linux.html) | [Windows](https://docs.openvino.ai/2025/get-started/install-openvino/install-openvino-archive-windows.html)
-📦 Click to expand OpenVINO 2025.2 installation commands +📦 Click to expand OpenVINO 2025.2 installation commands on Linux
```bash @@ -792,7 +792,6 @@ export GGML_OPENVINO_DEVICE=GPU To run in chat mode: ```bash export GGML_OPENVINO_CACHE_DIR=/tmp/ov_cache - ./build/ReleaseOV/bin/llama-cli -m ~/models/Llama-3.2-1B-Instruct.fp16.gguf -n 50 "The story of AI is " ``` @@ -818,6 +817,7 @@ export GGML_OPENVINO_PROFILING=1 ./build/ReleaseOV/bin/llama-simple -m ~/models/Llama-3.2-1B-Instruct.fp16.gguf -n 50 "The story of AI is " ``` +> **Note:** To apply your code changes, clear the `GGML_OPENVINO_CACHE_DIR` directory and rebuild the project. ### Using Llama.cpp's Built-in CPU Backend (for Comparison) From fc865340d5215b42f552a6410a47102b8b1c36e1 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Thu, 31 Jul 2025 16:22:21 +0800 Subject: [PATCH 106/254] Fix test-backend-ops --- ggml/src/ggml-openvino/ggml-decoder.cpp | 3 +++ ggml/src/ggml-openvino/ggml-openvino.cpp | 13 +++++++++++++ ggml/src/ggml-openvino/openvino/op/soft_max.cpp | 2 +- .../mark_decompression_convert_constant_folding.hpp | 2 +- ggml/src/ggml-openvino/utils.cpp | 11 +++++++---- ggml/src/ggml-openvino/utils.h | 2 ++ 6 files changed, 27 insertions(+), 6 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index f7846382b9..2f7ae333e7 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -76,6 +76,9 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_cgraph* cgraph) { m_cgraph = cgraph; for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) { auto* cur_node = cgraph->nodes[node_n]; + if (cur_node->op == GGML_OP_NONE) { + continue; + } m_nodes.push_back(cur_node); set_input_output(cur_node, true); } diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp index 7edd4667d9..8c700445b2 100644 --- a/ggml/src/ggml-openvino/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino.cpp @@ -258,12 +258,25 @@ static bool is_op_unsupported_case(const ggml_tensor* op) { } } + if (op->op == GGML_OP_PERMUTE) { + if (op->type == GGML_TYPE_BF16) { + // err msg: [GPU] Could not find a suitable kernel for transpose + GGML_LOG_WARN("OpenVINO backend does not support PERMUTE with BF16 type\n"); + return true; + } + } + if (op->op == GGML_OP_MUL_MAT) { if ((op->src[0]->view_src && op->src[0]->op != GGML_OP_PERMUTE) || (op->src[1]->view_src && op->src[1]->op != GGML_OP_PERMUTE)) { GGML_LOG_WARN("OpenVINO backend does not support MUL_MAT with view_src tensors that are not PERMUTE\n"); return true; } + if (op->src[0]->type == GGML_TYPE_F16 && op->src[1]->type == GGML_TYPE_F16) { + // Has accuracy issue, try enabling this and see `test-backend-ops -o "MUL_MAT"` + GGML_LOG_WARN("OpenVINO backend does not support MUL_MAT with two F16 tensors\n"); + return true; + } } if (op->op == GGML_OP_ROPE) { diff --git a/ggml/src/ggml-openvino/openvino/op/soft_max.cpp b/ggml/src/ggml-openvino/openvino/op/soft_max.cpp index 046cb93c8b..e072658ecb 100644 --- a/ggml/src/ggml-openvino/openvino/op/soft_max.cpp +++ b/ggml/src/ggml-openvino/openvino/op/soft_max.cpp @@ -53,7 +53,7 @@ OutputVector translate_soft_max(const NodeContext& context) { auto mask_node = context.get_input(1); - auto token_len = context.get_input("token_len"); + auto token_len = context.has_input("token_len") ? 
context.get_input("token_len") : get_dimensions(input_node, {1}); auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); std::shared_ptr mask_node_sliced = diff --git a/ggml/src/ggml-openvino/openvino/pass/mark_decompression_convert_constant_folding.hpp b/ggml/src/ggml-openvino/openvino/pass/mark_decompression_convert_constant_folding.hpp index 163422bf33..b40eaf4205 100644 --- a/ggml/src/ggml-openvino/openvino/pass/mark_decompression_convert_constant_folding.hpp +++ b/ggml/src/ggml-openvino/openvino/pass/mark_decompression_convert_constant_folding.hpp @@ -24,6 +24,6 @@ class TRANSFORMATIONS_API MarkCompressedFloatConstants; class ov::pass::MarkCompressedFloatConstants : public MatcherPass { public: - OPENVINO_MATCHER_PASS_RTTI("MarkCompressedFloatConstants"); + OPENVINO_MATCHER_PASS_RTTI("MarkCompressedFloatConstants") MarkCompressedFloatConstants(); }; diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 45ed73499f..a64637f950 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -81,7 +81,7 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c config = get_npu_config(); } - if (cgraph->n_nodes == 1) { + if (is_naive(cgraph)) { return naive_compute(cgraph, core, device, config); } @@ -250,11 +250,16 @@ ov::AnyMap get_npu_config() { return config; } +bool is_naive(struct ggml_cgraph* cgraph) { + constexpr int naive_graph_size_threshold = 20; + return cgraph->n_nodes < naive_graph_size_threshold; +} + enum ggml_status naive_compute(struct ggml_cgraph* cgraph, ov::Core& core, const std::string& device, const ov::AnyMap& config) { - if (cgraph->nodes[0]->op == GGML_OP_NONE) { + if (cgraph->n_nodes == 1 && cgraph->nodes[0]->op == GGML_OP_NONE) { return GGML_STATUS_SUCCESS; } @@ -264,8 +269,6 @@ enum ggml_status naive_compute(struct ggml_cgraph* cgraph, auto model = ov::frontend::ggml::FrontEnd::convert(input_model, naive); auto infer_request = core.compile_model(model, device, config).create_infer_request(); - ov::serialize(model, "IR.xml"); - auto ov_params = model->get_parameters(); for (size_t i = 0; i < ov_params.size(); i++) { auto param_name = ov_params[i]->get_friendly_name(); diff --git a/ggml/src/ggml-openvino/utils.h b/ggml/src/ggml-openvino/utils.h index 367b2829be..0d71963f53 100644 --- a/ggml/src/ggml-openvino/utils.h +++ b/ggml/src/ggml-openvino/utils.h @@ -44,5 +44,7 @@ ov::AnyMap get_npu_config(); ov::Tensor get_ov_input_tensor(std::shared_ptr ggml_decoder, const std::string& param_name); +bool is_naive(struct ggml_cgraph* cgraph); + enum ggml_status naive_compute(struct ggml_cgraph* cgraph, ov::Core& core, const std::string& device, const ov::AnyMap& config); From 11413503109e44d7f3f2b7f42e3daedf12e61382 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Thu, 31 Jul 2025 16:50:58 +0800 Subject: [PATCH 107/254] Skip test-thread-safety; Run ctest only in ci/run.sh --- ci/run.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/run.sh b/ci/run.sh index ea15ce49b1..bfce48f337 100755 --- a/ci/run.sh +++ b/ci/run.sh @@ -26,7 +26,7 @@ # GG_BUILD_KLEIDIAI=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt # # # with OPENVINO support -# GG_BUILD_OPENVINO=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt +# GG_BUILD_OPENVINO=1 GG_BUILD_LOW_PERF=1 GGML_OPENVINO_DEVICE=CPU bash ./ci/run.sh ./tmp/results ./tmp/mnt # if [ -z "$2" ]; then From 37ff226bb6365a303c3c5e461fc911633d3ee546 Mon Sep 17 00:00:00 2001 
From: "Yu, Zijun" Date: Fri, 1 Aug 2025 11:46:52 +0800 Subject: [PATCH 108/254] Use CiD for NPU --- ggml/src/ggml-openvino/utils.cpp | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index a64637f950..cf0fc4dfd3 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -235,17 +235,15 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c ov::AnyMap get_npu_config() { ov::AnyMap config = { - { "NPU_COMPILATION_MODE_PARAMS", "compute-layers-with-higher-precision=ReduceMean" }, - { "NPU_USE_NPUW", "YES" }, - { "NPUW_DEVICES", "NPU" }, - { "NPUW_FOLD", "YES" }, - { "NPUW_HOST_GATHER", "YES" }, - { "NPUW_DQ", "YES" }, - { "NPUW_FUNCALL_ASYNC", "YES" }, - { "NPUW_WEIGHTS_BANK", "shared" }, - // Option 'CACHE_DIR' is not supported with MLIR compiler type - // {"NPUW_CACHE_DIR", getenv("GGML_OPENVINO_CACHE_DIR") ? getenv("GGML_OPENVINO_CACHE_DIR") : ""}, - { "NPU_COMPILER_TYPE", "MLIR" }, + {"NPU_COMPILATION_MODE_PARAMS", "compute-layers-with-higher-precision=ReduceMean" }, + {"NPU_USE_NPUW", "YES" }, + {"NPUW_DEVICES", "NPU" }, + {"NPUW_FOLD", "YES" }, + {"NPUW_HOST_GATHER", "YES" }, + {"NPUW_DQ", "YES" }, + {"NPUW_FUNCALL_ASYNC", "YES" }, + {"NPUW_WEIGHTS_BANK", "shared" }, + {"NPUW_CACHE_DIR", getenv("GGML_OPENVINO_CACHE_DIR") ? getenv("GGML_OPENVINO_CACHE_DIR") : ""}, }; return config; } From 9a91ca6ef96cc14e3c6658b68fca0e7950034ab8 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Mon, 4 Aug 2025 17:20:06 +0800 Subject: [PATCH 109/254] Optimize tensor conversion, improve TTFT --- ggml/src/ggml-openvino/ggml-decoder.cpp | 75 ++++++------------------- 1 file changed, 17 insertions(+), 58 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 2f7ae333e7..eb0cdcb28d 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -24,6 +24,7 @@ #include #include #include +#include #include #include "ggml-backend-impl.h" @@ -391,53 +392,12 @@ std::map> GgmlOvDecoder::create_weight_no } std::shared_ptr GgmlOvDecoder::create_weight_node(ggml_tensor* tensor) { - std::shared_ptr weight_node; auto node_type = get_ov_type(tensor); auto node_shape = get_shape(tensor); auto ne_total = ggml_nelements(tensor); - switch (tensor->type) { - case GGML_TYPE_I32: { - const auto* ptr = reinterpret_cast(tensor->data); - std::vector data(ptr, ptr + ne_total); - weight_node = std::make_shared(node_type, node_shape, data); - break; - } - case GGML_TYPE_I64: { - const auto* ptr = reinterpret_cast(tensor->data); - std::vector data(ptr, ptr + ne_total); - weight_node = std::make_shared(node_type, node_shape, data); - break; - } - case GGML_TYPE_F32: { - const auto* ptr = reinterpret_cast(tensor->data); - std::vector data(ptr, ptr + ne_total); - weight_node = std::make_shared(node_type, node_shape, data); - break; - } - case GGML_TYPE_F16: { - const auto* ptr = reinterpret_cast(tensor->data); - std::vector data_f16; - data_f16.reserve(ne_total); - for (int i = 0; i < ne_total; ++i) { - data_f16.push_back(ov::float16::from_bits(ptr[i])); - } - weight_node = std::make_shared(node_type, node_shape, data_f16); - break; - } - case GGML_TYPE_BF16: { - const auto* ptr = reinterpret_cast(tensor->data); - std::vector data_bf16; - data_bf16.reserve(ne_total); - for (int i = 0; i < ne_total; ++i) { - data_bf16.push_back(ov::bfloat16::from_bits(ptr[i])); - } - weight_node = 
std::make_shared(node_type, node_shape, data_bf16); - break; - } - default: - throw std::invalid_argument("Unsupported tensor type"); - } - return weight_node; + ov::Tensor weights(node_type, node_shape); + memcpy(weights.data(), tensor->data, ne_total * node_type.size()); + return std::make_shared(weights); } void GgmlOvDecoder::dump_cgraph(const struct ggml_cgraph* cgraph, std::string& filename) { @@ -549,27 +509,26 @@ std::vector GgmlOvDecoder::get_stride(const ggml_tensor* tensor) { } ov::element::Type GgmlOvDecoder::get_ov_type(const ggml_tensor* tensor) { - ov::element::Type type = ov::element::dynamic; switch (tensor->type) { + case GGML_TYPE_F64: + return ov::element::f64; case GGML_TYPE_F32: - type = ov::element::f32; - break; + return ov::element::f32; case GGML_TYPE_F16: - type = ov::element::f16; - break; + return ov::element::f16; case GGML_TYPE_BF16: - type = ov::element::bf16; - break; - case GGML_TYPE_I64: - type = ov::element::i64; - break; + return ov::element::bf16; + case GGML_TYPE_I8: + return ov::element::i8; + case GGML_TYPE_I16: + return ov::element::i16; case GGML_TYPE_I32: - type = ov::element::i32; - break; + return ov::element::i32; + case GGML_TYPE_I64: + return ov::element::i64; default: - break; + throw std::runtime_error("Unsupported tensor type"); } - return type; } ov::PartialShape GgmlOvDecoder::get_input_shape(const std::string& name) const { From 63d000ba40a6c9902eeef29b236caffdc47c4507 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Wed, 13 Aug 2025 10:57:22 +0800 Subject: [PATCH 110/254] Support op SET_ROWS --- ggml/src/ggml-openvino/ggml-decoder.cpp | 33 ++++++++++-- ggml/src/ggml-openvino/ggml-decoder.h | 3 ++ ggml/src/ggml-openvino/ggml-openvino.cpp | 2 +- .../ggml-openvino/openvino/node_context.hpp | 2 + .../src/ggml-openvino/openvino/op/reshape.cpp | 7 ++- .../ggml-openvino/openvino/op/set_rows.cpp | 51 +++++++++++++++++++ ggml/src/ggml-openvino/openvino/op_table.cpp | 1 + ggml/src/ggml-openvino/openvino/op_table.hpp | 1 + 8 files changed, 93 insertions(+), 7 deletions(-) create mode 100644 ggml/src/ggml-openvino/openvino/op/set_rows.cpp diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index eb0cdcb28d..c952fb8eaf 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -90,7 +90,7 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_cgraph* cgraph) { // 3. constructing a decoder for the whole graph naively (op test case) void GgmlOvDecoder::set_input_output(ggml_tensor* node, bool naive) { std::string node_name; - if (node->op == GGML_OP_CPY) { + if (node->op == GGML_OP_CPY || node->op == GGML_OP_SET_ROWS) { // CPY updates the input tensor in place. 
For later ov op that uses the // input tensor of CPY, we need to make sure they get the updated tensor // by putting the src tensor name in the tensor_map in @@ -151,9 +151,11 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, bool naive) { if (node->buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY) { assert(name.find("cache_k") == 0 || name.find("cache_v") == 0); } - auto it = std::find(m_model_output_names.begin(), m_model_output_names.end(), name); - if (it == m_model_output_names.end()) { + if (auto it = std::find(m_model_output_names.begin(), m_model_output_names.end(), name); + it == m_model_output_names.end()) { m_model_output_names.push_back(name); + } + if (auto it = std::find(m_kv_names.begin(), m_kv_names.end(), name); it == m_kv_names.end()) { m_kv_names.push_back(name); } } @@ -166,6 +168,8 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, bool naive) { m_op_case = 1; } else if (node->src[0]->ne[0] * node->src[0]->ne[1] == node->ne[0]) { m_op_case = 2; + } else if (node->src[0]->ne[0] * node->src[0]->ne[1] == node->ne[1]) { + m_op_case = 3; } break; } @@ -270,6 +274,8 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor* src) co input_shape = ov::PartialShape{m_context_size, m_num_heads_kv, m_head_size}; } else if (name.find("cache_v") == 0) { input_shape = ov::PartialShape{m_num_heads_kv, m_head_size, m_context_size}; + } else if (get_tensor_used_op(src)->op == GGML_OP_SET_ROWS) { + input_shape = ov::PartialShape{1, 1, -1}; } else if (src->op == GGML_OP_VIEW) { // This case is added to make test-backend-ops work input_shape = ov::PartialShape{get_shape(src->view_src)}; @@ -283,6 +289,8 @@ void GgmlOvDecoder::add_extra_inputs() { // Extra inputs: // 1. `past_token_len`, used to create indices for updating kv cache. Usually equal to inp_pos[0], except for // llama-perplexity. + // Update: SET_ROWS replaces CPY for updating kv cache. The indices creation is not needed anymore. See: + // https://github.com/ggml-org/llama.cpp/pull/14285 // 2. `attention_size`, used in matmul's in the attention block. The shape of those matmul's are 32 aligned, // see llama_kv_cache_unified::get_n_kv and llama_kv_cache_unified::get_padding. 
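The SET_ROWS path mentioned in this comment block boils down to a scatter along the row axis: the op carries its own destination row indices, so the backend no longer has to synthesize update indices itself. A condensed sketch of that lowering, with illustrative names and shapes (the full `translate_set_rows` below also handles the reshaping and dtype conversion):

```cpp
// Sketch: writing new KV rows into a cache viewed as [rows, row_width].
// ScatterUpdate(data, indices, updates, axis) replaces data[indices[i], :]
// with updates[i, :] when axis == 0.
#include <memory>
#include <openvino/op/constant.hpp>
#include <openvino/op/scatter_update.hpp>

ov::Output<ov::Node> scatter_kv_rows(const ov::Output<ov::Node>& cache_2d,  // [rows, width]
                                     const ov::Output<ov::Node>& row_ids,   // [n], i64
                                     const ov::Output<ov::Node>& new_rows)  // [n, width]
{
    auto axis = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
    return std::make_shared<ov::op::v3::ScatterUpdate>(cache_2d, row_ids, new_rows, axis);
}
```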
// Not used for NPU @@ -305,6 +313,10 @@ void GgmlOvDecoder::add_extra_inputs() { (int64_t) (node->src[1]->op_params[0] / node->src[1]->nb[0] / m_head_size / m_num_heads_kv); break; } + if (node->op == GGML_OP_SET_ROWS && std::string(node->name).find("cache_k") == 0) { + assert(node->src[1]->type == GGML_TYPE_I64); + past_token_len = *(int64_t*) (node->src[1]->data); + } } if (past_token_len == -1) { @@ -342,6 +354,18 @@ void GgmlOvDecoder::add_extra_inputs() { } } +const ggml_tensor* GgmlOvDecoder::get_tensor_used_op(const ggml_tensor* tensor) const { + for (int i = 0; i < m_cgraph->n_nodes; i++) { + const auto* node = m_cgraph->nodes[i]; + for (int j = 0; j < GGML_MAX_SRC; j++) { + if (node->src[j] == tensor) { + return node; + } + } + } + throw std::runtime_error("Tensor not found in cgraph"); +} + std::map GgmlOvDecoder::get_kv_param_res_names() const { std::map kv_param_res_names; for (const auto& name : m_kv_names) { @@ -618,7 +642,8 @@ const std::string& GgmlOvDecoder::get_op_type() const { {GGML_OP_SOFT_MAX, "GGML_OP_SOFT_MAX" }, {GGML_OP_SUB, "GGML_OP_SUB" }, {GGML_OP_TRANSPOSE, "GGML_OP_TRANSPOSE"}, - {GGML_OP_VIEW, "GGML_OP_VIEW" } + {GGML_OP_VIEW, "GGML_OP_VIEW" }, + {GGML_OP_SET_ROWS, "GGML_OP_SET_ROWS" }, }; static const std::map unary_ops = { {GGML_UNARY_OP_ABS, "GGML_UNARY_OP_ABS" }, diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index c1970af53a..f6a4f74163 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -117,6 +117,9 @@ public: static std::shared_ptr create_weight_node(ggml_tensor* tensor); static std::map> create_weight_nodes(struct ggml_cgraph* cgraph); + + const ggml_tensor* get_tensor_used_op(const ggml_tensor* tensor) const; + void clear_model_weights() { m_model_weights.clear(); } private: diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp index 8c700445b2..14999ba66b 100644 --- a/ggml/src/ggml-openvino/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino.cpp @@ -331,7 +331,7 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con static const std::set supported_ops{GGML_OP_NONE, GGML_OP_ADD, GGML_OP_MUL, GGML_OP_MUL_MAT, GGML_OP_VIEW, GGML_OP_CONT, GGML_OP_CPY, GGML_OP_RESHAPE, GGML_OP_PERMUTE, GGML_OP_TRANSPOSE, GGML_OP_GET_ROWS, GGML_OP_ROPE, - GGML_OP_RMS_NORM, GGML_OP_SCALE, GGML_OP_SOFT_MAX}; + GGML_OP_RMS_NORM, GGML_OP_SCALE, GGML_OP_SOFT_MAX, GGML_OP_SET_ROWS}; static const std::set supported_unary_ops{ GGML_UNARY_OP_SILU, }; diff --git a/ggml/src/ggml-openvino/openvino/node_context.hpp b/ggml/src/ggml-openvino/openvino/node_context.hpp index ceba642275..cc1b5c0332 100644 --- a/ggml/src/ggml-openvino/openvino/node_context.hpp +++ b/ggml/src/ggml-openvino/openvino/node_context.hpp @@ -46,6 +46,8 @@ public: return m_decoder->get_input_stride(m_input_names[index]); } + std::string get_output_name() const { return m_output_names[0]; } + PartialShape get_output_shape(size_t index) const { return m_decoder->get_output_shape(m_output_names[index]); } diff --git a/ggml/src/ggml-openvino/openvino/op/reshape.cpp b/ggml/src/ggml-openvino/openvino/op/reshape.cpp index 3a695683bf..4ef3833c90 100644 --- a/ggml/src/ggml-openvino/openvino/op/reshape.cpp +++ b/ggml/src/ggml-openvino/openvino/op/reshape.cpp @@ -23,7 +23,7 @@ OutputVector translate_reshape(const NodeContext& context) { } int op_case = context.get_op_case(); - FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2, "Unsupported RESHAPE 
case"); + FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2 || op_case == 3, "Unsupported RESHAPE case"); auto output_shape = context.get_output_shape(0).to_shape(); std::shared_ptr new_shape_node; @@ -32,11 +32,14 @@ OutputVector translate_reshape(const NodeContext& context) { ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector{-1, (int64_t)output_shape[1], (int64_t)output_shape[2]}); - } else { + } else if (op_case == 2) { new_shape_node = ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector{(int64_t)output_shape[0], -1, (int64_t)output_shape[2]}); + } else { + new_shape_node = + ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector{(int64_t) output_shape[0], -1, 1}); } auto res = std::make_shared(context.get_input(0), new_shape_node, false); return rename_outputs_with_suffix({res}, context.get_name()); diff --git a/ggml/src/ggml-openvino/openvino/op/set_rows.cpp b/ggml/src/ggml-openvino/openvino/op/set_rows.cpp new file mode 100644 index 0000000000..b6caa372b8 --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/op/set_rows.cpp @@ -0,0 +1,51 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../node_context.hpp" +#include "../op_table.hpp" +#include "../utils.hpp" + +namespace ov { +namespace frontend { +namespace ggml { +namespace op { + +OutputVector translate_set_rows(const NodeContext& context) { + num_inputs_check(context, 2, 2); + + auto data = context.get_input(0); + auto indices = context.get_input(1); + auto dst = context.get_input(context.get_output_name()); + auto dst_shape = context.get_output_shape(0).to_shape(); + FRONT_END_OP_CONVERSION_CHECK(dst_shape[0] == 1, "Unsupported shape in SET_ROWS"); + + auto zero = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {0}); + + auto dst_reshaped = std::make_shared( + dst, + ov::op::v0::Constant::create(ov::element::i64, {2}, {(int64_t) dst_shape[1], (int64_t) dst_shape[2]}), + false); + auto indices_reshaped = + std::make_shared(indices, ov::op::v0::Constant::create(ov::element::i64, {2}, {0, 1})); + auto data_converted = std::make_shared(data, context.get_output_type(0)); + auto data_reshaped = std::make_shared(data_converted, zero); + auto updated = std::make_shared(dst_reshaped, indices_reshaped, data_reshaped, zero); + auto res = std::make_shared(updated, std::make_shared(dst), false); + return rename_outputs_with_suffix({res}, context.get_name()); +} + +} // namespace op +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/op_table.cpp b/ggml/src/ggml-openvino/openvino/op_table.cpp index a99450ea95..744f355a54 100644 --- a/ggml/src/ggml-openvino/openvino/op_table.cpp +++ b/ggml/src/ggml-openvino/openvino/op_table.cpp @@ -35,6 +35,7 @@ std::unordered_map get_supported_ops() { {"GGML_UNARY_OP_SILU", op::translate_unary_silu }, {"GGML_OP_VIEW", op::translate_view }, {"GGML_GLU_OP_SWIGLU", op::translate_glu_swiglu }, + {"GGML_OP_SET_ROWS", op::translate_set_rows }, }; } diff --git a/ggml/src/ggml-openvino/openvino/op_table.hpp b/ggml/src/ggml-openvino/openvino/op_table.hpp index 9b141d6d20..631812aaa3 100644 --- a/ggml/src/ggml-openvino/openvino/op_table.hpp +++ b/ggml/src/ggml-openvino/openvino/op_table.hpp @@ -26,6 +26,7 @@ GGML_OP_CONVERTER(translate_soft_max); GGML_OP_CONVERTER(translate_transpose); GGML_OP_CONVERTER(translate_view); GGML_OP_CONVERTER(translate_glu_swiglu); +GGML_OP_CONVERTER(translate_set_rows); } // namespace 
op From 7bda5021f982c8c5d7835766bd49cea7d36f1439 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Thu, 14 Aug 2025 15:40:36 +0800 Subject: [PATCH 111/254] Fix NPU --- ggml/src/ggml-openvino/ggml-decoder.cpp | 37 ++++++++++++++++++- ggml/src/ggml-openvino/ggml-decoder.h | 1 + .../ggml-openvino/openvino/op/set_rows.cpp | 30 ++++++++++++--- ggml/src/ggml-openvino/utils.cpp | 3 ++ 4 files changed, 65 insertions(+), 6 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index c952fb8eaf..472dd157ef 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -193,6 +193,14 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, bool naive) { } break; } + case GGML_OP_SET_ROWS: { + if (std::string(node->name).find("cache_k") == 0) { + m_op_case = 1; + } else { + m_op_case = 2; + } + break; + } case GGML_OP_PERMUTE: { if (node->src[0]->view_src == nullptr) { // Permute Qcur @@ -274,8 +282,18 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor* src) co input_shape = ov::PartialShape{m_context_size, m_num_heads_kv, m_head_size}; } else if (name.find("cache_v") == 0) { input_shape = ov::PartialShape{m_num_heads_kv, m_head_size, m_context_size}; - } else if (get_tensor_used_op(src)->op == GGML_OP_SET_ROWS) { + } else if (const auto* op = get_tensor_used_op(src); op->op == GGML_OP_SET_ROWS) { input_shape = ov::PartialShape{1, 1, -1}; + if (m_is_static) { + if (m_is_first_token) { + // Dummy static shape, since the indices are not used in this case + input_shape = ov::PartialShape{1}; + } else if (std::string(op->name).find("cache_k") == 0) { + input_shape = ov::PartialShape{1, 1, 1}; + } else { + input_shape = ov::PartialShape{1, 1, m_num_heads_kv * m_head_size}; + } + } } else if (src->op == GGML_OP_VIEW) { // This case is added to make test-backend-ops work input_shape = ov::PartialShape{get_shape(src->view_src)}; @@ -316,6 +334,7 @@ void GgmlOvDecoder::add_extra_inputs() { if (node->op == GGML_OP_SET_ROWS && std::string(node->name).find("cache_k") == 0) { assert(node->src[1]->type == GGML_TYPE_I64); past_token_len = *(int64_t*) (node->src[1]->data); + break; } } @@ -366,6 +385,22 @@ const ggml_tensor* GgmlOvDecoder::get_tensor_used_op(const ggml_tensor* tensor) throw std::runtime_error("Tensor not found in cgraph"); } +const ggml_tensor* GgmlOvDecoder::get_tensor_from_name(const std::string& name) const { + for (int i = 0; i < m_cgraph->n_nodes; i++) { + const auto* node = m_cgraph->nodes[i]; + for (int j = 0; j < GGML_MAX_SRC; j++) { + const auto* src = node->src[j]; + if (src == nullptr) { + break; + } + if (std::string(src->name) == name) { + return src; + } + } + } + return nullptr; +} + std::map GgmlOvDecoder::get_kv_param_res_names() const { std::map kv_param_res_names; for (const auto& name : m_kv_names) { diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index f6a4f74163..ae378273d3 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -119,6 +119,7 @@ public: static std::map> create_weight_nodes(struct ggml_cgraph* cgraph); const ggml_tensor* get_tensor_used_op(const ggml_tensor* tensor) const; + const ggml_tensor* get_tensor_from_name(const std::string& name) const; void clear_model_weights() { m_model_weights.clear(); } diff --git a/ggml/src/ggml-openvino/openvino/op/set_rows.cpp b/ggml/src/ggml-openvino/openvino/op/set_rows.cpp index b6caa372b8..758454cd9d 100644 --- 
a/ggml/src/ggml-openvino/openvino/op/set_rows.cpp +++ b/ggml/src/ggml-openvino/openvino/op/set_rows.cpp @@ -11,6 +11,7 @@ #include #include #include +#include #include "../node_context.hpp" #include "../op_table.hpp" @@ -25,21 +26,40 @@ OutputVector translate_set_rows(const NodeContext& context) { num_inputs_check(context, 2, 2); auto data = context.get_input(0); - auto indices = context.get_input(1); - auto dst = context.get_input(context.get_output_name()); + data = std::make_shared(data, context.get_output_type(0)); + auto dst_shape = context.get_output_shape(0).to_shape(); FRONT_END_OP_CONVERSION_CHECK(dst_shape[0] == 1, "Unsupported shape in SET_ROWS"); - auto zero = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {0}); + if (context.is_static() && context.is_first_token()) { + Output res; + if (context.get_op_case() == 2) { + res = std::make_shared( + data, + ov::op::v0::Constant::create( + ov::element::i64, + {3}, + {context.get_context_size(), context.get_num_heads_kv(), context.get_head_size()}), + false); + res = std::make_shared( + res, ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 2, 0})); + } else { + res = data; + } + return rename_outputs_with_suffix({res}, context.get_name()); + } + auto indices = context.get_input(1); + auto dst = context.get_input(context.get_output_name()); + + auto zero = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {0}); auto dst_reshaped = std::make_shared( dst, ov::op::v0::Constant::create(ov::element::i64, {2}, {(int64_t) dst_shape[1], (int64_t) dst_shape[2]}), false); auto indices_reshaped = std::make_shared(indices, ov::op::v0::Constant::create(ov::element::i64, {2}, {0, 1})); - auto data_converted = std::make_shared(data, context.get_output_type(0)); - auto data_reshaped = std::make_shared(data_converted, zero); + auto data_reshaped = std::make_shared(data, zero); auto updated = std::make_shared(dst_reshaped, indices_reshaped, data_reshaped, zero); auto res = std::make_shared(updated, std::make_shared(dst), false); return rename_outputs_with_suffix({res}, context.get_name()); diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index cf0fc4dfd3..83ab7353a9 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -328,6 +328,9 @@ ov::Tensor get_ov_input_tensor(std::shared_ptr ggml_decoder, cons std::copy(padded_data.begin(), padded_data.end(), data_ptr); } + } else if (const auto* op = ggml_decoder->get_tensor_used_op(ggml_decoder->get_tensor_from_name(param_name)); + op->op == GGML_OP_SET_ROWS && is_static && is_first_token) { + input_tensor = ov::Tensor(ov::element::i64, ov::Shape{1}); } else { input_tensor = convert_ggml_input_to_ov(ggml_decoder, param_name); } From 839f8c66a0f69bca54c3f067a73dcb870daf70bf Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Thu, 14 Aug 2025 16:00:38 +0800 Subject: [PATCH 112/254] Remove CPY --- ggml/src/ggml-openvino/ggml-decoder.cpp | 71 +++--------------- ggml/src/ggml-openvino/ggml-openvino.cpp | 19 ++++- ggml/src/ggml-openvino/openvino/op/cpy.cpp | 73 ------------------- ggml/src/ggml-openvino/openvino/op_table.cpp | 1 - ggml/src/ggml-openvino/openvino/op_table.hpp | 1 - .../openvino/translate_session.cpp | 60 --------------- 6 files changed, 25 insertions(+), 200 deletions(-) delete mode 100644 ggml/src/ggml-openvino/openvino/op/cpy.cpp diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 472dd157ef..38c7122f4c 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp 
+++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -90,10 +90,10 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_cgraph* cgraph) { // 3. constructing a decoder for the whole graph naively (op test case) void GgmlOvDecoder::set_input_output(ggml_tensor* node, bool naive) { std::string node_name; - if (node->op == GGML_OP_CPY || node->op == GGML_OP_SET_ROWS) { - // CPY updates the input tensor in place. For later ov op that uses the - // input tensor of CPY, we need to make sure they get the updated tensor - // by putting the src tensor name in the tensor_map in + if (node->op == GGML_OP_SET_ROWS) { + // SET_ROWS updates the tensor in place. For later ov op that uses the + // the view_src of SET_ROWS, we need to make sure they get the updated tensor + // by putting the view_src name in the tensor_map in // /src/frontends/ggml/src/translate_session.cpp node_name = std::string(node->view_src->name); } else { @@ -183,16 +183,6 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, bool naive) { } break; } - case GGML_OP_CPY: { - if (std::string(node->src[1]->name).find("cache_k") == 0) { - // Write K to cache_k - m_op_case = 1; - } else { - // Write V to cache_v - m_op_case = 2; - } - break; - } case GGML_OP_SET_ROWS: { if (std::string(node->name).find("cache_k") == 0) { m_op_case = 1; @@ -305,62 +295,22 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor* src) co void GgmlOvDecoder::add_extra_inputs() { // Extra inputs: - // 1. `past_token_len`, used to create indices for updating kv cache. Usually equal to inp_pos[0], except for - // llama-perplexity. - // Update: SET_ROWS replaces CPY for updating kv cache. The indices creation is not needed anymore. See: - // https://github.com/ggml-org/llama.cpp/pull/14285 - // 2. `attention_size`, used in matmul's in the attention block. The shape of those matmul's are 32 aligned, + // 1. `attention_size`, used in matmul's in the attention block. The shape of those matmul's are 32 aligned, // see llama_kv_cache_unified::get_n_kv and llama_kv_cache_unified::get_padding. 
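For reference, the 32-alignment this comment refers to is ggml's usual round-up; after this patch `attention_size` is read straight off the already-padded `KQ_mask`, but the arithmetic behind the padding looks like the sketch below. `GGML_PAD` is ggml's macro from `ggml.h`, reproduced here only so the snippet stands alone; the function wrapper is illustrative:

```cpp
#include <cstdint>

// ggml's round-up-to-multiple-of-n macro (from ggml.h).
#define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))

int64_t padded_attention_size(int64_t past_token_len, int64_t token_len) {
    // e.g. 70 live tokens -> a 96-wide attention block
    return GGML_PAD(past_token_len + token_len, 32);
}
```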
// Not used for NPU - int64_t past_token_len = -1; int64_t attention_size = -1; - - int64_t token_len = -1; - int64_t past_token_len_from_inp_pos = -1; for (const auto& node : m_nodes) { - if (node->op == GGML_OP_ROPE && std::string(node->src[1]->name) == "inp_pos") { - if (node->src[1]->type != GGML_TYPE_I32) { - throw std::runtime_error("Expected cgraph input `inp_pos` to be of type GGML_TYPE_I32"); + if (node->op == GGML_OP_SOFT_MAX) { + auto* mask = node->src[1]; + if (std::string(mask->name).find("KQ_mask") != 0) { + throw std::runtime_error("Unexpected softmax node: " + std::string(mask->name)); } - token_len = node->src[1]->ne[0]; - past_token_len_from_inp_pos = ((int32_t*) (node->src[1]->data))[0]; - } - if (node->op == GGML_OP_CPY && ggml_is_contiguous(node)) { - assert(std::string(node->view_src->name).find("cache_k") == 0); - past_token_len = - (int64_t) (node->src[1]->op_params[0] / node->src[1]->nb[0] / m_head_size / m_num_heads_kv); - break; - } - if (node->op == GGML_OP_SET_ROWS && std::string(node->name).find("cache_k") == 0) { - assert(node->src[1]->type == GGML_TYPE_I64); - past_token_len = *(int64_t*) (node->src[1]->data); + attention_size = mask->ne[0]; break; } } - if (past_token_len == -1) { - throw std::runtime_error("Failed to find input \"cache_k\" in the graph"); - } - if (past_token_len != past_token_len_from_inp_pos) { - GGML_LOG_DEBUG("Mismatch between past_token_len from cache_k and inp_pos: %ld vs %ld\n", - past_token_len, - past_token_len_from_inp_pos); - } - { - std::string name = "past_token_len"; - auto param_node = std::make_shared(ov::element::i64, ov::Shape{1}); - param_node->set_friendly_name(name); - param_node->output(0).get_tensor().set_names({name}); - m_model_extra_inputs[name] = param_node; - - auto tensor = std::make_shared(ov::element::i64, ov::Shape{1}); - *tensor->data() = past_token_len; - m_model_extra_input_values[name] = tensor; - } - { - int64_t total_token_len = token_len + past_token_len; - attention_size = GGML_PAD(total_token_len, 32); std::string name = "attention_size"; auto param_node = std::make_shared(ov::element::i64, ov::Shape{1}); param_node->set_friendly_name(name); @@ -663,7 +613,6 @@ const std::string& GgmlOvDecoder::get_op_type() const { {GGML_OP_ADD, "GGML_OP_ADD" }, {GGML_OP_ADD1, "GGML_OP_ADD1" }, {GGML_OP_CONT, "GGML_OP_CONT" }, - {GGML_OP_CPY, "GGML_OP_CPY" }, {GGML_OP_DIV, "GGML_OP_DIV" }, {GGML_OP_DUP, "GGML_OP_DUP" }, {GGML_OP_GET_ROWS, "GGML_OP_GET_ROWS" }, diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp index 14999ba66b..fb5451be32 100644 --- a/ggml/src/ggml-openvino/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino.cpp @@ -328,10 +328,21 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con static const std::set supported_types{ GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_BF16, GGML_TYPE_I64, GGML_TYPE_I32}; - static const std::set supported_ops{GGML_OP_NONE, GGML_OP_ADD, GGML_OP_MUL, GGML_OP_MUL_MAT, - GGML_OP_VIEW, GGML_OP_CONT, GGML_OP_CPY, GGML_OP_RESHAPE, - GGML_OP_PERMUTE, GGML_OP_TRANSPOSE, GGML_OP_GET_ROWS, GGML_OP_ROPE, - GGML_OP_RMS_NORM, GGML_OP_SCALE, GGML_OP_SOFT_MAX, GGML_OP_SET_ROWS}; + static const std::set supported_ops{GGML_OP_NONE, + GGML_OP_ADD, + GGML_OP_MUL, + GGML_OP_MUL_MAT, + GGML_OP_VIEW, + GGML_OP_CONT, + GGML_OP_RESHAPE, + GGML_OP_PERMUTE, + GGML_OP_TRANSPOSE, + GGML_OP_GET_ROWS, + GGML_OP_ROPE, + GGML_OP_RMS_NORM, + GGML_OP_SCALE, + GGML_OP_SOFT_MAX, + GGML_OP_SET_ROWS}; static const std::set 
supported_unary_ops{ GGML_UNARY_OP_SILU, }; diff --git a/ggml/src/ggml-openvino/openvino/op/cpy.cpp b/ggml/src/ggml-openvino/openvino/op/cpy.cpp deleted file mode 100644 index 553f3c7966..0000000000 --- a/ggml/src/ggml-openvino/openvino/op/cpy.cpp +++ /dev/null @@ -1,73 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "../node_context.hpp" -#include "../op_table.hpp" -#include "../utils.hpp" - -namespace ov { -namespace frontend { -namespace ggml { -namespace op { - -OutputVector translate_cpy(const NodeContext& context) { - num_inputs_check(context, 2, 2); - - int op_case = context.get_op_case(); - FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2, "Unsupported CPY case"); - - auto src0 = context.get_input(0); - auto src1 = context.get_input(1); - - src0 = std::make_shared(src0, context.get_input_type(1)); - ov::Output res; - - if (context.is_static() && context.is_first_token()) { - res = src0; - return rename_outputs_with_suffix({res}, context.get_name()); - } - - if (op_case == 1) { - // Write K to cache_k - int64_t head_size = context.get_head_size(); - int64_t num_heads_kv = context.get_num_heads_kv(); - auto src0_reshape_shape = - ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector{-1, num_heads_kv, head_size}); - src0 = std::make_shared(src0, src0_reshape_shape, false); - auto indices = context.get_input("update_indices_k"); - auto updated = std::make_shared(src1, indices, src0); - res = std::make_shared(updated, std::make_shared(src1), false); - } else { - // Write V to cache_v - auto flattend_src0 = - std::make_shared(src0, - ov::op::v0::Constant::create(element::i64, Shape{1}, {-1}), - false); - auto src0_shape = context.get_input_shape(0).to_shape(); - int64_t total_head_size = src0_shape[1]; - auto reshaped_src1 = std::make_shared( - src1, - ov::op::v0::Constant::create(ov::element::i64, {2}, std::vector{total_head_size, -1}), - false); - auto indices = context.get_input("update_indices_v"); - auto updated = std::make_shared(reshaped_src1, indices, flattend_src0); - res = std::make_shared(updated, std::make_shared(src1), false); - } - - return rename_outputs_with_suffix({res}, context.get_name()); -} - -} // namespace op -} // namespace ggml -} // namespace frontend -} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/op_table.cpp b/ggml/src/ggml-openvino/openvino/op_table.cpp index 744f355a54..ce4b01c3b5 100644 --- a/ggml/src/ggml-openvino/openvino/op_table.cpp +++ b/ggml/src/ggml-openvino/openvino/op_table.cpp @@ -19,7 +19,6 @@ std::unordered_map get_supported_ops() { {"GGML_OP_ADD", op::translate_1to1_match_2_inputs }, {"GGML_OP_ADD1", op::translate_1to1_match_2_inputs }, {"GGML_OP_CONT", op::translate_cont }, - {"GGML_OP_CPY", op::translate_cpy }, {"GGML_OP_DIV", op::translate_1to1_match_2_inputs }, {"GGML_OP_GET_ROWS", op::translate_get_rows }, {"GGML_OP_MUL", op::translate_1to1_match_2_inputs}, diff --git a/ggml/src/ggml-openvino/openvino/op_table.hpp b/ggml/src/ggml-openvino/openvino/op_table.hpp index 631812aaa3..332930c3ac 100644 --- a/ggml/src/ggml-openvino/openvino/op_table.hpp +++ b/ggml/src/ggml-openvino/openvino/op_table.hpp @@ -12,7 +12,6 @@ namespace op { GGML_OP_CONVERTER(translate_add); GGML_OP_CONVERTER(translate_cont); -GGML_OP_CONVERTER(translate_cpy); GGML_OP_CONVERTER(translate_get_rows); GGML_OP_CONVERTER(translate_mul); GGML_OP_CONVERTER(translate_mulmat); diff --git a/ggml/src/ggml-openvino/openvino/translate_session.cpp 
b/ggml/src/ggml-openvino/openvino/translate_session.cpp index daef12fb90..a09247347f 100644 --- a/ggml/src/ggml-openvino/openvino/translate_session.cpp +++ b/ggml/src/ggml-openvino/openvino/translate_session.cpp @@ -76,65 +76,6 @@ void add_token_len(TensorMap& tensor_map) { tensor_map.insert({"token_len", token_len->output(0)}); } -void add_kv_update_indices(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) { - // cache_k layout: [S, N, H] (seq, num_heads, head_size) - // cache_v layout: [N, H, S] (num_heads, head_size, seq) - // When writing to cache_v, cache should be reshaped to [N*H, S] and v-curr should be flattened - auto past_token_len = tensor_map.at("past_token_len").get_node_shared_ptr(); - auto token_len = tensor_map.at("token_len").get_node_shared_ptr(); - - Output update_indices_k; - Output update_indices_v; - - auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); - auto zero_scalar = ov::op::v0::Constant::create(ov::element::i64, {}, {0}); - auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); - auto one_scalar = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {1}); - auto two = ov::op::v0::Constant::create(ov::element::i64, {1}, {2}); - - auto past_token_len_scalar = std::make_shared(past_token_len, zero); - auto token_len_scalar = std::make_shared(token_len, zero); - auto total_token_len_scalar = std::make_shared(past_token_len_scalar, token_len_scalar); - - Output update_indices = std::make_shared( - past_token_len_scalar, total_token_len_scalar, one_scalar, ov::element::i64); - if (ggml_model_decoder.is_static()) { - update_indices = past_token_len; - } - - update_indices_k = std::make_shared(update_indices, one); - update_indices_k.get_node_shared_ptr()->set_friendly_name("update_indices_k"); - tensor_map.insert({"update_indices_k", update_indices_k}); - - auto total_head_size = ggml_model_decoder.get_num_heads_kv() * ggml_model_decoder.get_head_size(); - auto total_head_size_node = ov::op::v0::Constant::create(ov::element::i64, {1}, {total_head_size}); - auto total_head_size_scalar = std::make_shared(total_head_size_node, zero); - - // 1D tensor of shape [total_head_size], values starting from 0 - auto range_row = - std::make_shared(zero_scalar, total_head_size_scalar, one_scalar, ov::element::i64); - auto range_row_reshaped = - std::make_shared(range_row, ov::op::v0::Constant::create(ov::element::i64, {2}, {1, 2})); - auto row_indices = std::make_shared( - range_row_reshaped, - std::make_shared(ov::OutputVector{total_head_size_node, token_len, one}, 0)); - - // 1D tensor of shape [token_len], values starting from past_token_len - auto range_col = update_indices; - auto range_col_reshaped = - std::make_shared(range_col, ov::op::v0::Constant::create(ov::element::i64, {2}, {0, 2})); - auto col_indices = std::make_shared( - range_col_reshaped, - std::make_shared(ov::OutputVector{total_head_size_node, token_len, one}, 0)); - - // Stack row_indices and col_indices along last axis: [total_head_size, token_len, 2] - update_indices_v = std::make_shared(OutputVector{row_indices, col_indices}, 2); - update_indices_v = std::make_shared( - update_indices_v, ov::op::v0::Constant::create(ov::element::i64, {2}, std::vector{-1, 2}), false); - update_indices_v.get_node_shared_ptr()->set_friendly_name("update_indices_v"); - tensor_map.insert({"update_indices_v", update_indices_v}); -} - void add_rope_sin_cos(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) { int32_t* rope_params = ggml_model_decoder.get_rope_params(); auto inp_pos 
= tensor_map.at("inp_pos").get_node_shared_ptr(); @@ -156,7 +97,6 @@ void add_rope_sin_cos(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) { // Create common patterns void preprocess(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) { add_token_len(tensor_map); - add_kv_update_indices(tensor_map, ggml_model_decoder); add_rope_sin_cos(tensor_map, ggml_model_decoder); } From f4123be9678c5da35c1bc4356a8ad3c30fa9fdd6 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Thu, 14 Aug 2025 16:27:24 +0800 Subject: [PATCH 113/254] Fix test-backend-ops --- ggml/src/ggml-openvino/ggml-decoder.cpp | 7 +++++-- ggml/src/ggml-openvino/ggml-openvino.cpp | 4 ++++ ggml/src/ggml-openvino/utils.cpp | 2 +- 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 38c7122f4c..6bc2c253e8 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -272,7 +272,7 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor* src) co input_shape = ov::PartialShape{m_context_size, m_num_heads_kv, m_head_size}; } else if (name.find("cache_v") == 0) { input_shape = ov::PartialShape{m_num_heads_kv, m_head_size, m_context_size}; - } else if (const auto* op = get_tensor_used_op(src); op->op == GGML_OP_SET_ROWS) { + } else if (const auto* op = get_tensor_used_op(src); op && op->op == GGML_OP_SET_ROWS) { input_shape = ov::PartialShape{1, 1, -1}; if (m_is_static) { if (m_is_first_token) { @@ -324,6 +324,9 @@ void GgmlOvDecoder::add_extra_inputs() { } const ggml_tensor* GgmlOvDecoder::get_tensor_used_op(const ggml_tensor* tensor) const { + if (tensor == nullptr) { + return nullptr; + } for (int i = 0; i < m_cgraph->n_nodes; i++) { const auto* node = m_cgraph->nodes[i]; for (int j = 0; j < GGML_MAX_SRC; j++) { @@ -332,7 +335,7 @@ const ggml_tensor* GgmlOvDecoder::get_tensor_used_op(const ggml_tensor* tensor) } } } - throw std::runtime_error("Tensor not found in cgraph"); + return nullptr; } const ggml_tensor* GgmlOvDecoder::get_tensor_from_name(const std::string& name) const { diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp index fb5451be32..13c2ef7462 100644 --- a/ggml/src/ggml-openvino/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino.cpp @@ -238,6 +238,10 @@ static ggml_backend_buffer_t ggml_backend_openvino_device_buffer_from_host_ptr(g static bool is_op_unsupported_case(const ggml_tensor* op) { if (op->op == GGML_OP_SOFT_MAX) { + if (op->src[2] != nullptr) { + GGML_LOG_WARN("OpenVINO backend does not support SOFT_MAX with sinks\n"); + return true; + } float scale = 1.0f; float max_bias = 0.0f; const auto* op_params = op->op_params; diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 83ab7353a9..522e922db8 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -329,7 +329,7 @@ ov::Tensor get_ov_input_tensor(std::shared_ptr ggml_decoder, cons } } else if (const auto* op = ggml_decoder->get_tensor_used_op(ggml_decoder->get_tensor_from_name(param_name)); - op->op == GGML_OP_SET_ROWS && is_static && is_first_token) { + op && op->op == GGML_OP_SET_ROWS && is_static && is_first_token) { input_tensor = ov::Tensor(ov::element::i64, ov::Shape{1}); } else { input_tensor = convert_ggml_input_to_ov(ggml_decoder, param_name); From a7b611bc933060cf2c051051cf5001c8f30cac36 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Thu, 14 Aug 2025 16:52:29 +0800 
Subject: [PATCH 114/254] Minor updates for raising PR --- CMakePresets.json | 20 -------------------- docs/build.md | 21 +++------------------ ggml/src/ggml-openvino/ggml-decoder.cpp | 3 +-- 3 files changed, 4 insertions(+), 40 deletions(-) diff --git a/CMakePresets.json b/CMakePresets.json index 392c357f37..b5afeb3c0f 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -1,26 +1,6 @@ { "version": 4, "configurePresets": [ - { - "name": "ReleaseOV", - "generator": "Ninja", - "binaryDir": "${sourceDir}/build/${presetName}", - "installDir": "${sourceDir}/build/install/${presetName}", - "cacheVariables": { - "CMAKE_BUILD_TYPE": "Release", - "GGML_OPENVINO": true, - "OpenVINO_DIR": "$env{OPENVINO_LLAMA_PATH}/build/Release" - } - }, - { - "name": "ReleaseCPU", - "generator": "Ninja", - "binaryDir": "${sourceDir}/build/${presetName}", - "installDir": "${sourceDir}/build/install/${presetName}", - "cacheVariables": { - "CMAKE_BUILD_TYPE": "Release" - } - }, { "name": "base", "hidden": true, diff --git a/docs/build.md b/docs/build.md index 1424a06508..9e44f18eae 100644 --- a/docs/build.md +++ b/docs/build.md @@ -698,7 +698,7 @@ To read documentation for how to build on IBM Z & LinuxONE, [click here](./build ## OpenVINO -[OpenVINO](https://docs.openvino.ai/2025/index.html) is an open-source toolkit for optimizing and deploying high-performance AI inference, specifically designed for Intel hardware, including CPUs, GPUs, and NPUs, in the cloud, on-premises, and on the edge. +[OpenVINO](https://docs.openvino.ai/2025/index.html) is an open-source toolkit for optimizing and deploying high-performance AI inference, specifically designed for Intel hardware, including CPUs, GPUs, and NPUs, in the cloud, on-premises, and on the edge. The OpenVINO backend enhances performance by leveraging hardware-specific optimizations and can be enabled for use with llama.cpp. Follow the instructions below to install OpenVINO runtime and build llama.cpp with OpenVINO support. @@ -800,9 +800,8 @@ export GGML_OPENVINO_CACHE_DIR=/tmp/ov_cache Control OpenVINO behavior using these environment variables: -- **`GGML_OPENVINO_DEVICE`**: Specify the target device for OpenVINO inference. If not set, automatically selects the first available device in priority order: GPU, CPU, NPU. When set to `NPU` to use Intel NPUs, it enables static compilation mode for optimal performance. -- **`GGML_OPENVINO_CACHE_DIR`**: Directory for model caching (recommended: `/tmp/ov_cache`). If set, enables model caching in OpenVINO. Note: Not supported when using NPU devices yet. -- **`GGML_OPENVINO_WEIGHT_AS_INPUT`**: Pass the weights as input to the OpenVINO model instead of creating Constant nodes for them. +- **`GGML_OPENVINO_DEVICE`**: Specify the target device for OpenVINO inference. If not set, automatically selects the first available device in priority order: GPU, CPU, NPU. When set to `NPU` to use Intel NPUs, it enables static compilation mode for optimal performance. +- **`GGML_OPENVINO_CACHE_DIR`**: Directory for model caching (recommended: `/tmp/ov_cache`). If set, enables model caching in OpenVINO. Note: Not supported when using NPU devices yet. - **`GGML_OPENVINO_PROFILING`**: Enable execution time profiling. - **`GGML_OPENVINO_DUMP_CGRAPH`**: Save compute graph to `cgraph.txt`. - **`GGML_OPENVINO_DUMP_IR`**: Export OpenVINO IR files with timestamps. 
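All of these switches are plain environment variables that the backend reads with `getenv` at graph-build time. A small sketch of the pattern, mirroring the `GGML_OPENVINO_DUMP_CGRAPH` check visible in `ggml-decoder.cpp` below (function names here are illustrative, not the backend's API):

```cpp
#include <cstdlib>
#include <string>

// Sketch: gating optional debug output on an environment variable.
bool dump_cgraph_enabled() {
    return std::getenv("GGML_OPENVINO_DUMP_CGRAPH") != nullptr;
}

// Sketch: resolving the cache directory; empty when the variable is unset.
std::string cache_dir_or_default() {
    const char* dir = std::getenv("GGML_OPENVINO_CACHE_DIR");
    return dir ? std::string(dir) : std::string();
}
```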
@@ -817,20 +816,6 @@ export GGML_OPENVINO_PROFILING=1 ./build/ReleaseOV/bin/llama-simple -m ~/models/Llama-3.2-1B-Instruct.fp16.gguf -n 50 "The story of AI is " ``` -> **Note:** To apply your code changes, clear the `GGML_OPENVINO_CACHE_DIR` directory and rebuild the project. - -### Using Llama.cpp's Built-in CPU Backend (for Comparison) - -To compare performance with the default CPU backend: - -```bash -# Build CPU-only version -cmake --preset ReleaseCPU -cmake --build build/ReleaseCPU --parallel - -# Run with the default CPU backend -./build/ReleaseCPU/bin/llama-simple -m ~/models/Llama-3.2-1B-Instruct.fp16.gguf -n 50 "The story of AI is " -``` ## Notes about GPU-accelerated backends diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 6bc2c253e8..09919c8505 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -57,8 +57,7 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_cgraph* cgraph, } if (getenv("GGML_OPENVINO_DUMP_CGRAPH")) { - auto timestamp = (long long) ggml_time_us(); - std::string filename = "cgraph_" + std::to_string(timestamp) + ".txt"; + std::string filename = "cgraph.txt"; dump_cgraph(cgraph, filename); }

From 14c8a85c32decfb043a3b2320e6ac4d2c9681ef5 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Wed, 27 Aug 2025 17:06:35 +0800 Subject: [PATCH 115/254] Perf: RMS fused to OV internal RMS op

--- ggml/src/ggml-openvino/openvino/op/rms_norm.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp b/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp index 211692a3c7..c9df4c42f3 100644 --- a/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp +++ b/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp @@ -3,6 +3,7 @@ #include #include #include +#include #include #include @@ -19,18 +20,17 @@ OutputVector translate_rms_norm(const NodeContext& context) { num_inputs_check(context, 1, 1); auto input_node = context.get_input(0); - auto square = std::make_shared<ov::op::v1::Multiply>(input_node, input_node); + auto square = std::make_shared<ov::op::v1::Power>( + input_node, ov::op::v0::Constant::create(ov::element::f32, ov::Shape{1}, {2.0f})); - auto mean = - std::make_shared<ov::op::v1::ReduceMean>(square, - ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {2}), - true); + auto mean = std::make_shared<ov::op::v1::ReduceMean>( + square, ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {-1}), true); float eps; memcpy(&eps, context.get_output_op_params(0), sizeof(float)); auto rms = std::make_shared<ov::op::v0::Sqrt>( - std::make_shared<ov::op::v1::Add>(mean, ov::op::v0::Constant::create(ov::element::f32, ov::Shape{}, {eps}))); + std::make_shared<ov::op::v1::Add>(mean, ov::op::v0::Constant::create(ov::element::f32, ov::Shape{1}, {eps}))); auto reciprocal = std::make_shared<ov::op::v1::Divide>(ov::op::v0::Constant::create(ov::element::f32, ov::Shape{1}, {1.0f}), rms);

From 65e1b1af6d7e0f61da15a42d4e5c0ace67862fca Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Thu, 4 Sep 2025 17:42:39 +0800 Subject: [PATCH 116/254] Fix after rebasing

- Layouts of cache k and cache v are unified: [seq, n_head, head_size] - Add CPY and FLASH_ATTN_EXT; flash attn is not used yet - Skip test-backend-ops due to flash attn test crash - Add mutex around graph conversion to avoid test-thread-safety failures in the future - Update NPU config - Update GPU config to disable SDPA opt to make phi-3 run

--- ggml/src/ggml-openvino/ggml-decoder.cpp | 96 +++++---- ggml/src/ggml-openvino/ggml-openvino.cpp | 14 +- ggml/src/ggml-openvino/openvino/op/cont.cpp | 5 +- ggml/src/ggml-openvino/openvino/op/cpy.cpp | 20
++ .../openvino/op/flash_attn_ext.cpp | 35 ++++ .../ggml-openvino/openvino/op/get_rows.cpp | 1 - ggml/src/ggml-openvino/openvino/op/mulmat.cpp | 19 +- .../src/ggml-openvino/openvino/op/permute.cpp | 5 +- ggml/src/ggml-openvino/openvino/op/rope.cpp | 1 - .../ggml-openvino/openvino/op/set_rows.cpp | 16 +- .../openvino/op/{soft_max.cpp => softmax.cpp} | 0 .../ggml-openvino/openvino/op/transpose.cpp | 3 +- ggml/src/ggml-openvino/openvino/op_table.cpp | 40 ++-- ggml/src/ggml-openvino/openvino/op_table.hpp | 2 + .../openvino/pass/fuse_to_sdpa.cpp | 4 +- ggml/src/ggml-openvino/openvino/utils.cpp | 1 + ggml/src/ggml-openvino/utils.cpp | 190 ++++++++++-------- ggml/src/ggml-openvino/utils.h | 3 +- tests/CMakeLists.txt | 4 +- 19 files changed, 267 insertions(+), 192 deletions(-) create mode 100644 ggml/src/ggml-openvino/openvino/op/cpy.cpp create mode 100644 ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp rename ggml/src/ggml-openvino/openvino/op/{soft_max.cpp => softmax.cpp} (100%) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 09919c8505..0ee2338199 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -73,6 +73,11 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_cgraph* cgraph, } GgmlOvDecoder::GgmlOvDecoder(struct ggml_cgraph* cgraph) { + if (getenv("GGML_OPENVINO_DUMP_CGRAPH")) { + std::string filename = "cgraph.txt"; + dump_cgraph(cgraph, filename); + } + m_cgraph = cgraph; for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) { auto* cur_node = cgraph->nodes[node_n]; @@ -173,32 +178,33 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, bool naive) { break; } case GGML_OP_CONT: { - if (ggml_nelements(node->src[0]) == ggml_nelements(node->src[0]->view_src)) { - // The input comes from a PERMUTE + if (node->src[0]->op == GGML_OP_PERMUTE) { m_op_case = 1; - } else { + } else if (node->src[0]->op == GGML_OP_TRANSPOSE) { + m_op_case = 2; + } else if (node->src[0]->op == GGML_OP_VIEW) { // The input comes from a VIEW which is subtensor - m_op_case = 2; - } - break; - } - case GGML_OP_SET_ROWS: { - if (std::string(node->name).find("cache_k") == 0) { - m_op_case = 1; - } else { - m_op_case = 2; + m_op_case = 3; } break; } case GGML_OP_PERMUTE: { - if (node->src[0]->view_src == nullptr) { - // Permute Qcur + if (node->src[0]->op != GGML_OP_VIEW) { m_op_case = 1; } else if (ggml_is_contiguous(node->src[0])) { // Permute cache_k (view) m_op_case = 2; } else { - // Permute cache_v (view) + // Permute cache_v (view), deprecated, cache_v will also fall to case 2 + m_op_case = 3; + } + break; + } + case GGML_OP_MUL_MAT: { + if (node->src[0]->op == GGML_OP_CONT && node->src[0]->src[0]->op == GGML_OP_TRANSPOSE) { + m_op_case = 2; + } else if (node->src[0]->op == GGML_OP_VIEW && node->src[1]->op == GGML_OP_VIEW) { + // test-backend-ops case m_op_case = 3; } break; @@ -206,16 +212,12 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, bool naive) { case GGML_OP_GET_ROWS: { if (node->src[1]->op == GGML_OP_VIEW) { m_op_case = 2; - } else { - m_op_case = 1; } break; } case GGML_OP_ROPE: { if (node->src[0]->op == GGML_OP_VIEW) { m_op_case = 2; - } else { - m_op_case = 1; } break; } @@ -270,19 +272,9 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor* src) co } else if (name.find("cache_k") == 0) { input_shape = ov::PartialShape{m_context_size, m_num_heads_kv, m_head_size}; } else if (name.find("cache_v") == 0) { - input_shape = ov::PartialShape{m_num_heads_kv, m_head_size, 
m_context_size}; + input_shape = ov::PartialShape{m_context_size, m_num_heads_kv, m_head_size}; } else if (const auto* op = get_tensor_used_op(src); op && op->op == GGML_OP_SET_ROWS) { - input_shape = ov::PartialShape{1, 1, -1}; - if (m_is_static) { - if (m_is_first_token) { - // Dummy static shape, since the indices are not used in this case - input_shape = ov::PartialShape{1}; - } else if (std::string(op->name).find("cache_k") == 0) { - input_shape = ov::PartialShape{1, 1, 1}; - } else { - input_shape = ov::PartialShape{1, 1, m_num_heads_kv * m_head_size}; - } - } + input_shape = ov::PartialShape{1, 1, m_is_static ? 1 : -1}; } else if (src->op == GGML_OP_VIEW) { // This case is added to make test-backend-ops work input_shape = ov::PartialShape{get_shape(src->view_src)}; @@ -610,26 +602,28 @@ void GgmlOvDecoder::visit_subgraph(std::function ops = { - {GGML_OP_NONE, "GGML_OP_NONE" }, - {GGML_OP_ACC, "GGML_OP_ACC" }, - {GGML_OP_ADD, "GGML_OP_ADD" }, - {GGML_OP_ADD1, "GGML_OP_ADD1" }, - {GGML_OP_CONT, "GGML_OP_CONT" }, - {GGML_OP_DIV, "GGML_OP_DIV" }, - {GGML_OP_DUP, "GGML_OP_DUP" }, - {GGML_OP_GET_ROWS, "GGML_OP_GET_ROWS" }, - {GGML_OP_MUL, "GGML_OP_MUL" }, - {GGML_OP_MUL_MAT, "GGML_OP_MUL_MAT" }, - {GGML_OP_PERMUTE, "GGML_OP_PERMUTE" }, - {GGML_OP_RESHAPE, "GGML_OP_RESHAPE" }, - {GGML_OP_RMS_NORM, "GGML_OP_RMS_NORM" }, - {GGML_OP_ROPE, "GGML_OP_ROPE" }, - {GGML_OP_SCALE, "GGML_OP_SCALE" }, - {GGML_OP_SOFT_MAX, "GGML_OP_SOFT_MAX" }, - {GGML_OP_SUB, "GGML_OP_SUB" }, - {GGML_OP_TRANSPOSE, "GGML_OP_TRANSPOSE"}, - {GGML_OP_VIEW, "GGML_OP_VIEW" }, - {GGML_OP_SET_ROWS, "GGML_OP_SET_ROWS" }, + {GGML_OP_NONE, "GGML_OP_NONE" }, + {GGML_OP_ACC, "GGML_OP_ACC" }, + {GGML_OP_ADD, "GGML_OP_ADD" }, + {GGML_OP_ADD1, "GGML_OP_ADD1" }, + {GGML_OP_CONT, "GGML_OP_CONT" }, + {GGML_OP_DIV, "GGML_OP_DIV" }, + {GGML_OP_DUP, "GGML_OP_DUP" }, + {GGML_OP_GET_ROWS, "GGML_OP_GET_ROWS" }, + {GGML_OP_MUL, "GGML_OP_MUL" }, + {GGML_OP_MUL_MAT, "GGML_OP_MUL_MAT" }, + {GGML_OP_PERMUTE, "GGML_OP_PERMUTE" }, + {GGML_OP_RESHAPE, "GGML_OP_RESHAPE" }, + {GGML_OP_RMS_NORM, "GGML_OP_RMS_NORM" }, + {GGML_OP_ROPE, "GGML_OP_ROPE" }, + {GGML_OP_SCALE, "GGML_OP_SCALE" }, + {GGML_OP_SOFT_MAX, "GGML_OP_SOFT_MAX" }, + {GGML_OP_SUB, "GGML_OP_SUB" }, + {GGML_OP_TRANSPOSE, "GGML_OP_TRANSPOSE" }, + {GGML_OP_VIEW, "GGML_OP_VIEW" }, + {GGML_OP_SET_ROWS, "GGML_OP_SET_ROWS" }, + {GGML_OP_CPY, "GGML_OP_CPY" }, + {GGML_OP_FLASH_ATTN_EXT, "GGML_OP_FLASH_ATTN_EXT"}, }; static const std::map unary_ops = { {GGML_UNARY_OP_ABS, "GGML_UNARY_OP_ABS" }, diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp index 13c2ef7462..e3eaf40254 100644 --- a/ggml/src/ggml-openvino/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino.cpp @@ -270,12 +270,14 @@ static bool is_op_unsupported_case(const ggml_tensor* op) { } } - if (op->op == GGML_OP_MUL_MAT) { - if ((op->src[0]->view_src && op->src[0]->op != GGML_OP_PERMUTE) || - (op->src[1]->view_src && op->src[1]->op != GGML_OP_PERMUTE)) { - GGML_LOG_WARN("OpenVINO backend does not support MUL_MAT with view_src tensors that are not PERMUTE\n"); + if (op->op == GGML_OP_CPY) { + if (op->src[1] != op) { + GGML_LOG_WARN("OpenVINO backend only supports CPY that is a cast\n"); return true; } + } + + if (op->op == GGML_OP_MUL_MAT) { if (op->src[0]->type == GGML_TYPE_F16 && op->src[1]->type == GGML_TYPE_F16) { // Has accuracy issue, try enabling this and see `test-backend-ops -o "MUL_MAT"` GGML_LOG_WARN("OpenVINO backend does not support MUL_MAT with two F16 tensors\n"); @@ 
-346,7 +348,9 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con GGML_OP_RMS_NORM, GGML_OP_SCALE, GGML_OP_SOFT_MAX, - GGML_OP_SET_ROWS}; + GGML_OP_SET_ROWS, + GGML_OP_FLASH_ATTN_EXT, + GGML_OP_CPY}; static const std::set supported_unary_ops{ GGML_UNARY_OP_SILU, }; diff --git a/ggml/src/ggml-openvino/openvino/op/cont.cpp b/ggml/src/ggml-openvino/openvino/op/cont.cpp index f83c0e62df..9ae0f420cc 100644 --- a/ggml/src/ggml-openvino/openvino/op/cont.cpp +++ b/ggml/src/ggml-openvino/openvino/op/cont.cpp @@ -19,7 +19,7 @@ OutputVector translate_cont(const NodeContext& context) { num_inputs_check(context, 1, 1); int op_case = context.get_op_case(); - FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2, "Unsupported CONT case"); + FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2 || op_case == 3, "Unsupported CONT case"); auto src_shape = context.get_input_shape(0).to_shape(); auto dst_shape = context.get_output_shape(0).to_shape(); @@ -32,6 +32,9 @@ OutputVector translate_cont(const NodeContext& context) { context.get_input(0), ov::op::v0::Constant::create(ov::element::i64, {dst_shape.size()}, dst_shape), false); + } else if (op_case == 2) { + // The input comes from a TRANSPOSE + return {context.get_input(0)}; } else { // The input comes from a VIEW res = process_view_input(context, 0); diff --git a/ggml/src/ggml-openvino/openvino/op/cpy.cpp b/ggml/src/ggml-openvino/openvino/op/cpy.cpp new file mode 100644 index 0000000000..54b49018a9 --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/op/cpy.cpp @@ -0,0 +1,20 @@ +#include +#include +#include "../node_context.hpp" +#include "../op_table.hpp" +#include "../utils.hpp" + +namespace ov { +namespace frontend { +namespace ggml { +namespace op { + +OutputVector translate_cpy(const NodeContext& context) { + auto res = std::make_shared(context.get_input(0), context.get_output_type(0)); + return rename_outputs_with_suffix({res}, context.get_name()); +} + +} // namespace op +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp b/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp new file mode 100644 index 0000000000..5c0ad4c20e --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp @@ -0,0 +1,35 @@ +#include +#include +#include +#include "../node_context.hpp" +#include "../op_table.hpp" +#include "../utils.hpp" + +namespace ov { +namespace frontend { +namespace ggml { +namespace op { + +OutputVector translate_flash_attn_ext(const NodeContext& context) { + num_inputs_check(context, 4, 4); + auto q_f32 = context.get_input(0); + auto k = context.get_input(1); + auto v = context.get_input(2); + auto mask = context.get_input(3); + + float* params = reinterpret_cast(context.get_output_op_params(0)); + float scale = params[0]; + // float max_bias = params[1]; + // float logit_softcap = params[2]; + + auto q = std::make_shared(q_f32, ov::element::f16); + auto scale_node = std::make_shared(ov::element::f16, ov::Shape{}, std::vector{scale}); + auto res = std::make_shared(q, k, v , mask, scale_node, false); + auto res_f32 = std::make_shared(res, ov::element::f32); + return rename_outputs_with_suffix({res_f32}, context.get_name()); +} + +} // namespace op +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/op/get_rows.cpp b/ggml/src/ggml-openvino/openvino/op/get_rows.cpp index c97bbbf5a3..36795fd43e 100644 --- a/ggml/src/ggml-openvino/openvino/op/get_rows.cpp +++ 
b/ggml/src/ggml-openvino/openvino/op/get_rows.cpp @@ -21,7 +21,6 @@ OutputVector translate_get_rows(const NodeContext& context) { num_inputs_check(context, 2, 2); int op_case = context.get_op_case(); - FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2, "Unsupported CONT case"); Output res; auto data = context.get_input(0); diff --git a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp index 9148a27517..150fbcbb88 100644 --- a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp +++ b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp @@ -27,15 +27,26 @@ namespace op { OutputVector translate_mulmat(const NodeContext& context) { num_inputs_check(context, 2, 2); + int op_case = context.get_op_case(); + ov::Output res; ov::Output B = context.get_input(0); ov::Output A = context.get_input(1); + bool transpose_b = true; + if (op_case == 2) { + B = B.get_node_shared_ptr()->input_value(0); + transpose_b = false; + } else if (op_case == 3) { + B = process_view_input(context, 0); + A = process_view_input(context, 1); + } + bool convert_out_type = false; if (ov::op::util::is_constant(B.get_node()) && context.get_input_type(0) != context.get_input_type(1)) { - B = std::make_shared(context.get_input(0), context.get_input_type(1)); + B = std::make_shared(B, context.get_input_type(1)); } else if (context.get_input_type(0) != context.get_input_type(1)) { - A = std::make_shared(context.get_input(1), context.get_input_type(0)); + A = std::make_shared(A, context.get_input_type(0)); convert_out_type = true; } @@ -72,10 +83,10 @@ OutputVector translate_mulmat(const NodeContext& context) { } if (convert_out_type) { - auto result_lp = std::make_shared(A, B, false, true); + auto result_lp = std::make_shared(A, B, false, transpose_b); res = std::make_shared(result_lp, context.get_output_type(0)); } else { - res = std::make_shared(A, B, false, true); + res = std::make_shared(A, B, false, transpose_b); } return rename_outputs_with_suffix({res}, context.get_name()); diff --git a/ggml/src/ggml-openvino/openvino/op/permute.cpp b/ggml/src/ggml-openvino/openvino/op/permute.cpp index 978b5377fb..fcb091016a 100644 --- a/ggml/src/ggml-openvino/openvino/op/permute.cpp +++ b/ggml/src/ggml-openvino/openvino/op/permute.cpp @@ -21,13 +21,12 @@ OutputVector translate_permute(const NodeContext& context) { num_inputs_check(context, 1, 1); int op_case = context.get_op_case(); - FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2 || op_case == 3, "Unsupported CONT case"); + FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2 || op_case == 3, "Unsupported PERMUTE case"); ov::Output res; if (op_case == 1) { - auto perm = argsort_descend(context.get_output_stride(0)); res = std::make_shared(context.get_input(0), - ov::op::v0::Constant::create(ov::element::i64, {3}, perm)); + ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 0, 2})); } else { auto src = context.get_input(0); auto attention_size = context.get_input("attention_size"); diff --git a/ggml/src/ggml-openvino/openvino/op/rope.cpp b/ggml/src/ggml-openvino/openvino/op/rope.cpp index 7951a1e012..4b1e3b500c 100644 --- a/ggml/src/ggml-openvino/openvino/op/rope.cpp +++ b/ggml/src/ggml-openvino/openvino/op/rope.cpp @@ -27,7 +27,6 @@ OutputVector translate_rope(const NodeContext& context) { num_inputs_check(context, 2, 3); int op_case = context.get_op_case(); - FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2, "Unsupported CONT case"); ov::Output res; diff --git a/ggml/src/ggml-openvino/openvino/op/set_rows.cpp 
b/ggml/src/ggml-openvino/openvino/op/set_rows.cpp index 758454cd9d..0d94a95e44 100644 --- a/ggml/src/ggml-openvino/openvino/op/set_rows.cpp +++ b/ggml/src/ggml-openvino/openvino/op/set_rows.cpp @@ -32,21 +32,7 @@ OutputVector translate_set_rows(const NodeContext& context) { FRONT_END_OP_CONVERSION_CHECK(dst_shape[0] == 1, "Unsupported shape in SET_ROWS"); if (context.is_static() && context.is_first_token()) { - Output res; - if (context.get_op_case() == 2) { - res = std::make_shared( - data, - ov::op::v0::Constant::create( - ov::element::i64, - {3}, - {context.get_context_size(), context.get_num_heads_kv(), context.get_head_size()}), - false); - res = std::make_shared( - res, ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 2, 0})); - } else { - res = data; - } - return rename_outputs_with_suffix({res}, context.get_name()); + return rename_outputs_with_suffix({data}, context.get_name()); } auto indices = context.get_input(1); diff --git a/ggml/src/ggml-openvino/openvino/op/soft_max.cpp b/ggml/src/ggml-openvino/openvino/op/softmax.cpp similarity index 100% rename from ggml/src/ggml-openvino/openvino/op/soft_max.cpp rename to ggml/src/ggml-openvino/openvino/op/softmax.cpp diff --git a/ggml/src/ggml-openvino/openvino/op/transpose.cpp b/ggml/src/ggml-openvino/openvino/op/transpose.cpp index b35f1fb861..c585dffa6e 100644 --- a/ggml/src/ggml-openvino/openvino/op/transpose.cpp +++ b/ggml/src/ggml-openvino/openvino/op/transpose.cpp @@ -12,9 +12,8 @@ namespace op { OutputVector translate_transpose(const NodeContext& context) { num_inputs_check(context, 1, 1); - auto perm = argsort_descend(context.get_output_stride(0)); auto res = std::make_shared(context.get_input(0), - ov::op::v0::Constant::create(ov::element::i64, {3}, perm)); + ov::op::v0::Constant::create(ov::element::i64, {3}, {0, 2, 1})); return rename_outputs_with_suffix({res}, context.get_name()); } diff --git a/ggml/src/ggml-openvino/openvino/op_table.cpp b/ggml/src/ggml-openvino/openvino/op_table.cpp index ce4b01c3b5..ee55f84b96 100644 --- a/ggml/src/ggml-openvino/openvino/op_table.cpp +++ b/ggml/src/ggml-openvino/openvino/op_table.cpp @@ -16,25 +16,27 @@ namespace ggml { std::unordered_map get_supported_ops() { using namespace ov::op; return { - {"GGML_OP_ADD", op::translate_1to1_match_2_inputs }, - {"GGML_OP_ADD1", op::translate_1to1_match_2_inputs }, - {"GGML_OP_CONT", op::translate_cont }, - {"GGML_OP_DIV", op::translate_1to1_match_2_inputs }, - {"GGML_OP_GET_ROWS", op::translate_get_rows }, - {"GGML_OP_MUL", op::translate_1to1_match_2_inputs}, - {"GGML_OP_MUL_MAT", op::translate_mulmat }, - {"GGML_OP_PERMUTE", op::translate_permute }, - {"GGML_OP_RESHAPE", op::translate_reshape }, - {"GGML_OP_RMS_NORM", op::translate_rms_norm }, - {"GGML_OP_ROPE", op::translate_rope }, - {"GGML_OP_SCALE", op::translate_scale }, - {"GGML_OP_SOFT_MAX", op::translate_soft_max }, - {"GGML_OP_SUB", op::translate_1to1_match_2_inputs}, - {"GGML_OP_TRANSPOSE", op::translate_transpose }, - {"GGML_UNARY_OP_SILU", op::translate_unary_silu }, - {"GGML_OP_VIEW", op::translate_view }, - {"GGML_GLU_OP_SWIGLU", op::translate_glu_swiglu }, - {"GGML_OP_SET_ROWS", op::translate_set_rows }, + {"GGML_OP_ADD", op::translate_1to1_match_2_inputs }, + {"GGML_OP_ADD1", op::translate_1to1_match_2_inputs }, + {"GGML_OP_CONT", op::translate_cont }, + {"GGML_OP_DIV", op::translate_1to1_match_2_inputs }, + {"GGML_OP_GET_ROWS", op::translate_get_rows }, + {"GGML_OP_MUL", op::translate_1to1_match_2_inputs}, + {"GGML_OP_MUL_MAT", op::translate_mulmat }, + {"GGML_OP_PERMUTE", 
op::translate_permute }, + {"GGML_OP_RESHAPE", op::translate_reshape }, + {"GGML_OP_RMS_NORM", op::translate_rms_norm }, + {"GGML_OP_ROPE", op::translate_rope }, + {"GGML_OP_SCALE", op::translate_scale }, + {"GGML_OP_SOFT_MAX", op::translate_soft_max }, + {"GGML_OP_SUB", op::translate_1to1_match_2_inputs}, + {"GGML_OP_TRANSPOSE", op::translate_transpose }, + {"GGML_UNARY_OP_SILU", op::translate_unary_silu }, + {"GGML_OP_VIEW", op::translate_view }, + {"GGML_GLU_OP_SWIGLU", op::translate_glu_swiglu }, + {"GGML_OP_SET_ROWS", op::translate_set_rows }, + {"GGML_OP_CPY", op::translate_cpy }, + {"GGML_OP_FLASH_ATTN_EXT", op::translate_flash_attn_ext }, }; } diff --git a/ggml/src/ggml-openvino/openvino/op_table.hpp b/ggml/src/ggml-openvino/openvino/op_table.hpp index 332930c3ac..faa61f5f6c 100644 --- a/ggml/src/ggml-openvino/openvino/op_table.hpp +++ b/ggml/src/ggml-openvino/openvino/op_table.hpp @@ -26,6 +26,8 @@ GGML_OP_CONVERTER(translate_transpose); GGML_OP_CONVERTER(translate_view); GGML_OP_CONVERTER(translate_glu_swiglu); GGML_OP_CONVERTER(translate_set_rows); +GGML_OP_CONVERTER(translate_cpy); +GGML_OP_CONVERTER(translate_flash_attn_ext); } // namespace op diff --git a/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.cpp b/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.cpp index 1b7ac60271..c36579910d 100644 --- a/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.cpp +++ b/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.cpp @@ -40,11 +40,9 @@ FuseToSDPA::FuseToSDPA() { auto mask = pattern_to_output[m_mask]; auto scale = pattern_to_output[m_scale]; - auto v_trans = - register_new_node(v, ov::op::v0::Constant::create(ov::element::i64, {3}, {0, 2, 1})); auto mask_f16 = register_new_node(mask, ov::element::f16); auto scale_f16 = register_new_node(scale, ov::element::f16); - auto sdpa = std::make_shared(q, k, v_trans, mask_f16, scale_f16, false); + auto sdpa = std::make_shared(q, k, v, mask_f16, scale_f16, false); ov::replace_node(m.get_match_root(), sdpa); ov::copy_runtime_info(m.get_matched_nodes(), sdpa); diff --git a/ggml/src/ggml-openvino/openvino/utils.cpp b/ggml/src/ggml-openvino/openvino/utils.cpp index 9634900753..c4197ccc3a 100644 --- a/ggml/src/ggml-openvino/openvino/utils.cpp +++ b/ggml/src/ggml-openvino/openvino/utils.cpp @@ -65,6 +65,7 @@ OutputVector rename_outputs_with_suffix(const OutputVector& outputs, const std:: name += "_"; name += suffix; node->set_friendly_name(name); + // std::cout << name << " " << output.get_partial_shape() << std::endl; } return outputs; } diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 522e922db8..473fa72f99 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -77,8 +78,10 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c bool is_static = device == "NPU" ? 
true : false;
     ov::AnyMap config;
 
-    if (device == "NPU") {
-        config = get_npu_config();
+    if (device == "GPU") {
+        config = {
+            {"GPU_ENABLE_SDPA_OPTIMIZATION", "0"}
+        };
     }
 
     if (is_naive(cgraph)) {
@@ -92,6 +95,7 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
         core.set_property(ov::cache_dir(cache_dir));
     }
 
+    static std::mutex cache_mutex;
     static std::unordered_map> infer_request_cache;
     static std::unordered_map> ov_input_names_cache;
     static std::unordered_map> ov_output_names_cache;
@@ -105,89 +109,93 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
     int64_t conversion_end_time;
     int64_t compile_end_time;
 
-    auto it = infer_request_cache.find(cgraph);
-    if (it != infer_request_cache.end()) {
-        std::map> model_weights;
-        ggml_decoder = std::make_shared(cgraph, model_weights, is_static, false);
-        decoder_end_time = ggml_time_us();
+    {
+        std::lock_guard lock(cache_mutex);
 
-        // For NPU for the first time we call kvcache modle, pop the compiled kvcache model from cache
-        if (is_static && compiled_model_cache.find(cgraph) != compiled_model_cache.end()) {
-            infer_request_cache[cgraph] =
-                std::make_shared(compiled_model_cache[cgraph].create_infer_request());
-            compiled_model_cache.erase(cgraph);
-        }
-        infer_request = *infer_request_cache[cgraph];
-
-        conversion_end_time = ggml_time_us();
-        compile_end_time = conversion_end_time;
-    } else {
-        std::shared_ptr model;
-        auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph);
-
-        if (is_static) {
-            ggml_decoder = std::make_shared(cgraph, model_weights, is_static, true);
-            auto ggml_decoder_kvcache = std::make_shared(cgraph, model_weights, is_static, false);
+        auto it = infer_request_cache.find(cgraph);
+        if (it != infer_request_cache.end()) {
+            std::map> model_weights;
+            ggml_decoder = std::make_shared(cgraph, model_weights, is_static, false);
             decoder_end_time = ggml_time_us();
 
-            auto input_model = std::make_shared(ggml_decoder);
-            auto input_model_kvcache = std::make_shared(ggml_decoder_kvcache);
-
-            model = ov::frontend::ggml::FrontEnd::convert(input_model);
-            ggml_decoder->clear_model_weights();
-            auto model_kvcache = ov::frontend::ggml::FrontEnd::convert(input_model_kvcache);
-            ggml_decoder_kvcache->clear_model_weights();
-            conversion_end_time = ggml_time_us();
-
-            auto compiled_model = core.compile_model(model, device, config);
-            auto compiled_model_kvcache = core.compile_model(model_kvcache, device, config);
-            compiled_model_cache[cgraph] = compiled_model_kvcache;
-            compile_end_time = ggml_time_us();
-
-            infer_request_cache[cgraph] = std::make_shared(compiled_model.create_infer_request());
-            infer_request = *infer_request_cache[cgraph];
-            compiled_model_cache[cgraph] = compiled_model_kvcache;
-
-            if (getenv("GGML_OPENVINO_DUMP_IR")) {
-                char timestamped_filename[64];
-                auto timestamp = (long long) ggml_time_us();
-                snprintf(timestamped_filename, sizeof(timestamped_filename), "model_prefill_%lld.xml", timestamp);
-                ov::serialize(model, timestamped_filename);
-                snprintf(timestamped_filename, sizeof(timestamped_filename), "model_kvcache_%lld.xml", timestamp);
-                ov::serialize(model_kvcache, timestamped_filename);
+            // For NPU, the first time we call the kvcache model, pop the compiled kvcache model from the cache
+            if (is_static && compiled_model_cache.find(cgraph) != compiled_model_cache.end()) {
+                infer_request_cache[cgraph] =
+                    std::make_shared(compiled_model_cache[cgraph].create_infer_request());
+                compiled_model_cache.erase(cgraph);
+            }
+            infer_request = 
*infer_request_cache[cgraph]; + + conversion_end_time = ggml_time_us(); + compile_end_time = conversion_end_time; } else { - ggml_decoder = std::make_shared(cgraph, model_weights, is_static, true); - decoder_end_time = ggml_time_us(); + std::shared_ptr model; + auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph); - auto input_model = std::make_shared(ggml_decoder); - model = ov::frontend::ggml::FrontEnd::convert(input_model); - ggml_decoder->clear_model_weights(); - conversion_end_time = ggml_time_us(); + if (is_static) { + ggml_decoder = std::make_shared(cgraph, model_weights, is_static, true); + auto ggml_decoder_kvcache = std::make_shared(cgraph, model_weights, is_static, false); + decoder_end_time = ggml_time_us(); - auto compiled_model = core.compile_model(model, device, config); - compile_end_time = ggml_time_us(); - infer_request_cache[cgraph] = std::make_shared(compiled_model.create_infer_request()); - infer_request = *infer_request_cache[cgraph]; + auto input_model = std::make_shared(ggml_decoder); + auto input_model_kvcache = std::make_shared(ggml_decoder_kvcache); - if (getenv("GGML_OPENVINO_DUMP_IR")) { - char timestamped_filename[64]; - auto timestamp = (long long) ggml_time_us(); - snprintf(timestamped_filename, sizeof(timestamped_filename), "model_%lld.xml", timestamp); - ov::serialize(model, timestamped_filename); + model = ov::frontend::ggml::FrontEnd::convert(input_model); + ggml_decoder->clear_model_weights(); + auto model_kvcache = ov::frontend::ggml::FrontEnd::convert(input_model_kvcache); + ggml_decoder_kvcache->clear_model_weights(); + conversion_end_time = ggml_time_us(); + + if (getenv("GGML_OPENVINO_DUMP_IR")) { + char timestamped_filename[64]; + auto timestamp = (long long) ggml_time_us(); + snprintf(timestamped_filename, sizeof(timestamped_filename), "model_prefill_%lld.xml", timestamp); + ov::serialize(model, timestamped_filename); + snprintf(timestamped_filename, sizeof(timestamped_filename), "model_kvcache_%lld.xml", timestamp); + ov::serialize(model_kvcache, timestamped_filename); + } + + auto compiled_model = core.compile_model(model, device, get_npu_prefill_config()); + auto compiled_model_kvcache = core.compile_model(model_kvcache, device, get_npu_generate_config()); + compiled_model_cache[cgraph] = compiled_model_kvcache; + compile_end_time = ggml_time_us(); + + infer_request_cache[cgraph] = std::make_shared(compiled_model.create_infer_request()); + infer_request = *infer_request_cache[cgraph]; + compiled_model_cache[cgraph] = compiled_model_kvcache; + } else { + ggml_decoder = std::make_shared(cgraph, model_weights, is_static, true); + decoder_end_time = ggml_time_us(); + + auto input_model = std::make_shared(ggml_decoder); + model = ov::frontend::ggml::FrontEnd::convert(input_model); + ggml_decoder->clear_model_weights(); + conversion_end_time = ggml_time_us(); + + if (getenv("GGML_OPENVINO_DUMP_IR")) { + char timestamped_filename[64]; + auto timestamp = (long long) ggml_time_us(); + snprintf(timestamped_filename, sizeof(timestamped_filename), "model_%lld.xml", timestamp); + ov::serialize(model, timestamped_filename); + } + + auto compiled_model = core.compile_model(model, device, config); + compile_end_time = ggml_time_us(); + infer_request_cache[cgraph] = std::make_shared(compiled_model.create_infer_request()); + infer_request = *infer_request_cache[cgraph]; } - } - std::vector ov_input_names; - std::vector ov_output_names; - for (const auto& ov_param : model->get_parameters()) { - ov_input_names.push_back(ov_param->get_friendly_name()); 
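+            // Cache the model's parameter and result names the first time a
+            // cgraph is compiled, so later tokens can bind tensors by name
+            // without converting the model again.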
+ std::vector ov_input_names; + std::vector ov_output_names; + for (const auto& ov_param : model->get_parameters()) { + ov_input_names.push_back(ov_param->get_friendly_name()); + } + for (const auto& ov_output : model->get_results()) { + ov_output_names.push_back(ov_output->get_friendly_name()); + } + ov_input_names_cache[cgraph] = ov_input_names; + ov_output_names_cache[cgraph] = ov_output_names; } - for (const auto& ov_output : model->get_results()) { - ov_output_names.push_back(ov_output->get_friendly_name()); - } - ov_input_names_cache[cgraph] = ov_input_names; - ov_output_names_cache[cgraph] = ov_output_names; } auto ov_input_names = ov_input_names_cache[cgraph]; @@ -233,21 +241,30 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c GGML_UNUSED(backend); } -ov::AnyMap get_npu_config() { +ov::AnyMap get_npu_prefill_config() { ov::AnyMap config = { - {"NPU_COMPILATION_MODE_PARAMS", "compute-layers-with-higher-precision=ReduceMean" }, - {"NPU_USE_NPUW", "YES" }, - {"NPUW_DEVICES", "NPU" }, - {"NPUW_FOLD", "YES" }, - {"NPUW_HOST_GATHER", "YES" }, - {"NPUW_DQ", "YES" }, - {"NPUW_FUNCALL_ASYNC", "YES" }, - {"NPUW_WEIGHTS_BANK", "shared" }, - {"NPUW_CACHE_DIR", getenv("GGML_OPENVINO_CACHE_DIR") ? getenv("GGML_OPENVINO_CACHE_DIR") : ""}, + {"NPU_COMPILATION_MODE_PARAMS", "compute-layers-with-higher-precision=Sqrt,Power,ReduceMean,Add_RMSNorm" }, + {"NPU_COMPILER_DYNAMIC_QUANTIZATION", "YES" }, + {"NPU_USE_NPUW", "YES" }, + {"NPUW_DEVICES", "NPU" }, + {"NPUW_FOLD", "YES" }, + {"NPUW_WEIGHTS_BANK", "shared" }, + {"NPUW_SLICE_OUT", "YES" }, + {"NPUW_FUNCALL_ASYNC", "YES" }, + {"NPUW_FUNCALL_FOR_ALL", "YES" }, + {"NPUW_DQ", "YES" }, + {"NPUW_DQ_FULL", "NO" }, + {"NPUW_CACHE_DIR", getenv("GGML_OPENVINO_CACHE_DIR") ? getenv("GGML_OPENVINO_CACHE_DIR") : ""}, }; return config; } +ov::AnyMap get_npu_generate_config() { + ov::AnyMap config = get_npu_prefill_config(); + config.emplace("NPUW_UNFOLD_IREQS", "YES"); + return config; +} + bool is_naive(struct ggml_cgraph* cgraph) { constexpr int naive_graph_size_threshold = 20; return cgraph->n_nodes < naive_graph_size_threshold; @@ -257,9 +274,12 @@ enum ggml_status naive_compute(struct ggml_cgraph* cgraph, ov::Core& core, const std::string& device, const ov::AnyMap& config) { - if (cgraph->n_nodes == 1 && cgraph->nodes[0]->op == GGML_OP_NONE) { + if (cgraph->n_nodes == 1 && (cgraph->nodes[0]->op == GGML_OP_NONE || cgraph->nodes[0]->op == GGML_OP_VIEW)) { return GGML_STATUS_SUCCESS; } + if (cgraph->nodes[0]->op == GGML_OP_FLASH_ATTN_EXT) { + return GGML_STATUS_FAILED; + } auto decoder = std::make_shared(cgraph); auto input_model = std::make_shared(decoder); diff --git a/ggml/src/ggml-openvino/utils.h b/ggml/src/ggml-openvino/utils.h index 0d71963f53..f377fe9d27 100644 --- a/ggml/src/ggml-openvino/utils.h +++ b/ggml/src/ggml-openvino/utils.h @@ -40,7 +40,8 @@ void set_zero_diagonal(std::vector& matrix, size_t dim); bool is_prefill(struct ggml_cgraph * cgraph); -ov::AnyMap get_npu_config(); +ov::AnyMap get_npu_prefill_config(); +ov::AnyMap get_npu_generate_config(); ov::Tensor get_ov_input_tensor(std::shared_ptr ggml_decoder, const std::string& param_name); diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index e556a7773b..efb51d23c5 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -228,7 +228,9 @@ if (NOT LLAMA_SANITIZE_ADDRESS AND NOT GGML_SCHED_NO_REALLOC) llama_build_and_test(test-opt.cpp) endif() llama_build_and_test(test-gguf.cpp) -llama_build_and_test(test-backend-ops.cpp) +if (NOT GGML_OPENVINO) 
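+# Skipped with the OpenVINO backend: test-backend-ops currently crashes in the
+# flash-attn cases (see the PATCH 116 commit message).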
+ llama_build_and_test(test-backend-ops.cpp) +endif() llama_build_and_test(test-model-load-cancel.cpp LABEL "model") llama_build_and_test(test-autorelease.cpp LABEL "model") From 56d596775dc3171d019bb9ec7abe9c53d3a01c50 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Fri, 5 Sep 2025 16:41:15 +0800 Subject: [PATCH 117/254] Change openvino device_type to GPU; Enable flash_attn --- ggml/src/ggml-openvino/ggml-decoder.cpp | 7 +++ ggml/src/ggml-openvino/ggml-openvino.cpp | 9 +-- .../openvino/op/flash_attn_ext.cpp | 56 ++++++++++++++++++- ggml/src/ggml-openvino/openvino/op/mulmat.cpp | 32 +++++------ .../src/ggml-openvino/openvino/op/softmax.cpp | 18 +++--- .../openvino/translate_session.cpp | 12 ++++ 6 files changed, 104 insertions(+), 30 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 0ee2338199..0fd64c685f 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -299,6 +299,13 @@ void GgmlOvDecoder::add_extra_inputs() { attention_size = mask->ne[0]; break; } + if (node->op == GGML_OP_FLASH_ATTN_EXT) { + auto* mask = node->src[3]; + if (std::string(mask->name).find("KQ_mask") != 0) { + throw std::runtime_error("Unexpected flash attention node: " + std::string(mask->name)); + } + attention_size = mask->ne[0]; + } } { diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp index e3eaf40254..ed612a2466 100644 --- a/ggml/src/ggml-openvino/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino.cpp @@ -173,14 +173,15 @@ static void ggml_backend_openvino_device_get_memory(ggml_backend_dev_t dev, size GGML_ASSERT(free != nullptr); GGML_ASSERT(total != nullptr); ggml_backend_openvino_device_context * ctx = (ggml_backend_openvino_device_context *)dev->context; - // Placeholder GGML_ASSERT(ctx->device >= 0); // ggml_openvino_set_device(ctx->device); + *total = 1; + *free = 1; } static enum ggml_backend_dev_type ggml_backend_openvino_device_get_type(ggml_backend_dev_t dev) { GGML_UNUSED(dev); - return GGML_BACKEND_DEVICE_TYPE_ACCEL; + return GGML_BACKEND_DEVICE_TYPE_GPU; } static void ggml_backend_openvino_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) { @@ -293,7 +294,7 @@ static bool is_op_unsupported_case(const ggml_tensor* op) { GGML_LOG_WARN("OpenVINO backend does not support ROPE with mode %d\n", mode); return true; } - if (n_dims != op->src[0]->ne[0]) { + if (n_dims != 0.0f && n_dims != op->src[0]->ne[0]) { GGML_LOG_WARN("OpenVINO backend does not support ROPE with n_dims %d != src[0]->ne[0] %ld\n", n_dims, op->src[0]->ne[0]); @@ -305,7 +306,7 @@ static bool is_op_unsupported_case(const ggml_tensor* op) { } float freq_scale; memcpy(&freq_scale, op_params + 6, sizeof(float)); - if (freq_scale != 1.0f) { + if (freq_scale != 0.0f && freq_scale != 1.0f) { GGML_LOG_WARN("OpenVINO backend does not support ROPE with freq_scale %f != 1.0f\n", freq_scale); return true; } diff --git a/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp b/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp index 5c0ad4c20e..d97603d98a 100644 --- a/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp +++ b/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp @@ -1,6 +1,12 @@ #include +#include +#include #include +#include #include +#include +#include + #include "../node_context.hpp" #include "../op_table.hpp" #include "../utils.hpp" @@ -24,9 +30,53 @@ OutputVector translate_flash_attn_ext(const NodeContext& context) { auto q = 
std::make_shared(q_f32, ov::element::f16); auto scale_node = std::make_shared(ov::element::f16, ov::Shape{}, std::vector{scale}); - auto res = std::make_shared(q, k, v , mask, scale_node, false); - auto res_f32 = std::make_shared(res, ov::element::f32); - return rename_outputs_with_suffix({res_f32}, context.get_name()); + + ov::Output mask_sliced; + if (context.has_input("KQ_mask_sliced")) { + mask_sliced = context.get_input("KQ_mask_sliced"); + } else { + auto token_len = get_dimensions(q, {1}); + auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); + auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); + mask_sliced = std::make_shared(mask, zero, token_len, one, one); + } + + if (mask_sliced.get_element_type() != ov::element::f16) { + mask_sliced = std::make_shared(mask_sliced, ov::element::f16); + } + + auto tile_kv = [](int64_t q_batch, int64_t kv_batch, ov::Output kv) { + int64_t factor = q_batch / kv_batch; + if (factor > 1) { + auto q_batch_node = ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector{q_batch}); + auto kv_batch_node = ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector{kv_batch}); + auto factor_node = ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector{factor}); + + auto unsqueeze_axes = ov::op::v0::Constant::create(ov::element::i64, Shape{}, {1}); + auto kv_unsqueezed = std::make_shared(kv, unsqueeze_axes); + + auto kv_last_two_dims = get_dimensions(kv.get_node_shared_ptr(), {1, 2}); + auto kv_broadcast_shape = + std::make_shared(ov::OutputVector{kv_batch_node, factor_node, kv_last_two_dims}, 0); + kv = std::make_shared(kv_unsqueezed, kv_broadcast_shape); + + auto new_kv_shape = + std::make_shared(ov::OutputVector{q_batch_node, kv_last_two_dims}, 0); + kv = std::make_shared(kv, new_kv_shape, false); + } + return kv; + }; + + auto q_shape = context.get_input_shape(0).to_shape(); + auto k_shape = context.get_input_shape(1).to_shape(); + k = tile_kv(q_shape[0], k_shape[0], k); + v = tile_kv(q_shape[0], k_shape[0], v); + + auto sdpa = std::make_shared(q, k, v, mask_sliced, scale_node, false); + auto sdpa_f32 = std::make_shared(sdpa, ov::element::f32); + auto res = std::make_shared(sdpa_f32, + ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 0, 2})); + return rename_outputs_with_suffix({res}, context.get_name()); } } // namespace op diff --git a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp index 150fbcbb88..bfccc28163 100644 --- a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp +++ b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp @@ -62,7 +62,7 @@ OutputVector translate_mulmat(const NodeContext& context) { auto B_batch_node = ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector{B_batch}); auto factor_node = ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector{factor}); - auto Z_last_two_dim = get_dimensions(Z.get_node_shared_ptr(), {1, 2}); + auto Z_last_two_dims = get_dimensions(Z.get_node_shared_ptr(), {1, 2}); auto unsqueeze_axes = ov::op::v0::Constant::create(ov::element::i64, Shape{}, {1}); auto Z_unsqueezed = std::make_shared(Z, unsqueeze_axes); @@ -70,26 +70,26 @@ OutputVector translate_mulmat(const NodeContext& context) { Output batch_small = A_batch_larger ? B_batch_node : A_batch_node; Output batch_large = A_batch_larger ? 
A_batch_node : B_batch_node; auto broadcast_shape = - std::make_shared(ov::OutputVector{batch_small, factor_node, Z_last_two_dim}, 0); + std::make_shared(ov::OutputVector{batch_small, factor_node, Z_last_two_dims}, 0); auto Z_broadcasted = std::make_shared(Z_unsqueezed, broadcast_shape); - auto new_Z_shape = std::make_shared(ov::OutputVector{batch_large, Z_last_two_dim}, 0); + auto new_Z_shape = std::make_shared(ov::OutputVector{batch_large, Z_last_two_dims}, 0); Z = std::make_shared(Z_broadcasted, new_Z_shape, false); - } - if (A_batch_larger) { - B = Z; - } else { - A = Z; - } + } + if (A_batch_larger) { + B = Z; + } else { + A = Z; + } - if (convert_out_type) { - auto result_lp = std::make_shared(A, B, false, transpose_b); - res = std::make_shared(result_lp, context.get_output_type(0)); - } else { - res = std::make_shared(A, B, false, transpose_b); - } + if (convert_out_type) { + auto result_lp = std::make_shared(A, B, false, transpose_b); + res = std::make_shared(result_lp, context.get_output_type(0)); + } else { + res = std::make_shared(A, B, false, transpose_b); + } - return rename_outputs_with_suffix({res}, context.get_name()); + return rename_outputs_with_suffix({res}, context.get_name()); } } // namespace op diff --git a/ggml/src/ggml-openvino/openvino/op/softmax.cpp b/ggml/src/ggml-openvino/openvino/op/softmax.cpp index e072658ecb..1aa3bf76a0 100644 --- a/ggml/src/ggml-openvino/openvino/op/softmax.cpp +++ b/ggml/src/ggml-openvino/openvino/op/softmax.cpp @@ -51,14 +51,18 @@ OutputVector translate_soft_max(const NodeContext& context) { return rename_outputs_with_suffix({res}, context.get_name()); } - auto mask_node = context.get_input(1); + ov::Output mask_node_sliced; + if (context.has_input("KQ_mask_sliced")) { + mask_node_sliced = context.get_input("KQ_mask_sliced"); + } else { + auto token_len = get_dimensions(input_node, {1}); + auto mask_node = context.get_input(1); + auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); + auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); + mask_node_sliced = std::make_shared(mask_node, zero, token_len, one, one); + } - auto token_len = context.has_input("token_len") ? 
context.get_input("token_len") : get_dimensions(input_node, {1}); - auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); - auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); - std::shared_ptr mask_node_sliced = - std::make_shared(mask_node, zero, token_len, one, one); - if (mask_node_sliced->get_element_type() != context.get_output_type(0)) { + if (mask_node_sliced.get_element_type() != context.get_output_type(0)) { mask_node_sliced = std::make_shared(mask_node_sliced, context.get_output_type(0)); } diff --git a/ggml/src/ggml-openvino/openvino/translate_session.cpp b/ggml/src/ggml-openvino/openvino/translate_session.cpp index a09247347f..3e27a689d5 100644 --- a/ggml/src/ggml-openvino/openvino/translate_session.cpp +++ b/ggml/src/ggml-openvino/openvino/translate_session.cpp @@ -36,6 +36,7 @@ namespace ggml { using namespace ov::op; namespace { + ov::pass::MakeStateful::ParamResPairs get_kv_param_res_pairs( const std::shared_ptr& model, const std::map& kv_param_res_names) { ov::pass::MakeStateful::ParamResPairs pairs; @@ -76,6 +77,16 @@ void add_token_len(TensorMap& tensor_map) { tensor_map.insert({"token_len", token_len->output(0)}); } +void add_sliced_mask(TensorMap& tensor_map) { + auto mask = tensor_map.at("KQ_mask").get_node_shared_ptr(); + auto token_len = tensor_map.at("token_len").get_node_shared_ptr(); + auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); + auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); + std::shared_ptr mask_sliced = std::make_shared(mask, zero, token_len, one, one); + mask_sliced->set_friendly_name("KQ_mask_sliced"); + tensor_map.insert({"KQ_mask_sliced", mask_sliced->output(0)}); +} + void add_rope_sin_cos(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) { int32_t* rope_params = ggml_model_decoder.get_rope_params(); auto inp_pos = tensor_map.at("inp_pos").get_node_shared_ptr(); @@ -97,6 +108,7 @@ void add_rope_sin_cos(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) { // Create common patterns void preprocess(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) { add_token_len(tensor_map); + add_sliced_mask(tensor_map); add_rope_sin_cos(tensor_map, ggml_model_decoder); } From 3e897df51c10dfcb73c738554ca94fa7be0740a7 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Tue, 5 Aug 2025 19:51:01 +0800 Subject: [PATCH 118/254] Update supports_buft and supports_op for quantized models --- ggml/src/ggml-openvino/ggml-openvino.cpp | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp index ed612a2466..f81b1ee483 100644 --- a/ggml/src/ggml-openvino/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino.cpp @@ -8,6 +8,7 @@ #include #include "ggml-backend-impl.h" +#include "ggml-backend.h" #include "ggml-impl.h" #include "ggml-openvino/utils.h" #include "ggml.h" @@ -332,8 +333,16 @@ static bool is_op_unsupported_case(const ggml_tensor* op) { static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor* op) { GGML_ASSERT(dev->reg != nullptr); - static const std::set supported_types{ - GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_BF16, GGML_TYPE_I64, GGML_TYPE_I32}; + static const std::set supported_types{GGML_TYPE_F32, + GGML_TYPE_F16, + GGML_TYPE_BF16, + GGML_TYPE_I64, + GGML_TYPE_I32, + GGML_TYPE_Q4_0, + GGML_TYPE_Q4_1, + GGML_TYPE_Q4_K, + GGML_TYPE_Q8_0, + GGML_TYPE_Q6_K}; static const std::set supported_ops{GGML_OP_NONE, GGML_OP_ADD, @@ 
-411,7 +420,8 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con
 }
 
 static bool ggml_backend_openvino_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
-    return ggml_backend_buft_is_host(buft);
+    // TODO quantized weights are cpu_repack_buffer_type which does not implement ggml_backend_buft_is_host
+    return ggml_backend_buft_is_host(buft) || strcmp(buft->device->iface.get_name(buft->device), "CPU") == 0;
 
     GGML_UNUSED(dev);
 }
 
From d4ca760da8019c62bdeb913fd3f9746666988079 Mon Sep 17 00:00:00 2001
From: "Yu, Zijun" 
Date: Tue, 5 Aug 2025 20:56:50 +0800
Subject: [PATCH 119/254] Add quant weight conversion functions from genai gguf
 reader

---
 ggml/src/ggml-openvino/ggml-decoder.cpp |  76 +++++-
 ggml/src/ggml-openvino/ggml-quant.cpp   | 313 ++++++++++++++++++++++++
 ggml/src/ggml-openvino/ggml-quant.hpp   |  44 ++++
 3 files changed, 429 insertions(+), 4 deletions(-)
 create mode 100644 ggml/src/ggml-openvino/ggml-quant.cpp
 create mode 100644 ggml/src/ggml-openvino/ggml-quant.hpp

diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp
index 0fd64c685f..c2e164b808 100644
--- a/ggml/src/ggml-openvino/ggml-decoder.cpp
+++ b/ggml/src/ggml-openvino/ggml-decoder.cpp
@@ -20,6 +20,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -29,6 +30,7 @@
 
 #include "ggml-backend-impl.h"
 #include "ggml-backend.h"
+#include "ggml-quant.hpp"
 
 GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgraph, bool is_static, bool is_first_token,
                              int context_size, int num_heads, int num_heads_kv, int head_size) :
@@ -402,12 +404,78 @@ std::map> GgmlOvDecoder::create_weight_no
 }
 
 std::shared_ptr GgmlOvDecoder::create_weight_node(ggml_tensor* tensor) {
+    std::set weight_types = {
+        GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1, GGML_TYPE_Q4_K, GGML_TYPE_Q6_K};
+    if (weight_types.find(tensor->type) == weight_types.end()) {
+        throw std::runtime_error("Unexpected weight tensor type: " + std::string(tensor->name) + " with type " +
+                                 ggml_type_name(tensor->type));
+    }
+
     auto node_type = get_ov_type(tensor);
     auto node_shape = get_shape(tensor);
     auto ne_total = ggml_nelements(tensor);
-    ov::Tensor weights(node_type, node_shape);
-    memcpy(weights.data(), tensor->data, ne_total * node_type.size());
-    return std::make_shared(weights);
+
+    if (node_type != ov::element::dynamic) {
+        ov::Tensor weights(node_type, node_shape);
+        memcpy(weights.data(), tensor->data, ne_total * node_type.size());
+        std::shared_ptr weight_node = std::make_shared(weights);
+        if (node_type == ov::element::f16) {
+            weight_node = std::make_shared(weight_node, ov::element::f32);
+        }
+        weight_node->set_friendly_name(tensor->name);
+        return weight_node;
+    }
+
+    uint64_t weights_per_byte;
+    if (tensor->type == GGML_TYPE_Q4_0 || tensor->type == GGML_TYPE_Q4_1 || tensor->type == GGML_TYPE_Q4_K) {
+        weights_per_byte = 2;
+    } else {  // tensor->type == GGML_TYPE_Q8_0 || tensor->type == GGML_TYPE_Q6_K
+        weights_per_byte = 1;
+    }
+
+    uint64_t weights_per_block;
+    // only the sub-block size matters here: q6_k uses 16, q4_k uses 32
+    if (tensor->type == GGML_TYPE_Q6_K) {
+        weights_per_block = 16;
+    } else {
+        weights_per_block = 32;
+    }
+
+    OPENVINO_ASSERT(node_shape.back() % weights_per_block == 0,
+                    "[load_gguf] tensor ",
+                    tensor->name,
+                    " has incompatible last dim shape: ",
+                    node_shape.back());
+
+    auto weights_shape = node_shape;
+    weights_shape.back() /= (weights_per_byte * 4);  // one u32 stores 8 q4 or 4 q8 weights
+
+    
ov::Tensor weights(ov::element::u32, weights_shape); + // For scales and bias + node_shape[node_shape.size() - 1] = node_shape[node_shape.size() - 1] / weights_per_block; + + ov::Tensor scales(ov::element::f16, node_shape); + ov::Tensor biases(ov::element::f16, node_shape); + ov::Output weight_node; + if (tensor->type == GGML_TYPE_Q4_0) { + extract_q4_0_data(tensor, weights, scales, biases); + weight_node = make_int8_weights(weights, scales, biases, weights_per_block); + } else if (tensor->type == GGML_TYPE_Q4_1) { + extract_q4_1_data(tensor, weights, scales, biases); + weight_node = make_int4_weights(weights, scales, biases, weights_per_block); + } else if (tensor->type == GGML_TYPE_Q8_0) { + extract_q8_0_data(tensor, weights, scales, biases); + weight_node = make_int8_weights(weights, scales, biases, weights_per_block); + } else if (tensor->type == GGML_TYPE_Q6_K) { + // due to WA #2135, this case will not be used, extract_q6_k_data temporarily disabled. + extract_q6_k_data(tensor, weights, scales, biases); + weight_node = make_int8_weights(weights, scales, biases, weights_per_block); + } else if (tensor->type == GGML_TYPE_Q4_K) { + extract_q4_k_data(tensor, weights, scales, biases); + weight_node = make_int4_weights(weights, scales, biases, weights_per_block); + } + weight_node.get_node_shared_ptr()->set_friendly_name(tensor->name); + return weight_node.get_node_shared_ptr(); } void GgmlOvDecoder::dump_cgraph(const struct ggml_cgraph* cgraph, std::string& filename) { @@ -537,7 +605,7 @@ ov::element::Type GgmlOvDecoder::get_ov_type(const ggml_tensor* tensor) { case GGML_TYPE_I64: return ov::element::i64; default: - throw std::runtime_error("Unsupported tensor type"); + return ov::element::dynamic; } } diff --git a/ggml/src/ggml-openvino/ggml-quant.cpp b/ggml/src/ggml-openvino/ggml-quant.cpp new file mode 100644 index 0000000000..4311ab138e --- /dev/null +++ b/ggml/src/ggml-openvino/ggml-quant.cpp @@ -0,0 +1,313 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ggml.h" + +void unpack_32_4(const uint8_t* data, uint8_t* dst) { + std::fill_n(dst, 16, 0); + for (int j = 0; j < 16; ++j) { + uint8_t x = (data[j + 2] & 0x0F); // j+2 to skip scale bytes. + uint8_t y = (data[j + 2] >> 4); + if (j % 2 != 0) { + x <<= 4; + y <<= 4; + } + dst[j / 2] |= x; + dst[8 + j / 2] |= y; // Last 16 weights are in the higher bits + } +} + +// Extracts (weight, scales, biases) from Q4_0 tensors. +// Data layout is: |16 bit scale|32 x 4bit weights|. +void extract_q4_0_data(const ggml_tensor* tensor, + ov::Tensor& weights_arr, + ov::Tensor& scales_arr, + ov::Tensor& biases_arr) { + const uint64_t bytes_per_block = 18; // 2 bytes scale, 32x0.5 byte weights + auto data = static_cast(tensor->data); + auto weights = static_cast(weights_arr.data()); + auto scales = scales_arr.data::value_type>(); + auto biases = biases_arr.data::value_type>(); + + ov::parallel_for(scales_arr.get_size(), [&](size_t i) { + scales[i] = ov::float16::from_bits(*((uint16_t*)(data + i * bytes_per_block))); + biases[i] = ov::float16(-8.f * static_cast(scales[i])); + unpack_32_4(data + i * bytes_per_block, weights + i * 16); + }); +} + +// Extracts (weight, scales, biases) from Q4_1 tensors. +// Data layout is: |16 bit scale|16 bit bias|32 x 4bit weights|. 
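+// Unlike Q4_0, where the bias is derived from the scale (-8 * scale), Q4_1
+// stores an explicit 16-bit bias per block.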
+void extract_q4_1_data(const ggml_tensor* tensor, + ov::Tensor& weights_arr, + ov::Tensor& scales_arr, + ov::Tensor& biases_arr) { + const uint64_t bytes_per_block = 20; // 2 bytes scale, 2 bytes bias, 32x0.5 byte weights + auto data = static_cast(tensor->data); + auto weights = static_cast(weights_arr.data()); + auto scales = scales_arr.data::value_type>(); + auto biases = biases_arr.data::value_type>(); + ov::parallel_for(scales_arr.get_size(), [&](size_t i) { + scales[i] = ov::float16::from_bits(*((uint16_t*)(data + i * bytes_per_block))); + biases[i] = ov::float16::from_bits(*((uint16_t*)(data + i * bytes_per_block + 1))); + unpack_32_4(data + i * bytes_per_block, weights + i * 16); + }); +} + +// Extracts (weight, scales, biases) from Q8_0 tensors. +// Data layout is: |16 bit scale|32 x 8bit weights|. +void extract_q8_0_data(const ggml_tensor* tensor, + ov::Tensor& weights_arr, + ov::Tensor& scales_arr, + ov::Tensor& biases_arr) { + const uint64_t weights_per_block = 32; + const uint64_t bytes_per_block = 34; // 2 bytes scale, 32x1 byte weights + auto data = static_cast(tensor->data); + auto weights = static_cast(weights_arr.data()); + auto scales = scales_arr.data::value_type>(); + auto biases = biases_arr.data::value_type>(); + for (int64_t i = 0; i < scales_arr.get_size(); i++) { + uint8_t* block_data = data + i * bytes_per_block; + scales[i] = ov::float16::from_bits(*(uint16_t*)block_data); + biases[i] = ov::float16(-128.f * static_cast(scales[i])); + for (int64_t j = 0; j < weights_per_block; ++j) { + uint8_t x = block_data[j + 2]; // j+2 to skip the scale bytes. + // Original data is in int8_t, so we add a bias of -128 and invert the + // first bit. + x ^= 1 << 7; + weights[i * weights_per_block + j] = x; + } + } +} + +void unpack_256_4(const uint8_t* data, uint8_t* dst) { + // Initialize the output array with zeros + std::fill_n(dst, 128, 0); + + for (size_t i = 0; i < 4; ++i) { + for (int j = 0; j < 32; ++j) { + uint8_t x = (data[i * 32 + j] & 0x0F); + uint8_t y = (data[i * 32 + j] >> 4); + if (j % 2 != 0) { + x <<= 4; + y <<= 4; + } + dst[i * 32 + j / 2] |= x; + dst[i * 32 + 16 + j / 2] |= y; // Last 16 weights are in the higher bits + } + } +} + +void extract_q4_k_data(const ggml_tensor* tensor, + ov::Tensor& weights_arr, + ov::Tensor& scales_arr, + ov::Tensor& biases_arr) { + const uint64_t bytes_per_block = 2 + 2 + 12 + 128; + // TODO tensor->nb[3] + const uint64_t n_super_block = tensor->nb[3] / bytes_per_block; + auto data = static_cast(tensor->data); + auto weights = static_cast(weights_arr.data()); + auto scales = scales_arr.data::value_type>(); + auto biases = biases_arr.data::value_type>(); + + ov::parallel_for(n_super_block, [&](size_t i) { + uint8_t* block_data = data + i * bytes_per_block; + + // Extract scale factors and offsets + float scale_scales = static_cast(ov::float16::from_bits(*((uint16_t*)block_data))); + float scale_biases = static_cast(ov::float16::from_bits(*((uint16_t*)block_data + 1))); + + // Extract qs1 and qs2 + uint8_t* qs1 = block_data + 4; + uint8_t* qs2 = block_data + 16; + + scales[i * 8] = ov::float16(scale_scales * static_cast((*(qs1) & 0b111111))); + scales[i * 8 + 1] = ov::float16(scale_scales * static_cast((*(qs1 + 1) & 0b111111))); + scales[i * 8 + 2] = ov::float16(scale_scales * static_cast((*(qs1 + 2) & 0b111111))); + scales[i * 8 + 3] = ov::float16(scale_scales * static_cast((*(qs1 + 3) & 0b111111))); + scales[i * 8 + 4] = + ov::float16(scale_scales * static_cast((*(qs1 + 8) & 0b00001111) | ((*(qs1) >> 6) << 4))); + scales[i * 8 + 
5] = + ov::float16(scale_scales * static_cast((*(qs1 + 9) & 0b00001111) | ((*(qs1 + 1) >> 6) << 4))); + scales[i * 8 + 6] = + ov::float16(scale_scales * static_cast((*(qs1 + 10) & 0b00001111) | ((*(qs1 + 2) >> 6) << 4))); + scales[i * 8 + 7] = + ov::float16(scale_scales * static_cast((*(qs1 + 11) & 0b00001111) | ((*(qs1 + 3) >> 6) << 4))); + + biases[i * 8] = ov::float16(-1.f * scale_biases * static_cast((*(qs1 + 4) & 0b111111))); + biases[i * 8 + 1] = ov::float16(-1.f * scale_biases * static_cast((*(qs1 + 5) & 0b111111))); + biases[i * 8 + 2] = ov::float16(-1.f * scale_biases * static_cast((*(qs1 + 6) & 0b111111))); + biases[i * 8 + 3] = ov::float16(-1.f * scale_biases * static_cast((*(qs1 + 7) & 0b111111))); + biases[i * 8 + 4] = + ov::float16(-1.f * scale_biases * static_cast((*(qs1 + 8) >> 4) | ((*(qs1 + 4) >> 6) << 4))); + biases[i * 8 + 5] = + ov::float16(-1.f * scale_biases * static_cast((*(qs1 + 9) >> 4) | ((*(qs1 + 5) >> 6) << 4))); + biases[i * 8 + 6] = + ov::float16(-1.f * scale_biases * static_cast((*(qs1 + 10) >> 4) | ((*(qs1 + 6) >> 6) << 4))); + biases[i * 8 + 7] = + ov::float16(-1.f * scale_biases * static_cast((*(qs1 + 11) >> 4) | ((*(qs1 + 7) >> 6) << 4))); + unpack_256_4(block_data + 16, weights + i * 128); + }); +} + +void extract_q6_k_data(const ggml_tensor* tensor, + ov::Tensor& weights_arr, + ov::Tensor& scales_arr, + ov::Tensor& biases_arr) { + const uint64_t bytes_per_block = 128 + 64 + 16 + 2; + const uint64_t n_super_block = tensor->nb[3] / bytes_per_block; + auto data = static_cast(tensor->data); + auto weights = static_cast(weights_arr.data()); + auto scales = scales_arr.data::value_type>(); + auto biases = biases_arr.data::value_type>(); + // std::string name(tensor.name, tensor.namelen); + for (int64_t i = 0; i < n_super_block; i++) { + uint8_t* block_data = data + i * bytes_per_block; + + float scale_factor = + static_cast(ov::float16::from_bits(*((uint16_t*)block_data + 104))); // (128+64+16)/2 + + for (size_t j = 0; j < 16; j++) { + scales[j + i * 16] = + ov::float16(scale_factor * static_cast(*((int8_t*)(block_data + 128 + 64 + j)))); + biases[j + i * 16] = ov::float16(-32.f * static_cast(scales[j + i * 16])); + } + + // Extract ql and qh + uint8_t* ql = block_data; + uint8_t* qh = block_data + 128; + + // Extract weights + for (int64_t j = 0; j < 32; ++j) { + weights[i * 256 + j] = (ql[j] & 0xF) | (((qh[j] >> 0) & 3) << 4); + weights[i * 256 + j + 32] = (ql[32 + j] & 0xF) | (((qh[j] >> 2) & 3) << 4); + weights[i * 256 + j + 64] = (ql[j] >> 4) | (((qh[j] >> 4) & 3) << 4); + weights[i * 256 + j + 96] = (ql[32 + j] >> 4) | (((qh[j] >> 6) & 3) << 4); + weights[i * 256 + j + 128] = (ql[64 + j] & 0xF) | (((qh[32 + j] >> 0) & 3) << 4); + weights[i * 256 + j + 160] = (ql[96 + j] & 0xF) | (((qh[32 + j] >> 2) & 3) << 4); + weights[i * 256 + j + 192] = (ql[64 + j] >> 4) | (((qh[32 + j] >> 4) & 3) << 4); + weights[i * 256 + j + 224] = (ql[96 + j] >> 4) | (((qh[32 + j] >> 6) & 3) << 4); + } + } +} + +// TODO Reorder for make_intX_weights + +ov::Output make_int8_weights(ov::Tensor& weight, ov::Tensor& scales, ov::Tensor& biases, size_t group_size) { + + // Reshape weight to (num_heads, -1, group_size) + ov::Shape orig_shape = weight.get_shape(); + orig_shape[1] *= sizeof(uint32_t) / sizeof(uint8_t); + size_t num_groups = orig_shape[1] / group_size; + + // Expand dimensions for scales and biases + auto scale_shape = scales.get_shape(); + scale_shape.push_back(1); + scales.set_shape(scale_shape); + biases.set_shape(scale_shape); + + // Create graph nodes + auto 
+// TODO Reorder for make_intX_weights
+
+ov::Output<ov::Node> make_int8_weights(ov::Tensor& weight, ov::Tensor& scales, ov::Tensor& biases, size_t group_size) {
+
+    // Reshape weight to (num_heads, -1, group_size)
+    ov::Shape orig_shape = weight.get_shape();
+    orig_shape[1] *= sizeof(uint32_t) / sizeof(uint8_t);
+    size_t num_groups = orig_shape[1] / group_size;
+
+    // Expand dimensions for scales and biases
+    auto scale_shape = scales.get_shape();
+    scale_shape.push_back(1);
+    scales.set_shape(scale_shape);
+    biases.set_shape(scale_shape);
+
+    // Create graph nodes
+    auto weights_node = std::make_shared<ov::op::v0::Constant>(ov::element::u8, ov::Shape{orig_shape[0], num_groups, group_size}, static_cast<uint8_t*>(weight.data()), nullptr);
+    weights_node->get_rt_info()["__gguf_tensor_holder"] = weight;
+    auto scales_f16 = std::make_shared<ov::op::v0::Constant>(scales);
+    ov::Tensor biases_u8(ov::element::u8, scale_shape);
+
+    // Calculate zero point
+    const ov::float16* bias_data = biases.data<ov::element_type_traits<ov::element::f16>::value_type>();
+    const ov::float16* scale_data = scales.data<ov::element_type_traits<ov::element::f16>::value_type>();
+    uint8_t* bias_u8_data = biases_u8.data<uint8_t>();
+    for (size_t i = 0; i < biases_u8.get_size(); ++i) {
+        bias_u8_data[i] = (uint8_t)std::round(-1.f * static_cast<float>(bias_data[i]) / static_cast<float>(scale_data[i]));
+    }
+
+    auto zero_point = std::make_shared<ov::op::v0::Constant>(biases_u8);
+
+    // Quantization operations
+    auto weights_f16 = std::make_shared<ov::op::v0::Convert>(weights_node, ov::element::f16);
+    auto zero_point_f16 = std::make_shared<ov::op::v0::Convert>(zero_point, ov::element::f16);
+
+    auto w_zp = std::make_shared<ov::op::v1::Subtract>(
+        weights_f16, zero_point_f16, ov::op::AutoBroadcastType::NUMPY
+    );
+    auto w_zp_s = std::make_shared<ov::op::v1::Multiply>(
+        w_zp, scales_f16, ov::op::AutoBroadcastType::NUMPY
+    );
+
+    // Reshape back to original dimensions
+    auto final_shape = std::make_shared<ov::op::v0::Constant>(
+        ov::element::i64, ov::Shape{orig_shape.size()}, orig_shape
+    );
+    auto w_zp_s_r = std::make_shared<ov::op::v1::Reshape>(
+        w_zp_s, final_shape, false
+    );
+
+    return std::make_shared<ov::op::v0::Convert>(w_zp_s_r, ov::element::f32);
+}
+
+ov::Output<ov::Node> make_int4_weights(ov::Tensor& weight, ov::Tensor& scales, ov::Tensor& biases, size_t group_size) {
+
+    // Convert weight to uint8 view and adjust shape
+    ov::Shape orig_weight_shape = weight.get_shape();
+    orig_weight_shape[1] *= sizeof(uint32_t) / sizeof(uint8_t) * 2;  // Double number of columns for 4-bit representation
+
+    // Expand dimensions for scales and biases
+    ov::Shape scale_bias_shape = scales.get_shape();
+    scale_bias_shape.push_back(1);  // Add new axis at the end
+    scales.set_shape(scale_bias_shape);
+    biases.set_shape(scale_bias_shape);
+
+    // Create INT4 weight tensor
+    ov::Shape packed_shape = {
+        orig_weight_shape[0],
+        orig_weight_shape[1] / group_size,
+        group_size
+    };
+
+    auto weights_node = std::make_shared<ov::op::v0::Constant>(ov::element::u4, packed_shape, static_cast<uint8_t*>(weight.data()), nullptr);
+    weights_node->get_rt_info()["__gguf_tensor_holde"] = weight;
+    auto weights_f16 = std::make_shared<ov::op::v0::Convert>(weights_node, ov::element::f16);
+
+    // Pack zero points: two subsequent values into one
+    const ov::float16* bias_data = biases.data<ov::element_type_traits<ov::element::f16>::value_type>();
+    const ov::float16* scale_data = scales.data<ov::element_type_traits<ov::element::f16>::value_type>();
+    ov::Tensor zero_point_tensor(ov::element::u4, scale_bias_shape);
+    uint8_t* zero_point_data = static_cast<uint8_t*>(zero_point_tensor.data());
+    for (size_t i = 0; i < zero_point_tensor.get_byte_size(); ++i) {
+        uint8_t bias1 = (uint8_t)std::round(-1.f * static_cast<float>(bias_data[i * 2]) / static_cast<float>(scale_data[i * 2]));
+        uint8_t bias2 = (uint8_t)std::round(-1.f * static_cast<float>(bias_data[i * 2 + 1]) / static_cast<float>(scale_data[i * 2 + 1]));
+        zero_point_data[i] = (bias2 << 4) | (bias1 & 0x0F);
+    }
+
+    // CVS-166438: GGUF Q4_0 zp array (U4) with all same value (8) will be converted to single U4 scalar via ConvertU4WeightsZeroPointToScalar transformation.
+    // This corner case can be handled by CPU plugin properly, but will trigger compilation error on GPU plugin.
+    // Temporal WA by adding one small bias to keep zp array shape for GPU plugin, confirm no accuracy impact for final LLM generation results.
+ zero_point_data[0] += 1; + + auto zero_points_node = std::make_shared(zero_point_tensor); + auto zero_points_f16 = std::make_shared(zero_points_node, ov::element::f16); + + auto scales_f16 = std::make_shared(scales); + + // Perform dequantization + auto w_zp = std::make_shared( + weights_f16, zero_points_f16, ov::op::AutoBroadcastType::NUMPY); + + auto w_zp_s = std::make_shared( + w_zp, scales_f16, ov::op::AutoBroadcastType::NUMPY); + + // Reshape back to original shape + auto final_shape = std::make_shared( + ov::element::i64, ov::Shape{orig_weight_shape.size()}, orig_weight_shape); + + auto w_zp_s_r = std::make_shared( + w_zp_s, final_shape, false); + + return std::make_shared(w_zp_s_r, ov::element::f32); +} diff --git a/ggml/src/ggml-openvino/ggml-quant.hpp b/ggml/src/ggml-openvino/ggml-quant.hpp new file mode 100644 index 0000000000..9c0dd89a95 --- /dev/null +++ b/ggml/src/ggml-openvino/ggml-quant.hpp @@ -0,0 +1,44 @@ +#include +#include +#include "ggml.h" + +void unpack_32_4(const uint8_t* data, uint8_t* dst); + +void extract_q4_0_data(const ggml_tensor* tensor, + ov::Tensor& weights_arr, + ov::Tensor& scales_arr, + ov::Tensor& biases_arr); + +void extract_q4_1_data(const ggml_tensor* tensor, + ov::Tensor& weights_arr, + ov::Tensor& scales_arr, + ov::Tensor& biases_arr); + +void extract_q8_0_data(const ggml_tensor* tensor, + ov::Tensor& weights_arr, + ov::Tensor& scales_arr, + ov::Tensor& biases_arr); + +void unpack_256_4(const uint8_t* data, uint8_t* dst); + +void extract_q4_k_data(const ggml_tensor* tensor, + ov::Tensor& weights_arr, + ov::Tensor& scales_arr, + ov::Tensor& biases_arr); + +void extract_q6_k_data(const ggml_tensor* tensor, + ov::Tensor& weights_arr, + ov::Tensor& scales_arr, + ov::Tensor& biases_arr); + +static constexpr size_t GGML_QUANTIZATION_GROUP_SIZE = 32; + +ov::Output make_int8_weights(ov::Tensor& weight, + ov::Tensor& scales, + ov::Tensor& biases, + size_t group_size = GGML_QUANTIZATION_GROUP_SIZE); + +ov::Output make_int4_weights(ov::Tensor& weight, + ov::Tensor& scales, + ov::Tensor& biases, + size_t group_size = GGML_QUANTIZATION_GROUP_SIZE); From 663a0b8cce302fcf9f56d7b5019d427ff6e60689 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Wed, 6 Aug 2025 15:54:40 +0800 Subject: [PATCH 120/254] Quant models run with accuracy issue --- ggml/src/ggml-openvino/ggml-decoder.cpp | 20 ++++++++++++++++++- ggml/src/ggml-openvino/ggml-quant.cpp | 4 +++- .../ggml-openvino/openvino/op/get_rows.cpp | 11 ++++++++-- .../openvino/translate_session.cpp | 1 - ggml/src/ggml-openvino/openvino/utils.cpp | 2 ++ 5 files changed, 33 insertions(+), 5 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index c2e164b808..a3e7059fa2 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -22,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -415,6 +417,9 @@ std::shared_ptr GgmlOvDecoder::create_weight_node(ggml_tensor* tensor) auto node_shape = get_shape(tensor); auto ne_total = ggml_nelements(tensor); + OPENVINO_ASSERT(node_shape[0] == 1, "Got 3D weights, expect all weights to be 2D: ", tensor->name); + + // F16 and F32 case if (node_type != ov::element::dynamic) { ov::Tensor weights(node_type, node_shape); memcpy(weights.data(), tensor->data, ne_total * node_type.size()); @@ -426,6 +431,9 @@ std::shared_ptr GgmlOvDecoder::create_weight_node(ggml_tensor* tensor) return 
weight_node; } + // Quantized case + node_shape.erase(node_shape.begin()); + uint64_t weights_per_byte; if (tensor->type == GGML_TYPE_Q4_0 || tensor->type == GGML_TYPE_Q4_1 || tensor->type == GGML_TYPE_Q4_K) { weights_per_byte = 2; @@ -459,7 +467,7 @@ std::shared_ptr GgmlOvDecoder::create_weight_node(ggml_tensor* tensor) ov::Output weight_node; if (tensor->type == GGML_TYPE_Q4_0) { extract_q4_0_data(tensor, weights, scales, biases); - weight_node = make_int8_weights(weights, scales, biases, weights_per_block); + weight_node = make_int4_weights(weights, scales, biases, weights_per_block); } else if (tensor->type == GGML_TYPE_Q4_1) { extract_q4_1_data(tensor, weights, scales, biases); weight_node = make_int4_weights(weights, scales, biases, weights_per_block); @@ -474,7 +482,17 @@ std::shared_ptr GgmlOvDecoder::create_weight_node(ggml_tensor* tensor) extract_q4_k_data(tensor, weights, scales, biases); weight_node = make_int4_weights(weights, scales, biases, weights_per_block); } + + OPENVINO_ASSERT(weight_node.get_shape().size() == 2, "Weight should be 2D"); + // weight_node = std::make_shared( + // weight_node, ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {0})); + weight_node.get_node_shared_ptr()->set_friendly_name(tensor->name); + // GGML_LOG_DEBUG("Created weight node: %s %s %s%s\n", + // tensor->name, + // ggml_type_name(tensor->type), + // weight_node.get_element_type().get_type_name().c_str(), + // weight_node.get_partial_shape().to_string().c_str()); return weight_node.get_node_shared_ptr(); } diff --git a/ggml/src/ggml-openvino/ggml-quant.cpp b/ggml/src/ggml-openvino/ggml-quant.cpp index 4311ab138e..14ef58a3f7 100644 --- a/ggml/src/ggml-openvino/ggml-quant.cpp +++ b/ggml/src/ggml-openvino/ggml-quant.cpp @@ -1,4 +1,7 @@ +#include "ggml-quant.hpp" + #include +#include #include #include #include @@ -6,7 +9,6 @@ #include #include #include -#include #include "ggml.h" diff --git a/ggml/src/ggml-openvino/openvino/op/get_rows.cpp b/ggml/src/ggml-openvino/openvino/op/get_rows.cpp index 36795fd43e..0de77da59f 100644 --- a/ggml/src/ggml-openvino/openvino/op/get_rows.cpp +++ b/ggml/src/ggml-openvino/openvino/op/get_rows.cpp @@ -1,4 +1,3 @@ -#include #include #include #include @@ -7,6 +6,7 @@ #include #include #include +#include #include "../node_context.hpp" #include "../op_table.hpp" @@ -31,11 +31,18 @@ OutputVector translate_get_rows(const NodeContext& context) { indices = process_view_input(context, 1); } - auto axis = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{}, {1}); + Output axis = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{}, {1}); if (indices.get_partial_shape()[1].get_length() == 1) { indices = std::make_shared(indices, ov::op::v0::Constant::create(ov::element::i64, {2}, {0, 1})); + if (data.get_partial_shape().rank() == 2) { + axis = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{}, {0}); + } res = std::make_shared(data, indices, axis); + if (data.get_partial_shape().rank() == 2) { + res = + std::make_shared(res, ov::op::v0::Constant::create(ov::element::i64, {1}, {0})); + } } else { indices = std::make_shared(indices, ov::op::v0::Constant::create(ov::element::i64, {1}, {0})); diff --git a/ggml/src/ggml-openvino/openvino/translate_session.cpp b/ggml/src/ggml-openvino/openvino/translate_session.cpp index 3e27a689d5..6280467041 100644 --- a/ggml/src/ggml-openvino/openvino/translate_session.cpp +++ b/ggml/src/ggml-openvino/openvino/translate_session.cpp @@ -212,7 +212,6 @@ std::shared_ptr 
TranslateSession::apply_transformations(std::shared_ptr(); - manager.register_pass(); if (!ggml_model_decoder->is_static()) { const auto kv_param_res_names = ggml_model_decoder->get_kv_param_res_names(); diff --git a/ggml/src/ggml-openvino/openvino/utils.cpp b/ggml/src/ggml-openvino/openvino/utils.cpp index c4197ccc3a..ef5f51ebbc 100644 --- a/ggml/src/ggml-openvino/openvino/utils.cpp +++ b/ggml/src/ggml-openvino/openvino/utils.cpp @@ -17,6 +17,8 @@ #include #include +#include "ggml-impl.h" + namespace ov { namespace frontend { namespace ggml { From 6ab76ed10aa7696cacea783b0ff17fce802e440c Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Thu, 7 Aug 2025 14:25:20 +0800 Subject: [PATCH 121/254] Fix accuracy: disable cpu_repack --- docs/build.md | 2 +- ggml/src/ggml-openvino/ggml-decoder.cpp | 4 ++++ ggml/src/ggml-openvino/ggml-openvino.cpp | 3 +-- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/docs/build.md b/docs/build.md index 9e44f18eae..e2af1b96dd 100644 --- a/docs/build.md +++ b/docs/build.md @@ -754,7 +754,7 @@ git switch dev_backend_openvino # Build with OpenVINO support source /opt/intel/openvino/setupvars.sh -cmake -B build/ReleaseOV -G Ninja -DCMAKE_BUILD_TYPE=Release -DGGML_OPENVINO=ON +cmake -B build/ReleaseOV -G Ninja -DCMAKE_BUILD_TYPE=Release -DGGML_OPENVINO=ON -DGGML_CPU_REPACK=OFF cmake --build build/ReleaseOV --config Release -j $(nproc) ``` diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index a3e7059fa2..cd897e5f68 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -432,6 +432,10 @@ std::shared_ptr GgmlOvDecoder::create_weight_node(ggml_tensor* tensor) } // Quantized case + OPENVINO_ASSERT( + tensor->extra == nullptr, + "Unsupported weight tensor: " + std::string(tensor->name) + " Possibly this is a repacked quantized weights"); + node_shape.erase(node_shape.begin()); uint64_t weights_per_byte; diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp index f81b1ee483..23a92c58ac 100644 --- a/ggml/src/ggml-openvino/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino.cpp @@ -420,8 +420,7 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con } static bool ggml_backend_openvino_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { - // TODO quantized weigts are cpu_repack_buffer_type which does not implement ggml_backend_buft_is_host - return ggml_backend_buft_is_host(buft) || strcmp(buft->device->iface.get_name(buft->device), "CPU") == 0; + return ggml_backend_buft_is_host(buft); GGML_UNUSED(dev); } From dd80b04235584263ffc4b7d96304de316f3bd2e7 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Thu, 7 Aug 2025 15:22:58 +0800 Subject: [PATCH 122/254] Fix CI; Disable test-backend-ops --- ci/run.sh | 2 +- ggml/src/ggml-openvino/ggml-decoder.cpp | 2 +- .../ggml-openvino/{ggml-quant.cpp => ggml-quants.cpp} | 10 +++++----- .../ggml-openvino/{ggml-quant.hpp => ggml-quants.hpp} | 0 4 files changed, 7 insertions(+), 7 deletions(-) rename ggml/src/ggml-openvino/{ggml-quant.cpp => ggml-quants.cpp} (98%) rename ggml/src/ggml-openvino/{ggml-quant.hpp => ggml-quants.hpp} (100%) diff --git a/ci/run.sh b/ci/run.sh index bfce48f337..564dd270bd 100755 --- a/ci/run.sh +++ b/ci/run.sh @@ -174,7 +174,7 @@ if [ ! 
-z ${GG_BUILD_OPENVINO} ]; then echo "source /opt/intel/openvino/setupvars.sh" exit 1 fi - CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_OPENVINO=ON" + CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_OPENVINO=ON -DGGML_CPU_REPACK=OFF" fi ## helpers diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index cd897e5f68..cde99f3288 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -32,7 +32,7 @@ #include "ggml-backend-impl.h" #include "ggml-backend.h" -#include "ggml-quant.hpp" +#include "ggml-quants.hpp" GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgraph, bool is_static, bool is_first_token, int context_size, int num_heads, int num_heads_kv, int head_size) : diff --git a/ggml/src/ggml-openvino/ggml-quant.cpp b/ggml/src/ggml-openvino/ggml-quants.cpp similarity index 98% rename from ggml/src/ggml-openvino/ggml-quant.cpp rename to ggml/src/ggml-openvino/ggml-quants.cpp index 14ef58a3f7..8d4fb14189 100644 --- a/ggml/src/ggml-openvino/ggml-quant.cpp +++ b/ggml/src/ggml-openvino/ggml-quants.cpp @@ -1,4 +1,4 @@ -#include "ggml-quant.hpp" +#include "ggml-quants.hpp" #include #include @@ -75,11 +75,11 @@ void extract_q8_0_data(const ggml_tensor* tensor, auto weights = static_cast(weights_arr.data()); auto scales = scales_arr.data::value_type>(); auto biases = biases_arr.data::value_type>(); - for (int64_t i = 0; i < scales_arr.get_size(); i++) { + for (size_t i = 0; i < scales_arr.get_size(); i++) { uint8_t* block_data = data + i * bytes_per_block; scales[i] = ov::float16::from_bits(*(uint16_t*)block_data); biases[i] = ov::float16(-128.f * static_cast(scales[i])); - for (int64_t j = 0; j < weights_per_block; ++j) { + for (size_t j = 0; j < weights_per_block; ++j) { uint8_t x = block_data[j + 2]; // j+2 to skip the scale bytes. // Original data is in int8_t, so we add a bias of -128 and invert the // first bit. 
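The "invert the first bit" context above relies on a two's-complement identity: for any int8 value q, reinterpreting it as uint8 and XOR-ing the sign bit yields q + 128, which is why extract_q8_0_data can store unsigned weights together with the bias -128 * scale. A quick standalone self-check (illustrative, not part of the patch):

    #include <cassert>
    #include <cstdint>

    int main() {
        for (int q = -128; q <= 127; ++q) {
            const uint8_t w = (uint8_t)(int8_t)q ^ 0x80;  // flip the sign bit
            assert((int)w == q + 128);                    // exact integer identity
        }
        return 0;
    }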
@@ -128,7 +128,7 @@ void extract_q4_k_data(const ggml_tensor* tensor, // Extract qs1 and qs2 uint8_t* qs1 = block_data + 4; - uint8_t* qs2 = block_data + 16; + // uint8_t* qs2 = block_data + 16; scales[i * 8] = ov::float16(scale_scales * static_cast((*(qs1) & 0b111111))); scales[i * 8 + 1] = ov::float16(scale_scales * static_cast((*(qs1 + 1) & 0b111111))); @@ -170,7 +170,7 @@ void extract_q6_k_data(const ggml_tensor* tensor, auto scales = scales_arr.data::value_type>(); auto biases = biases_arr.data::value_type>(); // std::string name(tensor.name, tensor.namelen); - for (int64_t i = 0; i < n_super_block; i++) { + for (size_t i = 0; i < n_super_block; i++) { uint8_t* block_data = data + i * bytes_per_block; float scale_factor = diff --git a/ggml/src/ggml-openvino/ggml-quant.hpp b/ggml/src/ggml-openvino/ggml-quants.hpp similarity index 100% rename from ggml/src/ggml-openvino/ggml-quant.hpp rename to ggml/src/ggml-openvino/ggml-quants.hpp From a1ce428004783bc60511c2048b519138e8ad0698 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Fri, 8 Aug 2025 11:07:10 +0800 Subject: [PATCH 123/254] Fix Q4_1 --- ggml/src/ggml-openvino/ggml-quants.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-quants.cpp b/ggml/src/ggml-openvino/ggml-quants.cpp index 8d4fb14189..e969b0b54a 100644 --- a/ggml/src/ggml-openvino/ggml-quants.cpp +++ b/ggml/src/ggml-openvino/ggml-quants.cpp @@ -15,8 +15,8 @@ void unpack_32_4(const uint8_t* data, uint8_t* dst) { std::fill_n(dst, 16, 0); for (int j = 0; j < 16; ++j) { - uint8_t x = (data[j + 2] & 0x0F); // j+2 to skip scale bytes. - uint8_t y = (data[j + 2] >> 4); + uint8_t x = (data[j] & 0x0F); + uint8_t y = (data[j] >> 4); if (j % 2 != 0) { x <<= 4; y <<= 4; @@ -41,7 +41,7 @@ void extract_q4_0_data(const ggml_tensor* tensor, ov::parallel_for(scales_arr.get_size(), [&](size_t i) { scales[i] = ov::float16::from_bits(*((uint16_t*)(data + i * bytes_per_block))); biases[i] = ov::float16(-8.f * static_cast(scales[i])); - unpack_32_4(data + i * bytes_per_block, weights + i * 16); + unpack_32_4(data + i * bytes_per_block + 2, weights + i * 16); }); } @@ -58,8 +58,8 @@ void extract_q4_1_data(const ggml_tensor* tensor, auto biases = biases_arr.data::value_type>(); ov::parallel_for(scales_arr.get_size(), [&](size_t i) { scales[i] = ov::float16::from_bits(*((uint16_t*)(data + i * bytes_per_block))); - biases[i] = ov::float16::from_bits(*((uint16_t*)(data + i * bytes_per_block + 1))); - unpack_32_4(data + i * bytes_per_block, weights + i * 16); + biases[i] = ov::float16::from_bits(*((uint16_t*)(data + i * bytes_per_block + 2))); + unpack_32_4(data + i * bytes_per_block + 4, weights + i * 16); }); } From 9900245e0b6b673df7a12f717cf971df9c7ffa68 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Tue, 12 Aug 2025 09:44:21 +0800 Subject: [PATCH 124/254] Fix test-backend-ops: Treat quantized tensors as weights --- ggml/src/ggml-openvino/ggml-decoder.cpp | 16 ++++++++++------ ggml/src/ggml-openvino/ggml-decoder.h | 5 +++-- ggml/src/ggml-openvino/ggml-openvino.cpp | 14 +++++++++++--- ggml/src/ggml-openvino/utils.cpp | 6 +++++- 4 files changed, 29 insertions(+), 12 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index cde99f3288..b20bfd0c76 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -76,13 +76,15 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_cgraph* cgraph, add_extra_inputs(); } -GgmlOvDecoder::GgmlOvDecoder(struct 
ggml_cgraph* cgraph) { +GgmlOvDecoder::GgmlOvDecoder(struct ggml_cgraph* cgraph, + std::map>& model_weights) { if (getenv("GGML_OPENVINO_DUMP_CGRAPH")) { std::string filename = "cgraph.txt"; dump_cgraph(cgraph, filename); } m_cgraph = cgraph; + m_model_weights = model_weights; for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) { auto* cur_node = cgraph->nodes[node_n]; if (cur_node->op == GGML_OP_NONE) { @@ -123,10 +125,12 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, bool naive) { // Add model inputs and weights constants, if called for the whole graph if (naive) { - auto param_node = std::make_shared(get_ov_type(src), get_graph_input_shape(src)); - param_node->set_friendly_name(src_name); - param_node->output(0).get_tensor().set_names({src_name}); - m_model_inputs[src_name] = param_node; + if (m_model_weights.find(src_name) == m_model_weights.end()) { + auto param_node = std::make_shared(get_ov_type(src), get_graph_input_shape(src)); + param_node->set_friendly_name(src_name); + param_node->output(0).get_tensor().set_names({src_name}); + m_model_inputs[src_name] = param_node; + } } else if (!m_node && !src->view_src) { ggml_backend_buffer* buffer = src->buffer; @@ -381,7 +385,7 @@ std::map> GgmlOvDecoder::create_weight_no std::string src_name(src->name); if (!src->view_src) { ggml_backend_buffer* buffer = src->buffer; - if (buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) { + if (buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS || ggml_is_quantized(src->type)) { bool should_create = false; { std::lock_guard lock(weights_mutex); diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index ae378273d3..df23c649f4 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -20,7 +20,7 @@ public: int context_size, int num_heads, int num_heads_kv, int head_size); // Naive graph decoder - GgmlOvDecoder(struct ggml_cgraph* cgraph); + GgmlOvDecoder(struct ggml_cgraph* cgraph, std::map>& model_weights); virtual ov::Any get_attribute(const std::string& name) const override { return nullptr; @@ -115,6 +115,8 @@ public: ov::PartialShape get_graph_input_shape(const ggml_tensor* src) const; + static void dump_cgraph(const struct ggml_cgraph* cgraph, std::string& filename); + static std::shared_ptr create_weight_node(ggml_tensor* tensor); static std::map> create_weight_nodes(struct ggml_cgraph* cgraph); @@ -126,7 +128,6 @@ public: private: void set_input_output(ggml_tensor* node, bool naive = false); void add_extra_inputs(); - static void dump_cgraph(const struct ggml_cgraph* cgraph, std::string& filename); static std::vector get_shape(const ggml_tensor* tensor); static std::vector get_stride(const ggml_tensor* tensor); static ov::element::Type get_ov_type(const ggml_tensor* tensor); diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp index 23a92c58ac..4b743be688 100644 --- a/ggml/src/ggml-openvino/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino.cpp @@ -403,14 +403,22 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con return false; } for (int i = 0; i < GGML_MAX_SRC; i++) { - if (supported_types.find(op->type) == supported_types.end()) { - GGML_LOG_WARN("OpenVINO backend does not support tensor type %s\n", ggml_type_name(op->type)); + auto* src = op->src[i]; + if (src == nullptr) { + break; + } + if (supported_types.find(src->type) == supported_types.end()) { + GGML_LOG_WARN("OpenVINO backend does not support tensor 
type %s\n", ggml_type_name(src->type)); return false; } - if (op->src[i] != nullptr && op->src[i]->ne[3] != 1) { + if (src->ne[3] != 1) { GGML_LOG_WARN("OpenVINO backend does not support tensors with ne[3] != 1\n"); return false; } + if (ggml_is_quantized(src->type) && src->ne[2] != 1) { + GGML_LOG_WARN("OpenVINO backend does not support 3D quantized tensors\n"); + return false; + } } if (is_op_unsupported_case(op)) { diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 473fa72f99..43fa0c469d 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -281,10 +281,14 @@ enum ggml_status naive_compute(struct ggml_cgraph* cgraph, return GGML_STATUS_FAILED; } - auto decoder = std::make_shared(cgraph); + auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph); + auto decoder = std::make_shared(cgraph, model_weights); auto input_model = std::make_shared(decoder); auto naive = true; auto model = ov::frontend::ggml::FrontEnd::convert(input_model, naive); + if (getenv("GGML_OPENVINO_DUMP_IR")) { + ov::serialize(model, "IR_naive.xml"); + } auto infer_request = core.compile_model(model, device, config).create_infer_request(); auto ov_params = model->get_parameters(); From 9ca53c79917aa13954fddda4ae45878e6261b19d Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Tue, 19 Aug 2025 14:56:28 +0800 Subject: [PATCH 125/254] Add NPU Q4_0 support --- ggml/src/ggml-openvino/ggml-openvino.cpp | 28 +++++++++++++++--------- ggml/src/ggml-openvino/ggml-quants.cpp | 13 ++++++----- ggml/src/ggml-openvino/ggml-quants.hpp | 13 +++++++++++ 3 files changed, 39 insertions(+), 15 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp index 4b743be688..a6ec1c64c2 100644 --- a/ggml/src/ggml-openvino/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino.cpp @@ -333,16 +333,24 @@ static bool is_op_unsupported_case(const ggml_tensor* op) { static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor* op) { GGML_ASSERT(dev->reg != nullptr); - static const std::set supported_types{GGML_TYPE_F32, - GGML_TYPE_F16, - GGML_TYPE_BF16, - GGML_TYPE_I64, - GGML_TYPE_I32, - GGML_TYPE_Q4_0, - GGML_TYPE_Q4_1, - GGML_TYPE_Q4_K, - GGML_TYPE_Q8_0, - GGML_TYPE_Q6_K}; + static std::set supported_types{GGML_TYPE_F32, + GGML_TYPE_F16, + GGML_TYPE_BF16, + GGML_TYPE_I64, + GGML_TYPE_I32, + GGML_TYPE_Q4_0, + GGML_TYPE_Q4_1, + GGML_TYPE_Q4_K, + GGML_TYPE_Q8_0, + GGML_TYPE_Q6_K}; + + std::string device = std::string(getenv("GGML_OPENVINO_DEVICE")); + bool is_npu = device == "NPU"; + if (is_npu) { + // NPU has poor support for asymmetric quantization + supported_types.erase(GGML_TYPE_Q4_1); + supported_types.erase(GGML_TYPE_Q4_K); + } static const std::set supported_ops{GGML_OP_NONE, GGML_OP_ADD, diff --git a/ggml/src/ggml-openvino/ggml-quants.cpp b/ggml/src/ggml-openvino/ggml-quants.cpp index e969b0b54a..97aa494ed8 100644 --- a/ggml/src/ggml-openvino/ggml-quants.cpp +++ b/ggml/src/ggml-openvino/ggml-quants.cpp @@ -230,6 +230,10 @@ ov::Output make_int8_weights(ov::Tensor& weight, ov::Tensor& scales, o } auto zero_point = std::make_shared(biases_u8); + float zp_value; + if (ov::op::util::get_single_value(zero_point, zp_value)) { + zero_point = ov::op::v0::Constant::create(zero_point->get_element_type(), {}, {zp_value}); + } // Quantization operations auto weights_f16 = std::make_shared(weights_node, ov::element::f16); @@ -287,12 +291,11 @@ ov::Output make_int4_weights(ov::Tensor& weight, 
ov::Tensor& scales, o zero_point_data[i] = (bias2 << 4) | (bias1 & 0x0F); } - // CVS-166438: GGUF Q4_0 zp array (U4) with all same value (8) will be converted to single U4 scalar via ConvertU4WeightsZeroPointToScalar transformation. - // This corner case can be handled by CPU plugin properly, but will trigger compilation error on GPU plugin. - // Temporal WA by adding one small bias to keep zp array shape for GPU plugin, confirm no accuracy impact for final LLM generation results. - zero_point_data[0] += 1; - auto zero_points_node = std::make_shared(zero_point_tensor); + float zp_value; + if (ov::op::util::get_single_value(zero_points_node, zp_value)) { + zero_points_node = ov::op::v0::Constant::create(zero_points_node->get_element_type(), {}, {zp_value}); + } auto zero_points_f16 = std::make_shared(zero_points_node, ov::element::f16); auto scales_f16 = std::make_shared(scales); diff --git a/ggml/src/ggml-openvino/ggml-quants.hpp b/ggml/src/ggml-openvino/ggml-quants.hpp index 9c0dd89a95..ae37b1618e 100644 --- a/ggml/src/ggml-openvino/ggml-quants.hpp +++ b/ggml/src/ggml-openvino/ggml-quants.hpp @@ -1,5 +1,7 @@ #include +#include #include + #include "ggml.h" void unpack_32_4(const uint8_t* data, uint8_t* dst); @@ -42,3 +44,14 @@ ov::Output make_int4_weights(ov::Tensor& weight, ov::Tensor& scales, ov::Tensor& biases, size_t group_size = GGML_QUANTIZATION_GROUP_SIZE); + +namespace ov { +namespace op { +namespace util { +// From /src/common/transformations/include/transformations/utils/utils.hpp +bool get_single_value(const std::shared_ptr& const_node, + float& value, + bool check_value_range = true); +} // namespace util +} // namespace op +} // namespace ov From 82c98335d3ce54546f8884416c24a1295ce862b1 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Fri, 22 Aug 2025 15:00:38 +0800 Subject: [PATCH 126/254] NPU perf: eliminate zp --- .../openvino/pass/eliminate_zp.cpp | 116 ++++++++++++++++++ .../openvino/pass/eliminate_zp.hpp | 17 +++ .../openvino/translate_session.cpp | 2 + 3 files changed, 135 insertions(+) create mode 100644 ggml/src/ggml-openvino/openvino/pass/eliminate_zp.cpp create mode 100644 ggml/src/ggml-openvino/openvino/pass/eliminate_zp.hpp diff --git a/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.cpp b/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.cpp new file mode 100644 index 0000000000..d2e5a040dd --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.cpp @@ -0,0 +1,116 @@ +#include "eliminate_zp.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace ov { +namespace frontend { +namespace ggml { +namespace pass { + +EliminateZeroPoints::EliminateZeroPoints() { + // Find pattern: + // (Multiply Any(scale) + // (Subtract (Convert Constant(data))) + // (Convert Constant(zero_point))) + // where zero_point is a scalar + // If data is u4 and zp value is 8 (q4_0), Replace the Subtract with an i4 Constant whose value is data - zp_val + // If data is u8 and zp value is 128 (q8_0) or 32 (q6_k), Replace the Subtract with an i8 Constant + + auto m_data_constant = ov::pass::pattern::wrap_type(); + auto m_data_convert = ov::pass::pattern::wrap_type({m_data_constant}); + + auto m_zp_constant = ov::pass::pattern::wrap_type(); + auto m_zp_convert = ov::pass::pattern::wrap_type({m_zp_constant}); + + auto m_subtract = ov::pass::pattern::wrap_type({m_data_convert, m_zp_convert}); + auto m_scale = ov::pass::pattern::any_input(); + auto m_multiply = ov::pass::pattern::wrap_type({m_scale, m_subtract}); + + const 
auto callback = [=](ov::pass::pattern::Matcher& m) {
+        const auto& pattern_map = m.get_pattern_value_map();
+
+        auto multiply_node = std::dynamic_pointer_cast<ov::op::v1::Multiply>(pattern_map.at(m_multiply).get_node_shared_ptr());
+        auto subtract_node = std::dynamic_pointer_cast<ov::op::v1::Subtract>(pattern_map.at(m_subtract).get_node_shared_ptr());
+        auto data_constant = std::dynamic_pointer_cast<ov::op::v0::Constant>(pattern_map.at(m_data_constant).get_node_shared_ptr());
+        auto zp_constant = std::dynamic_pointer_cast<ov::op::v0::Constant>(pattern_map.at(m_zp_constant).get_node_shared_ptr());
+
+        if (!multiply_node || !subtract_node || !data_constant || !zp_constant) {
+            return false;
+        }
+
+        if (ov::shape_size(zp_constant->get_shape()) != 1) {
+            return false;
+        }
+
+        auto data_type = data_constant->get_element_type();
+        auto zp_data = zp_constant->cast_vector<int>();
+
+        if (zp_data.empty()) {
+            return false;
+        }
+
+        int zp_value = zp_data[0];
+
+        bool should_eliminate = false;
+        ov::element::Type target_type;
+
+        if (data_type == ov::element::u4 && zp_value == 8) {
+            should_eliminate = true;
+            target_type = ov::element::i4;
+        } else if (data_type == ov::element::u8 && (zp_value == 128 || zp_value == 32)) {
+            should_eliminate = true;
+            target_type = ov::element::i8;
+        }
+
+        if (!should_eliminate) {
+            return false;
+        }
+
+        auto data_shape = data_constant->get_shape();
+        size_t total_elements = ov::shape_size(data_shape);
+
+        std::shared_ptr<ov::Node> new_constant;
+
+        if (data_type == ov::element::u4) {
+            auto data_values = data_constant->cast_vector<uint8_t>();
+            std::vector<int8_t> adjusted_values(total_elements);
+
+            ov::parallel_for(total_elements, [&](size_t i) {
+                adjusted_values[i] = static_cast<int8_t>(static_cast<int>(data_values[i]) - 8);
+            });
+
+            new_constant = std::make_shared<ov::op::v0::Constant>(target_type, data_shape, adjusted_values);
+        } else if (data_type == ov::element::u8) {
+            auto data_values = data_constant->cast_vector<uint8_t>();
+            std::vector<int8_t> adjusted_values(total_elements);
+
+            ov::parallel_for(total_elements, [&, zp_value](size_t i) {
+                adjusted_values[i] = static_cast<int8_t>(static_cast<int>(data_values[i]) - zp_value);
+            });
+
+            new_constant = std::make_shared<ov::op::v0::Constant>(target_type, data_shape, adjusted_values);
+        }
+
+        auto new_convert = std::make_shared<ov::op::v0::Convert>(new_constant, subtract_node->get_output_element_type(0));
+        ov::replace_node(subtract_node, new_convert);
+
+        return true;
+    };
+
+    register_matcher(std::make_shared<ov::pass::pattern::Matcher>(m_multiply, "ov::frontend::ggml::pass::EliminateZeroPoints"),
+                     callback);
+}
+
+}  // namespace pass
+}  // namespace ggml
+}  // namespace frontend
+}  // namespace ov
diff --git a/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.hpp b/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.hpp
new file mode 100644
index 0000000000..edd3cd718d
--- /dev/null
+++ b/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.hpp
@@ -0,0 +1,17 @@
+#include "openvino/pass/matcher_pass.hpp"
+
+namespace ov {
+namespace frontend {
+namespace ggml {
+namespace pass {
+
+class EliminateZeroPoints : public ov::pass::MatcherPass {
+public:
+    OPENVINO_MATCHER_PASS_RTTI("ov::frontend::ggml::pass::EliminateZeroPoints")
+    EliminateZeroPoints();
+};
+
+}  // namespace pass
+}  // namespace ggml
+}  // namespace frontend
+}  // namespace ov
diff --git a/ggml/src/ggml-openvino/openvino/translate_session.cpp b/ggml/src/ggml-openvino/openvino/translate_session.cpp
index 3e27a689d5..634fea40e9 100644
--- a/ggml/src/ggml-openvino/openvino/translate_session.cpp
+++ b/ggml/src/ggml-openvino/openvino/translate_session.cpp
@@ -26,6 +26,7 @@
 #include "ggml-openvino/openvino/node_context.hpp"
 #include "ggml-openvino/openvino/utils.hpp"
 #include "input_model.hpp"
+#include "pass/eliminate_zp.hpp" #include "pass/fuse_to_sdpa.hpp" #include "pass/mark_decompression_convert_constant_folding.hpp" @@ -219,6 +220,7 @@ std::shared_ptr TranslateSession::apply_transformations(std::shared_ptr(kv_param_res_pairs); } + manager.register_pass(); manager.register_pass(); manager.run_passes(model); } From b593428eb30cb5daf8000fde044a4f1da35f86f4 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Fri, 29 Aug 2025 11:39:27 +0800 Subject: [PATCH 127/254] Dequantize q4_1 q4_k q6_k for NPU --- ggml/src/ggml-openvino/ggml-decoder.cpp | 25 +++++++++++++++++------- ggml/src/ggml-openvino/ggml-decoder.h | 5 +++-- ggml/src/ggml-openvino/ggml-openvino.cpp | 8 -------- ggml/src/ggml-openvino/utils.cpp | 6 +++++- 4 files changed, 26 insertions(+), 18 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index b20bfd0c76..fef8648ebd 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -370,7 +370,8 @@ std::map GgmlOvDecoder::get_kv_param_res_names() const return kv_param_res_names; } -std::map> GgmlOvDecoder::create_weight_nodes(struct ggml_cgraph* cgraph) { +std::map> GgmlOvDecoder::create_weight_nodes( + struct ggml_cgraph* cgraph, std::set types_to_dequantize) { std::map> model_weights; static std::mutex weights_mutex; auto* nodes = cgraph->nodes; @@ -395,7 +396,7 @@ std::map> GgmlOvDecoder::create_weight_no } } if (should_create) { - auto weight_node = create_weight_node(src); + auto weight_node = create_weight_node(src, types_to_dequantize.count(src->type) > 0); weight_node->set_friendly_name(src_name); { std::lock_guard lock(weights_mutex); @@ -409,7 +410,7 @@ std::map> GgmlOvDecoder::create_weight_no return model_weights; } -std::shared_ptr GgmlOvDecoder::create_weight_node(ggml_tensor* tensor) { +std::shared_ptr GgmlOvDecoder::create_weight_node(ggml_tensor* tensor, bool to_dequantize) { std::set weight_types = { GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1, GGML_TYPE_Q4_K, GGML_TYPE_Q6_K}; if (weight_types.find(tensor->type) == weight_types.end()) { @@ -422,15 +423,17 @@ std::shared_ptr GgmlOvDecoder::create_weight_node(ggml_tensor* tensor) auto ne_total = ggml_nelements(tensor); OPENVINO_ASSERT(node_shape[0] == 1, "Got 3D weights, expect all weights to be 2D: ", tensor->name); + node_shape.erase(node_shape.begin()); // F16 and F32 case if (node_type != ov::element::dynamic) { ov::Tensor weights(node_type, node_shape); memcpy(weights.data(), tensor->data, ne_total * node_type.size()); std::shared_ptr weight_node = std::make_shared(weights); - if (node_type == ov::element::f16) { - weight_node = std::make_shared(weight_node, ov::element::f32); - } + // Disabled because it triggers a bug in NPUW, no performance impact on CPU GPU + // if (node_type == ov::element::f16) { + // weight_node = std::make_shared(weight_node, ov::element::f32); + // } weight_node->set_friendly_name(tensor->name); return weight_node; } @@ -440,7 +443,15 @@ std::shared_ptr GgmlOvDecoder::create_weight_node(ggml_tensor* tensor) tensor->extra == nullptr, "Unsupported weight tensor: " + std::string(tensor->name) + " Possibly this is a repacked quantized weights"); - node_shape.erase(node_shape.begin()); + if (to_dequantize) { + std::vector weights_f32(ne_total); + ggml_get_type_traits(tensor->type)->to_float(tensor->data, weights_f32.data(), ggml_nelements(tensor)); + ov::Tensor weights(ov::element::f16, node_shape); + 
ggml_get_type_traits(GGML_TYPE_F16)->from_float_ref(weights_f32.data(), weights.data(), ggml_nelements(tensor)); + std::shared_ptr weight_node = std::make_shared(weights); + weight_node->set_friendly_name(tensor->name); + return weight_node; + } uint64_t weights_per_byte; if (tensor->type == GGML_TYPE_Q4_0 || tensor->type == GGML_TYPE_Q4_1 || tensor->type == GGML_TYPE_Q4_K) { diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index df23c649f4..b446841514 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -117,8 +117,9 @@ public: static void dump_cgraph(const struct ggml_cgraph* cgraph, std::string& filename); - static std::shared_ptr create_weight_node(ggml_tensor* tensor); - static std::map> create_weight_nodes(struct ggml_cgraph* cgraph); + static std::shared_ptr create_weight_node(ggml_tensor* tensor, bool to_dequantize); + static std::map> create_weight_nodes( + struct ggml_cgraph* cgraph, std::set types_to_dequantize = {}); const ggml_tensor* get_tensor_used_op(const ggml_tensor* tensor) const; const ggml_tensor* get_tensor_from_name(const std::string& name) const; diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp index a6ec1c64c2..60a2eb388e 100644 --- a/ggml/src/ggml-openvino/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino.cpp @@ -344,14 +344,6 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con GGML_TYPE_Q8_0, GGML_TYPE_Q6_K}; - std::string device = std::string(getenv("GGML_OPENVINO_DEVICE")); - bool is_npu = device == "NPU"; - if (is_npu) { - // NPU has poor support for asymmetric quantization - supported_types.erase(GGML_TYPE_Q4_1); - supported_types.erase(GGML_TYPE_Q4_K); - } - static const std::set supported_ops{GGML_OP_NONE, GGML_OP_ADD, GGML_OP_MUL, diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 43fa0c469d..e49d941da4 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -130,7 +130,11 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c compile_end_time = conversion_end_time; } else { std::shared_ptr model; - auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph); + std::set types_to_dequantize; + if (is_static) { + types_to_dequantize = {GGML_TYPE_Q4_1, GGML_TYPE_Q4_K, GGML_TYPE_Q6_K}; + } + auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph, types_to_dequantize); if (is_static) { ggml_decoder = std::make_shared(cgraph, model_weights, is_static, true); From 6926655f5bf3dcf8d834f25cb78e0cbf28b7ba36 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Tue, 2 Sep 2025 13:52:45 +0800 Subject: [PATCH 128/254] Add custom quant type: q8_1_c, q4_0_128 --- ggml/src/ggml-openvino/ggml-decoder.cpp | 44 ++---- ggml/src/ggml-openvino/ggml-decoder.h | 7 +- ggml/src/ggml-openvino/ggml-quants.cpp | 192 +++++++++++++++++++----- ggml/src/ggml-openvino/ggml-quants.hpp | 10 ++ ggml/src/ggml-openvino/utils.cpp | 16 +- 5 files changed, 202 insertions(+), 67 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index fef8648ebd..d00b78e891 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -371,7 +372,7 @@ std::map GgmlOvDecoder::get_kv_param_res_names() const } std::map> GgmlOvDecoder::create_weight_nodes( - struct 
ggml_cgraph* cgraph, std::set types_to_dequantize) { + struct ggml_cgraph* cgraph, std::map types_to_requantize) { std::map> model_weights; static std::mutex weights_mutex; auto* nodes = cgraph->nodes; @@ -396,7 +397,10 @@ std::map> GgmlOvDecoder::create_weight_no } } if (should_create) { - auto weight_node = create_weight_node(src, types_to_dequantize.count(src->type) > 0); + auto requant_type = types_to_requantize.count(src->type) ? + std::optional(types_to_requantize.at(src->type)) : + std::nullopt; + auto weight_node = create_weight_node(src, requant_type); weight_node->set_friendly_name(src_name); { std::lock_guard lock(weights_mutex); @@ -410,7 +414,8 @@ std::map> GgmlOvDecoder::create_weight_no return model_weights; } -std::shared_ptr GgmlOvDecoder::create_weight_node(ggml_tensor* tensor, bool to_dequantize) { +std::shared_ptr GgmlOvDecoder::create_weight_node(ggml_tensor* tensor, + std::optional requant_type) { std::set weight_types = { GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1, GGML_TYPE_Q4_K, GGML_TYPE_Q6_K}; if (weight_types.find(tensor->type) == weight_types.end()) { @@ -443,21 +448,15 @@ std::shared_ptr GgmlOvDecoder::create_weight_node(ggml_tensor* tensor, tensor->extra == nullptr, "Unsupported weight tensor: " + std::string(tensor->name) + " Possibly this is a repacked quantized weights"); - if (to_dequantize) { - std::vector weights_f32(ne_total); - ggml_get_type_traits(tensor->type)->to_float(tensor->data, weights_f32.data(), ggml_nelements(tensor)); - ov::Tensor weights(ov::element::f16, node_shape); - ggml_get_type_traits(GGML_TYPE_F16)->from_float_ref(weights_f32.data(), weights.data(), ggml_nelements(tensor)); - std::shared_ptr weight_node = std::make_shared(weights); - weight_node->set_friendly_name(tensor->name); - return weight_node; + if (requant_type.has_value()) { + return requantize(tensor, requant_type.value()); } - uint64_t weights_per_byte; + ov::element::Type weight_type; if (tensor->type == GGML_TYPE_Q4_0 || tensor->type == GGML_TYPE_Q4_1 || tensor->type == GGML_TYPE_Q4_K) { - weights_per_byte = 2; + weight_type = ov::element::u4; } else { // tensor.type == GGUF_TYPE_Q8_0 || tensor.type == GGUF_TYPE_Q6_K - weights_per_byte = 1; + weight_type = ov::element::u8; } uint64_t weights_per_block; @@ -474,15 +473,12 @@ std::shared_ptr GgmlOvDecoder::create_weight_node(ggml_tensor* tensor, " has incompatible last dim shape: ", node_shape.back()); - auto weights_shape = node_shape; - weights_shape.back() /= (weights_per_byte * 4); // means u32 type can store 8 q4 or 4 q8 - - ov::Tensor weights(ov::element::u32, weights_shape); - // For scales and bias + ov::Tensor weights(weight_type, node_shape); + // For scales and biases node_shape[node_shape.size() - 1] = node_shape[node_shape.size() - 1] / weights_per_block; - ov::Tensor scales(ov::element::f16, node_shape); ov::Tensor biases(ov::element::f16, node_shape); + ov::Output weight_node; if (tensor->type == GGML_TYPE_Q4_0) { extract_q4_0_data(tensor, weights, scales, biases); @@ -494,7 +490,6 @@ std::shared_ptr GgmlOvDecoder::create_weight_node(ggml_tensor* tensor, extract_q8_0_data(tensor, weights, scales, biases); weight_node = make_int8_weights(weights, scales, biases, weights_per_block); } else if (tensor->type == GGML_TYPE_Q6_K) { - // due to WA #2135, this case will not be used, extract_q6_k_data temporarily disabled. 
extract_q6_k_data(tensor, weights, scales, biases); weight_node = make_int8_weights(weights, scales, biases, weights_per_block); } else if (tensor->type == GGML_TYPE_Q4_K) { @@ -503,15 +498,8 @@ std::shared_ptr GgmlOvDecoder::create_weight_node(ggml_tensor* tensor, } OPENVINO_ASSERT(weight_node.get_shape().size() == 2, "Weight should be 2D"); - // weight_node = std::make_shared( - // weight_node, ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {0})); weight_node.get_node_shared_ptr()->set_friendly_name(tensor->name); - // GGML_LOG_DEBUG("Created weight node: %s %s %s%s\n", - // tensor->name, - // ggml_type_name(tensor->type), - // weight_node.get_element_type().get_type_name().c_str(), - // weight_node.get_partial_shape().to_string().c_str()); return weight_node.get_node_shared_ptr(); } diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index b446841514..24e1d92dcf 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -4,8 +4,10 @@ #include #include #include +#include #include +#include "ggml-quants.hpp" #include "ggml.h" #include "openvino/decoder.hpp" @@ -117,9 +119,10 @@ public: static void dump_cgraph(const struct ggml_cgraph* cgraph, std::string& filename); - static std::shared_ptr create_weight_node(ggml_tensor* tensor, bool to_dequantize); + static std::shared_ptr create_weight_node(ggml_tensor* tensor, + std::optional requant_type = std::nullopt); static std::map> create_weight_nodes( - struct ggml_cgraph* cgraph, std::set types_to_dequantize = {}); + struct ggml_cgraph* cgraph, std::map types_to_requantize = {}); const ggml_tensor* get_tensor_used_op(const ggml_tensor* tensor) const; const ggml_tensor* get_tensor_from_name(const std::string& name) const; diff --git a/ggml/src/ggml-openvino/ggml-quants.cpp b/ggml/src/ggml-openvino/ggml-quants.cpp index 97aa494ed8..1603e65355 100644 --- a/ggml/src/ggml-openvino/ggml-quants.cpp +++ b/ggml/src/ggml-openvino/ggml-quants.cpp @@ -1,15 +1,20 @@ #include "ggml-quants.hpp" #include +#include +#include #include #include +#include #include #include #include #include #include #include +#include +#include "ggml-impl.h" #include "ggml.h" void unpack_32_4(const uint8_t* data, uint8_t* dst) { @@ -203,20 +208,24 @@ void extract_q6_k_data(const ggml_tensor* tensor, // TODO Reorder for make_intX_weights ov::Output make_int8_weights(ov::Tensor& weight, ov::Tensor& scales, ov::Tensor& biases, size_t group_size) { - - // Reshape weight to (num_heads, -1, group_size) ov::Shape orig_shape = weight.get_shape(); - orig_shape[1] *= sizeof(uint32_t) / sizeof(uint8_t); - size_t num_groups = orig_shape[1] / group_size; // Expand dimensions for scales and biases auto scale_shape = scales.get_shape(); - scale_shape.push_back(1); - scales.set_shape(scale_shape); - biases.set_shape(scale_shape); + + ov::Shape packed_shape = {orig_shape[0], orig_shape[1] / group_size, group_size}; + + if (packed_shape[1] == 1) { + packed_shape.erase(packed_shape.begin() + 1); + } else { + scale_shape.push_back(1); + scales.set_shape(scale_shape); + biases.set_shape(scale_shape); + } // Create graph nodes - auto weights_node = std::make_shared(ov::element::u8, ov::Shape{orig_shape[0], num_groups, group_size}, static_cast(weight.data()), nullptr); + auto weights_node = std::make_shared( + ov::element::u8, packed_shape, static_cast(weight.data()), nullptr); weights_node->get_rt_info()["__gguf_tensor_holder"] = weight; auto scales_f16 = std::make_shared(scales); ov::Tensor 
biases_u8(ov::element::u8, scale_shape);
@@ -242,32 +251,24 @@ ov::Output<ov::Node> make_int8_weights(ov::Tensor& weight, ov::Tensor& scales, o
     auto w_zp = std::make_shared<ov::op::v1::Subtract>(
         weights_f16, zero_point_f16, ov::op::AutoBroadcastType::NUMPY
     );
-    auto w_zp_s = std::make_shared<ov::op::v1::Multiply>(
-        w_zp, scales_f16, ov::op::AutoBroadcastType::NUMPY
-    );
+    ov::Output<ov::Node> w_zp_s =
+        std::make_shared<ov::op::v1::Multiply>(w_zp, scales_f16, ov::op::AutoBroadcastType::NUMPY);
 
-    // Reshape back to original dimensions
-    auto final_shape = std::make_shared<ov::op::v0::Constant>(
-        ov::element::i64, ov::Shape{orig_shape.size()}, orig_shape
-    );
-    auto w_zp_s_r = std::make_shared<ov::op::v1::Reshape>(
-        w_zp_s, final_shape, false
-    );
+    if (packed_shape.size() != 2) {
+        // If not requantized channel-wise case, reshape back to original shape
+        auto final_shape =
+            std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{orig_shape.size()}, orig_shape);
+        w_zp_s = std::make_shared<ov::op::v1::Reshape>(w_zp_s, final_shape, false);
+    }
 
-    return std::make_shared<ov::op::v0::Convert>(w_zp_s_r, ov::element::f32);
+    return std::make_shared<ov::op::v0::Convert>(w_zp_s, ov::element::f32);
 }
 
 ov::Output<ov::Node> make_int4_weights(ov::Tensor& weight, ov::Tensor& scales, ov::Tensor& biases, size_t group_size) {
-
-    // Convert weight to uint8 view and adjust shape
     ov::Shape orig_weight_shape = weight.get_shape();
-    orig_weight_shape[1] *= sizeof(uint32_t) / sizeof(uint8_t) * 2;  // Double number of columns for 4-bit representation
 
     // Expand dimensions for scales and biases
     ov::Shape scale_bias_shape = scales.get_shape();
-    scale_bias_shape.push_back(1);  // Add new axis at the end
-    scales.set_shape(scale_bias_shape);
-    biases.set_shape(scale_bias_shape);
 
     // Create INT4 weight tensor
     ov::Shape packed_shape = {
@@ -276,8 +277,17 @@ ov::Output<ov::Node> make_int4_weights(ov::Tensor& weight, ov::Tensor& scales, o
         group_size
     };
 
+    // Requantized channel-wise case
+    if (packed_shape[1] == 1) {
+        packed_shape.erase(packed_shape.begin() + 1);
+    } else {
+        scale_bias_shape.push_back(1);
+        scales.set_shape(scale_bias_shape);
+        biases.set_shape(scale_bias_shape);
+    }
+
     auto weights_node = std::make_shared<ov::op::v0::Constant>(ov::element::u4, packed_shape, static_cast<uint8_t*>(weight.data()), nullptr);
-    weights_node->get_rt_info()["__gguf_tensor_holde"] = weight;
+    weights_node->get_rt_info()["__gguf_tensor_holder"] = weight;
     auto weights_f16 = std::make_shared<ov::op::v0::Convert>(weights_node, ov::element::f16);
@@ -304,15 +314,129 @@ ov::Output<ov::Node> make_int4_weights(ov::Tensor& weight, ov::Tensor& scales, o
     auto w_zp = std::make_shared<ov::op::v1::Subtract>(
         weights_f16, zero_points_f16, ov::op::AutoBroadcastType::NUMPY);
 
-    auto w_zp_s = std::make_shared<ov::op::v1::Multiply>(
-        w_zp, scales_f16, ov::op::AutoBroadcastType::NUMPY);
+    ov::Output<ov::Node> w_zp_s =
+        std::make_shared<ov::op::v1::Multiply>(w_zp, scales_f16, ov::op::AutoBroadcastType::NUMPY);
 
-    // Reshape back to original shape
-    auto final_shape = std::make_shared<ov::op::v0::Constant>(
-        ov::element::i64, ov::Shape{orig_weight_shape.size()}, orig_weight_shape);
+    if (packed_shape.size() != 2) {
+        // If not requantized channel-wise case, reshape back to original shape
+        auto final_shape = std::make_shared<ov::op::v0::Constant>(
+            ov::element::i64, ov::Shape{orig_weight_shape.size()}, orig_weight_shape);
 
-    auto w_zp_s_r = std::make_shared<ov::op::v1::Reshape>(
-        w_zp_s, final_shape, false);
+        w_zp_s = std::make_shared<ov::op::v1::Reshape>(w_zp_s, final_shape, false);
+    }
 
-    return std::make_shared<ov::op::v0::Convert>(w_zp_s_r, ov::element::f32);
+    return std::make_shared<ov::op::v0::Convert>(w_zp_s, ov::element::f32);
+}
+
+std::shared_ptr<ov::Node> requantize(const ggml_tensor* tensor, ExtraQuantType requant_type) {
+    std::vector<float> weights_f32(tensor->ne[0] * tensor->ne[1]);
+    ggml_get_type_traits(tensor->type)->to_float(tensor->data, weights_f32.data(), ggml_nelements(tensor));
+
+    std::shared_ptr<ov::Node> weight_node;
+    ov::Shape node_shape = {(uint64_t) (tensor->ne[1]), (uint64_t) (tensor->ne[0])};
+
+    if (requant_type == ExtraQuantType::F16) {
+        ov::Tensor weights(ov::element::f16, node_shape);
+        ggml_get_type_traits(GGML_TYPE_F16)->from_float_ref(weights_f32.data(), weights.data(), ggml_nelements(tensor));
+        std::shared_ptr<ov::Node> weight_node = std::make_shared<ov::op::v0::Constant>(weights);
+        weight_node->set_friendly_name(tensor->name);
+        return weight_node;
+    }
+
+    int64_t block_size = node_shape[1];
+    if (requant_type == ExtraQuantType::Q4_0_128) {
+        block_size = 128;
+    }
+    auto scales_shape = ov::Shape{node_shape[0], node_shape[1] / block_size};
+
+    ov::Tensor weights;
+    ov::Tensor scales(ov::element::f16, scales_shape);
+    ov::Tensor bias(ov::element::f16, scales_shape);
+
+    if (requant_type == ExtraQuantType::Q4_0_C) {
+        weights = ov::Tensor(ov::element::u4, node_shape);
+        quantize_q4_0(weights_f32.data(), weights, scales, bias, weights.get_size(), block_size);
+        weight_node = make_int4_weights(weights, scales, bias, block_size).get_node_shared_ptr();
+    } else if (requant_type == ExtraQuantType::Q8_1_C) {
+        weights = ov::Tensor(ov::element::u8, node_shape);
+        quantize_q8_1(weights_f32.data(), weights, scales, bias, weights.get_size(), block_size);
+        weight_node = make_int8_weights(weights, scales, bias, block_size).get_node_shared_ptr();
+    } else if (requant_type == ExtraQuantType::Q4_0_128) {
+        weights = ov::Tensor(ov::element::u4, node_shape);
+        quantize_q4_0(weights_f32.data(), weights, scales, bias, weights.get_size(), block_size);
+        weight_node = make_int4_weights(weights, scales, bias, block_size).get_node_shared_ptr();
+    }
+
+    weight_node->set_friendly_name(tensor->name);
+    return weight_node;
+}
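quantize_q4_0 below mirrors ggml's reference Q4_0 quantizer (quantize_row_q4_0_ref): the scale is d = max / -8, where max is the block value with the largest magnitude, so q = v / d always lands in [-8, 8]; (int8_t)(x0 + 8.5f) is then a branch-free round-half-up of x0 + 8 (the shifted value is non-negative, so float-to-int truncation equals flooring), and MIN(15, ...) clamps the single overflow case q = +8. A worked numeric example (illustrative only, not part of the patch):

    #include <cassert>
    #include <cstdint>

    int main() {
        const float max = -0.5f;   // block value with the largest magnitude
        const float d = max / -8;  // 0.0625f, stored as the f16 scale
        const float id = 1.0f / d;
        const float v = 0.2f;
        const uint8_t q = (uint8_t)(int8_t)(v * id + 8.5f);  // trunc(11.7f) == 11
        assert(q == 11);
        const float back = ((int)q - 8) * d;  // 0.1875f, within one step d of v
        assert(back >= v - d && back <= v + d);
        return 0;
    }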
+
+void quantize_q4_0(const float* x, ov::Tensor& weights_arr, ov::Tensor& scales_arr, ov::Tensor& biases_arr, int64_t k,
+                   int64_t qk) {
+    assert(k % qk == 0);
+    const int nb = k / qk;
+
+    auto* weights = static_cast<uint8_t*>(weights_arr.data());
+    auto* scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
+    auto* biases = biases_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
+    for (int i = 0; i < nb; i++) {
+        float amax = 0.0f;  // absolute max
+        float max = 0.0f;
+
+        for (int j = 0; j < qk; j++) {
+            const float v = x[i * qk + j];
+            if (amax < fabsf(v)) {
+                amax = fabsf(v);
+                max = v;
+            }
+        }
+
+        const float d = max / -8;
+        const float id = d ? 1.0f / d : 0.0f;
+        scales[i] = ov::float16(d);
+        biases[i] = ov::float16(-8.f * d);
+
+        for (int j = 0; j < qk / 2; ++j) {
+            const float x0 = x[i * qk + 2 * j] * id;
+            const float x1 = x[i * qk + 2 * j + 1] * id;
+            const uint8_t xi0 = MIN(15, (int8_t) (x0 + 8.5f));
+            const uint8_t xi1 = MIN(15, (int8_t) (x1 + 8.5f));
+            weights[i * qk / 2 + j] = xi0 | (xi1 << 4);
+        }
+    }
+}
+
+void quantize_q8_1(const float* x, ov::Tensor& weights_arr, ov::Tensor& scales_arr, ov::Tensor& biases_arr, int64_t k,
+                   int64_t qk) {
+    assert(k % qk == 0);
+    const int nb = k / qk;
+
+    auto* weights = static_cast<uint8_t*>(weights_arr.data());
+    auto* scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
+    auto* biases = biases_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
+    for (int i = 0; i < nb; i++) {
+        float min = std::numeric_limits<float>::max();
+        float max = std::numeric_limits<float>::lowest();
+
+        for (int j = 0; j < qk; j++) {
+            const float v = x[i * qk + j];
+            if (v < min) {
+                min = v;
+            }
+            if (v > max) {
+                max = v;
+            }
+        }
+
+        const float d = (max - min) / ((1 << 8) - 1);
+        const float id = d ? 
1.0f / d : 0.0f; + scales[i] = ov::float16(d); + biases[i] = ov::float16(min); + + for (int j = 0; j < qk; ++j) { + const float x0 = (x[i * qk + j] - min) * id; + const uint8_t xi0 = roundf(x0); + weights[i * qk + j] = xi0; + } + } } diff --git a/ggml/src/ggml-openvino/ggml-quants.hpp b/ggml/src/ggml-openvino/ggml-quants.hpp index ae37b1618e..fbae2aa1f4 100644 --- a/ggml/src/ggml-openvino/ggml-quants.hpp +++ b/ggml/src/ggml-openvino/ggml-quants.hpp @@ -1,3 +1,4 @@ +#pragma once #include #include #include @@ -45,6 +46,15 @@ ov::Output make_int4_weights(ov::Tensor& weight, ov::Tensor& biases, size_t group_size = GGML_QUANTIZATION_GROUP_SIZE); +enum class ExtraQuantType { F16, Q4_0_C, Q8_1_C, Q4_0_128 }; + +std::shared_ptr requantize(const ggml_tensor* tensor, ExtraQuantType requant_type); + +void quantize_q4_0(const float* x, ov::Tensor& weights_arr, ov::Tensor& scales_arr, ov::Tensor& biases_arr, int64_t k, + int64_t qk); +void quantize_q8_1(const float* x, ov::Tensor& weights_arr, ov::Tensor& scales_arr, ov::Tensor& biases_arr, int64_t k, + int64_t qk); + namespace ov { namespace op { namespace util { diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index e49d941da4..3f728c242d 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -130,11 +130,21 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c compile_end_time = conversion_end_time; } else { std::shared_ptr model; - std::set types_to_dequantize; + std::map types_to_requantize; if (is_static) { - types_to_dequantize = {GGML_TYPE_Q4_1, GGML_TYPE_Q4_K, GGML_TYPE_Q6_K}; + types_to_requantize = { + {GGML_TYPE_Q4_0, ExtraQuantType::Q4_0_128}, + {GGML_TYPE_Q4_1, ExtraQuantType::Q4_0_128}, + {GGML_TYPE_Q4_K, ExtraQuantType::Q4_0_128}, + {GGML_TYPE_Q6_K, ExtraQuantType::Q8_1_C }, + }; + } else if (device == "GPU") { + types_to_requantize = { + // CVS-166739 + {GGML_TYPE_Q6_K, ExtraQuantType::Q8_1_C}, + }; } - auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph, types_to_dequantize); + auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph, types_to_requantize); if (is_static) { ggml_decoder = std::make_shared(cgraph, model_weights, is_static, true); From c5231a24481f76ab3dbca2a1b5ffd16a5dee0663 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Tue, 2 Sep 2025 14:52:04 +0800 Subject: [PATCH 129/254] Set m_is_static=false as default in decoder --- ggml/src/ggml-openvino/ggml-decoder.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index 24e1d92dcf..4ba147da20 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -161,7 +161,7 @@ private: int m_head_size; int32_t* m_rope_params; std::vector m_kv_names; - bool m_is_static; + bool m_is_static = false; bool m_is_first_token; }; From 810eb480f529148ee6e20437755dbb3273589f60 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Tue, 2 Sep 2025 14:53:09 +0800 Subject: [PATCH 130/254] Simpilfy translation of get_rows --- .../ggml-openvino/openvino/op/get_rows.cpp | 26 ++++++------------- 1 file changed, 8 insertions(+), 18 deletions(-) diff --git a/ggml/src/ggml-openvino/openvino/op/get_rows.cpp b/ggml/src/ggml-openvino/openvino/op/get_rows.cpp index 0de77da59f..5e4c7d901a 100644 --- a/ggml/src/ggml-openvino/openvino/op/get_rows.cpp +++ b/ggml/src/ggml-openvino/openvino/op/get_rows.cpp @@ -3,10 +3,7 @@ #include #include #include -#include -#include #include 
-#include #include "../node_context.hpp" #include "../op_table.hpp" @@ -31,22 +28,15 @@ OutputVector translate_get_rows(const NodeContext& context) { indices = process_view_input(context, 1); } - Output axis = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{}, {1}); - if (indices.get_partial_shape()[1].get_length() == 1) { - indices = - std::make_shared(indices, ov::op::v0::Constant::create(ov::element::i64, {2}, {0, 1})); - if (data.get_partial_shape().rank() == 2) { - axis = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{}, {0}); - } - res = std::make_shared(data, indices, axis); - if (data.get_partial_shape().rank() == 2) { - res = - std::make_shared(res, ov::op::v0::Constant::create(ov::element::i64, {1}, {0})); - } - } else { - indices = - std::make_shared(indices, ov::op::v0::Constant::create(ov::element::i64, {1}, {0})); + // data[b,x,y] ind[1,b,x'] test-backend-ops case + // data[x,y] ind[1,1,x'] normal case + indices = std::make_shared(indices, ov::op::v0::Constant::create(ov::element::i64, {1}, {0})); + if (data.get_partial_shape().rank() == 3) { + auto axis = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{}, {1}); res = std::make_shared(data, indices, axis, 1); + } else { + auto axis = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{}, {0}); + res = std::make_shared(data, indices, axis); } if (res.get_element_type() != context.get_output_type(0)) { From 0f7b253cb3daa67f4ac8d2659459a0609692b1a3 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Mon, 8 Sep 2025 16:52:58 +0800 Subject: [PATCH 131/254] Fix after rebasing --- ggml/src/ggml-openvino/openvino/op/mulmat.cpp | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp index bfccc28163..b4103378eb 100644 --- a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp +++ b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp @@ -41,13 +41,8 @@ OutputVector translate_mulmat(const NodeContext& context) { B = process_view_input(context, 0); A = process_view_input(context, 1); } - - bool convert_out_type = false; - if (ov::op::util::is_constant(B.get_node()) && context.get_input_type(0) != context.get_input_type(1)) { - B = std::make_shared(B, context.get_input_type(1)); - } else if (context.get_input_type(0) != context.get_input_type(1)) { - A = std::make_shared(A, context.get_input_type(0)); - convert_out_type = true; + if (A.get_element_type() != B.get_element_type()) { + B = std::make_shared(context.get_input(0), context.get_input_type(1)); } auto B_shape = context.get_input_shape(0).to_shape(); @@ -82,12 +77,7 @@ OutputVector translate_mulmat(const NodeContext& context) { A = Z; } - if (convert_out_type) { - auto result_lp = std::make_shared(A, B, false, transpose_b); - res = std::make_shared(result_lp, context.get_output_type(0)); - } else { - res = std::make_shared(A, B, false, transpose_b); - } + res = std::make_shared(A, B, false, transpose_b); return rename_outputs_with_suffix({res}, context.get_name()); } From 2ad1147b9b285d190aef33a1a346a8f0ca2c8d64 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Wed, 10 Sep 2025 15:38:15 +0800 Subject: [PATCH 132/254] Improve debug util; Eliminate nop ReshapeReshape --- ggml/src/ggml-openvino/ggml-decoder.cpp | 27 +++++---- .../src/ggml-openvino/openvino/op/reshape.cpp | 7 ++- ggml/src/ggml-openvino/utils.cpp | 55 +++++++++++++++---- 3 files changed, 65 insertions(+), 24 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp 
b/ggml/src/ggml-openvino/ggml-decoder.cpp index d00b78e891..0dfc11e490 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -154,22 +154,22 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, bool naive) { // Add model outputs, if called for the whole graph if (naive) { - m_model_output_names.push_back(node->name); + m_model_output_names.push_back(node_name); } else if (!m_node) { + // Model outputs are tensors with GGML_TENSOR_FLAG_OUTPUT flag and kv_caches static std::set debug_output_names = {}; // Workaround: the final tensor "result_output" does not have GGML_TENSOR_FLAG_OUTPUT flag set in cgraph - if (node->buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY || node->flags & GGML_TENSOR_FLAG_OUTPUT || - std::string(node->name).find("result") == 0 || debug_output_names.count(node->name)) { - auto name = node->view_src ? std::string(node->view_src->name) : std::string(node->name); - if (node->buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY) { - assert(name.find("cache_k") == 0 || name.find("cache_v") == 0); + if (node->op == GGML_OP_SET_ROWS || node->flags & GGML_TENSOR_FLAG_OUTPUT || node_name.find("result") == 0 || + debug_output_names.count(node_name)) { + if (node->op == GGML_OP_SET_ROWS) { + assert(node_name.find("cache_k") == 0 || node_name.find("cache_v") == 0); + if (auto it = std::find(m_kv_names.begin(), m_kv_names.end(), node_name); it == m_kv_names.end()) { + m_kv_names.push_back(node_name); + } } - if (auto it = std::find(m_model_output_names.begin(), m_model_output_names.end(), name); + if (auto it = std::find(m_model_output_names.begin(), m_model_output_names.end(), node_name); it == m_model_output_names.end()) { - m_model_output_names.push_back(name); - } - if (auto it = std::find(m_kv_names.begin(), m_kv_names.end(), name); it == m_kv_names.end()) { - m_kv_names.push_back(name); + m_model_output_names.push_back(node_name); } } } @@ -177,7 +177,10 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, bool naive) { if (m_node) { switch (node->op) { case GGML_OP_RESHAPE: { - if (node->ne[0] * node->ne[1] == node->src[0]->ne[0]) { + if (node->src[0]->op == GGML_OP_RESHAPE && node->src[0]->src[0]->ne[0] == node->ne[0] && + node->src[0]->src[0]->ne[1] == node->ne[1]) { + m_op_case = 4; + } else if (node->ne[0] * node->ne[1] == node->src[0]->ne[0]) { m_op_case = 1; } else if (node->src[0]->ne[0] * node->src[0]->ne[1] == node->ne[0]) { m_op_case = 2; diff --git a/ggml/src/ggml-openvino/openvino/op/reshape.cpp b/ggml/src/ggml-openvino/openvino/op/reshape.cpp index 4ef3833c90..1ed6f4b880 100644 --- a/ggml/src/ggml-openvino/openvino/op/reshape.cpp +++ b/ggml/src/ggml-openvino/openvino/op/reshape.cpp @@ -23,7 +23,8 @@ OutputVector translate_reshape(const NodeContext& context) { } int op_case = context.get_op_case(); - FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2 || op_case == 3, "Unsupported RESHAPE case"); + FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2 || op_case == 3 || op_case == 4, + "Unsupported RESHAPE case"); auto output_shape = context.get_output_shape(0).to_shape(); std::shared_ptr new_shape_node; @@ -37,9 +38,11 @@ OutputVector translate_reshape(const NodeContext& context) { ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector{(int64_t)output_shape[0], -1, (int64_t)output_shape[2]}); - } else { + } else if (op_case == 3) { new_shape_node = ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector{(int64_t) output_shape[0], -1, 1}); + } else if (op_case == 4) { + return 
{context.get_input(0).get_node_shared_ptr()->input_value(0)}; } auto res = std::make_shared(context.get_input(0), new_shape_node, false); return rename_outputs_with_suffix({res}, context.get_name()); diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 3f728c242d..588404df19 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -6,6 +6,8 @@ #include #include #include +#include +#include #include #include #include @@ -418,17 +420,50 @@ void print_output_tensor_info(const std::string& name, const ov::Tensor& tensor, std::map& output_dst) { std::cout << "Output name: " << name << ", Output shape: " << tensor.get_shape() << ", Address: " << output_dst[name] << std::endl; + + auto print_float_stats = [](const std::string& type_name, size_t size, auto get_value) { + if (size == 0) { + return; + } + + float first = get_value(0); + float min = first; + float max = first; + double sum = first; + + for (size_t i = 1; i < size; ++i) { + float v = get_value(i); + if (v < min) { + min = v; + } + if (v > max) { + max = v; + } + sum += v; + } + double mean = sum / size; + + std::cout << std::right << std::setw(6) << type_name << std::right << std::setw(12) << "First" << std::setw(12) + << "Min" << std::setw(12) << "Max" << std::setw(12) << "Mean" << std::endl; + std::cout << std::right << std::setw(6) << "" << std::right << std::setw(12) << first << std::setw(12) << min + << std::setw(12) << max << std::setw(12) << mean << std::endl; + }; + switch (tensor.get_element_type()) { - case ov::element::f32: - std::cout << *(tensor.data()) << std::endl; - std::cout << checksum(tensor.data(), tensor.get_byte_size()) << std::endl; - break; - case ov::element::f16: - std::cout << *(tensor.data()) << std::endl; - std::cout << checksum(tensor.data(), tensor.get_byte_size()) << std::endl; - break; - default: - break; + case ov::element::f32: { + const float* data = tensor.data(); + size_t size = tensor.get_size(); + print_float_stats("[f32]", size, [data](size_t i) { return data[i]; }); + break; + } + case ov::element::f16: { + const ov::float16* data = tensor.data(); + size_t size = tensor.get_size(); + print_float_stats("[f16]", size, [data](size_t i) { return static_cast(data[i]); }); + break; + } + default: + break; } } From dc77cbb3f68a951df3b4cbdc5d38090d65dd2aaf Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Wed, 10 Sep 2025 16:54:57 +0800 Subject: [PATCH 133/254] STYLE: make get_types_to_requant a function --- ggml/src/ggml-openvino/utils.cpp | 33 +++++++++++++++++--------------- ggml/src/ggml-openvino/utils.h | 2 ++ 2 files changed, 20 insertions(+), 15 deletions(-) diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 588404df19..2438f2dd11 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -132,21 +132,7 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c compile_end_time = conversion_end_time; } else { std::shared_ptr model; - std::map types_to_requantize; - if (is_static) { - types_to_requantize = { - {GGML_TYPE_Q4_0, ExtraQuantType::Q4_0_128}, - {GGML_TYPE_Q4_1, ExtraQuantType::Q4_0_128}, - {GGML_TYPE_Q4_K, ExtraQuantType::Q4_0_128}, - {GGML_TYPE_Q6_K, ExtraQuantType::Q8_1_C }, - }; - } else if (device == "GPU") { - types_to_requantize = { - // CVS-166739 - {GGML_TYPE_Q6_K, ExtraQuantType::Q8_1_C}, - }; - } - auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph, types_to_requantize); + auto model_weights = 
GgmlOvDecoder::create_weight_nodes(cgraph, get_types_to_requant(device)); if (is_static) { ggml_decoder = std::make_shared(cgraph, model_weights, is_static, true); @@ -275,6 +261,23 @@ ov::AnyMap get_npu_prefill_config() { return config; } +std::map get_types_to_requant(const std::string& device) { + if (device == "NPU") { + return { + {GGML_TYPE_Q4_0, ExtraQuantType::Q4_0_128}, + {GGML_TYPE_Q4_1, ExtraQuantType::Q4_0_128}, + {GGML_TYPE_Q4_K, ExtraQuantType::Q4_0_128}, + {GGML_TYPE_Q6_K, ExtraQuantType::Q8_1_C }, + }; + } + if (device == "GPU") { + return { + // CVS-166739 + {GGML_TYPE_Q6_K, ExtraQuantType::Q8_1_C}, + }; + } +} + ov::AnyMap get_npu_generate_config() { ov::AnyMap config = get_npu_prefill_config(); config.emplace("NPUW_UNFOLD_IREQS", "YES"); diff --git a/ggml/src/ggml-openvino/utils.h b/ggml/src/ggml-openvino/utils.h index f377fe9d27..42686c593b 100644 --- a/ggml/src/ggml-openvino/utils.h +++ b/ggml/src/ggml-openvino/utils.h @@ -43,6 +43,8 @@ bool is_prefill(struct ggml_cgraph * cgraph); ov::AnyMap get_npu_prefill_config(); ov::AnyMap get_npu_generate_config(); +std::map get_types_to_requant(const std::string& device); + ov::Tensor get_ov_input_tensor(std::shared_ptr ggml_decoder, const std::string& param_name); bool is_naive(struct ggml_cgraph* cgraph); From bcc343af00d4e6e5f23a0327109e11da5def7745 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Thu, 11 Sep 2025 14:34:17 +0800 Subject: [PATCH 134/254] Support BF16 model --- ggml/src/ggml-openvino/ggml-decoder.cpp | 10 ++++++++-- ggml/src/ggml-openvino/utils.cpp | 1 + 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 0dfc11e490..0bdb9aa897 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -419,8 +419,14 @@ std::map> GgmlOvDecoder::create_weight_no std::shared_ptr GgmlOvDecoder::create_weight_node(ggml_tensor* tensor, std::optional requant_type) { - std::set weight_types = { - GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1, GGML_TYPE_Q4_K, GGML_TYPE_Q6_K}; + std::set weight_types = {GGML_TYPE_F32, + GGML_TYPE_F16, + GGML_TYPE_BF16, + GGML_TYPE_Q8_0, + GGML_TYPE_Q4_0, + GGML_TYPE_Q4_1, + GGML_TYPE_Q4_K, + GGML_TYPE_Q6_K}; if (weight_types.find(tensor->type) == weight_types.end()) { throw std::runtime_error("Unexpected weight tensor type: " + std::string(tensor->name) + " with type " + ggml_type_name(tensor->type)); diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 2438f2dd11..cf0a02c3ad 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -276,6 +276,7 @@ std::map get_types_to_requant(const std::string& devi {GGML_TYPE_Q6_K, ExtraQuantType::Q8_1_C}, }; } + return {}; } ov::AnyMap get_npu_generate_config() { From 434059aef7ac9330506f622d413f1b2e0a96adde Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Fri, 12 Sep 2025 11:42:02 +0800 Subject: [PATCH 135/254] Fix NPU compile --- ggml/src/ggml-openvino/utils.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index cf0a02c3ad..c03ec1acbc 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -251,7 +251,6 @@ ov::AnyMap get_npu_prefill_config() { {"NPUW_DEVICES", "NPU" }, {"NPUW_FOLD", "YES" }, {"NPUW_WEIGHTS_BANK", "shared" }, - {"NPUW_SLICE_OUT", "YES" }, {"NPUW_FUNCALL_ASYNC", "YES" }, {"NPUW_FUNCALL_FOR_ALL", "YES" }, 
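 // (Annotation, not part of the original config: the NPUW_* keys configure
 // OpenVINO's NPUW wrapper for the NPU. As an informal reading only --
 // consult the OpenVINO NPUW documentation for authoritative semantics --
 // FOLD together with FUNCALL_FOR_ALL folds the repeated transformer blocks
 // into a single reusable function, DQ enables dynamic weight dequantization
 // inside those functions, and CACHE_DIR, wired below to the
 // GGML_OPENVINO_CACHE_DIR environment variable, caches compiled blobs.
 // A hedged usage sketch: core.compile_model(model, "NPU", get_npu_prefill_config());)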
{"NPUW_DQ", "YES" }, From da2cc993bcce01dc509df511392065554667e7f6 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Fri, 12 Sep 2025 16:32:41 +0800 Subject: [PATCH 136/254] WA for npu 1st token acc issue --- ggml/src/ggml-openvino/utils.cpp | 32 ++++++++++++++++++++------------ 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index c03ec1acbc..7b696769fb 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -218,7 +218,7 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c auto gguf_tensor_addrs = get_ggml_graph_output_dst(ggml_decoder); for (size_t i = 0; i < ov_output_names.size(); i++) { - auto result_name = ov_output_names[i]; + auto& result_name = ov_output_names[i]; const auto output_tensor = infer_request.get_output_tensor(i); std::memcpy(gguf_tensor_addrs[result_name], output_tensor.data(), output_tensor.get_byte_size()); @@ -243,20 +243,34 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c GGML_UNUSED(backend); } -ov::AnyMap get_npu_prefill_config() { - ov::AnyMap config = { +namespace { +ov::AnyMap get_npu_base_config() { + return { {"NPU_COMPILATION_MODE_PARAMS", "compute-layers-with-higher-precision=Sqrt,Power,ReduceMean,Add_RMSNorm" }, {"NPU_COMPILER_DYNAMIC_QUANTIZATION", "YES" }, {"NPU_USE_NPUW", "YES" }, {"NPUW_DEVICES", "NPU" }, {"NPUW_FOLD", "YES" }, {"NPUW_WEIGHTS_BANK", "shared" }, - {"NPUW_FUNCALL_ASYNC", "YES" }, {"NPUW_FUNCALL_FOR_ALL", "YES" }, {"NPUW_DQ", "YES" }, {"NPUW_DQ_FULL", "NO" }, {"NPUW_CACHE_DIR", getenv("GGML_OPENVINO_CACHE_DIR") ? getenv("GGML_OPENVINO_CACHE_DIR") : ""}, }; +} +} // namespace + +ov::AnyMap get_npu_prefill_config() { + auto config = get_npu_base_config(); + config.emplace("NPUW_FUNCALL_ASYNC", "NO"); + config.emplace("NPUW_ACC_CHECK", "YES"); + config.emplace("NPUW_ACC_DEVICE", "CPU"); + return config; +} + +ov::AnyMap get_npu_generate_config() { + auto config = get_npu_base_config(); + config.emplace("NPUW_FUNCALL_ASYNC", "YES"); return config; } @@ -266,7 +280,7 @@ std::map get_types_to_requant(const std::string& devi {GGML_TYPE_Q4_0, ExtraQuantType::Q4_0_128}, {GGML_TYPE_Q4_1, ExtraQuantType::Q4_0_128}, {GGML_TYPE_Q4_K, ExtraQuantType::Q4_0_128}, - {GGML_TYPE_Q6_K, ExtraQuantType::Q8_1_C }, + {GGML_TYPE_Q6_K, ExtraQuantType::F16 }, }; } if (device == "GPU") { @@ -278,12 +292,6 @@ std::map get_types_to_requant(const std::string& devi return {}; } -ov::AnyMap get_npu_generate_config() { - ov::AnyMap config = get_npu_prefill_config(); - config.emplace("NPUW_UNFOLD_IREQS", "YES"); - return config; -} - bool is_naive(struct ggml_cgraph* cgraph) { constexpr int naive_graph_size_threshold = 20; return cgraph->n_nodes < naive_graph_size_threshold; @@ -373,7 +381,7 @@ ov::Tensor get_ov_input_tensor(std::shared_ptr ggml_decoder, cons } else if (const auto* op = ggml_decoder->get_tensor_used_op(ggml_decoder->get_tensor_from_name(param_name)); op && op->op == GGML_OP_SET_ROWS && is_static && is_first_token) { - input_tensor = ov::Tensor(ov::element::i64, ov::Shape{1}); + input_tensor = ov::Tensor(ov::element::i64, ov::Shape{1, 1, 1}); } else { input_tensor = convert_ggml_input_to_ov(ggml_decoder, param_name); } From be07073e0e10b23a1d1825c59860e846fe7d2293 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Fri, 12 Sep 2025 16:51:46 +0800 Subject: [PATCH 137/254] Apply EliminateZP only for npu --- ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.cpp | 1 + 
ggml/src/ggml-openvino/openvino/translate_session.cpp | 6 +++--- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.cpp b/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.cpp index c36579910d..f38c0837d1 100644 --- a/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.cpp +++ b/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.cpp @@ -19,6 +19,7 @@ namespace ggml { namespace pass { FuseToSDPA::FuseToSDPA() { + // Not maintained since FLASH_ATTN_EXT has replaced this pattern const auto m_k = ov::pass::pattern::any_input(); const auto m_q = ov::pass::pattern::any_input(); const auto m_qk = ov::pass::pattern::wrap_type({m_q, m_k}); diff --git a/ggml/src/ggml-openvino/openvino/translate_session.cpp b/ggml/src/ggml-openvino/openvino/translate_session.cpp index 634fea40e9..3b8c30361a 100644 --- a/ggml/src/ggml-openvino/openvino/translate_session.cpp +++ b/ggml/src/ggml-openvino/openvino/translate_session.cpp @@ -27,7 +27,6 @@ #include "ggml-openvino/openvino/utils.hpp" #include "input_model.hpp" #include "pass/eliminate_zp.hpp" -#include "pass/fuse_to_sdpa.hpp" #include "pass/mark_decompression_convert_constant_folding.hpp" namespace ov { @@ -220,8 +219,9 @@ std::shared_ptr TranslateSession::apply_transformations(std::shared_ptr(kv_param_res_pairs); } - manager.register_pass(); - manager.register_pass(); + if (ggml_model_decoder->is_static()) { + manager.register_pass(); + } manager.run_passes(model); } return model; From 597561242f54da7913509004f059b085f08618a5 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Mon, 15 Sep 2025 11:13:59 +0800 Subject: [PATCH 138/254] Add GeGLU --- ggml/src/ggml-openvino/ggml-openvino.cpp | 37 ++++++++++---- .../ggml-openvino/openvino/op/glu_geglu.cpp | 50 +++++++++++++++++++ .../ggml-openvino/openvino/op/glu_swiglu.cpp | 7 +++ ggml/src/ggml-openvino/openvino/op_table.cpp | 1 + ggml/src/ggml-openvino/openvino/op_table.hpp | 1 + 5 files changed, 85 insertions(+), 11 deletions(-) create mode 100644 ggml/src/ggml-openvino/openvino/op/glu_geglu.cpp diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp index 60a2eb388e..6da653716f 100644 --- a/ggml/src/ggml-openvino/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino.cpp @@ -249,17 +249,30 @@ static bool is_op_unsupported_case(const ggml_tensor* op) { const auto* op_params = op->op_params; memcpy(&scale, (const float*) op_params + 0, sizeof(float)); memcpy(&max_bias, (const float*) op_params + 1, sizeof(float)); - const uint32_t h = op->src[0]->ne[2]; - const uint32_t n_head = op->src[0]->ne[0]; - const uint32_t n_head_log2 = 1u << (uint32_t) floor(log2(n_head)); + if (max_bias > 0) { + GGML_LOG_WARN("OpenVINO backend does not support SOFT_MAX with max_bias > 0\n"); + return true; + } + } - const float m0 = powf(2.0f, -(max_bias) / n_head_log2); - const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2); - const float slope = - (max_bias > 0.0f) ? h < n_head_log2 ? 
powf(m0, h + 1) : powf(m1, 2 * (h - n_head_log2) + 1) : 1.0f; - - if (slope != 1.0f) { - GGML_LOG_WARN("OpenVINO backend does not support SOFT_MAX with slope != 1.0f\n"); + if (op->op == GGML_OP_FLASH_ATTN_EXT) { + if (op->src[4] != nullptr) { + GGML_LOG_WARN("OpenVINO backend does not support FLASH_ATTN_EXT with sinks\n"); + return true; + } + float scale = 1.0f; + float max_bias = 0.0f; + float logit_softcap = 0.0f; + const auto* op_params = op->op_params; + memcpy(&scale, (const float*) op_params + 0, sizeof(float)); + memcpy(&max_bias, (const float*) op_params + 1, sizeof(float)); + memcpy(&logit_softcap, (const float*) op_params + 2, sizeof(float)); + if (max_bias > 0) { + GGML_LOG_WARN("OpenVINO backend does not support FLASH_ATTN_EXT with max_bias > 0\n"); + return true; + } + if (logit_softcap != 0) { + GGML_LOG_WARN("OpenVINO backend does not support FLASH_ATTN_EXT with logit_softcap != 0\n"); return true; } } @@ -357,7 +370,8 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con GGML_OP_ROPE, GGML_OP_RMS_NORM, GGML_OP_SCALE, - GGML_OP_SOFT_MAX, + // softmax is not updated due to replaced by flash_attn_ext + // GGML_OP_SOFT_MAX, GGML_OP_SET_ROWS, GGML_OP_FLASH_ATTN_EXT, GGML_OP_CPY}; @@ -366,6 +380,7 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con }; static const std::set supported_glu_ops{ GGML_GLU_OP_SWIGLU, + GGML_GLU_OP_GEGLU, }; switch (op->op) { diff --git a/ggml/src/ggml-openvino/openvino/op/glu_geglu.cpp b/ggml/src/ggml-openvino/openvino/op/glu_geglu.cpp new file mode 100644 index 0000000000..4295bf7517 --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/op/glu_geglu.cpp @@ -0,0 +1,50 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../node_context.hpp" +#include "../op_table.hpp" +#include "../utils.hpp" + +namespace ov { +namespace frontend { +namespace ggml { +namespace op { + +OutputVector translate_glu_geglu(const NodeContext& context) { + num_inputs_check(context, 1, 2); + + ov::Output src0; + ov::Output src1; + if (context.get_input_size() == 2) { + src0 = context.get_input(0); + src1 = context.get_input(1); + } else { + auto combined = context.get_input(0); + auto split_axis = ov::op::v0::Constant::create(ov::element::i64, {}, {2}); + auto split = std::make_shared(combined, split_axis, 2); + src0 = split->output(0); + src1 = split->output(1); + } + + int32_t* params = context.get_output_op_params(0); + const int32_t swapped = params[1]; + if (swapped) { + std::swap(src0, src1); + } + + auto gelu = std::make_shared(src0); + auto res = std::make_shared(gelu, src1); + + return rename_outputs_with_suffix({res}, context.get_name()); +} + +} // namespace op +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp b/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp index 138ef65090..bef42fe4b7 100644 --- a/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp +++ b/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp @@ -31,6 +31,13 @@ OutputVector translate_glu_swiglu(const NodeContext& context) { src0 = split->output(0); src1 = split->output(1); } + + int32_t* params = context.get_output_op_params(0); + const int32_t swapped = params[1]; + if (swapped) { + std::swap(src0, src1); + } + auto sigmoid = std::make_shared(src0); auto silu = std::make_shared(src0, sigmoid); auto res = std::make_shared(silu, src1); diff --git a/ggml/src/ggml-openvino/openvino/op_table.cpp 
b/ggml/src/ggml-openvino/openvino/op_table.cpp index ee55f84b96..e36e8f17cc 100644 --- a/ggml/src/ggml-openvino/openvino/op_table.cpp +++ b/ggml/src/ggml-openvino/openvino/op_table.cpp @@ -34,6 +34,7 @@ std::unordered_map get_supported_ops() { {"GGML_UNARY_OP_SILU", op::translate_unary_silu }, {"GGML_OP_VIEW", op::translate_view }, {"GGML_GLU_OP_SWIGLU", op::translate_glu_swiglu }, + {"GGML_GLU_OP_GEGLU", op::translate_glu_geglu }, {"GGML_OP_SET_ROWS", op::translate_set_rows }, {"GGML_OP_CPY", op::translate_cpy }, {"GGML_OP_FLASH_ATTN_EXT", op::translate_flash_attn_ext }, diff --git a/ggml/src/ggml-openvino/openvino/op_table.hpp b/ggml/src/ggml-openvino/openvino/op_table.hpp index faa61f5f6c..5d4f053860 100644 --- a/ggml/src/ggml-openvino/openvino/op_table.hpp +++ b/ggml/src/ggml-openvino/openvino/op_table.hpp @@ -25,6 +25,7 @@ GGML_OP_CONVERTER(translate_soft_max); GGML_OP_CONVERTER(translate_transpose); GGML_OP_CONVERTER(translate_view); GGML_OP_CONVERTER(translate_glu_swiglu); +GGML_OP_CONVERTER(translate_glu_geglu); GGML_OP_CONVERTER(translate_set_rows); GGML_OP_CONVERTER(translate_cpy); GGML_OP_CONVERTER(translate_flash_attn_ext); From 7d81861a18bfc05b2c43c1c62513e54a4cf57001 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Mon, 15 Sep 2025 15:56:03 +0800 Subject: [PATCH 139/254] Fix Hunyuan --- ggml/src/ggml-openvino/ggml-decoder.cpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 0bdb9aa897..bc528e0cfb 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -242,14 +242,17 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, bool naive) { void GgmlOvDecoder::set_llm_params() { for (int i = 0; i < m_cgraph->n_nodes; i++) { auto* node = m_cgraph->nodes[i]; + std::string name = std::string(node->name); if (node->op == GGML_OP_VIEW && std::string(node->name) == "cache_k_l0 (view)") { auto* cache_k = node->src[0]; m_context_size = cache_k->ne[1]; - } else if (node->op == GGML_OP_ROPE && std::string(node->name) == "Qcur-0") { + } else if (node->op == GGML_OP_ROPE && + (name.find("Qcur-0") == 0 || std::string(node->src[0]->name).find("Qcur-0") == 0)) { m_head_size = node->ne[0]; m_num_heads = node->ne[1]; m_rope_params = node->op_params; - } else if (node->op == GGML_OP_ROPE && std::string(node->name) == "Kcur-0") { + } else if (node->op == GGML_OP_ROPE && + (name.find("Kcur-0") == 0 || std::string(node->src[0]->name).find("Kcur-0") == 0)) { m_num_heads_kv = node->ne[1]; } } From 9de874cb7b936333da396fd73b8b49c7109d48fb Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Tue, 16 Sep 2025 16:30:45 +0800 Subject: [PATCH 140/254] Support iSWA --- ggml/src/ggml-openvino/ggml-decoder.cpp | 103 ++++++++++++------ ggml/src/ggml-openvino/ggml-decoder.h | 13 ++- ggml/src/ggml-openvino/openvino/decoder.hpp | 2 + .../ggml-openvino/openvino/node_context.hpp | 13 +-- .../openvino/op/flash_attn_ext.cpp | 9 +- .../src/ggml-openvino/openvino/op/permute.cpp | 38 ++----- .../openvino/translate_session.cpp | 21 +++- ggml/src/ggml-openvino/utils.cpp | 2 +- src/llama-graph.cpp | 4 +- 9 files changed, 124 insertions(+), 81 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index bc528e0cfb..e3dd5e0c1d 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -30,17 +30,21 @@ #include #include #include +#include #include "ggml-backend-impl.h" #include 
"ggml-backend.h" #include "ggml-quants.hpp" GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgraph, bool is_static, bool is_first_token, - int context_size, int num_heads, int num_heads_kv, int head_size) : + int context_size, int context_size_swa, int num_heads, int num_heads_kv, int head_size, + const std::vector& swa_layers) : m_cgraph(cgraph), m_node(node), m_op_name(std::string(node->name)), m_context_size(context_size), + m_context_size_swa(context_size_swa), + m_swa_layers(swa_layers), m_num_heads(num_heads), m_num_heads_kv(num_heads_kv), m_head_size(head_size), @@ -204,11 +208,14 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, bool naive) { if (node->src[0]->op != GGML_OP_VIEW) { m_op_case = 1; } else if (ggml_is_contiguous(node->src[0])) { - // Permute cache_k (view) - m_op_case = 2; - } else { - // Permute cache_v (view), deprecated, cache_v will also fall to case 2 - m_op_case = 3; + // Permute kv cache (view) + std::string src_name(node->view_src->name); + int layer = extract_layer_from_name(src_name); + if (!is_swa_layer(layer)) { + m_op_case = 2; + } else { + m_op_case = 3; + } } break; } @@ -239,13 +246,34 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, bool naive) { } } +int extract_layer_from_name(const std::string& name) { + size_t pos1 = name.find("_l"); + assert(pos1 != std::string::npos); + pos1 += 2; + size_t pos2 = name.find(' ', pos1); + if (pos2 == std::string::npos) { + pos2 = name.length(); + } + std::string layer_str = name.substr(pos1, pos2 - pos1); + int layer = std::stoi(layer_str); + return layer; +} + void GgmlOvDecoder::set_llm_params() { for (int i = 0; i < m_cgraph->n_nodes; i++) { auto* node = m_cgraph->nodes[i]; std::string name = std::string(node->name); - if (node->op == GGML_OP_VIEW && std::string(node->name) == "cache_k_l0 (view)") { - auto* cache_k = node->src[0]; - m_context_size = cache_k->ne[1]; + if (node->op == GGML_OP_FLASH_ATTN_EXT) { + auto* cache_k = node->src[1]; + cache_k = cache_k->view_src ? 
cache_k->view_src : cache_k; + int layer = extract_layer_from_name(cache_k->name); + + if (std::string(node->src[3]->name).find("swa") != std::string::npos) { + m_swa_layers.push_back(layer); + m_context_size_swa = cache_k->ne[1]; + } else { + m_context_size = cache_k->ne[1]; + } } else if (node->op == GGML_OP_ROPE && (name.find("Qcur-0") == 0 || std::string(node->src[0]->name).find("Qcur-0") == 0)) { m_head_size = node->ne[0]; @@ -269,11 +297,11 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor* src) co input_shape = ov::PartialShape{1, 1, 1}; } } else { - input_shape = ov::PartialShape{1, 1, ov::Dimension(1, m_context_size)}; + input_shape = ov::PartialShape{1, 1, -1}; } } else if (name == "inp_out_ids" && !m_is_static) { - input_shape = ov::PartialShape{1, 1, ov::Dimension(1, m_context_size)}; - } else if (name == "KQ_mask") { + input_shape = ov::PartialShape{1, 1, -1}; + } else if (name.find("KQ_mask") == 0) { if (m_is_static) { if (m_is_first_token) { input_shape = ov::PartialShape{1, m_context_size, m_context_size}; @@ -281,13 +309,12 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor* src) co input_shape = ov::PartialShape{1, 1, m_context_size}; } } else { - auto max_mask_size = GGML_PAD(m_context_size, GGML_KQ_MASK_PAD); - input_shape = ov::PartialShape{1, ov::Dimension(1, max_mask_size), ov::Dimension(1, max_mask_size)}; + input_shape = ov::PartialShape{1, -1, -1}; } - } else if (name.find("cache_k") == 0) { - input_shape = ov::PartialShape{m_context_size, m_num_heads_kv, m_head_size}; - } else if (name.find("cache_v") == 0) { - input_shape = ov::PartialShape{m_context_size, m_num_heads_kv, m_head_size}; + } else if (name.find("cache_") == 0) { + int layer = extract_layer_from_name(name); + bool is_swa = is_swa_layer(layer); + input_shape = ov::PartialShape{is_swa ? m_context_size_swa : m_context_size, m_num_heads_kv, m_head_size}; } else if (const auto* op = get_tensor_used_op(src); op && op->op == GGML_OP_SET_ROWS) { input_shape = ov::PartialShape{1, 1, m_is_static ? 1 : -1}; } else if (src->op == GGML_OP_VIEW) { @@ -305,35 +332,35 @@ void GgmlOvDecoder::add_extra_inputs() { // see llama_kv_cache_unified::get_n_kv and llama_kv_cache_unified::get_padding. 
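 // (Annotation, not in the original source: "attention_size" is the valid
 // KV-cache length for the current ubatch, read below from the attention
 // mask's ne[0]; the converted graph uses it to slice the kv cache to
 // [0, attention_size) at run time -- see the Slice with end = attention_size
 // in the PERMUTE translation -- instead of baking in the full context
 // length. "attention_size_swa" carries the same quantity for
 // sliding-window-attention layers.)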
// Not used for NPU int64_t attention_size = -1; + int64_t attention_size_swa = -1; for (const auto& node : m_nodes) { - if (node->op == GGML_OP_SOFT_MAX) { - auto* mask = node->src[1]; - if (std::string(mask->name).find("KQ_mask") != 0) { - throw std::runtime_error("Unexpected softmax node: " + std::string(mask->name)); - } - attention_size = mask->ne[0]; - break; - } if (node->op == GGML_OP_FLASH_ATTN_EXT) { auto* mask = node->src[3]; - if (std::string(mask->name).find("KQ_mask") != 0) { + std::string mask_name(mask->name); + if (mask_name.find("KQ_mask") != 0) { throw std::runtime_error("Unexpected flash attention node: " + std::string(mask->name)); } - attention_size = mask->ne[0]; + if (mask_name.find("swa") != std::string::npos) { + attention_size_swa = mask->ne[0]; + } else { + attention_size = mask->ne[0]; + } } } - { - std::string name = "attention_size"; + auto create_attention_size_input = [this](const std::string& name, int64_t size) { auto param_node = std::make_shared(ov::element::i64, ov::Shape{1}); param_node->set_friendly_name(name); param_node->output(0).get_tensor().set_names({name}); m_model_extra_inputs[name] = param_node; auto tensor = std::make_shared(ov::element::i64, ov::Shape{1}); - *tensor->data() = attention_size; + *tensor->data() = size; m_model_extra_input_values[name] = tensor; - } + }; + + create_attention_size_input("attention_size", attention_size); + create_attention_size_input("attention_size_swa", attention_size_swa); } const ggml_tensor* GgmlOvDecoder::get_tensor_used_op(const ggml_tensor* tensor) const { @@ -706,8 +733,16 @@ int32_t* GgmlOvDecoder::get_output_op_params(const std::string& name) const { void GgmlOvDecoder::visit_subgraph(std::function)> node_visitor) const { for (const auto& node : m_nodes) { - auto decoder = std::make_shared( - node, m_cgraph, m_is_static, m_is_first_token, m_context_size, m_num_heads, m_num_heads_kv, m_head_size); + auto decoder = std::make_shared(node, + m_cgraph, + m_is_static, + m_is_first_token, + m_context_size, + m_context_size_swa, + m_num_heads, + m_num_heads_kv, + m_head_size, + m_swa_layers); node_visitor(decoder); } } diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index 4ba147da20..35e79ecefc 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -19,7 +19,8 @@ public: // Node decoder, called in GgmlOvDecoder::visit_subgraph GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgraph, bool is_static, bool is_first_token, - int context_size, int num_heads, int num_heads_kv, int head_size); + int context_size, int context_size_swa, int num_heads, int num_heads_kv, int head_size, + const std::vector& swa_layers); // Naive graph decoder GgmlOvDecoder(struct ggml_cgraph* cgraph, std::map>& model_weights); @@ -101,6 +102,12 @@ public: virtual int get_context_size() const override { return m_context_size; } + virtual int get_context_size_swa() const override { return m_context_size_swa; } + + virtual int is_swa_layer(int layer) const override { + return std::find(m_swa_layers.begin(), m_swa_layers.end(), layer) != m_swa_layers.end(); + } + virtual int get_num_heads() const override { return m_num_heads; } virtual int get_num_heads_kv() const override { return m_num_heads_kv; } @@ -156,6 +163,8 @@ private: std::map> m_model_weights; std::vector m_model_output_names; int m_context_size; + int m_context_size_swa; + std::vector m_swa_layers; int m_num_heads; int m_num_heads_kv; int m_head_size; @@ -166,3 +175,5 @@ private: 
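 // (Annotation, not in the original source: m_context_size_swa and
 // m_swa_layers are the iSWA additions above -- they record the shorter
 // sliding-window context length and the layer indices that use it, and
 // is_swa_layer() consults them so get_graph_input_shape() can size each
 // layer's kv cache with m_context_size or m_context_size_swa.)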
}; void print_tensor_address_map(const struct ggml_cgraph* cgraph); + +int extract_layer_from_name(const std::string& name); diff --git a/ggml/src/ggml-openvino/openvino/decoder.hpp b/ggml/src/ggml-openvino/openvino/decoder.hpp index a3387ba394..6f11ff1283 100644 --- a/ggml/src/ggml-openvino/openvino/decoder.hpp +++ b/ggml/src/ggml-openvino/openvino/decoder.hpp @@ -67,6 +67,8 @@ public: virtual bool is_static() const = 0; virtual bool is_first_token() const = 0; virtual int get_context_size() const = 0; + virtual int get_context_size_swa() const = 0; + virtual int is_swa_layer(int layer) const = 0; }; } // namespace ggml diff --git a/ggml/src/ggml-openvino/openvino/node_context.hpp b/ggml/src/ggml-openvino/openvino/node_context.hpp index cc1b5c0332..a64ae098ab 100644 --- a/ggml/src/ggml-openvino/openvino/node_context.hpp +++ b/ggml/src/ggml-openvino/openvino/node_context.hpp @@ -2,6 +2,7 @@ #include #include +#include #include "decoder.hpp" @@ -30,6 +31,8 @@ public: return m_translate_session; } + const std::vector& get_input_names() const { return m_input_names; } + size_t get_input_size() const override { return m_decoder->get_input_size(); } @@ -101,15 +104,7 @@ public: return m_decoder->is_first_token(); } - int get_num_heads() const { return m_decoder->get_num_heads(); } - - int get_num_heads_kv() const { return m_decoder->get_num_heads_kv(); } - - int get_head_size() const { return m_decoder->get_head_size(); } - - int get_context_size() const { return m_decoder->get_context_size(); } - - private: +private: std::shared_ptr m_decoder; std::shared_ptr& m_tensor_map; TranslateSession* m_translate_session; diff --git a/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp b/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp index d97603d98a..8b67778fb9 100644 --- a/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp +++ b/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp @@ -6,6 +6,7 @@ #include #include #include +#include #include "../node_context.hpp" #include "../op_table.hpp" @@ -32,8 +33,12 @@ OutputVector translate_flash_attn_ext(const NodeContext& context) { auto scale_node = std::make_shared(ov::element::f16, ov::Shape{}, std::vector{scale}); ov::Output mask_sliced; - if (context.has_input("KQ_mask_sliced")) { - mask_sliced = context.get_input("KQ_mask_sliced"); + std::string mask_name = "KQ_mask_sliced"; + if (context.get_input_names()[3].find("swa") != std::string::npos) { + mask_name = "KQ_mask_swa_sliced"; + } + if (context.has_input(mask_name)) { + mask_sliced = context.get_input(mask_name); } else { auto token_len = get_dimensions(q, {1}); auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); diff --git a/ggml/src/ggml-openvino/openvino/op/permute.cpp b/ggml/src/ggml-openvino/openvino/op/permute.cpp index fcb091016a..086b1e4cdb 100644 --- a/ggml/src/ggml-openvino/openvino/op/permute.cpp +++ b/ggml/src/ggml-openvino/openvino/op/permute.cpp @@ -29,43 +29,29 @@ OutputVector translate_permute(const NodeContext& context) { ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 0, 2})); } else { auto src = context.get_input(0); - auto attention_size = context.get_input("attention_size"); + Output attention_size; if (context.is_static()) { attention_size = ov::op::v0::Constant::create(ov::element::i64, {1}, {INT_MAX}); + } else if (op_case == 2) { + attention_size = context.get_input("attention_size"); + } else { + attention_size = context.get_input("attention_size_swa"); } auto src_shape_ = context.get_input_shape(0).to_shape(); std::vector 
src_shape(src_shape_.begin(), src_shape_.end()); - std::shared_ptr src_reshaped; - if (op_case == 2) { - src_reshaped = std::make_shared( - src, - ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector{-1, src_shape[1], src_shape[2]}), - false); - } else { - src_reshaped = std::make_shared( - src, - ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector{src_shape[1], src_shape[0], -1}), - false); - } + auto src_reshaped = std::make_shared( + src, + ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector{-1, src_shape[1], src_shape[2]}), + false); auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); - auto two = ov::op::v0::Constant::create(ov::element::i64, {1}, {2}); - std::shared_ptr slice_axis; - if (op_case == 2) { - slice_axis = zero; - } else { - slice_axis = two; - } - auto src_slice = std::make_shared(src_reshaped, zero, attention_size, one, slice_axis); + auto src_slice = std::make_shared(src_reshaped, zero, attention_size, one, zero); - if (op_case == 2) { - res = std::make_shared(src_slice, ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 0, 2})); - } else { - res = src_slice; - } + res = std::make_shared(src_slice, + ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 0, 2})); } return rename_outputs_with_suffix({res}, context.get_name()); } diff --git a/ggml/src/ggml-openvino/openvino/translate_session.cpp b/ggml/src/ggml-openvino/openvino/translate_session.cpp index 3b8c30361a..9c82fe5f85 100644 --- a/ggml/src/ggml-openvino/openvino/translate_session.cpp +++ b/ggml/src/ggml-openvino/openvino/translate_session.cpp @@ -78,13 +78,22 @@ void add_token_len(TensorMap& tensor_map) { } void add_sliced_mask(TensorMap& tensor_map) { - auto mask = tensor_map.at("KQ_mask").get_node_shared_ptr(); auto token_len = tensor_map.at("token_len").get_node_shared_ptr(); - auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); - auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); - std::shared_ptr mask_sliced = std::make_shared(mask, zero, token_len, one, one); - mask_sliced->set_friendly_name("KQ_mask_sliced"); - tensor_map.insert({"KQ_mask_sliced", mask_sliced->output(0)}); + + auto create_sliced_mask = [&](const std::string& mask_name, const std::string& sliced_name) { + if (tensor_map.find(mask_name) != tensor_map.end()) { + auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); + auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); + auto mask = tensor_map.at(mask_name).get_node_shared_ptr(); + std::shared_ptr mask_sliced = + std::make_shared(mask, zero, token_len, one, one); + mask_sliced->set_friendly_name(sliced_name); + tensor_map.insert({sliced_name, mask_sliced->output(0)}); + } + }; + + create_sliced_mask("KQ_mask", "KQ_mask_sliced"); + create_sliced_mask("KQ_mask_swa", "KQ_mask_swa_sliced"); } void add_rope_sin_cos(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) { diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 7b696769fb..8724404098 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -362,7 +362,7 @@ ov::Tensor get_ov_input_tensor(std::shared_ptr ggml_decoder, cons input_tensor = convert_ggml_input_to_ov(ggml_decoder, param_name); } - } else if (param_name == "KQ_mask") { + } else if (param_name.find("KQ_mask") == 0) { size_t context_size = ggml_decoder->get_context_size(); const auto* input_tensor_ggml = 
ggml_decoder->get_input_ggml_tensor(param_name); if (is_first_token) { diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index e9fbff5995..257d86cd3e 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -1605,7 +1605,7 @@ llm_graph_input_attn_no_cache * llm_graph_context::build_attn_inp_no_cache() con // note: there is no KV cache, so the number of KV values is equal to the number of tokens in the batch inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens, 1, 1); - cb(inp->self_kq_mask, "KQ_mask", -1); + cb(inp->self_kq_mask, "self_kq_mask", -1); ggml_set_input(inp->self_kq_mask); inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask; @@ -1694,7 +1694,7 @@ static std::unique_ptr build_attn_inp_kv_impl( inp->self_v_idxs = mctx_cur->build_input_v_idxs(ctx0, ubatch); inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream); - ggml_set_name(inp->self_kq_mask, "KQ_mask"); + ggml_set_name(inp->self_kq_mask, "self_kq_mask"); ggml_set_input(inp->self_kq_mask); inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask; From 602f9ca4afa1fa0402f0cb2a263ab202deb488ce Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Wed, 17 Sep 2025 11:16:14 +0800 Subject: [PATCH 141/254] Fix NPU accuracy --- .../openvino/translate_session.cpp | 25 +++++++++++-------- ggml/src/ggml-openvino/utils.cpp | 5 +--- 2 files changed, 16 insertions(+), 14 deletions(-) diff --git a/ggml/src/ggml-openvino/openvino/translate_session.cpp b/ggml/src/ggml-openvino/openvino/translate_session.cpp index 9c82fe5f85..c37aa21602 100644 --- a/ggml/src/ggml-openvino/openvino/translate_session.cpp +++ b/ggml/src/ggml-openvino/openvino/translate_session.cpp @@ -77,23 +77,28 @@ void add_token_len(TensorMap& tensor_map) { tensor_map.insert({"token_len", token_len->output(0)}); } -void add_sliced_mask(TensorMap& tensor_map) { +void add_sliced_mask(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) { auto token_len = tensor_map.at("token_len").get_node_shared_ptr(); - auto create_sliced_mask = [&](const std::string& mask_name, const std::string& sliced_name) { + auto create_sliced_mask = [&](const std::string& mask_name, const std::string& sliced_name, bool is_static) { if (tensor_map.find(mask_name) != tensor_map.end()) { - auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); - auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); auto mask = tensor_map.at(mask_name).get_node_shared_ptr(); - std::shared_ptr mask_sliced = - std::make_shared(mask, zero, token_len, one, one); - mask_sliced->set_friendly_name(sliced_name); + std::shared_ptr mask_sliced; + if (is_static) { + mask_sliced = mask; + } else { + auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); + auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); + mask_sliced = std::make_shared(mask, zero, token_len, one, one); + mask_sliced = std::make_shared(mask_sliced, ov::element::f16); + mask_sliced->set_friendly_name(sliced_name); + } tensor_map.insert({sliced_name, mask_sliced->output(0)}); } }; - create_sliced_mask("KQ_mask", "KQ_mask_sliced"); - create_sliced_mask("KQ_mask_swa", "KQ_mask_swa_sliced"); + create_sliced_mask("KQ_mask", "KQ_mask_sliced", ggml_model_decoder.is_static()); + create_sliced_mask("KQ_mask_swa", "KQ_mask_swa_sliced", ggml_model_decoder.is_static()); } void add_rope_sin_cos(TensorMap& tensor_map, 
GgmlDecoder& ggml_model_decoder) { @@ -117,7 +122,7 @@ void add_rope_sin_cos(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) { // Create common patterns void preprocess(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) { add_token_len(tensor_map); - add_sliced_mask(tensor_map); + add_sliced_mask(tensor_map, ggml_model_decoder); add_rope_sin_cos(tensor_map, ggml_model_decoder); } diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 8724404098..db47163645 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -253,6 +253,7 @@ ov::AnyMap get_npu_base_config() { {"NPUW_FOLD", "YES" }, {"NPUW_WEIGHTS_BANK", "shared" }, {"NPUW_FUNCALL_FOR_ALL", "YES" }, + {"NPUW_FUNCALL_ASYNC", "YES" }, {"NPUW_DQ", "YES" }, {"NPUW_DQ_FULL", "NO" }, {"NPUW_CACHE_DIR", getenv("GGML_OPENVINO_CACHE_DIR") ? getenv("GGML_OPENVINO_CACHE_DIR") : ""}, @@ -262,15 +263,11 @@ ov::AnyMap get_npu_base_config() { ov::AnyMap get_npu_prefill_config() { auto config = get_npu_base_config(); - config.emplace("NPUW_FUNCALL_ASYNC", "NO"); - config.emplace("NPUW_ACC_CHECK", "YES"); - config.emplace("NPUW_ACC_DEVICE", "CPU"); return config; } ov::AnyMap get_npu_generate_config() { auto config = get_npu_base_config(); - config.emplace("NPUW_FUNCALL_ASYNC", "YES"); return config; } From 1a38339cea8b0488082f12483284e4ec2f44b448 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Wed, 17 Sep 2025 15:35:27 +0800 Subject: [PATCH 142/254] Fix ROPE accuracy when freq_scale != 1 --- ggml/src/ggml-openvino/ggml-openvino.cpp | 6 +----- ggml/src/ggml-openvino/openvino/utils.cpp | 2 +- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp index 6da653716f..683f768c5f 100644 --- a/ggml/src/ggml-openvino/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino.cpp @@ -319,12 +319,8 @@ static bool is_op_unsupported_case(const ggml_tensor* op) { return true; } float freq_scale; - memcpy(&freq_scale, op_params + 6, sizeof(float)); - if (freq_scale != 0.0f && freq_scale != 1.0f) { - GGML_LOG_WARN("OpenVINO backend does not support ROPE with freq_scale %f != 1.0f\n", freq_scale); - return true; - } float ext_factor; + memcpy(&freq_scale, op_params + 6, sizeof(float)); memcpy(&ext_factor, op_params + 7, sizeof(float)); if (ext_factor != 0.0f) { GGML_LOG_WARN("OpenVINO backend does not support ROPE with ext_factor %f != 0.0f\n", ext_factor); diff --git a/ggml/src/ggml-openvino/openvino/utils.cpp b/ggml/src/ggml-openvino/openvino/utils.cpp index ef5f51ebbc..f70cb91a17 100644 --- a/ggml/src/ggml-openvino/openvino/utils.cpp +++ b/ggml/src/ggml-openvino/openvino/utils.cpp @@ -140,7 +140,7 @@ std::pair, ov::Output> make_sin_cos(int32_t* rope_params, ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims); std::vector factor(n_dims / 2); - factor[0] = freq_scale; + factor[0] = 1.0f; for (size_t i = 1; i < factor.size(); i++) { factor[i] = theta_scale * factor[i - 1]; } From 67e178a2f63cd0aa59796abee293bf2f8dc6a653 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Wed, 17 Sep 2025 16:50:54 +0800 Subject: [PATCH 143/254] Minor: not add attention_size_swa for non-swa model --- ggml/src/ggml-openvino/ggml-decoder.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index e3dd5e0c1d..8286052f8b 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ 
b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -360,7 +360,9 @@ void GgmlOvDecoder::add_extra_inputs() { }; create_attention_size_input("attention_size", attention_size); - create_attention_size_input("attention_size_swa", attention_size_swa); + if (attention_size_swa != -1) { + create_attention_size_input("attention_size_swa", attention_size_swa); + } } const ggml_tensor* GgmlOvDecoder::get_tensor_used_op(const ggml_tensor* tensor) const { From 2f1d50fb07201890b19e7dbf0f48862d323f46f5 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Fri, 19 Sep 2025 16:50:27 +0800 Subject: [PATCH 144/254] Minor refactor --- ggml/src/ggml-openvino/ggml-decoder.cpp | 10 ---------- ggml/src/ggml-openvino/utils.cpp | 5 +++++ 2 files changed, 5 insertions(+), 10 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 8286052f8b..a5d9d6967f 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -65,11 +65,6 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_cgraph* cgraph, print_tensor_address_map(cgraph); } - if (getenv("GGML_OPENVINO_DUMP_CGRAPH")) { - std::string filename = "cgraph.txt"; - dump_cgraph(cgraph, filename); - } - set_llm_params(); for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) { @@ -83,11 +78,6 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_cgraph* cgraph, GgmlOvDecoder::GgmlOvDecoder(struct ggml_cgraph* cgraph, std::map>& model_weights) { - if (getenv("GGML_OPENVINO_DUMP_CGRAPH")) { - std::string filename = "cgraph.txt"; - dump_cgraph(cgraph, filename); - } - m_cgraph = cgraph; m_model_weights = model_weights; for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) { diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index db47163645..07cbb2e437 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -86,6 +86,11 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c }; } + if (getenv("GGML_OPENVINO_DUMP_CGRAPH")) { + std::string filename = "cgraph.txt"; + GgmlOvDecoder::dump_cgraph(cgraph, filename); + } + if (is_naive(cgraph)) { return naive_compute(cgraph, core, device, config); } From e4bfe5a20d21b98d58cfc8d3b8dffd97159385c7 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Tue, 23 Sep 2025 16:07:51 +0800 Subject: [PATCH 145/254] Add Q5_K to support phi-3-q4_k_m --- ggml/src/ggml-openvino/ggml-decoder.cpp | 8 +- ggml/src/ggml-openvino/ggml-openvino.cpp | 1 + ggml/src/ggml-openvino/ggml-quants.cpp | 143 ++++++++++++++++++----- ggml/src/ggml-openvino/ggml-quants.hpp | 5 + ggml/src/ggml-openvino/utils.cpp | 1 + 5 files changed, 124 insertions(+), 34 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index a5d9d6967f..38b0fa3db4 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -448,6 +448,7 @@ std::shared_ptr GgmlOvDecoder::create_weight_node(ggml_tensor* tensor, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1, GGML_TYPE_Q4_K, + GGML_TYPE_Q5_K, GGML_TYPE_Q6_K}; if (weight_types.find(tensor->type) == weight_types.end()) { throw std::runtime_error("Unexpected weight tensor type: " + std::string(tensor->name) + " with type " + @@ -486,12 +487,12 @@ std::shared_ptr GgmlOvDecoder::create_weight_node(ggml_tensor* tensor, ov::element::Type weight_type; if (tensor->type == GGML_TYPE_Q4_0 || tensor->type == GGML_TYPE_Q4_1 || tensor->type == GGML_TYPE_Q4_K) { weight_type = ov::element::u4; - } else { // tensor.type == 
GGUF_TYPE_Q8_0 || tensor.type == GGUF_TYPE_Q6_K + } else { // tensor.type == GGUF_TYPE_Q8_0 || tensor.type == GGUF_TYPE_Q6_K || tensor.type == GGUF_TYPE_Q5_K weight_type = ov::element::u8; } uint64_t weights_per_block; - // here we only consider sub block, q6k:16 q4k:32 + // here we only consider sub block, q6k:16 q4k:32 q5k:32 if (tensor->type == GGML_TYPE_Q6_K) { weights_per_block = 16; } else { @@ -526,6 +527,9 @@ std::shared_ptr GgmlOvDecoder::create_weight_node(ggml_tensor* tensor, } else if (tensor->type == GGML_TYPE_Q4_K) { extract_q4_k_data(tensor, weights, scales, biases); weight_node = make_int4_weights(weights, scales, biases, weights_per_block); + } else if (tensor->type == GGML_TYPE_Q5_K) { + extract_q5_k_data(tensor, weights, scales, biases); + weight_node = make_int8_weights(weights, scales, biases, weights_per_block); } OPENVINO_ASSERT(weight_node.get_shape().size() == 2, "Weight should be 2D"); diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp index 683f768c5f..648acb4e35 100644 --- a/ggml/src/ggml-openvino/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino.cpp @@ -350,6 +350,7 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con GGML_TYPE_Q4_0, GGML_TYPE_Q4_1, GGML_TYPE_Q4_K, + GGML_TYPE_Q5_K, GGML_TYPE_Q8_0, GGML_TYPE_Q6_K}; diff --git a/ggml/src/ggml-openvino/ggml-quants.cpp b/ggml/src/ggml-openvino/ggml-quants.cpp index 1603e65355..9b8bfff072 100644 --- a/ggml/src/ggml-openvino/ggml-quants.cpp +++ b/ggml/src/ggml-openvino/ggml-quants.cpp @@ -1,9 +1,17 @@ #include "ggml-quants.hpp" +#include +#include +#include +#include #include #include #include +#include +#include #include +#include +#include #include #include #include @@ -11,9 +19,12 @@ #include #include #include +#include #include #include +#include +#include "ggml-common.h" #include "ggml-impl.h" #include "ggml.h" @@ -38,10 +49,10 @@ void extract_q4_0_data(const ggml_tensor* tensor, ov::Tensor& scales_arr, ov::Tensor& biases_arr) { const uint64_t bytes_per_block = 18; // 2 bytes scale, 32x0.5 byte weights - auto data = static_cast(tensor->data); - auto weights = static_cast(weights_arr.data()); - auto scales = scales_arr.data::value_type>(); - auto biases = biases_arr.data::value_type>(); + auto* data = static_cast(tensor->data); + auto* weights = static_cast(weights_arr.data()); + auto* scales = scales_arr.data::value_type>(); + auto* biases = biases_arr.data::value_type>(); ov::parallel_for(scales_arr.get_size(), [&](size_t i) { scales[i] = ov::float16::from_bits(*((uint16_t*)(data + i * bytes_per_block))); @@ -57,10 +68,10 @@ void extract_q4_1_data(const ggml_tensor* tensor, ov::Tensor& scales_arr, ov::Tensor& biases_arr) { const uint64_t bytes_per_block = 20; // 2 bytes scale, 2 bytes bias, 32x0.5 byte weights - auto data = static_cast(tensor->data); - auto weights = static_cast(weights_arr.data()); - auto scales = scales_arr.data::value_type>(); - auto biases = biases_arr.data::value_type>(); + auto* data = static_cast(tensor->data); + auto* weights = static_cast(weights_arr.data()); + auto* scales = scales_arr.data::value_type>(); + auto* biases = biases_arr.data::value_type>(); ov::parallel_for(scales_arr.get_size(), [&](size_t i) { scales[i] = ov::float16::from_bits(*((uint16_t*)(data + i * bytes_per_block))); biases[i] = ov::float16::from_bits(*((uint16_t*)(data + i * bytes_per_block + 2))); @@ -76,22 +87,22 @@ void extract_q8_0_data(const ggml_tensor* tensor, ov::Tensor& biases_arr) { const uint64_t weights_per_block 
= 32; const uint64_t bytes_per_block = 34; // 2 bytes scale, 32x1 byte weights - auto data = static_cast(tensor->data); - auto weights = static_cast(weights_arr.data()); - auto scales = scales_arr.data::value_type>(); - auto biases = biases_arr.data::value_type>(); - for (size_t i = 0; i < scales_arr.get_size(); i++) { + auto* data = static_cast(tensor->data); + auto* weights = static_cast(weights_arr.data()); + auto* scales = scales_arr.data::value_type>(); + auto* biases = biases_arr.data::value_type>(); + + ov::parallel_for(scales_arr.get_size(), [&](size_t i) { uint8_t* block_data = data + i * bytes_per_block; - scales[i] = ov::float16::from_bits(*(uint16_t*)block_data); + scales[i] = ov::float16::from_bits(*(uint16_t*) block_data); biases[i] = ov::float16(-128.f * static_cast(scales[i])); for (size_t j = 0; j < weights_per_block; ++j) { uint8_t x = block_data[j + 2]; // j+2 to skip the scale bytes. - // Original data is in int8_t, so we add a bias of -128 and invert the - // first bit. + // Original data is in int8_t, so we add a bias of -128 and invert the first bit. x ^= 1 << 7; weights[i * weights_per_block + j] = x; } - } + }); } void unpack_256_4(const uint8_t* data, uint8_t* dst) { @@ -117,12 +128,11 @@ void extract_q4_k_data(const ggml_tensor* tensor, ov::Tensor& scales_arr, ov::Tensor& biases_arr) { const uint64_t bytes_per_block = 2 + 2 + 12 + 128; - // TODO tensor->nb[3] const uint64_t n_super_block = tensor->nb[3] / bytes_per_block; - auto data = static_cast(tensor->data); - auto weights = static_cast(weights_arr.data()); - auto scales = scales_arr.data::value_type>(); - auto biases = biases_arr.data::value_type>(); + auto* data = static_cast(tensor->data); + auto* weights = static_cast(weights_arr.data()); + auto* scales = scales_arr.data::value_type>(); + auto* biases = biases_arr.data::value_type>(); ov::parallel_for(n_super_block, [&](size_t i) { uint8_t* block_data = data + i * bytes_per_block; @@ -170,28 +180,26 @@ void extract_q6_k_data(const ggml_tensor* tensor, ov::Tensor& biases_arr) { const uint64_t bytes_per_block = 128 + 64 + 16 + 2; const uint64_t n_super_block = tensor->nb[3] / bytes_per_block; - auto data = static_cast(tensor->data); - auto weights = static_cast(weights_arr.data()); - auto scales = scales_arr.data::value_type>(); - auto biases = biases_arr.data::value_type>(); - // std::string name(tensor.name, tensor.namelen); - for (size_t i = 0; i < n_super_block; i++) { + auto* data = static_cast(tensor->data); + auto* weights = static_cast(weights_arr.data()); + auto* scales = scales_arr.data::value_type>(); + auto* biases = biases_arr.data::value_type>(); + + ov::parallel_for(n_super_block, [&](size_t i) { uint8_t* block_data = data + i * bytes_per_block; float scale_factor = - static_cast(ov::float16::from_bits(*((uint16_t*)block_data + 104))); // (128+64+16)/2 + static_cast(ov::float16::from_bits(*((uint16_t*) block_data + 104))); // (128+64+16)/2 for (size_t j = 0; j < 16; j++) { scales[j + i * 16] = - ov::float16(scale_factor * static_cast(*((int8_t*)(block_data + 128 + 64 + j)))); + ov::float16(scale_factor * static_cast(*((int8_t*) (block_data + 128 + 64 + j)))); biases[j + i * 16] = ov::float16(-32.f * static_cast(scales[j + i * 16])); } - // Extract ql and qh uint8_t* ql = block_data; uint8_t* qh = block_data + 128; - // Extract weights for (int64_t j = 0; j < 32; ++j) { weights[i * 256 + j] = (ql[j] & 0xF) | (((qh[j] >> 0) & 3) << 4); weights[i * 256 + j + 32] = (ql[32 + j] & 0xF) | (((qh[j] >> 2) & 3) << 4); @@ -202,9 +210,80 @@ void 
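// Layout recap for the Q6_K extraction above (values illustrative): each
// 256-weight super-block stores a weight's low 4 bits in ql[128] and its high
// 2 bits packed into qh[64], so w = (ql & 0xF) | (qh_bits << 4) lies in [0, 63]
// and the true value is w - 32; hence biases[j] = -32 * scales[j], giving
//   scales[j] * w + biases[j] = scales[j] * (w - 32)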
extract_q6_k_data(const ggml_tensor* tensor, weights[i * 256 + j + 192] = (ql[64 + j] >> 4) | (((qh[32 + j] >> 4) & 3) << 4); weights[i * 256 + j + 224] = (ql[96 + j] >> 4) | (((qh[32 + j] >> 6) & 3) << 4); } + }); +} + +static inline void get_scale_min_k4(int j, const uint8_t* q, uint8_t* d, uint8_t* m) { + if (j < 4) { + *d = q[j] & 63; + *m = q[j + 4] & 63; + } else { + *d = (q[j + 4] & 0xF) | ((q[j - 4] >> 6) << 4); + *m = (q[j + 4] >> 4) | ((q[j - 0] >> 6) << 4); } } +void extract_q5_k_data(const ggml_tensor* tensor, ov::Tensor& weights_arr, ov::Tensor& scales_arr, + ov::Tensor& biases_arr) { + const uint64_t bytes_per_block = 4 + 12 + 32 + 128; + const uint64_t n_super_block = tensor->nb[3] / bytes_per_block; + auto* data = static_cast(tensor->data); + auto* weights = static_cast(weights_arr.data()); + auto* scales = scales_arr.data::value_type>(); + auto* biases = biases_arr.data::value_type>(); + + ov::parallel_for(n_super_block, [&](size_t i) { + uint8_t* block_data = data + i * bytes_per_block; + + const float d = static_cast(ov::float16::from_bits(*((uint16_t*) block_data))); + const float min = static_cast(ov::float16::from_bits(*((uint16_t*) block_data + 1))); + + const uint8_t* scales_data = block_data + 4; // 12 bytes of scales + const uint8_t* qh = block_data + 4 + 12; // 32 bytes of high bits + const uint8_t* ql = block_data + 4 + 12 + 32; // 128 bytes of low bits + + int is = 0; + uint8_t u1 = 1; + uint8_t u2 = 2; + + // Process 2 blocks in one iteration + for (int j = 0; j < 256; j += 64) { // 256 = QK_K, so 4 iterations of 64 + uint8_t sc; + uint8_t m; + + // Get scale and min for first 32 elements + get_scale_min_k4(is + 0, scales_data, &sc, &m); + const float d1 = d * sc; + const float m1 = min * m; + + // Get scale and min for second 32 elements + get_scale_min_k4(is + 1, scales_data, &sc, &m); + const float d2 = d * sc; + const float m2 = min * m; + + scales[i * 8 + is] = ov::float16(d1); + biases[i * 8 + is] = ov::float16(-m1); + scales[i * 8 + is + 1] = ov::float16(d2); + biases[i * 8 + is + 1] = ov::float16(-m2); + + // Extract weights for first 32 elements (matching deq formula exactly) + for (int l = 0; l < 32; ++l) { + weights[i * 256 + j + l] = (ql[l] & 0xF) + ((qh[l] & u1) ? 16 : 0); + } + + // Extract weights for second 32 elements + for (int l = 0; l < 32; ++l) { + weights[i * 256 + j + l + 32] = (ql[l] >> 4) + ((qh[l] & u2) ? 
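// Worked example for the 5-bit reconstruction here (hypothetical numbers): a
// weight is w = (ql & 0xF) + 16 * qh_bit in [0, 31], and since the code stores
// scales[..] = d * sc and biases[..] = -(min * m), it reproduces ggml's
// dequantization y = d*sc*w - min*m, e.g.
//   d*sc = 2.0, min*m = 0.5, w = 17  =>  y = 2.0 * 17 - 0.5 = 33.5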
16 : 0); + } + + ql += 32; + is += 2; + u1 <<= 2; + u2 <<= 2; + } + }); +} + // TODO Reorder for make_intX_weights ov::Output make_int8_weights(ov::Tensor& weight, ov::Tensor& scales, ov::Tensor& biases, size_t group_size) { diff --git a/ggml/src/ggml-openvino/ggml-quants.hpp b/ggml/src/ggml-openvino/ggml-quants.hpp index fbae2aa1f4..5496785eb1 100644 --- a/ggml/src/ggml-openvino/ggml-quants.hpp +++ b/ggml/src/ggml-openvino/ggml-quants.hpp @@ -29,6 +29,11 @@ void extract_q4_k_data(const ggml_tensor* tensor, ov::Tensor& scales_arr, ov::Tensor& biases_arr); +void extract_q5_k_data(const ggml_tensor* tensor, + ov::Tensor& weights_arr, + ov::Tensor& scales_arr, + ov::Tensor& biases_arr); + void extract_q6_k_data(const ggml_tensor* tensor, ov::Tensor& weights_arr, ov::Tensor& scales_arr, diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 07cbb2e437..e9084cf387 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -283,6 +283,7 @@ std::map get_types_to_requant(const std::string& devi {GGML_TYPE_Q4_1, ExtraQuantType::Q4_0_128}, {GGML_TYPE_Q4_K, ExtraQuantType::Q4_0_128}, {GGML_TYPE_Q6_K, ExtraQuantType::F16 }, + {GGML_TYPE_Q5_K, ExtraQuantType::F16 }, }; } if (device == "GPU") { From f3afa7b91466fc288488744278f81dd642525716 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Fri, 26 Sep 2025 15:50:32 +0800 Subject: [PATCH 146/254] Requantize Q6_K (gs16) to gs32 on GPU --- ggml/src/ggml-openvino/ggml-quants.cpp | 43 +++++++++++++++++++++++--- ggml/src/ggml-openvino/ggml-quants.hpp | 4 ++- ggml/src/ggml-openvino/utils.cpp | 4 +-- 3 files changed, 43 insertions(+), 8 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-quants.cpp b/ggml/src/ggml-openvino/ggml-quants.cpp index 9b8bfff072..1538a8207c 100644 --- a/ggml/src/ggml-openvino/ggml-quants.cpp +++ b/ggml/src/ggml-openvino/ggml-quants.cpp @@ -425,6 +425,8 @@ std::shared_ptr requantize(const ggml_tensor* tensor, ExtraQuantType r int64_t block_size = node_shape[1]; if (requant_type == ExtraQuantType::Q4_0_128) { block_size = 128; + } else if (requant_type == ExtraQuantType::Q8_0_32) { + block_size = 32; } auto scales_shape = ov::Shape{node_shape[0], node_shape[1] / block_size}; @@ -432,7 +434,7 @@ std::shared_ptr requantize(const ggml_tensor* tensor, ExtraQuantType r ov::Tensor scales(ov::element::f16, scales_shape); ov::Tensor bias(ov::element::f16, scales_shape); - if (requant_type == ExtraQuantType::Q4_0_C) { + if (requant_type == ExtraQuantType::Q4_0_C || requant_type == ExtraQuantType::Q4_0_128) { weights = ov::Tensor(ov::element::u4, node_shape); quantize_q4_0(weights_f32.data(), weights, scales, bias, weights.get_size(), block_size); weight_node = make_int4_weights(weights, scales, bias, block_size).get_node_shared_ptr(); @@ -440,10 +442,10 @@ std::shared_ptr requantize(const ggml_tensor* tensor, ExtraQuantType r weights = ov::Tensor(ov::element::u8, node_shape); quantize_q8_1(weights_f32.data(), weights, scales, bias, weights.get_size(), block_size); weight_node = make_int8_weights(weights, scales, bias, block_size).get_node_shared_ptr(); - } else if (requant_type == ExtraQuantType::Q4_0_128) { - weights = ov::Tensor(ov::element::u4, node_shape); - quantize_q4_0(weights_f32.data(), weights, scales, bias, weights.get_size(), block_size); - weight_node = make_int4_weights(weights, scales, bias, block_size).get_node_shared_ptr(); + } else if (requant_type == ExtraQuantType::Q8_0_C || requant_type == ExtraQuantType::Q8_0_32) { + weights = ov::Tensor(ov::element::u8, 
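// Group-size bookkeeping in this requantization path (recap, sizes
// hypothetical): block_size defaults to a full row (node_shape[1]) for the
// *_C channel-wise variants and is overridden to 128 for Q4_0_128 and 32 for
// Q8_0_32, so a 4096x4096 weight requantized as Q8_0_32 carries a
// {4096, 4096/32} = {4096, 128} grid of f16 scale/bias pairs.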
node_shape); + quantize_q8_0(weights_f32.data(), weights, scales, bias, weights.get_size(), block_size); + weight_node = make_int8_weights(weights, scales, bias, block_size).get_node_shared_ptr(); } weight_node->set_friendly_name(tensor->name); @@ -485,6 +487,37 @@ void quantize_q4_0(const float* x, ov::Tensor& weights_arr, ov::Tensor& scales_a } } +void quantize_q8_0(const float* x, ov::Tensor& weights_arr, ov::Tensor& scales_arr, ov::Tensor& biases_arr, int64_t k, + int64_t qk) { + assert(k % qk == 0); + const int nb = k / qk; + + auto* weights = static_cast(weights_arr.data()); + auto* scales = scales_arr.data::value_type>(); + auto* biases = biases_arr.data::value_type>(); + for (int i = 0; i < nb; i++) { + float amax = 0.0f; // absolute max + + for (int j = 0; j < qk; j++) { + const float v = x[i * qk + j]; + if (amax < fabsf(v)) { + amax = fabsf(v); + } + } + + const float d = amax / 127.0f; + const float id = d ? 1.0f / d : 0.0f; + scales[i] = ov::float16(d); + biases[i] = ov::float16(-128.0f * d); + + for (int j = 0; j < qk; ++j) { + const float x0 = x[i * qk + j] * id; + const int8_t xi0 = roundf(x0); + weights[i * qk + j] = (uint8_t) (xi0 + 128); + } + } +} + void quantize_q8_1(const float* x, ov::Tensor& weights_arr, ov::Tensor& scales_arr, ov::Tensor& biases_arr, int64_t k, int64_t qk) { assert(k % qk == 0); diff --git a/ggml/src/ggml-openvino/ggml-quants.hpp b/ggml/src/ggml-openvino/ggml-quants.hpp index 5496785eb1..71ae317a39 100644 --- a/ggml/src/ggml-openvino/ggml-quants.hpp +++ b/ggml/src/ggml-openvino/ggml-quants.hpp @@ -51,7 +51,7 @@ ov::Output make_int4_weights(ov::Tensor& weight, ov::Tensor& biases, size_t group_size = GGML_QUANTIZATION_GROUP_SIZE); -enum class ExtraQuantType { F16, Q4_0_C, Q8_1_C, Q4_0_128 }; +enum class ExtraQuantType { F16, Q4_0_C, Q8_1_C, Q4_0_128, Q8_0_C, Q8_0_32 }; std::shared_ptr requantize(const ggml_tensor* tensor, ExtraQuantType requant_type); @@ -59,6 +59,8 @@ void quantize_q4_0(const float* x, ov::Tensor& weights_arr, ov::Tensor& scales_a int64_t qk); void quantize_q8_1(const float* x, ov::Tensor& weights_arr, ov::Tensor& scales_arr, ov::Tensor& biases_arr, int64_t k, int64_t qk); +void quantize_q8_0(const float* x, ov::Tensor& weights_arr, ov::Tensor& scales_arr, ov::Tensor& biases_arr, int64_t k, + int64_t qk); namespace ov { namespace op { diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index e9084cf387..0ec815f07f 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -288,8 +288,8 @@ std::map get_types_to_requant(const std::string& devi } if (device == "GPU") { return { - // CVS-166739 - {GGML_TYPE_Q6_K, ExtraQuantType::Q8_1_C}, + // gs16 is WIP + {GGML_TYPE_Q6_K, ExtraQuantType::Q8_0_32}, }; } return {}; From fdadca1e89a3200cbd98e6ba0d17fe5cb6d7c0c6 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Sun, 28 Sep 2025 11:24:13 +0800 Subject: [PATCH 147/254] Fix after rebasing --- ggml/src/ggml-openvino/ggml-decoder.cpp | 24 +++++++++++++++---- .../ggml-openvino/openvino/op/set_rows.cpp | 4 +++- ggml/src/ggml-openvino/openvino/op/view.cpp | 4 ++++ 3 files changed, 26 insertions(+), 6 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 38b0fa3db4..751fa192a4 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -198,13 +198,17 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, bool naive) { if (node->src[0]->op != GGML_OP_VIEW) { m_op_case = 1; } else if 
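// Round-trip check for the symmetric quantize_q8_0 defined above (numbers
// illustrative): with d = amax / 127 and the stored byte u8 = round(x/d) + 128,
// dequantization recovers
//   d * u8 + (-128 * d) = d * round(x/d) ~= x
// e.g. amax = 2.0 => d ~= 0.01575, so x = 1.0 is stored as u8 = 192 and comes
// back as d * 64 ~= 1.008.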
(ggml_is_contiguous(node->src[0])) { - // Permute kv cache (view) std::string src_name(node->view_src->name); - int layer = extract_layer_from_name(src_name); - if (!is_swa_layer(layer)) { - m_op_case = 2; + if (src_name.find("cache") == std::string::npos) { + m_op_case = 1; } else { - m_op_case = 3; + // Permute kv cache (view) + int layer = extract_layer_from_name(src_name); + if (!is_swa_layer(layer)) { + m_op_case = 2; + } else { + m_op_case = 3; + } } } break; @@ -230,6 +234,16 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, bool naive) { } break; } + case GGML_OP_VIEW: { + if (node->src[0]->op == GGML_OP_VIEW) { + auto* src = node->src[0]; + auto* view_src = src->view_src; + if (view_src->ne[1] != src->ne[2]) { + throw std::runtime_error("Unsupported VIEW case"); + } + m_op_case = 2; + } + } default: break; } diff --git a/ggml/src/ggml-openvino/openvino/op/set_rows.cpp b/ggml/src/ggml-openvino/openvino/op/set_rows.cpp index 0d94a95e44..50817c8323 100644 --- a/ggml/src/ggml-openvino/openvino/op/set_rows.cpp +++ b/ggml/src/ggml-openvino/openvino/op/set_rows.cpp @@ -45,7 +45,9 @@ OutputVector translate_set_rows(const NodeContext& context) { false); auto indices_reshaped = std::make_shared(indices, ov::op::v0::Constant::create(ov::element::i64, {2}, {0, 1})); - auto data_reshaped = std::make_shared(data, zero); + auto data_reshaped = std::make_shared( + data, ov::op::v0::Constant::create(ov::element::i64, {2}, {(int64_t) -1, (int64_t) dst_shape[2]}), false); + auto updated = std::make_shared(dst_reshaped, indices_reshaped, data_reshaped, zero); auto res = std::make_shared(updated, std::make_shared(dst), false); return rename_outputs_with_suffix({res}, context.get_name()); diff --git a/ggml/src/ggml-openvino/openvino/op/view.cpp b/ggml/src/ggml-openvino/openvino/op/view.cpp index 58143e667c..034b6df119 100644 --- a/ggml/src/ggml-openvino/openvino/op/view.cpp +++ b/ggml/src/ggml-openvino/openvino/op/view.cpp @@ -9,6 +9,10 @@ namespace op { OutputVector translate_view(const NodeContext& context) { num_inputs_check(context, 1, 1); + if (context.get_op_case() == 2) { + auto dst_shape = context.get_output_shape(0).to_shape(); + return rename_outputs_with_suffix({process_view_input(context, 0, dst_shape[1] * dst_shape[2])}, context.get_name()); + } return {context.get_input(0)}; } From 973a80fd02987882548cce1cdae147fb882b2624 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Sun, 28 Sep 2025 22:21:23 +0800 Subject: [PATCH 148/254] Always apply Eliminate_ZP to fix GPU compile issue on some platforms --- ggml/src/ggml-openvino/openvino/pass/eliminate_zp.cpp | 1 + ggml/src/ggml-openvino/openvino/translate_session.cpp | 6 +++--- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.cpp b/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.cpp index d2e5a040dd..4759e86e1e 100644 --- a/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.cpp +++ b/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.cpp @@ -80,6 +80,7 @@ EliminateZeroPoints::EliminateZeroPoints() { std::shared_ptr new_constant; + // TODO improve performance if (data_type == ov::element::u4) { auto data_values = data_constant->cast_vector(); std::vector adjusted_values(total_elements); diff --git a/ggml/src/ggml-openvino/openvino/translate_session.cpp b/ggml/src/ggml-openvino/openvino/translate_session.cpp index c37aa21602..9443819682 100644 --- a/ggml/src/ggml-openvino/openvino/translate_session.cpp +++ b/ggml/src/ggml-openvino/openvino/translate_session.cpp @@ -233,9 +233,9 
@@ std::shared_ptr TranslateSession::apply_transformations(std::shared_ptr(kv_param_res_pairs); } - if (ggml_model_decoder->is_static()) { - manager.register_pass(); - } + // if (ggml_model_decoder->is_static()) { + manager.register_pass(); + // } manager.run_passes(model); } return model; From c112bc4e738940ce46c7852147542a79c46c06ce Mon Sep 17 00:00:00 2001 From: cavusmustafa Date: Wed, 1 Oct 2025 14:02:11 -0700 Subject: [PATCH 149/254] kvcachefusion support --- ggml/src/ggml-openvino/ggml-decoder.cpp | 10 ++- .../openvino/op/flash_attn_ext.cpp | 64 +++++++++++++------ ggml/src/ggml-openvino/openvino/op/mulmat.cpp | 20 ++++-- .../src/ggml-openvino/openvino/op/permute.cpp | 34 ++++++---- ggml/src/ggml-openvino/openvino/op/rope.cpp | 3 + .../ggml-openvino/openvino/op/set_rows.cpp | 34 +++++++--- .../src/ggml-openvino/openvino/op/softmax.cpp | 19 +++++- .../openvino/translate_session.cpp | 16 ++++- 8 files changed, 145 insertions(+), 55 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 751fa192a4..0000319f63 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -316,9 +316,13 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor* src) co input_shape = ov::PartialShape{1, -1, -1}; } } else if (name.find("cache_") == 0) { - int layer = extract_layer_from_name(name); - bool is_swa = is_swa_layer(layer); - input_shape = ov::PartialShape{is_swa ? m_context_size_swa : m_context_size, m_num_heads_kv, m_head_size}; + if (m_is_static) { + int layer = extract_layer_from_name(name); + bool is_swa = is_swa_layer(layer); + input_shape = ov::PartialShape{is_swa ? m_context_size_swa : m_context_size, m_num_heads_kv, m_head_size}; + } else { + input_shape = ov::PartialShape{1, -1, m_num_heads_kv, m_head_size}; + } } else if (const auto* op = get_tensor_used_op(src); op && op->op == GGML_OP_SET_ROWS) { input_shape = ov::PartialShape{1, 1, m_is_static ? 
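// Shape summary for the kv-cache inputs changed above (head counts are
// hypothetical): with 8 KV heads and head size 128, the static (NPU) path
// keeps the fixed 3D layout {ctx, 8, 128}, while the dynamic path now exposes
// the cache as 4D {1, -1, 8, 128} so the sequence dimension can grow step by
// step instead of being padded to the full context.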
1 : -1}; } else if (src->op == GGML_OP_VIEW) { diff --git a/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp b/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp index 8b67778fb9..36d0f8844a 100644 --- a/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp +++ b/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp @@ -2,6 +2,7 @@ #include #include #include +#include #include #include #include @@ -32,7 +33,7 @@ OutputVector translate_flash_attn_ext(const NodeContext& context) { auto q = std::make_shared(q_f32, ov::element::f16); auto scale_node = std::make_shared(ov::element::f16, ov::Shape{}, std::vector{scale}); - ov::Output mask_sliced; + ov::Output mask_sliced, res; std::string mask_name = "KQ_mask_sliced"; if (context.get_input_names()[3].find("swa") != std::string::npos) { mask_name = "KQ_mask_swa_sliced"; @@ -40,33 +41,55 @@ OutputVector translate_flash_attn_ext(const NodeContext& context) { if (context.has_input(mask_name)) { mask_sliced = context.get_input(mask_name); } else { - auto token_len = get_dimensions(q, {1}); - auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); - auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); - mask_sliced = std::make_shared(mask, zero, token_len, one, one); + auto token_len = get_dimensions(q, {2}); + auto zero_2d = ov::op::v0::Constant::create(ov::element::i64, {2}, {0,0}); + auto one_2d = ov::op::v0::Constant::create(ov::element::i64, {2}, {1,1}); + auto zero_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); + auto two_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {2}); + auto axes = ov::op::v0::Constant::create(ov::element::i64, {2}, {1,2}); + auto leaf_8 = context.get_input("leaf_8"); + auto shape_of_leaf_8 = std::make_shared(leaf_8); + auto gather_leaf_8 = std::make_shared(shape_of_leaf_8, two_1d, zero_1d); + auto stop = std::make_shared(ov::OutputVector{token_len, gather_leaf_8}, 0); + mask_sliced = + std::make_shared(mask, zero_2d, stop, one_2d, axes); + mask_sliced = std::make_shared(mask_sliced, zero_1d); } if (mask_sliced.get_element_type() != ov::element::f16) { mask_sliced = std::make_shared(mask_sliced, ov::element::f16); } - auto tile_kv = [](int64_t q_batch, int64_t kv_batch, ov::Output kv) { + auto tile_kv = [](int64_t q_batch, int64_t kv_batch, ov::Output kv, bool is_static) { int64_t factor = q_batch / kv_batch; if (factor > 1) { auto q_batch_node = ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector{q_batch}); auto kv_batch_node = ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector{kv_batch}); auto factor_node = ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector{factor}); - auto unsqueeze_axes = ov::op::v0::Constant::create(ov::element::i64, Shape{}, {1}); - auto kv_unsqueezed = std::make_shared(kv, unsqueeze_axes); + ov::Output kv_broadcast_shape, kv_unsqueezed, new_kv_shape; + if (is_static) { + auto unsqueeze_axes = ov::op::v0::Constant::create(ov::element::i64, Shape{}, {1}); + kv_unsqueezed = std::make_shared(kv, unsqueeze_axes); + + auto kv_last_two_dims = get_dimensions(kv.get_node_shared_ptr(), {1, 2}); + kv_broadcast_shape = + std::make_shared(ov::OutputVector{kv_batch_node, factor_node, kv_last_two_dims}, 0); + new_kv_shape = + std::make_shared(ov::OutputVector{q_batch_node, kv_last_two_dims}, 0); + } else { + auto one_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); + auto unsqueeze_axes = ov::op::v0::Constant::create(ov::element::i64, Shape{}, {2}); + kv_unsqueezed = std::make_shared(kv, 
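// Shape walk-through of this dynamic tile_kv branch (hypothetical GQA sizes:
// 32 query heads over 8 KV heads => factor 4): kv [1, 8, S, D] is unsqueezed
// at axis 2 to [1, 8, 1, S, D], broadcast to [1, 8, 4, S, D], then reshaped to
// [1, 32, S, D], repeating each KV head once per query head in its group.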
unsqueeze_axes); + + auto kv_last_two_dims = get_dimensions(kv.get_node_shared_ptr(), {2, 3}); + kv_broadcast_shape = + std::make_shared(ov::OutputVector{one_1d, kv_batch_node, factor_node, kv_last_two_dims}, 0); + new_kv_shape = + std::make_shared(ov::OutputVector{one_1d, q_batch_node, kv_last_two_dims}, 0); + } - auto kv_last_two_dims = get_dimensions(kv.get_node_shared_ptr(), {1, 2}); - auto kv_broadcast_shape = - std::make_shared(ov::OutputVector{kv_batch_node, factor_node, kv_last_two_dims}, 0); kv = std::make_shared(kv_unsqueezed, kv_broadcast_shape); - - auto new_kv_shape = - std::make_shared(ov::OutputVector{q_batch_node, kv_last_two_dims}, 0); kv = std::make_shared(kv, new_kv_shape, false); } return kv; @@ -74,13 +97,18 @@ OutputVector translate_flash_attn_ext(const NodeContext& context) { auto q_shape = context.get_input_shape(0).to_shape(); auto k_shape = context.get_input_shape(1).to_shape(); - k = tile_kv(q_shape[0], k_shape[0], k); - v = tile_kv(q_shape[0], k_shape[0], v); + k = tile_kv(q_shape[0], k_shape[0], k, context.is_static()); + v = tile_kv(q_shape[0], k_shape[0], v, context.is_static()); auto sdpa = std::make_shared(q, k, v, mask_sliced, scale_node, false); auto sdpa_f32 = std::make_shared(sdpa, ov::element::f32); - auto res = std::make_shared(sdpa_f32, - ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 0, 2})); + if (context.is_static()) { + res = std::make_shared(sdpa_f32, + ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 0, 2})); + } else { + res = std::make_shared(sdpa_f32, + ov::op::v0::Constant::create(ov::element::i64, {4}, {0, 2, 1, 3})); + } return rename_outputs_with_suffix({res}, context.get_name()); } diff --git a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp index b4103378eb..3a1ca34166 100644 --- a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp +++ b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp @@ -59,13 +59,23 @@ OutputVector translate_mulmat(const NodeContext& context) { auto Z_last_two_dims = get_dimensions(Z.get_node_shared_ptr(), {1, 2}); - auto unsqueeze_axes = ov::op::v0::Constant::create(ov::element::i64, Shape{}, {1}); - auto Z_unsqueezed = std::make_shared(Z, unsqueeze_axes); - Output batch_small = A_batch_larger ? B_batch_node : A_batch_node; Output batch_large = A_batch_larger ? 
A_batch_node : B_batch_node; - auto broadcast_shape = - std::make_shared(ov::OutputVector{batch_small, factor_node, Z_last_two_dims}, 0); + + ov::Output broadcast_shape; + ov::Output Z_unsqueezed; + if (context.is_static()) { + auto unsqueeze_axes = ov::op::v0::Constant::create(ov::element::i64, Shape{}, {1}); + Z_unsqueezed = std::make_shared(Z, unsqueeze_axes); + broadcast_shape = + std::make_shared(ov::OutputVector{batch_small, factor_node, Z_last_two_dims}, 0); + } else { + auto unsqueeze_axes = ov::op::v0::Constant::create(ov::element::i64, Shape{}, {2}); + Z_unsqueezed = std::make_shared(Z, unsqueeze_axes); + auto one_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); + broadcast_shape = + std::make_shared(ov::OutputVector{one_1d, batch_small, factor_node, Z_last_two_dims}, 0); + } auto Z_broadcasted = std::make_shared(Z_unsqueezed, broadcast_shape); auto new_Z_shape = std::make_shared(ov::OutputVector{batch_large, Z_last_two_dims}, 0); diff --git a/ggml/src/ggml-openvino/openvino/op/permute.cpp b/ggml/src/ggml-openvino/openvino/op/permute.cpp index 086b1e4cdb..cd0d073ab3 100644 --- a/ggml/src/ggml-openvino/openvino/op/permute.cpp +++ b/ggml/src/ggml-openvino/openvino/op/permute.cpp @@ -25,8 +25,13 @@ OutputVector translate_permute(const NodeContext& context) { ov::Output res; if (op_case == 1) { - res = std::make_shared(context.get_input(0), - ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 0, 2})); + if (context.is_static()) { + res = std::make_shared(context.get_input(0), + ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 0, 2})); + } else { + res = std::make_shared(context.get_input(0), + ov::op::v0::Constant::create(ov::element::i64, {4}, {0, 2, 1, 3})); + } } else { auto src = context.get_input(0); Output attention_size; @@ -38,20 +43,23 @@ OutputVector translate_permute(const NodeContext& context) { attention_size = context.get_input("attention_size_swa"); } - auto src_shape_ = context.get_input_shape(0).to_shape(); - std::vector src_shape(src_shape_.begin(), src_shape_.end()); - - auto src_reshaped = std::make_shared( - src, - ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector{-1, src_shape[1], src_shape[2]}), - false); - auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); - auto src_slice = std::make_shared(src_reshaped, zero, attention_size, one, zero); - res = std::make_shared(src_slice, - ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 0, 2})); + if (context.is_static()) { + auto src_shape_ = context.get_input_shape(0).to_shape(); + std::vector src_shape(src_shape_.begin(), src_shape_.end()); + auto src_reshaped = std::make_shared( + src, + ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector{-1, src_shape[1], src_shape[2]}), + false); + auto src_slice = std::make_shared(src_reshaped, zero, attention_size, one, zero); + res = std::make_shared(src_slice, + ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 0, 2})); + } else { + res = std::make_shared(src, + ov::op::v0::Constant::create(ov::element::i64, {4}, {0, 2, 1, 3})); + } } return rename_outputs_with_suffix({res}, context.get_name()); } diff --git a/ggml/src/ggml-openvino/openvino/op/rope.cpp b/ggml/src/ggml-openvino/openvino/op/rope.cpp index 4b1e3b500c..484730d289 100644 --- a/ggml/src/ggml-openvino/openvino/op/rope.cpp +++ b/ggml/src/ggml-openvino/openvino/op/rope.cpp @@ -84,6 +84,9 @@ OutputVector translate_rope(const NodeContext& context) { 
ov::op::v0::Constant::create(ov::element::i64, {1}, {3})); auto stack = std::make_shared(OutputVector{first_half, second_half}, 3); res = std::make_shared(stack, std::make_shared(data_node), false); + if (!(context.is_static())) { + res = std::make_shared(res, ov::op::v0::Constant::create(ov::element::i64, {1}, {0})); + } } else if (mode == ROPE_TYPE_NEOX) { auto data_split = std::make_shared( data_node, ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {2}), 2); diff --git a/ggml/src/ggml-openvino/openvino/op/set_rows.cpp b/ggml/src/ggml-openvino/openvino/op/set_rows.cpp index 50817c8323..a3285d41ce 100644 --- a/ggml/src/ggml-openvino/openvino/op/set_rows.cpp +++ b/ggml/src/ggml-openvino/openvino/op/set_rows.cpp @@ -3,10 +3,12 @@ #include #include #include +#include #include #include #include #include +#include #include #include #include @@ -39,17 +41,29 @@ OutputVector translate_set_rows(const NodeContext& context) { auto dst = context.get_input(context.get_output_name()); auto zero = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {0}); - auto dst_reshaped = std::make_shared( - dst, - ov::op::v0::Constant::create(ov::element::i64, {2}, {(int64_t) dst_shape[1], (int64_t) dst_shape[2]}), - false); - auto indices_reshaped = - std::make_shared(indices, ov::op::v0::Constant::create(ov::element::i64, {2}, {0, 1})); - auto data_reshaped = std::make_shared( - data, ov::op::v0::Constant::create(ov::element::i64, {2}, {(int64_t) -1, (int64_t) dst_shape[2]}), false); + Output res; + if (context.is_static()) { + auto dst_reshaped = std::make_shared( + dst, + ov::op::v0::Constant::create(ov::element::i64, {2}, {(int64_t) dst_shape[1], (int64_t) dst_shape[2]}), + false); + auto indices_reshaped = + std::make_shared(indices, ov::op::v0::Constant::create(ov::element::i64, {2}, {0, 1})); + auto data_reshaped = std::make_shared( + data, ov::op::v0::Constant::create(ov::element::i64, {2}, {(int64_t) -1, (int64_t) dst_shape[2]}), false); - auto updated = std::make_shared(dst_reshaped, indices_reshaped, data_reshaped, zero); - auto res = std::make_shared(updated, std::make_shared(dst), false); + auto updated = std::make_shared(dst_reshaped, indices_reshaped, data_reshaped, zero); + res = std::make_shared(updated, std::make_shared(dst), false); + } else { + // TODO: Better solution would be to reshape the data into 4D at first place (for stateful model) + if (data.get_partial_shape().rank() + 1 == dst.get_partial_shape().rank()) { + data = std::make_shared(data, zero); + } + int concat_axis = 1; + if (context.is_static()) + concat_axis = 0; + res = std::make_shared(OutputVector{dst, data}, concat_axis); + } return rename_outputs_with_suffix({res}, context.get_name()); } diff --git a/ggml/src/ggml-openvino/openvino/op/softmax.cpp b/ggml/src/ggml-openvino/openvino/op/softmax.cpp index 1aa3bf76a0..8f134626c8 100644 --- a/ggml/src/ggml-openvino/openvino/op/softmax.cpp +++ b/ggml/src/ggml-openvino/openvino/op/softmax.cpp @@ -7,8 +7,10 @@ #include #include #include +#include #include #include +#include #include #include #include @@ -57,9 +59,20 @@ OutputVector translate_soft_max(const NodeContext& context) { } else { auto token_len = get_dimensions(input_node, {1}); auto mask_node = context.get_input(1); - auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); - auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); - mask_node_sliced = std::make_shared(mask_node, zero, token_len, one, one); + auto zero_2d = ov::op::v0::Constant::create(ov::element::i64, {2}, 
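// Recap of the dynamic SET_ROWS lowering above (shapes illustrative): rather
// than scattering rows into a preallocated cache, the step's K/V data is
// reshaped to {1, -1, dim2, dim3} and concatenated to the cache along axis 1,
// e.g. cache {1, 17, 8, 128} + new tokens {1, 3, 8, 128} -> {1, 20, 8, 128};
// the row indices can be ignored because cells are appended in order.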
{0,0}); + auto one_2d = ov::op::v0::Constant::create(ov::element::i64, {2}, {1,1}); + auto zero_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); + auto two_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {2}); + auto axes = ov::op::v0::Constant::create(ov::element::i64, {2}, {1,2}); + auto leaf_8 = context.get_input("leaf_8"); + auto shape_of_leaf_8 = std::make_shared(leaf_8); + auto gather_leaf_8 = std::make_shared(shape_of_leaf_8, two_1d, zero_1d); + auto stop = std::make_shared(ov::OutputVector{token_len, gather_leaf_8}, 0); + mask_node_sliced = + std::make_shared(mask_node, zero_2d, stop, one_2d, axes); + if (!(context.is_static())) { + mask_node_sliced = std::make_shared(mask_node_sliced, zero_1d); + } } if (mask_node_sliced.get_element_type() != context.get_output_type(0)) { diff --git a/ggml/src/ggml-openvino/openvino/translate_session.cpp b/ggml/src/ggml-openvino/openvino/translate_session.cpp index 9443819682..58a94d6149 100644 --- a/ggml/src/ggml-openvino/openvino/translate_session.cpp +++ b/ggml/src/ggml-openvino/openvino/translate_session.cpp @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -87,9 +88,18 @@ void add_sliced_mask(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) { if (is_static) { mask_sliced = mask; } else { - auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); - auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); - mask_sliced = std::make_shared(mask, zero, token_len, one, one); + auto zero_2d = ov::op::v0::Constant::create(ov::element::i64, {2}, {0,0}); + auto one_2d = ov::op::v0::Constant::create(ov::element::i64, {2}, {1,1}); + auto zero_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); + auto two_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {2}); + auto axes = ov::op::v0::Constant::create(ov::element::i64, {2}, {1,2}); + auto leaf_8 = tensor_map.at("leaf_8").get_node_shared_ptr(); + auto shape_of_leaf_8 = std::make_shared(leaf_8); + auto gather_leaf_8 = std::make_shared(shape_of_leaf_8, two_1d, zero_1d); + auto stop = std::make_shared(ov::OutputVector{token_len, gather_leaf_8}, 0); + mask_sliced = + std::make_shared(mask, zero_2d, stop, one_2d, axes); + mask_sliced = std::make_shared(mask_sliced, zero_1d); mask_sliced = std::make_shared(mask_sliced, ov::element::f16); mask_sliced->set_friendly_name(sliced_name); } From e7252920e182e3616696e280aa40819b991c62cf Mon Sep 17 00:00:00 2001 From: cavusmustafa Date: Wed, 1 Oct 2025 14:33:48 -0700 Subject: [PATCH 150/254] env variable GGML_OPENVINO_DISABLE_SDPA_OPTIMIZATION added --- ggml/src/ggml-openvino/utils.cpp | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 0ec815f07f..9b000f26d5 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -80,11 +80,6 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c bool is_static = device == "NPU" ? 
true : false; ov::AnyMap config; - if (device == "GPU") { - config = { - {"GPU_ENABLE_SDPA_OPTIMIZATION", "0"} - }; - } if (getenv("GGML_OPENVINO_DUMP_CGRAPH")) { std::string filename = "cgraph.txt"; @@ -186,6 +181,13 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c ov::serialize(model, timestamped_filename); } + auto* disable_sdpa_optimization = getenv("GGML_OPENVINO_DISABLE_SDPA_OPTIMIZATION"); + if (disable_sdpa_optimization && std::string(disable_sdpa_optimization) != "0") { + config = { + {"GPU_ENABLE_SDPA_OPTIMIZATION", "0"} + }; + } + auto compiled_model = core.compile_model(model, device, config); compile_end_time = ggml_time_us(); infer_request_cache[cgraph] = std::make_shared(compiled_model.create_infer_request()); From 05d7abae8cf0c3cae615c93feb9ce78e67b7a967 Mon Sep 17 00:00:00 2001 From: cavusmustafa Date: Thu, 2 Oct 2025 11:24:40 -0700 Subject: [PATCH 151/254] Fix for Phi3 --- .../ggml-openvino/openvino/op/flash_attn_ext.cpp | 8 ++++---- ggml/src/ggml-openvino/openvino/op/permute.cpp | 12 ++++++++++-- ggml/src/ggml-openvino/openvino/op/set_rows.cpp | 16 +++++++--------- ggml/src/ggml-openvino/openvino/op/softmax.cpp | 8 ++++---- .../ggml-openvino/openvino/translate_session.cpp | 8 ++++---- 5 files changed, 29 insertions(+), 23 deletions(-) diff --git a/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp b/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp index 36d0f8844a..ec9bb0aac5 100644 --- a/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp +++ b/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp @@ -47,10 +47,10 @@ OutputVector translate_flash_attn_ext(const NodeContext& context) { auto zero_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); auto two_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {2}); auto axes = ov::op::v0::Constant::create(ov::element::i64, {2}, {1,2}); - auto leaf_8 = context.get_input("leaf_8"); - auto shape_of_leaf_8 = std::make_shared(leaf_8); - auto gather_leaf_8 = std::make_shared(shape_of_leaf_8, two_1d, zero_1d); - auto stop = std::make_shared(ov::OutputVector{token_len, gather_leaf_8}, 0); + auto inp_pos = context.get_input("inp_pos"); + auto shape_of_inp_pos = std::make_shared(inp_pos); + auto gather_inp_pos = std::make_shared(shape_of_inp_pos, two_1d, zero_1d); + auto stop = std::make_shared(ov::OutputVector{token_len, gather_inp_pos}, 0); mask_sliced = std::make_shared(mask, zero_2d, stop, one_2d, axes); mask_sliced = std::make_shared(mask_sliced, zero_1d); diff --git a/ggml/src/ggml-openvino/openvino/op/permute.cpp b/ggml/src/ggml-openvino/openvino/op/permute.cpp index cd0d073ab3..ea5e417965 100644 --- a/ggml/src/ggml-openvino/openvino/op/permute.cpp +++ b/ggml/src/ggml-openvino/openvino/op/permute.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #include "../node_context.hpp" #include "../op_table.hpp" @@ -23,13 +24,18 @@ OutputVector translate_permute(const NodeContext& context) { int op_case = context.get_op_case(); FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2 || op_case == 3, "Unsupported PERMUTE case"); ov::Output res; + auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); if (op_case == 1) { if (context.is_static()) { res = std::make_shared(context.get_input(0), ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 0, 2})); } else { - res = std::make_shared(context.get_input(0), + auto src = context.get_input(0); + if (src.get_partial_shape().rank() == 3) { + src = std::make_shared(src, zero); + } + res = 
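// Axis bookkeeping for the dynamic permute below (names S/H/D illustrative):
// a 3D input [S, H, D] is first unsqueezed to [1, S, H, D], and the 4D order
// {0, 2, 1, 3} then swaps the sequence and head axes,
//   [1, S, H, D] -> [1, H, S, D],
// matching the head-major layout the attention ops expect.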
std::make_shared(src, ov::op::v0::Constant::create(ov::element::i64, {4}, {0, 2, 1, 3})); } } else { @@ -43,7 +49,6 @@ OutputVector translate_permute(const NodeContext& context) { attention_size = context.get_input("attention_size_swa"); } - auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); if (context.is_static()) { @@ -57,6 +62,9 @@ OutputVector translate_permute(const NodeContext& context) { res = std::make_shared(src_slice, ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 0, 2})); } else { + if (src.get_partial_shape().rank() == 3) { + src = std::make_shared(src, zero); + } res = std::make_shared(src, ov::op::v0::Constant::create(ov::element::i64, {4}, {0, 2, 1, 3})); } diff --git a/ggml/src/ggml-openvino/openvino/op/set_rows.cpp b/ggml/src/ggml-openvino/openvino/op/set_rows.cpp index a3285d41ce..0b2f29441a 100644 --- a/ggml/src/ggml-openvino/openvino/op/set_rows.cpp +++ b/ggml/src/ggml-openvino/openvino/op/set_rows.cpp @@ -1,3 +1,4 @@ +#include #include #include #include @@ -8,7 +9,6 @@ #include #include #include -#include #include #include #include @@ -55,14 +55,12 @@ OutputVector translate_set_rows(const NodeContext& context) { auto updated = std::make_shared(dst_reshaped, indices_reshaped, data_reshaped, zero); res = std::make_shared(updated, std::make_shared(dst), false); } else { - // TODO: Better solution would be to reshape the data into 4D at first place (for stateful model) - if (data.get_partial_shape().rank() + 1 == dst.get_partial_shape().rank()) { - data = std::make_shared(data, zero); - } - int concat_axis = 1; - if (context.is_static()) - concat_axis = 0; - res = std::make_shared(OutputVector{dst, data}, concat_axis); + assert(dst.get_partial_shape().rank() == 4 && dst.get_partial_shape()[2].is_static() && dst.get_partial_shape()[3].is_static()); + int64_t dim2 = dst.get_partial_shape()[2].get_length(); + int64_t dim3 = dst.get_partial_shape()[3].get_length(); + data = std::make_shared( + data, ov::op::v0::Constant::create(ov::element::i64, {4}, {(int64_t) 1, (int64_t) -1, dim2, dim3}), false); + res = std::make_shared(OutputVector{dst, data}, 1); } return rename_outputs_with_suffix({res}, context.get_name()); } diff --git a/ggml/src/ggml-openvino/openvino/op/softmax.cpp b/ggml/src/ggml-openvino/openvino/op/softmax.cpp index 8f134626c8..12db9e82a0 100644 --- a/ggml/src/ggml-openvino/openvino/op/softmax.cpp +++ b/ggml/src/ggml-openvino/openvino/op/softmax.cpp @@ -64,10 +64,10 @@ OutputVector translate_soft_max(const NodeContext& context) { auto zero_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); auto two_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {2}); auto axes = ov::op::v0::Constant::create(ov::element::i64, {2}, {1,2}); - auto leaf_8 = context.get_input("leaf_8"); - auto shape_of_leaf_8 = std::make_shared(leaf_8); - auto gather_leaf_8 = std::make_shared(shape_of_leaf_8, two_1d, zero_1d); - auto stop = std::make_shared(ov::OutputVector{token_len, gather_leaf_8}, 0); + auto inp_pos = context.get_input("inp_pos"); + auto shape_of_inp_pos = std::make_shared(inp_pos); + auto gather_inp_pos = std::make_shared(shape_of_inp_pos, two_1d, zero_1d); + auto stop = std::make_shared(ov::OutputVector{token_len, gather_inp_pos}, 0); mask_node_sliced = std::make_shared(mask_node, zero_2d, stop, one_2d, axes); if (!(context.is_static())) { diff --git a/ggml/src/ggml-openvino/openvino/translate_session.cpp b/ggml/src/ggml-openvino/openvino/translate_session.cpp 
index 58a94d6149..830344020c 100644 --- a/ggml/src/ggml-openvino/openvino/translate_session.cpp +++ b/ggml/src/ggml-openvino/openvino/translate_session.cpp @@ -93,10 +93,10 @@ void add_sliced_mask(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) { auto zero_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); auto two_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {2}); auto axes = ov::op::v0::Constant::create(ov::element::i64, {2}, {1,2}); - auto leaf_8 = tensor_map.at("leaf_8").get_node_shared_ptr(); - auto shape_of_leaf_8 = std::make_shared(leaf_8); - auto gather_leaf_8 = std::make_shared(shape_of_leaf_8, two_1d, zero_1d); - auto stop = std::make_shared(ov::OutputVector{token_len, gather_leaf_8}, 0); + auto inp_pos = tensor_map.at("inp_pos").get_node_shared_ptr(); + auto shape_of_inp_pos = std::make_shared(inp_pos); + auto gather_inp_pos = std::make_shared(shape_of_inp_pos, two_1d, zero_1d); + auto stop = std::make_shared(ov::OutputVector{token_len, gather_inp_pos}, 0); mask_sliced = std::make_shared(mask, zero_2d, stop, one_2d, axes); mask_sliced = std::make_shared(mask_sliced, zero_1d); From a9371ea646af77d271876bc66348ad7246ecbd4d Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Thu, 9 Oct 2025 14:50:52 +0800 Subject: [PATCH 152/254] Fix llama-cli (need to run with --no-warmup) --- ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp | 8 ++++---- ggml/src/ggml-openvino/openvino/translate_session.cpp | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp b/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp index ec9bb0aac5..c07a7ccb16 100644 --- a/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp +++ b/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp @@ -42,15 +42,15 @@ OutputVector translate_flash_attn_ext(const NodeContext& context) { mask_sliced = context.get_input(mask_name); } else { auto token_len = get_dimensions(q, {2}); + auto kv_len = get_dimensions(k.get_node_shared_ptr(), {2}); + auto zero_2d = ov::op::v0::Constant::create(ov::element::i64, {2}, {0,0}); auto one_2d = ov::op::v0::Constant::create(ov::element::i64, {2}, {1,1}); auto zero_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); auto two_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {2}); auto axes = ov::op::v0::Constant::create(ov::element::i64, {2}, {1,2}); - auto inp_pos = context.get_input("inp_pos"); - auto shape_of_inp_pos = std::make_shared(inp_pos); - auto gather_inp_pos = std::make_shared(shape_of_inp_pos, two_1d, zero_1d); - auto stop = std::make_shared(ov::OutputVector{token_len, gather_inp_pos}, 0); + + auto stop = std::make_shared(ov::OutputVector{token_len, kv_len}, 0); mask_sliced = std::make_shared(mask, zero_2d, stop, one_2d, axes); mask_sliced = std::make_shared(mask_sliced, zero_1d); diff --git a/ggml/src/ggml-openvino/openvino/translate_session.cpp b/ggml/src/ggml-openvino/openvino/translate_session.cpp index 830344020c..0b16c06fd0 100644 --- a/ggml/src/ggml-openvino/openvino/translate_session.cpp +++ b/ggml/src/ggml-openvino/openvino/translate_session.cpp @@ -132,7 +132,7 @@ void add_rope_sin_cos(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) { // Create common patterns void preprocess(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) { add_token_len(tensor_map); - add_sliced_mask(tensor_map, ggml_model_decoder); + // add_sliced_mask(tensor_map, ggml_model_decoder); add_rope_sin_cos(tensor_map, ggml_model_decoder); } From 
8b82d1153bdc81905ec40f0bf09db090bb897358 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Fri, 10 Oct 2025 13:17:12 +0800 Subject: [PATCH 153/254] Fix add_sliced_mask; Revert mulmat, softmax; Remove input attention_size, iSWA model not working --- ggml/src/ggml-openvino/ggml-decoder.cpp | 7 ++-- .../openvino/op/flash_attn_ext.cpp | 1 - ggml/src/ggml-openvino/openvino/op/mulmat.cpp | 20 +++-------- .../src/ggml-openvino/openvino/op/permute.cpp | 14 ++------ .../src/ggml-openvino/openvino/op/softmax.cpp | 19 ++--------- .../openvino/translate_session.cpp | 34 +++++++++++++------ 6 files changed, 38 insertions(+), 57 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 0000319f63..7c6bfe7ee7 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -73,7 +73,7 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_cgraph* cgraph, set_input_output(cur_node); } - add_extra_inputs(); + // add_extra_inputs(); } GgmlOvDecoder::GgmlOvDecoder(struct ggml_cgraph* cgraph, @@ -336,9 +336,10 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor* src) co void GgmlOvDecoder::add_extra_inputs() { // Extra inputs: - // 1. `attention_size`, used in matmul's in the attention block. The shape of those matmul's are 32 aligned, + // 1. `attention_size`, used in FLASH_ATTN where the shape of the matmul's are 256 aligned, // see llama_kv_cache_unified::get_n_kv and llama_kv_cache_unified::get_padding. - // Not used for NPU + // Not used for NPU. + // Update: not used anymore after the optimization of making kvcache dynamic (but breaks iSWA models) int64_t attention_size = -1; int64_t attention_size_swa = -1; for (const auto& node : m_nodes) { diff --git a/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp b/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp index c07a7ccb16..9845fe0a02 100644 --- a/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp +++ b/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp @@ -2,7 +2,6 @@ #include #include #include -#include #include #include #include diff --git a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp index 3a1ca34166..b4103378eb 100644 --- a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp +++ b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp @@ -59,23 +59,13 @@ OutputVector translate_mulmat(const NodeContext& context) { auto Z_last_two_dims = get_dimensions(Z.get_node_shared_ptr(), {1, 2}); + auto unsqueeze_axes = ov::op::v0::Constant::create(ov::element::i64, Shape{}, {1}); + auto Z_unsqueezed = std::make_shared(Z, unsqueeze_axes); + Output batch_small = A_batch_larger ? B_batch_node : A_batch_node; Output batch_large = A_batch_larger ? 
A_batch_node : B_batch_node; - - ov::Output broadcast_shape; - ov::Output Z_unsqueezed; - if (context.is_static()) { - auto unsqueeze_axes = ov::op::v0::Constant::create(ov::element::i64, Shape{}, {1}); - Z_unsqueezed = std::make_shared(Z, unsqueeze_axes); - broadcast_shape = - std::make_shared(ov::OutputVector{batch_small, factor_node, Z_last_two_dims}, 0); - } else { - auto unsqueeze_axes = ov::op::v0::Constant::create(ov::element::i64, Shape{}, {2}); - Z_unsqueezed = std::make_shared(Z, unsqueeze_axes); - auto one_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); - broadcast_shape = - std::make_shared(ov::OutputVector{one_1d, batch_small, factor_node, Z_last_two_dims}, 0); - } + auto broadcast_shape = + std::make_shared(ov::OutputVector{batch_small, factor_node, Z_last_two_dims}, 0); auto Z_broadcasted = std::make_shared(Z_unsqueezed, broadcast_shape); auto new_Z_shape = std::make_shared(ov::OutputVector{batch_large, Z_last_two_dims}, 0); diff --git a/ggml/src/ggml-openvino/openvino/op/permute.cpp b/ggml/src/ggml-openvino/openvino/op/permute.cpp index ea5e417965..5f86f47c1c 100644 --- a/ggml/src/ggml-openvino/openvino/op/permute.cpp +++ b/ggml/src/ggml-openvino/openvino/op/permute.cpp @@ -40,15 +40,6 @@ OutputVector translate_permute(const NodeContext& context) { } } else { auto src = context.get_input(0); - Output attention_size; - if (context.is_static()) { - attention_size = ov::op::v0::Constant::create(ov::element::i64, {1}, {INT_MAX}); - } else if (op_case == 2) { - attention_size = context.get_input("attention_size"); - } else { - attention_size = context.get_input("attention_size_swa"); - } - auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); if (context.is_static()) { @@ -58,9 +49,8 @@ OutputVector translate_permute(const NodeContext& context) { src, ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector{-1, src_shape[1], src_shape[2]}), false); - auto src_slice = std::make_shared(src_reshaped, zero, attention_size, one, zero); - res = std::make_shared(src_slice, - ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 0, 2})); + res = std::make_shared( + src_reshaped, ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 0, 2})); } else { if (src.get_partial_shape().rank() == 3) { src = std::make_shared(src, zero); diff --git a/ggml/src/ggml-openvino/openvino/op/softmax.cpp b/ggml/src/ggml-openvino/openvino/op/softmax.cpp index 12db9e82a0..1aa3bf76a0 100644 --- a/ggml/src/ggml-openvino/openvino/op/softmax.cpp +++ b/ggml/src/ggml-openvino/openvino/op/softmax.cpp @@ -7,10 +7,8 @@ #include #include #include -#include #include #include -#include #include #include #include @@ -59,20 +57,9 @@ OutputVector translate_soft_max(const NodeContext& context) { } else { auto token_len = get_dimensions(input_node, {1}); auto mask_node = context.get_input(1); - auto zero_2d = ov::op::v0::Constant::create(ov::element::i64, {2}, {0,0}); - auto one_2d = ov::op::v0::Constant::create(ov::element::i64, {2}, {1,1}); - auto zero_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); - auto two_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {2}); - auto axes = ov::op::v0::Constant::create(ov::element::i64, {2}, {1,2}); - auto inp_pos = context.get_input("inp_pos"); - auto shape_of_inp_pos = std::make_shared(inp_pos); - auto gather_inp_pos = std::make_shared(shape_of_inp_pos, two_1d, zero_1d); - auto stop = std::make_shared(ov::OutputVector{token_len, gather_inp_pos}, 0); - mask_node_sliced = - std::make_shared(mask_node, zero_2d, stop, 
one_2d, axes); - if (!(context.is_static())) { - mask_node_sliced = std::make_shared(mask_node_sliced, zero_1d); - } + auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); + auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); + mask_node_sliced = std::make_shared(mask_node, zero, token_len, one, one); } if (mask_node_sliced.get_element_type() != context.get_output_type(0)) { diff --git a/ggml/src/ggml-openvino/openvino/translate_session.cpp b/ggml/src/ggml-openvino/openvino/translate_session.cpp index 0b16c06fd0..e35599084e 100644 --- a/ggml/src/ggml-openvino/openvino/translate_session.cpp +++ b/ggml/src/ggml-openvino/openvino/translate_session.cpp @@ -11,14 +11,15 @@ #include #include #include -#include #include #include #include #include #include #include +#include #include +#include #include #include #include @@ -88,15 +89,27 @@ void add_sliced_mask(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) { if (is_static) { mask_sliced = mask; } else { - auto zero_2d = ov::op::v0::Constant::create(ov::element::i64, {2}, {0,0}); - auto one_2d = ov::op::v0::Constant::create(ov::element::i64, {2}, {1,1}); + auto zero_2d = ov::op::v0::Constant::create(ov::element::i64, {2}, {0, 0}); + auto one_2d = ov::op::v0::Constant::create(ov::element::i64, {2}, {1, 1}); + auto one_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); auto zero_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); auto two_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {2}); - auto axes = ov::op::v0::Constant::create(ov::element::i64, {2}, {1,2}); - auto inp_pos = tensor_map.at("inp_pos").get_node_shared_ptr(); - auto shape_of_inp_pos = std::make_shared(inp_pos); - auto gather_inp_pos = std::make_shared(shape_of_inp_pos, two_1d, zero_1d); - auto stop = std::make_shared(ov::OutputVector{token_len, gather_inp_pos}, 0); + auto axes = ov::op::v0::Constant::create(ov::element::i64, {2}, {1, 2}); + + std::shared_ptr kv_len; + { + auto start = ov::op::v0::Constant::create(element::i64, Shape{3}, {0, 0, -1}); + auto stride = ov::op::v0::Constant::create(element::i64, Shape{3}, {1, 1, 1}); + auto inp_pos = tensor_map.at("inp_pos").get_node_shared_ptr(); + kv_len = std::make_shared( + inp_pos, start, start, stride, std::vector{0, 0, 0}, std::vector{1, 1, 1}); + } + kv_len = std::make_shared( + kv_len, ov::op::v0::Constant::create(ov::element::i64, {2}, {0, 1})); + kv_len = std::make_shared(kv_len, ov::element::i64); + kv_len = std::make_shared(kv_len, one_1d); + auto stop = std::make_shared(ov::OutputVector{token_len, kv_len}, 0); + mask_sliced = std::make_shared(mask, zero_2d, stop, one_2d, axes); mask_sliced = std::make_shared(mask_sliced, zero_1d); @@ -108,7 +121,8 @@ void add_sliced_mask(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) { }; create_sliced_mask("KQ_mask", "KQ_mask_sliced", ggml_model_decoder.is_static()); - create_sliced_mask("KQ_mask_swa", "KQ_mask_swa_sliced", ggml_model_decoder.is_static()); + // swa is not working for the `kv_len` is not correct + // create_sliced_mask("KQ_mask_swa", "KQ_mask_swa_sliced", ggml_model_decoder.is_static()); } void add_rope_sin_cos(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) { @@ -132,7 +146,7 @@ void add_rope_sin_cos(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) { // Create common patterns void preprocess(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) { add_token_len(tensor_map); - // add_sliced_mask(tensor_map, ggml_model_decoder); + add_sliced_mask(tensor_map, 
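// How the dynamic mask slice above derives its bounds (step values
// hypothetical): the last element of inp_pos is taken with the {0, 0, -1}
// StridedSlice and incremented, so when decoding one token at position 19,
// kv_len = 19 + 1 = 20 and token_len = 1, and the padded KQ_mask is sliced
// down to a 1 x 20 window before the f16 conversion.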
ggml_model_decoder); add_rope_sin_cos(tensor_map, ggml_model_decoder); } From 299f4923bbb01ac24e660832d7b514fc281290ee Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Sat, 11 Oct 2025 13:45:39 +0800 Subject: [PATCH 154/254] fix after rebasing --- ggml/src/ggml-openvino/ggml-openvino.cpp | 1 + ggml/src/ggml-openvino/openvino/op/set_rows.cpp | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp index 648acb4e35..309fc19b37 100644 --- a/ggml/src/ggml-openvino/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino.cpp @@ -70,6 +70,7 @@ static const ggml_backend_i ggml_backend_openvino_interface = { /* .graph_compute = */ ggml_backend_openvino_graph_compute, /* .event_record = */ NULL, /* .event_wait = */ NULL, + /* .graph_optimize = */ NULL, }; int ggml_backend_openvino_get_device_count() { diff --git a/ggml/src/ggml-openvino/openvino/op/set_rows.cpp b/ggml/src/ggml-openvino/openvino/op/set_rows.cpp index 0b2f29441a..001bd08773 100644 --- a/ggml/src/ggml-openvino/openvino/op/set_rows.cpp +++ b/ggml/src/ggml-openvino/openvino/op/set_rows.cpp @@ -25,7 +25,7 @@ namespace ggml { namespace op { OutputVector translate_set_rows(const NodeContext& context) { - num_inputs_check(context, 2, 2); + num_inputs_check(context, 3, 3); auto data = context.get_input(0); data = std::make_shared(data, context.get_output_type(0)); From 2d2f00a41f677faa558cedd2447d203da85a2e72 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Tue, 14 Oct 2025 14:51:42 +0800 Subject: [PATCH 155/254] Fix llama-3-8b and phi3-mini q4_0 NPU --- ggml/src/ggml-openvino/ggml-quants.cpp | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-openvino/ggml-quants.cpp b/ggml/src/ggml-openvino/ggml-quants.cpp index 1538a8207c..017d2ad28c 100644 --- a/ggml/src/ggml-openvino/ggml-quants.cpp +++ b/ggml/src/ggml-openvino/ggml-quants.cpp @@ -414,6 +414,13 @@ std::shared_ptr requantize(const ggml_tensor* tensor, ExtraQuantType r std::shared_ptr weight_node; ov::Shape node_shape = {(uint64_t) (tensor->ne[1]), (uint64_t) (tensor->ne[0])}; + // FIXME hardcoded workaround to fix the case where token_emb.weight is q4_0 (instead of q6_k) + // (In some q4_0 models which use two different weight for token_emb and output, token_emb is q4_0) + std::string device = getenv("GGML_OPENVINO_DEVICE") ? getenv("GGML_OPENVINO_DEVICE") : ""; + if (device == "NPU" && std::string(tensor->name) == "token_embd.weight") { + requant_type = ExtraQuantType::F16; + } + if (requant_type == ExtraQuantType::F16) { ov::Tensor weights(ov::element::f16, node_shape); ggml_get_type_traits(GGML_TYPE_F16)->from_float_ref(weights_f32.data(), weights.data(), ggml_nelements(tensor)); @@ -473,7 +480,16 @@ void quantize_q4_0(const float* x, ov::Tensor& weights_arr, ov::Tensor& scales_a } const float d = max / -8; - const float id = d ? 
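// Why the all-zero guard added below is safe (arithmetic check): for a block
// of zeros the unguarded path would store the degenerate scale d = 0; the
// guard instead stores scale = 1, bias = -8 and fills every packed byte with
// 0x88 (two nibbles of the zero-point 8), so each weight still dequantizes to
//   1 * 8 + (-8) = 0.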
1.0f / d : 0.0f; + + if (d == 0) { + scales[i] = ov::float16(1.0f); + biases[i] = ov::float16(-8.0f); + uint8_t zp = 8; + memset(weights + i * qk / 2, zp | (zp << 4), qk / 2); + continue; + } + + const float id = 1.0f / d; scales[i] = ov::float16(d); biases[i] = ov::float16(-8.f * d); From 841d673bd00f0f49083e7c8243d24f913c96ea22 Mon Sep 17 00:00:00 2001 From: Ravi Panchumarthy Date: Tue, 14 Oct 2025 17:01:28 -0700 Subject: [PATCH 156/254] Update to OV-2025.3 and CMakeLists.txt --- docs/build.md | 18 ++++-------------- ggml/src/ggml-openvino/CMakeLists.txt | 4 +++- 2 files changed, 7 insertions(+), 15 deletions(-) diff --git a/docs/build.md b/docs/build.md index e2af1b96dd..e20bb836c2 100644 --- a/docs/build.md +++ b/docs/build.md @@ -718,23 +718,13 @@ Follow the instructions below to install OpenVINO runtime and build llama.cpp wi - Follow the guide to install OpenVINO Runtime from an archive file: [Linux](https://docs.openvino.ai/2025/get-started/install-openvino/install-openvino-archive-linux.html) | [Windows](https://docs.openvino.ai/2025/get-started/install-openvino/install-openvino-archive-windows.html)
-📦 Click to expand OpenVINO 2025.2 installation commands on Linux +📦 Click to expand OpenVINO 2025.3 installation on Ubuntu
```bash -export OPENVINO_VERSION_MAJOR=2025.2 -export OPENVINO_VERSION_FULL=2025.2.0.19140.c01cd93e24d -sudo apt-get update -sudo apt-get install -y build-essential libcurl4-openssl-dev libtbb12 cmake ninja-build python3-pip curl wget tar -sudo mkdir -p /opt/intel -wget -O openvino_${OPENVINO_VERSION_MAJOR}.tgz https://storage.openvinotoolkit.org/repositories/openvino/packages/${OPENVINO_VERSION_MAJOR}/linux/openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64.tgz -tar -xf openvino_${OPENVINO_VERSION_MAJOR}.tgz -sudo mv openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64 /opt/intel/openvino_${OPENVINO_VERSION_MAJOR} -rm openvino_${OPENVINO_VERSION_MAJOR}.tgz -cd /opt/intel/openvino_${OPENVINO_VERSION_MAJOR} -echo "Y" | sudo -E ./install_dependencies/install_openvino_dependencies.sh && cd - -sudo ln -s /opt/intel/openvino_${OPENVINO_VERSION_MAJOR} /opt/intel/openvino -source /opt/intel/openvino/setupvars.sh +wget https://raw.githubusercontent.com/ravi9/misc-scripts/main/openvino/ov-archive-install/install-openvino-from-archive.sh +chmod +x install-openvino-from-archive.sh +./install-openvino-from-archive.sh ```
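+
+After the script completes, load the OpenVINO environment in each new shell before configuring the build. Below is a minimal sanity check; it assumes the script installs the toolkit under `/opt/intel/openvino`, the same location the previous manual commands used:
+
+```bash
+# setupvars.sh ships with the OpenVINO archive; the install path here is an assumption
+source /opt/intel/openvino/setupvars.sh
+# setupvars.sh exports INTEL_OPENVINO_DIR pointing at the installed toolkit
+echo $INTEL_OPENVINO_DIR
+```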
diff --git a/ggml/src/ggml-openvino/CMakeLists.txt b/ggml/src/ggml-openvino/CMakeLists.txt index 216aa756a7..3051a8b240 100644 --- a/ggml/src/ggml-openvino/CMakeLists.txt +++ b/ggml/src/ggml-openvino/CMakeLists.txt @@ -1,5 +1,7 @@ find_package(OpenVINO REQUIRED) +include("${OpenVINO_DIR}/../3rdparty/tbb/lib/cmake/TBB/TBBConfig.cmake") + file(GLOB_RECURSE GGML_HEADERS_OPENVINO "*.h" "*.hpp") file(GLOB_RECURSE GGML_SOURCES_OPENVINO "*.cpp") @@ -8,7 +10,7 @@ ggml_add_backend_library(ggml-openvino ${GGML_HEADERS_OPENVINO} ) -target_link_libraries(ggml-openvino PRIVATE openvino::runtime) +target_link_libraries(ggml-openvino PRIVATE openvino::runtime TBB::tbb) if (GGML_OPENVINO) if (CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64") From 4c8406eb70fda0f4d2fe9a24c440d2ce0834398b Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Wed, 15 Oct 2025 11:48:08 +0800 Subject: [PATCH 157/254] Add OV CI cache --- .../actions/linux-setup-openvino/action.yml | 30 ++++++++++++++++ .github/workflows/build-cache.yml | 33 ++++++++++++++++++ .github/workflows/build.yml | 34 ++++++++++++------- 3 files changed, 85 insertions(+), 12 deletions(-) create mode 100644 .github/actions/linux-setup-openvino/action.yml diff --git a/.github/actions/linux-setup-openvino/action.yml b/.github/actions/linux-setup-openvino/action.yml new file mode 100644 index 0000000000..e4177407ab --- /dev/null +++ b/.github/actions/linux-setup-openvino/action.yml @@ -0,0 +1,30 @@ +name: "Linux - Setup OpenVINO Toolkit" +description: "Setup OpenVINO Toolkit for Linux" +inputs: + path: + description: "Installation path" + required: true + version_major: + description: "OpenVINO major version (e.g., 2025.2)" + required: true + version_full: + description: "OpenVINO full version (e.g., 2025.2.0.19140.c01cd93e24d)" + required: true + +runs: + using: "composite" + steps: + - name: Setup OpenVINO Toolkit + id: setup + uses: ./.github/actions/unarchive-tar + with: + url: https://storage.openvinotoolkit.org/repositories/openvino/packages/${{ inputs.version_major }}/linux/openvino_toolkit_ubuntu24_${{ inputs.version_full }}_x86_64.tgz + path: ${{ inputs.path }} + type: "z" + strip: 1 + + - name: Install OpenVINO dependencies + shell: bash + run: | + cd ${{ inputs.path }} + echo "Y" | sudo -E ./install_dependencies/install_openvino_dependencies.sh diff --git a/.github/workflows/build-cache.yml b/.github/workflows/build-cache.yml index 6a22e41c3b..43d2355472 100644 --- a/.github/workflows/build-cache.yml +++ b/.github/workflows/build-cache.yml @@ -63,6 +63,39 @@ jobs: path: ./spacemit_toolchain version: ${{ env.SPACEMIT_IME_TOOLCHAIN_VERSION }} + ubuntu-24-openvino-cache: + runs-on: ubuntu-24.04 + + env: + # Make sure this is in sync with build.yml + OPENVINO_VERSION_MAJOR: "2025.2" + OPENVINO_VERSION_FULL: "2025.2.0.19140.c01cd93e24d" + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v4 + + - name: Dependencies + run: | + sudo apt-get update + sudo apt-get install -y libtbb12 + + - name: Setup Cache + uses: actions/cache@v4 + id: cache-openvino + with: + path: ./openvino_toolkit + key: openvino-toolkit-v${{ env.OPENVINO_VERSION_FULL }}-${{ runner.os }} + + - name: Setup OpenVINO Toolkit + if: steps.cache-openvino.outputs.cache-hit != 'true' + uses: ./.github/actions/linux-setup-openvino + with: + path: ./openvino_toolkit + version_major: ${{ env.OPENVINO_VERSION_MAJOR }} + version_full: ${{ env.OPENVINO_VERSION_FULL }} + windows-2022-rocm-cache: runs-on: windows-2022 diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 
3692a0a69b..c2f99bf95b 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -740,6 +740,11 @@ jobs: ubuntu-24-cmake-openvino: runs-on: ubuntu-24.04 + env: + # Make sure this is in sync with build-cache.yml + OPENVINO_VERSION_MAJOR: "2025.2" + OPENVINO_VERSION_FULL: "2025.2.0.19140.c01cd93e24d" + steps: - name: Clone id: checkout @@ -754,23 +759,28 @@ jobs: - name: Dependencies id: depends run: | - export OPENVINO_VERSION_MAJOR=2025.2 - export OPENVINO_VERSION_FULL=2025.2.0.19140.c01cd93e24d sudo apt-get update - sudo apt-get install -y build-essential libcurl4-openssl-dev libtbb12 cmake ninja-build python3-pip curl wget tar - sudo mkdir -p /opt/intel - wget -O openvino_${OPENVINO_VERSION_MAJOR}.tgz https://storage.openvinotoolkit.org/repositories/openvino/packages/${OPENVINO_VERSION_MAJOR}/linux/openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64.tgz - tar -xf openvino_${OPENVINO_VERSION_MAJOR}.tgz - sudo mv openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64 /opt/intel/openvino_${OPENVINO_VERSION_MAJOR} - rm openvino_${OPENVINO_VERSION_MAJOR}.tgz - cd /opt/intel/openvino_${OPENVINO_VERSION_MAJOR} - echo "Y" | sudo -E ./install_dependencies/install_openvino_dependencies.sh && cd - - sudo ln -s /opt/intel/openvino_${OPENVINO_VERSION_MAJOR} /opt/intel/openvino + sudo apt-get install -y build-essential libcurl4-openssl-dev libtbb12 cmake ninja-build python3-pip + + - name: Use OpenVINO Toolkit Cache + uses: actions/cache@v4 + id: cache-openvino + with: + path: ./openvino_toolkit + key: openvino-toolkit-v${{ env.OPENVINO_VERSION_FULL }}-${{ runner.os }} + + - name: Setup OpenVINO Toolkit + if: steps.cache-openvino.outputs.cache-hit != 'true' + uses: ./.github/actions/linux-setup-openvino + with: + path: ./openvino_toolkit + version_major: ${{ env.OPENVINO_VERSION_MAJOR }} + version_full: ${{ env.OPENVINO_VERSION_FULL }} - name: Build id: cmake_build run: | - source /opt/intel/openvino/setupvars.sh + source ./openvino_toolkit/setupvars.sh cmake -B build/ReleaseOV -G Ninja \ -DCMAKE_BUILD_TYPE=Release \ -DGGML_OPENVINO=ON From 38e8a19f50ceef1bd7ba792f5e029a0fccd24dae Mon Sep 17 00:00:00 2001 From: Ravi Panchumarthy Date: Wed, 15 Oct 2025 13:25:31 -0700 Subject: [PATCH 158/254] Apply CISC review and update CI to OV2025.3 --- .../actions/linux-setup-openvino/action.yml | 6 +-- .github/workflows/build-cache.yml | 9 +---- .github/workflows/build.yml | 24 ++++++------ .github/workflows/release.yml | 39 ++++++++++++------- 4 files changed, 41 insertions(+), 37 deletions(-) diff --git a/.github/actions/linux-setup-openvino/action.yml b/.github/actions/linux-setup-openvino/action.yml index e4177407ab..7cd136548f 100644 --- a/.github/actions/linux-setup-openvino/action.yml +++ b/.github/actions/linux-setup-openvino/action.yml @@ -5,10 +5,10 @@ inputs: description: "Installation path" required: true version_major: - description: "OpenVINO major version (e.g., 2025.2)" + description: "OpenVINO major version (e.g., 2025.3)" required: true version_full: - description: "OpenVINO full version (e.g., 2025.2.0.19140.c01cd93e24d)" + description: "OpenVINO full version (e.g., 2025.3.0.19807.44526285f24)" required: true runs: @@ -20,7 +20,7 @@ runs: with: url: https://storage.openvinotoolkit.org/repositories/openvino/packages/${{ inputs.version_major }}/linux/openvino_toolkit_ubuntu24_${{ inputs.version_full }}_x86_64.tgz path: ${{ inputs.path }} - type: "z" + type: z strip: 1 - name: Install OpenVINO dependencies diff --git a/.github/workflows/build-cache.yml 
b/.github/workflows/build-cache.yml index 43d2355472..3d8b2b2eaf 100644 --- a/.github/workflows/build-cache.yml +++ b/.github/workflows/build-cache.yml @@ -68,19 +68,14 @@ jobs: env: # Make sure this is in sync with build.yml - OPENVINO_VERSION_MAJOR: "2025.2" - OPENVINO_VERSION_FULL: "2025.2.0.19140.c01cd93e24d" + OPENVINO_VERSION_MAJOR: "2025.3" + OPENVINO_VERSION_FULL: "2025.3.0.19807.44526285f24" steps: - name: Clone id: checkout uses: actions/checkout@v4 - - name: Dependencies - run: | - sudo apt-get update - sudo apt-get install -y libtbb12 - - name: Setup Cache uses: actions/cache@v4 id: cache-openvino diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index c2f99bf95b..06d6105a8d 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -742,8 +742,8 @@ jobs: env: # Make sure this is in sync with build-cache.yml - OPENVINO_VERSION_MAJOR: "2025.2" - OPENVINO_VERSION_FULL: "2025.2.0.19140.c01cd93e24d" + OPENVINO_VERSION_MAJOR: "2025.3" + OPENVINO_VERSION_FULL: "2025.3.0.19807.44526285f24" steps: - name: Clone @@ -751,7 +751,7 @@ jobs: uses: actions/checkout@v4 - name: ccache - uses: hendrikmuhs/ccache-action@v1.2.16 + uses: ggml-org/ccache-action@v1.2.16 with: key: ubuntu-24-cmake-openvino-no-preset-v1 evict-old-files: 1d @@ -1764,12 +1764,12 @@ jobs: GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp ggml-ci-arm64-cpu-kleidiai: - runs-on: ubuntu-22.04-arm + runs-on: ubuntu-22.04-arm - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 + steps: + - name: Clone + id: checkout + uses: actions/checkout@v4 - name: ccache uses: ggml-org/ccache-action@v1.2.16 @@ -1784,10 +1784,10 @@ jobs: sudo apt-get update sudo apt-get install -y build-essential - - name: Test - id: ggml-ci - run: | - GG_BUILD_KLEIDIAI=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt + - name: Test + id: ggml-ci + run: | + GG_BUILD_KLEIDIAI=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt ubuntu-cpu-cmake-riscv64-native: runs-on: RISCV64 diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 2f67885ac7..25ac9ed156 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -234,6 +234,11 @@ jobs: ubuntu-24-openvino: runs-on: ubuntu-24.04 + env: + # Make sure this is in sync with build.yml + OPENVINO_VERSION_MAJOR: "2025.3" + OPENVINO_VERSION_FULL: "2025.3.0.19807.44526285f24" + steps: - name: Clone id: checkout @@ -242,31 +247,35 @@ jobs: fetch-depth: 0 - name: ccache - uses: hendrikmuhs/ccache-action@v1.2.16 + uses: ggml-org/ccache-action@v1.2.16 with: - key: ubuntu-24-cmake-openvino-release-no-preset-v1 + key: ubuntu-24-cmake-openvino-release-no-preset-v1 evict-old-files: 1d - name: Dependencies - id: depends run: | - export OPENVINO_VERSION_MAJOR=2025.2 - export OPENVINO_VERSION_FULL=2025.2.0.19140.c01cd93e24d sudo apt-get update - sudo apt-get install -y build-essential libcurl4-openssl-dev libtbb12 cmake ninja-build python3-pip curl wget tar - sudo mkdir -p /opt/intel - wget -O openvino_${OPENVINO_VERSION_MAJOR}.tgz https://storage.openvinotoolkit.org/repositories/openvino/packages/${OPENVINO_VERSION_MAJOR}/linux/openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64.tgz - tar -xf openvino_${OPENVINO_VERSION_MAJOR}.tgz - sudo mv openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64 /opt/intel/openvino_${OPENVINO_VERSION_MAJOR} - rm openvino_${OPENVINO_VERSION_MAJOR}.tgz - cd /opt/intel/openvino_${OPENVINO_VERSION_MAJOR} - echo "Y" | sudo -E 
./install_dependencies/install_openvino_dependencies.sh && cd - - sudo ln -s /opt/intel/openvino_${OPENVINO_VERSION_MAJOR} /opt/intel/openvino + sudo apt-get install -y build-essential libcurl4-openssl-dev libtbb12 cmake ninja-build python3-pip + + - name: Use OpenVINO Toolkit Cache + uses: actions/cache@v4 + id: cache-openvino + with: + path: ./openvino_toolkit + key: openvino-toolkit-v${{ env.OPENVINO_VERSION_FULL }}-${{ runner.os }} + + - name: Setup OpenVINO Toolkit + if: steps.cache-openvino.outputs.cache-hit != 'true' + uses: ./.github/actions/linux-setup-openvino + with: + path: ./openvino_toolkit + version_major: ${{ env.OPENVINO_VERSION_MAJOR }} + version_full: ${{ env.OPENVINO_VERSION_FULL }} - name: Build id: cmake_build run: | - source /opt/intel/openvino/setupvars.sh + source ./openvino_toolkit/setupvars.sh cmake -B build/ReleaseOV -G Ninja \ -DCMAKE_BUILD_TYPE=Release \ -DGGML_OPENVINO=ON From 45af912b48f6341510d917443c034281da5b1db3 Mon Sep 17 00:00:00 2001 From: Ravi Panchumarthy Date: Wed, 15 Oct 2025 13:52:08 -0700 Subject: [PATCH 159/254] Update CI to run OV dep install before build --- .github/actions/linux-setup-openvino/action.yml | 5 ----- .github/workflows/build.yml | 6 ++++++ .github/workflows/release.yml | 6 ++++++ 3 files changed, 12 insertions(+), 5 deletions(-) diff --git a/.github/actions/linux-setup-openvino/action.yml b/.github/actions/linux-setup-openvino/action.yml index 7cd136548f..46a659a827 100644 --- a/.github/actions/linux-setup-openvino/action.yml +++ b/.github/actions/linux-setup-openvino/action.yml @@ -23,8 +23,3 @@ runs: type: z strip: 1 - - name: Install OpenVINO dependencies - shell: bash - run: | - cd ${{ inputs.path }} - echo "Y" | sudo -E ./install_dependencies/install_openvino_dependencies.sh diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 06d6105a8d..a0be0c704b 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -777,6 +777,12 @@ jobs: version_major: ${{ env.OPENVINO_VERSION_MAJOR }} version_full: ${{ env.OPENVINO_VERSION_FULL }} + - name: Install OpenVINO dependencies + run: | + cd ./openvino_toolkit + chmod +x ./install_dependencies/install_openvino_dependencies.sh + echo "Y" | sudo -E ./install_dependencies/install_openvino_dependencies.sh + - name: Build id: cmake_build run: | diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 25ac9ed156..638ae9f056 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -272,6 +272,12 @@ jobs: version_major: ${{ env.OPENVINO_VERSION_MAJOR }} version_full: ${{ env.OPENVINO_VERSION_FULL }} + - name: Install OpenVINO dependencies + run: | + cd ./openvino_toolkit + chmod +x ./install_dependencies/install_openvino_dependencies.sh + echo "Y" | sudo -E ./install_dependencies/install_openvino_dependencies.sh + - name: Build id: cmake_build run: | From 3a1129e0731c2708da2e586548fdfaced49f5926 Mon Sep 17 00:00:00 2001 From: Ravi Panchumarthy Date: Wed, 15 Oct 2025 16:23:15 -0700 Subject: [PATCH 160/254] Update OV dockerfile to use OV2025.3 and update build docs --- .devops/openvino.Dockerfile | 4 +-- docs/build.md | 63 +++++++++++++++++++++++++++++++++++-- 2 files changed, 63 insertions(+), 4 deletions(-) diff --git a/.devops/openvino.Dockerfile b/.devops/openvino.Dockerfile index 16924e3937..41310c6633 100644 --- a/.devops/openvino.Dockerfile +++ b/.devops/openvino.Dockerfile @@ -1,5 +1,5 @@ -ARG OPENVINO_VERSION_MAJOR=2025.2 -ARG OPENVINO_VERSION_FULL=2025.2.0.19140.c01cd93e24d +ARG 
OPENVINO_VERSION_MAJOR=2025.3 +ARG OPENVINO_VERSION_FULL=2025.3.0.19807.44526285f24 ARG UBUNTU_VERSION=24.04 # Optional proxy build arguments - empty by default diff --git a/docs/build.md b/docs/build.md index e20bb836c2..a63b45a1af 100644 --- a/docs/build.md +++ b/docs/build.md @@ -718,7 +718,7 @@ Follow the instructions below to install OpenVINO runtime and build llama.cpp wi - Follow the guide to install OpenVINO Runtime from an archive file: [Linux](https://docs.openvino.ai/2025/get-started/install-openvino/install-openvino-archive-linux.html) | [Windows](https://docs.openvino.ai/2025/get-started/install-openvino/install-openvino-archive-windows.html)
-📦 Click to expand OpenVINO 2025.3 installation on Ubuntu +📦 Click to expand OpenVINO 2025.3 installation from an archive file on Ubuntu
 ```bash
@@ -804,9 +804,68 @@ Control OpenVINO behavior using these environment variables:
 export GGML_OPENVINO_CACHE_DIR=/tmp/ov_cache
 export GGML_OPENVINO_PROFILING=1

-./build/ReleaseOV/bin/llama-simple -m ~/models/Llama-3.2-1B-Instruct.fp16.gguf -n 50 "The story of AI is "
+GGML_OPENVINO_DEVICE=GPU ./build/ReleaseOV/bin/llama-simple -m ~/models/Llama-3.2-1B-Instruct.fp16.gguf -n 50 "The story of AI is "
 ```

+### Docker build Llama.cpp with OpenVINO Backend
+You can build and run llama.cpp with the OpenVINO backend using Docker.
+
+```bash
+# Build the base runtime image with compiled shared libraries and minimal dependencies.
+docker build -t llama-openvino:base -f .devops/openvino.Dockerfile .
+
+# Build the complete image with all binaries, Python tools, gguf-py library, and model conversion utilities.
+docker build --target=full -t llama-openvino:full -f .devops/openvino.Dockerfile .
+
+# Build a minimal CLI-only image containing just the llama-cli executable.
+docker build --target=light -t llama-openvino:light -f .devops/openvino.Dockerfile .
+
+# Build a server-only image with the llama-server executable, health check endpoint, and REST API support.
+docker build --target=server -t llama-openvino:server -f .devops/openvino.Dockerfile .
+
+# If you are behind a proxy:
+docker build --build-arg http_proxy=$http_proxy --build-arg https_proxy=$https_proxy --target=light -t llama-openvino:light -f .devops/openvino.Dockerfile .
+```
+
+Run the llama.cpp OpenVINO backend Docker container.
+Save sample models in `~/models` as [shown above](#3-download-sample-model); this directory is mounted into the container in the examples below.
+
+```bash
+# Run the Docker container
+docker run --rm -it -v ~/models:/models llama-openvino:light --no-warmup -m /models/Llama-3.2-1B-Instruct.fp16.gguf
+
+# With Intel GPU access (iGPU or dGPU)
+docker run --rm -it -v ~/models:/models \
+--device=/dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) -u $(id -u):$(id -g) \
+llama-openvino:light --no-warmup -m /models/Llama-3.2-1B-Instruct.fp16.gguf
+
+# With Intel NPU access
+docker run --rm -it --env GGML_OPENVINO_DEVICE=NPU -v ~/models:/models \
+--device=/dev/accel --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) -u $(id -u):$(id -g) \
+llama-openvino:light --no-warmup -m /models/Llama-3.2-1B-Instruct.fp16.gguf
+```
+
+### Run Llama.cpp Server with OpenVINO Backend
+```bash
+# Run the server Docker container
+docker run --rm -it -p 8080:8080 -v ~/models:/models llama-openvino:server --no-warmup -m /models/Llama-3.2-1B-Instruct.fp16.gguf
+
+# In a NEW terminal, test the server with curl
+
+# If you are behind a proxy, make sure to set NO_PROXY to avoid proxying localhost
+export NO_PROXY=localhost,127.0.0.1
+
+# Test the health endpoint
+curl -f http://localhost:8080/health
+
+# Test with a simple prompt
+curl -X POST "http://localhost:8080/v1/chat/completions" -H "Content-Type: application/json" \
+    -d '{"messages":[{"role":"user","content":"Write a poem about OpenVINO"}],"max_tokens":100}' | jq .
+```
+
+---
 ## Notes about GPU-accelerated backends

 The GPU may still be used to accelerate some parts of the computation even when using the `-ngl 0` option. You can fully disable GPU acceleration by using `--device none`.
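+
+The `GGML_OPENVINO_DEVICE` values shown above (`GPU`, `NPU`, ...) must match a device name that the OpenVINO runtime actually reports on your machine. A quick way to list them (a sketch, assuming the `openvino` Python package is available, e.g. via `pip install openvino`):
+
+```bash
+# Prints the devices the OpenVINO runtime can see, e.g. ['CPU', 'GPU', 'NPU']
+python3 -c "import openvino as ov; print(ov.Core().available_devices)"
+```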
From bd3093f90cab18adff813d274d3bdcdaad19bbbe Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Tue, 21 Oct 2025 11:33:26 +0800 Subject: [PATCH 161/254] Style: use switch in supports_ops --- ggml/src/ggml-openvino/ggml-openvino.cpp | 30 ++++++++++++++---------- 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp index 309fc19b37..75c2a76c54 100644 --- a/ggml/src/ggml-openvino/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino.cpp @@ -240,7 +240,8 @@ static ggml_backend_buffer_t ggml_backend_openvino_device_buffer_from_host_ptr(g } static bool is_op_unsupported_case(const ggml_tensor* op) { - if (op->op == GGML_OP_SOFT_MAX) { + switch (op->op) { + case GGML_OP_SOFT_MAX: { if (op->src[2] != nullptr) { GGML_LOG_WARN("OpenVINO backend does not support SOFT_MAX with sinks\n"); return true; @@ -254,9 +255,9 @@ static bool is_op_unsupported_case(const ggml_tensor* op) { GGML_LOG_WARN("OpenVINO backend does not support SOFT_MAX with max_bias > 0\n"); return true; } + break; } - - if (op->op == GGML_OP_FLASH_ATTN_EXT) { + case GGML_OP_FLASH_ATTN_EXT: { if (op->src[4] != nullptr) { GGML_LOG_WARN("OpenVINO backend does not support FLASH_ATTN_EXT with sinks\n"); return true; @@ -276,32 +277,32 @@ static bool is_op_unsupported_case(const ggml_tensor* op) { GGML_LOG_WARN("OpenVINO backend does not support FLASH_ATTN_EXT with logit_softcap != 0\n"); return true; } + break; } - - if (op->op == GGML_OP_PERMUTE) { + case GGML_OP_PERMUTE: { if (op->type == GGML_TYPE_BF16) { // err msg: [GPU] Could not find a suitable kernel for transpose GGML_LOG_WARN("OpenVINO backend does not support PERMUTE with BF16 type\n"); return true; } + break; } - - if (op->op == GGML_OP_CPY) { + case GGML_OP_CPY: { if (op->src[1] != op) { GGML_LOG_WARN("OpenVINO backend only supports CPY that is a cast\n"); return true; } + break; } - - if (op->op == GGML_OP_MUL_MAT) { + case GGML_OP_MUL_MAT: { if (op->src[0]->type == GGML_TYPE_F16 && op->src[1]->type == GGML_TYPE_F16) { // Has accuracy issue, try enabling this and see `test-backend-ops -o "MUL_MAT"` GGML_LOG_WARN("OpenVINO backend does not support MUL_MAT with two F16 tensors\n"); return true; } + break; } - - if (op->op == GGML_OP_ROPE) { + case GGML_OP_ROPE: { const int32_t* op_params = op->op_params; const int n_dims = op_params[1]; const int mode = op_params[2]; @@ -330,12 +331,17 @@ static bool is_op_unsupported_case(const ggml_tensor* op) { if (op->src[0]->op == GGML_OP_VIEW) { if (op->src[0]->view_src->ne[1] != op->src[0]->ne[2]) { GGML_LOG_WARN( - "OpenVINO backend does not support ROPE with src[0]->view_src->ne[1] %ld != src[0]->ne[2] %ld\n", + "OpenVINO backend does not support ROPE with src[0]->view_src->ne[1] %ld != src[0]->ne[2] " + "%ld\n", op->src[0]->view_src->ne[1], op->src[0]->ne[2]); return true; } } + break; + } + default: + break; } return false; } From eba8113dc4655e16e4c7513f48cde57c9cfe5791 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Tue, 21 Oct 2025 14:45:32 +0800 Subject: [PATCH 162/254] Style: middle ptr and ref align, omit optional struct keyword --- ggml/include/ggml-openvino.h | 24 +- ggml/src/ggml-openvino/.clang-format | 27 +- ggml/src/ggml-openvino/ggml-decoder.cpp | 168 ++++++------ ggml/src/ggml-openvino/ggml-decoder.h | 132 +++++----- ggml/src/ggml-openvino/ggml-openvino.cpp | 162 ++++++------ ggml/src/ggml-openvino/ggml-quants.cpp | 247 ++++++++++-------- ggml/src/ggml-openvino/openvino/frontend.cpp | 4 +- 
.../ggml-openvino/openvino/input_model.cpp | 4 +- ggml/src/ggml-openvino/openvino/op/cont.cpp | 14 +- ggml/src/ggml-openvino/openvino/op/cpy.cpp | 7 +- .../openvino/op/flash_attn_ext.cpp | 35 ++- .../ggml-openvino/openvino/op/get_rows.cpp | 10 +- .../ggml-openvino/openvino/op/glu_geglu.cpp | 12 +- .../ggml-openvino/openvino/op/glu_swiglu.cpp | 12 +- ggml/src/ggml-openvino/openvino/op/mulmat.cpp | 10 +- .../src/ggml-openvino/openvino/op/permute.cpp | 25 +- .../src/ggml-openvino/openvino/op/reshape.cpp | 22 +- .../ggml-openvino/openvino/op/rms_norm.cpp | 10 +- ggml/src/ggml-openvino/openvino/op/rope.cpp | 15 +- ggml/src/ggml-openvino/openvino/op/scale.cpp | 10 +- .../ggml-openvino/openvino/op/set_rows.cpp | 16 +- .../src/ggml-openvino/openvino/op/softmax.cpp | 16 +- .../ggml-openvino/openvino/op/transpose.cpp | 6 +- .../ggml-openvino/openvino/op/unary_silu.cpp | 10 +- ggml/src/ggml-openvino/openvino/op/view.cpp | 5 +- ggml/src/ggml-openvino/openvino/op_table.cpp | 4 +- .../openvino/pass/eliminate_zp.cpp | 32 ++- .../openvino/pass/fuse_to_sdpa.cpp | 4 +- .../openvino/translate_session.cpp | 80 +++--- ggml/src/ggml-openvino/openvino/utils.cpp | 22 +- ggml/src/ggml-openvino/utils.cpp | 136 +++++----- ggml/src/ggml-openvino/utils.h | 42 +-- 32 files changed, 670 insertions(+), 653 deletions(-) diff --git a/ggml/include/ggml-openvino.h b/ggml/include/ggml-openvino.h index 151c48d40d..7b5298e520 100644 --- a/ggml/include/ggml-openvino.h +++ b/ggml/include/ggml-openvino.h @@ -1,17 +1,17 @@ #pragma once -#include "ggml.h" #include "ggml-backend.h" +#include "ggml.h" -#include #include +#include #ifdef __cplusplus extern "C" { #endif -#define GGML_OPENVINO_NAME "OPENVINO" -#define GGML_OPENVINO_MAX_DEVICES 16 +#define GGML_OPENVINO_NAME "OPENVINO" +#define GGML_OPENVINO_MAX_DEVICES 16 // backend API GGML_BACKEND_API ggml_backend_t ggml_backend_openvino_init(int device); @@ -28,7 +28,7 @@ GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_openvino_split_buffer_t // and GPU GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_openvino_host_buffer_type(void); -GGML_BACKEND_API int ggml_backend_openvino_get_device_count(void); +GGML_BACKEND_API int ggml_backend_openvino_get_device_count(void); // GGML_BACKEND_API void ggml_backend_openvino_get_device_description(int device, char * description, // size_t description_size); // GGML_BACKEND_API void ggml_backend_openvino_get_device_memory(int device, size_t * free, size_t * total); @@ -42,13 +42,13 @@ struct ggml_openvino_device_info { int device_count; struct openvino_device_info { - int cc; // compute capability - int nsm; // number of streaming multiprocessors - size_t smpb; // max. shared memory per block - size_t smpbo; // max. shared memory per block (with opt-in) - bool vmm; // virtual memory support - size_t vmm_granularity; // granularity of virtual memory - size_t total_vram; + int cc; // compute capability + int nsm; // number of streaming multiprocessors + size_t smpb; // max. shared memory per block + size_t smpbo; // max. 
shared memory per block (with opt-in) + bool vmm; // virtual memory support + size_t vmm_granularity; // granularity of virtual memory + size_t total_vram; }; openvino_device_info devices[GGML_OPENVINO_MAX_DEVICES] = {}; diff --git a/ggml/src/ggml-openvino/.clang-format b/ggml/src/ggml-openvino/.clang-format index 63dc2c472a..a2a24d7d33 100644 --- a/ggml/src/ggml-openvino/.clang-format +++ b/ggml/src/ggml-openvino/.clang-format @@ -2,12 +2,10 @@ # Override root .clang-format AlignConsecutiveAssignments: false AlignConsecutiveDeclarations: false -ReferenceAlignment: Left -PointerAlignment: Left Cpp11BracedListStyle: true -AccessModifierOffset: -4 -BinPackArguments: false +SpacesInContainerLiterals: false BreakBeforeBraces: Attach +AccessModifierOffset: -4 IndentCaseBlocks: false IndentCaseLabels: false @@ -32,7 +30,15 @@ AllowShortIfStatementsOnASingleLine: Never AllowShortLambdasOnASingleLine: Inline AllowShortLoopsOnASingleLine: false AlwaysBreakBeforeMultilineStrings: true -BinPackParameters: true +# Treat CUDA keywords/attributes as "attribute macros" and avoid breaking lines inside them +AttributeMacros: + - __host__ + - __device__ + - __global__ + - __forceinline__ + - __launch_bounds__ +BinPackArguments: true +BinPackParameters: false # OnePerLine BitFieldColonSpacing: Both # BreakAdjacentStringLiterals: true BreakAfterAttributes: Never @@ -58,15 +64,18 @@ ExperimentalAutoDetectBinPacking: false FixNamespaceComments: true IncludeBlocks: Regroup IncludeCategories: - - Regex: '^<.*\.h>' + - Regex: '".*"' Priority: 1 SortPriority: 0 - - Regex: '^<.*' + - Regex: '^<.*\.h>' Priority: 2 SortPriority: 0 - - Regex: '.*' + - Regex: '^<.*' Priority: 3 SortPriority: 0 + - Regex: '.*' + Priority: 4 + SortPriority: 0 IncludeIsMainRegex: '([-_](test|unittest))?$' IncludeIsMainSourceRegex: '' IndentAccessModifiers: false @@ -100,6 +109,7 @@ PenaltyBreakString: 1000 PenaltyBreakTemplateDeclaration: 10 PenaltyExcessCharacter: 1000000 PenaltyReturnTypeOnItsOwnLine: 200 +PointerAlignment: Middle QualifierAlignment: Left #QualifierOrder: ['static', 'inline', 'friend', 'constexpr', 'const', 'volatile', 'type', 'restrict'] RawStringFormats: @@ -113,6 +123,7 @@ RawStringFormats: - 'c++' - 'C++' CanonicalDelimiter: '' +ReferenceAlignment: Middle ReflowComments: false # IndentOnly SeparateDefinitionBlocks: Always SortIncludes: CaseInsensitive diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 7c6bfe7ee7..392d45dd6b 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -1,5 +1,9 @@ #include "ggml-decoder.h" +#include "ggml-backend-impl.h" +#include "ggml-backend.h" +#include "ggml-quants.hpp" + #include #include @@ -32,13 +36,16 @@ #include #include -#include "ggml-backend-impl.h" -#include "ggml-backend.h" -#include "ggml-quants.hpp" - -GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgraph, bool is_static, bool is_first_token, - int context_size, int context_size_swa, int num_heads, int num_heads_kv, int head_size, - const std::vector& swa_layers) : +GgmlOvDecoder::GgmlOvDecoder(ggml_tensor * node, + ggml_cgraph * cgraph, + bool is_static, + bool is_first_token, + int context_size, + int context_size_swa, + int num_heads, + int num_heads_kv, + int head_size, + const std::vector & swa_layers) : m_cgraph(cgraph), m_node(node), m_op_name(std::string(node->name)), @@ -53,8 +60,9 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgrap set_input_output(node); } 
-GgmlOvDecoder::GgmlOvDecoder(struct ggml_cgraph* cgraph, - std::map>& model_weights, bool is_static, +GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph, + std::map> & model_weights, + bool is_static, bool is_first_token) : m_cgraph(cgraph), m_op_name(m_node ? std::string(m_node->name) : ""), @@ -68,7 +76,7 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_cgraph* cgraph, set_llm_params(); for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) { - auto* cur_node = cgraph->nodes[node_n]; + auto * cur_node = cgraph->nodes[node_n]; m_nodes.push_back(cur_node); set_input_output(cur_node); } @@ -76,12 +84,11 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_cgraph* cgraph, // add_extra_inputs(); } -GgmlOvDecoder::GgmlOvDecoder(struct ggml_cgraph* cgraph, - std::map>& model_weights) { +GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph, std::map> & model_weights) { m_cgraph = cgraph; m_model_weights = model_weights; for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) { - auto* cur_node = cgraph->nodes[node_n]; + auto * cur_node = cgraph->nodes[node_n]; if (cur_node->op == GGML_OP_NONE) { continue; } @@ -93,7 +100,7 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_cgraph* cgraph, // Called in GgmlOvDecoder constructor. Two cases: 1. constructing a decoder for the whole graph; // 2. constructing a decoder for a node; // 3. constructing a decoder for the whole graph naively (op test case) -void GgmlOvDecoder::set_input_output(ggml_tensor* node, bool naive) { +void GgmlOvDecoder::set_input_output(ggml_tensor * node, bool naive) { std::string node_name; if (node->op == GGML_OP_SET_ROWS) { // SET_ROWS updates the tensor in place. For later ov op that uses the @@ -109,7 +116,7 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, bool naive) { m_outputs[node_name] = node; for (int i = 0; i < GGML_MAX_SRC; i++) { - auto* src = node->src[i]; + auto * src = node->src[i]; if (src == nullptr) { continue; } @@ -128,7 +135,7 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, bool naive) { } } else if (!m_node && !src->view_src) { - ggml_backend_buffer* buffer = src->buffer; + ggml_backend_buffer * buffer = src->buffer; if (buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY || src->flags & GGML_TENSOR_FLAG_INPUT) { // GGML_BACKEND_BUFFER_USAGE_ANY are kv caches @@ -236,8 +243,8 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, bool naive) { } case GGML_OP_VIEW: { if (node->src[0]->op == GGML_OP_VIEW) { - auto* src = node->src[0]; - auto* view_src = src->view_src; + auto * src = node->src[0]; + auto * view_src = src->view_src; if (view_src->ne[1] != src->ne[2]) { throw std::runtime_error("Unsupported VIEW case"); } @@ -250,7 +257,7 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, bool naive) { } } -int extract_layer_from_name(const std::string& name) { +int extract_layer_from_name(const std::string & name) { size_t pos1 = name.find("_l"); assert(pos1 != std::string::npos); pos1 += 2; @@ -265,10 +272,10 @@ int extract_layer_from_name(const std::string& name) { void GgmlOvDecoder::set_llm_params() { for (int i = 0; i < m_cgraph->n_nodes; i++) { - auto* node = m_cgraph->nodes[i]; + auto * node = m_cgraph->nodes[i]; std::string name = std::string(node->name); if (node->op == GGML_OP_FLASH_ATTN_EXT) { - auto* cache_k = node->src[1]; + auto * cache_k = node->src[1]; cache_k = cache_k->view_src ? 
cache_k->view_src : cache_k; int layer = extract_layer_from_name(cache_k->name); @@ -290,7 +297,7 @@ void GgmlOvDecoder::set_llm_params() { } } -ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor* src) const { +ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor * src) const { auto name = std::string(src->name); ov::PartialShape input_shape; if (name == "inp_tokens" || name == "inp_pos") { @@ -323,7 +330,7 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor* src) co } else { input_shape = ov::PartialShape{1, -1, m_num_heads_kv, m_head_size}; } - } else if (const auto* op = get_tensor_used_op(src); op && op->op == GGML_OP_SET_ROWS) { + } else if (const auto * op = get_tensor_used_op(src); op && op->op == GGML_OP_SET_ROWS) { input_shape = ov::PartialShape{1, 1, m_is_static ? 1 : -1}; } else if (src->op == GGML_OP_VIEW) { // This case is added to make test-backend-ops work @@ -342,9 +349,9 @@ void GgmlOvDecoder::add_extra_inputs() { // Update: not used anymore after the optimization of making kvcache dynamic (but breaks iSWA models) int64_t attention_size = -1; int64_t attention_size_swa = -1; - for (const auto& node : m_nodes) { + for (const auto & node : m_nodes) { if (node->op == GGML_OP_FLASH_ATTN_EXT) { - auto* mask = node->src[3]; + auto * mask = node->src[3]; std::string mask_name(mask->name); if (mask_name.find("KQ_mask") != 0) { throw std::runtime_error("Unexpected flash attention node: " + std::string(mask->name)); @@ -357,7 +364,7 @@ void GgmlOvDecoder::add_extra_inputs() { } } - auto create_attention_size_input = [this](const std::string& name, int64_t size) { + auto create_attention_size_input = [this](const std::string & name, int64_t size) { auto param_node = std::make_shared(ov::element::i64, ov::Shape{1}); param_node->set_friendly_name(name); param_node->output(0).get_tensor().set_names({name}); @@ -374,12 +381,12 @@ void GgmlOvDecoder::add_extra_inputs() { } } -const ggml_tensor* GgmlOvDecoder::get_tensor_used_op(const ggml_tensor* tensor) const { +const ggml_tensor * GgmlOvDecoder::get_tensor_used_op(const ggml_tensor * tensor) const { if (tensor == nullptr) { return nullptr; } for (int i = 0; i < m_cgraph->n_nodes; i++) { - const auto* node = m_cgraph->nodes[i]; + const auto * node = m_cgraph->nodes[i]; for (int j = 0; j < GGML_MAX_SRC; j++) { if (node->src[j] == tensor) { return node; @@ -389,11 +396,11 @@ const ggml_tensor* GgmlOvDecoder::get_tensor_used_op(const ggml_tensor* tensor) return nullptr; } -const ggml_tensor* GgmlOvDecoder::get_tensor_from_name(const std::string& name) const { +const ggml_tensor * GgmlOvDecoder::get_tensor_from_name(const std::string & name) const { for (int i = 0; i < m_cgraph->n_nodes; i++) { - const auto* node = m_cgraph->nodes[i]; + const auto * node = m_cgraph->nodes[i]; for (int j = 0; j < GGML_MAX_SRC; j++) { - const auto* src = node->src[j]; + const auto * src = node->src[j]; if (src == nullptr) { break; } @@ -407,7 +414,7 @@ const ggml_tensor* GgmlOvDecoder::get_tensor_from_name(const std::string& name) std::map GgmlOvDecoder::get_kv_param_res_names() const { std::map kv_param_res_names; - for (const auto& name : m_kv_names) { + for (const auto & name : m_kv_names) { if (name.find("cache_k") == 0 || name.find("cache_v") == 0) { kv_param_res_names[name] = name; } @@ -416,21 +423,22 @@ std::map GgmlOvDecoder::get_kv_param_res_names() const } std::map> GgmlOvDecoder::create_weight_nodes( - struct ggml_cgraph* cgraph, std::map types_to_requantize) { + ggml_cgraph * cgraph, + 
std::map types_to_requantize) { std::map> model_weights; static std::mutex weights_mutex; - auto* nodes = cgraph->nodes; + auto * nodes = cgraph->nodes; auto n_nodes = cgraph->n_nodes; - std::for_each(std::execution::par, nodes, nodes + n_nodes, [&](ggml_tensor* node) { + std::for_each(std::execution::par, nodes, nodes + n_nodes, [&](ggml_tensor * node) { for (int i = 0; i < GGML_MAX_SRC; i++) { - auto* src = node->src[i]; + auto * src = node->src[i]; if (src == nullptr) { continue; } std::string src_name(src->name); if (!src->view_src) { - ggml_backend_buffer* buffer = src->buffer; + ggml_backend_buffer * buffer = src->buffer; if (buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS || ggml_is_quantized(src->type)) { bool should_create = false; { @@ -458,17 +466,10 @@ std::map> GgmlOvDecoder::create_weight_no return model_weights; } -std::shared_ptr GgmlOvDecoder::create_weight_node(ggml_tensor* tensor, +std::shared_ptr GgmlOvDecoder::create_weight_node(ggml_tensor * tensor, std::optional requant_type) { - std::set weight_types = {GGML_TYPE_F32, - GGML_TYPE_F16, - GGML_TYPE_BF16, - GGML_TYPE_Q8_0, - GGML_TYPE_Q4_0, - GGML_TYPE_Q4_1, - GGML_TYPE_Q4_K, - GGML_TYPE_Q5_K, - GGML_TYPE_Q6_K}; + std::set weight_types = {GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_BF16, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0, + GGML_TYPE_Q4_1, GGML_TYPE_Q4_K, GGML_TYPE_Q5_K, GGML_TYPE_Q6_K}; if (weight_types.find(tensor->type) == weight_types.end()) { throw std::runtime_error("Unexpected weight tensor type: " + std::string(tensor->name) + " with type " + ggml_type_name(tensor->type)); @@ -495,9 +496,8 @@ std::shared_ptr GgmlOvDecoder::create_weight_node(ggml_tensor* tensor, } // Quantized case - OPENVINO_ASSERT( - tensor->extra == nullptr, - "Unsupported weight tensor: " + std::string(tensor->name) + " Possibly this is a repacked quantized weights"); + OPENVINO_ASSERT(tensor->extra == nullptr, "Unsupported weight tensor: " + std::string(tensor->name) + + " Possibly this is a repacked quantized weights"); if (requant_type.has_value()) { return requantize(tensor, requant_type.value()); @@ -518,11 +518,8 @@ std::shared_ptr GgmlOvDecoder::create_weight_node(ggml_tensor* tensor, weights_per_block = 32; } - OPENVINO_ASSERT(node_shape.back() % weights_per_block == 0, - "[load_gguf] tensor ", - tensor->name, - " has incompatible last dim shape: ", - node_shape.back()); + OPENVINO_ASSERT(node_shape.back() % weights_per_block == 0, "[load_gguf] tensor ", tensor->name, + " has incompatible last dim shape: ", node_shape.back()); ov::Tensor weights(weight_type, node_shape); // For scales and biases @@ -557,7 +554,7 @@ std::shared_ptr GgmlOvDecoder::create_weight_node(ggml_tensor* tensor, return weight_node.get_node_shared_ptr(); } -void GgmlOvDecoder::dump_cgraph(const struct ggml_cgraph* cgraph, std::string& filename) { +void GgmlOvDecoder::dump_cgraph(const ggml_cgraph * cgraph, std::string & filename) { std::ofstream file(filename); if (!file.is_open()) { std::cerr << "Failed to open file" << std::endl; @@ -576,7 +573,7 @@ void GgmlOvDecoder::dump_cgraph(const struct ggml_cgraph* cgraph, std::string& f << std::setw(50) << "stride" << "\n"; for (int i = 0; i < cgraph->n_nodes; i++) { - struct ggml_tensor * node = cgraph->nodes[i]; + ggml_tensor * node = cgraph->nodes[i]; file << " - " << std::setw(3) << i << ": [ " << std::setw(5) << node->ne[0] << ", " @@ -614,7 +611,7 @@ void GgmlOvDecoder::dump_cgraph(const struct ggml_cgraph* cgraph, std::string& f file << "n_leafs = " << cgraph->n_leafs << "\n"; for (int i = 0; i < cgraph->n_leafs; 
i++) { - struct ggml_tensor * node = cgraph->leafs[i]; + ggml_tensor * node = cgraph->leafs[i]; file << " - " << std::setw(3) << i << ": [ " << std::setw(5) << node->ne[0] << ", " @@ -628,10 +625,10 @@ void GgmlOvDecoder::dump_cgraph(const struct ggml_cgraph* cgraph, std::string& f file.close(); } -void print_tensor_address_map(const struct ggml_cgraph* cgraph) { - std::map> address_map; +void print_tensor_address_map(const ggml_cgraph * cgraph) { + std::map> address_map; for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) { - auto* node = cgraph->nodes[node_n]; + auto * node = cgraph->nodes[node_n]; if (node->data) { auto it = address_map.find(node->data); if (it == address_map.end()) { @@ -640,16 +637,16 @@ void print_tensor_address_map(const struct ggml_cgraph* cgraph) { address_map[node->data].push_back(node->name); } } - for (const auto& pair : address_map) { + for (const auto & pair : address_map) { std::cout << "Address: " << pair.first << std::endl; - for (const auto& name : pair.second) { + for (const auto & name : pair.second) { std::cout << name << " ; "; } std::cout << std::endl << std::endl; } } -std::vector GgmlOvDecoder::get_shape(const ggml_tensor* tensor) { +std::vector GgmlOvDecoder::get_shape(const ggml_tensor * tensor) { std::vector shape; for (int i = GGML_MAX_DIMS - 2; i >= 0; --i) { shape.push_back(static_cast(tensor->ne[i])); @@ -657,7 +654,7 @@ std::vector GgmlOvDecoder::get_shape(const ggml_tensor* tensor) { return shape; } -std::vector GgmlOvDecoder::get_stride(const ggml_tensor* tensor) { +std::vector GgmlOvDecoder::get_stride(const ggml_tensor * tensor) { std::vector stride; for (int i = GGML_MAX_DIMS - 2; i >= 0; --i) { stride.push_back(static_cast(tensor->nb[i])); @@ -665,7 +662,7 @@ std::vector GgmlOvDecoder::get_stride(const ggml_tensor* tensor) { return stride; } -ov::element::Type GgmlOvDecoder::get_ov_type(const ggml_tensor* tensor) { +ov::element::Type GgmlOvDecoder::get_ov_type(const ggml_tensor * tensor) { switch (tensor->type) { case GGML_TYPE_F64: return ov::element::f64; @@ -688,15 +685,15 @@ ov::element::Type GgmlOvDecoder::get_ov_type(const ggml_tensor* tensor) { } } -ov::PartialShape GgmlOvDecoder::get_input_shape(const std::string& name) const { +ov::PartialShape GgmlOvDecoder::get_input_shape(const std::string & name) const { return ov::PartialShape(get_shape(m_inputs.at(name))); } -std::vector GgmlOvDecoder::get_input_stride(const std::string& name) const { +std::vector GgmlOvDecoder::get_input_stride(const std::string & name) const { return get_stride(m_inputs.at(name)); } -ov::element::Type GgmlOvDecoder::get_input_type(const std::string& name) const { +ov::element::Type GgmlOvDecoder::get_input_type(const std::string & name) const { return get_ov_type(m_inputs.at(name)); } @@ -704,7 +701,7 @@ size_t GgmlOvDecoder::get_input_size() const { return m_input_names.size(); } -std::string& GgmlOvDecoder::get_input_name(size_t index) const { +std::string & GgmlOvDecoder::get_input_name(size_t index) const { m_name = m_input_names[index]; return m_name; } @@ -713,19 +710,19 @@ std::vector GgmlOvDecoder::get_input_names() const { return m_input_names; } -std::vector GgmlOvDecoder::get_output_stride(const std::string& name) const { +std::vector GgmlOvDecoder::get_output_stride(const std::string & name) const { return get_stride(m_outputs.at(name)); } -ov::PartialShape GgmlOvDecoder::get_output_shape(const std::string& name) const { +ov::PartialShape GgmlOvDecoder::get_output_shape(const std::string & name) const { return 
ov::PartialShape(get_shape(m_outputs.at(name))); } -ov::element::Type GgmlOvDecoder::get_output_type(const std::string& name) const { +ov::element::Type GgmlOvDecoder::get_output_type(const std::string & name) const { return get_ov_type(m_outputs.at(name)); } -std::string& GgmlOvDecoder::get_output_name(size_t index) const { +std::string & GgmlOvDecoder::get_output_name(size_t index) const { m_name = std::string(m_output_names[index]); return m_name; } @@ -734,35 +731,28 @@ std::vector GgmlOvDecoder::get_output_names() const { return m_output_names; } -const std::string& GgmlOvDecoder::get_op_name() const { +const std::string & GgmlOvDecoder::get_op_name() const { return m_op_name; } -int32_t* GgmlOvDecoder::get_input_op_params(const std::string& name) const { +int32_t * GgmlOvDecoder::get_input_op_params(const std::string & name) const { return m_inputs.at(name)->op_params; } -int32_t* GgmlOvDecoder::get_output_op_params(const std::string& name) const { +int32_t * GgmlOvDecoder::get_output_op_params(const std::string & name) const { return m_outputs.at(name)->op_params; } void GgmlOvDecoder::visit_subgraph(std::function)> node_visitor) const { - for (const auto& node : m_nodes) { - auto decoder = std::make_shared(node, - m_cgraph, - m_is_static, - m_is_first_token, - m_context_size, - m_context_size_swa, - m_num_heads, - m_num_heads_kv, - m_head_size, - m_swa_layers); + for (const auto & node : m_nodes) { + auto decoder = + std::make_shared(node, m_cgraph, m_is_static, m_is_first_token, m_context_size, + m_context_size_swa, m_num_heads, m_num_heads_kv, m_head_size, m_swa_layers); node_visitor(decoder); } } -const std::string& GgmlOvDecoder::get_op_type() const { +const std::string & GgmlOvDecoder::get_op_type() const { static const std::map ops = { {GGML_OP_NONE, "GGML_OP_NONE" }, {GGML_OP_ACC, "GGML_OP_ACC" }, diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index 35e79ecefc..884151d32e 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -1,5 +1,9 @@ #pragma once +#include "ggml-quants.hpp" +#include "ggml.h" +#include "openvino/decoder.hpp" + #include #include #include @@ -7,98 +11,99 @@ #include #include -#include "ggml-quants.hpp" -#include "ggml.h" -#include "openvino/decoder.hpp" - class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { public: // Graph decoder - GgmlOvDecoder(struct ggml_cgraph* cgraph, std::map>& model_weights, - bool is_static, bool is_first_token); + GgmlOvDecoder(ggml_cgraph * cgraph, + std::map> & model_weights, + bool is_static, + bool is_first_token); // Node decoder, called in GgmlOvDecoder::visit_subgraph - GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgraph, bool is_static, bool is_first_token, - int context_size, int context_size_swa, int num_heads, int num_heads_kv, int head_size, - const std::vector& swa_layers); + GgmlOvDecoder(ggml_tensor * node, + ggml_cgraph * cgraph, + bool is_static, + bool is_first_token, + int context_size, + int context_size_swa, + int num_heads, + int num_heads_kv, + int head_size, + const std::vector & swa_layers); // Naive graph decoder - GgmlOvDecoder(struct ggml_cgraph* cgraph, std::map>& model_weights); + GgmlOvDecoder(ggml_cgraph * cgraph, std::map> & model_weights); - virtual ov::Any get_attribute(const std::string& name) const override { + virtual ov::Any get_attribute(const std::string & name) const override { return nullptr; GGML_UNUSED(name); } - virtual ov::PartialShape get_input_shape(const std::string& 
name) const override; + virtual ov::PartialShape get_input_shape(const std::string & name) const override; - virtual std::vector get_input_stride(const std::string& name) const override; + virtual std::vector get_input_stride(const std::string & name) const override; - virtual ov::element::Type get_input_type(const std::string& name) const override; + virtual ov::element::Type get_input_type(const std::string & name) const override; virtual size_t get_input_size() const override; virtual void get_input_node(size_t input_port_idx, - std::string& producer_name, - std::string& producer_output_port_name, - size_t& producer_output_port_index) const override { + std::string & producer_name, + std::string & producer_output_port_name, + size_t & producer_output_port_index) const override { GGML_UNUSED(input_port_idx); GGML_UNUSED(producer_name); GGML_UNUSED(producer_output_port_name); GGML_UNUSED(producer_output_port_index); } - virtual std::string& get_input_name(size_t index) const override; + virtual std::string & get_input_name(size_t index) const override; virtual std::vector get_input_names() const override; - virtual ov::PartialShape get_output_shape(const std::string& name) const override; + virtual ov::PartialShape get_output_shape(const std::string & name) const override; - virtual std::vector get_output_stride(const std::string& name) const override; + virtual std::vector get_output_stride(const std::string & name) const override; - virtual ov::element::Type get_output_type(const std::string& name) const override; + virtual ov::element::Type get_output_type(const std::string & name) const override; - virtual int32_t* get_input_op_params(const std::string& name) const override; + virtual int32_t * get_input_op_params(const std::string & name) const override; - virtual int32_t* get_output_op_params(const std::string& name) const override; + virtual int32_t * get_output_op_params(const std::string & name) const override; - virtual std::string& get_output_name(size_t index) const override; + virtual std::string & get_output_name(size_t index) const override; virtual std::vector get_output_names() const override; - virtual const std::string& get_op_type() const override; + virtual const std::string & get_op_type() const override; - virtual const std::string& get_op_name() const override; + virtual const std::string & get_op_name() const override; virtual void visit_subgraph(std::function)> node_visitor) const override; - const ggml_tensor* get_input_ggml_tensor(const std::string& name) const { - return m_inputs.at(name); - } + const ggml_tensor * get_input_ggml_tensor(const std::string & name) const { return m_inputs.at(name); } - const ggml_tensor* get_output_ggml_tensor(const std::string& name) const { - return m_outputs.at(name); - } + const ggml_tensor * get_output_ggml_tensor(const std::string & name) const { return m_outputs.at(name); } - virtual int get_op_case() const override { - return m_op_case; - } + virtual int get_op_case() const override { return m_op_case; } - virtual const std::map>& get_model_inputs() const override { + virtual const std::map> & get_model_inputs() const override { return m_model_inputs; } - virtual const std::map>& get_model_extra_inputs() const override { + + virtual const std::map> & get_model_extra_inputs() const override { return m_model_extra_inputs; } - virtual const std::map>& get_model_extra_input_values() const { + + virtual const std::map> & get_model_extra_input_values() const { return m_model_extra_input_values; } - virtual const std::map>& 
get_model_weights() const override { + + virtual const std::map> & get_model_weights() const override { return m_model_weights; } - virtual const std::vector& get_model_output_names() const override { - return m_model_output_names; - } + + virtual const std::vector & get_model_output_names() const override { return m_model_output_names; } virtual int get_context_size() const override { return m_context_size; } @@ -114,7 +119,7 @@ public: virtual int get_head_size() const override { return m_head_size; } - virtual int32_t* get_rope_params() const override { return m_rope_params; } + virtual int32_t * get_rope_params() const override { return m_rope_params; } virtual std::map get_kv_param_res_names() const override; @@ -122,36 +127,39 @@ public: virtual bool is_first_token() const override { return m_is_first_token; } - ov::PartialShape get_graph_input_shape(const ggml_tensor* src) const; + ov::PartialShape get_graph_input_shape(const ggml_tensor * src) const; - static void dump_cgraph(const struct ggml_cgraph* cgraph, std::string& filename); + static void dump_cgraph(const ggml_cgraph * cgraph, std::string & filename); - static std::shared_ptr create_weight_node(ggml_tensor* tensor, + static std::shared_ptr create_weight_node(ggml_tensor * tensor, std::optional requant_type = std::nullopt); - static std::map> create_weight_nodes( - struct ggml_cgraph* cgraph, std::map types_to_requantize = {}); - const ggml_tensor* get_tensor_used_op(const ggml_tensor* tensor) const; - const ggml_tensor* get_tensor_from_name(const std::string& name) const; + static std::map> create_weight_nodes( + ggml_cgraph * cgraph, + std::map types_to_requantize = {}); + + const ggml_tensor * get_tensor_used_op(const ggml_tensor * tensor) const; + + const ggml_tensor * get_tensor_from_name(const std::string & name) const; void clear_model_weights() { m_model_weights.clear(); } private: - void set_input_output(ggml_tensor* node, bool naive = false); + void set_input_output(ggml_tensor * node, bool naive = false); void add_extra_inputs(); - static std::vector get_shape(const ggml_tensor* tensor); - static std::vector get_stride(const ggml_tensor* tensor); - static ov::element::Type get_ov_type(const ggml_tensor* tensor); + static std::vector get_shape(const ggml_tensor * tensor); + static std::vector get_stride(const ggml_tensor * tensor); + static ov::element::Type get_ov_type(const ggml_tensor * tensor); // set context_size, num_heads, etc void set_llm_params(); - struct ggml_cgraph* m_cgraph = nullptr; - ggml_tensor* m_node = nullptr; - std::vector m_nodes; - std::map m_inputs; + ggml_cgraph * m_cgraph = nullptr; + ggml_tensor * m_node = nullptr; + std::vector m_nodes; + std::map m_inputs; std::vector m_input_names; - std::map m_outputs; + std::map m_outputs; std::vector m_output_names; std::string m_op_name; mutable std::string m_name; @@ -168,12 +176,12 @@ private: int m_num_heads; int m_num_heads_kv; int m_head_size; - int32_t* m_rope_params; + int32_t * m_rope_params; std::vector m_kv_names; bool m_is_static = false; bool m_is_first_token; }; -void print_tensor_address_map(const struct ggml_cgraph* cgraph); +void print_tensor_address_map(const ggml_cgraph * cgraph); -int extract_layer_from_name(const std::string& name); +int extract_layer_from_name(const std::string & name); diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp index 75c2a76c54..c5acb1ea26 100644 --- a/ggml/src/ggml-openvino/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino.cpp @@ -1,5 +1,11 @@ 
#include "ggml-openvino.h" +#include "ggml-backend-impl.h" +#include "ggml-backend.h" +#include "ggml-impl.h" +#include "ggml-openvino/utils.h" +#include "ggml.h" + #include #include #include @@ -7,39 +13,36 @@ #include #include -#include "ggml-backend-impl.h" -#include "ggml-backend.h" -#include "ggml-impl.h" -#include "ggml-openvino/utils.h" -#include "ggml.h" - #define GGML_OPENVINO_MAX_STREAMS 8 struct ggml_backend_openvino_context { - int device; // the device ID currently in use - std::string name; // context Name - std::string description; // context description + int device; // the device ID currently in use + std::string name; // context Name + std::string description; // context description // OpenVINO core components - ov::Core core; // OpenVINO core interface - std::shared_ptr model; // compiled Model - ov::InferRequest infer_request; // inference Request + ov::Core core; // OpenVINO core interface + std::shared_ptr model; // compiled Model + ov::InferRequest infer_request; // inference Request // OpenVINO Multi-stream support - static const int MAX_STREAMS = 8; // define the maximum number of flows - std::vector streams; // used to support multi-stream reasoning - int current_stream; // the currently active stream index + static const int MAX_STREAMS = 8; // define the maximum number of flows + std::vector streams; // used to support multi-stream reasoning + int current_stream; // the currently active stream index // state Management - bool is_initialized; // initialize + bool is_initialized; // initialize - ggml_backend_openvino_context() - : device(0), name("OpenVINO"), description("OpenVINO Backend Context"), - current_stream(0), is_initialized(false) {} + ggml_backend_openvino_context() : + device(0), + name("OpenVINO"), + description("OpenVINO Backend Context"), + current_stream(0), + is_initialized(false) {} }; static void ggml_backend_openvino_free(ggml_backend_t backend) { - ggml_backend_openvino_context * ctx = (ggml_backend_openvino_context *)backend->context; + ggml_backend_openvino_context * ctx = (ggml_backend_openvino_context *) backend->context; delete ctx; delete backend; } @@ -49,8 +52,7 @@ static const char * ggml_backend_openvino_get_name(ggml_backend_t backend) { GGML_UNUSED(backend); } -static enum ggml_status -ggml_backend_openvino_graph_compute(ggml_backend_t backend, struct ggml_cgraph *cgraph) { +static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { openvino_frontend_compute(backend, cgraph); return GGML_STATUS_SUCCESS; @@ -78,7 +80,8 @@ int ggml_backend_openvino_get_device_count() { } static ggml_guid_t ggml_backend_openvino_guid(void) { - static ggml_guid guid = { 0x12, 0xa8, 0xae, 0xf4, 0xc0, 0x1e, 0x61, 0x97, 0x8f, 0xeb, 0x33, 0x04, 0xa1, 0x33, 0x51, 0x2d }; + static ggml_guid guid = {0x12, 0xa8, 0xae, 0xf4, 0xc0, 0x1e, 0x61, 0x97, + 0x8f, 0xeb, 0x33, 0x04, 0xa1, 0x33, 0x51, 0x2d}; return &guid; } @@ -95,7 +98,7 @@ GGML_BACKEND_API ggml_backend_t ggml_backend_openvino_init(int device) { return nullptr; } - ggml_backend_t openvino_backend = new ggml_backend { + ggml_backend_t openvino_backend = new ggml_backend{ /* .guid = */ ggml_backend_openvino_guid(), /* .interface = */ ggml_backend_openvino_interface, /* .device = */ ggml_backend_reg_dev_get(ggml_backend_openvino_reg(), device), @@ -134,15 +137,15 @@ struct ggml_backend_openvino_buffer_type_context { }; static const char * ggml_backend_openvino_buffer_type_get_name(ggml_backend_buffer_type_t buft) { - 
ggml_backend_openvino_buffer_type_context * ctx = (ggml_backend_openvino_buffer_type_context *)buft->context; + ggml_backend_openvino_buffer_type_context * ctx = (ggml_backend_openvino_buffer_type_context *) buft->context; return ctx->name.c_str(); } + static bool ggml_backend_buft_is_openvino(ggml_backend_buffer_type_t buft) { return buft->iface.get_name == ggml_backend_openvino_buffer_type_get_name; } - static const char * ggml_backend_openvino_split_buffer_type_get_name(ggml_backend_buffer_type_t buft) { return GGML_OPENVINO_NAME "_Split"; @@ -160,12 +163,12 @@ struct ggml_backend_openvino_device_context { }; static const char * ggml_backend_openvino_device_get_name(ggml_backend_dev_t dev) { - ggml_backend_openvino_device_context * ctx = (ggml_backend_openvino_device_context *)dev->context; + ggml_backend_openvino_device_context * ctx = (ggml_backend_openvino_device_context *) dev->context; return ctx->name.c_str(); } static const char * ggml_backend_openvino_device_get_description(ggml_backend_dev_t dev) { - ggml_backend_openvino_device_context * ctx = (ggml_backend_openvino_device_context *)dev->context; + ggml_backend_openvino_device_context * ctx = (ggml_backend_openvino_device_context *) dev->context; return ctx->description.c_str(); } @@ -174,7 +177,7 @@ static void ggml_backend_openvino_device_get_memory(ggml_backend_dev_t dev, size GGML_ASSERT(dev->context != nullptr); GGML_ASSERT(free != nullptr); GGML_ASSERT(total != nullptr); - ggml_backend_openvino_device_context * ctx = (ggml_backend_openvino_device_context *)dev->context; + ggml_backend_openvino_device_context * ctx = (ggml_backend_openvino_device_context *) dev->context; GGML_ASSERT(ctx->device >= 0); // ggml_openvino_set_device(ctx->device); *total = 1; @@ -187,9 +190,9 @@ static enum ggml_backend_dev_type ggml_backend_openvino_device_get_type(ggml_bac } static void ggml_backend_openvino_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) { - props->name = ggml_backend_openvino_device_get_name(dev); + props->name = ggml_backend_openvino_device_get_name(dev); props->description = ggml_backend_openvino_device_get_description(dev); - props->type = ggml_backend_openvino_device_get_type(dev); + props->type = ggml_backend_openvino_device_get_type(dev); ggml_backend_openvino_device_get_memory(dev, &props->memory_free, &props->memory_total); bool host_buffer = getenv("GGML_OPENVINO_NO_PINNED") == nullptr; @@ -209,12 +212,12 @@ static void ggml_backend_openvino_device_get_props(ggml_backend_dev_t dev, ggml_ static ggml_backend_t ggml_backend_openvino_device_init(ggml_backend_dev_t dev, const char * params) { GGML_UNUSED(params); - ggml_backend_openvino_device_context * ctx = (ggml_backend_openvino_device_context *)dev->context; + ggml_backend_openvino_device_context * ctx = (ggml_backend_openvino_device_context *) dev->context; return ggml_backend_openvino_init(ctx->device); } static ggml_backend_buffer_type_t ggml_backend_openvino_device_get_buffer_type(ggml_backend_dev_t dev) { - ggml_backend_openvino_device_context * ctx = (ggml_backend_openvino_device_context *)dev->context; + ggml_backend_openvino_device_context * ctx = (ggml_backend_openvino_device_context *) dev->context; return ggml_backend_openvino_buffer_type(ctx->device); } @@ -223,7 +226,10 @@ static ggml_backend_buffer_type_t ggml_backend_openvino_device_get_host_buffer_t return ggml_backend_openvino_host_buffer_type(); } -static ggml_backend_buffer_t ggml_backend_openvino_device_buffer_from_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, 
size_t max_tensor_size) { +static ggml_backend_buffer_t ggml_backend_openvino_device_buffer_from_ptr(ggml_backend_dev_t dev, + void * ptr, + size_t size, + size_t max_tensor_size) { GGML_UNUSED(dev); GGML_UNUSED(ptr); GGML_UNUSED(size); @@ -231,7 +237,10 @@ static ggml_backend_buffer_t ggml_backend_openvino_device_buffer_from_ptr(ggml_b return nullptr; } -static ggml_backend_buffer_t ggml_backend_openvino_device_buffer_from_host_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) { +static ggml_backend_buffer_t ggml_backend_openvino_device_buffer_from_host_ptr(ggml_backend_dev_t dev, + void * ptr, + size_t size, + size_t max_tensor_size) { GGML_UNUSED(dev); GGML_UNUSED(ptr); GGML_UNUSED(size); @@ -239,7 +248,7 @@ static ggml_backend_buffer_t ggml_backend_openvino_device_buffer_from_host_ptr(g return nullptr; } -static bool is_op_unsupported_case(const ggml_tensor* op) { +static bool is_op_unsupported_case(const ggml_tensor * op) { switch (op->op) { case GGML_OP_SOFT_MAX: { if (op->src[2] != nullptr) { @@ -248,9 +257,9 @@ static bool is_op_unsupported_case(const ggml_tensor* op) { } float scale = 1.0f; float max_bias = 0.0f; - const auto* op_params = op->op_params; - memcpy(&scale, (const float*) op_params + 0, sizeof(float)); - memcpy(&max_bias, (const float*) op_params + 1, sizeof(float)); + const auto * op_params = op->op_params; + memcpy(&scale, (const float *) op_params + 0, sizeof(float)); + memcpy(&max_bias, (const float *) op_params + 1, sizeof(float)); if (max_bias > 0) { GGML_LOG_WARN("OpenVINO backend does not support SOFT_MAX with max_bias > 0\n"); return true; @@ -265,10 +274,10 @@ static bool is_op_unsupported_case(const ggml_tensor* op) { float scale = 1.0f; float max_bias = 0.0f; float logit_softcap = 0.0f; - const auto* op_params = op->op_params; - memcpy(&scale, (const float*) op_params + 0, sizeof(float)); - memcpy(&max_bias, (const float*) op_params + 1, sizeof(float)); - memcpy(&logit_softcap, (const float*) op_params + 2, sizeof(float)); + const auto * op_params = op->op_params; + memcpy(&scale, (const float *) op_params + 0, sizeof(float)); + memcpy(&max_bias, (const float *) op_params + 1, sizeof(float)); + memcpy(&logit_softcap, (const float *) op_params + 2, sizeof(float)); if (max_bias > 0) { GGML_LOG_WARN("OpenVINO backend does not support FLASH_ATTN_EXT with max_bias > 0\n"); return true; @@ -303,7 +312,7 @@ static bool is_op_unsupported_case(const ggml_tensor* op) { break; } case GGML_OP_ROPE: { - const int32_t* op_params = op->op_params; + const int32_t * op_params = op->op_params; const int n_dims = op_params[1]; const int mode = op_params[2]; if (mode == GGML_ROPE_TYPE_MROPE || mode == GGML_ROPE_TYPE_VISION) { @@ -311,8 +320,7 @@ static bool is_op_unsupported_case(const ggml_tensor* op) { return true; } if (n_dims != 0.0f && n_dims != op->src[0]->ne[0]) { - GGML_LOG_WARN("OpenVINO backend does not support ROPE with n_dims %d != src[0]->ne[0] %ld\n", - n_dims, + GGML_LOG_WARN("OpenVINO backend does not support ROPE with n_dims %d != src[0]->ne[0] %ld\n", n_dims, op->src[0]->ne[0]); return true; } @@ -333,8 +341,7 @@ static bool is_op_unsupported_case(const ggml_tensor* op) { GGML_LOG_WARN( "OpenVINO backend does not support ROPE with src[0]->view_src->ne[1] %ld != src[0]->ne[2] " "%ld\n", - op->src[0]->view_src->ne[1], - op->src[0]->ne[2]); + op->src[0]->view_src->ne[1], op->src[0]->ne[2]); return true; } } @@ -346,39 +353,19 @@ static bool is_op_unsupported_case(const ggml_tensor* op) { return false; } -static bool 
ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor* op) { +static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) { GGML_ASSERT(dev->reg != nullptr); - static std::set supported_types{GGML_TYPE_F32, - GGML_TYPE_F16, - GGML_TYPE_BF16, - GGML_TYPE_I64, - GGML_TYPE_I32, - GGML_TYPE_Q4_0, - GGML_TYPE_Q4_1, - GGML_TYPE_Q4_K, - GGML_TYPE_Q5_K, - GGML_TYPE_Q8_0, - GGML_TYPE_Q6_K}; + static std::set supported_types{GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_BF16, GGML_TYPE_I64, + GGML_TYPE_I32, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1, GGML_TYPE_Q4_K, + GGML_TYPE_Q5_K, GGML_TYPE_Q8_0, GGML_TYPE_Q6_K}; - static const std::set supported_ops{GGML_OP_NONE, - GGML_OP_ADD, - GGML_OP_MUL, - GGML_OP_MUL_MAT, - GGML_OP_VIEW, - GGML_OP_CONT, - GGML_OP_RESHAPE, - GGML_OP_PERMUTE, - GGML_OP_TRANSPOSE, - GGML_OP_GET_ROWS, - GGML_OP_ROPE, - GGML_OP_RMS_NORM, - GGML_OP_SCALE, + static const std::set supported_ops{GGML_OP_NONE, GGML_OP_ADD, GGML_OP_MUL, GGML_OP_MUL_MAT, GGML_OP_VIEW, + GGML_OP_CONT, GGML_OP_RESHAPE, GGML_OP_PERMUTE, GGML_OP_TRANSPOSE, + GGML_OP_GET_ROWS, GGML_OP_ROPE, GGML_OP_RMS_NORM, GGML_OP_SCALE, // softmax is not updated due to replaced by flash_attn_ext // GGML_OP_SOFT_MAX, - GGML_OP_SET_ROWS, - GGML_OP_FLASH_ATTN_EXT, - GGML_OP_CPY}; + GGML_OP_SET_ROWS, GGML_OP_FLASH_ATTN_EXT, GGML_OP_CPY}; static const std::set supported_unary_ops{ GGML_UNARY_OP_SILU, }; @@ -422,7 +409,7 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con return false; } for (int i = 0; i < GGML_MAX_SRC; i++) { - auto* src = op->src[i]; + auto * src = op->src[i]; if (src == nullptr) { break; } @@ -483,13 +470,13 @@ static size_t ggml_backend_openvino_reg_get_device_count(ggml_backend_reg_t reg) GGML_UNUSED(reg); // TODO - ggml_backend_openvino_reg_context * ctx = (ggml_backend_openvino_reg_context *)reg->context; + ggml_backend_openvino_reg_context * ctx = (ggml_backend_openvino_reg_context *) reg->context; return ctx->devices.size(); } static ggml_backend_dev_t ggml_backend_openvino_reg_get_device(ggml_backend_reg_t reg, size_t index) { - ggml_backend_openvino_reg_context * ctx = (ggml_backend_openvino_reg_context *)reg->context; + ggml_backend_openvino_reg_context * ctx = (ggml_backend_openvino_reg_context *) reg->context; GGML_ASSERT(index < ctx->devices.size()); return ctx->devices[index]; // GGML_ASSERT(index == 0); @@ -509,7 +496,7 @@ static ggml_backend_dev_t ggml_backend_openvino_reg_get_device(ggml_backend_reg_ static void * ggml_backend_openvino_get_proc_address(ggml_backend_reg_t reg, const char * name) { GGML_UNUSED(reg); if (strcmp(name, "ggml_backend_split_buffer_type") == 0) { - return (void *)ggml_backend_openvino_split_buffer_type; + return (void *) ggml_backend_openvino_split_buffer_type; } // if (strcmp(name, "ggml_backend_register_host_buffer") == 0) { // return (void *)ggml_backend_openvino_register_host_buffer; @@ -565,17 +552,16 @@ GGML_BACKEND_API ggml_backend_reg_t ggml_backend_openvino_reg(void) { // ggml_openvino_set_device(i); dev_ctx->description = ov::get_openvino_version().description; - ggml_backend_dev_t dev = new ggml_backend_device { - /* .interface = */ ggml_backend_openvino_device_interface, - /* .reg = */ ®, - /* .context = */ dev_ctx - }; + ggml_backend_dev_t dev = + new ggml_backend_device{/* .interface = */ ggml_backend_openvino_device_interface, + /* .reg = */ ®, + /* .context = */ dev_ctx}; ctx->devices.push_back(dev); } - reg = ggml_backend_reg{ /* .api_version = */ 
GGML_BACKEND_API_VERSION, - /* .iface = */ ggml_backend_openvino_reg_interface, - /* .context = */ ctx }; + reg = ggml_backend_reg{/* .api_version = */ GGML_BACKEND_API_VERSION, + /* .iface = */ ggml_backend_openvino_reg_interface, + /* .context = */ ctx}; } initialized = true; diff --git a/ggml/src/ggml-openvino/ggml-quants.cpp b/ggml/src/ggml-openvino/ggml-quants.cpp index 017d2ad28c..2076c3c75d 100644 --- a/ggml/src/ggml-openvino/ggml-quants.cpp +++ b/ggml/src/ggml-openvino/ggml-quants.cpp @@ -1,5 +1,9 @@ #include "ggml-quants.hpp" +#include "ggml-common.h" +#include "ggml-impl.h" +#include "ggml.h" + #include #include #include @@ -24,11 +28,7 @@ #include #include -#include "ggml-common.h" -#include "ggml-impl.h" -#include "ggml.h" - -void unpack_32_4(const uint8_t* data, uint8_t* dst) { +void unpack_32_4(const uint8_t * data, uint8_t * dst) { std::fill_n(dst, 16, 0); for (int j = 0; j < 16; ++j) { uint8_t x = (data[j] & 0x0F); @@ -44,18 +44,19 @@ void unpack_32_4(const uint8_t* data, uint8_t* dst) { // Extracts (weight, scales, biases) from Q4_0 tensors. // Data layout is: |16 bit scale|32 x 4bit weights|. -void extract_q4_0_data(const ggml_tensor* tensor, - ov::Tensor& weights_arr, - ov::Tensor& scales_arr, - ov::Tensor& biases_arr) { +void extract_q4_0_data(const ggml_tensor * tensor, + ov::Tensor & weights_arr, + ov::Tensor & scales_arr, + ov::Tensor & biases_arr) { const uint64_t bytes_per_block = 18; // 2 bytes scale, 32x0.5 byte weights - auto* data = static_cast(tensor->data); - auto* weights = static_cast(weights_arr.data()); - auto* scales = scales_arr.data::value_type>(); - auto* biases = biases_arr.data::value_type>(); + + auto * data = static_cast(tensor->data); + auto * weights = static_cast(weights_arr.data()); + auto * scales = scales_arr.data::value_type>(); + auto * biases = biases_arr.data::value_type>(); ov::parallel_for(scales_arr.get_size(), [&](size_t i) { - scales[i] = ov::float16::from_bits(*((uint16_t*)(data + i * bytes_per_block))); + scales[i] = ov::float16::from_bits(*((uint16_t *) (data + i * bytes_per_block))); biases[i] = ov::float16(-8.f * static_cast(scales[i])); unpack_32_4(data + i * bytes_per_block + 2, weights + i * 16); }); @@ -63,38 +64,40 @@ void extract_q4_0_data(const ggml_tensor* tensor, // Extracts (weight, scales, biases) from Q4_1 tensors. // Data layout is: |16 bit scale|16 bit bias|32 x 4bit weights|. 
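// Illustrative sketch (not part of the patch): decoding one Q4_0 block
// (18 bytes: fp16 scale + 32 x 4-bit quants) straight to floats. ggml keeps
// the quants in [0, 15] with an implicit offset of 8, which is why
// extract_q4_0_data() above materializes bias = -8 * scale:
//   scale * (q - 8) == scale * q + (-8 * scale)
// Q4_1 (below) is the same layout except the bias is stored explicitly as a
// second fp16 right after the scale.
#include <openvino/core/type/float16.hpp>
#include <cstdint>
#include <cstring>

static void dequant_q4_0_block(const uint8_t * block, float * out /* 32 floats */) {
    uint16_t bits;
    std::memcpy(&bits, block, sizeof(bits));
    const float scale = static_cast<float>(ov::float16::from_bits(bits));
    const uint8_t * qs = block + 2;  // 16 bytes, two quants per byte
    for (int j = 0; j < 16; ++j) {
        out[j]      = scale * (static_cast<float>(qs[j] & 0x0F) - 8.0f);  // low nibbles: elements 0..15
        out[j + 16] = scale * (static_cast<float>(qs[j] >> 4) - 8.0f);    // high nibbles: elements 16..31
    }
}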
-void extract_q4_1_data(const ggml_tensor* tensor, - ov::Tensor& weights_arr, - ov::Tensor& scales_arr, - ov::Tensor& biases_arr) { +void extract_q4_1_data(const ggml_tensor * tensor, + ov::Tensor & weights_arr, + ov::Tensor & scales_arr, + ov::Tensor & biases_arr) { const uint64_t bytes_per_block = 20; // 2 bytes scale, 2 bytes bias, 32x0.5 byte weights - auto* data = static_cast(tensor->data); - auto* weights = static_cast(weights_arr.data()); - auto* scales = scales_arr.data::value_type>(); - auto* biases = biases_arr.data::value_type>(); + + auto * data = static_cast(tensor->data); + auto * weights = static_cast(weights_arr.data()); + auto * scales = scales_arr.data::value_type>(); + auto * biases = biases_arr.data::value_type>(); ov::parallel_for(scales_arr.get_size(), [&](size_t i) { - scales[i] = ov::float16::from_bits(*((uint16_t*)(data + i * bytes_per_block))); - biases[i] = ov::float16::from_bits(*((uint16_t*)(data + i * bytes_per_block + 2))); + scales[i] = ov::float16::from_bits(*((uint16_t *) (data + i * bytes_per_block))); + biases[i] = ov::float16::from_bits(*((uint16_t *) (data + i * bytes_per_block + 2))); unpack_32_4(data + i * bytes_per_block + 4, weights + i * 16); }); } // Extracts (weight, scales, biases) from Q8_0 tensors. // Data layout is: |16 bit scale|32 x 8bit weights|. -void extract_q8_0_data(const ggml_tensor* tensor, - ov::Tensor& weights_arr, - ov::Tensor& scales_arr, - ov::Tensor& biases_arr) { +void extract_q8_0_data(const ggml_tensor * tensor, + ov::Tensor & weights_arr, + ov::Tensor & scales_arr, + ov::Tensor & biases_arr) { const uint64_t weights_per_block = 32; const uint64_t bytes_per_block = 34; // 2 bytes scale, 32x1 byte weights - auto* data = static_cast(tensor->data); - auto* weights = static_cast(weights_arr.data()); - auto* scales = scales_arr.data::value_type>(); - auto* biases = biases_arr.data::value_type>(); + + auto * data = static_cast(tensor->data); + auto * weights = static_cast(weights_arr.data()); + auto * scales = scales_arr.data::value_type>(); + auto * biases = biases_arr.data::value_type>(); ov::parallel_for(scales_arr.get_size(), [&](size_t i) { - uint8_t* block_data = data + i * bytes_per_block; - scales[i] = ov::float16::from_bits(*(uint16_t*) block_data); + uint8_t * block_data = data + i * bytes_per_block; + scales[i] = ov::float16::from_bits(*(uint16_t *) block_data); biases[i] = ov::float16(-128.f * static_cast(scales[i])); for (size_t j = 0; j < weights_per_block; ++j) { uint8_t x = block_data[j + 2]; // j+2 to skip the scale bytes. 
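// Illustrative sketch (not part of the patch): decoding one Q8_0 block
// (34 bytes: fp16 scale + 32 signed int8 quants), where value = scale * q.
// The extractor above re-encodes the signed quants as uint8 with an implicit
// zero point of 128, which is where bias = -128 * scale comes from:
//   scale * q == scale * (q + 128) + (-128 * scale)
#include <openvino/core/type/float16.hpp>
#include <cstdint>
#include <cstring>

static void dequant_q8_0_block(const uint8_t * block, float * out /* 32 floats */) {
    uint16_t bits;
    std::memcpy(&bits, block, sizeof(bits));
    const float scale = static_cast<float>(ov::float16::from_bits(bits));
    const int8_t * qs = reinterpret_cast<const int8_t *>(block + 2);
    for (int j = 0; j < 32; ++j) {
        out[j] = scale * static_cast<float>(qs[j]);  // value = d * q, as in ggml
    }
}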
@@ -105,7 +108,7 @@ void extract_q8_0_data(const ggml_tensor* tensor, }); } -void unpack_256_4(const uint8_t* data, uint8_t* dst) { +void unpack_256_4(const uint8_t * data, uint8_t * dst) { // Initialize the output array with zeros std::fill_n(dst, 128, 0); @@ -123,26 +126,27 @@ void unpack_256_4(const uint8_t* data, uint8_t* dst) { } } -void extract_q4_k_data(const ggml_tensor* tensor, - ov::Tensor& weights_arr, - ov::Tensor& scales_arr, - ov::Tensor& biases_arr) { +void extract_q4_k_data(const ggml_tensor * tensor, + ov::Tensor & weights_arr, + ov::Tensor & scales_arr, + ov::Tensor & biases_arr) { const uint64_t bytes_per_block = 2 + 2 + 12 + 128; const uint64_t n_super_block = tensor->nb[3] / bytes_per_block; - auto* data = static_cast(tensor->data); - auto* weights = static_cast(weights_arr.data()); - auto* scales = scales_arr.data::value_type>(); - auto* biases = biases_arr.data::value_type>(); + + auto * data = static_cast(tensor->data); + auto * weights = static_cast(weights_arr.data()); + auto * scales = scales_arr.data::value_type>(); + auto * biases = biases_arr.data::value_type>(); ov::parallel_for(n_super_block, [&](size_t i) { - uint8_t* block_data = data + i * bytes_per_block; + uint8_t * block_data = data + i * bytes_per_block; // Extract scale factors and offsets - float scale_scales = static_cast(ov::float16::from_bits(*((uint16_t*)block_data))); - float scale_biases = static_cast(ov::float16::from_bits(*((uint16_t*)block_data + 1))); + float scale_scales = static_cast(ov::float16::from_bits(*((uint16_t *) block_data))); + float scale_biases = static_cast(ov::float16::from_bits(*((uint16_t *) block_data + 1))); // Extract qs1 and qs2 - uint8_t* qs1 = block_data + 4; + uint8_t * qs1 = block_data + 4; // uint8_t* qs2 = block_data + 16; scales[i * 8] = ov::float16(scale_scales * static_cast((*(qs1) & 0b111111))); @@ -174,31 +178,32 @@ void extract_q4_k_data(const ggml_tensor* tensor, }); } -void extract_q6_k_data(const ggml_tensor* tensor, - ov::Tensor& weights_arr, - ov::Tensor& scales_arr, - ov::Tensor& biases_arr) { +void extract_q6_k_data(const ggml_tensor * tensor, + ov::Tensor & weights_arr, + ov::Tensor & scales_arr, + ov::Tensor & biases_arr) { const uint64_t bytes_per_block = 128 + 64 + 16 + 2; const uint64_t n_super_block = tensor->nb[3] / bytes_per_block; - auto* data = static_cast(tensor->data); - auto* weights = static_cast(weights_arr.data()); - auto* scales = scales_arr.data::value_type>(); - auto* biases = biases_arr.data::value_type>(); + + auto * data = static_cast(tensor->data); + auto * weights = static_cast(weights_arr.data()); + auto * scales = scales_arr.data::value_type>(); + auto * biases = biases_arr.data::value_type>(); ov::parallel_for(n_super_block, [&](size_t i) { - uint8_t* block_data = data + i * bytes_per_block; + uint8_t * block_data = data + i * bytes_per_block; float scale_factor = - static_cast(ov::float16::from_bits(*((uint16_t*) block_data + 104))); // (128+64+16)/2 + static_cast(ov::float16::from_bits(*((uint16_t *) block_data + 104))); // (128+64+16)/2 for (size_t j = 0; j < 16; j++) { scales[j + i * 16] = - ov::float16(scale_factor * static_cast(*((int8_t*) (block_data + 128 + 64 + j)))); + ov::float16(scale_factor * static_cast(*((int8_t *) (block_data + 128 + 64 + j)))); biases[j + i * 16] = ov::float16(-32.f * static_cast(scales[j + i * 16])); } - uint8_t* ql = block_data; - uint8_t* qh = block_data + 128; + uint8_t * ql = block_data; + uint8_t * qh = block_data + 128; for (int64_t j = 0; j < 32; ++j) { weights[i * 256 + j] = 
(ql[j] & 0xF) | (((qh[j] >> 0) & 3) << 4); @@ -213,7 +218,7 @@ void extract_q6_k_data(const ggml_tensor* tensor, }); } -static inline void get_scale_min_k4(int j, const uint8_t* q, uint8_t* d, uint8_t* m) { +static inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t * d, uint8_t * m) { if (j < 4) { *d = q[j] & 63; *m = q[j + 4] & 63; @@ -223,24 +228,27 @@ static inline void get_scale_min_k4(int j, const uint8_t* q, uint8_t* d, uint8_t } } -void extract_q5_k_data(const ggml_tensor* tensor, ov::Tensor& weights_arr, ov::Tensor& scales_arr, - ov::Tensor& biases_arr) { +void extract_q5_k_data(const ggml_tensor * tensor, + ov::Tensor & weights_arr, + ov::Tensor & scales_arr, + ov::Tensor & biases_arr) { const uint64_t bytes_per_block = 4 + 12 + 32 + 128; const uint64_t n_super_block = tensor->nb[3] / bytes_per_block; - auto* data = static_cast(tensor->data); - auto* weights = static_cast(weights_arr.data()); - auto* scales = scales_arr.data::value_type>(); - auto* biases = biases_arr.data::value_type>(); + + auto * data = static_cast(tensor->data); + auto * weights = static_cast(weights_arr.data()); + auto * scales = scales_arr.data::value_type>(); + auto * biases = biases_arr.data::value_type>(); ov::parallel_for(n_super_block, [&](size_t i) { - uint8_t* block_data = data + i * bytes_per_block; + uint8_t * block_data = data + i * bytes_per_block; - const float d = static_cast(ov::float16::from_bits(*((uint16_t*) block_data))); - const float min = static_cast(ov::float16::from_bits(*((uint16_t*) block_data + 1))); + const float d = static_cast(ov::float16::from_bits(*((uint16_t *) block_data))); + const float min = static_cast(ov::float16::from_bits(*((uint16_t *) block_data + 1))); - const uint8_t* scales_data = block_data + 4; // 12 bytes of scales - const uint8_t* qh = block_data + 4 + 12; // 32 bytes of high bits - const uint8_t* ql = block_data + 4 + 12 + 32; // 128 bytes of low bits + const uint8_t * scales_data = block_data + 4; // 12 bytes of scales + const uint8_t * qh = block_data + 4 + 12; // 32 bytes of high bits + const uint8_t * ql = block_data + 4 + 12 + 32; // 128 bytes of low bits int is = 0; uint8_t u1 = 1; @@ -286,7 +294,10 @@ void extract_q5_k_data(const ggml_tensor* tensor, ov::Tensor& weights_arr, ov::T // TODO Reorder for make_intX_weights -ov::Output make_int8_weights(ov::Tensor& weight, ov::Tensor& scales, ov::Tensor& biases, size_t group_size) { +ov::Output make_int8_weights(ov::Tensor & weight, + ov::Tensor & scales, + ov::Tensor & biases, + size_t group_size) { ov::Shape orig_shape = weight.get_shape(); // Expand dimensions for scales and biases @@ -303,18 +314,19 @@ ov::Output make_int8_weights(ov::Tensor& weight, ov::Tensor& scales, o } // Create graph nodes - auto weights_node = std::make_shared( - ov::element::u8, packed_shape, static_cast(weight.data()), nullptr); + auto weights_node = std::make_shared(ov::element::u8, packed_shape, + static_cast(weight.data()), nullptr); weights_node->get_rt_info()["__gguf_tensor_holder"] = weight; auto scales_f16 = std::make_shared(scales); ov::Tensor biases_u8(ov::element::u8, scale_shape); // Calculate zero point - const ov::float16* bias_data = biases.data::value_type>(); - const ov::float16* scale_data = scales.data::value_type>(); - uint8_t* bias_u8_data = biases_u8.data(); + const ov::float16 * bias_data = biases.data::value_type>(); + const ov::float16 * scale_data = scales.data::value_type>(); + uint8_t * bias_u8_data = biases_u8.data(); for (size_t i = 0; i < biases_u8.get_size(); ++i) { - bias_u8_data[i] = 
(uint8_t)std::round(-1.f * static_cast(bias_data[i]) / static_cast(scale_data[i])); + bias_u8_data[i] = + (uint8_t) std::round(-1.f * static_cast(bias_data[i]) / static_cast(scale_data[i])); } auto zero_point = std::make_shared(biases_u8); @@ -327,9 +339,7 @@ ov::Output make_int8_weights(ov::Tensor& weight, ov::Tensor& scales, o auto weights_f16 = std::make_shared(weights_node, ov::element::f16); auto zero_point_f16 = std::make_shared(zero_point, ov::element::f16); - auto w_zp = std::make_shared( - weights_f16, zero_point_f16, ov::op::AutoBroadcastType::NUMPY - ); + auto w_zp = std::make_shared(weights_f16, zero_point_f16, ov::op::AutoBroadcastType::NUMPY); ov::Output w_zp_s = std::make_shared(w_zp, scales_f16, ov::op::AutoBroadcastType::NUMPY); @@ -343,18 +353,17 @@ ov::Output make_int8_weights(ov::Tensor& weight, ov::Tensor& scales, o return std::make_shared(w_zp_s, ov::element::f32); } -ov::Output make_int4_weights(ov::Tensor& weight, ov::Tensor& scales, ov::Tensor& biases, size_t group_size) { +ov::Output make_int4_weights(ov::Tensor & weight, + ov::Tensor & scales, + ov::Tensor & biases, + size_t group_size) { ov::Shape orig_weight_shape = weight.get_shape(); // Expand dimensions for scales and biases ov::Shape scale_bias_shape = scales.get_shape(); // Create INT4 weight tensor - ov::Shape packed_shape = { - orig_weight_shape[0], - orig_weight_shape[1] / group_size, - group_size - }; + ov::Shape packed_shape = {orig_weight_shape[0], orig_weight_shape[1] / group_size, group_size}; // Requantized channel-wise case if (packed_shape[1] == 1) { @@ -365,18 +374,21 @@ ov::Output make_int4_weights(ov::Tensor& weight, ov::Tensor& scales, o biases.set_shape(scale_bias_shape); } - auto weights_node = std::make_shared(ov::element::u4, packed_shape, static_cast(weight.data()), nullptr); + auto weights_node = std::make_shared(ov::element::u4, packed_shape, + static_cast(weight.data()), nullptr); weights_node->get_rt_info()["__gguf_tensor_holder"] = weight; auto weights_f16 = std::make_shared(weights_node, ov::element::f16); // Pack zero points: two subsequent values into one - const ov::float16* bias_data = biases.data::value_type>(); - const ov::float16* scale_data = scales.data::value_type>(); + const ov::float16 * bias_data = biases.data::value_type>(); + const ov::float16 * scale_data = scales.data::value_type>(); ov::Tensor zero_point_tensor(ov::element::u4, scale_bias_shape); - uint8_t* zero_point_data = static_cast(zero_point_tensor.data()); + uint8_t * zero_point_data = static_cast(zero_point_tensor.data()); for (size_t i = 0; i < zero_point_tensor.get_byte_size(); ++i) { - uint8_t bias1 = (uint8_t)std::round(-1.f * static_cast(bias_data[i * 2]) / static_cast(scale_data[i * 2])); - uint8_t bias2 = (uint8_t)std::round(-1.f * static_cast(bias_data[i * 2 + 1]) / static_cast(scale_data[i * 2 + 1])); + uint8_t bias1 = + (uint8_t) std::round(-1.f * static_cast(bias_data[i * 2]) / static_cast(scale_data[i * 2])); + uint8_t bias2 = (uint8_t) std::round(-1.f * static_cast(bias_data[i * 2 + 1]) / + static_cast(scale_data[i * 2 + 1])); zero_point_data[i] = (bias2 << 4) | (bias1 & 0x0F); } @@ -390,16 +402,15 @@ ov::Output make_int4_weights(ov::Tensor& weight, ov::Tensor& scales, o auto scales_f16 = std::make_shared(scales); // Perform dequantization - auto w_zp = std::make_shared( - weights_f16, zero_points_f16, ov::op::AutoBroadcastType::NUMPY); + auto w_zp = std::make_shared(weights_f16, zero_points_f16, ov::op::AutoBroadcastType::NUMPY); ov::Output w_zp_s = std::make_shared(w_zp, scales_f16, 
ov::op::AutoBroadcastType::NUMPY); if (packed_shape.size() != 2) { // If not requantized channel-wise case, reshape back to original shape - auto final_shape = std::make_shared( - ov::element::i64, ov::Shape{orig_weight_shape.size()}, orig_weight_shape); + auto final_shape = std::make_shared(ov::element::i64, ov::Shape{orig_weight_shape.size()}, + orig_weight_shape); w_zp_s = std::make_shared(w_zp_s, final_shape, false); } @@ -407,7 +418,7 @@ ov::Output make_int4_weights(ov::Tensor& weight, ov::Tensor& scales, o return std::make_shared(w_zp_s, ov::element::f32); } -std::shared_ptr requantize(const ggml_tensor* tensor, ExtraQuantType requant_type) { +std::shared_ptr requantize(const ggml_tensor * tensor, ExtraQuantType requant_type) { std::vector weights_f32(tensor->ne[0] * tensor->ne[1]); ggml_get_type_traits(tensor->type)->to_float(tensor->data, weights_f32.data(), ggml_nelements(tensor)); @@ -459,14 +470,18 @@ std::shared_ptr requantize(const ggml_tensor* tensor, ExtraQuantType r return weight_node; } -void quantize_q4_0(const float* x, ov::Tensor& weights_arr, ov::Tensor& scales_arr, ov::Tensor& biases_arr, int64_t k, +void quantize_q4_0(const float * x, + ov::Tensor & weights_arr, + ov::Tensor & scales_arr, + ov::Tensor & biases_arr, + int64_t k, int64_t qk) { assert(k % qk == 0); const int nb = k / qk; - auto* weights = static_cast(weights_arr.data()); - auto* scales = scales_arr.data::value_type>(); - auto* biases = biases_arr.data::value_type>(); + auto * weights = static_cast(weights_arr.data()); + auto * scales = scales_arr.data::value_type>(); + auto * biases = biases_arr.data::value_type>(); for (int i = 0; i < nb; i++) { float amax = 0.0f; // absolute max float max = 0.0f; @@ -503,14 +518,18 @@ void quantize_q4_0(const float* x, ov::Tensor& weights_arr, ov::Tensor& scales_a } } -void quantize_q8_0(const float* x, ov::Tensor& weights_arr, ov::Tensor& scales_arr, ov::Tensor& biases_arr, int64_t k, +void quantize_q8_0(const float * x, + ov::Tensor & weights_arr, + ov::Tensor & scales_arr, + ov::Tensor & biases_arr, + int64_t k, int64_t qk) { assert(k % qk == 0); const int nb = k / qk; - auto* weights = static_cast(weights_arr.data()); - auto* scales = scales_arr.data::value_type>(); - auto* biases = biases_arr.data::value_type>(); + auto * weights = static_cast(weights_arr.data()); + auto * scales = scales_arr.data::value_type>(); + auto * biases = biases_arr.data::value_type>(); for (int i = 0; i < nb; i++) { float amax = 0.0f; // absolute max @@ -534,14 +553,18 @@ void quantize_q8_0(const float* x, ov::Tensor& weights_arr, ov::Tensor& scales_a } } -void quantize_q8_1(const float* x, ov::Tensor& weights_arr, ov::Tensor& scales_arr, ov::Tensor& biases_arr, int64_t k, +void quantize_q8_1(const float * x, + ov::Tensor & weights_arr, + ov::Tensor & scales_arr, + ov::Tensor & biases_arr, + int64_t k, int64_t qk) { assert(k % qk == 0); const int nb = k / qk; - auto* weights = static_cast(weights_arr.data()); - auto* scales = scales_arr.data::value_type>(); - auto* biases = biases_arr.data::value_type>(); + auto * weights = static_cast(weights_arr.data()); + auto * scales = scales_arr.data::value_type>(); + auto * biases = biases_arr.data::value_type>(); for (int i = 0; i < nb; i++) { float min = std::numeric_limits::max(); float max = std::numeric_limits::lowest(); diff --git a/ggml/src/ggml-openvino/openvino/frontend.cpp b/ggml/src/ggml-openvino/openvino/frontend.cpp index dbdae1ed45..27d10d71c1 100644 --- a/ggml/src/ggml-openvino/openvino/frontend.cpp +++ 
b/ggml/src/ggml-openvino/openvino/frontend.cpp @@ -10,11 +10,11 @@ namespace ggml { FrontEnd::FrontEnd() {} -std::shared_ptr FrontEnd::convert(const InputModel::Ptr& model, bool naive) { +std::shared_ptr FrontEnd::convert(const InputModel::Ptr & model, bool naive) { auto ggml_model = std::dynamic_pointer_cast(model); FRONT_END_GENERAL_CHECK(ggml_model, "Invalid input model"); std::shared_ptr converted_model; - const auto& supported_ops = get_supported_ops(); + const auto & supported_ops = get_supported_ops(); { TranslateSession translate_session(model, supported_ops, naive); converted_model = translate_session.get_converted_model(); diff --git a/ggml/src/ggml-openvino/openvino/input_model.cpp b/ggml/src/ggml-openvino/openvino/input_model.cpp index 5fb16ea2db..0f66270a5e 100644 --- a/ggml/src/ggml-openvino/openvino/input_model.cpp +++ b/ggml/src/ggml-openvino/openvino/input_model.cpp @@ -6,9 +6,9 @@ namespace ov { namespace frontend { namespace ggml { -InputModel::InputModel(const std::shared_ptr& gdecoder) : m_decoder(gdecoder) {} +InputModel::InputModel(const std::shared_ptr & gdecoder) : m_decoder(gdecoder) {} -const std::shared_ptr& InputModel::get_model_decoder() const { +const std::shared_ptr & InputModel::get_model_decoder() const { return m_decoder; } diff --git a/ggml/src/ggml-openvino/openvino/op/cont.cpp b/ggml/src/ggml-openvino/openvino/op/cont.cpp index 9ae0f420cc..a17273d426 100644 --- a/ggml/src/ggml-openvino/openvino/op/cont.cpp +++ b/ggml/src/ggml-openvino/openvino/op/cont.cpp @@ -1,4 +1,8 @@ +#include "../node_context.hpp" +#include "../op_table.hpp" +#include "../utils.hpp" + #include #include #include @@ -6,16 +10,12 @@ #include #include -#include "../node_context.hpp" -#include "../op_table.hpp" -#include "../utils.hpp" - namespace ov { namespace frontend { namespace ggml { namespace op { -OutputVector translate_cont(const NodeContext& context) { +OutputVector translate_cont(const NodeContext & context) { num_inputs_check(context, 1, 1); int op_case = context.get_op_case(); @@ -29,9 +29,7 @@ OutputVector translate_cont(const NodeContext& context) { // The input comes from a PERMUTE dst_shape[1] = -1; res = std::make_shared( - context.get_input(0), - ov::op::v0::Constant::create(ov::element::i64, {dst_shape.size()}, dst_shape), - false); + context.get_input(0), ov::op::v0::Constant::create(ov::element::i64, {dst_shape.size()}, dst_shape), false); } else if (op_case == 2) { // The input comes from a TRANSPOSE return {context.get_input(0)}; diff --git a/ggml/src/ggml-openvino/openvino/op/cpy.cpp b/ggml/src/ggml-openvino/openvino/op/cpy.cpp index 54b49018a9..d5186cddee 100644 --- a/ggml/src/ggml-openvino/openvino/op/cpy.cpp +++ b/ggml/src/ggml-openvino/openvino/op/cpy.cpp @@ -1,15 +1,16 @@ -#include -#include #include "../node_context.hpp" #include "../op_table.hpp" #include "../utils.hpp" +#include +#include + namespace ov { namespace frontend { namespace ggml { namespace op { -OutputVector translate_cpy(const NodeContext& context) { +OutputVector translate_cpy(const NodeContext & context) { auto res = std::make_shared(context.get_input(0), context.get_output_type(0)); return rename_outputs_with_suffix({res}, context.get_name()); } diff --git a/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp b/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp index 9845fe0a02..029023637a 100644 --- a/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp +++ b/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp @@ -1,3 +1,7 @@ +#include "../node_context.hpp" +#include 
"../op_table.hpp" +#include "../utils.hpp" + #include #include #include @@ -8,24 +12,20 @@ #include #include -#include "../node_context.hpp" -#include "../op_table.hpp" -#include "../utils.hpp" - namespace ov { namespace frontend { namespace ggml { namespace op { -OutputVector translate_flash_attn_ext(const NodeContext& context) { +OutputVector translate_flash_attn_ext(const NodeContext & context) { num_inputs_check(context, 4, 4); auto q_f32 = context.get_input(0); auto k = context.get_input(1); auto v = context.get_input(2); auto mask = context.get_input(3); - float* params = reinterpret_cast(context.get_output_op_params(0)); - float scale = params[0]; + float * params = reinterpret_cast(context.get_output_op_params(0)); + float scale = params[0]; // float max_bias = params[1]; // float logit_softcap = params[2]; @@ -43,15 +43,14 @@ OutputVector translate_flash_attn_ext(const NodeContext& context) { auto token_len = get_dimensions(q, {2}); auto kv_len = get_dimensions(k.get_node_shared_ptr(), {2}); - auto zero_2d = ov::op::v0::Constant::create(ov::element::i64, {2}, {0,0}); - auto one_2d = ov::op::v0::Constant::create(ov::element::i64, {2}, {1,1}); + auto zero_2d = ov::op::v0::Constant::create(ov::element::i64, {2}, {0, 0}); + auto one_2d = ov::op::v0::Constant::create(ov::element::i64, {2}, {1, 1}); auto zero_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); auto two_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {2}); - auto axes = ov::op::v0::Constant::create(ov::element::i64, {2}, {1,2}); + auto axes = ov::op::v0::Constant::create(ov::element::i64, {2}, {1, 2}); auto stop = std::make_shared(ov::OutputVector{token_len, kv_len}, 0); - mask_sliced = - std::make_shared(mask, zero_2d, stop, one_2d, axes); + mask_sliced = std::make_shared(mask, zero_2d, stop, one_2d, axes); mask_sliced = std::make_shared(mask_sliced, zero_1d); } @@ -72,8 +71,8 @@ OutputVector translate_flash_attn_ext(const NodeContext& context) { kv_unsqueezed = std::make_shared(kv, unsqueeze_axes); auto kv_last_two_dims = get_dimensions(kv.get_node_shared_ptr(), {1, 2}); - kv_broadcast_shape = - std::make_shared(ov::OutputVector{kv_batch_node, factor_node, kv_last_two_dims}, 0); + kv_broadcast_shape = std::make_shared( + ov::OutputVector{kv_batch_node, factor_node, kv_last_two_dims}, 0); new_kv_shape = std::make_shared(ov::OutputVector{q_batch_node, kv_last_two_dims}, 0); } else { @@ -82,8 +81,8 @@ OutputVector translate_flash_attn_ext(const NodeContext& context) { kv_unsqueezed = std::make_shared(kv, unsqueeze_axes); auto kv_last_two_dims = get_dimensions(kv.get_node_shared_ptr(), {2, 3}); - kv_broadcast_shape = - std::make_shared(ov::OutputVector{one_1d, kv_batch_node, factor_node, kv_last_two_dims}, 0); + kv_broadcast_shape = std::make_shared( + ov::OutputVector{one_1d, kv_batch_node, factor_node, kv_last_two_dims}, 0); new_kv_shape = std::make_shared(ov::OutputVector{one_1d, q_batch_node, kv_last_two_dims}, 0); } @@ -105,8 +104,8 @@ OutputVector translate_flash_attn_ext(const NodeContext& context) { res = std::make_shared(sdpa_f32, ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 0, 2})); } else { - res = std::make_shared(sdpa_f32, - ov::op::v0::Constant::create(ov::element::i64, {4}, {0, 2, 1, 3})); + res = std::make_shared( + sdpa_f32, ov::op::v0::Constant::create(ov::element::i64, {4}, {0, 2, 1, 3})); } return rename_outputs_with_suffix({res}, context.get_name()); } diff --git a/ggml/src/ggml-openvino/openvino/op/get_rows.cpp b/ggml/src/ggml-openvino/openvino/op/get_rows.cpp index 
5e4c7d901a..2e3520554e 100644 --- a/ggml/src/ggml-openvino/openvino/op/get_rows.cpp +++ b/ggml/src/ggml-openvino/openvino/op/get_rows.cpp @@ -1,3 +1,7 @@ +#include "../node_context.hpp" +#include "../op_table.hpp" +#include "../utils.hpp" + #include #include #include @@ -5,16 +9,12 @@ #include #include -#include "../node_context.hpp" -#include "../op_table.hpp" -#include "../utils.hpp" - namespace ov { namespace frontend { namespace ggml { namespace op { -OutputVector translate_get_rows(const NodeContext& context) { +OutputVector translate_get_rows(const NodeContext & context) { num_inputs_check(context, 2, 2); int op_case = context.get_op_case(); diff --git a/ggml/src/ggml-openvino/openvino/op/glu_geglu.cpp b/ggml/src/ggml-openvino/openvino/op/glu_geglu.cpp index 4295bf7517..3e3cae0071 100644 --- a/ggml/src/ggml-openvino/openvino/op/glu_geglu.cpp +++ b/ggml/src/ggml-openvino/openvino/op/glu_geglu.cpp @@ -1,3 +1,7 @@ +#include "../node_context.hpp" +#include "../op_table.hpp" +#include "../utils.hpp" + #include #include #include @@ -7,16 +11,12 @@ #include #include -#include "../node_context.hpp" -#include "../op_table.hpp" -#include "../utils.hpp" - namespace ov { namespace frontend { namespace ggml { namespace op { -OutputVector translate_glu_geglu(const NodeContext& context) { +OutputVector translate_glu_geglu(const NodeContext & context) { num_inputs_check(context, 1, 2); ov::Output src0; @@ -32,7 +32,7 @@ OutputVector translate_glu_geglu(const NodeContext& context) { src1 = split->output(1); } - int32_t* params = context.get_output_op_params(0); + int32_t * params = context.get_output_op_params(0); const int32_t swapped = params[1]; if (swapped) { std::swap(src0, src1); diff --git a/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp b/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp index bef42fe4b7..61cdaadea3 100644 --- a/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp +++ b/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp @@ -1,3 +1,7 @@ +#include "../node_context.hpp" +#include "../op_table.hpp" +#include "../utils.hpp" + #include #include #include @@ -7,16 +11,12 @@ #include #include -#include "../node_context.hpp" -#include "../op_table.hpp" -#include "../utils.hpp" - namespace ov { namespace frontend { namespace ggml { namespace op { -OutputVector translate_glu_swiglu(const NodeContext& context) { +OutputVector translate_glu_swiglu(const NodeContext & context) { num_inputs_check(context, 1, 2); ov::Output src0; @@ -32,7 +32,7 @@ OutputVector translate_glu_swiglu(const NodeContext& context) { src1 = split->output(1); } - int32_t* params = context.get_output_op_params(0); + int32_t * params = context.get_output_op_params(0); const int32_t swapped = params[1]; if (swapped) { std::swap(src0, src1); diff --git a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp index b4103378eb..c161bce75d 100644 --- a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp +++ b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp @@ -1,3 +1,7 @@ +#include "../node_context.hpp" +#include "../op_table.hpp" +#include "../utils.hpp" + #include #include #include @@ -15,16 +19,12 @@ #include #include -#include "../node_context.hpp" -#include "../op_table.hpp" -#include "../utils.hpp" - namespace ov { namespace frontend { namespace ggml { namespace op { -OutputVector translate_mulmat(const NodeContext& context) { +OutputVector translate_mulmat(const NodeContext & context) { num_inputs_check(context, 2, 2); int op_case = context.get_op_case(); diff --git 
a/ggml/src/ggml-openvino/openvino/op/permute.cpp b/ggml/src/ggml-openvino/openvino/op/permute.cpp index 5f86f47c1c..128ffb2933 100644 --- a/ggml/src/ggml-openvino/openvino/op/permute.cpp +++ b/ggml/src/ggml-openvino/openvino/op/permute.cpp @@ -1,3 +1,7 @@ +#include "../node_context.hpp" +#include "../op_table.hpp" +#include "../utils.hpp" + #include #include #include @@ -9,16 +13,12 @@ #include #include -#include "../node_context.hpp" -#include "../op_table.hpp" -#include "../utils.hpp" - namespace ov { namespace frontend { namespace ggml { namespace op { -OutputVector translate_permute(const NodeContext& context) { +OutputVector translate_permute(const NodeContext & context) { num_inputs_check(context, 1, 1); int op_case = context.get_op_case(); @@ -28,15 +28,15 @@ OutputVector translate_permute(const NodeContext& context) { if (op_case == 1) { if (context.is_static()) { - res = std::make_shared(context.get_input(0), - ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 0, 2})); + res = std::make_shared( + context.get_input(0), ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 0, 2})); } else { auto src = context.get_input(0); if (src.get_partial_shape().rank() == 3) { src = std::make_shared(src, zero); } - res = std::make_shared(src, - ov::op::v0::Constant::create(ov::element::i64, {4}, {0, 2, 1, 3})); + res = std::make_shared( + src, ov::op::v0::Constant::create(ov::element::i64, {4}, {0, 2, 1, 3})); } } else { auto src = context.get_input(0); @@ -47,7 +47,8 @@ OutputVector translate_permute(const NodeContext& context) { std::vector src_shape(src_shape_.begin(), src_shape_.end()); auto src_reshaped = std::make_shared( src, - ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector{-1, src_shape[1], src_shape[2]}), + ov::op::v0::Constant::create(ov::element::i64, {3}, + std::vector{-1, src_shape[1], src_shape[2]}), false); res = std::make_shared( src_reshaped, ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 0, 2})); @@ -55,8 +56,8 @@ OutputVector translate_permute(const NodeContext& context) { if (src.get_partial_shape().rank() == 3) { src = std::make_shared(src, zero); } - res = std::make_shared(src, - ov::op::v0::Constant::create(ov::element::i64, {4}, {0, 2, 1, 3})); + res = std::make_shared( + src, ov::op::v0::Constant::create(ov::element::i64, {4}, {0, 2, 1, 3})); } } return rename_outputs_with_suffix({res}, context.get_name()); diff --git a/ggml/src/ggml-openvino/openvino/op/reshape.cpp b/ggml/src/ggml-openvino/openvino/op/reshape.cpp index 1ed6f4b880..bbf94865ef 100644 --- a/ggml/src/ggml-openvino/openvino/op/reshape.cpp +++ b/ggml/src/ggml-openvino/openvino/op/reshape.cpp @@ -1,3 +1,7 @@ +#include "../node_context.hpp" +#include "../op_table.hpp" +#include "../utils.hpp" + #include #include #include @@ -7,16 +11,12 @@ #include #include -#include "../node_context.hpp" -#include "../op_table.hpp" -#include "../utils.hpp" - namespace ov { namespace frontend { namespace ggml { namespace op { -OutputVector translate_reshape(const NodeContext& context) { +OutputVector translate_reshape(const NodeContext & context) { num_inputs_check(context, 1, 1); if (context.get_input_shape(0) == context.get_output_shape(0)) { return {context.get_input(0)}; @@ -29,15 +29,11 @@ OutputVector translate_reshape(const NodeContext& context) { auto output_shape = context.get_output_shape(0).to_shape(); std::shared_ptr new_shape_node; if (op_case == 1) { - new_shape_node = - ov::op::v0::Constant::create(ov::element::i64, - {3}, - std::vector{-1, (int64_t)output_shape[1], 
(int64_t)output_shape[2]}); + new_shape_node = ov::op::v0::Constant::create( + ov::element::i64, {3}, std::vector{-1, (int64_t) output_shape[1], (int64_t) output_shape[2]}); } else if (op_case == 2) { - new_shape_node = - ov::op::v0::Constant::create(ov::element::i64, - {3}, - std::vector{(int64_t)output_shape[0], -1, (int64_t)output_shape[2]}); + new_shape_node = ov::op::v0::Constant::create( + ov::element::i64, {3}, std::vector{(int64_t) output_shape[0], -1, (int64_t) output_shape[2]}); } else if (op_case == 3) { new_shape_node = ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector{(int64_t) output_shape[0], -1, 1}); diff --git a/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp b/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp index c9df4c42f3..3ac96d0c22 100644 --- a/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp +++ b/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp @@ -1,3 +1,7 @@ +#include "../node_context.hpp" +#include "../op_table.hpp" +#include "../utils.hpp" + #include #include #include @@ -7,16 +11,12 @@ #include #include -#include "../node_context.hpp" -#include "../op_table.hpp" -#include "../utils.hpp" - namespace ov { namespace frontend { namespace ggml { namespace op { -OutputVector translate_rms_norm(const NodeContext& context) { +OutputVector translate_rms_norm(const NodeContext & context) { num_inputs_check(context, 1, 1); auto input_node = context.get_input(0); diff --git a/ggml/src/ggml-openvino/openvino/op/rope.cpp b/ggml/src/ggml-openvino/openvino/op/rope.cpp index 484730d289..362ccce17f 100644 --- a/ggml/src/ggml-openvino/openvino/op/rope.cpp +++ b/ggml/src/ggml-openvino/openvino/op/rope.cpp @@ -1,3 +1,7 @@ +#include "../node_context.hpp" +#include "../op_table.hpp" +#include "../utils.hpp" + #include #include #include @@ -14,16 +18,12 @@ #include #include -#include "../node_context.hpp" -#include "../op_table.hpp" -#include "../utils.hpp" - namespace ov { namespace frontend { namespace ggml { namespace op { -OutputVector translate_rope(const NodeContext& context) { +OutputVector translate_rope(const NodeContext & context) { num_inputs_check(context, 2, 3); int op_case = context.get_op_case(); @@ -32,7 +32,7 @@ OutputVector translate_rope(const NodeContext& context) { auto data_node = context.get_input(0).get_node_shared_ptr(); auto output_shape = context.get_output_shape(0).to_shape(); - int32_t* op_params = context.get_output_op_params(0); + int32_t * op_params = context.get_output_op_params(0); Output cos_theta_node; Output sin_theta_node; @@ -85,7 +85,8 @@ OutputVector translate_rope(const NodeContext& context) { auto stack = std::make_shared(OutputVector{first_half, second_half}, 3); res = std::make_shared(stack, std::make_shared(data_node), false); if (!(context.is_static())) { - res = std::make_shared(res, ov::op::v0::Constant::create(ov::element::i64, {1}, {0})); + res = + std::make_shared(res, ov::op::v0::Constant::create(ov::element::i64, {1}, {0})); } } else if (mode == ROPE_TYPE_NEOX) { auto data_split = std::make_shared( diff --git a/ggml/src/ggml-openvino/openvino/op/scale.cpp b/ggml/src/ggml-openvino/openvino/op/scale.cpp index 783440ebd9..f52381786a 100644 --- a/ggml/src/ggml-openvino/openvino/op/scale.cpp +++ b/ggml/src/ggml-openvino/openvino/op/scale.cpp @@ -1,17 +1,17 @@ -#include -#include -#include - #include "../node_context.hpp" #include "../op_table.hpp" #include "../utils.hpp" +#include +#include +#include + namespace ov { namespace frontend { namespace ggml { namespace op { -OutputVector translate_scale(const NodeContext& 
context) { +OutputVector translate_scale(const NodeContext & context) { num_inputs_check(context, 1, 1); float scale; diff --git a/ggml/src/ggml-openvino/openvino/op/set_rows.cpp b/ggml/src/ggml-openvino/openvino/op/set_rows.cpp index 001bd08773..643ba7bffa 100644 --- a/ggml/src/ggml-openvino/openvino/op/set_rows.cpp +++ b/ggml/src/ggml-openvino/openvino/op/set_rows.cpp @@ -1,3 +1,7 @@ +#include "../node_context.hpp" +#include "../op_table.hpp" +#include "../utils.hpp" + #include #include #include @@ -15,16 +19,12 @@ #include #include -#include "../node_context.hpp" -#include "../op_table.hpp" -#include "../utils.hpp" - namespace ov { namespace frontend { namespace ggml { namespace op { -OutputVector translate_set_rows(const NodeContext& context) { +OutputVector translate_set_rows(const NodeContext & context) { num_inputs_check(context, 3, 3); auto data = context.get_input(0); @@ -44,8 +44,7 @@ OutputVector translate_set_rows(const NodeContext& context) { Output res; if (context.is_static()) { auto dst_reshaped = std::make_shared( - dst, - ov::op::v0::Constant::create(ov::element::i64, {2}, {(int64_t) dst_shape[1], (int64_t) dst_shape[2]}), + dst, ov::op::v0::Constant::create(ov::element::i64, {2}, {(int64_t) dst_shape[1], (int64_t) dst_shape[2]}), false); auto indices_reshaped = std::make_shared(indices, ov::op::v0::Constant::create(ov::element::i64, {2}, {0, 1})); @@ -55,7 +54,8 @@ OutputVector translate_set_rows(const NodeContext& context) { auto updated = std::make_shared(dst_reshaped, indices_reshaped, data_reshaped, zero); res = std::make_shared(updated, std::make_shared(dst), false); } else { - assert(dst.get_partial_shape().rank() == 4 && dst.get_partial_shape()[2].is_static() && dst.get_partial_shape()[3].is_static()); + assert(dst.get_partial_shape().rank() == 4 && dst.get_partial_shape()[2].is_static() && + dst.get_partial_shape()[3].is_static()); int64_t dim2 = dst.get_partial_shape()[2].get_length(); int64_t dim3 = dst.get_partial_shape()[3].get_length(); data = std::make_shared( diff --git a/ggml/src/ggml-openvino/openvino/op/softmax.cpp b/ggml/src/ggml-openvino/openvino/op/softmax.cpp index 1aa3bf76a0..6c43054050 100644 --- a/ggml/src/ggml-openvino/openvino/op/softmax.cpp +++ b/ggml/src/ggml-openvino/openvino/op/softmax.cpp @@ -1,3 +1,7 @@ +#include "../node_context.hpp" +#include "../op_table.hpp" +#include "../utils.hpp" + #include #include #include @@ -13,16 +17,12 @@ #include #include -#include "../node_context.hpp" -#include "../op_table.hpp" -#include "../utils.hpp" - namespace ov { namespace frontend { namespace ggml { namespace op { -OutputVector translate_soft_max(const NodeContext& context) { +OutputVector translate_soft_max(const NodeContext & context) { num_inputs_check(context, 1, 2); auto input_node = context.get_input(0).get_node_shared_ptr(); @@ -30,9 +30,9 @@ OutputVector translate_soft_max(const NodeContext& context) { float scale = 1.0f; float max_bias = 0.0f; - auto* op_params = context.get_output_op_params(0); - memcpy(&scale, (float*) op_params + 0, sizeof(float)); - memcpy(&max_bias, (float*) op_params + 1, sizeof(float)); + auto * op_params = context.get_output_op_params(0); + memcpy(&scale, (float *) op_params + 0, sizeof(float)); + memcpy(&max_bias, (float *) op_params + 1, sizeof(float)); auto src0_shape = context.get_input_shape(0).get_shape(); const uint32_t h = src0_shape[2]; const uint32_t n_head = src0_shape[0]; diff --git a/ggml/src/ggml-openvino/openvino/op/transpose.cpp b/ggml/src/ggml-openvino/openvino/op/transpose.cpp index 
c585dffa6e..6b4f8a849b 100644 --- a/ggml/src/ggml-openvino/openvino/op/transpose.cpp +++ b/ggml/src/ggml-openvino/openvino/op/transpose.cpp @@ -1,15 +1,15 @@ -#include - #include "../node_context.hpp" #include "../op_table.hpp" #include "../utils.hpp" +#include + namespace ov { namespace frontend { namespace ggml { namespace op { -OutputVector translate_transpose(const NodeContext& context) { +OutputVector translate_transpose(const NodeContext & context) { num_inputs_check(context, 1, 1); auto res = std::make_shared(context.get_input(0), diff --git a/ggml/src/ggml-openvino/openvino/op/unary_silu.cpp b/ggml/src/ggml-openvino/openvino/op/unary_silu.cpp index 2b27c0be12..b2214fa930 100644 --- a/ggml/src/ggml-openvino/openvino/op/unary_silu.cpp +++ b/ggml/src/ggml-openvino/openvino/op/unary_silu.cpp @@ -1,17 +1,17 @@ -#include -#include -#include - #include "../node_context.hpp" #include "../op_table.hpp" #include "../utils.hpp" +#include +#include +#include + namespace ov { namespace frontend { namespace ggml { namespace op { -OutputVector translate_unary_silu(const NodeContext& context) { +OutputVector translate_unary_silu(const NodeContext & context) { num_inputs_check(context, 1, 1); auto input = context.get_input(0); diff --git a/ggml/src/ggml-openvino/openvino/op/view.cpp b/ggml/src/ggml-openvino/openvino/op/view.cpp index 034b6df119..b53abca7e9 100644 --- a/ggml/src/ggml-openvino/openvino/op/view.cpp +++ b/ggml/src/ggml-openvino/openvino/op/view.cpp @@ -6,12 +6,13 @@ namespace frontend { namespace ggml { namespace op { -OutputVector translate_view(const NodeContext& context) { +OutputVector translate_view(const NodeContext & context) { num_inputs_check(context, 1, 1); if (context.get_op_case() == 2) { auto dst_shape = context.get_output_shape(0).to_shape(); - return rename_outputs_with_suffix({process_view_input(context, 0, dst_shape[1] * dst_shape[2])}, context.get_name()); + return rename_outputs_with_suffix({process_view_input(context, 0, dst_shape[1] * dst_shape[2])}, + context.get_name()); } return {context.get_input(0)}; } diff --git a/ggml/src/ggml-openvino/openvino/op_table.cpp b/ggml/src/ggml-openvino/openvino/op_table.cpp index e36e8f17cc..8aeb060aa5 100644 --- a/ggml/src/ggml-openvino/openvino/op_table.cpp +++ b/ggml/src/ggml-openvino/openvino/op_table.cpp @@ -1,5 +1,7 @@ #include "op_table.hpp" +#include "utils.hpp" + #include #include #include @@ -7,8 +9,6 @@ #include #include -#include "utils.hpp" - namespace ov { namespace frontend { namespace ggml { diff --git a/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.cpp b/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.cpp index 4759e86e1e..375bbbd735 100644 --- a/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.cpp +++ b/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.cpp @@ -1,15 +1,15 @@ #include "eliminate_zp.hpp" #include +#include #include -#include -#include -#include #include #include #include #include -#include +#include +#include +#include namespace ov { namespace frontend { @@ -35,13 +35,17 @@ EliminateZeroPoints::EliminateZeroPoints() { auto m_scale = ov::pass::pattern::any_input(); auto m_multiply = ov::pass::pattern::wrap_type({m_scale, m_subtract}); - const auto callback = [=](ov::pass::pattern::Matcher& m) { - const auto& pattern_map = m.get_pattern_value_map(); + const auto callback = [=](ov::pass::pattern::Matcher & m) { + const auto & pattern_map = m.get_pattern_value_map(); - auto multiply_node = std::dynamic_pointer_cast(pattern_map.at(m_multiply).get_node_shared_ptr()); - auto subtract_node = 
std::dynamic_pointer_cast(pattern_map.at(m_subtract).get_node_shared_ptr()); - auto data_constant = std::dynamic_pointer_cast(pattern_map.at(m_data_constant).get_node_shared_ptr()); - auto zp_constant = std::dynamic_pointer_cast(pattern_map.at(m_zp_constant).get_node_shared_ptr()); + auto multiply_node = + std::dynamic_pointer_cast(pattern_map.at(m_multiply).get_node_shared_ptr()); + auto subtract_node = + std::dynamic_pointer_cast(pattern_map.at(m_subtract).get_node_shared_ptr()); + auto data_constant = + std::dynamic_pointer_cast(pattern_map.at(m_data_constant).get_node_shared_ptr()); + auto zp_constant = + std::dynamic_pointer_cast(pattern_map.at(m_zp_constant).get_node_shared_ptr()); if (!multiply_node || !subtract_node || !data_constant || !zp_constant) { return false; @@ -101,14 +105,16 @@ EliminateZeroPoints::EliminateZeroPoints() { new_constant = std::make_shared(target_type, data_shape, adjusted_values); } - auto new_convert = std::make_shared(new_constant, subtract_node->get_output_element_type(0)); + auto new_convert = + std::make_shared(new_constant, subtract_node->get_output_element_type(0)); ov::replace_node(subtract_node, new_convert); return true; }; - register_matcher(std::make_shared(m_multiply, "ov::frontend::ggml::pass::EliminateZeroPoints"), - callback); + register_matcher( + std::make_shared(m_multiply, "ov::frontend::ggml::pass::EliminateZeroPoints"), + callback); } } // namespace pass diff --git a/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.cpp b/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.cpp index f38c0837d1..3e5730c90f 100644 --- a/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.cpp +++ b/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.cpp @@ -33,8 +33,8 @@ FuseToSDPA::FuseToSDPA() { const auto m_v = ov::pass::pattern::any_input(); const auto m_qkv = ov::pass::pattern::wrap_type({m_softmax_qk_f16, m_v}); - const auto callback = [=](ov::pass::pattern::Matcher& m) { - auto& pattern_to_output = m.get_pattern_value_map(); + const auto callback = [=](ov::pass::pattern::Matcher & m) { + auto & pattern_to_output = m.get_pattern_value_map(); auto k = pattern_to_output[m_k]; auto q = pattern_to_output[m_q]; auto v = pattern_to_output[m_v]; diff --git a/ggml/src/ggml-openvino/openvino/translate_session.cpp b/ggml/src/ggml-openvino/openvino/translate_session.cpp index e35599084e..67c5b4a51b 100644 --- a/ggml/src/ggml-openvino/openvino/translate_session.cpp +++ b/ggml/src/ggml-openvino/openvino/translate_session.cpp @@ -1,5 +1,11 @@ #include "translate_session.hpp" +#include "ggml-openvino/openvino/node_context.hpp" +#include "ggml-openvino/openvino/utils.hpp" +#include "input_model.hpp" +#include "pass/eliminate_zp.hpp" +#include "pass/mark_decompression_convert_constant_folding.hpp" + #include #include #include @@ -25,12 +31,6 @@ #include #include -#include "ggml-openvino/openvino/node_context.hpp" -#include "ggml-openvino/openvino/utils.hpp" -#include "input_model.hpp" -#include "pass/eliminate_zp.hpp" -#include "pass/mark_decompression_convert_constant_folding.hpp" - namespace ov { namespace frontend { namespace ggml { @@ -40,16 +40,17 @@ using namespace ov::op; namespace { ov::pass::MakeStateful::ParamResPairs get_kv_param_res_pairs( - const std::shared_ptr& model, const std::map& kv_param_res_names) { + const std::shared_ptr & model, + const std::map & kv_param_res_names) { ov::pass::MakeStateful::ParamResPairs pairs; - const auto& params = model->get_parameters(); - const auto& results = model->get_results(); + const auto & params = 
model->get_parameters(); + const auto & results = model->get_results(); - for (const auto& param_res : kv_param_res_names) { - const auto& param_name = param_res.first; - const auto& res_name = param_res.second; + for (const auto & param_res : kv_param_res_names) { + const auto & param_name = param_res.first; + const auto & res_name = param_res.second; - auto param_it = std::find_if(params.begin(), params.end(), [&](const std::shared_ptr& node) { + auto param_it = std::find_if(params.begin(), params.end(), [&](const std::shared_ptr & node) { return node->get_friendly_name() == param_name; }); @@ -57,7 +58,7 @@ ov::pass::MakeStateful::ParamResPairs get_kv_param_res_pairs( " is not associated with any of " "Parameters in the network."); - auto res_it = std::find_if(results.begin(), results.end(), [&](const std::shared_ptr& node) { + auto res_it = std::find_if(results.begin(), results.end(), [&](const std::shared_ptr & node) { return node->get_friendly_name() == res_name; }); @@ -72,17 +73,17 @@ ov::pass::MakeStateful::ParamResPairs get_kv_param_res_pairs( return pairs; } -void add_token_len(TensorMap& tensor_map) { +void add_token_len(TensorMap & tensor_map) { auto inp_tokens = tensor_map.at("inp_tokens").get_node_shared_ptr(); auto token_len = get_dimensions(inp_tokens, {2}); token_len->set_friendly_name("token_len"); tensor_map.insert({"token_len", token_len->output(0)}); } -void add_sliced_mask(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) { +void add_sliced_mask(TensorMap & tensor_map, GgmlDecoder & ggml_model_decoder) { auto token_len = tensor_map.at("token_len").get_node_shared_ptr(); - auto create_sliced_mask = [&](const std::string& mask_name, const std::string& sliced_name, bool is_static) { + auto create_sliced_mask = [&](const std::string & mask_name, const std::string & sliced_name, bool is_static) { if (tensor_map.find(mask_name) != tensor_map.end()) { auto mask = tensor_map.at(mask_name).get_node_shared_ptr(); std::shared_ptr mask_sliced; @@ -110,8 +111,7 @@ void add_sliced_mask(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) { kv_len = std::make_shared(kv_len, one_1d); auto stop = std::make_shared(ov::OutputVector{token_len, kv_len}, 0); - mask_sliced = - std::make_shared(mask, zero_2d, stop, one_2d, axes); + mask_sliced = std::make_shared(mask, zero_2d, stop, one_2d, axes); mask_sliced = std::make_shared(mask_sliced, zero_1d); mask_sliced = std::make_shared(mask_sliced, ov::element::f16); mask_sliced->set_friendly_name(sliced_name); @@ -125,8 +125,8 @@ void add_sliced_mask(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) { // create_sliced_mask("KQ_mask_swa", "KQ_mask_swa_sliced", ggml_model_decoder.is_static()); } -void add_rope_sin_cos(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) { - int32_t* rope_params = ggml_model_decoder.get_rope_params(); +void add_rope_sin_cos(TensorMap & tensor_map, GgmlDecoder & ggml_model_decoder) { + int32_t * rope_params = ggml_model_decoder.get_rope_params(); auto inp_pos = tensor_map.at("inp_pos").get_node_shared_ptr(); std::shared_ptr rope_freqs_weight; if (tensor_map.find("rope_freqs_weight") != tensor_map.end()) { @@ -144,7 +144,7 @@ void add_rope_sin_cos(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) { } // Create common patterns -void preprocess(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) { +void preprocess(TensorMap & tensor_map, GgmlDecoder & ggml_model_decoder) { add_token_len(tensor_map); add_sliced_mask(tensor_map, ggml_model_decoder); add_rope_sin_cos(tensor_map, 
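The sliced-mask helper above shrinks the padded attention mask to the live window before it reaches the attention op. A plain-C++ sketch of the window the Slice subgraph extracts (sizes illustrative, row-major layout assumed):

#include <cstddef>
#include <vector>

// Scalar equivalent of the sliced mask: take the leading
// [token_len, kv_len] block out of a [ctx, ctx] mask.
std::vector<float> slice_mask(const std::vector<float> & mask, std::size_t ctx,
                              std::size_t token_len, std::size_t kv_len) {
    std::vector<float> out(token_len * kv_len);
    for (std::size_t i = 0; i < token_len; ++i) {
        for (std::size_t j = 0; j < kv_len; ++j) {
            out[i * kv_len + j] = mask[i * ctx + j];
        }
    }
    return out;
}
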
ggml_model_decoder); @@ -152,8 +152,8 @@ void preprocess(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) { } // namespace -TranslateSession::TranslateSession(const frontend::InputModel::Ptr& input_model, - const std::unordered_map& translator_map, +TranslateSession::TranslateSession(const frontend::InputModel::Ptr & input_model, + const std::unordered_map & translator_map, bool naive) : m_input_model(input_model), m_translator_map(translator_map), @@ -168,26 +168,26 @@ std::shared_ptr TranslateSession::get_converted_model() { return m_ov_model; } -std::shared_ptr TranslateSession::translate_graph(const frontend::InputModel::Ptr& input_model) { +std::shared_ptr TranslateSession::translate_graph(const frontend::InputModel::Ptr & input_model) { ov::ParameterVector params; ov::ResultVector results; auto tensor_map = std::make_shared(); std::shared_ptr resulting_model; - const auto& ggml_model = std::dynamic_pointer_cast(input_model); + const auto & ggml_model = std::dynamic_pointer_cast(input_model); std::shared_ptr ggml_model_decoder = ggml_model->get_model_decoder(); - for (const auto& it : ggml_model_decoder->get_model_inputs()) { + for (const auto & it : ggml_model_decoder->get_model_inputs()) { params.push_back(std::dynamic_pointer_cast(it.second)); (*tensor_map)[it.first] = it.second; } - for (const auto& it : ggml_model_decoder->get_model_extra_inputs()) { + for (const auto & it : ggml_model_decoder->get_model_extra_inputs()) { params.push_back(std::dynamic_pointer_cast(it.second)); (*tensor_map)[it.first] = it.second; } - for (const auto& it : ggml_model_decoder->get_model_weights()) { + for (const auto & it : ggml_model_decoder->get_model_weights()) { (*tensor_map)[it.first] = it.second; } @@ -199,22 +199,15 @@ std::shared_ptr TranslateSession::translate_graph(const frontend::InputMo ov::OutputVector converted_outputs; auto it = m_translator_map.find(operation_type); - FRONT_END_OP_CONVERSION_CHECK(it != m_translator_map.end(), - "Translation for operation type ", - operation_type, + FRONT_END_OP_CONVERSION_CHECK(it != m_translator_map.end(), "Translation for operation type ", operation_type, " is not implemented."); NodeContext node_context(node, tensor_map, this); converted_outputs = it->second(node_context); - const auto& node_output_names = node->get_output_names(); - FRONT_END_OP_CONVERSION_CHECK(node_output_names.size() == converted_outputs.size(), - "Number of ", - operation_type, - " outputs greater than number of converted outputs, which are ", - node_output_names.size(), - " and ", - converted_outputs.size(), - " respectively."); + const auto & node_output_names = node->get_output_names(); + FRONT_END_OP_CONVERSION_CHECK(node_output_names.size() == converted_outputs.size(), "Number of ", + operation_type, " outputs greater than number of converted outputs, which are ", + node_output_names.size(), " and ", converted_outputs.size(), " respectively."); for (size_t i = 0; i < node_output_names.size(); ++i) { auto output_name = node_output_names[i]; @@ -229,10 +222,9 @@ std::shared_ptr TranslateSession::translate_graph(const frontend::InputMo } ggml_model_decoder->visit_subgraph(node_visitor); - for (const auto& name : ggml_model_decoder->get_model_output_names()) { + for (const auto & name : ggml_model_decoder->get_model_output_names()) { FRONT_END_GENERAL_CHECK(tensor_map->find(name) != tensor_map->end(), - "Output name not found in tensor map: ", - name); + "Output name not found in tensor map: ", name); auto result = std::make_shared(tensor_map->at(name)); 
result->set_friendly_name(name); results.push_back(result); diff --git a/ggml/src/ggml-openvino/openvino/utils.cpp b/ggml/src/ggml-openvino/openvino/utils.cpp index f70cb91a17..1723c7d003 100644 --- a/ggml/src/ggml-openvino/openvino/utils.cpp +++ b/ggml/src/ggml-openvino/openvino/utils.cpp @@ -1,5 +1,7 @@ #include "utils.hpp" +#include "ggml-impl.h" + #include #include #include @@ -17,8 +19,6 @@ #include #include -#include "ggml-impl.h" - namespace ov { namespace frontend { namespace ggml { @@ -30,7 +30,7 @@ std::string getCurrentTime() { return buf; } -void num_inputs_check(const NodeContext& context, size_t min_inputs, size_t max_inputs) { +void num_inputs_check(const NodeContext & context, size_t min_inputs, size_t max_inputs) { auto input_size = context.get_input_size(); FRONT_END_OP_CONVERSION_CHECK(input_size >= min_inputs, "Got less inputs than expected"); FRONT_END_OP_CONVERSION_CHECK(input_size <= max_inputs, "Got more inputs than expected"); @@ -48,20 +48,20 @@ int non_cont_dim(std::vector ne, std::vector nb) { return 0; } -std::shared_ptr get_dimensions(const std::shared_ptr& shape, - const std::vector& dims) { +std::shared_ptr get_dimensions(const std::shared_ptr & shape, + const std::vector & dims) { using namespace ov::op; const auto zero = v0::Constant::create(ov::element::i32, ov::Shape{}, {0}); const auto dims_const = v0::Constant::create(ov::element::i32, ov::Shape{dims.size()}, dims); return std::make_shared(shape, dims_const, zero); } -std::shared_ptr get_dimensions(const std::shared_ptr& node, const std::vector& dims) { +std::shared_ptr get_dimensions(const std::shared_ptr & node, const std::vector & dims) { return get_dimensions(std::make_shared(node), dims); } -OutputVector rename_outputs_with_suffix(const OutputVector& outputs, const std::string& suffix) { - for (const auto& output : outputs) { +OutputVector rename_outputs_with_suffix(const OutputVector & outputs, const std::string & suffix) { + for (const auto & output : outputs) { auto node = output.get_node_shared_ptr(); std::string name = node->get_friendly_name(); name += "_"; @@ -111,7 +111,7 @@ void ggml_rope_yarn_corr_dims(int n_dims, } } // namespace -std::pair, ov::Output> make_sin_cos(int32_t* rope_params, +std::pair, ov::Output> make_sin_cos(int32_t * rope_params, std::shared_ptr inp_pos, std::shared_ptr rope_freqs_weight) { inp_pos = std::make_shared(inp_pos, ov::element::f32); @@ -179,11 +179,11 @@ std::pair, ov::Output> make_sin_cos(int32_t* rope_params, return std::make_pair(sin_theta, cos_theta); } -ov::Output process_view_input(const NodeContext& context, int input_index, int slice_len) { +ov::Output process_view_input(const NodeContext & context, int input_index, int slice_len) { // Only works for VIEW operations that slice at the lowest dimension // If the VIEW also reshape the result, `slice_len` should be provided auto input = context.get_input(input_index); - int32_t* op_params = context.get_input_op_params(input_index); + int32_t * op_params = context.get_input_op_params(input_index); auto src1_stride = context.get_input_stride(input_index); int64_t split_addr = op_params[0] / src1_stride[2]; diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 9b000f26d5..eb9ea9fee9 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -1,5 +1,11 @@ #include "utils.h" +#include "ggml-impl.h" +#include "ggml-openvino/ggml-decoder.h" +#include "ggml.h" +#include "openvino/frontend.hpp" +#include "openvino/input_model.hpp" + #include #include 
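make_sin_cos above follows the usual RoPE construction; a minimal sketch of the core recurrence, ignoring the YaRN corrections and the optional rope_freqs_weight that the real code handles:

#include <cmath>
#include <utility>
#include <vector>

// Base RoPE angles for one position: theta_i = pos * freq_base^(-2*i / n_dims),
// producing one (sin, cos) pair per rotated dimension pair.
std::vector<std::pair<float, float>> rope_sin_cos(float pos, int n_dims, float freq_base) {
    std::vector<std::pair<float, float>> sin_cos(n_dims / 2);
    for (int i = 0; i < n_dims / 2; ++i) {
        float theta = pos * std::pow(freq_base, -2.0f * i / n_dims);
        sin_cos[i] = {std::sin(theta), std::cos(theta)};
    }
    return sin_cos;
}
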
#include @@ -23,15 +29,9 @@ #include #include -#include "ggml-impl.h" -#include "ggml-openvino/ggml-decoder.h" -#include "ggml.h" -#include "openvino/frontend.hpp" -#include "openvino/input_model.hpp" - -ov::Tensor convert_ggml_input_to_ov(std::shared_ptr ggml_decoder, const std::string& name) { - const auto* ggml_tensor = ggml_decoder->get_input_ggml_tensor(name); - auto* input_data = ggml_tensor->data; +ov::Tensor convert_ggml_input_to_ov(std::shared_ptr ggml_decoder, const std::string & name) { + const auto * ggml_tensor = ggml_decoder->get_input_ggml_tensor(name); + auto * input_data = ggml_tensor->data; ov::Shape input_shape; if (name.find("cache_k") == 0 || name.find("cache_v") == 0) { input_shape = ggml_decoder->get_graph_input_shape(ggml_tensor).to_shape(); @@ -45,13 +45,14 @@ ov::Tensor convert_ggml_input_to_ov(std::shared_ptr ggml_decoder, return input_tensor; } -std::map get_ggml_graph_output_dst(std::shared_ptr ggml_decoder) { - std::map output_tensors; +std::map get_ggml_graph_output_dst(std::shared_ptr ggml_decoder) { + std::map output_tensors; + auto output_names = ggml_decoder->get_model_output_names(); for (size_t inp = 0; inp < output_names.size(); ++inp) { auto name = output_names[inp]; - const auto* tensor = ggml_decoder->get_output_ggml_tensor(name); - auto* output_data = tensor->view_src ? tensor->view_src->data : tensor->data; + const auto * tensor = ggml_decoder->get_output_ggml_tensor(name); + auto * output_data = tensor->view_src ? tensor->view_src->data : tensor->data; output_tensors[name] = output_data; } return output_tensors; @@ -63,14 +64,14 @@ static ov::frontend::FrontEnd::Ptr get_ggml_frontend() { return front_end; } -enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_cgraph* cgraph) { +enum ggml_status openvino_frontend_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { static ov::Core core; static std::string device = getenv("GGML_OPENVINO_DEVICE") ? 
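The input path above is zero-copy: the ov::Tensor constructor that takes a host pointer aliases the existing ggml buffer, so no memcpy happens when inputs are set. A minimal sketch:

#include <openvino/runtime/tensor.hpp>

// The returned tensor views `data` without owning or copying it; the ggml
// buffer must therefore outlive the inference call that consumes it.
ov::Tensor wrap_external(const ov::element::Type & type, const ov::Shape & shape, void * data) {
    return ov::Tensor(type, shape, data);
}
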
getenv("GGML_OPENVINO_DEVICE") : ""; if (device.empty()) { - const std::vector preferred_device = { "GPU", "CPU", "NPU" }; + const std::vector preferred_device = {"GPU", "CPU", "NPU"}; const auto available_devices = core.get_available_devices(); - for (const auto& dev : preferred_device) { + for (const auto & dev : preferred_device) { if (std::find(available_devices.begin(), available_devices.end(), dev) != available_devices.end()) { device = dev; break; @@ -92,17 +93,17 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c auto start_time = ggml_time_us(); - auto* cache_dir = getenv("GGML_OPENVINO_CACHE_DIR"); + auto * cache_dir = getenv("GGML_OPENVINO_CACHE_DIR"); if (cache_dir && !is_static) { core.set_property(ov::cache_dir(cache_dir)); } static std::mutex cache_mutex; - static std::unordered_map> infer_request_cache; - static std::unordered_map> ov_input_names_cache; - static std::unordered_map> ov_output_names_cache; + static std::unordered_map> infer_request_cache; + static std::unordered_map> ov_input_names_cache; + static std::unordered_map> ov_output_names_cache; // For NPU, store the kvcache model, since we cannot create two infer_request - static std::unordered_map compiled_model_cache; + static std::unordered_map compiled_model_cache; std::shared_ptr ggml_decoder; ov::InferRequest infer_request; @@ -181,7 +182,7 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c ov::serialize(model, timestamped_filename); } - auto* disable_sdpa_optimization = getenv("GGML_OPENVINO_DISABLE_SDPA_OPTIMIZATION"); + auto * disable_sdpa_optimization = getenv("GGML_OPENVINO_DISABLE_SDPA_OPTIMIZATION"); if (disable_sdpa_optimization && std::string(disable_sdpa_optimization) != "0") { config = { {"GPU_ENABLE_SDPA_OPTIMIZATION", "0"} @@ -196,10 +197,10 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c std::vector ov_input_names; std::vector ov_output_names; - for (const auto& ov_param : model->get_parameters()) { + for (const auto & ov_param : model->get_parameters()) { ov_input_names.push_back(ov_param->get_friendly_name()); } - for (const auto& ov_output : model->get_results()) { + for (const auto & ov_output : model->get_results()) { ov_output_names.push_back(ov_output->get_friendly_name()); } ov_input_names_cache[cgraph] = ov_input_names; @@ -225,7 +226,7 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c auto gguf_tensor_addrs = get_ggml_graph_output_dst(ggml_decoder); for (size_t i = 0; i < ov_output_names.size(); i++) { - auto& result_name = ov_output_names[i]; + auto & result_name = ov_output_names[i]; const auto output_tensor = infer_request.get_output_tensor(i); std::memcpy(gguf_tensor_addrs[result_name], output_tensor.data(), output_tensor.get_byte_size()); @@ -278,7 +279,7 @@ ov::AnyMap get_npu_generate_config() { return config; } -std::map get_types_to_requant(const std::string& device) { +std::map get_types_to_requant(const std::string & device) { if (device == "NPU") { return { {GGML_TYPE_Q4_0, ExtraQuantType::Q4_0_128}, @@ -297,15 +298,15 @@ std::map get_types_to_requant(const std::string& devi return {}; } -bool is_naive(struct ggml_cgraph* cgraph) { +bool is_naive(ggml_cgraph * cgraph) { constexpr int naive_graph_size_threshold = 20; return cgraph->n_nodes < naive_graph_size_threshold; } -enum ggml_status naive_compute(struct ggml_cgraph* cgraph, - ov::Core& core, - const std::string& device, - const ov::AnyMap& config) { +enum ggml_status 
naive_compute(ggml_cgraph * cgraph, + ov::Core & core, + const std::string & device, + const ov::AnyMap & config) { if (cgraph->n_nodes == 1 && (cgraph->nodes[0]->op == GGML_OP_NONE || cgraph->nodes[0]->op == GGML_OP_VIEW)) { return GGML_STATUS_SUCCESS; } @@ -343,7 +344,7 @@ enum ggml_status naive_compute(struct ggml_cgraph* cgraph, return GGML_STATUS_SUCCESS; } -ov::Tensor get_ov_input_tensor(std::shared_ptr ggml_decoder, const std::string& param_name) { +ov::Tensor get_ov_input_tensor(std::shared_ptr ggml_decoder, const std::string & param_name) { bool is_static = ggml_decoder->is_static(); bool is_first_token = ggml_decoder->is_first_token(); @@ -358,10 +359,10 @@ ov::Tensor get_ov_input_tensor(std::shared_ptr ggml_decoder, cons if (param_name == "inp_tokens" || param_name == "inp_pos") { if (is_first_token) { size_t context_size = ggml_decoder->get_context_size(); - const auto* input_tensor_ggml = ggml_decoder->get_input_ggml_tensor(param_name); + const auto * input_tensor_ggml = ggml_decoder->get_input_ggml_tensor(param_name); std::vector padded_data = pad_input(input_tensor_ggml, 1, context_size, 0); input_tensor = ov::Tensor(ov::element::i32, ov::Shape{1, 1, context_size}); - auto* data_ptr = input_tensor.data(); + auto * data_ptr = input_tensor.data(); std::copy(padded_data.begin(), padded_data.end(), data_ptr); } else { input_tensor = convert_ggml_input_to_ov(ggml_decoder, param_name); @@ -369,22 +370,22 @@ ov::Tensor get_ov_input_tensor(std::shared_ptr ggml_decoder, cons } else if (param_name.find("KQ_mask") == 0) { size_t context_size = ggml_decoder->get_context_size(); - const auto* input_tensor_ggml = ggml_decoder->get_input_ggml_tensor(param_name); + const auto * input_tensor_ggml = ggml_decoder->get_input_ggml_tensor(param_name); if (is_first_token) { std::vector padded_data = pad_input(input_tensor_ggml, context_size, context_size, -INFINITY); set_zero_diagonal(padded_data, context_size); input_tensor = ov::Tensor(ov::element::f32, ov::Shape{1, context_size, context_size}); - auto* data_ptr = input_tensor.data(); + auto * data_ptr = input_tensor.data(); std::copy(padded_data.begin(), padded_data.end(), data_ptr); } else { std::vector padded_data = pad_input(input_tensor_ggml, 1, context_size, -INFINITY); input_tensor = ov::Tensor(ov::element::f32, ov::Shape{1, 1, context_size}); - auto* data_ptr = input_tensor.data(); + auto * data_ptr = input_tensor.data(); std::copy(padded_data.begin(), padded_data.end(), data_ptr); } - } else if (const auto* op = ggml_decoder->get_tensor_used_op(ggml_decoder->get_tensor_from_name(param_name)); + } else if (const auto * op = ggml_decoder->get_tensor_used_op(ggml_decoder->get_tensor_from_name(param_name)); op && op->op == GGML_OP_SET_ROWS && is_static && is_first_token) { input_tensor = ov::Tensor(ov::element::i64, ov::Shape{1, 1, 1}); } else { @@ -394,8 +395,8 @@ ov::Tensor get_ov_input_tensor(std::shared_ptr ggml_decoder, cons return input_tensor; } -size_t checksum(const void* data, size_t size) { - const uint8_t* bytes = static_cast(data); +size_t checksum(const void * data, size_t size) { + const uint8_t * bytes = static_cast(data); size_t sum = 0; for (size_t i = 0; i < size; ++i) { sum += (uint8_t) i; @@ -408,36 +409,37 @@ size_t checksum(const void* data, size_t size) { #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wdeprecated-declarations" -void print_input_tensor_info(const std::string& name, const ov::Tensor& tensor) { +void print_input_tensor_info(const std::string & name, const ov::Tensor & tensor) { std::cout 
<< "Input name: " << name << ", Input shape: " << tensor.get_shape() << ", Address: " << tensor.data() << std::endl; switch (tensor.get_element_type()) { - case ov::element::f32: - std::cout << *(tensor.data()) << std::endl; - break; - case ov::element::f16: - std::cout << *(tensor.data()) << std::endl; - break; - case ov::element::i32: - for (size_t i = 0; i < tensor.get_size(); ++i) { - std::cout << tensor.data()[i] << " "; - } - std::cout << std::endl; - break; - case ov::element::i64: - std::cout << *(tensor.data()) << std::endl; - break; - default: - break; + case ov::element::f32: + std::cout << *(tensor.data()) << std::endl; + break; + case ov::element::f16: + std::cout << *(tensor.data()) << std::endl; + break; + case ov::element::i32: + for (size_t i = 0; i < tensor.get_size(); ++i) { + std::cout << tensor.data()[i] << " "; + } + std::cout << std::endl; + break; + case ov::element::i64: + std::cout << *(tensor.data()) << std::endl; + break; + default: + break; } } -void print_output_tensor_info(const std::string& name, const ov::Tensor& tensor, - std::map& output_dst) { +void print_output_tensor_info(const std::string & name, + const ov::Tensor & tensor, + std::map & output_dst) { std::cout << "Output name: " << name << ", Output shape: " << tensor.get_shape() << ", Address: " << output_dst[name] << std::endl; - auto print_float_stats = [](const std::string& type_name, size_t size, auto get_value) { + auto print_float_stats = [](const std::string & type_name, size_t size, auto get_value) { if (size == 0) { return; } @@ -467,13 +469,13 @@ void print_output_tensor_info(const std::string& name, const ov::Tensor& tensor, switch (tensor.get_element_type()) { case ov::element::f32: { - const float* data = tensor.data(); + const float * data = tensor.data(); size_t size = tensor.get_size(); print_float_stats("[f32]", size, [data](size_t i) { return data[i]; }); break; } case ov::element::f16: { - const ov::float16* data = tensor.data(); + const ov::float16 * data = tensor.data(); size_t size = tensor.get_size(); print_float_stats("[f16]", size, [data](size_t i) { return static_cast(data[i]); }); break; @@ -485,17 +487,17 @@ void print_output_tensor_info(const std::string& name, const ov::Tensor& tensor, #pragma GCC diagnostic pop -void set_zero_diagonal(std::vector& matrix, size_t dim) { +void set_zero_diagonal(std::vector & matrix, size_t dim) { for (size_t i = 0; i < dim; ++i) { matrix[i * dim + i] = 0.0f; } } -bool is_prefill(struct ggml_cgraph* cgraph) { +bool is_prefill(ggml_cgraph * cgraph) { for (int i = 0; i < cgraph->n_nodes; ++i) { - auto* op = cgraph->nodes[i]; + auto * op = cgraph->nodes[i]; for (int j = 0; j < GGML_MAX_SRC; ++j) { - auto* src = op->src[j]; + auto * src = op->src[j]; if (src == nullptr) { break; } diff --git a/ggml/src/ggml-openvino/utils.h b/ggml/src/ggml-openvino/utils.h index 42686c593b..22f5cc8c34 100644 --- a/ggml/src/ggml-openvino/utils.h +++ b/ggml/src/ggml-openvino/utils.h @@ -1,32 +1,32 @@ -#include -#include - #include "ggml-backend-impl.h" #include "ggml-decoder.h" #include "ggml-impl.h" -enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_cgraph* cgraph); +#include +#include -std::shared_ptr get_ggml_decoder(struct ggml_cgraph* cgraph, bool is_static, bool is_first_token); +enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph); -ov::Tensor convert_ggml_input_to_ov(std::shared_ptr ggml_decoder, const std::string& name); +std::shared_ptr get_ggml_decoder(struct ggml_cgraph * 
cgraph, bool is_static, bool is_first_token); -std::map get_ggml_graph_output_dst(std::shared_ptr ggml_decoder); +ov::Tensor convert_ggml_input_to_ov(std::shared_ptr ggml_decoder, const std::string & name); -size_t checksum(const void* data, size_t size); +std::map get_ggml_graph_output_dst(std::shared_ptr ggml_decoder); -void print_input_tensor_info(const std::string& name, const ov::Tensor& tensor); +size_t checksum(const void * data, size_t size); -void print_output_tensor_info(const std::string& name, - const ov::Tensor& tensor, - std::map& output_dst); +void print_input_tensor_info(const std::string & name, const ov::Tensor & tensor); + +void print_output_tensor_info(const std::string & name, + const ov::Tensor & tensor, + std::map & output_dst); template -std::vector pad_input(const ggml_tensor* tensor, size_t padded_rows, size_t padded_cols, T pad_value) { +std::vector pad_input(const ggml_tensor * tensor, size_t padded_rows, size_t padded_cols, T pad_value) { std::vector padded_data(padded_rows * padded_cols, pad_value); size_t rows = tensor->ne[1]; size_t cols = tensor->ne[0]; - T* data = static_cast(tensor->data); + T * data = static_cast(tensor->data); for (size_t i = 0; i < std::min(rows, padded_rows); ++i) { for (size_t j = 0; j < std::min(cols, padded_cols); ++j) { @@ -36,18 +36,20 @@ std::vector pad_input(const ggml_tensor* tensor, size_t padded_rows, size_t p return padded_data; } -void set_zero_diagonal(std::vector& matrix, size_t dim); +void set_zero_diagonal(std::vector & matrix, size_t dim); bool is_prefill(struct ggml_cgraph * cgraph); ov::AnyMap get_npu_prefill_config(); ov::AnyMap get_npu_generate_config(); -std::map get_types_to_requant(const std::string& device); +std::map get_types_to_requant(const std::string & device); -ov::Tensor get_ov_input_tensor(std::shared_ptr ggml_decoder, const std::string& param_name); +ov::Tensor get_ov_input_tensor(std::shared_ptr ggml_decoder, const std::string & param_name); -bool is_naive(struct ggml_cgraph* cgraph); +bool is_naive(struct ggml_cgraph * cgraph); -enum ggml_status naive_compute(struct ggml_cgraph* cgraph, ov::Core& core, const std::string& device, - const ov::AnyMap& config); +enum ggml_status naive_compute(struct ggml_cgraph * cgraph, + ov::Core & core, + const std::string & device, + const ov::AnyMap & config); From b8690bc055a0b9c10e0efe9504328b2426b118cb Mon Sep 17 00:00:00 2001 From: Zijun Yu Date: Tue, 4 Nov 2025 15:19:09 +0800 Subject: [PATCH 163/254] NPU Unify PD (#14) * Stateless. Fix llama-cli llama-server * Simplify broadcast op in attention * Replace get_output_tensor+memcpy with set_output_tensor * NPU unify PD. 
Unify dynamic and static dims --- ggml/src/ggml-openvino/ggml-decoder.cpp | 81 ++-- ggml/src/ggml-openvino/ggml-decoder.h | 18 +- ggml/src/ggml-openvino/openvino/decoder.hpp | 1 - .../ggml-openvino/openvino/node_context.hpp | 7 +- .../openvino/op/flash_attn_ext.cpp | 52 +-- .../src/ggml-openvino/openvino/op/permute.cpp | 36 +- ggml/src/ggml-openvino/openvino/op/rope.cpp | 4 - .../ggml-openvino/openvino/op/set_rows.cpp | 12 +- .../openvino/translate_session.cpp | 12 +- ggml/src/ggml-openvino/utils.cpp | 355 +++++++----------- ggml/src/ggml-openvino/utils.h | 19 +- 11 files changed, 227 insertions(+), 370 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 392d45dd6b..8472f41a56 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -27,7 +27,6 @@ #include #include #include -#include #include #include #include @@ -39,7 +38,6 @@ GgmlOvDecoder::GgmlOvDecoder(ggml_tensor * node, ggml_cgraph * cgraph, bool is_static, - bool is_first_token, int context_size, int context_size_swa, int num_heads, @@ -55,25 +53,24 @@ GgmlOvDecoder::GgmlOvDecoder(ggml_tensor * node, m_num_heads(num_heads), m_num_heads_kv(num_heads_kv), m_head_size(head_size), - m_is_static(is_static), - m_is_first_token(is_first_token) { + m_is_static(is_static) { set_input_output(node); } GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph, std::map> & model_weights, - bool is_static, - bool is_first_token) : + bool is_static) : m_cgraph(cgraph), m_op_name(m_node ? std::string(m_node->name) : ""), m_model_weights(model_weights), - m_is_static(is_static), - m_is_first_token(is_first_token) { - if (is_first_token && getenv("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS")) { + m_is_static(is_static) { + if (auto * env = getenv("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS"); env && std::string(env) != "0") { + unsetenv("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS"); print_tensor_address_map(cgraph); } set_llm_params(); + validate_cgraph(); for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) { auto * cur_node = cgraph->nodes[node_n]; @@ -160,8 +157,8 @@ void GgmlOvDecoder::set_input_output(ggml_tensor * node, bool naive) { // Model outputs are tensors with GGML_TENSOR_FLAG_OUTPUT flag and kv_caches static std::set debug_output_names = {}; // Workaround: the final tensor "result_output" does not have GGML_TENSOR_FLAG_OUTPUT flag set in cgraph - if (node->op == GGML_OP_SET_ROWS || node->flags & GGML_TENSOR_FLAG_OUTPUT || node_name.find("result") == 0 || - debug_output_names.count(node_name)) { + if (node->op == GGML_OP_SET_ROWS || node->flags & GGML_TENSOR_FLAG_OUTPUT || + node_name.find("output") != std::string::npos || debug_output_names.count(node_name)) { if (node->op == GGML_OP_SET_ROWS) { assert(node_name.find("cache_k") == 0 || node_name.find("cache_v") == 0); if (auto it = std::find(m_kv_names.begin(), m_kv_names.end(), node_name); it == m_kv_names.end()) { @@ -285,53 +282,54 @@ void GgmlOvDecoder::set_llm_params() { } else { m_context_size = cache_k->ne[1]; } - } else if (node->op == GGML_OP_ROPE && - (name.find("Qcur-0") == 0 || std::string(node->src[0]->name).find("Qcur-0") == 0)) { - m_head_size = node->ne[0]; - m_num_heads = node->ne[1]; - m_rope_params = node->op_params; - } else if (node->op == GGML_OP_ROPE && - (name.find("Kcur-0") == 0 || std::string(node->src[0]->name).find("Kcur-0") == 0)) { - m_num_heads_kv = node->ne[1]; + } else if (node->op == GGML_OP_ROPE) { + if (name.find("Qcur-0") == 0 || 
std::string(node->src[0]->name).find("Qcur-0") == 0) { + m_head_size = node->ne[0]; + m_num_heads = node->ne[1]; + m_rope_params = node->op_params; + auto * inp_pos = node->src[1]; + m_input_len = inp_pos->ne[0]; + m_past_kv_len = *(int32_t *) inp_pos->data; + } else if (name.find("Kcur-0") == 0 || std::string(node->src[0]->name).find("Kcur-0") == 0) { + m_num_heads_kv = node->ne[1]; + } } } } +void GgmlOvDecoder::validate_cgraph() const { + if (m_is_static && m_input_len != 1) { + throw std::runtime_error("Static graph (NPU) must have input_len == 1, but got " + std::to_string(m_input_len) + + ", try set -ub 1"); + } +} + ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor * src) const { auto name = std::string(src->name); ov::PartialShape input_shape; - if (name == "inp_tokens" || name == "inp_pos") { - if (m_is_static) { - if (m_is_first_token) { - input_shape = ov::PartialShape{1, 1, m_context_size}; - } else { - input_shape = ov::PartialShape{1, 1, 1}; - } - } else { - input_shape = ov::PartialShape{1, 1, -1}; - } - } else if (name == "inp_out_ids" && !m_is_static) { - input_shape = ov::PartialShape{1, 1, -1}; + + if (name == "inp_tokens" || name == "inp_pos" || name == "inp_out_ids") { + input_shape = ov::PartialShape{1, 1, m_is_static ? 1 : -1}; + } else if (name.find("KQ_mask") == 0) { if (m_is_static) { - if (m_is_first_token) { - input_shape = ov::PartialShape{1, m_context_size, m_context_size}; - } else { - input_shape = ov::PartialShape{1, 1, m_context_size}; - } + input_shape = ov::PartialShape{1, 1, m_context_size}; } else { input_shape = ov::PartialShape{1, -1, -1}; } + } else if (name.find("cache_") == 0) { + auto past_token_len = -1; if (m_is_static) { int layer = extract_layer_from_name(name); bool is_swa = is_swa_layer(layer); - input_shape = ov::PartialShape{is_swa ? m_context_size_swa : m_context_size, m_num_heads_kv, m_head_size}; - } else { - input_shape = ov::PartialShape{1, -1, m_num_heads_kv, m_head_size}; + past_token_len = is_swa ? m_context_size_swa : m_context_size; } + input_shape = ov::PartialShape{past_token_len, m_num_heads_kv, m_head_size}; + } else if (const auto * op = get_tensor_used_op(src); op && op->op == GGML_OP_SET_ROWS) { input_shape = ov::PartialShape{1, 1, m_is_static ? 
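The bookkeeping above relies on inp_pos holding contiguous absolute token positions; a small sketch of the rule (values illustrative, struct name hypothetical):

#include <cstdint>

// inp_pos stores the absolute position of every token in this batch, e.g.
// {12, 13, 14}: 12 tokens are already in the KV cache and 3 are incoming,
// so past_kv_len = inp_pos[0] and input_len = ne[0].
struct SeqInfo {
    int past_kv_len;
    int input_len;
};

SeqInfo read_seq_info(const int32_t * inp_pos, int64_t n_pos) {
    return { inp_pos[0], static_cast<int>(n_pos) };
}
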
1 : -1}; + } else if (src->op == GGML_OP_VIEW) { // This case is added to make test-backend-ops work input_shape = ov::PartialShape{get_shape(src->view_src)}; @@ -745,9 +743,8 @@ int32_t * GgmlOvDecoder::get_output_op_params(const std::string & name) const { void GgmlOvDecoder::visit_subgraph(std::function)> node_visitor) const { for (const auto & node : m_nodes) { - auto decoder = - std::make_shared(node, m_cgraph, m_is_static, m_is_first_token, m_context_size, - m_context_size_swa, m_num_heads, m_num_heads_kv, m_head_size, m_swa_layers); + auto decoder = std::make_shared(node, m_cgraph, m_is_static, m_context_size, m_context_size_swa, + m_num_heads, m_num_heads_kv, m_head_size, m_swa_layers); node_visitor(decoder); } } diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index 884151d32e..fe30bde445 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -16,14 +16,12 @@ public: // Graph decoder GgmlOvDecoder(ggml_cgraph * cgraph, std::map> & model_weights, - bool is_static, - bool is_first_token); + bool is_static); // Node decoder, called in GgmlOvDecoder::visit_subgraph GgmlOvDecoder(ggml_tensor * node, ggml_cgraph * cgraph, bool is_static, - bool is_first_token, int context_size, int context_size_swa, int num_heads, @@ -81,9 +79,9 @@ public: virtual void visit_subgraph(std::function)> node_visitor) const override; - const ggml_tensor * get_input_ggml_tensor(const std::string & name) const { return m_inputs.at(name); } + ggml_tensor * get_input_ggml_tensor(const std::string & name) const { return m_inputs.at(name); } - const ggml_tensor * get_output_ggml_tensor(const std::string & name) const { return m_outputs.at(name); } + ggml_tensor * get_output_ggml_tensor(const std::string & name) const { return m_outputs.at(name); } virtual int get_op_case() const override { return m_op_case; } @@ -119,14 +117,16 @@ public: virtual int get_head_size() const override { return m_head_size; } + int get_past_kv_len() const { return m_past_kv_len; } + + int get_input_len() const { return m_input_len; } + virtual int32_t * get_rope_params() const override { return m_rope_params; } virtual std::map get_kv_param_res_names() const override; virtual bool is_static() const override { return m_is_static; } - virtual bool is_first_token() const override { return m_is_first_token; } - ov::PartialShape get_graph_input_shape(const ggml_tensor * src) const; static void dump_cgraph(const ggml_cgraph * cgraph, std::string & filename); @@ -153,6 +153,7 @@ private: // set context_size, num_heads, etc void set_llm_params(); + void validate_cgraph() const; ggml_cgraph * m_cgraph = nullptr; ggml_tensor * m_node = nullptr; @@ -176,10 +177,11 @@ private: int m_num_heads; int m_num_heads_kv; int m_head_size; + int m_past_kv_len; + int m_input_len; int32_t * m_rope_params; std::vector m_kv_names; bool m_is_static = false; - bool m_is_first_token; }; void print_tensor_address_map(const ggml_cgraph * cgraph); diff --git a/ggml/src/ggml-openvino/openvino/decoder.hpp b/ggml/src/ggml-openvino/openvino/decoder.hpp index 6f11ff1283..a3cb995a3c 100644 --- a/ggml/src/ggml-openvino/openvino/decoder.hpp +++ b/ggml/src/ggml-openvino/openvino/decoder.hpp @@ -65,7 +65,6 @@ public: virtual std::map get_kv_param_res_names() const = 0; virtual bool is_static() const = 0; - virtual bool is_first_token() const = 0; virtual int get_context_size() const = 0; virtual int get_context_size_swa() const = 0; virtual int is_swa_layer(int layer) const = 0; diff --git 
a/ggml/src/ggml-openvino/openvino/node_context.hpp b/ggml/src/ggml-openvino/openvino/node_context.hpp index a64ae098ab..0d76dc83e0 100644 --- a/ggml/src/ggml-openvino/openvino/node_context.hpp +++ b/ggml/src/ggml-openvino/openvino/node_context.hpp @@ -97,12 +97,7 @@ public: int get_op_case() const { return m_decoder->get_op_case(); } - bool is_static() const { - return m_decoder->is_static(); - } - bool is_first_token() const { - return m_decoder->is_first_token(); - } + bool is_static() const { return m_decoder->is_static(); } private: std::shared_ptr m_decoder; diff --git a/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp b/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp index 029023637a..de2af85aa8 100644 --- a/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp +++ b/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp @@ -2,9 +2,11 @@ #include "../op_table.hpp" #include "../utils.hpp" +#include #include #include #include +#include #include #include #include @@ -51,43 +53,25 @@ OutputVector translate_flash_attn_ext(const NodeContext & context) { auto stop = std::make_shared(ov::OutputVector{token_len, kv_len}, 0); mask_sliced = std::make_shared(mask, zero_2d, stop, one_2d, axes); - mask_sliced = std::make_shared(mask_sliced, zero_1d); } if (mask_sliced.get_element_type() != ov::element::f16) { mask_sliced = std::make_shared(mask_sliced, ov::element::f16); } - auto tile_kv = [](int64_t q_batch, int64_t kv_batch, ov::Output kv, bool is_static) { - int64_t factor = q_batch / kv_batch; + auto tile_kv = [&](int64_t num_heads, int64_t num_heads_kv, int64_t head_size, ov::Output kv) { + int64_t factor = num_heads / num_heads_kv; if (factor > 1) { - auto q_batch_node = ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector{q_batch}); - auto kv_batch_node = ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector{kv_batch}); - auto factor_node = ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector{factor}); - ov::Output kv_broadcast_shape, kv_unsqueezed, new_kv_shape; - if (is_static) { - auto unsqueeze_axes = ov::op::v0::Constant::create(ov::element::i64, Shape{}, {1}); - kv_unsqueezed = std::make_shared(kv, unsqueeze_axes); + auto unsqueeze_axes = ov::op::v0::Constant::create(ov::element::i64, Shape{}, {1}); + kv_unsqueezed = std::make_shared(kv, unsqueeze_axes); - auto kv_last_two_dims = get_dimensions(kv.get_node_shared_ptr(), {1, 2}); - kv_broadcast_shape = std::make_shared( - ov::OutputVector{kv_batch_node, factor_node, kv_last_two_dims}, 0); - new_kv_shape = - std::make_shared(ov::OutputVector{q_batch_node, kv_last_two_dims}, 0); - } else { - auto one_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); - auto unsqueeze_axes = ov::op::v0::Constant::create(ov::element::i64, Shape{}, {2}); - kv_unsqueezed = std::make_shared(kv, unsqueeze_axes); + kv_broadcast_shape = + ov::op::v0::Constant::create(ov::element::i64, {4}, {num_heads_kv, factor, (int64_t) 1, head_size}); + new_kv_shape = ov::op::v0::Constant::create(ov::element::i64, {3}, {num_heads, (int64_t) -1, head_size}); - auto kv_last_two_dims = get_dimensions(kv.get_node_shared_ptr(), {2, 3}); - kv_broadcast_shape = std::make_shared( - ov::OutputVector{one_1d, kv_batch_node, factor_node, kv_last_two_dims}, 0); - new_kv_shape = - std::make_shared(ov::OutputVector{one_1d, q_batch_node, kv_last_two_dims}, 0); - } - - kv = std::make_shared(kv_unsqueezed, kv_broadcast_shape); + kv = std::make_shared(kv_unsqueezed, kv_broadcast_shape, + ov::op::BroadcastType::BIDIRECTIONAL); kv = 
std::make_shared(kv, new_kv_shape, false); } return kv; @@ -95,18 +79,12 @@ OutputVector translate_flash_attn_ext(const NodeContext & context) { auto q_shape = context.get_input_shape(0).to_shape(); auto k_shape = context.get_input_shape(1).to_shape(); - k = tile_kv(q_shape[0], k_shape[0], k, context.is_static()); - v = tile_kv(q_shape[0], k_shape[0], v, context.is_static()); + k = tile_kv(q_shape[0], k_shape[0], q_shape[2], k); + v = tile_kv(q_shape[0], k_shape[0], q_shape[2], v); auto sdpa = std::make_shared(q, k, v, mask_sliced, scale_node, false); - auto sdpa_f32 = std::make_shared(sdpa, ov::element::f32); - if (context.is_static()) { - res = std::make_shared(sdpa_f32, - ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 0, 2})); - } else { - res = std::make_shared( - sdpa_f32, ov::op::v0::Constant::create(ov::element::i64, {4}, {0, 2, 1, 3})); - } + res = std::make_shared(sdpa, ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 0, 2})); + res = std::make_shared(res, ov::element::f32); return rename_outputs_with_suffix({res}, context.get_name()); } diff --git a/ggml/src/ggml-openvino/openvino/op/permute.cpp b/ggml/src/ggml-openvino/openvino/op/permute.cpp index 128ffb2933..cf651a084b 100644 --- a/ggml/src/ggml-openvino/openvino/op/permute.cpp +++ b/ggml/src/ggml-openvino/openvino/op/permute.cpp @@ -26,40 +26,8 @@ OutputVector translate_permute(const NodeContext & context) { ov::Output res; auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); - if (op_case == 1) { - if (context.is_static()) { - res = std::make_shared( - context.get_input(0), ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 0, 2})); - } else { - auto src = context.get_input(0); - if (src.get_partial_shape().rank() == 3) { - src = std::make_shared(src, zero); - } - res = std::make_shared( - src, ov::op::v0::Constant::create(ov::element::i64, {4}, {0, 2, 1, 3})); - } - } else { - auto src = context.get_input(0); - auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); - - if (context.is_static()) { - auto src_shape_ = context.get_input_shape(0).to_shape(); - std::vector src_shape(src_shape_.begin(), src_shape_.end()); - auto src_reshaped = std::make_shared( - src, - ov::op::v0::Constant::create(ov::element::i64, {3}, - std::vector{-1, src_shape[1], src_shape[2]}), - false); - res = std::make_shared( - src_reshaped, ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 0, 2})); - } else { - if (src.get_partial_shape().rank() == 3) { - src = std::make_shared(src, zero); - } - res = std::make_shared( - src, ov::op::v0::Constant::create(ov::element::i64, {4}, {0, 2, 1, 3})); - } - } + auto src = context.get_input(0); + res = std::make_shared(src, ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 0, 2})); return rename_outputs_with_suffix({res}, context.get_name()); } diff --git a/ggml/src/ggml-openvino/openvino/op/rope.cpp b/ggml/src/ggml-openvino/openvino/op/rope.cpp index 362ccce17f..9ad2e25284 100644 --- a/ggml/src/ggml-openvino/openvino/op/rope.cpp +++ b/ggml/src/ggml-openvino/openvino/op/rope.cpp @@ -84,10 +84,6 @@ OutputVector translate_rope(const NodeContext & context) { ov::op::v0::Constant::create(ov::element::i64, {1}, {3})); auto stack = std::make_shared(OutputVector{first_half, second_half}, 3); res = std::make_shared(stack, std::make_shared(data_node), false); - if (!(context.is_static())) { - res = - std::make_shared(res, ov::op::v0::Constant::create(ov::element::i64, {1}, {0})); - } } else if (mode == ROPE_TYPE_NEOX) { auto data_split = std::make_shared( 
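The unified broadcast above is standard grouped-query-attention head repetition. A plain-C++ reference of the same unsqueeze/broadcast/reshape chain, assuming row-major [heads, tokens, head_size] buffers:

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <vector>

// [num_heads_kv, T, D] -> unsqueeze -> [num_heads_kv, 1, T, D]
//                      -> broadcast -> [num_heads_kv, factor, T, D]
//                      -> reshape   -> [num_heads, T, D]
// so KV head h feeds query heads h*factor .. h*factor + factor - 1.
std::vector<float> tile_kv_heads(const std::vector<float> & kv, int64_t num_heads_kv,
                                 int64_t num_heads, int64_t tokens, int64_t head_size) {
    const int64_t factor = num_heads / num_heads_kv;  // e.g. 32 query / 8 KV heads -> 4
    std::vector<float> out(static_cast<std::size_t>(num_heads * tokens * head_size));
    for (int64_t h = 0; h < num_heads_kv; ++h) {
        const float * src = kv.data() + h * tokens * head_size;
        for (int64_t f = 0; f < factor; ++f) {
            float * dst = out.data() + (h * factor + f) * tokens * head_size;
            std::copy(src, src + tokens * head_size, dst);
        }
    }
    return out;
}
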
data_node, ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {2}), 2); diff --git a/ggml/src/ggml-openvino/openvino/op/set_rows.cpp b/ggml/src/ggml-openvino/openvino/op/set_rows.cpp index 643ba7bffa..8d0277ce86 100644 --- a/ggml/src/ggml-openvino/openvino/op/set_rows.cpp +++ b/ggml/src/ggml-openvino/openvino/op/set_rows.cpp @@ -33,10 +33,6 @@ OutputVector translate_set_rows(const NodeContext & context) { auto dst_shape = context.get_output_shape(0).to_shape(); FRONT_END_OP_CONVERSION_CHECK(dst_shape[0] == 1, "Unsupported shape in SET_ROWS"); - if (context.is_static() && context.is_first_token()) { - return rename_outputs_with_suffix({data}, context.get_name()); - } - auto indices = context.get_input(1); auto dst = context.get_input(context.get_output_name()); @@ -54,13 +50,11 @@ OutputVector translate_set_rows(const NodeContext & context) { auto updated = std::make_shared(dst_reshaped, indices_reshaped, data_reshaped, zero); res = std::make_shared(updated, std::make_shared(dst), false); } else { - assert(dst.get_partial_shape().rank() == 4 && dst.get_partial_shape()[2].is_static() && - dst.get_partial_shape()[3].is_static()); + int64_t dim1 = dst.get_partial_shape()[1].get_length(); int64_t dim2 = dst.get_partial_shape()[2].get_length(); - int64_t dim3 = dst.get_partial_shape()[3].get_length(); data = std::make_shared( - data, ov::op::v0::Constant::create(ov::element::i64, {4}, {(int64_t) 1, (int64_t) -1, dim2, dim3}), false); - res = std::make_shared(OutputVector{dst, data}, 1); + data, ov::op::v0::Constant::create(ov::element::i64, {3}, {(int64_t) -1, dim1, dim2}), false); + res = std::make_shared(OutputVector{dst, data}, 0); } return rename_outputs_with_suffix({res}, context.get_name()); } diff --git a/ggml/src/ggml-openvino/openvino/translate_session.cpp b/ggml/src/ggml-openvino/openvino/translate_session.cpp index 67c5b4a51b..def1f39460 100644 --- a/ggml/src/ggml-openvino/openvino/translate_session.cpp +++ b/ggml/src/ggml-openvino/openvino/translate_session.cpp @@ -27,7 +27,6 @@ #include #include #include -#include #include #include @@ -112,7 +111,6 @@ void add_sliced_mask(TensorMap & tensor_map, GgmlDecoder & ggml_model_decoder) { auto stop = std::make_shared(ov::OutputVector{token_len, kv_len}, 0); mask_sliced = std::make_shared(mask, zero_2d, stop, one_2d, axes); - mask_sliced = std::make_shared(mask_sliced, zero_1d); mask_sliced = std::make_shared(mask_sliced, ov::element::f16); mask_sliced->set_friendly_name(sliced_name); } @@ -243,11 +241,11 @@ std::shared_ptr TranslateSession::apply_transformations(std::shared_ptr(); - if (!ggml_model_decoder->is_static()) { - const auto kv_param_res_names = ggml_model_decoder->get_kv_param_res_names(); - const auto kv_param_res_pairs = get_kv_param_res_pairs(model, kv_param_res_names); - manager.register_pass(kv_param_res_pairs); - } + // if (!ggml_model_decoder->is_static()) { + // const auto kv_param_res_names = ggml_model_decoder->get_kv_param_res_names(); + // const auto kv_param_res_pairs = get_kv_param_res_pairs(model, kv_param_res_names); + // manager.register_pass(kv_param_res_pairs); + // } // if (ggml_model_decoder->is_static()) { manager.register_pass(); diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index eb9ea9fee9..50e3ef20bc 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -12,12 +12,14 @@ #include #include #include +#include #include #include #include #include #include #include +#include #include #include #include @@ -26,60 +28,29 @@ #include 
#include #include +#include #include #include -ov::Tensor convert_ggml_input_to_ov(std::shared_ptr ggml_decoder, const std::string & name) { - const auto * ggml_tensor = ggml_decoder->get_input_ggml_tensor(name); - auto * input_data = ggml_tensor->data; - ov::Shape input_shape; - if (name.find("cache_k") == 0 || name.find("cache_v") == 0) { - input_shape = ggml_decoder->get_graph_input_shape(ggml_tensor).to_shape(); - } else if (ggml_tensor->op == GGML_OP_VIEW) { - // This case is added to make test-backend-ops work - input_shape = ggml_decoder->get_graph_input_shape(ggml_tensor->view_src).to_shape(); - } else { - input_shape = ggml_decoder->get_input_shape(name).to_shape(); - } - auto input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), input_shape, input_data); - return input_tensor; -} - -std::map get_ggml_graph_output_dst(std::shared_ptr ggml_decoder) { - std::map output_tensors; - - auto output_names = ggml_decoder->get_model_output_names(); - for (size_t inp = 0; inp < output_names.size(); ++inp) { - auto name = output_names[inp]; - const auto * tensor = ggml_decoder->get_output_ggml_tensor(name); - auto * output_data = tensor->view_src ? tensor->view_src->data : tensor->data; - output_tensors[name] = output_data; - } - return output_tensors; -} - -static ov::frontend::FrontEnd::Ptr get_ggml_frontend() { - auto fem = ov::frontend::FrontEndManager(); - auto front_end = fem.load_by_framework("ggml"); - return front_end; -} +// Suppress deprecation warning for ov::Tensor::data() +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" enum ggml_status openvino_frontend_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { static ov::Core core; - static std::string device = getenv("GGML_OPENVINO_DEVICE") ? getenv("GGML_OPENVINO_DEVICE") : ""; - if (device.empty()) { - const std::vector preferred_device = {"GPU", "CPU", "NPU"}; - const auto available_devices = core.get_available_devices(); - for (const auto & dev : preferred_device) { - if (std::find(available_devices.begin(), available_devices.end(), dev) != available_devices.end()) { - device = dev; - break; - } + auto get_device = [&] { + std::string device = getenv("GGML_OPENVINO_DEVICE") ? getenv("GGML_OPENVINO_DEVICE") : "CPU"; + auto available_devices = core.get_available_devices(); + if (std::find(available_devices.begin(), available_devices.end(), device) == available_devices.end()) { + GGML_LOG_WARN("GGML OpenVINO Backend: device %s is not available, fallback to CPU\n", device.c_str()); + device = "CPU"; } - } - + return device; + }; + static std::string device = get_device(); bool is_static = device == "NPU" ? 
true : false; + ov::AnyMap config; if (getenv("GGML_OPENVINO_DUMP_CGRAPH")) { @@ -102,11 +73,9 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, ggml_cgraph * static std::unordered_map> infer_request_cache; static std::unordered_map> ov_input_names_cache; static std::unordered_map> ov_output_names_cache; - // For NPU, store the kvcache model, since we cannot create two infer_request - static std::unordered_map compiled_model_cache; std::shared_ptr ggml_decoder; - ov::InferRequest infer_request; + std::shared_ptr infer_request; int64_t decoder_end_time; int64_t conversion_end_time; @@ -118,83 +87,36 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, ggml_cgraph * auto it = infer_request_cache.find(cgraph); if (it != infer_request_cache.end()) { std::map> model_weights; - ggml_decoder = std::make_shared(cgraph, model_weights, is_static, false); + ggml_decoder = std::make_shared(cgraph, model_weights, is_static); decoder_end_time = ggml_time_us(); - // For NPU for the first time we call kvcache modle, pop the compiled kvcache model from cache - if (is_static && compiled_model_cache.find(cgraph) != compiled_model_cache.end()) { - infer_request_cache[cgraph] = - std::make_shared(compiled_model_cache[cgraph].create_infer_request()); - compiled_model_cache.erase(cgraph); - } - infer_request = *infer_request_cache[cgraph]; - + infer_request = infer_request_cache[cgraph]; conversion_end_time = ggml_time_us(); compile_end_time = conversion_end_time; } else { std::shared_ptr model; auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph, get_types_to_requant(device)); - if (is_static) { - ggml_decoder = std::make_shared(cgraph, model_weights, is_static, true); - auto ggml_decoder_kvcache = std::make_shared(cgraph, model_weights, is_static, false); - decoder_end_time = ggml_time_us(); + ggml_decoder = std::make_shared(cgraph, model_weights, is_static); + decoder_end_time = ggml_time_us(); - auto input_model = std::make_shared(ggml_decoder); - auto input_model_kvcache = std::make_shared(ggml_decoder_kvcache); + auto input_model = std::make_shared(ggml_decoder); + model = ov::frontend::ggml::FrontEnd::convert(input_model); + ggml_decoder->clear_model_weights(); + conversion_end_time = ggml_time_us(); - model = ov::frontend::ggml::FrontEnd::convert(input_model); - ggml_decoder->clear_model_weights(); - auto model_kvcache = ov::frontend::ggml::FrontEnd::convert(input_model_kvcache); - ggml_decoder_kvcache->clear_model_weights(); - conversion_end_time = ggml_time_us(); - - if (getenv("GGML_OPENVINO_DUMP_IR")) { - char timestamped_filename[64]; - auto timestamp = (long long) ggml_time_us(); - snprintf(timestamped_filename, sizeof(timestamped_filename), "model_prefill_%lld.xml", timestamp); - ov::serialize(model, timestamped_filename); - snprintf(timestamped_filename, sizeof(timestamped_filename), "model_kvcache_%lld.xml", timestamp); - ov::serialize(model_kvcache, timestamped_filename); - } - - auto compiled_model = core.compile_model(model, device, get_npu_prefill_config()); - auto compiled_model_kvcache = core.compile_model(model_kvcache, device, get_npu_generate_config()); - compiled_model_cache[cgraph] = compiled_model_kvcache; - compile_end_time = ggml_time_us(); - - infer_request_cache[cgraph] = std::make_shared(compiled_model.create_infer_request()); - infer_request = *infer_request_cache[cgraph]; - compiled_model_cache[cgraph] = compiled_model_kvcache; - } else { - ggml_decoder = std::make_shared(cgraph, model_weights, is_static, true); - 
decoder_end_time = ggml_time_us(); - - auto input_model = std::make_shared(ggml_decoder); - model = ov::frontend::ggml::FrontEnd::convert(input_model); - ggml_decoder->clear_model_weights(); - conversion_end_time = ggml_time_us(); - - if (getenv("GGML_OPENVINO_DUMP_IR")) { - char timestamped_filename[64]; - auto timestamp = (long long) ggml_time_us(); - snprintf(timestamped_filename, sizeof(timestamped_filename), "model_%lld.xml", timestamp); - ov::serialize(model, timestamped_filename); - } - - auto * disable_sdpa_optimization = getenv("GGML_OPENVINO_DISABLE_SDPA_OPTIMIZATION"); - if (disable_sdpa_optimization && std::string(disable_sdpa_optimization) != "0") { - config = { - {"GPU_ENABLE_SDPA_OPTIMIZATION", "0"} - }; - } - - auto compiled_model = core.compile_model(model, device, config); - compile_end_time = ggml_time_us(); - infer_request_cache[cgraph] = std::make_shared(compiled_model.create_infer_request()); - infer_request = *infer_request_cache[cgraph]; + if (getenv("GGML_OPENVINO_DUMP_IR")) { + char timestamped_filename[64]; + auto timestamp = (long long) ggml_time_us(); + snprintf(timestamped_filename, sizeof(timestamped_filename), "model_%lld.xml", timestamp); + ov::serialize(model, timestamped_filename); } + auto compiled_model = core.compile_model(model, device, get_ov_compile_config(device)); + compile_end_time = ggml_time_us(); + infer_request_cache[cgraph] = std::make_shared(compiled_model.create_infer_request()); + infer_request = infer_request_cache[cgraph]; + std::vector ov_input_names; std::vector ov_output_names; for (const auto & ov_param : model->get_parameters()) { @@ -210,72 +132,66 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, ggml_cgraph * auto ov_input_names = ov_input_names_cache[cgraph]; auto ov_output_names = ov_output_names_cache[cgraph]; + for (size_t i = 0; i < ov_input_names.size(); i++) { auto param_name = ov_input_names[i]; auto input_tensor = get_ov_input_tensor(ggml_decoder, param_name); - infer_request.set_input_tensor(i, input_tensor); + infer_request->set_input_tensor(i, input_tensor); if (getenv("GGML_OPENVINO_DEBUG_INPUT")) { print_input_tensor_info(param_name, input_tensor); } } + + for (size_t i = 0; i < ov_output_names.size(); i++) { + auto output_tensor = get_ov_output_tensor(ggml_decoder, ov_output_names[i]); + infer_request->set_output_tensor(i, output_tensor); + } + auto input_end_time = ggml_time_us(); - infer_request.infer(); + infer_request->infer(); + auto infer_end_time = ggml_time_us(); - auto gguf_tensor_addrs = get_ggml_graph_output_dst(ggml_decoder); for (size_t i = 0; i < ov_output_names.size(); i++) { - auto & result_name = ov_output_names[i]; - const auto output_tensor = infer_request.get_output_tensor(i); - - std::memcpy(gguf_tensor_addrs[result_name], output_tensor.data(), output_tensor.get_byte_size()); - + const auto output_tensor = infer_request->get_output_tensor(i); if (getenv("GGML_OPENVINO_DEBUG_OUTPUT")) { - print_output_tensor_info(result_name, output_tensor, gguf_tensor_addrs); + print_output_tensor_info(ov_output_names[i], output_tensor, output_tensor.data()); } } - auto end_time = ggml_time_us(); if (getenv("GGML_OPENVINO_PROFILING")) { - GGML_LOG_INFO("GGML OpenVINO Backend: \n"); + GGML_LOG_INFO("\nGGML OpenVINO Backend: \n"); GGML_LOG_INFO(" - Graph decoder Time: %ld ms \n", (decoder_end_time - start_time) / 1000); GGML_LOG_INFO(" - Graph conversion Time: %ld ms \n", (conversion_end_time - decoder_end_time) / 1000); GGML_LOG_INFO(" - Graph compile Time: %ld ms \n", (compile_end_time - 
conversion_end_time) / 1000); GGML_LOG_INFO(" - Graph Input Time: %ld ms \n", (input_end_time - compile_end_time) / 1000); GGML_LOG_INFO(" - Graph Inference Time: %ld ms \n", (infer_end_time - input_end_time) / 1000); - GGML_LOG_INFO(" - Graph Output Time: %ld ms \n", (end_time - infer_end_time) / 1000); } return GGML_STATUS_SUCCESS; GGML_UNUSED(backend); } -namespace { -ov::AnyMap get_npu_base_config() { - return { - {"NPU_COMPILATION_MODE_PARAMS", "compute-layers-with-higher-precision=Sqrt,Power,ReduceMean,Add_RMSNorm" }, - {"NPU_COMPILER_DYNAMIC_QUANTIZATION", "YES" }, - {"NPU_USE_NPUW", "YES" }, - {"NPUW_DEVICES", "NPU" }, - {"NPUW_FOLD", "YES" }, - {"NPUW_WEIGHTS_BANK", "shared" }, - {"NPUW_FUNCALL_FOR_ALL", "YES" }, - {"NPUW_FUNCALL_ASYNC", "YES" }, - {"NPUW_DQ", "YES" }, - {"NPUW_DQ_FULL", "NO" }, - {"NPUW_CACHE_DIR", getenv("GGML_OPENVINO_CACHE_DIR") ? getenv("GGML_OPENVINO_CACHE_DIR") : ""}, - }; -} -} // namespace - -ov::AnyMap get_npu_prefill_config() { - auto config = get_npu_base_config(); - return config; -} - -ov::AnyMap get_npu_generate_config() { - auto config = get_npu_base_config(); +ov::AnyMap get_ov_compile_config(const std::string & device) { + ov::AnyMap config; + if (device == "NPU") { + config = { + {"NPU_COMPILER_DYNAMIC_QUANTIZATION", "YES" }, + {"NPU_USE_NPUW", "YES" }, + {"NPUW_DEVICES", "NPU" }, + {"NPUW_FOLD", "YES" }, + {"NPUW_WEIGHTS_BANK", "shared"}, + {"NPUW_FUNCALL_FOR_ALL", "YES" }, + {"NPUW_FUNCALL_ASYNC", "YES" }, + {"NPUW_DQ", "YES" }, + {"NPUW_DQ_FULL", "NO" }, + }; + if (auto * cache_dir = getenv("GGML_OPENVINO_CACHE_DIR"); cache_dir) { + config["NPUW_CACHE_DIR"] = cache_dir; + } + } return config; } @@ -291,7 +207,7 @@ std::map get_types_to_requant(const std::string & dev } if (device == "GPU") { return { - // gs16 is WIP + // gs16 will be supported on openvino-2025.4 {GGML_TYPE_Q6_K, ExtraQuantType::Q8_0_32}, }; } @@ -331,70 +247,91 @@ enum ggml_status naive_compute(ggml_cgraph * cgraph, infer_request.set_input_tensor(i, input_tensor); } - infer_request.infer(); - - auto gguf_tensor_addrs = get_ggml_graph_output_dst(decoder); auto ov_results = model->get_results(); for (size_t i = 0; i < ov_results.size(); i++) { auto result_name = ov_results[i]->get_friendly_name(); - const auto output_tensor = infer_request.get_output_tensor(i); - - std::memcpy(gguf_tensor_addrs[result_name], output_tensor.data(), output_tensor.get_byte_size()); + auto output_tensor = get_ov_output_tensor(decoder, result_name); + infer_request.set_output_tensor(i, output_tensor); } + + infer_request.infer(); return GGML_STATUS_SUCCESS; } +namespace { +ov::Tensor convert_ggml_input_to_ov(std::shared_ptr ggml_decoder, const std::string & name) { + const auto * ggml_tensor = ggml_decoder->get_input_ggml_tensor(name); + auto * input_data = ggml_tensor->data; + ov::Shape input_shape; + if (ggml_tensor->op == GGML_OP_VIEW) { + // This case is added to make test-backend-ops work + input_shape = ggml_decoder->get_graph_input_shape(ggml_tensor->view_src).to_shape(); + } else { + input_shape = ggml_decoder->get_input_shape(name).to_shape(); + } + auto input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), input_shape, input_data); + return input_tensor; +} +} // namespace + ov::Tensor get_ov_input_tensor(std::shared_ptr ggml_decoder, const std::string & param_name) { bool is_static = ggml_decoder->is_static(); - bool is_first_token = ggml_decoder->is_first_token(); ov::Tensor input_tensor; if (ggml_decoder->get_model_extra_inputs().find(param_name) != 
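A minimal usage sketch of the compile path above, assuming the get_ov_compile_config helper introduced by this patch; the surrounding code caches the resulting request per cgraph so compilation happens once per graph shape.

#include <memory>
#include <string>
#include <openvino/openvino.hpp>

ov::AnyMap get_ov_compile_config(const std::string & device);  // from this patch

// Compile with the per-device config and hand back a reusable request.
ov::InferRequest make_request(ov::Core & core, const std::shared_ptr<ov::Model> & model,
                              const std::string & device) {
    auto compiled = core.compile_model(model, device, get_ov_compile_config(device));
    return compiled.create_infer_request();
}
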
ggml_decoder->get_model_extra_inputs().end()) { input_tensor = *ggml_decoder->get_model_extra_input_values().at(param_name); - } else if (!is_static) { + } else if (param_name.find("cache_k") == 0 || param_name.find("cache_v") == 0) { + void * input_data = ggml_decoder->get_input_ggml_tensor(param_name)->data; + size_t past_kv_len = + ggml_decoder->is_static() ? ggml_decoder->get_context_size() : ggml_decoder->get_past_kv_len(); + ov::Shape input_shape = {past_kv_len, (size_t) ggml_decoder->get_num_heads_kv(), + (size_t) ggml_decoder->get_head_size()}; + input_tensor = ov::Tensor(ggml_decoder->get_input_type(param_name), input_shape, input_data); + + } else if (is_static && param_name.find("KQ_mask") == 0) { + size_t context_size = ggml_decoder->get_context_size(); + const auto * input_tensor_ggml = ggml_decoder->get_input_ggml_tensor(param_name); + std::vector padded_data = pad_input(input_tensor_ggml, 1, context_size, -INFINITY); + input_tensor = ov::Tensor(ov::element::f32, ov::Shape{1, 1, context_size}); + auto * data_ptr = input_tensor.data(); + std::copy(padded_data.begin(), padded_data.end(), data_ptr); + + } else if (is_static && param_name.find("inp_out_ids") == 0) { input_tensor = convert_ggml_input_to_ov(ggml_decoder, param_name); + if (input_tensor.get_size() == 0) { + input_tensor = ov::Tensor(input_tensor.get_element_type(), ov::Shape{1, 1, 1}); + *input_tensor.data() = 0; + } } else { - if (param_name == "inp_tokens" || param_name == "inp_pos") { - if (is_first_token) { - size_t context_size = ggml_decoder->get_context_size(); - const auto * input_tensor_ggml = ggml_decoder->get_input_ggml_tensor(param_name); - std::vector padded_data = pad_input(input_tensor_ggml, 1, context_size, 0); - input_tensor = ov::Tensor(ov::element::i32, ov::Shape{1, 1, context_size}); - auto * data_ptr = input_tensor.data(); - std::copy(padded_data.begin(), padded_data.end(), data_ptr); - } else { - input_tensor = convert_ggml_input_to_ov(ggml_decoder, param_name); - } - - } else if (param_name.find("KQ_mask") == 0) { - size_t context_size = ggml_decoder->get_context_size(); - const auto * input_tensor_ggml = ggml_decoder->get_input_ggml_tensor(param_name); - if (is_first_token) { - std::vector padded_data = - pad_input(input_tensor_ggml, context_size, context_size, -INFINITY); - set_zero_diagonal(padded_data, context_size); - input_tensor = ov::Tensor(ov::element::f32, ov::Shape{1, context_size, context_size}); - auto * data_ptr = input_tensor.data(); - std::copy(padded_data.begin(), padded_data.end(), data_ptr); - } else { - std::vector padded_data = pad_input(input_tensor_ggml, 1, context_size, -INFINITY); - input_tensor = ov::Tensor(ov::element::f32, ov::Shape{1, 1, context_size}); - auto * data_ptr = input_tensor.data(); - std::copy(padded_data.begin(), padded_data.end(), data_ptr); - } - - } else if (const auto * op = ggml_decoder->get_tensor_used_op(ggml_decoder->get_tensor_from_name(param_name)); - op && op->op == GGML_OP_SET_ROWS && is_static && is_first_token) { - input_tensor = ov::Tensor(ov::element::i64, ov::Shape{1, 1, 1}); - } else { - input_tensor = convert_ggml_input_to_ov(ggml_decoder, param_name); - } + input_tensor = convert_ggml_input_to_ov(ggml_decoder, param_name); } return input_tensor; } +ov::Tensor get_ov_output_tensor(std::shared_ptr ggml_decoder, const std::string & result_name) { + auto * ggml_tensor = ggml_decoder->get_output_ggml_tensor(result_name); + auto output_type = ggml_decoder->get_output_type(result_name); + ov::Shape output_shape; + if 
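// [Editor's note] Sketch of the static-shape mask handling above: on NPU the
// 1 x n_kv mask row is padded out to the fixed context size with -INF so the
// compiled graph always sees one shape. Assumes a contiguous f32 source row;
// pad_mask_row is an illustrative name for what pad_input + copy achieve.
#include <cmath>
#include <cstring>
#include <openvino/openvino.hpp>

ov::Tensor pad_mask_row(const float * src, size_t n_kv, size_t context_size) {
    ov::Tensor t(ov::element::f32, ov::Shape{1, 1, context_size});
    float * dst = t.data<float>();
    std::memcpy(dst, src, n_kv * sizeof(float));  // real attention scores
    for (size_t i = n_kv; i < context_size; ++i) {
        dst[i] = -INFINITY;                       // masked padding tail
    }
    return t;
}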
(result_name.find("cache") == std::string::npos) { + output_shape = ggml_decoder->get_output_shape(result_name).to_shape(); + if (ggml_decoder->is_static() && result_name == "result_output") { + output_shape[1] = 1; + } + } else { + size_t total_token_len = ggml_decoder->get_past_kv_len() + ggml_decoder->get_input_len(); + size_t num_heads_kv = ggml_decoder->get_num_heads_kv(); + size_t head_size = ggml_decoder->get_head_size(); + if (ggml_decoder->is_static()) { + total_token_len = ggml_decoder->get_context_size(); + } + output_shape = ov::Shape{total_token_len, num_heads_kv, head_size}; + } + ov::Tensor output_tensor(output_type, output_shape, ggml_tensor->data); + return output_tensor; +} + size_t checksum(const void * data, size_t size) { const uint8_t * bytes = static_cast(data); size_t sum = 0; @@ -405,10 +342,6 @@ size_t checksum(const void * data, size_t size) { return sum; } -// Suppress deprecation warning for ov::Tensor::data() -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wdeprecated-declarations" - void print_input_tensor_info(const std::string & name, const ov::Tensor & tensor) { std::cout << "Input name: " << name << ", Input shape: " << tensor.get_shape() << ", Address: " << tensor.data() << std::endl; @@ -433,11 +366,9 @@ void print_input_tensor_info(const std::string & name, const ov::Tensor & tensor } } -void print_output_tensor_info(const std::string & name, - const ov::Tensor & tensor, - std::map & output_dst) { - std::cout << "Output name: " << name << ", Output shape: " << tensor.get_shape() - << ", Address: " << output_dst[name] << std::endl; +void print_output_tensor_info(const std::string & name, const ov::Tensor & tensor, void * output_dst) { + std::cout << "Output name: " << name << ", Output shape: " << tensor.get_shape() << ", Address: " << output_dst + << std::endl; auto print_float_stats = [](const std::string & type_name, size_t size, auto get_value) { if (size == 0) { @@ -485,15 +416,13 @@ void print_output_tensor_info(const std::string & name, } } -#pragma GCC diagnostic pop - void set_zero_diagonal(std::vector & matrix, size_t dim) { for (size_t i = 0; i < dim; ++i) { matrix[i * dim + i] = 0.0f; } } -bool is_prefill(ggml_cgraph * cgraph) { +const ggml_tensor * get_inp_pos_tensor(ggml_cgraph * cgraph) { for (int i = 0; i < cgraph->n_nodes; ++i) { auto * op = cgraph->nodes[i]; for (int j = 0; j < GGML_MAX_SRC; ++j) { @@ -501,11 +430,17 @@ bool is_prefill(ggml_cgraph * cgraph) { if (src == nullptr) { break; } - if (std::string(src->name) == "inp_tokens") { - return src->ne[0] != 1; + if (std::string(src->name) == "inp_pos") { + return src; } } } - GGML_LOG_ERROR("is_prefill: inp_tokens not found in cgraph"); - throw std::runtime_error("is_prefill: inp_tokens not found in cgraph"); + GGML_LOG_ERROR("get_inp_pos_tensor: inp_pos not found in cgraph"); + throw std::runtime_error("get_inp_pos_tensor: inp_pos not found in cgraph"); } + +bool get_is_first_token(const ggml_tensor * inp_pos) { + return *(int32_t *) inp_pos->data == 0; +} + +#pragma GCC diagnostic pop diff --git a/ggml/src/ggml-openvino/utils.h b/ggml/src/ggml-openvino/utils.h index 22f5cc8c34..352f67aa12 100644 --- a/ggml/src/ggml-openvino/utils.h +++ b/ggml/src/ggml-openvino/utils.h @@ -7,19 +7,11 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph); -std::shared_ptr get_ggml_decoder(struct ggml_cgraph * cgraph, bool is_static, bool is_first_token); - -ov::Tensor convert_ggml_input_to_ov(std::shared_ptr ggml_decoder, const std::string & 
name); - -std::map get_ggml_graph_output_dst(std::shared_ptr ggml_decoder); - size_t checksum(const void * data, size_t size); void print_input_tensor_info(const std::string & name, const ov::Tensor & tensor); -void print_output_tensor_info(const std::string & name, - const ov::Tensor & tensor, - std::map & output_dst); +void print_output_tensor_info(const std::string & name, const ov::Tensor & tensor, void * output_dst); template std::vector pad_input(const ggml_tensor * tensor, size_t padded_rows, size_t padded_cols, T pad_value) { @@ -38,15 +30,18 @@ std::vector pad_input(const ggml_tensor * tensor, size_t padded_rows, size_t void set_zero_diagonal(std::vector & matrix, size_t dim); -bool is_prefill(struct ggml_cgraph * cgraph); +const ggml_tensor * get_inp_pos_tensor(struct ggml_cgraph * cgraph); -ov::AnyMap get_npu_prefill_config(); -ov::AnyMap get_npu_generate_config(); +bool get_is_first_token(const ggml_tensor * inp_pos); + +ov::AnyMap get_ov_compile_config(const std::string & device); std::map get_types_to_requant(const std::string & device); ov::Tensor get_ov_input_tensor(std::shared_ptr ggml_decoder, const std::string & param_name); +ov::Tensor get_ov_output_tensor(std::shared_ptr ggml_decoder, const std::string & result_name); + bool is_naive(struct ggml_cgraph * cgraph); enum ggml_status naive_compute(struct ggml_cgraph * cgraph, From 303923aba7158fc40b21bf0f27ee0fe3ab50c29b Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Tue, 21 Oct 2025 13:27:46 +0800 Subject: [PATCH 164/254] Clean placeholders in ggml-openvino.cpp --- ggml/include/ggml-openvino.h | 13 --- ggml/src/ggml-openvino/ggml-openvino.cpp | 118 ++--------------------- 2 files changed, 6 insertions(+), 125 deletions(-) diff --git a/ggml/include/ggml-openvino.h b/ggml/include/ggml-openvino.h index 7b5298e520..b690a16378 100644 --- a/ggml/include/ggml-openvino.h +++ b/ggml/include/ggml-openvino.h @@ -21,20 +21,7 @@ GGML_BACKEND_API bool ggml_backend_is_openvino(ggml_backend_t backend); // device buffer GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_openvino_buffer_type(int device); -// split tensor buffer that splits matrices by rows across multiple devices -GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_openvino_split_buffer_type(const float * tensor_split); - -// pinned host buffer for use with the CPU backend for faster copies between CPU -// and GPU -GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_openvino_host_buffer_type(void); - GGML_BACKEND_API int ggml_backend_openvino_get_device_count(void); -// GGML_BACKEND_API void ggml_backend_openvino_get_device_description(int device, char * description, -// size_t description_size); -// GGML_BACKEND_API void ggml_backend_openvino_get_device_memory(int device, size_t * free, size_t * total); - -// GGML_BACKEND_API bool ggml_backend_openvino_register_host_buffer(void * buffer, size_t size); -// GGML_BACKEND_API void ggml_backend_openvino_unregister_host_buffer(void * buffer); GGML_BACKEND_API ggml_backend_reg_t ggml_backend_openvino_reg(void); diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp index c5acb1ea26..b8630fa42c 100644 --- a/ggml/src/ggml-openvino/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino.cpp @@ -119,43 +119,6 @@ GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_openvino_buffer_type(in GGML_UNUSED(device); } -// split tensor buffer that splits matrices by rows across multiple devices -GGML_BACKEND_API ggml_backend_buffer_type_t 
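// [Editor's note] After the cleanup in this patch the public surface is
// reduced to init / is_openvino / buffer_type / device_count / reg. A short
// usage sketch of the remaining entry points, assuming the standard ggml
// backend API (ggml_backend_free, ggml_backend_graph_compute); error
// handling omitted:
#include "ggml-openvino.h"

void openvino_backend_smoke_test(void) {
    ggml_backend_t backend = ggml_backend_openvino_init(0);  // device 0
    if (backend != nullptr && ggml_backend_is_openvino(backend)) {
        // ... build a cgraph g, then ggml_backend_graph_compute(backend, g) ...
        ggml_backend_free(backend);
    }
}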
ggml_backend_openvino_split_buffer_type(const float * tensor_split) { - GGML_ASSERT(tensor_split != nullptr); - return nullptr; -} - -// pinned host buffer for use with the CPU backend for faster copies between CPU -// and GPU -GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_openvino_host_buffer_type(void) { - return nullptr; -} - -struct ggml_backend_openvino_buffer_type_context { - int device; - std::string name; -}; - -static const char * ggml_backend_openvino_buffer_type_get_name(ggml_backend_buffer_type_t buft) { - ggml_backend_openvino_buffer_type_context * ctx = (ggml_backend_openvino_buffer_type_context *) buft->context; - - return ctx->name.c_str(); -} - -static bool ggml_backend_buft_is_openvino(ggml_backend_buffer_type_t buft) { - return buft->iface.get_name == ggml_backend_openvino_buffer_type_get_name; -} - -static const char * ggml_backend_openvino_split_buffer_type_get_name(ggml_backend_buffer_type_t buft) { - return GGML_OPENVINO_NAME "_Split"; - - GGML_UNUSED(buft); -} - -static bool ggml_backend_buft_is_openvino_split(ggml_backend_buffer_type_t buft) { - return buft->iface.get_name == ggml_backend_openvino_split_buffer_type_get_name; -} - struct ggml_backend_openvino_device_context { int device; std::string name; @@ -172,14 +135,10 @@ static const char * ggml_backend_openvino_device_get_description(ggml_backend_de return ctx->description.c_str(); } -// TODO static void ggml_backend_openvino_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) { GGML_ASSERT(dev->context != nullptr); GGML_ASSERT(free != nullptr); GGML_ASSERT(total != nullptr); - ggml_backend_openvino_device_context * ctx = (ggml_backend_openvino_device_context *) dev->context; - GGML_ASSERT(ctx->device >= 0); - // ggml_openvino_set_device(ctx->device); *total = 1; *free = 1; } @@ -195,18 +154,11 @@ static void ggml_backend_openvino_device_get_props(ggml_backend_dev_t dev, ggml_ props->type = ggml_backend_openvino_device_get_type(dev); ggml_backend_openvino_device_get_memory(dev, &props->memory_free, &props->memory_total); - bool host_buffer = getenv("GGML_OPENVINO_NO_PINNED") == nullptr; -#ifdef GGML_OPENVINO_NO_PEER_COPY - bool events = false; -#else - bool events = true; -#endif - props->caps = { - /* .async = */ true, - /* .host_buffer = */ host_buffer, + /* .async = */ false, + /* .host_buffer = */ false, /* .buffer_from_host_ptr = */ false, - /* .events = */ events, + /* .events = */ false, }; } @@ -221,33 +173,6 @@ static ggml_backend_buffer_type_t ggml_backend_openvino_device_get_buffer_type(g return ggml_backend_openvino_buffer_type(ctx->device); } -static ggml_backend_buffer_type_t ggml_backend_openvino_device_get_host_buffer_type(ggml_backend_dev_t dev) { - GGML_UNUSED(dev); - return ggml_backend_openvino_host_buffer_type(); -} - -static ggml_backend_buffer_t ggml_backend_openvino_device_buffer_from_ptr(ggml_backend_dev_t dev, - void * ptr, - size_t size, - size_t max_tensor_size) { - GGML_UNUSED(dev); - GGML_UNUSED(ptr); - GGML_UNUSED(size); - GGML_UNUSED(max_tensor_size); - return nullptr; -} - -static ggml_backend_buffer_t ggml_backend_openvino_device_buffer_from_host_ptr(ggml_backend_dev_t dev, - void * ptr, - size_t size, - size_t max_tensor_size) { - GGML_UNUSED(dev); - GGML_UNUSED(ptr); - GGML_UNUSED(size); - GGML_UNUSED(max_tensor_size); - return nullptr; -} - static bool is_op_unsupported_case(const ggml_tensor * op) { switch (op->op) { case GGML_OP_SOFT_MAX: { @@ -447,7 +372,7 @@ static const struct ggml_backend_device_i ggml_backend_openvino_device_interface 
/* .init_backend = */ ggml_backend_openvino_device_init, /* .get_buffer_type = */ ggml_backend_openvino_device_get_buffer_type, /* .get_host_buffer_type = */ NULL, - /* .buffer_from_host_ptr = */ ggml_backend_openvino_device_buffer_from_ptr, + /* .buffer_from_host_ptr = */ NULL, /* .supports_op = */ ggml_backend_openvino_device_supports_op, /* .supports_buft = */ ggml_backend_openvino_device_supports_buft, /* .offload_op = */ NULL, @@ -466,44 +391,19 @@ static const char * ggml_backend_openvino_reg_get_name(ggml_backend_reg_t reg) { } static size_t ggml_backend_openvino_reg_get_device_count(ggml_backend_reg_t reg) { - return ggml_openvino_info().device_count; GGML_UNUSED(reg); - - // TODO - ggml_backend_openvino_reg_context * ctx = (ggml_backend_openvino_reg_context *) reg->context; - - return ctx->devices.size(); + return ggml_openvino_info().device_count; } static ggml_backend_dev_t ggml_backend_openvino_reg_get_device(ggml_backend_reg_t reg, size_t index) { ggml_backend_openvino_reg_context * ctx = (ggml_backend_openvino_reg_context *) reg->context; GGML_ASSERT(index < ctx->devices.size()); return ctx->devices[index]; - // GGML_ASSERT(index == 0); - - // static ggml_backend_device ggml_backend_openvino_device = { - // /* .iface = */ ggml_backend_openvino_device_interface, - // /* .reg = */ reg, - // /* .context = */ nullptr, - // }; - - // return &ggml_backend_openvino_device; - - // GGML_UNUSED(reg); - // GGML_UNUSED(index); } static void * ggml_backend_openvino_get_proc_address(ggml_backend_reg_t reg, const char * name) { GGML_UNUSED(reg); - if (strcmp(name, "ggml_backend_split_buffer_type") == 0) { - return (void *) ggml_backend_openvino_split_buffer_type; - } - // if (strcmp(name, "ggml_backend_register_host_buffer") == 0) { - // return (void *)ggml_backend_openvino_register_host_buffer; - // } - // if (strcmp(name, "ggml_backend_unregister_host_buffer") == 0) { - // return (void *)ggml_backend_openvino_unregister_host_buffer; - // } + GGML_UNUSED(name); return nullptr; } @@ -515,15 +415,11 @@ static const struct ggml_backend_reg_i ggml_backend_openvino_reg_interface = { }; static int get_openvino_device_count() { - ov::Core core; - auto devices = core.get_available_devices(); - // return devices.size(); return 1; } static ggml_openvino_device_info ggml_openvino_init() { ggml_openvino_device_info info = {}; - // TODO info.device_count = get_openvino_device_count(); return info; } @@ -543,13 +439,11 @@ GGML_BACKEND_API ggml_backend_reg_t ggml_backend_openvino_reg(void) { if (!initialized) { ggml_backend_openvino_reg_context * ctx = new ggml_backend_openvino_reg_context; - // GGML_LOG_DEBUG("ggml_openvino_info().device_count = %d \n", ggml_openvino_info().device_count); for (int i = 0; i < ggml_openvino_info().device_count; i++) { ggml_backend_openvino_device_context * dev_ctx = new ggml_backend_openvino_device_context; dev_ctx->device = i; dev_ctx->name = GGML_OPENVINO_NAME + std::to_string(i); - // ggml_openvino_set_device(i); dev_ctx->description = ov::get_openvino_version().description; ggml_backend_dev_t dev = From ea2c99be1ccd685ca0595417edb1394a2553ad6a Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Wed, 5 Nov 2025 17:36:31 +0800 Subject: [PATCH 165/254] NPU unify PD (handled internally) --- ggml/src/ggml-openvino/ggml-decoder.cpp | 7 +- ggml/src/ggml-openvino/utils.cpp | 159 ++++++++++++++++++------ ggml/src/ggml-openvino/utils.h | 4 + 3 files changed, 123 insertions(+), 47 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp 
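// [Editor's note] get_openvino_device_count() above is hardcoded to return
// 1; the removed code enumerated real devices. For reference, the OpenVINO
// query it replaced looks like this:
#include <openvino/openvino.hpp>

static int count_available_openvino_devices() {
    ov::Core core;
    // get_available_devices() returns names such as "CPU", "GPU", "NPU".
    return static_cast<int>(core.get_available_devices().size());
}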
b/ggml/src/ggml-openvino/ggml-decoder.cpp index 8472f41a56..5b86e9962d 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -297,12 +297,7 @@ void GgmlOvDecoder::set_llm_params() { } } -void GgmlOvDecoder::validate_cgraph() const { - if (m_is_static && m_input_len != 1) { - throw std::runtime_error("Static graph (NPU) must have input_len == 1, but got " + std::to_string(m_input_len) + - ", try set -ub 1"); - } -} +void GgmlOvDecoder::validate_cgraph() const {} ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor * src) const { auto name = std::string(src->name); diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 50e3ef20bc..965effe327 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -80,6 +80,7 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, ggml_cgraph * int64_t decoder_end_time; int64_t conversion_end_time; int64_t compile_end_time; + int64_t infer_end_time; { std::lock_guard lock(cache_mutex); @@ -127,38 +128,79 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, ggml_cgraph * } ov_input_names_cache[cgraph] = ov_input_names; ov_output_names_cache[cgraph] = ov_output_names; + + // Set output tensors and kvcache address for NPU once and for all since the graph is static + if (is_static) { + for (size_t i = 0; i < ov_output_names.size(); i++) { + auto output_tensor = get_ov_output_tensor(ggml_decoder, ov_output_names[i]); + infer_request->set_output_tensor(i, output_tensor); + } + for (size_t i = 0; i < ov_input_names.size(); i++) { + auto param_name = ov_input_names[i]; + if (param_name.find("cache_k") == 0 || param_name.find("cache_v") == 0) { + auto input_tensor = get_ov_input_tensor_static(ggml_decoder, param_name, 0, 0); + infer_request->set_input_tensor(i, input_tensor); + } + } + } } } auto ov_input_names = ov_input_names_cache[cgraph]; auto ov_output_names = ov_output_names_cache[cgraph]; - for (size_t i = 0; i < ov_input_names.size(); i++) { - auto param_name = ov_input_names[i]; - auto input_tensor = get_ov_input_tensor(ggml_decoder, param_name); - infer_request->set_input_tensor(i, input_tensor); + if (!is_static) { + for (size_t i = 0; i < ov_input_names.size(); i++) { + auto param_name = ov_input_names[i]; + auto input_tensor = get_ov_input_tensor(ggml_decoder, param_name); + infer_request->set_input_tensor(i, input_tensor); - if (getenv("GGML_OPENVINO_DEBUG_INPUT")) { - print_input_tensor_info(param_name, input_tensor); + if (getenv("GGML_OPENVINO_DEBUG_INPUT")) { + print_input_tensor_info(param_name, input_tensor); + } } - } - for (size_t i = 0; i < ov_output_names.size(); i++) { - auto output_tensor = get_ov_output_tensor(ggml_decoder, ov_output_names[i]); - infer_request->set_output_tensor(i, output_tensor); - } + for (size_t i = 0; i < ov_output_names.size(); i++) { + auto output_tensor = get_ov_output_tensor(ggml_decoder, ov_output_names[i]); + infer_request->set_output_tensor(i, output_tensor); + } - auto input_end_time = ggml_time_us(); + infer_request->infer(); + infer_end_time = ggml_time_us(); - infer_request->infer(); - - auto infer_end_time = ggml_time_us(); - - for (size_t i = 0; i < ov_output_names.size(); i++) { - const auto output_tensor = infer_request->get_output_tensor(i); if (getenv("GGML_OPENVINO_DEBUG_OUTPUT")) { - print_output_tensor_info(ov_output_names[i], output_tensor, output_tensor.data()); + for (size_t i = 0; i < ov_output_names.size(); i++) { + const auto 
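// [Editor's note] For the static (NPU) graph, output tensors and KV-cache
// inputs are bound to the infer request once, since their addresses and
// shapes never change between tokens. Hedged sketch of that one-time
// binding (bind_static_io and its parameters are illustrative):
#include <map>
#include <vector>
#include <openvino/openvino.hpp>

void bind_static_io(ov::InferRequest & req,
                    const std::vector<ov::Tensor> & outputs,
                    const std::map<size_t, ov::Tensor> & kv_cache_inputs) {
    for (size_t i = 0; i < outputs.size(); ++i) {
        req.set_output_tensor(i, outputs[i]);
    }
    for (const auto & [input_idx, tensor] : kv_cache_inputs) {
        req.set_input_tensor(input_idx, tensor);  // cache_k / cache_v
    }
}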
output_tensor = infer_request->get_output_tensor(i); + print_output_tensor_info(ov_output_names[i], output_tensor, output_tensor.data()); + } } + } else { + auto input_len = ggml_decoder->get_input_len(); + for (int j = 0; j < input_len; j++) { + for (size_t i = 0; i < ov_input_names.size(); i++) { + auto param_name = ov_input_names[i]; + if (param_name.find("cache_k") == 0 || param_name.find("cache_v") == 0) { + continue; + } + auto input_tensor = get_ov_input_tensor_static(ggml_decoder, param_name, j, input_len); + infer_request->set_input_tensor(i, input_tensor); + + if (getenv("GGML_OPENVINO_DEBUG_INPUT")) { + const auto input_tensor = infer_request->get_input_tensor(i); + print_input_tensor_info(param_name, input_tensor); + } + } + + infer_request->infer(); + + if (getenv("GGML_OPENVINO_DEBUG_OUTPUT")) { + for (size_t i = 0; i < ov_output_names.size(); i++) { + const auto output_tensor = infer_request->get_output_tensor(i); + print_output_tensor_info(ov_output_names[i], output_tensor, output_tensor.data()); + } + } + } + infer_end_time = ggml_time_us(); } if (getenv("GGML_OPENVINO_PROFILING")) { @@ -166,8 +208,7 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, ggml_cgraph * GGML_LOG_INFO(" - Graph decoder Time: %ld ms \n", (decoder_end_time - start_time) / 1000); GGML_LOG_INFO(" - Graph conversion Time: %ld ms \n", (conversion_end_time - decoder_end_time) / 1000); GGML_LOG_INFO(" - Graph compile Time: %ld ms \n", (compile_end_time - conversion_end_time) / 1000); - GGML_LOG_INFO(" - Graph Input Time: %ld ms \n", (input_end_time - compile_end_time) / 1000); - GGML_LOG_INFO(" - Graph Inference Time: %ld ms \n", (infer_end_time - input_end_time) / 1000); + GGML_LOG_INFO(" - Graph Inference Time: %ld ms \n", (infer_end_time - compile_end_time) / 1000); } return GGML_STATUS_SUCCESS; @@ -275,41 +316,77 @@ ov::Tensor convert_ggml_input_to_ov(std::shared_ptr ggml_decoder, } // namespace ov::Tensor get_ov_input_tensor(std::shared_ptr ggml_decoder, const std::string & param_name) { - bool is_static = ggml_decoder->is_static(); - ov::Tensor input_tensor; if (ggml_decoder->get_model_extra_inputs().find(param_name) != ggml_decoder->get_model_extra_inputs().end()) { input_tensor = *ggml_decoder->get_model_extra_input_values().at(param_name); } else if (param_name.find("cache_k") == 0 || param_name.find("cache_v") == 0) { void * input_data = ggml_decoder->get_input_ggml_tensor(param_name)->data; - size_t past_kv_len = - ggml_decoder->is_static() ? 
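// [Editor's note] The static path above unifies prefill and decode by
// feeding the prompt one token at a time through the same fixed-shape
// graph. Stripped-down sketch of the loop (run_static_prefill and
// fetch_token_input are illustrative stand-ins for the patch's helpers):
#include <string>
#include <vector>
#include <openvino/openvino.hpp>

ov::Tensor fetch_token_input(const std::string & name, int j, int input_len);

void run_static_prefill(ov::InferRequest & req,
                        const std::vector<std::string> & input_names,
                        int input_len) {
    for (int j = 0; j < input_len; ++j) {
        for (size_t i = 0; i < input_names.size(); ++i) {
            if (input_names[i].rfind("cache_", 0) == 0) {
                continue;  // KV tensors were bound once, outside the loop
            }
            req.set_input_tensor(i, fetch_token_input(input_names[i], j, input_len));
        }
        req.infer();  // one token per inference; KV cache advances in place
    }
}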
ggml_decoder->get_context_size() : ggml_decoder->get_past_kv_len(); - ov::Shape input_shape = {past_kv_len, (size_t) ggml_decoder->get_num_heads_kv(), + ov::Shape input_shape = {(size_t) ggml_decoder->get_past_kv_len(), (size_t) ggml_decoder->get_num_heads_kv(), (size_t) ggml_decoder->get_head_size()}; input_tensor = ov::Tensor(ggml_decoder->get_input_type(param_name), input_shape, input_data); - } else if (is_static && param_name.find("KQ_mask") == 0) { - size_t context_size = ggml_decoder->get_context_size(); - const auto * input_tensor_ggml = ggml_decoder->get_input_ggml_tensor(param_name); - std::vector padded_data = pad_input(input_tensor_ggml, 1, context_size, -INFINITY); - input_tensor = ov::Tensor(ov::element::f32, ov::Shape{1, 1, context_size}); - auto * data_ptr = input_tensor.data(); - std::copy(padded_data.begin(), padded_data.end(), data_ptr); - - } else if (is_static && param_name.find("inp_out_ids") == 0) { - input_tensor = convert_ggml_input_to_ov(ggml_decoder, param_name); - if (input_tensor.get_size() == 0) { - input_tensor = ov::Tensor(input_tensor.get_element_type(), ov::Shape{1, 1, 1}); - *input_tensor.data() = 0; - } - } else { input_tensor = convert_ggml_input_to_ov(ggml_decoder, param_name); } return input_tensor; } +ov::Tensor get_ov_input_tensor_static(std::shared_ptr ggml_decoder, + const std::string & param_name, + int j, + int input_len) { + const auto * ggml_tensor = ggml_decoder->get_input_ggml_tensor(param_name); + const auto * op = ggml_decoder->get_tensor_used_op(ggml_tensor); + + if (param_name.find("cache_k") == 0 || param_name.find("cache_v") == 0) { + void * input_data = ggml_decoder->get_input_ggml_tensor(param_name)->data; + ov::Shape input_shape = {(size_t) ggml_decoder->get_context_size(), (size_t) ggml_decoder->get_num_heads_kv(), + (size_t) ggml_decoder->get_head_size()}; + return ov::Tensor(ggml_decoder->get_input_type(param_name), input_shape, input_data); + } + + if (param_name == "inp_pos" || param_name == "inp_tokens" || op->op == GGML_OP_SET_ROWS) { + ov::Shape input_shape = {1, 1, 1}; + ov::Tensor input_tensor(ggml_decoder->get_input_type(param_name), input_shape); + // copy the j-th value from ggml_tensor + size_t element_size = ggml_type_size(ggml_tensor->type); + void * input_data = (char *) ggml_tensor->data + j * element_size; + std::memcpy(input_tensor.data(), input_data, element_size); + return input_tensor; + } + + if (param_name == "inp_out_ids") { + ov::Shape input_shape = {1, 1, 1}; + ov::Tensor input_tensor(ggml_decoder->get_input_type(param_name), input_shape); + if (ggml_tensor->ne[0] == 0) { + *input_tensor.data() = 0; + } else if (ggml_tensor->ne[0] == 1) { + if (j == input_len - 1) { + *input_tensor.data() = *((int32_t *) ggml_tensor->data); + } else { + *input_tensor.data() = 0; + } + } else { + throw std::runtime_error("Static graph inp_out_ids unexpected ne[0] > 1"); + } + return input_tensor; + } + + if (param_name.find("KQ_mask") == 0) { + size_t context_size = ggml_decoder->get_context_size(); + const auto * input_tensor_ggml = ggml_decoder->get_input_ggml_tensor(param_name); + std::vector padded_data = pad_input(input_tensor_ggml, input_len, context_size, -INFINITY); + ov::Tensor input_tensor(ov::element::f32, ov::Shape{1, 1, context_size}); + // copy the j-th row of padded_data + auto * data_ptr = input_tensor.data(); + std::copy(padded_data.begin() + j * context_size, padded_data.begin() + (j + 1) * context_size, data_ptr); + return input_tensor; + } + + return get_ov_input_tensor(ggml_decoder, param_name); +} + 
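// [Editor's note] Sketch of the per-token scalar extraction used above: the
// j-th element of a ggml input is copied into a {1,1,1} tensor so the static
// graph always receives exactly one token. The element size comes from the
// ggml type (take_token is an illustrative name):
#include <cstring>
#include <openvino/openvino.hpp>

ov::Tensor take_token(const void * data, size_t elem_size, int j,
                      ov::element::Type type) {
    ov::Tensor t(type, ov::Shape{1, 1, 1});
    std::memcpy(t.data(), static_cast<const char *>(data) + j * elem_size,
                elem_size);
    return t;
}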
ov::Tensor get_ov_output_tensor(std::shared_ptr ggml_decoder, const std::string & result_name) { auto * ggml_tensor = ggml_decoder->get_output_ggml_tensor(result_name); auto output_type = ggml_decoder->get_output_type(result_name); diff --git a/ggml/src/ggml-openvino/utils.h b/ggml/src/ggml-openvino/utils.h index 352f67aa12..999fc53f32 100644 --- a/ggml/src/ggml-openvino/utils.h +++ b/ggml/src/ggml-openvino/utils.h @@ -39,6 +39,10 @@ ov::AnyMap get_ov_compile_config(const std::string & device); std::map get_types_to_requant(const std::string & device); ov::Tensor get_ov_input_tensor(std::shared_ptr ggml_decoder, const std::string & param_name); +ov::Tensor get_ov_input_tensor_static(std::shared_ptr ggml_decoder, + const std::string & param_name, + int j, + int input_len); ov::Tensor get_ov_output_tensor(std::shared_ptr ggml_decoder, const std::string & result_name); From 072dde0b2b6caed024960367f83bc5ef31fb34b0 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Thu, 20 Nov 2025 15:31:31 +0800 Subject: [PATCH 166/254] change graph to 4d, support multi sequences --- ggml/src/ggml-openvino/ggml-decoder.cpp | 163 ++++++++++-------- ggml/src/ggml-openvino/ggml-decoder.h | 50 ++++-- ggml/src/ggml-openvino/ggml-openvino.cpp | 8 - ggml/src/ggml-openvino/openvino/decoder.hpp | 6 +- ggml/src/ggml-openvino/openvino/op/cont.cpp | 1 + .../openvino/op/flash_attn_ext.cpp | 32 ++-- .../ggml-openvino/openvino/op/get_rows.cpp | 12 +- .../ggml-openvino/openvino/op/glu_geglu.cpp | 2 +- .../ggml-openvino/openvino/op/glu_swiglu.cpp | 2 +- ggml/src/ggml-openvino/openvino/op/mulmat.cpp | 1 + .../src/ggml-openvino/openvino/op/permute.cpp | 62 ++++++- .../src/ggml-openvino/openvino/op/reshape.cpp | 35 +++- ggml/src/ggml-openvino/openvino/op/rope.cpp | 26 +-- .../ggml-openvino/openvino/op/set_rows.cpp | 37 ++-- .../src/ggml-openvino/openvino/op/softmax.cpp | 1 + .../ggml-openvino/openvino/op/transpose.cpp | 4 +- ggml/src/ggml-openvino/openvino/op/view.cpp | 2 +- .../openvino/translate_session.cpp | 39 +---- ggml/src/ggml-openvino/openvino/utils.cpp | 26 +-- ggml/src/ggml-openvino/utils.cpp | 69 +++----- 20 files changed, 317 insertions(+), 261 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 5b86e9962d..0f913bdd75 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -44,26 +44,26 @@ GgmlOvDecoder::GgmlOvDecoder(ggml_tensor * node, int num_heads_kv, int head_size, const std::vector & swa_layers) : + m_is_static(is_static), m_cgraph(cgraph), m_node(node), m_op_name(std::string(node->name)), - m_context_size(context_size), - m_context_size_swa(context_size_swa), - m_swa_layers(swa_layers), - m_num_heads(num_heads), - m_num_heads_kv(num_heads_kv), + m_ctx(context_size), + m_ctx_swa(context_size_swa), + m_n_heads(num_heads), + m_n_heads_kv(num_heads_kv), m_head_size(head_size), - m_is_static(is_static) { + m_swa_layers(swa_layers) { set_input_output(node); } GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph, std::map> & model_weights, bool is_static) : + m_is_static(is_static), m_cgraph(cgraph), m_op_name(m_node ? 
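// [Editor's note] The move to 4-D graphs in this patch maps all ggml
// tensors to rank-4 OpenVINO shapes by reversing ne[]: ggml stores the
// innermost (contiguous) dimension first, while OpenVINO shapes are
// outermost-first. Sketch of the conversion, assuming GGML_MAX_DIMS == 4:
#include <cstdint>
#include <vector>

std::vector<size_t> ggml_ne_to_ov_shape(const int64_t ne[4]) {
    return { (size_t) ne[3], (size_t) ne[2], (size_t) ne[1], (size_t) ne[0] };
}
// e.g. a ggml tensor with ne = {head_size, n_tokens, n_heads, 1}
// becomes the OpenVINO shape {1, n_heads, n_tokens, head_size}.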
std::string(m_node->name) : ""), - m_model_weights(model_weights), - m_is_static(is_static) { + m_model_weights(model_weights) { if (auto * env = getenv("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS"); env && std::string(env) != "0") { unsetenv("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS"); print_tensor_address_map(cgraph); @@ -78,7 +78,7 @@ GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph, set_input_output(cur_node); } - // add_extra_inputs(); + add_extra_inputs(); } GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph, std::map> & model_weights) { @@ -125,7 +125,8 @@ void GgmlOvDecoder::set_input_output(ggml_tensor * node, bool naive) { // Add model inputs and weights constants, if called for the whole graph if (naive) { if (m_model_weights.find(src_name) == m_model_weights.end()) { - auto param_node = std::make_shared(get_ov_type(src), get_graph_input_shape(src)); + auto param_node = + std::make_shared(get_ov_type(src), get_graph_input_shape(node, src)); param_node->set_friendly_name(src_name); param_node->output(0).get_tensor().set_names({src_name}); m_model_inputs[src_name] = param_node; @@ -142,7 +143,8 @@ void GgmlOvDecoder::set_input_output(ggml_tensor * node, bool naive) { if (m_model_inputs.find(src_name) != m_model_inputs.end()) { continue; } - auto param_node = std::make_shared(get_ov_type(src), get_graph_input_shape(src)); + auto param_node = + std::make_shared(get_ov_type(src), get_graph_input_shape(node, src)); param_node->set_friendly_name(src_name); param_node->output(0).get_tensor().set_names({src_name}); m_model_inputs[src_name] = param_node; @@ -175,15 +177,20 @@ void GgmlOvDecoder::set_input_output(ggml_tensor * node, bool naive) { if (m_node) { switch (node->op) { case GGML_OP_RESHAPE: { - if (node->src[0]->op == GGML_OP_RESHAPE && node->src[0]->src[0]->ne[0] == node->ne[0] && - node->src[0]->src[0]->ne[1] == node->ne[1]) { + auto * src = node->src[0]; + if (src->op == GGML_OP_RESHAPE && src->src[0]->ne[0] == node->ne[0] && src->src[0]->ne[1] == node->ne[1]) { m_op_case = 4; - } else if (node->ne[0] * node->ne[1] == node->src[0]->ne[0]) { + } else if (node->ne[0] * node->ne[1] == src->ne[0]) { m_op_case = 1; - } else if (node->src[0]->ne[0] * node->src[0]->ne[1] == node->ne[0]) { + } else if (src->ne[0] * src->ne[1] == node->ne[0]) { m_op_case = 2; - } else if (node->src[0]->ne[0] * node->src[0]->ne[1] == node->ne[1]) { + if (src->ne[2] * src->ne[3] == node->ne[1]) { + m_op_case = 5; + } + } else if (src->ne[0] * src->ne[1] == node->ne[1]) { m_op_case = 3; + } else if (src->ne[1] * src->ne[2] == node->ne[1]) { + m_op_case = 6; } break; } @@ -204,7 +211,8 @@ void GgmlOvDecoder::set_input_output(ggml_tensor * node, bool naive) { } else if (ggml_is_contiguous(node->src[0])) { std::string src_name(node->view_src->name); if (src_name.find("cache") == std::string::npos) { - m_op_case = 1; + // permute Qcur + m_op_case = 4; } else { // Permute kv cache (view) int layer = extract_layer_from_name(src_name); @@ -241,10 +249,10 @@ void GgmlOvDecoder::set_input_output(ggml_tensor * node, bool naive) { case GGML_OP_VIEW: { if (node->src[0]->op == GGML_OP_VIEW) { auto * src = node->src[0]; - auto * view_src = src->view_src; - if (view_src->ne[1] != src->ne[2]) { + if (ggml_nelements(node) != ggml_nelements(src)) { throw std::runtime_error("Unsupported VIEW case"); } + // This view is a reshape, slicing happens at src->op m_op_case = 2; } } @@ -272,64 +280,80 @@ void GgmlOvDecoder::set_llm_params() { auto * node = m_cgraph->nodes[i]; std::string name = std::string(node->name); if 
(node->op == GGML_OP_FLASH_ATTN_EXT) { - auto * cache_k = node->src[1]; - cache_k = cache_k->view_src ? cache_k->view_src : cache_k; + auto * cache_k_perm = node->src[1]; + assert(cache_k_perm->op == GGML_OP_PERMUTE); + auto * cache_k_view = cache_k_perm->src[0]; + assert(cache_k_view->op == GGML_OP_VIEW); + + auto * cache_k = cache_k_view->src[0]; int layer = extract_layer_from_name(cache_k->name); + auto * mask = node->src[3]; + std::string mask_name(mask->name); + assert(mask_name.find("KQ_mask") == 0); if (std::string(node->src[3]->name).find("swa") != std::string::npos) { m_swa_layers.push_back(layer); - m_context_size_swa = cache_k->ne[1]; + m_ctx_per_seq_swa = cache_k->ne[1]; } else { - m_context_size = cache_k->ne[1]; + m_ctx_per_seq = cache_k->ne[1]; + m_n_seq = cache_k->ne[2]; } + + m_n_seq_active = mask->ne[3]; + auto seq_size = cache_k->ne[0] * cache_k->ne[1] * ggml_type_size(cache_k->type); + m_seq_active_start = ((size_t *) cache_k_view->op_params)[0] / seq_size; + m_token_len_per_seq = node->ne[2]; + + if (mask_name.find("swa") != std::string::npos) { + m_attention_size_swa = mask->ne[0]; + } else { + m_attention_size = mask->ne[0]; + } + } else if (node->op == GGML_OP_ROPE) { if (name.find("Qcur-0") == 0 || std::string(node->src[0]->name).find("Qcur-0") == 0) { m_head_size = node->ne[0]; - m_num_heads = node->ne[1]; + m_n_heads = node->ne[1]; m_rope_params = node->op_params; auto * inp_pos = node->src[1]; m_input_len = inp_pos->ne[0]; - m_past_kv_len = *(int32_t *) inp_pos->data; } else if (name.find("Kcur-0") == 0 || std::string(node->src[0]->name).find("Kcur-0") == 0) { - m_num_heads_kv = node->ne[1]; + m_n_heads_kv = node->ne[1]; } } } + m_ctx = m_ctx_per_seq * m_n_seq; + m_ctx_swa = m_ctx_per_seq_swa * m_n_seq; } -void GgmlOvDecoder::validate_cgraph() const {} +void GgmlOvDecoder::validate_cgraph() const { + if (m_n_seq > 1 && m_is_static == true) { + throw std::runtime_error("n_seq > 1 is not supported on NPU"); + } +} -ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor * src) const { - auto name = std::string(src->name); +ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor * op, const ggml_tensor * input) const { + auto name = std::string(input->name); ov::PartialShape input_shape; if (name == "inp_tokens" || name == "inp_pos" || name == "inp_out_ids") { - input_shape = ov::PartialShape{1, 1, m_is_static ? 1 : -1}; + input_shape = ov::PartialShape{1, 1, 1, m_is_static ? 1 : -1}; } else if (name.find("KQ_mask") == 0) { if (m_is_static) { - input_shape = ov::PartialShape{1, 1, m_context_size}; + input_shape = ov::PartialShape{1, 1, 1, m_ctx}; } else { - input_shape = ov::PartialShape{1, -1, -1}; + input_shape = ov::PartialShape{-1, 1, -1, -1}; } - } else if (name.find("cache_") == 0) { - auto past_token_len = -1; - if (m_is_static) { - int layer = extract_layer_from_name(name); - bool is_swa = is_swa_layer(layer); - past_token_len = is_swa ? m_context_size_swa : m_context_size; - } - input_shape = ov::PartialShape{past_token_len, m_num_heads_kv, m_head_size}; + } else if (op && op->op == GGML_OP_SET_ROWS && op->src[1] == input) { + input_shape = ov::PartialShape{1, 1, 1, m_is_static ? 1 : -1}; - } else if (const auto * op = get_tensor_used_op(src); op && op->op == GGML_OP_SET_ROWS) { - input_shape = ov::PartialShape{1, 1, m_is_static ? 
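// [Editor's note] The active-sequence window above is recovered from the
// ggml VIEW's byte offset into the KV cache: dividing by the byte size of
// one sequence's cache slice yields the first active sequence index.
// Arithmetic sketch (first_active_seq is an illustrative name; head_size
// and ctx_per_seq correspond to cache_k->ne[0] and cache_k->ne[1]):
#include <cstddef>
#include <cstdint>

size_t first_active_seq(size_t view_offset_bytes, int64_t head_size,
                        int64_t ctx_per_seq, size_t type_size) {
    size_t bytes_per_seq = (size_t) head_size * (size_t) ctx_per_seq * type_size;
    return view_offset_bytes / bytes_per_seq;  // offset is sequence-aligned
}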
1 : -1}; - - } else if (src->op == GGML_OP_VIEW) { + } else if (input->op == GGML_OP_VIEW) { // This case is added to make test-backend-ops work - input_shape = ov::PartialShape{get_shape(src->view_src)}; + input_shape = ov::PartialShape{get_shape(input->view_src)}; } else { - input_shape = ov::PartialShape{get_shape(src)}; + input_shape = ov::PartialShape{get_shape(input)}; } return input_shape; } @@ -339,25 +363,9 @@ void GgmlOvDecoder::add_extra_inputs() { // 1. `attention_size`, used in FLASH_ATTN where the shape of the matmul's are 256 aligned, // see llama_kv_cache_unified::get_n_kv and llama_kv_cache_unified::get_padding. // Not used for NPU. - // Update: not used anymore after the optimization of making kvcache dynamic (but breaks iSWA models) - int64_t attention_size = -1; - int64_t attention_size_swa = -1; - for (const auto & node : m_nodes) { - if (node->op == GGML_OP_FLASH_ATTN_EXT) { - auto * mask = node->src[3]; - std::string mask_name(mask->name); - if (mask_name.find("KQ_mask") != 0) { - throw std::runtime_error("Unexpected flash attention node: " + std::string(mask->name)); - } - if (mask_name.find("swa") != std::string::npos) { - attention_size_swa = mask->ne[0]; - } else { - attention_size = mask->ne[0]; - } - } - } + // 2. `n_seq_active` and `seq_active_start`, used in FLASH_ATTN_EXT to indicate the active sequences in the batch - auto create_attention_size_input = [this](const std::string & name, int64_t size) { + auto create_1d_input = [this](const std::string & name, int64_t size) { auto param_node = std::make_shared(ov::element::i64, ov::Shape{1}); param_node->set_friendly_name(name); param_node->output(0).get_tensor().set_names({name}); @@ -368,10 +376,15 @@ void GgmlOvDecoder::add_extra_inputs() { m_model_extra_input_values[name] = tensor; }; - create_attention_size_input("attention_size", attention_size); - if (attention_size_swa != -1) { - create_attention_size_input("attention_size_swa", attention_size_swa); + create_1d_input("attention_size", m_attention_size); + if (m_attention_size_swa != -1) { + create_1d_input("attention_size_swa", m_attention_size_swa); } + create_1d_input("n_seq_active", m_n_seq_active); + create_1d_input("seq_active_start", m_seq_active_start); + create_1d_input("seq_active_end", m_seq_active_start + m_n_seq_active); + create_1d_input("token_len_per_seq", m_token_len_per_seq); + // create_1d_input("token_len", m_token_len_per_seq * m_n_seq_active); } const ggml_tensor * GgmlOvDecoder::get_tensor_used_op(const ggml_tensor * tensor) const { @@ -472,6 +485,8 @@ std::shared_ptr GgmlOvDecoder::create_weight_node(ggml_tensor * tensor auto node_shape = get_shape(tensor); auto ne_total = ggml_nelements(tensor); + OPENVINO_ASSERT(node_shape[0] == 1, "Got 4D weights, expect all weights to be 2D: ", tensor->name); + node_shape.erase(node_shape.begin()); OPENVINO_ASSERT(node_shape[0] == 1, "Got 3D weights, expect all weights to be 2D: ", tensor->name); node_shape.erase(node_shape.begin()); @@ -641,7 +656,7 @@ void print_tensor_address_map(const ggml_cgraph * cgraph) { std::vector GgmlOvDecoder::get_shape(const ggml_tensor * tensor) { std::vector shape; - for (int i = GGML_MAX_DIMS - 2; i >= 0; --i) { + for (int i = GGML_MAX_DIMS - 1; i >= 0; --i) { shape.push_back(static_cast(tensor->ne[i])); } return shape; @@ -649,7 +664,7 @@ std::vector GgmlOvDecoder::get_shape(const ggml_tensor * tensor) { std::vector GgmlOvDecoder::get_stride(const ggml_tensor * tensor) { std::vector stride; - for (int i = GGML_MAX_DIMS - 2; i >= 0; --i) { + for (int i = 
GGML_MAX_DIMS - 1; i >= 0; --i) { stride.push_back(static_cast(tensor->nb[i])); } return stride; @@ -708,7 +723,11 @@ std::vector GgmlOvDecoder::get_output_stride(const std::string & name) c } ov::PartialShape GgmlOvDecoder::get_output_shape(const std::string & name) const { - return ov::PartialShape(get_shape(m_outputs.at(name))); + auto * ggml_tensor = m_outputs.at(name); + if (ggml_tensor->op == GGML_OP_SET_ROWS) { + ggml_tensor = ggml_tensor->view_src; + } + return ov::PartialShape(get_shape(ggml_tensor)); } ov::element::Type GgmlOvDecoder::get_output_type(const std::string & name) const { @@ -738,8 +757,8 @@ int32_t * GgmlOvDecoder::get_output_op_params(const std::string & name) const { void GgmlOvDecoder::visit_subgraph(std::function)> node_visitor) const { for (const auto & node : m_nodes) { - auto decoder = std::make_shared(node, m_cgraph, m_is_static, m_context_size, m_context_size_swa, - m_num_heads, m_num_heads_kv, m_head_size, m_swa_layers); + auto decoder = std::make_shared(node, m_cgraph, m_is_static, m_ctx, m_ctx_swa, m_n_heads, + m_n_heads_kv, m_head_size, m_swa_layers); node_visitor(decoder); } } diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index fe30bde445..e2efc73f17 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -103,20 +103,20 @@ public: virtual const std::vector & get_model_output_names() const override { return m_model_output_names; } - virtual int get_context_size() const override { return m_context_size; } + virtual int get_ctx_size() const { return m_ctx; } - virtual int get_context_size_swa() const override { return m_context_size_swa; } + virtual int get_ctx_swa_size() const { return m_ctx_swa; } + + virtual int get_ctx_per_seq() const { return m_ctx_per_seq; } + + virtual int get_ctx_per_seq_swa() const { return m_ctx_per_seq_swa; } + + virtual int get_n_seq() const { return m_n_seq; } virtual int is_swa_layer(int layer) const override { return std::find(m_swa_layers.begin(), m_swa_layers.end(), layer) != m_swa_layers.end(); } - virtual int get_num_heads() const override { return m_num_heads; } - - virtual int get_num_heads_kv() const override { return m_num_heads_kv; } - - virtual int get_head_size() const override { return m_head_size; } - int get_past_kv_len() const { return m_past_kv_len; } int get_input_len() const { return m_input_len; } @@ -127,7 +127,7 @@ public: virtual bool is_static() const override { return m_is_static; } - ov::PartialShape get_graph_input_shape(const ggml_tensor * src) const; + ov::PartialShape get_graph_input_shape(const ggml_tensor * op, const ggml_tensor * input) const; static void dump_cgraph(const ggml_cgraph * cgraph, std::string & filename); @@ -151,10 +151,11 @@ private: static std::vector get_stride(const ggml_tensor * tensor); static ov::element::Type get_ov_type(const ggml_tensor * tensor); - // set context_size, num_heads, etc void set_llm_params(); void validate_cgraph() const; + bool m_is_static = false; + ggml_cgraph * m_cgraph = nullptr; ggml_tensor * m_node = nullptr; std::vector m_nodes; @@ -171,17 +172,28 @@ private: std::map> m_model_extra_input_values; std::map> m_model_weights; std::vector m_model_output_names; - int m_context_size; - int m_context_size_swa; + + // Fixed for a model + int m_ctx = -1; + int m_ctx_swa = -1; + int m_ctx_per_seq = -1; + int m_ctx_per_seq_swa = -1; + int m_n_seq = -1; + int m_n_heads = -1; + int m_n_heads_kv = -1; + int m_head_size = -1; std::vector m_swa_layers; - int m_num_heads; - 
int m_num_heads_kv; - int m_head_size; - int m_past_kv_len; - int m_input_len; - int32_t * m_rope_params; std::vector m_kv_names; - bool m_is_static = false; + + // Changed per inference + int m_n_seq_active = -1; + int m_seq_active_start = -1; + int m_attention_size = -1; + int m_attention_size_swa = -1; + int m_input_len = -1; + int m_token_len_per_seq = -1; + int m_past_kv_len = -1; + int32_t * m_rope_params = nullptr; }; void print_tensor_address_map(const ggml_cgraph * cgraph); diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp index b8630fa42c..910c706bda 100644 --- a/ggml/src/ggml-openvino/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino.cpp @@ -329,10 +329,6 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con GGML_LOG_WARN("OpenVINO backend does not support tensor type %s\n", ggml_type_name(op->type)); return false; } - if (op->ne[3] != 1) { - GGML_LOG_WARN("OpenVINO backend does not support tensors with ne[3] != 1\n"); - return false; - } for (int i = 0; i < GGML_MAX_SRC; i++) { auto * src = op->src[i]; if (src == nullptr) { @@ -342,10 +338,6 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con GGML_LOG_WARN("OpenVINO backend does not support tensor type %s\n", ggml_type_name(src->type)); return false; } - if (src->ne[3] != 1) { - GGML_LOG_WARN("OpenVINO backend does not support tensors with ne[3] != 1\n"); - return false; - } if (ggml_is_quantized(src->type) && src->ne[2] != 1) { GGML_LOG_WARN("OpenVINO backend does not support 3D quantized tensors\n"); return false; diff --git a/ggml/src/ggml-openvino/openvino/decoder.hpp b/ggml/src/ggml-openvino/openvino/decoder.hpp index a3cb995a3c..8f86a4de06 100644 --- a/ggml/src/ggml-openvino/openvino/decoder.hpp +++ b/ggml/src/ggml-openvino/openvino/decoder.hpp @@ -58,15 +58,11 @@ public: virtual const std::map>& get_model_weights() const = 0; virtual const std::vector& get_model_output_names() const = 0; - virtual int get_num_heads() const = 0; - virtual int get_num_heads_kv() const = 0; - virtual int get_head_size() const = 0; virtual int32_t* get_rope_params() const = 0; virtual std::map get_kv_param_res_names() const = 0; virtual bool is_static() const = 0; - virtual int get_context_size() const = 0; - virtual int get_context_size_swa() const = 0; + virtual int is_swa_layer(int layer) const = 0; }; diff --git a/ggml/src/ggml-openvino/openvino/op/cont.cpp b/ggml/src/ggml-openvino/openvino/op/cont.cpp index a17273d426..618b4efdea 100644 --- a/ggml/src/ggml-openvino/openvino/op/cont.cpp +++ b/ggml/src/ggml-openvino/openvino/op/cont.cpp @@ -27,6 +27,7 @@ OutputVector translate_cont(const NodeContext & context) { if (op_case == 1) { // The input comes from a PERMUTE + throw std::runtime_error("Code of this case might be outdated"); dst_shape[1] = -1; res = std::make_shared( context.get_input(0), ov::op::v0::Constant::create(ov::element::i64, {dst_shape.size()}, dst_shape), false); diff --git a/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp b/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp index de2af85aa8..efbdf421c6 100644 --- a/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp +++ b/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp @@ -42,17 +42,11 @@ OutputVector translate_flash_attn_ext(const NodeContext & context) { if (context.has_input(mask_name)) { mask_sliced = context.get_input(mask_name); } else { + auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); + auto one = 
ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); + auto two = ov::op::v0::Constant::create(ov::element::i64, {1}, {2}); auto token_len = get_dimensions(q, {2}); - auto kv_len = get_dimensions(k.get_node_shared_ptr(), {2}); - - auto zero_2d = ov::op::v0::Constant::create(ov::element::i64, {2}, {0, 0}); - auto one_2d = ov::op::v0::Constant::create(ov::element::i64, {2}, {1, 1}); - auto zero_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); - auto two_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {2}); - auto axes = ov::op::v0::Constant::create(ov::element::i64, {2}, {1, 2}); - - auto stop = std::make_shared(ov::OutputVector{token_len, kv_len}, 0); - mask_sliced = std::make_shared(mask, zero_2d, stop, one_2d, axes); + mask_sliced = std::make_shared(mask, zero, token_len, one, two); } if (mask_sliced.get_element_type() != ov::element::f16) { @@ -63,27 +57,29 @@ OutputVector translate_flash_attn_ext(const NodeContext & context) { int64_t factor = num_heads / num_heads_kv; if (factor > 1) { ov::Output kv_broadcast_shape, kv_unsqueezed, new_kv_shape; - auto unsqueeze_axes = ov::op::v0::Constant::create(ov::element::i64, Shape{}, {1}); + auto unsqueeze_axes = ov::op::v0::Constant::create(ov::element::i64, Shape{}, {2}); kv_unsqueezed = std::make_shared(kv, unsqueeze_axes); - kv_broadcast_shape = - ov::op::v0::Constant::create(ov::element::i64, {4}, {num_heads_kv, factor, (int64_t) 1, head_size}); - new_kv_shape = ov::op::v0::Constant::create(ov::element::i64, {3}, {num_heads, (int64_t) -1, head_size}); + kv_broadcast_shape = ov::op::v0::Constant::create( + ov::element::i64, {5}, {(int64_t) 1, num_heads_kv, factor, (int64_t) 1, head_size}); + new_kv_shape = + ov::op::v0::Constant::create(ov::element::i64, {4}, {(int64_t) 0, num_heads, (int64_t) -1, head_size}); kv = std::make_shared(kv_unsqueezed, kv_broadcast_shape, ov::op::BroadcastType::BIDIRECTIONAL); - kv = std::make_shared(kv, new_kv_shape, false); + kv = std::make_shared(kv, new_kv_shape, true); } return kv; }; auto q_shape = context.get_input_shape(0).to_shape(); auto k_shape = context.get_input_shape(1).to_shape(); - k = tile_kv(q_shape[0], k_shape[0], q_shape[2], k); - v = tile_kv(q_shape[0], k_shape[0], q_shape[2], v); + k = tile_kv(q_shape[1], k_shape[1], q_shape[3], k); + v = tile_kv(q_shape[1], k_shape[1], q_shape[3], v); auto sdpa = std::make_shared(q, k, v, mask_sliced, scale_node, false); - res = std::make_shared(sdpa, ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 0, 2})); + res = std::make_shared(sdpa, + ov::op::v0::Constant::create(ov::element::i64, {4}, {0, 2, 1, 3})); res = std::make_shared(res, ov::element::f32); return rename_outputs_with_suffix({res}, context.get_name()); } diff --git a/ggml/src/ggml-openvino/openvino/op/get_rows.cpp b/ggml/src/ggml-openvino/openvino/op/get_rows.cpp index 2e3520554e..ace79c33a9 100644 --- a/ggml/src/ggml-openvino/openvino/op/get_rows.cpp +++ b/ggml/src/ggml-openvino/openvino/op/get_rows.cpp @@ -8,6 +8,7 @@ #include #include #include +#include namespace ov { namespace frontend { @@ -28,11 +29,13 @@ OutputVector translate_get_rows(const NodeContext & context) { indices = process_view_input(context, 1); } - // data[b,x,y] ind[1,b,x'] test-backend-ops case - // data[x,y] ind[1,1,x'] normal case - indices = std::make_shared(indices, ov::op::v0::Constant::create(ov::element::i64, {1}, {0})); - if (data.get_partial_shape().rank() == 3) { + // data[1,b,x,y] ind[1,1,b,x'] test-backend-ops case + // data[x,y] ind[1,1,1,x'] normal case + indices = + 
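// [Editor's note] The tile_kv step above implements grouped-query
// attention: each KV head is repeated n_heads / n_heads_kv times via
// unsqueeze -> broadcast -> reshape. Standalone sketch on a 4-D
// [batch, n_heads_kv, tokens, head_size] input (repeat_kv_heads is an
// illustrative name):
#include <memory>
#include <vector>
#include <openvino/op/ops.hpp>

ov::Output<ov::Node> repeat_kv_heads(const ov::Output<ov::Node> & kv,
                                     int64_t n_heads, int64_t n_heads_kv,
                                     int64_t head_size) {
    using namespace ov::op;
    const int64_t factor = n_heads / n_heads_kv;
    if (factor <= 1) {
        return kv;
    }
    auto axis = v0::Constant::create(ov::element::i64, ov::Shape{}, {2});
    auto unsq = std::make_shared<v0::Unsqueeze>(kv, axis);  // [B,Hkv,1,T,D]
    auto bcast_shape = v0::Constant::create(
        ov::element::i64, {5}, std::vector<int64_t>{1, n_heads_kv, factor, 1, head_size});
    auto bcast = std::make_shared<v3::Broadcast>(
        unsq, bcast_shape, ov::op::BroadcastType::BIDIRECTIONAL);  // [B,Hkv,f,T,D]
    auto out_shape = v0::Constant::create(
        ov::element::i64, {4}, std::vector<int64_t>{0, n_heads, -1, head_size});
    // special_zero=true keeps dim 0 (the batch dim) from the input
    return std::make_shared<v1::Reshape>(bcast, out_shape, true);  // [B,Hq,T,D]
}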
std::make_shared(indices, ov::op::v0::Constant::create(ov::element::i64, {2}, {0, 1})); + if (data.get_partial_shape().rank() == 4) { auto axis = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{}, {1}); + data = std::make_shared(data, ov::op::v0::Constant::create(ov::element::i64, {1}, {0})); res = std::make_shared(data, indices, axis, 1); } else { auto axis = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{}, {0}); @@ -42,6 +45,7 @@ OutputVector translate_get_rows(const NodeContext & context) { if (res.get_element_type() != context.get_output_type(0)) { res = std::make_shared(res, context.get_output_type(0)); } + res = std::make_shared(res, ov::op::v0::Constant::create(ov::element::i64, {1}, {0})); return rename_outputs_with_suffix({res}, context.get_name()); } diff --git a/ggml/src/ggml-openvino/openvino/op/glu_geglu.cpp b/ggml/src/ggml-openvino/openvino/op/glu_geglu.cpp index 3e3cae0071..80bfbafd83 100644 --- a/ggml/src/ggml-openvino/openvino/op/glu_geglu.cpp +++ b/ggml/src/ggml-openvino/openvino/op/glu_geglu.cpp @@ -26,7 +26,7 @@ OutputVector translate_glu_geglu(const NodeContext & context) { src1 = context.get_input(1); } else { auto combined = context.get_input(0); - auto split_axis = ov::op::v0::Constant::create(ov::element::i64, {}, {2}); + auto split_axis = ov::op::v0::Constant::create(ov::element::i64, {}, {3}); auto split = std::make_shared(combined, split_axis, 2); src0 = split->output(0); src1 = split->output(1); diff --git a/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp b/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp index 61cdaadea3..2148931246 100644 --- a/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp +++ b/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp @@ -26,7 +26,7 @@ OutputVector translate_glu_swiglu(const NodeContext & context) { src1 = context.get_input(1); } else { auto combined = context.get_input(0); - auto split_axis = ov::op::v0::Constant::create(ov::element::i64, {}, {2}); + auto split_axis = ov::op::v0::Constant::create(ov::element::i64, {}, {3}); auto split = std::make_shared(combined, split_axis, 2); src0 = split->output(0); src1 = split->output(1); diff --git a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp index c161bce75d..27e4bfa460 100644 --- a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp +++ b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp @@ -53,6 +53,7 @@ OutputVector translate_mulmat(const NodeContext & context) { Output Z = A_batch_larger ? B : A; int64_t factor = A_batch_larger ? 
A_batch / B_batch : B_batch / A_batch; if (factor > 1) { + // TODO code is outdated auto A_batch_node = ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector{A_batch}); auto B_batch_node = ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector{B_batch}); auto factor_node = ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector{factor}); diff --git a/ggml/src/ggml-openvino/openvino/op/permute.cpp b/ggml/src/ggml-openvino/openvino/op/permute.cpp index cf651a084b..2fe2325d6a 100644 --- a/ggml/src/ggml-openvino/openvino/op/permute.cpp +++ b/ggml/src/ggml-openvino/openvino/op/permute.cpp @@ -6,12 +6,12 @@ #include #include #include +#include #include #include #include #include #include -#include namespace ov { namespace frontend { @@ -22,12 +22,64 @@ OutputVector translate_permute(const NodeContext & context) { num_inputs_check(context, 1, 1); int op_case = context.get_op_case(); - FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2 || op_case == 3, "Unsupported PERMUTE case"); - ov::Output res; - auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); + FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2 || op_case == 3 || op_case == 4, + "Unsupported PERMUTE case"); + ov::Output res; auto src = context.get_input(0); - res = std::make_shared(src, ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 0, 2})); + auto perm = ov::op::v0::Constant::create(ov::element::i64, {4}, {0, 2, 1, 3}); + + if (op_case == 1) { + res = std::make_shared(src, perm); + } else if (op_case == 4) { + auto output_shape = context.get_output_shape(0).to_shape(); + auto n_heads = ov::op::v0::Constant::create(ov::element::i64, {1}, {output_shape[1]}); + auto head_size = ov::op::v0::Constant::create(ov::element::i64, {1}, {output_shape[3]}); + auto n_seq_active = context.get_input("n_seq_active"); + auto neg_one = ov::op::v0::Constant::create(ov::element::i64, {1}, {-1}); + + auto new_shape = + std::make_shared(ov::OutputVector{n_seq_active, neg_one, n_heads, head_size}, 0); + + // // Alternative + // auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); + // auto new_shape = std::make_shared(ov::OutputVector{n_seq_active, neg_one, zero, zero}, 0); + + auto reshaped = std::make_shared(src, new_shape, true); + res = std::make_shared(reshaped, perm); + } else { + auto cache_shape = src.get_partial_shape(); + auto output_shape = context.get_output_shape(0).to_shape(); + int64_t head_size = output_shape[3]; + int64_t n_heads = output_shape[1]; + int64_t ctx_per_seq = cache_shape[2].get_length(); + int64_t n_seq = cache_shape[1].get_length(); + + Output attention_size; + if (context.is_static()) { + attention_size = ov::op::v0::Constant::create(ov::element::i64, {1}, {INT_MAX}); + } else if (op_case == 2) { + attention_size = context.get_input("attention_size"); + } else { + attention_size = context.get_input("attention_size_swa"); + } + + // 1. reshape to [n_seq, ctx_per_seq, n_heads, head_size] + // 2. slice out the active sequences + // 3. slice out the attention part in each sequence + // 4. 
permute + auto seq_active_start = context.get_input("seq_active_start"); + auto seq_active_end = context.get_input("seq_active_end"); + + auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); + auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); + + auto src_reshaped = std::make_shared( + src, ov::op::v0::Constant::create(ov::element::i64, {4}, {n_seq, ctx_per_seq, n_heads, head_size}), false); + auto slice1 = std::make_shared(src_reshaped, seq_active_start, seq_active_end, one, zero); + auto slice2 = std::make_shared(slice1, zero, attention_size, one, one); + res = std::make_shared(slice2, perm); + } return rename_outputs_with_suffix({res}, context.get_name()); } diff --git a/ggml/src/ggml-openvino/openvino/op/reshape.cpp b/ggml/src/ggml-openvino/openvino/op/reshape.cpp index bbf94865ef..b34fa626f1 100644 --- a/ggml/src/ggml-openvino/openvino/op/reshape.cpp +++ b/ggml/src/ggml-openvino/openvino/op/reshape.cpp @@ -7,8 +7,10 @@ #include #include #include +#include #include #include +#include #include namespace ov { @@ -23,22 +25,43 @@ OutputVector translate_reshape(const NodeContext & context) { } int op_case = context.get_op_case(); - FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2 || op_case == 3 || op_case == 4, - "Unsupported RESHAPE case"); + FRONT_END_CHECK_IMPLEMENTED( + op_case == 1 || op_case == 2 || op_case == 3 || op_case == 4 || op_case == 5 || op_case == 6, + "Unsupported RESHAPE case"); auto output_shape = context.get_output_shape(0).to_shape(); std::shared_ptr new_shape_node; if (op_case == 1) { new_shape_node = ov::op::v0::Constant::create( - ov::element::i64, {3}, std::vector{-1, (int64_t) output_shape[1], (int64_t) output_shape[2]}); + ov::element::i64, {4}, + std::vector{(int64_t) output_shape[0], -1, (int64_t) output_shape[2], (int64_t) output_shape[3]}); + } else if (op_case == 2) { new_shape_node = ov::op::v0::Constant::create( - ov::element::i64, {3}, std::vector{(int64_t) output_shape[0], -1, (int64_t) output_shape[2]}); + ov::element::i64, {4}, + std::vector{(int64_t) output_shape[0], (int64_t) output_shape[1], -1, (int64_t) output_shape[3]}); + } else if (op_case == 3) { - new_shape_node = - ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector{(int64_t) output_shape[0], -1, 1}); + throw std::runtime_error("might be outdated RESHAPE case"); + new_shape_node = ov::op::v0::Constant::create( + ov::element::i64, {4}, std::vector{(int64_t) output_shape[0], (int64_t) output_shape[1], -1, 1}); + } else if (op_case == 4) { return {context.get_input(0).get_node_shared_ptr()->input_value(0)}; + + } else if (op_case == 5) { + std::vector shape_vec = {1, 1, -1, (int64_t) context.get_output_shape(0).to_shape()[3]}; + new_shape_node = ov::op::v0::Constant::create(ov::element::i64, {4}, shape_vec); + + // // Alternative + // auto token_len = context.get_input("token_len"); + // auto emb_size = + // ov::op::v0::Constant::create(ov::element::i64, {1}, {(int64_t) context.get_output_shape(0).to_shape()[3]}); + // auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); + // new_shape_node = std::make_shared(ov::OutputVector{one, one, token_len, emb_size}, 0); + + } else if (op_case == 6) { + new_shape_node = ov::op::v0::Constant::create(ov::element::i64, {4}, context.get_output_shape(0).to_shape()); } auto res = std::make_shared(context.get_input(0), new_shape_node, false); return rename_outputs_with_suffix({res}, context.get_name()); diff --git a/ggml/src/ggml-openvino/openvino/op/rope.cpp 
b/ggml/src/ggml-openvino/openvino/op/rope.cpp index 9ad2e25284..5c83867d18 100644 --- a/ggml/src/ggml-openvino/openvino/op/rope.cpp +++ b/ggml/src/ggml-openvino/openvino/op/rope.cpp @@ -52,10 +52,10 @@ OutputVector translate_rope(const NodeContext & context) { if (op_case == 2) { // The input comes from a VIEW - int slice_len = output_shape[1] * output_shape[2]; + int slice_len = output_shape[2] * output_shape[3]; data_node = process_view_input(context, 0, slice_len).get_node_shared_ptr(); auto data_shape = ov::op::v0::Constant::create( - ov::element::i64, {3}, std::vector{-1, (int64_t) output_shape[1], (int64_t) output_shape[2]}); + ov::element::i64, {4}, std::vector{1, -1, (int64_t) output_shape[2], (int64_t) output_shape[3]}); data_node = std::make_shared(data_node, data_shape, false); } @@ -67,9 +67,10 @@ OutputVector translate_rope(const NodeContext & context) { auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); auto two = ov::op::v0::Constant::create(ov::element::i64, {1}, {2}); - auto end = ov::op::v0::Constant::create(ov::element::i64, {1}, {output_shape[2]}); - auto even_slice = std::make_shared(data_node, zero, end, two, two); - auto odd_slice = std::make_shared(data_node, one, end, two, two); + auto three = ov::op::v0::Constant::create(ov::element::i64, {1}, {3}); + auto end = ov::op::v0::Constant::create(ov::element::i64, {1}, {output_shape[3]}); + auto even_slice = std::make_shared(data_node, zero, end, two, three); + auto odd_slice = std::make_shared(data_node, one, end, two, three); Output first_half = std::make_shared(std::make_shared(even_slice, cos_theta_node), @@ -79,14 +80,17 @@ OutputVector translate_rope(const NodeContext & context) { std::make_shared(odd_slice, cos_theta_node)); first_half = std::make_shared(first_half, - ov::op::v0::Constant::create(ov::element::i64, {1}, {3})); + ov::op::v0::Constant::create(ov::element::i64, {1}, {4})); second_half = std::make_shared(second_half, - ov::op::v0::Constant::create(ov::element::i64, {1}, {3})); - auto stack = std::make_shared(OutputVector{first_half, second_half}, 3); - res = std::make_shared(stack, std::make_shared(data_node), false); + ov::op::v0::Constant::create(ov::element::i64, {1}, {4})); + auto stack = std::make_shared(OutputVector{first_half, second_half}, 4); + + auto data_shape = ov::op::v0::Constant::create( + ov::element::i64, {4}, std::vector{1, -1, (int64_t) output_shape[2], (int64_t) output_shape[3]}); + res = std::make_shared(stack, data_shape, false); } else if (mode == ROPE_TYPE_NEOX) { auto data_split = std::make_shared( - data_node, ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {2}), 2); + data_node, ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {3}), 2); Output slice_data_node_0 = data_split->outputs()[0]; Output slice_data_node_1 = data_split->outputs()[1]; @@ -98,7 +102,7 @@ OutputVector translate_rope(const NodeContext & context) { std::make_shared(slice_data_node_0, sin_theta_node), std::make_shared(slice_data_node_1, cos_theta_node)); - res = std::make_shared(ov::OutputVector{first_half_node, second_half_node}, 2); + res = std::make_shared(ov::OutputVector{first_half_node, second_half_node}, 3); } return rename_outputs_with_suffix({res}, context.get_name()); diff --git a/ggml/src/ggml-openvino/openvino/op/set_rows.cpp b/ggml/src/ggml-openvino/openvino/op/set_rows.cpp index 8d0277ce86..d71aca1d7f 100644 --- a/ggml/src/ggml-openvino/openvino/op/set_rows.cpp +++ 
b/ggml/src/ggml-openvino/openvino/op/set_rows.cpp @@ -28,33 +28,28 @@ OutputVector translate_set_rows(const NodeContext & context) { num_inputs_check(context, 3, 3); auto data = context.get_input(0); + auto indices = context.get_input(1); + auto dst = context.get_input(2); + data = std::make_shared(data, context.get_output_type(0)); auto dst_shape = context.get_output_shape(0).to_shape(); - FRONT_END_OP_CONVERSION_CHECK(dst_shape[0] == 1, "Unsupported shape in SET_ROWS"); - auto indices = context.get_input(1); - auto dst = context.get_input(context.get_output_name()); + auto ind_squeezed = + std::make_shared(indices, ov::op::v0::Constant::create(ov::element::i64, {3}, {0, 1, 2})); + auto data_reshaped = std::make_shared( + data, + ov::op::v0::Constant::create(ov::element::i64, {4}, + {(int64_t) 1, (int64_t) 1, (int64_t) -1, (int64_t) dst_shape[3]}), + false); + auto axes = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {2}); - auto zero = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {0}); - Output res; - if (context.is_static()) { - auto dst_reshaped = std::make_shared( - dst, ov::op::v0::Constant::create(ov::element::i64, {2}, {(int64_t) dst_shape[1], (int64_t) dst_shape[2]}), - false); - auto indices_reshaped = - std::make_shared(indices, ov::op::v0::Constant::create(ov::element::i64, {2}, {0, 1})); - auto data_reshaped = std::make_shared( - data, ov::op::v0::Constant::create(ov::element::i64, {2}, {(int64_t) -1, (int64_t) dst_shape[2]}), false); + Output res = std::make_shared(dst, ind_squeezed, data_reshaped, axes); - auto updated = std::make_shared(dst_reshaped, indices_reshaped, data_reshaped, zero); - res = std::make_shared(updated, std::make_shared(dst), false); - } else { - int64_t dim1 = dst.get_partial_shape()[1].get_length(); - int64_t dim2 = dst.get_partial_shape()[2].get_length(); - data = std::make_shared( - data, ov::op::v0::Constant::create(ov::element::i64, {3}, {(int64_t) -1, dim1, dim2}), false); - res = std::make_shared(OutputVector{dst, data}, 0); + if (auto dst_reshape = std::dynamic_pointer_cast(dst.get_node_shared_ptr())) { + // Fix the case of multiple sequences, reshape back to original shape [1, n_seq, ctx_per_seq, emb] + res = std::make_shared( + res, ov::op::v0::Constant::create(ov::element::i64, {4}, dst_reshape->get_input_shape(0)), false); } return rename_outputs_with_suffix({res}, context.get_name()); } diff --git a/ggml/src/ggml-openvino/openvino/op/softmax.cpp b/ggml/src/ggml-openvino/openvino/op/softmax.cpp index 6c43054050..591bcb46c4 100644 --- a/ggml/src/ggml-openvino/openvino/op/softmax.cpp +++ b/ggml/src/ggml-openvino/openvino/op/softmax.cpp @@ -23,6 +23,7 @@ namespace ggml { namespace op { OutputVector translate_soft_max(const NodeContext & context) { + // TODO code is outdated num_inputs_check(context, 1, 2); auto input_node = context.get_input(0).get_node_shared_ptr(); diff --git a/ggml/src/ggml-openvino/openvino/op/transpose.cpp b/ggml/src/ggml-openvino/openvino/op/transpose.cpp index 6b4f8a849b..572f98125d 100644 --- a/ggml/src/ggml-openvino/openvino/op/transpose.cpp +++ b/ggml/src/ggml-openvino/openvino/op/transpose.cpp @@ -12,8 +12,8 @@ namespace op { OutputVector translate_transpose(const NodeContext & context) { num_inputs_check(context, 1, 1); - auto res = std::make_shared(context.get_input(0), - ov::op::v0::Constant::create(ov::element::i64, {3}, {0, 2, 1})); + auto res = std::make_shared( + context.get_input(0), ov::op::v0::Constant::create(ov::element::i64, {4}, {0, 1, 3, 2})); return 
rename_outputs_with_suffix({res}, context.get_name()); } diff --git a/ggml/src/ggml-openvino/openvino/op/view.cpp b/ggml/src/ggml-openvino/openvino/op/view.cpp index b53abca7e9..6bf980cab6 100644 --- a/ggml/src/ggml-openvino/openvino/op/view.cpp +++ b/ggml/src/ggml-openvino/openvino/op/view.cpp @@ -11,7 +11,7 @@ OutputVector translate_view(const NodeContext & context) { if (context.get_op_case() == 2) { auto dst_shape = context.get_output_shape(0).to_shape(); - return rename_outputs_with_suffix({process_view_input(context, 0, dst_shape[1] * dst_shape[2])}, + return rename_outputs_with_suffix({process_view_input(context, 0, dst_shape[2] * dst_shape[3])}, context.get_name()); } return {context.get_input(0)}; diff --git a/ggml/src/ggml-openvino/openvino/translate_session.cpp b/ggml/src/ggml-openvino/openvino/translate_session.cpp index def1f39460..a28946c617 100644 --- a/ggml/src/ggml-openvino/openvino/translate_session.cpp +++ b/ggml/src/ggml-openvino/openvino/translate_session.cpp @@ -72,15 +72,8 @@ ov::pass::MakeStateful::ParamResPairs get_kv_param_res_pairs( return pairs; } -void add_token_len(TensorMap & tensor_map) { - auto inp_tokens = tensor_map.at("inp_tokens").get_node_shared_ptr(); - auto token_len = get_dimensions(inp_tokens, {2}); - token_len->set_friendly_name("token_len"); - tensor_map.insert({"token_len", token_len->output(0)}); -} - void add_sliced_mask(TensorMap & tensor_map, GgmlDecoder & ggml_model_decoder) { - auto token_len = tensor_map.at("token_len").get_node_shared_ptr(); + auto token_len_per_seq = tensor_map.at("token_len_per_seq").get_node_shared_ptr(); auto create_sliced_mask = [&](const std::string & mask_name, const std::string & sliced_name, bool is_static) { if (tensor_map.find(mask_name) != tensor_map.end()) { @@ -89,28 +82,10 @@ void add_sliced_mask(TensorMap & tensor_map, GgmlDecoder & ggml_model_decoder) { if (is_static) { mask_sliced = mask; } else { - auto zero_2d = ov::op::v0::Constant::create(ov::element::i64, {2}, {0, 0}); - auto one_2d = ov::op::v0::Constant::create(ov::element::i64, {2}, {1, 1}); - auto one_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); - auto zero_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); - auto two_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {2}); - auto axes = ov::op::v0::Constant::create(ov::element::i64, {2}, {1, 2}); - - std::shared_ptr kv_len; - { - auto start = ov::op::v0::Constant::create(element::i64, Shape{3}, {0, 0, -1}); - auto stride = ov::op::v0::Constant::create(element::i64, Shape{3}, {1, 1, 1}); - auto inp_pos = tensor_map.at("inp_pos").get_node_shared_ptr(); - kv_len = std::make_shared( - inp_pos, start, start, stride, std::vector{0, 0, 0}, std::vector{1, 1, 1}); - } - kv_len = std::make_shared( - kv_len, ov::op::v0::Constant::create(ov::element::i64, {2}, {0, 1})); - kv_len = std::make_shared(kv_len, ov::element::i64); - kv_len = std::make_shared(kv_len, one_1d); - auto stop = std::make_shared(ov::OutputVector{token_len, kv_len}, 0); - - mask_sliced = std::make_shared(mask, zero_2d, stop, one_2d, axes); + auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); + auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); + auto two = ov::op::v0::Constant::create(ov::element::i64, {1}, {2}); + mask_sliced = std::make_shared(mask, zero, token_len_per_seq, one, two); mask_sliced = std::make_shared(mask_sliced, ov::element::f16); mask_sliced->set_friendly_name(sliced_name); } @@ -119,8 +94,7 @@ void add_sliced_mask(TensorMap & tensor_map, 
GgmlDecoder & ggml_model_decoder) { }; create_sliced_mask("KQ_mask", "KQ_mask_sliced", ggml_model_decoder.is_static()); - // swa is not working for the `kv_len` is not correct - // create_sliced_mask("KQ_mask_swa", "KQ_mask_swa_sliced", ggml_model_decoder.is_static()); + create_sliced_mask("KQ_mask_swa", "KQ_mask_swa_sliced", ggml_model_decoder.is_static()); } void add_rope_sin_cos(TensorMap & tensor_map, GgmlDecoder & ggml_model_decoder) { @@ -143,7 +117,6 @@ void add_rope_sin_cos(TensorMap & tensor_map, GgmlDecoder & ggml_model_decoder) // Create common patterns void preprocess(TensorMap & tensor_map, GgmlDecoder & ggml_model_decoder) { - add_token_len(tensor_map); add_sliced_mask(tensor_map, ggml_model_decoder); add_rope_sin_cos(tensor_map, ggml_model_decoder); } diff --git a/ggml/src/ggml-openvino/openvino/utils.cpp b/ggml/src/ggml-openvino/openvino/utils.cpp index 1723c7d003..bdda30fa6d 100644 --- a/ggml/src/ggml-openvino/openvino/utils.cpp +++ b/ggml/src/ggml-openvino/openvino/utils.cpp @@ -77,12 +77,12 @@ ov::Output rope_yarn_ramp_mix(int n_dims, const float corr_dims[2], fl int half_n_dims = n_dims / 2; std::vector dim_ids_vec(half_n_dims); std::iota(dim_ids_vec.begin(), dim_ids_vec.end(), 0); - auto dim_ids = ov::op::v0::Constant::create(ov::element::f32, Shape{1, 1, (size_t) half_n_dims}, dim_ids_vec); - auto corr_low = ov::op::v0::Constant::create(ov::element::f32, Shape{1, 1, 1}, {corr_dims[0]}); - auto corr_high = ov::op::v0::Constant::create(ov::element::f32, Shape{1, 1, 1}, {corr_dims[1]}); - auto denom = - std::make_shared(std::make_shared(corr_high, corr_low), - ov::op::v0::Constant::create(ov::element::f32, Shape{1, 1, 1}, {0.001f})); + auto dim_ids = ov::op::v0::Constant::create(ov::element::f32, Shape{1, 1, 1, (size_t) half_n_dims}, dim_ids_vec); + auto corr_low = ov::op::v0::Constant::create(ov::element::f32, Shape{1, 1, 1, 1}, {corr_dims[0]}); + auto corr_high = ov::op::v0::Constant::create(ov::element::f32, Shape{1, 1, 1, 1}, {corr_dims[1]}); + auto denom = std::make_shared( + std::make_shared(corr_high, corr_low), + ov::op::v0::Constant::create(ov::element::f32, Shape{1, 1, 1, 1}, {0.001f})); auto ramp_y = std::make_shared(std::make_shared(dim_ids, corr_low), denom); auto ramp_clamped = std::make_shared(ramp_y, 0.0f, 1.0f); @@ -116,7 +116,7 @@ std::pair, ov::Output> make_sin_cos(int32_t * rope_params std::shared_ptr rope_freqs_weight) { inp_pos = std::make_shared(inp_pos, ov::element::f32); auto pos_perm = - std::make_shared(ov::element::i64, ov::Shape{3}, std::vector{2, 1, 0}); + std::make_shared(ov::element::i64, ov::Shape{4}, std::vector{0, 3, 1, 2}); inp_pos = std::make_shared(inp_pos, pos_perm); float freq_base; @@ -146,7 +146,7 @@ std::pair, ov::Output> make_sin_cos(int32_t * rope_params } Output freq_factors = - std::make_shared(ov::element::f32, ov::Shape{1, 1, factor.size()}, factor); + std::make_shared(ov::element::f32, ov::Shape{1, 1, 1, factor.size()}, factor); if (rope_freqs_weight) { freq_factors = std::make_shared(freq_factors, rope_freqs_weight); } @@ -161,7 +161,7 @@ std::pair, ov::Output> make_sin_cos(int32_t * rope_params theta = theta_interp; } else { auto ramp_mix = rope_yarn_ramp_mix(n_dims, corr_dims, ext_factor); - auto one = ov::op::v0::Constant::create(ov::element::f32, Shape{1, 1, 1}, {1.0f}); + auto one = ov::op::v0::Constant::create(ov::element::f32, Shape{1, 1, 1, 1}, {1.0f}); auto one_minus_ramp = std::make_shared(one, ramp_mix); theta = std::make_shared(std::make_shared(theta_interp, one_minus_ramp), @@ -183,19 +183,19 @@ ov::Output 
process_view_input(const NodeContext & context, int input_i // Only works for VIEW operations that slice at the lowest dimension // If the VIEW also reshape the result, `slice_len` should be provided auto input = context.get_input(input_index); - int32_t * op_params = context.get_input_op_params(input_index); + auto * op_params = (size_t *) context.get_input_op_params(input_index); auto src1_stride = context.get_input_stride(input_index); - int64_t split_addr = op_params[0] / src1_stride[2]; + int64_t split_addr = op_params[0] / src1_stride[3]; if (slice_len == 0) { - slice_len = context.get_input_shape(input_index)[2].get_length(); + slice_len = context.get_input_shape(input_index)[3].get_length(); } int64_t slice_end = split_addr + slice_len; auto begin = ov::op::v0::Constant::create(ov::element::i64, {1}, {split_addr}); auto end = ov::op::v0::Constant::create(ov::element::i64, {1}, {slice_end}); auto stride = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); - auto axes = ov::op::v0::Constant::create(ov::element::i64, {1}, {2}); + auto axes = ov::op::v0::Constant::create(ov::element::i64, {1}, {3}); auto sliced = std::make_shared(input, begin, end, stride, axes); return sliced; } diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 965effe327..63e808c038 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -129,18 +129,24 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, ggml_cgraph * ov_input_names_cache[cgraph] = ov_input_names; ov_output_names_cache[cgraph] = ov_output_names; - // Set output tensors and kvcache address for NPU once and for all since the graph is static - if (is_static) { - for (size_t i = 0; i < ov_output_names.size(); i++) { + // Set output tensors (for NPU) and kvcache i/o tensors once and for all + for (size_t i = 0; i < ov_output_names.size(); i++) { + auto output_name = ov_output_names[i]; + if (is_static || output_name.find("cache") == 0) { auto output_tensor = get_ov_output_tensor(ggml_decoder, ov_output_names[i]); infer_request->set_output_tensor(i, output_tensor); } - for (size_t i = 0; i < ov_input_names.size(); i++) { - auto param_name = ov_input_names[i]; - if (param_name.find("cache_k") == 0 || param_name.find("cache_v") == 0) { - auto input_tensor = get_ov_input_tensor_static(ggml_decoder, param_name, 0, 0); - infer_request->set_input_tensor(i, input_tensor); + } + for (size_t i = 0; i < ov_input_names.size(); i++) { + auto param_name = ov_input_names[i]; + if (param_name.find("cache") == 0) { + ov::Tensor input_tensor; + if (is_static) { + input_tensor = get_ov_input_tensor_static(ggml_decoder, param_name, 0, 0); + } else { + input_tensor = get_ov_input_tensor(ggml_decoder, param_name); } + infer_request->set_input_tensor(i, input_tensor); } } } @@ -152,6 +158,9 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, ggml_cgraph * if (!is_static) { for (size_t i = 0; i < ov_input_names.size(); i++) { auto param_name = ov_input_names[i]; + if (param_name.find("cache") == 0) { + continue; + } auto input_tensor = get_ov_input_tensor(ggml_decoder, param_name); infer_request->set_input_tensor(i, input_tensor); @@ -179,7 +188,7 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, ggml_cgraph * for (int j = 0; j < input_len; j++) { for (size_t i = 0; i < ov_input_names.size(); i++) { auto param_name = ov_input_names[i]; - if (param_name.find("cache_k") == 0 || param_name.find("cache_v") == 0) { + if (param_name.find("cache") == 0) { 
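+ // kv-cache tensors were bound to the infer request once above, when the graph was compiled, so skip re-setting them on every call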
continue; } auto input_tensor = get_ov_input_tensor_static(ggml_decoder, param_name, j, input_len); @@ -306,7 +315,7 @@ ov::Tensor convert_ggml_input_to_ov(std::shared_ptr ggml_decoder, ov::Shape input_shape; if (ggml_tensor->op == GGML_OP_VIEW) { // This case is added to make test-backend-ops work - input_shape = ggml_decoder->get_graph_input_shape(ggml_tensor->view_src).to_shape(); + input_shape = ggml_decoder->get_graph_input_shape(ggml_tensor, ggml_tensor->view_src).to_shape(); } else { input_shape = ggml_decoder->get_input_shape(name).to_shape(); } @@ -319,13 +328,6 @@ ov::Tensor get_ov_input_tensor(std::shared_ptr ggml_decoder, cons ov::Tensor input_tensor; if (ggml_decoder->get_model_extra_inputs().find(param_name) != ggml_decoder->get_model_extra_inputs().end()) { input_tensor = *ggml_decoder->get_model_extra_input_values().at(param_name); - - } else if (param_name.find("cache_k") == 0 || param_name.find("cache_v") == 0) { - void * input_data = ggml_decoder->get_input_ggml_tensor(param_name)->data; - ov::Shape input_shape = {(size_t) ggml_decoder->get_past_kv_len(), (size_t) ggml_decoder->get_num_heads_kv(), - (size_t) ggml_decoder->get_head_size()}; - input_tensor = ov::Tensor(ggml_decoder->get_input_type(param_name), input_shape, input_data); - } else { input_tensor = convert_ggml_input_to_ov(ggml_decoder, param_name); } @@ -339,15 +341,8 @@ ov::Tensor get_ov_input_tensor_static(std::shared_ptr ggml_decode const auto * ggml_tensor = ggml_decoder->get_input_ggml_tensor(param_name); const auto * op = ggml_decoder->get_tensor_used_op(ggml_tensor); - if (param_name.find("cache_k") == 0 || param_name.find("cache_v") == 0) { - void * input_data = ggml_decoder->get_input_ggml_tensor(param_name)->data; - ov::Shape input_shape = {(size_t) ggml_decoder->get_context_size(), (size_t) ggml_decoder->get_num_heads_kv(), - (size_t) ggml_decoder->get_head_size()}; - return ov::Tensor(ggml_decoder->get_input_type(param_name), input_shape, input_data); - } - if (param_name == "inp_pos" || param_name == "inp_tokens" || op->op == GGML_OP_SET_ROWS) { - ov::Shape input_shape = {1, 1, 1}; + ov::Shape input_shape = {1, 1, 1, 1}; ov::Tensor input_tensor(ggml_decoder->get_input_type(param_name), input_shape); // copy the j-th value from ggml_tensor size_t element_size = ggml_type_size(ggml_tensor->type); @@ -357,7 +352,7 @@ ov::Tensor get_ov_input_tensor_static(std::shared_ptr ggml_decode } if (param_name == "inp_out_ids") { - ov::Shape input_shape = {1, 1, 1}; + ov::Shape input_shape = {1, 1, 1, 1}; ov::Tensor input_tensor(ggml_decoder->get_input_type(param_name), input_shape); if (ggml_tensor->ne[0] == 0) { *input_tensor.data() = 0; @@ -374,10 +369,10 @@ ov::Tensor get_ov_input_tensor_static(std::shared_ptr ggml_decode } if (param_name.find("KQ_mask") == 0) { - size_t context_size = ggml_decoder->get_context_size(); + size_t context_size = ggml_decoder->get_ctx_size(); const auto * input_tensor_ggml = ggml_decoder->get_input_ggml_tensor(param_name); std::vector padded_data = pad_input(input_tensor_ggml, input_len, context_size, -INFINITY); - ov::Tensor input_tensor(ov::element::f32, ov::Shape{1, 1, context_size}); + ov::Tensor input_tensor(ov::element::f32, ov::Shape{1, 1, 1, context_size}); // copy the j-th row of padded_data auto * data_ptr = input_tensor.data(); std::copy(padded_data.begin() + j * context_size, padded_data.begin() + (j + 1) * context_size, data_ptr); @@ -391,20 +386,12 @@ ov::Tensor get_ov_output_tensor(std::shared_ptr ggml_decoder, con auto * ggml_tensor = 
ggml_decoder->get_output_ggml_tensor(result_name); auto output_type = ggml_decoder->get_output_type(result_name); ov::Shape output_shape; - if (result_name.find("cache") == std::string::npos) { - output_shape = ggml_decoder->get_output_shape(result_name).to_shape(); - if (ggml_decoder->is_static() && result_name == "result_output") { - output_shape[1] = 1; - } - } else { - size_t total_token_len = ggml_decoder->get_past_kv_len() + ggml_decoder->get_input_len(); - size_t num_heads_kv = ggml_decoder->get_num_heads_kv(); - size_t head_size = ggml_decoder->get_head_size(); - if (ggml_decoder->is_static()) { - total_token_len = ggml_decoder->get_context_size(); - } - output_shape = ov::Shape{total_token_len, num_heads_kv, head_size}; + output_shape = ggml_decoder->get_output_shape(result_name).to_shape(); + + if (ggml_decoder->is_static() && result_name == "result_output") { + output_shape[1] = 1; } + ov::Tensor output_tensor(output_type, output_shape, ggml_tensor->data); return output_tensor; } From ae404f7cbb177f3d8c4f445dcb2f697f8a3ef28a Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Thu, 20 Nov 2025 16:23:17 +0800 Subject: [PATCH 167/254] Fix llama-bench --- ggml/src/ggml-openvino/ggml-decoder.cpp | 11 ++++- .../src/ggml-openvino/openvino/op/permute.cpp | 2 +- .../ggml-openvino/openvino/op/set_rows.cpp | 10 +++- ggml/src/ggml-openvino/utils.cpp | 47 +++++++++---------- 4 files changed, 40 insertions(+), 30 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 0f913bdd75..dbc3780027 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -301,7 +301,9 @@ void GgmlOvDecoder::set_llm_params() { m_n_seq_active = mask->ne[3]; auto seq_size = cache_k->ne[0] * cache_k->ne[1] * ggml_type_size(cache_k->type); - m_seq_active_start = ((size_t *) cache_k_view->op_params)[0] / seq_size; + size_t offset; + memcpy(&offset, cache_k_view->op_params, sizeof(size_t)); + m_seq_active_start = offset / seq_size; m_token_len_per_seq = node->ne[2]; if (mask_name.find("swa") != std::string::npos) { @@ -346,6 +348,13 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor * op, co input_shape = ov::PartialShape{-1, 1, -1, -1}; } + } else if (name.find("cache_") == 0) { + input_shape = ov::PartialShape{get_shape(input)}; + if (!m_is_static) { + // do not fix ctx size to make llama-bench work + input_shape[2] = -1; + } + } else if (op && op->op == GGML_OP_SET_ROWS && op->src[1] == input) { input_shape = ov::PartialShape{1, 1, 1, m_is_static ? 1 : -1}; diff --git a/ggml/src/ggml-openvino/openvino/op/permute.cpp b/ggml/src/ggml-openvino/openvino/op/permute.cpp index 2fe2325d6a..772342a2ae 100644 --- a/ggml/src/ggml-openvino/openvino/op/permute.cpp +++ b/ggml/src/ggml-openvino/openvino/op/permute.cpp @@ -52,7 +52,7 @@ OutputVector translate_permute(const NodeContext & context) { auto output_shape = context.get_output_shape(0).to_shape(); int64_t head_size = output_shape[3]; int64_t n_heads = output_shape[1]; - int64_t ctx_per_seq = cache_shape[2].get_length(); + int64_t ctx_per_seq = cache_shape[2].is_static() ? 
cache_shape[2].get_length() : -1; int64_t n_seq = cache_shape[1].get_length(); Output attention_size; diff --git a/ggml/src/ggml-openvino/openvino/op/set_rows.cpp b/ggml/src/ggml-openvino/openvino/op/set_rows.cpp index d71aca1d7f..a323e5ed38 100644 --- a/ggml/src/ggml-openvino/openvino/op/set_rows.cpp +++ b/ggml/src/ggml-openvino/openvino/op/set_rows.cpp @@ -18,6 +18,7 @@ #include #include #include +#include namespace ov { namespace frontend { @@ -48,8 +49,13 @@ OutputVector translate_set_rows(const NodeContext & context) { if (auto dst_reshape = std::dynamic_pointer_cast(dst.get_node_shared_ptr())) { // Fix the case of multiple sequences, reshape back to original shape [1, n_seq, ctx_per_seq, emb] - res = std::make_shared( - res, ov::op::v0::Constant::create(ov::element::i64, {4}, dst_reshape->get_input_shape(0)), false); + // ctx_per_seq is not fixed due to llama-bench compatibility + auto dst_shape_partial = dst_reshape->get_input_partial_shape(0); + std::vector dst_shape = {dst_shape_partial[0].get_length(), dst_shape_partial[1].get_length(), + dst_shape_partial[2].is_static() ? dst_shape_partial[2].get_length() : -1, + dst_shape_partial[3].get_length()}; + res = std::make_shared(res, ov::op::v0::Constant::create(ov::element::i64, {4}, dst_shape), + false); } return rename_outputs_with_suffix({res}, context.get_name()); } diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 63e808c038..5b9ecb5f4f 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -129,26 +129,27 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, ggml_cgraph * ov_input_names_cache[cgraph] = ov_input_names; ov_output_names_cache[cgraph] = ov_output_names; - // Set output tensors (for NPU) and kvcache i/o tensors once and for all - for (size_t i = 0; i < ov_output_names.size(); i++) { - auto output_name = ov_output_names[i]; - if (is_static || output_name.find("cache") == 0) { - auto output_tensor = get_ov_output_tensor(ggml_decoder, ov_output_names[i]); - infer_request->set_output_tensor(i, output_tensor); - } - } - for (size_t i = 0; i < ov_input_names.size(); i++) { - auto param_name = ov_input_names[i]; - if (param_name.find("cache") == 0) { - ov::Tensor input_tensor; - if (is_static) { - input_tensor = get_ov_input_tensor_static(ggml_decoder, param_name, 0, 0); - } else { - input_tensor = get_ov_input_tensor(ggml_decoder, param_name); - } - infer_request->set_input_tensor(i, input_tensor); - } - } + // // Set output tensors (for NPU) and kvcache i/o tensors once and for all + // // Note: does not seem to improve perf on CPU/GPU, but it breaks llama-bench, so disabled it + // for (size_t i = 0; i < ov_output_names.size(); i++) { + // auto output_name = ov_output_names[i]; + // if (is_static || output_name.find("cache") == 0) { + // auto output_tensor = get_ov_output_tensor(ggml_decoder, ov_output_names[i]); + // infer_request->set_output_tensor(i, output_tensor); + // } + // } + // for (size_t i = 0; i < ov_input_names.size(); i++) { + // auto param_name = ov_input_names[i]; + // if (param_name.find("cache") == 0) { + // ov::Tensor input_tensor; + // if (is_static) { + // input_tensor = get_ov_input_tensor_static(ggml_decoder, param_name, 0, 0); + // } else { + // input_tensor = get_ov_input_tensor(ggml_decoder, param_name); + // } + // infer_request->set_input_tensor(i, input_tensor); + // } + // } } } @@ -158,9 +159,6 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, ggml_cgraph * if (!is_static) { for 
(size_t i = 0; i < ov_input_names.size(); i++) { auto param_name = ov_input_names[i]; - if (param_name.find("cache") == 0) { - continue; - } auto input_tensor = get_ov_input_tensor(ggml_decoder, param_name); infer_request->set_input_tensor(i, input_tensor); @@ -188,9 +186,6 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, ggml_cgraph * for (int j = 0; j < input_len; j++) { for (size_t i = 0; i < ov_input_names.size(); i++) { auto param_name = ov_input_names[i]; - if (param_name.find("cache") == 0) { - continue; - } auto input_tensor = get_ov_input_tensor_static(ggml_decoder, param_name, j, input_len); infer_request->set_input_tensor(i, input_tensor); From 531941b348aa19186d04f79e9ff93d9e56438b17 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Mon, 24 Nov 2025 11:31:28 +0800 Subject: [PATCH 168/254] Fix NPU --- ggml/src/ggml-openvino/ggml-decoder.cpp | 31 +++++++++----- .../src/ggml-openvino/openvino/op/permute.cpp | 4 +- .../openvino/translate_session.cpp | 4 +- ggml/src/ggml-openvino/utils.cpp | 40 +++++++++---------- 4 files changed, 43 insertions(+), 36 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index dbc3780027..c00efaf6ae 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -311,6 +311,11 @@ void GgmlOvDecoder::set_llm_params() { } else { m_attention_size = mask->ne[0]; } + if (m_is_static) { + m_attention_size = m_ctx_per_seq; + m_attention_size_swa = m_ctx_per_seq_swa; + m_token_len_per_seq = 1; + } } else if (node->op == GGML_OP_ROPE) { if (name.find("Qcur-0") == 0 || std::string(node->src[0]->name).find("Qcur-0") == 0) { @@ -330,7 +335,7 @@ void GgmlOvDecoder::set_llm_params() { void GgmlOvDecoder::validate_cgraph() const { if (m_n_seq > 1 && m_is_static == true) { - throw std::runtime_error("n_seq > 1 is not supported on NPU"); + throw std::runtime_error("n_seq > 1 is not supported on NPU. Try setting -np 1."); } } @@ -371,18 +376,24 @@ void GgmlOvDecoder::add_extra_inputs() { // Extra inputs: // 1. `attention_size`, used in FLASH_ATTN where the shape of the matmul's are 256 aligned, // see llama_kv_cache_unified::get_n_kv and llama_kv_cache_unified::get_padding. - // Not used for NPU. // 2. 
`n_seq_active` and `seq_active_start`, used in FLASH_ATTN_EXT to indicate the active sequences in the batch - auto create_1d_input = [this](const std::string & name, int64_t size) { - auto param_node = std::make_shared(ov::element::i64, ov::Shape{1}); - param_node->set_friendly_name(name); - param_node->output(0).get_tensor().set_names({name}); - m_model_extra_inputs[name] = param_node; + auto create_1d_input = [this](const std::string & name, int64_t value) { + if (m_is_static) { + auto constant = + std::make_shared(ov::element::i64, ov::Shape{1}, std::vector{value}); + constant->set_friendly_name(name); + m_model_extra_inputs[name] = constant; + } else { + auto param_node = std::make_shared(ov::element::i64, ov::Shape{1}); + param_node->set_friendly_name(name); + param_node->output(0).get_tensor().set_names({name}); + m_model_extra_inputs[name] = param_node; - auto tensor = std::make_shared(ov::element::i64, ov::Shape{1}); - *tensor->data() = size; - m_model_extra_input_values[name] = tensor; + auto tensor = std::make_shared(ov::element::i64, ov::Shape{1}); + *tensor->data() = value; + m_model_extra_input_values[name] = tensor; + } }; create_1d_input("attention_size", m_attention_size); diff --git a/ggml/src/ggml-openvino/openvino/op/permute.cpp b/ggml/src/ggml-openvino/openvino/op/permute.cpp index 772342a2ae..d156e48e3c 100644 --- a/ggml/src/ggml-openvino/openvino/op/permute.cpp +++ b/ggml/src/ggml-openvino/openvino/op/permute.cpp @@ -56,9 +56,7 @@ OutputVector translate_permute(const NodeContext & context) { int64_t n_seq = cache_shape[1].get_length(); Output attention_size; - if (context.is_static()) { - attention_size = ov::op::v0::Constant::create(ov::element::i64, {1}, {INT_MAX}); - } else if (op_case == 2) { + if (op_case == 2) { attention_size = context.get_input("attention_size"); } else { attention_size = context.get_input("attention_size_swa"); diff --git a/ggml/src/ggml-openvino/openvino/translate_session.cpp b/ggml/src/ggml-openvino/openvino/translate_session.cpp index a28946c617..d12701acdc 100644 --- a/ggml/src/ggml-openvino/openvino/translate_session.cpp +++ b/ggml/src/ggml-openvino/openvino/translate_session.cpp @@ -154,7 +154,9 @@ std::shared_ptr TranslateSession::translate_graph(const frontend::InputMo } for (const auto & it : ggml_model_decoder->get_model_extra_inputs()) { - params.push_back(std::dynamic_pointer_cast(it.second)); + if (std::dynamic_pointer_cast(it.second)) { + params.push_back(std::dynamic_pointer_cast(it.second)); + } (*tensor_map)[it.first] = it.second; } diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 5b9ecb5f4f..6e1d7393c7 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -129,27 +129,22 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, ggml_cgraph * ov_input_names_cache[cgraph] = ov_input_names; ov_output_names_cache[cgraph] = ov_output_names; - // // Set output tensors (for NPU) and kvcache i/o tensors once and for all - // // Note: does not seem to improve perf on CPU/GPU, but it breaks llama-bench, so disabled it - // for (size_t i = 0; i < ov_output_names.size(); i++) { - // auto output_name = ov_output_names[i]; - // if (is_static || output_name.find("cache") == 0) { - // auto output_tensor = get_ov_output_tensor(ggml_decoder, ov_output_names[i]); - // infer_request->set_output_tensor(i, output_tensor); - // } - // } - // for (size_t i = 0; i < ov_input_names.size(); i++) { - // auto param_name = ov_input_names[i]; - // if 
(param_name.find("cache") == 0) { - // ov::Tensor input_tensor; - // if (is_static) { - // input_tensor = get_ov_input_tensor_static(ggml_decoder, param_name, 0, 0); - // } else { - // input_tensor = get_ov_input_tensor(ggml_decoder, param_name); - // } - // infer_request->set_input_tensor(i, input_tensor); - // } - // } + // Set output tensors (for NPU) and kvcache i/o tensors once and for all + // Note: does not seem to improve perf on CPU/GPU, but breaks llama-bench, so disabled it for CPU/GPU + if (is_static) { + for (size_t i = 0; i < ov_output_names.size(); i++) { + auto output_name = ov_output_names[i]; + auto output_tensor = get_ov_output_tensor(ggml_decoder, ov_output_names[i]); + infer_request->set_output_tensor(i, output_tensor); + } + for (size_t i = 0; i < ov_input_names.size(); i++) { + auto param_name = ov_input_names[i]; + if (param_name.find("cache") == 0) { + auto input_tensor = get_ov_input_tensor_static(ggml_decoder, param_name, 0, 0); + infer_request->set_input_tensor(i, input_tensor); + } + } + } } } @@ -336,7 +331,8 @@ ov::Tensor get_ov_input_tensor_static(std::shared_ptr ggml_decode const auto * ggml_tensor = ggml_decoder->get_input_ggml_tensor(param_name); const auto * op = ggml_decoder->get_tensor_used_op(ggml_tensor); - if (param_name == "inp_pos" || param_name == "inp_tokens" || op->op == GGML_OP_SET_ROWS) { + if (param_name == "inp_pos" || param_name == "inp_tokens" || + (op->op == GGML_OP_SET_ROWS && op->src[1] == ggml_tensor)) { ov::Shape input_shape = {1, 1, 1, 1}; ov::Tensor input_tensor(ggml_decoder->get_input_type(param_name), input_shape); // copy the j-th value from ggml_tensor From 047bfb5c907eb929d29c78f55e8a453ccbff8483 Mon Sep 17 00:00:00 2001 From: Arshath Date: Fri, 21 Nov 2025 04:39:33 +0530 Subject: [PATCH 169/254] Update ggml-decoder.cpp MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Hitting an error while compiling on Windows: error C3861: 'unsetenv': identifier not found Reason: unsetenv() is a POSIX function; it doesn’t exist on Windows, so Visual Studio (MSVC) won’t recognize it. Proposed fix: use _putenv_s() (the Windows equivalent). It is supported by MSVC and achieves the same effect: it removes the environment variable from the process environment. This keeps the code cross-platform. --- ggml/src/ggml-openvino/ggml-decoder.cpp | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index c00efaf6ae..ae559e41b9 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -63,10 +63,20 @@ GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph, m_is_static(is_static), m_cgraph(cgraph), m_op_name(m_node ? 
std::string(m_node->name) : ""), - m_model_weights(model_weights) { + m_model_weights(model_weights), + /*m_is_static(is_static) { if (auto * env = getenv("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS"); env && std::string(env) != "0") { unsetenv("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS"); print_tensor_address_map(cgraph); + }*/ + m_is_static(is_static) { + if (auto* env = getenv("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS"); env && std::string(env) != "0") { + #ifdef _WIN32 + _putenv_s("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS", ""); + #else + unsetenv("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS"); + #endif + print_tensor_address_map(cgraph); } set_llm_params(); From 11b4cc5a67620b680c1270a6cfa3549f8ee1bf1b Mon Sep 17 00:00:00 2001 From: Arshath Date: Fri, 21 Nov 2025 04:42:27 +0530 Subject: [PATCH 170/254] Update ggml-decoder.cpp --- ggml/src/ggml-openvino/ggml-decoder.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index ae559e41b9..2866eaa8a2 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -64,12 +64,11 @@ GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph, m_cgraph(cgraph), m_op_name(m_node ? std::string(m_node->name) : ""), m_model_weights(model_weights), - /*m_is_static(is_static) { - if (auto * env = getenv("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS"); env && std::string(env) != "0") { + m_is_static(is_static) { + /*if (auto * env = getenv("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS"); env && std::string(env) != "0") { unsetenv("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS"); print_tensor_address_map(cgraph); }*/ - m_is_static(is_static) { if (auto* env = getenv("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS"); env && std::string(env) != "0") { #ifdef _WIN32 _putenv_s("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS", ""); From bed495226d2b8a243b4bd0cc00ef1232ddbd0eaf Mon Sep 17 00:00:00 2001 From: Arshath Date: Fri, 21 Nov 2025 04:55:54 +0530 Subject: [PATCH 171/254] Update ggml-decoder.cpp --- ggml/src/ggml-openvino/ggml-decoder.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 2866eaa8a2..61144eff4c 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -65,12 +65,11 @@ GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph, m_op_name(m_node ? 
std::string(m_node->name) : ""), m_model_weights(model_weights), m_is_static(is_static) { - /*if (auto * env = getenv("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS"); env && std::string(env) != "0") { - unsetenv("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS"); + if (auto * env = getenv("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS"); env && std::string(env) != "0") { + /* unsetenv("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS"); print_tensor_address_map(cgraph); }*/ - if (auto* env = getenv("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS"); env && std::string(env) != "0") { - #ifdef _WIN32 + #ifdef _WIN32 _putenv_s("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS", ""); #else unsetenv("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS"); From 4a57b37d4dc229a5ddeff874cf1ca5c927f31bda Mon Sep 17 00:00:00 2001 From: Arshath Date: Fri, 21 Nov 2025 04:57:55 +0530 Subject: [PATCH 172/254] Update ggml-decoder.cpp --- ggml/src/ggml-openvino/ggml-decoder.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 61144eff4c..9a520c9ab6 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -66,9 +66,7 @@ GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph, m_model_weights(model_weights), m_is_static(is_static) { if (auto * env = getenv("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS"); env && std::string(env) != "0") { - /* unsetenv("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS"); - print_tensor_address_map(cgraph); - }*/ + // unsetenv("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS"); #ifdef _WIN32 _putenv_s("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS", ""); #else From 98396b275a38cb695463e7514c66f32cc666e654 Mon Sep 17 00:00:00 2001 From: Arshath Date: Fri, 21 Nov 2025 04:59:29 +0530 Subject: [PATCH 173/254] Update ggml-decoder.cpp --- ggml/src/ggml-openvino/ggml-decoder.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 9a520c9ab6..a661a859fd 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -66,7 +66,6 @@ GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph, m_model_weights(model_weights), m_is_static(is_static) { if (auto * env = getenv("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS"); env && std::string(env) != "0") { - // unsetenv("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS"); #ifdef _WIN32 _putenv_s("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS", ""); #else From 4400b5cb4b311d8e3d395057f586f005074c2feb Mon Sep 17 00:00:00 2001 From: Arshath Date: Fri, 21 Nov 2025 05:03:19 +0530 Subject: [PATCH 174/254] Update ggml-decoder.cpp --- ggml/src/ggml-openvino/ggml-decoder.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index a661a859fd..b8fb25adda 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -71,7 +71,7 @@ GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph, #else unsetenv("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS"); #endif - print_tensor_address_map(cgraph); + print_tensor_address_map(cgraph); } set_llm_params(); From ae936519d26d66f45b27840fc2477d173d9b9b7d Mon Sep 17 00:00:00 2001 From: XuejunZhai Date: Tue, 25 Nov 2025 17:58:54 -0800 Subject: [PATCH 175/254] Remove the second decoder for node. 
Moving the function into the model decoder --- ggml/src/ggml-openvino/ggml-decoder.cpp | 287 ++++++++++-------- ggml/src/ggml-openvino/ggml-decoder.h | 56 ++-- ggml/src/ggml-openvino/openvino/decoder.hpp | 26 +- .../ggml-openvino/openvino/node_context.hpp | 31 +- .../openvino/translate_session.cpp | 8 +- 5 files changed, 232 insertions(+), 176 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index b8fb25adda..f3f13167d9 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -35,36 +35,12 @@ #include #include -GgmlOvDecoder::GgmlOvDecoder(ggml_tensor * node, - ggml_cgraph * cgraph, - bool is_static, - int context_size, - int context_size_swa, - int num_heads, - int num_heads_kv, - int head_size, - const std::vector & swa_layers) : - m_is_static(is_static), - m_cgraph(cgraph), - m_node(node), - m_op_name(std::string(node->name)), - m_ctx(context_size), - m_ctx_swa(context_size_swa), - m_n_heads(num_heads), - m_n_heads_kv(num_heads_kv), - m_head_size(head_size), - m_swa_layers(swa_layers) { - set_input_output(node); -} - GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph, std::map> & model_weights, bool is_static) : m_is_static(is_static), m_cgraph(cgraph), - m_op_name(m_node ? std::string(m_node->name) : ""), - m_model_weights(model_weights), - m_is_static(is_static) { + m_model_weights(model_weights) { if (auto * env = getenv("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS"); env && std::string(env) != "0") { #ifdef _WIN32 _putenv_s("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS", ""); @@ -83,6 +59,11 @@ GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph, set_input_output(cur_node); } + for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) { + m_node_info_list[node_n].node_op_case = compute_op_case(m_node_info_list[node_n].node); + m_node_info_list[node_n].node_op_type = compute_op_type(m_node_info_list[node_n].node); + } + add_extra_inputs(); } @@ -104,6 +85,7 @@ GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph, std::mapop == GGML_OP_SET_ROWS) { // SET_ROWS updates the tensor in place. 
For later ov op that uses the // the view_src of SET_ROWS, we need to make sure they get the updated tensor @@ -117,6 +99,12 @@ void GgmlOvDecoder::set_input_output(ggml_tensor * node, bool naive) { m_output_names.push_back(node_name); m_outputs[node_name] = node; + current_node_info.node = node; + current_node_info.node_name = node_name; + current_node_info.node_outputs[node_name] = node; + current_node_info.node_outputs_names.push_back(node_name); + current_node_info.node_op_case = 0; + for (int i = 0; i < GGML_MAX_SRC; i++) { auto * src = node->src[i]; if (src == nullptr) { @@ -125,7 +113,8 @@ void GgmlOvDecoder::set_input_output(ggml_tensor * node, bool naive) { std::string src_name = std::string(src->name); m_input_names.push_back(src_name); m_inputs[src_name] = src; - m_op_node_name.emplace_back(src_name, ggml_op_name(node->op)); + current_node_info.node_inputs[src_name] = src; + current_node_info.node_inputs_names.push_back(src_name); // Add model inputs and weights constants, if called for the whole graph if (naive) { @@ -137,7 +126,7 @@ void GgmlOvDecoder::set_input_output(ggml_tensor * node, bool naive) { m_model_inputs[src_name] = param_node; } - } else if (!m_node && !src->view_src) { + } else if (!src->view_src) { ggml_backend_buffer * buffer = src->buffer; if (buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY || src->flags & GGML_TENSOR_FLAG_INPUT) { @@ -160,7 +149,7 @@ void GgmlOvDecoder::set_input_output(ggml_tensor * node, bool naive) { // Add model outputs, if called for the whole graph if (naive) { m_model_output_names.push_back(node_name); - } else if (!m_node) { + } else { // Model outputs are tensors with GGML_TENSOR_FLAG_OUTPUT flag and kv_caches static std::set debug_output_names = {}; // Workaround: the final tensor "result_output" does not have GGML_TENSOR_FLAG_OUTPUT flag set in cgraph @@ -179,92 +168,92 @@ void GgmlOvDecoder::set_input_output(ggml_tensor * node, bool naive) { } } - if (m_node) { - switch (node->op) { - case GGML_OP_RESHAPE: { - auto * src = node->src[0]; - if (src->op == GGML_OP_RESHAPE && src->src[0]->ne[0] == node->ne[0] && src->src[0]->ne[1] == node->ne[1]) { - m_op_case = 4; - } else if (node->ne[0] * node->ne[1] == src->ne[0]) { - m_op_case = 1; - } else if (src->ne[0] * src->ne[1] == node->ne[0]) { - m_op_case = 2; - if (src->ne[2] * src->ne[3] == node->ne[1]) { - m_op_case = 5; - } - } else if (src->ne[0] * src->ne[1] == node->ne[1]) { - m_op_case = 3; - } else if (src->ne[1] * src->ne[2] == node->ne[1]) { - m_op_case = 6; + m_node_info_list.push_back(current_node_info); +} + +int GgmlOvDecoder::compute_op_case(const ggml_tensor * node) { + int op_case = 0; + switch (node->op) { + case GGML_OP_RESHAPE: { + auto * src = node->src[0]; + if (src->op == GGML_OP_RESHAPE && src->src[0]->ne[0] == node->ne[0] && src->src[0]->ne[1] == node->ne[1]) { + op_case = 4; + } else if (node->ne[0] * node->ne[1] == src->ne[0]) { + op_case = 1; + } else if (src->ne[0] * src->ne[1] == node->ne[0]) { + op_case = 2; + if (src->ne[2] * src->ne[3] == node->ne[1]) { + op_case = 5; } - break; - } - case GGML_OP_CONT: { - if (node->src[0]->op == GGML_OP_PERMUTE) { - m_op_case = 1; - } else if (node->src[0]->op == GGML_OP_TRANSPOSE) { - m_op_case = 2; - } else if (node->src[0]->op == GGML_OP_VIEW) { - // The input comes from a VIEW which is subtensor - m_op_case = 3; - } - break; - } - case GGML_OP_PERMUTE: { - if (node->src[0]->op != GGML_OP_VIEW) { - m_op_case = 1; - } else if (ggml_is_contiguous(node->src[0])) { - std::string src_name(node->view_src->name); - if 
(src_name.find("cache") == std::string::npos) { - // permute Qcur - m_op_case = 4; - } else { - // Permute kv cache (view) - int layer = extract_layer_from_name(src_name); - if (!is_swa_layer(layer)) { - m_op_case = 2; - } else { - m_op_case = 3; - } - } - } - break; - } - case GGML_OP_MUL_MAT: { - if (node->src[0]->op == GGML_OP_CONT && node->src[0]->src[0]->op == GGML_OP_TRANSPOSE) { - m_op_case = 2; - } else if (node->src[0]->op == GGML_OP_VIEW && node->src[1]->op == GGML_OP_VIEW) { - // test-backend-ops case - m_op_case = 3; - } - break; - } - case GGML_OP_GET_ROWS: { - if (node->src[1]->op == GGML_OP_VIEW) { - m_op_case = 2; - } - break; - } - case GGML_OP_ROPE: { - if (node->src[0]->op == GGML_OP_VIEW) { - m_op_case = 2; - } - break; - } - case GGML_OP_VIEW: { - if (node->src[0]->op == GGML_OP_VIEW) { - auto * src = node->src[0]; - if (ggml_nelements(node) != ggml_nelements(src)) { - throw std::runtime_error("Unsupported VIEW case"); - } - // This view is a reshape, slicing happens at src->op - m_op_case = 2; - } - } - default: - break; + } else if (src->ne[0] * src->ne[1] == node->ne[1]) { + op_case = 3; + } else if (src->ne[1] * src->ne[2] == node->ne[1]) { + op_case = 6; } + break; } + case GGML_OP_CONT: { + if (node->src[0]->op == GGML_OP_PERMUTE) { + op_case = 1; + } else if (node->src[0]->op == GGML_OP_TRANSPOSE) { + op_case = 2; + } else if (node->src[0]->op == GGML_OP_VIEW) { + op_case = 3; + } + break; + } + case GGML_OP_PERMUTE: { + if (node->src[0]->op != GGML_OP_VIEW) { + op_case = 1; + } else if (ggml_is_contiguous(node->src[0])) { + std::string src_name(node->view_src->name); + if (src_name.find("cache") == std::string::npos) { + op_case = 4; + } else { + int layer = extract_layer_from_name(src_name); + if (!is_swa_layer(layer)) { + op_case = 2; + } else { + op_case = 3; + } + } + } + break; + } + case GGML_OP_MUL_MAT: { + if (node->src[0]->op == GGML_OP_CONT && node->src[0]->src[0]->op == GGML_OP_TRANSPOSE) { + op_case = 2; + } else if (node->src[0]->op == GGML_OP_VIEW && node->src[1]->op == GGML_OP_VIEW) { + op_case = 3; + } + break; + } + case GGML_OP_GET_ROWS: { + if (node->src[1]->op == GGML_OP_VIEW) { + op_case = 2; + } + break; + } + case GGML_OP_ROPE: { + if (node->src[0]->op == GGML_OP_VIEW) { + op_case = 2; + } + break; + } + case GGML_OP_VIEW: { + if (node->src[0]->op == GGML_OP_VIEW) { + auto * src = node->src[0]; + if (ggml_nelements(node) != ggml_nelements(src)) { + throw std::runtime_error("Unsupported VIEW case"); + } + op_case = 2; + } + break; + } + default: + break; + } + return op_case; } int extract_layer_from_name(const std::string & name) { @@ -722,10 +711,18 @@ ov::PartialShape GgmlOvDecoder::get_input_shape(const std::string & name) const return ov::PartialShape(get_shape(m_inputs.at(name))); } +ov::PartialShape GgmlOvDecoder::get_input_shape(int node_idx, const std::string & name) const { + return ov::PartialShape(get_shape(m_node_info_list[node_idx].node_inputs.at(name))); +} + std::vector GgmlOvDecoder::get_input_stride(const std::string & name) const { return get_stride(m_inputs.at(name)); } +std::vector GgmlOvDecoder::get_input_stride(int node_idx, const std::string & name) const { + return get_stride(m_node_info_list[node_idx].node_inputs.at(name)); +} + ov::element::Type GgmlOvDecoder::get_input_type(const std::string & name) const { return get_ov_type(m_inputs.at(name)); } @@ -734,15 +731,18 @@ size_t GgmlOvDecoder::get_input_size() const { return m_input_names.size(); } -std::string & GgmlOvDecoder::get_input_name(size_t index) const { 
- m_name = m_input_names[index]; - return m_name; +size_t GgmlOvDecoder::get_input_size(int node_idx) const { + return m_node_info_list[node_idx].node_inputs_names.size(); } std::vector GgmlOvDecoder::get_input_names() const { return m_input_names; } +std::vector GgmlOvDecoder::get_input_names(int node_idx) const { + return m_node_info_list[node_idx].node_inputs_names; +} + std::vector GgmlOvDecoder::get_output_stride(const std::string & name) const { return get_stride(m_outputs.at(name)); } @@ -755,40 +755,58 @@ ov::PartialShape GgmlOvDecoder::get_output_shape(const std::string & name) const return ov::PartialShape(get_shape(ggml_tensor)); } -ov::element::Type GgmlOvDecoder::get_output_type(const std::string & name) const { - return get_ov_type(m_outputs.at(name)); +ov::PartialShape GgmlOvDecoder::get_output_shape(int node_idx, const std::string & name) const { + auto * ggml_tensor = m_node_info_list[node_idx].node_outputs.at(name); + if (ggml_tensor->op == GGML_OP_SET_ROWS) { + ggml_tensor = ggml_tensor->view_src; + } + return ov::PartialShape(get_shape(ggml_tensor)); } -std::string & GgmlOvDecoder::get_output_name(size_t index) const { - m_name = std::string(m_output_names[index]); - return m_name; +ov::element::Type GgmlOvDecoder::get_output_type(const std::string & name) const { + return get_ov_type(m_outputs.at(name)); } std::vector GgmlOvDecoder::get_output_names() const { return m_output_names; } +std::vector GgmlOvDecoder::get_output_names(int node_idx) const { + return m_node_info_list[node_idx].node_outputs_names; +} + const std::string & GgmlOvDecoder::get_op_name() const { - return m_op_name; + static const std::string unknown_name = "UNKNOWN_OP_NAME"; + return unknown_name; +} + +const std::string & GgmlOvDecoder::get_op_name(int node_idx) const { + return m_node_info_list[node_idx].node_name; } int32_t * GgmlOvDecoder::get_input_op_params(const std::string & name) const { return m_inputs.at(name)->op_params; } +int32_t * GgmlOvDecoder::get_input_op_params(int node_idx, const std::string & name) const { + return m_node_info_list[node_idx].node_inputs.at(name)->op_params; +} + int32_t * GgmlOvDecoder::get_output_op_params(const std::string & name) const { return m_outputs.at(name)->op_params; } -void GgmlOvDecoder::visit_subgraph(std::function)> node_visitor) const { - for (const auto & node : m_nodes) { - auto decoder = std::make_shared(node, m_cgraph, m_is_static, m_ctx, m_ctx_swa, m_n_heads, - m_n_heads_kv, m_head_size, m_swa_layers); - node_visitor(decoder); +int32_t * GgmlOvDecoder::get_output_op_params(int node_idx, const std::string & name) const { + return m_node_info_list[node_idx].node_outputs.at(name)->op_params; +} + +void GgmlOvDecoder::visit_subgraph(std::function, int node_idx)> node_visitor) const { + for (int node_idx = 0; node_idx < m_cgraph->n_nodes; node_idx++) { + node_visitor(std::make_shared(*this), node_idx); } } -const std::string & GgmlOvDecoder::get_op_type() const { +std::string GgmlOvDecoder::compute_op_type(const ggml_tensor * node) { static const std::map ops = { {GGML_OP_NONE, "GGML_OP_NONE" }, {GGML_OP_ACC, "GGML_OP_ACC" }, @@ -836,14 +854,23 @@ const std::string & GgmlOvDecoder::get_op_type() const { {GGML_GLU_OP_REGLU, "GGML_GLU_OP_REGLU" } }; - switch (m_node->op) { + switch (node->op) { case GGML_OP_UNARY: - return unary_ops.at(ggml_get_unary_op(m_node)); + return unary_ops.at(ggml_get_unary_op(node)); case GGML_OP_GLU: - return glu_ops.at(ggml_get_glu_op(m_node)); + return glu_ops.at(ggml_get_glu_op(node)); default: - return 
ops.at(m_node->op); + return ops.at(node->op); } static const std::string unknown_op = "UNKNOWN_GGML_OP"; return unknown_op; } + +const std::string & GgmlOvDecoder::get_op_type(int node_idx) const { + return m_node_info_list[node_idx].node_op_type; +} + +const std::string & GgmlOvDecoder::get_op_type() const { + static const std::string unknown_op = "UNKNOWN_GGML_OP"; + return unknown_op; +} diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index e2efc73f17..6e2bf0486d 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -13,22 +13,21 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { public: + struct NodeInfo { + ggml_tensor * node; + std::map node_inputs; + std::vector node_inputs_names; + std::map node_outputs; + std::vector node_outputs_names; + int node_op_case = 0; + std::string node_op_type; + std::string node_name; + }; // Graph decoder GgmlOvDecoder(ggml_cgraph * cgraph, std::map> & model_weights, bool is_static); - // Node decoder, called in GgmlOvDecoder::visit_subgraph - GgmlOvDecoder(ggml_tensor * node, - ggml_cgraph * cgraph, - bool is_static, - int context_size, - int context_size_swa, - int num_heads, - int num_heads_kv, - int head_size, - const std::vector & swa_layers); - // Naive graph decoder GgmlOvDecoder(ggml_cgraph * cgraph, std::map> & model_weights); @@ -39,12 +38,18 @@ public: virtual ov::PartialShape get_input_shape(const std::string & name) const override; + virtual ov::PartialShape get_input_shape(int node_idx, const std::string & name) const override; + virtual std::vector get_input_stride(const std::string & name) const override; + virtual std::vector get_input_stride(int node_idx, const std::string & name) const override; + virtual ov::element::Type get_input_type(const std::string & name) const override; virtual size_t get_input_size() const override; + virtual size_t get_input_size(int node_idx) const override; + virtual void get_input_node(size_t input_port_idx, std::string & producer_name, std::string & producer_output_port_name, @@ -55,35 +60,45 @@ public: GGML_UNUSED(producer_output_port_index); } - virtual std::string & get_input_name(size_t index) const override; - virtual std::vector get_input_names() const override; + virtual std::vector get_input_names(int node_idx) const override; + virtual ov::PartialShape get_output_shape(const std::string & name) const override; + virtual ov::PartialShape get_output_shape(int node_idx, const std::string & name) const override; + virtual std::vector get_output_stride(const std::string & name) const override; virtual ov::element::Type get_output_type(const std::string & name) const override; virtual int32_t * get_input_op_params(const std::string & name) const override; + virtual int32_t * get_input_op_params(int node_idx, const std::string & name) const override; + virtual int32_t * get_output_op_params(const std::string & name) const override; - virtual std::string & get_output_name(size_t index) const override; + virtual int32_t * get_output_op_params(int node_idx, const std::string & name) const override; virtual std::vector get_output_names() const override; + virtual std::vector get_output_names(int node_idx) const override; + virtual const std::string & get_op_type() const override; + virtual const std::string & get_op_type(int node_idx) const override; + virtual const std::string & get_op_name() const override; - virtual void visit_subgraph(std::function)> node_visitor) const override; + virtual const 
std::string & get_op_name(int node_idx) const override; + + virtual void visit_subgraph(std::function, int node_idx)> node_visitor) const override; ggml_tensor * get_input_ggml_tensor(const std::string & name) const { return m_inputs.at(name); } ggml_tensor * get_output_ggml_tensor(const std::string & name) const { return m_outputs.at(name); } - virtual int get_op_case() const override { return m_op_case; } + virtual int get_op_case(int node_idx) const override { return m_node_info_list[node_idx].node_op_case; } virtual const std::map> & get_model_inputs() const override { return m_model_inputs; @@ -150,6 +165,8 @@ private: static std::vector get_shape(const ggml_tensor * tensor); static std::vector get_stride(const ggml_tensor * tensor); static ov::element::Type get_ov_type(const ggml_tensor * tensor); + int compute_op_case(const ggml_tensor * node); + std::string compute_op_type(const ggml_tensor * node); void set_llm_params(); void validate_cgraph() const; @@ -157,21 +174,18 @@ private: bool m_is_static = false; ggml_cgraph * m_cgraph = nullptr; - ggml_tensor * m_node = nullptr; std::vector m_nodes; std::map m_inputs; std::vector m_input_names; std::map m_outputs; std::vector m_output_names; - std::string m_op_name; - mutable std::string m_name; - int m_op_case = 0; - std::vector> m_op_node_name; + std::map> m_model_inputs; std::map> m_model_extra_inputs; std::map> m_model_extra_input_values; std::map> m_model_weights; std::vector m_model_output_names; + std::vector m_node_info_list; // Fixed for a model int m_ctx = -1; diff --git a/ggml/src/ggml-openvino/openvino/decoder.hpp b/ggml/src/ggml-openvino/openvino/decoder.hpp index 8f86a4de06..1d5b7a850f 100644 --- a/ggml/src/ggml-openvino/openvino/decoder.hpp +++ b/ggml/src/ggml-openvino/openvino/decoder.hpp @@ -16,42 +16,58 @@ public: virtual PartialShape get_input_shape(const std::string& name) const = 0; + virtual PartialShape get_input_shape(int node_idx, const std::string& name) const = 0; + virtual std::vector get_input_stride(const std::string& name) const = 0; + virtual std::vector get_input_stride(int node_idx, const std::string& name) const = 0; + virtual element::Type get_input_type(const std::string& name) const = 0; virtual size_t get_input_size() const = 0; + virtual size_t get_input_size(int node_idx) const = 0; + virtual void get_input_node(size_t input_port_idx, std::string& producer_name, std::string& producer_output_port_name, size_t& producer_output_port_index) const = 0; - virtual std::string& get_input_name(size_t index) const = 0; - virtual std::vector get_input_names() const = 0; + virtual std::vector get_input_names(int node_idx) const = 0; + virtual PartialShape get_output_shape(const std::string& name) const = 0; + virtual PartialShape get_output_shape(int node_idx, const std::string& name) const = 0; + virtual std::vector get_output_stride(const std::string& name) const = 0; virtual element::Type get_output_type(const std::string& name) const = 0; virtual int32_t* get_input_op_params(const std::string& name) const = 0; + virtual int32_t* get_input_op_params(int node_idx, const std::string& name) const = 0; + virtual int32_t* get_output_op_params(const std::string& name) const = 0; - virtual std::string& get_output_name(size_t index) const = 0; + virtual int32_t* get_output_op_params(int node_idx, const std::string& name) const = 0; virtual std::vector get_output_names() const = 0; + virtual std::vector get_output_names(int node_idx) const = 0; + virtual const std::string& get_op_type() const = 0; + virtual const 
std::string& get_op_type(int node_idx) const = 0; + virtual const std::string& get_op_name() const = 0; - virtual void visit_subgraph(std::function)> node_visitor) const = 0; + virtual const std::string& get_op_name(int node_idx) const = 0; - virtual int get_op_case() const = 0; + virtual void visit_subgraph(std::function, int node_idx)> node_visitor) const = 0; + + virtual int get_op_case(int node_idx) const = 0; virtual const std::map>& get_model_inputs() const = 0; virtual const std::map>& get_model_extra_inputs() const = 0; diff --git a/ggml/src/ggml-openvino/openvino/node_context.hpp b/ggml/src/ggml-openvino/openvino/node_context.hpp index 0d76dc83e0..64e3d550c5 100644 --- a/ggml/src/ggml-openvino/openvino/node_context.hpp +++ b/ggml/src/ggml-openvino/openvino/node_context.hpp @@ -18,13 +18,15 @@ class NodeContext : public frontend::NodeContext { public: NodeContext(const std::shared_ptr& decoder, std::shared_ptr& tensor_map, + int node_idx, TranslateSession* translate_session = nullptr) - : ov::frontend::NodeContext(decoder->get_op_type()), + : ov::frontend::NodeContext(decoder->get_op_type(node_idx)), m_decoder(decoder), m_tensor_map(tensor_map), + m_node_idx(node_idx), m_translate_session(translate_session) { - m_input_names = decoder->get_input_names(); - m_output_names = decoder->get_output_names(); + m_input_names = decoder->get_input_names(m_node_idx); + m_output_names = decoder->get_output_names(m_node_idx); } TranslateSession* get_translate_session() const { @@ -34,7 +36,7 @@ public: const std::vector& get_input_names() const { return m_input_names; } size_t get_input_size() const override { - return m_decoder->get_input_size(); + return m_decoder->get_input_size(m_node_idx); } ov::element::Type get_input_type(size_t index) const { @@ -42,29 +44,25 @@ public: } PartialShape get_input_shape(size_t index) const { - return m_decoder->get_input_shape(m_input_names[index]); + return m_decoder->get_input_shape(m_node_idx, m_input_names[index]); } std::vector get_input_stride(size_t index) const { - return m_decoder->get_input_stride(m_input_names[index]); + return m_decoder->get_input_stride(m_node_idx, m_input_names[index]); } std::string get_output_name() const { return m_output_names[0]; } PartialShape get_output_shape(size_t index) const { - return m_decoder->get_output_shape(m_output_names[index]); - } - - std::vector get_output_stride(size_t index) const { - return m_decoder->get_output_stride(m_output_names[index]); + return m_decoder->get_output_shape(m_node_idx, m_output_names[index]); } int32_t* get_input_op_params(size_t index) const { - return m_decoder->get_input_op_params(m_input_names[index]); + return m_decoder->get_input_op_params(m_node_idx, m_input_names[index]); } int32_t* get_output_op_params(size_t index) const { - return m_decoder->get_output_op_params(m_output_names[index]); + return m_decoder->get_output_op_params(m_node_idx, m_output_names[index]); } ov::element::Type get_output_type(size_t index) const { @@ -72,7 +70,7 @@ public: } Output get_input(int idx) const override { - return m_tensor_map->at(m_decoder->get_input_name(idx)); + return m_tensor_map->at(m_input_names[idx]); } Output get_input(const std::string& name) const override { @@ -87,7 +85,7 @@ public: } const std::string& get_name() const override { - return m_decoder->get_op_name(); + return m_decoder->get_op_name(m_node_idx); } ov::Any get_attribute_as_any(const std::string& name) const override { @@ -95,13 +93,14 @@ public: } int get_op_case() const { - return m_decoder->get_op_case(); + 
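NodeContext above binds a decoder to one node_idx and snapshots that node's input and output names at construction, so later positional lookups need no index plumbing. A reduced standalone analogue of that binding, assuming hypothetical ToyDecoder/ToyNodeContext types in place of the real frontend classes:

    #include <iostream>
    #include <string>
    #include <vector>

    // Toy decoder: one record of input/output names per node.
    struct ToyDecoder {
        std::vector<std::vector<std::string>> inputs;
        std::vector<std::vector<std::string>> outputs;
        const std::vector<std::string> & get_input_names(int node_idx) const { return inputs[node_idx]; }
        const std::vector<std::string> & get_output_names(int node_idx) const { return outputs[node_idx]; }
    };

    // Analogue of NodeContext: binds (decoder, node_idx) once, then lookups are index-free.
    class ToyNodeContext {
    public:
        ToyNodeContext(const ToyDecoder & decoder, int node_idx)
            : m_input_names(decoder.get_input_names(node_idx)),
              m_output_names(decoder.get_output_names(node_idx)) {}
        size_t get_input_size() const { return m_input_names.size(); }
        const std::string & get_output_name() const { return m_output_names[0]; }
    private:
        std::vector<std::string> m_input_names;
        std::vector<std::string> m_output_names;
    };

    int main() {
        ToyDecoder dec{{{"a", "b"}}, {{"c"}}};
        ToyNodeContext ctx(dec, 0);
        std::cout << ctx.get_input_size() << " -> " << ctx.get_output_name() << '\n';  // 2 -> c
    }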
return m_decoder->get_op_case(m_node_idx); } bool is_static() const { return m_decoder->is_static(); } private: std::shared_ptr m_decoder; std::shared_ptr& m_tensor_map; + int m_node_idx; TranslateSession* m_translate_session; std::vector m_input_names; std::vector m_output_names; diff --git a/ggml/src/ggml-openvino/openvino/translate_session.cpp b/ggml/src/ggml-openvino/openvino/translate_session.cpp index d12701acdc..d03c9358b0 100644 --- a/ggml/src/ggml-openvino/openvino/translate_session.cpp +++ b/ggml/src/ggml-openvino/openvino/translate_session.cpp @@ -164,8 +164,8 @@ std::shared_ptr TranslateSession::translate_graph(const frontend::InputMo (*tensor_map)[it.first] = it.second; } - auto node_visitor = [&](std::shared_ptr node) { - auto operation_type = node->get_op_type(); + auto node_visitor = [&](std::shared_ptr decoder, int node_idx) { + auto operation_type = decoder->get_op_type(node_idx); if (operation_type == "GGML_OP_NONE") { return; } @@ -174,10 +174,10 @@ std::shared_ptr TranslateSession::translate_graph(const frontend::InputMo auto it = m_translator_map.find(operation_type); FRONT_END_OP_CONVERSION_CHECK(it != m_translator_map.end(), "Translation for operation type ", operation_type, " is not implemented."); - NodeContext node_context(node, tensor_map, this); + NodeContext node_context(decoder, tensor_map, node_idx, this); converted_outputs = it->second(node_context); - const auto & node_output_names = node->get_output_names(); + const auto & node_output_names = decoder->get_output_names(node_idx); FRONT_END_OP_CONVERSION_CHECK(node_output_names.size() == converted_outputs.size(), "Number of ", operation_type, " outputs greater than number of converted outputs, which are ", node_output_names.size(), " and ", converted_outputs.size(), " respectively."); From 992dea73fd6ed26ffb05b09ab1d8785d28776f24 Mon Sep 17 00:00:00 2001 From: XuejunZhai Date: Tue, 25 Nov 2025 21:12:41 -0800 Subject: [PATCH 176/254] Fix error for naive --- ggml/src/ggml-openvino/ggml-decoder.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index f3f13167d9..3eda21a562 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -78,6 +78,10 @@ GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph, std::mapn_nodes; node_n++) { + m_node_info_list[node_n].node_op_case = compute_op_case(m_node_info_list[node_n].node); + m_node_info_list[node_n].node_op_type = compute_op_type(m_node_info_list[node_n].node); + } } // Called in GgmlOvDecoder constructor. Two cases: 1. 
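The node_visitor in translate_session.cpp above is a plain dispatch: look the op type up in a translator map, build a NodeContext for (decoder, node_idx), and record the converted outputs under the node's output names. A simplified self-contained version of that dispatch, with string labels and lambdas standing in for the real translator signatures:

    #include <functional>
    #include <iostream>
    #include <map>
    #include <stdexcept>
    #include <string>

    int main() {
        // Translator map: op type -> conversion routine (toy: returns a label).
        std::map<std::string, std::function<std::string(int)>> translator_map = {
            {"GGML_OP_ADD",     [](int node_idx) { return "Add@" + std::to_string(node_idx); }},
            {"GGML_OP_MUL_MAT", [](int node_idx) { return "MatMul@" + std::to_string(node_idx); }},
        };
        std::map<std::string, std::string> tensor_map;  // output name -> converted value

        const std::pair<std::string, std::string> nodes[] = {
            {"GGML_OP_MUL_MAT", "Qcur-0"}, {"GGML_OP_ADD", "ffn_out-0"}};
        int node_idx = 0;
        for (const auto & [op_type, out_name] : nodes) {
            auto it = translator_map.find(op_type);
            if (it == translator_map.end()) {
                throw std::runtime_error("Translation for operation type " + op_type + " is not implemented.");
            }
            tensor_map[out_name] = it->second(node_idx++);
        }
        for (const auto & [name, value] : tensor_map) std::cout << name << " <- " << value << '\n';
    }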
constructing a decoder for the whole graph; From 38254cf592ba3f7fbf69510e48ed63708559ee71 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Mon, 1 Dec 2025 13:47:43 +0800 Subject: [PATCH 177/254] NPU prefill chunking --- ggml/src/ggml-openvino/ggml-decoder.cpp | 17 +- ggml/src/ggml-openvino/ggml-decoder.h | 10 +- ggml/src/ggml-openvino/utils.cpp | 303 +++++++++++++++++++----- ggml/src/ggml-openvino/utils.h | 39 ++- 4 files changed, 286 insertions(+), 83 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 3eda21a562..c7035c1580 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -37,8 +37,12 @@ GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph, std::map> & model_weights, - bool is_static) : + bool is_static, + bool is_prefill, + int prefill_chunk_size) : m_is_static(is_static), + m_is_prefill(is_prefill), + m_prefill_chunk_size(prefill_chunk_size), m_cgraph(cgraph), m_model_weights(model_weights) { if (auto * env = getenv("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS"); env && std::string(env) != "0") { @@ -341,12 +345,16 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor * op, co auto name = std::string(input->name); ov::PartialShape input_shape; - if (name == "inp_tokens" || name == "inp_pos" || name == "inp_out_ids") { + if (name == "inp_tokens" || name == "inp_pos") { + int len = m_is_static ? (m_is_prefill ? m_prefill_chunk_size : 1) : -1; + input_shape = ov::PartialShape{1, 1, 1, len}; + + } else if (name == "inp_out_ids") { input_shape = ov::PartialShape{1, 1, 1, m_is_static ? 1 : -1}; } else if (name.find("KQ_mask") == 0) { if (m_is_static) { - input_shape = ov::PartialShape{1, 1, 1, m_ctx}; + input_shape = ov::PartialShape{1, 1, m_is_prefill ? m_prefill_chunk_size : 1, m_ctx}; } else { input_shape = ov::PartialShape{-1, 1, -1, -1}; } @@ -359,7 +367,8 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor * op, co } } else if (op && op->op == GGML_OP_SET_ROWS && op->src[1] == input) { - input_shape = ov::PartialShape{1, 1, 1, m_is_static ? 1 : -1}; + int len = m_is_static ? (m_is_prefill ? 
m_prefill_chunk_size : 1) : -1; + input_shape = ov::PartialShape{1, 1, 1, len}; } else if (input->op == GGML_OP_VIEW) { // This case is added to make test-backend-ops work diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index 6e2bf0486d..11f35f038e 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -26,7 +26,9 @@ public: // Graph decoder GgmlOvDecoder(ggml_cgraph * cgraph, std::map> & model_weights, - bool is_static); + bool is_static, + bool is_prefill = false, + int prefill_chunk_size = 256); // Naive graph decoder GgmlOvDecoder(ggml_cgraph * cgraph, std::map> & model_weights); @@ -159,6 +161,10 @@ public: void clear_model_weights() { m_model_weights.clear(); } + bool m_is_static = false; + bool m_is_prefill = false; + int m_prefill_chunk_size = 0; + private: void set_input_output(ggml_tensor * node, bool naive = false); void add_extra_inputs(); @@ -171,8 +177,6 @@ private: void set_llm_params(); void validate_cgraph() const; - bool m_is_static = false; - ggml_cgraph * m_cgraph = nullptr; std::vector m_nodes; std::map m_inputs; diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 6e1d7393c7..ae8916cc58 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -48,8 +48,17 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, ggml_cgraph * } return device; }; - static std::string device = get_device(); - bool is_static = device == "NPU" ? true : false; + auto get_prefill_chunk_size = [] { + const char * chunk_size_str = getenv("GGML_OPENVINO_PREFILL_CHUNK_SIZE"); + if (chunk_size_str && atoi(chunk_size_str) > 0) { + return atoi(chunk_size_str); + } + return 256; + }; + + static const auto device = get_device(); + static const auto is_static = device == "NPU" ? 
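On the static (NPU) path the graph input shapes above are fully determined by is_prefill: inp_tokens/inp_pos get length prefill_chunk_size during prefill and 1 during decode, and KQ_mask becomes {1, 1, chunk-or-1, ctx}. A tiny sketch that reproduces just this shape selection with plain integers (no OpenVINO types):

    #include <array>
    #include <iostream>

    // Mirrors the shape logic in get_graph_input_shape for the static path.
    std::array<long, 4> token_shape(bool is_static, bool is_prefill, long chunk, long dyn = -1) {
        long len = is_static ? (is_prefill ? chunk : 1) : dyn;
        return {1, 1, 1, len};
    }

    std::array<long, 4> mask_shape(bool is_static, bool is_prefill, long chunk, long ctx) {
        if (!is_static) return {-1, 1, -1, -1};
        return {1, 1, is_prefill ? chunk : 1, ctx};
    }

    int main() {
        auto t = token_shape(true, true, 256);
        auto m = mask_shape(true, false, 256, 4096);
        std::cout << "prefill tokens: {1,1,1," << t[3] << "}  decode mask: {1,1," << m[2] << "," << m[3] << "}\n";
    }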
true : false; + static const auto prefill_chunk_size = get_prefill_chunk_size(); ov::AnyMap config; @@ -71,12 +80,16 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, ggml_cgraph * static std::mutex cache_mutex; static std::unordered_map> infer_request_cache; + static std::unordered_map> infer_request_cache_prefill; static std::unordered_map> ov_input_names_cache; static std::unordered_map> ov_output_names_cache; std::shared_ptr ggml_decoder; std::shared_ptr infer_request; + const auto * inp_pos = get_inp_pos_tensor(cgraph); + const auto is_prefill = get_is_prefill(inp_pos); + int64_t decoder_end_time; int64_t conversion_end_time; int64_t compile_end_time; @@ -88,36 +101,78 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, ggml_cgraph * auto it = infer_request_cache.find(cgraph); if (it != infer_request_cache.end()) { std::map> model_weights; - ggml_decoder = std::make_shared(cgraph, model_weights, is_static); + ggml_decoder = + std::make_shared(cgraph, model_weights, is_static, is_prefill, prefill_chunk_size); decoder_end_time = ggml_time_us(); infer_request = infer_request_cache[cgraph]; + if (is_static && is_prefill) { + infer_request = infer_request_cache_prefill[cgraph]; + } conversion_end_time = ggml_time_us(); compile_end_time = conversion_end_time; } else { std::shared_ptr model; auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph, get_types_to_requant(device)); - ggml_decoder = std::make_shared(cgraph, model_weights, is_static); - decoder_end_time = ggml_time_us(); + if (!is_static) { + ggml_decoder = std::make_shared(cgraph, model_weights, is_static); + decoder_end_time = ggml_time_us(); - auto input_model = std::make_shared(ggml_decoder); - model = ov::frontend::ggml::FrontEnd::convert(input_model); - ggml_decoder->clear_model_weights(); - conversion_end_time = ggml_time_us(); + auto input_model = std::make_shared(ggml_decoder); + model = ov::frontend::ggml::FrontEnd::convert(input_model); + ggml_decoder->clear_model_weights(); + conversion_end_time = ggml_time_us(); - if (getenv("GGML_OPENVINO_DUMP_IR")) { - char timestamped_filename[64]; - auto timestamp = (long long) ggml_time_us(); - snprintf(timestamped_filename, sizeof(timestamped_filename), "model_%lld.xml", timestamp); - ov::serialize(model, timestamped_filename); + if (getenv("GGML_OPENVINO_DUMP_IR")) { + char timestamped_filename[64]; + auto timestamp = (long long) ggml_time_us(); + snprintf(timestamped_filename, sizeof(timestamped_filename), "model_%lld.xml", timestamp); + ov::serialize(model, timestamped_filename); + } + + auto compiled_model = core.compile_model(model, device, get_ov_compile_config(device)); + compile_end_time = ggml_time_us(); + infer_request_cache[cgraph] = std::make_shared(compiled_model.create_infer_request()); + infer_request = infer_request_cache[cgraph]; + } else { + auto ggml_decoder_prefill = + std::make_shared(cgraph, model_weights, is_static, true, prefill_chunk_size); + auto ggml_decoder_decode = + std::make_shared(cgraph, model_weights, is_static, false, prefill_chunk_size); + decoder_end_time = ggml_time_us(); + + auto input_model_prefill = std::make_shared(ggml_decoder_prefill); + auto input_model_decode = std::make_shared(ggml_decoder_decode); + + auto model_prefill = ov::frontend::ggml::FrontEnd::convert(input_model_prefill); + ggml_decoder_prefill->clear_model_weights(); + auto model_decode = ov::frontend::ggml::FrontEnd::convert(input_model_decode); + ggml_decoder_decode->clear_model_weights(); + conversion_end_time = 
ggml_time_us(); + + if (getenv("GGML_OPENVINO_DUMP_IR")) { + char timestamped_filename[64]; + auto timestamp = (long long) ggml_time_us(); + snprintf(timestamped_filename, sizeof(timestamped_filename), "model_prefill_%lld.xml", timestamp); + ov::serialize(model_prefill, timestamped_filename); + snprintf(timestamped_filename, sizeof(timestamped_filename), "model_decode_%lld.xml", timestamp); + ov::serialize(model_decode, timestamped_filename); + } + + auto compiled_model_prefill = core.compile_model(model_prefill, device, get_ov_compile_config(device)); + auto compiled_model_decode = core.compile_model(model_decode, device, get_ov_compile_config(device)); + infer_request_cache_prefill[cgraph] = + std::make_shared(compiled_model_prefill.create_infer_request()); + infer_request_cache[cgraph] = + std::make_shared(compiled_model_decode.create_infer_request()); + compile_end_time = ggml_time_us(); + + model = is_prefill ? model_prefill : model_decode; + ggml_decoder = is_prefill ? ggml_decoder_prefill : ggml_decoder_decode; + infer_request = is_prefill ? infer_request_cache_prefill[cgraph] : infer_request_cache[cgraph]; } - auto compiled_model = core.compile_model(model, device, get_ov_compile_config(device)); - compile_end_time = ggml_time_us(); - infer_request_cache[cgraph] = std::make_shared(compiled_model.create_infer_request()); - infer_request = infer_request_cache[cgraph]; - std::vector ov_input_names; std::vector ov_output_names; for (const auto & ov_param : model->get_parameters()) { @@ -131,20 +186,22 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, ggml_cgraph * // Set output tensors (for NPU) and kvcache i/o tensors once and for all // Note: does not seem to improve perf on CPU/GPU, but breaks llama-bench, so disabled it for CPU/GPU - if (is_static) { - for (size_t i = 0; i < ov_output_names.size(); i++) { - auto output_name = ov_output_names[i]; - auto output_tensor = get_ov_output_tensor(ggml_decoder, ov_output_names[i]); - infer_request->set_output_tensor(i, output_tensor); - } - for (size_t i = 0; i < ov_input_names.size(); i++) { - auto param_name = ov_input_names[i]; - if (param_name.find("cache") == 0) { - auto input_tensor = get_ov_input_tensor_static(ggml_decoder, param_name, 0, 0); - infer_request->set_input_tensor(i, input_tensor); - } - } - } + // if (is_static) { + // for (size_t i = 0; i < ov_input_names.size(); i++) { + // auto param_name = ov_input_names[i]; + // if (param_name.find("cache") == 0) { + // auto input_tensor = get_ov_input_tensor_static_decode(ggml_decoder, param_name); + // infer_request->set_input_tensor(i, input_tensor); + // } + // } + // for (size_t i = 0; i < ov_output_names.size(); i++) { + // auto output_name = ov_output_names[i]; + // if (output_name.find("cache") == 0) { + // auto output_tensor = get_ov_output_tensor(ggml_decoder, ov_output_names[i]); + // infer_request->set_output_tensor(i, output_tensor); + // } + // } + // } } } @@ -177,11 +234,39 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, ggml_cgraph * } } } else { - auto input_len = ggml_decoder->get_input_len(); - for (int j = 0; j < input_len; j++) { + if (is_prefill) { + auto inp_len = inp_pos->ne[0]; + for (int chunk_index = 0; chunk_index * prefill_chunk_size < inp_len; chunk_index++) { + for (size_t i = 0; i < ov_input_names.size(); i++) { + auto param_name = ov_input_names[i]; + auto input_tensor = get_ov_input_tensor_static_prefill(ggml_decoder, param_name, chunk_index); + infer_request->set_input_tensor(i, input_tensor); + + if 
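For NPU the code above converts and compiles two static-shape models from the same weight nodes — one sized for a prefill chunk, one for single-token decode — and keeps a separate infer-request cache for each, choosing per call by is_prefill. A reduced sketch of that two-entry cache, with std::string standing in for a compiled model and a bare pointer standing in for the real cache key:

    #include <iostream>
    #include <memory>
    #include <string>
    #include <unordered_map>

    struct Graph {};  // stand-in for ggml_cgraph

    int main() {
        std::unordered_map<const Graph *, std::shared_ptr<std::string>> cache_decode;
        std::unordered_map<const Graph *, std::shared_ptr<std::string>> cache_prefill;

        Graph g;
        bool is_prefill = true;

        // Compile both variants once, on the first miss.
        if (cache_decode.find(&g) == cache_decode.end()) {
            cache_prefill[&g] = std::make_shared<std::string>("compiled(prefill, chunk=256)");
            cache_decode[&g]  = std::make_shared<std::string>("compiled(decode, len=1)");
        }
        // Every later call just picks the right request for the phase.
        auto request = is_prefill ? cache_prefill[&g] : cache_decode[&g];
        std::cout << *request << '\n';
    }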
(getenv("GGML_OPENVINO_DEBUG_INPUT")) { + const auto input_tensor = infer_request->get_input_tensor(i); + print_input_tensor_info(param_name, input_tensor); + } + } + + for (size_t i = 0; i < ov_output_names.size(); i++) { + auto output_tensor = get_ov_output_tensor(ggml_decoder, ov_output_names[i]); + infer_request->set_output_tensor(i, output_tensor); + } + + infer_request->infer(); + + if (getenv("GGML_OPENVINO_DEBUG_OUTPUT")) { + for (size_t i = 0; i < ov_output_names.size(); i++) { + const auto output_tensor = infer_request->get_output_tensor(i); + print_output_tensor_info(ov_output_names[i], output_tensor, output_tensor.data()); + } + } + } + infer_end_time = ggml_time_us(); + } else { for (size_t i = 0; i < ov_input_names.size(); i++) { auto param_name = ov_input_names[i]; - auto input_tensor = get_ov_input_tensor_static(ggml_decoder, param_name, j, input_len); + auto input_tensor = get_ov_input_tensor_static_decode(ggml_decoder, param_name); infer_request->set_input_tensor(i, input_tensor); if (getenv("GGML_OPENVINO_DEBUG_INPUT")) { @@ -190,7 +275,13 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, ggml_cgraph * } } + for (size_t i = 0; i < ov_output_names.size(); i++) { + auto output_tensor = get_ov_output_tensor(ggml_decoder, ov_output_names[i]); + infer_request->set_output_tensor(i, output_tensor); + } + infer_request->infer(); + infer_end_time = ggml_time_us(); if (getenv("GGML_OPENVINO_DEBUG_OUTPUT")) { for (size_t i = 0; i < ov_output_names.size(); i++) { @@ -199,7 +290,6 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, ggml_cgraph * } } } - infer_end_time = ggml_time_us(); } if (getenv("GGML_OPENVINO_PROFILING")) { @@ -324,21 +414,84 @@ ov::Tensor get_ov_input_tensor(std::shared_ptr ggml_decoder, cons return input_tensor; } -ov::Tensor get_ov_input_tensor_static(std::shared_ptr ggml_decoder, - const std::string & param_name, - int j, - int input_len) { +ov::Tensor get_ov_input_tensor_static_decode(std::shared_ptr ggml_decoder, + const std::string & param_name) { const auto * ggml_tensor = ggml_decoder->get_input_ggml_tensor(param_name); const auto * op = ggml_decoder->get_tensor_used_op(ggml_tensor); if (param_name == "inp_pos" || param_name == "inp_tokens" || (op->op == GGML_OP_SET_ROWS && op->src[1] == ggml_tensor)) { + assert(ggml_tensor->ne[0] == 1); ov::Shape input_shape = {1, 1, 1, 1}; ov::Tensor input_tensor(ggml_decoder->get_input_type(param_name), input_shape); - // copy the j-th value from ggml_tensor + if (ggml_tensor->type == GGML_TYPE_I32) { + *input_tensor.data() = *((int32_t *) ggml_tensor->data); + } else if (ggml_tensor->type == GGML_TYPE_I64) { + *input_tensor.data() = *((int64_t *) ggml_tensor->data); + } else { + throw std::runtime_error("Unexpected tensor type for " + param_name); + } + return input_tensor; + } + + if (param_name == "inp_out_ids") { + ov::Shape input_shape = {1, 1, 1, 1}; + ov::Tensor input_tensor(ggml_decoder->get_input_type(param_name), input_shape); + int32_t inp_out_id = *((int32_t *) ggml_tensor->data); + assert(ggml_tensor->ne[0] == 1); + assert(inp_out_id == 0); + *input_tensor.data() = inp_out_id; + return input_tensor; + } + + if (param_name.find("KQ_mask") == 0) { + size_t context_size = ggml_decoder->get_ctx_size(); + std::vector padded_data = pad_input(ggml_tensor, 1, context_size, -INFINITY); + ov::Tensor input_tensor(ov::element::f32, ov::Shape{1, 1, 1, context_size}); + auto * data_ptr = input_tensor.data(); + std::copy(padded_data.begin(), padded_data.begin() + context_size, 
data_ptr);
+        return input_tensor;
+    }
+
+    return get_ov_input_tensor(ggml_decoder, param_name);
+}
+
+ov::Tensor get_ov_input_tensor_static_prefill(std::shared_ptr<GgmlOvDecoder> ggml_decoder,
+                                              const std::string & param_name,
+                                              int chunk_index) {
+    const auto * ggml_tensor = ggml_decoder->get_input_ggml_tensor(param_name);
+    const auto * op = ggml_decoder->get_tensor_used_op(ggml_tensor);
+
+    const size_t input_len = ggml_decoder->get_input_len();
+    const size_t chunk_size = ggml_decoder->m_prefill_chunk_size;
+    const size_t chunk_valid_size = std::min(chunk_size, input_len - chunk_index * chunk_size);
+    const size_t chunk_pad_size = chunk_size - chunk_valid_size;
+
     if (param_name == "inp_pos" || param_name == "inp_tokens" ||
         (op->op == GGML_OP_SET_ROWS && op->src[1] == ggml_tensor)) {
-        ov::Shape input_shape = {1, 1, 1, 1};
+        ov::Shape input_shape = {1, 1, 1, chunk_size};
         ov::Tensor input_tensor(ggml_decoder->get_input_type(param_name), input_shape);
-        // copy the j-th value from ggml_tensor
+        // copy the chunk_index-th chunk from ggml_tensor
         size_t element_size = ggml_type_size(ggml_tensor->type);
-        void * input_data = (char *) ggml_tensor->data + j * element_size;
-        std::memcpy(input_tensor.data(), input_data, element_size);
+        void * input_data = (char *) ggml_tensor->data + chunk_index * chunk_size * element_size;
+        std::memcpy(input_tensor.data(), input_data, chunk_valid_size * element_size);
+        // pad the rest with last_value + 1, so that the KVs of the padded positions are inserted
+        // into the rows right after the valid rows in the KV cache
+        if (chunk_pad_size > 0) {
+            if (ggml_tensor->type == GGML_TYPE_I32) {
+                int32_t last_value =
+                    *((int32_t *) ggml_tensor->data + (chunk_index * chunk_size + chunk_valid_size - 1));
+                int32_t * output_data = input_tensor.data<int32_t>();
+                std::fill(output_data + chunk_valid_size, output_data + chunk_size, last_value + 1);
+            } else if (ggml_tensor->type == GGML_TYPE_I64) {
+                int64_t last_value =
+                    *((int64_t *) ggml_tensor->data + (chunk_index * chunk_size + chunk_valid_size - 1));
+                int64_t * output_data = input_tensor.data<int64_t>();
+                std::fill(output_data + chunk_valid_size, output_data + chunk_size, last_value + 1);
+            } else {
+                throw std::runtime_error("Unexpected tensor type for " + param_name);
+            }
+        }
         return input_tensor;
     }
 
@@ -348,25 +501,26 @@ ov::Tensor get_ov_input_tensor_static(std::shared_ptr<GgmlOvDecoder> ggml_decode
     if (param_name == "inp_out_ids") {
         ov::Shape input_shape = {1, 1, 1, 1};
         ov::Tensor input_tensor(ggml_decoder->get_input_type(param_name), input_shape);
         if (ggml_tensor->ne[0] == 0) {
             *input_tensor.data<int32_t>() = 0;
         } else if (ggml_tensor->ne[0] == 1) {
-            if (j == input_len - 1) {
-                *input_tensor.data<int32_t>() = *((int32_t *) ggml_tensor->data);
-            } else {
-                *input_tensor.data<int32_t>() = 0;
-            }
+            int32_t inp_out_id = *((int32_t *) ggml_tensor->data) % chunk_size;
+            *input_tensor.data<int32_t>() = inp_out_id;
         } else {
-            throw std::runtime_error("Static graph inp_out_ids unexpected ne[0] > 1");
+            throw std::runtime_error("NPU does not support outputting logits for multiple tokens at once.");
         }
         return input_tensor;
     }
 
     if (param_name.find("KQ_mask") == 0) {
+        size_t cols = ggml_tensor->ne[0];
+        size_t rows = ggml_tensor->ne[1];
+        float * ggml_data = (float *) ggml_tensor->data + chunk_index * chunk_size * cols;
+        size_t chunk_valid_rows = std::min(chunk_size, rows - chunk_index * chunk_size);
         size_t context_size = ggml_decoder->get_ctx_size();
-        const auto * input_tensor_ggml = ggml_decoder->get_input_ggml_tensor(param_name);
-        std::vector<float> padded_data = pad_input(input_tensor_ggml, input_len, context_size, -INFINITY);
-        ov::Tensor input_tensor(ov::element::f32, ov::Shape{1, 1, 1, context_size});
-        // copy the j-th row of padded_data
+        std::vector<float> padded_data =
+            pad_input(ggml_data, chunk_valid_rows, cols, chunk_size,
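Chunked prefill above walks the prompt in fixed windows of prefill_chunk_size; only the last window can be short, and its tail positions are filled with last value + 1 so the padded KVs land one row past the valid ones in the cache. A worked standalone example of that windowing and padding:

    #include <algorithm>
    #include <cstddef>
    #include <cstdint>
    #include <iostream>
    #include <vector>

    int main() {
        const std::size_t chunk_size = 4;
        std::vector<int32_t> inp_pos = {0, 1, 2, 3, 4, 5};  // input_len = 6

        for (std::size_t chunk_index = 0; chunk_index * chunk_size < inp_pos.size(); ++chunk_index) {
            std::size_t valid = std::min(chunk_size, inp_pos.size() - chunk_index * chunk_size);
            std::vector<int32_t> chunk(chunk_size);
            std::copy_n(inp_pos.begin() + chunk_index * chunk_size, valid, chunk.begin());
            // Pad with last_value + 1, as in get_ov_input_tensor_static_prefill.
            std::fill(chunk.begin() + valid, chunk.end(), chunk[valid - 1] + 1);
            for (int32_t p : chunk) std::cout << p << ' ';
            std::cout << '\n';  // prints "0 1 2 3" then "4 5 6 6"
        }
    }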
context_size, -INFINITY); + set_zero_diagonal(padded_data, chunk_size, context_size); + ov::Tensor input_tensor(ov::element::f32, ov::Shape{1, 1, chunk_size, context_size}); auto * data_ptr = input_tensor.data(); - std::copy(padded_data.begin() + j * context_size, padded_data.begin() + (j + 1) * context_size, data_ptr); + std::copy(padded_data.begin(), padded_data.begin() + chunk_size * context_size, data_ptr); return input_tensor; } @@ -401,9 +555,28 @@ void print_input_tensor_info(const std::string & name, const ov::Tensor & tensor std::cout << "Input name: " << name << ", Input shape: " << tensor.get_shape() << ", Address: " << tensor.data() << std::endl; switch (tensor.get_element_type()) { - case ov::element::f32: - std::cout << *(tensor.data()) << std::endl; + case ov::element::f32: { + if (name.find("KQ_mask") == std::string::npos) { + std::cout << *(tensor.data()) << std::endl; + } else { + size_t rows = tensor.get_shape()[2]; + size_t cols = tensor.get_shape()[3]; + auto * data = tensor.data(); + for (size_t i = 0; i < rows; ++i) { + for (size_t j = 0; j < cols; ++j) { + float val = data[i * cols + j]; + if (std::isinf(val) && val < 0) { + std::cout << std::setw(5) << "-inf"; + } else { + std::cout << std::setw(5) << val; + } + } + std::cout << std::endl; + } + } + break; + } case ov::element::f16: std::cout << *(tensor.data()) << std::endl; break; @@ -414,7 +587,10 @@ void print_input_tensor_info(const std::string & name, const ov::Tensor & tensor std::cout << std::endl; break; case ov::element::i64: - std::cout << *(tensor.data()) << std::endl; + for (size_t i = 0; i < tensor.get_size(); ++i) { + std::cout << tensor.data()[i] << " "; + } + std::cout << std::endl; break; default: break; @@ -471,9 +647,10 @@ void print_output_tensor_info(const std::string & name, const ov::Tensor & tenso } } -void set_zero_diagonal(std::vector & matrix, size_t dim) { - for (size_t i = 0; i < dim; ++i) { - matrix[i * dim + i] = 0.0f; +void set_zero_diagonal(std::vector & matrix, size_t rows, size_t cols) { + for (size_t i = 0; i < rows; ++i) { + size_t diag_col = std::min(i, cols - 1); + matrix[i * cols + diag_col] = 0.0f; } } @@ -494,8 +671,8 @@ const ggml_tensor * get_inp_pos_tensor(ggml_cgraph * cgraph) { throw std::runtime_error("get_inp_pos_tensor: inp_pos not found in cgraph"); } -bool get_is_first_token(const ggml_tensor * inp_pos) { - return *(int32_t *) inp_pos->data == 0; +bool get_is_prefill(const ggml_tensor * inp_pos) { + return inp_pos->ne[0] > 1; } #pragma GCC diagnostic pop diff --git a/ggml/src/ggml-openvino/utils.h b/ggml/src/ggml-openvino/utils.h index 999fc53f32..31f86d0999 100644 --- a/ggml/src/ggml-openvino/utils.h +++ b/ggml/src/ggml-openvino/utils.h @@ -3,6 +3,7 @@ #include "ggml-impl.h" #include +#include #include enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph); @@ -14,35 +15,47 @@ void print_input_tensor_info(const std::string & name, const ov::Tensor & tensor void print_output_tensor_info(const std::string & name, const ov::Tensor & tensor, void * output_dst); template -std::vector pad_input(const ggml_tensor * tensor, size_t padded_rows, size_t padded_cols, T pad_value) { - std::vector padded_data(padded_rows * padded_cols, pad_value); - size_t rows = tensor->ne[1]; - size_t cols = tensor->ne[0]; - T * data = static_cast(tensor->data); +std::vector pad_input(const T * data, + size_t rows, + size_t cols, + size_t padded_rows, + size_t padded_cols, + T pad_value) { + std::vector padded(padded_rows * padded_cols, pad_value); 
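pad_input and set_zero_diagonal together build the static KQ_mask: valid rows and columns are copied, everything else becomes -inf, and one entry per row is then forced to 0 so even fully padded rows attend to something. A small worked example restating the two helpers from this file on a 2x3 causal mask padded to 4x4:

    #include <algorithm>
    #include <cmath>
    #include <cstdio>
    #include <vector>

    // Same logic as the helpers in utils.h/utils.cpp, restated standalone for floats.
    std::vector<float> pad_input(const float * data, size_t rows, size_t cols,
                                 size_t padded_rows, size_t padded_cols, float pad) {
        std::vector<float> out(padded_rows * padded_cols, pad);
        for (size_t i = 0; i < std::min(rows, padded_rows); ++i)
            for (size_t j = 0; j < std::min(cols, padded_cols); ++j)
                out[i * padded_cols + j] = data[i * cols + j];
        return out;
    }

    void set_zero_diagonal(std::vector<float> & m, size_t rows, size_t cols) {
        for (size_t i = 0; i < rows; ++i) m[i * cols + std::min(i, cols - 1)] = 0.0f;
    }

    int main() {
        const float mask[2 * 3] = {0, -INFINITY, -INFINITY,   // 2 valid causal rows
                                   0, 0, -INFINITY};
        auto padded = pad_input(mask, 2, 3, 4, 4, -INFINITY);  // pad to 4x4
        set_zero_diagonal(padded, 4, 4);                       // padded rows still get one 0
        for (size_t i = 0; i < 4; ++i, std::puts(""))
            for (size_t j = 0; j < 4; ++j) std::printf("%6.0f", padded[i * 4 + j]);
    }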
for (size_t i = 0; i < std::min(rows, padded_rows); ++i) { for (size_t j = 0; j < std::min(cols, padded_cols); ++j) { - padded_data[i * padded_cols + j] = data[i * cols + j]; + padded[i * padded_cols + j] = data[i * cols + j]; } } - return padded_data; + + return padded; } -void set_zero_diagonal(std::vector & matrix, size_t dim); +template +std::vector pad_input(const ggml_tensor * tensor, size_t padded_rows, size_t padded_cols, T pad_value) { + return pad_input(reinterpret_cast(tensor->data), + static_cast(tensor->ne[1]), // rows + static_cast(tensor->ne[0]), // cols + padded_rows, padded_cols, pad_value); +} + +void set_zero_diagonal(std::vector & matrix, size_t rows, size_t cols); const ggml_tensor * get_inp_pos_tensor(struct ggml_cgraph * cgraph); -bool get_is_first_token(const ggml_tensor * inp_pos); +bool get_is_prefill(const ggml_tensor * inp_pos); ov::AnyMap get_ov_compile_config(const std::string & device); std::map get_types_to_requant(const std::string & device); ov::Tensor get_ov_input_tensor(std::shared_ptr ggml_decoder, const std::string & param_name); -ov::Tensor get_ov_input_tensor_static(std::shared_ptr ggml_decoder, - const std::string & param_name, - int j, - int input_len); +ov::Tensor get_ov_input_tensor_static_decode(std::shared_ptr ggml_decoder, + const std::string & param_name); +ov::Tensor get_ov_input_tensor_static_prefill(std::shared_ptr ggml_decoder, + const std::string & param_name, + int chunk_index); ov::Tensor get_ov_output_tensor(std::shared_ptr ggml_decoder, const std::string & result_name); From 59e7e7c47d444a6c7a25e90d3a00488966d6680f Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Wed, 3 Dec 2025 15:45:40 +0800 Subject: [PATCH 178/254] NPU fix llama-bench --- ggml/src/ggml-openvino/ggml-decoder.cpp | 98 ++++++++++----------- ggml/src/ggml-openvino/ggml-decoder.h | 93 ++++++++++++------- ggml/src/ggml-openvino/openvino/decoder.hpp | 2 +- ggml/src/ggml-openvino/utils.cpp | 84 ++++++++++++------ ggml/src/ggml-openvino/utils.h | 22 +++++ 5 files changed, 191 insertions(+), 108 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index c7035c1580..4c0258c4e3 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -36,6 +36,8 @@ #include GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph, + ModelParams & model_params, + ComputeParams & compute_params, std::map> & model_weights, bool is_static, bool is_prefill, @@ -44,7 +46,9 @@ GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph, m_is_prefill(is_prefill), m_prefill_chunk_size(prefill_chunk_size), m_cgraph(cgraph), - m_model_weights(model_weights) { + m_model_weights(model_weights), + m_model_params(model_params), + m_compute_params(compute_params) { if (auto * env = getenv("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS"); env && std::string(env) != "0") { #ifdef _WIN32 _putenv_s("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS", ""); @@ -54,7 +58,6 @@ GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph, print_tensor_address_map(cgraph); } - set_llm_params(); validate_cgraph(); for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) { @@ -163,12 +166,6 @@ void GgmlOvDecoder::set_input_output(ggml_tensor * node, bool naive) { // Workaround: the final tensor "result_output" does not have GGML_TENSOR_FLAG_OUTPUT flag set in cgraph if (node->op == GGML_OP_SET_ROWS || node->flags & GGML_TENSOR_FLAG_OUTPUT || node_name.find("output") != std::string::npos || debug_output_names.count(node_name)) { - if (node->op == 
GGML_OP_SET_ROWS) { - assert(node_name.find("cache_k") == 0 || node_name.find("cache_v") == 0); - if (auto it = std::find(m_kv_names.begin(), m_kv_names.end(), node_name); it == m_kv_names.end()) { - m_kv_names.push_back(node_name); - } - } if (auto it = std::find(m_model_output_names.begin(), m_model_output_names.end(), node_name); it == m_model_output_names.end()) { m_model_output_names.push_back(node_name); @@ -277,9 +274,11 @@ int extract_layer_from_name(const std::string & name) { return layer; } -void GgmlOvDecoder::set_llm_params() { - for (int i = 0; i < m_cgraph->n_nodes; i++) { - auto * node = m_cgraph->nodes[i]; +std::pair GgmlOvDecoder::compute_llm_params(ggml_cgraph * cgraph, bool is_static) { + ModelParams model_params; + ComputeParams compute_params; + for (int i = 0; i < cgraph->n_nodes; i++) { + auto * node = cgraph->nodes[i]; std::string name = std::string(node->name); if (node->op == GGML_OP_FLASH_ATTN_EXT) { auto * cache_k_perm = node->src[1]; @@ -294,49 +293,50 @@ void GgmlOvDecoder::set_llm_params() { assert(mask_name.find("KQ_mask") == 0); if (std::string(node->src[3]->name).find("swa") != std::string::npos) { - m_swa_layers.push_back(layer); - m_ctx_per_seq_swa = cache_k->ne[1]; + model_params.swa_layers.push_back(layer); + model_params.ctx_per_seq_swa = cache_k->ne[1]; } else { - m_ctx_per_seq = cache_k->ne[1]; - m_n_seq = cache_k->ne[2]; + model_params.ctx_per_seq = cache_k->ne[1]; + model_params.n_seq = cache_k->ne[2]; } - m_n_seq_active = mask->ne[3]; + compute_params.n_seq_active = mask->ne[3]; auto seq_size = cache_k->ne[0] * cache_k->ne[1] * ggml_type_size(cache_k->type); size_t offset; memcpy(&offset, cache_k_view->op_params, sizeof(size_t)); - m_seq_active_start = offset / seq_size; - m_token_len_per_seq = node->ne[2]; + compute_params.seq_active_start = offset / seq_size; + compute_params.token_len_per_seq = node->ne[2]; if (mask_name.find("swa") != std::string::npos) { - m_attention_size_swa = mask->ne[0]; + compute_params.attention_size_swa = mask->ne[0]; } else { - m_attention_size = mask->ne[0]; + compute_params.attention_size = mask->ne[0]; } - if (m_is_static) { - m_attention_size = m_ctx_per_seq; - m_attention_size_swa = m_ctx_per_seq_swa; - m_token_len_per_seq = 1; + if (is_static) { + compute_params.attention_size = model_params.ctx_per_seq; + compute_params.attention_size_swa = model_params.ctx_per_seq_swa; + compute_params.token_len_per_seq = 1; } } else if (node->op == GGML_OP_ROPE) { if (name.find("Qcur-0") == 0 || std::string(node->src[0]->name).find("Qcur-0") == 0) { - m_head_size = node->ne[0]; - m_n_heads = node->ne[1]; - m_rope_params = node->op_params; + model_params.head_size = node->ne[0]; + model_params.n_heads = node->ne[1]; + model_params.rope_params = node->op_params; auto * inp_pos = node->src[1]; - m_input_len = inp_pos->ne[0]; + compute_params.input_len = inp_pos->ne[0]; } else if (name.find("Kcur-0") == 0 || std::string(node->src[0]->name).find("Kcur-0") == 0) { - m_n_heads_kv = node->ne[1]; + model_params.n_heads_kv = node->ne[1]; } } } - m_ctx = m_ctx_per_seq * m_n_seq; - m_ctx_swa = m_ctx_per_seq_swa * m_n_seq; + model_params.ctx = model_params.ctx_per_seq * model_params.n_seq; + model_params.ctx_swa = model_params.ctx_per_seq_swa * model_params.n_seq; + return {model_params, compute_params}; } void GgmlOvDecoder::validate_cgraph() const { - if (m_n_seq > 1 && m_is_static == true) { + if (m_model_params.n_seq > 1 && m_is_static == true) { throw std::runtime_error("n_seq > 1 is not supported on NPU. 
Try setting -np 1."); } } @@ -354,7 +354,7 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor * op, co } else if (name.find("KQ_mask") == 0) { if (m_is_static) { - input_shape = ov::PartialShape{1, 1, m_is_prefill ? m_prefill_chunk_size : 1, m_ctx}; + input_shape = ov::PartialShape{1, 1, m_is_prefill ? m_prefill_chunk_size : 1, m_model_params.ctx}; } else { input_shape = ov::PartialShape{-1, 1, -1, -1}; } @@ -403,14 +403,14 @@ void GgmlOvDecoder::add_extra_inputs() { } }; - create_1d_input("attention_size", m_attention_size); - if (m_attention_size_swa != -1) { - create_1d_input("attention_size_swa", m_attention_size_swa); + create_1d_input("attention_size", m_compute_params.attention_size); + if (m_compute_params.attention_size_swa != -1) { + create_1d_input("attention_size_swa", m_compute_params.attention_size_swa); } - create_1d_input("n_seq_active", m_n_seq_active); - create_1d_input("seq_active_start", m_seq_active_start); - create_1d_input("seq_active_end", m_seq_active_start + m_n_seq_active); - create_1d_input("token_len_per_seq", m_token_len_per_seq); + create_1d_input("n_seq_active", m_compute_params.n_seq_active); + create_1d_input("seq_active_start", m_compute_params.seq_active_start); + create_1d_input("seq_active_end", m_compute_params.seq_active_start + m_compute_params.n_seq_active); + create_1d_input("token_len_per_seq", m_compute_params.token_len_per_seq); // create_1d_input("token_len", m_token_len_per_seq * m_n_seq_active); } @@ -445,15 +445,15 @@ const ggml_tensor * GgmlOvDecoder::get_tensor_from_name(const std::string & name return nullptr; } -std::map GgmlOvDecoder::get_kv_param_res_names() const { - std::map kv_param_res_names; - for (const auto & name : m_kv_names) { - if (name.find("cache_k") == 0 || name.find("cache_v") == 0) { - kv_param_res_names[name] = name; - } - } - return kv_param_res_names; -} +// std::map GgmlOvDecoder::get_kv_param_res_names() const { +// std::map kv_param_res_names; +// for (const auto & name : m_model_params.kv_names) { +// if (name.find("cache_k") == 0 || name.find("cache_v") == 0) { +// kv_param_res_names[name] = name; +// } +// } +// return kv_param_res_names; +// } std::map> GgmlOvDecoder::create_weight_nodes( ggml_cgraph * cgraph, diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index 11f35f038e..f2efb65a23 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -11,6 +11,42 @@ #include #include +struct ModelParams { + int ctx = -1; + int ctx_swa = -1; + int ctx_per_seq = -1; + int ctx_per_seq_swa = -1; + int n_seq = -1; + int n_heads = -1; + int n_heads_kv = -1; + int head_size = -1; + int32_t * rope_params = nullptr; + std::vector swa_layers; + + // std::vector kv_names; + + bool can_reuse_dynamically(const ModelParams & other) const { + return n_seq == other.n_seq && n_heads == other.n_heads && n_heads_kv == other.n_heads_kv && + head_size == other.head_size && rope_params == other.rope_params && swa_layers == other.swa_layers; + } + + bool can_reuse_statically(const ModelParams & other) const { + return can_reuse_dynamically(other) && ctx_per_seq == other.ctx_per_seq && + ctx_per_seq_swa == other.ctx_per_seq_swa; + } +}; + +struct ComputeParams { + int n_seq_active = -1; + int seq_active_start = -1; + int attention_size = -1; + int attention_size_swa = -1; + int input_len = -1; + int token_len_per_seq = -1; + int past_kv_len = -1; + int output_len = -1; +}; + class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder 
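compute_llm_params above separates what is fixed for a model (ModelParams) from what changes per call (ComputeParams), and on the static path it pins the attention size to the full per-sequence context and the token length to 1. A condensed sketch of that derivation with hard-coded stand-in values (the real numbers are read off the FLASH_ATTN_EXT and ROPE nodes):

    #include <iostream>

    struct ModelParams { int ctx_per_seq = -1, n_seq = -1, ctx = -1; };
    struct ComputeParams { int attention_size = -1, token_len_per_seq = -1; };

    int main() {
        ModelParams mp;
        ComputeParams cp;
        mp.ctx_per_seq = 4096;     // KV cache rows per sequence
        mp.n_seq = 1;
        cp.attention_size = 512;   // mask width for this particular call
        cp.token_len_per_seq = 7;

        bool is_static = true;     // NPU path
        if (is_static) {
            // Static graphs always see the full context and one token per step.
            cp.attention_size = mp.ctx_per_seq;
            cp.token_len_per_seq = 1;
        }
        mp.ctx = mp.ctx_per_seq * mp.n_seq;
        std::cout << "ctx=" << mp.ctx << " attn=" << cp.attention_size << '\n';
    }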
{ public: struct NodeInfo { @@ -25,6 +61,8 @@ public: }; // Graph decoder GgmlOvDecoder(ggml_cgraph * cgraph, + ModelParams & model_params, + ComputeParams & compute_params, std::map> & model_weights, bool is_static, bool is_prefill = false, @@ -120,27 +158,28 @@ public: virtual const std::vector & get_model_output_names() const override { return m_model_output_names; } - virtual int get_ctx_size() const { return m_ctx; } + virtual int get_ctx_size() const { return m_model_params.ctx; } - virtual int get_ctx_swa_size() const { return m_ctx_swa; } + virtual int get_ctx_swa_size() const { return m_model_params.ctx_swa; } - virtual int get_ctx_per_seq() const { return m_ctx_per_seq; } + virtual int get_ctx_per_seq() const { return m_model_params.ctx_per_seq; } - virtual int get_ctx_per_seq_swa() const { return m_ctx_per_seq_swa; } + virtual int get_ctx_per_seq_swa() const { return m_model_params.ctx_per_seq_swa; } - virtual int get_n_seq() const { return m_n_seq; } + virtual int get_n_seq() const { return m_model_params.n_seq; } virtual int is_swa_layer(int layer) const override { - return std::find(m_swa_layers.begin(), m_swa_layers.end(), layer) != m_swa_layers.end(); + return std::find(m_model_params.swa_layers.begin(), m_model_params.swa_layers.end(), layer) != + m_model_params.swa_layers.end(); } - int get_past_kv_len() const { return m_past_kv_len; } + int get_past_kv_len() const { return m_compute_params.past_kv_len; } - int get_input_len() const { return m_input_len; } + int get_input_len() const { return m_compute_params.input_len; } - virtual int32_t * get_rope_params() const override { return m_rope_params; } + virtual int32_t * get_rope_params() const override { return m_model_params.rope_params; } - virtual std::map get_kv_param_res_names() const override; + // virtual std::map get_kv_param_res_names() const override; virtual bool is_static() const override { return m_is_static; } @@ -161,6 +200,16 @@ public: void clear_model_weights() { m_model_weights.clear(); } + static std::pair compute_llm_params(ggml_cgraph * cgraph, bool is_static); + + ModelParams get_model_params() const { return m_model_params; } + + ComputeParams get_compute_params() const { return m_compute_params; } + + void set_model_params(const ModelParams & model_params) { m_model_params = model_params; } + + void set_compute_params(const ComputeParams & compute_params) { m_compute_params = compute_params; } + bool m_is_static = false; bool m_is_prefill = false; int m_prefill_chunk_size = 0; @@ -174,7 +223,6 @@ private: int compute_op_case(const ggml_tensor * node); std::string compute_op_type(const ggml_tensor * node); - void set_llm_params(); void validate_cgraph() const; ggml_cgraph * m_cgraph = nullptr; @@ -191,27 +239,8 @@ private: std::vector m_model_output_names; std::vector m_node_info_list; - // Fixed for a model - int m_ctx = -1; - int m_ctx_swa = -1; - int m_ctx_per_seq = -1; - int m_ctx_per_seq_swa = -1; - int m_n_seq = -1; - int m_n_heads = -1; - int m_n_heads_kv = -1; - int m_head_size = -1; - std::vector m_swa_layers; - std::vector m_kv_names; - - // Changed per inference - int m_n_seq_active = -1; - int m_seq_active_start = -1; - int m_attention_size = -1; - int m_attention_size_swa = -1; - int m_input_len = -1; - int m_token_len_per_seq = -1; - int m_past_kv_len = -1; - int32_t * m_rope_params = nullptr; + ModelParams m_model_params; + ComputeParams m_compute_params; }; void print_tensor_address_map(const ggml_cgraph * cgraph); diff --git a/ggml/src/ggml-openvino/openvino/decoder.hpp 
b/ggml/src/ggml-openvino/openvino/decoder.hpp
index 1d5b7a850f..9c455a3724 100644
--- a/ggml/src/ggml-openvino/openvino/decoder.hpp
+++ b/ggml/src/ggml-openvino/openvino/decoder.hpp
@@ -75,7 +75,7 @@ public:
     virtual const std::vector<std::string>& get_model_output_names() const = 0;
 
     virtual int32_t* get_rope_params() const = 0;
-    virtual std::map<std::string, std::string> get_kv_param_res_names() const = 0;
+    // virtual std::map<std::string, std::string> get_kv_param_res_names() const = 0;
 
     virtual bool is_static() const = 0;
 
diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp
index ae8916cc58..e90073a1f2 100644
--- a/ggml/src/ggml-openvino/utils.cpp
+++ b/ggml/src/ggml-openvino/utils.cpp
@@ -79,16 +79,21 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, ggml_cgraph *
     }
 
     static std::mutex cache_mutex;
-    static std::unordered_map<ggml_cgraph *, std::shared_ptr<ov::InferRequest>> infer_request_cache;
-    static std::unordered_map<ggml_cgraph *, std::shared_ptr<ov::InferRequest>> infer_request_cache_prefill;
-    static std::unordered_map<ggml_cgraph *, std::vector<std::string>> ov_input_names_cache;
-    static std::unordered_map<ggml_cgraph *, std::vector<std::string>> ov_output_names_cache;
+    static std::unordered_map<graph_key, std::shared_ptr<GgmlOvDecoder>, graph_key_hash> decoder_cache;
+    static std::unordered_map<graph_key, std::shared_ptr<ov::InferRequest>, graph_key_hash> infer_request_cache;
+    static std::unordered_map<graph_key, std::shared_ptr<ov::InferRequest>, graph_key_hash> infer_request_cache_prefill;
+    static std::unordered_map<graph_key, std::vector<std::string>, graph_key_hash> ov_input_names_cache;
+    static std::unordered_map<graph_key, std::vector<std::string>, graph_key_hash> ov_output_names_cache;
 
     std::shared_ptr<GgmlOvDecoder> ggml_decoder;
     std::shared_ptr<ov::InferRequest> infer_request;
+    ModelParams m_params;
+    ComputeParams c_params;
+    std::tie(m_params, c_params) = GgmlOvDecoder::compute_llm_params(cgraph, is_static);
 
     const auto * inp_pos = get_inp_pos_tensor(cgraph);
     const auto is_prefill = get_is_prefill(inp_pos);
+    const auto key = compute_graph_key(cgraph);
 
     int64_t decoder_end_time;
     int64_t conversion_end_time;
@@ -98,25 +103,34 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, ggml_cgraph *
     {
         std::lock_guard<std::mutex> lock(cache_mutex);
 
-        auto it = infer_request_cache.find(cgraph);
-        if (it != infer_request_cache.end()) {
+        auto it = decoder_cache.find(key);
+
+        auto cache_hit = it != decoder_cache.end();
+        if (cache_hit) {
+            ggml_decoder = it->second;
+            cache_hit = is_static ? ggml_decoder->get_model_params().can_reuse_statically(m_params) :
+                                    ggml_decoder->get_model_params().can_reuse_dynamically(m_params);
+        }
+
+        if (cache_hit) {
             std::map<std::string, std::shared_ptr<ov::Node>> model_weights;
-            ggml_decoder =
-                std::make_shared<GgmlOvDecoder>(cgraph, model_weights, is_static, is_prefill, prefill_chunk_size);
+            ggml_decoder = std::make_shared<GgmlOvDecoder>(cgraph, m_params, c_params, model_weights, is_static,
+                                                           is_prefill, prefill_chunk_size);
+            decoder_cache[key] = ggml_decoder;
             decoder_end_time = ggml_time_us();
 
-            infer_request = infer_request_cache[cgraph];
-            if (is_static && is_prefill) {
-                infer_request = infer_request_cache_prefill[cgraph];
-            }
+            infer_request = is_static && is_prefill ?
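The cache-hit test above is two-tiered: a dynamic-shape model is reusable whenever the head/sequence geometry matches, while a static-shape one additionally needs identical per-sequence context sizes. A minimal sketch of those predicates (the rope_params comparison is omitted for brevity):

    #include <iostream>
    #include <vector>

    struct ModelParams {
        int ctx_per_seq = -1, ctx_per_seq_swa = -1;
        int n_seq = -1, n_heads = -1, n_heads_kv = -1, head_size = -1;
        std::vector<int> swa_layers;

        bool can_reuse_dynamically(const ModelParams & o) const {
            return n_seq == o.n_seq && n_heads == o.n_heads && n_heads_kv == o.n_heads_kv &&
                   head_size == o.head_size && swa_layers == o.swa_layers;
        }
        bool can_reuse_statically(const ModelParams & o) const {
            return can_reuse_dynamically(o) && ctx_per_seq == o.ctx_per_seq &&
                   ctx_per_seq_swa == o.ctx_per_seq_swa;
        }
    };

    int main() {
        ModelParams cached{4096, -1, 1, 32, 8, 128, {}};
        ModelParams incoming = cached;
        incoming.ctx_per_seq = 8192;  // e.g. the context was re-sized between runs
        std::cout << "dynamic reuse: " << cached.can_reuse_dynamically(incoming)
                  << ", static reuse: " << cached.can_reuse_statically(incoming) << '\n';  // 1, 0
    }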
infer_request_cache_prefill[key] : infer_request_cache[key]; conversion_end_time = ggml_time_us(); compile_end_time = conversion_end_time; } else { + infer_request_cache.erase(key); + infer_request_cache_prefill.erase(key); + std::shared_ptr model; auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph, get_types_to_requant(device)); if (!is_static) { - ggml_decoder = std::make_shared(cgraph, model_weights, is_static); + ggml_decoder = std::make_shared(cgraph, m_params, c_params, model_weights, is_static); decoder_end_time = ggml_time_us(); auto input_model = std::make_shared(ggml_decoder); @@ -133,13 +147,14 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, ggml_cgraph * auto compiled_model = core.compile_model(model, device, get_ov_compile_config(device)); compile_end_time = ggml_time_us(); - infer_request_cache[cgraph] = std::make_shared(compiled_model.create_infer_request()); - infer_request = infer_request_cache[cgraph]; + infer_request = std::make_shared(compiled_model.create_infer_request()); + infer_request_cache[key] = infer_request; + decoder_cache[key] = ggml_decoder; } else { - auto ggml_decoder_prefill = - std::make_shared(cgraph, model_weights, is_static, true, prefill_chunk_size); - auto ggml_decoder_decode = - std::make_shared(cgraph, model_weights, is_static, false, prefill_chunk_size); + auto ggml_decoder_prefill = std::make_shared(cgraph, m_params, c_params, model_weights, + is_static, true, prefill_chunk_size); + auto ggml_decoder_decode = std::make_shared(cgraph, m_params, c_params, model_weights, + is_static, false, prefill_chunk_size); decoder_end_time = ggml_time_us(); auto input_model_prefill = std::make_shared(ggml_decoder_prefill); @@ -162,15 +177,17 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, ggml_cgraph * auto compiled_model_prefill = core.compile_model(model_prefill, device, get_ov_compile_config(device)); auto compiled_model_decode = core.compile_model(model_decode, device, get_ov_compile_config(device)); - infer_request_cache_prefill[cgraph] = + + infer_request_cache_prefill[key] = std::make_shared(compiled_model_prefill.create_infer_request()); - infer_request_cache[cgraph] = + infer_request_cache[key] = std::make_shared(compiled_model_decode.create_infer_request()); compile_end_time = ggml_time_us(); model = is_prefill ? model_prefill : model_decode; ggml_decoder = is_prefill ? ggml_decoder_prefill : ggml_decoder_decode; - infer_request = is_prefill ? infer_request_cache_prefill[cgraph] : infer_request_cache[cgraph]; + infer_request = is_prefill ? 
infer_request_cache_prefill[key] : infer_request_cache[key]; + decoder_cache[key] = ggml_decoder; } std::vector ov_input_names; @@ -181,8 +198,8 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, ggml_cgraph * for (const auto & ov_output : model->get_results()) { ov_output_names.push_back(ov_output->get_friendly_name()); } - ov_input_names_cache[cgraph] = ov_input_names; - ov_output_names_cache[cgraph] = ov_output_names; + ov_input_names_cache[key] = ov_input_names; + ov_output_names_cache[key] = ov_output_names; // Set output tensors (for NPU) and kvcache i/o tensors once and for all // Note: does not seem to improve perf on CPU/GPU, but breaks llama-bench, so disabled it for CPU/GPU @@ -205,8 +222,8 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, ggml_cgraph * } } - auto ov_input_names = ov_input_names_cache[cgraph]; - auto ov_output_names = ov_output_names_cache[cgraph]; + auto ov_input_names = ov_input_names_cache[key]; + auto ov_output_names = ov_output_names_cache[key]; if (!is_static) { for (size_t i = 0; i < ov_input_names.size(); i++) { @@ -675,4 +692,19 @@ bool get_is_prefill(const ggml_tensor * inp_pos) { return inp_pos->ne[0] > 1; } +graph_key compute_graph_key(ggml_cgraph * cgraph) { + graph_key key; + key.n_nodes = cgraph->n_nodes; + + if (cgraph->n_nodes > 0) { + key.first_node_name = std::string(cgraph->nodes[0]->name); + key.last_node_name = std::string(cgraph->nodes[cgraph->n_nodes - 1]->name); + } else { + key.first_node_name = ""; + key.last_node_name = ""; + } + + return key; +} + #pragma GCC diagnostic pop diff --git a/ggml/src/ggml-openvino/utils.h b/ggml/src/ggml-openvino/utils.h index 31f86d0999..dca74f8afc 100644 --- a/ggml/src/ggml-openvino/utils.h +++ b/ggml/src/ggml-openvino/utils.h @@ -6,6 +6,26 @@ #include #include +struct graph_key { + size_t n_nodes; + std::string first_node_name; + std::string last_node_name; + + bool operator==(const graph_key & other) const { + return n_nodes == other.n_nodes && first_node_name == other.first_node_name && + last_node_name == other.last_node_name; + } +}; + +struct graph_key_hash { + size_t operator()(const graph_key & key) const { + size_t h = std::hash{}(key.n_nodes); + h ^= std::hash{}(key.first_node_name) + 0x9e3779b9 + (h << 6) + (h >> 2); + h ^= std::hash{}(key.last_node_name) + 0x9e3779b9 + (h << 6) + (h >> 2); + return h; + } +}; + enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph); size_t checksum(const void * data, size_t size); @@ -46,6 +66,8 @@ const ggml_tensor * get_inp_pos_tensor(struct ggml_cgraph * cgraph); bool get_is_prefill(const ggml_tensor * inp_pos); +graph_key compute_graph_key(struct ggml_cgraph * cgraph); + ov::AnyMap get_ov_compile_config(const std::string & device); std::map get_types_to_requant(const std::string & device); From 65348b5d2029117347721d664ba3f5a00458a9d2 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Thu, 27 Nov 2025 15:52:20 +0800 Subject: [PATCH 179/254] fallback naive run with accuracy issue --- ggml/src/ggml-openvino/ggml-decoder.cpp | 97 ++++++++++--------- ggml/src/ggml-openvino/ggml-decoder.h | 37 ++++--- ggml/src/ggml-openvino/openvino/decoder.hpp | 6 +- .../ggml-openvino/openvino/node_context.hpp | 8 +- ggml/src/ggml-openvino/openvino/op/cont.cpp | 2 +- .../openvino/op/flash_attn_ext.cpp | 2 +- .../ggml-openvino/openvino/op/glu_geglu.cpp | 2 +- .../ggml-openvino/openvino/op/glu_swiglu.cpp | 2 +- .../src/ggml-openvino/openvino/op/permute.cpp | 29 ++++-- 
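graph_key above identifies a cgraph by its node count plus first and last node names, mixed with the boost-style hash combiner (0x9e3779b9 with shifts). A self-contained restatement that checks key equality implies hash equality:

    #include <functional>
    #include <iostream>
    #include <string>

    struct graph_key {
        size_t n_nodes;
        std::string first_node_name, last_node_name;
        bool operator==(const graph_key & o) const {
            return n_nodes == o.n_nodes && first_node_name == o.first_node_name &&
                   last_node_name == o.last_node_name;
        }
    };

    struct graph_key_hash {
        size_t operator()(const graph_key & k) const {
            size_t h = std::hash<size_t>{}(k.n_nodes);
            h ^= std::hash<std::string>{}(k.first_node_name) + 0x9e3779b9 + (h << 6) + (h >> 2);
            h ^= std::hash<std::string>{}(k.last_node_name) + 0x9e3779b9 + (h << 6) + (h >> 2);
            return h;
        }
    };

    int main() {
        graph_key a{310, "inp_embd", "result_output"};  // illustrative names
        graph_key b = a;
        std::cout << (a == b) << ' ' << (graph_key_hash{}(a) == graph_key_hash{}(b)) << '\n';  // 1 1
    }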
.../src/ggml-openvino/openvino/op/reshape.cpp | 10 +- .../ggml-openvino/openvino/op/rms_norm.cpp | 2 +- ggml/src/ggml-openvino/openvino/op/rope.cpp | 4 +- ggml/src/ggml-openvino/openvino/op/scale.cpp | 2 +- .../ggml-openvino/openvino/op/set_rows.cpp | 2 +- .../src/ggml-openvino/openvino/op/softmax.cpp | 2 +- ggml/src/ggml-openvino/openvino/op/view.cpp | 2 +- .../openvino/translate_session.cpp | 11 ++- ggml/src/ggml-openvino/utils.cpp | 12 +-- 18 files changed, 134 insertions(+), 98 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 4c0258c4e3..ece1fc8a54 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -89,32 +89,60 @@ GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph, std::map data_addr_map; + std::unordered_set output_name_set; + for (const auto & node_info : m_node_info_list) { + for (const auto & it : node_info.node_inputs) { + const auto & src_name = it.first; + const auto & src_node = it.second; + + if (output_name_set.find(src_name) == output_name_set.end() && + m_model_weights.find(src_name) == m_model_weights.end() && + m_model_inputs.find(src_name) == m_model_inputs.end()) { + auto param_node = + std::make_shared(get_ov_type(src_node), ov::Shape(get_shape(src_node))); + param_node->set_friendly_name(src_name); + param_node->output(0).get_tensor().set_names({src_name}); + m_model_inputs[src_name] = param_node; + } + } + output_name_set.emplace(node_info.node_output_name); + data_addr_map[node_info.data_addr] = node_info.node_output; + } + for (const auto & it : data_addr_map) { + // No need to add view tensors as model outputs + if (it.second->op != GGML_OP_VIEW) { + m_model_outputs[std::string(it.second->name)] = it.second; + } + } } -// Called in GgmlOvDecoder constructor. Two cases: 1. constructing a decoder for the whole graph; -// 2. constructing a decoder for a node; -// 3. constructing a decoder for the whole graph naively (op test case) void GgmlOvDecoder::set_input_output(ggml_tensor * node, bool naive) { - std::string node_name; NodeInfo current_node_info; + auto node_name = std::string(node->name); + auto node_output_name = node_name; + auto * node_output = node; if (node->op == GGML_OP_SET_ROWS) { // SET_ROWS updates the tensor in place. 
For later ov op that uses the // the view_src of SET_ROWS, we need to make sure they get the updated tensor // by putting the view_src name in the tensor_map in // /src/frontends/ggml/src/translate_session.cpp - node_name = std::string(node->view_src->name); - } else { - node_name = std::string(node->name); + node_output_name = std::string(node->view_src->name); + node_output = node->view_src; } - m_output_names.push_back(node_name); - m_outputs[node_name] = node; + m_output_names.push_back(node_output_name); + m_outputs[node_output_name] = node_output; current_node_info.node = node; current_node_info.node_name = node_name; - current_node_info.node_outputs[node_name] = node; - current_node_info.node_outputs_names.push_back(node_name); + current_node_info.node_output = node_output; + current_node_info.node_output_name = node_output_name; current_node_info.node_op_case = 0; + current_node_info.data_addr = node->data; for (int i = 0; i < GGML_MAX_SRC; i++) { auto * src = node->src[i]; @@ -127,17 +155,8 @@ void GgmlOvDecoder::set_input_output(ggml_tensor * node, bool naive) { current_node_info.node_inputs[src_name] = src; current_node_info.node_inputs_names.push_back(src_name); - // Add model inputs and weights constants, if called for the whole graph - if (naive) { - if (m_model_weights.find(src_name) == m_model_weights.end()) { - auto param_node = - std::make_shared(get_ov_type(src), get_graph_input_shape(node, src)); - param_node->set_friendly_name(src_name); - param_node->output(0).get_tensor().set_names({src_name}); - m_model_inputs[src_name] = param_node; - } - - } else if (!src->view_src) { + // Add model inputs + if (!naive && !src->view_src) { ggml_backend_buffer * buffer = src->buffer; if (buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY || src->flags & GGML_TENSOR_FLAG_INPUT) { @@ -157,18 +176,15 @@ void GgmlOvDecoder::set_input_output(ggml_tensor * node, bool naive) { } } - // Add model outputs, if called for the whole graph - if (naive) { - m_model_output_names.push_back(node_name); - } else { + // Add model outputs + if (!naive) { // Model outputs are tensors with GGML_TENSOR_FLAG_OUTPUT flag and kv_caches static std::set debug_output_names = {}; // Workaround: the final tensor "result_output" does not have GGML_TENSOR_FLAG_OUTPUT flag set in cgraph if (node->op == GGML_OP_SET_ROWS || node->flags & GGML_TENSOR_FLAG_OUTPUT || - node_name.find("output") != std::string::npos || debug_output_names.count(node_name)) { - if (auto it = std::find(m_model_output_names.begin(), m_model_output_names.end(), node_name); - it == m_model_output_names.end()) { - m_model_output_names.push_back(node_name); + node_output_name.find("output") != std::string::npos || debug_output_names.count(node_output_name)) { + if (m_model_outputs.find(node_output_name) == m_model_outputs.end()) { + m_model_outputs[node_output_name] = node_output; } } } @@ -176,7 +192,7 @@ void GgmlOvDecoder::set_input_output(ggml_tensor * node, bool naive) { m_node_info_list.push_back(current_node_info); } -int GgmlOvDecoder::compute_op_case(const ggml_tensor * node) { +int GgmlOvDecoder::compute_op_case(const ggml_tensor * node) const { int op_case = 0; switch (node->op) { case GGML_OP_RESHAPE: { @@ -370,9 +386,6 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor * op, co int len = m_is_static ? (m_is_prefill ? 
m_prefill_chunk_size : 1) : -1; input_shape = ov::PartialShape{1, 1, 1, len}; - } else if (input->op == GGML_OP_VIEW) { - // This case is added to make test-backend-ops work - input_shape = ov::PartialShape{get_shape(input->view_src)}; } else { input_shape = ov::PartialShape{get_shape(input)}; } @@ -762,17 +775,11 @@ std::vector GgmlOvDecoder::get_output_stride(const std::string & name) c ov::PartialShape GgmlOvDecoder::get_output_shape(const std::string & name) const { auto * ggml_tensor = m_outputs.at(name); - if (ggml_tensor->op == GGML_OP_SET_ROWS) { - ggml_tensor = ggml_tensor->view_src; - } return ov::PartialShape(get_shape(ggml_tensor)); } -ov::PartialShape GgmlOvDecoder::get_output_shape(int node_idx, const std::string & name) const { - auto * ggml_tensor = m_node_info_list[node_idx].node_outputs.at(name); - if (ggml_tensor->op == GGML_OP_SET_ROWS) { - ggml_tensor = ggml_tensor->view_src; - } +ov::PartialShape GgmlOvDecoder::get_output_shape(int node_idx) const { + auto * ggml_tensor = m_node_info_list[node_idx].node_output; return ov::PartialShape(get_shape(ggml_tensor)); } @@ -785,7 +792,7 @@ std::vector GgmlOvDecoder::get_output_names() const { } std::vector GgmlOvDecoder::get_output_names(int node_idx) const { - return m_node_info_list[node_idx].node_outputs_names; + return {m_node_info_list[node_idx].node_output_name}; } const std::string & GgmlOvDecoder::get_op_name() const { @@ -809,8 +816,8 @@ int32_t * GgmlOvDecoder::get_output_op_params(const std::string & name) const { return m_outputs.at(name)->op_params; } -int32_t * GgmlOvDecoder::get_output_op_params(int node_idx, const std::string & name) const { - return m_node_info_list[node_idx].node_outputs.at(name)->op_params; +int32_t * GgmlOvDecoder::get_output_op_params(int node_idx) const { + return m_node_info_list[node_idx].node->op_params; } void GgmlOvDecoder::visit_subgraph(std::function, int node_idx)> node_visitor) const { diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index f2efb65a23..8e680b5c20 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -51,13 +51,14 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { public: struct NodeInfo { ggml_tensor * node; + std::string node_name; + std::string node_op_type; std::map node_inputs; std::vector node_inputs_names; - std::map node_outputs; - std::vector node_outputs_names; + ggml_tensor * node_output; + std::string node_output_name; int node_op_case = 0; - std::string node_op_type; - std::string node_name; + void * data_addr; }; // Graph decoder GgmlOvDecoder(ggml_cgraph * cgraph, @@ -106,7 +107,7 @@ public: virtual ov::PartialShape get_output_shape(const std::string & name) const override; - virtual ov::PartialShape get_output_shape(int node_idx, const std::string & name) const override; + virtual ov::PartialShape get_output_shape(int node_idx) const override; virtual std::vector get_output_stride(const std::string & name) const override; @@ -118,7 +119,7 @@ public: virtual int32_t * get_output_op_params(const std::string & name) const override; - virtual int32_t * get_output_op_params(int node_idx, const std::string & name) const override; + virtual int32_t * get_output_op_params(int node_idx) const override; virtual std::vector get_output_names() const override; @@ -156,7 +157,16 @@ public: return m_model_weights; } - virtual const std::vector & get_model_output_names() const override { return m_model_output_names; } + virtual std::vector get_model_output_names() 
const override { + std::vector output_names; + output_names.reserve(m_model_outputs.size()); + for (const auto & [name, tensor] : m_model_outputs) { + output_names.push_back(name); + } + return output_names; + } + + const std::map & get_model_outputs() const { return m_model_outputs; } virtual int get_ctx_size() const { return m_model_params.ctx; } @@ -214,14 +224,15 @@ public: bool m_is_prefill = false; int m_prefill_chunk_size = 0; -private: - void set_input_output(ggml_tensor * node, bool naive = false); - void add_extra_inputs(); static std::vector get_shape(const ggml_tensor * tensor); static std::vector get_stride(const ggml_tensor * tensor); static ov::element::Type get_ov_type(const ggml_tensor * tensor); - int compute_op_case(const ggml_tensor * node); - std::string compute_op_type(const ggml_tensor * node); + static std::string compute_op_type(const ggml_tensor * node); + +private: + void set_input_output(ggml_tensor * node, bool naive = false); + void add_extra_inputs(); + int compute_op_case(const ggml_tensor * node) const; void validate_cgraph() const; @@ -236,7 +247,7 @@ private: std::map> m_model_extra_inputs; std::map> m_model_extra_input_values; std::map> m_model_weights; - std::vector m_model_output_names; + std::map m_model_outputs; std::vector m_node_info_list; ModelParams m_model_params; diff --git a/ggml/src/ggml-openvino/openvino/decoder.hpp b/ggml/src/ggml-openvino/openvino/decoder.hpp index 9c455a3724..2ecc4401df 100644 --- a/ggml/src/ggml-openvino/openvino/decoder.hpp +++ b/ggml/src/ggml-openvino/openvino/decoder.hpp @@ -39,7 +39,7 @@ public: virtual PartialShape get_output_shape(const std::string& name) const = 0; - virtual PartialShape get_output_shape(int node_idx, const std::string& name) const = 0; + virtual PartialShape get_output_shape(int node_idx) const = 0; virtual std::vector get_output_stride(const std::string& name) const = 0; @@ -51,7 +51,7 @@ public: virtual int32_t* get_output_op_params(const std::string& name) const = 0; - virtual int32_t* get_output_op_params(int node_idx, const std::string& name) const = 0; + virtual int32_t * get_output_op_params(int node_idx) const = 0; virtual std::vector get_output_names() const = 0; @@ -72,7 +72,7 @@ public: virtual const std::map>& get_model_inputs() const = 0; virtual const std::map>& get_model_extra_inputs() const = 0; virtual const std::map>& get_model_weights() const = 0; - virtual const std::vector& get_model_output_names() const = 0; + virtual std::vector get_model_output_names() const = 0; virtual int32_t* get_rope_params() const = 0; // virtual std::map get_kv_param_res_names() const = 0; diff --git a/ggml/src/ggml-openvino/openvino/node_context.hpp b/ggml/src/ggml-openvino/openvino/node_context.hpp index 64e3d550c5..42d950c3eb 100644 --- a/ggml/src/ggml-openvino/openvino/node_context.hpp +++ b/ggml/src/ggml-openvino/openvino/node_context.hpp @@ -53,17 +53,13 @@ public: std::string get_output_name() const { return m_output_names[0]; } - PartialShape get_output_shape(size_t index) const { - return m_decoder->get_output_shape(m_node_idx, m_output_names[index]); - } + PartialShape get_output_shape() const { return m_decoder->get_output_shape(m_node_idx); } int32_t* get_input_op_params(size_t index) const { return m_decoder->get_input_op_params(m_node_idx, m_input_names[index]); } - int32_t* get_output_op_params(size_t index) const { - return m_decoder->get_output_op_params(m_node_idx, m_output_names[index]); - } + int32_t * get_output_op_params() const { return 
m_decoder->get_output_op_params(m_node_idx); } ov::element::Type get_output_type(size_t index) const { return m_decoder->get_output_type(m_output_names[index]); diff --git a/ggml/src/ggml-openvino/openvino/op/cont.cpp b/ggml/src/ggml-openvino/openvino/op/cont.cpp index 618b4efdea..d4c47d4bf1 100644 --- a/ggml/src/ggml-openvino/openvino/op/cont.cpp +++ b/ggml/src/ggml-openvino/openvino/op/cont.cpp @@ -22,7 +22,7 @@ OutputVector translate_cont(const NodeContext & context) { FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2 || op_case == 3, "Unsupported CONT case"); auto src_shape = context.get_input_shape(0).to_shape(); - auto dst_shape = context.get_output_shape(0).to_shape(); + auto dst_shape = context.get_output_shape().to_shape(); ov::Output res; if (op_case == 1) { diff --git a/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp b/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp index efbdf421c6..342da882aa 100644 --- a/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp +++ b/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp @@ -26,7 +26,7 @@ OutputVector translate_flash_attn_ext(const NodeContext & context) { auto v = context.get_input(2); auto mask = context.get_input(3); - float * params = reinterpret_cast(context.get_output_op_params(0)); + float * params = reinterpret_cast(context.get_output_op_params()); float scale = params[0]; // float max_bias = params[1]; // float logit_softcap = params[2]; diff --git a/ggml/src/ggml-openvino/openvino/op/glu_geglu.cpp b/ggml/src/ggml-openvino/openvino/op/glu_geglu.cpp index 80bfbafd83..ad5cd3f6ba 100644 --- a/ggml/src/ggml-openvino/openvino/op/glu_geglu.cpp +++ b/ggml/src/ggml-openvino/openvino/op/glu_geglu.cpp @@ -32,7 +32,7 @@ OutputVector translate_glu_geglu(const NodeContext & context) { src1 = split->output(1); } - int32_t * params = context.get_output_op_params(0); + int32_t * params = context.get_output_op_params(); const int32_t swapped = params[1]; if (swapped) { std::swap(src0, src1); diff --git a/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp b/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp index 2148931246..2b7f13629f 100644 --- a/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp +++ b/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp @@ -32,7 +32,7 @@ OutputVector translate_glu_swiglu(const NodeContext & context) { src1 = split->output(1); } - int32_t * params = context.get_output_op_params(0); + int32_t * params = context.get_output_op_params(); const int32_t swapped = params[1]; if (swapped) { std::swap(src0, src1); diff --git a/ggml/src/ggml-openvino/openvino/op/permute.cpp b/ggml/src/ggml-openvino/openvino/op/permute.cpp index d156e48e3c..bfe09a2b84 100644 --- a/ggml/src/ggml-openvino/openvino/op/permute.cpp +++ b/ggml/src/ggml-openvino/openvino/op/permute.cpp @@ -32,10 +32,12 @@ OutputVector translate_permute(const NodeContext & context) { if (op_case == 1) { res = std::make_shared(src, perm); } else if (op_case == 4) { - auto output_shape = context.get_output_shape(0).to_shape(); + auto output_shape = context.get_output_shape().to_shape(); auto n_heads = ov::op::v0::Constant::create(ov::element::i64, {1}, {output_shape[1]}); auto head_size = ov::op::v0::Constant::create(ov::element::i64, {1}, {output_shape[3]}); - auto n_seq_active = context.get_input("n_seq_active"); + auto n_seq_active = context.has_input("n_seq_active") ? 
+ context.get_input("n_seq_active") : + ov::op::v0::Constant::create(ov::element::i64, {1}, {output_shape[0]}); auto neg_one = ov::op::v0::Constant::create(ov::element::i64, {1}, {-1}); auto new_shape = @@ -49,26 +51,39 @@ OutputVector translate_permute(const NodeContext & context) { res = std::make_shared(reshaped, perm); } else { auto cache_shape = src.get_partial_shape(); - auto output_shape = context.get_output_shape(0).to_shape(); + auto output_shape = context.get_output_shape().to_shape(); int64_t head_size = output_shape[3]; int64_t n_heads = output_shape[1]; int64_t ctx_per_seq = cache_shape[2].is_static() ? cache_shape[2].get_length() : -1; int64_t n_seq = cache_shape[1].get_length(); Output attention_size; - if (op_case == 2) { + if (!context.has_input("attention_size")) { + attention_size = ov::op::v0::Constant::create(ov::element::i64, {1}, {output_shape[2]}); + } else if (op_case == 2) { attention_size = context.get_input("attention_size"); } else { attention_size = context.get_input("attention_size_swa"); } + Output seq_active_start; + Output seq_active_end; + if (context.has_input("seq_active_start")) { + seq_active_start = context.get_input("seq_active_start"); + seq_active_end = context.get_input("seq_active_end"); + } else { + int64_t n_seq_active = output_shape[0]; + size_t offset = *((size_t *) context.get_input_op_params(0)); + int64_t seq_active_start_val = offset / context.get_input_stride(0)[0]; + int64_t seq_active_end_val = seq_active_start_val + n_seq_active; + seq_active_start = ov::op::v0::Constant::create(ov::element::i64, {1}, {seq_active_start_val}); + seq_active_end = ov::op::v0::Constant::create(ov::element::i64, {1}, {seq_active_end_val}); + } + // 1. reshape to [n_seq, ctx_per_seq, n_heads, head_size] // 2. slice out the active sequences // 3. slice out the attention part in each sequence // 4. 
permute - auto seq_active_start = context.get_input("seq_active_start"); - auto seq_active_end = context.get_input("seq_active_end"); - auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); diff --git a/ggml/src/ggml-openvino/openvino/op/reshape.cpp b/ggml/src/ggml-openvino/openvino/op/reshape.cpp index b34fa626f1..e26a8c778c 100644 --- a/ggml/src/ggml-openvino/openvino/op/reshape.cpp +++ b/ggml/src/ggml-openvino/openvino/op/reshape.cpp @@ -20,7 +20,7 @@ namespace op { OutputVector translate_reshape(const NodeContext & context) { num_inputs_check(context, 1, 1); - if (context.get_input_shape(0) == context.get_output_shape(0)) { + if (context.get_input_shape(0) == context.get_output_shape()) { return {context.get_input(0)}; } @@ -29,7 +29,7 @@ OutputVector translate_reshape(const NodeContext & context) { op_case == 1 || op_case == 2 || op_case == 3 || op_case == 4 || op_case == 5 || op_case == 6, "Unsupported RESHAPE case"); - auto output_shape = context.get_output_shape(0).to_shape(); + auto output_shape = context.get_output_shape().to_shape(); std::shared_ptr new_shape_node; if (op_case == 1) { new_shape_node = ov::op::v0::Constant::create( @@ -50,18 +50,18 @@ OutputVector translate_reshape(const NodeContext & context) { return {context.get_input(0).get_node_shared_ptr()->input_value(0)}; } else if (op_case == 5) { - std::vector shape_vec = {1, 1, -1, (int64_t) context.get_output_shape(0).to_shape()[3]}; + std::vector shape_vec = {1, 1, -1, (int64_t) context.get_output_shape().to_shape()[3]}; new_shape_node = ov::op::v0::Constant::create(ov::element::i64, {4}, shape_vec); // // Alternative // auto token_len = context.get_input("token_len"); // auto emb_size = - // ov::op::v0::Constant::create(ov::element::i64, {1}, {(int64_t) context.get_output_shape(0).to_shape()[3]}); + // ov::op::v0::Constant::create(ov::element::i64, {1}, {(int64_t) context.get_output_shape().to_shape()[3]}); // auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); // new_shape_node = std::make_shared(ov::OutputVector{one, one, token_len, emb_size}, 0); } else if (op_case == 6) { - new_shape_node = ov::op::v0::Constant::create(ov::element::i64, {4}, context.get_output_shape(0).to_shape()); + new_shape_node = ov::op::v0::Constant::create(ov::element::i64, {4}, context.get_output_shape().to_shape()); } auto res = std::make_shared(context.get_input(0), new_shape_node, false); return rename_outputs_with_suffix({res}, context.get_name()); diff --git a/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp b/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp index 3ac96d0c22..99c97e06ae 100644 --- a/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp +++ b/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp @@ -27,7 +27,7 @@ OutputVector translate_rms_norm(const NodeContext & context) { square, ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {-1}), true); float eps; - memcpy(&eps, context.get_output_op_params(0), sizeof(float)); + memcpy(&eps, context.get_output_op_params(), sizeof(float)); auto rms = std::make_shared( std::make_shared(mean, ov::op::v0::Constant::create(ov::element::f32, ov::Shape{1}, {eps}))); diff --git a/ggml/src/ggml-openvino/openvino/op/rope.cpp b/ggml/src/ggml-openvino/openvino/op/rope.cpp index 5c83867d18..96fbb6b795 100644 --- a/ggml/src/ggml-openvino/openvino/op/rope.cpp +++ b/ggml/src/ggml-openvino/openvino/op/rope.cpp @@ -31,8 +31,8 @@ OutputVector translate_rope(const NodeContext & context) { 
ov::Output res; auto data_node = context.get_input(0).get_node_shared_ptr(); - auto output_shape = context.get_output_shape(0).to_shape(); - int32_t * op_params = context.get_output_op_params(0); + auto output_shape = context.get_output_shape().to_shape(); + int32_t * op_params = context.get_output_op_params(); Output cos_theta_node; Output sin_theta_node; diff --git a/ggml/src/ggml-openvino/openvino/op/scale.cpp b/ggml/src/ggml-openvino/openvino/op/scale.cpp index f52381786a..01e59cedd9 100644 --- a/ggml/src/ggml-openvino/openvino/op/scale.cpp +++ b/ggml/src/ggml-openvino/openvino/op/scale.cpp @@ -15,7 +15,7 @@ OutputVector translate_scale(const NodeContext & context) { num_inputs_check(context, 1, 1); float scale; - memcpy(&scale, context.get_output_op_params(0), sizeof(float)); + memcpy(&scale, context.get_output_op_params(), sizeof(float)); auto scale_node = std::make_shared(ov::element::f32, ov::Shape{}, std::vector{scale}); auto res = std::make_shared(context.get_input(0), scale_node); diff --git a/ggml/src/ggml-openvino/openvino/op/set_rows.cpp b/ggml/src/ggml-openvino/openvino/op/set_rows.cpp index a323e5ed38..eb128f04a3 100644 --- a/ggml/src/ggml-openvino/openvino/op/set_rows.cpp +++ b/ggml/src/ggml-openvino/openvino/op/set_rows.cpp @@ -34,7 +34,7 @@ OutputVector translate_set_rows(const NodeContext & context) { data = std::make_shared(data, context.get_output_type(0)); - auto dst_shape = context.get_output_shape(0).to_shape(); + auto dst_shape = context.get_output_shape().to_shape(); auto ind_squeezed = std::make_shared(indices, ov::op::v0::Constant::create(ov::element::i64, {3}, {0, 1, 2})); diff --git a/ggml/src/ggml-openvino/openvino/op/softmax.cpp b/ggml/src/ggml-openvino/openvino/op/softmax.cpp index 591bcb46c4..921475e51a 100644 --- a/ggml/src/ggml-openvino/openvino/op/softmax.cpp +++ b/ggml/src/ggml-openvino/openvino/op/softmax.cpp @@ -31,7 +31,7 @@ OutputVector translate_soft_max(const NodeContext & context) { float scale = 1.0f; float max_bias = 0.0f; - auto * op_params = context.get_output_op_params(0); + auto * op_params = context.get_output_op_params(); memcpy(&scale, (float *) op_params + 0, sizeof(float)); memcpy(&max_bias, (float *) op_params + 1, sizeof(float)); auto src0_shape = context.get_input_shape(0).get_shape(); diff --git a/ggml/src/ggml-openvino/openvino/op/view.cpp b/ggml/src/ggml-openvino/openvino/op/view.cpp index 6bf980cab6..f0b8938bef 100644 --- a/ggml/src/ggml-openvino/openvino/op/view.cpp +++ b/ggml/src/ggml-openvino/openvino/op/view.cpp @@ -10,7 +10,7 @@ OutputVector translate_view(const NodeContext & context) { num_inputs_check(context, 1, 1); if (context.get_op_case() == 2) { - auto dst_shape = context.get_output_shape(0).to_shape(); + auto dst_shape = context.get_output_shape().to_shape(); return rename_outputs_with_suffix({process_view_input(context, 0, dst_shape[2] * dst_shape[3])}, context.get_name()); } diff --git a/ggml/src/ggml-openvino/openvino/translate_session.cpp b/ggml/src/ggml-openvino/openvino/translate_session.cpp index d03c9358b0..546778a470 100644 --- a/ggml/src/ggml-openvino/openvino/translate_session.cpp +++ b/ggml/src/ggml-openvino/openvino/translate_session.cpp @@ -203,7 +203,16 @@ std::shared_ptr TranslateSession::translate_graph(const frontend::InputMo results.push_back(result); } - resulting_model = std::make_shared(results, params); + ov::ParameterVector used_params; + for (const auto & param : params) { + if (!param->output(0).get_target_inputs().empty()) { + used_params.push_back(param); + } + } + // if (auto diff = 
params.size() - used_params.size()) { + // GGML_LOG_INFO("%zu parameters are not used in the model.", diff); + // } + resulting_model = std::make_shared<ov::Model>(results, used_params); apply_transformations(resulting_model); return resulting_model; diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index e90073a1f2..92e8ce80b3 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -362,7 +362,7 @@ std::map get_types_to_requant(const std::string & dev } bool is_naive(ggml_cgraph * cgraph) { - constexpr int naive_graph_size_threshold = 20; + constexpr int naive_graph_size_threshold = 100; return cgraph->n_nodes < naive_graph_size_threshold; } @@ -412,7 +412,7 @@ ov::Tensor convert_ggml_input_to_ov(std::shared_ptr ggml_decoder, ov::Shape input_shape; if (ggml_tensor->op == GGML_OP_VIEW) { // This case is added to make test-backend-ops work - input_shape = ggml_decoder->get_graph_input_shape(ggml_tensor, ggml_tensor->view_src).to_shape(); + input_shape = ggml_decoder->get_shape(ggml_tensor->view_src); } else { input_shape = ggml_decoder->get_input_shape(name).to_shape(); } @@ -545,15 +545,13 @@ ov::Tensor get_ov_input_tensor_static_prefill(std::shared_ptr ggm } ov::Tensor get_ov_output_tensor(std::shared_ptr<GgmlOvDecoder> ggml_decoder, const std::string & result_name) { - auto * ggml_tensor = ggml_decoder->get_output_ggml_tensor(result_name); - auto output_type = ggml_decoder->get_output_type(result_name); - ov::Shape output_shape; - output_shape = ggml_decoder->get_output_shape(result_name).to_shape(); + auto * ggml_tensor = ggml_decoder->get_model_outputs().at(result_name); + auto output_type = ggml_decoder->get_ov_type(ggml_tensor); + auto output_shape = ggml_decoder->get_shape(ggml_tensor); if (ggml_decoder->is_static() && result_name == "result_output") { output_shape[1] = 1; } - ov::Tensor output_tensor(output_type, output_shape, ggml_tensor->data); return output_tensor; } From 808619e274042f3ea3925057d9d02ba163bd4720 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Wed, 3 Dec 2025 17:10:07 +0800 Subject: [PATCH 180/254] NPU support llama-perplexity -b 512 --no-warmup --- ggml/src/ggml-openvino/ggml-decoder.cpp | 8 +++++++- ggml/src/ggml-openvino/utils.cpp | 11 ++++++----- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index ece1fc8a54..3d8fddc720 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -344,6 +344,12 @@ std::pair GgmlOvDecoder::compute_llm_params(ggml_cgr } else if (name.find("Kcur-0") == 0 || std::string(node->src[0]->name).find("Kcur-0") == 0) { model_params.n_heads_kv = node->ne[1]; } + } else if (node->op == GGML_OP_GET_ROWS && std::string(node->src[1]->name) == "inp_out_ids") { + // for static case, output_len is always 1 except for llama-perplexity + compute_params.output_len = node->src[1]->ne[0]; + if (is_static && compute_params.output_len == 0) { + compute_params.output_len = 1; + } } } model_params.ctx = model_params.ctx_per_seq * model_params.n_seq; @@ -366,7 +372,7 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor * op, co input_shape = ov::PartialShape{1, 1, 1, len}; } else if (name == "inp_out_ids") { - input_shape = ov::PartialShape{1, 1, 1, m_is_static ? 1 : -1}; + input_shape = ov::PartialShape{1, 1, 1, m_is_static ? 
m_compute_params.output_len : -1}; } else if (name.find("KQ_mask") == 0) { if (m_is_static) { diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 92e8ce80b3..139bda1f8f 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -513,15 +513,16 @@ ov::Tensor get_ov_input_tensor_static_prefill(std::shared_ptr ggm } if (param_name == "inp_out_ids") { - ov::Shape input_shape = {1, 1, 1, 1}; + size_t output_len = ggml_decoder->get_compute_params().output_len; + ov::Shape input_shape = {1, 1, 1, output_len}; ov::Tensor input_tensor(ggml_decoder->get_input_type(param_name), input_shape); if (ggml_tensor->ne[0] == 0) { *input_tensor.data() = 0; - } else if (ggml_tensor->ne[0] == 1) { - int32_t inp_out_id = *((int32_t *) ggml_tensor->data) % chunk_size; - *input_tensor.data() = inp_out_id; } else { - throw std::runtime_error("NPU does not support outputing logits for multiple tokens at once."); + auto * data_addr = input_tensor.data(); + for (size_t i = 0; i < output_len; i++) { + data_addr[i] = ((int32_t *) ggml_tensor->data)[i] % chunk_size; + } } return input_tensor; } From 2a9d4ca836585acfe37b1fd5489721e3c2200bae Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Thu, 4 Dec 2025 11:12:14 +0800 Subject: [PATCH 181/254] Refactor: split ov_graph_compute for dynamic and static --- ggml/src/ggml-openvino/ggml-openvino.cpp | 5 +- ggml/src/ggml-openvino/utils.cpp | 396 +++++++++++++---------- ggml/src/ggml-openvino/utils.h | 5 +- 3 files changed, 237 insertions(+), 169 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp index 910c706bda..e809d250f7 100644 --- a/ggml/src/ggml-openvino/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino.cpp @@ -53,9 +53,8 @@ static const char * ggml_backend_openvino_get_name(ggml_backend_t backend) { } static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { - openvino_frontend_compute(backend, cgraph); - - return GGML_STATUS_SUCCESS; + return ov_graph_compute(cgraph); + GGML_UNUSED(backend); } static const ggml_backend_i ggml_backend_openvino_interface = { diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 139bda1f8f..8ce50c332a 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -36,9 +36,9 @@ #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wdeprecated-declarations" -enum ggml_status openvino_frontend_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { - static ov::Core core; +static ov::Core core; +enum ggml_status ov_graph_compute(ggml_cgraph * cgraph) { auto get_device = [&] { std::string device = getenv("GGML_OPENVINO_DEVICE") ? getenv("GGML_OPENVINO_DEVICE") : "CPU"; auto available_devices = core.get_available_devices(); @@ -48,6 +48,149 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, ggml_cgraph * } return device; }; + + if (getenv("GGML_OPENVINO_DUMP_CGRAPH")) { + std::string filename = "cgraph.txt"; + GgmlOvDecoder::dump_cgraph(cgraph, filename); + } + + static const auto device = get_device(); + static const auto is_static = device == "NPU" ? true : false; + return is_static ? 
ov_graph_compute_static(cgraph) : ov_graph_compute_dynamic(cgraph, device); +} + +enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, const std::string & device) { + static auto is_static = false; + static auto config = get_ov_compile_config(device); + + // if (is_naive(cgraph)) { + // return naive_compute(cgraph, core, device, config); + // } + + auto start_time = ggml_time_us(); + + static std::mutex cache_mutex; + static std::unordered_map, graph_key_hash> decoder_cache; + static std::unordered_map, graph_key_hash> infer_request_cache; + static std::unordered_map, graph_key_hash> ov_input_names_cache; + static std::unordered_map, graph_key_hash> ov_output_names_cache; + + std::shared_ptr ggml_decoder; + std::shared_ptr infer_request; + ModelParams m_params; + ComputeParams c_params; + std::tie(m_params, c_params) = GgmlOvDecoder::compute_llm_params(cgraph, is_static); + + const auto key = compute_graph_key(cgraph); + bool cache_hit; + + int64_t decoder_end_time; + int64_t conversion_end_time; + int64_t compile_end_time; + int64_t infer_end_time; + + { + std::lock_guard lock(cache_mutex); + + auto it = decoder_cache.find(key); + + cache_hit = it != decoder_cache.end(); + if (cache_hit) { + ggml_decoder = it->second; + cache_hit = ggml_decoder->get_model_params().can_reuse_statically(m_params); + } + + if (cache_hit) { + std::map> model_weights; + ggml_decoder = std::make_shared(cgraph, m_params, c_params, model_weights, is_static); + decoder_cache[key] = ggml_decoder; + infer_request = infer_request_cache[key]; + + decoder_end_time = ggml_time_us(); + conversion_end_time = decoder_end_time; + compile_end_time = decoder_end_time; + } else { + infer_request_cache.erase(key); + + std::shared_ptr model; + auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph, get_types_to_requant(device)); + + ggml_decoder = std::make_shared(cgraph, m_params, c_params, model_weights, is_static); + decoder_end_time = ggml_time_us(); + + auto input_model = std::make_shared(ggml_decoder); + model = ov::frontend::ggml::FrontEnd::convert(input_model); + ggml_decoder->clear_model_weights(); + conversion_end_time = ggml_time_us(); + + if (getenv("GGML_OPENVINO_DUMP_IR")) { + char timestamped_filename[64]; + auto timestamp = (long long) ggml_time_us(); + snprintf(timestamped_filename, sizeof(timestamped_filename), "model_%lld.xml", timestamp); + ov::serialize(model, timestamped_filename); + } + + auto compiled_model = core.compile_model(model, device, config); + compile_end_time = ggml_time_us(); + infer_request = std::make_shared(compiled_model.create_infer_request()); + infer_request_cache[key] = infer_request; + decoder_cache[key] = ggml_decoder; + + std::vector ov_input_names; + std::vector ov_output_names; + for (const auto & ov_param : model->get_parameters()) { + ov_input_names.push_back(ov_param->get_friendly_name()); + } + for (const auto & ov_output : model->get_results()) { + ov_output_names.push_back(ov_output->get_friendly_name()); + } + ov_input_names_cache[key] = std::move(ov_input_names); + ov_output_names_cache[key] = std::move(ov_output_names); + } + } + + auto ov_input_names = ov_input_names_cache[key]; + auto ov_output_names = ov_output_names_cache[key]; + + for (size_t i = 0; i < ov_input_names.size(); i++) { + auto param_name = ov_input_names[i]; + auto input_tensor = get_ov_input_tensor(ggml_decoder, param_name); + infer_request->set_input_tensor(i, input_tensor); + + if (getenv("GGML_OPENVINO_DEBUG_INPUT")) { + print_input_tensor_info(param_name, input_tensor); + } + } + 
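The dynamic path above keys its decoder and infer-request caches by graph_key rather than by the ggml_cgraph pointer, so a graph that llama.cpp rebuilds at a new address but with the same topology still hits the cache. Below is a minimal, self-contained sketch of that keyed-cache pattern; the type and variable names are illustrative, not the backend's, and the hash mixing matches the Boost-style combine (0x9e3779b9) used by graph_key_hash above.

#include <cstddef>
#include <functional>
#include <iostream>
#include <string>
#include <unordered_map>

// Illustrative stand-in for graph_key: identify a graph by node count and
// boundary node names instead of by pointer.
struct demo_key {
    size_t n_nodes;
    std::string first_node_name;
    std::string last_node_name;
    bool operator==(const demo_key & other) const {
        return n_nodes == other.n_nodes && first_node_name == other.first_node_name &&
               last_node_name == other.last_node_name;
    }
};

// Boost-style hash_combine, as in graph_key_hash.
struct demo_key_hash {
    size_t operator()(const demo_key & key) const {
        size_t h = std::hash<size_t>{}(key.n_nodes);
        h ^= std::hash<std::string>{}(key.first_node_name) + 0x9e3779b9 + (h << 6) + (h >> 2);
        h ^= std::hash<std::string>{}(key.last_node_name) + 0x9e3779b9 + (h << 6) + (h >> 2);
        return h;
    }
};

int main() {
    std::unordered_map<demo_key, std::string, demo_key_hash> cache;
    demo_key k{310, "inp_embd", "result_output"};  // hypothetical boundary nodes
    if (cache.find(k) == cache.end()) {
        cache[k] = "compiled model";  // cache miss: compile once, then reuse
    }
    std::cout << "cached entries: " << cache.size() << "\n";
    return 0;
}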
+ for (size_t i = 0; i < ov_output_names.size(); i++) { + auto output_tensor = get_ov_output_tensor(ggml_decoder, ov_output_names[i]); + infer_request->set_output_tensor(i, output_tensor); + } + + infer_request->infer(); + infer_end_time = ggml_time_us(); + + if (getenv("GGML_OPENVINO_DEBUG_OUTPUT")) { + for (size_t i = 0; i < ov_output_names.size(); i++) { + const auto output_tensor = infer_request->get_output_tensor(i); + print_output_tensor_info(ov_output_names[i], output_tensor, output_tensor.data()); + } + } + + if (getenv("GGML_OPENVINO_PROFILING")) { + GGML_LOG_INFO("\nGGML OpenVINO Backend: \n"); + GGML_LOG_INFO(" - Graph decoder Time: %ld ms \n", (decoder_end_time - start_time) / 1000); + if (!cache_hit) { + GGML_LOG_INFO(" - Graph conversion Time: %ld ms \n", (conversion_end_time - decoder_end_time) / 1000); + GGML_LOG_INFO(" - Graph compile Time: %ld ms \n", (compile_end_time - conversion_end_time) / 1000); + } + GGML_LOG_INFO(" - Graph Inference Time: %ld ms \n", (infer_end_time - compile_end_time) / 1000); + } + + return GGML_STATUS_SUCCESS; +} + +enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph) { auto get_prefill_chunk_size = [] { const char * chunk_size_str = getenv("GGML_OPENVINO_PREFILL_CHUNK_SIZE"); if (chunk_size_str && atoi(chunk_size_str) > 0) { @@ -56,16 +199,10 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, ggml_cgraph * return 256; }; - static const auto device = get_device(); - static const auto is_static = device == "NPU" ? true : false; - static const auto prefill_chunk_size = get_prefill_chunk_size(); - - ov::AnyMap config; - - if (getenv("GGML_OPENVINO_DUMP_CGRAPH")) { - std::string filename = "cgraph.txt"; - GgmlOvDecoder::dump_cgraph(cgraph, filename); - } + static std::string device = "NPU"; + static auto is_static = true; + static auto prefill_chunk_size = get_prefill_chunk_size(); + static auto config = get_ov_compile_config(device); if (is_naive(cgraph)) { return naive_compute(cgraph, core, device, config); @@ -73,11 +210,6 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, ggml_cgraph * auto start_time = ggml_time_us(); - auto * cache_dir = getenv("GGML_OPENVINO_CACHE_DIR"); - if (cache_dir && !is_static) { - core.set_property(ov::cache_dir(cache_dir)); - } - static std::mutex cache_mutex; static std::unordered_map, graph_key_hash> decoder_cache; static std::unordered_map, graph_key_hash> infer_request_cache; @@ -94,6 +226,7 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, ggml_cgraph * const auto * inp_pos = get_inp_pos_tensor(cgraph); const auto is_prefill = get_is_prefill(inp_pos); const auto key = compute_graph_key(cgraph); + bool cache_hit; int64_t decoder_end_time; int64_t conversion_end_time; @@ -105,11 +238,10 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, ggml_cgraph * auto it = decoder_cache.find(key); - auto cache_hit = it != decoder_cache.end(); + cache_hit = it != decoder_cache.end(); if (cache_hit) { ggml_decoder = it->second; - cache_hit = is_static ? 
ggml_decoder->get_model_params().can_reuse_statically(m_params) : - ggml_decoder->get_model_params().can_reuse_dynamically(m_params); + cache_hit = ggml_decoder->get_model_params().can_reuse_statically(m_params); } if (cache_hit) { @@ -117,11 +249,11 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, ggml_cgraph * ggml_decoder = std::make_shared(cgraph, m_params, c_params, model_weights, is_static, is_prefill, prefill_chunk_size); decoder_cache[key] = ggml_decoder; - decoder_end_time = ggml_time_us(); + infer_request = is_prefill ? infer_request_cache_prefill[key] : infer_request_cache[key]; - infer_request = is_static && is_prefill ? infer_request_cache_prefill[key] : infer_request_cache[key]; - conversion_end_time = ggml_time_us(); - compile_end_time = conversion_end_time; + decoder_end_time = ggml_time_us(); + conversion_end_time = decoder_end_time; + compile_end_time = decoder_end_time; } else { infer_request_cache.erase(key); infer_request_cache_prefill.erase(key); @@ -129,67 +261,43 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, ggml_cgraph * std::shared_ptr model; auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph, get_types_to_requant(device)); - if (!is_static) { - ggml_decoder = std::make_shared(cgraph, m_params, c_params, model_weights, is_static); - decoder_end_time = ggml_time_us(); + auto ggml_decoder_prefill = std::make_shared(cgraph, m_params, c_params, model_weights, + is_static, true, prefill_chunk_size); + auto ggml_decoder_decode = std::make_shared(cgraph, m_params, c_params, model_weights, + is_static, false, prefill_chunk_size); + decoder_end_time = ggml_time_us(); - auto input_model = std::make_shared(ggml_decoder); - model = ov::frontend::ggml::FrontEnd::convert(input_model); - ggml_decoder->clear_model_weights(); - conversion_end_time = ggml_time_us(); + auto input_model_prefill = std::make_shared(ggml_decoder_prefill); + auto input_model_decode = std::make_shared(ggml_decoder_decode); - if (getenv("GGML_OPENVINO_DUMP_IR")) { - char timestamped_filename[64]; - auto timestamp = (long long) ggml_time_us(); - snprintf(timestamped_filename, sizeof(timestamped_filename), "model_%lld.xml", timestamp); - ov::serialize(model, timestamped_filename); - } + auto model_prefill = ov::frontend::ggml::FrontEnd::convert(input_model_prefill); + ggml_decoder_prefill->clear_model_weights(); + auto model_decode = ov::frontend::ggml::FrontEnd::convert(input_model_decode); + ggml_decoder_decode->clear_model_weights(); + conversion_end_time = ggml_time_us(); - auto compiled_model = core.compile_model(model, device, get_ov_compile_config(device)); - compile_end_time = ggml_time_us(); - infer_request = std::make_shared(compiled_model.create_infer_request()); - infer_request_cache[key] = infer_request; - decoder_cache[key] = ggml_decoder; - } else { - auto ggml_decoder_prefill = std::make_shared(cgraph, m_params, c_params, model_weights, - is_static, true, prefill_chunk_size); - auto ggml_decoder_decode = std::make_shared(cgraph, m_params, c_params, model_weights, - is_static, false, prefill_chunk_size); - decoder_end_time = ggml_time_us(); - - auto input_model_prefill = std::make_shared(ggml_decoder_prefill); - auto input_model_decode = std::make_shared(ggml_decoder_decode); - - auto model_prefill = ov::frontend::ggml::FrontEnd::convert(input_model_prefill); - ggml_decoder_prefill->clear_model_weights(); - auto model_decode = ov::frontend::ggml::FrontEnd::convert(input_model_decode); - ggml_decoder_decode->clear_model_weights(); - 
conversion_end_time = ggml_time_us(); - - if (getenv("GGML_OPENVINO_DUMP_IR")) { - char timestamped_filename[64]; - auto timestamp = (long long) ggml_time_us(); - snprintf(timestamped_filename, sizeof(timestamped_filename), "model_prefill_%lld.xml", timestamp); - ov::serialize(model_prefill, timestamped_filename); - snprintf(timestamped_filename, sizeof(timestamped_filename), "model_decode_%lld.xml", timestamp); - ov::serialize(model_decode, timestamped_filename); - } - - auto compiled_model_prefill = core.compile_model(model_prefill, device, get_ov_compile_config(device)); - auto compiled_model_decode = core.compile_model(model_decode, device, get_ov_compile_config(device)); - - infer_request_cache_prefill[key] = - std::make_shared(compiled_model_prefill.create_infer_request()); - infer_request_cache[key] = - std::make_shared(compiled_model_decode.create_infer_request()); - compile_end_time = ggml_time_us(); - - model = is_prefill ? model_prefill : model_decode; - ggml_decoder = is_prefill ? ggml_decoder_prefill : ggml_decoder_decode; - infer_request = is_prefill ? infer_request_cache_prefill[key] : infer_request_cache[key]; - decoder_cache[key] = ggml_decoder; + if (getenv("GGML_OPENVINO_DUMP_IR")) { + char timestamped_filename[64]; + auto timestamp = (long long) ggml_time_us(); + snprintf(timestamped_filename, sizeof(timestamped_filename), "model_prefill_%lld.xml", timestamp); + ov::serialize(model_prefill, timestamped_filename); + snprintf(timestamped_filename, sizeof(timestamped_filename), "model_decode_%lld.xml", timestamp); + ov::serialize(model_decode, timestamped_filename); } + auto compiled_model_prefill = core.compile_model(model_prefill, device, get_ov_compile_config(device)); + auto compiled_model_decode = core.compile_model(model_decode, device, get_ov_compile_config(device)); + + infer_request_cache_prefill[key] = + std::make_shared(compiled_model_prefill.create_infer_request()); + infer_request_cache[key] = std::make_shared(compiled_model_decode.create_infer_request()); + compile_end_time = ggml_time_us(); + + model = is_prefill ? model_prefill : model_decode; + ggml_decoder = is_prefill ? ggml_decoder_prefill : ggml_decoder_decode; + infer_request = is_prefill ? 
infer_request_cache_prefill[key] : infer_request_cache[key]; + decoder_cache[key] = ggml_decoder; + std::vector ov_input_names; std::vector ov_output_names; for (const auto & ov_param : model->get_parameters()) { @@ -198,40 +306,51 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, ggml_cgraph * for (const auto & ov_output : model->get_results()) { ov_output_names.push_back(ov_output->get_friendly_name()); } - ov_input_names_cache[key] = ov_input_names; - ov_output_names_cache[key] = ov_output_names; - - // Set output tensors (for NPU) and kvcache i/o tensors once and for all - // Note: does not seem to improve perf on CPU/GPU, but breaks llama-bench, so disabled it for CPU/GPU - // if (is_static) { - // for (size_t i = 0; i < ov_input_names.size(); i++) { - // auto param_name = ov_input_names[i]; - // if (param_name.find("cache") == 0) { - // auto input_tensor = get_ov_input_tensor_static_decode(ggml_decoder, param_name); - // infer_request->set_input_tensor(i, input_tensor); - // } - // } - // for (size_t i = 0; i < ov_output_names.size(); i++) { - // auto output_name = ov_output_names[i]; - // if (output_name.find("cache") == 0) { - // auto output_tensor = get_ov_output_tensor(ggml_decoder, ov_output_names[i]); - // infer_request->set_output_tensor(i, output_tensor); - // } - // } - // } + ov_input_names_cache[key] = std::move(ov_input_names); + ov_output_names_cache[key] = std::move(ov_output_names); } } auto ov_input_names = ov_input_names_cache[key]; auto ov_output_names = ov_output_names_cache[key]; - if (!is_static) { + if (is_prefill) { + auto inp_len = inp_pos->ne[0]; + for (int chunk_index = 0; chunk_index * prefill_chunk_size < inp_len; chunk_index++) { + for (size_t i = 0; i < ov_input_names.size(); i++) { + auto param_name = ov_input_names[i]; + auto input_tensor = get_ov_input_tensor_static_prefill(ggml_decoder, param_name, chunk_index); + infer_request->set_input_tensor(i, input_tensor); + + if (getenv("GGML_OPENVINO_DEBUG_INPUT")) { + const auto input_tensor = infer_request->get_input_tensor(i); + print_input_tensor_info(param_name, input_tensor); + } + } + + for (size_t i = 0; i < ov_output_names.size(); i++) { + auto output_tensor = get_ov_output_tensor(ggml_decoder, ov_output_names[i]); + infer_request->set_output_tensor(i, output_tensor); + } + + infer_request->infer(); + + if (getenv("GGML_OPENVINO_DEBUG_OUTPUT")) { + for (size_t i = 0; i < ov_output_names.size(); i++) { + const auto output_tensor = infer_request->get_output_tensor(i); + print_output_tensor_info(ov_output_names[i], output_tensor, output_tensor.data()); + } + } + } + infer_end_time = ggml_time_us(); + } else { for (size_t i = 0; i < ov_input_names.size(); i++) { auto param_name = ov_input_names[i]; - auto input_tensor = get_ov_input_tensor(ggml_decoder, param_name); + auto input_tensor = get_ov_input_tensor_static_decode(ggml_decoder, param_name); infer_request->set_input_tensor(i, input_tensor); if (getenv("GGML_OPENVINO_DEBUG_INPUT")) { + const auto input_tensor = infer_request->get_input_tensor(i); print_input_tensor_info(param_name, input_tensor); } } @@ -250,79 +369,24 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, ggml_cgraph * print_output_tensor_info(ov_output_names[i], output_tensor, output_tensor.data()); } } - } else { - if (is_prefill) { - auto inp_len = inp_pos->ne[0]; - for (int chunk_index = 0; chunk_index * prefill_chunk_size < inp_len; chunk_index++) { - for (size_t i = 0; i < ov_input_names.size(); i++) { - auto param_name = ov_input_names[i]; 
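The static (NPU) prefill above advances in fixed chunks of prefill_chunk_size tokens, padding the last chunk inside the input tensors; only the loop-bound arithmetic matters for correctness. A standalone sketch of the chunking and of the inp_out_ids modulo mapping from the llama-perplexity change, with hypothetical lengths (the real values come from inp_pos and the decoder):

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
    const int64_t prefill_chunk_size = 256;  // assumed default; overridable via GGML_OPENVINO_PREFILL_CHUNK_SIZE
    const int64_t inp_len = 600;             // hypothetical prompt length

    // Same loop bound as the prefill path: ceil(inp_len / prefill_chunk_size) iterations.
    for (int64_t chunk_index = 0; chunk_index * prefill_chunk_size < inp_len; chunk_index++) {
        const int64_t begin = chunk_index * prefill_chunk_size;
        const int64_t end   = std::min(begin + prefill_chunk_size, inp_len);
        std::cout << "chunk " << chunk_index << ": tokens [" << begin << ", " << end << ")\n";
    }

    // inp_out_ids hold absolute token positions; the static model indexes within
    // a chunk, hence the modulo mapping applied above for llama-perplexity.
    const std::vector<int32_t> inp_out_ids = {511, 599};
    for (const int32_t id : inp_out_ids) {
        std::cout << "out id " << id << " -> in-chunk index " << id % prefill_chunk_size << "\n";
    }
    return 0;
}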
- auto input_tensor = get_ov_input_tensor_static_prefill(ggml_decoder, param_name, chunk_index); - infer_request->set_input_tensor(i, input_tensor); - - if (getenv("GGML_OPENVINO_DEBUG_INPUT")) { - const auto input_tensor = infer_request->get_input_tensor(i); - print_input_tensor_info(param_name, input_tensor); - } - } - - for (size_t i = 0; i < ov_output_names.size(); i++) { - auto output_tensor = get_ov_output_tensor(ggml_decoder, ov_output_names[i]); - infer_request->set_output_tensor(i, output_tensor); - } - - infer_request->infer(); - - if (getenv("GGML_OPENVINO_DEBUG_OUTPUT")) { - for (size_t i = 0; i < ov_output_names.size(); i++) { - const auto output_tensor = infer_request->get_output_tensor(i); - print_output_tensor_info(ov_output_names[i], output_tensor, output_tensor.data()); - } - } - } - infer_end_time = ggml_time_us(); - } else { - for (size_t i = 0; i < ov_input_names.size(); i++) { - auto param_name = ov_input_names[i]; - auto input_tensor = get_ov_input_tensor_static_decode(ggml_decoder, param_name); - infer_request->set_input_tensor(i, input_tensor); - - if (getenv("GGML_OPENVINO_DEBUG_INPUT")) { - const auto input_tensor = infer_request->get_input_tensor(i); - print_input_tensor_info(param_name, input_tensor); - } - } - - for (size_t i = 0; i < ov_output_names.size(); i++) { - auto output_tensor = get_ov_output_tensor(ggml_decoder, ov_output_names[i]); - infer_request->set_output_tensor(i, output_tensor); - } - - infer_request->infer(); - infer_end_time = ggml_time_us(); - - if (getenv("GGML_OPENVINO_DEBUG_OUTPUT")) { - for (size_t i = 0; i < ov_output_names.size(); i++) { - const auto output_tensor = infer_request->get_output_tensor(i); - print_output_tensor_info(ov_output_names[i], output_tensor, output_tensor.data()); - } - } - } } if (getenv("GGML_OPENVINO_PROFILING")) { GGML_LOG_INFO("\nGGML OpenVINO Backend: \n"); GGML_LOG_INFO(" - Graph decoder Time: %ld ms \n", (decoder_end_time - start_time) / 1000); - GGML_LOG_INFO(" - Graph conversion Time: %ld ms \n", (conversion_end_time - decoder_end_time) / 1000); - GGML_LOG_INFO(" - Graph compile Time: %ld ms \n", (compile_end_time - conversion_end_time) / 1000); + if (!cache_hit) { + GGML_LOG_INFO(" - Graph conversion Time: %ld ms \n", (conversion_end_time - decoder_end_time) / 1000); + GGML_LOG_INFO(" - Graph compile Time: %ld ms \n", (compile_end_time - conversion_end_time) / 1000); + } GGML_LOG_INFO(" - Graph Inference Time: %ld ms \n", (infer_end_time - compile_end_time) / 1000); } return GGML_STATUS_SUCCESS; - GGML_UNUSED(backend); } ov::AnyMap get_ov_compile_config(const std::string & device) { ov::AnyMap config; + auto * cache_dir = getenv("GGML_OPENVINO_CACHE_DIR"); if (device == "NPU") { config = { {"NPU_COMPILER_DYNAMIC_QUANTIZATION", "YES" }, @@ -335,9 +399,11 @@ ov::AnyMap get_ov_compile_config(const std::string & device) { {"NPUW_DQ", "YES" }, {"NPUW_DQ_FULL", "NO" }, }; - if (auto * cache_dir = getenv("GGML_OPENVINO_CACHE_DIR"); cache_dir) { + if (cache_dir) { config["NPUW_CACHE_DIR"] = cache_dir; } + } else if (cache_dir) { + core.set_property(ov::cache_dir(cache_dir)); } return config; } @@ -362,7 +428,7 @@ std::map get_types_to_requant(const std::string & dev } bool is_naive(ggml_cgraph * cgraph) { - constexpr int naive_graph_size_threshold = 100; + constexpr int naive_graph_size_threshold = 20; return cgraph->n_nodes < naive_graph_size_threshold; } diff --git a/ggml/src/ggml-openvino/utils.h b/ggml/src/ggml-openvino/utils.h index dca74f8afc..1fbac56bd3 100644 --- a/ggml/src/ggml-openvino/utils.h +++ 
b/ggml/src/ggml-openvino/utils.h @@ -26,7 +26,10 @@ struct graph_key_hash { } }; -enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph); +enum ggml_status ov_graph_compute(struct ggml_cgraph * cgraph); + +enum ggml_status ov_graph_compute_dynamic(struct ggml_cgraph * cgraph, const std::string & device); +enum ggml_status ov_graph_compute_static(struct ggml_cgraph * cgraph); size_t checksum(const void * data, size_t size); From 0ea8238ad05b9e571cb8f2416585477d9484d536 Mon Sep 17 00:00:00 2001 From: Xuejun Zhai Date: Wed, 3 Dec 2025 18:25:59 -0800 Subject: [PATCH 182/254] remove unused API GgmlOvDecoder::get_output_stride(const std::string & name) --- ggml/src/ggml-openvino/ggml-decoder.cpp | 4 ---- ggml/src/ggml-openvino/ggml-decoder.h | 2 -- ggml/src/ggml-openvino/openvino/decoder.hpp | 2 -- 3 files changed, 8 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 3d8fddc720..b6886733ba 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -775,10 +775,6 @@ std::vector GgmlOvDecoder::get_input_names(int node_idx) const { return m_node_info_list[node_idx].node_inputs_names; } -std::vector GgmlOvDecoder::get_output_stride(const std::string & name) const { - return get_stride(m_outputs.at(name)); -} - ov::PartialShape GgmlOvDecoder::get_output_shape(const std::string & name) const { auto * ggml_tensor = m_outputs.at(name); return ov::PartialShape(get_shape(ggml_tensor)); diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index 8e680b5c20..51f314f17c 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -109,8 +109,6 @@ public: virtual ov::PartialShape get_output_shape(int node_idx) const override; - virtual std::vector get_output_stride(const std::string & name) const override; - virtual ov::element::Type get_output_type(const std::string & name) const override; virtual int32_t * get_input_op_params(const std::string & name) const override; diff --git a/ggml/src/ggml-openvino/openvino/decoder.hpp b/ggml/src/ggml-openvino/openvino/decoder.hpp index 2ecc4401df..54fe890fd1 100644 --- a/ggml/src/ggml-openvino/openvino/decoder.hpp +++ b/ggml/src/ggml-openvino/openvino/decoder.hpp @@ -41,8 +41,6 @@ public: virtual PartialShape get_output_shape(int node_idx) const = 0; - virtual std::vector get_output_stride(const std::string& name) const = 0; - virtual element::Type get_output_type(const std::string& name) const = 0; virtual int32_t* get_input_op_params(const std::string& name) const = 0; From 8f4ee4eee23a720a73978773cde6e9eaccf3018b Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Thu, 4 Dec 2025 15:48:03 +0800 Subject: [PATCH 183/254] minor update due to ov 2025.4 --- ggml/src/ggml-openvino/utils.cpp | 2 +- ggml/src/ggml-openvino/utils.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 8ce50c332a..156204f8d8 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -679,7 +679,7 @@ void print_input_tensor_info(const std::string & name, const ov::Tensor & tensor } } -void print_output_tensor_info(const std::string & name, const ov::Tensor & tensor, void * output_dst) { +void print_output_tensor_info(const std::string & name, const ov::Tensor & tensor, const void * output_dst) { std::cout << "Output name: " << name << ", Output shape: " << 
tensor.get_shape() << ", Address: " << output_dst << std::endl; diff --git a/ggml/src/ggml-openvino/utils.h b/ggml/src/ggml-openvino/utils.h index 1fbac56bd3..85bb3a2f88 100644 --- a/ggml/src/ggml-openvino/utils.h +++ b/ggml/src/ggml-openvino/utils.h @@ -35,7 +35,7 @@ size_t checksum(const void * data, size_t size); void print_input_tensor_info(const std::string & name, const ov::Tensor & tensor); -void print_output_tensor_info(const std::string & name, const ov::Tensor & tensor, void * output_dst); +void print_output_tensor_info(const std::string & name, const ov::Tensor & tensor, const void * output_dst); template std::vector pad_input(const T * data, From 497964afbb69910cb6afa939f79fd00ac16fa259 Mon Sep 17 00:00:00 2001 From: Xuejun Zhai Date: Wed, 3 Dec 2025 19:19:38 -0800 Subject: [PATCH 184/254] remove unused API GgmlOvDecoder::get_output_names() --- ggml/src/ggml-openvino/ggml-decoder.cpp | 5 ----- ggml/src/ggml-openvino/ggml-decoder.h | 2 -- ggml/src/ggml-openvino/openvino/decoder.hpp | 2 -- 3 files changed, 9 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index b6886733ba..75f781f533 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -134,7 +134,6 @@ void GgmlOvDecoder::set_input_output(ggml_tensor * node, bool naive) { node_output = node->view_src; } - m_output_names.push_back(node_output_name); m_outputs[node_output_name] = node_output; current_node_info.node = node; @@ -789,10 +788,6 @@ ov::element::Type GgmlOvDecoder::get_output_type(const std::string & name) const return get_ov_type(m_outputs.at(name)); } -std::vector GgmlOvDecoder::get_output_names() const { - return m_output_names; -} - std::vector GgmlOvDecoder::get_output_names(int node_idx) const { return {m_node_info_list[node_idx].node_output_name}; } diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index 51f314f17c..2050e1762d 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -119,8 +119,6 @@ public: virtual int32_t * get_output_op_params(int node_idx) const override; - virtual std::vector get_output_names() const override; - virtual std::vector get_output_names(int node_idx) const override; virtual const std::string & get_op_type() const override; diff --git a/ggml/src/ggml-openvino/openvino/decoder.hpp b/ggml/src/ggml-openvino/openvino/decoder.hpp index 54fe890fd1..e867af1416 100644 --- a/ggml/src/ggml-openvino/openvino/decoder.hpp +++ b/ggml/src/ggml-openvino/openvino/decoder.hpp @@ -51,8 +51,6 @@ public: virtual int32_t * get_output_op_params(int node_idx) const = 0; - virtual std::vector get_output_names() const = 0; - virtual std::vector get_output_names(int node_idx) const = 0; virtual const std::string& get_op_type() const = 0; From f516db1db51224d3430c613e55e69165ddef4daf Mon Sep 17 00:00:00 2001 From: Xuejun Zhai Date: Wed, 3 Dec 2025 19:28:01 -0800 Subject: [PATCH 185/254] remove unused API get_output_shape(const std::string & name) --- ggml/src/ggml-openvino/ggml-decoder.cpp | 5 ----- ggml/src/ggml-openvino/ggml-decoder.h | 2 -- ggml/src/ggml-openvino/openvino/decoder.hpp | 2 -- 3 files changed, 9 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 75f781f533..76553c1b97 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -774,11 +774,6 @@ std::vector GgmlOvDecoder::get_input_names(int node_idx) 
const { return m_node_info_list[node_idx].node_inputs_names; } -ov::PartialShape GgmlOvDecoder::get_output_shape(const std::string & name) const { - auto * ggml_tensor = m_outputs.at(name); - return ov::PartialShape(get_shape(ggml_tensor)); -} - ov::PartialShape GgmlOvDecoder::get_output_shape(int node_idx) const { auto * ggml_tensor = m_node_info_list[node_idx].node_output; return ov::PartialShape(get_shape(ggml_tensor)); diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index 2050e1762d..9b0d02c3eb 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -105,8 +105,6 @@ public: virtual std::vector get_input_names(int node_idx) const override; - virtual ov::PartialShape get_output_shape(const std::string & name) const override; - virtual ov::PartialShape get_output_shape(int node_idx) const override; virtual ov::element::Type get_output_type(const std::string & name) const override; diff --git a/ggml/src/ggml-openvino/openvino/decoder.hpp b/ggml/src/ggml-openvino/openvino/decoder.hpp index e867af1416..a2ab8df691 100644 --- a/ggml/src/ggml-openvino/openvino/decoder.hpp +++ b/ggml/src/ggml-openvino/openvino/decoder.hpp @@ -37,8 +37,6 @@ public: virtual std::vector get_input_names(int node_idx) const = 0; - virtual PartialShape get_output_shape(const std::string& name) const = 0; - virtual PartialShape get_output_shape(int node_idx) const = 0; virtual element::Type get_output_type(const std::string& name) const = 0; From 6d7a0d6047362414ef158c564f9d406b4a20bd29 Mon Sep 17 00:00:00 2001 From: Xuejun Zhai Date: Wed, 3 Dec 2025 23:13:18 -0800 Subject: [PATCH 186/254] Modified API GgmlOvDecoder::get_output_type(const std::string & name) --- ggml/src/ggml-openvino/ggml-decoder.cpp | 4 ++-- ggml/src/ggml-openvino/ggml-decoder.h | 2 +- ggml/src/ggml-openvino/openvino/decoder.hpp | 2 +- ggml/src/ggml-openvino/openvino/node_context.hpp | 4 ++-- ggml/src/ggml-openvino/openvino/op/cpy.cpp | 2 +- ggml/src/ggml-openvino/openvino/op/get_rows.cpp | 4 ++-- ggml/src/ggml-openvino/openvino/op/set_rows.cpp | 2 +- ggml/src/ggml-openvino/openvino/op/softmax.cpp | 4 ++-- 8 files changed, 12 insertions(+), 12 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 76553c1b97..5f49a11e27 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -779,8 +779,8 @@ ov::PartialShape GgmlOvDecoder::get_output_shape(int node_idx) const { return ov::PartialShape(get_shape(ggml_tensor)); } -ov::element::Type GgmlOvDecoder::get_output_type(const std::string & name) const { - return get_ov_type(m_outputs.at(name)); +ov::element::Type GgmlOvDecoder::get_output_type(const int node_idx) const { + return get_ov_type(m_node_info_list[node_idx].node); } std::vector GgmlOvDecoder::get_output_names(int node_idx) const { diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index 9b0d02c3eb..25ea9af7f3 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -107,7 +107,7 @@ public: virtual ov::PartialShape get_output_shape(int node_idx) const override; - virtual ov::element::Type get_output_type(const std::string & name) const override; + virtual ov::element::Type get_output_type(const int node_idx) const override; virtual int32_t * get_input_op_params(const std::string & name) const override; diff --git a/ggml/src/ggml-openvino/openvino/decoder.hpp 
b/ggml/src/ggml-openvino/openvino/decoder.hpp index a2ab8df691..da57785214 100644 --- a/ggml/src/ggml-openvino/openvino/decoder.hpp +++ b/ggml/src/ggml-openvino/openvino/decoder.hpp @@ -39,7 +39,7 @@ public: virtual PartialShape get_output_shape(int node_idx) const = 0; - virtual element::Type get_output_type(const std::string& name) const = 0; + virtual element::Type get_output_type(const int node_idx) const = 0; virtual int32_t* get_input_op_params(const std::string& name) const = 0; diff --git a/ggml/src/ggml-openvino/openvino/node_context.hpp b/ggml/src/ggml-openvino/openvino/node_context.hpp index 42d950c3eb..3ca244b720 100644 --- a/ggml/src/ggml-openvino/openvino/node_context.hpp +++ b/ggml/src/ggml-openvino/openvino/node_context.hpp @@ -61,8 +61,8 @@ public: int32_t * get_output_op_params() const { return m_decoder->get_output_op_params(m_node_idx); } - ov::element::Type get_output_type(size_t index) const { - return m_decoder->get_output_type(m_output_names[index]); + ov::element::Type get_output_type() const { + return m_decoder->get_output_type(m_node_idx); } Output get_input(int idx) const override { diff --git a/ggml/src/ggml-openvino/openvino/op/cpy.cpp b/ggml/src/ggml-openvino/openvino/op/cpy.cpp index d5186cddee..ded2f0ca78 100644 --- a/ggml/src/ggml-openvino/openvino/op/cpy.cpp +++ b/ggml/src/ggml-openvino/openvino/op/cpy.cpp @@ -11,7 +11,7 @@ namespace ggml { namespace op { OutputVector translate_cpy(const NodeContext & context) { - auto res = std::make_shared(context.get_input(0), context.get_output_type(0)); + auto res = std::make_shared(context.get_input(0), context.get_output_type()); return rename_outputs_with_suffix({res}, context.get_name()); } diff --git a/ggml/src/ggml-openvino/openvino/op/get_rows.cpp b/ggml/src/ggml-openvino/openvino/op/get_rows.cpp index ace79c33a9..dc8454a199 100644 --- a/ggml/src/ggml-openvino/openvino/op/get_rows.cpp +++ b/ggml/src/ggml-openvino/openvino/op/get_rows.cpp @@ -42,8 +42,8 @@ OutputVector translate_get_rows(const NodeContext & context) { res = std::make_shared(data, indices, axis); } - if (res.get_element_type() != context.get_output_type(0)) { - res = std::make_shared(res, context.get_output_type(0)); + if (res.get_element_type() != context.get_output_type()) { + res = std::make_shared(res, context.get_output_type()); } res = std::make_shared(res, ov::op::v0::Constant::create(ov::element::i64, {1}, {0})); return rename_outputs_with_suffix({res}, context.get_name()); diff --git a/ggml/src/ggml-openvino/openvino/op/set_rows.cpp b/ggml/src/ggml-openvino/openvino/op/set_rows.cpp index eb128f04a3..4ceb55589e 100644 --- a/ggml/src/ggml-openvino/openvino/op/set_rows.cpp +++ b/ggml/src/ggml-openvino/openvino/op/set_rows.cpp @@ -32,7 +32,7 @@ OutputVector translate_set_rows(const NodeContext & context) { auto indices = context.get_input(1); auto dst = context.get_input(2); - data = std::make_shared(data, context.get_output_type(0)); + data = std::make_shared(data, context.get_output_type()); auto dst_shape = context.get_output_shape().to_shape(); diff --git a/ggml/src/ggml-openvino/openvino/op/softmax.cpp b/ggml/src/ggml-openvino/openvino/op/softmax.cpp index 921475e51a..782fdf078d 100644 --- a/ggml/src/ggml-openvino/openvino/op/softmax.cpp +++ b/ggml/src/ggml-openvino/openvino/op/softmax.cpp @@ -63,8 +63,8 @@ OutputVector translate_soft_max(const NodeContext & context) { mask_node_sliced = std::make_shared(mask_node, zero, token_len, one, one); } - if (mask_node_sliced.get_element_type() != context.get_output_type(0)) { - 
mask_node_sliced = std::make_shared(mask_node_sliced, context.get_output_type(0)); + if (mask_node_sliced.get_element_type() != context.get_output_type()) { + mask_node_sliced = std::make_shared(mask_node_sliced, context.get_output_type()); } Output slope_mask; From ba852f2a60a5c513350265b3ba59f00041351ae4 Mon Sep 17 00:00:00 2001 From: Xuejun Zhai Date: Wed, 3 Dec 2025 23:17:47 -0800 Subject: [PATCH 187/254] Removed API GgmlOvDecoder::get_output_op_params(const std::string & name) --- ggml/src/ggml-openvino/ggml-decoder.cpp | 4 ---- ggml/src/ggml-openvino/ggml-decoder.h | 2 -- ggml/src/ggml-openvino/openvino/decoder.hpp | 2 -- 3 files changed, 8 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 5f49a11e27..0259534561 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -804,10 +804,6 @@ int32_t * GgmlOvDecoder::get_input_op_params(int node_idx, const std::string & n return m_node_info_list[node_idx].node_inputs.at(name)->op_params; } -int32_t * GgmlOvDecoder::get_output_op_params(const std::string & name) const { - return m_outputs.at(name)->op_params; -} - int32_t * GgmlOvDecoder::get_output_op_params(int node_idx) const { return m_node_info_list[node_idx].node->op_params; } diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index 25ea9af7f3..c120ca2bde 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -113,8 +113,6 @@ public: virtual int32_t * get_input_op_params(int node_idx, const std::string & name) const override; - virtual int32_t * get_output_op_params(const std::string & name) const override; - virtual int32_t * get_output_op_params(int node_idx) const override; virtual std::vector get_output_names(int node_idx) const override; diff --git a/ggml/src/ggml-openvino/openvino/decoder.hpp b/ggml/src/ggml-openvino/openvino/decoder.hpp index da57785214..520a1c211c 100644 --- a/ggml/src/ggml-openvino/openvino/decoder.hpp +++ b/ggml/src/ggml-openvino/openvino/decoder.hpp @@ -45,8 +45,6 @@ public: virtual int32_t* get_input_op_params(int node_idx, const std::string& name) const = 0; - virtual int32_t* get_output_op_params(const std::string& name) const = 0; - virtual int32_t * get_output_op_params(int node_idx) const = 0; virtual std::vector get_output_names(int node_idx) const = 0; From 111c96c266dd49452d78e97ee8dcc0b35d328a12 Mon Sep 17 00:00:00 2001 From: Xuejun Zhai Date: Wed, 3 Dec 2025 23:21:18 -0800 Subject: [PATCH 188/254] Removed API get_output_ggml_tensor(const std::string & name) --- ggml/src/ggml-openvino/ggml-decoder.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index c120ca2bde..2b224d77e6 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -129,8 +129,6 @@ public: ggml_tensor * get_input_ggml_tensor(const std::string & name) const { return m_inputs.at(name); } - ggml_tensor * get_output_ggml_tensor(const std::string & name) const { return m_outputs.at(name); } - virtual int get_op_case(int node_idx) const override { return m_node_info_list[node_idx].node_op_case; } virtual const std::map> & get_model_inputs() const override { From 8ff73e5d53eb2439aab5d8d013e4689a6daa572c Mon Sep 17 00:00:00 2001 From: Xuejun Zhai Date: Wed, 3 Dec 2025 23:25:55 -0800 Subject: [PATCH 189/254] Removed API m_outputs --- ggml/src/ggml-openvino/ggml-decoder.cpp | 2 -- 
ggml/src/ggml-openvino/ggml-decoder.h | 1 - 2 files changed, 3 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 0259534561..0f48552e95 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -134,8 +134,6 @@ void GgmlOvDecoder::set_input_output(ggml_tensor * node, bool naive) { node_output = node->view_src; } - m_outputs[node_output_name] = node_output; - current_node_info.node = node; current_node_info.node_name = node_name; current_node_info.node_output = node_output; diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index 2b224d77e6..f667392f7d 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -230,7 +230,6 @@ private: std::vector m_nodes; std::map m_inputs; std::vector m_input_names; - std::map m_outputs; std::vector m_output_names; std::map> m_model_inputs; From 197ed992c0db8e3b9708a6c1083d46b830693dc4 Mon Sep 17 00:00:00 2001 From: Xuejun Zhai Date: Wed, 3 Dec 2025 23:27:23 -0800 Subject: [PATCH 190/254] Removed m_output_names --- ggml/src/ggml-openvino/ggml-decoder.h | 1 - 1 file changed, 1 deletion(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index f667392f7d..a517374829 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -230,7 +230,6 @@ private: std::vector m_nodes; std::map m_inputs; std::vector m_input_names; - std::vector m_output_names; std::map> m_model_inputs; std::map> m_model_extra_inputs; From 95c307190617876a0734ec68503e2f3a562ca0ae Mon Sep 17 00:00:00 2001 From: Xuejun Zhai Date: Wed, 3 Dec 2025 23:54:55 -0800 Subject: [PATCH 191/254] Removed API GgmlOvDecoder::get_input_names() --- ggml/src/ggml-openvino/ggml-decoder.cpp | 7 +------ ggml/src/ggml-openvino/ggml-decoder.h | 3 --- ggml/src/ggml-openvino/openvino/decoder.hpp | 2 -- ggml/src/ggml-openvino/openvino/node_context.hpp | 4 ++-- 4 files changed, 3 insertions(+), 13 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 0f48552e95..eb14834996 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -147,7 +147,6 @@ void GgmlOvDecoder::set_input_output(ggml_tensor * node, bool naive) { continue; } std::string src_name = std::string(src->name); - m_input_names.push_back(src_name); m_inputs[src_name] = src; current_node_info.node_inputs[src_name] = src; current_node_info.node_inputs_names.push_back(src_name); @@ -757,17 +756,13 @@ ov::element::Type GgmlOvDecoder::get_input_type(const std::string & name) const } size_t GgmlOvDecoder::get_input_size() const { - return m_input_names.size(); + return m_model_inputs.size(); } size_t GgmlOvDecoder::get_input_size(int node_idx) const { return m_node_info_list[node_idx].node_inputs_names.size(); } -std::vector GgmlOvDecoder::get_input_names() const { - return m_input_names; -} - std::vector GgmlOvDecoder::get_input_names(int node_idx) const { return m_node_info_list[node_idx].node_inputs_names; } diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index a517374829..5608b7c914 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -101,8 +101,6 @@ public: GGML_UNUSED(producer_output_port_index); } - virtual std::vector get_input_names() const override; - virtual std::vector get_input_names(int node_idx) 
const override; virtual ov::PartialShape get_output_shape(int node_idx) const override; @@ -229,7 +227,6 @@ private: ggml_cgraph * m_cgraph = nullptr; std::vector m_nodes; std::map m_inputs; - std::vector m_input_names; std::map> m_model_inputs; std::map> m_model_extra_inputs; diff --git a/ggml/src/ggml-openvino/openvino/decoder.hpp b/ggml/src/ggml-openvino/openvino/decoder.hpp index 520a1c211c..0949f7073f 100644 --- a/ggml/src/ggml-openvino/openvino/decoder.hpp +++ b/ggml/src/ggml-openvino/openvino/decoder.hpp @@ -33,8 +33,6 @@ public: std::string& producer_output_port_name, size_t& producer_output_port_index) const = 0; - virtual std::vector get_input_names() const = 0; - virtual std::vector get_input_names(int node_idx) const = 0; virtual PartialShape get_output_shape(int node_idx) const = 0; diff --git a/ggml/src/ggml-openvino/openvino/node_context.hpp b/ggml/src/ggml-openvino/openvino/node_context.hpp index 3ca244b720..e95bafc269 100644 --- a/ggml/src/ggml-openvino/openvino/node_context.hpp +++ b/ggml/src/ggml-openvino/openvino/node_context.hpp @@ -43,8 +43,8 @@ public: return m_decoder->get_input_type(m_input_names[index]); } - PartialShape get_input_shape(size_t index) const { - return m_decoder->get_input_shape(m_node_idx, m_input_names[index]); + PartialShape get_input_shape(size_t input_index) const { + return m_decoder->get_input_shape(m_node_idx, m_input_names[input_index]); } std::vector get_input_stride(size_t index) const { From cd611782ef284410c1de7c22a8a83f7f26941991 Mon Sep 17 00:00:00 2001 From: Xuejun Zhai Date: Thu, 4 Dec 2025 00:18:14 -0800 Subject: [PATCH 192/254] Removed API GgmlOvDecoder::get_input_stride(const std::string& name) --- ggml/src/ggml-openvino/ggml-decoder.cpp | 4 ---- ggml/src/ggml-openvino/ggml-decoder.h | 2 -- ggml/src/ggml-openvino/openvino/decoder.hpp | 2 -- 3 files changed, 8 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index eb14834996..2d96bf1572 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -743,10 +743,6 @@ ov::PartialShape GgmlOvDecoder::get_input_shape(int node_idx, const std::string return ov::PartialShape(get_shape(m_node_info_list[node_idx].node_inputs.at(name))); } -std::vector GgmlOvDecoder::get_input_stride(const std::string & name) const { - return get_stride(m_inputs.at(name)); -} - std::vector GgmlOvDecoder::get_input_stride(int node_idx, const std::string & name) const { return get_stride(m_node_info_list[node_idx].node_inputs.at(name)); } diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index 5608b7c914..336833d8af 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -81,8 +81,6 @@ public: virtual ov::PartialShape get_input_shape(int node_idx, const std::string & name) const override; - virtual std::vector get_input_stride(const std::string & name) const override; - virtual std::vector get_input_stride(int node_idx, const std::string & name) const override; virtual ov::element::Type get_input_type(const std::string & name) const override; diff --git a/ggml/src/ggml-openvino/openvino/decoder.hpp b/ggml/src/ggml-openvino/openvino/decoder.hpp index 0949f7073f..2cc6dbba46 100644 --- a/ggml/src/ggml-openvino/openvino/decoder.hpp +++ b/ggml/src/ggml-openvino/openvino/decoder.hpp @@ -18,8 +18,6 @@ public: virtual PartialShape get_input_shape(int node_idx, const std::string& name) const = 0; - virtual std::vector get_input_stride(const 
std::string& name) const = 0; - virtual std::vector get_input_stride(int node_idx, const std::string& name) const = 0; virtual element::Type get_input_type(const std::string& name) const = 0; From 891a3beb2d4b87e6babefc30946f2dfc395d16e5 Mon Sep 17 00:00:00 2001 From: Xuejun Zhai Date: Thu, 4 Dec 2025 06:10:44 -0800 Subject: [PATCH 193/254] Removed API get_input_type --- ggml/src/ggml-openvino/ggml-decoder.cpp | 6 +++++- ggml/src/ggml-openvino/ggml-decoder.h | 2 ++ ggml/src/ggml-openvino/openvino/decoder.hpp | 2 ++ ggml/src/ggml-openvino/openvino/node_context.hpp | 2 +- 4 files changed, 10 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 2d96bf1572..97bd938567 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -147,7 +147,6 @@ void GgmlOvDecoder::set_input_output(ggml_tensor * node, bool naive) { continue; } std::string src_name = std::string(src->name); - m_inputs[src_name] = src; current_node_info.node_inputs[src_name] = src; current_node_info.node_inputs_names.push_back(src_name); @@ -163,6 +162,7 @@ void GgmlOvDecoder::set_input_output(ggml_tensor * node, bool naive) { if (m_model_inputs.find(src_name) != m_model_inputs.end()) { continue; } + m_inputs[src_name] = src; auto param_node = std::make_shared(get_ov_type(src), get_graph_input_shape(node, src)); param_node->set_friendly_name(src_name); @@ -751,6 +751,10 @@ ov::element::Type GgmlOvDecoder::get_input_type(const std::string & name) const return get_ov_type(m_inputs.at(name)); } +ov::element::Type GgmlOvDecoder::get_input_type(int node_idx, const std::string & name) const { + return get_ov_type(m_node_info_list[node_idx].node_inputs.at(name)); +} + size_t GgmlOvDecoder::get_input_size() const { return m_model_inputs.size(); } diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index 336833d8af..c76315f8af 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -85,6 +85,8 @@ public: virtual ov::element::Type get_input_type(const std::string & name) const override; + virtual ov::element::Type get_input_type(int node_idx, const std::string & name) const override; + virtual size_t get_input_size() const override; virtual size_t get_input_size(int node_idx) const override; diff --git a/ggml/src/ggml-openvino/openvino/decoder.hpp b/ggml/src/ggml-openvino/openvino/decoder.hpp index 2cc6dbba46..ef4b3a7593 100644 --- a/ggml/src/ggml-openvino/openvino/decoder.hpp +++ b/ggml/src/ggml-openvino/openvino/decoder.hpp @@ -22,6 +22,8 @@ public: virtual element::Type get_input_type(const std::string& name) const = 0; + virtual element::Type get_input_type(int node_idx, const std::string& name) const = 0; + virtual size_t get_input_size() const = 0; virtual size_t get_input_size(int node_idx) const = 0; diff --git a/ggml/src/ggml-openvino/openvino/node_context.hpp b/ggml/src/ggml-openvino/openvino/node_context.hpp index e95bafc269..a0666b21ac 100644 --- a/ggml/src/ggml-openvino/openvino/node_context.hpp +++ b/ggml/src/ggml-openvino/openvino/node_context.hpp @@ -40,7 +40,7 @@ public: } ov::element::Type get_input_type(size_t index) const { - return m_decoder->get_input_type(m_input_names[index]); + return m_decoder->get_input_type(m_node_idx, m_input_names[index]); } PartialShape get_input_shape(size_t input_index) const { From 42ca27f71427cd14af55523b2b5b7c93146c8ae1 Mon Sep 17 00:00:00 2001 From: Xuejun Zhai Date: Thu, 4 Dec 2025 
06:24:38 -0800 Subject: [PATCH 194/254] Removed API get_input_type --- ggml/src/ggml-openvino/ggml-decoder.cpp | 4 ---- ggml/src/ggml-openvino/ggml-decoder.h | 2 -- ggml/src/ggml-openvino/openvino/decoder.hpp | 2 -- ggml/src/ggml-openvino/utils.cpp | 12 ++++++------ 4 files changed, 6 insertions(+), 14 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 97bd938567..95d7fc00f4 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -747,10 +747,6 @@ std::vector GgmlOvDecoder::get_input_stride(int node_idx, const std::str return get_stride(m_node_info_list[node_idx].node_inputs.at(name)); } -ov::element::Type GgmlOvDecoder::get_input_type(const std::string & name) const { - return get_ov_type(m_inputs.at(name)); -} - ov::element::Type GgmlOvDecoder::get_input_type(int node_idx, const std::string & name) const { return get_ov_type(m_node_info_list[node_idx].node_inputs.at(name)); } diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index c76315f8af..3b6c1ec8e3 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -83,8 +83,6 @@ public: virtual std::vector get_input_stride(int node_idx, const std::string & name) const override; - virtual ov::element::Type get_input_type(const std::string & name) const override; - virtual ov::element::Type get_input_type(int node_idx, const std::string & name) const override; virtual size_t get_input_size() const override; diff --git a/ggml/src/ggml-openvino/openvino/decoder.hpp b/ggml/src/ggml-openvino/openvino/decoder.hpp index ef4b3a7593..8f6a9e9cb2 100644 --- a/ggml/src/ggml-openvino/openvino/decoder.hpp +++ b/ggml/src/ggml-openvino/openvino/decoder.hpp @@ -20,8 +20,6 @@ public: virtual std::vector get_input_stride(int node_idx, const std::string& name) const = 0; - virtual element::Type get_input_type(const std::string& name) const = 0; - virtual element::Type get_input_type(int node_idx, const std::string& name) const = 0; virtual size_t get_input_size() const = 0; diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 156204f8d8..018199def5 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -480,9 +480,9 @@ ov::Tensor convert_ggml_input_to_ov(std::shared_ptr ggml_decoder, // This case is added to make test-backend-ops work input_shape = ggml_decoder->get_shape(ggml_tensor->view_src); } else { - input_shape = ggml_decoder->get_input_shape(name).to_shape(); + input_shape = ggml_decoder->get_shape(ggml_tensor); } - auto input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), input_shape, input_data); + auto input_tensor = ov::Tensor(ggml_decoder->get_ov_type(ggml_tensor), input_shape, input_data); return input_tensor; } } // namespace @@ -506,7 +506,7 @@ ov::Tensor get_ov_input_tensor_static_decode(std::shared_ptr ggml (op->op == GGML_OP_SET_ROWS && op->src[1] == ggml_tensor)) { assert(ggml_tensor->ne[0] == 1); ov::Shape input_shape = {1, 1, 1, 1}; - ov::Tensor input_tensor(ggml_decoder->get_input_type(param_name), input_shape); + ov::Tensor input_tensor(ggml_decoder->get_ov_type(ggml_tensor), input_shape); if (ggml_tensor->type == GGML_TYPE_I32) { *input_tensor.data() = *((int32_t *) ggml_tensor->data); } else if (ggml_tensor->type == GGML_TYPE_I64) { @@ -519,7 +519,7 @@ ov::Tensor get_ov_input_tensor_static_decode(std::shared_ptr ggml if (param_name == "inp_out_ids") { ov::Shape input_shape = 
{1, 1, 1, 1}; - ov::Tensor input_tensor(ggml_decoder->get_input_type(param_name), input_shape); + ov::Tensor input_tensor(ggml_decoder->get_ov_type(ggml_tensor), input_shape); int32_t inp_out_id = *((int32_t *) ggml_tensor->data); assert(ggml_tensor->ne[0] == 1); assert(inp_out_id == 0); @@ -553,7 +553,7 @@ ov::Tensor get_ov_input_tensor_static_prefill(std::shared_ptr ggm if (param_name == "inp_pos" || param_name == "inp_tokens" || (op->op == GGML_OP_SET_ROWS && op->src[1] == ggml_tensor)) { ov::Shape input_shape = {1, 1, 1, chunk_size}; - ov::Tensor input_tensor(ggml_decoder->get_input_type(param_name), input_shape); + ov::Tensor input_tensor(ggml_decoder->get_ov_type(ggml_tensor), input_shape); // copy the chunk_index-th chunk from ggml_tensor size_t element_size = ggml_type_size(ggml_tensor->type); void * input_data = (char *) ggml_tensor->data + chunk_index * chunk_size * element_size; @@ -581,7 +581,7 @@ ov::Tensor get_ov_input_tensor_static_prefill(std::shared_ptr ggm if (param_name == "inp_out_ids") { size_t output_len = ggml_decoder->get_compute_params().output_len; ov::Shape input_shape = {1, 1, 1, output_len}; - ov::Tensor input_tensor(ggml_decoder->get_input_type(param_name), input_shape); + ov::Tensor input_tensor(ggml_decoder->get_ov_type(ggml_tensor), input_shape); if (ggml_tensor->ne[0] == 0) { *input_tensor.data() = 0; } else { From acb8a01d0e30c7606fdd6c57c2fb9c340b62adb4 Mon Sep 17 00:00:00 2001 From: Xuejun Zhai Date: Thu, 4 Dec 2025 06:28:21 -0800 Subject: [PATCH 195/254] Removed API GgmlOvDecoder::get_input_shape(const std::string & name) --- ggml/src/ggml-openvino/ggml-decoder.cpp | 4 ---- ggml/src/ggml-openvino/ggml-decoder.h | 2 -- ggml/src/ggml-openvino/openvino/decoder.hpp | 2 -- 3 files changed, 8 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 95d7fc00f4..b38a9a67a6 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -735,10 +735,6 @@ ov::element::Type GgmlOvDecoder::get_ov_type(const ggml_tensor * tensor) { } } -ov::PartialShape GgmlOvDecoder::get_input_shape(const std::string & name) const { - return ov::PartialShape(get_shape(m_inputs.at(name))); -} - ov::PartialShape GgmlOvDecoder::get_input_shape(int node_idx, const std::string & name) const { return ov::PartialShape(get_shape(m_node_info_list[node_idx].node_inputs.at(name))); } diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index 3b6c1ec8e3..8f9dab0ae7 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -77,8 +77,6 @@ public: GGML_UNUSED(name); } - virtual ov::PartialShape get_input_shape(const std::string & name) const override; - virtual ov::PartialShape get_input_shape(int node_idx, const std::string & name) const override; virtual std::vector get_input_stride(int node_idx, const std::string & name) const override; diff --git a/ggml/src/ggml-openvino/openvino/decoder.hpp b/ggml/src/ggml-openvino/openvino/decoder.hpp index 8f6a9e9cb2..61d5f11d2c 100644 --- a/ggml/src/ggml-openvino/openvino/decoder.hpp +++ b/ggml/src/ggml-openvino/openvino/decoder.hpp @@ -14,8 +14,6 @@ class GgmlDecoder : public DecoderBase { public: virtual ov::Any get_attribute(const std::string& name) const = 0; - virtual PartialShape get_input_shape(const std::string& name) const = 0; - virtual PartialShape get_input_shape(int node_idx, const std::string& name) const = 0; virtual std::vector get_input_stride(int node_idx, const 
std::string& name) const = 0; From 47c91db31fce681be108312a7f45dd1e6ebc463f Mon Sep 17 00:00:00 2001 From: Xuejun Zhai Date: Thu, 4 Dec 2025 06:56:39 -0800 Subject: [PATCH 196/254] Removed API GgmlOvDecoder::get_input_op_params(const std::string & name) --- ggml/src/ggml-openvino/ggml-decoder.cpp | 4 ---- ggml/src/ggml-openvino/ggml-decoder.h | 2 -- ggml/src/ggml-openvino/openvino/decoder.hpp | 2 -- 3 files changed, 8 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index b38a9a67a6..72f6144708 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -781,10 +781,6 @@ const std::string & GgmlOvDecoder::get_op_name(int node_idx) const { return m_node_info_list[node_idx].node_name; } -int32_t * GgmlOvDecoder::get_input_op_params(const std::string & name) const { - return m_inputs.at(name)->op_params; -} - int32_t * GgmlOvDecoder::get_input_op_params(int node_idx, const std::string & name) const { return m_node_info_list[node_idx].node_inputs.at(name)->op_params; } diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index 8f9dab0ae7..1e51a7e1a8 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -103,8 +103,6 @@ public: virtual ov::element::Type get_output_type(const int node_idx) const override; - virtual int32_t * get_input_op_params(const std::string & name) const override; - virtual int32_t * get_input_op_params(int node_idx, const std::string & name) const override; virtual int32_t * get_output_op_params(int node_idx) const override; diff --git a/ggml/src/ggml-openvino/openvino/decoder.hpp b/ggml/src/ggml-openvino/openvino/decoder.hpp index 61d5f11d2c..1603c7fd20 100644 --- a/ggml/src/ggml-openvino/openvino/decoder.hpp +++ b/ggml/src/ggml-openvino/openvino/decoder.hpp @@ -35,8 +35,6 @@ public: virtual element::Type get_output_type(const int node_idx) const = 0; - virtual int32_t* get_input_op_params(const std::string& name) const = 0; - virtual int32_t* get_input_op_params(int node_idx, const std::string& name) const = 0; virtual int32_t * get_output_op_params(int node_idx) const = 0; From 91a1b20c82a8bff136098a1b4e797278a1da002f Mon Sep 17 00:00:00 2001 From: Xuejun Zhai Date: Thu, 4 Dec 2025 22:14:05 -0800 Subject: [PATCH 197/254] Fix error for decoder cache --- ggml/src/ggml-openvino/ggml-decoder.h | 2 +- ggml/src/ggml-openvino/utils.cpp | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index 1e51a7e1a8..111eb7200b 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -210,10 +210,10 @@ public: static std::vector get_stride(const ggml_tensor * tensor); static ov::element::Type get_ov_type(const ggml_tensor * tensor); static std::string compute_op_type(const ggml_tensor * node); + void add_extra_inputs(); private: void set_input_output(ggml_tensor * node, bool naive = false); - void add_extra_inputs(); int compute_op_case(const ggml_tensor * node) const; void validate_cgraph() const; diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 018199def5..935404136c 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -102,8 +102,10 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, const std::strin if (cache_hit) { std::map> model_weights; - ggml_decoder = std::make_shared(cgraph, 
m_params, c_params, model_weights, is_static); - decoder_cache[key] = ggml_decoder; + ggml_decoder = decoder_cache[key]; + ggml_decoder->set_compute_params(c_params); + ggml_decoder->set_model_params(m_params); + ggml_decoder->add_extra_inputs(); infer_request = infer_request_cache[key]; decoder_end_time = ggml_time_us(); From 28da9a9adc0efe5f6f11e31b688d602db3680771 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Fri, 5 Dec 2025 14:34:10 +0800 Subject: [PATCH 198/254] Reuse cached decoder --- ggml/src/ggml-openvino/utils.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 935404136c..1f94d4bad6 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -248,9 +248,11 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph) { if (cache_hit) { std::map> model_weights; - ggml_decoder = std::make_shared(cgraph, m_params, c_params, model_weights, is_static, - is_prefill, prefill_chunk_size); - decoder_cache[key] = ggml_decoder; + ggml_decoder = decoder_cache[key]; + ggml_decoder->m_is_prefill = is_prefill; + ggml_decoder->set_model_params(m_params); + ggml_decoder->set_compute_params(c_params); + ggml_decoder->add_extra_inputs(); infer_request = is_prefill ? infer_request_cache_prefill[key] : infer_request_cache[key]; decoder_end_time = ggml_time_us(); From 469325c6dab3ad5bb0feb4ea9074d36783983ba0 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Mon, 8 Dec 2025 16:17:40 +0800 Subject: [PATCH 199/254] GPU remove Q6_K requantization --- ggml/src/ggml-openvino/utils.cpp | 6 ------ 1 file changed, 6 deletions(-) diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 1f94d4bad6..ad99447f30 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -422,12 +422,6 @@ std::map get_types_to_requant(const std::string & dev {GGML_TYPE_Q5_K, ExtraQuantType::F16 }, }; } - if (device == "GPU") { - return { - // gs16 will be supported on openvino-2025.4 - {GGML_TYPE_Q6_K, ExtraQuantType::Q8_0_32}, - }; - } return {}; } From ae01322dbdf634bfec0265df347bf0196ba4eac4 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Mon, 8 Dec 2025 16:18:22 +0800 Subject: [PATCH 200/254] NPU fix wrong model output shape --- ggml/src/ggml-openvino/utils.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index ad99447f30..7412dcc2a8 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -614,8 +614,8 @@ ov::Tensor get_ov_output_tensor(std::shared_ptr ggml_decoder, con auto output_type = ggml_decoder->get_ov_type(ggml_tensor); auto output_shape = ggml_decoder->get_shape(ggml_tensor); - if (ggml_decoder->is_static() && result_name == "result_output") { - output_shape[1] = 1; + if (ggml_decoder->is_static() && result_name == "result_output" && output_shape[2] == 0) { + output_shape[2] = 1; } ov::Tensor output_tensor(output_type, output_shape, ggml_tensor->data); return output_tensor; From c9234b44cc46e9ea7783bfbaea54125efdebd349 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Mon, 8 Dec 2025 16:18:52 +0800 Subject: [PATCH 201/254] NPU fix q4 perf regression --- .../openvino/pass/squeeze_matmul.cpp | 58 +++++++++++++++++++ .../openvino/pass/squeeze_matmul.hpp | 17 ++++++ .../openvino/translate_session.cpp | 8 ++- 3 files changed, 80 insertions(+), 3 deletions(-) create mode 100644 
ggml/src/ggml-openvino/openvino/pass/squeeze_matmul.cpp create mode 100644 ggml/src/ggml-openvino/openvino/pass/squeeze_matmul.hpp diff --git a/ggml/src/ggml-openvino/openvino/pass/squeeze_matmul.cpp b/ggml/src/ggml-openvino/openvino/pass/squeeze_matmul.cpp new file mode 100644 index 0000000000..6627d60c5b --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/pass/squeeze_matmul.cpp @@ -0,0 +1,58 @@ +#include "squeeze_matmul.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace opp = ov::pass::pattern; + +namespace ov { +namespace frontend { +namespace ggml { +namespace pass { + +// For quantized models, NPUW expects the activation to be 3d in DQ(DynamicQuantization) opt, e.g. DQMatMulGQ2i +SqueezeMatmul::SqueezeMatmul() { + auto m_act = opp::any_input(); + auto m_wei = opp::any_input(); + auto m_matmul = opp::wrap_type({m_act, m_wei}); + + const auto callback = [=](ov::pass::pattern::Matcher & m) { + const auto & pattern_map = m.get_pattern_value_map(); + auto matmul_node = + std::dynamic_pointer_cast(pattern_map.at(m_matmul).get_node_shared_ptr()); + auto act = pattern_map.at(m_act); + auto wei = pattern_map.at(m_wei); + auto act_shape = act.get_partial_shape(); + auto wei_shape = wei.get_partial_shape(); + if (act_shape.rank().is_dynamic() || wei_shape.rank().is_dynamic()) { + return false; + } + if (act_shape.rank().get_length() == 4 && wei_shape.rank().get_length() == 2) { + auto axis = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {0}); + auto squeezed_act = std::make_shared(act, axis); + auto new_matmul = std::make_shared(squeezed_act, wei, matmul_node->get_transpose_a(), + matmul_node->get_transpose_b()); + auto unsqueezed_output = std::make_shared(new_matmul, axis); + unsqueezed_output->set_friendly_name(matmul_node->get_friendly_name()); + ov::copy_runtime_info(matmul_node, {squeezed_act, new_matmul, unsqueezed_output}); + ov::replace_node(matmul_node, unsqueezed_output); + return true; + } + return false; + }; + + register_matcher(std::make_shared(m_matmul, "ov::frontend::ggml::pass::SqueezeMatmul"), + callback); +} + +} // namespace pass +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/pass/squeeze_matmul.hpp b/ggml/src/ggml-openvino/openvino/pass/squeeze_matmul.hpp new file mode 100644 index 0000000000..f8fbc69d54 --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/pass/squeeze_matmul.hpp @@ -0,0 +1,17 @@ +#include "openvino/pass/matcher_pass.hpp" + +namespace ov { +namespace frontend { +namespace ggml { +namespace pass { + +class SqueezeMatmul : public ov::pass::MatcherPass { +public: + OPENVINO_MATCHER_PASS_RTTI("ov::frontend::ggml::pass::SqueezeMatmul") + SqueezeMatmul(); +}; + +} // namespace pass +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/translate_session.cpp b/ggml/src/ggml-openvino/openvino/translate_session.cpp index 546778a470..ccd0947a2b 100644 --- a/ggml/src/ggml-openvino/openvino/translate_session.cpp +++ b/ggml/src/ggml-openvino/openvino/translate_session.cpp @@ -5,6 +5,7 @@ #include "input_model.hpp" #include "pass/eliminate_zp.hpp" #include "pass/mark_decompression_convert_constant_folding.hpp" +#include "pass/squeeze_matmul.hpp" #include #include @@ -231,9 +232,10 @@ std::shared_ptr TranslateSession::apply_transformations(std::shared_ptr(kv_param_res_pairs); // } - // if (ggml_model_decoder->is_static()) { - manager.register_pass(); - // } + if 
(ggml_model_decoder->is_static()) { + manager.register_pass(); + manager.register_pass(); + } manager.run_passes(model); } return model; From 9e3163e8468ba7c8d2c49aa62d7aff1d0bd6107c Mon Sep 17 00:00:00 2001 From: Xuejun Zhai Date: Tue, 9 Dec 2025 18:14:28 -0800 Subject: [PATCH 202/254] Remove unused variable nodes --- ggml/src/ggml-openvino/ggml-decoder.cpp | 2 -- ggml/src/ggml-openvino/ggml-decoder.h | 1 - 2 files changed, 3 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 72f6144708..275a8a216a 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -62,7 +62,6 @@ GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph, for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) { auto * cur_node = cgraph->nodes[node_n]; - m_nodes.push_back(cur_node); set_input_output(cur_node); } @@ -82,7 +81,6 @@ GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph, std::mapop == GGML_OP_NONE) { continue; } - m_nodes.push_back(cur_node); set_input_output(cur_node, true); } for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) { diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index 111eb7200b..bcfe8097d6 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -219,7 +219,6 @@ private: void validate_cgraph() const; ggml_cgraph * m_cgraph = nullptr; - std::vector m_nodes; std::map m_inputs; std::map> m_model_inputs; From 0ef2e5e4d47c1eeb65b00ee9af9ffbd07993760b Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Thu, 11 Dec 2025 11:30:25 +0800 Subject: [PATCH 203/254] Fix decoder can_reuse for llama-bench --- ggml/src/ggml-openvino/ggml-decoder.h | 12 ++++++------ ggml/src/ggml-openvino/utils.cpp | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index bcfe8097d6..edcd036785 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -25,15 +25,15 @@ struct ModelParams { // std::vector kv_names; - bool can_reuse_dynamically(const ModelParams & other) const { + bool operator==(const ModelParams & other) const { return n_seq == other.n_seq && n_heads == other.n_heads && n_heads_kv == other.n_heads_kv && - head_size == other.head_size && rope_params == other.rope_params && swa_layers == other.swa_layers; + head_size == other.head_size && rope_params == other.rope_params && swa_layers == other.swa_layers && + ctx_per_seq == other.ctx_per_seq && ctx_per_seq_swa == other.ctx_per_seq_swa; } - bool can_reuse_statically(const ModelParams & other) const { - return can_reuse_dynamically(other) && ctx_per_seq == other.ctx_per_seq && - ctx_per_seq_swa == other.ctx_per_seq_swa; - } + bool can_reuse_dynamically(const ModelParams & other) const { return *this == other; } + + bool can_reuse_statically(const ModelParams & other) const { return *this == other; } }; struct ComputeParams { diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 7412dcc2a8..836e366fd7 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -97,7 +97,7 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, const std::strin cache_hit = it != decoder_cache.end(); if (cache_hit) { ggml_decoder = it->second; - cache_hit = ggml_decoder->get_model_params().can_reuse_statically(m_params); + cache_hit = 
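// Sketch of the surrounding cache-reuse flow (simplified; the key type and the
// cache maps are declared in utils.h/utils.cpp, outside this excerpt):
//
//   auto it = decoder_cache.find(key);
//   bool cache_hit = it != decoder_cache.end() &&
//                    it->second->get_model_params() == m_params;  // operator== above
//
// On a hit, both the cached GgmlOvDecoder and its compiled infer_request are
// reused; only ModelParams/ComputeParams are refreshed and add_extra_inputs()
// is re-run, instead of rebuilding the decoder from the cgraph.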
ggml_decoder->get_model_params().can_reuse_dynamically(m_params); } if (cache_hit) { From ae5336386f65125a7680fb7813938b1764d79ea8 Mon Sep 17 00:00:00 2001 From: Arshath Date: Thu, 25 Dec 2025 19:39:05 -0800 Subject: [PATCH 204/254] Update build.md for Windows --- docs/build.md | 71 ++++++++++++++++++++++++++++++++++----------------- 1 file changed, 48 insertions(+), 23 deletions(-) diff --git a/docs/build.md b/docs/build.md index a63b45a1af..b9d5139b3a 100644 --- a/docs/build.md +++ b/docs/build.md @@ -707,31 +707,41 @@ Follow the instructions below to install OpenVINO runtime and build llama.cpp wi - Linux or Windows system with Intel hardware (CPU, GPU, or NPU) - **For Intel GPU or NPU Usage**: Install the appropriate hardware drivers for your Intel GPU or NPU. For detailed instructions, see: [Additional Configurations for Hardware Acceleration](https://docs.openvino.ai/2025/get-started/install-openvino/configurations.html). -- Git, CMake, and Ninja software tools are needed for building.
-```bash
- sudo apt-get update
- sudo apt-get install -y build-essential libcurl4-openssl-dev libtbb12 cmake ninja-build python3-pip curl wget tar
-```
+
+- **Linux:**
+  - Git, CMake, and Ninja software tools are needed for building.
+  ```bash
+  sudo apt-get update
+  sudo apt-get install -y build-essential libcurl4-openssl-dev libtbb12 cmake ninja-build python3-pip curl wget tar
+  ```
+
+- **Windows:**
+  - Download [Visual Studio 2022 Build Tools](https://aka.ms/vs/17/release/vs_BuildTools.exe) and select "Desktop development with C++" under Workloads.
+  - Install Git.
+  - Use the "x64 Native Tools Command Prompt" for the build.

### 1. Install OpenVINO Runtime

- Follow the guide to install OpenVINO Runtime from an archive file: [Linux](https://docs.openvino.ai/2025/get-started/install-openvino/install-openvino-archive-linux.html) | [Windows](https://docs.openvino.ai/2025/get-started/install-openvino/install-openvino-archive-windows.html)
-<details>
-<summary>📦 Click to expand OpenVINO 2025.3 installation from an archive file on Ubuntu</summary>
-

-```bash
-wget https://raw.githubusercontent.com/ravi9/misc-scripts/main/openvino/ov-archive-install/install-openvino-from-archive.sh
-chmod +x install-openvino-from-archive.sh
-./install-openvino-from-archive.sh
-```
-</details>

+- **Linux:**
+
+  <details>
+  <summary>📦 Click to expand OpenVINO 2025.3 installation from an archive file on Ubuntu</summary>
+
+  ```bash
+  wget https://raw.githubusercontent.com/ravi9/misc-scripts/main/openvino/ov-archive-install/install-openvino-from-archive.sh
+  chmod +x install-openvino-from-archive.sh
+  ./install-openvino-from-archive.sh
+  ```
+  </details>
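+
+  - Optionally, verify which devices OpenVINO can see (a quick sanity check via the `openvino` Python package, if installed):
+    ```bash
+    python3 -c "import openvino as ov; print(ov.Core().available_devices)"
+    ```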
-- Verify OpenVINO is initialized properly -```bash -echo $OpenVINO_DIR -``` + - Verify OpenVINO is initialized properly + - **Linux:** + ```bash + echo $OpenVINO_DIR + ``` ### 2. Build llama.cpp with OpenVINO Backend @@ -741,13 +751,28 @@ Clone the OpenVINO-enabled llama.cpp fork and build it: git clone https://github.com/ravi9/llama.cpp.git cd llama.cpp git switch dev_backend_openvino - -# Build with OpenVINO support -source /opt/intel/openvino/setupvars.sh -cmake -B build/ReleaseOV -G Ninja -DCMAKE_BUILD_TYPE=Release -DGGML_OPENVINO=ON -DGGML_CPU_REPACK=OFF -cmake --build build/ReleaseOV --config Release -j $(nproc) ``` +- **Linux:** + ```bash + # Build with OpenVINO support + source /opt/intel/openvino/setupvars.sh + cmake -B build/ReleaseOV -G Ninja -DCMAKE_BUILD_TYPE=Release -DGGML_OPENVINO=ON -DGGML_CPU_REPACK=OFF + cmake --build build/ReleaseOV --config Release -j $(nproc) + ``` + +- **Windows:** + ```bash + # Build with OpenVINO support + "C:\Program Files (x86)\Intel\openvino_2025.3.0\setupvars.bat" + cmake -B build/ReleaseOV -DCMAKE_BUILD_TYPE=Release -DGGML_OPENVINO=ON -DGGML_CPU_REPACK=OFF -DLLAMA_CURL=OFF + cmake --build build\ReleaseOV --config Release + ``` + - For faster compilation, add the -- /m argument to run multiple jobs in parallel with as many CPU cores available. + ```bash + cmake --build build\ReleaseOV --config Release -- /m + ``` + ### 3. Download Sample Model Download models for testing: From 22d9c17a6f3521ca7a4f8ba8cebcc7f4c3425175 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Thu, 18 Dec 2025 12:52:10 +0800 Subject: [PATCH 205/254] backend buffer: allocate on host --- ggml/src/ggml-openvino/ggml-decoder.cpp | 119 ++++-- ggml/src/ggml-openvino/ggml-openvino-extra.h | 247 +++++++++++ ggml/src/ggml-openvino/ggml-openvino.cpp | 425 ++++++++++++++++++- ggml/src/ggml-openvino/ggml-quants.cpp | 161 +++++-- ggml/src/ggml-openvino/ggml-quants.hpp | 29 +- ggml/src/ggml-openvino/utils.cpp | 20 +- 6 files changed, 901 insertions(+), 100 deletions(-) create mode 100644 ggml/src/ggml-openvino/ggml-openvino-extra.h diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 275a8a216a..409a16e816 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -2,6 +2,7 @@ #include "ggml-backend-impl.h" #include "ggml-backend.h" +#include "ggml-openvino-extra.h" #include "ggml-quants.hpp" #include @@ -17,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -33,6 +35,7 @@ #include #include #include +#include #include GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph, @@ -512,8 +515,49 @@ std::map> GgmlOvDecoder::create_weight_no return model_weights; } +// Static cache for quantized weight nodes (keyed by tensor data pointer) +// This is a fallback for when tensors don't have pre-built constants in extra +static std::unordered_map> s_quantized_weight_cache; +static std::mutex s_quantized_weight_cache_mutex; + std::shared_ptr GgmlOvDecoder::create_weight_node(ggml_tensor * tensor, std::optional requant_type) { + // Check if we have a pre-built constant from the OpenVINO backend buffer + // This is set during ggml_backend_openvino_buffer_set_tensor + if (tensor->extra != nullptr && !requant_type.has_value()) { + // Cast to our extra base type and check the type + auto * extra_base = static_cast(tensor->extra); + + if (extra_base->type == ggml_openvino_extra_base::Type::WEIGHT) { + // F16/F32/BF16 weight with shared-memory constant + auto * weight_extra = 
static_cast(tensor->extra); + if (weight_extra->constant) { + GGML_LOG_DEBUG("%s: using pre-built constant for %s\n", __func__, tensor->name); + return weight_extra->constant; + } + } else if (extra_base->type == ggml_openvino_extra_base::Type::QUANTIZED_WEIGHT) { + // Quantized weight with pre-extracted data + auto * quant_extra = static_cast(tensor->extra); + if (quant_extra->constant) { + GGML_LOG_DEBUG("%s: using pre-extracted quantized constant for %s\n", __func__, tensor->name); + return quant_extra->constant; + } + } + } + + // Fallback: Check static cache for quantized weights (keyed by data pointer) + // This handles cases where tensors weren't loaded through OpenVINO buffer + if (ggml_is_quantized(tensor->type) && !requant_type.has_value()) { + std::lock_guard lock(s_quantized_weight_cache_mutex); + auto it = s_quantized_weight_cache.find(tensor->data); + if (it != s_quantized_weight_cache.end()) { + GGML_LOG_DEBUG("%s: using cached quantized constant for %s\n", __func__, tensor->name); + return it->second; + } + } + + GGML_LOG_DEBUG("%s: creating new constant for %s (extra=%p)\n", __func__, tensor->name, tensor->extra); + std::set weight_types = {GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_BF16, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1, GGML_TYPE_Q4_K, GGML_TYPE_Q5_K, GGML_TYPE_Q6_K}; if (weight_types.find(tensor->type) == weight_types.end()) { @@ -543,63 +587,48 @@ std::shared_ptr GgmlOvDecoder::create_weight_node(ggml_tensor * tensor return weight_node; } - // Quantized case - OPENVINO_ASSERT(tensor->extra == nullptr, "Unsupported weight tensor: " + std::string(tensor->name) + - " Possibly this is a repacked quantized weights"); + // Quantized case - extra should be nullptr (not our type) + // Our ggml_openvino_weight_extra is only set for F16/F32 weights + if (tensor->extra != nullptr) { + // Check if it's our type - if so, something is wrong + auto * extra_base = static_cast(tensor->extra); + if (extra_base->type == ggml_openvino_extra_base::Type::WEIGHT || + extra_base->type == ggml_openvino_extra_base::Type::TENSOR) { + OPENVINO_ASSERT(false, "Quantized weight tensor has unexpected extra type: " + std::string(tensor->name)); + } + // Otherwise it might be repacked quantized weights from another backend + OPENVINO_ASSERT(false, "Unsupported weight tensor: " + std::string(tensor->name) + + " Possibly this is a repacked quantized weights"); + } if (requant_type.has_value()) { return requantize(tensor, requant_type.value()); } - ov::element::Type weight_type; - if (tensor->type == GGML_TYPE_Q4_0 || tensor->type == GGML_TYPE_Q4_1 || tensor->type == GGML_TYPE_Q4_K) { - weight_type = ov::element::u4; - } else { // tensor.type == GGUF_TYPE_Q8_0 || tensor.type == GGUF_TYPE_Q6_K || tensor.type == GGUF_TYPE_Q5_K - weight_type = ov::element::u8; + // Extract quantized weights using the shared function + auto layout = ggml_openvino_get_extracted_layout(tensor); + if (layout.total_size == 0) { + OPENVINO_THROW("Unsupported quantized type for ", tensor->name, " type=", ggml_type_name(tensor->type)); } - uint64_t weights_per_block; - // here we only consider sub block, q6k:16 q4k:32 q5k:32 - if (tensor->type == GGML_TYPE_Q6_K) { - weights_per_block = 16; - } else { - weights_per_block = 32; - } - - OPENVINO_ASSERT(node_shape.back() % weights_per_block == 0, "[load_gguf] tensor ", tensor->name, - " has incompatible last dim shape: ", node_shape.back()); + ov::element::Type weight_type = layout.is_u4 ? 
ov::element::u4 : ov::element::u8; + ov::Shape scale_shape = {node_shape[0], node_shape[1] / layout.weights_per_block}; ov::Tensor weights(weight_type, node_shape); - // For scales and biases - node_shape[node_shape.size() - 1] = node_shape[node_shape.size() - 1] / weights_per_block; - ov::Tensor scales(ov::element::f16, node_shape); - ov::Tensor biases(ov::element::f16, node_shape); + ov::Tensor scales(ov::element::f16, scale_shape); + ov::Tensor biases(ov::element::f16, scale_shape); - ov::Output weight_node; - if (tensor->type == GGML_TYPE_Q4_0) { - extract_q4_0_data(tensor, weights, scales, biases); - weight_node = make_int4_weights(weights, scales, biases, weights_per_block); - } else if (tensor->type == GGML_TYPE_Q4_1) { - extract_q4_1_data(tensor, weights, scales, biases); - weight_node = make_int4_weights(weights, scales, biases, weights_per_block); - } else if (tensor->type == GGML_TYPE_Q8_0) { - extract_q8_0_data(tensor, weights, scales, biases); - weight_node = make_int8_weights(weights, scales, biases, weights_per_block); - } else if (tensor->type == GGML_TYPE_Q6_K) { - extract_q6_k_data(tensor, weights, scales, biases); - weight_node = make_int8_weights(weights, scales, biases, weights_per_block); - } else if (tensor->type == GGML_TYPE_Q4_K) { - extract_q4_k_data(tensor, weights, scales, biases); - weight_node = make_int4_weights(weights, scales, biases, weights_per_block); - } else if (tensor->type == GGML_TYPE_Q5_K) { - extract_q5_k_data(tensor, weights, scales, biases); - weight_node = make_int8_weights(weights, scales, biases, weights_per_block); + auto result = extract_quantized_weights(tensor, tensor->data, weights, scales, biases); + result->set_friendly_name(tensor->name); + + // Cache the quantized weight node for future reuse + if (ggml_is_quantized(tensor->type) && !requant_type.has_value()) { + std::lock_guard lock(s_quantized_weight_cache_mutex); + s_quantized_weight_cache[tensor->data] = result; + GGML_LOG_DEBUG("%s: cached quantized constant for %s\n", __func__, tensor->name); } - OPENVINO_ASSERT(weight_node.get_shape().size() == 2, "Weight should be 2D"); - - weight_node.get_node_shared_ptr()->set_friendly_name(tensor->name); - return weight_node.get_node_shared_ptr(); + return result; } void GgmlOvDecoder::dump_cgraph(const ggml_cgraph * cgraph, std::string & filename) { diff --git a/ggml/src/ggml-openvino/ggml-openvino-extra.h b/ggml/src/ggml-openvino/ggml-openvino-extra.h new file mode 100644 index 0000000000..99db870412 --- /dev/null +++ b/ggml/src/ggml-openvino/ggml-openvino-extra.h @@ -0,0 +1,247 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include "ggml.h" + +// ExtraQuantType enum - defines requantization target formats +enum class ExtraQuantType { F16, Q4_0_C, Q8_1_C, Q4_0_128, Q8_0_C, Q8_0_32 }; + +// ===================================================== +// Global Device Configuration (singleton) +// ===================================================== +// Initialized once during backend init from GGML_OPENVINO_DEVICE env var + +struct ggml_openvino_device_config { + std::string device_name = "CPU"; + bool is_npu = false; + bool initialized = false; + + void init() { + if (initialized) return; + const char* env = std::getenv("GGML_OPENVINO_DEVICE"); + if (env) { + device_name = env; + is_npu = (device_name == "NPU"); + } + initialized = true; + } +}; + +// Get the global device config singleton +inline ggml_openvino_device_config& ggml_openvino_get_device_config() { + static ggml_openvino_device_config config; + 
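    // Illustrative use of this singleton from backend init (the helpers are
    // defined just below; the actual call site is outside this excerpt):
    //
    //   ggml_openvino_init_device_config();   // reads GGML_OPENVINO_DEVICE once
    //   if (ggml_openvino_is_npu()) {
    //       // take the static-shape path and apply NPU requantization rules
    //   }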
return config; +} + +// Initialize device config (call during backend init) +inline void ggml_openvino_init_device_config() { + ggml_openvino_get_device_config().init(); +} + +// Get the device name +inline const std::string& ggml_openvino_get_device_name() { + return ggml_openvino_get_device_config().device_name; +} + +// Check if running on NPU +inline bool ggml_openvino_is_npu() { + return ggml_openvino_get_device_config().is_npu; +} + +// Get requantization type for a tensor type (returns nullopt if no requant needed) +inline std::optional ggml_openvino_get_requant_type(ggml_type type) { + if (!ggml_openvino_is_npu()) { + return std::nullopt; + } + // NPU requantization rules + switch (type) { + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q4_K: + return ExtraQuantType::Q4_0_128; + case GGML_TYPE_Q6_K: + case GGML_TYPE_Q5_K: + return ExtraQuantType::F16; + default: + return std::nullopt; + } +} + +// ===================================================== +// OpenVINO Tensor Extra Types +// ===================================================== +// These types are stored in tensor->extra by the OpenVINO backend buffer. +// They allow: +// 1. Pre-built ov::Constant nodes for weights (avoiding memcpy during graph construction) +// 2. ov::Tensor wrappers for KV cache / compute tensors (for direct use with infer_request) + +// Base class for OpenVINO tensor extra data +struct ggml_openvino_extra_base { + enum class Type { WEIGHT, QUANTIZED_WEIGHT, TENSOR }; + Type type; + virtual ~ggml_openvino_extra_base() = default; +protected: + explicit ggml_openvino_extra_base(Type t) : type(t) {} +}; + +// Extra data for F16/F32/BF16 weight tensors - stores the pre-built ov::Constant node +struct ggml_openvino_weight_extra : public ggml_openvino_extra_base { + std::shared_ptr constant; // Pre-built OpenVINO Constant node + + explicit ggml_openvino_weight_extra(std::shared_ptr c) + : ggml_openvino_extra_base(Type::WEIGHT), constant(std::move(c)) {} +}; + +// Extra data for quantized weight tensors - stores extracted weights/scales/biases and ov::Constant +struct ggml_openvino_quantized_weight_extra : public ggml_openvino_extra_base { + ov::Tensor weights; // U4 or U8 extracted weights + ov::Tensor scales; // F16 scales + ov::Tensor biases; // F16 biases (zero points) + std::shared_ptr constant; // Pre-built OpenVINO weight subgraph + + ggml_openvino_quantized_weight_extra(ov::Tensor w, ov::Tensor s, ov::Tensor b, std::shared_ptr c) + : ggml_openvino_extra_base(Type::QUANTIZED_WEIGHT), + weights(std::move(w)), scales(std::move(s)), biases(std::move(b)), constant(std::move(c)) {} +}; + +// Extra data for KV cache / compute tensors - stores ov::Tensor for infer_request +struct ggml_openvino_tensor_extra : public ggml_openvino_extra_base { + std::shared_ptr tensor; // For direct use with infer_request + + explicit ggml_openvino_tensor_extra(std::shared_ptr t) + : ggml_openvino_extra_base(Type::TENSOR), tensor(std::move(t)) {} +}; + +// ===================================================== +// Extracted Size Calculation for Quantized Tensors +// ===================================================== +// For quantized tensors, we need extra space to store extracted weights, scales, and biases. +// Returns the total size needed in the buffer for extracted data. 
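// Worked example of the layout computed below (hypothetical numbers, assuming a
// 4096 x 4096 Q4_0 weight on a non-NPU device, i.e. no requantization):
//   n_elements    = 16'777'216, is_u4 = true, weights_per_block = 32
//   weights_size  = n_elements / 2  =  8'388'608 B   (weights_offset = 0)
//   n_blocks      = n_elements / 32 =    524'288
//   scales_size   = biases_size     =  1'048'576 B each (f16)
//   scales_offset =  8'388'608 (already 64-byte aligned)
//   biases_offset =  9'437'184
//   total_size    = 10'485'760 B (10 MiB) for [weights | scales | biases]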
+ +struct ggml_openvino_extracted_layout { + size_t total_size; // Total bytes needed + size_t weights_offset; // Offset to weights in buffer + size_t weights_size; // Size of weights in bytes + size_t scales_offset; // Offset to scales in buffer + size_t scales_size; // Size of scales in bytes + size_t biases_offset; // Offset to biases in buffer + size_t biases_size; // Size of biases in bytes + bool is_u4; // true for U4 weights, false for U8 + int64_t weights_per_block;// weights per scale/bias block + + // Requantization info + bool is_requant; // true if this tensor needs requantization + std::optional requant_type; // target requant type if is_requant +}; + +// Calculate the buffer layout for extracted quantized data +inline ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_tensor * tensor) { + ggml_openvino_extracted_layout layout = {}; + + if (!ggml_is_quantized(tensor->type)) { + return layout; + } + + // Only handle 2D weight tensors + if (tensor->ne[2] != 1 || tensor->ne[3] != 1) { + return layout; + } + + int64_t n_elements = ggml_nelements(tensor); + const size_t alignment = 64; // Good for SIMD + + // Check if requantization is needed (NPU-specific) + auto requant_type = ggml_openvino_get_requant_type(tensor->type); + if (requant_type.has_value()) { + layout.is_requant = true; + layout.requant_type = requant_type; + + // Special case: requant to F16 - just store F16 weights, no scales/biases + if (requant_type.value() == ExtraQuantType::F16) { + layout.weights_size = n_elements * sizeof(uint16_t); // F16 = 2 bytes + layout.total_size = layout.weights_size; + layout.weights_offset = 0; + // No scales/biases for F16 + return layout; + } + + // Requant to different quantized format (e.g., Q4_0_128) + switch (requant_type.value()) { + case ExtraQuantType::Q4_0_128: + layout.is_u4 = true; + layout.weights_per_block = 128; + break; + case ExtraQuantType::Q8_0_32: + layout.is_u4 = false; + layout.weights_per_block = 32; + break; + default: + // Unsupported requant type - fall through to normal extraction + layout.is_requant = false; + layout.requant_type = std::nullopt; + break; + } + + if (layout.is_requant) { + // Calculate sizes for requantized format + layout.weights_size = layout.is_u4 ? (n_elements / 2) : n_elements; + int64_t n_blocks = n_elements / layout.weights_per_block; + layout.scales_size = n_blocks * sizeof(uint16_t); + layout.biases_size = n_blocks * sizeof(uint16_t); + + layout.weights_offset = 0; + layout.scales_offset = ((layout.weights_size + alignment - 1) / alignment) * alignment; + layout.biases_offset = layout.scales_offset + ((layout.scales_size + alignment - 1) / alignment) * alignment; + layout.total_size = layout.biases_offset + layout.biases_size; + layout.total_size = std::max(layout.total_size, ggml_nbytes(tensor)); + return layout; + } + } + + // Normal extraction (no requant) - determine format based on tensor type + switch (tensor->type) { + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q4_K: + layout.is_u4 = true; + layout.weights_per_block = 32; + break; + case GGML_TYPE_Q8_0: + layout.is_u4 = false; + layout.weights_per_block = 32; + break; + case GGML_TYPE_Q6_K: + layout.is_u4 = false; + layout.weights_per_block = 16; + break; + case GGML_TYPE_Q5_K: + layout.is_u4 = false; + layout.weights_per_block = 32; + break; + default: + // Unsupported quantization type + return layout; + } + + // Calculate sizes + // Weights: U4 = n_elements/2 bytes, U8 = n_elements bytes + layout.weights_size = layout.is_u4 ? 
(n_elements / 2) : n_elements; + + // Scales and biases: F16 per block + int64_t n_blocks = n_elements / layout.weights_per_block; + layout.scales_size = n_blocks * sizeof(uint16_t); // F16 = 2 bytes + layout.biases_size = n_blocks * sizeof(uint16_t); // F16 = 2 bytes + + // Layout in buffer: [weights | scales | biases] with alignment + layout.weights_offset = 0; + layout.scales_offset = ((layout.weights_size + alignment - 1) / alignment) * alignment; + layout.biases_offset = layout.scales_offset + ((layout.scales_size + alignment - 1) / alignment) * alignment; + layout.total_size = layout.biases_offset + layout.biases_size; + + return layout; +} diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp index e809d250f7..747d1b8a30 100644 --- a/ggml/src/ggml-openvino/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino.cpp @@ -3,18 +3,429 @@ #include "ggml-backend-impl.h" #include "ggml-backend.h" #include "ggml-impl.h" +#include "ggml-openvino-extra.h" #include "ggml-openvino/utils.h" +#include "ggml-quants.hpp" #include "ggml.h" #include +#include +#include #include #include +#include #include #include #include #define GGML_OPENVINO_MAX_STREAMS 8 +// OpenVINO buffer alignment (same as CPU for compatibility) +#define GGML_OPENVINO_BUFFER_ALIGNMENT 64 + +// ===================================================== +// OpenVINO Buffer Implementation using ov::Tensor +// ===================================================== +// +// Design: This implementation uses a hybrid approach: +// 1. For weight tensors: Store a pre-built ov::op::v0::Constant in tensor->extra +// - This avoids the memcpy during graph construction +// - For quantized weights, the constant is already converted to OpenVINO format +// 2. For KV cache / compute tensors: Store an ov::Tensor in tensor->extra +// - This can be directly passed to infer_request +// - Future: can be changed to ov::RemoteTensor for GPU/NPU +// +// This design is similar to: +// - CUDA split buffer: tensor->extra stores device pointers +// - CPU repack buffer: tensor->extra stores tensor_traits with repacked data +// ===================================================== + +// Buffer context that manages per-tensor allocations (no contiguous buffer for weights) +struct ggml_backend_openvino_buffer_context { + int device; + std::string name; + + // For non-weight buffers (KV cache, compute), we still use contiguous allocation + void * data; + size_t size; + bool is_weight_buffer; // Set when buffer usage is set to WEIGHTS + + // Track all extras for cleanup + std::vector tensor_extras; + + ggml_backend_openvino_buffer_context(int device, size_t size) : + device(device), + name(std::string(GGML_OPENVINO_NAME) + std::to_string(device)), + data(nullptr), + size(size), + is_weight_buffer(false) { + // Allocate aligned contiguous memory + if (size > 0) { +#ifdef _WIN32 + data = _aligned_malloc(size, GGML_OPENVINO_BUFFER_ALIGNMENT); +#else + data = aligned_alloc(GGML_OPENVINO_BUFFER_ALIGNMENT, size); +#endif + if (data == nullptr) { + GGML_LOG_ERROR("%s: failed to allocate %zu bytes\n", __func__, size); + } + } + } + + ~ggml_backend_openvino_buffer_context() { + // Clean up all tensor extras + for (auto * extra : tensor_extras) { + delete extra; + } + tensor_extras.clear(); + + // Free contiguous memory + if (data != nullptr) { +#ifdef _WIN32 + _aligned_free(data); +#else + free(data); +#endif + data = nullptr; + } + } +}; + +// Buffer type context (per-device) +struct ggml_backend_openvino_buffer_type_context { + 
int device; + std::string name; +}; + +// Buffer interface functions +static void ggml_backend_openvino_buffer_free_buffer(ggml_backend_buffer_t buffer) { + ggml_backend_openvino_buffer_context * ctx = (ggml_backend_openvino_buffer_context *) buffer->context; + delete ctx; +} + +static void * ggml_backend_openvino_buffer_get_base(ggml_backend_buffer_t buffer) { + ggml_backend_openvino_buffer_context * ctx = (ggml_backend_openvino_buffer_context *) buffer->context; + return ctx->data; +} + +static enum ggml_status ggml_backend_openvino_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { + // Views share the extra from view_src + if (tensor->view_src != nullptr) { + GGML_ASSERT(tensor->view_src->buffer->buft == buffer->buft); + if (tensor->view_src->extra != nullptr) { + tensor->extra = tensor->view_src->extra; + } + return GGML_STATUS_SUCCESS; + } + + // For non-view tensors, tensor->extra will be set in set_tensor + // when the actual weight data is loaded + GGML_UNUSED(buffer); + return GGML_STATUS_SUCCESS; +} + +static void ggml_backend_openvino_buffer_memset_tensor(ggml_backend_buffer_t buffer, + ggml_tensor * tensor, + uint8_t value, + size_t offset, + size_t size) { + GGML_ASSERT(tensor != nullptr && tensor->data != nullptr); + memset((char *) tensor->data + offset, value, size); + GGML_UNUSED(buffer); +} + +static void ggml_backend_openvino_buffer_set_tensor(ggml_backend_buffer_t buffer, + ggml_tensor * tensor, + const void * data, + size_t offset, + size_t size) { + GGML_ASSERT(tensor != nullptr && tensor->data != nullptr); + ggml_backend_openvino_buffer_context * ctx = (ggml_backend_openvino_buffer_context *) buffer->context; + + // Check if this is a weight buffer (usage is set BEFORE set_tensor is called) + bool is_weight_buffer = (buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS); + // Full tensor set: offset=0, full size, not a view + bool is_full_tensor_set = (offset == 0 && size == ggml_nbytes(tensor) && tensor->view_src == nullptr); + // 2D tensor (typical weight shape) + bool is_2d = (tensor->ne[2] == 1 && tensor->ne[3] == 1); + + // Check if this is a quantized weight tensor that needs extraction/requantization + ggml_openvino_extracted_layout layout = {}; + if (is_weight_buffer && is_full_tensor_set && is_2d && ggml_is_quantized(tensor->type)) { + layout = ggml_openvino_get_extracted_layout(tensor); + } + + if (layout.total_size > 0) { + uint8_t * buf_base = (uint8_t *) tensor->data; + + // 2D shape for weights [rows, cols] + ov::Shape weight_shape = {static_cast(tensor->ne[1]), static_cast(tensor->ne[0])}; + + try { + std::shared_ptr constant; + + if (layout.is_requant && layout.requant_type.has_value()) { + // Requantization path + if (layout.requant_type.value() == ExtraQuantType::F16) { + // Requant to F16: create F16 tensor with external memory, requantize fills it + ov::Tensor weights(ov::element::f16, weight_shape, buf_base); + ov::Tensor dummy_scales, dummy_biases; // Not used for F16 + // requantize_to_buffers fills weights and returns a Constant wrapping it + constant = requantize_to_buffers(tensor, data, ExtraQuantType::F16, 0, weights, dummy_scales, + dummy_biases); + + // Store in tensor->extra (use weight_extra since it's F16) + auto * extra = new ggml_openvino_weight_extra(constant); + ctx->tensor_extras.push_back(extra); + tensor->extra = extra; + + GGML_LOG_DEBUG("%s: requantized %s to F16\n", __func__, tensor->name); + } else { + // Requant to quantized format (Q4_0_128, Q8_0_32, etc.) 
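+                    // The three ov::Tensor views constructed below alias slices of the backend buffer +                    // at the offsets computed by ggml_openvino_get_extracted_layout +                    // ([weights | scales | biases], 64-byte aligned), so requantize_to_buffers writes +                    // the requantized data directly into backend-owned memory instead of a separate copy; +                    // u4 packs two 4-bit weights per byte, and scales/biases hold one F16 per block.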
+ ov::element::Type weight_type = layout.is_u4 ? ov::element::u4 : ov::element::u8; + ov::Shape scale_shape = {static_cast(tensor->ne[1]), + static_cast(tensor->ne[0] / layout.weights_per_block)}; + + ov::Tensor weights(weight_type, weight_shape, buf_base + layout.weights_offset); + ov::Tensor scales(ov::element::f16, scale_shape, buf_base + layout.scales_offset); + ov::Tensor biases(ov::element::f16, scale_shape, buf_base + layout.biases_offset); + + constant = requantize_to_buffers(tensor, data, layout.requant_type.value(), + layout.weights_per_block, weights, scales, biases); + + // Store in tensor->extra + auto * extra = new ggml_openvino_quantized_weight_extra(std::move(weights), std::move(scales), + std::move(biases), constant); + ctx->tensor_extras.push_back(extra); + tensor->extra = extra; + + GGML_LOG_DEBUG("%s: requantized %s to %s (u%d, block_size=%ld)\n", __func__, tensor->name, + layout.requant_type.value() == ExtraQuantType::Q4_0_128 ? "Q4_0_128" : "Q8_0_32", + layout.is_u4 ? 4 : 8, layout.weights_per_block); + } + } else { + // Normal extraction path (no requant) + ov::element::Type weight_type = layout.is_u4 ? ov::element::u4 : ov::element::u8; + int64_t n_blocks = ggml_nelements(tensor) / layout.weights_per_block; + ov::Shape scale_shape = {static_cast(tensor->ne[1]), + static_cast(tensor->ne[0] / layout.weights_per_block)}; + + ov::Tensor weights(weight_type, weight_shape, buf_base + layout.weights_offset); + ov::Tensor scales(ov::element::f16, scale_shape, buf_base + layout.scales_offset); + ov::Tensor biases(ov::element::f16, scale_shape, buf_base + layout.biases_offset); + + constant = extract_quantized_weights(tensor, data, weights, scales, biases); + + // Store in tensor->extra + auto * extra = new ggml_openvino_quantized_weight_extra(std::move(weights), std::move(scales), + std::move(biases), constant); + ctx->tensor_extras.push_back(extra); + tensor->extra = extra; + + GGML_LOG_DEBUG("%s: extracted quantized constant for %s (u%d, %zu weights, %ld blocks)\n", __func__, + tensor->name, layout.is_u4 ? 
4 : 8, layout.weights_size, n_blocks); + } + + } catch (const std::exception & e) { + GGML_LOG_ERROR("%s: failed to process quantized data for %s: %s\n", __func__, tensor->name, e.what()); + // Fall back to storing raw data + memcpy((char *) tensor->data + offset, data, size); + } + } else if (is_weight_buffer && is_full_tensor_set && is_2d && + (tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_F16 || tensor->type == GGML_TYPE_BF16)) { + // F16/F32/BF16 weight tensor - copy data and create shared-memory constant + memcpy((char *) tensor->data + offset, data, size); + + try { + // Get OpenVINO element type + ov::element::Type element_type; + switch (tensor->type) { + case GGML_TYPE_F32: + element_type = ov::element::f32; + break; + case GGML_TYPE_F16: + element_type = ov::element::f16; + break; + case GGML_TYPE_BF16: + element_type = ov::element::bf16; + break; + default: + return; // Should not happen + } + + // Create 2D shape (OpenVINO expects [rows, cols]) + ov::Shape shape = {static_cast(tensor->ne[1]), static_cast(tensor->ne[0])}; + + // Create ov::Tensor with external memory, then wrap with Constant + ov::Tensor ov_tensor(element_type, shape, tensor->data); + auto constant = std::make_shared(ov_tensor); + constant->set_friendly_name(tensor->name); + + // Store in tensor->extra + ggml_openvino_weight_extra * extra = new ggml_openvino_weight_extra(constant); + ctx->tensor_extras.push_back(extra); + tensor->extra = extra; + + GGML_LOG_DEBUG("%s: created shared-memory constant for %s\n", __func__, tensor->name); + + } catch (const std::exception & e) { + GGML_LOG_DEBUG("%s: failed to create shared-memory constant for %s: %s\n", __func__, tensor->name, + e.what()); + } + } else { + // Non-weight tensor (KV cache, activations, etc.) - just copy data + memcpy((char *) tensor->data + offset, data, size); + } +} + +static void ggml_backend_openvino_buffer_get_tensor(ggml_backend_buffer_t buffer, + const ggml_tensor * tensor, + void * data, + size_t offset, + size_t size) { + GGML_ASSERT(tensor != nullptr && tensor->data != nullptr); + memcpy(data, (const char *) tensor->data + offset, size); + GGML_UNUSED(buffer); +} + +static bool ggml_backend_openvino_buffer_cpy_tensor(ggml_backend_buffer_t buffer, + const ggml_tensor * src, + ggml_tensor * dst) { + GGML_ASSERT(src != nullptr && dst != nullptr); + // Can copy from any host buffer (including other OpenVINO buffers) + if (ggml_backend_buffer_is_host(src->buffer)) { + memcpy(dst->data, src->data, ggml_nbytes(src)); + return true; + } + return false; + GGML_UNUSED(buffer); +} + +static void ggml_backend_openvino_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { + ggml_backend_openvino_buffer_context * ctx = (ggml_backend_openvino_buffer_context *) buffer->context; + if (ctx->data != nullptr) { + memset(ctx->data, value, ctx->size); + } +} + +static const ggml_backend_buffer_i ggml_backend_openvino_buffer_interface = { + /* .free_buffer = */ ggml_backend_openvino_buffer_free_buffer, + /* .get_base = */ ggml_backend_openvino_buffer_get_base, + /* .init_tensor = */ ggml_backend_openvino_buffer_init_tensor, + /* .memset_tensor = */ ggml_backend_openvino_buffer_memset_tensor, + /* .set_tensor = */ ggml_backend_openvino_buffer_set_tensor, + /* .get_tensor = */ ggml_backend_openvino_buffer_get_tensor, + /* .cpy_tensor = */ ggml_backend_openvino_buffer_cpy_tensor, + /* .clear = */ ggml_backend_openvino_buffer_clear, + /* .reset = */ NULL, +}; + +// Buffer type interface functions +static const char * 
ggml_backend_openvino_buffer_type_get_name(ggml_backend_buffer_type_t buft) { + ggml_backend_openvino_buffer_type_context * ctx = (ggml_backend_openvino_buffer_type_context *) buft->context; + return ctx->name.c_str(); +} + +static ggml_backend_buffer_t ggml_backend_openvino_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, + size_t size) { + ggml_backend_openvino_buffer_type_context * buft_ctx = (ggml_backend_openvino_buffer_type_context *) buft->context; + + // Create buffer context with contiguous memory allocation + ggml_backend_openvino_buffer_context * ctx = new ggml_backend_openvino_buffer_context(buft_ctx->device, size); + + if (ctx->data == nullptr && size > 0) { + GGML_LOG_ERROR("%s: failed to allocate buffer of size %zu\n", __func__, size); + delete ctx; + return nullptr; + } + + return ggml_backend_buffer_init(buft, ggml_backend_openvino_buffer_interface, ctx, size); +} + +static size_t ggml_backend_openvino_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { + GGML_UNUSED(buft); + return GGML_OPENVINO_BUFFER_ALIGNMENT; +} + +static size_t ggml_backend_openvino_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { + GGML_UNUSED(buft); + return SIZE_MAX; +} + +static size_t ggml_backend_openvino_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, + const ggml_tensor * tensor) { + GGML_UNUSED(buft); + + // For quantized 2D tensors (weights), we need extra space for extracted data + if (ggml_is_quantized(tensor->type) && tensor->ne[2] == 1 && tensor->ne[3] == 1) { + ggml_openvino_extracted_layout layout = ggml_openvino_get_extracted_layout(tensor); + if (layout.total_size > 0) { + GGML_LOG_DEBUG( + "%s: tensor %s needs %zu bytes (original %zu, extracted: weights=%zu scales=%zu biases=%zu)\n", + __func__, tensor->name, layout.total_size, ggml_nbytes(tensor), layout.weights_size, layout.scales_size, + layout.biases_size); + return layout.total_size; + } + } + + return ggml_nbytes(tensor); +} + +static bool ggml_backend_openvino_buffer_type_is_host(ggml_backend_buffer_type_t buft) { + GGML_UNUSED(buft); + // Currently using host memory via ov::Tensor + // This will be false when using GPU/NPU remote tensors + return true; +} + +static const ggml_backend_buffer_type_i ggml_backend_openvino_buffer_type_interface = { + /* .get_name = */ ggml_backend_openvino_buffer_type_get_name, + /* .alloc_buffer = */ ggml_backend_openvino_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_openvino_buffer_type_get_alignment, + /* .get_max_size = */ ggml_backend_openvino_buffer_type_get_max_size, + /* .get_alloc_size = */ ggml_backend_openvino_buffer_type_get_alloc_size, + /* .is_host = */ ggml_backend_openvino_buffer_type_is_host, +}; + +// Get buffer type for a specific device +GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_openvino_buffer_type(int device) { + GGML_ASSERT(device >= 0 && device < ggml_backend_openvino_get_device_count()); + + static std::mutex mutex; + std::lock_guard lock(mutex); + + static std::vector buffer_types; + static std::vector buffer_type_contexts; + + if (buffer_types.empty()) { + int device_count = ggml_backend_openvino_get_device_count(); + buffer_types.resize(device_count); + buffer_type_contexts.resize(device_count); + + for (int i = 0; i < device_count; i++) { + buffer_type_contexts[i].device = i; + buffer_type_contexts[i].name = std::string(GGML_OPENVINO_NAME) + std::to_string(i); + + buffer_types[i] = ggml_backend_buffer_type{ + /* .iface = */ ggml_backend_openvino_buffer_type_interface, + /* .device = */ 
ggml_backend_reg_dev_get(ggml_backend_openvino_reg(), i), + /* .context = */ &buffer_type_contexts[i], + }; + } + } + + return &buffer_types[device]; +} + +// Check if a buffer is an OpenVINO buffer +static bool ggml_backend_buffer_is_openvino(ggml_backend_buffer_t buffer) { + return buffer->iface.free_buffer == ggml_backend_openvino_buffer_free_buffer; +} + +// ===================================================== +// OpenVINO Backend Context and Interface +// ===================================================== + struct ggml_backend_openvino_context { int device; // the device ID currently in use std::string name; // context Name @@ -111,13 +522,6 @@ GGML_BACKEND_API bool ggml_backend_is_openvino(ggml_backend_t backend) { return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_openvino_guid()); } -// device buffer -GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_openvino_buffer_type(int device) { - GGML_ASSERT(device >= 0); - return ggml_backend_cpu_buffer_type(); - GGML_UNUSED(device); -} - struct ggml_backend_openvino_device_context { int device; std::string name; @@ -350,7 +754,8 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con } static bool ggml_backend_openvino_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { - return ggml_backend_buft_is_host(buft); + // Support our own buffer type and any host buffer (for mmap'd files, etc.) + return buft->iface.get_name == ggml_backend_openvino_buffer_type_get_name || ggml_backend_buft_is_host(buft); GGML_UNUSED(dev); } @@ -410,6 +815,10 @@ static int get_openvino_device_count() { } static ggml_openvino_device_info ggml_openvino_init() { + // Initialize device config singleton from env var + ggml_openvino_init_device_config(); + GGML_LOG_INFO("OpenVINO: using device %s\n", ggml_openvino_get_device_name().c_str()); + ggml_openvino_device_info info = {}; info.device_count = get_openvino_device_count(); return info; diff --git a/ggml/src/ggml-openvino/ggml-quants.cpp b/ggml/src/ggml-openvino/ggml-quants.cpp index 2076c3c75d..662f27be7a 100644 --- a/ggml/src/ggml-openvino/ggml-quants.cpp +++ b/ggml/src/ggml-openvino/ggml-quants.cpp @@ -418,11 +418,124 @@ ov::Output make_int4_weights(ov::Tensor & weight, return std::make_shared(w_zp_s, ov::element::f32); } -std::shared_ptr requantize(const ggml_tensor * tensor, ExtraQuantType requant_type) { - std::vector weights_f32(tensor->ne[0] * tensor->ne[1]); - ggml_get_type_traits(tensor->type)->to_float(tensor->data, weights_f32.data(), ggml_nelements(tensor)); +// Extract quantized weights from tensor and create weight subgraph +std::shared_ptr extract_quantized_weights(const ggml_tensor * tensor, + const void * data, + ov::Tensor & weights, + ov::Tensor & scales, + ov::Tensor & biases) { + // Create a temporary tensor for extraction functions that read from tensor->data + ggml_tensor temp_tensor = *tensor; + temp_tensor.data = const_cast(data); - std::shared_ptr weight_node; + // Determine block size based on tensor type + int64_t weights_per_block; + bool is_u4; + switch (tensor->type) { + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q4_K: + is_u4 = true; + weights_per_block = 32; + break; + case GGML_TYPE_Q8_0: + case GGML_TYPE_Q5_K: + is_u4 = false; + weights_per_block = 32; + break; + case GGML_TYPE_Q6_K: + is_u4 = false; + weights_per_block = 16; + break; + default: + throw std::runtime_error("Unsupported quantized type for extraction: " + + std::string(ggml_type_name(tensor->type))); + } + + // 
Extract quantized data + switch (tensor->type) { + case GGML_TYPE_Q4_0: + extract_q4_0_data(&temp_tensor, weights, scales, biases); + break; + case GGML_TYPE_Q4_1: + extract_q4_1_data(&temp_tensor, weights, scales, biases); + break; + case GGML_TYPE_Q4_K: + extract_q4_k_data(&temp_tensor, weights, scales, biases); + break; + case GGML_TYPE_Q8_0: + extract_q8_0_data(&temp_tensor, weights, scales, biases); + break; + case GGML_TYPE_Q6_K: + extract_q6_k_data(&temp_tensor, weights, scales, biases); + break; + case GGML_TYPE_Q5_K: + extract_q5_k_data(&temp_tensor, weights, scales, biases); + break; + default: + throw std::runtime_error("Unsupported quantized type: " + std::string(ggml_type_name(tensor->type))); + } + + // Create the OpenVINO weight subgraph + ov::Output weight_node; + if (is_u4) { + weight_node = make_int4_weights(weights, scales, biases, weights_per_block); + } else { + weight_node = make_int8_weights(weights, scales, biases, weights_per_block); + } + + auto result = weight_node.get_node_shared_ptr(); + result->set_friendly_name(tensor->name); + return result; +} + +// Requantize weights to target format, writing to provided buffers +std::shared_ptr requantize_to_buffers(const ggml_tensor * tensor, + const void * data, + ExtraQuantType requant_type, + int64_t block_size, + ov::Tensor & weights, + ov::Tensor & scales, + ov::Tensor & biases) { + int64_t n_elements = ggml_nelements(tensor); + + // First dequantize to F32 + std::vector weights_f32(n_elements); + ggml_get_type_traits(tensor->type)->to_float(data, weights_f32.data(), n_elements); + + // Handle F16 case - just convert and create constant + if (requant_type == ExtraQuantType::F16) { + ggml_get_type_traits(GGML_TYPE_F16)->from_float_ref(weights_f32.data(), weights.data(), n_elements); + auto result = std::make_shared(weights); + result->set_friendly_name(tensor->name); + return result; + } + + // Requantize to target quantized format + bool is_u4 = (requant_type == ExtraQuantType::Q4_0_C || requant_type == ExtraQuantType::Q4_0_128); + + if (is_u4) { + quantize_q4_0(weights_f32.data(), weights, scales, biases, n_elements, block_size); + } else if (requant_type == ExtraQuantType::Q8_1_C) { + quantize_q8_1(weights_f32.data(), weights, scales, biases, n_elements, block_size); + } else { + quantize_q8_0(weights_f32.data(), weights, scales, biases, n_elements, block_size); + } + + // Create the OpenVINO weight subgraph + ov::Output weight_node; + if (is_u4) { + weight_node = make_int4_weights(weights, scales, biases, block_size); + } else { + weight_node = make_int8_weights(weights, scales, biases, block_size); + } + + auto result = weight_node.get_node_shared_ptr(); + result->set_friendly_name(tensor->name); + return result; +} + +std::shared_ptr requantize(const ggml_tensor * tensor, ExtraQuantType requant_type) { ov::Shape node_shape = {(uint64_t) (tensor->ne[1]), (uint64_t) (tensor->ne[0])}; // FIXME hardcoded workaround to fix the case where token_emb.weight is q4_0 (instead of q6_k) @@ -432,42 +545,28 @@ std::shared_ptr requantize(const ggml_tensor * tensor, ExtraQuantType requant_type = ExtraQuantType::F16; } - if (requant_type == ExtraQuantType::F16) { - ov::Tensor weights(ov::element::f16, node_shape); - ggml_get_type_traits(GGML_TYPE_F16)->from_float_ref(weights_f32.data(), weights.data(), ggml_nelements(tensor)); - std::shared_ptr weight_node = std::make_shared(weights); - weight_node->set_friendly_name(tensor->name); - return weight_node; - } - + // Determine block size int64_t block_size = node_shape[1]; if 
(requant_type == ExtraQuantType::Q4_0_128) { block_size = 128; } else if (requant_type == ExtraQuantType::Q8_0_32) { block_size = 32; } - auto scales_shape = ov::Shape{node_shape[0], node_shape[1] / block_size}; - ov::Tensor weights; - ov::Tensor scales(ov::element::f16, scales_shape); - ov::Tensor bias(ov::element::f16, scales_shape); - - if (requant_type == ExtraQuantType::Q4_0_C || requant_type == ExtraQuantType::Q4_0_128) { - weights = ov::Tensor(ov::element::u4, node_shape); - quantize_q4_0(weights_f32.data(), weights, scales, bias, weights.get_size(), block_size); - weight_node = make_int4_weights(weights, scales, bias, block_size).get_node_shared_ptr(); - } else if (requant_type == ExtraQuantType::Q8_1_C) { - weights = ov::Tensor(ov::element::u8, node_shape); - quantize_q8_1(weights_f32.data(), weights, scales, bias, weights.get_size(), block_size); - weight_node = make_int8_weights(weights, scales, bias, block_size).get_node_shared_ptr(); - } else if (requant_type == ExtraQuantType::Q8_0_C || requant_type == ExtraQuantType::Q8_0_32) { - weights = ov::Tensor(ov::element::u8, node_shape); - quantize_q8_0(weights_f32.data(), weights, scales, bias, weights.get_size(), block_size); - weight_node = make_int8_weights(weights, scales, bias, block_size).get_node_shared_ptr(); + // Allocate tensors + ov::Tensor weights, scales, biases; + if (requant_type == ExtraQuantType::F16) { + weights = ov::Tensor(ov::element::f16, node_shape); + } else { + bool is_u4 = (requant_type == ExtraQuantType::Q4_0_C || requant_type == ExtraQuantType::Q4_0_128); + ov::element::Type weight_type = is_u4 ? ov::element::u4 : ov::element::u8; + ov::Shape scales_shape = {node_shape[0], node_shape[1] / block_size}; + weights = ov::Tensor(weight_type, node_shape); + scales = ov::Tensor(ov::element::f16, scales_shape); + biases = ov::Tensor(ov::element::f16, scales_shape); } - weight_node->set_friendly_name(tensor->name); - return weight_node; + return requantize_to_buffers(tensor, tensor->data, requant_type, block_size, weights, scales, biases); } void quantize_q4_0(const float * x, diff --git a/ggml/src/ggml-openvino/ggml-quants.hpp b/ggml/src/ggml-openvino/ggml-quants.hpp index 71ae317a39..0f14a6ed2d 100644 --- a/ggml/src/ggml-openvino/ggml-quants.hpp +++ b/ggml/src/ggml-openvino/ggml-quants.hpp @@ -1,10 +1,11 @@ #pragma once +#include "ggml-openvino-extra.h" // For ExtraQuantType +#include "ggml.h" + #include #include #include -#include "ggml.h" - void unpack_32_4(const uint8_t* data, uint8_t* dst); void extract_q4_0_data(const ggml_tensor* tensor, @@ -51,10 +52,32 @@ ov::Output make_int4_weights(ov::Tensor& weight, ov::Tensor& biases, size_t group_size = GGML_QUANTIZATION_GROUP_SIZE); -enum class ExtraQuantType { F16, Q4_0_C, Q8_1_C, Q4_0_128, Q8_0_C, Q8_0_32 }; +// ExtraQuantType is defined in ggml-openvino-extra.h std::shared_ptr requantize(const ggml_tensor* tensor, ExtraQuantType requant_type); +// Extract quantized weights from tensor and create weight subgraph +// If weights/scales/biases are provided (non-empty), uses them as output buffers +// Otherwise allocates new ov::Tensors internally +// Returns the weight node (make_int4_weights or make_int8_weights result) +std::shared_ptr extract_quantized_weights( + const ggml_tensor * tensor, + const void * data, // Source data pointer (may differ from tensor->data) + ov::Tensor & weights, + ov::Tensor & scales, + ov::Tensor & biases); + +// Requantize weights from tensor to target format, writing to provided buffers +// For F16 target, only weights buffer is 
used (scales/biases ignored) +// Returns the weight node +std::shared_ptr requantize_to_buffers(const ggml_tensor * tensor, + const void * data, // Source data pointer + ExtraQuantType requant_type, + int64_t block_size, + ov::Tensor & weights, + ov::Tensor & scales, + ov::Tensor & biases); + void quantize_q4_0(const float* x, ov::Tensor& weights_arr, ov::Tensor& scales_arr, ov::Tensor& biases_arr, int64_t k, int64_t qk); void quantize_q8_1(const float* x, ov::Tensor& weights_arr, ov::Tensor& scales_arr, ov::Tensor& biases_arr, int64_t k, diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 836e366fd7..251fb82361 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -1,6 +1,7 @@ #include "utils.h" #include "ggml-impl.h" +#include "ggml-openvino-extra.h" #include "ggml-openvino/ggml-decoder.h" #include "ggml.h" #include "openvino/frontend.hpp" @@ -39,23 +40,14 @@ static ov::Core core; enum ggml_status ov_graph_compute(ggml_cgraph * cgraph) { - auto get_device = [&] { - std::string device = getenv("GGML_OPENVINO_DEVICE") ? getenv("GGML_OPENVINO_DEVICE") : "CPU"; - auto available_devices = core.get_available_devices(); - if (std::find(available_devices.begin(), available_devices.end(), device) == available_devices.end()) { - GGML_LOG_WARN("GGML OpenVINO Backend: device %s is not available, fallback to CPU\n", device.c_str()); - device = "CPU"; - } - return device; - }; - if (getenv("GGML_OPENVINO_DUMP_CGRAPH")) { std::string filename = "cgraph.txt"; GgmlOvDecoder::dump_cgraph(cgraph, filename); } - static const auto device = get_device(); - static const auto is_static = device == "NPU" ? true : false; + // Use device from singleton (initialized during backend init) + const auto & device = ggml_openvino_get_device_name(); + const auto is_static = ggml_openvino_is_npu(); return is_static ? 
ov_graph_compute_static(cgraph) : ov_graph_compute_dynamic(cgraph, device); } @@ -413,7 +405,8 @@ ov::AnyMap get_ov_compile_config(const std::string & device) { } std::map get_types_to_requant(const std::string & device) { - if (device == "NPU") { + // Use singleton to check if NPU (device param kept for API compatibility) + if (ggml_openvino_is_npu()) { return { {GGML_TYPE_Q4_0, ExtraQuantType::Q4_0_128}, {GGML_TYPE_Q4_1, ExtraQuantType::Q4_0_128}, @@ -423,6 +416,7 @@ std::map get_types_to_requant(const std::string & dev }; } return {}; + GGML_UNUSED(device); } bool is_naive(ggml_cgraph * cgraph) { From 72bba828dfc539ec1ed979d2acf587b3325219bc Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Thu, 18 Dec 2025 17:03:03 +0800 Subject: [PATCH 206/254] Use shared_buffer for GPU NPU; Refactor --- ggml/src/ggml-openvino/CMakeLists.txt | 3 +- ggml/src/ggml-openvino/ggml-decoder.cpp | 78 ++------ ggml/src/ggml-openvino/ggml-decoder.h | 7 +- .../src/ggml-openvino/ggml-openvino-extra.cpp | 177 ++++++++++++++++++ ggml/src/ggml-openvino/ggml-openvino-extra.h | 159 ++-------------- ggml/src/ggml-openvino/ggml-openvino.cpp | 154 +++++++-------- ggml/src/ggml-openvino/ggml-quants.cpp | 106 +++++++++++ ggml/src/ggml-openvino/ggml-quants.hpp | 10 + ggml/src/ggml-openvino/utils.cpp | 19 +- ggml/src/ggml-openvino/utils.h | 2 - 10 files changed, 389 insertions(+), 326 deletions(-) create mode 100644 ggml/src/ggml-openvino/ggml-openvino-extra.cpp diff --git a/ggml/src/ggml-openvino/CMakeLists.txt b/ggml/src/ggml-openvino/CMakeLists.txt index 3051a8b240..175b585661 100644 --- a/ggml/src/ggml-openvino/CMakeLists.txt +++ b/ggml/src/ggml-openvino/CMakeLists.txt @@ -1,4 +1,5 @@ find_package(OpenVINO REQUIRED) +find_package(OpenCL REQUIRED) include("${OpenVINO_DIR}/../3rdparty/tbb/lib/cmake/TBB/TBBConfig.cmake") @@ -10,7 +11,7 @@ ggml_add_backend_library(ggml-openvino ${GGML_HEADERS_OPENVINO} ) -target_link_libraries(ggml-openvino PRIVATE openvino::runtime TBB::tbb) +target_link_libraries(ggml-openvino PRIVATE openvino::runtime TBB::tbb OpenCL::OpenCL) if (GGML_OPENVINO) if (CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64") diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 409a16e816..2d6437f069 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -3,6 +3,7 @@ #include "ggml-backend-impl.h" #include "ggml-backend.h" #include "ggml-openvino-extra.h" +#include "ggml-openvino.h" #include "ggml-quants.hpp" #include @@ -471,9 +472,7 @@ const ggml_tensor * GgmlOvDecoder::get_tensor_from_name(const std::string & name // return kv_param_res_names; // } -std::map> GgmlOvDecoder::create_weight_nodes( - ggml_cgraph * cgraph, - std::map types_to_requantize) { +std::map> GgmlOvDecoder::create_weight_nodes(ggml_cgraph * cgraph) { std::map> model_weights; static std::mutex weights_mutex; auto * nodes = cgraph->nodes; @@ -498,10 +497,7 @@ std::map> GgmlOvDecoder::create_weight_no } } if (should_create) { - auto requant_type = types_to_requantize.count(src->type) ? 
- std::optional(types_to_requantize.at(src->type)) : - std::nullopt; - auto weight_node = create_weight_node(src, requant_type); + auto weight_node = create_weight_node(src); weight_node->set_friendly_name(src_name); { std::lock_guard lock(weights_mutex); @@ -520,11 +516,14 @@ std::map> GgmlOvDecoder::create_weight_no static std::unordered_map> s_quantized_weight_cache; static std::mutex s_quantized_weight_cache_mutex; -std::shared_ptr GgmlOvDecoder::create_weight_node(ggml_tensor * tensor, - std::optional requant_type) { +std::shared_ptr GgmlOvDecoder::create_weight_node(ggml_tensor * tensor) { // Check if we have a pre-built constant from the OpenVINO backend buffer // This is set during ggml_backend_openvino_buffer_set_tensor - if (tensor->extra != nullptr && !requant_type.has_value()) { + if (tensor->extra) { + if (!ggml_backend_buffer_is_openvino(tensor->buffer)) { + OPENVINO_ASSERT(false, "Unsupported weight tensor: " + std::string(tensor->name) + + " Possibly this is a cpu backend repacked quantized weights"); + } // Cast to our extra base type and check the type auto * extra_base = static_cast(tensor->extra); @@ -547,7 +546,7 @@ std::shared_ptr GgmlOvDecoder::create_weight_node(ggml_tensor * tensor // Fallback: Check static cache for quantized weights (keyed by data pointer) // This handles cases where tensors weren't loaded through OpenVINO buffer - if (ggml_is_quantized(tensor->type) && !requant_type.has_value()) { + if (ggml_is_quantized(tensor->type)) { std::lock_guard lock(s_quantized_weight_cache_mutex); auto it = s_quantized_weight_cache.find(tensor->data); if (it != s_quantized_weight_cache.end()) { @@ -565,64 +564,11 @@ std::shared_ptr GgmlOvDecoder::create_weight_node(ggml_tensor * tensor ggml_type_name(tensor->type)); } - auto node_type = get_ov_type(tensor); - auto node_shape = get_shape(tensor); - auto ne_total = ggml_nelements(tensor); - - OPENVINO_ASSERT(node_shape[0] == 1, "Got 4D weights, expect all weights to be 2D: ", tensor->name); - node_shape.erase(node_shape.begin()); - OPENVINO_ASSERT(node_shape[0] == 1, "Got 3D weights, expect all weights to be 2D: ", tensor->name); - node_shape.erase(node_shape.begin()); - - // F16 and F32 case - if (node_type != ov::element::dynamic) { - ov::Tensor weights(node_type, node_shape); - memcpy(weights.data(), tensor->data, ne_total * node_type.size()); - std::shared_ptr weight_node = std::make_shared(weights); - // Disabled because it triggers a bug in NPUW, no performance impact on CPU GPU - // if (node_type == ov::element::f16) { - // weight_node = std::make_shared(weight_node, ov::element::f32); - // } - weight_node->set_friendly_name(tensor->name); - return weight_node; - } - - // Quantized case - extra should be nullptr (not our type) - // Our ggml_openvino_weight_extra is only set for F16/F32 weights - if (tensor->extra != nullptr) { - // Check if it's our type - if so, something is wrong - auto * extra_base = static_cast(tensor->extra); - if (extra_base->type == ggml_openvino_extra_base::Type::WEIGHT || - extra_base->type == ggml_openvino_extra_base::Type::TENSOR) { - OPENVINO_ASSERT(false, "Quantized weight tensor has unexpected extra type: " + std::string(tensor->name)); - } - // Otherwise it might be repacked quantized weights from another backend - OPENVINO_ASSERT(false, "Unsupported weight tensor: " + std::string(tensor->name) + - " Possibly this is a repacked quantized weights"); - } - - if (requant_type.has_value()) { - return requantize(tensor, requant_type.value()); - } - - // Extract quantized weights using 
the shared function - auto layout = ggml_openvino_get_extracted_layout(tensor); - if (layout.total_size == 0) { - OPENVINO_THROW("Unsupported quantized type for ", tensor->name, " type=", ggml_type_name(tensor->type)); - } - - ov::element::Type weight_type = layout.is_u4 ? ov::element::u4 : ov::element::u8; - ov::Shape scale_shape = {node_shape[0], node_shape[1] / layout.weights_per_block}; - - ov::Tensor weights(weight_type, node_shape); - ov::Tensor scales(ov::element::f16, scale_shape); - ov::Tensor biases(ov::element::f16, scale_shape); - - auto result = extract_quantized_weights(tensor, tensor->data, weights, scales, biases); + std::shared_ptr result = process_weight_tensor(tensor, tensor->data, nullptr); result->set_friendly_name(tensor->name); // Cache the quantized weight node for future reuse - if (ggml_is_quantized(tensor->type) && !requant_type.has_value()) { + if (ggml_is_quantized(tensor->type)) { std::lock_guard lock(s_quantized_weight_cache_mutex); s_quantized_weight_cache[tensor->data] = result; GGML_LOG_DEBUG("%s: cached quantized constant for %s\n", __func__, tensor->name); diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index edcd036785..0b302b9320 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -179,12 +179,9 @@ public: static void dump_cgraph(const ggml_cgraph * cgraph, std::string & filename); - static std::shared_ptr create_weight_node(ggml_tensor * tensor, - std::optional requant_type = std::nullopt); + static std::shared_ptr create_weight_node(ggml_tensor * tensor); - static std::map> create_weight_nodes( - ggml_cgraph * cgraph, - std::map types_to_requantize = {}); + static std::map> create_weight_nodes(ggml_cgraph * cgraph); const ggml_tensor * get_tensor_used_op(const ggml_tensor * tensor) const; diff --git a/ggml/src/ggml-openvino/ggml-openvino-extra.cpp b/ggml/src/ggml-openvino/ggml-openvino-extra.cpp new file mode 100644 index 0000000000..75b27c8fa8 --- /dev/null +++ b/ggml/src/ggml-openvino/ggml-openvino-extra.cpp @@ -0,0 +1,177 @@ +#include "ggml-openvino-extra.h" + +#include "ggml-impl.h" + +ov::Core & ov_singleton_core() { + static ov::Core core; + return core; +} + +// ===================================================== +// Device Configuration Implementations +// ===================================================== + +void ggml_openvino_device_config::init() { + if (initialized) { + return; + } + device_name = getenv("GGML_OPENVINO_DEVICE") ? 
getenv("GGML_OPENVINO_DEVICE") : "CPU"; + auto available_devices = ov_singleton_core().get_available_devices(); + if (std::find(available_devices.begin(), available_devices.end(), device_name) == available_devices.end()) { + GGML_LOG_WARN("GGML OpenVINO Backend: device %s is not available, fallback to CPU\n", device_name.c_str()); + device_name = "CPU"; + } + is_npu = (device_name == "NPU"); + initialized = true; +} + +// Get the global device config singleton +ggml_openvino_device_config & ggml_openvino_get_device_config() { + static ggml_openvino_device_config config; + return config; +} + +// Initialize device config (call during backend init) +void ggml_openvino_init_device_config() { + ggml_openvino_get_device_config().init(); +} + +// Get the device name +const std::string & ggml_openvino_get_device_name() { + return ggml_openvino_get_device_config().device_name; +} + +// Check if running on NPU +bool ggml_openvino_is_npu() { + return ggml_openvino_get_device_config().is_npu; +} + +// Get requantization type for a tensor type (returns nullopt if no requant needed) +std::optional ggml_openvino_get_requant_type(ggml_type type) { + if (!ggml_openvino_is_npu()) { + return std::nullopt; + } + // NPU requantization rules + switch (type) { + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q4_K: + return ExtraQuantType::Q4_0_128; + case GGML_TYPE_Q6_K: + case GGML_TYPE_Q5_K: + return ExtraQuantType::F16; + default: + return std::nullopt; + } +} + +// ===================================================== +// Extracted Layout Calculation +// ===================================================== + +ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_tensor * tensor) { + ggml_openvino_extracted_layout layout = {}; + + if (!ggml_is_quantized(tensor->type)) { + return layout; + } + + // Only handle 2D weight tensors + if (tensor->ne[2] != 1 || tensor->ne[3] != 1) { + return layout; + } + + int64_t n_elements = ggml_nelements(tensor); + const size_t alignment = 64; // Good for SIMD + + // Check if requantization is needed (NPU-specific) + auto requant_type = ggml_openvino_get_requant_type(tensor->type); + if (requant_type.has_value()) { + layout.is_requant = true; + layout.requant_type = requant_type; + + // Special case: requant to F16 - just store F16 weights, no scales/biases + if (requant_type.value() == ExtraQuantType::F16) { + layout.weights_size = n_elements * sizeof(uint16_t); // F16 = 2 bytes + layout.total_size = layout.weights_size; + layout.weights_offset = 0; + // No scales/biases for F16 + return layout; + } + + // Requant to different quantized format (e.g., Q4_0_128) + switch (requant_type.value()) { + case ExtraQuantType::Q4_0_128: + layout.is_u4 = true; + layout.weights_per_block = 128; + break; + case ExtraQuantType::Q8_0_32: + layout.is_u4 = false; + layout.weights_per_block = 32; + break; + default: + // Unsupported requant type - fall through to normal extraction + layout.is_requant = false; + layout.requant_type = std::nullopt; + break; + } + + if (layout.is_requant) { + // Calculate sizes for requantized format + layout.weights_size = layout.is_u4 ? 
(n_elements / 2) : n_elements; + int64_t n_blocks = n_elements / layout.weights_per_block; + layout.scales_size = n_blocks * sizeof(uint16_t); + layout.biases_size = n_blocks * sizeof(uint16_t); + + layout.weights_offset = 0; + layout.scales_offset = ((layout.weights_size + alignment - 1) / alignment) * alignment; + layout.biases_offset = + layout.scales_offset + ((layout.scales_size + alignment - 1) / alignment) * alignment; + layout.total_size = layout.biases_offset + layout.biases_size; + layout.total_size = std::max(layout.total_size, ggml_nbytes(tensor)); + return layout; + } + } + + // Normal extraction (no requant) - determine format based on tensor type + switch (tensor->type) { + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q4_K: + layout.is_u4 = true; + layout.weights_per_block = 32; + break; + case GGML_TYPE_Q8_0: + layout.is_u4 = false; + layout.weights_per_block = 32; + break; + case GGML_TYPE_Q6_K: + layout.is_u4 = false; + layout.weights_per_block = 16; + break; + case GGML_TYPE_Q5_K: + layout.is_u4 = false; + layout.weights_per_block = 32; + break; + default: + // Unsupported quantization type + return layout; + } + + // Calculate sizes + // Weights: U4 = n_elements/2 bytes, U8 = n_elements bytes + layout.weights_size = layout.is_u4 ? (n_elements / 2) : n_elements; + + // Scales and biases: F16 per block + int64_t n_blocks = n_elements / layout.weights_per_block; + layout.scales_size = n_blocks * sizeof(uint16_t); // F16 = 2 bytes + layout.biases_size = n_blocks * sizeof(uint16_t); // F16 = 2 bytes + + // Layout in buffer: [weights | scales | biases] with alignment + layout.weights_offset = 0; + layout.scales_offset = ((layout.weights_size + alignment - 1) / alignment) * alignment; + layout.biases_offset = layout.scales_offset + ((layout.scales_size + alignment - 1) / alignment) * alignment; + layout.total_size = layout.biases_offset + layout.biases_size; + + return layout; +} diff --git a/ggml/src/ggml-openvino/ggml-openvino-extra.h b/ggml/src/ggml-openvino/ggml-openvino-extra.h index 99db870412..7e0138388f 100644 --- a/ggml/src/ggml-openvino/ggml-openvino-extra.h +++ b/ggml/src/ggml-openvino/ggml-openvino-extra.h @@ -1,16 +1,20 @@ #pragma once +#include "ggml.h" +#include "openvino/runtime/core.hpp" + #include #include -#include #include #include +#include #include -#include "ggml.h" // ExtraQuantType enum - defines requantization target formats enum class ExtraQuantType { F16, Q4_0_C, Q8_1_C, Q4_0_128, Q8_0_C, Q8_0_32 }; +ov::Core & ov_singleton_core(); + // ===================================================== // Global Device Configuration (singleton) // ===================================================== @@ -21,56 +25,23 @@ struct ggml_openvino_device_config { bool is_npu = false; bool initialized = false; - void init() { - if (initialized) return; - const char* env = std::getenv("GGML_OPENVINO_DEVICE"); - if (env) { - device_name = env; - is_npu = (device_name == "NPU"); - } - initialized = true; - } + void init(); }; // Get the global device config singleton -inline ggml_openvino_device_config& ggml_openvino_get_device_config() { - static ggml_openvino_device_config config; - return config; -} +ggml_openvino_device_config & ggml_openvino_get_device_config(); // Initialize device config (call during backend init) -inline void ggml_openvino_init_device_config() { - ggml_openvino_get_device_config().init(); -} +void ggml_openvino_init_device_config(); // Get the device name -inline const std::string& ggml_openvino_get_device_name() { - return 
ggml_openvino_get_device_config().device_name; -} +const std::string & ggml_openvino_get_device_name(); // Check if running on NPU -inline bool ggml_openvino_is_npu() { - return ggml_openvino_get_device_config().is_npu; -} +bool ggml_openvino_is_npu(); // Get requantization type for a tensor type (returns nullopt if no requant needed) -inline std::optional ggml_openvino_get_requant_type(ggml_type type) { - if (!ggml_openvino_is_npu()) { - return std::nullopt; - } - // NPU requantization rules - switch (type) { - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_Q4_K: - return ExtraQuantType::Q4_0_128; - case GGML_TYPE_Q6_K: - case GGML_TYPE_Q5_K: - return ExtraQuantType::F16; - default: - return std::nullopt; - } -} +std::optional ggml_openvino_get_requant_type(ggml_type type); // ===================================================== // OpenVINO Tensor Extra Types @@ -140,108 +111,4 @@ struct ggml_openvino_extracted_layout { }; // Calculate the buffer layout for extracted quantized data -inline ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_tensor * tensor) { - ggml_openvino_extracted_layout layout = {}; - - if (!ggml_is_quantized(tensor->type)) { - return layout; - } - - // Only handle 2D weight tensors - if (tensor->ne[2] != 1 || tensor->ne[3] != 1) { - return layout; - } - - int64_t n_elements = ggml_nelements(tensor); - const size_t alignment = 64; // Good for SIMD - - // Check if requantization is needed (NPU-specific) - auto requant_type = ggml_openvino_get_requant_type(tensor->type); - if (requant_type.has_value()) { - layout.is_requant = true; - layout.requant_type = requant_type; - - // Special case: requant to F16 - just store F16 weights, no scales/biases - if (requant_type.value() == ExtraQuantType::F16) { - layout.weights_size = n_elements * sizeof(uint16_t); // F16 = 2 bytes - layout.total_size = layout.weights_size; - layout.weights_offset = 0; - // No scales/biases for F16 - return layout; - } - - // Requant to different quantized format (e.g., Q4_0_128) - switch (requant_type.value()) { - case ExtraQuantType::Q4_0_128: - layout.is_u4 = true; - layout.weights_per_block = 128; - break; - case ExtraQuantType::Q8_0_32: - layout.is_u4 = false; - layout.weights_per_block = 32; - break; - default: - // Unsupported requant type - fall through to normal extraction - layout.is_requant = false; - layout.requant_type = std::nullopt; - break; - } - - if (layout.is_requant) { - // Calculate sizes for requantized format - layout.weights_size = layout.is_u4 ? 
(n_elements / 2) : n_elements; - int64_t n_blocks = n_elements / layout.weights_per_block; - layout.scales_size = n_blocks * sizeof(uint16_t); - layout.biases_size = n_blocks * sizeof(uint16_t); - - layout.weights_offset = 0; - layout.scales_offset = ((layout.weights_size + alignment - 1) / alignment) * alignment; - layout.biases_offset = layout.scales_offset + ((layout.scales_size + alignment - 1) / alignment) * alignment; - layout.total_size = layout.biases_offset + layout.biases_size; - layout.total_size = std::max(layout.total_size, ggml_nbytes(tensor)); - return layout; - } - } - - // Normal extraction (no requant) - determine format based on tensor type - switch (tensor->type) { - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_Q4_K: - layout.is_u4 = true; - layout.weights_per_block = 32; - break; - case GGML_TYPE_Q8_0: - layout.is_u4 = false; - layout.weights_per_block = 32; - break; - case GGML_TYPE_Q6_K: - layout.is_u4 = false; - layout.weights_per_block = 16; - break; - case GGML_TYPE_Q5_K: - layout.is_u4 = false; - layout.weights_per_block = 32; - break; - default: - // Unsupported quantization type - return layout; - } - - // Calculate sizes - // Weights: U4 = n_elements/2 bytes, U8 = n_elements bytes - layout.weights_size = layout.is_u4 ? (n_elements / 2) : n_elements; - - // Scales and biases: F16 per block - int64_t n_blocks = n_elements / layout.weights_per_block; - layout.scales_size = n_blocks * sizeof(uint16_t); // F16 = 2 bytes - layout.biases_size = n_blocks * sizeof(uint16_t); // F16 = 2 bytes - - // Layout in buffer: [weights | scales | biases] with alignment - layout.weights_offset = 0; - layout.scales_offset = ((layout.weights_size + alignment - 1) / alignment) * alignment; - layout.biases_offset = layout.scales_offset + ((layout.scales_size + alignment - 1) / alignment) * alignment; - layout.total_size = layout.biases_offset + layout.biases_size; - - return layout; -} +ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_tensor * tensor); diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp index 747d1b8a30..e20ae71e40 100644 --- a/ggml/src/ggml-openvino/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino.cpp @@ -12,7 +12,11 @@ #include #include #include +#include #include +#include +#include +#include #include #include #include @@ -48,7 +52,8 @@ struct ggml_backend_openvino_buffer_context { // For non-weight buffers (KV cache, compute), we still use contiguous allocation void * data; size_t size; - bool is_weight_buffer; // Set when buffer usage is set to WEIGHTS + + std::shared_ptr ov_tensor; // Track all extras for cleanup std::vector tensor_extras; @@ -57,18 +62,42 @@ struct ggml_backend_openvino_buffer_context { device(device), name(std::string(GGML_OPENVINO_NAME) + std::to_string(device)), data(nullptr), - size(size), - is_weight_buffer(false) { - // Allocate aligned contiguous memory - if (size > 0) { + size(size) { + if (size == 0) { + return; + } + + const auto & device_name = ggml_openvino_get_device_name(); + auto & core = ov_singleton_core(); + + if (device_name == "CPU") { #ifdef _WIN32 - data = _aligned_malloc(size, GGML_OPENVINO_BUFFER_ALIGNMENT); + data = _aligned_malloc(size, GGML_OPENVINO_BUFFER_ALIGNMENT); #else data = aligned_alloc(GGML_OPENVINO_BUFFER_ALIGNMENT, size); #endif + ov_tensor = std::make_shared(ov::element::u8, ov::Shape{size}, data); + }
else if (device_name == "GPU") { + auto gpu_context = core.get_default_context("GPU").as(); + auto usm_tensor = gpu_context.create_usm_host_tensor(ov::element::u8, ov::Shape{size}); + data = usm_tensor.get(); + ov_tensor = std::make_shared(std::move(usm_tensor)); + } else { + auto npu_context = core.get_default_context("NPU").as(); + auto l0_tensor = npu_context.create_l0_host_tensor(ov::element::u8, ov::Shape{size}); + data = l0_tensor.get(); + ov_tensor = std::make_shared(std::move(l0_tensor)); + } + + if (data == nullptr) { + GGML_LOG_ERROR("%s: failed to allocate %zu bytes\n", __func__, size); + return; + } + + if (reinterpret_cast(data) % GGML_OPENVINO_BUFFER_ALIGNMENT != 0) { + GGML_LOG_ERROR("%s: %s buffer is not aligned to %d bytes\n", __func__, device_name.c_str(), + GGML_OPENVINO_BUFFER_ALIGNMENT); + GGML_ABORT("fatal error"); } } @@ -78,15 +107,12 @@ struct ggml_backend_openvino_buffer_context { delete extra; } tensor_extras.clear(); - - // Free contiguous memory - if (data != nullptr) { + if (data && ggml_openvino_get_device_name() == "CPU") { #ifdef _WIN32 _aligned_free(data); #else free(data); #endif - data = nullptr; } } }; @@ -156,57 +182,26 @@ static void ggml_backend_openvino_buffer_set_tensor(ggml_backend_buffer_t buffer } if (layout.total_size > 0) { + // Quantized weight tensor with extraction/requantization uint8_t * buf_base = (uint8_t *) tensor->data; - // 2D shape for weights [rows, cols] - ov::Shape weight_shape = {static_cast(tensor->ne[1]), static_cast(tensor->ne[0])}; - try { - std::shared_ptr constant; + std::shared_ptr constant = process_weight_tensor(tensor, data, buf_base); + constant->set_friendly_name(tensor->name); - if (layout.is_requant && layout.requant_type.has_value()) { - // Requantization path - if (layout.requant_type.value() == ExtraQuantType::F16) { - // Requant to F16: create F16 tensor with external memory, requantize fills it - ov::Tensor weights(ov::element::f16, weight_shape, buf_base); - ov::Tensor dummy_scales, dummy_biases; // Not used for F16 - // requantize_to_buffers fills weights and returns a Constant wrapping it - constant = requantize_to_buffers(tensor, data, ExtraQuantType::F16, 0, weights, dummy_scales, - dummy_biases); - - // Store in tensor->extra (use weight_extra since it's F16) - auto * extra = new ggml_openvino_weight_extra(constant); - ctx->tensor_extras.push_back(extra); - tensor->extra = extra; - - GGML_LOG_DEBUG("%s: requantized %s to F16\n", __func__, tensor->name); - } else { - // Requant to quantized format (Q4_0_128, Q8_0_32, etc.) - ov::element::Type weight_type = layout.is_u4 ? ov::element::u4 : ov::element::u8; - ov::Shape scale_shape = {static_cast(tensor->ne[1]), - static_cast(tensor->ne[0] / layout.weights_per_block)}; - - ov::Tensor weights(weight_type, weight_shape, buf_base + layout.weights_offset); - ov::Tensor scales(ov::element::f16, scale_shape, buf_base + layout.scales_offset); - ov::Tensor biases(ov::element::f16, scale_shape, buf_base + layout.biases_offset); - - constant = requantize_to_buffers(tensor, data, layout.requant_type.value(), - layout.weights_per_block, weights, scales, biases); - - // Store in tensor->extra - auto * extra = new ggml_openvino_quantized_weight_extra(std::move(weights), std::move(scales), - std::move(biases), constant); - ctx->tensor_extras.push_back(extra); - tensor->extra = extra; - - GGML_LOG_DEBUG("%s: requantized %s to %s (u%d, block_size=%ld)\n", __func__, tensor->name, - layout.requant_type.value() == ExtraQuantType::Q4_0_128 ? 
"Q4_0_128" : "Q8_0_32", - layout.is_u4 ? 4 : 8, layout.weights_per_block); - } + // Store in tensor->extra + if (layout.is_requant && layout.requant_type.has_value() && + layout.requant_type.value() == ExtraQuantType::F16) { + // F16 requant case - use weight_extra + auto * extra = new ggml_openvino_weight_extra(constant); + ctx->tensor_extras.push_back(extra); + tensor->extra = extra; + GGML_LOG_DEBUG("%s: requantized %s to F16\n", __func__, tensor->name); } else { - // Normal extraction path (no requant) + // Quantized case - use quantized_weight_extra + // Create tensors with external memory (already filled by process_weight_tensor) ov::element::Type weight_type = layout.is_u4 ? ov::element::u4 : ov::element::u8; - int64_t n_blocks = ggml_nelements(tensor) / layout.weights_per_block; + ov::Shape weight_shape = {static_cast(tensor->ne[1]), static_cast(tensor->ne[0])}; ov::Shape scale_shape = {static_cast(tensor->ne[1]), static_cast(tensor->ne[0] / layout.weights_per_block)}; @@ -214,16 +209,20 @@ static void ggml_backend_openvino_buffer_set_tensor(ggml_backend_buffer_t buffer ov::Tensor scales(ov::element::f16, scale_shape, buf_base + layout.scales_offset); ov::Tensor biases(ov::element::f16, scale_shape, buf_base + layout.biases_offset); - constant = extract_quantized_weights(tensor, data, weights, scales, biases); - - // Store in tensor->extra auto * extra = new ggml_openvino_quantized_weight_extra(std::move(weights), std::move(scales), std::move(biases), constant); ctx->tensor_extras.push_back(extra); tensor->extra = extra; - GGML_LOG_DEBUG("%s: extracted quantized constant for %s (u%d, %zu weights, %ld blocks)\n", __func__, - tensor->name, layout.is_u4 ? 4 : 8, layout.weights_size, n_blocks); + if (layout.is_requant) { + GGML_LOG_DEBUG("%s: requantized %s to %s (u%d, block_size=%ld)\n", __func__, tensor->name, + layout.requant_type.value() == ExtraQuantType::Q4_0_128 ? "Q4_0_128" : "Q8_0_32", + layout.is_u4 ? 4 : 8, layout.weights_per_block); + } else { + int64_t n_blocks = ggml_nelements(tensor) / layout.weights_per_block; + GGML_LOG_DEBUG("%s: extracted quantized constant for %s (u%d, %zu weights, %ld blocks)\n", __func__, + tensor->name, layout.is_u4 ? 
4 : 8, layout.weights_size, n_blocks); + } } } catch (const std::exception & e) { @@ -233,32 +232,9 @@ static void ggml_backend_openvino_buffer_set_tensor(ggml_backend_buffer_t buffer } } else if (is_weight_buffer && is_full_tensor_set && is_2d && (tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_F16 || tensor->type == GGML_TYPE_BF16)) { - // F16/F32/BF16 weight tensor - copy data and create shared-memory constant - memcpy((char *) tensor->data + offset, data, size); - + // F16/F32/BF16 weight tensor try { - // Get OpenVINO element type - ov::element::Type element_type; - switch (tensor->type) { - case GGML_TYPE_F32: - element_type = ov::element::f32; - break; - case GGML_TYPE_F16: - element_type = ov::element::f16; - break; - case GGML_TYPE_BF16: - element_type = ov::element::bf16; - break; - default: - return; // Should not happen - } - - // Create 2D shape (OpenVINO expects [rows, cols]) - ov::Shape shape = {static_cast(tensor->ne[1]), static_cast(tensor->ne[0])}; - - // Create ov::Tensor with external memory, then wrap with Constant - ov::Tensor ov_tensor(element_type, shape, tensor->data); - auto constant = std::make_shared(ov_tensor); + std::shared_ptr constant = process_weight_tensor(tensor, data, tensor->data); constant->set_friendly_name(tensor->name); // Store in tensor->extra @@ -418,7 +394,7 @@ GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_openvino_buffer_type(in } // Check if a buffer is an OpenVINO buffer -static bool ggml_backend_buffer_is_openvino(ggml_backend_buffer_t buffer) { +bool ggml_backend_buffer_is_openvino(ggml_backend_buffer_t buffer) { return buffer->iface.free_buffer == ggml_backend_openvino_buffer_free_buffer; } diff --git a/ggml/src/ggml-openvino/ggml-quants.cpp b/ggml/src/ggml-openvino/ggml-quants.cpp index 662f27be7a..6cacc7b034 100644 --- a/ggml/src/ggml-openvino/ggml-quants.cpp +++ b/ggml/src/ggml-openvino/ggml-quants.cpp @@ -569,6 +569,112 @@ std::shared_ptr requantize(const ggml_tensor * tensor, ExtraQuantType return requantize_to_buffers(tensor, tensor->data, requant_type, block_size, weights, scales, biases); } +std::shared_ptr process_weight_tensor(const ggml_tensor * tensor, const void * data, void * output_base_ptr) { + GGML_ASSERT(tensor != nullptr); + GGML_ASSERT(data != nullptr); + + // Get 2D shape for weights [rows, cols] + ov::Shape node_shape = {static_cast(tensor->ne[1]), static_cast(tensor->ne[0])}; + + // Handle F16/F32/BF16 weights + if (tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_F16 || tensor->type == GGML_TYPE_BF16) { + ov::element::Type element_type; + switch (tensor->type) { + case GGML_TYPE_F32: + element_type = ov::element::f32; + break; + case GGML_TYPE_F16: + element_type = ov::element::f16; + break; + case GGML_TYPE_BF16: + element_type = ov::element::bf16; + break; + default: + OPENVINO_THROW("Unexpected tensor type in F16/F32/BF16 path"); + } + + if (output_base_ptr) { + // Using external buffer - copy data and create shared-memory constant + size_t tensor_bytes = ggml_nbytes(tensor); + memcpy(output_base_ptr, data, tensor_bytes); + ov::Tensor ov_tensor(element_type, node_shape, output_base_ptr); + return std::make_shared(ov_tensor); + } else { + // Allocate internal buffer + ov::Tensor weights(element_type, node_shape); + memcpy(weights.data(), data, ggml_nelements(tensor) * element_type.size()); + return std::make_shared(weights); + } + } + + // Handle quantized weights + if (!ggml_is_quantized(tensor->type)) { + OPENVINO_THROW("Unsupported weight tensor type: ", 
ggml_type_name(tensor->type)); + } + + auto layout = ggml_openvino_get_extracted_layout(tensor); + if (layout.total_size == 0) { + OPENVINO_THROW("Unsupported quantized type: ", ggml_type_name(tensor->type)); + } + + std::shared_ptr result; + + if (layout.is_requant && layout.requant_type.has_value()) { + // Requantization path + if (layout.requant_type.value() == ExtraQuantType::F16) { + // Requant to F16 + ov::Tensor weights; + if (output_base_ptr) { + weights = ov::Tensor(ov::element::f16, node_shape, + static_cast(output_base_ptr) + layout.weights_offset); + } else { + weights = ov::Tensor(ov::element::f16, node_shape); + } + ov::Tensor dummy_scales, dummy_biases; // Not used for F16 + result = requantize_to_buffers(tensor, data, ExtraQuantType::F16, 0, weights, dummy_scales, dummy_biases); + } else { + // Requant to quantized format (Q4_0_128, Q8_0_32, etc.) + ov::element::Type weight_type = layout.is_u4 ? ov::element::u4 : ov::element::u8; + ov::Shape scale_shape = {node_shape[0], node_shape[1] / layout.weights_per_block}; + + ov::Tensor weights, scales, biases; + if (output_base_ptr) { + uint8_t * buf_base = static_cast(output_base_ptr); + weights = ov::Tensor(weight_type, node_shape, buf_base + layout.weights_offset); + scales = ov::Tensor(ov::element::f16, scale_shape, buf_base + layout.scales_offset); + biases = ov::Tensor(ov::element::f16, scale_shape, buf_base + layout.biases_offset); + } else { + weights = ov::Tensor(weight_type, node_shape); + scales = ov::Tensor(ov::element::f16, scale_shape); + biases = ov::Tensor(ov::element::f16, scale_shape); + } + + result = requantize_to_buffers(tensor, data, layout.requant_type.value(), layout.weights_per_block, weights, + scales, biases); + } + } else { + // Normal extraction path (no requant) + ov::element::Type weight_type = layout.is_u4 ? 
ov::element::u4 : ov::element::u8; + ov::Shape scale_shape = {node_shape[0], node_shape[1] / layout.weights_per_block}; + + ov::Tensor weights, scales, biases; + if (output_base_ptr) { + uint8_t * buf_base = static_cast(output_base_ptr); + weights = ov::Tensor(weight_type, node_shape, buf_base + layout.weights_offset); + scales = ov::Tensor(ov::element::f16, scale_shape, buf_base + layout.scales_offset); + biases = ov::Tensor(ov::element::f16, scale_shape, buf_base + layout.biases_offset); + } else { + weights = ov::Tensor(weight_type, node_shape); + scales = ov::Tensor(ov::element::f16, scale_shape); + biases = ov::Tensor(ov::element::f16, scale_shape); + } + + result = extract_quantized_weights(tensor, data, weights, scales, biases); + } + + return result; +} + void quantize_q4_0(const float * x, ov::Tensor & weights_arr, ov::Tensor & scales_arr, diff --git a/ggml/src/ggml-openvino/ggml-quants.hpp b/ggml/src/ggml-openvino/ggml-quants.hpp index 0f14a6ed2d..b1d286f1b8 100644 --- a/ggml/src/ggml-openvino/ggml-quants.hpp +++ b/ggml/src/ggml-openvino/ggml-quants.hpp @@ -78,6 +78,16 @@ std::shared_ptr requantize_to_buffers(const ggml_tensor * tensor, ov::Tensor & scales, ov::Tensor & biases); +// Process weight tensor and create an OpenVINO constant node +// Handles F16/F32/BF16 and quantized weights, with optional requantization +// If output_base_ptr is nullptr, allocates internal buffers (for decoder use) +// If output_base_ptr is provided, uses pre-allocated buffers at specified offsets (for backend buffer use) +// Returns the weight constant node +std::shared_ptr process_weight_tensor( + const ggml_tensor * tensor, + const void * data, // Source data pointer (may differ from tensor->data) + void * output_base_ptr = nullptr); // Base pointer for output buffers (or nullptr for internal allocation) + void quantize_q4_0(const float* x, ov::Tensor& weights_arr, ov::Tensor& scales_arr, ov::Tensor& biases_arr, int64_t k, int64_t qk); void quantize_q8_1(const float* x, ov::Tensor& weights_arr, ov::Tensor& scales_arr, ov::Tensor& biases_arr, int64_t k, diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 251fb82361..6d56af9318 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -107,7 +107,7 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, const std::strin infer_request_cache.erase(key); std::shared_ptr model; - auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph, get_types_to_requant(device)); + auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph); ggml_decoder = std::make_shared(cgraph, m_params, c_params, model_weights, is_static); decoder_end_time = ggml_time_us(); @@ -255,7 +255,7 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph) { infer_request_cache_prefill.erase(key); std::shared_ptr model; - auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph, get_types_to_requant(device)); + auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph); auto ggml_decoder_prefill = std::make_shared(cgraph, m_params, c_params, model_weights, is_static, true, prefill_chunk_size); @@ -404,21 +404,6 @@ ov::AnyMap get_ov_compile_config(const std::string & device) { return config; } -std::map get_types_to_requant(const std::string & device) { - // Use singleton to check if NPU (device param kept for API compatibility) - if (ggml_openvino_is_npu()) { - return { - {GGML_TYPE_Q4_0, ExtraQuantType::Q4_0_128}, - {GGML_TYPE_Q4_1, ExtraQuantType::Q4_0_128}, - {GGML_TYPE_Q4_K, 
ExtraQuantType::Q4_0_128}, - {GGML_TYPE_Q6_K, ExtraQuantType::F16 }, - {GGML_TYPE_Q5_K, ExtraQuantType::F16 }, - }; - } - return {}; - GGML_UNUSED(device); -} - bool is_naive(ggml_cgraph * cgraph) { constexpr int naive_graph_size_threshold = 20; return cgraph->n_nodes < naive_graph_size_threshold; diff --git a/ggml/src/ggml-openvino/utils.h b/ggml/src/ggml-openvino/utils.h index 85bb3a2f88..81fb2c2035 100644 --- a/ggml/src/ggml-openvino/utils.h +++ b/ggml/src/ggml-openvino/utils.h @@ -73,8 +73,6 @@ graph_key compute_graph_key(struct ggml_cgraph * cgraph); ov::AnyMap get_ov_compile_config(const std::string & device); -std::map get_types_to_requant(const std::string & device); - ov::Tensor get_ov_input_tensor(std::shared_ptr ggml_decoder, const std::string & param_name); ov::Tensor get_ov_input_tensor_static_decode(std::shared_ptr ggml_decoder, const std::string & param_name); From 3fdcb6ab727dadf96806aed7282460d8335861dd Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Fri, 19 Dec 2025 16:58:07 +0800 Subject: [PATCH 207/254] Add ov_backend_host_buffer; Use cached remote context --- ggml/include/ggml-openvino.h | 8 ++ ggml/src/ggml-openvino/ggml-decoder.cpp | 29 ++++- .../src/ggml-openvino/ggml-openvino-extra.cpp | 85 ++++++++++++++ ggml/src/ggml-openvino/ggml-openvino-extra.h | 11 ++ ggml/src/ggml-openvino/ggml-openvino.cpp | 111 ++++++++++++++++-- ggml/src/ggml-openvino/utils.cpp | 92 ++++++++------- ggml/src/ggml-openvino/utils.h | 2 - 7 files changed, 281 insertions(+), 57 deletions(-) diff --git a/ggml/include/ggml-openvino.h b/ggml/include/ggml-openvino.h index b690a16378..392e26c48e 100644 --- a/ggml/include/ggml-openvino.h +++ b/ggml/include/ggml-openvino.h @@ -18,9 +18,17 @@ GGML_BACKEND_API ggml_backend_t ggml_backend_openvino_init(int device); GGML_BACKEND_API bool ggml_backend_is_openvino(ggml_backend_t backend); +GGML_BACKEND_API bool ggml_backend_buffer_is_openvino(ggml_backend_buffer_t buffer); + +GGML_BACKEND_API bool ggml_backend_buft_is_openvino(ggml_backend_buffer_type_t buft); + +GGML_BACKEND_API bool ggml_backend_buft_is_openvino_host(ggml_backend_buffer_type_t buft); + // device buffer GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_openvino_buffer_type(int device); +GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_openvino_host_buffer_type(int device); + GGML_BACKEND_API int ggml_backend_openvino_get_device_count(void); GGML_BACKEND_API ggml_backend_reg_t ggml_backend_openvino_reg(void); diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 2d6437f069..13ef00dcb6 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -593,11 +593,19 @@ void GgmlOvDecoder::dump_cgraph(const ggml_cgraph * cgraph, std::string & filena << std::setw(20) << "op" << std::setw(20) << "name" << std::setw(3) << " " - << std::setw(50) << "stride" + << std::setw(62) << "stride" + << std::setw(20) << "buffer_type" << "\n"; for (int i = 0; i < cgraph->n_nodes; i++) { ggml_tensor * node = cgraph->nodes[i]; + // Get buffer type name + const char * buf_name = "none"; + ggml_backend_buffer_t buf = node->view_src ? 
node->view_src->buffer : node->buffer; + if (buf) { + buf_name = ggml_backend_buffer_name(buf); + } + file << " - " << std::setw(3) << i << ": [ " << std::setw(5) << node->ne[0] << ", " << std::setw(5) << node->ne[1] << ", " @@ -610,10 +618,18 @@ void GgmlOvDecoder::dump_cgraph(const ggml_cgraph * cgraph, std::string & filena << std::setw(5) << node->nb[1] << ", " << std::setw(5) << node->nb[2] << ", " << std::setw(5) << node->nb[3] << "] " + << std::right << std::setw(15) << buf_name << std::right << "\n"; for (int i = 0; i < GGML_MAX_SRC; i++) { if (auto* src = node->src[i]) { + // Get buffer type name for source + const char * src_buf_name = "none"; + ggml_backend_buffer_t src_buf = src->view_src ? src->view_src->buffer : src->buffer; + if (src_buf) { + src_buf_name = ggml_backend_buffer_name(src_buf); + } + file << std::setw(10) << " [ " << std::setw(5) << src->ne[0] << ", " << std::setw(5) << src->ne[1] << ", " @@ -627,6 +643,7 @@ void GgmlOvDecoder::dump_cgraph(const ggml_cgraph * cgraph, std::string & filena << std::setw(5) << src->nb[1] << ", " << std::setw(5) << src->nb[2] << ", " << std::setw(5) << src->nb[3] << "] " + << std::right << std::setw(15) << src_buf_name << std::right << "\n"; } } @@ -636,11 +653,19 @@ void GgmlOvDecoder::dump_cgraph(const ggml_cgraph * cgraph, std::string & filena for (int i = 0; i < cgraph->n_leafs; i++) { ggml_tensor * node = cgraph->leafs[i]; + // Get buffer type name for leaf + const char * leaf_buf_name = "none"; + ggml_backend_buffer_t leaf_buf = node->view_src ? node->view_src->buffer : node->buffer; + if (leaf_buf) { + leaf_buf_name = ggml_backend_buffer_name(leaf_buf); + } + file << " - " << std::setw(3) << i << ": [ " << std::setw(5) << node->ne[0] << ", " << std::setw(5) << node->ne[1] << "] " << std::setw(8) << ggml_op_name(node->op) << " " - << std::setw(16) << ggml_get_name(node) << "\n"; + << std::setw(16) << ggml_get_name(node) + << std::setw(20) << leaf_buf_name << "\n"; } // clang-format on file << "========================================\n"; diff --git a/ggml/src/ggml-openvino/ggml-openvino-extra.cpp b/ggml/src/ggml-openvino/ggml-openvino-extra.cpp index 75b27c8fa8..085ae1ece4 100644 --- a/ggml/src/ggml-openvino/ggml-openvino-extra.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino-extra.cpp @@ -2,6 +2,9 @@ #include "ggml-impl.h" +#include +#include + ov::Core & ov_singleton_core() { static ov::Core core; return core; @@ -22,6 +25,31 @@ void ggml_openvino_device_config::init() { device_name = "CPU"; } is_npu = (device_name == "NPU"); + + auto * cache_dir = getenv("GGML_OPENVINO_CACHE_DIR"); + if (device_name == "NPU") { + compile_config = { + {"NPU_COMPILER_DYNAMIC_QUANTIZATION", "YES" }, + {"NPU_USE_NPUW", "YES" }, + {"NPUW_DEVICES", "NPU" }, + {"NPUW_FOLD", "YES" }, + {"NPUW_WEIGHTS_BANK", "shared"}, + {"NPUW_FUNCALL_FOR_ALL", "YES" }, + {"NPUW_FUNCALL_ASYNC", "YES" }, + {"NPUW_DQ", "YES" }, + {"NPUW_DQ_FULL", "NO" }, + }; + if (cache_dir) { + compile_config["NPUW_CACHE_DIR"] = cache_dir; + } + } else if (cache_dir) { + ov_singleton_core().set_property(ov::cache_dir(cache_dir)); + } + + if (device_name != "CPU") { + remote_context = ov_singleton_core().get_default_context(device_name); + } + initialized = true; } @@ -46,6 +74,16 @@ bool ggml_openvino_is_npu() { return ggml_openvino_get_device_config().is_npu; } +// Get the remote context for the current device (returns empty optional for CPU) +std::optional ggml_openvino_get_remote_context() { + return ggml_openvino_get_device_config().remote_context; +} + +// Get the compile config 
for the current device +const ov::AnyMap & ggml_openvino_get_compile_config() { + return ggml_openvino_get_device_config().compile_config; +} + // Get requantization type for a tensor type (returns nullopt if no requant needed) std::optional ggml_openvino_get_requant_type(ggml_type type) { if (!ggml_openvino_is_npu()) { @@ -175,3 +213,50 @@ ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_ten return layout; } + +ggml_openvino_tensor_extra * ggml_openvino_create_tensor_extra(const ggml_tensor * tensor) { + ov::Shape shape; + for (int i = GGML_MAX_DIMS - 1; i >= 0; --i) { + shape.push_back(static_cast(tensor->ne[i])); + } + + ov::element::Type element_type; + switch (tensor->type) { + case GGML_TYPE_F32: + element_type = ov::element::f32; + break; + case GGML_TYPE_F16: + element_type = ov::element::f16; + break; + case GGML_TYPE_BF16: + element_type = ov::element::bf16; + break; + case GGML_TYPE_I32: + element_type = ov::element::i32; + break; + case GGML_TYPE_I64: + element_type = ov::element::i64; + break; + default: + GGML_LOG_ERROR("%s: unsupported tensor type for ov::Tensor: %s\n", __func__, ggml_type_name(tensor->type)); + return nullptr; + } + + const auto & device_name = ggml_openvino_get_device_name(); + auto remote_context = ggml_openvino_get_remote_context(); + + std::shared_ptr ov_tensor; + if (device_name == "CPU") { + ov_tensor = std::make_shared(element_type, shape, tensor->data); + } else if (device_name == "GPU") { + auto gpu_context = remote_context->as(); + auto usm_tensor = gpu_context.create_tensor(element_type, shape, tensor->data); + ov_tensor = std::make_shared(std::move(usm_tensor)); + } else { + auto npu_context = remote_context->as(); + auto l0_tensor = npu_context.create_tensor(element_type, shape, tensor->data); + ov_tensor = std::make_shared(std::move(l0_tensor)); + } + + return new ggml_openvino_tensor_extra(ov_tensor); +} diff --git a/ggml/src/ggml-openvino/ggml-openvino-extra.h b/ggml/src/ggml-openvino/ggml-openvino-extra.h index 7e0138388f..fdd8312dff 100644 --- a/ggml/src/ggml-openvino/ggml-openvino-extra.h +++ b/ggml/src/ggml-openvino/ggml-openvino-extra.h @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -15,6 +16,12 @@ enum class ExtraQuantType { F16, Q4_0_C, Q8_1_C, Q4_0_128, Q8_0_C, Q8_0_32 }; ov::Core & ov_singleton_core(); +// Get the remote context for the current device (returns empty optional for CPU) +std::optional ggml_openvino_get_remote_context(); + +// Get the compile config for the current device +const ov::AnyMap & ggml_openvino_get_compile_config(); + // ===================================================== // Global Device Configuration (singleton) // ===================================================== @@ -24,6 +31,8 @@ struct ggml_openvino_device_config { std::string device_name = "CPU"; bool is_npu = false; bool initialized = false; + std::optional remote_context; + ov::AnyMap compile_config; void init(); }; @@ -112,3 +121,5 @@ struct ggml_openvino_extracted_layout { // Calculate the buffer layout for extracted quantized data ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_tensor * tensor); + +ggml_openvino_tensor_extra * ggml_openvino_create_tensor_extra(const ggml_tensor * tensor); diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp index e20ae71e40..c5c25fb6c1 100644 --- a/ggml/src/ggml-openvino/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino.cpp @@ -56,7 +56,7 @@ struct 
ggml_backend_openvino_buffer_context { std::shared_ptr ov_tensor; // Track all extras for cleanup - std::vector tensor_extras; + std::map tensor_extras; ggml_backend_openvino_buffer_context(int device, size_t size) : device(device), @@ -103,8 +103,8 @@ struct ggml_backend_openvino_buffer_context { ~ggml_backend_openvino_buffer_context() { // Clean up all tensor extras - for (auto * extra : tensor_extras) { - delete extra; + for (auto & pair : tensor_extras) { + delete pair.second; } tensor_extras.clear(); if (data && ggml_openvino_get_device_name() == "CPU") { @@ -144,9 +144,20 @@ static enum ggml_status ggml_backend_openvino_buffer_init_tensor(ggml_backend_bu return GGML_STATUS_SUCCESS; } - // For non-view tensors, tensor->extra will be set in set_tensor - // when the actual weight data is loaded - GGML_UNUSED(buffer); + ggml_backend_openvino_buffer_context * ctx = (ggml_backend_openvino_buffer_context *) buffer->context; + + if (tensor->data != nullptr) { + ggml_openvino_tensor_extra * extra = ggml_openvino_create_tensor_extra(tensor); + if (extra != nullptr) { + auto it = ctx->tensor_extras.find(tensor); + if (it != ctx->tensor_extras.end()) { + delete it->second; + } + ctx->tensor_extras[tensor] = extra; + tensor->extra = extra; + } + } + return GGML_STATUS_SUCCESS; } @@ -194,7 +205,7 @@ static void ggml_backend_openvino_buffer_set_tensor(ggml_backend_buffer_t buffer layout.requant_type.value() == ExtraQuantType::F16) { // F16 requant case - use weight_extra auto * extra = new ggml_openvino_weight_extra(constant); - ctx->tensor_extras.push_back(extra); + ctx->tensor_extras[tensor] = extra; tensor->extra = extra; GGML_LOG_DEBUG("%s: requantized %s to F16\n", __func__, tensor->name); } else { @@ -211,7 +222,7 @@ static void ggml_backend_openvino_buffer_set_tensor(ggml_backend_buffer_t buffer auto * extra = new ggml_openvino_quantized_weight_extra(std::move(weights), std::move(scales), std::move(biases), constant); - ctx->tensor_extras.push_back(extra); + ctx->tensor_extras[tensor] = extra; tensor->extra = extra; if (layout.is_requant) { @@ -239,7 +250,7 @@ static void ggml_backend_openvino_buffer_set_tensor(ggml_backend_buffer_t buffer // Store in tensor->extra ggml_openvino_weight_extra * extra = new ggml_openvino_weight_extra(constant); - ctx->tensor_extras.push_back(extra); + ctx->tensor_extras[tensor] = extra; tensor->extra = extra; GGML_LOG_DEBUG("%s: created shared-memory constant for %s\n", __func__, tensor->name); @@ -251,6 +262,19 @@ static void ggml_backend_openvino_buffer_set_tensor(ggml_backend_buffer_t buffer } else { // Non-weight tensor (KV cache, activations, etc.) 
- just copy data memcpy((char *) tensor->data + offset, data, size); + + ggml_openvino_tensor_extra * extra = ggml_openvino_create_tensor_extra(tensor); + if (extra == nullptr) { + GGML_LOG_ERROR("%s: failed to create tensor extra for %s\n", __func__, tensor->name); + return; + } + + auto it = ctx->tensor_extras.find(tensor); + if (it != ctx->tensor_extras.end()) { + delete it->second; + } + ctx->tensor_extras[tensor] = extra; + tensor->extra = extra; } } @@ -393,11 +417,67 @@ GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_openvino_buffer_type(in return &buffer_types[device]; } -// Check if a buffer is an OpenVINO buffer +// ===================================================== +// OpenVINO Host Buffer Implementation +// ===================================================== + +static const char * ggml_backend_openvino_host_buffer_type_get_name(ggml_backend_buffer_type_t buft) { + ggml_backend_openvino_buffer_type_context * ctx = (ggml_backend_openvino_buffer_type_context *) buft->context; + static std::string name; + name = ctx->name + "_HOST"; + return name.c_str(); +} + +static const ggml_backend_buffer_type_i ggml_backend_openvino_host_buffer_type_interface = { + /* .get_name = */ ggml_backend_openvino_host_buffer_type_get_name, + /* .alloc_buffer = */ ggml_backend_openvino_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_openvino_buffer_type_get_alignment, + /* .get_max_size = */ ggml_backend_openvino_buffer_type_get_max_size, + /* .get_alloc_size = */ ggml_backend_openvino_buffer_type_get_alloc_size, + /* .is_host = */ ggml_backend_openvino_buffer_type_is_host, +}; + +GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_openvino_host_buffer_type(int device) { + GGML_ASSERT(device >= 0 && device < ggml_backend_openvino_get_device_count()); + + static std::mutex mutex; + std::lock_guard lock(mutex); + + static std::vector buffer_types; + static std::vector buffer_type_contexts; + + if (buffer_types.empty()) { + int device_count = ggml_backend_openvino_get_device_count(); + buffer_types.resize(device_count); + buffer_type_contexts.resize(device_count); + + for (int i = 0; i < device_count; i++) { + buffer_type_contexts[i].device = i; + buffer_type_contexts[i].name = std::string(GGML_OPENVINO_NAME) + std::to_string(i); + + buffer_types[i] = ggml_backend_buffer_type{ + /* .iface = */ ggml_backend_openvino_host_buffer_type_interface, + /* .device = */ ggml_backend_reg_dev_get(ggml_backend_openvino_reg(), i), + /* .context = */ &buffer_type_contexts[i], + }; + } + } + + return &buffer_types[device]; +} + bool ggml_backend_buffer_is_openvino(ggml_backend_buffer_t buffer) { return buffer->iface.free_buffer == ggml_backend_openvino_buffer_free_buffer; } +bool ggml_backend_buft_is_openvino(ggml_backend_buffer_type_t buft) { + return buft->iface.get_name == ggml_backend_openvino_buffer_type_get_name; +} + +bool ggml_backend_buft_is_openvino_host(ggml_backend_buffer_type_t buft) { + return buft->iface.get_name == ggml_backend_openvino_host_buffer_type_get_name; +} + // ===================================================== // OpenVINO Backend Context and Interface // ===================================================== @@ -552,6 +632,11 @@ static ggml_backend_buffer_type_t ggml_backend_openvino_device_get_buffer_type(g return ggml_backend_openvino_buffer_type(ctx->device); } +static ggml_backend_buffer_type_t ggml_backend_openvino_device_get_host_buffer_type(ggml_backend_dev_t dev) { + ggml_backend_openvino_device_context * ctx = (ggml_backend_openvino_device_context 
*) dev->context; + return ggml_backend_openvino_host_buffer_type(ctx->device); +} + static bool is_op_unsupported_case(const ggml_tensor * op) { switch (op->op) { case GGML_OP_SOFT_MAX: { @@ -731,7 +816,8 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con static bool ggml_backend_openvino_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { // Support our own buffer type and any host buffer (for mmap'd files, etc.) - return buft->iface.get_name == ggml_backend_openvino_buffer_type_get_name || ggml_backend_buft_is_host(buft); + return ggml_backend_buft_is_openvino(buft) || ggml_backend_buft_is_host(buft); + // return ggml_backend_buft_is_openvino(buft) || ggml_backend_buft_is_openvino_host(buft); GGML_UNUSED(dev); } @@ -743,7 +829,8 @@ static const struct ggml_backend_device_i ggml_backend_openvino_device_interface /* .get_props = */ ggml_backend_openvino_device_get_props, /* .init_backend = */ ggml_backend_openvino_device_init, /* .get_buffer_type = */ ggml_backend_openvino_device_get_buffer_type, - /* .get_host_buffer_type = */ NULL, + // /* .get_host_buffer_type = */ NULL, + /* .get_host_buffer_type = */ ggml_backend_openvino_device_get_host_buffer_type, /* .buffer_from_host_ptr = */ NULL, /* .supports_op = */ ggml_backend_openvino_device_supports_op, /* .supports_buft = */ ggml_backend_openvino_device_supports_buft, diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 6d56af9318..89cf51f880 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -37,11 +37,9 @@ #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wdeprecated-declarations" -static ov::Core core; - enum ggml_status ov_graph_compute(ggml_cgraph * cgraph) { if (getenv("GGML_OPENVINO_DUMP_CGRAPH")) { - std::string filename = "cgraph.txt"; + std::string filename = "cgraph_ov.txt"; GgmlOvDecoder::dump_cgraph(cgraph, filename); } @@ -52,8 +50,9 @@ enum ggml_status ov_graph_compute(ggml_cgraph * cgraph) { } enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, const std::string & device) { + auto & core = ov_singleton_core(); + const auto & config = ggml_openvino_get_compile_config(); static auto is_static = false; - static auto config = get_ov_compile_config(device); // if (is_naive(cgraph)) { // return naive_compute(cgraph, core, device, config); @@ -124,7 +123,13 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, const std::strin ov::serialize(model, timestamped_filename); } - auto compiled_model = core.compile_model(model, device, config); + ov::CompiledModel compiled_model; + auto remote_context = ggml_openvino_get_remote_context(); + if (remote_context.has_value()) { + compiled_model = core.compile_model(model, remote_context.value(), config); + } else { + compiled_model = core.compile_model(model, device, config); + } compile_end_time = ggml_time_us(); infer_request = std::make_shared(compiled_model.create_infer_request()); infer_request_cache[key] = infer_request; @@ -173,18 +178,20 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, const std::strin if (getenv("GGML_OPENVINO_PROFILING")) { GGML_LOG_INFO("\nGGML OpenVINO Backend: \n"); - GGML_LOG_INFO(" - Graph decoder Time: %ld ms \n", (decoder_end_time - start_time) / 1000); + GGML_LOG_INFO(" - Graph decoder time: %ld ms \n", (decoder_end_time - start_time) / 1000); if (!cache_hit) { - GGML_LOG_INFO(" - Graph conversion Time: %ld ms \n", (conversion_end_time - decoder_end_time) / 1000); - 
GGML_LOG_INFO(" - Graph compile Time: %ld ms \n", (compile_end_time - conversion_end_time) / 1000); + GGML_LOG_INFO(" - Graph conversion time: %ld ms \n", (conversion_end_time - decoder_end_time) / 1000); + GGML_LOG_INFO(" - Graph compile time: %ld ms \n", (compile_end_time - conversion_end_time) / 1000); } - GGML_LOG_INFO(" - Graph Inference Time: %ld ms \n", (infer_end_time - compile_end_time) / 1000); + GGML_LOG_INFO(" - Graph inference time: %ld ms \n", (infer_end_time - compile_end_time) / 1000); } return GGML_STATUS_SUCCESS; } enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph) { + auto & core = ov_singleton_core(); + auto get_prefill_chunk_size = [] { const char * chunk_size_str = getenv("GGML_OPENVINO_PREFILL_CHUNK_SIZE"); if (chunk_size_str && atoi(chunk_size_str) > 0) { @@ -196,7 +203,7 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph) { static std::string device = "NPU"; static auto is_static = true; static auto prefill_chunk_size = get_prefill_chunk_size(); - static auto config = get_ov_compile_config(device); + const auto & config = ggml_openvino_get_compile_config(); if (is_naive(cgraph)) { return naive_compute(cgraph, core, device, config); @@ -281,8 +288,16 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph) { ov::serialize(model_decode, timestamped_filename); } - auto compiled_model_prefill = core.compile_model(model_prefill, device, get_ov_compile_config(device)); - auto compiled_model_decode = core.compile_model(model_decode, device, get_ov_compile_config(device)); + ov::CompiledModel compiled_model_prefill; + ov::CompiledModel compiled_model_decode; + auto remote_context = ggml_openvino_get_remote_context(); + if (remote_context.has_value()) { + compiled_model_prefill = core.compile_model(model_prefill, remote_context.value(), config); + compiled_model_decode = core.compile_model(model_decode, remote_context.value(), config); + } else { + compiled_model_prefill = core.compile_model(model_prefill, device, config); + compiled_model_decode = core.compile_model(model_decode, device, config); + } infer_request_cache_prefill[key] = std::make_shared(compiled_model_prefill.create_infer_request()); @@ -369,41 +384,17 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph) { if (getenv("GGML_OPENVINO_PROFILING")) { GGML_LOG_INFO("\nGGML OpenVINO Backend: \n"); - GGML_LOG_INFO(" - Graph decoder Time: %ld ms \n", (decoder_end_time - start_time) / 1000); + GGML_LOG_INFO(" - Graph decoder time: %ld ms \n", (decoder_end_time - start_time) / 1000); if (!cache_hit) { - GGML_LOG_INFO(" - Graph conversion Time: %ld ms \n", (conversion_end_time - decoder_end_time) / 1000); - GGML_LOG_INFO(" - Graph compile Time: %ld ms \n", (compile_end_time - conversion_end_time) / 1000); + GGML_LOG_INFO(" - Graph conversion time: %ld ms \n", (conversion_end_time - decoder_end_time) / 1000); + GGML_LOG_INFO(" - Graph compile time: %ld ms \n", (compile_end_time - conversion_end_time) / 1000); } - GGML_LOG_INFO(" - Graph Inference Time: %ld ms \n", (infer_end_time - compile_end_time) / 1000); + GGML_LOG_INFO(" - Graph inference time: %ld ms \n", (infer_end_time - compile_end_time) / 1000); } return GGML_STATUS_SUCCESS; } -ov::AnyMap get_ov_compile_config(const std::string & device) { - ov::AnyMap config; - auto * cache_dir = getenv("GGML_OPENVINO_CACHE_DIR"); - if (device == "NPU") { - config = { - {"NPU_COMPILER_DYNAMIC_QUANTIZATION", "YES" }, - {"NPU_USE_NPUW", "YES" }, - {"NPUW_DEVICES", "NPU" }, - {"NPUW_FOLD", "YES" }, - {"NPUW_WEIGHTS_BANK", 
"shared"}, - {"NPUW_FUNCALL_FOR_ALL", "YES" }, - {"NPUW_FUNCALL_ASYNC", "YES" }, - {"NPUW_DQ", "YES" }, - {"NPUW_DQ_FULL", "NO" }, - }; - if (cache_dir) { - config["NPUW_CACHE_DIR"] = cache_dir; - } - } else if (cache_dir) { - core.set_property(ov::cache_dir(cache_dir)); - } - return config; -} - bool is_naive(ggml_cgraph * cgraph) { constexpr int naive_graph_size_threshold = 20; return cgraph->n_nodes < naive_graph_size_threshold; @@ -428,7 +419,14 @@ enum ggml_status naive_compute(ggml_cgraph * cgraph, if (getenv("GGML_OPENVINO_DUMP_IR")) { ov::serialize(model, "IR_naive.xml"); } - auto infer_request = core.compile_model(model, device, config).create_infer_request(); + + ov::InferRequest infer_request; + auto remote_context = ggml_openvino_get_remote_context(); + if (remote_context.has_value()) { + infer_request = core.compile_model(model, remote_context.value(), config).create_infer_request(); + } else { + infer_request = core.compile_model(model, device, config).create_infer_request(); + } auto ov_params = model->get_parameters(); for (size_t i = 0; i < ov_params.size(); i++) { @@ -451,6 +449,18 @@ enum ggml_status naive_compute(ggml_cgraph * cgraph, namespace { ov::Tensor convert_ggml_input_to_ov(std::shared_ptr ggml_decoder, const std::string & name) { const auto * ggml_tensor = ggml_decoder->get_input_ggml_tensor(name); + + if (ggml_tensor->extra != nullptr) { + // GGML_LOG_DEBUG("Using ggml_tensor->extra as ov::Tensor for input: %s\n", name.c_str()); + auto * extra_base = static_cast(ggml_tensor->extra); + if (extra_base->type != ggml_openvino_extra_base::Type::TENSOR) { + throw std::runtime_error("ggml tensor extra is not of type TENSOR for input: " + name); + } + auto * tensor_extra = static_cast(extra_base); + return *tensor_extra->tensor; + } + + // GGML_LOG_DEBUG("Converting ggml tensor to ov::Tensor for input: %s\n", name.c_str()); auto * input_data = ggml_tensor->data; ov::Shape input_shape; if (ggml_tensor->op == GGML_OP_VIEW) { diff --git a/ggml/src/ggml-openvino/utils.h b/ggml/src/ggml-openvino/utils.h index 81fb2c2035..44ca2db00f 100644 --- a/ggml/src/ggml-openvino/utils.h +++ b/ggml/src/ggml-openvino/utils.h @@ -71,8 +71,6 @@ bool get_is_prefill(const ggml_tensor * inp_pos); graph_key compute_graph_key(struct ggml_cgraph * cgraph); -ov::AnyMap get_ov_compile_config(const std::string & device); - ov::Tensor get_ov_input_tensor(std::shared_ptr ggml_decoder, const std::string & param_name); ov::Tensor get_ov_input_tensor_static_decode(std::shared_ptr ggml_decoder, const std::string & param_name); From d7578497415cd660a0856f87823cfa76db5cdc64 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Mon, 22 Dec 2025 16:45:17 +0800 Subject: [PATCH 208/254] Put kvcache on GPU --- .../src/ggml-openvino/ggml-openvino-extra.cpp | 78 +++++++- ggml/src/ggml-openvino/ggml-openvino-extra.h | 33 ++++ ggml/src/ggml-openvino/ggml-openvino.cpp | 169 ++++++++++++++++-- 3 files changed, 262 insertions(+), 18 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-openvino-extra.cpp b/ggml/src/ggml-openvino/ggml-openvino-extra.cpp index 085ae1ece4..aa50d46c03 100644 --- a/ggml/src/ggml-openvino/ggml-openvino-extra.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino-extra.cpp @@ -46,13 +46,56 @@ void ggml_openvino_device_config::init() { ov_singleton_core().set_property(ov::cache_dir(cache_dir)); } - if (device_name != "CPU") { + // Initialize remote context with queue sharing for GPU + if (device_name == "GPU") { + // Create OpenCL context and queue + cl_int err; + cl_platform_id platform; + err = 
clGetPlatformIDs(1, &platform, nullptr); + if (err != CL_SUCCESS) { + GGML_LOG_ERROR("Failed to get OpenCL platform: %d\n", err); + return; + } + + cl_device_id cl_device; + err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &cl_device, nullptr); + if (err != CL_SUCCESS) { + GGML_LOG_ERROR("Failed to get OpenCL device: %d\n", err); + return; + } + + cl_context cl_ctx = clCreateContext(nullptr, 1, &cl_device, nullptr, nullptr, &err); + if (err != CL_SUCCESS) { + GGML_LOG_ERROR("Failed to create OpenCL context: %d\n", err); + return; + } + + cl_queue = clCreateCommandQueueWithProperties(cl_ctx, cl_device, nullptr, &err); + if (err != CL_SUCCESS) { + GGML_LOG_ERROR("Failed to create OpenCL command queue: %d\n", err); + clReleaseContext(cl_ctx); + return; + } + + // Create OpenVINO remote context with queue sharing + remote_context = ov::intel_gpu::ocl::ClContext(ov_singleton_core(), cl_queue); + + // Release the context (queue keeps a reference) + clReleaseContext(cl_ctx); + } else if (device_name == "NPU") { remote_context = ov_singleton_core().get_default_context(device_name); } initialized = true; } +ggml_openvino_device_config::~ggml_openvino_device_config() { + if (cl_queue != nullptr) { + clReleaseCommandQueue(cl_queue); + cl_queue = nullptr; + } +} + // Get the global device config singleton ggml_openvino_device_config & ggml_openvino_get_device_config() { static ggml_openvino_device_config config; @@ -84,6 +127,39 @@ const ov::AnyMap & ggml_openvino_get_compile_config() { return ggml_openvino_get_device_config().compile_config; } +// Get the OpenCL command queue for GPU operations +cl_command_queue ggml_openvino_get_cl_queue() { + return ggml_openvino_get_device_config().cl_queue; +} + +// Get the clEnqueueMemFillINTEL function pointer (lazy load) +clEnqueueMemFillINTEL_fn ggml_openvino_get_clEnqueueMemFillINTEL() { + static clEnqueueMemFillINTEL_fn fn = nullptr; + static bool loaded = false; + if (!loaded) { + loaded = true; + cl_platform_id platform; + if (clGetPlatformIDs(1, &platform, nullptr) == CL_SUCCESS) { + fn = (clEnqueueMemFillINTEL_fn) clGetExtensionFunctionAddressForPlatform(platform, "clEnqueueMemFillINTEL"); + } + } + return fn; +} + +// Get the clEnqueueMemcpyINTEL function pointer (lazy load) +clEnqueueMemcpyINTEL_fn ggml_openvino_get_clEnqueueMemcpyINTEL() { + static clEnqueueMemcpyINTEL_fn fn = nullptr; + static bool loaded = false; + if (!loaded) { + loaded = true; + cl_platform_id platform; + if (clGetPlatformIDs(1, &platform, nullptr) == CL_SUCCESS) { + fn = (clEnqueueMemcpyINTEL_fn) clGetExtensionFunctionAddressForPlatform(platform, "clEnqueueMemcpyINTEL"); + } + } + return fn; +} + // Get requantization type for a tensor type (returns nullopt if no requant needed) std::optional ggml_openvino_get_requant_type(ggml_type type) { if (!ggml_openvino_is_npu()) { diff --git a/ggml/src/ggml-openvino/ggml-openvino-extra.h b/ggml/src/ggml-openvino/ggml-openvino-extra.h index fdd8312dff..a1a8514190 100644 --- a/ggml/src/ggml-openvino/ggml-openvino-extra.h +++ b/ggml/src/ggml-openvino/ggml-openvino-extra.h @@ -3,6 +3,9 @@ #include "ggml.h" #include "openvino/runtime/core.hpp" +#define CL_TARGET_OPENCL_VERSION 300 +#include + #include #include #include @@ -22,6 +25,34 @@ std::optional ggml_openvino_get_remote_context(); // Get the compile config for the current device const ov::AnyMap & ggml_openvino_get_compile_config(); +// Get the OpenCL command queue for GPU operations (returns nullptr for CPU/NPU) +cl_command_queue ggml_openvino_get_cl_queue(); + +// Intel USM 
extension function type +typedef cl_int(CL_API_CALL * clEnqueueMemFillINTEL_fn)(cl_command_queue queue, + void * dst_ptr, + const void * pattern, + size_t pattern_size, + size_t size, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event); + +typedef cl_int(CL_API_CALL * clEnqueueMemcpyINTEL_fn)(cl_command_queue queue, + cl_bool blocking, + void * dst_ptr, + const void * src_ptr, + size_t size, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event); + +// Get the clEnqueueMemFillINTEL function pointer (returns nullptr if not available) +clEnqueueMemFillINTEL_fn ggml_openvino_get_clEnqueueMemFillINTEL(); + +// Get the clEnqueueMemcpyINTEL function pointer (returns nullptr if not available) +clEnqueueMemcpyINTEL_fn ggml_openvino_get_clEnqueueMemcpyINTEL(); + // ===================================================== // Global Device Configuration (singleton) // ===================================================== @@ -33,8 +64,10 @@ struct ggml_openvino_device_config { bool initialized = false; std::optional remote_context; ov::AnyMap compile_config; + cl_command_queue cl_queue = nullptr; void init(); + ~ggml_openvino_device_config(); }; // Get the global device config singleton diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp index c5c25fb6c1..e139c2d662 100644 --- a/ggml/src/ggml-openvino/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino.cpp @@ -8,6 +8,8 @@ #include "ggml-quants.hpp" #include "ggml.h" +#include + #include #include #include @@ -52,17 +54,23 @@ struct ggml_backend_openvino_buffer_context { // For non-weight buffers (KV cache, compute), we still use contiguous allocation void * data; size_t size; + bool is_remote; - std::shared_ptr ov_tensor; + // Wrapping of the buffer + std::shared_ptr ov_buffer; // Track all extras for cleanup std::map tensor_extras; - ggml_backend_openvino_buffer_context(int device, size_t size) : + // Used for re-allocation on device for kvcache + void * data_prev; + + ggml_backend_openvino_buffer_context(int device, size_t size, bool is_remote = false) : device(device), name(std::string(GGML_OPENVINO_NAME) + std::to_string(device)), data(nullptr), - size(size) { + size(size), + is_remote(is_remote) { if (size == 0) { return; } @@ -76,17 +84,22 @@ struct ggml_backend_openvino_buffer_context { #else data = aligned_alloc(GGML_OPENVINO_BUFFER_ALIGNMENT, size); #endif - ov_tensor = std::make_shared(ov::element::u8, ov::Shape{size}, data); + ov_buffer = std::make_shared(ov::element::u8, ov::Shape{size}, data); } else if (device_name == "GPU") { auto gpu_context = core.get_default_context("GPU").as(); - auto usm_tensor = gpu_context.create_usm_host_tensor(ov::element::u8, ov::Shape{size}); + ov::intel_gpu::ocl::USMTensor usm_tensor; + if (is_remote) { + usm_tensor = gpu_context.create_usm_device_tensor(ov::element::u8, ov::Shape{size}); + } else { + usm_tensor = gpu_context.create_usm_host_tensor(ov::element::u8, ov::Shape{size}); + } data = usm_tensor.get(); - ov_tensor = std::make_shared(std::move(usm_tensor)); + ov_buffer = std::make_shared(std::move(usm_tensor)); } else { auto npu_context = core.get_default_context("NPU").as(); auto l0_tensor = npu_context.create_l0_host_tensor(ov::element::u8, ov::Shape{size}); data = l0_tensor.get(); - ov_tensor = std::make_shared(std::move(l0_tensor)); + ov_buffer = std::make_shared(std::move(l0_tensor)); } if (data == nullptr) { @@ -135,6 +148,22 @@ static void * 
ggml_backend_openvino_buffer_get_base(ggml_backend_buffer_t buffer } static enum ggml_status ggml_backend_openvino_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { + GGML_LOG_DEBUG("%s: buffer usage=%d, tensor name=%s\n", __func__, buffer->usage, tensor->name); + ggml_backend_openvino_buffer_context * ctx = (ggml_backend_openvino_buffer_context *) buffer->context; + + // Put kvcache on device memory for GPU + if (buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY && strncmp(tensor->name, "cache_", 6) == 0 && !ctx->is_remote && + ggml_openvino_get_device_name() == "GPU") { + GGML_ASSERT(ctx->tensor_extras.empty()); + auto device = ctx->device; + auto size = ctx->size; + auto * data_prev = ctx->data; + delete ctx; + ctx = new ggml_backend_openvino_buffer_context(device, size, true); + buffer->context = ctx; + tensor->data = (char *) ctx->data + ((char *) tensor->data - (char *) data_prev); + } + // Views share the extra from view_src if (tensor->view_src != nullptr) { GGML_ASSERT(tensor->view_src->buffer->buft == buffer->buft); @@ -144,7 +173,7 @@ static enum ggml_status ggml_backend_openvino_buffer_init_tensor(ggml_backend_bu return GGML_STATUS_SUCCESS; } - ggml_backend_openvino_buffer_context * ctx = (ggml_backend_openvino_buffer_context *) buffer->context; + ctx = (ggml_backend_openvino_buffer_context *) buffer->context; if (tensor->data != nullptr) { ggml_openvino_tensor_extra * extra = ggml_openvino_create_tensor_extra(tensor); @@ -166,9 +195,28 @@ static void ggml_backend_openvino_buffer_memset_tensor(ggml_backend_buffer_t buf uint8_t value, size_t offset, size_t size) { + GGML_LOG_DEBUG("%s: buffer usage=%d, tensor name=%s\n", __func__, buffer->usage, tensor->name); GGML_ASSERT(tensor != nullptr && tensor->data != nullptr); - memset((char *) tensor->data + offset, value, size); - GGML_UNUSED(buffer); + ggml_backend_openvino_buffer_context * ctx = (ggml_backend_openvino_buffer_context *) buffer->context; + + if (ctx->is_remote) { + // For remote (device) buffers, use OpenCL USM memfill + cl_command_queue queue = ggml_openvino_get_cl_queue(); + auto mem_fill_fn = ggml_openvino_get_clEnqueueMemFillINTEL(); + if (queue != nullptr && mem_fill_fn != nullptr) { + uint8_t pattern = value; + cl_int err = mem_fill_fn(queue, (char *) tensor->data + offset, &pattern, sizeof(pattern), size, 0, nullptr, + nullptr); + if (err != CL_SUCCESS) { + GGML_LOG_ERROR("%s: clEnqueueMemFillINTEL failed with error %d\n", __func__, err); + } + clFinish(queue); + } else { + GGML_LOG_ERROR("%s: no OpenCL queue or clEnqueueMemFillINTEL not available for GPU buffer\n", __func__); + } + } else { + memset((char *) tensor->data + offset, value, size); + } } static void ggml_backend_openvino_buffer_set_tensor(ggml_backend_buffer_t buffer, @@ -176,6 +224,7 @@ static void ggml_backend_openvino_buffer_set_tensor(ggml_backend_buffer_t buffer const void * data, size_t offset, size_t size) { + // GGML_LOG_DEBUG("%s: buffer usage=%d, tensor name=%s\n", __func__, buffer->usage, tensor->name); GGML_ASSERT(tensor != nullptr && tensor->data != nullptr); ggml_backend_openvino_buffer_context * ctx = (ggml_backend_openvino_buffer_context *) buffer->context; @@ -260,8 +309,23 @@ static void ggml_backend_openvino_buffer_set_tensor(ggml_backend_buffer_t buffer e.what()); } } else { - // Non-weight tensor (KV cache, activations, etc.) - just copy data - memcpy((char *) tensor->data + offset, data, size); + // Non-weight tensor (KV cache, activations, etc.) 
- copy data + if (ctx->is_remote) { + // For remote (device) buffers, use OpenCL USM memcpy (host-to-device) + cl_command_queue queue = ggml_openvino_get_cl_queue(); + auto mem_cpy_fn = ggml_openvino_get_clEnqueueMemcpyINTEL(); + if (queue != nullptr && mem_cpy_fn != nullptr) { + cl_int err = + mem_cpy_fn(queue, CL_TRUE, (char *) tensor->data + offset, data, size, 0, nullptr, nullptr); + if (err != CL_SUCCESS) { + GGML_LOG_ERROR("%s: clEnqueueMemcpyINTEL failed with error %d\n", __func__, err); + } + } else { + GGML_LOG_ERROR("%s: no OpenCL queue or clEnqueueMemcpyINTEL not available for GPU buffer\n", __func__); + } + } else { + memcpy((char *) tensor->data + offset, data, size); + } ggml_openvino_tensor_extra * extra = ggml_openvino_create_tensor_extra(tensor); if (extra == nullptr) { @@ -283,28 +347,99 @@ static void ggml_backend_openvino_buffer_get_tensor(ggml_backend_buffer_t buffer void * data, size_t offset, size_t size) { + // GGML_LOG_DEBUG("%s: buffer usage=%d, tensor name=%s\n", __func__, buffer->usage, tensor->name); GGML_ASSERT(tensor != nullptr && tensor->data != nullptr); - memcpy(data, (const char *) tensor->data + offset, size); - GGML_UNUSED(buffer); + ggml_backend_openvino_buffer_context * ctx = (ggml_backend_openvino_buffer_context *) buffer->context; + + if (ctx->is_remote) { + // For remote (device) buffers, use OpenCL USM memcpy (device-to-host) + cl_command_queue queue = ggml_openvino_get_cl_queue(); + auto mem_cpy_fn = ggml_openvino_get_clEnqueueMemcpyINTEL(); + if (queue != nullptr && mem_cpy_fn != nullptr) { + cl_int err = + mem_cpy_fn(queue, CL_TRUE, data, (const char *) tensor->data + offset, size, 0, nullptr, nullptr); + if (err != CL_SUCCESS) { + GGML_LOG_ERROR("%s: clEnqueueMemcpyINTEL failed with error %d\n", __func__, err); + } + } else { + GGML_LOG_ERROR("%s: no OpenCL queue or clEnqueueMemcpyINTEL not available for GPU buffer\n", __func__); + } + } else { + memcpy(data, (const char *) tensor->data + offset, size); + } } static bool ggml_backend_openvino_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) { + // GGML_LOG_DEBUG("%s: src tensor name=%s, dst tensor name=%s\n", __func__, src->name, dst->name); GGML_ASSERT(src != nullptr && dst != nullptr); - // Can copy from any host buffer (including other OpenVINO buffers) + ggml_backend_openvino_buffer_context * ctx = (ggml_backend_openvino_buffer_context *) buffer->context; + + if (ctx->is_remote) { + // For remote (device) buffers, use OpenCL USM memcpy + cl_command_queue queue = ggml_openvino_get_cl_queue(); + auto mem_cpy_fn = ggml_openvino_get_clEnqueueMemcpyINTEL(); + if (queue == nullptr || mem_cpy_fn == nullptr) { + GGML_LOG_ERROR("%s: no OpenCL queue or clEnqueueMemcpyINTEL not available for GPU buffer\n", __func__); + return false; + } + // Can copy from host to device + if (ggml_backend_buffer_is_host(src->buffer)) { + cl_int err = mem_cpy_fn(queue, CL_TRUE, dst->data, src->data, ggml_nbytes(src), 0, nullptr, nullptr); + if (err != CL_SUCCESS) { + GGML_LOG_ERROR("%s: clEnqueueMemcpyINTEL (host-to-device) failed with error %d\n", __func__, err); + return false; + } + return true; + } + // Can also copy from device to device if both are OpenVINO remote buffers + if (ggml_backend_buffer_is_openvino(src->buffer)) { + ggml_backend_openvino_buffer_context * src_ctx = + (ggml_backend_openvino_buffer_context *) src->buffer->context; + if (src_ctx->is_remote) { + cl_int err = + mem_cpy_fn(queue, CL_TRUE, dst->data, src->data, ggml_nbytes(src), 0, nullptr, nullptr); 
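+                // CL_TRUE requests a blocking copy, so the transfer has already
+                // completed once clEnqueueMemcpyINTEL returns and no clFinish is
+                // needed before returning.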
+ if (err != CL_SUCCESS) { + GGML_LOG_ERROR("%s: clEnqueueMemcpyINTEL (device-to-device) failed with error %d\n", __func__, + err); + return false; + } + return true; + } + } + return false; + } + + // Host buffer - can copy from any host buffer if (ggml_backend_buffer_is_host(src->buffer)) { memcpy(dst->data, src->data, ggml_nbytes(src)); return true; } return false; - GGML_UNUSED(buffer); } static void ggml_backend_openvino_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { ggml_backend_openvino_buffer_context * ctx = (ggml_backend_openvino_buffer_context *) buffer->context; - if (ctx->data != nullptr) { + GGML_ASSERT(ctx->data != nullptr); + if (!ctx->is_remote) { memset(ctx->data, value, ctx->size); + } else { + // For remote (device) buffers, use OpenCL command queue + GGML_ASSERT(ggml_openvino_get_device_name() == "GPU"); + cl_command_queue queue = ggml_openvino_get_cl_queue(); + auto mem_fill_fn = ggml_openvino_get_clEnqueueMemFillINTEL(); + if (queue != nullptr && mem_fill_fn != nullptr) { + uint8_t pattern = value; + cl_int err = mem_fill_fn(queue, ctx->data, &pattern, sizeof(pattern), ctx->size, 0, nullptr, nullptr); + if (err != CL_SUCCESS) { + GGML_LOG_WARN("%s: clEnqueueMemFillINTEL failed with error %d\n", __func__, err); + } + clFinish(queue); + } else { + GGML_LOG_WARN("%s: no OpenCL queue or clEnqueueMemFillINTEL not available for GPU buffer clear\n", + __func__); + } } } From 8273a7c2f44ff3aed70638e057b5523471020995 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Wed, 24 Dec 2025 10:51:13 +0800 Subject: [PATCH 209/254] Use ggml_aligned_malloc --- ggml/include/ggml-openvino.h | 6 ++++-- ggml/src/ggml-openvino/ggml-openvino.cpp | 23 +++++------------------ 2 files changed, 9 insertions(+), 20 deletions(-) diff --git a/ggml/include/ggml-openvino.h b/ggml/include/ggml-openvino.h index 392e26c48e..46c1485f66 100644 --- a/ggml/include/ggml-openvino.h +++ b/ggml/include/ggml-openvino.h @@ -51,8 +51,10 @@ struct ggml_openvino_device_info { std::array default_tensor_split = {}; }; -const ggml_openvino_device_info & ggml_openvino_info(); - #ifdef __cplusplus } #endif + +#ifdef __cplusplus +const ggml_openvino_device_info & ggml_openvino_info(); +#endif diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp index e139c2d662..acaa3ddc00 100644 --- a/ggml/src/ggml-openvino/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino.cpp @@ -24,11 +24,6 @@ #include #include -#define GGML_OPENVINO_MAX_STREAMS 8 - -// OpenVINO buffer alignment (same as CPU for compatibility) -#define GGML_OPENVINO_BUFFER_ALIGNMENT 64 - // ===================================================== // OpenVINO Buffer Implementation using ov::Tensor // ===================================================== @@ -79,11 +74,7 @@ struct ggml_backend_openvino_buffer_context { auto & core = ov_singleton_core(); if (device_name == "CPU") { -#ifdef _WIN32 - data = _aligned_malloc(alloc_size, GGML_OPENVINO_BUFFER_ALIGNMENT); -#else - data = aligned_alloc(GGML_OPENVINO_BUFFER_ALIGNMENT, size); -#endif + data = ggml_aligned_malloc(size); ov_buffer = std::make_shared(ov::element::u8, ov::Shape{size}, data); } else if (device_name == "GPU") { auto gpu_context = core.get_default_context("GPU").as(); @@ -107,9 +98,9 @@ struct ggml_backend_openvino_buffer_context { return; } - if (reinterpret_cast(data) % GGML_OPENVINO_BUFFER_ALIGNMENT != 0) { + if (reinterpret_cast(data) % TENSOR_ALIGNMENT != 0) { GGML_LOG_ERROR("%s: %s buffer is not aligned to %d bytes\n", __func__, 
device_name.c_str(), - GGML_OPENVINO_BUFFER_ALIGNMENT); + TENSOR_ALIGNMENT); GGML_ABORT("fatal error"); } } @@ -121,11 +112,7 @@ struct ggml_backend_openvino_buffer_context { } tensor_extras.clear(); if (data && ggml_openvino_get_device_name() == "CPU") { -#ifdef _WIN32 - _aligned_free(data); -#else - free(data); -#endif + ggml_aligned_free(data, size); } } }; @@ -479,7 +466,7 @@ static ggml_backend_buffer_t ggml_backend_openvino_buffer_type_alloc_buffer(ggml static size_t ggml_backend_openvino_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { GGML_UNUSED(buft); - return GGML_OPENVINO_BUFFER_ALIGNMENT; + return TENSOR_ALIGNMENT; } static size_t ggml_backend_openvino_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { From 88d1d17eacd0380c00084f982a2dbe872c57357e Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Thu, 25 Dec 2025 16:07:44 +0800 Subject: [PATCH 210/254] only use remote tensor for kvcache --- .../src/ggml-openvino/ggml-openvino-extra.cpp | 22 +++---- ggml/src/ggml-openvino/ggml-openvino-extra.h | 2 +- ggml/src/ggml-openvino/ggml-openvino.cpp | 60 +++++++++---------- 3 files changed, 40 insertions(+), 44 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-openvino-extra.cpp b/ggml/src/ggml-openvino/ggml-openvino-extra.cpp index aa50d46c03..908a975247 100644 --- a/ggml/src/ggml-openvino/ggml-openvino-extra.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino-extra.cpp @@ -290,7 +290,7 @@ ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_ten return layout; } -ggml_openvino_tensor_extra * ggml_openvino_create_tensor_extra(const ggml_tensor * tensor) { +ggml_openvino_tensor_extra * ggml_openvino_create_tensor_extra(const ggml_tensor * tensor, bool is_remote) { ov::Shape shape; for (int i = GGML_MAX_DIMS - 1; i >= 0; --i) { shape.push_back(static_cast(tensor->ne[i])); @@ -322,16 +322,18 @@ ggml_openvino_tensor_extra * ggml_openvino_create_tensor_extra(const ggml_tensor auto remote_context = ggml_openvino_get_remote_context(); std::shared_ptr ov_tensor; - if (device_name == "CPU") { - ov_tensor = std::make_shared(element_type, shape, tensor->data); - } else if (device_name == "GPU") { - auto gpu_context = remote_context->as(); - auto usm_tensor = gpu_context.create_tensor(element_type, shape, tensor->data); - ov_tensor = std::make_shared(std::move(usm_tensor)); + if (is_remote) { + if (device_name == "GPU") { + auto gpu_context = remote_context->as(); + auto usm_tensor = gpu_context.create_tensor(element_type, shape, tensor->data); + ov_tensor = std::make_shared(std::move(usm_tensor)); + } else { + auto npu_context = remote_context->as(); + auto l0_tensor = npu_context.create_tensor(element_type, shape, tensor->data); + ov_tensor = std::make_shared(std::move(l0_tensor)); + } } else { - auto npu_context = remote_context->as(); - auto l0_tensor = npu_context.create_tensor(element_type, shape, tensor->data); - ov_tensor = std::make_shared(std::move(l0_tensor)); + ov_tensor = std::make_shared(element_type, shape, tensor->data); } return new ggml_openvino_tensor_extra(ov_tensor); diff --git a/ggml/src/ggml-openvino/ggml-openvino-extra.h b/ggml/src/ggml-openvino/ggml-openvino-extra.h index a1a8514190..2f9d257769 100644 --- a/ggml/src/ggml-openvino/ggml-openvino-extra.h +++ b/ggml/src/ggml-openvino/ggml-openvino-extra.h @@ -155,4 +155,4 @@ struct ggml_openvino_extracted_layout { // Calculate the buffer layout for extracted quantized data ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_tensor * tensor); 
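+// When is_remote is true, the returned extra wraps a remote tensor created from
+// the device's remote context (USM on GPU, Level-Zero on NPU); otherwise it
+// wraps a plain host ov::Tensor over tensor->data.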
-ggml_openvino_tensor_extra * ggml_openvino_create_tensor_extra(const ggml_tensor * tensor); +ggml_openvino_tensor_extra * ggml_openvino_create_tensor_extra(const ggml_tensor * tensor, bool is_remote); diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp index acaa3ddc00..c0d555e86f 100644 --- a/ggml/src/ggml-openvino/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino.cpp @@ -8,8 +8,6 @@ #include "ggml-quants.hpp" #include "ggml.h" -#include - #include #include #include @@ -73,24 +71,22 @@ struct ggml_backend_openvino_buffer_context { const auto & device_name = ggml_openvino_get_device_name(); auto & core = ov_singleton_core(); - if (device_name == "CPU") { + if (is_remote) { + if (device_name == "GPU") { + auto gpu_context = core.get_default_context("GPU").as(); + ov::intel_gpu::ocl::USMTensor usm_tensor = + gpu_context.create_usm_device_tensor(ov::element::u8, ov::Shape{size}); + data = usm_tensor.get(); + ov_buffer = std::make_shared(std::move(usm_tensor)); + } else { + auto npu_context = core.get_default_context("NPU").as(); + auto l0_tensor = npu_context.create_l0_host_tensor(ov::element::u8, ov::Shape{size}); + data = l0_tensor.get(); + ov_buffer = std::make_shared(std::move(l0_tensor)); + } + } else { data = ggml_aligned_malloc(size); ov_buffer = std::make_shared(ov::element::u8, ov::Shape{size}, data); - } else if (device_name == "GPU") { - auto gpu_context = core.get_default_context("GPU").as(); - ov::intel_gpu::ocl::USMTensor usm_tensor; - if (is_remote) { - usm_tensor = gpu_context.create_usm_device_tensor(ov::element::u8, ov::Shape{size}); - } else { - usm_tensor = gpu_context.create_usm_host_tensor(ov::element::u8, ov::Shape{size}); - } - data = usm_tensor.get(); - ov_buffer = std::make_shared(std::move(usm_tensor)); - } else { - auto npu_context = core.get_default_context("NPU").as(); - auto l0_tensor = npu_context.create_l0_host_tensor(ov::element::u8, ov::Shape{size}); - data = l0_tensor.get(); - ov_buffer = std::make_shared(std::move(l0_tensor)); } if (data == nullptr) { @@ -111,7 +107,7 @@ struct ggml_backend_openvino_buffer_context { delete pair.second; } tensor_extras.clear(); - if (data && ggml_openvino_get_device_name() == "CPU") { + if (!is_remote && data != nullptr) { ggml_aligned_free(data, size); } } @@ -135,12 +131,12 @@ static void * ggml_backend_openvino_buffer_get_base(ggml_backend_buffer_t buffer } static enum ggml_status ggml_backend_openvino_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { - GGML_LOG_DEBUG("%s: buffer usage=%d, tensor name=%s\n", __func__, buffer->usage, tensor->name); + // GGML_LOG_DEBUG("%s: buffer usage=%d, tensor name=%s\n", __func__, buffer->usage, tensor->name); ggml_backend_openvino_buffer_context * ctx = (ggml_backend_openvino_buffer_context *) buffer->context; - // Put kvcache on device memory for GPU + // Put kvcache on device memory if (buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY && strncmp(tensor->name, "cache_", 6) == 0 && !ctx->is_remote && - ggml_openvino_get_device_name() == "GPU") { + ggml_openvino_get_device_name() != "CPU") { GGML_ASSERT(ctx->tensor_extras.empty()); auto device = ctx->device; auto size = ctx->size; @@ -163,7 +159,7 @@ static enum ggml_status ggml_backend_openvino_buffer_init_tensor(ggml_backend_bu ctx = (ggml_backend_openvino_buffer_context *) buffer->context; if (tensor->data != nullptr) { - ggml_openvino_tensor_extra * extra = ggml_openvino_create_tensor_extra(tensor); + ggml_openvino_tensor_extra * extra = 
ggml_openvino_create_tensor_extra(tensor, ctx->is_remote); if (extra != nullptr) { auto it = ctx->tensor_extras.find(tensor); if (it != ctx->tensor_extras.end()) { @@ -186,7 +182,7 @@ static void ggml_backend_openvino_buffer_memset_tensor(ggml_backend_buffer_t buf GGML_ASSERT(tensor != nullptr && tensor->data != nullptr); ggml_backend_openvino_buffer_context * ctx = (ggml_backend_openvino_buffer_context *) buffer->context; - if (ctx->is_remote) { + if (ctx->is_remote && ggml_openvino_get_device_name() == "GPU") { // For remote (device) buffers, use OpenCL USM memfill cl_command_queue queue = ggml_openvino_get_cl_queue(); auto mem_fill_fn = ggml_openvino_get_clEnqueueMemFillINTEL(); @@ -297,8 +293,7 @@ static void ggml_backend_openvino_buffer_set_tensor(ggml_backend_buffer_t buffer } } else { // Non-weight tensor (KV cache, activations, etc.) - copy data - if (ctx->is_remote) { - // For remote (device) buffers, use OpenCL USM memcpy (host-to-device) + if (ctx->is_remote && ggml_openvino_get_device_name() == "GPU") { cl_command_queue queue = ggml_openvino_get_cl_queue(); auto mem_cpy_fn = ggml_openvino_get_clEnqueueMemcpyINTEL(); if (queue != nullptr && mem_cpy_fn != nullptr) { @@ -314,7 +309,7 @@ static void ggml_backend_openvino_buffer_set_tensor(ggml_backend_buffer_t buffer memcpy((char *) tensor->data + offset, data, size); } - ggml_openvino_tensor_extra * extra = ggml_openvino_create_tensor_extra(tensor); + ggml_openvino_tensor_extra * extra = ggml_openvino_create_tensor_extra(tensor, ctx->is_remote); if (extra == nullptr) { GGML_LOG_ERROR("%s: failed to create tensor extra for %s\n", __func__, tensor->name); return; @@ -338,7 +333,7 @@ static void ggml_backend_openvino_buffer_get_tensor(ggml_backend_buffer_t buffer GGML_ASSERT(tensor != nullptr && tensor->data != nullptr); ggml_backend_openvino_buffer_context * ctx = (ggml_backend_openvino_buffer_context *) buffer->context; - if (ctx->is_remote) { + if (ctx->is_remote && ggml_openvino_get_device_name() == "GPU") { // For remote (device) buffers, use OpenCL USM memcpy (device-to-host) cl_command_queue queue = ggml_openvino_get_cl_queue(); auto mem_cpy_fn = ggml_openvino_get_clEnqueueMemcpyINTEL(); @@ -363,7 +358,7 @@ static bool ggml_backend_openvino_buffer_cpy_tensor(ggml_backend_buffer_t buffer GGML_ASSERT(src != nullptr && dst != nullptr); ggml_backend_openvino_buffer_context * ctx = (ggml_backend_openvino_buffer_context *) buffer->context; - if (ctx->is_remote) { + if (ctx->is_remote && ggml_openvino_get_device_name() == "GPU") { // For remote (device) buffers, use OpenCL USM memcpy cl_command_queue queue = ggml_openvino_get_cl_queue(); auto mem_cpy_fn = ggml_openvino_get_clEnqueueMemcpyINTEL(); @@ -409,10 +404,7 @@ static bool ggml_backend_openvino_buffer_cpy_tensor(ggml_backend_buffer_t buffer static void ggml_backend_openvino_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { ggml_backend_openvino_buffer_context * ctx = (ggml_backend_openvino_buffer_context *) buffer->context; GGML_ASSERT(ctx->data != nullptr); - if (!ctx->is_remote) { - memset(ctx->data, value, ctx->size); - } else { - // For remote (device) buffers, use OpenCL command queue + if (ctx->is_remote && ggml_openvino_get_device_name() == "GPU") { GGML_ASSERT(ggml_openvino_get_device_name() == "GPU"); cl_command_queue queue = ggml_openvino_get_cl_queue(); auto mem_fill_fn = ggml_openvino_get_clEnqueueMemFillINTEL(); @@ -427,6 +419,8 @@ static void ggml_backend_openvino_buffer_clear(ggml_backend_buffer_t buffer, uin GGML_LOG_WARN("%s: no OpenCL queue or 
clEnqueueMemFillINTEL not available for GPU buffer clear\n", __func__); } + } else { + memset(ctx->data, value, ctx->size); } } From a356b4447789e81d8f0f4ec32f311cd8d2b1ba78 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Thu, 25 Dec 2025 17:08:51 +0800 Subject: [PATCH 211/254] only use remote tensor for kvcache for GPU --- .../src/ggml-openvino/ggml-openvino-extra.cpp | 19 ++++------- ggml/src/ggml-openvino/ggml-openvino.cpp | 34 ++++++++----------- 2 files changed, 21 insertions(+), 32 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-openvino-extra.cpp b/ggml/src/ggml-openvino/ggml-openvino-extra.cpp index 908a975247..eff1627cb4 100644 --- a/ggml/src/ggml-openvino/ggml-openvino-extra.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino-extra.cpp @@ -1,6 +1,7 @@ #include "ggml-openvino-extra.h" #include "ggml-impl.h" +#include "ggml.h" #include #include @@ -224,9 +225,8 @@ ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_ten layout.weights_per_block = 32; break; default: - // Unsupported requant type - fall through to normal extraction - layout.is_requant = false; - layout.requant_type = std::nullopt; + layout.weights_per_block = -1; + GGML_ABORT("Code of re-quantizing to channel-wise is not updated"); break; } @@ -323,15 +323,10 @@ ggml_openvino_tensor_extra * ggml_openvino_create_tensor_extra(const ggml_tensor std::shared_ptr ov_tensor; if (is_remote) { - if (device_name == "GPU") { - auto gpu_context = remote_context->as(); - auto usm_tensor = gpu_context.create_tensor(element_type, shape, tensor->data); - ov_tensor = std::make_shared(std::move(usm_tensor)); - } else { - auto npu_context = remote_context->as(); - auto l0_tensor = npu_context.create_tensor(element_type, shape, tensor->data); - ov_tensor = std::make_shared(std::move(l0_tensor)); - } + GGML_ASSERT(device_name == "GPU"); + auto gpu_context = remote_context->as(); + auto usm_tensor = gpu_context.create_tensor(element_type, shape, tensor->data); + ov_tensor = std::make_shared(std::move(usm_tensor)); } else { ov_tensor = std::make_shared(element_type, shape, tensor->data); } diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp index c0d555e86f..9b1fd55adf 100644 --- a/ggml/src/ggml-openvino/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino.cpp @@ -72,18 +72,13 @@ struct ggml_backend_openvino_buffer_context { auto & core = ov_singleton_core(); if (is_remote) { - if (device_name == "GPU") { - auto gpu_context = core.get_default_context("GPU").as(); - ov::intel_gpu::ocl::USMTensor usm_tensor = - gpu_context.create_usm_device_tensor(ov::element::u8, ov::Shape{size}); - data = usm_tensor.get(); - ov_buffer = std::make_shared(std::move(usm_tensor)); - } else { - auto npu_context = core.get_default_context("NPU").as(); - auto l0_tensor = npu_context.create_l0_host_tensor(ov::element::u8, ov::Shape{size}); - data = l0_tensor.get(); - ov_buffer = std::make_shared(std::move(l0_tensor)); - } + // NPU memory is too small even for kvcache + GGML_ASSERT(device_name == "GPU"); + auto gpu_context = core.get_default_context("GPU").as(); + ov::intel_gpu::ocl::USMTensor usm_tensor = + gpu_context.create_usm_device_tensor(ov::element::u8, ov::Shape{size}); + data = usm_tensor.get(); + ov_buffer = std::make_shared(std::move(usm_tensor)); } else { data = ggml_aligned_malloc(size); ov_buffer = std::make_shared(ov::element::u8, ov::Shape{size}, data); @@ -134,9 +129,9 @@ static enum ggml_status ggml_backend_openvino_buffer_init_tensor(ggml_backend_bu // 
GGML_LOG_DEBUG("%s: buffer usage=%d, tensor name=%s\n", __func__, buffer->usage, tensor->name); ggml_backend_openvino_buffer_context * ctx = (ggml_backend_openvino_buffer_context *) buffer->context; - // Put kvcache on device memory + // Put kvcache on device memory for GPU if (buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY && strncmp(tensor->name, "cache_", 6) == 0 && !ctx->is_remote && - ggml_openvino_get_device_name() != "CPU") { + ggml_openvino_get_device_name() == "GPU") { GGML_ASSERT(ctx->tensor_extras.empty()); auto device = ctx->device; auto size = ctx->size; @@ -182,7 +177,7 @@ static void ggml_backend_openvino_buffer_memset_tensor(ggml_backend_buffer_t buf GGML_ASSERT(tensor != nullptr && tensor->data != nullptr); ggml_backend_openvino_buffer_context * ctx = (ggml_backend_openvino_buffer_context *) buffer->context; - if (ctx->is_remote && ggml_openvino_get_device_name() == "GPU") { + if (ctx->is_remote) { // For remote (device) buffers, use OpenCL USM memfill cl_command_queue queue = ggml_openvino_get_cl_queue(); auto mem_fill_fn = ggml_openvino_get_clEnqueueMemFillINTEL(); @@ -293,7 +288,7 @@ static void ggml_backend_openvino_buffer_set_tensor(ggml_backend_buffer_t buffer } } else { // Non-weight tensor (KV cache, activations, etc.) - copy data - if (ctx->is_remote && ggml_openvino_get_device_name() == "GPU") { + if (ctx->is_remote) { cl_command_queue queue = ggml_openvino_get_cl_queue(); auto mem_cpy_fn = ggml_openvino_get_clEnqueueMemcpyINTEL(); if (queue != nullptr && mem_cpy_fn != nullptr) { @@ -333,7 +328,7 @@ static void ggml_backend_openvino_buffer_get_tensor(ggml_backend_buffer_t buffer GGML_ASSERT(tensor != nullptr && tensor->data != nullptr); ggml_backend_openvino_buffer_context * ctx = (ggml_backend_openvino_buffer_context *) buffer->context; - if (ctx->is_remote && ggml_openvino_get_device_name() == "GPU") { + if (ctx->is_remote) { // For remote (device) buffers, use OpenCL USM memcpy (device-to-host) cl_command_queue queue = ggml_openvino_get_cl_queue(); auto mem_cpy_fn = ggml_openvino_get_clEnqueueMemcpyINTEL(); @@ -358,7 +353,7 @@ static bool ggml_backend_openvino_buffer_cpy_tensor(ggml_backend_buffer_t buffer GGML_ASSERT(src != nullptr && dst != nullptr); ggml_backend_openvino_buffer_context * ctx = (ggml_backend_openvino_buffer_context *) buffer->context; - if (ctx->is_remote && ggml_openvino_get_device_name() == "GPU") { + if (ctx->is_remote) { // For remote (device) buffers, use OpenCL USM memcpy cl_command_queue queue = ggml_openvino_get_cl_queue(); auto mem_cpy_fn = ggml_openvino_get_clEnqueueMemcpyINTEL(); @@ -404,8 +399,7 @@ static bool ggml_backend_openvino_buffer_cpy_tensor(ggml_backend_buffer_t buffer static void ggml_backend_openvino_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { ggml_backend_openvino_buffer_context * ctx = (ggml_backend_openvino_buffer_context *) buffer->context; GGML_ASSERT(ctx->data != nullptr); - if (ctx->is_remote && ggml_openvino_get_device_name() == "GPU") { - GGML_ASSERT(ggml_openvino_get_device_name() == "GPU"); + if (ctx->is_remote) { cl_command_queue queue = ggml_openvino_get_cl_queue(); auto mem_fill_fn = ggml_openvino_get_clEnqueueMemFillINTEL(); if (queue != nullptr && mem_fill_fn != nullptr) { From cfc471353d3fe2724714b8cb585d8d0d299a329e Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Fri, 26 Dec 2025 11:38:45 +0800 Subject: [PATCH 212/254] FIX: use remote tensor from singleton --- ggml/src/ggml-openvino/ggml-openvino.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git 
a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp index 9b1fd55adf..a1b5b5dd32 100644 --- a/ggml/src/ggml-openvino/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino.cpp @@ -69,12 +69,11 @@ struct ggml_backend_openvino_buffer_context { } const auto & device_name = ggml_openvino_get_device_name(); - auto & core = ov_singleton_core(); if (is_remote) { - // NPU memory is too small even for kvcache GGML_ASSERT(device_name == "GPU"); - auto gpu_context = core.get_default_context("GPU").as(); + auto remote_context = ggml_openvino_get_remote_context(); + auto gpu_context = remote_context->as(); ov::intel_gpu::ocl::USMTensor usm_tensor = gpu_context.create_usm_device_tensor(ov::element::u8, ov::Shape{size}); data = usm_tensor.get(); @@ -129,7 +128,7 @@ static enum ggml_status ggml_backend_openvino_buffer_init_tensor(ggml_backend_bu // GGML_LOG_DEBUG("%s: buffer usage=%d, tensor name=%s\n", __func__, buffer->usage, tensor->name); ggml_backend_openvino_buffer_context * ctx = (ggml_backend_openvino_buffer_context *) buffer->context; - // Put kvcache on device memory for GPU + // Put kvcache on device memory for GPU (NPU memory is too small even for kvcache) if (buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY && strncmp(tensor->name, "cache_", 6) == 0 && !ctx->is_remote && ggml_openvino_get_device_name() == "GPU") { GGML_ASSERT(ctx->tensor_extras.empty()); From 52a44012c08d0cf1e146bb75df9576394fb5fdc4 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Fri, 26 Dec 2025 13:52:09 +0800 Subject: [PATCH 213/254] Update build.md to include OpenCL --- docs/build.md | 41 +++++++++++++++++++++++++++-------------- 1 file changed, 27 insertions(+), 14 deletions(-) diff --git a/docs/build.md b/docs/build.md index b9d5139b3a..05cf22e01f 100644 --- a/docs/build.md +++ b/docs/build.md @@ -714,10 +714,23 @@ Follow the instructions below to install OpenVINO runtime and build llama.cpp wi sudo apt-get update sudo apt-get install -y build-essential libcurl4-openssl-dev libtbb12 cmake ninja-build python3-pip curl wget tar ``` + - OpenCL + ```bash + sudo apt install ocl-icd-opencl-dev opencl-headers opencl-clhpp-headers intel-opencl-icd + ``` - **Windows:** - - Download Microsoft.VisualStudio.2022.BuildTools [Visual_Studio_Build_Tools]https://aka.ms/vs/17/release/vs_BuildTools.exe Select "Desktop development with C++" under workloads. + - Download Microsoft.VisualStudio.2022.BuildTools: [Visual_Studio_Build_Tools](https://aka.ms/vs/17/release/vs_BuildTools.exe) + Select "Desktop development with C++" under workloads - Install git + - Install OpenCL with vcpkg + ```powershell + cd C:\ + git clone https://github.com/microsoft/vcpkg + cd vcpkg + bootstrap-vcpkg.bat + vcpkg install opencl + ``` - Use "x64 Native Tools Command Prompt" for Build ### 1. Install OpenVINO Runtime @@ -729,19 +742,19 @@ Follow the instructions below to install OpenVINO runtime and build llama.cpp wi
📦 Click to expand OpenVINO 2025.3 installation from an archive file on Ubuntu
- + ```bash wget https://raw.githubusercontent.com/ravi9/misc-scripts/main/openvino/ov-archive-install/install-openvino-from-archive.sh chmod +x install-openvino-from-archive.sh ./install-openvino-from-archive.sh ``` + + Verify OpenVINO is initialized properly: + ```bash + echo $OpenVINO_DIR + ```
- - Verify OpenVINO is initialized properly - - **Linux:** - ```bash - echo $OpenVINO_DIR - ``` ### 2. Build llama.cpp with OpenVINO Backend @@ -761,14 +774,14 @@ git switch dev_backend_openvino cmake --build build/ReleaseOV --config Release -j $(nproc) ``` -- **Windows:** +- **Windows:** ```bash # Build with OpenVINO support "C:\Program Files (x86)\Intel\openvino_2025.3.0\setupvars.bat" - cmake -B build/ReleaseOV -DCMAKE_BUILD_TYPE=Release -DGGML_OPENVINO=ON -DGGML_CPU_REPACK=OFF -DLLAMA_CURL=OFF + cmake -B build\ReleaseOV -DCMAKE_BUILD_TYPE=Release -DGGML_OPENVINO=ON -DGGML_CPU_REPACK=OFF -DLLAMA_CURL=OFF -DCMAKE_TOOLCHAIN_FILE=C:\vcpkg\scripts\buildsystems\vcpkg.cmake cmake --build build\ReleaseOV --config Release ``` - - For faster compilation, add the -- /m argument to run multiple jobs in parallel with as many CPU cores available. + - For faster compilation, add the -- /m argument to run multiple jobs in parallel with as many CPU cores available. ```bash cmake --build build\ReleaseOV --config Release -- /m ``` @@ -845,7 +858,7 @@ docker build --target=full -t llama-openvino:full -f .devops/openvino.Dockerfile # Build a minimal CLI-only image containing just the llama-cli executable. docker build --target=light -t llama-openvino:light -f .devops/openvino.Dockerfile . -# Builds a server-only image with llama-server executable, health check endpoint, and REST API support. +# Builds a server-only image with llama-server executable, health check endpoint, and REST API support. docker build --target=server -t llama-openvino:server -f .devops/openvino.Dockerfile . # If you are behind a proxy: @@ -868,17 +881,17 @@ llama-openvino:light --no-warmup -m /models/Llama-3.2-1B-Instruct.fp16.gguf docker run --rm -it --env GGML_OPENVINO_DEVICE=NPU -v ~/models:/models \ --device=/dev/accel --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) -u $(id -u):$(id -g) \ llama-openvino:light --no-warmup -m /models/Llama-3.2-1B-Instruct.fp16.gguf -``` +``` Run Llama.cpp Server with OpenVINO Backend ```bash # Run the Server Docker container server -docker run --rm -it -p 8080:8080 -v ~/models:/models llama-openvino:server --no-warmup -m /models/Llama-3.2-1B-Instruct.fp16.gguf +docker run --rm -it -p 8080:8080 -v ~/models:/models llama-openvino:server --no-warmup -m /models/Llama-3.2-1B-Instruct.fp16.gguf # In a NEW terminal, test the server with curl # If you are behind a proxy, make sure to set NO_PROXY to avoid proxy for localhost -export NO_PROXY=localhost,127.0.0.1 +export NO_PROXY=localhost,127.0.0.1 # Test health endpoint curl -f http://localhost:8080/health From c1142ddb7c060ab826aa34d57017c829028af5e9 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Fri, 26 Dec 2025 15:18:30 +0800 Subject: [PATCH 214/254] NPU always requant to q4_0_128 --- .../src/ggml-openvino/ggml-openvino-extra.cpp | 16 ++++++--- ggml/src/ggml-openvino/ggml-openvino-extra.h | 2 +- ggml/src/ggml-openvino/ggml-quants.cpp | 34 ------------------- ggml/src/ggml-openvino/ggml-quants.hpp | 4 --- 4 files changed, 12 insertions(+), 44 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-openvino-extra.cpp b/ggml/src/ggml-openvino/ggml-openvino-extra.cpp index eff1627cb4..26cc386dff 100644 --- a/ggml/src/ggml-openvino/ggml-openvino-extra.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino-extra.cpp @@ -3,6 +3,7 @@ #include "ggml-impl.h" #include "ggml.h" +#include #include #include @@ -162,19 +163,24 @@ clEnqueueMemcpyINTEL_fn ggml_openvino_get_clEnqueueMemcpyINTEL() { } // Get requantization type for a tensor type (returns nullopt if no 
requant needed) -std::optional ggml_openvino_get_requant_type(ggml_type type) { +std::optional ggml_openvino_get_requant_type(const ggml_tensor * tensor) { if (!ggml_openvino_is_npu()) { return std::nullopt; } // NPU requantization rules - switch (type) { + if (strncmp(tensor->name, "token_embd.weight", 17) == 0) { + return ExtraQuantType::F16; + } + if (strncmp(tensor->name, "output.weight", 13) == 0) { + return ExtraQuantType::Q4_0_128; + } + switch (tensor->type) { case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: case GGML_TYPE_Q4_K: - return ExtraQuantType::Q4_0_128; case GGML_TYPE_Q6_K: case GGML_TYPE_Q5_K: - return ExtraQuantType::F16; + return ExtraQuantType::Q4_0_128; default: return std::nullopt; } @@ -200,7 +206,7 @@ ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_ten const size_t alignment = 64; // Good for SIMD // Check if requantization is needed (NPU-specific) - auto requant_type = ggml_openvino_get_requant_type(tensor->type); + auto requant_type = ggml_openvino_get_requant_type(tensor); if (requant_type.has_value()) { layout.is_requant = true; layout.requant_type = requant_type; diff --git a/ggml/src/ggml-openvino/ggml-openvino-extra.h b/ggml/src/ggml-openvino/ggml-openvino-extra.h index 2f9d257769..fbfe459edf 100644 --- a/ggml/src/ggml-openvino/ggml-openvino-extra.h +++ b/ggml/src/ggml-openvino/ggml-openvino-extra.h @@ -83,7 +83,7 @@ const std::string & ggml_openvino_get_device_name(); bool ggml_openvino_is_npu(); // Get requantization type for a tensor type (returns nullopt if no requant needed) -std::optional ggml_openvino_get_requant_type(ggml_type type); +std::optional ggml_openvino_get_requant_type(const ggml_tensor * tensor); // ===================================================== // OpenVINO Tensor Extra Types diff --git a/ggml/src/ggml-openvino/ggml-quants.cpp b/ggml/src/ggml-openvino/ggml-quants.cpp index 6cacc7b034..1a5679cd8d 100644 --- a/ggml/src/ggml-openvino/ggml-quants.cpp +++ b/ggml/src/ggml-openvino/ggml-quants.cpp @@ -535,40 +535,6 @@ std::shared_ptr requantize_to_buffers(const ggml_tensor * tensor, return result; } -std::shared_ptr requantize(const ggml_tensor * tensor, ExtraQuantType requant_type) { - ov::Shape node_shape = {(uint64_t) (tensor->ne[1]), (uint64_t) (tensor->ne[0])}; - - // FIXME hardcoded workaround to fix the case where token_emb.weight is q4_0 (instead of q6_k) - // (In some q4_0 models which use two different weight for token_emb and output, token_emb is q4_0) - std::string device = getenv("GGML_OPENVINO_DEVICE") ? getenv("GGML_OPENVINO_DEVICE") : ""; - if (device == "NPU" && std::string(tensor->name) == "token_embd.weight") { - requant_type = ExtraQuantType::F16; - } - - // Determine block size - int64_t block_size = node_shape[1]; - if (requant_type == ExtraQuantType::Q4_0_128) { - block_size = 128; - } else if (requant_type == ExtraQuantType::Q8_0_32) { - block_size = 32; - } - - // Allocate tensors - ov::Tensor weights, scales, biases; - if (requant_type == ExtraQuantType::F16) { - weights = ov::Tensor(ov::element::f16, node_shape); - } else { - bool is_u4 = (requant_type == ExtraQuantType::Q4_0_C || requant_type == ExtraQuantType::Q4_0_128); - ov::element::Type weight_type = is_u4 ? 
ov::element::u4 : ov::element::u8; - ov::Shape scales_shape = {node_shape[0], node_shape[1] / block_size}; - weights = ov::Tensor(weight_type, node_shape); - scales = ov::Tensor(ov::element::f16, scales_shape); - biases = ov::Tensor(ov::element::f16, scales_shape); - } - - return requantize_to_buffers(tensor, tensor->data, requant_type, block_size, weights, scales, biases); -} - std::shared_ptr process_weight_tensor(const ggml_tensor * tensor, const void * data, void * output_base_ptr) { GGML_ASSERT(tensor != nullptr); GGML_ASSERT(data != nullptr); diff --git a/ggml/src/ggml-openvino/ggml-quants.hpp b/ggml/src/ggml-openvino/ggml-quants.hpp index b1d286f1b8..a1334e2408 100644 --- a/ggml/src/ggml-openvino/ggml-quants.hpp +++ b/ggml/src/ggml-openvino/ggml-quants.hpp @@ -52,10 +52,6 @@ ov::Output make_int4_weights(ov::Tensor& weight, ov::Tensor& biases, size_t group_size = GGML_QUANTIZATION_GROUP_SIZE); -// ExtraQuantType is defined in ggml-openvino-extra.h - -std::shared_ptr requantize(const ggml_tensor* tensor, ExtraQuantType requant_type); - // Extract quantized weights from tensor and create weight subgraph // If weights/scales/biases are provided (non-empty), uses them as output buffers // Otherwise allocates new ov::Tensors internally From 67c9720e496290411975d05244552fc8c6f11631 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Mon, 29 Dec 2025 15:25:59 +0800 Subject: [PATCH 215/254] Optimize symmetric quant weight extraction: use single zp --- .../src/ggml-openvino/ggml-openvino-extra.cpp | 32 ++++- ggml/src/ggml-openvino/ggml-openvino-extra.h | 1 + ggml/src/ggml-openvino/ggml-quants.cpp | 133 ++++++++++++++---- 3 files changed, 140 insertions(+), 26 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-openvino-extra.cpp b/ggml/src/ggml-openvino/ggml-openvino-extra.cpp index 26cc386dff..2f24d7a1db 100644 --- a/ggml/src/ggml-openvino/ggml-openvino-extra.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino-extra.cpp @@ -192,6 +192,7 @@ std::optional ggml_openvino_get_requant_type(const ggml_tensor * ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_tensor * tensor) { ggml_openvino_extracted_layout layout = {}; + layout.is_symmetric = false; if (!ggml_is_quantized(tensor->type)) { return layout; @@ -225,10 +226,26 @@ ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_ten case ExtraQuantType::Q4_0_128: layout.is_u4 = true; layout.weights_per_block = 128; + layout.is_symmetric = true; + break; + case ExtraQuantType::Q4_0_C: + layout.is_u4 = true; + layout.weights_per_block = tensor->ne[0]; + layout.is_symmetric = true; break; case ExtraQuantType::Q8_0_32: layout.is_u4 = false; layout.weights_per_block = 32; + layout.is_symmetric = true; + break; + case ExtraQuantType::Q8_0_C: + layout.is_u4 = false; + layout.weights_per_block = tensor->ne[0]; + layout.is_symmetric = true; + break; + case ExtraQuantType::Q8_1_C: + layout.is_u4 = false; + layout.weights_per_block = tensor->ne[0]; break; default: layout.weights_per_block = -1; @@ -241,7 +258,8 @@ ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_ten layout.weights_size = layout.is_u4 ? (n_elements / 2) : n_elements; int64_t n_blocks = n_elements / layout.weights_per_block; layout.scales_size = n_blocks * sizeof(uint16_t); - layout.biases_size = n_blocks * sizeof(uint16_t); + // For symmetric quantization, we only need one bias value (not one per block) + layout.biases_size = layout.is_symmetric ? 
sizeof(uint16_t) : n_blocks * sizeof(uint16_t); layout.weights_offset = 0; layout.scales_offset = ((layout.weights_size + alignment - 1) / alignment) * alignment; @@ -256,7 +274,14 @@ ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_ten // Normal extraction (no requant) - determine format based on tensor type switch (tensor->type) { case GGML_TYPE_Q4_0: + layout.is_u4 = true; + layout.weights_per_block = 32; + layout.is_symmetric = true; + break; case GGML_TYPE_Q4_1: + layout.is_u4 = true; + layout.weights_per_block = 32; + break; case GGML_TYPE_Q4_K: layout.is_u4 = true; layout.weights_per_block = 32; @@ -264,10 +289,12 @@ ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_ten case GGML_TYPE_Q8_0: layout.is_u4 = false; layout.weights_per_block = 32; + layout.is_symmetric = true; break; case GGML_TYPE_Q6_K: layout.is_u4 = false; layout.weights_per_block = 16; + layout.is_symmetric = true; break; case GGML_TYPE_Q5_K: layout.is_u4 = false; @@ -285,7 +312,8 @@ ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_ten // Scales and biases: F16 per block int64_t n_blocks = n_elements / layout.weights_per_block; layout.scales_size = n_blocks * sizeof(uint16_t); // F16 = 2 bytes - layout.biases_size = n_blocks * sizeof(uint16_t); // F16 = 2 bytes + // For symmetric quantization, we only need one bias value (not one per block) + layout.biases_size = layout.is_symmetric ? sizeof(uint16_t) : n_blocks * sizeof(uint16_t); // Layout in buffer: [weights | scales | biases] with alignment layout.weights_offset = 0; diff --git a/ggml/src/ggml-openvino/ggml-openvino-extra.h b/ggml/src/ggml-openvino/ggml-openvino-extra.h index fbfe459edf..e2c5a8ceea 100644 --- a/ggml/src/ggml-openvino/ggml-openvino-extra.h +++ b/ggml/src/ggml-openvino/ggml-openvino-extra.h @@ -146,6 +146,7 @@ struct ggml_openvino_extracted_layout { size_t biases_size; // Size of biases in bytes bool is_u4; // true for U4 weights, false for U8 int64_t weights_per_block;// weights per scale/bias block + bool is_symmetric; // true for symmetric quantization // Requantization info bool is_requant; // true if this tensor needs requantization diff --git a/ggml/src/ggml-openvino/ggml-quants.cpp b/ggml/src/ggml-openvino/ggml-quants.cpp index 1a5679cd8d..8946b73a56 100644 --- a/ggml/src/ggml-openvino/ggml-quants.cpp +++ b/ggml/src/ggml-openvino/ggml-quants.cpp @@ -55,9 +55,18 @@ void extract_q4_0_data(const ggml_tensor * tensor, auto * scales = scales_arr.data::value_type>(); auto * biases = biases_arr.data::value_type>(); + bool is_scalar_bias = (biases_arr.get_size() == 1); // Symmetric quantization + ov::parallel_for(scales_arr.get_size(), [&](size_t i) { scales[i] = ov::float16::from_bits(*((uint16_t *) (data + i * bytes_per_block))); - biases[i] = ov::float16(-8.f * static_cast(scales[i])); + // For symmetric quantization, only write the first bias (all blocks share the same bias relationship) + if (is_scalar_bias) { + if (i == 0) { + biases[0] = ov::float16(-8.f * static_cast(scales[0])); + } + } else { + biases[i] = ov::float16(-8.f * static_cast(scales[i])); + } unpack_32_4(data + i * bytes_per_block + 2, weights + i * 16); }); } @@ -95,10 +104,19 @@ void extract_q8_0_data(const ggml_tensor * tensor, auto * scales = scales_arr.data::value_type>(); auto * biases = biases_arr.data::value_type>(); + bool is_scalar_bias = (biases_arr.get_size() == 1); // Symmetric quantization + ov::parallel_for(scales_arr.get_size(), [&](size_t i) { uint8_t * block_data = data + i * 
bytes_per_block; scales[i] = ov::float16::from_bits(*(uint16_t *) block_data); - biases[i] = ov::float16(-128.f * static_cast(scales[i])); + // For symmetric quantization, only write the first bias (all blocks share the same bias relationship) + if (is_scalar_bias) { + if (i == 0) { + biases[0] = ov::float16(-128.f * static_cast(scales[0])); + } + } else { + biases[i] = ov::float16(-128.f * static_cast(scales[i])); + } for (size_t j = 0; j < weights_per_block; ++j) { uint8_t x = block_data[j + 2]; // j+2 to skip the scale bytes. // Original data is in int8_t, so we add a bias of -128 and invert the first bit. @@ -190,6 +208,8 @@ void extract_q6_k_data(const ggml_tensor * tensor, auto * scales = scales_arr.data::value_type>(); auto * biases = biases_arr.data::value_type>(); + bool is_scalar_bias = (biases_arr.get_size() == 1); // Symmetric quantization + ov::parallel_for(n_super_block, [&](size_t i) { uint8_t * block_data = data + i * bytes_per_block; @@ -199,7 +219,14 @@ void extract_q6_k_data(const ggml_tensor * tensor, for (size_t j = 0; j < 16; j++) { scales[j + i * 16] = ov::float16(scale_factor * static_cast(*((int8_t *) (block_data + 128 + 64 + j)))); - biases[j + i * 16] = ov::float16(-32.f * static_cast(scales[j + i * 16])); + // For symmetric quantization, only write the first bias (all blocks share the same bias relationship) + if (is_scalar_bias) { + if (i == 0 && j == 0) { + biases[0] = ov::float16(-32.f * static_cast(scales[0])); + } + } else { + biases[j + i * 16] = ov::float16(-32.f * static_cast(scales[j + i * 16])); + } } uint8_t * ql = block_data; @@ -302,15 +329,22 @@ ov::Output make_int8_weights(ov::Tensor & weight, // Expand dimensions for scales and biases auto scale_shape = scales.get_shape(); + auto bias_shape = biases.get_shape(); + bool is_scalar_bias = bias_shape.empty(); // Symmetric quantization ov::Shape packed_shape = {orig_shape[0], orig_shape[1] / group_size, group_size}; if (packed_shape[1] == 1) { + // Requantized channel-wise case packed_shape.erase(packed_shape.begin() + 1); } else { scale_shape.push_back(1); scales.set_shape(scale_shape); - biases.set_shape(scale_shape); + // For symmetric quantization, biases remain scalar (don't resize) + if (!is_scalar_bias) { + bias_shape = scale_shape; + biases.set_shape(bias_shape); + } } // Create graph nodes @@ -318,15 +352,23 @@ ov::Output make_int8_weights(ov::Tensor & weight, static_cast(weight.data()), nullptr); weights_node->get_rt_info()["__gguf_tensor_holder"] = weight; auto scales_f16 = std::make_shared(scales); - ov::Tensor biases_u8(ov::element::u8, scale_shape); + ov::Tensor biases_u8(ov::element::u8, is_scalar_bias ? 
ov::Shape{} : scale_shape); // Calculate zero point const ov::float16 * bias_data = biases.data::value_type>(); const ov::float16 * scale_data = scales.data::value_type>(); uint8_t * bias_u8_data = biases_u8.data(); - for (size_t i = 0; i < biases_u8.get_size(); ++i) { - bias_u8_data[i] = - (uint8_t) std::round(-1.f * static_cast(bias_data[i]) / static_cast(scale_data[i])); + + if (is_scalar_bias) { + // Symmetric quantization: single bias value for all blocks + // For Q8_0, bias = -128 * scale, so zero_point = 128 + bias_u8_data[0] = (uint8_t) std::round(-1.f * static_cast(bias_data[0]) / static_cast(scale_data[0])); + } else { + // Asymmetric quantization: per-block biases + for (size_t i = 0; i < biases_u8.get_size(); ++i) { + bias_u8_data[i] = + (uint8_t) std::round(-1.f * static_cast(bias_data[i]) / static_cast(scale_data[i])); + } } auto zero_point = std::make_shared(biases_u8); @@ -361,17 +403,23 @@ ov::Output make_int4_weights(ov::Tensor & weight, // Expand dimensions for scales and biases ov::Shape scale_bias_shape = scales.get_shape(); + auto bias_shape = biases.get_shape(); + bool is_scalar_bias = bias_shape.empty(); // Symmetric quantization // Create INT4 weight tensor ov::Shape packed_shape = {orig_weight_shape[0], orig_weight_shape[1] / group_size, group_size}; - // Requantized channel-wise case if (packed_shape[1] == 1) { + // Requantized channel-wise case packed_shape.erase(packed_shape.begin() + 1); } else { scale_bias_shape.push_back(1); scales.set_shape(scale_bias_shape); - biases.set_shape(scale_bias_shape); + // For symmetric quantization, biases remain scalar (don't resize) + if (!is_scalar_bias) { + bias_shape = scale_bias_shape; + biases.set_shape(bias_shape); + } } auto weights_node = std::make_shared(ov::element::u4, packed_shape, @@ -382,14 +430,23 @@ ov::Output make_int4_weights(ov::Tensor & weight, // Pack zero points: two subsequent values into one const ov::float16 * bias_data = biases.data::value_type>(); const ov::float16 * scale_data = scales.data::value_type>(); - ov::Tensor zero_point_tensor(ov::element::u4, scale_bias_shape); + ov::Tensor zero_point_tensor(ov::element::u4, is_scalar_bias ? ov::Shape{} : scale_bias_shape); uint8_t * zero_point_data = static_cast(zero_point_tensor.data()); - for (size_t i = 0; i < zero_point_tensor.get_byte_size(); ++i) { - uint8_t bias1 = - (uint8_t) std::round(-1.f * static_cast(bias_data[i * 2]) / static_cast(scale_data[i * 2])); - uint8_t bias2 = (uint8_t) std::round(-1.f * static_cast(bias_data[i * 2 + 1]) / - static_cast(scale_data[i * 2 + 1])); - zero_point_data[i] = (bias2 << 4) | (bias1 & 0x0F); + + if (is_scalar_bias) { + // Symmetric quantization: single bias value for all blocks + // For Q4_0, bias = -8 * scale, so zero_point = 8 + uint8_t zp = (uint8_t) std::round(-1.f * static_cast(bias_data[0]) / static_cast(scale_data[0])); + zero_point_data[0] = (zp << 4) | (zp & 0x0F); + } else { + // Asymmetric quantization: per-block biases + for (size_t i = 0; i < zero_point_tensor.get_byte_size(); ++i) { + uint8_t bias1 = + (uint8_t) std::round(-1.f * static_cast(bias_data[i * 2]) / static_cast(scale_data[i * 2])); + uint8_t bias2 = (uint8_t) std::round(-1.f * static_cast(bias_data[i * 2 + 1]) / + static_cast(scale_data[i * 2 + 1])); + zero_point_data[i] = (bias2 << 4) | (bias1 & 0x0F); + } } auto zero_points_node = std::make_shared(zero_point_tensor); @@ -602,17 +659,19 @@ std::shared_ptr process_weight_tensor(const ggml_tensor * tensor, cons // Requant to quantized format (Q4_0_128, Q8_0_32, etc.) 
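// (Arithmetic behind the scalar zero point above; a sketch, not patch code.
// The zero point is round(-bias / scale), and for symmetric formats the bias
// is a fixed multiple of the scale, so the result is a per-format constant:
//     Q4_0: value = scale * (q - 8)    =>  bias = -8   * scale  =>  zero_point = 8
//     Q8_0: value = scale * (q - 128)  =>  bias = -128 * scale  =>  zero_point = 128
// For u4 weights the single zero point is duplicated into both nibbles:
//     uint8_t zp = (uint8_t) std::round(-bias / scale);  // 8 for Q4_0
//     uint8_t packed = (zp << 4) | (zp & 0x0F);          // same value twice
// which is why a scalar ov::Shape{} bias can replace the per-block tensor.)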
ov::element::Type weight_type = layout.is_u4 ? ov::element::u4 : ov::element::u8; ov::Shape scale_shape = {node_shape[0], node_shape[1] / layout.weights_per_block}; + // For symmetric quantization, biases are a single value instead of per-block + ov::Shape bias_shape = layout.is_symmetric ? ov::Shape{} : scale_shape; ov::Tensor weights, scales, biases; if (output_base_ptr) { uint8_t * buf_base = static_cast(output_base_ptr); weights = ov::Tensor(weight_type, node_shape, buf_base + layout.weights_offset); scales = ov::Tensor(ov::element::f16, scale_shape, buf_base + layout.scales_offset); - biases = ov::Tensor(ov::element::f16, scale_shape, buf_base + layout.biases_offset); + biases = ov::Tensor(ov::element::f16, bias_shape, buf_base + layout.biases_offset); } else { weights = ov::Tensor(weight_type, node_shape); scales = ov::Tensor(ov::element::f16, scale_shape); - biases = ov::Tensor(ov::element::f16, scale_shape); + biases = ov::Tensor(ov::element::f16, bias_shape); } result = requantize_to_buffers(tensor, data, layout.requant_type.value(), layout.weights_per_block, weights, @@ -622,17 +681,19 @@ std::shared_ptr process_weight_tensor(const ggml_tensor * tensor, cons // Normal extraction path (no requant) ov::element::Type weight_type = layout.is_u4 ? ov::element::u4 : ov::element::u8; ov::Shape scale_shape = {node_shape[0], node_shape[1] / layout.weights_per_block}; + // For symmetric quantization, biases are a single value instead of per-block + ov::Shape bias_shape = layout.is_symmetric ? ov::Shape{} : scale_shape; ov::Tensor weights, scales, biases; if (output_base_ptr) { uint8_t * buf_base = static_cast(output_base_ptr); weights = ov::Tensor(weight_type, node_shape, buf_base + layout.weights_offset); scales = ov::Tensor(ov::element::f16, scale_shape, buf_base + layout.scales_offset); - biases = ov::Tensor(ov::element::f16, scale_shape, buf_base + layout.biases_offset); + biases = ov::Tensor(ov::element::f16, bias_shape, buf_base + layout.biases_offset); } else { weights = ov::Tensor(weight_type, node_shape); scales = ov::Tensor(ov::element::f16, scale_shape); - biases = ov::Tensor(ov::element::f16, scale_shape); + biases = ov::Tensor(ov::element::f16, bias_shape); } result = extract_quantized_weights(tensor, data, weights, scales, biases); @@ -653,6 +714,8 @@ void quantize_q4_0(const float * x, auto * weights = static_cast(weights_arr.data()); auto * scales = scales_arr.data::value_type>(); auto * biases = biases_arr.data::value_type>(); + bool is_scalar_bias = (biases_arr.get_size() == 1); // Symmetric quantization + for (int i = 0; i < nb; i++) { float amax = 0.0f; // absolute max float max = 0.0f; @@ -669,7 +732,13 @@ void quantize_q4_0(const float * x, if (d == 0) { scales[i] = ov::float16(1.0f); - biases[i] = ov::float16(-8.0f); + if (is_scalar_bias) { + if (i == 0) { + biases[0] = ov::float16(-8.0f); + } + } else { + biases[i] = ov::float16(-8.0f); + } uint8_t zp = 8; memset(weights + i * qk / 2, zp | (zp << 4), qk / 2); continue; @@ -677,7 +746,14 @@ void quantize_q4_0(const float * x, const float id = 1.0f / d; scales[i] = ov::float16(d); - biases[i] = ov::float16(-8.f * d); + // For symmetric quantization, only write the first bias (all blocks share the same bias relationship) + if (is_scalar_bias) { + if (i == 0) { + biases[0] = ov::float16(-8.f * d); + } + } else { + biases[i] = ov::float16(-8.f * d); + } for (int j = 0; j < qk / 2; ++j) { const float x0 = x[i * qk + 2 * j] * id; @@ -701,6 +777,8 @@ void quantize_q8_0(const float * x, auto * weights = 
static_cast(weights_arr.data()); auto * scales = scales_arr.data::value_type>(); auto * biases = biases_arr.data::value_type>(); + bool is_scalar_bias = (biases_arr.get_size() == 1); // Symmetric quantization + for (int i = 0; i < nb; i++) { float amax = 0.0f; // absolute max @@ -714,7 +792,14 @@ void quantize_q8_0(const float * x, const float d = amax / 127.0f; const float id = d ? 1.0f / d : 0.0f; scales[i] = ov::float16(d); - biases[i] = ov::float16(-128.0f * d); + // For symmetric quantization, only write the first bias (all blocks share the same bias relationship) + if (is_scalar_bias) { + if (i == 0) { + biases[0] = ov::float16(-128.0f * d); + } + } else { + biases[i] = ov::float16(-128.0f * d); + } for (int j = 0; j < qk; ++j) { const float x0 = x[i * qk + j] * id; From 4e451778d32e4093e148f9ec38221ee29e6b28cd Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Mon, 29 Dec 2025 15:27:50 +0800 Subject: [PATCH 216/254] Use Q8_0_C in token embd, lm_head, and for 5 and 6 bits quant --- ggml/src/ggml-openvino/ggml-openvino-extra.cpp | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-openvino-extra.cpp b/ggml/src/ggml-openvino/ggml-openvino-extra.cpp index 2f24d7a1db..35d3d93cfd 100644 --- a/ggml/src/ggml-openvino/ggml-openvino-extra.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino-extra.cpp @@ -164,23 +164,19 @@ clEnqueueMemcpyINTEL_fn ggml_openvino_get_clEnqueueMemcpyINTEL() { // Get requantization type for a tensor type (returns nullopt if no requant needed) std::optional ggml_openvino_get_requant_type(const ggml_tensor * tensor) { - if (!ggml_openvino_is_npu()) { - return std::nullopt; - } - // NPU requantization rules if (strncmp(tensor->name, "token_embd.weight", 17) == 0) { - return ExtraQuantType::F16; + return ExtraQuantType::Q8_0_C; } if (strncmp(tensor->name, "output.weight", 13) == 0) { + return ExtraQuantType::Q8_0_C; + } + if (ggml_openvino_is_npu()) { return ExtraQuantType::Q4_0_128; } switch (tensor->type) { - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_Q4_K: case GGML_TYPE_Q6_K: case GGML_TYPE_Q5_K: - return ExtraQuantType::Q4_0_128; + return ExtraQuantType::Q8_0_C; default: return std::nullopt; } From f5c71e3cf4156cdc33a1ffb6b2ad6d3c2285af7d Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Tue, 30 Dec 2025 10:51:40 +0800 Subject: [PATCH 217/254] Update build.md --- docs/build.md | 42 +++++++++--------------------------------- 1 file changed, 9 insertions(+), 33 deletions(-) diff --git a/docs/build.md b/docs/build.md index 05cf22e01f..c52c27c295 100644 --- a/docs/build.md +++ b/docs/build.md @@ -768,22 +768,16 @@ git switch dev_backend_openvino - **Linux:** ```bash - # Build with OpenVINO support source /opt/intel/openvino/setupvars.sh cmake -B build/ReleaseOV -G Ninja -DCMAKE_BUILD_TYPE=Release -DGGML_OPENVINO=ON -DGGML_CPU_REPACK=OFF - cmake --build build/ReleaseOV --config Release -j $(nproc) + cmake --build build/ReleaseOV --parallel ``` - **Windows:** ```bash - # Build with OpenVINO support "C:\Program Files (x86)\Intel\openvino_2025.3.0\setupvars.bat" - cmake -B build\ReleaseOV -DCMAKE_BUILD_TYPE=Release -DGGML_OPENVINO=ON -DGGML_CPU_REPACK=OFF -DLLAMA_CURL=OFF -DCMAKE_TOOLCHAIN_FILE=C:\vcpkg\scripts\buildsystems\vcpkg.cmake - cmake --build build\ReleaseOV --config Release - ``` - - For faster compilation, add the -- /m argument to run multiple jobs in parallel with as many CPU cores available. 
- ```bash - cmake --build build\ReleaseOV --config Release -- /m + cmake -B build\ReleaseOV -G Ninja -DCMAKE_BUILD_TYPE=Release -DGGML_OPENVINO=ON -DGGML_CPU_REPACK=OFF -DLLAMA_CURL=OFF -DCMAKE_TOOLCHAIN_FILE=C:\vcpkg\scripts\buildsystems\vcpkg.cmake + cmake --build build\ReleaseOV --parallel ``` ### 3. Download Sample Model @@ -791,16 +785,9 @@ git switch dev_backend_openvino Download models for testing: ```bash -# Create models directory mkdir -p ~/models/ - -# Download model file: Llama-3.2-1B-Instruct.fp16.gguf -wget https://huggingface.co/MaziyarPanahi/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct.fp16.gguf \ - -O ~/models/Llama-3.2-1B-Instruct.fp16.gguf - -# Download model file: Phi-3-mini-4k-instruct-fp16.gguf -wget https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf/resolve/main/Phi-3-mini-4k-instruct-fp16.gguf \ - -O ~/models/Phi-3-mini-4k-instruct-fp16.gguf +wget https://huggingface.co/unsloth/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct-Q4_0.gguf \ + -O ~/models/Llama-3.2-1B-Instruct-Q4_0.gguf ``` ### 4. Run inference with OpenVINO backend: @@ -808,20 +795,14 @@ wget https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf/resolve/main/P When using the OpenVINO backend, the first inference token may have slightly higher latency due to on-the-fly conversion to the OpenVINO graph. Subsequent tokens and runs will be faster. ```bash -export GGML_OPENVINO_CACHE_DIR=/tmp/ov_cache -# Default device is GPU. -# If not set, automatically selects the first available device in priority order: GPU, CPU, NPU. +# If device is unset or unavailable, default to CPU. export GGML_OPENVINO_DEVICE=GPU - -./build/ReleaseOV/bin/llama-simple -m ~/models/Llama-3.2-1B-Instruct.fp16.gguf -n 50 "The story of AI is " - +./build/ReleaseOV/bin/llama-simple -m ~/models/Llama-3.2-1B-Instruct-Q4_0.gguf -n 50 "The story of AI is " ``` To run in chat mode: ```bash -export GGML_OPENVINO_CACHE_DIR=/tmp/ov_cache -./build/ReleaseOV/bin/llama-cli -m ~/models/Llama-3.2-1B-Instruct.fp16.gguf -n 50 "The story of AI is " - +./build/ReleaseOV/bin/llama-cli -m ~/models/Llama-3.2-1B-Instruct-Q4_0.gguf ``` ### Configuration Options @@ -833,16 +814,11 @@ Control OpenVINO behavior using these environment variables: - **`GGML_OPENVINO_PROFILING`**: Enable execution time profiling. - **`GGML_OPENVINO_DUMP_CGRAPH`**: Save compute graph to `cgraph.txt`. - **`GGML_OPENVINO_DUMP_IR`**: Export OpenVINO IR files with timestamps. -- **`GGML_OPENVINO_DEBUG_INPUT`**: Enable input debugging. -- **`GGML_OPENVINO_DEBUG_OUTPUT`**: Enable output debugging. 
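As a rough illustration of how the backend picks these variables up at runtime (hypothetical helper; the real code reads `GGML_OPENVINO_DEVICE` via `getenv` and falls back to CPU as described above):

```cpp
#include <cstdlib>
#include <string>

// Hypothetical sketch: resolve the OpenVINO device from the environment,
// defaulting to CPU when GGML_OPENVINO_DEVICE is unset or empty.
static std::string resolve_openvino_device() {
    const char * env = std::getenv("GGML_OPENVINO_DEVICE");
    return (env != nullptr && env[0] != '\0') ? std::string(env) : std::string("CPU");
}
```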
### Example with Profiling ```bash -export GGML_OPENVINO_CACHE_DIR=/tmp/ov_cache -export GGML_OPENVINO_PROFILING=1 - -GGML_OPENVINO_DEVICE=GPU ./build/ReleaseOV/bin/llama-simple -m ~/models/Llama-3.2-1B-Instruct.fp16.gguf -n 50 "The story of AI is " +GGML_OPENVINO_PROFILING=1 GGML_OPENVINO_DEVICE=GPU ./build/ReleaseOV/bin/llama-simple -m ~/models/Llama-3.2-1B-Instruct-Q4_0.gguf -n 50 "The story of AI is " ``` ### Docker build Llama.cpp with OpenVINO Backend From 0d6f253e489777474ec87ba7f928b9d3a79a30aa Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Wed, 7 Jan 2026 16:56:30 +0800 Subject: [PATCH 218/254] Support -ctk f32 --- ggml/src/ggml-openvino/ggml-decoder.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 13ef00dcb6..51fb433410 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -296,6 +296,9 @@ std::pair GgmlOvDecoder::compute_llm_params(ggml_cgr std::string name = std::string(node->name); if (node->op == GGML_OP_FLASH_ATTN_EXT) { auto * cache_k_perm = node->src[1]; + if (cache_k_perm->op == GGML_OP_CPY) { + cache_k_perm = cache_k_perm->src[0]; + } assert(cache_k_perm->op == GGML_OP_PERMUTE); auto * cache_k_view = cache_k_perm->src[0]; assert(cache_k_view->op == GGML_OP_VIEW); From 5f30eacdb4452c22feed10dc177e248a60341bd6 Mon Sep 17 00:00:00 2001 From: Mustafa Cavus Date: Wed, 7 Jan 2026 16:05:02 -0800 Subject: [PATCH 219/254] Initial stateful graph support --- ggml/src/ggml-openvino/ggml-decoder.cpp | 53 ++++++++++++++----- ggml/src/ggml-openvino/ggml-decoder.h | 8 ++- ggml/src/ggml-openvino/openvino/decoder.hpp | 5 +- .../ggml-openvino/openvino/node_context.hpp | 3 ++ .../ggml-openvino/openvino/op/get_rows.cpp | 7 ++- .../src/ggml-openvino/openvino/op/permute.cpp | 2 +- .../src/ggml-openvino/openvino/op/reshape.cpp | 22 +++++--- ggml/src/ggml-openvino/openvino/op/rope.cpp | 42 +++++++++++---- .../ggml-openvino/openvino/op/set_rows.cpp | 12 ++++- .../openvino/translate_session.cpp | 25 +++++++-- ggml/src/ggml-openvino/openvino/utils.cpp | 37 ++++++++++--- ggml/src/ggml-openvino/openvino/utils.hpp | 3 +- ggml/src/ggml-openvino/utils.cpp | 34 +++++++++--- ggml/src/ggml-openvino/utils.h | 2 +- 14 files changed, 197 insertions(+), 58 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 51fb433410..7c72c1fb34 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -44,9 +44,11 @@ GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph, ComputeParams & compute_params, std::map> & model_weights, bool is_static, + bool is_stateful, bool is_prefill, int prefill_chunk_size) : m_is_static(is_static), + m_is_stateful(is_stateful), m_is_prefill(is_prefill), m_prefill_chunk_size(prefill_chunk_size), m_cgraph(cgraph), @@ -157,19 +159,40 @@ void GgmlOvDecoder::set_input_output(ggml_tensor * node, bool naive) { ggml_backend_buffer * buffer = src->buffer; if (buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY || src->flags & GGML_TENSOR_FLAG_INPUT) { + ov::PartialShape stateful_kv_shape; // GGML_BACKEND_BUFFER_USAGE_ANY are kv caches if (buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY) { assert(src_name.find("cache_k") == 0 || src_name.find("cache_v") == 0); + if (auto it = std::find(m_model_params.kv_names.begin(), m_model_params.kv_names.end(), src_name); it == m_model_params.kv_names.end()) { + m_model_params.kv_names.push_back(src_name); + if (is_stateful()) { 
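// (Sketch of where the kv_names collected here end up; the call site is
// outside this hunk, and the pass is an assumption based on the map type.
// get_kv_param_res_names() returns a param->result name map with exactly the
// std::map<std::string, std::string> shape accepted by OpenVINO's
// MakeStateful pass, so the stateful conversion is presumably along these lines:
//     ov::pass::Manager manager;
//     manager.register_pass<ov::pass::MakeStateful>(decoder.get_kv_param_res_names());
//     manager.run_passes(model);
// MakeStateful replaces each matched Parameter/Result pair with
// ReadValue/Assign ops, turning cache_k/cache_v into internal model state
// instead of explicit graph inputs and outputs.)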
+                    // TODO: The shape modification for the stateful model below has not been validated for all supported
+                    // models yet. A more generic solution might be needed to enable additional cases. Ideally, this could
+                    // be removed from the decoder and done as part of a transformation later.
+                    auto stateless_kv_shape = get_graph_input_shape(node, src);
+                    assert(stateless_kv_shape.size() == 4 && stateless_kv_shape[0] == 1 && stateless_kv_shape[1] == 1
+                           && stateless_kv_shape[2].is_dynamic() && stateless_kv_shape[3] == (m_model_params.n_heads_kv * m_model_params.head_size));
+                    stateful_kv_shape = {stateless_kv_shape[0], ov::Dimension::dynamic(), m_model_params.n_heads_kv, m_model_params.head_size};
+                }
+            }
        }
        if (m_model_inputs.find(src_name) != m_model_inputs.end()) {
            continue;
        }
        m_inputs[src_name] = src;
-        auto param_node =
-            std::make_shared<ov::op::v0::Parameter>(get_ov_type(src), get_graph_input_shape(node, src));
-        param_node->set_friendly_name(src_name);
-        param_node->output(0).get_tensor().set_names({src_name});
-        m_model_inputs[src_name] = param_node;
+        assert(stateful_kv_shape.rank().is_static());
+        if (stateful_kv_shape.rank().get_length() != 0) {
+            auto param_node =
+                std::make_shared<ov::op::v0::Parameter>(get_ov_type(src), stateful_kv_shape);
+            param_node->set_friendly_name(src_name);
+            param_node->output(0).get_tensor().set_names({src_name});
+            m_model_inputs[src_name] = param_node;
+        } else {
+            auto param_node =
+                std::make_shared<ov::op::v0::Parameter>(get_ov_type(src), get_graph_input_shape(node, src));
+            param_node->set_friendly_name(src_name);
+            param_node->output(0).get_tensor().set_names({src_name});
+            m_model_inputs[src_name] = param_node;
+        }
    }
}
}
@@ -378,6 +401,8 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor * op, co
} else if (name.find("KQ_mask") == 0) {
    if (m_is_static) {
        input_shape = ov::PartialShape{1, 1, m_is_prefill ?
m_prefill_chunk_size : 1, m_model_params.ctx}; + } else if (m_is_stateful) { + input_shape = ov::PartialShape{1, 1, -1, -1}; } else { input_shape = ov::PartialShape{-1, 1, -1, -1}; } @@ -465,15 +490,15 @@ const ggml_tensor * GgmlOvDecoder::get_tensor_from_name(const std::string & name return nullptr; } -// std::map GgmlOvDecoder::get_kv_param_res_names() const { -// std::map kv_param_res_names; -// for (const auto & name : m_model_params.kv_names) { -// if (name.find("cache_k") == 0 || name.find("cache_v") == 0) { -// kv_param_res_names[name] = name; -// } -// } -// return kv_param_res_names; -// } +std::map GgmlOvDecoder::get_kv_param_res_names() const { + std::map kv_param_res_names; + for (const auto & name : m_model_params.kv_names) { + if (name.find("cache_k") == 0 || name.find("cache_v") == 0) { + kv_param_res_names[name] = name; + } + } + return kv_param_res_names; +} std::map> GgmlOvDecoder::create_weight_nodes(ggml_cgraph * cgraph) { std::map> model_weights; diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index 0b302b9320..4afec272e1 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -23,7 +23,7 @@ struct ModelParams { int32_t * rope_params = nullptr; std::vector swa_layers; - // std::vector kv_names; + std::vector kv_names; bool operator==(const ModelParams & other) const { return n_seq == other.n_seq && n_heads == other.n_heads && n_heads_kv == other.n_heads_kv && @@ -66,6 +66,7 @@ public: ComputeParams & compute_params, std::map> & model_weights, bool is_static, + bool is_stateful = false, bool is_prefill = false, int prefill_chunk_size = 256); @@ -171,10 +172,12 @@ public: virtual int32_t * get_rope_params() const override { return m_model_params.rope_params; } - // virtual std::map get_kv_param_res_names() const override; + virtual std::map get_kv_param_res_names() const override; virtual bool is_static() const override { return m_is_static; } + virtual bool is_stateful() const override { return m_is_stateful; } + ov::PartialShape get_graph_input_shape(const ggml_tensor * op, const ggml_tensor * input) const; static void dump_cgraph(const ggml_cgraph * cgraph, std::string & filename); @@ -200,6 +203,7 @@ public: void set_compute_params(const ComputeParams & compute_params) { m_compute_params = compute_params; } bool m_is_static = false; + bool m_is_stateful = false; bool m_is_prefill = false; int m_prefill_chunk_size = 0; diff --git a/ggml/src/ggml-openvino/openvino/decoder.hpp b/ggml/src/ggml-openvino/openvino/decoder.hpp index 1603c7fd20..3b8da2be5d 100644 --- a/ggml/src/ggml-openvino/openvino/decoder.hpp +++ b/ggml/src/ggml-openvino/openvino/decoder.hpp @@ -59,10 +59,13 @@ public: virtual std::vector get_model_output_names() const = 0; virtual int32_t* get_rope_params() const = 0; - // virtual std::map get_kv_param_res_names() const = 0; + + virtual std::map get_kv_param_res_names() const = 0; virtual bool is_static() const = 0; + virtual bool is_stateful() const = 0; + virtual int is_swa_layer(int layer) const = 0; }; diff --git a/ggml/src/ggml-openvino/openvino/node_context.hpp b/ggml/src/ggml-openvino/openvino/node_context.hpp index a0666b21ac..235adcc784 100644 --- a/ggml/src/ggml-openvino/openvino/node_context.hpp +++ b/ggml/src/ggml-openvino/openvino/node_context.hpp @@ -91,8 +91,11 @@ public: int get_op_case() const { return m_decoder->get_op_case(m_node_idx); } + bool is_static() const { return m_decoder->is_static(); } + bool is_stateful() const { return 
m_decoder->is_stateful(); } + private: std::shared_ptr m_decoder; std::shared_ptr& m_tensor_map; diff --git a/ggml/src/ggml-openvino/openvino/op/get_rows.cpp b/ggml/src/ggml-openvino/openvino/op/get_rows.cpp index dc8454a199..d6e7a35534 100644 --- a/ggml/src/ggml-openvino/openvino/op/get_rows.cpp +++ b/ggml/src/ggml-openvino/openvino/op/get_rows.cpp @@ -37,6 +37,9 @@ OutputVector translate_get_rows(const NodeContext & context) { auto axis = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{}, {1}); data = std::make_shared(data, ov::op::v0::Constant::create(ov::element::i64, {1}, {0})); res = std::make_shared(data, indices, axis, 1); + } else if (context.is_stateful() && data.get_partial_shape().rank() == 3) { + auto axis = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{}, {1}); + res = std::make_shared(data, indices, axis, 1); } else { auto axis = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{}, {0}); res = std::make_shared(data, indices, axis); @@ -45,7 +48,9 @@ OutputVector translate_get_rows(const NodeContext & context) { if (res.get_element_type() != context.get_output_type()) { res = std::make_shared(res, context.get_output_type()); } - res = std::make_shared(res, ov::op::v0::Constant::create(ov::element::i64, {1}, {0})); + if (!(context.is_stateful())) { + res = std::make_shared(res, ov::op::v0::Constant::create(ov::element::i64, {1}, {0})); + } return rename_outputs_with_suffix({res}, context.get_name()); } diff --git a/ggml/src/ggml-openvino/openvino/op/permute.cpp b/ggml/src/ggml-openvino/openvino/op/permute.cpp index bfe09a2b84..fa7ab0c43f 100644 --- a/ggml/src/ggml-openvino/openvino/op/permute.cpp +++ b/ggml/src/ggml-openvino/openvino/op/permute.cpp @@ -29,7 +29,7 @@ OutputVector translate_permute(const NodeContext & context) { auto src = context.get_input(0); auto perm = ov::op::v0::Constant::create(ov::element::i64, {4}, {0, 2, 1, 3}); - if (op_case == 1) { + if (op_case == 1 || context.is_stateful()) { res = std::make_shared(src, perm); } else if (op_case == 4) { auto output_shape = context.get_output_shape().to_shape(); diff --git a/ggml/src/ggml-openvino/openvino/op/reshape.cpp b/ggml/src/ggml-openvino/openvino/op/reshape.cpp index e26a8c778c..7eebd7b7b1 100644 --- a/ggml/src/ggml-openvino/openvino/op/reshape.cpp +++ b/ggml/src/ggml-openvino/openvino/op/reshape.cpp @@ -32,10 +32,15 @@ OutputVector translate_reshape(const NodeContext & context) { auto output_shape = context.get_output_shape().to_shape(); std::shared_ptr new_shape_node; if (op_case == 1) { - new_shape_node = ov::op::v0::Constant::create( - ov::element::i64, {4}, - std::vector{(int64_t) output_shape[0], -1, (int64_t) output_shape[2], (int64_t) output_shape[3]}); - + if (context.is_stateful()) { + new_shape_node = ov::op::v0::Constant::create( + ov::element::i64, {3}, + std::vector{-1, (int64_t) output_shape[2], (int64_t) output_shape[3]}); + } else { + new_shape_node = ov::op::v0::Constant::create( + ov::element::i64, {4}, + std::vector{(int64_t) output_shape[0], -1, (int64_t) output_shape[2], (int64_t) output_shape[3]}); + } } else if (op_case == 2) { new_shape_node = ov::op::v0::Constant::create( ov::element::i64, {4}, @@ -50,8 +55,13 @@ OutputVector translate_reshape(const NodeContext & context) { return {context.get_input(0).get_node_shared_ptr()->input_value(0)}; } else if (op_case == 5) { - std::vector shape_vec = {1, 1, -1, (int64_t) context.get_output_shape().to_shape()[3]}; - new_shape_node = ov::op::v0::Constant::create(ov::element::i64, {4}, shape_vec); + if 
(context.is_stateful()) { + std::vector shape_vec = {1, -1, (int64_t) context.get_output_shape().to_shape()[3]}; + new_shape_node = ov::op::v0::Constant::create(ov::element::i64, {3}, shape_vec); + } else { + std::vector shape_vec = {1, 1, -1, (int64_t) context.get_output_shape().to_shape()[3]}; + new_shape_node = ov::op::v0::Constant::create(ov::element::i64, {4}, shape_vec); + } // // Alternative // auto token_len = context.get_input("token_len"); diff --git a/ggml/src/ggml-openvino/openvino/op/rope.cpp b/ggml/src/ggml-openvino/openvino/op/rope.cpp index 96fbb6b795..b72e445706 100644 --- a/ggml/src/ggml-openvino/openvino/op/rope.cpp +++ b/ggml/src/ggml-openvino/openvino/op/rope.cpp @@ -54,9 +54,18 @@ OutputVector translate_rope(const NodeContext & context) { // The input comes from a VIEW int slice_len = output_shape[2] * output_shape[3]; data_node = process_view_input(context, 0, slice_len).get_node_shared_ptr(); - auto data_shape = ov::op::v0::Constant::create( - ov::element::i64, {4}, std::vector{1, -1, (int64_t) output_shape[2], (int64_t) output_shape[3]}); - data_node = std::make_shared(data_node, data_shape, false); + if (context.is_stateful()) { + auto data_shape = ov::op::v0::Constant::create( + ov::element::i64, {3}, std::vector{-1, (int64_t) output_shape[2], (int64_t) output_shape[3]}); + data_node = std::make_shared(data_node, data_shape, false); + } else { + auto data_shape = ov::op::v0::Constant::create( + ov::element::i64, {4}, std::vector{1, -1, (int64_t) output_shape[2], (int64_t) output_shape[3]}); + data_node = std::make_shared(data_node, data_shape, false); + } + //auto data_shape = ov::op::v0::Constant::create( + // ov::element::i64, {4}, std::vector{1, -1, (int64_t) output_shape[2], (int64_t) output_shape[3]}); + //data_node = std::make_shared(data_node, data_shape, false); } const int mode = op_params[2]; @@ -67,10 +76,19 @@ OutputVector translate_rope(const NodeContext & context) { auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); auto two = ov::op::v0::Constant::create(ov::element::i64, {1}, {2}); - auto three = ov::op::v0::Constant::create(ov::element::i64, {1}, {3}); auto end = ov::op::v0::Constant::create(ov::element::i64, {1}, {output_shape[3]}); - auto even_slice = std::make_shared(data_node, zero, end, two, three); - auto odd_slice = std::make_shared(data_node, one, end, two, three); + Output even_slice; + Output odd_slice; + int32_t unsqueeze_dim = 4; + if (context.is_stateful()) { + unsqueeze_dim = 3; + even_slice = std::make_shared(data_node, zero, end, two, two); + odd_slice = std::make_shared(data_node, one, end, two, two); + } else { + auto three = ov::op::v0::Constant::create(ov::element::i64, {1}, {3}); + even_slice = std::make_shared(data_node, zero, end, two, three); + odd_slice = std::make_shared(data_node, one, end, two, three); + } Output first_half = std::make_shared(std::make_shared(even_slice, cos_theta_node), @@ -80,10 +98,10 @@ OutputVector translate_rope(const NodeContext & context) { std::make_shared(odd_slice, cos_theta_node)); first_half = std::make_shared(first_half, - ov::op::v0::Constant::create(ov::element::i64, {1}, {4})); + ov::op::v0::Constant::create(ov::element::i64, {1}, {unsqueeze_dim})); second_half = std::make_shared(second_half, - ov::op::v0::Constant::create(ov::element::i64, {1}, {4})); - auto stack = std::make_shared(OutputVector{first_half, second_half}, 4); + ov::op::v0::Constant::create(ov::element::i64, {1}, {unsqueeze_dim})); + 
auto stack = std::make_shared(OutputVector{first_half, second_half}, unsqueeze_dim); auto data_shape = ov::op::v0::Constant::create( ov::element::i64, {4}, std::vector{1, -1, (int64_t) output_shape[2], (int64_t) output_shape[3]}); @@ -102,7 +120,11 @@ OutputVector translate_rope(const NodeContext & context) { std::make_shared(slice_data_node_0, sin_theta_node), std::make_shared(slice_data_node_1, cos_theta_node)); - res = std::make_shared(ov::OutputVector{first_half_node, second_half_node}, 3); + int32_t concat_dim = 3; + if (context.is_stateful()) { + concat_dim = 2; + } + res = std::make_shared(ov::OutputVector{first_half_node, second_half_node}, concat_dim); } return rename_outputs_with_suffix({res}, context.get_name()); diff --git a/ggml/src/ggml-openvino/openvino/op/set_rows.cpp b/ggml/src/ggml-openvino/openvino/op/set_rows.cpp index 4ceb55589e..69c4ca7089 100644 --- a/ggml/src/ggml-openvino/openvino/op/set_rows.cpp +++ b/ggml/src/ggml-openvino/openvino/op/set_rows.cpp @@ -45,7 +45,17 @@ OutputVector translate_set_rows(const NodeContext & context) { false); auto axes = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {2}); - Output res = std::make_shared(dst, ind_squeezed, data_reshaped, axes); + Output res; + if (context.is_stateful()) { + int concat_axis = 1; + int64_t dim2 = dst.get_partial_shape()[2].get_length(); + int64_t dim3 = dst.get_partial_shape()[3].get_length(); + data = std::make_shared( + data, ov::op::v0::Constant::create(ov::element::i64, {4}, {(int64_t) 1, (int64_t) -1, dim2, dim3}), false); + res = std::make_shared(OutputVector{dst, data}, concat_axis); + } else { + res = std::make_shared(dst, ind_squeezed, data_reshaped, axes); + } if (auto dst_reshape = std::dynamic_pointer_cast(dst.get_node_shared_ptr())) { // Fix the case of multiple sequences, reshape back to original shape [1, n_seq, ctx_per_seq, emb] diff --git a/ggml/src/ggml-openvino/openvino/translate_session.cpp b/ggml/src/ggml-openvino/openvino/translate_session.cpp index ccd0947a2b..02e08c24f4 100644 --- a/ggml/src/ggml-openvino/openvino/translate_session.cpp +++ b/ggml/src/ggml-openvino/openvino/translate_session.cpp @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -82,6 +83,20 @@ void add_sliced_mask(TensorMap & tensor_map, GgmlDecoder & ggml_model_decoder) { std::shared_ptr mask_sliced; if (is_static) { mask_sliced = mask; + } else if (ggml_model_decoder.is_stateful()) { + auto zero_2d = ov::op::v0::Constant::create(ov::element::i64, {2}, {0,0}); + auto one_2d = ov::op::v0::Constant::create(ov::element::i64, {2}, {1,1}); + auto zero_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); + auto two_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {2}); + auto axes = ov::op::v0::Constant::create(ov::element::i64, {2}, {-2,-1}); + auto inp_pos = tensor_map.at("inp_pos").get_node_shared_ptr(); + auto shape_of_inp_pos = std::make_shared(inp_pos); + auto gather_inp_pos = std::make_shared(shape_of_inp_pos, two_1d, zero_1d); + auto stop = std::make_shared(ov::OutputVector{token_len_per_seq, gather_inp_pos}, 0); + mask_sliced = + std::make_shared(mask, zero_2d, stop, one_2d, axes); + mask_sliced = std::make_shared(mask_sliced, ov::element::f16); + mask_sliced->set_friendly_name(sliced_name); } else { auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); @@ -226,11 +241,11 @@ std::shared_ptr TranslateSession::apply_transformations(std::shared_ptr(); - // if 
(!ggml_model_decoder->is_static()) { - // const auto kv_param_res_names = ggml_model_decoder->get_kv_param_res_names(); - // const auto kv_param_res_pairs = get_kv_param_res_pairs(model, kv_param_res_names); - // manager.register_pass(kv_param_res_pairs); - // } + if (ggml_model_decoder->is_stateful()) { + const auto kv_param_res_names = ggml_model_decoder->get_kv_param_res_names(); + const auto kv_param_res_pairs = get_kv_param_res_pairs(model, kv_param_res_names); + manager.register_pass(kv_param_res_pairs); + } if (ggml_model_decoder->is_static()) { manager.register_pass(); diff --git a/ggml/src/ggml-openvino/openvino/utils.cpp b/ggml/src/ggml-openvino/openvino/utils.cpp index bdda30fa6d..b7553f99c8 100644 --- a/ggml/src/ggml-openvino/openvino/utils.cpp +++ b/ggml/src/ggml-openvino/openvino/utils.cpp @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -113,11 +114,20 @@ void ggml_rope_yarn_corr_dims(int n_dims, std::pair, ov::Output> make_sin_cos(int32_t * rope_params, std::shared_ptr inp_pos, - std::shared_ptr rope_freqs_weight) { - inp_pos = std::make_shared(inp_pos, ov::element::f32); - auto pos_perm = - std::make_shared(ov::element::i64, ov::Shape{4}, std::vector{0, 3, 1, 2}); - inp_pos = std::make_shared(inp_pos, pos_perm); + std::shared_ptr rope_freqs_weight, + bool stateful) { + if (stateful) { + inp_pos = std::make_shared(inp_pos, ov::op::v0::Constant::create(ov::element::i64, {1}, {0})); + inp_pos = std::make_shared(inp_pos, ov::element::f32); + auto pos_perm = + std::make_shared(ov::element::i64, ov::Shape{3}, std::vector{2, 1, 0}); + inp_pos = std::make_shared(inp_pos, pos_perm); + } else { + inp_pos = std::make_shared(inp_pos, ov::element::f32); + auto pos_perm = + std::make_shared(ov::element::i64, ov::Shape{4}, std::vector{0, 3, 1, 2}); + inp_pos = std::make_shared(inp_pos, pos_perm); + } float freq_base; float freq_scale; @@ -145,8 +155,14 @@ std::pair, ov::Output> make_sin_cos(int32_t * rope_params factor[i] = theta_scale * factor[i - 1]; } - Output freq_factors = - std::make_shared(ov::element::f32, ov::Shape{1, 1, 1, factor.size()}, factor); + Output freq_factors; + if (stateful) { + freq_factors = + std::make_shared(ov::element::f32, ov::Shape{1, 1, factor.size()}, factor); + } else { + freq_factors = + std::make_shared(ov::element::f32, ov::Shape{1, 1, 1, factor.size()}, factor); + } if (rope_freqs_weight) { freq_factors = std::make_shared(freq_factors, rope_freqs_weight); } @@ -161,7 +177,12 @@ std::pair, ov::Output> make_sin_cos(int32_t * rope_params theta = theta_interp; } else { auto ramp_mix = rope_yarn_ramp_mix(n_dims, corr_dims, ext_factor); - auto one = ov::op::v0::Constant::create(ov::element::f32, Shape{1, 1, 1, 1}, {1.0f}); + Output one; + if (stateful) { + one = ov::op::v0::Constant::create(ov::element::f32, Shape{1, 1, 1}, {1.0f}); + } else { + one = ov::op::v0::Constant::create(ov::element::f32, Shape{1, 1, 1, 1}, {1.0f}); + } auto one_minus_ramp = std::make_shared(one, ramp_mix); theta = std::make_shared(std::make_shared(theta_interp, one_minus_ramp), diff --git a/ggml/src/ggml-openvino/openvino/utils.hpp b/ggml/src/ggml-openvino/openvino/utils.hpp index 6c6d2ae8d4..4ffe37ada6 100644 --- a/ggml/src/ggml-openvino/openvino/utils.hpp +++ b/ggml/src/ggml-openvino/openvino/utils.hpp @@ -66,7 +66,8 @@ OutputVector rename_outputs_with_suffix(const OutputVector& outputs, const std:: std::pair, ov::Output> make_sin_cos(int32_t* rope_params, std::shared_ptr inp_pos, - std::shared_ptr rope_freqs_weight = nullptr); + 
std::shared_ptr rope_freqs_weight = nullptr, + bool stateful = false); ov::Output process_view_input(const NodeContext& context, int input_index, int slice_len = 0); diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 89cf51f880..ff94c4acfe 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -46,10 +46,14 @@ enum ggml_status ov_graph_compute(ggml_cgraph * cgraph) { // Use device from singleton (initialized during backend init) const auto & device = ggml_openvino_get_device_name(); const auto is_static = ggml_openvino_is_npu(); - return is_static ? ov_graph_compute_static(cgraph) : ov_graph_compute_dynamic(cgraph, device); + bool stateful = false; + if (getenv("GGML_OPENVINO_STATEFUL_EXECUTION") && !is_static) { + stateful = true; + } + return is_static ? ov_graph_compute_static(cgraph) : ov_graph_compute_dynamic(cgraph, device, stateful); } -enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, const std::string & device) { +enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, const std::string & device, bool stateful) { auto & core = ov_singleton_core(); const auto & config = ggml_openvino_get_compile_config(); static auto is_static = false; @@ -99,6 +103,12 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, const std::strin ggml_decoder->add_extra_inputs(); infer_request = infer_request_cache[key]; + auto * inp_pos = get_inp_pos_tensor(cgraph); + int32_t * pos_data = (int32_t *) inp_pos->data; + if (pos_data[0] == 0) { + infer_request->reset_state(); + } + decoder_end_time = ggml_time_us(); conversion_end_time = decoder_end_time; compile_end_time = decoder_end_time; @@ -108,7 +118,7 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, const std::strin std::shared_ptr model; auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph); - ggml_decoder = std::make_shared(cgraph, m_params, c_params, model_weights, is_static); + ggml_decoder = std::make_shared(cgraph, m_params, c_params, model_weights, is_static, stateful); decoder_end_time = ggml_time_us(); auto input_model = std::make_shared(ggml_decoder); @@ -202,6 +212,7 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph) { static std::string device = "NPU"; static auto is_static = true; + static auto stateful = false; static auto prefill_chunk_size = get_prefill_chunk_size(); const auto & config = ggml_openvino_get_compile_config(); @@ -265,9 +276,9 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph) { auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph); auto ggml_decoder_prefill = std::make_shared(cgraph, m_params, c_params, model_weights, - is_static, true, prefill_chunk_size); + is_static, stateful, true, prefill_chunk_size); auto ggml_decoder_decode = std::make_shared(cgraph, m_params, c_params, model_weights, - is_static, false, prefill_chunk_size); + is_static, stateful, false, prefill_chunk_size); decoder_end_time = ggml_time_us(); auto input_model_prefill = std::make_shared(ggml_decoder_prefill); @@ -606,8 +617,17 @@ ov::Tensor get_ov_output_tensor(std::shared_ptr ggml_decoder, con if (ggml_decoder->is_static() && result_name == "result_output" && output_shape[2] == 0) { output_shape[2] = 1; } - ov::Tensor output_tensor(output_type, output_shape, ggml_tensor->data); - return output_tensor; + if (ggml_decoder->is_stateful() && result_name == "result_output") { + std::vector output_shape_3d; + for (size_t i=1; idata); + return output_tensor; + } else { + ov::Tensor 
output_tensor(output_type, output_shape, ggml_tensor->data); + return output_tensor; + } } size_t checksum(const void * data, size_t size) { diff --git a/ggml/src/ggml-openvino/utils.h b/ggml/src/ggml-openvino/utils.h index 44ca2db00f..47bf2d4ff1 100644 --- a/ggml/src/ggml-openvino/utils.h +++ b/ggml/src/ggml-openvino/utils.h @@ -28,7 +28,7 @@ struct graph_key_hash { enum ggml_status ov_graph_compute(struct ggml_cgraph * cgraph); -enum ggml_status ov_graph_compute_dynamic(struct ggml_cgraph * cgraph, const std::string & device); +enum ggml_status ov_graph_compute_dynamic(struct ggml_cgraph * cgraph, const std::string & device, bool stateful = false); enum ggml_status ov_graph_compute_static(struct ggml_cgraph * cgraph); size_t checksum(const void * data, size_t size); From d2fc15226ba7eada589da64bd1d6e67d83d44e31 Mon Sep 17 00:00:00 2001 From: Mustafa Cavus Date: Fri, 9 Jan 2026 11:04:15 -0800 Subject: [PATCH 220/254] Update ggml/src/ggml-openvino/ggml-decoder.cpp Co-authored-by: Yamini Nimmagadda --- ggml/src/ggml-openvino/ggml-decoder.cpp | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 7c72c1fb34..f429b796b5 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -180,19 +180,13 @@ void GgmlOvDecoder::set_input_output(ggml_tensor * node, bool naive) { } m_inputs[src_name] = src; assert(stateful_kv_shape.rank().is_static()); - if (stateful_kv_shape.rank().get_length() != 0) { - auto param_node = - std::make_shared(get_ov_type(src), stateful_kv_shape); - param_node->set_friendly_name(src_name); - param_node->output(0).get_tensor().set_names({src_name}); - m_model_inputs[src_name] = param_node; - } else { - auto param_node = - std::make_shared(get_ov_type(src), get_graph_input_shape(node, src)); - param_node->set_friendly_name(src_name); - param_node->output(0).get_tensor().set_names({src_name}); - m_model_inputs[src_name] = param_node; - } + ov::PartialShape param_shape = (stateful_kv_shape.rank().get_length() != 0) + ? 
stateful_kv_shape + : get_graph_input_shape(node, src); + auto param_node = std::make_shared(get_ov_type(src), param_shape); + param_node->set_friendly_name(src_name); + param_node->output(0).get_tensor().set_names({src_name}); + m_model_inputs[src_name] = param_node; } } } From 981ec6571d676fb406a249d9fc0be95975fd4292 Mon Sep 17 00:00:00 2001 From: Mustafa Cavus Date: Fri, 9 Jan 2026 11:05:25 -0800 Subject: [PATCH 221/254] code cleanup --- ggml/src/ggml-openvino/openvino/op/rope.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/ggml/src/ggml-openvino/openvino/op/rope.cpp b/ggml/src/ggml-openvino/openvino/op/rope.cpp index b72e445706..01bc46131e 100644 --- a/ggml/src/ggml-openvino/openvino/op/rope.cpp +++ b/ggml/src/ggml-openvino/openvino/op/rope.cpp @@ -63,9 +63,6 @@ OutputVector translate_rope(const NodeContext & context) { ov::element::i64, {4}, std::vector{1, -1, (int64_t) output_shape[2], (int64_t) output_shape[3]}); data_node = std::make_shared(data_node, data_shape, false); } - //auto data_shape = ov::op::v0::Constant::create( - // ov::element::i64, {4}, std::vector{1, -1, (int64_t) output_shape[2], (int64_t) output_shape[3]}); - //data_node = std::make_shared(data_node, data_shape, false); } const int mode = op_params[2]; From a40a5dfc60b0ab6d2caad7cb53b5ff1c3a56ffb8 Mon Sep 17 00:00:00 2001 From: Mustafa Cavus Date: Fri, 9 Jan 2026 11:29:40 -0800 Subject: [PATCH 222/254] npu perf fix --- ggml/src/ggml-openvino/ggml-openvino-extra.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-openvino/ggml-openvino-extra.cpp b/ggml/src/ggml-openvino/ggml-openvino-extra.cpp index 35d3d93cfd..b6fec855fc 100644 --- a/ggml/src/ggml-openvino/ggml-openvino-extra.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino-extra.cpp @@ -165,7 +165,7 @@ clEnqueueMemcpyINTEL_fn ggml_openvino_get_clEnqueueMemcpyINTEL() { // Get requantization type for a tensor type (returns nullopt if no requant needed) std::optional ggml_openvino_get_requant_type(const ggml_tensor * tensor) { if (strncmp(tensor->name, "token_embd.weight", 17) == 0) { - return ExtraQuantType::Q8_0_C; + return (ggml_openvino_is_npu() ? ExtraQuantType::F16 : ExtraQuantType::Q8_0_C); } if (strncmp(tensor->name, "output.weight", 13) == 0) { return ExtraQuantType::Q8_0_C; From a81b202f570c038342ad227702316957e7425705 Mon Sep 17 00:00:00 2001 From: Mustafa Cavus Date: Mon, 12 Jan 2026 10:47:16 -0800 Subject: [PATCH 223/254] requant to f16 for Q6 embed on NPU --- ggml/src/ggml-openvino/ggml-openvino-extra.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-openvino/ggml-openvino-extra.cpp b/ggml/src/ggml-openvino/ggml-openvino-extra.cpp index b6fec855fc..bc0362ee46 100644 --- a/ggml/src/ggml-openvino/ggml-openvino-extra.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino-extra.cpp @@ -165,7 +165,7 @@ clEnqueueMemcpyINTEL_fn ggml_openvino_get_clEnqueueMemcpyINTEL() { // Get requantization type for a tensor type (returns nullopt if no requant needed) std::optional ggml_openvino_get_requant_type(const ggml_tensor * tensor) { if (strncmp(tensor->name, "token_embd.weight", 17) == 0) { - return (ggml_openvino_is_npu() ? ExtraQuantType::F16 : ExtraQuantType::Q8_0_C); + return ((ggml_openvino_is_npu() && tensor->type == GGML_TYPE_Q6_K) ? 
ExtraQuantType::F16 : ExtraQuantType::Q8_0_C); } if (strncmp(tensor->name, "output.weight", 13) == 0) { return ExtraQuantType::Q8_0_C; From a92eceecd9b5163643eb68f049934657bb2174b0 Mon Sep 17 00:00:00 2001 From: Mustafa Cavus Date: Tue, 13 Jan 2026 12:27:07 -0800 Subject: [PATCH 224/254] Update ggml/src/ggml-openvino/ggml-decoder.cpp --- ggml/src/ggml-openvino/ggml-decoder.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index f429b796b5..40d5cd5418 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -186,7 +186,7 @@ void GgmlOvDecoder::set_input_output(ggml_tensor * node, bool naive) { auto param_node = std::make_shared(get_ov_type(src), param_shape); param_node->set_friendly_name(src_name); param_node->output(0).get_tensor().set_names({src_name}); - m_model_inputs[src_name] = param_node; + m_model_inputs[src_name] = param_node; } } } From 599335c63330787d4908e02c8d7e8093b63be298 Mon Sep 17 00:00:00 2001 From: Mustafa Cavus Date: Tue, 13 Jan 2026 12:27:33 -0800 Subject: [PATCH 225/254] Update ggml/src/ggml-openvino/ggml-openvino-extra.cpp --- .../src/ggml-openvino/ggml-openvino-extra.cpp | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-openvino-extra.cpp b/ggml/src/ggml-openvino/ggml-openvino-extra.cpp index bc0362ee46..76871cc4be 100644 --- a/ggml/src/ggml-openvino/ggml-openvino-extra.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino-extra.cpp @@ -268,34 +268,33 @@ ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_ten } // Normal extraction (no requant) - determine format based on tensor type + layout.is_u4 = false; + layout.weights_per_block = 32; + layout.is_symmetric = false; + switch (tensor->type) { case GGML_TYPE_Q4_0: layout.is_u4 = true; - layout.weights_per_block = 32; layout.is_symmetric = true; break; + case GGML_TYPE_Q4_1: - layout.is_u4 = true; - layout.weights_per_block = 32; - break; case GGML_TYPE_Q4_K: layout.is_u4 = true; - layout.weights_per_block = 32; break; + case GGML_TYPE_Q8_0: - layout.is_u4 = false; - layout.weights_per_block = 32; layout.is_symmetric = true; break; + case GGML_TYPE_Q6_K: - layout.is_u4 = false; layout.weights_per_block = 16; layout.is_symmetric = true; break; + case GGML_TYPE_Q5_K: - layout.is_u4 = false; - layout.weights_per_block = 32; break; + default: // Unsupported quantization type return layout; From 416556a87de782a3693db604a4c7bd1366ff10ee Mon Sep 17 00:00:00 2001 From: Yamini Nimmagadda Date: Mon, 12 Jan 2026 16:48:59 -0800 Subject: [PATCH 226/254] Create OPENVINO.md in llama.cpp backend docs --- docs/backend/OPENVINO.md | 144 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 144 insertions(+) create mode 100644 docs/backend/OPENVINO.md diff --git a/docs/backend/OPENVINO.md b/docs/backend/OPENVINO.md new file mode 100644 index 0000000000..d56c61d8a8 --- /dev/null +++ b/docs/backend/OPENVINO.md @@ -0,0 +1,144 @@ +# OpenVINO Backend for llama.cpp + +This document describes the OpenVINO backend for `llama.cpp`, which enables hardware-accelerated inference on **Intel® CPUs, GPUs, and NPUs** while remaining compatible with the existing **GGUF model ecosystem**. + +The backend translates GGML compute graphs into OpenVINO graphs and leverages graph compilation, kernel fusion, and device-specific optimizations to improve inference performance on supported Intel hardware. 
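+One way to observe this translation is to dump both graph representations using the debug variables documented under Runtime Configuration below. A minimal sketch (the model path is only a placeholder for your own GGUF file):
+
+```bash
+export GGML_OPENVINO_DUMP_CGRAPH=1   # dumps the GGML compute graph to cgraph.txt
+export GGML_OPENVINO_DUMP_IR=1       # exports timestamped OpenVINO IR files
+
+./build/ReleaseOV/bin/llama-simple -m ~/models/model.gguf -n 8 "Hello"
+```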
+ +## Overview + +The OpenVINO backend is implemented in ggml/src/ggml-openvino and provides a translation layer for core GGML operations. It supports FP16 and BF16 models, as well as selected quantized GGUF formats. This backend enables accelerated inference on Intel CPUs, integrated and discrete GPUs, and NPUs, while integrating seamlessly with the existing `llama.cpp` execution flow. + +## Supported Devices + +OpenVINO backend supports the following hardware: + +- Intel CPUs +- Intel integrated GPUs +- Intel NPUs (Requires UD32+ driver) + +Although OpenVINO supports a wide range of [Intel hardware](https://docs.openvino.ai/2025/about-openvino/release-notes-openvino/system-requirements.html), the llama.cpp OpenVINO backend has been validated specifically on AI PCs such as the Intel® Core™ Ultra Series 1 and Series 2. + +## Supported Model Precisions + +### Fully Supported + +- FP16 GGUF +- BF16 GGUF + +### Quantized Models (Partial Support) + +- `Q4_0` +- `Q4_1` +- `Q4_K_M` +- `Q6_K` + +Accuracy and performance optimizations for quantized models are still work in progress. + +## Quantization Support Details + +### CPU + +- **`Q4_0`, `Q4_1`, `Q4_K_M`, `Q6_K` models are supported** +- `Q6_K` tensors (6-bit, gs16 symmetric) are converted to int8 gs16 symmetric +- `Q5_K` tensors (5-bit, gs32 asymmetric) are converted to int8 gs32 asymmetric + +### GPU + +- **`Q4_0`, `Q4_1`, `Q4_K_M`, `Q6_K` models are supported** +- `Q6_K` tensors (6-bit, gs16 symmetric) are requantized to int8 gs32 symmetric +- `Q5_K` tensors (5-bit, gs32 asymmetric) are converted to int8 gs32 asymmetric + +### NPU + +- **Primary supported quantization scheme is `Q4_0`** +- `Q4_0` and `Q4_1` tensors are requantized to int4 gs128 symmetric +- `Q6_K` tensors are dequantized to FP16 + +#### Additional Notes + +- Both `Q4_0` and `Q4_1` models use `Q6_K` for the token embedding tensor and the final matmul weight tensor (often the same tensor) +- `Q4_0` models may produce some `Q4_1` tensors if an imatrix is provided during quantization using `llama-quantize` +- `Q4_K_M` models may include both `Q6_K` and `Q5_K` tensors (observed in Phi-3) + +## Validated Models + +The following models have been validated for functionality on Intel® Core™ Ultra Series 1 and Series 2: + +- [Llama-3.2-1B-Instruct-GGUF](https://huggingface.co/MaziyarPanahi/Llama-3.2-1B-Instruct-GGUF) +- [Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct) +- [microsoft/Phi-3-mini-4k-instruct-gguf](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf) +- [Qwen/Qwen2.5-1.5B-Instruct-GGUF](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct-GGUF) +- [Qwen/Qwen3-8B](https://huggingface.co/Qwen/Qwen3-8B) +- [openbmb/MiniCPM-1B-sft-bf16](https://huggingface.co/openbmb/MiniCPM-1B-sft-bf16) +- [tencent/Hunyuan-7B-Instruct](https://huggingface.co/tencent/Hunyuan-7B-Instruct) +- [mistralai/Mistral-7B-Instruct-v0.3](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3) + +## Build Instructions + +### Prerequisites + +- OpenVINO runtime and development packages +- CMake +- C++17-compatible compiler + +### Build Example + +```bash +cmake -B build/ReleaseOV \ + -DGGML_OPENVINO=ON \ + -DCMAKE_BUILD_TYPE=Release + +cmake --build build/ReleaseOV -j +``` + +# Runtime Configuration + +The OpenVINO backend can be configured using the following environment variables at runtime to control device selection, caching, debugging, and profiling behavior. 
+
+## Configuration Options
+
+| Variable | Description |
+|--------|-------------|
+| `GGML_OPENVINO_DEVICE` | Specify the target device (`CPU`, `GPU`, `NPU`). If not set, the backend automatically selects the first available device in priority order: **GPU → CPU → NPU**. When set to `NPU`, static compilation mode is enabled for optimal performance. |
+| `GGML_OPENVINO_CACHE_DIR` | Directory for OpenVINO model caching (recommended: `/tmp/ov_cache`). Enables model caching when set. **Not supported on NPU devices.** |
+| `GGML_OPENVINO_PROFILING` | Enable execution-time profiling. |
+| `GGML_OPENVINO_DUMP_CGRAPH` | Dump the GGML compute graph to `cgraph.txt`. |
+| `GGML_OPENVINO_DUMP_IR` | Export OpenVINO IR files with timestamps. |
+| `GGML_OPENVINO_DEBUG_INPUT` | Enable input debugging. |
+| `GGML_OPENVINO_DEBUG_OUTPUT` | Enable output debugging. |
+
+## Example Usage
+
+### GPU Inference with Profiling
+
+```bash
+export GGML_OPENVINO_CACHE_DIR=/tmp/ov_cache
+export GGML_OPENVINO_PROFILING=1
+export GGML_OPENVINO_DEVICE=GPU
+
+./build/ReleaseOV/bin/llama-simple \
+  -m ~/models/Llama-3.2-1B-Instruct.fp16.gguf \
+  -n 50 \
+  "The story of AI is "
+```
+
+### llama-bench
+
+```bash
+GGML_OPENVINO_DEVICE=GPU ./llama-bench -fa 1
+```
+`-fa 1` is required when running llama-bench with the OpenVINO backend.
+
+### NPU Notes
+
+- Prompt processing is currently slower than CPU/GPU
+- Smaller context sizes are recommended (e.g. `-c 512`)
+- Static compilation mode is enabled automatically
+- Model caching is not yet supported
+
+## Work in Progress
+
+- Performance and memory optimizations
+- Broader quantization coverage
+- Support for additional model architectures
+- Extensive accuracy validation

From 25e652569b2432fb24f083bad18c360fa38fdd76 Mon Sep 17 00:00:00 2001
From: Yamini Nimmagadda
Date: Mon, 12 Jan 2026 17:12:01 -0800
Subject: [PATCH 227/254] Update OPENVINO.md

---
 docs/backend/OPENVINO.md | 38 +++++++++++++++-----------------------
 1 file changed, 15 insertions(+), 23 deletions(-)

diff --git a/docs/backend/OPENVINO.md b/docs/backend/OPENVINO.md
index d56c61d8a8..bc3a2c66cd 100644
--- a/docs/backend/OPENVINO.md
+++ b/docs/backend/OPENVINO.md
@@ -52,7 +52,7 @@ Accuracy and performance optimizations for quantized models are still work in pr
 
 - **Primary supported quantization scheme is `Q4_0`**
 - `Q4_0` and `Q4_1` tensors are requantized to int4 gs128 symmetric
-- `Q6_K` tensors are dequantized to FP16
+- `Q6_K` tensors are requantized to int8 except for the token embedding matrix
 
 #### Additional Notes
@@ -72,30 +72,17 @@ The following models have been validated for functionality on Intel® Core™ Ul
 - [openbmb/MiniCPM-1B-sft-bf16](https://huggingface.co/openbmb/MiniCPM-1B-sft-bf16)
 - [tencent/Hunyuan-7B-Instruct](https://huggingface.co/tencent/Hunyuan-7B-Instruct)
 - [mistralai/Mistral-7B-Instruct-v0.3](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3)
+- [bartowski/DeepSeek-R1-Distill-Llama-8B-GGUF](https://huggingface.co/bartowski/DeepSeek-R1-Distill-Llama-8B-GGUF)
 
 ## Build Instructions
 
-### Prerequisites
+For detailed build instructions, refer to [build.md](../build.md#openvino)
 
-- OpenVINO runtime and development packages
-- CMake
-- C++17-compatible compiler
-
-### Build Example
-
-```bash
-cmake -B build/ReleaseOV \
-  -DGGML_OPENVINO=ON \
-  -DCMAKE_BUILD_TYPE=Release
-
-cmake --build build/ReleaseOV -j
-```
-
-# Runtime Configuration
+## Runtime Configuration
 
 The OpenVINO backend can be configured using the following environment variables at runtime to control device selection, caching, debugging, and profiling behavior.
 
-## Configuration Options
+### Configuration Options
 
 | Variable | Description |
 |--------|-------------|
@@ -107,9 +94,9 @@ The OpenVINO backend can be configured using the following environment variables
 | `GGML_OPENVINO_DEBUG_INPUT` | Enable input debugging. |
 | `GGML_OPENVINO_DEBUG_OUTPUT` | Enable output debugging. |
 
-## Example Usage
+### Example Usage
 
-### GPU Inference with Profiling
+#### GPU Inference with Profiling
 
 ```bash
 export GGML_OPENVINO_CACHE_DIR=/tmp/ov_cache
 export GGML_OPENVINO_PROFILING=1
 export GGML_OPENVINO_DEVICE=GPU
 
 ./build/ReleaseOV/bin/llama-simple \
   -m ~/models/Llama-3.2-1B-Instruct.fp16.gguf \
   -n 50 \
   "The story of AI is "
 ```
 
-### llama-bench
+#### llama-bench
 
 ```bash
 GGML_OPENVINO_DEVICE=GPU ./llama-bench -fa 1
 ```
 `-fa 1` is required when running llama-bench with the OpenVINO backend.
 
 ### NPU Notes
 
-- Prompt processing is currently slower than CPU/GPU
 - Smaller context sizes are recommended (e.g. `-c 512`)
 - Static compilation mode is enabled automatically
 - Model caching is not yet supported
-
+- Does not support llama-server -np > 1 (multiple parallel sequences)
+- Only supports llama-perplexity -b 512 or smaller
+
+## Llama.cpp Tools
+
+The following tools work with the OpenVINO backend on CPU and GPU: `llama-simple`, `llama-run`, `llama-cli`, `llama-server`, `llama-bench`, and `llama-perplexity`.
+
 ## Work in Progress
 
 - Performance and memory optimizations

From 9ba324726aa7b01d8e58336e3609aac055054103 Mon Sep 17 00:00:00 2001
From: Yamini Nimmagadda
Date: Mon, 12 Jan 2026 17:29:46 -0800
Subject: [PATCH 228/254] Update OPENVINO.md

---
 docs/backend/OPENVINO.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/docs/backend/OPENVINO.md b/docs/backend/OPENVINO.md
index bc3a2c66cd..7c2e733b03 100644
--- a/docs/backend/OPENVINO.md
+++ b/docs/backend/OPENVINO.md
@@ -93,6 +93,9 @@ The OpenVINO backend can be configured using the following environment variables
 | `GGML_OPENVINO_DUMP_IR` | Export OpenVINO IR files with timestamps. |
 | `GGML_OPENVINO_DEBUG_INPUT` | Enable input debugging. |
 | `GGML_OPENVINO_DEBUG_OUTPUT` | Enable output debugging. |
+| *`GGML_OPENVINO_STATEFUL_EXECUTION` | Enable stateful execution for better performance. |
+
+*`GGML_OPENVINO_STATEFUL_EXECUTION` is an **experimental** feature that manages the KV cache internally, as state inside the OpenVINO model, which improves performance on CPUs and GPUs. Stateful execution is not effective on NPUs, and not all models currently support it. The feature has been validated only with the llama-simple, llama-cli, llama-bench, and llama-run applications; enabling it with those applications is recommended for the best performance. Other applications, such as llama-server and llama-perplexity, are not yet supported.
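+
+As a minimal usage sketch (the model path is only a placeholder; substitute your own GGUF file):
+
+```bash
+# Experimental: keep the KV cache as internal state inside the OpenVINO model (CPU/GPU only)
+export GGML_OPENVINO_STATEFUL_EXECUTION=1
+export GGML_OPENVINO_DEVICE=GPU
+
+./build/ReleaseOV/bin/llama-cli \
+  -m ~/models/Llama-3.2-1B-Instruct.fp16.gguf \
+  -p "The story of AI is " \
+  -n 50
+```
+
+The backend only checks whether the variable is set, so any value enables the feature; unset it to fall back to the default stateless execution.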
 
 ### Example Usage

From 61552e4450abc5924589a1916b1d7f63f02f151a Mon Sep 17 00:00:00 2001
From: Yamini Nimmagadda
Date: Mon, 12 Jan 2026 17:37:26 -0800
Subject: [PATCH 229/254] Update OPENVINO.md

---
 docs/backend/OPENVINO.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/backend/OPENVINO.md b/docs/backend/OPENVINO.md
index 7c2e733b03..3395b70e60 100644
--- a/docs/backend/OPENVINO.md
+++ b/docs/backend/OPENVINO.md
@@ -52,7 +52,7 @@ Accuracy and performance optimizations for quantized models are still work in pr
 
 - **Primary supported quantization scheme is `Q4_0`**
 - `Q4_0` and `Q4_1` tensors are requantized to int4 gs128 symmetric
-- `Q6_K` tensors are requantized to int8 except for the token embedding matrix
+- `Q6_K` tensors are requantized to int8 except for the token embedding matrix which is dequantized to fp16

From 63eed0d9f3bcfb906e9f09c3055b838c59e3f3f2 Mon Sep 17 00:00:00 2001
From: Yamini Nimmagadda
Date: Mon, 12 Jan 2026 17:43:28 -0800
Subject: [PATCH 230/254] Update build.md

---
 docs/build.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/build.md b/docs/build.md
index c52c27c295..be29e1107f 100644
--- a/docs/build.md
+++ b/docs/build.md
@@ -701,7 +701,7 @@ To read documentation for how to build on IBM Z & LinuxONE, [click here](./build
 
 [OpenVINO](https://docs.openvino.ai/2025/index.html) is an open-source toolkit for optimizing and deploying high-performance AI inference, specifically designed for Intel hardware, including CPUs, GPUs, and NPUs, in the cloud, on-premises, and on the edge. The OpenVINO backend enhances performance by leveraging hardware-specific optimizations and can be enabled for use with llama.cpp.
 
-Follow the instructions below to install OpenVINO runtime and build llama.cpp with OpenVINO support.
+Follow the instructions below to install OpenVINO runtime and build llama.cpp with OpenVINO support. For more detailed information on the OpenVINO backend, refer to [OPENVINO.md](backend/OPENVINO.md).

From f44c60e995fda6b817dc29756a00bb9c4703a188 Mon Sep 17 00:00:00 2001
From: Yamini Nimmagadda
Date: Tue, 13 Jan 2026 14:33:16 -0800
Subject: [PATCH 231/254] Update OPENVINO.md

---
 docs/backend/OPENVINO.md | 14 +++-----------
 1 file changed, 3 insertions(+), 11 deletions(-)

diff --git a/docs/backend/OPENVINO.md b/docs/backend/OPENVINO.md
index 3395b70e60..d69aaedf61 100644
--- a/docs/backend/OPENVINO.md
+++ b/docs/backend/OPENVINO.md
@@ -36,23 +36,15 @@ Accuracy and performance optimizations for quantized models are still work in pr
 
 ## Quantization Support Details
 
-### CPU
+### CPU and GPU
 
 - **`Q4_0`, `Q4_1`, `Q4_K_M`, `Q6_K` models are supported**
-- `Q6_K` tensors (6-bit, gs16 symmetric) are converted to int8 gs16 symmetric
-- `Q5_K` tensors (5-bit, gs32 asymmetric) are converted to int8 gs32 asymmetric
-
-### GPU
-
-- **`Q4_0`, `Q4_1`, `Q4_K_M`, `Q6_K` models are supported**
-- `Q6_K` tensors (6-bit, gs16 symmetric) are requantized to int8 gs32 symmetric
-- `Q5_K` tensors (5-bit, gs32 asymmetric) are converted to int8 gs32 asymmetric
+- `Q5_K` and `Q6_K` tensors are converted to `Q8_0_C`
 
 ### NPU
 
 - **Primary supported quantization scheme is `Q4_0`**
-- `Q4_0` and `Q4_1` tensors are requantized to int4 gs128 symmetric
-- `Q6_K` tensors are requantized to int8 except for the token embedding matrix which is dequantized to fp16
+- `Q6_K` tensors are requantized to `Q4_0_128` in general.
For embedding weights, `Q6_K` tensors are requantized to `Q8_0_C` except for the token embedding matrix which is dequantized to fp16 #### Additional Notes From e9ed5c4cb655f7fd2f3f0f21b13a7c0da464201c Mon Sep 17 00:00:00 2001 From: Yamini Nimmagadda Date: Tue, 13 Jan 2026 14:50:44 -0800 Subject: [PATCH 232/254] Update OPENVINO.md --- docs/backend/OPENVINO.md | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/docs/backend/OPENVINO.md b/docs/backend/OPENVINO.md index d69aaedf61..87c537f20b 100644 --- a/docs/backend/OPENVINO.md +++ b/docs/backend/OPENVINO.md @@ -13,20 +13,15 @@ The OpenVINO backend is implemented in ggml/src/ggml-openvino and provides a tra OpenVINO backend supports the following hardware: - Intel CPUs -- Intel integrated GPUs +- Intel integrated and discrete GPUs - Intel NPUs (Requires UD32+ driver) Although OpenVINO supports a wide range of [Intel hardware](https://docs.openvino.ai/2025/about-openvino/release-notes-openvino/system-requirements.html), the llama.cpp OpenVINO backend has been validated specifically on AI PCs such as the Intel® Core™ Ultra Series 1 and Series 2. ## Supported Model Precisions -### Fully Supported - -- FP16 GGUF -- BF16 GGUF - -### Quantized Models (Partial Support) - +- `FP16` +- `BF16` (on Intel Xeon) - `Q4_0` - `Q4_1` - `Q4_K_M` @@ -46,7 +41,7 @@ Accuracy and performance optimizations for quantized models are still work in pr - **Primary supported quantization scheme is `Q4_0`** - `Q6_K` tensors are requantized to `Q4_0_128` in general. For embedding weights, `Q6_K` tensors are requantized to `Q8_0_C` except for the token embedding matrix which is dequantized to fp16 -#### Additional Notes +### Additional Notes - Both `Q4_0` and `Q4_1` models use `Q6_K` for the token embedding tensor and the final matmul weight tensor (often the same tensor) - `Q4_0` models may produce some `Q4_1` tensors if an imatrix is provided during quantization using `llama-quantize` From d3649c11cbf2d4967ed8b2871c05d299a14e3cd8 Mon Sep 17 00:00:00 2001 From: Yamini Nimmagadda Date: Tue, 13 Jan 2026 14:53:27 -0800 Subject: [PATCH 233/254] Update OPENVINO.md --- docs/backend/OPENVINO.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/docs/backend/OPENVINO.md b/docs/backend/OPENVINO.md index 87c537f20b..acb461f435 100644 --- a/docs/backend/OPENVINO.md +++ b/docs/backend/OPENVINO.md @@ -108,8 +108,6 @@ GGML_OPENVINO_DEVICE=GPU ./llama-bench -fa 1 ### NPU Notes -- Smaller context sizes are recommended (e.g. 
`-c 512`) -- Static compilation mode is enabled automatically - Model caching is not yet supported - Does not support llama-server -np > 1 (multiple parallel sequences) - Only supports llama-perplexity -b 512 or smaller From d7dccf887be69f17f1e67609f60a1942ddf4ef48 Mon Sep 17 00:00:00 2001 From: Mustafa Cavus Date: Thu, 15 Jan 2026 14:38:53 -0800 Subject: [PATCH 234/254] kq_mask naming fix --- ggml/src/ggml-openvino/ggml-decoder.cpp | 4 ++-- ggml/src/ggml-openvino/openvino/translate_session.cpp | 4 ++-- ggml/src/ggml-openvino/utils.cpp | 6 +++--- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 40d5cd5418..b8fe6358c8 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -324,7 +324,7 @@ std::pair GgmlOvDecoder::compute_llm_params(ggml_cgr int layer = extract_layer_from_name(cache_k->name); auto * mask = node->src[3]; std::string mask_name(mask->name); - assert(mask_name.find("KQ_mask") == 0); + assert(mask_name.find("self_kq_mask") == 0); if (std::string(node->src[3]->name).find("swa") != std::string::npos) { model_params.swa_layers.push_back(layer); @@ -392,7 +392,7 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor * op, co } else if (name == "inp_out_ids") { input_shape = ov::PartialShape{1, 1, 1, m_is_static ? m_compute_params.output_len : -1}; - } else if (name.find("KQ_mask") == 0) { + } else if (name.find("self_kq_mask") == 0) { if (m_is_static) { input_shape = ov::PartialShape{1, 1, m_is_prefill ? m_prefill_chunk_size : 1, m_model_params.ctx}; } else if (m_is_stateful) { diff --git a/ggml/src/ggml-openvino/openvino/translate_session.cpp b/ggml/src/ggml-openvino/openvino/translate_session.cpp index 02e08c24f4..adb3025d17 100644 --- a/ggml/src/ggml-openvino/openvino/translate_session.cpp +++ b/ggml/src/ggml-openvino/openvino/translate_session.cpp @@ -109,8 +109,8 @@ void add_sliced_mask(TensorMap & tensor_map, GgmlDecoder & ggml_model_decoder) { } }; - create_sliced_mask("KQ_mask", "KQ_mask_sliced", ggml_model_decoder.is_static()); - create_sliced_mask("KQ_mask_swa", "KQ_mask_swa_sliced", ggml_model_decoder.is_static()); + create_sliced_mask("self_kq_mask", "KQ_mask_sliced", ggml_model_decoder.is_static()); + create_sliced_mask("self_kq_mask_swa", "KQ_mask_swa_sliced", ggml_model_decoder.is_static()); } void add_rope_sin_cos(TensorMap & tensor_map, GgmlDecoder & ggml_model_decoder) { diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index ff94c4acfe..f7d62588c8 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -525,7 +525,7 @@ ov::Tensor get_ov_input_tensor_static_decode(std::shared_ptr ggml return input_tensor; } - if (param_name.find("KQ_mask") == 0) { + if (param_name.find("self_kq_mask") == 0) { size_t context_size = ggml_decoder->get_ctx_size(); std::vector padded_data = pad_input(ggml_tensor, 1, context_size, -INFINITY); ov::Tensor input_tensor(ov::element::f32, ov::Shape{1, 1, 1, context_size}); @@ -591,7 +591,7 @@ ov::Tensor get_ov_input_tensor_static_prefill(std::shared_ptr ggm return input_tensor; } - if (param_name.find("KQ_mask") == 0) { + if (param_name.find("self_kq_mask") == 0) { size_t cols = ggml_tensor->ne[0]; size_t rows = ggml_tensor->ne[1]; float * ggml_data = (float *) ggml_tensor->data + chunk_index * chunk_size * cols; @@ -645,7 +645,7 @@ void print_input_tensor_info(const std::string & name, const ov::Tensor & tensor << 
std::endl; switch (tensor.get_element_type()) { case ov::element::f32: { - if (name.find("KQ_mask") == std::string::npos) { + if (name.find("self_kq_mask") == std::string::npos) { std::cout << *(tensor.data()) << std::endl; } else { size_t rows = tensor.get_shape()[2]; From aa4bc90030c8fb7804054f09aa31d8f3ace5e0db Mon Sep 17 00:00:00 2001 From: Mustafa Cavus Date: Fri, 16 Jan 2026 13:06:43 -0800 Subject: [PATCH 235/254] Syntax correction for workflows build file --- .github/workflows/build.yml | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index a0be0c704b..55bea37d15 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -1770,12 +1770,12 @@ jobs: GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp ggml-ci-arm64-cpu-kleidiai: - runs-on: ubuntu-22.04-arm + runs-on: ubuntu-22.04-arm - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 + steps: + - name: Clone + id: checkout + uses: actions/checkout@v4 - name: ccache uses: ggml-org/ccache-action@v1.2.16 @@ -1790,10 +1790,10 @@ jobs: sudo apt-get update sudo apt-get install -y build-essential - - name: Test - id: ggml-ci - run: | - GG_BUILD_KLEIDIAI=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt + - name: Test + id: ggml-ci + run: | + GG_BUILD_KLEIDIAI=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt ubuntu-cpu-cmake-riscv64-native: runs-on: RISCV64 From 9a15c8b0cfc7ba1d5b1b9e8983135b18bce6905c Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Wed, 21 Jan 2026 15:23:12 +0800 Subject: [PATCH 236/254] Change ov backend buffer is_host to false --- ggml/src/ggml-openvino/ggml-openvino.cpp | 56 +++++++++++++++--------- 1 file changed, 36 insertions(+), 20 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp index a1b5b5dd32..de986ea42d 100644 --- a/ggml/src/ggml-openvino/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino.cpp @@ -22,6 +22,16 @@ #include #include +#if defined(_WIN32) +# define WIN32_LEAN_AND_MEAN +# ifndef NOMINMAX +# define NOMINMAX +# endif +# include +#else +# include +#endif + // ===================================================== // OpenVINO Buffer Implementation using ov::Tensor // ===================================================== @@ -152,7 +162,7 @@ static enum ggml_status ggml_backend_openvino_buffer_init_tensor(ggml_backend_bu ctx = (ggml_backend_openvino_buffer_context *) buffer->context; - if (tensor->data != nullptr) { + if (tensor->data != nullptr && !ggml_is_quantized(tensor->type)) { ggml_openvino_tensor_extra * extra = ggml_openvino_create_tensor_extra(tensor, ctx->is_remote); if (extra != nullptr) { auto it = ctx->tensor_extras.find(tensor); @@ -172,7 +182,7 @@ static void ggml_backend_openvino_buffer_memset_tensor(ggml_backend_buffer_t buf uint8_t value, size_t offset, size_t size) { - GGML_LOG_DEBUG("%s: buffer usage=%d, tensor name=%s\n", __func__, buffer->usage, tensor->name); + // GGML_LOG_DEBUG("%s: buffer usage=%d, tensor name=%s\n", __func__, buffer->usage, tensor->name); GGML_ASSERT(tensor != nullptr && tensor->data != nullptr); ggml_backend_openvino_buffer_context * ctx = (ggml_backend_openvino_buffer_context *) buffer->context; @@ -480,20 +490,13 @@ static size_t ggml_backend_openvino_buffer_type_get_alloc_size(ggml_backend_buff return ggml_nbytes(tensor); } -static bool ggml_backend_openvino_buffer_type_is_host(ggml_backend_buffer_type_t buft) 
{ - GGML_UNUSED(buft); - // Currently using host memory via ov::Tensor - // This will be false when using GPU/NPU remote tensors - return true; -} - static const ggml_backend_buffer_type_i ggml_backend_openvino_buffer_type_interface = { /* .get_name = */ ggml_backend_openvino_buffer_type_get_name, /* .alloc_buffer = */ ggml_backend_openvino_buffer_type_alloc_buffer, /* .get_alignment = */ ggml_backend_openvino_buffer_type_get_alignment, /* .get_max_size = */ ggml_backend_openvino_buffer_type_get_max_size, /* .get_alloc_size = */ ggml_backend_openvino_buffer_type_get_alloc_size, - /* .is_host = */ ggml_backend_openvino_buffer_type_is_host, + /* .is_host = */ nullptr, }; // Get buffer type for a specific device @@ -537,13 +540,18 @@ static const char * ggml_backend_openvino_host_buffer_type_get_name(ggml_backend return name.c_str(); } +static bool ggml_backend_openvino_host_buffer_type_is_host(ggml_backend_buffer_type_t buft) { + GGML_UNUSED(buft); + return true; +} + static const ggml_backend_buffer_type_i ggml_backend_openvino_host_buffer_type_interface = { /* .get_name = */ ggml_backend_openvino_host_buffer_type_get_name, /* .alloc_buffer = */ ggml_backend_openvino_buffer_type_alloc_buffer, /* .get_alignment = */ ggml_backend_openvino_buffer_type_get_alignment, /* .get_max_size = */ ggml_backend_openvino_buffer_type_get_max_size, /* .get_alloc_size = */ ggml_backend_openvino_buffer_type_get_alloc_size, - /* .is_host = */ ggml_backend_openvino_buffer_type_is_host, + /* .is_host = */ ggml_backend_openvino_host_buffer_type_is_host, }; GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_openvino_host_buffer_type(int device) { @@ -704,11 +712,22 @@ static const char * ggml_backend_openvino_device_get_description(ggml_backend_de } static void ggml_backend_openvino_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) { - GGML_ASSERT(dev->context != nullptr); - GGML_ASSERT(free != nullptr); - GGML_ASSERT(total != nullptr); - *total = 1; - *free = 1; +#ifdef _WIN32 + MEMORYSTATUSEX status; + status.dwLength = sizeof(status); + GlobalMemoryStatusEx(&status); + *total = status.ullTotalPhys; + *free = status.ullAvailPhys; +#else + long pages = sysconf(_SC_PHYS_PAGES); + long page_size = sysconf(_SC_PAGE_SIZE); + *total = pages * page_size; + + // "free" system memory is ill-defined, for practical purposes assume that all of it is free: + *free = *total; +#endif // _WIN32 + + GGML_UNUSED(dev); } static enum ggml_backend_dev_type ggml_backend_openvino_device_get_type(ggml_backend_dev_t dev) { @@ -924,9 +943,7 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con } static bool ggml_backend_openvino_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { - // Support our own buffer type and any host buffer (for mmap'd files, etc.) 
- return ggml_backend_buft_is_openvino(buft) || ggml_backend_buft_is_host(buft); - // return ggml_backend_buft_is_openvino(buft) || ggml_backend_buft_is_openvino_host(buft); + return ggml_backend_buft_is_openvino(buft) || ggml_backend_buft_is_openvino_host(buft); GGML_UNUSED(dev); } @@ -938,7 +955,6 @@ static const struct ggml_backend_device_i ggml_backend_openvino_device_interface /* .get_props = */ ggml_backend_openvino_device_get_props, /* .init_backend = */ ggml_backend_openvino_device_init, /* .get_buffer_type = */ ggml_backend_openvino_device_get_buffer_type, - // /* .get_host_buffer_type = */ NULL, /* .get_host_buffer_type = */ ggml_backend_openvino_device_get_host_buffer_type, /* .buffer_from_host_ptr = */ NULL, /* .supports_op = */ ggml_backend_openvino_device_supports_op, From 8fb20b28b78df640fb712b867fc98090b2801616 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Thu, 22 Jan 2026 12:20:50 +0800 Subject: [PATCH 237/254] Fix llama-bench -p -n where p<=256 --- ggml/src/ggml-openvino/utils.cpp | 12 +++++------- ggml/src/ggml-openvino/utils.h | 9 +++------ 2 files changed, 8 insertions(+), 13 deletions(-) diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index f7d62588c8..2d30eef941 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -768,14 +768,12 @@ graph_key compute_graph_key(ggml_cgraph * cgraph) { graph_key key; key.n_nodes = cgraph->n_nodes; - if (cgraph->n_nodes > 0) { - key.first_node_name = std::string(cgraph->nodes[0]->name); - key.last_node_name = std::string(cgraph->nodes[cgraph->n_nodes - 1]->name); - } else { - key.first_node_name = ""; - key.last_node_name = ""; + for (int i = 0; i < cgraph->n_nodes; ++i) { + const auto * node = cgraph->nodes[i]; + if (node->op == GGML_OP_SET_ROWS && strncmp(node->src[2]->name, "cache_k_l0", 10) == 0) { + key.cache_k_l0 = node->src[2]; + } } - return key; } diff --git a/ggml/src/ggml-openvino/utils.h b/ggml/src/ggml-openvino/utils.h index 47bf2d4ff1..72ef904f74 100644 --- a/ggml/src/ggml-openvino/utils.h +++ b/ggml/src/ggml-openvino/utils.h @@ -8,20 +8,17 @@ struct graph_key { size_t n_nodes; - std::string first_node_name; - std::string last_node_name; + void * cache_k_l0; bool operator==(const graph_key & other) const { - return n_nodes == other.n_nodes && first_node_name == other.first_node_name && - last_node_name == other.last_node_name; + return n_nodes == other.n_nodes && cache_k_l0 == other.cache_k_l0; } }; struct graph_key_hash { size_t operator()(const graph_key & key) const { size_t h = std::hash{}(key.n_nodes); - h ^= std::hash{}(key.first_node_name) + 0x9e3779b9 + (h << 6) + (h >> 2); - h ^= std::hash{}(key.last_node_name) + 0x9e3779b9 + (h << 6) + (h >> 2); + h ^= std::hash{}(key.cache_k_l0) + 0x9e3779b9 + (h << 6) + (h >> 2); return h; } }; From 1c0a47a4856c308285be4588e3c9adac88cf96de Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Thu, 22 Jan 2026 15:52:10 +0800 Subject: [PATCH 238/254] Fix --direct-io 0 --- ggml/src/ggml-openvino/ggml-openvino.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp index de986ea42d..06bff5a2b7 100644 --- a/ggml/src/ggml-openvino/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino.cpp @@ -943,7 +943,7 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con } static bool ggml_backend_openvino_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { - return 
ggml_backend_buft_is_openvino(buft) || ggml_backend_buft_is_openvino_host(buft); + return ggml_backend_buft_is_openvino(buft) || ggml_backend_buft_is_host(buft); GGML_UNUSED(dev); } From c840210213943a9fa58d67aaa70910d5511195a4 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Sat, 24 Jan 2026 17:16:06 +0800 Subject: [PATCH 239/254] Don't put kvcache on GPU in stateful mode --- ggml/src/ggml-openvino/ggml-openvino.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp index 06bff5a2b7..8d6a0dbf33 100644 --- a/ggml/src/ggml-openvino/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino.cpp @@ -140,7 +140,7 @@ static enum ggml_status ggml_backend_openvino_buffer_init_tensor(ggml_backend_bu // Put kvcache on device memory for GPU (NPU memory is too small even for kvcache) if (buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY && strncmp(tensor->name, "cache_", 6) == 0 && !ctx->is_remote && - ggml_openvino_get_device_name() == "GPU") { + ggml_openvino_get_device_name() == "GPU" && !getenv("GGML_OPENVINO_STATEFUL_EXECUTION")) { GGML_ASSERT(ctx->tensor_extras.empty()); auto device = ctx->device; auto size = ctx->size; From d398214e1408be00d62cc0dd6daf2718357a88bb Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Fri, 23 Jan 2026 15:49:01 +0800 Subject: [PATCH 240/254] Remove hardcode names --- ggml/src/ggml-openvino/ggml-decoder.cpp | 63 +++++++++++++------------ ggml/src/ggml-openvino/ggml-decoder.h | 8 ++-- ggml/src/ggml-openvino/utils.cpp | 4 +- 3 files changed, 39 insertions(+), 36 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index b8fe6358c8..01e2c2ff19 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -169,9 +169,11 @@ void GgmlOvDecoder::set_input_output(ggml_tensor * node, bool naive) { // TODO: The shape modification for stateful model below is not validated for all supported models yet. More generic solution might be needed // to enable additional cases. Ideally, this could be removed from decoder and done as part of a transformation later. auto stateless_kv_shape = get_graph_input_shape(node, src); - assert(stateless_kv_shape.size() == 4 && stateless_kv_shape[0] == 1 && stateless_kv_shape[1] == 1 - && stateless_kv_shape[2].is_dynamic() && stateless_kv_shape[3] == (m_model_params.n_heads_kv*m_model_params.head_size)); - stateful_kv_shape = {stateless_kv_shape[0], ov::Dimension::dynamic(), m_model_params.n_heads_kv, m_model_params.head_size}; + assert(stateless_kv_shape.size() == 4 && stateless_kv_shape[0] == 1 && + stateless_kv_shape[1] == 1 && stateless_kv_shape[2].is_dynamic() && + stateless_kv_shape[3] == (m_model_params.n_heads_kv * m_model_params.head_size)); + stateful_kv_shape = {stateless_kv_shape[0], ov::Dimension::dynamic(), + m_model_params.n_heads_kv, m_model_params.head_size}; } } } @@ -180,9 +182,8 @@ void GgmlOvDecoder::set_input_output(ggml_tensor * node, bool naive) { } m_inputs[src_name] = src; assert(stateful_kv_shape.rank().is_static()); - ov::PartialShape param_shape = (stateful_kv_shape.rank().get_length() != 0) - ? stateful_kv_shape - : get_graph_input_shape(node, src); + ov::PartialShape param_shape = + (stateful_kv_shape.rank().get_length() != 0) ? 
stateful_kv_shape : get_graph_input_shape(node, src); auto param_node = std::make_shared(get_ov_type(src), param_shape); param_node->set_friendly_name(src_name); param_node->output(0).get_tensor().set_names({src_name}); @@ -197,7 +198,7 @@ void GgmlOvDecoder::set_input_output(ggml_tensor * node, bool naive) { static std::set debug_output_names = {}; // Workaround: the final tensor "result_output" does not have GGML_TENSOR_FLAG_OUTPUT flag set in cgraph if (node->op == GGML_OP_SET_ROWS || node->flags & GGML_TENSOR_FLAG_OUTPUT || - node_output_name.find("output") != std::string::npos || debug_output_names.count(node_output_name)) { + debug_output_names.count(node_output_name)) { if (m_model_outputs.find(node_output_name) == m_model_outputs.end()) { m_model_outputs[node_output_name] = node_output; } @@ -312,6 +313,11 @@ std::pair GgmlOvDecoder::compute_llm_params(ggml_cgr auto * node = cgraph->nodes[i]; std::string name = std::string(node->name); if (node->op == GGML_OP_FLASH_ATTN_EXT) { + model_params.n_heads = node->src[0]->ne[2]; + model_params.n_heads_kv = node->src[1]->ne[2]; + model_params.head_size = node->src[0]->ne[0]; + compute_params.input_len = node->src[0]->ne[1]; + auto * cache_k_perm = node->src[1]; if (cache_k_perm->op == GGML_OP_CPY) { cache_k_perm = cache_k_perm->src[0]; @@ -324,9 +330,8 @@ std::pair GgmlOvDecoder::compute_llm_params(ggml_cgr int layer = extract_layer_from_name(cache_k->name); auto * mask = node->src[3]; std::string mask_name(mask->name); - assert(mask_name.find("self_kq_mask") == 0); - if (std::string(node->src[3]->name).find("swa") != std::string::npos) { + if (mask_name.find("swa") != std::string::npos) { model_params.swa_layers.push_back(layer); model_params.ctx_per_seq_swa = cache_k->ne[1]; } else { @@ -351,24 +356,17 @@ std::pair GgmlOvDecoder::compute_llm_params(ggml_cgr compute_params.attention_size_swa = model_params.ctx_per_seq_swa; compute_params.token_len_per_seq = 1; } - - } else if (node->op == GGML_OP_ROPE) { - if (name.find("Qcur-0") == 0 || std::string(node->src[0]->name).find("Qcur-0") == 0) { - model_params.head_size = node->ne[0]; - model_params.n_heads = node->ne[1]; - model_params.rope_params = node->op_params; - auto * inp_pos = node->src[1]; - compute_params.input_len = inp_pos->ne[0]; - } else if (name.find("Kcur-0") == 0 || std::string(node->src[0]->name).find("Kcur-0") == 0) { - model_params.n_heads_kv = node->ne[1]; - } - } else if (node->op == GGML_OP_GET_ROWS && std::string(node->src[1]->name) == "inp_out_ids") { - // for static case, output_len is always 1 except for llama-perplexity - compute_params.output_len = node->src[1]->ne[0]; - if (is_static && compute_params.output_len == 0) { - compute_params.output_len = 1; - } + break; } + if (node->op == GGML_OP_ROPE) { + model_params.rope_params = node->op_params; + } + } + auto * output_tensor = cgraph->nodes[cgraph->n_nodes - 1]; + compute_params.output_len = output_tensor->ne[1]; + // for NPU, output_len is always 1 except for llama-perplexity + if (is_static && compute_params.output_len == 0) { + compute_params.output_len = 1; } model_params.ctx = model_params.ctx_per_seq * model_params.n_seq; model_params.ctx_swa = model_params.ctx_per_seq_swa * model_params.n_seq; @@ -385,14 +383,17 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor * op, co auto name = std::string(input->name); ov::PartialShape input_shape; - if (name == "inp_tokens" || name == "inp_pos") { + if ((op->op == GGML_OP_GET_ROWS && op->src[0]->op == GGML_OP_NONE) || op->op == 
GGML_OP_ROPE) { + // tokens or positions int len = m_is_static ? (m_is_prefill ? m_prefill_chunk_size : 1) : -1; input_shape = ov::PartialShape{1, 1, 1, len}; - } else if (name == "inp_out_ids") { + } else if (op->op == GGML_OP_GET_ROWS) { + // output index input_shape = ov::PartialShape{1, 1, 1, m_is_static ? m_compute_params.output_len : -1}; - } else if (name.find("self_kq_mask") == 0) { + } else if (op->op == GGML_OP_CPY || op->op == GGML_OP_FLASH_ATTN_EXT) { + // mask if (m_is_static) { input_shape = ov::PartialShape{1, 1, m_is_prefill ? m_prefill_chunk_size : 1, m_model_params.ctx}; } else if (m_is_stateful) { @@ -401,7 +402,8 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor * op, co input_shape = ov::PartialShape{-1, 1, -1, -1}; } - } else if (name.find("cache_") == 0) { + } else if (op && op->op == GGML_OP_SET_ROWS && op->src[2] == input) { + // kvcache input_shape = ov::PartialShape{get_shape(input)}; if (!m_is_static) { // do not fix ctx size to make llama-bench work @@ -409,6 +411,7 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor * op, co } } else if (op && op->op == GGML_OP_SET_ROWS && op->src[1] == input) { + // kv update index int len = m_is_static ? (m_is_prefill ? m_prefill_chunk_size : 1) : -1; input_shape = ov::PartialShape{1, 1, 1, len}; diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index 4afec272e1..c0d18b7512 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -16,7 +16,7 @@ struct ModelParams { int ctx_swa = -1; int ctx_per_seq = -1; int ctx_per_seq_swa = -1; - int n_seq = -1; + int n_seq = 1; int n_heads = -1; int n_heads_kv = -1; int head_size = -1; @@ -37,14 +37,14 @@ struct ModelParams { }; struct ComputeParams { - int n_seq_active = -1; - int seq_active_start = -1; + int n_seq_active = 1; + int seq_active_start = 0; int attention_size = -1; int attention_size_swa = -1; int input_len = -1; int token_len_per_seq = -1; int past_kv_len = -1; - int output_len = -1; + int output_len = 1; }; class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 2d30eef941..8c3717472b 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -614,10 +614,10 @@ ov::Tensor get_ov_output_tensor(std::shared_ptr ggml_decoder, con auto output_type = ggml_decoder->get_ov_type(ggml_tensor); auto output_shape = ggml_decoder->get_shape(ggml_tensor); - if (ggml_decoder->is_static() && result_name == "result_output" && output_shape[2] == 0) { + if (ggml_decoder->is_static() && output_shape[2] == 0) { output_shape[2] = 1; } - if (ggml_decoder->is_stateful() && result_name == "result_output") { + if (ggml_decoder->is_stateful() && ggml_tensor->flags & GGML_TENSOR_FLAG_OUTPUT) { std::vector output_shape_3d; for (size_t i=1; i Date: Fri, 23 Jan 2026 15:49:36 +0800 Subject: [PATCH 241/254] Fix stateful shapes --- .../ggml-openvino/openvino/op/glu_geglu.cpp | 2 +- .../ggml-openvino/openvino/op/glu_swiglu.cpp | 2 +- ggml/src/ggml-openvino/openvino/op/rope.cpp | 22 +++++-------------- ggml/src/ggml-openvino/openvino/utils.cpp | 2 +- ggml/src/ggml-openvino/utils.cpp | 2 ++ 5 files changed, 11 insertions(+), 19 deletions(-) diff --git a/ggml/src/ggml-openvino/openvino/op/glu_geglu.cpp b/ggml/src/ggml-openvino/openvino/op/glu_geglu.cpp index ad5cd3f6ba..8be9e8deb0 100644 --- a/ggml/src/ggml-openvino/openvino/op/glu_geglu.cpp +++ 
b/ggml/src/ggml-openvino/openvino/op/glu_geglu.cpp @@ -26,7 +26,7 @@ OutputVector translate_glu_geglu(const NodeContext & context) { src1 = context.get_input(1); } else { auto combined = context.get_input(0); - auto split_axis = ov::op::v0::Constant::create(ov::element::i64, {}, {3}); + auto split_axis = ov::op::v0::Constant::create(ov::element::i64, {}, {-1}); auto split = std::make_shared(combined, split_axis, 2); src0 = split->output(0); src1 = split->output(1); diff --git a/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp b/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp index 2b7f13629f..6e0b85517e 100644 --- a/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp +++ b/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp @@ -26,7 +26,7 @@ OutputVector translate_glu_swiglu(const NodeContext & context) { src1 = context.get_input(1); } else { auto combined = context.get_input(0); - auto split_axis = ov::op::v0::Constant::create(ov::element::i64, {}, {3}); + auto split_axis = ov::op::v0::Constant::create(ov::element::i64, {}, {-1}); auto split = std::make_shared(combined, split_axis, 2); src0 = split->output(0); src1 = split->output(1); diff --git a/ggml/src/ggml-openvino/openvino/op/rope.cpp b/ggml/src/ggml-openvino/openvino/op/rope.cpp index 01bc46131e..44e3368217 100644 --- a/ggml/src/ggml-openvino/openvino/op/rope.cpp +++ b/ggml/src/ggml-openvino/openvino/op/rope.cpp @@ -70,22 +70,16 @@ OutputVector translate_rope(const NodeContext & context) { constexpr int ROPE_TYPE_NORM = 0; if (mode == ROPE_TYPE_NORM) { + auto neg_one = ov::op::v0::Constant::create(ov::element::i64, {1}, {-1}); auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); auto two = ov::op::v0::Constant::create(ov::element::i64, {1}, {2}); auto end = ov::op::v0::Constant::create(ov::element::i64, {1}, {output_shape[3]}); Output even_slice; Output odd_slice; - int32_t unsqueeze_dim = 4; - if (context.is_stateful()) { - unsqueeze_dim = 3; - even_slice = std::make_shared(data_node, zero, end, two, two); - odd_slice = std::make_shared(data_node, one, end, two, two); - } else { - auto three = ov::op::v0::Constant::create(ov::element::i64, {1}, {3}); - even_slice = std::make_shared(data_node, zero, end, two, three); - odd_slice = std::make_shared(data_node, one, end, two, three); - } + int32_t unsqueeze_dim = context.is_stateful() ? 
3 : 4; + even_slice = std::make_shared(data_node, zero, end, two, neg_one); + odd_slice = std::make_shared(data_node, one, end, two, neg_one); Output first_half = std::make_shared(std::make_shared(even_slice, cos_theta_node), @@ -105,7 +99,7 @@ OutputVector translate_rope(const NodeContext & context) { res = std::make_shared(stack, data_shape, false); } else if (mode == ROPE_TYPE_NEOX) { auto data_split = std::make_shared( - data_node, ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {3}), 2); + data_node, ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {-1}), 2); Output slice_data_node_0 = data_split->outputs()[0]; Output slice_data_node_1 = data_split->outputs()[1]; @@ -117,11 +111,7 @@ OutputVector translate_rope(const NodeContext & context) { std::make_shared(slice_data_node_0, sin_theta_node), std::make_shared(slice_data_node_1, cos_theta_node)); - int32_t concat_dim = 3; - if (context.is_stateful()) { - concat_dim = 2; - } - res = std::make_shared(ov::OutputVector{first_half_node, second_half_node}, concat_dim); + res = std::make_shared(ov::OutputVector{first_half_node, second_half_node}, -1); } return rename_outputs_with_suffix({res}, context.get_name()); diff --git a/ggml/src/ggml-openvino/openvino/utils.cpp b/ggml/src/ggml-openvino/openvino/utils.cpp index b7553f99c8..a0215b97b1 100644 --- a/ggml/src/ggml-openvino/openvino/utils.cpp +++ b/ggml/src/ggml-openvino/openvino/utils.cpp @@ -216,7 +216,7 @@ ov::Output process_view_input(const NodeContext & context, int input_i auto begin = ov::op::v0::Constant::create(ov::element::i64, {1}, {split_addr}); auto end = ov::op::v0::Constant::create(ov::element::i64, {1}, {slice_end}); auto stride = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); - auto axes = ov::op::v0::Constant::create(ov::element::i64, {1}, {3}); + auto axes = ov::op::v0::Constant::create(ov::element::i64, {1}, {context.is_stateful() ? 
2 : 3}); auto sliced = std::make_shared(input, begin, end, stride, axes); return sliced; } diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 8c3717472b..edf42cd985 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -497,6 +497,7 @@ ov::Tensor get_ov_input_tensor(std::shared_ptr ggml_decoder, cons ov::Tensor get_ov_input_tensor_static_decode(std::shared_ptr ggml_decoder, const std::string & param_name) { + // NPU decoding stage const auto * ggml_tensor = ggml_decoder->get_input_ggml_tensor(param_name); const auto * op = ggml_decoder->get_tensor_used_op(ggml_tensor); @@ -540,6 +541,7 @@ ov::Tensor get_ov_input_tensor_static_decode(std::shared_ptr ggml ov::Tensor get_ov_input_tensor_static_prefill(std::shared_ptr ggml_decoder, const std::string & param_name, int chunk_index) { + // NPU prompt processing stage const auto * ggml_tensor = ggml_decoder->get_input_ggml_tensor(param_name); const auto * op = ggml_decoder->get_tensor_used_op(ggml_tensor); From 3259921309abf5acd7c79eae047148176751bef6 Mon Sep 17 00:00:00 2001 From: Mustafa Cavus Date: Wed, 21 Jan 2026 15:17:11 -0800 Subject: [PATCH 242/254] Simplification for stateful and update output shape processing --- ggml/src/ggml-openvino/ggml-decoder.cpp | 18 ++++----- ggml/src/ggml-openvino/ggml-decoder.h | 2 +- .../openvino/translate_session.cpp | 25 ++++++++++++ ggml/src/ggml-openvino/utils.cpp | 39 ++++++++----------- 4 files changed, 52 insertions(+), 32 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 01e2c2ff19..2f97af0a3e 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -56,11 +56,11 @@ GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph, m_model_params(model_params), m_compute_params(compute_params) { if (auto * env = getenv("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS"); env && std::string(env) != "0") { - #ifdef _WIN32 - _putenv_s("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS", ""); - #else - unsetenv("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS"); - #endif +#ifdef _WIN32 + _putenv_s("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS", ""); +#else + unsetenv("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS"); +#endif print_tensor_address_map(cgraph); } @@ -106,8 +106,7 @@ GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph, std::map(get_ov_type(src_node), ov::Shape(get_shape(src_node))); + auto param_node = std::make_shared(get_ov_type(src_node), get_shape(src_node)); param_node->set_friendly_name(src_name); param_node->output(0).get_tensor().set_names({src_name}); m_model_inputs[src_name] = param_node; @@ -163,7 +162,8 @@ void GgmlOvDecoder::set_input_output(ggml_tensor * node, bool naive) { // GGML_BACKEND_BUFFER_USAGE_ANY are kv caches if (buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY) { assert(src_name.find("cache_k") == 0 || src_name.find("cache_v") == 0); - if (auto it = std::find(m_model_params.kv_names.begin(), m_model_params.kv_names.end(), src_name); it == m_model_params.kv_names.end()) { + if (auto it = std::find(m_model_params.kv_names.begin(), m_model_params.kv_names.end(), src_name); + it == m_model_params.kv_names.end()) { m_model_params.kv_names.push_back(src_name); if (is_stateful()) { // TODO: The shape modification for stateful model below is not validated for all supported models yet. 
More generic solution might be needed @@ -719,7 +719,7 @@ void print_tensor_address_map(const ggml_cgraph * cgraph) { } } -std::vector GgmlOvDecoder::get_shape(const ggml_tensor * tensor) { +ov::Shape GgmlOvDecoder::get_shape(const ggml_tensor * tensor) { std::vector shape; for (int i = GGML_MAX_DIMS - 1; i >= 0; --i) { shape.push_back(static_cast(tensor->ne[i])); diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index c0d18b7512..f69d187880 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -207,7 +207,7 @@ public: bool m_is_prefill = false; int m_prefill_chunk_size = 0; - static std::vector get_shape(const ggml_tensor * tensor); + static ov::Shape get_shape(const ggml_tensor * tensor); static std::vector get_stride(const ggml_tensor * tensor); static ov::element::Type get_ov_type(const ggml_tensor * tensor); static std::string compute_op_type(const ggml_tensor * node); diff --git a/ggml/src/ggml-openvino/openvino/translate_session.cpp b/ggml/src/ggml-openvino/openvino/translate_session.cpp index adb3025d17..b7e7b58531 100644 --- a/ggml/src/ggml-openvino/openvino/translate_session.cpp +++ b/ggml/src/ggml-openvino/openvino/translate_session.cpp @@ -29,8 +29,10 @@ #include #include #include +#include #include #include +#include namespace ov { namespace frontend { @@ -252,6 +254,29 @@ std::shared_ptr TranslateSession::apply_transformations(std::shared_ptr(); } manager.run_passes(model); + if (ggml_model_decoder->is_stateful()) { + auto output_names = ggml_model_decoder->get_model_output_names(); + std::map model_output_indexes; + for (size_t i=0; iget_output_size(); i++) { + auto output_friendly_name = model->output(i).get_node_shared_ptr()->get_friendly_name(); + auto output_id = model_output_indexes[output_friendly_name]; + auto model_output_shape = model->output(i).get_partial_shape(); + auto decoder_output_shape = ggml_model_decoder->get_output_shape(output_id); + if (model_output_shape.rank().is_static() && decoder_output_shape.rank().is_static() + && model_output_shape.rank().get_length() + 1 == decoder_output_shape.rank().get_length() + && decoder_output_shape[0].is_static() && decoder_output_shape[0].get_length() == 1) { + ppp.output(i).postprocess().custom([](const ov::Output& node) { + auto axes = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{1}, {0}); + return std::make_shared(node, axes); + }); + } + } + model = ppp.build(); + } } return model; } diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index edf42cd985..0c5a520b25 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -103,10 +103,12 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, const std::strin ggml_decoder->add_extra_inputs(); infer_request = infer_request_cache[key]; - auto * inp_pos = get_inp_pos_tensor(cgraph); - int32_t * pos_data = (int32_t *) inp_pos->data; - if (pos_data[0] == 0) { - infer_request->reset_state(); + if (stateful) { + const auto * inp_pos = get_inp_pos_tensor(cgraph); + int32_t * pos_data = (int32_t *) inp_pos->data; + if (pos_data[0] == 0) { + infer_request->reset_state(); + } } decoder_end_time = ggml_time_us(); @@ -118,7 +120,8 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, const std::strin std::shared_ptr model; auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph); - ggml_decoder = std::make_shared(cgraph, m_params, c_params, model_weights, is_static, stateful); + ggml_decoder = + 
std::make_shared(cgraph, m_params, c_params, model_weights, is_static, stateful); decoder_end_time = ggml_time_us(); auto input_model = std::make_shared(ggml_decoder); @@ -351,7 +354,9 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph) { } for (size_t i = 0; i < ov_output_names.size(); i++) { - auto output_tensor = get_ov_output_tensor(ggml_decoder, ov_output_names[i]); + auto * ggml_tensor = ggml_decoder->get_model_outputs().at(ov_output_names[i]); + ov::Tensor output_tensor(infer_request->get_output_tensor(i).get_element_type(), + infer_request->get_output_tensor(i).get_shape(), ggml_tensor->data); infer_request->set_output_tensor(i, output_tensor); } @@ -378,7 +383,9 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph) { } for (size_t i = 0; i < ov_output_names.size(); i++) { - auto output_tensor = get_ov_output_tensor(ggml_decoder, ov_output_names[i]); + auto * ggml_tensor = ggml_decoder->get_model_outputs().at(ov_output_names[i]); + ov::Tensor output_tensor(infer_request->get_output_tensor(i).get_element_type(), + infer_request->get_output_tensor(i).get_shape(), ggml_tensor->data); infer_request->set_output_tensor(i, output_tensor); } @@ -478,7 +485,7 @@ ov::Tensor convert_ggml_input_to_ov(std::shared_ptr ggml_decoder, // This case is added to make test-backend-ops work input_shape = ggml_decoder->get_shape(ggml_tensor->view_src); } else { - input_shape = ggml_decoder->get_shape(ggml_tensor); + input_shape = ggml_decoder->get_shape(ggml_tensor); } auto input_tensor = ov::Tensor(ggml_decoder->get_ov_type(ggml_tensor), input_shape, input_data); return input_tensor; @@ -616,20 +623,8 @@ ov::Tensor get_ov_output_tensor(std::shared_ptr ggml_decoder, con auto output_type = ggml_decoder->get_ov_type(ggml_tensor); auto output_shape = ggml_decoder->get_shape(ggml_tensor); - if (ggml_decoder->is_static() && output_shape[2] == 0) { - output_shape[2] = 1; - } - if (ggml_decoder->is_stateful() && ggml_tensor->flags & GGML_TENSOR_FLAG_OUTPUT) { - std::vector output_shape_3d; - for (size_t i=1; idata); - return output_tensor; - } else { - ov::Tensor output_tensor(output_type, output_shape, ggml_tensor->data); - return output_tensor; - } + ov::Tensor output_tensor(output_type, output_shape, ggml_tensor->data); + return output_tensor; } size_t checksum(const void * data, size_t size) { From 18ab0f562b7424545c71d10bf0efac2344570191 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Tue, 3 Feb 2026 17:39:21 +0800 Subject: [PATCH 243/254] Remove hardcode names --- ggml/src/ggml-openvino/ggml-decoder.cpp | 33 +++++++++---------- ggml/src/ggml-openvino/ggml-decoder.h | 28 ++++++++++++++++ .../src/ggml-openvino/ggml-openvino-extra.cpp | 3 +- ggml/src/ggml-openvino/ggml-openvino.cpp | 4 +-- ggml/src/ggml-openvino/utils.cpp | 18 +++++----- 5 files changed, 56 insertions(+), 30 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 2f97af0a3e..4806b90894 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -161,7 +161,6 @@ void GgmlOvDecoder::set_input_output(ggml_tensor * node, bool naive) { ov::PartialShape stateful_kv_shape; // GGML_BACKEND_BUFFER_USAGE_ANY are kv caches if (buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY) { - assert(src_name.find("cache_k") == 0 || src_name.find("cache_v") == 0); if (auto it = std::find(m_model_params.kv_names.begin(), m_model_params.kv_names.end(), src_name); it == m_model_params.kv_names.end()) { m_model_params.kv_names.push_back(src_name); 
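// Editorial sketch, not part of the patch: with the name assert removed,
// kv-cache inputs are recognized here by buffer usage alone, and elsewhere
// by the consuming op via the is_* predicates this patch adds to
// ggml-decoder.h, e.g. for a tensor `t` consumed by `op`:
//
//     if (GgmlOvDecoder::is_kvcache(t, op)) {
//         // t is the destination cache of a SET_ROWS update (op->src[2])
//     } else if (GgmlOvDecoder::is_kv_idx(t, op)) {
//         // t holds the rows to update (op->src[1])
//     }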
@@ -242,18 +241,18 @@ int GgmlOvDecoder::compute_op_case(const ggml_tensor * node) const { case GGML_OP_PERMUTE: { if (node->src[0]->op != GGML_OP_VIEW) { op_case = 1; - } else if (ggml_is_contiguous(node->src[0])) { + } else if (node->src[0]->src[0]->op == GGML_OP_NONE) { + // kv cache tensor std::string src_name(node->view_src->name); - if (src_name.find("cache") == std::string::npos) { - op_case = 4; + int layer = extract_layer_from_name(src_name); + if (!is_swa_layer(layer)) { + op_case = 2; } else { - int layer = extract_layer_from_name(src_name); - if (!is_swa_layer(layer)) { - op_case = 2; - } else { - op_case = 3; - } + op_case = 3; } + } else if (node->src[0]->src[0]->op == GGML_OP_ROPE || node->src[0]->src[0]->src[0]->op == GGML_OP_ROPE) { + // rope'ed query tensor + op_case = 4; } break; } @@ -383,16 +382,16 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor * op, co auto name = std::string(input->name); ov::PartialShape input_shape; - if ((op->op == GGML_OP_GET_ROWS && op->src[0]->op == GGML_OP_NONE) || op->op == GGML_OP_ROPE) { + if (is_inp_tok(input, op) || is_inp_pos(input, op)) { // tokens or positions int len = m_is_static ? (m_is_prefill ? m_prefill_chunk_size : 1) : -1; input_shape = ov::PartialShape{1, 1, 1, len}; - } else if (op->op == GGML_OP_GET_ROWS) { + } else if (is_output_idx(input, op)) { // output index input_shape = ov::PartialShape{1, 1, 1, m_is_static ? m_compute_params.output_len : -1}; - } else if (op->op == GGML_OP_CPY || op->op == GGML_OP_FLASH_ATTN_EXT) { + } else if (is_inp_mask(input, op)) { // mask if (m_is_static) { input_shape = ov::PartialShape{1, 1, m_is_prefill ? m_prefill_chunk_size : 1, m_model_params.ctx}; @@ -402,7 +401,7 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor * op, co input_shape = ov::PartialShape{-1, 1, -1, -1}; } - } else if (op && op->op == GGML_OP_SET_ROWS && op->src[2] == input) { + } else if (is_kvcache(input, op)) { // kvcache input_shape = ov::PartialShape{get_shape(input)}; if (!m_is_static) { @@ -410,7 +409,7 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor * op, co input_shape[2] = -1; } - } else if (op && op->op == GGML_OP_SET_ROWS && op->src[1] == input) { + } else if (is_kv_idx(input, op)) { // kv update index int len = m_is_static ? (m_is_prefill ? 
m_prefill_chunk_size : 1) : -1; input_shape = ov::PartialShape{1, 1, 1, len}; @@ -490,9 +489,7 @@ const ggml_tensor * GgmlOvDecoder::get_tensor_from_name(const std::string & name std::map GgmlOvDecoder::get_kv_param_res_names() const { std::map kv_param_res_names; for (const auto & name : m_model_params.kv_names) { - if (name.find("cache_k") == 0 || name.find("cache_v") == 0) { - kv_param_res_names[name] = name; - } + kv_param_res_names[name] = name; } return kv_param_res_names; } diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index f69d187880..260cc0cedb 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -213,6 +213,34 @@ public: static std::string compute_op_type(const ggml_tensor * node); void add_extra_inputs(); + inline static bool is_inp_tok(const ggml_tensor * tensor, const ggml_tensor * op) { + return op->op == GGML_OP_GET_ROWS && tensor == op->src[1] && op->src[0]->op == GGML_OP_NONE; + } + + inline static bool is_inp_pos(const ggml_tensor * tensor, const ggml_tensor * op) { + return op->op == GGML_OP_ROPE && tensor == op->src[1]; + } + + inline static bool is_inp_emb(const ggml_tensor * tensor, const ggml_tensor * op) { + return tensor->op == GGML_OP_GET_ROWS && op->op == GGML_OP_RMS_NORM; + } + + inline static bool is_inp_mask(const ggml_tensor * tensor, const ggml_tensor * op) { + return op->op == GGML_OP_CPY || (op->op == GGML_OP_FLASH_ATTN_EXT && tensor == op->src[3]); + } + + inline static bool is_kvcache(const ggml_tensor * tensor, const ggml_tensor * op) { + return op->op == GGML_OP_SET_ROWS && op->src[2] == tensor; + } + + inline static bool is_kv_idx(const ggml_tensor * tensor, const ggml_tensor * op) { + return op->op == GGML_OP_SET_ROWS && op->src[1] == tensor; + } + + inline static bool is_output_idx(const ggml_tensor * tensor, const ggml_tensor * op) { + return op->op == GGML_OP_GET_ROWS && tensor == op->src[1] && op->src[0]->op != GGML_OP_NONE; + } + private: void set_input_output(ggml_tensor * node, bool naive = false); int compute_op_case(const ggml_tensor * node) const; diff --git a/ggml/src/ggml-openvino/ggml-openvino-extra.cpp b/ggml/src/ggml-openvino/ggml-openvino-extra.cpp index 76871cc4be..3b4afbbbce 100644 --- a/ggml/src/ggml-openvino/ggml-openvino-extra.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino-extra.cpp @@ -85,7 +85,8 @@ void ggml_openvino_device_config::init() { // Release the context (queue keeps a reference) clReleaseContext(cl_ctx); } else if (device_name == "NPU") { - remote_context = ov_singleton_core().get_default_context(device_name); + // remote tensor is not used for NPU yet + // remote_context = ov_singleton_core().get_default_context(device_name); } initialized = true; diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp index 8d6a0dbf33..b2d5234083 100644 --- a/ggml/src/ggml-openvino/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino.cpp @@ -139,8 +139,8 @@ static enum ggml_status ggml_backend_openvino_buffer_init_tensor(ggml_backend_bu ggml_backend_openvino_buffer_context * ctx = (ggml_backend_openvino_buffer_context *) buffer->context; // Put kvcache on device memory for GPU (NPU memory is too small even for kvcache) - if (buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY && strncmp(tensor->name, "cache_", 6) == 0 && !ctx->is_remote && - ggml_openvino_get_device_name() == "GPU" && !getenv("GGML_OPENVINO_STATEFUL_EXECUTION")) { + if (strncmp(tensor->name, "cache_", 6) == 0 && !ctx->is_remote && 
ggml_openvino_get_device_name() == "GPU" && + !getenv("GGML_OPENVINO_STATEFUL_EXECUTION")) { GGML_ASSERT(ctx->tensor_extras.empty()); auto device = ctx->device; auto size = ctx->size; diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 0c5a520b25..83d3b3afee 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -508,8 +508,8 @@ ov::Tensor get_ov_input_tensor_static_decode(std::shared_ptr ggml const auto * ggml_tensor = ggml_decoder->get_input_ggml_tensor(param_name); const auto * op = ggml_decoder->get_tensor_used_op(ggml_tensor); - if (param_name == "inp_pos" || param_name == "inp_tokens" || - (op->op == GGML_OP_SET_ROWS && op->src[1] == ggml_tensor)) { + if (GgmlOvDecoder::is_inp_tok(ggml_tensor, op) || GgmlOvDecoder::is_inp_pos(ggml_tensor, op) || + GgmlOvDecoder::is_kv_idx(ggml_tensor, op)) { assert(ggml_tensor->ne[0] == 1); ov::Shape input_shape = {1, 1, 1, 1}; ov::Tensor input_tensor(ggml_decoder->get_ov_type(ggml_tensor), input_shape); @@ -523,7 +523,7 @@ ov::Tensor get_ov_input_tensor_static_decode(std::shared_ptr ggml return input_tensor; } - if (param_name == "inp_out_ids") { + if (GgmlOvDecoder::is_output_idx(ggml_tensor, op)) { ov::Shape input_shape = {1, 1, 1, 1}; ov::Tensor input_tensor(ggml_decoder->get_ov_type(ggml_tensor), input_shape); int32_t inp_out_id = *((int32_t *) ggml_tensor->data); @@ -533,7 +533,7 @@ ov::Tensor get_ov_input_tensor_static_decode(std::shared_ptr ggml return input_tensor; } - if (param_name.find("self_kq_mask") == 0) { + if (GgmlOvDecoder::is_inp_mask(ggml_tensor, op)) { size_t context_size = ggml_decoder->get_ctx_size(); std::vector padded_data = pad_input(ggml_tensor, 1, context_size, -INFINITY); ov::Tensor input_tensor(ov::element::f32, ov::Shape{1, 1, 1, context_size}); @@ -557,8 +557,8 @@ ov::Tensor get_ov_input_tensor_static_prefill(std::shared_ptr ggm const size_t chunk_valid_size = std::min(chunk_size, input_len - chunk_index * chunk_size); const size_t chunk_pad_size = chunk_size - chunk_valid_size; - if (param_name == "inp_pos" || param_name == "inp_tokens" || - (op->op == GGML_OP_SET_ROWS && op->src[1] == ggml_tensor)) { + if (GgmlOvDecoder::is_inp_tok(ggml_tensor, op) || GgmlOvDecoder::is_inp_pos(ggml_tensor, op) || + GgmlOvDecoder::is_kv_idx(ggml_tensor, op)) { ov::Shape input_shape = {1, 1, 1, chunk_size}; ov::Tensor input_tensor(ggml_decoder->get_ov_type(ggml_tensor), input_shape); // copy the chunk_index-th chunk from ggml_tensor @@ -585,7 +585,7 @@ ov::Tensor get_ov_input_tensor_static_prefill(std::shared_ptr ggm return input_tensor; } - if (param_name == "inp_out_ids") { + if (GgmlOvDecoder::is_output_idx(ggml_tensor, op)) { size_t output_len = ggml_decoder->get_compute_params().output_len; ov::Shape input_shape = {1, 1, 1, output_len}; ov::Tensor input_tensor(ggml_decoder->get_ov_type(ggml_tensor), input_shape); @@ -600,7 +600,7 @@ ov::Tensor get_ov_input_tensor_static_prefill(std::shared_ptr ggm return input_tensor; } - if (param_name.find("self_kq_mask") == 0) { + if (GgmlOvDecoder::is_inp_mask(ggml_tensor, op)) { size_t cols = ggml_tensor->ne[0]; size_t rows = ggml_tensor->ne[1]; float * ggml_data = (float *) ggml_tensor->data + chunk_index * chunk_size * cols; @@ -748,7 +748,7 @@ const ggml_tensor * get_inp_pos_tensor(ggml_cgraph * cgraph) { if (src == nullptr) { break; } - if (std::string(src->name) == "inp_pos") { + if (GgmlOvDecoder::is_inp_pos(src, op)) { return src; } } From b6c0697d10c46450b208c34fa15d835f99862042 Mon Sep 17 00:00:00 2001 From: "Yu, 
Zijun" Date: Wed, 4 Feb 2026 16:58:39 +0800 Subject: [PATCH 244/254] Avoid re-compilation in llama-bench --- ggml/include/ggml-openvino.h | 2 ++ ggml/src/ggml-openvino/ggml-decoder.cpp | 16 ++++++++-- ggml/src/ggml-openvino/ggml-decoder.h | 20 ++++++++----- ggml/src/ggml-openvino/ggml-openvino.cpp | 16 ++++++++++ ggml/src/ggml-openvino/utils.cpp | 38 ++++++++++-------------- ggml/src/ggml-openvino/utils.h | 25 +++++++++++----- 6 files changed, 78 insertions(+), 39 deletions(-) diff --git a/ggml/include/ggml-openvino.h b/ggml/include/ggml-openvino.h index 46c1485f66..b68b55d1e8 100644 --- a/ggml/include/ggml-openvino.h +++ b/ggml/include/ggml-openvino.h @@ -24,6 +24,8 @@ GGML_BACKEND_API bool ggml_backend_buft_is_openvino(ggml_backend_buffer_type_t b GGML_BACKEND_API bool ggml_backend_buft_is_openvino_host(ggml_backend_buffer_type_t buft); +GGML_BACKEND_API size_t ggml_backend_openvino_buffer_get_ctx_id(ggml_backend_buffer_t buffer); + // device buffer GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_openvino_buffer_type(int device); diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 4806b90894..f7052bfc82 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -79,6 +79,17 @@ GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph, add_extra_inputs(); } +void GgmlOvDecoder::update_io(ggml_cgraph * cgraph) { + m_cgraph = cgraph; + m_model_inputs.clear(); + m_model_outputs.clear(); + m_node_info_list.clear(); + for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) { + auto * cur_node = cgraph->nodes[node_n]; + set_input_output(cur_node); + } +} + GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph, std::map> & model_weights) { m_cgraph = cgraph; m_model_weights = model_weights; @@ -330,6 +341,7 @@ std::pair GgmlOvDecoder::compute_llm_params(ggml_cgr auto * mask = node->src[3]; std::string mask_name(mask->name); + model_params.kv_buffer_ctx_id = ggml_backend_openvino_buffer_get_ctx_id(cache_k->buffer); if (mask_name.find("swa") != std::string::npos) { model_params.swa_layers.push_back(layer); model_params.ctx_per_seq_swa = cache_k->ne[1]; @@ -358,7 +370,7 @@ std::pair GgmlOvDecoder::compute_llm_params(ggml_cgr break; } if (node->op == GGML_OP_ROPE) { - model_params.rope_params = node->op_params; + memcpy(model_params.rope_params, node->op_params, sizeof(int32_t) * 15); } } auto * output_tensor = cgraph->nodes[cgraph->n_nodes - 1]; @@ -405,7 +417,7 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor * op, co // kvcache input_shape = ov::PartialShape{get_shape(input)}; if (!m_is_static) { - // do not fix ctx size to make llama-bench work + // do not fix ctx size to make llama-bench work across test params input_shape[2] = -1; } diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index 260cc0cedb..c8e3edeaf8 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -5,6 +5,7 @@ #include "openvino/decoder.hpp" #include +#include #include #include #include @@ -20,20 +21,21 @@ struct ModelParams { int n_heads = -1; int n_heads_kv = -1; int head_size = -1; - int32_t * rope_params = nullptr; + int32_t rope_params[15]; std::vector swa_layers; std::vector kv_names; + size_t kv_buffer_ctx_id = 0; - bool operator==(const ModelParams & other) const { - return n_seq == other.n_seq && n_heads == other.n_heads && n_heads_kv == other.n_heads_kv && - head_size == other.head_size && rope_params == 
other.rope_params && swa_layers == other.swa_layers && - ctx_per_seq == other.ctx_per_seq && ctx_per_seq_swa == other.ctx_per_seq_swa; + bool same_rope_params(const ModelParams & other) const { + return memcmp(rope_params, other.rope_params, sizeof(int32_t) * 15) == 0; } - bool can_reuse_dynamically(const ModelParams & other) const { return *this == other; } + bool can_reuse_dynamically(const ModelParams & other) const { return same_rope_params(other); } - bool can_reuse_statically(const ModelParams & other) const { return *this == other; } + bool can_reuse_statically(const ModelParams & other) const { return same_rope_params(other) && ctx == other.ctx; } + + bool kv_buffer_changed(const ModelParams & other) const { return kv_buffer_ctx_id != other.kv_buffer_ctx_id; } }; struct ComputeParams { @@ -170,7 +172,7 @@ public: int get_input_len() const { return m_compute_params.input_len; } - virtual int32_t * get_rope_params() const override { return m_model_params.rope_params; } + virtual int32_t * get_rope_params() const override { return const_cast(m_model_params.rope_params); } virtual std::map get_kv_param_res_names() const override; @@ -213,6 +215,8 @@ public: static std::string compute_op_type(const ggml_tensor * node); void add_extra_inputs(); + void update_io(ggml_cgraph * cgraph); + inline static bool is_inp_tok(const ggml_tensor * tensor, const ggml_tensor * op) { return op->op == GGML_OP_GET_ROWS && tensor == op->src[1] && op->src[0]->op == GGML_OP_NONE; } diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp index b2d5234083..87577dde9c 100644 --- a/ggml/src/ggml-openvino/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino.cpp @@ -8,6 +8,7 @@ #include "ggml-quants.hpp" #include "ggml.h" +#include #include #include #include @@ -53,6 +54,7 @@ struct ggml_backend_openvino_buffer_context { int device; std::string name; + size_t id; // For non-weight buffers (KV cache, compute), we still use contiguous allocation void * data; @@ -71,6 +73,10 @@ struct ggml_backend_openvino_buffer_context { ggml_backend_openvino_buffer_context(int device, size_t size, bool is_remote = false) : device(device), name(std::string(GGML_OPENVINO_NAME) + std::to_string(device)), + id([]() { + static std::atomic next_id{1}; + return next_id.fetch_add(1); + }()), data(nullptr), size(size), is_remote(is_remote) { @@ -107,6 +113,8 @@ struct ggml_backend_openvino_buffer_context { ~ggml_backend_openvino_buffer_context() { // Clean up all tensor extras + GGML_LOG_DEBUG("Deleting OpenVINO buffer context #%zu for device %d, size %zu MB\n", id, device, + size / 1024 / 1024); for (auto & pair : tensor_extras) { delete pair.second; } @@ -587,6 +595,14 @@ bool ggml_backend_buffer_is_openvino(ggml_backend_buffer_t buffer) { return buffer->iface.free_buffer == ggml_backend_openvino_buffer_free_buffer; } +size_t ggml_backend_openvino_buffer_get_ctx_id(ggml_backend_buffer_t buffer) { + if (!ggml_backend_buffer_is_openvino(buffer)) { + return 0; + } + ggml_backend_openvino_buffer_context * ctx = (ggml_backend_openvino_buffer_context *) buffer->context; + return ctx->id; +} + bool ggml_backend_buft_is_openvino(ggml_backend_buffer_type_t buft) { return buft->iface.get_name == ggml_backend_openvino_buffer_type_get_name; } diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 83d3b3afee..69cac19019 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -76,7 +76,7 @@ enum ggml_status 
ov_graph_compute_dynamic(ggml_cgraph * cgraph, const std::strin ComputeParams c_params; std::tie(m_params, c_params) = GgmlOvDecoder::compute_llm_params(cgraph, is_static); - const auto key = compute_graph_key(cgraph); + graph_key key(cgraph); bool cache_hit; int64_t decoder_end_time; @@ -90,19 +90,22 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, const std::strin auto it = decoder_cache.find(key); cache_hit = it != decoder_cache.end(); + ModelParams old_m_params; if (cache_hit) { ggml_decoder = it->second; - cache_hit = ggml_decoder->get_model_params().can_reuse_dynamically(m_params); + old_m_params = ggml_decoder->get_model_params(); + cache_hit = old_m_params.can_reuse_dynamically(m_params); } if (cache_hit) { std::map> model_weights; - ggml_decoder = decoder_cache[key]; ggml_decoder->set_compute_params(c_params); ggml_decoder->set_model_params(m_params); + if (old_m_params.kv_buffer_changed(m_params)) { + ggml_decoder->update_io(cgraph); + } ggml_decoder->add_extra_inputs(); - infer_request = infer_request_cache[key]; - + infer_request = infer_request_cache.at(key); if (stateful) { const auto * inp_pos = get_inp_pos_tensor(cgraph); int32_t * pos_data = (int32_t *) inp_pos->data; @@ -240,7 +243,7 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph) { const auto * inp_pos = get_inp_pos_tensor(cgraph); const auto is_prefill = get_is_prefill(inp_pos); - const auto key = compute_graph_key(cgraph); + graph_key key(cgraph); bool cache_hit; int64_t decoder_end_time; @@ -254,19 +257,23 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph) { auto it = decoder_cache.find(key); cache_hit = it != decoder_cache.end(); + ModelParams old_m_params; if (cache_hit) { ggml_decoder = it->second; - cache_hit = ggml_decoder->get_model_params().can_reuse_statically(m_params); + old_m_params = ggml_decoder->get_model_params(); + cache_hit = old_m_params.can_reuse_statically(m_params); } if (cache_hit) { std::map> model_weights; - ggml_decoder = decoder_cache[key]; ggml_decoder->m_is_prefill = is_prefill; ggml_decoder->set_model_params(m_params); ggml_decoder->set_compute_params(c_params); + if (old_m_params.kv_buffer_changed(m_params)) { + ggml_decoder->update_io(cgraph); + } ggml_decoder->add_extra_inputs(); - infer_request = is_prefill ? infer_request_cache_prefill[key] : infer_request_cache[key]; + infer_request = is_prefill ? 
infer_request_cache_prefill.at(key) : infer_request_cache.at(key); decoder_end_time = ggml_time_us(); conversion_end_time = decoder_end_time; @@ -761,17 +768,4 @@ bool get_is_prefill(const ggml_tensor * inp_pos) { return inp_pos->ne[0] > 1; } -graph_key compute_graph_key(ggml_cgraph * cgraph) { - graph_key key; - key.n_nodes = cgraph->n_nodes; - - for (int i = 0; i < cgraph->n_nodes; ++i) { - const auto * node = cgraph->nodes[i]; - if (node->op == GGML_OP_SET_ROWS && strncmp(node->src[2]->name, "cache_k_l0", 10) == 0) { - key.cache_k_l0 = node->src[2]; - } - } - return key; -} - #pragma GCC diagnostic pop diff --git a/ggml/src/ggml-openvino/utils.h b/ggml/src/ggml-openvino/utils.h index 72ef904f74..7c403b7d89 100644 --- a/ggml/src/ggml-openvino/utils.h +++ b/ggml/src/ggml-openvino/utils.h @@ -5,20 +5,33 @@ #include #include #include +#include struct graph_key { - size_t n_nodes; - void * cache_k_l0; + int n_nodes; + std::string first_node_name; + std::string last_node_name; + + graph_key(const ggml_cgraph * cgraph) : n_nodes(cgraph->n_nodes) { + if (n_nodes > 0) { + first_node_name = cgraph->nodes[0]->name; + last_node_name = cgraph->nodes[n_nodes - 1]->name; + } + } bool operator==(const graph_key & other) const { - return n_nodes == other.n_nodes && cache_k_l0 == other.cache_k_l0; + return n_nodes == other.n_nodes && first_node_name == other.first_node_name && + last_node_name == other.last_node_name; } }; struct graph_key_hash { size_t operator()(const graph_key & key) const { - size_t h = std::hash{}(key.n_nodes); - h ^= std::hash{}(key.cache_k_l0) + 0x9e3779b9 + (h << 6) + (h >> 2); + size_t h = std::hash{}(key.n_nodes); + if (key.n_nodes > 0) { + h ^= std::hash{}(key.first_node_name) + 0x9e3779b9 + (h << 6) + (h >> 2); + h ^= std::hash{}(key.last_node_name) + 0x9e3779b9 + (h << 6) + (h >> 2); + } return h; } }; @@ -66,8 +79,6 @@ const ggml_tensor * get_inp_pos_tensor(struct ggml_cgraph * cgraph); bool get_is_prefill(const ggml_tensor * inp_pos); -graph_key compute_graph_key(struct ggml_cgraph * cgraph); - ov::Tensor get_ov_input_tensor(std::shared_ptr ggml_decoder, const std::string & param_name); ov::Tensor get_ov_input_tensor_static_decode(std::shared_ptr ggml_decoder, const std::string & param_name); From 0ee7e0548555c5230cfd4dd5f46536b8a6fb3568 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Thu, 5 Feb 2026 11:12:50 +0800 Subject: [PATCH 245/254] Extract zp directly instead of bias --- ggml/src/ggml-openvino/ggml-decoder.cpp | 35 +- .../src/ggml-openvino/ggml-openvino-extra.cpp | 29 +- ggml/src/ggml-openvino/ggml-openvino-extra.h | 21 +- ggml/src/ggml-openvino/ggml-openvino.cpp | 13 +- ggml/src/ggml-openvino/ggml-quants.cpp | 389 +++++++++--------- ggml/src/ggml-openvino/ggml-quants.hpp | 86 ++-- 6 files changed, 295 insertions(+), 278 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index f7052bfc82..d8d71cf25e 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -508,10 +508,10 @@ std::map GgmlOvDecoder::get_kv_param_res_names() const std::map> GgmlOvDecoder::create_weight_nodes(ggml_cgraph * cgraph) { std::map> model_weights; - static std::mutex weights_mutex; + // static std::mutex weights_mutex; auto * nodes = cgraph->nodes; auto n_nodes = cgraph->n_nodes; - std::for_each(std::execution::par, nodes, nodes + n_nodes, [&](ggml_tensor * node) { + std::for_each(std::execution::seq, nodes, nodes + n_nodes, [&](ggml_tensor * node) { for (int i = 0; i < GGML_MAX_SRC; i++) { 
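// Walk every source operand of the node: weight tensors (buffer usage
// WEIGHTS) and quantized tensors are converted to OpenVINO constants once
// and memoized in model_weights under their tensor name.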
auto * src = node->src[i]; if (src == nullptr) { @@ -522,21 +522,26 @@ std::map> GgmlOvDecoder::create_weight_no if (!src->view_src) { ggml_backend_buffer * buffer = src->buffer; if (buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS || ggml_is_quantized(src->type)) { - bool should_create = false; - { - std::lock_guard lock(weights_mutex); - if (model_weights.find(src_name) == model_weights.end()) { - model_weights[src_name] = nullptr; - should_create = true; - } - } - if (should_create) { + // bool should_create = false; + // { + // std::lock_guard lock(weights_mutex); + // if (model_weights.find(src_name) == model_weights.end()) { + // model_weights[src_name] = nullptr; + // should_create = true; + // } + // } + // if (should_create) { + // auto weight_node = create_weight_node(src); + // weight_node->set_friendly_name(src_name); + // { + // std::lock_guard lock(weights_mutex); + // model_weights[src_name] = weight_node; + // } + // } + if (model_weights.find(src_name) == model_weights.end()) { auto weight_node = create_weight_node(src); weight_node->set_friendly_name(src_name); - { - std::lock_guard lock(weights_mutex); - model_weights[src_name] = weight_node; - } + model_weights[src_name] = weight_node; } } } diff --git a/ggml/src/ggml-openvino/ggml-openvino-extra.cpp b/ggml/src/ggml-openvino/ggml-openvino-extra.cpp index 3b4afbbbce..4584dc38d0 100644 --- a/ggml/src/ggml-openvino/ggml-openvino-extra.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino-extra.cpp @@ -209,12 +209,12 @@ ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_ten layout.is_requant = true; layout.requant_type = requant_type; - // Special case: requant to F16 - just store F16 weights, no scales/biases + // Special case: requant to F16 - just store F16 weights, no scales/zp if (requant_type.value() == ExtraQuantType::F16) { layout.weights_size = n_elements * sizeof(uint16_t); // F16 = 2 bytes layout.total_size = layout.weights_size; layout.weights_offset = 0; - // No scales/biases for F16 + // No scales/zp for F16 return layout; } @@ -255,14 +255,15 @@ ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_ten layout.weights_size = layout.is_u4 ? (n_elements / 2) : n_elements; int64_t n_blocks = n_elements / layout.weights_per_block; layout.scales_size = n_blocks * sizeof(uint16_t); - // For symmetric quantization, we only need one bias value (not one per block) - layout.biases_size = layout.is_symmetric ? sizeof(uint16_t) : n_blocks * sizeof(uint16_t); + // For symmetric quantization, we only need one zp value (not one per block) + // Zero points are stored in U4 or U8 format matching the weight type + size_t n_zp_elements = layout.is_symmetric ? 1 : n_blocks; + layout.zp_size = layout.is_u4 ? ((n_zp_elements + 1) / 2) : n_zp_elements; layout.weights_offset = 0; layout.scales_offset = ((layout.weights_size + alignment - 1) / alignment) * alignment; - layout.biases_offset = - layout.scales_offset + ((layout.scales_size + alignment - 1) / alignment) * alignment; - layout.total_size = layout.biases_offset + layout.biases_size; + layout.zp_offset = layout.scales_offset + ((layout.scales_size + alignment - 1) / alignment) * alignment; + layout.total_size = layout.zp_offset + layout.zp_size; layout.total_size = std::max(layout.total_size, ggml_nbytes(tensor)); return layout; } @@ -305,17 +306,19 @@ ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_ten // Weights: U4 = n_elements/2 bytes, U8 = n_elements bytes layout.weights_size = layout.is_u4 ? 
(n_elements / 2) : n_elements; - // Scales and biases: F16 per block + // Scales: F16 per block int64_t n_blocks = n_elements / layout.weights_per_block; layout.scales_size = n_blocks * sizeof(uint16_t); // F16 = 2 bytes - // For symmetric quantization, we only need one bias value (not one per block) - layout.biases_size = layout.is_symmetric ? sizeof(uint16_t) : n_blocks * sizeof(uint16_t); + // Zero points: U4 or U8 matching weight type + // For symmetric quantization, we only need one zp value (not one per block) + size_t n_zp_elements = layout.is_symmetric ? 1 : n_blocks; + layout.zp_size = layout.is_u4 ? ((n_zp_elements + 1) / 2) : n_zp_elements; - // Layout in buffer: [weights | scales | biases] with alignment + // Layout in buffer: [weights | scales | zp] with alignment layout.weights_offset = 0; layout.scales_offset = ((layout.weights_size + alignment - 1) / alignment) * alignment; - layout.biases_offset = layout.scales_offset + ((layout.scales_size + alignment - 1) / alignment) * alignment; - layout.total_size = layout.biases_offset + layout.biases_size; + layout.zp_offset = layout.scales_offset + ((layout.scales_size + alignment - 1) / alignment) * alignment; + layout.total_size = layout.zp_offset + layout.zp_size; return layout; } diff --git a/ggml/src/ggml-openvino/ggml-openvino-extra.h b/ggml/src/ggml-openvino/ggml-openvino-extra.h index e2c5a8ceea..726a90abb0 100644 --- a/ggml/src/ggml-openvino/ggml-openvino-extra.h +++ b/ggml/src/ggml-openvino/ggml-openvino-extra.h @@ -110,16 +110,19 @@ struct ggml_openvino_weight_extra : public ggml_openvino_extra_base { : ggml_openvino_extra_base(Type::WEIGHT), constant(std::move(c)) {} }; -// Extra data for quantized weight tensors - stores extracted weights/scales/biases and ov::Constant +// Extra data for quantized weight tensors - stores extracted weights/scales/zp and ov::Constant struct ggml_openvino_quantized_weight_extra : public ggml_openvino_extra_base { ov::Tensor weights; // U4 or U8 extracted weights ov::Tensor scales; // F16 scales - ov::Tensor biases; // F16 biases (zero points) + ov::Tensor zp; // U4 or U8 zero points (same type as weights) std::shared_ptr constant; // Pre-built OpenVINO weight subgraph - ggml_openvino_quantized_weight_extra(ov::Tensor w, ov::Tensor s, ov::Tensor b, std::shared_ptr c) - : ggml_openvino_extra_base(Type::QUANTIZED_WEIGHT), - weights(std::move(w)), scales(std::move(s)), biases(std::move(b)), constant(std::move(c)) {} + ggml_openvino_quantized_weight_extra(ov::Tensor w, ov::Tensor s, ov::Tensor z, std::shared_ptr c) : + ggml_openvino_extra_base(Type::QUANTIZED_WEIGHT), + weights(std::move(w)), + scales(std::move(s)), + zp(std::move(z)), + constant(std::move(c)) {} }; // Extra data for KV cache / compute tensors - stores ov::Tensor for infer_request @@ -133,7 +136,7 @@ struct ggml_openvino_tensor_extra : public ggml_openvino_extra_base { // ===================================================== // Extracted Size Calculation for Quantized Tensors // ===================================================== -// For quantized tensors, we need extra space to store extracted weights, scales, and biases. +// For quantized tensors, we need extra space to store extracted weights, scales, and zero points. // Returns the total size needed in the buffer for extracted data. 
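// Worked example (editorial, assuming a 64-byte alignment and symmetric
// handling of Q4_0): a 4096 x 4096 Q4_0 tensor has n_elements = 16,777,216
// and 32 weights per block, so
//   weights_size = n_elements / 2  = 8,388,608 bytes (U4)
//   n_blocks     = n_elements / 32 = 524,288
//   scales_size  = n_blocks * 2    = 1,048,576 bytes (F16)
//   zp_size      = 1 byte          (one packed U4 zero point)
// giving scales_offset = 8,388,608, zp_offset = 9,437,184 and total_size =
// 9,437,185 bytes, marginally above the 9,437,184 bytes of raw Q4_0 data.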
struct ggml_openvino_extracted_layout { @@ -142,10 +145,10 @@ struct ggml_openvino_extracted_layout { size_t weights_size; // Size of weights in bytes size_t scales_offset; // Offset to scales in buffer size_t scales_size; // Size of scales in bytes - size_t biases_offset; // Offset to biases in buffer - size_t biases_size; // Size of biases in bytes + size_t zp_offset; // Offset to zero points in buffer + size_t zp_size; // Size of zero points in bytes (U4 or U8) bool is_u4; // true for U4 weights, false for U8 - int64_t weights_per_block;// weights per scale/bias block + int64_t weights_per_block; // weights per scale/zp block bool is_symmetric; // true for symmetric quantization // Requantization info diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp index 87577dde9c..e531a9c036 100644 --- a/ggml/src/ggml-openvino/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino.cpp @@ -259,13 +259,15 @@ static void ggml_backend_openvino_buffer_set_tensor(ggml_backend_buffer_t buffer ov::Shape weight_shape = {static_cast(tensor->ne[1]), static_cast(tensor->ne[0])}; ov::Shape scale_shape = {static_cast(tensor->ne[1]), static_cast(tensor->ne[0] / layout.weights_per_block)}; + // zp shape: scalar for symmetric, per-block for asymmetric + ov::Shape zp_shape = layout.is_symmetric ? ov::Shape{} : scale_shape; ov::Tensor weights(weight_type, weight_shape, buf_base + layout.weights_offset); ov::Tensor scales(ov::element::f16, scale_shape, buf_base + layout.scales_offset); - ov::Tensor biases(ov::element::f16, scale_shape, buf_base + layout.biases_offset); + ov::Tensor zp(weight_type, zp_shape, buf_base + layout.zp_offset); auto * extra = new ggml_openvino_quantized_weight_extra(std::move(weights), std::move(scales), - std::move(biases), constant); + std::move(zp), constant); ctx->tensor_extras[tensor] = extra; tensor->extra = extra; @@ -487,10 +489,9 @@ static size_t ggml_backend_openvino_buffer_type_get_alloc_size(ggml_backend_buff if (ggml_is_quantized(tensor->type) && tensor->ne[2] == 1 && tensor->ne[3] == 1) { ggml_openvino_extracted_layout layout = ggml_openvino_get_extracted_layout(tensor); if (layout.total_size > 0) { - GGML_LOG_DEBUG( - "%s: tensor %s needs %zu bytes (original %zu, extracted: weights=%zu scales=%zu biases=%zu)\n", - __func__, tensor->name, layout.total_size, ggml_nbytes(tensor), layout.weights_size, layout.scales_size, - layout.biases_size); + GGML_LOG_DEBUG("%s: tensor %s needs %zu bytes (original %zu, extracted: weights=%zu scales=%zu zp=%zu)\n", + __func__, tensor->name, layout.total_size, ggml_nbytes(tensor), layout.weights_size, + layout.scales_size, layout.zp_size); return layout.total_size; } } diff --git a/ggml/src/ggml-openvino/ggml-quants.cpp b/ggml/src/ggml-openvino/ggml-quants.cpp index 8946b73a56..2de0494c91 100644 --- a/ggml/src/ggml-openvino/ggml-quants.cpp +++ b/ggml/src/ggml-openvino/ggml-quants.cpp @@ -42,80 +42,97 @@ void unpack_32_4(const uint8_t * data, uint8_t * dst) { } } -// Extracts (weight, scales, biases) from Q4_0 tensors. +// Extracts (weight, scales, zp) from Q4_0 tensors. // Data layout is: |16 bit scale|32 x 4bit weights|. 
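// Equivalence note (editorial): GGML dequantizes Q4_0 as w = d * (q - 8)
// with q in [0, 15]. The previous extraction produced w = d * q + b with
// b = -8 * d; this patch produces w = d * (q - zp) with zp = 8. Both expand
// to w = d * q - 8 * d, so the unpacked U4 weights are identical and only
// the constant fed to the decompression subgraph changes.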
void extract_q4_0_data(const ggml_tensor * tensor, ov::Tensor & weights_arr, ov::Tensor & scales_arr, - ov::Tensor & biases_arr) { + ov::Tensor & zp_arr) { const uint64_t bytes_per_block = 18; // 2 bytes scale, 32x0.5 byte weights auto * data = static_cast(tensor->data); auto * weights = static_cast(weights_arr.data()); auto * scales = scales_arr.data::value_type>(); - auto * biases = biases_arr.data::value_type>(); + auto * zp = static_cast(zp_arr.data()); - bool is_scalar_bias = (biases_arr.get_size() == 1); // Symmetric quantization + bool is_scalar_zp = (zp_arr.get_size() == 1); // Symmetric quantization + + // For Q4_0, zero point is always 8 + if (is_scalar_zp) { + zp[0] = 8 | (8 << 4); // Pack two 4-bit values + } ov::parallel_for(scales_arr.get_size(), [&](size_t i) { scales[i] = ov::float16::from_bits(*((uint16_t *) (data + i * bytes_per_block))); - // For symmetric quantization, only write the first bias (all blocks share the same bias relationship) - if (is_scalar_bias) { - if (i == 0) { - biases[0] = ov::float16(-8.f * static_cast(scales[0])); + // For asymmetric quantization, compute per-block zero points + if (!is_scalar_zp) { + // Pack two 4-bit zero points per byte + if (i % 2 == 0) { + zp[i / 2] = 8; // Lower nibble + } else { + zp[i / 2] |= (8 << 4); // Upper nibble } - } else { - biases[i] = ov::float16(-8.f * static_cast(scales[i])); } unpack_32_4(data + i * bytes_per_block + 2, weights + i * 16); }); } -// Extracts (weight, scales, biases) from Q4_1 tensors. -// Data layout is: |16 bit scale|16 bit bias|32 x 4bit weights|. +// Extracts (weight, scales, zp) from Q4_1 tensors. +// Data layout is: |16 bit scale|16 bit min|32 x 4bit weights|. void extract_q4_1_data(const ggml_tensor * tensor, ov::Tensor & weights_arr, ov::Tensor & scales_arr, - ov::Tensor & biases_arr) { - const uint64_t bytes_per_block = 20; // 2 bytes scale, 2 bytes bias, 32x0.5 byte weights + ov::Tensor & zp_arr) { + const uint64_t bytes_per_block = 20; // 2 bytes scale, 2 bytes min, 32x0.5 byte weights auto * data = static_cast(tensor->data); auto * weights = static_cast(weights_arr.data()); auto * scales = scales_arr.data::value_type>(); - auto * biases = biases_arr.data::value_type>(); + auto * zp = static_cast(zp_arr.data()); + ov::parallel_for(scales_arr.get_size(), [&](size_t i) { - scales[i] = ov::float16::from_bits(*((uint16_t *) (data + i * bytes_per_block))); - biases[i] = ov::float16::from_bits(*((uint16_t *) (data + i * bytes_per_block + 2))); + float scale = static_cast(ov::float16::from_bits(*((uint16_t *) (data + i * bytes_per_block)))); + float min = static_cast(ov::float16::from_bits(*((uint16_t *) (data + i * bytes_per_block + 2)))); + scales[i] = ov::float16(scale); + // zp = -min / scale (bias = min, so zp = -bias/scale) + uint8_t zp_val = (scale != 0.0f) ? (uint8_t) std::round(-min / scale) : 0; + // Pack two 4-bit zero points per byte + if (i % 2 == 0) { + zp[i / 2] = zp_val & 0x0F; // Lower nibble + } else { + zp[i / 2] |= (zp_val << 4); // Upper nibble + } unpack_32_4(data + i * bytes_per_block + 4, weights + i * 16); }); } -// Extracts (weight, scales, biases) from Q8_0 tensors. +// Extracts (weight, scales, zp) from Q8_0 tensors. // Data layout is: |16 bit scale|32 x 8bit weights|. 
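// Editorial note: GGML stores Q8_0 weights as signed int8 with w = d * q,
// q in [-128, 127]. The extraction shifts them into unsigned range,
// u = q + 128, and records zp = 128, so w = d * (u - zp) recovers the same
// value; the old code expressed the same shift as a bias of -128 * d.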
void extract_q8_0_data(const ggml_tensor * tensor, ov::Tensor & weights_arr, ov::Tensor & scales_arr, - ov::Tensor & biases_arr) { + ov::Tensor & zp_arr) { const uint64_t weights_per_block = 32; const uint64_t bytes_per_block = 34; // 2 bytes scale, 32x1 byte weights auto * data = static_cast(tensor->data); auto * weights = static_cast(weights_arr.data()); auto * scales = scales_arr.data::value_type>(); - auto * biases = biases_arr.data::value_type>(); + auto * zp = static_cast(zp_arr.data()); - bool is_scalar_bias = (biases_arr.get_size() == 1); // Symmetric quantization + bool is_scalar_zp = (zp_arr.get_size() == 1); // Symmetric quantization + + // For Q8_0, zero point is always 128 + if (is_scalar_zp) { + zp[0] = 128; + } ov::parallel_for(scales_arr.get_size(), [&](size_t i) { uint8_t * block_data = data + i * bytes_per_block; scales[i] = ov::float16::from_bits(*(uint16_t *) block_data); - // For symmetric quantization, only write the first bias (all blocks share the same bias relationship) - if (is_scalar_bias) { - if (i == 0) { - biases[0] = ov::float16(-128.f * static_cast(scales[0])); - } - } else { - biases[i] = ov::float16(-128.f * static_cast(scales[i])); + // For asymmetric quantization, store per-block zero points + if (!is_scalar_zp) { + zp[i] = 128; } for (size_t j = 0; j < weights_per_block; ++j) { uint8_t x = block_data[j + 2]; // j+2 to skip the scale bytes. @@ -147,51 +164,60 @@ void unpack_256_4(const uint8_t * data, uint8_t * dst) { void extract_q4_k_data(const ggml_tensor * tensor, ov::Tensor & weights_arr, ov::Tensor & scales_arr, - ov::Tensor & biases_arr) { + ov::Tensor & zp_arr) { const uint64_t bytes_per_block = 2 + 2 + 12 + 128; const uint64_t n_super_block = tensor->nb[3] / bytes_per_block; auto * data = static_cast(tensor->data); auto * weights = static_cast(weights_arr.data()); auto * scales = scales_arr.data::value_type>(); - auto * biases = biases_arr.data::value_type>(); + auto * zp = static_cast(zp_arr.data()); ov::parallel_for(n_super_block, [&](size_t i) { uint8_t * block_data = data + i * bytes_per_block; // Extract scale factors and offsets float scale_scales = static_cast(ov::float16::from_bits(*((uint16_t *) block_data))); - float scale_biases = static_cast(ov::float16::from_bits(*((uint16_t *) block_data + 1))); + float scale_mins = static_cast(ov::float16::from_bits(*((uint16_t *) block_data + 1))); // Extract qs1 and qs2 uint8_t * qs1 = block_data + 4; - // uint8_t* qs2 = block_data + 16; - scales[i * 8] = ov::float16(scale_scales * static_cast((*(qs1) & 0b111111))); - scales[i * 8 + 1] = ov::float16(scale_scales * static_cast((*(qs1 + 1) & 0b111111))); - scales[i * 8 + 2] = ov::float16(scale_scales * static_cast((*(qs1 + 2) & 0b111111))); - scales[i * 8 + 3] = ov::float16(scale_scales * static_cast((*(qs1 + 3) & 0b111111))); - scales[i * 8 + 4] = - ov::float16(scale_scales * static_cast((*(qs1 + 8) & 0b00001111) | ((*(qs1) >> 6) << 4))); - scales[i * 8 + 5] = - ov::float16(scale_scales * static_cast((*(qs1 + 9) & 0b00001111) | ((*(qs1 + 1) >> 6) << 4))); - scales[i * 8 + 6] = - ov::float16(scale_scales * static_cast((*(qs1 + 10) & 0b00001111) | ((*(qs1 + 2) >> 6) << 4))); - scales[i * 8 + 7] = - ov::float16(scale_scales * static_cast((*(qs1 + 11) & 0b00001111) | ((*(qs1 + 3) >> 6) << 4))); + // Calculate scales + float scale_vals[8]; + scale_vals[0] = scale_scales * static_cast((*(qs1) & 0b111111)); + scale_vals[1] = scale_scales * static_cast((*(qs1 + 1) & 0b111111)); + scale_vals[2] = scale_scales * static_cast((*(qs1 + 2) & 0b111111)); + 
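// (Q4_K packs eight 6-bit scales/mins into 12 bytes: scale entries 0-3 use
//  the low six bits of qs1[0..3]; entries 4-7, computed next, combine the
//  low nibble of qs1[8..11] with the top two bits of qs1[0..3].)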
scale_vals[3] = scale_scales * static_cast<float>((*(qs1 + 3) & 0b111111));
+        scale_vals[4] = scale_scales * static_cast<float>((*(qs1 + 8) & 0b00001111) | ((*(qs1) >> 6) << 4));
+        scale_vals[5] = scale_scales * static_cast<float>((*(qs1 + 9) & 0b00001111) | ((*(qs1 + 1) >> 6) << 4));
+        scale_vals[6] = scale_scales * static_cast<float>((*(qs1 + 10) & 0b00001111) | ((*(qs1 + 2) >> 6) << 4));
+        scale_vals[7] = scale_scales * static_cast<float>((*(qs1 + 11) & 0b00001111) | ((*(qs1 + 3) >> 6) << 4));
 
-        biases[i * 8] = ov::float16(-1.f * scale_biases * static_cast<float>((*(qs1 + 4) & 0b111111)));
-        biases[i * 8 + 1] = ov::float16(-1.f * scale_biases * static_cast<float>((*(qs1 + 5) & 0b111111)));
-        biases[i * 8 + 2] = ov::float16(-1.f * scale_biases * static_cast<float>((*(qs1 + 6) & 0b111111)));
-        biases[i * 8 + 3] = ov::float16(-1.f * scale_biases * static_cast<float>((*(qs1 + 7) & 0b111111)));
-        biases[i * 8 + 4] =
-            ov::float16(-1.f * scale_biases * static_cast<float>((*(qs1 + 8) >> 4) | ((*(qs1 + 4) >> 6) << 4)));
-        biases[i * 8 + 5] =
-            ov::float16(-1.f * scale_biases * static_cast<float>((*(qs1 + 9) >> 4) | ((*(qs1 + 5) >> 6) << 4)));
-        biases[i * 8 + 6] =
-            ov::float16(-1.f * scale_biases * static_cast<float>((*(qs1 + 10) >> 4) | ((*(qs1 + 6) >> 6) << 4)));
-        biases[i * 8 + 7] =
-            ov::float16(-1.f * scale_biases * static_cast<float>((*(qs1 + 11) >> 4) | ((*(qs1 + 7) >> 6) << 4)));
+        // Calculate min values (bias = -min)
+        float min_vals[8];
+        min_vals[0] = scale_mins * static_cast<float>((*(qs1 + 4) & 0b111111));
+        min_vals[1] = scale_mins * static_cast<float>((*(qs1 + 5) & 0b111111));
+        min_vals[2] = scale_mins * static_cast<float>((*(qs1 + 6) & 0b111111));
+        min_vals[3] = scale_mins * static_cast<float>((*(qs1 + 7) & 0b111111));
+        min_vals[4] = scale_mins * static_cast<float>((*(qs1 + 8) >> 4) | ((*(qs1 + 4) >> 6) << 4));
+        min_vals[5] = scale_mins * static_cast<float>((*(qs1 + 9) >> 4) | ((*(qs1 + 5) >> 6) << 4));
+        min_vals[6] = scale_mins * static_cast<float>((*(qs1 + 10) >> 4) | ((*(qs1 + 6) >> 6) << 4));
+        min_vals[7] = scale_mins * static_cast<float>((*(qs1 + 11) >> 4) | ((*(qs1 + 7) >> 6) << 4));
+
+        // Store scales and compute zero points
+        for (int j = 0; j < 8; j++) {
+            scales[i * 8 + j] = ov::float16(scale_vals[j]);
+            // zp = min / scale (since bias = -min and zp = -bias/scale)
+            uint8_t zp_val = (scale_vals[j] != 0.0f) ?
(uint8_t) std::round(min_vals[j] / scale_vals[j]) : 0; + // Pack two 4-bit zero points per byte + size_t idx = i * 8 + j; + if (idx % 2 == 0) { + zp[idx / 2] = zp_val & 0x0F; + } else { + zp[idx / 2] |= (zp_val << 4); + } + } unpack_256_4(block_data + 16, weights + i * 128); }); } @@ -199,16 +225,21 @@ void extract_q4_k_data(const ggml_tensor * tensor, void extract_q6_k_data(const ggml_tensor * tensor, ov::Tensor & weights_arr, ov::Tensor & scales_arr, - ov::Tensor & biases_arr) { + ov::Tensor & zp_arr) { const uint64_t bytes_per_block = 128 + 64 + 16 + 2; const uint64_t n_super_block = tensor->nb[3] / bytes_per_block; auto * data = static_cast(tensor->data); auto * weights = static_cast(weights_arr.data()); auto * scales = scales_arr.data::value_type>(); - auto * biases = biases_arr.data::value_type>(); + auto * zp = static_cast(zp_arr.data()); - bool is_scalar_bias = (biases_arr.get_size() == 1); // Symmetric quantization + bool is_scalar_zp = (zp_arr.get_size() == 1); // Symmetric quantization + + // For Q6_K, zero point is always 32 + if (is_scalar_zp) { + zp[0] = 32; + } ov::parallel_for(n_super_block, [&](size_t i) { uint8_t * block_data = data + i * bytes_per_block; @@ -219,13 +250,9 @@ void extract_q6_k_data(const ggml_tensor * tensor, for (size_t j = 0; j < 16; j++) { scales[j + i * 16] = ov::float16(scale_factor * static_cast(*((int8_t *) (block_data + 128 + 64 + j)))); - // For symmetric quantization, only write the first bias (all blocks share the same bias relationship) - if (is_scalar_bias) { - if (i == 0 && j == 0) { - biases[0] = ov::float16(-32.f * static_cast(scales[0])); - } - } else { - biases[j + i * 16] = ov::float16(-32.f * static_cast(scales[j + i * 16])); + // For asymmetric quantization, store per-block zero points + if (!is_scalar_zp) { + zp[j + i * 16] = 32; } } @@ -258,20 +285,20 @@ static inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t * d, uint8 void extract_q5_k_data(const ggml_tensor * tensor, ov::Tensor & weights_arr, ov::Tensor & scales_arr, - ov::Tensor & biases_arr) { + ov::Tensor & zp_arr) { const uint64_t bytes_per_block = 4 + 12 + 32 + 128; const uint64_t n_super_block = tensor->nb[3] / bytes_per_block; auto * data = static_cast(tensor->data); auto * weights = static_cast(weights_arr.data()); auto * scales = scales_arr.data::value_type>(); - auto * biases = biases_arr.data::value_type>(); + auto * zp = static_cast(zp_arr.data()); ov::parallel_for(n_super_block, [&](size_t i) { uint8_t * block_data = data + i * bytes_per_block; const float d = static_cast(ov::float16::from_bits(*((uint16_t *) block_data))); - const float min = static_cast(ov::float16::from_bits(*((uint16_t *) block_data + 1))); + const float min_factor = static_cast(ov::float16::from_bits(*((uint16_t *) block_data + 1))); const uint8_t * scales_data = block_data + 4; // 12 bytes of scales const uint8_t * qh = block_data + 4 + 12; // 32 bytes of high bits @@ -289,17 +316,18 @@ void extract_q5_k_data(const ggml_tensor * tensor, // Get scale and min for first 32 elements get_scale_min_k4(is + 0, scales_data, &sc, &m); const float d1 = d * sc; - const float m1 = min * m; + const float m1 = min_factor * m; // Get scale and min for second 32 elements get_scale_min_k4(is + 1, scales_data, &sc, &m); const float d2 = d * sc; - const float m2 = min * m; + const float m2 = min_factor * m; scales[i * 8 + is] = ov::float16(d1); - biases[i * 8 + is] = ov::float16(-m1); scales[i * 8 + is + 1] = ov::float16(d2); - biases[i * 8 + is + 1] = ov::float16(-m2); + // zp = min / scale (since 
bias = -min and zp = -bias/scale) + zp[i * 8 + is] = (d1 != 0.0f) ? (uint8_t) std::round(m1 / d1) : 0; + zp[i * 8 + is + 1] = (d2 != 0.0f) ? (uint8_t) std::round(m2 / d2) : 0; // Extract weights for first 32 elements (matching deq formula exactly) for (int l = 0; l < 32; ++l) { @@ -321,16 +349,13 @@ void extract_q5_k_data(const ggml_tensor * tensor, // TODO Reorder for make_intX_weights -ov::Output make_int8_weights(ov::Tensor & weight, - ov::Tensor & scales, - ov::Tensor & biases, - size_t group_size) { +ov::Output make_int8_weights(ov::Tensor & weight, ov::Tensor & scales, ov::Tensor & zp, size_t group_size) { ov::Shape orig_shape = weight.get_shape(); - // Expand dimensions for scales and biases + // Expand dimensions for scales and zp auto scale_shape = scales.get_shape(); - auto bias_shape = biases.get_shape(); - bool is_scalar_bias = bias_shape.empty(); // Symmetric quantization + auto zp_shape = zp.get_shape(); + bool is_scalar_zp = zp_shape.empty(); // Symmetric quantization ov::Shape packed_shape = {orig_shape[0], orig_shape[1] / group_size, group_size}; @@ -340,10 +365,10 @@ ov::Output make_int8_weights(ov::Tensor & weight, } else { scale_shape.push_back(1); scales.set_shape(scale_shape); - // For symmetric quantization, biases remain scalar (don't resize) - if (!is_scalar_bias) { - bias_shape = scale_shape; - biases.set_shape(bias_shape); + // For symmetric quantization, zp remains scalar (don't resize) + if (!is_scalar_zp) { + zp_shape.push_back(1); + zp.set_shape(zp_shape); } } @@ -352,26 +377,9 @@ ov::Output make_int8_weights(ov::Tensor & weight, static_cast(weight.data()), nullptr); weights_node->get_rt_info()["__gguf_tensor_holder"] = weight; auto scales_f16 = std::make_shared(scales); - ov::Tensor biases_u8(ov::element::u8, is_scalar_bias ? 
ov::Shape{} : scale_shape); - // Calculate zero point - const ov::float16 * bias_data = biases.data::value_type>(); - const ov::float16 * scale_data = scales.data::value_type>(); - uint8_t * bias_u8_data = biases_u8.data(); - - if (is_scalar_bias) { - // Symmetric quantization: single bias value for all blocks - // For Q8_0, bias = -128 * scale, so zero_point = 128 - bias_u8_data[0] = (uint8_t) std::round(-1.f * static_cast(bias_data[0]) / static_cast(scale_data[0])); - } else { - // Asymmetric quantization: per-block biases - for (size_t i = 0; i < biases_u8.get_size(); ++i) { - bias_u8_data[i] = - (uint8_t) std::round(-1.f * static_cast(bias_data[i]) / static_cast(scale_data[i])); - } - } - - auto zero_point = std::make_shared(biases_u8); + // Zero point is already in U8 format from extraction + auto zero_point = std::make_shared(zp); float zp_value; if (ov::op::util::get_single_value(zero_point, zp_value)) { zero_point = ov::op::v0::Constant::create(zero_point->get_element_type(), {}, {zp_value}); @@ -395,16 +403,13 @@ ov::Output make_int8_weights(ov::Tensor & weight, return std::make_shared(w_zp_s, ov::element::f32); } -ov::Output make_int4_weights(ov::Tensor & weight, - ov::Tensor & scales, - ov::Tensor & biases, - size_t group_size) { +ov::Output make_int4_weights(ov::Tensor & weight, ov::Tensor & scales, ov::Tensor & zp, size_t group_size) { ov::Shape orig_weight_shape = weight.get_shape(); - // Expand dimensions for scales and biases - ov::Shape scale_bias_shape = scales.get_shape(); - auto bias_shape = biases.get_shape(); - bool is_scalar_bias = bias_shape.empty(); // Symmetric quantization + // Expand dimensions for scales and zp + ov::Shape scale_shape = scales.get_shape(); + auto zp_shape = zp.get_shape(); + bool is_scalar_zp = zp_shape.empty(); // Symmetric quantization // Create INT4 weight tensor ov::Shape packed_shape = {orig_weight_shape[0], orig_weight_shape[1] / group_size, group_size}; @@ -413,12 +418,12 @@ ov::Output make_int4_weights(ov::Tensor & weight, // Requantized channel-wise case packed_shape.erase(packed_shape.begin() + 1); } else { - scale_bias_shape.push_back(1); - scales.set_shape(scale_bias_shape); - // For symmetric quantization, biases remain scalar (don't resize) - if (!is_scalar_bias) { - bias_shape = scale_bias_shape; - biases.set_shape(bias_shape); + scale_shape.push_back(1); + scales.set_shape(scale_shape); + // For symmetric quantization, zp remains scalar (don't resize) + if (!is_scalar_zp) { + zp_shape.push_back(1); + zp.set_shape(zp_shape); } } @@ -427,29 +432,8 @@ ov::Output make_int4_weights(ov::Tensor & weight, weights_node->get_rt_info()["__gguf_tensor_holder"] = weight; auto weights_f16 = std::make_shared(weights_node, ov::element::f16); - // Pack zero points: two subsequent values into one - const ov::float16 * bias_data = biases.data::value_type>(); - const ov::float16 * scale_data = scales.data::value_type>(); - ov::Tensor zero_point_tensor(ov::element::u4, is_scalar_bias ? 
ov::Shape{} : scale_bias_shape); - uint8_t * zero_point_data = static_cast(zero_point_tensor.data()); - - if (is_scalar_bias) { - // Symmetric quantization: single bias value for all blocks - // For Q4_0, bias = -8 * scale, so zero_point = 8 - uint8_t zp = (uint8_t) std::round(-1.f * static_cast(bias_data[0]) / static_cast(scale_data[0])); - zero_point_data[0] = (zp << 4) | (zp & 0x0F); - } else { - // Asymmetric quantization: per-block biases - for (size_t i = 0; i < zero_point_tensor.get_byte_size(); ++i) { - uint8_t bias1 = - (uint8_t) std::round(-1.f * static_cast(bias_data[i * 2]) / static_cast(scale_data[i * 2])); - uint8_t bias2 = (uint8_t) std::round(-1.f * static_cast(bias_data[i * 2 + 1]) / - static_cast(scale_data[i * 2 + 1])); - zero_point_data[i] = (bias2 << 4) | (bias1 & 0x0F); - } - } - - auto zero_points_node = std::make_shared(zero_point_tensor); + // Zero point is already in U4 format from extraction + auto zero_points_node = std::make_shared(zp); float zp_value; if (ov::op::util::get_single_value(zero_points_node, zp_value)) { zero_points_node = ov::op::v0::Constant::create(zero_points_node->get_element_type(), {}, {zp_value}); @@ -480,7 +464,7 @@ std::shared_ptr extract_quantized_weights(const ggml_tensor * tensor, const void * data, ov::Tensor & weights, ov::Tensor & scales, - ov::Tensor & biases) { + ov::Tensor & zp) { // Create a temporary tensor for extraction functions that read from tensor->data ggml_tensor temp_tensor = *tensor; temp_tensor.data = const_cast(data); @@ -512,22 +496,22 @@ std::shared_ptr extract_quantized_weights(const ggml_tensor * tensor, // Extract quantized data switch (tensor->type) { case GGML_TYPE_Q4_0: - extract_q4_0_data(&temp_tensor, weights, scales, biases); + extract_q4_0_data(&temp_tensor, weights, scales, zp); break; case GGML_TYPE_Q4_1: - extract_q4_1_data(&temp_tensor, weights, scales, biases); + extract_q4_1_data(&temp_tensor, weights, scales, zp); break; case GGML_TYPE_Q4_K: - extract_q4_k_data(&temp_tensor, weights, scales, biases); + extract_q4_k_data(&temp_tensor, weights, scales, zp); break; case GGML_TYPE_Q8_0: - extract_q8_0_data(&temp_tensor, weights, scales, biases); + extract_q8_0_data(&temp_tensor, weights, scales, zp); break; case GGML_TYPE_Q6_K: - extract_q6_k_data(&temp_tensor, weights, scales, biases); + extract_q6_k_data(&temp_tensor, weights, scales, zp); break; case GGML_TYPE_Q5_K: - extract_q5_k_data(&temp_tensor, weights, scales, biases); + extract_q5_k_data(&temp_tensor, weights, scales, zp); break; default: throw std::runtime_error("Unsupported quantized type: " + std::string(ggml_type_name(tensor->type))); @@ -536,9 +520,9 @@ std::shared_ptr extract_quantized_weights(const ggml_tensor * tensor, // Create the OpenVINO weight subgraph ov::Output weight_node; if (is_u4) { - weight_node = make_int4_weights(weights, scales, biases, weights_per_block); + weight_node = make_int4_weights(weights, scales, zp, weights_per_block); } else { - weight_node = make_int8_weights(weights, scales, biases, weights_per_block); + weight_node = make_int8_weights(weights, scales, zp, weights_per_block); } auto result = weight_node.get_node_shared_ptr(); @@ -553,7 +537,7 @@ std::shared_ptr requantize_to_buffers(const ggml_tensor * tensor, int64_t block_size, ov::Tensor & weights, ov::Tensor & scales, - ov::Tensor & biases) { + ov::Tensor & zp) { int64_t n_elements = ggml_nelements(tensor); // First dequantize to F32 @@ -572,19 +556,19 @@ std::shared_ptr requantize_to_buffers(const ggml_tensor * tensor, bool is_u4 = (requant_type == 
ExtraQuantType::Q4_0_C || requant_type == ExtraQuantType::Q4_0_128); if (is_u4) { - quantize_q4_0(weights_f32.data(), weights, scales, biases, n_elements, block_size); + quantize_q4_0(weights_f32.data(), weights, scales, zp, n_elements, block_size); } else if (requant_type == ExtraQuantType::Q8_1_C) { - quantize_q8_1(weights_f32.data(), weights, scales, biases, n_elements, block_size); + quantize_q8_1(weights_f32.data(), weights, scales, zp, n_elements, block_size); } else { - quantize_q8_0(weights_f32.data(), weights, scales, biases, n_elements, block_size); + quantize_q8_0(weights_f32.data(), weights, scales, zp, n_elements, block_size); } // Create the OpenVINO weight subgraph ov::Output weight_node; if (is_u4) { - weight_node = make_int4_weights(weights, scales, biases, block_size); + weight_node = make_int4_weights(weights, scales, zp, block_size); } else { - weight_node = make_int8_weights(weights, scales, biases, block_size); + weight_node = make_int8_weights(weights, scales, zp, block_size); } auto result = weight_node.get_node_shared_ptr(); @@ -653,50 +637,52 @@ std::shared_ptr process_weight_tensor(const ggml_tensor * tensor, cons } else { weights = ov::Tensor(ov::element::f16, node_shape); } - ov::Tensor dummy_scales, dummy_biases; // Not used for F16 - result = requantize_to_buffers(tensor, data, ExtraQuantType::F16, 0, weights, dummy_scales, dummy_biases); + ov::Tensor dummy_scales, dummy_zp; // Not used for F16 + result = requantize_to_buffers(tensor, data, ExtraQuantType::F16, 0, weights, dummy_scales, dummy_zp); } else { // Requant to quantized format (Q4_0_128, Q8_0_32, etc.) ov::element::Type weight_type = layout.is_u4 ? ov::element::u4 : ov::element::u8; ov::Shape scale_shape = {node_shape[0], node_shape[1] / layout.weights_per_block}; - // For symmetric quantization, biases are a single value instead of per-block - ov::Shape bias_shape = layout.is_symmetric ? ov::Shape{} : scale_shape; + // For symmetric quantization, zp is a scalar value instead of per-block + // zp uses the same element type as weights (U4 or U8) + ov::Shape zp_shape = layout.is_symmetric ? ov::Shape{} : scale_shape; - ov::Tensor weights, scales, biases; + ov::Tensor weights, scales, zp; if (output_base_ptr) { uint8_t * buf_base = static_cast(output_base_ptr); weights = ov::Tensor(weight_type, node_shape, buf_base + layout.weights_offset); scales = ov::Tensor(ov::element::f16, scale_shape, buf_base + layout.scales_offset); - biases = ov::Tensor(ov::element::f16, bias_shape, buf_base + layout.biases_offset); + zp = ov::Tensor(weight_type, zp_shape, buf_base + layout.zp_offset); } else { weights = ov::Tensor(weight_type, node_shape); scales = ov::Tensor(ov::element::f16, scale_shape); - biases = ov::Tensor(ov::element::f16, bias_shape); + zp = ov::Tensor(weight_type, zp_shape); } result = requantize_to_buffers(tensor, data, layout.requant_type.value(), layout.weights_per_block, weights, - scales, biases); + scales, zp); } } else { // Normal extraction path (no requant) ov::element::Type weight_type = layout.is_u4 ? ov::element::u4 : ov::element::u8; ov::Shape scale_shape = {node_shape[0], node_shape[1] / layout.weights_per_block}; - // For symmetric quantization, biases are a single value instead of per-block - ov::Shape bias_shape = layout.is_symmetric ? ov::Shape{} : scale_shape; + // For symmetric quantization, zp is a scalar value instead of per-block + // zp uses the same element type as weights (U4 or U8) + ov::Shape zp_shape = layout.is_symmetric ? 
ov::Shape{} : scale_shape; - ov::Tensor weights, scales, biases; + ov::Tensor weights, scales, zp; if (output_base_ptr) { uint8_t * buf_base = static_cast(output_base_ptr); weights = ov::Tensor(weight_type, node_shape, buf_base + layout.weights_offset); scales = ov::Tensor(ov::element::f16, scale_shape, buf_base + layout.scales_offset); - biases = ov::Tensor(ov::element::f16, bias_shape, buf_base + layout.biases_offset); + zp = ov::Tensor(weight_type, zp_shape, buf_base + layout.zp_offset); } else { weights = ov::Tensor(weight_type, node_shape); scales = ov::Tensor(ov::element::f16, scale_shape); - biases = ov::Tensor(ov::element::f16, bias_shape); + zp = ov::Tensor(weight_type, zp_shape); } - result = extract_quantized_weights(tensor, data, weights, scales, biases); + result = extract_quantized_weights(tensor, data, weights, scales, zp); } return result; @@ -705,7 +691,7 @@ std::shared_ptr process_weight_tensor(const ggml_tensor * tensor, cons void quantize_q4_0(const float * x, ov::Tensor & weights_arr, ov::Tensor & scales_arr, - ov::Tensor & biases_arr, + ov::Tensor & zp_arr, int64_t k, int64_t qk) { assert(k % qk == 0); @@ -713,8 +699,13 @@ void quantize_q4_0(const float * x, auto * weights = static_cast(weights_arr.data()); auto * scales = scales_arr.data::value_type>(); - auto * biases = biases_arr.data::value_type>(); - bool is_scalar_bias = (biases_arr.get_size() == 1); // Symmetric quantization + auto * zp = static_cast(zp_arr.data()); + bool is_scalar_zp = (zp_arr.get_size() == 1); // Symmetric quantization + + // For Q4_0, zero point is always 8 + if (is_scalar_zp) { + zp[0] = 8 | (8 << 4); // Pack two 4-bit values + } for (int i = 0; i < nb; i++) { float amax = 0.0f; // absolute max @@ -732,27 +723,27 @@ void quantize_q4_0(const float * x, if (d == 0) { scales[i] = ov::float16(1.0f); - if (is_scalar_bias) { - if (i == 0) { - biases[0] = ov::float16(-8.0f); + // zp is already set to 8 for symmetric, or set per-block for asymmetric + if (!is_scalar_zp) { + if (i % 2 == 0) { + zp[i / 2] = 8; + } else { + zp[i / 2] |= (8 << 4); } - } else { - biases[i] = ov::float16(-8.0f); } - uint8_t zp = 8; - memset(weights + i * qk / 2, zp | (zp << 4), qk / 2); + memset(weights + i * qk / 2, 8 | (8 << 4), qk / 2); continue; } const float id = 1.0f / d; scales[i] = ov::float16(d); - // For symmetric quantization, only write the first bias (all blocks share the same bias relationship) - if (is_scalar_bias) { - if (i == 0) { - biases[0] = ov::float16(-8.f * d); + // For asymmetric quantization, store per-block zero points + if (!is_scalar_zp) { + if (i % 2 == 0) { + zp[i / 2] = 8; + } else { + zp[i / 2] |= (8 << 4); } - } else { - biases[i] = ov::float16(-8.f * d); } for (int j = 0; j < qk / 2; ++j) { @@ -768,7 +759,7 @@ void quantize_q4_0(const float * x, void quantize_q8_0(const float * x, ov::Tensor & weights_arr, ov::Tensor & scales_arr, - ov::Tensor & biases_arr, + ov::Tensor & zp_arr, int64_t k, int64_t qk) { assert(k % qk == 0); @@ -776,8 +767,13 @@ void quantize_q8_0(const float * x, auto * weights = static_cast(weights_arr.data()); auto * scales = scales_arr.data::value_type>(); - auto * biases = biases_arr.data::value_type>(); - bool is_scalar_bias = (biases_arr.get_size() == 1); // Symmetric quantization + auto * zp = static_cast(zp_arr.data()); + bool is_scalar_zp = (zp_arr.get_size() == 1); // Symmetric quantization + + // For Q8_0, zero point is always 128 + if (is_scalar_zp) { + zp[0] = 128; + } for (int i = 0; i < nb; i++) { float amax = 0.0f; // absolute max @@ -792,13 +788,9 @@ 
void quantize_q8_0(const float * x, const float d = amax / 127.0f; const float id = d ? 1.0f / d : 0.0f; scales[i] = ov::float16(d); - // For symmetric quantization, only write the first bias (all blocks share the same bias relationship) - if (is_scalar_bias) { - if (i == 0) { - biases[0] = ov::float16(-128.0f * d); - } - } else { - biases[i] = ov::float16(-128.0f * d); + // For asymmetric quantization, store per-block zero points + if (!is_scalar_zp) { + zp[i] = 128; } for (int j = 0; j < qk; ++j) { @@ -812,7 +804,7 @@ void quantize_q8_0(const float * x, void quantize_q8_1(const float * x, ov::Tensor & weights_arr, ov::Tensor & scales_arr, - ov::Tensor & biases_arr, + ov::Tensor & zp_arr, int64_t k, int64_t qk) { assert(k % qk == 0); @@ -820,7 +812,7 @@ void quantize_q8_1(const float * x, auto * weights = static_cast(weights_arr.data()); auto * scales = scales_arr.data::value_type>(); - auto * biases = biases_arr.data::value_type>(); + auto * zp = static_cast(zp_arr.data()); for (int i = 0; i < nb; i++) { float min = std::numeric_limits::max(); float max = std::numeric_limits::lowest(); @@ -838,7 +830,8 @@ void quantize_q8_1(const float * x, const float d = (max - min) / ((1 << 8) - 1); const float id = d ? 1.0f / d : 0.0f; scales[i] = ov::float16(d); - biases[i] = ov::float16(min); + // zp = -min / scale (Q8_1 is asymmetric) + zp[i] = (d != 0.0f) ? (uint8_t) std::round(-min / d) : 0; for (int j = 0; j < qk; ++j) { const float x0 = (x[i * qk + j] - min) * id; diff --git a/ggml/src/ggml-openvino/ggml-quants.hpp b/ggml/src/ggml-openvino/ggml-quants.hpp index a1334e2408..6739689264 100644 --- a/ggml/src/ggml-openvino/ggml-quants.hpp +++ b/ggml/src/ggml-openvino/ggml-quants.hpp @@ -8,52 +8,52 @@ void unpack_32_4(const uint8_t* data, uint8_t* dst); -void extract_q4_0_data(const ggml_tensor* tensor, - ov::Tensor& weights_arr, - ov::Tensor& scales_arr, - ov::Tensor& biases_arr); +void extract_q4_0_data(const ggml_tensor * tensor, + ov::Tensor & weights_arr, + ov::Tensor & scales_arr, + ov::Tensor & zp_arr); -void extract_q4_1_data(const ggml_tensor* tensor, - ov::Tensor& weights_arr, - ov::Tensor& scales_arr, - ov::Tensor& biases_arr); +void extract_q4_1_data(const ggml_tensor * tensor, + ov::Tensor & weights_arr, + ov::Tensor & scales_arr, + ov::Tensor & zp_arr); -void extract_q8_0_data(const ggml_tensor* tensor, - ov::Tensor& weights_arr, - ov::Tensor& scales_arr, - ov::Tensor& biases_arr); +void extract_q8_0_data(const ggml_tensor * tensor, + ov::Tensor & weights_arr, + ov::Tensor & scales_arr, + ov::Tensor & zp_arr); void unpack_256_4(const uint8_t* data, uint8_t* dst); -void extract_q4_k_data(const ggml_tensor* tensor, - ov::Tensor& weights_arr, - ov::Tensor& scales_arr, - ov::Tensor& biases_arr); +void extract_q4_k_data(const ggml_tensor * tensor, + ov::Tensor & weights_arr, + ov::Tensor & scales_arr, + ov::Tensor & zp_arr); -void extract_q5_k_data(const ggml_tensor* tensor, - ov::Tensor& weights_arr, - ov::Tensor& scales_arr, - ov::Tensor& biases_arr); +void extract_q5_k_data(const ggml_tensor * tensor, + ov::Tensor & weights_arr, + ov::Tensor & scales_arr, + ov::Tensor & zp_arr); -void extract_q6_k_data(const ggml_tensor* tensor, - ov::Tensor& weights_arr, - ov::Tensor& scales_arr, - ov::Tensor& biases_arr); +void extract_q6_k_data(const ggml_tensor * tensor, + ov::Tensor & weights_arr, + ov::Tensor & scales_arr, + ov::Tensor & zp_arr); static constexpr size_t GGML_QUANTIZATION_GROUP_SIZE = 32; -ov::Output make_int8_weights(ov::Tensor& weight, - ov::Tensor& scales, - ov::Tensor& biases, 
+ov::Output make_int8_weights(ov::Tensor & weight, + ov::Tensor & scales, + ov::Tensor & zp, size_t group_size = GGML_QUANTIZATION_GROUP_SIZE); -ov::Output make_int4_weights(ov::Tensor& weight, - ov::Tensor& scales, - ov::Tensor& biases, +ov::Output make_int4_weights(ov::Tensor & weight, + ov::Tensor & scales, + ov::Tensor & zp, size_t group_size = GGML_QUANTIZATION_GROUP_SIZE); // Extract quantized weights from tensor and create weight subgraph -// If weights/scales/biases are provided (non-empty), uses them as output buffers +// If weights/scales/zp are provided (non-empty), uses them as output buffers // Otherwise allocates new ov::Tensors internally // Returns the weight node (make_int4_weights or make_int8_weights result) std::shared_ptr extract_quantized_weights( @@ -61,10 +61,10 @@ std::shared_ptr extract_quantized_weights( const void * data, // Source data pointer (may differ from tensor->data) ov::Tensor & weights, ov::Tensor & scales, - ov::Tensor & biases); + ov::Tensor & zp); // Requantize weights from tensor to target format, writing to provided buffers -// For F16 target, only weights buffer is used (scales/biases ignored) +// For F16 target, only weights buffer is used (scales/zp ignored) // Returns the weight node std::shared_ptr requantize_to_buffers(const ggml_tensor * tensor, const void * data, // Source data pointer @@ -72,7 +72,7 @@ std::shared_ptr requantize_to_buffers(const ggml_tensor * tensor, int64_t block_size, ov::Tensor & weights, ov::Tensor & scales, - ov::Tensor & biases); + ov::Tensor & zp); // Process weight tensor and create an OpenVINO constant node // Handles F16/F32/BF16 and quantized weights, with optional requantization @@ -84,11 +84,23 @@ std::shared_ptr process_weight_tensor( const void * data, // Source data pointer (may differ from tensor->data) void * output_base_ptr = nullptr); // Base pointer for output buffers (or nullptr for internal allocation) -void quantize_q4_0(const float* x, ov::Tensor& weights_arr, ov::Tensor& scales_arr, ov::Tensor& biases_arr, int64_t k, +void quantize_q4_0(const float * x, + ov::Tensor & weights_arr, + ov::Tensor & scales_arr, + ov::Tensor & zp_arr, + int64_t k, int64_t qk); -void quantize_q8_1(const float* x, ov::Tensor& weights_arr, ov::Tensor& scales_arr, ov::Tensor& biases_arr, int64_t k, +void quantize_q8_1(const float * x, + ov::Tensor & weights_arr, + ov::Tensor & scales_arr, + ov::Tensor & zp_arr, + int64_t k, int64_t qk); -void quantize_q8_0(const float* x, ov::Tensor& weights_arr, ov::Tensor& scales_arr, ov::Tensor& biases_arr, int64_t k, +void quantize_q8_0(const float * x, + ov::Tensor & weights_arr, + ov::Tensor & scales_arr, + ov::Tensor & zp_arr, + int64_t k, int64_t qk); namespace ov { From 900dd76c24bdd177b49b1aa39e458274819491fc Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Fri, 6 Feb 2026 20:09:12 +0800 Subject: [PATCH 246/254] Refactor weight tensor processing --- ggml/src/ggml-openvino/ggml-decoder.cpp | 72 ++++++------ .../src/ggml-openvino/ggml-openvino-extra.cpp | 1 + ggml/src/ggml-openvino/ggml-openvino-extra.h | 41 ++++--- ggml/src/ggml-openvino/ggml-openvino.cpp | 99 +++++++--------- ggml/src/ggml-openvino/ggml-quants.cpp | 108 +++++++----------- ggml/src/ggml-openvino/ggml-quants.hpp | 37 +++++- 6 files changed, 181 insertions(+), 177 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index d8d71cf25e..da381e4fad 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -550,11 +550,6 
@@ std::map<std::string, std::shared_ptr<ov::Node>> GgmlOvDecoder::create_weight_no
     return model_weights;
 }
 
-// Static cache for quantized weight nodes (keyed by tensor data pointer)
-// This is a fallback for when tensors don't have pre-built constants in extra
-static std::unordered_map<const void *, std::shared_ptr<ov::Node>> s_quantized_weight_cache;
-static std::mutex s_quantized_weight_cache_mutex;
-
 std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor * tensor) {
     // Check if we have a pre-built constant from the OpenVINO backend buffer
     // This is set during ggml_backend_openvino_buffer_set_tensor
@@ -569,51 +564,62 @@ std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor * tensor
         if (extra_base->type == ggml_openvino_extra_base::Type::WEIGHT) {
             // F16/F32/BF16 weight with shared-memory constant
             auto * weight_extra = static_cast<ggml_openvino_weight_extra *>(tensor->extra);
-            if (weight_extra->constant) {
-                GGML_LOG_DEBUG("%s: using pre-built constant for %s\n", __func__, tensor->name);
-                return weight_extra->constant;
+            if (weight_extra->weight_node) {
+                GGML_LOG_DEBUG("%s: using pre-built weight node for %s\n", __func__, tensor->name);
+                return weight_extra->weight_node;
             }
         } else if (extra_base->type == ggml_openvino_extra_base::Type::QUANTIZED_WEIGHT) {
             // Quantized weight with pre-extracted data
             auto * quant_extra = static_cast<ggml_openvino_quantized_weight_extra *>(tensor->extra);
-            if (quant_extra->constant) {
-                GGML_LOG_DEBUG("%s: using pre-extracted quantized constant for %s\n", __func__, tensor->name);
-                return quant_extra->constant;
+            if (quant_extra->weight_node) {
+                GGML_LOG_DEBUG("%s: using pre-extracted quantized weight node for %s\n", __func__, tensor->name);
+                return quant_extra->weight_node;
            }
         }
     }
 
-    // Fallback: Check static cache for quantized weights (keyed by data pointer)
-    // This handles cases where tensors weren't loaded through OpenVINO buffer
-    if (ggml_is_quantized(tensor->type)) {
-        std::lock_guard<std::mutex> lock(s_quantized_weight_cache_mutex);
-        auto it = s_quantized_weight_cache.find(tensor->data);
-        if (it != s_quantized_weight_cache.end()) {
-            GGML_LOG_DEBUG("%s: using cached quantized constant for %s\n", __func__, tensor->name);
-            return it->second;
-        }
-    }
+    // Fallback: tensor doesn't have a pre-built extra. The buffer type can only be
+    // openvino_host_buffer_type, which has enough space (get_alloc_size returns
+    // layout.total_size for quantized 2D tensors) to store extracted data in-place.
+    // Build the weight node and store it in tensor->extra for future reuse.
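
To make the in-place layout that the fallback comment relies on concrete, here is a rough editorial sketch of the sizing rule (field names mirror ggml_openvino_extracted_layout from this patch series; the 64-byte alignment and the Q8_0 shape are assumptions for illustration only):

// Editorial sketch: how much room the extracted weights/scales/zp need inside
// the tensor's own allocation, for a symmetric Q8_0 [rows, cols] weight.
#include <algorithm>
#include <cstddef>
#include <cstdio>

static size_t align_up(size_t v, size_t a) { return (v + a - 1) / a * a; }

int main() {
    const size_t rows = 4096, cols = 4096;
    const size_t weights_per_block = 32;
    const size_t alignment = 64;                       // assumed alignment
    const size_t raw_bytes = rows * (cols / 32) * 34;  // ggml Q8_0: 34 B/block

    const size_t weights_size = rows * cols;                           // U8
    const size_t scales_size = rows * (cols / weights_per_block) * 2;  // F16
    const size_t zp_size = 1;                          // symmetric: scalar zp

    const size_t scales_offset = align_up(weights_size, alignment);
    const size_t zp_offset = scales_offset + align_up(scales_size, alignment);
    size_t total = zp_offset + zp_size;
    // Patch 246 clamps total_size to at least ggml_nbytes(tensor) so the raw
    // GGUF bytes can be staged in the same allocation before extraction.
    total = std::max(total, raw_bytes);

    std::printf("weights %zu B, scales @ %zu (%zu B), zp @ %zu, total %zu B\n",
                weights_size, scales_offset, scales_size, zp_offset, total);
    return 0;
}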
+    GGML_LOG_DEBUG("%s: creating new weight node for %s\n", __func__, tensor->name);
 
-    std::set<ggml_type> weight_types = {GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_BF16, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0,
-                                        GGML_TYPE_Q4_1, GGML_TYPE_Q4_K, GGML_TYPE_Q5_K, GGML_TYPE_Q6_K};
+    static const std::set<ggml_type> weight_types = {GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_BF16,
+                                                     GGML_TYPE_Q8_0, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1,
+                                                     GGML_TYPE_Q4_K, GGML_TYPE_Q5_K, GGML_TYPE_Q6_K};
     if (weight_types.find(tensor->type) == weight_types.end()) {
         throw std::runtime_error("Unexpected weight tensor type: " + std::string(tensor->name) + " with type " +
                                  ggml_type_name(tensor->type));
     }
 
-    std::shared_ptr<ov::Node> result = process_weight_tensor(tensor, tensor->data, nullptr);
-    result->set_friendly_name(tensor->name);
-
-    // Cache the quantized weight node for future reuse
+    OvWeight ov_weight;
     if (ggml_is_quantized(tensor->type)) {
-        std::lock_guard<std::mutex> lock(s_quantized_weight_cache_mutex);
-        s_quantized_weight_cache[tensor->data] = result;
-        GGML_LOG_DEBUG("%s: cached quantized constant for %s\n", __func__, tensor->name);
+        // For quantized weights, copy raw data to a temp buffer first because
+        // process_weight_tensor reads from data and writes extracted results
+        // (weights/scales/zp) to output_base_ptr; they would overlap if both
+        // pointed at tensor->data.
+        size_t raw_size = ggml_nbytes(tensor);
+        std::vector<uint8_t> tmp(raw_size);
+        memcpy(tmp.data(), tensor->data, raw_size);
+        ov_weight = process_weight_tensor(tensor, tmp.data(), tensor->data);
+    } else {
+        // For non-quantized weights (F16/F32/BF16), data is already in tensor->data.
+        // process_weight_tensor will create an ov::Tensor wrapping tensor->data directly.
+ ov_weight = process_weight_tensor(tensor, tensor->data, tensor->data); } - return result; + ov_weight.weight_node->set_friendly_name(tensor->name); + + ggml_openvino_extra_base * extra; + if (ov_weight.is_quantized()) { + extra = new ggml_openvino_quantized_weight_extra(std::move(ov_weight.weights), std::move(ov_weight.scales), + std::move(ov_weight.zp), ov_weight.weight_node); + } else { + extra = new ggml_openvino_weight_extra(std::move(ov_weight.weights), ov_weight.weight_node); + } + ggml_openvino_buffer_register_extra(tensor, extra); + + return ov_weight.weight_node; } void GgmlOvDecoder::dump_cgraph(const ggml_cgraph * cgraph, std::string & filename) { diff --git a/ggml/src/ggml-openvino/ggml-openvino-extra.cpp b/ggml/src/ggml-openvino/ggml-openvino-extra.cpp index 4584dc38d0..39bf7610eb 100644 --- a/ggml/src/ggml-openvino/ggml-openvino-extra.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino-extra.cpp @@ -319,6 +319,7 @@ ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_ten layout.scales_offset = ((layout.weights_size + alignment - 1) / alignment) * alignment; layout.zp_offset = layout.scales_offset + ((layout.scales_size + alignment - 1) / alignment) * alignment; layout.total_size = layout.zp_offset + layout.zp_size; + layout.total_size = std::max(layout.total_size, ggml_nbytes(tensor)); return layout; } diff --git a/ggml/src/ggml-openvino/ggml-openvino-extra.h b/ggml/src/ggml-openvino/ggml-openvino-extra.h index 726a90abb0..9ce4667154 100644 --- a/ggml/src/ggml-openvino/ggml-openvino-extra.h +++ b/ggml/src/ggml-openvino/ggml-openvino-extra.h @@ -102,27 +102,30 @@ protected: explicit ggml_openvino_extra_base(Type t) : type(t) {} }; -// Extra data for F16/F32/BF16 weight tensors - stores the pre-built ov::Constant node +// Extra data for F16/F32/BF16 weight tensors - stores the pre-built weight node struct ggml_openvino_weight_extra : public ggml_openvino_extra_base { - std::shared_ptr constant; // Pre-built OpenVINO Constant node + ov::Tensor weights; // The underlying weight data tensor + std::shared_ptr weight_node; // Pre-built OpenVINO weight node - explicit ggml_openvino_weight_extra(std::shared_ptr c) - : ggml_openvino_extra_base(Type::WEIGHT), constant(std::move(c)) {} + ggml_openvino_weight_extra(ov::Tensor w, std::shared_ptr n) : + ggml_openvino_extra_base(Type::WEIGHT), + weights(std::move(w)), + weight_node(std::move(n)) {} }; -// Extra data for quantized weight tensors - stores extracted weights/scales/zp and ov::Constant +// Extra data for quantized weight tensors - stores extracted weights/scales/zp and weight node struct ggml_openvino_quantized_weight_extra : public ggml_openvino_extra_base { ov::Tensor weights; // U4 or U8 extracted weights ov::Tensor scales; // F16 scales ov::Tensor zp; // U4 or U8 zero points (same type as weights) - std::shared_ptr constant; // Pre-built OpenVINO weight subgraph + std::shared_ptr weight_node; // Pre-built OpenVINO weight subgraph - ggml_openvino_quantized_weight_extra(ov::Tensor w, ov::Tensor s, ov::Tensor z, std::shared_ptr c) : + ggml_openvino_quantized_weight_extra(ov::Tensor w, ov::Tensor s, ov::Tensor z, std::shared_ptr n) : ggml_openvino_extra_base(Type::QUANTIZED_WEIGHT), weights(std::move(w)), scales(std::move(s)), zp(std::move(z)), - constant(std::move(c)) {} + weight_node(std::move(n)) {} }; // Extra data for KV cache / compute tensors - stores ov::Tensor for infer_request @@ -140,19 +143,19 @@ struct ggml_openvino_tensor_extra : public ggml_openvino_extra_base { // Returns the total size needed 
in the buffer for extracted data. struct ggml_openvino_extracted_layout { - size_t total_size; // Total bytes needed - size_t weights_offset; // Offset to weights in buffer - size_t weights_size; // Size of weights in bytes - size_t scales_offset; // Offset to scales in buffer - size_t scales_size; // Size of scales in bytes - size_t zp_offset; // Offset to zero points in buffer - size_t zp_size; // Size of zero points in bytes (U4 or U8) - bool is_u4; // true for U4 weights, false for U8 + size_t total_size = 0; // Total bytes needed + size_t weights_offset = 0; // Offset to weights in buffer + size_t weights_size = 0; // Size of weights in bytes + size_t scales_offset = 0; // Offset to scales in buffer + size_t scales_size = 0; // Size of scales in bytes + size_t zp_offset = 0; // Offset to zero points in buffer + size_t zp_size = 0; // Size of zero points in bytes (U4 or U8) + bool is_u4; // true for U4 weights, false for U8 int64_t weights_per_block; // weights per scale/zp block bool is_symmetric; // true for symmetric quantization // Requantization info - bool is_requant; // true if this tensor needs requantization + bool is_requant = false; // true if this tensor needs requantization std::optional requant_type; // target requant type if is_requant }; @@ -160,3 +163,7 @@ struct ggml_openvino_extracted_layout { ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_tensor * tensor); ggml_openvino_tensor_extra * ggml_openvino_create_tensor_extra(const ggml_tensor * tensor, bool is_remote); + +// Register an extra with the tensor's OpenVINO buffer context for proper lifetime management. +// This sets tensor->extra and tracks the extra in the buffer context for cleanup. +void ggml_openvino_buffer_register_extra(ggml_tensor * tensor, ggml_openvino_extra_base * extra); diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp index e531a9c036..efd399fe3f 100644 --- a/ggml/src/ggml-openvino/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino.cpp @@ -230,80 +230,45 @@ static void ggml_backend_openvino_buffer_set_tensor(ggml_backend_buffer_t buffer // 2D tensor (typical weight shape) bool is_2d = (tensor->ne[2] == 1 && tensor->ne[3] == 1); - // Check if this is a quantized weight tensor that needs extraction/requantization - ggml_openvino_extracted_layout layout = {}; - if (is_weight_buffer && is_full_tensor_set && is_2d && ggml_is_quantized(tensor->type)) { - layout = ggml_openvino_get_extracted_layout(tensor); - } - - if (layout.total_size > 0) { - // Quantized weight tensor with extraction/requantization - uint8_t * buf_base = (uint8_t *) tensor->data; - + if (is_weight_buffer && is_full_tensor_set && is_2d) { try { - std::shared_ptr constant = process_weight_tensor(tensor, data, buf_base); - constant->set_friendly_name(tensor->name); + auto result = process_weight_tensor(tensor, data, tensor->data); + result.weight_node->set_friendly_name(tensor->name); - // Store in tensor->extra - if (layout.is_requant && layout.requant_type.has_value() && - layout.requant_type.value() == ExtraQuantType::F16) { - // F16 requant case - use weight_extra - auto * extra = new ggml_openvino_weight_extra(constant); - ctx->tensor_extras[tensor] = extra; - tensor->extra = extra; - GGML_LOG_DEBUG("%s: requantized %s to F16\n", __func__, tensor->name); - } else { - // Quantized case - use quantized_weight_extra - // Create tensors with external memory (already filled by process_weight_tensor) - ov::element::Type weight_type = layout.is_u4 ? 
ov::element::u4 : ov::element::u8; - ov::Shape weight_shape = {static_cast(tensor->ne[1]), static_cast(tensor->ne[0])}; - ov::Shape scale_shape = {static_cast(tensor->ne[1]), - static_cast(tensor->ne[0] / layout.weights_per_block)}; - // zp shape: scalar for symmetric, per-block for asymmetric - ov::Shape zp_shape = layout.is_symmetric ? ov::Shape{} : scale_shape; + const auto & layout = result.layout; + ggml_openvino_extra_base * extra; - ov::Tensor weights(weight_type, weight_shape, buf_base + layout.weights_offset); - ov::Tensor scales(ov::element::f16, scale_shape, buf_base + layout.scales_offset); - ov::Tensor zp(weight_type, zp_shape, buf_base + layout.zp_offset); - - auto * extra = new ggml_openvino_quantized_weight_extra(std::move(weights), std::move(scales), - std::move(zp), constant); - ctx->tensor_extras[tensor] = extra; - tensor->extra = extra; + // Quantized path with extracted weight/scale/zp tensors + if (result.is_quantized()) { + extra = new ggml_openvino_quantized_weight_extra(std::move(result.weights), std::move(result.scales), + std::move(result.zp), result.weight_node); if (layout.is_requant) { GGML_LOG_DEBUG("%s: requantized %s to %s (u%d, block_size=%ld)\n", __func__, tensor->name, - layout.requant_type.value() == ExtraQuantType::Q4_0_128 ? "Q4_0_128" : "Q8_0_32", - layout.is_u4 ? 4 : 8, layout.weights_per_block); + extra_quant_type_name(layout.requant_type.value()), layout.is_u4 ? 4 : 8, + layout.weights_per_block); } else { int64_t n_blocks = ggml_nelements(tensor) / layout.weights_per_block; - GGML_LOG_DEBUG("%s: extracted quantized constant for %s (u%d, %zu weights, %ld blocks)\n", __func__, - tensor->name, layout.is_u4 ? 4 : 8, layout.weights_size, n_blocks); + GGML_LOG_DEBUG("%s: extracted quantized weight node for %s (u%d, %zu weights, %ld blocks)\n", + __func__, tensor->name, layout.is_u4 ? 4 : 8, layout.weights_size, n_blocks); + } + } else { + // F16/F32/BF16 weight or F16-requant + extra = new ggml_openvino_weight_extra(std::move(result.weights), result.weight_node); + + if (layout.total_size > 0) { + GGML_LOG_DEBUG("%s: requantized %s to F16\n", __func__, tensor->name); + } else { + GGML_LOG_DEBUG("%s: created shared-memory weight node for %s\n", __func__, tensor->name); } } - } catch (const std::exception & e) { - GGML_LOG_ERROR("%s: failed to process quantized data for %s: %s\n", __func__, tensor->name, e.what()); - // Fall back to storing raw data - memcpy((char *) tensor->data + offset, data, size); - } - } else if (is_weight_buffer && is_full_tensor_set && is_2d && - (tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_F16 || tensor->type == GGML_TYPE_BF16)) { - // F16/F32/BF16 weight tensor - try { - std::shared_ptr constant = process_weight_tensor(tensor, data, tensor->data); - constant->set_friendly_name(tensor->name); - - // Store in tensor->extra - ggml_openvino_weight_extra * extra = new ggml_openvino_weight_extra(constant); ctx->tensor_extras[tensor] = extra; tensor->extra = extra; - GGML_LOG_DEBUG("%s: created shared-memory constant for %s\n", __func__, tensor->name); - } catch (const std::exception & e) { - GGML_LOG_DEBUG("%s: failed to create shared-memory constant for %s: %s\n", __func__, tensor->name, - e.what()); + GGML_LOG_ERROR("%s: failed to process weight tensor for %s: %s\n", __func__, tensor->name, e.what()); + memcpy((char *) tensor->data + offset, data, size); } } else { // Non-weight tensor (KV cache, activations, etc.) 
- copy data @@ -604,6 +569,22 @@ size_t ggml_backend_openvino_buffer_get_ctx_id(ggml_backend_buffer_t buffer) { return ctx->id; } +void ggml_openvino_buffer_register_extra(ggml_tensor * tensor, ggml_openvino_extra_base * extra) { + GGML_ASSERT(tensor != nullptr); + GGML_ASSERT(tensor->buffer != nullptr); + GGML_ASSERT(ggml_backend_buffer_is_openvino(tensor->buffer)); + + auto * ctx = static_cast(tensor->buffer->context); + + auto it = ctx->tensor_extras.find(tensor); + if (it != ctx->tensor_extras.end()) { + delete it->second; + } + + ctx->tensor_extras[tensor] = extra; + tensor->extra = extra; +} + bool ggml_backend_buft_is_openvino(ggml_backend_buffer_type_t buft) { return buft->iface.get_name == ggml_backend_openvino_buffer_type_get_name; } diff --git a/ggml/src/ggml-openvino/ggml-quants.cpp b/ggml/src/ggml-openvino/ggml-quants.cpp index 2de0494c91..10909cbc1e 100644 --- a/ggml/src/ggml-openvino/ggml-quants.cpp +++ b/ggml/src/ggml-openvino/ggml-quants.cpp @@ -576,10 +576,12 @@ std::shared_ptr requantize_to_buffers(const ggml_tensor * tensor, return result; } -std::shared_ptr process_weight_tensor(const ggml_tensor * tensor, const void * data, void * output_base_ptr) { +OvWeight process_weight_tensor(const ggml_tensor * tensor, const void * data, void * output_base_ptr) { GGML_ASSERT(tensor != nullptr); GGML_ASSERT(data != nullptr); + OvWeight result; + // Get 2D shape for weights [rows, cols] ov::Shape node_shape = {static_cast(tensor->ne[1]), static_cast(tensor->ne[0])}; @@ -600,18 +602,16 @@ std::shared_ptr process_weight_tensor(const ggml_tensor * tensor, cons OPENVINO_THROW("Unexpected tensor type in F16/F32/BF16 path"); } - if (output_base_ptr) { + if (output_base_ptr && output_base_ptr != data) { // Using external buffer - copy data and create shared-memory constant size_t tensor_bytes = ggml_nbytes(tensor); memcpy(output_base_ptr, data, tensor_bytes); - ov::Tensor ov_tensor(element_type, node_shape, output_base_ptr); - return std::make_shared(ov_tensor); + result.weights = ov::Tensor(element_type, node_shape, output_base_ptr); } else { - // Allocate internal buffer - ov::Tensor weights(element_type, node_shape); - memcpy(weights.data(), data, ggml_nelements(tensor) * element_type.size()); - return std::make_shared(weights); + result.weights = ov::Tensor(element_type, node_shape, data); } + result.weight_node = std::make_shared(result.weights); + return result; } // Handle quantized weights @@ -619,70 +619,48 @@ std::shared_ptr process_weight_tensor(const ggml_tensor * tensor, cons OPENVINO_THROW("Unsupported weight tensor type: ", ggml_type_name(tensor->type)); } - auto layout = ggml_openvino_get_extracted_layout(tensor); + result.layout = ggml_openvino_get_extracted_layout(tensor); + const auto & layout = result.layout; if (layout.total_size == 0) { OPENVINO_THROW("Unsupported quantized type: ", ggml_type_name(tensor->type)); } - std::shared_ptr result; + // F16 requant path - no separate scales/zp needed in result + if (layout.is_requant && layout.requant_type.has_value() && layout.requant_type.value() == ExtraQuantType::F16) { + if (output_base_ptr) { + result.weights = ov::Tensor(ov::element::f16, node_shape, + static_cast(output_base_ptr) + layout.weights_offset); + } else { + result.weights = ov::Tensor(ov::element::f16, node_shape); + } + ov::Tensor dummy_scales, dummy_zp; // Not used for F16 + result.weight_node = + requantize_to_buffers(tensor, data, ExtraQuantType::F16, 0, result.weights, dummy_scales, dummy_zp); + return result; + } + + // Quantized path (normal 
extraction or quantized requant) + // Create weight/scale/zp tensors - shared between both paths + ov::element::Type weight_type = layout.is_u4 ? ov::element::u4 : ov::element::u8; + ov::Shape scale_shape = {node_shape[0], node_shape[1] / layout.weights_per_block}; + ov::Shape zp_shape = layout.is_symmetric ? ov::Shape{} : scale_shape; + + if (output_base_ptr) { + uint8_t * buf_base = static_cast(output_base_ptr); + result.weights = ov::Tensor(weight_type, node_shape, buf_base + layout.weights_offset); + result.scales = ov::Tensor(ov::element::f16, scale_shape, buf_base + layout.scales_offset); + result.zp = ov::Tensor(weight_type, zp_shape, buf_base + layout.zp_offset); + } else { + result.weights = ov::Tensor(weight_type, node_shape); + result.scales = ov::Tensor(ov::element::f16, scale_shape); + result.zp = ov::Tensor(weight_type, zp_shape); + } if (layout.is_requant && layout.requant_type.has_value()) { - // Requantization path - if (layout.requant_type.value() == ExtraQuantType::F16) { - // Requant to F16 - ov::Tensor weights; - if (output_base_ptr) { - weights = ov::Tensor(ov::element::f16, node_shape, - static_cast(output_base_ptr) + layout.weights_offset); - } else { - weights = ov::Tensor(ov::element::f16, node_shape); - } - ov::Tensor dummy_scales, dummy_zp; // Not used for F16 - result = requantize_to_buffers(tensor, data, ExtraQuantType::F16, 0, weights, dummy_scales, dummy_zp); - } else { - // Requant to quantized format (Q4_0_128, Q8_0_32, etc.) - ov::element::Type weight_type = layout.is_u4 ? ov::element::u4 : ov::element::u8; - ov::Shape scale_shape = {node_shape[0], node_shape[1] / layout.weights_per_block}; - // For symmetric quantization, zp is a scalar value instead of per-block - // zp uses the same element type as weights (U4 or U8) - ov::Shape zp_shape = layout.is_symmetric ? ov::Shape{} : scale_shape; - - ov::Tensor weights, scales, zp; - if (output_base_ptr) { - uint8_t * buf_base = static_cast(output_base_ptr); - weights = ov::Tensor(weight_type, node_shape, buf_base + layout.weights_offset); - scales = ov::Tensor(ov::element::f16, scale_shape, buf_base + layout.scales_offset); - zp = ov::Tensor(weight_type, zp_shape, buf_base + layout.zp_offset); - } else { - weights = ov::Tensor(weight_type, node_shape); - scales = ov::Tensor(ov::element::f16, scale_shape); - zp = ov::Tensor(weight_type, zp_shape); - } - - result = requantize_to_buffers(tensor, data, layout.requant_type.value(), layout.weights_per_block, weights, - scales, zp); - } + result.weight_node = requantize_to_buffers(tensor, data, layout.requant_type.value(), layout.weights_per_block, + result.weights, result.scales, result.zp); } else { - // Normal extraction path (no requant) - ov::element::Type weight_type = layout.is_u4 ? ov::element::u4 : ov::element::u8; - ov::Shape scale_shape = {node_shape[0], node_shape[1] / layout.weights_per_block}; - // For symmetric quantization, zp is a scalar value instead of per-block - // zp uses the same element type as weights (U4 or U8) - ov::Shape zp_shape = layout.is_symmetric ? 
ov::Shape{} : scale_shape; - - ov::Tensor weights, scales, zp; - if (output_base_ptr) { - uint8_t * buf_base = static_cast(output_base_ptr); - weights = ov::Tensor(weight_type, node_shape, buf_base + layout.weights_offset); - scales = ov::Tensor(ov::element::f16, scale_shape, buf_base + layout.scales_offset); - zp = ov::Tensor(weight_type, zp_shape, buf_base + layout.zp_offset); - } else { - weights = ov::Tensor(weight_type, node_shape); - scales = ov::Tensor(ov::element::f16, scale_shape); - zp = ov::Tensor(weight_type, zp_shape); - } - - result = extract_quantized_weights(tensor, data, weights, scales, zp); + result.weight_node = extract_quantized_weights(tensor, data, result.weights, result.scales, result.zp); } return result; diff --git a/ggml/src/ggml-openvino/ggml-quants.hpp b/ggml/src/ggml-openvino/ggml-quants.hpp index 6739689264..600b9c9f29 100644 --- a/ggml/src/ggml-openvino/ggml-quants.hpp +++ b/ggml/src/ggml-openvino/ggml-quants.hpp @@ -74,12 +74,43 @@ std::shared_ptr requantize_to_buffers(const ggml_tensor * tensor, ov::Tensor & scales, ov::Tensor & zp); -// Process weight tensor and create an OpenVINO constant node +inline const char * extra_quant_type_name(ExtraQuantType t) { + switch (t) { + case ExtraQuantType::F16: + return "F16"; + case ExtraQuantType::Q4_0_C: + return "Q4_0_C"; + case ExtraQuantType::Q4_0_128: + return "Q4_0_128"; + case ExtraQuantType::Q8_0_C: + return "Q8_0_C"; + case ExtraQuantType::Q8_0_32: + return "Q8_0_32"; + case ExtraQuantType::Q8_1_C: + return "Q8_1_C"; + default: + return "unknown"; + } +} + +// Result from process_weight_tensor containing the weight node and tensors. +// For quantized weights, also contains the extracted layout and scale/zp tensors. +struct OvWeight { + std::shared_ptr weight_node; + ggml_openvino_extracted_layout layout; // Only meaningful for quantized (layout.total_size > 0) + ov::Tensor weights; + ov::Tensor scales; + ov::Tensor zp; + + bool is_quantized() const { return layout.scales_size > 0; } +}; + +// Process weight tensor and create an OpenVINO weight node // Handles F16/F32/BF16 and quantized weights, with optional requantization // If output_base_ptr is nullptr, allocates internal buffers (for decoder use) // If output_base_ptr is provided, uses pre-allocated buffers at specified offsets (for backend buffer use) -// Returns the weight constant node -std::shared_ptr process_weight_tensor( +// Returns OvWeight with the weight node and optional quantized tensors +OvWeight process_weight_tensor( const ggml_tensor * tensor, const void * data, // Source data pointer (may differ from tensor->data) void * output_base_ptr = nullptr); // Base pointer for output buffers (or nullptr for internal allocation) From 1d4ec1b2ee4c0eee7b4663194520b8ae35730d55 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Wed, 11 Feb 2026 14:20:14 +0800 Subject: [PATCH 247/254] create_weight_node accept non-ov backend buffer --- ggml/src/ggml-openvino/ggml-decoder.cpp | 36 +++++++++++++------------ 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index da381e4fad..857aa3b850 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -551,13 +551,13 @@ std::map> GgmlOvDecoder::create_weight_no } std::shared_ptr GgmlOvDecoder::create_weight_node(ggml_tensor * tensor) { + const bool is_ov_buffer = ggml_backend_buffer_is_openvino(tensor->buffer); + // Check if we have a pre-built constant from the OpenVINO backend 
buffer // This is set during ggml_backend_openvino_buffer_set_tensor if (tensor->extra) { - if (!ggml_backend_buffer_is_openvino(tensor->buffer)) { - OPENVINO_ASSERT(false, "Unsupported weight tensor: " + std::string(tensor->name) + - " Possibly this is a cpu backend repacked quantized weights"); - } + OPENVINO_ASSERT(is_ov_buffer, "Unsupported weight tensor: " + std::string(tensor->name) + + " Possibly this is a cpu backend repacked quantized weights"); // Cast to our extra base type and check the type auto * extra_base = static_cast(tensor->extra); @@ -578,12 +578,7 @@ std::shared_ptr GgmlOvDecoder::create_weight_node(ggml_tensor * tensor } } - // Fallback: tensor doesn't have a pre-built extra. The buffer type can only be - // openvino_host_buffer_type, which has enough space (get_alloc_size returns - // layout.total_size for quantized 2D tensors) to store extracted data in-place. - // Build the weight node and store it in tensor->extra for future reuse. GGML_LOG_DEBUG("%s: creating new weight node for %s\n", __func__, tensor->name); - static const std::set weight_types = {GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_BF16, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1, GGML_TYPE_Q4_K, GGML_TYPE_Q5_K, GGML_TYPE_Q6_K}; @@ -594,14 +589,18 @@ std::shared_ptr GgmlOvDecoder::create_weight_node(ggml_tensor * tensor OvWeight ov_weight; if (ggml_is_quantized(tensor->type)) { - // For quantized weights, copy raw data to a temp buffer first because - // process_weight_tensor reads from data and writes extracted results - // (weights/scales/zp) to output_base_ptr — they would overlap if both - // point to tensor->data. - size_t raw_size = ggml_nbytes(tensor); - std::vector tmp(raw_size); - memcpy(tmp.data(), tensor->data, raw_size); - ov_weight = process_weight_tensor(tensor, tmp.data(), tensor->data); + if (is_ov_buffer) { + // For quantized weights, copy raw data to a temp buffer first because + // process_weight_tensor reads from data and writes extracted results + // (weights/scales/zp) to output_base_ptr — they would overlap if both + // point to tensor->data. + size_t raw_size = ggml_nbytes(tensor); + std::vector tmp(raw_size); + memcpy(tmp.data(), tensor->data, raw_size); + ov_weight = process_weight_tensor(tensor, tmp.data(), tensor->data); + } else { + ov_weight = process_weight_tensor(tensor, tensor->data, nullptr); + } } else { // For non-quantized weights (F16/F32/BF16), data is already in tensor->data. // process_weight_tensor will create an ov::Tensor wrapping tensor->data directly. 
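
The temp-copy step above deserves a concrete illustration: the raw quantized bytes and the extracted output share one allocation, so unpacking without staging would clobber the source mid-read. A minimal editorial sketch of the idiom (generic names, not the patch's types):

// Editorial sketch of overlap-safe in-place unpacking: stage the packed bytes
// in a temporary, then write the (larger) unpacked result over the original.
#include <cstdint>
#include <cstring>
#include <vector>

// Expands two 4-bit codes per source byte into one byte each.
static void unpack_nibbles(const uint8_t * src, uint8_t * dst, size_t n_bytes) {
    for (size_t i = 0; i < n_bytes; ++i) {
        dst[2 * i] = src[i] & 0x0F;
        dst[2 * i + 1] = src[i] >> 4;
    }
}

// buf holds n_raw packed bytes on entry and has room for 2 * n_raw bytes;
// the output overwrites the input, hence the staging copy.
void unpack_in_place(uint8_t * buf, size_t n_raw) {
    std::vector<uint8_t> tmp(n_raw);
    std::memcpy(tmp.data(), buf, n_raw);  // source is now safe to clobber
    unpack_nibbles(tmp.data(), buf, n_raw);
}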
@@ -609,6 +608,9 @@ std::shared_ptr GgmlOvDecoder::create_weight_node(ggml_tensor * tensor } ov_weight.weight_node->set_friendly_name(tensor->name); + if (!is_ov_buffer) { + return ov_weight.weight_node; + } ggml_openvino_extra_base * extra; if (ov_weight.is_quantized()) { From e0590152ff4b6416cdedf6efef21b36dda5764e2 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Wed, 11 Feb 2026 15:21:53 +0800 Subject: [PATCH 248/254] remove changes in llama-graph.cpp --- ggml/src/ggml-openvino/ggml-decoder.cpp | 5 ++++- ggml/src/ggml-openvino/ggml-decoder.h | 19 +++++++++++++++++++ .../openvino/translate_session.cpp | 5 ++++- src/llama-graph.cpp | 10 +++------- 4 files changed, 30 insertions(+), 9 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 857aa3b850..4b9429740c 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -160,7 +160,10 @@ void GgmlOvDecoder::set_input_output(ggml_tensor * node, bool naive) { if (src == nullptr) { continue; } - std::string src_name = std::string(src->name); + auto src_name = std::string(src->name); + if (src->flags & GGML_TENSOR_FLAG_INPUT) { + src_name = get_graph_input_ov_name(src, node); + } current_node_info.node_inputs[src_name] = src; current_node_info.node_inputs_names.push_back(src_name); diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index c8e3edeaf8..59311a6121 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -245,6 +245,25 @@ public: return op->op == GGML_OP_GET_ROWS && tensor == op->src[1] && op->src[0]->op != GGML_OP_NONE; } + static std::string get_graph_input_ov_name(const ggml_tensor * tensor, const ggml_tensor * op) { + if (is_inp_tok(tensor, op)) { + return "inp_tokens"; + } + if (is_inp_pos(tensor, op)) { + return "inp_pos"; + } + if (is_inp_emb(tensor, op)) { + return "embd"; + } + if (is_output_idx(tensor, op)) { + return "inp_out_ids"; + } + if (is_inp_mask(tensor, op)) { + return std::string(tensor->name).find("swa") == std::string::npos ? 
"self_kq_mask" : "self_kq_mask_swa"; + } + return tensor->name; + } + private: void set_input_output(ggml_tensor * node, bool naive = false); int compute_op_case(const ggml_tensor * node) const; diff --git a/ggml/src/ggml-openvino/openvino/translate_session.cpp b/ggml/src/ggml-openvino/openvino/translate_session.cpp index b7e7b58531..2cca3de4f7 100644 --- a/ggml/src/ggml-openvino/openvino/translate_session.cpp +++ b/ggml/src/ggml-openvino/openvino/translate_session.cpp @@ -117,9 +117,12 @@ void add_sliced_mask(TensorMap & tensor_map, GgmlDecoder & ggml_model_decoder) { void add_rope_sin_cos(TensorMap & tensor_map, GgmlDecoder & ggml_model_decoder) { int32_t * rope_params = ggml_model_decoder.get_rope_params(); + if (tensor_map.find("inp_pos") == tensor_map.end() || rope_params == nullptr) { + return; + } auto inp_pos = tensor_map.at("inp_pos").get_node_shared_ptr(); std::shared_ptr rope_freqs_weight; - if (tensor_map.find("rope_freqs_weight") != tensor_map.end()) { + if (tensor_map.find("rope_freqs.weight") != tensor_map.end()) { rope_freqs_weight = tensor_map.at("rope_freqs.weight").get_node_shared_ptr(); } diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index 92ddb21c49..bba747d37b 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -1535,7 +1535,6 @@ ggml_tensor * llm_graph_context::build_inp_pos() const { auto & cur = inp->pos; cur = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, (int64_t)n_tokens*hparams.n_pos_per_embd()); - cb(cur, "inp_pos", -1); ggml_set_input(cur); res->add_input(std::move(inp)); @@ -1571,7 +1570,6 @@ ggml_tensor * llm_graph_context::build_inp_out_ids() const { auto & cur = inp->out_ids; cur = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_outputs); - cb(cur, "inp_out_ids", -1); ggml_set_input(cur); res->add_input(std::move(inp)); @@ -1813,7 +1811,6 @@ llm_graph_input_attn_no_cache * llm_graph_context::build_attn_inp_no_cache() con // note: there is no KV cache, so the number of KV values is equal to the number of tokens in the batch inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens, 1, 1); - cb(inp->self_kq_mask, "self_kq_mask", -1); ggml_set_input(inp->self_kq_mask); inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask; @@ -1872,7 +1869,7 @@ ggml_tensor * llm_graph_context::build_attn( } if (wo_b) { - cb(cur, "kqv_wo", il); + //cb(cur, "kqv_wo", il); } if (wo_b) { @@ -1902,7 +1899,6 @@ static std::unique_ptr build_attn_inp_kv_impl( inp->self_v_idxs = mctx_cur->build_input_v_idxs(ctx0, ubatch); inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream); - ggml_set_name(inp->self_kq_mask, "self_kq_mask"); ggml_set_input(inp->self_kq_mask); inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask; @@ -2119,7 +2115,7 @@ ggml_tensor * llm_graph_context::build_attn( } if (wo_b) { - cb(cur, "kqv_wo", il); + //cb(cur, "kqv_wo", il); } if (wo_b) { @@ -2174,7 +2170,7 @@ ggml_tensor * llm_graph_context::build_attn( } if (wo_b) { - cb(cur, "kqv_wo", il); + //cb(cur, "kqv_wo", il); } if (wo_b) { From 0d74aba277891770cb9b9c2b69bec3e90f3817c3 Mon Sep 17 00:00:00 2001 From: Mustafa Cavus Date: Wed, 11 Feb 2026 16:31:06 -0800 Subject: [PATCH 249/254] stateful masking fix (#38) Fix for stateful accuracy issues and cl_out_of_resources error in stateful GPU with larger context sizes. 
--- .../openvino/translate_session.cpp        | 11 +++++++----
 ggml/src/ggml-openvino/utils.cpp              | 18 +++++++++++++++++-
 2 files changed, 24 insertions(+), 5 deletions(-)

diff --git a/ggml/src/ggml-openvino/openvino/translate_session.cpp b/ggml/src/ggml-openvino/openvino/translate_session.cpp
index 2cca3de4f7..286229dc0e 100644
--- a/ggml/src/ggml-openvino/openvino/translate_session.cpp
+++ b/ggml/src/ggml-openvino/openvino/translate_session.cpp
@@ -16,6 +16,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -89,12 +90,14 @@ void add_sliced_mask(TensorMap & tensor_map, GgmlDecoder & ggml_model_decoder) {
         auto zero_2d = ov::op::v0::Constant::create(ov::element::i64, {2}, {0,0});
         auto one_2d = ov::op::v0::Constant::create(ov::element::i64, {2}, {1,1});
         auto zero_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
-        auto two_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {2});
+        auto three_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {3});
+        auto neg_one_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {-1});
         auto axes = ov::op::v0::Constant::create(ov::element::i64, {2}, {-2,-1});
         auto inp_pos = tensor_map.at("inp_pos").get_node_shared_ptr();
-        auto shape_of_inp_pos = std::make_shared<ov::op::v3::ShapeOf>(inp_pos);
-        auto gather_inp_pos = std::make_shared<ov::op::v8::Gather>(shape_of_inp_pos, two_1d, zero_1d);
-        auto stop = std::make_shared<ov::op::v0::Concat>(ov::OutputVector{token_len_per_seq, gather_inp_pos}, 0);
+        auto gather_inp_pos = std::make_shared<ov::op::v8::Gather>(inp_pos, neg_one_1d, three_1d);
+        auto reshaped_inp_pos = std::make_shared<ov::op::v1::Reshape>(gather_inp_pos, ov::op::v0::Constant::create(ov::element::i64, {1}, {1}), false);
+        auto inp_pos_incremented = std::make_shared<ov::op::v1::Add>(reshaped_inp_pos, ov::op::v0::Constant::create(ov::element::i32, ov::Shape{1}, {1}));
+        auto stop = std::make_shared<ov::op::v0::Concat>(ov::OutputVector{token_len_per_seq, std::make_shared<ov::op::v1::ConvertLike>(inp_pos_incremented, token_len_per_seq)}, 0);
         mask_sliced = std::make_shared<ov::op::v8::Slice>(mask, zero_2d, stop, one_2d, axes);
         mask_sliced = std::make_shared<ov::op::v0::Convert>(mask_sliced, ov::element::f16);
diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp
index 69cac19019..55ea4eb355 100644
--- a/ggml/src/ggml-openvino/utils.cpp
+++ b/ggml/src/ggml-openvino/utils.cpp
@@ -57,6 +57,7 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, const std::strin
     auto & core = ov_singleton_core();
     const auto & config = ggml_openvino_get_compile_config();
     static auto is_static = false;
+    static size_t stateful_kv_size = 0;

     // if (is_naive(cgraph)) {
     //     return naive_compute(cgraph, core, device, config);
@@ -106,12 +107,27 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, const std::strin
         }
         ggml_decoder->add_extra_inputs();
         infer_request = infer_request_cache.at(key);
+
     if (stateful) {
         const auto * inp_pos = get_inp_pos_tensor(cgraph);
         int32_t * pos_data = (int32_t *) inp_pos->data;
+        auto pos_shape = ggml_decoder->get_shape(inp_pos);
         if (pos_data[0] == 0) {
             infer_request->reset_state();
-        }
+            stateful_kv_size = pos_shape[3];
+        } else if (stateful_kv_size == pos_data[0]) {
+            stateful_kv_size += pos_shape[3];
+        } else {
+            auto states = infer_request->query_state();
+            for (auto state : states) {
+                auto state_tensor = state.get_state();
+                ov::Coordinate begin = {0, 0, 0, 0};
+                ov::Coordinate end = {state_tensor.get_shape()[0], static_cast<size_t>(pos_data[0]), state_tensor.get_shape()[2], state_tensor.get_shape()[3]};
+                ov::Tensor new_state_tensor(state_tensor, begin, end);
+                state.set_state(new_state_tensor);
+            }
+            stateful_kv_size = pos_data[0] + 1;
+        }
     }

     decoder_end_time
= ggml_time_us(); From d5d673cde3e45934fd298c99d38ce53c13aca31b Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Thu, 12 Feb 2026 17:25:18 +0800 Subject: [PATCH 250/254] Fix test-backend-ops crash glu, get_rows, scale, rms_norm, add --- ggml/src/ggml-openvino/ggml-decoder.cpp | 13 +++-- ggml/src/ggml-openvino/ggml-openvino.cpp | 58 +++++++++++++++++-- .../ggml-openvino/openvino/op/get_rows.cpp | 15 ++++- .../ggml-openvino/openvino/op/glu_geglu.cpp | 21 +++++-- .../ggml-openvino/openvino/op/glu_swiglu.cpp | 21 +++++-- ggml/src/ggml-openvino/openvino/op/scale.cpp | 18 +++++- ggml/src/ggml-openvino/utils.cpp | 14 +++-- 7 files changed, 129 insertions(+), 31 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 4b9429740c..8796c23abd 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -95,9 +95,6 @@ GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph, std::mapn_nodes; node_n++) { auto * cur_node = cgraph->nodes[node_n]; - if (cur_node->op == GGML_OP_NONE) { - continue; - } set_input_output(cur_node, true); } for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) { @@ -110,6 +107,9 @@ GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph, std::map data_addr_map; std::unordered_set output_name_set; for (const auto & node_info : m_node_info_list) { + if (node_info.node->op == GGML_OP_NONE) { + continue; + } for (const auto & it : node_info.node_inputs) { const auto & src_name = it.first; const auto & src_node = it.second; @@ -164,6 +164,7 @@ void GgmlOvDecoder::set_input_output(ggml_tensor * node, bool naive) { if (src->flags & GGML_TENSOR_FLAG_INPUT) { src_name = get_graph_input_ov_name(src, node); } + m_inputs[src_name] = src; current_node_info.node_inputs[src_name] = src; current_node_info.node_inputs_names.push_back(src_name); @@ -193,7 +194,6 @@ void GgmlOvDecoder::set_input_output(ggml_tensor * node, bool naive) { if (m_model_inputs.find(src_name) != m_model_inputs.end()) { continue; } - m_inputs[src_name] = src; assert(stateful_kv_shape.rank().is_static()); ov::PartialShape param_shape = (stateful_kv_shape.rank().get_length() != 0) ? 
stateful_kv_shape : get_graph_input_shape(node, src); @@ -264,7 +264,7 @@ int GgmlOvDecoder::compute_op_case(const ggml_tensor * node) const { } else { op_case = 3; } - } else if (node->src[0]->src[0]->op == GGML_OP_ROPE || node->src[0]->src[0]->src[0]->op == GGML_OP_ROPE) { + } else { // rope'ed query tensor op_case = 4; } @@ -839,6 +839,9 @@ int32_t * GgmlOvDecoder::get_output_op_params(int node_idx) const { void GgmlOvDecoder::visit_subgraph(std::function, int node_idx)> node_visitor) const { for (int node_idx = 0; node_idx < m_cgraph->n_nodes; node_idx++) { + if (m_cgraph->nodes[node_idx]->op == GGML_OP_NONE) { + continue; + } node_visitor(std::make_shared(*this), node_idx); } } diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp index efd399fe3f..801d9ad5c4 100644 --- a/ggml/src/ggml-openvino/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino.cpp @@ -113,8 +113,8 @@ struct ggml_backend_openvino_buffer_context { ~ggml_backend_openvino_buffer_context() { // Clean up all tensor extras - GGML_LOG_DEBUG("Deleting OpenVINO buffer context #%zu for device %d, size %zu MB\n", id, device, - size / 1024 / 1024); + // GGML_LOG_DEBUG("Deleting OpenVINO buffer context #%zu for device %d, size %zu MB\n", id, device, + // size / 1024 / 1024); for (auto & pair : tensor_extras) { delete pair.second; } @@ -454,9 +454,9 @@ static size_t ggml_backend_openvino_buffer_type_get_alloc_size(ggml_backend_buff if (ggml_is_quantized(tensor->type) && tensor->ne[2] == 1 && tensor->ne[3] == 1) { ggml_openvino_extracted_layout layout = ggml_openvino_get_extracted_layout(tensor); if (layout.total_size > 0) { - GGML_LOG_DEBUG("%s: tensor %s needs %zu bytes (original %zu, extracted: weights=%zu scales=%zu zp=%zu)\n", - __func__, tensor->name, layout.total_size, ggml_nbytes(tensor), layout.weights_size, - layout.scales_size, layout.zp_size); + // GGML_LOG_DEBUG("%s: tensor %s needs %zu bytes (original %zu, extracted: weights=%zu scales=%zu zp=%zu)\n", + // __func__, tensor->name, layout.total_size, ggml_nbytes(tensor), layout.weights_size, + // layout.scales_size, layout.zp_size); return layout.total_size; } } @@ -763,8 +763,36 @@ static ggml_backend_buffer_type_t ggml_backend_openvino_device_get_host_buffer_t return ggml_backend_openvino_host_buffer_type(ctx->device); } +static bool has_view_input(const ggml_tensor * op) { + for (int i = 0; i < GGML_MAX_SRC; i++) { + if (op->src[i] == nullptr) { + break; + } + if (op->src[i]->op == GGML_OP_VIEW) { + return true; + } + } + return false; +} + static bool is_op_unsupported_case(const ggml_tensor * op) { switch (op->op) { + case GGML_OP_GET_ROWS: + case GGML_OP_SET_ROWS: { + if (op->ne[3] != 1) { + return true; + } + break; + } + case GGML_OP_ADD: + case GGML_OP_MUL: { + for (int i = 0; i < 4; i++) { + if (op->src[0]->ne[i] != op->src[1]->ne[i] && (op->src[0]->ne[i] != 1 && op->src[1]->ne[i] != 1)) { + return true; + } + } + break; + } case GGML_OP_SOFT_MAX: { if (op->src[2] != nullptr) { GGML_LOG_WARN("OpenVINO backend does not support SOFT_MAX with sinks\n"); @@ -876,7 +904,7 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con GGML_TYPE_Q5_K, GGML_TYPE_Q8_0, GGML_TYPE_Q6_K}; static const std::set supported_ops{GGML_OP_NONE, GGML_OP_ADD, GGML_OP_MUL, GGML_OP_MUL_MAT, GGML_OP_VIEW, - GGML_OP_CONT, GGML_OP_RESHAPE, GGML_OP_PERMUTE, GGML_OP_TRANSPOSE, + /*GGML_OP_CONT,*/ GGML_OP_RESHAPE, GGML_OP_PERMUTE, GGML_OP_TRANSPOSE, GGML_OP_GET_ROWS, GGML_OP_ROPE, GGML_OP_RMS_NORM, GGML_OP_SCALE, // 
softmax is not updated due to replaced by flash_attn_ext
                                                              // GGML_OP_SOFT_MAX,
@@ -896,6 +924,11 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con
             GGML_LOG_WARN("OpenVINO backend does not support unary op %s\n", ggml_unary_op_name(ggml_get_unary_op(op)));
             return false;
         }
+        if (has_view_input(op)) {
+            GGML_LOG_WARN("OpenVINO backend does not support unary op %s with view input\n",
+                          ggml_unary_op_name(ggml_get_unary_op(op)));
+            return false;
+        }
         break;
     }
     case GGML_OP_GLU: {
@@ -904,6 +937,11 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con
             GGML_LOG_WARN("OpenVINO backend does not support GLU op %s\n", ggml_glu_op_name(ggml_get_glu_op(op)));
             return false;
         }
+        if (has_view_input(op)) {
+            GGML_LOG_WARN("OpenVINO backend does not support unary op %s with view input\n",
+                          ggml_glu_op_name(ggml_get_glu_op(op)));
+            return false;
+        }
         break;
     }
     default: {
@@ -912,6 +950,14 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con
             GGML_LOG_WARN("OpenVINO backend does not support op %s\n", ggml_op_name(op->op));
             return false;
         }
+        static std::set<ggml_op> ops_not_support_view_input{
+            GGML_OP_GET_ROWS,
+            GGML_OP_RMS_NORM,
+        };
+        if (ops_not_support_view_input.find(op->op) != ops_not_support_view_input.end() && has_view_input(op)) {
+            GGML_LOG_WARN("OpenVINO backend does not support op %s with view input\n", ggml_op_name(op->op));
+            return false;
+        }
     }
 }

diff --git a/ggml/src/ggml-openvino/openvino/op/get_rows.cpp b/ggml/src/ggml-openvino/openvino/op/get_rows.cpp
index d6e7a35534..cdc0ec58b0 100644
--- a/ggml/src/ggml-openvino/openvino/op/get_rows.cpp
+++ b/ggml/src/ggml-openvino/openvino/op/get_rows.cpp
@@ -34,9 +34,18 @@ OutputVector translate_get_rows(const NodeContext & context) {
     indices = std::make_shared<ov::op::v0::Squeeze>(indices, ov::op::v0::Constant::create(ov::element::i64, {2}, {0, 1}));

     if (data.get_partial_shape().rank() == 4) {
-        auto axis = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{}, {1});
-        data = std::make_shared<ov::op::v0::Squeeze>(data, ov::op::v0::Constant::create(ov::element::i64, {1}, {0}));
-        res = std::make_shared<ov::op::v8::Gather>(data, indices, axis, 1);
+        if (data.get_partial_shape()[1].get_length() == 1) {
+            // Work-around for a bug in ov cpu plugin for test-backend-ops
+            data = std::make_shared<ov::op::v0::Squeeze>(data,
+                ov::op::v0::Constant::create(ov::element::i64, {2}, {0, 1}));
+            auto axis = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{}, {0});
+            res = std::make_shared<ov::op::v8::Gather>(data, indices, axis);
+        } else {
+            auto axis = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{}, {1});
+            data =
+                std::make_shared<ov::op::v0::Squeeze>(data, ov::op::v0::Constant::create(ov::element::i64, {1}, {0}));
+            res = std::make_shared<ov::op::v8::Gather>(data, indices, axis, 1);
+        }
     } else if (context.is_stateful() && data.get_partial_shape().rank() == 3) {
         auto axis = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{}, {1});
         res = std::make_shared<ov::op::v8::Gather>(data, indices, axis, 1);
diff --git a/ggml/src/ggml-openvino/openvino/op/glu_geglu.cpp b/ggml/src/ggml-openvino/openvino/op/glu_geglu.cpp
index 8be9e8deb0..124003911b 100644
--- a/ggml/src/ggml-openvino/openvino/op/glu_geglu.cpp
+++ b/ggml/src/ggml-openvino/openvino/op/glu_geglu.cpp
@@ -9,7 +9,6 @@
 #include
 #include
 #include
-#include

 namespace ov {
 namespace frontend {
@@ -25,11 +24,23 @@ OutputVector translate_glu_geglu(const NodeContext & context) {
         src0 = context.get_input(0);
         src1 = context.get_input(1);
     } else {
+        // GGML splits along ne[0] (OV last axis) using floor division: nc = ne[0] / 2.
+        // Both halves are nc elements; if the dimension is odd, the last element is dropped.
+        // Use Slice instead of Split to handle odd dimensions correctly.
         auto combined = context.get_input(0);
-        auto split_axis = ov::op::v0::Constant::create(ov::element::i64, {}, {-1});
-        auto split = std::make_shared<ov::op::v1::Split>(combined, split_axis, 2);
-        src0 = split->output(0);
-        src1 = split->output(1);
+        auto combined_shape = combined.get_partial_shape();
+        int64_t last_dim_val = combined_shape[combined_shape.rank().get_length() - 1].get_length();
+        int64_t nc = last_dim_val / 2;
+
+        auto axis = ov::op::v0::Constant::create(ov::element::i64, {1}, {-1});
+        auto step = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
+        auto start0 = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
+        auto stop0 = ov::op::v0::Constant::create(ov::element::i64, {1}, {nc});
+        auto start1 = ov::op::v0::Constant::create(ov::element::i64, {1}, {nc});
+        auto stop1 = ov::op::v0::Constant::create(ov::element::i64, {1}, {2 * nc});
+
+        src0 = std::make_shared<ov::op::v8::Slice>(combined, start0, stop0, step, axis);
+        src1 = std::make_shared<ov::op::v8::Slice>(combined, start1, stop1, step, axis);
     }

     int32_t * params = context.get_output_op_params();
diff --git a/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp b/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp
index 6e0b85517e..10d1b39438 100644
--- a/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp
+++ b/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp
@@ -9,7 +9,6 @@
 #include
 #include
 #include
-#include

 namespace ov {
 namespace frontend {
@@ -25,11 +24,23 @@ OutputVector translate_glu_swiglu(const NodeContext & context) {
         src0 = context.get_input(0);
         src1 = context.get_input(1);
     } else {
+        // GGML splits along ne[0] (OV last axis) using floor division: nc = ne[0] / 2.
+        // Both halves are nc elements; if the dimension is odd, the last element is dropped.
+        // Use Slice instead of Split to handle odd dimensions correctly.
         auto combined = context.get_input(0);
-        auto split_axis = ov::op::v0::Constant::create(ov::element::i64, {}, {-1});
-        auto split = std::make_shared<ov::op::v1::Split>(combined, split_axis, 2);
-        src0 = split->output(0);
-        src1 = split->output(1);
+        auto combined_shape = combined.get_partial_shape();
+        int64_t last_dim_val = combined_shape[combined_shape.rank().get_length() - 1].get_length();
+        int64_t nc = last_dim_val / 2;
+
+        auto axis = ov::op::v0::Constant::create(ov::element::i64, {1}, {-1});
+        auto step = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
+        auto start0 = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
+        auto stop0 = ov::op::v0::Constant::create(ov::element::i64, {1}, {nc});
+        auto start1 = ov::op::v0::Constant::create(ov::element::i64, {1}, {nc});
+        auto stop1 = ov::op::v0::Constant::create(ov::element::i64, {1}, {2 * nc});
+
+        src0 = std::make_shared<ov::op::v8::Slice>(combined, start0, stop0, step, axis);
+        src1 = std::make_shared<ov::op::v8::Slice>(combined, start1, stop1, step, axis);
     }

     int32_t * params = context.get_output_op_params();
diff --git a/ggml/src/ggml-openvino/openvino/op/scale.cpp b/ggml/src/ggml-openvino/openvino/op/scale.cpp
index 01e59cedd9..ceb18ddb48 100644
--- a/ggml/src/ggml-openvino/openvino/op/scale.cpp
+++ b/ggml/src/ggml-openvino/openvino/op/scale.cpp
@@ -2,6 +2,7 @@

 #include "../op_table.hpp"
 #include "../utils.hpp"

+#include
 #include
 #include
 #include
@@ -15,10 +16,21 @@ OutputVector translate_scale(const NodeContext & context) {
     num_inputs_check(context, 1, 1);

     float scale;
-    memcpy(&scale, context.get_output_op_params(), sizeof(float));
-    auto scale_node = std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{}, std::vector<float>{scale});
+    float bias;
+    memcpy(&scale, (float *) context.get_output_op_params() + 0, sizeof(float));
+    memcpy(&bias, (float *) context.get_output_op_params() + 1, sizeof(float));

-    auto res = std::make_shared<ov::op::v1::Multiply>(context.get_input(0), scale_node);
+    auto scale_node = std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{}, std::vector<float>{scale});
+    auto scaled = std::make_shared<ov::op::v1::Multiply>(context.get_input(0), scale_node);
+
+    std::shared_ptr<ov::Node> res;
+    if (bias != 0.0f) {
+        auto bias_node =
+            std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{}, std::vector<float>{bias});
+        res = std::make_shared<ov::op::v1::Add>(scaled, bias_node);
+    } else {
+        res = scaled;
+    }

     return rename_outputs_with_suffix({res}, context.get_name());
 }
diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp
index 55ea4eb355..a370043dd7 100644
--- a/ggml/src/ggml-openvino/utils.cpp
+++ b/ggml/src/ggml-openvino/utils.cpp
@@ -59,9 +59,9 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, const std::strin
     static auto is_static = false;
     static size_t stateful_kv_size = 0;

-    // if (is_naive(cgraph)) {
-    //     return naive_compute(cgraph, core, device, config);
-    // }
+    if (is_naive(cgraph)) {
+        return naive_compute(cgraph, core, device, config);
+    }

     auto start_time = ggml_time_us();
@@ -438,7 +438,13 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph) {

 bool is_naive(ggml_cgraph * cgraph) {
     constexpr int naive_graph_size_threshold = 20;
-    return cgraph->n_nodes < naive_graph_size_threshold;
+    int count = 0;
+    for (int i = 0; i < cgraph->n_nodes; i++) {
+        if (cgraph->nodes[i]->op != GGML_OP_NONE) {
+            count++;
+        }
+    }
+    return count < naive_graph_size_threshold;
 }

 enum ggml_status naive_compute(ggml_cgraph * cgraph,

From 59e7d730d249ba86ff065f77f5ae8d989710204f Mon Sep 17 00:00:00 2001
From: Mustafa Cavus
Date: Thu, 12 Feb 2026 16:57:55 -0800
Subject: [PATCH 251/254] hardcoded name handling for
rope_freqs.weight --- ggml/src/ggml-openvino/ggml-decoder.cpp | 3 +++ ggml/src/ggml-openvino/ggml-decoder.h | 4 ++++ 2 files changed, 7 insertions(+) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 8796c23abd..b73a102661 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -522,6 +522,9 @@ std::map> GgmlOvDecoder::create_weight_no } std::string src_name(src->name); + if (is_rope_freqs_weight(src, node)) { + src_name = "rope_freqs.weight"; + } if (!src->view_src) { ggml_backend_buffer * buffer = src->buffer; if (buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS || ggml_is_quantized(src->type)) { diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index 59311a6121..820964551c 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -233,6 +233,10 @@ public: return op->op == GGML_OP_CPY || (op->op == GGML_OP_FLASH_ATTN_EXT && tensor == op->src[3]); } + inline static bool is_rope_freqs_weight(const ggml_tensor * tensor, const ggml_tensor * op) { + return op->op == GGML_OP_ROPE && tensor == op->src[2]; + } + inline static bool is_kvcache(const ggml_tensor * tensor, const ggml_tensor * op) { return op->op == GGML_OP_SET_ROWS && op->src[2] == tensor; } From 1a54965c439f5b1e7f71d4e3b0232f61d8d36139 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Fri, 13 Feb 2026 10:29:48 +0800 Subject: [PATCH 252/254] Suppress logging and add error handling to allow test-backend-ops to complete --- ggml/src/ggml-openvino/ggml-decoder.cpp | 6 +- .../src/ggml-openvino/ggml-openvino-extra.cpp | 2 +- ggml/src/ggml-openvino/ggml-openvino.cpp | 94 +++++++++---------- ggml/src/ggml-openvino/openvino/op/rope.cpp | 4 +- ggml/src/ggml-openvino/utils.cpp | 37 +++++--- 5 files changed, 76 insertions(+), 67 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 8796c23abd..99776e1bec 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -568,20 +568,20 @@ std::shared_ptr GgmlOvDecoder::create_weight_node(ggml_tensor * tensor // F16/F32/BF16 weight with shared-memory constant auto * weight_extra = static_cast(tensor->extra); if (weight_extra->weight_node) { - GGML_LOG_DEBUG("%s: using pre-built weight node for %s\n", __func__, tensor->name); + // GGML_LOG_DEBUG("%s: using pre-built weight node for %s\n", __func__, tensor->name); return weight_extra->weight_node; } } else if (extra_base->type == ggml_openvino_extra_base::Type::QUANTIZED_WEIGHT) { // Quantized weight with pre-extracted data auto * quant_extra = static_cast(tensor->extra); if (quant_extra->weight_node) { - GGML_LOG_DEBUG("%s: using pre-extracted quantized weight node for %s\n", __func__, tensor->name); + // GGML_LOG_DEBUG("%s: using pre-extracted quantized weight node for %s\n", __func__, tensor->name); return quant_extra->weight_node; } } } - GGML_LOG_DEBUG("%s: creating new weight node for %s\n", __func__, tensor->name); + // GGML_LOG_DEBUG("%s: creating new weight node for %s\n", __func__, tensor->name); static const std::set weight_types = {GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_BF16, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1, GGML_TYPE_Q4_K, GGML_TYPE_Q5_K, GGML_TYPE_Q6_K}; diff --git a/ggml/src/ggml-openvino/ggml-openvino-extra.cpp b/ggml/src/ggml-openvino/ggml-openvino-extra.cpp index 39bf7610eb..7a48ed1b65 100644 --- a/ggml/src/ggml-openvino/ggml-openvino-extra.cpp 
+++ b/ggml/src/ggml-openvino/ggml-openvino-extra.cpp @@ -348,7 +348,7 @@ ggml_openvino_tensor_extra * ggml_openvino_create_tensor_extra(const ggml_tensor element_type = ov::element::i64; break; default: - GGML_LOG_ERROR("%s: unsupported tensor type for ov::Tensor: %s\n", __func__, ggml_type_name(tensor->type)); + // GGML_LOG_WARN("%s: unsupported tensor type for ov::Tensor: %s\n", __func__, ggml_type_name(tensor->type)); return nullptr; } diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp index 801d9ad5c4..6655db7298 100644 --- a/ggml/src/ggml-openvino/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino.cpp @@ -223,7 +223,7 @@ static void ggml_backend_openvino_buffer_set_tensor(ggml_backend_buffer_t buffer GGML_ASSERT(tensor != nullptr && tensor->data != nullptr); ggml_backend_openvino_buffer_context * ctx = (ggml_backend_openvino_buffer_context *) buffer->context; - // Check if this is a weight buffer (usage is set BEFORE set_tensor is called) + // Check if this is a weight buffer (usage is set BEFORE set_tensor is called, except in test-backend-ops) bool is_weight_buffer = (buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS); // Full tensor set: offset=0, full size, not a view bool is_full_tensor_set = (offset == 0 && size == ggml_nbytes(tensor) && tensor->view_src == nullptr); @@ -235,7 +235,7 @@ static void ggml_backend_openvino_buffer_set_tensor(ggml_backend_buffer_t buffer auto result = process_weight_tensor(tensor, data, tensor->data); result.weight_node->set_friendly_name(tensor->name); - const auto & layout = result.layout; + // const auto & layout = result.layout; ggml_openvino_extra_base * extra; // Quantized path with extracted weight/scale/zp tensors @@ -243,24 +243,24 @@ static void ggml_backend_openvino_buffer_set_tensor(ggml_backend_buffer_t buffer extra = new ggml_openvino_quantized_weight_extra(std::move(result.weights), std::move(result.scales), std::move(result.zp), result.weight_node); - if (layout.is_requant) { - GGML_LOG_DEBUG("%s: requantized %s to %s (u%d, block_size=%ld)\n", __func__, tensor->name, - extra_quant_type_name(layout.requant_type.value()), layout.is_u4 ? 4 : 8, - layout.weights_per_block); - } else { - int64_t n_blocks = ggml_nelements(tensor) / layout.weights_per_block; - GGML_LOG_DEBUG("%s: extracted quantized weight node for %s (u%d, %zu weights, %ld blocks)\n", - __func__, tensor->name, layout.is_u4 ? 4 : 8, layout.weights_size, n_blocks); - } + // if (layout.is_requant) { + // GGML_LOG_DEBUG("%s: requantized %s to %s (u%d, block_size=%ld)\n", __func__, tensor->name, + // extra_quant_type_name(layout.requant_type.value()), layout.is_u4 ? 4 : 8, + // layout.weights_per_block); + // } else { + // int64_t n_blocks = ggml_nelements(tensor) / layout.weights_per_block; + // GGML_LOG_DEBUG("%s: extracted quantized weight node for %s (u%d, %zu weights, %ld blocks)\n", + // __func__, tensor->name, layout.is_u4 ? 
4 : 8, layout.weights_size, n_blocks); + // } } else { // F16/F32/BF16 weight or F16-requant extra = new ggml_openvino_weight_extra(std::move(result.weights), result.weight_node); - if (layout.total_size > 0) { - GGML_LOG_DEBUG("%s: requantized %s to F16\n", __func__, tensor->name); - } else { - GGML_LOG_DEBUG("%s: created shared-memory weight node for %s\n", __func__, tensor->name); - } + // if (layout.total_size > 0) { + // GGML_LOG_DEBUG("%s: requantized %s to F16\n", __func__, tensor->name); + // } else { + // GGML_LOG_DEBUG("%s: created shared-memory weight node for %s\n", __func__, tensor->name); + // } } ctx->tensor_extras[tensor] = extra; @@ -271,7 +271,7 @@ static void ggml_backend_openvino_buffer_set_tensor(ggml_backend_buffer_t buffer memcpy((char *) tensor->data + offset, data, size); } } else { - // Non-weight tensor (KV cache, activations, etc.) - copy data + // Non-weight tensor (KV cache, activations, etc.) - copy data. test-backend-ops also goes here if (ctx->is_remote) { cl_command_queue queue = ggml_openvino_get_cl_queue(); auto mem_cpy_fn = ggml_openvino_get_clEnqueueMemcpyINTEL(); @@ -290,7 +290,7 @@ static void ggml_backend_openvino_buffer_set_tensor(ggml_backend_buffer_t buffer ggml_openvino_tensor_extra * extra = ggml_openvino_create_tensor_extra(tensor, ctx->is_remote); if (extra == nullptr) { - GGML_LOG_ERROR("%s: failed to create tensor extra for %s\n", __func__, tensor->name); + // GGML_LOG_ERROR("%s: failed to create tensor extra for %s\n", __func__, tensor->name); return; } @@ -795,7 +795,7 @@ static bool is_op_unsupported_case(const ggml_tensor * op) { } case GGML_OP_SOFT_MAX: { if (op->src[2] != nullptr) { - GGML_LOG_WARN("OpenVINO backend does not support SOFT_MAX with sinks\n"); + // GGML_LOG_WARN("OpenVINO backend does not support SOFT_MAX with sinks\n"); return true; } float scale = 1.0f; @@ -804,14 +804,14 @@ static bool is_op_unsupported_case(const ggml_tensor * op) { memcpy(&scale, (const float *) op_params + 0, sizeof(float)); memcpy(&max_bias, (const float *) op_params + 1, sizeof(float)); if (max_bias > 0) { - GGML_LOG_WARN("OpenVINO backend does not support SOFT_MAX with max_bias > 0\n"); + // GGML_LOG_WARN("OpenVINO backend does not support SOFT_MAX with max_bias > 0\n"); return true; } break; } case GGML_OP_FLASH_ATTN_EXT: { if (op->src[4] != nullptr) { - GGML_LOG_WARN("OpenVINO backend does not support FLASH_ATTN_EXT with sinks\n"); + // GGML_LOG_WARN("OpenVINO backend does not support FLASH_ATTN_EXT with sinks\n"); return true; } float scale = 1.0f; @@ -822,11 +822,11 @@ static bool is_op_unsupported_case(const ggml_tensor * op) { memcpy(&max_bias, (const float *) op_params + 1, sizeof(float)); memcpy(&logit_softcap, (const float *) op_params + 2, sizeof(float)); if (max_bias > 0) { - GGML_LOG_WARN("OpenVINO backend does not support FLASH_ATTN_EXT with max_bias > 0\n"); + // GGML_LOG_WARN("OpenVINO backend does not support FLASH_ATTN_EXT with max_bias > 0\n"); return true; } if (logit_softcap != 0) { - GGML_LOG_WARN("OpenVINO backend does not support FLASH_ATTN_EXT with logit_softcap != 0\n"); + // GGML_LOG_WARN("OpenVINO backend does not support FLASH_ATTN_EXT with logit_softcap != 0\n"); return true; } break; @@ -834,14 +834,14 @@ static bool is_op_unsupported_case(const ggml_tensor * op) { case GGML_OP_PERMUTE: { if (op->type == GGML_TYPE_BF16) { // err msg: [GPU] Could not find a suitable kernel for transpose - GGML_LOG_WARN("OpenVINO backend does not support PERMUTE with BF16 type\n"); + // GGML_LOG_WARN("OpenVINO backend does not support 
PERMUTE with BF16 type\n"); return true; } break; } case GGML_OP_CPY: { if (op->src[1] != op) { - GGML_LOG_WARN("OpenVINO backend only supports CPY that is a cast\n"); + // GGML_LOG_WARN("OpenVINO backend only supports CPY that is a cast\n"); return true; } break; @@ -849,7 +849,7 @@ static bool is_op_unsupported_case(const ggml_tensor * op) { case GGML_OP_MUL_MAT: { if (op->src[0]->type == GGML_TYPE_F16 && op->src[1]->type == GGML_TYPE_F16) { // Has accuracy issue, try enabling this and see `test-backend-ops -o "MUL_MAT"` - GGML_LOG_WARN("OpenVINO backend does not support MUL_MAT with two F16 tensors\n"); + // GGML_LOG_WARN("OpenVINO backend does not support MUL_MAT with two F16 tensors\n"); return true; } break; @@ -858,17 +858,17 @@ static bool is_op_unsupported_case(const ggml_tensor * op) { const int32_t * op_params = op->op_params; const int n_dims = op_params[1]; const int mode = op_params[2]; - if (mode == GGML_ROPE_TYPE_MROPE || mode == GGML_ROPE_TYPE_VISION) { - GGML_LOG_WARN("OpenVINO backend does not support ROPE with mode %d\n", mode); + if (mode != GGML_ROPE_TYPE_NORMAL && mode != GGML_ROPE_TYPE_NEOX) { + // GGML_LOG_WARN("OpenVINO backend does not support ROPE with mode %d\n", mode); return true; } if (n_dims != 0.0f && n_dims != op->src[0]->ne[0]) { - GGML_LOG_WARN("OpenVINO backend does not support ROPE with n_dims %d != src[0]->ne[0] %ld\n", n_dims, - op->src[0]->ne[0]); + // GGML_LOG_WARN("OpenVINO backend does not support ROPE with n_dims %d != src[0]->ne[0] %ld\n", n_dims, + // op->src[0]->ne[0]); return true; } if (op->type != GGML_TYPE_F32) { - GGML_LOG_WARN("OpenVINO backend does not support ROPE with type %s\n", ggml_type_name(op->type)); + // GGML_LOG_WARN("OpenVINO backend does not support ROPE with type %s\n", ggml_type_name(op->type)); return true; } float freq_scale; @@ -876,15 +876,15 @@ static bool is_op_unsupported_case(const ggml_tensor * op) { memcpy(&freq_scale, op_params + 6, sizeof(float)); memcpy(&ext_factor, op_params + 7, sizeof(float)); if (ext_factor != 0.0f) { - GGML_LOG_WARN("OpenVINO backend does not support ROPE with ext_factor %f != 0.0f\n", ext_factor); + // GGML_LOG_WARN("OpenVINO backend does not support ROPE with ext_factor %f != 0.0f\n", ext_factor); return true; } if (op->src[0]->op == GGML_OP_VIEW) { if (op->src[0]->view_src->ne[1] != op->src[0]->ne[2]) { - GGML_LOG_WARN( - "OpenVINO backend does not support ROPE with src[0]->view_src->ne[1] %ld != src[0]->ne[2] " - "%ld\n", - op->src[0]->view_src->ne[1], op->src[0]->ne[2]); + // GGML_LOG_WARN( + // "OpenVINO backend does not support ROPE with src[0]->view_src->ne[1] %ld != src[0]->ne[2] " + // "%ld\n", + // op->src[0]->view_src->ne[1], op->src[0]->ne[2]); return true; } } @@ -921,12 +921,12 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con case GGML_OP_UNARY: { auto supported = supported_unary_ops.find(ggml_get_unary_op(op)) != supported_unary_ops.end(); if (!supported) { - GGML_LOG_WARN("OpenVINO backend does not support unary op %s\n", ggml_unary_op_name(ggml_get_unary_op(op))); + // GGML_LOG_WARN("OpenVINO backend does not support unary op %s\n", ggml_unary_op_name(ggml_get_unary_op(op))); return false; } if (has_view_input(op)) { - GGML_LOG_WARN("OpenVINO backend does not support unary op %s with view input\n", - ggml_unary_op_name(ggml_get_unary_op(op))); + // GGML_LOG_WARN("OpenVINO backend does not support unary op %s with view input\n", + // ggml_unary_op_name(ggml_get_unary_op(op))); return false; } break; @@ -934,12 +934,12 @@ static bool 
ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con case GGML_OP_GLU: { auto supported = supported_glu_ops.find(ggml_get_glu_op(op)) != supported_glu_ops.end(); if (!supported) { - GGML_LOG_WARN("OpenVINO backend does not support GLU op %s\n", ggml_glu_op_name(ggml_get_glu_op(op))); + // GGML_LOG_WARN("OpenVINO backend does not support GLU op %s\n", ggml_glu_op_name(ggml_get_glu_op(op))); return false; } if (has_view_input(op)) { - GGML_LOG_WARN("OpenVINO backend does not support unary op %s with view input\n", - ggml_glu_op_name(ggml_get_glu_op(op))); + // GGML_LOG_WARN("OpenVINO backend does not support unary op %s with view input\n", + // ggml_glu_op_name(ggml_get_glu_op(op))); return false; } break; @@ -947,7 +947,7 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con default: { auto supported = supported_ops.find(op->op) != supported_ops.end(); if (!supported) { - GGML_LOG_WARN("OpenVINO backend does not support op %s\n", ggml_op_name(op->op)); + // GGML_LOG_WARN("OpenVINO backend does not support op %s\n", ggml_op_name(op->op)); return false; } static std::set ops_not_support_view_input{ @@ -955,14 +955,14 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con GGML_OP_RMS_NORM, }; if (ops_not_support_view_input.find(op->op) != ops_not_support_view_input.end() && has_view_input(op)) { - GGML_LOG_WARN("OpenVINO backend does not support op %s with view input\n", ggml_op_name(op->op)); + // GGML_LOG_WARN("OpenVINO backend does not support op %s with view input\n", ggml_op_name(op->op)); return false; } } } if (supported_types.find(op->type) == supported_types.end()) { - GGML_LOG_WARN("OpenVINO backend does not support tensor type %s\n", ggml_type_name(op->type)); + // GGML_LOG_WARN("OpenVINO backend does not support tensor type %s\n", ggml_type_name(op->type)); return false; } for (int i = 0; i < GGML_MAX_SRC; i++) { @@ -971,11 +971,11 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con break; } if (supported_types.find(src->type) == supported_types.end()) { - GGML_LOG_WARN("OpenVINO backend does not support tensor type %s\n", ggml_type_name(src->type)); + // GGML_LOG_WARN("OpenVINO backend does not support tensor type %s\n", ggml_type_name(src->type)); return false; } if (ggml_is_quantized(src->type) && src->ne[2] != 1) { - GGML_LOG_WARN("OpenVINO backend does not support 3D quantized tensors\n"); + // GGML_LOG_WARN("OpenVINO backend does not support 3D quantized tensors\n"); return false; } } diff --git a/ggml/src/ggml-openvino/openvino/op/rope.cpp b/ggml/src/ggml-openvino/openvino/op/rope.cpp index 44e3368217..22fb7e2ba2 100644 --- a/ggml/src/ggml-openvino/openvino/op/rope.cpp +++ b/ggml/src/ggml-openvino/openvino/op/rope.cpp @@ -66,10 +66,10 @@ OutputVector translate_rope(const NodeContext & context) { } const int mode = op_params[2]; + constexpr int ROPE_TYPE_NORMAL = 0; constexpr int ROPE_TYPE_NEOX = 2; - constexpr int ROPE_TYPE_NORM = 0; - if (mode == ROPE_TYPE_NORM) { + if (mode == ROPE_TYPE_NORMAL) { auto neg_one = ov::op::v0::Constant::create(ov::element::i64, {1}, {-1}); auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index a370043dd7..e79f582939 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -38,19 +38,31 @@ #pragma GCC diagnostic ignored 
"-Wdeprecated-declarations" enum ggml_status ov_graph_compute(ggml_cgraph * cgraph) { - if (getenv("GGML_OPENVINO_DUMP_CGRAPH")) { - std::string filename = "cgraph_ov.txt"; - GgmlOvDecoder::dump_cgraph(cgraph, filename); - } + try { + if (getenv("GGML_OPENVINO_DUMP_CGRAPH")) { + std::string filename = "cgraph_ov.txt"; + GgmlOvDecoder::dump_cgraph(cgraph, filename); + } - // Use device from singleton (initialized during backend init) - const auto & device = ggml_openvino_get_device_name(); - const auto is_static = ggml_openvino_is_npu(); - bool stateful = false; - if (getenv("GGML_OPENVINO_STATEFUL_EXECUTION") && !is_static) { - stateful = true; + // Use device from singleton (initialized during backend init) + const auto & device = ggml_openvino_get_device_name(); + const auto is_static = ggml_openvino_is_npu(); + bool stateful = false; + if (getenv("GGML_OPENVINO_STATEFUL_EXECUTION") && !is_static) { + stateful = true; + } + + return is_static ? ov_graph_compute_static(cgraph) : ov_graph_compute_dynamic(cgraph, device, stateful); + } catch (const ov::Exception & e) { + // GGML_LOG_ERROR("GGML OpenVINO backend ov::Exception: %s\n", e.what()); + return GGML_STATUS_FAILED; + } catch (const std::exception & e) { + // GGML_LOG_ERROR("GGML OpenVINO backend std::exception: %s\n", e.what()); + return GGML_STATUS_FAILED; + } catch (...) { + // GGML_LOG_ERROR("GGML OpenVINO backend unknown exception\n"); + return GGML_STATUS_FAILED; } - return is_static ? ov_graph_compute_static(cgraph) : ov_graph_compute_dynamic(cgraph, device, stateful); } enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, const std::string & device, bool stateful) { @@ -454,9 +466,6 @@ enum ggml_status naive_compute(ggml_cgraph * cgraph, if (cgraph->n_nodes == 1 && (cgraph->nodes[0]->op == GGML_OP_NONE || cgraph->nodes[0]->op == GGML_OP_VIEW)) { return GGML_STATUS_SUCCESS; } - if (cgraph->nodes[0]->op == GGML_OP_FLASH_ATTN_EXT) { - return GGML_STATUS_FAILED; - } auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph); auto decoder = std::make_shared(cgraph, model_weights); From 2a6a95eb778a3dfaebafae2c3eaba6fe8b959c5c Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Fri, 13 Feb 2026 15:36:51 +0800 Subject: [PATCH 253/254] Fix MUL_MAT with broadcast; Add unsupported MUL_MAT FLASH_ATTN cases --- ggml/src/ggml-openvino/ggml-openvino.cpp | 37 +++++++++++++++++-- .../openvino/op/flash_attn_ext.cpp | 4 +- ggml/src/ggml-openvino/openvino/op/mulmat.cpp | 35 +++++++++--------- ggml/src/ggml-openvino/utils.cpp | 6 +-- 4 files changed, 56 insertions(+), 26 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp index 6655db7298..780d17b750 100644 --- a/ggml/src/ggml-openvino/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino.cpp @@ -763,7 +763,7 @@ static ggml_backend_buffer_type_t ggml_backend_openvino_device_get_host_buffer_t return ggml_backend_openvino_host_buffer_type(ctx->device); } -static bool has_view_input(const ggml_tensor * op) { +static bool has_view_op_input(const ggml_tensor * op) { for (int i = 0; i < GGML_MAX_SRC; i++) { if (op->src[i] == nullptr) { break; @@ -775,6 +775,18 @@ static bool has_view_input(const ggml_tensor * op) { return false; } +static bool is_supported_flash_attn_pattern(const ggml_tensor * op) { + // pattern of q,k,v should be q->op==PERMUTE, q->src[0]->op==VIEW, q->src[0]->src[0]->view_src==nullptr + for (int i = 0; i < 3; i++) { + const ggml_tensor * src = op->src[i]; + if (src->op != GGML_OP_PERMUTE || src->src[0] == 
nullptr || src->src[0]->op != GGML_OP_VIEW ||
+            src->src[0]->src[0] == nullptr || src->src[0]->src[0]->view_src != nullptr) {
+            return false;
+        }
+    }
+    return true;
+}
+
 static bool is_op_unsupported_case(const ggml_tensor * op) {
     switch (op->op) {
     case GGML_OP_GET_ROWS:
@@ -814,6 +826,9 @@ static bool is_op_unsupported_case(const ggml_tensor * op) {
             // GGML_LOG_WARN("OpenVINO backend does not support FLASH_ATTN_EXT with sinks\n");
             return true;
         }
+        if (!is_supported_flash_attn_pattern(op)) {
+            return true;
+        }
         float scale = 1.0f;
         float max_bias = 0.0f;
         float logit_softcap = 0.0f;
@@ -852,6 +867,20 @@ static bool is_op_unsupported_case(const ggml_tensor * op) {
             // GGML_LOG_WARN("OpenVINO backend does not support MUL_MAT with two F16 tensors\n");
             return true;
         }
+        if (op->src[0]->ne[3] != op->src[1]->ne[3] && op->src[0]->ne[3] != 1 && op->src[1]->ne[3] != 1) {
+            return true;
+        }
+        if (op->src[0]->op == GGML_OP_PERMUTE || op->src[1]->op == GGML_OP_PERMUTE) {
+            return true;
+        }
+        if (ggml_is_quantized(op->src[0]->type) && op->src[0]->ne[1] == 1) {
+            // MUL_MAT(type_a=q4_0,type_b=f32,m=1,n=2048,k=8192,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1)
+            // triggers a bug in ov matmul_shape_inference.hpp
+            return true;
+        }
+        if (op->src[0]->op == GGML_OP_VIEW && op->src[1]->op == GGML_OP_VIEW) {
+            return true;
+        }
         break;
     }
     case GGML_OP_ROPE: {
@@ -924,7 +953,7 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con
             // GGML_LOG_WARN("OpenVINO backend does not support unary op %s\n", ggml_unary_op_name(ggml_get_unary_op(op)));
             return false;
         }
-        if (has_view_input(op)) {
+        if (has_view_op_input(op)) {
             // GGML_LOG_WARN("OpenVINO backend does not support unary op %s with view input\n",
             //               ggml_unary_op_name(ggml_get_unary_op(op)));
             return false;
@@ -937,7 +966,7 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con
             // GGML_LOG_WARN("OpenVINO backend does not support GLU op %s\n", ggml_glu_op_name(ggml_get_glu_op(op)));
             return false;
         }
-        if (has_view_input(op)) {
+        if (has_view_op_input(op)) {
             // GGML_LOG_WARN("OpenVINO backend does not support unary op %s with view input\n",
             //               ggml_glu_op_name(ggml_get_glu_op(op)));
             return false;
@@ -954,7 +983,7 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con
             GGML_OP_GET_ROWS,
             GGML_OP_RMS_NORM,
         };
-        if (ops_not_support_view_input.find(op->op) != ops_not_support_view_input.end() && has_view_input(op)) {
+        if (ops_not_support_view_input.find(op->op) != ops_not_support_view_input.end() && has_view_op_input(op)) {
             // GGML_LOG_WARN("OpenVINO backend does not support op %s with view input\n", ggml_op_name(op->op));
             return false;
         }
diff --git a/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp b/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp
index 342da882aa..ca9e99ff88 100644
--- a/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp
+++ b/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp
@@ -55,13 +55,13 @@ OutputVector translate_flash_attn_ext(const NodeContext & context) {
     auto tile_kv = [&](int64_t num_heads, int64_t num_heads_kv, int64_t head_size, ov::Output<ov::Node> kv) {
         int64_t factor = num_heads / num_heads_kv;
-        if (factor > 1) {
+        if (factor > 1 && num_heads_kv > 1) {
             ov::Output<ov::Node> kv_broadcast_shape, kv_unsqueezed, new_kv_shape;
             auto unsqueeze_axes = ov::op::v0::Constant::create(ov::element::i64, Shape{}, {2});
             kv_unsqueezed = std::make_shared<ov::op::v0::Unsqueeze>(kv, unsqueeze_axes);

             kv_broadcast_shape = ov::op::v0::Constant::create(
-                ov::element::i64, {5}, {(int64_t) 1, num_heads_kv, factor, (int64_t) 1, head_size});
+                ov::element::i64, {5}, {(int64_t) 1, (int64_t) 1, factor, (int64_t) 1, (int64_t) 1});
             new_kv_shape =
                 ov::op::v0::Constant::create(ov::element::i64, {4}, {(int64_t) 0, num_heads, (int64_t) -1, head_size});
diff --git a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp
index 27e4bfa460..d2483e0ab0 100644
--- a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp
+++ b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp
@@ -47,30 +47,31 @@ OutputVector translate_mulmat(const NodeContext & context) {
     auto B_shape = context.get_input_shape(0).to_shape();
     auto A_shape = context.get_input_shape(1).to_shape();

-    int64_t A_batch = A_shape[0];
-    int64_t B_batch = B_shape[0];
+    int64_t A_batch = A_shape[1];
+    int64_t B_batch = B_shape[1];
+
     auto A_batch_larger = A_batch > B_batch;
+    auto batch_large = A_batch_larger ? A_batch : B_batch;
+    auto batch_small = A_batch_larger ? B_batch : A_batch;
     Output<Node> Z = A_batch_larger ? B : A;

-    int64_t factor = A_batch_larger ? A_batch / B_batch : B_batch / A_batch;
-    if (factor > 1) {
-        // TODO code is outdated
-        auto A_batch_node = ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector<int64_t>{A_batch});
-        auto B_batch_node = ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector<int64_t>{B_batch});
+    int64_t factor = batch_large / batch_small;
+    if (factor > 1 && batch_small > 1) {
+        auto batch_large_node = ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector<int64_t>{batch_large});
+        auto batch_small_node = ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector<int64_t>{batch_small});
         auto factor_node = ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector<int64_t>{factor});
-        auto Z_last_two_dims = get_dimensions(Z.get_node_shared_ptr(), {1, 2});
-
-        auto unsqueeze_axes = ov::op::v0::Constant::create(ov::element::i64, Shape{}, {1});
+        auto unsqueeze_axes = ov::op::v0::Constant::create(ov::element::i64, Shape{}, {2});
         auto Z_unsqueezed = std::make_shared<ov::op::v0::Unsqueeze>(Z, unsqueeze_axes);

-        Output<Node> batch_small = A_batch_larger ? B_batch_node : A_batch_node;
-        Output<Node> batch_large = A_batch_larger ? A_batch_node : B_batch_node;
-        auto broadcast_shape =
-            std::make_shared<ov::op::v0::Concat>(ov::OutputVector{batch_small, factor_node, Z_last_two_dims}, 0);
-        auto Z_broadcasted = std::make_shared<ov::op::v3::Broadcast>(Z_unsqueezed, broadcast_shape);
+        auto broadcast_shape = ov::op::v0::Constant::create(
+            ov::element::i64, {5}, {(int64_t) 1, (int64_t) 1, factor, (int64_t) 1, (int64_t) 1});
+        auto new_Z_shape = ov::op::v0::Constant::create(ov::element::i64, {4},
+            {(int64_t) 0, batch_large, (int64_t) -1, (int64_t) A_shape[3]});

-        auto new_Z_shape = std::make_shared<ov::op::v0::Concat>(ov::OutputVector{batch_large, Z_last_two_dims}, 0);
-        Z = std::make_shared<ov::op::v1::Reshape>(Z_broadcasted, new_Z_shape, false);
+        auto Z_broadcasted = std::make_shared<ov::op::v3::Broadcast>(Z_unsqueezed, broadcast_shape,
+                                                                     ov::op::BroadcastType::BIDIRECTIONAL);
+        Z = std::make_shared<ov::op::v1::Reshape>(Z_broadcasted, new_Z_shape, true);
     }
     if (A_batch_larger) {
         B = Z;
diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp
index e79f582939..69fcb0eda4 100644
--- a/ggml/src/ggml-openvino/utils.cpp
+++ b/ggml/src/ggml-openvino/utils.cpp
@@ -54,13 +54,13 @@ enum ggml_status ov_graph_compute(ggml_cgraph * cgraph) {
         return is_static ?
ov_graph_compute_static(cgraph) : ov_graph_compute_dynamic(cgraph, device, stateful); } catch (const ov::Exception & e) { - // GGML_LOG_ERROR("GGML OpenVINO backend ov::Exception: %s\n", e.what()); + GGML_LOG_ERROR("GGML OpenVINO backend ov::Exception: %s\n", e.what()); return GGML_STATUS_FAILED; } catch (const std::exception & e) { - // GGML_LOG_ERROR("GGML OpenVINO backend std::exception: %s\n", e.what()); + GGML_LOG_ERROR("GGML OpenVINO backend std::exception: %s\n", e.what()); return GGML_STATUS_FAILED; } catch (...) { - // GGML_LOG_ERROR("GGML OpenVINO backend unknown exception\n"); + GGML_LOG_ERROR("GGML OpenVINO backend unknown exception\n"); return GGML_STATUS_FAILED; } } From 5525bac07812583e33ca22eccdfb64041c1a46d0 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Fri, 13 Feb 2026 17:33:07 +0800 Subject: [PATCH 254/254] Use bias instead of zp in test-backend-ops --- ggml/src/ggml-openvino/ggml-decoder.cpp | 33 ++- ggml/src/ggml-openvino/ggml-decoder.h | 10 +- .../src/ggml-openvino/ggml-openvino-extra.cpp | 10 +- ggml/src/ggml-openvino/ggml-openvino-extra.h | 4 +- ggml/src/ggml-openvino/ggml-openvino.cpp | 7 + ggml/src/ggml-openvino/ggml-quants.cpp | 218 +++++++++++------- ggml/src/ggml-openvino/ggml-quants.hpp | 23 +- ggml/src/ggml-openvino/utils.cpp | 8 +- tests/CMakeLists.txt | 4 +- 9 files changed, 205 insertions(+), 112 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 99776e1bec..107b510f3b 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -50,6 +50,7 @@ GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph, m_is_static(is_static), m_is_stateful(is_stateful), m_is_prefill(is_prefill), + m_naive(false), m_prefill_chunk_size(prefill_chunk_size), m_cgraph(cgraph), m_model_weights(model_weights), @@ -93,9 +94,10 @@ void GgmlOvDecoder::update_io(ggml_cgraph * cgraph) { GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph, std::map> & model_weights) { m_cgraph = cgraph; m_model_weights = model_weights; + m_naive = true; for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) { auto * cur_node = cgraph->nodes[node_n]; - set_input_output(cur_node, true); + set_input_output(cur_node); } for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) { m_node_info_list[node_n].node_op_case = compute_op_case(m_node_info_list[node_n].node); @@ -134,7 +136,7 @@ GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph, std::mapname); auto node_output_name = node_name; @@ -169,7 +171,7 @@ void GgmlOvDecoder::set_input_output(ggml_tensor * node, bool naive) { current_node_info.node_inputs_names.push_back(src_name); // Add model inputs - if (!naive && !src->view_src) { + if (!m_naive && !src->view_src) { ggml_backend_buffer * buffer = src->buffer; if (buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY || src->flags & GGML_TENSOR_FLAG_INPUT) { @@ -206,7 +208,7 @@ void GgmlOvDecoder::set_input_output(ggml_tensor * node, bool naive) { } // Add model outputs - if (!naive) { + if (!m_naive) { // Model outputs are tensors with GGML_TENSOR_FLAG_OUTPUT flag and kv_caches static std::set debug_output_names = {}; // Workaround: the final tensor "result_output" does not have GGML_TENSOR_FLAG_OUTPUT flag set in cgraph @@ -509,12 +511,14 @@ std::map GgmlOvDecoder::get_kv_param_res_names() const return kv_param_res_names; } -std::map> GgmlOvDecoder::create_weight_nodes(ggml_cgraph * cgraph) { +std::map> GgmlOvDecoder::create_weight_nodes(ggml_cgraph * cgraph, bool naive) { std::map> model_weights; // static 
std::mutex weights_mutex; auto * nodes = cgraph->nodes; auto n_nodes = cgraph->n_nodes; - std::for_each(std::execution::seq, nodes, nodes + n_nodes, [&](ggml_tensor * node) { + // std::for_each(std::execution::par, nodes, nodes + n_nodes, [&](ggml_tensor * node) { + for (int node_i = 0; node_i < n_nodes; node_i++) { + auto * node = nodes[node_i]; for (int i = 0; i < GGML_MAX_SRC; i++) { auto * src = node->src[i]; if (src == nullptr) { @@ -542,18 +546,19 @@ std::map> GgmlOvDecoder::create_weight_no // } // } if (model_weights.find(src_name) == model_weights.end()) { - auto weight_node = create_weight_node(src); + auto weight_node = create_weight_node(src, naive); weight_node->set_friendly_name(src_name); model_weights[src_name] = weight_node; } } } } - }); + } + // }); return model_weights; } -std::shared_ptr GgmlOvDecoder::create_weight_node(ggml_tensor * tensor) { +std::shared_ptr GgmlOvDecoder::create_weight_node(ggml_tensor * tensor, bool naive) { const bool is_ov_buffer = ggml_backend_buffer_is_openvino(tensor->buffer); // Check if we have a pre-built constant from the OpenVINO backend buffer @@ -581,6 +586,11 @@ std::shared_ptr GgmlOvDecoder::create_weight_node(ggml_tensor * tensor } } + // There are three cases where we need to create a new weight node: + // 1. weights are in openvino_host_buffer. Weight loading to host buffer will not trigger backend_buffer_set_tensor + // 2. weights are in cpu/cpu_mapped buffer. On token_embd.weight goes to case 1 or 2, depending on whether mmap or direct_io is used + // 3. test-backend-ops. buffers in test-backend-ops does not set USAGE_WEIGHT so backend_buffer_set_tensor will not create weight node + // GGML_LOG_DEBUG("%s: creating new weight node for %s\n", __func__, tensor->name); static const std::set weight_types = {GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_BF16, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1, @@ -592,6 +602,7 @@ std::shared_ptr GgmlOvDecoder::create_weight_node(ggml_tensor * tensor OvWeight ov_weight; if (ggml_is_quantized(tensor->type)) { + auto use_bias = naive; if (is_ov_buffer) { // For quantized weights, copy raw data to a temp buffer first because // process_weight_tensor reads from data and writes extracted results @@ -600,9 +611,9 @@ std::shared_ptr GgmlOvDecoder::create_weight_node(ggml_tensor * tensor size_t raw_size = ggml_nbytes(tensor); std::vector tmp(raw_size); memcpy(tmp.data(), tensor->data, raw_size); - ov_weight = process_weight_tensor(tensor, tmp.data(), tensor->data); + ov_weight = process_weight_tensor(tensor, tmp.data(), tensor->data, use_bias); } else { - ov_weight = process_weight_tensor(tensor, tensor->data, nullptr); + ov_weight = process_weight_tensor(tensor, tensor->data, nullptr, use_bias); } } else { // For non-quantized weights (F16/F32/BF16), data is already in tensor->data. 
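The use_bias path above changes how asymmetric quantized blocks are decompressed: instead of materializing a zero-point tensor and computing (q - zp) * s, the zero point is folded into an additive term, q * s + b with b = -zp * s, which yields the same value. A scalar sketch of the equivalence (types and names are illustrative, not the backend's):

    #include <cstdint>

    struct BlockParams { float scale; float zp; };  // per-block quantization parameters

    inline float dequant_with_zp(uint8_t q, BlockParams p) {
        return (static_cast<float>(q) - p.zp) * p.scale;   // Subtract, then Multiply
    }

    inline float dequant_with_bias(uint8_t q, BlockParams p) {
        const float bias = -p.zp * p.scale;                // folded once per block
        return static_cast<float>(q) * p.scale + bias;     // Multiply, then Add; same value
    }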
diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index 59311a6121..ec6062a166 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -104,7 +104,7 @@ public: virtual ov::PartialShape get_output_shape(int node_idx) const override; - virtual ov::element::Type get_output_type(const int node_idx) const override; + virtual ov::element::Type get_output_type(int node_idx) const override; virtual int32_t * get_input_op_params(int node_idx, const std::string & name) const override; @@ -184,9 +184,10 @@ public: static void dump_cgraph(const ggml_cgraph * cgraph, std::string & filename); - static std::shared_ptr create_weight_node(ggml_tensor * tensor); + static std::shared_ptr create_weight_node(ggml_tensor * tensor, bool naive = false); - static std::map> create_weight_nodes(ggml_cgraph * cgraph); + static std::map> create_weight_nodes(ggml_cgraph * cgraph, + bool naive = false); const ggml_tensor * get_tensor_used_op(const ggml_tensor * tensor) const; @@ -207,6 +208,7 @@ public: bool m_is_static = false; bool m_is_stateful = false; bool m_is_prefill = false; + bool m_naive = false; int m_prefill_chunk_size = 0; static ov::Shape get_shape(const ggml_tensor * tensor); @@ -265,7 +267,7 @@ public: } private: - void set_input_output(ggml_tensor * node, bool naive = false); + void set_input_output(ggml_tensor * node); int compute_op_case(const ggml_tensor * node) const; void validate_cgraph() const; diff --git a/ggml/src/ggml-openvino/ggml-openvino-extra.cpp b/ggml/src/ggml-openvino/ggml-openvino-extra.cpp index 7a48ed1b65..0b8b2d3743 100644 --- a/ggml/src/ggml-openvino/ggml-openvino-extra.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino-extra.cpp @@ -6,6 +6,7 @@ #include #include #include +#include ov::Core & ov_singleton_core() { static ov::Core core; @@ -164,7 +165,7 @@ clEnqueueMemcpyINTEL_fn ggml_openvino_get_clEnqueueMemcpyINTEL() { } // Get requantization type for a tensor type (returns nullopt if no requant needed) -std::optional ggml_openvino_get_requant_type(const ggml_tensor * tensor) { +std::optional ggml_openvino_get_requant_type(const ggml_tensor * tensor, bool no_requant) { if (strncmp(tensor->name, "token_embd.weight", 17) == 0) { return ((ggml_openvino_is_npu() && tensor->type == GGML_TYPE_Q6_K) ? 
diff --git a/ggml/src/ggml-openvino/ggml-openvino-extra.cpp b/ggml/src/ggml-openvino/ggml-openvino-extra.cpp
index 7a48ed1b65..0b8b2d3743 100644
--- a/ggml/src/ggml-openvino/ggml-openvino-extra.cpp
+++ b/ggml/src/ggml-openvino/ggml-openvino-extra.cpp
@@ -6,6 +6,7 @@
 #include
 #include
 #include
+#include
 
 ov::Core & ov_singleton_core() {
     static ov::Core core;
@@ -164,7 +165,7 @@ clEnqueueMemcpyINTEL_fn ggml_openvino_get_clEnqueueMemcpyINTEL() {
 }
 
 // Get requantization type for a tensor type (returns nullopt if no requant needed)
-std::optional<ExtraQuantType> ggml_openvino_get_requant_type(const ggml_tensor * tensor) {
+std::optional<ExtraQuantType> ggml_openvino_get_requant_type(const ggml_tensor * tensor, bool no_requant) {
     if (strncmp(tensor->name, "token_embd.weight", 17) == 0) {
         return ((ggml_openvino_is_npu() && tensor->type == GGML_TYPE_Q6_K) ? ExtraQuantType::F16 :
                                                                              ExtraQuantType::Q8_0_C);
     }
@@ -174,6 +175,9 @@ std::optional<ExtraQuantType> ggml_openvino_get_requant_type(const ggml_tensor *
     if (ggml_openvino_is_npu()) {
         return ExtraQuantType::Q4_0_128;
     }
+    if (no_requant) {
+        return std::nullopt;
+    }
     switch (tensor->type) {
     case GGML_TYPE_Q6_K:
     case GGML_TYPE_Q5_K:
@@ -187,7 +191,7 @@ std::optional<ExtraQuantType> ggml_openvino_get_requant_type(const ggml_tensor *
 // Extracted Layout Calculation
 // =====================================================
 
-ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_tensor * tensor) {
+ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_tensor * tensor, bool use_bias) {
     ggml_openvino_extracted_layout layout = {};
     layout.is_symmetric = false;
 
@@ -204,7 +208,7 @@ ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_ten
     const size_t alignment = 64;  // Good for SIMD
 
     // Check if requantization is needed (NPU-specific)
-    auto requant_type = ggml_openvino_get_requant_type(tensor);
+    auto requant_type = ggml_openvino_get_requant_type(tensor, use_bias);
     if (requant_type.has_value()) {
         layout.is_requant = true;
         layout.requant_type = requant_type;
diff --git a/ggml/src/ggml-openvino/ggml-openvino-extra.h b/ggml/src/ggml-openvino/ggml-openvino-extra.h
index 9ce4667154..441a62e9d3 100644
--- a/ggml/src/ggml-openvino/ggml-openvino-extra.h
+++ b/ggml/src/ggml-openvino/ggml-openvino-extra.h
@@ -83,7 +83,7 @@ const std::string & ggml_openvino_get_device_name();
 bool ggml_openvino_is_npu();
 
 // Get requantization type for a tensor type (returns nullopt if no requant needed)
-std::optional<ExtraQuantType> ggml_openvino_get_requant_type(const ggml_tensor * tensor);
+std::optional<ExtraQuantType> ggml_openvino_get_requant_type(const ggml_tensor * tensor, bool no_requant = false);
 
 // =====================================================
 // OpenVINO Tensor Extra Types
 // =====================================================
@@ -160,7 +160,7 @@ struct ggml_openvino_extracted_layout {
 };
 
 // Calculate the buffer layout for extracted quantized data
-ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_tensor * tensor);
+ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_tensor * tensor, bool use_bias = false);
 
 ggml_openvino_tensor_extra * ggml_openvino_create_tensor_extra(const ggml_tensor * tensor, bool is_remote);
diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp
index 780d17b750..948ff2cc78 100644
--- a/ggml/src/ggml-openvino/ggml-openvino.cpp
+++ b/ggml/src/ggml-openvino/ggml-openvino.cpp
@@ -922,6 +922,13 @@ static bool is_op_unsupported_case(const ggml_tensor * op) {
     default:
         break;
     }
 
+    if (op->op == GGML_OP_GET_ROWS) {
+        if (op->ne[0] == 256 && (op->src[0]->type == GGML_TYPE_Q4_K || op->src[0]->type == GGML_TYPE_Q5_K)) {
+            // ERR = 0.000000306 > 0.000000100   GET_ROWS(type=q4_K,n=256,m=5,r=4,be1=1,be2=1,v=0)
+            // ERR = 0.000000197 > 0.000000100   GET_ROWS(type=q5_K,n=256,m=5,r=4,be1=1,be2=1,v=0)
+            return true;
+        }
+    }
     return false;
 }
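A worked example of why a quantized zero point loses a little accuracy for the asymmetric K-quants (illustrative numbers, not taken from the test log): consider a block with scale s = 0.1 and minimum min = -0.37. The extractors below compute zp = round(-min / s) = round(3.7) = 4, so dequantization applies an offset of -zp * s = -0.4 instead of -0.37, shifting every weight in that block by 0.03. Small systematic offsets of this kind are consistent with the ERR values recorded above against the 1e-7 threshold, and they are the motivation for the f16 bias path added to ggml-quants.cpp next.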
diff --git a/ggml/src/ggml-openvino/ggml-quants.cpp b/ggml/src/ggml-openvino/ggml-quants.cpp
index 10909cbc1e..3628f7a959 100644
--- a/ggml/src/ggml-openvino/ggml-quants.cpp
+++ b/ggml/src/ggml-openvino/ggml-quants.cpp
@@ -11,6 +11,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -18,6 +19,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -82,28 +84,41 @@ void extract_q4_0_data(const ggml_tensor * tensor,
 void extract_q4_1_data(const ggml_tensor * tensor,
                        ov::Tensor & weights_arr,
                        ov::Tensor & scales_arr,
-                       ov::Tensor & zp_arr) {
+                       ov::Tensor & zp_arr,
+                       bool use_bias) {
     const uint64_t bytes_per_block = 20;  // 2 bytes scale, 2 bytes min, 32x0.5 byte weights
     auto * data = static_cast<uint8_t *>(tensor->data);
     auto * weights = static_cast<uint8_t *>(weights_arr.data());
     auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
-    auto * zp = static_cast<uint8_t *>(zp_arr.data());
 
-    ov::parallel_for(scales_arr.get_size(), [&](size_t i) {
-        float scale = static_cast<float>(ov::float16::from_bits(*((uint16_t *) (data + i * bytes_per_block))));
-        float min = static_cast<float>(ov::float16::from_bits(*((uint16_t *) (data + i * bytes_per_block + 2))));
-        scales[i] = ov::float16(scale);
-        // zp = -min / scale (bias = min, so zp = -bias/scale)
-        uint8_t zp_val = (scale != 0.0f) ? (uint8_t) std::round(-min / scale) : 0;
-        // Pack two 4-bit zero points per byte
-        if (i % 2 == 0) {
-            zp[i / 2] = zp_val & 0x0F;  // Lower nibble
-        } else {
-            zp[i / 2] |= (zp_val << 4);  // Upper nibble
-        }
-        unpack_32_4(data + i * bytes_per_block + 4, weights + i * 16);
-    });
+    if (use_bias) {
+        // Store bias (min) directly as f16 instead of computing u4 zero points
+        auto * bias = zp_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
+        ov::parallel_for(scales_arr.get_size(), [&](size_t i) {
+            float scale = static_cast<float>(ov::float16::from_bits(*((uint16_t *) (data + i * bytes_per_block))));
+            float min = static_cast<float>(ov::float16::from_bits(*((uint16_t *) (data + i * bytes_per_block + 2))));
+            scales[i] = ov::float16(scale);
+            bias[i] = ov::float16(min);  // bias = min, dequant: w*s + bias
+            unpack_32_4(data + i * bytes_per_block + 4, weights + i * 16);
+        });
+    } else {
+        auto * zp = static_cast<uint8_t *>(zp_arr.data());
+        ov::parallel_for(scales_arr.get_size(), [&](size_t i) {
+            float scale = static_cast<float>(ov::float16::from_bits(*((uint16_t *) (data + i * bytes_per_block))));
+            float min = static_cast<float>(ov::float16::from_bits(*((uint16_t *) (data + i * bytes_per_block + 2))));
+            scales[i] = ov::float16(scale);
+            // zp = -min / scale (bias = min, so zp = -bias/scale)
+            uint8_t zp_val = (scale != 0.0f) ? (uint8_t) std::round(-min / scale) : 0;
+            // Pack two 4-bit zero points per byte
+            if (i % 2 == 0) {
+                zp[i / 2] = zp_val & 0x0F;  // Lower nibble
+            } else {
+                zp[i / 2] |= (zp_val << 4);  // Upper nibble
+            }
+            unpack_32_4(data + i * bytes_per_block + 4, weights + i * 16);
+        });
+    }
 }
 
 // Extracts (weight, scales, zp) from Q8_0 tensors.
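To make the 20-byte block layout above concrete, here is a minimal standalone sketch of Q4_1 dequantization for one block. It follows ggml's reference nibble order (low nibbles are weights 0-15, high nibbles are weights 16-31); unpack_32_4 in this file may store them in a different order, and f16_to_float stands in for ov::float16::from_bits:

    #include <cstdint>

    float f16_to_float(uint16_t bits);  // assumed helper, e.g. ov::float16::from_bits

    // One Q4_1 block: bytes 0-1 f16 scale d, bytes 2-3 f16 min m, bytes 4-19 qs[16].
    void dequant_q4_1_block(const uint8_t * block, float * out /* 32 values */) {
        const float d = f16_to_float(*(const uint16_t *) (block + 0));
        const float m = f16_to_float(*(const uint16_t *) (block + 2));
        const uint8_t * qs = block + 4;
        for (int l = 0; l < 16; ++l) {
            out[l]      = (qs[l] & 0x0F) * d + m;  // low nibble
            out[l + 16] = (qs[l] >> 4)   * d + m;  // high nibble
        }
    }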
@@ -164,14 +179,18 @@ void unpack_256_4(const uint8_t * data, uint8_t * dst) {
 void extract_q4_k_data(const ggml_tensor * tensor,
                        ov::Tensor & weights_arr,
                        ov::Tensor & scales_arr,
-                       ov::Tensor & zp_arr) {
+                       ov::Tensor & zp_arr,
+                       bool use_bias) {
     const uint64_t bytes_per_block = 2 + 2 + 12 + 128;
     const uint64_t n_super_block = tensor->nb[3] / bytes_per_block;
     auto * data = static_cast<uint8_t *>(tensor->data);
     auto * weights = static_cast<uint8_t *>(weights_arr.data());
     auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
-    auto * zp = static_cast<uint8_t *>(zp_arr.data());
+
+    // For bias path, zp_arr holds f16 bias values; for zp path, it holds packed u4 zero points
+    auto * zp_u4 = use_bias ? nullptr : static_cast<uint8_t *>(zp_arr.data());
+    auto * bias_f16 = use_bias ? zp_arr.data<ov::element_type_traits<ov::element::f16>::value_type>() : nullptr;
 
     ov::parallel_for(n_super_block, [&](size_t i) {
         uint8_t * block_data = data + i * bytes_per_block;
@@ -205,17 +224,22 @@ void extract_q4_k_data(const ggml_tensor * tensor,
         min_vals[6] = scale_mins * static_cast<float>((*(qs1 + 10) >> 4) | ((*(qs1 + 6) >> 6) << 4));
         min_vals[7] = scale_mins * static_cast<float>((*(qs1 + 11) >> 4) | ((*(qs1 + 7) >> 6) << 4));
 
-        // Store scales and compute zero points
+        // Store scales and compute zero points or bias
         for (int j = 0; j < 8; j++) {
             scales[i * 8 + j] = ov::float16(scale_vals[j]);
-            // zp = min / scale (since bias = -min and zp = -bias/scale)
-            uint8_t zp_val = (scale_vals[j] != 0.0f) ? (uint8_t) std::round(min_vals[j] / scale_vals[j]) : 0;
-            // Pack two 4-bit zero points per byte
-            size_t idx = i * 8 + j;
-            if (idx % 2 == 0) {
-                zp[idx / 2] = zp_val & 0x0F;
+            if (use_bias) {
+                // Store bias = -min directly as f16, dequant: w*s + bias
+                bias_f16[i * 8 + j] = ov::float16(-min_vals[j]);
             } else {
-                zp[idx / 2] |= (zp_val << 4);
+                // zp = min / scale (since bias = -min and zp = -bias/scale)
+                uint8_t zp_val = (scale_vals[j] != 0.0f) ? (uint8_t) std::round(min_vals[j] / scale_vals[j]) : 0;
+                // Pack two 4-bit zero points per byte
+                size_t idx = i * 8 + j;
+                if (idx % 2 == 0) {
+                    zp_u4[idx / 2] = zp_val & 0x0F;
+                } else {
+                    zp_u4[idx / 2] |= (zp_val << 4);
+                }
             }
         }
         unpack_256_4(block_data + 16, weights + i * 128);
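Unpacking the magic numbers in the Q4_K extractor above: bytes_per_block = 2 + 2 + 12 + 128 is one 144-byte super-block holding 256 weights, i.e. a 2-byte f16 super-scale d, a 2-byte f16 super-min dmin, 12 bytes of packed 6-bit sub-scales and sub-mins for 8 sub-blocks of 32 weights each, and 128 bytes of 4-bit quants. The effective per-sub-block dequantization is

    w = (d * sc_j) * q - (dmin * m_j),    j = 0..7

which is why the loop stores scales[i*8 + j] = d * sc_j and, on the bias path, bias = -(dmin * m_j).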
@@ -285,14 +309,18 @@ static inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t * d, uint8
 
 void extract_q5_k_data(const ggml_tensor * tensor,
                        ov::Tensor & weights_arr,
                        ov::Tensor & scales_arr,
-                       ov::Tensor & zp_arr) {
+                       ov::Tensor & zp_arr,
+                       bool use_bias) {
     const uint64_t bytes_per_block = 4 + 12 + 32 + 128;
     const uint64_t n_super_block = tensor->nb[3] / bytes_per_block;
     auto * data = static_cast<uint8_t *>(tensor->data);
     auto * weights = static_cast<uint8_t *>(weights_arr.data());
     auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
-    auto * zp = static_cast<uint8_t *>(zp_arr.data());
+
+    // For bias path, zp_arr holds f16 bias values; for zp path, it holds u8 zero points
+    auto * zp_u8 = use_bias ? nullptr : static_cast<uint8_t *>(zp_arr.data());
+    auto * bias_f16 = use_bias ? zp_arr.data<ov::element_type_traits<ov::element::f16>::value_type>() : nullptr;
 
     ov::parallel_for(n_super_block, [&](size_t i) {
         uint8_t * block_data = data + i * bytes_per_block;
@@ -325,9 +353,15 @@ void extract_q5_k_data(const ggml_tensor * tensor,
             scales[i * 8 + is] = ov::float16(d1);
             scales[i * 8 + is + 1] = ov::float16(d2);
 
-            // zp = min / scale (since bias = -min and zp = -bias/scale)
-            zp[i * 8 + is] = (d1 != 0.0f) ? (uint8_t) std::round(m1 / d1) : 0;
-            zp[i * 8 + is + 1] = (d2 != 0.0f) ? (uint8_t) std::round(m2 / d2) : 0;
+            if (use_bias) {
+                // Store bias = -min directly as f16, dequant: w*s + bias
+                bias_f16[i * 8 + is] = ov::float16(-m1);
+                bias_f16[i * 8 + is + 1] = ov::float16(-m2);
+            } else {
+                // zp = min / scale (since bias = -min and zp = -bias/scale)
+                zp_u8[i * 8 + is] = (d1 != 0.0f) ? (uint8_t) std::round(m1 / d1) : 0;
+                zp_u8[i * 8 + is + 1] = (d2 != 0.0f) ? (uint8_t) std::round(m2 / d2) : 0;
+            }
 
             // Extract weights for first 32 elements (matching deq formula exactly)
             for (int l = 0; l < 32; ++l) {
@@ -349,10 +383,14 @@ void extract_q5_k_data(const ggml_tensor * tensor,
 
 // TODO Reorder for make_intX_weights
 
-ov::Output<ov::Node> make_int8_weights(ov::Tensor & weight, ov::Tensor & scales, ov::Tensor & zp, size_t group_size) {
+ov::Output<ov::Node> make_int8_weights(ov::Tensor & weight,
+                                       ov::Tensor & scales,
+                                       ov::Tensor & zp,
+                                       size_t group_size,
+                                       bool use_bias) {
     ov::Shape orig_shape = weight.get_shape();
 
-    // Expand dimensions for scales and zp
+    // Expand dimensions for scales and zp/bias
     auto scale_shape = scales.get_shape();
     auto zp_shape = zp.get_shape();
     bool is_scalar_zp = zp_shape.empty();  // Symmetric quantization
@@ -377,36 +415,45 @@ ov::Output<ov::Node> make_int8_weights(ov::Tensor & weight, ov::Tensor & scales,
                                                            static_cast<uint8_t *>(weight.data()), nullptr);
     weights_node->get_rt_info()["__gguf_tensor_holder"] = weight;
     auto scales_f16 = std::make_shared<ov::op::v0::Constant>(scales);
-
-    // Zero point is already in U8 format from extraction
-    auto zero_point = std::make_shared<ov::op::v0::Constant>(zp);
-    float zp_value;
-    if (ov::op::util::get_single_value(zero_point, zp_value)) {
-        zero_point = ov::op::v0::Constant::create(zero_point->get_element_type(), {}, {zp_value});
-    }
-
-    // Quantization operations
     auto weights_f16 = std::make_shared<ov::op::v0::Convert>(weights_node, ov::element::f16);
-    auto zero_point_f16 = std::make_shared<ov::op::v0::Convert>(zero_point, ov::element::f16);
-    auto w_zp = std::make_shared<ov::op::v1::Subtract>(weights_f16, zero_point_f16, ov::op::AutoBroadcastType::NUMPY);
-    ov::Output<ov::Node> w_zp_s =
-        std::make_shared<ov::op::v1::Multiply>(w_zp, scales_f16, ov::op::AutoBroadcastType::NUMPY);
+    ov::Output<ov::Node> result;
+    if (use_bias && !is_scalar_zp) {
+        // Bias path: w * s + b (zp tensor holds f16 bias values)
+        auto bias_f16 = std::make_shared<ov::op::v0::Constant>(zp);
+        auto w_s = std::make_shared<ov::op::v1::Multiply>(weights_f16, scales_f16, ov::op::AutoBroadcastType::NUMPY);
+        result = std::make_shared<ov::op::v1::Add>(w_s, bias_f16, ov::op::AutoBroadcastType::NUMPY);
+    } else {
+        // Zero point path: (w - zp) * s
+        auto zero_point = std::make_shared<ov::op::v0::Constant>(zp);
+        float zp_value;
+        if (ov::op::util::get_single_value(zero_point, zp_value)) {
+            zero_point = ov::op::v0::Constant::create(zero_point->get_element_type(), {}, {zp_value});
+        }
+        auto zero_point_f16 = std::make_shared<ov::op::v0::Convert>(zero_point, ov::element::f16);
+        auto w_zp =
+            std::make_shared<ov::op::v1::Subtract>(weights_f16, zero_point_f16, ov::op::AutoBroadcastType::NUMPY);
+        result = std::make_shared<ov::op::v1::Multiply>(w_zp, scales_f16, ov::op::AutoBroadcastType::NUMPY);
+    }
 
     if (packed_shape.size() != 2) {
         // If not requantized channel-wise case, reshape back to original shape
         auto final_shape = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{orig_shape.size()}, orig_shape);
-        w_zp_s = std::make_shared<ov::op::v1::Reshape>(w_zp_s, final_shape, false);
+        result = std::make_shared<ov::op::v1::Reshape>(result, final_shape, false);
     }
-    return std::make_shared<ov::op::v0::Convert>(w_zp_s, ov::element::f32);
+    return std::make_shared<ov::op::v0::Convert>(result, ov::element::f32);
 }
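make_int8_weights above and make_int4_weights below build the same dequantization subgraph and differ only in the packed weight element type (u8 vs u4). As a sketch of the two node chains (the Reshape only appears when packed_shape is not already 2-D):

    zero-point path: Constant(w) -> Convert(f16) -> Subtract(zp_f16) -> Multiply(scales_f16) -> [Reshape] -> Convert(f32)
    bias path:       Constant(w) -> Convert(f16) -> Multiply(scales_f16) -> Add(bias_f16) -> [Reshape] -> Convert(f32)

Symmetric types (scalar zp) always take the zero-point path, which is why the bias branch checks !is_scalar_zp.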
-ov::Output<ov::Node> make_int4_weights(ov::Tensor & weight, ov::Tensor & scales, ov::Tensor & zp, size_t group_size) {
+ov::Output<ov::Node> make_int4_weights(ov::Tensor & weight,
+                                       ov::Tensor & scales,
+                                       ov::Tensor & zp,
+                                       size_t group_size,
+                                       bool use_bias) {
     ov::Shape orig_weight_shape = weight.get_shape();
 
-    // Expand dimensions for scales and zp
+    // Expand dimensions for scales and zp/bias
     ov::Shape scale_shape = scales.get_shape();
     auto zp_shape = zp.get_shape();
     bool is_scalar_zp = zp_shape.empty();  // Symmetric quantization
@@ -431,32 +478,35 @@ ov::Output<ov::Node> make_int4_weights(ov::Tensor & weight, ov::Tensor & scales,
                                                            static_cast<uint8_t *>(weight.data()), nullptr);
     weights_node->get_rt_info()["__gguf_tensor_holder"] = weight;
     auto weights_f16 = std::make_shared<ov::op::v0::Convert>(weights_node, ov::element::f16);
-
-    // Zero point is already in U4 format from extraction
-    auto zero_points_node = std::make_shared<ov::op::v0::Constant>(zp);
-    float zp_value;
-    if (ov::op::util::get_single_value(zero_points_node, zp_value)) {
-        zero_points_node = ov::op::v0::Constant::create(zero_points_node->get_element_type(), {}, {zp_value});
-    }
-    auto zero_points_f16 = std::make_shared<ov::op::v0::Convert>(zero_points_node, ov::element::f16);
-
     auto scales_f16 = std::make_shared<ov::op::v0::Constant>(scales);
 
-    // Perform dequantization
-    auto w_zp = std::make_shared<ov::op::v1::Subtract>(weights_f16, zero_points_f16, ov::op::AutoBroadcastType::NUMPY);
-
-    ov::Output<ov::Node> w_zp_s =
-        std::make_shared<ov::op::v1::Multiply>(w_zp, scales_f16, ov::op::AutoBroadcastType::NUMPY);
+    ov::Output<ov::Node> result;
+    if (use_bias && !is_scalar_zp) {
+        // Bias path: w * s + b (zp tensor holds f16 bias values)
+        auto bias_f16 = std::make_shared<ov::op::v0::Constant>(zp);
+        auto w_s = std::make_shared<ov::op::v1::Multiply>(weights_f16, scales_f16, ov::op::AutoBroadcastType::NUMPY);
+        result = std::make_shared<ov::op::v1::Add>(w_s, bias_f16, ov::op::AutoBroadcastType::NUMPY);
+    } else {
+        // Zero point path: (w - zp) * s
+        auto zero_points_node = std::make_shared<ov::op::v0::Constant>(zp);
+        float zp_value;
+        if (ov::op::util::get_single_value(zero_points_node, zp_value)) {
+            zero_points_node = ov::op::v0::Constant::create(zero_points_node->get_element_type(), {}, {zp_value});
+        }
+        auto zero_points_f16 = std::make_shared<ov::op::v0::Convert>(zero_points_node, ov::element::f16);
+        auto w_zp =
+            std::make_shared<ov::op::v1::Subtract>(weights_f16, zero_points_f16, ov::op::AutoBroadcastType::NUMPY);
+        result = std::make_shared<ov::op::v1::Multiply>(w_zp, scales_f16, ov::op::AutoBroadcastType::NUMPY);
+    }
 
     if (packed_shape.size() != 2) {
         // If not requantized channel-wise case, reshape back to original shape
         auto final_shape = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{orig_weight_shape.size()},
                                                                   orig_weight_shape);
-
-        w_zp_s = std::make_shared<ov::op::v1::Reshape>(w_zp_s, final_shape, false);
+        result = std::make_shared<ov::op::v1::Reshape>(result, final_shape, false);
     }
 
-    return std::make_shared<ov::op::v0::Convert>(w_zp_s, ov::element::f32);
+    return std::make_shared<ov::op::v0::Convert>(result, ov::element::f32);
 }
 
 // Extract quantized weights from tensor and create weight subgraph
@@ -464,7 +514,8 @@ std::shared_ptr<ov::Node> extract_quantized_weights(const ggml_tensor * tensor,
                                                     const void * data,
                                                     ov::Tensor & weights,
                                                     ov::Tensor & scales,
-                                                    ov::Tensor & zp) {
+                                                    ov::Tensor & zp,
+                                                    bool use_bias) {
     // Create a temporary tensor for extraction functions that read from tensor->data
     ggml_tensor temp_tensor = *tensor;
     temp_tensor.data = const_cast<void *>(data);
@@ -499,10 +550,10 @@ std::shared_ptr<ov::Node> extract_quantized_weights(const ggml_tensor * tensor,
         extract_q4_0_data(&temp_tensor, weights, scales, zp);
         break;
     case GGML_TYPE_Q4_1:
-        extract_q4_1_data(&temp_tensor, weights, scales, zp);
+        extract_q4_1_data(&temp_tensor, weights, scales, zp, use_bias);
         break;
     case GGML_TYPE_Q4_K:
-        extract_q4_k_data(&temp_tensor, weights, scales, zp);
+        extract_q4_k_data(&temp_tensor, weights, scales, zp, use_bias);
         break;
     case GGML_TYPE_Q8_0:
         extract_q8_0_data(&temp_tensor, weights, scales, zp);
@@ -511,7 +562,7 @@ std::shared_ptr<ov::Node> extract_quantized_weights(const ggml_tensor * tensor,
         extract_q6_k_data(&temp_tensor, weights, scales, zp);
         break;
     case GGML_TYPE_Q5_K:
-        extract_q5_k_data(&temp_tensor, weights, scales, zp);
+        extract_q5_k_data(&temp_tensor, weights, scales, zp, use_bias);
         break;
     default:
         throw std::runtime_error("Unsupported quantized type: " + std::string(ggml_type_name(tensor->type)));
@@ -520,9 +571,9 @@ std::shared_ptr<ov::Node> extract_quantized_weights(const ggml_tensor * tensor,
     // Create the OpenVINO weight subgraph
     ov::Output<ov::Node> weight_node;
     if (is_u4) {
-        weight_node = make_int4_weights(weights, scales, zp, weights_per_block);
+        weight_node = make_int4_weights(weights, scales, zp, weights_per_block, use_bias);
     } else {
-        weight_node = make_int8_weights(weights, scales, zp, weights_per_block);
+        weight_node = make_int8_weights(weights, scales, zp, weights_per_block, use_bias);
     }
 
     auto result = weight_node.get_node_shared_ptr();
@@ -576,7 +627,7 @@ std::shared_ptr<ov::Node> requantize_to_buffers(const ggml_tensor * tensor,
     return result;
 }
 
-OvWeight process_weight_tensor(const ggml_tensor * tensor, const void * data, void * output_base_ptr) {
+OvWeight process_weight_tensor(const ggml_tensor * tensor, const void * data, void * output_base_ptr, bool use_bias) {
     GGML_ASSERT(tensor != nullptr);
     GGML_ASSERT(data != nullptr);
 
@@ -619,12 +670,19 @@ OvWeight process_weight_tensor(const ggml_tensor * tensor, const void * data, vo
         OPENVINO_THROW("Unsupported weight tensor type: ", ggml_type_name(tensor->type));
     }
 
-    result.layout = ggml_openvino_get_extracted_layout(tensor);
+    result.layout = ggml_openvino_get_extracted_layout(tensor, use_bias);
     const auto & layout = result.layout;
 
     if (layout.total_size == 0) {
         OPENVINO_THROW("Unsupported quantized type: ", ggml_type_name(tensor->type));
     }
 
+    if (use_bias) {
+        OPENVINO_ASSERT(!layout.is_requant,
+                        "use_bias is only used for test-backend-ops, which should not have requantization");
+        // bias node will be created on the fly and not use backend buffer
+        output_base_ptr = nullptr;
+    }
+
     // F16 requant path - no separate scales/zp needed in result
     if (layout.is_requant && layout.requant_type.has_value() && layout.requant_type.value() == ExtraQuantType::F16) {
         if (output_base_ptr) {
@@ -653,14 +711,20 @@ OvWeight process_weight_tensor(const ggml_tensor * tensor, const void * data, vo
     } else {
         result.weights = ov::Tensor(weight_type, node_shape);
         result.scales = ov::Tensor(ov::element::f16, scale_shape);
-        result.zp = ov::Tensor(weight_type, zp_shape);
+        if (use_bias && !layout.is_symmetric) {
+            // bias only has effect for asymmetric quant
+            result.zp = ov::Tensor(ov::element::f16, zp_shape);
+        } else {
+            result.zp = ov::Tensor(weight_type, zp_shape);
+        }
     }
 
     if (layout.is_requant && layout.requant_type.has_value()) {
        result.weight_node = requantize_to_buffers(tensor, data, layout.requant_type.value(),
                                                   layout.weights_per_block, result.weights, result.scales, result.zp);
     } else {
-        result.weight_node = extract_quantized_weights(tensor, data, result.weights, result.scales, result.zp);
+        result.weight_node =
+            extract_quantized_weights(tensor, data, result.weights, result.scales, result.zp, use_bias);
     }
 
     return result;
diff --git a/ggml/src/ggml-openvino/ggml-quants.hpp b/ggml/src/ggml-openvino/ggml-quants.hpp
index 600b9c9f29..e4a02297ca 100644
--- a/ggml/src/ggml-openvino/ggml-quants.hpp
+++ b/ggml/src/ggml-openvino/ggml-quants.hpp
@@ -16,7 +16,8 @@ void extract_q4_0_data(const ggml_tensor * tensor,
 void extract_q4_1_data(const ggml_tensor * tensor,
                        ov::Tensor & weights_arr,
                        ov::Tensor & scales_arr,
-                       ov::Tensor & zp_arr);
+                       ov::Tensor & zp_arr,
+                       bool use_bias = false);
 
 void extract_q8_0_data(const ggml_tensor * tensor,
                        ov::Tensor & weights_arr,
@@ -28,12 +29,14 @@ void unpack_256_4(const uint8_t* data, uint8_t* dst);
 void extract_q4_k_data(const ggml_tensor * tensor,
                        ov::Tensor & weights_arr,
                        ov::Tensor & scales_arr,
-                       ov::Tensor & zp_arr);
+                       ov::Tensor & zp_arr,
+                       bool use_bias = false);
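One subtlety in these declarations: zp_arr is deliberately overloaded. By default it receives packed u4 (Q4_1/Q4_K) or plain u8 (Q5_K) zero points; with use_bias it receives f16 biases instead, and the caller sizes and types it accordingly, as the process_weight_tensor hunk above does:

    // Allocation of result.zp in process_weight_tensor (restating the hunk above):
    // use_bias && !layout.is_symmetric  ->  ov::Tensor(ov::element::f16, zp_shape)  // f16 biases
    // otherwise                         ->  ov::Tensor(weight_type, zp_shape)       // quantized zero points

The symmetric extractors (Q4_0, Q8_0, Q6_K) keep their old four-argument form, since per the comment above a bias only has an effect for asymmetric quantization.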
 void extract_q5_k_data(const ggml_tensor * tensor,
                        ov::Tensor & weights_arr,
                        ov::Tensor & scales_arr,
-                       ov::Tensor & zp_arr);
+                       ov::Tensor & zp_arr,
+                       bool use_bias = false);
 
 void extract_q6_k_data(const ggml_tensor * tensor,
                        ov::Tensor & weights_arr,
@@ -45,12 +48,14 @@ static constexpr size_t GGML_QUANTIZATION_GROUP_SIZE = 32;
 
 ov::Output<ov::Node> make_int8_weights(ov::Tensor & weight,
                                        ov::Tensor & scales,
                                        ov::Tensor & zp,
-                                       size_t group_size = GGML_QUANTIZATION_GROUP_SIZE);
+                                       size_t group_size = GGML_QUANTIZATION_GROUP_SIZE,
+                                       bool use_bias = false);
 
 ov::Output<ov::Node> make_int4_weights(ov::Tensor & weight,
                                        ov::Tensor & scales,
                                        ov::Tensor & zp,
-                                       size_t group_size = GGML_QUANTIZATION_GROUP_SIZE);
+                                       size_t group_size = GGML_QUANTIZATION_GROUP_SIZE,
+                                       bool use_bias = false);
 
 // Extract quantized weights from tensor and create weight subgraph
 // If weights/scales/zp are provided (non-empty), uses them as output buffers
@@ -61,7 +66,8 @@ std::shared_ptr<ov::Node> extract_quantized_weights(
     const void * data,  // Source data pointer (may differ from tensor->data)
     ov::Tensor & weights,
     ov::Tensor & scales,
-    ov::Tensor & zp);
+    ov::Tensor & zp,
+    bool use_bias = false);  // Use fp bias instead of quantized zero_point (for test-backend-ops)
 
 // Requantize weights from tensor to target format, writing to provided buffers
 // For F16 target, only weights buffer is used (scales/zp ignored)
@@ -112,8 +118,9 @@ struct OvWeight {
 // Returns OvWeight with the weight node and optional quantized tensors
 OvWeight process_weight_tensor(
     const ggml_tensor * tensor,
-    const void * data,                  // Source data pointer (may differ from tensor->data)
-    void * output_base_ptr = nullptr);  // Base pointer for output buffers (or nullptr for internal allocation)
+    const void * data,                  // Source data pointer (may differ from tensor->data)
+    void * output_base_ptr = nullptr,   // Base pointer for output buffers (or nullptr for internal allocation)
+    bool use_bias = false);             // Use fp bias instead of quantized zero_point, only used in test-backend-ops
 
 void quantize_q4_0(const float * x,
                    ov::Tensor & weights_arr,
diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp
index 69fcb0eda4..41fbf27383 100644
--- a/ggml/src/ggml-openvino/utils.cpp
+++ b/ggml/src/ggml-openvino/utils.cpp
@@ -127,7 +127,7 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, const std::strin
         if (pos_data[0] == 0) {
             infer_request->reset_state();
             stateful_kv_size = pos_shape[3];
-        } else if (stateful_kv_size == pos_data[0]) {
+        } else if (stateful_kv_size == static_cast<size_t>(pos_data[0])) {
             stateful_kv_size += pos_shape[3];
         } else {
             auto states = infer_request->query_state();
@@ -139,7 +139,7 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, const std::strin
                 state.set_state(new_state_tensor);
             }
             stateful_kv_size = pos_data[0] + 1;
-        }
+        }
     }
 
     decoder_end_time = ggml_time_us();
@@ -467,10 +467,10 @@ enum ggml_status naive_compute(ggml_cgraph * cgraph,
         return GGML_STATUS_SUCCESS;
     }
 
-    auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph);
+    bool naive = true;
+    auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph, naive);
     auto decoder = std::make_shared<GgmlOvDecoder>(cgraph, model_weights);
     auto input_model = std::make_shared<ov::frontend::ggml::InputModel>(decoder);
-    auto naive = true;
     auto model = ov::frontend::ggml::FrontEnd::convert(input_model, naive);
     if (getenv("GGML_OPENVINO_DUMP_IR")) {
         ov::serialize(model, "IR_naive.xml");
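A note on the naive_compute hunk: moving naive above create_weight_nodes means the flag now influences weight-node creation (selecting the f16-bias subgraphs) and not just FrontEnd::convert. When a test-backend-ops case still misbehaves, the GGML_OPENVINO_DUMP_IR environment variable checked right after conversion serializes the converted model to IR_naive.xml, which is a convenient way to inspect exactly which dequantization subgraph a given weight received.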
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 6cdf6ae818..350bffc315 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -233,9 +233,7 @@ if (NOT LLAMA_SANITIZE_ADDRESS AND NOT GGML_SCHED_NO_REALLOC)
     llama_build_and_test(test-opt.cpp)
 endif()
 
 llama_build_and_test(test-gguf.cpp)
-if (NOT GGML_OPENVINO)
-    llama_build_and_test(test-backend-ops.cpp)
-endif()
+llama_build_and_test(test-backend-ops.cpp)
 llama_build_and_test(test-model-load-cancel.cpp LABEL "model")
 llama_build_and_test(test-autorelease.cpp LABEL "model")
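With the guard gone, test-backend-ops now builds and runs for GGML_OPENVINO configurations like any other backend, which is what the use_bias/naive plumbing in this patch exists to support. A focused run against the ops touched here might look like ./bin/test-backend-ops test -o GET_ROWS (the test mode and -o op filter are upstream test-backend-ops flags, not added by this patch; consult the harness's own usage text for the exact invocation).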