REFACTOR: support weights as constants

Yu, Zijun 2025-04-28 12:00:13 +08:00 committed by Mustafa Cavus
parent 0c7b026ecc
commit c04966cda6
6 changed files with 334 additions and 278 deletions

View File

@@ -53,8 +53,7 @@ static ggml_backend_buffer_type_t ggml_backend_openvino_get_default_buffer_type(
static enum ggml_status
ggml_backend_openvino_graph_compute(ggml_backend_t backend, struct ggml_cgraph *cgraph) {
int end_node = cgraph->n_nodes - 1;
openvino_frontend_compute(backend, cgraph, 0, end_node);
openvino_frontend_compute(backend, cgraph);
ov::Core core;
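
With the node-range parameters removed, the backend entry point hands the whole cgraph to the frontend in a single call. A minimal calling sketch (hedged; the surrounding backend boilerplate is elided, and the original does not use the return value):

// Sketch: compute now forwards the entire graph, with no [start, end] slicing.
static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
    return openvino_frontend_compute(backend, cgraph);
}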

View File

@@ -1,19 +1,14 @@
#pragma once
#include <unordered_map>
#include "openvino/core/node.hpp"
#include "openvino/frontend/decoder.hpp"
#include "openvino/op/parameter.hpp"
namespace ov {
namespace frontend {
namespace ggml {
// Define the tensor_info struct
struct tensor_info {
std::vector<int> shape;
std::vector<int> stride;
};
// TODO: Directly include from openvino
class GgmlDecoder : public DecoderBase {
public:
@@ -36,10 +31,6 @@ public:
virtual std::vector<std::string> get_input_names() const = 0;
virtual std::string& get_op_node_name(const std::string& name, const int index = -1) = 0;
// virtual const struct tensor_info get_node_op_info(const std::string& name) const = 0;
virtual PartialShape get_output_shape(const std::string& name) const = 0;
virtual std::vector<size_t> get_output_stride(const std::string& name) const = 0;
@@ -64,14 +55,11 @@ public:
virtual void visit_subgraph(std::function<void(std::shared_ptr<GgmlDecoder>)> node_visitor) const = 0;
// virtual const std::vector<size_t>& outputs() const = 0;
// virtual size_t output(size_t index) const = 0;
virtual bool check_if_continuous() const = 0;
virtual const std::vector<std::shared_ptr<ov::op::v0::Parameter>>& get_params() const = 0;
virtual const std::unordered_map<std::string, std::shared_ptr<ov::Node>>& get_model_inputs() const = 0;
virtual const std::unordered_map<std::string, std::shared_ptr<ov::Node>>& get_model_weights() const = 0;
virtual const std::vector<std::string>& get_model_output_names() const = 0;
};
} // namespace ggml
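
A hedged sketch of how a conversion pass might consume the three new accessors (the function below and its names are illustrative, not part of this commit; it assumes the GgmlDecoder header above is included):

#include <iostream>
#include <memory>

void report_model_io(const std::shared_ptr<ov::frontend::ggml::GgmlDecoder>& decoder) {
    // Graph inputs (activations, kv caches) come back as Parameter nodes.
    for (const auto& [name, node] : decoder->get_model_inputs()) {
        std::cout << "input:  " << name << " " << node->get_output_element_type(0) << '\n';
    }
    // Weights come back as Constants by default, or as Parameters when
    // GGML_OPENVINO_WEIGHT_AS_INPUT is set (see the decoder implementation).
    for (const auto& [name, node] : decoder->get_model_weights()) {
        std::cout << "weight: " << name << " " << node->get_output_element_type(0) << '\n';
    }
    for (const auto& name : decoder->get_model_output_names()) {
        std::cout << "output: " << name << '\n';
    }
}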

View File

@@ -1,11 +1,62 @@
#include "ggml-decoder.h"
#include <ggml.h>
#include <ggml-impl.h>
#include <ggml-cpu-impl.h>
#include <iomanip>
#include <fstream>
void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::map<std::string, ggml_tensor *>& inputs, std::map<std::string, ggml_tensor *>& outputs) {
#include <ggml-impl.h>
#include <ggml.h>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <fstream>
#include <iomanip>
#include <memory>
#include <openvino/core/node.hpp>
#include <openvino/core/type/float16.hpp>
#include <openvino/op/constant.hpp>
#include "ggml-backend-impl.h"
#include "ggml-backend.h"
GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgraph)
: m_cgraph(cgraph),
m_node(node),
m_op_name(m_node ? std::string(m_node->name) : "NONE_OP") {
if (m_node) {
set_input_output(m_node);
} else {
// std::map<void*, std::vector<std::string>> address_map;
// for (int node_n = start_index; node_n <= end_index; node_n++) {
// auto node = cgraph->nodes[node_n];
// if (node->data) {
// auto it = address_map.find(node->data);
// if (it == address_map.end()) {
// address_map[node->data] = std::vector<std::string>();
// }
// address_map[node->data].push_back(node->name);
// }
// }
// for (const auto& pair : address_map) {
// std::cout << "Address: " << pair.first << " -> ";
// for (const auto& name : pair.second) {
// std::cout << name << " ;";
// }
// std::cout << std::endl;
// }
for (int node_n = 0; node_n < m_cgraph->n_nodes; node_n++) {
auto* cur_node = m_cgraph->nodes[node_n];
m_nodes.push_back(cur_node);
// Init model input and output
set_input_output(cur_node);
}
if (getenv("GGML_OPENVINO_DUMP_CGRAPH")) {
dump_cgraph(m_cgraph);
}
}
}
// Called in GgmlOvDecoder constructor. Two cases: 1. constructing a decoder for the whole graph;
// 2. constructing a decoder for a node.
void GgmlOvDecoder::set_input_output(ggml_tensor* node) {
std::string node_name;
if (node->op == GGML_OP_CPY) {
// CPY updates the input tensor in place. For later ov op that uses the
@@ -17,51 +68,130 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::map<std::string, gg
node_name = std::string(node->name);
}
std::string src0_name = std::string(node->src[0]->name);
inputs[src0_name] = node->src[0];
outputs[node_name] = node;
m_input_names.push_back(src0_name);
m_op_node_name.emplace_back(src0_name, ggml_op_name(node->op));
if (node->op == GGML_OP_CPY && node->view_src) {
m_output_names.push_back(node->view_src->name);
} else {
m_output_names.push_back(node_name);
m_output_names.push_back(node_name);
m_outputs[node_name] = node;
for (int i = 0; i < GGML_MAX_SRC; i++) {
auto* src = node->src[i];
if (src == nullptr) {
continue;
}
std::string src_name = std::string(src->name);
m_input_names.push_back(src_name);
m_inputs[src_name] = src;
m_op_node_name.emplace_back(src_name, ggml_op_name(node->op));
// If called for the whole graph, create constant nodes for weights and param nodes for inputs
if (!m_node && !src->view_src) {
ggml_backend_buffer* buffer = src->buffer;
if (buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
bool weight_as_input = getenv("GGML_OPENVINO_WEIGHT_AS_INPUT");
auto& weights_map = weight_as_input ? m_model_inputs : m_model_weights;
if (weights_map.find(src_name) != weights_map.end()) {
continue;
}
std::shared_ptr<ov::Node> weight_node =
weight_as_input
? std::make_shared<ov::op::v0::Parameter>(get_ov_type(src), ov::Shape{get_shape(src)})
: create_weight_node(src);
weight_node->set_friendly_name(src_name);
weights_map[src_name] = weight_node;
} else if (buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY || src->flags & GGML_TENSOR_FLAG_INPUT) {
// Buffers with GGML_BACKEND_BUFFER_USAGE_ANY hold the kv caches
if (buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY) {
assert(src_name.find("cache_k") == 0 || src_name.find("cache_v") == 0);
}
if (m_model_inputs.find(src_name) != m_model_inputs.end()) {
continue;
}
auto param_node = std::make_shared<ov::op::v0::Parameter>(get_ov_type(src), ov::Shape{get_shape(src)});
param_node->set_friendly_name(src_name);
m_model_inputs[src_name] = param_node;
}
}
}
if (node->src[1]) {
std::string src1_name = std::string(node->src[1]->name);
inputs[src1_name] = node->src[1];
m_input_names.push_back(src1_name);
m_op_node_name.emplace_back(src1_name, ggml_op_name(node->op));
}
if (node->src[2]) {
std::string src2_name = std::string(node->src[2]->name);
inputs[src2_name] = node->src[2];
m_input_names.push_back(src2_name);
m_op_node_name.emplace_back(src2_name, ggml_op_name(node->op));
if (!m_node) {
// Workaround: the final tensor "result_output" does not have the GGML_TENSOR_FLAG_OUTPUT flag set in the cgraph
if (node->buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY || node->flags & GGML_TENSOR_FLAG_OUTPUT ||
std::string(node->name).find("result") == 0) {
auto name = node->view_src ? std::string(node->view_src->name) : std::string(node->name);
if (node->buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY) {
assert(name.find("cache_k") == 0 || name.find("cache_v") == 0);
}
auto it = std::find(m_model_output_names.begin(), m_model_output_names.end(), name);
if (it == m_model_output_names.end()) {
m_model_output_names.push_back(name);
}
}
}
switch (node->op) {
case GGML_OP_CONT: {
// Currently only two cases: the input comes either from a VIEW (a subtensor) or from a PERMUTE
m_continuous = ggml_nelements(node->src[0]) == ggml_nelements(node->src[0]->view_src);
break;
}
case GGML_OP_CPY: {
m_continuous = ggml_is_contiguous(node);
break;
}
case GGML_OP_MUL_MAT: {
m_continuous = node->src[0]->view_src == nullptr;
break;
}
default:
break;
if (m_node) {
switch (node->op) {
case GGML_OP_CONT: {
// Currently only two cases: the input comes either from a VIEW (a subtensor) or from a PERMUTE
m_continuous = ggml_nelements(node->src[0]) == ggml_nelements(node->src[0]->view_src);
break;
}
case GGML_OP_CPY: {
m_continuous = ggml_is_contiguous(node);
break;
}
case GGML_OP_MUL_MAT: {
m_continuous = node->src[0]->view_src == nullptr;
break;
}
default:
break;
}
}
}
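
For illustration, what the new weights-as-constant path changes when the whole-graph decoder is built, in a hypothetical harness (only the env var name comes from this commit; POSIX setenv and the calls below are illustrative, assuming ggml-decoder.h is included):

#include <cstdlib>

void build_decoders(struct ggml_cgraph* cgraph) {
    // Default: weights become ov::op::v0::Constant nodes folded into the model.
    GgmlOvDecoder constants_mode(nullptr, cgraph);
    // Opt-in: the same weights become Parameters that are fed at inference time.
    setenv("GGML_OPENVINO_WEIGHT_AS_INPUT", "1", /*overwrite=*/1);
    GgmlOvDecoder weights_as_inputs_mode(nullptr, cgraph);
}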
void ggml_graph_op_print(const struct ggml_cgraph * cgraph) {
std::ofstream file("01_nodes.txt");
std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor* tensor) {
std::shared_ptr<ov::Node> weight_node;
auto node_type = get_ov_type(tensor);
auto node_shape = get_shape(tensor);
auto ne_total = ggml_nelements(tensor);
switch (tensor->type) {
case GGML_TYPE_I32: {
const auto* ptr = reinterpret_cast<const int32_t*>(tensor->data);
std::vector<int32_t> data(ptr, ptr + ne_total);
weight_node = std::make_shared<ov::op::v0::Constant>(node_type, node_shape, data);
break;
}
case GGML_TYPE_I64: {
const auto* ptr = reinterpret_cast<const int64_t*>(tensor->data);
std::vector<int64_t> data(ptr, ptr + ne_total);
weight_node = std::make_shared<ov::op::v0::Constant>(node_type, node_shape, data);
break;
}
case GGML_TYPE_F32: {
const auto* ptr = reinterpret_cast<const float*>(tensor->data);
std::vector<float> data(ptr, ptr + ne_total);
weight_node = std::make_shared<ov::op::v0::Constant>(node_type, node_shape, data);
break;
}
case GGML_TYPE_F16: {
const auto* ptr = reinterpret_cast<const uint16_t*>(tensor->data);
std::vector<ov::float16> data_f16;
data_f16.reserve(ne_total);
for (int64_t i = 0; i < ne_total; ++i) {
data_f16.push_back(ov::float16::from_bits(ptr[i]));
}
weight_node = std::make_shared<ov::op::v0::Constant>(node_type, node_shape, data_f16);
break;
}
default:
throw std::invalid_argument("Unsupported tensor type");
}
return weight_node;
}
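
A note on the F16 branch above: ggml stores F16 tensors as raw IEEE-754 binary16 bit patterns, so the conversion wraps the bits rather than converting through float. A self-contained sketch of just that step (helper name is illustrative):

#include <cstddef>
#include <cstdint>
#include <vector>
#include <openvino/core/type/float16.hpp>

// from_bits() reinterprets the 16-bit pattern directly; no numeric conversion.
std::vector<ov::float16> wrap_f16_bits(const uint16_t* bits, size_t n) {
    std::vector<ov::float16> out;
    out.reserve(n);
    for (size_t i = 0; i < n; ++i) {
        out.push_back(ov::float16::from_bits(bits[i]));
    }
    return out;
}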
void GgmlOvDecoder::dump_cgraph(const struct ggml_cgraph* cgraph) {
std::ofstream file("cgraph.txt");
if (!file.is_open()) {
std::cerr << "Failed to open cgraph.txt" << std::endl;
return;
@@ -160,88 +290,53 @@ void ggml_graph_op_print(const struct ggml_cgraph * cgraph) {
file.close();
}
GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor * node, struct ggml_cgraph * cgraph, const int32_t start_index, const int32_t end_index)
:m_cgraph(cgraph),
m_node(node),
m_op_name(m_node ? std::string(m_node->name) : "NONE_OP") {
m_inputs.clear();
m_outputs.clear();
m_input_names.clear();
m_output_names.clear();
m_params.clear();
m_op_node_name.clear();
m_decoders.clear();
if (m_node) {
set_input_output(m_node, m_inputs, m_outputs);
} else {
// for (int node_n = 0; node_n < m_cgraph->n_nodes; node_n++) {
for (int node_n = start_index; node_n <= end_index; node_n++) {
auto cur_node = m_cgraph->nodes[node_n];
m_nodes.push_back(cur_node);
// Init model input and output
set_input_output(cur_node, m_inputs, m_outputs);
}
if (getenv("GGML_OPENVINO_DEBUG")) {
ggml_graph_op_print(m_cgraph);
}
std::vector<size_t> GgmlOvDecoder::get_shape(const ggml_tensor* tensor) {
std::vector<size_t> shape;
for (int i = GGML_MAX_DIMS - 2; i >= 0; --i) {
shape.push_back(static_cast<size_t>(tensor->ne[i]));
}
return shape;
}
std::vector<size_t> GgmlOvDecoder::get_stride(const ggml_tensor* tensor) {
std::vector<size_t> stride;
for (int i = GGML_MAX_DIMS - 2; i >= 0 ; --i) {
stride.push_back(static_cast<size_t>(tensor->nb[i]));
}
return stride;
}
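
These helpers walk ne[2]..ne[0] (ggml's fastest-varying dimension is ne[0]), so a ggml tensor maps to an OpenVINO shape in reverse order, with ne[3] assumed to be 1. A standalone sketch of the mapping (function name is illustrative):

#include <cstddef>
#include <cstdint>
#include <vector>

// e.g. ne = {8, 4, 2, 1} (ggml order) -> {2, 4, 8} (ov::Shape order)
std::vector<size_t> shape_from_ne(const int64_t (&ne)[4]) {
    std::vector<size_t> shape;
    for (int i = 2; i >= 0; --i) {
        shape.push_back(static_cast<size_t>(ne[i]));
    }
    return shape;
}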
ov::element::Type GgmlOvDecoder::get_ov_type(const ggml_tensor* tensor) {
ov::element::Type type = ov::element::dynamic;
switch (tensor->type) {
case GGML_TYPE_F32:
type = ov::element::f32;
break;
case GGML_TYPE_F16:
type = ov::element::f16;
break;
case GGML_TYPE_I64:
type = ov::element::i64;
break;
case GGML_TYPE_I32:
type = ov::element::i32;
break;
default:
break;
}
return type;
}
ov::PartialShape GgmlOvDecoder::get_input_shape(const std::string& name) const {
ov::PartialShape input_shape;
// Use input_node->ne
ggml_tensor * node = m_inputs.at(name);
std::vector<size_t> shape;
for (int i = GGML_MAX_DIMS - 2; i >= 0 ; --i) {
if (node->ne[i] == 0) {
return input_shape;
}
shape.push_back(static_cast<size_t>(node->ne[i]));
}
input_shape = ov::PartialShape(shape);
return input_shape;
return ov::PartialShape(get_shape(m_inputs.at(name)));
}
std::vector<size_t> GgmlOvDecoder::get_input_stride(const std::string& name) const {
std::vector<size_t> stride;
ggml_tensor * node = m_inputs.at(name);
for (int i = GGML_MAX_DIMS - 2; i >= 0 ; --i) {
stride.push_back(static_cast<size_t>(node->nb[i]));
}
return stride;
}
std::vector<size_t> GgmlOvDecoder::get_output_stride(const std::string& name) const {
std::vector<size_t> stride;
ggml_tensor * node = m_outputs.at(name);
for (int i = GGML_MAX_DIMS - 2; i >= 0 ; --i) {
stride.push_back(static_cast<size_t>(node->nb[i]));
}
return stride;
return get_stride(m_inputs.at(name));
}
ov::element::Type GgmlOvDecoder::get_input_type(const std::string& name) const {
ov::element::Type type = ov::element::dynamic;
switch (m_inputs.at(name)->type) {
case GGML_TYPE_F32:
type = ov::element::f32;
break;
case GGML_TYPE_F16:
type = ov::element::f16;
break;
case GGML_TYPE_I64:
type = ov::element::i64;
break;
case GGML_TYPE_I32:
type = ov::element::i32;
break;
default:
break;
}
return type;
return get_ov_type(m_inputs.at(name));
}
size_t GgmlOvDecoder::get_input_size() const {
@@ -257,69 +352,16 @@ std::vector<std::string> GgmlOvDecoder::get_input_names() const {
return m_input_names;
}
std::string& GgmlOvDecoder::get_op_node_name(const std::string& key_name, const int index) {
if (index == -1) {
for (size_t i = 0; i < m_op_node_name.size(); ++i) {
if (m_op_node_name[i].first == key_name) {
return m_op_node_name[i].second;
}
}
} else {
return m_op_node_name[index].second;
}
static std::string empty_string = "";
return empty_string; // empty string
}
const std::vector<std::shared_ptr<ov::op::v0::Parameter>>& GgmlOvDecoder::get_params() const {
return m_params;
std::vector<size_t> GgmlOvDecoder::get_output_stride(const std::string& name) const {
return get_stride(m_outputs.at(name));
}
ov::PartialShape GgmlOvDecoder::get_output_shape(const std::string& name) const {
ov::PartialShape output_shape;
ggml_tensor * node = m_outputs.at(name);
std::vector<size_t> shape;
for (int i = GGML_MAX_DIMS - 2; i >= 0 ; --i) {
if (node->ne[i] == 0 ) {
// empty if any dimension has no elements
return output_shape;
}
shape.push_back(static_cast<size_t>(node->ne[i]));
}
output_shape = ov::PartialShape(shape);
return output_shape;
return ov::PartialShape(get_shape(m_outputs.at(name)));
}
ov::element::Type GgmlOvDecoder::get_output_type(const std::string& name) const {
// TODO: Change to Output
ov::element::Type type = ov::element::dynamic;
switch (m_outputs.at(name)->type) {
case GGML_TYPE_F32:
type = ov::element::f32;
break;
case GGML_TYPE_F16:
type = ov::element::f16;
break;
case GGML_TYPE_I64:
type = ov::element::i64;
break;
case GGML_TYPE_I32:
type = ov::element::i32;
break;
default:
break;
}
return type;
}
int32_t* GgmlOvDecoder::get_input_op_params(const std::string& name) const{
return m_inputs.at(name)->op_params;
}
int32_t* GgmlOvDecoder::get_output_op_params(const std::string& name) const{
return m_outputs.at(name)->op_params;
return get_ov_type(m_outputs.at(name));
}
std::string& GgmlOvDecoder::get_output_name(size_t index) const {
@@ -335,10 +377,17 @@ const std::string& GgmlOvDecoder::get_op_name() const {
return m_op_name;
}
int32_t* GgmlOvDecoder::get_input_op_params(const std::string& name) const {
return m_inputs.at(name)->op_params;
}
int32_t* GgmlOvDecoder::get_output_op_params(const std::string& name) const {
return m_outputs.at(name)->op_params;
}
void GgmlOvDecoder::visit_subgraph(std::function<void(std::shared_ptr<GgmlDecoder>)> node_visitor) const {
for (const auto& node : m_nodes) {
auto decoder = std::make_shared<GgmlOvDecoder>(node, m_cgraph);
// m_decoders.push_back(decoder);
node_visitor(decoder);
}
}

View File

@@ -1,14 +1,17 @@
#pragma once
#include <memory>
#include <unordered_map>
#include <vector>
#include "decoder.h"
#include "ggml.h"
#include "openvino/op/parameter.hpp"
class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {
public:
using ov::frontend::ggml::GgmlDecoder::GgmlDecoder;
GgmlOvDecoder(struct ggml_tensor * node, struct ggml_cgraph * cgraph, const int32_t start_index=0, const int32_t end_index=0);
GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgraph);
virtual ov::Any get_attribute(const std::string& name) const override {
return nullptr;
@@ -73,12 +76,23 @@ public:
return m_continuous;
}
std::string& get_op_node_name(const std::string& key_name, const int index) override;
virtual const std::vector<std::shared_ptr<ov::op::v0::Parameter>>& get_params() const override;
virtual const std::unordered_map<std::string, std::shared_ptr<ov::Node>>& get_model_inputs() const override {
return m_model_inputs;
}
virtual const std::unordered_map<std::string, std::shared_ptr<ov::Node>>& get_model_weights() const override {
return m_model_weights;
}
virtual const std::vector<std::string>& get_model_output_names() const override {
return m_model_output_names;
}
private:
void set_input_output(ggml_tensor* node, std::map<std::string, ggml_tensor *>& inputs, std::map<std::string, ggml_tensor *>& outputs);
void set_input_output(ggml_tensor* node);
static void dump_cgraph(const struct ggml_cgraph* cgraph);
static std::vector<size_t> get_shape(const ggml_tensor* tensor);
static std::vector<size_t> get_stride(const ggml_tensor* tensor);
static ov::element::Type get_ov_type(const ggml_tensor* tensor);
static std::shared_ptr<ov::Node> create_weight_node(ggml_tensor* tensor);
struct ggml_cgraph * m_cgraph;
std::map<std::string, ggml_tensor *> m_inputs;
@@ -86,12 +100,12 @@ private:
std::map<std::string, ggml_tensor *> m_outputs;
std::vector<std::string> m_output_names;
ggml_tensor* m_node;
std::vector<ggml_tensor *> m_nodes;
std::vector<std::shared_ptr<GgmlOvDecoder>> m_decoders;
std::vector<ggml_tensor*> m_nodes;
std::string m_op_name;
mutable std::string m_name;
bool m_continuous;
std::vector<std::shared_ptr<ov::op::v0::Parameter>> m_params;
std::vector<std::pair<std::string, std::string>> m_op_node_name;
std::unordered_map<std::string, std::shared_ptr<ov::Node>> m_model_inputs;
std::unordered_map<std::string, std::shared_ptr<ov::Node>> m_model_weights;
std::vector<std::string> m_model_output_names;
};

View File

@@ -1,49 +1,22 @@
#include "utils.h"
#include "ggml-backend-impl.h"
#include "ggml-impl.h"
#include "ggml.h"
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <fstream>
#include <openvino/core/graph_util.hpp>
#include <openvino/core/type/float16.hpp>
#include <openvino/frontend/manager.hpp>
#include <openvino/openvino.hpp>
using ov::frontend::ggml::GgmlDecoder;
#include "ggml-impl.h"
#include "ggml.h"
std::shared_ptr<GgmlOvDecoder> get_ggml_decoder(struct ggml_cgraph * cgraph, const int32_t start_index, const int32_t end_index) {
return std::make_shared<GgmlOvDecoder>(nullptr, cgraph, start_index, end_index);
}
std::vector<std::pair<std::string, ov::Tensor>> get_ggml_graph_input_tensors(std::shared_ptr<GgmlOvDecoder> ggml_decoder) {
std::vector<std::pair<std::string, ov::Tensor>> input_tensors;
auto input_names = ggml_decoder->get_input_names();
size_t op_iter = 0;
for (size_t inp = 0; inp < input_names.size(); ++inp) {
auto name = input_names[inp];
std::string op_node_name = ggml_decoder->get_op_node_name(name, op_iter++);
// auto node_op_name = ggml_decoder->get_node_op_name(name);
auto input_data = ggml_decoder->get_input_ggml_tensor(name)->data;
#ifdef GGML_OPENVINO_DEBUG
printf("Subgraph input %d: %g\n", inp, *(double*)(input_data));
#endif
ov::Tensor input_tensor;
ov::Shape input_shape = ggml_decoder->get_input_shape(name).to_shape();
std::vector<size_t> input_stride = ggml_decoder->get_input_stride(name);
input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), input_shape, input_data);
// input_tensors[name] = input_tensor;
input_tensors.emplace_back(name, input_tensor);
}
// std::cout << "input_names.size(): " << input_names.size() << std::endl;
return input_tensors;
std::shared_ptr<GgmlOvDecoder> get_ggml_decoder(struct ggml_cgraph* cgraph) {
return std::make_shared<GgmlOvDecoder>(nullptr, cgraph);
}
ov::Tensor get_ggml_graph_input_tensor(std::shared_ptr<GgmlOvDecoder> ggml_decoder, std::string& name) {
auto input_data = ggml_decoder->get_input_ggml_tensor(name)->data;
#ifdef GGML_OPENVINO_DEBUG
printf("Subgraph input %s: %g\n", name.c_str(), *(double*)(input_data));
#endif
auto* input_data = ggml_decoder->get_input_ggml_tensor(name)->data;
ov::Tensor input_tensor;
ov::Shape input_shape = ggml_decoder->get_input_shape(name).to_shape();
std::vector<size_t> input_stride = ggml_decoder->get_input_stride(name);
@@ -53,19 +26,16 @@ ov::Tensor get_ggml_graph_input_tensor(std::shared_ptr<GgmlOvDecoder> ggml_decod
std::map<std::string, void*> get_ggml_graph_output_dst(std::shared_ptr<GgmlOvDecoder> ggml_decoder) {
std::map<std::string, void*> output_tensors;
auto output_names = ggml_decoder->get_output_names();
auto output_names = ggml_decoder->get_model_output_names();
for (size_t inp = 0; inp < output_names.size(); ++inp) {
auto name = output_names[inp];
auto output_data = ggml_decoder->get_output_ggml_tensor(name)->data;
#ifdef GGML_OPENVINO_DEBUG
printf("Output %d: %g\n", inp, *(double*)(output_data));
#endif
const auto* tensor = ggml_decoder->get_output_ggml_tensor(name);
auto* output_data = tensor->view_src ? tensor->view_src->data : tensor->data;
output_tensors[name] = output_data;
}
return output_tensors;
}
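
The view_src handling above is the substantive change here; restated as a sketch (hypothetical helper, not in this commit):

// A view tensor aliases its parent's storage, so results must be written
// through view_src->data when the output is a view.
static void* output_dst(const struct ggml_tensor* t) {
    return t->view_src ? t->view_src->data : t->data;
}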
static ov::frontend::FrontEnd::Ptr get_ggml_frontend() {
ov::frontend::FrontEnd::Ptr front_end = nullptr;
auto fem = ov::frontend::FrontEndManager();
@@ -78,10 +48,9 @@ static ov::frontend::FrontEnd::Ptr get_ggml_frontend() {
return front_end;
}
enum ggml_status openvino_frontend_compute(ggml_backend_t backend,
struct ggml_cgraph *cgraph,
const int32_t start_index,
const int32_t end_index) {
enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_cgraph* cgraph) {
auto start_time = ggml_time_us();
static ov::Core core;
// auto devices = core.get_available_devices();
@@ -89,65 +58,102 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend,
if (!front_end) {
GGML_LOG_ERROR("GGML FrontEnd is not initialized \n");
return GGML_STATUS_FAILED;
} else {
#ifdef GGML_OPENVINO_DEBUG
GGML_LOG_INFO("GGML FrontEnd is initialized \n");
#endif
}
auto ggml_decoder = get_ggml_decoder(cgraph, start_index, end_index);
auto ggml_decoder = get_ggml_decoder(cgraph);
std::shared_ptr<ov::frontend::DecoderBase> graph_decoder = ggml_decoder;
// Load GraphIterator -> InputModel
ov::frontend::InputModel::Ptr input_model = front_end->load(graph_decoder);
if (!input_model) {
GGML_LOG_ERROR("Input Model is not loaded \n");
return GGML_STATUS_FAILED;
} else {
#ifdef GGML_OPENVINO_DEBUG
GGML_LOG_INFO("Input Model loaded \n");
#endif
}
// Convert InputModel -> ov::Model
std::shared_ptr<ov::Model> model = front_end->convert(input_model);
auto conversion_end_time = ggml_time_us();
if (getenv("OPENVINO_DUMP_GRAPH")) {
char timestamped_filename[64];
auto timestamp = (long long)ggml_time_us();
snprintf(timestamped_filename, sizeof(timestamped_filename),
"model_%lld.xml", timestamp);
ov::serialize(model, timestamped_filename);
if (getenv("GGML_OPENVINO_DUMP_IR")) {
char timestamped_filename[64];
auto timestamp = (long long)ggml_time_us();
snprintf(timestamped_filename, sizeof(timestamped_filename), "model_%lld.xml", timestamp);
ov::serialize(model, timestamped_filename);
}
if (!model) {
GGML_LOG_ERROR("Model is not converted \n");
} else {
#ifdef GGML_OPENVINO_DEBUG
GGML_LOG_INFO("Model converted \n");
#endif
}
ov::CompiledModel compiled_model = core.compile_model(model);
ov::CompiledModel compiled_model =
core.compile_model(model, "CPU", ov::device::properties("CPU", ov::cache_dir("/tmp/ov_cache")));
auto compile_end_time = ggml_time_us();
ov::InferRequest infer_request = compiled_model.create_infer_request();
auto infer_request_start_time = ggml_time_us();
auto input_names = ggml_decoder->get_input_names();
auto input_tensors = get_ggml_graph_input_tensors(ggml_decoder);
auto ov_params = model->get_parameters();
for (size_t i = 0; i < ov_params.size(); i++) {
auto param_name = ov_params[i]->get_friendly_name();
infer_request.set_input_tensor(i, get_ggml_graph_input_tensor(ggml_decoder, param_name));
auto input_tensor = get_ggml_graph_input_tensor(ggml_decoder, param_name);
if (getenv("GGML_OPENVINO_DEBUG_INPUT")) {
std::cout << "Input name: " << param_name << ", Input shape: " << input_tensor.get_shape()
<< ", Address: " << input_tensor.data() << std::endl;
switch (input_tensor.get_element_type()) {
case ov::element::f32:
std::cout << *(float*)(input_tensor.data()) << std::endl;
break;
case ov::element::f16:
std::cout << ov::float16::from_bits(*(uint16_t*)(input_tensor.data())) << std::endl;
break;
case ov::element::i32:
std::cout << *(int32_t*)(input_tensor.data()) << std::endl;
break;
case ov::element::i64:
std::cout << *(int64_t*)(input_tensor.data()) << std::endl;
break;
default:
break;
}
}
infer_request.set_input_tensor(i, input_tensor);
}
auto input_end_time = ggml_time_us();
infer_request.infer();
auto infer_end_time = ggml_time_us();
auto output_names = ggml_decoder->get_output_names();
auto output_names = ggml_decoder->get_model_output_names();
auto output_tensors = get_ggml_graph_output_dst(ggml_decoder);
for (size_t i = 0; i < output_names.size(); i++) {
auto output_tensor = infer_request.get_output_tensor(i);
std::memcpy(output_tensors[output_names[i]], output_tensor.data(), output_tensor.get_byte_size());
#ifdef GGML_OPENVINO_DEBUG
printf("Output %s after: %g\n", output_names[i].c_str(), *(double*)(output_tensor.data()));
#endif
if (getenv("GGML_OPENVINO_DEBUG_OUTPUT")) {
std::cout << "Output name: " << output_names[i] << ", Output shape: " << output_tensor.get_shape()
<< ", Address: " << output_tensors[output_names[i]] << std::endl;
switch (output_tensor.get_element_type()) {
case ov::element::f32:
std::cout << *(float*)(output_tensors[output_names[i]]) << std::endl;
break;
case ov::element::f16:
std::cout << ov::float16::from_bits(*(uint16_t*)(output_tensors[output_names[i]])) << std::endl;
break;
default:
break;
}
}
}
auto end_time = ggml_time_us();
if (getenv("GGML_OPENVINO_PROFILING")) {
GGML_LOG_INFO("GGML OpenVINO Backend: \n");
GGML_LOG_INFO(" - Graph conversion Time: %ld ms \n", (conversion_end_time - start_time) / 1000);
GGML_LOG_INFO(" - Graph compile Time: %ld ms \n", (compile_end_time - conversion_end_time) / 1000);
GGML_LOG_INFO(" - Graph InferRequest created Time: %ld ms \n",
(infer_request_start_time - compile_end_time) / 1000);
GGML_LOG_INFO(" - Graph Input Time: %ld ms \n", (input_end_time - infer_request_start_time) / 1000);
GGML_LOG_INFO(" - Graph Inference Time: %ld ms \n", (infer_end_time - input_end_time) / 1000);
GGML_LOG_INFO(" - Graph Output Time: %ld ms \n", (end_time - infer_end_time) / 1000);
}
return GGML_STATUS_SUCCESS;

View File

@@ -1,4 +1,4 @@
#include "ggml-decoder.h"
#include "ggml-backend-impl.h"
enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph, const int32_t start_index=0, const int32_t end_index=0);
enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_cgraph* cgraph);