PERF: compile once (dynamic graph + cache)
This commit is contained in:
parent
7d5e234254
commit
a8e5efa44e
|
|
@ -58,6 +58,7 @@ public:
|
|||
virtual bool check_if_continuous() const = 0;
|
||||
|
||||
virtual const std::map<std::string, std::shared_ptr<ov::Node>>& get_model_inputs() const = 0;
|
||||
virtual const std::map<std::string, std::shared_ptr<ov::Node>>& get_model_extra_inputs() const = 0;
|
||||
virtual const std::map<std::string, std::shared_ptr<ov::Node>>& get_model_weights() const = 0;
|
||||
virtual const std::vector<std::string>& get_model_output_names() const = 0;
|
||||
};
|
||||
|
|
|
|||
|
|
@ -10,9 +10,11 @@
|
|||
#include <iomanip>
|
||||
#include <map>
|
||||
#include <memory>
|
||||
#include <openvino/core/dimension.hpp>
|
||||
#include <openvino/core/node.hpp>
|
||||
#include <openvino/core/type/float16.hpp>
|
||||
#include <openvino/op/constant.hpp>
|
||||
#include <openvino/runtime/tensor.hpp>
|
||||
#include <ostream>
|
||||
#include <set>
|
||||
#include <string>
|
||||
|
|
@ -35,6 +37,7 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgrap
|
|||
printed = true;
|
||||
}
|
||||
|
||||
set_max_token_len();
|
||||
for (int node_n = 0; node_n < m_cgraph->n_nodes; node_n++) {
|
||||
auto* cur_node = m_cgraph->nodes[node_n];
|
||||
m_nodes.push_back(cur_node);
|
||||
|
|
@ -42,6 +45,8 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgrap
|
|||
}
|
||||
m_model_weights = model_weights;
|
||||
|
||||
add_extra_inputs();
|
||||
|
||||
if (getenv("GGML_OPENVINO_DUMP_CGRAPH")) {
|
||||
dump_cgraph(m_cgraph);
|
||||
}
|
||||
|
|
@ -102,7 +107,16 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node,
|
|||
if (m_model_inputs.find(src_name) != m_model_inputs.end()) {
|
||||
continue;
|
||||
}
|
||||
auto param_node = std::make_shared<ov::op::v0::Parameter>(get_ov_type(src), ov::Shape{get_shape(src)});
|
||||
ov::PartialShape input_shape;
|
||||
if (std::string(src->name) == "inp_tokens" || std::string(src->name) == "inp_pos") {
|
||||
input_shape = ov::PartialShape{1, 1, ov::Dimension(1, m_max_token_len)};
|
||||
} else if (std::string(src->name).find("KQ_mask") == 0) {
|
||||
input_shape =
|
||||
ov::PartialShape{1, ov::Dimension(1, m_max_token_len), ov::Dimension(1, m_max_token_len)};
|
||||
} else {
|
||||
input_shape = ov::Shape{get_shape(src)};
|
||||
}
|
||||
auto param_node = std::make_shared<ov::op::v0::Parameter>(get_ov_type(src), input_shape);
|
||||
param_node->set_friendly_name(src_name);
|
||||
m_model_inputs[src_name] = param_node;
|
||||
}
|
||||
|
|
@ -146,6 +160,57 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node,
|
|||
}
|
||||
}
|
||||
|
||||
void GgmlOvDecoder::set_max_token_len() {
|
||||
for (int i = 0; i < m_cgraph->n_nodes; i++) {
|
||||
auto* node = m_cgraph->nodes[i];
|
||||
if (std::string(node->name) == "v-0") {
|
||||
auto* cache_v = node->src[0];
|
||||
m_max_token_len = cache_v->ne[0] / node->ne[1] / node->ne[2];
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void GgmlOvDecoder::add_extra_inputs() {
|
||||
int64_t past_token_len;
|
||||
int64_t attention_size;
|
||||
|
||||
for (const auto& node : m_nodes) {
|
||||
if (node->op == GGML_OP_CPY && ggml_is_contiguous(node)) {
|
||||
assert(std::string(node->view_src->name).find("cache_k") == 0);
|
||||
int64_t head_size = node->src[0]->ne[0];
|
||||
int64_t num_heads = node->src[0]->ne[1];
|
||||
past_token_len = (int64_t)(node->src[1]->op_params[0] / node->src[1]->nb[0] / head_size / num_heads);
|
||||
|
||||
std::string name = "past_token_len";
|
||||
auto param_node = std::make_shared<ov::op::v0::Parameter>(ov::element::i64, ov::Shape{});
|
||||
param_node->set_friendly_name(name);
|
||||
m_model_extra_inputs[name] = param_node;
|
||||
|
||||
auto tensor = std::make_shared<ov::Tensor>(ov::element::i64, ov::Shape{});
|
||||
*tensor->data<int64_t>() = past_token_len;
|
||||
m_model_extra_input_values[name] = tensor;
|
||||
break;
|
||||
}
|
||||
}
|
||||
for (const auto& node : m_nodes) {
|
||||
if (node->src[1] && std::string(node->src[1]->name).find("inp_tokens") == 0) {
|
||||
int64_t total_token_len = node->src[1]->ne[0] + past_token_len;
|
||||
attention_size = (total_token_len + 31) / 32 * 32;
|
||||
|
||||
std::string name = "attention_size";
|
||||
auto param_node = std::make_shared<ov::op::v0::Parameter>(ov::element::i64, ov::Shape{1});
|
||||
param_node->set_friendly_name(name);
|
||||
m_model_extra_inputs[name] = param_node;
|
||||
|
||||
auto tensor = std::make_shared<ov::Tensor>(ov::element::i64, ov::Shape{1});
|
||||
*tensor->data<int64_t>() = attention_size;
|
||||
m_model_extra_input_values[name] = tensor;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor* tensor) {
|
||||
std::shared_ptr<ov::Node> weight_node;
|
||||
auto node_type = get_ov_type(tensor);
|
||||
|
|
|
|||
|
|
@ -1,5 +1,6 @@
|
|||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <map>
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
|
|
@ -79,6 +80,12 @@ public:
|
|||
virtual const std::map<std::string, std::shared_ptr<ov::Node>>& get_model_inputs() const override {
|
||||
return m_model_inputs;
|
||||
}
|
||||
virtual const std::map<std::string, std::shared_ptr<ov::Node>>& get_model_extra_inputs() const override {
|
||||
return m_model_extra_inputs;
|
||||
}
|
||||
virtual const std::map<std::string, std::shared_ptr<ov::Tensor>>& get_model_extra_input_values() const {
|
||||
return m_model_extra_input_values;
|
||||
}
|
||||
virtual const std::map<std::string, std::shared_ptr<ov::Node>>& get_model_weights() const override {
|
||||
return m_model_weights;
|
||||
}
|
||||
|
|
@ -88,12 +95,16 @@ public:
|
|||
|
||||
private:
|
||||
void set_input_output(ggml_tensor* node, std::map<std::string, std::shared_ptr<ov::Node>>& model_weights);
|
||||
void add_extra_inputs();
|
||||
static void dump_cgraph(const struct ggml_cgraph* cgraph);
|
||||
static std::vector<size_t> get_shape(const ggml_tensor* tensor);
|
||||
static std::vector<size_t> get_stride(const ggml_tensor* tensor);
|
||||
static ov::element::Type get_ov_type(const ggml_tensor* tensor);
|
||||
static std::shared_ptr<ov::Node> create_weight_node(ggml_tensor* tensor);
|
||||
|
||||
void set_max_token_len();
|
||||
int64_t m_max_token_len;
|
||||
|
||||
struct ggml_cgraph * m_cgraph;
|
||||
std::map<std::string, ggml_tensor *> m_inputs;
|
||||
std::vector<std::string> m_input_names;
|
||||
|
|
@ -106,6 +117,8 @@ private:
|
|||
bool m_continuous;
|
||||
std::vector<std::pair<std::string, std::string>> m_op_node_name;
|
||||
std::map<std::string, std::shared_ptr<ov::Node>> m_model_inputs;
|
||||
std::map<std::string, std::shared_ptr<ov::Node>> m_model_extra_inputs;
|
||||
std::map<std::string, std::shared_ptr<ov::Tensor>> m_model_extra_input_values;
|
||||
std::map<std::string, std::shared_ptr<ov::Node>> m_model_weights;
|
||||
std::vector<std::string> m_model_output_names;
|
||||
};
|
||||
|
|
|
|||
|
|
@ -3,10 +3,14 @@
|
|||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <cstdlib>
|
||||
#include <memory>
|
||||
#include <openvino/core/graph_util.hpp>
|
||||
#include <openvino/core/type/float16.hpp>
|
||||
#include <openvino/frontend/manager.hpp>
|
||||
#include <openvino/openvino.hpp>
|
||||
#include <openvino/runtime/compiled_model.hpp>
|
||||
#include <openvino/runtime/tensor.hpp>
|
||||
#include <unordered_map>
|
||||
|
||||
#include "ggml-impl.h"
|
||||
#include "ggml.h"
|
||||
|
|
@ -63,61 +67,65 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
|
|||
return GGML_STATUS_FAILED;
|
||||
}
|
||||
|
||||
using CachedItem = std::pair<std::shared_ptr<ov::Model>, ov::CompiledModel>;
|
||||
static std::unordered_map<struct ggml_cgraph*, CachedItem> compiled_cache;
|
||||
|
||||
std::shared_ptr<ov::Model> model;
|
||||
ov::CompiledModel compiled_model;
|
||||
int64_t conversion_end_time;
|
||||
int64_t compile_end_time;
|
||||
|
||||
auto ggml_decoder = get_ggml_decoder(cgraph);
|
||||
std::shared_ptr<ov::frontend::DecoderBase> graph_decoder = ggml_decoder;
|
||||
ov::frontend::InputModel::Ptr input_model = front_end->load(graph_decoder);
|
||||
if (!input_model) {
|
||||
GGML_LOG_ERROR("Input Model is not loaded \n");
|
||||
return GGML_STATUS_FAILED;
|
||||
auto it = compiled_cache.find(cgraph);
|
||||
if (it != compiled_cache.end()) {
|
||||
model = it->second.first;
|
||||
conversion_end_time = ggml_time_us();
|
||||
|
||||
compiled_model = it->second.second;
|
||||
compile_end_time = ggml_time_us();
|
||||
} else {
|
||||
std::shared_ptr<ov::frontend::DecoderBase> graph_decoder = ggml_decoder;
|
||||
ov::frontend::InputModel::Ptr input_model = front_end->load(graph_decoder);
|
||||
if (!input_model) {
|
||||
GGML_LOG_ERROR("Input Model is not loaded \n");
|
||||
return GGML_STATUS_FAILED;
|
||||
}
|
||||
|
||||
model = front_end->convert(input_model);
|
||||
conversion_end_time = ggml_time_us();
|
||||
|
||||
if (getenv("GGML_OPENVINO_DUMP_IR")) {
|
||||
char timestamped_filename[64];
|
||||
auto timestamp = (long long)ggml_time_us();
|
||||
snprintf(timestamped_filename, sizeof(timestamped_filename), "model_%lld.xml", timestamp);
|
||||
ov::serialize(model, timestamped_filename);
|
||||
}
|
||||
|
||||
if (!model) {
|
||||
GGML_LOG_ERROR("Model is not converted \n");
|
||||
}
|
||||
compiled_model = core.compile_model(model, "CPU");
|
||||
compile_end_time = ggml_time_us();
|
||||
|
||||
compiled_cache[cgraph] = std::make_pair(model, compiled_model);
|
||||
}
|
||||
|
||||
std::shared_ptr<ov::Model> model = front_end->convert(input_model);
|
||||
auto conversion_end_time = ggml_time_us();
|
||||
|
||||
if (getenv("GGML_OPENVINO_DUMP_IR")) {
|
||||
char timestamped_filename[64];
|
||||
auto timestamp = (long long)ggml_time_us();
|
||||
snprintf(timestamped_filename, sizeof(timestamped_filename), "model_%lld.xml", timestamp);
|
||||
ov::serialize(model, timestamped_filename);
|
||||
}
|
||||
|
||||
if (!model) {
|
||||
GGML_LOG_ERROR("Model is not converted \n");
|
||||
}
|
||||
|
||||
ov::CompiledModel compiled_model = core.compile_model(model, "CPU");
|
||||
auto compile_end_time = ggml_time_us();
|
||||
|
||||
ov::InferRequest infer_request = compiled_model.create_infer_request();
|
||||
auto infer_request_start_time = ggml_time_us();
|
||||
|
||||
auto input_names = ggml_decoder->get_input_names();
|
||||
auto ov_params = model->get_parameters();
|
||||
for (size_t i = 0; i < ov_params.size(); i++) {
|
||||
auto param_name = ov_params[i]->get_friendly_name();
|
||||
auto input_tensor = get_ggml_graph_input_tensor(ggml_decoder, param_name);
|
||||
|
||||
if (getenv("GGML_OPENVINO_DEBUG_INPUT")) {
|
||||
std::cout << "Input name: " << param_name << ", Input shape: " << input_tensor.get_shape()
|
||||
<< ", Address: " << input_tensor.data() << std::endl;
|
||||
switch (input_tensor.get_element_type()) {
|
||||
case ov::element::f32:
|
||||
std::cout << *(float*)(input_tensor.data()) << std::endl;
|
||||
break;
|
||||
case ov::element::f16:
|
||||
std::cout << ov::float16::from_bits(*(uint16_t*)(input_tensor.data())) << std::endl;
|
||||
break;
|
||||
case ov::element::i32:
|
||||
std::cout << *(int32_t*)(input_tensor.data()) << std::endl;
|
||||
break;
|
||||
case ov::element::i64:
|
||||
std::cout << *(int64_t*)(input_tensor.data()) << std::endl;
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
ov::Tensor input_tensor;
|
||||
if (ggml_decoder->get_model_extra_inputs().find(param_name) != ggml_decoder->get_model_extra_inputs().end()) {
|
||||
input_tensor = *ggml_decoder->get_model_extra_input_values().at(param_name);
|
||||
} else {
|
||||
input_tensor = get_ggml_graph_input_tensor(ggml_decoder, param_name);
|
||||
}
|
||||
infer_request.set_input_tensor(i, input_tensor);
|
||||
|
||||
if (getenv("GGML_OPENVINO_DEBUG_INPUT")) {
|
||||
print_input_tensor_info(param_name, input_tensor);
|
||||
}
|
||||
}
|
||||
auto input_end_time = ggml_time_us();
|
||||
|
||||
|
|
@ -131,20 +139,7 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
|
|||
std::memcpy(output_tensors[output_names[i]], output_tensor.data(), output_tensor.get_byte_size());
|
||||
|
||||
if (getenv("GGML_OPENVINO_DEBUG_OUTPUT")) {
|
||||
std::cout << "Output name: " << output_names[i] << ", Output shape: " << output_tensor.get_shape()
|
||||
<< ", Address: " << output_tensors[output_names[i]] << std::endl;
|
||||
switch (output_tensor.get_element_type()) {
|
||||
case ov::element::f32:
|
||||
std::cout << *(float*)(output_tensor.data()) << std::endl;
|
||||
std::cout << checksum(output_tensor.data(), output_tensor.get_byte_size()) << std::endl;
|
||||
break;
|
||||
case ov::element::f16:
|
||||
std::cout << ov::float16::from_bits(*(uint16_t*)(output_tensor.data())) << std::endl;
|
||||
std::cout << checksum(output_tensor.data(), output_tensor.get_byte_size()) << std::endl;
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
print_output_tensor_info(output_names[i], output_tensor, output_tensors);
|
||||
}
|
||||
}
|
||||
auto end_time = ggml_time_us();
|
||||
|
|
@ -153,9 +148,7 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
|
|||
GGML_LOG_INFO("GGML OpenVINO Backend: \n");
|
||||
GGML_LOG_INFO(" - Graph conversion Time: %ld ms \n", (conversion_end_time - start_time) / 1000);
|
||||
GGML_LOG_INFO(" - Graph compile Time: %ld ms \n", (compile_end_time - conversion_end_time) / 1000);
|
||||
GGML_LOG_INFO(" - Graph InferRequest created Time: %ld ms \n",
|
||||
(infer_request_start_time - compile_end_time) / 1000);
|
||||
GGML_LOG_INFO(" - Graph Input Time: %ld ms \n", (input_end_time - infer_request_start_time) / 1000);
|
||||
GGML_LOG_INFO(" - Graph Input Time: %ld ms \n", (input_end_time - compile_end_time) / 1000);
|
||||
GGML_LOG_INFO(" - Graph Inference Time: %ld ms \n", (infer_end_time - input_end_time) / 1000);
|
||||
GGML_LOG_INFO(" - Graph Output Time: %ld ms \n", (end_time - infer_end_time) / 1000);
|
||||
}
|
||||
|
|
@ -172,3 +165,43 @@ size_t checksum(const void* data, size_t size) {
|
|||
}
|
||||
return sum;
|
||||
}
|
||||
|
||||
// Debug helper (GGML_OPENVINO_DEBUG_INPUT): prints an input tensor's name,
// shape, data address, and its first element for the common element types.
// Element types not listed in the switch are silently skipped.
void print_input_tensor_info(const std::string& name, const ov::Tensor& tensor) {
    std::cout << "Input name: " << name << ", Input shape: " << tensor.get_shape() << ", Address: " << tensor.data()
              << std::endl;
    // Use named casts instead of C-style casts when reinterpreting the
    // untyped data() pointer.
    switch (tensor.get_element_type()) {
        case ov::element::f32:
            std::cout << *static_cast<const float*>(tensor.data()) << std::endl;
            break;
        case ov::element::f16:
            std::cout << ov::float16::from_bits(*static_cast<const uint16_t*>(tensor.data())) << std::endl;
            break;
        case ov::element::i32:
            std::cout << *static_cast<const int32_t*>(tensor.data()) << std::endl;
            break;
        case ov::element::i64:
            std::cout << *static_cast<const int64_t*>(tensor.data()) << std::endl;
            break;
        default:
            break;
    }
}
|
||||
|
||||
void print_output_tensor_info(const std::string& name,
|
||||
const ov::Tensor& tensor,
|
||||
std::map<std::string, void*>& output_dst) {
|
||||
std::cout << "Output name: " << name << ", Output shape: " << tensor.get_shape()
|
||||
<< ", Address: " << output_dst[name] << std::endl;
|
||||
switch (tensor.get_element_type()) {
|
||||
case ov::element::f32:
|
||||
std::cout << *(float*)(tensor.data()) << std::endl;
|
||||
std::cout << checksum(tensor.data(), tensor.get_byte_size()) << std::endl;
|
||||
break;
|
||||
case ov::element::f16:
|
||||
std::cout << ov::float16::from_bits(*(uint16_t*)(tensor.data())) << std::endl;
|
||||
std::cout << checksum(tensor.data(), tensor.get_byte_size()) << std::endl;
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -4,3 +4,9 @@
|
|||
enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_cgraph* cgraph);
|
||||
|
||||
size_t checksum(const void* data, size_t size);
|
||||
|
||||
void print_input_tensor_info(const std::string& name, const ov::Tensor& tensor);
|
||||
|
||||
void print_output_tensor_info(const std::string& name,
|
||||
const ov::Tensor& tensor,
|
||||
std::map<std::string, void*>& output_dst);
|
||||
|
|
|
|||
Loading…
Reference in New Issue