Style: middle ptr and ref align, omit optional struct keyword

Yu, Zijun 2025-10-21 14:45:32 +08:00 committed by Mustafa Cavus
parent bd3093f90c
commit eba8113dc4
32 changed files with 670 additions and 653 deletions
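For context, the two rules named in the title are clang-format's PointerAlignment/ReferenceAlignment set to Middle, plus dropping the optional struct/class elaboration on ggml types that are already declared. A minimal sketch assembled from declarations that appear in the diffs below (forward declarations added so it stands alone; not itself part of the commit):

struct ggml_tensor;   // forward declarations so the sketch compiles on its own
struct ggml_cgraph;

// Before: Left alignment, elaborated "struct" keyword
// void set_input_output(struct ggml_tensor* node, bool naive = false);
// struct ggml_cgraph* m_cgraph = nullptr;

// After: Middle alignment spaces * and & on both sides; the optional "struct" is omitted
void set_input_output(ggml_tensor * node, bool naive = false);
ggml_cgraph * m_cgraph = nullptr;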

View File

@ -1,17 +1,17 @@
#pragma once
#include "ggml.h"
#include "ggml-backend.h"
#include "ggml.h"
#include <cstring>
#include <array>
#include <cstring>
#ifdef __cplusplus
extern "C" {
#endif
#define GGML_OPENVINO_NAME "OPENVINO"
#define GGML_OPENVINO_MAX_DEVICES 16
#define GGML_OPENVINO_NAME "OPENVINO"
#define GGML_OPENVINO_MAX_DEVICES 16
// backend API
GGML_BACKEND_API ggml_backend_t ggml_backend_openvino_init(int device);
@ -28,7 +28,7 @@ GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_openvino_split_buffer_t
// and GPU
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_openvino_host_buffer_type(void);
GGML_BACKEND_API int ggml_backend_openvino_get_device_count(void);
GGML_BACKEND_API int ggml_backend_openvino_get_device_count(void);
// GGML_BACKEND_API void ggml_backend_openvino_get_device_description(int device, char * description,
// size_t description_size);
// GGML_BACKEND_API void ggml_backend_openvino_get_device_memory(int device, size_t * free, size_t * total);
@ -42,13 +42,13 @@ struct ggml_openvino_device_info {
int device_count;
struct openvino_device_info {
int cc; // compute capability
int nsm; // number of streaming multiprocessors
size_t smpb; // max. shared memory per block
size_t smpbo; // max. shared memory per block (with opt-in)
bool vmm; // virtual memory support
size_t vmm_granularity; // granularity of virtual memory
size_t total_vram;
int cc; // compute capability
int nsm; // number of streaming multiprocessors
size_t smpb; // max. shared memory per block
size_t smpbo; // max. shared memory per block (with opt-in)
bool vmm; // virtual memory support
size_t vmm_granularity; // granularity of virtual memory
size_t total_vram;
};
openvino_device_info devices[GGML_OPENVINO_MAX_DEVICES] = {};

View File

@ -2,12 +2,10 @@
# Override root .clang-format
AlignConsecutiveAssignments: false
AlignConsecutiveDeclarations: false
ReferenceAlignment: Left
PointerAlignment: Left
Cpp11BracedListStyle: true
AccessModifierOffset: -4
BinPackArguments: false
SpacesInContainerLiterals: false
BreakBeforeBraces: Attach
AccessModifierOffset: -4
IndentCaseBlocks: false
IndentCaseLabels: false
@ -32,7 +30,15 @@ AllowShortIfStatementsOnASingleLine: Never
AllowShortLambdasOnASingleLine: Inline
AllowShortLoopsOnASingleLine: false
AlwaysBreakBeforeMultilineStrings: true
BinPackParameters: true
# Treat CUDA keywords/attributes as "attribute macros" and avoid breaking lines inside them
AttributeMacros:
- __host__
- __device__
- __global__
- __forceinline__
- __launch_bounds__
BinPackArguments: true
BinPackParameters: false # OnePerLine
BitFieldColonSpacing: Both
# BreakAdjacentStringLiterals: true
BreakAfterAttributes: Never
@ -58,15 +64,18 @@ ExperimentalAutoDetectBinPacking: false
FixNamespaceComments: true
IncludeBlocks: Regroup
IncludeCategories:
- Regex: '^<.*\.h>'
- Regex: '".*"'
Priority: 1
SortPriority: 0
- Regex: '^<.*'
- Regex: '^<.*\.h>'
Priority: 2
SortPriority: 0
- Regex: '.*'
- Regex: '^<.*'
Priority: 3
SortPriority: 0
- Regex: '.*'
Priority: 4
SortPriority: 0
IncludeIsMainRegex: '([-_](test|unittest))?$'
IncludeIsMainSourceRegex: ''
IndentAccessModifiers: false
@ -100,6 +109,7 @@ PenaltyBreakString: 1000
PenaltyBreakTemplateDeclaration: 10
PenaltyExcessCharacter: 1000000
PenaltyReturnTypeOnItsOwnLine: 200
PointerAlignment: Middle
QualifierAlignment: Left
#QualifierOrder: ['static', 'inline', 'friend', 'constexpr', 'const', 'volatile', 'type', 'restrict']
RawStringFormats:
@ -113,6 +123,7 @@ RawStringFormats:
- 'c++'
- 'C++'
CanonicalDelimiter: ''
ReferenceAlignment: Middle
ReflowComments: false # IndentOnly
SeparateDefinitionBlocks: Always
SortIncludes: CaseInsensitive
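Read together with IncludeBlocks: Regroup, the reordered IncludeCategories sort quoted project headers first, then angle-bracket headers ending in .h, then the remaining angle-bracket headers; this is exactly the include reshuffling visible in the source files below. A sketch of the resulting grouping, using includes that appear in this commit:

#include "ggml-backend.h"   // priority 1: "..." project headers
#include "ggml.h"

#include <ggml-impl.h>      // priority 2: <...> headers ending in .h

#include <array>            // priority 3: remaining <...> headers
#include <cstring>

Presumably the whole backend was then re-run through clang-format (e.g. clang-format -i over the OpenVINO sources), which would account for most of the 670-addition/653-deletion churn.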

View File

@ -1,5 +1,9 @@
#include "ggml-decoder.h"
#include "ggml-backend-impl.h"
#include "ggml-backend.h"
#include "ggml-quants.hpp"
#include <ggml-impl.h>
#include <ggml.h>
@ -32,13 +36,16 @@
#include <string>
#include <vector>
#include "ggml-backend-impl.h"
#include "ggml-backend.h"
#include "ggml-quants.hpp"
GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgraph, bool is_static, bool is_first_token,
int context_size, int context_size_swa, int num_heads, int num_heads_kv, int head_size,
const std::vector<int>& swa_layers) :
GgmlOvDecoder::GgmlOvDecoder(ggml_tensor * node,
ggml_cgraph * cgraph,
bool is_static,
bool is_first_token,
int context_size,
int context_size_swa,
int num_heads,
int num_heads_kv,
int head_size,
const std::vector<int> & swa_layers) :
m_cgraph(cgraph),
m_node(node),
m_op_name(std::string(node->name)),
@ -53,8 +60,9 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgrap
set_input_output(node);
}
GgmlOvDecoder::GgmlOvDecoder(struct ggml_cgraph* cgraph,
std::map<std::string, std::shared_ptr<ov::Node>>& model_weights, bool is_static,
GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph,
std::map<std::string, std::shared_ptr<ov::Node>> & model_weights,
bool is_static,
bool is_first_token) :
m_cgraph(cgraph),
m_op_name(m_node ? std::string(m_node->name) : ""),
@ -68,7 +76,7 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_cgraph* cgraph,
set_llm_params();
for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) {
auto* cur_node = cgraph->nodes[node_n];
auto * cur_node = cgraph->nodes[node_n];
m_nodes.push_back(cur_node);
set_input_output(cur_node);
}
@ -76,12 +84,11 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_cgraph* cgraph,
// add_extra_inputs();
}
GgmlOvDecoder::GgmlOvDecoder(struct ggml_cgraph* cgraph,
std::map<std::string, std::shared_ptr<ov::Node>>& model_weights) {
GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph, std::map<std::string, std::shared_ptr<ov::Node>> & model_weights) {
m_cgraph = cgraph;
m_model_weights = model_weights;
for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) {
auto* cur_node = cgraph->nodes[node_n];
auto * cur_node = cgraph->nodes[node_n];
if (cur_node->op == GGML_OP_NONE) {
continue;
}
@ -93,7 +100,7 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_cgraph* cgraph,
// Called in GgmlOvDecoder constructor. Three cases: 1. constructing a decoder for the whole graph;
// 2. constructing a decoder for a node;
// 3. constructing a decoder for the whole graph naively (op test case)
void GgmlOvDecoder::set_input_output(ggml_tensor* node, bool naive) {
void GgmlOvDecoder::set_input_output(ggml_tensor * node, bool naive) {
std::string node_name;
if (node->op == GGML_OP_SET_ROWS) {
// SET_ROWS updates the tensor in place. For later ov op that uses the
@ -109,7 +116,7 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, bool naive) {
m_outputs[node_name] = node;
for (int i = 0; i < GGML_MAX_SRC; i++) {
auto* src = node->src[i];
auto * src = node->src[i];
if (src == nullptr) {
continue;
}
@ -128,7 +135,7 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, bool naive) {
}
} else if (!m_node && !src->view_src) {
ggml_backend_buffer* buffer = src->buffer;
ggml_backend_buffer * buffer = src->buffer;
if (buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY || src->flags & GGML_TENSOR_FLAG_INPUT) {
// GGML_BACKEND_BUFFER_USAGE_ANY are kv caches
@ -236,8 +243,8 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, bool naive) {
}
case GGML_OP_VIEW: {
if (node->src[0]->op == GGML_OP_VIEW) {
auto* src = node->src[0];
auto* view_src = src->view_src;
auto * src = node->src[0];
auto * view_src = src->view_src;
if (view_src->ne[1] != src->ne[2]) {
throw std::runtime_error("Unsupported VIEW case");
}
@ -250,7 +257,7 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, bool naive) {
}
}
int extract_layer_from_name(const std::string& name) {
int extract_layer_from_name(const std::string & name) {
size_t pos1 = name.find("_l");
assert(pos1 != std::string::npos);
pos1 += 2;
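(For context: the tensor names parsed here follow the per-layer kv-cache naming used elsewhere in this decoder, see the cache_k / cache_v checks below; so a hypothetical call extract_layer_from_name("cache_k_l12") would return 12.)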
@ -265,10 +272,10 @@ int extract_layer_from_name(const std::string& name) {
void GgmlOvDecoder::set_llm_params() {
for (int i = 0; i < m_cgraph->n_nodes; i++) {
auto* node = m_cgraph->nodes[i];
auto * node = m_cgraph->nodes[i];
std::string name = std::string(node->name);
if (node->op == GGML_OP_FLASH_ATTN_EXT) {
auto* cache_k = node->src[1];
auto * cache_k = node->src[1];
cache_k = cache_k->view_src ? cache_k->view_src : cache_k;
int layer = extract_layer_from_name(cache_k->name);
@ -290,7 +297,7 @@ void GgmlOvDecoder::set_llm_params() {
}
}
ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor* src) const {
ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor * src) const {
auto name = std::string(src->name);
ov::PartialShape input_shape;
if (name == "inp_tokens" || name == "inp_pos") {
@ -323,7 +330,7 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor* src) co
} else {
input_shape = ov::PartialShape{1, -1, m_num_heads_kv, m_head_size};
}
} else if (const auto* op = get_tensor_used_op(src); op && op->op == GGML_OP_SET_ROWS) {
} else if (const auto * op = get_tensor_used_op(src); op && op->op == GGML_OP_SET_ROWS) {
input_shape = ov::PartialShape{1, 1, m_is_static ? 1 : -1};
} else if (src->op == GGML_OP_VIEW) {
// This case is added to make test-backend-ops work
@ -342,9 +349,9 @@ void GgmlOvDecoder::add_extra_inputs() {
// Update: not used anymore after the optimization of making kvcache dynamic (but breaks iSWA models)
int64_t attention_size = -1;
int64_t attention_size_swa = -1;
for (const auto& node : m_nodes) {
for (const auto & node : m_nodes) {
if (node->op == GGML_OP_FLASH_ATTN_EXT) {
auto* mask = node->src[3];
auto * mask = node->src[3];
std::string mask_name(mask->name);
if (mask_name.find("KQ_mask") != 0) {
throw std::runtime_error("Unexpected flash attention node: " + std::string(mask->name));
@ -357,7 +364,7 @@ void GgmlOvDecoder::add_extra_inputs() {
}
}
auto create_attention_size_input = [this](const std::string& name, int64_t size) {
auto create_attention_size_input = [this](const std::string & name, int64_t size) {
auto param_node = std::make_shared<ov::op::v0::Parameter>(ov::element::i64, ov::Shape{1});
param_node->set_friendly_name(name);
param_node->output(0).get_tensor().set_names({name});
@ -374,12 +381,12 @@ void GgmlOvDecoder::add_extra_inputs() {
}
}
const ggml_tensor* GgmlOvDecoder::get_tensor_used_op(const ggml_tensor* tensor) const {
const ggml_tensor * GgmlOvDecoder::get_tensor_used_op(const ggml_tensor * tensor) const {
if (tensor == nullptr) {
return nullptr;
}
for (int i = 0; i < m_cgraph->n_nodes; i++) {
const auto* node = m_cgraph->nodes[i];
const auto * node = m_cgraph->nodes[i];
for (int j = 0; j < GGML_MAX_SRC; j++) {
if (node->src[j] == tensor) {
return node;
@ -389,11 +396,11 @@ const ggml_tensor* GgmlOvDecoder::get_tensor_used_op(const ggml_tensor* tensor)
return nullptr;
}
const ggml_tensor* GgmlOvDecoder::get_tensor_from_name(const std::string& name) const {
const ggml_tensor * GgmlOvDecoder::get_tensor_from_name(const std::string & name) const {
for (int i = 0; i < m_cgraph->n_nodes; i++) {
const auto* node = m_cgraph->nodes[i];
const auto * node = m_cgraph->nodes[i];
for (int j = 0; j < GGML_MAX_SRC; j++) {
const auto* src = node->src[j];
const auto * src = node->src[j];
if (src == nullptr) {
break;
}
@ -407,7 +414,7 @@ const ggml_tensor* GgmlOvDecoder::get_tensor_from_name(const std::string& name)
std::map<std::string, std::string> GgmlOvDecoder::get_kv_param_res_names() const {
std::map<std::string, std::string> kv_param_res_names;
for (const auto& name : m_kv_names) {
for (const auto & name : m_kv_names) {
if (name.find("cache_k") == 0 || name.find("cache_v") == 0) {
kv_param_res_names[name] = name;
}
@ -416,21 +423,22 @@ std::map<std::string, std::string> GgmlOvDecoder::get_kv_param_res_names() const
}
std::map<std::string, std::shared_ptr<ov::Node>> GgmlOvDecoder::create_weight_nodes(
struct ggml_cgraph* cgraph, std::map<ggml_type, ExtraQuantType> types_to_requantize) {
ggml_cgraph * cgraph,
std::map<ggml_type, ExtraQuantType> types_to_requantize) {
std::map<std::string, std::shared_ptr<ov::Node>> model_weights;
static std::mutex weights_mutex;
auto* nodes = cgraph->nodes;
auto * nodes = cgraph->nodes;
auto n_nodes = cgraph->n_nodes;
std::for_each(std::execution::par, nodes, nodes + n_nodes, [&](ggml_tensor* node) {
std::for_each(std::execution::par, nodes, nodes + n_nodes, [&](ggml_tensor * node) {
for (int i = 0; i < GGML_MAX_SRC; i++) {
auto* src = node->src[i];
auto * src = node->src[i];
if (src == nullptr) {
continue;
}
std::string src_name(src->name);
if (!src->view_src) {
ggml_backend_buffer* buffer = src->buffer;
ggml_backend_buffer * buffer = src->buffer;
if (buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS || ggml_is_quantized(src->type)) {
bool should_create = false;
{
@ -458,17 +466,10 @@ std::map<std::string, std::shared_ptr<ov::Node>> GgmlOvDecoder::create_weight_no
return model_weights;
}
std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor* tensor,
std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor * tensor,
std::optional<ExtraQuantType> requant_type) {
std::set<ggml_type> weight_types = {GGML_TYPE_F32,
GGML_TYPE_F16,
GGML_TYPE_BF16,
GGML_TYPE_Q8_0,
GGML_TYPE_Q4_0,
GGML_TYPE_Q4_1,
GGML_TYPE_Q4_K,
GGML_TYPE_Q5_K,
GGML_TYPE_Q6_K};
std::set<ggml_type> weight_types = {GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_BF16, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0,
GGML_TYPE_Q4_1, GGML_TYPE_Q4_K, GGML_TYPE_Q5_K, GGML_TYPE_Q6_K};
if (weight_types.find(tensor->type) == weight_types.end()) {
throw std::runtime_error("Unexpected weight tensor type: " + std::string(tensor->name) + " with type " +
ggml_type_name(tensor->type));
@ -495,9 +496,8 @@ std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor* tensor,
}
// Quantized case
OPENVINO_ASSERT(
tensor->extra == nullptr,
"Unsupported weight tensor: " + std::string(tensor->name) + " Possibly this is a repacked quantized weights");
OPENVINO_ASSERT(tensor->extra == nullptr, "Unsupported weight tensor: " + std::string(tensor->name) +
" Possibly this is a repacked quantized weights");
if (requant_type.has_value()) {
return requantize(tensor, requant_type.value());
@ -518,11 +518,8 @@ std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor* tensor,
weights_per_block = 32;
}
OPENVINO_ASSERT(node_shape.back() % weights_per_block == 0,
"[load_gguf] tensor ",
tensor->name,
" has incompatible last dim shape: ",
node_shape.back());
OPENVINO_ASSERT(node_shape.back() % weights_per_block == 0, "[load_gguf] tensor ", tensor->name,
" has incompatible last dim shape: ", node_shape.back());
ov::Tensor weights(weight_type, node_shape);
// For scales and biases
@ -557,7 +554,7 @@ std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor* tensor,
return weight_node.get_node_shared_ptr();
}
void GgmlOvDecoder::dump_cgraph(const struct ggml_cgraph* cgraph, std::string& filename) {
void GgmlOvDecoder::dump_cgraph(const ggml_cgraph * cgraph, std::string & filename) {
std::ofstream file(filename);
if (!file.is_open()) {
std::cerr << "Failed to open file" << std::endl;
@ -576,7 +573,7 @@ void GgmlOvDecoder::dump_cgraph(const struct ggml_cgraph* cgraph, std::string& f
<< std::setw(50) << "stride"
<< "\n";
for (int i = 0; i < cgraph->n_nodes; i++) {
struct ggml_tensor * node = cgraph->nodes[i];
ggml_tensor * node = cgraph->nodes[i];
file << " - " << std::setw(3) << i << ": [ "
<< std::setw(5) << node->ne[0] << ", "
@ -614,7 +611,7 @@ void GgmlOvDecoder::dump_cgraph(const struct ggml_cgraph* cgraph, std::string& f
file << "n_leafs = " << cgraph->n_leafs << "\n";
for (int i = 0; i < cgraph->n_leafs; i++) {
struct ggml_tensor * node = cgraph->leafs[i];
ggml_tensor * node = cgraph->leafs[i];
file << " - " << std::setw(3) << i << ": [ "
<< std::setw(5) << node->ne[0] << ", "
@ -628,10 +625,10 @@ void GgmlOvDecoder::dump_cgraph(const struct ggml_cgraph* cgraph, std::string& f
file.close();
}
void print_tensor_address_map(const struct ggml_cgraph* cgraph) {
std::map<void*, std::vector<std::string>> address_map;
void print_tensor_address_map(const ggml_cgraph * cgraph) {
std::map<void *, std::vector<std::string>> address_map;
for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) {
auto* node = cgraph->nodes[node_n];
auto * node = cgraph->nodes[node_n];
if (node->data) {
auto it = address_map.find(node->data);
if (it == address_map.end()) {
@ -640,16 +637,16 @@ void print_tensor_address_map(const struct ggml_cgraph* cgraph) {
address_map[node->data].push_back(node->name);
}
}
for (const auto& pair : address_map) {
for (const auto & pair : address_map) {
std::cout << "Address: " << pair.first << std::endl;
for (const auto& name : pair.second) {
for (const auto & name : pair.second) {
std::cout << name << " ; ";
}
std::cout << std::endl << std::endl;
}
}
std::vector<size_t> GgmlOvDecoder::get_shape(const ggml_tensor* tensor) {
std::vector<size_t> GgmlOvDecoder::get_shape(const ggml_tensor * tensor) {
std::vector<size_t> shape;
for (int i = GGML_MAX_DIMS - 2; i >= 0; --i) {
shape.push_back(static_cast<size_t>(tensor->ne[i]));
@ -657,7 +654,7 @@ std::vector<size_t> GgmlOvDecoder::get_shape(const ggml_tensor* tensor) {
return shape;
}
std::vector<size_t> GgmlOvDecoder::get_stride(const ggml_tensor* tensor) {
std::vector<size_t> GgmlOvDecoder::get_stride(const ggml_tensor * tensor) {
std::vector<size_t> stride;
for (int i = GGML_MAX_DIMS - 2; i >= 0; --i) {
stride.push_back(static_cast<size_t>(tensor->nb[i]));
@ -665,7 +662,7 @@ std::vector<size_t> GgmlOvDecoder::get_stride(const ggml_tensor* tensor) {
return stride;
}
ov::element::Type GgmlOvDecoder::get_ov_type(const ggml_tensor* tensor) {
ov::element::Type GgmlOvDecoder::get_ov_type(const ggml_tensor * tensor) {
switch (tensor->type) {
case GGML_TYPE_F64:
return ov::element::f64;
@ -688,15 +685,15 @@ ov::element::Type GgmlOvDecoder::get_ov_type(const ggml_tensor* tensor) {
}
}
ov::PartialShape GgmlOvDecoder::get_input_shape(const std::string& name) const {
ov::PartialShape GgmlOvDecoder::get_input_shape(const std::string & name) const {
return ov::PartialShape(get_shape(m_inputs.at(name)));
}
std::vector<size_t> GgmlOvDecoder::get_input_stride(const std::string& name) const {
std::vector<size_t> GgmlOvDecoder::get_input_stride(const std::string & name) const {
return get_stride(m_inputs.at(name));
}
ov::element::Type GgmlOvDecoder::get_input_type(const std::string& name) const {
ov::element::Type GgmlOvDecoder::get_input_type(const std::string & name) const {
return get_ov_type(m_inputs.at(name));
}
@ -704,7 +701,7 @@ size_t GgmlOvDecoder::get_input_size() const {
return m_input_names.size();
}
std::string& GgmlOvDecoder::get_input_name(size_t index) const {
std::string & GgmlOvDecoder::get_input_name(size_t index) const {
m_name = m_input_names[index];
return m_name;
}
@ -713,19 +710,19 @@ std::vector<std::string> GgmlOvDecoder::get_input_names() const {
return m_input_names;
}
std::vector<size_t> GgmlOvDecoder::get_output_stride(const std::string& name) const {
std::vector<size_t> GgmlOvDecoder::get_output_stride(const std::string & name) const {
return get_stride(m_outputs.at(name));
}
ov::PartialShape GgmlOvDecoder::get_output_shape(const std::string& name) const {
ov::PartialShape GgmlOvDecoder::get_output_shape(const std::string & name) const {
return ov::PartialShape(get_shape(m_outputs.at(name)));
}
ov::element::Type GgmlOvDecoder::get_output_type(const std::string& name) const {
ov::element::Type GgmlOvDecoder::get_output_type(const std::string & name) const {
return get_ov_type(m_outputs.at(name));
}
std::string& GgmlOvDecoder::get_output_name(size_t index) const {
std::string & GgmlOvDecoder::get_output_name(size_t index) const {
m_name = std::string(m_output_names[index]);
return m_name;
}
@ -734,35 +731,28 @@ std::vector<std::string> GgmlOvDecoder::get_output_names() const {
return m_output_names;
}
const std::string& GgmlOvDecoder::get_op_name() const {
const std::string & GgmlOvDecoder::get_op_name() const {
return m_op_name;
}
int32_t* GgmlOvDecoder::get_input_op_params(const std::string& name) const {
int32_t * GgmlOvDecoder::get_input_op_params(const std::string & name) const {
return m_inputs.at(name)->op_params;
}
int32_t* GgmlOvDecoder::get_output_op_params(const std::string& name) const {
int32_t * GgmlOvDecoder::get_output_op_params(const std::string & name) const {
return m_outputs.at(name)->op_params;
}
void GgmlOvDecoder::visit_subgraph(std::function<void(std::shared_ptr<GgmlDecoder>)> node_visitor) const {
for (const auto& node : m_nodes) {
auto decoder = std::make_shared<GgmlOvDecoder>(node,
m_cgraph,
m_is_static,
m_is_first_token,
m_context_size,
m_context_size_swa,
m_num_heads,
m_num_heads_kv,
m_head_size,
m_swa_layers);
for (const auto & node : m_nodes) {
auto decoder =
std::make_shared<GgmlOvDecoder>(node, m_cgraph, m_is_static, m_is_first_token, m_context_size,
m_context_size_swa, m_num_heads, m_num_heads_kv, m_head_size, m_swa_layers);
node_visitor(decoder);
}
}
const std::string& GgmlOvDecoder::get_op_type() const {
const std::string & GgmlOvDecoder::get_op_type() const {
static const std::map<ggml_op, std::string> ops = {
{GGML_OP_NONE, "GGML_OP_NONE" },
{GGML_OP_ACC, "GGML_OP_ACC" },

View File

@ -1,5 +1,9 @@
#pragma once
#include "ggml-quants.hpp"
#include "ggml.h"
#include "openvino/decoder.hpp"
#include <cstdint>
#include <map>
#include <memory>
@ -7,98 +11,99 @@
#include <optional>
#include <vector>
#include "ggml-quants.hpp"
#include "ggml.h"
#include "openvino/decoder.hpp"
class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {
public:
// Graph decoder
GgmlOvDecoder(struct ggml_cgraph* cgraph, std::map<std::string, std::shared_ptr<ov::Node>>& model_weights,
bool is_static, bool is_first_token);
GgmlOvDecoder(ggml_cgraph * cgraph,
std::map<std::string, std::shared_ptr<ov::Node>> & model_weights,
bool is_static,
bool is_first_token);
// Node decoder, called in GgmlOvDecoder::visit_subgraph
GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgraph, bool is_static, bool is_first_token,
int context_size, int context_size_swa, int num_heads, int num_heads_kv, int head_size,
const std::vector<int>& swa_layers);
GgmlOvDecoder(ggml_tensor * node,
ggml_cgraph * cgraph,
bool is_static,
bool is_first_token,
int context_size,
int context_size_swa,
int num_heads,
int num_heads_kv,
int head_size,
const std::vector<int> & swa_layers);
// Naive graph decoder
GgmlOvDecoder(struct ggml_cgraph* cgraph, std::map<std::string, std::shared_ptr<ov::Node>>& model_weights);
GgmlOvDecoder(ggml_cgraph * cgraph, std::map<std::string, std::shared_ptr<ov::Node>> & model_weights);
virtual ov::Any get_attribute(const std::string& name) const override {
virtual ov::Any get_attribute(const std::string & name) const override {
return nullptr;
GGML_UNUSED(name);
}
virtual ov::PartialShape get_input_shape(const std::string& name) const override;
virtual ov::PartialShape get_input_shape(const std::string & name) const override;
virtual std::vector<size_t> get_input_stride(const std::string& name) const override;
virtual std::vector<size_t> get_input_stride(const std::string & name) const override;
virtual ov::element::Type get_input_type(const std::string& name) const override;
virtual ov::element::Type get_input_type(const std::string & name) const override;
virtual size_t get_input_size() const override;
virtual void get_input_node(size_t input_port_idx,
std::string& producer_name,
std::string& producer_output_port_name,
size_t& producer_output_port_index) const override {
std::string & producer_name,
std::string & producer_output_port_name,
size_t & producer_output_port_index) const override {
GGML_UNUSED(input_port_idx);
GGML_UNUSED(producer_name);
GGML_UNUSED(producer_output_port_name);
GGML_UNUSED(producer_output_port_index);
}
virtual std::string& get_input_name(size_t index) const override;
virtual std::string & get_input_name(size_t index) const override;
virtual std::vector<std::string> get_input_names() const override;
virtual ov::PartialShape get_output_shape(const std::string& name) const override;
virtual ov::PartialShape get_output_shape(const std::string & name) const override;
virtual std::vector<size_t> get_output_stride(const std::string& name) const override;
virtual std::vector<size_t> get_output_stride(const std::string & name) const override;
virtual ov::element::Type get_output_type(const std::string& name) const override;
virtual ov::element::Type get_output_type(const std::string & name) const override;
virtual int32_t* get_input_op_params(const std::string& name) const override;
virtual int32_t * get_input_op_params(const std::string & name) const override;
virtual int32_t* get_output_op_params(const std::string& name) const override;
virtual int32_t * get_output_op_params(const std::string & name) const override;
virtual std::string& get_output_name(size_t index) const override;
virtual std::string & get_output_name(size_t index) const override;
virtual std::vector<std::string> get_output_names() const override;
virtual const std::string& get_op_type() const override;
virtual const std::string & get_op_type() const override;
virtual const std::string& get_op_name() const override;
virtual const std::string & get_op_name() const override;
virtual void visit_subgraph(std::function<void(std::shared_ptr<GgmlDecoder>)> node_visitor) const override;
const ggml_tensor* get_input_ggml_tensor(const std::string& name) const {
return m_inputs.at(name);
}
const ggml_tensor * get_input_ggml_tensor(const std::string & name) const { return m_inputs.at(name); }
const ggml_tensor* get_output_ggml_tensor(const std::string& name) const {
return m_outputs.at(name);
}
const ggml_tensor * get_output_ggml_tensor(const std::string & name) const { return m_outputs.at(name); }
virtual int get_op_case() const override {
return m_op_case;
}
virtual int get_op_case() const override { return m_op_case; }
virtual const std::map<std::string, std::shared_ptr<ov::Node>>& get_model_inputs() const override {
virtual const std::map<std::string, std::shared_ptr<ov::Node>> & get_model_inputs() const override {
return m_model_inputs;
}
virtual const std::map<std::string, std::shared_ptr<ov::Node>>& get_model_extra_inputs() const override {
virtual const std::map<std::string, std::shared_ptr<ov::Node>> & get_model_extra_inputs() const override {
return m_model_extra_inputs;
}
virtual const std::map<std::string, std::shared_ptr<ov::Tensor>>& get_model_extra_input_values() const {
virtual const std::map<std::string, std::shared_ptr<ov::Tensor>> & get_model_extra_input_values() const {
return m_model_extra_input_values;
}
virtual const std::map<std::string, std::shared_ptr<ov::Node>>& get_model_weights() const override {
virtual const std::map<std::string, std::shared_ptr<ov::Node>> & get_model_weights() const override {
return m_model_weights;
}
virtual const std::vector<std::string>& get_model_output_names() const override {
return m_model_output_names;
}
virtual const std::vector<std::string> & get_model_output_names() const override { return m_model_output_names; }
virtual int get_context_size() const override { return m_context_size; }
@ -114,7 +119,7 @@ public:
virtual int get_head_size() const override { return m_head_size; }
virtual int32_t* get_rope_params() const override { return m_rope_params; }
virtual int32_t * get_rope_params() const override { return m_rope_params; }
virtual std::map<std::string, std::string> get_kv_param_res_names() const override;
@ -122,36 +127,39 @@ public:
virtual bool is_first_token() const override { return m_is_first_token; }
ov::PartialShape get_graph_input_shape(const ggml_tensor* src) const;
ov::PartialShape get_graph_input_shape(const ggml_tensor * src) const;
static void dump_cgraph(const struct ggml_cgraph* cgraph, std::string& filename);
static void dump_cgraph(const ggml_cgraph * cgraph, std::string & filename);
static std::shared_ptr<ov::Node> create_weight_node(ggml_tensor* tensor,
static std::shared_ptr<ov::Node> create_weight_node(ggml_tensor * tensor,
std::optional<ExtraQuantType> requant_type = std::nullopt);
static std::map<std::string, std::shared_ptr<ov::Node>> create_weight_nodes(
struct ggml_cgraph* cgraph, std::map<ggml_type, ExtraQuantType> types_to_requantize = {});
const ggml_tensor* get_tensor_used_op(const ggml_tensor* tensor) const;
const ggml_tensor* get_tensor_from_name(const std::string& name) const;
static std::map<std::string, std::shared_ptr<ov::Node>> create_weight_nodes(
ggml_cgraph * cgraph,
std::map<ggml_type, ExtraQuantType> types_to_requantize = {});
const ggml_tensor * get_tensor_used_op(const ggml_tensor * tensor) const;
const ggml_tensor * get_tensor_from_name(const std::string & name) const;
void clear_model_weights() { m_model_weights.clear(); }
private:
void set_input_output(ggml_tensor* node, bool naive = false);
void set_input_output(ggml_tensor * node, bool naive = false);
void add_extra_inputs();
static std::vector<size_t> get_shape(const ggml_tensor* tensor);
static std::vector<size_t> get_stride(const ggml_tensor* tensor);
static ov::element::Type get_ov_type(const ggml_tensor* tensor);
static std::vector<size_t> get_shape(const ggml_tensor * tensor);
static std::vector<size_t> get_stride(const ggml_tensor * tensor);
static ov::element::Type get_ov_type(const ggml_tensor * tensor);
// set context_size, num_heads, etc
void set_llm_params();
struct ggml_cgraph* m_cgraph = nullptr;
ggml_tensor* m_node = nullptr;
std::vector<ggml_tensor*> m_nodes;
std::map<std::string, ggml_tensor*> m_inputs;
ggml_cgraph * m_cgraph = nullptr;
ggml_tensor * m_node = nullptr;
std::vector<ggml_tensor *> m_nodes;
std::map<std::string, ggml_tensor *> m_inputs;
std::vector<std::string> m_input_names;
std::map<std::string, ggml_tensor*> m_outputs;
std::map<std::string, ggml_tensor *> m_outputs;
std::vector<std::string> m_output_names;
std::string m_op_name;
mutable std::string m_name;
@ -168,12 +176,12 @@ private:
int m_num_heads;
int m_num_heads_kv;
int m_head_size;
int32_t* m_rope_params;
int32_t * m_rope_params;
std::vector<std::string> m_kv_names;
bool m_is_static = false;
bool m_is_first_token;
};
void print_tensor_address_map(const struct ggml_cgraph* cgraph);
void print_tensor_address_map(const ggml_cgraph * cgraph);
int extract_layer_from_name(const std::string& name);
int extract_layer_from_name(const std::string & name);

View File

@ -1,5 +1,11 @@
#include "ggml-openvino.h"
#include "ggml-backend-impl.h"
#include "ggml-backend.h"
#include "ggml-impl.h"
#include "ggml-openvino/utils.h"
#include "ggml.h"
#include <cstdint>
#include <mutex>
#include <openvino/openvino.hpp>
@ -7,39 +13,36 @@
#include <string>
#include <vector>
#include "ggml-backend-impl.h"
#include "ggml-backend.h"
#include "ggml-impl.h"
#include "ggml-openvino/utils.h"
#include "ggml.h"
#define GGML_OPENVINO_MAX_STREAMS 8
struct ggml_backend_openvino_context {
int device; // the device ID currently in use
std::string name; // context name
std::string description; // context description
int device; // the device ID currently in use
std::string name; // context name
std::string description; // context description
// OpenVINO core components
ov::Core core; // OpenVINO core interface
std::shared_ptr<ov::CompiledModel> model; // compiled model
ov::InferRequest infer_request; // inference request
ov::Core core; // OpenVINO core interface
std::shared_ptr<ov::CompiledModel> model; // compiled model
ov::InferRequest infer_request; // inference request
// OpenVINO multi-stream support
static const int MAX_STREAMS = 8; // maximum number of inference streams
std::vector<ov::InferRequest> streams; // used to support multi-stream inference
int current_stream; // the currently active stream index
static const int MAX_STREAMS = 8; // maximum number of inference streams
std::vector<ov::InferRequest> streams; // used to support multi-stream inference
int current_stream; // the currently active stream index
// state management
bool is_initialized; // whether the backend context has been initialized
bool is_initialized; // whether the backend context has been initialized
ggml_backend_openvino_context()
: device(0), name("OpenVINO"), description("OpenVINO Backend Context"),
current_stream(0), is_initialized(false) {}
ggml_backend_openvino_context() :
device(0),
name("OpenVINO"),
description("OpenVINO Backend Context"),
current_stream(0),
is_initialized(false) {}
};
static void ggml_backend_openvino_free(ggml_backend_t backend) {
ggml_backend_openvino_context * ctx = (ggml_backend_openvino_context *)backend->context;
ggml_backend_openvino_context * ctx = (ggml_backend_openvino_context *) backend->context;
delete ctx;
delete backend;
}
@ -49,8 +52,7 @@ static const char * ggml_backend_openvino_get_name(ggml_backend_t backend) {
GGML_UNUSED(backend);
}
static enum ggml_status
ggml_backend_openvino_graph_compute(ggml_backend_t backend, struct ggml_cgraph *cgraph) {
static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
openvino_frontend_compute(backend, cgraph);
return GGML_STATUS_SUCCESS;
@ -78,7 +80,8 @@ int ggml_backend_openvino_get_device_count() {
}
static ggml_guid_t ggml_backend_openvino_guid(void) {
static ggml_guid guid = { 0x12, 0xa8, 0xae, 0xf4, 0xc0, 0x1e, 0x61, 0x97, 0x8f, 0xeb, 0x33, 0x04, 0xa1, 0x33, 0x51, 0x2d };
static ggml_guid guid = {0x12, 0xa8, 0xae, 0xf4, 0xc0, 0x1e, 0x61, 0x97,
0x8f, 0xeb, 0x33, 0x04, 0xa1, 0x33, 0x51, 0x2d};
return &guid;
}
@ -95,7 +98,7 @@ GGML_BACKEND_API ggml_backend_t ggml_backend_openvino_init(int device) {
return nullptr;
}
ggml_backend_t openvino_backend = new ggml_backend {
ggml_backend_t openvino_backend = new ggml_backend{
/* .guid = */ ggml_backend_openvino_guid(),
/* .interface = */ ggml_backend_openvino_interface,
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_openvino_reg(), device),
@ -134,15 +137,15 @@ struct ggml_backend_openvino_buffer_type_context {
};
static const char * ggml_backend_openvino_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
ggml_backend_openvino_buffer_type_context * ctx = (ggml_backend_openvino_buffer_type_context *)buft->context;
ggml_backend_openvino_buffer_type_context * ctx = (ggml_backend_openvino_buffer_type_context *) buft->context;
return ctx->name.c_str();
}
static bool ggml_backend_buft_is_openvino(ggml_backend_buffer_type_t buft) {
return buft->iface.get_name == ggml_backend_openvino_buffer_type_get_name;
}
static const char * ggml_backend_openvino_split_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
return GGML_OPENVINO_NAME "_Split";
@ -160,12 +163,12 @@ struct ggml_backend_openvino_device_context {
};
static const char * ggml_backend_openvino_device_get_name(ggml_backend_dev_t dev) {
ggml_backend_openvino_device_context * ctx = (ggml_backend_openvino_device_context *)dev->context;
ggml_backend_openvino_device_context * ctx = (ggml_backend_openvino_device_context *) dev->context;
return ctx->name.c_str();
}
static const char * ggml_backend_openvino_device_get_description(ggml_backend_dev_t dev) {
ggml_backend_openvino_device_context * ctx = (ggml_backend_openvino_device_context *)dev->context;
ggml_backend_openvino_device_context * ctx = (ggml_backend_openvino_device_context *) dev->context;
return ctx->description.c_str();
}
@ -174,7 +177,7 @@ static void ggml_backend_openvino_device_get_memory(ggml_backend_dev_t dev, size
GGML_ASSERT(dev->context != nullptr);
GGML_ASSERT(free != nullptr);
GGML_ASSERT(total != nullptr);
ggml_backend_openvino_device_context * ctx = (ggml_backend_openvino_device_context *)dev->context;
ggml_backend_openvino_device_context * ctx = (ggml_backend_openvino_device_context *) dev->context;
GGML_ASSERT(ctx->device >= 0);
// ggml_openvino_set_device(ctx->device);
*total = 1;
@ -187,9 +190,9 @@ static enum ggml_backend_dev_type ggml_backend_openvino_device_get_type(ggml_bac
}
static void ggml_backend_openvino_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
props->name = ggml_backend_openvino_device_get_name(dev);
props->name = ggml_backend_openvino_device_get_name(dev);
props->description = ggml_backend_openvino_device_get_description(dev);
props->type = ggml_backend_openvino_device_get_type(dev);
props->type = ggml_backend_openvino_device_get_type(dev);
ggml_backend_openvino_device_get_memory(dev, &props->memory_free, &props->memory_total);
bool host_buffer = getenv("GGML_OPENVINO_NO_PINNED") == nullptr;
@ -209,12 +212,12 @@ static void ggml_backend_openvino_device_get_props(ggml_backend_dev_t dev, ggml_
static ggml_backend_t ggml_backend_openvino_device_init(ggml_backend_dev_t dev, const char * params) {
GGML_UNUSED(params);
ggml_backend_openvino_device_context * ctx = (ggml_backend_openvino_device_context *)dev->context;
ggml_backend_openvino_device_context * ctx = (ggml_backend_openvino_device_context *) dev->context;
return ggml_backend_openvino_init(ctx->device);
}
static ggml_backend_buffer_type_t ggml_backend_openvino_device_get_buffer_type(ggml_backend_dev_t dev) {
ggml_backend_openvino_device_context * ctx = (ggml_backend_openvino_device_context *)dev->context;
ggml_backend_openvino_device_context * ctx = (ggml_backend_openvino_device_context *) dev->context;
return ggml_backend_openvino_buffer_type(ctx->device);
}
@ -223,7 +226,10 @@ static ggml_backend_buffer_type_t ggml_backend_openvino_device_get_host_buffer_t
return ggml_backend_openvino_host_buffer_type();
}
static ggml_backend_buffer_t ggml_backend_openvino_device_buffer_from_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
static ggml_backend_buffer_t ggml_backend_openvino_device_buffer_from_ptr(ggml_backend_dev_t dev,
void * ptr,
size_t size,
size_t max_tensor_size) {
GGML_UNUSED(dev);
GGML_UNUSED(ptr);
GGML_UNUSED(size);
@ -231,7 +237,10 @@ static ggml_backend_buffer_t ggml_backend_openvino_device_buffer_from_ptr(ggml_b
return nullptr;
}
static ggml_backend_buffer_t ggml_backend_openvino_device_buffer_from_host_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
static ggml_backend_buffer_t ggml_backend_openvino_device_buffer_from_host_ptr(ggml_backend_dev_t dev,
void * ptr,
size_t size,
size_t max_tensor_size) {
GGML_UNUSED(dev);
GGML_UNUSED(ptr);
GGML_UNUSED(size);
@ -239,7 +248,7 @@ static ggml_backend_buffer_t ggml_backend_openvino_device_buffer_from_host_ptr(g
return nullptr;
}
static bool is_op_unsupported_case(const ggml_tensor* op) {
static bool is_op_unsupported_case(const ggml_tensor * op) {
switch (op->op) {
case GGML_OP_SOFT_MAX: {
if (op->src[2] != nullptr) {
@ -248,9 +257,9 @@ static bool is_op_unsupported_case(const ggml_tensor* op) {
}
float scale = 1.0f;
float max_bias = 0.0f;
const auto* op_params = op->op_params;
memcpy(&scale, (const float*) op_params + 0, sizeof(float));
memcpy(&max_bias, (const float*) op_params + 1, sizeof(float));
const auto * op_params = op->op_params;
memcpy(&scale, (const float *) op_params + 0, sizeof(float));
memcpy(&max_bias, (const float *) op_params + 1, sizeof(float));
if (max_bias > 0) {
GGML_LOG_WARN("OpenVINO backend does not support SOFT_MAX with max_bias > 0\n");
return true;
@ -265,10 +274,10 @@ static bool is_op_unsupported_case(const ggml_tensor* op) {
float scale = 1.0f;
float max_bias = 0.0f;
float logit_softcap = 0.0f;
const auto* op_params = op->op_params;
memcpy(&scale, (const float*) op_params + 0, sizeof(float));
memcpy(&max_bias, (const float*) op_params + 1, sizeof(float));
memcpy(&logit_softcap, (const float*) op_params + 2, sizeof(float));
const auto * op_params = op->op_params;
memcpy(&scale, (const float *) op_params + 0, sizeof(float));
memcpy(&max_bias, (const float *) op_params + 1, sizeof(float));
memcpy(&logit_softcap, (const float *) op_params + 2, sizeof(float));
if (max_bias > 0) {
GGML_LOG_WARN("OpenVINO backend does not support FLASH_ATTN_EXT with max_bias > 0\n");
return true;
@ -303,7 +312,7 @@ static bool is_op_unsupported_case(const ggml_tensor* op) {
break;
}
case GGML_OP_ROPE: {
const int32_t* op_params = op->op_params;
const int32_t * op_params = op->op_params;
const int n_dims = op_params[1];
const int mode = op_params[2];
if (mode == GGML_ROPE_TYPE_MROPE || mode == GGML_ROPE_TYPE_VISION) {
@ -311,8 +320,7 @@ static bool is_op_unsupported_case(const ggml_tensor* op) {
return true;
}
if (n_dims != 0.0f && n_dims != op->src[0]->ne[0]) {
GGML_LOG_WARN("OpenVINO backend does not support ROPE with n_dims %d != src[0]->ne[0] %ld\n",
n_dims,
GGML_LOG_WARN("OpenVINO backend does not support ROPE with n_dims %d != src[0]->ne[0] %ld\n", n_dims,
op->src[0]->ne[0]);
return true;
}
@ -333,8 +341,7 @@ static bool is_op_unsupported_case(const ggml_tensor* op) {
GGML_LOG_WARN(
"OpenVINO backend does not support ROPE with src[0]->view_src->ne[1] %ld != src[0]->ne[2] "
"%ld\n",
op->src[0]->view_src->ne[1],
op->src[0]->ne[2]);
op->src[0]->view_src->ne[1], op->src[0]->ne[2]);
return true;
}
}
@ -346,39 +353,19 @@ static bool is_op_unsupported_case(const ggml_tensor* op) {
return false;
}
static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor* op) {
static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
GGML_ASSERT(dev->reg != nullptr);
static std::set<ggml_type> supported_types{GGML_TYPE_F32,
GGML_TYPE_F16,
GGML_TYPE_BF16,
GGML_TYPE_I64,
GGML_TYPE_I32,
GGML_TYPE_Q4_0,
GGML_TYPE_Q4_1,
GGML_TYPE_Q4_K,
GGML_TYPE_Q5_K,
GGML_TYPE_Q8_0,
GGML_TYPE_Q6_K};
static std::set<ggml_type> supported_types{GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_BF16, GGML_TYPE_I64,
GGML_TYPE_I32, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1, GGML_TYPE_Q4_K,
GGML_TYPE_Q5_K, GGML_TYPE_Q8_0, GGML_TYPE_Q6_K};
static const std::set<ggml_op> supported_ops{GGML_OP_NONE,
GGML_OP_ADD,
GGML_OP_MUL,
GGML_OP_MUL_MAT,
GGML_OP_VIEW,
GGML_OP_CONT,
GGML_OP_RESHAPE,
GGML_OP_PERMUTE,
GGML_OP_TRANSPOSE,
GGML_OP_GET_ROWS,
GGML_OP_ROPE,
GGML_OP_RMS_NORM,
GGML_OP_SCALE,
static const std::set<ggml_op> supported_ops{GGML_OP_NONE, GGML_OP_ADD, GGML_OP_MUL, GGML_OP_MUL_MAT, GGML_OP_VIEW,
GGML_OP_CONT, GGML_OP_RESHAPE, GGML_OP_PERMUTE, GGML_OP_TRANSPOSE,
GGML_OP_GET_ROWS, GGML_OP_ROPE, GGML_OP_RMS_NORM, GGML_OP_SCALE,
// softmax is not updated since it has been replaced by flash_attn_ext
// GGML_OP_SOFT_MAX,
GGML_OP_SET_ROWS,
GGML_OP_FLASH_ATTN_EXT,
GGML_OP_CPY};
GGML_OP_SET_ROWS, GGML_OP_FLASH_ATTN_EXT, GGML_OP_CPY};
static const std::set<ggml_unary_op> supported_unary_ops{
GGML_UNARY_OP_SILU,
};
@ -422,7 +409,7 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con
return false;
}
for (int i = 0; i < GGML_MAX_SRC; i++) {
auto* src = op->src[i];
auto * src = op->src[i];
if (src == nullptr) {
break;
}
@ -483,13 +470,13 @@ static size_t ggml_backend_openvino_reg_get_device_count(ggml_backend_reg_t reg)
GGML_UNUSED(reg);
// TODO
ggml_backend_openvino_reg_context * ctx = (ggml_backend_openvino_reg_context *)reg->context;
ggml_backend_openvino_reg_context * ctx = (ggml_backend_openvino_reg_context *) reg->context;
return ctx->devices.size();
}
static ggml_backend_dev_t ggml_backend_openvino_reg_get_device(ggml_backend_reg_t reg, size_t index) {
ggml_backend_openvino_reg_context * ctx = (ggml_backend_openvino_reg_context *)reg->context;
ggml_backend_openvino_reg_context * ctx = (ggml_backend_openvino_reg_context *) reg->context;
GGML_ASSERT(index < ctx->devices.size());
return ctx->devices[index];
// GGML_ASSERT(index == 0);
@ -509,7 +496,7 @@ static ggml_backend_dev_t ggml_backend_openvino_reg_get_device(ggml_backend_reg_
static void * ggml_backend_openvino_get_proc_address(ggml_backend_reg_t reg, const char * name) {
GGML_UNUSED(reg);
if (strcmp(name, "ggml_backend_split_buffer_type") == 0) {
return (void *)ggml_backend_openvino_split_buffer_type;
return (void *) ggml_backend_openvino_split_buffer_type;
}
// if (strcmp(name, "ggml_backend_register_host_buffer") == 0) {
// return (void *)ggml_backend_openvino_register_host_buffer;
@ -565,17 +552,16 @@ GGML_BACKEND_API ggml_backend_reg_t ggml_backend_openvino_reg(void) {
// ggml_openvino_set_device(i);
dev_ctx->description = ov::get_openvino_version().description;
ggml_backend_dev_t dev = new ggml_backend_device {
/* .interface = */ ggml_backend_openvino_device_interface,
/* .reg = */ &reg,
/* .context = */ dev_ctx
};
ggml_backend_dev_t dev =
new ggml_backend_device{/* .interface = */ ggml_backend_openvino_device_interface,
/* .reg = */ &reg,
/* .context = */ dev_ctx};
ctx->devices.push_back(dev);
}
reg = ggml_backend_reg{ /* .api_version = */ GGML_BACKEND_API_VERSION,
/* .iface = */ ggml_backend_openvino_reg_interface,
/* .context = */ ctx };
reg = ggml_backend_reg{/* .api_version = */ GGML_BACKEND_API_VERSION,
/* .iface = */ ggml_backend_openvino_reg_interface,
/* .context = */ ctx};
}
initialized = true;

View File

@ -1,5 +1,9 @@
#include "ggml-quants.hpp"
#include "ggml-common.h"
#include "ggml-impl.h"
#include "ggml.h"
#include <algorithm>
#include <cassert>
#include <cmath>
@ -24,11 +28,7 @@
#include <string>
#include <vector>
#include "ggml-common.h"
#include "ggml-impl.h"
#include "ggml.h"
void unpack_32_4(const uint8_t* data, uint8_t* dst) {
void unpack_32_4(const uint8_t * data, uint8_t * dst) {
std::fill_n(dst, 16, 0);
for (int j = 0; j < 16; ++j) {
uint8_t x = (data[j] & 0x0F);
@ -44,18 +44,19 @@ void unpack_32_4(const uint8_t* data, uint8_t* dst) {
// Extracts (weight, scales, biases) from Q4_0 tensors.
// Data layout is: |16 bit scale|32 x 4bit weights|.
void extract_q4_0_data(const ggml_tensor* tensor,
ov::Tensor& weights_arr,
ov::Tensor& scales_arr,
ov::Tensor& biases_arr) {
void extract_q4_0_data(const ggml_tensor * tensor,
ov::Tensor & weights_arr,
ov::Tensor & scales_arr,
ov::Tensor & biases_arr) {
const uint64_t bytes_per_block = 18; // 2 bytes scale, 32x0.5 byte weights
auto* data = static_cast<uint8_t*>(tensor->data);
auto* weights = static_cast<uint8_t*>(weights_arr.data());
auto* scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
auto* biases = biases_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
auto * data = static_cast<uint8_t *>(tensor->data);
auto * weights = static_cast<uint8_t *>(weights_arr.data());
auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
auto * biases = biases_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
ov::parallel_for(scales_arr.get_size(), [&](size_t i) {
scales[i] = ov::float16::from_bits(*((uint16_t*)(data + i * bytes_per_block)));
scales[i] = ov::float16::from_bits(*((uint16_t *) (data + i * bytes_per_block)));
biases[i] = ov::float16(-8.f * static_cast<float>(scales[i]));
unpack_32_4(data + i * bytes_per_block + 2, weights + i * 16);
});
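The extraction above is untouched by this commit, but for reference: a Q4_0 block stores an fp16 scale d followed by 32 unsigned 4-bit quants, and a weight is recovered as d * (q - 8), which the code expresses as scale * q + bias with bias = -8 * scale. A standalone sketch of that identity (scale already decoded to float; not part of the diff):

#include <cstdint>

// Dequantize one Q4_0 block from its 16 nibble-packed bytes and decoded scale d.
// Element j lives in the low nibble of byte j, element j + 16 in the high nibble,
// matching unpack_32_4 above.
void dequant_q4_0_block(const uint8_t * qs, float d, float * out /* 32 floats */) {
    const float bias = -8.0f * d;  // same per-block bias the extractor stores
    for (int j = 0; j < 16; ++j) {
        out[j]      = d * (qs[j] & 0x0F) + bias;  // == d * (q - 8)
        out[j + 16] = d * (qs[j] >> 4) + bias;
    }
}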
@ -63,38 +64,40 @@ void extract_q4_0_data(const ggml_tensor* tensor,
// Extracts (weight, scales, biases) from Q4_1 tensors.
// Data layout is: |16 bit scale|16 bit bias|32 x 4bit weights|.
void extract_q4_1_data(const ggml_tensor* tensor,
ov::Tensor& weights_arr,
ov::Tensor& scales_arr,
ov::Tensor& biases_arr) {
void extract_q4_1_data(const ggml_tensor * tensor,
ov::Tensor & weights_arr,
ov::Tensor & scales_arr,
ov::Tensor & biases_arr) {
const uint64_t bytes_per_block = 20; // 2 bytes scale, 2 bytes bias, 32x0.5 byte weights
auto* data = static_cast<uint8_t*>(tensor->data);
auto* weights = static_cast<uint8_t*>(weights_arr.data());
auto* scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
auto* biases = biases_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
auto * data = static_cast<uint8_t *>(tensor->data);
auto * weights = static_cast<uint8_t *>(weights_arr.data());
auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
auto * biases = biases_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
ov::parallel_for(scales_arr.get_size(), [&](size_t i) {
scales[i] = ov::float16::from_bits(*((uint16_t*)(data + i * bytes_per_block)));
biases[i] = ov::float16::from_bits(*((uint16_t*)(data + i * bytes_per_block + 2)));
scales[i] = ov::float16::from_bits(*((uint16_t *) (data + i * bytes_per_block)));
biases[i] = ov::float16::from_bits(*((uint16_t *) (data + i * bytes_per_block + 2)));
unpack_32_4(data + i * bytes_per_block + 4, weights + i * 16);
});
}
// Extracts (weight, scales, biases) from Q8_0 tensors.
// Data layout is: |16 bit scale|32 x 8bit weights|.
void extract_q8_0_data(const ggml_tensor* tensor,
ov::Tensor& weights_arr,
ov::Tensor& scales_arr,
ov::Tensor& biases_arr) {
void extract_q8_0_data(const ggml_tensor * tensor,
ov::Tensor & weights_arr,
ov::Tensor & scales_arr,
ov::Tensor & biases_arr) {
const uint64_t weights_per_block = 32;
const uint64_t bytes_per_block = 34; // 2 bytes scale, 32x1 byte weights
auto* data = static_cast<uint8_t*>(tensor->data);
auto* weights = static_cast<uint8_t*>(weights_arr.data());
auto* scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
auto* biases = biases_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
auto * data = static_cast<uint8_t *>(tensor->data);
auto * weights = static_cast<uint8_t *>(weights_arr.data());
auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
auto * biases = biases_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
ov::parallel_for(scales_arr.get_size(), [&](size_t i) {
uint8_t* block_data = data + i * bytes_per_block;
scales[i] = ov::float16::from_bits(*(uint16_t*) block_data);
uint8_t * block_data = data + i * bytes_per_block;
scales[i] = ov::float16::from_bits(*(uint16_t *) block_data);
biases[i] = ov::float16(-128.f * static_cast<float>(scales[i]));
for (size_t j = 0; j < weights_per_block; ++j) {
uint8_t x = block_data[j + 2]; // j+2 to skip the scale bytes.
@ -105,7 +108,7 @@ void extract_q8_0_data(const ggml_tensor* tensor,
});
}
void unpack_256_4(const uint8_t* data, uint8_t* dst) {
void unpack_256_4(const uint8_t * data, uint8_t * dst) {
// Initialize the output array with zeros
std::fill_n(dst, 128, 0);
@ -123,26 +126,27 @@ void unpack_256_4(const uint8_t* data, uint8_t* dst) {
}
}
void extract_q4_k_data(const ggml_tensor* tensor,
ov::Tensor& weights_arr,
ov::Tensor& scales_arr,
ov::Tensor& biases_arr) {
void extract_q4_k_data(const ggml_tensor * tensor,
ov::Tensor & weights_arr,
ov::Tensor & scales_arr,
ov::Tensor & biases_arr) {
const uint64_t bytes_per_block = 2 + 2 + 12 + 128;
const uint64_t n_super_block = tensor->nb[3] / bytes_per_block;
auto* data = static_cast<uint8_t*>(tensor->data);
auto* weights = static_cast<uint8_t*>(weights_arr.data());
auto* scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
auto* biases = biases_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
auto * data = static_cast<uint8_t *>(tensor->data);
auto * weights = static_cast<uint8_t *>(weights_arr.data());
auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
auto * biases = biases_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
ov::parallel_for(n_super_block, [&](size_t i) {
uint8_t* block_data = data + i * bytes_per_block;
uint8_t * block_data = data + i * bytes_per_block;
// Extract scale factors and offsets
float scale_scales = static_cast<float>(ov::float16::from_bits(*((uint16_t*)block_data)));
float scale_biases = static_cast<float>(ov::float16::from_bits(*((uint16_t*)block_data + 1)));
float scale_scales = static_cast<float>(ov::float16::from_bits(*((uint16_t *) block_data)));
float scale_biases = static_cast<float>(ov::float16::from_bits(*((uint16_t *) block_data + 1)));
// Extract qs1 and qs2
uint8_t* qs1 = block_data + 4;
uint8_t * qs1 = block_data + 4;
// uint8_t* qs2 = block_data + 16;
scales[i * 8] = ov::float16(scale_scales * static_cast<float>((*(qs1) & 0b111111)));
@ -174,31 +178,32 @@ void extract_q4_k_data(const ggml_tensor* tensor,
});
}
void extract_q6_k_data(const ggml_tensor* tensor,
ov::Tensor& weights_arr,
ov::Tensor& scales_arr,
ov::Tensor& biases_arr) {
void extract_q6_k_data(const ggml_tensor * tensor,
ov::Tensor & weights_arr,
ov::Tensor & scales_arr,
ov::Tensor & biases_arr) {
const uint64_t bytes_per_block = 128 + 64 + 16 + 2;
const uint64_t n_super_block = tensor->nb[3] / bytes_per_block;
auto* data = static_cast<uint8_t*>(tensor->data);
auto* weights = static_cast<uint8_t*>(weights_arr.data());
auto* scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
auto* biases = biases_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
auto * data = static_cast<uint8_t *>(tensor->data);
auto * weights = static_cast<uint8_t *>(weights_arr.data());
auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
auto * biases = biases_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
ov::parallel_for(n_super_block, [&](size_t i) {
uint8_t* block_data = data + i * bytes_per_block;
uint8_t * block_data = data + i * bytes_per_block;
float scale_factor =
static_cast<float>(ov::float16::from_bits(*((uint16_t*) block_data + 104))); // (128+64+16)/2
static_cast<float>(ov::float16::from_bits(*((uint16_t *) block_data + 104))); // (128+64+16)/2
for (size_t j = 0; j < 16; j++) {
scales[j + i * 16] =
ov::float16(scale_factor * static_cast<float>(*((int8_t*) (block_data + 128 + 64 + j))));
ov::float16(scale_factor * static_cast<float>(*((int8_t *) (block_data + 128 + 64 + j))));
biases[j + i * 16] = ov::float16(-32.f * static_cast<float>(scales[j + i * 16]));
}
uint8_t* ql = block_data;
uint8_t* qh = block_data + 128;
uint8_t * ql = block_data;
uint8_t * qh = block_data + 128;
for (int64_t j = 0; j < 32; ++j) {
weights[i * 256 + j] = (ql[j] & 0xF) | (((qh[j] >> 0) & 3) << 4);
@ -213,7 +218,7 @@ void extract_q6_k_data(const ggml_tensor* tensor,
});
}
static inline void get_scale_min_k4(int j, const uint8_t* q, uint8_t* d, uint8_t* m) {
static inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t * d, uint8_t * m) {
if (j < 4) {
*d = q[j] & 63;
*m = q[j + 4] & 63;
@ -223,24 +228,27 @@ static inline void get_scale_min_k4(int j, const uint8_t* q, uint8_t* d, uint8_t
}
}
void extract_q5_k_data(const ggml_tensor* tensor, ov::Tensor& weights_arr, ov::Tensor& scales_arr,
ov::Tensor& biases_arr) {
void extract_q5_k_data(const ggml_tensor * tensor,
ov::Tensor & weights_arr,
ov::Tensor & scales_arr,
ov::Tensor & biases_arr) {
const uint64_t bytes_per_block = 4 + 12 + 32 + 128;
const uint64_t n_super_block = tensor->nb[3] / bytes_per_block;
auto* data = static_cast<uint8_t*>(tensor->data);
auto* weights = static_cast<uint8_t*>(weights_arr.data());
auto* scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
auto* biases = biases_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
auto * data = static_cast<uint8_t *>(tensor->data);
auto * weights = static_cast<uint8_t *>(weights_arr.data());
auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
auto * biases = biases_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
ov::parallel_for(n_super_block, [&](size_t i) {
uint8_t* block_data = data + i * bytes_per_block;
uint8_t * block_data = data + i * bytes_per_block;
const float d = static_cast<float>(ov::float16::from_bits(*((uint16_t*) block_data)));
const float min = static_cast<float>(ov::float16::from_bits(*((uint16_t*) block_data + 1)));
const float d = static_cast<float>(ov::float16::from_bits(*((uint16_t *) block_data)));
const float min = static_cast<float>(ov::float16::from_bits(*((uint16_t *) block_data + 1)));
const uint8_t* scales_data = block_data + 4; // 12 bytes of scales
const uint8_t* qh = block_data + 4 + 12; // 32 bytes of high bits
const uint8_t* ql = block_data + 4 + 12 + 32; // 128 bytes of low bits
const uint8_t * scales_data = block_data + 4; // 12 bytes of scales
const uint8_t * qh = block_data + 4 + 12; // 32 bytes of high bits
const uint8_t * ql = block_data + 4 + 12 + 32; // 128 bytes of low bits
int is = 0;
uint8_t u1 = 1;
@ -286,7 +294,10 @@ void extract_q5_k_data(const ggml_tensor* tensor, ov::Tensor& weights_arr, ov::T
// TODO Reorder for make_intX_weights
ov::Output<ov::Node> make_int8_weights(ov::Tensor& weight, ov::Tensor& scales, ov::Tensor& biases, size_t group_size) {
ov::Output<ov::Node> make_int8_weights(ov::Tensor & weight,
ov::Tensor & scales,
ov::Tensor & biases,
size_t group_size) {
ov::Shape orig_shape = weight.get_shape();
// Expand dimensions for scales and biases
@ -303,18 +314,19 @@ ov::Output<ov::Node> make_int8_weights(ov::Tensor& weight, ov::Tensor& scales, o
}
// Create graph nodes
auto weights_node = std::make_shared<ov::op::v0::Constant>(
ov::element::u8, packed_shape, static_cast<uint8_t*>(weight.data()), nullptr);
auto weights_node = std::make_shared<ov::op::v0::Constant>(ov::element::u8, packed_shape,
static_cast<uint8_t *>(weight.data()), nullptr);
weights_node->get_rt_info()["__gguf_tensor_holder"] = weight;
auto scales_f16 = std::make_shared<ov::op::v0::Constant>(scales);
ov::Tensor biases_u8(ov::element::u8, scale_shape);
// Calculate zero point
const ov::float16* bias_data = biases.data<ov::element_type_traits<ov::element::f16>::value_type>();
const ov::float16* scale_data = scales.data<ov::element_type_traits<ov::element::f16>::value_type>();
uint8_t* bias_u8_data = biases_u8.data<uint8_t>();
const ov::float16 * bias_data = biases.data<ov::element_type_traits<ov::element::f16>::value_type>();
const ov::float16 * scale_data = scales.data<ov::element_type_traits<ov::element::f16>::value_type>();
uint8_t * bias_u8_data = biases_u8.data<uint8_t>();
for (size_t i = 0; i < biases_u8.get_size(); ++i) {
bias_u8_data[i] = (uint8_t)std::round(-1.f * static_cast<float>(bias_data[i]) / static_cast<float>(scale_data[i]));
bias_u8_data[i] =
(uint8_t) std::round(-1.f * static_cast<float>(bias_data[i]) / static_cast<float>(scale_data[i]));
}
auto zero_point = std::make_shared<ov::op::v0::Constant>(biases_u8);
@@ -327,9 +339,7 @@ ov::Output<ov::Node> make_int8_weights(ov::Tensor& weight, ov::Tensor& scales, o
auto weights_f16 = std::make_shared<ov::op::v0::Convert>(weights_node, ov::element::f16);
auto zero_point_f16 = std::make_shared<ov::op::v0::Convert>(zero_point, ov::element::f16);
auto w_zp = std::make_shared<ov::op::v1::Subtract>(
weights_f16, zero_point_f16, ov::op::AutoBroadcastType::NUMPY
);
auto w_zp = std::make_shared<ov::op::v1::Subtract>(weights_f16, zero_point_f16, ov::op::AutoBroadcastType::NUMPY);
ov::Output<ov::Node> w_zp_s =
std::make_shared<ov::op::v1::Multiply>(w_zp, scales_f16, ov::op::AutoBroadcastType::NUMPY);
@@ -343,18 +353,17 @@ ov::Output<ov::Node> make_int8_weights(ov::Tensor& weight, ov::Tensor& scales, o
return std::make_shared<ov::op::v0::Convert>(w_zp_s, ov::element::f32);
}
ov::Output<ov::Node> make_int4_weights(ov::Tensor& weight, ov::Tensor& scales, ov::Tensor& biases, size_t group_size) {
ov::Output<ov::Node> make_int4_weights(ov::Tensor & weight,
ov::Tensor & scales,
ov::Tensor & biases,
size_t group_size) {
ov::Shape orig_weight_shape = weight.get_shape();
// Expand dimensions for scales and biases
ov::Shape scale_bias_shape = scales.get_shape();
// Create INT4 weight tensor
ov::Shape packed_shape = {
orig_weight_shape[0],
orig_weight_shape[1] / group_size,
group_size
};
ov::Shape packed_shape = {orig_weight_shape[0], orig_weight_shape[1] / group_size, group_size};
// Requantized channel-wise case
if (packed_shape[1] == 1) {
@@ -365,18 +374,21 @@ ov::Output<ov::Node> make_int4_weights(ov::Tensor& weight, ov::Tensor& scales, o
biases.set_shape(scale_bias_shape);
}
auto weights_node = std::make_shared<ov::op::v0::Constant>(ov::element::u4, packed_shape, static_cast<uint8_t*>(weight.data()), nullptr);
auto weights_node = std::make_shared<ov::op::v0::Constant>(ov::element::u4, packed_shape,
static_cast<uint8_t *>(weight.data()), nullptr);
weights_node->get_rt_info()["__gguf_tensor_holder"] = weight;
auto weights_f16 = std::make_shared<ov::op::v0::Convert>(weights_node, ov::element::f16);
// Pack zero points: two subsequent values into one
const ov::float16* bias_data = biases.data<ov::element_type_traits<ov::element::f16>::value_type>();
const ov::float16* scale_data = scales.data<ov::element_type_traits<ov::element::f16>::value_type>();
const ov::float16 * bias_data = biases.data<ov::element_type_traits<ov::element::f16>::value_type>();
const ov::float16 * scale_data = scales.data<ov::element_type_traits<ov::element::f16>::value_type>();
ov::Tensor zero_point_tensor(ov::element::u4, scale_bias_shape);
uint8_t* zero_point_data = static_cast<uint8_t*>(zero_point_tensor.data());
uint8_t * zero_point_data = static_cast<uint8_t *>(zero_point_tensor.data());
for (size_t i = 0; i < zero_point_tensor.get_byte_size(); ++i) {
uint8_t bias1 = (uint8_t)std::round(-1.f * static_cast<float>(bias_data[i * 2]) / static_cast<float>(scale_data[i * 2]));
uint8_t bias2 = (uint8_t)std::round(-1.f * static_cast<float>(bias_data[i * 2 + 1]) / static_cast<float>(scale_data[i * 2 + 1]));
uint8_t bias1 =
(uint8_t) std::round(-1.f * static_cast<float>(bias_data[i * 2]) / static_cast<float>(scale_data[i * 2]));
uint8_t bias2 = (uint8_t) std::round(-1.f * static_cast<float>(bias_data[i * 2 + 1]) /
static_cast<float>(scale_data[i * 2 + 1]));
zero_point_data[i] = (bias2 << 4) | (bias1 & 0x0F);
}
@@ -390,16 +402,15 @@ ov::Output<ov::Node> make_int4_weights(ov::Tensor& weight, ov::Tensor& scales, o
auto scales_f16 = std::make_shared<ov::op::v0::Constant>(scales);
// Perform dequantization
auto w_zp = std::make_shared<ov::op::v1::Subtract>(
weights_f16, zero_points_f16, ov::op::AutoBroadcastType::NUMPY);
auto w_zp = std::make_shared<ov::op::v1::Subtract>(weights_f16, zero_points_f16, ov::op::AutoBroadcastType::NUMPY);
ov::Output<ov::Node> w_zp_s =
std::make_shared<ov::op::v1::Multiply>(w_zp, scales_f16, ov::op::AutoBroadcastType::NUMPY);
if (packed_shape.size() != 2) {
// If not requantized channel-wise case, reshape back to original shape
auto final_shape = std::make_shared<ov::op::v0::Constant>(
ov::element::i64, ov::Shape{orig_weight_shape.size()}, orig_weight_shape);
auto final_shape = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{orig_weight_shape.size()},
orig_weight_shape);
w_zp_s = std::make_shared<ov::op::v1::Reshape>(w_zp_s, final_shape, false);
}
@@ -407,7 +418,7 @@ ov::Output<ov::Node> make_int4_weights(ov::Tensor& weight, ov::Tensor& scales, o
return std::make_shared<ov::op::v0::Convert>(w_zp_s, ov::element::f32);
}
std::shared_ptr<ov::Node> requantize(const ggml_tensor* tensor, ExtraQuantType requant_type) {
std::shared_ptr<ov::Node> requantize(const ggml_tensor * tensor, ExtraQuantType requant_type) {
std::vector<float> weights_f32(tensor->ne[0] * tensor->ne[1]);
ggml_get_type_traits(tensor->type)->to_float(tensor->data, weights_f32.data(), ggml_nelements(tensor));
@@ -459,14 +470,18 @@ std::shared_ptr<ov::Node> requantize(const ggml_tensor* tensor, ExtraQuantType r
return weight_node;
}
void quantize_q4_0(const float* x, ov::Tensor& weights_arr, ov::Tensor& scales_arr, ov::Tensor& biases_arr, int64_t k,
void quantize_q4_0(const float * x,
ov::Tensor & weights_arr,
ov::Tensor & scales_arr,
ov::Tensor & biases_arr,
int64_t k,
int64_t qk) {
assert(k % qk == 0);
const int nb = k / qk;
auto* weights = static_cast<uint8_t*>(weights_arr.data());
auto* scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
auto* biases = biases_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
auto * weights = static_cast<uint8_t *>(weights_arr.data());
auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
auto * biases = biases_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
for (int i = 0; i < nb; i++) {
float amax = 0.0f; // absolute max
float max = 0.0f;
@@ -503,14 +518,18 @@ void quantize_q4_0(const float* x, ov::Tensor& weights_arr, ov::Tensor& scales_a
}
}
void quantize_q8_0(const float* x, ov::Tensor& weights_arr, ov::Tensor& scales_arr, ov::Tensor& biases_arr, int64_t k,
void quantize_q8_0(const float * x,
ov::Tensor & weights_arr,
ov::Tensor & scales_arr,
ov::Tensor & biases_arr,
int64_t k,
int64_t qk) {
assert(k % qk == 0);
const int nb = k / qk;
auto* weights = static_cast<uint8_t*>(weights_arr.data());
auto* scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
auto* biases = biases_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
auto * weights = static_cast<uint8_t *>(weights_arr.data());
auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
auto * biases = biases_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
for (int i = 0; i < nb; i++) {
float amax = 0.0f; // absolute max
@@ -534,14 +553,18 @@ void quantize_q8_0(const float* x, ov::Tensor& weights_arr, ov::Tensor& scales_a
}
}
void quantize_q8_1(const float* x, ov::Tensor& weights_arr, ov::Tensor& scales_arr, ov::Tensor& biases_arr, int64_t k,
void quantize_q8_1(const float * x,
ov::Tensor & weights_arr,
ov::Tensor & scales_arr,
ov::Tensor & biases_arr,
int64_t k,
int64_t qk) {
assert(k % qk == 0);
const int nb = k / qk;
auto* weights = static_cast<uint8_t*>(weights_arr.data());
auto* scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
auto* biases = biases_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
auto * weights = static_cast<uint8_t *>(weights_arr.data());
auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
auto * biases = biases_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
for (int i = 0; i < nb; i++) {
float min = std::numeric_limits<float>::max();
float max = std::numeric_limits<float>::lowest();

View File

@@ -10,11 +10,11 @@ namespace ggml {
FrontEnd::FrontEnd() {}
std::shared_ptr<Model> FrontEnd::convert(const InputModel::Ptr& model, bool naive) {
std::shared_ptr<Model> FrontEnd::convert(const InputModel::Ptr & model, bool naive) {
auto ggml_model = std::dynamic_pointer_cast<ggml::InputModel>(model);
FRONT_END_GENERAL_CHECK(ggml_model, "Invalid input model");
std::shared_ptr<Model> converted_model;
const auto& supported_ops = get_supported_ops();
const auto & supported_ops = get_supported_ops();
{
TranslateSession translate_session(model, supported_ops, naive);
converted_model = translate_session.get_converted_model();

View File

@@ -6,9 +6,9 @@ namespace ov {
namespace frontend {
namespace ggml {
InputModel::InputModel(const std::shared_ptr<GgmlDecoder>& gdecoder) : m_decoder(gdecoder) {}
InputModel::InputModel(const std::shared_ptr<GgmlDecoder> & gdecoder) : m_decoder(gdecoder) {}
const std::shared_ptr<GgmlDecoder>& InputModel::get_model_decoder() const {
const std::shared_ptr<GgmlDecoder> & InputModel::get_model_decoder() const {
return m_decoder;
}

View File

@@ -1,4 +1,8 @@
#include "../node_context.hpp"
#include "../op_table.hpp"
#include "../utils.hpp"
#include <climits>
#include <cstdint>
#include <memory>
@@ -6,16 +10,12 @@
#include <openvino/op/slice.hpp>
#include <vector>
#include "../node_context.hpp"
#include "../op_table.hpp"
#include "../utils.hpp"
namespace ov {
namespace frontend {
namespace ggml {
namespace op {
OutputVector translate_cont(const NodeContext& context) {
OutputVector translate_cont(const NodeContext & context) {
num_inputs_check(context, 1, 1);
int op_case = context.get_op_case();
@@ -29,9 +29,7 @@ OutputVector translate_cont(const NodeContext& context) {
// The input comes from a PERMUTE
dst_shape[1] = -1;
res = std::make_shared<ov::op::v1::Reshape>(
context.get_input(0),
ov::op::v0::Constant::create(ov::element::i64, {dst_shape.size()}, dst_shape),
false);
context.get_input(0), ov::op::v0::Constant::create(ov::element::i64, {dst_shape.size()}, dst_shape), false);
} else if (op_case == 2) {
// The input comes from a TRANSPOSE
return {context.get_input(0)};

View File

@@ -1,15 +1,16 @@
#include <memory>
#include <openvino/op/convert.hpp>
#include "../node_context.hpp"
#include "../op_table.hpp"
#include "../utils.hpp"
#include <memory>
#include <openvino/op/convert.hpp>
namespace ov {
namespace frontend {
namespace ggml {
namespace op {
OutputVector translate_cpy(const NodeContext& context) {
OutputVector translate_cpy(const NodeContext & context) {
auto res = std::make_shared<ov::op::v0::Convert>(context.get_input(0), context.get_output_type(0));
return rename_outputs_with_suffix({res}, context.get_name());
}

View File

@@ -1,3 +1,7 @@
#include "../node_context.hpp"
#include "../op_table.hpp"
#include "../utils.hpp"
#include <memory>
#include <openvino/op/broadcast.hpp>
#include <openvino/op/concat.hpp>
@@ -8,24 +12,20 @@
#include <openvino/op/unsqueeze.hpp>
#include <string>
#include "../node_context.hpp"
#include "../op_table.hpp"
#include "../utils.hpp"
namespace ov {
namespace frontend {
namespace ggml {
namespace op {
OutputVector translate_flash_attn_ext(const NodeContext& context) {
OutputVector translate_flash_attn_ext(const NodeContext & context) {
num_inputs_check(context, 4, 4);
auto q_f32 = context.get_input(0);
auto k = context.get_input(1);
auto v = context.get_input(2);
auto mask = context.get_input(3);
float* params = reinterpret_cast<float*>(context.get_output_op_params(0));
float scale = params[0];
float * params = reinterpret_cast<float *>(context.get_output_op_params(0));
float scale = params[0];
// float max_bias = params[1];
// float logit_softcap = params[2];
@@ -43,15 +43,14 @@ OutputVector translate_flash_attn_ext(const NodeContext& context) {
auto token_len = get_dimensions(q, {2});
auto kv_len = get_dimensions(k.get_node_shared_ptr(), {2});
auto zero_2d = ov::op::v0::Constant::create(ov::element::i64, {2}, {0,0});
auto one_2d = ov::op::v0::Constant::create(ov::element::i64, {2}, {1,1});
auto zero_2d = ov::op::v0::Constant::create(ov::element::i64, {2}, {0, 0});
auto one_2d = ov::op::v0::Constant::create(ov::element::i64, {2}, {1, 1});
auto zero_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
auto two_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {2});
auto axes = ov::op::v0::Constant::create(ov::element::i64, {2}, {1,2});
auto axes = ov::op::v0::Constant::create(ov::element::i64, {2}, {1, 2});
auto stop = std::make_shared<ov::op::v0::Concat>(ov::OutputVector{token_len, kv_len}, 0);
mask_sliced =
std::make_shared<ov::op::v8::Slice>(mask, zero_2d, stop, one_2d, axes);
mask_sliced = std::make_shared<ov::op::v8::Slice>(mask, zero_2d, stop, one_2d, axes);
mask_sliced = std::make_shared<ov::op::v0::Unsqueeze>(mask_sliced, zero_1d);
}
@@ -72,8 +71,8 @@ OutputVector translate_flash_attn_ext(const NodeContext& context) {
kv_unsqueezed = std::make_shared<ov::op::v0::Unsqueeze>(kv, unsqueeze_axes);
auto kv_last_two_dims = get_dimensions(kv.get_node_shared_ptr(), {1, 2});
kv_broadcast_shape =
std::make_shared<ov::op::v0::Concat>(ov::OutputVector{kv_batch_node, factor_node, kv_last_two_dims}, 0);
kv_broadcast_shape = std::make_shared<ov::op::v0::Concat>(
ov::OutputVector{kv_batch_node, factor_node, kv_last_two_dims}, 0);
new_kv_shape =
std::make_shared<ov::op::v0::Concat>(ov::OutputVector{q_batch_node, kv_last_two_dims}, 0);
} else {
@@ -82,8 +81,8 @@ OutputVector translate_flash_attn_ext(const NodeContext& context) {
kv_unsqueezed = std::make_shared<ov::op::v0::Unsqueeze>(kv, unsqueeze_axes);
auto kv_last_two_dims = get_dimensions(kv.get_node_shared_ptr(), {2, 3});
kv_broadcast_shape =
std::make_shared<ov::op::v0::Concat>(ov::OutputVector{one_1d, kv_batch_node, factor_node, kv_last_two_dims}, 0);
kv_broadcast_shape = std::make_shared<ov::op::v0::Concat>(
ov::OutputVector{one_1d, kv_batch_node, factor_node, kv_last_two_dims}, 0);
new_kv_shape =
std::make_shared<ov::op::v0::Concat>(ov::OutputVector{one_1d, q_batch_node, kv_last_two_dims}, 0);
}
@@ -105,8 +104,8 @@ OutputVector translate_flash_attn_ext(const NodeContext& context) {
res = std::make_shared<ov::op::v1::Transpose>(sdpa_f32,
ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 0, 2}));
} else {
res = std::make_shared<ov::op::v1::Transpose>(sdpa_f32,
ov::op::v0::Constant::create(ov::element::i64, {4}, {0, 2, 1, 3}));
res = std::make_shared<ov::op::v1::Transpose>(
sdpa_f32, ov::op::v0::Constant::create(ov::element::i64, {4}, {0, 2, 1, 3}));
}
return rename_outputs_with_suffix({res}, context.get_name());
}

View File

@@ -1,3 +1,7 @@
#include "../node_context.hpp"
#include "../op_table.hpp"
#include "../utils.hpp"
#include <openvino/core/node.hpp>
#include <openvino/core/node_output.hpp>
#include <openvino/op/constant.hpp>
@@ -5,16 +9,12 @@
#include <openvino/op/gather.hpp>
#include <openvino/op/squeeze.hpp>
#include "../node_context.hpp"
#include "../op_table.hpp"
#include "../utils.hpp"
namespace ov {
namespace frontend {
namespace ggml {
namespace op {
OutputVector translate_get_rows(const NodeContext& context) {
OutputVector translate_get_rows(const NodeContext & context) {
num_inputs_check(context, 2, 2);
int op_case = context.get_op_case();

View File

@@ -1,3 +1,7 @@
#include "../node_context.hpp"
#include "../op_table.hpp"
#include "../utils.hpp"
#include <memory>
#include <openvino/core/node_output.hpp>
#include <openvino/op/constant.hpp>
@@ -7,16 +11,12 @@
#include <openvino/op/slice.hpp>
#include <openvino/op/split.hpp>
#include "../node_context.hpp"
#include "../op_table.hpp"
#include "../utils.hpp"
namespace ov {
namespace frontend {
namespace ggml {
namespace op {
OutputVector translate_glu_geglu(const NodeContext& context) {
OutputVector translate_glu_geglu(const NodeContext & context) {
num_inputs_check(context, 1, 2);
ov::Output<ov::Node> src0;
@@ -32,7 +32,7 @@ OutputVector translate_glu_geglu(const NodeContext& context) {
src1 = split->output(1);
}
int32_t* params = context.get_output_op_params(0);
int32_t * params = context.get_output_op_params(0);
const int32_t swapped = params[1];
if (swapped) {
std::swap(src0, src1);

View File

@@ -1,3 +1,7 @@
#include "../node_context.hpp"
#include "../op_table.hpp"
#include "../utils.hpp"
#include <cstdint>
#include <memory>
#include <openvino/core/node_output.hpp>
@@ -7,16 +11,12 @@
#include <openvino/op/slice.hpp>
#include <openvino/op/split.hpp>
#include "../node_context.hpp"
#include "../op_table.hpp"
#include "../utils.hpp"
namespace ov {
namespace frontend {
namespace ggml {
namespace op {
OutputVector translate_glu_swiglu(const NodeContext& context) {
OutputVector translate_glu_swiglu(const NodeContext & context) {
num_inputs_check(context, 1, 2);
ov::Output<ov::Node> src0;
@@ -32,7 +32,7 @@ OutputVector translate_glu_swiglu(const NodeContext& context) {
src1 = split->output(1);
}
int32_t* params = context.get_output_op_params(0);
int32_t * params = context.get_output_op_params(0);
const int32_t swapped = params[1];
if (swapped) {
std::swap(src0, src1);

View File

@@ -1,3 +1,7 @@
#include "../node_context.hpp"
#include "../op_table.hpp"
#include "../utils.hpp"
#include <climits>
#include <cstdint>
#include <memory>
@@ -15,16 +19,12 @@
#include <openvino/op/util/op_types.hpp>
#include <vector>
#include "../node_context.hpp"
#include "../op_table.hpp"
#include "../utils.hpp"
namespace ov {
namespace frontend {
namespace ggml {
namespace op {
OutputVector translate_mulmat(const NodeContext& context) {
OutputVector translate_mulmat(const NodeContext & context) {
num_inputs_check(context, 2, 2);
int op_case = context.get_op_case();

View File

@@ -1,3 +1,7 @@
#include "../node_context.hpp"
#include "../op_table.hpp"
#include "../utils.hpp"
#include <climits>
#include <cstdint>
#include <memory>
@@ -9,16 +13,12 @@
#include <openvino/op/transpose.hpp>
#include <openvino/op/unsqueeze.hpp>
#include "../node_context.hpp"
#include "../op_table.hpp"
#include "../utils.hpp"
namespace ov {
namespace frontend {
namespace ggml {
namespace op {
OutputVector translate_permute(const NodeContext& context) {
OutputVector translate_permute(const NodeContext & context) {
num_inputs_check(context, 1, 1);
int op_case = context.get_op_case();
@@ -28,15 +28,15 @@ OutputVector translate_permute(const NodeContext& context) {
if (op_case == 1) {
if (context.is_static()) {
res = std::make_shared<ov::op::v1::Transpose>(context.get_input(0),
ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 0, 2}));
res = std::make_shared<ov::op::v1::Transpose>(
context.get_input(0), ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 0, 2}));
} else {
auto src = context.get_input(0);
if (src.get_partial_shape().rank() == 3) {
src = std::make_shared<ov::op::v0::Unsqueeze>(src, zero);
}
res = std::make_shared<ov::op::v1::Transpose>(src,
ov::op::v0::Constant::create(ov::element::i64, {4}, {0, 2, 1, 3}));
res = std::make_shared<ov::op::v1::Transpose>(
src, ov::op::v0::Constant::create(ov::element::i64, {4}, {0, 2, 1, 3}));
}
} else {
auto src = context.get_input(0);
@@ -47,7 +47,8 @@ OutputVector translate_permute(const NodeContext& context) {
std::vector<int64_t> src_shape(src_shape_.begin(), src_shape_.end());
auto src_reshaped = std::make_shared<ov::op::v1::Reshape>(
src,
ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector<int64_t>{-1, src_shape[1], src_shape[2]}),
ov::op::v0::Constant::create(ov::element::i64, {3},
std::vector<int64_t>{-1, src_shape[1], src_shape[2]}),
false);
res = std::make_shared<ov::op::v1::Transpose>(
src_reshaped, ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 0, 2}));
@@ -55,8 +56,8 @@ OutputVector translate_permute(const NodeContext& context) {
if (src.get_partial_shape().rank() == 3) {
src = std::make_shared<ov::op::v0::Unsqueeze>(src, zero);
}
res = std::make_shared<ov::op::v1::Transpose>(src,
ov::op::v0::Constant::create(ov::element::i64, {4}, {0, 2, 1, 3}));
res = std::make_shared<ov::op::v1::Transpose>(
src, ov::op::v0::Constant::create(ov::element::i64, {4}, {0, 2, 1, 3}));
}
}
return rename_outputs_with_suffix({res}, context.get_name());

View File

@@ -1,3 +1,7 @@
#include "../node_context.hpp"
#include "../op_table.hpp"
#include "../utils.hpp"
#include <cstdint>
#include <memory>
#include <openvino/core/node.hpp>
@@ -7,16 +11,12 @@
#include <openvino/op/reshape.hpp>
#include <vector>
#include "../node_context.hpp"
#include "../op_table.hpp"
#include "../utils.hpp"
namespace ov {
namespace frontend {
namespace ggml {
namespace op {
OutputVector translate_reshape(const NodeContext& context) {
OutputVector translate_reshape(const NodeContext & context) {
num_inputs_check(context, 1, 1);
if (context.get_input_shape(0) == context.get_output_shape(0)) {
return {context.get_input(0)};
@@ -29,15 +29,11 @@ OutputVector translate_reshape(const NodeContext& context) {
auto output_shape = context.get_output_shape(0).to_shape();
std::shared_ptr<ov::Node> new_shape_node;
if (op_case == 1) {
new_shape_node =
ov::op::v0::Constant::create(ov::element::i64,
{3},
std::vector<int64_t>{-1, (int64_t)output_shape[1], (int64_t)output_shape[2]});
new_shape_node = ov::op::v0::Constant::create(
ov::element::i64, {3}, std::vector<int64_t>{-1, (int64_t) output_shape[1], (int64_t) output_shape[2]});
} else if (op_case == 2) {
new_shape_node =
ov::op::v0::Constant::create(ov::element::i64,
{3},
std::vector<int64_t>{(int64_t)output_shape[0], -1, (int64_t)output_shape[2]});
new_shape_node = ov::op::v0::Constant::create(
ov::element::i64, {3}, std::vector<int64_t>{(int64_t) output_shape[0], -1, (int64_t) output_shape[2]});
} else if (op_case == 3) {
new_shape_node =
ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector<int64_t>{(int64_t) output_shape[0], -1, 1});

View File

@@ -1,3 +1,7 @@
#include "../node_context.hpp"
#include "../op_table.hpp"
#include "../utils.hpp"
#include <memory>
#include <openvino/op/add.hpp>
#include <openvino/op/constant.hpp>
@@ -7,16 +11,12 @@
#include <openvino/op/reduce_mean.hpp>
#include <openvino/op/sqrt.hpp>
#include "../node_context.hpp"
#include "../op_table.hpp"
#include "../utils.hpp"
namespace ov {
namespace frontend {
namespace ggml {
namespace op {
OutputVector translate_rms_norm(const NodeContext& context) {
OutputVector translate_rms_norm(const NodeContext & context) {
num_inputs_check(context, 1, 1);
auto input_node = context.get_input(0);

View File

@@ -1,3 +1,7 @@
#include "../node_context.hpp"
#include "../op_table.hpp"
#include "../utils.hpp"
#include <cstdint>
#include <memory>
#include <openvino/core/node.hpp>
@@ -14,16 +18,12 @@
#include <openvino/op/unsqueeze.hpp>
#include <vector>
#include "../node_context.hpp"
#include "../op_table.hpp"
#include "../utils.hpp"
namespace ov {
namespace frontend {
namespace ggml {
namespace op {
OutputVector translate_rope(const NodeContext& context) {
OutputVector translate_rope(const NodeContext & context) {
num_inputs_check(context, 2, 3);
int op_case = context.get_op_case();
@@ -32,7 +32,7 @@ OutputVector translate_rope(const NodeContext& context) {
auto data_node = context.get_input(0).get_node_shared_ptr();
auto output_shape = context.get_output_shape(0).to_shape();
int32_t* op_params = context.get_output_op_params(0);
int32_t * op_params = context.get_output_op_params(0);
Output<Node> cos_theta_node;
Output<Node> sin_theta_node;
@@ -85,7 +85,8 @@ OutputVector translate_rope(const NodeContext& context) {
auto stack = std::make_shared<ov::op::v0::Concat>(OutputVector{first_half, second_half}, 3);
res = std::make_shared<ov::op::v1::Reshape>(stack, std::make_shared<ov::op::v0::ShapeOf>(data_node), false);
if (!(context.is_static())) {
res = std::make_shared<ov::op::v0::Unsqueeze>(res, ov::op::v0::Constant::create(ov::element::i64, {1}, {0}));
res =
std::make_shared<ov::op::v0::Unsqueeze>(res, ov::op::v0::Constant::create(ov::element::i64, {1}, {0}));
}
} else if (mode == ROPE_TYPE_NEOX) {
auto data_split = std::make_shared<ov::op::v1::Split>(

View File

@@ -1,17 +1,17 @@
#include <openvino/op/constant.hpp>
#include <openvino/op/multiply.hpp>
#include <vector>
#include "../node_context.hpp"
#include "../op_table.hpp"
#include "../utils.hpp"
#include <openvino/op/constant.hpp>
#include <openvino/op/multiply.hpp>
#include <vector>
namespace ov {
namespace frontend {
namespace ggml {
namespace op {
OutputVector translate_scale(const NodeContext& context) {
OutputVector translate_scale(const NodeContext & context) {
num_inputs_check(context, 1, 1);
float scale;

View File

@@ -1,3 +1,7 @@
#include "../node_context.hpp"
#include "../op_table.hpp"
#include "../utils.hpp"
#include <cassert>
#include <cstdint>
#include <memory>
@@ -15,16 +19,12 @@
#include <openvino/op/squeeze.hpp>
#include <openvino/op/transpose.hpp>
#include "../node_context.hpp"
#include "../op_table.hpp"
#include "../utils.hpp"
namespace ov {
namespace frontend {
namespace ggml {
namespace op {
OutputVector translate_set_rows(const NodeContext& context) {
OutputVector translate_set_rows(const NodeContext & context) {
num_inputs_check(context, 3, 3);
auto data = context.get_input(0);
@@ -44,8 +44,7 @@ OutputVector translate_set_rows(const NodeContext& context) {
Output<Node> res;
if (context.is_static()) {
auto dst_reshaped = std::make_shared<ov::op::v1::Reshape>(
dst,
ov::op::v0::Constant::create(ov::element::i64, {2}, {(int64_t) dst_shape[1], (int64_t) dst_shape[2]}),
dst, ov::op::v0::Constant::create(ov::element::i64, {2}, {(int64_t) dst_shape[1], (int64_t) dst_shape[2]}),
false);
auto indices_reshaped =
std::make_shared<ov::op::v0::Squeeze>(indices, ov::op::v0::Constant::create(ov::element::i64, {2}, {0, 1}));
@@ -55,7 +54,8 @@ OutputVector translate_set_rows(const NodeContext& context) {
auto updated = std::make_shared<ov::op::v3::ScatterUpdate>(dst_reshaped, indices_reshaped, data_reshaped, zero);
res = std::make_shared<ov::op::v1::Reshape>(updated, std::make_shared<ov::op::v0::ShapeOf>(dst), false);
} else {
assert(dst.get_partial_shape().rank() == 4 && dst.get_partial_shape()[2].is_static() && dst.get_partial_shape()[3].is_static());
assert(dst.get_partial_shape().rank() == 4 && dst.get_partial_shape()[2].is_static() &&
dst.get_partial_shape()[3].is_static());
int64_t dim2 = dst.get_partial_shape()[2].get_length();
int64_t dim3 = dst.get_partial_shape()[3].get_length();
data = std::make_shared<ov::op::v1::Reshape>(

View File

@@ -1,3 +1,7 @@
#include "../node_context.hpp"
#include "../op_table.hpp"
#include "../utils.hpp"
#include <climits>
#include <cstdint>
#include <memory>
@@ -13,16 +17,12 @@
#include <openvino/op/softmax.hpp>
#include <vector>
#include "../node_context.hpp"
#include "../op_table.hpp"
#include "../utils.hpp"
namespace ov {
namespace frontend {
namespace ggml {
namespace op {
OutputVector translate_soft_max(const NodeContext& context) {
OutputVector translate_soft_max(const NodeContext & context) {
num_inputs_check(context, 1, 2);
auto input_node = context.get_input(0).get_node_shared_ptr();
@@ -30,9 +30,9 @@ OutputVector translate_soft_max(const NodeContext& context) {
float scale = 1.0f;
float max_bias = 0.0f;
auto* op_params = context.get_output_op_params(0);
memcpy(&scale, (float*) op_params + 0, sizeof(float));
memcpy(&max_bias, (float*) op_params + 1, sizeof(float));
auto * op_params = context.get_output_op_params(0);
memcpy(&scale, (float *) op_params + 0, sizeof(float));
memcpy(&max_bias, (float *) op_params + 1, sizeof(float));
auto src0_shape = context.get_input_shape(0).get_shape();
const uint32_t h = src0_shape[2];
const uint32_t n_head = src0_shape[0];

View File

@@ -1,15 +1,15 @@
#include <openvino/op/transpose.hpp>
#include "../node_context.hpp"
#include "../op_table.hpp"
#include "../utils.hpp"
#include <openvino/op/transpose.hpp>
namespace ov {
namespace frontend {
namespace ggml {
namespace op {
OutputVector translate_transpose(const NodeContext& context) {
OutputVector translate_transpose(const NodeContext & context) {
num_inputs_check(context, 1, 1);
auto res = std::make_shared<ov::op::v1::Transpose>(context.get_input(0),

View File

@@ -1,17 +1,17 @@
#include <openvino/core/node_output.hpp>
#include <openvino/op/multiply.hpp>
#include <openvino/op/sigmoid.hpp>
#include "../node_context.hpp"
#include "../op_table.hpp"
#include "../utils.hpp"
#include <openvino/core/node_output.hpp>
#include <openvino/op/multiply.hpp>
#include <openvino/op/sigmoid.hpp>
namespace ov {
namespace frontend {
namespace ggml {
namespace op {
OutputVector translate_unary_silu(const NodeContext& context) {
OutputVector translate_unary_silu(const NodeContext & context) {
num_inputs_check(context, 1, 1);
auto input = context.get_input(0);

View File

@@ -6,12 +6,13 @@ namespace frontend {
namespace ggml {
namespace op {
OutputVector translate_view(const NodeContext& context) {
OutputVector translate_view(const NodeContext & context) {
num_inputs_check(context, 1, 1);
if (context.get_op_case() == 2) {
auto dst_shape = context.get_output_shape(0).to_shape();
return rename_outputs_with_suffix({process_view_input(context, 0, dst_shape[1] * dst_shape[2])}, context.get_name());
return rename_outputs_with_suffix({process_view_input(context, 0, dst_shape[1] * dst_shape[2])},
context.get_name());
}
return {context.get_input(0)};
}

View File

@@ -1,5 +1,7 @@
#include "op_table.hpp"
#include "utils.hpp"
#include <openvino/op/add.hpp>
#include <openvino/op/divide.hpp>
#include <openvino/op/gather.hpp>
@@ -7,8 +9,6 @@
#include <openvino/op/multiply.hpp>
#include <openvino/op/subtract.hpp>
#include "utils.hpp"
namespace ov {
namespace frontend {
namespace ggml {

View File

@@ -1,15 +1,15 @@
#include "eliminate_zp.hpp"
#include <openvino/core/graph_util.hpp>
#include <openvino/core/parallel.hpp>
#include <openvino/core/rt_info.hpp>
#include <openvino/pass/pattern/op/label.hpp>
#include <openvino/pass/pattern/op/pattern.hpp>
#include <openvino/pass/pattern/op/wrap_type.hpp>
#include <openvino/op/constant.hpp>
#include <openvino/op/convert.hpp>
#include <openvino/op/multiply.hpp>
#include <openvino/op/subtract.hpp>
#include <openvino/core/parallel.hpp>
#include <openvino/pass/pattern/op/label.hpp>
#include <openvino/pass/pattern/op/pattern.hpp>
#include <openvino/pass/pattern/op/wrap_type.hpp>
namespace ov {
namespace frontend {
@@ -35,13 +35,17 @@ EliminateZeroPoints::EliminateZeroPoints() {
auto m_scale = ov::pass::pattern::any_input();
auto m_multiply = ov::pass::pattern::wrap_type<ov::op::v1::Multiply>({m_scale, m_subtract});
const auto callback = [=](ov::pass::pattern::Matcher& m) {
const auto& pattern_map = m.get_pattern_value_map();
const auto callback = [=](ov::pass::pattern::Matcher & m) {
const auto & pattern_map = m.get_pattern_value_map();
auto multiply_node = std::dynamic_pointer_cast<ov::op::v1::Multiply>(pattern_map.at(m_multiply).get_node_shared_ptr());
auto subtract_node = std::dynamic_pointer_cast<ov::op::v1::Subtract>(pattern_map.at(m_subtract).get_node_shared_ptr());
auto data_constant = std::dynamic_pointer_cast<ov::op::v0::Constant>(pattern_map.at(m_data_constant).get_node_shared_ptr());
auto zp_constant = std::dynamic_pointer_cast<ov::op::v0::Constant>(pattern_map.at(m_zp_constant).get_node_shared_ptr());
auto multiply_node =
std::dynamic_pointer_cast<ov::op::v1::Multiply>(pattern_map.at(m_multiply).get_node_shared_ptr());
auto subtract_node =
std::dynamic_pointer_cast<ov::op::v1::Subtract>(pattern_map.at(m_subtract).get_node_shared_ptr());
auto data_constant =
std::dynamic_pointer_cast<ov::op::v0::Constant>(pattern_map.at(m_data_constant).get_node_shared_ptr());
auto zp_constant =
std::dynamic_pointer_cast<ov::op::v0::Constant>(pattern_map.at(m_zp_constant).get_node_shared_ptr());
if (!multiply_node || !subtract_node || !data_constant || !zp_constant) {
return false;
@@ -101,14 +105,16 @@ EliminateZeroPoints::EliminateZeroPoints() {
new_constant = std::make_shared<ov::op::v0::Constant>(target_type, data_shape, adjusted_values);
}
auto new_convert = std::make_shared<ov::op::v0::Convert>(new_constant, subtract_node->get_output_element_type(0));
auto new_convert =
std::make_shared<ov::op::v0::Convert>(new_constant, subtract_node->get_output_element_type(0));
ov::replace_node(subtract_node, new_convert);
return true;
};
register_matcher(std::make_shared<ov::pass::pattern::Matcher>(m_multiply, "ov::frontend::ggml::pass::EliminateZeroPoints"),
callback);
register_matcher(
std::make_shared<ov::pass::pattern::Matcher>(m_multiply, "ov::frontend::ggml::pass::EliminateZeroPoints"),
callback);
}
} // namespace pass

View File

@@ -33,8 +33,8 @@ FuseToSDPA::FuseToSDPA() {
const auto m_v = ov::pass::pattern::any_input();
const auto m_qkv = ov::pass::pattern::wrap_type<ov::op::v0::MatMul>({m_softmax_qk_f16, m_v});
const auto callback = [=](ov::pass::pattern::Matcher& m) {
auto& pattern_to_output = m.get_pattern_value_map();
const auto callback = [=](ov::pass::pattern::Matcher & m) {
auto & pattern_to_output = m.get_pattern_value_map();
auto k = pattern_to_output[m_k];
auto q = pattern_to_output[m_q];
auto v = pattern_to_output[m_v];

View File

@@ -1,5 +1,11 @@
#include "translate_session.hpp"
#include "ggml-openvino/openvino/node_context.hpp"
#include "ggml-openvino/openvino/utils.hpp"
#include "input_model.hpp"
#include "pass/eliminate_zp.hpp"
#include "pass/mark_decompression_convert_constant_folding.hpp"
#include <cstdint>
#include <cstdlib>
#include <map>
@@ -25,12 +31,6 @@
#include <openvino/pass/constant_folding.hpp>
#include <openvino/pass/make_stateful.hpp>
#include "ggml-openvino/openvino/node_context.hpp"
#include "ggml-openvino/openvino/utils.hpp"
#include "input_model.hpp"
#include "pass/eliminate_zp.hpp"
#include "pass/mark_decompression_convert_constant_folding.hpp"
namespace ov {
namespace frontend {
namespace ggml {
@@ -40,16 +40,17 @@ using namespace ov::op;
namespace {
ov::pass::MakeStateful::ParamResPairs get_kv_param_res_pairs(
const std::shared_ptr<ov::Model>& model, const std::map<std::string, std::string>& kv_param_res_names) {
const std::shared_ptr<ov::Model> & model,
const std::map<std::string, std::string> & kv_param_res_names) {
ov::pass::MakeStateful::ParamResPairs pairs;
const auto& params = model->get_parameters();
const auto& results = model->get_results();
const auto & params = model->get_parameters();
const auto & results = model->get_results();
for (const auto& param_res : kv_param_res_names) {
const auto& param_name = param_res.first;
const auto& res_name = param_res.second;
for (const auto & param_res : kv_param_res_names) {
const auto & param_name = param_res.first;
const auto & res_name = param_res.second;
auto param_it = std::find_if(params.begin(), params.end(), [&](const std::shared_ptr<v0::Parameter>& node) {
auto param_it = std::find_if(params.begin(), params.end(), [&](const std::shared_ptr<v0::Parameter> & node) {
return node->get_friendly_name() == param_name;
});
@@ -57,7 +58,7 @@ ov::pass::MakeStateful::ParamResPairs get_kv_param_res_pairs(
" is not associated with any of "
"Parameters in the network.");
auto res_it = std::find_if(results.begin(), results.end(), [&](const std::shared_ptr<v0::Result>& node) {
auto res_it = std::find_if(results.begin(), results.end(), [&](const std::shared_ptr<v0::Result> & node) {
return node->get_friendly_name() == res_name;
});
@@ -72,17 +73,17 @@ ov::pass::MakeStateful::ParamResPairs get_kv_param_res_pairs(
return pairs;
}
void add_token_len(TensorMap& tensor_map) {
void add_token_len(TensorMap & tensor_map) {
auto inp_tokens = tensor_map.at("inp_tokens").get_node_shared_ptr();
auto token_len = get_dimensions(inp_tokens, {2});
token_len->set_friendly_name("token_len");
tensor_map.insert({"token_len", token_len->output(0)});
}
void add_sliced_mask(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) {
void add_sliced_mask(TensorMap & tensor_map, GgmlDecoder & ggml_model_decoder) {
auto token_len = tensor_map.at("token_len").get_node_shared_ptr();
auto create_sliced_mask = [&](const std::string& mask_name, const std::string& sliced_name, bool is_static) {
auto create_sliced_mask = [&](const std::string & mask_name, const std::string & sliced_name, bool is_static) {
if (tensor_map.find(mask_name) != tensor_map.end()) {
auto mask = tensor_map.at(mask_name).get_node_shared_ptr();
std::shared_ptr<ov::Node> mask_sliced;
@@ -110,8 +111,7 @@ void add_sliced_mask(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) {
kv_len = std::make_shared<ov::op::v1::Add>(kv_len, one_1d);
auto stop = std::make_shared<ov::op::v0::Concat>(ov::OutputVector{token_len, kv_len}, 0);
mask_sliced =
std::make_shared<ov::op::v8::Slice>(mask, zero_2d, stop, one_2d, axes);
mask_sliced = std::make_shared<ov::op::v8::Slice>(mask, zero_2d, stop, one_2d, axes);
mask_sliced = std::make_shared<ov::op::v0::Unsqueeze>(mask_sliced, zero_1d);
mask_sliced = std::make_shared<ov::op::v0::Convert>(mask_sliced, ov::element::f16);
mask_sliced->set_friendly_name(sliced_name);
@@ -125,8 +125,8 @@ void add_sliced_mask(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) {
// create_sliced_mask("KQ_mask_swa", "KQ_mask_swa_sliced", ggml_model_decoder.is_static());
}
void add_rope_sin_cos(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) {
int32_t* rope_params = ggml_model_decoder.get_rope_params();
void add_rope_sin_cos(TensorMap & tensor_map, GgmlDecoder & ggml_model_decoder) {
int32_t * rope_params = ggml_model_decoder.get_rope_params();
auto inp_pos = tensor_map.at("inp_pos").get_node_shared_ptr();
std::shared_ptr<ov::Node> rope_freqs_weight;
if (tensor_map.find("rope_freqs_weight") != tensor_map.end()) {
@@ -144,7 +144,7 @@ void add_rope_sin_cos(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) {
}
// Create common patterns
void preprocess(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) {
void preprocess(TensorMap & tensor_map, GgmlDecoder & ggml_model_decoder) {
add_token_len(tensor_map);
add_sliced_mask(tensor_map, ggml_model_decoder);
add_rope_sin_cos(tensor_map, ggml_model_decoder);
@@ -152,8 +152,8 @@ void preprocess(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) {
} // namespace
TranslateSession::TranslateSession(const frontend::InputModel::Ptr& input_model,
const std::unordered_map<std::string, CreatorFunction>& translator_map,
TranslateSession::TranslateSession(const frontend::InputModel::Ptr & input_model,
const std::unordered_map<std::string, CreatorFunction> & translator_map,
bool naive) :
m_input_model(input_model),
m_translator_map(translator_map),
@@ -168,26 +168,26 @@ std::shared_ptr<Model> TranslateSession::get_converted_model() {
return m_ov_model;
}
std::shared_ptr<Model> TranslateSession::translate_graph(const frontend::InputModel::Ptr& input_model) {
std::shared_ptr<Model> TranslateSession::translate_graph(const frontend::InputModel::Ptr & input_model) {
ov::ParameterVector params;
ov::ResultVector results;
auto tensor_map = std::make_shared<TensorMap>();
std::shared_ptr<Model> resulting_model;
const auto& ggml_model = std::dynamic_pointer_cast<InputModel>(input_model);
const auto & ggml_model = std::dynamic_pointer_cast<InputModel>(input_model);
std::shared_ptr<GgmlDecoder> ggml_model_decoder = ggml_model->get_model_decoder();
for (const auto& it : ggml_model_decoder->get_model_inputs()) {
for (const auto & it : ggml_model_decoder->get_model_inputs()) {
params.push_back(std::dynamic_pointer_cast<ov::op::v0::Parameter>(it.second));
(*tensor_map)[it.first] = it.second;
}
for (const auto& it : ggml_model_decoder->get_model_extra_inputs()) {
for (const auto & it : ggml_model_decoder->get_model_extra_inputs()) {
params.push_back(std::dynamic_pointer_cast<ov::op::v0::Parameter>(it.second));
(*tensor_map)[it.first] = it.second;
}
for (const auto& it : ggml_model_decoder->get_model_weights()) {
for (const auto & it : ggml_model_decoder->get_model_weights()) {
(*tensor_map)[it.first] = it.second;
}
@@ -199,22 +199,15 @@ std::shared_ptr<Model> TranslateSession::translate_graph(const frontend::InputMo
ov::OutputVector converted_outputs;
auto it = m_translator_map.find(operation_type);
FRONT_END_OP_CONVERSION_CHECK(it != m_translator_map.end(),
"Translation for operation type ",
operation_type,
FRONT_END_OP_CONVERSION_CHECK(it != m_translator_map.end(), "Translation for operation type ", operation_type,
" is not implemented.");
NodeContext node_context(node, tensor_map, this);
converted_outputs = it->second(node_context);
const auto& node_output_names = node->get_output_names();
FRONT_END_OP_CONVERSION_CHECK(node_output_names.size() == converted_outputs.size(),
"Number of ",
operation_type,
" outputs greater than number of converted outputs, which are ",
node_output_names.size(),
" and ",
converted_outputs.size(),
" respectively.");
const auto & node_output_names = node->get_output_names();
FRONT_END_OP_CONVERSION_CHECK(node_output_names.size() == converted_outputs.size(), "Number of ",
operation_type, " outputs greater than number of converted outputs, which are ",
node_output_names.size(), " and ", converted_outputs.size(), " respectively.");
for (size_t i = 0; i < node_output_names.size(); ++i) {
auto output_name = node_output_names[i];
@@ -229,10 +222,9 @@ std::shared_ptr<Model> TranslateSession::translate_graph(const frontend::InputMo
}
ggml_model_decoder->visit_subgraph(node_visitor);
for (const auto& name : ggml_model_decoder->get_model_output_names()) {
for (const auto & name : ggml_model_decoder->get_model_output_names()) {
FRONT_END_GENERAL_CHECK(tensor_map->find(name) != tensor_map->end(),
"Output name not found in tensor map: ",
name);
"Output name not found in tensor map: ", name);
auto result = std::make_shared<v0::Result>(tensor_map->at(name));
result->set_friendly_name(name);
results.push_back(result);

View File

@@ -1,5 +1,7 @@
#include "utils.hpp"
#include "ggml-impl.h"
#include <cstddef>
#include <ctime>
#include <memory>
@@ -17,8 +19,6 @@
#include <openvino/op/transpose.hpp>
#include <string>
#include "ggml-impl.h"
namespace ov {
namespace frontend {
namespace ggml {
@@ -30,7 +30,7 @@ std::string getCurrentTime() {
return buf;
}
void num_inputs_check(const NodeContext& context, size_t min_inputs, size_t max_inputs) {
void num_inputs_check(const NodeContext & context, size_t min_inputs, size_t max_inputs) {
auto input_size = context.get_input_size();
FRONT_END_OP_CONVERSION_CHECK(input_size >= min_inputs, "Got less inputs than expected");
FRONT_END_OP_CONVERSION_CHECK(input_size <= max_inputs, "Got more inputs than expected");
@@ -48,20 +48,20 @@ int non_cont_dim(std::vector<size_t> ne, std::vector<size_t> nb) {
return 0;
}
std::shared_ptr<ov::Node> get_dimensions(const std::shared_ptr<ov::op::v3::ShapeOf>& shape,
const std::vector<int>& dims) {
std::shared_ptr<ov::Node> get_dimensions(const std::shared_ptr<ov::op::v3::ShapeOf> & shape,
const std::vector<int> & dims) {
using namespace ov::op;
const auto zero = v0::Constant::create(ov::element::i32, ov::Shape{}, {0});
const auto dims_const = v0::Constant::create(ov::element::i32, ov::Shape{dims.size()}, dims);
return std::make_shared<v8::Gather>(shape, dims_const, zero);
}
std::shared_ptr<ov::Node> get_dimensions(const std::shared_ptr<ov::Node>& node, const std::vector<int>& dims) {
std::shared_ptr<ov::Node> get_dimensions(const std::shared_ptr<ov::Node> & node, const std::vector<int> & dims) {
return get_dimensions(std::make_shared<ov::op::v3::ShapeOf>(node), dims);
}
OutputVector rename_outputs_with_suffix(const OutputVector& outputs, const std::string& suffix) {
for (const auto& output : outputs) {
OutputVector rename_outputs_with_suffix(const OutputVector & outputs, const std::string & suffix) {
for (const auto & output : outputs) {
auto node = output.get_node_shared_ptr();
std::string name = node->get_friendly_name();
name += "_";
@@ -111,7 +111,7 @@ void ggml_rope_yarn_corr_dims(int n_dims,
}
} // namespace
std::pair<ov::Output<Node>, ov::Output<Node>> make_sin_cos(int32_t* rope_params,
std::pair<ov::Output<Node>, ov::Output<Node>> make_sin_cos(int32_t * rope_params,
std::shared_ptr<ov::Node> inp_pos,
std::shared_ptr<ov::Node> rope_freqs_weight) {
inp_pos = std::make_shared<ov::op::v0::Convert>(inp_pos, ov::element::f32);
@@ -179,11 +179,11 @@ std::pair<ov::Output<Node>, ov::Output<Node>> make_sin_cos(int32_t* rope_params,
return std::make_pair(sin_theta, cos_theta);
}
ov::Output<ov::Node> process_view_input(const NodeContext& context, int input_index, int slice_len) {
ov::Output<ov::Node> process_view_input(const NodeContext & context, int input_index, int slice_len) {
// Only works for VIEW operations that slice at the lowest dimension
// If the VIEW also reshapes the result, `slice_len` should be provided
auto input = context.get_input(input_index);
int32_t* op_params = context.get_input_op_params(input_index);
int32_t * op_params = context.get_input_op_params(input_index);
auto src1_stride = context.get_input_stride(input_index);
int64_t split_addr = op_params[0] / src1_stride[2];

View File

@@ -1,5 +1,11 @@
#include "utils.h"
#include "ggml-impl.h"
#include "ggml-openvino/ggml-decoder.h"
#include "ggml.h"
#include "openvino/frontend.hpp"
#include "openvino/input_model.hpp"
#include <algorithm>
#include <cassert>
#include <cmath>
@@ -23,15 +29,9 @@
#include <unordered_map>
#include <vector>
#include "ggml-impl.h"
#include "ggml-openvino/ggml-decoder.h"
#include "ggml.h"
#include "openvino/frontend.hpp"
#include "openvino/input_model.hpp"
ov::Tensor convert_ggml_input_to_ov(std::shared_ptr<GgmlOvDecoder> ggml_decoder, const std::string& name) {
const auto* ggml_tensor = ggml_decoder->get_input_ggml_tensor(name);
auto* input_data = ggml_tensor->data;
ov::Tensor convert_ggml_input_to_ov(std::shared_ptr<GgmlOvDecoder> ggml_decoder, const std::string & name) {
const auto * ggml_tensor = ggml_decoder->get_input_ggml_tensor(name);
auto * input_data = ggml_tensor->data;
ov::Shape input_shape;
if (name.find("cache_k") == 0 || name.find("cache_v") == 0) {
input_shape = ggml_decoder->get_graph_input_shape(ggml_tensor).to_shape();
@@ -45,13 +45,14 @@ ov::Tensor convert_ggml_input_to_ov(std::shared_ptr<GgmlOvDecoder> ggml_decoder,
return input_tensor;
}
std::map<std::string, void*> get_ggml_graph_output_dst(std::shared_ptr<GgmlOvDecoder> ggml_decoder) {
std::map<std::string, void*> output_tensors;
std::map<std::string, void *> get_ggml_graph_output_dst(std::shared_ptr<GgmlOvDecoder> ggml_decoder) {
std::map<std::string, void *> output_tensors;
auto output_names = ggml_decoder->get_model_output_names();
for (size_t inp = 0; inp < output_names.size(); ++inp) {
auto name = output_names[inp];
const auto* tensor = ggml_decoder->get_output_ggml_tensor(name);
auto* output_data = tensor->view_src ? tensor->view_src->data : tensor->data;
const auto * tensor = ggml_decoder->get_output_ggml_tensor(name);
auto * output_data = tensor->view_src ? tensor->view_src->data : tensor->data;
output_tensors[name] = output_data;
}
return output_tensors;
@@ -63,14 +64,14 @@ static ov::frontend::FrontEnd::Ptr get_ggml_frontend() {
return front_end;
}
enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_cgraph* cgraph) {
enum ggml_status openvino_frontend_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
static ov::Core core;
static std::string device = getenv("GGML_OPENVINO_DEVICE") ? getenv("GGML_OPENVINO_DEVICE") : "";
if (device.empty()) {
const std::vector<std::string> preferred_device = { "GPU", "CPU", "NPU" };
const std::vector<std::string> preferred_device = {"GPU", "CPU", "NPU"};
const auto available_devices = core.get_available_devices();
for (const auto& dev : preferred_device) {
for (const auto & dev : preferred_device) {
if (std::find(available_devices.begin(), available_devices.end(), dev) != available_devices.end()) {
device = dev;
break;
@@ -92,17 +93,17 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
auto start_time = ggml_time_us();
auto* cache_dir = getenv("GGML_OPENVINO_CACHE_DIR");
auto * cache_dir = getenv("GGML_OPENVINO_CACHE_DIR");
if (cache_dir && !is_static) {
core.set_property(ov::cache_dir(cache_dir));
}
static std::mutex cache_mutex;
static std::unordered_map<struct ggml_cgraph*, std::shared_ptr<ov::InferRequest>> infer_request_cache;
static std::unordered_map<struct ggml_cgraph*, std::vector<std::string>> ov_input_names_cache;
static std::unordered_map<struct ggml_cgraph*, std::vector<std::string>> ov_output_names_cache;
static std::unordered_map<ggml_cgraph *, std::shared_ptr<ov::InferRequest>> infer_request_cache;
static std::unordered_map<ggml_cgraph *, std::vector<std::string>> ov_input_names_cache;
static std::unordered_map<ggml_cgraph *, std::vector<std::string>> ov_output_names_cache;
// For NPU, store the kvcache model, since we cannot create two infer_request
static std::unordered_map<struct ggml_cgraph*, ov::CompiledModel> compiled_model_cache;
static std::unordered_map<ggml_cgraph *, ov::CompiledModel> compiled_model_cache;
std::shared_ptr<GgmlOvDecoder> ggml_decoder;
ov::InferRequest infer_request;
@@ -181,7 +182,7 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
ov::serialize(model, timestamped_filename);
}
auto* disable_sdpa_optimization = getenv("GGML_OPENVINO_DISABLE_SDPA_OPTIMIZATION");
auto * disable_sdpa_optimization = getenv("GGML_OPENVINO_DISABLE_SDPA_OPTIMIZATION");
if (disable_sdpa_optimization && std::string(disable_sdpa_optimization) != "0") {
config = {
{"GPU_ENABLE_SDPA_OPTIMIZATION", "0"}
@@ -196,10 +197,10 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
std::vector<std::string> ov_input_names;
std::vector<std::string> ov_output_names;
for (const auto& ov_param : model->get_parameters()) {
for (const auto & ov_param : model->get_parameters()) {
ov_input_names.push_back(ov_param->get_friendly_name());
}
for (const auto& ov_output : model->get_results()) {
for (const auto & ov_output : model->get_results()) {
ov_output_names.push_back(ov_output->get_friendly_name());
}
ov_input_names_cache[cgraph] = ov_input_names;
@@ -225,7 +226,7 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
auto gguf_tensor_addrs = get_ggml_graph_output_dst(ggml_decoder);
for (size_t i = 0; i < ov_output_names.size(); i++) {
auto& result_name = ov_output_names[i];
auto & result_name = ov_output_names[i];
const auto output_tensor = infer_request.get_output_tensor(i);
std::memcpy(gguf_tensor_addrs[result_name], output_tensor.data(), output_tensor.get_byte_size());
@@ -278,7 +279,7 @@ ov::AnyMap get_npu_generate_config() {
return config;
}
std::map<ggml_type, ExtraQuantType> get_types_to_requant(const std::string& device) {
std::map<ggml_type, ExtraQuantType> get_types_to_requant(const std::string & device) {
if (device == "NPU") {
return {
{GGML_TYPE_Q4_0, ExtraQuantType::Q4_0_128},
@@ -297,15 +298,15 @@ std::map<ggml_type, ExtraQuantType> get_types_to_requant(const std::string& devi
return {};
}
bool is_naive(struct ggml_cgraph* cgraph) {
bool is_naive(ggml_cgraph * cgraph) {
constexpr int naive_graph_size_threshold = 20;
return cgraph->n_nodes < naive_graph_size_threshold;
}
enum ggml_status naive_compute(struct ggml_cgraph* cgraph,
ov::Core& core,
const std::string& device,
const ov::AnyMap& config) {
enum ggml_status naive_compute(ggml_cgraph * cgraph,
ov::Core & core,
const std::string & device,
const ov::AnyMap & config) {
if (cgraph->n_nodes == 1 && (cgraph->nodes[0]->op == GGML_OP_NONE || cgraph->nodes[0]->op == GGML_OP_VIEW)) {
return GGML_STATUS_SUCCESS;
}
@@ -343,7 +344,7 @@ enum ggml_status naive_compute(struct ggml_cgraph* cgraph,
return GGML_STATUS_SUCCESS;
}
ov::Tensor get_ov_input_tensor(std::shared_ptr<GgmlOvDecoder> ggml_decoder, const std::string& param_name) {
ov::Tensor get_ov_input_tensor(std::shared_ptr<GgmlOvDecoder> ggml_decoder, const std::string & param_name) {
bool is_static = ggml_decoder->is_static();
bool is_first_token = ggml_decoder->is_first_token();
@@ -358,10 +359,10 @@ ov::Tensor get_ov_input_tensor(std::shared_ptr<GgmlOvDecoder> ggml_decoder, cons
if (param_name == "inp_tokens" || param_name == "inp_pos") {
if (is_first_token) {
size_t context_size = ggml_decoder->get_context_size();
const auto* input_tensor_ggml = ggml_decoder->get_input_ggml_tensor(param_name);
const auto * input_tensor_ggml = ggml_decoder->get_input_ggml_tensor(param_name);
std::vector<int32_t> padded_data = pad_input<int32_t>(input_tensor_ggml, 1, context_size, 0);
input_tensor = ov::Tensor(ov::element::i32, ov::Shape{1, 1, context_size});
auto* data_ptr = input_tensor.data<int32_t>();
auto * data_ptr = input_tensor.data<int32_t>();
std::copy(padded_data.begin(), padded_data.end(), data_ptr);
} else {
input_tensor = convert_ggml_input_to_ov(ggml_decoder, param_name);
@@ -369,22 +370,22 @@ ov::Tensor get_ov_input_tensor(std::shared_ptr<GgmlOvDecoder> ggml_decoder, cons
} else if (param_name.find("KQ_mask") == 0) {
size_t context_size = ggml_decoder->get_context_size();
const auto* input_tensor_ggml = ggml_decoder->get_input_ggml_tensor(param_name);
const auto * input_tensor_ggml = ggml_decoder->get_input_ggml_tensor(param_name);
if (is_first_token) {
std::vector<float> padded_data =
pad_input<float>(input_tensor_ggml, context_size, context_size, -INFINITY);
set_zero_diagonal(padded_data, context_size);
input_tensor = ov::Tensor(ov::element::f32, ov::Shape{1, context_size, context_size});
auto* data_ptr = input_tensor.data<float>();
auto * data_ptr = input_tensor.data<float>();
std::copy(padded_data.begin(), padded_data.end(), data_ptr);
} else {
std::vector<float> padded_data = pad_input<float>(input_tensor_ggml, 1, context_size, -INFINITY);
input_tensor = ov::Tensor(ov::element::f32, ov::Shape{1, 1, context_size});
auto* data_ptr = input_tensor.data<float>();
auto * data_ptr = input_tensor.data<float>();
std::copy(padded_data.begin(), padded_data.end(), data_ptr);
}
} else if (const auto* op = ggml_decoder->get_tensor_used_op(ggml_decoder->get_tensor_from_name(param_name));
} else if (const auto * op = ggml_decoder->get_tensor_used_op(ggml_decoder->get_tensor_from_name(param_name));
op && op->op == GGML_OP_SET_ROWS && is_static && is_first_token) {
input_tensor = ov::Tensor(ov::element::i64, ov::Shape{1, 1, 1});
} else {
@@ -394,8 +395,8 @@ ov::Tensor get_ov_input_tensor(std::shared_ptr<GgmlOvDecoder> ggml_decoder, cons
return input_tensor;
}
size_t checksum(const void* data, size_t size) {
const uint8_t* bytes = static_cast<const uint8_t*>(data);
size_t checksum(const void * data, size_t size) {
const uint8_t * bytes = static_cast<const uint8_t *>(data);
size_t sum = 0;
for (size_t i = 0; i < size; ++i) {
sum += bytes[i];
@@ -408,36 +409,37 @@ size_t checksum(const void* data, size_t size) {
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
void print_input_tensor_info(const std::string& name, const ov::Tensor& tensor) {
void print_input_tensor_info(const std::string & name, const ov::Tensor & tensor) {
std::cout << "Input name: " << name << ", Input shape: " << tensor.get_shape() << ", Address: " << tensor.data()
<< std::endl;
switch (tensor.get_element_type()) {
case ov::element::f32:
std::cout << *(tensor.data<float>()) << std::endl;
break;
case ov::element::f16:
std::cout << *(tensor.data<ov::float16>()) << std::endl;
break;
case ov::element::i32:
for (size_t i = 0; i < tensor.get_size(); ++i) {
std::cout << tensor.data<int32_t>()[i] << " ";
}
std::cout << std::endl;
break;
case ov::element::i64:
std::cout << *(tensor.data<int64_t>()) << std::endl;
break;
default:
break;
case ov::element::f32:
std::cout << *(tensor.data<float>()) << std::endl;
break;
case ov::element::f16:
std::cout << *(tensor.data<ov::float16>()) << std::endl;
break;
case ov::element::i32:
for (size_t i = 0; i < tensor.get_size(); ++i) {
std::cout << tensor.data<int32_t>()[i] << " ";
}
std::cout << std::endl;
break;
case ov::element::i64:
std::cout << *(tensor.data<int64_t>()) << std::endl;
break;
default:
break;
}
}
void print_output_tensor_info(const std::string& name, const ov::Tensor& tensor,
std::map<std::string, void*>& output_dst) {
void print_output_tensor_info(const std::string & name,
const ov::Tensor & tensor,
std::map<std::string, void *> & output_dst) {
std::cout << "Output name: " << name << ", Output shape: " << tensor.get_shape()
<< ", Address: " << output_dst[name] << std::endl;
auto print_float_stats = [](const std::string& type_name, size_t size, auto get_value) {
auto print_float_stats = [](const std::string & type_name, size_t size, auto get_value) {
if (size == 0) {
return;
}
@ -467,13 +469,13 @@ void print_output_tensor_info(const std::string& name, const ov::Tensor& tensor,
switch (tensor.get_element_type()) {
case ov::element::f32: {
const float* data = tensor.data<float>();
const float * data = tensor.data<float>();
size_t size = tensor.get_size();
print_float_stats("[f32]", size, [data](size_t i) { return data[i]; });
break;
}
case ov::element::f16: {
const ov::float16* data = tensor.data<ov::float16>();
const ov::float16 * data = tensor.data<ov::float16>();
size_t size = tensor.get_size();
print_float_stats("[f16]", size, [data](size_t i) { return static_cast<float>(data[i]); });
break;
@ -485,17 +487,17 @@ void print_output_tensor_info(const std::string& name, const ov::Tensor& tensor,
#pragma GCC diagnostic pop
void set_zero_diagonal(std::vector<float>& matrix, size_t dim) {
void set_zero_diagonal(std::vector<float> & matrix, size_t dim) {
for (size_t i = 0; i < dim; ++i) {
matrix[i * dim + i] = 0.0f;
}
}
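
A tiny worked example of set_zero_diagonal (values assumed), mirroring its use on the padded KQ_mask above: after padding with -INFINITY, the diagonal is cleared so every position can still attend to itself.

    std::vector<float> mask(3 * 3, -INFINITY);
    set_zero_diagonal(mask, 3);
    // mask is now (row-major):
    //     0  -inf -inf
    //  -inf    0  -inf
    //  -inf -inf    0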
bool is_prefill(struct ggml_cgraph* cgraph) {
bool is_prefill(ggml_cgraph * cgraph) {
for (int i = 0; i < cgraph->n_nodes; ++i) {
auto* op = cgraph->nodes[i];
auto * op = cgraph->nodes[i];
for (int j = 0; j < GGML_MAX_SRC; ++j) {
auto* src = op->src[j];
auto * src = op->src[j];
if (src == nullptr) {
break;
}

View File

@ -1,32 +1,32 @@
#include <algorithm>
#include <openvino/runtime/core.hpp>
#include "ggml-backend-impl.h"
#include "ggml-decoder.h"
#include "ggml-impl.h"
enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_cgraph* cgraph);
#include <algorithm>
#include <openvino/runtime/core.hpp>
std::shared_ptr<GgmlOvDecoder> get_ggml_decoder(struct ggml_cgraph* cgraph, bool is_static, bool is_first_token);
enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph);
ov::Tensor convert_ggml_input_to_ov(std::shared_ptr<GgmlOvDecoder> ggml_decoder, const std::string& name);
std::shared_ptr<GgmlOvDecoder> get_ggml_decoder(struct ggml_cgraph * cgraph, bool is_static, bool is_first_token);
std::map<std::string, void*> get_ggml_graph_output_dst(std::shared_ptr<GgmlOvDecoder> ggml_decoder);
ov::Tensor convert_ggml_input_to_ov(std::shared_ptr<GgmlOvDecoder> ggml_decoder, const std::string & name);
size_t checksum(const void* data, size_t size);
std::map<std::string, void *> get_ggml_graph_output_dst(std::shared_ptr<GgmlOvDecoder> ggml_decoder);
void print_input_tensor_info(const std::string& name, const ov::Tensor& tensor);
size_t checksum(const void * data, size_t size);
void print_output_tensor_info(const std::string& name,
const ov::Tensor& tensor,
std::map<std::string, void*>& output_dst);
void print_input_tensor_info(const std::string & name, const ov::Tensor & tensor);
void print_output_tensor_info(const std::string & name,
const ov::Tensor & tensor,
std::map<std::string, void *> & output_dst);
template <typename T>
std::vector<T> pad_input(const ggml_tensor* tensor, size_t padded_rows, size_t padded_cols, T pad_value) {
std::vector<T> pad_input(const ggml_tensor * tensor, size_t padded_rows, size_t padded_cols, T pad_value) {
std::vector<T> padded_data(padded_rows * padded_cols, pad_value);
size_t rows = tensor->ne[1];
size_t cols = tensor->ne[0];
T* data = static_cast<T*>(tensor->data);
T * data = static_cast<T *>(tensor->data);
for (size_t i = 0; i < std::min(rows, padded_rows); ++i) {
for (size_t j = 0; j < std::min(cols, padded_cols); ++j) {
@ -36,18 +36,20 @@ std::vector<T> pad_input(const ggml_tensor* tensor, size_t padded_rows, size_t p
return padded_data;
}
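
A standalone sketch exercising pad_input outside the backend; the ggml context setup is illustrative only:

    ggml_init_params params = {/*mem_size=*/1024 * 1024, /*mem_buffer=*/nullptr, /*no_alloc=*/false};
    ggml_context * ctx = ggml_init(params);
    ggml_tensor * t = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, /*ne0: cols*/ 2, /*ne1: rows*/ 2);
    float * d = static_cast<float *>(t->data);
    d[0] = 1; d[1] = 2; d[2] = 3; d[3] = 4;
    // Pad the 2x2 tensor into a 3x4 buffer, filling new cells with 0.
    std::vector<float> padded = pad_input<float>(t, /*padded_rows=*/3, /*padded_cols=*/4, 0.0f);
    // padded (row-major 3x4): 1 2 0 0 / 3 4 0 0 / 0 0 0 0
    ggml_free(ctx);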
void set_zero_diagonal(std::vector<float>& matrix, size_t dim);
void set_zero_diagonal(std::vector<float> & matrix, size_t dim);
bool is_prefill(struct ggml_cgraph * cgraph);
ov::AnyMap get_npu_prefill_config();
ov::AnyMap get_npu_generate_config();
std::map<ggml_type, ExtraQuantType> get_types_to_requant(const std::string& device);
std::map<ggml_type, ExtraQuantType> get_types_to_requant(const std::string & device);
ov::Tensor get_ov_input_tensor(std::shared_ptr<GgmlOvDecoder> ggml_decoder, const std::string& param_name);
ov::Tensor get_ov_input_tensor(std::shared_ptr<GgmlOvDecoder> ggml_decoder, const std::string & param_name);
bool is_naive(struct ggml_cgraph* cgraph);
bool is_naive(struct ggml_cgraph * cgraph);
enum ggml_status naive_compute(struct ggml_cgraph* cgraph, ov::Core& core, const std::string& device,
const ov::AnyMap& config);
enum ggml_status naive_compute(struct ggml_cgraph * cgraph,
ov::Core & core,
const std::string & device,
const ov::AnyMap & config);
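
Finally, a hedged sketch of the fallback these last declarations suggest: when the graph is not one the LLM fast path handles, compile and run it naively instead. The dispatch shape, the "CPU" device, and the empty config are assumptions for illustration:

    ggml_status compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
        static ov::Core core;
        if (is_naive(cgraph)) {
            // Op-by-op execution path for graphs outside the LLM fast path.
            return naive_compute(cgraph, core, "CPU", ov::AnyMap{});
        }
        return openvino_frontend_compute(backend, cgraph);
    }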