From eba8113dc4655e16e4c7513f48cde57c9cfe5791 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Tue, 21 Oct 2025 14:45:32 +0800 Subject: [PATCH] Style: middle ptr and ref align, omit optional struct keyword --- ggml/include/ggml-openvino.h | 24 +- ggml/src/ggml-openvino/.clang-format | 27 +- ggml/src/ggml-openvino/ggml-decoder.cpp | 168 ++++++------ ggml/src/ggml-openvino/ggml-decoder.h | 132 +++++----- ggml/src/ggml-openvino/ggml-openvino.cpp | 162 ++++++------ ggml/src/ggml-openvino/ggml-quants.cpp | 247 ++++++++++-------- ggml/src/ggml-openvino/openvino/frontend.cpp | 4 +- .../ggml-openvino/openvino/input_model.cpp | 4 +- ggml/src/ggml-openvino/openvino/op/cont.cpp | 14 +- ggml/src/ggml-openvino/openvino/op/cpy.cpp | 7 +- .../openvino/op/flash_attn_ext.cpp | 35 ++- .../ggml-openvino/openvino/op/get_rows.cpp | 10 +- .../ggml-openvino/openvino/op/glu_geglu.cpp | 12 +- .../ggml-openvino/openvino/op/glu_swiglu.cpp | 12 +- ggml/src/ggml-openvino/openvino/op/mulmat.cpp | 10 +- .../src/ggml-openvino/openvino/op/permute.cpp | 25 +- .../src/ggml-openvino/openvino/op/reshape.cpp | 22 +- .../ggml-openvino/openvino/op/rms_norm.cpp | 10 +- ggml/src/ggml-openvino/openvino/op/rope.cpp | 15 +- ggml/src/ggml-openvino/openvino/op/scale.cpp | 10 +- .../ggml-openvino/openvino/op/set_rows.cpp | 16 +- .../src/ggml-openvino/openvino/op/softmax.cpp | 16 +- .../ggml-openvino/openvino/op/transpose.cpp | 6 +- .../ggml-openvino/openvino/op/unary_silu.cpp | 10 +- ggml/src/ggml-openvino/openvino/op/view.cpp | 5 +- ggml/src/ggml-openvino/openvino/op_table.cpp | 4 +- .../openvino/pass/eliminate_zp.cpp | 32 ++- .../openvino/pass/fuse_to_sdpa.cpp | 4 +- .../openvino/translate_session.cpp | 80 +++--- ggml/src/ggml-openvino/openvino/utils.cpp | 22 +- ggml/src/ggml-openvino/utils.cpp | 136 +++++----- ggml/src/ggml-openvino/utils.h | 42 +-- 32 files changed, 670 insertions(+), 653 deletions(-) diff --git a/ggml/include/ggml-openvino.h b/ggml/include/ggml-openvino.h index 151c48d40d..7b5298e520 100644 --- a/ggml/include/ggml-openvino.h +++ b/ggml/include/ggml-openvino.h @@ -1,17 +1,17 @@ #pragma once -#include "ggml.h" #include "ggml-backend.h" +#include "ggml.h" -#include #include +#include #ifdef __cplusplus extern "C" { #endif -#define GGML_OPENVINO_NAME "OPENVINO" -#define GGML_OPENVINO_MAX_DEVICES 16 +#define GGML_OPENVINO_NAME "OPENVINO" +#define GGML_OPENVINO_MAX_DEVICES 16 // backend API GGML_BACKEND_API ggml_backend_t ggml_backend_openvino_init(int device); @@ -28,7 +28,7 @@ GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_openvino_split_buffer_t // and GPU GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_openvino_host_buffer_type(void); -GGML_BACKEND_API int ggml_backend_openvino_get_device_count(void); +GGML_BACKEND_API int ggml_backend_openvino_get_device_count(void); // GGML_BACKEND_API void ggml_backend_openvino_get_device_description(int device, char * description, // size_t description_size); // GGML_BACKEND_API void ggml_backend_openvino_get_device_memory(int device, size_t * free, size_t * total); @@ -42,13 +42,13 @@ struct ggml_openvino_device_info { int device_count; struct openvino_device_info { - int cc; // compute capability - int nsm; // number of streaming multiprocessors - size_t smpb; // max. shared memory per block - size_t smpbo; // max. 
shared memory per block (with opt-in) - bool vmm; // virtual memory support - size_t vmm_granularity; // granularity of virtual memory - size_t total_vram; + int cc; // compute capability + int nsm; // number of streaming multiprocessors + size_t smpb; // max. shared memory per block + size_t smpbo; // max. shared memory per block (with opt-in) + bool vmm; // virtual memory support + size_t vmm_granularity; // granularity of virtual memory + size_t total_vram; }; openvino_device_info devices[GGML_OPENVINO_MAX_DEVICES] = {}; diff --git a/ggml/src/ggml-openvino/.clang-format b/ggml/src/ggml-openvino/.clang-format index 63dc2c472a..a2a24d7d33 100644 --- a/ggml/src/ggml-openvino/.clang-format +++ b/ggml/src/ggml-openvino/.clang-format @@ -2,12 +2,10 @@ # Override root .clang-format AlignConsecutiveAssignments: false AlignConsecutiveDeclarations: false -ReferenceAlignment: Left -PointerAlignment: Left Cpp11BracedListStyle: true -AccessModifierOffset: -4 -BinPackArguments: false +SpacesInContainerLiterals: false BreakBeforeBraces: Attach +AccessModifierOffset: -4 IndentCaseBlocks: false IndentCaseLabels: false @@ -32,7 +30,15 @@ AllowShortIfStatementsOnASingleLine: Never AllowShortLambdasOnASingleLine: Inline AllowShortLoopsOnASingleLine: false AlwaysBreakBeforeMultilineStrings: true -BinPackParameters: true +# Treat CUDA keywords/attributes as "attribute macros" and avoid breaking lines inside them +AttributeMacros: + - __host__ + - __device__ + - __global__ + - __forceinline__ + - __launch_bounds__ +BinPackArguments: true +BinPackParameters: false # OnePerLine BitFieldColonSpacing: Both # BreakAdjacentStringLiterals: true BreakAfterAttributes: Never @@ -58,15 +64,18 @@ ExperimentalAutoDetectBinPacking: false FixNamespaceComments: true IncludeBlocks: Regroup IncludeCategories: - - Regex: '^<.*\.h>' + - Regex: '".*"' Priority: 1 SortPriority: 0 - - Regex: '^<.*' + - Regex: '^<.*\.h>' Priority: 2 SortPriority: 0 - - Regex: '.*' + - Regex: '^<.*' Priority: 3 SortPriority: 0 + - Regex: '.*' + Priority: 4 + SortPriority: 0 IncludeIsMainRegex: '([-_](test|unittest))?$' IncludeIsMainSourceRegex: '' IndentAccessModifiers: false @@ -100,6 +109,7 @@ PenaltyBreakString: 1000 PenaltyBreakTemplateDeclaration: 10 PenaltyExcessCharacter: 1000000 PenaltyReturnTypeOnItsOwnLine: 200 +PointerAlignment: Middle QualifierAlignment: Left #QualifierOrder: ['static', 'inline', 'friend', 'constexpr', 'const', 'volatile', 'type', 'restrict'] RawStringFormats: @@ -113,6 +123,7 @@ RawStringFormats: - 'c++' - 'C++' CanonicalDelimiter: '' +ReferenceAlignment: Middle ReflowComments: false # IndentOnly SeparateDefinitionBlocks: Always SortIncludes: CaseInsensitive diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 7c6bfe7ee7..392d45dd6b 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -1,5 +1,9 @@ #include "ggml-decoder.h" +#include "ggml-backend-impl.h" +#include "ggml-backend.h" +#include "ggml-quants.hpp" + #include #include @@ -32,13 +36,16 @@ #include #include -#include "ggml-backend-impl.h" -#include "ggml-backend.h" -#include "ggml-quants.hpp" - -GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgraph, bool is_static, bool is_first_token, - int context_size, int context_size_swa, int num_heads, int num_heads_kv, int head_size, - const std::vector& swa_layers) : +GgmlOvDecoder::GgmlOvDecoder(ggml_tensor * node, + ggml_cgraph * cgraph, + bool is_static, + bool is_first_token, + int context_size, 
+ int context_size_swa, + int num_heads, + int num_heads_kv, + int head_size, + const std::vector & swa_layers) : m_cgraph(cgraph), m_node(node), m_op_name(std::string(node->name)), @@ -53,8 +60,9 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgrap set_input_output(node); } -GgmlOvDecoder::GgmlOvDecoder(struct ggml_cgraph* cgraph, - std::map>& model_weights, bool is_static, +GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph, + std::map> & model_weights, + bool is_static, bool is_first_token) : m_cgraph(cgraph), m_op_name(m_node ? std::string(m_node->name) : ""), @@ -68,7 +76,7 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_cgraph* cgraph, set_llm_params(); for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) { - auto* cur_node = cgraph->nodes[node_n]; + auto * cur_node = cgraph->nodes[node_n]; m_nodes.push_back(cur_node); set_input_output(cur_node); } @@ -76,12 +84,11 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_cgraph* cgraph, // add_extra_inputs(); } -GgmlOvDecoder::GgmlOvDecoder(struct ggml_cgraph* cgraph, - std::map>& model_weights) { +GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph, std::map> & model_weights) { m_cgraph = cgraph; m_model_weights = model_weights; for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) { - auto* cur_node = cgraph->nodes[node_n]; + auto * cur_node = cgraph->nodes[node_n]; if (cur_node->op == GGML_OP_NONE) { continue; } @@ -93,7 +100,7 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_cgraph* cgraph, // Called in GgmlOvDecoder constructor. Two cases: 1. constructing a decoder for the whole graph; // 2. constructing a decoder for a node; // 3. constructing a decoder for the whole graph naively (op test case) -void GgmlOvDecoder::set_input_output(ggml_tensor* node, bool naive) { +void GgmlOvDecoder::set_input_output(ggml_tensor * node, bool naive) { std::string node_name; if (node->op == GGML_OP_SET_ROWS) { // SET_ROWS updates the tensor in place. 
For later ov op that uses the @@ -109,7 +116,7 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, bool naive) { m_outputs[node_name] = node; for (int i = 0; i < GGML_MAX_SRC; i++) { - auto* src = node->src[i]; + auto * src = node->src[i]; if (src == nullptr) { continue; } @@ -128,7 +135,7 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, bool naive) { } } else if (!m_node && !src->view_src) { - ggml_backend_buffer* buffer = src->buffer; + ggml_backend_buffer * buffer = src->buffer; if (buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY || src->flags & GGML_TENSOR_FLAG_INPUT) { // GGML_BACKEND_BUFFER_USAGE_ANY are kv caches @@ -236,8 +243,8 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, bool naive) { } case GGML_OP_VIEW: { if (node->src[0]->op == GGML_OP_VIEW) { - auto* src = node->src[0]; - auto* view_src = src->view_src; + auto * src = node->src[0]; + auto * view_src = src->view_src; if (view_src->ne[1] != src->ne[2]) { throw std::runtime_error("Unsupported VIEW case"); } @@ -250,7 +257,7 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, bool naive) { } } -int extract_layer_from_name(const std::string& name) { +int extract_layer_from_name(const std::string & name) { size_t pos1 = name.find("_l"); assert(pos1 != std::string::npos); pos1 += 2; @@ -265,10 +272,10 @@ int extract_layer_from_name(const std::string& name) { void GgmlOvDecoder::set_llm_params() { for (int i = 0; i < m_cgraph->n_nodes; i++) { - auto* node = m_cgraph->nodes[i]; + auto * node = m_cgraph->nodes[i]; std::string name = std::string(node->name); if (node->op == GGML_OP_FLASH_ATTN_EXT) { - auto* cache_k = node->src[1]; + auto * cache_k = node->src[1]; cache_k = cache_k->view_src ? cache_k->view_src : cache_k; int layer = extract_layer_from_name(cache_k->name); @@ -290,7 +297,7 @@ void GgmlOvDecoder::set_llm_params() { } } -ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor* src) const { +ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor * src) const { auto name = std::string(src->name); ov::PartialShape input_shape; if (name == "inp_tokens" || name == "inp_pos") { @@ -323,7 +330,7 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor* src) co } else { input_shape = ov::PartialShape{1, -1, m_num_heads_kv, m_head_size}; } - } else if (const auto* op = get_tensor_used_op(src); op && op->op == GGML_OP_SET_ROWS) { + } else if (const auto * op = get_tensor_used_op(src); op && op->op == GGML_OP_SET_ROWS) { input_shape = ov::PartialShape{1, 1, m_is_static ? 
1 : -1}; } else if (src->op == GGML_OP_VIEW) { // This case is added to make test-backend-ops work @@ -342,9 +349,9 @@ void GgmlOvDecoder::add_extra_inputs() { // Update: not used anymore after the optimization of making kvcache dynamic (but breaks iSWA models) int64_t attention_size = -1; int64_t attention_size_swa = -1; - for (const auto& node : m_nodes) { + for (const auto & node : m_nodes) { if (node->op == GGML_OP_FLASH_ATTN_EXT) { - auto* mask = node->src[3]; + auto * mask = node->src[3]; std::string mask_name(mask->name); if (mask_name.find("KQ_mask") != 0) { throw std::runtime_error("Unexpected flash attention node: " + std::string(mask->name)); @@ -357,7 +364,7 @@ void GgmlOvDecoder::add_extra_inputs() { } } - auto create_attention_size_input = [this](const std::string& name, int64_t size) { + auto create_attention_size_input = [this](const std::string & name, int64_t size) { auto param_node = std::make_shared(ov::element::i64, ov::Shape{1}); param_node->set_friendly_name(name); param_node->output(0).get_tensor().set_names({name}); @@ -374,12 +381,12 @@ void GgmlOvDecoder::add_extra_inputs() { } } -const ggml_tensor* GgmlOvDecoder::get_tensor_used_op(const ggml_tensor* tensor) const { +const ggml_tensor * GgmlOvDecoder::get_tensor_used_op(const ggml_tensor * tensor) const { if (tensor == nullptr) { return nullptr; } for (int i = 0; i < m_cgraph->n_nodes; i++) { - const auto* node = m_cgraph->nodes[i]; + const auto * node = m_cgraph->nodes[i]; for (int j = 0; j < GGML_MAX_SRC; j++) { if (node->src[j] == tensor) { return node; @@ -389,11 +396,11 @@ const ggml_tensor* GgmlOvDecoder::get_tensor_used_op(const ggml_tensor* tensor) return nullptr; } -const ggml_tensor* GgmlOvDecoder::get_tensor_from_name(const std::string& name) const { +const ggml_tensor * GgmlOvDecoder::get_tensor_from_name(const std::string & name) const { for (int i = 0; i < m_cgraph->n_nodes; i++) { - const auto* node = m_cgraph->nodes[i]; + const auto * node = m_cgraph->nodes[i]; for (int j = 0; j < GGML_MAX_SRC; j++) { - const auto* src = node->src[j]; + const auto * src = node->src[j]; if (src == nullptr) { break; } @@ -407,7 +414,7 @@ const ggml_tensor* GgmlOvDecoder::get_tensor_from_name(const std::string& name) std::map GgmlOvDecoder::get_kv_param_res_names() const { std::map kv_param_res_names; - for (const auto& name : m_kv_names) { + for (const auto & name : m_kv_names) { if (name.find("cache_k") == 0 || name.find("cache_v") == 0) { kv_param_res_names[name] = name; } @@ -416,21 +423,22 @@ std::map GgmlOvDecoder::get_kv_param_res_names() const } std::map> GgmlOvDecoder::create_weight_nodes( - struct ggml_cgraph* cgraph, std::map types_to_requantize) { + ggml_cgraph * cgraph, + std::map types_to_requantize) { std::map> model_weights; static std::mutex weights_mutex; - auto* nodes = cgraph->nodes; + auto * nodes = cgraph->nodes; auto n_nodes = cgraph->n_nodes; - std::for_each(std::execution::par, nodes, nodes + n_nodes, [&](ggml_tensor* node) { + std::for_each(std::execution::par, nodes, nodes + n_nodes, [&](ggml_tensor * node) { for (int i = 0; i < GGML_MAX_SRC; i++) { - auto* src = node->src[i]; + auto * src = node->src[i]; if (src == nullptr) { continue; } std::string src_name(src->name); if (!src->view_src) { - ggml_backend_buffer* buffer = src->buffer; + ggml_backend_buffer * buffer = src->buffer; if (buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS || ggml_is_quantized(src->type)) { bool should_create = false; { @@ -458,17 +466,10 @@ std::map> GgmlOvDecoder::create_weight_no return model_weights; } 
-std::shared_ptr GgmlOvDecoder::create_weight_node(ggml_tensor* tensor, +std::shared_ptr GgmlOvDecoder::create_weight_node(ggml_tensor * tensor, std::optional requant_type) { - std::set weight_types = {GGML_TYPE_F32, - GGML_TYPE_F16, - GGML_TYPE_BF16, - GGML_TYPE_Q8_0, - GGML_TYPE_Q4_0, - GGML_TYPE_Q4_1, - GGML_TYPE_Q4_K, - GGML_TYPE_Q5_K, - GGML_TYPE_Q6_K}; + std::set weight_types = {GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_BF16, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0, + GGML_TYPE_Q4_1, GGML_TYPE_Q4_K, GGML_TYPE_Q5_K, GGML_TYPE_Q6_K}; if (weight_types.find(tensor->type) == weight_types.end()) { throw std::runtime_error("Unexpected weight tensor type: " + std::string(tensor->name) + " with type " + ggml_type_name(tensor->type)); @@ -495,9 +496,8 @@ std::shared_ptr GgmlOvDecoder::create_weight_node(ggml_tensor* tensor, } // Quantized case - OPENVINO_ASSERT( - tensor->extra == nullptr, - "Unsupported weight tensor: " + std::string(tensor->name) + " Possibly this is a repacked quantized weights"); + OPENVINO_ASSERT(tensor->extra == nullptr, "Unsupported weight tensor: " + std::string(tensor->name) + + " Possibly this is a repacked quantized weights"); if (requant_type.has_value()) { return requantize(tensor, requant_type.value()); @@ -518,11 +518,8 @@ std::shared_ptr GgmlOvDecoder::create_weight_node(ggml_tensor* tensor, weights_per_block = 32; } - OPENVINO_ASSERT(node_shape.back() % weights_per_block == 0, - "[load_gguf] tensor ", - tensor->name, - " has incompatible last dim shape: ", - node_shape.back()); + OPENVINO_ASSERT(node_shape.back() % weights_per_block == 0, "[load_gguf] tensor ", tensor->name, + " has incompatible last dim shape: ", node_shape.back()); ov::Tensor weights(weight_type, node_shape); // For scales and biases @@ -557,7 +554,7 @@ std::shared_ptr GgmlOvDecoder::create_weight_node(ggml_tensor* tensor, return weight_node.get_node_shared_ptr(); } -void GgmlOvDecoder::dump_cgraph(const struct ggml_cgraph* cgraph, std::string& filename) { +void GgmlOvDecoder::dump_cgraph(const ggml_cgraph * cgraph, std::string & filename) { std::ofstream file(filename); if (!file.is_open()) { std::cerr << "Failed to open file" << std::endl; @@ -576,7 +573,7 @@ void GgmlOvDecoder::dump_cgraph(const struct ggml_cgraph* cgraph, std::string& f << std::setw(50) << "stride" << "\n"; for (int i = 0; i < cgraph->n_nodes; i++) { - struct ggml_tensor * node = cgraph->nodes[i]; + ggml_tensor * node = cgraph->nodes[i]; file << " - " << std::setw(3) << i << ": [ " << std::setw(5) << node->ne[0] << ", " @@ -614,7 +611,7 @@ void GgmlOvDecoder::dump_cgraph(const struct ggml_cgraph* cgraph, std::string& f file << "n_leafs = " << cgraph->n_leafs << "\n"; for (int i = 0; i < cgraph->n_leafs; i++) { - struct ggml_tensor * node = cgraph->leafs[i]; + ggml_tensor * node = cgraph->leafs[i]; file << " - " << std::setw(3) << i << ": [ " << std::setw(5) << node->ne[0] << ", " @@ -628,10 +625,10 @@ void GgmlOvDecoder::dump_cgraph(const struct ggml_cgraph* cgraph, std::string& f file.close(); } -void print_tensor_address_map(const struct ggml_cgraph* cgraph) { - std::map> address_map; +void print_tensor_address_map(const ggml_cgraph * cgraph) { + std::map> address_map; for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) { - auto* node = cgraph->nodes[node_n]; + auto * node = cgraph->nodes[node_n]; if (node->data) { auto it = address_map.find(node->data); if (it == address_map.end()) { @@ -640,16 +637,16 @@ void print_tensor_address_map(const struct ggml_cgraph* cgraph) { address_map[node->data].push_back(node->name); } } - 
for (const auto& pair : address_map) { + for (const auto & pair : address_map) { std::cout << "Address: " << pair.first << std::endl; - for (const auto& name : pair.second) { + for (const auto & name : pair.second) { std::cout << name << " ; "; } std::cout << std::endl << std::endl; } } -std::vector GgmlOvDecoder::get_shape(const ggml_tensor* tensor) { +std::vector GgmlOvDecoder::get_shape(const ggml_tensor * tensor) { std::vector shape; for (int i = GGML_MAX_DIMS - 2; i >= 0; --i) { shape.push_back(static_cast(tensor->ne[i])); @@ -657,7 +654,7 @@ std::vector GgmlOvDecoder::get_shape(const ggml_tensor* tensor) { return shape; } -std::vector GgmlOvDecoder::get_stride(const ggml_tensor* tensor) { +std::vector GgmlOvDecoder::get_stride(const ggml_tensor * tensor) { std::vector stride; for (int i = GGML_MAX_DIMS - 2; i >= 0; --i) { stride.push_back(static_cast(tensor->nb[i])); @@ -665,7 +662,7 @@ std::vector GgmlOvDecoder::get_stride(const ggml_tensor* tensor) { return stride; } -ov::element::Type GgmlOvDecoder::get_ov_type(const ggml_tensor* tensor) { +ov::element::Type GgmlOvDecoder::get_ov_type(const ggml_tensor * tensor) { switch (tensor->type) { case GGML_TYPE_F64: return ov::element::f64; @@ -688,15 +685,15 @@ ov::element::Type GgmlOvDecoder::get_ov_type(const ggml_tensor* tensor) { } } -ov::PartialShape GgmlOvDecoder::get_input_shape(const std::string& name) const { +ov::PartialShape GgmlOvDecoder::get_input_shape(const std::string & name) const { return ov::PartialShape(get_shape(m_inputs.at(name))); } -std::vector GgmlOvDecoder::get_input_stride(const std::string& name) const { +std::vector GgmlOvDecoder::get_input_stride(const std::string & name) const { return get_stride(m_inputs.at(name)); } -ov::element::Type GgmlOvDecoder::get_input_type(const std::string& name) const { +ov::element::Type GgmlOvDecoder::get_input_type(const std::string & name) const { return get_ov_type(m_inputs.at(name)); } @@ -704,7 +701,7 @@ size_t GgmlOvDecoder::get_input_size() const { return m_input_names.size(); } -std::string& GgmlOvDecoder::get_input_name(size_t index) const { +std::string & GgmlOvDecoder::get_input_name(size_t index) const { m_name = m_input_names[index]; return m_name; } @@ -713,19 +710,19 @@ std::vector GgmlOvDecoder::get_input_names() const { return m_input_names; } -std::vector GgmlOvDecoder::get_output_stride(const std::string& name) const { +std::vector GgmlOvDecoder::get_output_stride(const std::string & name) const { return get_stride(m_outputs.at(name)); } -ov::PartialShape GgmlOvDecoder::get_output_shape(const std::string& name) const { +ov::PartialShape GgmlOvDecoder::get_output_shape(const std::string & name) const { return ov::PartialShape(get_shape(m_outputs.at(name))); } -ov::element::Type GgmlOvDecoder::get_output_type(const std::string& name) const { +ov::element::Type GgmlOvDecoder::get_output_type(const std::string & name) const { return get_ov_type(m_outputs.at(name)); } -std::string& GgmlOvDecoder::get_output_name(size_t index) const { +std::string & GgmlOvDecoder::get_output_name(size_t index) const { m_name = std::string(m_output_names[index]); return m_name; } @@ -734,35 +731,28 @@ std::vector GgmlOvDecoder::get_output_names() const { return m_output_names; } -const std::string& GgmlOvDecoder::get_op_name() const { +const std::string & GgmlOvDecoder::get_op_name() const { return m_op_name; } -int32_t* GgmlOvDecoder::get_input_op_params(const std::string& name) const { +int32_t * GgmlOvDecoder::get_input_op_params(const std::string & name) const { return 
m_inputs.at(name)->op_params; } -int32_t* GgmlOvDecoder::get_output_op_params(const std::string& name) const { +int32_t * GgmlOvDecoder::get_output_op_params(const std::string & name) const { return m_outputs.at(name)->op_params; } void GgmlOvDecoder::visit_subgraph(std::function)> node_visitor) const { - for (const auto& node : m_nodes) { - auto decoder = std::make_shared(node, - m_cgraph, - m_is_static, - m_is_first_token, - m_context_size, - m_context_size_swa, - m_num_heads, - m_num_heads_kv, - m_head_size, - m_swa_layers); + for (const auto & node : m_nodes) { + auto decoder = + std::make_shared(node, m_cgraph, m_is_static, m_is_first_token, m_context_size, + m_context_size_swa, m_num_heads, m_num_heads_kv, m_head_size, m_swa_layers); node_visitor(decoder); } } -const std::string& GgmlOvDecoder::get_op_type() const { +const std::string & GgmlOvDecoder::get_op_type() const { static const std::map ops = { {GGML_OP_NONE, "GGML_OP_NONE" }, {GGML_OP_ACC, "GGML_OP_ACC" }, diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index 35e79ecefc..884151d32e 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -1,5 +1,9 @@ #pragma once +#include "ggml-quants.hpp" +#include "ggml.h" +#include "openvino/decoder.hpp" + #include #include #include @@ -7,98 +11,99 @@ #include #include -#include "ggml-quants.hpp" -#include "ggml.h" -#include "openvino/decoder.hpp" - class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { public: // Graph decoder - GgmlOvDecoder(struct ggml_cgraph* cgraph, std::map>& model_weights, - bool is_static, bool is_first_token); + GgmlOvDecoder(ggml_cgraph * cgraph, + std::map> & model_weights, + bool is_static, + bool is_first_token); // Node decoder, called in GgmlOvDecoder::visit_subgraph - GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgraph, bool is_static, bool is_first_token, - int context_size, int context_size_swa, int num_heads, int num_heads_kv, int head_size, - const std::vector& swa_layers); + GgmlOvDecoder(ggml_tensor * node, + ggml_cgraph * cgraph, + bool is_static, + bool is_first_token, + int context_size, + int context_size_swa, + int num_heads, + int num_heads_kv, + int head_size, + const std::vector & swa_layers); // Naive graph decoder - GgmlOvDecoder(struct ggml_cgraph* cgraph, std::map>& model_weights); + GgmlOvDecoder(ggml_cgraph * cgraph, std::map> & model_weights); - virtual ov::Any get_attribute(const std::string& name) const override { + virtual ov::Any get_attribute(const std::string & name) const override { return nullptr; GGML_UNUSED(name); } - virtual ov::PartialShape get_input_shape(const std::string& name) const override; + virtual ov::PartialShape get_input_shape(const std::string & name) const override; - virtual std::vector get_input_stride(const std::string& name) const override; + virtual std::vector get_input_stride(const std::string & name) const override; - virtual ov::element::Type get_input_type(const std::string& name) const override; + virtual ov::element::Type get_input_type(const std::string & name) const override; virtual size_t get_input_size() const override; virtual void get_input_node(size_t input_port_idx, - std::string& producer_name, - std::string& producer_output_port_name, - size_t& producer_output_port_index) const override { + std::string & producer_name, + std::string & producer_output_port_name, + size_t & producer_output_port_index) const override { GGML_UNUSED(input_port_idx); GGML_UNUSED(producer_name); 
GGML_UNUSED(producer_output_port_name); GGML_UNUSED(producer_output_port_index); } - virtual std::string& get_input_name(size_t index) const override; + virtual std::string & get_input_name(size_t index) const override; virtual std::vector get_input_names() const override; - virtual ov::PartialShape get_output_shape(const std::string& name) const override; + virtual ov::PartialShape get_output_shape(const std::string & name) const override; - virtual std::vector get_output_stride(const std::string& name) const override; + virtual std::vector get_output_stride(const std::string & name) const override; - virtual ov::element::Type get_output_type(const std::string& name) const override; + virtual ov::element::Type get_output_type(const std::string & name) const override; - virtual int32_t* get_input_op_params(const std::string& name) const override; + virtual int32_t * get_input_op_params(const std::string & name) const override; - virtual int32_t* get_output_op_params(const std::string& name) const override; + virtual int32_t * get_output_op_params(const std::string & name) const override; - virtual std::string& get_output_name(size_t index) const override; + virtual std::string & get_output_name(size_t index) const override; virtual std::vector get_output_names() const override; - virtual const std::string& get_op_type() const override; + virtual const std::string & get_op_type() const override; - virtual const std::string& get_op_name() const override; + virtual const std::string & get_op_name() const override; virtual void visit_subgraph(std::function)> node_visitor) const override; - const ggml_tensor* get_input_ggml_tensor(const std::string& name) const { - return m_inputs.at(name); - } + const ggml_tensor * get_input_ggml_tensor(const std::string & name) const { return m_inputs.at(name); } - const ggml_tensor* get_output_ggml_tensor(const std::string& name) const { - return m_outputs.at(name); - } + const ggml_tensor * get_output_ggml_tensor(const std::string & name) const { return m_outputs.at(name); } - virtual int get_op_case() const override { - return m_op_case; - } + virtual int get_op_case() const override { return m_op_case; } - virtual const std::map>& get_model_inputs() const override { + virtual const std::map> & get_model_inputs() const override { return m_model_inputs; } - virtual const std::map>& get_model_extra_inputs() const override { + + virtual const std::map> & get_model_extra_inputs() const override { return m_model_extra_inputs; } - virtual const std::map>& get_model_extra_input_values() const { + + virtual const std::map> & get_model_extra_input_values() const { return m_model_extra_input_values; } - virtual const std::map>& get_model_weights() const override { + + virtual const std::map> & get_model_weights() const override { return m_model_weights; } - virtual const std::vector& get_model_output_names() const override { - return m_model_output_names; - } + + virtual const std::vector & get_model_output_names() const override { return m_model_output_names; } virtual int get_context_size() const override { return m_context_size; } @@ -114,7 +119,7 @@ public: virtual int get_head_size() const override { return m_head_size; } - virtual int32_t* get_rope_params() const override { return m_rope_params; } + virtual int32_t * get_rope_params() const override { return m_rope_params; } virtual std::map get_kv_param_res_names() const override; @@ -122,36 +127,39 @@ public: virtual bool is_first_token() const override { return m_is_first_token; } - ov::PartialShape 
get_graph_input_shape(const ggml_tensor* src) const; + ov::PartialShape get_graph_input_shape(const ggml_tensor * src) const; - static void dump_cgraph(const struct ggml_cgraph* cgraph, std::string& filename); + static void dump_cgraph(const ggml_cgraph * cgraph, std::string & filename); - static std::shared_ptr create_weight_node(ggml_tensor* tensor, + static std::shared_ptr create_weight_node(ggml_tensor * tensor, std::optional requant_type = std::nullopt); - static std::map> create_weight_nodes( - struct ggml_cgraph* cgraph, std::map types_to_requantize = {}); - const ggml_tensor* get_tensor_used_op(const ggml_tensor* tensor) const; - const ggml_tensor* get_tensor_from_name(const std::string& name) const; + static std::map> create_weight_nodes( + ggml_cgraph * cgraph, + std::map types_to_requantize = {}); + + const ggml_tensor * get_tensor_used_op(const ggml_tensor * tensor) const; + + const ggml_tensor * get_tensor_from_name(const std::string & name) const; void clear_model_weights() { m_model_weights.clear(); } private: - void set_input_output(ggml_tensor* node, bool naive = false); + void set_input_output(ggml_tensor * node, bool naive = false); void add_extra_inputs(); - static std::vector get_shape(const ggml_tensor* tensor); - static std::vector get_stride(const ggml_tensor* tensor); - static ov::element::Type get_ov_type(const ggml_tensor* tensor); + static std::vector get_shape(const ggml_tensor * tensor); + static std::vector get_stride(const ggml_tensor * tensor); + static ov::element::Type get_ov_type(const ggml_tensor * tensor); // set context_size, num_heads, etc void set_llm_params(); - struct ggml_cgraph* m_cgraph = nullptr; - ggml_tensor* m_node = nullptr; - std::vector m_nodes; - std::map m_inputs; + ggml_cgraph * m_cgraph = nullptr; + ggml_tensor * m_node = nullptr; + std::vector m_nodes; + std::map m_inputs; std::vector m_input_names; - std::map m_outputs; + std::map m_outputs; std::vector m_output_names; std::string m_op_name; mutable std::string m_name; @@ -168,12 +176,12 @@ private: int m_num_heads; int m_num_heads_kv; int m_head_size; - int32_t* m_rope_params; + int32_t * m_rope_params; std::vector m_kv_names; bool m_is_static = false; bool m_is_first_token; }; -void print_tensor_address_map(const struct ggml_cgraph* cgraph); +void print_tensor_address_map(const ggml_cgraph * cgraph); -int extract_layer_from_name(const std::string& name); +int extract_layer_from_name(const std::string & name); diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp index 75c2a76c54..c5acb1ea26 100644 --- a/ggml/src/ggml-openvino/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino.cpp @@ -1,5 +1,11 @@ #include "ggml-openvino.h" +#include "ggml-backend-impl.h" +#include "ggml-backend.h" +#include "ggml-impl.h" +#include "ggml-openvino/utils.h" +#include "ggml.h" + #include #include #include @@ -7,39 +13,36 @@ #include #include -#include "ggml-backend-impl.h" -#include "ggml-backend.h" -#include "ggml-impl.h" -#include "ggml-openvino/utils.h" -#include "ggml.h" - #define GGML_OPENVINO_MAX_STREAMS 8 struct ggml_backend_openvino_context { - int device; // the device ID currently in use - std::string name; // context Name - std::string description; // context description + int device; // the device ID currently in use + std::string name; // context Name + std::string description; // context description // OpenVINO core components - ov::Core core; // OpenVINO core interface - std::shared_ptr model; // compiled Model - ov::InferRequest 
infer_request; // inference Request + ov::Core core; // OpenVINO core interface + std::shared_ptr model; // compiled Model + ov::InferRequest infer_request; // inference Request // OpenVINO Multi-stream support - static const int MAX_STREAMS = 8; // define the maximum number of flows - std::vector streams; // used to support multi-stream reasoning - int current_stream; // the currently active stream index + static const int MAX_STREAMS = 8; // define the maximum number of flows + std::vector streams; // used to support multi-stream reasoning + int current_stream; // the currently active stream index // state Management - bool is_initialized; // initialize + bool is_initialized; // initialize - ggml_backend_openvino_context() - : device(0), name("OpenVINO"), description("OpenVINO Backend Context"), - current_stream(0), is_initialized(false) {} + ggml_backend_openvino_context() : + device(0), + name("OpenVINO"), + description("OpenVINO Backend Context"), + current_stream(0), + is_initialized(false) {} }; static void ggml_backend_openvino_free(ggml_backend_t backend) { - ggml_backend_openvino_context * ctx = (ggml_backend_openvino_context *)backend->context; + ggml_backend_openvino_context * ctx = (ggml_backend_openvino_context *) backend->context; delete ctx; delete backend; } @@ -49,8 +52,7 @@ static const char * ggml_backend_openvino_get_name(ggml_backend_t backend) { GGML_UNUSED(backend); } -static enum ggml_status -ggml_backend_openvino_graph_compute(ggml_backend_t backend, struct ggml_cgraph *cgraph) { +static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { openvino_frontend_compute(backend, cgraph); return GGML_STATUS_SUCCESS; @@ -78,7 +80,8 @@ int ggml_backend_openvino_get_device_count() { } static ggml_guid_t ggml_backend_openvino_guid(void) { - static ggml_guid guid = { 0x12, 0xa8, 0xae, 0xf4, 0xc0, 0x1e, 0x61, 0x97, 0x8f, 0xeb, 0x33, 0x04, 0xa1, 0x33, 0x51, 0x2d }; + static ggml_guid guid = {0x12, 0xa8, 0xae, 0xf4, 0xc0, 0x1e, 0x61, 0x97, + 0x8f, 0xeb, 0x33, 0x04, 0xa1, 0x33, 0x51, 0x2d}; return &guid; } @@ -95,7 +98,7 @@ GGML_BACKEND_API ggml_backend_t ggml_backend_openvino_init(int device) { return nullptr; } - ggml_backend_t openvino_backend = new ggml_backend { + ggml_backend_t openvino_backend = new ggml_backend{ /* .guid = */ ggml_backend_openvino_guid(), /* .interface = */ ggml_backend_openvino_interface, /* .device = */ ggml_backend_reg_dev_get(ggml_backend_openvino_reg(), device), @@ -134,15 +137,15 @@ struct ggml_backend_openvino_buffer_type_context { }; static const char * ggml_backend_openvino_buffer_type_get_name(ggml_backend_buffer_type_t buft) { - ggml_backend_openvino_buffer_type_context * ctx = (ggml_backend_openvino_buffer_type_context *)buft->context; + ggml_backend_openvino_buffer_type_context * ctx = (ggml_backend_openvino_buffer_type_context *) buft->context; return ctx->name.c_str(); } + static bool ggml_backend_buft_is_openvino(ggml_backend_buffer_type_t buft) { return buft->iface.get_name == ggml_backend_openvino_buffer_type_get_name; } - static const char * ggml_backend_openvino_split_buffer_type_get_name(ggml_backend_buffer_type_t buft) { return GGML_OPENVINO_NAME "_Split"; @@ -160,12 +163,12 @@ struct ggml_backend_openvino_device_context { }; static const char * ggml_backend_openvino_device_get_name(ggml_backend_dev_t dev) { - ggml_backend_openvino_device_context * ctx = (ggml_backend_openvino_device_context *)dev->context; + ggml_backend_openvino_device_context * ctx = 
(ggml_backend_openvino_device_context *) dev->context; return ctx->name.c_str(); } static const char * ggml_backend_openvino_device_get_description(ggml_backend_dev_t dev) { - ggml_backend_openvino_device_context * ctx = (ggml_backend_openvino_device_context *)dev->context; + ggml_backend_openvino_device_context * ctx = (ggml_backend_openvino_device_context *) dev->context; return ctx->description.c_str(); } @@ -174,7 +177,7 @@ static void ggml_backend_openvino_device_get_memory(ggml_backend_dev_t dev, size GGML_ASSERT(dev->context != nullptr); GGML_ASSERT(free != nullptr); GGML_ASSERT(total != nullptr); - ggml_backend_openvino_device_context * ctx = (ggml_backend_openvino_device_context *)dev->context; + ggml_backend_openvino_device_context * ctx = (ggml_backend_openvino_device_context *) dev->context; GGML_ASSERT(ctx->device >= 0); // ggml_openvino_set_device(ctx->device); *total = 1; @@ -187,9 +190,9 @@ static enum ggml_backend_dev_type ggml_backend_openvino_device_get_type(ggml_bac } static void ggml_backend_openvino_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) { - props->name = ggml_backend_openvino_device_get_name(dev); + props->name = ggml_backend_openvino_device_get_name(dev); props->description = ggml_backend_openvino_device_get_description(dev); - props->type = ggml_backend_openvino_device_get_type(dev); + props->type = ggml_backend_openvino_device_get_type(dev); ggml_backend_openvino_device_get_memory(dev, &props->memory_free, &props->memory_total); bool host_buffer = getenv("GGML_OPENVINO_NO_PINNED") == nullptr; @@ -209,12 +212,12 @@ static void ggml_backend_openvino_device_get_props(ggml_backend_dev_t dev, ggml_ static ggml_backend_t ggml_backend_openvino_device_init(ggml_backend_dev_t dev, const char * params) { GGML_UNUSED(params); - ggml_backend_openvino_device_context * ctx = (ggml_backend_openvino_device_context *)dev->context; + ggml_backend_openvino_device_context * ctx = (ggml_backend_openvino_device_context *) dev->context; return ggml_backend_openvino_init(ctx->device); } static ggml_backend_buffer_type_t ggml_backend_openvino_device_get_buffer_type(ggml_backend_dev_t dev) { - ggml_backend_openvino_device_context * ctx = (ggml_backend_openvino_device_context *)dev->context; + ggml_backend_openvino_device_context * ctx = (ggml_backend_openvino_device_context *) dev->context; return ggml_backend_openvino_buffer_type(ctx->device); } @@ -223,7 +226,10 @@ static ggml_backend_buffer_type_t ggml_backend_openvino_device_get_host_buffer_t return ggml_backend_openvino_host_buffer_type(); } -static ggml_backend_buffer_t ggml_backend_openvino_device_buffer_from_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) { +static ggml_backend_buffer_t ggml_backend_openvino_device_buffer_from_ptr(ggml_backend_dev_t dev, + void * ptr, + size_t size, + size_t max_tensor_size) { GGML_UNUSED(dev); GGML_UNUSED(ptr); GGML_UNUSED(size); @@ -231,7 +237,10 @@ static ggml_backend_buffer_t ggml_backend_openvino_device_buffer_from_ptr(ggml_b return nullptr; } -static ggml_backend_buffer_t ggml_backend_openvino_device_buffer_from_host_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) { +static ggml_backend_buffer_t ggml_backend_openvino_device_buffer_from_host_ptr(ggml_backend_dev_t dev, + void * ptr, + size_t size, + size_t max_tensor_size) { GGML_UNUSED(dev); GGML_UNUSED(ptr); GGML_UNUSED(size); @@ -239,7 +248,7 @@ static ggml_backend_buffer_t ggml_backend_openvino_device_buffer_from_host_ptr(g return nullptr; } -static 
bool is_op_unsupported_case(const ggml_tensor* op) { +static bool is_op_unsupported_case(const ggml_tensor * op) { switch (op->op) { case GGML_OP_SOFT_MAX: { if (op->src[2] != nullptr) { @@ -248,9 +257,9 @@ static bool is_op_unsupported_case(const ggml_tensor* op) { } float scale = 1.0f; float max_bias = 0.0f; - const auto* op_params = op->op_params; - memcpy(&scale, (const float*) op_params + 0, sizeof(float)); - memcpy(&max_bias, (const float*) op_params + 1, sizeof(float)); + const auto * op_params = op->op_params; + memcpy(&scale, (const float *) op_params + 0, sizeof(float)); + memcpy(&max_bias, (const float *) op_params + 1, sizeof(float)); if (max_bias > 0) { GGML_LOG_WARN("OpenVINO backend does not support SOFT_MAX with max_bias > 0\n"); return true; @@ -265,10 +274,10 @@ static bool is_op_unsupported_case(const ggml_tensor* op) { float scale = 1.0f; float max_bias = 0.0f; float logit_softcap = 0.0f; - const auto* op_params = op->op_params; - memcpy(&scale, (const float*) op_params + 0, sizeof(float)); - memcpy(&max_bias, (const float*) op_params + 1, sizeof(float)); - memcpy(&logit_softcap, (const float*) op_params + 2, sizeof(float)); + const auto * op_params = op->op_params; + memcpy(&scale, (const float *) op_params + 0, sizeof(float)); + memcpy(&max_bias, (const float *) op_params + 1, sizeof(float)); + memcpy(&logit_softcap, (const float *) op_params + 2, sizeof(float)); if (max_bias > 0) { GGML_LOG_WARN("OpenVINO backend does not support FLASH_ATTN_EXT with max_bias > 0\n"); return true; @@ -303,7 +312,7 @@ static bool is_op_unsupported_case(const ggml_tensor* op) { break; } case GGML_OP_ROPE: { - const int32_t* op_params = op->op_params; + const int32_t * op_params = op->op_params; const int n_dims = op_params[1]; const int mode = op_params[2]; if (mode == GGML_ROPE_TYPE_MROPE || mode == GGML_ROPE_TYPE_VISION) { @@ -311,8 +320,7 @@ static bool is_op_unsupported_case(const ggml_tensor* op) { return true; } if (n_dims != 0.0f && n_dims != op->src[0]->ne[0]) { - GGML_LOG_WARN("OpenVINO backend does not support ROPE with n_dims %d != src[0]->ne[0] %ld\n", - n_dims, + GGML_LOG_WARN("OpenVINO backend does not support ROPE with n_dims %d != src[0]->ne[0] %ld\n", n_dims, op->src[0]->ne[0]); return true; } @@ -333,8 +341,7 @@ static bool is_op_unsupported_case(const ggml_tensor* op) { GGML_LOG_WARN( "OpenVINO backend does not support ROPE with src[0]->view_src->ne[1] %ld != src[0]->ne[2] " "%ld\n", - op->src[0]->view_src->ne[1], - op->src[0]->ne[2]); + op->src[0]->view_src->ne[1], op->src[0]->ne[2]); return true; } } @@ -346,39 +353,19 @@ static bool is_op_unsupported_case(const ggml_tensor* op) { return false; } -static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor* op) { +static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) { GGML_ASSERT(dev->reg != nullptr); - static std::set supported_types{GGML_TYPE_F32, - GGML_TYPE_F16, - GGML_TYPE_BF16, - GGML_TYPE_I64, - GGML_TYPE_I32, - GGML_TYPE_Q4_0, - GGML_TYPE_Q4_1, - GGML_TYPE_Q4_K, - GGML_TYPE_Q5_K, - GGML_TYPE_Q8_0, - GGML_TYPE_Q6_K}; + static std::set supported_types{GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_BF16, GGML_TYPE_I64, + GGML_TYPE_I32, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1, GGML_TYPE_Q4_K, + GGML_TYPE_Q5_K, GGML_TYPE_Q8_0, GGML_TYPE_Q6_K}; - static const std::set supported_ops{GGML_OP_NONE, - GGML_OP_ADD, - GGML_OP_MUL, - GGML_OP_MUL_MAT, - GGML_OP_VIEW, - GGML_OP_CONT, - GGML_OP_RESHAPE, - GGML_OP_PERMUTE, - GGML_OP_TRANSPOSE, - 
GGML_OP_GET_ROWS, - GGML_OP_ROPE, - GGML_OP_RMS_NORM, - GGML_OP_SCALE, + static const std::set supported_ops{GGML_OP_NONE, GGML_OP_ADD, GGML_OP_MUL, GGML_OP_MUL_MAT, GGML_OP_VIEW, + GGML_OP_CONT, GGML_OP_RESHAPE, GGML_OP_PERMUTE, GGML_OP_TRANSPOSE, + GGML_OP_GET_ROWS, GGML_OP_ROPE, GGML_OP_RMS_NORM, GGML_OP_SCALE, // softmax is not updated due to replaced by flash_attn_ext // GGML_OP_SOFT_MAX, - GGML_OP_SET_ROWS, - GGML_OP_FLASH_ATTN_EXT, - GGML_OP_CPY}; + GGML_OP_SET_ROWS, GGML_OP_FLASH_ATTN_EXT, GGML_OP_CPY}; static const std::set supported_unary_ops{ GGML_UNARY_OP_SILU, }; @@ -422,7 +409,7 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con return false; } for (int i = 0; i < GGML_MAX_SRC; i++) { - auto* src = op->src[i]; + auto * src = op->src[i]; if (src == nullptr) { break; } @@ -483,13 +470,13 @@ static size_t ggml_backend_openvino_reg_get_device_count(ggml_backend_reg_t reg) GGML_UNUSED(reg); // TODO - ggml_backend_openvino_reg_context * ctx = (ggml_backend_openvino_reg_context *)reg->context; + ggml_backend_openvino_reg_context * ctx = (ggml_backend_openvino_reg_context *) reg->context; return ctx->devices.size(); } static ggml_backend_dev_t ggml_backend_openvino_reg_get_device(ggml_backend_reg_t reg, size_t index) { - ggml_backend_openvino_reg_context * ctx = (ggml_backend_openvino_reg_context *)reg->context; + ggml_backend_openvino_reg_context * ctx = (ggml_backend_openvino_reg_context *) reg->context; GGML_ASSERT(index < ctx->devices.size()); return ctx->devices[index]; // GGML_ASSERT(index == 0); @@ -509,7 +496,7 @@ static ggml_backend_dev_t ggml_backend_openvino_reg_get_device(ggml_backend_reg_ static void * ggml_backend_openvino_get_proc_address(ggml_backend_reg_t reg, const char * name) { GGML_UNUSED(reg); if (strcmp(name, "ggml_backend_split_buffer_type") == 0) { - return (void *)ggml_backend_openvino_split_buffer_type; + return (void *) ggml_backend_openvino_split_buffer_type; } // if (strcmp(name, "ggml_backend_register_host_buffer") == 0) { // return (void *)ggml_backend_openvino_register_host_buffer; @@ -565,17 +552,16 @@ GGML_BACKEND_API ggml_backend_reg_t ggml_backend_openvino_reg(void) { // ggml_openvino_set_device(i); dev_ctx->description = ov::get_openvino_version().description; - ggml_backend_dev_t dev = new ggml_backend_device { - /* .interface = */ ggml_backend_openvino_device_interface, - /* .reg = */ ®, - /* .context = */ dev_ctx - }; + ggml_backend_dev_t dev = + new ggml_backend_device{/* .interface = */ ggml_backend_openvino_device_interface, + /* .reg = */ ®, + /* .context = */ dev_ctx}; ctx->devices.push_back(dev); } - reg = ggml_backend_reg{ /* .api_version = */ GGML_BACKEND_API_VERSION, - /* .iface = */ ggml_backend_openvino_reg_interface, - /* .context = */ ctx }; + reg = ggml_backend_reg{/* .api_version = */ GGML_BACKEND_API_VERSION, + /* .iface = */ ggml_backend_openvino_reg_interface, + /* .context = */ ctx}; } initialized = true; diff --git a/ggml/src/ggml-openvino/ggml-quants.cpp b/ggml/src/ggml-openvino/ggml-quants.cpp index 017d2ad28c..2076c3c75d 100644 --- a/ggml/src/ggml-openvino/ggml-quants.cpp +++ b/ggml/src/ggml-openvino/ggml-quants.cpp @@ -1,5 +1,9 @@ #include "ggml-quants.hpp" +#include "ggml-common.h" +#include "ggml-impl.h" +#include "ggml.h" + #include #include #include @@ -24,11 +28,7 @@ #include #include -#include "ggml-common.h" -#include "ggml-impl.h" -#include "ggml.h" - -void unpack_32_4(const uint8_t* data, uint8_t* dst) { +void unpack_32_4(const uint8_t * data, uint8_t * dst) { std::fill_n(dst, 
16, 0); for (int j = 0; j < 16; ++j) { uint8_t x = (data[j] & 0x0F); @@ -44,18 +44,19 @@ void unpack_32_4(const uint8_t* data, uint8_t* dst) { // Extracts (weight, scales, biases) from Q4_0 tensors. // Data layout is: |16 bit scale|32 x 4bit weights|. -void extract_q4_0_data(const ggml_tensor* tensor, - ov::Tensor& weights_arr, - ov::Tensor& scales_arr, - ov::Tensor& biases_arr) { +void extract_q4_0_data(const ggml_tensor * tensor, + ov::Tensor & weights_arr, + ov::Tensor & scales_arr, + ov::Tensor & biases_arr) { const uint64_t bytes_per_block = 18; // 2 bytes scale, 32x0.5 byte weights - auto* data = static_cast(tensor->data); - auto* weights = static_cast(weights_arr.data()); - auto* scales = scales_arr.data::value_type>(); - auto* biases = biases_arr.data::value_type>(); + + auto * data = static_cast(tensor->data); + auto * weights = static_cast(weights_arr.data()); + auto * scales = scales_arr.data::value_type>(); + auto * biases = biases_arr.data::value_type>(); ov::parallel_for(scales_arr.get_size(), [&](size_t i) { - scales[i] = ov::float16::from_bits(*((uint16_t*)(data + i * bytes_per_block))); + scales[i] = ov::float16::from_bits(*((uint16_t *) (data + i * bytes_per_block))); biases[i] = ov::float16(-8.f * static_cast(scales[i])); unpack_32_4(data + i * bytes_per_block + 2, weights + i * 16); }); @@ -63,38 +64,40 @@ void extract_q4_0_data(const ggml_tensor* tensor, // Extracts (weight, scales, biases) from Q4_1 tensors. // Data layout is: |16 bit scale|16 bit bias|32 x 4bit weights|. -void extract_q4_1_data(const ggml_tensor* tensor, - ov::Tensor& weights_arr, - ov::Tensor& scales_arr, - ov::Tensor& biases_arr) { +void extract_q4_1_data(const ggml_tensor * tensor, + ov::Tensor & weights_arr, + ov::Tensor & scales_arr, + ov::Tensor & biases_arr) { const uint64_t bytes_per_block = 20; // 2 bytes scale, 2 bytes bias, 32x0.5 byte weights - auto* data = static_cast(tensor->data); - auto* weights = static_cast(weights_arr.data()); - auto* scales = scales_arr.data::value_type>(); - auto* biases = biases_arr.data::value_type>(); + + auto * data = static_cast(tensor->data); + auto * weights = static_cast(weights_arr.data()); + auto * scales = scales_arr.data::value_type>(); + auto * biases = biases_arr.data::value_type>(); ov::parallel_for(scales_arr.get_size(), [&](size_t i) { - scales[i] = ov::float16::from_bits(*((uint16_t*)(data + i * bytes_per_block))); - biases[i] = ov::float16::from_bits(*((uint16_t*)(data + i * bytes_per_block + 2))); + scales[i] = ov::float16::from_bits(*((uint16_t *) (data + i * bytes_per_block))); + biases[i] = ov::float16::from_bits(*((uint16_t *) (data + i * bytes_per_block + 2))); unpack_32_4(data + i * bytes_per_block + 4, weights + i * 16); }); } // Extracts (weight, scales, biases) from Q8_0 tensors. // Data layout is: |16 bit scale|32 x 8bit weights|. 
-void extract_q8_0_data(const ggml_tensor* tensor, - ov::Tensor& weights_arr, - ov::Tensor& scales_arr, - ov::Tensor& biases_arr) { +void extract_q8_0_data(const ggml_tensor * tensor, + ov::Tensor & weights_arr, + ov::Tensor & scales_arr, + ov::Tensor & biases_arr) { const uint64_t weights_per_block = 32; const uint64_t bytes_per_block = 34; // 2 bytes scale, 32x1 byte weights - auto* data = static_cast(tensor->data); - auto* weights = static_cast(weights_arr.data()); - auto* scales = scales_arr.data::value_type>(); - auto* biases = biases_arr.data::value_type>(); + + auto * data = static_cast(tensor->data); + auto * weights = static_cast(weights_arr.data()); + auto * scales = scales_arr.data::value_type>(); + auto * biases = biases_arr.data::value_type>(); ov::parallel_for(scales_arr.get_size(), [&](size_t i) { - uint8_t* block_data = data + i * bytes_per_block; - scales[i] = ov::float16::from_bits(*(uint16_t*) block_data); + uint8_t * block_data = data + i * bytes_per_block; + scales[i] = ov::float16::from_bits(*(uint16_t *) block_data); biases[i] = ov::float16(-128.f * static_cast(scales[i])); for (size_t j = 0; j < weights_per_block; ++j) { uint8_t x = block_data[j + 2]; // j+2 to skip the scale bytes. @@ -105,7 +108,7 @@ void extract_q8_0_data(const ggml_tensor* tensor, }); } -void unpack_256_4(const uint8_t* data, uint8_t* dst) { +void unpack_256_4(const uint8_t * data, uint8_t * dst) { // Initialize the output array with zeros std::fill_n(dst, 128, 0); @@ -123,26 +126,27 @@ void unpack_256_4(const uint8_t* data, uint8_t* dst) { } } -void extract_q4_k_data(const ggml_tensor* tensor, - ov::Tensor& weights_arr, - ov::Tensor& scales_arr, - ov::Tensor& biases_arr) { +void extract_q4_k_data(const ggml_tensor * tensor, + ov::Tensor & weights_arr, + ov::Tensor & scales_arr, + ov::Tensor & biases_arr) { const uint64_t bytes_per_block = 2 + 2 + 12 + 128; const uint64_t n_super_block = tensor->nb[3] / bytes_per_block; - auto* data = static_cast(tensor->data); - auto* weights = static_cast(weights_arr.data()); - auto* scales = scales_arr.data::value_type>(); - auto* biases = biases_arr.data::value_type>(); + + auto * data = static_cast(tensor->data); + auto * weights = static_cast(weights_arr.data()); + auto * scales = scales_arr.data::value_type>(); + auto * biases = biases_arr.data::value_type>(); ov::parallel_for(n_super_block, [&](size_t i) { - uint8_t* block_data = data + i * bytes_per_block; + uint8_t * block_data = data + i * bytes_per_block; // Extract scale factors and offsets - float scale_scales = static_cast(ov::float16::from_bits(*((uint16_t*)block_data))); - float scale_biases = static_cast(ov::float16::from_bits(*((uint16_t*)block_data + 1))); + float scale_scales = static_cast(ov::float16::from_bits(*((uint16_t *) block_data))); + float scale_biases = static_cast(ov::float16::from_bits(*((uint16_t *) block_data + 1))); // Extract qs1 and qs2 - uint8_t* qs1 = block_data + 4; + uint8_t * qs1 = block_data + 4; // uint8_t* qs2 = block_data + 16; scales[i * 8] = ov::float16(scale_scales * static_cast((*(qs1) & 0b111111))); @@ -174,31 +178,32 @@ void extract_q4_k_data(const ggml_tensor* tensor, }); } -void extract_q6_k_data(const ggml_tensor* tensor, - ov::Tensor& weights_arr, - ov::Tensor& scales_arr, - ov::Tensor& biases_arr) { +void extract_q6_k_data(const ggml_tensor * tensor, + ov::Tensor & weights_arr, + ov::Tensor & scales_arr, + ov::Tensor & biases_arr) { const uint64_t bytes_per_block = 128 + 64 + 16 + 2; const uint64_t n_super_block = tensor->nb[3] / bytes_per_block; - auto* 
data = static_cast(tensor->data); - auto* weights = static_cast(weights_arr.data()); - auto* scales = scales_arr.data::value_type>(); - auto* biases = biases_arr.data::value_type>(); + + auto * data = static_cast(tensor->data); + auto * weights = static_cast(weights_arr.data()); + auto * scales = scales_arr.data::value_type>(); + auto * biases = biases_arr.data::value_type>(); ov::parallel_for(n_super_block, [&](size_t i) { - uint8_t* block_data = data + i * bytes_per_block; + uint8_t * block_data = data + i * bytes_per_block; float scale_factor = - static_cast(ov::float16::from_bits(*((uint16_t*) block_data + 104))); // (128+64+16)/2 + static_cast(ov::float16::from_bits(*((uint16_t *) block_data + 104))); // (128+64+16)/2 for (size_t j = 0; j < 16; j++) { scales[j + i * 16] = - ov::float16(scale_factor * static_cast(*((int8_t*) (block_data + 128 + 64 + j)))); + ov::float16(scale_factor * static_cast(*((int8_t *) (block_data + 128 + 64 + j)))); biases[j + i * 16] = ov::float16(-32.f * static_cast(scales[j + i * 16])); } - uint8_t* ql = block_data; - uint8_t* qh = block_data + 128; + uint8_t * ql = block_data; + uint8_t * qh = block_data + 128; for (int64_t j = 0; j < 32; ++j) { weights[i * 256 + j] = (ql[j] & 0xF) | (((qh[j] >> 0) & 3) << 4); @@ -213,7 +218,7 @@ void extract_q6_k_data(const ggml_tensor* tensor, }); } -static inline void get_scale_min_k4(int j, const uint8_t* q, uint8_t* d, uint8_t* m) { +static inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t * d, uint8_t * m) { if (j < 4) { *d = q[j] & 63; *m = q[j + 4] & 63; @@ -223,24 +228,27 @@ static inline void get_scale_min_k4(int j, const uint8_t* q, uint8_t* d, uint8_t } } -void extract_q5_k_data(const ggml_tensor* tensor, ov::Tensor& weights_arr, ov::Tensor& scales_arr, - ov::Tensor& biases_arr) { +void extract_q5_k_data(const ggml_tensor * tensor, + ov::Tensor & weights_arr, + ov::Tensor & scales_arr, + ov::Tensor & biases_arr) { const uint64_t bytes_per_block = 4 + 12 + 32 + 128; const uint64_t n_super_block = tensor->nb[3] / bytes_per_block; - auto* data = static_cast(tensor->data); - auto* weights = static_cast(weights_arr.data()); - auto* scales = scales_arr.data::value_type>(); - auto* biases = biases_arr.data::value_type>(); + + auto * data = static_cast(tensor->data); + auto * weights = static_cast(weights_arr.data()); + auto * scales = scales_arr.data::value_type>(); + auto * biases = biases_arr.data::value_type>(); ov::parallel_for(n_super_block, [&](size_t i) { - uint8_t* block_data = data + i * bytes_per_block; + uint8_t * block_data = data + i * bytes_per_block; - const float d = static_cast(ov::float16::from_bits(*((uint16_t*) block_data))); - const float min = static_cast(ov::float16::from_bits(*((uint16_t*) block_data + 1))); + const float d = static_cast(ov::float16::from_bits(*((uint16_t *) block_data))); + const float min = static_cast(ov::float16::from_bits(*((uint16_t *) block_data + 1))); - const uint8_t* scales_data = block_data + 4; // 12 bytes of scales - const uint8_t* qh = block_data + 4 + 12; // 32 bytes of high bits - const uint8_t* ql = block_data + 4 + 12 + 32; // 128 bytes of low bits + const uint8_t * scales_data = block_data + 4; // 12 bytes of scales + const uint8_t * qh = block_data + 4 + 12; // 32 bytes of high bits + const uint8_t * ql = block_data + 4 + 12 + 32; // 128 bytes of low bits int is = 0; uint8_t u1 = 1; @@ -286,7 +294,10 @@ void extract_q5_k_data(const ggml_tensor* tensor, ov::Tensor& weights_arr, ov::T // TODO Reorder for make_intX_weights -ov::Output 
make_int8_weights(ov::Tensor& weight, ov::Tensor& scales, ov::Tensor& biases, size_t group_size) { +ov::Output make_int8_weights(ov::Tensor & weight, + ov::Tensor & scales, + ov::Tensor & biases, + size_t group_size) { ov::Shape orig_shape = weight.get_shape(); // Expand dimensions for scales and biases @@ -303,18 +314,19 @@ ov::Output make_int8_weights(ov::Tensor& weight, ov::Tensor& scales, o } // Create graph nodes - auto weights_node = std::make_shared( - ov::element::u8, packed_shape, static_cast(weight.data()), nullptr); + auto weights_node = std::make_shared(ov::element::u8, packed_shape, + static_cast(weight.data()), nullptr); weights_node->get_rt_info()["__gguf_tensor_holder"] = weight; auto scales_f16 = std::make_shared(scales); ov::Tensor biases_u8(ov::element::u8, scale_shape); // Calculate zero point - const ov::float16* bias_data = biases.data::value_type>(); - const ov::float16* scale_data = scales.data::value_type>(); - uint8_t* bias_u8_data = biases_u8.data(); + const ov::float16 * bias_data = biases.data::value_type>(); + const ov::float16 * scale_data = scales.data::value_type>(); + uint8_t * bias_u8_data = biases_u8.data(); for (size_t i = 0; i < biases_u8.get_size(); ++i) { - bias_u8_data[i] = (uint8_t)std::round(-1.f * static_cast(bias_data[i]) / static_cast(scale_data[i])); + bias_u8_data[i] = + (uint8_t) std::round(-1.f * static_cast(bias_data[i]) / static_cast(scale_data[i])); } auto zero_point = std::make_shared(biases_u8); @@ -327,9 +339,7 @@ ov::Output make_int8_weights(ov::Tensor& weight, ov::Tensor& scales, o auto weights_f16 = std::make_shared(weights_node, ov::element::f16); auto zero_point_f16 = std::make_shared(zero_point, ov::element::f16); - auto w_zp = std::make_shared( - weights_f16, zero_point_f16, ov::op::AutoBroadcastType::NUMPY - ); + auto w_zp = std::make_shared(weights_f16, zero_point_f16, ov::op::AutoBroadcastType::NUMPY); ov::Output w_zp_s = std::make_shared(w_zp, scales_f16, ov::op::AutoBroadcastType::NUMPY); @@ -343,18 +353,17 @@ ov::Output make_int8_weights(ov::Tensor& weight, ov::Tensor& scales, o return std::make_shared(w_zp_s, ov::element::f32); } -ov::Output make_int4_weights(ov::Tensor& weight, ov::Tensor& scales, ov::Tensor& biases, size_t group_size) { +ov::Output make_int4_weights(ov::Tensor & weight, + ov::Tensor & scales, + ov::Tensor & biases, + size_t group_size) { ov::Shape orig_weight_shape = weight.get_shape(); // Expand dimensions for scales and biases ov::Shape scale_bias_shape = scales.get_shape(); // Create INT4 weight tensor - ov::Shape packed_shape = { - orig_weight_shape[0], - orig_weight_shape[1] / group_size, - group_size - }; + ov::Shape packed_shape = {orig_weight_shape[0], orig_weight_shape[1] / group_size, group_size}; // Requantized channel-wise case if (packed_shape[1] == 1) { @@ -365,18 +374,21 @@ ov::Output make_int4_weights(ov::Tensor& weight, ov::Tensor& scales, o biases.set_shape(scale_bias_shape); } - auto weights_node = std::make_shared(ov::element::u4, packed_shape, static_cast(weight.data()), nullptr); + auto weights_node = std::make_shared(ov::element::u4, packed_shape, + static_cast(weight.data()), nullptr); weights_node->get_rt_info()["__gguf_tensor_holder"] = weight; auto weights_f16 = std::make_shared(weights_node, ov::element::f16); // Pack zero points: two subsequent values into one - const ov::float16* bias_data = biases.data::value_type>(); - const ov::float16* scale_data = scales.data::value_type>(); + const ov::float16 * bias_data = biases.data::value_type>(); + const ov::float16 * scale_data 
= scales.data::value_type>(); ov::Tensor zero_point_tensor(ov::element::u4, scale_bias_shape); - uint8_t* zero_point_data = static_cast(zero_point_tensor.data()); + uint8_t * zero_point_data = static_cast(zero_point_tensor.data()); for (size_t i = 0; i < zero_point_tensor.get_byte_size(); ++i) { - uint8_t bias1 = (uint8_t)std::round(-1.f * static_cast(bias_data[i * 2]) / static_cast(scale_data[i * 2])); - uint8_t bias2 = (uint8_t)std::round(-1.f * static_cast(bias_data[i * 2 + 1]) / static_cast(scale_data[i * 2 + 1])); + uint8_t bias1 = + (uint8_t) std::round(-1.f * static_cast(bias_data[i * 2]) / static_cast(scale_data[i * 2])); + uint8_t bias2 = (uint8_t) std::round(-1.f * static_cast(bias_data[i * 2 + 1]) / + static_cast(scale_data[i * 2 + 1])); zero_point_data[i] = (bias2 << 4) | (bias1 & 0x0F); } @@ -390,16 +402,15 @@ ov::Output make_int4_weights(ov::Tensor& weight, ov::Tensor& scales, o auto scales_f16 = std::make_shared(scales); // Perform dequantization - auto w_zp = std::make_shared( - weights_f16, zero_points_f16, ov::op::AutoBroadcastType::NUMPY); + auto w_zp = std::make_shared(weights_f16, zero_points_f16, ov::op::AutoBroadcastType::NUMPY); ov::Output w_zp_s = std::make_shared(w_zp, scales_f16, ov::op::AutoBroadcastType::NUMPY); if (packed_shape.size() != 2) { // If not requantized channel-wise case, reshape back to original shape - auto final_shape = std::make_shared( - ov::element::i64, ov::Shape{orig_weight_shape.size()}, orig_weight_shape); + auto final_shape = std::make_shared(ov::element::i64, ov::Shape{orig_weight_shape.size()}, + orig_weight_shape); w_zp_s = std::make_shared(w_zp_s, final_shape, false); } @@ -407,7 +418,7 @@ ov::Output make_int4_weights(ov::Tensor& weight, ov::Tensor& scales, o return std::make_shared(w_zp_s, ov::element::f32); } -std::shared_ptr requantize(const ggml_tensor* tensor, ExtraQuantType requant_type) { +std::shared_ptr requantize(const ggml_tensor * tensor, ExtraQuantType requant_type) { std::vector weights_f32(tensor->ne[0] * tensor->ne[1]); ggml_get_type_traits(tensor->type)->to_float(tensor->data, weights_f32.data(), ggml_nelements(tensor)); @@ -459,14 +470,18 @@ std::shared_ptr requantize(const ggml_tensor* tensor, ExtraQuantType r return weight_node; } -void quantize_q4_0(const float* x, ov::Tensor& weights_arr, ov::Tensor& scales_arr, ov::Tensor& biases_arr, int64_t k, +void quantize_q4_0(const float * x, + ov::Tensor & weights_arr, + ov::Tensor & scales_arr, + ov::Tensor & biases_arr, + int64_t k, int64_t qk) { assert(k % qk == 0); const int nb = k / qk; - auto* weights = static_cast(weights_arr.data()); - auto* scales = scales_arr.data::value_type>(); - auto* biases = biases_arr.data::value_type>(); + auto * weights = static_cast(weights_arr.data()); + auto * scales = scales_arr.data::value_type>(); + auto * biases = biases_arr.data::value_type>(); for (int i = 0; i < nb; i++) { float amax = 0.0f; // absolute max float max = 0.0f; @@ -503,14 +518,18 @@ void quantize_q4_0(const float* x, ov::Tensor& weights_arr, ov::Tensor& scales_a } } -void quantize_q8_0(const float* x, ov::Tensor& weights_arr, ov::Tensor& scales_arr, ov::Tensor& biases_arr, int64_t k, +void quantize_q8_0(const float * x, + ov::Tensor & weights_arr, + ov::Tensor & scales_arr, + ov::Tensor & biases_arr, + int64_t k, int64_t qk) { assert(k % qk == 0); const int nb = k / qk; - auto* weights = static_cast(weights_arr.data()); - auto* scales = scales_arr.data::value_type>(); - auto* biases = biases_arr.data::value_type>(); + auto * weights = 
static_cast(weights_arr.data()); + auto * scales = scales_arr.data::value_type>(); + auto * biases = biases_arr.data::value_type>(); for (int i = 0; i < nb; i++) { float amax = 0.0f; // absolute max @@ -534,14 +553,18 @@ void quantize_q8_0(const float* x, ov::Tensor& weights_arr, ov::Tensor& scales_a } } -void quantize_q8_1(const float* x, ov::Tensor& weights_arr, ov::Tensor& scales_arr, ov::Tensor& biases_arr, int64_t k, +void quantize_q8_1(const float * x, + ov::Tensor & weights_arr, + ov::Tensor & scales_arr, + ov::Tensor & biases_arr, + int64_t k, int64_t qk) { assert(k % qk == 0); const int nb = k / qk; - auto* weights = static_cast(weights_arr.data()); - auto* scales = scales_arr.data::value_type>(); - auto* biases = biases_arr.data::value_type>(); + auto * weights = static_cast(weights_arr.data()); + auto * scales = scales_arr.data::value_type>(); + auto * biases = biases_arr.data::value_type>(); for (int i = 0; i < nb; i++) { float min = std::numeric_limits::max(); float max = std::numeric_limits::lowest(); diff --git a/ggml/src/ggml-openvino/openvino/frontend.cpp b/ggml/src/ggml-openvino/openvino/frontend.cpp index dbdae1ed45..27d10d71c1 100644 --- a/ggml/src/ggml-openvino/openvino/frontend.cpp +++ b/ggml/src/ggml-openvino/openvino/frontend.cpp @@ -10,11 +10,11 @@ namespace ggml { FrontEnd::FrontEnd() {} -std::shared_ptr FrontEnd::convert(const InputModel::Ptr& model, bool naive) { +std::shared_ptr FrontEnd::convert(const InputModel::Ptr & model, bool naive) { auto ggml_model = std::dynamic_pointer_cast(model); FRONT_END_GENERAL_CHECK(ggml_model, "Invalid input model"); std::shared_ptr converted_model; - const auto& supported_ops = get_supported_ops(); + const auto & supported_ops = get_supported_ops(); { TranslateSession translate_session(model, supported_ops, naive); converted_model = translate_session.get_converted_model(); diff --git a/ggml/src/ggml-openvino/openvino/input_model.cpp b/ggml/src/ggml-openvino/openvino/input_model.cpp index 5fb16ea2db..0f66270a5e 100644 --- a/ggml/src/ggml-openvino/openvino/input_model.cpp +++ b/ggml/src/ggml-openvino/openvino/input_model.cpp @@ -6,9 +6,9 @@ namespace ov { namespace frontend { namespace ggml { -InputModel::InputModel(const std::shared_ptr& gdecoder) : m_decoder(gdecoder) {} +InputModel::InputModel(const std::shared_ptr & gdecoder) : m_decoder(gdecoder) {} -const std::shared_ptr& InputModel::get_model_decoder() const { +const std::shared_ptr & InputModel::get_model_decoder() const { return m_decoder; } diff --git a/ggml/src/ggml-openvino/openvino/op/cont.cpp b/ggml/src/ggml-openvino/openvino/op/cont.cpp index 9ae0f420cc..a17273d426 100644 --- a/ggml/src/ggml-openvino/openvino/op/cont.cpp +++ b/ggml/src/ggml-openvino/openvino/op/cont.cpp @@ -1,4 +1,8 @@ +#include "../node_context.hpp" +#include "../op_table.hpp" +#include "../utils.hpp" + #include #include #include @@ -6,16 +10,12 @@ #include #include -#include "../node_context.hpp" -#include "../op_table.hpp" -#include "../utils.hpp" - namespace ov { namespace frontend { namespace ggml { namespace op { -OutputVector translate_cont(const NodeContext& context) { +OutputVector translate_cont(const NodeContext & context) { num_inputs_check(context, 1, 1); int op_case = context.get_op_case(); @@ -29,9 +29,7 @@ OutputVector translate_cont(const NodeContext& context) { // The input comes from a PERMUTE dst_shape[1] = -1; res = std::make_shared( - context.get_input(0), - ov::op::v0::Constant::create(ov::element::i64, {dst_shape.size()}, dst_shape), - false); + context.get_input(0), 
ov::op::v0::Constant::create(ov::element::i64, {dst_shape.size()}, dst_shape), false); } else if (op_case == 2) { // The input comes from a TRANSPOSE return {context.get_input(0)}; diff --git a/ggml/src/ggml-openvino/openvino/op/cpy.cpp b/ggml/src/ggml-openvino/openvino/op/cpy.cpp index 54b49018a9..d5186cddee 100644 --- a/ggml/src/ggml-openvino/openvino/op/cpy.cpp +++ b/ggml/src/ggml-openvino/openvino/op/cpy.cpp @@ -1,15 +1,16 @@ -#include -#include #include "../node_context.hpp" #include "../op_table.hpp" #include "../utils.hpp" +#include +#include + namespace ov { namespace frontend { namespace ggml { namespace op { -OutputVector translate_cpy(const NodeContext& context) { +OutputVector translate_cpy(const NodeContext & context) { auto res = std::make_shared(context.get_input(0), context.get_output_type(0)); return rename_outputs_with_suffix({res}, context.get_name()); } diff --git a/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp b/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp index 9845fe0a02..029023637a 100644 --- a/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp +++ b/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp @@ -1,3 +1,7 @@ +#include "../node_context.hpp" +#include "../op_table.hpp" +#include "../utils.hpp" + #include #include #include @@ -8,24 +12,20 @@ #include #include -#include "../node_context.hpp" -#include "../op_table.hpp" -#include "../utils.hpp" - namespace ov { namespace frontend { namespace ggml { namespace op { -OutputVector translate_flash_attn_ext(const NodeContext& context) { +OutputVector translate_flash_attn_ext(const NodeContext & context) { num_inputs_check(context, 4, 4); auto q_f32 = context.get_input(0); auto k = context.get_input(1); auto v = context.get_input(2); auto mask = context.get_input(3); - float* params = reinterpret_cast(context.get_output_op_params(0)); - float scale = params[0]; + float * params = reinterpret_cast(context.get_output_op_params(0)); + float scale = params[0]; // float max_bias = params[1]; // float logit_softcap = params[2]; @@ -43,15 +43,14 @@ OutputVector translate_flash_attn_ext(const NodeContext& context) { auto token_len = get_dimensions(q, {2}); auto kv_len = get_dimensions(k.get_node_shared_ptr(), {2}); - auto zero_2d = ov::op::v0::Constant::create(ov::element::i64, {2}, {0,0}); - auto one_2d = ov::op::v0::Constant::create(ov::element::i64, {2}, {1,1}); + auto zero_2d = ov::op::v0::Constant::create(ov::element::i64, {2}, {0, 0}); + auto one_2d = ov::op::v0::Constant::create(ov::element::i64, {2}, {1, 1}); auto zero_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); auto two_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {2}); - auto axes = ov::op::v0::Constant::create(ov::element::i64, {2}, {1,2}); + auto axes = ov::op::v0::Constant::create(ov::element::i64, {2}, {1, 2}); auto stop = std::make_shared(ov::OutputVector{token_len, kv_len}, 0); - mask_sliced = - std::make_shared(mask, zero_2d, stop, one_2d, axes); + mask_sliced = std::make_shared(mask, zero_2d, stop, one_2d, axes); mask_sliced = std::make_shared(mask_sliced, zero_1d); } @@ -72,8 +71,8 @@ OutputVector translate_flash_attn_ext(const NodeContext& context) { kv_unsqueezed = std::make_shared(kv, unsqueeze_axes); auto kv_last_two_dims = get_dimensions(kv.get_node_shared_ptr(), {1, 2}); - kv_broadcast_shape = - std::make_shared(ov::OutputVector{kv_batch_node, factor_node, kv_last_two_dims}, 0); + kv_broadcast_shape = std::make_shared( + ov::OutputVector{kv_batch_node, factor_node, kv_last_two_dims}, 0); new_kv_shape 
= std::make_shared(ov::OutputVector{q_batch_node, kv_last_two_dims}, 0); } else { @@ -82,8 +81,8 @@ OutputVector translate_flash_attn_ext(const NodeContext& context) { kv_unsqueezed = std::make_shared(kv, unsqueeze_axes); auto kv_last_two_dims = get_dimensions(kv.get_node_shared_ptr(), {2, 3}); - kv_broadcast_shape = - std::make_shared(ov::OutputVector{one_1d, kv_batch_node, factor_node, kv_last_two_dims}, 0); + kv_broadcast_shape = std::make_shared( + ov::OutputVector{one_1d, kv_batch_node, factor_node, kv_last_two_dims}, 0); new_kv_shape = std::make_shared(ov::OutputVector{one_1d, q_batch_node, kv_last_two_dims}, 0); } @@ -105,8 +104,8 @@ OutputVector translate_flash_attn_ext(const NodeContext& context) { res = std::make_shared(sdpa_f32, ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 0, 2})); } else { - res = std::make_shared(sdpa_f32, - ov::op::v0::Constant::create(ov::element::i64, {4}, {0, 2, 1, 3})); + res = std::make_shared( + sdpa_f32, ov::op::v0::Constant::create(ov::element::i64, {4}, {0, 2, 1, 3})); } return rename_outputs_with_suffix({res}, context.get_name()); } diff --git a/ggml/src/ggml-openvino/openvino/op/get_rows.cpp b/ggml/src/ggml-openvino/openvino/op/get_rows.cpp index 5e4c7d901a..2e3520554e 100644 --- a/ggml/src/ggml-openvino/openvino/op/get_rows.cpp +++ b/ggml/src/ggml-openvino/openvino/op/get_rows.cpp @@ -1,3 +1,7 @@ +#include "../node_context.hpp" +#include "../op_table.hpp" +#include "../utils.hpp" + #include #include #include @@ -5,16 +9,12 @@ #include #include -#include "../node_context.hpp" -#include "../op_table.hpp" -#include "../utils.hpp" - namespace ov { namespace frontend { namespace ggml { namespace op { -OutputVector translate_get_rows(const NodeContext& context) { +OutputVector translate_get_rows(const NodeContext & context) { num_inputs_check(context, 2, 2); int op_case = context.get_op_case(); diff --git a/ggml/src/ggml-openvino/openvino/op/glu_geglu.cpp b/ggml/src/ggml-openvino/openvino/op/glu_geglu.cpp index 4295bf7517..3e3cae0071 100644 --- a/ggml/src/ggml-openvino/openvino/op/glu_geglu.cpp +++ b/ggml/src/ggml-openvino/openvino/op/glu_geglu.cpp @@ -1,3 +1,7 @@ +#include "../node_context.hpp" +#include "../op_table.hpp" +#include "../utils.hpp" + #include #include #include @@ -7,16 +11,12 @@ #include #include -#include "../node_context.hpp" -#include "../op_table.hpp" -#include "../utils.hpp" - namespace ov { namespace frontend { namespace ggml { namespace op { -OutputVector translate_glu_geglu(const NodeContext& context) { +OutputVector translate_glu_geglu(const NodeContext & context) { num_inputs_check(context, 1, 2); ov::Output src0; @@ -32,7 +32,7 @@ OutputVector translate_glu_geglu(const NodeContext& context) { src1 = split->output(1); } - int32_t* params = context.get_output_op_params(0); + int32_t * params = context.get_output_op_params(0); const int32_t swapped = params[1]; if (swapped) { std::swap(src0, src1); diff --git a/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp b/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp index bef42fe4b7..61cdaadea3 100644 --- a/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp +++ b/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp @@ -1,3 +1,7 @@ +#include "../node_context.hpp" +#include "../op_table.hpp" +#include "../utils.hpp" + #include #include #include @@ -7,16 +11,12 @@ #include #include -#include "../node_context.hpp" -#include "../op_table.hpp" -#include "../utils.hpp" - namespace ov { namespace frontend { namespace ggml { namespace op { -OutputVector translate_glu_swiglu(const 
NodeContext& context) { +OutputVector translate_glu_swiglu(const NodeContext & context) { num_inputs_check(context, 1, 2); ov::Output src0; @@ -32,7 +32,7 @@ OutputVector translate_glu_swiglu(const NodeContext& context) { src1 = split->output(1); } - int32_t* params = context.get_output_op_params(0); + int32_t * params = context.get_output_op_params(0); const int32_t swapped = params[1]; if (swapped) { std::swap(src0, src1); diff --git a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp index b4103378eb..c161bce75d 100644 --- a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp +++ b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp @@ -1,3 +1,7 @@ +#include "../node_context.hpp" +#include "../op_table.hpp" +#include "../utils.hpp" + #include #include #include @@ -15,16 +19,12 @@ #include #include -#include "../node_context.hpp" -#include "../op_table.hpp" -#include "../utils.hpp" - namespace ov { namespace frontend { namespace ggml { namespace op { -OutputVector translate_mulmat(const NodeContext& context) { +OutputVector translate_mulmat(const NodeContext & context) { num_inputs_check(context, 2, 2); int op_case = context.get_op_case(); diff --git a/ggml/src/ggml-openvino/openvino/op/permute.cpp b/ggml/src/ggml-openvino/openvino/op/permute.cpp index 5f86f47c1c..128ffb2933 100644 --- a/ggml/src/ggml-openvino/openvino/op/permute.cpp +++ b/ggml/src/ggml-openvino/openvino/op/permute.cpp @@ -1,3 +1,7 @@ +#include "../node_context.hpp" +#include "../op_table.hpp" +#include "../utils.hpp" + #include #include #include @@ -9,16 +13,12 @@ #include #include -#include "../node_context.hpp" -#include "../op_table.hpp" -#include "../utils.hpp" - namespace ov { namespace frontend { namespace ggml { namespace op { -OutputVector translate_permute(const NodeContext& context) { +OutputVector translate_permute(const NodeContext & context) { num_inputs_check(context, 1, 1); int op_case = context.get_op_case(); @@ -28,15 +28,15 @@ OutputVector translate_permute(const NodeContext& context) { if (op_case == 1) { if (context.is_static()) { - res = std::make_shared(context.get_input(0), - ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 0, 2})); + res = std::make_shared( + context.get_input(0), ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 0, 2})); } else { auto src = context.get_input(0); if (src.get_partial_shape().rank() == 3) { src = std::make_shared(src, zero); } - res = std::make_shared(src, - ov::op::v0::Constant::create(ov::element::i64, {4}, {0, 2, 1, 3})); + res = std::make_shared( + src, ov::op::v0::Constant::create(ov::element::i64, {4}, {0, 2, 1, 3})); } } else { auto src = context.get_input(0); @@ -47,7 +47,8 @@ OutputVector translate_permute(const NodeContext& context) { std::vector src_shape(src_shape_.begin(), src_shape_.end()); auto src_reshaped = std::make_shared( src, - ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector{-1, src_shape[1], src_shape[2]}), + ov::op::v0::Constant::create(ov::element::i64, {3}, + std::vector{-1, src_shape[1], src_shape[2]}), false); res = std::make_shared( src_reshaped, ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 0, 2})); @@ -55,8 +56,8 @@ OutputVector translate_permute(const NodeContext& context) { if (src.get_partial_shape().rank() == 3) { src = std::make_shared(src, zero); } - res = std::make_shared(src, - ov::op::v0::Constant::create(ov::element::i64, {4}, {0, 2, 1, 3})); + res = std::make_shared( + src, ov::op::v0::Constant::create(ov::element::i64, {4}, {0, 2, 1, 3})); } } return 
rename_outputs_with_suffix({res}, context.get_name()); diff --git a/ggml/src/ggml-openvino/openvino/op/reshape.cpp b/ggml/src/ggml-openvino/openvino/op/reshape.cpp index 1ed6f4b880..bbf94865ef 100644 --- a/ggml/src/ggml-openvino/openvino/op/reshape.cpp +++ b/ggml/src/ggml-openvino/openvino/op/reshape.cpp @@ -1,3 +1,7 @@ +#include "../node_context.hpp" +#include "../op_table.hpp" +#include "../utils.hpp" + #include #include #include @@ -7,16 +11,12 @@ #include #include -#include "../node_context.hpp" -#include "../op_table.hpp" -#include "../utils.hpp" - namespace ov { namespace frontend { namespace ggml { namespace op { -OutputVector translate_reshape(const NodeContext& context) { +OutputVector translate_reshape(const NodeContext & context) { num_inputs_check(context, 1, 1); if (context.get_input_shape(0) == context.get_output_shape(0)) { return {context.get_input(0)}; @@ -29,15 +29,11 @@ OutputVector translate_reshape(const NodeContext& context) { auto output_shape = context.get_output_shape(0).to_shape(); std::shared_ptr new_shape_node; if (op_case == 1) { - new_shape_node = - ov::op::v0::Constant::create(ov::element::i64, - {3}, - std::vector{-1, (int64_t)output_shape[1], (int64_t)output_shape[2]}); + new_shape_node = ov::op::v0::Constant::create( + ov::element::i64, {3}, std::vector{-1, (int64_t) output_shape[1], (int64_t) output_shape[2]}); } else if (op_case == 2) { - new_shape_node = - ov::op::v0::Constant::create(ov::element::i64, - {3}, - std::vector{(int64_t)output_shape[0], -1, (int64_t)output_shape[2]}); + new_shape_node = ov::op::v0::Constant::create( + ov::element::i64, {3}, std::vector{(int64_t) output_shape[0], -1, (int64_t) output_shape[2]}); } else if (op_case == 3) { new_shape_node = ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector{(int64_t) output_shape[0], -1, 1}); diff --git a/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp b/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp index c9df4c42f3..3ac96d0c22 100644 --- a/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp +++ b/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp @@ -1,3 +1,7 @@ +#include "../node_context.hpp" +#include "../op_table.hpp" +#include "../utils.hpp" + #include #include #include @@ -7,16 +11,12 @@ #include #include -#include "../node_context.hpp" -#include "../op_table.hpp" -#include "../utils.hpp" - namespace ov { namespace frontend { namespace ggml { namespace op { -OutputVector translate_rms_norm(const NodeContext& context) { +OutputVector translate_rms_norm(const NodeContext & context) { num_inputs_check(context, 1, 1); auto input_node = context.get_input(0); diff --git a/ggml/src/ggml-openvino/openvino/op/rope.cpp b/ggml/src/ggml-openvino/openvino/op/rope.cpp index 484730d289..362ccce17f 100644 --- a/ggml/src/ggml-openvino/openvino/op/rope.cpp +++ b/ggml/src/ggml-openvino/openvino/op/rope.cpp @@ -1,3 +1,7 @@ +#include "../node_context.hpp" +#include "../op_table.hpp" +#include "../utils.hpp" + #include #include #include @@ -14,16 +18,12 @@ #include #include -#include "../node_context.hpp" -#include "../op_table.hpp" -#include "../utils.hpp" - namespace ov { namespace frontend { namespace ggml { namespace op { -OutputVector translate_rope(const NodeContext& context) { +OutputVector translate_rope(const NodeContext & context) { num_inputs_check(context, 2, 3); int op_case = context.get_op_case(); @@ -32,7 +32,7 @@ OutputVector translate_rope(const NodeContext& context) { auto data_node = context.get_input(0).get_node_shared_ptr(); auto output_shape = 
context.get_output_shape(0).to_shape(); - int32_t* op_params = context.get_output_op_params(0); + int32_t * op_params = context.get_output_op_params(0); Output cos_theta_node; Output sin_theta_node; @@ -85,7 +85,8 @@ OutputVector translate_rope(const NodeContext& context) { auto stack = std::make_shared(OutputVector{first_half, second_half}, 3); res = std::make_shared(stack, std::make_shared(data_node), false); if (!(context.is_static())) { - res = std::make_shared(res, ov::op::v0::Constant::create(ov::element::i64, {1}, {0})); + res = + std::make_shared(res, ov::op::v0::Constant::create(ov::element::i64, {1}, {0})); } } else if (mode == ROPE_TYPE_NEOX) { auto data_split = std::make_shared( diff --git a/ggml/src/ggml-openvino/openvino/op/scale.cpp b/ggml/src/ggml-openvino/openvino/op/scale.cpp index 783440ebd9..f52381786a 100644 --- a/ggml/src/ggml-openvino/openvino/op/scale.cpp +++ b/ggml/src/ggml-openvino/openvino/op/scale.cpp @@ -1,17 +1,17 @@ -#include -#include -#include - #include "../node_context.hpp" #include "../op_table.hpp" #include "../utils.hpp" +#include +#include +#include + namespace ov { namespace frontend { namespace ggml { namespace op { -OutputVector translate_scale(const NodeContext& context) { +OutputVector translate_scale(const NodeContext & context) { num_inputs_check(context, 1, 1); float scale; diff --git a/ggml/src/ggml-openvino/openvino/op/set_rows.cpp b/ggml/src/ggml-openvino/openvino/op/set_rows.cpp index 001bd08773..643ba7bffa 100644 --- a/ggml/src/ggml-openvino/openvino/op/set_rows.cpp +++ b/ggml/src/ggml-openvino/openvino/op/set_rows.cpp @@ -1,3 +1,7 @@ +#include "../node_context.hpp" +#include "../op_table.hpp" +#include "../utils.hpp" + #include #include #include @@ -15,16 +19,12 @@ #include #include -#include "../node_context.hpp" -#include "../op_table.hpp" -#include "../utils.hpp" - namespace ov { namespace frontend { namespace ggml { namespace op { -OutputVector translate_set_rows(const NodeContext& context) { +OutputVector translate_set_rows(const NodeContext & context) { num_inputs_check(context, 3, 3); auto data = context.get_input(0); @@ -44,8 +44,7 @@ OutputVector translate_set_rows(const NodeContext& context) { Output res; if (context.is_static()) { auto dst_reshaped = std::make_shared( - dst, - ov::op::v0::Constant::create(ov::element::i64, {2}, {(int64_t) dst_shape[1], (int64_t) dst_shape[2]}), + dst, ov::op::v0::Constant::create(ov::element::i64, {2}, {(int64_t) dst_shape[1], (int64_t) dst_shape[2]}), false); auto indices_reshaped = std::make_shared(indices, ov::op::v0::Constant::create(ov::element::i64, {2}, {0, 1})); @@ -55,7 +54,8 @@ OutputVector translate_set_rows(const NodeContext& context) { auto updated = std::make_shared(dst_reshaped, indices_reshaped, data_reshaped, zero); res = std::make_shared(updated, std::make_shared(dst), false); } else { - assert(dst.get_partial_shape().rank() == 4 && dst.get_partial_shape()[2].is_static() && dst.get_partial_shape()[3].is_static()); + assert(dst.get_partial_shape().rank() == 4 && dst.get_partial_shape()[2].is_static() && + dst.get_partial_shape()[3].is_static()); int64_t dim2 = dst.get_partial_shape()[2].get_length(); int64_t dim3 = dst.get_partial_shape()[3].get_length(); data = std::make_shared( diff --git a/ggml/src/ggml-openvino/openvino/op/softmax.cpp b/ggml/src/ggml-openvino/openvino/op/softmax.cpp index 1aa3bf76a0..6c43054050 100644 --- a/ggml/src/ggml-openvino/openvino/op/softmax.cpp +++ b/ggml/src/ggml-openvino/openvino/op/softmax.cpp @@ -1,3 +1,7 @@ +#include "../node_context.hpp" 
+#include "../op_table.hpp" +#include "../utils.hpp" + #include #include #include @@ -13,16 +17,12 @@ #include #include -#include "../node_context.hpp" -#include "../op_table.hpp" -#include "../utils.hpp" - namespace ov { namespace frontend { namespace ggml { namespace op { -OutputVector translate_soft_max(const NodeContext& context) { +OutputVector translate_soft_max(const NodeContext & context) { num_inputs_check(context, 1, 2); auto input_node = context.get_input(0).get_node_shared_ptr(); @@ -30,9 +30,9 @@ OutputVector translate_soft_max(const NodeContext& context) { float scale = 1.0f; float max_bias = 0.0f; - auto* op_params = context.get_output_op_params(0); - memcpy(&scale, (float*) op_params + 0, sizeof(float)); - memcpy(&max_bias, (float*) op_params + 1, sizeof(float)); + auto * op_params = context.get_output_op_params(0); + memcpy(&scale, (float *) op_params + 0, sizeof(float)); + memcpy(&max_bias, (float *) op_params + 1, sizeof(float)); auto src0_shape = context.get_input_shape(0).get_shape(); const uint32_t h = src0_shape[2]; const uint32_t n_head = src0_shape[0]; diff --git a/ggml/src/ggml-openvino/openvino/op/transpose.cpp b/ggml/src/ggml-openvino/openvino/op/transpose.cpp index c585dffa6e..6b4f8a849b 100644 --- a/ggml/src/ggml-openvino/openvino/op/transpose.cpp +++ b/ggml/src/ggml-openvino/openvino/op/transpose.cpp @@ -1,15 +1,15 @@ -#include - #include "../node_context.hpp" #include "../op_table.hpp" #include "../utils.hpp" +#include + namespace ov { namespace frontend { namespace ggml { namespace op { -OutputVector translate_transpose(const NodeContext& context) { +OutputVector translate_transpose(const NodeContext & context) { num_inputs_check(context, 1, 1); auto res = std::make_shared(context.get_input(0), diff --git a/ggml/src/ggml-openvino/openvino/op/unary_silu.cpp b/ggml/src/ggml-openvino/openvino/op/unary_silu.cpp index 2b27c0be12..b2214fa930 100644 --- a/ggml/src/ggml-openvino/openvino/op/unary_silu.cpp +++ b/ggml/src/ggml-openvino/openvino/op/unary_silu.cpp @@ -1,17 +1,17 @@ -#include -#include -#include - #include "../node_context.hpp" #include "../op_table.hpp" #include "../utils.hpp" +#include +#include +#include + namespace ov { namespace frontend { namespace ggml { namespace op { -OutputVector translate_unary_silu(const NodeContext& context) { +OutputVector translate_unary_silu(const NodeContext & context) { num_inputs_check(context, 1, 1); auto input = context.get_input(0); diff --git a/ggml/src/ggml-openvino/openvino/op/view.cpp b/ggml/src/ggml-openvino/openvino/op/view.cpp index 034b6df119..b53abca7e9 100644 --- a/ggml/src/ggml-openvino/openvino/op/view.cpp +++ b/ggml/src/ggml-openvino/openvino/op/view.cpp @@ -6,12 +6,13 @@ namespace frontend { namespace ggml { namespace op { -OutputVector translate_view(const NodeContext& context) { +OutputVector translate_view(const NodeContext & context) { num_inputs_check(context, 1, 1); if (context.get_op_case() == 2) { auto dst_shape = context.get_output_shape(0).to_shape(); - return rename_outputs_with_suffix({process_view_input(context, 0, dst_shape[1] * dst_shape[2])}, context.get_name()); + return rename_outputs_with_suffix({process_view_input(context, 0, dst_shape[1] * dst_shape[2])}, + context.get_name()); } return {context.get_input(0)}; } diff --git a/ggml/src/ggml-openvino/openvino/op_table.cpp b/ggml/src/ggml-openvino/openvino/op_table.cpp index e36e8f17cc..8aeb060aa5 100644 --- a/ggml/src/ggml-openvino/openvino/op_table.cpp +++ b/ggml/src/ggml-openvino/openvino/op_table.cpp @@ -1,5 +1,7 @@ #include 
"op_table.hpp" +#include "utils.hpp" + #include #include #include @@ -7,8 +9,6 @@ #include #include -#include "utils.hpp" - namespace ov { namespace frontend { namespace ggml { diff --git a/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.cpp b/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.cpp index 4759e86e1e..375bbbd735 100644 --- a/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.cpp +++ b/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.cpp @@ -1,15 +1,15 @@ #include "eliminate_zp.hpp" #include +#include #include -#include -#include -#include #include #include #include #include -#include +#include +#include +#include namespace ov { namespace frontend { @@ -35,13 +35,17 @@ EliminateZeroPoints::EliminateZeroPoints() { auto m_scale = ov::pass::pattern::any_input(); auto m_multiply = ov::pass::pattern::wrap_type({m_scale, m_subtract}); - const auto callback = [=](ov::pass::pattern::Matcher& m) { - const auto& pattern_map = m.get_pattern_value_map(); + const auto callback = [=](ov::pass::pattern::Matcher & m) { + const auto & pattern_map = m.get_pattern_value_map(); - auto multiply_node = std::dynamic_pointer_cast(pattern_map.at(m_multiply).get_node_shared_ptr()); - auto subtract_node = std::dynamic_pointer_cast(pattern_map.at(m_subtract).get_node_shared_ptr()); - auto data_constant = std::dynamic_pointer_cast(pattern_map.at(m_data_constant).get_node_shared_ptr()); - auto zp_constant = std::dynamic_pointer_cast(pattern_map.at(m_zp_constant).get_node_shared_ptr()); + auto multiply_node = + std::dynamic_pointer_cast(pattern_map.at(m_multiply).get_node_shared_ptr()); + auto subtract_node = + std::dynamic_pointer_cast(pattern_map.at(m_subtract).get_node_shared_ptr()); + auto data_constant = + std::dynamic_pointer_cast(pattern_map.at(m_data_constant).get_node_shared_ptr()); + auto zp_constant = + std::dynamic_pointer_cast(pattern_map.at(m_zp_constant).get_node_shared_ptr()); if (!multiply_node || !subtract_node || !data_constant || !zp_constant) { return false; @@ -101,14 +105,16 @@ EliminateZeroPoints::EliminateZeroPoints() { new_constant = std::make_shared(target_type, data_shape, adjusted_values); } - auto new_convert = std::make_shared(new_constant, subtract_node->get_output_element_type(0)); + auto new_convert = + std::make_shared(new_constant, subtract_node->get_output_element_type(0)); ov::replace_node(subtract_node, new_convert); return true; }; - register_matcher(std::make_shared(m_multiply, "ov::frontend::ggml::pass::EliminateZeroPoints"), - callback); + register_matcher( + std::make_shared(m_multiply, "ov::frontend::ggml::pass::EliminateZeroPoints"), + callback); } } // namespace pass diff --git a/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.cpp b/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.cpp index f38c0837d1..3e5730c90f 100644 --- a/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.cpp +++ b/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.cpp @@ -33,8 +33,8 @@ FuseToSDPA::FuseToSDPA() { const auto m_v = ov::pass::pattern::any_input(); const auto m_qkv = ov::pass::pattern::wrap_type({m_softmax_qk_f16, m_v}); - const auto callback = [=](ov::pass::pattern::Matcher& m) { - auto& pattern_to_output = m.get_pattern_value_map(); + const auto callback = [=](ov::pass::pattern::Matcher & m) { + auto & pattern_to_output = m.get_pattern_value_map(); auto k = pattern_to_output[m_k]; auto q = pattern_to_output[m_q]; auto v = pattern_to_output[m_v]; diff --git a/ggml/src/ggml-openvino/openvino/translate_session.cpp b/ggml/src/ggml-openvino/openvino/translate_session.cpp index 
e35599084e..67c5b4a51b 100644 --- a/ggml/src/ggml-openvino/openvino/translate_session.cpp +++ b/ggml/src/ggml-openvino/openvino/translate_session.cpp @@ -1,5 +1,11 @@ #include "translate_session.hpp" +#include "ggml-openvino/openvino/node_context.hpp" +#include "ggml-openvino/openvino/utils.hpp" +#include "input_model.hpp" +#include "pass/eliminate_zp.hpp" +#include "pass/mark_decompression_convert_constant_folding.hpp" + #include #include #include @@ -25,12 +31,6 @@ #include #include -#include "ggml-openvino/openvino/node_context.hpp" -#include "ggml-openvino/openvino/utils.hpp" -#include "input_model.hpp" -#include "pass/eliminate_zp.hpp" -#include "pass/mark_decompression_convert_constant_folding.hpp" - namespace ov { namespace frontend { namespace ggml { @@ -40,16 +40,17 @@ using namespace ov::op; namespace { ov::pass::MakeStateful::ParamResPairs get_kv_param_res_pairs( - const std::shared_ptr& model, const std::map& kv_param_res_names) { + const std::shared_ptr & model, + const std::map & kv_param_res_names) { ov::pass::MakeStateful::ParamResPairs pairs; - const auto& params = model->get_parameters(); - const auto& results = model->get_results(); + const auto & params = model->get_parameters(); + const auto & results = model->get_results(); - for (const auto& param_res : kv_param_res_names) { - const auto& param_name = param_res.first; - const auto& res_name = param_res.second; + for (const auto & param_res : kv_param_res_names) { + const auto & param_name = param_res.first; + const auto & res_name = param_res.second; - auto param_it = std::find_if(params.begin(), params.end(), [&](const std::shared_ptr& node) { + auto param_it = std::find_if(params.begin(), params.end(), [&](const std::shared_ptr & node) { return node->get_friendly_name() == param_name; }); @@ -57,7 +58,7 @@ ov::pass::MakeStateful::ParamResPairs get_kv_param_res_pairs( " is not associated with any of " "Parameters in the network."); - auto res_it = std::find_if(results.begin(), results.end(), [&](const std::shared_ptr& node) { + auto res_it = std::find_if(results.begin(), results.end(), [&](const std::shared_ptr & node) { return node->get_friendly_name() == res_name; }); @@ -72,17 +73,17 @@ ov::pass::MakeStateful::ParamResPairs get_kv_param_res_pairs( return pairs; } -void add_token_len(TensorMap& tensor_map) { +void add_token_len(TensorMap & tensor_map) { auto inp_tokens = tensor_map.at("inp_tokens").get_node_shared_ptr(); auto token_len = get_dimensions(inp_tokens, {2}); token_len->set_friendly_name("token_len"); tensor_map.insert({"token_len", token_len->output(0)}); } -void add_sliced_mask(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) { +void add_sliced_mask(TensorMap & tensor_map, GgmlDecoder & ggml_model_decoder) { auto token_len = tensor_map.at("token_len").get_node_shared_ptr(); - auto create_sliced_mask = [&](const std::string& mask_name, const std::string& sliced_name, bool is_static) { + auto create_sliced_mask = [&](const std::string & mask_name, const std::string & sliced_name, bool is_static) { if (tensor_map.find(mask_name) != tensor_map.end()) { auto mask = tensor_map.at(mask_name).get_node_shared_ptr(); std::shared_ptr mask_sliced; @@ -110,8 +111,7 @@ void add_sliced_mask(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) { kv_len = std::make_shared(kv_len, one_1d); auto stop = std::make_shared(ov::OutputVector{token_len, kv_len}, 0); - mask_sliced = - std::make_shared(mask, zero_2d, stop, one_2d, axes); + mask_sliced = std::make_shared(mask, zero_2d, stop, one_2d, axes); mask_sliced = 
std::make_shared(mask_sliced, zero_1d); mask_sliced = std::make_shared(mask_sliced, ov::element::f16); mask_sliced->set_friendly_name(sliced_name); @@ -125,8 +125,8 @@ void add_sliced_mask(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) { // create_sliced_mask("KQ_mask_swa", "KQ_mask_swa_sliced", ggml_model_decoder.is_static()); } -void add_rope_sin_cos(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) { - int32_t* rope_params = ggml_model_decoder.get_rope_params(); +void add_rope_sin_cos(TensorMap & tensor_map, GgmlDecoder & ggml_model_decoder) { + int32_t * rope_params = ggml_model_decoder.get_rope_params(); auto inp_pos = tensor_map.at("inp_pos").get_node_shared_ptr(); std::shared_ptr rope_freqs_weight; if (tensor_map.find("rope_freqs_weight") != tensor_map.end()) { @@ -144,7 +144,7 @@ void add_rope_sin_cos(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) { } // Create common patterns -void preprocess(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) { +void preprocess(TensorMap & tensor_map, GgmlDecoder & ggml_model_decoder) { add_token_len(tensor_map); add_sliced_mask(tensor_map, ggml_model_decoder); add_rope_sin_cos(tensor_map, ggml_model_decoder); @@ -152,8 +152,8 @@ void preprocess(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) { } // namespace -TranslateSession::TranslateSession(const frontend::InputModel::Ptr& input_model, - const std::unordered_map& translator_map, +TranslateSession::TranslateSession(const frontend::InputModel::Ptr & input_model, + const std::unordered_map & translator_map, bool naive) : m_input_model(input_model), m_translator_map(translator_map), @@ -168,26 +168,26 @@ std::shared_ptr TranslateSession::get_converted_model() { return m_ov_model; } -std::shared_ptr TranslateSession::translate_graph(const frontend::InputModel::Ptr& input_model) { +std::shared_ptr TranslateSession::translate_graph(const frontend::InputModel::Ptr & input_model) { ov::ParameterVector params; ov::ResultVector results; auto tensor_map = std::make_shared(); std::shared_ptr resulting_model; - const auto& ggml_model = std::dynamic_pointer_cast(input_model); + const auto & ggml_model = std::dynamic_pointer_cast(input_model); std::shared_ptr ggml_model_decoder = ggml_model->get_model_decoder(); - for (const auto& it : ggml_model_decoder->get_model_inputs()) { + for (const auto & it : ggml_model_decoder->get_model_inputs()) { params.push_back(std::dynamic_pointer_cast(it.second)); (*tensor_map)[it.first] = it.second; } - for (const auto& it : ggml_model_decoder->get_model_extra_inputs()) { + for (const auto & it : ggml_model_decoder->get_model_extra_inputs()) { params.push_back(std::dynamic_pointer_cast(it.second)); (*tensor_map)[it.first] = it.second; } - for (const auto& it : ggml_model_decoder->get_model_weights()) { + for (const auto & it : ggml_model_decoder->get_model_weights()) { (*tensor_map)[it.first] = it.second; } @@ -199,22 +199,15 @@ std::shared_ptr TranslateSession::translate_graph(const frontend::InputMo ov::OutputVector converted_outputs; auto it = m_translator_map.find(operation_type); - FRONT_END_OP_CONVERSION_CHECK(it != m_translator_map.end(), - "Translation for operation type ", - operation_type, + FRONT_END_OP_CONVERSION_CHECK(it != m_translator_map.end(), "Translation for operation type ", operation_type, " is not implemented."); NodeContext node_context(node, tensor_map, this); converted_outputs = it->second(node_context); - const auto& node_output_names = node->get_output_names(); - 
FRONT_END_OP_CONVERSION_CHECK(node_output_names.size() == converted_outputs.size(), - "Number of ", - operation_type, - " outputs greater than number of converted outputs, which are ", - node_output_names.size(), - " and ", - converted_outputs.size(), - " respectively."); + const auto & node_output_names = node->get_output_names(); + FRONT_END_OP_CONVERSION_CHECK(node_output_names.size() == converted_outputs.size(), "Number of ", + operation_type, " outputs greater than number of converted outputs, which are ", + node_output_names.size(), " and ", converted_outputs.size(), " respectively."); for (size_t i = 0; i < node_output_names.size(); ++i) { auto output_name = node_output_names[i]; @@ -229,10 +222,9 @@ std::shared_ptr TranslateSession::translate_graph(const frontend::InputMo } ggml_model_decoder->visit_subgraph(node_visitor); - for (const auto& name : ggml_model_decoder->get_model_output_names()) { + for (const auto & name : ggml_model_decoder->get_model_output_names()) { FRONT_END_GENERAL_CHECK(tensor_map->find(name) != tensor_map->end(), - "Output name not found in tensor map: ", - name); + "Output name not found in tensor map: ", name); auto result = std::make_shared(tensor_map->at(name)); result->set_friendly_name(name); results.push_back(result); diff --git a/ggml/src/ggml-openvino/openvino/utils.cpp b/ggml/src/ggml-openvino/openvino/utils.cpp index f70cb91a17..1723c7d003 100644 --- a/ggml/src/ggml-openvino/openvino/utils.cpp +++ b/ggml/src/ggml-openvino/openvino/utils.cpp @@ -1,5 +1,7 @@ #include "utils.hpp" +#include "ggml-impl.h" + #include #include #include @@ -17,8 +19,6 @@ #include #include -#include "ggml-impl.h" - namespace ov { namespace frontend { namespace ggml { @@ -30,7 +30,7 @@ std::string getCurrentTime() { return buf; } -void num_inputs_check(const NodeContext& context, size_t min_inputs, size_t max_inputs) { +void num_inputs_check(const NodeContext & context, size_t min_inputs, size_t max_inputs) { auto input_size = context.get_input_size(); FRONT_END_OP_CONVERSION_CHECK(input_size >= min_inputs, "Got less inputs than expected"); FRONT_END_OP_CONVERSION_CHECK(input_size <= max_inputs, "Got more inputs than expected"); @@ -48,20 +48,20 @@ int non_cont_dim(std::vector ne, std::vector nb) { return 0; } -std::shared_ptr get_dimensions(const std::shared_ptr& shape, - const std::vector& dims) { +std::shared_ptr get_dimensions(const std::shared_ptr & shape, + const std::vector & dims) { using namespace ov::op; const auto zero = v0::Constant::create(ov::element::i32, ov::Shape{}, {0}); const auto dims_const = v0::Constant::create(ov::element::i32, ov::Shape{dims.size()}, dims); return std::make_shared(shape, dims_const, zero); } -std::shared_ptr get_dimensions(const std::shared_ptr& node, const std::vector& dims) { +std::shared_ptr get_dimensions(const std::shared_ptr & node, const std::vector & dims) { return get_dimensions(std::make_shared(node), dims); } -OutputVector rename_outputs_with_suffix(const OutputVector& outputs, const std::string& suffix) { - for (const auto& output : outputs) { +OutputVector rename_outputs_with_suffix(const OutputVector & outputs, const std::string & suffix) { + for (const auto & output : outputs) { auto node = output.get_node_shared_ptr(); std::string name = node->get_friendly_name(); name += "_"; @@ -111,7 +111,7 @@ void ggml_rope_yarn_corr_dims(int n_dims, } } // namespace -std::pair, ov::Output> make_sin_cos(int32_t* rope_params, +std::pair, ov::Output> make_sin_cos(int32_t * rope_params, std::shared_ptr inp_pos, std::shared_ptr 
rope_freqs_weight) { inp_pos = std::make_shared(inp_pos, ov::element::f32); @@ -179,11 +179,11 @@ std::pair, ov::Output> make_sin_cos(int32_t* rope_params, return std::make_pair(sin_theta, cos_theta); } -ov::Output process_view_input(const NodeContext& context, int input_index, int slice_len) { +ov::Output process_view_input(const NodeContext & context, int input_index, int slice_len) { // Only works for VIEW operations that slice at the lowest dimension // If the VIEW also reshape the result, `slice_len` should be provided auto input = context.get_input(input_index); - int32_t* op_params = context.get_input_op_params(input_index); + int32_t * op_params = context.get_input_op_params(input_index); auto src1_stride = context.get_input_stride(input_index); int64_t split_addr = op_params[0] / src1_stride[2]; diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 9b000f26d5..eb9ea9fee9 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -1,5 +1,11 @@ #include "utils.h" +#include "ggml-impl.h" +#include "ggml-openvino/ggml-decoder.h" +#include "ggml.h" +#include "openvino/frontend.hpp" +#include "openvino/input_model.hpp" + #include #include #include @@ -23,15 +29,9 @@ #include #include -#include "ggml-impl.h" -#include "ggml-openvino/ggml-decoder.h" -#include "ggml.h" -#include "openvino/frontend.hpp" -#include "openvino/input_model.hpp" - -ov::Tensor convert_ggml_input_to_ov(std::shared_ptr ggml_decoder, const std::string& name) { - const auto* ggml_tensor = ggml_decoder->get_input_ggml_tensor(name); - auto* input_data = ggml_tensor->data; +ov::Tensor convert_ggml_input_to_ov(std::shared_ptr ggml_decoder, const std::string & name) { + const auto * ggml_tensor = ggml_decoder->get_input_ggml_tensor(name); + auto * input_data = ggml_tensor->data; ov::Shape input_shape; if (name.find("cache_k") == 0 || name.find("cache_v") == 0) { input_shape = ggml_decoder->get_graph_input_shape(ggml_tensor).to_shape(); @@ -45,13 +45,14 @@ ov::Tensor convert_ggml_input_to_ov(std::shared_ptr ggml_decoder, return input_tensor; } -std::map get_ggml_graph_output_dst(std::shared_ptr ggml_decoder) { - std::map output_tensors; +std::map get_ggml_graph_output_dst(std::shared_ptr ggml_decoder) { + std::map output_tensors; + auto output_names = ggml_decoder->get_model_output_names(); for (size_t inp = 0; inp < output_names.size(); ++inp) { auto name = output_names[inp]; - const auto* tensor = ggml_decoder->get_output_ggml_tensor(name); - auto* output_data = tensor->view_src ? tensor->view_src->data : tensor->data; + const auto * tensor = ggml_decoder->get_output_ggml_tensor(name); + auto * output_data = tensor->view_src ? tensor->view_src->data : tensor->data; output_tensors[name] = output_data; } return output_tensors; @@ -63,14 +64,14 @@ static ov::frontend::FrontEnd::Ptr get_ggml_frontend() { return front_end; } -enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_cgraph* cgraph) { +enum ggml_status openvino_frontend_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { static ov::Core core; static std::string device = getenv("GGML_OPENVINO_DEVICE") ? 
getenv("GGML_OPENVINO_DEVICE") : ""; if (device.empty()) { - const std::vector preferred_device = { "GPU", "CPU", "NPU" }; + const std::vector preferred_device = {"GPU", "CPU", "NPU"}; const auto available_devices = core.get_available_devices(); - for (const auto& dev : preferred_device) { + for (const auto & dev : preferred_device) { if (std::find(available_devices.begin(), available_devices.end(), dev) != available_devices.end()) { device = dev; break; @@ -92,17 +93,17 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c auto start_time = ggml_time_us(); - auto* cache_dir = getenv("GGML_OPENVINO_CACHE_DIR"); + auto * cache_dir = getenv("GGML_OPENVINO_CACHE_DIR"); if (cache_dir && !is_static) { core.set_property(ov::cache_dir(cache_dir)); } static std::mutex cache_mutex; - static std::unordered_map> infer_request_cache; - static std::unordered_map> ov_input_names_cache; - static std::unordered_map> ov_output_names_cache; + static std::unordered_map> infer_request_cache; + static std::unordered_map> ov_input_names_cache; + static std::unordered_map> ov_output_names_cache; // For NPU, store the kvcache model, since we cannot create two infer_request - static std::unordered_map compiled_model_cache; + static std::unordered_map compiled_model_cache; std::shared_ptr ggml_decoder; ov::InferRequest infer_request; @@ -181,7 +182,7 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c ov::serialize(model, timestamped_filename); } - auto* disable_sdpa_optimization = getenv("GGML_OPENVINO_DISABLE_SDPA_OPTIMIZATION"); + auto * disable_sdpa_optimization = getenv("GGML_OPENVINO_DISABLE_SDPA_OPTIMIZATION"); if (disable_sdpa_optimization && std::string(disable_sdpa_optimization) != "0") { config = { {"GPU_ENABLE_SDPA_OPTIMIZATION", "0"} @@ -196,10 +197,10 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c std::vector ov_input_names; std::vector ov_output_names; - for (const auto& ov_param : model->get_parameters()) { + for (const auto & ov_param : model->get_parameters()) { ov_input_names.push_back(ov_param->get_friendly_name()); } - for (const auto& ov_output : model->get_results()) { + for (const auto & ov_output : model->get_results()) { ov_output_names.push_back(ov_output->get_friendly_name()); } ov_input_names_cache[cgraph] = ov_input_names; @@ -225,7 +226,7 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c auto gguf_tensor_addrs = get_ggml_graph_output_dst(ggml_decoder); for (size_t i = 0; i < ov_output_names.size(); i++) { - auto& result_name = ov_output_names[i]; + auto & result_name = ov_output_names[i]; const auto output_tensor = infer_request.get_output_tensor(i); std::memcpy(gguf_tensor_addrs[result_name], output_tensor.data(), output_tensor.get_byte_size()); @@ -278,7 +279,7 @@ ov::AnyMap get_npu_generate_config() { return config; } -std::map get_types_to_requant(const std::string& device) { +std::map get_types_to_requant(const std::string & device) { if (device == "NPU") { return { {GGML_TYPE_Q4_0, ExtraQuantType::Q4_0_128}, @@ -297,15 +298,15 @@ std::map get_types_to_requant(const std::string& devi return {}; } -bool is_naive(struct ggml_cgraph* cgraph) { +bool is_naive(ggml_cgraph * cgraph) { constexpr int naive_graph_size_threshold = 20; return cgraph->n_nodes < naive_graph_size_threshold; } -enum ggml_status naive_compute(struct ggml_cgraph* cgraph, - ov::Core& core, - const std::string& device, - const ov::AnyMap& config) { +enum ggml_status 
naive_compute(ggml_cgraph * cgraph, + ov::Core & core, + const std::string & device, + const ov::AnyMap & config) { if (cgraph->n_nodes == 1 && (cgraph->nodes[0]->op == GGML_OP_NONE || cgraph->nodes[0]->op == GGML_OP_VIEW)) { return GGML_STATUS_SUCCESS; } @@ -343,7 +344,7 @@ enum ggml_status naive_compute(struct ggml_cgraph* cgraph, return GGML_STATUS_SUCCESS; } -ov::Tensor get_ov_input_tensor(std::shared_ptr ggml_decoder, const std::string& param_name) { +ov::Tensor get_ov_input_tensor(std::shared_ptr ggml_decoder, const std::string & param_name) { bool is_static = ggml_decoder->is_static(); bool is_first_token = ggml_decoder->is_first_token(); @@ -358,10 +359,10 @@ ov::Tensor get_ov_input_tensor(std::shared_ptr ggml_decoder, cons if (param_name == "inp_tokens" || param_name == "inp_pos") { if (is_first_token) { size_t context_size = ggml_decoder->get_context_size(); - const auto* input_tensor_ggml = ggml_decoder->get_input_ggml_tensor(param_name); + const auto * input_tensor_ggml = ggml_decoder->get_input_ggml_tensor(param_name); std::vector padded_data = pad_input(input_tensor_ggml, 1, context_size, 0); input_tensor = ov::Tensor(ov::element::i32, ov::Shape{1, 1, context_size}); - auto* data_ptr = input_tensor.data(); + auto * data_ptr = input_tensor.data(); std::copy(padded_data.begin(), padded_data.end(), data_ptr); } else { input_tensor = convert_ggml_input_to_ov(ggml_decoder, param_name); @@ -369,22 +370,22 @@ ov::Tensor get_ov_input_tensor(std::shared_ptr ggml_decoder, cons } else if (param_name.find("KQ_mask") == 0) { size_t context_size = ggml_decoder->get_context_size(); - const auto* input_tensor_ggml = ggml_decoder->get_input_ggml_tensor(param_name); + const auto * input_tensor_ggml = ggml_decoder->get_input_ggml_tensor(param_name); if (is_first_token) { std::vector padded_data = pad_input(input_tensor_ggml, context_size, context_size, -INFINITY); set_zero_diagonal(padded_data, context_size); input_tensor = ov::Tensor(ov::element::f32, ov::Shape{1, context_size, context_size}); - auto* data_ptr = input_tensor.data(); + auto * data_ptr = input_tensor.data(); std::copy(padded_data.begin(), padded_data.end(), data_ptr); } else { std::vector padded_data = pad_input(input_tensor_ggml, 1, context_size, -INFINITY); input_tensor = ov::Tensor(ov::element::f32, ov::Shape{1, 1, context_size}); - auto* data_ptr = input_tensor.data(); + auto * data_ptr = input_tensor.data(); std::copy(padded_data.begin(), padded_data.end(), data_ptr); } - } else if (const auto* op = ggml_decoder->get_tensor_used_op(ggml_decoder->get_tensor_from_name(param_name)); + } else if (const auto * op = ggml_decoder->get_tensor_used_op(ggml_decoder->get_tensor_from_name(param_name)); op && op->op == GGML_OP_SET_ROWS && is_static && is_first_token) { input_tensor = ov::Tensor(ov::element::i64, ov::Shape{1, 1, 1}); } else { @@ -394,8 +395,8 @@ ov::Tensor get_ov_input_tensor(std::shared_ptr ggml_decoder, cons return input_tensor; } -size_t checksum(const void* data, size_t size) { - const uint8_t* bytes = static_cast(data); +size_t checksum(const void * data, size_t size) { + const uint8_t * bytes = static_cast(data); size_t sum = 0; for (size_t i = 0; i < size; ++i) { sum += (uint8_t) i; @@ -408,36 +409,37 @@ size_t checksum(const void* data, size_t size) { #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wdeprecated-declarations" -void print_input_tensor_info(const std::string& name, const ov::Tensor& tensor) { +void print_input_tensor_info(const std::string & name, const ov::Tensor & tensor) { std::cout 
<< "Input name: " << name << ", Input shape: " << tensor.get_shape() << ", Address: " << tensor.data() << std::endl; switch (tensor.get_element_type()) { - case ov::element::f32: - std::cout << *(tensor.data()) << std::endl; - break; - case ov::element::f16: - std::cout << *(tensor.data()) << std::endl; - break; - case ov::element::i32: - for (size_t i = 0; i < tensor.get_size(); ++i) { - std::cout << tensor.data()[i] << " "; - } - std::cout << std::endl; - break; - case ov::element::i64: - std::cout << *(tensor.data()) << std::endl; - break; - default: - break; + case ov::element::f32: + std::cout << *(tensor.data()) << std::endl; + break; + case ov::element::f16: + std::cout << *(tensor.data()) << std::endl; + break; + case ov::element::i32: + for (size_t i = 0; i < tensor.get_size(); ++i) { + std::cout << tensor.data()[i] << " "; + } + std::cout << std::endl; + break; + case ov::element::i64: + std::cout << *(tensor.data()) << std::endl; + break; + default: + break; } } -void print_output_tensor_info(const std::string& name, const ov::Tensor& tensor, - std::map& output_dst) { +void print_output_tensor_info(const std::string & name, + const ov::Tensor & tensor, + std::map & output_dst) { std::cout << "Output name: " << name << ", Output shape: " << tensor.get_shape() << ", Address: " << output_dst[name] << std::endl; - auto print_float_stats = [](const std::string& type_name, size_t size, auto get_value) { + auto print_float_stats = [](const std::string & type_name, size_t size, auto get_value) { if (size == 0) { return; } @@ -467,13 +469,13 @@ void print_output_tensor_info(const std::string& name, const ov::Tensor& tensor, switch (tensor.get_element_type()) { case ov::element::f32: { - const float* data = tensor.data(); + const float * data = tensor.data(); size_t size = tensor.get_size(); print_float_stats("[f32]", size, [data](size_t i) { return data[i]; }); break; } case ov::element::f16: { - const ov::float16* data = tensor.data(); + const ov::float16 * data = tensor.data(); size_t size = tensor.get_size(); print_float_stats("[f16]", size, [data](size_t i) { return static_cast(data[i]); }); break; @@ -485,17 +487,17 @@ void print_output_tensor_info(const std::string& name, const ov::Tensor& tensor, #pragma GCC diagnostic pop -void set_zero_diagonal(std::vector& matrix, size_t dim) { +void set_zero_diagonal(std::vector & matrix, size_t dim) { for (size_t i = 0; i < dim; ++i) { matrix[i * dim + i] = 0.0f; } } -bool is_prefill(struct ggml_cgraph* cgraph) { +bool is_prefill(ggml_cgraph * cgraph) { for (int i = 0; i < cgraph->n_nodes; ++i) { - auto* op = cgraph->nodes[i]; + auto * op = cgraph->nodes[i]; for (int j = 0; j < GGML_MAX_SRC; ++j) { - auto* src = op->src[j]; + auto * src = op->src[j]; if (src == nullptr) { break; } diff --git a/ggml/src/ggml-openvino/utils.h b/ggml/src/ggml-openvino/utils.h index 42686c593b..22f5cc8c34 100644 --- a/ggml/src/ggml-openvino/utils.h +++ b/ggml/src/ggml-openvino/utils.h @@ -1,32 +1,32 @@ -#include -#include - #include "ggml-backend-impl.h" #include "ggml-decoder.h" #include "ggml-impl.h" -enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_cgraph* cgraph); +#include +#include -std::shared_ptr get_ggml_decoder(struct ggml_cgraph* cgraph, bool is_static, bool is_first_token); +enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph); -ov::Tensor convert_ggml_input_to_ov(std::shared_ptr ggml_decoder, const std::string& name); +std::shared_ptr get_ggml_decoder(struct ggml_cgraph * 
cgraph, bool is_static, bool is_first_token);
 
-std::map<std::string, void*> get_ggml_graph_output_dst(std::shared_ptr<GgmlOvDecoder> ggml_decoder);
+ov::Tensor convert_ggml_input_to_ov(std::shared_ptr<GgmlOvDecoder> ggml_decoder, const std::string & name);
 
-size_t checksum(const void* data, size_t size);
+std::map<std::string, void *> get_ggml_graph_output_dst(std::shared_ptr<GgmlOvDecoder> ggml_decoder);
 
-void print_input_tensor_info(const std::string& name, const ov::Tensor& tensor);
+size_t checksum(const void * data, size_t size);
 
-void print_output_tensor_info(const std::string& name,
-                              const ov::Tensor& tensor,
-                              std::map<std::string, void*>& output_dst);
+void print_input_tensor_info(const std::string & name, const ov::Tensor & tensor);
+
+void print_output_tensor_info(const std::string & name,
+                              const ov::Tensor & tensor,
+                              std::map<std::string, void *> & output_dst);
 
 template <typename T>
-std::vector<T> pad_input(const ggml_tensor* tensor, size_t padded_rows, size_t padded_cols, T pad_value) {
+std::vector<T> pad_input(const ggml_tensor * tensor, size_t padded_rows, size_t padded_cols, T pad_value) {
     std::vector<T> padded_data(padded_rows * padded_cols, pad_value);
     size_t rows = tensor->ne[1];
     size_t cols = tensor->ne[0];
-    T* data = static_cast<T*>(tensor->data);
+    T * data = static_cast<T *>(tensor->data);
 
     for (size_t i = 0; i < std::min(rows, padded_rows); ++i) {
         for (size_t j = 0; j < std::min(cols, padded_cols); ++j) {
@@ -36,18 +36,20 @@ std::vector pad_input(const ggml_tensor* tensor, size_t padded_rows, size_t p
     return padded_data;
 }
 
-void set_zero_diagonal(std::vector<float>& matrix, size_t dim);
+void set_zero_diagonal(std::vector<float> & matrix, size_t dim);
 
 bool is_prefill(struct ggml_cgraph * cgraph);
 
 ov::AnyMap get_npu_prefill_config();
 ov::AnyMap get_npu_generate_config();
 
-std::map<ggml_type, ExtraQuantType> get_types_to_requant(const std::string& device);
+std::map<ggml_type, ExtraQuantType> get_types_to_requant(const std::string & device);
 
-ov::Tensor get_ov_input_tensor(std::shared_ptr<GgmlOvDecoder> ggml_decoder, const std::string& param_name);
+ov::Tensor get_ov_input_tensor(std::shared_ptr<GgmlOvDecoder> ggml_decoder, const std::string & param_name);
 
-bool is_naive(struct ggml_cgraph* cgraph);
+bool is_naive(struct ggml_cgraph * cgraph);
 
-enum ggml_status naive_compute(struct ggml_cgraph* cgraph, ov::Core& core, const std::string& device,
-                               const ov::AnyMap& config);
+enum ggml_status naive_compute(struct ggml_cgraph * cgraph,
+                               ov::Core & core,
+                               const std::string & device,
+                               const ov::AnyMap & config);