Style: middle ptr and ref align, omit optional struct keyword
This commit is contained in:
parent
bd3093f90c
commit
eba8113dc4
|
|
@ -1,17 +1,17 @@
|
|||
#pragma once
|
||||
|
||||
#include "ggml.h"
|
||||
#include "ggml-backend.h"
|
||||
#include "ggml.h"
|
||||
|
||||
#include <cstring>
|
||||
#include <array>
|
||||
#include <cstring>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#define GGML_OPENVINO_NAME "OPENVINO"
|
||||
#define GGML_OPENVINO_MAX_DEVICES 16
|
||||
#define GGML_OPENVINO_NAME "OPENVINO"
|
||||
#define GGML_OPENVINO_MAX_DEVICES 16
|
||||
|
||||
// backend API
|
||||
GGML_BACKEND_API ggml_backend_t ggml_backend_openvino_init(int device);
|
||||
|
|
@ -28,7 +28,7 @@ GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_openvino_split_buffer_t
|
|||
// and GPU
|
||||
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_openvino_host_buffer_type(void);
|
||||
|
||||
GGML_BACKEND_API int ggml_backend_openvino_get_device_count(void);
|
||||
GGML_BACKEND_API int ggml_backend_openvino_get_device_count(void);
|
||||
// GGML_BACKEND_API void ggml_backend_openvino_get_device_description(int device, char * description,
|
||||
// size_t description_size);
|
||||
// GGML_BACKEND_API void ggml_backend_openvino_get_device_memory(int device, size_t * free, size_t * total);
|
||||
|
|
@ -42,13 +42,13 @@ struct ggml_openvino_device_info {
|
|||
int device_count;
|
||||
|
||||
struct openvino_device_info {
|
||||
int cc; // compute capability
|
||||
int nsm; // number of streaming multiprocessors
|
||||
size_t smpb; // max. shared memory per block
|
||||
size_t smpbo; // max. shared memory per block (with opt-in)
|
||||
bool vmm; // virtual memory support
|
||||
size_t vmm_granularity; // granularity of virtual memory
|
||||
size_t total_vram;
|
||||
int cc; // compute capability
|
||||
int nsm; // number of streaming multiprocessors
|
||||
size_t smpb; // max. shared memory per block
|
||||
size_t smpbo; // max. shared memory per block (with opt-in)
|
||||
bool vmm; // virtual memory support
|
||||
size_t vmm_granularity; // granularity of virtual memory
|
||||
size_t total_vram;
|
||||
};
|
||||
|
||||
openvino_device_info devices[GGML_OPENVINO_MAX_DEVICES] = {};
|
||||
|
|
|
|||
|
|
@ -2,12 +2,10 @@
|
|||
# Override root .clang-format
|
||||
AlignConsecutiveAssignments: false
|
||||
AlignConsecutiveDeclarations: false
|
||||
ReferenceAlignment: Left
|
||||
PointerAlignment: Left
|
||||
Cpp11BracedListStyle: true
|
||||
AccessModifierOffset: -4
|
||||
BinPackArguments: false
|
||||
SpacesInContainerLiterals: false
|
||||
BreakBeforeBraces: Attach
|
||||
AccessModifierOffset: -4
|
||||
IndentCaseBlocks: false
|
||||
IndentCaseLabels: false
|
||||
|
||||
|
|
@ -32,7 +30,15 @@ AllowShortIfStatementsOnASingleLine: Never
|
|||
AllowShortLambdasOnASingleLine: Inline
|
||||
AllowShortLoopsOnASingleLine: false
|
||||
AlwaysBreakBeforeMultilineStrings: true
|
||||
BinPackParameters: true
|
||||
# Treat CUDA keywords/attributes as "attribute macros" and avoid breaking lines inside them
|
||||
AttributeMacros:
|
||||
- __host__
|
||||
- __device__
|
||||
- __global__
|
||||
- __forceinline__
|
||||
- __launch_bounds__
|
||||
BinPackArguments: true
|
||||
BinPackParameters: false # OnePerLine
|
||||
BitFieldColonSpacing: Both
|
||||
# BreakAdjacentStringLiterals: true
|
||||
BreakAfterAttributes: Never
|
||||
|
|
@ -58,15 +64,18 @@ ExperimentalAutoDetectBinPacking: false
|
|||
FixNamespaceComments: true
|
||||
IncludeBlocks: Regroup
|
||||
IncludeCategories:
|
||||
- Regex: '^<.*\.h>'
|
||||
- Regex: '".*"'
|
||||
Priority: 1
|
||||
SortPriority: 0
|
||||
- Regex: '^<.*'
|
||||
- Regex: '^<.*\.h>'
|
||||
Priority: 2
|
||||
SortPriority: 0
|
||||
- Regex: '.*'
|
||||
- Regex: '^<.*'
|
||||
Priority: 3
|
||||
SortPriority: 0
|
||||
- Regex: '.*'
|
||||
Priority: 4
|
||||
SortPriority: 0
|
||||
IncludeIsMainRegex: '([-_](test|unittest))?$'
|
||||
IncludeIsMainSourceRegex: ''
|
||||
IndentAccessModifiers: false
|
||||
|
|
@ -100,6 +109,7 @@ PenaltyBreakString: 1000
|
|||
PenaltyBreakTemplateDeclaration: 10
|
||||
PenaltyExcessCharacter: 1000000
|
||||
PenaltyReturnTypeOnItsOwnLine: 200
|
||||
PointerAlignment: Middle
|
||||
QualifierAlignment: Left
|
||||
#QualifierOrder: ['static', 'inline', 'friend', 'constexpr', 'const', 'volatile', 'type', 'restrict']
|
||||
RawStringFormats:
|
||||
|
|
@ -113,6 +123,7 @@ RawStringFormats:
|
|||
- 'c++'
|
||||
- 'C++'
|
||||
CanonicalDelimiter: ''
|
||||
ReferenceAlignment: Middle
|
||||
ReflowComments: false # IndentOnly
|
||||
SeparateDefinitionBlocks: Always
|
||||
SortIncludes: CaseInsensitive
|
||||
|
|
|
|||
|
|
@ -1,5 +1,9 @@
|
|||
#include "ggml-decoder.h"
|
||||
|
||||
#include "ggml-backend-impl.h"
|
||||
#include "ggml-backend.h"
|
||||
#include "ggml-quants.hpp"
|
||||
|
||||
#include <ggml-impl.h>
|
||||
#include <ggml.h>
|
||||
|
||||
|
|
@ -32,13 +36,16 @@
|
|||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "ggml-backend-impl.h"
|
||||
#include "ggml-backend.h"
|
||||
#include "ggml-quants.hpp"
|
||||
|
||||
GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgraph, bool is_static, bool is_first_token,
|
||||
int context_size, int context_size_swa, int num_heads, int num_heads_kv, int head_size,
|
||||
const std::vector<int>& swa_layers) :
|
||||
GgmlOvDecoder::GgmlOvDecoder(ggml_tensor * node,
|
||||
ggml_cgraph * cgraph,
|
||||
bool is_static,
|
||||
bool is_first_token,
|
||||
int context_size,
|
||||
int context_size_swa,
|
||||
int num_heads,
|
||||
int num_heads_kv,
|
||||
int head_size,
|
||||
const std::vector<int> & swa_layers) :
|
||||
m_cgraph(cgraph),
|
||||
m_node(node),
|
||||
m_op_name(std::string(node->name)),
|
||||
|
|
@ -53,8 +60,9 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgrap
|
|||
set_input_output(node);
|
||||
}
|
||||
|
||||
GgmlOvDecoder::GgmlOvDecoder(struct ggml_cgraph* cgraph,
|
||||
std::map<std::string, std::shared_ptr<ov::Node>>& model_weights, bool is_static,
|
||||
GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph,
|
||||
std::map<std::string, std::shared_ptr<ov::Node>> & model_weights,
|
||||
bool is_static,
|
||||
bool is_first_token) :
|
||||
m_cgraph(cgraph),
|
||||
m_op_name(m_node ? std::string(m_node->name) : ""),
|
||||
|
|
@ -68,7 +76,7 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_cgraph* cgraph,
|
|||
set_llm_params();
|
||||
|
||||
for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) {
|
||||
auto* cur_node = cgraph->nodes[node_n];
|
||||
auto * cur_node = cgraph->nodes[node_n];
|
||||
m_nodes.push_back(cur_node);
|
||||
set_input_output(cur_node);
|
||||
}
|
||||
|
|
@ -76,12 +84,11 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_cgraph* cgraph,
|
|||
// add_extra_inputs();
|
||||
}
|
||||
|
||||
GgmlOvDecoder::GgmlOvDecoder(struct ggml_cgraph* cgraph,
|
||||
std::map<std::string, std::shared_ptr<ov::Node>>& model_weights) {
|
||||
GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph, std::map<std::string, std::shared_ptr<ov::Node>> & model_weights) {
|
||||
m_cgraph = cgraph;
|
||||
m_model_weights = model_weights;
|
||||
for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) {
|
||||
auto* cur_node = cgraph->nodes[node_n];
|
||||
auto * cur_node = cgraph->nodes[node_n];
|
||||
if (cur_node->op == GGML_OP_NONE) {
|
||||
continue;
|
||||
}
|
||||
|
|
@ -93,7 +100,7 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_cgraph* cgraph,
|
|||
// Called in GgmlOvDecoder constructor. Two cases: 1. constructing a decoder for the whole graph;
|
||||
// 2. constructing a decoder for a node;
|
||||
// 3. constructing a decoder for the whole graph naively (op test case)
|
||||
void GgmlOvDecoder::set_input_output(ggml_tensor* node, bool naive) {
|
||||
void GgmlOvDecoder::set_input_output(ggml_tensor * node, bool naive) {
|
||||
std::string node_name;
|
||||
if (node->op == GGML_OP_SET_ROWS) {
|
||||
// SET_ROWS updates the tensor in place. For later ov op that uses the
|
||||
|
|
@ -109,7 +116,7 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, bool naive) {
|
|||
m_outputs[node_name] = node;
|
||||
|
||||
for (int i = 0; i < GGML_MAX_SRC; i++) {
|
||||
auto* src = node->src[i];
|
||||
auto * src = node->src[i];
|
||||
if (src == nullptr) {
|
||||
continue;
|
||||
}
|
||||
|
|
@ -128,7 +135,7 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, bool naive) {
|
|||
}
|
||||
|
||||
} else if (!m_node && !src->view_src) {
|
||||
ggml_backend_buffer* buffer = src->buffer;
|
||||
ggml_backend_buffer * buffer = src->buffer;
|
||||
|
||||
if (buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY || src->flags & GGML_TENSOR_FLAG_INPUT) {
|
||||
// GGML_BACKEND_BUFFER_USAGE_ANY are kv caches
|
||||
|
|
@ -236,8 +243,8 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, bool naive) {
|
|||
}
|
||||
case GGML_OP_VIEW: {
|
||||
if (node->src[0]->op == GGML_OP_VIEW) {
|
||||
auto* src = node->src[0];
|
||||
auto* view_src = src->view_src;
|
||||
auto * src = node->src[0];
|
||||
auto * view_src = src->view_src;
|
||||
if (view_src->ne[1] != src->ne[2]) {
|
||||
throw std::runtime_error("Unsupported VIEW case");
|
||||
}
|
||||
|
|
@ -250,7 +257,7 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, bool naive) {
|
|||
}
|
||||
}
|
||||
|
||||
int extract_layer_from_name(const std::string& name) {
|
||||
int extract_layer_from_name(const std::string & name) {
|
||||
size_t pos1 = name.find("_l");
|
||||
assert(pos1 != std::string::npos);
|
||||
pos1 += 2;
|
||||
|
|
@ -265,10 +272,10 @@ int extract_layer_from_name(const std::string& name) {
|
|||
|
||||
void GgmlOvDecoder::set_llm_params() {
|
||||
for (int i = 0; i < m_cgraph->n_nodes; i++) {
|
||||
auto* node = m_cgraph->nodes[i];
|
||||
auto * node = m_cgraph->nodes[i];
|
||||
std::string name = std::string(node->name);
|
||||
if (node->op == GGML_OP_FLASH_ATTN_EXT) {
|
||||
auto* cache_k = node->src[1];
|
||||
auto * cache_k = node->src[1];
|
||||
cache_k = cache_k->view_src ? cache_k->view_src : cache_k;
|
||||
int layer = extract_layer_from_name(cache_k->name);
|
||||
|
||||
|
|
@ -290,7 +297,7 @@ void GgmlOvDecoder::set_llm_params() {
|
|||
}
|
||||
}
|
||||
|
||||
ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor* src) const {
|
||||
ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor * src) const {
|
||||
auto name = std::string(src->name);
|
||||
ov::PartialShape input_shape;
|
||||
if (name == "inp_tokens" || name == "inp_pos") {
|
||||
|
|
@ -323,7 +330,7 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor* src) co
|
|||
} else {
|
||||
input_shape = ov::PartialShape{1, -1, m_num_heads_kv, m_head_size};
|
||||
}
|
||||
} else if (const auto* op = get_tensor_used_op(src); op && op->op == GGML_OP_SET_ROWS) {
|
||||
} else if (const auto * op = get_tensor_used_op(src); op && op->op == GGML_OP_SET_ROWS) {
|
||||
input_shape = ov::PartialShape{1, 1, m_is_static ? 1 : -1};
|
||||
} else if (src->op == GGML_OP_VIEW) {
|
||||
// This case is added to make test-backend-ops work
|
||||
|
|
@ -342,9 +349,9 @@ void GgmlOvDecoder::add_extra_inputs() {
|
|||
// Update: not used anymore after the optimization of making kvcache dynamic (but breaks iSWA models)
|
||||
int64_t attention_size = -1;
|
||||
int64_t attention_size_swa = -1;
|
||||
for (const auto& node : m_nodes) {
|
||||
for (const auto & node : m_nodes) {
|
||||
if (node->op == GGML_OP_FLASH_ATTN_EXT) {
|
||||
auto* mask = node->src[3];
|
||||
auto * mask = node->src[3];
|
||||
std::string mask_name(mask->name);
|
||||
if (mask_name.find("KQ_mask") != 0) {
|
||||
throw std::runtime_error("Unexpected flash attention node: " + std::string(mask->name));
|
||||
|
|
@ -357,7 +364,7 @@ void GgmlOvDecoder::add_extra_inputs() {
|
|||
}
|
||||
}
|
||||
|
||||
auto create_attention_size_input = [this](const std::string& name, int64_t size) {
|
||||
auto create_attention_size_input = [this](const std::string & name, int64_t size) {
|
||||
auto param_node = std::make_shared<ov::op::v0::Parameter>(ov::element::i64, ov::Shape{1});
|
||||
param_node->set_friendly_name(name);
|
||||
param_node->output(0).get_tensor().set_names({name});
|
||||
|
|
@ -374,12 +381,12 @@ void GgmlOvDecoder::add_extra_inputs() {
|
|||
}
|
||||
}
|
||||
|
||||
const ggml_tensor* GgmlOvDecoder::get_tensor_used_op(const ggml_tensor* tensor) const {
|
||||
const ggml_tensor * GgmlOvDecoder::get_tensor_used_op(const ggml_tensor * tensor) const {
|
||||
if (tensor == nullptr) {
|
||||
return nullptr;
|
||||
}
|
||||
for (int i = 0; i < m_cgraph->n_nodes; i++) {
|
||||
const auto* node = m_cgraph->nodes[i];
|
||||
const auto * node = m_cgraph->nodes[i];
|
||||
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
||||
if (node->src[j] == tensor) {
|
||||
return node;
|
||||
|
|
@ -389,11 +396,11 @@ const ggml_tensor* GgmlOvDecoder::get_tensor_used_op(const ggml_tensor* tensor)
|
|||
return nullptr;
|
||||
}
|
||||
|
||||
const ggml_tensor* GgmlOvDecoder::get_tensor_from_name(const std::string& name) const {
|
||||
const ggml_tensor * GgmlOvDecoder::get_tensor_from_name(const std::string & name) const {
|
||||
for (int i = 0; i < m_cgraph->n_nodes; i++) {
|
||||
const auto* node = m_cgraph->nodes[i];
|
||||
const auto * node = m_cgraph->nodes[i];
|
||||
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
||||
const auto* src = node->src[j];
|
||||
const auto * src = node->src[j];
|
||||
if (src == nullptr) {
|
||||
break;
|
||||
}
|
||||
|
|
@ -407,7 +414,7 @@ const ggml_tensor* GgmlOvDecoder::get_tensor_from_name(const std::string& name)
|
|||
|
||||
std::map<std::string, std::string> GgmlOvDecoder::get_kv_param_res_names() const {
|
||||
std::map<std::string, std::string> kv_param_res_names;
|
||||
for (const auto& name : m_kv_names) {
|
||||
for (const auto & name : m_kv_names) {
|
||||
if (name.find("cache_k") == 0 || name.find("cache_v") == 0) {
|
||||
kv_param_res_names[name] = name;
|
||||
}
|
||||
|
|
@ -416,21 +423,22 @@ std::map<std::string, std::string> GgmlOvDecoder::get_kv_param_res_names() const
|
|||
}
|
||||
|
||||
std::map<std::string, std::shared_ptr<ov::Node>> GgmlOvDecoder::create_weight_nodes(
|
||||
struct ggml_cgraph* cgraph, std::map<ggml_type, ExtraQuantType> types_to_requantize) {
|
||||
ggml_cgraph * cgraph,
|
||||
std::map<ggml_type, ExtraQuantType> types_to_requantize) {
|
||||
std::map<std::string, std::shared_ptr<ov::Node>> model_weights;
|
||||
static std::mutex weights_mutex;
|
||||
auto* nodes = cgraph->nodes;
|
||||
auto * nodes = cgraph->nodes;
|
||||
auto n_nodes = cgraph->n_nodes;
|
||||
std::for_each(std::execution::par, nodes, nodes + n_nodes, [&](ggml_tensor* node) {
|
||||
std::for_each(std::execution::par, nodes, nodes + n_nodes, [&](ggml_tensor * node) {
|
||||
for (int i = 0; i < GGML_MAX_SRC; i++) {
|
||||
auto* src = node->src[i];
|
||||
auto * src = node->src[i];
|
||||
if (src == nullptr) {
|
||||
continue;
|
||||
}
|
||||
|
||||
std::string src_name(src->name);
|
||||
if (!src->view_src) {
|
||||
ggml_backend_buffer* buffer = src->buffer;
|
||||
ggml_backend_buffer * buffer = src->buffer;
|
||||
if (buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS || ggml_is_quantized(src->type)) {
|
||||
bool should_create = false;
|
||||
{
|
||||
|
|
@ -458,17 +466,10 @@ std::map<std::string, std::shared_ptr<ov::Node>> GgmlOvDecoder::create_weight_no
|
|||
return model_weights;
|
||||
}
|
||||
|
||||
std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor* tensor,
|
||||
std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor * tensor,
|
||||
std::optional<ExtraQuantType> requant_type) {
|
||||
std::set<ggml_type> weight_types = {GGML_TYPE_F32,
|
||||
GGML_TYPE_F16,
|
||||
GGML_TYPE_BF16,
|
||||
GGML_TYPE_Q8_0,
|
||||
GGML_TYPE_Q4_0,
|
||||
GGML_TYPE_Q4_1,
|
||||
GGML_TYPE_Q4_K,
|
||||
GGML_TYPE_Q5_K,
|
||||
GGML_TYPE_Q6_K};
|
||||
std::set<ggml_type> weight_types = {GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_BF16, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0,
|
||||
GGML_TYPE_Q4_1, GGML_TYPE_Q4_K, GGML_TYPE_Q5_K, GGML_TYPE_Q6_K};
|
||||
if (weight_types.find(tensor->type) == weight_types.end()) {
|
||||
throw std::runtime_error("Unexpected weight tensor type: " + std::string(tensor->name) + " with type " +
|
||||
ggml_type_name(tensor->type));
|
||||
|
|
@ -495,9 +496,8 @@ std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor* tensor,
|
|||
}
|
||||
|
||||
// Quantized case
|
||||
OPENVINO_ASSERT(
|
||||
tensor->extra == nullptr,
|
||||
"Unsupported weight tensor: " + std::string(tensor->name) + " Possibly this is a repacked quantized weights");
|
||||
OPENVINO_ASSERT(tensor->extra == nullptr, "Unsupported weight tensor: " + std::string(tensor->name) +
|
||||
" Possibly this is a repacked quantized weights");
|
||||
|
||||
if (requant_type.has_value()) {
|
||||
return requantize(tensor, requant_type.value());
|
||||
|
|
@ -518,11 +518,8 @@ std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor* tensor,
|
|||
weights_per_block = 32;
|
||||
}
|
||||
|
||||
OPENVINO_ASSERT(node_shape.back() % weights_per_block == 0,
|
||||
"[load_gguf] tensor ",
|
||||
tensor->name,
|
||||
" has incompatible last dim shape: ",
|
||||
node_shape.back());
|
||||
OPENVINO_ASSERT(node_shape.back() % weights_per_block == 0, "[load_gguf] tensor ", tensor->name,
|
||||
" has incompatible last dim shape: ", node_shape.back());
|
||||
|
||||
ov::Tensor weights(weight_type, node_shape);
|
||||
// For scales and biases
|
||||
|
|
@ -557,7 +554,7 @@ std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor* tensor,
|
|||
return weight_node.get_node_shared_ptr();
|
||||
}
|
||||
|
||||
void GgmlOvDecoder::dump_cgraph(const struct ggml_cgraph* cgraph, std::string& filename) {
|
||||
void GgmlOvDecoder::dump_cgraph(const ggml_cgraph * cgraph, std::string & filename) {
|
||||
std::ofstream file(filename);
|
||||
if (!file.is_open()) {
|
||||
std::cerr << "Failed to open file" << std::endl;
|
||||
|
|
@ -576,7 +573,7 @@ void GgmlOvDecoder::dump_cgraph(const struct ggml_cgraph* cgraph, std::string& f
|
|||
<< std::setw(50) << "stride"
|
||||
<< "\n";
|
||||
for (int i = 0; i < cgraph->n_nodes; i++) {
|
||||
struct ggml_tensor * node = cgraph->nodes[i];
|
||||
ggml_tensor * node = cgraph->nodes[i];
|
||||
|
||||
file << " - " << std::setw(3) << i << ": [ "
|
||||
<< std::setw(5) << node->ne[0] << ", "
|
||||
|
|
@ -614,7 +611,7 @@ void GgmlOvDecoder::dump_cgraph(const struct ggml_cgraph* cgraph, std::string& f
|
|||
|
||||
file << "n_leafs = " << cgraph->n_leafs << "\n";
|
||||
for (int i = 0; i < cgraph->n_leafs; i++) {
|
||||
struct ggml_tensor * node = cgraph->leafs[i];
|
||||
ggml_tensor * node = cgraph->leafs[i];
|
||||
|
||||
file << " - " << std::setw(3) << i << ": [ "
|
||||
<< std::setw(5) << node->ne[0] << ", "
|
||||
|
|
@ -628,10 +625,10 @@ void GgmlOvDecoder::dump_cgraph(const struct ggml_cgraph* cgraph, std::string& f
|
|||
file.close();
|
||||
}
|
||||
|
||||
void print_tensor_address_map(const struct ggml_cgraph* cgraph) {
|
||||
std::map<void*, std::vector<std::string>> address_map;
|
||||
void print_tensor_address_map(const ggml_cgraph * cgraph) {
|
||||
std::map<void *, std::vector<std::string>> address_map;
|
||||
for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) {
|
||||
auto* node = cgraph->nodes[node_n];
|
||||
auto * node = cgraph->nodes[node_n];
|
||||
if (node->data) {
|
||||
auto it = address_map.find(node->data);
|
||||
if (it == address_map.end()) {
|
||||
|
|
@ -640,16 +637,16 @@ void print_tensor_address_map(const struct ggml_cgraph* cgraph) {
|
|||
address_map[node->data].push_back(node->name);
|
||||
}
|
||||
}
|
||||
for (const auto& pair : address_map) {
|
||||
for (const auto & pair : address_map) {
|
||||
std::cout << "Address: " << pair.first << std::endl;
|
||||
for (const auto& name : pair.second) {
|
||||
for (const auto & name : pair.second) {
|
||||
std::cout << name << " ; ";
|
||||
}
|
||||
std::cout << std::endl << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<size_t> GgmlOvDecoder::get_shape(const ggml_tensor* tensor) {
|
||||
std::vector<size_t> GgmlOvDecoder::get_shape(const ggml_tensor * tensor) {
|
||||
std::vector<size_t> shape;
|
||||
for (int i = GGML_MAX_DIMS - 2; i >= 0; --i) {
|
||||
shape.push_back(static_cast<size_t>(tensor->ne[i]));
|
||||
|
|
@ -657,7 +654,7 @@ std::vector<size_t> GgmlOvDecoder::get_shape(const ggml_tensor* tensor) {
|
|||
return shape;
|
||||
}
|
||||
|
||||
std::vector<size_t> GgmlOvDecoder::get_stride(const ggml_tensor* tensor) {
|
||||
std::vector<size_t> GgmlOvDecoder::get_stride(const ggml_tensor * tensor) {
|
||||
std::vector<size_t> stride;
|
||||
for (int i = GGML_MAX_DIMS - 2; i >= 0; --i) {
|
||||
stride.push_back(static_cast<size_t>(tensor->nb[i]));
|
||||
|
|
@ -665,7 +662,7 @@ std::vector<size_t> GgmlOvDecoder::get_stride(const ggml_tensor* tensor) {
|
|||
return stride;
|
||||
}
|
||||
|
||||
ov::element::Type GgmlOvDecoder::get_ov_type(const ggml_tensor* tensor) {
|
||||
ov::element::Type GgmlOvDecoder::get_ov_type(const ggml_tensor * tensor) {
|
||||
switch (tensor->type) {
|
||||
case GGML_TYPE_F64:
|
||||
return ov::element::f64;
|
||||
|
|
@ -688,15 +685,15 @@ ov::element::Type GgmlOvDecoder::get_ov_type(const ggml_tensor* tensor) {
|
|||
}
|
||||
}
|
||||
|
||||
ov::PartialShape GgmlOvDecoder::get_input_shape(const std::string& name) const {
|
||||
ov::PartialShape GgmlOvDecoder::get_input_shape(const std::string & name) const {
|
||||
return ov::PartialShape(get_shape(m_inputs.at(name)));
|
||||
}
|
||||
|
||||
std::vector<size_t> GgmlOvDecoder::get_input_stride(const std::string& name) const {
|
||||
std::vector<size_t> GgmlOvDecoder::get_input_stride(const std::string & name) const {
|
||||
return get_stride(m_inputs.at(name));
|
||||
}
|
||||
|
||||
ov::element::Type GgmlOvDecoder::get_input_type(const std::string& name) const {
|
||||
ov::element::Type GgmlOvDecoder::get_input_type(const std::string & name) const {
|
||||
return get_ov_type(m_inputs.at(name));
|
||||
}
|
||||
|
||||
|
|
@ -704,7 +701,7 @@ size_t GgmlOvDecoder::get_input_size() const {
|
|||
return m_input_names.size();
|
||||
}
|
||||
|
||||
std::string& GgmlOvDecoder::get_input_name(size_t index) const {
|
||||
std::string & GgmlOvDecoder::get_input_name(size_t index) const {
|
||||
m_name = m_input_names[index];
|
||||
return m_name;
|
||||
}
|
||||
|
|
@ -713,19 +710,19 @@ std::vector<std::string> GgmlOvDecoder::get_input_names() const {
|
|||
return m_input_names;
|
||||
}
|
||||
|
||||
std::vector<size_t> GgmlOvDecoder::get_output_stride(const std::string& name) const {
|
||||
std::vector<size_t> GgmlOvDecoder::get_output_stride(const std::string & name) const {
|
||||
return get_stride(m_outputs.at(name));
|
||||
}
|
||||
|
||||
ov::PartialShape GgmlOvDecoder::get_output_shape(const std::string& name) const {
|
||||
ov::PartialShape GgmlOvDecoder::get_output_shape(const std::string & name) const {
|
||||
return ov::PartialShape(get_shape(m_outputs.at(name)));
|
||||
}
|
||||
|
||||
ov::element::Type GgmlOvDecoder::get_output_type(const std::string& name) const {
|
||||
ov::element::Type GgmlOvDecoder::get_output_type(const std::string & name) const {
|
||||
return get_ov_type(m_outputs.at(name));
|
||||
}
|
||||
|
||||
std::string& GgmlOvDecoder::get_output_name(size_t index) const {
|
||||
std::string & GgmlOvDecoder::get_output_name(size_t index) const {
|
||||
m_name = std::string(m_output_names[index]);
|
||||
return m_name;
|
||||
}
|
||||
|
|
@ -734,35 +731,28 @@ std::vector<std::string> GgmlOvDecoder::get_output_names() const {
|
|||
return m_output_names;
|
||||
}
|
||||
|
||||
const std::string& GgmlOvDecoder::get_op_name() const {
|
||||
const std::string & GgmlOvDecoder::get_op_name() const {
|
||||
return m_op_name;
|
||||
}
|
||||
|
||||
int32_t* GgmlOvDecoder::get_input_op_params(const std::string& name) const {
|
||||
int32_t * GgmlOvDecoder::get_input_op_params(const std::string & name) const {
|
||||
return m_inputs.at(name)->op_params;
|
||||
}
|
||||
|
||||
int32_t* GgmlOvDecoder::get_output_op_params(const std::string& name) const {
|
||||
int32_t * GgmlOvDecoder::get_output_op_params(const std::string & name) const {
|
||||
return m_outputs.at(name)->op_params;
|
||||
}
|
||||
|
||||
void GgmlOvDecoder::visit_subgraph(std::function<void(std::shared_ptr<GgmlDecoder>)> node_visitor) const {
|
||||
for (const auto& node : m_nodes) {
|
||||
auto decoder = std::make_shared<GgmlOvDecoder>(node,
|
||||
m_cgraph,
|
||||
m_is_static,
|
||||
m_is_first_token,
|
||||
m_context_size,
|
||||
m_context_size_swa,
|
||||
m_num_heads,
|
||||
m_num_heads_kv,
|
||||
m_head_size,
|
||||
m_swa_layers);
|
||||
for (const auto & node : m_nodes) {
|
||||
auto decoder =
|
||||
std::make_shared<GgmlOvDecoder>(node, m_cgraph, m_is_static, m_is_first_token, m_context_size,
|
||||
m_context_size_swa, m_num_heads, m_num_heads_kv, m_head_size, m_swa_layers);
|
||||
node_visitor(decoder);
|
||||
}
|
||||
}
|
||||
|
||||
const std::string& GgmlOvDecoder::get_op_type() const {
|
||||
const std::string & GgmlOvDecoder::get_op_type() const {
|
||||
static const std::map<ggml_op, std::string> ops = {
|
||||
{GGML_OP_NONE, "GGML_OP_NONE" },
|
||||
{GGML_OP_ACC, "GGML_OP_ACC" },
|
||||
|
|
|
|||
|
|
@ -1,5 +1,9 @@
|
|||
#pragma once
|
||||
|
||||
#include "ggml-quants.hpp"
|
||||
#include "ggml.h"
|
||||
#include "openvino/decoder.hpp"
|
||||
|
||||
#include <cstdint>
|
||||
#include <map>
|
||||
#include <memory>
|
||||
|
|
@ -7,98 +11,99 @@
|
|||
#include <optional>
|
||||
#include <vector>
|
||||
|
||||
#include "ggml-quants.hpp"
|
||||
#include "ggml.h"
|
||||
#include "openvino/decoder.hpp"
|
||||
|
||||
class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {
|
||||
public:
|
||||
// Graph decoder
|
||||
GgmlOvDecoder(struct ggml_cgraph* cgraph, std::map<std::string, std::shared_ptr<ov::Node>>& model_weights,
|
||||
bool is_static, bool is_first_token);
|
||||
GgmlOvDecoder(ggml_cgraph * cgraph,
|
||||
std::map<std::string, std::shared_ptr<ov::Node>> & model_weights,
|
||||
bool is_static,
|
||||
bool is_first_token);
|
||||
|
||||
// Node decoder, called in GgmlOvDecoder::visit_subgraph
|
||||
GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgraph, bool is_static, bool is_first_token,
|
||||
int context_size, int context_size_swa, int num_heads, int num_heads_kv, int head_size,
|
||||
const std::vector<int>& swa_layers);
|
||||
GgmlOvDecoder(ggml_tensor * node,
|
||||
ggml_cgraph * cgraph,
|
||||
bool is_static,
|
||||
bool is_first_token,
|
||||
int context_size,
|
||||
int context_size_swa,
|
||||
int num_heads,
|
||||
int num_heads_kv,
|
||||
int head_size,
|
||||
const std::vector<int> & swa_layers);
|
||||
|
||||
// Naive graph decoder
|
||||
GgmlOvDecoder(struct ggml_cgraph* cgraph, std::map<std::string, std::shared_ptr<ov::Node>>& model_weights);
|
||||
GgmlOvDecoder(ggml_cgraph * cgraph, std::map<std::string, std::shared_ptr<ov::Node>> & model_weights);
|
||||
|
||||
virtual ov::Any get_attribute(const std::string& name) const override {
|
||||
virtual ov::Any get_attribute(const std::string & name) const override {
|
||||
return nullptr;
|
||||
GGML_UNUSED(name);
|
||||
}
|
||||
|
||||
virtual ov::PartialShape get_input_shape(const std::string& name) const override;
|
||||
virtual ov::PartialShape get_input_shape(const std::string & name) const override;
|
||||
|
||||
virtual std::vector<size_t> get_input_stride(const std::string& name) const override;
|
||||
virtual std::vector<size_t> get_input_stride(const std::string & name) const override;
|
||||
|
||||
virtual ov::element::Type get_input_type(const std::string& name) const override;
|
||||
virtual ov::element::Type get_input_type(const std::string & name) const override;
|
||||
|
||||
virtual size_t get_input_size() const override;
|
||||
|
||||
virtual void get_input_node(size_t input_port_idx,
|
||||
std::string& producer_name,
|
||||
std::string& producer_output_port_name,
|
||||
size_t& producer_output_port_index) const override {
|
||||
std::string & producer_name,
|
||||
std::string & producer_output_port_name,
|
||||
size_t & producer_output_port_index) const override {
|
||||
GGML_UNUSED(input_port_idx);
|
||||
GGML_UNUSED(producer_name);
|
||||
GGML_UNUSED(producer_output_port_name);
|
||||
GGML_UNUSED(producer_output_port_index);
|
||||
}
|
||||
|
||||
virtual std::string& get_input_name(size_t index) const override;
|
||||
virtual std::string & get_input_name(size_t index) const override;
|
||||
|
||||
virtual std::vector<std::string> get_input_names() const override;
|
||||
|
||||
virtual ov::PartialShape get_output_shape(const std::string& name) const override;
|
||||
virtual ov::PartialShape get_output_shape(const std::string & name) const override;
|
||||
|
||||
virtual std::vector<size_t> get_output_stride(const std::string& name) const override;
|
||||
virtual std::vector<size_t> get_output_stride(const std::string & name) const override;
|
||||
|
||||
virtual ov::element::Type get_output_type(const std::string& name) const override;
|
||||
virtual ov::element::Type get_output_type(const std::string & name) const override;
|
||||
|
||||
virtual int32_t* get_input_op_params(const std::string& name) const override;
|
||||
virtual int32_t * get_input_op_params(const std::string & name) const override;
|
||||
|
||||
virtual int32_t* get_output_op_params(const std::string& name) const override;
|
||||
virtual int32_t * get_output_op_params(const std::string & name) const override;
|
||||
|
||||
virtual std::string& get_output_name(size_t index) const override;
|
||||
virtual std::string & get_output_name(size_t index) const override;
|
||||
|
||||
virtual std::vector<std::string> get_output_names() const override;
|
||||
|
||||
virtual const std::string& get_op_type() const override;
|
||||
virtual const std::string & get_op_type() const override;
|
||||
|
||||
virtual const std::string& get_op_name() const override;
|
||||
virtual const std::string & get_op_name() const override;
|
||||
|
||||
virtual void visit_subgraph(std::function<void(std::shared_ptr<GgmlDecoder>)> node_visitor) const override;
|
||||
|
||||
const ggml_tensor* get_input_ggml_tensor(const std::string& name) const {
|
||||
return m_inputs.at(name);
|
||||
}
|
||||
const ggml_tensor * get_input_ggml_tensor(const std::string & name) const { return m_inputs.at(name); }
|
||||
|
||||
const ggml_tensor* get_output_ggml_tensor(const std::string& name) const {
|
||||
return m_outputs.at(name);
|
||||
}
|
||||
const ggml_tensor * get_output_ggml_tensor(const std::string & name) const { return m_outputs.at(name); }
|
||||
|
||||
virtual int get_op_case() const override {
|
||||
return m_op_case;
|
||||
}
|
||||
virtual int get_op_case() const override { return m_op_case; }
|
||||
|
||||
virtual const std::map<std::string, std::shared_ptr<ov::Node>>& get_model_inputs() const override {
|
||||
virtual const std::map<std::string, std::shared_ptr<ov::Node>> & get_model_inputs() const override {
|
||||
return m_model_inputs;
|
||||
}
|
||||
virtual const std::map<std::string, std::shared_ptr<ov::Node>>& get_model_extra_inputs() const override {
|
||||
|
||||
virtual const std::map<std::string, std::shared_ptr<ov::Node>> & get_model_extra_inputs() const override {
|
||||
return m_model_extra_inputs;
|
||||
}
|
||||
virtual const std::map<std::string, std::shared_ptr<ov::Tensor>>& get_model_extra_input_values() const {
|
||||
|
||||
virtual const std::map<std::string, std::shared_ptr<ov::Tensor>> & get_model_extra_input_values() const {
|
||||
return m_model_extra_input_values;
|
||||
}
|
||||
virtual const std::map<std::string, std::shared_ptr<ov::Node>>& get_model_weights() const override {
|
||||
|
||||
virtual const std::map<std::string, std::shared_ptr<ov::Node>> & get_model_weights() const override {
|
||||
return m_model_weights;
|
||||
}
|
||||
virtual const std::vector<std::string>& get_model_output_names() const override {
|
||||
return m_model_output_names;
|
||||
}
|
||||
|
||||
virtual const std::vector<std::string> & get_model_output_names() const override { return m_model_output_names; }
|
||||
|
||||
virtual int get_context_size() const override { return m_context_size; }
|
||||
|
||||
|
|
@ -114,7 +119,7 @@ public:
|
|||
|
||||
virtual int get_head_size() const override { return m_head_size; }
|
||||
|
||||
virtual int32_t* get_rope_params() const override { return m_rope_params; }
|
||||
virtual int32_t * get_rope_params() const override { return m_rope_params; }
|
||||
|
||||
virtual std::map<std::string, std::string> get_kv_param_res_names() const override;
|
||||
|
||||
|
|
@ -122,36 +127,39 @@ public:
|
|||
|
||||
virtual bool is_first_token() const override { return m_is_first_token; }
|
||||
|
||||
ov::PartialShape get_graph_input_shape(const ggml_tensor* src) const;
|
||||
ov::PartialShape get_graph_input_shape(const ggml_tensor * src) const;
|
||||
|
||||
static void dump_cgraph(const struct ggml_cgraph* cgraph, std::string& filename);
|
||||
static void dump_cgraph(const ggml_cgraph * cgraph, std::string & filename);
|
||||
|
||||
static std::shared_ptr<ov::Node> create_weight_node(ggml_tensor* tensor,
|
||||
static std::shared_ptr<ov::Node> create_weight_node(ggml_tensor * tensor,
|
||||
std::optional<ExtraQuantType> requant_type = std::nullopt);
|
||||
static std::map<std::string, std::shared_ptr<ov::Node>> create_weight_nodes(
|
||||
struct ggml_cgraph* cgraph, std::map<ggml_type, ExtraQuantType> types_to_requantize = {});
|
||||
|
||||
const ggml_tensor* get_tensor_used_op(const ggml_tensor* tensor) const;
|
||||
const ggml_tensor* get_tensor_from_name(const std::string& name) const;
|
||||
static std::map<std::string, std::shared_ptr<ov::Node>> create_weight_nodes(
|
||||
ggml_cgraph * cgraph,
|
||||
std::map<ggml_type, ExtraQuantType> types_to_requantize = {});
|
||||
|
||||
const ggml_tensor * get_tensor_used_op(const ggml_tensor * tensor) const;
|
||||
|
||||
const ggml_tensor * get_tensor_from_name(const std::string & name) const;
|
||||
|
||||
void clear_model_weights() { m_model_weights.clear(); }
|
||||
|
||||
private:
|
||||
void set_input_output(ggml_tensor* node, bool naive = false);
|
||||
void set_input_output(ggml_tensor * node, bool naive = false);
|
||||
void add_extra_inputs();
|
||||
static std::vector<size_t> get_shape(const ggml_tensor* tensor);
|
||||
static std::vector<size_t> get_stride(const ggml_tensor* tensor);
|
||||
static ov::element::Type get_ov_type(const ggml_tensor* tensor);
|
||||
static std::vector<size_t> get_shape(const ggml_tensor * tensor);
|
||||
static std::vector<size_t> get_stride(const ggml_tensor * tensor);
|
||||
static ov::element::Type get_ov_type(const ggml_tensor * tensor);
|
||||
|
||||
// set context_size, num_heads, etc
|
||||
void set_llm_params();
|
||||
|
||||
struct ggml_cgraph* m_cgraph = nullptr;
|
||||
ggml_tensor* m_node = nullptr;
|
||||
std::vector<ggml_tensor*> m_nodes;
|
||||
std::map<std::string, ggml_tensor*> m_inputs;
|
||||
ggml_cgraph * m_cgraph = nullptr;
|
||||
ggml_tensor * m_node = nullptr;
|
||||
std::vector<ggml_tensor *> m_nodes;
|
||||
std::map<std::string, ggml_tensor *> m_inputs;
|
||||
std::vector<std::string> m_input_names;
|
||||
std::map<std::string, ggml_tensor*> m_outputs;
|
||||
std::map<std::string, ggml_tensor *> m_outputs;
|
||||
std::vector<std::string> m_output_names;
|
||||
std::string m_op_name;
|
||||
mutable std::string m_name;
|
||||
|
|
@ -168,12 +176,12 @@ private:
|
|||
int m_num_heads;
|
||||
int m_num_heads_kv;
|
||||
int m_head_size;
|
||||
int32_t* m_rope_params;
|
||||
int32_t * m_rope_params;
|
||||
std::vector<std::string> m_kv_names;
|
||||
bool m_is_static = false;
|
||||
bool m_is_first_token;
|
||||
};
|
||||
|
||||
void print_tensor_address_map(const struct ggml_cgraph* cgraph);
|
||||
void print_tensor_address_map(const ggml_cgraph * cgraph);
|
||||
|
||||
int extract_layer_from_name(const std::string& name);
|
||||
int extract_layer_from_name(const std::string & name);
|
||||
|
|
|
|||
|
|
@ -1,5 +1,11 @@
|
|||
#include "ggml-openvino.h"
|
||||
|
||||
#include "ggml-backend-impl.h"
|
||||
#include "ggml-backend.h"
|
||||
#include "ggml-impl.h"
|
||||
#include "ggml-openvino/utils.h"
|
||||
#include "ggml.h"
|
||||
|
||||
#include <cstdint>
|
||||
#include <mutex>
|
||||
#include <openvino/openvino.hpp>
|
||||
|
|
@ -7,39 +13,36 @@
|
|||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "ggml-backend-impl.h"
|
||||
#include "ggml-backend.h"
|
||||
#include "ggml-impl.h"
|
||||
#include "ggml-openvino/utils.h"
|
||||
#include "ggml.h"
|
||||
|
||||
#define GGML_OPENVINO_MAX_STREAMS 8
|
||||
|
||||
struct ggml_backend_openvino_context {
|
||||
int device; // the device ID currently in use
|
||||
std::string name; // context Name
|
||||
std::string description; // context description
|
||||
int device; // the device ID currently in use
|
||||
std::string name; // context Name
|
||||
std::string description; // context description
|
||||
|
||||
// OpenVINO core components
|
||||
ov::Core core; // OpenVINO core interface
|
||||
std::shared_ptr<ov::CompiledModel> model; // compiled Model
|
||||
ov::InferRequest infer_request; // inference Request
|
||||
ov::Core core; // OpenVINO core interface
|
||||
std::shared_ptr<ov::CompiledModel> model; // compiled Model
|
||||
ov::InferRequest infer_request; // inference Request
|
||||
|
||||
// OpenVINO Multi-stream support
|
||||
static const int MAX_STREAMS = 8; // define the maximum number of flows
|
||||
std::vector<ov::InferRequest> streams; // used to support multi-stream reasoning
|
||||
int current_stream; // the currently active stream index
|
||||
static const int MAX_STREAMS = 8; // define the maximum number of flows
|
||||
std::vector<ov::InferRequest> streams; // used to support multi-stream reasoning
|
||||
int current_stream; // the currently active stream index
|
||||
|
||||
// state Management
|
||||
bool is_initialized; // initialize
|
||||
bool is_initialized; // initialize
|
||||
|
||||
ggml_backend_openvino_context()
|
||||
: device(0), name("OpenVINO"), description("OpenVINO Backend Context"),
|
||||
current_stream(0), is_initialized(false) {}
|
||||
ggml_backend_openvino_context() :
|
||||
device(0),
|
||||
name("OpenVINO"),
|
||||
description("OpenVINO Backend Context"),
|
||||
current_stream(0),
|
||||
is_initialized(false) {}
|
||||
};
|
||||
|
||||
static void ggml_backend_openvino_free(ggml_backend_t backend) {
|
||||
ggml_backend_openvino_context * ctx = (ggml_backend_openvino_context *)backend->context;
|
||||
ggml_backend_openvino_context * ctx = (ggml_backend_openvino_context *) backend->context;
|
||||
delete ctx;
|
||||
delete backend;
|
||||
}
|
||||
|
|
@ -49,8 +52,7 @@ static const char * ggml_backend_openvino_get_name(ggml_backend_t backend) {
|
|||
GGML_UNUSED(backend);
|
||||
}
|
||||
|
||||
static enum ggml_status
|
||||
ggml_backend_openvino_graph_compute(ggml_backend_t backend, struct ggml_cgraph *cgraph) {
|
||||
static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
|
||||
openvino_frontend_compute(backend, cgraph);
|
||||
|
||||
return GGML_STATUS_SUCCESS;
|
||||
|
|
@ -78,7 +80,8 @@ int ggml_backend_openvino_get_device_count() {
|
|||
}
|
||||
|
||||
static ggml_guid_t ggml_backend_openvino_guid(void) {
|
||||
static ggml_guid guid = { 0x12, 0xa8, 0xae, 0xf4, 0xc0, 0x1e, 0x61, 0x97, 0x8f, 0xeb, 0x33, 0x04, 0xa1, 0x33, 0x51, 0x2d };
|
||||
static ggml_guid guid = {0x12, 0xa8, 0xae, 0xf4, 0xc0, 0x1e, 0x61, 0x97,
|
||||
0x8f, 0xeb, 0x33, 0x04, 0xa1, 0x33, 0x51, 0x2d};
|
||||
return &guid;
|
||||
}
|
||||
|
||||
|
|
@ -95,7 +98,7 @@ GGML_BACKEND_API ggml_backend_t ggml_backend_openvino_init(int device) {
|
|||
return nullptr;
|
||||
}
|
||||
|
||||
ggml_backend_t openvino_backend = new ggml_backend {
|
||||
ggml_backend_t openvino_backend = new ggml_backend{
|
||||
/* .guid = */ ggml_backend_openvino_guid(),
|
||||
/* .interface = */ ggml_backend_openvino_interface,
|
||||
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_openvino_reg(), device),
|
||||
|
|
@ -134,15 +137,15 @@ struct ggml_backend_openvino_buffer_type_context {
|
|||
};
|
||||
|
||||
static const char * ggml_backend_openvino_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
|
||||
ggml_backend_openvino_buffer_type_context * ctx = (ggml_backend_openvino_buffer_type_context *)buft->context;
|
||||
ggml_backend_openvino_buffer_type_context * ctx = (ggml_backend_openvino_buffer_type_context *) buft->context;
|
||||
|
||||
return ctx->name.c_str();
|
||||
}
|
||||
|
||||
static bool ggml_backend_buft_is_openvino(ggml_backend_buffer_type_t buft) {
|
||||
return buft->iface.get_name == ggml_backend_openvino_buffer_type_get_name;
|
||||
}
|
||||
|
||||
|
||||
static const char * ggml_backend_openvino_split_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
|
||||
return GGML_OPENVINO_NAME "_Split";
|
||||
|
||||
|
|
@ -160,12 +163,12 @@ struct ggml_backend_openvino_device_context {
|
|||
};
|
||||
|
||||
static const char * ggml_backend_openvino_device_get_name(ggml_backend_dev_t dev) {
|
||||
ggml_backend_openvino_device_context * ctx = (ggml_backend_openvino_device_context *)dev->context;
|
||||
ggml_backend_openvino_device_context * ctx = (ggml_backend_openvino_device_context *) dev->context;
|
||||
return ctx->name.c_str();
|
||||
}
|
||||
|
||||
static const char * ggml_backend_openvino_device_get_description(ggml_backend_dev_t dev) {
|
||||
ggml_backend_openvino_device_context * ctx = (ggml_backend_openvino_device_context *)dev->context;
|
||||
ggml_backend_openvino_device_context * ctx = (ggml_backend_openvino_device_context *) dev->context;
|
||||
return ctx->description.c_str();
|
||||
}
|
||||
|
||||
|
|
@ -174,7 +177,7 @@ static void ggml_backend_openvino_device_get_memory(ggml_backend_dev_t dev, size
|
|||
GGML_ASSERT(dev->context != nullptr);
|
||||
GGML_ASSERT(free != nullptr);
|
||||
GGML_ASSERT(total != nullptr);
|
||||
ggml_backend_openvino_device_context * ctx = (ggml_backend_openvino_device_context *)dev->context;
|
||||
ggml_backend_openvino_device_context * ctx = (ggml_backend_openvino_device_context *) dev->context;
|
||||
GGML_ASSERT(ctx->device >= 0);
|
||||
// ggml_openvino_set_device(ctx->device);
|
||||
*total = 1;
|
||||
|
|
@ -187,9 +190,9 @@ static enum ggml_backend_dev_type ggml_backend_openvino_device_get_type(ggml_bac
|
|||
}
|
||||
|
||||
static void ggml_backend_openvino_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
|
||||
props->name = ggml_backend_openvino_device_get_name(dev);
|
||||
props->name = ggml_backend_openvino_device_get_name(dev);
|
||||
props->description = ggml_backend_openvino_device_get_description(dev);
|
||||
props->type = ggml_backend_openvino_device_get_type(dev);
|
||||
props->type = ggml_backend_openvino_device_get_type(dev);
|
||||
ggml_backend_openvino_device_get_memory(dev, &props->memory_free, &props->memory_total);
|
||||
|
||||
bool host_buffer = getenv("GGML_OPENVINO_NO_PINNED") == nullptr;
|
||||
|
|
@ -209,12 +212,12 @@ static void ggml_backend_openvino_device_get_props(ggml_backend_dev_t dev, ggml_
|
|||
|
||||
static ggml_backend_t ggml_backend_openvino_device_init(ggml_backend_dev_t dev, const char * params) {
|
||||
GGML_UNUSED(params);
|
||||
ggml_backend_openvino_device_context * ctx = (ggml_backend_openvino_device_context *)dev->context;
|
||||
ggml_backend_openvino_device_context * ctx = (ggml_backend_openvino_device_context *) dev->context;
|
||||
return ggml_backend_openvino_init(ctx->device);
|
||||
}
|
||||
|
||||
static ggml_backend_buffer_type_t ggml_backend_openvino_device_get_buffer_type(ggml_backend_dev_t dev) {
|
||||
ggml_backend_openvino_device_context * ctx = (ggml_backend_openvino_device_context *)dev->context;
|
||||
ggml_backend_openvino_device_context * ctx = (ggml_backend_openvino_device_context *) dev->context;
|
||||
return ggml_backend_openvino_buffer_type(ctx->device);
|
||||
}
|
||||
|
||||
|
|
@ -223,7 +226,10 @@ static ggml_backend_buffer_type_t ggml_backend_openvino_device_get_host_buffer_t
|
|||
return ggml_backend_openvino_host_buffer_type();
|
||||
}
|
||||
|
||||
static ggml_backend_buffer_t ggml_backend_openvino_device_buffer_from_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
|
||||
static ggml_backend_buffer_t ggml_backend_openvino_device_buffer_from_ptr(ggml_backend_dev_t dev,
|
||||
void * ptr,
|
||||
size_t size,
|
||||
size_t max_tensor_size) {
|
||||
GGML_UNUSED(dev);
|
||||
GGML_UNUSED(ptr);
|
||||
GGML_UNUSED(size);
|
||||
|
|
@ -231,7 +237,10 @@ static ggml_backend_buffer_t ggml_backend_openvino_device_buffer_from_ptr(ggml_b
|
|||
return nullptr;
|
||||
}
|
||||
|
||||
static ggml_backend_buffer_t ggml_backend_openvino_device_buffer_from_host_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
|
||||
static ggml_backend_buffer_t ggml_backend_openvino_device_buffer_from_host_ptr(ggml_backend_dev_t dev,
|
||||
void * ptr,
|
||||
size_t size,
|
||||
size_t max_tensor_size) {
|
||||
GGML_UNUSED(dev);
|
||||
GGML_UNUSED(ptr);
|
||||
GGML_UNUSED(size);
|
||||
|
|
@ -239,7 +248,7 @@ static ggml_backend_buffer_t ggml_backend_openvino_device_buffer_from_host_ptr(g
|
|||
return nullptr;
|
||||
}
|
||||
|
||||
static bool is_op_unsupported_case(const ggml_tensor* op) {
|
||||
static bool is_op_unsupported_case(const ggml_tensor * op) {
|
||||
switch (op->op) {
|
||||
case GGML_OP_SOFT_MAX: {
|
||||
if (op->src[2] != nullptr) {
|
||||
|
|
@ -248,9 +257,9 @@ static bool is_op_unsupported_case(const ggml_tensor* op) {
|
|||
}
|
||||
float scale = 1.0f;
|
||||
float max_bias = 0.0f;
|
||||
const auto* op_params = op->op_params;
|
||||
memcpy(&scale, (const float*) op_params + 0, sizeof(float));
|
||||
memcpy(&max_bias, (const float*) op_params + 1, sizeof(float));
|
||||
const auto * op_params = op->op_params;
|
||||
memcpy(&scale, (const float *) op_params + 0, sizeof(float));
|
||||
memcpy(&max_bias, (const float *) op_params + 1, sizeof(float));
|
||||
if (max_bias > 0) {
|
||||
GGML_LOG_WARN("OpenVINO backend does not support SOFT_MAX with max_bias > 0\n");
|
||||
return true;
|
||||
|
|
@ -265,10 +274,10 @@ static bool is_op_unsupported_case(const ggml_tensor* op) {
|
|||
float scale = 1.0f;
|
||||
float max_bias = 0.0f;
|
||||
float logit_softcap = 0.0f;
|
||||
const auto* op_params = op->op_params;
|
||||
memcpy(&scale, (const float*) op_params + 0, sizeof(float));
|
||||
memcpy(&max_bias, (const float*) op_params + 1, sizeof(float));
|
||||
memcpy(&logit_softcap, (const float*) op_params + 2, sizeof(float));
|
||||
const auto * op_params = op->op_params;
|
||||
memcpy(&scale, (const float *) op_params + 0, sizeof(float));
|
||||
memcpy(&max_bias, (const float *) op_params + 1, sizeof(float));
|
||||
memcpy(&logit_softcap, (const float *) op_params + 2, sizeof(float));
|
||||
if (max_bias > 0) {
|
||||
GGML_LOG_WARN("OpenVINO backend does not support FLASH_ATTN_EXT with max_bias > 0\n");
|
||||
return true;
|
||||
|
|
@ -303,7 +312,7 @@ static bool is_op_unsupported_case(const ggml_tensor* op) {
|
|||
break;
|
||||
}
|
||||
case GGML_OP_ROPE: {
|
||||
const int32_t* op_params = op->op_params;
|
||||
const int32_t * op_params = op->op_params;
|
||||
const int n_dims = op_params[1];
|
||||
const int mode = op_params[2];
|
||||
if (mode == GGML_ROPE_TYPE_MROPE || mode == GGML_ROPE_TYPE_VISION) {
|
||||
|
|
@ -311,8 +320,7 @@ static bool is_op_unsupported_case(const ggml_tensor* op) {
|
|||
return true;
|
||||
}
|
||||
if (n_dims != 0.0f && n_dims != op->src[0]->ne[0]) {
|
||||
GGML_LOG_WARN("OpenVINO backend does not support ROPE with n_dims %d != src[0]->ne[0] %ld\n",
|
||||
n_dims,
|
||||
GGML_LOG_WARN("OpenVINO backend does not support ROPE with n_dims %d != src[0]->ne[0] %ld\n", n_dims,
|
||||
op->src[0]->ne[0]);
|
||||
return true;
|
||||
}
|
||||
|
|
@ -333,8 +341,7 @@ static bool is_op_unsupported_case(const ggml_tensor* op) {
|
|||
GGML_LOG_WARN(
|
||||
"OpenVINO backend does not support ROPE with src[0]->view_src->ne[1] %ld != src[0]->ne[2] "
|
||||
"%ld\n",
|
||||
op->src[0]->view_src->ne[1],
|
||||
op->src[0]->ne[2]);
|
||||
op->src[0]->view_src->ne[1], op->src[0]->ne[2]);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
|
@ -346,39 +353,19 @@ static bool is_op_unsupported_case(const ggml_tensor* op) {
|
|||
return false;
|
||||
}
|
||||
|
||||
static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor* op) {
|
||||
static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
|
||||
GGML_ASSERT(dev->reg != nullptr);
|
||||
|
||||
static std::set<ggml_type> supported_types{GGML_TYPE_F32,
|
||||
GGML_TYPE_F16,
|
||||
GGML_TYPE_BF16,
|
||||
GGML_TYPE_I64,
|
||||
GGML_TYPE_I32,
|
||||
GGML_TYPE_Q4_0,
|
||||
GGML_TYPE_Q4_1,
|
||||
GGML_TYPE_Q4_K,
|
||||
GGML_TYPE_Q5_K,
|
||||
GGML_TYPE_Q8_0,
|
||||
GGML_TYPE_Q6_K};
|
||||
static std::set<ggml_type> supported_types{GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_BF16, GGML_TYPE_I64,
|
||||
GGML_TYPE_I32, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1, GGML_TYPE_Q4_K,
|
||||
GGML_TYPE_Q5_K, GGML_TYPE_Q8_0, GGML_TYPE_Q6_K};
|
||||
|
||||
static const std::set<ggml_op> supported_ops{GGML_OP_NONE,
|
||||
GGML_OP_ADD,
|
||||
GGML_OP_MUL,
|
||||
GGML_OP_MUL_MAT,
|
||||
GGML_OP_VIEW,
|
||||
GGML_OP_CONT,
|
||||
GGML_OP_RESHAPE,
|
||||
GGML_OP_PERMUTE,
|
||||
GGML_OP_TRANSPOSE,
|
||||
GGML_OP_GET_ROWS,
|
||||
GGML_OP_ROPE,
|
||||
GGML_OP_RMS_NORM,
|
||||
GGML_OP_SCALE,
|
||||
static const std::set<ggml_op> supported_ops{GGML_OP_NONE, GGML_OP_ADD, GGML_OP_MUL, GGML_OP_MUL_MAT, GGML_OP_VIEW,
|
||||
GGML_OP_CONT, GGML_OP_RESHAPE, GGML_OP_PERMUTE, GGML_OP_TRANSPOSE,
|
||||
GGML_OP_GET_ROWS, GGML_OP_ROPE, GGML_OP_RMS_NORM, GGML_OP_SCALE,
|
||||
// softmax is not updated due to replaced by flash_attn_ext
|
||||
// GGML_OP_SOFT_MAX,
|
||||
GGML_OP_SET_ROWS,
|
||||
GGML_OP_FLASH_ATTN_EXT,
|
||||
GGML_OP_CPY};
|
||||
GGML_OP_SET_ROWS, GGML_OP_FLASH_ATTN_EXT, GGML_OP_CPY};
|
||||
static const std::set<ggml_unary_op> supported_unary_ops{
|
||||
GGML_UNARY_OP_SILU,
|
||||
};
|
||||
|
|
@ -422,7 +409,7 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con
|
|||
return false;
|
||||
}
|
||||
for (int i = 0; i < GGML_MAX_SRC; i++) {
|
||||
auto* src = op->src[i];
|
||||
auto * src = op->src[i];
|
||||
if (src == nullptr) {
|
||||
break;
|
||||
}
|
||||
|
|
@ -483,13 +470,13 @@ static size_t ggml_backend_openvino_reg_get_device_count(ggml_backend_reg_t reg)
|
|||
GGML_UNUSED(reg);
|
||||
|
||||
// TODO
|
||||
ggml_backend_openvino_reg_context * ctx = (ggml_backend_openvino_reg_context *)reg->context;
|
||||
ggml_backend_openvino_reg_context * ctx = (ggml_backend_openvino_reg_context *) reg->context;
|
||||
|
||||
return ctx->devices.size();
|
||||
}
|
||||
|
||||
static ggml_backend_dev_t ggml_backend_openvino_reg_get_device(ggml_backend_reg_t reg, size_t index) {
|
||||
ggml_backend_openvino_reg_context * ctx = (ggml_backend_openvino_reg_context *)reg->context;
|
||||
ggml_backend_openvino_reg_context * ctx = (ggml_backend_openvino_reg_context *) reg->context;
|
||||
GGML_ASSERT(index < ctx->devices.size());
|
||||
return ctx->devices[index];
|
||||
// GGML_ASSERT(index == 0);
|
||||
|
|
@ -509,7 +496,7 @@ static ggml_backend_dev_t ggml_backend_openvino_reg_get_device(ggml_backend_reg_
|
|||
static void * ggml_backend_openvino_get_proc_address(ggml_backend_reg_t reg, const char * name) {
|
||||
GGML_UNUSED(reg);
|
||||
if (strcmp(name, "ggml_backend_split_buffer_type") == 0) {
|
||||
return (void *)ggml_backend_openvino_split_buffer_type;
|
||||
return (void *) ggml_backend_openvino_split_buffer_type;
|
||||
}
|
||||
// if (strcmp(name, "ggml_backend_register_host_buffer") == 0) {
|
||||
// return (void *)ggml_backend_openvino_register_host_buffer;
|
||||
|
|
@ -565,17 +552,16 @@ GGML_BACKEND_API ggml_backend_reg_t ggml_backend_openvino_reg(void) {
|
|||
// ggml_openvino_set_device(i);
|
||||
dev_ctx->description = ov::get_openvino_version().description;
|
||||
|
||||
ggml_backend_dev_t dev = new ggml_backend_device {
|
||||
/* .interface = */ ggml_backend_openvino_device_interface,
|
||||
/* .reg = */ ®,
|
||||
/* .context = */ dev_ctx
|
||||
};
|
||||
ggml_backend_dev_t dev =
|
||||
new ggml_backend_device{/* .interface = */ ggml_backend_openvino_device_interface,
|
||||
/* .reg = */ ®,
|
||||
/* .context = */ dev_ctx};
|
||||
ctx->devices.push_back(dev);
|
||||
}
|
||||
|
||||
reg = ggml_backend_reg{ /* .api_version = */ GGML_BACKEND_API_VERSION,
|
||||
/* .iface = */ ggml_backend_openvino_reg_interface,
|
||||
/* .context = */ ctx };
|
||||
reg = ggml_backend_reg{/* .api_version = */ GGML_BACKEND_API_VERSION,
|
||||
/* .iface = */ ggml_backend_openvino_reg_interface,
|
||||
/* .context = */ ctx};
|
||||
}
|
||||
|
||||
initialized = true;
|
||||
|
|
|
|||
|
|
@ -1,5 +1,9 @@
|
|||
#include "ggml-quants.hpp"
|
||||
|
||||
#include "ggml-common.h"
|
||||
#include "ggml-impl.h"
|
||||
#include "ggml.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cassert>
|
||||
#include <cmath>
|
||||
|
|
@ -24,11 +28,7 @@
|
|||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "ggml-common.h"
|
||||
#include "ggml-impl.h"
|
||||
#include "ggml.h"
|
||||
|
||||
void unpack_32_4(const uint8_t* data, uint8_t* dst) {
|
||||
void unpack_32_4(const uint8_t * data, uint8_t * dst) {
|
||||
std::fill_n(dst, 16, 0);
|
||||
for (int j = 0; j < 16; ++j) {
|
||||
uint8_t x = (data[j] & 0x0F);
|
||||
|
|
@ -44,18 +44,19 @@ void unpack_32_4(const uint8_t* data, uint8_t* dst) {
|
|||
|
||||
// Extracts (weight, scales, biases) from Q4_0 tensors.
|
||||
// Data layout is: |16 bit scale|32 x 4bit weights|.
|
||||
void extract_q4_0_data(const ggml_tensor* tensor,
|
||||
ov::Tensor& weights_arr,
|
||||
ov::Tensor& scales_arr,
|
||||
ov::Tensor& biases_arr) {
|
||||
void extract_q4_0_data(const ggml_tensor * tensor,
|
||||
ov::Tensor & weights_arr,
|
||||
ov::Tensor & scales_arr,
|
||||
ov::Tensor & biases_arr) {
|
||||
const uint64_t bytes_per_block = 18; // 2 bytes scale, 32x0.5 byte weights
|
||||
auto* data = static_cast<uint8_t*>(tensor->data);
|
||||
auto* weights = static_cast<uint8_t*>(weights_arr.data());
|
||||
auto* scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
|
||||
auto* biases = biases_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
|
||||
|
||||
auto * data = static_cast<uint8_t *>(tensor->data);
|
||||
auto * weights = static_cast<uint8_t *>(weights_arr.data());
|
||||
auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
|
||||
auto * biases = biases_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
|
||||
|
||||
ov::parallel_for(scales_arr.get_size(), [&](size_t i) {
|
||||
scales[i] = ov::float16::from_bits(*((uint16_t*)(data + i * bytes_per_block)));
|
||||
scales[i] = ov::float16::from_bits(*((uint16_t *) (data + i * bytes_per_block)));
|
||||
biases[i] = ov::float16(-8.f * static_cast<float>(scales[i]));
|
||||
unpack_32_4(data + i * bytes_per_block + 2, weights + i * 16);
|
||||
});
|
||||
|
|
@ -63,38 +64,40 @@ void extract_q4_0_data(const ggml_tensor* tensor,
|
|||
|
||||
// Extracts (weight, scales, biases) from Q4_1 tensors.
|
||||
// Data layout is: |16 bit scale|16 bit bias|32 x 4bit weights|.
|
||||
void extract_q4_1_data(const ggml_tensor* tensor,
|
||||
ov::Tensor& weights_arr,
|
||||
ov::Tensor& scales_arr,
|
||||
ov::Tensor& biases_arr) {
|
||||
void extract_q4_1_data(const ggml_tensor * tensor,
|
||||
ov::Tensor & weights_arr,
|
||||
ov::Tensor & scales_arr,
|
||||
ov::Tensor & biases_arr) {
|
||||
const uint64_t bytes_per_block = 20; // 2 bytes scale, 2 bytes bias, 32x0.5 byte weights
|
||||
auto* data = static_cast<uint8_t*>(tensor->data);
|
||||
auto* weights = static_cast<uint8_t*>(weights_arr.data());
|
||||
auto* scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
|
||||
auto* biases = biases_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
|
||||
|
||||
auto * data = static_cast<uint8_t *>(tensor->data);
|
||||
auto * weights = static_cast<uint8_t *>(weights_arr.data());
|
||||
auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
|
||||
auto * biases = biases_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
|
||||
ov::parallel_for(scales_arr.get_size(), [&](size_t i) {
|
||||
scales[i] = ov::float16::from_bits(*((uint16_t*)(data + i * bytes_per_block)));
|
||||
biases[i] = ov::float16::from_bits(*((uint16_t*)(data + i * bytes_per_block + 2)));
|
||||
scales[i] = ov::float16::from_bits(*((uint16_t *) (data + i * bytes_per_block)));
|
||||
biases[i] = ov::float16::from_bits(*((uint16_t *) (data + i * bytes_per_block + 2)));
|
||||
unpack_32_4(data + i * bytes_per_block + 4, weights + i * 16);
|
||||
});
|
||||
}
|
||||
|
||||
// Extracts (weight, scales, biases) from Q8_0 tensors.
|
||||
// Data layout is: |16 bit scale|32 x 8bit weights|.
|
||||
void extract_q8_0_data(const ggml_tensor* tensor,
|
||||
ov::Tensor& weights_arr,
|
||||
ov::Tensor& scales_arr,
|
||||
ov::Tensor& biases_arr) {
|
||||
void extract_q8_0_data(const ggml_tensor * tensor,
|
||||
ov::Tensor & weights_arr,
|
||||
ov::Tensor & scales_arr,
|
||||
ov::Tensor & biases_arr) {
|
||||
const uint64_t weights_per_block = 32;
|
||||
const uint64_t bytes_per_block = 34; // 2 bytes scale, 32x1 byte weights
|
||||
auto* data = static_cast<uint8_t*>(tensor->data);
|
||||
auto* weights = static_cast<uint8_t*>(weights_arr.data());
|
||||
auto* scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
|
||||
auto* biases = biases_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
|
||||
|
||||
auto * data = static_cast<uint8_t *>(tensor->data);
|
||||
auto * weights = static_cast<uint8_t *>(weights_arr.data());
|
||||
auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
|
||||
auto * biases = biases_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
|
||||
|
||||
ov::parallel_for(scales_arr.get_size(), [&](size_t i) {
|
||||
uint8_t* block_data = data + i * bytes_per_block;
|
||||
scales[i] = ov::float16::from_bits(*(uint16_t*) block_data);
|
||||
uint8_t * block_data = data + i * bytes_per_block;
|
||||
scales[i] = ov::float16::from_bits(*(uint16_t *) block_data);
|
||||
biases[i] = ov::float16(-128.f * static_cast<float>(scales[i]));
|
||||
for (size_t j = 0; j < weights_per_block; ++j) {
|
||||
uint8_t x = block_data[j + 2]; // j+2 to skip the scale bytes.
|
||||
|
|
@ -105,7 +108,7 @@ void extract_q8_0_data(const ggml_tensor* tensor,
|
|||
});
|
||||
}
|
||||
|
||||
void unpack_256_4(const uint8_t* data, uint8_t* dst) {
|
||||
void unpack_256_4(const uint8_t * data, uint8_t * dst) {
|
||||
// Initialize the output array with zeros
|
||||
std::fill_n(dst, 128, 0);
|
||||
|
||||
|
|
@ -123,26 +126,27 @@ void unpack_256_4(const uint8_t* data, uint8_t* dst) {
|
|||
}
|
||||
}
|
||||
|
||||
void extract_q4_k_data(const ggml_tensor* tensor,
|
||||
ov::Tensor& weights_arr,
|
||||
ov::Tensor& scales_arr,
|
||||
ov::Tensor& biases_arr) {
|
||||
void extract_q4_k_data(const ggml_tensor * tensor,
|
||||
ov::Tensor & weights_arr,
|
||||
ov::Tensor & scales_arr,
|
||||
ov::Tensor & biases_arr) {
|
||||
const uint64_t bytes_per_block = 2 + 2 + 12 + 128;
|
||||
const uint64_t n_super_block = tensor->nb[3] / bytes_per_block;
|
||||
auto* data = static_cast<uint8_t*>(tensor->data);
|
||||
auto* weights = static_cast<uint8_t*>(weights_arr.data());
|
||||
auto* scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
|
||||
auto* biases = biases_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
|
||||
|
||||
auto * data = static_cast<uint8_t *>(tensor->data);
|
||||
auto * weights = static_cast<uint8_t *>(weights_arr.data());
|
||||
auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
|
||||
auto * biases = biases_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
|
||||
|
||||
ov::parallel_for(n_super_block, [&](size_t i) {
|
||||
uint8_t* block_data = data + i * bytes_per_block;
|
||||
uint8_t * block_data = data + i * bytes_per_block;
|
||||
|
||||
// Extract scale factors and offsets
|
||||
float scale_scales = static_cast<float>(ov::float16::from_bits(*((uint16_t*)block_data)));
|
||||
float scale_biases = static_cast<float>(ov::float16::from_bits(*((uint16_t*)block_data + 1)));
|
||||
float scale_scales = static_cast<float>(ov::float16::from_bits(*((uint16_t *) block_data)));
|
||||
float scale_biases = static_cast<float>(ov::float16::from_bits(*((uint16_t *) block_data + 1)));
|
||||
|
||||
// Extract qs1 and qs2
|
||||
uint8_t* qs1 = block_data + 4;
|
||||
uint8_t * qs1 = block_data + 4;
|
||||
// uint8_t* qs2 = block_data + 16;
|
||||
|
||||
scales[i * 8] = ov::float16(scale_scales * static_cast<float>((*(qs1) & 0b111111)));
|
||||
|
|
@ -174,31 +178,32 @@ void extract_q4_k_data(const ggml_tensor* tensor,
|
|||
});
|
||||
}
|
||||
|
||||
void extract_q6_k_data(const ggml_tensor* tensor,
|
||||
ov::Tensor& weights_arr,
|
||||
ov::Tensor& scales_arr,
|
||||
ov::Tensor& biases_arr) {
|
||||
void extract_q6_k_data(const ggml_tensor * tensor,
|
||||
ov::Tensor & weights_arr,
|
||||
ov::Tensor & scales_arr,
|
||||
ov::Tensor & biases_arr) {
|
||||
const uint64_t bytes_per_block = 128 + 64 + 16 + 2;
|
||||
const uint64_t n_super_block = tensor->nb[3] / bytes_per_block;
|
||||
auto* data = static_cast<uint8_t*>(tensor->data);
|
||||
auto* weights = static_cast<uint8_t*>(weights_arr.data());
|
||||
auto* scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
|
||||
auto* biases = biases_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
|
||||
|
||||
auto * data = static_cast<uint8_t *>(tensor->data);
|
||||
auto * weights = static_cast<uint8_t *>(weights_arr.data());
|
||||
auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
|
||||
auto * biases = biases_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
|
||||
|
||||
ov::parallel_for(n_super_block, [&](size_t i) {
|
||||
uint8_t* block_data = data + i * bytes_per_block;
|
||||
uint8_t * block_data = data + i * bytes_per_block;
|
||||
|
||||
float scale_factor =
|
||||
static_cast<float>(ov::float16::from_bits(*((uint16_t*) block_data + 104))); // (128+64+16)/2
|
||||
static_cast<float>(ov::float16::from_bits(*((uint16_t *) block_data + 104))); // (128+64+16)/2
|
||||
|
||||
for (size_t j = 0; j < 16; j++) {
|
||||
scales[j + i * 16] =
|
||||
ov::float16(scale_factor * static_cast<float>(*((int8_t*) (block_data + 128 + 64 + j))));
|
||||
ov::float16(scale_factor * static_cast<float>(*((int8_t *) (block_data + 128 + 64 + j))));
|
||||
biases[j + i * 16] = ov::float16(-32.f * static_cast<float>(scales[j + i * 16]));
|
||||
}
|
||||
|
||||
uint8_t* ql = block_data;
|
||||
uint8_t* qh = block_data + 128;
|
||||
uint8_t * ql = block_data;
|
||||
uint8_t * qh = block_data + 128;
|
||||
|
||||
for (int64_t j = 0; j < 32; ++j) {
|
||||
weights[i * 256 + j] = (ql[j] & 0xF) | (((qh[j] >> 0) & 3) << 4);
|
||||
|
|
@ -213,7 +218,7 @@ void extract_q6_k_data(const ggml_tensor* tensor,
|
|||
});
|
||||
}
|
||||
|
||||
static inline void get_scale_min_k4(int j, const uint8_t* q, uint8_t* d, uint8_t* m) {
|
||||
static inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t * d, uint8_t * m) {
|
||||
if (j < 4) {
|
||||
*d = q[j] & 63;
|
||||
*m = q[j + 4] & 63;
|
||||
|
|
@ -223,24 +228,27 @@ static inline void get_scale_min_k4(int j, const uint8_t* q, uint8_t* d, uint8_t
|
|||
}
|
||||
}
|
||||
|
||||
void extract_q5_k_data(const ggml_tensor* tensor, ov::Tensor& weights_arr, ov::Tensor& scales_arr,
|
||||
ov::Tensor& biases_arr) {
|
||||
void extract_q5_k_data(const ggml_tensor * tensor,
|
||||
ov::Tensor & weights_arr,
|
||||
ov::Tensor & scales_arr,
|
||||
ov::Tensor & biases_arr) {
|
||||
const uint64_t bytes_per_block = 4 + 12 + 32 + 128;
|
||||
const uint64_t n_super_block = tensor->nb[3] / bytes_per_block;
|
||||
auto* data = static_cast<uint8_t*>(tensor->data);
|
||||
auto* weights = static_cast<uint8_t*>(weights_arr.data());
|
||||
auto* scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
|
||||
auto* biases = biases_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
|
||||
|
||||
auto * data = static_cast<uint8_t *>(tensor->data);
|
||||
auto * weights = static_cast<uint8_t *>(weights_arr.data());
|
||||
auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
|
||||
auto * biases = biases_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
|
||||
|
||||
ov::parallel_for(n_super_block, [&](size_t i) {
|
||||
uint8_t* block_data = data + i * bytes_per_block;
|
||||
uint8_t * block_data = data + i * bytes_per_block;
|
||||
|
||||
const float d = static_cast<float>(ov::float16::from_bits(*((uint16_t*) block_data)));
|
||||
const float min = static_cast<float>(ov::float16::from_bits(*((uint16_t*) block_data + 1)));
|
||||
const float d = static_cast<float>(ov::float16::from_bits(*((uint16_t *) block_data)));
|
||||
const float min = static_cast<float>(ov::float16::from_bits(*((uint16_t *) block_data + 1)));
|
||||
|
||||
const uint8_t* scales_data = block_data + 4; // 12 bytes of scales
|
||||
const uint8_t* qh = block_data + 4 + 12; // 32 bytes of high bits
|
||||
const uint8_t* ql = block_data + 4 + 12 + 32; // 128 bytes of low bits
|
||||
const uint8_t * scales_data = block_data + 4; // 12 bytes of scales
|
||||
const uint8_t * qh = block_data + 4 + 12; // 32 bytes of high bits
|
||||
const uint8_t * ql = block_data + 4 + 12 + 32; // 128 bytes of low bits
|
||||
|
||||
int is = 0;
|
||||
uint8_t u1 = 1;
|
||||
|
|
@ -286,7 +294,10 @@ void extract_q5_k_data(const ggml_tensor* tensor, ov::Tensor& weights_arr, ov::T
|
|||
|
||||
// TODO Reorder for make_intX_weights
|
||||
|
||||
ov::Output<ov::Node> make_int8_weights(ov::Tensor& weight, ov::Tensor& scales, ov::Tensor& biases, size_t group_size) {
|
||||
ov::Output<ov::Node> make_int8_weights(ov::Tensor & weight,
|
||||
ov::Tensor & scales,
|
||||
ov::Tensor & biases,
|
||||
size_t group_size) {
|
||||
ov::Shape orig_shape = weight.get_shape();
|
||||
|
||||
// Expand dimensions for scales and biases
|
||||
|
|
@ -303,18 +314,19 @@ ov::Output<ov::Node> make_int8_weights(ov::Tensor& weight, ov::Tensor& scales, o
|
|||
}
|
||||
|
||||
// Create graph nodes
|
||||
auto weights_node = std::make_shared<ov::op::v0::Constant>(
|
||||
ov::element::u8, packed_shape, static_cast<uint8_t*>(weight.data()), nullptr);
|
||||
auto weights_node = std::make_shared<ov::op::v0::Constant>(ov::element::u8, packed_shape,
|
||||
static_cast<uint8_t *>(weight.data()), nullptr);
|
||||
weights_node->get_rt_info()["__gguf_tensor_holder"] = weight;
|
||||
auto scales_f16 = std::make_shared<ov::op::v0::Constant>(scales);
|
||||
ov::Tensor biases_u8(ov::element::u8, scale_shape);
|
||||
|
||||
// Calculate zero point
|
||||
const ov::float16* bias_data = biases.data<ov::element_type_traits<ov::element::f16>::value_type>();
|
||||
const ov::float16* scale_data = scales.data<ov::element_type_traits<ov::element::f16>::value_type>();
|
||||
uint8_t* bias_u8_data = biases_u8.data<uint8_t>();
|
||||
const ov::float16 * bias_data = biases.data<ov::element_type_traits<ov::element::f16>::value_type>();
|
||||
const ov::float16 * scale_data = scales.data<ov::element_type_traits<ov::element::f16>::value_type>();
|
||||
uint8_t * bias_u8_data = biases_u8.data<uint8_t>();
|
||||
for (size_t i = 0; i < biases_u8.get_size(); ++i) {
|
||||
bias_u8_data[i] = (uint8_t)std::round(-1.f * static_cast<float>(bias_data[i]) / static_cast<float>(scale_data[i]));
|
||||
bias_u8_data[i] =
|
||||
(uint8_t) std::round(-1.f * static_cast<float>(bias_data[i]) / static_cast<float>(scale_data[i]));
|
||||
}
|
||||
|
||||
auto zero_point = std::make_shared<ov::op::v0::Constant>(biases_u8);
|
||||
|
|
@ -327,9 +339,7 @@ ov::Output<ov::Node> make_int8_weights(ov::Tensor& weight, ov::Tensor& scales, o
|
|||
auto weights_f16 = std::make_shared<ov::op::v0::Convert>(weights_node, ov::element::f16);
|
||||
auto zero_point_f16 = std::make_shared<ov::op::v0::Convert>(zero_point, ov::element::f16);
|
||||
|
||||
auto w_zp = std::make_shared<ov::op::v1::Subtract>(
|
||||
weights_f16, zero_point_f16, ov::op::AutoBroadcastType::NUMPY
|
||||
);
|
||||
auto w_zp = std::make_shared<ov::op::v1::Subtract>(weights_f16, zero_point_f16, ov::op::AutoBroadcastType::NUMPY);
|
||||
ov::Output<ov::Node> w_zp_s =
|
||||
std::make_shared<ov::op::v1::Multiply>(w_zp, scales_f16, ov::op::AutoBroadcastType::NUMPY);
|
||||
|
||||
|
|
@ -343,18 +353,17 @@ ov::Output<ov::Node> make_int8_weights(ov::Tensor& weight, ov::Tensor& scales, o
|
|||
return std::make_shared<ov::op::v0::Convert>(w_zp_s, ov::element::f32);
|
||||
}
|
||||
|
||||
ov::Output<ov::Node> make_int4_weights(ov::Tensor& weight, ov::Tensor& scales, ov::Tensor& biases, size_t group_size) {
|
||||
ov::Output<ov::Node> make_int4_weights(ov::Tensor & weight,
|
||||
ov::Tensor & scales,
|
||||
ov::Tensor & biases,
|
||||
size_t group_size) {
|
||||
ov::Shape orig_weight_shape = weight.get_shape();
|
||||
|
||||
// Expand dimensions for scales and biases
|
||||
ov::Shape scale_bias_shape = scales.get_shape();
|
||||
|
||||
// Create INT4 weight tensor
|
||||
ov::Shape packed_shape = {
|
||||
orig_weight_shape[0],
|
||||
orig_weight_shape[1] / group_size,
|
||||
group_size
|
||||
};
|
||||
ov::Shape packed_shape = {orig_weight_shape[0], orig_weight_shape[1] / group_size, group_size};
|
||||
|
||||
// Requantized channel-wise case
|
||||
if (packed_shape[1] == 1) {
|
||||
|
|
@ -365,18 +374,21 @@ ov::Output<ov::Node> make_int4_weights(ov::Tensor& weight, ov::Tensor& scales, o
|
|||
biases.set_shape(scale_bias_shape);
|
||||
}
|
||||
|
||||
auto weights_node = std::make_shared<ov::op::v0::Constant>(ov::element::u4, packed_shape, static_cast<uint8_t*>(weight.data()), nullptr);
|
||||
auto weights_node = std::make_shared<ov::op::v0::Constant>(ov::element::u4, packed_shape,
|
||||
static_cast<uint8_t *>(weight.data()), nullptr);
|
||||
weights_node->get_rt_info()["__gguf_tensor_holder"] = weight;
|
||||
auto weights_f16 = std::make_shared<ov::op::v0::Convert>(weights_node, ov::element::f16);
|
||||
|
||||
// Pack zero points: two subsequent values into one
|
||||
const ov::float16* bias_data = biases.data<ov::element_type_traits<ov::element::f16>::value_type>();
|
||||
const ov::float16* scale_data = scales.data<ov::element_type_traits<ov::element::f16>::value_type>();
|
||||
const ov::float16 * bias_data = biases.data<ov::element_type_traits<ov::element::f16>::value_type>();
|
||||
const ov::float16 * scale_data = scales.data<ov::element_type_traits<ov::element::f16>::value_type>();
|
||||
ov::Tensor zero_point_tensor(ov::element::u4, scale_bias_shape);
|
||||
uint8_t* zero_point_data = static_cast<uint8_t*>(zero_point_tensor.data());
|
||||
uint8_t * zero_point_data = static_cast<uint8_t *>(zero_point_tensor.data());
|
||||
for (size_t i = 0; i < zero_point_tensor.get_byte_size(); ++i) {
|
||||
uint8_t bias1 = (uint8_t)std::round(-1.f * static_cast<float>(bias_data[i * 2]) / static_cast<float>(scale_data[i * 2]));
|
||||
uint8_t bias2 = (uint8_t)std::round(-1.f * static_cast<float>(bias_data[i * 2 + 1]) / static_cast<float>(scale_data[i * 2 + 1]));
|
||||
uint8_t bias1 =
|
||||
(uint8_t) std::round(-1.f * static_cast<float>(bias_data[i * 2]) / static_cast<float>(scale_data[i * 2]));
|
||||
uint8_t bias2 = (uint8_t) std::round(-1.f * static_cast<float>(bias_data[i * 2 + 1]) /
|
||||
static_cast<float>(scale_data[i * 2 + 1]));
|
||||
zero_point_data[i] = (bias2 << 4) | (bias1 & 0x0F);
|
||||
}
|
||||
|
||||
|
|
@ -390,16 +402,15 @@ ov::Output<ov::Node> make_int4_weights(ov::Tensor& weight, ov::Tensor& scales, o
|
|||
auto scales_f16 = std::make_shared<ov::op::v0::Constant>(scales);
|
||||
|
||||
// Perform dequantization
|
||||
auto w_zp = std::make_shared<ov::op::v1::Subtract>(
|
||||
weights_f16, zero_points_f16, ov::op::AutoBroadcastType::NUMPY);
|
||||
auto w_zp = std::make_shared<ov::op::v1::Subtract>(weights_f16, zero_points_f16, ov::op::AutoBroadcastType::NUMPY);
|
||||
|
||||
ov::Output<ov::Node> w_zp_s =
|
||||
std::make_shared<ov::op::v1::Multiply>(w_zp, scales_f16, ov::op::AutoBroadcastType::NUMPY);
|
||||
|
||||
if (packed_shape.size() != 2) {
|
||||
// If not requantized channel-wise case, reshape back to original shape
|
||||
auto final_shape = std::make_shared<ov::op::v0::Constant>(
|
||||
ov::element::i64, ov::Shape{orig_weight_shape.size()}, orig_weight_shape);
|
||||
auto final_shape = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{orig_weight_shape.size()},
|
||||
orig_weight_shape);
|
||||
|
||||
w_zp_s = std::make_shared<ov::op::v1::Reshape>(w_zp_s, final_shape, false);
|
||||
}
|
||||
|
|
@ -407,7 +418,7 @@ ov::Output<ov::Node> make_int4_weights(ov::Tensor& weight, ov::Tensor& scales, o
|
|||
return std::make_shared<ov::op::v0::Convert>(w_zp_s, ov::element::f32);
|
||||
}
|
||||
|
||||
std::shared_ptr<ov::Node> requantize(const ggml_tensor* tensor, ExtraQuantType requant_type) {
|
||||
std::shared_ptr<ov::Node> requantize(const ggml_tensor * tensor, ExtraQuantType requant_type) {
|
||||
std::vector<float> weights_f32(tensor->ne[0] * tensor->ne[1]);
|
||||
ggml_get_type_traits(tensor->type)->to_float(tensor->data, weights_f32.data(), ggml_nelements(tensor));
|
||||
|
||||
|
|
@ -459,14 +470,18 @@ std::shared_ptr<ov::Node> requantize(const ggml_tensor* tensor, ExtraQuantType r
|
|||
return weight_node;
|
||||
}
|
||||
|
||||
void quantize_q4_0(const float* x, ov::Tensor& weights_arr, ov::Tensor& scales_arr, ov::Tensor& biases_arr, int64_t k,
|
||||
void quantize_q4_0(const float * x,
|
||||
ov::Tensor & weights_arr,
|
||||
ov::Tensor & scales_arr,
|
||||
ov::Tensor & biases_arr,
|
||||
int64_t k,
|
||||
int64_t qk) {
|
||||
assert(k % qk == 0);
|
||||
const int nb = k / qk;
|
||||
|
||||
auto* weights = static_cast<uint8_t*>(weights_arr.data());
|
||||
auto* scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
|
||||
auto* biases = biases_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
|
||||
auto * weights = static_cast<uint8_t *>(weights_arr.data());
|
||||
auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
|
||||
auto * biases = biases_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
|
||||
for (int i = 0; i < nb; i++) {
|
||||
float amax = 0.0f; // absolute max
|
||||
float max = 0.0f;
|
||||
|
|
@ -503,14 +518,18 @@ void quantize_q4_0(const float* x, ov::Tensor& weights_arr, ov::Tensor& scales_a
|
|||
}
|
||||
}
|
||||
|
||||
void quantize_q8_0(const float* x, ov::Tensor& weights_arr, ov::Tensor& scales_arr, ov::Tensor& biases_arr, int64_t k,
|
||||
void quantize_q8_0(const float * x,
|
||||
ov::Tensor & weights_arr,
|
||||
ov::Tensor & scales_arr,
|
||||
ov::Tensor & biases_arr,
|
||||
int64_t k,
|
||||
int64_t qk) {
|
||||
assert(k % qk == 0);
|
||||
const int nb = k / qk;
|
||||
|
||||
auto* weights = static_cast<uint8_t*>(weights_arr.data());
|
||||
auto* scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
|
||||
auto* biases = biases_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
|
||||
auto * weights = static_cast<uint8_t *>(weights_arr.data());
|
||||
auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
|
||||
auto * biases = biases_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
|
||||
for (int i = 0; i < nb; i++) {
|
||||
float amax = 0.0f; // absolute max
|
||||
|
||||
|
|
@ -534,14 +553,18 @@ void quantize_q8_0(const float* x, ov::Tensor& weights_arr, ov::Tensor& scales_a
|
|||
}
|
||||
}
|
||||
|
||||
void quantize_q8_1(const float* x, ov::Tensor& weights_arr, ov::Tensor& scales_arr, ov::Tensor& biases_arr, int64_t k,
|
||||
void quantize_q8_1(const float * x,
|
||||
ov::Tensor & weights_arr,
|
||||
ov::Tensor & scales_arr,
|
||||
ov::Tensor & biases_arr,
|
||||
int64_t k,
|
||||
int64_t qk) {
|
||||
assert(k % qk == 0);
|
||||
const int nb = k / qk;
|
||||
|
||||
auto* weights = static_cast<uint8_t*>(weights_arr.data());
|
||||
auto* scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
|
||||
auto* biases = biases_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
|
||||
auto * weights = static_cast<uint8_t *>(weights_arr.data());
|
||||
auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
|
||||
auto * biases = biases_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
|
||||
for (int i = 0; i < nb; i++) {
|
||||
float min = std::numeric_limits<float>::max();
|
||||
float max = std::numeric_limits<float>::lowest();
|
||||
|
|
|
|||
|
|
@ -10,11 +10,11 @@ namespace ggml {
|
|||
|
||||
FrontEnd::FrontEnd() {}
|
||||
|
||||
std::shared_ptr<Model> FrontEnd::convert(const InputModel::Ptr& model, bool naive) {
|
||||
std::shared_ptr<Model> FrontEnd::convert(const InputModel::Ptr & model, bool naive) {
|
||||
auto ggml_model = std::dynamic_pointer_cast<ggml::InputModel>(model);
|
||||
FRONT_END_GENERAL_CHECK(ggml_model, "Invalid input model");
|
||||
std::shared_ptr<Model> converted_model;
|
||||
const auto& supported_ops = get_supported_ops();
|
||||
const auto & supported_ops = get_supported_ops();
|
||||
{
|
||||
TranslateSession translate_session(model, supported_ops, naive);
|
||||
converted_model = translate_session.get_converted_model();
|
||||
|
|
|
|||
|
|
@ -6,9 +6,9 @@ namespace ov {
|
|||
namespace frontend {
|
||||
namespace ggml {
|
||||
|
||||
InputModel::InputModel(const std::shared_ptr<GgmlDecoder>& gdecoder) : m_decoder(gdecoder) {}
|
||||
InputModel::InputModel(const std::shared_ptr<GgmlDecoder> & gdecoder) : m_decoder(gdecoder) {}
|
||||
|
||||
const std::shared_ptr<GgmlDecoder>& InputModel::get_model_decoder() const {
|
||||
const std::shared_ptr<GgmlDecoder> & InputModel::get_model_decoder() const {
|
||||
return m_decoder;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -1,4 +1,8 @@
|
|||
|
||||
#include "../node_context.hpp"
|
||||
#include "../op_table.hpp"
|
||||
#include "../utils.hpp"
|
||||
|
||||
#include <climits>
|
||||
#include <cstdint>
|
||||
#include <memory>
|
||||
|
|
@ -6,16 +10,12 @@
|
|||
#include <openvino/op/slice.hpp>
|
||||
#include <vector>
|
||||
|
||||
#include "../node_context.hpp"
|
||||
#include "../op_table.hpp"
|
||||
#include "../utils.hpp"
|
||||
|
||||
namespace ov {
|
||||
namespace frontend {
|
||||
namespace ggml {
|
||||
namespace op {
|
||||
|
||||
OutputVector translate_cont(const NodeContext& context) {
|
||||
OutputVector translate_cont(const NodeContext & context) {
|
||||
num_inputs_check(context, 1, 1);
|
||||
|
||||
int op_case = context.get_op_case();
|
||||
|
|
@ -29,9 +29,7 @@ OutputVector translate_cont(const NodeContext& context) {
|
|||
// The input comes from a PERMUTE
|
||||
dst_shape[1] = -1;
|
||||
res = std::make_shared<ov::op::v1::Reshape>(
|
||||
context.get_input(0),
|
||||
ov::op::v0::Constant::create(ov::element::i64, {dst_shape.size()}, dst_shape),
|
||||
false);
|
||||
context.get_input(0), ov::op::v0::Constant::create(ov::element::i64, {dst_shape.size()}, dst_shape), false);
|
||||
} else if (op_case == 2) {
|
||||
// The input comes from a TRANSPOSE
|
||||
return {context.get_input(0)};
|
||||
|
|
|
|||
|
|
@ -1,15 +1,16 @@
|
|||
#include <memory>
|
||||
#include <openvino/op/convert.hpp>
|
||||
#include "../node_context.hpp"
|
||||
#include "../op_table.hpp"
|
||||
#include "../utils.hpp"
|
||||
|
||||
#include <memory>
|
||||
#include <openvino/op/convert.hpp>
|
||||
|
||||
namespace ov {
|
||||
namespace frontend {
|
||||
namespace ggml {
|
||||
namespace op {
|
||||
|
||||
OutputVector translate_cpy(const NodeContext& context) {
|
||||
OutputVector translate_cpy(const NodeContext & context) {
|
||||
auto res = std::make_shared<ov::op::v0::Convert>(context.get_input(0), context.get_output_type(0));
|
||||
return rename_outputs_with_suffix({res}, context.get_name());
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,3 +1,7 @@
|
|||
#include "../node_context.hpp"
|
||||
#include "../op_table.hpp"
|
||||
#include "../utils.hpp"
|
||||
|
||||
#include <memory>
|
||||
#include <openvino/op/broadcast.hpp>
|
||||
#include <openvino/op/concat.hpp>
|
||||
|
|
@ -8,24 +12,20 @@
|
|||
#include <openvino/op/unsqueeze.hpp>
|
||||
#include <string>
|
||||
|
||||
#include "../node_context.hpp"
|
||||
#include "../op_table.hpp"
|
||||
#include "../utils.hpp"
|
||||
|
||||
namespace ov {
|
||||
namespace frontend {
|
||||
namespace ggml {
|
||||
namespace op {
|
||||
|
||||
OutputVector translate_flash_attn_ext(const NodeContext& context) {
|
||||
OutputVector translate_flash_attn_ext(const NodeContext & context) {
|
||||
num_inputs_check(context, 4, 4);
|
||||
auto q_f32 = context.get_input(0);
|
||||
auto k = context.get_input(1);
|
||||
auto v = context.get_input(2);
|
||||
auto mask = context.get_input(3);
|
||||
|
||||
float* params = reinterpret_cast<float*>(context.get_output_op_params(0));
|
||||
float scale = params[0];
|
||||
float * params = reinterpret_cast<float *>(context.get_output_op_params(0));
|
||||
float scale = params[0];
|
||||
// float max_bias = params[1];
|
||||
// float logit_softcap = params[2];
|
||||
|
||||
|
|
@ -43,15 +43,14 @@ OutputVector translate_flash_attn_ext(const NodeContext& context) {
|
|||
auto token_len = get_dimensions(q, {2});
|
||||
auto kv_len = get_dimensions(k.get_node_shared_ptr(), {2});
|
||||
|
||||
auto zero_2d = ov::op::v0::Constant::create(ov::element::i64, {2}, {0,0});
|
||||
auto one_2d = ov::op::v0::Constant::create(ov::element::i64, {2}, {1,1});
|
||||
auto zero_2d = ov::op::v0::Constant::create(ov::element::i64, {2}, {0, 0});
|
||||
auto one_2d = ov::op::v0::Constant::create(ov::element::i64, {2}, {1, 1});
|
||||
auto zero_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
|
||||
auto two_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {2});
|
||||
auto axes = ov::op::v0::Constant::create(ov::element::i64, {2}, {1,2});
|
||||
auto axes = ov::op::v0::Constant::create(ov::element::i64, {2}, {1, 2});
|
||||
|
||||
auto stop = std::make_shared<ov::op::v0::Concat>(ov::OutputVector{token_len, kv_len}, 0);
|
||||
mask_sliced =
|
||||
std::make_shared<ov::op::v8::Slice>(mask, zero_2d, stop, one_2d, axes);
|
||||
mask_sliced = std::make_shared<ov::op::v8::Slice>(mask, zero_2d, stop, one_2d, axes);
|
||||
mask_sliced = std::make_shared<ov::op::v0::Unsqueeze>(mask_sliced, zero_1d);
|
||||
}
|
||||
|
||||
|
|
@ -72,8 +71,8 @@ OutputVector translate_flash_attn_ext(const NodeContext& context) {
|
|||
kv_unsqueezed = std::make_shared<ov::op::v0::Unsqueeze>(kv, unsqueeze_axes);
|
||||
|
||||
auto kv_last_two_dims = get_dimensions(kv.get_node_shared_ptr(), {1, 2});
|
||||
kv_broadcast_shape =
|
||||
std::make_shared<ov::op::v0::Concat>(ov::OutputVector{kv_batch_node, factor_node, kv_last_two_dims}, 0);
|
||||
kv_broadcast_shape = std::make_shared<ov::op::v0::Concat>(
|
||||
ov::OutputVector{kv_batch_node, factor_node, kv_last_two_dims}, 0);
|
||||
new_kv_shape =
|
||||
std::make_shared<ov::op::v0::Concat>(ov::OutputVector{q_batch_node, kv_last_two_dims}, 0);
|
||||
} else {
|
||||
|
|
@ -82,8 +81,8 @@ OutputVector translate_flash_attn_ext(const NodeContext& context) {
|
|||
kv_unsqueezed = std::make_shared<ov::op::v0::Unsqueeze>(kv, unsqueeze_axes);
|
||||
|
||||
auto kv_last_two_dims = get_dimensions(kv.get_node_shared_ptr(), {2, 3});
|
||||
kv_broadcast_shape =
|
||||
std::make_shared<ov::op::v0::Concat>(ov::OutputVector{one_1d, kv_batch_node, factor_node, kv_last_two_dims}, 0);
|
||||
kv_broadcast_shape = std::make_shared<ov::op::v0::Concat>(
|
||||
ov::OutputVector{one_1d, kv_batch_node, factor_node, kv_last_two_dims}, 0);
|
||||
new_kv_shape =
|
||||
std::make_shared<ov::op::v0::Concat>(ov::OutputVector{one_1d, q_batch_node, kv_last_two_dims}, 0);
|
||||
}
|
||||
|
|
@ -105,8 +104,8 @@ OutputVector translate_flash_attn_ext(const NodeContext& context) {
|
|||
res = std::make_shared<ov::op::v1::Transpose>(sdpa_f32,
|
||||
ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 0, 2}));
|
||||
} else {
|
||||
res = std::make_shared<ov::op::v1::Transpose>(sdpa_f32,
|
||||
ov::op::v0::Constant::create(ov::element::i64, {4}, {0, 2, 1, 3}));
|
||||
res = std::make_shared<ov::op::v1::Transpose>(
|
||||
sdpa_f32, ov::op::v0::Constant::create(ov::element::i64, {4}, {0, 2, 1, 3}));
|
||||
}
|
||||
return rename_outputs_with_suffix({res}, context.get_name());
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,3 +1,7 @@
|
|||
#include "../node_context.hpp"
|
||||
#include "../op_table.hpp"
|
||||
#include "../utils.hpp"
|
||||
|
||||
#include <openvino/core/node.hpp>
|
||||
#include <openvino/core/node_output.hpp>
|
||||
#include <openvino/op/constant.hpp>
|
||||
|
|
@ -5,16 +9,12 @@
|
|||
#include <openvino/op/gather.hpp>
|
||||
#include <openvino/op/squeeze.hpp>
|
||||
|
||||
#include "../node_context.hpp"
|
||||
#include "../op_table.hpp"
|
||||
#include "../utils.hpp"
|
||||
|
||||
namespace ov {
|
||||
namespace frontend {
|
||||
namespace ggml {
|
||||
namespace op {
|
||||
|
||||
OutputVector translate_get_rows(const NodeContext& context) {
|
||||
OutputVector translate_get_rows(const NodeContext & context) {
|
||||
num_inputs_check(context, 2, 2);
|
||||
|
||||
int op_case = context.get_op_case();
|
||||
|
|
|
|||
|
|
@ -1,3 +1,7 @@
|
|||
#include "../node_context.hpp"
|
||||
#include "../op_table.hpp"
|
||||
#include "../utils.hpp"
|
||||
|
||||
#include <memory>
|
||||
#include <openvino/core/node_output.hpp>
|
||||
#include <openvino/op/constant.hpp>
|
||||
|
|
@ -7,16 +11,12 @@
|
|||
#include <openvino/op/slice.hpp>
|
||||
#include <openvino/op/split.hpp>
|
||||
|
||||
#include "../node_context.hpp"
|
||||
#include "../op_table.hpp"
|
||||
#include "../utils.hpp"
|
||||
|
||||
namespace ov {
|
||||
namespace frontend {
|
||||
namespace ggml {
|
||||
namespace op {
|
||||
|
||||
OutputVector translate_glu_geglu(const NodeContext& context) {
|
||||
OutputVector translate_glu_geglu(const NodeContext & context) {
|
||||
num_inputs_check(context, 1, 2);
|
||||
|
||||
ov::Output<ov::Node> src0;
|
||||
|
|
@ -32,7 +32,7 @@ OutputVector translate_glu_geglu(const NodeContext& context) {
|
|||
src1 = split->output(1);
|
||||
}
|
||||
|
||||
int32_t* params = context.get_output_op_params(0);
|
||||
int32_t * params = context.get_output_op_params(0);
|
||||
const int32_t swapped = params[1];
|
||||
if (swapped) {
|
||||
std::swap(src0, src1);
|
||||
|
|
|
|||
|
|
@ -1,3 +1,7 @@
|
|||
#include "../node_context.hpp"
|
||||
#include "../op_table.hpp"
|
||||
#include "../utils.hpp"
|
||||
|
||||
#include <cstdint>
|
||||
#include <memory>
|
||||
#include <openvino/core/node_output.hpp>
|
||||
|
|
@ -7,16 +11,12 @@
|
|||
#include <openvino/op/slice.hpp>
|
||||
#include <openvino/op/split.hpp>
|
||||
|
||||
#include "../node_context.hpp"
|
||||
#include "../op_table.hpp"
|
||||
#include "../utils.hpp"
|
||||
|
||||
namespace ov {
|
||||
namespace frontend {
|
||||
namespace ggml {
|
||||
namespace op {
|
||||
|
||||
OutputVector translate_glu_swiglu(const NodeContext& context) {
|
||||
OutputVector translate_glu_swiglu(const NodeContext & context) {
|
||||
num_inputs_check(context, 1, 2);
|
||||
|
||||
ov::Output<ov::Node> src0;
|
||||
|
|
@ -32,7 +32,7 @@ OutputVector translate_glu_swiglu(const NodeContext& context) {
|
|||
src1 = split->output(1);
|
||||
}
|
||||
|
||||
int32_t* params = context.get_output_op_params(0);
|
||||
int32_t * params = context.get_output_op_params(0);
|
||||
const int32_t swapped = params[1];
|
||||
if (swapped) {
|
||||
std::swap(src0, src1);
|
||||
|
|
|
|||
|
|
@ -1,3 +1,7 @@
|
|||
#include "../node_context.hpp"
|
||||
#include "../op_table.hpp"
|
||||
#include "../utils.hpp"
|
||||
|
||||
#include <climits>
|
||||
#include <cstdint>
|
||||
#include <memory>
|
||||
|
|
@ -15,16 +19,12 @@
|
|||
#include <openvino/op/util/op_types.hpp>
|
||||
#include <vector>
|
||||
|
||||
#include "../node_context.hpp"
|
||||
#include "../op_table.hpp"
|
||||
#include "../utils.hpp"
|
||||
|
||||
namespace ov {
|
||||
namespace frontend {
|
||||
namespace ggml {
|
||||
namespace op {
|
||||
|
||||
OutputVector translate_mulmat(const NodeContext& context) {
|
||||
OutputVector translate_mulmat(const NodeContext & context) {
|
||||
num_inputs_check(context, 2, 2);
|
||||
|
||||
int op_case = context.get_op_case();
|
||||
|
|
|
|||
|
|
@ -1,3 +1,7 @@
|
|||
#include "../node_context.hpp"
|
||||
#include "../op_table.hpp"
|
||||
#include "../utils.hpp"
|
||||
|
||||
#include <climits>
|
||||
#include <cstdint>
|
||||
#include <memory>
|
||||
|
|
@ -9,16 +13,12 @@
|
|||
#include <openvino/op/transpose.hpp>
|
||||
#include <openvino/op/unsqueeze.hpp>
|
||||
|
||||
#include "../node_context.hpp"
|
||||
#include "../op_table.hpp"
|
||||
#include "../utils.hpp"
|
||||
|
||||
namespace ov {
|
||||
namespace frontend {
|
||||
namespace ggml {
|
||||
namespace op {
|
||||
|
||||
OutputVector translate_permute(const NodeContext& context) {
|
||||
OutputVector translate_permute(const NodeContext & context) {
|
||||
num_inputs_check(context, 1, 1);
|
||||
|
||||
int op_case = context.get_op_case();
|
||||
|
|
@ -28,15 +28,15 @@ OutputVector translate_permute(const NodeContext& context) {
|
|||
|
||||
if (op_case == 1) {
|
||||
if (context.is_static()) {
|
||||
res = std::make_shared<ov::op::v1::Transpose>(context.get_input(0),
|
||||
ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 0, 2}));
|
||||
res = std::make_shared<ov::op::v1::Transpose>(
|
||||
context.get_input(0), ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 0, 2}));
|
||||
} else {
|
||||
auto src = context.get_input(0);
|
||||
if (src.get_partial_shape().rank() == 3) {
|
||||
src = std::make_shared<ov::op::v0::Unsqueeze>(src, zero);
|
||||
}
|
||||
res = std::make_shared<ov::op::v1::Transpose>(src,
|
||||
ov::op::v0::Constant::create(ov::element::i64, {4}, {0, 2, 1, 3}));
|
||||
res = std::make_shared<ov::op::v1::Transpose>(
|
||||
src, ov::op::v0::Constant::create(ov::element::i64, {4}, {0, 2, 1, 3}));
|
||||
}
|
||||
} else {
|
||||
auto src = context.get_input(0);
|
||||
|
|
@ -47,7 +47,8 @@ OutputVector translate_permute(const NodeContext& context) {
|
|||
std::vector<int64_t> src_shape(src_shape_.begin(), src_shape_.end());
|
||||
auto src_reshaped = std::make_shared<ov::op::v1::Reshape>(
|
||||
src,
|
||||
ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector<int64_t>{-1, src_shape[1], src_shape[2]}),
|
||||
ov::op::v0::Constant::create(ov::element::i64, {3},
|
||||
std::vector<int64_t>{-1, src_shape[1], src_shape[2]}),
|
||||
false);
|
||||
res = std::make_shared<ov::op::v1::Transpose>(
|
||||
src_reshaped, ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 0, 2}));
|
||||
|
|
@ -55,8 +56,8 @@ OutputVector translate_permute(const NodeContext& context) {
|
|||
if (src.get_partial_shape().rank() == 3) {
|
||||
src = std::make_shared<ov::op::v0::Unsqueeze>(src, zero);
|
||||
}
|
||||
res = std::make_shared<ov::op::v1::Transpose>(src,
|
||||
ov::op::v0::Constant::create(ov::element::i64, {4}, {0, 2, 1, 3}));
|
||||
res = std::make_shared<ov::op::v1::Transpose>(
|
||||
src, ov::op::v0::Constant::create(ov::element::i64, {4}, {0, 2, 1, 3}));
|
||||
}
|
||||
}
|
||||
return rename_outputs_with_suffix({res}, context.get_name());
|
||||
|
|
|
|||
|
|
@ -1,3 +1,7 @@
|
|||
#include "../node_context.hpp"
|
||||
#include "../op_table.hpp"
|
||||
#include "../utils.hpp"
|
||||
|
||||
#include <cstdint>
|
||||
#include <memory>
|
||||
#include <openvino/core/node.hpp>
|
||||
|
|
@ -7,16 +11,12 @@
|
|||
#include <openvino/op/reshape.hpp>
|
||||
#include <vector>
|
||||
|
||||
#include "../node_context.hpp"
|
||||
#include "../op_table.hpp"
|
||||
#include "../utils.hpp"
|
||||
|
||||
namespace ov {
|
||||
namespace frontend {
|
||||
namespace ggml {
|
||||
namespace op {
|
||||
|
||||
OutputVector translate_reshape(const NodeContext& context) {
|
||||
OutputVector translate_reshape(const NodeContext & context) {
|
||||
num_inputs_check(context, 1, 1);
|
||||
if (context.get_input_shape(0) == context.get_output_shape(0)) {
|
||||
return {context.get_input(0)};
|
||||
|
|
@ -29,15 +29,11 @@ OutputVector translate_reshape(const NodeContext& context) {
|
|||
auto output_shape = context.get_output_shape(0).to_shape();
|
||||
std::shared_ptr<ov::Node> new_shape_node;
|
||||
if (op_case == 1) {
|
||||
new_shape_node =
|
||||
ov::op::v0::Constant::create(ov::element::i64,
|
||||
{3},
|
||||
std::vector<int64_t>{-1, (int64_t)output_shape[1], (int64_t)output_shape[2]});
|
||||
new_shape_node = ov::op::v0::Constant::create(
|
||||
ov::element::i64, {3}, std::vector<int64_t>{-1, (int64_t) output_shape[1], (int64_t) output_shape[2]});
|
||||
} else if (op_case == 2) {
|
||||
new_shape_node =
|
||||
ov::op::v0::Constant::create(ov::element::i64,
|
||||
{3},
|
||||
std::vector<int64_t>{(int64_t)output_shape[0], -1, (int64_t)output_shape[2]});
|
||||
new_shape_node = ov::op::v0::Constant::create(
|
||||
ov::element::i64, {3}, std::vector<int64_t>{(int64_t) output_shape[0], -1, (int64_t) output_shape[2]});
|
||||
} else if (op_case == 3) {
|
||||
new_shape_node =
|
||||
ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector<int64_t>{(int64_t) output_shape[0], -1, 1});
|
||||
|
|
|
|||
|
|
@ -1,3 +1,7 @@
|
|||
#include "../node_context.hpp"
|
||||
#include "../op_table.hpp"
|
||||
#include "../utils.hpp"
|
||||
|
||||
#include <memory>
|
||||
#include <openvino/op/add.hpp>
|
||||
#include <openvino/op/constant.hpp>
|
||||
|
|
@ -7,16 +11,12 @@
|
|||
#include <openvino/op/reduce_mean.hpp>
|
||||
#include <openvino/op/sqrt.hpp>
|
||||
|
||||
#include "../node_context.hpp"
|
||||
#include "../op_table.hpp"
|
||||
#include "../utils.hpp"
|
||||
|
||||
namespace ov {
|
||||
namespace frontend {
|
||||
namespace ggml {
|
||||
namespace op {
|
||||
|
||||
OutputVector translate_rms_norm(const NodeContext& context) {
|
||||
OutputVector translate_rms_norm(const NodeContext & context) {
|
||||
num_inputs_check(context, 1, 1);
|
||||
|
||||
auto input_node = context.get_input(0);
|
||||
|
|
|
|||
|
|
@ -1,3 +1,7 @@
|
|||
#include "../node_context.hpp"
|
||||
#include "../op_table.hpp"
|
||||
#include "../utils.hpp"
|
||||
|
||||
#include <cstdint>
|
||||
#include <memory>
|
||||
#include <openvino/core/node.hpp>
|
||||
|
|
@ -14,16 +18,12 @@
|
|||
#include <openvino/op/unsqueeze.hpp>
|
||||
#include <vector>
|
||||
|
||||
#include "../node_context.hpp"
|
||||
#include "../op_table.hpp"
|
||||
#include "../utils.hpp"
|
||||
|
||||
namespace ov {
|
||||
namespace frontend {
|
||||
namespace ggml {
|
||||
namespace op {
|
||||
|
||||
OutputVector translate_rope(const NodeContext& context) {
|
||||
OutputVector translate_rope(const NodeContext & context) {
|
||||
num_inputs_check(context, 2, 3);
|
||||
|
||||
int op_case = context.get_op_case();
|
||||
|
|
@ -32,7 +32,7 @@ OutputVector translate_rope(const NodeContext& context) {
|
|||
|
||||
auto data_node = context.get_input(0).get_node_shared_ptr();
|
||||
auto output_shape = context.get_output_shape(0).to_shape();
|
||||
int32_t* op_params = context.get_output_op_params(0);
|
||||
int32_t * op_params = context.get_output_op_params(0);
|
||||
|
||||
Output<Node> cos_theta_node;
|
||||
Output<Node> sin_theta_node;
|
||||
|
|
@ -85,7 +85,8 @@ OutputVector translate_rope(const NodeContext& context) {
|
|||
auto stack = std::make_shared<ov::op::v0::Concat>(OutputVector{first_half, second_half}, 3);
|
||||
res = std::make_shared<ov::op::v1::Reshape>(stack, std::make_shared<ov::op::v0::ShapeOf>(data_node), false);
|
||||
if (!(context.is_static())) {
|
||||
res = std::make_shared<ov::op::v0::Unsqueeze>(res, ov::op::v0::Constant::create(ov::element::i64, {1}, {0}));
|
||||
res =
|
||||
std::make_shared<ov::op::v0::Unsqueeze>(res, ov::op::v0::Constant::create(ov::element::i64, {1}, {0}));
|
||||
}
|
||||
} else if (mode == ROPE_TYPE_NEOX) {
|
||||
auto data_split = std::make_shared<ov::op::v1::Split>(
|
||||
|
|
|
|||
|
|
@ -1,17 +1,17 @@
|
|||
#include <openvino/op/constant.hpp>
|
||||
#include <openvino/op/multiply.hpp>
|
||||
#include <vector>
|
||||
|
||||
#include "../node_context.hpp"
|
||||
#include "../op_table.hpp"
|
||||
#include "../utils.hpp"
|
||||
|
||||
#include <openvino/op/constant.hpp>
|
||||
#include <openvino/op/multiply.hpp>
|
||||
#include <vector>
|
||||
|
||||
namespace ov {
|
||||
namespace frontend {
|
||||
namespace ggml {
|
||||
namespace op {
|
||||
|
||||
OutputVector translate_scale(const NodeContext& context) {
|
||||
OutputVector translate_scale(const NodeContext & context) {
|
||||
num_inputs_check(context, 1, 1);
|
||||
|
||||
float scale;
|
||||
|
|
|
|||
|
|
@ -1,3 +1,7 @@
|
|||
#include "../node_context.hpp"
|
||||
#include "../op_table.hpp"
|
||||
#include "../utils.hpp"
|
||||
|
||||
#include <cassert>
|
||||
#include <cstdint>
|
||||
#include <memory>
|
||||
|
|
@ -15,16 +19,12 @@
|
|||
#include <openvino/op/squeeze.hpp>
|
||||
#include <openvino/op/transpose.hpp>
|
||||
|
||||
#include "../node_context.hpp"
|
||||
#include "../op_table.hpp"
|
||||
#include "../utils.hpp"
|
||||
|
||||
namespace ov {
|
||||
namespace frontend {
|
||||
namespace ggml {
|
||||
namespace op {
|
||||
|
||||
OutputVector translate_set_rows(const NodeContext& context) {
|
||||
OutputVector translate_set_rows(const NodeContext & context) {
|
||||
num_inputs_check(context, 3, 3);
|
||||
|
||||
auto data = context.get_input(0);
|
||||
|
|
@ -44,8 +44,7 @@ OutputVector translate_set_rows(const NodeContext& context) {
|
|||
Output<Node> res;
|
||||
if (context.is_static()) {
|
||||
auto dst_reshaped = std::make_shared<ov::op::v1::Reshape>(
|
||||
dst,
|
||||
ov::op::v0::Constant::create(ov::element::i64, {2}, {(int64_t) dst_shape[1], (int64_t) dst_shape[2]}),
|
||||
dst, ov::op::v0::Constant::create(ov::element::i64, {2}, {(int64_t) dst_shape[1], (int64_t) dst_shape[2]}),
|
||||
false);
|
||||
auto indices_reshaped =
|
||||
std::make_shared<ov::op::v0::Squeeze>(indices, ov::op::v0::Constant::create(ov::element::i64, {2}, {0, 1}));
|
||||
|
|
@ -55,7 +54,8 @@ OutputVector translate_set_rows(const NodeContext& context) {
|
|||
auto updated = std::make_shared<ov::op::v3::ScatterUpdate>(dst_reshaped, indices_reshaped, data_reshaped, zero);
|
||||
res = std::make_shared<ov::op::v1::Reshape>(updated, std::make_shared<ov::op::v0::ShapeOf>(dst), false);
|
||||
} else {
|
||||
assert(dst.get_partial_shape().rank() == 4 && dst.get_partial_shape()[2].is_static() && dst.get_partial_shape()[3].is_static());
|
||||
assert(dst.get_partial_shape().rank() == 4 && dst.get_partial_shape()[2].is_static() &&
|
||||
dst.get_partial_shape()[3].is_static());
|
||||
int64_t dim2 = dst.get_partial_shape()[2].get_length();
|
||||
int64_t dim3 = dst.get_partial_shape()[3].get_length();
|
||||
data = std::make_shared<ov::op::v1::Reshape>(
|
||||
|
|
|
|||
|
|
@ -1,3 +1,7 @@
|
|||
#include "../node_context.hpp"
|
||||
#include "../op_table.hpp"
|
||||
#include "../utils.hpp"
|
||||
|
||||
#include <climits>
|
||||
#include <cstdint>
|
||||
#include <memory>
|
||||
|
|
@ -13,16 +17,12 @@
|
|||
#include <openvino/op/softmax.hpp>
|
||||
#include <vector>
|
||||
|
||||
#include "../node_context.hpp"
|
||||
#include "../op_table.hpp"
|
||||
#include "../utils.hpp"
|
||||
|
||||
namespace ov {
|
||||
namespace frontend {
|
||||
namespace ggml {
|
||||
namespace op {
|
||||
|
||||
OutputVector translate_soft_max(const NodeContext& context) {
|
||||
OutputVector translate_soft_max(const NodeContext & context) {
|
||||
num_inputs_check(context, 1, 2);
|
||||
|
||||
auto input_node = context.get_input(0).get_node_shared_ptr();
|
||||
|
|
@ -30,9 +30,9 @@ OutputVector translate_soft_max(const NodeContext& context) {
|
|||
|
||||
float scale = 1.0f;
|
||||
float max_bias = 0.0f;
|
||||
auto* op_params = context.get_output_op_params(0);
|
||||
memcpy(&scale, (float*) op_params + 0, sizeof(float));
|
||||
memcpy(&max_bias, (float*) op_params + 1, sizeof(float));
|
||||
auto * op_params = context.get_output_op_params(0);
|
||||
memcpy(&scale, (float *) op_params + 0, sizeof(float));
|
||||
memcpy(&max_bias, (float *) op_params + 1, sizeof(float));
|
||||
auto src0_shape = context.get_input_shape(0).get_shape();
|
||||
const uint32_t h = src0_shape[2];
|
||||
const uint32_t n_head = src0_shape[0];
|
||||
|
|
|
|||
|
|
@ -1,15 +1,15 @@
|
|||
#include <openvino/op/transpose.hpp>
|
||||
|
||||
#include "../node_context.hpp"
|
||||
#include "../op_table.hpp"
|
||||
#include "../utils.hpp"
|
||||
|
||||
#include <openvino/op/transpose.hpp>
|
||||
|
||||
namespace ov {
|
||||
namespace frontend {
|
||||
namespace ggml {
|
||||
namespace op {
|
||||
|
||||
OutputVector translate_transpose(const NodeContext& context) {
|
||||
OutputVector translate_transpose(const NodeContext & context) {
|
||||
num_inputs_check(context, 1, 1);
|
||||
|
||||
auto res = std::make_shared<ov::op::v1::Transpose>(context.get_input(0),
|
||||
|
|
|
|||
|
|
@ -1,17 +1,17 @@
|
|||
#include <openvino/core/node_output.hpp>
|
||||
#include <openvino/op/multiply.hpp>
|
||||
#include <openvino/op/sigmoid.hpp>
|
||||
|
||||
#include "../node_context.hpp"
|
||||
#include "../op_table.hpp"
|
||||
#include "../utils.hpp"
|
||||
|
||||
#include <openvino/core/node_output.hpp>
|
||||
#include <openvino/op/multiply.hpp>
|
||||
#include <openvino/op/sigmoid.hpp>
|
||||
|
||||
namespace ov {
|
||||
namespace frontend {
|
||||
namespace ggml {
|
||||
namespace op {
|
||||
|
||||
OutputVector translate_unary_silu(const NodeContext& context) {
|
||||
OutputVector translate_unary_silu(const NodeContext & context) {
|
||||
num_inputs_check(context, 1, 1);
|
||||
|
||||
auto input = context.get_input(0);
|
||||
|
|
|
|||
|
|
@ -6,12 +6,13 @@ namespace frontend {
|
|||
namespace ggml {
|
||||
namespace op {
|
||||
|
||||
OutputVector translate_view(const NodeContext& context) {
|
||||
OutputVector translate_view(const NodeContext & context) {
|
||||
num_inputs_check(context, 1, 1);
|
||||
|
||||
if (context.get_op_case() == 2) {
|
||||
auto dst_shape = context.get_output_shape(0).to_shape();
|
||||
return rename_outputs_with_suffix({process_view_input(context, 0, dst_shape[1] * dst_shape[2])}, context.get_name());
|
||||
return rename_outputs_with_suffix({process_view_input(context, 0, dst_shape[1] * dst_shape[2])},
|
||||
context.get_name());
|
||||
}
|
||||
return {context.get_input(0)};
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,5 +1,7 @@
|
|||
#include "op_table.hpp"
|
||||
|
||||
#include "utils.hpp"
|
||||
|
||||
#include <openvino/op/add.hpp>
|
||||
#include <openvino/op/divide.hpp>
|
||||
#include <openvino/op/gather.hpp>
|
||||
|
|
@ -7,8 +9,6 @@
|
|||
#include <openvino/op/multiply.hpp>
|
||||
#include <openvino/op/subtract.hpp>
|
||||
|
||||
#include "utils.hpp"
|
||||
|
||||
namespace ov {
|
||||
namespace frontend {
|
||||
namespace ggml {
|
||||
|
|
|
|||
|
|
@ -1,15 +1,15 @@
|
|||
#include "eliminate_zp.hpp"
|
||||
|
||||
#include <openvino/core/graph_util.hpp>
|
||||
#include <openvino/core/parallel.hpp>
|
||||
#include <openvino/core/rt_info.hpp>
|
||||
#include <openvino/pass/pattern/op/label.hpp>
|
||||
#include <openvino/pass/pattern/op/pattern.hpp>
|
||||
#include <openvino/pass/pattern/op/wrap_type.hpp>
|
||||
#include <openvino/op/constant.hpp>
|
||||
#include <openvino/op/convert.hpp>
|
||||
#include <openvino/op/multiply.hpp>
|
||||
#include <openvino/op/subtract.hpp>
|
||||
#include <openvino/core/parallel.hpp>
|
||||
#include <openvino/pass/pattern/op/label.hpp>
|
||||
#include <openvino/pass/pattern/op/pattern.hpp>
|
||||
#include <openvino/pass/pattern/op/wrap_type.hpp>
|
||||
|
||||
namespace ov {
|
||||
namespace frontend {
|
||||
|
|
@ -35,13 +35,17 @@ EliminateZeroPoints::EliminateZeroPoints() {
|
|||
auto m_scale = ov::pass::pattern::any_input();
|
||||
auto m_multiply = ov::pass::pattern::wrap_type<ov::op::v1::Multiply>({m_scale, m_subtract});
|
||||
|
||||
const auto callback = [=](ov::pass::pattern::Matcher& m) {
|
||||
const auto& pattern_map = m.get_pattern_value_map();
|
||||
const auto callback = [=](ov::pass::pattern::Matcher & m) {
|
||||
const auto & pattern_map = m.get_pattern_value_map();
|
||||
|
||||
auto multiply_node = std::dynamic_pointer_cast<ov::op::v1::Multiply>(pattern_map.at(m_multiply).get_node_shared_ptr());
|
||||
auto subtract_node = std::dynamic_pointer_cast<ov::op::v1::Subtract>(pattern_map.at(m_subtract).get_node_shared_ptr());
|
||||
auto data_constant = std::dynamic_pointer_cast<ov::op::v0::Constant>(pattern_map.at(m_data_constant).get_node_shared_ptr());
|
||||
auto zp_constant = std::dynamic_pointer_cast<ov::op::v0::Constant>(pattern_map.at(m_zp_constant).get_node_shared_ptr());
|
||||
auto multiply_node =
|
||||
std::dynamic_pointer_cast<ov::op::v1::Multiply>(pattern_map.at(m_multiply).get_node_shared_ptr());
|
||||
auto subtract_node =
|
||||
std::dynamic_pointer_cast<ov::op::v1::Subtract>(pattern_map.at(m_subtract).get_node_shared_ptr());
|
||||
auto data_constant =
|
||||
std::dynamic_pointer_cast<ov::op::v0::Constant>(pattern_map.at(m_data_constant).get_node_shared_ptr());
|
||||
auto zp_constant =
|
||||
std::dynamic_pointer_cast<ov::op::v0::Constant>(pattern_map.at(m_zp_constant).get_node_shared_ptr());
|
||||
|
||||
if (!multiply_node || !subtract_node || !data_constant || !zp_constant) {
|
||||
return false;
|
||||
|
|
@ -101,14 +105,16 @@ EliminateZeroPoints::EliminateZeroPoints() {
|
|||
new_constant = std::make_shared<ov::op::v0::Constant>(target_type, data_shape, adjusted_values);
|
||||
}
|
||||
|
||||
auto new_convert = std::make_shared<ov::op::v0::Convert>(new_constant, subtract_node->get_output_element_type(0));
|
||||
auto new_convert =
|
||||
std::make_shared<ov::op::v0::Convert>(new_constant, subtract_node->get_output_element_type(0));
|
||||
ov::replace_node(subtract_node, new_convert);
|
||||
|
||||
return true;
|
||||
};
|
||||
|
||||
register_matcher(std::make_shared<ov::pass::pattern::Matcher>(m_multiply, "ov::frontend::ggml::pass::EliminateZeroPoints"),
|
||||
callback);
|
||||
register_matcher(
|
||||
std::make_shared<ov::pass::pattern::Matcher>(m_multiply, "ov::frontend::ggml::pass::EliminateZeroPoints"),
|
||||
callback);
|
||||
}
|
||||
|
||||
} // namespace pass
|
||||
|
|
|
|||
|
|
@ -33,8 +33,8 @@ FuseToSDPA::FuseToSDPA() {
|
|||
const auto m_v = ov::pass::pattern::any_input();
|
||||
const auto m_qkv = ov::pass::pattern::wrap_type<ov::op::v0::MatMul>({m_softmax_qk_f16, m_v});
|
||||
|
||||
const auto callback = [=](ov::pass::pattern::Matcher& m) {
|
||||
auto& pattern_to_output = m.get_pattern_value_map();
|
||||
const auto callback = [=](ov::pass::pattern::Matcher & m) {
|
||||
auto & pattern_to_output = m.get_pattern_value_map();
|
||||
auto k = pattern_to_output[m_k];
|
||||
auto q = pattern_to_output[m_q];
|
||||
auto v = pattern_to_output[m_v];
|
||||
|
|
|
|||
|
|
@ -1,5 +1,11 @@
|
|||
#include "translate_session.hpp"
|
||||
|
||||
#include "ggml-openvino/openvino/node_context.hpp"
|
||||
#include "ggml-openvino/openvino/utils.hpp"
|
||||
#include "input_model.hpp"
|
||||
#include "pass/eliminate_zp.hpp"
|
||||
#include "pass/mark_decompression_convert_constant_folding.hpp"
|
||||
|
||||
#include <cstdint>
|
||||
#include <cstdlib>
|
||||
#include <map>
|
||||
|
|
@ -25,12 +31,6 @@
|
|||
#include <openvino/pass/constant_folding.hpp>
|
||||
#include <openvino/pass/make_stateful.hpp>
|
||||
|
||||
#include "ggml-openvino/openvino/node_context.hpp"
|
||||
#include "ggml-openvino/openvino/utils.hpp"
|
||||
#include "input_model.hpp"
|
||||
#include "pass/eliminate_zp.hpp"
|
||||
#include "pass/mark_decompression_convert_constant_folding.hpp"
|
||||
|
||||
namespace ov {
|
||||
namespace frontend {
|
||||
namespace ggml {
|
||||
|
|
@ -40,16 +40,17 @@ using namespace ov::op;
|
|||
namespace {
|
||||
|
||||
ov::pass::MakeStateful::ParamResPairs get_kv_param_res_pairs(
|
||||
const std::shared_ptr<ov::Model>& model, const std::map<std::string, std::string>& kv_param_res_names) {
|
||||
const std::shared_ptr<ov::Model> & model,
|
||||
const std::map<std::string, std::string> & kv_param_res_names) {
|
||||
ov::pass::MakeStateful::ParamResPairs pairs;
|
||||
const auto& params = model->get_parameters();
|
||||
const auto& results = model->get_results();
|
||||
const auto & params = model->get_parameters();
|
||||
const auto & results = model->get_results();
|
||||
|
||||
for (const auto& param_res : kv_param_res_names) {
|
||||
const auto& param_name = param_res.first;
|
||||
const auto& res_name = param_res.second;
|
||||
for (const auto & param_res : kv_param_res_names) {
|
||||
const auto & param_name = param_res.first;
|
||||
const auto & res_name = param_res.second;
|
||||
|
||||
auto param_it = std::find_if(params.begin(), params.end(), [&](const std::shared_ptr<v0::Parameter>& node) {
|
||||
auto param_it = std::find_if(params.begin(), params.end(), [&](const std::shared_ptr<v0::Parameter> & node) {
|
||||
return node->get_friendly_name() == param_name;
|
||||
});
|
||||
|
||||
|
|
@ -57,7 +58,7 @@ ov::pass::MakeStateful::ParamResPairs get_kv_param_res_pairs(
|
|||
" is not associated with any of "
|
||||
"Parameters in the network.");
|
||||
|
||||
auto res_it = std::find_if(results.begin(), results.end(), [&](const std::shared_ptr<v0::Result>& node) {
|
||||
auto res_it = std::find_if(results.begin(), results.end(), [&](const std::shared_ptr<v0::Result> & node) {
|
||||
return node->get_friendly_name() == res_name;
|
||||
});
|
||||
|
||||
|
|
@ -72,17 +73,17 @@ ov::pass::MakeStateful::ParamResPairs get_kv_param_res_pairs(
|
|||
return pairs;
|
||||
}
|
||||
|
||||
void add_token_len(TensorMap& tensor_map) {
|
||||
void add_token_len(TensorMap & tensor_map) {
|
||||
auto inp_tokens = tensor_map.at("inp_tokens").get_node_shared_ptr();
|
||||
auto token_len = get_dimensions(inp_tokens, {2});
|
||||
token_len->set_friendly_name("token_len");
|
||||
tensor_map.insert({"token_len", token_len->output(0)});
|
||||
}
|
||||
|
||||
void add_sliced_mask(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) {
|
||||
void add_sliced_mask(TensorMap & tensor_map, GgmlDecoder & ggml_model_decoder) {
|
||||
auto token_len = tensor_map.at("token_len").get_node_shared_ptr();
|
||||
|
||||
auto create_sliced_mask = [&](const std::string& mask_name, const std::string& sliced_name, bool is_static) {
|
||||
auto create_sliced_mask = [&](const std::string & mask_name, const std::string & sliced_name, bool is_static) {
|
||||
if (tensor_map.find(mask_name) != tensor_map.end()) {
|
||||
auto mask = tensor_map.at(mask_name).get_node_shared_ptr();
|
||||
std::shared_ptr<ov::Node> mask_sliced;
|
||||
|
|
@ -110,8 +111,7 @@ void add_sliced_mask(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) {
|
|||
kv_len = std::make_shared<ov::op::v1::Add>(kv_len, one_1d);
|
||||
auto stop = std::make_shared<ov::op::v0::Concat>(ov::OutputVector{token_len, kv_len}, 0);
|
||||
|
||||
mask_sliced =
|
||||
std::make_shared<ov::op::v8::Slice>(mask, zero_2d, stop, one_2d, axes);
|
||||
mask_sliced = std::make_shared<ov::op::v8::Slice>(mask, zero_2d, stop, one_2d, axes);
|
||||
mask_sliced = std::make_shared<ov::op::v0::Unsqueeze>(mask_sliced, zero_1d);
|
||||
mask_sliced = std::make_shared<ov::op::v0::Convert>(mask_sliced, ov::element::f16);
|
||||
mask_sliced->set_friendly_name(sliced_name);
|
||||
|
|
@ -125,8 +125,8 @@ void add_sliced_mask(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) {
|
|||
// create_sliced_mask("KQ_mask_swa", "KQ_mask_swa_sliced", ggml_model_decoder.is_static());
|
||||
}
|
||||
|
||||
void add_rope_sin_cos(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) {
|
||||
int32_t* rope_params = ggml_model_decoder.get_rope_params();
|
||||
void add_rope_sin_cos(TensorMap & tensor_map, GgmlDecoder & ggml_model_decoder) {
|
||||
int32_t * rope_params = ggml_model_decoder.get_rope_params();
|
||||
auto inp_pos = tensor_map.at("inp_pos").get_node_shared_ptr();
|
||||
std::shared_ptr<ov::Node> rope_freqs_weight;
|
||||
if (tensor_map.find("rope_freqs_weight") != tensor_map.end()) {
|
||||
|
|
@ -144,7 +144,7 @@ void add_rope_sin_cos(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) {
|
|||
}
|
||||
|
||||
// Create common patterns
|
||||
void preprocess(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) {
|
||||
void preprocess(TensorMap & tensor_map, GgmlDecoder & ggml_model_decoder) {
|
||||
add_token_len(tensor_map);
|
||||
add_sliced_mask(tensor_map, ggml_model_decoder);
|
||||
add_rope_sin_cos(tensor_map, ggml_model_decoder);
|
||||
|
|
@ -152,8 +152,8 @@ void preprocess(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) {
|
|||
|
||||
} // namespace
|
||||
|
||||
TranslateSession::TranslateSession(const frontend::InputModel::Ptr& input_model,
|
||||
const std::unordered_map<std::string, CreatorFunction>& translator_map,
|
||||
TranslateSession::TranslateSession(const frontend::InputModel::Ptr & input_model,
|
||||
const std::unordered_map<std::string, CreatorFunction> & translator_map,
|
||||
bool naive) :
|
||||
m_input_model(input_model),
|
||||
m_translator_map(translator_map),
|
||||
|
|
@ -168,26 +168,26 @@ std::shared_ptr<Model> TranslateSession::get_converted_model() {
|
|||
return m_ov_model;
|
||||
}
|
||||
|
||||
std::shared_ptr<Model> TranslateSession::translate_graph(const frontend::InputModel::Ptr& input_model) {
|
||||
std::shared_ptr<Model> TranslateSession::translate_graph(const frontend::InputModel::Ptr & input_model) {
|
||||
ov::ParameterVector params;
|
||||
ov::ResultVector results;
|
||||
auto tensor_map = std::make_shared<TensorMap>();
|
||||
std::shared_ptr<Model> resulting_model;
|
||||
|
||||
const auto& ggml_model = std::dynamic_pointer_cast<InputModel>(input_model);
|
||||
const auto & ggml_model = std::dynamic_pointer_cast<InputModel>(input_model);
|
||||
std::shared_ptr<GgmlDecoder> ggml_model_decoder = ggml_model->get_model_decoder();
|
||||
|
||||
for (const auto& it : ggml_model_decoder->get_model_inputs()) {
|
||||
for (const auto & it : ggml_model_decoder->get_model_inputs()) {
|
||||
params.push_back(std::dynamic_pointer_cast<ov::op::v0::Parameter>(it.second));
|
||||
(*tensor_map)[it.first] = it.second;
|
||||
}
|
||||
|
||||
for (const auto& it : ggml_model_decoder->get_model_extra_inputs()) {
|
||||
for (const auto & it : ggml_model_decoder->get_model_extra_inputs()) {
|
||||
params.push_back(std::dynamic_pointer_cast<ov::op::v0::Parameter>(it.second));
|
||||
(*tensor_map)[it.first] = it.second;
|
||||
}
|
||||
|
||||
for (const auto& it : ggml_model_decoder->get_model_weights()) {
|
||||
for (const auto & it : ggml_model_decoder->get_model_weights()) {
|
||||
(*tensor_map)[it.first] = it.second;
|
||||
}
|
||||
|
||||
|
|
@ -199,22 +199,15 @@ std::shared_ptr<Model> TranslateSession::translate_graph(const frontend::InputMo
|
|||
|
||||
ov::OutputVector converted_outputs;
|
||||
auto it = m_translator_map.find(operation_type);
|
||||
FRONT_END_OP_CONVERSION_CHECK(it != m_translator_map.end(),
|
||||
"Translation for operation type ",
|
||||
operation_type,
|
||||
FRONT_END_OP_CONVERSION_CHECK(it != m_translator_map.end(), "Translation for operation type ", operation_type,
|
||||
" is not implemented.");
|
||||
NodeContext node_context(node, tensor_map, this);
|
||||
converted_outputs = it->second(node_context);
|
||||
|
||||
const auto& node_output_names = node->get_output_names();
|
||||
FRONT_END_OP_CONVERSION_CHECK(node_output_names.size() == converted_outputs.size(),
|
||||
"Number of ",
|
||||
operation_type,
|
||||
" outputs greater than number of converted outputs, which are ",
|
||||
node_output_names.size(),
|
||||
" and ",
|
||||
converted_outputs.size(),
|
||||
" respectively.");
|
||||
const auto & node_output_names = node->get_output_names();
|
||||
FRONT_END_OP_CONVERSION_CHECK(node_output_names.size() == converted_outputs.size(), "Number of ",
|
||||
operation_type, " outputs greater than number of converted outputs, which are ",
|
||||
node_output_names.size(), " and ", converted_outputs.size(), " respectively.");
|
||||
|
||||
for (size_t i = 0; i < node_output_names.size(); ++i) {
|
||||
auto output_name = node_output_names[i];
|
||||
|
|
@ -229,10 +222,9 @@ std::shared_ptr<Model> TranslateSession::translate_graph(const frontend::InputMo
|
|||
}
|
||||
ggml_model_decoder->visit_subgraph(node_visitor);
|
||||
|
||||
for (const auto& name : ggml_model_decoder->get_model_output_names()) {
|
||||
for (const auto & name : ggml_model_decoder->get_model_output_names()) {
|
||||
FRONT_END_GENERAL_CHECK(tensor_map->find(name) != tensor_map->end(),
|
||||
"Output name not found in tensor map: ",
|
||||
name);
|
||||
"Output name not found in tensor map: ", name);
|
||||
auto result = std::make_shared<v0::Result>(tensor_map->at(name));
|
||||
result->set_friendly_name(name);
|
||||
results.push_back(result);
|
||||
|
|
|
|||
|
|
@ -1,5 +1,7 @@
|
|||
#include "utils.hpp"
|
||||
|
||||
#include "ggml-impl.h"
|
||||
|
||||
#include <cstddef>
|
||||
#include <ctime>
|
||||
#include <memory>
|
||||
|
|
@ -17,8 +19,6 @@
|
|||
#include <openvino/op/transpose.hpp>
|
||||
#include <string>
|
||||
|
||||
#include "ggml-impl.h"
|
||||
|
||||
namespace ov {
|
||||
namespace frontend {
|
||||
namespace ggml {
|
||||
|
|
@ -30,7 +30,7 @@ std::string getCurrentTime() {
|
|||
return buf;
|
||||
}
|
||||
|
||||
void num_inputs_check(const NodeContext& context, size_t min_inputs, size_t max_inputs) {
|
||||
void num_inputs_check(const NodeContext & context, size_t min_inputs, size_t max_inputs) {
|
||||
auto input_size = context.get_input_size();
|
||||
FRONT_END_OP_CONVERSION_CHECK(input_size >= min_inputs, "Got less inputs than expected");
|
||||
FRONT_END_OP_CONVERSION_CHECK(input_size <= max_inputs, "Got more inputs than expected");
|
||||
|
|
@ -48,20 +48,20 @@ int non_cont_dim(std::vector<size_t> ne, std::vector<size_t> nb) {
|
|||
return 0;
|
||||
}
|
||||
|
||||
std::shared_ptr<ov::Node> get_dimensions(const std::shared_ptr<ov::op::v3::ShapeOf>& shape,
|
||||
const std::vector<int>& dims) {
|
||||
std::shared_ptr<ov::Node> get_dimensions(const std::shared_ptr<ov::op::v3::ShapeOf> & shape,
|
||||
const std::vector<int> & dims) {
|
||||
using namespace ov::op;
|
||||
const auto zero = v0::Constant::create(ov::element::i32, ov::Shape{}, {0});
|
||||
const auto dims_const = v0::Constant::create(ov::element::i32, ov::Shape{dims.size()}, dims);
|
||||
return std::make_shared<v8::Gather>(shape, dims_const, zero);
|
||||
}
|
||||
|
||||
std::shared_ptr<ov::Node> get_dimensions(const std::shared_ptr<ov::Node>& node, const std::vector<int>& dims) {
|
||||
std::shared_ptr<ov::Node> get_dimensions(const std::shared_ptr<ov::Node> & node, const std::vector<int> & dims) {
|
||||
return get_dimensions(std::make_shared<ov::op::v3::ShapeOf>(node), dims);
|
||||
}
|
||||
|
||||
OutputVector rename_outputs_with_suffix(const OutputVector& outputs, const std::string& suffix) {
|
||||
for (const auto& output : outputs) {
|
||||
OutputVector rename_outputs_with_suffix(const OutputVector & outputs, const std::string & suffix) {
|
||||
for (const auto & output : outputs) {
|
||||
auto node = output.get_node_shared_ptr();
|
||||
std::string name = node->get_friendly_name();
|
||||
name += "_";
|
||||
|
|
@ -111,7 +111,7 @@ void ggml_rope_yarn_corr_dims(int n_dims,
|
|||
}
|
||||
} // namespace
|
||||
|
||||
std::pair<ov::Output<Node>, ov::Output<Node>> make_sin_cos(int32_t* rope_params,
|
||||
std::pair<ov::Output<Node>, ov::Output<Node>> make_sin_cos(int32_t * rope_params,
|
||||
std::shared_ptr<ov::Node> inp_pos,
|
||||
std::shared_ptr<ov::Node> rope_freqs_weight) {
|
||||
inp_pos = std::make_shared<ov::op::v0::Convert>(inp_pos, ov::element::f32);
|
||||
|
|
@ -179,11 +179,11 @@ std::pair<ov::Output<Node>, ov::Output<Node>> make_sin_cos(int32_t* rope_params,
|
|||
return std::make_pair(sin_theta, cos_theta);
|
||||
}
|
||||
|
||||
ov::Output<ov::Node> process_view_input(const NodeContext& context, int input_index, int slice_len) {
|
||||
ov::Output<ov::Node> process_view_input(const NodeContext & context, int input_index, int slice_len) {
|
||||
// Only works for VIEW operations that slice at the lowest dimension
|
||||
// If the VIEW also reshape the result, `slice_len` should be provided
|
||||
auto input = context.get_input(input_index);
|
||||
int32_t* op_params = context.get_input_op_params(input_index);
|
||||
int32_t * op_params = context.get_input_op_params(input_index);
|
||||
auto src1_stride = context.get_input_stride(input_index);
|
||||
|
||||
int64_t split_addr = op_params[0] / src1_stride[2];
|
||||
|
|
|
|||
|
|
@ -1,5 +1,11 @@
|
|||
#include "utils.h"
|
||||
|
||||
#include "ggml-impl.h"
|
||||
#include "ggml-openvino/ggml-decoder.h"
|
||||
#include "ggml.h"
|
||||
#include "openvino/frontend.hpp"
|
||||
#include "openvino/input_model.hpp"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cassert>
|
||||
#include <cmath>
|
||||
|
|
@ -23,15 +29,9 @@
|
|||
#include <unordered_map>
|
||||
#include <vector>
|
||||
|
||||
#include "ggml-impl.h"
|
||||
#include "ggml-openvino/ggml-decoder.h"
|
||||
#include "ggml.h"
|
||||
#include "openvino/frontend.hpp"
|
||||
#include "openvino/input_model.hpp"
|
||||
|
||||
ov::Tensor convert_ggml_input_to_ov(std::shared_ptr<GgmlOvDecoder> ggml_decoder, const std::string& name) {
|
||||
const auto* ggml_tensor = ggml_decoder->get_input_ggml_tensor(name);
|
||||
auto* input_data = ggml_tensor->data;
|
||||
ov::Tensor convert_ggml_input_to_ov(std::shared_ptr<GgmlOvDecoder> ggml_decoder, const std::string & name) {
|
||||
const auto * ggml_tensor = ggml_decoder->get_input_ggml_tensor(name);
|
||||
auto * input_data = ggml_tensor->data;
|
||||
ov::Shape input_shape;
|
||||
if (name.find("cache_k") == 0 || name.find("cache_v") == 0) {
|
||||
input_shape = ggml_decoder->get_graph_input_shape(ggml_tensor).to_shape();
|
||||
|
|
@ -45,13 +45,14 @@ ov::Tensor convert_ggml_input_to_ov(std::shared_ptr<GgmlOvDecoder> ggml_decoder,
|
|||
return input_tensor;
|
||||
}
|
||||
|
||||
std::map<std::string, void*> get_ggml_graph_output_dst(std::shared_ptr<GgmlOvDecoder> ggml_decoder) {
|
||||
std::map<std::string, void*> output_tensors;
|
||||
std::map<std::string, void *> get_ggml_graph_output_dst(std::shared_ptr<GgmlOvDecoder> ggml_decoder) {
|
||||
std::map<std::string, void *> output_tensors;
|
||||
|
||||
auto output_names = ggml_decoder->get_model_output_names();
|
||||
for (size_t inp = 0; inp < output_names.size(); ++inp) {
|
||||
auto name = output_names[inp];
|
||||
const auto* tensor = ggml_decoder->get_output_ggml_tensor(name);
|
||||
auto* output_data = tensor->view_src ? tensor->view_src->data : tensor->data;
|
||||
const auto * tensor = ggml_decoder->get_output_ggml_tensor(name);
|
||||
auto * output_data = tensor->view_src ? tensor->view_src->data : tensor->data;
|
||||
output_tensors[name] = output_data;
|
||||
}
|
||||
return output_tensors;
|
||||
|
|
@ -63,14 +64,14 @@ static ov::frontend::FrontEnd::Ptr get_ggml_frontend() {
|
|||
return front_end;
|
||||
}
|
||||
|
||||
enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_cgraph* cgraph) {
|
||||
enum ggml_status openvino_frontend_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
|
||||
static ov::Core core;
|
||||
|
||||
static std::string device = getenv("GGML_OPENVINO_DEVICE") ? getenv("GGML_OPENVINO_DEVICE") : "";
|
||||
if (device.empty()) {
|
||||
const std::vector<std::string> preferred_device = { "GPU", "CPU", "NPU" };
|
||||
const std::vector<std::string> preferred_device = {"GPU", "CPU", "NPU"};
|
||||
const auto available_devices = core.get_available_devices();
|
||||
for (const auto& dev : preferred_device) {
|
||||
for (const auto & dev : preferred_device) {
|
||||
if (std::find(available_devices.begin(), available_devices.end(), dev) != available_devices.end()) {
|
||||
device = dev;
|
||||
break;
|
||||
|
|
@ -92,17 +93,17 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
|
|||
|
||||
auto start_time = ggml_time_us();
|
||||
|
||||
auto* cache_dir = getenv("GGML_OPENVINO_CACHE_DIR");
|
||||
auto * cache_dir = getenv("GGML_OPENVINO_CACHE_DIR");
|
||||
if (cache_dir && !is_static) {
|
||||
core.set_property(ov::cache_dir(cache_dir));
|
||||
}
|
||||
|
||||
static std::mutex cache_mutex;
|
||||
static std::unordered_map<struct ggml_cgraph*, std::shared_ptr<ov::InferRequest>> infer_request_cache;
|
||||
static std::unordered_map<struct ggml_cgraph*, std::vector<std::string>> ov_input_names_cache;
|
||||
static std::unordered_map<struct ggml_cgraph*, std::vector<std::string>> ov_output_names_cache;
|
||||
static std::unordered_map<ggml_cgraph *, std::shared_ptr<ov::InferRequest>> infer_request_cache;
|
||||
static std::unordered_map<ggml_cgraph *, std::vector<std::string>> ov_input_names_cache;
|
||||
static std::unordered_map<ggml_cgraph *, std::vector<std::string>> ov_output_names_cache;
|
||||
// For NPU, store the kvcache model, since we cannot create two infer_request
|
||||
static std::unordered_map<struct ggml_cgraph*, ov::CompiledModel> compiled_model_cache;
|
||||
static std::unordered_map<ggml_cgraph *, ov::CompiledModel> compiled_model_cache;
|
||||
|
||||
std::shared_ptr<GgmlOvDecoder> ggml_decoder;
|
||||
ov::InferRequest infer_request;
|
||||
|
|
@ -181,7 +182,7 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
|
|||
ov::serialize(model, timestamped_filename);
|
||||
}
|
||||
|
||||
auto* disable_sdpa_optimization = getenv("GGML_OPENVINO_DISABLE_SDPA_OPTIMIZATION");
|
||||
auto * disable_sdpa_optimization = getenv("GGML_OPENVINO_DISABLE_SDPA_OPTIMIZATION");
|
||||
if (disable_sdpa_optimization && std::string(disable_sdpa_optimization) != "0") {
|
||||
config = {
|
||||
{"GPU_ENABLE_SDPA_OPTIMIZATION", "0"}
|
||||
|
|
@ -196,10 +197,10 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
|
|||
|
||||
std::vector<std::string> ov_input_names;
|
||||
std::vector<std::string> ov_output_names;
|
||||
for (const auto& ov_param : model->get_parameters()) {
|
||||
for (const auto & ov_param : model->get_parameters()) {
|
||||
ov_input_names.push_back(ov_param->get_friendly_name());
|
||||
}
|
||||
for (const auto& ov_output : model->get_results()) {
|
||||
for (const auto & ov_output : model->get_results()) {
|
||||
ov_output_names.push_back(ov_output->get_friendly_name());
|
||||
}
|
||||
ov_input_names_cache[cgraph] = ov_input_names;
|
||||
|
|
@ -225,7 +226,7 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
|
|||
|
||||
auto gguf_tensor_addrs = get_ggml_graph_output_dst(ggml_decoder);
|
||||
for (size_t i = 0; i < ov_output_names.size(); i++) {
|
||||
auto& result_name = ov_output_names[i];
|
||||
auto & result_name = ov_output_names[i];
|
||||
const auto output_tensor = infer_request.get_output_tensor(i);
|
||||
|
||||
std::memcpy(gguf_tensor_addrs[result_name], output_tensor.data(), output_tensor.get_byte_size());
|
||||
|
|
@ -278,7 +279,7 @@ ov::AnyMap get_npu_generate_config() {
|
|||
return config;
|
||||
}
|
||||
|
||||
std::map<ggml_type, ExtraQuantType> get_types_to_requant(const std::string& device) {
|
||||
std::map<ggml_type, ExtraQuantType> get_types_to_requant(const std::string & device) {
|
||||
if (device == "NPU") {
|
||||
return {
|
||||
{GGML_TYPE_Q4_0, ExtraQuantType::Q4_0_128},
|
||||
|
|
@ -297,15 +298,15 @@ std::map<ggml_type, ExtraQuantType> get_types_to_requant(const std::string& devi
|
|||
return {};
|
||||
}
|
||||
|
||||
bool is_naive(struct ggml_cgraph* cgraph) {
|
||||
bool is_naive(ggml_cgraph * cgraph) {
|
||||
constexpr int naive_graph_size_threshold = 20;
|
||||
return cgraph->n_nodes < naive_graph_size_threshold;
|
||||
}
|
||||
|
||||
enum ggml_status naive_compute(struct ggml_cgraph* cgraph,
|
||||
ov::Core& core,
|
||||
const std::string& device,
|
||||
const ov::AnyMap& config) {
|
||||
enum ggml_status naive_compute(ggml_cgraph * cgraph,
|
||||
ov::Core & core,
|
||||
const std::string & device,
|
||||
const ov::AnyMap & config) {
|
||||
if (cgraph->n_nodes == 1 && (cgraph->nodes[0]->op == GGML_OP_NONE || cgraph->nodes[0]->op == GGML_OP_VIEW)) {
|
||||
return GGML_STATUS_SUCCESS;
|
||||
}
|
||||
|
|
@ -343,7 +344,7 @@ enum ggml_status naive_compute(struct ggml_cgraph* cgraph,
|
|||
return GGML_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
ov::Tensor get_ov_input_tensor(std::shared_ptr<GgmlOvDecoder> ggml_decoder, const std::string& param_name) {
|
||||
ov::Tensor get_ov_input_tensor(std::shared_ptr<GgmlOvDecoder> ggml_decoder, const std::string & param_name) {
|
||||
bool is_static = ggml_decoder->is_static();
|
||||
bool is_first_token = ggml_decoder->is_first_token();
|
||||
|
||||
|
|
@ -358,10 +359,10 @@ ov::Tensor get_ov_input_tensor(std::shared_ptr<GgmlOvDecoder> ggml_decoder, cons
|
|||
if (param_name == "inp_tokens" || param_name == "inp_pos") {
|
||||
if (is_first_token) {
|
||||
size_t context_size = ggml_decoder->get_context_size();
|
||||
const auto* input_tensor_ggml = ggml_decoder->get_input_ggml_tensor(param_name);
|
||||
const auto * input_tensor_ggml = ggml_decoder->get_input_ggml_tensor(param_name);
|
||||
std::vector<int32_t> padded_data = pad_input<int32_t>(input_tensor_ggml, 1, context_size, 0);
|
||||
input_tensor = ov::Tensor(ov::element::i32, ov::Shape{1, 1, context_size});
|
||||
auto* data_ptr = input_tensor.data<int32_t>();
|
||||
auto * data_ptr = input_tensor.data<int32_t>();
|
||||
std::copy(padded_data.begin(), padded_data.end(), data_ptr);
|
||||
} else {
|
||||
input_tensor = convert_ggml_input_to_ov(ggml_decoder, param_name);
|
||||
|
|
@ -369,22 +370,22 @@ ov::Tensor get_ov_input_tensor(std::shared_ptr<GgmlOvDecoder> ggml_decoder, cons
|
|||
|
||||
} else if (param_name.find("KQ_mask") == 0) {
|
||||
size_t context_size = ggml_decoder->get_context_size();
|
||||
const auto* input_tensor_ggml = ggml_decoder->get_input_ggml_tensor(param_name);
|
||||
const auto * input_tensor_ggml = ggml_decoder->get_input_ggml_tensor(param_name);
|
||||
if (is_first_token) {
|
||||
std::vector<float> padded_data =
|
||||
pad_input<float>(input_tensor_ggml, context_size, context_size, -INFINITY);
|
||||
set_zero_diagonal(padded_data, context_size);
|
||||
input_tensor = ov::Tensor(ov::element::f32, ov::Shape{1, context_size, context_size});
|
||||
auto* data_ptr = input_tensor.data<float>();
|
||||
auto * data_ptr = input_tensor.data<float>();
|
||||
std::copy(padded_data.begin(), padded_data.end(), data_ptr);
|
||||
} else {
|
||||
std::vector<float> padded_data = pad_input<float>(input_tensor_ggml, 1, context_size, -INFINITY);
|
||||
input_tensor = ov::Tensor(ov::element::f32, ov::Shape{1, 1, context_size});
|
||||
auto* data_ptr = input_tensor.data<float>();
|
||||
auto * data_ptr = input_tensor.data<float>();
|
||||
std::copy(padded_data.begin(), padded_data.end(), data_ptr);
|
||||
}
|
||||
|
||||
} else if (const auto* op = ggml_decoder->get_tensor_used_op(ggml_decoder->get_tensor_from_name(param_name));
|
||||
} else if (const auto * op = ggml_decoder->get_tensor_used_op(ggml_decoder->get_tensor_from_name(param_name));
|
||||
op && op->op == GGML_OP_SET_ROWS && is_static && is_first_token) {
|
||||
input_tensor = ov::Tensor(ov::element::i64, ov::Shape{1, 1, 1});
|
||||
} else {
|
||||
|
|
@ -394,8 +395,8 @@ ov::Tensor get_ov_input_tensor(std::shared_ptr<GgmlOvDecoder> ggml_decoder, cons
|
|||
return input_tensor;
|
||||
}
|
||||
|
||||
size_t checksum(const void* data, size_t size) {
|
||||
const uint8_t* bytes = static_cast<const uint8_t*>(data);
|
||||
size_t checksum(const void * data, size_t size) {
|
||||
const uint8_t * bytes = static_cast<const uint8_t *>(data);
|
||||
size_t sum = 0;
|
||||
for (size_t i = 0; i < size; ++i) {
|
||||
sum += (uint8_t) i;
|
||||
|
|
@ -408,36 +409,37 @@ size_t checksum(const void* data, size_t size) {
|
|||
#pragma GCC diagnostic push
|
||||
#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
|
||||
|
||||
void print_input_tensor_info(const std::string& name, const ov::Tensor& tensor) {
|
||||
void print_input_tensor_info(const std::string & name, const ov::Tensor & tensor) {
|
||||
std::cout << "Input name: " << name << ", Input shape: " << tensor.get_shape() << ", Address: " << tensor.data()
|
||||
<< std::endl;
|
||||
switch (tensor.get_element_type()) {
|
||||
case ov::element::f32:
|
||||
std::cout << *(tensor.data<float>()) << std::endl;
|
||||
break;
|
||||
case ov::element::f16:
|
||||
std::cout << *(tensor.data<ov::float16>()) << std::endl;
|
||||
break;
|
||||
case ov::element::i32:
|
||||
for (size_t i = 0; i < tensor.get_size(); ++i) {
|
||||
std::cout << tensor.data<int32_t>()[i] << " ";
|
||||
}
|
||||
std::cout << std::endl;
|
||||
break;
|
||||
case ov::element::i64:
|
||||
std::cout << *(tensor.data<int64_t>()) << std::endl;
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
case ov::element::f32:
|
||||
std::cout << *(tensor.data<float>()) << std::endl;
|
||||
break;
|
||||
case ov::element::f16:
|
||||
std::cout << *(tensor.data<ov::float16>()) << std::endl;
|
||||
break;
|
||||
case ov::element::i32:
|
||||
for (size_t i = 0; i < tensor.get_size(); ++i) {
|
||||
std::cout << tensor.data<int32_t>()[i] << " ";
|
||||
}
|
||||
std::cout << std::endl;
|
||||
break;
|
||||
case ov::element::i64:
|
||||
std::cout << *(tensor.data<int64_t>()) << std::endl;
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
void print_output_tensor_info(const std::string& name, const ov::Tensor& tensor,
|
||||
std::map<std::string, void*>& output_dst) {
|
||||
void print_output_tensor_info(const std::string & name,
|
||||
const ov::Tensor & tensor,
|
||||
std::map<std::string, void *> & output_dst) {
|
||||
std::cout << "Output name: " << name << ", Output shape: " << tensor.get_shape()
|
||||
<< ", Address: " << output_dst[name] << std::endl;
|
||||
|
||||
auto print_float_stats = [](const std::string& type_name, size_t size, auto get_value) {
|
||||
auto print_float_stats = [](const std::string & type_name, size_t size, auto get_value) {
|
||||
if (size == 0) {
|
||||
return;
|
||||
}
|
||||
|
|
@ -467,13 +469,13 @@ void print_output_tensor_info(const std::string& name, const ov::Tensor& tensor,
|
|||
|
||||
switch (tensor.get_element_type()) {
|
||||
case ov::element::f32: {
|
||||
const float* data = tensor.data<float>();
|
||||
const float * data = tensor.data<float>();
|
||||
size_t size = tensor.get_size();
|
||||
print_float_stats("[f32]", size, [data](size_t i) { return data[i]; });
|
||||
break;
|
||||
}
|
||||
case ov::element::f16: {
|
||||
const ov::float16* data = tensor.data<ov::float16>();
|
||||
const ov::float16 * data = tensor.data<ov::float16>();
|
||||
size_t size = tensor.get_size();
|
||||
print_float_stats("[f16]", size, [data](size_t i) { return static_cast<float>(data[i]); });
|
||||
break;
|
||||
|
|
@ -485,17 +487,17 @@ void print_output_tensor_info(const std::string& name, const ov::Tensor& tensor,
|
|||
|
||||
#pragma GCC diagnostic pop
|
||||
|
||||
void set_zero_diagonal(std::vector<float>& matrix, size_t dim) {
|
||||
void set_zero_diagonal(std::vector<float> & matrix, size_t dim) {
|
||||
for (size_t i = 0; i < dim; ++i) {
|
||||
matrix[i * dim + i] = 0.0f;
|
||||
}
|
||||
}
|
||||
|
||||
bool is_prefill(struct ggml_cgraph* cgraph) {
|
||||
bool is_prefill(ggml_cgraph * cgraph) {
|
||||
for (int i = 0; i < cgraph->n_nodes; ++i) {
|
||||
auto* op = cgraph->nodes[i];
|
||||
auto * op = cgraph->nodes[i];
|
||||
for (int j = 0; j < GGML_MAX_SRC; ++j) {
|
||||
auto* src = op->src[j];
|
||||
auto * src = op->src[j];
|
||||
if (src == nullptr) {
|
||||
break;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,32 +1,32 @@
|
|||
#include <algorithm>
|
||||
#include <openvino/runtime/core.hpp>
|
||||
|
||||
#include "ggml-backend-impl.h"
|
||||
#include "ggml-decoder.h"
|
||||
#include "ggml-impl.h"
|
||||
|
||||
enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_cgraph* cgraph);
|
||||
#include <algorithm>
|
||||
#include <openvino/runtime/core.hpp>
|
||||
|
||||
std::shared_ptr<GgmlOvDecoder> get_ggml_decoder(struct ggml_cgraph* cgraph, bool is_static, bool is_first_token);
|
||||
enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph);
|
||||
|
||||
ov::Tensor convert_ggml_input_to_ov(std::shared_ptr<GgmlOvDecoder> ggml_decoder, const std::string& name);
|
||||
std::shared_ptr<GgmlOvDecoder> get_ggml_decoder(struct ggml_cgraph * cgraph, bool is_static, bool is_first_token);
|
||||
|
||||
std::map<std::string, void*> get_ggml_graph_output_dst(std::shared_ptr<GgmlOvDecoder> ggml_decoder);
|
||||
ov::Tensor convert_ggml_input_to_ov(std::shared_ptr<GgmlOvDecoder> ggml_decoder, const std::string & name);
|
||||
|
||||
size_t checksum(const void* data, size_t size);
|
||||
std::map<std::string, void *> get_ggml_graph_output_dst(std::shared_ptr<GgmlOvDecoder> ggml_decoder);
|
||||
|
||||
void print_input_tensor_info(const std::string& name, const ov::Tensor& tensor);
|
||||
size_t checksum(const void * data, size_t size);
|
||||
|
||||
void print_output_tensor_info(const std::string& name,
|
||||
const ov::Tensor& tensor,
|
||||
std::map<std::string, void*>& output_dst);
|
||||
void print_input_tensor_info(const std::string & name, const ov::Tensor & tensor);
|
||||
|
||||
void print_output_tensor_info(const std::string & name,
|
||||
const ov::Tensor & tensor,
|
||||
std::map<std::string, void *> & output_dst);
|
||||
|
||||
template <typename T>
|
||||
std::vector<T> pad_input(const ggml_tensor* tensor, size_t padded_rows, size_t padded_cols, T pad_value) {
|
||||
std::vector<T> pad_input(const ggml_tensor * tensor, size_t padded_rows, size_t padded_cols, T pad_value) {
|
||||
std::vector<T> padded_data(padded_rows * padded_cols, pad_value);
|
||||
size_t rows = tensor->ne[1];
|
||||
size_t cols = tensor->ne[0];
|
||||
T* data = static_cast<T*>(tensor->data);
|
||||
T * data = static_cast<T *>(tensor->data);
|
||||
|
||||
for (size_t i = 0; i < std::min(rows, padded_rows); ++i) {
|
||||
for (size_t j = 0; j < std::min(cols, padded_cols); ++j) {
|
||||
|
|
@ -36,18 +36,20 @@ std::vector<T> pad_input(const ggml_tensor* tensor, size_t padded_rows, size_t p
|
|||
return padded_data;
|
||||
}
|
||||
|
||||
void set_zero_diagonal(std::vector<float>& matrix, size_t dim);
|
||||
void set_zero_diagonal(std::vector<float> & matrix, size_t dim);
|
||||
|
||||
bool is_prefill(struct ggml_cgraph * cgraph);
|
||||
|
||||
ov::AnyMap get_npu_prefill_config();
|
||||
ov::AnyMap get_npu_generate_config();
|
||||
|
||||
std::map<ggml_type, ExtraQuantType> get_types_to_requant(const std::string& device);
|
||||
std::map<ggml_type, ExtraQuantType> get_types_to_requant(const std::string & device);
|
||||
|
||||
ov::Tensor get_ov_input_tensor(std::shared_ptr<GgmlOvDecoder> ggml_decoder, const std::string& param_name);
|
||||
ov::Tensor get_ov_input_tensor(std::shared_ptr<GgmlOvDecoder> ggml_decoder, const std::string & param_name);
|
||||
|
||||
bool is_naive(struct ggml_cgraph* cgraph);
|
||||
bool is_naive(struct ggml_cgraph * cgraph);
|
||||
|
||||
enum ggml_status naive_compute(struct ggml_cgraph* cgraph, ov::Core& core, const std::string& device,
|
||||
const ov::AnyMap& config);
|
||||
enum ggml_status naive_compute(struct ggml_cgraph * cgraph,
|
||||
ov::Core & core,
|
||||
const std::string & device,
|
||||
const ov::AnyMap & config);
|
||||
|
|
|
|||
Loading…
Reference in New Issue