Fix Phi3 ROPE; Add test-backend-ops
Parent: 1ed49bbfaf
Commit: 44f4cf34b1
@@ -5,6 +5,10 @@ AlignConsecutiveDeclarations: false
ReferenceAlignment: Left
PointerAlignment: Left
Cpp11BracedListStyle: true
AccessModifierOffset: -4
BinPackArguments: false
BinPackParameters: false
BreakBeforeBraces: Attach

Language: Cpp
AlignAfterOpenBracket: Align

@@ -27,29 +31,7 @@ AllowShortIfStatementsOnASingleLine: Never
AllowShortLambdasOnASingleLine: Inline
AllowShortLoopsOnASingleLine: false
AlwaysBreakBeforeMultilineStrings: true
BinPackArguments: true
BinPackParameters: true # OnePerLine
BitFieldColonSpacing: Both
BreakBeforeBraces: Custom # Attach
BraceWrapping:
  AfterCaseLabel: true
  AfterClass: false
  AfterControlStatement: false
  AfterEnum: false
  AfterFunction: false
  AfterNamespace: false
  AfterObjCDeclaration: false
  AfterStruct: false
  AfterUnion: false
  AfterExternBlock: false
  BeforeCatch: false
  BeforeElse: false
  BeforeLambdaBody: false
  BeforeWhile: false
  IndentBraces: false
  SplitEmptyFunction: false
  SplitEmptyRecord: false
  SplitEmptyNamespace: false
# BreakAdjacentStringLiterals: true
BreakAfterAttributes: Never
BreakBeforeBinaryOperators: None
@@ -5,6 +5,7 @@

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <execution>

@@ -15,6 +16,8 @@
#include <openvino/core/dimension.hpp>
#include <openvino/core/node.hpp>
#include <openvino/core/partial_shape.hpp>
#include <openvino/core/type/bfloat16.hpp>
#include <openvino/core/type/element_type.hpp>
#include <openvino/core/type/float16.hpp>
#include <openvino/op/constant.hpp>
#include <openvino/op/parameter.hpp>
@@ -71,9 +74,19 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgrap
    }
}

GgmlOvDecoder::GgmlOvDecoder(struct ggml_cgraph* cgraph) {
    m_cgraph = cgraph;
    for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) {
        auto* cur_node = cgraph->nodes[node_n];
        m_nodes.push_back(cur_node);
        set_input_output(cur_node, true);
    }
}

// Called in GgmlOvDecoder constructor. Two cases: 1. constructing a decoder for the whole graph;
// 2. constructing a decoder for a node.
void GgmlOvDecoder::set_input_output(ggml_tensor* node) {
// 2. constructing a decoder for a node;
// 3. constructing a decoder for the whole graph naively (op test case)
void GgmlOvDecoder::set_input_output(ggml_tensor* node, bool naive) {
    std::string node_name;
    if (node->op == GGML_OP_CPY) {
        // CPY updates the input tensor in place. For later ov op that uses the
@@ -98,8 +111,14 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node) {
        m_inputs[src_name] = src;
        m_op_node_name.emplace_back(src_name, ggml_op_name(node->op));

        // If called for the whole graph, create constant nodes for weights and param nodes for inputs
        if (!m_node && !src->view_src) {
        // Add model inputs and weights constants, if called for the whole graph
        if (naive) {
            auto param_node = std::make_shared<ov::op::v0::Parameter>(get_ov_type(src), get_graph_input_shape(src));
            param_node->set_friendly_name(src_name);
            param_node->output(0).get_tensor().set_names({src_name});
            m_model_inputs[src_name] = param_node;

        } else if (!m_node && !src->view_src) {
            ggml_backend_buffer* buffer = src->buffer;

            if (buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY || src->flags & GGML_TENSOR_FLAG_INPUT) {

@@ -118,7 +137,10 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node) {
        }
    }

    if (!m_node) {
    // Add model outputs, if called for the whole graph
    if (naive) {
        m_model_output_names.push_back(node->name);
    } else if (!m_node) {
        static std::set<std::string> debug_output_names = {};
        // Workaround: the final tensor "result_output" does not have GGML_TENSOR_FLAG_OUTPUT flag set in cgraph
        if (node->buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY || node->flags & GGML_TENSOR_FLAG_OUTPUT ||
@@ -164,17 +186,7 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node) {
                m_op_case = 2;
            }
            break;
        }
        case GGML_OP_MUL_MAT: {
            if (node->src[0]->view_src == nullptr) {
                m_op_case = 1;
            } else if (std::string(node->src[0]->name).find("cache_k") == 0) {
                m_op_case = 2;
            } else if (std::string(node->src[0]->name).find("cache_v") == 0) {
                m_op_case = 3;
            }
            break;
        }
        case GGML_OP_PERMUTE: {
            if (node->src[0]->view_src == nullptr) {
                // Permute Qcur

@@ -188,6 +200,23 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node) {
            }
            break;
        }
        case GGML_OP_GET_ROWS:
        {
            if (node->src[1]->op == GGML_OP_VIEW) {
                m_op_case = 2;
            } else {
                m_op_case = 1;
            }
            break;
        }
        case GGML_OP_ROPE:
        {
            if (node->src[0]->op == GGML_OP_VIEW) {
                m_op_case = 2;
            } else {
                m_op_case = 1;
            }
        }
        default:
            break;
    }
@@ -237,6 +266,9 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor* src) co
        input_shape = ov::PartialShape{m_context_size, m_num_heads_kv, m_head_size};
    } else if (std::string(src->name).find("cache_v") == 0) {
        input_shape = ov::PartialShape{m_num_heads_kv, m_head_size, m_context_size};
    } else if (src->op == GGML_OP_VIEW) {
        // This case is added to make test-backend-ops work
        input_shape = ov::PartialShape{get_shape(src->view_src)};
    } else {
        input_shape = ov::PartialShape{get_shape(src)};
    }
@@ -373,6 +405,17 @@ std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor* tensor)
            weight_node = std::make_shared<ov::op::v0::Constant>(node_type, node_shape, data_f16);
            break;
        }
        case GGML_TYPE_BF16:
        {
            const auto* ptr = reinterpret_cast<const uint16_t*>(tensor->data);
            std::vector<ov::bfloat16> data_bf16;
            data_bf16.reserve(ne_total);
            for (int i = 0; i < ne_total; ++i) {
                data_bf16.push_back(ov::bfloat16::from_bits(ptr[i]));
            }
            weight_node = std::make_shared<ov::op::v0::Constant>(node_type, node_shape, data_bf16);
            break;
        }
        default:
            throw std::invalid_argument("Unsupported tensor type");
    }
@@ -496,6 +539,9 @@ ov::element::Type GgmlOvDecoder::get_ov_type(const ggml_tensor* tensor) {
        case GGML_TYPE_F16:
            type = ov::element::f16;
            break;
        case GGML_TYPE_BF16:
            type = ov::element::bf16;
            break;
        case GGML_TYPE_I64:
            type = ov::element::i64;
            break;
@@ -576,6 +622,7 @@ void GgmlOvDecoder::visit_subgraph(std::function<void(std::shared_ptr<GgmlDecode

const std::string& GgmlOvDecoder::get_op_type() const {
    static const std::map<ggml_op, std::string> ops = {
        {GGML_OP_NONE, "GGML_OP_NONE" },
        {GGML_OP_ACC,  "GGML_OP_ACC"  },
        {GGML_OP_ADD,  "GGML_OP_ADD"  },
        {GGML_OP_ADD1, "GGML_OP_ADD1" },
@@ -15,6 +15,8 @@ public:
    GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgraph, bool is_static, bool is_first_token,
                  int context_size, int num_heads, int num_heads_kv, int head_size);

    // Naive decoder
    GgmlOvDecoder(struct ggml_cgraph* cgraph);
    virtual ov::Any get_attribute(const std::string& name) const override {
        return nullptr;
        GGML_UNUSED(name);

@@ -111,7 +113,7 @@ public:
    void clear_model_weights() { m_model_weights.clear(); }

private:
    void set_input_output(ggml_tensor* node);
    void set_input_output(ggml_tensor* node, bool naive = false);
    void add_extra_inputs();
    static void dump_cgraph(const struct ggml_cgraph* cgraph, std::string& filename);
    static std::vector<size_t> get_shape(const ggml_tensor* tensor);
@@ -124,13 +126,13 @@ private:
    static std::shared_ptr<ov::Node> create_weight_node(ggml_tensor* tensor);
    void add_weight_const_parallel(std::map<std::string, std::shared_ptr<ov::Node>>& model_weights);

    struct ggml_cgraph* m_cgraph;
    struct ggml_cgraph* m_cgraph = nullptr;
    ggml_tensor* m_node = nullptr;
    std::vector<ggml_tensor*> m_nodes;
    std::map<std::string, ggml_tensor*> m_inputs;
    std::vector<std::string> m_input_names;
    std::map<std::string, ggml_tensor*> m_outputs;
    std::vector<std::string> m_output_names;
    ggml_tensor* m_node;
    std::vector<ggml_tensor*> m_nodes;
    std::string m_op_name;
    mutable std::string m_name;
    int m_op_case;
@@ -1,15 +1,17 @@
#include "ggml-backend-impl.h"
#include "ggml-impl.h"
#include "ggml-openvino.h"
#include "ggml-openvino/utils.h"
#include "ggml.h"

#include <cstdint>
#include <mutex>
#include <openvino/openvino.hpp>
#include <set>
#include <string>
#include <vector>

#include "ggml-backend-impl.h"
#include "ggml-impl.h"
#include "ggml-openvino/utils.h"
#include "ggml.h"

#define GGML_OPENVINO_MAX_STREAMS 8

struct ggml_backend_openvino_context {
@@ -234,9 +236,85 @@ static ggml_backend_buffer_t ggml_backend_openvino_device_buffer_from_host_ptr(g
    return nullptr;
}

static bool is_op_unsupported_case(const ggml_tensor* op) {
    if (op->op == GGML_OP_SOFT_MAX) {
        float scale = 1.0f;
        float max_bias = 0.0f;
        const auto* op_params = op->op_params;
        memcpy(&scale, (const float*) op_params + 0, sizeof(float));
        memcpy(&max_bias, (const float*) op_params + 1, sizeof(float));
        const uint32_t h = op->src[0]->ne[2];
        const uint32_t n_head = op->src[0]->ne[0];
        const uint32_t n_head_log2 = 1u << (uint32_t) floor(log2(n_head));

        const float m0 = powf(2.0f, -(max_bias) / n_head_log2);
        const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
        const float slope =
            (max_bias > 0.0f) ? h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2 * (h - n_head_log2) + 1) : 1.0f;

        if (slope != 1.0f) {
            GGML_LOG_WARN("OpenVINO backend does not support SOFT_MAX with slope != 1.0f\n");
            return true;
        }
    }

    if (op->op == GGML_OP_MUL_MAT) {
        if ((op->src[0]->view_src && op->src[0]->op != GGML_OP_PERMUTE) ||
            (op->src[1]->view_src && op->src[1]->op != GGML_OP_PERMUTE)) {
            GGML_LOG_WARN("OpenVINO backend does not support MUL_MAT with view_src tensors that are not PERMUTE\n");
            return true;
        }
    }

    if (op->op == GGML_OP_ROPE) {
        const int32_t* op_params = op->op_params;
        const int n_dims = op_params[1];
        const int mode = op_params[2];
        if (mode == GGML_ROPE_TYPE_MROPE || mode == GGML_ROPE_TYPE_VISION) {
            GGML_LOG_WARN("OpenVINO backend does not support ROPE with mode %d\n", mode);
            return true;
        }
        if (n_dims != op->src[0]->ne[0]) {
            GGML_LOG_WARN("OpenVINO backend does not support ROPE with n_dims %d != src[0]->ne[0] %ld\n",
                          n_dims,
                          op->src[0]->ne[0]);
            return true;
        }
        if (op->type != GGML_TYPE_F32) {
            GGML_LOG_WARN("OpenVINO backend does not support ROPE with type %s\n", ggml_type_name(op->type));
            return true;
        }
        float freq_scale;
        memcpy(&freq_scale, op_params + 6, sizeof(float));
        if (freq_scale != 1.0f) {
            GGML_LOG_WARN("OpenVINO backend does not support ROPE with freq_scale %f != 1.0f\n", freq_scale);
            return true;
        }
        float ext_factor;
        memcpy(&ext_factor, op_params + 7, sizeof(float));
        if (ext_factor != 0.0f) {
            GGML_LOG_WARN("OpenVINO backend does not support ROPE with ext_factor %f != 0.0f\n", ext_factor);
            return true;
        }
        if (op->src[0]->op == GGML_OP_VIEW) {
            if (op->src[0]->view_src->ne[1] != op->src[0]->ne[2]) {
                GGML_LOG_WARN(
                    "OpenVINO backend does not support ROPE with src[0]->view_src->ne[1] %ld != src[0]->ne[2] %ld\n",
                    op->src[0]->view_src->ne[1],
                    op->src[0]->ne[2]);
                return true;
            }
        }
    }
    return false;
}

static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
    GGML_ASSERT(dev->reg != nullptr);

    static const std::set<ggml_type> supported_types{
        GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_BF16, GGML_TYPE_I64, GGML_TYPE_I32};

    static const std::set<ggml_op> supported_ops{GGML_OP_NONE, GGML_OP_ADD, GGML_OP_MUL, GGML_OP_MUL_MAT,
                                                 GGML_OP_VIEW, GGML_OP_CONT, GGML_OP_CPY, GGML_OP_RESHAPE,
                                                 GGML_OP_PERMUTE, GGML_OP_TRANSPOSE, GGML_OP_GET_ROWS, GGML_OP_ROPE,
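Note: the SOFT_MAX rejection above uses the same ALiBi-style slope derivation as ggml. The standalone sketch below only reproduces that arithmetic with assumed example values (max_bias, n_head, h are not taken from any real model here); it is not part of the commit.

// Illustrative only: mirrors the slope computation in is_op_unsupported_case.
#include <cmath>
#include <cstdio>

int main() {
    const float    max_bias    = 8.0f;  // assumed example; most llama-style models use 0.0f
    const unsigned n_head      = 32;    // assumed, taken from src[0]->ne[0] in the real check
    const unsigned h           = 5;     // assumed, taken from src[0]->ne[2] in the real check
    const unsigned n_head_log2 = 1u << (unsigned) std::floor(std::log2((float) n_head));

    const float m0 = std::pow(2.0f, -max_bias / n_head_log2);
    const float m1 = std::pow(2.0f, -(max_bias / 2.0f) / n_head_log2);
    const float slope = (max_bias > 0.0f)
        ? (h < n_head_log2 ? std::pow(m0, h + 1.0f) : std::pow(m1, 2.0f * (h - n_head_log2) + 1.0f))
        : 1.0f;
    std::printf("slope = %f\n", slope);  // any value != 1.0f makes the backend reject this SOFT_MAX
    return 0;
}

With max_bias == 0.0f the slope stays exactly 1.0f, which is why the common non-ALiBi models remain supported.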
@@ -248,18 +326,60 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con
        GGML_GLU_OP_SWIGLU,
    };

    auto res = false;
    switch (op->op) {
        case GGML_OP_UNARY:
            res = supported_unary_ops.find(ggml_get_unary_op(op)) != supported_unary_ops.end();
            break;
        {
            auto supported = supported_unary_ops.find(ggml_get_unary_op(op)) != supported_unary_ops.end();
            if (!supported) {
                GGML_LOG_WARN("OpenVINO backend does not support unary op %s\n",
                              ggml_unary_op_name(ggml_get_unary_op(op)));
                return false;
            }
            break;
        }
        case GGML_OP_GLU:
            res = supported_glu_ops.find(ggml_get_glu_op(op)) != supported_glu_ops.end();
            break;
        {
            auto supported = supported_glu_ops.find(ggml_get_glu_op(op)) != supported_glu_ops.end();
            if (!supported) {
                GGML_LOG_WARN("OpenVINO backend does not support GLU op %s\n",
                              ggml_glu_op_name(ggml_get_glu_op(op)));
                return false;
            }
            break;
        }
        default:
            res = supported_ops.find(op->op) != supported_ops.end();
        {
            auto supported = supported_ops.find(op->op) != supported_ops.end();
            if (!supported) {
                GGML_LOG_WARN("OpenVINO backend does not support op %s\n", ggml_op_name(op->op));
                return false;
            }
        }
    }
    return res;

    if (supported_types.find(op->type) == supported_types.end()) {
        GGML_LOG_WARN("OpenVINO backend does not support tensor type %s\n", ggml_type_name(op->type));
        return false;
    }
    if (op->ne[3] != 1) {
        GGML_LOG_WARN("OpenVINO backend does not support tensors with ne[3] != 1\n");
        return false;
    }
    for (int i = 0; i < GGML_MAX_SRC; i++) {
        if (supported_types.find(op->type) == supported_types.end()) {
            GGML_LOG_WARN("OpenVINO backend does not support tensor type %s\n", ggml_type_name(op->type));
            return false;
        }
        if (op->src[i] != nullptr && op->src[i]->ne[3] != 1) {
            GGML_LOG_WARN("OpenVINO backend does not support tensors with ne[3] != 1\n");
            return false;
        }
    }

    if (is_op_unsupported_case(op)) {
        return false;
    }
    return true;
}

static bool ggml_backend_openvino_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
@@ -10,13 +10,13 @@ namespace ggml {

FrontEnd::FrontEnd() {}

std::shared_ptr<Model> FrontEnd::convert(const InputModel::Ptr& model) {
std::shared_ptr<Model> FrontEnd::convert(const InputModel::Ptr& model, bool naive) {
    auto ggml_model = std::dynamic_pointer_cast<ggml::InputModel>(model);
    FRONT_END_GENERAL_CHECK(ggml_model, "Invalid input model");
    std::shared_ptr<Model> converted_model;
    const auto& supported_ops = get_supported_ops();
    {
        TranslateSession translate_session(model, supported_ops);
        TranslateSession translate_session(model, supported_ops, naive);
        converted_model = translate_session.get_converted_model();
    }
    return converted_model;
@@ -15,7 +15,7 @@ public:
    using Ptr = std::shared_ptr<FrontEnd>;
    FrontEnd();

    static std::shared_ptr<Model> convert(const InputModel::Ptr& model);
    static std::shared_ptr<Model> convert(const InputModel::Ptr& model, bool naive = false);
};

}  // namespace ggml
@@ -77,6 +77,10 @@ public:
        return m_tensor_map->at(name);
    }

    bool has_input(const std::string& name) const {
        return m_tensor_map->find(name) != m_tensor_map->end();
    }

    const std::string& get_name() const override {
        return m_decoder->get_op_name();
    }
@@ -34,19 +34,7 @@ OutputVector translate_cont(const NodeContext& context) {
                                                    false);
    } else {
        // The input comes from a VIEW
        // Currently all cases are slicing at lowest dim
        int32_t* op_params = context.get_input_op_params(0);
        auto output_stride = context.get_output_stride(0);

        int64_t split_addr = op_params[0] / output_stride[2];
        std::vector<int64_t> begin = {0, 0, split_addr};
        std::vector<int64_t> end = {(int64_t)src_shape[0], INT_MAX, split_addr + (int64_t)src_shape[2]};
        std::vector<int64_t> strides = {1, 1, 1};

        auto begin_const = ov::op::v0::Constant::create(element::i64, {begin.size()}, begin);
        auto end_const = ov::op::v0::Constant::create(ov::element::i64, {end.size()}, end);
        auto strides_const = ov::op::v0::Constant::create(ov::element::i64, {strides.size()}, strides);
        res = std::make_shared<ov::op::v8::Slice>(context.get_input(0), begin_const, end_const, strides_const);
        res = process_view_input(context, 0);
    }

    return rename_outputs_with_suffix({res}, context.get_name());
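Note: the inline begin/end/stride construction in translate_cont is replaced by the shared process_view_input helper (defined later in this commit's utils changes). Conceptually it turns a ggml VIEW's byte offset (op_params[0]) into an element index on the sliced axis by dividing by that axis's byte stride. A small sketch with purely assumed numbers (not from any real graph):

// Illustrative only: how a VIEW byte offset becomes Slice bounds.
#include <cstdint>
#include <cstdio>

int main() {
    const int32_t view_offset_bytes = 8192;  // assumed op_params[0] of the VIEW
    const int64_t stride_bytes      = 4;     // assumed byte stride of the sliced axis (f32 elements)
    const int64_t slice_len         = 32;    // assumed number of elements kept by the view

    const int64_t begin = view_offset_bytes / stride_bytes;  // 2048
    const int64_t end   = begin + slice_len;                 // 2080
    std::printf("Slice on the last axis: [%lld, %lld)\n", (long long) begin, (long long) end);
    return 0;
}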
@@ -1,10 +1,12 @@
#include <cstdint>
#include <openvino/core/node.hpp>
#include <openvino/core/node_output.hpp>
#include <openvino/op/constant.hpp>
#include <openvino/op/convert.hpp>
#include <openvino/op/gather.hpp>
#include <openvino/op/reshape.hpp>
#include <vector>
#include <openvino/op/slice.hpp>
#include <openvino/op/squeeze.hpp>

#include "../node_context.hpp"
#include "../op_table.hpp"
@@ -18,19 +20,32 @@ namespace op {
OutputVector translate_get_rows(const NodeContext& context) {
    num_inputs_check(context, 2, 2);

    auto data_node = context.get_input(0);
    auto indices_node = context.get_input(1);
    int op_case = context.get_op_case();
    FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2, "Unsupported CONT case");

    auto indices_shape = get_dimensions(indices_node.get_node_shared_ptr(), {2});
    Output<Node> indice_reshaped = std::make_shared<ov::op::v1::Reshape>(indices_node, indices_shape, false);
    Output<Node> res;
    auto data = context.get_input(0);
    auto indices = context.get_input(1);

    auto axis_node = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{}, {1});
    if (op_case == 2) {
        // The input comes from a VIEW
        indices = process_view_input(context, 1);
    }

    auto axis = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{}, {1});
    if (indices.get_partial_shape()[1].get_length() == 1) {
        indices =
            std::make_shared<ov::op::v0::Squeeze>(indices, ov::op::v0::Constant::create(ov::element::i64, {2}, {0, 1}));
        res = std::make_shared<ov::op::v8::Gather>(data, indices, axis);
    } else {
        indices =
            std::make_shared<ov::op::v0::Squeeze>(indices, ov::op::v0::Constant::create(ov::element::i64, {1}, {0}));
        res = std::make_shared<ov::op::v8::Gather>(data, indices, axis, 1);
    }

    Output<Node> res = std::make_shared<ov::op::v8::Gather>(data_node, indice_reshaped, axis_node);
    if (res.get_element_type() != context.get_output_type(0)) {
        res = std::make_shared<ov::op::v0::Convert>(res, context.get_output_type(0));
    }

    return rename_outputs_with_suffix({res}, context.get_name());
}
@@ -26,48 +26,46 @@ namespace op {
OutputVector translate_mulmat(const NodeContext& context) {
    num_inputs_check(context, 2, 2);

    int op_case = context.get_op_case();
    FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2 || op_case == 3, "Unsupported MULMAT case");

    ov::Output<Node> res;
    ov::Output<ov::Node> B = context.get_input(0);
    ov::Output<ov::Node> A = std::make_shared<ov::op::v0::Convert>(context.get_input(1), context.get_input_type(0));

    if (op_case == 1) {
        auto src0 = context.get_input(0);
        auto src1 = std::make_shared<ov::op::v0::Convert>(context.get_input(1), context.get_input_type(0));
        auto result_lp = std::make_shared<ov::op::v0::MatMul>(src1, src0, false, true);
        res = std::make_shared<ov::op::v0::Convert>(result_lp, context.get_output_type(0));
    } else {
        ov::Output<ov::Node> B = context.get_input(0);
        ov::Output<ov::Node> A = std::make_shared<ov::op::v0::Convert>(context.get_input(1), context.get_input_type(0));
        auto B_shape = context.get_input_shape(0).to_shape();
        auto A_shape = context.get_input_shape(1).to_shape();
        int64_t A_batch = A_shape[0];
        int64_t B_batch = B_shape[0];
        auto A_batch_larger = A_batch > B_batch;
        Output<Node> Z = A_batch_larger ? B : A;
        int64_t factor = A_batch_larger ? A_batch / B_batch : B_batch / A_batch;
        if (factor > 1) {
            auto A_batch_node = ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector<int64_t>{A_batch});
            auto B_batch_node = ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector<int64_t>{B_batch});
            auto factor_node = ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector<int64_t>{factor});

    int64_t num_heads = context.get_num_heads();
    int64_t num_heads_kv = context.get_num_heads_kv();
    int64_t kv_num_heads_factor = num_heads / num_heads_kv;
    if (kv_num_heads_factor > 1) {
        auto num_heads_node = ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector<int64_t>{num_heads});
        auto num_heads_kv_node =
            ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector<int64_t>{num_heads_kv});
        auto factor_node =
            ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector<int64_t>{kv_num_heads_factor});
        auto B_shape_last_two = get_dimensions(B.get_node_shared_ptr(), {1, 2});
            auto Z_last_two_dim = get_dimensions(Z.get_node_shared_ptr(), {1, 2});

        auto unsqueeze_axes = ov::op::v0::Constant::create(ov::element::i64, Shape{}, {1});
        auto B_unsqueezed = std::make_shared<ov::op::v0::Unsqueeze>(B, unsqueeze_axes);
            auto unsqueeze_axes = ov::op::v0::Constant::create(ov::element::i64, Shape{}, {1});
            auto Z_unsqueezed = std::make_shared<ov::op::v0::Unsqueeze>(Z, unsqueeze_axes);

        auto broadcast_shape = std::make_shared<ov::op::v0::Concat>(
            ov::OutputVector{num_heads_kv_node, factor_node, B_shape_last_two}, 0);
        auto B_broadcasted = std::make_shared<ov::op::v3::Broadcast>(B_unsqueezed, broadcast_shape);
            Output<Node> batch_small = A_batch_larger ? B_batch_node : A_batch_node;
            Output<Node> batch_large = A_batch_larger ? A_batch_node : B_batch_node;
            auto broadcast_shape =
                std::make_shared<ov::op::v0::Concat>(ov::OutputVector{batch_small, factor_node, Z_last_two_dim}, 0);
            auto Z_broadcasted = std::make_shared<ov::op::v3::Broadcast>(Z_unsqueezed, broadcast_shape);

        auto new_B_shape =
            std::make_shared<ov::op::v0::Concat>(ov::OutputVector{num_heads_node, B_shape_last_two}, 0);
        B = std::make_shared<ov::op::v1::Reshape>(B_broadcasted, new_B_shape, false);
            auto new_Z_shape = std::make_shared<ov::op::v0::Concat>(ov::OutputVector{batch_large, Z_last_two_dim}, 0);
            Z = std::make_shared<ov::op::v1::Reshape>(Z_broadcasted, new_Z_shape, false);
        }
        if (A_batch_larger) {
            B = Z;
        } else {
            A = Z;
        }

        auto result_lp = std::make_shared<ov::op::v0::MatMul>(A, B, false, true);
        res = std::make_shared<ov::op::v0::Convert>(result_lp, context.get_output_type(0));
    }

    return rename_outputs_with_suffix({res}, context.get_name());
    return rename_outputs_with_suffix({res}, context.get_name());
}

}  // namespace op
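Note on the MUL_MAT change: instead of always repeating the KV operand by num_heads / num_heads_kv, the translation now broadcasts whichever operand has the smaller batch by the batch ratio (Unsqueeze on axis 1, Broadcast, then Reshape), which also covers the grouped-query-attention case. A scalar sketch of the equivalent repeat, with assumed head counts (not backend code):

// Illustrative reference for the Unsqueeze -> Broadcast -> Reshape above:
// each of the n_kv_heads K/V matrices is repeated `factor` times so the
// batch dimension matches n_heads; kv head h then serves query heads
// h*factor .. h*factor + factor - 1.
#include <cstddef>
#include <vector>

std::vector<std::vector<float>> repeat_kv_heads(const std::vector<std::vector<float>>& kv_heads,
                                                std::size_t factor) {
    std::vector<std::vector<float>> out;
    out.reserve(kv_heads.size() * factor);
    for (const auto& head : kv_heads) {
        for (std::size_t r = 0; r < factor; ++r) {
            out.push_back(head);  // e.g. 8 KV heads * factor 4 -> 32 query-side batches
        }
    }
    return out;
}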
@@ -11,6 +11,7 @@
#include <openvino/op/slice.hpp>
#include <openvino/op/split.hpp>
#include <openvino/op/subtract.hpp>
#include <openvino/op/unsqueeze.hpp>
#include <vector>

#include "../node_context.hpp"
@@ -25,37 +26,66 @@ namespace op {
OutputVector translate_rope(const NodeContext& context) {
    num_inputs_check(context, 2, 3);

    int op_case = context.get_op_case();
    FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2, "Unsupported CONT case");

    ov::Output<Node> res;

    auto data_node = context.get_input(0).get_node_shared_ptr();
    auto cos_theta_node = context.get_input("rope_cos");
    auto sin_theta_node = context.get_input("rope_sin");

    auto output_shape = context.get_output_shape(0).to_shape();
    int32_t* op_params = context.get_output_op_params(0);

    Output<Node> cos_theta_node;
    Output<Node> sin_theta_node;
    if (context.has_input("rope_cos")) {
        cos_theta_node = context.get_input("rope_cos");
        sin_theta_node = context.get_input("rope_sin");
    } else {
        auto inp_pos = context.get_input(1).get_node_shared_ptr();
        std::shared_ptr<ov::Node> rope_freqs_weight;
        if (context.get_input_size() == 3) {
            rope_freqs_weight = context.get_input(2).get_node_shared_ptr();
        }
        auto sin_cos = make_sin_cos(op_params, inp_pos, rope_freqs_weight);
        sin_theta_node = sin_cos.first;
        cos_theta_node = sin_cos.second;
    }

    if (op_case == 2) {
        // The input comes from a VIEW
        int slice_len = output_shape[1] * output_shape[2];
        data_node = process_view_input(context, 0, slice_len).get_node_shared_ptr();
        auto data_shape = ov::op::v0::Constant::create(
            ov::element::i64, {3}, std::vector<int64_t>{-1, (int64_t) output_shape[1], (int64_t) output_shape[2]});
        data_node = std::make_shared<ov::op::v1::Reshape>(data_node, data_shape, false);
    }

    const int mode = op_params[2];
    constexpr int GGML_ROPE_TYPE_NEOX = 2;
    const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
    constexpr int ROPE_TYPE_NEOX = 2;
    constexpr int ROPE_TYPE_NORM = 0;

    if (!is_neox) {
        auto input_shape = context.get_input_shape(0);
    if (mode == ROPE_TYPE_NORM) {
        auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
        auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
        auto two = ov::op::v0::Constant::create(ov::element::i64, {1}, {2});
        auto end = ov::op::v0::Constant::create(ov::element::i64, {1}, {output_shape[2]});
        auto even_slice = std::make_shared<ov::op::v8::Slice>(data_node, zero, end, two, two);
        auto odd_slice = std::make_shared<ov::op::v8::Slice>(data_node, one, end, two, two);

        auto begin_even = ov::op::v0::Constant::create(ov::element::i64, Shape{3}, {0, 0, 0});
        auto begin_odd = ov::op::v0::Constant::create(ov::element::i64, Shape{3}, {0, 0, 1});
        auto end = std::make_shared<ov::op::v0::ShapeOf>(data_node);
        auto stride = ov::op::v0::Constant::create(ov::element::i64, Shape{3}, {1, 1, 2});
        auto even_slice = std::make_shared<ov::op::v8::Slice>(data_node, begin_even, end, stride);
        auto odd_slice = std::make_shared<ov::op::v8::Slice>(data_node, begin_odd, end, stride);

        auto first_half =
        Output<Node> first_half =
            std::make_shared<ov::op::v1::Subtract>(std::make_shared<ov::op::v1::Multiply>(even_slice, cos_theta_node),
                                                   std::make_shared<ov::op::v1::Multiply>(odd_slice, sin_theta_node));
        auto second_half =
        Output<Node> second_half =
            std::make_shared<ov::op::v1::Add>(std::make_shared<ov::op::v1::Multiply>(even_slice, sin_theta_node),
                                              std::make_shared<ov::op::v1::Multiply>(odd_slice, cos_theta_node));

        auto stack = std::make_shared<ov::op::v0::Concat>(OutputVector{first_half, second_half}, 2);
        first_half = std::make_shared<ov::op::v0::Unsqueeze>(first_half,
                                                             ov::op::v0::Constant::create(ov::element::i64, {1}, {3}));
        second_half = std::make_shared<ov::op::v0::Unsqueeze>(second_half,
                                                              ov::op::v0::Constant::create(ov::element::i64, {1}, {3}));
        auto stack = std::make_shared<ov::op::v0::Concat>(OutputVector{first_half, second_half}, 3);
        res = std::make_shared<ov::op::v1::Reshape>(stack, std::make_shared<ov::op::v0::ShapeOf>(data_node), false);
    } else {
    } else if (mode == ROPE_TYPE_NEOX) {
        auto data_split = std::make_shared<ov::op::v1::Split>(
            data_node, ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {2}), 2);
        Output<Node> slice_data_node_0 = data_split->outputs()[0];
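Note on the Phi-3 ROPE fix: the non-NEOX ("NORM") path previously concatenated the two rotated halves along the feature axis, which un-interleaves the even/odd pairs. It now unsqueezes both halves, concatenates them on a new trailing axis and reshapes back, so each pair stays adjacent as ggml's NORM mode expects. A scalar reference of what the graph is meant to compute per pair (a sketch, not the backend code):

// NORM-style (interleaved) RoPE: each adjacent pair (x[2i], x[2i+1]) of a
// head vector is rotated by theta_i; NEOX instead pairs x[i] with x[i + n/2].
#include <cmath>
#include <vector>

void rope_norm_reference(std::vector<float>& x, const std::vector<float>& theta) {
    for (std::size_t i = 0; i + 1 < x.size(); i += 2) {
        const float c  = std::cos(theta[i / 2]);
        const float s  = std::sin(theta[i / 2]);
        const float x0 = x[i];
        const float x1 = x[i + 1];
        x[i]     = x0 * c - x1 * s;  // the "first_half" term in the graph
        x[i + 1] = x0 * s + x1 * c;  // the "second_half" term in the graph
    }
}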
@@ -33,9 +33,9 @@ OutputVector translate_soft_max(const NodeContext& context) {
    auto* op_params = context.get_output_op_params(0);
    memcpy(&scale, (float*) op_params + 0, sizeof(float));
    memcpy(&max_bias, (float*) op_params + 1, sizeof(float));
    const uint32_t h = context.get_head_size();

    const uint32_t n_head = context.get_input_shape(0)[0].get_length();
    auto src0_shape = context.get_input_shape(0).get_shape();
    const uint32_t h = src0_shape[2];
    const uint32_t n_head = src0_shape[0];
    const uint32_t n_head_log2 = 1u << (uint32_t) floor(log2(n_head));

    const float m0 = powf(2.0f, -(max_bias) / n_head_log2);
@@ -46,23 +46,30 @@ OutputVector translate_soft_max(const NodeContext& context) {
    auto scale_node = std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{}, std::vector<float>{scale});
    auto scaled_input = std::make_shared<ov::op::v1::Multiply>(input_node, scale_node);

    if (context.get_input_size() < 2) {
        res = std::make_shared<ov::op::v8::Softmax>(scaled_input, 2);
        return rename_outputs_with_suffix({res}, context.get_name());
    }

    auto mask_node = context.get_input(1);

    // Use Q-cur to retrieve the token length, so that the translation of SOFT_MAX
    std::shared_ptr<ov::Node> token_len = get_dimensions(input_node, {1});
    // Try using Q-cur to retrieve the token length, so that the translation of SOFT_MAX
    // does not depend on the result of the QK MatMul, so that QK matmul + softmax + qkv matmul
    // can be fused into SDPA.
    if (input_node->get_type_info() != ov::op::v0::Convert::get_type_info_static()) {
        throw std::runtime_error("Input of SOFT_MAX should be MatMul of qk followed by a Convert");
    if (input_node->get_type_info() == ov::op::v0::Convert::get_type_info_static()) {
        auto qk = input_node->get_input_node_shared_ptr(0);
        if (qk->get_type_info() == ov::op::v0::MatMul::get_type_info_static()) {
            token_len = get_dimensions(qk->get_input_node_shared_ptr(0), {1});
        }
    }
    auto qk = input_node->get_input_node_shared_ptr(0);
    if (qk->get_type_info() != ov::op::v0::MatMul::get_type_info_static()) {
        throw std::runtime_error("Input of SOFT_MAX should be MatMul of qk followed by a Convert");
    }
    auto token_len = get_dimensions(qk->get_input_node_shared_ptr(0), {1});

    auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
    auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
    auto mask_node_sliced = std::make_shared<ov::op::v8::Slice>(mask_node, zero, token_len, one, one);
    std::shared_ptr<ov::Node> mask_node_sliced =
        std::make_shared<ov::op::v8::Slice>(mask_node, zero, token_len, one, one);
    if (mask_node_sliced->get_element_type() != context.get_output_type(0)) {
        mask_node_sliced = std::make_shared<ov::op::v0::Convert>(mask_node_sliced, context.get_output_type(0));
    }

    Output<Node> slope_mask;
    if (slope != 1.0f) {
@@ -145,69 +145,18 @@ void add_rope_sin_cos(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) {
    int32_t* rope_params = ggml_model_decoder.get_rope_params();
    auto inp_pos = tensor_map.at("inp_pos").get_node_shared_ptr();
    std::shared_ptr<ov::Node> rope_freqs_weight;

    inp_pos = std::make_shared<ov::op::v0::Convert>(inp_pos, ov::element::f32);
    auto pos_perm =
        std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{3}, std::vector<int64_t>{2, 1, 0});
    inp_pos = std::make_shared<ov::op::v1::Transpose>(inp_pos, pos_perm);
    if (tensor_map.find("rope_freqs_weight") != tensor_map.end()) {
        rope_freqs_weight = tensor_map.at("rope_freqs.weight").get_node_shared_ptr();
    }

    float freq_base;
    float freq_scale;
    float ext_factor;
    float attn_factor;
    float beta_fast;
    float beta_slow;
    const int n_dims = rope_params[1];
    const int n_ctx_orig = rope_params[4];
    memcpy(&freq_base, rope_params + 5, sizeof(float));
    memcpy(&freq_scale, rope_params + 6, sizeof(float));
    memcpy(&ext_factor, rope_params + 7, sizeof(float));
    memcpy(&attn_factor, rope_params + 8, sizeof(float));
    memcpy(&beta_fast, rope_params + 9, sizeof(float));
    memcpy(&beta_slow, rope_params + 10, sizeof(float));
    auto sin_cos = make_sin_cos(rope_params, inp_pos, rope_freqs_weight);
    auto sin_theta = sin_cos.first;
    auto cos_theta = sin_cos.second;

    const float theta_scale = powf(freq_base, -2.0f / n_dims);

    // TODO: corr_dims is not used in the current implementation
    float corr_dims[2];
    ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);

    // TODO: GGML_OP_ROPE_BACK -> false
    // bool forward = true;
    // const float sin_sign = forward ? 1.0f : -1.0f;

    const int64_t half_head_size = ggml_model_decoder.get_head_size() / 2;
    std::vector<float> factor(half_head_size);
    factor[0] = freq_scale;
    for (int64_t i = 1; i < half_head_size; i++) {
        factor[i] = theta_scale * factor[i - 1];
    }

    Output<Node> factor_node =
        std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{1, 1, factor.size()}, factor);
    if (rope_freqs_weight) {
        factor_node = std::make_shared<ov::op::v1::Divide>(factor_node, rope_freqs_weight);
    }

    auto half_head_size_node = ov::op::v0::Constant::create(ov::element::i64, Shape{1}, {half_head_size});
    Output<Node> cos_factor =
        std::make_shared<ov::op::v0::Cos>(std::make_shared<ov::op::v1::Multiply>(factor_node, inp_pos));
    Output<Node> sin_factor =
        std::make_shared<ov::op::v0::Sin>(std::make_shared<ov::op::v1::Multiply>(factor_node, inp_pos));

    float mscale = attn_factor;
    Output<Node> mscale_node =
        std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{}, std::vector<float>{mscale});

    auto cos_theta = std::make_shared<ov::op::v1::Multiply>(cos_factor, mscale_node);
    auto sin_theta = std::make_shared<ov::op::v1::Multiply>(sin_factor, mscale_node);
    cos_theta->set_friendly_name("rope_cos");
    sin_theta->set_friendly_name("rope_sin");
    tensor_map.insert({"rope_cos", cos_theta->output(0)});
    tensor_map.insert({"rope_sin", sin_theta->output(0)});
    cos_theta.get_node_shared_ptr()->set_friendly_name("rope_cos");
    sin_theta.get_node_shared_ptr()->set_friendly_name("rope_sin");
    tensor_map.insert({"rope_cos", cos_theta});
    tensor_map.insert({"rope_sin", sin_theta});
}

// Create common patterns
@@ -220,10 +169,12 @@ void preprocess(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) {
}  // namespace

TranslateSession::TranslateSession(const frontend::InputModel::Ptr& input_model,
                                   const std::unordered_map<std::string, CreatorFunction>& translator_map)
    : m_input_model(input_model),
      m_translator_map(translator_map),
      m_ov_model(nullptr) {}
                                   const std::unordered_map<std::string, CreatorFunction>& translator_map,
                                   bool naive) :
    m_input_model(input_model),
    m_translator_map(translator_map),
    m_ov_model(nullptr),
    m_naive(naive) {}

std::shared_ptr<Model> TranslateSession::get_converted_model() {
    if (m_ov_model) {
@@ -258,6 +209,10 @@ std::shared_ptr<Model> TranslateSession::translate_graph(const frontend::InputMo

    auto node_visitor = [&](std::shared_ptr<GgmlDecoder> node) {
        auto operation_type = node->get_op_type();
        if (operation_type == "GGML_OP_NONE") {
            return;
        }

        ov::OutputVector converted_outputs;
        auto it = m_translator_map.find(operation_type);
        FRONT_END_OP_CONVERSION_CHECK(it != m_translator_map.end(),
@@ -285,7 +240,9 @@ std::shared_ptr<Model> TranslateSession::translate_graph(const frontend::InputMo
        }
    };

    preprocess(*tensor_map, *ggml_model_decoder);
    if (!m_naive) {
        preprocess(*tensor_map, *ggml_model_decoder);
    }
    ggml_model_decoder->visit_subgraph(node_visitor);

    for (const auto& name : ggml_model_decoder->get_model_output_names()) {
@@ -10,7 +10,7 @@ namespace ggml {
class TranslateSession {
public:
    TranslateSession(const frontend::InputModel::Ptr& input_model,
                     const std::unordered_map<std::string, CreatorFunction>& translator_map);
                     const std::unordered_map<std::string, CreatorFunction>& translator_map, bool naive = false);

    std::shared_ptr<Model> get_converted_model();
    std::shared_ptr<Model> translate_graph(const frontend::InputModel::Ptr& input_model);
@@ -20,6 +20,7 @@ private:
    const frontend::InputModel::Ptr m_input_model;
    const std::unordered_map<std::string, CreatorFunction>& m_translator_map;
    std::shared_ptr<Model> m_ov_model;
    bool m_naive;
};

}  // namespace ggml
@@ -1,9 +1,20 @@
#include "utils.hpp"

#include <cstddef>
#include <ctime>
#include <memory>
#include <openvino/op/add.hpp>
#include <openvino/op/clamp.hpp>
#include <openvino/op/convert.hpp>
#include <openvino/op/cos.hpp>
#include <openvino/op/divide.hpp>
#include <openvino/op/gather.hpp>
#include <openvino/op/maximum.hpp>
#include <openvino/op/multiply.hpp>
#include <openvino/op/shape_of.hpp>
#include <openvino/op/sin.hpp>
#include <openvino/op/subtract.hpp>
#include <openvino/op/transpose.hpp>
#include <string>

namespace ov {
@@ -58,6 +69,134 @@ OutputVector rename_outputs_with_suffix(const OutputVector& outputs, const std::
    return outputs;
}

namespace {
ov::Output<ov::Node> rope_yarn_ramp_mix(int n_dims, const float corr_dims[2], float ext_factor) {
    int half_n_dims = n_dims / 2;
    std::vector<float> dim_ids_vec(half_n_dims);
    std::iota(dim_ids_vec.begin(), dim_ids_vec.end(), 0);
    auto dim_ids = ov::op::v0::Constant::create(ov::element::f32, Shape{1, 1, (size_t) half_n_dims}, dim_ids_vec);
    auto corr_low = ov::op::v0::Constant::create(ov::element::f32, Shape{1, 1, 1}, {corr_dims[0]});
    auto corr_high = ov::op::v0::Constant::create(ov::element::f32, Shape{1, 1, 1}, {corr_dims[1]});
    auto denom =
        std::make_shared<ov::op::v1::Maximum>(std::make_shared<ov::op::v1::Subtract>(corr_high, corr_low),
                                              ov::op::v0::Constant::create(ov::element::f32, Shape{1, 1, 1}, {0.001f}));
    auto ramp_y =
        std::make_shared<ov::op::v1::Divide>(std::make_shared<ov::op::v1::Subtract>(dim_ids, corr_low), denom);
    auto ramp_clamped = std::make_shared<ov::op::v0::Clamp>(ramp_y, 0.0f, 1.0f);
    auto ext_factor_node = ov::op::v0::Constant::create(ov::element::f32, Shape{}, {ext_factor});
    auto ramp_mix = std::make_shared<ov::op::v1::Multiply>(ramp_clamped, ext_factor_node);
    return ramp_mix;
}

float ggml_rope_yarn_corr_dim(int n_dims, int n_ctx_orig, float n_rot, float base) {
#ifndef M_PI
#    define M_PI 3.14159265358979323846
#endif
    return n_dims * logf(n_ctx_orig / (n_rot * 2 * (float) M_PI)) / (2 * logf(base));
}

void ggml_rope_yarn_corr_dims(int n_dims,
                              int n_ctx_orig,
                              float freq_base,
                              float beta_fast,
                              float beta_slow,
                              float dims[2]) {
    float start = floorf(ggml_rope_yarn_corr_dim(n_dims, n_ctx_orig, beta_fast, freq_base));
    float end = ceilf(ggml_rope_yarn_corr_dim(n_dims, n_ctx_orig, beta_slow, freq_base));
    dims[0] = std::max(0.0f, start);
    dims[1] = std::min(static_cast<float>(n_dims - 1), end);
}
}  // namespace

std::pair<ov::Output<Node>, ov::Output<Node>> make_sin_cos(int32_t* rope_params,
                                                           std::shared_ptr<ov::Node> inp_pos,
                                                           std::shared_ptr<ov::Node> rope_freqs_weight) {
    inp_pos = std::make_shared<ov::op::v0::Convert>(inp_pos, ov::element::f32);
    auto pos_perm =
        std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{3}, std::vector<int64_t>{2, 1, 0});
    inp_pos = std::make_shared<ov::op::v1::Transpose>(inp_pos, pos_perm);

    float freq_base;
    float freq_scale;
    float ext_factor;
    float attn_factor;
    float beta_fast;
    float beta_slow;
    const int n_dims = rope_params[1];
    const int n_ctx_orig = rope_params[4];
    memcpy(&freq_base, rope_params + 5, sizeof(float));
    memcpy(&freq_scale, rope_params + 6, sizeof(float));
    memcpy(&ext_factor, rope_params + 7, sizeof(float));
    memcpy(&attn_factor, rope_params + 8, sizeof(float));
    memcpy(&beta_fast, rope_params + 9, sizeof(float));
    memcpy(&beta_slow, rope_params + 10, sizeof(float));

    const float theta_scale = powf(freq_base, -2.0f / n_dims);

    float corr_dims[2];
    ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);

    std::vector<float> factor(n_dims / 2);
    factor[0] = freq_scale;
    for (size_t i = 1; i < factor.size(); i++) {
        factor[i] = theta_scale * factor[i - 1];
    }

    Output<Node> freq_factors =
        std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{1, 1, factor.size()}, factor);
    if (rope_freqs_weight) {
        freq_factors = std::make_shared<ov::op::v1::Divide>(freq_factors, rope_freqs_weight);
    }

    auto theta_extrap = std::make_shared<ov::op::v1::Multiply>(freq_factors, inp_pos);
    auto theta_interp = std::make_shared<ov::op::v1::Multiply>(
        theta_extrap, ov::op::v0::Constant::create(ov::element::f32, {1}, {freq_scale}));

    Output<Node> theta;
    float mscale = attn_factor;
    if (ext_factor == 0.0f) {
        theta = theta_interp;
    } else {
        auto ramp_mix = rope_yarn_ramp_mix(n_dims, corr_dims, ext_factor);
        auto one = ov::op::v0::Constant::create(ov::element::f32, Shape{1, 1, 1}, {1.0f});
        auto one_minus_ramp = std::make_shared<ov::op::v1::Subtract>(one, ramp_mix);

        theta = std::make_shared<ov::op::v1::Add>(std::make_shared<ov::op::v1::Multiply>(theta_interp, one_minus_ramp),
                                                  std::make_shared<ov::op::v1::Multiply>(theta_extrap, ramp_mix));
        mscale *= (1.0f + 0.1f * std::log(1.0f / freq_scale));
    }

    Output<Node> cos_theta = std::make_shared<ov::op::v0::Cos>(theta);
    Output<Node> sin_theta = std::make_shared<ov::op::v0::Sin>(theta);

    auto mscale_node = ov::op::v0::Constant::create(ov::element::f32, Shape{}, {mscale});

    cos_theta = std::make_shared<ov::op::v1::Multiply>(cos_theta, mscale_node);
    sin_theta = std::make_shared<ov::op::v1::Multiply>(sin_theta, mscale_node);
    return std::make_pair(sin_theta, cos_theta);
}

ov::Output<ov::Node> process_view_input(const NodeContext& context, int input_index, int slice_len) {
    // Only works for VIEW operations that slice at the lowest dimension
    // If the VIEW also reshape the result, `slice_len` should be provided
    auto input = context.get_input(input_index);
    int32_t* op_params = context.get_input_op_params(input_index);
    auto src1_stride = context.get_input_stride(input_index);

    int64_t split_addr = op_params[0] / src1_stride[2];
    if (slice_len == 0) {
        slice_len = context.get_input_shape(input_index)[2].get_length();
    }
    int64_t slice_end = split_addr + slice_len;

    auto begin = ov::op::v0::Constant::create(ov::element::i64, {1}, {split_addr});
    auto end = ov::op::v0::Constant::create(ov::element::i64, {1}, {slice_end});
    auto stride = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
    auto axes = ov::op::v0::Constant::create(ov::element::i64, {1}, {2});
    auto sliced = std::make_shared<ov::op::v8::Slice>(input, begin, end, stride, axes);
    return sliced;
}

}  // namespace ggml
}  // namespace frontend
}  // namespace ov
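Note: make_sin_cos now implements the YaRN path; the correction dims decide which rotary dimensions get extrapolated rather than interpolated frequencies, using the same formula as ggml_rope_yarn_corr_dim above. The sketch below only re-runs that formula with assumed example parameters (head size 128, 4096 original context, base 10000) so the intermediate values can be inspected; it is not part of the commit.

// Illustrative only: compute the YaRN correction dims for sample rope params.
#include <cmath>
#include <cstdio>
#ifndef M_PI
#    define M_PI 3.14159265358979323846
#endif

static float corr_dim(int n_dims, int n_ctx_orig, float n_rot, float base) {
    return n_dims * std::log(n_ctx_orig / (n_rot * 2.0f * (float) M_PI)) / (2.0f * std::log(base));
}

int main() {
    const int   n_dims     = 128;      // assumed head size
    const int   n_ctx_orig = 4096;     // assumed original training context
    const float freq_base  = 10000.0f; // assumed rope base
    const float beta_fast  = 32.0f;
    const float beta_slow  = 1.0f;

    const float start = std::floor(corr_dim(n_dims, n_ctx_orig, beta_fast, freq_base));
    const float end   = std::ceil(corr_dim(n_dims, n_ctx_orig, beta_slow, freq_base));
    std::printf("corr_dims = [%g, %g]\n", std::fmax(0.0f, start), std::fmin((float) n_dims - 1, end));
    return 0;
}

Dimensions below corr_dims[0] keep the extrapolated theta, dimensions above corr_dims[1] keep the interpolated one, and the clamped ramp in rope_yarn_ramp_mix blends the two in between.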
@@ -1,6 +1,10 @@
#pragma once

#include <memory>
#include <openvino/core/node.hpp>
#include <openvino/op/shape_of.hpp>
#include <openvino/op/slice.hpp>
#include <utility>

#include "node_context.hpp"
@@ -60,6 +64,12 @@ std::shared_ptr<ov::Node> get_dimensions(const std::shared_ptr<ov::Node>& node,

OutputVector rename_outputs_with_suffix(const OutputVector& outputs, const std::string& suffix);

std::pair<ov::Output<Node>, ov::Output<Node>> make_sin_cos(int32_t* rope_params,
                                                           std::shared_ptr<ov::Node> inp_pos,
                                                           std::shared_ptr<ov::Node> rope_freqs_weight = nullptr);

ov::Output<ov::Node> process_view_input(const NodeContext& context, int input_index, int slice_len = 0);

namespace op {
template <typename T>
OutputVector translate_1to1_match_2_inputs(const NodeContext& context) {
@@ -21,6 +21,7 @@
#include <vector>

#include "ggml-impl.h"
#include "ggml-openvino/ggml-decoder.h"
#include "ggml.h"
#include "openvino/frontend.hpp"
#include "openvino/input_model.hpp"
@@ -35,6 +36,9 @@ ov::Tensor convert_ggml_input_to_ov(std::shared_ptr<GgmlOvDecoder> ggml_decoder,
    ov::Shape input_shape;
    if (name.find("cache_k") == 0 || name.find("cache_v") == 0) {
        input_shape = ggml_decoder->get_graph_input_shape(ggml_tensor).to_shape();
    } else if (ggml_tensor->op == GGML_OP_VIEW) {
        // This case is added to make test-backend-ops work
        input_shape = ggml_decoder->get_graph_input_shape(ggml_tensor->view_src).to_shape();
    } else {
        input_shape = ggml_decoder->get_input_shape(name).to_shape();
    }
@@ -81,6 +85,10 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
        config = get_npu_config();
    }

    if (cgraph->n_nodes == 1) {
        return naive_compute(cgraph, core, device, config);
    }

    auto start_time = ggml_time_us();

    auto* cache_dir = getenv("GGML_OPENVINO_CACHE_DIR");
@@ -242,6 +250,42 @@ ov::AnyMap get_npu_config() {
    return config;
}

enum ggml_status naive_compute(struct ggml_cgraph* cgraph,
                               ov::Core& core,
                               const std::string& device,
                               const ov::AnyMap& config) {
    if (cgraph->nodes[0]->op == GGML_OP_NONE) {
        return GGML_STATUS_SUCCESS;
    }

    auto decoder = std::make_shared<GgmlOvDecoder>(cgraph);
    auto input_model = std::make_shared<ov::frontend::ggml::InputModel>(decoder);
    auto naive = true;
    auto model = ov::frontend::ggml::FrontEnd::convert(input_model, naive);
    auto infer_request = core.compile_model(model, device, config).create_infer_request();

    ov::serialize(model, "IR.xml");

    auto ov_params = model->get_parameters();
    for (size_t i = 0; i < ov_params.size(); i++) {
        auto param_name = ov_params[i]->get_friendly_name();
        auto input_tensor = get_ov_input_tensor(decoder, param_name);
        infer_request.set_input_tensor(i, input_tensor);
    }

    infer_request.infer();

    auto gguf_tensor_addrs = get_ggml_graph_output_dst(decoder);
    auto ov_results = model->get_results();
    for (size_t i = 0; i < ov_results.size(); i++) {
        auto result_name = ov_results[i]->get_friendly_name();
        const auto output_tensor = infer_request.get_output_tensor(i);

        std::memcpy(gguf_tensor_addrs[result_name], output_tensor.data(), output_tensor.get_byte_size());
    }
    return GGML_STATUS_SUCCESS;
}

ov::Tensor get_ov_input_tensor(std::shared_ptr<GgmlOvDecoder> ggml_decoder, const std::string& param_name) {
    bool is_static = ggml_decoder->is_static();
    bool is_first_token = ggml_decoder->is_first_token();
@@ -1,4 +1,5 @@
#include <algorithm>
#include <openvino/runtime/core.hpp>

#include "ggml-backend-impl.h"
#include "ggml-decoder.h"
@@ -42,3 +43,6 @@ bool is_prefill(struct ggml_cgraph * cgraph);
ov::AnyMap get_npu_config();

ov::Tensor get_ov_input_tensor(std::shared_ptr<GgmlOvDecoder> ggml_decoder, const std::string& param_name);

enum ggml_status naive_compute(struct ggml_cgraph* cgraph, ov::Core& core, const std::string& device,
                               const ov::AnyMap& config);
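Note on the test-backend-ops support: the cgraph->n_nodes == 1 branch plus the naive decoder exist so the upstream llama.cpp test-backend-ops tool, which feeds one-op graphs to each backend, can exercise this backend (invocations like `test-backend-ops test -o ROPE` with the backend selected via `-b`; exact flag spelling should be checked against the current tool). The sketch below is not from the commit; it only shows, with assumed sizes, the kind of single-op graph that ends up in naive_compute().

// Minimal single-op ggml graph (assumed example sizes).
#include "ggml.h"

static struct ggml_cgraph * build_single_add_graph(struct ggml_context * ctx) {
    struct ggml_tensor * a   = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 64);
    struct ggml_tensor * b   = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 64);
    struct ggml_tensor * out = ggml_add(ctx, a, b);   // one GGML_OP_ADD node
    struct ggml_cgraph * gf  = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, out);               // cgraph->n_nodes == 1
    return gf;                                        // routed through naive_compute()
}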