Pull out sin cos from rope

parent 3533c14cf6
commit a80da69448

@@ -209,6 +209,7 @@ void GgmlOvDecoder::set_llm_params() {
         } else if (node->op == GGML_OP_ROPE && std::string(node->name) == "Qcur-0") {
             m_head_size = node->ne[0];
             m_num_heads = node->ne[1];
+            m_rope_params = node->op_params;
         } else if (node->op == GGML_OP_ROPE && std::string(node->name) == "Kcur-0") {
            m_num_heads_kv = node->ne[1];
        }
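
Note: node->op_params is a raw int32_t array owned by the ggml graph, so the decoder only stores the pointer here and defers decoding. For reference, a minimal sketch of how those fields are laid out, mirroring the memcpy offsets used in add_rope_sin_cos further down (the struct and helper names are illustrative, not part of this commit):

    #include <cstdint>
    #include <cstring>

    // Field offsets match the reads below: n_dims at [1], mode at [2],
    // n_ctx_orig at [4], then six floats bit-cast from [5]..[10].
    struct RopeParams {
        int n_dims, mode, n_ctx_orig;
        float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
    };

    inline RopeParams parse_rope_params(const int32_t* p) {
        RopeParams r{};
        r.n_dims = p[1];
        r.mode = p[2];
        r.n_ctx_orig = p[4];
        std::memcpy(&r.freq_base, p + 5, sizeof(float));
        std::memcpy(&r.freq_scale, p + 6, sizeof(float));
        std::memcpy(&r.ext_factor, p + 7, sizeof(float));
        std::memcpy(&r.attn_factor, p + 8, sizeof(float));
        std::memcpy(&r.beta_fast, p + 9, sizeof(float));
        std::memcpy(&r.beta_slow, p + 10, sizeof(float));
        return r;
    }
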

@@ -98,6 +98,8 @@ public:
 
     virtual int get_head_size() const override { return m_head_size; }
 
+    virtual int32_t* get_rope_params() const override { return m_rope_params; }
+
     virtual std::map<std::string, std::string> get_kv_param_res_names() const override;
 
     virtual bool is_static() const override { return m_is_static; }
@@ -140,6 +142,7 @@ private:
     int m_num_heads;
     int m_num_heads_kv;
     int m_head_size;
+    int32_t* m_rope_params;
     std::vector<std::string> m_kv_names;
     bool m_is_static;
     bool m_is_first_token;

@@ -61,6 +61,7 @@ public:
     virtual int get_num_heads() const = 0;
     virtual int get_num_heads_kv() const = 0;
     virtual int get_head_size() const = 0;
+    virtual int32_t* get_rope_params() const = 0;
     virtual std::map<std::string, std::string> get_kv_param_res_names() const = 0;
 
     virtual bool is_static() const = 0;

@@ -3,131 +3,39 @@
 #include <openvino/core/node.hpp>
 #include <openvino/core/node_output.hpp>
 #include <openvino/op/add.hpp>
-#include <openvino/op/broadcast.hpp>
 #include <openvino/op/concat.hpp>
 #include <openvino/op/constant.hpp>
-#include <openvino/op/convert.hpp>
-#include <openvino/op/cos.hpp>
-#include <openvino/op/divide.hpp>
 #include <openvino/op/multiply.hpp>
 #include <openvino/op/reshape.hpp>
 #include <openvino/op/shape_of.hpp>
-#include <openvino/op/sin.hpp>
 #include <openvino/op/slice.hpp>
 #include <openvino/op/split.hpp>
 #include <openvino/op/subtract.hpp>
-#include <openvino/op/transpose.hpp>
 #include <vector>
 
 #include "../node_context.hpp"
 #include "../op_table.hpp"
 #include "../utils.hpp"
 
-#ifndef M_PI
-#    define M_PI 3.14159265358979323846
-#endif
-
-#define GGML_ROPE_TYPE_NEOX 2
-
-#define MIN(a, b) ((a) < (b) ? (a) : (b))
-#define MAX(a, b) ((a) > (b) ? (a) : (b))
-
 namespace ov {
 namespace frontend {
 namespace ggml {
 namespace op {
 
-namespace {
-float ggml_rope_yarn_corr_dim(int n_dims, int n_ctx_orig, float n_rot, float base) {
-    return n_dims * logf(n_ctx_orig / (n_rot * 2 * (float) M_PI)) / (2 * logf(base));
-}
-
-void ggml_rope_yarn_corr_dims(int n_dims, int n_ctx_orig, float freq_base, float beta_fast, float beta_slow,
-                              float dims[2]) {
-    float start = floorf(ggml_rope_yarn_corr_dim(n_dims, n_ctx_orig, beta_fast, freq_base));
-    float end = ceilf(ggml_rope_yarn_corr_dim(n_dims, n_ctx_orig, beta_slow, freq_base));
-    dims[0] = MAX(0, start);
-    dims[1] = MIN(n_dims - 1, end);
-}
-}  // namespace
-
 OutputVector translate_rope(const NodeContext& context) {
     num_inputs_check(context, 2, 3);
 
     ov::Output<Node> res;
 
-    auto data_node = context.get_input(0);
-    auto pos_node = context.get_input(1);
-    pos_node = std::make_shared<ov::op::v0::Convert>(pos_node, ov::element::f32);
+    auto data_node = context.get_input(0).get_node_shared_ptr();
+    auto cos_theta_node = context.get_input("rope_cos");
+    auto sin_theta_node = context.get_input("rope_sin");
 
-    auto permutation_node =
-        std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{3}, std::vector<int64_t>{2, 1, 0});
-    Output<Node> pos_node_reshaped = std::make_shared<ov::op::v1::Transpose>(pos_node, permutation_node);
-
-    auto output_shape = context.get_output_shape(0);
-
-    float freq_base;
-    float freq_scale;
-    float ext_factor;
-    float attn_factor;
-    float beta_fast;
-    float beta_slow;
     int32_t* op_params = context.get_output_op_params(0);
-    const int n_dims = op_params[1];
     const int mode = op_params[2];
-    const int n_ctx_orig = op_params[4];
-    memcpy(&freq_base, op_params + 5, sizeof(float));
-    memcpy(&freq_scale, op_params + 6, sizeof(float));
-    memcpy(&ext_factor, op_params + 7, sizeof(float));
-    memcpy(&attn_factor, op_params + 8, sizeof(float));
-    memcpy(&beta_fast, op_params + 9, sizeof(float));
-    memcpy(&beta_slow, op_params + 10, sizeof(float));
-
-    const float theta_scale = powf(freq_base, -2.0f / n_dims);
-
-    // TODO: corr_dims is not used in the current implementation
-    float corr_dims[2];
-    ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);
-
+    constexpr int GGML_ROPE_TYPE_NEOX = 2;
     const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
 
-    // TODO: GGML_OP_ROPE_BACK -> false
-    bool forward = true;
-    const float sin_sign = forward ? 1.0f : -1.0f;
-
-    const int64_t ne0 = output_shape[2].get_length();
-    std::vector<float> factor(ne0 / 2);
-    factor[0] = freq_scale;
-    for (int64_t i = 1; i < ne0 / 2; i++) {
-        factor[i] = theta_scale * factor[i - 1];
-    }
-
-    Output<Node> factor_node =
-        std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{factor.size()}, factor);
-    if (context.get_input_size() == 3) {
-        auto freq_factors_node = context.get_input(2);
-        factor_node = std::make_shared<ov::op::v1::Divide>(factor_node, freq_factors_node);
-    }
-
-    auto half_last_dim = ov::op::v0::Constant::create(ov::element::i64, Shape{1}, {output_shape[2].get_length() / 2});
-    Output<Node> input_shape_node = std::make_shared<ov::op::v0::Concat>(
-        OutputVector{get_dimensions(data_node.get_node_shared_ptr(), {0, 1}), half_last_dim},
-        0);
-    Output<Node> factor_broadcasted_node = std::make_shared<ov::op::v3::Broadcast>(factor_node, input_shape_node);
-
-    Output<Node> cos_factor_broadcasted_node = std::make_shared<ov::op::v0::Cos>(
-        std::make_shared<ov::op::v1::Multiply>(factor_broadcasted_node, pos_node_reshaped));
-    Output<Node> sin_factor_broadcasted_node = std::make_shared<ov::op::v0::Sin>(
-        std::make_shared<ov::op::v1::Multiply>(factor_broadcasted_node, pos_node_reshaped));
-
-    float mscale = attn_factor;
-    Output<Node> mscale_node =
-        std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{}, std::vector<float>{mscale});
-    Output<Node> mscale_sin_sign_node =
-        std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{}, std::vector<float>{mscale * sin_sign});
-    Output<Node> cos_theta_node = std::make_shared<ov::op::v1::Multiply>(cos_factor_broadcasted_node, mscale_node);
-    Output<Node> sin_theta_node = std::make_shared<ov::op::v1::Multiply>(sin_factor_broadcasted_node, mscale_node);
-
     if (!is_neox) {
         auto input_shape = context.get_input_shape(0);
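
Note: after this change translate_rope no longer builds the angle tables itself; it consumes the graph-level "rope_cos" / "rope_sin" tensors produced once in translate_session.cpp. A scalar sketch of what those tables hold (an assumption matching the graph built in add_rope_sin_cos below, ignoring the optional rope_freqs.weight divisor and the unused YaRN ext_factor path):

    #include <cmath>

    // For position pos and dimension pair i (i < n_dims / 2):
    //   theta(pos, i)    = pos * freq_scale * powf(freq_base, -2.0f * i / n_dims)
    //   rope_cos[pos][i] = cosf(theta) * attn_factor
    //   rope_sin[pos][i] = sinf(theta) * attn_factor
    void rope_tables(int pos, int n_dims, float freq_base, float freq_scale,
                     float attn_factor, float* cos_out, float* sin_out) {
        const float theta_scale = powf(freq_base, -2.0f / n_dims);
        float f = freq_scale;  // factor[0]
        for (int i = 0; i < n_dims / 2; ++i) {
            const float theta = f * pos;
            cos_out[i] = cosf(theta) * attn_factor;  // mscale == attn_factor
            sin_out[i] = sinf(theta) * attn_factor;
            f *= theta_scale;  // factor[i + 1] = theta_scale * factor[i]
        }
    }
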

@@ -146,18 +54,12 @@ OutputVector translate_rope(const NodeContext& context) {
             std::make_shared<ov::op::v1::Multiply>(odd_slice, cos_theta_node));
 
         auto stack = std::make_shared<ov::op::v0::Concat>(OutputVector{first_half, second_half}, 2);
-        auto shape_const = ov::op::v0::Constant::create(
-            ov::element::i64,
-            Shape{3},
-            std::vector<int64_t>{-1, input_shape[1].get_length(), input_shape[2].get_length()});
-        res = std::make_shared<ov::op::v1::Reshape>(stack, shape_const, false);
+        res = std::make_shared<ov::op::v1::Reshape>(stack, std::make_shared<ov::op::v0::ShapeOf>(data_node), false);
     } else {
-        auto slice_node =
-            std::make_shared<ov::op::v1::Split>(data_node,
-                                                ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {2}),
-                                                2);
-        Output<Node> slice_data_node_0 = slice_node->outputs()[0];
-        Output<Node> slice_data_node_1 = slice_node->outputs()[1];
+        auto data_split = std::make_shared<ov::op::v1::Split>(
+            data_node, ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {2}), 2);
+        Output<Node> slice_data_node_0 = data_split->outputs()[0];
+        Output<Node> slice_data_node_1 = data_split->outputs()[1];
 
         auto first_half_node = std::make_shared<ov::op::v1::Subtract>(
             std::make_shared<ov::op::v1::Multiply>(slice_data_node_0, cos_theta_node),
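
Note: the two branches implement the two ggml rope layouts. A scalar sketch of the rotation each branch performs (standard RoPE, consistent with the Subtract/Multiply/Concat graph above; function names are illustrative):

    // Normal mode: rotated pairs are interleaved, (x[2i], x[2i+1]) --
    // hence the even/odd slices in the if branch.
    void rope_norm(float* x, const float* cs, const float* sn, int n_dims) {
        for (int i = 0; i < n_dims / 2; ++i) {
            const float x0 = x[2 * i], x1 = x[2 * i + 1];
            x[2 * i]     = x0 * cs[i] - x1 * sn[i];
            x[2 * i + 1] = x0 * sn[i] + x1 * cs[i];
        }
    }

    // NeoX mode: pairs span the two halves, (x[i], x[i + n_dims/2]) --
    // hence the Split into two halves in the else branch.
    void rope_neox(float* x, const float* cs, const float* sn, int n_dims) {
        const int half = n_dims / 2;
        for (int i = 0; i < half; ++i) {
            const float x0 = x[i], x1 = x[i + half];
            x[i]        = x0 * cs[i] - x1 * sn[i];
            x[i + half] = x0 * sn[i] + x1 * cs[i];
        }
    }
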

@@ -1,16 +1,23 @@
 #include "translate_session.hpp"
 
+#include <cstdint>
 #include <cstdlib>
 #include <map>
 #include <memory>
 #include <openvino/core/node.hpp>
 #include <openvino/op/broadcast.hpp>
 #include <openvino/op/concat.hpp>
+#include <openvino/op/convert.hpp>
+#include <openvino/op/cos.hpp>
+#include <openvino/op/divide.hpp>
+#include <openvino/op/multiply.hpp>
 #include <openvino/op/parameter.hpp>
 #include <openvino/op/range.hpp>
 #include <openvino/op/reshape.hpp>
 #include <openvino/op/result.hpp>
+#include <openvino/op/sin.hpp>
 #include <openvino/op/squeeze.hpp>
+#include <openvino/op/transpose.hpp>
 #include <openvino/op/unsqueeze.hpp>
 #include <openvino/pass/constant_folding.hpp>
 #include <openvino/pass/make_stateful.hpp>
@@ -119,10 +126,95 @@ void add_kv_update_indices(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) {
     tensor_map.insert({"update_indices_v", update_indices_v->output(0)});
 }
 
+float ggml_rope_yarn_corr_dim(int n_dims, int n_ctx_orig, float n_rot, float base) {
+#ifndef M_PI
+#    define M_PI 3.14159265358979323846
+#endif
+    return n_dims * logf(n_ctx_orig / (n_rot * 2 * (float) M_PI)) / (2 * logf(base));
+}
+
+void ggml_rope_yarn_corr_dims(int n_dims, int n_ctx_orig, float freq_base, float beta_fast, float beta_slow,
+                              float dims[2]) {
+    float start = floorf(ggml_rope_yarn_corr_dim(n_dims, n_ctx_orig, beta_fast, freq_base));
+    float end = ceilf(ggml_rope_yarn_corr_dim(n_dims, n_ctx_orig, beta_slow, freq_base));
+    dims[0] = std::max(0.0f, start);
+    dims[1] = std::min(static_cast<float>(n_dims - 1), end);
+}
+
+void add_rope_sin_cos(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) {
+    int32_t* rope_params = ggml_model_decoder.get_rope_params();
+    auto inp_pos = tensor_map.at("inp_pos").get_node_shared_ptr();
+    std::shared_ptr<ov::Node> rope_freqs_weight;
+
+    inp_pos = std::make_shared<ov::op::v0::Convert>(inp_pos, ov::element::f32);
+    auto pos_perm =
+        std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{3}, std::vector<int64_t>{2, 1, 0});
+    inp_pos = std::make_shared<ov::op::v1::Transpose>(inp_pos, pos_perm);
+    if (tensor_map.find("rope_freqs.weight") != tensor_map.end()) {
+        rope_freqs_weight = tensor_map.at("rope_freqs.weight").get_node_shared_ptr();
+    }
+
+    float freq_base;
+    float freq_scale;
+    float ext_factor;
+    float attn_factor;
+    float beta_fast;
+    float beta_slow;
+    const int n_dims = rope_params[1];
+    const int n_ctx_orig = rope_params[4];
+    memcpy(&freq_base, rope_params + 5, sizeof(float));
+    memcpy(&freq_scale, rope_params + 6, sizeof(float));
+    memcpy(&ext_factor, rope_params + 7, sizeof(float));
+    memcpy(&attn_factor, rope_params + 8, sizeof(float));
+    memcpy(&beta_fast, rope_params + 9, sizeof(float));
+    memcpy(&beta_slow, rope_params + 10, sizeof(float));
+
+    const float theta_scale = powf(freq_base, -2.0f / n_dims);
+
+    // TODO: corr_dims is not used in the current implementation
+    float corr_dims[2];
+    ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);
+
+    // TODO: GGML_OP_ROPE_BACK -> false
+    // bool forward = true;
+    // const float sin_sign = forward ? 1.0f : -1.0f;
+
+    const int64_t half_head_size = ggml_model_decoder.get_head_size() / 2;
+    std::vector<float> factor(half_head_size);
+    factor[0] = freq_scale;
+    for (int64_t i = 1; i < half_head_size; i++) {
+        factor[i] = theta_scale * factor[i - 1];
+    }
+
+    Output<Node> factor_node =
+        std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{1, 1, factor.size()}, factor);
+    if (rope_freqs_weight) {
+        factor_node = std::make_shared<ov::op::v1::Divide>(factor_node, rope_freqs_weight);
+    }
+
+    auto half_head_size_node = ov::op::v0::Constant::create(ov::element::i64, Shape{1}, {half_head_size});
+    Output<Node> cos_factor =
+        std::make_shared<ov::op::v0::Cos>(std::make_shared<ov::op::v1::Multiply>(factor_node, inp_pos));
+    Output<Node> sin_factor =
+        std::make_shared<ov::op::v0::Sin>(std::make_shared<ov::op::v1::Multiply>(factor_node, inp_pos));
+
+    float mscale = attn_factor;
+    Output<Node> mscale_node =
+        std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{}, std::vector<float>{mscale});
+
+    auto cos_theta = std::make_shared<ov::op::v1::Multiply>(cos_factor, mscale_node);
+    auto sin_theta = std::make_shared<ov::op::v1::Multiply>(sin_factor, mscale_node);
+    cos_theta->set_friendly_name("rope_cos");
+    sin_theta->set_friendly_name("rope_sin");
+    tensor_map.insert({"rope_cos", cos_theta->output(0)});
+    tensor_map.insert({"rope_sin", sin_theta->output(0)});
+}
+
 // Create common patterns
 void preprocess(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) {
     add_token_len(tensor_map);
     add_kv_update_indices(tensor_map, ggml_model_decoder);
+    add_rope_sin_cos(tensor_map, ggml_model_decoder);
 }
 
 } // namespace
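
Note: building the tables in preprocess means the Cos/Sin subgraph is constructed once per model and shared by every rope consumer (the Qcur and Kcur nodes), instead of being duplicated per ROPE node. The YaRN correction dims are still computed but unused; for intuition, a worked example with illustrative llama-style defaults (not values taken from this commit):

    #include <cmath>
    #include <cstdio>

    int main() {
        const int n_dims = 128, n_ctx_orig = 4096;        // hypothetical values
        const float freq_base = 10000.0f;
        const float beta_fast = 32.0f, beta_slow = 1.0f;  // common ggml defaults
        auto corr_dim = [&](float n_rot) {
            return n_dims * logf(n_ctx_orig / (n_rot * 2.0f * 3.14159265f)) /
                   (2.0f * logf(freq_base));
        };
        // corr_dim(beta_fast) ~ 20.94 -> floorf -> 20
        // corr_dim(beta_slow) ~ 45.03 -> ceilf  -> 46 (clamped to n_dims - 1)
        std::printf("corr_dims = [%g, %g]\n",
                    floorf(corr_dim(beta_fast)), ceilf(corr_dim(beta_slow)));
        return 0;
    }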