Pull sin/cos computation out of rope

Authored by Yu, Zijun on 2025-07-09 15:14:10 +08:00; committed by Mustafa Cavus
parent 3533c14cf6
commit a80da69448
5 changed files with 106 additions and 107 deletions
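
Precompute the RoPE sin/cos tables once per graph instead of rebuilding them inside every GGML_OP_ROPE node: the decoder now caches the rope op_params of the first Q rope node and exposes them through get_rope_params(), and a new add_rope_sin_cos() preprocessing step in translate_session publishes the tables as rope_cos/rope_sin tensor-map entries, which translate_rope() consumes as ordinary inputs.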

@@ -209,6 +209,7 @@ void GgmlOvDecoder::set_llm_params() {
         } else if (node->op == GGML_OP_ROPE && std::string(node->name) == "Qcur-0") {
             m_head_size = node->ne[0];
             m_num_heads = node->ne[1];
+            m_rope_params = node->op_params;
         } else if (node->op == GGML_OP_ROPE && std::string(node->name) == "Kcur-0") {
             m_num_heads_kv = node->ne[1];
         }
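
For context, ggml packs the rope hyperparameters into the node's int32 op_params array, with float values bit-cast into int32 slots; that is why the reads later in this commit go through memcpy. A minimal sketch of the decoding, using a hypothetical RopeParams holder and the same indices as add_rope_sin_cos() below:

```cpp
#include <cstdint>
#include <cstring>

// Hypothetical holder for the fields read out of ggml's rope op_params.
struct RopeParams {
    int n_dims;      // op_params[1]
    int n_ctx_orig;  // op_params[4]
    float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
};

RopeParams decode_rope_params(const int32_t* p) {
    RopeParams r{};
    r.n_dims = p[1];
    r.n_ctx_orig = p[4];
    // Floats are stored bit-for-bit in int32 slots, so memcpy reinterprets
    // them without violating strict aliasing.
    std::memcpy(&r.freq_base, p + 5, sizeof(float));
    std::memcpy(&r.freq_scale, p + 6, sizeof(float));
    std::memcpy(&r.ext_factor, p + 7, sizeof(float));
    std::memcpy(&r.attn_factor, p + 8, sizeof(float));
    std::memcpy(&r.beta_fast, p + 9, sizeof(float));
    std::memcpy(&r.beta_slow, p + 10, sizeof(float));
    return r;
}
```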

@@ -98,6 +98,8 @@
     virtual int get_head_size() const override { return m_head_size; }
+    virtual int32_t* get_rope_params() const override { return m_rope_params; }
     virtual std::map<std::string, std::string> get_kv_param_res_names() const override;
     virtual bool is_static() const override { return m_is_static; }
@@ -140,6 +142,7 @@
     int m_num_heads;
     int m_num_heads_kv;
     int m_head_size;
+    int32_t* m_rope_params;
     std::vector<std::string> m_kv_names;
     bool m_is_static;
     bool m_is_first_token;

@@ -61,6 +61,7 @@
     virtual int get_num_heads() const = 0;
     virtual int get_num_heads_kv() const = 0;
     virtual int get_head_size() const = 0;
+    virtual int32_t* get_rope_params() const = 0;
     virtual std::map<std::string, std::string> get_kv_param_res_names() const = 0;
     virtual bool is_static() const = 0;

@@ -3,131 +3,39 @@
 #include <openvino/core/node.hpp>
 #include <openvino/core/node_output.hpp>
 #include <openvino/op/add.hpp>
-#include <openvino/op/broadcast.hpp>
 #include <openvino/op/concat.hpp>
 #include <openvino/op/constant.hpp>
-#include <openvino/op/convert.hpp>
-#include <openvino/op/cos.hpp>
-#include <openvino/op/divide.hpp>
 #include <openvino/op/multiply.hpp>
 #include <openvino/op/reshape.hpp>
+#include <openvino/op/shape_of.hpp>
-#include <openvino/op/sin.hpp>
 #include <openvino/op/slice.hpp>
 #include <openvino/op/split.hpp>
 #include <openvino/op/subtract.hpp>
-#include <openvino/op/transpose.hpp>
 #include <vector>
 
 #include "../node_context.hpp"
 #include "../op_table.hpp"
 #include "../utils.hpp"
 
-#ifndef M_PI
-#    define M_PI 3.14159265358979323846
-#endif
-
-#define GGML_ROPE_TYPE_NEOX 2
-#define MIN(a, b) ((a) < (b) ? (a) : (b))
-#define MAX(a, b) ((a) > (b) ? (a) : (b))
-
 namespace ov {
 namespace frontend {
 namespace ggml {
 namespace op {
 
-namespace {
-float ggml_rope_yarn_corr_dim(int n_dims, int n_ctx_orig, float n_rot, float base) {
-    return n_dims * logf(n_ctx_orig / (n_rot * 2 * (float) M_PI)) / (2 * logf(base));
-}
-
-void ggml_rope_yarn_corr_dims(int n_dims, int n_ctx_orig, float freq_base, float beta_fast, float beta_slow,
-                              float dims[2]) {
-    float start = floorf(ggml_rope_yarn_corr_dim(n_dims, n_ctx_orig, beta_fast, freq_base));
-    float end = ceilf(ggml_rope_yarn_corr_dim(n_dims, n_ctx_orig, beta_slow, freq_base));
-    dims[0] = MAX(0, start);
-    dims[1] = MIN(n_dims - 1, end);
-}
-}  // namespace
-
 OutputVector translate_rope(const NodeContext& context) {
     num_inputs_check(context, 2, 3);
 
     ov::Output<Node> res;
 
-    auto data_node = context.get_input(0);
-    auto pos_node = context.get_input(1);
-    pos_node = std::make_shared<ov::op::v0::Convert>(pos_node, ov::element::f32);
+    auto data_node = context.get_input(0).get_node_shared_ptr();
+    auto cos_theta_node = context.get_input("rope_cos");
+    auto sin_theta_node = context.get_input("rope_sin");
-    auto permutation_node =
-        std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{3}, std::vector<int64_t>{2, 1, 0});
-    Output<Node> pos_node_reshaped = std::make_shared<ov::op::v1::Transpose>(pos_node, permutation_node);
-
-    auto output_shape = context.get_output_shape(0);
-
-    float freq_base;
-    float freq_scale;
-    float ext_factor;
-    float attn_factor;
-    float beta_fast;
-    float beta_slow;
 
     int32_t* op_params = context.get_output_op_params(0);
-    const int n_dims = op_params[1];
     const int mode = op_params[2];
-    const int n_ctx_orig = op_params[4];
-    memcpy(&freq_base, op_params + 5, sizeof(float));
-    memcpy(&freq_scale, op_params + 6, sizeof(float));
-    memcpy(&ext_factor, op_params + 7, sizeof(float));
-    memcpy(&attn_factor, op_params + 8, sizeof(float));
-    memcpy(&beta_fast, op_params + 9, sizeof(float));
-    memcpy(&beta_slow, op_params + 10, sizeof(float));
-
-    const float theta_scale = powf(freq_base, -2.0f / n_dims);
-
-    // TODO: corr_dims is not used in the current implementation
-    float corr_dims[2];
-    ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);
-
+    constexpr int GGML_ROPE_TYPE_NEOX = 2;
     const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
-
-    // TODO: GGML_OP_ROPE_BACK -> false
-    bool forward = true;
-    const float sin_sign = forward ? 1.0f : -1.0f;
-
-    const int64_t ne0 = output_shape[2].get_length();
-    std::vector<float> factor(ne0 / 2);
-    factor[0] = freq_scale;
-    for (int64_t i = 1; i < ne0 / 2; i++) {
-        factor[i] = theta_scale * factor[i - 1];
-    }
-
-    Output<Node> factor_node =
-        std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{factor.size()}, factor);
-    if (context.get_input_size() == 3) {
-        auto freq_factors_node = context.get_input(2);
-        factor_node = std::make_shared<ov::op::v1::Divide>(factor_node, freq_factors_node);
-    }
-
-    auto half_last_dim = ov::op::v0::Constant::create(ov::element::i64, Shape{1}, {output_shape[2].get_length() / 2});
-    Output<Node> input_shape_node = std::make_shared<ov::op::v0::Concat>(
-        OutputVector{get_dimensions(data_node.get_node_shared_ptr(), {0, 1}), half_last_dim},
-        0);
-    Output<Node> factor_broadcasted_node = std::make_shared<ov::op::v3::Broadcast>(factor_node, input_shape_node);
-
-    Output<Node> cos_factor_broadcasted_node = std::make_shared<ov::op::v0::Cos>(
-        std::make_shared<ov::op::v1::Multiply>(factor_broadcasted_node, pos_node_reshaped));
-    Output<Node> sin_factor_broadcasted_node = std::make_shared<ov::op::v0::Sin>(
-        std::make_shared<ov::op::v1::Multiply>(factor_broadcasted_node, pos_node_reshaped));
-
-    float mscale = attn_factor;
-    Output<Node> mscale_node =
-        std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{}, std::vector<float>{mscale});
-    Output<Node> mscale_sin_sign_node =
-        std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{}, std::vector<float>{mscale * sin_sign});
-
-    Output<Node> cos_theta_node = std::make_shared<ov::op::v1::Multiply>(cos_factor_broadcasted_node, mscale_node);
-    Output<Node> sin_theta_node = std::make_shared<ov::op::v1::Multiply>(sin_factor_broadcasted_node, mscale_node);
 
     if (!is_neox) {
         auto input_shape = context.get_input_shape(0);
@@ -146,18 +54,12 @@ OutputVector translate_rope(const NodeContext& context) {
             std::make_shared<ov::op::v1::Multiply>(odd_slice, cos_theta_node));
         auto stack = std::make_shared<ov::op::v0::Concat>(OutputVector{first_half, second_half}, 2);
-        auto shape_const = ov::op::v0::Constant::create(
-            ov::element::i64,
-            Shape{3},
-            std::vector<int64_t>{-1, input_shape[1].get_length(), input_shape[2].get_length()});
-        res = std::make_shared<ov::op::v1::Reshape>(stack, shape_const, false);
+        res = std::make_shared<ov::op::v1::Reshape>(stack, std::make_shared<ov::op::v0::ShapeOf>(data_node), false);
     } else {
-        auto slice_node =
-            std::make_shared<ov::op::v1::Split>(data_node,
-                                                ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {2}),
-                                                2);
-        Output<Node> slice_data_node_0 = slice_node->outputs()[0];
-        Output<Node> slice_data_node_1 = slice_node->outputs()[1];
+        auto data_split = std::make_shared<ov::op::v1::Split>(
+            data_node, ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {2}), 2);
+        Output<Node> slice_data_node_0 = data_split->outputs()[0];
+        Output<Node> slice_data_node_1 = data_split->outputs()[1];
         auto first_half_node = std::make_shared<ov::op::v1::Subtract>(
             std::make_shared<ov::op::v1::Multiply>(slice_data_node_0, cos_theta_node),
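
The two branches above implement the two rope layouts: the non-neox style rotates interleaved pairs (2i, 2i+1), while the neox style splits the head in half and pairs element i with element i + n/2. A scalar sketch of the rotation both graphs express, assuming cos_t/sin_t are the per-pair tables produced upstream:

```cpp
#include <vector>

// Scalar sketch of one head vector's rotation; not the OpenVINO graph,
// just the arithmetic it encodes.
void rope_rotate_ref(std::vector<float>& x, const std::vector<float>& cos_t,
                     const std::vector<float>& sin_t, bool is_neox) {
    const size_t half = x.size() / 2;
    for (size_t i = 0; i < half; ++i) {
        const size_t a = is_neox ? i : 2 * i;             // first element of the pair
        const size_t b = is_neox ? i + half : 2 * i + 1;  // its rotation partner
        const float x0 = x[a], x1 = x[b];
        x[a] = x0 * cos_t[i] - x1 * sin_t[i];
        x[b] = x0 * sin_t[i] + x1 * cos_t[i];
    }
}
```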

@@ -1,16 +1,23 @@
 #include "translate_session.hpp"
 
 #include <cstdint>
 #include <cstdlib>
 #include <map>
 #include <memory>
 #include <openvino/core/node.hpp>
+#include <openvino/op/broadcast.hpp>
 #include <openvino/op/concat.hpp>
+#include <openvino/op/convert.hpp>
+#include <openvino/op/cos.hpp>
+#include <openvino/op/divide.hpp>
+#include <openvino/op/multiply.hpp>
 #include <openvino/op/parameter.hpp>
 #include <openvino/op/range.hpp>
 #include <openvino/op/reshape.hpp>
 #include <openvino/op/result.hpp>
+#include <openvino/op/sin.hpp>
 #include <openvino/op/squeeze.hpp>
+#include <openvino/op/transpose.hpp>
 #include <openvino/op/unsqueeze.hpp>
 #include <openvino/pass/constant_folding.hpp>
 #include <openvino/pass/make_stateful.hpp>
@@ -119,10 +126,95 @@ void add_kv_update_indices(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) {
     tensor_map.insert({"update_indices_v", update_indices_v->output(0)});
 }
 
+float ggml_rope_yarn_corr_dim(int n_dims, int n_ctx_orig, float n_rot, float base) {
+#ifndef M_PI
+#    define M_PI 3.14159265358979323846
+#endif
+    return n_dims * logf(n_ctx_orig / (n_rot * 2 * (float) M_PI)) / (2 * logf(base));
+}
+
+void ggml_rope_yarn_corr_dims(int n_dims, int n_ctx_orig, float freq_base, float beta_fast, float beta_slow,
+                              float dims[2]) {
+    float start = floorf(ggml_rope_yarn_corr_dim(n_dims, n_ctx_orig, beta_fast, freq_base));
+    float end = ceilf(ggml_rope_yarn_corr_dim(n_dims, n_ctx_orig, beta_slow, freq_base));
+    dims[0] = std::max(0.0f, start);
+    dims[1] = std::min(static_cast<float>(n_dims - 1), end);
+}
+
+void add_rope_sin_cos(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) {
+    int32_t* rope_params = ggml_model_decoder.get_rope_params();
+
+    auto inp_pos = tensor_map.at("inp_pos").get_node_shared_ptr();
+    std::shared_ptr<ov::Node> rope_freqs_weight;
+    inp_pos = std::make_shared<ov::op::v0::Convert>(inp_pos, ov::element::f32);
+    auto pos_perm =
+        std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{3}, std::vector<int64_t>{2, 1, 0});
+    inp_pos = std::make_shared<ov::op::v1::Transpose>(inp_pos, pos_perm);
+    if (tensor_map.find("rope_freqs.weight") != tensor_map.end()) {
+        rope_freqs_weight = tensor_map.at("rope_freqs.weight").get_node_shared_ptr();
+    }
+
+    float freq_base;
+    float freq_scale;
+    float ext_factor;
+    float attn_factor;
+    float beta_fast;
+    float beta_slow;
+    const int n_dims = rope_params[1];
+    const int n_ctx_orig = rope_params[4];
+    memcpy(&freq_base, rope_params + 5, sizeof(float));
+    memcpy(&freq_scale, rope_params + 6, sizeof(float));
+    memcpy(&ext_factor, rope_params + 7, sizeof(float));
+    memcpy(&attn_factor, rope_params + 8, sizeof(float));
+    memcpy(&beta_fast, rope_params + 9, sizeof(float));
+    memcpy(&beta_slow, rope_params + 10, sizeof(float));
+
+    const float theta_scale = powf(freq_base, -2.0f / n_dims);
+
+    // TODO: corr_dims is not used in the current implementation
+    float corr_dims[2];
+    ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);
+
+    // TODO: GGML_OP_ROPE_BACK -> false
+    // bool forward = true;
+    // const float sin_sign = forward ? 1.0f : -1.0f;
+
+    const int64_t half_head_size = ggml_model_decoder.get_head_size() / 2;
+    std::vector<float> factor(half_head_size);
+    factor[0] = freq_scale;
+    for (int64_t i = 1; i < half_head_size; i++) {
+        factor[i] = theta_scale * factor[i - 1];
+    }
+
+    Output<Node> factor_node =
+        std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{1, 1, factor.size()}, factor);
+    if (rope_freqs_weight) {
+        factor_node = std::make_shared<ov::op::v1::Divide>(factor_node, rope_freqs_weight);
+    }
+
+    auto half_head_size_node = ov::op::v0::Constant::create(ov::element::i64, Shape{1}, {half_head_size});
+
+    Output<Node> cos_factor =
+        std::make_shared<ov::op::v0::Cos>(std::make_shared<ov::op::v1::Multiply>(factor_node, inp_pos));
+    Output<Node> sin_factor =
+        std::make_shared<ov::op::v0::Sin>(std::make_shared<ov::op::v1::Multiply>(factor_node, inp_pos));
+
+    float mscale = attn_factor;
+    Output<Node> mscale_node =
+        std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{}, std::vector<float>{mscale});
+    auto cos_theta = std::make_shared<ov::op::v1::Multiply>(cos_factor, mscale_node);
+    auto sin_theta = std::make_shared<ov::op::v1::Multiply>(sin_factor, mscale_node);
+    cos_theta->set_friendly_name("rope_cos");
+    sin_theta->set_friendly_name("rope_sin");
+
+    tensor_map.insert({"rope_cos", cos_theta->output(0)});
+    tensor_map.insert({"rope_sin", sin_theta->output(0)});
+}
+
 // Create common patterns
 void preprocess(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) {
     add_token_len(tensor_map);
     add_kv_update_indices(tensor_map, ggml_model_decoder);
+    add_rope_sin_cos(tensor_map, ggml_model_decoder);
 }
 
 }  // namespace
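
For reference, the graph that add_rope_sin_cos() builds computes factor[i] = freq_scale * theta_scale^i with theta_scale = freq_base^(-2 / n_dims), then takes cos/sin of position * factor[i] and scales by attn_factor. A plain-C++ sketch of the same table (function name hypothetical; the YaRN corr_dims and ext_factor are left unused, matching the TODO above):

```cpp
#include <cmath>
#include <cstdint>
#include <vector>

// Builds per-(position, dim-pair) cos/sin tables, row-major over positions.
void rope_sin_cos_ref(int n_dims, float freq_base, float freq_scale,
                      float attn_factor, const std::vector<int32_t>& pos,
                      std::vector<float>& cos_out, std::vector<float>& sin_out) {
    const int half = n_dims / 2;
    const float theta_scale = std::pow(freq_base, -2.0f / n_dims);
    std::vector<float> factor(half);
    factor[0] = freq_scale;
    for (int i = 1; i < half; ++i) {
        factor[i] = theta_scale * factor[i - 1];  // geometric frequency ladder
    }
    cos_out.assign(pos.size() * half, 0.0f);
    sin_out.assign(pos.size() * half, 0.0f);
    for (size_t p = 0; p < pos.size(); ++p) {
        for (int i = 0; i < half; ++i) {
            const float theta = pos[p] * factor[i];
            cos_out[p * half + i] = std::cos(theta) * attn_factor;
            sin_out[p * half + i] = std::sin(theta) * attn_factor;
        }
    }
}
```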