diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 19152a5e6d..ae4beca23e 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -209,6 +209,7 @@ void GgmlOvDecoder::set_llm_params() { } else if (node->op == GGML_OP_ROPE && std::string(node->name) == "Qcur-0") { m_head_size = node->ne[0]; m_num_heads = node->ne[1]; + m_rope_params = node->op_params; } else if (node->op == GGML_OP_ROPE && std::string(node->name) == "Kcur-0") { m_num_heads_kv = node->ne[1]; } diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index 171300b406..8b507438c5 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -98,6 +98,8 @@ public: virtual int get_head_size() const override { return m_head_size; } + virtual int32_t* get_rope_params() const override { return m_rope_params; } + virtual std::map get_kv_param_res_names() const override; virtual bool is_static() const override { return m_is_static; } @@ -140,6 +142,7 @@ private: int m_num_heads; int m_num_heads_kv; int m_head_size; + int32_t* m_rope_params; std::vector m_kv_names; bool m_is_static; bool m_is_first_token; diff --git a/ggml/src/ggml-openvino/openvino/decoder.hpp b/ggml/src/ggml-openvino/openvino/decoder.hpp index 8d2e06c0e5..a3387ba394 100644 --- a/ggml/src/ggml-openvino/openvino/decoder.hpp +++ b/ggml/src/ggml-openvino/openvino/decoder.hpp @@ -61,6 +61,7 @@ public: virtual int get_num_heads() const = 0; virtual int get_num_heads_kv() const = 0; virtual int get_head_size() const = 0; + virtual int32_t* get_rope_params() const = 0; virtual std::map get_kv_param_res_names() const = 0; virtual bool is_static() const = 0; diff --git a/ggml/src/ggml-openvino/openvino/op/rope.cpp b/ggml/src/ggml-openvino/openvino/op/rope.cpp index 78523e5781..f5736fefc8 100644 --- a/ggml/src/ggml-openvino/openvino/op/rope.cpp +++ 
b/ggml/src/ggml-openvino/openvino/op/rope.cpp @@ -3,131 +3,39 @@ #include #include #include -#include #include #include -#include -#include -#include #include #include #include -#include #include #include #include -#include #include #include "../node_context.hpp" #include "../op_table.hpp" #include "../utils.hpp" -#ifndef M_PI -# define M_PI 3.14159265358979323846 -#endif - -#define GGML_ROPE_TYPE_NEOX 2 - -#define MIN(a, b) ((a) < (b) ? (a) : (b)) -#define MAX(a, b) ((a) > (b) ? (a) : (b)) - namespace ov { namespace frontend { namespace ggml { namespace op { -namespace { -float ggml_rope_yarn_corr_dim(int n_dims, int n_ctx_orig, float n_rot, float base) { - return n_dims * logf(n_ctx_orig / (n_rot * 2 * (float) M_PI)) / (2 * logf(base)); -} - -void ggml_rope_yarn_corr_dims(int n_dims, int n_ctx_orig, float freq_base, float beta_fast, float beta_slow, - float dims[2]) { - float start = floorf(ggml_rope_yarn_corr_dim(n_dims, n_ctx_orig, beta_fast, freq_base)); - float end = ceilf(ggml_rope_yarn_corr_dim(n_dims, n_ctx_orig, beta_slow, freq_base)); - dims[0] = MAX(0, start); - dims[1] = MIN(n_dims - 1, end); -} -} // namespace - OutputVector translate_rope(const NodeContext& context) { num_inputs_check(context, 2, 3); ov::Output res; - auto data_node = context.get_input(0); - auto pos_node = context.get_input(1); - pos_node = std::make_shared(pos_node, ov::element::f32); + auto data_node = context.get_input(0).get_node_shared_ptr(); + auto cos_theta_node = context.get_input("rope_cos"); + auto sin_theta_node = context.get_input("rope_sin"); - auto permutation_node = - std::make_shared(ov::element::i64, ov::Shape{3}, std::vector{2, 1, 0}); - Output pos_node_reshaped = std::make_shared(pos_node, permutation_node); - - auto output_shape = context.get_output_shape(0); - - float freq_base; - float freq_scale; - float ext_factor; - float attn_factor; - float beta_fast; - float beta_slow; int32_t* op_params = context.get_output_op_params(0); - const int n_dims = 
op_params[1]; const int mode = op_params[2]; - const int n_ctx_orig = op_params[4]; - memcpy(&freq_base, op_params + 5, sizeof(float)); - memcpy(&freq_scale, op_params + 6, sizeof(float)); - memcpy(&ext_factor, op_params + 7, sizeof(float)); - memcpy(&attn_factor, op_params + 8, sizeof(float)); - memcpy(&beta_fast, op_params + 9, sizeof(float)); - memcpy(&beta_slow, op_params + 10, sizeof(float)); - - const float theta_scale = powf(freq_base, -2.0f / n_dims); - - // TODO: corr_dims is not used in the current implementation - float corr_dims[2]; - ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims); - + constexpr int GGML_ROPE_TYPE_NEOX = 2; const bool is_neox = mode & GGML_ROPE_TYPE_NEOX; - // TODO: GGML_OP_ROPE_BACK -> false - bool forward = true; - const float sin_sign = forward ? 1.0f : -1.0f; - - const int64_t ne0 = output_shape[2].get_length(); - std::vector factor(ne0 / 2); - factor[0] = freq_scale; - for (int64_t i = 1; i < ne0 / 2; i++) { - factor[i] = theta_scale * factor[i - 1]; - } - - Output factor_node = - std::make_shared(ov::element::f32, ov::Shape{factor.size()}, factor); - if (context.get_input_size() == 3) { - auto freq_factors_node = context.get_input(2); - factor_node = std::make_shared(factor_node, freq_factors_node); - } - - auto half_last_dim = ov::op::v0::Constant::create(ov::element::i64, Shape{1}, {output_shape[2].get_length() / 2}); - Output input_shape_node = std::make_shared( - OutputVector{get_dimensions(data_node.get_node_shared_ptr(), {0, 1}), half_last_dim}, - 0); - Output factor_broadcasted_node = std::make_shared(factor_node, input_shape_node); - - Output cos_factor_broadcasted_node = std::make_shared( - std::make_shared(factor_broadcasted_node, pos_node_reshaped)); - Output sin_factor_broadcasted_node = std::make_shared( - std::make_shared(factor_broadcasted_node, pos_node_reshaped)); - - float mscale = attn_factor; - Output mscale_node = - std::make_shared(ov::element::f32, ov::Shape{}, 
std::vector{mscale}); - Output mscale_sin_sign_node = - std::make_shared(ov::element::f32, ov::Shape{}, std::vector{mscale * sin_sign}); - Output cos_theta_node = std::make_shared(cos_factor_broadcasted_node, mscale_node); - Output sin_theta_node = std::make_shared(sin_factor_broadcasted_node, mscale_node); - if (!is_neox) { auto input_shape = context.get_input_shape(0); @@ -146,18 +54,12 @@ OutputVector translate_rope(const NodeContext& context) { std::make_shared(odd_slice, cos_theta_node)); auto stack = std::make_shared(OutputVector{first_half, second_half}, 2); - auto shape_const = ov::op::v0::Constant::create( - ov::element::i64, - Shape{3}, - std::vector{-1, input_shape[1].get_length(), input_shape[2].get_length()}); - res = std::make_shared(stack, shape_const, false); + res = std::make_shared(stack, std::make_shared(data_node), false); } else { - auto slice_node = - std::make_shared(data_node, - ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {2}), - 2); - Output slice_data_node_0 = slice_node->outputs()[0]; - Output slice_data_node_1 = slice_node->outputs()[1]; + auto data_split = std::make_shared( + data_node, ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {2}), 2); + Output slice_data_node_0 = data_split->outputs()[0]; + Output slice_data_node_1 = data_split->outputs()[1]; auto first_half_node = std::make_shared( std::make_shared(slice_data_node_0, cos_theta_node), diff --git a/ggml/src/ggml-openvino/openvino/translate_session.cpp b/ggml/src/ggml-openvino/openvino/translate_session.cpp index 9580586684..d122497e63 100644 --- a/ggml/src/ggml-openvino/openvino/translate_session.cpp +++ b/ggml/src/ggml-openvino/openvino/translate_session.cpp @@ -1,16 +1,23 @@ #include "translate_session.hpp" +#include #include #include #include #include #include #include +#include +#include +#include +#include #include #include #include #include +#include #include +#include #include #include #include @@ -119,10 +126,95 @@ void 
add_kv_update_indices(TensorMap& tensor_map, GgmlDecoder& ggml_model_decode tensor_map.insert({"update_indices_v", update_indices_v->output(0)}); } +float ggml_rope_yarn_corr_dim(int n_dims, int n_ctx_orig, float n_rot, float base) { +#ifndef M_PI +# define M_PI 3.14159265358979323846 +#endif + return n_dims * logf(n_ctx_orig / (n_rot * 2 * (float) M_PI)) / (2 * logf(base)); +} + +void ggml_rope_yarn_corr_dims(int n_dims, int n_ctx_orig, float freq_base, float beta_fast, float beta_slow, + float dims[2]) { + float start = floorf(ggml_rope_yarn_corr_dim(n_dims, n_ctx_orig, beta_fast, freq_base)); + float end = ceilf(ggml_rope_yarn_corr_dim(n_dims, n_ctx_orig, beta_slow, freq_base)); + dims[0] = std::max(0.0f, start); + dims[1] = std::min(static_cast(n_dims - 1), end); +} + +void add_rope_sin_cos(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) { + int32_t* rope_params = ggml_model_decoder.get_rope_params(); + auto inp_pos = tensor_map.at("inp_pos").get_node_shared_ptr(); + std::shared_ptr rope_freqs_weight; + + inp_pos = std::make_shared(inp_pos, ov::element::f32); + auto pos_perm = + std::make_shared(ov::element::i64, ov::Shape{3}, std::vector{2, 1, 0}); + inp_pos = std::make_shared(inp_pos, pos_perm); + if (tensor_map.find("rope_freqs.weight") != tensor_map.end()) { + rope_freqs_weight = tensor_map.at("rope_freqs.weight").get_node_shared_ptr(); + } + + float freq_base; + float freq_scale; + float ext_factor; + float attn_factor; + float beta_fast; + float beta_slow; + const int n_dims = rope_params[1]; + const int n_ctx_orig = rope_params[4]; + memcpy(&freq_base, rope_params + 5, sizeof(float)); + memcpy(&freq_scale, rope_params + 6, sizeof(float)); + memcpy(&ext_factor, rope_params + 7, sizeof(float)); + memcpy(&attn_factor, rope_params + 8, sizeof(float)); + memcpy(&beta_fast, rope_params + 9, sizeof(float)); + memcpy(&beta_slow, rope_params + 10, sizeof(float)); + + const float theta_scale = powf(freq_base, -2.0f / n_dims); + + // TODO: corr_dims is not
used in the current implementation + float corr_dims[2]; + ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims); + + // TODO: GGML_OP_ROPE_BACK -> false + // bool forward = true; + // const float sin_sign = forward ? 1.0f : -1.0f; + + const int64_t half_head_size = ggml_model_decoder.get_head_size() / 2; + std::vector factor(half_head_size); + factor[0] = freq_scale; + for (int64_t i = 1; i < half_head_size; i++) { + factor[i] = theta_scale * factor[i - 1]; + } + + Output factor_node = + std::make_shared(ov::element::f32, ov::Shape{1, 1, factor.size()}, factor); + if (rope_freqs_weight) { + factor_node = std::make_shared(factor_node, rope_freqs_weight); + } + + auto half_head_size_node = ov::op::v0::Constant::create(ov::element::i64, Shape{1}, {half_head_size}); + Output cos_factor = + std::make_shared(std::make_shared(factor_node, inp_pos)); + Output sin_factor = + std::make_shared(std::make_shared(factor_node, inp_pos)); + + float mscale = attn_factor; + Output mscale_node = + std::make_shared(ov::element::f32, ov::Shape{}, std::vector{mscale}); + + auto cos_theta = std::make_shared(cos_factor, mscale_node); + auto sin_theta = std::make_shared(sin_factor, mscale_node); + cos_theta->set_friendly_name("rope_cos"); + sin_theta->set_friendly_name("rope_sin"); + tensor_map.insert({"rope_cos", cos_theta->output(0)}); + tensor_map.insert({"rope_sin", sin_theta->output(0)}); +} + // Create common patterns void preprocess(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) { add_token_len(tensor_map); add_kv_update_indices(tensor_map, ggml_model_decoder); + add_rope_sin_cos(tensor_map, ggml_model_decoder); } } // namespace