Fix Phi3 ROPE; Add test-backend-ops
Parent: 1ed49bbfaf
Commit: 44f4cf34b1
@@ -5,6 +5,10 @@ AlignConsecutiveDeclarations: false
ReferenceAlignment: Left
PointerAlignment: Left
Cpp11BracedListStyle: true
AccessModifierOffset: -4
BinPackArguments: false
BinPackParameters: false
BreakBeforeBraces: Attach

Language: Cpp
AlignAfterOpenBracket: Align

@@ -27,29 +31,7 @@ AllowShortIfStatementsOnASingleLine: Never
AllowShortLambdasOnASingleLine: Inline
AllowShortLoopsOnASingleLine: false
AlwaysBreakBeforeMultilineStrings: true
BinPackArguments: true
BinPackParameters: true # OnePerLine
BitFieldColonSpacing: Both
BreakBeforeBraces: Custom # Attach
BraceWrapping:
  AfterCaseLabel: true
  AfterClass: false
  AfterControlStatement: false
  AfterEnum: false
  AfterFunction: false
  AfterNamespace: false
  AfterObjCDeclaration: false
  AfterStruct: false
  AfterUnion: false
  AfterExternBlock: false
  BeforeCatch: false
  BeforeElse: false
  BeforeLambdaBody: false
  BeforeWhile: false
  IndentBraces: false
  SplitEmptyFunction: false
  SplitEmptyRecord: false
  SplitEmptyNamespace: false
# BreakAdjacentStringLiterals: true
BreakAfterAttributes: Never
BreakBeforeBinaryOperators: None
@@ -5,6 +5,7 @@

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <execution>

@@ -15,6 +16,8 @@
#include <openvino/core/dimension.hpp>
#include <openvino/core/node.hpp>
#include <openvino/core/partial_shape.hpp>
#include <openvino/core/type/bfloat16.hpp>
#include <openvino/core/type/element_type.hpp>
#include <openvino/core/type/float16.hpp>
#include <openvino/op/constant.hpp>
#include <openvino/op/parameter.hpp>
@@ -71,9 +74,19 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgrap
    }
}

GgmlOvDecoder::GgmlOvDecoder(struct ggml_cgraph* cgraph) {
    m_cgraph = cgraph;
    for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) {
        auto* cur_node = cgraph->nodes[node_n];
        m_nodes.push_back(cur_node);
        set_input_output(cur_node, true);
    }
}

// Called in GgmlOvDecoder constructor. Two cases: 1. constructing a decoder for the whole graph;
// 2. constructing a decoder for a node.
void GgmlOvDecoder::set_input_output(ggml_tensor* node) {
// 2. constructing a decoder for a node;
// 3. constructing a decoder for the whole graph naively (op test case)
void GgmlOvDecoder::set_input_output(ggml_tensor* node, bool naive) {
    std::string node_name;
    if (node->op == GGML_OP_CPY) {
        // CPY updates the input tensor in place. For later ov op that uses the
@@ -98,8 +111,14 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node) {
        m_inputs[src_name] = src;
        m_op_node_name.emplace_back(src_name, ggml_op_name(node->op));

        // If called for the whole graph, create constant nodes for weights and param nodes for inputs
        if (!m_node && !src->view_src) {
        // Add model inputs and weights constants, if called for the whole graph
        if (naive) {
            auto param_node = std::make_shared<ov::op::v0::Parameter>(get_ov_type(src), get_graph_input_shape(src));
            param_node->set_friendly_name(src_name);
            param_node->output(0).get_tensor().set_names({src_name});
            m_model_inputs[src_name] = param_node;

        } else if (!m_node && !src->view_src) {
            ggml_backend_buffer* buffer = src->buffer;

            if (buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY || src->flags & GGML_TENSOR_FLAG_INPUT) {

@@ -118,7 +137,10 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node) {
        }
    }

    if (!m_node) {
    // Add model outputs, if called for the whole graph
    if (naive) {
        m_model_output_names.push_back(node->name);
    } else if (!m_node) {
        static std::set<std::string> debug_output_names = {};
        // Workaround: the final tensor "result_output" does not have GGML_TENSOR_FLAG_OUTPUT flag set in cgraph
        if (node->buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY || node->flags & GGML_TENSOR_FLAG_OUTPUT ||
@@ -164,17 +186,7 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node) {
                m_op_case = 2;
            }
            break;
        }
        case GGML_OP_MUL_MAT: {
            if (node->src[0]->view_src == nullptr) {
                m_op_case = 1;
            } else if (std::string(node->src[0]->name).find("cache_k") == 0) {
                m_op_case = 2;
            } else if (std::string(node->src[0]->name).find("cache_v") == 0) {
                m_op_case = 3;
            }
            break;
        }
        case GGML_OP_PERMUTE: {
            if (node->src[0]->view_src == nullptr) {
                // Permute Qcur

@@ -188,6 +200,23 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node) {
            }
            break;
        }
        case GGML_OP_GET_ROWS:
        {
            if (node->src[1]->op == GGML_OP_VIEW) {
                m_op_case = 2;
            } else {
                m_op_case = 1;
            }
            break;
        }
        case GGML_OP_ROPE:
        {
            if (node->src[0]->op == GGML_OP_VIEW) {
                m_op_case = 2;
            } else {
                m_op_case = 1;
            }
        }
        default:
            break;
    }
@@ -237,6 +266,9 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor* src) co
        input_shape = ov::PartialShape{m_context_size, m_num_heads_kv, m_head_size};
    } else if (std::string(src->name).find("cache_v") == 0) {
        input_shape = ov::PartialShape{m_num_heads_kv, m_head_size, m_context_size};
    } else if (src->op == GGML_OP_VIEW) {
        // This case is added to make test-backend-ops work
        input_shape = ov::PartialShape{get_shape(src->view_src)};
    } else {
        input_shape = ov::PartialShape{get_shape(src)};
    }
@@ -373,6 +405,17 @@ std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor* tensor)
            weight_node = std::make_shared<ov::op::v0::Constant>(node_type, node_shape, data_f16);
            break;
        }
        case GGML_TYPE_BF16:
        {
            const auto* ptr = reinterpret_cast<const uint16_t*>(tensor->data);
            std::vector<ov::bfloat16> data_bf16;
            data_bf16.reserve(ne_total);
            for (int i = 0; i < ne_total; ++i) {
                data_bf16.push_back(ov::bfloat16::from_bits(ptr[i]));
            }
            weight_node = std::make_shared<ov::op::v0::Constant>(node_type, node_shape, data_bf16);
            break;
        }
        default:
            throw std::invalid_argument("Unsupported tensor type");
    }
@@ -496,6 +539,9 @@ ov::element::Type GgmlOvDecoder::get_ov_type(const ggml_tensor* tensor) {
        case GGML_TYPE_F16:
            type = ov::element::f16;
            break;
        case GGML_TYPE_BF16:
            type = ov::element::bf16;
            break;
        case GGML_TYPE_I64:
            type = ov::element::i64;
            break;
@@ -576,6 +622,7 @@ void GgmlOvDecoder::visit_subgraph(std::function<void(std::shared_ptr<GgmlDecode

const std::string& GgmlOvDecoder::get_op_type() const {
    static const std::map<ggml_op, std::string> ops = {
        {GGML_OP_NONE, "GGML_OP_NONE" },
        {GGML_OP_ACC,  "GGML_OP_ACC"  },
        {GGML_OP_ADD,  "GGML_OP_ADD"  },
        {GGML_OP_ADD1, "GGML_OP_ADD1" },
@@ -15,6 +15,8 @@ public:
    GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgraph, bool is_static, bool is_first_token,
                  int context_size, int num_heads, int num_heads_kv, int head_size);

    // Naive decoder
    GgmlOvDecoder(struct ggml_cgraph* cgraph);
    virtual ov::Any get_attribute(const std::string& name) const override {
        return nullptr;
        GGML_UNUSED(name);

@@ -111,7 +113,7 @@ public:
    void clear_model_weights() { m_model_weights.clear(); }

private:
    void set_input_output(ggml_tensor* node);
    void set_input_output(ggml_tensor* node, bool naive = false);
    void add_extra_inputs();
    static void dump_cgraph(const struct ggml_cgraph* cgraph, std::string& filename);
    static std::vector<size_t> get_shape(const ggml_tensor* tensor);
@@ -124,13 +126,13 @@ private:
    static std::shared_ptr<ov::Node> create_weight_node(ggml_tensor* tensor);
    void add_weight_const_parallel(std::map<std::string, std::shared_ptr<ov::Node>>& model_weights);

    struct ggml_cgraph* m_cgraph;
    struct ggml_cgraph* m_cgraph = nullptr;
    ggml_tensor* m_node = nullptr;
    std::vector<ggml_tensor*> m_nodes;
    std::map<std::string, ggml_tensor*> m_inputs;
    std::vector<std::string> m_input_names;
    std::map<std::string, ggml_tensor*> m_outputs;
    std::vector<std::string> m_output_names;
    ggml_tensor* m_node;
    std::vector<ggml_tensor*> m_nodes;
    std::string m_op_name;
    mutable std::string m_name;
    int m_op_case;
@@ -1,15 +1,17 @@
#include "ggml-backend-impl.h"
#include "ggml-impl.h"
#include "ggml-openvino.h"
#include "ggml-openvino/utils.h"
#include "ggml.h"

#include <cstdint>
#include <mutex>
#include <openvino/openvino.hpp>
#include <set>
#include <string>
#include <vector>

#include "ggml-backend-impl.h"
#include "ggml-impl.h"
#include "ggml-openvino/utils.h"
#include "ggml.h"

#define GGML_OPENVINO_MAX_STREAMS 8

struct ggml_backend_openvino_context {
@@ -234,9 +236,85 @@ static ggml_backend_buffer_t ggml_backend_openvino_device_buffer_from_host_ptr(g
    return nullptr;
}

static bool is_op_unsupported_case(const ggml_tensor* op) {
    if (op->op == GGML_OP_SOFT_MAX) {
        float scale = 1.0f;
        float max_bias = 0.0f;
        const auto* op_params = op->op_params;
        memcpy(&scale, (const float*) op_params + 0, sizeof(float));
        memcpy(&max_bias, (const float*) op_params + 1, sizeof(float));
        const uint32_t h = op->src[0]->ne[2];
        const uint32_t n_head = op->src[0]->ne[0];
        const uint32_t n_head_log2 = 1u << (uint32_t) floor(log2(n_head));

        const float m0 = powf(2.0f, -(max_bias) / n_head_log2);
        const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
        const float slope =
            (max_bias > 0.0f) ? h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2 * (h - n_head_log2) + 1) : 1.0f;

        if (slope != 1.0f) {
            GGML_LOG_WARN("OpenVINO backend does not support SOFT_MAX with slope != 1.0f\n");
            return true;
        }
    }

    if (op->op == GGML_OP_MUL_MAT) {
        if ((op->src[0]->view_src && op->src[0]->op != GGML_OP_PERMUTE) ||
            (op->src[1]->view_src && op->src[1]->op != GGML_OP_PERMUTE)) {
            GGML_LOG_WARN("OpenVINO backend does not support MUL_MAT with view_src tensors that are not PERMUTE\n");
            return true;
        }
    }

    if (op->op == GGML_OP_ROPE) {
        const int32_t* op_params = op->op_params;
        const int n_dims = op_params[1];
        const int mode = op_params[2];
        if (mode == GGML_ROPE_TYPE_MROPE || mode == GGML_ROPE_TYPE_VISION) {
            GGML_LOG_WARN("OpenVINO backend does not support ROPE with mode %d\n", mode);
            return true;
        }
        if (n_dims != op->src[0]->ne[0]) {
            GGML_LOG_WARN("OpenVINO backend does not support ROPE with n_dims %d != src[0]->ne[0] %ld\n",
                          n_dims,
                          op->src[0]->ne[0]);
            return true;
        }
        if (op->type != GGML_TYPE_F32) {
            GGML_LOG_WARN("OpenVINO backend does not support ROPE with type %s\n", ggml_type_name(op->type));
            return true;
        }
        float freq_scale;
        memcpy(&freq_scale, op_params + 6, sizeof(float));
        if (freq_scale != 1.0f) {
            GGML_LOG_WARN("OpenVINO backend does not support ROPE with freq_scale %f != 1.0f\n", freq_scale);
            return true;
        }
        float ext_factor;
        memcpy(&ext_factor, op_params + 7, sizeof(float));
        if (ext_factor != 0.0f) {
            GGML_LOG_WARN("OpenVINO backend does not support ROPE with ext_factor %f != 0.0f\n", ext_factor);
            return true;
        }
        if (op->src[0]->op == GGML_OP_VIEW) {
            if (op->src[0]->view_src->ne[1] != op->src[0]->ne[2]) {
                GGML_LOG_WARN(
                    "OpenVINO backend does not support ROPE with src[0]->view_src->ne[1] %ld != src[0]->ne[2] %ld\n",
                    op->src[0]->view_src->ne[1],
                    op->src[0]->ne[2]);
                return true;
            }
        }
    }
    return false;
}

static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
    GGML_ASSERT(dev->reg != nullptr);

    static const std::set<ggml_type> supported_types{
        GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_BF16, GGML_TYPE_I64, GGML_TYPE_I32};

    static const std::set<ggml_op> supported_ops{GGML_OP_NONE, GGML_OP_ADD, GGML_OP_MUL, GGML_OP_MUL_MAT,
                                                 GGML_OP_VIEW, GGML_OP_CONT, GGML_OP_CPY, GGML_OP_RESHAPE,
                                                 GGML_OP_PERMUTE, GGML_OP_TRANSPOSE, GGML_OP_GET_ROWS, GGML_OP_ROPE,
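Note: the SOFT_MAX rejection above uses the same ALiBi-style slope derivation as ggml. The standalone sketch below only reproduces that arithmetic with assumed example values (max_bias, n_head, h are not taken from any real model here); it is not part of the commit.

// Illustrative only: mirrors the slope computation in is_op_unsupported_case.
#include <cmath>
#include <cstdio>

int main() {
    const float    max_bias    = 8.0f;  // assumed example; most llama-style models use 0.0f
    const unsigned n_head      = 32;    // assumed, taken from src[0]->ne[0] in the real check
    const unsigned h           = 5;     // assumed, taken from src[0]->ne[2] in the real check
    const unsigned n_head_log2 = 1u << (unsigned) std::floor(std::log2((float) n_head));

    const float m0 = std::pow(2.0f, -max_bias / n_head_log2);
    const float m1 = std::pow(2.0f, -(max_bias / 2.0f) / n_head_log2);
    const float slope = (max_bias > 0.0f)
        ? (h < n_head_log2 ? std::pow(m0, h + 1.0f) : std::pow(m1, 2.0f * (h - n_head_log2) + 1.0f))
        : 1.0f;
    std::printf("slope = %f\n", slope);  // any value != 1.0f makes the backend reject this SOFT_MAX
    return 0;
}

With max_bias == 0.0f the slope stays exactly 1.0f, which is why the common non-ALiBi models remain supported.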
@@ -248,18 +326,60 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con
        GGML_GLU_OP_SWIGLU,
    };

    auto res = false;
    switch (op->op) {
        case GGML_OP_UNARY:
            res = supported_unary_ops.find(ggml_get_unary_op(op)) != supported_unary_ops.end();
            break;
        {
            auto supported = supported_unary_ops.find(ggml_get_unary_op(op)) != supported_unary_ops.end();
            if (!supported) {
                GGML_LOG_WARN("OpenVINO backend does not support unary op %s\n",
                              ggml_unary_op_name(ggml_get_unary_op(op)));
                return false;
            }
            break;
        }
        case GGML_OP_GLU:
            res = supported_glu_ops.find(ggml_get_glu_op(op)) != supported_glu_ops.end();
            break;
        {
            auto supported = supported_glu_ops.find(ggml_get_glu_op(op)) != supported_glu_ops.end();
            if (!supported) {
                GGML_LOG_WARN("OpenVINO backend does not support GLU op %s\n",
                              ggml_glu_op_name(ggml_get_glu_op(op)));
                return false;
            }
            break;
        }
        default:
            res = supported_ops.find(op->op) != supported_ops.end();
        {
            auto supported = supported_ops.find(op->op) != supported_ops.end();
            if (!supported) {
                GGML_LOG_WARN("OpenVINO backend does not support op %s\n", ggml_op_name(op->op));
                return false;
            }
        }
    }
    return res;

    if (supported_types.find(op->type) == supported_types.end()) {
        GGML_LOG_WARN("OpenVINO backend does not support tensor type %s\n", ggml_type_name(op->type));
        return false;
    }
    if (op->ne[3] != 1) {
        GGML_LOG_WARN("OpenVINO backend does not support tensors with ne[3] != 1\n");
        return false;
    }
    for (int i = 0; i < GGML_MAX_SRC; i++) {
        if (supported_types.find(op->type) == supported_types.end()) {
            GGML_LOG_WARN("OpenVINO backend does not support tensor type %s\n", ggml_type_name(op->type));
            return false;
        }
        if (op->src[i] != nullptr && op->src[i]->ne[3] != 1) {
            GGML_LOG_WARN("OpenVINO backend does not support tensors with ne[3] != 1\n");
            return false;
        }
    }

    if (is_op_unsupported_case(op)) {
        return false;
    }
    return true;
}

static bool ggml_backend_openvino_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
@@ -10,13 +10,13 @@ namespace ggml {

FrontEnd::FrontEnd() {}

std::shared_ptr<Model> FrontEnd::convert(const InputModel::Ptr& model) {
std::shared_ptr<Model> FrontEnd::convert(const InputModel::Ptr& model, bool naive) {
    auto ggml_model = std::dynamic_pointer_cast<ggml::InputModel>(model);
    FRONT_END_GENERAL_CHECK(ggml_model, "Invalid input model");
    std::shared_ptr<Model> converted_model;
    const auto& supported_ops = get_supported_ops();
    {
        TranslateSession translate_session(model, supported_ops);
        TranslateSession translate_session(model, supported_ops, naive);
        converted_model = translate_session.get_converted_model();
    }
    return converted_model;
@@ -15,7 +15,7 @@ public:
    using Ptr = std::shared_ptr<FrontEnd>;
    FrontEnd();

    static std::shared_ptr<Model> convert(const InputModel::Ptr& model);
    static std::shared_ptr<Model> convert(const InputModel::Ptr& model, bool naive = false);
};

}  // namespace ggml
@@ -77,6 +77,10 @@ public:
        return m_tensor_map->at(name);
    }

    bool has_input(const std::string& name) const {
        return m_tensor_map->find(name) != m_tensor_map->end();
    }

    const std::string& get_name() const override {
        return m_decoder->get_op_name();
    }
@@ -34,19 +34,7 @@ OutputVector translate_cont(const NodeContext& context) {
                                                    false);
    } else {
        // The input comes from a VIEW
        // Currently all cases are slicing at lowest dim
        int32_t* op_params = context.get_input_op_params(0);
        auto output_stride = context.get_output_stride(0);

        int64_t split_addr = op_params[0] / output_stride[2];
        std::vector<int64_t> begin = {0, 0, split_addr};
        std::vector<int64_t> end = {(int64_t)src_shape[0], INT_MAX, split_addr + (int64_t)src_shape[2]};
        std::vector<int64_t> strides = {1, 1, 1};

        auto begin_const = ov::op::v0::Constant::create(element::i64, {begin.size()}, begin);
        auto end_const = ov::op::v0::Constant::create(ov::element::i64, {end.size()}, end);
        auto strides_const = ov::op::v0::Constant::create(ov::element::i64, {strides.size()}, strides);
        res = std::make_shared<ov::op::v8::Slice>(context.get_input(0), begin_const, end_const, strides_const);
        res = process_view_input(context, 0);
    }

    return rename_outputs_with_suffix({res}, context.get_name());
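Note: the inline begin/end/stride construction in translate_cont is replaced by the shared process_view_input helper (defined later in this commit's utils changes). Conceptually it turns a ggml VIEW's byte offset (op_params[0]) into an element index on the sliced axis by dividing by that axis's byte stride. A small sketch with purely assumed numbers (not from any real graph):

// Illustrative only: how a VIEW byte offset becomes Slice bounds.
#include <cstdint>
#include <cstdio>

int main() {
    const int32_t view_offset_bytes = 8192;  // assumed op_params[0] of the VIEW
    const int64_t stride_bytes      = 4;     // assumed byte stride of the sliced axis (f32 elements)
    const int64_t slice_len         = 32;    // assumed number of elements kept by the view

    const int64_t begin = view_offset_bytes / stride_bytes;  // 2048
    const int64_t end   = begin + slice_len;                 // 2080
    std::printf("Slice on the last axis: [%lld, %lld)\n", (long long) begin, (long long) end);
    return 0;
}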
@@ -1,10 +1,12 @@
#include <cstdint>
#include <openvino/core/node.hpp>
#include <openvino/core/node_output.hpp>
#include <openvino/op/constant.hpp>
#include <openvino/op/convert.hpp>
#include <openvino/op/gather.hpp>
#include <openvino/op/reshape.hpp>
#include <vector>
#include <openvino/op/slice.hpp>
#include <openvino/op/squeeze.hpp>

#include "../node_context.hpp"
#include "../op_table.hpp"
@@ -18,19 +20,32 @@ namespace op {
OutputVector translate_get_rows(const NodeContext& context) {
    num_inputs_check(context, 2, 2);

    auto data_node = context.get_input(0);
    auto indices_node = context.get_input(1);
    int op_case = context.get_op_case();
    FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2, "Unsupported CONT case");

    auto indices_shape = get_dimensions(indices_node.get_node_shared_ptr(), {2});
    Output<Node> indice_reshaped = std::make_shared<ov::op::v1::Reshape>(indices_node, indices_shape, false);
    Output<Node> res;
    auto data = context.get_input(0);
    auto indices = context.get_input(1);

    auto axis_node = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{}, {1});
    if (op_case == 2) {
        // The input comes from a VIEW
        indices = process_view_input(context, 1);
    }

    auto axis = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{}, {1});
    if (indices.get_partial_shape()[1].get_length() == 1) {
        indices =
            std::make_shared<ov::op::v0::Squeeze>(indices, ov::op::v0::Constant::create(ov::element::i64, {2}, {0, 1}));
        res = std::make_shared<ov::op::v8::Gather>(data, indices, axis);
    } else {
        indices =
            std::make_shared<ov::op::v0::Squeeze>(indices, ov::op::v0::Constant::create(ov::element::i64, {1}, {0}));
        res = std::make_shared<ov::op::v8::Gather>(data, indices, axis, 1);
    }

    Output<Node> res = std::make_shared<ov::op::v8::Gather>(data_node, indice_reshaped, axis_node);
    if (res.get_element_type() != context.get_output_type(0)) {
        res = std::make_shared<ov::op::v0::Convert>(res, context.get_output_type(0));
    }

    return rename_outputs_with_suffix({res}, context.get_name());
}
@@ -26,48 +26,46 @@ namespace op {
OutputVector translate_mulmat(const NodeContext& context) {
    num_inputs_check(context, 2, 2);

    int op_case = context.get_op_case();
    FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2 || op_case == 3, "Unsupported MULMAT case");

    ov::Output<Node> res;
    ov::Output<ov::Node> B = context.get_input(0);
    ov::Output<ov::Node> A = std::make_shared<ov::op::v0::Convert>(context.get_input(1), context.get_input_type(0));

    if (op_case == 1) {
        auto src0 = context.get_input(0);
        auto src1 = std::make_shared<ov::op::v0::Convert>(context.get_input(1), context.get_input_type(0));
        auto result_lp = std::make_shared<ov::op::v0::MatMul>(src1, src0, false, true);
        res = std::make_shared<ov::op::v0::Convert>(result_lp, context.get_output_type(0));
    } else {
        ov::Output<ov::Node> B = context.get_input(0);
        ov::Output<ov::Node> A = std::make_shared<ov::op::v0::Convert>(context.get_input(1), context.get_input_type(0));
        auto B_shape = context.get_input_shape(0).to_shape();
        auto A_shape = context.get_input_shape(1).to_shape();
        int64_t A_batch = A_shape[0];
        int64_t B_batch = B_shape[0];
        auto A_batch_larger = A_batch > B_batch;
        Output<Node> Z = A_batch_larger ? B : A;
        int64_t factor = A_batch_larger ? A_batch / B_batch : B_batch / A_batch;
        if (factor > 1) {
            auto A_batch_node = ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector<int64_t>{A_batch});
            auto B_batch_node = ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector<int64_t>{B_batch});
            auto factor_node = ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector<int64_t>{factor});

    int64_t num_heads = context.get_num_heads();
    int64_t num_heads_kv = context.get_num_heads_kv();
    int64_t kv_num_heads_factor = num_heads / num_heads_kv;
    if (kv_num_heads_factor > 1) {
        auto num_heads_node = ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector<int64_t>{num_heads});
        auto num_heads_kv_node =
            ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector<int64_t>{num_heads_kv});
        auto factor_node =
            ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector<int64_t>{kv_num_heads_factor});
        auto B_shape_last_two = get_dimensions(B.get_node_shared_ptr(), {1, 2});
            auto Z_last_two_dim = get_dimensions(Z.get_node_shared_ptr(), {1, 2});

        auto unsqueeze_axes = ov::op::v0::Constant::create(ov::element::i64, Shape{}, {1});
        auto B_unsqueezed = std::make_shared<ov::op::v0::Unsqueeze>(B, unsqueeze_axes);
            auto unsqueeze_axes = ov::op::v0::Constant::create(ov::element::i64, Shape{}, {1});
            auto Z_unsqueezed = std::make_shared<ov::op::v0::Unsqueeze>(Z, unsqueeze_axes);

        auto broadcast_shape = std::make_shared<ov::op::v0::Concat>(
            ov::OutputVector{num_heads_kv_node, factor_node, B_shape_last_two}, 0);
        auto B_broadcasted = std::make_shared<ov::op::v3::Broadcast>(B_unsqueezed, broadcast_shape);
            Output<Node> batch_small = A_batch_larger ? B_batch_node : A_batch_node;
            Output<Node> batch_large = A_batch_larger ? A_batch_node : B_batch_node;
            auto broadcast_shape =
                std::make_shared<ov::op::v0::Concat>(ov::OutputVector{batch_small, factor_node, Z_last_two_dim}, 0);
            auto Z_broadcasted = std::make_shared<ov::op::v3::Broadcast>(Z_unsqueezed, broadcast_shape);

        auto new_B_shape =
            std::make_shared<ov::op::v0::Concat>(ov::OutputVector{num_heads_node, B_shape_last_two}, 0);
        B = std::make_shared<ov::op::v1::Reshape>(B_broadcasted, new_B_shape, false);
            auto new_Z_shape = std::make_shared<ov::op::v0::Concat>(ov::OutputVector{batch_large, Z_last_two_dim}, 0);
            Z = std::make_shared<ov::op::v1::Reshape>(Z_broadcasted, new_Z_shape, false);
        }
        if (A_batch_larger) {
            B = Z;
        } else {
            A = Z;
        }

        auto result_lp = std::make_shared<ov::op::v0::MatMul>(A, B, false, true);
        res = std::make_shared<ov::op::v0::Convert>(result_lp, context.get_output_type(0));
    }

    return rename_outputs_with_suffix({res}, context.get_name());
    return rename_outputs_with_suffix({res}, context.get_name());
}

}  // namespace op
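Note on the MUL_MAT change: instead of always repeating the KV operand by num_heads / num_heads_kv, the translation now broadcasts whichever operand has the smaller batch by the batch ratio (Unsqueeze on axis 1, Broadcast, then Reshape), which also covers the grouped-query-attention case. A scalar sketch of the equivalent repeat, with assumed head counts (not backend code):

// Illustrative reference for the Unsqueeze -> Broadcast -> Reshape above:
// each of the n_kv_heads K/V matrices is repeated `factor` times so the
// batch dimension matches n_heads; kv head h then serves query heads
// h*factor .. h*factor + factor - 1.
#include <cstddef>
#include <vector>

std::vector<std::vector<float>> repeat_kv_heads(const std::vector<std::vector<float>>& kv_heads,
                                                std::size_t factor) {
    std::vector<std::vector<float>> out;
    out.reserve(kv_heads.size() * factor);
    for (const auto& head : kv_heads) {
        for (std::size_t r = 0; r < factor; ++r) {
            out.push_back(head);  // e.g. 8 KV heads * factor 4 -> 32 query-side batches
        }
    }
    return out;
}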
@@ -11,6 +11,7 @@
#include <openvino/op/slice.hpp>
#include <openvino/op/split.hpp>
#include <openvino/op/subtract.hpp>
#include <openvino/op/unsqueeze.hpp>
#include <vector>

#include "../node_context.hpp"
@@ -25,37 +26,66 @@ namespace op {
OutputVector translate_rope(const NodeContext& context) {
    num_inputs_check(context, 2, 3);

    int op_case = context.get_op_case();
    FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2, "Unsupported CONT case");

    ov::Output<Node> res;

    auto data_node = context.get_input(0).get_node_shared_ptr();
    auto cos_theta_node = context.get_input("rope_cos");
    auto sin_theta_node = context.get_input("rope_sin");

    auto output_shape = context.get_output_shape(0).to_shape();
    int32_t* op_params = context.get_output_op_params(0);

    Output<Node> cos_theta_node;
    Output<Node> sin_theta_node;
    if (context.has_input("rope_cos")) {
        cos_theta_node = context.get_input("rope_cos");
        sin_theta_node = context.get_input("rope_sin");
    } else {
        auto inp_pos = context.get_input(1).get_node_shared_ptr();
        std::shared_ptr<ov::Node> rope_freqs_weight;
        if (context.get_input_size() == 3) {
            rope_freqs_weight = context.get_input(2).get_node_shared_ptr();
        }
        auto sin_cos = make_sin_cos(op_params, inp_pos, rope_freqs_weight);
        sin_theta_node = sin_cos.first;
        cos_theta_node = sin_cos.second;
    }

    if (op_case == 2) {
        // The input comes from a VIEW
        int slice_len = output_shape[1] * output_shape[2];
        data_node = process_view_input(context, 0, slice_len).get_node_shared_ptr();
        auto data_shape = ov::op::v0::Constant::create(
            ov::element::i64, {3}, std::vector<int64_t>{-1, (int64_t) output_shape[1], (int64_t) output_shape[2]});
        data_node = std::make_shared<ov::op::v1::Reshape>(data_node, data_shape, false);
    }

    const int mode = op_params[2];
    constexpr int GGML_ROPE_TYPE_NEOX = 2;
    const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
    constexpr int ROPE_TYPE_NEOX = 2;
    constexpr int ROPE_TYPE_NORM = 0;

    if (!is_neox) {
        auto input_shape = context.get_input_shape(0);
    if (mode == ROPE_TYPE_NORM) {
        auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
        auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
        auto two = ov::op::v0::Constant::create(ov::element::i64, {1}, {2});
        auto end = ov::op::v0::Constant::create(ov::element::i64, {1}, {output_shape[2]});
        auto even_slice = std::make_shared<ov::op::v8::Slice>(data_node, zero, end, two, two);
        auto odd_slice = std::make_shared<ov::op::v8::Slice>(data_node, one, end, two, two);

        auto begin_even = ov::op::v0::Constant::create(ov::element::i64, Shape{3}, {0, 0, 0});
        auto begin_odd = ov::op::v0::Constant::create(ov::element::i64, Shape{3}, {0, 0, 1});
        auto end = std::make_shared<ov::op::v0::ShapeOf>(data_node);
        auto stride = ov::op::v0::Constant::create(ov::element::i64, Shape{3}, {1, 1, 2});
        auto even_slice = std::make_shared<ov::op::v8::Slice>(data_node, begin_even, end, stride);
        auto odd_slice = std::make_shared<ov::op::v8::Slice>(data_node, begin_odd, end, stride);

        auto first_half =
        Output<Node> first_half =
            std::make_shared<ov::op::v1::Subtract>(std::make_shared<ov::op::v1::Multiply>(even_slice, cos_theta_node),
                                                   std::make_shared<ov::op::v1::Multiply>(odd_slice, sin_theta_node));
        auto second_half =
        Output<Node> second_half =
            std::make_shared<ov::op::v1::Add>(std::make_shared<ov::op::v1::Multiply>(even_slice, sin_theta_node),
                                              std::make_shared<ov::op::v1::Multiply>(odd_slice, cos_theta_node));

        auto stack = std::make_shared<ov::op::v0::Concat>(OutputVector{first_half, second_half}, 2);
        first_half = std::make_shared<ov::op::v0::Unsqueeze>(first_half,
                                                             ov::op::v0::Constant::create(ov::element::i64, {1}, {3}));
        second_half = std::make_shared<ov::op::v0::Unsqueeze>(second_half,
                                                              ov::op::v0::Constant::create(ov::element::i64, {1}, {3}));
        auto stack = std::make_shared<ov::op::v0::Concat>(OutputVector{first_half, second_half}, 3);
        res = std::make_shared<ov::op::v1::Reshape>(stack, std::make_shared<ov::op::v0::ShapeOf>(data_node), false);
    } else {
    } else if (mode == ROPE_TYPE_NEOX) {
        auto data_split = std::make_shared<ov::op::v1::Split>(
            data_node, ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {2}), 2);
        Output<Node> slice_data_node_0 = data_split->outputs()[0];
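Note on the Phi-3 ROPE fix: the non-NEOX ("NORM") path previously concatenated the two rotated halves along the feature axis, which un-interleaves the even/odd pairs. It now unsqueezes both halves, concatenates them on a new trailing axis and reshapes back, so each pair stays adjacent as ggml's NORM mode expects. A scalar reference of what the graph is meant to compute per pair (a sketch, not the backend code):

// NORM-style (interleaved) RoPE: each adjacent pair (x[2i], x[2i+1]) of a
// head vector is rotated by theta_i; NEOX instead pairs x[i] with x[i + n/2].
#include <cmath>
#include <vector>

void rope_norm_reference(std::vector<float>& x, const std::vector<float>& theta) {
    for (std::size_t i = 0; i + 1 < x.size(); i += 2) {
        const float c  = std::cos(theta[i / 2]);
        const float s  = std::sin(theta[i / 2]);
        const float x0 = x[i];
        const float x1 = x[i + 1];
        x[i]     = x0 * c - x1 * s;  // the "first_half" term in the graph
        x[i + 1] = x0 * s + x1 * c;  // the "second_half" term in the graph
    }
}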
@@ -33,9 +33,9 @@ OutputVector translate_soft_max(const NodeContext& context) {
    auto* op_params = context.get_output_op_params(0);
    memcpy(&scale, (float*) op_params + 0, sizeof(float));
    memcpy(&max_bias, (float*) op_params + 1, sizeof(float));
    const uint32_t h = context.get_head_size();

    const uint32_t n_head = context.get_input_shape(0)[0].get_length();
    auto src0_shape = context.get_input_shape(0).get_shape();
    const uint32_t h = src0_shape[2];
    const uint32_t n_head = src0_shape[0];
    const uint32_t n_head_log2 = 1u << (uint32_t) floor(log2(n_head));

    const float m0 = powf(2.0f, -(max_bias) / n_head_log2);
@@ -46,23 +46,30 @@ OutputVector translate_soft_max(const NodeContext& context) {
    auto scale_node = std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{}, std::vector<float>{scale});
    auto scaled_input = std::make_shared<ov::op::v1::Multiply>(input_node, scale_node);

    if (context.get_input_size() < 2) {
        res = std::make_shared<ov::op::v8::Softmax>(scaled_input, 2);
        return rename_outputs_with_suffix({res}, context.get_name());
    }

    auto mask_node = context.get_input(1);

    // Use Q-cur to retrieve the token length, so that the translation of SOFT_MAX
    std::shared_ptr<ov::Node> token_len = get_dimensions(input_node, {1});
    // Try using Q-cur to retrieve the token length, so that the translation of SOFT_MAX
    // does not depend on the result of the QK MatMul, so that QK matmul + softmax + qkv matmul
    // can be fused into SDPA.
    if (input_node->get_type_info() != ov::op::v0::Convert::get_type_info_static()) {
        throw std::runtime_error("Input of SOFT_MAX should be MatMul of qk followed by a Convert");
    if (input_node->get_type_info() == ov::op::v0::Convert::get_type_info_static()) {
        auto qk = input_node->get_input_node_shared_ptr(0);
        if (qk->get_type_info() == ov::op::v0::MatMul::get_type_info_static()) {
            token_len = get_dimensions(qk->get_input_node_shared_ptr(0), {1});
        }
    }
    auto qk = input_node->get_input_node_shared_ptr(0);
    if (qk->get_type_info() != ov::op::v0::MatMul::get_type_info_static()) {
        throw std::runtime_error("Input of SOFT_MAX should be MatMul of qk followed by a Convert");
    }
    auto token_len = get_dimensions(qk->get_input_node_shared_ptr(0), {1});

    auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
    auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
    auto mask_node_sliced = std::make_shared<ov::op::v8::Slice>(mask_node, zero, token_len, one, one);
    std::shared_ptr<ov::Node> mask_node_sliced =
        std::make_shared<ov::op::v8::Slice>(mask_node, zero, token_len, one, one);
    if (mask_node_sliced->get_element_type() != context.get_output_type(0)) {
        mask_node_sliced = std::make_shared<ov::op::v0::Convert>(mask_node_sliced, context.get_output_type(0));
    }

    Output<Node> slope_mask;
    if (slope != 1.0f) {
@@ -145,69 +145,18 @@ void add_rope_sin_cos(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) {
    int32_t* rope_params = ggml_model_decoder.get_rope_params();
    auto inp_pos = tensor_map.at("inp_pos").get_node_shared_ptr();
    std::shared_ptr<ov::Node> rope_freqs_weight;

    inp_pos = std::make_shared<ov::op::v0::Convert>(inp_pos, ov::element::f32);
    auto pos_perm =
        std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{3}, std::vector<int64_t>{2, 1, 0});
    inp_pos = std::make_shared<ov::op::v1::Transpose>(inp_pos, pos_perm);
    if (tensor_map.find("rope_freqs_weight") != tensor_map.end()) {
        rope_freqs_weight = tensor_map.at("rope_freqs.weight").get_node_shared_ptr();
    }

    float freq_base;
    float freq_scale;
    float ext_factor;
    float attn_factor;
    float beta_fast;
    float beta_slow;
    const int n_dims = rope_params[1];
    const int n_ctx_orig = rope_params[4];
    memcpy(&freq_base, rope_params + 5, sizeof(float));
    memcpy(&freq_scale, rope_params + 6, sizeof(float));
    memcpy(&ext_factor, rope_params + 7, sizeof(float));
    memcpy(&attn_factor, rope_params + 8, sizeof(float));
    memcpy(&beta_fast, rope_params + 9, sizeof(float));
    memcpy(&beta_slow, rope_params + 10, sizeof(float));
    auto sin_cos = make_sin_cos(rope_params, inp_pos, rope_freqs_weight);
    auto sin_theta = sin_cos.first;
    auto cos_theta = sin_cos.second;

    const float theta_scale = powf(freq_base, -2.0f / n_dims);

    // TODO: corr_dims is not used in the current implementation
    float corr_dims[2];
    ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);

    // TODO: GGML_OP_ROPE_BACK -> false
    // bool forward = true;
    // const float sin_sign = forward ? 1.0f : -1.0f;

    const int64_t half_head_size = ggml_model_decoder.get_head_size() / 2;
    std::vector<float> factor(half_head_size);
    factor[0] = freq_scale;
    for (int64_t i = 1; i < half_head_size; i++) {
        factor[i] = theta_scale * factor[i - 1];
    }

    Output<Node> factor_node =
        std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{1, 1, factor.size()}, factor);
    if (rope_freqs_weight) {
        factor_node = std::make_shared<ov::op::v1::Divide>(factor_node, rope_freqs_weight);
    }

    auto half_head_size_node = ov::op::v0::Constant::create(ov::element::i64, Shape{1}, {half_head_size});
    Output<Node> cos_factor =
        std::make_shared<ov::op::v0::Cos>(std::make_shared<ov::op::v1::Multiply>(factor_node, inp_pos));
    Output<Node> sin_factor =
        std::make_shared<ov::op::v0::Sin>(std::make_shared<ov::op::v1::Multiply>(factor_node, inp_pos));

    float mscale = attn_factor;
    Output<Node> mscale_node =
        std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{}, std::vector<float>{mscale});

    auto cos_theta = std::make_shared<ov::op::v1::Multiply>(cos_factor, mscale_node);
    auto sin_theta = std::make_shared<ov::op::v1::Multiply>(sin_factor, mscale_node);
    cos_theta->set_friendly_name("rope_cos");
    sin_theta->set_friendly_name("rope_sin");
    tensor_map.insert({"rope_cos", cos_theta->output(0)});
    tensor_map.insert({"rope_sin", sin_theta->output(0)});
    cos_theta.get_node_shared_ptr()->set_friendly_name("rope_cos");
    sin_theta.get_node_shared_ptr()->set_friendly_name("rope_sin");
    tensor_map.insert({"rope_cos", cos_theta});
    tensor_map.insert({"rope_sin", sin_theta});
}

// Create common patterns
@@ -220,10 +169,12 @@ void preprocess(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) {
}  // namespace

TranslateSession::TranslateSession(const frontend::InputModel::Ptr& input_model,
                                   const std::unordered_map<std::string, CreatorFunction>& translator_map)
    : m_input_model(input_model),
      m_translator_map(translator_map),
      m_ov_model(nullptr) {}
                                   const std::unordered_map<std::string, CreatorFunction>& translator_map,
                                   bool naive) :
    m_input_model(input_model),
    m_translator_map(translator_map),
    m_ov_model(nullptr),
    m_naive(naive) {}

std::shared_ptr<Model> TranslateSession::get_converted_model() {
    if (m_ov_model) {
@@ -258,6 +209,10 @@ std::shared_ptr<Model> TranslateSession::translate_graph(const frontend::InputMo

    auto node_visitor = [&](std::shared_ptr<GgmlDecoder> node) {
        auto operation_type = node->get_op_type();
        if (operation_type == "GGML_OP_NONE") {
            return;
        }

        ov::OutputVector converted_outputs;
        auto it = m_translator_map.find(operation_type);
        FRONT_END_OP_CONVERSION_CHECK(it != m_translator_map.end(),
@@ -285,7 +240,9 @@ std::shared_ptr<Model> TranslateSession::translate_graph(const frontend::InputMo
        }
    };

    preprocess(*tensor_map, *ggml_model_decoder);
    if (!m_naive) {
        preprocess(*tensor_map, *ggml_model_decoder);
    }
    ggml_model_decoder->visit_subgraph(node_visitor);

    for (const auto& name : ggml_model_decoder->get_model_output_names()) {
@@ -10,7 +10,7 @@ namespace ggml {
class TranslateSession {
public:
    TranslateSession(const frontend::InputModel::Ptr& input_model,
                     const std::unordered_map<std::string, CreatorFunction>& translator_map);
                     const std::unordered_map<std::string, CreatorFunction>& translator_map, bool naive = false);

    std::shared_ptr<Model> get_converted_model();
    std::shared_ptr<Model> translate_graph(const frontend::InputModel::Ptr& input_model);
@@ -20,6 +20,7 @@ private:
    const frontend::InputModel::Ptr m_input_model;
    const std::unordered_map<std::string, CreatorFunction>& m_translator_map;
    std::shared_ptr<Model> m_ov_model;
    bool m_naive;
};

}  // namespace ggml
@@ -1,9 +1,20 @@
#include "utils.hpp"

#include <cstddef>
#include <ctime>
#include <memory>
#include <openvino/op/add.hpp>
#include <openvino/op/clamp.hpp>
#include <openvino/op/convert.hpp>
#include <openvino/op/cos.hpp>
#include <openvino/op/divide.hpp>
#include <openvino/op/gather.hpp>
#include <openvino/op/maximum.hpp>
#include <openvino/op/multiply.hpp>
#include <openvino/op/shape_of.hpp>
#include <openvino/op/sin.hpp>
#include <openvino/op/subtract.hpp>
#include <openvino/op/transpose.hpp>
#include <string>

namespace ov {
@@ -58,6 +69,134 @@ OutputVector rename_outputs_with_suffix(const OutputVector& outputs, const std::
    return outputs;
}

namespace {
ov::Output<ov::Node> rope_yarn_ramp_mix(int n_dims, const float corr_dims[2], float ext_factor) {
    int half_n_dims = n_dims / 2;
    std::vector<float> dim_ids_vec(half_n_dims);
    std::iota(dim_ids_vec.begin(), dim_ids_vec.end(), 0);
    auto dim_ids = ov::op::v0::Constant::create(ov::element::f32, Shape{1, 1, (size_t) half_n_dims}, dim_ids_vec);
    auto corr_low = ov::op::v0::Constant::create(ov::element::f32, Shape{1, 1, 1}, {corr_dims[0]});
    auto corr_high = ov::op::v0::Constant::create(ov::element::f32, Shape{1, 1, 1}, {corr_dims[1]});
    auto denom =
        std::make_shared<ov::op::v1::Maximum>(std::make_shared<ov::op::v1::Subtract>(corr_high, corr_low),
                                              ov::op::v0::Constant::create(ov::element::f32, Shape{1, 1, 1}, {0.001f}));
    auto ramp_y =
        std::make_shared<ov::op::v1::Divide>(std::make_shared<ov::op::v1::Subtract>(dim_ids, corr_low), denom);
    auto ramp_clamped = std::make_shared<ov::op::v0::Clamp>(ramp_y, 0.0f, 1.0f);
    auto ext_factor_node = ov::op::v0::Constant::create(ov::element::f32, Shape{}, {ext_factor});
    auto ramp_mix = std::make_shared<ov::op::v1::Multiply>(ramp_clamped, ext_factor_node);
    return ramp_mix;
}

float ggml_rope_yarn_corr_dim(int n_dims, int n_ctx_orig, float n_rot, float base) {
#ifndef M_PI
#    define M_PI 3.14159265358979323846
#endif
    return n_dims * logf(n_ctx_orig / (n_rot * 2 * (float) M_PI)) / (2 * logf(base));
}

void ggml_rope_yarn_corr_dims(int n_dims,
                              int n_ctx_orig,
                              float freq_base,
                              float beta_fast,
                              float beta_slow,
                              float dims[2]) {
    float start = floorf(ggml_rope_yarn_corr_dim(n_dims, n_ctx_orig, beta_fast, freq_base));
    float end = ceilf(ggml_rope_yarn_corr_dim(n_dims, n_ctx_orig, beta_slow, freq_base));
    dims[0] = std::max(0.0f, start);
    dims[1] = std::min(static_cast<float>(n_dims - 1), end);
}
}  // namespace

std::pair<ov::Output<Node>, ov::Output<Node>> make_sin_cos(int32_t* rope_params,
                                                           std::shared_ptr<ov::Node> inp_pos,
                                                           std::shared_ptr<ov::Node> rope_freqs_weight) {
    inp_pos = std::make_shared<ov::op::v0::Convert>(inp_pos, ov::element::f32);
    auto pos_perm =
        std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{3}, std::vector<int64_t>{2, 1, 0});
    inp_pos = std::make_shared<ov::op::v1::Transpose>(inp_pos, pos_perm);

    float freq_base;
    float freq_scale;
    float ext_factor;
    float attn_factor;
    float beta_fast;
    float beta_slow;
    const int n_dims = rope_params[1];
    const int n_ctx_orig = rope_params[4];
    memcpy(&freq_base, rope_params + 5, sizeof(float));
    memcpy(&freq_scale, rope_params + 6, sizeof(float));
    memcpy(&ext_factor, rope_params + 7, sizeof(float));
    memcpy(&attn_factor, rope_params + 8, sizeof(float));
    memcpy(&beta_fast, rope_params + 9, sizeof(float));
    memcpy(&beta_slow, rope_params + 10, sizeof(float));

    const float theta_scale = powf(freq_base, -2.0f / n_dims);

    float corr_dims[2];
    ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);

    std::vector<float> factor(n_dims / 2);
    factor[0] = freq_scale;
    for (size_t i = 1; i < factor.size(); i++) {
        factor[i] = theta_scale * factor[i - 1];
    }

    Output<Node> freq_factors =
        std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{1, 1, factor.size()}, factor);
    if (rope_freqs_weight) {
        freq_factors = std::make_shared<ov::op::v1::Divide>(freq_factors, rope_freqs_weight);
    }

    auto theta_extrap = std::make_shared<ov::op::v1::Multiply>(freq_factors, inp_pos);
    auto theta_interp = std::make_shared<ov::op::v1::Multiply>(
        theta_extrap, ov::op::v0::Constant::create(ov::element::f32, {1}, {freq_scale}));

    Output<Node> theta;
    float mscale = attn_factor;
    if (ext_factor == 0.0f) {
        theta = theta_interp;
    } else {
        auto ramp_mix = rope_yarn_ramp_mix(n_dims, corr_dims, ext_factor);
        auto one = ov::op::v0::Constant::create(ov::element::f32, Shape{1, 1, 1}, {1.0f});
        auto one_minus_ramp = std::make_shared<ov::op::v1::Subtract>(one, ramp_mix);

        theta = std::make_shared<ov::op::v1::Add>(std::make_shared<ov::op::v1::Multiply>(theta_interp, one_minus_ramp),
                                                  std::make_shared<ov::op::v1::Multiply>(theta_extrap, ramp_mix));
        mscale *= (1.0f + 0.1f * std::log(1.0f / freq_scale));
    }

    Output<Node> cos_theta = std::make_shared<ov::op::v0::Cos>(theta);
    Output<Node> sin_theta = std::make_shared<ov::op::v0::Sin>(theta);

    auto mscale_node = ov::op::v0::Constant::create(ov::element::f32, Shape{}, {mscale});

    cos_theta = std::make_shared<ov::op::v1::Multiply>(cos_theta, mscale_node);
    sin_theta = std::make_shared<ov::op::v1::Multiply>(sin_theta, mscale_node);
    return std::make_pair(sin_theta, cos_theta);
}

ov::Output<ov::Node> process_view_input(const NodeContext& context, int input_index, int slice_len) {
    // Only works for VIEW operations that slice at the lowest dimension
    // If the VIEW also reshape the result, `slice_len` should be provided
    auto input = context.get_input(input_index);
    int32_t* op_params = context.get_input_op_params(input_index);
    auto src1_stride = context.get_input_stride(input_index);

    int64_t split_addr = op_params[0] / src1_stride[2];
    if (slice_len == 0) {
        slice_len = context.get_input_shape(input_index)[2].get_length();
    }
    int64_t slice_end = split_addr + slice_len;

    auto begin = ov::op::v0::Constant::create(ov::element::i64, {1}, {split_addr});
    auto end = ov::op::v0::Constant::create(ov::element::i64, {1}, {slice_end});
    auto stride = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
    auto axes = ov::op::v0::Constant::create(ov::element::i64, {1}, {2});
    auto sliced = std::make_shared<ov::op::v8::Slice>(input, begin, end, stride, axes);
    return sliced;
}

}  // namespace ggml
}  // namespace frontend
}  // namespace ov
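Note: make_sin_cos now implements the YaRN path; the correction dims decide which rotary dimensions get extrapolated rather than interpolated frequencies, using the same formula as ggml_rope_yarn_corr_dim above. The sketch below only re-runs that formula with assumed example parameters (head size 128, 4096 original context, base 10000) so the intermediate values can be inspected; it is not part of the commit.

// Illustrative only: compute the YaRN correction dims for sample rope params.
#include <cmath>
#include <cstdio>
#ifndef M_PI
#    define M_PI 3.14159265358979323846
#endif

static float corr_dim(int n_dims, int n_ctx_orig, float n_rot, float base) {
    return n_dims * std::log(n_ctx_orig / (n_rot * 2.0f * (float) M_PI)) / (2.0f * std::log(base));
}

int main() {
    const int   n_dims     = 128;      // assumed head size
    const int   n_ctx_orig = 4096;     // assumed original training context
    const float freq_base  = 10000.0f; // assumed rope base
    const float beta_fast  = 32.0f;
    const float beta_slow  = 1.0f;

    const float start = std::floor(corr_dim(n_dims, n_ctx_orig, beta_fast, freq_base));
    const float end   = std::ceil(corr_dim(n_dims, n_ctx_orig, beta_slow, freq_base));
    std::printf("corr_dims = [%g, %g]\n", std::fmax(0.0f, start), std::fmin((float) n_dims - 1, end));
    return 0;
}

Dimensions below corr_dims[0] keep the extrapolated theta, dimensions above corr_dims[1] keep the interpolated one, and the clamped ramp in rope_yarn_ramp_mix blends the two in between.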
@@ -1,6 +1,10 @@
#pragma once

#include <memory>
#include <openvino/core/node.hpp>
#include <openvino/op/shape_of.hpp>
#include <openvino/op/slice.hpp>
#include <utility>

#include "node_context.hpp"
@@ -60,6 +64,12 @@ std::shared_ptr<ov::Node> get_dimensions(const std::shared_ptr<ov::Node>& node,

OutputVector rename_outputs_with_suffix(const OutputVector& outputs, const std::string& suffix);

std::pair<ov::Output<Node>, ov::Output<Node>> make_sin_cos(int32_t* rope_params,
                                                           std::shared_ptr<ov::Node> inp_pos,
                                                           std::shared_ptr<ov::Node> rope_freqs_weight = nullptr);

ov::Output<ov::Node> process_view_input(const NodeContext& context, int input_index, int slice_len = 0);

namespace op {
template <typename T>
OutputVector translate_1to1_match_2_inputs(const NodeContext& context) {
@@ -21,6 +21,7 @@
#include <vector>

#include "ggml-impl.h"
#include "ggml-openvino/ggml-decoder.h"
#include "ggml.h"
#include "openvino/frontend.hpp"
#include "openvino/input_model.hpp"
@@ -35,6 +36,9 @@ ov::Tensor convert_ggml_input_to_ov(std::shared_ptr<GgmlOvDecoder> ggml_decoder,
    ov::Shape input_shape;
    if (name.find("cache_k") == 0 || name.find("cache_v") == 0) {
        input_shape = ggml_decoder->get_graph_input_shape(ggml_tensor).to_shape();
    } else if (ggml_tensor->op == GGML_OP_VIEW) {
        // This case is added to make test-backend-ops work
        input_shape = ggml_decoder->get_graph_input_shape(ggml_tensor->view_src).to_shape();
    } else {
        input_shape = ggml_decoder->get_input_shape(name).to_shape();
    }
@@ -81,6 +85,10 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
        config = get_npu_config();
    }

    if (cgraph->n_nodes == 1) {
        return naive_compute(cgraph, core, device, config);
    }

    auto start_time = ggml_time_us();

    auto* cache_dir = getenv("GGML_OPENVINO_CACHE_DIR");
@@ -242,6 +250,42 @@ ov::AnyMap get_npu_config() {
    return config;
}

enum ggml_status naive_compute(struct ggml_cgraph* cgraph,
                               ov::Core& core,
                               const std::string& device,
                               const ov::AnyMap& config) {
    if (cgraph->nodes[0]->op == GGML_OP_NONE) {
        return GGML_STATUS_SUCCESS;
    }

    auto decoder = std::make_shared<GgmlOvDecoder>(cgraph);
    auto input_model = std::make_shared<ov::frontend::ggml::InputModel>(decoder);
    auto naive = true;
    auto model = ov::frontend::ggml::FrontEnd::convert(input_model, naive);
    auto infer_request = core.compile_model(model, device, config).create_infer_request();

    ov::serialize(model, "IR.xml");

    auto ov_params = model->get_parameters();
    for (size_t i = 0; i < ov_params.size(); i++) {
        auto param_name = ov_params[i]->get_friendly_name();
        auto input_tensor = get_ov_input_tensor(decoder, param_name);
        infer_request.set_input_tensor(i, input_tensor);
    }

    infer_request.infer();

    auto gguf_tensor_addrs = get_ggml_graph_output_dst(decoder);
    auto ov_results = model->get_results();
    for (size_t i = 0; i < ov_results.size(); i++) {
        auto result_name = ov_results[i]->get_friendly_name();
        const auto output_tensor = infer_request.get_output_tensor(i);

        std::memcpy(gguf_tensor_addrs[result_name], output_tensor.data(), output_tensor.get_byte_size());
    }
    return GGML_STATUS_SUCCESS;
}

ov::Tensor get_ov_input_tensor(std::shared_ptr<GgmlOvDecoder> ggml_decoder, const std::string& param_name) {
    bool is_static = ggml_decoder->is_static();
    bool is_first_token = ggml_decoder->is_first_token();
@@ -1,4 +1,5 @@
#include <algorithm>
#include <openvino/runtime/core.hpp>

#include "ggml-backend-impl.h"
#include "ggml-decoder.h"
@@ -42,3 +43,6 @@ bool is_prefill(struct ggml_cgraph * cgraph);
ov::AnyMap get_npu_config();

ov::Tensor get_ov_input_tensor(std::shared_ptr<GgmlOvDecoder> ggml_decoder, const std::string& param_name);

enum ggml_status naive_compute(struct ggml_cgraph* cgraph, ov::Core& core, const std::string& device,
                               const ov::AnyMap& config);
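Note on the test-backend-ops support: the cgraph->n_nodes == 1 branch plus the naive decoder exist so the upstream llama.cpp test-backend-ops tool, which feeds one-op graphs to each backend, can exercise this backend (invocations like `test-backend-ops test -o ROPE` with the backend selected via `-b`; exact flag spelling should be checked against the current tool). The sketch below is not from the commit; it only shows, with assumed sizes, the kind of single-op graph that ends up in naive_compute().

// Minimal single-op ggml graph (assumed example sizes).
#include "ggml.h"

static struct ggml_cgraph * build_single_add_graph(struct ggml_context * ctx) {
    struct ggml_tensor * a   = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 64);
    struct ggml_tensor * b   = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 64);
    struct ggml_tensor * out = ggml_add(ctx, a, b);   // one GGML_OP_ADD node
    struct ggml_cgraph * gf  = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, out);               // cgraph->n_nodes == 1
    return gf;                                        // routed through naive_compute()
}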