PERF: compile once (dynamic graph + cache)
This commit is contained in:
parent
7d5e234254
commit
a8e5efa44e
|
|
@ -58,6 +58,7 @@ public:
|
|||
virtual bool check_if_continuous() const = 0;
|
||||
|
||||
virtual const std::map<std::string, std::shared_ptr<ov::Node>>& get_model_inputs() const = 0;
|
||||
virtual const std::map<std::string, std::shared_ptr<ov::Node>>& get_model_extra_inputs() const = 0;
|
||||
virtual const std::map<std::string, std::shared_ptr<ov::Node>>& get_model_weights() const = 0;
|
||||
virtual const std::vector<std::string>& get_model_output_names() const = 0;
|
||||
};
|
||||
|
|
|
|||
|
|
@ -10,9 +10,11 @@
|
|||
#include <iomanip>
|
||||
#include <map>
|
||||
#include <memory>
|
||||
#include <openvino/core/dimension.hpp>
|
||||
#include <openvino/core/node.hpp>
|
||||
#include <openvino/core/type/float16.hpp>
|
||||
#include <openvino/op/constant.hpp>
|
||||
#include <openvino/runtime/tensor.hpp>
|
||||
#include <ostream>
|
||||
#include <set>
|
||||
#include <string>
|
||||
|
|
@ -35,6 +37,7 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgrap
|
|||
printed = true;
|
||||
}
|
||||
|
||||
set_max_token_len();
|
||||
for (int node_n = 0; node_n < m_cgraph->n_nodes; node_n++) {
|
||||
auto* cur_node = m_cgraph->nodes[node_n];
|
||||
m_nodes.push_back(cur_node);
|
||||
|
|
@ -42,6 +45,8 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgrap
|
|||
}
|
||||
m_model_weights = model_weights;
|
||||
|
||||
add_extra_inputs();
|
||||
|
||||
if (getenv("GGML_OPENVINO_DUMP_CGRAPH")) {
|
||||
dump_cgraph(m_cgraph);
|
||||
}
|
||||
|
|
@ -102,7 +107,16 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node,
|
|||
if (m_model_inputs.find(src_name) != m_model_inputs.end()) {
|
||||
continue;
|
||||
}
|
||||
auto param_node = std::make_shared<ov::op::v0::Parameter>(get_ov_type(src), ov::Shape{get_shape(src)});
|
||||
ov::PartialShape input_shape;
|
||||
if (std::string(src->name) == "inp_tokens" || std::string(src->name) == "inp_pos") {
|
||||
input_shape = ov::PartialShape{1, 1, ov::Dimension(1, m_max_token_len)};
|
||||
} else if (std::string(src->name).find("KQ_mask") == 0) {
|
||||
input_shape =
|
||||
ov::PartialShape{1, ov::Dimension(1, m_max_token_len), ov::Dimension(1, m_max_token_len)};
|
||||
} else {
|
||||
input_shape = ov::Shape{get_shape(src)};
|
||||
}
|
||||
auto param_node = std::make_shared<ov::op::v0::Parameter>(get_ov_type(src), input_shape);
|
||||
param_node->set_friendly_name(src_name);
|
||||
m_model_inputs[src_name] = param_node;
|
||||
}
|
||||
|
|
@ -146,6 +160,57 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node,
|
|||
}
|
||||
}
|
||||
|
||||
void GgmlOvDecoder::set_max_token_len() {
|
||||
for (int i = 0; i < m_cgraph->n_nodes; i++) {
|
||||
auto* node = m_cgraph->nodes[i];
|
||||
if (std::string(node->name) == "v-0") {
|
||||
auto* cache_v = node->src[0];
|
||||
m_max_token_len = cache_v->ne[0] / node->ne[1] / node->ne[2];
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void GgmlOvDecoder::add_extra_inputs() {
|
||||
int64_t past_token_len;
|
||||
int64_t attention_size;
|
||||
|
||||
for (const auto& node : m_nodes) {
|
||||
if (node->op == GGML_OP_CPY && ggml_is_contiguous(node)) {
|
||||
assert(std::string(node->view_src->name).find("cache_k") == 0);
|
||||
int64_t head_size = node->src[0]->ne[0];
|
||||
int64_t num_heads = node->src[0]->ne[1];
|
||||
past_token_len = (int64_t)(node->src[1]->op_params[0] / node->src[1]->nb[0] / head_size / num_heads);
|
||||
|
||||
std::string name = "past_token_len";
|
||||
auto param_node = std::make_shared<ov::op::v0::Parameter>(ov::element::i64, ov::Shape{});
|
||||
param_node->set_friendly_name(name);
|
||||
m_model_extra_inputs[name] = param_node;
|
||||
|
||||
auto tensor = std::make_shared<ov::Tensor>(ov::element::i64, ov::Shape{});
|
||||
*tensor->data<int64_t>() = past_token_len;
|
||||
m_model_extra_input_values[name] = tensor;
|
||||
break;
|
||||
}
|
||||
}
|
||||
for (const auto& node : m_nodes) {
|
||||
if (node->src[1] && std::string(node->src[1]->name).find("inp_tokens") == 0) {
|
||||
int64_t total_token_len = node->src[1]->ne[0] + past_token_len;
|
||||
attention_size = (total_token_len + 31) / 32 * 32;
|
||||
|
||||
std::string name = "attention_size";
|
||||
auto param_node = std::make_shared<ov::op::v0::Parameter>(ov::element::i64, ov::Shape{1});
|
||||
param_node->set_friendly_name(name);
|
||||
m_model_extra_inputs[name] = param_node;
|
||||
|
||||
auto tensor = std::make_shared<ov::Tensor>(ov::element::i64, ov::Shape{1});
|
||||
*tensor->data<int64_t>() = attention_size;
|
||||
m_model_extra_input_values[name] = tensor;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor* tensor) {
|
||||
std::shared_ptr<ov::Node> weight_node;
|
||||
auto node_type = get_ov_type(tensor);
|
||||
|
|
|
|||
|
|
@ -1,5 +1,6 @@
|
|||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <map>
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
|
|
@ -79,6 +80,12 @@ public:
|
|||
virtual const std::map<std::string, std::shared_ptr<ov::Node>>& get_model_inputs() const override {
|
||||
return m_model_inputs;
|
||||
}
|
||||
virtual const std::map<std::string, std::shared_ptr<ov::Node>>& get_model_extra_inputs() const override {
|
||||
return m_model_extra_inputs;
|
||||
}
|
||||
virtual const std::map<std::string, std::shared_ptr<ov::Tensor>>& get_model_extra_input_values() const {
|
||||
return m_model_extra_input_values;
|
||||
}
|
||||
virtual const std::map<std::string, std::shared_ptr<ov::Node>>& get_model_weights() const override {
|
||||
return m_model_weights;
|
||||
}
|
||||
|
|
@ -88,12 +95,16 @@ public:
|
|||
|
||||
private:
|
||||
void set_input_output(ggml_tensor* node, std::map<std::string, std::shared_ptr<ov::Node>>& model_weights);
|
||||
void add_extra_inputs();
|
||||
static void dump_cgraph(const struct ggml_cgraph* cgraph);
|
||||
static std::vector<size_t> get_shape(const ggml_tensor* tensor);
|
||||
static std::vector<size_t> get_stride(const ggml_tensor* tensor);
|
||||
static ov::element::Type get_ov_type(const ggml_tensor* tensor);
|
||||
static std::shared_ptr<ov::Node> create_weight_node(ggml_tensor* tensor);
|
||||
|
||||
void set_max_token_len();
|
||||
int64_t m_max_token_len;
|
||||
|
||||
struct ggml_cgraph * m_cgraph;
|
||||
std::map<std::string, ggml_tensor *> m_inputs;
|
||||
std::vector<std::string> m_input_names;
|
||||
|
|
@ -106,6 +117,8 @@ private:
|
|||
bool m_continuous;
|
||||
std::vector<std::pair<std::string, std::string>> m_op_node_name;
|
||||
std::map<std::string, std::shared_ptr<ov::Node>> m_model_inputs;
|
||||
std::map<std::string, std::shared_ptr<ov::Node>> m_model_extra_inputs;
|
||||
std::map<std::string, std::shared_ptr<ov::Tensor>> m_model_extra_input_values;
|
||||
std::map<std::string, std::shared_ptr<ov::Node>> m_model_weights;
|
||||
std::vector<std::string> m_model_output_names;
|
||||
};
|
||||
|
|
|
|||
|
|
@ -3,10 +3,14 @@
|
|||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <cstdlib>
|
||||
#include <memory>
|
||||
#include <openvino/core/graph_util.hpp>
|
||||
#include <openvino/core/type/float16.hpp>
|
||||
#include <openvino/frontend/manager.hpp>
|
||||
#include <openvino/openvino.hpp>
|
||||
#include <openvino/runtime/compiled_model.hpp>
|
||||
#include <openvino/runtime/tensor.hpp>
|
||||
#include <unordered_map>
|
||||
|
||||
#include "ggml-impl.h"
|
||||
#include "ggml.h"
|
||||
|
|
@ -63,61 +67,65 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
|
|||
return GGML_STATUS_FAILED;
|
||||
}
|
||||
|
||||
using CachedItem = std::pair<std::shared_ptr<ov::Model>, ov::CompiledModel>;
|
||||
static std::unordered_map<struct ggml_cgraph*, CachedItem> compiled_cache;
|
||||
|
||||
std::shared_ptr<ov::Model> model;
|
||||
ov::CompiledModel compiled_model;
|
||||
int64_t conversion_end_time;
|
||||
int64_t compile_end_time;
|
||||
|
||||
auto ggml_decoder = get_ggml_decoder(cgraph);
|
||||
std::shared_ptr<ov::frontend::DecoderBase> graph_decoder = ggml_decoder;
|
||||
ov::frontend::InputModel::Ptr input_model = front_end->load(graph_decoder);
|
||||
if (!input_model) {
|
||||
GGML_LOG_ERROR("Input Model is not loaded \n");
|
||||
return GGML_STATUS_FAILED;
|
||||
auto it = compiled_cache.find(cgraph);
|
||||
if (it != compiled_cache.end()) {
|
||||
model = it->second.first;
|
||||
conversion_end_time = ggml_time_us();
|
||||
|
||||
compiled_model = it->second.second;
|
||||
compile_end_time = ggml_time_us();
|
||||
} else {
|
||||
std::shared_ptr<ov::frontend::DecoderBase> graph_decoder = ggml_decoder;
|
||||
ov::frontend::InputModel::Ptr input_model = front_end->load(graph_decoder);
|
||||
if (!input_model) {
|
||||
GGML_LOG_ERROR("Input Model is not loaded \n");
|
||||
return GGML_STATUS_FAILED;
|
||||
}
|
||||
|
||||
model = front_end->convert(input_model);
|
||||
conversion_end_time = ggml_time_us();
|
||||
|
||||
if (getenv("GGML_OPENVINO_DUMP_IR")) {
|
||||
char timestamped_filename[64];
|
||||
auto timestamp = (long long)ggml_time_us();
|
||||
snprintf(timestamped_filename, sizeof(timestamped_filename), "model_%lld.xml", timestamp);
|
||||
ov::serialize(model, timestamped_filename);
|
||||
}
|
||||
|
||||
if (!model) {
|
||||
GGML_LOG_ERROR("Model is not converted \n");
|
||||
}
|
||||
compiled_model = core.compile_model(model, "CPU");
|
||||
compile_end_time = ggml_time_us();
|
||||
|
||||
compiled_cache[cgraph] = std::make_pair(model, compiled_model);
|
||||
}
|
||||
|
||||
std::shared_ptr<ov::Model> model = front_end->convert(input_model);
|
||||
auto conversion_end_time = ggml_time_us();
|
||||
|
||||
if (getenv("GGML_OPENVINO_DUMP_IR")) {
|
||||
char timestamped_filename[64];
|
||||
auto timestamp = (long long)ggml_time_us();
|
||||
snprintf(timestamped_filename, sizeof(timestamped_filename), "model_%lld.xml", timestamp);
|
||||
ov::serialize(model, timestamped_filename);
|
||||
}
|
||||
|
||||
if (!model) {
|
||||
GGML_LOG_ERROR("Model is not converted \n");
|
||||
}
|
||||
|
||||
ov::CompiledModel compiled_model = core.compile_model(model, "CPU");
|
||||
auto compile_end_time = ggml_time_us();
|
||||
|
||||
ov::InferRequest infer_request = compiled_model.create_infer_request();
|
||||
auto infer_request_start_time = ggml_time_us();
|
||||
|
||||
auto input_names = ggml_decoder->get_input_names();
|
||||
auto ov_params = model->get_parameters();
|
||||
for (size_t i = 0; i < ov_params.size(); i++) {
|
||||
auto param_name = ov_params[i]->get_friendly_name();
|
||||
auto input_tensor = get_ggml_graph_input_tensor(ggml_decoder, param_name);
|
||||
|
||||
if (getenv("GGML_OPENVINO_DEBUG_INPUT")) {
|
||||
std::cout << "Input name: " << param_name << ", Input shape: " << input_tensor.get_shape()
|
||||
<< ", Address: " << input_tensor.data() << std::endl;
|
||||
switch (input_tensor.get_element_type()) {
|
||||
case ov::element::f32:
|
||||
std::cout << *(float*)(input_tensor.data()) << std::endl;
|
||||
break;
|
||||
case ov::element::f16:
|
||||
std::cout << ov::float16::from_bits(*(uint16_t*)(input_tensor.data())) << std::endl;
|
||||
break;
|
||||
case ov::element::i32:
|
||||
std::cout << *(int32_t*)(input_tensor.data()) << std::endl;
|
||||
break;
|
||||
case ov::element::i64:
|
||||
std::cout << *(int64_t*)(input_tensor.data()) << std::endl;
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
ov::Tensor input_tensor;
|
||||
if (ggml_decoder->get_model_extra_inputs().find(param_name) != ggml_decoder->get_model_extra_inputs().end()) {
|
||||
input_tensor = *ggml_decoder->get_model_extra_input_values().at(param_name);
|
||||
} else {
|
||||
input_tensor = get_ggml_graph_input_tensor(ggml_decoder, param_name);
|
||||
}
|
||||
infer_request.set_input_tensor(i, input_tensor);
|
||||
|
||||
if (getenv("GGML_OPENVINO_DEBUG_INPUT")) {
|
||||
print_input_tensor_info(param_name, input_tensor);
|
||||
}
|
||||
}
|
||||
auto input_end_time = ggml_time_us();
|
||||
|
||||
|
|
@ -131,20 +139,7 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
|
|||
std::memcpy(output_tensors[output_names[i]], output_tensor.data(), output_tensor.get_byte_size());
|
||||
|
||||
if (getenv("GGML_OPENVINO_DEBUG_OUTPUT")) {
|
||||
std::cout << "Output name: " << output_names[i] << ", Output shape: " << output_tensor.get_shape()
|
||||
<< ", Address: " << output_tensors[output_names[i]] << std::endl;
|
||||
switch (output_tensor.get_element_type()) {
|
||||
case ov::element::f32:
|
||||
std::cout << *(float*)(output_tensor.data()) << std::endl;
|
||||
std::cout << checksum(output_tensor.data(), output_tensor.get_byte_size()) << std::endl;
|
||||
break;
|
||||
case ov::element::f16:
|
||||
std::cout << ov::float16::from_bits(*(uint16_t*)(output_tensor.data())) << std::endl;
|
||||
std::cout << checksum(output_tensor.data(), output_tensor.get_byte_size()) << std::endl;
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
print_output_tensor_info(output_names[i], output_tensor, output_tensors);
|
||||
}
|
||||
}
|
||||
auto end_time = ggml_time_us();
|
||||
|
|
@ -153,9 +148,7 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
|
|||
GGML_LOG_INFO("GGML OpenVINO Backend: \n");
|
||||
GGML_LOG_INFO(" - Graph conversion Time: %ld ms \n", (conversion_end_time - start_time) / 1000);
|
||||
GGML_LOG_INFO(" - Graph compile Time: %ld ms \n", (compile_end_time - conversion_end_time) / 1000);
|
||||
GGML_LOG_INFO(" - Graph InferRequest created Time: %ld ms \n",
|
||||
(infer_request_start_time - compile_end_time) / 1000);
|
||||
GGML_LOG_INFO(" - Graph Input Time: %ld ms \n", (input_end_time - infer_request_start_time) / 1000);
|
||||
GGML_LOG_INFO(" - Graph Input Time: %ld ms \n", (input_end_time - compile_end_time) / 1000);
|
||||
GGML_LOG_INFO(" - Graph Inference Time: %ld ms \n", (infer_end_time - input_end_time) / 1000);
|
||||
GGML_LOG_INFO(" - Graph Output Time: %ld ms \n", (end_time - infer_end_time) / 1000);
|
||||
}
|
||||
|
|
@ -172,3 +165,43 @@ size_t checksum(const void* data, size_t size) {
|
|||
}
|
||||
return sum;
|
||||
}
|
||||
|
||||
// Debug helper (GGML_OPENVINO_DEBUG_INPUT): prints an input tensor's name,
// shape, data address, and its first element for the common element types.
// Element types not listed in the switch are silently skipped.
void print_input_tensor_info(const std::string& name, const ov::Tensor& tensor) {
    std::cout << "Input name: " << name << ", Input shape: " << tensor.get_shape() << ", Address: " << tensor.data()
              << std::endl;
    // Use named casts instead of C-style casts when reinterpreting the
    // untyped data() pointer.
    switch (tensor.get_element_type()) {
        case ov::element::f32:
            std::cout << *static_cast<const float*>(tensor.data()) << std::endl;
            break;
        case ov::element::f16:
            std::cout << ov::float16::from_bits(*static_cast<const uint16_t*>(tensor.data())) << std::endl;
            break;
        case ov::element::i32:
            std::cout << *static_cast<const int32_t*>(tensor.data()) << std::endl;
            break;
        case ov::element::i64:
            std::cout << *static_cast<const int64_t*>(tensor.data()) << std::endl;
            break;
        default:
            break;
    }
}
|
||||
|
||||
void print_output_tensor_info(const std::string& name,
|
||||
const ov::Tensor& tensor,
|
||||
std::map<std::string, void*>& output_dst) {
|
||||
std::cout << "Output name: " << name << ", Output shape: " << tensor.get_shape()
|
||||
<< ", Address: " << output_dst[name] << std::endl;
|
||||
switch (tensor.get_element_type()) {
|
||||
case ov::element::f32:
|
||||
std::cout << *(float*)(tensor.data()) << std::endl;
|
||||
std::cout << checksum(tensor.data(), tensor.get_byte_size()) << std::endl;
|
||||
break;
|
||||
case ov::element::f16:
|
||||
std::cout << ov::float16::from_bits(*(uint16_t*)(tensor.data())) << std::endl;
|
||||
std::cout << checksum(tensor.data(), tensor.get_byte_size()) << std::endl;
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -4,3 +4,9 @@
|
|||
enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_cgraph* cgraph);
|
||||
|
||||
size_t checksum(const void* data, size_t size);
|
||||
|
||||
void print_input_tensor_info(const std::string& name, const ov::Tensor& tensor);
|
||||
|
||||
void print_output_tensor_info(const std::string& name,
|
||||
const ov::Tensor& tensor,
|
||||
std::map<std::string, void*>& output_dst);
|
||||
|
|
|
|||
Loading…
Reference in New Issue