From 01cdf4a9cc685fe7c5f1a64b20fd7c02e8383083 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Tue, 29 Jul 2025 14:07:03 +0800 Subject: [PATCH] matmul in fp32 --- ggml/src/ggml-openvino/ggml-decoder.cpp | 1 + ggml/src/ggml-openvino/ggml-decoder.h | 2 +- ggml/src/ggml-openvino/openvino/op/mulmat.cpp | 14 ++------- .../ggml-openvino/openvino/op/soft_max.cpp | 7 ++--- .../openvino/pass/fuse_to_sdpa.cpp | 11 +++---- .../openvino/translate_session.cpp | 29 ++++++++++--------- .../openvino/translate_session.hpp | 2 +- 7 files changed, 28 insertions(+), 38 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index b43f45dbbd..f7846382b9 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -212,6 +212,7 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, bool naive) { } else { m_op_case = 1; } + break; } default: break; diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index 78422afaf7..c1970af53a 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -139,7 +139,7 @@ private: std::vector m_output_names; std::string m_op_name; mutable std::string m_name; - int m_op_case; + int m_op_case = 0; std::vector> m_op_node_name; std::map> m_model_inputs; std::map> m_model_extra_inputs; diff --git a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp index aa230550a4..57fd476f0a 100644 --- a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp +++ b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp @@ -29,15 +29,8 @@ OutputVector translate_mulmat(const NodeContext& context) { ov::Output res; ov::Output B = context.get_input(0); ov::Output A = context.get_input(1); - if (context.get_op_case() == 1) { - if (context.get_input_type(0) == ov::element::f16) { - B = std::make_shared(context.get_input(0), ov::element::f32); - } - if (context.get_input_type(1) == ov::element::f16) { - A = std::make_shared(context.get_input(1), ov::element::f32); - } - } else { - A = std::make_shared(context.get_input(1), context.get_input_type(0)); + if (context.get_input_type(0) != context.get_input_type(1)) { + B = std::make_shared(context.get_input(0), context.get_input_type(1)); } auto B_shape = context.get_input_shape(0).to_shape(); @@ -72,8 +65,7 @@ OutputVector translate_mulmat(const NodeContext& context) { A = Z; } - auto result_lp = std::make_shared(A, B, false, true); - res = std::make_shared(result_lp, context.get_output_type(0)); + res = std::make_shared(A, B, false, true); return rename_outputs_with_suffix({res}, context.get_name()); } diff --git a/ggml/src/ggml-openvino/openvino/op/soft_max.cpp b/ggml/src/ggml-openvino/openvino/op/soft_max.cpp index 001a62be8b..401acaf865 100644 --- a/ggml/src/ggml-openvino/openvino/op/soft_max.cpp +++ b/ggml/src/ggml-openvino/openvino/op/soft_max.cpp @@ -57,11 +57,8 @@ OutputVector translate_soft_max(const NodeContext& context) { // Try using Q-cur to retrieve the token length, so that the translation of SOFT_MAX // does not depend on the result of the QK MatMul, so that QK matmul + softmax + qkv matmul // can be fused into SDPA. - if (input_node->get_type_info() == ov::op::v0::Convert::get_type_info_static()) { - auto qk = input_node->get_input_node_shared_ptr(0); - if (qk->get_type_info() == ov::op::v0::MatMul::get_type_info_static()) { - token_len = get_dimensions(qk->get_input_node_shared_ptr(0), {1}); - } + if (input_node->get_type_info() == ov::op::v0::MatMul::get_type_info_static()) { + token_len = get_dimensions(input_node->get_input_node_shared_ptr(0), {1}); } auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); diff --git a/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.cpp b/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.cpp index 1b7ac60271..aa6e28b627 100644 --- a/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.cpp +++ b/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.cpp @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -22,15 +23,13 @@ FuseToSDPA::FuseToSDPA() { const auto m_k = ov::pass::pattern::any_input(); const auto m_q = ov::pass::pattern::any_input(); const auto m_qk = ov::pass::pattern::wrap_type({m_q, m_k}); - const auto m_qk_f32 = ov::pass::pattern::wrap_type({m_qk}); const auto m_scale = ov::pass::pattern::any_input(); - const auto m_scaled_qk = ov::pass::pattern::wrap_type({m_qk_f32, m_scale}); + const auto m_scaled_qk = ov::pass::pattern::wrap_type({m_qk, m_scale}); const auto m_mask = ov::pass::pattern::any_input(); const auto m_masked_qk = ov::pass::pattern::wrap_type({m_scaled_qk, m_mask}); const auto m_softmax_qk = ov::pass::pattern::wrap_type({m_masked_qk}); - const auto m_softmax_qk_f16 = ov::pass::pattern::wrap_type({m_softmax_qk}); const auto m_v = ov::pass::pattern::any_input(); - const auto m_qkv = ov::pass::pattern::wrap_type({m_softmax_qk_f16, m_v}); + const auto m_qkv = ov::pass::pattern::wrap_type({m_softmax_qk, m_v}); const auto callback = [=](ov::pass::pattern::Matcher& m) { auto& pattern_to_output = m.get_pattern_value_map(); @@ -42,9 +41,7 @@ FuseToSDPA::FuseToSDPA() { auto v_trans = register_new_node(v, ov::op::v0::Constant::create(ov::element::i64, {3}, {0, 2, 1})); - auto mask_f16 = register_new_node(mask, ov::element::f16); - auto scale_f16 = register_new_node(scale, ov::element::f16); - auto sdpa = std::make_shared(q, k, v_trans, mask_f16, scale_f16, false); + auto sdpa = std::make_shared(q, k, v_trans, mask, scale, false); ov::replace_node(m.get_match_root(), sdpa); ov::copy_runtime_info(m.get_matched_nodes(), sdpa); diff --git a/ggml/src/ggml-openvino/openvino/translate_session.cpp b/ggml/src/ggml-openvino/openvino/translate_session.cpp index 563613aa7f..c4fe8c88ee 100644 --- a/ggml/src/ggml-openvino/openvino/translate_session.cpp +++ b/ggml/src/ggml-openvino/openvino/translate_session.cpp @@ -22,7 +22,7 @@ #include #include #include -#include +#include #include "ggml-openvino/openvino/node_context.hpp" #include "ggml-openvino/openvino/utils.hpp" @@ -254,22 +254,25 @@ std::shared_ptr TranslateSession::translate_graph(const frontend::InputMo return resulting_model; } -void TranslateSession::apply_transformations(const std::shared_ptr& model) { +std::shared_ptr TranslateSession::apply_transformations(std::shared_ptr model) { auto ggml_model_decoder = std::dynamic_pointer_cast(m_input_model)->get_model_decoder(); + { + ov::pass::Manager manager; + manager.set_per_pass_validation(true); - ov::pass::Manager manager; - manager.set_per_pass_validation(true); - manager.register_pass(); - manager.register_pass(); + if (!ggml_model_decoder->is_static()) { + const auto kv_param_res_names = ggml_model_decoder->get_kv_param_res_names(); + const auto kv_param_res_pairs = get_kv_param_res_pairs(model, kv_param_res_names); + manager.register_pass(kv_param_res_pairs); + } - if (!ggml_model_decoder->is_static()) { - const auto kv_param_res_names = ggml_model_decoder->get_kv_param_res_names(); - const auto kv_param_res_pairs = get_kv_param_res_pairs(model, kv_param_res_names); - manager.register_pass(kv_param_res_pairs); + // SDPA is even worse on performance + // manager.register_pass(); + manager.run_passes(model); } - - manager.register_pass(); - manager.run_passes(model); + auto preprocessor = ov::preprocess::PrePostProcessor(model); + model = preprocessor.build(); + return model; } } // namespace ggml diff --git a/ggml/src/ggml-openvino/openvino/translate_session.hpp b/ggml/src/ggml-openvino/openvino/translate_session.hpp index 9eea5fd11c..7072d4a9e8 100644 --- a/ggml/src/ggml-openvino/openvino/translate_session.hpp +++ b/ggml/src/ggml-openvino/openvino/translate_session.hpp @@ -16,7 +16,7 @@ public: std::shared_ptr translate_graph(const frontend::InputModel::Ptr& input_model); private: - void apply_transformations(const std::shared_ptr& model); + std::shared_ptr apply_transformations(std::shared_ptr model); const frontend::InputModel::Ptr m_input_model; const std::unordered_map& m_translator_map; std::shared_ptr m_ov_model;