From 602f9ca4afa1fa0402f0cb2a263ab202deb488ce Mon Sep 17 00:00:00 2001
From: "Yu, Zijun"
Date: Wed, 17 Sep 2025 11:16:14 +0800
Subject: [PATCH] Fix NPU accuracy

---
 .../openvino/translate_session.cpp            | 25 +++++++++++--------
 ggml/src/ggml-openvino/utils.cpp              |  5 +---
 2 files changed, 16 insertions(+), 14 deletions(-)

diff --git a/ggml/src/ggml-openvino/openvino/translate_session.cpp b/ggml/src/ggml-openvino/openvino/translate_session.cpp
index 9c82fe5f85..c37aa21602 100644
--- a/ggml/src/ggml-openvino/openvino/translate_session.cpp
+++ b/ggml/src/ggml-openvino/openvino/translate_session.cpp
@@ -77,23 +77,28 @@ void add_token_len(TensorMap& tensor_map) {
     tensor_map.insert({"token_len", token_len->output(0)});
 }
 
-void add_sliced_mask(TensorMap& tensor_map) {
+void add_sliced_mask(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) {
     auto token_len = tensor_map.at("token_len").get_node_shared_ptr();
 
-    auto create_sliced_mask = [&](const std::string& mask_name, const std::string& sliced_name) {
+    auto create_sliced_mask = [&](const std::string& mask_name, const std::string& sliced_name, bool is_static) {
         if (tensor_map.find(mask_name) != tensor_map.end()) {
-            auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
-            auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
             auto mask = tensor_map.at(mask_name).get_node_shared_ptr();
-            std::shared_ptr<ov::Node> mask_sliced =
-                std::make_shared<ov::op::v8::Slice>(mask, zero, token_len, one, one);
-            mask_sliced->set_friendly_name(sliced_name);
+            std::shared_ptr<ov::Node> mask_sliced;
+            if (is_static) {
+                mask_sliced = mask;
+            } else {
+                auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
+                auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
+                mask_sliced = std::make_shared<ov::op::v8::Slice>(mask, zero, token_len, one, one);
+                mask_sliced = std::make_shared<ov::op::v0::Convert>(mask_sliced, ov::element::f16);
+                mask_sliced->set_friendly_name(sliced_name);
+            }
             tensor_map.insert({sliced_name, mask_sliced->output(0)});
         }
     };
 
-    create_sliced_mask("KQ_mask", "KQ_mask_sliced");
-    create_sliced_mask("KQ_mask_swa", "KQ_mask_swa_sliced");
+    create_sliced_mask("KQ_mask", "KQ_mask_sliced", ggml_model_decoder.is_static());
+    create_sliced_mask("KQ_mask_swa", "KQ_mask_swa_sliced", ggml_model_decoder.is_static());
 }
 
 void add_rope_sin_cos(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) {
@@ -117,7 +122,7 @@ void add_rope_sin_cos(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) {
 // Create common patterns
 void preprocess(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) {
     add_token_len(tensor_map);
-    add_sliced_mask(tensor_map);
+    add_sliced_mask(tensor_map, ggml_model_decoder);
     add_rope_sin_cos(tensor_map, ggml_model_decoder);
 }
 
diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp
index 8724404098..db47163645 100644
--- a/ggml/src/ggml-openvino/utils.cpp
+++ b/ggml/src/ggml-openvino/utils.cpp
@@ -253,6 +253,7 @@ ov::AnyMap get_npu_base_config() {
         {"NPUW_FOLD", "YES" },
         {"NPUW_WEIGHTS_BANK", "shared" },
         {"NPUW_FUNCALL_FOR_ALL", "YES" },
+        {"NPUW_FUNCALL_ASYNC", "YES" },
         {"NPUW_DQ", "YES" },
         {"NPUW_DQ_FULL", "NO" },
         {"NPUW_CACHE_DIR", getenv("GGML_OPENVINO_CACHE_DIR") ? getenv("GGML_OPENVINO_CACHE_DIR") : ""},
@@ -262,15 +263,11 @@ ov::AnyMap get_npu_base_config() {
 
 ov::AnyMap get_npu_prefill_config() {
     auto config = get_npu_base_config();
-    config.emplace("NPUW_FUNCALL_ASYNC", "NO");
-    config.emplace("NPUW_ACC_CHECK", "YES");
-    config.emplace("NPUW_ACC_DEVICE", "CPU");
     return config;
 }
 
 ov::AnyMap get_npu_generate_config() {
     auto config = get_npu_base_config();
-    config.emplace("NPUW_FUNCALL_ASYNC", "YES");
     return config;
 }
 