Fix NPU accuracy

This commit is contained in:
Yu, Zijun 2025-09-17 11:16:14 +08:00 committed by Mustafa Cavus
parent 9de874cb7b
commit 602f9ca4af
2 changed files with 16 additions and 14 deletions

View File

@@ -77,23 +77,28 @@ void add_token_len(TensorMap& tensor_map) {
tensor_map.insert({"token_len", token_len->output(0)});
}
// Registers "sliced" variants of the attention masks ("KQ_mask" and
// "KQ_mask_swa") in tensor_map, for whichever of those masks is present.
//
// NOTE(review): this listing looks like a rendered diff without +/- markers,
// so superseded and replacement lines appear together (two signatures, two
// lambda headers, two sets of create_sliced_mask calls). Judging by the hunk
// line counts, the one-parameter signature and the two-argument lambda are
// presumably the removed versions -- confirm against the repository.
void add_sliced_mask(TensorMap& tensor_map) {
void add_sliced_mask(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) {
// Node registered under "token_len"; used below as the Slice stop value.
auto token_len = tensor_map.at("token_len").get_node_shared_ptr();
// Helper: when mask_name exists in tensor_map, derive a node from it and
// register that node's first output under sliced_name. Does nothing when the
// mask is absent.
auto create_sliced_mask = [&](const std::string& mask_name, const std::string& sliced_name) {
auto create_sliced_mask = [&](const std::string& mask_name, const std::string& sliced_name, bool is_static) {
if (tensor_map.find(mask_name) != tensor_map.end()) {
auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
auto mask = tensor_map.at(mask_name).get_node_shared_ptr();
// Variant without is_static: Slice only, result keeps the mask's element type.
std::shared_ptr<ov::Node> mask_sliced =
std::make_shared<ov::op::v8::Slice>(mask, zero, token_len, one, one);
mask_sliced->set_friendly_name(sliced_name);
// Variant with is_static: the node is chosen per branch below.
std::shared_ptr<ov::Node> mask_sliced;
if (is_static) {
// Static case: the mask is registered under sliced_name unchanged -- no
// Slice, no Convert, and no friendly name is assigned in this branch.
mask_sliced = mask;
} else {
// Non-static case: Slice(data=mask, start=0, stop=token_len, step=1, axes=1),
// then convert the sliced mask to f16.
auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
mask_sliced = std::make_shared<ov::op::v8::Slice>(mask, zero, token_len, one, one);
mask_sliced = std::make_shared<ov::op::v0::Convert>(mask_sliced, ov::element::f16);
// The friendly name lands on the Convert node, the last value assigned.
mask_sliced->set_friendly_name(sliced_name);
}
tensor_map.insert({sliced_name, mask_sliced->output(0)});
}
};
// Calls matching the two-argument lambda header.
create_sliced_mask("KQ_mask", "KQ_mask_sliced");
create_sliced_mask("KQ_mask_swa", "KQ_mask_swa_sliced");
// Calls matching the three-argument lambda header; both masks use the same
// is_static() value read from the decoder.
create_sliced_mask("KQ_mask", "KQ_mask_sliced", ggml_model_decoder.is_static());
create_sliced_mask("KQ_mask_swa", "KQ_mask_swa_sliced", ggml_model_decoder.is_static());
}
void add_rope_sin_cos(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) {
@@ -117,7 +122,7 @@ void add_rope_sin_cos(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) {
// Create common patterns
// Runs the shared helpers in a fixed order: token length first, then the
// sliced masks, then the RoPE sin/cos setup. add_sliced_mask reads the
// "token_len" entry from tensor_map, so add_token_len has to run before it.
// NOTE(review): both the one-argument and the two-argument add_sliced_mask
// calls appear below because this listing looks like a rendered diff without
// +/- markers; only one of them is expected in the real file -- confirm.
void preprocess(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) {
add_token_len(tensor_map);
add_sliced_mask(tensor_map);
add_sliced_mask(tensor_map, ggml_model_decoder);
add_rope_sin_cos(tensor_map, ggml_model_decoder);
}

View File

@@ -253,6 +253,7 @@ ov::AnyMap get_npu_base_config() {
{"NPUW_FOLD", "YES" },
{"NPUW_WEIGHTS_BANK", "shared" },
{"NPUW_FUNCALL_FOR_ALL", "YES" },
{"NPUW_FUNCALL_ASYNC", "YES" },
{"NPUW_DQ", "YES" },
{"NPUW_DQ_FULL", "NO" },
{"NPUW_CACHE_DIR", getenv("GGML_OPENVINO_CACHE_DIR") ? getenv("GGML_OPENVINO_CACHE_DIR") : ""},
@@ -262,15 +263,11 @@ ov::AnyMap get_npu_base_config() {
// Returns the NPU configuration used for the prefill model, starting from
// get_npu_base_config().
// NOTE(review): this listing looks like a rendered diff without +/- markers;
// the emplace lines below may be the removed ones -- confirm against the
// repository.
// NOTE(review): assuming ov::AnyMap follows std::map semantics, emplace()
// leaves an already-present key untouched. With "NPUW_FUNCALL_ASYNC" listed
// in the base config, emplace("NPUW_FUNCALL_ASYNC", "NO") would not change
// the value; an override would need operator[] or insert_or_assign.
ov::AnyMap get_npu_prefill_config() {
auto config = get_npu_base_config();
config.emplace("NPUW_FUNCALL_ASYNC", "NO");
config.emplace("NPUW_ACC_CHECK", "YES");
config.emplace("NPUW_ACC_DEVICE", "CPU");
return config;
}
// Returns the NPU configuration used for the generate model, starting from
// get_npu_base_config().
// NOTE(review): this listing looks like a rendered diff without +/- markers;
// the emplace line below may be a removed line -- confirm against the
// repository. Assuming ov::AnyMap follows std::map semantics, the emplace is
// a no-op whenever the base config already contains "NPUW_FUNCALL_ASYNC".
ov::AnyMap get_npu_generate_config() {
auto config = get_npu_base_config();
config.emplace("NPUW_FUNCALL_ASYNC", "YES");
return config;
}