Fix NPU accuracy

commit 602f9ca4af
parent 9de874cb7b
@@ -77,23 +77,28 @@ void add_token_len(TensorMap& tensor_map) {
     tensor_map.insert({"token_len", token_len->output(0)});
 }
 
-void add_sliced_mask(TensorMap& tensor_map) {
+void add_sliced_mask(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) {
     auto token_len = tensor_map.at("token_len").get_node_shared_ptr();
 
-    auto create_sliced_mask = [&](const std::string& mask_name, const std::string& sliced_name) {
+    auto create_sliced_mask = [&](const std::string& mask_name, const std::string& sliced_name, bool is_static) {
         if (tensor_map.find(mask_name) != tensor_map.end()) {
-            auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
-            auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
             auto mask = tensor_map.at(mask_name).get_node_shared_ptr();
-            std::shared_ptr<ov::Node> mask_sliced =
-                std::make_shared<ov::op::v8::Slice>(mask, zero, token_len, one, one);
-            mask_sliced->set_friendly_name(sliced_name);
+            std::shared_ptr<ov::Node> mask_sliced;
+            if (is_static) {
+                mask_sliced = mask;
+            } else {
+                auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
+                auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
+                mask_sliced = std::make_shared<ov::op::v8::Slice>(mask, zero, token_len, one, one);
+                mask_sliced = std::make_shared<ov::op::v0::Convert>(mask_sliced, ov::element::f16);
+                mask_sliced->set_friendly_name(sliced_name);
+            }
             tensor_map.insert({sliced_name, mask_sliced->output(0)});
         }
     };
 
-    create_sliced_mask("KQ_mask", "KQ_mask_sliced");
-    create_sliced_mask("KQ_mask_swa", "KQ_mask_swa_sliced");
+    create_sliced_mask("KQ_mask", "KQ_mask_sliced", ggml_model_decoder.is_static());
+    create_sliced_mask("KQ_mask_swa", "KQ_mask_swa_sliced", ggml_model_decoder.is_static());
 }
 
 void add_rope_sin_cos(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) {
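For readability, here is the helper as it reads with this hunk applied, reconstructed from the + side of the diff (TensorMap, GgmlDecoder, and GgmlDecoder::is_static() are types and methods from the surrounding file, not defined here). On static-shape graphs the mask tensor is passed through unchanged; on dynamic graphs it is still sliced to the current token_len and is now additionally converted to f16:

// Post-patch shape of add_sliced_mask, reconstructed from the hunk above.
void add_sliced_mask(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) {
    auto token_len = tensor_map.at("token_len").get_node_shared_ptr();

    auto create_sliced_mask = [&](const std::string& mask_name, const std::string& sliced_name, bool is_static) {
        if (tensor_map.find(mask_name) != tensor_map.end()) {
            auto mask = tensor_map.at(mask_name).get_node_shared_ptr();
            std::shared_ptr<ov::Node> mask_sliced;
            if (is_static) {
                // Static shapes: the mask already has its final size, reuse it as-is.
                mask_sliced = mask;
            } else {
                // Dynamic shapes: slice the mask along axis 1 from 0 to token_len,
                // then convert the sliced mask to f16.
                auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
                auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
                mask_sliced = std::make_shared<ov::op::v8::Slice>(mask, zero, token_len, one, one);
                mask_sliced = std::make_shared<ov::op::v0::Convert>(mask_sliced, ov::element::f16);
                mask_sliced->set_friendly_name(sliced_name);
            }
            tensor_map.insert({sliced_name, mask_sliced->output(0)});
        }
    };

    create_sliced_mask("KQ_mask", "KQ_mask_sliced", ggml_model_decoder.is_static());
    create_sliced_mask("KQ_mask_swa", "KQ_mask_swa_sliced", ggml_model_decoder.is_static());
}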
@@ -117,7 +122,7 @@ void add_rope_sin_cos(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) {
 // Create common patterns
 void preprocess(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) {
     add_token_len(tensor_map);
-    add_sliced_mask(tensor_map);
+    add_sliced_mask(tensor_map, ggml_model_decoder);
     add_rope_sin_cos(tensor_map, ggml_model_decoder);
 }
 
@@ -253,6 +253,7 @@ ov::AnyMap get_npu_base_config() {
         {"NPUW_FOLD", "YES" },
         {"NPUW_WEIGHTS_BANK", "shared" },
         {"NPUW_FUNCALL_FOR_ALL", "YES" },
+        {"NPUW_FUNCALL_ASYNC", "YES" },
         {"NPUW_DQ", "YES" },
         {"NPUW_DQ_FULL", "NO" },
         {"NPUW_CACHE_DIR", getenv("GGML_OPENVINO_CACHE_DIR") ? getenv("GGML_OPENVINO_CACHE_DIR") : ""},
@@ -262,15 +263,11 @@ ov::AnyMap get_npu_base_config() {
 
 ov::AnyMap get_npu_prefill_config() {
     auto config = get_npu_base_config();
-    config.emplace("NPUW_FUNCALL_ASYNC", "NO");
-    config.emplace("NPUW_ACC_CHECK", "YES");
-    config.emplace("NPUW_ACC_DEVICE", "CPU");
     return config;
 }
 
 ov::AnyMap get_npu_generate_config() {
     auto config = get_npu_base_config();
-    config.emplace("NPUW_FUNCALL_ASYNC", "YES");
     return config;
 }
 
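Taken together, the two config hunks fold the async-funcall setting into the shared NPU base config and drop both the per-stage overrides and the CPU accuracy-check fallback from the prefill config. A rough sketch of the post-patch helpers, showing only the entries visible in this diff (the real base map likely contains further options outside the hunk context, and the exact return style of get_npu_base_config is an assumption):

// Sketch reconstructed from the config hunks above; entries not visible in the diff are omitted.
ov::AnyMap get_npu_base_config() {
    return {
        {"NPUW_FOLD", "YES"},
        {"NPUW_WEIGHTS_BANK", "shared"},
        {"NPUW_FUNCALL_FOR_ALL", "YES"},
        {"NPUW_FUNCALL_ASYNC", "YES"},  // now set once here for both prefill and generate
        {"NPUW_DQ", "YES"},
        {"NPUW_DQ_FULL", "NO"},
        {"NPUW_CACHE_DIR", getenv("GGML_OPENVINO_CACHE_DIR") ? getenv("GGML_OPENVINO_CACHE_DIR") : ""},
        // ... further entries outside the hunk context ...
    };
}

// Prefill no longer forces NPUW_FUNCALL_ASYNC=NO and no longer enables the CPU accuracy check.
ov::AnyMap get_npu_prefill_config() {
    auto config = get_npu_base_config();
    return config;
}

// Generate inherits NPUW_FUNCALL_ASYNC=YES from the base config.
ov::AnyMap get_npu_generate_config() {
    auto config = get_npu_base_config();
    return config;
}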