diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index ff9048fb78..2cea7784c7 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -19,7 +19,6 @@ #include #include #include -#include #include #include #include @@ -577,9 +576,6 @@ std::map GgmlOvDecoder::get_kv_param_res_names() const } std::map> GgmlOvDecoder::create_weight_nodes(ggml_cgraph * cgraph, bool naive) { - static std::mutex weights_mutex; - std::lock_guard lock(weights_mutex); - std::map> model_weights; auto * nodes = cgraph->nodes; auto n_nodes = cgraph->n_nodes; diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 1b553a0de0..6f6ecde65e 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -106,17 +106,23 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, std::shared_ptr< int64_t infer_end_time; { - std::lock_guard lock(r_ctx->ov_compute_mutex); + std::shared_ptr mutex; auto it = r_ctx->decoder_cache.find(key); cache_hit = it != r_ctx->decoder_cache.end(); ModelParams old_m_params; if (cache_hit) { - ggml_decoder = it->second; + mutex = it->second->mutex; + std::lock_guard lock(*(mutex)); + ggml_decoder = it->second->ptr; old_m_params = ggml_decoder->get_model_params(); cache_hit = old_m_params.can_reuse_dynamically(m_params); + } else { + mutex = std::make_shared(); + r_ctx->decoder_cache[key] = std::make_shared(mutex); } + std::lock_guard lock(*(mutex)); if (cache_hit) { std::map> model_weights; @@ -200,7 +206,7 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, std::shared_ptr< compile_end_time = ggml_time_us(); infer_request = std::make_shared(compiled_model.create_infer_request()); r_ctx->infer_request_cache[key] = infer_request; - r_ctx->decoder_cache[key] = ggml_decoder; + r_ctx->decoder_cache.at(key)->ptr = ggml_decoder; std::vector ov_input_names; std::vector ov_output_names; @@ -306,15 +312,23 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph, std::shared_ptr mutex; + auto it = r_ctx->decoder_cache.find(key); cache_hit = it != r_ctx->decoder_cache.end(); ModelParams old_m_params; if (cache_hit) { - ggml_decoder = it->second; + mutex = it->second->mutex; + std::lock_guard lock(*(mutex)); + ggml_decoder = it->second->ptr; old_m_params = ggml_decoder->get_model_params(); cache_hit = old_m_params.can_reuse_statically(m_params); + } else { + mutex = std::make_shared(); + r_ctx->decoder_cache[key] = std::make_shared(mutex); } + std::lock_guard lock(*(mutex)); if (cache_hit) { std::map> model_weights; @@ -381,7 +395,7 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph, std::shared_ptrinfer_request_cache_prefill[key] : r_ctx->infer_request_cache[key]; - r_ctx->decoder_cache[key] = ggml_decoder; + r_ctx->decoder_cache.at(key)->ptr = ggml_decoder; std::vector ov_input_names; std::vector ov_output_names; diff --git a/ggml/src/ggml-openvino/utils.h b/ggml/src/ggml-openvino/utils.h index 656573d138..89b1b20934 100644 --- a/ggml/src/ggml-openvino/utils.h +++ b/ggml/src/ggml-openvino/utils.h @@ -40,11 +40,17 @@ struct graph_key_hash { } }; +struct decoder_runtime_ctx { + decoder_runtime_ctx(std::shared_ptr mutex) : + mutex(mutex) {} + std::shared_ptr mutex; + std::shared_ptr ptr; +}; + struct ov_runtime_context { - std::mutex ov_compute_mutex; std::string device; bool stateful; - std::unordered_map, graph_key_hash> decoder_cache; + std::unordered_map, graph_key_hash> decoder_cache; std::unordered_map, graph_key_hash> infer_request_cache; std::unordered_map, graph_key_hash> infer_request_cache_prefill; std::unordered_map, graph_key_hash> ov_input_names_cache;