Reduce memory: free OV weight nodes after graph conversion
This commit is contained in:
parent
a80da69448
commit
f3c0519096
|
|
@ -42,28 +42,23 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgrap
|
|||
m_op_name(m_node ? std::string(m_node->name) : "NONE_OP"),
|
||||
m_is_static(is_static),
|
||||
m_is_first_token(is_first_token) {
|
||||
// TODO avoid static
|
||||
static std::map<std::string, std::shared_ptr<ov::Node>> model_weights;
|
||||
if (m_node) {
|
||||
set_input_output(m_node);
|
||||
} else {
|
||||
static bool printed = false;
|
||||
if (!printed && getenv("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS")) {
|
||||
if (is_first_token && getenv("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS")) {
|
||||
print_tensor_address_map(cgraph);
|
||||
printed = true;
|
||||
}
|
||||
|
||||
if (getenv("GGML_OPENVINO_DUMP_CGRAPH")) {
|
||||
std::string filename = "cgraph.txt";
|
||||
auto timestamp = (long long) ggml_time_us();
|
||||
std::string filename = "cgraph_" + std::to_string(timestamp) + ".txt";
|
||||
dump_cgraph(cgraph, filename);
|
||||
}
|
||||
|
||||
set_llm_params();
|
||||
|
||||
static bool weight_created = false;
|
||||
if (!weight_created) {
|
||||
add_weight_const_parallel(model_weights);
|
||||
weight_created = true;
|
||||
if (is_first_token) {
|
||||
add_weight_const_parallel(m_model_weights);
|
||||
}
|
||||
|
||||
for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) {
|
||||
|
|
@ -71,7 +66,6 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgrap
|
|||
m_nodes.push_back(cur_node);
|
||||
set_input_output(cur_node);
|
||||
}
|
||||
m_model_weights = model_weights;
|
||||
|
||||
add_extra_inputs();
|
||||
}
|
||||
|
|
|
|||
|
|
@ -108,6 +108,8 @@ public:
|
|||
|
||||
ov::PartialShape get_graph_input_shape(const ggml_tensor* src) const;
|
||||
|
||||
// Empties m_model_weights so this decoder no longer holds its weight entries.
// Intended to be called once the frontend has finished converting the graph.
// NOTE(review): presumably m_model_weights maps weight names to
// std::shared_ptr<ov::Node>; if so, a node is only released when nothing else
// still references it -- confirm against the member declaration.
void clear_model_weights() {
    m_model_weights.clear();
}
|
||||
|
||||
private:
|
||||
void set_input_output(ggml_tensor* node);
|
||||
void add_extra_inputs();
|
||||
|
|
|
|||
|
|
@ -9,10 +9,8 @@
|
|||
#include <memory>
|
||||
#include <openvino/core/any.hpp>
|
||||
#include <openvino/core/graph_util.hpp>
|
||||
#include <openvino/core/partial_shape.hpp>
|
||||
#include <openvino/core/type/float16.hpp>
|
||||
#include <openvino/frontend/manager.hpp>
|
||||
#include <openvino/op/parameter.hpp>
|
||||
#include <openvino/openvino.hpp>
|
||||
#include <openvino/runtime/compiled_model.hpp>
|
||||
#include <openvino/runtime/infer_request.hpp>
|
||||
|
|
@ -89,7 +87,6 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
|
|||
if (cache_dir && !is_static) {
|
||||
core.set_property(ov::cache_dir(cache_dir));
|
||||
}
|
||||
// core.set_property(ov::enable_profiling(true));
|
||||
|
||||
static std::unordered_map<struct ggml_cgraph*, std::shared_ptr<ov::InferRequest>> infer_request_cache;
|
||||
static std::unordered_map<struct ggml_cgraph*, std::vector<std::string>> ov_input_names_cache;
|
||||
|
|
@ -157,6 +154,7 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
|
|||
|
||||
auto input_model = std::make_shared<ov::frontend::ggml::InputModel>(ggml_decoder);
|
||||
model = ov::frontend::ggml::FrontEnd::convert(input_model);
|
||||
ggml_decoder->clear_model_weights();
|
||||
conversion_end_time = ggml_time_us();
|
||||
|
||||
auto compiled_model = core.compile_model(model, device, config);
|
||||
|
|
|
|||
Loading…
Reference in New Issue