#include "utils.h" #include "ggml-impl.h" #include "ggml-openvino/ggml-decoder.h" #include "ggml.h" #include "openvino/frontend.hpp" #include "openvino/input_model.hpp" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include // Suppress deprecation warning for ov::Tensor::data() #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wdeprecated-declarations" static ov::Core core; enum ggml_status ov_graph_compute(ggml_cgraph * cgraph) { auto get_device = [&] { std::string device = getenv("GGML_OPENVINO_DEVICE") ? getenv("GGML_OPENVINO_DEVICE") : "CPU"; auto available_devices = core.get_available_devices(); if (std::find(available_devices.begin(), available_devices.end(), device) == available_devices.end()) { GGML_LOG_WARN("GGML OpenVINO Backend: device %s is not available, fallback to CPU\n", device.c_str()); device = "CPU"; } return device; }; if (getenv("GGML_OPENVINO_DUMP_CGRAPH")) { std::string filename = "cgraph.txt"; GgmlOvDecoder::dump_cgraph(cgraph, filename); } static const auto device = get_device(); static const auto is_static = device == "NPU" ? true : false; return is_static ? ov_graph_compute_static(cgraph) : ov_graph_compute_dynamic(cgraph, device); } enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, const std::string & device) { static auto is_static = false; static auto config = get_ov_compile_config(device); // if (is_naive(cgraph)) { // return naive_compute(cgraph, core, device, config); // } auto start_time = ggml_time_us(); static std::mutex cache_mutex; static std::unordered_map, graph_key_hash> decoder_cache; static std::unordered_map, graph_key_hash> infer_request_cache; static std::unordered_map, graph_key_hash> ov_input_names_cache; static std::unordered_map, graph_key_hash> ov_output_names_cache; std::shared_ptr ggml_decoder; std::shared_ptr infer_request; ModelParams m_params; ComputeParams c_params; std::tie(m_params, c_params) = GgmlOvDecoder::compute_llm_params(cgraph, is_static); const auto key = compute_graph_key(cgraph); bool cache_hit; int64_t decoder_end_time; int64_t conversion_end_time; int64_t compile_end_time; int64_t infer_end_time; { std::lock_guard lock(cache_mutex); auto it = decoder_cache.find(key); cache_hit = it != decoder_cache.end(); if (cache_hit) { ggml_decoder = it->second; cache_hit = ggml_decoder->get_model_params().can_reuse_statically(m_params); } if (cache_hit) { std::map> model_weights; ggml_decoder = decoder_cache[key]; ggml_decoder->set_compute_params(c_params); ggml_decoder->set_model_params(m_params); ggml_decoder->add_extra_inputs(); infer_request = infer_request_cache[key]; decoder_end_time = ggml_time_us(); conversion_end_time = decoder_end_time; compile_end_time = decoder_end_time; } else { infer_request_cache.erase(key); std::shared_ptr model; auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph, get_types_to_requant(device)); ggml_decoder = std::make_shared(cgraph, m_params, c_params, model_weights, is_static); decoder_end_time = ggml_time_us(); auto input_model = std::make_shared(ggml_decoder); model = ov::frontend::ggml::FrontEnd::convert(input_model); ggml_decoder->clear_model_weights(); conversion_end_time = ggml_time_us(); if (getenv("GGML_OPENVINO_DUMP_IR")) { char timestamped_filename[64]; auto timestamp = (long long) ggml_time_us(); snprintf(timestamped_filename, sizeof(timestamped_filename), "model_%lld.xml", 
            auto compiled_model = core.compile_model(model, device, config);
            compile_end_time = ggml_time_us();

            infer_request = std::make_shared<ov::InferRequest>(compiled_model.create_infer_request());
            infer_request_cache[key] = infer_request;
            decoder_cache[key] = ggml_decoder;

            std::vector<std::string> ov_input_names;
            std::vector<std::string> ov_output_names;
            for (const auto & ov_param : model->get_parameters()) {
                ov_input_names.push_back(ov_param->get_friendly_name());
            }
            for (const auto & ov_output : model->get_results()) {
                ov_output_names.push_back(ov_output->get_friendly_name());
            }
            ov_input_names_cache[key] = std::move(ov_input_names);
            ov_output_names_cache[key] = std::move(ov_output_names);
        }
    }

    auto ov_input_names = ov_input_names_cache[key];
    auto ov_output_names = ov_output_names_cache[key];

    for (size_t i = 0; i < ov_input_names.size(); i++) {
        auto param_name = ov_input_names[i];
        auto input_tensor = get_ov_input_tensor(ggml_decoder, param_name);
        infer_request->set_input_tensor(i, input_tensor);

        if (getenv("GGML_OPENVINO_DEBUG_INPUT")) {
            print_input_tensor_info(param_name, input_tensor);
        }
    }

    for (size_t i = 0; i < ov_output_names.size(); i++) {
        auto output_tensor = get_ov_output_tensor(ggml_decoder, ov_output_names[i]);
        infer_request->set_output_tensor(i, output_tensor);
    }

    infer_request->infer();
    infer_end_time = ggml_time_us();

    if (getenv("GGML_OPENVINO_DEBUG_OUTPUT")) {
        for (size_t i = 0; i < ov_output_names.size(); i++) {
            const auto output_tensor = infer_request->get_output_tensor(i);
            print_output_tensor_info(ov_output_names[i], output_tensor, output_tensor.data());
        }
    }

    if (getenv("GGML_OPENVINO_PROFILING")) {
        GGML_LOG_INFO("\nGGML OpenVINO Backend: \n");
        GGML_LOG_INFO(" - Graph decoder Time: %ld ms \n", (decoder_end_time - start_time) / 1000);
        if (!cache_hit) {
            GGML_LOG_INFO(" - Graph conversion Time: %ld ms \n", (conversion_end_time - decoder_end_time) / 1000);
            GGML_LOG_INFO(" - Graph compile Time: %ld ms \n", (compile_end_time - conversion_end_time) / 1000);
        }
        GGML_LOG_INFO(" - Graph Inference Time: %ld ms \n", (infer_end_time - compile_end_time) / 1000);
    }

    return GGML_STATUS_SUCCESS;
}

enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph) {
    auto get_prefill_chunk_size = [] {
        const char * chunk_size_str = getenv("GGML_OPENVINO_PREFILL_CHUNK_SIZE");
        if (chunk_size_str && atoi(chunk_size_str) > 0) {
            return atoi(chunk_size_str);
        }
        return 256;
    };

    static std::string device = "NPU";
    static auto is_static = true;
    static auto prefill_chunk_size = get_prefill_chunk_size();
    static auto config = get_ov_compile_config(device);
    if (is_naive(cgraph)) {
        return naive_compute(cgraph, core, device, config);
    }

    auto start_time = ggml_time_us();

    static std::mutex cache_mutex;
    static std::unordered_map<graph_key, std::shared_ptr<GgmlOvDecoder>, graph_key_hash> decoder_cache;
    static std::unordered_map<graph_key, std::shared_ptr<ov::InferRequest>, graph_key_hash> infer_request_cache;
    static std::unordered_map<graph_key, std::shared_ptr<ov::InferRequest>, graph_key_hash> infer_request_cache_prefill;
    static std::unordered_map<graph_key, std::vector<std::string>, graph_key_hash> ov_input_names_cache;
    static std::unordered_map<graph_key, std::vector<std::string>, graph_key_hash> ov_output_names_cache;

    std::shared_ptr<GgmlOvDecoder> ggml_decoder;
    std::shared_ptr<ov::InferRequest> infer_request;

    ModelParams m_params;
    ComputeParams c_params;
    std::tie(m_params, c_params) = GgmlOvDecoder::compute_llm_params(cgraph, is_static);

    const auto * inp_pos = get_inp_pos_tensor(cgraph);
    const auto is_prefill = get_is_prefill(inp_pos);

    const auto key = compute_graph_key(cgraph);
    bool cache_hit;
    int64_t decoder_end_time;
    int64_t conversion_end_time;
    int64_t compile_end_time;
    int64_t infer_end_time;
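
    // Under the cache lock: if the cached model parameters are still compatible, rebuild a
    // lightweight decoder (without weight nodes) and reuse the cached compiled infer
    // requests; otherwise build and compile separate prefill and decode models for the NPU
    // and cache an infer request for each.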
    {
        std::lock_guard<std::mutex> lock(cache_mutex);
        auto it = decoder_cache.find(key);
        cache_hit = it != decoder_cache.end();
        if (cache_hit) {
            ggml_decoder = it->second;
            cache_hit = ggml_decoder->get_model_params().can_reuse_statically(m_params);
        }

        if (cache_hit) {
            std::map<std::string, std::shared_ptr<ov::Node>> model_weights;
            ggml_decoder = std::make_shared<GgmlOvDecoder>(cgraph, m_params, c_params, model_weights, is_static,
                                                           is_prefill, prefill_chunk_size);
            decoder_cache[key] = ggml_decoder;
            infer_request = is_prefill ? infer_request_cache_prefill[key] : infer_request_cache[key];

            decoder_end_time = ggml_time_us();
            conversion_end_time = decoder_end_time;
            compile_end_time = decoder_end_time;
        } else {
            infer_request_cache.erase(key);
            infer_request_cache_prefill.erase(key);

            std::shared_ptr<ov::Model> model;
            auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph, get_types_to_requant(device));
            auto ggml_decoder_prefill = std::make_shared<GgmlOvDecoder>(cgraph, m_params, c_params, model_weights,
                                                                        is_static, true, prefill_chunk_size);
            auto ggml_decoder_decode = std::make_shared<GgmlOvDecoder>(cgraph, m_params, c_params, model_weights,
                                                                       is_static, false, prefill_chunk_size);
            decoder_end_time = ggml_time_us();

            auto input_model_prefill = std::make_shared<ov::frontend::ggml::InputModel>(ggml_decoder_prefill);
            auto input_model_decode = std::make_shared<ov::frontend::ggml::InputModel>(ggml_decoder_decode);
            auto model_prefill = ov::frontend::ggml::FrontEnd::convert(input_model_prefill);
            ggml_decoder_prefill->clear_model_weights();
            auto model_decode = ov::frontend::ggml::FrontEnd::convert(input_model_decode);
            ggml_decoder_decode->clear_model_weights();
            conversion_end_time = ggml_time_us();

            if (getenv("GGML_OPENVINO_DUMP_IR")) {
                char timestamped_filename[64];
                auto timestamp = (long long) ggml_time_us();
                snprintf(timestamped_filename, sizeof(timestamped_filename), "model_prefill_%lld.xml", timestamp);
                ov::serialize(model_prefill, timestamped_filename);
                snprintf(timestamped_filename, sizeof(timestamped_filename), "model_decode_%lld.xml", timestamp);
                ov::serialize(model_decode, timestamped_filename);
            }

            auto compiled_model_prefill = core.compile_model(model_prefill, device, get_ov_compile_config(device));
            auto compiled_model_decode = core.compile_model(model_decode, device, get_ov_compile_config(device));
            infer_request_cache_prefill[key] =
                std::make_shared<ov::InferRequest>(compiled_model_prefill.create_infer_request());
            infer_request_cache[key] =
                std::make_shared<ov::InferRequest>(compiled_model_decode.create_infer_request());
            compile_end_time = ggml_time_us();

            model = is_prefill ? model_prefill : model_decode;
            ggml_decoder = is_prefill ? ggml_decoder_prefill : ggml_decoder_decode;
            infer_request = is_prefill ? infer_request_cache_prefill[key] : infer_request_cache[key];
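
            // Cache the chosen decoder and record the model's parameter/result names so later
            // calls can bind input and output tensors by index without re-querying the model.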
            decoder_cache[key] = ggml_decoder;

            std::vector<std::string> ov_input_names;
            std::vector<std::string> ov_output_names;
            for (const auto & ov_param : model->get_parameters()) {
                ov_input_names.push_back(ov_param->get_friendly_name());
            }
            for (const auto & ov_output : model->get_results()) {
                ov_output_names.push_back(ov_output->get_friendly_name());
            }
            ov_input_names_cache[key] = std::move(ov_input_names);
            ov_output_names_cache[key] = std::move(ov_output_names);
        }
    }

    auto ov_input_names = ov_input_names_cache[key];
    auto ov_output_names = ov_output_names_cache[key];

    if (is_prefill) {
        auto inp_len = inp_pos->ne[0];
        for (int chunk_index = 0; chunk_index * prefill_chunk_size < inp_len; chunk_index++) {
            for (size_t i = 0; i < ov_input_names.size(); i++) {
                auto param_name = ov_input_names[i];
                auto input_tensor = get_ov_input_tensor_static_prefill(ggml_decoder, param_name, chunk_index);
                infer_request->set_input_tensor(i, input_tensor);

                if (getenv("GGML_OPENVINO_DEBUG_INPUT")) {
                    const auto input_tensor = infer_request->get_input_tensor(i);
                    print_input_tensor_info(param_name, input_tensor);
                }
            }

            for (size_t i = 0; i < ov_output_names.size(); i++) {
                auto output_tensor = get_ov_output_tensor(ggml_decoder, ov_output_names[i]);
                infer_request->set_output_tensor(i, output_tensor);
            }

            infer_request->infer();

            if (getenv("GGML_OPENVINO_DEBUG_OUTPUT")) {
                for (size_t i = 0; i < ov_output_names.size(); i++) {
                    const auto output_tensor = infer_request->get_output_tensor(i);
                    print_output_tensor_info(ov_output_names[i], output_tensor, output_tensor.data());
                }
            }
        }
        infer_end_time = ggml_time_us();
    } else {
        for (size_t i = 0; i < ov_input_names.size(); i++) {
            auto param_name = ov_input_names[i];
            auto input_tensor = get_ov_input_tensor_static_decode(ggml_decoder, param_name);
            infer_request->set_input_tensor(i, input_tensor);

            if (getenv("GGML_OPENVINO_DEBUG_INPUT")) {
                const auto input_tensor = infer_request->get_input_tensor(i);
                print_input_tensor_info(param_name, input_tensor);
            }
        }

        for (size_t i = 0; i < ov_output_names.size(); i++) {
            auto output_tensor = get_ov_output_tensor(ggml_decoder, ov_output_names[i]);
            infer_request->set_output_tensor(i, output_tensor);
        }

        infer_request->infer();
        infer_end_time = ggml_time_us();

        if (getenv("GGML_OPENVINO_DEBUG_OUTPUT")) {
            for (size_t i = 0; i < ov_output_names.size(); i++) {
                const auto output_tensor = infer_request->get_output_tensor(i);
                print_output_tensor_info(ov_output_names[i], output_tensor, output_tensor.data());
            }
        }
    }

    if (getenv("GGML_OPENVINO_PROFILING")) {
        GGML_LOG_INFO("\nGGML OpenVINO Backend: \n");
        GGML_LOG_INFO(" - Graph decoder Time: %ld ms \n", (decoder_end_time - start_time) / 1000);
        if (!cache_hit) {
            GGML_LOG_INFO(" - Graph conversion Time: %ld ms \n", (conversion_end_time - decoder_end_time) / 1000);
            GGML_LOG_INFO(" - Graph compile Time: %ld ms \n", (compile_end_time - conversion_end_time) / 1000);
        }
        GGML_LOG_INFO(" - Graph Inference Time: %ld ms \n", (infer_end_time - compile_end_time) / 1000);
    }

    return GGML_STATUS_SUCCESS;
}

ov::AnyMap get_ov_compile_config(const std::string & device) {
    ov::AnyMap config;
    auto * cache_dir = getenv("GGML_OPENVINO_CACHE_DIR");
    if (device == "NPU") {
        config = {
            {"NPU_COMPILER_DYNAMIC_QUANTIZATION", "YES"   },
            {"NPU_USE_NPUW",                      "YES"   },
            {"NPUW_DEVICES",                      "NPU"   },
            {"NPUW_FOLD",                         "YES"   },
            {"NPUW_WEIGHTS_BANK",                 "shared"},
            {"NPUW_FUNCALL_FOR_ALL",              "YES"   },
            {"NPUW_FUNCALL_ASYNC",                "YES"   },
            {"NPUW_DQ",                           "YES"   },
            {"NPUW_DQ_FULL",                      "NO"    },
        };
        if (cache_dir) {
            config["NPUW_CACHE_DIR"] = cache_dir;
        }
    } else if (cache_dir) {
        core.set_property(ov::cache_dir(cache_dir));
    }
    return config;
}
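
// Quantization types that should be requantized to a different format for the given device,
// mapped to the target format; an empty map leaves the weights in their original ggml format.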
std::map<ggml_type, ExtraQuantType> get_types_to_requant(const std::string & device) {
    if (device == "NPU") {
        return {
            {GGML_TYPE_Q4_0, ExtraQuantType::Q4_0_128},
            {GGML_TYPE_Q4_1, ExtraQuantType::Q4_0_128},
            {GGML_TYPE_Q4_K, ExtraQuantType::Q4_0_128},
            {GGML_TYPE_Q6_K, ExtraQuantType::F16     },
            {GGML_TYPE_Q5_K, ExtraQuantType::F16     },
        };
    }
    if (device == "GPU") {
        return {
            // gs16 will be supported on openvino-2025.4
            {GGML_TYPE_Q6_K, ExtraQuantType::Q8_0_32},
        };
    }
    return {};
}

bool is_naive(ggml_cgraph * cgraph) {
    constexpr int naive_graph_size_threshold = 20;
    return cgraph->n_nodes < naive_graph_size_threshold;
}

enum ggml_status naive_compute(ggml_cgraph * cgraph,
                               ov::Core & core,
                               const std::string & device,
                               const ov::AnyMap & config) {
    if (cgraph->n_nodes == 1 && (cgraph->nodes[0]->op == GGML_OP_NONE || cgraph->nodes[0]->op == GGML_OP_VIEW)) {
        return GGML_STATUS_SUCCESS;
    }
    if (cgraph->nodes[0]->op == GGML_OP_FLASH_ATTN_EXT) {
        return GGML_STATUS_FAILED;
    }

    auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph);
    auto decoder = std::make_shared<GgmlOvDecoder>(cgraph, model_weights);
    auto input_model = std::make_shared<ov::frontend::ggml::InputModel>(decoder);
    auto naive = true;
    auto model = ov::frontend::ggml::FrontEnd::convert(input_model, naive);
    if (getenv("GGML_OPENVINO_DUMP_IR")) {
        ov::serialize(model, "IR_naive.xml");
    }

    auto infer_request = core.compile_model(model, device, config).create_infer_request();

    auto ov_params = model->get_parameters();
    for (size_t i = 0; i < ov_params.size(); i++) {
        auto param_name = ov_params[i]->get_friendly_name();
        auto input_tensor = get_ov_input_tensor(decoder, param_name);
        infer_request.set_input_tensor(i, input_tensor);
    }

    auto ov_results = model->get_results();
    for (size_t i = 0; i < ov_results.size(); i++) {
        auto result_name = ov_results[i]->get_friendly_name();
        auto output_tensor = get_ov_output_tensor(decoder, result_name);
        infer_request.set_output_tensor(i, output_tensor);
    }

    infer_request.infer();
    return GGML_STATUS_SUCCESS;
}

namespace {
ov::Tensor convert_ggml_input_to_ov(std::shared_ptr<GgmlOvDecoder> ggml_decoder, const std::string & name) {
    const auto * ggml_tensor = ggml_decoder->get_input_ggml_tensor(name);
    auto * input_data = ggml_tensor->data;
    ov::Shape input_shape;
    if (ggml_tensor->op == GGML_OP_VIEW) {
        // This case is added to make test-backend-ops work
        input_shape = ggml_decoder->get_shape(ggml_tensor->view_src);
    } else {
        input_shape = ggml_decoder->get_shape(ggml_tensor);
    }
    auto input_tensor = ov::Tensor(ggml_decoder->get_ov_type(ggml_tensor), input_shape, input_data);
    return input_tensor;
}
}  // namespace

ov::Tensor get_ov_input_tensor(std::shared_ptr<GgmlOvDecoder> ggml_decoder, const std::string & param_name) {
    ov::Tensor input_tensor;
    if (ggml_decoder->get_model_extra_inputs().find(param_name) != ggml_decoder->get_model_extra_inputs().end()) {
        input_tensor = *ggml_decoder->get_model_extra_input_values().at(param_name);
    } else {
        input_tensor = convert_ggml_input_to_ov(ggml_decoder, param_name);
    }
    return input_tensor;
}

ov::Tensor get_ov_input_tensor_static_decode(std::shared_ptr<GgmlOvDecoder> ggml_decoder,
                                             const std::string & param_name) {
    const auto * ggml_tensor = ggml_decoder->get_input_ggml_tensor(param_name);
    const auto * op = ggml_decoder->get_tensor_used_op(ggml_tensor);
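
    // In static (NPU) decode mode a few inputs need special handling: position/token inputs
    // and SET_ROWS indices become single-element tensors, inp_out_ids is expected to be a
    // single zero index, and KQ_mask is padded to the full context size. Everything else
    // falls through to the generic get_ov_input_tensor path.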
    if (param_name == "inp_pos" || param_name == "inp_tokens" ||
        (op->op == GGML_OP_SET_ROWS && op->src[1] == ggml_tensor)) {
        assert(ggml_tensor->ne[0] == 1);
        ov::Shape input_shape = {1, 1, 1, 1};
        ov::Tensor input_tensor(ggml_decoder->get_ov_type(ggml_tensor), input_shape);
        if (ggml_tensor->type == GGML_TYPE_I32) {
            *input_tensor.data<int32_t>() = *((int32_t *) ggml_tensor->data);
        } else if (ggml_tensor->type == GGML_TYPE_I64) {
            *input_tensor.data<int64_t>() = *((int64_t *) ggml_tensor->data);
        } else {
            throw std::runtime_error("Unexpected tensor type for " + param_name);
        }
        return input_tensor;
    }

    if (param_name == "inp_out_ids") {
        ov::Shape input_shape = {1, 1, 1, 1};
        ov::Tensor input_tensor(ggml_decoder->get_ov_type(ggml_tensor), input_shape);
        int32_t inp_out_id = *((int32_t *) ggml_tensor->data);
        assert(ggml_tensor->ne[0] == 1);
        assert(inp_out_id == 0);
        *input_tensor.data<int32_t>() = inp_out_id;
        return input_tensor;
    }

    if (param_name.find("KQ_mask") == 0) {
        size_t context_size = ggml_decoder->get_ctx_size();
        std::vector<float> padded_data = pad_input(ggml_tensor, 1, context_size, -INFINITY);
        ov::Tensor input_tensor(ov::element::f32, ov::Shape{1, 1, 1, context_size});
        auto * data_ptr = input_tensor.data<float>();
        std::copy(padded_data.begin(), padded_data.begin() + context_size, data_ptr);
        return input_tensor;
    }

    return get_ov_input_tensor(ggml_decoder, param_name);
}

ov::Tensor get_ov_input_tensor_static_prefill(std::shared_ptr<GgmlOvDecoder> ggml_decoder,
                                              const std::string & param_name,
                                              int chunk_index) {
    const auto * ggml_tensor = ggml_decoder->get_input_ggml_tensor(param_name);
    const auto * op = ggml_decoder->get_tensor_used_op(ggml_tensor);

    const size_t input_len = ggml_decoder->get_input_len();
    const size_t chunk_size = ggml_decoder->m_prefill_chunk_size;
    const size_t chunk_valid_size = std::min(chunk_size, input_len - chunk_index * chunk_size);
    const size_t chunk_pad_size = chunk_size - chunk_valid_size;

    if (param_name == "inp_pos" || param_name == "inp_tokens" ||
        (op->op == GGML_OP_SET_ROWS && op->src[1] == ggml_tensor)) {
        ov::Shape input_shape = {1, 1, 1, chunk_size};
        ov::Tensor input_tensor(ggml_decoder->get_ov_type(ggml_tensor), input_shape);

        // copy the chunk_index-th chunk from ggml_tensor
        size_t element_size = ggml_type_size(ggml_tensor->type);
        void * input_data = (char *) ggml_tensor->data + chunk_index * chunk_size * element_size;
        std::memcpy(input_tensor.data(), input_data, chunk_valid_size * element_size);

        // pad the rest with last_value + 1, so that kv's of padded positions are inserted
        // into the next row after the valid rows in the kvcache
        if (chunk_pad_size > 0) {
            if (ggml_tensor->type == GGML_TYPE_I32) {
                int32_t last_value = *((int32_t *) ggml_tensor->data + (chunk_index * chunk_size + chunk_valid_size - 1));
                int32_t * output_data = input_tensor.data<int32_t>();
                std::fill(output_data + chunk_valid_size, output_data + chunk_size, last_value + 1);
            } else if (ggml_tensor->type == GGML_TYPE_I64) {
                int64_t last_value = *((int64_t *) ggml_tensor->data + (chunk_index * chunk_size + chunk_valid_size - 1));
                int64_t * output_data = input_tensor.data<int64_t>();
                std::fill(output_data + chunk_valid_size, output_data + chunk_size, last_value + 1);
            } else {
                throw std::runtime_error("Unexpected tensor type for " + param_name);
            }
        }
        return input_tensor;
    }

    if (param_name == "inp_out_ids") {
        size_t output_len = ggml_decoder->get_compute_params().output_len;
        ov::Shape input_shape = {1, 1, 1, output_len};
        ov::Tensor input_tensor(ggml_decoder->get_ov_type(ggml_tensor), input_shape);
        if (ggml_tensor->ne[0] == 0) {
            *input_tensor.data<int32_t>() = 0;
        } else {
            auto * data_addr = input_tensor.data<int32_t>();
            for (size_t i = 0; i < output_len; i++) {
                data_addr[i] = ((int32_t *) ggml_tensor->data)[i] % chunk_size;
            }
        }
        return input_tensor;
    }
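
    // For KQ_mask, take the rows belonging to this chunk and pad them into a
    // chunk_size x context_size block filled with -INFINITY, then zero one element per row
    // (the diagonal) so that no padded row is entirely masked.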
    if (param_name.find("KQ_mask") == 0) {
        size_t cols = ggml_tensor->ne[0];
        size_t rows = ggml_tensor->ne[1];
        float * ggml_data = (float *) ggml_tensor->data + chunk_index * chunk_size * cols;
        size_t chunk_valid_rows = std::min(chunk_size, rows - chunk_index * chunk_size);
        size_t context_size = ggml_decoder->get_ctx_size();
        std::vector<float> padded_data = pad_input(ggml_data, chunk_valid_rows, cols, chunk_size, context_size, -INFINITY);
        set_zero_diagonal(padded_data, chunk_size, context_size);
        ov::Tensor input_tensor(ov::element::f32, ov::Shape{1, 1, chunk_size, context_size});
        auto * data_ptr = input_tensor.data<float>();
        std::copy(padded_data.begin(), padded_data.begin() + chunk_size * context_size, data_ptr);
        return input_tensor;
    }

    return get_ov_input_tensor(ggml_decoder, param_name);
}

ov::Tensor get_ov_output_tensor(std::shared_ptr<GgmlOvDecoder> ggml_decoder, const std::string & result_name) {
    auto * ggml_tensor = ggml_decoder->get_model_outputs().at(result_name);
    auto output_type = ggml_decoder->get_ov_type(ggml_tensor);
    auto output_shape = ggml_decoder->get_shape(ggml_tensor);
    if (ggml_decoder->is_static() && result_name == "result_output") {
        output_shape[1] = 1;
    }
    ov::Tensor output_tensor(output_type, output_shape, ggml_tensor->data);
    return output_tensor;
}

size_t checksum(const void * data, size_t size) {
    const uint8_t * bytes = static_cast<const uint8_t *>(data);
    size_t sum = 0;
    for (size_t i = 0; i < size; ++i) {
        sum += (uint8_t) i;
        sum += bytes[i];
    }
    return sum;
}

void print_input_tensor_info(const std::string & name, const ov::Tensor & tensor) {
    std::cout << "Input name: " << name << ", Input shape: " << tensor.get_shape()
              << ", Address: " << tensor.data() << std::endl;
    switch (tensor.get_element_type()) {
        case ov::element::f32:
            {
                if (name.find("KQ_mask") == std::string::npos) {
                    std::cout << *(tensor.data<float>()) << std::endl;
                } else {
                    size_t rows = tensor.get_shape()[2];
                    size_t cols = tensor.get_shape()[3];
                    auto * data = tensor.data<float>();
                    for (size_t i = 0; i < rows; ++i) {
                        for (size_t j = 0; j < cols; ++j) {
                            float val = data[i * cols + j];
                            if (std::isinf(val) && val < 0) {
                                std::cout << std::setw(5) << "-inf";
                            } else {
                                std::cout << std::setw(5) << val;
                            }
                        }
                        std::cout << std::endl;
                    }
                }
                break;
            }
        case ov::element::f16:
            std::cout << *(tensor.data<ov::float16>()) << std::endl;
            break;
        case ov::element::i32:
            for (size_t i = 0; i < tensor.get_size(); ++i) {
                std::cout << tensor.data<int32_t>()[i] << " ";
            }
            std::cout << std::endl;
            break;
        case ov::element::i64:
            for (size_t i = 0; i < tensor.get_size(); ++i) {
                std::cout << tensor.data<int64_t>()[i] << " ";
            }
            std::cout << std::endl;
            break;
        default:
            break;
    }
}

void print_output_tensor_info(const std::string & name, const ov::Tensor & tensor, const void * output_dst) {
    std::cout << "Output name: " << name << ", Output shape: " << tensor.get_shape()
              << ", Address: " << output_dst << std::endl;

    auto print_float_stats = [](const std::string & type_name, size_t size, auto get_value) {
        if (size == 0) {
            return;
        }
        float first = get_value(0);
        float min = first;
        float max = first;
        double sum = first;
        for (size_t i = 1; i < size; ++i) {
            float v = get_value(i);
            if (v < min) {
                min = v;
            }
            if (v > max) {
                max = v;
            }
            sum += v;
        }
        double mean = sum / size;
        std::cout << std::right << std::setw(6) << type_name << std::right << std::setw(12) << "First"
                  << std::setw(12) << "Min" << std::setw(12) << "Max" << std::setw(12) << "Mean" << std::endl;
        std::cout << std::right << std::setw(6) << "" << std::right << std::setw(12) << first << std::setw(12) << min
                  << std::setw(12) << max << std::setw(12) << mean << std::endl;
    };
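
    // Summarize float outputs (first element, min, max, mean); other element types are skipped.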
    switch (tensor.get_element_type()) {
        case ov::element::f32:
            {
                const float * data = tensor.data<float>();
                size_t size = tensor.get_size();
                print_float_stats("[f32]", size, [data](size_t i) { return data[i]; });
                break;
            }
        case ov::element::f16:
            {
                const ov::float16 * data = tensor.data<ov::float16>();
                size_t size = tensor.get_size();
                print_float_stats("[f16]", size, [data](size_t i) { return static_cast<float>(data[i]); });
                break;
            }
        default:
            break;
    }
}

void set_zero_diagonal(std::vector<float> & matrix, size_t rows, size_t cols) {
    for (size_t i = 0; i < rows; ++i) {
        size_t diag_col = std::min(i, cols - 1);
        matrix[i * cols + diag_col] = 0.0f;
    }
}

const ggml_tensor * get_inp_pos_tensor(ggml_cgraph * cgraph) {
    for (int i = 0; i < cgraph->n_nodes; ++i) {
        auto * op = cgraph->nodes[i];
        for (int j = 0; j < GGML_MAX_SRC; ++j) {
            auto * src = op->src[j];
            if (src == nullptr) {
                break;
            }
            if (std::string(src->name) == "inp_pos") {
                return src;
            }
        }
    }
    GGML_LOG_ERROR("get_inp_pos_tensor: inp_pos not found in cgraph\n");
    throw std::runtime_error("get_inp_pos_tensor: inp_pos not found in cgraph");
}

bool get_is_prefill(const ggml_tensor * inp_pos) {
    return inp_pos->ne[0] > 1;
}

graph_key compute_graph_key(ggml_cgraph * cgraph) {
    graph_key key;
    key.n_nodes = cgraph->n_nodes;
    if (cgraph->n_nodes > 0) {
        key.first_node_name = std::string(cgraph->nodes[0]->name);
        key.last_node_name = std::string(cgraph->nodes[cgraph->n_nodes - 1]->name);
    } else {
        key.first_node_name = "";
        key.last_node_name = "";
    }
    return key;
}

#pragma GCC diagnostic pop