#include "utils.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "ggml-impl.h" #include "ggml-openvino/ggml-decoder.h" #include "ggml.h" #include "openvino/frontend.hpp" #include "openvino/input_model.hpp" ov::Tensor convert_ggml_input_to_ov(std::shared_ptr ggml_decoder, const std::string& name) { const auto* ggml_tensor = ggml_decoder->get_input_ggml_tensor(name); auto* input_data = ggml_tensor->data; ov::Shape input_shape; if (name.find("cache_k") == 0 || name.find("cache_v") == 0) { input_shape = ggml_decoder->get_graph_input_shape(ggml_tensor).to_shape(); } else if (ggml_tensor->op == GGML_OP_VIEW) { // This case is added to make test-backend-ops work input_shape = ggml_decoder->get_graph_input_shape(ggml_tensor->view_src).to_shape(); } else { input_shape = ggml_decoder->get_input_shape(name).to_shape(); } auto input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), input_shape, input_data); return input_tensor; } std::map get_ggml_graph_output_dst(std::shared_ptr ggml_decoder) { std::map output_tensors; auto output_names = ggml_decoder->get_model_output_names(); for (size_t inp = 0; inp < output_names.size(); ++inp) { auto name = output_names[inp]; const auto* tensor = ggml_decoder->get_output_ggml_tensor(name); auto* output_data = tensor->view_src ? tensor->view_src->data : tensor->data; output_tensors[name] = output_data; } return output_tensors; } static ov::frontend::FrontEnd::Ptr get_ggml_frontend() { auto fem = ov::frontend::FrontEndManager(); auto front_end = fem.load_by_framework("ggml"); return front_end; } enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_cgraph* cgraph) { static ov::Core core; static std::string device = getenv("GGML_OPENVINO_DEVICE") ? getenv("GGML_OPENVINO_DEVICE") : ""; if (device.empty()) { const std::vector preferred_device = { "GPU", "CPU", "NPU" }; const auto available_devices = core.get_available_devices(); for (const auto& dev : preferred_device) { if (std::find(available_devices.begin(), available_devices.end(), dev) != available_devices.end()) { device = dev; break; } } } bool is_static = device == "NPU" ? true : false; ov::AnyMap config; if (device == "GPU") { config = { {"GPU_ENABLE_SDPA_OPTIMIZATION", "0"} }; } if (is_naive(cgraph)) { return naive_compute(cgraph, core, device, config); } auto start_time = ggml_time_us(); auto* cache_dir = getenv("GGML_OPENVINO_CACHE_DIR"); if (cache_dir && !is_static) { core.set_property(ov::cache_dir(cache_dir)); } static std::mutex cache_mutex; static std::unordered_map> infer_request_cache; static std::unordered_map> ov_input_names_cache; static std::unordered_map> ov_output_names_cache; // For NPU, store the kvcache model, since we cannot create two infer_request static std::unordered_map compiled_model_cache; std::shared_ptr ggml_decoder; ov::InferRequest infer_request; int64_t decoder_end_time; int64_t conversion_end_time; int64_t compile_end_time; { std::lock_guard lock(cache_mutex); auto it = infer_request_cache.find(cgraph); if (it != infer_request_cache.end()) { std::map> model_weights; ggml_decoder = std::make_shared(cgraph, model_weights, is_static, false); decoder_end_time = ggml_time_us(); // For NPU for the first time we call kvcache modle, pop the compiled kvcache model from cache if (is_static && compiled_model_cache.find(cgraph) != compiled_model_cache.end()) { infer_request_cache[cgraph] = std::make_shared(compiled_model_cache[cgraph].create_infer_request()); compiled_model_cache.erase(cgraph); } infer_request = *infer_request_cache[cgraph]; conversion_end_time = ggml_time_us(); compile_end_time = conversion_end_time; } else { std::shared_ptr model; auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph, get_types_to_requant(device)); if (is_static) { ggml_decoder = std::make_shared(cgraph, model_weights, is_static, true); auto ggml_decoder_kvcache = std::make_shared(cgraph, model_weights, is_static, false); decoder_end_time = ggml_time_us(); auto input_model = std::make_shared(ggml_decoder); auto input_model_kvcache = std::make_shared(ggml_decoder_kvcache); model = ov::frontend::ggml::FrontEnd::convert(input_model); ggml_decoder->clear_model_weights(); auto model_kvcache = ov::frontend::ggml::FrontEnd::convert(input_model_kvcache); ggml_decoder_kvcache->clear_model_weights(); conversion_end_time = ggml_time_us(); if (getenv("GGML_OPENVINO_DUMP_IR")) { char timestamped_filename[64]; auto timestamp = (long long) ggml_time_us(); snprintf(timestamped_filename, sizeof(timestamped_filename), "model_prefill_%lld.xml", timestamp); ov::serialize(model, timestamped_filename); snprintf(timestamped_filename, sizeof(timestamped_filename), "model_kvcache_%lld.xml", timestamp); ov::serialize(model_kvcache, timestamped_filename); } auto compiled_model = core.compile_model(model, device, get_npu_prefill_config()); auto compiled_model_kvcache = core.compile_model(model_kvcache, device, get_npu_generate_config()); compiled_model_cache[cgraph] = compiled_model_kvcache; compile_end_time = ggml_time_us(); infer_request_cache[cgraph] = std::make_shared(compiled_model.create_infer_request()); infer_request = *infer_request_cache[cgraph]; compiled_model_cache[cgraph] = compiled_model_kvcache; } else { ggml_decoder = std::make_shared(cgraph, model_weights, is_static, true); decoder_end_time = ggml_time_us(); auto input_model = std::make_shared(ggml_decoder); model = ov::frontend::ggml::FrontEnd::convert(input_model); ggml_decoder->clear_model_weights(); conversion_end_time = ggml_time_us(); if (getenv("GGML_OPENVINO_DUMP_IR")) { char timestamped_filename[64]; auto timestamp = (long long) ggml_time_us(); snprintf(timestamped_filename, sizeof(timestamped_filename), "model_%lld.xml", timestamp); ov::serialize(model, timestamped_filename); } auto compiled_model = core.compile_model(model, device, config); compile_end_time = ggml_time_us(); infer_request_cache[cgraph] = std::make_shared(compiled_model.create_infer_request()); infer_request = *infer_request_cache[cgraph]; } std::vector ov_input_names; std::vector ov_output_names; for (const auto& ov_param : model->get_parameters()) { ov_input_names.push_back(ov_param->get_friendly_name()); } for (const auto& ov_output : model->get_results()) { ov_output_names.push_back(ov_output->get_friendly_name()); } ov_input_names_cache[cgraph] = ov_input_names; ov_output_names_cache[cgraph] = ov_output_names; } } auto ov_input_names = ov_input_names_cache[cgraph]; auto ov_output_names = ov_output_names_cache[cgraph]; for (size_t i = 0; i < ov_input_names.size(); i++) { auto param_name = ov_input_names[i]; auto input_tensor = get_ov_input_tensor(ggml_decoder, param_name); infer_request.set_input_tensor(i, input_tensor); if (getenv("GGML_OPENVINO_DEBUG_INPUT")) { print_input_tensor_info(param_name, input_tensor); } } auto input_end_time = ggml_time_us(); infer_request.infer(); auto infer_end_time = ggml_time_us(); auto gguf_tensor_addrs = get_ggml_graph_output_dst(ggml_decoder); for (size_t i = 0; i < ov_output_names.size(); i++) { auto result_name = ov_output_names[i]; const auto output_tensor = infer_request.get_output_tensor(i); std::memcpy(gguf_tensor_addrs[result_name], output_tensor.data(), output_tensor.get_byte_size()); if (getenv("GGML_OPENVINO_DEBUG_OUTPUT")) { print_output_tensor_info(result_name, output_tensor, gguf_tensor_addrs); } } auto end_time = ggml_time_us(); if (getenv("GGML_OPENVINO_PROFILING")) { GGML_LOG_INFO("GGML OpenVINO Backend: \n"); GGML_LOG_INFO(" - Graph decoder Time: %ld ms \n", (decoder_end_time - start_time) / 1000); GGML_LOG_INFO(" - Graph conversion Time: %ld ms \n", (conversion_end_time - decoder_end_time) / 1000); GGML_LOG_INFO(" - Graph compile Time: %ld ms \n", (compile_end_time - conversion_end_time) / 1000); GGML_LOG_INFO(" - Graph Input Time: %ld ms \n", (input_end_time - compile_end_time) / 1000); GGML_LOG_INFO(" - Graph Inference Time: %ld ms \n", (infer_end_time - input_end_time) / 1000); GGML_LOG_INFO(" - Graph Output Time: %ld ms \n", (end_time - infer_end_time) / 1000); } return GGML_STATUS_SUCCESS; GGML_UNUSED(backend); } ov::AnyMap get_npu_prefill_config() { ov::AnyMap config = { {"NPU_COMPILATION_MODE_PARAMS", "compute-layers-with-higher-precision=Sqrt,Power,ReduceMean,Add_RMSNorm" }, {"NPU_COMPILER_DYNAMIC_QUANTIZATION", "YES" }, {"NPU_USE_NPUW", "YES" }, {"NPUW_DEVICES", "NPU" }, {"NPUW_FOLD", "YES" }, {"NPUW_WEIGHTS_BANK", "shared" }, {"NPUW_SLICE_OUT", "YES" }, {"NPUW_FUNCALL_ASYNC", "YES" }, {"NPUW_FUNCALL_FOR_ALL", "YES" }, {"NPUW_DQ", "YES" }, {"NPUW_DQ_FULL", "NO" }, {"NPUW_CACHE_DIR", getenv("GGML_OPENVINO_CACHE_DIR") ? getenv("GGML_OPENVINO_CACHE_DIR") : ""}, }; return config; } std::map get_types_to_requant(const std::string& device) { if (device == "NPU") { return { {GGML_TYPE_Q4_0, ExtraQuantType::Q4_0_128}, {GGML_TYPE_Q4_1, ExtraQuantType::Q4_0_128}, {GGML_TYPE_Q4_K, ExtraQuantType::Q4_0_128}, {GGML_TYPE_Q6_K, ExtraQuantType::Q8_1_C }, }; } if (device == "GPU") { return { // CVS-166739 {GGML_TYPE_Q6_K, ExtraQuantType::Q8_1_C}, }; } } ov::AnyMap get_npu_generate_config() { ov::AnyMap config = get_npu_prefill_config(); config.emplace("NPUW_UNFOLD_IREQS", "YES"); return config; } bool is_naive(struct ggml_cgraph* cgraph) { constexpr int naive_graph_size_threshold = 20; return cgraph->n_nodes < naive_graph_size_threshold; } enum ggml_status naive_compute(struct ggml_cgraph* cgraph, ov::Core& core, const std::string& device, const ov::AnyMap& config) { if (cgraph->n_nodes == 1 && (cgraph->nodes[0]->op == GGML_OP_NONE || cgraph->nodes[0]->op == GGML_OP_VIEW)) { return GGML_STATUS_SUCCESS; } if (cgraph->nodes[0]->op == GGML_OP_FLASH_ATTN_EXT) { return GGML_STATUS_FAILED; } auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph); auto decoder = std::make_shared(cgraph, model_weights); auto input_model = std::make_shared(decoder); auto naive = true; auto model = ov::frontend::ggml::FrontEnd::convert(input_model, naive); if (getenv("GGML_OPENVINO_DUMP_IR")) { ov::serialize(model, "IR_naive.xml"); } auto infer_request = core.compile_model(model, device, config).create_infer_request(); auto ov_params = model->get_parameters(); for (size_t i = 0; i < ov_params.size(); i++) { auto param_name = ov_params[i]->get_friendly_name(); auto input_tensor = get_ov_input_tensor(decoder, param_name); infer_request.set_input_tensor(i, input_tensor); } infer_request.infer(); auto gguf_tensor_addrs = get_ggml_graph_output_dst(decoder); auto ov_results = model->get_results(); for (size_t i = 0; i < ov_results.size(); i++) { auto result_name = ov_results[i]->get_friendly_name(); const auto output_tensor = infer_request.get_output_tensor(i); std::memcpy(gguf_tensor_addrs[result_name], output_tensor.data(), output_tensor.get_byte_size()); } return GGML_STATUS_SUCCESS; } ov::Tensor get_ov_input_tensor(std::shared_ptr ggml_decoder, const std::string& param_name) { bool is_static = ggml_decoder->is_static(); bool is_first_token = ggml_decoder->is_first_token(); ov::Tensor input_tensor; if (ggml_decoder->get_model_extra_inputs().find(param_name) != ggml_decoder->get_model_extra_inputs().end()) { input_tensor = *ggml_decoder->get_model_extra_input_values().at(param_name); } else if (!is_static) { input_tensor = convert_ggml_input_to_ov(ggml_decoder, param_name); } else { if (param_name == "inp_tokens" || param_name == "inp_pos") { if (is_first_token) { size_t context_size = ggml_decoder->get_context_size(); const auto* input_tensor_ggml = ggml_decoder->get_input_ggml_tensor(param_name); std::vector padded_data = pad_input(input_tensor_ggml, 1, context_size, 0); input_tensor = ov::Tensor(ov::element::i32, ov::Shape{1, 1, context_size}); auto* data_ptr = input_tensor.data(); std::copy(padded_data.begin(), padded_data.end(), data_ptr); } else { input_tensor = convert_ggml_input_to_ov(ggml_decoder, param_name); } } else if (param_name == "KQ_mask") { size_t context_size = ggml_decoder->get_context_size(); const auto* input_tensor_ggml = ggml_decoder->get_input_ggml_tensor(param_name); if (is_first_token) { std::vector padded_data = pad_input(input_tensor_ggml, context_size, context_size, -INFINITY); set_zero_diagonal(padded_data, context_size); input_tensor = ov::Tensor(ov::element::f32, ov::Shape{1, context_size, context_size}); auto* data_ptr = input_tensor.data(); std::copy(padded_data.begin(), padded_data.end(), data_ptr); } else { std::vector padded_data = pad_input(input_tensor_ggml, 1, context_size, -INFINITY); input_tensor = ov::Tensor(ov::element::f32, ov::Shape{1, 1, context_size}); auto* data_ptr = input_tensor.data(); std::copy(padded_data.begin(), padded_data.end(), data_ptr); } } else if (const auto* op = ggml_decoder->get_tensor_used_op(ggml_decoder->get_tensor_from_name(param_name)); op && op->op == GGML_OP_SET_ROWS && is_static && is_first_token) { input_tensor = ov::Tensor(ov::element::i64, ov::Shape{1}); } else { input_tensor = convert_ggml_input_to_ov(ggml_decoder, param_name); } } return input_tensor; } size_t checksum(const void* data, size_t size) { const uint8_t* bytes = static_cast(data); size_t sum = 0; for (size_t i = 0; i < size; ++i) { sum += (uint8_t) i; sum += bytes[i]; } return sum; } // Suppress deprecation warning for ov::Tensor::data() #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wdeprecated-declarations" void print_input_tensor_info(const std::string& name, const ov::Tensor& tensor) { std::cout << "Input name: " << name << ", Input shape: " << tensor.get_shape() << ", Address: " << tensor.data() << std::endl; switch (tensor.get_element_type()) { case ov::element::f32: std::cout << *(tensor.data()) << std::endl; break; case ov::element::f16: std::cout << *(tensor.data()) << std::endl; break; case ov::element::i32: for (size_t i = 0; i < tensor.get_size(); ++i) { std::cout << tensor.data()[i] << " "; } std::cout << std::endl; break; case ov::element::i64: std::cout << *(tensor.data()) << std::endl; break; default: break; } } void print_output_tensor_info(const std::string& name, const ov::Tensor& tensor, std::map& output_dst) { std::cout << "Output name: " << name << ", Output shape: " << tensor.get_shape() << ", Address: " << output_dst[name] << std::endl; auto print_float_stats = [](const std::string& type_name, size_t size, auto get_value) { if (size == 0) { return; } float first = get_value(0); float min = first; float max = first; double sum = first; for (size_t i = 1; i < size; ++i) { float v = get_value(i); if (v < min) { min = v; } if (v > max) { max = v; } sum += v; } double mean = sum / size; std::cout << std::right << std::setw(6) << type_name << std::right << std::setw(12) << "First" << std::setw(12) << "Min" << std::setw(12) << "Max" << std::setw(12) << "Mean" << std::endl; std::cout << std::right << std::setw(6) << "" << std::right << std::setw(12) << first << std::setw(12) << min << std::setw(12) << max << std::setw(12) << mean << std::endl; }; switch (tensor.get_element_type()) { case ov::element::f32: { const float* data = tensor.data(); size_t size = tensor.get_size(); print_float_stats("[f32]", size, [data](size_t i) { return data[i]; }); break; } case ov::element::f16: { const ov::float16* data = tensor.data(); size_t size = tensor.get_size(); print_float_stats("[f16]", size, [data](size_t i) { return static_cast(data[i]); }); break; } default: break; } } #pragma GCC diagnostic pop void set_zero_diagonal(std::vector& matrix, size_t dim) { for (size_t i = 0; i < dim; ++i) { matrix[i * dim + i] = 0.0f; } } bool is_prefill(struct ggml_cgraph* cgraph) { for (int i = 0; i < cgraph->n_nodes; ++i) { auto* op = cgraph->nodes[i]; for (int j = 0; j < GGML_MAX_SRC; ++j) { auto* src = op->src[j]; if (src == nullptr) { break; } if (std::string(src->name) == "inp_tokens") { return src->ne[0] != 1; } } } GGML_LOG_ERROR("is_prefill: inp_tokens not found in cgraph"); throw std::runtime_error("is_prefill: inp_tokens not found in cgraph"); }