#pragma once

#include <algorithm>
#include <cstdio>
#include <functional>
#include <memory>
#include <string>
#include <vector>

#include "ggml-qnn.h"

#include "logger.hpp"
#include "op-config.hpp"
#include "qnn-lib.hpp"
#include "tensor.hpp"

namespace qnn {

using ggml_tensor_array_t = std::vector<ggml_tensor *>;
using ggml_op_constructor_t = std::function<std::unique_ptr<ggml_qnn_op_config>(const std::string &)>;

class ggml_qnn_graph {
public:
    explicit ggml_qnn_graph(const std::string &graph_name, QNNBackend device,
                            std::shared_ptr<qnn_instance> qnn_instance, size_t vtcm_size_in_mb)
        : _graph_name(graph_name), _device(device), _qnn_instance(qnn_instance) {
        QNN_LOG_INFO("graph name %s", graph_name.c_str());

        auto qnn_interface = qnn_instance->get_qnn_interface();
        auto qnn_context = qnn_instance->get_qnn_context_handle();
        Qnn_ErrorHandle_t error = QNN_SUCCESS;
        Qnn_GraphHandle_t graph_handle = nullptr;
        if (device == QNN_BACKEND_NPU) {
            // TODO: fix graph config here for NPU
            QnnHtpGraph_CustomConfig_t hvx_config;
            hvx_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_NUM_HVX_THREADS;
            hvx_config.numHvxThreads = 8;
            QnnGraph_Config_t graph_hvx_config;
            graph_hvx_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM;
            graph_hvx_config.customConfig = &hvx_config;

            QnnHtpGraph_CustomConfig_t dlbc_config;
            dlbc_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION;
            dlbc_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_DLBC;
            dlbc_config.optimizationOption.floatValue = 1.0; // set to 0.0 to turn off DLBC
            QnnGraph_Config_t graph_dlbc_config;
            graph_dlbc_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM;
            graph_dlbc_config.customConfig = &dlbc_config;

            QnnHtpGraph_CustomConfig_t opt_config;
            opt_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION;
            opt_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG;
            opt_config.optimizationOption.floatValue = 1; // valid values: 1 (faster finalize) or 3 (more optimization)
            QnnGraph_Config_t graph_opt_config;
            graph_opt_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM;
            graph_opt_config.customConfig = &opt_config;

            QnnHtpGraph_CustomConfig_t vtcm_config;
            vtcm_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE;
            vtcm_config.vtcmSizeInMB = vtcm_size_in_mb;
            QnnGraph_Config_t graph_vtcm_config;
            graph_vtcm_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM;
            graph_vtcm_config.customConfig = &vtcm_config;

            const QnnGraph_Config_t *graph_configs[] = { &graph_hvx_config, &graph_dlbc_config, &graph_vtcm_config,
                                                         &graph_opt_config, nullptr };
            error = qnn_interface->qnn_graph_create(qnn_context, graph_name.c_str(), graph_configs, &graph_handle);
        } else {
            error = qnn_interface->qnn_graph_create(qnn_context, graph_name.c_str(), nullptr, &graph_handle);
        }

        if (error != QNN_SUCCESS) {
            QNN_LOG_ERROR("can't create qnn graph handle with graph name %s, error = %d\n", graph_name.c_str(),
                          error);
            return;
        }

        QNN_LOG_INFO("create qnn graph handle with graph name %s ok\n", graph_name.c_str());
        _graph_handle = graph_handle;
        _qnn_interface = qnn_interface;
    }

    ~ggml_qnn_graph() { QNN_LOG_DEBUG("graph name %s, destroy", _graph_name.c_str()); }
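    // Build a single-op graph: bind `tensor_inputs`/`tensor_outputs` to newly
    // created QNN tensors, add the node produced by `op_constructor`, then
    // finalize the graph. Must succeed before execute() is called.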
    bool build_graph(ggml_op_constructor_t op_constructor, const ggml_tensor_array_t &tensor_inputs,
                     const ggml_tensor_array_t &tensor_outputs) {
        GGML_ASSERT(op_constructor);
        if (!is_valid()) {
            QNN_LOG_ERROR("invalid graph\n");
            return false;
        }

        // get the max tensor rank across all inputs and outputs
        for (auto tensor : tensor_inputs) {
            _tensor_rank = std::max(_tensor_rank, ggml_n_dims(tensor));
        }
        for (auto tensor : tensor_outputs) {
            _tensor_rank = std::max(_tensor_rank, ggml_n_dims(tensor));
        }

        QNN_LOG_DEBUG("graph name %s, build_graph start", _graph_name.c_str());
        _tensor_inputs.resize(tensor_inputs.size());
        for (size_t i = 0; i < tensor_inputs.size(); i++) {
            char buffer[GGML_MAX_NAME] = {};
            snprintf(buffer, GGML_MAX_NAME, "src%d", (int)i);
            auto qnn_tensor =
                std::make_shared<ggml_qnn_tensor>(std::string(buffer), _device, _graph_handle, _qnn_instance);
            auto *ggml_tensor = tensor_inputs[i];
            if (!qnn_tensor->bind_ggml_tensor(ggml_tensor, true, _tensor_rank)) {
                QNN_LOG_ERROR("bind tensor %s failed\n", ggml_get_name(ggml_tensor));
                return false;
            }

            _tensor_inputs[i] = qnn_tensor;
        }

        _tensor_outputs.resize(tensor_outputs.size());
        for (size_t i = 0; i < tensor_outputs.size(); i++) {
            char buffer[GGML_MAX_NAME] = {};
            snprintf(buffer, GGML_MAX_NAME, "dst%d", (int)i);
            auto qnn_tensor =
                std::make_shared<ggml_qnn_tensor>(std::string(buffer), _device, _graph_handle, _qnn_instance);
            auto *ggml_tensor = tensor_outputs[i];
            if (!qnn_tensor->bind_ggml_tensor(ggml_tensor, false, _tensor_rank)) {
                QNN_LOG_ERROR("bind tensor %s failed\n", ggml_get_name(ggml_tensor));
                return false;
            }

            _tensor_outputs[i] = qnn_tensor;
        }

        _op_config = op_constructor(_graph_name);
        _op_config->set_input_tensors(_tensor_inputs);
        _op_config->set_output_tensors(_tensor_outputs);
        auto error = _qnn_interface->qnn_graph_add_node(_graph_handle, _op_config->get_op_config());
        if (error != QNN_SUCCESS) {
            auto *error_str = get_qnn_error_string(error);
            if (error_str) {
                QNN_LOG_ERROR("qnn_graph_add_node.error: %s\n", error_str);
            } else {
                QNN_LOG_ERROR("qnn_graph_add_node.error: %d\n", error);
            }
            return false;
        }

        error = _qnn_interface->qnn_graph_finalize(_graph_handle, nullptr, nullptr);
        if (error != QNN_SUCCESS) {
            auto *error_str = get_qnn_error_string(error);
            if (error_str) {
                QNN_LOG_ERROR("qnn_graph_finalize.error: %s\n", error_str);
            } else {
                QNN_LOG_ERROR("qnn_graph_finalize.error: %d\n", error);
            }
            return false;
        }

        QNN_LOG_DEBUG("graph name %s, build_graph succeeded", _graph_name.c_str());
        return true;
    }
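    // Run the finalized graph: rebind the current ggml buffers to the cached
    // QNN tensors (buffer addresses may change between invocations), execute,
    // then unbind. Tensor counts must match those passed to build_graph().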
    bool execute(const ggml_tensor_array_t &tensor_inputs, const ggml_tensor_array_t &tensor_outputs) {
        GGML_ASSERT(tensor_inputs.size() == _tensor_inputs.size());
        GGML_ASSERT(tensor_outputs.size() == _tensor_outputs.size());
        for (size_t i = 0; i < tensor_inputs.size(); i++) {
            auto *ggml_tensor = tensor_inputs[i];
            if (!_tensor_inputs[i]->bind_ggml_tensor(ggml_tensor, true, _tensor_rank)) {
                QNN_LOG_ERROR("bind tensor %s failed\n", ggml_get_name(ggml_tensor));
                return false;
            }
        }

        for (size_t i = 0; i < tensor_outputs.size(); i++) {
            auto *ggml_tensor = tensor_outputs[i];
            if (!_tensor_outputs[i]->bind_ggml_tensor(ggml_tensor, false, _tensor_rank)) {
                QNN_LOG_ERROR("bind tensor %s failed\n", ggml_get_name(ggml_tensor));
                return false;
            }
        }

        _op_config->set_input_tensors(_tensor_inputs);
        _op_config->set_output_tensors(_tensor_outputs);
        auto &qnn_tensor_inputs = _op_config->get_qnn_input_tensors();
        auto &qnn_tensor_outputs = _op_config->get_qnn_output_tensors();

        auto error =
            _qnn_interface->qnn_graph_execute(_graph_handle, qnn_tensor_inputs.data(), qnn_tensor_inputs.size(),
                                              qnn_tensor_outputs.data(), qnn_tensor_outputs.size(), nullptr, nullptr);
        if (_device == QNN_BACKEND_NPU && error == QNN_COMMON_ERROR_SYSTEM_COMMUNICATION) {
            QNN_LOG_WARN("NPU crashed: SSR detected, causing QNN graph execute error\n");
        }

        for (auto tensor : _tensor_inputs) {
            tensor->unbind_ggml_tensor();
        }

        for (auto tensor : _tensor_outputs) {
            tensor->unbind_ggml_tensor();
        }

        if (error != QNN_SUCCESS) {
            QNN_LOG_ERROR("qnn_graph_execute error = %d\n", error);
            return false;
        }

        return true;
    }

    bool is_valid() const { return _graph_handle != nullptr; }

    Qnn_GraphHandle_t get_graph_handler() const { return _graph_handle; }

    const std::string &get_name() const { return _graph_name; }

private:
    const std::string _graph_name;
    const QNNBackend _device;
    Qnn_GraphHandle_t _graph_handle = nullptr;
    std::shared_ptr<qnn_instance> _qnn_instance;
    std::shared_ptr<qnn_interface> _qnn_interface;
    std::vector<std::shared_ptr<ggml_qnn_tensor>> _tensor_inputs;
    std::vector<std::shared_ptr<ggml_qnn_tensor>> _tensor_outputs;
    std::unique_ptr<ggml_qnn_op_config> _op_config;
    std::vector<Qnn_Param_t> _param_types;
    int _tensor_rank = 0;

    DISABLE_COPY(ggml_qnn_graph);
    DISABLE_MOVE(ggml_qnn_graph);
};

} // namespace qnn
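// Usage sketch (illustrative only, assuming a backend that already owns a
// qnn_instance and an op-config factory; `instance` and `make_op_constructor`
// below are hypothetical names, not part of this header):
//
//   auto graph = std::make_unique<qnn::ggml_qnn_graph>(
//       "ggml_op_add", QNN_BACKEND_NPU, instance, /* vtcm_size_in_mb */ 8);
//   qnn::ggml_op_constructor_t op_ctor = make_op_constructor(GGML_OP_ADD);
//   if (graph->is_valid() && graph->build_graph(op_ctor, {src0, src1}, {dst})) {
//       graph->execute({src0, src1}, {dst}); // rebinds buffers on each call
//   }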