// llama.cpp/ggml/src/ggml-qnn/graph.hpp
#pragma once

#include <algorithm>
#include <cstdio>
#include <functional>
#include <memory>
#include <string>
#include <vector>

#include "ggml-qnn.h"

#include "logger.hpp"
#include "op-config.hpp"
#include "qnn-lib.hpp"
#include "tensor.hpp"
namespace qnn {

using ggml_tensor_array_t = std::vector<ggml_tensor *>;
using ggml_op_constructor_t = std::function<std::unique_ptr<qnn::ggml_qnn_op_config>(const std::string &)>;
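
// Illustrative sketch only: a ggml_op_constructor_t builds the op-config for a single QNN
// node from the graph name. A caller might wrap it in a lambda like the one below; the
// concrete op-config type used here is a placeholder, not something defined in op-config.hpp:
//
//   ggml_op_constructor_t make_op = [](const std::string &name) {
//       return std::unique_ptr<qnn::ggml_qnn_op_config>(new my_op_config(name)); // hypothetical subclass
//   };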
class ggml_qnn_graph {
public:
    explicit ggml_qnn_graph(const std::string &graph_name, QNNBackend device,
                            std::shared_ptr<qnn_instance> qnn_instance, size_t vtcm_size_in_mb) :
        _graph_name(graph_name), _device(device), _qnn_instance(qnn_instance) {
        QNN_LOG_INFO("graph name %s", graph_name.c_str());

        auto qnn_interface = qnn_instance->get_qnn_interface();
        auto qnn_context = qnn_instance->get_qnn_context_handle();
        Qnn_ErrorHandle_t error = QNN_SUCCESS;
        Qnn_GraphHandle_t graph_handle = nullptr;
        if (device == QNN_BACKEND_NPU) {
            // TODO: fix graph config here for NPU
            // number of HVX threads used by this graph
            QnnHtpGraph_CustomConfig_t hvx_config;
            hvx_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_NUM_HVX_THREADS;
            hvx_config.numHvxThreads = 8;
            QnnGraph_Config_t graph_hvx_config;
            graph_hvx_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM;
            graph_hvx_config.customConfig = &hvx_config;

            // DLBC (deep learning bandwidth compression)
            QnnHtpGraph_CustomConfig_t dlbc_config;
            dlbc_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION;
            dlbc_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_DLBC;
            dlbc_config.optimizationOption.floatValue = 1.0; // set to 0.0 to turn off DLBC
            QnnGraph_Config_t graph_dlbc_config;
            graph_dlbc_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM;
            graph_dlbc_config.customConfig = &dlbc_config;

            // finalize-time optimization level
            QnnHtpGraph_CustomConfig_t opt_config;
            opt_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION;
            opt_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG;
            opt_config.optimizationOption.floatValue = 1; // 1 / 3
            QnnGraph_Config_t graph_opt_config;
            graph_opt_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM;
            graph_opt_config.customConfig = &opt_config;

            QnnHtpGraph_CustomConfig_t vtcm_config;
            vtcm_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE;
            vtcm_config.vtcmSizeInMB = vtcm_size_in_mb;
            QnnGraph_Config_t graph_vtcm_config;
            graph_vtcm_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM;
            graph_vtcm_config.customConfig = &vtcm_config;

            const QnnGraph_Config_t *graph_configs[] = { &graph_hvx_config, &graph_dlbc_config, &graph_vtcm_config,
                                                         &graph_opt_config, nullptr };
            error = qnn_interface->qnn_graph_create(qnn_context, graph_name.c_str(), graph_configs, &graph_handle);
        } else {
            error = qnn_interface->qnn_graph_create(qnn_context, graph_name.c_str(), nullptr, &graph_handle);
        }

        if (error != QNN_SUCCESS) {
            QNN_LOG_ERROR(
                "can't create qnn graph handle with graph name %s, "
                "error = %d\n",
                graph_name.c_str(), error);
            return;
        }

        QNN_LOG_INFO("create qnn graph handle with graph name %s ok\n", graph_name.c_str());
        _graph_handle = graph_handle;
        _qnn_interface = qnn_interface;
    }

    ~ggml_qnn_graph() { QNN_LOG_DEBUG("graph name %s, destroy", _graph_name.c_str()); }
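
    // Creates and binds QNN tensors for the given ggml inputs/outputs, adds the single
    // op produced by op_constructor as a graph node, then finalizes the QNN graph.
    // Returns false if the graph handle is invalid or any QNN call fails.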
    bool build_graph(ggml_op_constructor_t op_constructor, const ggml_tensor_array_t &tensor_inputs,
                     const ggml_tensor_array_t &tensor_outputs) {
        GGML_ASSERT(op_constructor);
        if (!is_valid()) {
            QNN_LOG_ERROR("Invalid graph\n");
            return false;
        }

        // get the max tensor rank
        for (auto tensor : tensor_inputs) {
            _tensor_rank = std::max(_tensor_rank, ggml_n_dims(tensor));
        }
        for (auto tensor : tensor_outputs) {
            _tensor_rank = std::max(_tensor_rank, ggml_n_dims(tensor));
        }

        QNN_LOG_DEBUG("graph name %s, build_graph start", _graph_name.c_str());
        _tensor_inputs.resize(tensor_inputs.size());
        for (size_t i = 0; i < tensor_inputs.size(); i++) {
            char buffer[GGML_MAX_NAME] = {};
            snprintf(buffer, GGML_MAX_NAME, "src%d", (int)i);
            auto qnn_tensor =
                std::make_shared<ggml_qnn_tensor>(std::string(buffer), _device, _graph_handle, _qnn_instance);
            auto *ggml_tensor = tensor_inputs[i];
            if (!qnn_tensor->bind_ggml_tensor(ggml_tensor, true, _tensor_rank)) {
                QNN_LOG_ERROR("bind tensor %s failed\n", ggml_get_name(ggml_tensor));
                return false;
            }

            _tensor_inputs[i] = qnn_tensor;
        }

        _tensor_outputs.resize(tensor_outputs.size());
        for (size_t i = 0; i < tensor_outputs.size(); i++) {
            char buffer[GGML_MAX_NAME] = {};
            snprintf(buffer, GGML_MAX_NAME, "dst%d", (int)i);
            auto qnn_tensor =
                std::make_shared<ggml_qnn_tensor>(std::string(buffer), _device, _graph_handle, _qnn_instance);
            auto *ggml_tensor = tensor_outputs[i];
            if (!qnn_tensor->bind_ggml_tensor(ggml_tensor, false, _tensor_rank)) {
                QNN_LOG_ERROR("bind tensor %s failed\n", ggml_get_name(ggml_tensor));
                return false;
            }

            _tensor_outputs[i] = qnn_tensor;
        }

        _op_config = op_constructor(_graph_name);
        _op_config->set_input_tensors(_tensor_inputs);
        _op_config->set_output_tensors(_tensor_outputs);
        auto error = _qnn_interface->qnn_graph_add_node(_graph_handle, _op_config->get_op_config());
        if (error != QNN_SUCCESS) {
            auto *error_str = get_qnn_error_string(error);
            if (error_str) {
                QNN_LOG_ERROR("qnn_graph_add_node.error: %s\n", error_str);
            } else {
                QNN_LOG_ERROR("qnn_graph_add_node.error: %d\n", error);
            }
            return false;
        }

        error = _qnn_interface->qnn_graph_finalize(_graph_handle, nullptr, nullptr);
        if (error != QNN_SUCCESS) {
            auto *error_str = get_qnn_error_string(error);
            if (error_str) {
                QNN_LOG_ERROR("qnn_graph_finalize.error: %s\n", error_str);
            } else {
                QNN_LOG_ERROR("qnn_graph_finalize.error: %d\n", error);
            }
            return false;
        }

        QNN_LOG_DEBUG("graph name %s, build_graph succeed", _graph_name.c_str());
        return true;
    }
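
    // Re-binds the given ggml tensors to the QNN tensors created in build_graph() and
    // runs the finalized graph; the tensor counts must match the ones used at build time.
    // All bindings are released before returning, whether execution succeeds or not.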
    bool execute(const ggml_tensor_array_t &tensor_inputs, const ggml_tensor_array_t &tensor_outputs) {
        GGML_ASSERT(tensor_inputs.size() == _tensor_inputs.size());
        GGML_ASSERT(tensor_outputs.size() == _tensor_outputs.size());
        for (size_t i = 0; i < tensor_inputs.size(); i++) {
            auto *ggml_tensor = tensor_inputs[i];
            if (!_tensor_inputs[i]->bind_ggml_tensor(ggml_tensor, true, _tensor_rank)) {
                QNN_LOG_ERROR("bind tensor %s failed\n", ggml_get_name(ggml_tensor));
                return false;
            }
        }

        for (size_t i = 0; i < tensor_outputs.size(); i++) {
            auto *ggml_tensor = tensor_outputs[i];
            if (!_tensor_outputs[i]->bind_ggml_tensor(ggml_tensor, false, _tensor_rank)) {
                QNN_LOG_ERROR("bind tensor %s failed\n", ggml_get_name(ggml_tensor));
                return false;
            }
        }

        _op_config->set_input_tensors(_tensor_inputs);
        _op_config->set_output_tensors(_tensor_outputs);
        auto &qnn_tensor_inputs = _op_config->get_qnn_input_tensors();
        auto &qnn_tensor_outputs = _op_config->get_qnn_output_tensors();

        auto error =
            _qnn_interface->qnn_graph_execute(_graph_handle, qnn_tensor_inputs.data(), qnn_tensor_inputs.size(),
                                              qnn_tensor_outputs.data(), qnn_tensor_outputs.size(), nullptr, nullptr);
        if (_device == QNN_BACKEND_NPU) {
            if (error == QNN_COMMON_ERROR_SYSTEM_COMMUNICATION) {
                QNN_LOG_WARN("NPU crashed. SSR detected. Caused QNN graph execute error\n");
            }
        }

        for (auto tensor : _tensor_inputs) {
            tensor->unbind_ggml_tensor();
        }

        for (auto tensor : _tensor_outputs) {
            tensor->unbind_ggml_tensor();
        }

        if (error != QNN_SUCCESS) {
            QNN_LOG_ERROR("qnn_graph_execute.error: %d\n", error);
            return false;
        }

        return true;
    }

    bool is_valid() const { return _graph_handle != nullptr; }

    Qnn_GraphHandle_t get_graph_handler() const { return _graph_handle; }

    const std::string &get_name() const { return _graph_name; }

private:
    const std::string _graph_name;
    const QNNBackend _device;
    Qnn_GraphHandle_t _graph_handle = nullptr;
    std::shared_ptr<qnn_instance> _qnn_instance;
    std::shared_ptr<qnn_interface> _qnn_interface;
    std::vector<std::shared_ptr<ggml_qnn_tensor>> _tensor_inputs;
    std::vector<std::shared_ptr<ggml_qnn_tensor>> _tensor_outputs;
    std::unique_ptr<ggml_qnn_op_config> _op_config;
    std::vector<Qnn_Param_t> _param_types;
    int _tensor_rank = 0;

    DISABLE_COPY(ggml_qnn_graph);
    DISABLE_MOVE(ggml_qnn_graph);
};

} // namespace qnn
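
// Usage sketch (illustrative only; `instance`, `make_op`, and the ggml tensors below are
// assumptions standing in for whatever the caller already has at hand):
//
//   auto graph = std::make_unique<qnn::ggml_qnn_graph>("my_graph", QNN_BACKEND_NPU, instance,
//                                                      /* vtcm_size_in_mb */ 8);
//   if (graph->is_valid() && graph->build_graph(make_op, { src0, src1 }, { dst })) {
//       graph->execute({ src0, src1 }, { dst });
//   }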