[WIP] feat: perf opt (#10)
* reduce log
* wip
* add function to create concat nodes
* opt
* insert concat node before mulmat
* use resize op
* wip
* add bind_buffer and remove ggml prefix in tensor types
* use gather node instead
* fix tensor type, now succeed in gpu and cpu, failed in npu
* add comment
* wip
* add comment
* wip
* in destructor, clear internal buffer before unbind
* disable gather for npu
* wip
* count swap memory as free memory
* wip
* fix supported_types: ggml_backend_device_i.supports_op will be invoked before ggml_backend_device_i.init_backend
* rename create_tensors -> initialize_op_nodes
* move ggml_qnn_op_config to separate file
* wip
* add create_convert_nodes
* add comment
* enable different type in/out for npu and cpu backend
* fix npu convert op
* enlarge max buffer size
* add more error code
* check tensor type before create convert node
* add log
* add log
* remove transpose0 and use builtin transpose flag
* rename transpose1 -> transpose_out
* disable convert for npu
* add more logs
This commit is contained in:
parent 9f62fc9587
commit a2df09b6af
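The "use gather node instead" item above refers to broadcasting src0 across src1's batch dimensions with a QNN gather op instead of concat nodes. The diff builds a 1-D index tensor that repeats each batch index `scale` times. Below is a minimal standalone sketch of that index-repeat pattern, not part of the diff itself; the batch sizes are illustrative values only.

```cpp
// Standalone sketch (assumed values): build the gather index buffer that
// expands a batch dimension of size src_batch to dst_batch = src_batch * scale,
// mirroring the index computation in create_gather_nodes below.
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
    const uint32_t src_batch = 4;   // hypothetical src0 batch size (ne02)
    const uint32_t dst_batch = 12;  // hypothetical src1 batch size
    const uint32_t scale = dst_batch / src_batch;

    // index[i] = i / scale -> [0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3]
    std::vector<uint32_t> index(dst_batch);
    for (uint32_t i = 0; i < dst_batch; ++i) {
        index[i] = i / scale;
    }

    for (uint32_t v : index) {
        std::printf("%u ", v);
    }
    std::printf("\n");
    return 0;
}
```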
@@ -226,9 +226,8 @@ size_t ggml_backend_qnn_buffer_type_get_alignment(ggml_backend_buffer_type_t buf
size_t ggml_backend_qnn_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) {
GGML_UNUSED(buft);
// TODO: this value is an experimental value, works fine with
// whisper/llm/minicpm-v inference on Android
return (96 * 1024 * 1024);
// TODO: get the max size from device
return (1024 * 1024 * 1024);
}

bool ggml_backend_qnn_buffer_is_host(ggml_backend_buffer_type_t buft) {
@@ -339,6 +338,7 @@ void ggml_backend_qnn_device_get_memory(ggml_backend_dev_t dev, size_t *free, si
GGML_UNUSED(dev);
*free = qnn::get_system_free_memory_in_bytes();
*total = qnn::get_system_total_memory_in_bytes();
QNN_LOG_DEBUG("free memory: %ldMB, total memory: %ldMB", (*free / 1048576), (*total) / 1048576);
}

enum ggml_backend_dev_type ggml_backend_qnn_device_get_type(ggml_backend_dev_t dev) {
@@ -374,7 +374,7 @@ ggml_backend_t ggml_backend_qnn_init_with_device_context(ggml_backend_dev_t dev,
auto *dev_ctx = get_device_context(dev);
const auto device = dev_ctx->device;
QNN_LOG_DEBUG("device %d", device);
QNN_LOG_DEBUG("device %s", qnn::get_backend_name(device));
QNN_LOG_DEBUG("extend_lib_search_path %s", extend_lib_search_path);
std::string path = extend_lib_search_path;
@@ -386,7 +386,7 @@ ggml_backend_t ggml_backend_qnn_init_with_device_context(ggml_backend_dev_t dev,
"dsp:/vendor/dsp/images")
.c_str(),
1) == 0) {
QNN_LOG_INFO("QNN NPU backend setenv successfully");
QNN_LOG_DEBUG("QNN NPU backend setenv successfully");
} else {
QNN_LOG_ERROR("QNN NPU backend setenv failure");
}
@@ -395,13 +395,13 @@ ggml_backend_t ggml_backend_qnn_init_with_device_context(ggml_backend_dev_t dev,
"rfsa/adsp;/vendor/dsp/dsp;/vendor/dsp/images;/dsp")
.c_str(),
1) == 0) {
QNN_LOG_INFO("QNN NPU backend setenv successfully");
QNN_LOG_DEBUG("QNN NPU backend setenv successfully");
} else {
QNN_LOG_ERROR("QNN NPU backend setenv failure");
}
} else {
if (setenv("LD_LIBRARY_PATH", path.c_str(), 1) == 0) {
QNN_LOG_INFO("%s backend setenv successfully\n", qnn::get_backend_name(device));
QNN_LOG_DEBUG("%s backend setenv successfully\n", qnn::get_backend_name(device));
} else {
QNN_LOG_ERROR("%s backend setenv failure\n", qnn::get_backend_name(device));
}
@@ -454,6 +454,7 @@ ggml_backend_buffer_t ggml_backend_qnn_device_buffer_from_ptr(ggml_backend_dev_t
}

bool ggml_backend_qnn_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor *op) {
// Note that this function could be called before the device context is initialized
auto *device_ctx = get_device_context(dev);
return qnn::ggml_qnn_supports_op(device_ctx, op);
}
@@ -495,13 +496,15 @@ struct ggml_backend_qnn_reg_impl : ggml_backend_reg {
context = this;
iface = interface;

QNN_LOG_DEBUG("qnn backend registry init");
for (int i = 0; i < GGML_QNN_MAX_DEVICES; i++) {
const auto device_enum = (QNNBackend)(GGML_QNN_MAX_DEVICES - 1 - i); // init from the last device, i.e. NPU
device_contexts[i] = std::make_unique<ggml_backend_qnn_device_context>(
/* .device = */ device_enum, // init from the last device, i.e. NPU
/* .threads = */ 1,
/* .name = */ qnn::get_backend_name(device_enum),
/* .lib_name = */ kDeviceCaps[device_enum].lib_name);
/* .lib_name = */ kDeviceCaps[device_enum].lib_name,
/* .supported_types = */ kDeviceCaps[device_enum].supported_types);

auto &device = devices[i];
device.iface = ggml_backend_qnn_device_interface;
@@ -543,14 +543,17 @@ bool ggml_qnn_supports_tensor(ggml_backend_qnn_device_context *ctx, const ggml_t
return false;
}

#ifndef NDEBUG
auto *type_name = ggml_get_type_traits(tensor->type)->type_name;
#endif
switch (tensor->type) {
case GGML_TYPE_F32:
case GGML_TYPE_F16:
case GGML_TYPE_Q8_0:
case GGML_TYPE_Q4_0:
if (!(ctx->supported_types & (1 << tensor->type))) {
QNN_LOG_DEBUG("unsupported data type %s for backend %d", type_name, (int)ctx->device);
QNN_LOG_DEBUG("unsupported data type %s for backend %s, supported_types: 0x%x", type_name,
qnn::get_backend_name(ctx->device), ctx->supported_types);
return false;
}
break;
@@ -563,25 +566,42 @@ bool ggml_qnn_supports_tensor(ggml_backend_qnn_device_context *ctx, const ggml_t
}

bool ggml_qnn_supports_matmul_op(ggml_backend_qnn_device_context *ctx, const ggml_tensor *op) {
GGML_UNUSED(ctx);

auto *src0 = op->src[0];
auto *src1 = op->src[1];
if (src0->type != src1->type || src0->type != op->type) {
// current qnn implementation only supports the same type for src0 and src1
QNN_LOG_DEBUG("src0 type %d and src1 type %d and op type %d are not equal", src0->type, src1->type, op->type);
return false;
}

if (src0->ne[2] != src1->ne[2] || src0->ne[3] != src1->ne[3]) {
/*
* TODO: remove the blocker here when qnn backend supports mul_mat like this:
* [ne03, ne02, n, k] * [ne03 * x, ne02 * y, m, k] -> [ne03 * x, ne02 * y, m, n]
*/
QNN_LOG_DEBUG("src0 and src1 dimensions are not equal");
switch (ctx->device) {
case QNN_BACKEND_NPU:
if (src1->ne[2] != src0->ne[2] || src1->ne[3] != src0->ne[3]) {
/*
* TODO: remove the blocker here when NPU backend supports mul_mat like this:
* [ne03, ne02, n, k] * [ne03 * x, ne02 * y, m, k] -> [ne03 * x, ne02 * y, m, n]
*/
QNN_LOG_DEBUG("[qnn-npu] src0 and src1 dimensions are not equal, support/unsupported: %d/%d",
ctx->support_op_count.load(), ++(ctx->unsupported_op_count));
return false;
}
// fall through, from test here, the convert op is super slow on NPU:
// https://github.com/usefulsensors/qc_npu_benchmark
case QNN_BACKEND_GPU:
if (src0->type != src1->type || src0->type != op->type) {
// there's no convert op for GPU.
QNN_LOG_DEBUG("[qnn-gpu]type src0(%d), src1(%d) and op(%d) are not equal, support/unsupported: %d/%d",
src0->type, src1->type, op->type, ctx->support_op_count.load(),
++(ctx->unsupported_op_count));
return false;
}
break;
default:
break;
}

if ((src1->ne[2] % src0->ne[2]) != 0 || (src1->ne[3] % src0->ne[3]) != 0) {
QNN_LOG_DEBUG("[%s] src0 and src1 dimensions are not equal, support/unsupported: %d/%d",
qnn::get_backend_name(ctx->device), ctx->support_op_count.load(), ++(ctx->unsupported_op_count));
return false;
}

QNN_LOG_DEBUG("[%s] supported matmul op, support/unsupported: %d/%d", qnn::get_backend_name(ctx->device),
++(ctx->support_op_count), ctx->unsupported_op_count.load());
return true;
}
@@ -590,6 +610,7 @@ bool ggml_qnn_supports_matmul_op(ggml_backend_qnn_device_context *ctx, const ggm
namespace qnn {

bool ggml_qnn_supports_op(ggml_backend_qnn_device_context *ctx, const ggml_tensor *op) {
// Note that this function could be called before the device context is initialized
if (op->op == GGML_OP_NONE) {
return true;
}
@@ -1,6 +1,10 @@
#pragma once

#ifndef NDEBUG
#include <atomic>
#endif

#include <memory>
#include <string>
#include <unordered_map>
@@ -25,7 +29,7 @@ struct ggml_backend_qnn_device_context {
std::string name;
std::string lib_name;

// initialize in init
// initialize in qnn init
qnn::qcom_socinfo socinfo = {};
uint64_t supported_types;
std::shared_ptr<qnn::qnn_instance> instance;
@@ -33,7 +37,12 @@ struct ggml_backend_qnn_device_context {
qnn::ggml_qnn_graph_cache_t qnn_graph_cache;

explicit ggml_backend_qnn_device_context(QNNBackend device, size_t threads, const char *name,
const char *lib_name) :
device(device), threads(threads), name(name), lib_name(lib_name) {}
#ifndef NDEBUG
std::atomic_uint32_t support_op_count = 0;
std::atomic_uint32_t unsupported_op_count = 0;
#endif

explicit ggml_backend_qnn_device_context(QNNBackend device, size_t threads, const char *name, const char *lib_name,
uint64_t supported_types)
: device(device), threads(threads), name(name), lib_name(lib_name), supported_types(supported_types) {}
};
@@ -17,9 +17,9 @@ namespace qnn {
class ggml_qnn_graph {
public:
explicit ggml_qnn_graph(const std::string &graph_name, QNNBackend device,
std::shared_ptr<qnn_instance> qnn_instance, size_t vtcm_size_in_mb) :
_graph_name(graph_name), _device(device), _qnn_instance(qnn_instance) {
QNN_LOG_INFO("[%s]create", graph_name.c_str());
std::shared_ptr<qnn_instance> qnn_instance, size_t vtcm_size_in_mb)
: _graph_name(graph_name), _device(device), _qnn_instance(qnn_instance) {
QNN_LOG_DEBUG("[%s][%s]created", get_backend_name(device), graph_name.c_str());

auto qnn_interface = qnn_instance->get_qnn_interface();
auto qnn_context = qnn_instance->get_qnn_context_handle();
@@ -56,24 +56,25 @@ public:
graph_vtcm_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM;
graph_vtcm_config.customConfig = &vtcm_config;

const QnnGraph_Config_t *graph_configs[] = { &graph_hvx_config, &graph_dlbc_config, &graph_vtcm_config,
&graph_opt_config, nullptr };
const QnnGraph_Config_t *graph_configs[] = {&graph_hvx_config, &graph_dlbc_config, &graph_vtcm_config,
&graph_opt_config, nullptr};
error = qnn_interface->qnn_graph_create(qnn_context, graph_name.c_str(), graph_configs, &graph_handle);
} else {
error = qnn_interface->qnn_graph_create(qnn_context, graph_name.c_str(), nullptr, &graph_handle);
}

if (error != QNN_SUCCESS) {
QNN_LOG_INFO("[%s]can't create qnn graph handle, error = %d\n", graph_name.c_str(), error);
QNN_LOG_ERROR("[%s][%s]failed to create qnn graph, error: %s\n", get_backend_name(device),
graph_name.c_str(), get_qnn_error_string(error));
return;
}

QNN_LOG_INFO("[%s]create succeed\n", graph_name.c_str());
QNN_LOG_INFO("[%s][%s]create succeed\n", get_backend_name(device), graph_name.c_str());
_graph_handle = graph_handle;
_qnn_interface = qnn_interface;
}

~ggml_qnn_graph() { QNN_LOG_DEBUG("[%s]destroy", _graph_name.c_str()); }
~ggml_qnn_graph() { QNN_LOG_DEBUG("[%s][%s]destroy", get_backend_name(_device), _graph_name.c_str()); }

bool build_graph(ggml_op_constructor_t op_constructor, const ggml_tensor_array_t &tensor_inputs,
const ggml_tensor_array_t &tensor_outputs) {
@@ -83,10 +84,10 @@ public:
return false;
}

QNN_LOG_DEBUG("[%s]build_graph start", _graph_name.c_str());
QNN_LOG_DEBUG("[%s][%s]build_graph start", get_backend_name(_device), _graph_name.c_str());
_op_config = op_constructor(_graph_name, _qnn_instance);
if (!_op_config->create_tensors(_device, _graph_handle, tensor_inputs, tensor_outputs)) {
QNN_LOG_ERROR("[%s]create_tensors failed\n", _graph_name.c_str());
if (!_op_config->initialize_op_nodes(_device, _graph_handle, tensor_inputs, tensor_outputs)) {
QNN_LOG_ERROR("[%s][%s]initialize_op_nodes failed", get_backend_name(_device), _graph_name.c_str());
return false;
}
@@ -97,27 +98,23 @@ public:
auto error = _qnn_interface->qnn_graph_finalize(_graph_handle, nullptr, nullptr);
if (error != QNN_SUCCESS) {
auto *error_str = get_qnn_error_string(error);
if (error_str) {
QNN_LOG_ERROR("[%s]qnn_graph_finalize.error: %s\n", _graph_name.c_str(), error_str);
} else {
QNN_LOG_ERROR("[%s]qnn_graph_finalize.error: %d\n", _graph_name.c_str(), error);
}
QNN_LOG_ERROR("[%s][%s]qnn_graph_finalize.error: %s", get_backend_name(_device), _graph_name.c_str(),
get_qnn_error_string(error));
return false;
}

QNN_LOG_DEBUG("[%s]build_graph succeed", _graph_name.c_str());
QNN_LOG_DEBUG("[%s][%s]build_graph succeed", get_backend_name(_device), _graph_name.c_str());
return true;
}

bool execute(const ggml_tensor_array_t &tensor_inputs, const ggml_tensor_array_t &tensor_outputs) {
if (!_op_config->bind_input_tensors(tensor_inputs)) {
QNN_LOG_ERROR("[%s]bind input tensors failed\n", _graph_name.c_str());
QNN_LOG_ERROR("[%s][%s]bind input tensors failed\n", get_backend_name(_device), _graph_name.c_str());
return false;
}

if (!_op_config->bind_output_tensors(tensor_outputs)) {
QNN_LOG_ERROR("[%s]bind output tensors failed\n", _graph_name.c_str());
QNN_LOG_ERROR("[%s][%s]bind output tensors failed\n", get_backend_name(_device), _graph_name.c_str());
return false;
}
@@ -127,20 +124,21 @@ public:
auto error =
_qnn_interface->qnn_graph_execute(_graph_handle, qnn_tensor_inputs.data(), qnn_tensor_inputs.size(),
qnn_tensor_outputs.data(), qnn_tensor_outputs.size(), nullptr, nullptr);
if (_device == QNN_BACKEND_NPU) {
if (error == QNN_COMMON_ERROR_SYSTEM_COMMUNICATION) {
QNN_LOG_WARN("[%s]NPU crashed. SSR detected. Caused QNN graph execute error\n", _graph_name.c_str());
}
}

_op_config->unbind_input_tensors();
_op_config->unbind_output_tensors();

if (error != QNN_SUCCESS) {
QNN_LOG_INFO("[%s]error = %d\n", _graph_name.c_str(), error);
if (_device == QNN_BACKEND_NPU && error == QNN_COMMON_ERROR_SYSTEM_COMMUNICATION) {
QNN_LOG_WARN("[%s][%s]NPU crashed. SSR detected. Caused QNN graph execute error.",
get_backend_name(_device), _graph_name.c_str());
} else {
QNN_LOG_ERROR("[%s][%s]error: %s", get_backend_name(_device), _graph_name.c_str(),
get_qnn_error_string(error));
}
return false;
}

QNN_LOG_DEBUG("[%s][%s]execute succeed", get_backend_name(_device), _graph_name.c_str());
return true;
}
@@ -34,7 +34,7 @@ void qnn::internal_log(ggml_log_level level, const char * /*file*/, const char *
}

#if ENABLE_QNNSDK_LOG
void qnn::sdk_logcallback(const char *fmt, QnnLog_Level_t level, uint64_t timestamp, va_list argp) {
void qnn::sdk_logcallback(const char *fmt, QnnLog_Level_t level, uint64_t /*timestamp*/, va_list argp) {
static std::mutex log_mutex;
static unsigned char s_ggml_qnn_logbuf[QNN_LOGBUF_LEN];
@@ -60,13 +60,12 @@ void qnn::sdk_logcallback(const char *fmt, QnnLog_Level_t level, uint64_t timest
break;
}

double ms = (double)timestamp / 1000000.0;
{
std::lock_guard<std::mutex> lock(log_mutex);

memset(s_ggml_qnn_logbuf, 0, QNN_LOGBUF_LEN);
vsnprintf(reinterpret_cast<char *const>(s_ggml_qnn_logbuf), QNN_LOGBUF_LEN, fmt, argp);
QNN_LOG_INFO("%8.1fms [%-7s] %s", ms, log_level_desc, s_ggml_qnn_logbuf);
QNN_LOG_INFO("[%s]%s", log_level_desc, s_ggml_qnn_logbuf);
}
}
#else
@@ -0,0 +1,129 @@
#pragma once

#include <memory>
#include <vector>

#include "ggml-qnn.h"

#include "qnn-types.hpp"
#include "tensor.hpp"

namespace qnn {

using ggml_tensor_array_t = std::vector<ggml_tensor *>;

/**
 * @class ggml_qnn_op_config
 * @brief Abstract base class for configuring QNN operations.
 *
 * This class provides an interface for creating and managing tensors,
 * adding operations to a graph, and binding/unbinding input and output tensors.
 */
class ggml_qnn_op_config {
public:
virtual ~ggml_qnn_op_config() {}

/**
 * @brief Creates tensors and internal nodes for constructing the calculation graph.
 *
 * This pure virtual function is responsible for creating tensors on the given
 * backend device, associating them with the provided graph handle, and creating
 * the internal nodes necessary for constructing the calculation graph. It takes
 * input and output tensor arrays as parameters.
 *
 * @param device The backend device where tensors will be created.
 * @param graph_handle The handle to the graph where tensors and nodes will be associated.
 * @param tensor_inputs An array of input tensors.
 * @param tensor_outputs An array of output tensors.
 * @return true if tensors and nodes are successfully created, false otherwise.
 */
virtual bool initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle,
const ggml_tensor_array_t &tensor_inputs,
const ggml_tensor_array_t &tensor_outputs) = 0;

/**
 * @brief Pure virtual function to retrieve the input tensors for QNN (Quantized Neural Network).
 *
 * This function must be overridden by derived classes to provide the specific implementation
 * for retrieving the input tensors used in QNN operations.
 *
 * @return A reference to a vector of Qnn_Tensor_t objects representing the input tensors.
 */
virtual std::vector<Qnn_Tensor_t> &get_qnn_input_tensors() = 0;

/**
 * @brief Pure virtual function to retrieve the output tensors of a QNN (Quantized Neural Network).
 *
 * This function must be overridden by any derived class to provide access to the
 * output tensors of the QNN. The function returns a reference to a vector of
 * Qnn_Tensor_t objects, which represent the output tensors.
 *
 * @return std::vector<Qnn_Tensor_t>& Reference to a vector of Qnn_Tensor_t objects.
 */
virtual std::vector<Qnn_Tensor_t> &get_qnn_output_tensors() = 0;

/**
 * @brief Adds an operation to the given graph.
 *
 * This pure virtual function must be implemented by derived classes to add
 * a specific operation to the provided graph handle.
 *
 * This function will be called after `initialize_op_nodes` during initialization.
 *
 * @param graph_handle The handle to the graph where the operation will be added.
 * @return true if the operation was successfully added to the graph, false otherwise.
 */
virtual bool add_op_to_graph(Qnn_GraphHandle_t graph_handle) = 0;

/**
 * @brief Binds the input tensors to the operation.
 *
 * This pure virtual function must be implemented by derived classes to bind
 * the provided input tensors to the operation. The function takes a constant
 * reference to a ggml_tensor_array_t object, which contains the input tensors
 * to be bound.
 *
 * @param tensor_inputs A constant reference to a ggml_tensor_array_t object
 * containing the input tensors.
 * @return true if the input tensors were successfully bound, false otherwise.
 */
virtual bool bind_input_tensors(const ggml_tensor_array_t &tensor_inputs) = 0;

/**
 * @brief Binds the output tensors to the given tensor array.
 *
 * This pure virtual function must be implemented by derived classes to bind
 * the output tensors to the provided array of tensors. The function is expected
 * to establish the necessary connections or mappings between the output tensors
 * and the elements of the given tensor array.
 *
 * @param tensor_outputs A constant reference to an array of ggml tensors that
 * represent the output tensors to be bound.
 * @return true if the binding is successful, false otherwise.
 */
virtual bool bind_output_tensors(const ggml_tensor_array_t &tensor_outputs) = 0;

/**
 * @brief Unbinds the input tensors from the operation.
 *
 * This pure virtual function is intended to be overridden by derived classes
 * to implement the logic for unbinding or detaching input tensors that were
 * previously bound to the operation. This is typically used to release resources
 * or reset the state of the operation.
 */
virtual void unbind_input_tensors() = 0;

/**
 * @brief Unbinds the output tensors.
 *
 * This pure virtual function is responsible for unbinding or detaching
 * the output tensors from their current bindings. Implementations of this
 * function should ensure that any resources or references held by the
 * output tensors are properly released or reset.
 */
virtual void unbind_output_tensors() = 0;
};

using qnn_op_config_ptr_t = std::shared_ptr<ggml_qnn_op_config>;

} // namespace qnn
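The doc comments above describe a build/execute lifecycle: initialize_op_nodes and add_op_to_graph during graph construction, then bind, run, and unbind per execution. The following is a hedged, standalone sketch of that call order only, not code from the diff; the real interface uses QNN SDK handles, which are replaced here by stand-in types so the example compiles on its own.

```cpp
// Stand-in types (assumptions, not the real QNN SDK types) to illustrate the
// documented lifecycle of an op-config object.
#include <cstdio>
#include <vector>

using fake_graph_handle = int;
using fake_tensor_array = std::vector<int>;

struct fake_op_config {
    bool initialize_op_nodes(fake_graph_handle, const fake_tensor_array &, const fake_tensor_array &) { return true; }
    bool add_op_to_graph(fake_graph_handle) { return true; }
    bool bind_input_tensors(const fake_tensor_array &) { return true; }
    bool bind_output_tensors(const fake_tensor_array &) { return true; }
    void unbind_input_tensors() {}
    void unbind_output_tensors() {}
};

int main() {
    fake_graph_handle graph = 1;
    fake_tensor_array inputs = {1, 2}, outputs = {3};
    fake_op_config op;

    // Build phase: create tensors/nodes, then register the op with the graph.
    bool ok = op.initialize_op_nodes(graph, inputs, outputs) && op.add_op_to_graph(graph);
    // Execute phase: bind buffers, run the graph elsewhere, then unbind.
    ok = ok && op.bind_input_tensors(inputs) && op.bind_output_tensors(outputs);
    op.unbind_input_tensors();
    op.unbind_output_tensors();
    std::printf("lifecycle ok: %d\n", ok);
    return 0;
}
```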
@@ -36,7 +36,7 @@ int get_rank(const qnn::ggml_tensor_array_t &tensor_inputs, const qnn::ggml_tens
return tensor_rank;
}

Qnn_DataType_t get_tensor_type(const qnn::ggml_qnn_tensor_array_t &tensors) {
Qnn_DataType_t get_tensor_type(const qnn::qnn_tensor_array_t &tensors) {
Qnn_DataType_t type = QNN_DATATYPE_UNDEFINED;
for (auto tensor : tensors) {
auto tensor_type_size = qnn::qnn_datatype_size(tensor->get_data_type());
@@ -59,8 +59,7 @@ struct tensor_common_params {
};

void create_tensors_from_ggml_tensor(const tensor_common_params &params, const qnn::ggml_tensor_array_t &ggml_tensors,
qnn::ggml_qnn_tensor_array_t *tensor_wrappers,
std::vector<Qnn_Tensor_t> *qnn_tensors) {
qnn::qnn_tensor_array_t *tensor_wrappers, std::vector<Qnn_Tensor_t> *qnn_tensors) {
using namespace qnn;

tensor_wrappers->resize(ggml_tensors.size());
@@ -78,7 +77,7 @@ void create_tensors_from_ggml_tensor(const tensor_common_params &params, const q
}
}

bool bind_tensors(const qnn::ggml_tensor_array_t &ggml_tensors, qnn::ggml_qnn_tensor_array_t &tensor_wrappers,
bool bind_tensors(const qnn::ggml_tensor_array_t &ggml_tensors, qnn::qnn_tensor_array_t &tensor_wrappers,
std::vector<Qnn_Tensor_t> &qnn_tensors) {
for (size_t i = 0; i < ggml_tensors.size(); i++) {
auto *ggml_tensor = ggml_tensors[i];
@@ -99,9 +98,9 @@ public:
const std::string &op_type, std::shared_ptr<qnn::qnn_instance> qnn_instance)
: ggml_qnn_op_config_base(name, package_name, op_type, qnn_instance) {}

bool create_tensors(QNNBackend device, Qnn_GraphHandle_t graph_handle,
const qnn::ggml_tensor_array_t &tensor_inputs,
const qnn::ggml_tensor_array_t &tensor_outputs) override {
bool initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle,
const qnn::ggml_tensor_array_t &tensor_inputs,
const qnn::ggml_tensor_array_t &tensor_outputs) override {
GGML_UNUSED(device);
GGML_UNUSED(graph_handle);
GGML_UNUSED(tensor_inputs);
@@ -109,28 +108,28 @@ public:
return true;
}

void set_input_tensors(qnn::ggml_qnn_tensor_array_t &tensor_inputs) {
void set_input_tensors(qnn::qnn_tensor_array_t &tensor_inputs) {
_tensor_inputs = tensor_inputs;
_qnn_tensor_inputs.resize(_tensor_inputs.size());
}

void set_input_tensors(qnn::ggml_qnn_tensor_array_t &&tensor_inputs) {
void set_input_tensors(qnn::qnn_tensor_array_t &&tensor_inputs) {
_tensor_inputs = std::move(tensor_inputs);
_qnn_tensor_inputs.resize(_tensor_inputs.size());
}

void set_output_tensors(qnn::ggml_qnn_tensor_array_t &tensor_outputs) {
void set_output_tensors(qnn::qnn_tensor_array_t &tensor_outputs) {
_tensor_outputs = tensor_outputs;
_qnn_tensor_outputs.resize(_tensor_outputs.size());
}

void set_output_tensors(qnn::ggml_qnn_tensor_array_t &&tensor_outputs) {
void set_output_tensors(qnn::qnn_tensor_array_t &&tensor_outputs) {
_tensor_outputs = std::move(tensor_outputs);
_qnn_tensor_outputs.resize(_tensor_outputs.size());
}

qnn::ggml_qnn_tensor_array_t &get_input_tensors() { return _tensor_inputs; }
qnn::ggml_qnn_tensor_array_t &get_output_tensors() { return _tensor_outputs; }
qnn::qnn_tensor_array_t &get_input_tensors() { return _tensor_inputs; }
qnn::qnn_tensor_array_t &get_output_tensors() { return _tensor_outputs; }

private:
DISABLE_COPY(ggml_qnn_connectable_op_config);
@@ -186,7 +185,7 @@ bool ggml_qnn_op_config_base::add_op_to_graph(Qnn_GraphHandle_t graph_handle) {
GGML_ASSERT(_qnn_tensor_inputs.size() == _tensor_inputs.size());
GGML_ASSERT(_qnn_tensor_outputs.size() == _tensor_outputs.size());

auto qnn_interface = _qnn_instance->get_qnn_interface();
QNN_LOG_DEBUG("[%s]add to graph start\n", _name.c_str());
for (size_t i = 0; i < _tensor_inputs.size(); i++) {
auto tensor = _tensor_inputs[i];
if (!tensor->alloc_qnn_tensor_id()) {
@@ -194,6 +193,7 @@ bool ggml_qnn_op_config_base::add_op_to_graph(Qnn_GraphHandle_t graph_handle) {
return false;
}

QNN_LOG_DEBUG("[%s]input tensor id: %d\n", _name.c_str(), tensor->get_qnn_tensor_id());
_qnn_tensor_inputs[i] = tensor->get_qnn_tensor();
}
@@ -203,21 +203,19 @@ bool ggml_qnn_op_config_base::add_op_to_graph(Qnn_GraphHandle_t graph_handle) {
QNN_LOG_ERROR("[%s]output tensor alloc_qnn_tensor_id failed\n", _name.c_str());
return false;
}
_qnn_tensor_outputs[i] = _tensor_outputs[i]->get_qnn_tensor();

QNN_LOG_DEBUG("[%s]output tensor id: %d\n", _name.c_str(), tensor->get_qnn_tensor_id());
_qnn_tensor_outputs[i] = tensor->get_qnn_tensor();
}

auto qnn_interface = _qnn_instance->get_qnn_interface();
auto error = qnn_interface->qnn_graph_add_node(graph_handle, get_op_config());
if (error != QNN_SUCCESS) {
auto *error_str = get_qnn_error_string(error);
if (error_str) {
QNN_LOG_ERROR("[%s]qnn_graph_add_node.error: %s\n", _name.c_str(), error_str);
} else {
QNN_LOG_ERROR("[%s]qnn_graph_add_node.error: %d\n", _name.c_str(), error);
}
QNN_LOG_ERROR("[%s]qnn_graph_add_node.error: %s", _name.c_str(), get_qnn_error_string(error));
return false;
}

QNN_LOG_DEBUG("[%s]added to graph\n", _name.c_str());
QNN_LOG_DEBUG("[%s]added to graph succeed\n", _name.c_str());
return true;
}
@@ -259,9 +257,9 @@ Qnn_OpConfig_t ggml_qnn_op_config_base::get_op_config() {
return config;
}

bool ggml_qnn_single_op_config::create_tensors(QNNBackend device, Qnn_GraphHandle_t graph_handle,
const ggml_tensor_array_t &tensor_inputs,
const ggml_tensor_array_t &tensor_outputs) {
bool ggml_qnn_single_op_config::initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle,
const ggml_tensor_array_t &tensor_inputs,
const ggml_tensor_array_t &tensor_outputs) {
const auto tensor_rank = get_rank(tensor_inputs, tensor_outputs);
tensor_common_params params = {"src", tensor_rank, true, device, graph_handle, _qnn_instance};
create_tensors_from_ggml_tensor(params, tensor_inputs, &_tensor_inputs, &_qnn_tensor_inputs);
@@ -282,9 +280,9 @@ bool ggml_qnn_single_op_config::create_tensors(QNNBackend device, Qnn_GraphHandl
return true;
}

bool ggml_qnn_matmul_op_config::create_tensors(QNNBackend device, Qnn_GraphHandle_t graph_handle,
const ggml_tensor_array_t &tensor_inputs,
const ggml_tensor_array_t &tensor_outputs) {
bool ggml_qnn_matmul_op_config::initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle,
const ggml_tensor_array_t &tensor_inputs,
const ggml_tensor_array_t &tensor_outputs) {
GGML_ASSERT(tensor_inputs.size() == 2);
GGML_ASSERT(tensor_outputs.size() == 1);
const auto tensor_rank = get_rank(tensor_inputs, tensor_outputs);
@@ -295,59 +293,143 @@ bool ggml_qnn_matmul_op_config::create_tensors(QNNBackend device, Qnn_GraphHandl
create_tensors_from_ggml_tensor(params, tensor_inputs, &_tensor_inputs, &_qnn_tensor_inputs);

// create output tensor
ggml_qnn_tensor_array_t mat_mul_tensor_outputs;
qnn_tensor_array_t mat_mul_tensor_outputs;
params.name_prefix = "dst";
params.is_input = false;
create_tensors_from_ggml_tensor(params, tensor_outputs, &mat_mul_tensor_outputs, nullptr);

// create convert nodes
qnn_tensor_array_t mat_mul_tensor_inputs = _tensor_inputs;
if (!create_convert_nodes(device, graph_handle, tensor_rank, mat_mul_tensor_inputs, mat_mul_tensor_outputs)) {
QNN_LOG_ERROR("create convert nodes failed\n");
return false;
}

mat_mul_tensor_inputs.front() = create_gather_nodes(device, graph_handle, tensor_rank, _tensor_inputs.front(),
_tensor_inputs.back()->get_dimensions());
return create_mat_mul_nodes(device, graph_handle, tensor_rank, mat_mul_tensor_inputs, mat_mul_tensor_outputs);
}

qnn_tensor_ptr_t ggml_qnn_matmul_op_config::create_gather_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle,
const int rank, qnn_tensor_ptr_t tensor_input,
qnn_dimension_array_t output_dimensions) {
if (rank <= 2) {
return tensor_input;
}

const auto &input_dimensions = tensor_input->get_dimensions();
output_dimensions[rank - 1] = input_dimensions[rank - 1];
output_dimensions[rank - 2] = input_dimensions[rank - 2];

const auto y = output_dimensions[rank - 3] / input_dimensions[rank - 3];
if (y == 1 && (rank == 3 || (rank == 4 && output_dimensions[rank - 4] == input_dimensions[rank - 4]))) {
return tensor_input;
}

// create concat nodes, to convert tensor shape from [ne03, ne02, n, k] to [ne03 * x, ne02 * y, n, k]
constexpr const auto create_node =
[](const std::string &name, const int rank, const int axis, const qnn_dimension_array_t &dimensions,
qnn_tensor_ptr_t tensor_input, QNNBackend device, Qnn_GraphHandle_t graph_handle,
std::shared_ptr<qnn_instance> qnn_instance, qnn_tensor_ptr_t &tensor_output) -> qnn_op_config_ptr_t {
auto gather_out =
std::make_shared<ggml_qnn_tensor>(ggml_qnn_tensor::INTERMEDIATE, name + "_out", dimensions,
tensor_input->get_data_type(), rank, device, graph_handle, qnn_instance);
auto gather_op = std::make_shared<ggml_qnn_connectable_op_config>(name, QNN_OP_PACKAGE_NAME_QTI_AISW,
QNN_OP_GATHER, qnn_instance);

Qnn_Scalar_t scalar = QNN_SCALAR_INIT;
scalar.dataType = QNN_DATATYPE_INT_32;
scalar.int32Value = axis;
gather_op->add_scalar_param(QNN_OP_GATHER_PARAM_AXIS, scalar);
gather_op->set_output_tensors({gather_out});

// here we calculate the index mapping, will generate a 1d tensor like [0, 0, 0, 1, 1, 1, 2, 2, 2, ...],
// by repeating each index [scale] times.
const auto scale = dimensions[axis] / tensor_input->get_dimensions()[axis];
std::vector<uint8_t> index_buffer(dimensions[axis] * sizeof(uint32_t));
for (uint32_t *curr = reinterpret_cast<uint32_t *>(index_buffer.data()), *end = curr + dimensions[axis];
curr < end; curr++) {
*curr = (curr - reinterpret_cast<uint32_t *>(index_buffer.data())) / scale;
}

auto gather_index = std::make_shared<ggml_qnn_tensor>(
ggml_qnn_tensor::PARAMETER, name + "_index", qnn_dimension_array_t{dimensions[axis]}, QNN_DATATYPE_UINT_32,
1, device, graph_handle, qnn_instance);
gather_index->set_data_buffer(std::move(index_buffer));
gather_op->set_input_tensors({tensor_input, gather_index});

tensor_output = gather_out;
return gather_op;
};

qnn_dimension_array_t intermediate_dimensions = input_dimensions;
intermediate_dimensions[rank - 3] = output_dimensions[rank - 3];
qnn_tensor_ptr_t gather0_out;
_gather0 = create_node(_name + "_gather0", rank, rank - 3, intermediate_dimensions, tensor_input, device,
graph_handle, _qnn_instance, gather0_out);
if (rank == 3) {
return gather0_out;
}

qnn_tensor_ptr_t gather1_out;
_gather1 = create_node(_name + "_gather1", rank, rank - 4, output_dimensions, gather0_out, device, graph_handle,
_qnn_instance, gather1_out);
return gather1_out;
}

bool ggml_qnn_matmul_op_config::create_convert_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, const int rank,
qnn_tensor_array_t &tensor_inputs,
qnn_tensor_array_t &tensor_outputs) {
if (device == QNN_BACKEND_GPU) {
// there's no convert op for GPU, so we should create matmul nodes directl.
return create_mat_mul_nodes(device, graph_handle, tensor_rank, _tensor_inputs, mat_mul_tensor_outputs);
// there's no convert op for GPU, so we should create matmul nodes directly.
return true;
}

// create tensors for convert node
ggml_qnn_tensor_array_t mat_mul_tensor_inputs = _tensor_inputs;
auto input_tensor_type = get_tensor_type(mat_mul_tensor_inputs);
QNN_LOG_DEBUG("matmul input tensor type: %s\n", qnn_datatype_to_string(input_tensor_type));
auto tensor_type = get_tensor_type(tensor_inputs);
QNN_LOG_DEBUG("input tensor type: %s\n", qnn_datatype_to_string(tensor_type));

_input_converts.resize(mat_mul_tensor_inputs.size());
for (size_t i = 0; i < mat_mul_tensor_inputs.size(); ++i) {
_input_converts.resize(tensor_inputs.size());
for (size_t i = 0; i < tensor_inputs.size(); ++i) {
// create input convert nodes
auto convert_in = tensor_inputs[i];
if (convert_in->get_data_type() == tensor_type) {
continue;
}

std::string convert_name("convert_src" + std::to_string(i));
auto convert_in = mat_mul_tensor_inputs[i];
auto convert_out = std::make_shared<ggml_qnn_tensor>(ggml_qnn_tensor::INTERMEDIATE, convert_name + "_out",
convert_in->get_dimensions(), input_tensor_type,
tensor_rank, device, graph_handle, _qnn_instance);
convert_in->get_dimensions(), tensor_type, rank, device,
graph_handle, _qnn_instance);
auto convert = std::make_shared<ggml_qnn_connectable_op_config>(convert_name, QNN_OP_PACKAGE_NAME_QTI_AISW,
QNN_OP_CONVERT, _qnn_instance);
convert->set_input_tensors({convert_in});
convert->set_output_tensors({convert_out});
mat_mul_tensor_inputs[i] = convert_out;
tensor_inputs[i] = convert_out;
_input_converts[i] = convert;
}

{
if (tensor_outputs.front()->get_data_type() != tensor_type) {
// create output convert node
std::string convert_name("convert_dst");
auto convert_out = mat_mul_tensor_outputs.front();
auto convert_out = tensor_outputs.front();
auto convert_in = std::make_shared<ggml_qnn_tensor>(ggml_qnn_tensor::INTERMEDIATE, convert_name + "_in",
convert_out->get_dimensions(), input_tensor_type,
tensor_rank, device, graph_handle, _qnn_instance);
convert_out->get_dimensions(), tensor_type, rank, device,
graph_handle, _qnn_instance);
auto output_convert = std::make_shared<ggml_qnn_connectable_op_config>(
convert_name, QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_CONVERT, _qnn_instance);
output_convert->set_input_tensors({convert_in});
output_convert->set_output_tensors({convert_out});
mat_mul_tensor_outputs[0] = convert_in;
tensor_outputs.front() = convert_in;
_output_convert = output_convert;
}

// create mat_mul nodes
return create_mat_mul_nodes(device, graph_handle, tensor_rank, mat_mul_tensor_inputs, mat_mul_tensor_outputs);
return true;
}

bool ggml_qnn_matmul_op_config::create_mat_mul_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, const int rank,
ggml_qnn_tensor_array_t &tensor_inputs,
ggml_qnn_tensor_array_t &tensor_outputs) {
qnn_tensor_array_t &tensor_inputs,
qnn_tensor_array_t &tensor_outputs) {

/*
 * First, both the ggml and qnn tensor in memory are stored as row-major format. (For more details, please also:
@@ -386,9 +468,8 @@ bool ggml_qnn_matmul_op_config::create_mat_mul_nodes(QNNBackend device, Qnn_Grap
 * ```mermaid
 * graph TD;
 * i1>ggml_tensor_in0] --src0--> mat_mul0;
 * i2>ggml_tensor_in1] --src1--> transpose0;
 * transpose0 --src0_trans--> mat_mul0;
 * mat_mul0 --dst_trans--> transpose1;
 * i2>ggml_tensor_in1] --src1--> mat_mul0;
 * mat_mul0 --dst_trans--> transpose_out;
 * transpose1 --dst0--> o1>ggml_tensor_out];
 * ```
 */
@@ -398,9 +479,6 @@ bool ggml_qnn_matmul_op_config::create_mat_mul_nodes(QNNBackend device, Qnn_Grap
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS does not match the expected value");

qnn_dimension_array_t dimensions = get_transposed_dimensions(src1->get_dimensions(), rank);
auto src0_trans =
std::make_shared<ggml_qnn_tensor>(ggml_qnn_tensor::INTERMEDIATE, "src0_trans", dimensions,
src1->get_data_type(), rank, device, graph_handle, _qnn_instance);

// create dst_trans tensor
auto dst = tensor_outputs.front();
@@ -408,48 +486,37 @@ bool ggml_qnn_matmul_op_config::create_mat_mul_nodes(QNNBackend device, Qnn_Grap
auto dst_trans = std::make_shared<ggml_qnn_tensor>(ggml_qnn_tensor::INTERMEDIATE, "dst_trans", dimensions,
dst->get_data_type(), rank, device, graph_handle, _qnn_instance);

// create transpose0
auto transpose0 = std::make_shared<ggml_qnn_connectable_op_config>(_name + "_trans0", QNN_OP_PACKAGE_NAME_QTI_AISW,
QNN_OP_TRANSPOSE, _qnn_instance);

// create transpose1
auto transpose1 = std::make_shared<ggml_qnn_connectable_op_config>(_name + "_trans1", QNN_OP_PACKAGE_NAME_QTI_AISW,
QNN_OP_TRANSPOSE, _qnn_instance);
// create transpose_out
auto transpose_out = std::make_shared<ggml_qnn_connectable_op_config>(
_name + "_trans1", QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_TRANSPOSE, _qnn_instance);

// create mat_mul
auto mat_mul = std::make_shared<ggml_qnn_connectable_op_config>(_name, QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_MAT_MUL,
_qnn_instance);

// set transpose0 parameters
Qnn_Scalar_t scalar = QNN_SCALAR_INIT;
scalar.dataType = QNN_DATATYPE_BOOL_8;
scalar.bool8Value = 1;
mat_mul->add_scalar_param(QNN_OP_MAT_MUL_PARAM_TRANSPOSE_IN1, scalar);

// set transpose_out parameters
auto *params_data = reinterpret_cast<const uint8_t *>(kTransposeParamData[rank - 1].data());
const qnn_dimension_array_t param_dims = {(uint32_t)rank, 1, 1, 1};
transpose0->add_tensor_param(QNN_OP_TRANSPOSE_PARAM_PERM, param_dims, 1, params_data, QNN_DATATYPE_UINT_32, device,
graph_handle);

// set transpose1 parameters
transpose1->add_tensor_param(QNN_OP_TRANSPOSE_PARAM_PERM, param_dims, 1, params_data, QNN_DATATYPE_UINT_32, device,
graph_handle);

// set tensor to transpose0
ggml_qnn_tensor_array_t tensors = {tensor_inputs.back()};
transpose0->set_input_tensors(tensors);
tensors = {src0_trans};
transpose0->set_output_tensors(tensors);
transpose_out->add_tensor_param(QNN_OP_TRANSPOSE_PARAM_PERM, param_dims, 1, params_data, QNN_DATATYPE_UINT_32,
device, graph_handle);

// set tensor to mat_mul
tensors = {tensor_inputs.front(), src0_trans};
mat_mul->set_input_tensors(tensors);
tensors = {dst_trans};
mat_mul->set_input_tensors(tensor_inputs);
qnn_tensor_array_t tensors = {dst_trans};
mat_mul->set_output_tensors(tensors);

// set tensor to transpose1
// set tensor to transpose_out
tensors = {dst_trans};
transpose1->set_input_tensors(tensors);
transpose1->set_output_tensors(tensor_outputs);
transpose_out->set_input_tensors(tensors);
transpose_out->set_output_tensors(tensor_outputs);

_mat_mul = mat_mul;
_transpose0 = transpose0;
_transpose1 = transpose1;
_transpose_out = transpose_out;
return true;
}
@@ -460,8 +527,15 @@ bool ggml_qnn_matmul_op_config::add_op_to_graph(Qnn_GraphHandle_t graph_handle)
}
}

return _transpose0->add_op_to_graph(graph_handle) && _mat_mul->add_op_to_graph(graph_handle) &&
_transpose1->add_op_to_graph(graph_handle) &&
if (_gather0 && !_gather0->add_op_to_graph(graph_handle)) {
return false;
}

if (_gather1 && !_gather1->add_op_to_graph(graph_handle)) {
return false;
}

return _mat_mul->add_op_to_graph(graph_handle) && _transpose_out->add_op_to_graph(graph_handle) &&
(!_output_convert || _output_convert->add_op_to_graph(graph_handle));
}
@@ -473,13 +547,12 @@ bool ggml_qnn_matmul_op_config::bind_output_tensors(const ggml_tensor_array_t &t
if (_output_convert) {
return _output_convert->bind_output_tensors(tensor_outputs);
} else {
return _transpose1->bind_output_tensors(tensor_outputs);
return _transpose_out->bind_output_tensors(tensor_outputs);
}
}

void ggml_qnn_matmul_op_config::unbind_input_tensors() {
_mat_mul->unbind_input_tensors();
_transpose0->unbind_input_tensors();
for (auto &convert : _input_converts) {
if (convert) {
convert->unbind_input_tensors();
@@ -488,7 +561,7 @@ void ggml_qnn_matmul_op_config::unbind_input_tensors() {
}
}

void ggml_qnn_matmul_op_config::unbind_output_tensors() {
_transpose1->unbind_output_tensors();
_transpose_out->unbind_output_tensors();
if (_output_convert) {
_output_convert->unbind_output_tensors();
}
@@ -498,7 +571,7 @@ std::vector<Qnn_Tensor_t> &ggml_qnn_matmul_op_config::get_qnn_output_tensors() {
if (_output_convert) {
return _output_convert->get_qnn_output_tensors();
} else {
return _transpose1->get_qnn_output_tensors();
return _transpose_out->get_qnn_output_tensors();
}
}
@@ -513,9 +586,9 @@ ggml_op_constructor_t create_op_constructor(const std::string &op_name) {
} else if (op_name == QNN_OP_TRANSPOSE) {
return [](const std::string &instance_name,
std::shared_ptr<qnn::qnn_instance> qnn_instance) -> std::unique_ptr<qnn::ggml_qnn_op_config> {
return std::make_unique<qnn::ggml_qnn_single_op_config>(instance_name, QNN_OP_PACKAGE_NAME_QTI_AISW,
QNN_OP_TRANSPOSE, QNN_OP_TRANSPOSE_PARAM_PERM,
QNN_DATATYPE_UINT_32, 4 * sizeof(uint32_t), qnn_instance);
return std::make_unique<qnn::ggml_qnn_single_op_config>(
instance_name, QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_TRANSPOSE, QNN_OP_TRANSPOSE_PARAM_PERM,
QNN_DATATYPE_UINT_32, 4 * sizeof(uint32_t), qnn_instance);
};
}
@@ -5,31 +5,13 @@
#include <string>
#include <vector>

#include "ggml-qnn.h"

#include "op-config-base.hpp"
#include "qnn-lib.hpp"
#include "qnn-types.hpp"
#include "tensor.hpp"

namespace qnn {

using ggml_tensor_array_t = std::vector<ggml_tensor *>;

class ggml_qnn_op_config {
public:
virtual ~ggml_qnn_op_config() {}
virtual bool create_tensors(QNNBackend device, Qnn_GraphHandle_t graph_handle,
const ggml_tensor_array_t &tensor_inputs,
const ggml_tensor_array_t &tensor_outputs) = 0;
virtual std::vector<Qnn_Tensor_t> &get_qnn_input_tensors() = 0;
virtual std::vector<Qnn_Tensor_t> &get_qnn_output_tensors() = 0;
virtual bool add_op_to_graph(Qnn_GraphHandle_t graph_handle) = 0;
virtual bool bind_input_tensors(const ggml_tensor_array_t &tensor_inputs) = 0;
virtual bool bind_output_tensors(const ggml_tensor_array_t &tensor_outputs) = 0;
virtual void unbind_input_tensors() = 0;
virtual void unbind_output_tensors() = 0;
};

using ggml_op_constructor_t =
std::function<std::unique_ptr<ggml_qnn_op_config>(const std::string &, std::shared_ptr<qnn_instance>)>;
|
|||
std::string _package_name;
|
||||
std::string _op_type;
|
||||
std::shared_ptr<qnn_instance> _qnn_instance;
|
||||
ggml_qnn_tensor_array_t _tensor_inputs;
|
||||
ggml_qnn_tensor_array_t _tensor_outputs;
|
||||
ggml_qnn_tensor_array_t _tensor_parameters;
|
||||
qnn_tensor_array_t _tensor_inputs;
|
||||
qnn_tensor_array_t _tensor_outputs;
|
||||
qnn_tensor_array_t _tensor_parameters;
|
||||
std::vector<Qnn_Tensor_t> _qnn_tensor_inputs;
|
||||
std::vector<Qnn_Tensor_t> _qnn_tensor_outputs;
|
||||
std::vector<Qnn_Param_t> _qnn_parameters;
|
||||
|
|
@@ -87,8 +69,9 @@ public:
_param_type(param_type),
_param_buffer(param_size) {}

bool create_tensors(QNNBackend device, Qnn_GraphHandle_t graph_handle, const ggml_tensor_array_t &tensor_inputs,
const ggml_tensor_array_t &tensor_outputs) override;
bool initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle,
const ggml_tensor_array_t &tensor_inputs,
const ggml_tensor_array_t &tensor_outputs) override;

private:
const std::string _param_name;
@@ -104,8 +87,9 @@ public:
ggml_qnn_matmul_op_config(const std::string &name, std::shared_ptr<qnn_instance> qnn_instance)
: _name(name), _qnn_instance(qnn_instance) {}

bool create_tensors(QNNBackend device, Qnn_GraphHandle_t graph_handle, const ggml_tensor_array_t &tensor_inputs,
const ggml_tensor_array_t &tensor_outputs) override;
bool initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle,
const ggml_tensor_array_t &tensor_inputs,
const ggml_tensor_array_t &tensor_outputs) override;
bool add_op_to_graph(Qnn_GraphHandle_t graph_handle) override;
bool bind_input_tensors(const ggml_tensor_array_t &tensor_inputs) override;
bool bind_output_tensors(const ggml_tensor_array_t &tensor_outputs) override;
@@ -115,17 +99,22 @@ public:
std::vector<Qnn_Tensor_t> &get_qnn_output_tensors() override;

private:
qnn_tensor_ptr_t create_gather_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, const int rank,
qnn_tensor_ptr_t tensor_input, qnn_dimension_array_t output_dimensions);
bool create_convert_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, const int rank,
qnn_tensor_array_t &tensor_inputs, qnn_tensor_array_t &tensor_outputs);
bool create_mat_mul_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, const int rank,
ggml_qnn_tensor_array_t &tensor_inputs, ggml_qnn_tensor_array_t &tensor_outputs);
qnn_tensor_array_t &tensor_inputs, qnn_tensor_array_t &tensor_outputs);

std::string _name;
std::shared_ptr<qnn_instance> _qnn_instance;
std::shared_ptr<ggml_qnn_op_config> _transpose0;
std::shared_ptr<ggml_qnn_op_config> _transpose1;
std::shared_ptr<ggml_qnn_op_config> _mat_mul;
std::vector<std::shared_ptr<ggml_qnn_op_config>> _input_converts;
std::shared_ptr<ggml_qnn_op_config> _output_convert;
ggml_qnn_tensor_array_t _tensor_inputs;
qnn_op_config_ptr_t _transpose_out;
qnn_op_config_ptr_t _mat_mul;
qnn_op_config_ptr_t _gather0;
qnn_op_config_ptr_t _gather1;
std::vector<qnn_op_config_ptr_t> _input_converts;
qnn_op_config_ptr_t _output_convert;
qnn_tensor_array_t _tensor_inputs;
std::vector<Qnn_Tensor_t> _qnn_tensor_inputs;

DISABLE_COPY(ggml_qnn_matmul_op_config);
@@ -188,8 +188,8 @@ class qnn_instance {
public:
using BackendIdType = decltype(QnnInterface_t{}.backendId);

explicit qnn_instance(const std::string &lib_path, const std::string &backend_name, const std::string &model_name) :
_lib_path(std::move(lib_path)), _backend_name(std::move(backend_name)), _model_name(std::move(model_name)) {}
explicit qnn_instance(const std::string &lib_path, const std::string &backend_name, const std::string &model_name)
: _lib_path(std::move(lib_path)), _backend_name(std::move(backend_name)), _model_name(std::move(model_name)) {}

~qnn_instance() {}
@@ -269,7 +269,7 @@ public:
QNN_LOG_INFO("qualcomm soc_model:%d(%s), htp_arch:%d(%s), vtcm_size:%d MB", chipinfo.socModel,
qnn::get_chipset_desc(chipinfo.socModel), htp_arch, qnn::get_htparch_desc(htp_arch),
chipinfo.vtcmSize);
_soc_info = { chipinfo.socModel, htp_arch, chipinfo.vtcmSize };
_soc_info = {chipinfo.socModel, htp_arch, chipinfo.vtcmSize};
}
_qnn_interface->qnn_device_free_platform_info(nullptr, p_info);
@@ -288,7 +288,7 @@ public:
arch_devconfig.option = QNN_DEVICE_CONFIG_OPTION_CUSTOM;
arch_devconfig.customConfig = &arch_customconfig;

const QnnDevice_Config_t *p_deviceconfig[] = { &soc_devconfig, &arch_devconfig, nullptr };
const QnnDevice_Config_t *p_deviceconfig[] = {&soc_devconfig, &arch_devconfig, nullptr};
qnn_status = _qnn_interface->qnn_device_create(_qnn_log_handle, p_deviceconfig, &_qnn_device_handle);
} else {
qnn_status = _qnn_interface->qnn_device_create(_qnn_log_handle, nullptr, &_qnn_device_handle);
@@ -299,27 +299,17 @@ public:
QNN_LOG_INFO("create QNN device successfully\n");
}

if (qnn::sdk_profile_level::profile_off != _profile_level) {
if (_profile_level != sdk_profile_level::profile_off) {
QNN_LOG_INFO("profiling turned on; level = %d", _profile_level);
if (qnn::sdk_profile_level::profile_basic == _profile_level) {
QNN_LOG_INFO("basic profiling requested. creating Qnn Profile object\n");
if (QNN_PROFILE_NO_ERROR != _qnn_interface->qnn_profile_create(
_qnn_backend_handle, QNN_PROFILE_LEVEL_BASIC, &_qnn_profile_handle)) {
QNN_LOG_WARN("unable to create profile handle in the backend\n");
return 6;
} else {
QNN_LOG_DEBUG("initialize qnn profile successfully\n");
}
} else if (qnn::sdk_profile_level::profile_detail == _profile_level) {
QNN_LOG_INFO("detailed profiling requested. Creating Qnn Profile object\n");
if (QNN_PROFILE_NO_ERROR != _qnn_interface->qnn_profile_create(_qnn_backend_handle,
QNN_PROFILE_LEVEL_DETAILED,
&_qnn_profile_handle)) {
QNN_LOG_WARN("unable to create profile handle in the backend\n");
return 7;
} else {
QNN_LOG_DEBUG("initialize qnn profile successfully\n");
}
auto profile_level = _profile_level == sdk_profile_level::profile_detail ? QNN_PROFILE_LEVEL_DETAILED
: QNN_PROFILE_LEVEL_BASIC;

if (QNN_PROFILE_NO_ERROR !=
_qnn_interface->qnn_profile_create(_qnn_backend_handle, profile_level, &_qnn_profile_handle)) {
QNN_LOG_WARN("unable to create profile handle in the backend\n");
return 6;
} else {
QNN_LOG_DEBUG("initialize qnn profile successfully\n");
}
}
@@ -364,7 +354,7 @@ public:
size_t candidate_size = 0;
uint8_t *rpc_buffer = nullptr;
const int size_in_mb = (1 << 20);
size_t probe_slots[] = { 1024, 1536, 2048 - 48, 2048 };
size_t probe_slots[] = {1024, 1536, 2048 - 48, 2048};
size_t probe_counts = sizeof(probe_slots) / sizeof(size_t);
for (size_t idx = 0; idx < probe_counts; idx++) {
rpc_buffer = static_cast<uint8_t *>(alloc_rpcmem(probe_slots[idx] * size_in_mb, sizeof(void *)));
@@ -526,13 +516,13 @@ public:
// use rpc control latency recommended 100 us, refer hexagon sdk
rpc_control_latency.rpcControlLatencyConfig = 100;

const QnnHtpPerfInfrastructure_PowerConfig_t *power_configs[] = { &rpc_polling_time, &rpc_control_latency,
nullptr };
const QnnHtpPerfInfrastructure_PowerConfig_t *power_configs[] = {&rpc_polling_time, &rpc_control_latency,
nullptr};
Qnn_ErrorHandle_t qnn_status = _qnn_htp_perfinfra->setPowerConfig(_qnn_power_configid, power_configs);
if (qnn_status != QNN_SUCCESS) {
QNN_LOG_WARN("set htp perf failed\n");
} else {
QNN_LOG_INFO("set htp perf ok\n");
QNN_LOG_DEBUG("set htp perf ok\n");
}
} else {
QNN_LOG_WARN("can't set htp perf\n");
@@ -572,13 +562,13 @@ public:
power_config.dcvsV3Config.coreVoltageCornerMax = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER;

// set power config with different performance parameters
const QnnHtpPerfInfrastructure_PowerConfig_t *power_configs[] = { &power_config, nullptr };
const QnnHtpPerfInfrastructure_PowerConfig_t *power_configs[] = {&power_config, nullptr};
Qnn_ErrorHandle_t qnn_status = QNN_SUCCESS;
qnn_status = _qnn_htp_perfinfra->setPowerConfig(_qnn_power_configid, power_configs);
if (qnn_status != QNN_SUCCESS) {
QNN_LOG_WARN("set htp high performance mode failed\n");
} else {
QNN_LOG_INFO("set htp high performance mode ok\n");
QNN_LOG_DEBUG("set htp high performance mode ok\n");
}

return 0;
@@ -659,8 +649,8 @@ public:
return nullptr;
}

QNN_LOG_INFO("mem_fd %d\n", mem_fd);
Qnn_MemDescriptor_t descriptor = { { rank, dimensions, nullptr }, data_type, QNN_MEM_TYPE_ION, { { mem_fd } } };
QNN_LOG_DEBUG("mem_fd %d\n", mem_fd);
Qnn_MemDescriptor_t descriptor = {{rank, dimensions, nullptr}, data_type, QNN_MEM_TYPE_ION, {{mem_fd}}};
Qnn_MemHandle_t handle = nullptr;
auto error = _qnn_interface->qnn_mem_register(_qnn_context_handle, &descriptor,
/*numDescriptors=*/1, &handle);
@@ -670,8 +660,8 @@ public:
return nullptr;
}

_qnn_rpc_buffer_to_handles.insert({ p_data, handle });
QNN_LOG_INFO("successfully register shared memory handler: %p\n", handle);
_qnn_rpc_buffer_to_handles.insert({p_data, handle});
QNN_LOG_DEBUG("successfully register shared memory handler: %p\n", handle);
return handle;
}
@ -748,7 +738,7 @@ private:
QNN_LOG_WARN("unable to find a valid qnn system interface\n");
return 6;
} else {
QNN_LOG_INFO("find a valid qnn system interface\n");
QNN_LOG_DEBUG("find a valid qnn system interface\n");
}

auto qnn_sys_interface = std::make_shared<qnn::qnn_system_interface>(*provider_list[0], system_lib_handle);
@ -810,7 +800,7 @@ private:
QNN_LOG_WARN("unable to find a valid qnn interface\n");
return 6;
} else {
QNN_LOG_INFO("find a valid qnn interface\n");
QNN_LOG_DEBUG("find a valid qnn interface\n");
}

BackendIdType backend_id = provider_list[0]->backendId;
@ -890,7 +880,7 @@ private:
std::unordered_map<BackendIdType, const QnnInterface_t *> _loaded_backend;

dl_handler_t _rpc_lib_handle = nullptr;
std::atomic_bool _rpcmem_initialized{ false };
std::atomic_bool _rpcmem_initialized{false};
qnn::pfn_rpc_mem_alloc _pfn_rpc_mem_alloc;
qnn::pfn_rpc_mem_free _pfn_rpc_mem_free;
qnn::pfn_rpc_mem_to_fd _pfn_rpc_mem_to_fd;
@ -14,7 +14,7 @@ namespace qnn {
// Qualcomm QNN(Qualcomm Neural Network, aka Qualcomm AI Engine Direct) SDK
// ref:https://github.com/pytorch/executorch/tree/main/backends/qualcomm
// =================================================================================================
enum sdk_profile_level { profile_off = 0, profile_basic = 1, profile_detail = 2 };
enum sdk_profile_level { profile_off = 0, profile_basic, profile_detail };

enum qcom_htp_arch {
NONE = 0,
@ -27,8 +27,8 @@ public:
explicit ggml_qnn_tensor(tensor_type_t tensor_type, const std::string &name,
const qnn_dimension_array_t &dimensions, Qnn_DataType_t data_type, int rank,
QNNBackend device, Qnn_GraphHandle_t graph_handle,
std::shared_ptr<qnn_instance> qnn_instance) :
_tensor_name(name), _device(device), _qnn_instance(qnn_instance), _graph_handle(graph_handle) {
std::shared_ptr<qnn_instance> qnn_instance)
: _tensor_name(name), _device(device), _qnn_instance(qnn_instance), _graph_handle(graph_handle) {
if (!_tensor_name.empty()) {
QNN_TENSOR_SET_NAME(_qnn_tensor, _tensor_name.c_str());
}
@ -37,23 +37,35 @@ public:
_dimensions = dimensions;
update_params_from_ggml_tensor(tensor_type, data_type, rank);
QNN_LOG_DEBUG("create tensor %s, rank: %d, dims: [%d, %d, %d, %d], data_type: %d, device: %d",
QNN_LOG_DEBUG("[%s][%s]created, rank: %d, dims: [%d, %d, %d, %d], type: %s", get_backend_name(device),
_tensor_name.c_str(), rank, (int)_dimensions[0], (int)_dimensions[1], (int)_dimensions[2],
(int)_dimensions[3], (int)data_type, (int)device);
(int)_dimensions[3], qnn_datatype_to_string(data_type));
}

explicit ggml_qnn_tensor(tensor_type_t tensor_type, const std::string &name,
const ggml_dimension_array_t &dimensions, ggml_type data_type, int rank, QNNBackend device,
Qnn_GraphHandle_t graph_handle, std::shared_ptr<qnn_instance> qnn_instance) :
ggml_qnn_tensor(tensor_type, name, get_internal_dimension(dimensions, rank),
qnn_datatype_from_ggml_datatype(data_type), rank, device, graph_handle, qnn_instance) {}
Qnn_GraphHandle_t graph_handle, std::shared_ptr<qnn_instance> qnn_instance)
: ggml_qnn_tensor(tensor_type, name, get_internal_dimension(dimensions, rank),
qnn_datatype_from_ggml_datatype(data_type), rank, device, graph_handle, qnn_instance) {}

~ggml_qnn_tensor() { _qnn_rpc_buffer.reset(); }
~ggml_qnn_tensor() {
_buffer_storage.clear();
unbind();
_qnn_rpc_buffer.reset();
}

bool set_data_buffer(std::vector<uint8_t> &&buffer) {
if (!bind_buffer_impl(buffer.data(), buffer.size())) {
return false;
}

_buffer_storage = std::move(buffer);
return true;
}

bool alloc_qnn_tensor_id() {
if (QNN_TENSOR_GET_ID(_qnn_tensor)) {
QNN_LOG_WARN("graph tensor %s already created, id %d", _tensor_name.c_str(),
QNN_TENSOR_GET_ID(_qnn_tensor));
QNN_LOG_DEBUG("[%s]tensor already has an id: %d", _tensor_name.c_str(), QNN_TENSOR_GET_ID(_qnn_tensor));
return true;
}
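A hypothetical call site for set_data_buffer(): it hands a staging buffer (for example, a type-converted copy of a ggml tensor) to the tensor, which then owns it until destruction. The source tensor src and the conversion step are assumed:

std::vector<uint8_t> converted(ggml_nbytes(src));        // staging buffer sized for the source tensor
// ... fill `converted` with the converted/dequantized data (conversion code assumed) ...
qnn_tensor_ptr_t staged = /* tensor created with one of the constructors above */;
if (!staged->set_data_buffer(std::move(converted))) {    // the tensor keeps the buffer alive from here on
    QNN_LOG_WARN("failed to set data buffer");
}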
@ -61,30 +73,90 @@ public:
auto qnn_interface = _qnn_instance->get_qnn_interface();
auto error = qnn_interface->qnn_tensor_create_graph_tensor(_graph_handle, &qnn_tensor);
if (error != QNN_SUCCESS) {
QNN_LOG_WARN("create graph tensor failed, tensor %s, error: %d\n", _tensor_name.c_str(), error);
QNN_LOG_WARN("[%s]allocate id failed, error: %d\n", _tensor_name.c_str(), error);
return false;
}

QNN_TENSOR_SET_ID(_qnn_tensor, QNN_TENSOR_GET_ID(qnn_tensor));
QNN_LOG_DEBUG("create graph tensor %s, id: %d, rank: %d", _tensor_name.c_str(), QNN_TENSOR_GET_ID(qnn_tensor),
QNN_TENSOR_GET_RANK(qnn_tensor));

QNN_LOG_DEBUG("[%s][%s]allocated id: %d, rank: %d", get_backend_name(_device), _tensor_name.c_str(),
QNN_TENSOR_GET_ID(qnn_tensor), QNN_TENSOR_GET_RANK(qnn_tensor));
return true;
}

bool bind_buffer(uint8_t *buffer, const size_t buffer_size) {
if (!_buffer_storage.empty()) {
QNN_LOG_DEBUG("[%s]already has buffer storage, skip bind", _tensor_name.c_str());
return true;
}

return bind_buffer_impl(buffer, buffer_size);
}

bool bind_ggml_tensor(ggml_tensor *tensor) {
if (!bind_buffer(reinterpret_cast<uint8_t *>(tensor->data), ggml_nbytes(tensor))) {
QNN_LOG_WARN("[%s]failed to bind ggml tensor(%s)", _tensor_name.c_str(), ggml_get_name(tensor));
return false;
}

QNN_LOG_DEBUG("[%s][%s]bind to ggml tensor(%s)", get_backend_name(_device), _tensor_name.c_str(),
ggml_get_name(tensor));
return true;
}

bool unbind() {
if (!_graph_handle) {
QNN_LOG_WARN("[%s]not bound to any graph", _tensor_name.c_str());
return false;
}

if (!_buffer) {
QNN_LOG_DEBUG("[%s]bound to ggml tensor", _tensor_name.c_str());
return true;
}

if (!read_from_qnn_tensor()) {
QNN_LOG_WARN("[%s]read from qnn tensor failed", _tensor_name.c_str());
return false;
}

if (!_buffer_storage.empty()) {
QNN_LOG_DEBUG("[%s]already has buffer storage, stop unbind", _tensor_name.c_str());
return true;
}

if (!should_use_mem_handle()) {
QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_RAW);
Qnn_ClientBuffer_t client_buf = {};
QNN_TENSOR_SET_CLIENT_BUF(_qnn_tensor, client_buf);
QNN_LOG_DEBUG("[%s]clear client buffer", _tensor_name.c_str());
}

QNN_LOG_DEBUG("[%s][%s]unbind from buffer: %p, size: %d", get_backend_name(_device), _tensor_name.c_str(),
_buffer, (int)_buffer_size);
_buffer = nullptr;
_buffer_size = 0;
return true;
}

const Qnn_Tensor_t &get_qnn_tensor() const { return _qnn_tensor; }
Qnn_DataType_t get_data_type() const { return QNN_TENSOR_GET_DATA_TYPE(_qnn_tensor); }
const qnn_dimension_array_t &get_dimensions() const { return _dimensions; }
uint32_t get_qnn_tensor_id() const { return QNN_TENSOR_GET_ID(_qnn_tensor); }

private:
bool bind_buffer_impl(uint8_t *buffer, const size_t buffer_size) {
if (_buffer) {
if (_buffer != buffer) {
QNN_LOG_WARN("tensor %s has been bound to another buffer %p", _tensor_name.c_str(), _buffer);
QNN_LOG_WARN("[%s]has been bound to another buffer %p", _tensor_name.c_str(), _buffer);
return false;
}

QNN_LOG_INFO("tensor %s already bound to same ggml tensor %p", _tensor_name.c_str(), _buffer);
QNN_LOG_DEBUG("[%s]already bound to same ggml tensor %p", _tensor_name.c_str(), _buffer);
return true;
}

if (QNN_TENSOR_GET_TYPE(_qnn_tensor) == QNN_TENSOR_TYPE_NATIVE) {
QNN_LOG_DEBUG("tensor %s type(%d) not READ/WRITE, skipping", _tensor_name.c_str(),
QNN_LOG_DEBUG("[%s]tensor type(%d) not READ/WRITE, skipping", _tensor_name.c_str(),
(int)QNN_TENSOR_TYPE_NATIVE);
return true;
}
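A hypothetical call site for the public bind/unbind API in this hunk; the graph and op setup, and the ggml tensors src and dst, are assumed:

qnn_tensor_ptr_t input = /* tensor created for the op's input, see the constructors earlier */;
qnn_tensor_ptr_t output = /* tensor created for the op's output */;
bool ok = input->alloc_qnn_tensor_id() && output->alloc_qnn_tensor_id() &&  // id is allocated once per graph tensor
          input->bind_ggml_tensor(src) && output->bind_ggml_tensor(dst);    // stages data (rpcmem on NPU, client buffer otherwise)
if (ok) {
    // ... add the op node and execute the graph (assumed) ...
}
input->unbind();   // APP_READ/READWRITE tensors are copied back from rpcmem before the buffer is dropped
output->unbind();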
@ -95,7 +167,7 @@ public:
_qnn_instance, buffer_size, QNN_TENSOR_GET_RANK(_qnn_tensor),
QNN_TENSOR_GET_DIMENSIONS(_qnn_tensor), QNN_TENSOR_GET_DATA_TYPE(_qnn_tensor));
if (!qnn_rpc_buffer->is_valid()) {
QNN_LOG_WARN("alloc rpc mem failed, tensor %s", _tensor_name.c_str());
QNN_LOG_WARN("[%s]alloc rpc mem failed", _tensor_name.c_str());
return false;
}
@ -104,12 +176,12 @@ public:
QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_MEMHANDLE);
QNN_TENSOR_SET_MEM_HANDLE(_qnn_tensor, _qnn_rpc_buffer->get_mem_handle());
QNN_LOG_DEBUG("tensor %s, use mem handle %p", _tensor_name.c_str(), QNN_TENSOR_GET_MEM_HANDLE(_qnn_tensor));
QNN_LOG_DEBUG("[%s]use mem handle %p", _tensor_name.c_str(), QNN_TENSOR_GET_MEM_HANDLE(_qnn_tensor));
} else {
QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_RAW);
Qnn_ClientBuffer_t client_buf = { buffer, (uint32_t)buffer_size };
Qnn_ClientBuffer_t client_buf = {buffer, (uint32_t)buffer_size};
QNN_TENSOR_SET_CLIENT_BUF(_qnn_tensor, client_buf);
QNN_LOG_DEBUG("tensor %s, use client buffer %p size %d", _tensor_name.c_str(), client_buf.data,
QNN_LOG_DEBUG("[%s]use client buffer %p size %d", _tensor_name.c_str(), client_buf.data,
(int)client_buf.dataSize);
}
@ -117,62 +189,19 @@ public:
_buffer_size = buffer_size;

if (!write_to_qnn_tensor()) {
QNN_LOG_WARN("write to qnn tensor failed, tensor %s", _tensor_name.c_str());
QNN_LOG_WARN("[%s]write to qnn tensor failed", _tensor_name.c_str());
return false;
}

QNN_LOG_DEBUG("bind tensor %s to buffer: %p, size: %d", _tensor_name.c_str(), buffer, (int)buffer_size);
QNN_LOG_DEBUG("[%s][%s]bind to buffer: %p, size: %d", get_backend_name(_device), _tensor_name.c_str(), buffer,
(int)buffer_size);
return true;
}

bool bind_ggml_tensor(ggml_tensor *tensor) {
if (!bind_buffer(reinterpret_cast<uint8_t *>(tensor->data), ggml_nbytes(tensor))) {
QNN_LOG_WARN("Failed to bind tensor: %s to ggml tensor: %s", _tensor_name.c_str(), ggml_get_name(tensor));
return false;
}

QNN_LOG_DEBUG("Bind tensor %s to ggml tensor %s", _tensor_name.c_str(), ggml_get_name(tensor));
return true;
}

bool unbind() {
if (!_graph_handle) {
QNN_LOG_WARN("tensor %s not bound to any graph", _tensor_name.c_str());
return false;
}

if (!_buffer) {
QNN_LOG_DEBUG("tensor %s not bound to ggml tensor", _tensor_name.c_str());
return true;
}

if (!read_from_qnn_tensor()) {
QNN_LOG_WARN("read from qnn tensor failed, tensor %s", _tensor_name.c_str());
return false;
}

if (!should_use_mem_handle()) {
QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_RAW);
Qnn_ClientBuffer_t client_buf = {};
QNN_TENSOR_SET_CLIENT_BUF(_qnn_tensor, client_buf);
QNN_LOG_DEBUG("tensor %s, clear client buffer", _tensor_name.c_str());
}

QNN_LOG_DEBUG("unbind tensor: %s from buffer: %p, size: %d", _tensor_name.c_str(), _buffer, (int)_buffer_size);
_buffer = nullptr;
_buffer_size = 0;
return true;
}

const Qnn_Tensor_t &get_qnn_tensor() const { return _qnn_tensor; }
Qnn_DataType_t get_data_type() const { return QNN_TENSOR_GET_DATA_TYPE(_qnn_tensor); }
const qnn_dimension_array_t &get_dimensions() const { return _dimensions; }

private:
bool write_to_qnn_tensor() {
auto tensor_type = QNN_TENSOR_GET_TYPE(_qnn_tensor);
if (tensor_type != QNN_TENSOR_TYPE_APP_WRITE && tensor_type != QNN_TENSOR_TYPE_APP_READWRITE) {
QNN_LOG_DEBUG("tensor %s type(%d) not WRITE", _tensor_name.c_str(), (int)tensor_type);
QNN_LOG_DEBUG("[%s]tensor type(%d) not WRITE", _tensor_name.c_str(), (int)tensor_type);
return true;
}
@ -180,20 +209,20 @@ private:
if (_qnn_rpc_buffer) {
memcpy(_qnn_rpc_buffer->get_buffer(), _buffer, _buffer_size);
} else {
QNN_LOG_WARN("tensor %s: can't find rpcmem from qnn mem handle\n", _tensor_name.c_str());
QNN_LOG_WARN("[%s]can't find rpcmem from qnn mem handle\n", _tensor_name.c_str());
return false;
}
}

// For CPU and GPU, the data is already in the tensor.
QNN_LOG_DEBUG("write tensor %s to qnn", _tensor_name.c_str());
QNN_LOG_DEBUG("[%s][%s]write tensor to qnn", get_backend_name(_device), _tensor_name.c_str());
return true;
}

bool read_from_qnn_tensor() {
auto tensor_type = QNN_TENSOR_GET_TYPE(_qnn_tensor);
if (tensor_type != QNN_TENSOR_TYPE_APP_READ && tensor_type != QNN_TENSOR_TYPE_APP_READWRITE) {
QNN_LOG_DEBUG("tensor %s type(%d) not READ", _tensor_name.c_str(), (int)tensor_type);
QNN_LOG_DEBUG("[%s]tensor type(%d) not READ", _tensor_name.c_str(), (int)tensor_type);
return true;
}
@ -201,13 +230,13 @@ private:
if (_qnn_rpc_buffer) {
memcpy(_buffer, _qnn_rpc_buffer->get_buffer(), _buffer_size);
} else {
QNN_LOG_WARN("can't find rpcmem from qnn mem handle\n");
QNN_LOG_WARN("[%s]can't find rpcmem from qnn mem handle", _tensor_name.c_str());
return false;
}
}

// For CPU and GPU, the data is already in the tensor.
QNN_LOG_DEBUG("read tensor %s from qnn", _tensor_name.c_str());
QNN_LOG_DEBUG("[%s][%s]read tensor from qnn", get_backend_name(_device), _tensor_name.c_str());
return true;
}
@ -231,12 +260,14 @@ private:
case PARAMETER:
new_tensor_type = QNN_TENSOR_TYPE_STATIC;
break;
case INTERMEDIATE:
default:
new_tensor_type = QNN_TENSOR_TYPE_NATIVE;
break;
}
QNN_TENSOR_SET_TYPE(_qnn_tensor, new_tensor_type);
QNN_LOG_INFO("tensor %s changed to type %d", _tensor_name.c_str(), new_tensor_type);
QNN_LOG_DEBUG("[%s][%s]tensor changed to type %d", get_backend_name(_device), _tensor_name.c_str(),
new_tensor_type);
}

bool should_use_mem_handle() const {
@ -246,6 +277,7 @@ private:
std::string _tensor_name;
uint8_t *_buffer = nullptr;
size_t _buffer_size = 0;
std::vector<uint8_t> _buffer_storage;
QNNBackend _device;
std::shared_ptr<qnn_instance> _qnn_instance;
Qnn_Tensor_t _qnn_tensor = qnn_tensor_init(kDefaultQnnTensorVersion);
@ -257,7 +289,7 @@ private:
DISABLE_MOVE(ggml_qnn_tensor);
};

using ggml_qnn_tensor_ptr_t = std::shared_ptr<ggml_qnn_tensor>;
using ggml_qnn_tensor_array_t = std::vector<std::shared_ptr<ggml_qnn_tensor>>;
using qnn_tensor_ptr_t = std::shared_ptr<ggml_qnn_tensor>;
using qnn_tensor_array_t = std::vector<qnn_tensor_ptr_t>;

} // namespace qnn
@ -5,9 +5,11 @@
#include "ggml-qnn.h"

#include "QnnGraph.h"
#include "qnn-types.hpp"

#ifdef __linux__
#include <sys/sysinfo.h>
#include <unistd.h>
#endif
@ -148,11 +150,11 @@ const char *get_ggml_type_name(ggml_type type) {
const char *get_backend_name(QNNBackend device_index) {
switch (device_index) {
case QNN_BACKEND_CPU:
return "QNN-CPU";
return "qnn-cpu";
case QNN_BACKEND_GPU:
return "QNN-GPU";
return "qnn-gpu";
case QNN_BACKEND_NPU:
return "QNN-NPU";
return "qnn-npu";
case QNN_BACKEND_COUNT:
default:
return "unknown";
@ -195,18 +197,7 @@ intptr_t align_to(size_t alignment, intptr_t offset) {
: offset + (static_cast<intptr_t>(alignment) - (offset % static_cast<intptr_t>(alignment)));
}

uint32_t get_ggml_tensor_data_size(const ggml_tensor *tensor) {
/*
size_t data_size = ggml_row_size(tensor->type, tensor->ne[0]);
size_t n_dims = qnn_get_ggml_tensor_rank(tensor);
for (int i = 1; i < n_dims; i++) {
data_size *= tensor->ne[i];
}

return data_size;
*/
return ggml_nbytes(tensor);
}
uint32_t get_ggml_tensor_data_size(const ggml_tensor *tensor) { return ggml_nbytes(tensor); }

void *align_alloc(size_t alignment, size_t size) {
size_t size_aligned = size;
@ -248,6 +239,7 @@ const char *opname_from_ggmlop(enum ggml_op ggmlop) {
const char *get_qnn_error_string(Qnn_ErrorHandle_t error) {
// A complete list of error codes can be found here:
// https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/api_error_codes.html
thread_local static char error_code[128] = {};
switch (error) {
case QNN_SUCCESS:
return "QNN_SUCCESS";
@ -277,6 +269,36 @@ const char *get_qnn_error_string(Qnn_ErrorHandle_t error) {
return "QNN_GRAPH_ERROR_UNCONNECTED_NODE";
case QNN_GRAPH_ERROR_CREATE_FAILED:
return "QNN_GRAPH_ERROR_CREATE_FAILED";
case QNN_GRAPH_ERROR_OPTIMIZATION_FAILED:
return "QNN_GRAPH_ERROR_OPTIMIZATION_FAILED";
case QNN_GRAPH_ERROR_FINALIZE_FAILED:
return "QNN_GRAPH_ERROR_FINALIZE_FAILED";
case QNN_GRAPH_ERROR_GRAPH_NOT_FINALIZED:
return "QNN_GRAPH_ERROR_GRAPH_NOT_FINALIZED";
case QNN_GRAPH_ERROR_GRAPH_FINALIZED:
return "QNN_GRAPH_ERROR_GRAPH_FINALIZED";
case QNN_GRAPH_ERROR_EXECUTION_ASYNC_FIFO_FULL:
return "QNN_GRAPH_ERROR_EXECUTION_ASYNC_FIFO_FULL";
case QNN_GRAPH_ERROR_SIGNAL_IN_USE:
return "QNN_GRAPH_ERROR_SIGNAL_IN_USE";
case QNN_GRAPH_ERROR_ABORTED:
return "QNN_GRAPH_ERROR_ABORTED";
case QNN_GRAPH_ERROR_PROFILE_IN_USE:
return "QNN_GRAPH_ERROR_PROFILE_IN_USE";
case QNN_GRAPH_ERROR_TIMED_OUT:
return "QNN_GRAPH_ERROR_TIMED_OUT";
case QNN_GRAPH_ERROR_SUBGRAPH:
return "QNN_GRAPH_ERROR_SUBGRAPH";
case QNN_GRAPH_ERROR_DISABLED:
return "QNN_GRAPH_ERROR_DISABLED";
case QNN_GRAPH_ERROR_DYNAMIC_TENSOR_SHAPE:
return "QNN_GRAPH_ERROR_DYNAMIC_TENSOR_SHAPE";
case QNN_GRAPH_ERROR_TENSOR_SPARSITY:
return "QNN_GRAPH_ERROR_TENSOR_SPARSITY";
case QNN_GRAPH_ERROR_EARLY_TERMINATION:
return "QNN_GRAPH_ERROR_EARLY_TERMINATION";
case QNN_GRAPH_ERROR_INVALID_CONTEXT:
return "QNN_GRAPH_ERROR_INVALID_CONTEXT";

// QnnOpPackage_Error_t
case QNN_OP_PACKAGE_ERROR_LIBRARY_ALREADY_INITIALIZED:
@ -294,19 +316,34 @@ const char *get_qnn_error_string(Qnn_ErrorHandle_t error) {
case QNN_OP_PACKAGE_ERROR_INVALID_ARGUMENT:
return "QNN_OP_PACKAGE_ERROR_INVALID_ARGUMENT";
default:
return nullptr;
if (error >= QNN_GRAPH_MIN_ERROR && error < QNN_GRAPH_MAX_ERROR) {
snprintf(error_code, sizeof(error_code), "UNKNOWN_GRAPH_ERROR_%d", int(error - QNN_GRAPH_MIN_ERROR));
} else {
snprintf(error_code, sizeof(error_code), "%d", int(error));
}
return error_code;
}
}
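With the fallback above, get_qnn_error_string() always yields a printable string, so call sites can log errors unconditionally. A hypothetical call site; the qnn_graph_finalize wrapper call is assumed, only the helper comes from this patch:

auto error = qnn_interface->qnn_graph_finalize(graph_handle, nullptr, nullptr);  // assumed wrapper call
if (error != QNN_SUCCESS) {
    QNN_LOG_WARN("graph finalize failed: %s", get_qnn_error_string(error));
}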
#ifdef __linux__

size_t get_system_total_memory_in_bytes() {
struct sysinfo info = {};
if (sysinfo(&info) == 0) {
return (info.totalram + info.totalswap) * info.mem_unit;
}

auto pages = (size_t)sysconf(_SC_PHYS_PAGES);
auto page_size = (size_t)sysconf(_SC_PAGE_SIZE);
return pages * page_size;
}

size_t get_system_free_memory_in_bytes() {
struct sysinfo info = {};
if (sysinfo(&info) == 0) {
return (info.freeram + info.freeswap) * info.mem_unit;
}

auto avail_pages = (size_t)sysconf(_SC_AVPHYS_PAGES);
auto page_size = (size_t)sysconf(_SC_PAGE_SIZE);
return avail_pages * page_size;
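Both helpers deliberately count swap as part of the reported memory. A hypothetical caller that checks whether a requested allocation still fits into what they report as free:

bool can_allocate(size_t requested_bytes) {
    size_t free_mem = qnn::get_system_free_memory_in_bytes();  // includes free swap on Linux
    return requested_bytes <= free_mem;
}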