feat: fix mulmat (#2)
* ggml_qnn_op_config now manages the construction of ggml_qnn_tensor
* wip
* add interface ggml_qnn_op_config
* add ggml_qnn_list_op_config
* add create_tensor and move tensor bind to execute
* wip
* rename: ggml_qnn_list_op_config -> ggml_qnn_matmul_op_config
* add tensor type to allow native tensor
* remove ggml_tensor param at ggml_qnn_tensor::create_tensor
* postpone the tensor id allocation to add_node
* add ggml_qnn_op_config_base
* trivial change to reduce the params of function
* split bind_tensors into bind_input_tensors and bind_output_tensors
* implement ggml_qnn_single_op_config::create_tensors; next: set the parameter of transpose
* tensor: add bind buffer
* add parameter tensor type
* implement add_tensor_param
* set qnn_instance only at constructor
* set transpose tensor param
* move create_op_constructor into op-config module
* create QNN_OP_MAT_MUL from ggml_qnn_matmul_op_config
* try fix crash
* fix compiling error at older ndk (r23c)
* fix crash
* fix parameter tensor name
* update tensor dimension assignment and add TODO
* fix mat_mul graph creation
* fix MUL_MAT_256x16x10x1_256x1x10x1_16x1x10x1
* append type to graph cache key
* wip
* fix supported op
* update comment
* disable ops other than add and mat_mul
* add convert op to adapt multi input/output format
* disable f16 for cpu backend according to official doc https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/cpu_backend.html#supported-operations
* add supported data types flags in each backend
* remove unused functions
* append output type to graph key
* fix gpu backend by disabling ops with differing data types
* fix cpu backend supported ops
* fix duplicated tensor name
* append op name
* suppress warning
* remove unused code
This commit is contained in:
parent f260498213
commit 4abaf7d87e
@ -51,12 +51,30 @@ struct qnn_device_caps {
|
|||
const char *description;
|
||||
const char *lib_name;
|
||||
enum ggml_backend_dev_type type;
|
||||
|
||||
// TODO: should get these caps from the device
|
||||
std::unordered_set<ggml_type> supported_types;
|
||||
};
|
||||
|
||||
const qnn_device_caps kDeviceCaps[GGML_QNN_MAX_DEVICES]{
|
||||
{ "qnn-cpu", "Qualcomm Kryo CPU", "libQnnCpu.so", GGML_BACKEND_DEVICE_TYPE_CPU }, /* QNN_BACKEND_CPU */
|
||||
{ "qnn-gpu", "Qualcomm Adreno GPU", "libQnnGpu.so", GGML_BACKEND_DEVICE_TYPE_GPU }, /* QNN_BACKEND_GPU */
|
||||
{ "qnn-npu", "Qualcomm NPU", "libQnnHtp.so", GGML_BACKEND_DEVICE_TYPE_GPU }, /* QNN_BACKEND_NPU */
|
||||
{ "qnn-cpu",
|
||||
"Qualcomm Kryo CPU",
|
||||
"libQnnCpu.so",
|
||||
GGML_BACKEND_DEVICE_TYPE_CPU,
|
||||
{ GGML_TYPE_F32,
|
||||
GGML_TYPE_I8 } }, // https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/CpuOpDefSupplement.html#matmul
|
||||
{ "qnn-gpu",
|
||||
"Qualcomm Adreno GPU",
|
||||
"libQnnGpu.so",
|
||||
GGML_BACKEND_DEVICE_TYPE_GPU,
|
||||
{ GGML_TYPE_F32,
|
||||
GGML_TYPE_F16 } }, // https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/GpuOpDefSupplement.html#matmul
|
||||
{ "qnn-npu",
|
||||
"Qualcomm NPU",
|
||||
"libQnnHtp.so",
|
||||
GGML_BACKEND_DEVICE_TYPE_GPU,
|
||||
{ GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_I16,
|
||||
GGML_TYPE_I8 } }, // https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/HtpOpDefSupplement.html#matmul
|
||||
};
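The per-backend supported_types set introduced in this table is what the tensor-validation path added later in this commit consults. A minimal, self-contained sketch of that lookup pattern (all names here — device_caps, is_type_supported, TYPE_* — are illustrative stand-ins, not symbols from this commit):

```cpp
// Illustrative sketch only: a per-device supported-type set gating tensor types.
#include <cstdio>
#include <unordered_set>

enum tensor_type { TYPE_F32, TYPE_F16, TYPE_I8 };

struct device_caps {
    const char *name;
    std::unordered_set<int> supported_types;
};

static bool is_type_supported(const device_caps &caps, tensor_type t) {
    return caps.supported_types.count(t) != 0;
}

int main() {
    const device_caps cpu = {"qnn-cpu", {TYPE_F32, TYPE_I8}};
    std::printf("f16 on %s: %s\n", cpu.name, is_type_supported(cpu, TYPE_F16) ? "yes" : "no"); // no
    return 0;
}
```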
|
||||
|
||||
class ggml_backend_qnn_buffer_context {
|
||||
|
|
@ -340,9 +358,10 @@ void ggml_backend_qnn_device_get_props(ggml_backend_dev_t dev, struct ggml_backe
|
|||
props->type = ggml_backend_qnn_device_get_type(dev);
|
||||
ggml_backend_qnn_device_get_memory(dev, &props->memory_free, &props->memory_total);
|
||||
props->caps = {
|
||||
/* async */ false,
|
||||
/* host_buffer */ false,
|
||||
/* events */ false,
|
||||
/* async */ false,
|
||||
/* host_buffer */ false,
|
||||
/* buffer_from_host_ptr */ false,
|
||||
/* events */ false,
|
||||
};
|
||||
}
|
||||
|
||||
|
|
@ -412,6 +431,7 @@ ggml_backend_t ggml_backend_qnn_init_with_device_context(ggml_backend_dev_t dev,
|
|||
dev_ctx->instance = instance;
|
||||
dev_ctx->qnn_interface = qnn_interface;
|
||||
dev_ctx->socinfo = instance->get_soc_info();
|
||||
dev_ctx->supported_types = kDeviceCaps[device_index].supported_types;
|
||||
|
||||
ggml_backend_t qnn_backend = new ggml_backend{
|
||||
/* .guid = */ ggml_backend_qnn_guid(),
|
||||
|
|
@ -440,8 +460,8 @@ ggml_backend_buffer_t ggml_backend_qnn_device_buffer_from_ptr(ggml_backend_dev_t
|
|||
}
|
||||
|
||||
bool ggml_backend_qnn_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor *op) {
|
||||
GGML_UNUSED(dev);
|
||||
return qnn::ggml_qnn_supports_op(op);
|
||||
auto *device_ctx = get_device_context(dev);
|
||||
return qnn::ggml_qnn_supports_op(device_ctx, op);
|
||||
}
|
||||
|
||||
bool ggml_backend_qnn_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
|
||||
@ -108,8 +108,8 @@ std::string get_graph_key(const std::string &op_name, const std::array<ggml_tens
|
|||
const std::array<ggml_tensor *, _OutputSize> &outputs) {
|
||||
constexpr static const auto append_dimensions = [](std::string &key, const ggml_tensor *tensor) {
|
||||
char buffer[256] = {};
|
||||
snprintf(buffer, sizeof(buffer), "_%ldx%ldx%ldx%ld", (long)tensor->ne[0], (long)tensor->ne[1],
|
||||
(long)tensor->ne[2], (long)tensor->ne[3]);
|
||||
snprintf(buffer, sizeof(buffer), "_%ldx%ldx%ldx%ld%s", (long)tensor->ne[0], (long)tensor->ne[1],
|
||||
(long)tensor->ne[2], (long)tensor->ne[3], qnn::get_ggml_type_name(tensor->type));
|
||||
key += buffer;
|
||||
};
|
||||
|
||||
|
|
@ -117,32 +117,11 @@ std::string get_graph_key(const std::string &op_name, const std::array<ggml_tens
|
|||
for (auto &input : inputs) {
|
||||
append_dimensions(graph_key, input);
|
||||
}
|
||||
for (auto &output : outputs) {
|
||||
append_dimensions(graph_key, output);
|
||||
}
|
||||
|
||||
graph_key += qnn::get_ggml_type_name(outputs.front()->type);
|
||||
return graph_key;
|
||||
}
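With the type name appended by append_dimensions (and the output type appended at the end of get_graph_key), cache keys now distinguish graphs by both shape and data type. A self-contained sketch of one plausible key layout; which tensors contribute their dimensions follows the loops above, and append_dims plus the literal type names below are illustrative:

```cpp
// Sketch of the cache-key layout implied by append_dimensions above.
#include <cstdio>
#include <string>

static void append_dims(std::string &key, const long ne[4], const char *type_name) {
    char buffer[256] = {};
    snprintf(buffer, sizeof(buffer), "_%ldx%ldx%ldx%ld%s", ne[0], ne[1], ne[2], ne[3], type_name);
    key += buffer;
}

int main() {
    std::string key = "MatMul";
    const long src0[4] = {256, 16, 10, 1};
    const long src1[4] = {256, 1, 10, 1};
    append_dims(key, src0, "f32");
    append_dims(key, src1, "f32");
    key += "f32"; // output type appended last (per the graph_key += get_ggml_type_name(...) line above)
    std::printf("%s\n", key.c_str()); // MatMul_256x16x10x1f32_256x1x10x1f32f32
    return 0;
}
```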
|
||||
|
||||
qnn::ggml_op_constructor_t generate_common_op_constructor(const std::string &op_name) {
|
||||
if (op_name == QNN_OP_MAT_MUL) {
|
||||
// For QNN_OP_MAT_MUL, we need to transpose the input tensor
|
||||
return [](const std::string &name) {
|
||||
auto config = std::make_unique<qnn::ggml_qnn_op_config>(name, QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_MAT_MUL);
|
||||
Qnn_Scalar_t scalar = QNN_SCALAR_INIT;
|
||||
scalar.dataType = QNN_DATATYPE_BOOL_8;
|
||||
scalar.bool8Value = true;
|
||||
config->add_scalar_param(QNN_OP_MAT_MUL_PARAM_TRANSPOSE_IN0, scalar);
|
||||
QNN_LOG_DEBUG("add scalar param %s\n", QNN_OP_MAT_MUL_PARAM_TRANSPOSE_IN0);
|
||||
return config;
|
||||
};
|
||||
}
|
||||
|
||||
return [op_name](const std::string &name) {
|
||||
return std::make_unique<qnn::ggml_qnn_op_config>(name, QNN_OP_PACKAGE_NAME_QTI_AISW, op_name);
|
||||
};
|
||||
}
|
||||
|
||||
constexpr const char *kGgmlOpToQnnOp[] = {
|
||||
nullptr, // GGML_OP_NONE
|
||||
nullptr, // GGML_OP_DUP
|
||||
|
|
@ -278,7 +257,7 @@ qnn::ggml_qnn_graph *get_qnn_graph_from_cache(ggml_backend_qnn_device_context *c
|
|||
return nullptr;
|
||||
}
|
||||
|
||||
auto op_constructor = generate_common_op_constructor(kGgmlOpToQnnOp[op]);
|
||||
auto op_constructor = qnn::create_op_constructor(kGgmlOpToQnnOp[op]);
|
||||
if (!graph->build_graph(op_constructor, to_ggml_tensor_array<_InputSize>(inputs),
|
||||
to_ggml_tensor_array<_OutputSize>(outputs))) {
|
||||
QNN_LOG_ERROR("build_graph failed\n");
|
||||
|
|
@ -542,11 +521,57 @@ static constexpr const ggml_qnn_binary_op_t kQnnBinaryOpsTable[] = {
|
|||
static_assert(sizeof(kQnnBinaryOpsTable) / sizeof(kQnnBinaryOpsTable[0]) == GGML_OP_COUNT,
|
||||
"GGML_OP_COUNT does not match the size of the kQnnBinaryOpsTable table");
|
||||
|
||||
bool ggml_qnn_supports_tensor(ggml_backend_qnn_device_context *ctx, const ggml_tensor *tensor) {
|
||||
switch (tensor->type) {
|
||||
case GGML_TYPE_F32:
|
||||
case GGML_TYPE_F16:
|
||||
case GGML_TYPE_Q8_0:
|
||||
case GGML_TYPE_Q4_0:
|
||||
if (ctx->supported_types.find(tensor->type) == ctx->supported_types.end()) {
|
||||
QNN_LOG_DEBUG("unsupported data type GGML_TYPE_F16 for cpu backend");
|
||||
return false;
|
||||
}
|
||||
break;
|
||||
default:
|
||||
QNN_LOG_DEBUG("unsupported data type %d", tensor->type);
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool ggml_qnn_supports_matmul_op(ggml_backend_qnn_device_context *ctx, const ggml_tensor *op) {
|
||||
GGML_UNUSED(ctx);
|
||||
|
||||
auto *src0 = op->src[0];
|
||||
auto *src1 = op->src[1];
|
||||
if (src0->type != src1->type || src0->type != op->type) {
|
||||
// the current QNN implementation requires src0, src1 and dst to share the same data type
|
||||
QNN_LOG_DEBUG("src0 type %d and src1 type %d and op type %d are not equal", src0->type, src1->type, op->type);
|
||||
return false;
|
||||
}
|
||||
|
||||
if (src0->ne[2] != src1->ne[2] || src0->ne[3] != src1->ne[3]) {
|
||||
/*
|
||||
* TODO: remove the blocker here when qnn backend supports mul_mat like this:
|
||||
* [ne03, ne02, n, k] * [ne03 * x, ne02 * y, m, k] -> [ne03 * x, ne02 * y, m, n]
|
||||
*/
|
||||
QNN_LOG_DEBUG("src0 and src1 dimensions are not equal");
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
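The ne[2]/ne[3] check above is why the shape named in the commit message (MUL_MAT_256x16x10x1_256x1x10x1_16x1x10x1) is now accepted, while genuinely broadcast batch dimensions stay blocked until the TODO is resolved. A self-contained sketch (batch_dims_match and the sample shapes are illustrative, not part of this commit):

```cpp
// Batch-dimension check from ggml_qnn_supports_matmul_op, applied to sample shapes.
#include <cstdio>

static bool batch_dims_match(const long src0_ne[4], const long src1_ne[4]) {
    return src0_ne[2] == src1_ne[2] && src0_ne[3] == src1_ne[3];
}

int main() {
    const long src0[4] = {256, 16, 10, 1};
    const long src1[4] = {256, 1, 10, 1};
    std::printf("supported: %s\n", batch_dims_match(src0, src1) ? "yes" : "no"); // yes
    const long broadcast_src1[4] = {256, 1, 20, 1}; // would need the TODO's broadcasting support
    std::printf("supported: %s\n", batch_dims_match(src0, broadcast_src1) ? "yes" : "no"); // no
    return 0;
}
```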
|
||||
|
||||
} // namespace
|
||||
|
||||
namespace qnn {
|
||||
|
||||
bool ggml_qnn_supports_op(const ggml_tensor *op) {
|
||||
bool ggml_qnn_supports_op(ggml_backend_qnn_device_context *ctx, const ggml_tensor *op) {
|
||||
if (op->op == GGML_OP_NONE) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if (op->op == GGML_OP_UNARY) {
|
||||
if (!kQnnUnaryOpsTable[kGgmlUnaryOpStart + ggml_get_unary_op(op)]) {
|
||||
QNN_LOG_DEBUG("unsupported unary op %d", ggml_get_unary_op(op));
|
||||
|
|
@ -557,35 +582,38 @@ bool ggml_qnn_supports_op(const ggml_tensor *op) {
|
|||
QNN_LOG_DEBUG("src0 is nullptr");
|
||||
return false;
|
||||
}
|
||||
} else if (op->op != GGML_OP_NONE) {
|
||||
} else {
|
||||
if (!kQnnUnaryOpsTable[op->op] && !kQnnBinaryOpsTable[op->op]) {
|
||||
QNN_LOG_DEBUG("unsupported op %d", op->op);
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!op->src[0] || !op->src[1]) {
|
||||
auto *src0 = op->src[0];
|
||||
auto *src1 = op->src[1];
|
||||
if (!src0 || !src1) {
|
||||
QNN_LOG_DEBUG("src0 or src1 is nullptr");
|
||||
return false;
|
||||
}
|
||||
|
||||
#ifndef NDEBUG
|
||||
if (op->op == GGML_OP_ADD && !is_tensor_dimensions_equal(op->src[0], op->src[1])) {
|
||||
QNN_LOG_DEBUG("src0 and src1 dimensions are not equal");
|
||||
if (!ggml_qnn_supports_tensor(ctx, src0) || !ggml_qnn_supports_tensor(ctx, src1) ||
|
||||
!ggml_qnn_supports_tensor(ctx, op)) {
|
||||
return false;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
switch (op->type) {
|
||||
case GGML_TYPE_F32:
|
||||
case GGML_TYPE_F16:
|
||||
case GGML_TYPE_I8:
|
||||
case GGML_TYPE_Q8_0:
|
||||
case GGML_TYPE_Q4_0:
|
||||
break;
|
||||
default:
|
||||
QNN_LOG_DEBUG("unsupported src0 type %d", op->src[0]->type);
|
||||
return false;
|
||||
switch (op->op) {
|
||||
case GGML_OP_ADD:
|
||||
if (!is_tensor_dimensions_equal(src0, src1)) {
|
||||
QNN_LOG_DEBUG("src0 and src1 dimensions are not equal");
|
||||
return false;
|
||||
}
|
||||
break;
|
||||
|
||||
case GGML_OP_MUL_MAT:
|
||||
return ggml_qnn_supports_matmul_op(ctx, op);
|
||||
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
@ -6,7 +6,7 @@
|
|||
|
||||
namespace qnn {
|
||||
|
||||
bool ggml_qnn_supports_op(const ggml_tensor *op);
|
||||
bool ggml_qnn_supports_op(ggml_backend_qnn_device_context *ctx, const ggml_tensor *op);
|
||||
bool ggml_qnn_forward(ggml_backend_qnn_device_context *ctx, struct ggml_tensor *tensor);
|
||||
|
||||
} // namespace qnn
|
||||
|
|
|
|||
|
|
@ -4,6 +4,7 @@
|
|||
#include <memory>
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
#include <unordered_set>
|
||||
|
||||
#include "ggml.h"
|
||||
|
||||
|
|
@ -26,6 +27,7 @@ struct ggml_backend_qnn_device_context {
|
|||
|
||||
// initialize in init
|
||||
qnn::qcom_socinfo socinfo = {};
|
||||
std::unordered_set<ggml_type> supported_types;
|
||||
std::shared_ptr<qnn::qnn_instance> instance;
|
||||
std::shared_ptr<qnn::qnn_interface> qnn_interface;
|
||||
|
||||
|
|
|
|||
|
|
@ -8,8 +8,8 @@
|
|||
namespace qnn {
|
||||
class ggml_qnn_rpc_buffer {
|
||||
public:
|
||||
ggml_qnn_rpc_buffer(std::shared_ptr<qnn_instance> qnn_instance, size_t size, uint32_t rank, uint32_t *dimensions,
|
||||
Qnn_DataType_t data_type) :
|
||||
ggml_qnn_rpc_buffer(std::shared_ptr<qnn_instance> qnn_instance, const size_t size, const uint32_t rank,
|
||||
uint32_t *dimensions, Qnn_DataType_t data_type) :
|
||||
_qnn_instance(qnn_instance), _size(size) {
|
||||
|
||||
_qnn_rpc_buffer = static_cast<uint8_t *>(qnn_instance->alloc_rpcmem(size, alignof(void *)));
|
||||
|
|
|
|||
|
|
@ -2,7 +2,6 @@
|
|||
#pragma once
|
||||
|
||||
#include <cstdio>
|
||||
#include <functional>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
|
@ -12,19 +11,15 @@
|
|||
#include "logger.hpp"
|
||||
#include "op-config.hpp"
|
||||
#include "qnn-lib.hpp"
|
||||
#include "tensor.hpp"
|
||||
|
||||
namespace qnn {
|
||||
|
||||
using ggml_tensor_array_t = std::vector<ggml_tensor *>;
|
||||
using ggml_op_constructor_t = std::function<std::unique_ptr<qnn::ggml_qnn_op_config>(const std::string &)>;
|
||||
|
||||
class ggml_qnn_graph {
|
||||
public:
|
||||
explicit ggml_qnn_graph(const std::string &graph_name, QNNBackend device,
|
||||
std::shared_ptr<qnn_instance> qnn_instance, size_t vtcm_size_in_mb) :
|
||||
_graph_name(graph_name), _device(device), _qnn_instance(qnn_instance) {
|
||||
QNN_LOG_INFO("graph name %s", graph_name.c_str());
|
||||
QNN_LOG_INFO("[%s]create", graph_name.c_str());
|
||||
|
||||
auto qnn_interface = qnn_instance->get_qnn_interface();
|
||||
auto qnn_context = qnn_instance->get_qnn_context_handle();
|
||||
|
|
@ -69,19 +64,16 @@ public:
|
|||
}
|
||||
|
||||
if (error != QNN_SUCCESS) {
|
||||
QNN_LOG_INFO(
|
||||
"can't create qnn graph handle with graph name %s, "
|
||||
"error = %d\n",
|
||||
graph_name.c_str(), error);
|
||||
QNN_LOG_INFO("[%s]can't create qnn graph handle, error = %d\n", graph_name.c_str(), error);
|
||||
return;
|
||||
}
|
||||
|
||||
QNN_LOG_INFO("create qnn graph handle with graph name %s ok\n", graph_name.c_str());
|
||||
QNN_LOG_INFO("[%s]create succeed\n", graph_name.c_str());
|
||||
_graph_handle = graph_handle;
|
||||
_qnn_interface = qnn_interface;
|
||||
}
|
||||
|
||||
~ggml_qnn_graph() { QNN_LOG_DEBUG("graph name %s, destroy", _graph_name.c_str()); }
|
||||
~ggml_qnn_graph() { QNN_LOG_DEBUG("[%s]destroy", _graph_name.c_str()); }
|
||||
|
||||
bool build_graph(ggml_op_constructor_t op_constructor, const ggml_tensor_array_t &tensor_inputs,
|
||||
const ggml_tensor_array_t &tensor_outputs) {
|
||||
|
|
@ -91,95 +83,44 @@ public:
|
|||
return false;
|
||||
}
|
||||
|
||||
// get the max tensor rank
|
||||
for (auto tensor : tensor_inputs) {
|
||||
_tensor_rank = std::max(_tensor_rank, ggml_n_dims(tensor));
|
||||
}
|
||||
for (auto tensor : tensor_outputs) {
|
||||
_tensor_rank = std::max(_tensor_rank, ggml_n_dims(tensor));
|
||||
QNN_LOG_DEBUG("[%s]build_graph start", _graph_name.c_str());
|
||||
_op_config = op_constructor(_graph_name, _qnn_instance);
|
||||
if (!_op_config->create_tensors(_device, _graph_handle, tensor_inputs, tensor_outputs)) {
|
||||
QNN_LOG_ERROR("[%s]create_tensors failed\n", _graph_name.c_str());
|
||||
return false;
|
||||
}
|
||||
|
||||
QNN_LOG_DEBUG("graph name %s, build_graph start", _graph_name.c_str());
|
||||
_tensor_inputs.resize(tensor_inputs.size());
|
||||
for (size_t i = 0; i < tensor_inputs.size(); i++) {
|
||||
char buffer[GGML_MAX_NAME] = {};
|
||||
snprintf(buffer, GGML_MAX_NAME, "src%d", (int)i);
|
||||
auto qnn_tensor =
|
||||
std::make_shared<ggml_qnn_tensor>(std::string(buffer), _device, _graph_handle, _qnn_instance);
|
||||
auto *ggml_tensor = tensor_inputs[i];
|
||||
if (!qnn_tensor->bind_ggml_tensor(ggml_tensor, true, _tensor_rank)) {
|
||||
QNN_LOG_ERROR("bind tensor %s failed\n", ggml_get_name(ggml_tensor));
|
||||
return false;
|
||||
}
|
||||
|
||||
_tensor_inputs[i] = qnn_tensor;
|
||||
if (!_op_config->add_op_to_graph(_graph_handle)) {
|
||||
QNN_LOG_ERROR("[%s]add nodes failed\n", _graph_name.c_str());
|
||||
return false;
|
||||
}
|
||||
|
||||
_tensor_outputs.resize(tensor_outputs.size());
|
||||
for (size_t i = 0; i < tensor_outputs.size(); i++) {
|
||||
char buffer[GGML_MAX_NAME] = {};
|
||||
snprintf(buffer, GGML_MAX_NAME, "dst%d", (int)i);
|
||||
auto qnn_tensor =
|
||||
std::make_shared<ggml_qnn_tensor>(std::string(buffer), _device, _graph_handle, _qnn_instance);
|
||||
auto *ggml_tensor = tensor_outputs[i];
|
||||
if (!qnn_tensor->bind_ggml_tensor(ggml_tensor, false, _tensor_rank)) {
|
||||
QNN_LOG_ERROR("bind tensor %s failed\n", ggml_get_name(ggml_tensor));
|
||||
return false;
|
||||
}
|
||||
|
||||
_tensor_outputs[i] = qnn_tensor;
|
||||
}
|
||||
|
||||
_op_config = op_constructor(_graph_name);
|
||||
_op_config->set_input_tensors(_tensor_inputs);
|
||||
_op_config->set_output_tensors(_tensor_outputs);
|
||||
auto error = _qnn_interface->qnn_graph_add_node(_graph_handle, _op_config->get_op_config());
|
||||
auto error = _qnn_interface->qnn_graph_finalize(_graph_handle, nullptr, nullptr);
|
||||
if (error != QNN_SUCCESS) {
|
||||
auto *error_str = get_qnn_error_string(error);
|
||||
if (error_str) {
|
||||
QNN_LOG_ERROR("qnn_graph_add_node.error: %s\n", error_str);
|
||||
QNN_LOG_ERROR("[%s]qnn_graph_finalize.error: %s\n", _graph_name.c_str(), error_str);
|
||||
} else {
|
||||
QNN_LOG_ERROR("qnn_graph_add_node.error: %d\n", error);
|
||||
QNN_LOG_ERROR("[%s]qnn_graph_finalize.error: %d\n", _graph_name.c_str(), error);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
error = _qnn_interface->qnn_graph_finalize(_graph_handle, nullptr, nullptr);
|
||||
if (error != QNN_SUCCESS) {
|
||||
auto *error_str = get_qnn_error_string(error);
|
||||
if (error_str) {
|
||||
QNN_LOG_ERROR("qnn_graph_finalize.error: %s\n", error_str);
|
||||
} else {
|
||||
QNN_LOG_ERROR("qnn_graph_finalize.error: %d\n", error);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
QNN_LOG_DEBUG("graph name %s, build_graph succeed", _graph_name.c_str());
|
||||
QNN_LOG_DEBUG("[%s]build_graph succeed", _graph_name.c_str());
|
||||
return true;
|
||||
}
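The rewritten build_graph hands all per-tensor work to the op-config object and finalizes the graph once; execute is reduced to bind, run, unbind. A condensed, self-contained sketch of that call order (op_config and its members below are illustrative stand-ins for the interfaces in this diff, error handling trimmed):

```cpp
// Build/execute split introduced here: one-time setup and finalize, then per-run bind/run/unbind.
#include <cstdio>

struct op_config {
    bool create_tensors() { std::puts("create_tensors"); return true; }
    bool add_op_to_graph() { std::puts("add_op_to_graph"); return true; }
    bool bind_input_tensors() { std::puts("bind_input_tensors"); return true; }
    bool bind_output_tensors() { std::puts("bind_output_tensors"); return true; }
    void unbind_input_tensors() { std::puts("unbind_input_tensors"); }
    void unbind_output_tensors() { std::puts("unbind_output_tensors"); }
};

int main() {
    op_config op;
    // build_graph: one-time setup, then finalize
    if (!op.create_tensors() || !op.add_op_to_graph()) return 1;
    std::puts("qnn_graph_finalize");
    // execute: per-invocation bind -> run -> unbind
    if (!op.bind_input_tensors() || !op.bind_output_tensors()) return 1;
    std::puts("qnn_graph_execute");
    op.unbind_input_tensors();
    op.unbind_output_tensors();
    return 0;
}
```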
|
||||
|
||||
bool execute(const ggml_tensor_array_t &tensor_inputs, const ggml_tensor_array_t &tensor_outputs) {
|
||||
GGML_ASSERT(tensor_inputs.size() == _tensor_inputs.size());
|
||||
GGML_ASSERT(tensor_outputs.size() == _tensor_outputs.size());
|
||||
for (size_t i = 0; i < tensor_inputs.size(); i++) {
|
||||
auto *ggml_tensor = tensor_inputs[i];
|
||||
if (!_tensor_inputs[i]->bind_ggml_tensor(ggml_tensor, true, _tensor_rank)) {
|
||||
QNN_LOG_ERROR("bind tensor %s failed\n", ggml_get_name(ggml_tensor));
|
||||
return false;
|
||||
}
|
||||
if (!_op_config->bind_input_tensors(tensor_inputs)) {
|
||||
QNN_LOG_ERROR("[%s]bind input tensors failed\n", _graph_name.c_str());
|
||||
return false;
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < tensor_outputs.size(); i++) {
|
||||
auto *ggml_tensor = tensor_outputs[i];
|
||||
if (!_tensor_outputs[i]->bind_ggml_tensor(ggml_tensor, false, _tensor_rank)) {
|
||||
QNN_LOG_ERROR("bind tensor %s failed\n", ggml_get_name(ggml_tensor));
|
||||
return false;
|
||||
}
|
||||
if (!_op_config->bind_output_tensors(tensor_outputs)) {
|
||||
QNN_LOG_ERROR("[%s]bind output tensors failed\n", _graph_name.c_str());
|
||||
return false;
|
||||
}
|
||||
|
||||
_op_config->set_input_tensors(_tensor_inputs);
|
||||
_op_config->set_output_tensors(_tensor_outputs);
|
||||
auto &qnn_tensor_inputs = _op_config->get_qnn_input_tensors();
|
||||
auto &qnn_tensor_outputs = _op_config->get_qnn_output_tensors();
|
||||
|
||||
|
|
@ -188,20 +129,15 @@ public:
|
|||
qnn_tensor_outputs.data(), qnn_tensor_outputs.size(), nullptr, nullptr);
|
||||
if (_device == QNN_BACKEND_NPU) {
|
||||
if (error == QNN_COMMON_ERROR_SYSTEM_COMMUNICATION) {
|
||||
QNN_LOG_WARN("NPU crashed. SSR detected. Caused QNN graph execute error\n");
|
||||
QNN_LOG_WARN("[%s]NPU crashed. SSR detected. Caused QNN graph execute error\n", _graph_name.c_str());
|
||||
}
|
||||
}
|
||||
|
||||
for (auto tensor : _tensor_inputs) {
|
||||
tensor->unbind_ggml_tensor();
|
||||
}
|
||||
|
||||
for (auto tensor : _tensor_outputs) {
|
||||
tensor->unbind_ggml_tensor();
|
||||
}
|
||||
_op_config->unbind_input_tensors();
|
||||
_op_config->unbind_output_tensors();
|
||||
|
||||
if (error != QNN_SUCCESS) {
|
||||
QNN_LOG_INFO("error = %d\n", error);
|
||||
QNN_LOG_INFO("[%s]error = %d\n", _graph_name.c_str(), error);
|
||||
return false;
|
||||
}
|
||||
|
||||
|
|
@ -220,11 +156,8 @@ private:
|
|||
Qnn_GraphHandle_t _graph_handle = nullptr;
|
||||
std::shared_ptr<qnn_instance> _qnn_instance;
|
||||
std::shared_ptr<qnn_interface> _qnn_interface;
|
||||
std::vector<std::shared_ptr<ggml_qnn_tensor>> _tensor_inputs;
|
||||
std::vector<std::shared_ptr<ggml_qnn_tensor>> _tensor_outputs;
|
||||
std::unique_ptr<ggml_qnn_op_config> _op_config;
|
||||
std::vector<Qnn_Param_t> _param_types;
|
||||
int _tensor_rank = 0;
|
||||
|
||||
DISABLE_COPY(ggml_qnn_graph);
|
||||
DISABLE_MOVE(ggml_qnn_graph);
|
||||
@ -0,0 +1,471 @@
|
|||
#include "op-config.hpp"
|
||||
|
||||
#include <cstdint>
|
||||
|
||||
#include "logger.hpp"
|
||||
|
||||
namespace {
|
||||
|
||||
constexpr const qnn::qnn_dimension_array_t kTransposeParamData[GGML_MAX_DIMS] = {
|
||||
{ 0 },
|
||||
{ 1, 0 },
|
||||
{ 0, 2, 1 },
|
||||
{ 0, 1, 3, 2 },
|
||||
};
|
||||
|
||||
qnn::qnn_dimension_array_t get_transposed_dimensions(const qnn::qnn_dimension_array_t &dimensions, int rank) {
|
||||
qnn::qnn_dimension_array_t transposed_dims = dimensions;
|
||||
if (rank >= 2) {
|
||||
transposed_dims[rank - 1] = dimensions[rank - 2];
|
||||
transposed_dims[rank - 2] = dimensions[rank - 1];
|
||||
}
|
||||
|
||||
return transposed_dims;
|
||||
}
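get_transposed_dimensions only swaps the two innermost dimensions and leaves the batch dimensions untouched. A self-contained worked example (transpose_last_two and the sample dimensions are illustrative):

```cpp
// Worked example: swap the last two dimensions, keep batch dimensions in place.
#include <array>
#include <cstdint>
#include <cstdio>
#include <utility>

using dims_t = std::array<uint32_t, 4>;

static dims_t transpose_last_two(dims_t d, int rank) {
    if (rank >= 2) std::swap(d[rank - 2], d[rank - 1]);
    return d;
}

int main() {
    dims_t d = {10, 1, 16, 256}; // sample rank-4 dimensions
    dims_t t = transpose_last_two(d, 4);
    std::printf("%u %u %u %u\n", t[0], t[1], t[2], t[3]); // 10 1 256 16
    return 0;
}
```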
|
||||
|
||||
int get_rank(const qnn::ggml_tensor_array_t &tensor_inputs, const qnn::ggml_tensor_array_t &tensor_outputs) {
|
||||
int tensor_rank = 0;
|
||||
// get the max tensor rank
|
||||
for (auto tensor : tensor_inputs) {
|
||||
tensor_rank = std::max(tensor_rank, ggml_n_dims(tensor));
|
||||
}
|
||||
for (auto tensor : tensor_outputs) {
|
||||
tensor_rank = std::max(tensor_rank, ggml_n_dims(tensor));
|
||||
}
|
||||
|
||||
return tensor_rank;
|
||||
}
|
||||
|
||||
Qnn_DataType_t get_tensor_type(const qnn::ggml_qnn_tensor_array_t &tensors) {
|
||||
Qnn_DataType_t type = QNN_DATATYPE_UNDEFINED;
|
||||
for (auto tensor : tensors) {
|
||||
auto tensor_type_size = qnn::qnn_datatype_size(tensor->get_data_type());
|
||||
GGML_ASSERT(tensor_type_size > 0);
|
||||
if (tensor_type_size > qnn::qnn_datatype_size(type)) {
|
||||
type = tensor->get_data_type();
|
||||
}
|
||||
}
|
||||
|
||||
return type;
|
||||
}
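get_tensor_type picks the widest element type among the given tensors, which is presumably what the convert ops mentioned in the commit message are sized against. A self-contained sketch of that rule (fake_tensor and widest_type are illustrative; the real code compares qnn_datatype_size):

```cpp
// Sketch: the widest element type among a set of tensors wins.
#include <cstdio>
#include <string>
#include <vector>

struct fake_tensor { std::string type; size_t type_size; };

static std::string widest_type(const std::vector<fake_tensor> &tensors) {
    std::string best;
    size_t best_size = 0;
    for (const auto &t : tensors) {
        if (t.type_size > best_size) { best_size = t.type_size; best = t.type; }
    }
    return best;
}

int main() {
    std::vector<fake_tensor> ts = {{"f16", 2}, {"f32", 4}, {"i8", 1}};
    std::printf("%s\n", widest_type(ts).c_str()); // f32
    return 0;
}
```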
|
||||
|
||||
struct tensor_common_params {
|
||||
const char *name_prefix;
|
||||
int tensor_rank;
|
||||
bool is_input;
|
||||
QNNBackend device;
|
||||
Qnn_GraphHandle_t graph_handle;
|
||||
std::shared_ptr<qnn::qnn_instance> qnn_instance;
|
||||
};
|
||||
|
||||
void create_tensors_from_ggml_tensor(const tensor_common_params ¶ms, const qnn::ggml_tensor_array_t &ggml_tensors,
|
||||
qnn::ggml_qnn_tensor_array_t *tensor_wrappers,
|
||||
std::vector<Qnn_Tensor_t> *qnn_tensors) {
|
||||
using namespace qnn;
|
||||
|
||||
tensor_wrappers->resize(ggml_tensors.size());
|
||||
if (qnn_tensors) {
|
||||
qnn_tensors->resize(ggml_tensors.size());
|
||||
}
|
||||
char buffer[GGML_MAX_NAME] = {};
|
||||
auto tensor_type = params.is_input ? ggml_qnn_tensor::INPUT : ggml_qnn_tensor::OUTPUT;
|
||||
for (size_t i = 0; i < ggml_tensors.size(); i++) {
|
||||
snprintf(buffer, GGML_MAX_NAME, "%s%d", params.name_prefix, (int)i);
|
||||
auto *ggml_tensor = ggml_tensors[i];
|
||||
(*tensor_wrappers)[i] = std::make_shared<ggml_qnn_tensor>(tensor_type, std::string(buffer), ggml_tensor->ne,
|
||||
ggml_tensor->type, params.tensor_rank, params.device,
|
||||
params.graph_handle, params.qnn_instance);
|
||||
}
|
||||
}
|
||||
|
||||
bool bind_tensors(const qnn::ggml_tensor_array_t &ggml_tensors, qnn::ggml_qnn_tensor_array_t &tensor_wrappers,
|
||||
std::vector<Qnn_Tensor_t> &qnn_tensors) {
|
||||
for (size_t i = 0; i < ggml_tensors.size(); i++) {
|
||||
auto *ggml_tensor = ggml_tensors[i];
|
||||
if (!tensor_wrappers[i]->bind_ggml_tensor(ggml_tensor)) {
|
||||
QNN_LOG_ERROR("bind tensor %s failed\n", ggml_get_name(ggml_tensor));
|
||||
return false;
|
||||
}
|
||||
|
||||
qnn_tensors[i] = tensor_wrappers[i]->get_qnn_tensor();
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
class ggml_qnn_connectable_op_config : public qnn::ggml_qnn_op_config_base {
|
||||
public:
|
||||
explicit ggml_qnn_connectable_op_config(const std::string &name, const std::string &package_name,
|
||||
const std::string &op_type,
|
||||
std::shared_ptr<qnn::qnn_instance> qnn_instance) :
|
||||
ggml_qnn_op_config_base(name, package_name, op_type, qnn_instance) {}
|
||||
|
||||
bool create_tensors(QNNBackend device, Qnn_GraphHandle_t graph_handle,
|
||||
const qnn::ggml_tensor_array_t &tensor_inputs,
|
||||
const qnn::ggml_tensor_array_t &tensor_outputs) override {
|
||||
GGML_UNUSED(device);
|
||||
GGML_UNUSED(graph_handle);
|
||||
GGML_UNUSED(tensor_inputs);
|
||||
GGML_UNUSED(tensor_outputs);
|
||||
return true;
|
||||
}
|
||||
|
||||
void set_input_tensors(qnn::ggml_qnn_tensor_array_t &tensor_inputs) {
|
||||
_tensor_inputs = tensor_inputs;
|
||||
_qnn_tensor_inputs.resize(_tensor_inputs.size());
|
||||
}
|
||||
|
||||
void set_input_tensors(qnn::ggml_qnn_tensor_array_t &&tensor_inputs) {
|
||||
_tensor_inputs = std::move(tensor_inputs);
|
||||
_qnn_tensor_inputs.resize(_tensor_inputs.size());
|
||||
}
|
||||
|
||||
void set_output_tensors(qnn::ggml_qnn_tensor_array_t &tensor_outputs) {
|
||||
_tensor_outputs = tensor_outputs;
|
||||
_qnn_tensor_outputs.resize(_tensor_outputs.size());
|
||||
}
|
||||
|
||||
void set_output_tensors(qnn::ggml_qnn_tensor_array_t &&tensor_outputs) {
|
||||
_tensor_outputs = std::move(tensor_outputs);
|
||||
_qnn_tensor_outputs.resize(_tensor_outputs.size());
|
||||
}
|
||||
|
||||
qnn::ggml_qnn_tensor_array_t &get_input_tensors() { return _tensor_inputs; }
|
||||
qnn::ggml_qnn_tensor_array_t &get_output_tensors() { return _tensor_outputs; }
|
||||
|
||||
private:
|
||||
DISABLE_COPY(ggml_qnn_connectable_op_config);
|
||||
DISABLE_MOVE(ggml_qnn_connectable_op_config);
|
||||
};
|
||||
|
||||
} // namespace
|
||||
|
||||
namespace qnn {
|
||||
|
||||
void ggml_qnn_op_config_base::add_scalar_param(const std::string &name, const Qnn_Scalar_t scalar) {
|
||||
_param_names.push_back(name);
|
||||
Qnn_Param_t param = QNN_PARAM_INIT;
|
||||
param.paramType = QNN_PARAMTYPE_SCALAR;
|
||||
param.name = _param_names.back().c_str();
|
||||
param.scalarParam = scalar;
|
||||
_qnn_parameters.push_back(param);
|
||||
}
|
||||
|
||||
bool ggml_qnn_op_config_base::add_tensor_param(const std::string &name, const qnn_dimension_array_t &dimensions,
|
||||
int rank, const uint8_t *data, const Qnn_DataType_t data_type,
|
||||
QNNBackend device, Qnn_GraphHandle_t graph_handle) {
|
||||
std::string tensor_name = _name + name + std::to_string(_tensor_parameters.size());
|
||||
auto param_tensor = std::make_shared<ggml_qnn_tensor>(ggml_qnn_tensor::PARAMETER, tensor_name, dimensions,
|
||||
data_type, rank, device, graph_handle, _qnn_instance);
|
||||
size_t data_size = ggml_type_size(ggml_datatype_from_qnn_datatype(data_type));
|
||||
for (int i = 0; i < rank; i++) {
|
||||
data_size *= dimensions[i];
|
||||
}
|
||||
|
||||
GGML_ASSERT(data_size > 0);
|
||||
if (!param_tensor->bind_buffer(const_cast<uint8_t *>(data), data_size)) {
|
||||
QNN_LOG_ERROR("parameter tensor bind_buffer failed\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!param_tensor->alloc_qnn_tensor_id()) {
|
||||
QNN_LOG_ERROR("parameter tensor alloc_qnn_tensor_id failed\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
_tensor_parameters.push_back(param_tensor);
|
||||
_param_names.push_back(name);
|
||||
Qnn_Param_t param = QNN_PARAM_INIT;
|
||||
param.paramType = QNN_PARAMTYPE_TENSOR;
|
||||
param.name = _param_names.back().c_str();
|
||||
param.tensorParam = param_tensor->get_qnn_tensor();
|
||||
_qnn_parameters.push_back(param);
|
||||
return true;
|
||||
}
|
||||
|
||||
bool ggml_qnn_op_config_base::add_op_to_graph(Qnn_GraphHandle_t graph_handle) {
|
||||
GGML_ASSERT(_qnn_tensor_inputs.size() == _tensor_inputs.size());
|
||||
GGML_ASSERT(_qnn_tensor_outputs.size() == _tensor_outputs.size());
|
||||
|
||||
auto qnn_interface = _qnn_instance->get_qnn_interface();
|
||||
for (size_t i = 0; i < _tensor_inputs.size(); i++) {
|
||||
auto tensor = _tensor_inputs[i];
|
||||
if (!tensor->alloc_qnn_tensor_id()) {
|
||||
QNN_LOG_ERROR("[%s]input tensor alloc_qnn_tensor_id failed\n", _name.c_str());
|
||||
return false;
|
||||
}
|
||||
|
||||
_qnn_tensor_inputs[i] = tensor->get_qnn_tensor();
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < _tensor_outputs.size(); i++) {
|
||||
auto tensor = _tensor_outputs[i];
|
||||
if (!tensor->alloc_qnn_tensor_id()) {
|
||||
QNN_LOG_ERROR("[%s]output tensor alloc_qnn_tensor_id failed\n", _name.c_str());
|
||||
return false;
|
||||
}
|
||||
_qnn_tensor_outputs[i] = _tensor_outputs[i]->get_qnn_tensor();
|
||||
}
|
||||
|
||||
auto error = qnn_interface->qnn_graph_add_node(graph_handle, get_op_config());
|
||||
if (error != QNN_SUCCESS) {
|
||||
auto *error_str = get_qnn_error_string(error);
|
||||
if (error_str) {
|
||||
QNN_LOG_ERROR("[%s]qnn_graph_add_node.error: %s\n", _name.c_str(), error_str);
|
||||
} else {
|
||||
QNN_LOG_ERROR("[%s]qnn_graph_add_node.error: %d\n", _name.c_str(), error);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
QNN_LOG_DEBUG("[%s]added to graph\n", _name.c_str());
|
||||
return true;
|
||||
}
|
||||
|
||||
bool ggml_qnn_op_config_base::bind_input_tensors(const ggml_tensor_array_t &tensor_inputs) {
|
||||
GGML_ASSERT(tensor_inputs.size() == _tensor_inputs.size());
|
||||
return bind_tensors(tensor_inputs, _tensor_inputs, _qnn_tensor_inputs);
|
||||
}
|
||||
|
||||
bool ggml_qnn_op_config_base::bind_output_tensors(const ggml_tensor_array_t &tensor_outputs) {
|
||||
GGML_ASSERT(tensor_outputs.size() == _tensor_outputs.size());
|
||||
return bind_tensors(tensor_outputs, _tensor_outputs, _qnn_tensor_outputs);
|
||||
}
|
||||
|
||||
void ggml_qnn_op_config_base::unbind_input_tensors() {
|
||||
for (auto &tensor : _tensor_inputs) {
|
||||
tensor->unbind();
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_qnn_op_config_base::unbind_output_tensors() {
|
||||
for (auto &tensor : _tensor_outputs) {
|
||||
tensor->unbind();
|
||||
}
|
||||
}
|
||||
|
||||
Qnn_OpConfig_t ggml_qnn_op_config_base::get_op_config() {
|
||||
Qnn_OpConfig_t config = QNN_OPCONFIG_INIT;
|
||||
config.version = QNN_OPCONFIG_VERSION_1;
|
||||
auto &op_config = config.v1;
|
||||
op_config.name = _name.c_str();
|
||||
op_config.packageName = _package_name.c_str();
|
||||
op_config.typeName = _op_type.c_str();
|
||||
op_config.numOfParams = (uint32_t)_qnn_parameters.size();
|
||||
op_config.params = _qnn_parameters.data();
|
||||
op_config.numOfInputs = (uint32_t)_qnn_tensor_inputs.size();
|
||||
op_config.inputTensors = _qnn_tensor_inputs.data();
|
||||
op_config.numOfOutputs = (uint32_t)_qnn_tensor_outputs.size();
|
||||
op_config.outputTensors = _qnn_tensor_outputs.data();
|
||||
return config;
|
||||
}
|
||||
|
||||
bool ggml_qnn_single_op_config::create_tensors(QNNBackend device, Qnn_GraphHandle_t graph_handle,
|
||||
const ggml_tensor_array_t &tensor_inputs,
|
||||
const ggml_tensor_array_t &tensor_outputs) {
|
||||
const auto tensor_rank = get_rank(tensor_inputs, tensor_outputs);
|
||||
tensor_common_params params = { "src", tensor_rank, true, device, graph_handle, _qnn_instance };
|
||||
create_tensors_from_ggml_tensor(params, tensor_inputs, &_tensor_inputs, &_qnn_tensor_inputs);
|
||||
params.name_prefix = "dst";
|
||||
params.is_input = false;
|
||||
create_tensors_from_ggml_tensor(params, tensor_outputs, &_tensor_outputs, &_qnn_tensor_outputs);
|
||||
return true;
|
||||
}
|
||||
|
||||
bool ggml_qnn_matmul_op_config::create_tensors(QNNBackend device, Qnn_GraphHandle_t graph_handle,
|
||||
const ggml_tensor_array_t &tensor_inputs,
|
||||
const ggml_tensor_array_t &tensor_outputs) {
|
||||
GGML_ASSERT(tensor_inputs.size() == 2);
|
||||
GGML_ASSERT(tensor_outputs.size() == 1);
|
||||
const auto tensor_rank = get_rank(tensor_inputs, tensor_outputs);
|
||||
GGML_ASSERT(tensor_rank >= 2);
|
||||
|
||||
// create input tensors
|
||||
tensor_common_params params = { "src", tensor_rank, true, device, graph_handle, _qnn_instance };
|
||||
create_tensors_from_ggml_tensor(params, tensor_inputs, &_tensor_inputs, &_qnn_tensor_inputs);
|
||||
|
||||
// create output tensor
|
||||
ggml_qnn_tensor_array_t mat_mul_tensor_outputs;
|
||||
params.name_prefix = "dst";
|
||||
params.is_input = false;
|
||||
create_tensors_from_ggml_tensor(params, tensor_outputs, &mat_mul_tensor_outputs, nullptr);
|
||||
|
||||
// create mat_mul nodes
|
||||
return create_mat_mul_nodes(device, graph_handle, tensor_rank, _tensor_inputs, mat_mul_tensor_outputs);
|
||||
}
|
||||
|
||||
bool ggml_qnn_matmul_op_config::create_mat_mul_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, const int rank,
|
||||
ggml_qnn_tensor_array_t &tensor_inputs,
|
||||
ggml_qnn_tensor_array_t &tensor_outputs) {
|
||||
|
||||
/*
|
||||
* First, both ggml and qnn tensors are stored in memory in row-major format. (For more details, see:
* https://pytorch.org/blog/tensor-memory-format-matters/#:~:text=Column%20Major%20Order:%20In%20this%20format,%20the%20matrix)
* But their dimensions are stored in the opposite order.
|
||||
* For example, a 2x3 matrix:
|
||||
* [
|
||||
* [1, 2, 3],
|
||||
* [4, 5, 6],
|
||||
* ]
|
||||
* The ggml tensor will have dimensions [3, 2], while the qnn tensor will have dimensions [2, 3].
|
||||
*
|
||||
* Second, from the ggml introduction here: https://github.com/huggingface/blog/blob/main/introduction-to-ggml.md
|
||||
* Given 2 matrices A and B, the matrix multiplication C = A * B is defined as:
|
||||
* ```python
|
||||
* import torch
|
||||
* # Create two matrices
|
||||
* A = torch.tensor([
|
||||
* [2, 8],
|
||||
* [5, 1],
|
||||
* [4, 2],
|
||||
* [8, 6],
|
||||
* ])
|
||||
* B = torch.tensor([
|
||||
* [10, 5],
|
||||
* [9, 9],
|
||||
* [5, 4],
|
||||
* ])
|
||||
* # Perform matrix multiplication
|
||||
* result = torch.matmul(A, B.T)
|
||||
* print(result.T)
|
||||
* ```
|
||||
* Here, B.T is the transpose of B.
|
||||
*
|
||||
* So here we need to create graph like:
|
||||
* ```mermaid
|
||||
* graph TD;
|
||||
* i1>ggml_tensor_in0] --src0--> mat_mul0;
|
||||
* i2>ggml_tensor_in1] --src1--> transpose0;
|
||||
* transpose0 --src0_trans--> mat_mul0;
|
||||
* mat_mul0 --dst_trans--> transpose1;
|
||||
* transpose1 --dst0--> o1>ggml_tensor_out];
|
||||
* ```
|
||||
*/
|
||||
|
||||
// create src0_trans tensor
|
||||
auto src1 = tensor_inputs.back();
|
||||
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS does not match the expected value");
|
||||
|
||||
qnn_dimension_array_t dimensions = get_transposed_dimensions(src1->get_dimensions(), rank);
|
||||
auto src0_trans =
|
||||
std::make_shared<ggml_qnn_tensor>(ggml_qnn_tensor::INTERMEDIATE, "src0_trans", dimensions,
|
||||
src1->get_data_type(), rank, device, graph_handle, _qnn_instance);
|
||||
|
||||
// create dst_trans tensor
|
||||
auto dst = tensor_outputs.front();
|
||||
dimensions = get_transposed_dimensions(dst->get_dimensions(), rank);
|
||||
auto dst_trans = std::make_shared<ggml_qnn_tensor>(ggml_qnn_tensor::INTERMEDIATE, "dst_trans", dimensions,
|
||||
dst->get_data_type(), rank, device, graph_handle, _qnn_instance);
|
||||
|
||||
// create transpose0
|
||||
auto transpose0 = std::make_shared<ggml_qnn_connectable_op_config>(_name + "_trans0", QNN_OP_PACKAGE_NAME_QTI_AISW,
|
||||
QNN_OP_TRANSPOSE, _qnn_instance);
|
||||
|
||||
// create transpose1
|
||||
auto transpose1 = std::make_shared<ggml_qnn_connectable_op_config>(_name + "_trans1", QNN_OP_PACKAGE_NAME_QTI_AISW,
|
||||
QNN_OP_TRANSPOSE, _qnn_instance);
|
||||
|
||||
// create mat_mul
|
||||
auto mat_mul = std::make_shared<ggml_qnn_connectable_op_config>(_name, QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_MAT_MUL,
|
||||
_qnn_instance);
|
||||
|
||||
// set transpose0 parameters
|
||||
auto *params_data = reinterpret_cast<const uint8_t *>(kTransposeParamData[rank - 1].data());
|
||||
const qnn_dimension_array_t param_dims = { (uint32_t)rank, 1, 1, 1 };
|
||||
transpose0->add_tensor_param(QNN_OP_TRANSPOSE_PARAM_PERM, param_dims, 1, params_data, QNN_DATATYPE_UINT_32, device,
|
||||
graph_handle);
|
||||
|
||||
// set transpose1 parameters
|
||||
transpose1->add_tensor_param(QNN_OP_TRANSPOSE_PARAM_PERM, param_dims, 1, params_data, QNN_DATATYPE_UINT_32, device,
|
||||
graph_handle);
|
||||
|
||||
// set tensor to transpose0
|
||||
ggml_qnn_tensor_array_t tensors = { tensor_inputs.back() };
|
||||
transpose0->set_input_tensors(tensors);
|
||||
tensors = { src0_trans };
|
||||
transpose0->set_output_tensors(tensors);
|
||||
|
||||
// set tensor to mat_mul
|
||||
tensors = { tensor_inputs.front(), src0_trans };
|
||||
mat_mul->set_input_tensors(tensors);
|
||||
tensors = { dst_trans };
|
||||
mat_mul->set_output_tensors(tensors);
|
||||
|
||||
// set tensor to transpose1
|
||||
tensors = { dst_trans };
|
||||
transpose1->set_input_tensors(tensors);
|
||||
transpose1->set_output_tensors(tensor_outputs);
|
||||
|
||||
_mat_mul = mat_mul;
|
||||
_transpose0 = transpose0;
|
||||
_transpose1 = transpose1;
|
||||
return true;
|
||||
}
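A small numeric check of the scheme documented in the comment above: computing matmul(A, B^T) and transposing the result is exactly what the transpose0 → mat_mul → transpose1 chain is assembled to produce for ggml's MUL_MAT convention. Self-contained, using the same A and B as the PyTorch example (mat, matmul and transpose are illustrative helpers):

```cpp
// Numeric check: (A * B^T)^T with the comment's example matrices.
#include <cstdio>
#include <vector>

using mat = std::vector<std::vector<int>>;

static mat matmul(const mat &a, const mat &b) { // (r x k) * (k x c)
    mat c(a.size(), std::vector<int>(b[0].size(), 0));
    for (size_t i = 0; i < a.size(); ++i)
        for (size_t j = 0; j < b[0].size(); ++j)
            for (size_t k = 0; k < b.size(); ++k) c[i][j] += a[i][k] * b[k][j];
    return c;
}

static mat transpose(const mat &m) {
    mat t(m[0].size(), std::vector<int>(m.size()));
    for (size_t i = 0; i < m.size(); ++i)
        for (size_t j = 0; j < m[0].size(); ++j) t[j][i] = m[i][j];
    return t;
}

int main() {
    mat A = {{2, 8}, {5, 1}, {4, 2}, {8, 6}}; // 4 x 2
    mat B = {{10, 5}, {9, 9}, {5, 4}};        // 3 x 2
    mat result = transpose(matmul(A, transpose(B))); // (A * B^T)^T -> 3 x 4
    for (auto &row : result) {
        for (int v : row) std::printf("%4d ", v);
        std::printf("\n");
    }
    return 0;
}
```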
|
||||
|
||||
bool ggml_qnn_matmul_op_config::add_op_to_graph(Qnn_GraphHandle_t graph_handle) {
|
||||
for (auto &convert : _input_converts) {
|
||||
if (convert && !convert->add_op_to_graph(graph_handle)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return _transpose0->add_op_to_graph(graph_handle) && _mat_mul->add_op_to_graph(graph_handle) &&
|
||||
_transpose1->add_op_to_graph(graph_handle) &&
|
||||
(!_output_convert || _output_convert->add_op_to_graph(graph_handle));
|
||||
}
|
||||
|
||||
bool ggml_qnn_matmul_op_config::bind_input_tensors(const ggml_tensor_array_t &tensor_inputs) {
|
||||
return bind_tensors(tensor_inputs, _tensor_inputs, _qnn_tensor_inputs);
|
||||
}
|
||||
|
||||
bool ggml_qnn_matmul_op_config::bind_output_tensors(const ggml_tensor_array_t &tensor_outputs) {
|
||||
if (_output_convert) {
|
||||
return _output_convert->bind_output_tensors(tensor_outputs);
|
||||
} else {
|
||||
return _transpose1->bind_output_tensors(tensor_outputs);
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_qnn_matmul_op_config::unbind_input_tensors() {
|
||||
_mat_mul->unbind_input_tensors();
|
||||
_transpose0->unbind_input_tensors();
|
||||
for (auto &convert : _input_converts) {
|
||||
if (convert) {
|
||||
convert->unbind_input_tensors();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_qnn_matmul_op_config::unbind_output_tensors() {
|
||||
_transpose1->unbind_output_tensors();
|
||||
if (_output_convert) {
|
||||
_output_convert->unbind_output_tensors();
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<Qnn_Tensor_t> &ggml_qnn_matmul_op_config::get_qnn_output_tensors() {
|
||||
if (_output_convert) {
|
||||
return _output_convert->get_qnn_output_tensors();
|
||||
} else {
|
||||
return _transpose1->get_qnn_output_tensors();
|
||||
}
|
||||
}
|
||||
|
||||
ggml_op_constructor_t create_op_constructor(const std::string &op_name) {
|
||||
if (op_name == QNN_OP_MAT_MUL) {
|
||||
// For QNN_OP_MAT_MUL, we need to transpose the input tensor
|
||||
return [](const std::string &instance_name,
|
||||
std::shared_ptr<qnn::qnn_instance> qnn_instance) -> std::unique_ptr<qnn::ggml_qnn_op_config> {
|
||||
QNN_LOG_DEBUG("create QNN_OP_MAT_MUL, name %s\n", instance_name.c_str());
|
||||
return std::make_unique<qnn::ggml_qnn_matmul_op_config>(instance_name, qnn_instance);
|
||||
};
|
||||
}
|
||||
|
||||
return [op_name](const std::string &instance_name,
|
||||
std::shared_ptr<qnn::qnn_instance> qnn_instance) -> std::unique_ptr<qnn::ggml_qnn_op_config> {
|
||||
return std::make_unique<qnn::ggml_qnn_single_op_config>(instance_name, QNN_OP_PACKAGE_NAME_QTI_AISW, op_name,
|
||||
qnn_instance);
|
||||
};
|
||||
}
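create_op_constructor is a small factory: the op name decides whether the returned lambda builds the dedicated mat_mul config or a plain single-op config. A self-contained sketch of the same pattern (op_config, single_op, matmul_op and the "MatMul" literal are illustrative stand-ins for the classes above):

```cpp
// Factory pattern used by create_op_constructor: op name selects the concrete config type.
#include <cstdio>
#include <functional>
#include <memory>
#include <string>

struct op_config { virtual ~op_config() = default; virtual const char *kind() const = 0; };
struct single_op : op_config { const char *kind() const override { return "single_op"; } };
struct matmul_op : op_config { const char *kind() const override { return "matmul_op"; } };

using op_constructor_t = std::function<std::unique_ptr<op_config>(const std::string &)>;

static op_constructor_t create_op_constructor(const std::string &op_name) {
    if (op_name == "MatMul") {
        return [](const std::string &) -> std::unique_ptr<op_config> { return std::make_unique<matmul_op>(); };
    }
    return [](const std::string &) -> std::unique_ptr<op_config> { return std::make_unique<single_op>(); };
}

int main() {
    auto ctor = create_op_constructor("MatMul");
    std::printf("%s\n", ctor("graph0")->kind()); // matmul_op
    return 0;
}
```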
|
||||
|
||||
} // namespace qnn
|
||||
|
|
@ -1,73 +1,122 @@
|
|||
#pragma once
|
||||
|
||||
#include <array>
|
||||
#include <functional>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "ggml-qnn.h"
|
||||
|
||||
#include "logger.hpp"
|
||||
#include "qnn-lib.hpp"
|
||||
#include "qnn-types.hpp"
|
||||
#include "tensor.hpp"
|
||||
|
||||
namespace qnn {
|
||||
|
||||
using ggml_tensor_array_t = std::vector<ggml_tensor *>;
|
||||
|
||||
class ggml_qnn_op_config {
|
||||
public:
|
||||
explicit ggml_qnn_op_config(const std::string &name, const std::string &package_name, const std::string &op_type) :
|
||||
_name(name), _package_name(package_name), _op_type(op_type) {}
|
||||
virtual ~ggml_qnn_op_config() {}
|
||||
virtual bool create_tensors(QNNBackend device, Qnn_GraphHandle_t graph_handle,
|
||||
const ggml_tensor_array_t &tensor_inputs,
|
||||
const ggml_tensor_array_t &tensor_outputs) = 0;
|
||||
virtual std::vector<Qnn_Tensor_t> &get_qnn_input_tensors() = 0;
|
||||
virtual std::vector<Qnn_Tensor_t> &get_qnn_output_tensors() = 0;
|
||||
virtual bool add_op_to_graph(Qnn_GraphHandle_t graph_handle) = 0;
|
||||
virtual bool bind_input_tensors(const ggml_tensor_array_t &tensor_inputs) = 0;
|
||||
virtual bool bind_output_tensors(const ggml_tensor_array_t &tensor_outputs) = 0;
|
||||
virtual void unbind_input_tensors() = 0;
|
||||
virtual void unbind_output_tensors() = 0;
|
||||
};
|
||||
|
||||
void set_input_tensors(const std::vector<std::shared_ptr<ggml_qnn_tensor>> &tensor_inputs) {
|
||||
_qnn_tensor_inputs.resize(tensor_inputs.size());
|
||||
for (size_t i = 0; i < tensor_inputs.size(); i++) {
|
||||
_qnn_tensor_inputs[i] = tensor_inputs[i]->get_qnn_tensor();
|
||||
}
|
||||
}
|
||||
class ggml_qnn_op_config_base : public ggml_qnn_op_config {
|
||||
public:
|
||||
explicit ggml_qnn_op_config_base(const std::string &name, const std::string &package_name,
|
||||
const std::string &op_type, std::shared_ptr<qnn_instance> qnn_instance) :
|
||||
_name(name), _package_name(package_name), _op_type(op_type), _qnn_instance(qnn_instance) {}
|
||||
|
||||
void set_output_tensors(const std::vector<std::shared_ptr<ggml_qnn_tensor>> &tensor_outputs) {
|
||||
_qnn_tensor_outputs.resize(tensor_outputs.size());
|
||||
for (size_t i = 0; i < tensor_outputs.size(); i++) {
|
||||
_qnn_tensor_outputs[i] = tensor_outputs[i]->get_qnn_tensor();
|
||||
}
|
||||
}
|
||||
void add_scalar_param(const std::string &name, const Qnn_Scalar_t scalar);
|
||||
bool add_tensor_param(const std::string &name, const qnn_dimension_array_t &dimensions, int rank,
|
||||
const uint8_t *data, const Qnn_DataType_t data_type, QNNBackend device,
|
||||
Qnn_GraphHandle_t graph_handle);
|
||||
bool add_op_to_graph(Qnn_GraphHandle_t graph_handle) override;
|
||||
bool bind_input_tensors(const ggml_tensor_array_t &tensor_inputs) override;
|
||||
bool bind_output_tensors(const ggml_tensor_array_t &tensor_outputs) override;
|
||||
void unbind_input_tensors() override;
|
||||
void unbind_output_tensors() override;
|
||||
std::vector<Qnn_Tensor_t> &get_qnn_input_tensors() override { return _qnn_tensor_inputs; }
|
||||
std::vector<Qnn_Tensor_t> &get_qnn_output_tensors() override { return _qnn_tensor_outputs; }
|
||||
|
||||
void add_scalar_param(const std::string &name, const Qnn_Scalar_t scalar) {
|
||||
_param_names.push_back(name);
|
||||
Qnn_Param_t param = QNN_PARAM_INIT;
|
||||
param.paramType = QNN_PARAMTYPE_SCALAR;
|
||||
param.name = _param_names.back().c_str();
|
||||
param.scalarParam = scalar;
|
||||
_parameters.push_back(param);
|
||||
}
|
||||
protected:
|
||||
Qnn_OpConfig_t get_op_config();
|
||||
|
||||
std::vector<Qnn_Tensor_t> &get_qnn_input_tensors() { return _qnn_tensor_inputs; }
|
||||
std::vector<Qnn_Tensor_t> &get_qnn_output_tensors() { return _qnn_tensor_outputs; }
|
||||
|
||||
Qnn_OpConfig_t get_op_config() {
|
||||
Qnn_OpConfig_t config = QNN_OPCONFIG_INIT;
|
||||
config.version = QNN_OPCONFIG_VERSION_1;
|
||||
auto &op_config = config.v1;
|
||||
op_config.name = _name.c_str();
|
||||
op_config.packageName = _package_name.c_str();
|
||||
op_config.typeName = _op_type.c_str();
|
||||
op_config.numOfParams = (uint32_t)_parameters.size();
|
||||
op_config.params = _parameters.data();
|
||||
op_config.numOfInputs = (uint32_t)_qnn_tensor_inputs.size();
|
||||
op_config.inputTensors = _qnn_tensor_inputs.data();
|
||||
op_config.numOfOutputs = (uint32_t)_qnn_tensor_outputs.size();
|
||||
op_config.outputTensors = _qnn_tensor_outputs.data();
|
||||
return config;
|
||||
}
|
||||
|
||||
private:
|
||||
std::string _name;
|
||||
std::string _package_name;
|
||||
std::string _op_type;
|
||||
std::shared_ptr<qnn_instance> _qnn_instance;
|
||||
ggml_qnn_tensor_array_t _tensor_inputs;
|
||||
ggml_qnn_tensor_array_t _tensor_outputs;
|
||||
ggml_qnn_tensor_array_t _tensor_parameters;
|
||||
std::vector<Qnn_Tensor_t> _qnn_tensor_inputs;
|
||||
std::vector<Qnn_Tensor_t> _qnn_tensor_outputs;
|
||||
std::vector<Qnn_Param_t> _parameters;
|
||||
std::vector<Qnn_Param_t> _qnn_parameters;
|
||||
std::vector<std::string> _param_names;
|
||||
|
||||
DISABLE_COPY(ggml_qnn_op_config);
|
||||
DISABLE_MOVE(ggml_qnn_op_config);
|
||||
DISABLE_COPY(ggml_qnn_op_config_base);
|
||||
DISABLE_MOVE(ggml_qnn_op_config_base);
|
||||
};
|
||||
|
||||
class ggml_qnn_single_op_config : public ggml_qnn_op_config_base {
|
||||
public:
|
||||
explicit ggml_qnn_single_op_config(const std::string &name, const std::string &package_name,
|
||||
const std::string &op_type, std::shared_ptr<qnn_instance> qnn_instance) :
|
||||
ggml_qnn_op_config_base(name, package_name, op_type, qnn_instance) {}
|
||||
|
||||
bool create_tensors(QNNBackend device, Qnn_GraphHandle_t graph_handle, const ggml_tensor_array_t &tensor_inputs,
|
||||
const ggml_tensor_array_t &tensor_outputs) override;
|
||||
|
||||
private:
|
||||
DISABLE_COPY(ggml_qnn_single_op_config);
|
||||
DISABLE_MOVE(ggml_qnn_single_op_config);
|
||||
};
|
||||
|
||||
class ggml_qnn_matmul_op_config : public ggml_qnn_op_config {
|
||||
public:
|
||||
ggml_qnn_matmul_op_config(const std::string &name, std::shared_ptr<qnn_instance> qnn_instance) :
|
||||
_name(name), _qnn_instance(qnn_instance) {}
|
||||
|
||||
bool create_tensors(QNNBackend device, Qnn_GraphHandle_t graph_handle, const ggml_tensor_array_t &tensor_inputs,
|
||||
const ggml_tensor_array_t &tensor_outputs) override;
|
||||
bool add_op_to_graph(Qnn_GraphHandle_t graph_handle) override;
|
||||
bool bind_input_tensors(const ggml_tensor_array_t &tensor_inputs) override;
|
||||
bool bind_output_tensors(const ggml_tensor_array_t &tensor_outputs) override;
|
||||
void unbind_input_tensors() override;
|
||||
void unbind_output_tensors() override;
|
||||
std::vector<Qnn_Tensor_t> &get_qnn_input_tensors() override { return _qnn_tensor_inputs; }
|
||||
std::vector<Qnn_Tensor_t> &get_qnn_output_tensors() override;
|
||||
|
||||
private:
|
||||
bool create_mat_mul_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, const int rank,
|
||||
ggml_qnn_tensor_array_t &tensor_inputs, ggml_qnn_tensor_array_t &tensor_outputs);
|
||||
|
||||
std::string _name;
|
||||
std::shared_ptr<qnn_instance> _qnn_instance;
|
||||
std::shared_ptr<ggml_qnn_op_config> _transpose0;
|
||||
std::shared_ptr<ggml_qnn_op_config> _transpose1;
|
||||
std::shared_ptr<ggml_qnn_op_config> _mat_mul;
|
||||
std::vector<std::shared_ptr<ggml_qnn_op_config>> _input_converts;
|
||||
std::shared_ptr<ggml_qnn_op_config> _output_convert;
|
||||
ggml_qnn_tensor_array_t _tensor_inputs;
|
||||
std::vector<Qnn_Tensor_t> _qnn_tensor_inputs;
|
||||
|
||||
DISABLE_COPY(ggml_qnn_matmul_op_config);
|
||||
DISABLE_MOVE(ggml_qnn_matmul_op_config);
|
||||
};
|
||||
|
||||
using ggml_op_constructor_t =
|
||||
std::function<std::unique_ptr<ggml_qnn_op_config>(const std::string &, std::shared_ptr<qnn_instance>)>;
|
||||
|
||||
ggml_op_constructor_t create_op_constructor(const std::string &op_name);
|
||||
|
||||
} // namespace qnn
|
||||
|
|
|
|||
|
|
@ -637,7 +637,7 @@ public:
|
|||
return mem_fd;
|
||||
}
|
||||
|
||||
Qnn_MemHandle_t register_rpcmem(void *p_data, uint32_t rank, uint32_t *dimensions, Qnn_DataType_t data_type) {
|
||||
Qnn_MemHandle_t register_rpcmem(void *p_data, const uint32_t rank, uint32_t *dimensions, Qnn_DataType_t data_type) {
|
||||
if (!p_data) {
|
||||
QNN_LOG_WARN("invalid param\n");
|
||||
return nullptr;
|
||||
|
|
|
|||
|
|
@ -1,8 +1,10 @@
|
|||
|
||||
#pragma once
|
||||
|
||||
#include <algorithm>
|
||||
#include <array>
|
||||
#include <atomic>
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
|
|
@ -16,55 +18,81 @@
|
|||
|
||||
namespace qnn {
|
||||
|
||||
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS should be 4");
|
||||
|
||||
class ggml_qnn_tensor {
|
||||
public:
|
||||
explicit ggml_qnn_tensor(const std::string &name, QNNBackend device, Qnn_GraphHandle_t graph_handle,
|
||||
typedef enum _tensor_type { INPUT, OUTPUT, INTERMEDIATE, PARAMETER } tensor_type_t;
|
||||
|
||||
explicit ggml_qnn_tensor(tensor_type_t tensor_type, const std::string &name,
|
||||
const qnn_dimension_array_t &dimensions, Qnn_DataType_t data_type, int rank,
|
||||
QNNBackend device, Qnn_GraphHandle_t graph_handle,
|
||||
std::shared_ptr<qnn_instance> qnn_instance) :
|
||||
_tensor_name(name), _device(device), _qnn_instance(qnn_instance), _graph_handle(graph_handle) {
|
||||
QNN_TENSOR_SET_NAME(_qnn_tensor, _tensor_name.c_str());
|
||||
if (!_tensor_name.empty()) {
|
||||
QNN_TENSOR_SET_NAME(_qnn_tensor, _tensor_name.c_str());
|
||||
}
|
||||
QNN_TENSOR_SET_DIMENSIONS(_qnn_tensor, _dimensions.data());
|
||||
QNN_TENSOR_SET_TYPE(_qnn_tensor, QNN_TENSOR_TYPE_NATIVE);
|
||||
QNN_TENSOR_SET_DATA_FORMAT(_qnn_tensor, QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER);
|
||||
QNN_LOG_DEBUG("create tensor %s, device: %d", _tensor_name.c_str(), device);
|
||||
|
||||
_dimensions = dimensions;
|
||||
update_params_from_ggml_tensor(tensor_type, data_type, rank);
|
||||
QNN_LOG_DEBUG("create tensor %s, rank: %d, dims: [%d, %d, %d, %d], data_type: %d, device: %d",
|
||||
_tensor_name.c_str(), rank, (int)_dimensions[0], (int)_dimensions[1], (int)_dimensions[2],
|
||||
(int)_dimensions[3], (int)data_type, (int)device);
|
||||
}
|
||||
|
||||
explicit ggml_qnn_tensor(tensor_type_t tensor_type, const std::string &name,
|
||||
const ggml_dimension_array_t &dimensions, ggml_type data_type, int rank, QNNBackend device,
|
||||
Qnn_GraphHandle_t graph_handle, std::shared_ptr<qnn_instance> qnn_instance) :
|
||||
ggml_qnn_tensor(tensor_type, name, get_internal_dimension(dimensions, rank),
|
||||
qnn_datatype_from_ggml_datatype(data_type), rank, device, graph_handle, qnn_instance) {}
|
||||
|
||||
~ggml_qnn_tensor() { _qnn_rpc_buffer.reset(); }
|
||||
|
||||
bool bind_ggml_tensor(ggml_tensor *tensor, bool is_input, int prev_max_rank) {
|
||||
if (_tensor) {
|
||||
if (_tensor != tensor) {
|
||||
QNN_LOG_WARN("tensor %s has been bound to another ggml tensor %s", _tensor_name.c_str(),
|
||||
ggml_get_name(_tensor));
|
||||
return false;
|
||||
}
|
||||
QNN_LOG_INFO("tensor %s already bound to same ggml tensor %s", _tensor_name.c_str(),
|
||||
ggml_get_name(_tensor));
|
||||
bool alloc_qnn_tensor_id() {
|
||||
if (QNN_TENSOR_GET_ID(_qnn_tensor)) {
|
||||
QNN_LOG_WARN("graph tensor %s already created, id %d", _tensor_name.c_str(),
|
||||
QNN_TENSOR_GET_ID(_qnn_tensor));
|
||||
return true;
|
||||
}
|
||||
|
||||
update_params_from_ggml_tensor(tensor, prev_max_rank);
|
||||
Qnn_TensorType_t new_tensor_type = is_input ? QNN_TENSOR_TYPE_APP_WRITE : QNN_TENSOR_TYPE_APP_READ;
|
||||
QNN_TENSOR_SET_TYPE(_qnn_tensor, new_tensor_type);
|
||||
QNN_LOG_INFO("tensor %s changed to type %d", _tensor_name.c_str(), new_tensor_type);
|
||||
Qnn_Tensor_t qnn_tensor = _qnn_tensor;
|
||||
auto qnn_interface = _qnn_instance->get_qnn_interface();
|
||||
auto error = qnn_interface->qnn_tensor_create_graph_tensor(_graph_handle, &qnn_tensor);
|
||||
if (error != QNN_SUCCESS) {
|
||||
QNN_LOG_WARN("create graph tensor failed, tensor %s, error: %d\n", _tensor_name.c_str(), error);
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!QNN_TENSOR_GET_ID(_qnn_tensor)) {
|
||||
Qnn_Tensor_t qnn_tensor = _qnn_tensor;
|
||||
auto qnn_interface = _qnn_instance->get_qnn_interface();
|
||||
auto error = qnn_interface->qnn_tensor_create_graph_tensor(_graph_handle, &qnn_tensor);
|
||||
if (error != QNN_SUCCESS) {
|
||||
QNN_LOG_WARN("create graph tensor failed, tensor %s, error: %d\n", _tensor_name.c_str(), error);
|
||||
QNN_TENSOR_SET_ID(_qnn_tensor, QNN_TENSOR_GET_ID(qnn_tensor));
|
||||
QNN_LOG_DEBUG("create graph tensor %s, id: %d, rank: %d", _tensor_name.c_str(), QNN_TENSOR_GET_ID(qnn_tensor),
|
||||
QNN_TENSOR_GET_RANK(qnn_tensor));
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool bind_buffer(uint8_t *buffer, const size_t buffer_size) {
|
||||
if (_buffer) {
|
||||
if (_buffer != buffer) {
|
||||
QNN_LOG_WARN("tensor %s has been bound to another buffer %p", _tensor_name.c_str(), _buffer);
|
||||
return false;
|
||||
}
|
||||
|
||||
QNN_TENSOR_SET_ID(_qnn_tensor, QNN_TENSOR_GET_ID(qnn_tensor));
|
||||
QNN_LOG_DEBUG("create graph tensor %s, id: %d, rank: %d", _tensor_name.c_str(),
|
||||
QNN_TENSOR_GET_ID(qnn_tensor), QNN_TENSOR_GET_RANK(qnn_tensor));
|
||||
QNN_LOG_INFO("tensor %s already bound to same ggml tensor %p", _tensor_name.c_str(), _buffer);
|
||||
return true;
|
||||
}
|
||||
|
||||
if (QNN_TENSOR_GET_TYPE(_qnn_tensor) == QNN_TENSOR_TYPE_NATIVE) {
|
||||
QNN_LOG_DEBUG("tensor %s type(%d) not READ/WRITE, skipping", _tensor_name.c_str(),
|
||||
(int)QNN_TENSOR_TYPE_NATIVE);
|
||||
return true;
|
||||
}
|
||||
|
||||
if (should_use_mem_handle()) {
|
||||
if (!_qnn_rpc_buffer) {
|
||||
auto qnn_rpc_buffer = std::make_unique<ggml_qnn_rpc_buffer>(
|
||||
_qnn_instance, ggml_nbytes(tensor), QNN_TENSOR_GET_RANK(_qnn_tensor),
|
||||
_qnn_instance, buffer_size, QNN_TENSOR_GET_RANK(_qnn_tensor),
|
||||
QNN_TENSOR_GET_DIMENSIONS(_qnn_tensor), QNN_TENSOR_GET_DATA_TYPE(_qnn_tensor));
|
||||
if (!qnn_rpc_buffer->is_valid()) {
|
||||
QNN_LOG_WARN("alloc rpc mem failed, tensor %s", _tensor_name.c_str());
|
||||
|
|
@ -79,30 +107,41 @@ public:
|
|||
QNN_LOG_DEBUG("tensor %s, use mem handle %p", _tensor_name.c_str(), QNN_TENSOR_GET_MEM_HANDLE(_qnn_tensor));
|
||||
} else {
|
||||
QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_RAW);
|
||||
Qnn_ClientBuffer_t client_buf = { tensor->data, get_ggml_tensor_data_size(tensor) };
|
||||
Qnn_ClientBuffer_t client_buf = { buffer, (uint32_t)buffer_size };
|
||||
QNN_TENSOR_SET_CLIENT_BUF(_qnn_tensor, client_buf);
|
||||
QNN_LOG_DEBUG("tensor %s, use client buffer %p size %d", _tensor_name.c_str(), client_buf.data,
|
||||
(int)client_buf.dataSize);
|
||||
}
|
||||
|
||||
_tensor = tensor;
|
||||
_buffer = buffer;
|
||||
_buffer_size = buffer_size;
|
||||
|
||||
if (!write_to_qnn_tensor()) {
|
||||
QNN_LOG_WARN("write to qnn tensor failed, tensor %s", _tensor_name.c_str());
|
||||
return false;
|
||||
}
|
||||
|
||||
QNN_LOG_DEBUG("bind tensor %s to ggml tensor %s", _tensor_name.c_str(), ggml_get_name(tensor));
|
||||
QNN_LOG_DEBUG("bind tensor %s to buffer: %p, size: %d", _tensor_name.c_str(), buffer, (int)buffer_size);
|
||||
return true;
|
||||
}
|
||||
|
||||
bool unbind_ggml_tensor() {
|
||||
bool bind_ggml_tensor(ggml_tensor *tensor) {
|
||||
if (!bind_buffer(reinterpret_cast<uint8_t *>(tensor->data), ggml_nbytes(tensor))) {
|
||||
QNN_LOG_WARN("Failed to bind tensor: %s to ggml tensor: %s", _tensor_name.c_str(), ggml_get_name(tensor));
|
||||
return false;
|
||||
}
|
||||
|
||||
QNN_LOG_DEBUG("Bind tensor %s to ggml tensor %s", _tensor_name.c_str(), ggml_get_name(tensor));
|
||||
return true;
|
||||
}
|
||||
|
||||
bool unbind() {
|
||||
if (!_graph_handle) {
|
||||
QNN_LOG_WARN("tensor %s not bound to any graph", _tensor_name.c_str());
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!_tensor) {
|
||||
if (!_buffer) {
|
||||
QNN_LOG_DEBUG("tensor %s not bound to ggml tensor", _tensor_name.c_str());
|
||||
return true;
|
||||
}
|
||||
|
|
@ -119,12 +158,15 @@ public:
QNN_LOG_DEBUG("tensor %s, clear client buffer", _tensor_name.c_str());
}

QNN_LOG_DEBUG("unbind tensor: %s from ggml tensor: %s", _tensor_name.c_str(), ggml_get_name(_tensor));
_tensor = nullptr;
QNN_LOG_DEBUG("unbind tensor: %s from buffer: %p, size: %d", _tensor_name.c_str(), _buffer, (int)_buffer_size);
_buffer = nullptr;
_buffer_size = 0;
return true;
}

const Qnn_Tensor_t &get_qnn_tensor() const { return _qnn_tensor; }
Qnn_DataType_t get_data_type() const { return QNN_TENSOR_GET_DATA_TYPE(_qnn_tensor); }
const qnn_dimension_array_t &get_dimensions() const { return _dimensions; }

private:
bool write_to_qnn_tensor() {

@ -136,7 +178,7 @@ private:

if (should_use_mem_handle()) {
if (_qnn_rpc_buffer) {
memcpy(_qnn_rpc_buffer->get_buffer(), _tensor->data, ggml_nbytes(_tensor));
memcpy(_qnn_rpc_buffer->get_buffer(), _buffer, _buffer_size);
} else {
QNN_LOG_WARN("tensor %s: can't find rpcmem from qnn mem handle\n", _tensor_name.c_str());
return false;

@ -157,7 +199,7 @@ private:

if (should_use_mem_handle()) {
if (_qnn_rpc_buffer) {
memcpy(_tensor->data, _qnn_rpc_buffer->get_buffer(), ggml_nbytes(_tensor));
memcpy(_buffer, _qnn_rpc_buffer->get_buffer(), _buffer_size);
} else {
QNN_LOG_WARN("can't find rpcmem from qnn mem handle\n");
return false;

@ -169,29 +211,45 @@ private:
return true;
}

void update_params_from_ggml_tensor(ggml_tensor *tensor, int prev_max_rank) {
_dimensions[0] = (uint32_t)tensor->ne[0];
_dimensions[1] = (uint32_t)tensor->ne[1];
_dimensions[2] = (uint32_t)tensor->ne[2];
_dimensions[3] = (uint32_t)tensor->ne[3];
QNN_TENSOR_SET_DATA_TYPE(_qnn_tensor, device_datatype_from_ggml_datatype(tensor->type));
void update_params_from_ggml_tensor(tensor_type_t tensor_type, Qnn_DataType_t data_type, int rank) {
QNN_TENSOR_SET_DATA_TYPE(_qnn_tensor, data_type);
// TODO: set the quantizeParams base on the tensor type

QNN_TENSOR_SET_RANK(_qnn_tensor, (uint32_t)std::max(prev_max_rank, ggml_n_dims(tensor)));

QNN_TENSOR_SET_RANK(_qnn_tensor, (uint32_t)rank);
QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_RAW);
Qnn_ClientBuffer_t client_buf = {};
QNN_TENSOR_SET_CLIENT_BUF(_qnn_tensor, client_buf);

Qnn_TensorType_t new_tensor_type;
switch (tensor_type) {
case INPUT:
new_tensor_type = QNN_TENSOR_TYPE_APP_WRITE;
break;
case OUTPUT:
new_tensor_type = QNN_TENSOR_TYPE_APP_READ;
break;
case PARAMETER:
new_tensor_type = QNN_TENSOR_TYPE_STATIC;
break;
default:
new_tensor_type = QNN_TENSOR_TYPE_NATIVE;
break;
}
QNN_TENSOR_SET_TYPE(_qnn_tensor, new_tensor_type);
QNN_LOG_INFO("tensor %s changed to type %d", _tensor_name.c_str(), new_tensor_type);
}

bool should_use_mem_handle() const { return _device == QNN_BACKEND_NPU; }
bool should_use_mem_handle() const {
return _device == QNN_BACKEND_NPU && QNN_TENSOR_GET_TYPE(_qnn_tensor) != QNN_TENSOR_TYPE_STATIC;
}

std::string _tensor_name;
const ggml_tensor *_tensor;
uint8_t *_buffer = nullptr;
size_t _buffer_size = 0;
QNNBackend _device;
std::shared_ptr<qnn_instance> _qnn_instance;
Qnn_Tensor_t _qnn_tensor = qnn_tensor_init(kDefaultQnnTensorVersion);
std::array<uint32_t, GGML_MAX_DIMS> _dimensions = {};
qnn_dimension_array_t _dimensions = {};
Qnn_GraphHandle_t _graph_handle = nullptr;
std::unique_ptr<ggml_qnn_rpc_buffer> _qnn_rpc_buffer;

@ -199,4 +257,6 @@ private:
DISABLE_MOVE(ggml_qnn_tensor);
};

using ggml_qnn_tensor_array_t = std::vector<std::shared_ptr<ggml_qnn_tensor>>;

} // namespace qnn

@ -9,14 +9,40 @@

namespace qnn {

qnn_dimension_array_t get_internal_dimension(const ggml_dimension_array_t &dims, uint32_t rank) {
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS should be 4");
GGML_ASSERT(rank <= GGML_MAX_DIMS && rank > 0);

qnn_dimension_array_t internal_dims = {};
/*
 * Both the ggml and qnn tensor in memory are stored as row-major format.
 * But the dimensions of the tensor are stored in different order.
 * For example, a 2x3 matrix:
 * [
 *   [1, 2, 3],
 *   [4, 5, 6],
 * ]
 * The ggml tensor will have dimensions [3, 2], while the qnn tensor will have dimensions [2, 3].
 */
for (uint32_t i = 0; i < rank; i++) {
internal_dims[i] = std::max<uint32_t>(dims[rank - 1 - i], 1);
}

return internal_dims;
}
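
For illustration only (not part of this commit's diff): how the reversal above plays out for the 2x3 example in the comment, in a hypothetical caller.

ggml_dimension_array_t ne = { 3, 2, 1, 1 };                       // ggml order: ne[0] = columns, ne[1] = rows
qnn_dimension_array_t qnn_dims = get_internal_dimension(ne, 2);   // rank 2
// qnn_dims is now { 2, 3, 0, 0 }: the first `rank` entries are reversed, the rest stay zero-initialized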

// TODO: mapping more ggml data type to QNN data type
// ref:explanation of k-quants, https://github.com/ggerganov/llama.cpp/pull/1684
Qnn_DataType_t device_datatype_from_ggml_datatype(ggml_type ggml_type) {
Qnn_DataType_t qnn_datatype_from_ggml_datatype(ggml_type ggml_type) {
switch (ggml_type) {
case GGML_TYPE_F16:
return QNN_DATATYPE_FLOAT_16;
case GGML_TYPE_F32:
return QNN_DATATYPE_FLOAT_32;
case GGML_TYPE_F16:
return QNN_DATATYPE_FLOAT_16;
case GGML_TYPE_I32:
return QNN_DATATYPE_INT_32;
case GGML_TYPE_I16:
return QNN_DATATYPE_INT_16;
case GGML_TYPE_I8:
return QNN_DATATYPE_INT_8;
case GGML_TYPE_Q8_0:

@ -29,16 +55,75 @@ Qnn_DataType_t device_datatype_from_ggml_datatype(ggml_type ggml_type) {
return QNN_DATATYPE_UNDEFINED;
}

Qnn_TensorType_t device_tensortype_from_ggml_tensor(ggml_tensor *ggml_tensor) {
Qnn_TensorType_t qnn_tensor_type = QNN_TENSOR_TYPE_NATIVE;
ggml_type ggml_datatype_from_qnn_datatype(Qnn_DataType_t qnn_type) {
switch (qnn_type) {
case QNN_DATATYPE_FLOAT_32:
return GGML_TYPE_F32;
case QNN_DATATYPE_FLOAT_16:
return GGML_TYPE_F16;
case QNN_DATATYPE_UINT_32:
case QNN_DATATYPE_INT_32:
return GGML_TYPE_I32;
case QNN_DATATYPE_INT_16:
return GGML_TYPE_I16;
case QNN_DATATYPE_INT_8:
return GGML_TYPE_I8;
case QNN_DATATYPE_SFIXED_POINT_8:
return GGML_TYPE_Q8_0;
case QNN_DATATYPE_SFIXED_POINT_4:
return GGML_TYPE_Q4_0;
default:
break;
}
return GGML_TYPE_COUNT;
}

if (ggml_tensor->flags & GGML_TENSOR_FLAG_INPUT) {
qnn_tensor_type = QNN_TENSOR_TYPE_APP_WRITE;
} else if (ggml_tensor->flags & GGML_TENSOR_FLAG_OUTPUT) {
qnn_tensor_type = QNN_TENSOR_TYPE_APP_READ;
size_t qnn_datatype_size(Qnn_DataType_t qnn_type) {
switch (qnn_type) {
case QNN_DATATYPE_FLOAT_32:
return sizeof(float);
case QNN_DATATYPE_FLOAT_16:
return sizeof(uint16_t);
case QNN_DATATYPE_UINT_32:
case QNN_DATATYPE_INT_32:
return sizeof(int32_t);
case QNN_DATATYPE_INT_16:
return sizeof(int16_t);
case QNN_DATATYPE_INT_8:
return sizeof(int8_t);
case QNN_DATATYPE_SFIXED_POINT_8:
return sizeof(int8_t);
case QNN_DATATYPE_SFIXED_POINT_4:
return sizeof(int8_t);
default:
break;
}
return 0;
}

const char *qnn_datatype_to_string(Qnn_DataType_t qnn_type) {
switch (qnn_type) {
case QNN_DATATYPE_FLOAT_32:
return "QNN_DATATYPE_FLOAT_32";
case QNN_DATATYPE_FLOAT_16:
return "QNN_DATATYPE_FLOAT_16";
case QNN_DATATYPE_UINT_32:
return "QNN_DATATYPE_UINT_32";
case QNN_DATATYPE_INT_32:
return "QNN_DATATYPE_INT_32";
case QNN_DATATYPE_INT_16:
return "QNN_DATATYPE_INT_16";
case QNN_DATATYPE_INT_8:
return "QNN_DATATYPE_INT_8";
case QNN_DATATYPE_SFIXED_POINT_8:
return "QNN_DATATYPE_SFIXED_POINT_8";
case QNN_DATATYPE_SFIXED_POINT_4:
return "QNN_DATATYPE_SFIXED_POINT_4";
default:
break;
}

return qnn_tensor_type;
return "QNN_DATATYPE_UNDEFINED";
}
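
For illustration only (not part of this commit's diff): a quick sketch of the new conversion helpers round-tripping an F16 type, inside namespace qnn; the assertions are hypothetical.

Qnn_DataType_t dt = qnn_datatype_from_ggml_datatype(GGML_TYPE_F16);   // QNN_DATATYPE_FLOAT_16
GGML_ASSERT(qnn_datatype_size(dt) == sizeof(uint16_t));               // 2 bytes per element
GGML_ASSERT(ggml_datatype_from_qnn_datatype(dt) == GGML_TYPE_F16);    // maps back to the ggml type
QNN_LOG_DEBUG("dtype: %s", qnn_datatype_to_string(dt));               // prints "QNN_DATATYPE_FLOAT_16"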

uint32_t get_ggml_tensor_rank(const ggml_tensor *tensor) {

@ -51,8 +136,13 @@ uint32_t get_ggml_tensor_rank(const ggml_tensor *tensor) {
return rank;
}

const char *get_backend_name(int n_backend_type) {
switch (n_backend_type) {
const char *get_ggml_type_name(ggml_type type) {
const auto *traits = ggml_get_type_traits(type);
return traits->type_name;
}

const char *get_backend_name(size_t device_index) {
switch (device_index) {
case QNN_BACKEND_CPU:
return "QNN-CPU";
case QNN_BACKEND_GPU:

@ -6,6 +6,7 @@
#include <stddef.h>
#include <stdint.h>

#include <array>
#include <string>

#include "ggml.h"

@ -17,8 +18,14 @@

namespace qnn {

using ggml_dimension_array_t = int64_t[GGML_MAX_DIMS];
using qnn_dimension_array_t = std::array<uint32_t, GGML_MAX_DIMS>;

qnn_dimension_array_t get_internal_dimension(const ggml_dimension_array_t &dims, uint32_t rank);

uint32_t get_ggml_tensor_rank(const ggml_tensor *tensor);
const char *get_backend_name(int n_backend_type);
const char *get_ggml_type_name(ggml_type type);
const char *get_backend_name(size_t device_index);
const char *get_chipset_desc(uint32_t chipset_id);
const char *get_htparch_desc(size_t htp_arch);
intptr_t align_to(size_t alignment, intptr_t offset);

@ -187,8 +194,10 @@ inline void set_qnn_tensor_dyn_dimensions(Qnn_Tensor_t &tensor, uint8_t *isDynam
}
}

Qnn_DataType_t device_datatype_from_ggml_datatype(ggml_type ggml_type);
Qnn_TensorType_t device_tensortype_from_ggml_tensor(ggml_tensor *ggml_tensor);
Qnn_DataType_t qnn_datatype_from_ggml_datatype(ggml_type ggml_type);
ggml_type ggml_datatype_from_qnn_datatype(Qnn_DataType_t qnn_type);
size_t qnn_datatype_size(Qnn_DataType_t qnn_type);
const char *qnn_datatype_to_string(Qnn_DataType_t qnn_type);

#if ENABLE_QNNBACKEND_PERF
class qnn_perf {