feat: fix mulmat (#2)

* ggml_qnn_op_config now manages the construction of ggml_qnn_tensor

* wip

* add interface ggml_qnn_op_config

* add ggml_qnn_list_op_config

* add create_tensor and move tensor bind to execute

* wip

* rename: ggml_qnn_list_op_config -> ggml_qnn_matmul_op_config

* add tensor type to allow native tensors

* remove ggml_tensor param at ggml_qnn_tensor::create_tensor

* postpone the tensor id allocation to add_node

* add ggml_qnn_op_config_base

* trivial change to reduce function parameters

* split bind_tensors into bind_input_tensors and bind_output_tensors

* implement ggml_qnn_single_op_config::create_tensors

next: set the parameter of the transpose op

* tensor: add bind buffer

* add parameter tensor type

* implement add_tensor_param

* set qnn_instance only at constructor

* set transpose tensor param

* move create_op_constructor into op-config module

* create QNN_OP_MAT_MUL from ggml_qnn_matmul_op_config

* try fix crash

* fix compile error with older NDK (r23c)

* fix crash

* fix parameter tensor name

* update tensor dimension assignment and add TODO

* fix mat_mul graph creation

* fix MUL_MAT_256x16x10x1_256x1x10x1_16x1x10x1

* append type to graph cache key

* wip

* fix supported op

* update comment

* disable ops other than add and mat_mul

* add convert op to adapt mismatched input/output data types

* disable f16 for cpu backend according to official doc

https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/cpu_backend.html#supported-operations

* add supported data type flags for each backend

* remove unused functions

* append output type to graph key

* fix gpu backend by disabling ops with mismatched data types

* fix cpu backend supported ops

* fix duplicated tensor name

* append op name

* suppress warning

* remove unused code
nullname 2024-10-28 12:48:16 +08:00 committed by GitHub
parent f260498213
commit 4abaf7d87e
GPG Key ID: B5690EEEBB952194
12 changed files with 918 additions and 256 deletions


@ -51,12 +51,30 @@ struct qnn_device_caps {
const char *description;
const char *lib_name;
enum ggml_backend_dev_type type;
// TODO: should get these caps from the device
std::unordered_set<ggml_type> supported_types;
};
const qnn_device_caps kDeviceCaps[GGML_QNN_MAX_DEVICES]{
{ "qnn-cpu", "Qualcomm Kryo CPU", "libQnnCpu.so", GGML_BACKEND_DEVICE_TYPE_CPU }, /* QNN_BACKEND_CPU */
{ "qnn-gpu", "Qualcomm Adreno GPU", "libQnnGpu.so", GGML_BACKEND_DEVICE_TYPE_GPU }, /* QNN_BACKEND_GPU */
{ "qnn-npu", "Qualcomm NPU", "libQnnHtp.so", GGML_BACKEND_DEVICE_TYPE_GPU }, /* QNN_BACKEND_NPU */
{ "qnn-cpu",
"Qualcomm Kryo CPU",
"libQnnCpu.so",
GGML_BACKEND_DEVICE_TYPE_CPU,
{ GGML_TYPE_F32,
GGML_TYPE_I8 } }, // https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/CpuOpDefSupplement.html#matmul
{ "qnn-gpu",
"Qualcomm Adreno GPU",
"libQnnGpu.so",
GGML_BACKEND_DEVICE_TYPE_GPU,
{ GGML_TYPE_F32,
GGML_TYPE_F16 } }, // https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/GpuOpDefSupplement.html#matmul
{ "qnn-npu",
"Qualcomm NPU",
"libQnnHtp.so",
GGML_BACKEND_DEVICE_TYPE_GPU,
{ GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_I16,
GGML_TYPE_I8 } }, // https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/HtpOpDefSupplement.html#matmul
};
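
A quick illustration of how the per-device type table is meant to be consulted (a sketch, not part of the patch; it assumes `QNN_BACKEND_CPU`/`QNN_BACKEND_NPU` index the table as the trailing comments suggest):

```cpp
// With the table above, GGML_TYPE_F16 is rejected on the CPU backend but
// accepted on the GPU and NPU backends.
bool cpu_f16 = kDeviceCaps[QNN_BACKEND_CPU].supported_types.count(GGML_TYPE_F16) != 0; // false
bool npu_f16 = kDeviceCaps[QNN_BACKEND_NPU].supported_types.count(GGML_TYPE_F16) != 0; // true
```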
class ggml_backend_qnn_buffer_context {
@ -340,9 +358,10 @@ void ggml_backend_qnn_device_get_props(ggml_backend_dev_t dev, struct ggml_backe
props->type = ggml_backend_qnn_device_get_type(dev);
ggml_backend_qnn_device_get_memory(dev, &props->memory_free, &props->memory_total);
props->caps = {
/* async */ false,
/* host_buffer */ false,
/* events */ false,
/* async */ false,
/* host_buffer */ false,
/* buffer_from_host_ptr */ false,
/* events */ false,
};
}
@ -412,6 +431,7 @@ ggml_backend_t ggml_backend_qnn_init_with_device_context(ggml_backend_dev_t dev,
dev_ctx->instance = instance;
dev_ctx->qnn_interface = qnn_interface;
dev_ctx->socinfo = instance->get_soc_info();
dev_ctx->supported_types = kDeviceCaps[device_index].supported_types;
ggml_backend_t qnn_backend = new ggml_backend{
/* .guid = */ ggml_backend_qnn_guid(),
@ -440,8 +460,8 @@ ggml_backend_buffer_t ggml_backend_qnn_device_buffer_from_ptr(ggml_backend_dev_t
}
bool ggml_backend_qnn_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor *op) {
GGML_UNUSED(dev);
return qnn::ggml_qnn_supports_op(op);
auto *device_ctx = get_device_context(dev);
return qnn::ggml_qnn_supports_op(device_ctx, op);
}
bool ggml_backend_qnn_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {


@ -108,8 +108,8 @@ std::string get_graph_key(const std::string &op_name, const std::array<ggml_tens
const std::array<ggml_tensor *, _OutputSize> &outputs) {
constexpr static const auto append_dimensions = [](std::string &key, const ggml_tensor *tensor) {
char buffer[256] = {};
snprintf(buffer, sizeof(buffer), "_%ldx%ldx%ldx%ld", (long)tensor->ne[0], (long)tensor->ne[1],
(long)tensor->ne[2], (long)tensor->ne[3]);
snprintf(buffer, sizeof(buffer), "_%ldx%ldx%ldx%ld%s", (long)tensor->ne[0], (long)tensor->ne[1],
(long)tensor->ne[2], (long)tensor->ne[3], qnn::get_ggml_type_name(tensor->type));
key += buffer;
};
@ -117,32 +117,11 @@ std::string get_graph_key(const std::string &op_name, const std::array<ggml_tens
for (auto &input : inputs) {
append_dimensions(graph_key, input);
}
for (auto &output : outputs) {
append_dimensions(graph_key, output);
}
graph_key += qnn::get_ggml_type_name(outputs.front()->type);
return graph_key;
}
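
A sketch of what the new cache key looks like (illustrative only; it assumes the op name is used as the key prefix, that `QNN_OP_MAT_MUL` expands to `"MatMul"`, and that `get_ggml_type_name()` returns short names like `"f32"`):

```cpp
// Key for a mul_mat graph with two f32 inputs and an f32 output:
std::string example_key = "MatMul"; // op name
example_key += "_256x16x10x1f32";   // src0 dimensions + element type
example_key += "_256x1x10x1f32";    // src1 dimensions + element type
example_key += "f32";               // output element type (appended above)
// -> "MatMul_256x16x10x1f32_256x1x10x1f32f32"
```

Graphs that differ only in element type no longer collide in the cache.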
qnn::ggml_op_constructor_t generate_common_op_constructor(const std::string &op_name) {
if (op_name == QNN_OP_MAT_MUL) {
// For QNN_OP_MAT_MUL, we need to transpose the input tensor
return [](const std::string &name) {
auto config = std::make_unique<qnn::ggml_qnn_op_config>(name, QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_MAT_MUL);
Qnn_Scalar_t scalar = QNN_SCALAR_INIT;
scalar.dataType = QNN_DATATYPE_BOOL_8;
scalar.bool8Value = true;
config->add_scalar_param(QNN_OP_MAT_MUL_PARAM_TRANSPOSE_IN0, scalar);
QNN_LOG_DEBUG("add scalar param %s\n", QNN_OP_MAT_MUL_PARAM_TRANSPOSE_IN0);
return config;
};
}
return [op_name](const std::string &name) {
return std::make_unique<qnn::ggml_qnn_op_config>(name, QNN_OP_PACKAGE_NAME_QTI_AISW, op_name);
};
}
constexpr const char *kGgmlOpToQnnOp[] = {
nullptr, // GGML_OP_NONE
nullptr, // GGML_OP_DUP
@ -278,7 +257,7 @@ qnn::ggml_qnn_graph *get_qnn_graph_from_cache(ggml_backend_qnn_device_context *c
return nullptr;
}
auto op_constructor = generate_common_op_constructor(kGgmlOpToQnnOp[op]);
auto op_constructor = qnn::create_op_constructor(kGgmlOpToQnnOp[op]);
if (!graph->build_graph(op_constructor, to_ggml_tensor_array<_InputSize>(inputs),
to_ggml_tensor_array<_OutputSize>(outputs))) {
QNN_LOG_ERROR("build_graph failed\n");
@ -542,11 +521,57 @@ static constexpr const ggml_qnn_binary_op_t kQnnBinaryOpsTable[] = {
static_assert(sizeof(kQnnBinaryOpsTable) / sizeof(kQnnBinaryOpsTable[0]) == GGML_OP_COUNT,
"GGML_OP_COUNT does not match the size of the kQnnBinaryOpsTable table");
bool ggml_qnn_supports_tensor(ggml_backend_qnn_device_context *ctx, const ggml_tensor *tensor) {
switch (tensor->type) {
case GGML_TYPE_F32:
case GGML_TYPE_F16:
case GGML_TYPE_Q8_0:
case GGML_TYPE_Q4_0:
if (ctx->supported_types.find(tensor->type) == ctx->supported_types.end()) {
QNN_LOG_DEBUG("unsupported data type GGML_TYPE_F16 for cpu backend");
return false;
}
break;
default:
QNN_LOG_DEBUG("unsupported data type %d", tensor->type);
return false;
}
return true;
}
bool ggml_qnn_supports_matmul_op(ggml_backend_qnn_device_context *ctx, const ggml_tensor *op) {
GGML_UNUSED(ctx);
auto *src0 = op->src[0];
auto *src1 = op->src[1];
if (src0->type != src1->type || src0->type != op->type) {
// the current QNN implementation requires src0, src1 and dst to share the same type
QNN_LOG_DEBUG("src0 type %d, src1 type %d and op type %d are not all the same", src0->type, src1->type, op->type);
return false;
}
if (src0->ne[2] != src1->ne[2] || src0->ne[3] != src1->ne[3]) {
/*
* TODO: remove the blocker here when qnn backend supports mul_mat like this:
* [ne03, ne02, n, k] * [ne03 * x, ne02 * y, m, k] -> [ne03 * x, ne02 * y, m, n]
*/
QNN_LOG_DEBUG("src0 and src1 dimensions are not equal");
return false;
}
return true;
}
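
A concrete example of a case the blocker above rejects (the shapes are illustrative, not taken from the patch):

```cpp
// ggml broadcast mul_mat: src1 carries more batches than src0.
//   src0->ne = {256, 16, 2, 1}   // k = 256, n = 16, ne02 = 2
//   src1->ne = {256, 32, 4, 1}   // k = 256, m = 32, ne12 = 4 (2x broadcast)
//   dst->ne  = {16, 32, 4, 1}
// ggml supports this by repeating src0 across src1's extra batches, but the
// QNN graph built here feeds each input to a single MatMul node, so these
// shapes are reported as unsupported and fall back to another backend for now.
```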
} // namespace
namespace qnn {
bool ggml_qnn_supports_op(const ggml_tensor *op) {
bool ggml_qnn_supports_op(ggml_backend_qnn_device_context *ctx, const ggml_tensor *op) {
if (op->op == GGML_OP_NONE) {
return true;
}
if (op->op == GGML_OP_UNARY) {
if (!kQnnUnaryOpsTable[kGgmlUnaryOpStart + ggml_get_unary_op(op)]) {
QNN_LOG_DEBUG("unsupported unary op %d", ggml_get_unary_op(op));
@ -557,35 +582,38 @@ bool ggml_qnn_supports_op(const ggml_tensor *op) {
QNN_LOG_DEBUG("src0 is nullptr");
return false;
}
} else if (op->op != GGML_OP_NONE) {
} else {
if (!kQnnUnaryOpsTable[op->op] && !kQnnBinaryOpsTable[op->op]) {
QNN_LOG_DEBUG("unsupported op %d", op->op);
return false;
}
if (!op->src[0] || !op->src[1]) {
auto *src0 = op->src[0];
auto *src1 = op->src[1];
if (!src0 || !src1) {
QNN_LOG_DEBUG("src0 or src1 is nullptr");
return false;
}
#ifndef NDEBUG
if (op->op == GGML_OP_ADD && !is_tensor_dimensions_equal(op->src[0], op->src[1])) {
QNN_LOG_DEBUG("src0 and src1 dimensions are not equal");
if (!ggml_qnn_supports_tensor(ctx, src0) || !ggml_qnn_supports_tensor(ctx, src1) ||
!ggml_qnn_supports_tensor(ctx, op)) {
return false;
}
#endif
}
switch (op->type) {
case GGML_TYPE_F32:
case GGML_TYPE_F16:
case GGML_TYPE_I8:
case GGML_TYPE_Q8_0:
case GGML_TYPE_Q4_0:
break;
default:
QNN_LOG_DEBUG("unsupported src0 type %d", op->src[0]->type);
return false;
switch (op->op) {
case GGML_OP_ADD:
if (!is_tensor_dimensions_equal(src0, src1)) {
QNN_LOG_DEBUG("src0 and src1 dimensions are not equal");
return false;
}
break;
case GGML_OP_MUL_MAT:
return ggml_qnn_supports_matmul_op(ctx, op);
default:
return false;
}
}
return true;


@ -6,7 +6,7 @@
namespace qnn {
bool ggml_qnn_supports_op(const ggml_tensor *op);
bool ggml_qnn_supports_op(ggml_backend_qnn_device_context *ctx, const ggml_tensor *op);
bool ggml_qnn_forward(ggml_backend_qnn_device_context *ctx, struct ggml_tensor *tensor);
} // namespace qnn


@ -4,6 +4,7 @@
#include <memory>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include "ggml.h"
@ -26,6 +27,7 @@ struct ggml_backend_qnn_device_context {
// initialize in init
qnn::qcom_socinfo socinfo = {};
std::unordered_set<ggml_type> supported_types;
std::shared_ptr<qnn::qnn_instance> instance;
std::shared_ptr<qnn::qnn_interface> qnn_interface;


@ -8,8 +8,8 @@
namespace qnn {
class ggml_qnn_rpc_buffer {
public:
ggml_qnn_rpc_buffer(std::shared_ptr<qnn_instance> qnn_instance, size_t size, uint32_t rank, uint32_t *dimensions,
Qnn_DataType_t data_type) :
ggml_qnn_rpc_buffer(std::shared_ptr<qnn_instance> qnn_instance, const size_t size, const uint32_t rank,
uint32_t *dimensions, Qnn_DataType_t data_type) :
_qnn_instance(qnn_instance), _size(size) {
_qnn_rpc_buffer = static_cast<uint8_t *>(qnn_instance->alloc_rpcmem(size, alignof(void *)));


@ -2,7 +2,6 @@
#pragma once
#include <cstdio>
#include <functional>
#include <memory>
#include <string>
#include <vector>
@ -12,19 +11,15 @@
#include "logger.hpp"
#include "op-config.hpp"
#include "qnn-lib.hpp"
#include "tensor.hpp"
namespace qnn {
using ggml_tensor_array_t = std::vector<ggml_tensor *>;
using ggml_op_constructor_t = std::function<std::unique_ptr<qnn::ggml_qnn_op_config>(const std::string &)>;
class ggml_qnn_graph {
public:
explicit ggml_qnn_graph(const std::string &graph_name, QNNBackend device,
std::shared_ptr<qnn_instance> qnn_instance, size_t vtcm_size_in_mb) :
_graph_name(graph_name), _device(device), _qnn_instance(qnn_instance) {
QNN_LOG_INFO("graph name %s", graph_name.c_str());
QNN_LOG_INFO("[%s]create", graph_name.c_str());
auto qnn_interface = qnn_instance->get_qnn_interface();
auto qnn_context = qnn_instance->get_qnn_context_handle();
@ -69,19 +64,16 @@ public:
}
if (error != QNN_SUCCESS) {
QNN_LOG_INFO(
"can't create qnn graph handle with graph name %s, "
"error = %d\n",
graph_name.c_str(), error);
QNN_LOG_INFO("[%s]can't create qnn graph handle, error = %d\n", graph_name.c_str(), error);
return;
}
QNN_LOG_INFO("create qnn graph handle with graph name %s ok\n", graph_name.c_str());
QNN_LOG_INFO("[%s]create succeed\n", graph_name.c_str());
_graph_handle = graph_handle;
_qnn_interface = qnn_interface;
}
~ggml_qnn_graph() { QNN_LOG_DEBUG("graph name %s, destroy", _graph_name.c_str()); }
~ggml_qnn_graph() { QNN_LOG_DEBUG("[%s]destroy", _graph_name.c_str()); }
bool build_graph(ggml_op_constructor_t op_constructor, const ggml_tensor_array_t &tensor_inputs,
const ggml_tensor_array_t &tensor_outputs) {
@ -91,95 +83,44 @@ public:
return false;
}
// get the max tensor rank
for (auto tensor : tensor_inputs) {
_tensor_rank = std::max(_tensor_rank, ggml_n_dims(tensor));
}
for (auto tensor : tensor_outputs) {
_tensor_rank = std::max(_tensor_rank, ggml_n_dims(tensor));
QNN_LOG_DEBUG("[%s]build_graph start", _graph_name.c_str());
_op_config = op_constructor(_graph_name, _qnn_instance);
if (!_op_config->create_tensors(_device, _graph_handle, tensor_inputs, tensor_outputs)) {
QNN_LOG_ERROR("[%s]create_tensors failed\n", _graph_name.c_str());
return false;
}
QNN_LOG_DEBUG("graph name %s, build_graph start", _graph_name.c_str());
_tensor_inputs.resize(tensor_inputs.size());
for (size_t i = 0; i < tensor_inputs.size(); i++) {
char buffer[GGML_MAX_NAME] = {};
snprintf(buffer, GGML_MAX_NAME, "src%d", (int)i);
auto qnn_tensor =
std::make_shared<ggml_qnn_tensor>(std::string(buffer), _device, _graph_handle, _qnn_instance);
auto *ggml_tensor = tensor_inputs[i];
if (!qnn_tensor->bind_ggml_tensor(ggml_tensor, true, _tensor_rank)) {
QNN_LOG_ERROR("bind tensor %s failed\n", ggml_get_name(ggml_tensor));
return false;
}
_tensor_inputs[i] = qnn_tensor;
if (!_op_config->add_op_to_graph(_graph_handle)) {
QNN_LOG_ERROR("[%s]add nodes failed\n", _graph_name.c_str());
return false;
}
_tensor_outputs.resize(tensor_outputs.size());
for (size_t i = 0; i < tensor_outputs.size(); i++) {
char buffer[GGML_MAX_NAME] = {};
snprintf(buffer, GGML_MAX_NAME, "dst%d", (int)i);
auto qnn_tensor =
std::make_shared<ggml_qnn_tensor>(std::string(buffer), _device, _graph_handle, _qnn_instance);
auto *ggml_tensor = tensor_outputs[i];
if (!qnn_tensor->bind_ggml_tensor(ggml_tensor, false, _tensor_rank)) {
QNN_LOG_ERROR("bind tensor %s failed\n", ggml_get_name(ggml_tensor));
return false;
}
_tensor_outputs[i] = qnn_tensor;
}
_op_config = op_constructor(_graph_name);
_op_config->set_input_tensors(_tensor_inputs);
_op_config->set_output_tensors(_tensor_outputs);
auto error = _qnn_interface->qnn_graph_add_node(_graph_handle, _op_config->get_op_config());
auto error = _qnn_interface->qnn_graph_finalize(_graph_handle, nullptr, nullptr);
if (error != QNN_SUCCESS) {
auto *error_str = get_qnn_error_string(error);
if (error_str) {
QNN_LOG_ERROR("qnn_graph_add_node.error: %s\n", error_str);
QNN_LOG_ERROR("[%s]qnn_graph_finalize.error: %s\n", _graph_name.c_str(), error_str);
} else {
QNN_LOG_ERROR("qnn_graph_add_node.error: %d\n", error);
QNN_LOG_ERROR("[%s]qnn_graph_finalize.error: %d\n", _graph_name.c_str(), error);
}
return false;
}
error = _qnn_interface->qnn_graph_finalize(_graph_handle, nullptr, nullptr);
if (error != QNN_SUCCESS) {
auto *error_str = get_qnn_error_string(error);
if (error_str) {
QNN_LOG_ERROR("qnn_graph_finalize.error: %s\n", error_str);
} else {
QNN_LOG_ERROR("qnn_graph_finalize.error: %d\n", error);
}
return false;
}
QNN_LOG_DEBUG("graph name %s, build_graph succeed", _graph_name.c_str());
QNN_LOG_DEBUG("[%s]build_graph succeed", _graph_name.c_str());
return true;
}
bool execute(const ggml_tensor_array_t &tensor_inputs, const ggml_tensor_array_t &tensor_outputs) {
GGML_ASSERT(tensor_inputs.size() == _tensor_inputs.size());
GGML_ASSERT(tensor_outputs.size() == _tensor_outputs.size());
for (size_t i = 0; i < tensor_inputs.size(); i++) {
auto *ggml_tensor = tensor_inputs[i];
if (!_tensor_inputs[i]->bind_ggml_tensor(ggml_tensor, true, _tensor_rank)) {
QNN_LOG_ERROR("bind tensor %s failed\n", ggml_get_name(ggml_tensor));
return false;
}
if (!_op_config->bind_input_tensors(tensor_inputs)) {
QNN_LOG_ERROR("[%s]bind input tensors failed\n", _graph_name.c_str());
return false;
}
for (size_t i = 0; i < tensor_outputs.size(); i++) {
auto *ggml_tensor = tensor_outputs[i];
if (!_tensor_outputs[i]->bind_ggml_tensor(ggml_tensor, false, _tensor_rank)) {
QNN_LOG_ERROR("bind tensor %s failed\n", ggml_get_name(ggml_tensor));
return false;
}
if (!_op_config->bind_output_tensors(tensor_outputs)) {
QNN_LOG_ERROR("[%s]bind output tensors failed\n", _graph_name.c_str());
return false;
}
_op_config->set_input_tensors(_tensor_inputs);
_op_config->set_output_tensors(_tensor_outputs);
auto &qnn_tensor_inputs = _op_config->get_qnn_input_tensors();
auto &qnn_tensor_outputs = _op_config->get_qnn_output_tensors();
@ -188,20 +129,15 @@ public:
qnn_tensor_outputs.data(), qnn_tensor_outputs.size(), nullptr, nullptr);
if (_device == QNN_BACKEND_NPU) {
if (error == QNN_COMMON_ERROR_SYSTEM_COMMUNICATION) {
QNN_LOG_WARN("NPU crashed. SSR detected. Caused QNN graph execute error\n");
QNN_LOG_WARN("[%s]NPU crashed. SSR detected. Caused QNN graph execute error\n", _graph_name.c_str());
}
}
for (auto tensor : _tensor_inputs) {
tensor->unbind_ggml_tensor();
}
for (auto tensor : _tensor_outputs) {
tensor->unbind_ggml_tensor();
}
_op_config->unbind_input_tensors();
_op_config->unbind_output_tensors();
if (error != QNN_SUCCESS) {
QNN_LOG_INFO("error = %d\n", error);
QNN_LOG_INFO("[%s]error = %d\n", _graph_name.c_str(), error);
return false;
}
@ -220,11 +156,8 @@ private:
Qnn_GraphHandle_t _graph_handle = nullptr;
std::shared_ptr<qnn_instance> _qnn_instance;
std::shared_ptr<qnn_interface> _qnn_interface;
std::vector<std::shared_ptr<ggml_qnn_tensor>> _tensor_inputs;
std::vector<std::shared_ptr<ggml_qnn_tensor>> _tensor_outputs;
std::unique_ptr<ggml_qnn_op_config> _op_config;
std::vector<Qnn_Param_t> _param_types;
int _tensor_rank = 0;
DISABLE_COPY(ggml_qnn_graph);
DISABLE_MOVE(ggml_qnn_graph);


@ -0,0 +1,471 @@
#include "op-config.hpp"
#include <cstdint>
#include "logger.hpp"
namespace {
constexpr const qnn::qnn_dimension_array_t kTransposeParamData[GGML_MAX_DIMS] = {
{ 0 },
{ 1, 0 },
{ 0, 2, 1 },
{ 0, 1, 3, 2 },
};
qnn::qnn_dimension_array_t get_transposed_dimensions(const qnn::qnn_dimension_array_t &dimensions, int rank) {
qnn::qnn_dimension_array_t transposed_dims = dimensions;
if (rank >= 2) {
transposed_dims[rank - 1] = dimensions[rank - 2];
transposed_dims[rank - 2] = dimensions[rank - 1];
}
return transposed_dims;
}
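
A small sketch of the two helpers above with illustrative values:

```cpp
// For rank 3, QNN dimensions {10, 16, 256} become {10, 256, 16}, and
// kTransposeParamData[3 - 1] = {0, 2, 1} is the matching permutation that is
// later handed to QNN_OP_TRANSPOSE as its "perm" tensor parameter.
qnn::qnn_dimension_array_t dims = {10, 16, 256, 0};
auto swapped = get_transposed_dimensions(dims, 3); // {10, 256, 16, 0}
```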
int get_rank(const qnn::ggml_tensor_array_t &tensor_inputs, const qnn::ggml_tensor_array_t &tensor_outputs) {
int tensor_rank = 0;
// get the max tensor rank
for (auto tensor : tensor_inputs) {
tensor_rank = std::max(tensor_rank, ggml_n_dims(tensor));
}
for (auto tensor : tensor_outputs) {
tensor_rank = std::max(tensor_rank, ggml_n_dims(tensor));
}
return tensor_rank;
}
Qnn_DataType_t get_tensor_type(const qnn::ggml_qnn_tensor_array_t &tensors) {
Qnn_DataType_t type = QNN_DATATYPE_UNDEFINED;
for (auto tensor : tensors) {
auto tensor_type_size = qnn::qnn_datatype_size(tensor->get_data_type());
GGML_ASSERT(tensor_type_size > 0);
if (tensor_type_size > qnn::qnn_datatype_size(type)) {
type = tensor->get_data_type();
}
}
return type;
}
struct tensor_common_params {
const char *name_prefix;
int tensor_rank;
bool is_input;
QNNBackend device;
Qnn_GraphHandle_t graph_handle;
std::shared_ptr<qnn::qnn_instance> qnn_instance;
};
void create_tensors_from_ggml_tensor(const tensor_common_params &params, const qnn::ggml_tensor_array_t &ggml_tensors,
qnn::ggml_qnn_tensor_array_t *tensor_wrappers,
std::vector<Qnn_Tensor_t> *qnn_tensors) {
using namespace qnn;
tensor_wrappers->resize(ggml_tensors.size());
if (qnn_tensors) {
qnn_tensors->resize(ggml_tensors.size());
}
char buffer[GGML_MAX_NAME] = {};
auto tensor_type = params.is_input ? ggml_qnn_tensor::INPUT : ggml_qnn_tensor::OUTPUT;
for (size_t i = 0; i < ggml_tensors.size(); i++) {
snprintf(buffer, GGML_MAX_NAME, "%s%d", params.name_prefix, (int)i);
auto *ggml_tensor = ggml_tensors[i];
(*tensor_wrappers)[i] = std::make_shared<ggml_qnn_tensor>(tensor_type, std::string(buffer), ggml_tensor->ne,
ggml_tensor->type, params.tensor_rank, params.device,
params.graph_handle, params.qnn_instance);
}
}
bool bind_tensors(const qnn::ggml_tensor_array_t &ggml_tensors, qnn::ggml_qnn_tensor_array_t &tensor_wrappers,
std::vector<Qnn_Tensor_t> &qnn_tensors) {
for (size_t i = 0; i < ggml_tensors.size(); i++) {
auto *ggml_tensor = ggml_tensors[i];
if (!tensor_wrappers[i]->bind_ggml_tensor(ggml_tensor)) {
QNN_LOG_ERROR("bind tensor %s failed\n", ggml_get_name(ggml_tensor));
return false;
}
qnn_tensors[i] = tensor_wrappers[i]->get_qnn_tensor();
}
return true;
}
class ggml_qnn_connectable_op_config : public qnn::ggml_qnn_op_config_base {
public:
explicit ggml_qnn_connectable_op_config(const std::string &name, const std::string &package_name,
const std::string &op_type,
std::shared_ptr<qnn::qnn_instance> qnn_instance) :
ggml_qnn_op_config_base(name, package_name, op_type, qnn_instance) {}
bool create_tensors(QNNBackend device, Qnn_GraphHandle_t graph_handle,
const qnn::ggml_tensor_array_t &tensor_inputs,
const qnn::ggml_tensor_array_t &tensor_outputs) override {
GGML_UNUSED(device);
GGML_UNUSED(graph_handle);
GGML_UNUSED(tensor_inputs);
GGML_UNUSED(tensor_outputs);
return true;
}
void set_input_tensors(qnn::ggml_qnn_tensor_array_t &tensor_inputs) {
_tensor_inputs = tensor_inputs;
_qnn_tensor_inputs.resize(_tensor_inputs.size());
}
void set_input_tensors(qnn::ggml_qnn_tensor_array_t &&tensor_inputs) {
_tensor_inputs = std::move(tensor_inputs);
_qnn_tensor_inputs.resize(_tensor_inputs.size());
}
void set_output_tensors(qnn::ggml_qnn_tensor_array_t &tensor_outputs) {
_tensor_outputs = tensor_outputs;
_qnn_tensor_outputs.resize(_tensor_outputs.size());
}
void set_output_tensors(qnn::ggml_qnn_tensor_array_t &&tensor_outputs) {
_tensor_outputs = std::move(tensor_outputs);
_qnn_tensor_outputs.resize(_tensor_outputs.size());
}
qnn::ggml_qnn_tensor_array_t &get_input_tensors() { return _tensor_inputs; }
qnn::ggml_qnn_tensor_array_t &get_output_tensors() { return _tensor_outputs; }
private:
DISABLE_COPY(ggml_qnn_connectable_op_config);
DISABLE_MOVE(ggml_qnn_connectable_op_config);
};
} // namespace
namespace qnn {
void ggml_qnn_op_config_base::add_scalar_param(const std::string &name, const Qnn_Scalar_t scalar) {
_param_names.push_back(name);
Qnn_Param_t param = QNN_PARAM_INIT;
param.paramType = QNN_PARAMTYPE_SCALAR;
param.name = _param_names.back().c_str();
param.scalarParam = scalar;
_qnn_parameters.push_back(param);
}
bool ggml_qnn_op_config_base::add_tensor_param(const std::string &name, const qnn_dimension_array_t &dimensions,
int rank, const uint8_t *data, const Qnn_DataType_t data_type,
QNNBackend device, Qnn_GraphHandle_t graph_handle) {
std::string tensor_name = _name + name + std::to_string(_tensor_parameters.size());
auto param_tensor = std::make_shared<ggml_qnn_tensor>(ggml_qnn_tensor::PARAMETER, tensor_name, dimensions,
data_type, rank, device, graph_handle, _qnn_instance);
size_t data_size = ggml_type_size(ggml_datatype_from_qnn_datatype(data_type));
for (int i = 0; i < rank; i++) {
data_size *= dimensions[i];
}
GGML_ASSERT(data_size > 0);
if (!param_tensor->bind_buffer(const_cast<uint8_t *>(data), data_size)) {
QNN_LOG_ERROR("parameter tensor bind_buffer failed\n");
return false;
}
if (!param_tensor->alloc_qnn_tensor_id()) {
QNN_LOG_ERROR("parameter tensor alloc_qnn_tensor_id failed\n");
return false;
}
_tensor_parameters.push_back(param_tensor);
_param_names.push_back(name);
Qnn_Param_t param = QNN_PARAM_INIT;
param.paramType = QNN_PARAMTYPE_TENSOR;
param.name = _param_names.back().c_str();
param.tensorParam = param_tensor->get_qnn_tensor();
_qnn_parameters.push_back(param);
return true;
}
bool ggml_qnn_op_config_base::add_op_to_graph(Qnn_GraphHandle_t graph_handle) {
GGML_ASSERT(_qnn_tensor_inputs.size() == _tensor_inputs.size());
GGML_ASSERT(_qnn_tensor_outputs.size() == _tensor_outputs.size());
auto qnn_interface = _qnn_instance->get_qnn_interface();
for (size_t i = 0; i < _tensor_inputs.size(); i++) {
auto tensor = _tensor_inputs[i];
if (!tensor->alloc_qnn_tensor_id()) {
QNN_LOG_ERROR("[%s]input tensor alloc_qnn_tensor_id failed\n", _name.c_str());
return false;
}
_qnn_tensor_inputs[i] = tensor->get_qnn_tensor();
}
for (size_t i = 0; i < _tensor_outputs.size(); i++) {
auto tensor = _tensor_outputs[i];
if (!tensor->alloc_qnn_tensor_id()) {
QNN_LOG_ERROR("[%s]output tensor alloc_qnn_tensor_id failed\n", _name.c_str());
return false;
}
_qnn_tensor_outputs[i] = _tensor_outputs[i]->get_qnn_tensor();
}
auto error = qnn_interface->qnn_graph_add_node(graph_handle, get_op_config());
if (error != QNN_SUCCESS) {
auto *error_str = get_qnn_error_string(error);
if (error_str) {
QNN_LOG_ERROR("[%s]qnn_graph_add_node.error: %s\n", _name.c_str(), error_str);
} else {
QNN_LOG_ERROR("[%s]qnn_graph_add_node.error: %d\n", _name.c_str(), error);
}
return false;
}
QNN_LOG_DEBUG("[%s]added to graph\n", _name.c_str());
return true;
}
bool ggml_qnn_op_config_base::bind_input_tensors(const ggml_tensor_array_t &tensor_inputs) {
GGML_ASSERT(tensor_inputs.size() == _tensor_inputs.size());
return bind_tensors(tensor_inputs, _tensor_inputs, _qnn_tensor_inputs);
}
bool ggml_qnn_op_config_base::bind_output_tensors(const ggml_tensor_array_t &tensor_outputs) {
GGML_ASSERT(tensor_outputs.size() == _tensor_outputs.size());
return bind_tensors(tensor_outputs, _tensor_outputs, _qnn_tensor_outputs);
}
void ggml_qnn_op_config_base::unbind_input_tensors() {
for (auto &tensor : _tensor_inputs) {
tensor->unbind();
}
}
void ggml_qnn_op_config_base::unbind_output_tensors() {
for (auto &tensor : _tensor_outputs) {
tensor->unbind();
}
}
Qnn_OpConfig_t ggml_qnn_op_config_base::get_op_config() {
Qnn_OpConfig_t config = QNN_OPCONFIG_INIT;
config.version = QNN_OPCONFIG_VERSION_1;
auto &op_config = config.v1;
op_config.name = _name.c_str();
op_config.packageName = _package_name.c_str();
op_config.typeName = _op_type.c_str();
op_config.numOfParams = (uint32_t)_qnn_parameters.size();
op_config.params = _qnn_parameters.data();
op_config.numOfInputs = (uint32_t)_qnn_tensor_inputs.size();
op_config.inputTensors = _qnn_tensor_inputs.data();
op_config.numOfOutputs = (uint32_t)_qnn_tensor_outputs.size();
op_config.outputTensors = _qnn_tensor_outputs.data();
return config;
}
bool ggml_qnn_single_op_config::create_tensors(QNNBackend device, Qnn_GraphHandle_t graph_handle,
const ggml_tensor_array_t &tensor_inputs,
const ggml_tensor_array_t &tensor_outputs) {
const auto tensor_rank = get_rank(tensor_inputs, tensor_outputs);
tensor_common_params params = { "src", tensor_rank, true, device, graph_handle, _qnn_instance };
create_tensors_from_ggml_tensor(params, tensor_inputs, &_tensor_inputs, &_qnn_tensor_inputs);
params.name_prefix = "dst";
params.is_input = false;
create_tensors_from_ggml_tensor(params, tensor_outputs, &_tensor_outputs, &_qnn_tensor_outputs);
return true;
}
bool ggml_qnn_matmul_op_config::create_tensors(QNNBackend device, Qnn_GraphHandle_t graph_handle,
const ggml_tensor_array_t &tensor_inputs,
const ggml_tensor_array_t &tensor_outputs) {
GGML_ASSERT(tensor_inputs.size() == 2);
GGML_ASSERT(tensor_outputs.size() == 1);
const auto tensor_rank = get_rank(tensor_inputs, tensor_outputs);
GGML_ASSERT(tensor_rank >= 2);
// create input tensors
tensor_common_params params = { "src", tensor_rank, true, device, graph_handle, _qnn_instance };
create_tensors_from_ggml_tensor(params, tensor_inputs, &_tensor_inputs, &_qnn_tensor_inputs);
// create output tensor
ggml_qnn_tensor_array_t mat_mul_tensor_outputs;
params.name_prefix = "dst";
params.is_input = false;
create_tensors_from_ggml_tensor(params, tensor_outputs, &mat_mul_tensor_outputs, nullptr);
// create mat_mul nodes
return create_mat_mul_nodes(device, graph_handle, tensor_rank, _tensor_inputs, mat_mul_tensor_outputs);
}
bool ggml_qnn_matmul_op_config::create_mat_mul_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, const int rank,
ggml_qnn_tensor_array_t &tensor_inputs,
ggml_qnn_tensor_array_t &tensor_outputs) {
/*
* First, both ggml and QNN tensors are stored in memory in row-major format. (For more details, see:
* https://pytorch.org/blog/tensor-memory-format-matters/#:~:text=Column%20Major%20Order:%20In%20this%20format,%20the%20matrix)
* However, their dimensions are recorded in reversed order.
* For example, a 2x3 matrix:
* [
* [1, 2, 3],
* [4, 5, 6],
* ]
* The ggml tensor will have dimensions [3, 2], while the qnn tensor will have dimensions [2, 3].
*
* Second, from the ggml introduction here: https://github.com/huggingface/blog/blob/main/introduction-to-ggml.md
* Given 2 matrices A and B, the matrix multiplication C = A * B is defined as:
* ```python
* import torch
* # Create two matrices
* A = torch.tensor([
* [2, 8],
* [5, 1],
* [4, 2],
* [8, 6],
* ])
* B = torch.tensor([
* [10, 5],
* [9, 9],
* [5, 4],
* ])
* # Perform matrix multiplication
* result = torch.matmul(A, B.T)
* print(result.T)
* ```
* Here, the B.T is the transpose of B.
*
* So here we need to create graph like:
* ```mermaid
* graph TD;
* i1>ggml_tensor_in0] --src0--> mat_mul0;
* i2>ggml_tensor_in1] --src1--> transpose0;
* transpose0 --src0_trans--> mat_mul0;
* mat_mul0 --dst_trans--> transpose1;
* transpose1 --dst0--> o1>ggml_tensor_out];
* ```
*/
// create src0_trans tensor
auto src1 = tensor_inputs.back();
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS does not match the expected value");
qnn_dimension_array_t dimensions = get_transposed_dimensions(src1->get_dimensions(), rank);
auto src0_trans =
std::make_shared<ggml_qnn_tensor>(ggml_qnn_tensor::INTERMEDIATE, "src0_trans", dimensions,
src1->get_data_type(), rank, device, graph_handle, _qnn_instance);
// create dst_trans tensor
auto dst = tensor_outputs.front();
dimensions = get_transposed_dimensions(dst->get_dimensions(), rank);
auto dst_trans = std::make_shared<ggml_qnn_tensor>(ggml_qnn_tensor::INTERMEDIATE, "dst_trans", dimensions,
dst->get_data_type(), rank, device, graph_handle, _qnn_instance);
// create transpose0
auto transpose0 = std::make_shared<ggml_qnn_connectable_op_config>(_name + "_trans0", QNN_OP_PACKAGE_NAME_QTI_AISW,
QNN_OP_TRANSPOSE, _qnn_instance);
// create transpose1
auto transpose1 = std::make_shared<ggml_qnn_connectable_op_config>(_name + "_trans1", QNN_OP_PACKAGE_NAME_QTI_AISW,
QNN_OP_TRANSPOSE, _qnn_instance);
// create mat_mul
auto mat_mul = std::make_shared<ggml_qnn_connectable_op_config>(_name, QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_MAT_MUL,
_qnn_instance);
// set transpose0 parameters
auto *params_data = reinterpret_cast<const uint8_t *>(kTransposeParamData[rank - 1].data());
const qnn_dimension_array_t param_dims = { (uint32_t)rank, 1, 1, 1 };
transpose0->add_tensor_param(QNN_OP_TRANSPOSE_PARAM_PERM, param_dims, 1, params_data, QNN_DATATYPE_UINT_32, device,
graph_handle);
// set transpose1 parameters
transpose1->add_tensor_param(QNN_OP_TRANSPOSE_PARAM_PERM, param_dims, 1, params_data, QNN_DATATYPE_UINT_32, device,
graph_handle);
// set tensor to transpose0
ggml_qnn_tensor_array_t tensors = { tensor_inputs.back() };
transpose0->set_input_tensors(tensors);
tensors = { src0_trans };
transpose0->set_output_tensors(tensors);
// set tensor to mat_mul
tensors = { tensor_inputs.front(), src0_trans };
mat_mul->set_input_tensors(tensors);
tensors = { dst_trans };
mat_mul->set_output_tensors(tensors);
// set tensor to transpose1
tensors = { dst_trans };
transpose1->set_input_tensors(tensors);
transpose1->set_output_tensors(tensor_outputs);
_mat_mul = mat_mul;
_transpose0 = transpose0;
_transpose1 = transpose1;
return true;
}
bool ggml_qnn_matmul_op_config::add_op_to_graph(Qnn_GraphHandle_t graph_handle) {
for (auto &convert : _input_converts) {
if (convert && !convert->add_op_to_graph(graph_handle)) {
return false;
}
}
return _transpose0->add_op_to_graph(graph_handle) && _mat_mul->add_op_to_graph(graph_handle) &&
_transpose1->add_op_to_graph(graph_handle) &&
(!_output_convert || _output_convert->add_op_to_graph(graph_handle));
}
bool ggml_qnn_matmul_op_config::bind_input_tensors(const ggml_tensor_array_t &tensor_inputs) {
return bind_tensors(tensor_inputs, _tensor_inputs, _qnn_tensor_inputs);
}
bool ggml_qnn_matmul_op_config::bind_output_tensors(const ggml_tensor_array_t &tensor_outputs) {
if (_output_convert) {
return _output_convert->bind_output_tensors(tensor_outputs);
} else {
return _transpose1->bind_output_tensors(tensor_outputs);
}
}
void ggml_qnn_matmul_op_config::unbind_input_tensors() {
_mat_mul->unbind_input_tensors();
_transpose0->unbind_input_tensors();
for (auto &convert : _input_converts) {
if (convert) {
convert->unbind_input_tensors();
}
}
}
void ggml_qnn_matmul_op_config::unbind_output_tensors() {
_transpose1->unbind_output_tensors();
if (_output_convert) {
_output_convert->unbind_output_tensors();
}
}
std::vector<Qnn_Tensor_t> &ggml_qnn_matmul_op_config::get_qnn_output_tensors() {
if (_output_convert) {
return _output_convert->get_qnn_output_tensors();
} else {
return _transpose1->get_qnn_output_tensors();
}
}
ggml_op_constructor_t create_op_constructor(const std::string &op_name) {
if (op_name == QNN_OP_MAT_MUL) {
// For QNN_OP_MAT_MUL, we need to transpose the input tensor
return [](const std::string &instance_name,
std::shared_ptr<qnn::qnn_instance> qnn_instance) -> std::unique_ptr<qnn::ggml_qnn_op_config> {
QNN_LOG_DEBUG("create QNN_OP_MAT_MUL, name %s\n", instance_name.c_str());
return std::make_unique<qnn::ggml_qnn_matmul_op_config>(instance_name, qnn_instance);
};
}
return [op_name](const std::string &instance_name,
std::shared_ptr<qnn::qnn_instance> qnn_instance) -> std::unique_ptr<qnn::ggml_qnn_op_config> {
return std::make_unique<qnn::ggml_qnn_single_op_config>(instance_name, QNN_OP_PACKAGE_NAME_QTI_AISW, op_name,
qnn_instance);
};
}
} // namespace qnn
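
Hypothetical usage of the new factory (the local variable names are assumptions; the flow mirrors what `ggml_qnn_graph::build_graph()` and `execute()` do in this patch):

```cpp
// Build an op config for mat_mul and drive it through its lifecycle.
auto op_constructor = qnn::create_op_constructor(QNN_OP_MAT_MUL);
auto op_config = op_constructor("mat_mul_graph", qnn_instance); // qnn_instance: shared_ptr<qnn::qnn_instance>
// op_config->create_tensors(device, graph_handle, inputs, outputs);
// op_config->add_op_to_graph(graph_handle);
// op_config->bind_input_tensors(inputs); op_config->bind_output_tensors(outputs);
// ... execute the finalized graph, then unbind_input_tensors()/unbind_output_tensors().
```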


@ -1,73 +1,122 @@
#pragma once
#include <array>
#include <functional>
#include <string>
#include <vector>
#include "ggml-qnn.h"
#include "logger.hpp"
#include "qnn-lib.hpp"
#include "qnn-types.hpp"
#include "tensor.hpp"
namespace qnn {
using ggml_tensor_array_t = std::vector<ggml_tensor *>;
class ggml_qnn_op_config {
public:
explicit ggml_qnn_op_config(const std::string &name, const std::string &package_name, const std::string &op_type) :
_name(name), _package_name(package_name), _op_type(op_type) {}
virtual ~ggml_qnn_op_config() {}
virtual bool create_tensors(QNNBackend device, Qnn_GraphHandle_t graph_handle,
const ggml_tensor_array_t &tensor_inputs,
const ggml_tensor_array_t &tensor_outputs) = 0;
virtual std::vector<Qnn_Tensor_t> &get_qnn_input_tensors() = 0;
virtual std::vector<Qnn_Tensor_t> &get_qnn_output_tensors() = 0;
virtual bool add_op_to_graph(Qnn_GraphHandle_t graph_handle) = 0;
virtual bool bind_input_tensors(const ggml_tensor_array_t &tensor_inputs) = 0;
virtual bool bind_output_tensors(const ggml_tensor_array_t &tensor_outputs) = 0;
virtual void unbind_input_tensors() = 0;
virtual void unbind_output_tensors() = 0;
};
void set_input_tensors(const std::vector<std::shared_ptr<ggml_qnn_tensor>> &tensor_inputs) {
_qnn_tensor_inputs.resize(tensor_inputs.size());
for (size_t i = 0; i < tensor_inputs.size(); i++) {
_qnn_tensor_inputs[i] = tensor_inputs[i]->get_qnn_tensor();
}
}
class ggml_qnn_op_config_base : public ggml_qnn_op_config {
public:
explicit ggml_qnn_op_config_base(const std::string &name, const std::string &package_name,
const std::string &op_type, std::shared_ptr<qnn_instance> qnn_instance) :
_name(name), _package_name(package_name), _op_type(op_type), _qnn_instance(qnn_instance) {}
void set_output_tensors(const std::vector<std::shared_ptr<ggml_qnn_tensor>> &tensor_outputs) {
_qnn_tensor_outputs.resize(tensor_outputs.size());
for (size_t i = 0; i < tensor_outputs.size(); i++) {
_qnn_tensor_outputs[i] = tensor_outputs[i]->get_qnn_tensor();
}
}
void add_scalar_param(const std::string &name, const Qnn_Scalar_t scalar);
bool add_tensor_param(const std::string &name, const qnn_dimension_array_t &dimensions, int rank,
const uint8_t *data, const Qnn_DataType_t data_type, QNNBackend device,
Qnn_GraphHandle_t graph_handle);
bool add_op_to_graph(Qnn_GraphHandle_t graph_handle) override;
bool bind_input_tensors(const ggml_tensor_array_t &tensor_inputs) override;
bool bind_output_tensors(const ggml_tensor_array_t &tensor_outputs) override;
void unbind_input_tensors() override;
void unbind_output_tensors() override;
std::vector<Qnn_Tensor_t> &get_qnn_input_tensors() override { return _qnn_tensor_inputs; }
std::vector<Qnn_Tensor_t> &get_qnn_output_tensors() override { return _qnn_tensor_outputs; }
void add_scalar_param(const std::string &name, const Qnn_Scalar_t scalar) {
_param_names.push_back(name);
Qnn_Param_t param = QNN_PARAM_INIT;
param.paramType = QNN_PARAMTYPE_SCALAR;
param.name = _param_names.back().c_str();
param.scalarParam = scalar;
_parameters.push_back(param);
}
protected:
Qnn_OpConfig_t get_op_config();
std::vector<Qnn_Tensor_t> &get_qnn_input_tensors() { return _qnn_tensor_inputs; }
std::vector<Qnn_Tensor_t> &get_qnn_output_tensors() { return _qnn_tensor_outputs; }
Qnn_OpConfig_t get_op_config() {
Qnn_OpConfig_t config = QNN_OPCONFIG_INIT;
config.version = QNN_OPCONFIG_VERSION_1;
auto &op_config = config.v1;
op_config.name = _name.c_str();
op_config.packageName = _package_name.c_str();
op_config.typeName = _op_type.c_str();
op_config.numOfParams = (uint32_t)_parameters.size();
op_config.params = _parameters.data();
op_config.numOfInputs = (uint32_t)_qnn_tensor_inputs.size();
op_config.inputTensors = _qnn_tensor_inputs.data();
op_config.numOfOutputs = (uint32_t)_qnn_tensor_outputs.size();
op_config.outputTensors = _qnn_tensor_outputs.data();
return config;
}
private:
std::string _name;
std::string _package_name;
std::string _op_type;
std::shared_ptr<qnn_instance> _qnn_instance;
ggml_qnn_tensor_array_t _tensor_inputs;
ggml_qnn_tensor_array_t _tensor_outputs;
ggml_qnn_tensor_array_t _tensor_parameters;
std::vector<Qnn_Tensor_t> _qnn_tensor_inputs;
std::vector<Qnn_Tensor_t> _qnn_tensor_outputs;
std::vector<Qnn_Param_t> _parameters;
std::vector<Qnn_Param_t> _qnn_parameters;
std::vector<std::string> _param_names;
DISABLE_COPY(ggml_qnn_op_config);
DISABLE_MOVE(ggml_qnn_op_config);
DISABLE_COPY(ggml_qnn_op_config_base);
DISABLE_MOVE(ggml_qnn_op_config_base);
};
class ggml_qnn_single_op_config : public ggml_qnn_op_config_base {
public:
explicit ggml_qnn_single_op_config(const std::string &name, const std::string &package_name,
const std::string &op_type, std::shared_ptr<qnn_instance> qnn_instance) :
ggml_qnn_op_config_base(name, package_name, op_type, qnn_instance) {}
bool create_tensors(QNNBackend device, Qnn_GraphHandle_t graph_handle, const ggml_tensor_array_t &tensor_inputs,
const ggml_tensor_array_t &tensor_outputs) override;
private:
DISABLE_COPY(ggml_qnn_single_op_config);
DISABLE_MOVE(ggml_qnn_single_op_config);
};
class ggml_qnn_matmul_op_config : public ggml_qnn_op_config {
public:
ggml_qnn_matmul_op_config(const std::string &name, std::shared_ptr<qnn_instance> qnn_instance) :
_name(name), _qnn_instance(qnn_instance) {}
bool create_tensors(QNNBackend device, Qnn_GraphHandle_t graph_handle, const ggml_tensor_array_t &tensor_inputs,
const ggml_tensor_array_t &tensor_outputs) override;
bool add_op_to_graph(Qnn_GraphHandle_t graph_handle) override;
bool bind_input_tensors(const ggml_tensor_array_t &tensor_inputs) override;
bool bind_output_tensors(const ggml_tensor_array_t &tensor_outputs) override;
void unbind_input_tensors() override;
void unbind_output_tensors() override;
std::vector<Qnn_Tensor_t> &get_qnn_input_tensors() override { return _qnn_tensor_inputs; }
std::vector<Qnn_Tensor_t> &get_qnn_output_tensors() override;
private:
bool create_mat_mul_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, const int rank,
ggml_qnn_tensor_array_t &tensor_inputs, ggml_qnn_tensor_array_t &tensor_outputs);
std::string _name;
std::shared_ptr<qnn_instance> _qnn_instance;
std::shared_ptr<ggml_qnn_op_config> _transpose0;
std::shared_ptr<ggml_qnn_op_config> _transpose1;
std::shared_ptr<ggml_qnn_op_config> _mat_mul;
std::vector<std::shared_ptr<ggml_qnn_op_config>> _input_converts;
std::shared_ptr<ggml_qnn_op_config> _output_convert;
ggml_qnn_tensor_array_t _tensor_inputs;
std::vector<Qnn_Tensor_t> _qnn_tensor_inputs;
DISABLE_COPY(ggml_qnn_matmul_op_config);
DISABLE_MOVE(ggml_qnn_matmul_op_config);
};
using ggml_op_constructor_t =
std::function<std::unique_ptr<ggml_qnn_op_config>(const std::string &, std::shared_ptr<qnn_instance>)>;
ggml_op_constructor_t create_op_constructor(const std::string &op_name);
} // namespace qnn


@ -637,7 +637,7 @@ public:
return mem_fd;
}
Qnn_MemHandle_t register_rpcmem(void *p_data, uint32_t rank, uint32_t *dimensions, Qnn_DataType_t data_type) {
Qnn_MemHandle_t register_rpcmem(void *p_data, const uint32_t rank, uint32_t *dimensions, Qnn_DataType_t data_type) {
if (!p_data) {
QNN_LOG_WARN("invalid param\n");
return nullptr;


@ -1,8 +1,10 @@
#pragma once
#include <algorithm>
#include <array>
#include <atomic>
#include <cstddef>
#include <cstdint>
#include <memory>
#include <string>
@ -16,55 +18,81 @@
namespace qnn {
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS should be 4");
class ggml_qnn_tensor {
public:
explicit ggml_qnn_tensor(const std::string &name, QNNBackend device, Qnn_GraphHandle_t graph_handle,
typedef enum _tensor_type { INPUT, OUTPUT, INTERMEDIATE, PARAMETER } tensor_type_t;
explicit ggml_qnn_tensor(tensor_type_t tensor_type, const std::string &name,
const qnn_dimension_array_t &dimensions, Qnn_DataType_t data_type, int rank,
QNNBackend device, Qnn_GraphHandle_t graph_handle,
std::shared_ptr<qnn_instance> qnn_instance) :
_tensor_name(name), _device(device), _qnn_instance(qnn_instance), _graph_handle(graph_handle) {
QNN_TENSOR_SET_NAME(_qnn_tensor, _tensor_name.c_str());
if (!_tensor_name.empty()) {
QNN_TENSOR_SET_NAME(_qnn_tensor, _tensor_name.c_str());
}
QNN_TENSOR_SET_DIMENSIONS(_qnn_tensor, _dimensions.data());
QNN_TENSOR_SET_TYPE(_qnn_tensor, QNN_TENSOR_TYPE_NATIVE);
QNN_TENSOR_SET_DATA_FORMAT(_qnn_tensor, QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER);
QNN_LOG_DEBUG("create tensor %s, device: %d", _tensor_name.c_str(), device);
_dimensions = dimensions;
update_params_from_ggml_tensor(tensor_type, data_type, rank);
QNN_LOG_DEBUG("create tensor %s, rank: %d, dims: [%d, %d, %d, %d], data_type: %d, device: %d",
_tensor_name.c_str(), rank, (int)_dimensions[0], (int)_dimensions[1], (int)_dimensions[2],
(int)_dimensions[3], (int)data_type, (int)device);
}
explicit ggml_qnn_tensor(tensor_type_t tensor_type, const std::string &name,
const ggml_dimension_array_t &dimensions, ggml_type data_type, int rank, QNNBackend device,
Qnn_GraphHandle_t graph_handle, std::shared_ptr<qnn_instance> qnn_instance) :
ggml_qnn_tensor(tensor_type, name, get_internal_dimension(dimensions, rank),
qnn_datatype_from_ggml_datatype(data_type), rank, device, graph_handle, qnn_instance) {}
~ggml_qnn_tensor() { _qnn_rpc_buffer.reset(); }
bool bind_ggml_tensor(ggml_tensor *tensor, bool is_input, int prev_max_rank) {
if (_tensor) {
if (_tensor != tensor) {
QNN_LOG_WARN("tensor %s has been bound to another ggml tensor %s", _tensor_name.c_str(),
ggml_get_name(_tensor));
return false;
}
QNN_LOG_INFO("tensor %s already bound to same ggml tensor %s", _tensor_name.c_str(),
ggml_get_name(_tensor));
bool alloc_qnn_tensor_id() {
if (QNN_TENSOR_GET_ID(_qnn_tensor)) {
QNN_LOG_WARN("graph tensor %s already created, id %d", _tensor_name.c_str(),
QNN_TENSOR_GET_ID(_qnn_tensor));
return true;
}
update_params_from_ggml_tensor(tensor, prev_max_rank);
Qnn_TensorType_t new_tensor_type = is_input ? QNN_TENSOR_TYPE_APP_WRITE : QNN_TENSOR_TYPE_APP_READ;
QNN_TENSOR_SET_TYPE(_qnn_tensor, new_tensor_type);
QNN_LOG_INFO("tensor %s changed to type %d", _tensor_name.c_str(), new_tensor_type);
Qnn_Tensor_t qnn_tensor = _qnn_tensor;
auto qnn_interface = _qnn_instance->get_qnn_interface();
auto error = qnn_interface->qnn_tensor_create_graph_tensor(_graph_handle, &qnn_tensor);
if (error != QNN_SUCCESS) {
QNN_LOG_WARN("create graph tensor failed, tensor %s, error: %d\n", _tensor_name.c_str(), error);
return false;
}
if (!QNN_TENSOR_GET_ID(_qnn_tensor)) {
Qnn_Tensor_t qnn_tensor = _qnn_tensor;
auto qnn_interface = _qnn_instance->get_qnn_interface();
auto error = qnn_interface->qnn_tensor_create_graph_tensor(_graph_handle, &qnn_tensor);
if (error != QNN_SUCCESS) {
QNN_LOG_WARN("create graph tensor failed, tensor %s, error: %d\n", _tensor_name.c_str(), error);
QNN_TENSOR_SET_ID(_qnn_tensor, QNN_TENSOR_GET_ID(qnn_tensor));
QNN_LOG_DEBUG("create graph tensor %s, id: %d, rank: %d", _tensor_name.c_str(), QNN_TENSOR_GET_ID(qnn_tensor),
QNN_TENSOR_GET_RANK(qnn_tensor));
return true;
}
bool bind_buffer(uint8_t *buffer, const size_t buffer_size) {
if (_buffer) {
if (_buffer != buffer) {
QNN_LOG_WARN("tensor %s has been bound to another buffer %p", _tensor_name.c_str(), _buffer);
return false;
}
QNN_TENSOR_SET_ID(_qnn_tensor, QNN_TENSOR_GET_ID(qnn_tensor));
QNN_LOG_DEBUG("create graph tensor %s, id: %d, rank: %d", _tensor_name.c_str(),
QNN_TENSOR_GET_ID(qnn_tensor), QNN_TENSOR_GET_RANK(qnn_tensor));
QNN_LOG_INFO("tensor %s already bound to same ggml tensor %p", _tensor_name.c_str(), _buffer);
return true;
}
if (QNN_TENSOR_GET_TYPE(_qnn_tensor) == QNN_TENSOR_TYPE_NATIVE) {
QNN_LOG_DEBUG("tensor %s type(%d) not READ/WRITE, skipping", _tensor_name.c_str(),
(int)QNN_TENSOR_TYPE_NATIVE);
return true;
}
if (should_use_mem_handle()) {
if (!_qnn_rpc_buffer) {
auto qnn_rpc_buffer = std::make_unique<ggml_qnn_rpc_buffer>(
_qnn_instance, ggml_nbytes(tensor), QNN_TENSOR_GET_RANK(_qnn_tensor),
_qnn_instance, buffer_size, QNN_TENSOR_GET_RANK(_qnn_tensor),
QNN_TENSOR_GET_DIMENSIONS(_qnn_tensor), QNN_TENSOR_GET_DATA_TYPE(_qnn_tensor));
if (!qnn_rpc_buffer->is_valid()) {
QNN_LOG_WARN("alloc rpc mem failed, tensor %s", _tensor_name.c_str());
@ -79,30 +107,41 @@ public:
QNN_LOG_DEBUG("tensor %s, use mem handle %p", _tensor_name.c_str(), QNN_TENSOR_GET_MEM_HANDLE(_qnn_tensor));
} else {
QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_RAW);
Qnn_ClientBuffer_t client_buf = { tensor->data, get_ggml_tensor_data_size(tensor) };
Qnn_ClientBuffer_t client_buf = { buffer, (uint32_t)buffer_size };
QNN_TENSOR_SET_CLIENT_BUF(_qnn_tensor, client_buf);
QNN_LOG_DEBUG("tensor %s, use client buffer %p size %d", _tensor_name.c_str(), client_buf.data,
(int)client_buf.dataSize);
}
_tensor = tensor;
_buffer = buffer;
_buffer_size = buffer_size;
if (!write_to_qnn_tensor()) {
QNN_LOG_WARN("write to qnn tensor failed, tensor %s", _tensor_name.c_str());
return false;
}
QNN_LOG_DEBUG("bind tensor %s to ggml tensor %s", _tensor_name.c_str(), ggml_get_name(tensor));
QNN_LOG_DEBUG("bind tensor %s to buffer: %p, size: %d", _tensor_name.c_str(), buffer, (int)buffer_size);
return true;
}
bool unbind_ggml_tensor() {
bool bind_ggml_tensor(ggml_tensor *tensor) {
if (!bind_buffer(reinterpret_cast<uint8_t *>(tensor->data), ggml_nbytes(tensor))) {
QNN_LOG_WARN("Failed to bind tensor: %s to ggml tensor: %s", _tensor_name.c_str(), ggml_get_name(tensor));
return false;
}
QNN_LOG_DEBUG("Bind tensor %s to ggml tensor %s", _tensor_name.c_str(), ggml_get_name(tensor));
return true;
}
bool unbind() {
if (!_graph_handle) {
QNN_LOG_WARN("tensor %s not bound to any graph", _tensor_name.c_str());
return false;
}
if (!_tensor) {
if (!_buffer) {
QNN_LOG_DEBUG("tensor %s not bound to ggml tensor", _tensor_name.c_str());
return true;
}
@ -119,12 +158,15 @@ public:
QNN_LOG_DEBUG("tensor %s, clear client buffer", _tensor_name.c_str());
}
QNN_LOG_DEBUG("unbind tensor: %s from ggml tensor: %s", _tensor_name.c_str(), ggml_get_name(_tensor));
_tensor = nullptr;
QNN_LOG_DEBUG("unbind tensor: %s from buffer: %p, size: %d", _tensor_name.c_str(), _buffer, (int)_buffer_size);
_buffer = nullptr;
_buffer_size = 0;
return true;
}
const Qnn_Tensor_t &get_qnn_tensor() const { return _qnn_tensor; }
Qnn_DataType_t get_data_type() const { return QNN_TENSOR_GET_DATA_TYPE(_qnn_tensor); }
const qnn_dimension_array_t &get_dimensions() const { return _dimensions; }
private:
bool write_to_qnn_tensor() {
@ -136,7 +178,7 @@ private:
if (should_use_mem_handle()) {
if (_qnn_rpc_buffer) {
memcpy(_qnn_rpc_buffer->get_buffer(), _tensor->data, ggml_nbytes(_tensor));
memcpy(_qnn_rpc_buffer->get_buffer(), _buffer, _buffer_size);
} else {
QNN_LOG_WARN("tensor %s: can't find rpcmem from qnn mem handle\n", _tensor_name.c_str());
return false;
@ -157,7 +199,7 @@ private:
if (should_use_mem_handle()) {
if (_qnn_rpc_buffer) {
memcpy(_tensor->data, _qnn_rpc_buffer->get_buffer(), ggml_nbytes(_tensor));
memcpy(_buffer, _qnn_rpc_buffer->get_buffer(), _buffer_size);
} else {
QNN_LOG_WARN("can't find rpcmem from qnn mem handle\n");
return false;
@ -169,29 +211,45 @@ private:
return true;
}
void update_params_from_ggml_tensor(ggml_tensor *tensor, int prev_max_rank) {
_dimensions[0] = (uint32_t)tensor->ne[0];
_dimensions[1] = (uint32_t)tensor->ne[1];
_dimensions[2] = (uint32_t)tensor->ne[2];
_dimensions[3] = (uint32_t)tensor->ne[3];
QNN_TENSOR_SET_DATA_TYPE(_qnn_tensor, device_datatype_from_ggml_datatype(tensor->type));
void update_params_from_ggml_tensor(tensor_type_t tensor_type, Qnn_DataType_t data_type, int rank) {
QNN_TENSOR_SET_DATA_TYPE(_qnn_tensor, data_type);
// TODO: set the quantizeParams base on the tensor type
QNN_TENSOR_SET_RANK(_qnn_tensor, (uint32_t)std::max(prev_max_rank, ggml_n_dims(tensor)));
QNN_TENSOR_SET_RANK(_qnn_tensor, (uint32_t)rank);
QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_RAW);
Qnn_ClientBuffer_t client_buf = {};
QNN_TENSOR_SET_CLIENT_BUF(_qnn_tensor, client_buf);
Qnn_TensorType_t new_tensor_type;
switch (tensor_type) {
case INPUT:
new_tensor_type = QNN_TENSOR_TYPE_APP_WRITE;
break;
case OUTPUT:
new_tensor_type = QNN_TENSOR_TYPE_APP_READ;
break;
case PARAMETER:
new_tensor_type = QNN_TENSOR_TYPE_STATIC;
break;
default:
new_tensor_type = QNN_TENSOR_TYPE_NATIVE;
break;
}
QNN_TENSOR_SET_TYPE(_qnn_tensor, new_tensor_type);
QNN_LOG_INFO("tensor %s changed to type %d", _tensor_name.c_str(), new_tensor_type);
}
bool should_use_mem_handle() const { return _device == QNN_BACKEND_NPU; }
bool should_use_mem_handle() const {
return _device == QNN_BACKEND_NPU && QNN_TENSOR_GET_TYPE(_qnn_tensor) != QNN_TENSOR_TYPE_STATIC;
}
std::string _tensor_name;
const ggml_tensor *_tensor;
uint8_t *_buffer = nullptr;
size_t _buffer_size = 0;
QNNBackend _device;
std::shared_ptr<qnn_instance> _qnn_instance;
Qnn_Tensor_t _qnn_tensor = qnn_tensor_init(kDefaultQnnTensorVersion);
std::array<uint32_t, GGML_MAX_DIMS> _dimensions = {};
qnn_dimension_array_t _dimensions = {};
Qnn_GraphHandle_t _graph_handle = nullptr;
std::unique_ptr<ggml_qnn_rpc_buffer> _qnn_rpc_buffer;
@ -199,4 +257,6 @@ private:
DISABLE_MOVE(ggml_qnn_tensor);
};
using ggml_qnn_tensor_array_t = std::vector<std::shared_ptr<ggml_qnn_tensor>>;
} // namespace qnn


@ -9,14 +9,40 @@
namespace qnn {
qnn_dimension_array_t get_internal_dimension(const ggml_dimension_array_t &dims, uint32_t rank) {
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS should be 4");
GGML_ASSERT(rank <= GGML_MAX_DIMS && rank > 0);
qnn_dimension_array_t internal_dims = {};
/*
* Both ggml and QNN tensors are stored in memory in row-major format,
* but their dimensions are recorded in reversed order.
* For example, a 2x3 matrix:
* [
* [1, 2, 3],
* [4, 5, 6],
* ]
* The ggml tensor will have dimensions [3, 2], while the qnn tensor will have dimensions [2, 3].
*/
for (uint32_t i = 0; i < rank; i++) {
internal_dims[i] = std::max<uint32_t>(dims[rank - 1 - i], 1);
}
return internal_dims;
}
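
A minimal sketch of the dimension reordering, using the 2x3 matrix from the comment above:

```cpp
// ggml records ne = {3, 2, 1, 1} (columns first); with rank 2 the helper
// returns the QNN dimension array {2, 3, 0, 0} (rows first).
qnn::ggml_dimension_array_t ggml_ne = {3, 2, 1, 1};
qnn::qnn_dimension_array_t qnn_dims = qnn::get_internal_dimension(ggml_ne, 2);
```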
// TODO: mapping more ggml data type to QNN data type
// ref:explanation of k-quants, https://github.com/ggerganov/llama.cpp/pull/1684
Qnn_DataType_t device_datatype_from_ggml_datatype(ggml_type ggml_type) {
Qnn_DataType_t qnn_datatype_from_ggml_datatype(ggml_type ggml_type) {
switch (ggml_type) {
case GGML_TYPE_F16:
return QNN_DATATYPE_FLOAT_16;
case GGML_TYPE_F32:
return QNN_DATATYPE_FLOAT_32;
case GGML_TYPE_F16:
return QNN_DATATYPE_FLOAT_16;
case GGML_TYPE_I32:
return QNN_DATATYPE_INT_32;
case GGML_TYPE_I16:
return QNN_DATATYPE_INT_16;
case GGML_TYPE_I8:
return QNN_DATATYPE_INT_8;
case GGML_TYPE_Q8_0:
@ -29,16 +55,75 @@ Qnn_DataType_t device_datatype_from_ggml_datatype(ggml_type ggml_type) {
return QNN_DATATYPE_UNDEFINED;
}
Qnn_TensorType_t device_tensortype_from_ggml_tensor(ggml_tensor *ggml_tensor) {
Qnn_TensorType_t qnn_tensor_type = QNN_TENSOR_TYPE_NATIVE;
ggml_type ggml_datatype_from_qnn_datatype(Qnn_DataType_t qnn_type) {
switch (qnn_type) {
case QNN_DATATYPE_FLOAT_32:
return GGML_TYPE_F32;
case QNN_DATATYPE_FLOAT_16:
return GGML_TYPE_F16;
case QNN_DATATYPE_UINT_32:
case QNN_DATATYPE_INT_32:
return GGML_TYPE_I32;
case QNN_DATATYPE_INT_16:
return GGML_TYPE_I16;
case QNN_DATATYPE_INT_8:
return GGML_TYPE_I8;
case QNN_DATATYPE_SFIXED_POINT_8:
return GGML_TYPE_Q8_0;
case QNN_DATATYPE_SFIXED_POINT_4:
return GGML_TYPE_Q4_0;
default:
break;
}
return GGML_TYPE_COUNT;
}
if (ggml_tensor->flags & GGML_TENSOR_FLAG_INPUT) {
qnn_tensor_type = QNN_TENSOR_TYPE_APP_WRITE;
} else if (ggml_tensor->flags & GGML_TENSOR_FLAG_OUTPUT) {
qnn_tensor_type = QNN_TENSOR_TYPE_APP_READ;
size_t qnn_datatype_size(Qnn_DataType_t qnn_type) {
switch (qnn_type) {
case QNN_DATATYPE_FLOAT_32:
return sizeof(float);
case QNN_DATATYPE_FLOAT_16:
return sizeof(uint16_t);
case QNN_DATATYPE_UINT_32:
case QNN_DATATYPE_INT_32:
return sizeof(int32_t);
case QNN_DATATYPE_INT_16:
return sizeof(int16_t);
case QNN_DATATYPE_INT_8:
return sizeof(int8_t);
case QNN_DATATYPE_SFIXED_POINT_8:
return sizeof(int8_t);
case QNN_DATATYPE_SFIXED_POINT_4:
return sizeof(int8_t);
default:
break;
}
return 0;
}
const char *qnn_datatype_to_string(Qnn_DataType_t qnn_type) {
switch (qnn_type) {
case QNN_DATATYPE_FLOAT_32:
return "QNN_DATATYPE_FLOAT_32";
case QNN_DATATYPE_FLOAT_16:
return "QNN_DATATYPE_FLOAT_16";
case QNN_DATATYPE_UINT_32:
return "QNN_DATATYPE_UINT_32";
case QNN_DATATYPE_INT_32:
return "QNN_DATATYPE_INT_32";
case QNN_DATATYPE_INT_16:
return "QNN_DATATYPE_INT_16";
case QNN_DATATYPE_INT_8:
return "QNN_DATATYPE_INT_8";
case QNN_DATATYPE_SFIXED_POINT_8:
return "QNN_DATATYPE_SFIXED_POINT_8";
case QNN_DATATYPE_SFIXED_POINT_4:
return "QNN_DATATYPE_SFIXED_POINT_4";
default:
break;
}
return qnn_tensor_type;
return "QNN_DATATYPE_UNDEFINED";
}
uint32_t get_ggml_tensor_rank(const ggml_tensor *tensor) {
@ -51,8 +136,13 @@ uint32_t get_ggml_tensor_rank(const ggml_tensor *tensor) {
return rank;
}
const char *get_backend_name(int n_backend_type) {
switch (n_backend_type) {
const char *get_ggml_type_name(ggml_type type) {
const auto *traits = ggml_get_type_traits(type);
return traits->type_name;
}
const char *get_backend_name(size_t device_index) {
switch (device_index) {
case QNN_BACKEND_CPU:
return "QNN-CPU";
case QNN_BACKEND_GPU:


@ -6,6 +6,7 @@
#include <stddef.h>
#include <stdint.h>
#include <array>
#include <string>
#include "ggml.h"
@ -17,8 +18,14 @@
namespace qnn {
using ggml_dimension_array_t = int64_t[GGML_MAX_DIMS];
using qnn_dimension_array_t = std::array<uint32_t, GGML_MAX_DIMS>;
qnn_dimension_array_t get_internal_dimension(const ggml_dimension_array_t &dims, uint32_t rank);
uint32_t get_ggml_tensor_rank(const ggml_tensor *tensor);
const char *get_backend_name(int n_backend_type);
const char *get_ggml_type_name(ggml_type type);
const char *get_backend_name(size_t device_index);
const char *get_chipset_desc(uint32_t chipset_id);
const char *get_htparch_desc(size_t htp_arch);
intptr_t align_to(size_t alignment, intptr_t offset);
@ -187,8 +194,10 @@ inline void set_qnn_tensor_dyn_dimensions(Qnn_Tensor_t &tensor, uint8_t *isDynam
}
}
Qnn_DataType_t device_datatype_from_ggml_datatype(ggml_type ggml_type);
Qnn_TensorType_t device_tensortype_from_ggml_tensor(ggml_tensor *ggml_tensor);
Qnn_DataType_t qnn_datatype_from_ggml_datatype(ggml_type ggml_type);
ggml_type ggml_datatype_from_qnn_datatype(Qnn_DataType_t qnn_type);
size_t qnn_datatype_size(Qnn_DataType_t qnn_type);
const char *qnn_datatype_to_string(Qnn_DataType_t qnn_type);
#if ENABLE_QNNBACKEND_PERF
class qnn_perf {