feat: add QNN_OP_TRANSPOSE (#6)

* redo: add convert nodes

This reverts commit 8448acd5ebf8fe86ab9d25313b64a15c811ef96e.

* align clang format with cann

* rename binary_op -> general_op

because there are some ops that will only take 1 param

* Revert "rename binary_op -> general_op"

This reverts commit 5be63b1a0dc4614457785367dade62158fe46214.

* wip

* add GGML_OP_PERMUTE

* add GGML_OP_VIEW and GGML_OP_GET_ROWS

* wip

* Revert "wip"

This reverts commit 772462ca6cfa01ea31bde725c2da60076ad9385f.
nullname 2024-11-04 23:12:03 +08:00 committed by GitHub
parent 0fec56fd57
commit 8ad86dc703
6 changed files with 265 additions and 141 deletions
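
Not part of this commit: a minimal sketch of the ggml-side graph this change targets. It builds a GGML_OP_PERMUTE node with the public ggml API; the note about op_params is an assumption based on ggml's permute implementation, and the QNN lowering it mentions is the op-table diff below.

#include "ggml.h"

int main(void) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16 * 1024 * 1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    // 4x3 f32 source tensor.
    struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 3);

    // GGML_OP_PERMUTE node: ggml stores the axes (1, 0, 2, 3) in the dst
    // tensor's op_params, which the QNN backend packs into the
    // QNN_OP_TRANSPOSE_PARAM_PERM tensor parameter (see op-config.cpp below).
    struct ggml_tensor * permuted = ggml_permute(ctx, a, 1, 0, 2, 3);
    (void)permuted;

    ggml_free(ctx);
    return 0;
}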

View File

@@ -57,30 +57,30 @@ struct qnn_device_caps {
};
const qnn_device_caps kDeviceCaps[GGML_QNN_MAX_DEVICES]{
{ "qnn-cpu",
"Qualcomm Kryo CPU",
"libQnnCpu.so",
GGML_BACKEND_DEVICE_TYPE_CPU,
{ GGML_TYPE_F32,
GGML_TYPE_I8 } }, // https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/CpuOpDefSupplement.html#matmul
{ "qnn-gpu",
"Qualcomm Adreno GPU",
"libQnnGpu.so",
GGML_BACKEND_DEVICE_TYPE_GPU,
{ GGML_TYPE_F32,
GGML_TYPE_F16 } }, // https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/GpuOpDefSupplement.html#matmul
{ "qnn-npu",
"Qualcomm NPU",
"libQnnHtp.so",
GGML_BACKEND_DEVICE_TYPE_GPU,
{ GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_I16,
GGML_TYPE_I8 } }, // https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/HtpOpDefSupplement.html#matmul
{// https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/CpuOpDefSupplement.html#matmul
"qnn-cpu",
"Qualcomm Kryo CPU",
"libQnnCpu.so",
GGML_BACKEND_DEVICE_TYPE_CPU,
{GGML_TYPE_F32, GGML_TYPE_I8}},
{// https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/GpuOpDefSupplement.html#matmul
"qnn-gpu",
"Qualcomm Adreno GPU",
"libQnnGpu.so",
GGML_BACKEND_DEVICE_TYPE_GPU,
{GGML_TYPE_F32, GGML_TYPE_F16}},
{// https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/HtpOpDefSupplement.html#matmul
"qnn-npu",
"Qualcomm NPU",
"libQnnHtp.so",
GGML_BACKEND_DEVICE_TYPE_GPU,
{GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_I16, GGML_TYPE_I8}},
};
class ggml_backend_qnn_buffer_context {
public:
ggml_backend_qnn_buffer_context(QNNBackend device, std::shared_ptr<qnn::qnn_instance> instance, size_t size) :
_instance(instance), _name(QNN_BACKEND_NAME + std::to_string(device)) {
ggml_backend_qnn_buffer_context(QNNBackend device, std::shared_ptr<qnn::qnn_instance> instance, size_t size)
: _instance(instance), _name(QNN_BACKEND_NAME + std::to_string(device)) {
// TODO: fix this for other platforms
size_t size_page = sysconf(_SC_PAGESIZE);
@@ -251,7 +251,7 @@ ggml_backend_buffer_type_t ggml_backend_qnn_buffer_type(ggml_backend_dev_t dev)
if (!ggml_backend_qnn_buffer_type_initialized) {
for (size_t i = 0; i < GGML_QNN_MAX_DEVICES; i++) {
auto &context = ggml_backend_qnn_buffer_type_contexts[i];
context = { std::string(QNN_BACKEND_NAME) + std::to_string(i) };
context = {std::string(QNN_BACKEND_NAME) + std::to_string(i)};
ggml_backend_qnn_buffer_types[i] = {
/* .iface = */ {
/* .get_name = */ ggml_backend_qnn_buffer_type_name,
@@ -348,8 +348,8 @@ void ggml_backend_qnn_device_get_props(ggml_backend_dev_t dev, struct ggml_backe
}
ggml_guid_t ggml_backend_qnn_guid() {
static ggml_guid guid = { 0x1a, 0x2b, 0x3c, 0x4d, 0x5e, 0x6f, 0x70, 0x81,
0x92, 0xa3, 0xb4, 0xc5, 0xd6, 0xe7, 0xf8, 0x09 };
static ggml_guid guid = {0x1a, 0x2b, 0x3c, 0x4d, 0x5e, 0x6f, 0x70, 0x81,
0x92, 0xa3, 0xb4, 0xc5, 0xd6, 0xe7, 0xf8, 0x09};
return &guid;
}
@@ -511,7 +511,7 @@ const ggml_backend_reg_i ggml_backend_qnn_reg_interface = {
} // namespace
ggml_backend_reg_t ggml_backend_qnn_reg() {
static ggml_backend_qnn_reg_impl reg{ ggml_backend_qnn_reg_interface };
static ggml_backend_qnn_reg_impl reg{ggml_backend_qnn_reg_interface};
static bool initialized = false;
static std::mutex mutex;

View File

@@ -3,16 +3,50 @@ BasedOnStyle: Google
IndentWidth: 4
AccessModifierOffset: -4
AlignAfterOpenBracket: Align
AlignOperands: true
AlignConsecutiveMacros: false
AlignConsecutiveAssignments: false
AlignConsecutiveDeclarations: false
AlignEscapedNewlines: Left
AlignOperands: true
AlignTrailingComments: true
AllowAllArgumentsOnNextLine: true
AllowAllConstructorInitializersOnNextLine: true
AllowAllParametersOfDeclarationOnNextLine: true
AllowShortBlocksOnASingleLine: Never
AllowShortCaseLabelsOnASingleLine: false
AllowShortFunctionsOnASingleLine: All
AllowShortLambdasOnASingleLine: All
AllowShortIfStatementsOnASingleLine: WithoutElse
AllowShortLoopsOnASingleLine: true
AlwaysBreakAfterDefinitionReturnType: None
AlwaysBreakAfterReturnType: None
AlwaysBreakBeforeMultilineStrings: true
AlwaysBreakTemplateDeclarations: Yes
BinPackArguments: true
BinPackParameters: true
BreakBeforeBraces: Custom
BreakConstructorInitializers: AfterColon
BraceWrapping:
AfterCaseLabel: false
AfterClass: false
AfterControlStatement: false
AfterEnum: false
AfterFunction: false
AfterNamespace: false
AfterObjCDeclaration: false
AfterStruct: false
AfterUnion: false
AfterExternBlock: false
BeforeCatch: false
BeforeElse: false
IndentBraces: false
SplitEmptyFunction: true
SplitEmptyRecord: true
SplitEmptyNamespace: true
ColumnLimit: 120
Cpp11BracedListStyle: false
ConstructorInitializerIndentWidth: 4
ContinuationIndentWidth: 4
Cpp11BracedListStyle: true
DerivePointerAlignment: false
IncludeCategories:
IncludeCategories:
- Regex: '^<.*\.h>'
Priority: 1
- Regex: '^<.*'
@@ -28,4 +62,4 @@ MaxEmptyLinesToKeep: 1
PointerAlignment: Right
SortIncludes: true
SpacesBeforeTrailingComments: 1
UseTab: Never
UseTab: Never

View File

@@ -92,10 +92,10 @@ qnn::ggml_tensor_array_t to_ggml_tensor_array(const std::array<ggml_tensor *, _S
return qnn::ggml_tensor_array_t(array.data(), array.data() + _Size);
}
template <size_t _InputSize, size_t _OutputSize>
template <size_t _InputSize>
bool execute_graph(qnn::ggml_qnn_graph *graph, const std::array<ggml_tensor *, _InputSize> &inputs,
const std::array<ggml_tensor *, _OutputSize> &outputs) {
if (!graph->execute(to_ggml_tensor_array<_InputSize>(inputs), to_ggml_tensor_array<_OutputSize>(outputs))) {
ggml_tensor *output) {
if (!graph->execute(to_ggml_tensor_array<_InputSize>(inputs), to_ggml_tensor_array<1>({output}))) {
QNN_LOG_WARN("execute failed\n");
return false;
}
@@ -154,37 +154,37 @@ constexpr const char *kGgmlOpToQnnOp[] = {
nullptr, // GGML_OP_MUL_MAT_ID
nullptr, // GGML_OP_OUT_PROD
nullptr, // GGML_OP_SCALE
nullptr, // GGML_OP_SET
nullptr, // GGML_OP_CPY
nullptr, // GGML_OP_CONT
nullptr, // GGML_OP_RESHAPE
nullptr, // GGML_OP_VIEW
nullptr, // GGML_OP_PERMUTE
nullptr, // GGML_OP_TRANSPOSE
nullptr, // GGML_OP_GET_ROWS
nullptr, // GGML_OP_GET_ROWS_BACK
nullptr, // GGML_OP_DIAG
nullptr, // GGML_OP_DIAG_MASK_INF
nullptr, // GGML_OP_DIAG_MASK_ZERO
nullptr, // GGML_OP_SOFT_MAX
nullptr, // GGML_OP_SOFT_MAX_BACK
nullptr, // GGML_OP_ROPE
nullptr, // GGML_OP_ROPE_BACK
nullptr, // GGML_OP_CLAMP
nullptr, // GGML_OP_CONV_TRANSPOSE_1D
nullptr, // GGML_OP_IM2COL
nullptr, // GGML_OP_IM2COL_BACK
nullptr, // GGML_OP_CONV_TRANSPOSE_2D
nullptr, // GGML_OP_POOL_1D
nullptr, // GGML_OP_POOL_2D
nullptr, // GGML_OP_POOL_2D_BACK
nullptr, // GGML_OP_UPSCALE
nullptr, // GGML_OP_PAD
nullptr, // GGML_OP_ARANGE
nullptr, // GGML_OP_TIMESTEP_EMBEDDING
nullptr, // GGML_OP_ARGSORT
nullptr, // GGML_OP_LEAKY_RELU
nullptr, // GGML_OP_SCALE
nullptr, // GGML_OP_SET
nullptr, // GGML_OP_CPY
nullptr, // GGML_OP_CONT
nullptr, // GGML_OP_RESHAPE
nullptr, // GGML_OP_VIEW
QNN_OP_TRANSPOSE, // GGML_OP_PERMUTE
nullptr, // GGML_OP_TRANSPOSE
nullptr, // GGML_OP_GET_ROWS
nullptr, // GGML_OP_GET_ROWS_BACK
nullptr, // GGML_OP_DIAG
nullptr, // GGML_OP_DIAG_MASK_INF
nullptr, // GGML_OP_DIAG_MASK_ZERO
nullptr, // GGML_OP_SOFT_MAX
nullptr, // GGML_OP_SOFT_MAX_BACK
nullptr, // GGML_OP_ROPE
nullptr, // GGML_OP_ROPE_BACK
nullptr, // GGML_OP_CLAMP
nullptr, // GGML_OP_CONV_TRANSPOSE_1D
nullptr, // GGML_OP_IM2COL
nullptr, // GGML_OP_IM2COL_BACK
nullptr, // GGML_OP_CONV_TRANSPOSE_2D
nullptr, // GGML_OP_POOL_1D
nullptr, // GGML_OP_POOL_2D
nullptr, // GGML_OP_POOL_2D_BACK
nullptr, // GGML_OP_UPSCALE
nullptr, // GGML_OP_PAD
nullptr, // GGML_OP_ARANGE
nullptr, // GGML_OP_TIMESTEP_EMBEDDING
nullptr, // GGML_OP_ARGSORT
nullptr, // GGML_OP_LEAKY_RELU
nullptr, // GGML_OP_FLASH_ATTN_EXT
nullptr, // GGML_OP_FLASH_ATTN_BACK
@@ -235,16 +235,16 @@ static_assert(sizeof(kGgmlOpToQnnOp) / sizeof(kGgmlOpToQnnOp[0]) == (GGML_OP_COU
static_assert(kGgmlOpToQnnOp[GGML_UNARY_OP_GELU + kGgmlUnaryOpStart] != nullptr,
"GGML_UNARY_OP_GELU does not correspond to QNN_OP_GELU");
template <size_t _InputSize, size_t _OutputSize>
template <size_t _InputSize>
qnn::ggml_qnn_graph *get_qnn_graph_from_cache(ggml_backend_qnn_device_context *ctx, size_t op,
const std::array<ggml_tensor *, _InputSize> &inputs,
const std::array<ggml_tensor *, _OutputSize> &outputs) {
ggml_tensor *output) {
GGML_ASSERT(op < (GGML_OP_COUNT + GGML_UNARY_OP_COUNT));
auto &graph_cache = ctx->qnn_graph_cache;
const auto *op_name =
op < kGgmlUnaryOpStart ? ggml_op_name(ggml_op(op)) : ggml_unary_op_name(ggml_unary_op(op - kGgmlUnaryOpStart));
auto graph_key = get_graph_key<_InputSize, _OutputSize>(op_name, inputs, outputs);
auto graph_key = get_graph_key<_InputSize, 1>(op_name, inputs, {output});
auto it = graph_cache.find(graph_key);
qnn::ggml_qnn_graph *graph_ptr = nullptr;
if (it != graph_cache.end()) {
@@ -259,7 +259,7 @@ qnn::ggml_qnn_graph *get_qnn_graph_from_cache(ggml_backend_qnn_device_context *c
auto op_constructor = qnn::create_op_constructor(kGgmlOpToQnnOp[op]);
if (!graph->build_graph(op_constructor, to_ggml_tensor_array<_InputSize>(inputs),
to_ggml_tensor_array<_OutputSize>(outputs))) {
to_ggml_tensor_array<1>({output}))) {
QNN_LOG_ERROR("build_graph failed\n");
return nullptr;
}
@@ -278,9 +278,9 @@ bool qnn_binary_op_impl(ggml_backend_qnn_device_context *ctx, ggml_tensor *src0,
CHECK_PARAMS(ctx, src0, src1, dst);
bool succeed = false;
auto *graph_ptr = get_qnn_graph_from_cache<2, 1>(ctx, _GgmlOp, { src0, src1 }, { dst });
auto *graph_ptr = get_qnn_graph_from_cache<2>(ctx, _GgmlOp, {src0, src1}, dst);
if (graph_ptr) {
succeed = execute_graph<2, 1>(graph_ptr, { src0, src1 }, { dst });
succeed = execute_graph<2>(graph_ptr, {src0, src1}, dst);
}
#ifndef NDEBUG
@@ -301,9 +301,9 @@ bool qnn_unary_op_impl(ggml_backend_qnn_device_context *ctx, ggml_tensor *src, g
CHECK_PARAMS(ctx, src, dst);
bool succeed = false;
auto *graph_ptr = get_qnn_graph_from_cache<1, 1>(ctx, _GgmlOp, { src }, { dst });
auto *graph_ptr = get_qnn_graph_from_cache<1>(ctx, _GgmlOp, {src}, dst);
if (graph_ptr) {
succeed = execute_graph<1, 1>(graph_ptr, { src }, { dst });
succeed = execute_graph<1>(graph_ptr, {src}, dst);
}
#ifndef NDEBUG
@@ -315,6 +315,22 @@ bool qnn_unary_op_impl(ggml_backend_qnn_device_context *ctx, ggml_tensor *src, g
return succeed;
}
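// No-op handlers: report success without building or running a QNN graph.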
bool qnn_unary_nop_impl(ggml_backend_qnn_device_context *ctx, ggml_tensor *src, ggml_tensor *dst) {
GGML_UNUSED(ctx);
GGML_UNUSED(src);
GGML_UNUSED(dst);
return true;
}
bool qnn_binary_nop_impl(ggml_backend_qnn_device_context *ctx, ggml_tensor *src0, ggml_tensor *src1, ggml_tensor *dst) {
GGML_UNUSED(ctx);
GGML_UNUSED(src0);
GGML_UNUSED(src1);
GGML_UNUSED(dst);
return true;
}
constexpr const ggml_qnn_unary_op_t kQnnUnaryOpsTable[] = {
nullptr, // GGML_OP_NONE
nullptr, // GGML_OP_DUP
@@ -347,37 +363,37 @@ constexpr const ggml_qnn_unary_op_t kQnnUnaryOpsTable[] = {
nullptr, // GGML_OP_MUL_MAT_ID
nullptr, // GGML_OP_OUT_PROD
nullptr, // GGML_OP_SCALE
nullptr, // GGML_OP_SET
nullptr, // GGML_OP_CPY
nullptr, // GGML_OP_CONT
nullptr, // GGML_OP_RESHAPE
nullptr, // GGML_OP_VIEW
nullptr, // GGML_OP_PERMUTE
nullptr, // GGML_OP_TRANSPOSE
nullptr, // GGML_OP_GET_ROWS
nullptr, // GGML_OP_GET_ROWS_BACK
nullptr, // GGML_OP_DIAG
nullptr, // GGML_OP_DIAG_MASK_INF
nullptr, // GGML_OP_DIAG_MASK_ZERO
nullptr, // GGML_OP_SOFT_MAX
nullptr, // GGML_OP_SOFT_MAX_BACK
nullptr, // GGML_OP_ROPE
nullptr, // GGML_OP_ROPE_BACK
nullptr, // GGML_OP_CLAMP
nullptr, // GGML_OP_CONV_TRANSPOSE_1D
nullptr, // GGML_OP_IM2COL
nullptr, // GGML_OP_IM2COL_BACK
nullptr, // GGML_OP_CONV_TRANSPOSE_2D
nullptr, // GGML_OP_POOL_1D
nullptr, // GGML_OP_POOL_2D
nullptr, // GGML_OP_POOL_2D_BACK
nullptr, // GGML_OP_UPSCALE
nullptr, // GGML_OP_PAD
nullptr, // GGML_OP_ARANGE
nullptr, // GGML_OP_TIMESTEP_EMBEDDING
nullptr, // GGML_OP_ARGSORT
nullptr, // GGML_OP_LEAKY_RELU
nullptr, // GGML_OP_SCALE
nullptr, // GGML_OP_SET
nullptr, // GGML_OP_CPY
nullptr, // GGML_OP_CONT
nullptr, // GGML_OP_RESHAPE
qnn_unary_nop_impl, // GGML_OP_VIEW
qnn_unary_op_impl<GGML_OP_PERMUTE>, // GGML_OP_PERMUTE
nullptr, // GGML_OP_TRANSPOSE
qnn_unary_nop_impl, // GGML_OP_GET_ROWS
nullptr, // GGML_OP_GET_ROWS_BACK
nullptr, // GGML_OP_DIAG
nullptr, // GGML_OP_DIAG_MASK_INF
nullptr, // GGML_OP_DIAG_MASK_ZERO
nullptr, // GGML_OP_SOFT_MAX
nullptr, // GGML_OP_SOFT_MAX_BACK
nullptr, // GGML_OP_ROPE
nullptr, // GGML_OP_ROPE_BACK
nullptr, // GGML_OP_CLAMP
nullptr, // GGML_OP_CONV_TRANSPOSE_1D
nullptr, // GGML_OP_IM2COL
nullptr, // GGML_OP_IM2COL_BACK
nullptr, // GGML_OP_CONV_TRANSPOSE_2D
nullptr, // GGML_OP_POOL_1D
nullptr, // GGML_OP_POOL_2D
nullptr, // GGML_OP_POOL_2D_BACK
nullptr, // GGML_OP_UPSCALE
nullptr, // GGML_OP_PAD
nullptr, // GGML_OP_ARANGE
nullptr, // GGML_OP_TIMESTEP_EMBEDDING
nullptr, // GGML_OP_ARGSORT
nullptr, // GGML_OP_LEAKY_RELU
nullptr, // GGML_OP_FLASH_ATTN_EXT
nullptr, // GGML_OP_FLASH_ATTN_BACK
@@ -522,18 +538,24 @@ static_assert(sizeof(kQnnBinaryOpsTable) / sizeof(kQnnBinaryOpsTable[0]) == GGML
"GGML_OP_COUNT does not match the size of the kQnnBinaryOpsTable table");
bool ggml_qnn_supports_tensor(ggml_backend_qnn_device_context *ctx, const ggml_tensor *tensor) {
if (!tensor) {
QNN_LOG_DEBUG("tensor is nullptr");
return false;
}
auto *type_name = ggml_get_type_traits(tensor->type)->type_name;
switch (tensor->type) {
case GGML_TYPE_F32:
case GGML_TYPE_F16:
case GGML_TYPE_Q8_0:
case GGML_TYPE_Q4_0:
if (ctx->supported_types.find(tensor->type) == ctx->supported_types.end()) {
QNN_LOG_DEBUG("unsupported data type GGML_TYPE_F16 for cpu backend");
QNN_LOG_DEBUG("unsupported data type %s for backend %d", type_name, (int)ctx->device);
return false;
}
break;
default:
QNN_LOG_DEBUG("unsupported data type %d", tensor->type);
QNN_LOG_DEBUG("unsupported data type %s", type_name);
return false;
}
@@ -591,19 +613,15 @@ bool ggml_qnn_supports_op(ggml_backend_qnn_device_context *ctx, const ggml_tenso
}
} else {
if (!kQnnUnaryOpsTable[op->op] && !kQnnBinaryOpsTable[op->op]) {
QNN_LOG_DEBUG("unsupported op %d", op->op);
QNN_LOG_DEBUG("[%s] unsupported op", ggml_op_name(op->op));
return false;
}
auto *src0 = op->src[0];
auto *src1 = op->src[1];
if (!src0 || !src1) {
QNN_LOG_DEBUG("src0 or src1 is nullptr");
return false;
}
if (!ggml_qnn_supports_tensor(ctx, src0) || !ggml_qnn_supports_tensor(ctx, src1) ||
!ggml_qnn_supports_tensor(ctx, op)) {
if (!ggml_qnn_supports_tensor(ctx, src0) || !ggml_qnn_supports_tensor(ctx, op) ||
(kQnnBinaryOpsTable[op->op] && !ggml_qnn_supports_tensor(ctx, src1))) {
QNN_LOG_DEBUG("[%s] unsupported tensor", ggml_op_name(op->op));
return false;
}
@@ -642,7 +660,7 @@ bool ggml_qnn_forward(ggml_backend_qnn_device_context *ctx, struct ggml_tensor *
return binary_op(ctx, tensor->src[0], tensor->src[1], tensor);
}
QNN_LOG_WARN("unsupported op %s", ggml_op_desc(tensor));
QNN_LOG_WARN("[forward]unsupported op %s", ggml_op_desc(tensor));
return false;
}

View File

@@ -7,10 +7,10 @@
namespace {
constexpr const qnn::qnn_dimension_array_t kTransposeParamData[GGML_MAX_DIMS] = {
{ 0 },
{ 1, 0 },
{ 0, 2, 1 },
{ 0, 1, 3, 2 },
{0},
{1, 0},
{0, 2, 1},
{0, 1, 3, 2},
};
qnn::qnn_dimension_array_t get_transposed_dimensions(const qnn::qnn_dimension_array_t &dimensions, int rank) {
@@ -96,9 +96,8 @@ bool bind_tensors(const qnn::ggml_tensor_array_t &ggml_tensors, qnn::ggml_qnn_te
class ggml_qnn_connectable_op_config : public qnn::ggml_qnn_op_config_base {
public:
explicit ggml_qnn_connectable_op_config(const std::string &name, const std::string &package_name,
const std::string &op_type,
std::shared_ptr<qnn::qnn_instance> qnn_instance) :
ggml_qnn_op_config_base(name, package_name, op_type, qnn_instance) {}
const std::string &op_type, std::shared_ptr<qnn::qnn_instance> qnn_instance)
: ggml_qnn_op_config_base(name, package_name, op_type, qnn_instance) {}
bool create_tensors(QNNBackend device, Qnn_GraphHandle_t graph_handle,
const qnn::ggml_tensor_array_t &tensor_inputs,
@@ -264,11 +263,22 @@ bool ggml_qnn_single_op_config::create_tensors(QNNBackend device, Qnn_GraphHandl
const ggml_tensor_array_t &tensor_inputs,
const ggml_tensor_array_t &tensor_outputs) {
const auto tensor_rank = get_rank(tensor_inputs, tensor_outputs);
tensor_common_params params = { "src", tensor_rank, true, device, graph_handle, _qnn_instance };
tensor_common_params params = {"src", tensor_rank, true, device, graph_handle, _qnn_instance};
create_tensors_from_ggml_tensor(params, tensor_inputs, &_tensor_inputs, &_qnn_tensor_inputs);
params.name_prefix = "dst";
params.is_input = false;
create_tensors_from_ggml_tensor(params, tensor_outputs, &_tensor_outputs, &_qnn_tensor_outputs);
if (_param_buffer.size() > 0) {
// handle parameters in output tensor
auto *params = tensor_outputs.front()->op_params;
memcpy(_param_buffer.data(), params, _param_buffer.size());
const uint32_t count = uint32_t(_param_buffer.size() / qnn_datatype_size(_param_type));
const qnn_dimension_array_t param_dims = {count, 1, 1, 1};
add_tensor_param(_param_name, param_dims, 1, _param_buffer.data(), _param_type, device, graph_handle);
}
return true;
}
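
Not part of this commit: a standalone sketch of the parameter packing done just above, using assumed axis values; the real code reads dst->op_params and qnn_datatype_size(_param_type).

#include <cstdint>
#include <cstdio>
#include <cstring>
#include <vector>

int main() {
    // ggml stores the permute axes in dst->op_params as four 32-bit ints.
    const int32_t op_params[4] = {1, 0, 2, 3};

    // The single-op config copies param_size bytes (4 * sizeof(uint32_t) for
    // QNN_OP_TRANSPOSE_PARAM_PERM) into its parameter buffer ...
    std::vector<uint8_t> param_buffer(4 * sizeof(uint32_t));
    std::memcpy(param_buffer.data(), op_params, param_buffer.size());

    // ... and exposes it to QNN as a rank-1 UINT_32 tensor parameter with
    // dimensions {count, 1, 1, 1}, where count = bytes / element size.
    const uint32_t count = uint32_t(param_buffer.size() / sizeof(uint32_t));
    const uint32_t *perm = reinterpret_cast<const uint32_t *>(param_buffer.data());
    std::printf("perm param, %u values:", count);
    for (uint32_t i = 0; i < count; ++i) {
        std::printf(" %u", perm[i]);
    }
    std::printf("\n");
    return 0;
}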
@@ -281,7 +291,7 @@ bool ggml_qnn_matmul_op_config::create_tensors(QNNBackend device, Qnn_GraphHandl
GGML_ASSERT(tensor_rank >= 2);
// create input tensors
tensor_common_params params = { "src", tensor_rank, true, device, graph_handle, _qnn_instance };
tensor_common_params params = {"src", tensor_rank, true, device, graph_handle, _qnn_instance};
create_tensors_from_ggml_tensor(params, tensor_inputs, &_tensor_inputs, &_qnn_tensor_inputs);
// create output tensor
@@ -290,8 +300,49 @@ bool ggml_qnn_matmul_op_config::create_tensors(QNNBackend device, Qnn_GraphHandl
params.is_input = false;
create_tensors_from_ggml_tensor(params, tensor_outputs, &mat_mul_tensor_outputs, nullptr);
if (device == QNN_BACKEND_GPU) {
// there's no convert op for GPU, so we should create matmul nodes directly.
return create_mat_mul_nodes(device, graph_handle, tensor_rank, _tensor_inputs, mat_mul_tensor_outputs);
}
// create tensors for convert node
ggml_qnn_tensor_array_t mat_mul_tensor_inputs = _tensor_inputs;
auto input_tensor_type = get_tensor_type(mat_mul_tensor_inputs);
QNN_LOG_DEBUG("matmul input tensor type: %s\n", qnn_datatype_to_string(input_tensor_type));
_input_converts.resize(mat_mul_tensor_inputs.size());
for (size_t i = 0; i < mat_mul_tensor_inputs.size(); ++i) {
// create input convert nodes
std::string convert_name("convert_src" + std::to_string(i));
auto convert_in = mat_mul_tensor_inputs[i];
auto convert_out = std::make_shared<ggml_qnn_tensor>(ggml_qnn_tensor::INTERMEDIATE, convert_name + "_out",
convert_in->get_dimensions(), input_tensor_type,
tensor_rank, device, graph_handle, _qnn_instance);
auto convert = std::make_shared<ggml_qnn_connectable_op_config>(convert_name, QNN_OP_PACKAGE_NAME_QTI_AISW,
QNN_OP_CONVERT, _qnn_instance);
convert->set_input_tensors({convert_in});
convert->set_output_tensors({convert_out});
mat_mul_tensor_inputs[i] = convert_out;
_input_converts[i] = convert;
}
{
// create output convert node
std::string convert_name("convert_dst");
auto convert_out = mat_mul_tensor_outputs.front();
auto convert_in = std::make_shared<ggml_qnn_tensor>(ggml_qnn_tensor::INTERMEDIATE, convert_name + "_in",
convert_out->get_dimensions(), input_tensor_type,
tensor_rank, device, graph_handle, _qnn_instance);
auto output_convert = std::make_shared<ggml_qnn_connectable_op_config>(
convert_name, QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_CONVERT, _qnn_instance);
output_convert->set_input_tensors({convert_in});
output_convert->set_output_tensors({convert_out});
mat_mul_tensor_outputs[0] = convert_in;
_output_convert = output_convert;
}
// create mat_mul nodes
return create_mat_mul_nodes(device, graph_handle, tensor_rank, _tensor_inputs, mat_mul_tensor_outputs);
return create_mat_mul_nodes(device, graph_handle, tensor_rank, mat_mul_tensor_inputs, mat_mul_tensor_outputs);
}
bool ggml_qnn_matmul_op_config::create_mat_mul_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, const int rank,
@@ -371,7 +422,7 @@ bool ggml_qnn_matmul_op_config::create_mat_mul_nodes(QNNBackend device, Qnn_Grap
// set transpose0 parameters
auto *params_data = reinterpret_cast<const uint8_t *>(kTransposeParamData[rank - 1].data());
const qnn_dimension_array_t param_dims = { (uint32_t)rank, 1, 1, 1 };
const qnn_dimension_array_t param_dims = {(uint32_t)rank, 1, 1, 1};
transpose0->add_tensor_param(QNN_OP_TRANSPOSE_PARAM_PERM, param_dims, 1, params_data, QNN_DATATYPE_UINT_32, device,
graph_handle);
@@ -380,19 +431,19 @@ bool ggml_qnn_matmul_op_config::create_mat_mul_nodes(QNNBackend device, Qnn_Grap
graph_handle);
// set tensor to transpose0
ggml_qnn_tensor_array_t tensors = { tensor_inputs.back() };
ggml_qnn_tensor_array_t tensors = {tensor_inputs.back()};
transpose0->set_input_tensors(tensors);
tensors = { src0_trans };
tensors = {src0_trans};
transpose0->set_output_tensors(tensors);
// set tensor to mat_mul
tensors = { tensor_inputs.front(), src0_trans };
tensors = {tensor_inputs.front(), src0_trans};
mat_mul->set_input_tensors(tensors);
tensors = { dst_trans };
tensors = {dst_trans};
mat_mul->set_output_tensors(tensors);
// set tensor to transpose1
tensors = { dst_trans };
tensors = {dst_trans};
transpose1->set_input_tensors(tensors);
transpose1->set_output_tensors(tensor_outputs);
@@ -459,6 +510,13 @@ ggml_op_constructor_t create_op_constructor(const std::string &op_name) {
QNN_LOG_DEBUG("create QNN_OP_MAT_MUL, name %s\n", instance_name.c_str());
return std::make_unique<qnn::ggml_qnn_matmul_op_config>(instance_name, qnn_instance);
};
} else if (op_name == QNN_OP_TRANSPOSE) {
return [](const std::string &instance_name,
std::shared_ptr<qnn::qnn_instance> qnn_instance) -> std::unique_ptr<qnn::ggml_qnn_op_config> {
return std::make_unique<qnn::ggml_qnn_single_op_config>(instance_name, QNN_OP_PACKAGE_NAME_QTI_AISW,
QNN_OP_TRANSPOSE, QNN_OP_TRANSPOSE_PARAM_PERM,
QNN_DATATYPE_UINT_32, 4 * sizeof(uint32_t), qnn_instance);
};
}
return [op_name](const std::string &instance_name,

View File

@@ -30,11 +30,16 @@ public:
virtual void unbind_output_tensors() = 0;
};
using ggml_op_constructor_t =
std::function<std::unique_ptr<ggml_qnn_op_config>(const std::string &, std::shared_ptr<qnn_instance>)>;
ggml_op_constructor_t create_op_constructor(const std::string &op_name);
class ggml_qnn_op_config_base : public ggml_qnn_op_config {
public:
explicit ggml_qnn_op_config_base(const std::string &name, const std::string &package_name,
const std::string &op_type, std::shared_ptr<qnn_instance> qnn_instance) :
_name(name), _package_name(package_name), _op_type(op_type), _qnn_instance(qnn_instance) {}
const std::string &op_type, std::shared_ptr<qnn_instance> qnn_instance)
: _name(name), _package_name(package_name), _op_type(op_type), _qnn_instance(qnn_instance) {}
void add_scalar_param(const std::string &name, const Qnn_Scalar_t scalar);
bool add_tensor_param(const std::string &name, const qnn_dimension_array_t &dimensions, int rank,
@@ -70,21 +75,34 @@ protected:
class ggml_qnn_single_op_config : public ggml_qnn_op_config_base {
public:
explicit ggml_qnn_single_op_config(const std::string &name, const std::string &package_name,
const std::string &op_type, std::shared_ptr<qnn_instance> qnn_instance) :
ggml_qnn_op_config_base(name, package_name, op_type, qnn_instance) {}
const std::string &op_type, std::shared_ptr<qnn_instance> qnn_instance)
: ggml_qnn_op_config_base(name, package_name, op_type, qnn_instance) {}
explicit ggml_qnn_single_op_config(const std::string &name, const std::string &package_name,
const std::string &op_type, const std::string &param_name,
const Qnn_DataType_t param_type, const size_t param_size,
std::shared_ptr<qnn_instance> qnn_instance)
: ggml_qnn_op_config_base(name, package_name, op_type, qnn_instance),
_param_name(param_name),
_param_type(param_type),
_param_buffer(param_size) {}
bool create_tensors(QNNBackend device, Qnn_GraphHandle_t graph_handle, const ggml_tensor_array_t &tensor_inputs,
const ggml_tensor_array_t &tensor_outputs) override;
private:
const std::string _param_name;
const Qnn_DataType_t _param_type = QNN_DATATYPE_UINT_32;
std::vector<uint8_t> _param_buffer;
DISABLE_COPY(ggml_qnn_single_op_config);
DISABLE_MOVE(ggml_qnn_single_op_config);
};
class ggml_qnn_matmul_op_config : public ggml_qnn_op_config {
public:
ggml_qnn_matmul_op_config(const std::string &name, std::shared_ptr<qnn_instance> qnn_instance) :
_name(name), _qnn_instance(qnn_instance) {}
ggml_qnn_matmul_op_config(const std::string &name, std::shared_ptr<qnn_instance> qnn_instance)
: _name(name), _qnn_instance(qnn_instance) {}
bool create_tensors(QNNBackend device, Qnn_GraphHandle_t graph_handle, const ggml_tensor_array_t &tensor_inputs,
const ggml_tensor_array_t &tensor_outputs) override;
@@ -114,9 +132,4 @@ private:
DISABLE_MOVE(ggml_qnn_matmul_op_config);
};
using ggml_op_constructor_t =
std::function<std::unique_ptr<ggml_qnn_op_config>(const std::string &, std::shared_ptr<qnn_instance>)>;
ggml_op_constructor_t create_op_constructor(const std::string &op_name);
} // namespace qnn
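
Not part of this commit: a hedged usage sketch for the factory declared above. The types and create_op_constructor come from this header; the include paths and the instance argument are assumptions.

#include <memory>
#include <string>

#include "QnnOpDef.h"     // QNN SDK header providing QNN_OP_TRANSPOSE
#include "op-config.hpp"  // assumed path of the header above

// Build the op config used for a GGML_OP_PERMUTE node; the caller supplies an
// already-initialized qnn_instance.
std::unique_ptr<qnn::ggml_qnn_op_config> make_permute_config(
    std::shared_ptr<qnn::qnn_instance> instance) {
    // For QNN_OP_TRANSPOSE, create_op_constructor yields a
    // ggml_qnn_single_op_config carrying a 4-element UINT_32 perm parameter.
    qnn::ggml_op_constructor_t ctor = qnn::create_op_constructor(QNN_OP_TRANSPOSE);
    return ctor("ggml_op_permute_0", instance);
}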

View File

@@ -257,6 +257,7 @@ private:
DISABLE_MOVE(ggml_qnn_tensor);
};
using ggml_qnn_tensor_ptr_t = std::shared_ptr<ggml_qnn_tensor>;
using ggml_qnn_tensor_array_t = std::vector<std::shared_ptr<ggml_qnn_tensor>>;
} // namespace qnn