feat: add QNN_OP_TRANSPOSE (#6)

* redo: add convert nodes

This reverts commit 8448acd5ebf8fe86ab9d25313b64a15c811ef96e.

* align clang format with cann

* rename binary_op -> general_op

because there are some ops that will only take 1 param

* Revert "rename binary_op -> general_op"

This reverts commit 5be63b1a0dc4614457785367dade62158fe46214.

* wip

* add GGML_OP_PERMUTE

* add GGML_OP_VIEW and GGML_OP_GET_ROWS

* wip

* Revert "wip"

This reverts commit 772462ca6cfa01ea31bde725c2da60076ad9385f.
This commit is contained in:
nullname 2024-11-04 23:12:03 +08:00 committed by GitHub
parent 0fec56fd57
commit 8ad86dc703
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 265 additions and 141 deletions

View File

@ -57,30 +57,30 @@ struct qnn_device_caps {
}; };
const qnn_device_caps kDeviceCaps[GGML_QNN_MAX_DEVICES]{ const qnn_device_caps kDeviceCaps[GGML_QNN_MAX_DEVICES]{
{ "qnn-cpu", {// https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/CpuOpDefSupplement.html#matmul
"Qualcomm Kryo CPU", "qnn-cpu",
"libQnnCpu.so", "Qualcomm Kryo CPU",
GGML_BACKEND_DEVICE_TYPE_CPU, "libQnnCpu.so",
{ GGML_TYPE_F32, GGML_BACKEND_DEVICE_TYPE_CPU,
GGML_TYPE_I8 } }, // https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/CpuOpDefSupplement.html#matmul {GGML_TYPE_F32, GGML_TYPE_I8}},
{ "qnn-gpu", {// https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/GpuOpDefSupplement.html#matmul
"Qualcomm Adreno GPU", "qnn-gpu",
"libQnnGpu.so", "Qualcomm Adreno GPU",
GGML_BACKEND_DEVICE_TYPE_GPU, "libQnnGpu.so",
{ GGML_TYPE_F32, GGML_BACKEND_DEVICE_TYPE_GPU,
GGML_TYPE_F16 } }, // https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/GpuOpDefSupplement.html#matmul {GGML_TYPE_F32, GGML_TYPE_F16}},
{ "qnn-npu", {// https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/HtpOpDefSupplement.html#matmul
"Qualcomm NPU", "qnn-npu",
"libQnnHtp.so", "Qualcomm NPU",
GGML_BACKEND_DEVICE_TYPE_GPU, "libQnnHtp.so",
{ GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_I16, GGML_BACKEND_DEVICE_TYPE_GPU,
GGML_TYPE_I8 } }, // https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/HtpOpDefSupplement.html#matmul {GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_I16, GGML_TYPE_I8}},
}; };
class ggml_backend_qnn_buffer_context { class ggml_backend_qnn_buffer_context {
public: public:
ggml_backend_qnn_buffer_context(QNNBackend device, std::shared_ptr<qnn::qnn_instance> instance, size_t size) : ggml_backend_qnn_buffer_context(QNNBackend device, std::shared_ptr<qnn::qnn_instance> instance, size_t size)
_instance(instance), _name(QNN_BACKEND_NAME + std::to_string(device)) { : _instance(instance), _name(QNN_BACKEND_NAME + std::to_string(device)) {
// TODO: fix this for other platforms // TODO: fix this for other platforms
size_t size_page = sysconf(_SC_PAGESIZE); size_t size_page = sysconf(_SC_PAGESIZE);
@ -251,7 +251,7 @@ ggml_backend_buffer_type_t ggml_backend_qnn_buffer_type(ggml_backend_dev_t dev)
if (!ggml_backend_qnn_buffer_type_initialized) { if (!ggml_backend_qnn_buffer_type_initialized) {
for (size_t i = 0; i < GGML_QNN_MAX_DEVICES; i++) { for (size_t i = 0; i < GGML_QNN_MAX_DEVICES; i++) {
auto &context = ggml_backend_qnn_buffer_type_contexts[i]; auto &context = ggml_backend_qnn_buffer_type_contexts[i];
context = { std::string(QNN_BACKEND_NAME) + std::to_string(i) }; context = {std::string(QNN_BACKEND_NAME) + std::to_string(i)};
ggml_backend_qnn_buffer_types[i] = { ggml_backend_qnn_buffer_types[i] = {
/* .iface = */ { /* .iface = */ {
/* .get_name = */ ggml_backend_qnn_buffer_type_name, /* .get_name = */ ggml_backend_qnn_buffer_type_name,
@ -348,8 +348,8 @@ void ggml_backend_qnn_device_get_props(ggml_backend_dev_t dev, struct ggml_backe
} }
ggml_guid_t ggml_backend_qnn_guid() { ggml_guid_t ggml_backend_qnn_guid() {
static ggml_guid guid = { 0x1a, 0x2b, 0x3c, 0x4d, 0x5e, 0x6f, 0x70, 0x81, static ggml_guid guid = {0x1a, 0x2b, 0x3c, 0x4d, 0x5e, 0x6f, 0x70, 0x81,
0x92, 0xa3, 0xb4, 0xc5, 0xd6, 0xe7, 0xf8, 0x09 }; 0x92, 0xa3, 0xb4, 0xc5, 0xd6, 0xe7, 0xf8, 0x09};
return &guid; return &guid;
} }
@ -511,7 +511,7 @@ const ggml_backend_reg_i ggml_backend_qnn_reg_interface = {
} // namespace } // namespace
ggml_backend_reg_t ggml_backend_qnn_reg() { ggml_backend_reg_t ggml_backend_qnn_reg() {
static ggml_backend_qnn_reg_impl reg{ ggml_backend_qnn_reg_interface }; static ggml_backend_qnn_reg_impl reg{ggml_backend_qnn_reg_interface};
static bool initialized = false; static bool initialized = false;
static std::mutex mutex; static std::mutex mutex;

View File

@ -3,16 +3,50 @@ BasedOnStyle: Google
IndentWidth: 4 IndentWidth: 4
AccessModifierOffset: -4 AccessModifierOffset: -4
AlignAfterOpenBracket: Align AlignAfterOpenBracket: Align
AlignOperands: true AlignConsecutiveMacros: false
AlignConsecutiveAssignments: false
AlignConsecutiveDeclarations: false
AlignEscapedNewlines: Left
AlignOperands: true
AlignTrailingComments: true AlignTrailingComments: true
AllowAllArgumentsOnNextLine: true
AllowAllConstructorInitializersOnNextLine: true
AllowAllParametersOfDeclarationOnNextLine: true
AllowShortBlocksOnASingleLine: Never
AllowShortCaseLabelsOnASingleLine: false
AllowShortFunctionsOnASingleLine: All
AllowShortLambdasOnASingleLine: All
AllowShortIfStatementsOnASingleLine: WithoutElse
AllowShortLoopsOnASingleLine: true
AlwaysBreakAfterDefinitionReturnType: None
AlwaysBreakAfterReturnType: None
AlwaysBreakBeforeMultilineStrings: true
AlwaysBreakTemplateDeclarations: Yes
BinPackArguments: true BinPackArguments: true
BinPackParameters: true BinPackParameters: true
BreakBeforeBraces: Custom BraceWrapping:
BreakConstructorInitializers: AfterColon AfterCaseLabel: false
AfterClass: false
AfterControlStatement: false
AfterEnum: false
AfterFunction: false
AfterNamespace: false
AfterObjCDeclaration: false
AfterStruct: false
AfterUnion: false
AfterExternBlock: false
BeforeCatch: false
BeforeElse: false
IndentBraces: false
SplitEmptyFunction: true
SplitEmptyRecord: true
SplitEmptyNamespace: true
ColumnLimit: 120 ColumnLimit: 120
Cpp11BracedListStyle: false ConstructorInitializerIndentWidth: 4
ContinuationIndentWidth: 4
Cpp11BracedListStyle: true
DerivePointerAlignment: false DerivePointerAlignment: false
IncludeCategories: IncludeCategories:
- Regex: '^<.*\.h>' - Regex: '^<.*\.h>'
Priority: 1 Priority: 1
- Regex: '^<.*' - Regex: '^<.*'
@ -28,4 +62,4 @@ MaxEmptyLinesToKeep: 1
PointerAlignment: Right PointerAlignment: Right
SortIncludes: true SortIncludes: true
SpacesBeforeTrailingComments: 1 SpacesBeforeTrailingComments: 1
UseTab: Never UseTab: Never

View File

@ -92,10 +92,10 @@ qnn::ggml_tensor_array_t to_ggml_tensor_array(const std::array<ggml_tensor *, _S
return qnn::ggml_tensor_array_t(array.data(), array.data() + _Size); return qnn::ggml_tensor_array_t(array.data(), array.data() + _Size);
} }
template <size_t _InputSize, size_t _OutputSize> template <size_t _InputSize>
bool execute_graph(qnn::ggml_qnn_graph *graph, const std::array<ggml_tensor *, _InputSize> &inputs, bool execute_graph(qnn::ggml_qnn_graph *graph, const std::array<ggml_tensor *, _InputSize> &inputs,
const std::array<ggml_tensor *, _OutputSize> &outputs) { ggml_tensor *output) {
if (!graph->execute(to_ggml_tensor_array<_InputSize>(inputs), to_ggml_tensor_array<_OutputSize>(outputs))) { if (!graph->execute(to_ggml_tensor_array<_InputSize>(inputs), to_ggml_tensor_array<1>({output}))) {
QNN_LOG_WARN("execute failed\n"); QNN_LOG_WARN("execute failed\n");
return false; return false;
} }
@ -154,37 +154,37 @@ constexpr const char *kGgmlOpToQnnOp[] = {
nullptr, // GGML_OP_MUL_MAT_ID nullptr, // GGML_OP_MUL_MAT_ID
nullptr, // GGML_OP_OUT_PROD nullptr, // GGML_OP_OUT_PROD
nullptr, // GGML_OP_SCALE nullptr, // GGML_OP_SCALE
nullptr, // GGML_OP_SET nullptr, // GGML_OP_SET
nullptr, // GGML_OP_CPY nullptr, // GGML_OP_CPY
nullptr, // GGML_OP_CONT nullptr, // GGML_OP_CONT
nullptr, // GGML_OP_RESHAPE nullptr, // GGML_OP_RESHAPE
nullptr, // GGML_OP_VIEW nullptr, // GGML_OP_VIEW
nullptr, // GGML_OP_PERMUTE QNN_OP_TRANSPOSE, // GGML_OP_PERMUTE
nullptr, // GGML_OP_TRANSPOSE nullptr, // GGML_OP_TRANSPOSE
nullptr, // GGML_OP_GET_ROWS nullptr, // GGML_OP_GET_ROWS
nullptr, // GGML_OP_GET_ROWS_BACK nullptr, // GGML_OP_GET_ROWS_BACK
nullptr, // GGML_OP_DIAG nullptr, // GGML_OP_DIAG
nullptr, // GGML_OP_DIAG_MASK_INF nullptr, // GGML_OP_DIAG_MASK_INF
nullptr, // GGML_OP_DIAG_MASK_ZERO nullptr, // GGML_OP_DIAG_MASK_ZERO
nullptr, // GGML_OP_SOFT_MAX nullptr, // GGML_OP_SOFT_MAX
nullptr, // GGML_OP_SOFT_MAX_BACK nullptr, // GGML_OP_SOFT_MAX_BACK
nullptr, // GGML_OP_ROPE nullptr, // GGML_OP_ROPE
nullptr, // GGML_OP_ROPE_BACK nullptr, // GGML_OP_ROPE_BACK
nullptr, // GGML_OP_CLAMP nullptr, // GGML_OP_CLAMP
nullptr, // GGML_OP_CONV_TRANSPOSE_1D nullptr, // GGML_OP_CONV_TRANSPOSE_1D
nullptr, // GGML_OP_IM2COL nullptr, // GGML_OP_IM2COL
nullptr, // GGML_OP_IM2COL_BACK nullptr, // GGML_OP_IM2COL_BACK
nullptr, // GGML_OP_CONV_TRANSPOSE_2D nullptr, // GGML_OP_CONV_TRANSPOSE_2D
nullptr, // GGML_OP_POOL_1D nullptr, // GGML_OP_POOL_1D
nullptr, // GGML_OP_POOL_2D nullptr, // GGML_OP_POOL_2D
nullptr, // GGML_OP_POOL_2D_BACK nullptr, // GGML_OP_POOL_2D_BACK
nullptr, // GGML_OP_UPSCALE nullptr, // GGML_OP_UPSCALE
nullptr, // GGML_OP_PAD nullptr, // GGML_OP_PAD
nullptr, // GGML_OP_ARANGE nullptr, // GGML_OP_ARANGE
nullptr, // GGML_OP_TIMESTEP_EMBEDDING nullptr, // GGML_OP_TIMESTEP_EMBEDDING
nullptr, // GGML_OP_ARGSORT nullptr, // GGML_OP_ARGSORT
nullptr, // GGML_OP_LEAKY_RELU nullptr, // GGML_OP_LEAKY_RELU
nullptr, // GGML_OP_FLASH_ATTN_EXT nullptr, // GGML_OP_FLASH_ATTN_EXT
nullptr, // GGML_OP_FLASH_ATTN_BACK nullptr, // GGML_OP_FLASH_ATTN_BACK
@ -235,16 +235,16 @@ static_assert(sizeof(kGgmlOpToQnnOp) / sizeof(kGgmlOpToQnnOp[0]) == (GGML_OP_COU
static_assert(kGgmlOpToQnnOp[GGML_UNARY_OP_GELU + kGgmlUnaryOpStart] != nullptr, static_assert(kGgmlOpToQnnOp[GGML_UNARY_OP_GELU + kGgmlUnaryOpStart] != nullptr,
"GGML_UNARY_OP_GELU does not correspond to QNN_OP_GELU"); "GGML_UNARY_OP_GELU does not correspond to QNN_OP_GELU");
template <size_t _InputSize, size_t _OutputSize> template <size_t _InputSize>
qnn::ggml_qnn_graph *get_qnn_graph_from_cache(ggml_backend_qnn_device_context *ctx, size_t op, qnn::ggml_qnn_graph *get_qnn_graph_from_cache(ggml_backend_qnn_device_context *ctx, size_t op,
const std::array<ggml_tensor *, _InputSize> &inputs, const std::array<ggml_tensor *, _InputSize> &inputs,
const std::array<ggml_tensor *, _OutputSize> &outputs) { ggml_tensor *output) {
GGML_ASSERT(op < (GGML_OP_COUNT + GGML_UNARY_OP_COUNT)); GGML_ASSERT(op < (GGML_OP_COUNT + GGML_UNARY_OP_COUNT));
auto &graph_cache = ctx->qnn_graph_cache; auto &graph_cache = ctx->qnn_graph_cache;
const auto *op_name = const auto *op_name =
op < kGgmlUnaryOpStart ? ggml_op_name(ggml_op(op)) : ggml_unary_op_name(ggml_unary_op(op - kGgmlUnaryOpStart)); op < kGgmlUnaryOpStart ? ggml_op_name(ggml_op(op)) : ggml_unary_op_name(ggml_unary_op(op - kGgmlUnaryOpStart));
auto graph_key = get_graph_key<_InputSize, _OutputSize>(op_name, inputs, outputs); auto graph_key = get_graph_key<_InputSize, 1>(op_name, inputs, {output});
auto it = graph_cache.find(graph_key); auto it = graph_cache.find(graph_key);
qnn::ggml_qnn_graph *graph_ptr = nullptr; qnn::ggml_qnn_graph *graph_ptr = nullptr;
if (it != graph_cache.end()) { if (it != graph_cache.end()) {
@ -259,7 +259,7 @@ qnn::ggml_qnn_graph *get_qnn_graph_from_cache(ggml_backend_qnn_device_context *c
auto op_constructor = qnn::create_op_constructor(kGgmlOpToQnnOp[op]); auto op_constructor = qnn::create_op_constructor(kGgmlOpToQnnOp[op]);
if (!graph->build_graph(op_constructor, to_ggml_tensor_array<_InputSize>(inputs), if (!graph->build_graph(op_constructor, to_ggml_tensor_array<_InputSize>(inputs),
to_ggml_tensor_array<_OutputSize>(outputs))) { to_ggml_tensor_array<1>({output}))) {
QNN_LOG_ERROR("build_graph failed\n"); QNN_LOG_ERROR("build_graph failed\n");
return nullptr; return nullptr;
} }
@ -278,9 +278,9 @@ bool qnn_binary_op_impl(ggml_backend_qnn_device_context *ctx, ggml_tensor *src0,
CHECK_PARAMS(ctx, src0, src1, dst); CHECK_PARAMS(ctx, src0, src1, dst);
bool succeed = false; bool succeed = false;
auto *graph_ptr = get_qnn_graph_from_cache<2, 1>(ctx, _GgmlOp, { src0, src1 }, { dst }); auto *graph_ptr = get_qnn_graph_from_cache<2>(ctx, _GgmlOp, {src0, src1}, dst);
if (graph_ptr) { if (graph_ptr) {
succeed = execute_graph<2, 1>(graph_ptr, { src0, src1 }, { dst }); succeed = execute_graph<2>(graph_ptr, {src0, src1}, dst);
} }
#ifndef NDEBUG #ifndef NDEBUG
@ -301,9 +301,9 @@ bool qnn_unary_op_impl(ggml_backend_qnn_device_context *ctx, ggml_tensor *src, g
CHECK_PARAMS(ctx, src, dst); CHECK_PARAMS(ctx, src, dst);
bool succeed = false; bool succeed = false;
auto *graph_ptr = get_qnn_graph_from_cache<1, 1>(ctx, _GgmlOp, { src }, { dst }); auto *graph_ptr = get_qnn_graph_from_cache<1>(ctx, _GgmlOp, {src}, dst);
if (graph_ptr) { if (graph_ptr) {
succeed = execute_graph<1, 1>(graph_ptr, { src }, { dst }); succeed = execute_graph<1>(graph_ptr, {src}, dst);
} }
#ifndef NDEBUG #ifndef NDEBUG
@ -315,6 +315,22 @@ bool qnn_unary_op_impl(ggml_backend_qnn_device_context *ctx, ggml_tensor *src, g
return succeed; return succeed;
} }
bool qnn_unary_nop_impl(ggml_backend_qnn_device_context *ctx, ggml_tensor *src, ggml_tensor *dst) {
GGML_UNUSED(ctx);
GGML_UNUSED(src);
GGML_UNUSED(dst);
return true;
}
bool qnn_binary_nop_impl(ggml_backend_qnn_device_context *ctx, ggml_tensor *src0, ggml_tensor *src1, ggml_tensor *dst) {
GGML_UNUSED(ctx);
GGML_UNUSED(src0);
GGML_UNUSED(src1);
GGML_UNUSED(dst);
return true;
}
constexpr const ggml_qnn_unary_op_t kQnnUnaryOpsTable[] = { constexpr const ggml_qnn_unary_op_t kQnnUnaryOpsTable[] = {
nullptr, // GGML_OP_NONE nullptr, // GGML_OP_NONE
nullptr, // GGML_OP_DUP nullptr, // GGML_OP_DUP
@ -347,37 +363,37 @@ constexpr const ggml_qnn_unary_op_t kQnnUnaryOpsTable[] = {
nullptr, // GGML_OP_MUL_MAT_ID nullptr, // GGML_OP_MUL_MAT_ID
nullptr, // GGML_OP_OUT_PROD nullptr, // GGML_OP_OUT_PROD
nullptr, // GGML_OP_SCALE nullptr, // GGML_OP_SCALE
nullptr, // GGML_OP_SET nullptr, // GGML_OP_SET
nullptr, // GGML_OP_CPY nullptr, // GGML_OP_CPY
nullptr, // GGML_OP_CONT nullptr, // GGML_OP_CONT
nullptr, // GGML_OP_RESHAPE nullptr, // GGML_OP_RESHAPE
nullptr, // GGML_OP_VIEW qnn_unary_nop_impl, // GGML_OP_VIEW
nullptr, // GGML_OP_PERMUTE qnn_unary_op_impl<GGML_OP_PERMUTE>, // GGML_OP_PERMUTE
nullptr, // GGML_OP_TRANSPOSE nullptr, // GGML_OP_TRANSPOSE
nullptr, // GGML_OP_GET_ROWS qnn_unary_nop_impl, // GGML_OP_GET_ROWS
nullptr, // GGML_OP_GET_ROWS_BACK nullptr, // GGML_OP_GET_ROWS_BACK
nullptr, // GGML_OP_DIAG nullptr, // GGML_OP_DIAG
nullptr, // GGML_OP_DIAG_MASK_INF nullptr, // GGML_OP_DIAG_MASK_INF
nullptr, // GGML_OP_DIAG_MASK_ZERO nullptr, // GGML_OP_DIAG_MASK_ZERO
nullptr, // GGML_OP_SOFT_MAX nullptr, // GGML_OP_SOFT_MAX
nullptr, // GGML_OP_SOFT_MAX_BACK nullptr, // GGML_OP_SOFT_MAX_BACK
nullptr, // GGML_OP_ROPE nullptr, // GGML_OP_ROPE
nullptr, // GGML_OP_ROPE_BACK nullptr, // GGML_OP_ROPE_BACK
nullptr, // GGML_OP_CLAMP nullptr, // GGML_OP_CLAMP
nullptr, // GGML_OP_CONV_TRANSPOSE_1D nullptr, // GGML_OP_CONV_TRANSPOSE_1D
nullptr, // GGML_OP_IM2COL nullptr, // GGML_OP_IM2COL
nullptr, // GGML_OP_IM2COL_BACK nullptr, // GGML_OP_IM2COL_BACK
nullptr, // GGML_OP_CONV_TRANSPOSE_2D nullptr, // GGML_OP_CONV_TRANSPOSE_2D
nullptr, // GGML_OP_POOL_1D nullptr, // GGML_OP_POOL_1D
nullptr, // GGML_OP_POOL_2D nullptr, // GGML_OP_POOL_2D
nullptr, // GGML_OP_POOL_2D_BACK nullptr, // GGML_OP_POOL_2D_BACK
nullptr, // GGML_OP_UPSCALE nullptr, // GGML_OP_UPSCALE
nullptr, // GGML_OP_PAD nullptr, // GGML_OP_PAD
nullptr, // GGML_OP_ARANGE nullptr, // GGML_OP_ARANGE
nullptr, // GGML_OP_TIMESTEP_EMBEDDING nullptr, // GGML_OP_TIMESTEP_EMBEDDING
nullptr, // GGML_OP_ARGSORT nullptr, // GGML_OP_ARGSORT
nullptr, // GGML_OP_LEAKY_RELU nullptr, // GGML_OP_LEAKY_RELU
nullptr, // GGML_OP_FLASH_ATTN_EXT nullptr, // GGML_OP_FLASH_ATTN_EXT
nullptr, // GGML_OP_FLASH_ATTN_BACK nullptr, // GGML_OP_FLASH_ATTN_BACK
@ -522,18 +538,24 @@ static_assert(sizeof(kQnnBinaryOpsTable) / sizeof(kQnnBinaryOpsTable[0]) == GGML
"GGML_OP_COUNT does not match the size of the kQnnBinaryOpsTable table"); "GGML_OP_COUNT does not match the size of the kQnnBinaryOpsTable table");
bool ggml_qnn_supports_tensor(ggml_backend_qnn_device_context *ctx, const ggml_tensor *tensor) { bool ggml_qnn_supports_tensor(ggml_backend_qnn_device_context *ctx, const ggml_tensor *tensor) {
if (!tensor) {
QNN_LOG_DEBUG("tensor is nullptr");
return false;
}
auto *type_name = ggml_get_type_traits(tensor->type)->type_name;
switch (tensor->type) { switch (tensor->type) {
case GGML_TYPE_F32: case GGML_TYPE_F32:
case GGML_TYPE_F16: case GGML_TYPE_F16:
case GGML_TYPE_Q8_0: case GGML_TYPE_Q8_0:
case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_0:
if (ctx->supported_types.find(tensor->type) == ctx->supported_types.end()) { if (ctx->supported_types.find(tensor->type) == ctx->supported_types.end()) {
QNN_LOG_DEBUG("unsupported data type GGML_TYPE_F16 for cpu backend"); QNN_LOG_DEBUG("unsupported data type %s for backend %d", type_name, (int)ctx->device);
return false; return false;
} }
break; break;
default: default:
QNN_LOG_DEBUG("unsupported data type %d", tensor->type); QNN_LOG_DEBUG("unsupported data type %s", type_name);
return false; return false;
} }
@ -591,19 +613,15 @@ bool ggml_qnn_supports_op(ggml_backend_qnn_device_context *ctx, const ggml_tenso
} }
} else { } else {
if (!kQnnUnaryOpsTable[op->op] && !kQnnBinaryOpsTable[op->op]) { if (!kQnnUnaryOpsTable[op->op] && !kQnnBinaryOpsTable[op->op]) {
QNN_LOG_DEBUG("unsupported op %d", op->op); QNN_LOG_DEBUG("[%s] unsupported op", ggml_op_name(op->op));
return false; return false;
} }
auto *src0 = op->src[0]; auto *src0 = op->src[0];
auto *src1 = op->src[1]; auto *src1 = op->src[1];
if (!src0 || !src1) { if (!ggml_qnn_supports_tensor(ctx, src0) || !ggml_qnn_supports_tensor(ctx, op) ||
QNN_LOG_DEBUG("src0 or src1 is nullptr"); (kQnnBinaryOpsTable[op->op] && !ggml_qnn_supports_tensor(ctx, src1))) {
return false; QNN_LOG_DEBUG("[%s] unsupported tensor", ggml_op_name(op->op));
}
if (!ggml_qnn_supports_tensor(ctx, src0) || !ggml_qnn_supports_tensor(ctx, src1) ||
!ggml_qnn_supports_tensor(ctx, op)) {
return false; return false;
} }
@ -642,7 +660,7 @@ bool ggml_qnn_forward(ggml_backend_qnn_device_context *ctx, struct ggml_tensor *
return binary_op(ctx, tensor->src[0], tensor->src[1], tensor); return binary_op(ctx, tensor->src[0], tensor->src[1], tensor);
} }
QNN_LOG_WARN("unsupported op %s", ggml_op_desc(tensor)); QNN_LOG_WARN("[forward]unsupported op %s", ggml_op_desc(tensor));
return false; return false;
} }

View File

@ -7,10 +7,10 @@
namespace { namespace {
constexpr const qnn::qnn_dimension_array_t kTransposeParamData[GGML_MAX_DIMS] = { constexpr const qnn::qnn_dimension_array_t kTransposeParamData[GGML_MAX_DIMS] = {
{ 0 }, {0},
{ 1, 0 }, {1, 0},
{ 0, 2, 1 }, {0, 2, 1},
{ 0, 1, 3, 2 }, {0, 1, 3, 2},
}; };
qnn::qnn_dimension_array_t get_transposed_dimensions(const qnn::qnn_dimension_array_t &dimensions, int rank) { qnn::qnn_dimension_array_t get_transposed_dimensions(const qnn::qnn_dimension_array_t &dimensions, int rank) {
@ -96,9 +96,8 @@ bool bind_tensors(const qnn::ggml_tensor_array_t &ggml_tensors, qnn::ggml_qnn_te
class ggml_qnn_connectable_op_config : public qnn::ggml_qnn_op_config_base { class ggml_qnn_connectable_op_config : public qnn::ggml_qnn_op_config_base {
public: public:
explicit ggml_qnn_connectable_op_config(const std::string &name, const std::string &package_name, explicit ggml_qnn_connectable_op_config(const std::string &name, const std::string &package_name,
const std::string &op_type, const std::string &op_type, std::shared_ptr<qnn::qnn_instance> qnn_instance)
std::shared_ptr<qnn::qnn_instance> qnn_instance) : : ggml_qnn_op_config_base(name, package_name, op_type, qnn_instance) {}
ggml_qnn_op_config_base(name, package_name, op_type, qnn_instance) {}
bool create_tensors(QNNBackend device, Qnn_GraphHandle_t graph_handle, bool create_tensors(QNNBackend device, Qnn_GraphHandle_t graph_handle,
const qnn::ggml_tensor_array_t &tensor_inputs, const qnn::ggml_tensor_array_t &tensor_inputs,
@ -264,11 +263,22 @@ bool ggml_qnn_single_op_config::create_tensors(QNNBackend device, Qnn_GraphHandl
const ggml_tensor_array_t &tensor_inputs, const ggml_tensor_array_t &tensor_inputs,
const ggml_tensor_array_t &tensor_outputs) { const ggml_tensor_array_t &tensor_outputs) {
const auto tensor_rank = get_rank(tensor_inputs, tensor_outputs); const auto tensor_rank = get_rank(tensor_inputs, tensor_outputs);
tensor_common_params params = { "src", tensor_rank, true, device, graph_handle, _qnn_instance }; tensor_common_params params = {"src", tensor_rank, true, device, graph_handle, _qnn_instance};
create_tensors_from_ggml_tensor(params, tensor_inputs, &_tensor_inputs, &_qnn_tensor_inputs); create_tensors_from_ggml_tensor(params, tensor_inputs, &_tensor_inputs, &_qnn_tensor_inputs);
params.name_prefix = "dst"; params.name_prefix = "dst";
params.is_input = false; params.is_input = false;
create_tensors_from_ggml_tensor(params, tensor_outputs, &_tensor_outputs, &_qnn_tensor_outputs); create_tensors_from_ggml_tensor(params, tensor_outputs, &_tensor_outputs, &_qnn_tensor_outputs);
if (_param_buffer.size() > 0) {
// handle parameters in output tensor
auto *params = tensor_outputs.front()->op_params;
memcpy(_param_buffer.data(), params, _param_buffer.size());
const uint32_t count = uint32_t(_param_buffer.size() / qnn_datatype_size(_param_type));
const qnn_dimension_array_t param_dims = {count, 1, 1, 1};
add_tensor_param(_param_name, param_dims, 1, _param_buffer.data(), _param_type, device, graph_handle);
}
return true; return true;
} }
@ -281,7 +291,7 @@ bool ggml_qnn_matmul_op_config::create_tensors(QNNBackend device, Qnn_GraphHandl
GGML_ASSERT(tensor_rank >= 2); GGML_ASSERT(tensor_rank >= 2);
// create input tensors // create input tensors
tensor_common_params params = { "src", tensor_rank, true, device, graph_handle, _qnn_instance }; tensor_common_params params = {"src", tensor_rank, true, device, graph_handle, _qnn_instance};
create_tensors_from_ggml_tensor(params, tensor_inputs, &_tensor_inputs, &_qnn_tensor_inputs); create_tensors_from_ggml_tensor(params, tensor_inputs, &_tensor_inputs, &_qnn_tensor_inputs);
// create output tensor // create output tensor
@ -290,8 +300,49 @@ bool ggml_qnn_matmul_op_config::create_tensors(QNNBackend device, Qnn_GraphHandl
params.is_input = false; params.is_input = false;
create_tensors_from_ggml_tensor(params, tensor_outputs, &mat_mul_tensor_outputs, nullptr); create_tensors_from_ggml_tensor(params, tensor_outputs, &mat_mul_tensor_outputs, nullptr);
if (device == QNN_BACKEND_GPU) {
// there's no convert op for GPU, so we should create matmul nodes directly.
return create_mat_mul_nodes(device, graph_handle, tensor_rank, _tensor_inputs, mat_mul_tensor_outputs);
}
// create tensors for convert node
ggml_qnn_tensor_array_t mat_mul_tensor_inputs = _tensor_inputs;
auto input_tensor_type = get_tensor_type(mat_mul_tensor_inputs);
QNN_LOG_DEBUG("matmul input tensor type: %s\n", qnn_datatype_to_string(input_tensor_type));
_input_converts.resize(mat_mul_tensor_inputs.size());
for (size_t i = 0; i < mat_mul_tensor_inputs.size(); ++i) {
// create input convert nodes
std::string convert_name("convert_src" + std::to_string(i));
auto convert_in = mat_mul_tensor_inputs[i];
auto convert_out = std::make_shared<ggml_qnn_tensor>(ggml_qnn_tensor::INTERMEDIATE, convert_name + "_out",
convert_in->get_dimensions(), input_tensor_type,
tensor_rank, device, graph_handle, _qnn_instance);
auto convert = std::make_shared<ggml_qnn_connectable_op_config>(convert_name, QNN_OP_PACKAGE_NAME_QTI_AISW,
QNN_OP_CONVERT, _qnn_instance);
convert->set_input_tensors({convert_in});
convert->set_output_tensors({convert_out});
mat_mul_tensor_inputs[i] = convert_out;
_input_converts[i] = convert;
}
{
// create output convert node
std::string convert_name("convert_dst");
auto convert_out = mat_mul_tensor_outputs.front();
auto convert_in = std::make_shared<ggml_qnn_tensor>(ggml_qnn_tensor::INTERMEDIATE, convert_name + "_in",
convert_out->get_dimensions(), input_tensor_type,
tensor_rank, device, graph_handle, _qnn_instance);
auto output_convert = std::make_shared<ggml_qnn_connectable_op_config>(
convert_name, QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_CONVERT, _qnn_instance);
output_convert->set_input_tensors({convert_in});
output_convert->set_output_tensors({convert_out});
mat_mul_tensor_outputs[0] = convert_in;
_output_convert = output_convert;
}
// create mat_mul nodes // create mat_mul nodes
return create_mat_mul_nodes(device, graph_handle, tensor_rank, _tensor_inputs, mat_mul_tensor_outputs); return create_mat_mul_nodes(device, graph_handle, tensor_rank, mat_mul_tensor_inputs, mat_mul_tensor_outputs);
} }
bool ggml_qnn_matmul_op_config::create_mat_mul_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, const int rank, bool ggml_qnn_matmul_op_config::create_mat_mul_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, const int rank,
@ -371,7 +422,7 @@ bool ggml_qnn_matmul_op_config::create_mat_mul_nodes(QNNBackend device, Qnn_Grap
// set transpose0 parameters // set transpose0 parameters
auto *params_data = reinterpret_cast<const uint8_t *>(kTransposeParamData[rank - 1].data()); auto *params_data = reinterpret_cast<const uint8_t *>(kTransposeParamData[rank - 1].data());
const qnn_dimension_array_t param_dims = { (uint32_t)rank, 1, 1, 1 }; const qnn_dimension_array_t param_dims = {(uint32_t)rank, 1, 1, 1};
transpose0->add_tensor_param(QNN_OP_TRANSPOSE_PARAM_PERM, param_dims, 1, params_data, QNN_DATATYPE_UINT_32, device, transpose0->add_tensor_param(QNN_OP_TRANSPOSE_PARAM_PERM, param_dims, 1, params_data, QNN_DATATYPE_UINT_32, device,
graph_handle); graph_handle);
@ -380,19 +431,19 @@ bool ggml_qnn_matmul_op_config::create_mat_mul_nodes(QNNBackend device, Qnn_Grap
graph_handle); graph_handle);
// set tensor to transpose0 // set tensor to transpose0
ggml_qnn_tensor_array_t tensors = { tensor_inputs.back() }; ggml_qnn_tensor_array_t tensors = {tensor_inputs.back()};
transpose0->set_input_tensors(tensors); transpose0->set_input_tensors(tensors);
tensors = { src0_trans }; tensors = {src0_trans};
transpose0->set_output_tensors(tensors); transpose0->set_output_tensors(tensors);
// set tensor to mat_mul // set tensor to mat_mul
tensors = { tensor_inputs.front(), src0_trans }; tensors = {tensor_inputs.front(), src0_trans};
mat_mul->set_input_tensors(tensors); mat_mul->set_input_tensors(tensors);
tensors = { dst_trans }; tensors = {dst_trans};
mat_mul->set_output_tensors(tensors); mat_mul->set_output_tensors(tensors);
// set tensor to transpose1 // set tensor to transpose1
tensors = { dst_trans }; tensors = {dst_trans};
transpose1->set_input_tensors(tensors); transpose1->set_input_tensors(tensors);
transpose1->set_output_tensors(tensor_outputs); transpose1->set_output_tensors(tensor_outputs);
@ -459,6 +510,13 @@ ggml_op_constructor_t create_op_constructor(const std::string &op_name) {
QNN_LOG_DEBUG("create QNN_OP_MAT_MUL, name %s\n", instance_name.c_str()); QNN_LOG_DEBUG("create QNN_OP_MAT_MUL, name %s\n", instance_name.c_str());
return std::make_unique<qnn::ggml_qnn_matmul_op_config>(instance_name, qnn_instance); return std::make_unique<qnn::ggml_qnn_matmul_op_config>(instance_name, qnn_instance);
}; };
} else if (op_name == QNN_OP_TRANSPOSE) {
return [](const std::string &instance_name,
std::shared_ptr<qnn::qnn_instance> qnn_instance) -> std::unique_ptr<qnn::ggml_qnn_op_config> {
return std::make_unique<qnn::ggml_qnn_single_op_config>(instance_name, QNN_OP_PACKAGE_NAME_QTI_AISW,
QNN_OP_TRANSPOSE, QNN_OP_TRANSPOSE_PARAM_PERM,
QNN_DATATYPE_UINT_32, 4 * sizeof(uint32_t), qnn_instance);
};
} }
return [op_name](const std::string &instance_name, return [op_name](const std::string &instance_name,

View File

@ -30,11 +30,16 @@ public:
virtual void unbind_output_tensors() = 0; virtual void unbind_output_tensors() = 0;
}; };
using ggml_op_constructor_t =
std::function<std::unique_ptr<ggml_qnn_op_config>(const std::string &, std::shared_ptr<qnn_instance>)>;
ggml_op_constructor_t create_op_constructor(const std::string &op_name);
class ggml_qnn_op_config_base : public ggml_qnn_op_config { class ggml_qnn_op_config_base : public ggml_qnn_op_config {
public: public:
explicit ggml_qnn_op_config_base(const std::string &name, const std::string &package_name, explicit ggml_qnn_op_config_base(const std::string &name, const std::string &package_name,
const std::string &op_type, std::shared_ptr<qnn_instance> qnn_instance) : const std::string &op_type, std::shared_ptr<qnn_instance> qnn_instance)
_name(name), _package_name(package_name), _op_type(op_type), _qnn_instance(qnn_instance) {} : _name(name), _package_name(package_name), _op_type(op_type), _qnn_instance(qnn_instance) {}
void add_scalar_param(const std::string &name, const Qnn_Scalar_t scalar); void add_scalar_param(const std::string &name, const Qnn_Scalar_t scalar);
bool add_tensor_param(const std::string &name, const qnn_dimension_array_t &dimensions, int rank, bool add_tensor_param(const std::string &name, const qnn_dimension_array_t &dimensions, int rank,
@ -70,21 +75,34 @@ protected:
class ggml_qnn_single_op_config : public ggml_qnn_op_config_base { class ggml_qnn_single_op_config : public ggml_qnn_op_config_base {
public: public:
explicit ggml_qnn_single_op_config(const std::string &name, const std::string &package_name, explicit ggml_qnn_single_op_config(const std::string &name, const std::string &package_name,
const std::string &op_type, std::shared_ptr<qnn_instance> qnn_instance) : const std::string &op_type, std::shared_ptr<qnn_instance> qnn_instance)
ggml_qnn_op_config_base(name, package_name, op_type, qnn_instance) {} : ggml_qnn_op_config_base(name, package_name, op_type, qnn_instance) {}
explicit ggml_qnn_single_op_config(const std::string &name, const std::string &package_name,
const std::string &op_type, const std::string &param_name,
const Qnn_DataType_t param_type, const size_t param_size,
std::shared_ptr<qnn_instance> qnn_instance)
: ggml_qnn_op_config_base(name, package_name, op_type, qnn_instance),
_param_name(param_name),
_param_type(param_type),
_param_buffer(param_size) {}
bool create_tensors(QNNBackend device, Qnn_GraphHandle_t graph_handle, const ggml_tensor_array_t &tensor_inputs, bool create_tensors(QNNBackend device, Qnn_GraphHandle_t graph_handle, const ggml_tensor_array_t &tensor_inputs,
const ggml_tensor_array_t &tensor_outputs) override; const ggml_tensor_array_t &tensor_outputs) override;
private: private:
const std::string _param_name;
const Qnn_DataType_t _param_type = QNN_DATATYPE_UINT_32;
std::vector<uint8_t> _param_buffer;
DISABLE_COPY(ggml_qnn_single_op_config); DISABLE_COPY(ggml_qnn_single_op_config);
DISABLE_MOVE(ggml_qnn_single_op_config); DISABLE_MOVE(ggml_qnn_single_op_config);
}; };
class ggml_qnn_matmul_op_config : public ggml_qnn_op_config { class ggml_qnn_matmul_op_config : public ggml_qnn_op_config {
public: public:
ggml_qnn_matmul_op_config(const std::string &name, std::shared_ptr<qnn_instance> qnn_instance) : ggml_qnn_matmul_op_config(const std::string &name, std::shared_ptr<qnn_instance> qnn_instance)
_name(name), _qnn_instance(qnn_instance) {} : _name(name), _qnn_instance(qnn_instance) {}
bool create_tensors(QNNBackend device, Qnn_GraphHandle_t graph_handle, const ggml_tensor_array_t &tensor_inputs, bool create_tensors(QNNBackend device, Qnn_GraphHandle_t graph_handle, const ggml_tensor_array_t &tensor_inputs,
const ggml_tensor_array_t &tensor_outputs) override; const ggml_tensor_array_t &tensor_outputs) override;
@ -114,9 +132,4 @@ private:
DISABLE_MOVE(ggml_qnn_matmul_op_config); DISABLE_MOVE(ggml_qnn_matmul_op_config);
}; };
using ggml_op_constructor_t =
std::function<std::unique_ptr<ggml_qnn_op_config>(const std::string &, std::shared_ptr<qnn_instance>)>;
ggml_op_constructor_t create_op_constructor(const std::string &op_name);
} // namespace qnn } // namespace qnn

View File

@ -257,6 +257,7 @@ private:
DISABLE_MOVE(ggml_qnn_tensor); DISABLE_MOVE(ggml_qnn_tensor);
}; };
using ggml_qnn_tensor_ptr_t = std::shared_ptr<ggml_qnn_tensor>;
using ggml_qnn_tensor_array_t = std::vector<std::shared_ptr<ggml_qnn_tensor>>; using ggml_qnn_tensor_array_t = std::vector<std::shared_ptr<ggml_qnn_tensor>>;
} // namespace qnn } // namespace qnn