feat: add QNN_OP_TRANSPOSE (#6)
* redo: add convert nodes This reverts commit 8448acd5ebf8fe86ab9d25313b64a15c811ef96e. * align clang format with cann * rename binary_op -> general_op because there are some ops that take only 1 param * Revert "rename binary_op -> general_op" This reverts commit 5be63b1a0dc4614457785367dade62158fe46214. * wip * add GGML_OP_PERMUTE * add GGML_OP_VIEW and GGML_OP_GET_ROWS * wip * Revert "wip" This reverts commit 772462ca6cfa01ea31bde725c2da60076ad9385f.
This commit is contained in:
parent
0fec56fd57
commit
8ad86dc703
|
|
@ -57,30 +57,30 @@ struct qnn_device_caps {
|
|||
};
|
||||
|
||||
const qnn_device_caps kDeviceCaps[GGML_QNN_MAX_DEVICES]{
|
||||
{ "qnn-cpu",
|
||||
"Qualcomm Kryo CPU",
|
||||
"libQnnCpu.so",
|
||||
GGML_BACKEND_DEVICE_TYPE_CPU,
|
||||
{ GGML_TYPE_F32,
|
||||
GGML_TYPE_I8 } }, // https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/CpuOpDefSupplement.html#matmul
|
||||
{ "qnn-gpu",
|
||||
"Qualcomm Adreno GPU",
|
||||
"libQnnGpu.so",
|
||||
GGML_BACKEND_DEVICE_TYPE_GPU,
|
||||
{ GGML_TYPE_F32,
|
||||
GGML_TYPE_F16 } }, // https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/GpuOpDefSupplement.html#matmul
|
||||
{ "qnn-npu",
|
||||
"Qualcomm NPU",
|
||||
"libQnnHtp.so",
|
||||
GGML_BACKEND_DEVICE_TYPE_GPU,
|
||||
{ GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_I16,
|
||||
GGML_TYPE_I8 } }, // https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/HtpOpDefSupplement.html#matmul
|
||||
{// https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/CpuOpDefSupplement.html#matmul
|
||||
"qnn-cpu",
|
||||
"Qualcomm Kryo CPU",
|
||||
"libQnnCpu.so",
|
||||
GGML_BACKEND_DEVICE_TYPE_CPU,
|
||||
{GGML_TYPE_F32, GGML_TYPE_I8}},
|
||||
{// https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/GpuOpDefSupplement.html#matmul
|
||||
"qnn-gpu",
|
||||
"Qualcomm Adreno GPU",
|
||||
"libQnnGpu.so",
|
||||
GGML_BACKEND_DEVICE_TYPE_GPU,
|
||||
{GGML_TYPE_F32, GGML_TYPE_F16}},
|
||||
{// https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/HtpOpDefSupplement.html#matmul
|
||||
"qnn-npu",
|
||||
"Qualcomm NPU",
|
||||
"libQnnHtp.so",
|
||||
GGML_BACKEND_DEVICE_TYPE_GPU,
|
||||
{GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_I16, GGML_TYPE_I8}},
|
||||
};
|
||||
|
||||
class ggml_backend_qnn_buffer_context {
|
||||
public:
|
||||
ggml_backend_qnn_buffer_context(QNNBackend device, std::shared_ptr<qnn::qnn_instance> instance, size_t size) :
|
||||
_instance(instance), _name(QNN_BACKEND_NAME + std::to_string(device)) {
|
||||
ggml_backend_qnn_buffer_context(QNNBackend device, std::shared_ptr<qnn::qnn_instance> instance, size_t size)
|
||||
: _instance(instance), _name(QNN_BACKEND_NAME + std::to_string(device)) {
|
||||
|
||||
// TODO: fix this for other platforms
|
||||
size_t size_page = sysconf(_SC_PAGESIZE);
|
||||
|
|
@ -251,7 +251,7 @@ ggml_backend_buffer_type_t ggml_backend_qnn_buffer_type(ggml_backend_dev_t dev)
|
|||
if (!ggml_backend_qnn_buffer_type_initialized) {
|
||||
for (size_t i = 0; i < GGML_QNN_MAX_DEVICES; i++) {
|
||||
auto &context = ggml_backend_qnn_buffer_type_contexts[i];
|
||||
context = { std::string(QNN_BACKEND_NAME) + std::to_string(i) };
|
||||
context = {std::string(QNN_BACKEND_NAME) + std::to_string(i)};
|
||||
ggml_backend_qnn_buffer_types[i] = {
|
||||
/* .iface = */ {
|
||||
/* .get_name = */ ggml_backend_qnn_buffer_type_name,
|
||||
|
|
@ -348,8 +348,8 @@ void ggml_backend_qnn_device_get_props(ggml_backend_dev_t dev, struct ggml_backe
|
|||
}
|
||||
|
||||
ggml_guid_t ggml_backend_qnn_guid() {
|
||||
static ggml_guid guid = { 0x1a, 0x2b, 0x3c, 0x4d, 0x5e, 0x6f, 0x70, 0x81,
|
||||
0x92, 0xa3, 0xb4, 0xc5, 0xd6, 0xe7, 0xf8, 0x09 };
|
||||
static ggml_guid guid = {0x1a, 0x2b, 0x3c, 0x4d, 0x5e, 0x6f, 0x70, 0x81,
|
||||
0x92, 0xa3, 0xb4, 0xc5, 0xd6, 0xe7, 0xf8, 0x09};
|
||||
return &guid;
|
||||
}
|
||||
|
||||
|
|
@ -511,7 +511,7 @@ const ggml_backend_reg_i ggml_backend_qnn_reg_interface = {
|
|||
} // namespace
|
||||
|
||||
ggml_backend_reg_t ggml_backend_qnn_reg() {
|
||||
static ggml_backend_qnn_reg_impl reg{ ggml_backend_qnn_reg_interface };
|
||||
static ggml_backend_qnn_reg_impl reg{ggml_backend_qnn_reg_interface};
|
||||
static bool initialized = false;
|
||||
static std::mutex mutex;
|
||||
|
||||
|
|
|
|||
|
|
@ -3,16 +3,50 @@ BasedOnStyle: Google
|
|||
IndentWidth: 4
|
||||
AccessModifierOffset: -4
|
||||
AlignAfterOpenBracket: Align
|
||||
AlignOperands: true
|
||||
AlignConsecutiveMacros: false
|
||||
AlignConsecutiveAssignments: false
|
||||
AlignConsecutiveDeclarations: false
|
||||
AlignEscapedNewlines: Left
|
||||
AlignOperands: true
|
||||
AlignTrailingComments: true
|
||||
AllowAllArgumentsOnNextLine: true
|
||||
AllowAllConstructorInitializersOnNextLine: true
|
||||
AllowAllParametersOfDeclarationOnNextLine: true
|
||||
AllowShortBlocksOnASingleLine: Never
|
||||
AllowShortCaseLabelsOnASingleLine: false
|
||||
AllowShortFunctionsOnASingleLine: All
|
||||
AllowShortLambdasOnASingleLine: All
|
||||
AllowShortIfStatementsOnASingleLine: WithoutElse
|
||||
AllowShortLoopsOnASingleLine: true
|
||||
AlwaysBreakAfterDefinitionReturnType: None
|
||||
AlwaysBreakAfterReturnType: None
|
||||
AlwaysBreakBeforeMultilineStrings: true
|
||||
AlwaysBreakTemplateDeclarations: Yes
|
||||
BinPackArguments: true
|
||||
BinPackParameters: true
|
||||
BreakBeforeBraces: Custom
|
||||
BreakConstructorInitializers: AfterColon
|
||||
BraceWrapping:
|
||||
AfterCaseLabel: false
|
||||
AfterClass: false
|
||||
AfterControlStatement: false
|
||||
AfterEnum: false
|
||||
AfterFunction: false
|
||||
AfterNamespace: false
|
||||
AfterObjCDeclaration: false
|
||||
AfterStruct: false
|
||||
AfterUnion: false
|
||||
AfterExternBlock: false
|
||||
BeforeCatch: false
|
||||
BeforeElse: false
|
||||
IndentBraces: false
|
||||
SplitEmptyFunction: true
|
||||
SplitEmptyRecord: true
|
||||
SplitEmptyNamespace: true
|
||||
ColumnLimit: 120
|
||||
Cpp11BracedListStyle: false
|
||||
ConstructorInitializerIndentWidth: 4
|
||||
ContinuationIndentWidth: 4
|
||||
Cpp11BracedListStyle: true
|
||||
DerivePointerAlignment: false
|
||||
IncludeCategories:
|
||||
IncludeCategories:
|
||||
- Regex: '^<.*\.h>'
|
||||
Priority: 1
|
||||
- Regex: '^<.*'
|
||||
|
|
@ -28,4 +62,4 @@ MaxEmptyLinesToKeep: 1
|
|||
PointerAlignment: Right
|
||||
SortIncludes: true
|
||||
SpacesBeforeTrailingComments: 1
|
||||
UseTab: Never
|
||||
UseTab: Never
|
||||
|
|
|
|||
|
|
@ -92,10 +92,10 @@ qnn::ggml_tensor_array_t to_ggml_tensor_array(const std::array<ggml_tensor *, _S
|
|||
return qnn::ggml_tensor_array_t(array.data(), array.data() + _Size);
|
||||
}
|
||||
|
||||
template <size_t _InputSize, size_t _OutputSize>
|
||||
template <size_t _InputSize>
|
||||
bool execute_graph(qnn::ggml_qnn_graph *graph, const std::array<ggml_tensor *, _InputSize> &inputs,
|
||||
const std::array<ggml_tensor *, _OutputSize> &outputs) {
|
||||
if (!graph->execute(to_ggml_tensor_array<_InputSize>(inputs), to_ggml_tensor_array<_OutputSize>(outputs))) {
|
||||
ggml_tensor *output) {
|
||||
if (!graph->execute(to_ggml_tensor_array<_InputSize>(inputs), to_ggml_tensor_array<1>({output}))) {
|
||||
QNN_LOG_WARN("execute failed\n");
|
||||
return false;
|
||||
}
|
||||
|
|
@ -154,37 +154,37 @@ constexpr const char *kGgmlOpToQnnOp[] = {
|
|||
nullptr, // GGML_OP_MUL_MAT_ID
|
||||
nullptr, // GGML_OP_OUT_PROD
|
||||
|
||||
nullptr, // GGML_OP_SCALE
|
||||
nullptr, // GGML_OP_SET
|
||||
nullptr, // GGML_OP_CPY
|
||||
nullptr, // GGML_OP_CONT
|
||||
nullptr, // GGML_OP_RESHAPE
|
||||
nullptr, // GGML_OP_VIEW
|
||||
nullptr, // GGML_OP_PERMUTE
|
||||
nullptr, // GGML_OP_TRANSPOSE
|
||||
nullptr, // GGML_OP_GET_ROWS
|
||||
nullptr, // GGML_OP_GET_ROWS_BACK
|
||||
nullptr, // GGML_OP_DIAG
|
||||
nullptr, // GGML_OP_DIAG_MASK_INF
|
||||
nullptr, // GGML_OP_DIAG_MASK_ZERO
|
||||
nullptr, // GGML_OP_SOFT_MAX
|
||||
nullptr, // GGML_OP_SOFT_MAX_BACK
|
||||
nullptr, // GGML_OP_ROPE
|
||||
nullptr, // GGML_OP_ROPE_BACK
|
||||
nullptr, // GGML_OP_CLAMP
|
||||
nullptr, // GGML_OP_CONV_TRANSPOSE_1D
|
||||
nullptr, // GGML_OP_IM2COL
|
||||
nullptr, // GGML_OP_IM2COL_BACK
|
||||
nullptr, // GGML_OP_CONV_TRANSPOSE_2D
|
||||
nullptr, // GGML_OP_POOL_1D
|
||||
nullptr, // GGML_OP_POOL_2D
|
||||
nullptr, // GGML_OP_POOL_2D_BACK
|
||||
nullptr, // GGML_OP_UPSCALE
|
||||
nullptr, // GGML_OP_PAD
|
||||
nullptr, // GGML_OP_ARANGE
|
||||
nullptr, // GGML_OP_TIMESTEP_EMBEDDING
|
||||
nullptr, // GGML_OP_ARGSORT
|
||||
nullptr, // GGML_OP_LEAKY_RELU
|
||||
nullptr, // GGML_OP_SCALE
|
||||
nullptr, // GGML_OP_SET
|
||||
nullptr, // GGML_OP_CPY
|
||||
nullptr, // GGML_OP_CONT
|
||||
nullptr, // GGML_OP_RESHAPE
|
||||
nullptr, // GGML_OP_VIEW
|
||||
QNN_OP_TRANSPOSE, // GGML_OP_PERMUTE
|
||||
nullptr, // GGML_OP_TRANSPOSE
|
||||
nullptr, // GGML_OP_GET_ROWS
|
||||
nullptr, // GGML_OP_GET_ROWS_BACK
|
||||
nullptr, // GGML_OP_DIAG
|
||||
nullptr, // GGML_OP_DIAG_MASK_INF
|
||||
nullptr, // GGML_OP_DIAG_MASK_ZERO
|
||||
nullptr, // GGML_OP_SOFT_MAX
|
||||
nullptr, // GGML_OP_SOFT_MAX_BACK
|
||||
nullptr, // GGML_OP_ROPE
|
||||
nullptr, // GGML_OP_ROPE_BACK
|
||||
nullptr, // GGML_OP_CLAMP
|
||||
nullptr, // GGML_OP_CONV_TRANSPOSE_1D
|
||||
nullptr, // GGML_OP_IM2COL
|
||||
nullptr, // GGML_OP_IM2COL_BACK
|
||||
nullptr, // GGML_OP_CONV_TRANSPOSE_2D
|
||||
nullptr, // GGML_OP_POOL_1D
|
||||
nullptr, // GGML_OP_POOL_2D
|
||||
nullptr, // GGML_OP_POOL_2D_BACK
|
||||
nullptr, // GGML_OP_UPSCALE
|
||||
nullptr, // GGML_OP_PAD
|
||||
nullptr, // GGML_OP_ARANGE
|
||||
nullptr, // GGML_OP_TIMESTEP_EMBEDDING
|
||||
nullptr, // GGML_OP_ARGSORT
|
||||
nullptr, // GGML_OP_LEAKY_RELU
|
||||
|
||||
nullptr, // GGML_OP_FLASH_ATTN_EXT
|
||||
nullptr, // GGML_OP_FLASH_ATTN_BACK
|
||||
|
|
@ -235,16 +235,16 @@ static_assert(sizeof(kGgmlOpToQnnOp) / sizeof(kGgmlOpToQnnOp[0]) == (GGML_OP_COU
|
|||
static_assert(kGgmlOpToQnnOp[GGML_UNARY_OP_GELU + kGgmlUnaryOpStart] != nullptr,
|
||||
"GGML_UNARY_OP_GELU does not correspond to QNN_OP_GELU");
|
||||
|
||||
template <size_t _InputSize, size_t _OutputSize>
|
||||
template <size_t _InputSize>
|
||||
qnn::ggml_qnn_graph *get_qnn_graph_from_cache(ggml_backend_qnn_device_context *ctx, size_t op,
|
||||
const std::array<ggml_tensor *, _InputSize> &inputs,
|
||||
const std::array<ggml_tensor *, _OutputSize> &outputs) {
|
||||
ggml_tensor *output) {
|
||||
GGML_ASSERT(op < (GGML_OP_COUNT + GGML_UNARY_OP_COUNT));
|
||||
|
||||
auto &graph_cache = ctx->qnn_graph_cache;
|
||||
const auto *op_name =
|
||||
op < kGgmlUnaryOpStart ? ggml_op_name(ggml_op(op)) : ggml_unary_op_name(ggml_unary_op(op - kGgmlUnaryOpStart));
|
||||
auto graph_key = get_graph_key<_InputSize, _OutputSize>(op_name, inputs, outputs);
|
||||
auto graph_key = get_graph_key<_InputSize, 1>(op_name, inputs, {output});
|
||||
auto it = graph_cache.find(graph_key);
|
||||
qnn::ggml_qnn_graph *graph_ptr = nullptr;
|
||||
if (it != graph_cache.end()) {
|
||||
|
|
@ -259,7 +259,7 @@ qnn::ggml_qnn_graph *get_qnn_graph_from_cache(ggml_backend_qnn_device_context *c
|
|||
|
||||
auto op_constructor = qnn::create_op_constructor(kGgmlOpToQnnOp[op]);
|
||||
if (!graph->build_graph(op_constructor, to_ggml_tensor_array<_InputSize>(inputs),
|
||||
to_ggml_tensor_array<_OutputSize>(outputs))) {
|
||||
to_ggml_tensor_array<1>({output}))) {
|
||||
QNN_LOG_ERROR("build_graph failed\n");
|
||||
return nullptr;
|
||||
}
|
||||
|
|
@ -278,9 +278,9 @@ bool qnn_binary_op_impl(ggml_backend_qnn_device_context *ctx, ggml_tensor *src0,
|
|||
CHECK_PARAMS(ctx, src0, src1, dst);
|
||||
|
||||
bool succeed = false;
|
||||
auto *graph_ptr = get_qnn_graph_from_cache<2, 1>(ctx, _GgmlOp, { src0, src1 }, { dst });
|
||||
auto *graph_ptr = get_qnn_graph_from_cache<2>(ctx, _GgmlOp, {src0, src1}, dst);
|
||||
if (graph_ptr) {
|
||||
succeed = execute_graph<2, 1>(graph_ptr, { src0, src1 }, { dst });
|
||||
succeed = execute_graph<2>(graph_ptr, {src0, src1}, dst);
|
||||
}
|
||||
|
||||
#ifndef NDEBUG
|
||||
|
|
@ -301,9 +301,9 @@ bool qnn_unary_op_impl(ggml_backend_qnn_device_context *ctx, ggml_tensor *src, g
|
|||
CHECK_PARAMS(ctx, src, dst);
|
||||
|
||||
bool succeed = false;
|
||||
auto *graph_ptr = get_qnn_graph_from_cache<1, 1>(ctx, _GgmlOp, { src }, { dst });
|
||||
auto *graph_ptr = get_qnn_graph_from_cache<1>(ctx, _GgmlOp, {src}, dst);
|
||||
if (graph_ptr) {
|
||||
succeed = execute_graph<1, 1>(graph_ptr, { src }, { dst });
|
||||
succeed = execute_graph<1>(graph_ptr, {src}, dst);
|
||||
}
|
||||
|
||||
#ifndef NDEBUG
|
||||
|
|
@ -315,6 +315,22 @@ bool qnn_unary_op_impl(ggml_backend_qnn_device_context *ctx, ggml_tensor *src, g
|
|||
|
||||
return succeed;
|
||||
}
|
||||
|
||||
bool qnn_unary_nop_impl(ggml_backend_qnn_device_context *ctx, ggml_tensor *src, ggml_tensor *dst) {
|
||||
GGML_UNUSED(ctx);
|
||||
GGML_UNUSED(src);
|
||||
GGML_UNUSED(dst);
|
||||
return true;
|
||||
}
|
||||
|
||||
bool qnn_binary_nop_impl(ggml_backend_qnn_device_context *ctx, ggml_tensor *src0, ggml_tensor *src1, ggml_tensor *dst) {
|
||||
GGML_UNUSED(ctx);
|
||||
GGML_UNUSED(src0);
|
||||
GGML_UNUSED(src1);
|
||||
GGML_UNUSED(dst);
|
||||
return true;
|
||||
}
|
||||
|
||||
constexpr const ggml_qnn_unary_op_t kQnnUnaryOpsTable[] = {
|
||||
nullptr, // GGML_OP_NONE
|
||||
nullptr, // GGML_OP_DUP
|
||||
|
|
@ -347,37 +363,37 @@ constexpr const ggml_qnn_unary_op_t kQnnUnaryOpsTable[] = {
|
|||
nullptr, // GGML_OP_MUL_MAT_ID
|
||||
nullptr, // GGML_OP_OUT_PROD
|
||||
|
||||
nullptr, // GGML_OP_SCALE
|
||||
nullptr, // GGML_OP_SET
|
||||
nullptr, // GGML_OP_CPY
|
||||
nullptr, // GGML_OP_CONT
|
||||
nullptr, // GGML_OP_RESHAPE
|
||||
nullptr, // GGML_OP_VIEW
|
||||
nullptr, // GGML_OP_PERMUTE
|
||||
nullptr, // GGML_OP_TRANSPOSE
|
||||
nullptr, // GGML_OP_GET_ROWS
|
||||
nullptr, // GGML_OP_GET_ROWS_BACK
|
||||
nullptr, // GGML_OP_DIAG
|
||||
nullptr, // GGML_OP_DIAG_MASK_INF
|
||||
nullptr, // GGML_OP_DIAG_MASK_ZERO
|
||||
nullptr, // GGML_OP_SOFT_MAX
|
||||
nullptr, // GGML_OP_SOFT_MAX_BACK
|
||||
nullptr, // GGML_OP_ROPE
|
||||
nullptr, // GGML_OP_ROPE_BACK
|
||||
nullptr, // GGML_OP_CLAMP
|
||||
nullptr, // GGML_OP_CONV_TRANSPOSE_1D
|
||||
nullptr, // GGML_OP_IM2COL
|
||||
nullptr, // GGML_OP_IM2COL_BACK
|
||||
nullptr, // GGML_OP_CONV_TRANSPOSE_2D
|
||||
nullptr, // GGML_OP_POOL_1D
|
||||
nullptr, // GGML_OP_POOL_2D
|
||||
nullptr, // GGML_OP_POOL_2D_BACK
|
||||
nullptr, // GGML_OP_UPSCALE
|
||||
nullptr, // GGML_OP_PAD
|
||||
nullptr, // GGML_OP_ARANGE
|
||||
nullptr, // GGML_OP_TIMESTEP_EMBEDDING
|
||||
nullptr, // GGML_OP_ARGSORT
|
||||
nullptr, // GGML_OP_LEAKY_RELU
|
||||
nullptr, // GGML_OP_SCALE
|
||||
nullptr, // GGML_OP_SET
|
||||
nullptr, // GGML_OP_CPY
|
||||
nullptr, // GGML_OP_CONT
|
||||
nullptr, // GGML_OP_RESHAPE
|
||||
qnn_unary_nop_impl, // GGML_OP_VIEW
|
||||
qnn_unary_op_impl<GGML_OP_PERMUTE>, // GGML_OP_PERMUTE
|
||||
nullptr, // GGML_OP_TRANSPOSE
|
||||
qnn_unary_nop_impl, // GGML_OP_GET_ROWS
|
||||
nullptr, // GGML_OP_GET_ROWS_BACK
|
||||
nullptr, // GGML_OP_DIAG
|
||||
nullptr, // GGML_OP_DIAG_MASK_INF
|
||||
nullptr, // GGML_OP_DIAG_MASK_ZERO
|
||||
nullptr, // GGML_OP_SOFT_MAX
|
||||
nullptr, // GGML_OP_SOFT_MAX_BACK
|
||||
nullptr, // GGML_OP_ROPE
|
||||
nullptr, // GGML_OP_ROPE_BACK
|
||||
nullptr, // GGML_OP_CLAMP
|
||||
nullptr, // GGML_OP_CONV_TRANSPOSE_1D
|
||||
nullptr, // GGML_OP_IM2COL
|
||||
nullptr, // GGML_OP_IM2COL_BACK
|
||||
nullptr, // GGML_OP_CONV_TRANSPOSE_2D
|
||||
nullptr, // GGML_OP_POOL_1D
|
||||
nullptr, // GGML_OP_POOL_2D
|
||||
nullptr, // GGML_OP_POOL_2D_BACK
|
||||
nullptr, // GGML_OP_UPSCALE
|
||||
nullptr, // GGML_OP_PAD
|
||||
nullptr, // GGML_OP_ARANGE
|
||||
nullptr, // GGML_OP_TIMESTEP_EMBEDDING
|
||||
nullptr, // GGML_OP_ARGSORT
|
||||
nullptr, // GGML_OP_LEAKY_RELU
|
||||
|
||||
nullptr, // GGML_OP_FLASH_ATTN_EXT
|
||||
nullptr, // GGML_OP_FLASH_ATTN_BACK
|
||||
|
|
@ -522,18 +538,24 @@ static_assert(sizeof(kQnnBinaryOpsTable) / sizeof(kQnnBinaryOpsTable[0]) == GGML
|
|||
"GGML_OP_COUNT does not match the size of the kQnnBinaryOpsTable table");
|
||||
|
||||
bool ggml_qnn_supports_tensor(ggml_backend_qnn_device_context *ctx, const ggml_tensor *tensor) {
|
||||
if (!tensor) {
|
||||
QNN_LOG_DEBUG("tensor is nullptr");
|
||||
return false;
|
||||
}
|
||||
|
||||
auto *type_name = ggml_get_type_traits(tensor->type)->type_name;
|
||||
switch (tensor->type) {
|
||||
case GGML_TYPE_F32:
|
||||
case GGML_TYPE_F16:
|
||||
case GGML_TYPE_Q8_0:
|
||||
case GGML_TYPE_Q4_0:
|
||||
if (ctx->supported_types.find(tensor->type) == ctx->supported_types.end()) {
|
||||
QNN_LOG_DEBUG("unsupported data type GGML_TYPE_F16 for cpu backend");
|
||||
QNN_LOG_DEBUG("unsupported data type %s for backend %d", type_name, (int)ctx->device);
|
||||
return false;
|
||||
}
|
||||
break;
|
||||
default:
|
||||
QNN_LOG_DEBUG("unsupported data type %d", tensor->type);
|
||||
QNN_LOG_DEBUG("unsupported data type %s", type_name);
|
||||
return false;
|
||||
}
|
||||
|
||||
|
|
@ -591,19 +613,15 @@ bool ggml_qnn_supports_op(ggml_backend_qnn_device_context *ctx, const ggml_tenso
|
|||
}
|
||||
} else {
|
||||
if (!kQnnUnaryOpsTable[op->op] && !kQnnBinaryOpsTable[op->op]) {
|
||||
QNN_LOG_DEBUG("unsupported op %d", op->op);
|
||||
QNN_LOG_DEBUG("[%s] unsupported op", ggml_op_name(op->op));
|
||||
return false;
|
||||
}
|
||||
|
||||
auto *src0 = op->src[0];
|
||||
auto *src1 = op->src[1];
|
||||
if (!src0 || !src1) {
|
||||
QNN_LOG_DEBUG("src0 or src1 is nullptr");
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!ggml_qnn_supports_tensor(ctx, src0) || !ggml_qnn_supports_tensor(ctx, src1) ||
|
||||
!ggml_qnn_supports_tensor(ctx, op)) {
|
||||
if (!ggml_qnn_supports_tensor(ctx, src0) || !ggml_qnn_supports_tensor(ctx, op) ||
|
||||
(kQnnBinaryOpsTable[op->op] && !ggml_qnn_supports_tensor(ctx, src1))) {
|
||||
QNN_LOG_DEBUG("[%s] unsupported tensor", ggml_op_name(op->op));
|
||||
return false;
|
||||
}
|
||||
|
||||
|
|
@ -642,7 +660,7 @@ bool ggml_qnn_forward(ggml_backend_qnn_device_context *ctx, struct ggml_tensor *
|
|||
return binary_op(ctx, tensor->src[0], tensor->src[1], tensor);
|
||||
}
|
||||
|
||||
QNN_LOG_WARN("unsupported op %s", ggml_op_desc(tensor));
|
||||
QNN_LOG_WARN("[forward]unsupported op %s", ggml_op_desc(tensor));
|
||||
return false;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -7,10 +7,10 @@
|
|||
namespace {
|
||||
|
||||
constexpr const qnn::qnn_dimension_array_t kTransposeParamData[GGML_MAX_DIMS] = {
|
||||
{ 0 },
|
||||
{ 1, 0 },
|
||||
{ 0, 2, 1 },
|
||||
{ 0, 1, 3, 2 },
|
||||
{0},
|
||||
{1, 0},
|
||||
{0, 2, 1},
|
||||
{0, 1, 3, 2},
|
||||
};
|
||||
|
||||
qnn::qnn_dimension_array_t get_transposed_dimensions(const qnn::qnn_dimension_array_t &dimensions, int rank) {
|
||||
|
|
@ -96,9 +96,8 @@ bool bind_tensors(const qnn::ggml_tensor_array_t &ggml_tensors, qnn::ggml_qnn_te
|
|||
class ggml_qnn_connectable_op_config : public qnn::ggml_qnn_op_config_base {
|
||||
public:
|
||||
explicit ggml_qnn_connectable_op_config(const std::string &name, const std::string &package_name,
|
||||
const std::string &op_type,
|
||||
std::shared_ptr<qnn::qnn_instance> qnn_instance) :
|
||||
ggml_qnn_op_config_base(name, package_name, op_type, qnn_instance) {}
|
||||
const std::string &op_type, std::shared_ptr<qnn::qnn_instance> qnn_instance)
|
||||
: ggml_qnn_op_config_base(name, package_name, op_type, qnn_instance) {}
|
||||
|
||||
bool create_tensors(QNNBackend device, Qnn_GraphHandle_t graph_handle,
|
||||
const qnn::ggml_tensor_array_t &tensor_inputs,
|
||||
|
|
@ -264,11 +263,22 @@ bool ggml_qnn_single_op_config::create_tensors(QNNBackend device, Qnn_GraphHandl
|
|||
const ggml_tensor_array_t &tensor_inputs,
|
||||
const ggml_tensor_array_t &tensor_outputs) {
|
||||
const auto tensor_rank = get_rank(tensor_inputs, tensor_outputs);
|
||||
tensor_common_params params = { "src", tensor_rank, true, device, graph_handle, _qnn_instance };
|
||||
tensor_common_params params = {"src", tensor_rank, true, device, graph_handle, _qnn_instance};
|
||||
create_tensors_from_ggml_tensor(params, tensor_inputs, &_tensor_inputs, &_qnn_tensor_inputs);
|
||||
params.name_prefix = "dst";
|
||||
params.is_input = false;
|
||||
create_tensors_from_ggml_tensor(params, tensor_outputs, &_tensor_outputs, &_qnn_tensor_outputs);
|
||||
|
||||
if (_param_buffer.size() > 0) {
|
||||
// handle parameters in output tensor
|
||||
auto *params = tensor_outputs.front()->op_params;
|
||||
memcpy(_param_buffer.data(), params, _param_buffer.size());
|
||||
|
||||
const uint32_t count = uint32_t(_param_buffer.size() / qnn_datatype_size(_param_type));
|
||||
const qnn_dimension_array_t param_dims = {count, 1, 1, 1};
|
||||
add_tensor_param(_param_name, param_dims, 1, _param_buffer.data(), _param_type, device, graph_handle);
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
|
@ -281,7 +291,7 @@ bool ggml_qnn_matmul_op_config::create_tensors(QNNBackend device, Qnn_GraphHandl
|
|||
GGML_ASSERT(tensor_rank >= 2);
|
||||
|
||||
// create input tensors
|
||||
tensor_common_params params = { "src", tensor_rank, true, device, graph_handle, _qnn_instance };
|
||||
tensor_common_params params = {"src", tensor_rank, true, device, graph_handle, _qnn_instance};
|
||||
create_tensors_from_ggml_tensor(params, tensor_inputs, &_tensor_inputs, &_qnn_tensor_inputs);
|
||||
|
||||
// create output tensor
|
||||
|
|
@ -290,8 +300,49 @@ bool ggml_qnn_matmul_op_config::create_tensors(QNNBackend device, Qnn_GraphHandl
|
|||
params.is_input = false;
|
||||
create_tensors_from_ggml_tensor(params, tensor_outputs, &mat_mul_tensor_outputs, nullptr);
|
||||
|
||||
if (device == QNN_BACKEND_GPU) {
|
||||
// there's no convert op for GPU, so we should create matmul nodes directly.
|
||||
return create_mat_mul_nodes(device, graph_handle, tensor_rank, _tensor_inputs, mat_mul_tensor_outputs);
|
||||
}
|
||||
|
||||
// create tensors for convert node
|
||||
ggml_qnn_tensor_array_t mat_mul_tensor_inputs = _tensor_inputs;
|
||||
auto input_tensor_type = get_tensor_type(mat_mul_tensor_inputs);
|
||||
QNN_LOG_DEBUG("matmul input tensor type: %s\n", qnn_datatype_to_string(input_tensor_type));
|
||||
|
||||
_input_converts.resize(mat_mul_tensor_inputs.size());
|
||||
for (size_t i = 0; i < mat_mul_tensor_inputs.size(); ++i) {
|
||||
// create input convert nodes
|
||||
std::string convert_name("convert_src" + std::to_string(i));
|
||||
auto convert_in = mat_mul_tensor_inputs[i];
|
||||
auto convert_out = std::make_shared<ggml_qnn_tensor>(ggml_qnn_tensor::INTERMEDIATE, convert_name + "_out",
|
||||
convert_in->get_dimensions(), input_tensor_type,
|
||||
tensor_rank, device, graph_handle, _qnn_instance);
|
||||
auto convert = std::make_shared<ggml_qnn_connectable_op_config>(convert_name, QNN_OP_PACKAGE_NAME_QTI_AISW,
|
||||
QNN_OP_CONVERT, _qnn_instance);
|
||||
convert->set_input_tensors({convert_in});
|
||||
convert->set_output_tensors({convert_out});
|
||||
mat_mul_tensor_inputs[i] = convert_out;
|
||||
_input_converts[i] = convert;
|
||||
}
|
||||
|
||||
{
|
||||
// create output convert node
|
||||
std::string convert_name("convert_dst");
|
||||
auto convert_out = mat_mul_tensor_outputs.front();
|
||||
auto convert_in = std::make_shared<ggml_qnn_tensor>(ggml_qnn_tensor::INTERMEDIATE, convert_name + "_in",
|
||||
convert_out->get_dimensions(), input_tensor_type,
|
||||
tensor_rank, device, graph_handle, _qnn_instance);
|
||||
auto output_convert = std::make_shared<ggml_qnn_connectable_op_config>(
|
||||
convert_name, QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_CONVERT, _qnn_instance);
|
||||
output_convert->set_input_tensors({convert_in});
|
||||
output_convert->set_output_tensors({convert_out});
|
||||
mat_mul_tensor_outputs[0] = convert_in;
|
||||
_output_convert = output_convert;
|
||||
}
|
||||
|
||||
// create mat_mul nodes
|
||||
return create_mat_mul_nodes(device, graph_handle, tensor_rank, _tensor_inputs, mat_mul_tensor_outputs);
|
||||
return create_mat_mul_nodes(device, graph_handle, tensor_rank, mat_mul_tensor_inputs, mat_mul_tensor_outputs);
|
||||
}
|
||||
|
||||
bool ggml_qnn_matmul_op_config::create_mat_mul_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, const int rank,
|
||||
|
|
@ -371,7 +422,7 @@ bool ggml_qnn_matmul_op_config::create_mat_mul_nodes(QNNBackend device, Qnn_Grap
|
|||
|
||||
// set transpose0 parameters
|
||||
auto *params_data = reinterpret_cast<const uint8_t *>(kTransposeParamData[rank - 1].data());
|
||||
const qnn_dimension_array_t param_dims = { (uint32_t)rank, 1, 1, 1 };
|
||||
const qnn_dimension_array_t param_dims = {(uint32_t)rank, 1, 1, 1};
|
||||
transpose0->add_tensor_param(QNN_OP_TRANSPOSE_PARAM_PERM, param_dims, 1, params_data, QNN_DATATYPE_UINT_32, device,
|
||||
graph_handle);
|
||||
|
||||
|
|
@ -380,19 +431,19 @@ bool ggml_qnn_matmul_op_config::create_mat_mul_nodes(QNNBackend device, Qnn_Grap
|
|||
graph_handle);
|
||||
|
||||
// set tensor to transpose0
|
||||
ggml_qnn_tensor_array_t tensors = { tensor_inputs.back() };
|
||||
ggml_qnn_tensor_array_t tensors = {tensor_inputs.back()};
|
||||
transpose0->set_input_tensors(tensors);
|
||||
tensors = { src0_trans };
|
||||
tensors = {src0_trans};
|
||||
transpose0->set_output_tensors(tensors);
|
||||
|
||||
// set tensor to mat_mul
|
||||
tensors = { tensor_inputs.front(), src0_trans };
|
||||
tensors = {tensor_inputs.front(), src0_trans};
|
||||
mat_mul->set_input_tensors(tensors);
|
||||
tensors = { dst_trans };
|
||||
tensors = {dst_trans};
|
||||
mat_mul->set_output_tensors(tensors);
|
||||
|
||||
// set tensor to transpose1
|
||||
tensors = { dst_trans };
|
||||
tensors = {dst_trans};
|
||||
transpose1->set_input_tensors(tensors);
|
||||
transpose1->set_output_tensors(tensor_outputs);
|
||||
|
||||
|
|
@ -459,6 +510,13 @@ ggml_op_constructor_t create_op_constructor(const std::string &op_name) {
|
|||
QNN_LOG_DEBUG("create QNN_OP_MAT_MUL, name %s\n", instance_name.c_str());
|
||||
return std::make_unique<qnn::ggml_qnn_matmul_op_config>(instance_name, qnn_instance);
|
||||
};
|
||||
} else if (op_name == QNN_OP_TRANSPOSE) {
|
||||
return [](const std::string &instance_name,
|
||||
std::shared_ptr<qnn::qnn_instance> qnn_instance) -> std::unique_ptr<qnn::ggml_qnn_op_config> {
|
||||
return std::make_unique<qnn::ggml_qnn_single_op_config>(instance_name, QNN_OP_PACKAGE_NAME_QTI_AISW,
|
||||
QNN_OP_TRANSPOSE, QNN_OP_TRANSPOSE_PARAM_PERM,
|
||||
QNN_DATATYPE_UINT_32, 4 * sizeof(uint32_t), qnn_instance);
|
||||
};
|
||||
}
|
||||
|
||||
return [op_name](const std::string &instance_name,
|
||||
|
|
|
|||
|
|
@ -30,11 +30,16 @@ public:
|
|||
virtual void unbind_output_tensors() = 0;
|
||||
};
|
||||
|
||||
using ggml_op_constructor_t =
|
||||
std::function<std::unique_ptr<ggml_qnn_op_config>(const std::string &, std::shared_ptr<qnn_instance>)>;
|
||||
|
||||
ggml_op_constructor_t create_op_constructor(const std::string &op_name);
|
||||
|
||||
class ggml_qnn_op_config_base : public ggml_qnn_op_config {
|
||||
public:
|
||||
explicit ggml_qnn_op_config_base(const std::string &name, const std::string &package_name,
|
||||
const std::string &op_type, std::shared_ptr<qnn_instance> qnn_instance) :
|
||||
_name(name), _package_name(package_name), _op_type(op_type), _qnn_instance(qnn_instance) {}
|
||||
const std::string &op_type, std::shared_ptr<qnn_instance> qnn_instance)
|
||||
: _name(name), _package_name(package_name), _op_type(op_type), _qnn_instance(qnn_instance) {}
|
||||
|
||||
void add_scalar_param(const std::string &name, const Qnn_Scalar_t scalar);
|
||||
bool add_tensor_param(const std::string &name, const qnn_dimension_array_t &dimensions, int rank,
|
||||
|
|
@ -70,21 +75,34 @@ protected:
|
|||
class ggml_qnn_single_op_config : public ggml_qnn_op_config_base {
|
||||
public:
|
||||
explicit ggml_qnn_single_op_config(const std::string &name, const std::string &package_name,
|
||||
const std::string &op_type, std::shared_ptr<qnn_instance> qnn_instance) :
|
||||
ggml_qnn_op_config_base(name, package_name, op_type, qnn_instance) {}
|
||||
const std::string &op_type, std::shared_ptr<qnn_instance> qnn_instance)
|
||||
: ggml_qnn_op_config_base(name, package_name, op_type, qnn_instance) {}
|
||||
|
||||
explicit ggml_qnn_single_op_config(const std::string &name, const std::string &package_name,
|
||||
const std::string &op_type, const std::string ¶m_name,
|
||||
const Qnn_DataType_t param_type, const size_t param_size,
|
||||
std::shared_ptr<qnn_instance> qnn_instance)
|
||||
: ggml_qnn_op_config_base(name, package_name, op_type, qnn_instance),
|
||||
_param_name(param_name),
|
||||
_param_type(param_type),
|
||||
_param_buffer(param_size) {}
|
||||
|
||||
bool create_tensors(QNNBackend device, Qnn_GraphHandle_t graph_handle, const ggml_tensor_array_t &tensor_inputs,
|
||||
const ggml_tensor_array_t &tensor_outputs) override;
|
||||
|
||||
private:
|
||||
const std::string _param_name;
|
||||
const Qnn_DataType_t _param_type = QNN_DATATYPE_UINT_32;
|
||||
std::vector<uint8_t> _param_buffer;
|
||||
|
||||
DISABLE_COPY(ggml_qnn_single_op_config);
|
||||
DISABLE_MOVE(ggml_qnn_single_op_config);
|
||||
};
|
||||
|
||||
class ggml_qnn_matmul_op_config : public ggml_qnn_op_config {
|
||||
public:
|
||||
ggml_qnn_matmul_op_config(const std::string &name, std::shared_ptr<qnn_instance> qnn_instance) :
|
||||
_name(name), _qnn_instance(qnn_instance) {}
|
||||
ggml_qnn_matmul_op_config(const std::string &name, std::shared_ptr<qnn_instance> qnn_instance)
|
||||
: _name(name), _qnn_instance(qnn_instance) {}
|
||||
|
||||
bool create_tensors(QNNBackend device, Qnn_GraphHandle_t graph_handle, const ggml_tensor_array_t &tensor_inputs,
|
||||
const ggml_tensor_array_t &tensor_outputs) override;
|
||||
|
|
@ -114,9 +132,4 @@ private:
|
|||
DISABLE_MOVE(ggml_qnn_matmul_op_config);
|
||||
};
|
||||
|
||||
using ggml_op_constructor_t =
|
||||
std::function<std::unique_ptr<ggml_qnn_op_config>(const std::string &, std::shared_ptr<qnn_instance>)>;
|
||||
|
||||
ggml_op_constructor_t create_op_constructor(const std::string &op_name);
|
||||
|
||||
} // namespace qnn
|
||||
|
|
|
|||
|
|
@ -257,6 +257,7 @@ private:
|
|||
DISABLE_MOVE(ggml_qnn_tensor);
|
||||
};
|
||||
|
||||
using ggml_qnn_tensor_ptr_t = std::shared_ptr<ggml_qnn_tensor>;
|
||||
using ggml_qnn_tensor_array_t = std::vector<std::shared_ptr<ggml_qnn_tensor>>;
|
||||
|
||||
} // namespace qnn
|
||||
|
|
|
|||
Loading…
Reference in New Issue