diff --git a/ggml/src/ggml-qnn.cpp b/ggml/src/ggml-qnn.cpp
index 4da991916c..d28163dce4 100644
--- a/ggml/src/ggml-qnn.cpp
+++ b/ggml/src/ggml-qnn.cpp
@@ -57,30 +57,30 @@ struct qnn_device_caps {
 };

 const qnn_device_caps kDeviceCaps[GGML_QNN_MAX_DEVICES]{
-    { "qnn-cpu",
-      "Qualcomm Kryo CPU",
-      "libQnnCpu.so",
-      GGML_BACKEND_DEVICE_TYPE_CPU,
-      { GGML_TYPE_F32,
-        GGML_TYPE_I8 } }, // https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/CpuOpDefSupplement.html#matmul
-    { "qnn-gpu",
-      "Qualcomm Adreno GPU",
-      "libQnnGpu.so",
-      GGML_BACKEND_DEVICE_TYPE_GPU,
-      { GGML_TYPE_F32,
-        GGML_TYPE_F16 } }, // https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/GpuOpDefSupplement.html#matmul
-    { "qnn-npu",
-      "Qualcomm NPU",
-      "libQnnHtp.so",
-      GGML_BACKEND_DEVICE_TYPE_GPU,
-      { GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_I16,
-        GGML_TYPE_I8 } }, // https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/HtpOpDefSupplement.html#matmul
+    {// https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/CpuOpDefSupplement.html#matmul
+     "qnn-cpu",
+     "Qualcomm Kryo CPU",
+     "libQnnCpu.so",
+     GGML_BACKEND_DEVICE_TYPE_CPU,
+     {GGML_TYPE_F32, GGML_TYPE_I8}},
+    {// https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/GpuOpDefSupplement.html#matmul
+     "qnn-gpu",
+     "Qualcomm Adreno GPU",
+     "libQnnGpu.so",
+     GGML_BACKEND_DEVICE_TYPE_GPU,
+     {GGML_TYPE_F32, GGML_TYPE_F16}},
+    {// https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/HtpOpDefSupplement.html#matmul
+     "qnn-npu",
+     "Qualcomm NPU",
+     "libQnnHtp.so",
+     GGML_BACKEND_DEVICE_TYPE_GPU,
+     {GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_I16, GGML_TYPE_I8}},
 };

 class ggml_backend_qnn_buffer_context {
 public:
-    ggml_backend_qnn_buffer_context(QNNBackend device, std::shared_ptr instance, size_t size) :
-        _instance(instance), _name(QNN_BACKEND_NAME + std::to_string(device)) {
+    ggml_backend_qnn_buffer_context(QNNBackend device, std::shared_ptr instance, size_t size)
+        : _instance(instance), _name(QNN_BACKEND_NAME + std::to_string(device)) {
         // TODO: fix this for other platforms
         size_t size_page = sysconf(_SC_PAGESIZE);
@@ -251,7 +251,7 @@ ggml_backend_buffer_type_t ggml_backend_qnn_buffer_type(ggml_backend_dev_t dev)
     if (!ggml_backend_qnn_buffer_type_initialized) {
         for (size_t i = 0; i < GGML_QNN_MAX_DEVICES; i++) {
             auto &context = ggml_backend_qnn_buffer_type_contexts[i];
-            context = { std::string(QNN_BACKEND_NAME) + std::to_string(i) };
+            context = {std::string(QNN_BACKEND_NAME) + std::to_string(i)};
             ggml_backend_qnn_buffer_types[i] = {
                 /* .iface = */ {
                     /* .get_name = */ ggml_backend_qnn_buffer_type_name,
@@ -348,8 +348,8 @@ void ggml_backend_qnn_device_get_props(ggml_backend_dev_t dev, struct ggml_backe
 }

 ggml_guid_t ggml_backend_qnn_guid() {
-    static ggml_guid guid = { 0x1a, 0x2b, 0x3c, 0x4d, 0x5e, 0x6f, 0x70, 0x81,
-                              0x92, 0xa3, 0xb4, 0xc5, 0xd6, 0xe7, 0xf8, 0x09 };
+    static ggml_guid guid = {0x1a, 0x2b, 0x3c, 0x4d, 0x5e, 0x6f, 0x70, 0x81,
+                             0x92, 0xa3, 0xb4, 0xc5, 0xd6, 0xe7, 0xf8, 0x09};
     return &guid;
 }
@@ -511,7 +511,7 @@ const ggml_backend_reg_i ggml_backend_qnn_reg_interface = {
 } // namespace

 ggml_backend_reg_t ggml_backend_qnn_reg() {
-    static ggml_backend_qnn_reg_impl reg{ ggml_backend_qnn_reg_interface };
+    static ggml_backend_qnn_reg_impl reg{ggml_backend_qnn_reg_interface};
     static bool initialized = false;
     static std::mutex mutex;
diff --git a/ggml/src/ggml-qnn/.clang-format b/ggml/src/ggml-qnn/.clang-format
index 3b933ff10d..0c67c54239 100644
--- a/ggml/src/ggml-qnn/.clang-format
+++ b/ggml/src/ggml-qnn/.clang-format
@@ -3,16 +3,50 @@ BasedOnStyle: Google
 IndentWidth: 4
 AccessModifierOffset: -4
 AlignAfterOpenBracket: Align
-AlignOperands: true
+AlignConsecutiveMacros: false
+AlignConsecutiveAssignments: false
+AlignConsecutiveDeclarations: false
+AlignEscapedNewlines: Left
+AlignOperands: true
 AlignTrailingComments: true
+AllowAllArgumentsOnNextLine: true
+AllowAllConstructorInitializersOnNextLine: true
+AllowAllParametersOfDeclarationOnNextLine: true
+AllowShortBlocksOnASingleLine: Never
+AllowShortCaseLabelsOnASingleLine: false
+AllowShortFunctionsOnASingleLine: All
+AllowShortLambdasOnASingleLine: All
+AllowShortIfStatementsOnASingleLine: WithoutElse
+AllowShortLoopsOnASingleLine: true
+AlwaysBreakAfterDefinitionReturnType: None
+AlwaysBreakAfterReturnType: None
+AlwaysBreakBeforeMultilineStrings: true
+AlwaysBreakTemplateDeclarations: Yes
 BinPackArguments: true
 BinPackParameters: true
-BreakBeforeBraces: Custom
-BreakConstructorInitializers: AfterColon
+BraceWrapping:
+  AfterCaseLabel: false
+  AfterClass: false
+  AfterControlStatement: false
+  AfterEnum: false
+  AfterFunction: false
+  AfterNamespace: false
+  AfterObjCDeclaration: false
+  AfterStruct: false
+  AfterUnion: false
+  AfterExternBlock: false
+  BeforeCatch: false
+  BeforeElse: false
+  IndentBraces: false
+  SplitEmptyFunction: true
+  SplitEmptyRecord: true
+  SplitEmptyNamespace: true
 ColumnLimit: 120
-Cpp11BracedListStyle: false
+ConstructorInitializerIndentWidth: 4
+ContinuationIndentWidth: 4
+Cpp11BracedListStyle: true
 DerivePointerAlignment: false
-IncludeCategories:
+IncludeCategories:
   - Regex: '^<.*\.h>'
     Priority: 1
   - Regex: '^<.*'
@@ -28,4 +62,4 @@ MaxEmptyLinesToKeep: 1
 PointerAlignment: Right
 SortIncludes: true
 SpacesBeforeTrailingComments: 1
-UseTab: Never
\ No newline at end of file
+UseTab: Never
diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp
index 3e24ca32ed..c0e263a640 100644
--- a/ggml/src/ggml-qnn/backend-ops.cpp
+++ b/ggml/src/ggml-qnn/backend-ops.cpp
@@ -92,10 +92,10 @@ qnn::ggml_tensor_array_t to_ggml_tensor_array(const std::array
-template <size_t _InputSize, size_t _OutputSize>
+template <size_t _InputSize>
 bool execute_graph(qnn::ggml_qnn_graph *graph, const std::array<ggml_tensor *, _InputSize> &inputs,
-                   const std::array<ggml_tensor *, _OutputSize> &outputs) {
-    if (!graph->execute(to_ggml_tensor_array<_InputSize>(inputs), to_ggml_tensor_array<_OutputSize>(outputs))) {
+                   ggml_tensor *output) {
+    if (!graph->execute(to_ggml_tensor_array<_InputSize>(inputs), to_ggml_tensor_array<1>({output}))) {
         QNN_LOG_WARN("execute failed\n");
         return false;
     }
@@ -154,37 +154,37 @@ constexpr const char *kGgmlOpToQnnOp[] = {
     nullptr, // GGML_OP_MUL_MAT_ID
     nullptr, // GGML_OP_OUT_PROD

-    nullptr, // GGML_OP_SCALE
-    nullptr, // GGML_OP_SET
-    nullptr, // GGML_OP_CPY
-    nullptr, // GGML_OP_CONT
-    nullptr, // GGML_OP_RESHAPE
-    nullptr, // GGML_OP_VIEW
-    nullptr, // GGML_OP_PERMUTE
-    nullptr, // GGML_OP_TRANSPOSE
-    nullptr, // GGML_OP_GET_ROWS
-    nullptr, // GGML_OP_GET_ROWS_BACK
-    nullptr, // GGML_OP_DIAG
-    nullptr, // GGML_OP_DIAG_MASK_INF
-    nullptr, // GGML_OP_DIAG_MASK_ZERO
-    nullptr, // GGML_OP_SOFT_MAX
-    nullptr, // GGML_OP_SOFT_MAX_BACK
-    nullptr, // GGML_OP_ROPE
-    nullptr, // GGML_OP_ROPE_BACK
-    nullptr, // GGML_OP_CLAMP
-    nullptr, // GGML_OP_CONV_TRANSPOSE_1D
-    nullptr, // GGML_OP_IM2COL
-    nullptr, // GGML_OP_IM2COL_BACK
-    nullptr, // GGML_OP_CONV_TRANSPOSE_2D
-    nullptr, // GGML_OP_POOL_1D
-    nullptr, // GGML_OP_POOL_2D
-    nullptr, // GGML_OP_POOL_2D_BACK
-    nullptr, // GGML_OP_UPSCALE
-    nullptr, // GGML_OP_PAD
-    nullptr, // GGML_OP_ARANGE
-    nullptr, // GGML_OP_TIMESTEP_EMBEDDING
-    nullptr, // GGML_OP_ARGSORT
-    nullptr, // GGML_OP_LEAKY_RELU
+    nullptr,          // GGML_OP_SCALE
+    nullptr,          // GGML_OP_SET
+    nullptr,          // GGML_OP_CPY
+    nullptr,          // GGML_OP_CONT
+    nullptr,          // GGML_OP_RESHAPE
+    nullptr,          // GGML_OP_VIEW
+    QNN_OP_TRANSPOSE, // GGML_OP_PERMUTE
+    nullptr,          // GGML_OP_TRANSPOSE
+    nullptr,          // GGML_OP_GET_ROWS
+    nullptr,          // GGML_OP_GET_ROWS_BACK
+    nullptr,          // GGML_OP_DIAG
+    nullptr,          // GGML_OP_DIAG_MASK_INF
+    nullptr,          // GGML_OP_DIAG_MASK_ZERO
+    nullptr,          // GGML_OP_SOFT_MAX
+    nullptr,          // GGML_OP_SOFT_MAX_BACK
+    nullptr,          // GGML_OP_ROPE
+    nullptr,          // GGML_OP_ROPE_BACK
+    nullptr,          // GGML_OP_CLAMP
+    nullptr,          // GGML_OP_CONV_TRANSPOSE_1D
+    nullptr,          // GGML_OP_IM2COL
+    nullptr,          // GGML_OP_IM2COL_BACK
+    nullptr,          // GGML_OP_CONV_TRANSPOSE_2D
+    nullptr,          // GGML_OP_POOL_1D
+    nullptr,          // GGML_OP_POOL_2D
+    nullptr,          // GGML_OP_POOL_2D_BACK
+    nullptr,          // GGML_OP_UPSCALE
+    nullptr,          // GGML_OP_PAD
+    nullptr,          // GGML_OP_ARANGE
+    nullptr,          // GGML_OP_TIMESTEP_EMBEDDING
+    nullptr,          // GGML_OP_ARGSORT
+    nullptr,          // GGML_OP_LEAKY_RELU

     nullptr, // GGML_OP_FLASH_ATTN_EXT
     nullptr, // GGML_OP_FLASH_ATTN_BACK
@@ -235,16 +235,16 @@ static_assert(sizeof(kGgmlOpToQnnOp) / sizeof(kGgmlOpToQnnOp[0]) == (GGML_OP_COU
 static_assert(kGgmlOpToQnnOp[GGML_UNARY_OP_GELU + kGgmlUnaryOpStart] != nullptr,
               "GGML_UNARY_OP_GELU does not correspond to QNN_OP_GELU");

-template <size_t _InputSize, size_t _OutputSize>
+template <size_t _InputSize>
 qnn::ggml_qnn_graph *get_qnn_graph_from_cache(ggml_backend_qnn_device_context *ctx, size_t op,
                                               const std::array<ggml_tensor *, _InputSize> &inputs,
-                                              const std::array<ggml_tensor *, _OutputSize> &outputs) {
+                                              ggml_tensor *output) {
     GGML_ASSERT(op < (GGML_OP_COUNT + GGML_UNARY_OP_COUNT));
     auto &graph_cache = ctx->qnn_graph_cache;
     const auto *op_name =
         op < kGgmlUnaryOpStart ? ggml_op_name(ggml_op(op)) : ggml_unary_op_name(ggml_unary_op(op - kGgmlUnaryOpStart));
-    auto graph_key = get_graph_key<_InputSize, _OutputSize>(op_name, inputs, outputs);
+    auto graph_key = get_graph_key<_InputSize, 1>(op_name, inputs, {output});
     auto it = graph_cache.find(graph_key);
     qnn::ggml_qnn_graph *graph_ptr = nullptr;
     if (it != graph_cache.end()) {
@@ -259,7 +259,7 @@ qnn::ggml_qnn_graph *get_qnn_graph_from_cache(ggml_backend_qnn_device_context *c
         auto op_constructor = qnn::create_op_constructor(kGgmlOpToQnnOp[op]);
         if (!graph->build_graph(op_constructor, to_ggml_tensor_array<_InputSize>(inputs),
-                                to_ggml_tensor_array<_OutputSize>(outputs))) {
+                                to_ggml_tensor_array<1>({output}))) {
             QNN_LOG_ERROR("build_graph failed\n");
             return nullptr;
         }
@@ -278,9 +278,9 @@ bool qnn_binary_op_impl(ggml_backend_qnn_device_context *ctx, ggml_tensor *src0,
     CHECK_PARAMS(ctx, src0, src1, dst);

     bool succeed = false;
-    auto *graph_ptr = get_qnn_graph_from_cache<2, 1>(ctx, _GgmlOp, { src0, src1 }, { dst });
+    auto *graph_ptr = get_qnn_graph_from_cache<2>(ctx, _GgmlOp, {src0, src1}, dst);
     if (graph_ptr) {
-        succeed = execute_graph<2, 1>(graph_ptr, { src0, src1 }, { dst });
+        succeed = execute_graph<2>(graph_ptr, {src0, src1}, dst);
     }

 #ifndef NDEBUG
@@ -301,9 +301,9 @@ bool qnn_unary_op_impl(ggml_backend_qnn_device_context *ctx, ggml_tensor *src, g
     CHECK_PARAMS(ctx, src, dst);

     bool succeed = false;
-    auto *graph_ptr = get_qnn_graph_from_cache<1, 1>(ctx, _GgmlOp, { src }, { dst });
+    auto *graph_ptr = get_qnn_graph_from_cache<1>(ctx, _GgmlOp, {src}, dst);
     if (graph_ptr) {
-        succeed = execute_graph<1, 1>(graph_ptr, { src }, { dst });
+        succeed = execute_graph<1>(graph_ptr, {src}, dst);
     }

 #ifndef NDEBUG
@@ -315,6 +315,22 @@ bool qnn_unary_op_impl(ggml_backend_qnn_device_context *ctx, ggml_tensor *src, g
     return succeed;
 }
+
+bool qnn_unary_nop_impl(ggml_backend_qnn_device_context *ctx, ggml_tensor *src, ggml_tensor *dst) {
+    GGML_UNUSED(ctx);
+    GGML_UNUSED(src);
+    GGML_UNUSED(dst);
+    return true;
+}
+
+bool qnn_binary_nop_impl(ggml_backend_qnn_device_context *ctx, ggml_tensor *src0, ggml_tensor *src1, ggml_tensor *dst) {
+    GGML_UNUSED(ctx);
+    GGML_UNUSED(src0);
+    GGML_UNUSED(src1);
+    GGML_UNUSED(dst);
+    return true;
+}
+

 constexpr const ggml_qnn_unary_op_t kQnnUnaryOpsTable[] = {
     nullptr, // GGML_OP_NONE
     nullptr, // GGML_OP_DUP
@@ -347,37 +363,37 @@ constexpr const ggml_qnn_unary_op_t kQnnUnaryOpsTable[] = {
     nullptr, // GGML_OP_MUL_MAT_ID
     nullptr, // GGML_OP_OUT_PROD

-    nullptr, // GGML_OP_SCALE
-    nullptr, // GGML_OP_SET
-    nullptr, // GGML_OP_CPY
-    nullptr, // GGML_OP_CONT
-    nullptr, // GGML_OP_RESHAPE
-    nullptr, // GGML_OP_VIEW
-    nullptr, // GGML_OP_PERMUTE
-    nullptr, // GGML_OP_TRANSPOSE
-    nullptr, // GGML_OP_GET_ROWS
-    nullptr, // GGML_OP_GET_ROWS_BACK
-    nullptr, // GGML_OP_DIAG
-    nullptr, // GGML_OP_DIAG_MASK_INF
-    nullptr, // GGML_OP_DIAG_MASK_ZERO
-    nullptr, // GGML_OP_SOFT_MAX
-    nullptr, // GGML_OP_SOFT_MAX_BACK
-    nullptr, // GGML_OP_ROPE
-    nullptr, // GGML_OP_ROPE_BACK
-    nullptr, // GGML_OP_CLAMP
-    nullptr, // GGML_OP_CONV_TRANSPOSE_1D
-    nullptr, // GGML_OP_IM2COL
-    nullptr, // GGML_OP_IM2COL_BACK
-    nullptr, // GGML_OP_CONV_TRANSPOSE_2D
-    nullptr, // GGML_OP_POOL_1D
-    nullptr, // GGML_OP_POOL_2D
-    nullptr, // GGML_OP_POOL_2D_BACK
-    nullptr, // GGML_OP_UPSCALE
-    nullptr, // GGML_OP_PAD
-    nullptr, // GGML_OP_ARANGE
-    nullptr, // GGML_OP_TIMESTEP_EMBEDDING
-    nullptr, // GGML_OP_ARGSORT
-    nullptr, // GGML_OP_LEAKY_RELU
+    nullptr,            // GGML_OP_SCALE
+    nullptr,            // GGML_OP_SET
+    nullptr,            // GGML_OP_CPY
+    nullptr,            // GGML_OP_CONT
+    nullptr,            // GGML_OP_RESHAPE
+    qnn_unary_nop_impl, // GGML_OP_VIEW
+    qnn_unary_op_impl,  // GGML_OP_PERMUTE
+    nullptr,            // GGML_OP_TRANSPOSE
+    qnn_unary_nop_impl, // GGML_OP_GET_ROWS
+    nullptr,            // GGML_OP_GET_ROWS_BACK
+    nullptr,            // GGML_OP_DIAG
+    nullptr,            // GGML_OP_DIAG_MASK_INF
+    nullptr,            // GGML_OP_DIAG_MASK_ZERO
+    nullptr,            // GGML_OP_SOFT_MAX
+    nullptr,            // GGML_OP_SOFT_MAX_BACK
+    nullptr,            // GGML_OP_ROPE
+    nullptr,            // GGML_OP_ROPE_BACK
+    nullptr,            // GGML_OP_CLAMP
+    nullptr,            // GGML_OP_CONV_TRANSPOSE_1D
+    nullptr,            // GGML_OP_IM2COL
+    nullptr,            // GGML_OP_IM2COL_BACK
+    nullptr,            // GGML_OP_CONV_TRANSPOSE_2D
+    nullptr,            // GGML_OP_POOL_1D
+    nullptr,            // GGML_OP_POOL_2D
+    nullptr,            // GGML_OP_POOL_2D_BACK
+    nullptr,            // GGML_OP_UPSCALE
+    nullptr,            // GGML_OP_PAD
+    nullptr,            // GGML_OP_ARANGE
+    nullptr,            // GGML_OP_TIMESTEP_EMBEDDING
+    nullptr,            // GGML_OP_ARGSORT
+    nullptr,            // GGML_OP_LEAKY_RELU

     nullptr, // GGML_OP_FLASH_ATTN_EXT
     nullptr, // GGML_OP_FLASH_ATTN_BACK
@@ -522,18 +538,24 @@ static_assert(sizeof(kQnnBinaryOpsTable) / sizeof(kQnnBinaryOpsTable[0]) == GGML
               "GGML_OP_COUNT does not match the size of the kQnnBinaryOpsTable table");

 bool ggml_qnn_supports_tensor(ggml_backend_qnn_device_context *ctx, const ggml_tensor *tensor) {
+    if (!tensor) {
+        QNN_LOG_DEBUG("tensor is nullptr");
+        return false;
+    }
+
+    auto *type_name = ggml_get_type_traits(tensor->type)->type_name;
     switch (tensor->type) {
         case GGML_TYPE_F32:
         case GGML_TYPE_F16:
         case GGML_TYPE_Q8_0:
         case GGML_TYPE_Q4_0:
             if (ctx->supported_types.find(tensor->type) == ctx->supported_types.end()) {
-                QNN_LOG_DEBUG("unsupported data type GGML_TYPE_F16 for cpu backend");
+                QNN_LOG_DEBUG("unsupported data type %s for backend %d", type_name, (int)ctx->device);
                 return false;
             }
             break;
         default:
-            QNN_LOG_DEBUG("unsupported data type %d", tensor->type);
+ QNN_LOG_DEBUG("unsupported data type %s", type_name); return false; } @@ -591,19 +613,15 @@ bool ggml_qnn_supports_op(ggml_backend_qnn_device_context *ctx, const ggml_tenso } } else { if (!kQnnUnaryOpsTable[op->op] && !kQnnBinaryOpsTable[op->op]) { - QNN_LOG_DEBUG("unsupported op %d", op->op); + QNN_LOG_DEBUG("[%s] unsupported op", ggml_op_name(op->op)); return false; } auto *src0 = op->src[0]; auto *src1 = op->src[1]; - if (!src0 || !src1) { - QNN_LOG_DEBUG("src0 or src1 is nullptr"); - return false; - } - - if (!ggml_qnn_supports_tensor(ctx, src0) || !ggml_qnn_supports_tensor(ctx, src1) || - !ggml_qnn_supports_tensor(ctx, op)) { + if (!ggml_qnn_supports_tensor(ctx, src0) || !ggml_qnn_supports_tensor(ctx, op) || + (kQnnBinaryOpsTable[op->op] && !ggml_qnn_supports_tensor(ctx, src1))) { + QNN_LOG_DEBUG("[%s] unsupported tensor", ggml_op_name(op->op)); return false; } @@ -642,7 +660,7 @@ bool ggml_qnn_forward(ggml_backend_qnn_device_context *ctx, struct ggml_tensor * return binary_op(ctx, tensor->src[0], tensor->src[1], tensor); } - QNN_LOG_WARN("unsupported op %s", ggml_op_desc(tensor)); + QNN_LOG_WARN("[forward]unsupported op %s", ggml_op_desc(tensor)); return false; } diff --git a/ggml/src/ggml-qnn/op-config.cpp b/ggml/src/ggml-qnn/op-config.cpp index 07dcba1564..9b98051adf 100644 --- a/ggml/src/ggml-qnn/op-config.cpp +++ b/ggml/src/ggml-qnn/op-config.cpp @@ -7,10 +7,10 @@ namespace { constexpr const qnn::qnn_dimension_array_t kTransposeParamData[GGML_MAX_DIMS] = { - { 0 }, - { 1, 0 }, - { 0, 2, 1 }, - { 0, 1, 3, 2 }, + {0}, + {1, 0}, + {0, 2, 1}, + {0, 1, 3, 2}, }; qnn::qnn_dimension_array_t get_transposed_dimensions(const qnn::qnn_dimension_array_t &dimensions, int rank) { @@ -96,9 +96,8 @@ bool bind_tensors(const qnn::ggml_tensor_array_t &ggml_tensors, qnn::ggml_qnn_te class ggml_qnn_connectable_op_config : public qnn::ggml_qnn_op_config_base { public: explicit ggml_qnn_connectable_op_config(const std::string &name, const std::string &package_name, - const std::string &op_type, - std::shared_ptr qnn_instance) : - ggml_qnn_op_config_base(name, package_name, op_type, qnn_instance) {} + const std::string &op_type, std::shared_ptr qnn_instance) + : ggml_qnn_op_config_base(name, package_name, op_type, qnn_instance) {} bool create_tensors(QNNBackend device, Qnn_GraphHandle_t graph_handle, const qnn::ggml_tensor_array_t &tensor_inputs, @@ -264,11 +263,22 @@ bool ggml_qnn_single_op_config::create_tensors(QNNBackend device, Qnn_GraphHandl const ggml_tensor_array_t &tensor_inputs, const ggml_tensor_array_t &tensor_outputs) { const auto tensor_rank = get_rank(tensor_inputs, tensor_outputs); - tensor_common_params params = { "src", tensor_rank, true, device, graph_handle, _qnn_instance }; + tensor_common_params params = {"src", tensor_rank, true, device, graph_handle, _qnn_instance}; create_tensors_from_ggml_tensor(params, tensor_inputs, &_tensor_inputs, &_qnn_tensor_inputs); params.name_prefix = "dst"; params.is_input = false; create_tensors_from_ggml_tensor(params, tensor_outputs, &_tensor_outputs, &_qnn_tensor_outputs); + + if (_param_buffer.size() > 0) { + // handle parameters in output tensor + auto *params = tensor_outputs.front()->op_params; + memcpy(_param_buffer.data(), params, _param_buffer.size()); + + const uint32_t count = uint32_t(_param_buffer.size() / qnn_datatype_size(_param_type)); + const qnn_dimension_array_t param_dims = {count, 1, 1, 1}; + add_tensor_param(_param_name, param_dims, 1, _param_buffer.data(), _param_type, device, graph_handle); + } + return true; } @@ -281,7 
@@ -281,7 +291,7 @@ bool ggml_qnn_matmul_op_config::create_tensors(QNNBackend device, Qnn_GraphHandl
     GGML_ASSERT(tensor_rank >= 2);

     // create input tensors
-    tensor_common_params params = { "src", tensor_rank, true, device, graph_handle, _qnn_instance };
+    tensor_common_params params = {"src", tensor_rank, true, device, graph_handle, _qnn_instance};
     create_tensors_from_ggml_tensor(params, tensor_inputs, &_tensor_inputs, &_qnn_tensor_inputs);

     // create output tensor
@@ -290,8 +300,49 @@ bool ggml_qnn_matmul_op_config::create_tensors(QNNBackend device, Qnn_GraphHandl
     params.is_input = false;
     create_tensors_from_ggml_tensor(params, tensor_outputs, &mat_mul_tensor_outputs, nullptr);

+    if (device == QNN_BACKEND_GPU) {
+        // there's no convert op for GPU, so we should create matmul nodes directly.
+        return create_mat_mul_nodes(device, graph_handle, tensor_rank, _tensor_inputs, mat_mul_tensor_outputs);
+    }
+
+    // create tensors for convert node
+    ggml_qnn_tensor_array_t mat_mul_tensor_inputs = _tensor_inputs;
+    auto input_tensor_type = get_tensor_type(mat_mul_tensor_inputs);
+    QNN_LOG_DEBUG("matmul input tensor type: %s\n", qnn_datatype_to_string(input_tensor_type));
+
+    _input_converts.resize(mat_mul_tensor_inputs.size());
+    for (size_t i = 0; i < mat_mul_tensor_inputs.size(); ++i) {
+        // create input convert nodes
+        std::string convert_name("convert_src" + std::to_string(i));
+        auto convert_in = mat_mul_tensor_inputs[i];
+        auto convert_out = std::make_shared(ggml_qnn_tensor::INTERMEDIATE, convert_name + "_out",
+                                            convert_in->get_dimensions(), input_tensor_type,
+                                            tensor_rank, device, graph_handle, _qnn_instance);
+        auto convert = std::make_shared(convert_name, QNN_OP_PACKAGE_NAME_QTI_AISW,
+                                        QNN_OP_CONVERT, _qnn_instance);
+        convert->set_input_tensors({convert_in});
+        convert->set_output_tensors({convert_out});
+        mat_mul_tensor_inputs[i] = convert_out;
+        _input_converts[i] = convert;
+    }
+
+    {
+        // create output convert node
+        std::string convert_name("convert_dst");
+        auto convert_out = mat_mul_tensor_outputs.front();
+        auto convert_in = std::make_shared(ggml_qnn_tensor::INTERMEDIATE, convert_name + "_in",
+                                           convert_out->get_dimensions(), input_tensor_type,
+                                           tensor_rank, device, graph_handle, _qnn_instance);
+        auto output_convert = std::make_shared(
+            convert_name, QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_CONVERT, _qnn_instance);
+        output_convert->set_input_tensors({convert_in});
+        output_convert->set_output_tensors({convert_out});
+        mat_mul_tensor_outputs[0] = convert_in;
+        _output_convert = output_convert;
+    }
+
     // create mat_mul nodes
-    return create_mat_mul_nodes(device, graph_handle, tensor_rank, _tensor_inputs, mat_mul_tensor_outputs);
+    return create_mat_mul_nodes(device, graph_handle, tensor_rank, mat_mul_tensor_inputs, mat_mul_tensor_outputs);
 }

 bool ggml_qnn_matmul_op_config::create_mat_mul_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, const int rank,
@@ -371,7 +422,7 @@ bool ggml_qnn_matmul_op_config::create_mat_mul_nodes(QNNBackend device, Qnn_Grap
     // set transpose0 parameters
     auto *params_data = reinterpret_cast(kTransposeParamData[rank - 1].data());
-    const qnn_dimension_array_t param_dims = { (uint32_t)rank, 1, 1, 1 };
+    const qnn_dimension_array_t param_dims = {(uint32_t)rank, 1, 1, 1};
     transpose0->add_tensor_param(QNN_OP_TRANSPOSE_PARAM_PERM, param_dims, 1, params_data, QNN_DATATYPE_UINT_32, device,
                                  graph_handle);

@@ -380,19 +431,19 @@ bool ggml_qnn_matmul_op_config::create_mat_mul_nodes(QNNBackend device, Qnn_Grap
                                  graph_handle);

     // set tensor to transpose0
-    ggml_qnn_tensor_array_t tensors = { tensor_inputs.back() };
+    ggml_qnn_tensor_array_t tensors = {tensor_inputs.back()};
     transpose0->set_input_tensors(tensors);
-    tensors = { src0_trans };
+    tensors = {src0_trans};
     transpose0->set_output_tensors(tensors);

     // set tensor to mat_mul
-    tensors = { tensor_inputs.front(), src0_trans };
+    tensors = {tensor_inputs.front(), src0_trans};
     mat_mul->set_input_tensors(tensors);
-    tensors = { dst_trans };
+    tensors = {dst_trans};
     mat_mul->set_output_tensors(tensors);

     // set tensor to transpose1
-    tensors = { dst_trans };
+    tensors = {dst_trans};
     transpose1->set_input_tensors(tensors);
     transpose1->set_output_tensors(tensor_outputs);

@@ -459,6 +510,13 @@ ggml_op_constructor_t create_op_constructor(const std::string &op_name) {
             QNN_LOG_DEBUG("create QNN_OP_MAT_MUL, name %s\n", instance_name.c_str());
             return std::make_unique(instance_name, qnn_instance);
         };
+    } else if (op_name == QNN_OP_TRANSPOSE) {
+        return [](const std::string &instance_name,
+                  std::shared_ptr qnn_instance) -> std::unique_ptr {
+            return std::make_unique(instance_name, QNN_OP_PACKAGE_NAME_QTI_AISW,
+                                    QNN_OP_TRANSPOSE, QNN_OP_TRANSPOSE_PARAM_PERM,
+                                    QNN_DATATYPE_UINT_32, 4 * sizeof(uint32_t), qnn_instance);
+        };
     }

     return [op_name](const std::string &instance_name,
diff --git a/ggml/src/ggml-qnn/op-config.hpp b/ggml/src/ggml-qnn/op-config.hpp
index 2016cb4ac9..4ec7aac9b2 100644
--- a/ggml/src/ggml-qnn/op-config.hpp
+++ b/ggml/src/ggml-qnn/op-config.hpp
@@ -30,11 +30,16 @@ public:
     virtual void unbind_output_tensors() = 0;
 };

+using ggml_op_constructor_t =
+    std::function(const std::string &, std::shared_ptr)>;
+
+ggml_op_constructor_t create_op_constructor(const std::string &op_name);
+
 class ggml_qnn_op_config_base : public ggml_qnn_op_config {
 public:
     explicit ggml_qnn_op_config_base(const std::string &name, const std::string &package_name,
-                                     const std::string &op_type, std::shared_ptr qnn_instance) :
-        _name(name), _package_name(package_name), _op_type(op_type), _qnn_instance(qnn_instance) {}
+                                     const std::string &op_type, std::shared_ptr qnn_instance)
+        : _name(name), _package_name(package_name), _op_type(op_type), _qnn_instance(qnn_instance) {}

     void add_scalar_param(const std::string &name, const Qnn_Scalar_t scalar);
     bool add_tensor_param(const std::string &name, const qnn_dimension_array_t &dimensions, int rank,
@@ -70,21 +75,34 @@ protected:
 class ggml_qnn_single_op_config : public ggml_qnn_op_config_base {
 public:
     explicit ggml_qnn_single_op_config(const std::string &name, const std::string &package_name,
-                                       const std::string &op_type, std::shared_ptr qnn_instance) :
-        ggml_qnn_op_config_base(name, package_name, op_type, qnn_instance) {}
+                                       const std::string &op_type, std::shared_ptr qnn_instance)
+        : ggml_qnn_op_config_base(name, package_name, op_type, qnn_instance) {}
+
+    explicit ggml_qnn_single_op_config(const std::string &name, const std::string &package_name,
+                                       const std::string &op_type, const std::string &param_name,
+                                       const Qnn_DataType_t param_type, const size_t param_size,
+                                       std::shared_ptr qnn_instance)
+        : ggml_qnn_op_config_base(name, package_name, op_type, qnn_instance),
+          _param_name(param_name),
+          _param_type(param_type),
+          _param_buffer(param_size) {}

     bool create_tensors(QNNBackend device, Qnn_GraphHandle_t graph_handle, const ggml_tensor_array_t &tensor_inputs,
                         const ggml_tensor_array_t &tensor_outputs) override;

 private:
+    const std::string _param_name;
+    const Qnn_DataType_t _param_type = QNN_DATATYPE_UINT_32;
+    std::vector _param_buffer;
+
     DISABLE_COPY(ggml_qnn_single_op_config);
     DISABLE_MOVE(ggml_qnn_single_op_config);
 };

 class ggml_qnn_matmul_op_config : public ggml_qnn_op_config {
 public:
-    ggml_qnn_matmul_op_config(const std::string &name, std::shared_ptr qnn_instance) :
-        _name(name), _qnn_instance(qnn_instance) {}
+    ggml_qnn_matmul_op_config(const std::string &name, std::shared_ptr qnn_instance)
+        : _name(name), _qnn_instance(qnn_instance) {}

     bool create_tensors(QNNBackend device, Qnn_GraphHandle_t graph_handle, const ggml_tensor_array_t &tensor_inputs,
                         const ggml_tensor_array_t &tensor_outputs) override;
@@ -114,9 +132,4 @@ private:
     DISABLE_MOVE(ggml_qnn_matmul_op_config);
 };

-using ggml_op_constructor_t =
-    std::function(const std::string &, std::shared_ptr)>;
-
-ggml_op_constructor_t create_op_constructor(const std::string &op_name);
-
 } // namespace qnn
diff --git a/ggml/src/ggml-qnn/tensor.hpp b/ggml/src/ggml-qnn/tensor.hpp
index faf5b0df5f..f28fc8e2ca 100644
--- a/ggml/src/ggml-qnn/tensor.hpp
+++ b/ggml/src/ggml-qnn/tensor.hpp
@@ -257,6 +257,7 @@ private:
     DISABLE_MOVE(ggml_qnn_tensor);
 };

+using ggml_qnn_tensor_ptr_t = std::shared_ptr;
 using ggml_qnn_tensor_array_t = std::vector>;

 } // namespace qnn