diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index 6bd9006851..1ed01bfd68 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -11,12 +11,10 @@ #include "tensor.hpp" #include "utils.hpp" -#ifndef NDEBUG - namespace { -bool qnn_is_valid_params(ggml_backend_qnn_device_context *ctx, const ggml_tensor *src, ggml_tensor *dst) { - if (!ctx || !src || !dst) { +bool qnn_is_op_valid(ggml_backend_qnn_device_context *ctx, const ggml_tensor *dst) { + if (!ctx || !dst) { QNN_LOG_WARN("invalid params"); return false; } @@ -27,77 +25,36 @@ bool qnn_is_valid_params(ggml_backend_qnn_device_context *ctx, const ggml_tensor return false; } - return true; -} - -bool qnn_is_valid_params(ggml_backend_qnn_device_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst) { - if (!ctx || !src0 || !src1 || !dst) { - QNN_LOG_WARN("invalid params"); - return false; + const auto param_count = qnn::get_qnn_op_input_param_count(qnn::get_qnn_op_index(dst)); + switch (param_count) { + case 1: + return dst->src[0]; + case 2: + return dst->src[0] && dst->src[1]; + default: + QNN_LOG_WARN("invalid op param count %d", (int)param_count); + break; } - auto instance = ctx->instance; - if (!instance) { - QNN_LOG_WARN("invalid instance"); - return false; - } - - return true; + return false; } +#ifndef NDEBUG void print_ggml_tensor(const ggml_tensor *tensor) { QNN_LOG_DEBUG("%s: type:%s ne: %ldx%ldx%ldx%ld, nb: %ldx%ldx%ldx%ld", tensor->name, ggml_type_name(tensor->type), (long)tensor->ne[0], (long)tensor->ne[1], (long)tensor->ne[2], (long)tensor->ne[3], (long)tensor->nb[0], (long)tensor->nb[1], (long)tensor->nb[2], (long)tensor->nb[3]); } +#endif } // namespace -#define CHECK_PARAMS(ctx, ...) \ - if (!qnn_is_valid_params((ctx), __VA_ARGS__)) { \ - return false; \ - } - -#else -#define CHECK_PARAMS(ctx, ...) 
-#endif - namespace { -bool is_tensor_dimensions_equal(const ggml_tensor *l, const ggml_tensor *r) { - const auto dim_l = ggml_n_dims(l); - if (dim_l != ggml_n_dims(r)) { - return false; - } +typedef bool (*ggml_qnn_op_t)(ggml_backend_qnn_device_context *ctx, ggml_tensor *dst); - for (int i = 0; i < dim_l; i++) { - if (l->ne[i] != r->ne[i]) { - return false; - } - } - - return true; -} - -typedef bool (*ggml_qnn_unary_op_t)(ggml_backend_qnn_device_context *ctx, ggml_tensor *src, ggml_tensor *dst); -typedef bool (*ggml_qnn_binary_op_t)(ggml_backend_qnn_device_context *ctx, ggml_tensor *src0, ggml_tensor *src1, - ggml_tensor *dst); - -typedef const ggml_qnn_unary_op_t (&ggml_qnn_unary_op_array_t)[GGML_OP_COUNT + GGML_UNARY_OP_COUNT]; -typedef const ggml_qnn_binary_op_t (&ggml_qnn_binary_op_array_t)[GGML_OP_COUNT]; - -constexpr const size_t kGgmlUnaryOpStart = GGML_OP_COUNT; - -template -qnn::ggml_tensor_array_t to_ggml_tensor_array(const std::array &array) { - return qnn::ggml_tensor_array_t(array.data(), array.data() + _Size); -} - -template -bool execute_graph(qnn::ggml_qnn_graph *graph, const std::array &inputs, - ggml_tensor *output) { - if (!graph->execute(to_ggml_tensor_array<_InputSize>(inputs), to_ggml_tensor_array<1>({output}))) { +bool execute_graph(qnn::qnn_graph *graph, ggml_tensor *output) { + if (!graph->execute(output)) { QNN_LOG_WARN("execute failed"); return false; } @@ -105,165 +62,114 @@ bool execute_graph(qnn::ggml_qnn_graph *graph, const std::array -std::string get_graph_key(const std::string &op_name, const std::array &inputs, - const std::array &outputs) { - constexpr static const auto append_dimensions = [](std::string &key, const ggml_tensor *tensor) { - char buffer[256] = {}; - snprintf(buffer, sizeof(buffer), "_%ldx%ldx%ldx%ld%s", (long)tensor->ne[0], (long)tensor->ne[1], - (long)tensor->ne[2], (long)tensor->ne[3], qnn::get_ggml_type_name(tensor->type)); - key += buffer; - }; - - std::string graph_key(op_name); - for (auto &input : inputs) { - append_dimensions(graph_key, input); +void append_tensor_dimensions(const ggml_tensor *tensor, std::string &output) { + char buffer[256] = {}; + const auto *type_name = qnn::get_ggml_type_name(tensor->type); + int len = 0; + switch (ggml_n_dims(tensor)) { + case 1: + len = snprintf(buffer, sizeof(buffer), "%ld%s", (long)tensor->ne[0], type_name); + break; + case 2: + len = snprintf(buffer, sizeof(buffer), "%ldx%ld%s", (long)tensor->ne[0], (long)tensor->ne[1], type_name); + break; + case 3: + len = snprintf(buffer, sizeof(buffer), "%ldx%ldx%ld%s", (long)tensor->ne[0], (long)tensor->ne[1], + (long)tensor->ne[2], type_name); + break; + case 4: + default: + len = snprintf(buffer, sizeof(buffer), "%ldx%ldx%ldx%ld%s", (long)tensor->ne[0], (long)tensor->ne[1], + (long)tensor->ne[2], (long)tensor->ne[3], type_name); + break; } - - graph_key += qnn::get_ggml_type_name(outputs.front()->type); - return graph_key; + GGML_ASSERT(len > 0 && len < (int)sizeof(buffer)); + output.append(buffer, len); } -constexpr const char *kGgmlOpToQnnOp[] = { - nullptr, // GGML_OP_NONE - nullptr, // GGML_OP_DUP - QNN_OP_ELEMENT_WISE_ADD, // GGML_OP_ADD - nullptr, // GGML_OP_ADD1 - nullptr, // GGML_OP_ACC - QNN_OP_ELEMENT_WISE_SUBTRACT, // GGML_OP_SUB - QNN_OP_ELEMENT_WISE_MULTIPLY, // GGML_OP_MUL - QNN_OP_ELEMENT_WISE_DIVIDE, // GGML_OP_DIV - nullptr, // GGML_OP_SQR - QNN_OP_ELEMENT_WISE_SQUARE_ROOT, // GGML_OP_SQRT - QNN_OP_ELEMENT_WISE_LOG, // GGML_OP_LOG - nullptr, // GGML_OP_SIN - nullptr, // GGML_OP_COS - nullptr, // GGML_OP_SUM - nullptr, // 
GGML_OP_SUM_ROWS - nullptr, // GGML_OP_MEAN - nullptr, // GGML_OP_ARGMAX - nullptr, // GGML_OP_COUNT_EQUAL - nullptr, // GGML_OP_REPEAT - nullptr, // GGML_OP_REPEAT_BACK - nullptr, // GGML_OP_CONCAT - nullptr, // GGML_OP_SILU_BACK - nullptr, // GGML_OP_NORM - nullptr, // GGML_OP_RMS_NORM - nullptr, // GGML_OP_RMS_NORM_BACK - nullptr, // GGML_OP_GROUP_NORM +void get_graph_key_from_op(const ggml_tensor *op, std::string &output) { + GGML_ASSERT(op->op != GGML_OP_NONE); + output += ggml_op_desc(op); + output += qnn::get_ggml_type_name(op->type); + const auto param_count = qnn::get_qnn_op_input_param_count(qnn::get_qnn_op_index(op)); + for (size_t i = 0; i < param_count; ++i) { + auto *input = op->src[i]; + output += '_'; + append_tensor_dimensions(input, output); + } +} - QNN_OP_MAT_MUL, // GGML_OP_MUL_MAT - nullptr, // GGML_OP_MUL_MAT_ID - nullptr, // GGML_OP_OUT_PROD +void get_op_key_with_src_op_desc(const ggml_tensor *op, std::string &output) { + output += ggml_op_desc(op); + output += '('; + if (op->src[0]) { + output += ggml_op_desc(op->src[0]); + } + for (size_t i = 1; i < GGML_MAX_DIMS && op->src[i]; ++i) { + output += ','; + output += ggml_op_desc(op->src[i]); + } + output += ')'; +} - nullptr, // GGML_OP_SCALE - nullptr, // GGML_OP_SET - nullptr, // GGML_OP_CPY - nullptr, // GGML_OP_CONT - nullptr, // GGML_OP_RESHAPE - nullptr, // GGML_OP_VIEW - QNN_OP_TRANSPOSE, // GGML_OP_PERMUTE - nullptr, // GGML_OP_TRANSPOSE - nullptr, // GGML_OP_GET_ROWS - nullptr, // GGML_OP_GET_ROWS_BACK - nullptr, // GGML_OP_DIAG - nullptr, // GGML_OP_DIAG_MASK_INF - nullptr, // GGML_OP_DIAG_MASK_ZERO - nullptr, // GGML_OP_SOFT_MAX - nullptr, // GGML_OP_SOFT_MAX_BACK - nullptr, // GGML_OP_ROPE - nullptr, // GGML_OP_ROPE_BACK - nullptr, // GGML_OP_CLAMP - nullptr, // GGML_OP_CONV_TRANSPOSE_1D - nullptr, // GGML_OP_IM2COL - nullptr, // GGML_OP_IM2COL_BACK - nullptr, // GGML_OP_CONV_TRANSPOSE_2D - nullptr, // GGML_OP_POOL_1D - nullptr, // GGML_OP_POOL_2D - nullptr, // GGML_OP_POOL_2D_BACK - nullptr, // GGML_OP_UPSCALE - nullptr, // GGML_OP_PAD - nullptr, // GGML_OP_PAD_REFLECT_1D - nullptr, // GGML_OP_ARANGE - nullptr, // GGML_OP_TIMESTEP_EMBEDDING - nullptr, // GGML_OP_ARGSORT - nullptr, // GGML_OP_LEAKY_RELU +void get_graph_key_from_cgraph(const ggml_cgraph *cgraph, std::string &output) { + // generate key from the graph, the key is used to cache the graph, like: + // "MUL_MATf32_256x16x10f32_256x1x10f32#LOG#ADD#ADDf32_16x1x10f32" + if (cgraph->n_nodes == 0) { + QNN_LOG_DEBUG("empty cgraph"); + return; + } - nullptr, // GGML_OP_FLASH_ATTN_EXT - nullptr, // GGML_OP_FLASH_ATTN_BACK - nullptr, // GGML_OP_SSM_CONV - nullptr, // GGML_OP_SSM_SCAN - nullptr, // GGML_OP_WIN_PART - nullptr, // GGML_OP_WIN_UNPART - nullptr, // GGML_OP_GET_REL_POS - nullptr, // GGML_OP_ADD_REL_POS - nullptr, // GGML_OP_RWKV_WKV6 + { + bool is_start = true; + for (int i = 0; i < cgraph->n_nodes; ++i) { + auto *op = cgraph->nodes[i]; + if (ggml_is_empty(op)) { + QNN_LOG_DEBUG("empty op in graph, skipping"); + continue; + } - nullptr, // GGML_OP_UNARY + if (op->op == GGML_OP_NONE) { + QNN_LOG_DEBUG("GGML_OP_NONE in graph, skipping"); + continue; + } - nullptr, // GGML_OP_MAP_UNARY - nullptr, // GGML_OP_MAP_BINARY + if (is_start) { + get_graph_key_from_op(cgraph->nodes[0], output); + is_start = false; + } else { + output += '#'; + get_op_key_with_src_op_desc(op, output); + } + } + } - nullptr, // GGML_OP_MAP_CUSTOM1_F32 - nullptr, // GGML_OP_MAP_CUSTOM2_F32 - nullptr, // GGML_OP_MAP_CUSTOM3_F32 - - nullptr, // GGML_OP_MAP_CUSTOM1 - nullptr, 
// GGML_OP_MAP_CUSTOM2 - nullptr, // GGML_OP_MAP_CUSTOM3 - - nullptr, // GGML_OP_CROSS_ENTROPY_LOSS - nullptr, // GGML_OP_CROSS_ENTROPY_LOSS_BACK - nullptr, // GGML_OP_OPT_STEP_ADAMW - - // ggml_unary_op - nullptr, // GGML_UNARY_OP_ABS - nullptr, // GGML_UNARY_OP_SGN - nullptr, // GGML_UNARY_OP_NEG - nullptr, // GGML_UNARY_OP_STEP - nullptr, // GGML_UNARY_OP_TANH - nullptr, // GGML_UNARY_OP_ELU - nullptr, // GGML_UNARY_OP_RELU - nullptr, // GGML_UNARY_OP_SIGMOID - QNN_OP_GELU, // GGML_UNARY_OP_GELU - nullptr, // GGML_UNARY_OP_GELU_QUICK - nullptr, // GGML_UNARY_OP_SILU - nullptr, // GGML_UNARY_OP_HARDSWISH - nullptr, // GGML_UNARY_OP_HARDSIGMOID - nullptr, // GGML_UNARY_OP_EXP -}; - -static_assert(sizeof(kGgmlOpToQnnOp) / sizeof(kGgmlOpToQnnOp[0]) == (GGML_OP_COUNT + GGML_UNARY_OP_COUNT), - "GGML_OP_COUNT does not match the size of the kGgmlOpToQnnOp table"); -static_assert(kGgmlOpToQnnOp[GGML_UNARY_OP_GELU + kGgmlUnaryOpStart] != nullptr, - "GGML_UNARY_OP_GELU does not correspond to QNN_OP_GELU"); - -template -qnn::ggml_qnn_graph *get_qnn_graph_from_cache(ggml_backend_qnn_device_context *ctx, size_t op, - const std::array &inputs, - ggml_tensor *output) { - GGML_ASSERT(op < (GGML_OP_COUNT + GGML_UNARY_OP_COUNT)); + if (cgraph->n_nodes > 1) { + auto *last_op = cgraph->nodes[cgraph->n_nodes - 1]; + output += qnn::get_ggml_type_name(last_op->type); + output += '_'; + append_tensor_dimensions(last_op, output); + } +} +qnn::qnn_graph *get_qnn_graph_from_cache(ggml_backend_qnn_device_context *ctx, ggml_tensor *output) { auto &graph_cache = ctx->qnn_graph_cache; - const auto *op_name = - op < kGgmlUnaryOpStart ? ggml_op_name(ggml_op(op)) : ggml_unary_op_name(ggml_unary_op(op - kGgmlUnaryOpStart)); - auto graph_key = get_graph_key<_InputSize, 1>(op_name, inputs, {output}); + std::string graph_key; + get_graph_key_from_op(output, graph_key); auto it = graph_cache.find(graph_key); - qnn::ggml_qnn_graph *graph_ptr = nullptr; + qnn::qnn_graph *graph_ptr = nullptr; if (it != graph_cache.end()) { QNN_LOG_DEBUG("[%s]found graph %s in cache", qnn::get_backend_name(ctx->device), graph_key.c_str()); graph_ptr = it->second.get(); } else { auto graph = - std::make_unique(graph_key, ctx->device, ctx->instance, ctx->socinfo.vtcm_size_in_mb); + std::make_unique(graph_key, ctx->device, ctx->instance, ctx->socinfo.vtcm_size_in_mb); if (!graph->is_valid()) { return nullptr; } - auto op_constructor = qnn::create_op_constructor(kGgmlOpToQnnOp[op]); - if (!graph->build_graph(op_constructor, to_ggml_tensor_array<_InputSize>(inputs), - to_ggml_tensor_array<1>({output}))) { - QNN_LOG_ERROR("[%s]build_graph failed", qnn::get_backend_name(ctx->device)); + if (!graph->build_graph_from_op(output)) { + QNN_LOG_ERROR("[%s]build_graph_from_op failed", qnn::get_backend_name(ctx->device)); return nullptr; } @@ -274,22 +180,54 @@ qnn::ggml_qnn_graph *get_qnn_graph_from_cache(ggml_backend_qnn_device_context *c return graph_ptr; } -template -bool qnn_binary_op_impl(ggml_backend_qnn_device_context *ctx, ggml_tensor *src0, ggml_tensor *src1, ggml_tensor *dst) { - static_assert(kGgmlOpToQnnOp[_GgmlOp] != nullptr, "GGML_OP does not have a corresponding QNN_OP"); - - CHECK_PARAMS(ctx, src0, src1, dst); - - bool succeed = false; - auto *graph_ptr = get_qnn_graph_from_cache<2>(ctx, _GgmlOp, {src0, src1}, dst); - if (graph_ptr) { - succeed = execute_graph<2>(graph_ptr, {src0, src1}, dst); +qnn::qnn_graph *get_qnn_graph_from_cache(ggml_backend_qnn_device_context *ctx, const ggml_cgraph *cgraph) { + auto &graph_cache = ctx->qnn_graph_cache; 
+ std::string graph_key; + get_graph_key_from_cgraph(cgraph, graph_key); + if (graph_key.empty()) { + QNN_LOG_DEBUG("[%s]empty graph key for cgraph: %p, size: %d", qnn::get_backend_name(ctx->device), cgraph, + (int)cgraph->n_nodes); + return nullptr; } + auto it = graph_cache.find(graph_key); + qnn::qnn_graph *graph_ptr = nullptr; + if (it != graph_cache.end()) { + QNN_LOG_DEBUG("[%s]found graph %s in cache", qnn::get_backend_name(ctx->device), graph_key.c_str()); + graph_ptr = it->second.get(); + } else { + auto graph = + std::make_unique(graph_key, ctx->device, ctx->instance, ctx->socinfo.vtcm_size_in_mb); + if (!graph->is_valid()) { + return nullptr; + } + + if (!graph->build_graph_from_ggml_graph(cgraph)) { + QNN_LOG_ERROR("[%s]build_graph_from_op failed", qnn::get_backend_name(ctx->device)); + return nullptr; + } + + graph_ptr = graph.get(); + graph_cache[graph_key] = std::move(graph); + } + + return graph_ptr; +} + +bool qnn_generic_op_impl(ggml_backend_qnn_device_context *ctx, ggml_tensor *dst) { + if (!qnn_is_op_valid(ctx, dst)) { + return false; + } + + auto *graph_ptr = get_qnn_graph_from_cache(ctx, dst); + bool succeed = graph_ptr && execute_graph(graph_ptr, dst); + #ifndef NDEBUG if (!succeed) { - print_ggml_tensor(src0); - print_ggml_tensor(src1); + const auto param_count = qnn::get_qnn_op_input_param_count(qnn::get_qnn_op_index(dst)); + for (size_t i = 0; i < param_count; ++i) { + print_ggml_tensor(dst->src[i]); + } print_ggml_tensor(dst); } #endif @@ -297,219 +235,76 @@ bool qnn_binary_op_impl(ggml_backend_qnn_device_context *ctx, ggml_tensor *src0, return succeed; } -template -bool qnn_unary_op_impl(ggml_backend_qnn_device_context *ctx, ggml_tensor *src, ggml_tensor *dst) { - static_assert(kGgmlOpToQnnOp[_GgmlOp] != nullptr, "GGML_OP does not have a corresponding QNN_OP"); - - CHECK_PARAMS(ctx, src, dst); - - bool succeed = false; - auto *graph_ptr = get_qnn_graph_from_cache<1>(ctx, _GgmlOp, {src}, dst); - if (graph_ptr) { - succeed = execute_graph<1>(graph_ptr, {src}, dst); - } - -#ifndef NDEBUG - if (!succeed) { - print_ggml_tensor(src); - print_ggml_tensor(dst); - } -#endif - - return succeed; -} - -bool qnn_unary_nop_impl(ggml_backend_qnn_device_context *ctx, ggml_tensor *src, ggml_tensor *dst) { +bool qnn_nop_impl(ggml_backend_qnn_device_context *ctx, ggml_tensor *dst) { GGML_UNUSED(ctx); - GGML_UNUSED(src); GGML_UNUSED(dst); return true; } -bool qnn_binary_nop_impl(ggml_backend_qnn_device_context *ctx, ggml_tensor *src0, ggml_tensor *src1, ggml_tensor *dst) { - GGML_UNUSED(ctx); - GGML_UNUSED(src0); - GGML_UNUSED(src1); - GGML_UNUSED(dst); - return true; -} +constexpr const ggml_qnn_op_t kQnnOpsTable[] = { + qnn_nop_impl, // GGML_OP_NONE + nullptr, // GGML_OP_DUP + qnn_generic_op_impl, // GGML_OP_ADD + nullptr, // GGML_OP_ADD1 + nullptr, // GGML_OP_ACC + qnn_generic_op_impl, // GGML_OP_SUB + qnn_generic_op_impl, // GGML_OP_MUL + qnn_generic_op_impl, // GGML_OP_DIV + nullptr, // GGML_OP_SQR + qnn_generic_op_impl, // GGML_OP_SQRT + qnn_generic_op_impl, // GGML_OP_LOG + nullptr, // GGML_OP_SIN + nullptr, // GGML_OP_COS + nullptr, // GGML_OP_SUM + nullptr, // GGML_OP_SUM_ROWS + nullptr, // GGML_OP_MEAN + nullptr, // GGML_OP_ARGMAX + nullptr, // GGML_OP_COUNT_EQUAL + nullptr, // GGML_OP_REPEAT + nullptr, // GGML_OP_REPEAT_BACK + nullptr, // GGML_OP_CONCAT + nullptr, // GGML_OP_SILU_BACK + nullptr, // GGML_OP_NORM + nullptr, // GGML_OP_RMS_NORM + nullptr, // GGML_OP_RMS_NORM_BACK + nullptr, // GGML_OP_GROUP_NORM -constexpr const ggml_qnn_unary_op_t kQnnUnaryOpsTable[] = { - 
qnn_unary_nop_impl, // GGML_OP_NONE - nullptr, // GGML_OP_DUP - nullptr, // GGML_OP_ADD - nullptr, // GGML_OP_ADD1 - nullptr, // GGML_OP_ACC - nullptr, // GGML_OP_SUB - nullptr, // GGML_OP_MUL - nullptr, // GGML_OP_DIV - nullptr, // GGML_OP_SQR - qnn_unary_op_impl, // GGML_OP_SQRT - qnn_unary_op_impl, // GGML_OP_LOG - nullptr, // GGML_OP_SIN - nullptr, // GGML_OP_COS - nullptr, // GGML_OP_SUM - nullptr, // GGML_OP_SUM_ROWS - nullptr, // GGML_OP_MEAN - nullptr, // GGML_OP_ARGMAX - nullptr, // GGML_OP_COUNT_EQUAL - nullptr, // GGML_OP_REPEAT - nullptr, // GGML_OP_REPEAT_BACK - nullptr, // GGML_OP_CONCAT - nullptr, // GGML_OP_SILU_BACK - nullptr, // GGML_OP_NORM - nullptr, // GGML_OP_RMS_NORM - nullptr, // GGML_OP_RMS_NORM_BACK - nullptr, // GGML_OP_GROUP_NORM + qnn_generic_op_impl, // GGML_OP_MUL_MAT + nullptr, // GGML_OP_MUL_MAT_ID + nullptr, // GGML_OP_OUT_PROD - nullptr, // GGML_OP_MUL_MAT - nullptr, // GGML_OP_MUL_MAT_ID - nullptr, // GGML_OP_OUT_PROD - - nullptr, // GGML_OP_SCALE - nullptr, // GGML_OP_SET - nullptr, // GGML_OP_CPY - nullptr, // GGML_OP_CONT - qnn_unary_nop_impl, // GGML_OP_RESHAPE - qnn_unary_nop_impl, // GGML_OP_VIEW - qnn_unary_nop_impl, // GGML_OP_PERMUTE - qnn_unary_nop_impl, // GGML_OP_TRANSPOSE - qnn_unary_nop_impl, // GGML_OP_GET_ROWS - nullptr, // GGML_OP_GET_ROWS_BACK - nullptr, // GGML_OP_DIAG - nullptr, // GGML_OP_DIAG_MASK_INF - nullptr, // GGML_OP_DIAG_MASK_ZERO - nullptr, // GGML_OP_SOFT_MAX - nullptr, // GGML_OP_SOFT_MAX_BACK - nullptr, // GGML_OP_ROPE - nullptr, // GGML_OP_ROPE_BACK - nullptr, // GGML_OP_CLAMP - nullptr, // GGML_OP_CONV_TRANSPOSE_1D - nullptr, // GGML_OP_IM2COL - nullptr, // GGML_OP_IM2COL_BACK - nullptr, // GGML_OP_CONV_TRANSPOSE_2D - nullptr, // GGML_OP_POOL_1D - nullptr, // GGML_OP_POOL_2D - nullptr, // GGML_OP_POOL_2D_BACK - nullptr, // GGML_OP_UPSCALE - nullptr, // GGML_OP_PAD - nullptr, // GGML_OP_PAD_REFLECT_1D - nullptr, // GGML_OP_ARANGE - nullptr, // GGML_OP_TIMESTEP_EMBEDDING - nullptr, // GGML_OP_ARGSORT - nullptr, // GGML_OP_LEAKY_RELU - - nullptr, // GGML_OP_FLASH_ATTN_EXT - nullptr, // GGML_OP_FLASH_ATTN_BACK - nullptr, // GGML_OP_SSM_CONV - nullptr, // GGML_OP_SSM_SCAN - nullptr, // GGML_OP_WIN_PART - nullptr, // GGML_OP_WIN_UNPART - nullptr, // GGML_OP_GET_REL_POS - nullptr, // GGML_OP_ADD_REL_POS - nullptr, // GGML_OP_RWKV_WKV6 - - nullptr, // GGML_OP_UNARY - - nullptr, // GGML_OP_MAP_UNARY - nullptr, // GGML_OP_MAP_BINARY - - nullptr, // GGML_OP_MAP_CUSTOM1_F32 - nullptr, // GGML_OP_MAP_CUSTOM2_F32 - nullptr, // GGML_OP_MAP_CUSTOM3_F32 - - nullptr, // GGML_OP_MAP_CUSTOM1 - nullptr, // GGML_OP_MAP_CUSTOM2 - nullptr, // GGML_OP_MAP_CUSTOM3 - - nullptr, // GGML_OP_CROSS_ENTROPY_LOSS - nullptr, // GGML_OP_CROSS_ENTROPY_LOSS_BACK - nullptr, // GGML_OP_OPT_STEP_ADAMW - - // ggml_unary_op - nullptr, // GGML_UNARY_OP_ABS - nullptr, // GGML_UNARY_OP_SGN - nullptr, // GGML_UNARY_OP_NEG - nullptr, // GGML_UNARY_OP_STEP - nullptr, // GGML_UNARY_OP_TANH - nullptr, // GGML_UNARY_OP_ELU - nullptr, // GGML_UNARY_OP_RELU - nullptr, // GGML_UNARY_OP_SIGMOID - qnn_unary_op_impl, // GGML_UNARY_OP_GELU - nullptr, // GGML_UNARY_OP_GELU_QUICK - nullptr, // GGML_UNARY_OP_SILU - nullptr, // GGML_UNARY_OP_HARDSWISH - nullptr, // GGML_UNARY_OP_HARDSIGMOID - nullptr, // GGML_UNARY_OP_EXP -}; - -static_assert(sizeof(kQnnUnaryOpsTable) / sizeof(kQnnUnaryOpsTable[0]) == (GGML_OP_COUNT + GGML_UNARY_OP_COUNT), - "GGML_OP_COUNT does not match the size of the kQnnUnaryOpsTable table"); - -constexpr const ggml_qnn_binary_op_t kQnnBinaryOpsTable[] = { - 
nullptr, // GGML_OP_NONE - nullptr, // GGML_OP_DUP - qnn_binary_op_impl, // GGML_OP_ADD - nullptr, // GGML_OP_ADD1 - nullptr, // GGML_OP_ACC - qnn_binary_op_impl, // GGML_OP_SUB - qnn_binary_op_impl, // GGML_OP_MUL - qnn_binary_op_impl, // GGML_OP_DIV - nullptr, // GGML_OP_SQR - nullptr, // GGML_OP_SQRT - nullptr, // GGML_OP_LOG - nullptr, // GGML_OP_SIN - nullptr, // GGML_OP_COS - nullptr, // GGML_OP_SUM - nullptr, // GGML_OP_SUM_ROWS - nullptr, // GGML_OP_MEAN - nullptr, // GGML_OP_ARGMAX - nullptr, // GGML_OP_COUNT_EQUAL - nullptr, // GGML_OP_REPEAT - nullptr, // GGML_OP_REPEAT_BACK - nullptr, // GGML_OP_CONCAT - nullptr, // GGML_OP_SILU_BACK - nullptr, // GGML_OP_NORM - nullptr, // GGML_OP_RMS_NORM - nullptr, // GGML_OP_RMS_NORM_BACK - nullptr, // GGML_OP_GROUP_NORM - - qnn_binary_op_impl, // GGML_OP_MUL_MAT - nullptr, // GGML_OP_MUL_MAT_ID - nullptr, // GGML_OP_OUT_PROD - - nullptr, // GGML_OP_SCALE - nullptr, // GGML_OP_SET - nullptr, // GGML_OP_CPY - nullptr, // GGML_OP_CONT - nullptr, // GGML_OP_RESHAPE - nullptr, // GGML_OP_VIEW - nullptr, // GGML_OP_PERMUTE - nullptr, // GGML_OP_TRANSPOSE - nullptr, // GGML_OP_GET_ROWS - nullptr, // GGML_OP_GET_ROWS_BACK - nullptr, // GGML_OP_DIAG - nullptr, // GGML_OP_DIAG_MASK_INF - nullptr, // GGML_OP_DIAG_MASK_ZERO - nullptr, // GGML_OP_SOFT_MAX - nullptr, // GGML_OP_SOFT_MAX_BACK - nullptr, // GGML_OP_ROPE - nullptr, // GGML_OP_ROPE_BACK - nullptr, // GGML_OP_CLAMP - nullptr, // GGML_OP_CONV_TRANSPOSE_1D - nullptr, // GGML_OP_IM2COL - nullptr, // GGML_OP_IM2COL_BACK - nullptr, // GGML_OP_CONV_TRANSPOSE_2D - nullptr, // GGML_OP_POOL_1D - nullptr, // GGML_OP_POOL_2D - nullptr, // GGML_OP_POOL_2D_BACK - nullptr, // GGML_OP_UPSCALE - nullptr, // GGML_OP_PAD - nullptr, // GGML_OP_PAD_REFLECT_1D - nullptr, // GGML_OP_ARANGE - nullptr, // GGML_OP_TIMESTEP_EMBEDDING - nullptr, // GGML_OP_ARGSORT - nullptr, // GGML_OP_LEAKY_RELU + nullptr, // GGML_OP_SCALE + nullptr, // GGML_OP_SET + nullptr, // GGML_OP_CPY + nullptr, // GGML_OP_CONT + qnn_nop_impl, // GGML_OP_RESHAPE + nullptr, // GGML_OP_VIEW + nullptr, // GGML_OP_PERMUTE + nullptr, // GGML_OP_TRANSPOSE + nullptr, // GGML_OP_GET_ROWS + nullptr, // GGML_OP_GET_ROWS_BACK + nullptr, // GGML_OP_DIAG + nullptr, // GGML_OP_DIAG_MASK_INF + nullptr, // GGML_OP_DIAG_MASK_ZERO + nullptr, // GGML_OP_SOFT_MAX + nullptr, // GGML_OP_SOFT_MAX_BACK + nullptr, // GGML_OP_ROPE + nullptr, // GGML_OP_ROPE_BACK + nullptr, // GGML_OP_CLAMP + nullptr, // GGML_OP_CONV_TRANSPOSE_1D + nullptr, // GGML_OP_IM2COL + nullptr, // GGML_OP_IM2COL_BACK + nullptr, // GGML_OP_CONV_TRANSPOSE_2D + nullptr, // GGML_OP_POOL_1D + nullptr, // GGML_OP_POOL_2D + nullptr, // GGML_OP_POOL_2D_BACK + nullptr, // GGML_OP_UPSCALE + nullptr, // GGML_OP_PAD + nullptr, // GGML_OP_PAD_REFLECT_1D + nullptr, // GGML_OP_ARANGE + nullptr, // GGML_OP_TIMESTEP_EMBEDDING + nullptr, // GGML_OP_ARGSORT + nullptr, // GGML_OP_LEAKY_RELU nullptr, // GGML_OP_FLASH_ATTN_EXT nullptr, // GGML_OP_FLASH_ATTN_BACK @@ -537,10 +332,36 @@ constexpr const ggml_qnn_binary_op_t kQnnBinaryOpsTable[] = { nullptr, // GGML_OP_CROSS_ENTROPY_LOSS nullptr, // GGML_OP_CROSS_ENTROPY_LOSS_BACK nullptr, // GGML_OP_OPT_STEP_ADAMW + + // ggml_unary_op + nullptr, // GGML_UNARY_OP_ABS + nullptr, // GGML_UNARY_OP_SGN + nullptr, // GGML_UNARY_OP_NEG + nullptr, // GGML_UNARY_OP_STEP + nullptr, // GGML_UNARY_OP_TANH + nullptr, // GGML_UNARY_OP_ELU + nullptr, // GGML_UNARY_OP_RELU + nullptr, // GGML_UNARY_OP_SIGMOID + qnn_generic_op_impl, // GGML_UNARY_OP_GELU + nullptr, // 
GGML_UNARY_OP_GELU_QUICK + nullptr, // GGML_UNARY_OP_SILU + nullptr, // GGML_UNARY_OP_HARDSWISH + nullptr, // GGML_UNARY_OP_HARDSIGMOID + nullptr, // GGML_UNARY_OP_EXP }; -static_assert(sizeof(kQnnBinaryOpsTable) / sizeof(kQnnBinaryOpsTable[0]) == GGML_OP_COUNT, - "GGML_OP_COUNT does not match the size of the kQnnBinaryOpsTable table"); +static_assert(kQnnOpsTable[GGML_OP_NONE] == qnn_nop_impl, "GGML_OP_NONE does not match the qnn_nop_impl function"); +static_assert(kQnnOpsTable[GGML_OP_ADD] == qnn_generic_op_impl, + "GGML_OP_ADD does not match the qnn_generic_op_impl function"); +static_assert(kQnnOpsTable[GGML_OP_MUL] == qnn_generic_op_impl, + "GGML_OP_MUL does not match the qnn_generic_op_impl function"); +static_assert(kQnnOpsTable[GGML_OP_MUL_MAT] == qnn_generic_op_impl, + "GGML_OP_MUL_MAT does not match the qnn_generic_op_impl function"); +static_assert(kQnnOpsTable[GGML_OP_RESHAPE] == qnn_nop_impl, + "GGML_OP_RESHAPE does not match the qnn_nop_impl function"); +static_assert(kQnnOpsTable[GGML_OP_VIEW] == nullptr, "GGML_OP_VIEW is not nullptr"); +static_assert(std::size(kQnnOpsTable) == (GGML_OP_COUNT + GGML_UNARY_OP_COUNT), + "GGML_OP_COUNT does not match the size of the kQnnOpsTable table"); bool ggml_qnn_supports_tensor(ggml_backend_qnn_device_context *ctx, const ggml_tensor *tensor) { if (!tensor) { @@ -548,6 +369,7 @@ bool ggml_qnn_supports_tensor(ggml_backend_qnn_device_context *ctx, const ggml_t return false; } +#ifndef NDEBUG if (tensor->view_src) { auto *src_tensor = tensor->view_src; QNN_LOG_DEBUG("[%s]tensor(%s_%dx%dx%dx%d) is a view, src: %s_%dx%dx%dx%d", qnn::get_backend_name(ctx->device), @@ -555,6 +377,7 @@ bool ggml_qnn_supports_tensor(ggml_backend_qnn_device_context *ctx, const ggml_t ggml_get_name(src_tensor), src_tensor->ne[0], src_tensor->ne[1], src_tensor->ne[2], src_tensor->ne[3]); } +#endif switch (tensor->type) { case GGML_TYPE_F32: @@ -576,6 +399,25 @@ bool ggml_qnn_supports_tensor(ggml_backend_qnn_device_context *ctx, const ggml_t return true; } +bool ggnl_qnn_supports_op_tensor(ggml_backend_qnn_device_context *ctx, const ggml_tensor *op) { + if (op->op == GGML_OP_NONE) { + return true; + } + + if (!ggml_qnn_supports_tensor(ctx, op)) { + return false; + } + + const auto param_count = qnn::get_qnn_op_input_param_count(qnn::get_qnn_op_index(op)); + for (size_t i = 0; i < param_count; ++i) { + if (!ggml_qnn_supports_tensor(ctx, op->src[i])) { + return false; + } + } + + return true; +} + bool ggml_qnn_supports_matmul_op(ggml_backend_qnn_device_context *ctx, const ggml_tensor *op) { constexpr const size_t kMaxNpuTensorSize = 8192L * 2048 + 8192 * 512 + 2048 * 512; constexpr const auto get_tensor_size = [](const ggml_tensor *tensor) -> size_t { @@ -591,11 +433,11 @@ bool ggml_qnn_supports_matmul_op(ggml_backend_qnn_device_context *ctx, const ggm * TODO: remove the blocker here when NPU backend supports mul_mat like this: * [ne03, ne02, n, k] * [ne03 * x, ne02 * y, m, k] -> [ne03 * x, ne02 * y, m, n] */ - QNN_LOG_DEBUG("[qnn-npu] src0 and src1 dimensions are not equal, support/unsupported: %d/%d", + QNN_LOG_DEBUG("[qnn-npu][MUL_MAT]src0 and src1 dimensions are not equal, support/unsupported: %d/%d", ctx->support_op_count.load(), ++(ctx->unsupported_op_count)); return false; } else if (get_tensor_size(src0) + get_tensor_size(src1) + get_tensor_size(op) >= kMaxNpuTensorSize) { - QNN_LOG_DEBUG("[qnn-npu] tensor size is too large, support/unsupported: %d/%d", + QNN_LOG_DEBUG("[qnn-npu][MUL_MAT]tensor size is too large, support/unsupported: %d/%d", 
ctx->support_op_count.load(), ++(ctx->unsupported_op_count)); return false; } @@ -604,9 +446,9 @@ bool ggml_qnn_supports_matmul_op(ggml_backend_qnn_device_context *ctx, const ggm case QNN_BACKEND_GPU: if (src0->type != src1->type || src0->type != op->type) { // there's no convert op for GPU. - QNN_LOG_DEBUG("[qnn-gpu]type src0(%d), src1(%d) and op(%d) are not equal, support/unsupported: %d/%d", - src0->type, src1->type, op->type, ctx->support_op_count.load(), - ++(ctx->unsupported_op_count)); + QNN_LOG_DEBUG( + "[qnn-gpu][MUL_MAT]type src0(%d), src1(%d) and op(%d) are not equal, support/unsupported: %d/%d", + src0->type, src1->type, op->type, ctx->support_op_count.load(), ++(ctx->unsupported_op_count)); return false; } break; @@ -615,12 +457,12 @@ bool ggml_qnn_supports_matmul_op(ggml_backend_qnn_device_context *ctx, const ggm } if ((src1->ne[2] % src0->ne[2]) != 0 || (src1->ne[3] % src0->ne[3]) != 0) { - QNN_LOG_DEBUG("[%s] src0 and src1 dimensions are not equal, support/unsupported: %d/%d", + QNN_LOG_DEBUG("[%s][MUL_MAT]src0 and src1 dimensions are not equal, support/unsupported: %d/%d", qnn::get_backend_name(ctx->device), ctx->support_op_count.load(), ++(ctx->unsupported_op_count)); return false; } - QNN_LOG_DEBUG("[%s] supported matmul op, support/unsupported: %d/%d", qnn::get_backend_name(ctx->device), + QNN_LOG_DEBUG("[%s][MUL_MAT]supported matmul op, support/unsupported: %d/%d", qnn::get_backend_name(ctx->device), ++(ctx->support_op_count), ctx->unsupported_op_count.load()); return true; } @@ -635,41 +477,30 @@ bool device_supports_op(ggml_backend_qnn_device_context *ctx, const ggml_tensor return true; } - auto *src0 = op->src[0]; + if (!kQnnOpsTable[qnn::get_qnn_op_index(op)]) { + QNN_LOG_DEBUG("[%s]unsupported op", ggml_op_name(op->op)); + return false; + } + + if (!ggnl_qnn_supports_op_tensor(ctx, op)) { + QNN_LOG_DEBUG("[%s]unsupported tensor", ggml_op_name(op->op)); + return false; + } + if (op->op == GGML_OP_UNARY) { const auto unary_op = ggml_get_unary_op(op); - if (unary_op == GGML_UNARY_OP_GELU && ctx->device == QNN_BACKEND_NPU) { - // TODO: fix this when NPU supports GELU - QNN_LOG_DEBUG("unsupported unary op GGML_UNARY_OP_GELU for NPU"); - return false; - } - - if (!kQnnUnaryOpsTable[kGgmlUnaryOpStart + unary_op]) { - QNN_LOG_DEBUG("unsupported unary op %d", unary_op); - return false; - } - - if (!ggml_qnn_supports_tensor(ctx, src0) || !ggml_qnn_supports_tensor(ctx, op)) { - QNN_LOG_DEBUG("src0 is nullptr"); + if (unary_op == GGML_UNARY_OP_GELU) { + // TODO: fix this + QNN_LOG_DEBUG("[GELU]unsupported unary op GGML_UNARY_OP_GELU for NPU"); return false; } } else { - if (!kQnnUnaryOpsTable[op->op] && !kQnnBinaryOpsTable[op->op]) { - QNN_LOG_DEBUG("[%s] unsupported op", ggml_op_name(op->op)); - return false; - } - + auto *src0 = op->src[0]; auto *src1 = op->src[1]; - if (!ggml_qnn_supports_tensor(ctx, src0) || !ggml_qnn_supports_tensor(ctx, op) || - (kQnnBinaryOpsTable[op->op] && !ggml_qnn_supports_tensor(ctx, src1))) { - QNN_LOG_DEBUG("[%s] unsupported tensor", ggml_op_name(op->op)); - return false; - } - switch (op->op) { case GGML_OP_ADD: - if (!is_tensor_dimensions_equal(src0, src1)) { - QNN_LOG_DEBUG("src0 and src1 dimensions are not equal"); + if (!ggml_are_same_shape(src0, src1)) { + QNN_LOG_DEBUG("[ADD] src0 and src1 dimensions are not equal"); return false; } break; @@ -686,34 +517,13 @@ bool device_supports_op(ggml_backend_qnn_device_context *ctx, const ggml_tensor } bool device_compute_graph(ggml_backend_qnn_device_context *ctx, ggml_cgraph *cgraph) { - 
QNN_LOG_DEBUG("[%s]compute graph, nodes count: %d", qnn::get_backend_name(ctx->device), (int)cgraph->n_nodes); - for (int i = 0; i < cgraph->n_nodes; i++) { - ggml_tensor *tensor = cgraph->nodes[i]; - if (ggml_is_empty(tensor)) { - continue; - } + QNN_LOG_DEBUG("[%s]compute graph start, nodes count: %d", qnn::get_backend_name(ctx->device), (int)cgraph->n_nodes); - size_t unary_op_idx = tensor->op; - if (tensor->op == GGML_OP_UNARY) { - unary_op_idx = kGgmlUnaryOpStart + ggml_get_unary_op(tensor); - } + auto qnn_graph = get_qnn_graph_from_cache(ctx, cgraph); + bool success = qnn_graph && qnn_graph->execute(cgraph); - bool ok = false; - auto unary_op = kQnnUnaryOpsTable[unary_op_idx]; - auto binary_op = kQnnBinaryOpsTable[tensor->op]; - if (unary_op) { - ok = unary_op(ctx, tensor->src[0], tensor); - } else if (binary_op) { - ok = binary_op(ctx, tensor->src[0], tensor->src[1], tensor); - } - - if (!ok) { - QNN_LOG_WARN("[%s]unsupported op %s", qnn::get_backend_name(ctx->device), ggml_op_desc(tensor)); - return false; - } - } - - return true; + QNN_LOG_DEBUG("[%s]compute graph, success: %d", qnn::get_backend_name(ctx->device), (int)success); + return success; } } // namespace qnn diff --git a/ggml/src/ggml-qnn/backend.hpp b/ggml/src/ggml-qnn/backend.hpp index 17823ed577..df5e2eb08f 100644 --- a/ggml/src/ggml-qnn/backend.hpp +++ b/ggml/src/ggml-qnn/backend.hpp @@ -19,7 +19,7 @@ #include "qnn-lib.hpp" namespace qnn { -typedef std::unordered_map> ggml_qnn_graph_cache_t; +typedef std::unordered_map> qnn_graph_cache_t; } // namespace qnn struct ggml_backend_qnn_device_context { @@ -35,7 +35,7 @@ struct ggml_backend_qnn_device_context { std::shared_ptr instance; std::shared_ptr qnn_interface; - qnn::ggml_qnn_graph_cache_t qnn_graph_cache; + qnn::qnn_graph_cache_t qnn_graph_cache; #ifndef NDEBUG std::atomic_uint32_t support_op_count = 0; diff --git a/ggml/src/ggml-qnn/buffer.hpp b/ggml/src/ggml-qnn/buffer.hpp index 9573e160b4..af165b394e 100644 --- a/ggml/src/ggml-qnn/buffer.hpp +++ b/ggml/src/ggml-qnn/buffer.hpp @@ -8,18 +8,65 @@ namespace qnn { +/** + * @brief An interface for managing generic QNN buffers. + * + * This abstract class defines the interface for managing generic memory buffers in a QNN context. + */ class qnn_buffer_interface { public: virtual ~qnn_buffer_interface() = default; + /** + * @brief Checks if the buffer is valid. + * + * This pure virtual function must be implemented by derived classes to check + * the validity of the buffer. + * + * @return true if the buffer is valid, false otherwise. + */ virtual bool is_valid() const = 0; + + /** + * @brief Gets the buffer pointer. + * + * This pure virtual function must be implemented by derived classes to return + * a pointer to the buffer. + * + * @return A pointer to the buffer. + */ virtual uint8_t *get_buffer() = 0; + + /** + * @brief Gets the buffer pointer. + * + * This pure virtual function must be implemented by derived classes to return + * a pointer to the buffer. + * + * @return A pointer to the buffer. + */ virtual size_t get_size() const = 0; + + /** + * @brief Gets the QNN memory handle associated with the buffer. + * + * This pure virtual function must be implemented by derived classes to return + * the memory handle associated with the buffer. + * + * @return The memory handle, or null if no valid QNN memory handle is attached. + */ virtual Qnn_MemHandle_t get_mem_handle() const = 0; }; using qnn_buffer_ptr = std::shared_ptr; +/** + * @brief A class for managing QNN RPC memory buffers. 
+ *
+ * This class is responsible for allocating, registering, and managing a buffer in RPC memory.
+ * It ensures that the buffer is properly allocated and registered with the QNN instance, and
+ * handles cleanup of the buffer and its associated memory handle upon destruction.
+ */
 class qnn_rpc_buffer : public qnn_buffer_interface {
 public:
     qnn_rpc_buffer(std::shared_ptr<qnn_instance> qnn_instance, const size_t size, const uint32_t rank,
@@ -29,7 +76,7 @@ public:
         _qnn_rpc_buffer = static_cast<uint8_t *>(qnn_instance->alloc_rpcmem(size, alignof(uint8_t *)));
         _qnn_rpc_mem_handle = qnn_instance->register_rpcmem(_qnn_rpc_buffer, rank, dimensions, data_type);
         if (!_qnn_rpc_buffer || !_qnn_rpc_mem_handle) {
-            QNN_LOG_WARN("register rpc mem failure");
+            QNN_LOG_WARN("Failed to register RPC memory: buffer or memory handle is null");
             // let the destructor free the buffer
             return;
         }
@@ -64,6 +111,13 @@ private:
     DISABLE_MOVE(qnn_rpc_buffer);
 };
 
+/**
+ * @brief A class for managing QNN memory buffers allocated in regular memory.
+ *
+ * This class is responsible for allocating, managing, and freeing memory buffers
+ * in regular (non-RPC) memory. It implements the qnn_buffer_interface to provide
+ * a consistent interface for buffer management.
+ */
 class qnn_mem_buffer : public qnn_buffer_interface {
 public:
     explicit qnn_mem_buffer(const uint8_t *data, size_t size) {
@@ -102,4 +156,24 @@ private:
     DISABLE_MOVE(qnn_mem_buffer);
 };
 
+class qnn_mem_buffer_slice : public qnn_buffer_interface {
+public:
+    qnn_mem_buffer_slice(const uint8_t *buffer, size_t size) : _buffer(const_cast<uint8_t *>(buffer)), _size(size) {}
+
+    bool is_valid() const override { return _buffer && _size; }
+
+    uint8_t *get_buffer() override { return _buffer; }
+
+    size_t get_size() const override { return _size; }
+
+    Qnn_MemHandle_t get_mem_handle() const override { return nullptr; }
+
+private:
+    uint8_t *_buffer = nullptr;
+    size_t _size = 0;
+
+    DISABLE_COPY(qnn_mem_buffer_slice);
+    DISABLE_MOVE(qnn_mem_buffer_slice);
+};
+
 } // namespace qnn
diff --git a/ggml/src/ggml-qnn/ggml-qnn.cpp b/ggml/src/ggml-qnn/ggml-qnn.cpp
index 933016a628..b3673eb35a 100644
--- a/ggml/src/ggml-qnn/ggml-qnn.cpp
+++ b/ggml/src/ggml-qnn/ggml-qnn.cpp
@@ -222,6 +222,9 @@ bool ggml_backend_qnn_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_
     GGML_UNUSED(backend_dst);
     GGML_UNUSED(src);
     GGML_UNUSED(dst);
+
+    QNN_LOG_DEBUG("copy from %s to %s, src_is_qnn: %d, dst_is_qnn: %d", ggml_get_name(src), ggml_get_name(dst),
+                  (int)ggml_backend_is_qnn(backend_src), (int)ggml_backend_is_qnn(backend_dst));
     return false;
 }
 
@@ -317,8 +320,6 @@ ggml_guid_t ggml_backend_qnn_guid() {
     return &guid;
 }
 
-bool ggml_backend_is_qnn(ggml_backend_t backend) { return ggml_guid_matches(backend->guid, ggml_backend_qnn_guid()); }
-
 ggml_backend_t ggml_backend_qnn_init_with_device_context(ggml_backend_dev_t dev, const char *extend_lib_search_path) {
     if (!extend_lib_search_path) {
         extend_lib_search_path = GGML_QNN_DEFAULT_LIB_SEARCH_PATH;
@@ -420,8 +421,13 @@ bool ggml_backend_qnn_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_
 }
 
 bool ggml_backend_qnn_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor *op) {
+#ifdef NDEBUG
+    GGML_UNUSED(dev);
+    GGML_UNUSED(op);
+#else
     auto *device_ctx = get_device_context(dev);
     QNN_LOG_DEBUG("[%s][%s]offload op", qnn::get_backend_name(device_ctx->device), ggml_op_name(op->op));
+#endif
     return false;
 }
 
@@ -509,6 +515,8 @@ const ggml_backend_reg_i ggml_backend_qnn_reg_interface = {
 
 } // namespace
 
+bool ggml_backend_is_qnn(ggml_backend_t backend) { return
ggml_guid_matches(backend->guid, ggml_backend_qnn_guid()); } + ggml_backend_reg_t ggml_backend_qnn_reg() { static ggml_backend_qnn_reg_impl reg{ggml_backend_qnn_reg_interface}; return ® diff --git a/ggml/src/ggml-qnn/graph.cpp b/ggml/src/ggml-qnn/graph.cpp new file mode 100644 index 0000000000..0210e1554a --- /dev/null +++ b/ggml/src/ggml-qnn/graph.cpp @@ -0,0 +1,386 @@ + +#include "graph.hpp" + +#include +#include + +#include "ggml-impl.h" + +#include "logger.hpp" +#include "op-config.hpp" +#include "tensor.hpp" + +namespace { +using qnn_tensor_cache_t = std::unordered_map; + +int get_op_max_rank(const ggml_tensor *op) { + int max_rank = ggml_n_dims(op); + const int count = (int)qnn::get_qnn_op_input_param_count(qnn::get_qnn_op_index(op)); + for (int i = 0; i < count; ++i) { + max_rank = std::max(max_rank, ggml_n_dims(op->src[i])); + } + + return max_rank; +} + +qnn::qnn_tensor_ptr_t create_tensor_with_cache(ggml_tensor *tensor, qnn::ggml_qnn_tensor::tensor_type_t type, int rank, + QNNBackend device, Qnn_GraphHandle_t graph_handle, + std::shared_ptr qnn_instance, + qnn_tensor_cache_t &tensor_cache) { + GGML_ASSERT(tensor); + if (tensor_cache.count(tensor)) { + return tensor_cache[tensor]; + } + + auto qnn_tensor = std::make_shared(type, tensor->name, tensor->ne, tensor->type, rank, device, + graph_handle, qnn_instance); + tensor_cache[tensor] = qnn_tensor; + return qnn_tensor; +} + +qnn::qnn_tensor_array_t create_tensors_with_cache(const qnn::ggml_tensor_array_t &ggml_tensors, + qnn::ggml_qnn_tensor::tensor_type_t type, int rank, QNNBackend device, + Qnn_GraphHandle_t graph_handle, + std::shared_ptr qnn_instance, + qnn_tensor_cache_t &tensor_cache) { + qnn::qnn_tensor_array_t tensors; + for (auto *tensor : ggml_tensors) { + tensors.push_back( + create_tensor_with_cache(tensor, type, rank, device, graph_handle, qnn_instance, tensor_cache)); + } + + return tensors; +} + +qnn::qnn_op_config_ptr_t create_operation_from_op_tensor(ggml_tensor *dst, const std::string &name, int rank, + QNNBackend device, Qnn_GraphHandle_t graph_handle, + std::shared_ptr qnn_instance, + bool is_intermediate, qnn_tensor_cache_t &tensor_cache) { + const auto op_index = qnn::get_qnn_op_index(dst); + auto qnn_op = qnn::create_op_constructor(op_index); + auto operation = qnn_op(name, qnn_instance); + + // input tensors + qnn::qnn_tensor_array_t input_qnn_tensors; + auto tensor_type = is_intermediate ? qnn::ggml_qnn_tensor::INTERMEDIATE : qnn::ggml_qnn_tensor::INPUT; + for (size_t i = 0; i < qnn::get_qnn_op_input_param_count(op_index); ++i) { + auto input_qnn_tensor = + create_tensor_with_cache(dst->src[i], tensor_type, rank, device, graph_handle, qnn_instance, tensor_cache); + input_qnn_tensors.push_back(input_qnn_tensor); + } + operation->set_input_tensors(input_qnn_tensors); + + // output tensor + tensor_type = is_intermediate ? 
qnn::ggml_qnn_tensor::INTERMEDIATE : qnn::ggml_qnn_tensor::OUTPUT; + qnn::qnn_tensor_array_t output_qnn_tensors = + create_tensors_with_cache({dst}, tensor_type, rank, device, graph_handle, qnn_instance, tensor_cache); + operation->set_output_tensors(output_qnn_tensors); + + // initialize operation + if (!operation->initialize_op_nodes(device, graph_handle)) { + QNN_LOG_ERROR("[%s][%s]initialize_op_nodes failed", qnn::get_backend_name(device), name.c_str()); + return nullptr; + } + + return operation; +} + +bool bind_src_tensors(ggml_tensor *op, qnn::qnn_tensor_array_t &tensor_wrappers, + std::vector &qnn_tensors) { + if (op->op == GGML_OP_NONE) { + QNN_LOG_DEBUG("op %s is not a valid op", ggml_get_name(op)); + return false; + } + + const auto param_count = qnn::get_qnn_op_input_param_count(qnn::get_qnn_op_index(op)); + GGML_ASSERT(tensor_wrappers.size() == param_count); + qnn_tensors.resize(param_count); + for (size_t i = 0; i < param_count; ++i) { + auto *ggml_tensor = op->src[i]; + if (!tensor_wrappers[i]->bind_ggml_tensor(ggml_tensor)) { + QNN_LOG_ERROR("bind tensor %s failed", ggml_get_name(ggml_tensor)); + return false; + } + + qnn_tensors[i] = tensor_wrappers[i]->get_qnn_tensor(); + } + + return true; +} + +int get_io_tensors_from_graph(const ggml_cgraph *cgraph, qnn::ggml_tensor_array_t &inputs, + qnn::ggml_tensor_array_t &outputs) { + using ggml_tensor_set_t = std::set; + + ggml_tensor_set_t input_set; + ggml_tensor_set_t output_set; + ggml_tensor_set_t visited_set; + int rank = 0; + for (int i = 0; i < cgraph->n_nodes; i++) { + ggml_tensor *dst = cgraph->nodes[i]; + if (ggml_is_empty(dst)) { + continue; + } + + if (dst->op == GGML_OP_NONE || dst->op == GGML_OP_VIEW) { + // TODO: remove GGML_OP_VIEW after view op is supported + continue; + } + + rank = std::max(rank, ggml_n_dims(dst)); + input_set.erase(dst); + if (!visited_set.count(dst)) { + output_set.insert(dst); + visited_set.insert(dst); + } + + for (size_t i = 0; i < GGML_MAX_DIMS && dst->src[i]; ++i) { + auto *src = dst->src[i]; + rank = std::max(rank, ggml_n_dims(src)); + output_set.erase(src); + if (!visited_set.count(src)) { + input_set.insert(src); + visited_set.insert(src); + } + } + } + + inputs.assign(input_set.begin(), input_set.end()); + outputs.assign(output_set.begin(), output_set.end()); + return rank; +} + +} // namespace + +namespace qnn { + +qnn_graph::qnn_graph(const std::string &graph_name, QNNBackend device, std::shared_ptr qnn_instance, + size_t vtcm_size_in_mb) + : _graph_name(graph_name), _device(device), _qnn_instance(qnn_instance) { + QNN_LOG_DEBUG("[%s][%s]created", get_backend_name(device), graph_name.c_str()); + + auto qnn_interface = qnn_instance->get_qnn_interface(); + auto qnn_context = qnn_instance->get_qnn_context_handle(); + Qnn_ErrorHandle_t error = QNN_SUCCESS; + Qnn_GraphHandle_t graph_handle = nullptr; + if (device == QNN_BACKEND_NPU) { + // TODO: fix graph config here for NPU + QnnHtpGraph_CustomConfig_t hvx_config; + hvx_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_NUM_HVX_THREADS; + hvx_config.numHvxThreads = 8; + QnnGraph_Config_t graph_hvx_config; + graph_hvx_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_hvx_config.customConfig = &hvx_config; + + QnnHtpGraph_CustomConfig_t dlbc_config; + dlbc_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION; + dlbc_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_DLBC; + dlbc_config.optimizationOption.floatValue = 1.0; // set to 0.0 to turn off DLBC + QnnGraph_Config_t graph_dlbc_config; + 
graph_dlbc_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_dlbc_config.customConfig = &dlbc_config; + + QnnHtpGraph_CustomConfig_t opt_config; + opt_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG; + opt_config.optimizationOption.floatValue = 1; // 1 / 3 + QnnGraph_Config_t graph_opt_config; + graph_opt_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_opt_config.customConfig = &opt_config; + + QnnHtpGraph_CustomConfig_t vtcm_config; + vtcm_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE; + vtcm_config.vtcmSizeInMB = vtcm_size_in_mb; + QnnGraph_Config_t graph_vtcm_config; + graph_vtcm_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_vtcm_config.customConfig = &vtcm_config; + + const QnnGraph_Config_t *graph_configs[] = {&graph_hvx_config, &graph_dlbc_config, &graph_vtcm_config, + &graph_opt_config, nullptr}; + error = qnn_interface->qnn_graph_create(qnn_context, graph_name.c_str(), graph_configs, &graph_handle); + } else { + error = qnn_interface->qnn_graph_create(qnn_context, graph_name.c_str(), nullptr, &graph_handle); + } + + if (error != QNN_SUCCESS) { + QNN_LOG_ERROR("[%s][%s]failed to create qnn graph, error: %s", get_backend_name(device), graph_name.c_str(), + get_qnn_error_string(error)); + return; + } + + QNN_LOG_INFO("[%s][%s]create succeed", get_backend_name(device), graph_name.c_str()); + _graph_handle = graph_handle; + _qnn_interface = qnn_interface; +} + +qnn_graph::~qnn_graph() { QNN_LOG_DEBUG("[%s][%s]destroy", get_backend_name(_device), _graph_name.c_str()); } + +bool qnn_graph::build_graph_from_op(ggml_tensor *op) { + if (!is_valid()) { + QNN_LOG_ERROR("Invalid graph"); + return false; + } + + QNN_LOG_DEBUG("[%s][%s]build start", get_backend_name(_device), _graph_name.c_str()); + qnn_tensor_cache_t tensor_cache; + const auto rank = get_op_max_rank(op); + auto operation = create_operation_from_op_tensor(op, _graph_name, rank, _device, _graph_handle, _qnn_instance, + false, tensor_cache); + if (!operation) { + QNN_LOG_ERROR("[%s][%s]create_operation_from_op_tensor failed", get_backend_name(_device), _graph_name.c_str()); + return false; + } + + _tensor_inputs = operation->get_input_tensors(); + _tensor_outputs = operation->get_output_tensors(); + _operations.push_back(std::move(operation)); + if (!finalize()) { + return false; + } + + QNN_LOG_DEBUG("[%s][%s]build succeed", get_backend_name(_device), _graph_name.c_str()); + return true; +} + +bool qnn_graph::build_graph_from_ggml_graph(const ggml_cgraph *cgraph) { + QNN_LOG_DEBUG("[%s][%s]build start", get_backend_name(_device), _graph_name.c_str()); + + ggml_tensor_array_t inputs; + ggml_tensor_array_t outputs; + int rank = get_io_tensors_from_graph(cgraph, inputs, outputs); + QNN_LOG_DEBUG("[%s]rank: %d, input_set: %d, output_set: %d", get_backend_name(_device), rank, int(inputs.size()), + int(outputs.size())); + + { + qnn_tensor_cache_t tensor_cache; + auto input_tensors = create_tensors_with_cache(inputs, ggml_qnn_tensor::INPUT, rank, _device, _graph_handle, + _qnn_instance, tensor_cache); + auto output_tensors = create_tensors_with_cache(outputs, ggml_qnn_tensor::OUTPUT, rank, _device, _graph_handle, + _qnn_instance, tensor_cache); + qnn_op_config_array_t operations; + for (int i = 0; i < cgraph->n_nodes; i++) { + ggml_tensor *dst = cgraph->nodes[i]; + if (ggml_is_empty(dst)) { + continue; + } + + if (dst->op == GGML_OP_NONE || dst->op == GGML_OP_VIEW) { + // TODO: remove GGML_OP_VIEW after view op is supported + continue; + } + + 
QNN_LOG_DEBUG("[%s]create op: %s", get_backend_name(_device), get_qnn_op_name(dst->op)); + auto operation = create_operation_from_op_tensor(dst, dst->name, rank, _device, _graph_handle, + _qnn_instance, true, tensor_cache); // TODO: fix op name + operations.push_back(operation); + } + + _tensor_inputs = std::move(input_tensors); + _tensor_outputs = std::move(output_tensors); + _operations = std::move(operations); + if (!finalize()) { + return false; + } + } + + QNN_LOG_DEBUG("[%s][%s]build succeed", get_backend_name(_device), _graph_name.c_str()); + return true; +} + +bool qnn_graph::execute(ggml_tensor *op) { + if (!bind_src_tensors(op, _tensor_inputs, _qnn_tensor_inputs)) { + QNN_LOG_ERROR("[%s][%s]bind input tensors failed", get_backend_name(_device), _graph_name.c_str()); + return false; + } + + if (!qnn::bind_tensors({op}, _tensor_outputs, _qnn_tensor_outputs)) { + QNN_LOG_ERROR("[%s][%s]bind output tensors failed", get_backend_name(_device), _graph_name.c_str()); + return false; + } + + auto &qnn_tensor_inputs = _qnn_tensor_inputs; + auto &qnn_tensor_outputs = _qnn_tensor_outputs; + auto error = + _qnn_interface->qnn_graph_execute(_graph_handle, qnn_tensor_inputs.data(), qnn_tensor_inputs.size(), + qnn_tensor_outputs.data(), qnn_tensor_outputs.size(), nullptr, nullptr); + unbind_tensors(_tensor_inputs); + unbind_tensors(_tensor_outputs); + + if (error != QNN_SUCCESS) { + if (_device == QNN_BACKEND_NPU && error == QNN_COMMON_ERROR_SYSTEM_COMMUNICATION) { + QNN_LOG_WARN("[%s][%s]NPU crashed. SSR detected. Caused QNN graph execute error.", + get_backend_name(_device), _graph_name.c_str()); + } else { + QNN_LOG_ERROR("[%s][%s]error: %s", get_backend_name(_device), _graph_name.c_str(), + get_qnn_error_string(error)); + } + return false; + } + + QNN_LOG_DEBUG("[%s][%s]execute succeed", get_backend_name(_device), _graph_name.c_str()); + return true; +} + +bool qnn_graph::execute(const ggml_cgraph *cgraph) { + ggml_tensor_array_t inputs; + ggml_tensor_array_t outputs; +#ifdef NDEBUG + get_io_tensors_from_graph(cgraph, inputs, outputs); +#else + int rank = get_io_tensors_from_graph(cgraph, inputs, outputs); + QNN_LOG_DEBUG("[%s]rank: %d, input_set: %d, output_set: %d", get_backend_name(_device), rank, int(inputs.size()), + int(outputs.size())); +#endif + + { + if (!qnn::bind_tensors(inputs, _tensor_inputs, _qnn_tensor_inputs)) { + QNN_LOG_ERROR("[%s][%s]bind input tensors failed", get_backend_name(_device), _graph_name.c_str()); + return false; + } + + if (!qnn::bind_tensors(outputs, _tensor_outputs, _qnn_tensor_outputs)) { + QNN_LOG_ERROR("[%s][%s]bind output tensors failed", get_backend_name(_device), _graph_name.c_str()); + return false; + } + + auto &qnn_tensor_inputs = _qnn_tensor_inputs; + auto &qnn_tensor_outputs = _qnn_tensor_outputs; + auto error = + _qnn_interface->qnn_graph_execute(_graph_handle, qnn_tensor_inputs.data(), qnn_tensor_inputs.size(), + qnn_tensor_outputs.data(), qnn_tensor_outputs.size(), nullptr, nullptr); + unbind_tensors(_tensor_inputs); + unbind_tensors(_tensor_outputs); + + if (error != QNN_SUCCESS) { + if (_device == QNN_BACKEND_NPU && error == QNN_COMMON_ERROR_SYSTEM_COMMUNICATION) { + QNN_LOG_WARN("[%s][%s]NPU crashed. SSR detected. 
Caused QNN graph execute error.", + get_backend_name(_device), _graph_name.c_str()); + } else { + QNN_LOG_ERROR("[%s][%s]error: %s", get_backend_name(_device), _graph_name.c_str(), + get_qnn_error_string(error)); + } + return false; + } + + QNN_LOG_DEBUG("[%s][%s]execute succeed", get_backend_name(_device), _graph_name.c_str()); + return true; + } +} + +bool qnn_graph::finalize() { + if (!qnn::add_op_to_graph(_graph_handle, _operations)) { + QNN_LOG_ERROR("[%s]add nodes failed", _graph_name.c_str()); + return false; + } + + auto error = _qnn_interface->qnn_graph_finalize(_graph_handle, nullptr, nullptr); + if (error != QNN_SUCCESS) { + QNN_LOG_ERROR("[%s][%s]qnn_graph_finalize.error: %s", get_backend_name(_device), _graph_name.c_str(), + get_qnn_error_string(error)); + return false; + } + + QNN_LOG_DEBUG("[%s][%s]finalize succeed", get_backend_name(_device), _graph_name.c_str()); + return true; +} + +} // namespace qnn diff --git a/ggml/src/ggml-qnn/graph.hpp b/ggml/src/ggml-qnn/graph.hpp index 1806f41126..521186f790 100644 --- a/ggml/src/ggml-qnn/graph.hpp +++ b/ggml/src/ggml-qnn/graph.hpp @@ -1,164 +1,53 @@ #pragma once -#include #include #include #include #include "ggml-qnn.h" -#include "logger.hpp" #include "op-config.hpp" #include "qnn-lib.hpp" namespace qnn { -class ggml_qnn_graph { +class qnn_graph { public: - explicit ggml_qnn_graph(const std::string &graph_name, QNNBackend device, - std::shared_ptr qnn_instance, size_t vtcm_size_in_mb) - : _graph_name(graph_name), _device(device), _qnn_instance(qnn_instance) { - QNN_LOG_DEBUG("[%s][%s]created", get_backend_name(device), graph_name.c_str()); + explicit qnn_graph(const std::string &graph_name, QNNBackend device, std::shared_ptr qnn_instance, + size_t vtcm_size_in_mb); + ~qnn_graph(); - auto qnn_interface = qnn_instance->get_qnn_interface(); - auto qnn_context = qnn_instance->get_qnn_context_handle(); - Qnn_ErrorHandle_t error = QNN_SUCCESS; - Qnn_GraphHandle_t graph_handle = nullptr; - if (device == QNN_BACKEND_NPU) { - // TODO: fix graph config here for NPU - QnnHtpGraph_CustomConfig_t hvx_config; - hvx_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_NUM_HVX_THREADS; - hvx_config.numHvxThreads = 8; - QnnGraph_Config_t graph_hvx_config; - graph_hvx_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; - graph_hvx_config.customConfig = &hvx_config; - - QnnHtpGraph_CustomConfig_t dlbc_config; - dlbc_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION; - dlbc_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_DLBC; - dlbc_config.optimizationOption.floatValue = 1.0; // set to 0.0 to turn off DLBC - QnnGraph_Config_t graph_dlbc_config; - graph_dlbc_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; - graph_dlbc_config.customConfig = &dlbc_config; - - QnnHtpGraph_CustomConfig_t opt_config; - opt_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG; - opt_config.optimizationOption.floatValue = 1; // 1 / 3 - QnnGraph_Config_t graph_opt_config; - graph_opt_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; - graph_opt_config.customConfig = &opt_config; - - QnnHtpGraph_CustomConfig_t vtcm_config; - vtcm_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE; - vtcm_config.vtcmSizeInMB = vtcm_size_in_mb; - QnnGraph_Config_t graph_vtcm_config; - graph_vtcm_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; - graph_vtcm_config.customConfig = &vtcm_config; - - const QnnGraph_Config_t *graph_configs[] = {&graph_hvx_config, &graph_dlbc_config, &graph_vtcm_config, - &graph_opt_config, nullptr}; - error = 
qnn_interface->qnn_graph_create(qnn_context, graph_name.c_str(), graph_configs, &graph_handle); - } else { - error = qnn_interface->qnn_graph_create(qnn_context, graph_name.c_str(), nullptr, &graph_handle); - } - - if (error != QNN_SUCCESS) { - QNN_LOG_ERROR("[%s][%s]failed to create qnn graph, error: %s", get_backend_name(device), graph_name.c_str(), - get_qnn_error_string(error)); - return; - } - - QNN_LOG_INFO("[%s][%s]create succeed", get_backend_name(device), graph_name.c_str()); - _graph_handle = graph_handle; - _qnn_interface = qnn_interface; - } - - ~ggml_qnn_graph() { QNN_LOG_DEBUG("[%s][%s]destroy", get_backend_name(_device), _graph_name.c_str()); } - - bool build_graph(ggml_op_constructor_t op_constructor, const ggml_tensor_array_t &tensor_inputs, - const ggml_tensor_array_t &tensor_outputs) { - GGML_ASSERT(op_constructor); - if (!is_valid()) { - QNN_LOG_ERROR("Invalid graph"); - return false; - } - - QNN_LOG_DEBUG("[%s][%s]build_graph start", get_backend_name(_device), _graph_name.c_str()); - _op_config = op_constructor(_graph_name, _qnn_instance); - if (!_op_config->initialize_op_nodes(_device, _graph_handle, tensor_inputs, tensor_outputs)) { - QNN_LOG_ERROR("[%s][%s]initialize_op_nodes failed", get_backend_name(_device), _graph_name.c_str()); - return false; - } - - if (!_op_config->add_op_to_graph(_graph_handle)) { - QNN_LOG_ERROR("[%s]add nodes failed", _graph_name.c_str()); - return false; - } - - auto error = _qnn_interface->qnn_graph_finalize(_graph_handle, nullptr, nullptr); - if (error != QNN_SUCCESS) { - QNN_LOG_ERROR("[%s][%s]qnn_graph_finalize.error: %s", get_backend_name(_device), _graph_name.c_str(), - get_qnn_error_string(error)); - return false; - } - - QNN_LOG_DEBUG("[%s][%s]build_graph succeed", get_backend_name(_device), _graph_name.c_str()); - return true; - } - - bool execute(const ggml_tensor_array_t &tensor_inputs, const ggml_tensor_array_t &tensor_outputs) { - if (!_op_config->bind_input_tensors(tensor_inputs)) { - QNN_LOG_ERROR("[%s][%s]bind input tensors failed", get_backend_name(_device), _graph_name.c_str()); - return false; - } - - if (!_op_config->bind_output_tensors(tensor_outputs)) { - QNN_LOG_ERROR("[%s][%s]bind output tensors failed", get_backend_name(_device), _graph_name.c_str()); - return false; - } - - auto &qnn_tensor_inputs = _op_config->get_qnn_input_tensors(); - auto &qnn_tensor_outputs = _op_config->get_qnn_output_tensors(); - - auto error = - _qnn_interface->qnn_graph_execute(_graph_handle, qnn_tensor_inputs.data(), qnn_tensor_inputs.size(), - qnn_tensor_outputs.data(), qnn_tensor_outputs.size(), nullptr, nullptr); - _op_config->unbind_input_tensors(); - _op_config->unbind_output_tensors(); - - if (error != QNN_SUCCESS) { - if (_device == QNN_BACKEND_NPU && error == QNN_COMMON_ERROR_SYSTEM_COMMUNICATION) { - QNN_LOG_WARN("[%s][%s]NPU crashed. SSR detected. 
Caused QNN graph execute error.", - get_backend_name(_device), _graph_name.c_str()); - } else { - QNN_LOG_ERROR("[%s][%s]error: %s", get_backend_name(_device), _graph_name.c_str(), - get_qnn_error_string(error)); - } - return false; - } - - QNN_LOG_DEBUG("[%s][%s]execute succeed", get_backend_name(_device), _graph_name.c_str()); - return true; - } + bool build_graph_from_op(ggml_tensor *op); + bool build_graph_from_ggml_graph(const ggml_cgraph *cgraph); + bool execute(ggml_tensor *op); + bool execute(const ggml_cgraph *cgraph); bool is_valid() const { return _graph_handle != nullptr; } - Qnn_GraphHandle_t get_graph_handler() const { return _graph_handle; } - + std::shared_ptr get_qnn_instance() { return _qnn_instance; } const std::string &get_name() const { return _graph_name; } + QNNBackend get_device() const { return _device; } private: + bool finalize(); + const std::string _graph_name; const QNNBackend _device; Qnn_GraphHandle_t _graph_handle = nullptr; std::shared_ptr _qnn_instance; std::shared_ptr _qnn_interface; - std::unique_ptr _op_config; - std::vector _param_types; + qnn_op_config_array_t _operations; - DISABLE_COPY(ggml_qnn_graph); - DISABLE_MOVE(ggml_qnn_graph); + qnn_tensor_array_t _tensor_inputs; + qnn_tensor_array_t _tensor_outputs; + std::vector _qnn_tensor_inputs; + std::vector _qnn_tensor_outputs; + + DISABLE_COPY(qnn_graph); + DISABLE_MOVE(qnn_graph); }; +using qnn_graph_ptr_t = std::shared_ptr; + } // namespace qnn diff --git a/ggml/src/ggml-qnn/op-config-base.hpp b/ggml/src/ggml-qnn/op-config-base.hpp index 159944a7d7..274bb8318f 100644 --- a/ggml/src/ggml-qnn/op-config-base.hpp +++ b/ggml/src/ggml-qnn/op-config-base.hpp @@ -10,8 +10,6 @@ namespace qnn { -using ggml_tensor_array_t = std::vector; - /** * @class ggml_qnn_op_config * @brief Abstract base class for configuring QNN operations. @@ -23,6 +21,34 @@ class ggml_qnn_op_config { public: virtual ~ggml_qnn_op_config() {} + /** + * @brief Sets custom input tensors for the operation. This method should be called before `initialize_op_nodes`. + * If no custom input tensors are provided, the input tensors will be automatically created from the input ggml + * tensors. + * + * This pure virtual function must be overridden by derived classes to set + * the input tensors for the operation. The function takes a reference to a + * vector of qnn_tensor_ptr_t objects, which represent the input tensors. + * + * @param tensor_inputs A reference to a vector of qnn_tensor_ptr_t objects representing the input tensors. + */ + virtual void set_input_tensors(qnn::qnn_tensor_array_t &tensor_inputs) = 0; + virtual void set_input_tensors(qnn::qnn_tensor_array_t &&tensor_inputs) = 0; + + /** + * @brief Sets custom output tensors for the operation. This method should be called before `initialize_op_nodes`. + * If no custom output tensors are provided, the output tensors will be automatically created from the output ggml + * tensors. + * + * This pure virtual function must be overridden by derived classes to set + * the output tensors for the operation. The function takes a reference to a + * vector of qnn_tensor_ptr_t objects, which represent the output tensors. + * + * @param tensor_outputs A reference to a vector of qnn_tensor_ptr_t objects representing the output tensors. + */ + virtual void set_output_tensors(qnn::qnn_tensor_array_t &tensor_outputs) = 0; + virtual void set_output_tensors(qnn::qnn_tensor_array_t &&tensor_inputs) = 0; + /** * @brief Creates tensors and internal nodes for constructing the calculation graph. 
* @@ -31,36 +57,32 @@ public: * the internal nodes necessary for constructing the calculation graph. It takes * input and output tensor arrays as parameters. * - * @param device The backend device where tensors will be created. - * @param graph_handle The handle to the graph where tensors and nodes will be associated. - * @param tensor_inputs An array of input tensors. - * @param tensor_outputs An array of output tensors. + * @param device The backend device where tensors will be created. + * @param graph_handle The handle to the graph where tensors and nodes will be associated. * @return true if tensors and nodes are successfully created, false otherwise. */ - virtual bool initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, - const ggml_tensor_array_t &tensor_inputs, - const ggml_tensor_array_t &tensor_outputs) = 0; + virtual bool initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle) = 0; /** - * @brief Pure virtual function to retrieve the input tensors for QNN (Quantized Neural Network). + * @brief Pure virtual function to retrieve the input tensors. * * This function must be overridden by derived classes to provide the specific implementation * for retrieving the input tensors used in QNN operations. * - * @return A reference to a vector of Qnn_Tensor_t objects representing the input tensors. + * @return A reference to a vector of qnn_tensor_ptr_t objects representing the input tensors. */ - virtual std::vector &get_qnn_input_tensors() = 0; + virtual const qnn_tensor_array_t &get_input_tensors() = 0; /** - * @brief Pure virtual function to retrieve the output tensors of a QNN (Quantized Neural Network). + * @brief Pure virtual function to retrieve the output tensors. * * This function must be overridden by any derived class to provide access to the * output tensors of the QNN. The function returns a reference to a vector of - * Qnn_Tensor_t objects, which represent the output tensors. + * qnn_tensor_ptr_t objects, which represent the output tensors. * - * @return std::vector& Reference to a vector of Qnn_Tensor_t objects. + * @return A reference to a vector of qnn_tensor_ptr_t objects representing the output tensors. */ - virtual std::vector &get_qnn_output_tensors() = 0; + virtual const qnn_tensor_array_t &get_output_tensors() = 0; /** * @brief Adds an operation to the given graph. 
@@ -125,5 +147,6 @@ public: }; using qnn_op_config_ptr_t = std::shared_ptr; +using qnn_op_config_array_t = std::vector; } // namespace qnn diff --git a/ggml/src/ggml-qnn/op-config-caps.cpp b/ggml/src/ggml-qnn/op-config-caps.cpp new file mode 100644 index 0000000000..aab8f65958 --- /dev/null +++ b/ggml/src/ggml-qnn/op-config-caps.cpp @@ -0,0 +1,223 @@ + +#include "op-config.hpp" + +namespace { + +using op_dims_calc_func_t = void (*)(const std::vector &input_dims, + qnn::ggml_dimension_array_t &output_dims); + +void element_wise_op_dims(const std::vector &input_dims, + qnn::ggml_dimension_array_t &output_dims) { + for (size_t i = 1; i < std::size(output_dims); i++) { + output_dims[i] = input_dims.front()[i]; + } +} + +void mat_mul_op_dims(const std::vector &input_dims, + qnn::ggml_dimension_array_t &output_dims) { + GGML_ASSERT(input_dims.size() == 2); + output_dims[0] = input_dims.front()[1]; + output_dims[1] = input_dims.back()[1]; +} + +struct qnn_op_caps_t { + const char *qnn_op_name = nullptr; + const size_t input_param_count = 0; + op_dims_calc_func_t calc_dims_func = nullptr; +}; + +constexpr const qnn_op_caps_t kOpCaps[] = { + {}, // GGML_OP_NONE + {}, // GGML_OP_DUP + { + // GGML_OP_ADD + QNN_OP_ELEMENT_WISE_ADD, // qnn_op_name + 2, // input_param_count + element_wise_op_dims, // calc_dims_func + }, + {}, // GGML_OP_ADD1 + {}, // GGML_OP_ACC + { + // GGML_OP_SUB + QNN_OP_ELEMENT_WISE_SUBTRACT, // qnn_op_name + 2, // input_param_count + element_wise_op_dims, // calc_dims_func + }, + { + // GGML_OP_MUL + QNN_OP_ELEMENT_WISE_MULTIPLY, // qnn_op_name + 2, // input_param_count + element_wise_op_dims, // calc_dims_func + }, + { + // GGML_OP_DIV + QNN_OP_ELEMENT_WISE_DIVIDE, // qnn_op_name + 2, // input_param_count + element_wise_op_dims, // calc_dims_func + }, + {}, // GGML_OP_SQR + { + // GGML_OP_SQRT + QNN_OP_ELEMENT_WISE_SQUARE_ROOT, // qnn_op_name + 1, // input_param_count + element_wise_op_dims, // calc_dims_func + }, + { + // GGML_OP_LOG + QNN_OP_ELEMENT_WISE_LOG, // qnn_op_name + 1, // input_param_count + element_wise_op_dims, // calc_dims_func + }, + {}, // GGML_OP_SIN + {}, // GGML_OP_COS + {}, // GGML_OP_SUM + {}, // GGML_OP_SUM_ROWS + {}, // GGML_OP_MEAN + {}, // GGML_OP_ARGMAX + {}, // GGML_OP_COUNT_EQUAL + {}, // GGML_OP_REPEAT + {}, // GGML_OP_REPEAT_BACK + {}, // GGML_OP_CONCAT + {}, // GGML_OP_SILU_BACK + {}, // GGML_OP_NORM + {}, // GGML_OP_RMS_NORM + {}, // GGML_OP_RMS_NORM_BACK + {}, // GGML_OP_GROUP_NORM + { + // GGML_OP_MUL_MAT + QNN_OP_MAT_MUL, // qnn_op_name + 2, // input_param_count + mat_mul_op_dims, // calc_dims_func + }, + {}, // GGML_OP_MUL_MAT_ID + {}, // GGML_OP_OUT_PROD + {}, // GGML_OP_SCALE + {}, // GGML_OP_SET + {}, // GGML_OP_CPY + {}, // GGML_OP_CONT + { + // GGML_OP_RESHAPE + QNN_OP_RESHAPE, // qnn_op_name + 1, // input_param_count + nullptr, // TODO: calc_dims_func + }, + {}, // GGML_OP_VIEW + {}, // GGML_OP_PERMUTE + {}, // GGML_OP_TRANSPOSE + {}, // GGML_OP_GET_ROWS + {}, // GGML_OP_GET_ROWS_BACK + {}, // GGML_OP_DIAG + {}, // GGML_OP_DIAG_MASK_INF + {}, // GGML_OP_DIAG_MASK_ZERO + {}, // GGML_OP_SOFT_MAX + {}, // GGML_OP_SOFT_MAX_BACK + {}, // GGML_OP_ROPE + {}, // GGML_OP_ROPE_BACK + {}, // GGML_OP_CLAMP + {}, // GGML_OP_CONV_TRANSPOSE_1D + {}, // GGML_OP_IM2COL + {}, // GGML_OP_IM2COL_BACK + {}, // GGML_OP_CONV_TRANSPOSE_2D + {}, // GGML_OP_POOL_1D + {}, // GGML_OP_POOL_2D + {}, // GGML_OP_POOL_2D_BACK + {}, // GGML_OP_UPSCALE + {}, // GGML_OP_PAD + {}, // GGML_OP_PAD_REFLECT_1D + {}, // GGML_OP_ARANGE + + {}, // GGML_OP_TIMESTEP_EMBEDDING + {}, 
// GGML_OP_ARGSORT + {}, // GGML_OP_LEAKY_RELU + + {}, // GGML_OP_FLASH_ATTN_EXT + {}, // GGML_OP_FLASH_ATTN_BACK + {}, // GGML_OP_SSM_CONV + {}, // GGML_OP_SSM_SCAN + {}, // GGML_OP_WIN_PART + {}, // GGML_OP_WIN_UNPART + {}, // GGML_OP_GET_REL_POS + {}, // GGML_OP_ADD_REL_POS + {}, // GGML_OP_RWKV_WKV6 + + {}, // GGML_OP_UNARY + + {}, // GGML_OP_MAP_UNARY + {}, // GGML_OP_MAP_BINARY + + {}, // GGML_OP_MAP_CUSTOM1_F32 + {}, // GGML_OP_MAP_CUSTOM2_F32 + {}, // GGML_OP_MAP_CUSTOM3_F32 + + {}, // GGML_OP_MAP_CUSTOM1 + {}, // GGML_OP_MAP_CUSTOM2 + {}, // GGML_OP_MAP_CUSTOM3 + + {}, // GGML_OP_CROSS_ENTROPY_LOSS + {}, // GGML_OP_CROSS_ENTROPY_LOSS_BACK + {}, // GGML_OP_OPT_STEP_ADAMW + + // ggml_unary_op + {}, // GGML_UNARY_OP_ABS + {}, // GGML_UNARY_OP_SGN + {}, // GGML_UNARY_OP_NEG + {}, // GGML_UNARY_OP_STEP + {}, // GGML_UNARY_OP_TANH + {}, // GGML_UNARY_OP_ELU + {}, // GGML_UNARY_OP_RELU + {}, // GGML_UNARY_OP_SIGMOID + { + // GGML_UNARY_OP_GELU + QNN_OP_GELU, // qnn_op_name + 1, // input_param_count + nullptr, // TODO: calc_dims_func + }, + {}, // GGML_UNARY_OP_GELU_QUICK + {}, // GGML_UNARY_OP_SILU + {}, // GGML_UNARY_OP_HARDSWISH + {}, // GGML_UNARY_OP_HARDSIGMOID + {}, // GGML_UNARY_OP_EXP +}; + +static_assert(kOpCaps[GGML_OP_NONE].calc_dims_func == nullptr, "GGML_OP_NONE should not have calc_dims_func function"); +static_assert(kOpCaps[GGML_OP_ADD].calc_dims_func == element_wise_op_dims, + "GGML_OP_ADD does not have element_wise_op_dims function"); +static_assert(kOpCaps[GGML_OP_MUL_MAT].calc_dims_func == mat_mul_op_dims, + "GGML_OP_MUL_MAT does not have mat_mul_op_dims function"); +static_assert(kOpCaps[GGML_OP_LOG].calc_dims_func == element_wise_op_dims, + "GGML_OP_LOG does not have element_wise_op_dims function"); +static_assert(std::size(kOpCaps) == (GGML_OP_COUNT + GGML_UNARY_OP_COUNT), + "GGML_OP_COUNT does not match the size of the kOpCaps table"); + +} // namespace + +namespace qnn { + +size_t get_qnn_op_index(const ggml_tensor *tensor) { + if (tensor->op == GGML_OP_UNARY) { + return kGgmlUnaryOpStart + ggml_get_unary_op(tensor); + } + + return tensor->op; +} + +void get_ggml_op_output_dimensions(const std::vector &input_dims, size_t op, + ggml_dimension_array_t &output_dims) { + GGML_ASSERT(op < std::size(kOpCaps)); + auto get_dims = kOpCaps[op].calc_dims_func; + GGML_ASSERT(get_dims); + get_dims(input_dims, output_dims); +} + +const char *get_qnn_op_name(size_t op) { + GGML_ASSERT(op < std::size(kOpCaps)); + GGML_ASSERT(kOpCaps[op].qnn_op_name); + return kOpCaps[op].qnn_op_name; +} + +size_t get_qnn_op_input_param_count(size_t op) { + GGML_ASSERT(op < std::size(kOpCaps)); + return kOpCaps[op].input_param_count; +} + +} // namespace qnn diff --git a/ggml/src/ggml-qnn/op-config.cpp b/ggml/src/ggml-qnn/op-config.cpp index b3c84b5435..7edb4078a5 100644 --- a/ggml/src/ggml-qnn/op-config.cpp +++ b/ggml/src/ggml-qnn/op-config.cpp @@ -24,16 +24,7 @@ qnn::qnn_dimension_array_t get_transposed_dimensions(const qnn::qnn_dimension_ar } int get_rank(const qnn::ggml_tensor_array_t &tensor_inputs, const qnn::ggml_tensor_array_t &tensor_outputs) { - int tensor_rank = 0; - // get the max tensor rank - for (auto tensor : tensor_inputs) { - tensor_rank = std::max(tensor_rank, ggml_n_dims(tensor)); - } - for (auto tensor : tensor_outputs) { - tensor_rank = std::max(tensor_rank, ggml_n_dims(tensor)); - } - - return tensor_rank; + return std::max(qnn::get_ggml_tensors_max_rank(tensor_inputs), qnn::get_ggml_tensors_max_rank(tensor_outputs)); } Qnn_DataType_t get_tensor_type(const 
qnn::qnn_tensor_array_t &tensors) { @@ -49,93 +40,6 @@ Qnn_DataType_t get_tensor_type(const qnn::qnn_tensor_array_t &tensors) { return type; } -struct tensor_common_params { - const char *name_prefix; - int tensor_rank; - bool is_input; - QNNBackend device; - Qnn_GraphHandle_t graph_handle; - std::shared_ptr qnn_instance; -}; - -void create_tensors_from_ggml_tensor(const tensor_common_params ¶ms, const qnn::ggml_tensor_array_t &ggml_tensors, - qnn::qnn_tensor_array_t *tensor_wrappers, std::vector *qnn_tensors) { - using namespace qnn; - - tensor_wrappers->resize(ggml_tensors.size()); - if (qnn_tensors) { - qnn_tensors->resize(ggml_tensors.size()); - } - char buffer[GGML_MAX_NAME] = {}; - auto tensor_type = params.is_input ? ggml_qnn_tensor::INPUT : ggml_qnn_tensor::OUTPUT; - for (size_t i = 0; i < ggml_tensors.size(); i++) { - snprintf(buffer, GGML_MAX_NAME, "%s%d", params.name_prefix, (int)i); - auto *ggml_tensor = ggml_tensors[i]; - (*tensor_wrappers)[i] = std::make_shared(tensor_type, std::string(buffer), ggml_tensor->ne, - ggml_tensor->type, params.tensor_rank, params.device, - params.graph_handle, params.qnn_instance); - } -} - -bool bind_tensors(const qnn::ggml_tensor_array_t &ggml_tensors, qnn::qnn_tensor_array_t &tensor_wrappers, - std::vector &qnn_tensors) { - for (size_t i = 0; i < ggml_tensors.size(); i++) { - auto *ggml_tensor = ggml_tensors[i]; - if (!tensor_wrappers[i]->bind_ggml_tensor(ggml_tensor)) { - QNN_LOG_ERROR("bind tensor %s failed", ggml_get_name(ggml_tensor)); - return false; - } - - qnn_tensors[i] = tensor_wrappers[i]->get_qnn_tensor(); - } - - return true; -} - -class ggml_qnn_connectable_op_config : public qnn::ggml_qnn_op_config_base { -public: - explicit ggml_qnn_connectable_op_config(const std::string &name, const std::string &package_name, - const std::string &op_type, std::shared_ptr qnn_instance) - : ggml_qnn_op_config_base(name, package_name, op_type, qnn_instance) {} - - bool initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, - const qnn::ggml_tensor_array_t &tensor_inputs, - const qnn::ggml_tensor_array_t &tensor_outputs) override { - GGML_UNUSED(device); - GGML_UNUSED(graph_handle); - GGML_UNUSED(tensor_inputs); - GGML_UNUSED(tensor_outputs); - return true; - } - - void set_input_tensors(qnn::qnn_tensor_array_t &tensor_inputs) { - _tensor_inputs = tensor_inputs; - _qnn_tensor_inputs.resize(_tensor_inputs.size()); - } - - void set_input_tensors(qnn::qnn_tensor_array_t &&tensor_inputs) { - _tensor_inputs = std::move(tensor_inputs); - _qnn_tensor_inputs.resize(_tensor_inputs.size()); - } - - void set_output_tensors(qnn::qnn_tensor_array_t &tensor_outputs) { - _tensor_outputs = tensor_outputs; - _qnn_tensor_outputs.resize(_tensor_outputs.size()); - } - - void set_output_tensors(qnn::qnn_tensor_array_t &&tensor_outputs) { - _tensor_outputs = std::move(tensor_outputs); - _qnn_tensor_outputs.resize(_tensor_outputs.size()); - } - - qnn::qnn_tensor_array_t &get_input_tensors() { return _tensor_inputs; } - qnn::qnn_tensor_array_t &get_output_tensors() { return _tensor_outputs; } - -private: - DISABLE_COPY(ggml_qnn_connectable_op_config); - DISABLE_MOVE(ggml_qnn_connectable_op_config); -}; - } // namespace namespace qnn { @@ -161,7 +65,7 @@ bool ggml_qnn_op_config_base::add_tensor_param(const std::string &name, const qn } GGML_ASSERT(data_size > 0); - if (!param_tensor->bind_buffer(const_cast(data), data_size)) { + if (!param_tensor->set_data_buffer(data, data_size)) { QNN_LOG_ERROR("parameter tensor bind_buffer failed"); return false; } @@ 
-181,6 +85,26 @@ bool ggml_qnn_op_config_base::add_tensor_param(const std::string &name, const qn return true; } +void ggml_qnn_op_config_base::set_input_tensors(qnn::qnn_tensor_array_t &tensor_inputs) { + _tensor_inputs = tensor_inputs; + _qnn_tensor_inputs.resize(_tensor_inputs.size()); +} + +void ggml_qnn_op_config_base::set_input_tensors(qnn::qnn_tensor_array_t &&tensor_inputs) { + _tensor_inputs = std::move(tensor_inputs); + _qnn_tensor_inputs.resize(_tensor_inputs.size()); +} + +void ggml_qnn_op_config_base::set_output_tensors(qnn::qnn_tensor_array_t &tensor_outputs) { + _tensor_outputs = tensor_outputs; + _qnn_tensor_outputs.resize(_tensor_outputs.size()); +} + +void ggml_qnn_op_config_base::set_output_tensors(qnn::qnn_tensor_array_t &&tensor_outputs) { + _tensor_outputs = std::move(tensor_outputs); + _qnn_tensor_outputs.resize(_tensor_outputs.size()); +} + bool ggml_qnn_op_config_base::add_op_to_graph(Qnn_GraphHandle_t graph_handle) { GGML_ASSERT(_qnn_tensor_inputs.size() == _tensor_inputs.size()); GGML_ASSERT(_qnn_tensor_outputs.size() == _tensor_outputs.size()); @@ -221,12 +145,12 @@ bool ggml_qnn_op_config_base::add_op_to_graph(Qnn_GraphHandle_t graph_handle) { bool ggml_qnn_op_config_base::bind_input_tensors(const ggml_tensor_array_t &tensor_inputs) { GGML_ASSERT(tensor_inputs.size() == _tensor_inputs.size()); - return bind_tensors(tensor_inputs, _tensor_inputs, _qnn_tensor_inputs); + return qnn::bind_tensors(tensor_inputs, _tensor_inputs, _qnn_tensor_inputs); } bool ggml_qnn_op_config_base::bind_output_tensors(const ggml_tensor_array_t &tensor_outputs) { GGML_ASSERT(tensor_outputs.size() == _tensor_outputs.size()); - return bind_tensors(tensor_outputs, _tensor_outputs, _qnn_tensor_outputs); + return qnn::bind_tensors(tensor_outputs, _tensor_outputs, _qnn_tensor_outputs); } void ggml_qnn_op_config_base::unbind_input_tensors() { @@ -257,55 +181,42 @@ Qnn_OpConfig_t ggml_qnn_op_config_base::get_op_config() { return config; } -bool ggml_qnn_single_op_config::initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, - const ggml_tensor_array_t &tensor_inputs, - const ggml_tensor_array_t &tensor_outputs) { - const auto tensor_rank = get_rank(tensor_inputs, tensor_outputs); - tensor_common_params params = {"src", tensor_rank, true, device, graph_handle, _qnn_instance}; - create_tensors_from_ggml_tensor(params, tensor_inputs, &_tensor_inputs, &_qnn_tensor_inputs); - params.name_prefix = "dst"; - params.is_input = false; - create_tensors_from_ggml_tensor(params, tensor_outputs, &_tensor_outputs, &_qnn_tensor_outputs); - - if (_param_buffer.size() > 0) { - // handle parameters in output tensor - auto *params = tensor_outputs.front()->op_params; - memcpy(_param_buffer.data(), params, _param_buffer.size()); - - const uint32_t count = uint32_t(_param_buffer.size() / qnn_datatype_size(_param_type)); - const qnn_dimension_array_t param_dims = {count, 1, 1, 1}; - add_tensor_param(_param_name, param_dims, 1, _param_buffer.data(), _param_type, device, graph_handle); - } - +bool ggml_qnn_single_op_config::initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle) { + GGML_UNUSED(device); + GGML_UNUSED(graph_handle); return true; } +void ggml_qnn_aggregate_op_config::set_input_tensors(qnn::qnn_tensor_array_t &tensor_inputs) { + _tensor_inputs = tensor_inputs; +} + +void ggml_qnn_aggregate_op_config::set_input_tensors(qnn::qnn_tensor_array_t &&tensor_inputs) { + _tensor_inputs = std::move(tensor_inputs); +} + +void 
ggml_qnn_aggregate_op_config::set_output_tensors(qnn::qnn_tensor_array_t &tensor_outputs) { + _tensor_outputs = tensor_outputs; +} + +void ggml_qnn_aggregate_op_config::set_output_tensors(qnn::qnn_tensor_array_t &&tensor_outputs) { + _tensor_outputs = std::move(tensor_outputs); +} + bool ggml_qnn_aggregate_op_config::bind_input_tensors(const ggml_tensor_array_t &tensor_inputs) { - return bind_tensors(tensor_inputs, _tensor_inputs, _qnn_tensor_inputs); + return qnn::bind_tensors(tensor_inputs, _tensor_inputs); } bool ggml_qnn_aggregate_op_config::bind_output_tensors(const ggml_tensor_array_t &tensor_outputs) { - return bind_tensors(tensor_outputs, _tensor_outputs, _qnn_tensor_outputs); + return qnn::bind_tensors(tensor_outputs, _tensor_outputs); } -bool ggml_qnn_matmul_op_config::initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, - const ggml_tensor_array_t &tensor_inputs, - const ggml_tensor_array_t &tensor_outputs) { - GGML_ASSERT(tensor_inputs.size() == 2); - GGML_ASSERT(tensor_outputs.size() == 1); - const auto tensor_rank = get_rank(tensor_inputs, tensor_outputs); - GGML_ASSERT(tensor_rank >= 2); - - // create input tensors - tensor_common_params params = {"src", tensor_rank, true, device, graph_handle, _qnn_instance}; - create_tensors_from_ggml_tensor(params, tensor_inputs, &_tensor_inputs, &_qnn_tensor_inputs); - - // create output tensor - params.name_prefix = "dst"; - params.is_input = false; - create_tensors_from_ggml_tensor(params, tensor_outputs, &_tensor_outputs, &_qnn_tensor_outputs); +bool ggml_qnn_matmul_op_config::initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle) { + GGML_ASSERT(_tensor_inputs.size() == 2); + GGML_ASSERT(_tensor_outputs.size() == 1); // create convert nodes + const auto tensor_rank = _tensor_inputs.front()->get_rank(); qnn_tensor_array_t mat_mul_tensor_inputs = _tensor_inputs; qnn_tensor_array_t mat_mul_tensor_outputs = _tensor_outputs; if (!create_convert_nodes(device, graph_handle, tensor_rank, mat_mul_tensor_inputs, mat_mul_tensor_outputs)) { @@ -343,8 +254,8 @@ qnn_tensor_ptr_t ggml_qnn_matmul_op_config::create_gather_nodes(QNNBackend devic auto gather_out = std::make_shared(ggml_qnn_tensor::INTERMEDIATE, name + "_out", dimensions, tensor_input->get_data_type(), rank, device, graph_handle, qnn_instance); - auto gather_op = std::make_shared(name, QNN_OP_PACKAGE_NAME_QTI_AISW, - QNN_OP_GATHER, qnn_instance); + auto gather_op = std::make_shared(name, QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_GATHER, + qnn_instance); Qnn_Scalar_t scalar = QNN_SCALAR_INIT; scalar.dataType = QNN_DATATYPE_INT_32; @@ -355,16 +266,16 @@ qnn_tensor_ptr_t ggml_qnn_matmul_op_config::create_gather_nodes(QNNBackend devic // here we calculate the index mapping, will generate a 1d tensor like [0, 0, 0, 1, 1, 1, 2, 2, 2, ...], // by repeating each index [scale] times. 
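// e.g. if dimensions[axis] == 6 and the input extent along that axis is 2, then scale == 3 and the generated index buffer is [0, 0, 0, 1, 1, 1].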
const auto scale = dimensions[axis] / tensor_input->get_dimensions()[axis]; - std::vector index_buffer(dimensions[axis] * sizeof(uint32_t)); - for (uint32_t *curr = reinterpret_cast(index_buffer.data()), *end = curr + dimensions[axis]; + auto index_buffer = std::make_shared(dimensions[axis] * sizeof(uint32_t)); + for (uint32_t *curr = reinterpret_cast(index_buffer->get_buffer()), *end = curr + dimensions[axis]; curr < end; curr++) { - *curr = (curr - reinterpret_cast(index_buffer.data())) / scale; + *curr = (curr - reinterpret_cast(index_buffer->get_buffer())) / scale; } auto gather_index = std::make_shared( ggml_qnn_tensor::PARAMETER, name + "_index", qnn_dimension_array_t{dimensions[axis]}, QNN_DATATYPE_UINT_32, 1, device, graph_handle, qnn_instance); - gather_index->set_data_buffer(std::move(index_buffer)); + gather_index->set_data_buffer(index_buffer); gather_op->set_input_tensors({tensor_input, gather_index}); tensor_output = gather_out; @@ -409,8 +320,8 @@ bool ggml_qnn_matmul_op_config::create_convert_nodes(QNNBackend device, Qnn_Grap auto convert_out = std::make_shared(ggml_qnn_tensor::INTERMEDIATE, convert_name + "_out", convert_in->get_dimensions(), tensor_type, rank, device, graph_handle, _qnn_instance); - auto convert = std::make_shared(convert_name, QNN_OP_PACKAGE_NAME_QTI_AISW, - QNN_OP_CONVERT, _qnn_instance); + auto convert = std::make_shared(convert_name, QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_CONVERT, _qnn_instance); convert->set_input_tensors({convert_in}); convert->set_output_tensors({convert_out}); tensor_inputs[i] = convert_out; @@ -424,8 +335,8 @@ bool ggml_qnn_matmul_op_config::create_convert_nodes(QNNBackend device, Qnn_Grap auto convert_in = std::make_shared(ggml_qnn_tensor::INTERMEDIATE, convert_name + "_in", convert_out->get_dimensions(), tensor_type, rank, device, graph_handle, _qnn_instance); - auto output_convert = std::make_shared( - convert_name, QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_CONVERT, _qnn_instance); + auto output_convert = std::make_shared(convert_name, QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_CONVERT, _qnn_instance); output_convert->set_input_tensors({convert_in}); output_convert->set_output_tensors({convert_out}); tensor_outputs.front() = convert_in; @@ -495,12 +406,12 @@ bool ggml_qnn_matmul_op_config::create_mat_mul_nodes(QNNBackend device, Qnn_Grap dst->get_data_type(), rank, device, graph_handle, _qnn_instance); // create transpose_out - auto transpose_out = std::make_shared( - _name + "_trans1", QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_TRANSPOSE, _qnn_instance); + auto transpose_out = std::make_shared(_name + "_trans1", QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_TRANSPOSE, _qnn_instance); // create mat_mul - auto mat_mul = std::make_shared(_name, QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_MAT_MUL, - _qnn_instance); + auto mat_mul = + std::make_shared(_name, QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_MAT_MUL, _qnn_instance); Qnn_Scalar_t scalar = QNN_SCALAR_INIT; scalar.dataType = QNN_DATATYPE_BOOL_8; @@ -528,19 +439,20 @@ bool ggml_qnn_matmul_op_config::create_mat_mul_nodes(QNNBackend device, Qnn_Grap return true; } -ggml_op_constructor_t create_op_constructor(const std::string &op_name) { +ggml_op_constructor_t create_op_constructor(size_t op) { + std::string op_name = get_qnn_op_name(op); if (op_name == QNN_OP_MAT_MUL) { // For QNN_OP_MAT_MUL, we need to transpose the input tensor return [](const std::string &instance_name, - std::shared_ptr qnn_instance) -> std::unique_ptr { + std::shared_ptr qnn_instance) -> std::shared_ptr { QNN_LOG_DEBUG("create 
QNN_OP_MAT_MUL, name %s", instance_name.c_str()); - return std::make_unique(instance_name, qnn_instance); + return std::make_shared(instance_name, qnn_instance); }; } return [op_name](const std::string &instance_name, - std::shared_ptr qnn_instance) -> std::unique_ptr { - return std::make_unique(instance_name, QNN_OP_PACKAGE_NAME_QTI_AISW, op_name, + std::shared_ptr qnn_instance) -> std::shared_ptr { + return std::make_shared(instance_name, QNN_OP_PACKAGE_NAME_QTI_AISW, op_name, qnn_instance); }; } diff --git a/ggml/src/ggml-qnn/op-config.hpp b/ggml/src/ggml-qnn/op-config.hpp index a05b75ade7..ca066520bc 100644 --- a/ggml/src/ggml-qnn/op-config.hpp +++ b/ggml/src/ggml-qnn/op-config.hpp @@ -1,7 +1,7 @@ #pragma once -#include #include +#include #include #include @@ -13,9 +13,28 @@ namespace qnn { using ggml_op_constructor_t = - std::function(const std::string &, std::shared_ptr)>; + std::function(const std::string &, std::shared_ptr)>; -ggml_op_constructor_t create_op_constructor(const std::string &op_name); +constexpr const size_t kGgmlUnaryOpStart = GGML_OP_COUNT; + +size_t get_qnn_op_index(const ggml_tensor *tensor); +void get_ggml_op_output_dimensions(const std::vector &input_dims, size_t op, + ggml_dimension_array_t &output_dims); + +const char *get_qnn_op_name(size_t op); +size_t get_qnn_op_input_param_count(size_t op); + +ggml_op_constructor_t create_op_constructor(size_t op); + +inline bool add_op_to_graph(Qnn_GraphHandle_t graph_handle, std::vector &operations) { + for (auto &op : operations) { + if (!op->add_op_to_graph(graph_handle)) { + return false; + } + } + + return true; +} class ggml_qnn_op_config_base : public ggml_qnn_op_config { public: @@ -27,13 +46,18 @@ public: bool add_tensor_param(const std::string &name, const qnn_dimension_array_t &dimensions, int rank, const uint8_t *data, const Qnn_DataType_t data_type, QNNBackend device, Qnn_GraphHandle_t graph_handle); + + void set_input_tensors(qnn::qnn_tensor_array_t &tensor_inputs) override; + void set_input_tensors(qnn::qnn_tensor_array_t &&tensor_inputs) override; + void set_output_tensors(qnn::qnn_tensor_array_t &tensor_inputs) override; + void set_output_tensors(qnn::qnn_tensor_array_t &&tensor_inputs) override; bool add_op_to_graph(Qnn_GraphHandle_t graph_handle) override; bool bind_input_tensors(const ggml_tensor_array_t &tensor_inputs) override; bool bind_output_tensors(const ggml_tensor_array_t &tensor_outputs) override; void unbind_input_tensors() override; void unbind_output_tensors() override; - std::vector &get_qnn_input_tensors() override { return _qnn_tensor_inputs; } - std::vector &get_qnn_output_tensors() override { return _qnn_tensor_outputs; } + const qnn_tensor_array_t &get_input_tensors() override { return _tensor_inputs; } + const qnn_tensor_array_t &get_output_tensors() override { return _tensor_outputs; } protected: Qnn_OpConfig_t get_op_config(); @@ -60,24 +84,9 @@ public: const std::string &op_type, std::shared_ptr qnn_instance) : ggml_qnn_op_config_base(name, package_name, op_type, qnn_instance) {} - explicit ggml_qnn_single_op_config(const std::string &name, const std::string &package_name, - const std::string &op_type, const std::string ¶m_name, - const Qnn_DataType_t param_type, const size_t param_size, - std::shared_ptr qnn_instance) - : ggml_qnn_op_config_base(name, package_name, op_type, qnn_instance), - _param_name(param_name), - _param_type(param_type), - _param_buffer(param_size) {} - - bool initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, - const ggml_tensor_array_t 
&tensor_inputs, - const ggml_tensor_array_t &tensor_outputs) override; + bool initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle) override; private: - const std::string _param_name; - const Qnn_DataType_t _param_type = QNN_DATATYPE_UINT_32; - std::vector _param_buffer; - DISABLE_COPY(ggml_qnn_single_op_config); DISABLE_MOVE(ggml_qnn_single_op_config); }; @@ -88,26 +97,21 @@ public: : _name(name), _qnn_instance(qnn_instance) {} ~ggml_qnn_aggregate_op_config() { - _qnn_tensor_inputs.clear(); - _qnn_tensor_outputs.clear(); _tensor_inputs.clear(); _tensor_outputs.clear(); _operations.clear(); } + void set_input_tensors(qnn::qnn_tensor_array_t &tensor_inputs) override; + void set_input_tensors(qnn::qnn_tensor_array_t &&tensor_inputs) override; + void set_output_tensors(qnn::qnn_tensor_array_t &tensor_inputs) override; + void set_output_tensors(qnn::qnn_tensor_array_t &&tensor_inputs) override; bool add_op_to_graph(Qnn_GraphHandle_t graph_handle) override { - for (auto &op : _operations) { - if (!op->add_op_to_graph(graph_handle)) { - return false; - } - } - return true; + return qnn::add_op_to_graph(graph_handle, _operations); } bool bind_input_tensors(const ggml_tensor_array_t &tensor_inputs) override; - bool bind_output_tensors(const ggml_tensor_array_t &tensor_outputs) override; - void unbind_input_tensors() override { for (auto &tensor : _tensor_inputs) { tensor->unbind(); @@ -120,8 +124,8 @@ public: } } - std::vector &get_qnn_input_tensors() override { return _qnn_tensor_inputs; } - std::vector &get_qnn_output_tensors() override { return _qnn_tensor_outputs; } + const qnn_tensor_array_t &get_input_tensors() override { return _tensor_inputs; } + const qnn_tensor_array_t &get_output_tensors() override { return _tensor_outputs; } protected: std::string _name; @@ -130,8 +134,6 @@ protected: std::vector _operations; qnn_tensor_array_t _tensor_inputs; qnn_tensor_array_t _tensor_outputs; - std::vector _qnn_tensor_inputs; - std::vector _qnn_tensor_outputs; private: DISABLE_COPY(ggml_qnn_aggregate_op_config); @@ -143,9 +145,7 @@ public: ggml_qnn_matmul_op_config(const std::string &name, std::shared_ptr qnn_instance) : ggml_qnn_aggregate_op_config(name, qnn_instance) {} - bool initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, - const ggml_tensor_array_t &tensor_inputs, - const ggml_tensor_array_t &tensor_outputs) override; + bool initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle) override; private: qnn_tensor_ptr_t create_gather_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, const int rank, diff --git a/ggml/src/ggml-qnn/qnn-types.hpp b/ggml/src/ggml-qnn/qnn-types.hpp index 7461ac3012..ec30602843 100644 --- a/ggml/src/ggml-qnn/qnn-types.hpp +++ b/ggml/src/ggml-qnn/qnn-types.hpp @@ -12,7 +12,9 @@ namespace qnn { // // helper data type / data structure / macros / functions of // Qualcomm QNN(Qualcomm Neural Network, aka Qualcomm AI Engine Direct) SDK -// ref:https://github.com/pytorch/executorch/tree/main/backends/qualcomm +// ref: +// https://github.com/pytorch/executorch/blob/ae3d558d5e6aa04fc52a3065399fe6a773702f52/backends/qualcomm/serialization/qc_schema.py#L53 +// https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/overview.html#supported-snapdragon-devices // ================================================================================================= enum sdk_profile_level { profile_off = 0, profile_basic, profile_detail }; @@ -22,14 +24,18 @@ enum qcom_htp_arch { V69 = 69, V73 = 73, V75 = 75, + V79 = 79, 
// SD 8 Gen 4 (SM8750) }; enum qcom_chipset { UNKNOWN_SM = 0, - SM8450 = 36, // v69 - SM8475 = 42, // v69 - SM8550 = 43, // v73 - SM8650 = 57, // v75 + SM8450 = 36, // v69, SD 8 Gen 1 + SM8475 = 42, // v69, SD 8+ Gen 1 + SM8550 = 43, // v73, SD 8 Gen 2 + SSG2115P = 46, // v73 + SM8650 = 57, // v75, SD 8 Gen 3 + SA8295 = 39, // v68 + SM8750 = 69, // v79, SD 8 Gen 4 }; struct qcom_socinfo { diff --git a/ggml/src/ggml-qnn/tensor.hpp b/ggml/src/ggml-qnn/tensor.hpp index 833c620971..3bd86891cb 100644 --- a/ggml/src/ggml-qnn/tensor.hpp +++ b/ggml/src/ggml-qnn/tensor.hpp @@ -20,9 +20,9 @@ namespace qnn { static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS should be 4"); -class ggml_qnn_tensor { +class ggml_qnn_tensor : public std::enable_shared_from_this { public: - typedef enum _tensor_type { INPUT, OUTPUT, INTERMEDIATE, PARAMETER } tensor_type_t; + typedef enum _tensor_type { INPUT, OUTPUT, INTERMEDIATE, PARAMETER, BIDIRECTION } tensor_type_t; explicit ggml_qnn_tensor(tensor_type_t tensor_type, const std::string &name, const qnn_dimension_array_t &dimensions, Qnn_DataType_t data_type, int rank, @@ -49,18 +49,27 @@ public: qnn_datatype_from_ggml_datatype(data_type), rank, device, graph_handle, qnn_instance) {} ~ggml_qnn_tensor() { - _buffer_storage.clear(); - unbind(); _rpc_buffer.reset(); + unbind(); } - bool set_data_buffer(std::vector &&buffer) { - if (!bind_buffer_impl(buffer.data(), buffer.size())) { - return false; + bool set_data_buffer(const uint8_t *buffer, const size_t buffer_size) { + auto qnn_buffer = std::make_shared(buffer, buffer_size); + if (bind_buffer_impl(qnn_buffer)) { + return true; } - _buffer_storage = std::move(buffer); - return true; + can_unbind = false; + return false; + } + + bool set_data_buffer(qnn_buffer_ptr buffer) { + if (bind_buffer_impl(buffer)) { + return true; + } + + can_unbind = false; + return false; } bool alloc_qnn_tensor_id() { @@ -83,23 +92,32 @@ public: return true; } - bool bind_buffer(uint8_t *buffer, const size_t buffer_size) { - if (!_buffer_storage.empty()) { + bool bind_ggml_tensor(ggml_tensor *tensor) { + if (!can_unbind) { QNN_LOG_DEBUG("[%s]already has buffer storage, skip bind", _tensor_name.c_str()); return true; } - return bind_buffer_impl(buffer, buffer_size); - } +#ifndef NDEBUG + if (tensor->view_src) { + auto *src = tensor->view_src; + QNN_LOG_DEBUG("[%s]tensor(%s_%dx%dx%dx%d) is a view, src: %s_%dx%dx%dx%d", get_backend_name(_device), + tensor->name, tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], src->name, + src->ne[0], src->ne[1], src->ne[2], src->ne[3]); + } +#endif - bool bind_ggml_tensor(ggml_tensor *tensor) { - if (!bind_buffer(reinterpret_cast(tensor->data), ggml_nbytes(tensor))) { + auto buffer = + std::make_shared(reinterpret_cast(tensor->data), ggml_nbytes(tensor)); + if (!bind_buffer_impl(buffer)) { QNN_LOG_WARN("[%s]failed to bind ggml tensor(%s)", _tensor_name.c_str(), ggml_get_name(tensor)); return false; } QNN_LOG_DEBUG("[%s][%s]bind to ggml tensor(%s)", get_backend_name(_device), _tensor_name.c_str(), ggml_get_name(tensor)); + tensor->extra = this; + _ggml_tensor = tensor; return true; } @@ -110,7 +128,7 @@ public: } if (!_buffer) { - QNN_LOG_DEBUG("[%s]bound to ggml tensor", _tensor_name.c_str()); + QNN_LOG_DEBUG("[%s]unbind to ggml tensor", _tensor_name.c_str()); return true; } @@ -119,7 +137,7 @@ public: return false; } - if (!_buffer_storage.empty()) { + if (!can_unbind) { QNN_LOG_DEBUG("[%s]already has buffer storage, stop unbind", _tensor_name.c_str()); return true; } @@ -132,26 +150,32 @@ public: } 
QNN_LOG_DEBUG("[%s][%s]unbind from buffer: %p, size: %d", get_backend_name(_device), _tensor_name.c_str(), - _buffer, (int)_buffer_size); - _buffer = nullptr; - _buffer_size = 0; + _buffer.get(), (int)_buffer->get_size()); + _buffer.reset(); + + if (_ggml_tensor) { + _ggml_tensor->extra = nullptr; + _ggml_tensor = nullptr; + } + return true; } const Qnn_Tensor_t &get_qnn_tensor() const { return _qnn_tensor; } Qnn_DataType_t get_data_type() const { return QNN_TENSOR_GET_DATA_TYPE(_qnn_tensor); } const qnn_dimension_array_t &get_dimensions() const { return _dimensions; } + uint32_t get_rank() const { return QNN_TENSOR_GET_RANK(_qnn_tensor); } uint32_t get_qnn_tensor_id() const { return QNN_TENSOR_GET_ID(_qnn_tensor); } private: - bool bind_buffer_impl(uint8_t *buffer, const size_t buffer_size) { + bool bind_buffer_impl(qnn_buffer_ptr buffer) { if (_buffer) { if (_buffer != buffer) { - QNN_LOG_WARN("[%s]has been bound to another buffer %p", _tensor_name.c_str(), _buffer); + QNN_LOG_WARN("[%s]has been bound to another buffer %p", _tensor_name.c_str(), _buffer.get()); return false; } - QNN_LOG_DEBUG("[%s]already bound to same ggml tensor %p", _tensor_name.c_str(), _buffer); + QNN_LOG_DEBUG("[%s]already bound to same ggml tensor %p", _tensor_name.c_str(), _buffer.get()); return true; } @@ -164,7 +188,7 @@ private: if (should_use_mem_handle()) { if (!_rpc_buffer) { auto rpc_buffer = std::make_shared( - _qnn_instance, buffer_size, QNN_TENSOR_GET_RANK(_qnn_tensor), + _qnn_instance, buffer->get_size(), QNN_TENSOR_GET_RANK(_qnn_tensor), QNN_TENSOR_GET_DIMENSIONS(_qnn_tensor), QNN_TENSOR_GET_DATA_TYPE(_qnn_tensor)); if (!rpc_buffer->is_valid()) { QNN_LOG_WARN("[%s][%s]alloc rpc mem failed", get_backend_name(_device), _tensor_name.c_str()); @@ -187,22 +211,21 @@ private: QNN_TENSOR_GET_MEM_HANDLE(_qnn_tensor)); } else { QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_RAW); - Qnn_ClientBuffer_t client_buf = {buffer, (uint32_t)buffer_size}; + Qnn_ClientBuffer_t client_buf = {buffer->get_buffer(), (uint32_t)buffer->get_size()}; QNN_TENSOR_SET_CLIENT_BUF(_qnn_tensor, client_buf); QNN_LOG_DEBUG("[%s]use client buffer %p size %d", _tensor_name.c_str(), client_buf.data, (int)client_buf.dataSize); } _buffer = buffer; - _buffer_size = buffer_size; if (!write_to_qnn_tensor()) { QNN_LOG_WARN("[%s]write to qnn tensor failed", _tensor_name.c_str()); return false; } - QNN_LOG_DEBUG("[%s][%s]bind to buffer: %p, size: %d", get_backend_name(_device), _tensor_name.c_str(), buffer, - (int)buffer_size); + QNN_LOG_DEBUG("[%s][%s]bind to buffer: %p, size: %d", get_backend_name(_device), _tensor_name.c_str(), + buffer.get(), (int)buffer->get_size()); return true; } @@ -214,7 +237,7 @@ private: } if (_rpc_buffer) { - memcpy(_rpc_buffer->get_buffer(), _buffer, _buffer_size); + memcpy(_rpc_buffer->get_buffer(), _buffer->get_buffer(), _buffer->get_size()); } // For CPU and GPU, the data is already in the tensor. @@ -230,7 +253,7 @@ private: } if (_rpc_buffer) { - memcpy(_buffer, _rpc_buffer->get_buffer(), _buffer_size); + memcpy(_buffer->get_buffer(), _rpc_buffer->get_buffer(), _buffer->get_size()); } // For CPU and GPU, the data is already in the tensor. 
@@ -258,6 +281,9 @@ private: case PARAMETER: new_tensor_type = QNN_TENSOR_TYPE_STATIC; break; + case BIDIRECTION: + new_tensor_type = QNN_TENSOR_TYPE_APP_READWRITE; + break; case INTERMEDIATE: default: new_tensor_type = QNN_TENSOR_TYPE_NATIVE; @@ -273,15 +299,15 @@ private: } std::string _tensor_name; - uint8_t *_buffer = nullptr; - size_t _buffer_size = 0; - std::vector _buffer_storage; + qnn_buffer_ptr _buffer; + bool can_unbind = true; QNNBackend _device; std::shared_ptr _qnn_instance; Qnn_Tensor_t _qnn_tensor = qnn_tensor_init(kDefaultQnnTensorVersion); qnn_dimension_array_t _dimensions = {}; Qnn_GraphHandle_t _graph_handle = nullptr; qnn_buffer_ptr _rpc_buffer; + ggml_tensor *_ggml_tensor = nullptr; DISABLE_COPY(ggml_qnn_tensor); DISABLE_MOVE(ggml_qnn_tensor); @@ -289,5 +315,92 @@ private: using qnn_tensor_ptr_t = std::shared_ptr; using qnn_tensor_array_t = std::vector; +using ggml_tensor_array_t = std::vector; + +inline qnn_tensor_ptr_t get_qnn_tensor_ptr(ggml_tensor *ggml_tensor) { + return ggml_tensor->extra ? reinterpret_cast(ggml_tensor->extra)->shared_from_this() + : qnn_tensor_ptr_t(); +} + +inline int get_ggml_tensors_max_rank(const qnn::ggml_tensor_array_t &tensors) { + int max_rank = 0; + for (auto tensor : tensors) { + max_rank = std::max(max_rank, ggml_n_dims(tensor)); + } + + return max_rank; +} + +inline bool bind_tensors(const ggml_tensor_array_t &ggml_tensors, qnn_tensor_array_t &tensor_wrappers, + std::vector &qnn_tensors) { + GGML_ASSERT(tensor_wrappers.size() == ggml_tensors.size()); + qnn_tensors.resize(ggml_tensors.size()); + for (size_t i = 0; i < ggml_tensors.size(); i++) { + auto *ggml_tensor = ggml_tensors[i]; + if (!tensor_wrappers[i]->bind_ggml_tensor(ggml_tensor)) { + QNN_LOG_ERROR("bind tensor %s failed", ggml_get_name(ggml_tensor)); + return false; + } + + qnn_tensors[i] = tensor_wrappers[i]->get_qnn_tensor(); + } + + return true; +} + +inline bool bind_tensors(const ggml_tensor_array_t &ggml_tensors, qnn_tensor_array_t &tensor_wrappers) { + GGML_ASSERT(tensor_wrappers.size() == ggml_tensors.size()); + for (size_t i = 0; i < ggml_tensors.size(); i++) { + auto *ggml_tensor = ggml_tensors[i]; + if (!tensor_wrappers[i]->bind_ggml_tensor(ggml_tensor)) { + QNN_LOG_ERROR("bind tensor %s failed", ggml_get_name(ggml_tensor)); + return false; + } + } + + return true; +} + +inline void unbind_tensors(qnn_tensor_array_t &tensor_wrappers) { + for (auto &tensor : tensor_wrappers) { + tensor->unbind(); + } +} + +struct tensor_create_common_params { + const char *name_prefix; + int tensor_rank; + bool is_input; + QNNBackend device; + Qnn_GraphHandle_t graph_handle; + std::shared_ptr qnn_instance; +}; + +inline void create_tensors_from_ggml_tensor(const tensor_create_common_params ¶ms, + const ggml_tensor_array_t &ggml_tensors, + qnn_tensor_array_t *tensor_wrappers, + std::vector *qnn_tensors) { + if (qnn_tensors) { + qnn_tensors->resize(ggml_tensors.size()); + } + + if (!tensor_wrappers->empty()) { + QNN_LOG_DEBUG("tensor_wrappers is not empty, skip create tensors"); + GGML_ASSERT(tensor_wrappers->size() == ggml_tensors.size()); + return; + } + + tensor_wrappers->resize(ggml_tensors.size()); + + char buffer[GGML_MAX_NAME] = {}; + auto tensor_type = params.is_input ? 
ggml_qnn_tensor::INPUT : ggml_qnn_tensor::OUTPUT; + for (size_t i = 0; i < ggml_tensors.size(); i++) { + snprintf(buffer, GGML_MAX_NAME, "%s%d", params.name_prefix, (int)i); + auto *ggml_tensor = ggml_tensors[i]; + (*tensor_wrappers)[i] = std::make_shared(tensor_type, std::string(buffer), ggml_tensor->ne, + ggml_tensor->type, params.tensor_rank, params.device, + params.graph_handle, params.qnn_instance); + } +} } // namespace qnn diff --git a/ggml/src/ggml-qnn/utils.cpp b/ggml/src/ggml-qnn/utils.cpp index eaabe60cdb..6e77ee5f5f 100644 --- a/ggml/src/ggml-qnn/utils.cpp +++ b/ggml/src/ggml-qnn/utils.cpp @@ -188,13 +188,15 @@ const char *get_backend_name(QNNBackend device_index) { const char *get_chipset_desc(uint32_t chipset_id) { switch (chipset_id) { case SM8450: - return "SM8450"; + return "SD 8 Gen 1 (SM8450)"; case SM8475: - return "SM8475"; + return "SD 8+ Gen 1 (SM8475)"; case SM8550: - return "SM8550"; + return "SD 8 Gen 2 (SM8550)"; case SM8650: - return "SM8650"; + return "SD 8 Gen 3 (SM8650)"; + case SM8750: + return "SD 8 Gen 4 (SM8750)"; default: return "unknown"; } @@ -210,6 +212,8 @@ const char *get_htparch_desc(size_t htp_arch) { return "QCOM_HTP_V73"; case V75: return "QCOM_HTP_V75"; + case V79: + return "QCOM_HTP_V79"; default: return "unknown"; }