From a1ab67478fac9b71e2cf952cd9ac671d34ac1eb0 Mon Sep 17 00:00:00 2001
From: nullname
Date: Sat, 22 Mar 2025 12:34:31 +0800
Subject: [PATCH] [feat] add more op (#35)

* move op key generate function to kOpCaps
* fix op desc print
* try fix rms_norm
* Revert "try fix rms_norm"
  This reverts commit 33b296098012909cb482fc29b52b28098dc971cd.
* add quantization type support by converting them to float
* enable quantization tensor for mulmat in gpu/npu
* fix asan error
* add log and assert
* insert output convert operator after mulmat
* add log
* fix some error in running
* disable permute again
* add log
* add error function
* Revert "add error function"
  This reverts commit f92ff47798ac8053fb776c55efbb1a98469c7af1.
* add log
* more log
* disable convert op in graph
* wip
* add f16 config for graph
* set f16 precision for f16 graph
* fix override data type
* add comment
* add config flag to enable quantize type
* add log
* more quantized type for cpu and gpu backend
* enable all quant types for cpu and gpu backend
* rename
* wip
* add log
* remove unused functions
* skip permute
* remove get_qnn_op_input_param_count
* fallback to generic_get_op_desc if no op_desc
* revert 'skip permute'
* Revert "revert 'skip permute'"
  This reverts commit 5761e31fd23c69c4cabf6fd9fac1a0d3e5a74968.
* wip
* add log
* print qnn tensor type
* add log
* limit the max size of tensor
* add log
* fix tensor size limiter
* small improve on tensor info printer
* disable sqrt and div to pass test-backend-ops for 8 gen 2
* remove debug log in release build
* add log
* skip permute in src
* wip
* disable reshape
* skip mul at decoder start
* wip
* add log
* add qnn_scoped_timer
* add perf tracker in graph
* add cmake options GGML_QNN_ENABLE_PERFORMANCE_TRACKING
* fix flag name
* use milli-second
* wip
* fix comment string
* add file for profiler
* change qnn-cpu to GGML_BACKEND_DEVICE_TYPE_ACCEL, so that we can run tests on cpu
* wip
* profiler: refactoring
* wip
* add implement for print_profile_events
* set-up profiler for graph
* set profiler to graph execute
* pretty print events
* unified log print prefix
* print event count
* enable optrace
* print duration at event end
* wip
* add more detailed soc information
* wip
* move device caps array into qnn-lib.cpp
* remove lib_name in device_context
* move get_graph_key_from_cgraph to graph.cpp
* add override type for tensor key
* use override_type instead of original data type for graph key
* append op type to tensor name to fix error in qwen
* remove todo
* wip
---
 ggml/include/ggml-qnn.h | 5 +-
 ggml/src/ggml-qnn/CMakeLists.txt | 9 +
 ggml/src/ggml-qnn/backend-ops.cpp | 303 ++++++++-------------
 ggml/src/ggml-qnn/backend.hpp | 15 +-
 ggml/src/ggml-qnn/buffer.hpp | 12 +-
 ggml/src/ggml-qnn/convert.cpp | 155 +++++++++++
 ggml/src/ggml-qnn/convert.hpp | 26 ++
 ggml/src/ggml-qnn/ggml-qnn.cpp | 110 +++-----
 ggml/src/ggml-qnn/graph.cpp | 382 ++++++++++++++++++++-------
 ggml/src/ggml-qnn/graph.hpp | 50 +++-
 ggml/src/ggml-qnn/logger.hpp | 10 +-
 ggml/src/ggml-qnn/op-config-base.hpp | 4 +-
 ggml/src/ggml-qnn/op-config-caps.cpp | 132 +++++---
 ggml/src/ggml-qnn/op-config-impl.cpp | 94 ++++---
 ggml/src/ggml-qnn/op-config-impl.hpp | 56 ++--
 ggml/src/ggml-qnn/op-config.hpp | 13 +-
 ggml/src/ggml-qnn/profiler.cpp | 170 ++++++++++++
 ggml/src/ggml-qnn/profiler.hpp | 100 +++++++
 ggml/src/ggml-qnn/qnn-lib.cpp | 99 ++++---
 ggml/src/ggml-qnn/qnn-lib.hpp | 78 +++---
 ggml/src/ggml-qnn/qnn-types.hpp | 14 +-
 ggml/src/ggml-qnn/tensor.hpp | 92 ++++---
 ggml/src/ggml-qnn/utils.cpp | 103 ++++++--
ggml/src/ggml-qnn/utils.hpp | 49 +--- 24 files changed, 1381 insertions(+), 700 deletions(-) create mode 100644 ggml/src/ggml-qnn/convert.cpp create mode 100644 ggml/src/ggml-qnn/convert.hpp create mode 100644 ggml/src/ggml-qnn/profiler.cpp create mode 100644 ggml/src/ggml-qnn/profiler.hpp diff --git a/ggml/include/ggml-qnn.h b/ggml/include/ggml-qnn.h index 2b25ce40d7..48194106cf 100644 --- a/ggml/include/ggml-qnn.h +++ b/ggml/include/ggml-qnn.h @@ -1,14 +1,13 @@ #pragma once -#include "ggml.h" - #include "ggml-backend.h" +#include "ggml.h" #ifdef __cplusplus extern "C" { #endif -#define GGML_QNN_NAME "QNN" +#define GGML_QNN_NAME "qnn" #define GGML_QNN_MAX_DEVICES QNN_BACKEND_COUNT enum QNNBackend { diff --git a/ggml/src/ggml-qnn/CMakeLists.txt b/ggml/src/ggml-qnn/CMakeLists.txt index 520bbd1f46..b3591f903d 100644 --- a/ggml/src/ggml-qnn/CMakeLists.txt +++ b/ggml/src/ggml-qnn/CMakeLists.txt @@ -42,4 +42,13 @@ target_compile_definitions(ggml-qnn PRIVATE GGML_QNN_DEFAULT_LIB_SEARCH_PATH="${ if(GGML_QNN_ENABLE_CPU_BACKEND) message("GGML_QNN_ENABLE_CPU_BACKEND is enabled") target_compile_definitions(ggml-qnn PRIVATE GGML_QNN_ENABLE_CPU_BACKEND) +else() + message("GGML_QNN_ENABLE_CPU_BACKEND is disabled") +endif() + +if(GGML_QNN_ENABLE_PERFORMANCE_TRACKING) + message("GGML_QNN_ENABLE_PERFORMANCE_TRACKING is enabled") + target_compile_definitions(ggml-qnn PRIVATE GGML_QNN_ENABLE_PERFORMANCE_TRACKING) +else() + message("GGML_QNN_ENABLE_PERFORMANCE_TRACKING is disabled") endif() diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index 95fe35b465..ecafe70963 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -12,156 +12,10 @@ namespace { -bool qnn_is_op_valid(ggml_backend_qnn_device_context * ctx, const ggml_tensor * dst) { - if (!ctx || !dst) { - QNN_LOG_WARN("invalid params\n"); - return false; - } - - auto instance = ctx->instance; - if (!instance) { - QNN_LOG_WARN("invalid instance\n"); - return false; - } - - const auto param_count = qnn::get_qnn_op_input_param_count(dst); - switch (param_count) { - case 1: - return dst->src[0]; - case 2: - return dst->src[0] && dst->src[1]; - default: - QNN_LOG_WARN("invalid op param count %d\n", (int) param_count); - break; - } - - return false; -} - -#ifndef NDEBUG -void print_ggml_tensor(const ggml_tensor * tensor) { - QNN_LOG_DEBUG("%s: type:%s ne: %ldx%ldx%ldx%ld, nb: %ldx%ldx%ldx%ld\n", tensor->name, ggml_type_name(tensor->type), - (long) tensor->ne[0], (long) tensor->ne[1], (long) tensor->ne[2], (long) tensor->ne[3], - (long) tensor->nb[0], (long) tensor->nb[1], (long) tensor->nb[2], (long) tensor->nb[3]); -} -#endif - -} // namespace - -namespace { - -typedef bool (*ggml_qnn_op_t)(ggml_backend_qnn_device_context * ctx, ggml_tensor * dst); - -void append_tensor_dimensions(const ggml_tensor * tensor, std::string & output) { - char buffer[256] = {}; - const auto * type_name = qnn::get_ggml_type_name(tensor->type); - int len = 0; - switch (ggml_n_dims(tensor)) { - case 1: - len = snprintf(buffer, sizeof(buffer), "%ld%s", (long) tensor->ne[0], type_name); - break; - case 2: - len = snprintf(buffer, sizeof(buffer), "%ldx%ld%s", (long) tensor->ne[0], (long) tensor->ne[1], type_name); - break; - case 3: - len = snprintf(buffer, sizeof(buffer), "%ldx%ldx%ld%s", (long) tensor->ne[0], (long) tensor->ne[1], - (long) tensor->ne[2], type_name); - break; - case 4: - default: - len = snprintf(buffer, sizeof(buffer), "%ldx%ldx%ldx%ld%s", (long) tensor->ne[0], (long) tensor->ne[1], - (long) 
tensor->ne[2], (long) tensor->ne[3], type_name); - break; - } - GGML_ASSERT(len > 0 && len < (int) sizeof(buffer)); - output.append(buffer, len); -} - -void get_graph_key_from_op(const ggml_tensor * op, std::string & output) { - GGML_ASSERT(op->op != GGML_OP_NONE); - output += ggml_op_desc(op); - output += qnn::get_ggml_type_name(op->type); - const auto param_count = qnn::get_qnn_op_input_param_count(op); - for (size_t i = 0; i < param_count; ++i) { - auto * input = op->src[i]; - if (!input) { - break; - } - - output += '_'; - append_tensor_dimensions(input, output); - } -} - -void get_op_key_with_src_op_desc(const ggml_tensor * op, std::string & output) { - output += ggml_op_desc(op); - output += '('; - if (op->src[0]) { - output += ggml_op_desc(op->src[0]); - } - for (size_t i = 1; i < GGML_MAX_DIMS && op->src[i]; ++i) { - output += ','; - output += ggml_op_desc(op->src[i]); - } - output += ')'; -} - -/** - * @brief Generates a unique key for a given computation graph (cgraph). - * - * This key is used to cache the graph, enabling efficient reuse of previously - * compiled graphs. The key is constructed by concatenating the descriptions - * of the operations and their associated tensor dimensions within the graph. - * - * Example key format: "MUL_MATf32_256x16x10f32_256x1x10f32#LOG#ADD#ADDf32_16x1x10f32" - * - * @param cgraph The computation graph for which the key is generated. - * @param output The string where the generated key will be stored. - * - * TODO: Improve the key generation logic to handle more complex graph structures and edge cases. - */ -void get_graph_key_from_cgraph(const ggml_cgraph * cgraph, std::string & output) { - if (cgraph->n_nodes == 0) { - QNN_LOG_DEBUG("empty cgraph\n"); - return; - } - - { - bool is_start = true; - for (int i = 0; i < cgraph->n_nodes; ++i) { - auto * op = cgraph->nodes[i]; - if (ggml_is_empty(op)) { - QNN_LOG_DEBUG("empty op in graph, skipping\n"); - continue; - } - - if (op->op == GGML_OP_NONE) { - QNN_LOG_DEBUG("GGML_OP_NONE in graph, skipping\n"); - continue; - } - - if (is_start) { - get_graph_key_from_op(cgraph->nodes[0], output); - is_start = false; - } else { - output += '#'; - get_op_key_with_src_op_desc(op, output); - } - } - } - - if (cgraph->n_nodes > 1) { - auto * last_op = cgraph->nodes[cgraph->n_nodes - 1]; - output += qnn::get_ggml_type_name(last_op->type); - output += '_'; - append_tensor_dimensions(last_op, output); - } -} - qnn::qnn_graph * get_qnn_graph_from_cache(ggml_backend_qnn_device_context * ctx, const ggml_cgraph * cgraph) { auto & graph_cache = ctx->qnn_graph_cache; std::string graph_key; - get_graph_key_from_cgraph(cgraph, graph_key); + auto op_data_type = qnn::qnn_graph::get_graph_key_from_cgraph(cgraph, graph_key); if (graph_key.empty()) { QNN_LOG_DEBUG("[%s]empty graph key for cgraph: %p, size: %d\n", qnn::get_backend_name(ctx->device), (const void *) cgraph, (int) cgraph->n_nodes); @@ -171,11 +25,20 @@ qnn::qnn_graph * get_qnn_graph_from_cache(ggml_backend_qnn_device_context * ctx, auto it = graph_cache.find(graph_key); qnn::qnn_graph * graph_ptr = nullptr; if (it != graph_cache.end()) { - QNN_LOG_DEBUG("[%s]found graph %s in cache\n", qnn::get_backend_name(ctx->device), graph_key.c_str()); + auto it = graph_cache.find(graph_key); + QNN_LOG_DEBUG("[%s]found graph %s in cache, cache size: %d\n", qnn::get_backend_name(ctx->device), + graph_key.c_str(), (int) graph_cache.size()); graph_ptr = it->second.get(); } else { - auto graph = - std::make_unique(graph_key, ctx->device, ctx->instance, 
ctx->socinfo.vtcm_size_in_mb); + auto precision = qnn::qnn_graph::kHtpDefault; + if (op_data_type == GGML_TYPE_F16) { + QNN_LOG_DEBUG("[%s][%s]set graph precision to FP16\n", qnn::get_backend_name(ctx->device), + graph_key.c_str()); + precision = qnn::qnn_graph::kHtpFp16; + } + + auto graph = std::make_unique(graph_key, ctx->device, ctx->instance, precision, + ctx->socinfo.vtcm_size_in_mb); if (!graph->is_valid()) { return nullptr; } @@ -187,6 +50,8 @@ qnn::qnn_graph * get_qnn_graph_from_cache(ggml_backend_qnn_device_context * ctx, graph_ptr = graph.get(); graph_cache[graph_key] = std::move(graph); + QNN_LOG_DEBUG("[%s]add graph %s to cache, cache size: %d\n", qnn::get_backend_name(ctx->device), + graph_key.c_str(), (int) graph_cache.size()); } return graph_ptr; @@ -201,9 +66,9 @@ constexpr const bool kQnnSupportedOps[] = { false, // GGML_OP_ACC true, // GGML_OP_SUB true, // GGML_OP_MUL - true, // GGML_OP_DIV + false, // GGML_OP_DIV, disabled for now cause failed on test-backend-ops false, // GGML_OP_SQR - true, // GGML_OP_SQRT + false, // GGML_OP_SQRT, disabled for now cause failed on test-backend-ops true, // GGML_OP_LOG false, // GGML_OP_SIN false, // GGML_OP_COS @@ -229,7 +94,7 @@ constexpr const bool kQnnSupportedOps[] = { false, // GGML_OP_SET false, // GGML_OP_CPY false, // GGML_OP_CONT - true, // GGML_OP_RESHAPE + false, // GGML_OP_RESHAPE false, // GGML_OP_VIEW false, // GGML_OP_PERMUTE false, // GGML_OP_TRANSPOSE @@ -306,14 +171,39 @@ constexpr const bool kQnnSupportedOps[] = { static_assert(kQnnSupportedOps[GGML_OP_NONE], "GGML_OP_NONE is not true"); static_assert(kQnnSupportedOps[GGML_OP_ADD], "GGML_OP_ADD is not true"); static_assert(kQnnSupportedOps[GGML_OP_MUL], "GGML_OP_MUL is not true"); -static_assert(kQnnSupportedOps[GGML_OP_MUL_MAT], - "GGML_OP_MUL_MAT is not true, please check the kQnnSupportedOps table in the backend-ops.cpp file"); -static_assert(kQnnSupportedOps[GGML_OP_RESHAPE], "GGML_OP_RESHAPE is not true"); +static_assert(kQnnSupportedOps[GGML_OP_MUL_MAT], "GGML_OP_MUL_MAT is not true"); +static_assert(!kQnnSupportedOps[GGML_OP_RESHAPE], "GGML_OP_RESHAPE should not be true"); static_assert(!kQnnSupportedOps[GGML_OP_VIEW], "GGML_OP_VIEW is not false"); static_assert(std::size(kQnnSupportedOps) == (GGML_OP_COUNT + GGML_UNARY_OP_COUNT), "GGML_OP_COUNT does not match the size of the kQnnSupportedOps table"); -bool ggml_qnn_supports_tensor(ggml_backend_qnn_device_context * ctx, const ggml_tensor * tensor) { +inline bool is_type_bit_enabled(uint64_t bits, ggml_type type) { + return bits & (uint64_t(1) << type); +} + +inline bool is_tensor_size_valid(ggml_backend_qnn_device_context * ctx, const ggml_tensor * tensor) { + constexpr const auto get_tensor_size_in_bytes = [](const ggml_tensor * tensor, ggml_type type) -> size_t { + return tensor->ne[0] * tensor->ne[1] * tensor->ne[2] * tensor->ne[3] * ggml_type_size(type); + }; + + auto type = tensor->type; + if (ggml_is_quantized(type) && ctx->enable_cpu_dequantize) { + type = GGML_TYPE_F32; // TODO: [quantize] fix me if plan to dequantize to other types + } + + const auto tensor_size = get_tensor_size_in_bytes(tensor, type); + if (ctx->max_tensor_size_in_bytes && tensor_size >= ctx->max_tensor_size_in_bytes) { + QNN_LOG_DEBUG("[%s]tensor(%s_%dx%dx%dx%d) size(%lld) exceeds the limit(%lld)\n", + qnn::get_backend_name(ctx->device), ggml_get_name(tensor), (int) tensor->ne[0], + (int) tensor->ne[1], (int) tensor->ne[2], (int) tensor->ne[3], (long long int) tensor_size, + (long long int) ctx->max_tensor_size_in_bytes); + return 
false; + } + + return true; +} + +bool is_tensor_type_valid(ggml_backend_qnn_device_context * ctx, const ggml_tensor * tensor) { if (!tensor) { QNN_LOG_DEBUG("tensor is nullptr\n"); return false; @@ -332,9 +222,7 @@ bool ggml_qnn_supports_tensor(ggml_backend_qnn_device_context * ctx, const ggml_ switch (tensor->type) { case GGML_TYPE_F32: case GGML_TYPE_F16: - case GGML_TYPE_Q8_0: - case GGML_TYPE_Q4_0: - if (!(ctx->supported_types & (uint64_t(1) << tensor->type))) { + if (!is_type_bit_enabled(ctx->supported_types, tensor->type)) { QNN_LOG_DEBUG("[%s]unsupported data type %s, supported_types: 0x%x\n", qnn::get_backend_name(ctx->device), ggml_type_name(tensor->type), (unsigned int) ctx->supported_types); @@ -350,18 +238,29 @@ bool ggml_qnn_supports_tensor(ggml_backend_qnn_device_context * ctx, const ggml_ return true; } +bool is_data_reinterpretation_op(ggml_op op) { + return op == GGML_OP_VIEW || op == GGML_OP_PERMUTE; +} + bool ggnl_qnn_supports_op_tensor(ggml_backend_qnn_device_context * ctx, const ggml_tensor * op) { if (op->op == GGML_OP_NONE) { return true; } - if (!ggml_qnn_supports_tensor(ctx, op)) { + if (!is_tensor_type_valid(ctx, op) || !is_tensor_size_valid(ctx, op)) { return false; } - const auto param_count = qnn::get_qnn_op_input_param_count(op); - for (size_t i = 0; i < param_count; ++i) { - if (!ggml_qnn_supports_tensor(ctx, op->src[i])) { + // TODO: fix for other op + const bool cpu_dequant = ctx->enable_cpu_dequantize && op->op == GGML_OP_MUL_MAT; + for (size_t i = 0; i < GGML_MAX_SRC && op->src[i]; ++i) { + auto * src = op->src[i]; + if (!is_tensor_size_valid(ctx, src)) { + return false; + } + + // passthrough the quantized tensor for CPU dequantization + if (!is_tensor_type_valid(ctx, src) && (!cpu_dequant || !ggml_is_quantized(src->type))) { return false; } } @@ -394,14 +293,17 @@ bool ggml_qnn_have_same_tensor_types(ggml_backend_qnn_device_context * ctx, cons return true; } +// TODO: move to caps array? bool ggml_qnn_supports_matmul_op(ggml_backend_qnn_device_context * ctx, const ggml_tensor * op) { - constexpr const size_t kMaxNpuTensorSize = 8192L * 2048 + 8192 * 512 + 2048 * 512; - constexpr const auto get_tensor_size = [](const ggml_tensor * tensor) -> size_t { - return tensor->ne[0] * tensor->ne[1] * tensor->ne[2] * tensor->ne[3]; - }; - auto * src0 = op->src[0]; auto * src1 = op->src[1]; + if (is_data_reinterpretation_op(src0->op) || is_data_reinterpretation_op(src1->op)) { + // TODO: remove the blocker here when we support permute op + QNN_LOG_DEBUG("[%s][MUL_MAT]data reorganization op is not supported, (%s, %s)\n", + qnn::get_backend_name(ctx->device), ggml_op_name(src0->op), ggml_op_name(src1->op)); + return false; + } + switch (ctx->device) { case QNN_BACKEND_NPU: if (src1->ne[2] != src0->ne[2] || src1->ne[3] != src0->ne[3]) { @@ -411,15 +313,21 @@ bool ggml_qnn_supports_matmul_op(ggml_backend_qnn_device_context * ctx, const gg */ QNN_LOG_DEBUG("[qnn-npu][MUL_MAT]src0 and src1 dimensions are not equal\n"); return false; - } else if (get_tensor_size(src0) + get_tensor_size(src1) + get_tensor_size(op) >= kMaxNpuTensorSize) { - QNN_LOG_DEBUG("[qnn-npu][MUL_MAT]tensor size is too large\n"); - return false; } // fall through, from test here, the convert op is super slow on NPU: // https://github.com/usefulsensors/qc_npu_benchmark case QNN_BACKEND_GPU: - if (ggml_qnn_have_same_tensor_types(ctx, op)) { - // there's no convert op for GPU. 
+ if (!ggml_qnn_have_same_tensor_types(ctx, op) && op->type != GGML_TYPE_F32) { + // for different tensor types and not float32, we don't support it currently, since there's no convert + QNN_LOG_DEBUG("[%s][MUL_MAT]src0 and src1 and dst types are not equal\n", + qnn::get_backend_name(ctx->device)); + return false; + } + if (op->type == GGML_TYPE_F32 && ggml_is_quantized(src0->type) && + !is_type_bit_enabled(ctx->cpu_preprocess_types, src0->type)) { + // for such cases that src0 is quantized and op is float32, check if the quant type is enabled + QNN_LOG_DEBUG("[%s][MUL_MAT]quantized src0 type %s is not enabled\n", + qnn::get_backend_name(ctx->device), ggml_type_name(src0->type)); return false; } break; @@ -436,6 +344,19 @@ bool ggml_qnn_supports_matmul_op(ggml_backend_qnn_device_context * ctx, const gg return true; } +#ifndef NDEBUG + +void print_tensor_info(ggml_backend_qnn_device_context * ctx, const ggml_tensor * op, bool is_supported) { + const char * supported = is_supported ? "supported" : "unsupported"; + std::string op_key; + qnn::get_qnn_op_desc(op, true, GGML_TYPE_COUNT, op_key); + + QNN_LOG_DEBUG("[%s][%s]op was %s, support/unsupported: %d/%d\n", qnn::get_backend_name(ctx->device), op_key.c_str(), + supported, ctx->supported_op_count.load(), ctx->unsupported_op_count.load()); +} + +#endif + } // namespace namespace qnn { @@ -448,22 +369,16 @@ bool device_supports_op(ggml_backend_qnn_device_context * ctx, const ggml_tensor if (!kQnnSupportedOps[qnn::get_qnn_op_index(op)]) { #ifndef NDEBUG - std::string op_key; - get_graph_key_from_op(op, op_key); ctx->unsupported_op_count++; - QNN_LOG_DEBUG("[%s][%s]op was unsupported, support/unsupported: %d/%d\n", qnn::get_backend_name(ctx->device), - op_key.c_str(), ctx->supported_op_count.load(), ctx->unsupported_op_count.load()); + print_tensor_info(ctx, op, false); #endif return false; } if (!ggnl_qnn_supports_op_tensor(ctx, op)) { #ifndef NDEBUG - std::string tensor_dims; - append_tensor_dimensions(op, tensor_dims); - QNN_LOG_DEBUG("[%s][%s]unsupported tensor(%s), support/unsupported: %d/%d\n", - qnn::get_backend_name(ctx->device), ggml_op_name(op->op), tensor_dims.c_str(), - ctx->supported_op_count.load(), ctx->unsupported_op_count.load()); + ctx->unsupported_op_count++; + print_tensor_info(ctx, op, false); #endif return false; } @@ -480,13 +395,23 @@ bool device_supports_op(ggml_backend_qnn_device_context * ctx, const ggml_tensor auto * src0 = op->src[0]; auto * src1 = op->src[1]; switch (op->op) { + case GGML_OP_MUL: + // TODO: fix this when we have the support for mul with rms_norm + if (ctx->enable_cpu_dequantize && (src0->op == GGML_OP_RMS_NORM || src1->op == GGML_OP_RMS_NORM)) { + QNN_LOG_DEBUG("[%s][%s]skip unsupported mul with rms norm, (%s, %s)\n", + qnn::get_backend_name(ctx->device), ggml_op_desc(op), ggml_op_desc(src0), + ggml_op_desc(src1)); + is_op_supported = false; + break; + } + // fall through, just skip the mul with rms_norm, in llama, its at start of decoder block case GGML_OP_ADD: case GGML_OP_SUB: - case GGML_OP_MUL: case GGML_OP_DIV: + // TODO: move to op caps array? 
if (!ggml_are_same_shape(src0, src1)) { QNN_LOG_DEBUG("[%s][%s] src0 and src1 dimensions are not equal\n", - qnn::get_backend_name(ctx->device), ggml_op_name(op->op)); + qnn::get_backend_name(ctx->device), ggml_op_desc(op)); is_op_supported = false; } break; @@ -503,13 +428,11 @@ bool device_supports_op(ggml_backend_qnn_device_context * ctx, const ggml_tensor #ifndef NDEBUG if (is_op_supported) { ctx->supported_op_count++; - QNN_LOG_DEBUG("[%s][%s]op was supported, support/unsupported: %d/%d\n", qnn::get_backend_name(ctx->device), - ggml_op_name(op->op), ctx->supported_op_count.load(), ctx->unsupported_op_count.load()); } else { ctx->unsupported_op_count++; - QNN_LOG_DEBUG("[%s][%s]op was unsupported, support/unsupported: %d/%d\n", qnn::get_backend_name(ctx->device), - ggml_op_name(op->op), ctx->supported_op_count.load(), ctx->unsupported_op_count.load()); } + + print_tensor_info(ctx, op, is_op_supported); #endif return is_op_supported; @@ -520,7 +443,7 @@ bool device_compute_graph(ggml_backend_qnn_device_context * ctx, ggml_cgraph * c (int) cgraph->n_nodes); auto qnn_graph = get_qnn_graph_from_cache(ctx, cgraph); - bool success = qnn_graph && qnn_graph->execute(cgraph); + bool success = qnn_graph && qnn_graph->execute(cgraph, ctx->convert_context); QNN_LOG_DEBUG("[%s]compute graph, success: %d\n", qnn::get_backend_name(ctx->device), (int) success); return success; diff --git a/ggml/src/ggml-qnn/backend.hpp b/ggml/src/ggml-qnn/backend.hpp index 253b0b6723..f2484a7a97 100644 --- a/ggml/src/ggml-qnn/backend.hpp +++ b/ggml/src/ggml-qnn/backend.hpp @@ -10,6 +10,7 @@ #include #include +#include "convert.hpp" #include "ggml-backend.h" #include "ggml-qnn.h" #include "ggml.h" @@ -25,26 +26,30 @@ struct ggml_backend_qnn_device_context { QNNBackend device; size_t threads; std::string name; - std::string lib_name; + std::string description; // initialize in qnn init qnn::qcom_socinfo socinfo = {}; - uint64_t supported_types; + size_t max_tensor_size_in_bytes; std::shared_ptr instance; std::shared_ptr qnn_interface; - qnn::qnn_graph_cache_t qnn_graph_cache; + qnn::qnn_graph_cache_t qnn_graph_cache; + std::shared_ptr convert_context = std::make_shared(); #ifndef NDEBUG std::atomic_uint32_t supported_op_count = 0; std::atomic_uint32_t unsupported_op_count = 0; #endif + bool enable_cpu_dequantize = false; + uint64_t supported_types; + uint64_t cpu_preprocess_types; + explicit ggml_backend_qnn_device_context(QNNBackend device, size_t threads, const char * name, - const char * lib_name, uint64_t supported_types) : + uint64_t supported_types) : device(device), threads(threads), name(name), - lib_name(lib_name), supported_types(supported_types) {} }; diff --git a/ggml/src/ggml-qnn/buffer.hpp b/ggml/src/ggml-qnn/buffer.hpp index 43c4666dd1..2840f78fb5 100644 --- a/ggml/src/ggml-qnn/buffer.hpp +++ b/ggml/src/ggml-qnn/buffer.hpp @@ -69,8 +69,8 @@ using qnn_buffer_ptr = std::shared_ptr; */ class qnn_rpc_buffer : public qnn_buffer_interface { public: - qnn_rpc_buffer(std::shared_ptr qnn_instance, const size_t size, const uint32_t rank, - uint32_t * dimensions, Qnn_DataType_t data_type) : + qnn_rpc_buffer(qnn_instance_ptr qnn_instance, const size_t size, const uint32_t rank, uint32_t * dimensions, + Qnn_DataType_t data_type) : _size(size), _qnn_instance(qnn_instance) { _qnn_rpc_buffer = static_cast(qnn_instance->alloc_rpcmem(size, alignof(uint8_t *))); @@ -105,10 +105,10 @@ class qnn_rpc_buffer : public qnn_buffer_interface { Qnn_MemHandle_t get_mem_handle() const override { return _qnn_rpc_mem_handle; } private: 
- size_t _size = 0; - uint8_t * _qnn_rpc_buffer = nullptr; - Qnn_MemHandle_t _qnn_rpc_mem_handle = nullptr; - std::shared_ptr _qnn_instance; + size_t _size = 0; + uint8_t * _qnn_rpc_buffer = nullptr; + Qnn_MemHandle_t _qnn_rpc_mem_handle = nullptr; + qnn_instance_ptr _qnn_instance; DISABLE_COPY(qnn_rpc_buffer); DISABLE_MOVE(qnn_rpc_buffer); diff --git a/ggml/src/ggml-qnn/convert.cpp b/ggml/src/ggml-qnn/convert.cpp new file mode 100644 index 0000000000..9719bac345 --- /dev/null +++ b/ggml/src/ggml-qnn/convert.cpp @@ -0,0 +1,155 @@ + +#include "convert.hpp" + +#include "logger.hpp" + +namespace { + +size_t get_convert_buffer_size(const qnn::ggml_dimension_array_t & dimensions, ggml_type dst_type) { + GGML_ASSERT(ggml_blck_size(dst_type) == 1); + size_t nbytes = ggml_type_size(dst_type); + for (size_t i = 0; i < GGML_MAX_DIMS; ++i) { + nbytes *= dimensions[i]; // tight packing + } + + return nbytes; +} + +// from ggml_backend_blas_mul_mat, when omp available, use it otherwise will fall back to standard lib solution +// TODO: remove this when we can fall back the convert to blas backend +#ifdef GGML_USE_OPENMP + +void convert_tensor_impl(const ggml_tensor * src, int max_threads, + std::shared_ptr & output_buffer) { + const auto ne03 = src->ne[3]; + const auto ne02 = src->ne[2]; + const auto ne01 = src->ne[1]; + const auto ne00 = src->ne[0]; + const auto ne_plane = ne01 * ne00; + const auto nb03 = src->nb[3]; + const auto nb02 = src->nb[2]; + const auto nb01 = src->nb[1]; + const int min_cols_per_thread = 4096; + void * wdata = output_buffer->get_buffer(); + const auto to_float = ggml_get_type_traits(src->type)->to_float; + GGML_ASSERT(to_float); + + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + const void * x = (char *) src->data + i02 * nb02 + i03 * nb03; + float * const wplane = (float *) wdata + i02 * ne_plane + i03 * ne02 * ne_plane; + + const int min_rows_per_thread = std::max((int) (min_cols_per_thread / ne00), 1); + const int n_threads = std::max(std::min(max_threads, (int) (ne01 / min_rows_per_thread)), 1); + +# pragma omp parallel for num_threads(n_threads) + for (int64_t i01 = 0; i01 < ne01; i01++) { + to_float((const char *) x + i01 * nb01, wplane + i01 * ne00, ne00); + } + } + } + + return output_buffer; +} + +#else + +void convert_tensor_impl(const ggml_tensor * src, int max_threads, std::vector> & tasks, + std::shared_ptr & output_buffer) { + const auto ne03 = src->ne[3]; + const auto ne02 = src->ne[2]; + const auto ne01 = src->ne[1]; + const auto ne00 = src->ne[0]; + const auto ne_plane = ne01 * ne00; + const auto nb03 = src->nb[3]; + const auto nb02 = src->nb[2]; + const auto nb01 = src->nb[1]; + const int min_cols_per_thread = 4096; + void * wdata = output_buffer->get_buffer(); + const auto to_float = ggml_get_type_traits(src->type)->to_float; + GGML_ASSERT(to_float); + + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + const void * x = (char *) src->data + i02 * nb02 + i03 * nb03; + float * const wplane = (float *) wdata + i02 * ne_plane + i03 * ne02 * ne_plane; + + const int min_rows_per_thread = std::max((int) (min_cols_per_thread / ne00), 1); + const int n_threads = std::max(std::min(max_threads, (int) (ne01 / min_rows_per_thread)), 1); + + for (int i = 1; i < n_threads; i++) { + const int64_t start = i * ne01 / n_threads; + const int64_t end = (i + 1) * ne01 / n_threads; + if (start < end) { + tasks.push_back(std::async(std::launch::async, [=]() { + for (int64_t i01 = start; i01 < end; 
i01++) { + to_float((const char *) x + i01 * nb01, wplane + i01 * ne00, ne00); + } + })); + } + } + { + // reuse the current thread for the first task + const int64_t start = 0; + const int64_t end = ne01 / n_threads; + for (int64_t i01 = start; i01 < end; i01++) { + to_float((const char *) x + i01 * nb01, wplane + i01 * ne00, ne00); + } + } + } + } + + // wait for all tasks to finish + for (auto & task : tasks) { + task.get(); + } + tasks.clear(); +} + +#endif + +} // namespace + +namespace qnn { + +std::vector convert(std::shared_ptr convert_context, + const ggml_tensor_array_t & tensors, ggml_type target_data_type) { + convert_context->buffers.resize(tensors.size()); + std::vector output_buffers(tensors.size()); + for (size_t i = 0; i < tensors.size(); ++i) { + const ggml_tensor * src = tensors[i]; + if (src->type == target_data_type) { + continue; + } + + auto & data_buffer = convert_context->buffers[i]; + const auto dst_size = get_convert_buffer_size(src->ne, target_data_type); + if (!data_buffer || data_buffer->get_size() < dst_size) { +#ifndef NDEBUG + auto old_size = data_buffer ? data_buffer->get_size() : 0; + QNN_LOG_DEBUG("create buffer[%d] for tensor %s(%s), old_size: %d, new_size: %d\n", (int) i, + ggml_get_name(src), ggml_type_name(src->type), (int) old_size, (int) dst_size); +#endif + data_buffer = std::make_shared(dst_size); + } + + // TODO: add more restrictions to the buffer slice here + std::shared_ptr output_buffer = + std::make_shared(data_buffer->get_buffer(), dst_size); + + QNN_LOG_DEBUG("convert tensor(%s) from %s to %s, size: %d, n_threads: %d\n", ggml_get_name(src), + ggml_type_name(src->type), ggml_type_name(target_data_type), (int) dst_size, + convert_context->n_threads); + +#ifdef GGML_USE_OPENMP + convert_tensor_impl(src, convert_context->n_threads, output_buffer); +#else + convert_tensor_impl(src, convert_context->n_threads, convert_context->tasks, output_buffer); +#endif + output_buffers[i] = output_buffer; + } + + return output_buffers; +} + +} // namespace qnn diff --git a/ggml/src/ggml-qnn/convert.hpp b/ggml/src/ggml-qnn/convert.hpp new file mode 100644 index 0000000000..818004c587 --- /dev/null +++ b/ggml/src/ggml-qnn/convert.hpp @@ -0,0 +1,26 @@ +#pragma once + +#include +#include +#include + +#include "buffer.hpp" +#include "ggml-qnn.h" +#include "tensor.hpp" +#include "utils.hpp" + +namespace qnn { + +// see also: ggml_backend_blas_context +struct qnn_convert_context_t { + int n_threads = std::thread::hardware_concurrency(); + std::vector> buffers; +#ifndef GGML_USE_OPENMP + std::vector> tasks; +#endif +}; + +std::vector convert(std::shared_ptr convert_context, + const ggml_tensor_array_t & tensors, ggml_type target_data_type); + +} // namespace qnn diff --git a/ggml/src/ggml-qnn/ggml-qnn.cpp b/ggml/src/ggml-qnn/ggml-qnn.cpp index fd18a1a623..1d3e45562c 100644 --- a/ggml/src/ggml-qnn/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn/ggml-qnn.cpp @@ -4,78 +4,16 @@ #include #include +#include "backend-ops.hpp" +#include "backend.hpp" #include "ggml-backend-impl.h" #include "ggml-impl.h" -#include "ggml-qnn/backend-ops.hpp" -#include "ggml-qnn/backend.hpp" -#include "ggml-qnn/logger.hpp" -#include "ggml-qnn/tensor.hpp" -#include "ggml-qnn/utils.hpp" - -// ================================================================================================= -// -// self-defined macro / data structure -// -// ================================================================================================= -#ifdef NDEBUG -# define ENABLE_QNNBACKEND_PERF 0 // enable/disable 
op's perf info -#else -# define ENABLE_QNNBACKEND_PERF 1 // enable/disable op's perf info -#endif - -#define QNN_BACKEND_NAME "qnn" +#include "logger.hpp" +#include "tensor.hpp" +#include "utils.hpp" namespace { -#ifdef _WIN32 -constexpr const char * kQnnCpuLibName = "QnnCpu.dll"; -constexpr const char * kQnnGpuLibName = "QnnGpu.dll"; -constexpr const char * kQnnNpuLibName = "QnnHtp.dll"; -#else -constexpr const char * kQnnCpuLibName = "libQnnCpu.so"; -constexpr const char * kQnnGpuLibName = "libQnnGpu.so"; -constexpr const char * kQnnNpuLibName = "libQnnHtp.so"; -#endif - -struct qnn_device_caps { - const char * name; - const char * description; - const char * lib_name; - enum ggml_backend_dev_type type; - - // TODO: should get this caps from device - uint64_t supported_types; -}; - -// TODO: should move this to qnn-lib.cpp -constexpr const qnn_device_caps kDeviceCaps[] = { - { - "qnn-cpu", "Qualcomm Kryo CPU", - kQnnCpuLibName, GGML_BACKEND_DEVICE_TYPE_CPU, - (1 << GGML_TYPE_I8) | (1 << GGML_TYPE_F32), - }, // https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/CpuOpDefSupplement.html#matmul - { - "qnn-gpu", "Qualcomm Adreno GPU", - kQnnGpuLibName, GGML_BACKEND_DEVICE_TYPE_GPU, - (1 << GGML_TYPE_F32) | (1 << GGML_TYPE_F16), - }, // https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/GpuOpDefSupplement.html#matmul - { - "qnn-npu", "Qualcomm NPU", - kQnnNpuLibName, GGML_BACKEND_DEVICE_TYPE_ACCEL, - (1 << GGML_TYPE_F32) | (1 << GGML_TYPE_F16) | (1 << GGML_TYPE_I16) | (1 << GGML_TYPE_I8), - }, // https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/HtpOpDefSupplement.html#matmul -}; - -static_assert(sizeof(kDeviceCaps) / sizeof(kDeviceCaps[0]) == GGML_QNN_MAX_DEVICES, - "The number of qnn devices should be equal to GGML_QNN_MAX_DEVICES"); -static_assert(kDeviceCaps[QNN_BACKEND_NPU].type == GGML_BACKEND_DEVICE_TYPE_ACCEL, - "The NPU device should be an accelerator device"); -static_assert(kDeviceCaps[QNN_BACKEND_GPU].type == GGML_BACKEND_DEVICE_TYPE_GPU, - "The NPU device should be an accelerator device"); - -static_assert(kDeviceCaps[QNN_BACKEND_CPU].type == GGML_BACKEND_DEVICE_TYPE_CPU, - "The NPU device should be an accelerator device"); - ggml_backend_qnn_device_context * get_device_context(ggml_backend_dev_t dev) { return reinterpret_cast(dev->context); } @@ -266,13 +204,13 @@ constexpr const ggml_backend_i ggml_backend_qnn_interface = { * ----------------------------------------------------------------------------------------------- */ const char * ggml_backend_qnn_device_get_name(ggml_backend_dev_t dev) { - const auto & caps = kDeviceCaps[get_device_context(dev)->device]; - return caps.name; + auto * dev_ctx = get_device_context(dev); + return qnn::get_backend_name(dev_ctx->device); } const char * ggml_backend_qnn_device_get_description(ggml_backend_dev_t dev) { - const auto & caps = kDeviceCaps[get_device_context(dev)->device]; - return caps.description; + auto * dev_ctx = get_device_context(dev); + return dev_ctx->description.empty() ? 
qnn::get_backend_desc(dev_ctx->device) : dev_ctx->description.c_str(); } void ggml_backend_qnn_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) { @@ -283,7 +221,7 @@ void ggml_backend_qnn_device_get_memory(ggml_backend_dev_t dev, size_t * free, s } enum ggml_backend_dev_type ggml_backend_qnn_device_get_type(ggml_backend_dev_t dev) { - return kDeviceCaps[get_device_context(dev)->device].type; + return qnn::get_device_caps(get_device_context(dev)->device).type; } void ggml_backend_qnn_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) { @@ -310,14 +248,14 @@ ggml_backend_t ggml_backend_qnn_init_with_device_context(ggml_backend_dev_t dev, extend_lib_search_path = GGML_QNN_DEFAULT_LIB_SEARCH_PATH; QNN_LOG_WARN( "extend_lib_search_path is nullptr, will " - "use " GGML_QNN_DEFAULT_LIB_SEARCH_PATH " as default"); + "use " GGML_QNN_DEFAULT_LIB_SEARCH_PATH " as default\n"); } auto * dev_ctx = get_device_context(dev); const auto device = dev_ctx->device; QNN_LOG_DEBUG("device %s\n", qnn::get_backend_name(device)); QNN_LOG_DEBUG("extend_lib_search_path %s\n", extend_lib_search_path); - auto instance = std::make_shared(extend_lib_search_path, dev_ctx->lib_name); + auto instance = std::make_shared(extend_lib_search_path, device); auto result = instance->qnn_init(nullptr); if (result != 0) { QNN_LOG_WARN("failed to init qnn backend %s\n", qnn::get_backend_name(device)); @@ -331,10 +269,21 @@ ggml_backend_t ggml_backend_qnn_init_with_device_context(ggml_backend_dev_t dev, std::string device_name = qnn::get_backend_name(device); QNN_LOG_INFO("qnn device name %s\n", device_name.c_str()); - dev_ctx->instance = instance; - dev_ctx->qnn_interface = qnn_interface; - dev_ctx->socinfo = instance->get_soc_info(); - dev_ctx->supported_types = kDeviceCaps[device].supported_types; + const auto & device_caps = qnn::get_device_caps(device); + dev_ctx->instance = instance; + dev_ctx->qnn_interface = qnn_interface; + dev_ctx->socinfo = instance->get_soc_info(); + dev_ctx->supported_types = device_caps.supported_types; + dev_ctx->cpu_preprocess_types = device_caps.cpu_preprocess_types; + dev_ctx->max_tensor_size_in_bytes = device_caps.max_tensor_size_in_bytes; + { + char buffer[256]; + snprintf(buffer, sizeof(buffer), "%s(%s)", qnn::get_chipset_desc(dev_ctx->socinfo.soc_model), + qnn::get_backend_desc(dev_ctx->device)); + dev_ctx->description = buffer; + } + // TODO: remove npu from here if hardware quantization is supported + dev_ctx->enable_cpu_dequantize = device == QNN_BACKEND_CPU; ggml_backend_t qnn_backend = new ggml_backend{ /* .guid = */ ggml_backend_qnn_guid(), @@ -425,16 +374,17 @@ struct ggml_backend_qnn_reg_impl : ggml_backend_reg { * here we skip the initialization of CPU device, * cause it'll block unsupported ops fallback to ggml cpu backend */ + QNN_LOG_DEBUG("qnn backend registry skip CPU device\n"); continue; } #endif + const auto & device_caps = qnn::get_device_caps(device_enum); device_contexts.emplace_back(std::make_unique( /* .device = */ device_enum, // init from the last device, i.e. 
NPU /* .threads = */ 1, /* .name = */ qnn::get_backend_name(device_enum), - /* .lib_name = */ kDeviceCaps[device_enum].lib_name, - /* .supported_types = */ kDeviceCaps[device_enum].supported_types)); + /* .supported_types = */ device_caps.supported_types)); devices.emplace_back(ggml_backend_device{ /* iface = */ ggml_backend_qnn_device_interface, diff --git a/ggml/src/ggml-qnn/graph.cpp b/ggml/src/ggml-qnn/graph.cpp index 2a282771c2..3021a6f0a2 100644 --- a/ggml/src/ggml-qnn/graph.cpp +++ b/ggml/src/ggml-qnn/graph.cpp @@ -7,15 +7,27 @@ #include "ggml-impl.h" #include "logger.hpp" #include "op-config.hpp" +#include "profiler.hpp" #include "tensor.hpp" +#ifdef GGML_QNN_ENABLE_PERFORMANCE_TRACKING +# define GRAPH_PROFILE_HANDLE (_event_tracer ? _event_tracer->get_handle() : nullptr) +# define GRAPH_PROFILE_PRINT() \ + if (_event_tracer) { \ + _event_tracer->print_profile_events(); \ + } \ + (void) 0 +#else +# define GRAPH_PROFILE_HANDLE (nullptr) +# define GRAPH_PROFILE_PRINT() (void) 0 +#endif + namespace { using qnn_tensor_cache_t = std::unordered_map; int get_op_max_rank(const ggml_tensor * op) { - int max_rank = ggml_n_dims(op); - const int count = (int) qnn::get_qnn_op_input_param_count(op); - for (int i = 0; i < count; ++i) { + int max_rank = ggml_n_dims(op); + for (int i = 0; i < GGML_MAX_DIMS && op->src[i]; ++i) { max_rank = std::max(max_rank, ggml_n_dims(op->src[i])); } @@ -23,7 +35,8 @@ int get_op_max_rank(const ggml_tensor * op) { } qnn::qnn_tensor_ptr_t create_tensor_with_cache(ggml_tensor * tensor, qnn::ggml_qnn_tensor::tensor_type_t type, int rank, - QNNBackend device, Qnn_GraphHandle_t graph_handle, + ggml_type override_data_type, QNNBackend device, + Qnn_GraphHandle_t graph_handle, std::shared_ptr qnn_instance, qnn_tensor_cache_t & tensor_cache) { GGML_ASSERT(tensor); @@ -31,21 +44,30 @@ qnn::qnn_tensor_ptr_t create_tensor_with_cache(ggml_tensor * tensor, qnn::ggml_q return tensor_cache[tensor]; } - auto qnn_tensor = std::make_shared(type, tensor->name, tensor->ne, tensor->type, rank, device, - graph_handle, qnn_instance); + QNN_LOG_DEBUG("[%s]create_tensor_with_cache, data_type: %s, override_data_type: %s\n", + qnn::get_backend_name(device), ggml_type_name(tensor->type), ggml_type_name(override_data_type)); + auto data_type = override_data_type != GGML_TYPE_COUNT ? override_data_type : tensor->type; + + // We've observed that some tensors have the same name with different op types will be added to the same graph + // which will cause the graph build failed. To avoid this, we append the op type to the tensor name. 
+ char tensor_name[256]; + snprintf(tensor_name, sizeof(tensor_name), "%s_%s", ggml_get_name(tensor), ggml_op_desc(tensor)); + auto qnn_tensor = std::make_shared(type, std::string(tensor_name), tensor->ne, data_type, + rank, device, graph_handle, qnn_instance); tensor_cache[tensor] = qnn_tensor; return qnn_tensor; } qnn::qnn_tensor_array_t create_tensors_with_cache(const qnn::ggml_tensor_array_t & ggml_tensors, - qnn::ggml_qnn_tensor::tensor_type_t type, int rank, QNNBackend device, + qnn::ggml_qnn_tensor::tensor_type_t type, int rank, + ggml_type override_data_type, QNNBackend device, Qnn_GraphHandle_t graph_handle, std::shared_ptr qnn_instance, qnn_tensor_cache_t & tensor_cache) { qnn::qnn_tensor_array_t tensors; for (auto * tensor : ggml_tensors) { - tensors.push_back( - create_tensor_with_cache(tensor, type, rank, device, graph_handle, qnn_instance, tensor_cache)); + tensors.push_back(create_tensor_with_cache(tensor, type, rank, override_data_type, device, graph_handle, + qnn_instance, tensor_cache)); } return tensors; @@ -54,23 +76,23 @@ qnn::qnn_tensor_array_t create_tensors_with_cache(const qnn::ggml_tensor_array_t qnn::qnn_op_config_ptr_t create_operation_from_op_tensor(ggml_tensor * dst, const std::string & name, int rank, QNNBackend device, Qnn_GraphHandle_t graph_handle, std::shared_ptr qnn_instance, - bool is_intermediate, qnn_tensor_cache_t & tensor_cache) { + qnn_tensor_cache_t & tensor_cache) { auto operation = qnn::create_op(dst, name, qnn_instance); // input tensors qnn::qnn_tensor_array_t input_qnn_tensors; - auto tensor_type = is_intermediate ? qnn::ggml_qnn_tensor::INTERMEDIATE : qnn::ggml_qnn_tensor::INPUT; - for (size_t i = 0; i < qnn::get_qnn_op_input_param_count(dst); ++i) { - auto input_qnn_tensor = - create_tensor_with_cache(dst->src[i], tensor_type, rank, device, graph_handle, qnn_instance, tensor_cache); + for (size_t i = 0; i < GGML_MAX_DIMS && dst->src[i]; ++i) { + auto * src = dst->src[i]; + auto input_qnn_tensor = create_tensor_with_cache(src, qnn::ggml_qnn_tensor::INTERMEDIATE, rank, GGML_TYPE_COUNT, + device, graph_handle, qnn_instance, tensor_cache); input_qnn_tensors.push_back(input_qnn_tensor); } operation->set_input_tensors(input_qnn_tensors); // output tensor - tensor_type = is_intermediate ? 
qnn::ggml_qnn_tensor::INTERMEDIATE : qnn::ggml_qnn_tensor::OUTPUT; qnn::qnn_tensor_array_t output_qnn_tensors = - create_tensors_with_cache({ dst }, tensor_type, rank, device, graph_handle, qnn_instance, tensor_cache); + create_tensors_with_cache({ dst }, qnn::ggml_qnn_tensor::INTERMEDIATE, rank, GGML_TYPE_COUNT, device, + graph_handle, qnn_instance, tensor_cache); operation->set_output_tensors(output_qnn_tensors); // initialize operation @@ -82,29 +104,6 @@ qnn::qnn_op_config_ptr_t create_operation_from_op_tensor(ggml_tensor * dst, cons return operation; } -bool bind_src_tensors(ggml_tensor * op, qnn::qnn_tensor_array_t & tensor_wrappers, - std::vector & qnn_tensors) { - if (op->op == GGML_OP_NONE) { - QNN_LOG_DEBUG("op %s is not a valid op\n", ggml_get_name(op)); - return false; - } - - const auto param_count = qnn::get_qnn_op_input_param_count(op); - GGML_ASSERT(tensor_wrappers.size() == param_count); - qnn_tensors.resize(param_count); - for (size_t i = 0; i < param_count; ++i) { - auto * ggml_tensor = op->src[i]; - if (!tensor_wrappers[i]->bind_ggml_tensor(ggml_tensor)) { - QNN_LOG_ERROR("bind tensor %s failed\n", ggml_get_name(ggml_tensor)); - return false; - } - - qnn_tensors[i] = tensor_wrappers[i]->get_qnn_tensor(); - } - - return true; -} - /** * @brief Extracts input and output tensors from a computational graph. * @@ -134,11 +133,15 @@ int get_io_tensors_from_graph(const ggml_cgraph * cgraph, qnn::ggml_tensor_array continue; } - if (dst->op == GGML_OP_NONE || dst->op == GGML_OP_VIEW) { + if (dst->op == GGML_OP_NONE || dst->op == GGML_OP_VIEW || dst->op == GGML_OP_PERMUTE) { // TODO: remove GGML_OP_VIEW after view op is supported + QNN_LOG_DEBUG("node[%d]%s(%s), type: %s, skipped\n", i, ggml_get_name(dst), ggml_op_desc(dst), + ggml_type_name(dst->type)); continue; } + QNN_LOG_DEBUG("node[%d]%s(%s), type: %s\n", i, ggml_get_name(dst), ggml_op_desc(dst), + ggml_type_name(dst->type)); rank = std::max(rank, ggml_n_dims(dst)); if (connectivity_map.count(dst) == 0) { connectivity_map[dst] = { @@ -150,10 +153,12 @@ int get_io_tensors_from_graph(const ggml_cgraph * cgraph, qnn::ggml_tensor_array ++(connectivity_map[dst].in_degree); } - for (size_t i = 0; i < GGML_MAX_DIMS && dst->src[i]; ++i) { - auto * src = dst->src[i]; + for (size_t j = 0; j < GGML_MAX_DIMS && dst->src[j]; ++j) { + auto * src = dst->src[j]; rank = std::max(rank, ggml_n_dims(src)); + QNN_LOG_DEBUG("node[%d]: src[%d]: %s(%s), type: %s\n", i, (int) j, ggml_get_name(src), ggml_op_desc(src), + ggml_type_name(src->type)); if (connectivity_map.count(src) == 0) { connectivity_map[src] = { 0, @@ -187,16 +192,155 @@ int get_io_tensors_from_graph(const ggml_cgraph * cgraph, qnn::ggml_tensor_array return rank; } +/* + * for src0_F32, src1_F32, dst_F32 -> GGML_TYPE_COUNT + * for src0_F16, src1_F16, dst_F16 -> GGML_TYPE_COUNT + * for src0_F16, src1_F32, dst_F32 -> GGML_TYPE_F32 + * for src0_q4, src1_F32, dst_F32 -> GGML_TYPE_F32 + * for src0_q4, src1_F16, dst_F32 -> GGML_TYPE_F32 + */ +ggml_type get_override_data_type(const qnn::ggml_tensor_array_t & inputs, const qnn::ggml_tensor_array_t & outputs) { + GGML_ASSERT(!inputs.empty()); + ggml_type override_data_type = inputs.front()->type; + bool is_same_data_type = true; + for (auto * tensor : inputs) { + QNN_LOG_DEBUG("input_tensor: %s(%s), override_data_type(%s)\n", ggml_get_name(tensor), + ggml_type_name(tensor->type), ggml_type_name(override_data_type)); + is_same_data_type = is_same_data_type && tensor->type == override_data_type; + override_data_type = 
std::min(override_data_type, tensor->type); + } + + for (auto * tensor : outputs) { + QNN_LOG_DEBUG("output_tensor: %s(%s), override_data_type(%s)\n", ggml_get_name(tensor), + ggml_type_name(tensor->type), ggml_type_name(override_data_type)); + is_same_data_type = is_same_data_type && tensor->type == override_data_type; + override_data_type = std::min(override_data_type, tensor->type); + } + + return is_same_data_type ? GGML_TYPE_COUNT : override_data_type; +} + +static const QnnHtpGraph_CustomConfig_t kDefaultHvxConfig = []() { + QnnHtpGraph_CustomConfig_t hvx_config = QNN_HTP_GRAPH_CUSTOM_CONFIG_INIT; + hvx_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_NUM_HVX_THREADS; + hvx_config.numHvxThreads = 8; + return hvx_config; +}(); + +static const QnnHtpGraph_CustomConfig_t kDefaultDlbcConfig = []() { + QnnHtpGraph_CustomConfig_t dlbc_config = QNN_HTP_GRAPH_CUSTOM_CONFIG_INIT; + dlbc_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION; + dlbc_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_DLBC; + dlbc_config.optimizationOption.floatValue = 1.0; // set to 0.0 to turn off DLBC + return dlbc_config; +}(); + +/* + * 1 = Faster preparation time, less optimal graph + * 2 = Longer preparation time, more optimal graph + * 3 = Longest preparation time, most likely even more optimal graph: + * QNN_HTP_DEVICE_CONFIG_OPTION_SOC configuration will be taken into account when possible, details see HTP Backend Specific Page + */ +static const QnnHtpGraph_CustomConfig_t kDefaultOptConfig = []() { + QnnHtpGraph_CustomConfig_t opt_config = QNN_HTP_GRAPH_CUSTOM_CONFIG_INIT; + opt_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION; + opt_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG; +#ifndef NDEBUG + opt_config.optimizationOption.floatValue = 3; +#else + opt_config.optimizationOption.floatValue = 1; +#endif + return opt_config; +}(); + +static const QnnHtpGraph_CustomConfig_t kHtpPrecisionConfigF16 = []() { + QnnHtpGraph_CustomConfig_t precision_config = QNN_HTP_GRAPH_CUSTOM_CONFIG_INIT; + precision_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_PRECISION; + precision_config.precision = QNN_PRECISION_FLOAT16; + return precision_config; +}(); + +constexpr QnnHtpGraph_CustomConfig_t make_vtcm_config(size_t vtcm_size_in_mb) { + QnnHtpGraph_CustomConfig_t vtcm_config = QNN_HTP_GRAPH_CUSTOM_CONFIG_INIT; + vtcm_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE; + vtcm_config.vtcmSizeInMB = (uint32_t) vtcm_size_in_mb; + return vtcm_config; +} + +constexpr QnnGraph_Config_t make_graph_config(const QnnHtpGraph_CustomConfig_t * custom_config) { + QnnGraph_Config_t graph_config = QNN_GRAPH_CONFIG_INIT; + graph_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_config.customConfig = const_cast(custom_config); + return graph_config; +} + } // namespace namespace qnn { -qnn_graph::qnn_graph(const std::string & graph_name, QNNBackend device, std::shared_ptr qnn_instance, - size_t vtcm_size_in_mb) : +ggml_type qnn_graph::get_graph_key_from_cgraph(const ggml_cgraph * cgraph, std::string & output) { + if (cgraph->n_nodes == 0) { + QNN_LOG_DEBUG("empty cgraph\n"); + return GGML_TYPE_COUNT; + } + + ggml_type override_type = GGML_TYPE_COUNT; + { + // TODO: can we have a better approach to get the override_type here? 
+ // though it is O(n) + O(mlog(m)) complexity, our graph is small, so it is fine + ggml_tensor_array_t inputs; + ggml_tensor_array_t outputs; + get_io_tensors_from_graph(cgraph, inputs, outputs); + if (!inputs.empty() && !outputs.empty()) { + override_type = get_override_data_type(inputs, outputs); + QNN_LOG_DEBUG("get_graph_key, override_type: %s\n", ggml_type_name(override_type)); + } else { + QNN_LOG_DEBUG("get_graph_key, no input or output tensors\n"); + } + } + + ggml_type min_op_type = GGML_TYPE_COUNT; + { + bool is_start = true; + for (int i = 0; i < cgraph->n_nodes; ++i) { + auto * op = cgraph->nodes[i]; + if (ggml_is_empty(op)) { + QNN_LOG_DEBUG("empty op in graph, skipping\n"); + continue; + } + + if (op->op == GGML_OP_NONE || op->op == GGML_OP_VIEW || op->op == GGML_OP_PERMUTE) { + QNN_LOG_DEBUG("%s in graph, skipping\n", ggml_op_desc(op)); + continue; + } + + min_op_type = std::min(min_op_type, op->type); + if (is_start) { + qnn::get_qnn_op_desc(op, is_start, override_type, output); + is_start = false; + } else { + output += '#'; + qnn::get_qnn_op_desc(op, is_start, override_type, output); + } + } + } + + if (cgraph->n_nodes > 1) { + auto * last_op = cgraph->nodes[cgraph->n_nodes - 1]; + output += qnn::get_ggml_type_name(last_op->type); + output += '_'; + qnn::append_tensor_shape_and_type(last_op, output); + } + + return min_op_type; +} + +qnn_graph::qnn_graph(const std::string & graph_name, QNNBackend device, qnn_instance_ptr qnn_instance, + htp_precision precision, size_t vtcm_size_in_mb) : _graph_name(graph_name), _device(device), _qnn_instance(qnn_instance) { - QNN_LOG_DEBUG("[%s][%s]created\n", get_backend_name(device), graph_name.c_str()); + QNN_LOG_DEBUG("[%s][%s]creating\n", get_backend_name(device), graph_name.c_str()); auto qnn_interface = qnn_instance->get_qnn_interface(); auto qnn_context = qnn_instance->get_qnn_context_handle(); @@ -204,38 +348,29 @@ qnn_graph::qnn_graph(const std::string & graph_name, QNNBackend device, std::sha Qnn_GraphHandle_t graph_handle = nullptr; if (device == QNN_BACKEND_NPU) { // TODO: fix graph config here for NPU - QnnHtpGraph_CustomConfig_t hvx_config; - hvx_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_NUM_HVX_THREADS; - hvx_config.numHvxThreads = 8; - QnnGraph_Config_t graph_hvx_config; - graph_hvx_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; - graph_hvx_config.customConfig = &hvx_config; + std::vector graph_configs; - QnnHtpGraph_CustomConfig_t dlbc_config; - dlbc_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION; - dlbc_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_DLBC; - dlbc_config.optimizationOption.floatValue = 1.0; // set to 0.0 to turn off DLBC - QnnGraph_Config_t graph_dlbc_config; - graph_dlbc_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; - graph_dlbc_config.customConfig = &dlbc_config; + auto hvx_config = make_graph_config(&kDefaultHvxConfig); + graph_configs.push_back(&hvx_config); - QnnHtpGraph_CustomConfig_t opt_config; - opt_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG; - opt_config.optimizationOption.floatValue = 1; // 1 / 3 - QnnGraph_Config_t graph_opt_config; - graph_opt_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; - graph_opt_config.customConfig = &opt_config; + auto dlbc_config = make_graph_config(&kDefaultDlbcConfig); + graph_configs.push_back(&dlbc_config); - QnnHtpGraph_CustomConfig_t vtcm_config; - vtcm_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE; - vtcm_config.vtcmSizeInMB = (uint32_t) vtcm_size_in_mb; - 
QnnGraph_Config_t graph_vtcm_config; - graph_vtcm_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; - graph_vtcm_config.customConfig = &vtcm_config; + auto opt_config = make_graph_config(&kDefaultOptConfig); + graph_configs.push_back(&opt_config); - const QnnGraph_Config_t * graph_configs[] = { &graph_hvx_config, &graph_dlbc_config, &graph_vtcm_config, - &graph_opt_config, nullptr }; - error = qnn_interface->qnn_graph_create(qnn_context, graph_name.c_str(), graph_configs, &graph_handle); + auto vctm_sub_config = make_vtcm_config(vtcm_size_in_mb); + auto vtcm_config = make_graph_config(&vctm_sub_config); + graph_configs.push_back(&vtcm_config); + + if (precision == qnn_graph::kHtpFp16) { + auto precision_config = make_graph_config(&kHtpPrecisionConfigF16); + graph_configs.push_back(&precision_config); + QNN_LOG_DEBUG("[%s][%s]set precision to F16\n", get_backend_name(device), graph_name.c_str()); + } + + graph_configs.push_back(nullptr); + error = qnn_interface->qnn_graph_create(qnn_context, graph_name.c_str(), graph_configs.data(), &graph_handle); } else { error = qnn_interface->qnn_graph_create(qnn_context, graph_name.c_str(), nullptr, &graph_handle); } @@ -246,9 +381,16 @@ qnn_graph::qnn_graph(const std::string & graph_name, QNNBackend device, std::sha return; } - QNN_LOG_DEBUG("[%s][%s]create succeed\n", get_backend_name(device), graph_name.c_str()); +#ifdef GGML_QNN_ENABLE_PERFORMANCE_TRACKING + if (device == QNN_BACKEND_NPU) { + _event_tracer = std::make_shared( + graph_name, qnn_interface, qnn_instance->get_qnn_backend_handle(), qnn_event_tracer::PROFILE_OP_TRACE); + } +#endif + _graph_handle = graph_handle; _qnn_interface = qnn_interface; + QNN_LOG_DEBUG("[%s][%s]create succeed\n", get_backend_name(device), graph_name.c_str()); } qnn_graph::~qnn_graph() { @@ -261,15 +403,28 @@ bool qnn_graph::build_graph_from_ggml_graph(const ggml_cgraph * cgraph) { ggml_tensor_array_t inputs; ggml_tensor_array_t outputs; int rank = get_io_tensors_from_graph(cgraph, inputs, outputs); - QNN_LOG_DEBUG("[%s]rank: %d, input_set: %d, output_set: %d\n", get_backend_name(_device), rank, int(inputs.size()), - int(outputs.size())); + QNN_LOG_DEBUG("[%s][%s]rank: %d, graph_nodes: %d, input_set: %d, output_set: %d\n", get_backend_name(_device), + _graph_name.c_str(), rank, cgraph->n_nodes, int(inputs.size()), int(outputs.size())); { + static_assert( + GGML_TYPE_COUNT > GGML_TYPE_Q8_0 && GGML_TYPE_Q8_0 > GGML_TYPE_F16 && GGML_TYPE_F16 > GGML_TYPE_F32, + "GGML_TYPE enum order is not correct"); + + QNN_SCOPED_PERFORMANCE_TRACKER("[%s][%s]build_graph_from_ggml_graph", get_backend_name(_device), + _graph_name.c_str()); + + auto override_data_type = get_override_data_type(inputs, outputs); + if (override_data_type != GGML_TYPE_COUNT) { + QNN_LOG_DEBUG("[%s][%s]set override_data_type: %s\n", get_backend_name(_device), _graph_name.c_str(), + ggml_type_name(override_data_type)); + } + qnn_tensor_cache_t tensor_cache; - auto input_tensors = create_tensors_with_cache(inputs, ggml_qnn_tensor::INPUT, rank, _device, _graph_handle, - _qnn_instance, tensor_cache); - auto output_tensors = create_tensors_with_cache(outputs, ggml_qnn_tensor::OUTPUT, rank, _device, _graph_handle, - _qnn_instance, tensor_cache); + auto input_tensors = create_tensors_with_cache(inputs, ggml_qnn_tensor::INPUT, rank, override_data_type, + _device, _graph_handle, _qnn_instance, tensor_cache); + auto output_tensors = create_tensors_with_cache(outputs, ggml_qnn_tensor::OUTPUT, rank, GGML_TYPE_COUNT, + _device, _graph_handle, _qnn_instance, tensor_cache); 
qnn_op_config_array_t operations; for (int i = 0; i < cgraph->n_nodes; i++) { ggml_tensor * dst = cgraph->nodes[i]; @@ -277,14 +432,21 @@ bool qnn_graph::build_graph_from_ggml_graph(const ggml_cgraph * cgraph) { continue; } - if (dst->op == GGML_OP_NONE || dst->op == GGML_OP_VIEW) { + if (dst->op == GGML_OP_NONE || dst->op == GGML_OP_VIEW || dst->op == GGML_OP_PERMUTE) { // TODO: remove GGML_OP_VIEW after view op is supported continue; } - QNN_LOG_DEBUG("[%s]create op: %s\n", get_backend_name(_device), get_qnn_op_name(dst)); +#ifndef NDEBUG + { + std::string op_desc; + get_qnn_op_desc(dst, true, GGML_TYPE_COUNT, op_desc); + QNN_LOG_DEBUG("[%s]create op(%s) with qnn op(%s)\n", get_backend_name(_device), op_desc.c_str(), + get_qnn_op_name(dst)); + } +#endif auto operation = create_operation_from_op_tensor(dst, dst->name, rank, _device, _graph_handle, - _qnn_instance, true, tensor_cache); // TODO: fix op name + _qnn_instance, tensor_cache); // TODO: fix op name operations.push_back(operation); } @@ -300,59 +462,81 @@ bool qnn_graph::build_graph_from_ggml_graph(const ggml_cgraph * cgraph) { return true; } -bool qnn_graph::execute(const ggml_cgraph * cgraph) { +bool qnn_graph::execute(const ggml_cgraph * cgraph, std::shared_ptr convert_context) { ggml_tensor_array_t inputs; ggml_tensor_array_t outputs; + { + QNN_SCOPED_PERFORMANCE_TRACKER("[%s][%s]get_io_tensors_from_graph", get_backend_name(_device), + _graph_name.c_str()); #ifdef NDEBUG - get_io_tensors_from_graph(cgraph, inputs, outputs); + get_io_tensors_from_graph(cgraph, inputs, outputs); #else - int rank = get_io_tensors_from_graph(cgraph, inputs, outputs); - QNN_LOG_DEBUG("[%s]rank: %d, input_set: %d, output_set: %d\n", get_backend_name(_device), rank, int(inputs.size()), - int(outputs.size())); + int rank = get_io_tensors_from_graph(cgraph, inputs, outputs); + QNN_LOG_DEBUG("[%s]rank: %d, input_set: %d, output_set: %d\n", get_backend_name(_device), rank, + int(inputs.size()), int(outputs.size())); #endif + } { - if (!qnn::bind_tensors(inputs, _tensor_inputs, _qnn_tensor_inputs)) { - QNN_LOG_ERROR("[%s][%s]bind input tensors failed\n", get_backend_name(_device), _graph_name.c_str()); - return false; + QNN_SCOPED_PERFORMANCE_TRACKER("[%s][%s]bind_tensors", get_backend_name(_device), _graph_name.c_str()); + auto override_data_type = get_override_data_type(inputs, outputs); + if (override_data_type != GGML_TYPE_COUNT) { + QNN_LOG_DEBUG("[%s][%s]override_data_type: %s\n", get_backend_name(_device), _graph_name.c_str(), + ggml_type_name(override_data_type)); + auto buffers = convert(convert_context, inputs, override_data_type); + if (!qnn::bind_tensors_with_custom_buffers(inputs, buffers, _tensor_inputs, _qnn_tensor_inputs)) { + QNN_LOG_ERROR("[%s][%s]bind input tensors failed\n", get_backend_name(_device), _graph_name.c_str()); + return false; + } + } else { + if (!qnn::bind_tensors(inputs, _tensor_inputs, _qnn_tensor_inputs)) { + QNN_LOG_ERROR("[%s][%s]bind input tensors failed\n", get_backend_name(_device), _graph_name.c_str()); + return false; + } } if (!qnn::bind_tensors(outputs, _tensor_outputs, _qnn_tensor_outputs)) { QNN_LOG_ERROR("[%s][%s]bind output tensors failed\n", get_backend_name(_device), _graph_name.c_str()); return false; } + } + { + QNN_SCOPED_PERFORMANCE_TRACKER("[%s][%s]execute", get_backend_name(_device), _graph_name.c_str()); auto & qnn_tensor_inputs = _qnn_tensor_inputs; auto & qnn_tensor_outputs = _qnn_tensor_outputs; - auto error = - _qnn_interface->qnn_graph_execute(_graph_handle, qnn_tensor_inputs.data(), 
qnn_tensor_inputs.size(), - qnn_tensor_outputs.data(), qnn_tensor_outputs.size(), nullptr, nullptr); + auto error = _qnn_interface->qnn_graph_execute(_graph_handle, qnn_tensor_inputs.data(), + qnn_tensor_inputs.size(), qnn_tensor_outputs.data(), + qnn_tensor_outputs.size(), GRAPH_PROFILE_HANDLE, nullptr); unbind_tensors(_tensor_inputs); unbind_tensors(_tensor_outputs); - if (error != QNN_SUCCESS) { if (_device == QNN_BACKEND_NPU && error == QNN_COMMON_ERROR_SYSTEM_COMMUNICATION) { - QNN_LOG_WARN("[%s][%s]NPU crashed. SSR detected. Caused QNN graph execute error.\n", + QNN_LOG_WARN("[%s][%s][execute]NPU crashed. SSR detected. Caused QNN graph execute error.\n", get_backend_name(_device), _graph_name.c_str()); } else { - QNN_LOG_ERROR("[%s][%s]error: %s\n", get_backend_name(_device), _graph_name.c_str(), + QNN_LOG_ERROR("[%s][%s][execute]error: %s\n", get_backend_name(_device), _graph_name.c_str(), get_qnn_error_string(error)); } return false; } QNN_LOG_DEBUG("[%s][%s]execute succeed\n", get_backend_name(_device), _graph_name.c_str()); - return true; } + + GRAPH_PROFILE_PRINT(); + return true; } bool qnn_graph::finalize() { + QNN_SCOPED_PERFORMANCE_TRACKER("[%s][%s]finalize", get_backend_name(_device), _graph_name.c_str()); + if (!qnn::add_op_to_graph(_graph_handle, _operations)) { QNN_LOG_ERROR("[%s]add nodes failed\n", _graph_name.c_str()); return false; } - auto error = _qnn_interface->qnn_graph_finalize(_graph_handle, nullptr, nullptr); + auto error = _qnn_interface->qnn_graph_finalize(_graph_handle, GRAPH_PROFILE_HANDLE, nullptr); if (error != QNN_SUCCESS) { QNN_LOG_ERROR("[%s][%s]qnn_graph_finalize.error: %s\n", get_backend_name(_device), _graph_name.c_str(), get_qnn_error_string(error)); diff --git a/ggml/src/ggml-qnn/graph.hpp b/ggml/src/ggml-qnn/graph.hpp index dc1ed0b3f8..a913b8bba3 100644 --- a/ggml/src/ggml-qnn/graph.hpp +++ b/ggml/src/ggml-qnn/graph.hpp @@ -5,8 +5,10 @@ #include #include +#include "convert.hpp" #include "ggml-qnn.h" #include "op-config.hpp" +#include "profiler.hpp" #include "qnn-lib.hpp" namespace qnn { @@ -21,19 +23,42 @@ namespace qnn { */ class qnn_graph { public: - explicit qnn_graph(const std::string & graph_name, QNNBackend device, std::shared_ptr qnn_instance, - size_t vtcm_size_in_mb); + enum htp_precision { + kHtpDefault = 0, + kHtpFp16, + }; + + /** + * @brief Generates a unique key for a given computation graph (cgraph). + * + * This key is used to cache the graph, enabling efficient reuse of previously + * compiled graphs. The key is constructed by concatenating the descriptions + * of the operations and their associated tensor dimensions within the graph. + * + * Example key format: "MUL_MATf32_2048x8192q4_K_2048x2f32#MUL(SILU,MUL_MAT)#MUL_MAT(NONE,MUL)#ADD(MUL_MAT,ADD)f32_2048x2f32" + * + * @param cgraph The computation graph for which the key is generated. + * @param output The string where the generated key will be stored. + * @return The max ggml_type of all tensors in the graph. + * + * TODO: Improve the key generation logic to handle more complex graph structures and edge cases. 
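+     *
+     * Illustrative usage (a minimal sketch; the cache container and surrounding variable names
+     * are hypothetical, not part of this class):
+     * @code
+     *   std::string key;
+     *   ggml_type max_type = qnn_graph::get_graph_key_from_cgraph(cgraph, key);
+     *   // max_type can inform the htp_precision passed to the constructor
+     *   auto it = graph_cache.find(key);
+     *   if (it == graph_cache.end()) {
+     *       auto graph = std::make_shared<qnn_graph>(key, device, instance, qnn_graph::kHtpDefault,
+     *                                                vtcm_size_in_mb);
+     *       if (graph->build_graph_from_ggml_graph(cgraph)) {
+     *           it = graph_cache.emplace(key, std::move(graph)).first;
+     *       }
+     *   }
+     * @endcode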
+ */ + static ggml_type get_graph_key_from_cgraph(const ggml_cgraph * cgraph, std::string & output); + + explicit qnn_graph(const std::string & graph_name, QNNBackend device, qnn_instance_ptr qnn_instance, + htp_precision precision, size_t vtcm_size_in_mb); + ~qnn_graph(); bool build_graph_from_ggml_graph(const ggml_cgraph * cgraph); - bool execute(const ggml_cgraph * cgraph); + bool execute(const ggml_cgraph * cgraph, std::shared_ptr convert_context); bool is_valid() const { return _graph_handle != nullptr; } Qnn_GraphHandle_t get_graph_handler() const { return _graph_handle; } - std::shared_ptr get_qnn_instance() { return _qnn_instance; } + qnn_instance_ptr get_qnn_instance() { return _qnn_instance; } const std::string & get_name() const { return _graph_name; } @@ -42,18 +67,23 @@ class qnn_graph { private: bool finalize(); - const std::string _graph_name; - const QNNBackend _device; - Qnn_GraphHandle_t _graph_handle = nullptr; - std::shared_ptr _qnn_instance; - std::shared_ptr _qnn_interface; - qnn_op_config_array_t _operations; + const std::string _graph_name; + const QNNBackend _device; + Qnn_GraphHandle_t _graph_handle = nullptr; + qnn_instance_ptr _qnn_instance; + qnn_interface_ptr _qnn_interface; + qnn_op_config_array_t _operations; qnn_tensor_array_t _tensor_inputs; qnn_tensor_array_t _tensor_outputs; std::vector _qnn_tensor_inputs; std::vector _qnn_tensor_outputs; +#ifdef GGML_QNN_ENABLE_PERFORMANCE_TRACKING + // profiler + qnn_event_tracer_ptr _event_tracer; +#endif + DISABLE_COPY(qnn_graph); DISABLE_MOVE(qnn_graph); }; diff --git a/ggml/src/ggml-qnn/logger.hpp b/ggml/src/ggml-qnn/logger.hpp index cf94ce2217..309ae3e985 100644 --- a/ggml/src/ggml-qnn/logger.hpp +++ b/ggml/src/ggml-qnn/logger.hpp @@ -1,10 +1,11 @@ #pragma once +#include + #include #include "ggml-impl.h" #include "ggml.h" -#include "QnnLog.h" namespace qnn { void sdk_logcallback(const char * fmt, QnnLog_Level_t level, uint64_t timestamp, va_list argp); @@ -13,4 +14,9 @@ void sdk_logcallback(const char * fmt, QnnLog_Level_t level, uint64_t timestamp, #define QNN_LOG_ERROR(...) (GGML_LOG_ERROR(__VA_ARGS__)) #define QNN_LOG_WARN(...) (GGML_LOG_WARN(__VA_ARGS__)) #define QNN_LOG_INFO(...) (GGML_LOG_INFO(__VA_ARGS__)) -#define QNN_LOG_DEBUG(...) (GGML_LOG_DEBUG(__VA_ARGS__)) + +#ifndef NDEBUG +# define QNN_LOG_DEBUG(...) (GGML_LOG_DEBUG(__VA_ARGS__)) +#else +# define QNN_LOG_DEBUG(...) +#endif diff --git a/ggml/src/ggml-qnn/op-config-base.hpp b/ggml/src/ggml-qnn/op-config-base.hpp index b24b53bf2a..87ca798272 100644 --- a/ggml/src/ggml-qnn/op-config-base.hpp +++ b/ggml/src/ggml-qnn/op-config-base.hpp @@ -70,7 +70,7 @@ class ggml_qnn_op_config { * * @return A reference to a vector of qnn_tensor_ptr_t objects representing the input tensors. */ - virtual const qnn_tensor_array_t & get_input_tensors() = 0; + virtual qnn_tensor_array_t & get_input_tensors() = 0; /** * @brief Pure virtual function to retrieve the output tensors of a QNN. @@ -81,7 +81,7 @@ class ggml_qnn_op_config { * * @return A reference to a vector of qnn_tensor_ptr_t objects representing the output tensors. */ - virtual const qnn_tensor_array_t & get_output_tensors() = 0; + virtual qnn_tensor_array_t & get_output_tensors() = 0; /** * @brief Adds an operation to the given graph. 
diff --git a/ggml/src/ggml-qnn/op-config-caps.cpp b/ggml/src/ggml-qnn/op-config-caps.cpp index 16b50503be..a29ea28ad6 100644 --- a/ggml/src/ggml-qnn/op-config-caps.cpp +++ b/ggml/src/ggml-qnn/op-config-caps.cpp @@ -3,30 +3,77 @@ namespace { -using op_constructor_t = std::shared_ptr (*)(const ggml_tensor *, const std::string &, +using op_constructor_t = std::shared_ptr (*)(const ggml_tensor *, const std::string &, std::shared_ptr); -using op_dims_calc_func_t = void (*)(const std::vector & input_dims, - qnn::ggml_dimension_array_t & output_dims); -void element_wise_op_dims(const std::vector & input_dims, - qnn::ggml_dimension_array_t & output_dims) { - for (size_t i = 1; i < std::size(output_dims); i++) { - output_dims[i] = input_dims.front()[i]; +using op_description_generator_t = void (*)(const ggml_tensor * op, bool append_dimensions, + ggml_type override_data_type, std::string & output); + +void append_tensor_shape_and_type_impl(const ggml_tensor * tensor, ggml_type override_data_type, std::string & output) { + char buffer[256] = {}; + const auto * type_name = qnn::get_ggml_type_name(std::min(tensor->type, override_data_type)); + int len = 0; + switch (ggml_n_dims(tensor)) { + case 1: + len = snprintf(buffer, sizeof(buffer), "%ld%s", (long) tensor->ne[0], type_name); + break; + case 2: + len = snprintf(buffer, sizeof(buffer), "%ldx%ld%s", (long) tensor->ne[0], (long) tensor->ne[1], type_name); + break; + case 3: + len = snprintf(buffer, sizeof(buffer), "%ldx%ldx%ld%s", (long) tensor->ne[0], (long) tensor->ne[1], + (long) tensor->ne[2], type_name); + break; + case 4: + default: + len = snprintf(buffer, sizeof(buffer), "%ldx%ldx%ldx%ld%s", (long) tensor->ne[0], (long) tensor->ne[1], + (long) tensor->ne[2], (long) tensor->ne[3], type_name); + break; + } + GGML_ASSERT(len > 0 && len < (int) sizeof(buffer)); + output.append(buffer, len); +} + +void get_graph_key_from_op(const ggml_tensor * op, ggml_type override_data_type, std::string & output) { + output += ggml_op_desc(op); + output += qnn::get_ggml_type_name(op->type); + for (size_t i = 0; i < GGML_MAX_SRC && op->src[i]; ++i) { + auto * src = op->src[i]; + if (!src) { + break; + } + + output += '_'; + append_tensor_shape_and_type_impl(src, override_data_type, output); } } -void mat_mul_op_dims(const std::vector & input_dims, - qnn::ggml_dimension_array_t & output_dims) { - GGML_ASSERT(input_dims.size() == 2); - output_dims[0] = input_dims.front()[1]; - output_dims[1] = input_dims.back()[1]; +void get_op_key_with_src_op_desc(const ggml_tensor * op, std::string & output) { + output += ggml_op_desc(op); + output += '('; + if (op->src[0]) { + output += ggml_op_desc(op->src[0]); + } + for (size_t i = 1; i < GGML_MAX_SRC && op->src[i]; ++i) { + output += ','; + output += ggml_op_desc(op->src[i]); + } + output += ')'; +} + +void generic_get_op_desc(const ggml_tensor * op, bool append_dimensions, ggml_type override_data_type, + std::string & output) { + if (append_dimensions) { + get_graph_key_from_op(op, override_data_type, output); + } else { + get_op_key_with_src_op_desc(op, output); + } } struct qnn_op_caps_t { - const char * qnn_op_name = nullptr; - const size_t input_param_count = 0; - op_dims_calc_func_t calc_dims_func = nullptr; - const char * qnn_param_name = nullptr; + const char * qnn_op_name = nullptr; + op_description_generator_t get_desc = nullptr; + const char * qnn_param_name = nullptr; }; constexpr const qnn_op_caps_t kOpCaps[] = { @@ -35,41 +82,29 @@ constexpr const qnn_op_caps_t kOpCaps[] = { { // GGML_OP_ADD 
QNN_OP_ELEMENT_WISE_ADD, // qnn_op_name - 2, // input_param_count - element_wise_op_dims, // calc_dims_func }, {}, // GGML_OP_ADD1 {}, // GGML_OP_ACC { // GGML_OP_SUB QNN_OP_ELEMENT_WISE_SUBTRACT, // qnn_op_name - 2, // input_param_count - element_wise_op_dims, // calc_dims_func }, { // GGML_OP_MUL QNN_OP_ELEMENT_WISE_MULTIPLY, // qnn_op_name - 2, // input_param_count - element_wise_op_dims, // calc_dims_func }, { // GGML_OP_DIV QNN_OP_ELEMENT_WISE_DIVIDE, // qnn_op_name - 2, // input_param_count - element_wise_op_dims, // calc_dims_func }, {}, // GGML_OP_SQR { // GGML_OP_SQRT QNN_OP_ELEMENT_WISE_SQUARE_ROOT, // qnn_op_name - 1, // input_param_count - element_wise_op_dims, // calc_dims_func }, { // GGML_OP_LOG QNN_OP_ELEMENT_WISE_LOG, // qnn_op_name - 1, // input_param_count - element_wise_op_dims, // calc_dims_func }, {}, // GGML_OP_SIN {}, // GGML_OP_COS @@ -86,17 +121,14 @@ constexpr const qnn_op_caps_t kOpCaps[] = { { // GGML_OP_RMS_NORM QNN_OP_RMS_NORM, // qnn_op_name - 1, // input_param_count - nullptr, // TODO: calc_dims_func + generic_get_op_desc, // get_desc QNN_OP_RMS_NORM_PARAM_EPSILON, // qnn_param_name }, {}, // GGML_OP_RMS_NORM_BACK {}, // GGML_OP_GROUP_NORM { // GGML_OP_MUL_MAT - QNN_OP_MAT_MUL, // qnn_op_name - 2, // input_param_count - mat_mul_op_dims, // calc_dims_func + QNN_OP_MAT_MUL, // qnn_op_name }, {}, // GGML_OP_MUL_MAT_ID {}, // GGML_OP_OUT_PROD @@ -107,8 +139,6 @@ constexpr const qnn_op_caps_t kOpCaps[] = { { // GGML_OP_RESHAPE QNN_OP_RESHAPE, // qnn_op_name - 1, // input_param_count - nullptr, // TODO: calc_dims_func }, {}, // GGML_OP_VIEW {}, // GGML_OP_PERMUTE @@ -179,8 +209,6 @@ constexpr const qnn_op_caps_t kOpCaps[] = { { // GGML_UNARY_OP_GELU QNN_OP_GELU, // qnn_op_name - 1, // input_param_count - nullptr, // TODO: calc_dims_func }, {}, // GGML_UNARY_OP_GELU_QUICK {}, // GGML_UNARY_OP_SILU @@ -189,15 +217,11 @@ constexpr const qnn_op_caps_t kOpCaps[] = { {}, // GGML_UNARY_OP_EXP }; -static_assert(kOpCaps[GGML_OP_NONE].calc_dims_func == nullptr, "GGML_OP_NONE should not have calc_dims_func function"); -static_assert(kOpCaps[GGML_OP_ADD].calc_dims_func == element_wise_op_dims, - "GGML_OP_ADD does not have element_wise_op_dims function"); -static_assert(kOpCaps[GGML_OP_MUL_MAT].calc_dims_func == mat_mul_op_dims, - "GGML_OP_ADD does not have element_wise_op_dims function"); -static_assert(kOpCaps[GGML_OP_LOG].calc_dims_func == element_wise_op_dims, - "GGML_OP_LOG does not have element_wise_op_dims function"); -static_assert(kOpCaps[GGML_OP_COUNT + GGML_UNARY_OP_GELU].input_param_count == 1, - "GGML_UNARY_OP_GELU does not have 1 input parameter"); +static_assert(kOpCaps[GGML_OP_NONE].get_desc == nullptr, "GGML_OP_NONE should not have get_desc function"); +static_assert(kOpCaps[GGML_OP_ADD].qnn_op_name, "GGML_OP_ADD does not have qnn_op_name in the kOpCaps table"); +static_assert(kOpCaps[GGML_OP_MUL_MAT].qnn_op_name, "GGML_OP_MUL_MAT does not have qnn_op_name in the kOpCaps table"); +static_assert(kOpCaps[GGML_OP_MUL].qnn_op_name, "GGML_OP_MUL does not have qnn_op_name in the kOpCaps table"); +static_assert(kOpCaps[GGML_OP_LOG].qnn_op_name, "GGML_OP_LOG does not have qnn_op_name in the kOpCaps table"); static_assert(std::size(kOpCaps) == (GGML_OP_COUNT + GGML_UNARY_OP_COUNT), "GGML_OP_COUNT does not match the size of the kOpCaps table"); @@ -368,6 +392,10 @@ static_assert(std::size(kOpConstructors) == (GGML_OP_COUNT + GGML_UNARY_OP_COUNT namespace qnn { +void append_tensor_shape_and_type(const ggml_tensor * tensor, std::string & output) { + 
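+    // GGML_TYPE_COUNT compares greater than every real ggml_type, so it serves as the
+    // "no override" sentinel here: append_tensor_shape_and_type_impl() takes
+    // std::min(tensor->type, override_data_type) and therefore prints the tensor's own type.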
append_tensor_shape_and_type_impl(tensor, GGML_TYPE_COUNT, output); +} + size_t get_qnn_op_index(const ggml_tensor * tensor) { if (tensor->op == GGML_OP_UNARY) { return kGgmlUnaryOpStart + ggml_get_unary_op(tensor); @@ -383,14 +411,20 @@ const char * get_qnn_op_name(const ggml_tensor * op) { return kOpCaps[op_index].qnn_op_name; } -size_t get_qnn_op_input_param_count(const ggml_tensor * op) { +void get_qnn_op_desc(const ggml_tensor * op, bool append_dimensions, ggml_type override_data_type, + std::string & output) { auto op_index = get_qnn_op_index(op); GGML_ASSERT(op_index < std::size(kOpCaps)); - return kOpCaps[op_index].input_param_count; + auto get_desc = kOpCaps[op_index].get_desc; + if (get_desc) { + get_desc(op, append_dimensions, override_data_type, output); + } else { + generic_get_op_desc(op, append_dimensions, override_data_type, output); + } } std::shared_ptr create_op(const ggml_tensor * op, const std::string & name, - std::shared_ptr qnn_instance) { + qnn_instance_ptr qnn_instance) { auto op_index = get_qnn_op_index(op); GGML_ASSERT(op_index < std::size(kOpCaps)); auto op_constructor = kOpConstructors[op_index]; diff --git a/ggml/src/ggml-qnn/op-config-impl.cpp b/ggml/src/ggml-qnn/op-config-impl.cpp index 14638a554e..b85f145045 100644 --- a/ggml/src/ggml-qnn/op-config-impl.cpp +++ b/ggml/src/ggml-qnn/op-config-impl.cpp @@ -84,12 +84,12 @@ void ggml_qnn_op_config_base::set_input_tensors(qnn::qnn_tensor_array_t & tensor } void ggml_qnn_op_config_base::set_input_tensors(qnn::qnn_tensor_array_t && tensor_inputs) { - _tensor_inputs = tensor_inputs; + _tensor_inputs = std::move(tensor_inputs); _qnn_tensor_inputs.resize(_tensor_inputs.size()); } void ggml_qnn_op_config_base::set_output_tensors(qnn::qnn_tensor_array_t & tensor_outputs) { - _tensor_outputs = std::move(tensor_outputs); + _tensor_outputs = tensor_outputs; _qnn_tensor_outputs.resize(_tensor_outputs.size()); } @@ -99,10 +99,11 @@ void ggml_qnn_op_config_base::set_output_tensors(qnn::qnn_tensor_array_t && tens } bool ggml_qnn_op_config_base::add_op_to_graph(Qnn_GraphHandle_t graph_handle) { + QNN_LOG_DEBUG("[%s]add to graph start\n", _name.c_str()); + GGML_ASSERT(_qnn_tensor_inputs.size() == _tensor_inputs.size()); GGML_ASSERT(_qnn_tensor_outputs.size() == _tensor_outputs.size()); - QNN_LOG_DEBUG("[%s]add to graph start\n", _name.c_str()); for (size_t i = 0; i < _tensor_inputs.size(); i++) { auto tensor = _tensor_inputs[i]; if (!tensor->alloc_qnn_tensor_id()) { @@ -110,7 +111,8 @@ bool ggml_qnn_op_config_base::add_op_to_graph(Qnn_GraphHandle_t graph_handle) { return false; } - QNN_LOG_DEBUG("[%s]input tensor id: %d\n", _name.c_str(), tensor->get_qnn_tensor_id()); + QNN_LOG_DEBUG("[%s]input tensor(%s), id(%d)\n", _name.c_str(), tensor->get_tensor_name().c_str(), + tensor->get_qnn_tensor_id()); _qnn_tensor_inputs[i] = tensor->get_qnn_tensor(); } @@ -121,7 +123,8 @@ bool ggml_qnn_op_config_base::add_op_to_graph(Qnn_GraphHandle_t graph_handle) { return false; } - QNN_LOG_DEBUG("[%s]output tensor id: %d\n", _name.c_str(), tensor->get_qnn_tensor_id()); + QNN_LOG_DEBUG("[%s]output tensor(%s), id(%d)\n", _name.c_str(), tensor->get_tensor_name().c_str(), + tensor->get_qnn_tensor_id()); _qnn_tensor_outputs[i] = tensor->get_qnn_tensor(); } @@ -222,18 +225,30 @@ bool ggml_qnn_matmul_op_config::initialize_op_nodes(QNNBackend device, Qnn_Graph GGML_ASSERT(_tensor_outputs.size() == 1); // create convert nodes - const auto tensor_rank = _tensor_inputs.front()->get_rank(); - qnn_tensor_array_t mat_mul_tensor_inputs = _tensor_inputs; - 
qnn_tensor_array_t mat_mul_tensor_outputs = _tensor_outputs; - if (!create_convert_nodes(device, graph_handle, tensor_rank, mat_mul_tensor_inputs, mat_mul_tensor_outputs)) { - QNN_LOG_ERROR("create convert nodes failed\n"); - return false; - } + const auto tensor_rank = _tensor_inputs.front()->get_rank(); + qnn_tensor_array_t mat_mul_tensor_inputs = _tensor_inputs; + auto tensor_type = create_input_convert_nodes(device, graph_handle, tensor_rank, mat_mul_tensor_inputs); mat_mul_tensor_inputs.front() = create_gather_nodes(device, graph_handle, tensor_rank, mat_mul_tensor_inputs.front(), mat_mul_tensor_inputs.back()->get_dimensions()); - return create_mat_mul_nodes(mat_mul_tensor_inputs, mat_mul_tensor_outputs); + + if (device != QNN_BACKEND_GPU && _tensor_outputs.front()->get_data_type() != tensor_type) { + auto convert_out = create_output_convert_nodes(device, graph_handle, tensor_rank, tensor_type, _tensor_outputs); + if (!create_mat_mul_nodes(mat_mul_tensor_inputs, convert_out->get_input_tensors())) { + QNN_LOG_ERROR("create mat_mul nodes failed\n"); + return false; + } + + _operations.push_back(convert_out); + } else { + if (!create_mat_mul_nodes(mat_mul_tensor_inputs, _tensor_outputs)) { + QNN_LOG_ERROR("create mat_mul nodes failed\n"); + return false; + } + } + + return true; } qnn_tensor_ptr_t ggml_qnn_matmul_op_config::create_gather_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, @@ -256,7 +271,7 @@ qnn_tensor_ptr_t ggml_qnn_matmul_op_config::create_gather_nodes(QNNBackend devic constexpr const auto create_node = [](const std::string & name, const int rank, const int axis, const qnn_dimension_array_t & dimensions, qnn_tensor_ptr_t tensor_input, QNNBackend device, Qnn_GraphHandle_t graph_handle, - std::shared_ptr qnn_instance, qnn_tensor_ptr_t & tensor_output) -> qnn_op_config_ptr_t { + qnn_instance_ptr qnn_instance, qnn_tensor_ptr_t & tensor_output) -> qnn_op_config_ptr_t { auto gather_out = std::make_shared(ggml_qnn_tensor::INTERMEDIATE, name + "_out", dimensions, tensor_input->get_data_type(), rank, device, graph_handle, qnn_instance); @@ -303,18 +318,16 @@ qnn_tensor_ptr_t ggml_qnn_matmul_op_config::create_gather_nodes(QNNBackend devic return gather1_out; } -bool ggml_qnn_matmul_op_config::create_convert_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, const int rank, - qnn_tensor_array_t & tensor_inputs, - qnn_tensor_array_t & tensor_outputs) { +Qnn_DataType_t ggml_qnn_matmul_op_config::create_input_convert_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, + const int rank, + qnn_tensor_array_t & tensor_inputs) { if (device == QNN_BACKEND_GPU) { // there's no convert op for GPU, so we should create matmul nodes directly. 
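+        // Returning QNN_DATATYPE_UNDEFINED is safe: the GPU path in initialize_op_nodes() never
+        // creates an output convert node, so the returned type is only consulted for CPU/NPU,
+        // where real cast nodes are inserted below.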
- return true; + return QNN_DATATYPE_UNDEFINED; } // create tensors for convert node auto tensor_type = get_tensor_type(tensor_inputs); - QNN_LOG_DEBUG("input tensor type: %s\n", qnn_datatype_to_string(tensor_type)); - for (size_t i = 0; i < tensor_inputs.size(); ++i) { // create input convert nodes auto convert_in = tensor_inputs[i]; @@ -327,29 +340,35 @@ bool ggml_qnn_matmul_op_config::create_convert_nodes(QNNBackend device, Qnn_Grap convert_in->get_dimensions(), tensor_type, rank, device, graph_handle, _qnn_instance); auto convert = std::make_shared(convert_name, QNN_OP_PACKAGE_NAME_QTI_AISW, - QNN_OP_CONVERT, _qnn_instance); + QNN_OP_CAST, _qnn_instance); + QNN_LOG_DEBUG("[%s][MUL_MAT]create: %s, type: %s\n", get_backend_name(device), convert_name.c_str(), + qnn_datatype_to_string(tensor_type)); convert->set_input_tensors({ convert_in }); convert->set_output_tensors({ convert_out }); tensor_inputs[i] = convert_out; _operations.push_back(convert); } - if (tensor_outputs.front()->get_data_type() != tensor_type) { - // create output convert node - std::string convert_name("convert_dst"); - auto convert_out = tensor_outputs.front(); - auto convert_in = std::make_shared(ggml_qnn_tensor::INTERMEDIATE, convert_name + "_in", - convert_out->get_dimensions(), tensor_type, rank, device, - graph_handle, _qnn_instance); - auto output_convert = std::make_shared(convert_name, QNN_OP_PACKAGE_NAME_QTI_AISW, - QNN_OP_CONVERT, _qnn_instance); - output_convert->set_input_tensors({ convert_in }); - output_convert->set_output_tensors({ convert_out }); - tensor_outputs.front() = convert_in; - _operations.push_back(output_convert); - } + return tensor_type; +} - return true; +qnn_op_config_ptr_t ggml_qnn_matmul_op_config::create_output_convert_nodes(QNNBackend device, + Qnn_GraphHandle_t graph_handle, + const int rank, Qnn_DataType_t tensor_type, + qnn_tensor_array_t & tensor_outputs) { + GGML_ASSERT(tensor_outputs.size() == 1); + // create output convert node + std::string convert_name("convert_dst"); + auto convert_in = std::make_shared(ggml_qnn_tensor::INTERMEDIATE, convert_name + "_in", + tensor_outputs.front()->get_dimensions(), tensor_type, rank, + device, graph_handle, _qnn_instance); + auto output_convert = std::make_shared(convert_name, QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_CAST, _qnn_instance); + QNN_LOG_DEBUG("[%s][MUL_MAT]create: %s, type: %s\n", get_backend_name(device), convert_name.c_str(), + qnn_datatype_to_string(tensor_type)); + output_convert->set_input_tensors({ convert_in }); + output_convert->set_output_tensors(tensor_outputs); + return output_convert; } bool ggml_qnn_matmul_op_config::create_mat_mul_nodes(qnn_tensor_array_t & tensor_inputs, @@ -413,8 +432,7 @@ bool ggml_qnn_matmul_op_config::create_mat_mul_nodes(qnn_tensor_array_t & tensor mat_mul->add_scalar_param(QNN_OP_MAT_MUL_PARAM_TRANSPOSE_IN1, scalar); // set tensor to mat_mul - std::swap(tensor_inputs[0], tensor_inputs[1]); - mat_mul->set_input_tensors(tensor_inputs); + mat_mul->set_input_tensors({ tensor_inputs[1], tensor_inputs[0] }); mat_mul->set_output_tensors(tensor_outputs); _operations.push_back(mat_mul); diff --git a/ggml/src/ggml-qnn/op-config-impl.hpp b/ggml/src/ggml-qnn/op-config-impl.hpp index 8e2f107b2d..558b5cafbe 100644 --- a/ggml/src/ggml-qnn/op-config-impl.hpp +++ b/ggml/src/ggml-qnn/op-config-impl.hpp @@ -15,7 +15,7 @@ namespace qnn { class ggml_qnn_op_config_base : public ggml_qnn_op_config { public: explicit ggml_qnn_op_config_base(const std::string & name, const std::string & package_name, - const 
std::string & op_type, std::shared_ptr qnn_instance) : + const std::string & op_type, qnn_instance_ptr qnn_instance) : _name(name), _package_name(package_name), _op_type(op_type), @@ -36,24 +36,24 @@ class ggml_qnn_op_config_base : public ggml_qnn_op_config { void unbind_input_tensors() override; void unbind_output_tensors() override; - const qnn_tensor_array_t & get_input_tensors() override { return _tensor_inputs; } + qnn_tensor_array_t & get_input_tensors() override { return _tensor_inputs; } - const qnn_tensor_array_t & get_output_tensors() override { return _tensor_outputs; } + qnn_tensor_array_t & get_output_tensors() override { return _tensor_outputs; } protected: Qnn_OpConfig_t get_op_config(); - std::string _name; - std::string _package_name; - std::string _op_type; - std::shared_ptr _qnn_instance; - qnn_tensor_array_t _tensor_inputs; - qnn_tensor_array_t _tensor_outputs; - qnn_tensor_array_t _tensor_parameters; - std::vector _qnn_tensor_inputs; - std::vector _qnn_tensor_outputs; - std::vector _qnn_parameters; - std::vector _param_names; + std::string _name; + std::string _package_name; + std::string _op_type; + qnn_instance_ptr _qnn_instance; + qnn_tensor_array_t _tensor_inputs; + qnn_tensor_array_t _tensor_outputs; + qnn_tensor_array_t _tensor_parameters; + std::vector _qnn_tensor_inputs; + std::vector _qnn_tensor_outputs; + std::vector _qnn_parameters; + std::vector _param_names; DISABLE_COPY(ggml_qnn_op_config_base); DISABLE_MOVE(ggml_qnn_op_config_base); @@ -62,7 +62,7 @@ class ggml_qnn_op_config_base : public ggml_qnn_op_config { class ggml_qnn_single_op_config : public ggml_qnn_op_config_base { public: explicit ggml_qnn_single_op_config(const std::string & name, const std::string & package_name, - const std::string & op_type, std::shared_ptr qnn_instance) : + const std::string & op_type, qnn_instance_ptr qnn_instance) : ggml_qnn_op_config_base(name, package_name, op_type, qnn_instance) {} bool initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle) override; @@ -75,7 +75,7 @@ class ggml_qnn_single_op_config : public ggml_qnn_op_config_base { class ggml_qnn_rmsnorm_op_config : public ggml_qnn_op_config_base { public: explicit ggml_qnn_rmsnorm_op_config(const std::string & name, const std::string & package_name, - const std::string & op_type, std::shared_ptr qnn_instance) : + const std::string & op_type, qnn_instance_ptr qnn_instance) : ggml_qnn_op_config_base(name, package_name, op_type, qnn_instance) {} bool initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle) override; @@ -87,7 +87,7 @@ class ggml_qnn_rmsnorm_op_config : public ggml_qnn_op_config_base { class ggml_qnn_aggregate_op_config : public ggml_qnn_op_config { public: - explicit ggml_qnn_aggregate_op_config(const std::string & name, std::shared_ptr qnn_instance) : + explicit ggml_qnn_aggregate_op_config(const std::string & name, qnn_instance_ptr qnn_instance) : _name(name), _qnn_instance(qnn_instance) {} @@ -121,13 +121,13 @@ class ggml_qnn_aggregate_op_config : public ggml_qnn_op_config { } } - const qnn_tensor_array_t & get_input_tensors() override { return _tensor_inputs; } + qnn_tensor_array_t & get_input_tensors() override { return _tensor_inputs; } - const qnn_tensor_array_t & get_output_tensors() override { return _tensor_outputs; } + qnn_tensor_array_t & get_output_tensors() override { return _tensor_outputs; } protected: - std::string _name; - std::shared_ptr _qnn_instance; + std::string _name; + qnn_instance_ptr _qnn_instance; std::vector _operations; qnn_tensor_array_t 
_tensor_inputs; @@ -140,17 +140,19 @@ class ggml_qnn_aggregate_op_config : public ggml_qnn_op_config { class ggml_qnn_matmul_op_config : public ggml_qnn_aggregate_op_config { public: - ggml_qnn_matmul_op_config(const std::string & name, std::shared_ptr qnn_instance) : + ggml_qnn_matmul_op_config(const std::string & name, qnn_instance_ptr qnn_instance) : ggml_qnn_aggregate_op_config(name, qnn_instance) {} bool initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle) override; private: - qnn_tensor_ptr_t create_gather_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, const int rank, - qnn_tensor_ptr_t tensor_input, qnn_dimension_array_t output_dimensions); - bool create_convert_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, const int rank, - qnn_tensor_array_t & tensor_inputs, qnn_tensor_array_t & tensor_outputs); - bool create_mat_mul_nodes(qnn_tensor_array_t & tensor_inputs, qnn_tensor_array_t & tensor_outputs); + qnn_tensor_ptr_t create_gather_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, const int rank, + qnn_tensor_ptr_t tensor_input, qnn_dimension_array_t output_dimensions); + Qnn_DataType_t create_input_convert_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, const int rank, + qnn_tensor_array_t & tensor_inputs); + qnn_op_config_ptr_t create_output_convert_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, const int rank, + Qnn_DataType_t tensor_type, qnn_tensor_array_t & tensor_outputs); + bool create_mat_mul_nodes(qnn_tensor_array_t & tensor_inputs, qnn_tensor_array_t & tensor_outputs); DISABLE_COPY(ggml_qnn_matmul_op_config); DISABLE_MOVE(ggml_qnn_matmul_op_config); diff --git a/ggml/src/ggml-qnn/op-config.hpp b/ggml/src/ggml-qnn/op-config.hpp index d613a2116c..635a831a06 100644 --- a/ggml/src/ggml-qnn/op-config.hpp +++ b/ggml/src/ggml-qnn/op-config.hpp @@ -14,11 +14,16 @@ namespace qnn { constexpr const size_t kGgmlUnaryOpStart = GGML_OP_COUNT; -size_t get_qnn_op_index(const ggml_tensor * tensor); -const char * get_qnn_op_name(const ggml_tensor * op); -size_t get_qnn_op_input_param_count(const ggml_tensor * op); +// TODO: move to a better place +void append_tensor_shape_and_type(const ggml_tensor * tensor, std::string & output); + +size_t get_qnn_op_index(const ggml_tensor * tensor); +const char * get_qnn_op_name(const ggml_tensor * op); +void get_qnn_op_desc(const ggml_tensor * op, bool append_dimensions, ggml_type override_data_type, + std::string & output); + std::shared_ptr create_op(const ggml_tensor * op, const std::string & name, - std::shared_ptr qnn_instance); + qnn_instance_ptr qnn_instance); inline bool add_op_to_graph(Qnn_GraphHandle_t graph_handle, std::vector & operations) { for (auto & op : operations) { diff --git a/ggml/src/ggml-qnn/profiler.cpp b/ggml/src/ggml-qnn/profiler.cpp new file mode 100644 index 0000000000..5625c3acf7 --- /dev/null +++ b/ggml/src/ggml-qnn/profiler.cpp @@ -0,0 +1,170 @@ + +#include "profiler.hpp" + +#include +#include + +#include "logger.hpp" +#include "qnn-lib.hpp" + +namespace { + +std::string get_duration_string(const QnnProfile_EventData_t & event_data) { + char time_str[128] = {}; + switch (event_data.unit) { + case QNN_PROFILE_EVENTUNIT_CYCLES: + snprintf(time_str, sizeof(time_str), "cycles: %lld", (long long int) event_data.value); + break; + case QNN_PROFILE_EVENTUNIT_COUNT: + snprintf(time_str, sizeof(time_str), "count: %lld", (long long int) event_data.value); + break; + case QNN_PROFILE_EVENTUNIT_BYTES: + snprintf(time_str, sizeof(time_str), "size: %lld bytes", (long long 
int) event_data.value); + break; + case QNN_PROFILE_EVENTUNIT_MICROSEC: + { + double duration_ms = event_data.value / 1000.0; + snprintf(time_str, sizeof(time_str), "duration: %.3f ms", duration_ms); + } + break; + default: + break; + } + + return time_str; +} + +} // namespace + +namespace qnn { + +qnn_event_tracer::qnn_event_tracer(const std::string & prefix, std::shared_ptr interface, + Qnn_BackendHandle_t backend_handle, sdk_profile_level level) : + _interface(interface), + _prefix(prefix) { + QnnProfile_Level_t qnn_profile_level = 0; + switch (level) { + case sdk_profile_level::PROFILE_BASIC: + qnn_profile_level = QNN_PROFILE_LEVEL_BASIC; + break; + case sdk_profile_level::PROFILE_OP_TRACE: + case sdk_profile_level::PROFILE_DETAIL: + qnn_profile_level = QNN_PROFILE_LEVEL_DETAILED; + break; + case sdk_profile_level::PROFILE_OFF: + default: + QNN_LOG_WARN("[profiler][%s]invalid profile level %d, using PROFILE_OFF\n", _prefix.c_str(), level); + return; + } + + auto error = _interface->qnn_profile_create(backend_handle, qnn_profile_level, &_handle); + if (error != QNN_SUCCESS) { + QNN_LOG_ERROR("[profiler][%s]failed to create QNN profile_handle. Backend ID %u, error %ld\n", _prefix.c_str(), + _interface->get_backend_id(), (long) QNN_GET_ERROR_CODE(error)); + _handle = nullptr; + return; + } + + if (level == sdk_profile_level::PROFILE_OP_TRACE) { + QnnProfile_Config_t qnn_profile_config = QNN_PROFILE_CONFIG_INIT; + qnn_profile_config.option = QNN_PROFILE_CONFIG_OPTION_ENABLE_OPTRACE; + std::array profile_configs = { &qnn_profile_config, nullptr }; + error = _interface->qnn_profile_set_config(_handle, profile_configs.data()); + if (error != QNN_SUCCESS) { + QNN_LOG_ERROR("[profiler][%s]failed to set QNN profile event. Backend ID %u, error %ld\n", _prefix.c_str(), + _interface->get_backend_id(), (long) QNN_GET_ERROR_CODE(error)); + _interface->qnn_profile_free(_handle); + _handle = nullptr; + return; + } + } + + QNN_LOG_DEBUG("[profiler][%s]created, Backend ID %u, level %d\n", _prefix.c_str(), _interface->get_backend_id(), + level); +} + +qnn_event_tracer::~qnn_event_tracer() { + if (_handle) { + Qnn_ErrorHandle_t error = _interface->qnn_profile_free(_handle); + if (error != QNN_SUCCESS) { + QNN_LOG_ERROR("[profiler][%s]failed to free QNN profile_handle. Backend ID %u, error %ld\n", + _prefix.c_str(), _interface->get_backend_id(), (long) QNN_GET_ERROR_CODE(error)); + } + _handle = nullptr; + } +} + +void qnn_event_tracer::print_profile_events() { + const QnnProfile_EventId_t * events_ptr = nullptr; + uint32_t num_events = 0; + auto error = _interface->qnn_profile_get_events(_handle, &events_ptr, &num_events); + if (error != QNN_SUCCESS) { + QNN_LOG_ERROR("[profiler][%s]failed to get QNN profile events. Backend ID %u, error %ld\n", _prefix.c_str(), + _interface->get_backend_id(), (long) QNN_GET_ERROR_CODE(error)); + return; + } + + if (!num_events) { + QNN_LOG_INFO("[profiler][%s]no QNN profile events\n", _prefix.c_str()); + return; + } + + QNN_LOG_INFO("[profiler][%s]print_profile_events start ----------------\n", _prefix.c_str()); + // see also: https://github.com/pytorch/executorch/blob/0ccf5093823761cf8ad98c75e5fe81f15ea42366/backends/qualcomm/runtime/backends/QnnProfiler.cpp#L73 + QnnProfile_EventData_t event_data; + for (uint32_t i = 0; i < num_events; ++i) { + error = _interface->qnn_profile_get_event_data(events_ptr[i], &event_data); + if (error != QNN_SUCCESS) { + QNN_LOG_ERROR("[profiler][%s]failed to get QNN profile event data. 
Backend ID %u, event[%d], error: %ld\n", + _prefix.c_str(), _interface->get_backend_id(), i, (long) QNN_GET_ERROR_CODE(error)); + continue; + } + + const QnnProfile_EventId_t * sub_events_ptr = nullptr; + uint32_t num_sub_events = 0; + error = _interface->qnn_profile_get_sub_events(events_ptr[i], &sub_events_ptr, &num_sub_events); + if (error != QNN_SUCCESS) { + QNN_LOG_ERROR("[profiler][%s]failed to get QNN profile sub events. Backend ID %u, event[%d], error: %ld\n", + _prefix.c_str(), _interface->get_backend_id(), i, (long) QNN_GET_ERROR_CODE(error)); + continue; + } + + auto duration = get_duration_string(event_data); + if (!num_sub_events) { + QNN_LOG_INFO("[profiler][%s]event[%d]: %s, %s\n", _prefix.c_str(), i, event_data.identifier, + duration.c_str()); + continue; + } + + QNN_LOG_INFO("[profiler][%s]event[%d]: %s, sub_count: %d, start -------------\n", _prefix.c_str(), i, + event_data.identifier, num_sub_events); + QnnProfile_EventData_t sub_event_data; + for (std::uint32_t j = 0; j < num_sub_events; ++j) { + error = _interface->qnn_profile_get_event_data(sub_events_ptr[j], &sub_event_data); + if (error != QNN_SUCCESS) { + QNN_LOG_ERROR( + "[profiler][%s]failed to get QNN profile sub event data. Backend ID %u, event[%d], sub_event[%d], " + "error: %ld\n", + _prefix.c_str(), _interface->get_backend_id(), i, j, (long) QNN_GET_ERROR_CODE(error)); + continue; + } + + if (sub_event_data.type != QNN_PROFILE_EVENTTYPE_NODE) { + QNN_LOG_DEBUG("[profiler][%s]sub_event[%d]%s, type %d, skipping\n", _prefix.c_str(), j, + sub_event_data.identifier, sub_event_data.type); + continue; + } + + auto sub_duration = get_duration_string(sub_event_data); + QNN_LOG_INFO("[profiler][%s]sub_event[%d]: %s, %s\n", _prefix.c_str(), j, sub_event_data.identifier, + sub_duration.c_str()); + } + + QNN_LOG_INFO("[profiler][%s]event[%d]: %s, %s, end --------------\n", _prefix.c_str(), i, event_data.identifier, + duration.c_str()); + } + + QNN_LOG_INFO("[profiler][%s]print_profile_events end -----------------\n", _prefix.c_str()); +} + +} // namespace qnn diff --git a/ggml/src/ggml-qnn/profiler.hpp b/ggml/src/ggml-qnn/profiler.hpp new file mode 100644 index 0000000000..34db09e0bf --- /dev/null +++ b/ggml/src/ggml-qnn/profiler.hpp @@ -0,0 +1,100 @@ +#pragma once + +#include + +#include +#include +#include +#include + +#include "logger.hpp" +#include "qnn-types.hpp" + +namespace qnn { + +#ifdef GGML_QNN_ENABLE_PERFORMANCE_TRACKING + +class qnn_scoped_timer { + public: + qnn_scoped_timer(const std::string & log_prefix) : _log_prefix(std::move(log_prefix)) { + _begin_us = ggml_time_us(); + } + + qnn_scoped_timer(qnn_scoped_timer && other) { + _begin_us = other._begin_us; + _log_prefix = std::move(other._log_prefix); + } + + ~qnn_scoped_timer() { print(); } + + void operator=(qnn_scoped_timer && other) { + _begin_us = other._begin_us; + _log_prefix = std::move(other._log_prefix); + } + + void print() const { + auto duration = (ggml_time_us() - _begin_us) / 1000.0; + QNN_LOG_INFO("[profiler]%s, duration: %.4f ms\n", _log_prefix.c_str(), duration); + } + + + private: + int64_t _begin_us = 0LL; + std::string _log_prefix; + + qnn_scoped_timer(const qnn_scoped_timer &) = delete; + void operator=(const qnn_scoped_timer &) = delete; +}; + +inline qnn_scoped_timer make_scope_perf_timer(const char * format, ...) 
{ + va_list args; + va_start(args, format); + char buffer[4096]; + vsnprintf(buffer, sizeof(buffer), format, args); + va_end(args); + return qnn_scoped_timer(buffer); +} + +#else + +inline void make_scope_perf_timer(const char *, ...) {} + +#endif + +// forward declaration of qnn_interface +class qnn_interface; + +class qnn_event_tracer { + public: + // ref: + // https://github.com/pytorch/executorch/blob/ae3d558d5e6aa04fc52a3065399fe6a773702f52/backends/qualcomm/serialization/qc_schema.py#L53 + // https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/overview.html#supported-snapdragon-devices + enum sdk_profile_level { PROFILE_OFF = 0, PROFILE_BASIC, PROFILE_DETAIL, PROFILE_OP_TRACE }; + + explicit qnn_event_tracer(const std::string & prefix, std::shared_ptr interface, + Qnn_BackendHandle_t backend_handle, sdk_profile_level level); + ~qnn_event_tracer(); + + Qnn_ProfileHandle_t get_handle() const { return _handle; } + + void print_profile_events(); + + private: + std::shared_ptr _interface; + Qnn_ProfileHandle_t _handle = nullptr; + std::string _prefix; + + DISABLE_COPY(qnn_event_tracer); + DISABLE_MOVE(qnn_event_tracer); +}; + +using qnn_event_tracer_ptr = std::shared_ptr; + +} // namespace qnn + +#ifdef GGML_QNN_ENABLE_PERFORMANCE_TRACKING +# define QNN_SCOPED_PERFORMANCE_TRACKER(fmt, ...) \ + auto __qnn_timer_##__LINE__ = qnn::make_scope_perf_timer(fmt, __VA_ARGS__) +#else +# define QNN_SCOPED_PERFORMANCE_TRACKER(fmt, ...) ((void) 0) +#endif diff --git a/ggml/src/ggml-qnn/qnn-lib.cpp b/ggml/src/ggml-qnn/qnn-lib.cpp index 3e4aa7fcd4..2ec76939c9 100644 --- a/ggml/src/ggml-qnn/qnn-lib.cpp +++ b/ggml/src/ggml-qnn/qnn-lib.cpp @@ -12,12 +12,51 @@ namespace { #ifdef _WIN32 constexpr const char * kQnnSystemLibName = "QnnSystem.dll"; constexpr const char * kQnnRpcLibName = "libcdsprpc.dll"; +constexpr const char * kQnnCpuLibName = "QnnCpu.dll"; +constexpr const char * kQnnGpuLibName = "QnnGpu.dll"; +constexpr const char * kQnnNpuLibName = "QnnHtp.dll"; #else constexpr const char * kQnnSystemLibName = "libQnnSystem.so"; constexpr const char * kQnnRpcLibName = "libcdsprpc.so"; - +constexpr const char * kQnnCpuLibName = "libQnnCpu.so"; +constexpr const char * kQnnGpuLibName = "libQnnGpu.so"; +constexpr const char * kQnnNpuLibName = "libQnnHtp.so"; #endif +constexpr const qnn::device_caps kDeviceCaps[] = { + { + // https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/CpuOpDefSupplement.html#matmul + kQnnCpuLibName, GGML_BACKEND_DEVICE_TYPE_ACCEL, (1L << GGML_TYPE_I8) | (1L << GGML_TYPE_F32), + 0xFFFFFE, // all quantized types can be offload to CPU, at current implementation, those types will be dequantized into float32 on cpu + 0, // 0 for no limitation + }, + { + // https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/GpuOpDefSupplement.html#matmul + kQnnGpuLibName, GGML_BACKEND_DEVICE_TYPE_GPU, (1L << GGML_TYPE_F32) | (1L << GGML_TYPE_F16), + // all quantized types can be offload to GPU, at current implementation, those types will be dequantized into float32 on cpu + 0xFFFFFE, (128256L * 4096 * + sizeof(float)), // tested on 8 gen 2, failed to allocate tensor with size 128256x4096 and float32 + }, + { + // https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/HtpOpDefSupplement.html#matmul + kQnnNpuLibName, GGML_BACKEND_DEVICE_TYPE_ACCEL, + (1L << GGML_TYPE_F32) | (1L << GGML_TYPE_F16) | (1L << GGML_TYPE_I16), + (1L << GGML_TYPE_Q2_K) | (1L << GGML_TYPE_Q3_K) | (1L << GGML_TYPE_Q4_K) | (1L << GGML_TYPE_Q8_K), + (8192L * 2048 + 8192 * 512 + 
2048 * 512) * sizeof(float), // TODO: should have a better way to get this value + }, +}; + +static_assert(sizeof(kDeviceCaps) / sizeof(kDeviceCaps[0]) == GGML_QNN_MAX_DEVICES, + "The number of qnn devices should be equal to GGML_QNN_MAX_DEVICES"); +static_assert(kDeviceCaps[QNN_BACKEND_NPU].type == GGML_BACKEND_DEVICE_TYPE_ACCEL, + "The NPU device should be an accelerator device"); +static_assert(kDeviceCaps[QNN_BACKEND_GPU].type == GGML_BACKEND_DEVICE_TYPE_GPU, + "The GPU device should be an GPU device"); +static_assert( + kDeviceCaps[QNN_BACKEND_CPU].type == GGML_BACKEND_DEVICE_TYPE_ACCEL, + "The CPU device should be an accelerator device"); // we treat qnn-cpu as a supplementary accelerator device +static_assert(GGML_TYPE_Q4_0 == 2 && GGML_TYPE_Q8_K == 15, "The quantized type order is not correct"); + void insert_path(std::string & path, std::string insert_path, const char separator = ':') { if (!insert_path.empty() && !path.empty()) { insert_path += separator; @@ -108,9 +147,8 @@ qnn_system_interface::~qnn_system_interface() { } } -qnn_instance::qnn_instance(const std::string & lib_path, const std::string & backend_lib_name) : - _additional_lib_load_path(lib_path), - _backend_lib_name(std::move(backend_lib_name)) { +qnn_instance::qnn_instance(const std::string & lib_path, QNNBackend device) : _additional_lib_load_path(lib_path) { + _backend_lib_name = kDeviceCaps[device].lib_name; if (set_qnn_lib_search_path(lib_path)) { QNN_LOG_DEBUG("[%s] set_qnn_lib_search_path succeed\n", _backend_lib_name.c_str()); } else { @@ -181,21 +219,27 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { qnn_status = _qnn_interface->qnn_device_get_platform_info(nullptr, &p_info); if (qnn_status == QNN_SUCCESS) { QNN_LOG_INFO("device counts %d\n", p_info->v1.numHwDevices); - QnnDevice_HardwareDeviceInfo_t * infos = p_info->v1.hwDevices; - QnnHtpDevice_OnChipDeviceInfoExtension_t chipinfo = {}; + QnnDevice_HardwareDeviceInfo_t * infos = p_info->v1.hwDevices; for (uint32_t i = 0; i < p_info->v1.numHwDevices; i++) { QNN_LOG_INFO("deviceID:%d, deviceType:%d, numCores %d\n", (int) infos[i].v1.deviceId, (int) infos[i].v1.deviceType, (int) infos[i].v1.numCores); - QnnDevice_DeviceInfoExtension_t devinfo = infos[i].v1.deviceInfoExtension; - chipinfo = devinfo->onChipDevice; - size_t htp_arch = (size_t) chipinfo.arch; + QnnDevice_DeviceInfoExtension_t devinfo = infos[i].v1.deviceInfoExtension; + QnnHtpDevice_OnChipDeviceInfoExtension_t chipinfo = devinfo->onChipDevice; + size_t htp_arch = (size_t) chipinfo.arch; QNN_LOG_INFO("htp_type:%d(%s)\n", devinfo->devType, (devinfo->devType == QNN_HTP_DEVICE_TYPE_ON_CHIP) ? 
"ON_CHIP" : ""); - QNN_LOG_INFO("qualcomm soc_model:%d(%s), htp_arch:%d(%s), vtcm_size:%d MB\n", (int) chipinfo.socModel, - qnn::get_chipset_desc(chipinfo.socModel), (int) htp_arch, qnn::get_htparch_desc(htp_arch), - (int) chipinfo.vtcmSize); - _soc_info = { chipinfo.socModel, htp_arch, chipinfo.vtcmSize }; + QNN_LOG_INFO("soc_model:%s(%s), htp_arch:%s(%d), vtcm_size:%d MB\n", + get_chipset_desc(chipinfo.socModel), get_chipset_model(chipinfo.socModel), + get_htparch_desc(htp_arch), (int) htp_arch, (int) chipinfo.vtcmSize); } + + if (p_info->v1.numHwDevices) { + QnnDevice_DeviceInfoExtension_t devinfo = infos[p_info->v1.numHwDevices - 1].v1.deviceInfoExtension; + QnnHtpDevice_OnChipDeviceInfoExtension_t chipinfo = devinfo->onChipDevice; + size_t htp_arch = (size_t) chipinfo.arch; + _soc_info = { chipinfo.socModel, htp_arch, chipinfo.vtcmSize }; + } + _qnn_interface->qnn_device_free_platform_info(nullptr, p_info); } else { // For emulator, we can't get platform info @@ -229,20 +273,6 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { QNN_LOG_INFO("create QNN device successfully\n"); } - if (_profile_level != sdk_profile_level::profile_off) { - QNN_LOG_INFO("profiling turned on; level = %d\n", _profile_level); - auto profile_level = - _profile_level == sdk_profile_level::profile_detail ? QNN_PROFILE_LEVEL_DETAILED : QNN_PROFILE_LEVEL_BASIC; - - if (QNN_PROFILE_NO_ERROR != - _qnn_interface->qnn_profile_create(_qnn_backend_handle, profile_level, &_qnn_profile_handle)) { - QNN_LOG_WARN("unable to create profile handle in the backend\n"); - return 6; - } else { - QNN_LOG_DEBUG("initialize qnn profile successfully\n"); - } - } - _rpc_lib_handle = load_lib_with_fallback(kQnnRpcLibName, _additional_lib_load_path); if (_rpc_lib_handle) { _pfn_rpc_mem_alloc = reinterpret_cast(dl_sym(_rpc_lib_handle, "rpcmem_alloc")); @@ -339,7 +369,7 @@ int qnn_instance::qnn_finalize() { } if (_qnn_context_handle) { - error = _qnn_interface->qnn_context_free(_qnn_context_handle, _qnn_profile_handle); + error = _qnn_interface->qnn_context_free(_qnn_context_handle, nullptr); if (error != QNN_SUCCESS) { QNN_LOG_WARN("failed to free QNN context_handle: ID %u, error %d\n", _qnn_interface->get_backend_id(), (int) QNN_GET_ERROR_CODE(error)); @@ -347,15 +377,6 @@ int qnn_instance::qnn_finalize() { _qnn_context_handle = nullptr; } - if (_qnn_profile_handle) { - error = _qnn_interface->qnn_profile_free(_qnn_profile_handle); - if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to free QNN profile_handle: ID %u, error %d\n", _qnn_interface->get_backend_id(), - (int) QNN_GET_ERROR_CODE(error)); - } - _qnn_profile_handle = nullptr; - } - if (_qnn_device_handle) { error = _qnn_interface->qnn_device_free(_qnn_device_handle); if (error != QNN_SUCCESS) { @@ -535,4 +556,8 @@ int qnn_instance::unload_backend() { return 0; } +const device_caps & get_device_caps(QNNBackend device) { + return kDeviceCaps[device]; +} + } // namespace qnn diff --git a/ggml/src/ggml-qnn/qnn-lib.hpp b/ggml/src/ggml-qnn/qnn-lib.hpp index bb6006acda..3d0084b868 100644 --- a/ggml/src/ggml-qnn/qnn-lib.hpp +++ b/ggml/src/ggml-qnn/qnn-lib.hpp @@ -82,70 +82,48 @@ class qnn_interface { // QnnBackend DEFINE_SHIM_FUNCTION_INTERFACE(backend_create, backendCreate); - DEFINE_SHIM_FUNCTION_INTERFACE(backend_free, backendFree); - DEFINE_SHIM_FUNCTION_INTERFACE(backend_register_op_package, backendRegisterOpPackage); - DEFINE_SHIM_FUNCTION_INTERFACE(backend_validate_op_config, backendValidateOpConfig); - 
DEFINE_SHIM_FUNCTION_INTERFACE(backend_get_api_version, backendGetApiVersion); + // QnnDevice DEFINE_SHIM_FUNCTION_INTERFACE(device_create, deviceCreate); - DEFINE_SHIM_FUNCTION_INTERFACE(device_free, deviceFree); - DEFINE_SHIM_FUNCTION_INTERFACE(device_get_infrastructure, deviceGetInfrastructure); - DEFINE_SHIM_FUNCTION_INTERFACE(device_get_platform_info, deviceGetPlatformInfo); - DEFINE_SHIM_FUNCTION_INTERFACE(device_free_platform_info, deviceFreePlatformInfo); - DEFINE_SHIM_FUNCTION_INTERFACE(device_get_info, deviceGetInfo); // QnnContext DEFINE_SHIM_FUNCTION_INTERFACE(context_create, contextCreate); - DEFINE_SHIM_FUNCTION_INTERFACE(context_get_binary_size, contextGetBinarySize); - DEFINE_SHIM_FUNCTION_INTERFACE(context_get_binary, contextGetBinary); - DEFINE_SHIM_FUNCTION_INTERFACE(context_create_from_binary, contextCreateFromBinary); - DEFINE_SHIM_FUNCTION_INTERFACE(context_free, contextFree); // QnnGraph DEFINE_SHIM_FUNCTION_INTERFACE(graph_create, graphCreate); - DEFINE_SHIM_FUNCTION_INTERFACE(graph_add_node, graphAddNode); - DEFINE_SHIM_FUNCTION_INTERFACE(graph_finalize, graphFinalize); - DEFINE_SHIM_FUNCTION_INTERFACE(graph_execute, graphExecute); - DEFINE_SHIM_FUNCTION_INTERFACE(graph_retrieve, graphRetrieve); // QnnLog DEFINE_SHIM_FUNCTION_INTERFACE(log_create, logCreate); - DEFINE_SHIM_FUNCTION_INTERFACE(log_free, logFree); - DEFINE_SHIM_FUNCTION_INTERFACE(log_set_log_level, logSetLogLevel); // QnnProfile DEFINE_SHIM_FUNCTION_INTERFACE(profile_create, profileCreate); - + DEFINE_SHIM_FUNCTION_INTERFACE(profile_set_config, profileSetConfig); DEFINE_SHIM_FUNCTION_INTERFACE(profile_get_events, profileGetEvents); - DEFINE_SHIM_FUNCTION_INTERFACE(profile_get_sub_events, profileGetSubEvents); - DEFINE_SHIM_FUNCTION_INTERFACE(profile_get_event_data, profileGetEventData); - DEFINE_SHIM_FUNCTION_INTERFACE(profile_free, profileFree); // QnnMem DEFINE_SHIM_FUNCTION_INTERFACE(mem_register, memRegister); - DEFINE_SHIM_FUNCTION_INTERFACE(mem_de_register, memDeRegister); // QnnProperty @@ -153,7 +131,6 @@ class qnn_interface { // QnnTensor DEFINE_SHIM_FUNCTION_INTERFACE(tensor_create_context_tensor, tensorCreateContextTensor); - DEFINE_SHIM_FUNCTION_INTERFACE(tensor_create_graph_tensor, tensorCreateGraphTensor); uint32_t get_backend_id() const { return _qnn_interface.backendId; } @@ -169,18 +146,20 @@ class qnn_interface { #pragma GCC diagnostic pop +using qnn_interface_ptr = std::shared_ptr; + class qnn_instance { public: using BackendIdType = decltype(QnnInterface_t{}.backendId); - explicit qnn_instance(const std::string & lib_path, const std::string & backend_lib_name); + explicit qnn_instance(const std::string & lib_path, QNNBackend device); ~qnn_instance() {} int qnn_init(const QnnSaver_Config_t ** saver_config); int qnn_finalize(); - std::shared_ptr get_qnn_interface() { + qnn_interface_ptr get_qnn_interface() { if (!_qnn_interface) { QNN_LOG_WARN("pls check why _qnn_interface is not loaded\n"); } @@ -189,8 +168,6 @@ class qnn_instance { Qnn_LogHandle_t get_qnn_log_handle() { return _qnn_log_handle; } - Qnn_ProfileHandle_t get_qnn_profile_handle() { return _qnn_profile_handle; } - Qnn_DeviceHandle_t get_qnn_device_handle() { return _qnn_device_handle; } Qnn_BackendHandle_t get_qnn_backend_handle() { return _qnn_backend_handle; } @@ -256,7 +233,7 @@ class qnn_instance { } int set_high_performance_mode() { - if (nullptr == _qnn_htp_perfinfra) { + if (!_qnn_htp_perfinfra) { QNN_LOG_WARN("perf intra is null\n"); return 1; } @@ -425,29 +402,20 @@ class qnn_instance { std::string 
_backend_lib_name; BackendIdType _backend_id; - QnnLog_Level_t _qnn_log_level = QNN_LOG_LEVEL_DEBUG; - #ifdef NDEBUG - qnn::sdk_profile_level _profile_level = qnn::sdk_profile_level::profile_off; + QnnLog_Level_t _qnn_log_level = QNN_LOG_LEVEL_INFO; // TODO: should we consider changing this dynamically? #else - qnn::sdk_profile_level _profile_level = qnn::sdk_profile_level::profile_detail; + QnnLog_Level_t _qnn_log_level = QNN_LOG_LEVEL_DEBUG; #endif std::shared_ptr _qnn_sys_interface; std::shared_ptr _qnn_interface; - Qnn_GraphHandle_t _qnn_graph_handle = nullptr; - - Qnn_LogHandle_t _qnn_log_handle = nullptr; - - Qnn_ProfileHandle_t _qnn_profile_handle = nullptr; - - Qnn_DeviceHandle_t _qnn_device_handle = nullptr; - - Qnn_BackendHandle_t _qnn_backend_handle = nullptr; - - Qnn_ContextHandle_t _qnn_context_handle = nullptr; - + Qnn_GraphHandle_t _qnn_graph_handle = nullptr; + Qnn_LogHandle_t _qnn_log_handle = nullptr; + Qnn_DeviceHandle_t _qnn_device_handle = nullptr; + Qnn_BackendHandle_t _qnn_backend_handle = nullptr; + Qnn_ContextHandle_t _qnn_context_handle = nullptr; QnnHtpDevice_PerfInfrastructure_t * _qnn_htp_perfinfra = nullptr; uint32_t _qnn_power_configid = 1; @@ -473,4 +441,22 @@ class qnn_instance { qnn::qcom_socinfo _soc_info = {}; }; +using qnn_instance_ptr = std::shared_ptr; + +struct device_caps { + const char * lib_name; + enum ggml_backend_dev_type type; + + // TODO: should we get this from device? + uint64_t supported_types; + + // TODO: should we merge this with supported_types? + uint64_t cpu_preprocess_types; + + // TODO: should we get this from device? + size_t max_tensor_size_in_bytes; +}; + +const device_caps & get_device_caps(QNNBackend device); + } // namespace qnn diff --git a/ggml/src/ggml-qnn/qnn-types.hpp b/ggml/src/ggml-qnn/qnn-types.hpp index 8284036bb7..957f8b681f 100644 --- a/ggml/src/ggml-qnn/qnn-types.hpp +++ b/ggml/src/ggml-qnn/qnn-types.hpp @@ -8,15 +8,6 @@ #include "System/QnnSystemInterface.h" namespace qnn { -// ================================================================================================= -// -// helper data type / data structure / macros / functions of -// Qualcomm QNN(Qualcomm Neural Network, aka Qualcomm AI Engine Direct) SDK -// ref: -// https://github.com/pytorch/executorch/blob/ae3d558d5e6aa04fc52a3065399fe6a773702f52/backends/qualcomm/serialization/qc_schema.py#L53 -// https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/overview.html#supported-snapdragon-devices -// ================================================================================================= -enum sdk_profile_level { profile_off = 0, profile_basic, profile_detail }; enum qcom_htp_arch { NONE = 0, @@ -29,12 +20,15 @@ enum qcom_htp_arch { enum qcom_chipset { UNKNOWN_SM = 0, + SM8350 = 30, // v68, SD 888/888+ SM8450 = 36, // v69, SD 8 Gen 1 + SA8295 = 39, // v68 SM8475 = 42, // v69, SD 8+ Gen 1 SM8550 = 43, // v73, SD 8 Gen 2 SSG2115P = 46, // v73 + SM7675 = 70, // V73, SD 7+ Gen 3 + SM8635 = 68, // v73, SD 8s Gen 3 SM8650 = 57, // v75, SD 8 Gen 3 - SA8295 = 39, // v68 SM8750 = 69, // v79, SD 8 Gen 4 }; diff --git a/ggml/src/ggml-qnn/tensor.hpp b/ggml/src/ggml-qnn/tensor.hpp index 660223caf7..608a80fcf5 100644 --- a/ggml/src/ggml-qnn/tensor.hpp +++ b/ggml/src/ggml-qnn/tensor.hpp @@ -25,8 +25,7 @@ class ggml_qnn_tensor : public std::enable_shared_from_this { explicit ggml_qnn_tensor(tensor_type_t tensor_type, const std::string & name, const qnn_dimension_array_t & dimensions, Qnn_DataType_t data_type, int rank, - QNNBackend device, 
Qnn_GraphHandle_t graph_handle, - std::shared_ptr qnn_instance) : + QNNBackend device, Qnn_GraphHandle_t graph_handle, qnn_instance_ptr qnn_instance) : _tensor_name(name), _device(device), _qnn_instance(qnn_instance), @@ -46,8 +45,7 @@ class ggml_qnn_tensor : public std::enable_shared_from_this { explicit ggml_qnn_tensor(tensor_type_t tensor_type, const std::string & name, const ggml_dimension_array_t & dimensions, ggml_type data_type, int rank, - QNNBackend device, Qnn_GraphHandle_t graph_handle, - std::shared_ptr qnn_instance) : + QNNBackend device, Qnn_GraphHandle_t graph_handle, qnn_instance_ptr qnn_instance) : ggml_qnn_tensor(tensor_type, name, get_internal_dimension(dimensions, rank), qnn_datatype_from_ggml_datatype(data_type), rank, device, graph_handle, qnn_instance) {} @@ -85,7 +83,7 @@ class ggml_qnn_tensor : public std::enable_shared_from_this { auto qnn_interface = _qnn_instance->get_qnn_interface(); auto error = qnn_interface->qnn_tensor_create_graph_tensor(_graph_handle, &qnn_tensor); if (error != QNN_SUCCESS) { - QNN_LOG_WARN("[%s]allocate id failed , error: %d\n", _tensor_name.c_str(), (int) error); + QNN_LOG_ERROR("[%s]allocate id failed, error: %s\n", _tensor_name.c_str(), get_qnn_error_string(error)); return false; } @@ -95,7 +93,7 @@ class ggml_qnn_tensor : public std::enable_shared_from_this { return true; } - bool bind_ggml_tensor(ggml_tensor * tensor) { + bool bind_ggml_tensor(ggml_tensor * tensor, qnn_buffer_ptr buffer) { if (!_can_unbind) { QNN_LOG_DEBUG("[%s]already has buffer storage, skip bind\n", _tensor_name.c_str()); return true; @@ -111,8 +109,12 @@ class ggml_qnn_tensor : public std::enable_shared_from_this { } #endif - auto buffer = - std::make_shared(reinterpret_cast(tensor->data), ggml_nbytes(tensor)); + if (!buffer) { + buffer = + std::make_shared(reinterpret_cast(tensor->data), ggml_nbytes(tensor)); + QNN_LOG_DEBUG("[%s][%s]attach buffer to tensor(%s), size: %d\n", get_backend_name(_device), + _tensor_name.c_str(), tensor->name, (int) buffer->get_size()); + } if (!bind_buffer_impl(buffer)) { QNN_LOG_WARN("[%s]failed to bind ggml tensor(%s)\n", _tensor_name.c_str(), ggml_get_name(tensor)); return false; @@ -154,7 +156,7 @@ class ggml_qnn_tensor : public std::enable_shared_from_this { } QNN_LOG_DEBUG("[%s][%s]unbind from buffer: %p, size: %d\n", get_backend_name(_device), _tensor_name.c_str(), - (void *) _buffer.get(), (int) _buffer->get_size()); + (void *) _buffer->get_buffer(), (int) _buffer->get_size()); _buffer.reset(); if (_ggml_tensor) { @@ -175,15 +177,19 @@ class ggml_qnn_tensor : public std::enable_shared_from_this { uint32_t get_qnn_tensor_id() const { return QNN_TENSOR_GET_ID(_qnn_tensor); } + const std::string & get_tensor_name() const { return _tensor_name; } + private: bool bind_buffer_impl(qnn_buffer_ptr buffer) { if (_buffer) { if (_buffer != buffer) { - QNN_LOG_WARN("[%s]has been bound to another buffer %p\n", _tensor_name.c_str(), (void *) _buffer.get()); + QNN_LOG_WARN("[%s]has been bound to another buffer %p\n", _tensor_name.c_str(), + (void *) _buffer->get_buffer()); return false; } - QNN_LOG_DEBUG("[%s]already bound to same ggml tensor %p\n", _tensor_name.c_str(), (void *) _buffer.get()); + QNN_LOG_DEBUG("[%s]already bound to same ggml tensor %p\n", _tensor_name.c_str(), + (void *) _buffer->get_buffer()); return true; } @@ -221,8 +227,8 @@ class ggml_qnn_tensor : public std::enable_shared_from_this { QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_RAW); Qnn_ClientBuffer_t client_buf = { buffer->get_buffer(), (uint32_t) 
buffer->get_size() }; QNN_TENSOR_SET_CLIENT_BUF(_qnn_tensor, client_buf); - QNN_LOG_DEBUG("[%s]use client buffer %p size %d\n", _tensor_name.c_str(), client_buf.data, - (int) client_buf.dataSize); + QNN_LOG_DEBUG("[%s][%s]use client buffer %p size %d\n", get_backend_name(_device), _tensor_name.c_str(), + client_buf.data, (int) client_buf.dataSize); } _buffer = buffer; @@ -233,7 +239,7 @@ class ggml_qnn_tensor : public std::enable_shared_from_this { } QNN_LOG_DEBUG("[%s][%s]bind to buffer: %p, size: %d\n", get_backend_name(_device), _tensor_name.c_str(), - (void *) buffer.get(), (int) buffer->get_size()); + (void *) buffer->get_buffer(), (int) buffer->get_size()); return true; } @@ -246,10 +252,11 @@ class ggml_qnn_tensor : public std::enable_shared_from_this { if (_rpc_buffer) { memcpy(_rpc_buffer->get_buffer(), _buffer->get_buffer(), _buffer->get_size()); + // For CPU and GPU, the data is already in the tensor. + QNN_LOG_DEBUG("[%s][%s]write buffer(%p) to rpc buffer(%p)\n", get_backend_name(_device), + _tensor_name.c_str(), (void *) _buffer->get_buffer(), (void *) _rpc_buffer->get_buffer()); } - // For CPU and GPU, the data is already in the tensor. - QNN_LOG_DEBUG("[%s][%s]write tensor to qnn\n", get_backend_name(_device), _tensor_name.c_str()); return true; } @@ -262,10 +269,11 @@ class ggml_qnn_tensor : public std::enable_shared_from_this { if (_rpc_buffer) { memcpy(_buffer->get_buffer(), _rpc_buffer->get_buffer(), _buffer->get_size()); + // For CPU and GPU, the data is already in the tensor. + QNN_LOG_DEBUG("[%s][%s]read buffer(%p) from rpc buffer(%p)\n", get_backend_name(_device), + _tensor_name.c_str(), (void *) _buffer->get_buffer(), (void *) _rpc_buffer->get_buffer()); } - // For CPU and GPU, the data is already in the tensor. - QNN_LOG_DEBUG("[%s][%s]read tensor from qnn\n", get_backend_name(_device), _tensor_name.c_str()); return true; } @@ -298,8 +306,8 @@ class ggml_qnn_tensor : public std::enable_shared_from_this { break; } QNN_TENSOR_SET_TYPE(_qnn_tensor, new_tensor_type); - QNN_LOG_DEBUG("[%s][%s]tensor changed to type %d\n", get_backend_name(_device), _tensor_name.c_str(), - new_tensor_type); + QNN_LOG_DEBUG("[%s][%s]new_tensor_type %s\n", get_backend_name(_device), _tensor_name.c_str(), + get_qnn_tensor_type_name(new_tensor_type)); } bool should_use_mem_handle() const { @@ -307,16 +315,16 @@ class ggml_qnn_tensor : public std::enable_shared_from_this { return false; } - std::string _tensor_name; - qnn_buffer_ptr _buffer; - bool _can_unbind = true; - QNNBackend _device; - std::shared_ptr _qnn_instance; - Qnn_Tensor_t _qnn_tensor = qnn_tensor_init(kDefaultQnnTensorVersion); - qnn_dimension_array_t _dimensions = {}; - Qnn_GraphHandle_t _graph_handle = nullptr; - qnn_buffer_ptr _rpc_buffer; - ggml_tensor * _ggml_tensor = nullptr; + std::string _tensor_name; + qnn_buffer_ptr _buffer; + bool _can_unbind = true; + QNNBackend _device; + qnn_instance_ptr _qnn_instance; + Qnn_Tensor_t _qnn_tensor = qnn_tensor_init(kDefaultQnnTensorVersion); + qnn_dimension_array_t _dimensions = {}; + Qnn_GraphHandle_t _graph_handle = nullptr; + qnn_buffer_ptr _rpc_buffer; + ggml_tensor * _ggml_tensor = nullptr; DISABLE_COPY(ggml_qnn_tensor); DISABLE_MOVE(ggml_qnn_tensor); @@ -340,13 +348,33 @@ inline int get_ggml_tensors_max_rank(const qnn::ggml_tensor_array_t & tensors) { return max_rank; } +inline bool bind_tensors_with_custom_buffers(const ggml_tensor_array_t & ggml_tensors, + std::vector & buffers, + qnn_tensor_array_t & tensor_wrappers, + std::vector & qnn_tensors) { + 
GGML_ASSERT(tensor_wrappers.size() == ggml_tensors.size()); + GGML_ASSERT(buffers.size() == ggml_tensors.size()); + qnn_tensors.resize(ggml_tensors.size()); + for (size_t i = 0; i < ggml_tensors.size(); i++) { + auto * ggml_tensor = ggml_tensors[i]; + if (!tensor_wrappers[i]->bind_ggml_tensor(ggml_tensor, buffers[i])) { + QNN_LOG_ERROR("bind tensor %s failed\n", ggml_get_name(ggml_tensor)); + return false; + } + + qnn_tensors[i] = tensor_wrappers[i]->get_qnn_tensor(); + } + + return true; +} + inline bool bind_tensors(const ggml_tensor_array_t & ggml_tensors, qnn_tensor_array_t & tensor_wrappers, std::vector & qnn_tensors) { GGML_ASSERT(tensor_wrappers.size() == ggml_tensors.size()); qnn_tensors.resize(ggml_tensors.size()); for (size_t i = 0; i < ggml_tensors.size(); i++) { auto * ggml_tensor = ggml_tensors[i]; - if (!tensor_wrappers[i]->bind_ggml_tensor(ggml_tensor)) { + if (!tensor_wrappers[i]->bind_ggml_tensor(ggml_tensor, qnn_buffer_ptr())) { QNN_LOG_ERROR("bind tensor %s failed\n", ggml_get_name(ggml_tensor)); return false; } @@ -361,7 +389,7 @@ inline bool bind_tensors(const ggml_tensor_array_t & ggml_tensors, qnn_tensor_ar GGML_ASSERT(tensor_wrappers.size() == ggml_tensors.size()); for (size_t i = 0; i < ggml_tensors.size(); i++) { auto * ggml_tensor = ggml_tensors[i]; - if (!tensor_wrappers[i]->bind_ggml_tensor(ggml_tensor)) { + if (!tensor_wrappers[i]->bind_ggml_tensor(ggml_tensor, qnn_buffer_ptr())) { QNN_LOG_ERROR("bind tensor %s failed\n", ggml_get_name(ggml_tensor)); return false; } diff --git a/ggml/src/ggml-qnn/utils.cpp b/ggml/src/ggml-qnn/utils.cpp index f9178f90d5..9696101b8b 100644 --- a/ggml/src/ggml-qnn/utils.cpp +++ b/ggml/src/ggml-qnn/utils.cpp @@ -178,8 +178,8 @@ const char * get_ggml_type_name(ggml_type type) { return traits->type_name; } -const char * get_backend_name(QNNBackend device_index) { - switch (device_index) { +const char * get_backend_name(QNNBackend device) { + switch (device) { case QNN_BACKEND_CPU: return "qnn-cpu"; case QNN_BACKEND_GPU: @@ -192,18 +192,65 @@ const char * get_backend_name(QNNBackend device_index) { } } -const char * get_chipset_desc(uint32_t chipset_id) { - switch (chipset_id) { +const char * get_backend_desc(QNNBackend device) { + switch (device) { + case QNN_BACKEND_CPU: + return "CPU"; + case QNN_BACKEND_GPU: + return "Adreno GPU"; + case QNN_BACKEND_NPU: + return "Hexagon NPU"; + case QNN_BACKEND_COUNT: + default: + return "unknown"; + } +} + +const char * get_chipset_desc(uint32_t soc_model) { + switch (soc_model) { + case SM8350: + return "Snapdragon 888/888+"; case SM8450: - return "SD 8 Gen 1 (SM8450)"; + return "Snapdragon 8 Gen 1"; case SM8475: - return "SD 8+ Gen 1 (SM8475)"; + return "Snapdragon 8 Gen 1+"; case SM8550: - return "SD 8 Gen 2 (SM8550)"; + return "Snapdragon 8 Gen 2"; + case SM7675: + return "Snapdragon 7+ Gen 3"; + case SM8635: + return "Snapdragon 8s Gen 3"; case SM8650: - return "SD 8 Gen 3 (SM8650)"; + return "Snapdragon 8 Gen 3"; case SM8750: - return "SD 8 Gen 4 (SM8750)"; + return "Snapdragon 8 Elite"; + default: + return "unknown"; + } +} + +const char * get_chipset_model(uint32_t soc_model) { + switch (soc_model) { + case SM8350: + return "SM8350"; + case SM8450: + return "SM8450"; + case SA8295: + return "SA8295"; + case SM8475: + return "SM8475"; + case SM8550: + return "SM8550"; + case SSG2115P: + return "SSG2115P"; + case SM7675: + return "SM7675"; + case SM8635: + return "SM8635"; + case SM8650: + return "SM8650"; + case SM8750: + return "SM8750"; default: return "unknown"; } @@ -212,15 +259,15 
@@ const char * get_chipset_desc(uint32_t chipset_id) { const char * get_htparch_desc(size_t htp_arch) { switch (htp_arch) { case V68: - return "QCOM_HTP_V68"; + return "HTP_V68"; case V69: - return "QCOM_HTP_V69"; + return "HTP_V69"; case V73: - return "QCOM_HTP_V73"; + return "HTP_V73"; case V75: - return "QCOM_HTP_V75"; + return "HTP_V75"; case V79: - return "QCOM_HTP_V79"; + return "HTP_V79"; default: return "unknown"; } @@ -234,6 +281,29 @@ uint32_t get_ggml_tensor_data_size(const ggml_tensor * tensor) { return (uint32_t) ggml_nbytes(tensor); } +const char * get_qnn_tensor_type_name(Qnn_TensorType_t type) { + switch (type) { + case QNN_TENSOR_TYPE_APP_WRITE: + return "QNN_TENSOR_TYPE_APP_WRITE"; + case QNN_TENSOR_TYPE_APP_READ: + return "QNN_TENSOR_TYPE_APP_READ"; + case QNN_TENSOR_TYPE_APP_READWRITE: + return "QNN_TENSOR_TYPE_APP_READWRITE"; + case QNN_TENSOR_TYPE_STATIC: + return "QNN_TENSOR_TYPE_STATIC"; + case QNN_TENSOR_TYPE_NATIVE: + return "QNN_TENSOR_TYPE_NATIVE"; + case QNN_TENSOR_TYPE_UNDEFINED: + return "QNN_TENSOR_TYPE_UNDEFINED"; + case QNN_TENSOR_TYPE_NULL: + return "QNN_TENSOR_TYPE_NULL"; + default: + break; + } + + return "unknown"; +} + #ifdef _WIN32 static void * _align_alloc(size_t alignment, size_t size) { return _aligned_malloc(size, alignment); @@ -265,14 +335,15 @@ void align_free(void * ptr) { void * page_align_alloc(size_t size) { const size_t alignment = _get_page_size(); size_t size_aligned = align_to_generic(alignment, size); - QNN_LOG_DEBUG("_align_alloc success, alignment: %ld, size: %ld, size_aligned: %ld\n", alignment, size, size_aligned); - void * data = _align_alloc(alignment, size_aligned); + void * data = _align_alloc(alignment, size_aligned); if (!data) { QNN_LOG_WARN("_align_alloc failed, alignment: %ld, size: %ld, size_aligned: %ld\n", alignment, size, size_aligned); return nullptr; } + QNN_LOG_DEBUG("_align_alloc success, alignment: %ld, size: %ld, size_aligned: %ld\n", alignment, size, + size_aligned); return data; } diff --git a/ggml/src/ggml-qnn/utils.hpp b/ggml/src/ggml-qnn/utils.hpp index d6130a3df4..2e55e2f2d8 100644 --- a/ggml/src/ggml-qnn/utils.hpp +++ b/ggml/src/ggml-qnn/utils.hpp @@ -23,11 +23,14 @@ qnn_dimension_array_t get_view_internal_dimension(const ggml_tensor * tensor, si uint32_t get_ggml_tensor_rank(const ggml_tensor * tensor); const char * get_ggml_type_name(ggml_type type); -const char * get_backend_name(QNNBackend device_index); -const char * get_chipset_desc(uint32_t chipset_id); +const char * get_backend_name(QNNBackend device); +const char * get_backend_desc(QNNBackend device); +const char * get_chipset_desc(uint32_t soc_model); +const char * get_chipset_model(uint32_t soc_model); const char * get_htparch_desc(size_t htp_arch); intptr_t align_to(size_t alignment, intptr_t offset); uint32_t get_ggml_tensor_data_size(const ggml_tensor * tensor); +const char * get_qnn_tensor_type_name(Qnn_TensorType_t type); void * page_align_alloc(size_t size); void align_free(void * ptr); @@ -199,48 +202,6 @@ const char * qnn_datatype_to_string(Qnn_DataType_t qnn_type); size_t get_system_total_memory_in_bytes(); size_t get_system_free_memory_in_bytes(); -#if ENABLE_QNNBACKEND_PERF -class qnn_perf { - public: - qnn_perf(const std::string & perf_name) : _perf_name(std::move(perf_name)) {}; - - ~qnn_perf() { info(); } - - qnn_perf() = delete; - qnn_perf(const qnn_perf &) = delete; - qnn_perf & operator=(const qnn_perf &) = delete; - - void start() { _begin_time = ggml_time_us(); } - - void info() { - _end_time = ggml_time_us(); - _duration = 
(_end_time - _begin_time);
-        QNN_LOG_INFO("duration of %s : %lld microseconds\n", _perf_name.c_str(), _duration);
-    }
-
-  private:
-    int64_t _begin_time = 0LL;
-    int64_t _end_time = 0LL;
-    int64_t _duration = 0LL;
-    std::string _perf_name;
-};
-#else
-class qnn_perf {
-  public:
-    qnn_perf(const std::string &) {}
-
-    ~qnn_perf() { info(); }
-
-    qnn_perf() = delete;
-    qnn_perf(const qnn_perf &) = delete;
-    qnn_perf & operator=(const qnn_perf &) = delete;
-
-    void start() {}
-
-    void info() {}
-};
-#endif
-
 } // namespace qnn
 #define QNN_TENSOR_GET_ID(tensor) qnn::get_qnn_tensorid(tensor)
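
Note on the qnn_perf removal above: that class was the old ad-hoc start()/info() timing path in utils.hpp. Below is a minimal sketch of the RAII-style scoped timing that replaces this kind of pairing; the class name, namespace, and log format are illustrative only and are not the backend's actual profiler interface. Timing starts in the constructor and the duration is logged when the object goes out of scope, so a forgotten info() call is no longer possible.

    // Sketch only: a scoped timer in the spirit of what replaces qnn_perf.
    // qnn_sketch::scoped_timer is illustrative, not the backend's real profiler API.
    #include <cstdint>
    #include <string>
    #include <utility>

    #include "ggml.h"      // ggml_time_us()
    #include "logger.hpp"  // assumed location of QNN_LOG_INFO in this backend

    namespace qnn_sketch {

    class scoped_timer {
      public:
        explicit scoped_timer(std::string name) : _name(std::move(name)), _begin_us(ggml_time_us()) {}

        ~scoped_timer() {
            const int64_t duration_us = ggml_time_us() - _begin_us;
            QNN_LOG_INFO("[profiler]%s, duration: %lld us\n", _name.c_str(), (long long) duration_us);
        }

        scoped_timer(const scoped_timer &)             = delete;
        scoped_timer & operator=(const scoped_timer &) = delete;

      private:
        std::string _name;
        int64_t     _begin_us;
    };

    }  // namespace qnn_sketch

    // Usage: the duration is logged automatically when the block is left.
    // {
    //     qnn_sketch::scoped_timer timer("graph_execute");
    //     /* ... build or execute the QNN graph ... */
    // }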
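
Note on the device_caps struct declared in qnn-lib.hpp above: supported_types and cpu_preprocess_types are what gate tensor data types per backend, and max_tensor_size_in_bytes caps individual tensors. The sketch below shows one way such a capability check could be wired up, assuming supported_types is a bitmask indexed by ggml_type; the example entry, its values, and the helper names are assumptions for illustration, not the backend's actual table.

    // Sketch only: field meanings mirror the device_caps struct above, but the
    // example entry and the bit layout of supported_types are assumptions.
    #include <cstddef>
    #include <cstdint>

    #include "ggml.h"
    #include "ggml-backend.h"

    namespace qnn_sketch {

    struct device_caps {
        const char *               lib_name;
        enum ggml_backend_dev_type type;
        uint64_t                   supported_types;           // bit i set => ggml_type i is usable on this device
        uint64_t                   cpu_preprocess_types;      // types converted to float on the host first
        size_t                     max_tensor_size_in_bytes;  // tensors above this size are rejected
    };

    constexpr uint64_t type_bit(ggml_type type) {
        return 1ULL << static_cast<uint64_t>(type);
    }

    // Illustrative entry; real values come from the backend library and SoC.
    constexpr device_caps kExampleCaps = {
        /* lib_name                 */ "libQnnCpu.so",
        /* type                     */ GGML_BACKEND_DEVICE_TYPE_ACCEL,
        /* supported_types          */ type_bit(GGML_TYPE_F32) | type_bit(GGML_TYPE_F16) |
                                       type_bit(GGML_TYPE_Q4_0) | type_bit(GGML_TYPE_Q8_0),
        /* cpu_preprocess_types     */ type_bit(GGML_TYPE_Q4_0) | type_bit(GGML_TYPE_Q8_0),
        /* max_tensor_size_in_bytes */ 1024ULL * 1024 * 1024,
    };

    inline bool device_supports_type(const device_caps & caps, ggml_type type) {
        return (caps.supported_types & type_bit(type)) != 0;
    }

    inline bool needs_cpu_convert(const device_caps & caps, ggml_type type) {
        return (caps.cpu_preprocess_types & type_bit(type)) != 0;
    }

    }  // namespace qnn_sketch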
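
Note on the bind_ggml_tensor signature change above: callers can now pass a qnn_buffer_ptr explicitly, for example a buffer holding data that was already converted to float, while a null pointer falls back to wrapping the ggml tensor's own storage, which is the path bind_tensors takes. A condensed sketch of that fallback pattern follows; buffer_view and resolve_bind_buffer are hypothetical stand-ins, not the backend's real buffer classes.

    // Sketch only: buffer_view and resolve_bind_buffer are illustrative stand-ins
    // for the backend's qnn_buffer_ptr and the wrapper built in bind_ggml_tensor.
    #include <cstddef>
    #include <cstdint>
    #include <memory>

    #include "ggml.h"

    namespace qnn_sketch {

    class buffer_view {
      public:
        buffer_view(uint8_t * data, size_t size) : _data(data), _size(size) {}

        uint8_t * get_buffer() const { return _data; }
        size_t    get_size() const { return _size; }

      private:
        uint8_t * _data;
        size_t    _size;
    };

    using buffer_ptr = std::shared_ptr<buffer_view>;

    // Use the caller-supplied buffer when there is one (already-converted data),
    // otherwise wrap the ggml tensor's own bytes in place.
    inline buffer_ptr resolve_bind_buffer(ggml_tensor * tensor, buffer_ptr buffer) {
        if (!buffer) {
            buffer = std::make_shared<buffer_view>(reinterpret_cast<uint8_t *>(tensor->data),
                                                   ggml_nbytes(tensor));
        }
        return buffer;
    }

    }  // namespace qnn_sketch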