diff --git a/ggml/src/ggml-qnn.cpp b/ggml/src/ggml-qnn.cpp index a41fae6bbb..a4dace7078 100644 --- a/ggml/src/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn.cpp @@ -226,9 +226,8 @@ size_t ggml_backend_qnn_buffer_type_get_alignment(ggml_backend_buffer_type_t buf size_t ggml_backend_qnn_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { GGML_UNUSED(buft); - // TODO: this value is an experimental value, works fine with - // whisper/llm/minicpm-v inference on Android - return (96 * 1024 * 1024); + // TODO: get the max size from device + return (1024 * 1024 * 1024); } bool ggml_backend_qnn_buffer_is_host(ggml_backend_buffer_type_t buft) { @@ -339,6 +338,7 @@ void ggml_backend_qnn_device_get_memory(ggml_backend_dev_t dev, size_t *free, si GGML_UNUSED(dev); *free = qnn::get_system_free_memory_in_bytes(); *total = qnn::get_system_total_memory_in_bytes(); + QNN_LOG_DEBUG("free memory: %ldMB, total memory: %ldMB", (*free / 1048576), (*total) / 1048576); } enum ggml_backend_dev_type ggml_backend_qnn_device_get_type(ggml_backend_dev_t dev) { @@ -374,7 +374,7 @@ ggml_backend_t ggml_backend_qnn_init_with_device_context(ggml_backend_dev_t dev, auto *dev_ctx = get_device_context(dev); const auto device = dev_ctx->device; - QNN_LOG_DEBUG("device %d", device); + QNN_LOG_DEBUG("device %s", qnn::get_backend_name(device)); QNN_LOG_DEBUG("extend_lib_search_path %s", extend_lib_search_path); std::string path = extend_lib_search_path; @@ -386,7 +386,7 @@ ggml_backend_t ggml_backend_qnn_init_with_device_context(ggml_backend_dev_t dev, "dsp:/vendor/dsp/images") .c_str(), 1) == 0) { - QNN_LOG_INFO("QNN NPU backend setenv successfully"); + QNN_LOG_DEBUG("QNN NPU backend setenv successfully"); } else { QNN_LOG_ERROR("QNN NPU backend setenv failure"); } @@ -395,13 +395,13 @@ ggml_backend_t ggml_backend_qnn_init_with_device_context(ggml_backend_dev_t dev, "rfsa/adsp;/vendor/dsp/dsp;/vendor/dsp/images;/dsp") .c_str(), 1) == 0) { - QNN_LOG_INFO("QNN NPU backend setenv successfully"); + QNN_LOG_DEBUG("QNN NPU backend setenv successfully"); } else { QNN_LOG_ERROR("QNN NPU backend setenv failure"); } } else { if (setenv("LD_LIBRARY_PATH", path.c_str(), 1) == 0) { - QNN_LOG_INFO("%s backend setenv successfully\n", qnn::get_backend_name(device)); + QNN_LOG_DEBUG("%s backend setenv successfully\n", qnn::get_backend_name(device)); } else { QNN_LOG_ERROR("%s backend setenv failure\n", qnn::get_backend_name(device)); } @@ -454,6 +454,7 @@ ggml_backend_buffer_t ggml_backend_qnn_device_buffer_from_ptr(ggml_backend_dev_t } bool ggml_backend_qnn_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor *op) { + // Note that this function could be called before the device context is initialized auto *device_ctx = get_device_context(dev); return qnn::ggml_qnn_supports_op(device_ctx, op); } @@ -495,13 +496,15 @@ struct ggml_backend_qnn_reg_impl : ggml_backend_reg { context = this; iface = interface; + QNN_LOG_DEBUG("qnn backend registry init"); for (int i = 0; i < GGML_QNN_MAX_DEVICES; i++) { const auto device_enum = (QNNBackend)(GGML_QNN_MAX_DEVICES - 1 - i); // init from the last device, i.e. NPU device_contexts[i] = std::make_unique( /* .device = */ device_enum, // init from the last device, i.e. 
NPU /* .threads = */ 1, /* .name = */ qnn::get_backend_name(device_enum), - /* .lib_name = */ kDeviceCaps[device_enum].lib_name); + /* .lib_name = */ kDeviceCaps[device_enum].lib_name, + /* .supported_types = */ kDeviceCaps[device_enum].supported_types); auto &device = devices[i]; device.iface = ggml_backend_qnn_device_interface; diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index 5643a74631..da0480df7f 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -543,14 +543,17 @@ bool ggml_qnn_supports_tensor(ggml_backend_qnn_device_context *ctx, const ggml_t return false; } +#ifndef NDEBUG auto *type_name = ggml_get_type_traits(tensor->type)->type_name; +#endif switch (tensor->type) { case GGML_TYPE_F32: case GGML_TYPE_F16: case GGML_TYPE_Q8_0: case GGML_TYPE_Q4_0: if (!(ctx->supported_types & (1 << tensor->type))) { - QNN_LOG_DEBUG("unsupported data type %s for backend %d", type_name, (int)ctx->device); + QNN_LOG_DEBUG("unsupported data type %s for backend %s, supported_types: 0x%x", type_name, + qnn::get_backend_name(ctx->device), ctx->supported_types); return false; } break; @@ -563,25 +566,42 @@ bool ggml_qnn_supports_tensor(ggml_backend_qnn_device_context *ctx, const ggml_t } bool ggml_qnn_supports_matmul_op(ggml_backend_qnn_device_context *ctx, const ggml_tensor *op) { - GGML_UNUSED(ctx); - auto *src0 = op->src[0]; auto *src1 = op->src[1]; - if (src0->type != src1->type || src0->type != op->type) { - // current qnn implementation only supports the same type for src0 and src1 - QNN_LOG_DEBUG("src0 type %d and src1 type %d and op type %d are not equal", src0->type, src1->type, op->type); - return false; - } - - if (src0->ne[2] != src1->ne[2] || src0->ne[3] != src1->ne[3]) { - /* - * TODO: remove the blocker here when qnn backend supports mul_mat like this: - * [ne03, ne02, n, k] * [ne03 * x, ne02 * y, m, k] -> [ne03 * x, ne02 * y, m, n] - */ - QNN_LOG_DEBUG("src0 and src1 dimensions are not equal"); + switch (ctx->device) { + case QNN_BACKEND_NPU: + if (src1->ne[2] != src0->ne[2] || src1->ne[3] != src0->ne[3]) { + /* + * TODO: remove the blocker here when NPU backend supports mul_mat like this: + * [ne03, ne02, n, k] * [ne03 * x, ne02 * y, m, k] -> [ne03 * x, ne02 * y, m, n] + */ + QNN_LOG_DEBUG("[qnn-npu] src0 and src1 dimensions are not equal, support/unsupported: %d/%d", + ctx->support_op_count.load(), ++(ctx->unsupported_op_count)); + return false; + } + // fall through, from test here, the convert op is super slow on NPU: + // https://github.com/usefulsensors/qc_npu_benchmark + case QNN_BACKEND_GPU: + if (src0->type != src1->type || src0->type != op->type) { + // there's no convert op for GPU. 
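+            // (note: the NPU case above falls through to this same check, so src0, src1 and dst
+            //  must all share one data type for these backends)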
+ QNN_LOG_DEBUG("[qnn-gpu]type src0(%d), src1(%d) and op(%d) are not equal, support/unsupported: %d/%d", + src0->type, src1->type, op->type, ctx->support_op_count.load(), + ++(ctx->unsupported_op_count)); + return false; + } + break; + default: + break; + } + + if ((src1->ne[2] % src0->ne[2]) != 0 || (src1->ne[3] % src0->ne[3]) != 0) { + QNN_LOG_DEBUG("[%s] src0 and src1 dimensions are not equal, support/unsupported: %d/%d", + qnn::get_backend_name(ctx->device), ctx->support_op_count.load(), ++(ctx->unsupported_op_count)); return false; } + QNN_LOG_DEBUG("[%s] supported matmul op, support/unsupported: %d/%d", qnn::get_backend_name(ctx->device), + ++(ctx->support_op_count), ctx->unsupported_op_count.load()); return true; } @@ -590,6 +610,7 @@ bool ggml_qnn_supports_matmul_op(ggml_backend_qnn_device_context *ctx, const ggm namespace qnn { bool ggml_qnn_supports_op(ggml_backend_qnn_device_context *ctx, const ggml_tensor *op) { + // Note that this function could be called before the device context is initialized if (op->op == GGML_OP_NONE) { return true; } diff --git a/ggml/src/ggml-qnn/backend.hpp b/ggml/src/ggml-qnn/backend.hpp index aaced22727..17823ed577 100644 --- a/ggml/src/ggml-qnn/backend.hpp +++ b/ggml/src/ggml-qnn/backend.hpp @@ -1,6 +1,10 @@ #pragma once +#ifndef NDEBUG +#include +#endif + #include #include #include @@ -25,7 +29,7 @@ struct ggml_backend_qnn_device_context { std::string name; std::string lib_name; - // initialize in init + // initialize in qnn init qnn::qcom_socinfo socinfo = {}; uint64_t supported_types; std::shared_ptr instance; @@ -33,7 +37,12 @@ struct ggml_backend_qnn_device_context { qnn::ggml_qnn_graph_cache_t qnn_graph_cache; - explicit ggml_backend_qnn_device_context(QNNBackend device, size_t threads, const char *name, - const char *lib_name) : - device(device), threads(threads), name(name), lib_name(lib_name) {} +#ifndef NDEBUG + std::atomic_uint32_t support_op_count = 0; + std::atomic_uint32_t unsupported_op_count = 0; +#endif + + explicit ggml_backend_qnn_device_context(QNNBackend device, size_t threads, const char *name, const char *lib_name, + uint64_t supported_types) + : device(device), threads(threads), name(name), lib_name(lib_name), supported_types(supported_types) {} }; diff --git a/ggml/src/ggml-qnn/graph.hpp b/ggml/src/ggml-qnn/graph.hpp index 858a7d3af2..1b0dcd78fa 100644 --- a/ggml/src/ggml-qnn/graph.hpp +++ b/ggml/src/ggml-qnn/graph.hpp @@ -17,9 +17,9 @@ namespace qnn { class ggml_qnn_graph { public: explicit ggml_qnn_graph(const std::string &graph_name, QNNBackend device, - std::shared_ptr qnn_instance, size_t vtcm_size_in_mb) : - _graph_name(graph_name), _device(device), _qnn_instance(qnn_instance) { - QNN_LOG_INFO("[%s]create", graph_name.c_str()); + std::shared_ptr qnn_instance, size_t vtcm_size_in_mb) + : _graph_name(graph_name), _device(device), _qnn_instance(qnn_instance) { + QNN_LOG_DEBUG("[%s][%s]created", get_backend_name(device), graph_name.c_str()); auto qnn_interface = qnn_instance->get_qnn_interface(); auto qnn_context = qnn_instance->get_qnn_context_handle(); @@ -56,24 +56,25 @@ public: graph_vtcm_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; graph_vtcm_config.customConfig = &vtcm_config; - const QnnGraph_Config_t *graph_configs[] = { &graph_hvx_config, &graph_dlbc_config, &graph_vtcm_config, - &graph_opt_config, nullptr }; + const QnnGraph_Config_t *graph_configs[] = {&graph_hvx_config, &graph_dlbc_config, &graph_vtcm_config, + &graph_opt_config, nullptr}; error = qnn_interface->qnn_graph_create(qnn_context, 
graph_name.c_str(), graph_configs, &graph_handle); } else { error = qnn_interface->qnn_graph_create(qnn_context, graph_name.c_str(), nullptr, &graph_handle); } if (error != QNN_SUCCESS) { - QNN_LOG_INFO("[%s]can't create qnn graph handle, error = %d\n", graph_name.c_str(), error); + QNN_LOG_ERROR("[%s][%s]failed to create qnn graph, error: %s\n", get_backend_name(device), + graph_name.c_str(), get_qnn_error_string(error)); return; } - QNN_LOG_INFO("[%s]create succeed\n", graph_name.c_str()); + QNN_LOG_INFO("[%s][%s]create succeed\n", get_backend_name(device), graph_name.c_str()); _graph_handle = graph_handle; _qnn_interface = qnn_interface; } - ~ggml_qnn_graph() { QNN_LOG_DEBUG("[%s]destroy", _graph_name.c_str()); } + ~ggml_qnn_graph() { QNN_LOG_DEBUG("[%s][%s]destroy", get_backend_name(_device), _graph_name.c_str()); } bool build_graph(ggml_op_constructor_t op_constructor, const ggml_tensor_array_t &tensor_inputs, const ggml_tensor_array_t &tensor_outputs) { @@ -83,10 +84,10 @@ public: return false; } - QNN_LOG_DEBUG("[%s]build_graph start", _graph_name.c_str()); + QNN_LOG_DEBUG("[%s][%s]build_graph start", get_backend_name(_device), _graph_name.c_str()); _op_config = op_constructor(_graph_name, _qnn_instance); - if (!_op_config->create_tensors(_device, _graph_handle, tensor_inputs, tensor_outputs)) { - QNN_LOG_ERROR("[%s]create_tensors failed\n", _graph_name.c_str()); + if (!_op_config->initialize_op_nodes(_device, _graph_handle, tensor_inputs, tensor_outputs)) { + QNN_LOG_ERROR("[%s][%s]initialize_op_nodes failed", get_backend_name(_device), _graph_name.c_str()); return false; } @@ -97,27 +98,23 @@ public: auto error = _qnn_interface->qnn_graph_finalize(_graph_handle, nullptr, nullptr); if (error != QNN_SUCCESS) { - auto *error_str = get_qnn_error_string(error); - if (error_str) { - QNN_LOG_ERROR("[%s]qnn_graph_finalize.error: %s\n", _graph_name.c_str(), error_str); - } else { - QNN_LOG_ERROR("[%s]qnn_graph_finalize.error: %d\n", _graph_name.c_str(), error); - } + QNN_LOG_ERROR("[%s][%s]qnn_graph_finalize.error: %s", get_backend_name(_device), _graph_name.c_str(), + get_qnn_error_string(error)); return false; } - QNN_LOG_DEBUG("[%s]build_graph succeed", _graph_name.c_str()); + QNN_LOG_DEBUG("[%s][%s]build_graph succeed", get_backend_name(_device), _graph_name.c_str()); return true; } bool execute(const ggml_tensor_array_t &tensor_inputs, const ggml_tensor_array_t &tensor_outputs) { if (!_op_config->bind_input_tensors(tensor_inputs)) { - QNN_LOG_ERROR("[%s]bind input tensors failed\n", _graph_name.c_str()); + QNN_LOG_ERROR("[%s][%s]bind input tensors failed\n", get_backend_name(_device), _graph_name.c_str()); return false; } if (!_op_config->bind_output_tensors(tensor_outputs)) { - QNN_LOG_ERROR("[%s]bind output tensors failed\n", _graph_name.c_str()); + QNN_LOG_ERROR("[%s][%s]bind output tensors failed\n", get_backend_name(_device), _graph_name.c_str()); return false; } @@ -127,20 +124,21 @@ public: auto error = _qnn_interface->qnn_graph_execute(_graph_handle, qnn_tensor_inputs.data(), qnn_tensor_inputs.size(), qnn_tensor_outputs.data(), qnn_tensor_outputs.size(), nullptr, nullptr); - if (_device == QNN_BACKEND_NPU) { - if (error == QNN_COMMON_ERROR_SYSTEM_COMMUNICATION) { - QNN_LOG_WARN("[%s]NPU crashed. SSR detected. 
Caused QNN graph execute error\n", _graph_name.c_str()); - } - } - _op_config->unbind_input_tensors(); _op_config->unbind_output_tensors(); if (error != QNN_SUCCESS) { - QNN_LOG_INFO("[%s]error = %d\n", _graph_name.c_str(), error); + if (_device == QNN_BACKEND_NPU && error == QNN_COMMON_ERROR_SYSTEM_COMMUNICATION) { + QNN_LOG_WARN("[%s][%s]NPU crashed. SSR detected. Caused QNN graph execute error.", + get_backend_name(_device), _graph_name.c_str()); + } else { + QNN_LOG_ERROR("[%s][%s]error: %s", get_backend_name(_device), _graph_name.c_str(), + get_qnn_error_string(error)); + } return false; } + QNN_LOG_DEBUG("[%s][%s]execute succeed", get_backend_name(_device), _graph_name.c_str()); return true; } diff --git a/ggml/src/ggml-qnn/logger.cpp b/ggml/src/ggml-qnn/logger.cpp index 187e9088c7..1e781721d6 100644 --- a/ggml/src/ggml-qnn/logger.cpp +++ b/ggml/src/ggml-qnn/logger.cpp @@ -34,7 +34,7 @@ void qnn::internal_log(ggml_log_level level, const char * /*file*/, const char * } #if ENABLE_QNNSDK_LOG -void qnn::sdk_logcallback(const char *fmt, QnnLog_Level_t level, uint64_t timestamp, va_list argp) { +void qnn::sdk_logcallback(const char *fmt, QnnLog_Level_t level, uint64_t /*timestamp*/, va_list argp) { static std::mutex log_mutex; static unsigned char s_ggml_qnn_logbuf[QNN_LOGBUF_LEN]; @@ -60,13 +60,12 @@ void qnn::sdk_logcallback(const char *fmt, QnnLog_Level_t level, uint64_t timest break; } - double ms = (double)timestamp / 1000000.0; { std::lock_guard lock(log_mutex); memset(s_ggml_qnn_logbuf, 0, QNN_LOGBUF_LEN); vsnprintf(reinterpret_cast(s_ggml_qnn_logbuf), QNN_LOGBUF_LEN, fmt, argp); - QNN_LOG_INFO("%8.1fms [%-7s] %s", ms, log_level_desc, s_ggml_qnn_logbuf); + QNN_LOG_INFO("[%s]%s", log_level_desc, s_ggml_qnn_logbuf); } } #else diff --git a/ggml/src/ggml-qnn/op-config-base.hpp b/ggml/src/ggml-qnn/op-config-base.hpp new file mode 100644 index 0000000000..159944a7d7 --- /dev/null +++ b/ggml/src/ggml-qnn/op-config-base.hpp @@ -0,0 +1,129 @@ +#pragma once + +#include +#include + +#include "ggml-qnn.h" + +#include "qnn-types.hpp" +#include "tensor.hpp" + +namespace qnn { + +using ggml_tensor_array_t = std::vector; + +/** + * @class ggml_qnn_op_config + * @brief Abstract base class for configuring QNN operations. + * + * This class provides an interface for creating and managing tensors, + * adding operations to a graph, and binding/unbinding input and output tensors. + */ +class ggml_qnn_op_config { +public: + virtual ~ggml_qnn_op_config() {} + + /** + * @brief Creates tensors and internal nodes for constructing the calculation graph. + * + * This pure virtual function is responsible for creating tensors on the given + * backend device, associating them with the provided graph handle, and creating + * the internal nodes necessary for constructing the calculation graph. It takes + * input and output tensor arrays as parameters. + * + * @param device The backend device where tensors will be created. + * @param graph_handle The handle to the graph where tensors and nodes will be associated. + * @param tensor_inputs An array of input tensors. + * @param tensor_outputs An array of output tensors. + * @return true if tensors and nodes are successfully created, false otherwise. + */ + virtual bool initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, + const ggml_tensor_array_t &tensor_inputs, + const ggml_tensor_array_t &tensor_outputs) = 0; + + /** + * @brief Pure virtual function to retrieve the input tensors for QNN (Quantized Neural Network). 
+ * + * This function must be overridden by derived classes to provide the specific implementation + * for retrieving the input tensors used in QNN operations. + * + * @return A reference to a vector of Qnn_Tensor_t objects representing the input tensors. + */ + virtual std::vector &get_qnn_input_tensors() = 0; + + /** + * @brief Pure virtual function to retrieve the output tensors of a QNN (Quantized Neural Network). + * + * This function must be overridden by any derived class to provide access to the + * output tensors of the QNN. The function returns a reference to a vector of + * Qnn_Tensor_t objects, which represent the output tensors. + * + * @return std::vector& Reference to a vector of Qnn_Tensor_t objects. + */ + virtual std::vector &get_qnn_output_tensors() = 0; + + /** + * @brief Adds an operation to the given graph. + * + * This pure virtual function must be implemented by derived classes to add + * a specific operation to the provided graph handle. + * + * This function will be called after `initialize_op_nodes` during initialization. + * + * @param graph_handle The handle to the graph where the operation will be added. + * @return true if the operation was successfully added to the graph, false otherwise. + */ + virtual bool add_op_to_graph(Qnn_GraphHandle_t graph_handle) = 0; + + /** + * @brief Binds the input tensors to the operation. + * + * This pure virtual function must be implemented by derived classes to bind + * the provided input tensors to the operation. The function takes a constant + * reference to a ggml_tensor_array_t object, which contains the input tensors + * to be bound. + * + * @param tensor_inputs A constant reference to a ggml_tensor_array_t object + * containing the input tensors. + * @return true if the input tensors were successfully bound, false otherwise. + */ + virtual bool bind_input_tensors(const ggml_tensor_array_t &tensor_inputs) = 0; + + /** + * @brief Binds the output tensors to the given tensor array. + * + * This pure virtual function must be implemented by derived classes to bind + * the output tensors to the provided array of tensors. The function is expected + * to establish the necessary connections or mappings between the output tensors + * and the elements of the given tensor array. + * + * @param tensor_outputs A constant reference to an array of ggml tensors that + * represent the output tensors to be bound. + * @return true if the binding is successful, false otherwise. + */ + virtual bool bind_output_tensors(const ggml_tensor_array_t &tensor_outputs) = 0; + + /** + * @brief Unbinds the input tensors from the operation. + * + * This pure virtual function is intended to be overridden by derived classes + * to implement the logic for unbinding or detaching input tensors that were + * previously bound to the operation. This is typically used to release resources + * or reset the state of the operation. + */ + virtual void unbind_input_tensors() = 0; + + /** + * @brief Unbinds the output tensors. + * + * This pure virtual function is responsible for unbinding or detaching + * the output tensors from their current bindings. Implementations of this + * function should ensure that any resources or references held by the + * output tensors are properly released or reset. 
+ */ + virtual void unbind_output_tensors() = 0; +}; + +using qnn_op_config_ptr_t = std::shared_ptr; + +} // namespace qnn diff --git a/ggml/src/ggml-qnn/op-config.cpp b/ggml/src/ggml-qnn/op-config.cpp index 9b98051adf..df70d548a4 100644 --- a/ggml/src/ggml-qnn/op-config.cpp +++ b/ggml/src/ggml-qnn/op-config.cpp @@ -36,7 +36,7 @@ int get_rank(const qnn::ggml_tensor_array_t &tensor_inputs, const qnn::ggml_tens return tensor_rank; } -Qnn_DataType_t get_tensor_type(const qnn::ggml_qnn_tensor_array_t &tensors) { +Qnn_DataType_t get_tensor_type(const qnn::qnn_tensor_array_t &tensors) { Qnn_DataType_t type = QNN_DATATYPE_UNDEFINED; for (auto tensor : tensors) { auto tensor_type_size = qnn::qnn_datatype_size(tensor->get_data_type()); @@ -59,8 +59,7 @@ struct tensor_common_params { }; void create_tensors_from_ggml_tensor(const tensor_common_params ¶ms, const qnn::ggml_tensor_array_t &ggml_tensors, - qnn::ggml_qnn_tensor_array_t *tensor_wrappers, - std::vector *qnn_tensors) { + qnn::qnn_tensor_array_t *tensor_wrappers, std::vector *qnn_tensors) { using namespace qnn; tensor_wrappers->resize(ggml_tensors.size()); @@ -78,7 +77,7 @@ void create_tensors_from_ggml_tensor(const tensor_common_params ¶ms, const q } } -bool bind_tensors(const qnn::ggml_tensor_array_t &ggml_tensors, qnn::ggml_qnn_tensor_array_t &tensor_wrappers, +bool bind_tensors(const qnn::ggml_tensor_array_t &ggml_tensors, qnn::qnn_tensor_array_t &tensor_wrappers, std::vector &qnn_tensors) { for (size_t i = 0; i < ggml_tensors.size(); i++) { auto *ggml_tensor = ggml_tensors[i]; @@ -99,9 +98,9 @@ public: const std::string &op_type, std::shared_ptr qnn_instance) : ggml_qnn_op_config_base(name, package_name, op_type, qnn_instance) {} - bool create_tensors(QNNBackend device, Qnn_GraphHandle_t graph_handle, - const qnn::ggml_tensor_array_t &tensor_inputs, - const qnn::ggml_tensor_array_t &tensor_outputs) override { + bool initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, + const qnn::ggml_tensor_array_t &tensor_inputs, + const qnn::ggml_tensor_array_t &tensor_outputs) override { GGML_UNUSED(device); GGML_UNUSED(graph_handle); GGML_UNUSED(tensor_inputs); @@ -109,28 +108,28 @@ public: return true; } - void set_input_tensors(qnn::ggml_qnn_tensor_array_t &tensor_inputs) { + void set_input_tensors(qnn::qnn_tensor_array_t &tensor_inputs) { _tensor_inputs = tensor_inputs; _qnn_tensor_inputs.resize(_tensor_inputs.size()); } - void set_input_tensors(qnn::ggml_qnn_tensor_array_t &&tensor_inputs) { + void set_input_tensors(qnn::qnn_tensor_array_t &&tensor_inputs) { _tensor_inputs = std::move(tensor_inputs); _qnn_tensor_inputs.resize(_tensor_inputs.size()); } - void set_output_tensors(qnn::ggml_qnn_tensor_array_t &tensor_outputs) { + void set_output_tensors(qnn::qnn_tensor_array_t &tensor_outputs) { _tensor_outputs = tensor_outputs; _qnn_tensor_outputs.resize(_tensor_outputs.size()); } - void set_output_tensors(qnn::ggml_qnn_tensor_array_t &&tensor_outputs) { + void set_output_tensors(qnn::qnn_tensor_array_t &&tensor_outputs) { _tensor_outputs = std::move(tensor_outputs); _qnn_tensor_outputs.resize(_tensor_outputs.size()); } - qnn::ggml_qnn_tensor_array_t &get_input_tensors() { return _tensor_inputs; } - qnn::ggml_qnn_tensor_array_t &get_output_tensors() { return _tensor_outputs; } + qnn::qnn_tensor_array_t &get_input_tensors() { return _tensor_inputs; } + qnn::qnn_tensor_array_t &get_output_tensors() { return _tensor_outputs; } private: DISABLE_COPY(ggml_qnn_connectable_op_config); @@ -186,7 +185,7 @@ bool 
ggml_qnn_op_config_base::add_op_to_graph(Qnn_GraphHandle_t graph_handle) { GGML_ASSERT(_qnn_tensor_inputs.size() == _tensor_inputs.size()); GGML_ASSERT(_qnn_tensor_outputs.size() == _tensor_outputs.size()); - auto qnn_interface = _qnn_instance->get_qnn_interface(); + QNN_LOG_DEBUG("[%s]add to graph start\n", _name.c_str()); for (size_t i = 0; i < _tensor_inputs.size(); i++) { auto tensor = _tensor_inputs[i]; if (!tensor->alloc_qnn_tensor_id()) { @@ -194,6 +193,7 @@ bool ggml_qnn_op_config_base::add_op_to_graph(Qnn_GraphHandle_t graph_handle) { return false; } + QNN_LOG_DEBUG("[%s]input tensor id: %d\n", _name.c_str(), tensor->get_qnn_tensor_id()); _qnn_tensor_inputs[i] = tensor->get_qnn_tensor(); } @@ -203,21 +203,19 @@ bool ggml_qnn_op_config_base::add_op_to_graph(Qnn_GraphHandle_t graph_handle) { QNN_LOG_ERROR("[%s]output tensor alloc_qnn_tensor_id failed\n", _name.c_str()); return false; } - _qnn_tensor_outputs[i] = _tensor_outputs[i]->get_qnn_tensor(); + + QNN_LOG_DEBUG("[%s]output tensor id: %d\n", _name.c_str(), tensor->get_qnn_tensor_id()); + _qnn_tensor_outputs[i] = tensor->get_qnn_tensor(); } + auto qnn_interface = _qnn_instance->get_qnn_interface(); auto error = qnn_interface->qnn_graph_add_node(graph_handle, get_op_config()); if (error != QNN_SUCCESS) { - auto *error_str = get_qnn_error_string(error); - if (error_str) { - QNN_LOG_ERROR("[%s]qnn_graph_add_node.error: %s\n", _name.c_str(), error_str); - } else { - QNN_LOG_ERROR("[%s]qnn_graph_add_node.error: %d\n", _name.c_str(), error); - } + QNN_LOG_ERROR("[%s]qnn_graph_add_node.error: %s", _name.c_str(), get_qnn_error_string(error)); return false; } - QNN_LOG_DEBUG("[%s]added to graph\n", _name.c_str()); + QNN_LOG_DEBUG("[%s]added to graph succeed\n", _name.c_str()); return true; } @@ -259,9 +257,9 @@ Qnn_OpConfig_t ggml_qnn_op_config_base::get_op_config() { return config; } -bool ggml_qnn_single_op_config::create_tensors(QNNBackend device, Qnn_GraphHandle_t graph_handle, - const ggml_tensor_array_t &tensor_inputs, - const ggml_tensor_array_t &tensor_outputs) { +bool ggml_qnn_single_op_config::initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, + const ggml_tensor_array_t &tensor_inputs, + const ggml_tensor_array_t &tensor_outputs) { const auto tensor_rank = get_rank(tensor_inputs, tensor_outputs); tensor_common_params params = {"src", tensor_rank, true, device, graph_handle, _qnn_instance}; create_tensors_from_ggml_tensor(params, tensor_inputs, &_tensor_inputs, &_qnn_tensor_inputs); @@ -282,9 +280,9 @@ bool ggml_qnn_single_op_config::create_tensors(QNNBackend device, Qnn_GraphHandl return true; } -bool ggml_qnn_matmul_op_config::create_tensors(QNNBackend device, Qnn_GraphHandle_t graph_handle, - const ggml_tensor_array_t &tensor_inputs, - const ggml_tensor_array_t &tensor_outputs) { +bool ggml_qnn_matmul_op_config::initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, + const ggml_tensor_array_t &tensor_inputs, + const ggml_tensor_array_t &tensor_outputs) { GGML_ASSERT(tensor_inputs.size() == 2); GGML_ASSERT(tensor_outputs.size() == 1); const auto tensor_rank = get_rank(tensor_inputs, tensor_outputs); @@ -295,59 +293,143 @@ bool ggml_qnn_matmul_op_config::create_tensors(QNNBackend device, Qnn_GraphHandl create_tensors_from_ggml_tensor(params, tensor_inputs, &_tensor_inputs, &_qnn_tensor_inputs); // create output tensor - ggml_qnn_tensor_array_t mat_mul_tensor_outputs; + qnn_tensor_array_t mat_mul_tensor_outputs; params.name_prefix = "dst"; params.is_input = false; 
create_tensors_from_ggml_tensor(params, tensor_outputs, &mat_mul_tensor_outputs, nullptr); + // create convert nodes + qnn_tensor_array_t mat_mul_tensor_inputs = _tensor_inputs; + if (!create_convert_nodes(device, graph_handle, tensor_rank, mat_mul_tensor_inputs, mat_mul_tensor_outputs)) { + QNN_LOG_ERROR("create convert nodes failed\n"); + return false; + } + + mat_mul_tensor_inputs.front() = create_gather_nodes(device, graph_handle, tensor_rank, _tensor_inputs.front(), + _tensor_inputs.back()->get_dimensions()); + return create_mat_mul_nodes(device, graph_handle, tensor_rank, mat_mul_tensor_inputs, mat_mul_tensor_outputs); +} + +qnn_tensor_ptr_t ggml_qnn_matmul_op_config::create_gather_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, + const int rank, qnn_tensor_ptr_t tensor_input, + qnn_dimension_array_t output_dimensions) { + if (rank <= 2) { + return tensor_input; + } + + const auto &input_dimensions = tensor_input->get_dimensions(); + output_dimensions[rank - 1] = input_dimensions[rank - 1]; + output_dimensions[rank - 2] = input_dimensions[rank - 2]; + + const auto y = output_dimensions[rank - 3] / input_dimensions[rank - 3]; + if (y == 1 && (rank == 3 || (rank == 4 && output_dimensions[rank - 4] == input_dimensions[rank - 4]))) { + return tensor_input; + } + + // create concat nodes, to convert tensor shape from [ne03, ne02, n, k] to [ne03 * x, ne02 * y, n, k] + constexpr const auto create_node = + [](const std::string &name, const int rank, const int axis, const qnn_dimension_array_t &dimensions, + qnn_tensor_ptr_t tensor_input, QNNBackend device, Qnn_GraphHandle_t graph_handle, + std::shared_ptr qnn_instance, qnn_tensor_ptr_t &tensor_output) -> qnn_op_config_ptr_t { + auto gather_out = + std::make_shared(ggml_qnn_tensor::INTERMEDIATE, name + "_out", dimensions, + tensor_input->get_data_type(), rank, device, graph_handle, qnn_instance); + auto gather_op = std::make_shared(name, QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_GATHER, qnn_instance); + + Qnn_Scalar_t scalar = QNN_SCALAR_INIT; + scalar.dataType = QNN_DATATYPE_INT_32; + scalar.int32Value = axis; + gather_op->add_scalar_param(QNN_OP_GATHER_PARAM_AXIS, scalar); + gather_op->set_output_tensors({gather_out}); + + // here we calculate the index mapping, will generate a 1d tensor like [0, 0, 0, 1, 1, 1, 2, 2, 2, ...], + // by repeating each index [scale] times. 
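+        // (e.g. an input axis length of 2 with dimensions[axis] == 6 gives scale == 3 and an index
+        //  tensor of [0, 0, 0, 1, 1, 1], so each input slice along the axis is repeated 3 times)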
+ const auto scale = dimensions[axis] / tensor_input->get_dimensions()[axis]; + std::vector index_buffer(dimensions[axis] * sizeof(uint32_t)); + for (uint32_t *curr = reinterpret_cast(index_buffer.data()), *end = curr + dimensions[axis]; + curr < end; curr++) { + *curr = (curr - reinterpret_cast(index_buffer.data())) / scale; + } + + auto gather_index = std::make_shared( + ggml_qnn_tensor::PARAMETER, name + "_index", qnn_dimension_array_t{dimensions[axis]}, QNN_DATATYPE_UINT_32, + 1, device, graph_handle, qnn_instance); + gather_index->set_data_buffer(std::move(index_buffer)); + gather_op->set_input_tensors({tensor_input, gather_index}); + + tensor_output = gather_out; + return gather_op; + }; + + qnn_dimension_array_t intermediate_dimensions = input_dimensions; + intermediate_dimensions[rank - 3] = output_dimensions[rank - 3]; + qnn_tensor_ptr_t gather0_out; + _gather0 = create_node(_name + "_gather0", rank, rank - 3, intermediate_dimensions, tensor_input, device, + graph_handle, _qnn_instance, gather0_out); + if (rank == 3) { + return gather0_out; + } + + qnn_tensor_ptr_t gather1_out; + _gather1 = create_node(_name + "_gather1", rank, rank - 4, output_dimensions, gather0_out, device, graph_handle, + _qnn_instance, gather1_out); + return gather1_out; +} + +bool ggml_qnn_matmul_op_config::create_convert_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, const int rank, + qnn_tensor_array_t &tensor_inputs, + qnn_tensor_array_t &tensor_outputs) { if (device == QNN_BACKEND_GPU) { - // there's no convert op for GPU, so we should create matmul nodes directl. - return create_mat_mul_nodes(device, graph_handle, tensor_rank, _tensor_inputs, mat_mul_tensor_outputs); + // there's no convert op for GPU, so we should create matmul nodes directly. + return true; } // create tensors for convert node - ggml_qnn_tensor_array_t mat_mul_tensor_inputs = _tensor_inputs; - auto input_tensor_type = get_tensor_type(mat_mul_tensor_inputs); - QNN_LOG_DEBUG("matmul input tensor type: %s\n", qnn_datatype_to_string(input_tensor_type)); + auto tensor_type = get_tensor_type(tensor_inputs); + QNN_LOG_DEBUG("input tensor type: %s\n", qnn_datatype_to_string(tensor_type)); - _input_converts.resize(mat_mul_tensor_inputs.size()); - for (size_t i = 0; i < mat_mul_tensor_inputs.size(); ++i) { + _input_converts.resize(tensor_inputs.size()); + for (size_t i = 0; i < tensor_inputs.size(); ++i) { // create input convert nodes + auto convert_in = tensor_inputs[i]; + if (convert_in->get_data_type() == tensor_type) { + continue; + } + std::string convert_name("convert_src" + std::to_string(i)); - auto convert_in = mat_mul_tensor_inputs[i]; auto convert_out = std::make_shared(ggml_qnn_tensor::INTERMEDIATE, convert_name + "_out", - convert_in->get_dimensions(), input_tensor_type, - tensor_rank, device, graph_handle, _qnn_instance); + convert_in->get_dimensions(), tensor_type, rank, device, + graph_handle, _qnn_instance); auto convert = std::make_shared(convert_name, QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_CONVERT, _qnn_instance); convert->set_input_tensors({convert_in}); convert->set_output_tensors({convert_out}); - mat_mul_tensor_inputs[i] = convert_out; + tensor_inputs[i] = convert_out; _input_converts[i] = convert; } - { + if (tensor_outputs.front()->get_data_type() != tensor_type) { // create output convert node std::string convert_name("convert_dst"); - auto convert_out = mat_mul_tensor_outputs.front(); + auto convert_out = tensor_outputs.front(); auto convert_in = std::make_shared(ggml_qnn_tensor::INTERMEDIATE, convert_name + 
"_in", - convert_out->get_dimensions(), input_tensor_type, - tensor_rank, device, graph_handle, _qnn_instance); + convert_out->get_dimensions(), tensor_type, rank, device, + graph_handle, _qnn_instance); auto output_convert = std::make_shared( convert_name, QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_CONVERT, _qnn_instance); output_convert->set_input_tensors({convert_in}); output_convert->set_output_tensors({convert_out}); - mat_mul_tensor_outputs[0] = convert_in; + tensor_outputs.front() = convert_in; _output_convert = output_convert; } - // create mat_mul nodes - return create_mat_mul_nodes(device, graph_handle, tensor_rank, mat_mul_tensor_inputs, mat_mul_tensor_outputs); + return true; } bool ggml_qnn_matmul_op_config::create_mat_mul_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, const int rank, - ggml_qnn_tensor_array_t &tensor_inputs, - ggml_qnn_tensor_array_t &tensor_outputs) { + qnn_tensor_array_t &tensor_inputs, + qnn_tensor_array_t &tensor_outputs) { /* * First, both the ggml and qnn tensor in memory are stored as row-major format. (For more details, please also: @@ -386,9 +468,8 @@ bool ggml_qnn_matmul_op_config::create_mat_mul_nodes(QNNBackend device, Qnn_Grap * ```mermaid * graph TD; * i1>ggml_tensor_in0] --src0--> mat_mul0; - * i2>ggml_tensor_in1] --src1--> transpose0; - * transpose0 --src0_trans--> mat_mul0; - * mat_mul0 --dst_trans--> transpose1; + * i2>ggml_tensor_in1] --src1--> mat_mul0; + * mat_mul0 --dst_trans--> transpose_out; * transpose1 --dst0--> o1>ggml_tensor_out]; * ``` */ @@ -398,9 +479,6 @@ bool ggml_qnn_matmul_op_config::create_mat_mul_nodes(QNNBackend device, Qnn_Grap static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS does not match the expected value"); qnn_dimension_array_t dimensions = get_transposed_dimensions(src1->get_dimensions(), rank); - auto src0_trans = - std::make_shared(ggml_qnn_tensor::INTERMEDIATE, "src0_trans", dimensions, - src1->get_data_type(), rank, device, graph_handle, _qnn_instance); // create dst_trans tensor auto dst = tensor_outputs.front(); @@ -408,48 +486,37 @@ bool ggml_qnn_matmul_op_config::create_mat_mul_nodes(QNNBackend device, Qnn_Grap auto dst_trans = std::make_shared(ggml_qnn_tensor::INTERMEDIATE, "dst_trans", dimensions, dst->get_data_type(), rank, device, graph_handle, _qnn_instance); - // create transpose0 - auto transpose0 = std::make_shared(_name + "_trans0", QNN_OP_PACKAGE_NAME_QTI_AISW, - QNN_OP_TRANSPOSE, _qnn_instance); - - // create transpose1 - auto transpose1 = std::make_shared(_name + "_trans1", QNN_OP_PACKAGE_NAME_QTI_AISW, - QNN_OP_TRANSPOSE, _qnn_instance); + // create transpose_out + auto transpose_out = std::make_shared( + _name + "_trans1", QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_TRANSPOSE, _qnn_instance); // create mat_mul auto mat_mul = std::make_shared(_name, QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_MAT_MUL, _qnn_instance); - // set transpose0 parameters + Qnn_Scalar_t scalar = QNN_SCALAR_INIT; + scalar.dataType = QNN_DATATYPE_BOOL_8; + scalar.bool8Value = 1; + mat_mul->add_scalar_param(QNN_OP_MAT_MUL_PARAM_TRANSPOSE_IN1, scalar); + + // set transpose_out parameters auto *params_data = reinterpret_cast(kTransposeParamData[rank - 1].data()); const qnn_dimension_array_t param_dims = {(uint32_t)rank, 1, 1, 1}; - transpose0->add_tensor_param(QNN_OP_TRANSPOSE_PARAM_PERM, param_dims, 1, params_data, QNN_DATATYPE_UINT_32, device, - graph_handle); - - // set transpose1 parameters - transpose1->add_tensor_param(QNN_OP_TRANSPOSE_PARAM_PERM, param_dims, 1, params_data, QNN_DATATYPE_UINT_32, device, - graph_handle); - - 
// set tensor to transpose0 - ggml_qnn_tensor_array_t tensors = {tensor_inputs.back()}; - transpose0->set_input_tensors(tensors); - tensors = {src0_trans}; - transpose0->set_output_tensors(tensors); + transpose_out->add_tensor_param(QNN_OP_TRANSPOSE_PARAM_PERM, param_dims, 1, params_data, QNN_DATATYPE_UINT_32, + device, graph_handle); // set tensor to mat_mul - tensors = {tensor_inputs.front(), src0_trans}; - mat_mul->set_input_tensors(tensors); - tensors = {dst_trans}; + mat_mul->set_input_tensors(tensor_inputs); + qnn_tensor_array_t tensors = {dst_trans}; mat_mul->set_output_tensors(tensors); - // set tensor to transpose1 + // set tensor to transpose_out tensors = {dst_trans}; - transpose1->set_input_tensors(tensors); - transpose1->set_output_tensors(tensor_outputs); + transpose_out->set_input_tensors(tensors); + transpose_out->set_output_tensors(tensor_outputs); _mat_mul = mat_mul; - _transpose0 = transpose0; - _transpose1 = transpose1; + _transpose_out = transpose_out; return true; } @@ -460,8 +527,15 @@ bool ggml_qnn_matmul_op_config::add_op_to_graph(Qnn_GraphHandle_t graph_handle) } } - return _transpose0->add_op_to_graph(graph_handle) && _mat_mul->add_op_to_graph(graph_handle) && - _transpose1->add_op_to_graph(graph_handle) && + if (_gather0 && !_gather0->add_op_to_graph(graph_handle)) { + return false; + } + + if (_gather1 && !_gather1->add_op_to_graph(graph_handle)) { + return false; + } + + return _mat_mul->add_op_to_graph(graph_handle) && _transpose_out->add_op_to_graph(graph_handle) && (!_output_convert || _output_convert->add_op_to_graph(graph_handle)); } @@ -473,13 +547,12 @@ bool ggml_qnn_matmul_op_config::bind_output_tensors(const ggml_tensor_array_t &t if (_output_convert) { return _output_convert->bind_output_tensors(tensor_outputs); } else { - return _transpose1->bind_output_tensors(tensor_outputs); + return _transpose_out->bind_output_tensors(tensor_outputs); } } void ggml_qnn_matmul_op_config::unbind_input_tensors() { _mat_mul->unbind_input_tensors(); - _transpose0->unbind_input_tensors(); for (auto &convert : _input_converts) { if (convert) { convert->unbind_input_tensors(); @@ -488,7 +561,7 @@ void ggml_qnn_matmul_op_config::unbind_input_tensors() { } void ggml_qnn_matmul_op_config::unbind_output_tensors() { - _transpose1->unbind_output_tensors(); + _transpose_out->unbind_output_tensors(); if (_output_convert) { _output_convert->unbind_output_tensors(); } @@ -498,7 +571,7 @@ std::vector &ggml_qnn_matmul_op_config::get_qnn_output_tensors() { if (_output_convert) { return _output_convert->get_qnn_output_tensors(); } else { - return _transpose1->get_qnn_output_tensors(); + return _transpose_out->get_qnn_output_tensors(); } } @@ -513,9 +586,9 @@ ggml_op_constructor_t create_op_constructor(const std::string &op_name) { } else if (op_name == QNN_OP_TRANSPOSE) { return [](const std::string &instance_name, std::shared_ptr qnn_instance) -> std::unique_ptr { - return std::make_unique(instance_name, QNN_OP_PACKAGE_NAME_QTI_AISW, - QNN_OP_TRANSPOSE, QNN_OP_TRANSPOSE_PARAM_PERM, - QNN_DATATYPE_UINT_32, 4 * sizeof(uint32_t), qnn_instance); + return std::make_unique( + instance_name, QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_TRANSPOSE, QNN_OP_TRANSPOSE_PARAM_PERM, + QNN_DATATYPE_UINT_32, 4 * sizeof(uint32_t), qnn_instance); }; } diff --git a/ggml/src/ggml-qnn/op-config.hpp b/ggml/src/ggml-qnn/op-config.hpp index 4ec7aac9b2..2757156330 100644 --- a/ggml/src/ggml-qnn/op-config.hpp +++ b/ggml/src/ggml-qnn/op-config.hpp @@ -5,31 +5,13 @@ #include #include -#include "ggml-qnn.h" - +#include 
"op-config-base.hpp" #include "qnn-lib.hpp" #include "qnn-types.hpp" #include "tensor.hpp" namespace qnn { -using ggml_tensor_array_t = std::vector; - -class ggml_qnn_op_config { -public: - virtual ~ggml_qnn_op_config() {} - virtual bool create_tensors(QNNBackend device, Qnn_GraphHandle_t graph_handle, - const ggml_tensor_array_t &tensor_inputs, - const ggml_tensor_array_t &tensor_outputs) = 0; - virtual std::vector &get_qnn_input_tensors() = 0; - virtual std::vector &get_qnn_output_tensors() = 0; - virtual bool add_op_to_graph(Qnn_GraphHandle_t graph_handle) = 0; - virtual bool bind_input_tensors(const ggml_tensor_array_t &tensor_inputs) = 0; - virtual bool bind_output_tensors(const ggml_tensor_array_t &tensor_outputs) = 0; - virtual void unbind_input_tensors() = 0; - virtual void unbind_output_tensors() = 0; -}; - using ggml_op_constructor_t = std::function(const std::string &, std::shared_ptr)>; @@ -60,9 +42,9 @@ protected: std::string _package_name; std::string _op_type; std::shared_ptr _qnn_instance; - ggml_qnn_tensor_array_t _tensor_inputs; - ggml_qnn_tensor_array_t _tensor_outputs; - ggml_qnn_tensor_array_t _tensor_parameters; + qnn_tensor_array_t _tensor_inputs; + qnn_tensor_array_t _tensor_outputs; + qnn_tensor_array_t _tensor_parameters; std::vector _qnn_tensor_inputs; std::vector _qnn_tensor_outputs; std::vector _qnn_parameters; @@ -87,8 +69,9 @@ public: _param_type(param_type), _param_buffer(param_size) {} - bool create_tensors(QNNBackend device, Qnn_GraphHandle_t graph_handle, const ggml_tensor_array_t &tensor_inputs, - const ggml_tensor_array_t &tensor_outputs) override; + bool initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, + const ggml_tensor_array_t &tensor_inputs, + const ggml_tensor_array_t &tensor_outputs) override; private: const std::string _param_name; @@ -104,8 +87,9 @@ public: ggml_qnn_matmul_op_config(const std::string &name, std::shared_ptr qnn_instance) : _name(name), _qnn_instance(qnn_instance) {} - bool create_tensors(QNNBackend device, Qnn_GraphHandle_t graph_handle, const ggml_tensor_array_t &tensor_inputs, - const ggml_tensor_array_t &tensor_outputs) override; + bool initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, + const ggml_tensor_array_t &tensor_inputs, + const ggml_tensor_array_t &tensor_outputs) override; bool add_op_to_graph(Qnn_GraphHandle_t graph_handle) override; bool bind_input_tensors(const ggml_tensor_array_t &tensor_inputs) override; bool bind_output_tensors(const ggml_tensor_array_t &tensor_outputs) override; @@ -115,17 +99,22 @@ public: std::vector &get_qnn_output_tensors() override; private: + qnn_tensor_ptr_t create_gather_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, const int rank, + qnn_tensor_ptr_t tensor_input, qnn_dimension_array_t output_dimensions); + bool create_convert_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, const int rank, + qnn_tensor_array_t &tensor_inputs, qnn_tensor_array_t &tensor_outputs); bool create_mat_mul_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, const int rank, - ggml_qnn_tensor_array_t &tensor_inputs, ggml_qnn_tensor_array_t &tensor_outputs); + qnn_tensor_array_t &tensor_inputs, qnn_tensor_array_t &tensor_outputs); std::string _name; std::shared_ptr _qnn_instance; - std::shared_ptr _transpose0; - std::shared_ptr _transpose1; - std::shared_ptr _mat_mul; - std::vector> _input_converts; - std::shared_ptr _output_convert; - ggml_qnn_tensor_array_t _tensor_inputs; + qnn_op_config_ptr_t _transpose_out; + qnn_op_config_ptr_t _mat_mul; + 
qnn_op_config_ptr_t _gather0; + qnn_op_config_ptr_t _gather1; + std::vector _input_converts; + qnn_op_config_ptr_t _output_convert; + qnn_tensor_array_t _tensor_inputs; std::vector _qnn_tensor_inputs; DISABLE_COPY(ggml_qnn_matmul_op_config); diff --git a/ggml/src/ggml-qnn/qnn-lib.hpp b/ggml/src/ggml-qnn/qnn-lib.hpp index 74bc2b3f95..c6801b7771 100644 --- a/ggml/src/ggml-qnn/qnn-lib.hpp +++ b/ggml/src/ggml-qnn/qnn-lib.hpp @@ -188,8 +188,8 @@ class qnn_instance { public: using BackendIdType = decltype(QnnInterface_t{}.backendId); - explicit qnn_instance(const std::string &lib_path, const std::string &backend_name, const std::string &model_name) : - _lib_path(std::move(lib_path)), _backend_name(std::move(backend_name)), _model_name(std::move(model_name)) {} + explicit qnn_instance(const std::string &lib_path, const std::string &backend_name, const std::string &model_name) + : _lib_path(std::move(lib_path)), _backend_name(std::move(backend_name)), _model_name(std::move(model_name)) {} ~qnn_instance() {} @@ -269,7 +269,7 @@ public: QNN_LOG_INFO("qualcomm soc_model:%d(%s), htp_arch:%d(%s), vtcm_size:%d MB", chipinfo.socModel, qnn::get_chipset_desc(chipinfo.socModel), htp_arch, qnn::get_htparch_desc(htp_arch), chipinfo.vtcmSize); - _soc_info = { chipinfo.socModel, htp_arch, chipinfo.vtcmSize }; + _soc_info = {chipinfo.socModel, htp_arch, chipinfo.vtcmSize}; } _qnn_interface->qnn_device_free_platform_info(nullptr, p_info); @@ -288,7 +288,7 @@ public: arch_devconfig.option = QNN_DEVICE_CONFIG_OPTION_CUSTOM; arch_devconfig.customConfig = &arch_customconfig; - const QnnDevice_Config_t *p_deviceconfig[] = { &soc_devconfig, &arch_devconfig, nullptr }; + const QnnDevice_Config_t *p_deviceconfig[] = {&soc_devconfig, &arch_devconfig, nullptr}; qnn_status = _qnn_interface->qnn_device_create(_qnn_log_handle, p_deviceconfig, &_qnn_device_handle); } else { qnn_status = _qnn_interface->qnn_device_create(_qnn_log_handle, nullptr, &_qnn_device_handle); @@ -299,27 +299,17 @@ public: QNN_LOG_INFO("create QNN device successfully\n"); } - if (qnn::sdk_profile_level::profile_off != _profile_level) { + if (_profile_level != sdk_profile_level::profile_off) { QNN_LOG_INFO("profiling turned on; level = %d", _profile_level); - if (qnn::sdk_profile_level::profile_basic == _profile_level) { - QNN_LOG_INFO("basic profiling requested. creating Qnn Profile object\n"); - if (QNN_PROFILE_NO_ERROR != _qnn_interface->qnn_profile_create( - _qnn_backend_handle, QNN_PROFILE_LEVEL_BASIC, &_qnn_profile_handle)) { - QNN_LOG_WARN("unable to create profile handle in the backend\n"); - return 6; - } else { - QNN_LOG_DEBUG("initialize qnn profile successfully\n"); - } - } else if (qnn::sdk_profile_level::profile_detail == _profile_level) { - QNN_LOG_INFO("detailed profiling requested. Creating Qnn Profile object\n"); - if (QNN_PROFILE_NO_ERROR != _qnn_interface->qnn_profile_create(_qnn_backend_handle, - QNN_PROFILE_LEVEL_DETAILED, - &_qnn_profile_handle)) { - QNN_LOG_WARN("unable to create profile handle in the backend\n"); - return 7; - } else { - QNN_LOG_DEBUG("initialize qnn profile successfully\n"); - } + auto profile_level = _profile_level == sdk_profile_level::profile_detail ? 
QNN_PROFILE_LEVEL_DETAILED + : QNN_PROFILE_LEVEL_BASIC; + + if (QNN_PROFILE_NO_ERROR != + _qnn_interface->qnn_profile_create(_qnn_backend_handle, profile_level, &_qnn_profile_handle)) { + QNN_LOG_WARN("unable to create profile handle in the backend\n"); + return 6; + } else { + QNN_LOG_DEBUG("initialize qnn profile successfully\n"); } } @@ -364,7 +354,7 @@ public: size_t candidate_size = 0; uint8_t *rpc_buffer = nullptr; const int size_in_mb = (1 << 20); - size_t probe_slots[] = { 1024, 1536, 2048 - 48, 2048 }; + size_t probe_slots[] = {1024, 1536, 2048 - 48, 2048}; size_t probe_counts = sizeof(probe_slots) / sizeof(size_t); for (size_t idx = 0; idx < probe_counts; idx++) { rpc_buffer = static_cast(alloc_rpcmem(probe_slots[idx] * size_in_mb, sizeof(void *))); @@ -526,13 +516,13 @@ public: // use rpc control latency recommended 100 us, refer hexagon sdk rpc_control_latency.rpcControlLatencyConfig = 100; - const QnnHtpPerfInfrastructure_PowerConfig_t *power_configs[] = { &rpc_polling_time, &rpc_control_latency, - nullptr }; + const QnnHtpPerfInfrastructure_PowerConfig_t *power_configs[] = {&rpc_polling_time, &rpc_control_latency, + nullptr}; Qnn_ErrorHandle_t qnn_status = _qnn_htp_perfinfra->setPowerConfig(_qnn_power_configid, power_configs); if (qnn_status != QNN_SUCCESS) { QNN_LOG_WARN("set htp perf failed\n"); } else { - QNN_LOG_INFO("set htp perf ok\n"); + QNN_LOG_DEBUG("set htp perf ok\n"); } } else { QNN_LOG_WARN("can't set htp perf\n"); @@ -572,13 +562,13 @@ public: power_config.dcvsV3Config.coreVoltageCornerMax = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; // set power config with different performance parameters - const QnnHtpPerfInfrastructure_PowerConfig_t *power_configs[] = { &power_config, nullptr }; + const QnnHtpPerfInfrastructure_PowerConfig_t *power_configs[] = {&power_config, nullptr}; Qnn_ErrorHandle_t qnn_status = QNN_SUCCESS; qnn_status = _qnn_htp_perfinfra->setPowerConfig(_qnn_power_configid, power_configs); if (qnn_status != QNN_SUCCESS) { QNN_LOG_WARN("set htp high performance mode failed\n"); } else { - QNN_LOG_INFO("set htp high performance mode ok\n"); + QNN_LOG_DEBUG("set htp high performance mode ok\n"); } return 0; @@ -659,8 +649,8 @@ public: return nullptr; } - QNN_LOG_INFO("mem_fd %d\n", mem_fd); - Qnn_MemDescriptor_t descriptor = { { rank, dimensions, nullptr }, data_type, QNN_MEM_TYPE_ION, { { mem_fd } } }; + QNN_LOG_DEBUG("mem_fd %d\n", mem_fd); + Qnn_MemDescriptor_t descriptor = {{rank, dimensions, nullptr}, data_type, QNN_MEM_TYPE_ION, {{mem_fd}}}; Qnn_MemHandle_t handle = nullptr; auto error = _qnn_interface->qnn_mem_register(_qnn_context_handle, &descriptor, /*numDescriptors=*/1, &handle); @@ -670,8 +660,8 @@ public: return nullptr; } - _qnn_rpc_buffer_to_handles.insert({ p_data, handle }); - QNN_LOG_INFO("successfully register shared memory handler: %p\n", handle); + _qnn_rpc_buffer_to_handles.insert({p_data, handle}); + QNN_LOG_DEBUG("successfully register shared memory handler: %p\n", handle); return handle; } @@ -748,7 +738,7 @@ private: QNN_LOG_WARN("unable to find a valid qnn system interface\n"); return 6; } else { - QNN_LOG_INFO("find a valid qnn system interface\n"); + QNN_LOG_DEBUG("find a valid qnn system interface\n"); } auto qnn_sys_interface = std::make_shared(*provider_list[0], system_lib_handle); @@ -810,7 +800,7 @@ private: QNN_LOG_WARN("unable to find a valid qnn interface\n"); return 6; } else { - QNN_LOG_INFO("find a valid qnn interface\n"); + QNN_LOG_DEBUG("find a valid qnn interface\n"); } BackendIdType backend_id = 
provider_list[0]->backendId; @@ -890,7 +880,7 @@ private: std::unordered_map _loaded_backend; dl_handler_t _rpc_lib_handle = nullptr; - std::atomic_bool _rpcmem_initialized{ false }; + std::atomic_bool _rpcmem_initialized{false}; qnn::pfn_rpc_mem_alloc _pfn_rpc_mem_alloc; qnn::pfn_rpc_mem_free _pfn_rpc_mem_free; qnn::pfn_rpc_mem_to_fd _pfn_rpc_mem_to_fd; diff --git a/ggml/src/ggml-qnn/qnn-types.hpp b/ggml/src/ggml-qnn/qnn-types.hpp index 8fce790def..7461ac3012 100644 --- a/ggml/src/ggml-qnn/qnn-types.hpp +++ b/ggml/src/ggml-qnn/qnn-types.hpp @@ -14,7 +14,7 @@ namespace qnn { // Qualcomm QNN(Qualcomm Neural Network, aka Qualcomm AI Engine Direct) SDK // ref:https://github.com/pytorch/executorch/tree/main/backends/qualcomm // ================================================================================================= -enum sdk_profile_level { profile_off = 0, profile_basic = 1, profile_detail = 2 }; +enum sdk_profile_level { profile_off = 0, profile_basic, profile_detail }; enum qcom_htp_arch { NONE = 0, diff --git a/ggml/src/ggml-qnn/tensor.hpp b/ggml/src/ggml-qnn/tensor.hpp index f28fc8e2ca..0a9a367015 100644 --- a/ggml/src/ggml-qnn/tensor.hpp +++ b/ggml/src/ggml-qnn/tensor.hpp @@ -27,8 +27,8 @@ public: explicit ggml_qnn_tensor(tensor_type_t tensor_type, const std::string &name, const qnn_dimension_array_t &dimensions, Qnn_DataType_t data_type, int rank, QNNBackend device, Qnn_GraphHandle_t graph_handle, - std::shared_ptr qnn_instance) : - _tensor_name(name), _device(device), _qnn_instance(qnn_instance), _graph_handle(graph_handle) { + std::shared_ptr qnn_instance) + : _tensor_name(name), _device(device), _qnn_instance(qnn_instance), _graph_handle(graph_handle) { if (!_tensor_name.empty()) { QNN_TENSOR_SET_NAME(_qnn_tensor, _tensor_name.c_str()); } @@ -37,23 +37,35 @@ public: _dimensions = dimensions; update_params_from_ggml_tensor(tensor_type, data_type, rank); - QNN_LOG_DEBUG("create tensor %s, rank: %d, dims: [%d, %d, %d, %d], data_type: %d, device: %d", + QNN_LOG_DEBUG("[%s][%s]created, rank: %d, dims: [%d, %d, %d, %d], type: %s", get_backend_name(device), _tensor_name.c_str(), rank, (int)_dimensions[0], (int)_dimensions[1], (int)_dimensions[2], - (int)_dimensions[3], (int)data_type, (int)device); + (int)_dimensions[3], qnn_datatype_to_string(data_type)); } explicit ggml_qnn_tensor(tensor_type_t tensor_type, const std::string &name, const ggml_dimension_array_t &dimensions, ggml_type data_type, int rank, QNNBackend device, - Qnn_GraphHandle_t graph_handle, std::shared_ptr qnn_instance) : - ggml_qnn_tensor(tensor_type, name, get_internal_dimension(dimensions, rank), - qnn_datatype_from_ggml_datatype(data_type), rank, device, graph_handle, qnn_instance) {} + Qnn_GraphHandle_t graph_handle, std::shared_ptr qnn_instance) + : ggml_qnn_tensor(tensor_type, name, get_internal_dimension(dimensions, rank), + qnn_datatype_from_ggml_datatype(data_type), rank, device, graph_handle, qnn_instance) {} - ~ggml_qnn_tensor() { _qnn_rpc_buffer.reset(); } + ~ggml_qnn_tensor() { + _buffer_storage.clear(); + unbind(); + _qnn_rpc_buffer.reset(); + } + + bool set_data_buffer(std::vector &&buffer) { + if (!bind_buffer_impl(buffer.data(), buffer.size())) { + return false; + } + + _buffer_storage = std::move(buffer); + return true; + } bool alloc_qnn_tensor_id() { if (QNN_TENSOR_GET_ID(_qnn_tensor)) { - QNN_LOG_WARN("graph tensor %s already created, id %d", _tensor_name.c_str(), - QNN_TENSOR_GET_ID(_qnn_tensor)); + QNN_LOG_DEBUG("[%s]tensor already has a id: %d", _tensor_name.c_str(), 
QNN_TENSOR_GET_ID(_qnn_tensor)); return true; } @@ -61,30 +73,90 @@ public: auto qnn_interface = _qnn_instance->get_qnn_interface(); auto error = qnn_interface->qnn_tensor_create_graph_tensor(_graph_handle, &qnn_tensor); if (error != QNN_SUCCESS) { - QNN_LOG_WARN("create graph tensor failed, tensor %s, error: %d\n", _tensor_name.c_str(), error); + QNN_LOG_WARN("[%s]allocate id failed , error: %d\n", _tensor_name.c_str(), error); return false; } QNN_TENSOR_SET_ID(_qnn_tensor, QNN_TENSOR_GET_ID(qnn_tensor)); - QNN_LOG_DEBUG("create graph tensor %s, id: %d, rank: %d", _tensor_name.c_str(), QNN_TENSOR_GET_ID(qnn_tensor), - QNN_TENSOR_GET_RANK(qnn_tensor)); - + QNN_LOG_DEBUG("[%s][%s]allocated id: %d, rank: %d", get_backend_name(_device), _tensor_name.c_str(), + QNN_TENSOR_GET_ID(qnn_tensor), QNN_TENSOR_GET_RANK(qnn_tensor)); return true; } bool bind_buffer(uint8_t *buffer, const size_t buffer_size) { + if (!_buffer_storage.empty()) { + QNN_LOG_DEBUG("[%s]already has buffer storage, skip bind", _tensor_name.c_str()); + return true; + } + + return bind_buffer_impl(buffer, buffer_size); + } + + bool bind_ggml_tensor(ggml_tensor *tensor) { + if (!bind_buffer(reinterpret_cast(tensor->data), ggml_nbytes(tensor))) { + QNN_LOG_WARN("[%s]failed to bind ggml tensor(%s)", _tensor_name.c_str(), ggml_get_name(tensor)); + return false; + } + + QNN_LOG_DEBUG("[%s][%s]bind to ggml tensor(%s)", get_backend_name(_device), _tensor_name.c_str(), + ggml_get_name(tensor)); + return true; + } + + bool unbind() { + if (!_graph_handle) { + QNN_LOG_WARN("[%s]not bound to any graph", _tensor_name.c_str()); + return false; + } + + if (!_buffer) { + QNN_LOG_DEBUG("[%s]bound to ggml tensor", _tensor_name.c_str()); + return true; + } + + if (!read_from_qnn_tensor()) { + QNN_LOG_WARN("[%s]read from qnn tensor failed", _tensor_name.c_str()); + return false; + } + + if (!_buffer_storage.empty()) { + QNN_LOG_DEBUG("[%s]already has buffer storage, stop unbind", _tensor_name.c_str()); + return true; + } + + if (!should_use_mem_handle()) { + QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_RAW); + Qnn_ClientBuffer_t client_buf = {}; + QNN_TENSOR_SET_CLIENT_BUF(_qnn_tensor, client_buf); + QNN_LOG_DEBUG("[%s]clear client buffer", _tensor_name.c_str()); + } + + QNN_LOG_DEBUG("[%s][%s]unbind from buffer: %p, size: %d", get_backend_name(_device), _tensor_name.c_str(), + _buffer, (int)_buffer_size); + _buffer = nullptr; + _buffer_size = 0; + return true; + } + + const Qnn_Tensor_t &get_qnn_tensor() const { return _qnn_tensor; } + Qnn_DataType_t get_data_type() const { return QNN_TENSOR_GET_DATA_TYPE(_qnn_tensor); } + const qnn_dimension_array_t &get_dimensions() const { return _dimensions; } + uint32_t get_qnn_tensor_id() const { return QNN_TENSOR_GET_ID(_qnn_tensor); } + +private: + bool bind_buffer_impl(uint8_t *buffer, const size_t buffer_size) { if (_buffer) { if (_buffer != buffer) { - QNN_LOG_WARN("tensor %s has been bound to another buffer %p", _tensor_name.c_str(), _buffer); + QNN_LOG_WARN("[%s]has been bound to another buffer %p", _tensor_name.c_str(), _buffer); return false; } - QNN_LOG_INFO("tensor %s already bound to same ggml tensor %p", _tensor_name.c_str(), _buffer); + QNN_LOG_DEBUG("[%s]already bound to same ggml tensor %p", _tensor_name.c_str(), _buffer); return true; } if (QNN_TENSOR_GET_TYPE(_qnn_tensor) == QNN_TENSOR_TYPE_NATIVE) { - QNN_LOG_DEBUG("tensor %s type(%d) not READ/WRITE, skipping", _tensor_name.c_str(), + QNN_LOG_DEBUG("[%s]tensor type(%d) not READ/WRITE, skipping", _tensor_name.c_str(), 
(int)QNN_TENSOR_TYPE_NATIVE); return true; } @@ -95,7 +167,7 @@ public: _qnn_instance, buffer_size, QNN_TENSOR_GET_RANK(_qnn_tensor), QNN_TENSOR_GET_DIMENSIONS(_qnn_tensor), QNN_TENSOR_GET_DATA_TYPE(_qnn_tensor)); if (!qnn_rpc_buffer->is_valid()) { - QNN_LOG_WARN("alloc rpc mem failed, tensor %s", _tensor_name.c_str()); + QNN_LOG_WARN("[%s]alloc rpc mem failed", _tensor_name.c_str()); return false; } @@ -104,12 +176,12 @@ public: QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_MEMHANDLE); QNN_TENSOR_SET_MEM_HANDLE(_qnn_tensor, _qnn_rpc_buffer->get_mem_handle()); - QNN_LOG_DEBUG("tensor %s, use mem handle %p", _tensor_name.c_str(), QNN_TENSOR_GET_MEM_HANDLE(_qnn_tensor)); + QNN_LOG_DEBUG("[%s]use mem handle %p", _tensor_name.c_str(), QNN_TENSOR_GET_MEM_HANDLE(_qnn_tensor)); } else { QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_RAW); - Qnn_ClientBuffer_t client_buf = { buffer, (uint32_t)buffer_size }; + Qnn_ClientBuffer_t client_buf = {buffer, (uint32_t)buffer_size}; QNN_TENSOR_SET_CLIENT_BUF(_qnn_tensor, client_buf); - QNN_LOG_DEBUG("tensor %s, use client buffer %p size %d", _tensor_name.c_str(), client_buf.data, + QNN_LOG_DEBUG("[%s]use client buffer %p size %d", _tensor_name.c_str(), client_buf.data, (int)client_buf.dataSize); } @@ -117,62 +189,19 @@ public: _buffer_size = buffer_size; if (!write_to_qnn_tensor()) { - QNN_LOG_WARN("write to qnn tensor failed, tensor %s", _tensor_name.c_str()); + QNN_LOG_WARN("[%s]write to qnn tensor failed", _tensor_name.c_str()); return false; } - QNN_LOG_DEBUG("bind tensor %s to buffer: %p, size: %d", _tensor_name.c_str(), buffer, (int)buffer_size); + QNN_LOG_DEBUG("[%s][%s]bind to buffer: %p, size: %d", get_backend_name(_device), _tensor_name.c_str(), buffer, + (int)buffer_size); return true; } - bool bind_ggml_tensor(ggml_tensor *tensor) { - if (!bind_buffer(reinterpret_cast(tensor->data), ggml_nbytes(tensor))) { - QNN_LOG_WARN("Failed to bind tensor: %s to ggml tensor: %s", _tensor_name.c_str(), ggml_get_name(tensor)); - return false; - } - - QNN_LOG_DEBUG("Bind tensor %s to ggml tensor %s", _tensor_name.c_str(), ggml_get_name(tensor)); - return true; - } - - bool unbind() { - if (!_graph_handle) { - QNN_LOG_WARN("tensor %s not bound to any graph", _tensor_name.c_str()); - return false; - } - - if (!_buffer) { - QNN_LOG_DEBUG("tensor %s not bound to ggml tensor", _tensor_name.c_str()); - return true; - } - - if (!read_from_qnn_tensor()) { - QNN_LOG_WARN("read from qnn tensor failed, tensor %s", _tensor_name.c_str()); - return false; - } - - if (!should_use_mem_handle()) { - QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_RAW); - Qnn_ClientBuffer_t client_buf = {}; - QNN_TENSOR_SET_CLIENT_BUF(_qnn_tensor, client_buf); - QNN_LOG_DEBUG("tensor %s, clear client buffer", _tensor_name.c_str()); - } - - QNN_LOG_DEBUG("unbind tensor: %s from buffer: %p, size: %d", _tensor_name.c_str(), _buffer, (int)_buffer_size); - _buffer = nullptr; - _buffer_size = 0; - return true; - } - - const Qnn_Tensor_t &get_qnn_tensor() const { return _qnn_tensor; } - Qnn_DataType_t get_data_type() const { return QNN_TENSOR_GET_DATA_TYPE(_qnn_tensor); } - const qnn_dimension_array_t &get_dimensions() const { return _dimensions; } - -private: bool write_to_qnn_tensor() { auto tensor_type = QNN_TENSOR_GET_TYPE(_qnn_tensor); if (tensor_type != QNN_TENSOR_TYPE_APP_WRITE && tensor_type != QNN_TENSOR_TYPE_APP_READWRITE) { - QNN_LOG_DEBUG("tensor %s type(%d) not WRITE", _tensor_name.c_str(), (int)tensor_type); + QNN_LOG_DEBUG("[%s]tensor type(%d) not WRITE", 
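// Aside: a self-contained sketch of the two data paths bind_buffer_impl() sets up.
// When should_use_mem_handle() is true (NPU), the host buffer is copied into shared
// rpc memory and QNN sees a mem handle; otherwise the raw host pointer is passed as
// a zero-copy client buffer. backend/bound_tensor below are illustrative stand-ins.
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <vector>

enum class backend { cpu, gpu, npu };

struct bound_tensor {
    backend device;
    std::vector<uint8_t> rpc_mem;        // stands in for the qnn_rpc_buffer
    uint8_t *client_buf = nullptr;       // raw pointer handed to QNN otherwise

    void bind(uint8_t *host, std::size_t size) {
        if (device == backend::npu) {
            rpc_mem.resize(size);
            std::memcpy(rpc_mem.data(), host, size);   // the write_to_qnn_tensor() copy
        } else {
            client_buf = host;                         // zero-copy client buffer
        }
    }
};

int main() {
    uint8_t host[8] = {1, 2, 3, 4, 5, 6, 7, 8};
    bound_tensor npu{backend::npu};
    bound_tensor cpu{backend::cpu};
    npu.bind(host, sizeof(host));
    cpu.bind(host, sizeof(host));
    std::printf("npu copied: %d, cpu zero-copy: %d\n",
                npu.rpc_mem[3] == 4 ? 1 : 0, cpu.client_buf == host ? 1 : 0);
    return 0;
}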
_tensor_name.c_str(), (int)tensor_type); return true; } @@ -180,20 +209,20 @@ private: if (_qnn_rpc_buffer) { memcpy(_qnn_rpc_buffer->get_buffer(), _buffer, _buffer_size); } else { - QNN_LOG_WARN("tensor %s: can't find rpcmem from qnn mem handle\n", _tensor_name.c_str()); + QNN_LOG_WARN("[%s]can't find rpcmem from qnn mem handle\n", _tensor_name.c_str()); return false; } } // For CPU and GPU, the data is already in the tensor. - QNN_LOG_DEBUG("write tensor %s to qnn", _tensor_name.c_str()); + QNN_LOG_DEBUG("[%s][%s]write tensor to qnn", get_backend_name(_device), _tensor_name.c_str()); return true; } bool read_from_qnn_tensor() { auto tensor_type = QNN_TENSOR_GET_TYPE(_qnn_tensor); if (tensor_type != QNN_TENSOR_TYPE_APP_READ && tensor_type != QNN_TENSOR_TYPE_APP_READWRITE) { - QNN_LOG_DEBUG("tensor %s type(%d) not READ", _tensor_name.c_str(), (int)tensor_type); + QNN_LOG_DEBUG("[%s]tensor type(%d) not READ", _tensor_name.c_str(), (int)tensor_type); return true; } @@ -201,13 +230,13 @@ private: if (_qnn_rpc_buffer) { memcpy(_buffer, _qnn_rpc_buffer->get_buffer(), _buffer_size); } else { - QNN_LOG_WARN("can't find rpcmem from qnn mem handle\n"); + QNN_LOG_WARN("[%s]can't find rpcmem from qnn mem handle", _tensor_name.c_str()); return false; } } // For CPU and GPU, the data is already in the tensor. - QNN_LOG_DEBUG("read tensor %s from qnn", _tensor_name.c_str()); + QNN_LOG_DEBUG("[%s][%s]read tensor from qnn", get_backend_name(_device), _tensor_name.c_str()); return true; } @@ -231,12 +260,14 @@ private: case PARAMETER: new_tensor_type = QNN_TENSOR_TYPE_STATIC; break; + case INTERMEDIATE: default: new_tensor_type = QNN_TENSOR_TYPE_NATIVE; break; } QNN_TENSOR_SET_TYPE(_qnn_tensor, new_tensor_type); - QNN_LOG_INFO("tensor %s changed to type %d", _tensor_name.c_str(), new_tensor_type); + QNN_LOG_DEBUG("[%s][%s]tensor changed to type %d", get_backend_name(_device), _tensor_name.c_str(), + new_tensor_type); } bool should_use_mem_handle() const { @@ -246,6 +277,7 @@ private: std::string _tensor_name; uint8_t *_buffer = nullptr; size_t _buffer_size = 0; + std::vector _buffer_storage; QNNBackend _device; std::shared_ptr _qnn_instance; Qnn_Tensor_t _qnn_tensor = qnn_tensor_init(kDefaultQnnTensorVersion); @@ -257,7 +289,7 @@ private: DISABLE_MOVE(ggml_qnn_tensor); }; -using ggml_qnn_tensor_ptr_t = std::shared_ptr; -using ggml_qnn_tensor_array_t = std::vector>; +using qnn_tensor_ptr_t = std::shared_ptr; +using qnn_tensor_array_t = std::vector; } // namespace qnn diff --git a/ggml/src/ggml-qnn/utils.cpp b/ggml/src/ggml-qnn/utils.cpp index 8ae375ffc8..ebfc037237 100644 --- a/ggml/src/ggml-qnn/utils.cpp +++ b/ggml/src/ggml-qnn/utils.cpp @@ -5,9 +5,11 @@ #include "ggml-qnn.h" +#include "QnnGraph.h" #include "qnn-types.hpp" #ifdef __linux__ +#include #include #endif @@ -148,11 +150,11 @@ const char *get_ggml_type_name(ggml_type type) { const char *get_backend_name(QNNBackend device_index) { switch (device_index) { case QNN_BACKEND_CPU: - return "QNN-CPU"; + return "qnn-cpu"; case QNN_BACKEND_GPU: - return "QNN-GPU"; + return "qnn-gpu"; case QNN_BACKEND_NPU: - return "QNN-NPU"; + return "qnn-npu"; case QNN_BACKEND_COUNT: default: return "unknown"; @@ -195,18 +197,7 @@ intptr_t align_to(size_t alignment, intptr_t offset) { : offset + (static_cast(alignment) - (offset % static_cast(alignment))); } -uint32_t get_ggml_tensor_data_size(const ggml_tensor *tensor) { - /* - size_t data_size = ggml_row_size(tensor->type, tensor->ne[0]); - size_t n_dims = qnn_get_ggml_tensor_rank(tensor); - for (int i = 1; i < n_dims; 
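// Aside: a self-contained restatement of the tensor-type mapping this hunk settles
// on. Only the PARAMETER -> STATIC and INTERMEDIATE/default -> NATIVE rows appear in
// the hunk above; the INPUT/OUTPUT rows are assumptions added for completeness, and
// all enum values here are illustrative stand-ins for the real QNN definitions.
#include <cstdio>

enum tensor_type_t { INPUT, OUTPUT, PARAMETER, INTERMEDIATE };
enum qnn_tensor_type_t { APP_WRITE, APP_READ, STATIC, NATIVE };

static qnn_tensor_type_t map_tensor_type(tensor_type_t type) {
    switch (type) {
        case INPUT:
            return APP_WRITE;    // assumed: host writes, graph reads
        case OUTPUT:
            return APP_READ;     // assumed: graph writes, host reads back on unbind()
        case PARAMETER:
            return STATIC;       // constant data baked into the graph
        case INTERMEDIATE:
        default:
            return NATIVE;       // internal tensor, no host buffer bound
    }
}

int main() {
    std::printf("%d %d %d %d\n", map_tensor_type(INPUT), map_tensor_type(OUTPUT),
                map_tensor_type(PARAMETER), map_tensor_type(INTERMEDIATE));
    return 0;
}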
i++) { - data_size *= tensor->ne[i]; - } - - return data_size; - */ - return ggml_nbytes(tensor); -} +uint32_t get_ggml_tensor_data_size(const ggml_tensor *tensor) { return ggml_nbytes(tensor); } void *align_alloc(size_t alignment, size_t size) { size_t size_aligned = size; @@ -248,6 +239,7 @@ const char *opname_from_ggmlop(enum ggml_op ggmlop) { const char *get_qnn_error_string(Qnn_ErrorHandle_t error) { // A complete list of error codes can be found at here: // https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/api_error_codes.html + thread_local static char error_code[128] = {}; switch (error) { case QNN_SUCCESS: return "QNN_SUCCESS"; @@ -277,6 +269,36 @@ const char *get_qnn_error_string(Qnn_ErrorHandle_t error) { return "QNN_GRAPH_ERROR_UNCONNECTED_NODE"; case QNN_GRAPH_ERROR_CREATE_FAILED: return "QNN_GRAPH_ERROR_CREATE_FAILED"; + case QNN_GRAPH_ERROR_OPTIMIZATION_FAILED: + return "QNN_GRAPH_ERROR_OPTIMIZATION_FAILED"; + case QNN_GRAPH_ERROR_FINALIZE_FAILED: + return "QNN_GRAPH_ERROR_FINALIZE_FAILED"; + case QNN_GRAPH_ERROR_GRAPH_NOT_FINALIZED: + return "QNN_GRAPH_ERROR_GRAPH_NOT_FINALIZED"; + case QNN_GRAPH_ERROR_GRAPH_FINALIZED: + return "QNN_GRAPH_ERROR_GRAPH_FINALIZED"; + case QNN_GRAPH_ERROR_EXECUTION_ASYNC_FIFO_FULL: + return "QNN_GRAPH_ERROR_EXECUTION_ASYNC_FIFO_FULL"; + case QNN_GRAPH_ERROR_SIGNAL_IN_USE: + return "QNN_GRAPH_ERROR_SIGNAL_IN_USE"; + case QNN_GRAPH_ERROR_ABORTED: + return "QNN_GRAPH_ERROR_ABORTED"; + case QNN_GRAPH_ERROR_PROFILE_IN_USE: + return "QNN_GRAPH_ERROR_PROFILE_IN_USE"; + case QNN_GRAPH_ERROR_TIMED_OUT: + return "QNN_GRAPH_ERROR_TIMED_OUT"; + case QNN_GRAPH_ERROR_SUBGRAPH: + return "QNN_GRAPH_ERROR_SUBGRAPH"; + case QNN_GRAPH_ERROR_DISABLED: + return "QNN_GRAPH_ERROR_DISABLED"; + case QNN_GRAPH_ERROR_DYNAMIC_TENSOR_SHAPE: + return "QNN_GRAPH_ERROR_DYNAMIC_TENSOR_SHAPE"; + case QNN_GRAPH_ERROR_TENSOR_SPARSITY: + return "QNN_GRAPH_ERROR_TENSOR_SPARSITY"; + case QNN_GRAPH_ERROR_EARLY_TERMINATION: + return "QNN_GRAPH_ERROR_EARLY_TERMINATION"; + case QNN_GRAPH_ERROR_INVALID_CONTEXT: + return "QNN_GRAPH_ERROR_INVALID_CONTEXT"; // QnnOpPackage_Error_t case QNN_OP_PACKAGE_ERROR_LIBRARY_ALREADY_INITIALIZED: @@ -294,19 +316,34 @@ const char *get_qnn_error_string(Qnn_ErrorHandle_t error) { case QNN_OP_PACKAGE_ERROR_INVALID_ARGUMENT: return "QNN_OP_PACKAGE_ERROR_INVALID_ARGUMENT"; default: - return nullptr; + if (error >= QNN_GRAPH_MIN_ERROR && error < QNN_GRAPH_MAX_ERROR) { + snprintf(error_code, sizeof(error_code), "UNKNOWN_GRAPH_ERROR_%d", int(error - QNN_GRAPH_MIN_ERROR)); + } else { + snprintf(error_code, sizeof(error_code), "%d", int(error)); + } + return error_code; } } #ifdef __linux__ size_t get_system_total_memory_in_bytes() { + struct sysinfo info = {}; + if (sysinfo(&info) == 0) { + return (info.totalram + info.totalswap) * info.mem_unit; + } + auto pages = (size_t)sysconf(_SC_PHYS_PAGES); auto page_size = (size_t)sysconf(_SC_PAGE_SIZE); return pages * page_size; } size_t get_system_free_memory_in_bytes() { + struct sysinfo info = {}; + if (sysinfo(&info) == 0) { + return (info.freeram + info.freeswap) * info.mem_unit; + } + auto avail_pages = (size_t)sysconf(_SC_AVPHYS_PAGES); auto page_size = (size_t)sysconf(_SC_PAGE_SIZE); return avail_pages * page_size;
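// Aside: the new memory probes above prefer sysinfo() (RAM plus swap, scaled by
// mem_unit) and keep the sysconf() page counting as the fallback; the other change
// in this hunk returns a thread_local scratch buffer from get_qnn_error_string()
// so unknown error codes can be formatted without data races. A self-contained,
// Linux-only restatement of the memory probes:
#include <cstddef>
#include <cstdio>
#include <sys/sysinfo.h>
#include <unistd.h>

static std::size_t total_memory_bytes() {
    struct sysinfo info = {};
    if (sysinfo(&info) == 0) {
        return (std::size_t)(info.totalram + info.totalswap) * info.mem_unit;
    }
    // fallback if sysinfo() fails
    return (std::size_t)sysconf(_SC_PHYS_PAGES) * (std::size_t)sysconf(_SC_PAGE_SIZE);
}

static std::size_t free_memory_bytes() {
    struct sysinfo info = {};
    if (sysinfo(&info) == 0) {
        return (std::size_t)(info.freeram + info.freeswap) * info.mem_unit;
    }
    return (std::size_t)sysconf(_SC_AVPHYS_PAGES) * (std::size_t)sysconf(_SC_PAGE_SIZE);
}

int main() {
    std::printf("total: %zuMB, free: %zuMB\n",
                total_memory_bytes() / 1048576, free_memory_bytes() / 1048576);
    return 0;
}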