#include "backend-ops.hpp" #include "utils.hpp" #include "logger.hpp" #include "tensor.hpp" static bool qnn_is_valid_params(ggml_backend_qnn_context* ctx, const ggml_tensor* src0, const ggml_tensor* src1, ggml_tensor* dst) { if ((nullptr == ctx) || (nullptr == src0) || (nullptr == src1) || (nullptr == dst)) { QNN_LOG_WARN("invalid params\n"); return false; } qnn::qnn_instance* instance = nullptr; Qnn_Tensor_t* tensor_0 = nullptr; Qnn_Tensor_t* tensor_1 = nullptr; Qnn_Tensor_t* tensor_2 = nullptr; tensor_0 = (Qnn_Tensor_t*)src0->extra; tensor_1 = (Qnn_Tensor_t*)src1->extra; tensor_2 = (Qnn_Tensor_t*)dst->extra; instance = ctx->instance; if ((nullptr == instance) || (nullptr == tensor_0) || (nullptr == tensor_1) || (nullptr == tensor_2)) { QNN_LOG_WARN("invalid params\n"); return false; } return true; } #ifndef NDEBUG #define CHECK_PARAMS(ctx, src0, src1, dst) \ do { \ if (!qnn_is_valid_params((ctx), (src0), (src1), (dst))) { \ return; \ } \ } while (0) #else #define CHECK_PARAMS(ctx, src0, src1, dst) #endif //TODO: this function can be removed later because there are duplicated codes with ggml_qnn_mul_mat // keep it for illustrate how to implement a specified GGMPL OP using QNN API + QNN RPC static void ggml_qnn_add(ggml_backend_qnn_context* ctx, const ggml_tensor* src0, const ggml_tensor* src1, ggml_tensor* dst) { Qnn_ErrorHandle_t error = QNN_SUCCESS; bool graph_initialized = false; qnn::qnn_instance* instance = nullptr; std::string graph_name = "ggml_op_qnn_add"; Qnn_GraphHandle_t graph_handle = nullptr; Qnn_Param_t qnn_params[] = {}; enum ggml_op ggmlop = GGML_OP_ADD; CHECK_PARAMS(ctx, src0, src1, dst); instance = ctx->instance; auto qnn_raw_interface = ctx->raw_interface; qnn::qnn_perf perf("ggml_qnn_add"); perf.start(); std::string map_entry = std::string(ggml_op_name(ggmlop)); if (instance->_qnn_graph_map.find(map_entry) != instance->_qnn_graph_map.end()) { graph_initialized = true; auto& graph_item = instance->_qnn_graph_map[map_entry]; graph_handle = std::get<0>(graph_item); } if (!graph_initialized) { graph_name = graph_name + "_" + std::to_string(ctx->threads) + "_" + src0->name + "_" + src1->name; QNN_LOG_INFO("graph name %s", graph_name.c_str()); if (ctx->device == QNN_BACKEND_NPU) { QnnHtpGraph_CustomConfig_t hvx_config; hvx_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_NUM_HVX_THREADS; hvx_config.numHvxThreads = 8; QnnGraph_Config_t graph_hvx_config; graph_hvx_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; graph_hvx_config.customConfig = &hvx_config; QnnHtpGraph_CustomConfig_t dlbc_config; dlbc_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION; dlbc_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_DLBC; dlbc_config.optimizationOption.floatValue = 1.0; // set to 0.0 to turn off DLBC QnnGraph_Config_t graph_dlbc_config; graph_dlbc_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; graph_dlbc_config.customConfig = &dlbc_config; QnnHtpGraph_CustomConfig_t opt_config; opt_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG; opt_config.optimizationOption.floatValue = 1; // 1 / 3 QnnGraph_Config_t graph_opt_config; graph_opt_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; graph_opt_config.customConfig = &opt_config; QnnHtpGraph_CustomConfig_t vtcm_config; vtcm_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE; vtcm_config.vtcmSizeInMB = ctx->socinfo.vtcm_size_in_mb; QnnGraph_Config_t graph_vtcm_config; graph_vtcm_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; graph_vtcm_config.customConfig = &vtcm_config; const 
//TODO: this function can be removed later because it duplicates code in ggml_qnn_mul_mat;
//      keep it for now to illustrate how to implement a specific GGML op using the QNN API + QNN RPC
static void ggml_qnn_add(ggml_backend_qnn_context* ctx, const ggml_tensor* src0,
                         const ggml_tensor* src1, ggml_tensor* dst) {
    Qnn_ErrorHandle_t error = QNN_SUCCESS;
    bool graph_initialized = false;
    qnn::qnn_instance* instance = nullptr;
    std::string graph_name = "ggml_op_qnn_add";
    Qnn_GraphHandle_t graph_handle = nullptr;
    Qnn_Param_t qnn_params[] = {};
    enum ggml_op ggmlop = GGML_OP_ADD;

    CHECK_PARAMS(ctx, src0, src1, dst);
    instance = ctx->instance;
    auto qnn_raw_interface = ctx->raw_interface;

    qnn::qnn_perf perf("ggml_qnn_add");
    perf.start();

    std::string map_entry = std::string(ggml_op_name(ggmlop));
    if (instance->_qnn_graph_map.find(map_entry) != instance->_qnn_graph_map.end()) {
        graph_initialized = true;
        auto& graph_item = instance->_qnn_graph_map[map_entry];
        graph_handle = std::get<0>(graph_item);
    }

    if (!graph_initialized) {
        graph_name = graph_name + "_" + std::to_string(ctx->threads) + "_" + src0->name + "_" + src1->name;
        QNN_LOG_INFO("graph name %s", graph_name.c_str());
        if (ctx->device == QNN_BACKEND_NPU) {
            QnnHtpGraph_CustomConfig_t hvx_config;
            hvx_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_NUM_HVX_THREADS;
            hvx_config.numHvxThreads = 8;
            QnnGraph_Config_t graph_hvx_config;
            graph_hvx_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM;
            graph_hvx_config.customConfig = &hvx_config;

            QnnHtpGraph_CustomConfig_t dlbc_config;
            dlbc_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION;
            dlbc_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_DLBC;
            dlbc_config.optimizationOption.floatValue = 1.0; // set to 0.0 to turn off DLBC
            QnnGraph_Config_t graph_dlbc_config;
            graph_dlbc_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM;
            graph_dlbc_config.customConfig = &dlbc_config;

            QnnHtpGraph_CustomConfig_t opt_config;
            opt_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG;
            opt_config.optimizationOption.floatValue = 1; // 1 / 3
            QnnGraph_Config_t graph_opt_config;
            graph_opt_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM;
            graph_opt_config.customConfig = &opt_config;

            QnnHtpGraph_CustomConfig_t vtcm_config;
            vtcm_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE;
            vtcm_config.vtcmSizeInMB = ctx->socinfo.vtcm_size_in_mb;
            QnnGraph_Config_t graph_vtcm_config;
            graph_vtcm_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM;
            graph_vtcm_config.customConfig = &vtcm_config;

            // the config list passed to graphCreate is null-terminated
            const QnnGraph_Config_t* p_graphconfig[] = { &graph_hvx_config, &graph_dlbc_config,
                                                         &graph_vtcm_config, &graph_opt_config,
                                                         nullptr };
            error = qnn_raw_interface.graphCreate(instance->get_qnn_context_handle(),
                                                  graph_name.c_str(), p_graphconfig, &graph_handle);
        } else {
            error = qnn_raw_interface.graphCreate(instance->get_qnn_context_handle(),
                                                  graph_name.c_str(), nullptr, &graph_handle);
        }
        if (QNN_SUCCESS != error) {
            QNN_LOG_INFO("can't create qnn graph handle with graph name %s, error = %d\n",
                         graph_name.c_str(), error);
            goto failure;
        } else {
            QNN_LOG_INFO("create qnn graph handle with graph name %s ok\n", graph_name.c_str());
        }

        qnn::ggml_qnn_tensor_input tensor_input0(src0, graph_handle, ctx);
        if (!tensor_input0.is_valid()) {
            goto failure;
        }
        qnn::ggml_qnn_tensor_input tensor_input1(src1, graph_handle, ctx);
        if (!tensor_input1.is_valid()) {
            QNN_LOG_INFO("error = %d\n", error);
            goto failure;
        }
        qnn::ggml_qnn_tensor_output tensor_output(dst, graph_handle, ctx);
        if (!tensor_output.is_valid()) {
            goto failure;
        }

        Qnn_Tensor_t tensor_inputs[] = { *tensor_input0.get_qnn_tensor(),
                                         *tensor_input1.get_qnn_tensor() };
        Qnn_Tensor_t tensor_outputs[] = { *tensor_output.get_qnn_tensor() };
        Qnn_OpConfig_t op_config = { (Qnn_OpConfigVersion_t)1,
                                     .v1 = { "ggml_op_add", QNN_OP_PACKAGE_NAME_QTI_AISW,
                                             QNN_OP_ELEMENT_WISE_ADD, 0, qnn_params,
                                             2, tensor_inputs, 1, tensor_outputs } };
        error = qnn_raw_interface.graphAddNode(graph_handle, op_config);
        if (QNN_SUCCESS != error) {
            QNN_LOG_INFO("error = %d\n", error);
            goto failure;
        }
        error = qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr);
        if (QNN_SUCCESS != error) {
            QNN_LOG_INFO("error = %d\n", error);
            goto failure;
        }
        error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2,
                                               tensor_outputs, 1, nullptr, nullptr);
        if (ctx->device == QNN_BACKEND_NPU) {
            if (QNN_COMMON_ERROR_SYSTEM_COMMUNICATION == error) {
                // SSR: SubSystem Restart of the DSP
                QNN_LOG_WARN("NPU crashed. SSR detected. Caused QNN graph execute error\n");
            }
        }
        if (QNN_SUCCESS != error) {
            QNN_LOG_INFO("error = %d\n", error);
            goto failure;
        }

        auto graph_item = std::make_tuple(graph_handle, tensor_input0.get_qnn_tensor(),
                                          tensor_input1.get_qnn_tensor(),
                                          tensor_output.get_qnn_tensor());
        instance->_qnn_graph_map[map_entry] = graph_item;
    } else {
        auto& graph_item = instance->_qnn_graph_map[map_entry];
        qnn::ggml_qnn_tensor_input tensor_input0(src0, std::get<1>(graph_item), ctx);
        qnn::ggml_qnn_tensor_input tensor_input1(src1, std::get<2>(graph_item), ctx);
        qnn::ggml_qnn_tensor_output tensor_output(dst, std::get<3>(graph_item), ctx);

        Qnn_Tensor_t tensor_inputs[] = { *tensor_input0.get_qnn_tensor(),
                                         *tensor_input1.get_qnn_tensor() };
        Qnn_Tensor_t tensor_outputs[] = { *tensor_output.get_qnn_tensor() };
        error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2,
                                               tensor_outputs, 1, nullptr, nullptr);
        if (ctx->device == QNN_BACKEND_NPU) {
            if (QNN_COMMON_ERROR_SYSTEM_COMMUNICATION == error) {
                QNN_LOG_WARN("NPU crashed. SSR detected. Caused QNN graph execute error\n");
            }
        }
        if (QNN_SUCCESS != error) {
            QNN_LOG_INFO("error = %d\n", error);
            goto failure;
        }
    }

failure:
    if (QNN_SUCCESS != error) {
        QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64
                      ", nb = (%5zi, %5zi, %5zi)\n",
                      src0->name, src0->type, ggml_type_name(src0->type),
                      src0->ne[0], src0->ne[1], src0->ne[2],
                      src0->nb[0], src0->nb[1], src0->nb[2]);
        QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64
                      ", nb = (%5zi, %5zi, %5zi)\n",
                      src1->name, src1->type, ggml_type_name(src1->type),
                      src1->ne[0], src1->ne[1], src1->ne[2],
                      src1->nb[0], src1->nb[1], src1->nb[2]);
        QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64
                      ", nb = (%5zi, %5zi, %5zi)\n",
                      dst->name, dst->type, ggml_type_name(dst->type),
                      dst->ne[0], dst->ne[1], dst->ne[2],
                      dst->nb[0], dst->nb[1], dst->nb[2]);
    }

    perf.info();
}
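// A note on the caching scheme above: _qnn_graph_map is keyed only by the GGML
// op name (e.g. "ADD"), so the first invocation builds and finalizes the QNN
// graph, and every later invocation of the same op rebinds the ggml tensors to
// the cached graph handle and re-executes it. This presumably assumes later
// calls use compatible tensor shapes; ggml_qnn_mul_mat below follows the same
// pattern.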
/*
 * ggml_qnn_mul_mat was re-added as a standalone function because of
 * the following comments from https://github.com/ggerganov/llama.cpp/pull/1632:
 *
 * MUL_MAT takes most of the compute time (about 95%),
 * so to speed up llama, we have to focus on MUL_MAT.
 *
 * We have three kinds of MUL_MAT to compute:
 * mul_mat_f32:     both src0 and src1 are F32.
 * mul_mat_f16_f32: src0 is F16 and src1 is F32.
 * mul_mat_q_f32:   src0 is quantized (Q4_0, Q4_1, ...) and src1 is F32.
 */
static void ggml_qnn_mul_mat(ggml_backend_qnn_context* ctx, const ggml_tensor* src0,
                             const ggml_tensor* src1, ggml_tensor* dst) {
    Qnn_ErrorHandle_t error = QNN_SUCCESS;
    bool graph_initialized = false;
    qnn::qnn_instance* instance = nullptr;
    std::string graph_name = "ggml_op_qnn_mul_mat";
    Qnn_GraphHandle_t graph_handle = nullptr;
    Qnn_Param_t qnn_params[] = {};
    enum ggml_op ggmlop = GGML_OP_MUL_MAT;

    CHECK_PARAMS(ctx, src0, src1, dst);
    instance = ctx->instance;
    auto qnn_raw_interface = ctx->raw_interface;

    qnn::qnn_perf perf("ggml_qnn_mul_mat");
    perf.start();

    std::string map_entry = std::string(ggml_op_name(ggmlop));
    if (instance->_qnn_graph_map.find(map_entry) != instance->_qnn_graph_map.end()) {
        graph_initialized = true;
        auto& graph_item = instance->_qnn_graph_map[map_entry];
        graph_handle = std::get<0>(graph_item);
    }

    //TODO: for scenarios with quantized data in src0:
    //      pass-1: dequantize src0 to FP32
    //      pass-2: dq-src0 * src1
    //      the overall performance gain is worthwhile although pass-1 adds some overhead
    if (!graph_initialized) {
        graph_name = graph_name + "_" + std::to_string(ctx->threads) + "_" + src0->name + "_" + src1->name;
        QNN_LOG_INFO("graph name %s", graph_name.c_str());
        if (ctx->device == QNN_BACKEND_NPU) {
            QnnHtpGraph_CustomConfig_t hvx_config;
            hvx_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_NUM_HVX_THREADS;
            hvx_config.numHvxThreads = 8;
            QnnGraph_Config_t graph_hvx_config;
            graph_hvx_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM;
            graph_hvx_config.customConfig = &hvx_config;

            QnnHtpGraph_CustomConfig_t dlbc_config;
            dlbc_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION;
            dlbc_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_DLBC;
            dlbc_config.optimizationOption.floatValue = 1.0; // set to 0.0 to turn off DLBC
            QnnGraph_Config_t graph_dlbc_config;
            graph_dlbc_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM;
            graph_dlbc_config.customConfig = &dlbc_config;

            QnnHtpGraph_CustomConfig_t opt_config;
            opt_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG;
            opt_config.optimizationOption.floatValue = 1; // 1 / 3
            QnnGraph_Config_t graph_opt_config;
            graph_opt_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM;
            graph_opt_config.customConfig = &opt_config;
            QnnHtpGraph_CustomConfig_t vtcm_config;
            vtcm_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE;
            vtcm_config.vtcmSizeInMB = ctx->socinfo.vtcm_size_in_mb;
            QnnGraph_Config_t graph_vtcm_config;
            graph_vtcm_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM;
            graph_vtcm_config.customConfig = &vtcm_config;

            const QnnGraph_Config_t* p_graphconfig[] = { &graph_hvx_config, &graph_dlbc_config,
                                                         &graph_vtcm_config, &graph_opt_config,
                                                         nullptr };
            error = qnn_raw_interface.graphCreate(instance->get_qnn_context_handle(),
                                                  graph_name.c_str(), p_graphconfig, &graph_handle);
        } else {
            error = qnn_raw_interface.graphCreate(instance->get_qnn_context_handle(),
                                                  graph_name.c_str(), nullptr, &graph_handle);
        }
        if (QNN_SUCCESS != error) {
            QNN_LOG_INFO("can't create qnn graph handle with graph name %s, error = %d\n",
                         graph_name.c_str(), error);
            goto failure;
        }

        qnn::ggml_qnn_tensor_input tensor_input0(src0, graph_handle, ctx);
        if (!tensor_input0.is_valid()) {
            goto failure;
        }
        qnn::ggml_qnn_tensor_input tensor_input1(src1, graph_handle, ctx);
        if (!tensor_input1.is_valid()) {
            goto failure;
        }
        qnn::ggml_qnn_tensor_output tensor_output(dst, graph_handle, ctx);
        if (!tensor_output.is_valid()) {
            goto failure;
        }

        Qnn_Tensor_t tensor_inputs[] = { *tensor_input0.get_qnn_tensor(),
                                         *tensor_input1.get_qnn_tensor() };
        Qnn_Tensor_t tensor_outputs[] = { *tensor_output.get_qnn_tensor() };
        Qnn_OpConfig_t op_config = { (Qnn_OpConfigVersion_t)1,
                                     .v1 = { "ggml_op_mul_mat", QNN_OP_PACKAGE_NAME_QTI_AISW,
                                             QNN_OP_MAT_MUL, 0, qnn_params,
                                             2, tensor_inputs, 1, tensor_outputs } };
        error = qnn_raw_interface.graphAddNode(graph_handle, op_config);
        if (QNN_SUCCESS != error) {
            QNN_LOG_INFO("error = %d\n", error);
            goto failure;
        }
        error = qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr);
        if (QNN_SUCCESS != error) {
            QNN_LOG_INFO("error = %d\n", error);
            goto failure;
        }
        error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2,
                                               tensor_outputs, 1, nullptr, nullptr);
        if (ctx->device == QNN_BACKEND_NPU) {
            if (QNN_COMMON_ERROR_SYSTEM_COMMUNICATION == error) {
                QNN_LOG_WARN("NPU crashed. SSR detected. Caused QNN graph execute error\n");
            }
        }
        if (QNN_SUCCESS != error) {
            QNN_LOG_INFO("error = %d\n", error);
            goto failure;
        }

        auto graph_item = std::make_tuple(graph_handle, tensor_input0.get_qnn_tensor(),
                                          tensor_input1.get_qnn_tensor(),
                                          tensor_output.get_qnn_tensor());
        instance->_qnn_graph_map[map_entry] = graph_item;
    } else {
        auto& graph_item = instance->_qnn_graph_map[map_entry];
        qnn::ggml_qnn_tensor_input tensor_input0(src0, std::get<1>(graph_item), ctx);
        qnn::ggml_qnn_tensor_input tensor_input1(src1, std::get<2>(graph_item), ctx);
        qnn::ggml_qnn_tensor_output tensor_output(dst, std::get<3>(graph_item), ctx);

        Qnn_Tensor_t tensor_inputs[] = { *tensor_input0.get_qnn_tensor(),
                                         *tensor_input1.get_qnn_tensor() };
        Qnn_Tensor_t tensor_outputs[] = { *tensor_output.get_qnn_tensor() };
        error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2,
                                               tensor_outputs, 1, nullptr, nullptr);
        if (ctx->device == QNN_BACKEND_NPU) {
            if (QNN_COMMON_ERROR_SYSTEM_COMMUNICATION == error) {
                QNN_LOG_WARN("NPU crashed. SSR detected. Caused QNN graph execute error\n");
            }
        }
        if (QNN_SUCCESS != error) {
            QNN_LOG_INFO("error = %d\n", error);
            goto failure;
        }
    }

failure:
    if (QNN_SUCCESS != error) {
        QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64
                      ", nb = (%5zi, %5zi, %5zi)\n",
                      src0->name, src0->type, ggml_type_name(src0->type),
                      src0->ne[0], src0->ne[1], src0->ne[2],
                      src0->nb[0], src0->nb[1], src0->nb[2]);
        QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64
                      ", nb = (%5zi, %5zi, %5zi)\n",
                      src1->name, src1->type, ggml_type_name(src1->type),
                      src1->ne[0], src1->ne[1], src1->ne[2],
                      src1->nb[0], src1->nb[1], src1->nb[2]);
        QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64
                      ", nb = (%5zi, %5zi, %5zi)\n",
                      dst->name, dst->type, ggml_type_name(dst->type),
                      dst->ne[0], dst->ne[1], dst->ne[2],
                      dst->nb[0], dst->nb[1], dst->nb[2]);
    }

    perf.info();
}
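// The functions below are placeholders for ops that are not implemented yet.
// Their entries in the dispatch table at the bottom of this file are nullptr,
// so the backend never routes work to them; only GGML_OP_ADD and
// GGML_OP_MUL_MAT are currently wired up.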
static void ggml_qnn_repeat(ggml_backend_qnn_context* ctx, const ggml_tensor* src0,
                            const ggml_tensor* src1, ggml_tensor* dst) {}

static void ggml_qnn_get_rows(ggml_backend_qnn_context* ctx, const ggml_tensor* src0,
                              const ggml_tensor* src1, ggml_tensor* dst) {}

static void ggml_qnn_acc(ggml_backend_qnn_context* ctx, const ggml_tensor* src0,
                         const ggml_tensor* src1, ggml_tensor* dst) {}

static void ggml_qnn_div(ggml_backend_qnn_context* ctx, const ggml_tensor* src0,
                         const ggml_tensor* src1, ggml_tensor* dst) {}

static void ggml_qnn_gelu(ggml_backend_qnn_context* ctx, const ggml_tensor* src0,
                          const ggml_tensor* src1, ggml_tensor* dst) {}

static void ggml_qnn_silu(ggml_backend_qnn_context* ctx, const ggml_tensor* src0,
                          const ggml_tensor* src1, ggml_tensor* dst) {}

static void ggml_qnn_gelu_quick(ggml_backend_qnn_context* ctx, const ggml_tensor* src0,
                                const ggml_tensor* src1, ggml_tensor* dst) {}

static void ggml_qnn_tanh(ggml_backend_qnn_context* ctx, const ggml_tensor* src0,
                          const ggml_tensor* src1, ggml_tensor* dst) {}

static void ggml_qnn_relu(ggml_backend_qnn_context* ctx, const ggml_tensor* src0,
                          const ggml_tensor* src1, ggml_tensor* dst) {}

static void ggml_qnn_hardsigmoid(ggml_backend_qnn_context* ctx, const ggml_tensor* src0,
                                 const ggml_tensor* src1, ggml_tensor* dst) {}

static void ggml_qnn_hardswish(ggml_backend_qnn_context* ctx, const ggml_tensor* src0,
                               const ggml_tensor* src1, ggml_tensor* dst) {}

static void ggml_qnn_leaky_relu(ggml_backend_qnn_context* ctx, const ggml_tensor* src0,
                                const ggml_tensor* src1, ggml_tensor* dst) {}

static void ggml_qnn_sqr(ggml_backend_qnn_context* ctx, const ggml_tensor* src0,
                         const ggml_tensor* src1, ggml_tensor* dst) {}

static void ggml_qnn_norm(ggml_backend_qnn_context* ctx, const ggml_tensor* src0,
                          const ggml_tensor* src1, ggml_tensor* dst) {}

static void ggml_qnn_group_norm(ggml_backend_qnn_context* ctx, const ggml_tensor* src0,
                                const ggml_tensor* src1, ggml_tensor* dst) {}

static void ggml_qnn_concat(ggml_backend_qnn_context* ctx, const ggml_tensor* src0,
                            const ggml_tensor* src1, ggml_tensor* dst) {}

static void ggml_qnn_upscale(ggml_backend_qnn_context* ctx, const ggml_tensor* src0,
                             const ggml_tensor* src1, ggml_tensor* dst) {}

static void ggml_qnn_pad(ggml_backend_qnn_context* ctx, const ggml_tensor* src0,
                         const ggml_tensor* src1, ggml_tensor* dst) {}

static void ggml_qnn_rms_norm(ggml_backend_qnn_context* ctx, const ggml_tensor* src0,
                              const ggml_tensor* src1, ggml_tensor* dst) {}

static void ggml_qnn_cpy(ggml_backend_qnn_context* ctx, const ggml_tensor* src0,
                         const ggml_tensor* src1, ggml_tensor* dst) {}

static void ggml_qnn_dup(ggml_backend_qnn_context* ctx, const ggml_tensor* src0,
                         const ggml_tensor* src1, ggml_tensor* dst) {
    ggml_qnn_cpy(ctx, src0, dst, nullptr);
    (void)src1;
}

static void ggml_qnn_mul_mat_id(ggml_backend_qnn_context* ctx, const ggml_tensor* src0,
                                const ggml_tensor* src1, ggml_tensor* dst) {}

static void ggml_qnn_scale(ggml_backend_qnn_context* ctx, const ggml_tensor* src0,
                           const ggml_tensor* src1, ggml_tensor* dst) {}

static void ggml_qnn_clamp(ggml_backend_qnn_context* ctx, const ggml_tensor* src0,
                           const ggml_tensor* src1, ggml_tensor* dst) {}

static void ggml_qnn_diag_mask_inf(ggml_backend_qnn_context* ctx, const ggml_tensor* src0,
                                   const ggml_tensor* src1, ggml_tensor* dst) {}

static void ggml_qnn_soft_max(ggml_backend_qnn_context* ctx, const ggml_tensor* src0,
                              const ggml_tensor* src1, ggml_tensor* dst) {}

static void ggml_qnn_rope(ggml_backend_qnn_context* ctx, const ggml_tensor* src0,
                          const ggml_tensor* src1, ggml_tensor* dst) {
    GGML_ASSERT(ggml_is_contiguous(src0));
}

static void ggml_qnn_pool2d(ggml_backend_qnn_context* ctx, const ggml_tensor* src0,
                            const ggml_tensor* src1, ggml_tensor* dst) {}

static void ggml_qnn_im2col(ggml_backend_qnn_context* ctx, const ggml_tensor* src0,
                            const ggml_tensor* src1, ggml_tensor* dst) {}

static void ggml_qnn_sum_rows(ggml_backend_qnn_context* ctx, const ggml_tensor* src0,
                              const ggml_tensor* src1, ggml_tensor* dst) {
    GGML_ASSERT(ggml_is_contiguous(src0));
}

static void ggml_qnn_argsort(ggml_backend_qnn_context* ctx, const ggml_tensor* src0,
                             const ggml_tensor* src1, ggml_tensor* dst) {
    GGML_ASSERT(ggml_is_contiguous(src0));
}

static void ggml_qnn_nop(ggml_backend_qnn_context* ctx, const ggml_tensor* src0,
                         const ggml_tensor* src1, ggml_tensor* dst) {
    (void)src0;
    (void)src1;
    (void)dst;
}
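// The dispatch table below is sized GGML_OP_COUNT and is meant to be indexed
// directly by a tensor's ggml_op value, so its entries must stay in the exact
// order of the ggml_op enum; the per-entry comments record which op each slot
// corresponds to.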
qnn::ggml_qnn_op_array_t qnn::ggml_qnn_op_array() {
    static constexpr const qnn::ggml_qnn_op_t kQnnOpsTable[GGML_OP_COUNT] = {
        nullptr,          // GGML_OP_NONE
        nullptr,          // GGML_OP_DUP
        ggml_qnn_add,     // GGML_OP_ADD
        nullptr,          // GGML_OP_ADD1
        nullptr,          // GGML_OP_ACC
        nullptr,          // GGML_OP_SUB
        nullptr,          // GGML_OP_MUL
        nullptr,          // GGML_OP_DIV
        nullptr,          // GGML_OP_SQR
        nullptr,          // GGML_OP_SQRT
        nullptr,          // GGML_OP_LOG
        nullptr,          // GGML_OP_SUM
        nullptr,          // GGML_OP_SUM_ROWS
        nullptr,          // GGML_OP_MEAN
        nullptr,          // GGML_OP_ARGMAX
        nullptr,          // GGML_OP_REPEAT
        nullptr,          // GGML_OP_REPEAT_BACK
        nullptr,          // GGML_OP_CONCAT
        nullptr,          // GGML_OP_SILU_BACK
        nullptr,          // GGML_OP_NORM
        nullptr,          // GGML_OP_RMS_NORM
        nullptr,          // GGML_OP_RMS_NORM_BACK
        nullptr,          // GGML_OP_GROUP_NORM
        ggml_qnn_mul_mat, // GGML_OP_MUL_MAT
        nullptr,          // GGML_OP_MUL_MAT_ID
        nullptr,          // GGML_OP_OUT_PROD
        nullptr,          // GGML_OP_SCALE
        nullptr,          // GGML_OP_SET
        nullptr,          // GGML_OP_CPY
        nullptr,          // GGML_OP_CONT
        nullptr,          // GGML_OP_RESHAPE
        nullptr,          // GGML_OP_VIEW
        nullptr,          // GGML_OP_PERMUTE
        nullptr,          // GGML_OP_TRANSPOSE
        nullptr,          // GGML_OP_GET_ROWS
        nullptr,          // GGML_OP_GET_ROWS_BACK
        nullptr,          // GGML_OP_DIAG
        nullptr,          // GGML_OP_DIAG_MASK_INF
        nullptr,          // GGML_OP_DIAG_MASK_ZERO
        nullptr,          // GGML_OP_SOFT_MAX
        nullptr,          // GGML_OP_SOFT_MAX_BACK
        nullptr,          // GGML_OP_ROPE
        nullptr,          // GGML_OP_ROPE_BACK
        nullptr,          // GGML_OP_CLAMP
        nullptr,          // GGML_OP_CONV_TRANSPOSE_1D
        nullptr,          // GGML_OP_IM2COL
        nullptr,          // GGML_OP_CONV_TRANSPOSE_2D
        nullptr,          // GGML_OP_POOL_1D
        nullptr,          // GGML_OP_POOL_2D
        nullptr,          // GGML_OP_UPSCALE
        nullptr,          // GGML_OP_PAD
        nullptr,          // GGML_OP_ARANGE
        nullptr,          // GGML_OP_TIMESTEP_EMBEDDING
        nullptr,          // GGML_OP_ARGSORT
        nullptr,          // GGML_OP_LEAKY_RELU
        nullptr,          // GGML_OP_FLASH_ATTN_EXT
        nullptr,          // GGML_OP_FLASH_ATTN_BACK
        nullptr,          // GGML_OP_SSM_CONV
        nullptr,          // GGML_OP_SSM_SCAN
        nullptr,          // GGML_OP_WIN_PART
        nullptr,          // GGML_OP_WIN_UNPART
        nullptr,          // GGML_OP_GET_REL_POS
        nullptr,          // GGML_OP_ADD_REL_POS
        nullptr,          // GGML_OP_UNARY
        nullptr,          // GGML_OP_MAP_UNARY
        nullptr,          // GGML_OP_MAP_BINARY
        nullptr,          // GGML_OP_MAP_CUSTOM1_F32
        nullptr,          // GGML_OP_MAP_CUSTOM2_F32
        nullptr,          // GGML_OP_MAP_CUSTOM3_F32
        nullptr,          // GGML_OP_MAP_CUSTOM1
        nullptr,          // GGML_OP_MAP_CUSTOM2
        nullptr,          // GGML_OP_MAP_CUSTOM3
        nullptr,          // GGML_OP_CROSS_ENTROPY_LOSS
        nullptr,          // GGML_OP_CROSS_ENTROPY_LOSS_BACK
    };

    return kQnnOpsTable;
}
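// Usage sketch (hypothetical caller, not part of this file): a backend's
// graph-compute loop could dispatch through the table roughly like this,
// assuming ggml_qnn_op_array_t is indexable by a ggml_op value:
//
//   if (auto op_func = qnn::ggml_qnn_op_array()[tensor->op]) {
//       op_func(ctx, tensor->src[0], tensor->src[1], tensor);
//   }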