[feat] add more op (#35)
* move op key generate function to kOpCaps
* fix op desc print
* try fix rms_norm
* Revert "try fix rms_norm". This reverts commit 33b296098012909cb482fc29b52b28098dc971cd.
* add quantization type support by converting them to float
* enable quantization tensor for mulmat in gpu/npu
* fix asan error
* add log and assert
* insert output convert operator after mulmat
* add log
* fix some error in running
* disable permute again
* add log
* add error function
* Revert "add error function". This reverts commit f92ff47798ac8053fb776c55efbb1a98469c7af1.
* add log
* more log
* disable convert op in graph
* wip
* add f16 config for graph
* set f16 precision for f16 graph
* fix override data type
* add comment
* add config flag to enable quantize type
* add log
* more quantized type for cpu and gpu backend
* enable all quant types for cpu and gpu backend
* rename
* wip
* add log
* remove unused functions
* skip permute
* remove get_qnn_op_input_param_count
* fallback to generic_get_op_desc if no op_desc
* revert 'skip permute'
* Revert "revert 'skip permute'". This reverts commit 5761e31fd23c69c4cabf6fd9fac1a0d3e5a74968.
* wip
* add log
* print qnn tensor type
* add log
* limit the max size of tensor
* add log
* fix tensor size limiter
* small improve on tensor info printer
* disable sqrt and div to pass test-backend-ops for 8 gen 2
* remove debug log in release build
* add log
* skip permute in src
* wip
* disable reshape
* skip mul at decoder start
* wip
* add log
* add qnn_scoped_timer
* add perf tracker in graph
* add cmake options GGML_QNN_ENABLE_PERFORMANCE_TRACKING
* fix flag name
* use milli-second
* wip
* fix comment string
* add file for profiler
* change qnn-cpu to GGML_BACKEND_DEVICE_TYPE_ACCEL, so that we can run tests on cpu
* wip
* profiler: refactoring
* wip
* add implement for print_profile_events
* set-up profiler for graph
* set profiler to graph execute
* pretty print events
* unified log print prefix
* print event count
* enable optrace
* print duration at event end
* wip
* add more detailed soc information
* wip
* move device caps array into qnn-lib.cpp
* remove lib_name in device_context
* move get_graph_key_from_cgraph to graph.cpp
* add override type for tensor key
* use override_type instead of original data type for graph key
* append op type to tensor name to fix error in qwen
* remove todo
* wip
commit a1ab67478f, parent 525cd2d641
@ -1,14 +1,13 @@
#pragma once

#include "ggml.h"
#include "ggml-backend.h"
#include "ggml.h"

#ifdef __cplusplus
extern "C" {
#endif

#define GGML_QNN_NAME "QNN"
#define GGML_QNN_NAME "qnn"
#define GGML_QNN_MAX_DEVICES QNN_BACKEND_COUNT

enum QNNBackend {
@ -42,4 +42,13 @@ target_compile_definitions(ggml-qnn PRIVATE GGML_QNN_DEFAULT_LIB_SEARCH_PATH="${
if(GGML_QNN_ENABLE_CPU_BACKEND)
    message("GGML_QNN_ENABLE_CPU_BACKEND is enabled")
    target_compile_definitions(ggml-qnn PRIVATE GGML_QNN_ENABLE_CPU_BACKEND)
else()
    message("GGML_QNN_ENABLE_CPU_BACKEND is disabled")
endif()

if(GGML_QNN_ENABLE_PERFORMANCE_TRACKING)
    message("GGML_QNN_ENABLE_PERFORMANCE_TRACKING is enabled")
    target_compile_definitions(ggml-qnn PRIVATE GGML_QNN_ENABLE_PERFORMANCE_TRACKING)
else()
    message("GGML_QNN_ENABLE_PERFORMANCE_TRACKING is disabled")
endif()
@ -12,156 +12,10 @@

namespace {

bool qnn_is_op_valid(ggml_backend_qnn_device_context * ctx, const ggml_tensor * dst) {
    if (!ctx || !dst) {
        QNN_LOG_WARN("invalid params\n");
        return false;
    }

    auto instance = ctx->instance;
    if (!instance) {
        QNN_LOG_WARN("invalid instance\n");
        return false;
    }

    const auto param_count = qnn::get_qnn_op_input_param_count(dst);
    switch (param_count) {
        case 1:
            return dst->src[0];
        case 2:
            return dst->src[0] && dst->src[1];
        default:
            QNN_LOG_WARN("invalid op param count %d\n", (int) param_count);
            break;
    }

    return false;
}

#ifndef NDEBUG
void print_ggml_tensor(const ggml_tensor * tensor) {
    QNN_LOG_DEBUG("%s: type:%s ne: %ldx%ldx%ldx%ld, nb: %ldx%ldx%ldx%ld\n", tensor->name, ggml_type_name(tensor->type),
                  (long) tensor->ne[0], (long) tensor->ne[1], (long) tensor->ne[2], (long) tensor->ne[3],
                  (long) tensor->nb[0], (long) tensor->nb[1], (long) tensor->nb[2], (long) tensor->nb[3]);
}
#endif

}  // namespace
|
||||
|
||||
namespace {
|
||||
|
||||
typedef bool (*ggml_qnn_op_t)(ggml_backend_qnn_device_context * ctx, ggml_tensor * dst);
|
||||
|
||||
void append_tensor_dimensions(const ggml_tensor * tensor, std::string & output) {
|
||||
char buffer[256] = {};
|
||||
const auto * type_name = qnn::get_ggml_type_name(tensor->type);
|
||||
int len = 0;
|
||||
switch (ggml_n_dims(tensor)) {
|
||||
case 1:
|
||||
len = snprintf(buffer, sizeof(buffer), "%ld%s", (long) tensor->ne[0], type_name);
|
||||
break;
|
||||
case 2:
|
||||
len = snprintf(buffer, sizeof(buffer), "%ldx%ld%s", (long) tensor->ne[0], (long) tensor->ne[1], type_name);
|
||||
break;
|
||||
case 3:
|
||||
len = snprintf(buffer, sizeof(buffer), "%ldx%ldx%ld%s", (long) tensor->ne[0], (long) tensor->ne[1],
|
||||
(long) tensor->ne[2], type_name);
|
||||
break;
|
||||
case 4:
|
||||
default:
|
||||
len = snprintf(buffer, sizeof(buffer), "%ldx%ldx%ldx%ld%s", (long) tensor->ne[0], (long) tensor->ne[1],
|
||||
(long) tensor->ne[2], (long) tensor->ne[3], type_name);
|
||||
break;
|
||||
}
|
||||
GGML_ASSERT(len > 0 && len < (int) sizeof(buffer));
|
||||
output.append(buffer, len);
|
||||
}
|
||||
|
||||
void get_graph_key_from_op(const ggml_tensor * op, std::string & output) {
|
||||
GGML_ASSERT(op->op != GGML_OP_NONE);
|
||||
output += ggml_op_desc(op);
|
||||
output += qnn::get_ggml_type_name(op->type);
|
||||
const auto param_count = qnn::get_qnn_op_input_param_count(op);
|
||||
for (size_t i = 0; i < param_count; ++i) {
|
||||
auto * input = op->src[i];
|
||||
if (!input) {
|
||||
break;
|
||||
}
|
||||
|
||||
output += '_';
|
||||
append_tensor_dimensions(input, output);
|
||||
}
|
||||
}
|
||||
|
||||
void get_op_key_with_src_op_desc(const ggml_tensor * op, std::string & output) {
|
||||
output += ggml_op_desc(op);
|
||||
output += '(';
|
||||
if (op->src[0]) {
|
||||
output += ggml_op_desc(op->src[0]);
|
||||
}
|
||||
for (size_t i = 1; i < GGML_MAX_DIMS && op->src[i]; ++i) {
|
||||
output += ',';
|
||||
output += ggml_op_desc(op->src[i]);
|
||||
}
|
||||
output += ')';
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Generates a unique key for a given computation graph (cgraph).
|
||||
*
|
||||
* This key is used to cache the graph, enabling efficient reuse of previously
|
||||
* compiled graphs. The key is constructed by concatenating the descriptions
|
||||
* of the operations and their associated tensor dimensions within the graph.
|
||||
*
|
||||
* Example key format: "MUL_MATf32_256x16x10f32_256x1x10f32#LOG#ADD#ADDf32_16x1x10f32"
|
||||
*
|
||||
* @param cgraph The computation graph for which the key is generated.
|
||||
* @param output The string where the generated key will be stored.
|
||||
*
|
||||
* TODO: Improve the key generation logic to handle more complex graph structures and edge cases.
|
||||
*/
|
||||
void get_graph_key_from_cgraph(const ggml_cgraph * cgraph, std::string & output) {
|
||||
if (cgraph->n_nodes == 0) {
|
||||
QNN_LOG_DEBUG("empty cgraph\n");
|
||||
return;
|
||||
}
|
||||
|
||||
{
|
||||
bool is_start = true;
|
||||
for (int i = 0; i < cgraph->n_nodes; ++i) {
|
||||
auto * op = cgraph->nodes[i];
|
||||
if (ggml_is_empty(op)) {
|
||||
QNN_LOG_DEBUG("empty op in graph, skipping\n");
|
||||
continue;
|
||||
}
|
||||
|
||||
if (op->op == GGML_OP_NONE) {
|
||||
QNN_LOG_DEBUG("GGML_OP_NONE in graph, skipping\n");
|
||||
continue;
|
||||
}
|
||||
|
||||
if (is_start) {
|
||||
get_graph_key_from_op(cgraph->nodes[0], output);
|
||||
is_start = false;
|
||||
} else {
|
||||
output += '#';
|
||||
get_op_key_with_src_op_desc(op, output);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (cgraph->n_nodes > 1) {
|
||||
auto * last_op = cgraph->nodes[cgraph->n_nodes - 1];
|
||||
output += qnn::get_ggml_type_name(last_op->type);
|
||||
output += '_';
|
||||
append_tensor_dimensions(last_op, output);
|
||||
}
|
||||
}
|
||||
|
||||
qnn::qnn_graph * get_qnn_graph_from_cache(ggml_backend_qnn_device_context * ctx, const ggml_cgraph * cgraph) {
|
||||
auto & graph_cache = ctx->qnn_graph_cache;
|
||||
std::string graph_key;
|
||||
get_graph_key_from_cgraph(cgraph, graph_key);
|
||||
auto op_data_type = qnn::qnn_graph::get_graph_key_from_cgraph(cgraph, graph_key);
|
||||
if (graph_key.empty()) {
|
||||
QNN_LOG_DEBUG("[%s]empty graph key for cgraph: %p, size: %d\n", qnn::get_backend_name(ctx->device),
|
||||
(const void *) cgraph, (int) cgraph->n_nodes);
|
||||
|
|
@ -171,11 +25,20 @@ qnn::qnn_graph * get_qnn_graph_from_cache(ggml_backend_qnn_device_context * ctx,
|
|||
auto it = graph_cache.find(graph_key);
|
||||
qnn::qnn_graph * graph_ptr = nullptr;
|
||||
if (it != graph_cache.end()) {
|
||||
QNN_LOG_DEBUG("[%s]found graph %s in cache\n", qnn::get_backend_name(ctx->device), graph_key.c_str());
|
||||
auto it = graph_cache.find(graph_key);
|
||||
QNN_LOG_DEBUG("[%s]found graph %s in cache, cache size: %d\n", qnn::get_backend_name(ctx->device),
|
||||
graph_key.c_str(), (int) graph_cache.size());
|
||||
graph_ptr = it->second.get();
|
||||
} else {
|
||||
auto graph =
|
||||
std::make_unique<qnn::qnn_graph>(graph_key, ctx->device, ctx->instance, ctx->socinfo.vtcm_size_in_mb);
|
||||
auto precision = qnn::qnn_graph::kHtpDefault;
|
||||
if (op_data_type == GGML_TYPE_F16) {
|
||||
QNN_LOG_DEBUG("[%s][%s]set graph precision to FP16\n", qnn::get_backend_name(ctx->device),
|
||||
graph_key.c_str());
|
||||
precision = qnn::qnn_graph::kHtpFp16;
|
||||
}
|
||||
|
||||
auto graph = std::make_unique<qnn::qnn_graph>(graph_key, ctx->device, ctx->instance, precision,
|
||||
ctx->socinfo.vtcm_size_in_mb);
|
||||
if (!graph->is_valid()) {
|
||||
return nullptr;
|
||||
}
|
||||
|
|
@ -187,6 +50,8 @@ qnn::qnn_graph * get_qnn_graph_from_cache(ggml_backend_qnn_device_context * ctx,
|
|||
|
||||
graph_ptr = graph.get();
|
||||
graph_cache[graph_key] = std::move(graph);
|
||||
QNN_LOG_DEBUG("[%s]add graph %s to cache, cache size: %d\n", qnn::get_backend_name(ctx->device),
|
||||
graph_key.c_str(), (int) graph_cache.size());
|
||||
}
|
||||
|
||||
return graph_ptr;
|
||||
|
|
@ -201,9 +66,9 @@ constexpr const bool kQnnSupportedOps[] = {
    false,  // GGML_OP_ACC
    true,   // GGML_OP_SUB
    true,   // GGML_OP_MUL
    true,   // GGML_OP_DIV
    false,  // GGML_OP_DIV, disabled for now because it fails test-backend-ops
    false,  // GGML_OP_SQR
    true,   // GGML_OP_SQRT
    false,  // GGML_OP_SQRT, disabled for now because it fails test-backend-ops
    true,   // GGML_OP_LOG
    false,  // GGML_OP_SIN
    false,  // GGML_OP_COS
@ -229,7 +94,7 @@ constexpr const bool kQnnSupportedOps[] = {
    false,  // GGML_OP_SET
    false,  // GGML_OP_CPY
    false,  // GGML_OP_CONT
    true,   // GGML_OP_RESHAPE
    false,  // GGML_OP_RESHAPE
    false,  // GGML_OP_VIEW
    false,  // GGML_OP_PERMUTE
    false,  // GGML_OP_TRANSPOSE
@ -306,14 +171,39 @@ constexpr const bool kQnnSupportedOps[] = {
static_assert(kQnnSupportedOps[GGML_OP_NONE], "GGML_OP_NONE is not true");
static_assert(kQnnSupportedOps[GGML_OP_ADD], "GGML_OP_ADD is not true");
static_assert(kQnnSupportedOps[GGML_OP_MUL], "GGML_OP_MUL is not true");
static_assert(kQnnSupportedOps[GGML_OP_MUL_MAT],
              "GGML_OP_MUL_MAT is not true, please check the kQnnSupportedOps table in the backend-ops.cpp file");
static_assert(kQnnSupportedOps[GGML_OP_RESHAPE], "GGML_OP_RESHAPE is not true");
static_assert(kQnnSupportedOps[GGML_OP_MUL_MAT], "GGML_OP_MUL_MAT is not true");
static_assert(!kQnnSupportedOps[GGML_OP_RESHAPE], "GGML_OP_RESHAPE should not be true");
static_assert(!kQnnSupportedOps[GGML_OP_VIEW], "GGML_OP_VIEW is not false");
static_assert(std::size(kQnnSupportedOps) == (GGML_OP_COUNT + GGML_UNARY_OP_COUNT),
              "GGML_OP_COUNT does not match the size of the kQnnSupportedOps table");

bool ggml_qnn_supports_tensor(ggml_backend_qnn_device_context * ctx, const ggml_tensor * tensor) {
inline bool is_type_bit_enabled(uint64_t bits, ggml_type type) {
    return bits & (uint64_t(1) << type);
}

inline bool is_tensor_size_valid(ggml_backend_qnn_device_context * ctx, const ggml_tensor * tensor) {
    constexpr const auto get_tensor_size_in_bytes = [](const ggml_tensor * tensor, ggml_type type) -> size_t {
        return tensor->ne[0] * tensor->ne[1] * tensor->ne[2] * tensor->ne[3] * ggml_type_size(type);
    };

    auto type = tensor->type;
    if (ggml_is_quantized(type) && ctx->enable_cpu_dequantize) {
        type = GGML_TYPE_F32;  // TODO: [quantize] fix me if plan to dequantize to other types
    }

    const auto tensor_size = get_tensor_size_in_bytes(tensor, type);
    if (ctx->max_tensor_size_in_bytes && tensor_size >= ctx->max_tensor_size_in_bytes) {
        QNN_LOG_DEBUG("[%s]tensor(%s_%dx%dx%dx%d) size(%lld) exceeds the limit(%lld)\n",
                      qnn::get_backend_name(ctx->device), ggml_get_name(tensor), (int) tensor->ne[0],
                      (int) tensor->ne[1], (int) tensor->ne[2], (int) tensor->ne[3], (long long int) tensor_size,
                      (long long int) ctx->max_tensor_size_in_bytes);
        return false;
    }

    return true;
}
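For a sense of scale for the size limit above, a brief worked example (the tensor shape is hypothetical, not taken from this commit):

// Example: a 4096 x 4096 F32 activation occupies
//     4096 * 4096 * 1 * 1 * ggml_type_size(GGML_TYPE_F32)
//   = 16,777,216 * 4 bytes = 64 MiB,
// so it is rejected by any max_tensor_size_in_bytes limit at or below 64 MiB.
// Quantized tensors are measured at their dequantized (F32) size when
// enable_cpu_dequantize is set, matching the type override in the check above.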
|
||||
|
||||
bool is_tensor_type_valid(ggml_backend_qnn_device_context * ctx, const ggml_tensor * tensor) {
|
||||
if (!tensor) {
|
||||
QNN_LOG_DEBUG("tensor is nullptr\n");
|
||||
return false;
|
||||
|
|
@ -332,9 +222,7 @@ bool ggml_qnn_supports_tensor(ggml_backend_qnn_device_context * ctx, const ggml_
|
|||
switch (tensor->type) {
|
||||
case GGML_TYPE_F32:
|
||||
case GGML_TYPE_F16:
|
||||
case GGML_TYPE_Q8_0:
|
||||
case GGML_TYPE_Q4_0:
|
||||
if (!(ctx->supported_types & (uint64_t(1) << tensor->type))) {
|
||||
if (!is_type_bit_enabled(ctx->supported_types, tensor->type)) {
|
||||
QNN_LOG_DEBUG("[%s]unsupported data type %s, supported_types: 0x%x\n",
|
||||
qnn::get_backend_name(ctx->device), ggml_type_name(tensor->type),
|
||||
(unsigned int) ctx->supported_types);
|
||||
|
|
@ -350,18 +238,29 @@ bool ggml_qnn_supports_tensor(ggml_backend_qnn_device_context * ctx, const ggml_
|
|||
return true;
|
||||
}
|
||||
|
||||
bool is_data_reinterpretation_op(ggml_op op) {
|
||||
return op == GGML_OP_VIEW || op == GGML_OP_PERMUTE;
|
||||
}
|
||||
|
||||
bool ggnl_qnn_supports_op_tensor(ggml_backend_qnn_device_context * ctx, const ggml_tensor * op) {
|
||||
if (op->op == GGML_OP_NONE) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if (!ggml_qnn_supports_tensor(ctx, op)) {
|
||||
if (!is_tensor_type_valid(ctx, op) || !is_tensor_size_valid(ctx, op)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
const auto param_count = qnn::get_qnn_op_input_param_count(op);
|
||||
for (size_t i = 0; i < param_count; ++i) {
|
||||
if (!ggml_qnn_supports_tensor(ctx, op->src[i])) {
|
||||
// TODO: fix for other op
|
||||
const bool cpu_dequant = ctx->enable_cpu_dequantize && op->op == GGML_OP_MUL_MAT;
|
||||
for (size_t i = 0; i < GGML_MAX_SRC && op->src[i]; ++i) {
|
||||
auto * src = op->src[i];
|
||||
if (!is_tensor_size_valid(ctx, src)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// passthrough the quantized tensor for CPU dequantization
|
||||
if (!is_tensor_type_valid(ctx, src) && (!cpu_dequant || !ggml_is_quantized(src->type))) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
|
@ -394,14 +293,17 @@ bool ggml_qnn_have_same_tensor_types(ggml_backend_qnn_device_context * ctx, cons
|
|||
return true;
|
||||
}
|
||||
|
||||
// TODO: move to caps array?
|
||||
bool ggml_qnn_supports_matmul_op(ggml_backend_qnn_device_context * ctx, const ggml_tensor * op) {
|
||||
constexpr const size_t kMaxNpuTensorSize = 8192L * 2048 + 8192 * 512 + 2048 * 512;
|
||||
constexpr const auto get_tensor_size = [](const ggml_tensor * tensor) -> size_t {
|
||||
return tensor->ne[0] * tensor->ne[1] * tensor->ne[2] * tensor->ne[3];
|
||||
};
|
||||
|
||||
auto * src0 = op->src[0];
|
||||
auto * src1 = op->src[1];
|
||||
if (is_data_reinterpretation_op(src0->op) || is_data_reinterpretation_op(src1->op)) {
|
||||
// TODO: remove the blocker here when we support permute op
|
||||
QNN_LOG_DEBUG("[%s][MUL_MAT]data reorganization op is not supported, (%s, %s)\n",
|
||||
qnn::get_backend_name(ctx->device), ggml_op_name(src0->op), ggml_op_name(src1->op));
|
||||
return false;
|
||||
}
|
||||
|
||||
switch (ctx->device) {
|
||||
case QNN_BACKEND_NPU:
|
||||
if (src1->ne[2] != src0->ne[2] || src1->ne[3] != src0->ne[3]) {
|
||||
|
|
@ -411,15 +313,21 @@ bool ggml_qnn_supports_matmul_op(ggml_backend_qnn_device_context * ctx, const gg
|
|||
*/
|
||||
QNN_LOG_DEBUG("[qnn-npu][MUL_MAT]src0 and src1 dimensions are not equal\n");
|
||||
return false;
|
||||
} else if (get_tensor_size(src0) + get_tensor_size(src1) + get_tensor_size(op) >= kMaxNpuTensorSize) {
|
||||
QNN_LOG_DEBUG("[qnn-npu][MUL_MAT]tensor size is too large\n");
|
||||
return false;
|
||||
}
|
||||
// fall through; testing shows the convert op is very slow on the NPU:
// https://github.com/usefulsensors/qc_npu_benchmark
|
||||
case QNN_BACKEND_GPU:
|
||||
if (ggml_qnn_have_same_tensor_types(ctx, op)) {
|
||||
// there's no convert op for GPU.
|
||||
if (!ggml_qnn_have_same_tensor_types(ctx, op) && op->type != GGML_TYPE_F32) {
|
||||
// for different tensor types and not float32, we don't support it currently, since there's no convert
|
||||
QNN_LOG_DEBUG("[%s][MUL_MAT]src0 and src1 and dst types are not equal\n",
|
||||
qnn::get_backend_name(ctx->device));
|
||||
return false;
|
||||
}
|
||||
if (op->type == GGML_TYPE_F32 && ggml_is_quantized(src0->type) &&
|
||||
!is_type_bit_enabled(ctx->cpu_preprocess_types, src0->type)) {
|
||||
// for such cases that src0 is quantized and op is float32, check if the quant type is enabled
|
||||
QNN_LOG_DEBUG("[%s][MUL_MAT]quantized src0 type %s is not enabled\n",
|
||||
qnn::get_backend_name(ctx->device), ggml_type_name(src0->type));
|
||||
return false;
|
||||
}
|
||||
break;
|
||||
|
|
@ -436,6 +344,19 @@ bool ggml_qnn_supports_matmul_op(ggml_backend_qnn_device_context * ctx, const gg
|
|||
return true;
|
||||
}
|
||||
|
||||
#ifndef NDEBUG
|
||||
|
||||
void print_tensor_info(ggml_backend_qnn_device_context * ctx, const ggml_tensor * op, bool is_supported) {
|
||||
const char * supported = is_supported ? "supported" : "unsupported";
|
||||
std::string op_key;
|
||||
qnn::get_qnn_op_desc(op, true, GGML_TYPE_COUNT, op_key);
|
||||
|
||||
QNN_LOG_DEBUG("[%s][%s]op was %s, support/unsupported: %d/%d\n", qnn::get_backend_name(ctx->device), op_key.c_str(),
|
||||
supported, ctx->supported_op_count.load(), ctx->unsupported_op_count.load());
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
} // namespace
|
||||
|
||||
namespace qnn {
|
||||
|
|
@ -448,22 +369,16 @@ bool device_supports_op(ggml_backend_qnn_device_context * ctx, const ggml_tensor
|
|||
|
||||
if (!kQnnSupportedOps[qnn::get_qnn_op_index(op)]) {
|
||||
#ifndef NDEBUG
|
||||
std::string op_key;
|
||||
get_graph_key_from_op(op, op_key);
|
||||
ctx->unsupported_op_count++;
|
||||
QNN_LOG_DEBUG("[%s][%s]op was unsupported, support/unsupported: %d/%d\n", qnn::get_backend_name(ctx->device),
|
||||
op_key.c_str(), ctx->supported_op_count.load(), ctx->unsupported_op_count.load());
|
||||
print_tensor_info(ctx, op, false);
|
||||
#endif
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!ggnl_qnn_supports_op_tensor(ctx, op)) {
|
||||
#ifndef NDEBUG
|
||||
std::string tensor_dims;
|
||||
append_tensor_dimensions(op, tensor_dims);
|
||||
QNN_LOG_DEBUG("[%s][%s]unsupported tensor(%s), support/unsupported: %d/%d\n",
|
||||
qnn::get_backend_name(ctx->device), ggml_op_name(op->op), tensor_dims.c_str(),
|
||||
ctx->supported_op_count.load(), ctx->unsupported_op_count.load());
|
||||
ctx->unsupported_op_count++;
|
||||
print_tensor_info(ctx, op, false);
|
||||
#endif
|
||||
return false;
|
||||
}
|
||||
|
|
@ -480,13 +395,23 @@ bool device_supports_op(ggml_backend_qnn_device_context * ctx, const ggml_tensor
    auto * src0 = op->src[0];
    auto * src1 = op->src[1];
    switch (op->op) {
        case GGML_OP_MUL:
            // TODO: fix this when we have the support for mul with rms_norm
            if (ctx->enable_cpu_dequantize && (src0->op == GGML_OP_RMS_NORM || src1->op == GGML_OP_RMS_NORM)) {
                QNN_LOG_DEBUG("[%s][%s]skip unsupported mul with rms norm, (%s, %s)\n",
                              qnn::get_backend_name(ctx->device), ggml_op_desc(op), ggml_op_desc(src0),
                              ggml_op_desc(src1));
                is_op_supported = false;
                break;
            }
            // fall through; we only skip the mul with rms_norm, which in llama sits at the start of a decoder block
        case GGML_OP_ADD:
        case GGML_OP_SUB:
        case GGML_OP_MUL:
        case GGML_OP_DIV:
            // TODO: move to op caps array?
            if (!ggml_are_same_shape(src0, src1)) {
                QNN_LOG_DEBUG("[%s][%s] src0 and src1 dimensions are not equal\n",
                              qnn::get_backend_name(ctx->device), ggml_op_name(op->op));
                              qnn::get_backend_name(ctx->device), ggml_op_desc(op));
                is_op_supported = false;
            }
            break;
|
||||
|
|
@ -503,13 +428,11 @@ bool device_supports_op(ggml_backend_qnn_device_context * ctx, const ggml_tensor
|
|||
#ifndef NDEBUG
|
||||
if (is_op_supported) {
|
||||
ctx->supported_op_count++;
|
||||
QNN_LOG_DEBUG("[%s][%s]op was supported, support/unsupported: %d/%d\n", qnn::get_backend_name(ctx->device),
|
||||
ggml_op_name(op->op), ctx->supported_op_count.load(), ctx->unsupported_op_count.load());
|
||||
} else {
|
||||
ctx->unsupported_op_count++;
|
||||
QNN_LOG_DEBUG("[%s][%s]op was unsupported, support/unsupported: %d/%d\n", qnn::get_backend_name(ctx->device),
|
||||
ggml_op_name(op->op), ctx->supported_op_count.load(), ctx->unsupported_op_count.load());
|
||||
}
|
||||
|
||||
print_tensor_info(ctx, op, is_op_supported);
|
||||
#endif
|
||||
|
||||
return is_op_supported;
|
||||
|
|
@ -520,7 +443,7 @@ bool device_compute_graph(ggml_backend_qnn_device_context * ctx, ggml_cgraph * c
|
|||
(int) cgraph->n_nodes);
|
||||
|
||||
auto qnn_graph = get_qnn_graph_from_cache(ctx, cgraph);
|
||||
bool success = qnn_graph && qnn_graph->execute(cgraph);
|
||||
bool success = qnn_graph && qnn_graph->execute(cgraph, ctx->convert_context);
|
||||
|
||||
QNN_LOG_DEBUG("[%s]compute graph, success: %d\n", qnn::get_backend_name(ctx->device), (int) success);
|
||||
return success;
|
||||
@ -10,6 +10,7 @@
#include <unordered_map>
#include <unordered_set>

#include "convert.hpp"
#include "ggml-backend.h"
#include "ggml-qnn.h"
#include "ggml.h"
@ -25,26 +26,30 @@ struct ggml_backend_qnn_device_context {
    QNNBackend device;
    size_t threads;
    std::string name;
    std::string lib_name;
    std::string description;

    // initialize in qnn init
    qnn::qcom_socinfo socinfo = {};
    uint64_t supported_types;
    size_t max_tensor_size_in_bytes;
    std::shared_ptr<qnn::qnn_instance> instance;
    std::shared_ptr<qnn::qnn_interface> qnn_interface;

    qnn::qnn_graph_cache_t qnn_graph_cache;
    qnn::qnn_graph_cache_t qnn_graph_cache;
    std::shared_ptr<qnn::qnn_convert_context_t> convert_context = std::make_shared<qnn::qnn_convert_context_t>();

#ifndef NDEBUG
    std::atomic_uint32_t supported_op_count = 0;
    std::atomic_uint32_t unsupported_op_count = 0;
#endif

    bool enable_cpu_dequantize = false;
    uint64_t supported_types;
    uint64_t cpu_preprocess_types;

    explicit ggml_backend_qnn_device_context(QNNBackend device, size_t threads, const char * name,
                                             const char * lib_name, uint64_t supported_types) :
                                             uint64_t supported_types) :
        device(device),
        threads(threads),
        name(name),
        lib_name(lib_name),
        supported_types(supported_types) {}
};
|
||||
|
|
|
|||
|
|
@ -69,8 +69,8 @@ using qnn_buffer_ptr = std::shared_ptr<qnn_buffer_interface>;
|
|||
*/
|
||||
class qnn_rpc_buffer : public qnn_buffer_interface {
|
||||
public:
|
||||
qnn_rpc_buffer(std::shared_ptr<qnn_instance> qnn_instance, const size_t size, const uint32_t rank,
|
||||
uint32_t * dimensions, Qnn_DataType_t data_type) :
|
||||
qnn_rpc_buffer(qnn_instance_ptr qnn_instance, const size_t size, const uint32_t rank, uint32_t * dimensions,
|
||||
Qnn_DataType_t data_type) :
|
||||
_size(size),
|
||||
_qnn_instance(qnn_instance) {
|
||||
_qnn_rpc_buffer = static_cast<uint8_t *>(qnn_instance->alloc_rpcmem(size, alignof(uint8_t *)));
|
||||
|
|
@ -105,10 +105,10 @@ class qnn_rpc_buffer : public qnn_buffer_interface {
|
|||
Qnn_MemHandle_t get_mem_handle() const override { return _qnn_rpc_mem_handle; }
|
||||
|
||||
private:
|
||||
size_t _size = 0;
|
||||
uint8_t * _qnn_rpc_buffer = nullptr;
|
||||
Qnn_MemHandle_t _qnn_rpc_mem_handle = nullptr;
|
||||
std::shared_ptr<qnn_instance> _qnn_instance;
|
||||
size_t _size = 0;
|
||||
uint8_t * _qnn_rpc_buffer = nullptr;
|
||||
Qnn_MemHandle_t _qnn_rpc_mem_handle = nullptr;
|
||||
qnn_instance_ptr _qnn_instance;
|
||||
|
||||
DISABLE_COPY(qnn_rpc_buffer);
|
||||
DISABLE_MOVE(qnn_rpc_buffer);
|
||||
|
|
|
|||
|
|
@ -0,0 +1,155 @@
|
|||
|
||||
#include "convert.hpp"
|
||||
|
||||
#include "logger.hpp"
|
||||
|
||||
namespace {
|
||||
|
||||
size_t get_convert_buffer_size(const qnn::ggml_dimension_array_t & dimensions, ggml_type dst_type) {
|
||||
GGML_ASSERT(ggml_blck_size(dst_type) == 1);
|
||||
size_t nbytes = ggml_type_size(dst_type);
|
||||
for (size_t i = 0; i < GGML_MAX_DIMS; ++i) {
|
||||
nbytes *= dimensions[i]; // tight packing
|
||||
}
|
||||
|
||||
return nbytes;
|
||||
}
|
||||
|
||||
// adapted from ggml_backend_blas_mul_mat: use OpenMP when available, otherwise fall back to a standard-library thread solution
// TODO: remove this once the conversion can be delegated to the BLAS backend
|
||||
#ifdef GGML_USE_OPENMP
|
||||
|
||||
void convert_tensor_impl(const ggml_tensor * src, int max_threads,
|
||||
std::shared_ptr<qnn::qnn_mem_buffer_slice> & output_buffer) {
|
||||
const auto ne03 = src->ne[3];
|
||||
const auto ne02 = src->ne[2];
|
||||
const auto ne01 = src->ne[1];
|
||||
const auto ne00 = src->ne[0];
|
||||
const auto ne_plane = ne01 * ne00;
|
||||
const auto nb03 = src->nb[3];
|
||||
const auto nb02 = src->nb[2];
|
||||
const auto nb01 = src->nb[1];
|
||||
const int min_cols_per_thread = 4096;
|
||||
void * wdata = output_buffer->get_buffer();
|
||||
const auto to_float = ggml_get_type_traits(src->type)->to_float;
|
||||
GGML_ASSERT(to_float);
|
||||
|
||||
for (int64_t i03 = 0; i03 < ne03; i03++) {
|
||||
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
||||
const void * x = (char *) src->data + i02 * nb02 + i03 * nb03;
|
||||
float * const wplane = (float *) wdata + i02 * ne_plane + i03 * ne02 * ne_plane;
|
||||
|
||||
const int min_rows_per_thread = std::max((int) (min_cols_per_thread / ne00), 1);
|
||||
const int n_threads = std::max(std::min(max_threads, (int) (ne01 / min_rows_per_thread)), 1);
|
||||
|
||||
# pragma omp parallel for num_threads(n_threads)
|
||||
for (int64_t i01 = 0; i01 < ne01; i01++) {
|
||||
to_float((const char *) x + i01 * nb01, wplane + i01 * ne00, ne00);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return output_buffer;
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
void convert_tensor_impl(const ggml_tensor * src, int max_threads, std::vector<std::future<void>> & tasks,
|
||||
std::shared_ptr<qnn::qnn_mem_buffer_slice> & output_buffer) {
|
||||
const auto ne03 = src->ne[3];
|
||||
const auto ne02 = src->ne[2];
|
||||
const auto ne01 = src->ne[1];
|
||||
const auto ne00 = src->ne[0];
|
||||
const auto ne_plane = ne01 * ne00;
|
||||
const auto nb03 = src->nb[3];
|
||||
const auto nb02 = src->nb[2];
|
||||
const auto nb01 = src->nb[1];
|
||||
const int min_cols_per_thread = 4096;
|
||||
void * wdata = output_buffer->get_buffer();
|
||||
const auto to_float = ggml_get_type_traits(src->type)->to_float;
|
||||
GGML_ASSERT(to_float);
|
||||
|
||||
for (int64_t i03 = 0; i03 < ne03; i03++) {
|
||||
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
||||
const void * x = (char *) src->data + i02 * nb02 + i03 * nb03;
|
||||
float * const wplane = (float *) wdata + i02 * ne_plane + i03 * ne02 * ne_plane;
|
||||
|
||||
const int min_rows_per_thread = std::max((int) (min_cols_per_thread / ne00), 1);
|
||||
const int n_threads = std::max(std::min(max_threads, (int) (ne01 / min_rows_per_thread)), 1);
|
||||
|
||||
for (int i = 1; i < n_threads; i++) {
|
||||
const int64_t start = i * ne01 / n_threads;
|
||||
const int64_t end = (i + 1) * ne01 / n_threads;
|
||||
if (start < end) {
|
||||
tasks.push_back(std::async(std::launch::async, [=]() {
|
||||
for (int64_t i01 = start; i01 < end; i01++) {
|
||||
to_float((const char *) x + i01 * nb01, wplane + i01 * ne00, ne00);
|
||||
}
|
||||
}));
|
||||
}
|
||||
}
|
||||
{
|
||||
// reuse the current thread for the first task
|
||||
const int64_t start = 0;
|
||||
const int64_t end = ne01 / n_threads;
|
||||
for (int64_t i01 = start; i01 < end; i01++) {
|
||||
to_float((const char *) x + i01 * nb01, wplane + i01 * ne00, ne00);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// wait for all tasks to finish
|
||||
for (auto & task : tasks) {
|
||||
task.get();
|
||||
}
|
||||
tasks.clear();
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
} // namespace
|
||||
|
||||
namespace qnn {
|
||||
|
||||
std::vector<qnn::qnn_buffer_ptr> convert(std::shared_ptr<qnn_convert_context_t> convert_context,
|
||||
const ggml_tensor_array_t & tensors, ggml_type target_data_type) {
|
||||
convert_context->buffers.resize(tensors.size());
|
||||
std::vector<qnn::qnn_buffer_ptr> output_buffers(tensors.size());
|
||||
for (size_t i = 0; i < tensors.size(); ++i) {
|
||||
const ggml_tensor * src = tensors[i];
|
||||
if (src->type == target_data_type) {
|
||||
continue;
|
||||
}
|
||||
|
||||
auto & data_buffer = convert_context->buffers[i];
|
||||
const auto dst_size = get_convert_buffer_size(src->ne, target_data_type);
|
||||
if (!data_buffer || data_buffer->get_size() < dst_size) {
|
||||
#ifndef NDEBUG
|
||||
auto old_size = data_buffer ? data_buffer->get_size() : 0;
|
||||
QNN_LOG_DEBUG("create buffer[%d] for tensor %s(%s), old_size: %d, new_size: %d\n", (int) i,
|
||||
ggml_get_name(src), ggml_type_name(src->type), (int) old_size, (int) dst_size);
|
||||
#endif
|
||||
data_buffer = std::make_shared<qnn::qnn_mem_buffer>(dst_size);
|
||||
}
|
||||
|
||||
// TODO: add more restrictions to the buffer slice here
|
||||
std::shared_ptr<qnn::qnn_mem_buffer_slice> output_buffer =
|
||||
std::make_shared<qnn::qnn_mem_buffer_slice>(data_buffer->get_buffer(), dst_size);
|
||||
|
||||
QNN_LOG_DEBUG("convert tensor(%s) from %s to %s, size: %d, n_threads: %d\n", ggml_get_name(src),
|
||||
ggml_type_name(src->type), ggml_type_name(target_data_type), (int) dst_size,
|
||||
convert_context->n_threads);
|
||||
|
||||
#ifdef GGML_USE_OPENMP
|
||||
convert_tensor_impl(src, convert_context->n_threads, output_buffer);
|
||||
#else
|
||||
convert_tensor_impl(src, convert_context->n_threads, convert_context->tasks, output_buffer);
|
||||
#endif
|
||||
output_buffers[i] = output_buffer;
|
||||
}
|
||||
|
||||
return output_buffers;
|
||||
}
|
||||
|
||||
} // namespace qnn
@ -0,0 +1,26 @@
#pragma once

#include <future>
#include <memory>
#include <thread>

#include "buffer.hpp"
#include "ggml-qnn.h"
#include "tensor.hpp"
#include "utils.hpp"

namespace qnn {

// see also: ggml_backend_blas_context
struct qnn_convert_context_t {
    int n_threads = std::thread::hardware_concurrency();
    std::vector<std::shared_ptr<qnn_mem_buffer>> buffers;
#ifndef GGML_USE_OPENMP
    std::vector<std::future<void>> tasks;
#endif
};

std::vector<qnn::qnn_buffer_ptr> convert(std::shared_ptr<qnn_convert_context_t> convert_context,
                                         const ggml_tensor_array_t & tensors, ggml_type target_data_type);

}  // namespace qnn
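A minimal usage sketch for this API (hypothetical call site, not part of the diff; it assumes ggml_tensor_array_t is an array of ggml_tensor pointers and that mulmat_op is a GGML_OP_MUL_MAT node whose quantized sources should be dequantized to F32):

auto convert_ctx = std::make_shared<qnn::qnn_convert_context_t>();
qnn::ggml_tensor_array_t srcs = { mulmat_op->src[0], mulmat_op->src[1] };

// entries stay empty for tensors that already have the target type
auto buffers = qnn::convert(convert_ctx, srcs, GGML_TYPE_F32);
for (size_t i = 0; i < buffers.size(); ++i) {
    if (!buffers[i]) {
        continue;  // srcs[i] was already F32, its own data can be bound directly
    }
    // buffers[i]->get_buffer() holds the dequantized F32 copy of srcs[i]
}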
@ -4,78 +4,16 @@
|
|||
#include <memory>
|
||||
#include <vector>
|
||||
|
||||
#include "backend-ops.hpp"
|
||||
#include "backend.hpp"
|
||||
#include "ggml-backend-impl.h"
|
||||
#include "ggml-impl.h"
|
||||
#include "ggml-qnn/backend-ops.hpp"
|
||||
#include "ggml-qnn/backend.hpp"
|
||||
#include "ggml-qnn/logger.hpp"
|
||||
#include "ggml-qnn/tensor.hpp"
|
||||
#include "ggml-qnn/utils.hpp"
|
||||
|
||||
// =================================================================================================
|
||||
//
|
||||
// self-defined macro / data structure
|
||||
//
|
||||
// =================================================================================================
|
||||
#ifdef NDEBUG
|
||||
# define ENABLE_QNNBACKEND_PERF 0 // enable/disable op's perf info
|
||||
#else
|
||||
# define ENABLE_QNNBACKEND_PERF 1 // enable/disable op's perf info
|
||||
#endif
|
||||
|
||||
#define QNN_BACKEND_NAME "qnn"
|
||||
#include "logger.hpp"
|
||||
#include "tensor.hpp"
|
||||
#include "utils.hpp"
|
||||
|
||||
namespace {
|
||||
|
||||
#ifdef _WIN32
|
||||
constexpr const char * kQnnCpuLibName = "QnnCpu.dll";
|
||||
constexpr const char * kQnnGpuLibName = "QnnGpu.dll";
|
||||
constexpr const char * kQnnNpuLibName = "QnnHtp.dll";
|
||||
#else
|
||||
constexpr const char * kQnnCpuLibName = "libQnnCpu.so";
|
||||
constexpr const char * kQnnGpuLibName = "libQnnGpu.so";
|
||||
constexpr const char * kQnnNpuLibName = "libQnnHtp.so";
|
||||
#endif
|
||||
|
||||
struct qnn_device_caps {
|
||||
const char * name;
|
||||
const char * description;
|
||||
const char * lib_name;
|
||||
enum ggml_backend_dev_type type;
|
||||
|
||||
// TODO: should get this caps from device
|
||||
uint64_t supported_types;
|
||||
};
|
||||
|
||||
// TODO: should move this to qnn-lib.cpp
constexpr const qnn_device_caps kDeviceCaps[] = {
    {
     "qnn-cpu", "Qualcomm Kryo CPU",
     kQnnCpuLibName, GGML_BACKEND_DEVICE_TYPE_CPU,
     (1 << GGML_TYPE_I8) | (1 << GGML_TYPE_F32),
     },  // https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/CpuOpDefSupplement.html#matmul
    {
     "qnn-gpu", "Qualcomm Adreno GPU",
     kQnnGpuLibName, GGML_BACKEND_DEVICE_TYPE_GPU,
     (1 << GGML_TYPE_F32) | (1 << GGML_TYPE_F16),
     },  // https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/GpuOpDefSupplement.html#matmul
    {
     "qnn-npu", "Qualcomm NPU",
     kQnnNpuLibName, GGML_BACKEND_DEVICE_TYPE_ACCEL,
     (1 << GGML_TYPE_F32) | (1 << GGML_TYPE_F16) | (1 << GGML_TYPE_I16) | (1 << GGML_TYPE_I8),
     },  // https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/HtpOpDefSupplement.html#matmul
};

static_assert(sizeof(kDeviceCaps) / sizeof(kDeviceCaps[0]) == GGML_QNN_MAX_DEVICES,
              "The number of qnn devices should be equal to GGML_QNN_MAX_DEVICES");
static_assert(kDeviceCaps[QNN_BACKEND_NPU].type == GGML_BACKEND_DEVICE_TYPE_ACCEL,
              "The NPU device should be an accelerator device");
static_assert(kDeviceCaps[QNN_BACKEND_GPU].type == GGML_BACKEND_DEVICE_TYPE_GPU,
              "The GPU device should be a GPU device");

static_assert(kDeviceCaps[QNN_BACKEND_CPU].type == GGML_BACKEND_DEVICE_TYPE_CPU,
              "The CPU device should be a CPU device");

ggml_backend_qnn_device_context * get_device_context(ggml_backend_dev_t dev) {
    return reinterpret_cast<ggml_backend_qnn_device_context *>(dev->context);
}
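As a usage note on the supported_types bit masks in this table, a small self-contained sketch (the helper mirrors is_type_bit_enabled from backend-ops.cpp earlier in this diff; the main() wrapper and the hard-coded enum values are illustrative only):

#include <cstdint>
#include <cstdio>

// one bit per ggml_type value, same layout as the kDeviceCaps entries above
inline bool is_type_bit_enabled(uint64_t bits, int type) {
    return bits & (uint64_t(1) << type);
}

int main() {
    const uint64_t gpu_types = (1 << 0 /* GGML_TYPE_F32 */) | (1 << 1 /* GGML_TYPE_F16 */);
    std::printf("F32 enabled: %d\n", (int) is_type_bit_enabled(gpu_types, 0));   // prints 1
    std::printf("Q4_0 enabled: %d\n", (int) is_type_bit_enabled(gpu_types, 2));  // prints 0
    return 0;
}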
|
||||
|
|
@ -266,13 +204,13 @@ constexpr const ggml_backend_i ggml_backend_qnn_interface = {
|
|||
* -----------------------------------------------------------------------------------------------
|
||||
*/
|
||||
const char * ggml_backend_qnn_device_get_name(ggml_backend_dev_t dev) {
|
||||
const auto & caps = kDeviceCaps[get_device_context(dev)->device];
|
||||
return caps.name;
|
||||
auto * dev_ctx = get_device_context(dev);
|
||||
return qnn::get_backend_name(dev_ctx->device);
|
||||
}
|
||||
|
||||
const char * ggml_backend_qnn_device_get_description(ggml_backend_dev_t dev) {
|
||||
const auto & caps = kDeviceCaps[get_device_context(dev)->device];
|
||||
return caps.description;
|
||||
auto * dev_ctx = get_device_context(dev);
|
||||
return dev_ctx->description.empty() ? qnn::get_backend_desc(dev_ctx->device) : dev_ctx->description.c_str();
|
||||
}
|
||||
|
||||
void ggml_backend_qnn_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
|
||||
|
|
@ -283,7 +221,7 @@ void ggml_backend_qnn_device_get_memory(ggml_backend_dev_t dev, size_t * free, s
|
|||
}
|
||||
|
||||
enum ggml_backend_dev_type ggml_backend_qnn_device_get_type(ggml_backend_dev_t dev) {
|
||||
return kDeviceCaps[get_device_context(dev)->device].type;
|
||||
return qnn::get_device_caps(get_device_context(dev)->device).type;
|
||||
}
|
||||
|
||||
void ggml_backend_qnn_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
|
||||
|
|
@ -310,14 +248,14 @@ ggml_backend_t ggml_backend_qnn_init_with_device_context(ggml_backend_dev_t dev,
|
|||
extend_lib_search_path = GGML_QNN_DEFAULT_LIB_SEARCH_PATH;
|
||||
QNN_LOG_WARN(
|
||||
"extend_lib_search_path is nullptr, will "
|
||||
"use " GGML_QNN_DEFAULT_LIB_SEARCH_PATH " as default");
|
||||
"use " GGML_QNN_DEFAULT_LIB_SEARCH_PATH " as default\n");
|
||||
}
|
||||
|
||||
auto * dev_ctx = get_device_context(dev);
|
||||
const auto device = dev_ctx->device;
|
||||
QNN_LOG_DEBUG("device %s\n", qnn::get_backend_name(device));
|
||||
QNN_LOG_DEBUG("extend_lib_search_path %s\n", extend_lib_search_path);
|
||||
auto instance = std::make_shared<qnn::qnn_instance>(extend_lib_search_path, dev_ctx->lib_name);
|
||||
auto instance = std::make_shared<qnn::qnn_instance>(extend_lib_search_path, device);
|
||||
auto result = instance->qnn_init(nullptr);
|
||||
if (result != 0) {
|
||||
QNN_LOG_WARN("failed to init qnn backend %s\n", qnn::get_backend_name(device));
|
||||
|
|
@ -331,10 +269,21 @@ ggml_backend_t ggml_backend_qnn_init_with_device_context(ggml_backend_dev_t dev,
|
|||
|
||||
std::string device_name = qnn::get_backend_name(device);
|
||||
QNN_LOG_INFO("qnn device name %s\n", device_name.c_str());
|
||||
dev_ctx->instance = instance;
|
||||
dev_ctx->qnn_interface = qnn_interface;
|
||||
dev_ctx->socinfo = instance->get_soc_info();
|
||||
dev_ctx->supported_types = kDeviceCaps[device].supported_types;
|
||||
const auto & device_caps = qnn::get_device_caps(device);
|
||||
dev_ctx->instance = instance;
|
||||
dev_ctx->qnn_interface = qnn_interface;
|
||||
dev_ctx->socinfo = instance->get_soc_info();
|
||||
dev_ctx->supported_types = device_caps.supported_types;
|
||||
dev_ctx->cpu_preprocess_types = device_caps.cpu_preprocess_types;
|
||||
dev_ctx->max_tensor_size_in_bytes = device_caps.max_tensor_size_in_bytes;
|
||||
{
|
||||
char buffer[256];
|
||||
snprintf(buffer, sizeof(buffer), "%s(%s)", qnn::get_chipset_desc(dev_ctx->socinfo.soc_model),
|
||||
qnn::get_backend_desc(dev_ctx->device));
|
||||
dev_ctx->description = buffer;
|
||||
}
|
||||
// TODO: remove npu from here if hardware quantization is supported
|
||||
dev_ctx->enable_cpu_dequantize = device == QNN_BACKEND_CPU;
|
||||
|
||||
ggml_backend_t qnn_backend = new ggml_backend{
|
||||
/* .guid = */ ggml_backend_qnn_guid(),
|
||||
|
|
@ -425,16 +374,17 @@ struct ggml_backend_qnn_reg_impl : ggml_backend_reg {
|
|||
* here we skip the initialization of the CPU device,
* because it would block unsupported ops from falling back to the ggml CPU backend
|
||||
*/
|
||||
QNN_LOG_DEBUG("qnn backend registry skip CPU device\n");
|
||||
continue;
|
||||
}
|
||||
#endif
|
||||
|
||||
const auto & device_caps = qnn::get_device_caps(device_enum);
|
||||
device_contexts.emplace_back(std::make_unique<ggml_backend_qnn_device_context>(
|
||||
/* .device = */ device_enum, // init from the last device, i.e. NPU
|
||||
/* .threads = */ 1,
|
||||
/* .name = */ qnn::get_backend_name(device_enum),
|
||||
/* .lib_name = */ kDeviceCaps[device_enum].lib_name,
|
||||
/* .supported_types = */ kDeviceCaps[device_enum].supported_types));
|
||||
/* .supported_types = */ device_caps.supported_types));
|
||||
|
||||
devices.emplace_back(ggml_backend_device{
|
||||
/* iface = */ ggml_backend_qnn_device_interface,
@ -7,15 +7,27 @@
#include "ggml-impl.h"
#include "logger.hpp"
#include "op-config.hpp"
#include "profiler.hpp"
#include "tensor.hpp"

#ifdef GGML_QNN_ENABLE_PERFORMANCE_TRACKING
#    define GRAPH_PROFILE_HANDLE (_event_tracer ? _event_tracer->get_handle() : nullptr)
#    define GRAPH_PROFILE_PRINT()                  \
        if (_event_tracer) {                       \
            _event_tracer->print_profile_events(); \
        }                                          \
        (void) 0
#else
#    define GRAPH_PROFILE_HANDLE (nullptr)
#    define GRAPH_PROFILE_PRINT() (void) 0
#endif
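A hedged sketch of how these two helpers are meant to be used in the execution path; the macro names and _event_tracer come from this file, while the exact call site is an assumption based on the commit notes ("set profiler to graph execute", "print duration at event end"):

// Qnn_ProfileHandle_t profile = GRAPH_PROFILE_HANDLE;  // nullptr when tracking is disabled
// ... pass `profile` to the QNN graph-execute call ...
// GRAPH_PROFILE_PRINT();                               // prints collected events, no-op otherwise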
|
||||
namespace {
|
||||
using qnn_tensor_cache_t = std::unordered_map<ggml_tensor *, qnn::qnn_tensor_ptr_t>;
|
||||
|
||||
int get_op_max_rank(const ggml_tensor * op) {
|
||||
int max_rank = ggml_n_dims(op);
|
||||
const int count = (int) qnn::get_qnn_op_input_param_count(op);
|
||||
for (int i = 0; i < count; ++i) {
|
||||
int max_rank = ggml_n_dims(op);
|
||||
for (int i = 0; i < GGML_MAX_DIMS && op->src[i]; ++i) {
|
||||
max_rank = std::max(max_rank, ggml_n_dims(op->src[i]));
|
||||
}
|
||||
|
||||
|
|
@ -23,7 +35,8 @@ int get_op_max_rank(const ggml_tensor * op) {
|
|||
}
|
||||
|
||||
qnn::qnn_tensor_ptr_t create_tensor_with_cache(ggml_tensor * tensor, qnn::ggml_qnn_tensor::tensor_type_t type, int rank,
|
||||
QNNBackend device, Qnn_GraphHandle_t graph_handle,
|
||||
ggml_type override_data_type, QNNBackend device,
|
||||
Qnn_GraphHandle_t graph_handle,
|
||||
std::shared_ptr<qnn::qnn_instance> qnn_instance,
|
||||
qnn_tensor_cache_t & tensor_cache) {
|
||||
GGML_ASSERT(tensor);
|
||||
|
|
@ -31,21 +44,30 @@ qnn::qnn_tensor_ptr_t create_tensor_with_cache(ggml_tensor * tensor, qnn::ggml_q
|
|||
return tensor_cache[tensor];
|
||||
}
|
||||
|
||||
auto qnn_tensor = std::make_shared<qnn::ggml_qnn_tensor>(type, tensor->name, tensor->ne, tensor->type, rank, device,
|
||||
graph_handle, qnn_instance);
|
||||
QNN_LOG_DEBUG("[%s]create_tensor_with_cache, data_type: %s, override_data_type: %s\n",
|
||||
qnn::get_backend_name(device), ggml_type_name(tensor->type), ggml_type_name(override_data_type));
|
||||
auto data_type = override_data_type != GGML_TYPE_COUNT ? override_data_type : tensor->type;
|
||||
|
||||
// We've observed that tensors sharing the same name but coming from different op types can be added to the same
// graph, which causes the graph build to fail. To avoid this, we append the op type to the tensor name.
|
||||
char tensor_name[256];
|
||||
snprintf(tensor_name, sizeof(tensor_name), "%s_%s", ggml_get_name(tensor), ggml_op_desc(tensor));
|
||||
auto qnn_tensor = std::make_shared<qnn::ggml_qnn_tensor>(type, std::string(tensor_name), tensor->ne, data_type,
|
||||
rank, device, graph_handle, qnn_instance);
|
||||
tensor_cache[tensor] = qnn_tensor;
|
||||
return qnn_tensor;
|
||||
}
|
||||
|
||||
qnn::qnn_tensor_array_t create_tensors_with_cache(const qnn::ggml_tensor_array_t & ggml_tensors,
|
||||
qnn::ggml_qnn_tensor::tensor_type_t type, int rank, QNNBackend device,
|
||||
qnn::ggml_qnn_tensor::tensor_type_t type, int rank,
|
||||
ggml_type override_data_type, QNNBackend device,
|
||||
Qnn_GraphHandle_t graph_handle,
|
||||
std::shared_ptr<qnn::qnn_instance> qnn_instance,
|
||||
qnn_tensor_cache_t & tensor_cache) {
|
||||
qnn::qnn_tensor_array_t tensors;
|
||||
for (auto * tensor : ggml_tensors) {
|
||||
tensors.push_back(
|
||||
create_tensor_with_cache(tensor, type, rank, device, graph_handle, qnn_instance, tensor_cache));
|
||||
tensors.push_back(create_tensor_with_cache(tensor, type, rank, override_data_type, device, graph_handle,
|
||||
qnn_instance, tensor_cache));
|
||||
}
|
||||
|
||||
return tensors;
|
||||
|
|
@ -54,23 +76,23 @@ qnn::qnn_tensor_array_t create_tensors_with_cache(const qnn::ggml_tensor_array_t
|
|||
qnn::qnn_op_config_ptr_t create_operation_from_op_tensor(ggml_tensor * dst, const std::string & name, int rank,
|
||||
QNNBackend device, Qnn_GraphHandle_t graph_handle,
|
||||
std::shared_ptr<qnn::qnn_instance> qnn_instance,
|
||||
bool is_intermediate, qnn_tensor_cache_t & tensor_cache) {
|
||||
qnn_tensor_cache_t & tensor_cache) {
|
||||
auto operation = qnn::create_op(dst, name, qnn_instance);
|
||||
|
||||
// input tensors
|
||||
qnn::qnn_tensor_array_t input_qnn_tensors;
|
||||
auto tensor_type = is_intermediate ? qnn::ggml_qnn_tensor::INTERMEDIATE : qnn::ggml_qnn_tensor::INPUT;
|
||||
for (size_t i = 0; i < qnn::get_qnn_op_input_param_count(dst); ++i) {
|
||||
auto input_qnn_tensor =
|
||||
create_tensor_with_cache(dst->src[i], tensor_type, rank, device, graph_handle, qnn_instance, tensor_cache);
|
||||
for (size_t i = 0; i < GGML_MAX_DIMS && dst->src[i]; ++i) {
|
||||
auto * src = dst->src[i];
|
||||
auto input_qnn_tensor = create_tensor_with_cache(src, qnn::ggml_qnn_tensor::INTERMEDIATE, rank, GGML_TYPE_COUNT,
|
||||
device, graph_handle, qnn_instance, tensor_cache);
|
||||
input_qnn_tensors.push_back(input_qnn_tensor);
|
||||
}
|
||||
operation->set_input_tensors(input_qnn_tensors);
|
||||
|
||||
// output tensor
|
||||
tensor_type = is_intermediate ? qnn::ggml_qnn_tensor::INTERMEDIATE : qnn::ggml_qnn_tensor::OUTPUT;
|
||||
qnn::qnn_tensor_array_t output_qnn_tensors =
|
||||
create_tensors_with_cache({ dst }, tensor_type, rank, device, graph_handle, qnn_instance, tensor_cache);
|
||||
create_tensors_with_cache({ dst }, qnn::ggml_qnn_tensor::INTERMEDIATE, rank, GGML_TYPE_COUNT, device,
|
||||
graph_handle, qnn_instance, tensor_cache);
|
||||
operation->set_output_tensors(output_qnn_tensors);
|
||||
|
||||
// initialize operation
|
||||
|
|
@ -82,29 +104,6 @@ qnn::qnn_op_config_ptr_t create_operation_from_op_tensor(ggml_tensor * dst, cons
|
|||
return operation;
|
||||
}
|
||||
|
||||
bool bind_src_tensors(ggml_tensor * op, qnn::qnn_tensor_array_t & tensor_wrappers,
|
||||
std::vector<Qnn_Tensor_t> & qnn_tensors) {
|
||||
if (op->op == GGML_OP_NONE) {
|
||||
QNN_LOG_DEBUG("op %s is not a valid op\n", ggml_get_name(op));
|
||||
return false;
|
||||
}
|
||||
|
||||
const auto param_count = qnn::get_qnn_op_input_param_count(op);
|
||||
GGML_ASSERT(tensor_wrappers.size() == param_count);
|
||||
qnn_tensors.resize(param_count);
|
||||
for (size_t i = 0; i < param_count; ++i) {
|
||||
auto * ggml_tensor = op->src[i];
|
||||
if (!tensor_wrappers[i]->bind_ggml_tensor(ggml_tensor)) {
|
||||
QNN_LOG_ERROR("bind tensor %s failed\n", ggml_get_name(ggml_tensor));
|
||||
return false;
|
||||
}
|
||||
|
||||
qnn_tensors[i] = tensor_wrappers[i]->get_qnn_tensor();
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Extracts input and output tensors from a computational graph.
|
||||
*
|
||||
|
|
@ -134,11 +133,15 @@ int get_io_tensors_from_graph(const ggml_cgraph * cgraph, qnn::ggml_tensor_array
|
|||
continue;
|
||||
}
|
||||
|
||||
if (dst->op == GGML_OP_NONE || dst->op == GGML_OP_VIEW) {
|
||||
if (dst->op == GGML_OP_NONE || dst->op == GGML_OP_VIEW || dst->op == GGML_OP_PERMUTE) {
|
||||
// TODO: remove GGML_OP_VIEW after view op is supported
|
||||
QNN_LOG_DEBUG("node[%d]%s(%s), type: %s, skipped\n", i, ggml_get_name(dst), ggml_op_desc(dst),
|
||||
ggml_type_name(dst->type));
|
||||
continue;
|
||||
}
|
||||
|
||||
QNN_LOG_DEBUG("node[%d]%s(%s), type: %s\n", i, ggml_get_name(dst), ggml_op_desc(dst),
|
||||
ggml_type_name(dst->type));
|
||||
rank = std::max(rank, ggml_n_dims(dst));
|
||||
if (connectivity_map.count(dst) == 0) {
|
||||
connectivity_map[dst] = {
|
||||
|
|
@ -150,10 +153,12 @@ int get_io_tensors_from_graph(const ggml_cgraph * cgraph, qnn::ggml_tensor_array
|
|||
++(connectivity_map[dst].in_degree);
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < GGML_MAX_DIMS && dst->src[i]; ++i) {
|
||||
auto * src = dst->src[i];
|
||||
for (size_t j = 0; j < GGML_MAX_DIMS && dst->src[j]; ++j) {
|
||||
auto * src = dst->src[j];
|
||||
rank = std::max(rank, ggml_n_dims(src));
|
||||
|
||||
QNN_LOG_DEBUG("node[%d]: src[%d]: %s(%s), type: %s\n", i, (int) j, ggml_get_name(src), ggml_op_desc(src),
|
||||
ggml_type_name(src->type));
|
||||
if (connectivity_map.count(src) == 0) {
|
||||
connectivity_map[src] = {
|
||||
0,
|
||||
|
|
@ -187,16 +192,155 @@ int get_io_tensors_from_graph(const ggml_cgraph * cgraph, qnn::ggml_tensor_array
|
|||
return rank;
|
||||
}
|
||||
|
||||
/*
 * for src0_F32, src1_F32, dst_F32 -> GGML_TYPE_COUNT
 * for src0_F16, src1_F16, dst_F16 -> GGML_TYPE_COUNT
 * for src0_F16, src1_F32, dst_F32 -> GGML_TYPE_F32
 * for src0_q4, src1_F32, dst_F32 -> GGML_TYPE_F32
 * for src0_q4, src1_F16, dst_F32 -> GGML_TYPE_F32
 */
ggml_type get_override_data_type(const qnn::ggml_tensor_array_t & inputs, const qnn::ggml_tensor_array_t & outputs) {
    GGML_ASSERT(!inputs.empty());
    ggml_type override_data_type = inputs.front()->type;
    bool      is_same_data_type  = true;
    for (auto * tensor : inputs) {
        QNN_LOG_DEBUG("input_tensor: %s(%s), override_data_type(%s)\n", ggml_get_name(tensor),
                      ggml_type_name(tensor->type), ggml_type_name(override_data_type));
        is_same_data_type  = is_same_data_type && tensor->type == override_data_type;
        override_data_type = std::min(override_data_type, tensor->type);
    }

    for (auto * tensor : outputs) {
        QNN_LOG_DEBUG("output_tensor: %s(%s), override_data_type(%s)\n", ggml_get_name(tensor),
                      ggml_type_name(tensor->type), ggml_type_name(override_data_type));
        is_same_data_type  = is_same_data_type && tensor->type == override_data_type;
        override_data_type = std::min(override_data_type, tensor->type);
    }

    return is_same_data_type ? GGML_TYPE_COUNT : override_data_type;
}
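To make the selection rule above concrete, a short note on why taking std::min over ggml_type reproduces the table in the comment (this relies on the standard ggml_type enum ordering, where GGML_TYPE_F32 == 0, GGML_TYPE_F16 == 1 and the quantized types follow):

//   min(GGML_TYPE_Q4_0, GGML_TYPE_F16, GGML_TYPE_F32) == GGML_TYPE_F32
// so a graph mixing quantized or F16 tensors with any F32 tensor is overridden to F32,
// while a graph whose tensors all share one type returns GGML_TYPE_COUNT, meaning
// "no override needed".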
|
||||
|
||||
static const QnnHtpGraph_CustomConfig_t kDefaultHvxConfig = []() {
|
||||
QnnHtpGraph_CustomConfig_t hvx_config = QNN_HTP_GRAPH_CUSTOM_CONFIG_INIT;
|
||||
hvx_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_NUM_HVX_THREADS;
|
||||
hvx_config.numHvxThreads = 8;
|
||||
return hvx_config;
|
||||
}();
|
||||
|
||||
static const QnnHtpGraph_CustomConfig_t kDefaultDlbcConfig = []() {
|
||||
QnnHtpGraph_CustomConfig_t dlbc_config = QNN_HTP_GRAPH_CUSTOM_CONFIG_INIT;
|
||||
dlbc_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION;
|
||||
dlbc_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_DLBC;
|
||||
dlbc_config.optimizationOption.floatValue = 1.0; // set to 0.0 to turn off DLBC
|
||||
return dlbc_config;
|
||||
}();
|
||||
|
||||
/*
|
||||
* 1 = Faster preparation time, less optimal graph
|
||||
* 2 = Longer preparation time, more optimal graph
|
||||
* 3 = Longest preparation time, most likely even more optimal graph:
|
||||
* QNN_HTP_DEVICE_CONFIG_OPTION_SOC configuration will be taken into account when possible, details see HTP Backend Specific Page
|
||||
*/
|
||||
static const QnnHtpGraph_CustomConfig_t kDefaultOptConfig = []() {
|
||||
QnnHtpGraph_CustomConfig_t opt_config = QNN_HTP_GRAPH_CUSTOM_CONFIG_INIT;
|
||||
opt_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION;
|
||||
opt_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG;
|
||||
#ifndef NDEBUG
|
||||
opt_config.optimizationOption.floatValue = 3;
|
||||
#else
|
||||
opt_config.optimizationOption.floatValue = 1;
|
||||
#endif
|
||||
return opt_config;
|
||||
}();
|
||||
|
||||
static const QnnHtpGraph_CustomConfig_t kHtpPrecisionConfigF16 = []() {
|
||||
QnnHtpGraph_CustomConfig_t precision_config = QNN_HTP_GRAPH_CUSTOM_CONFIG_INIT;
|
||||
precision_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_PRECISION;
|
||||
precision_config.precision = QNN_PRECISION_FLOAT16;
|
||||
return precision_config;
|
||||
}();
|
||||
|
||||
constexpr QnnHtpGraph_CustomConfig_t make_vtcm_config(size_t vtcm_size_in_mb) {
|
||||
QnnHtpGraph_CustomConfig_t vtcm_config = QNN_HTP_GRAPH_CUSTOM_CONFIG_INIT;
|
||||
vtcm_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE;
|
||||
vtcm_config.vtcmSizeInMB = (uint32_t) vtcm_size_in_mb;
|
||||
return vtcm_config;
|
||||
}
|
||||
|
||||
constexpr QnnGraph_Config_t make_graph_config(const QnnHtpGraph_CustomConfig_t * custom_config) {
|
||||
QnnGraph_Config_t graph_config = QNN_GRAPH_CONFIG_INIT;
|
||||
graph_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM;
|
||||
graph_config.customConfig = const_cast<QnnHtpGraph_CustomConfig_t *>(custom_config);
|
||||
return graph_config;
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
namespace qnn {
|
||||
|
||||
qnn_graph::qnn_graph(const std::string & graph_name, QNNBackend device, std::shared_ptr<qnn_instance> qnn_instance,
|
||||
size_t vtcm_size_in_mb) :
|
||||
ggml_type qnn_graph::get_graph_key_from_cgraph(const ggml_cgraph * cgraph, std::string & output) {
|
||||
if (cgraph->n_nodes == 0) {
|
||||
QNN_LOG_DEBUG("empty cgraph\n");
|
||||
return GGML_TYPE_COUNT;
|
||||
}
|
||||
|
||||
ggml_type override_type = GGML_TYPE_COUNT;
|
||||
{
|
||||
// TODO: can we have a better approach to get the override_type here?
|
||||
// though it is O(n) + O(mlog(m)) complexity, our graph is small, so it is fine
|
||||
ggml_tensor_array_t inputs;
|
||||
ggml_tensor_array_t outputs;
|
||||
get_io_tensors_from_graph(cgraph, inputs, outputs);
|
||||
if (!inputs.empty() && !outputs.empty()) {
|
||||
override_type = get_override_data_type(inputs, outputs);
|
||||
QNN_LOG_DEBUG("get_graph_key, override_type: %s\n", ggml_type_name(override_type));
|
||||
} else {
|
||||
QNN_LOG_DEBUG("get_graph_key, no input or output tensors\n");
|
||||
}
|
||||
}
|
||||
|
||||
ggml_type min_op_type = GGML_TYPE_COUNT;
|
||||
{
|
||||
bool is_start = true;
|
||||
for (int i = 0; i < cgraph->n_nodes; ++i) {
|
||||
auto * op = cgraph->nodes[i];
|
||||
if (ggml_is_empty(op)) {
|
||||
QNN_LOG_DEBUG("empty op in graph, skipping\n");
|
||||
continue;
|
||||
}
|
||||
|
||||
if (op->op == GGML_OP_NONE || op->op == GGML_OP_VIEW || op->op == GGML_OP_PERMUTE) {
|
||||
QNN_LOG_DEBUG("%s in graph, skipping\n", ggml_op_desc(op));
|
||||
continue;
|
||||
}
|
||||
|
||||
min_op_type = std::min(min_op_type, op->type);
|
||||
if (is_start) {
|
||||
qnn::get_qnn_op_desc(op, is_start, override_type, output);
|
||||
is_start = false;
|
||||
} else {
|
||||
output += '#';
|
||||
qnn::get_qnn_op_desc(op, is_start, override_type, output);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (cgraph->n_nodes > 1) {
|
||||
auto * last_op = cgraph->nodes[cgraph->n_nodes - 1];
|
||||
output += qnn::get_ggml_type_name(last_op->type);
|
||||
output += '_';
|
||||
qnn::append_tensor_shape_and_type(last_op, output);
|
||||
}
|
||||
|
||||
return min_op_type;
|
||||
}
|
||||
|
||||
qnn_graph::qnn_graph(const std::string & graph_name, QNNBackend device, qnn_instance_ptr qnn_instance,
|
||||
htp_precision precision, size_t vtcm_size_in_mb) :
|
||||
_graph_name(graph_name),
|
||||
_device(device),
|
||||
_qnn_instance(qnn_instance) {
|
||||
QNN_LOG_DEBUG("[%s][%s]created\n", get_backend_name(device), graph_name.c_str());
|
||||
QNN_LOG_DEBUG("[%s][%s]creating\n", get_backend_name(device), graph_name.c_str());
|
||||
|
||||
auto qnn_interface = qnn_instance->get_qnn_interface();
|
||||
auto qnn_context = qnn_instance->get_qnn_context_handle();
|
||||
|
|
@ -204,38 +348,29 @@ qnn_graph::qnn_graph(const std::string & graph_name, QNNBackend device, std::sha
|
|||
Qnn_GraphHandle_t graph_handle = nullptr;
|
||||
if (device == QNN_BACKEND_NPU) {
|
||||
// TODO: fix graph config here for NPU
|
||||
QnnHtpGraph_CustomConfig_t hvx_config;
|
||||
hvx_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_NUM_HVX_THREADS;
|
||||
hvx_config.numHvxThreads = 8;
|
||||
QnnGraph_Config_t graph_hvx_config;
|
||||
graph_hvx_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM;
|
||||
graph_hvx_config.customConfig = &hvx_config;
|
||||
std::vector<const QnnGraph_Config_t *> graph_configs;
|
||||
|
||||
QnnHtpGraph_CustomConfig_t dlbc_config;
|
||||
dlbc_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION;
|
||||
dlbc_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_DLBC;
|
||||
dlbc_config.optimizationOption.floatValue = 1.0; // set to 0.0 to turn off DLBC
|
||||
QnnGraph_Config_t graph_dlbc_config;
|
||||
graph_dlbc_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM;
|
||||
graph_dlbc_config.customConfig = &dlbc_config;
|
||||
auto hvx_config = make_graph_config(&kDefaultHvxConfig);
|
||||
graph_configs.push_back(&hvx_config);
|
||||
|
||||
QnnHtpGraph_CustomConfig_t opt_config;
|
||||
opt_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG;
|
||||
opt_config.optimizationOption.floatValue = 1; // 1 / 3
|
||||
QnnGraph_Config_t graph_opt_config;
|
||||
graph_opt_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM;
|
||||
graph_opt_config.customConfig = &opt_config;
|
||||
auto dlbc_config = make_graph_config(&kDefaultDlbcConfig);
|
||||
graph_configs.push_back(&dlbc_config);
|
||||
|
||||
QnnHtpGraph_CustomConfig_t vtcm_config;
|
||||
vtcm_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE;
|
||||
vtcm_config.vtcmSizeInMB = (uint32_t) vtcm_size_in_mb;
|
||||
QnnGraph_Config_t graph_vtcm_config;
|
||||
graph_vtcm_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM;
|
||||
graph_vtcm_config.customConfig = &vtcm_config;
|
||||
auto opt_config = make_graph_config(&kDefaultOptConfig);
|
||||
graph_configs.push_back(&opt_config);
|
||||
|
||||
const QnnGraph_Config_t * graph_configs[] = { &graph_hvx_config, &graph_dlbc_config, &graph_vtcm_config,
|
||||
&graph_opt_config, nullptr };
|
||||
error = qnn_interface->qnn_graph_create(qnn_context, graph_name.c_str(), graph_configs, &graph_handle);
|
||||
auto vctm_sub_config = make_vtcm_config(vtcm_size_in_mb);
|
||||
auto vtcm_config = make_graph_config(&vctm_sub_config);
|
||||
graph_configs.push_back(&vtcm_config);
|
||||
|
||||
if (precision == qnn_graph::kHtpFp16) {
|
||||
auto precision_config = make_graph_config(&kHtpPrecisionConfigF16);
|
||||
graph_configs.push_back(&precision_config);
|
||||
QNN_LOG_DEBUG("[%s][%s]set precision to F16\n", get_backend_name(device), graph_name.c_str());
|
||||
}
|
||||
|
||||
graph_configs.push_back(nullptr);
|
||||
error = qnn_interface->qnn_graph_create(qnn_context, graph_name.c_str(), graph_configs.data(), &graph_handle);
|
||||
} else {
|
||||
error = qnn_interface->qnn_graph_create(qnn_context, graph_name.c_str(), nullptr, &graph_handle);
|
||||
}
|
||||
|
|
@ -246,9 +381,16 @@ qnn_graph::qnn_graph(const std::string & graph_name, QNNBackend device, std::sha
|
|||
return;
|
||||
}
|
||||
|
||||
QNN_LOG_DEBUG("[%s][%s]create succeed\n", get_backend_name(device), graph_name.c_str());
|
||||
#ifdef GGML_QNN_ENABLE_PERFORMANCE_TRACKING
|
||||
if (device == QNN_BACKEND_NPU) {
|
||||
_event_tracer = std::make_shared<qnn_event_tracer>(
|
||||
graph_name, qnn_interface, qnn_instance->get_qnn_backend_handle(), qnn_event_tracer::PROFILE_OP_TRACE);
|
||||
}
|
||||
#endif
|
||||
|
||||
_graph_handle = graph_handle;
|
||||
_qnn_interface = qnn_interface;
|
||||
QNN_LOG_DEBUG("[%s][%s]create succeed\n", get_backend_name(device), graph_name.c_str());
|
||||
}
|
||||
|
||||
qnn_graph::~qnn_graph() {
|
||||
|
|
@ -261,15 +403,28 @@ bool qnn_graph::build_graph_from_ggml_graph(const ggml_cgraph * cgraph) {
|
|||
ggml_tensor_array_t inputs;
|
||||
ggml_tensor_array_t outputs;
|
||||
int rank = get_io_tensors_from_graph(cgraph, inputs, outputs);
|
||||
QNN_LOG_DEBUG("[%s]rank: %d, input_set: %d, output_set: %d\n", get_backend_name(_device), rank, int(inputs.size()),
|
||||
int(outputs.size()));
|
||||
QNN_LOG_DEBUG("[%s][%s]rank: %d, graph_nodes: %d, input_set: %d, output_set: %d\n", get_backend_name(_device),
|
||||
_graph_name.c_str(), rank, cgraph->n_nodes, int(inputs.size()), int(outputs.size()));
|
||||
|
||||
{
|
||||
static_assert(
|
||||
GGML_TYPE_COUNT > GGML_TYPE_Q8_0 && GGML_TYPE_Q8_0 > GGML_TYPE_F16 && GGML_TYPE_F16 > GGML_TYPE_F32,
|
||||
"GGML_TYPE enum order is not correct");
|
||||
|
||||
QNN_SCOPED_PERFORMANCE_TRACKER("[%s][%s]build_graph_from_ggml_graph", get_backend_name(_device),
|
||||
_graph_name.c_str());
|
||||
|
||||
auto override_data_type = get_override_data_type(inputs, outputs);
|
||||
if (override_data_type != GGML_TYPE_COUNT) {
|
||||
QNN_LOG_DEBUG("[%s][%s]set override_data_type: %s\n", get_backend_name(_device), _graph_name.c_str(),
|
||||
ggml_type_name(override_data_type));
|
||||
}
|
||||
|
||||
qnn_tensor_cache_t tensor_cache;
|
||||
auto input_tensors = create_tensors_with_cache(inputs, ggml_qnn_tensor::INPUT, rank, _device, _graph_handle,
|
||||
_qnn_instance, tensor_cache);
|
||||
auto output_tensors = create_tensors_with_cache(outputs, ggml_qnn_tensor::OUTPUT, rank, _device, _graph_handle,
|
||||
_qnn_instance, tensor_cache);
|
||||
auto input_tensors = create_tensors_with_cache(inputs, ggml_qnn_tensor::INPUT, rank, override_data_type,
|
||||
_device, _graph_handle, _qnn_instance, tensor_cache);
|
||||
auto output_tensors = create_tensors_with_cache(outputs, ggml_qnn_tensor::OUTPUT, rank, GGML_TYPE_COUNT,
|
||||
_device, _graph_handle, _qnn_instance, tensor_cache);
|
||||
qnn_op_config_array_t operations;
|
||||
for (int i = 0; i < cgraph->n_nodes; i++) {
|
||||
ggml_tensor * dst = cgraph->nodes[i];
|
||||
|
|
@ -277,14 +432,21 @@ bool qnn_graph::build_graph_from_ggml_graph(const ggml_cgraph * cgraph) {
|
|||
continue;
|
||||
}
|
||||
|
||||
if (dst->op == GGML_OP_NONE || dst->op == GGML_OP_VIEW) {
|
||||
if (dst->op == GGML_OP_NONE || dst->op == GGML_OP_VIEW || dst->op == GGML_OP_PERMUTE) {
|
||||
// TODO: remove GGML_OP_VIEW after view op is supported
|
||||
continue;
|
||||
}
|
||||
|
||||
QNN_LOG_DEBUG("[%s]create op: %s\n", get_backend_name(_device), get_qnn_op_name(dst));
|
||||
#ifndef NDEBUG
|
||||
{
|
||||
std::string op_desc;
|
||||
get_qnn_op_desc(dst, true, GGML_TYPE_COUNT, op_desc);
|
||||
QNN_LOG_DEBUG("[%s]create op(%s) with qnn op(%s)\n", get_backend_name(_device), op_desc.c_str(),
|
||||
get_qnn_op_name(dst));
|
||||
}
|
||||
#endif
|
||||
auto operation = create_operation_from_op_tensor(dst, dst->name, rank, _device, _graph_handle,
|
||||
_qnn_instance, true, tensor_cache); // TODO: fix op name
|
||||
_qnn_instance, tensor_cache); // TODO: fix op name
|
||||
operations.push_back(operation);
|
||||
}
|
||||
|
||||
|
|
@ -300,59 +462,81 @@ bool qnn_graph::build_graph_from_ggml_graph(const ggml_cgraph * cgraph) {
|
|||
return true;
|
||||
}
|
||||
|
||||
bool qnn_graph::execute(const ggml_cgraph * cgraph) {
|
||||
bool qnn_graph::execute(const ggml_cgraph * cgraph, std::shared_ptr<qnn_convert_context_t> convert_context) {
|
||||
ggml_tensor_array_t inputs;
|
||||
ggml_tensor_array_t outputs;
|
||||
{
|
||||
QNN_SCOPED_PERFORMANCE_TRACKER("[%s][%s]get_io_tensors_from_graph", get_backend_name(_device),
|
||||
_graph_name.c_str());
|
||||
#ifdef NDEBUG
|
||||
get_io_tensors_from_graph(cgraph, inputs, outputs);
|
||||
get_io_tensors_from_graph(cgraph, inputs, outputs);
|
||||
#else
|
||||
int rank = get_io_tensors_from_graph(cgraph, inputs, outputs);
|
||||
QNN_LOG_DEBUG("[%s]rank: %d, input_set: %d, output_set: %d\n", get_backend_name(_device), rank, int(inputs.size()),
|
||||
int(outputs.size()));
|
||||
int rank = get_io_tensors_from_graph(cgraph, inputs, outputs);
|
||||
QNN_LOG_DEBUG("[%s]rank: %d, input_set: %d, output_set: %d\n", get_backend_name(_device), rank,
|
||||
int(inputs.size()), int(outputs.size()));
|
||||
#endif
|
||||
}
|
||||
|
||||
{
|
||||
if (!qnn::bind_tensors(inputs, _tensor_inputs, _qnn_tensor_inputs)) {
|
||||
QNN_LOG_ERROR("[%s][%s]bind input tensors failed\n", get_backend_name(_device), _graph_name.c_str());
|
||||
return false;
|
||||
QNN_SCOPED_PERFORMANCE_TRACKER("[%s][%s]bind_tensors", get_backend_name(_device), _graph_name.c_str());
|
||||
auto override_data_type = get_override_data_type(inputs, outputs);
|
||||
if (override_data_type != GGML_TYPE_COUNT) {
|
||||
QNN_LOG_DEBUG("[%s][%s]override_data_type: %s\n", get_backend_name(_device), _graph_name.c_str(),
|
||||
ggml_type_name(override_data_type));
|
||||
auto buffers = convert(convert_context, inputs, override_data_type);
|
||||
if (!qnn::bind_tensors_with_custom_buffers(inputs, buffers, _tensor_inputs, _qnn_tensor_inputs)) {
|
||||
QNN_LOG_ERROR("[%s][%s]bind input tensors failed\n", get_backend_name(_device), _graph_name.c_str());
|
||||
return false;
|
||||
}
|
||||
} else {
|
||||
if (!qnn::bind_tensors(inputs, _tensor_inputs, _qnn_tensor_inputs)) {
|
||||
QNN_LOG_ERROR("[%s][%s]bind input tensors failed\n", get_backend_name(_device), _graph_name.c_str());
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
if (!qnn::bind_tensors(outputs, _tensor_outputs, _qnn_tensor_outputs)) {
|
||||
QNN_LOG_ERROR("[%s][%s]bind output tensors failed\n", get_backend_name(_device), _graph_name.c_str());
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
QNN_SCOPED_PERFORMANCE_TRACKER("[%s][%s]execute", get_backend_name(_device), _graph_name.c_str());
|
||||
auto & qnn_tensor_inputs = _qnn_tensor_inputs;
|
||||
auto & qnn_tensor_outputs = _qnn_tensor_outputs;
|
||||
auto error =
|
||||
_qnn_interface->qnn_graph_execute(_graph_handle, qnn_tensor_inputs.data(), qnn_tensor_inputs.size(),
|
||||
qnn_tensor_outputs.data(), qnn_tensor_outputs.size(), nullptr, nullptr);
|
||||
auto error = _qnn_interface->qnn_graph_execute(_graph_handle, qnn_tensor_inputs.data(),
|
||||
qnn_tensor_inputs.size(), qnn_tensor_outputs.data(),
|
||||
qnn_tensor_outputs.size(), GRAPH_PROFILE_HANDLE, nullptr);
|
||||
unbind_tensors(_tensor_inputs);
|
||||
unbind_tensors(_tensor_outputs);
|
||||
|
||||
if (error != QNN_SUCCESS) {
|
||||
if (_device == QNN_BACKEND_NPU && error == QNN_COMMON_ERROR_SYSTEM_COMMUNICATION) {
|
||||
QNN_LOG_WARN("[%s][%s]NPU crashed. SSR detected. Caused QNN graph execute error.\n",
|
||||
QNN_LOG_WARN("[%s][%s][execute]NPU crashed. SSR detected. Caused QNN graph execute error.\n",
|
||||
get_backend_name(_device), _graph_name.c_str());
|
||||
} else {
|
||||
QNN_LOG_ERROR("[%s][%s]error: %s\n", get_backend_name(_device), _graph_name.c_str(),
|
||||
QNN_LOG_ERROR("[%s][%s][execute]error: %s\n", get_backend_name(_device), _graph_name.c_str(),
|
||||
get_qnn_error_string(error));
|
||||
}
|
||||
return false;
|
||||
}
|
||||

        QNN_LOG_DEBUG("[%s][%s]execute succeed\n", get_backend_name(_device), _graph_name.c_str());
        return true;
    }

    GRAPH_PROFILE_PRINT();
    return true;
}
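
// A minimal end-to-end sketch of how this class is driven, assuming the
// constructor/build/execute flow defined above; `instance`, `cgraph` and
// `convert_context` are placeholders supplied by the caller, not symbols from this patch:
//
//   qnn_graph graph("my_graph_key", QNN_BACKEND_NPU, instance, qnn_graph::kHtpFp16, /*vtcm_size_in_mb=*/64);
//   if (graph.is_valid() &&
//       graph.build_graph_from_ggml_graph(cgraph) &&
//       graph.execute(cgraph, convert_context)) {
//       // on success the ggml output tensors bound during execute() hold the results
//   }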
|
||||
|
||||
bool qnn_graph::finalize() {
|
||||
QNN_SCOPED_PERFORMANCE_TRACKER("[%s][%s]finalize", get_backend_name(_device), _graph_name.c_str());
|
||||
|
||||
if (!qnn::add_op_to_graph(_graph_handle, _operations)) {
|
||||
QNN_LOG_ERROR("[%s]add nodes failed\n", _graph_name.c_str());
|
||||
return false;
|
||||
}
|
||||
|
||||
auto error = _qnn_interface->qnn_graph_finalize(_graph_handle, nullptr, nullptr);
|
||||
auto error = _qnn_interface->qnn_graph_finalize(_graph_handle, GRAPH_PROFILE_HANDLE, nullptr);
|
||||
if (error != QNN_SUCCESS) {
|
||||
QNN_LOG_ERROR("[%s][%s]qnn_graph_finalize.error: %s\n", get_backend_name(_device), _graph_name.c_str(),
|
||||
get_qnn_error_string(error));
|
||||
|
|
|
|||
|
|
@ -5,8 +5,10 @@
|
|||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "convert.hpp"
|
||||
#include "ggml-qnn.h"
|
||||
#include "op-config.hpp"
|
||||
#include "profiler.hpp"
|
||||
#include "qnn-lib.hpp"
|
||||
|
||||
namespace qnn {
|
||||
|
|
@ -21,19 +23,42 @@ namespace qnn {
|
|||
*/
|
||||
class qnn_graph {
|
||||
public:
|
||||
explicit qnn_graph(const std::string & graph_name, QNNBackend device, std::shared_ptr<qnn_instance> qnn_instance,
|
||||
size_t vtcm_size_in_mb);
|
||||
enum htp_precision {
|
||||
kHtpDefault = 0,
|
||||
kHtpFp16,
|
||||
};

    /**
     * @brief Generates a unique key for a given computation graph (cgraph).
     *
     * This key is used to cache the graph, enabling efficient reuse of previously
     * compiled graphs. The key is constructed by concatenating the descriptions
     * of the operations and their associated tensor dimensions within the graph.
     *
     * Example key format: "MUL_MATf32_2048x8192q4_K_2048x2f32#MUL(SILU,MUL_MAT)#MUL_MAT(NONE,MUL)#ADD(MUL_MAT,ADD)f32_2048x2f32"
     *
     * @param cgraph The computation graph for which the key is generated.
     * @param output The string where the generated key will be stored.
     * @return The minimum ggml_type enum value across all op tensors in the graph
     *         (i.e. the widest, least-quantized type encountered).
     *
     * TODO: Improve the key generation logic to handle more complex graph structures and edge cases.
     */
    static ggml_type get_graph_key_from_cgraph(const ggml_cgraph * cgraph, std::string & output);
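
    // Illustrative usage sketch (not part of this change): how a backend-side cache
    // might consume the key. `graph_cache` and `make_graph` are hypothetical
    // placeholders; only get_graph_key_from_cgraph() and execute() are real APIs here.
    //
    //   std::string key;
    //   ggml_type min_type = qnn_graph::get_graph_key_from_cgraph(cgraph, key);
    //   auto it = graph_cache.find(key);
    //   if (it == graph_cache.end()) {
    //       it = graph_cache.emplace(key, make_graph(cgraph, min_type)).first;
    //   }
    //   it->second->execute(cgraph, convert_context);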
|
||||
|
||||
explicit qnn_graph(const std::string & graph_name, QNNBackend device, qnn_instance_ptr qnn_instance,
|
||||
htp_precision precision, size_t vtcm_size_in_mb);
|
||||
|
||||
~qnn_graph();
|
||||
|
||||
bool build_graph_from_ggml_graph(const ggml_cgraph * cgraph);
|
||||
|
||||
bool execute(const ggml_cgraph * cgraph);
|
||||
bool execute(const ggml_cgraph * cgraph, std::shared_ptr<qnn_convert_context_t> convert_context);
|
||||
|
||||
bool is_valid() const { return _graph_handle != nullptr; }
|
||||
|
||||
Qnn_GraphHandle_t get_graph_handler() const { return _graph_handle; }
|
||||
|
||||
std::shared_ptr<qnn_instance> get_qnn_instance() { return _qnn_instance; }
|
||||
qnn_instance_ptr get_qnn_instance() { return _qnn_instance; }
|
||||
|
||||
const std::string & get_name() const { return _graph_name; }
|
||||
|
||||
|
|
@ -42,18 +67,23 @@ class qnn_graph {
|
|||
private:
|
||||
bool finalize();
|
||||
|
||||
const std::string _graph_name;
|
||||
const QNNBackend _device;
|
||||
Qnn_GraphHandle_t _graph_handle = nullptr;
|
||||
std::shared_ptr<qnn_instance> _qnn_instance;
|
||||
std::shared_ptr<qnn_interface> _qnn_interface;
|
||||
qnn_op_config_array_t _operations;
|
||||
const std::string _graph_name;
|
||||
const QNNBackend _device;
|
||||
Qnn_GraphHandle_t _graph_handle = nullptr;
|
||||
qnn_instance_ptr _qnn_instance;
|
||||
qnn_interface_ptr _qnn_interface;
|
||||
qnn_op_config_array_t _operations;
|
||||
|
||||
qnn_tensor_array_t _tensor_inputs;
|
||||
qnn_tensor_array_t _tensor_outputs;
|
||||
std::vector<Qnn_Tensor_t> _qnn_tensor_inputs;
|
||||
std::vector<Qnn_Tensor_t> _qnn_tensor_outputs;
|
||||
|
||||
#ifdef GGML_QNN_ENABLE_PERFORMANCE_TRACKING
|
||||
// profiler
|
||||
qnn_event_tracer_ptr _event_tracer;
|
||||
#endif
|
||||
|
||||
DISABLE_COPY(qnn_graph);
|
||||
DISABLE_MOVE(qnn_graph);
|
||||
};
|
||||
|
|
|
|||
|
|
@ -1,10 +1,11 @@
|
|||
#pragma once
|
||||
|
||||
#include <QnnLog.h>
|
||||
|
||||
#include <cstdint>
|
||||
|
||||
#include "ggml-impl.h"
|
||||
#include "ggml.h"
|
||||
#include "QnnLog.h"
|
||||
|
||||
namespace qnn {
|
||||
void sdk_logcallback(const char * fmt, QnnLog_Level_t level, uint64_t timestamp, va_list argp);
|
||||
|
|
@@ -13,4 +14,9 @@ void sdk_logcallback(const char * fmt, QnnLog_Level_t level, uint64_t timestamp,
#define QNN_LOG_ERROR(...) (GGML_LOG_ERROR(__VA_ARGS__))
#define QNN_LOG_WARN(...)  (GGML_LOG_WARN(__VA_ARGS__))
#define QNN_LOG_INFO(...)  (GGML_LOG_INFO(__VA_ARGS__))
#define QNN_LOG_DEBUG(...) (GGML_LOG_DEBUG(__VA_ARGS__))

#ifndef NDEBUG
#    define QNN_LOG_DEBUG(...) (GGML_LOG_DEBUG(__VA_ARGS__))
#else
#    define QNN_LOG_DEBUG(...)
#endif
|
||||
|
|
|
|||
|
|
@ -70,7 +70,7 @@ class ggml_qnn_op_config {
|
|||
*
|
||||
* @return A reference to a vector of qnn_tensor_ptr_t objects representing the input tensors.
|
||||
*/
|
||||
virtual const qnn_tensor_array_t & get_input_tensors() = 0;
|
||||
virtual qnn_tensor_array_t & get_input_tensors() = 0;
|
||||
|
||||
/**
|
||||
* @brief Pure virtual function to retrieve the output tensors of a QNN.
|
||||
|
|
@ -81,7 +81,7 @@ class ggml_qnn_op_config {
|
|||
*
|
||||
* @return A reference to a vector of qnn_tensor_ptr_t objects representing the output tensors.
|
||||
*/
|
||||
virtual const qnn_tensor_array_t & get_output_tensors() = 0;
|
||||
virtual qnn_tensor_array_t & get_output_tensors() = 0;
|
||||
|
||||
/**
|
||||
* @brief Adds an operation to the given graph.
|
||||
|
|
|
|||
|
|
@ -3,30 +3,77 @@
|
|||
|
||||
namespace {
|
||||
|
||||
using op_constructor_t = std::shared_ptr<qnn::ggml_qnn_op_config> (*)(const ggml_tensor *, const std::string &,
|
||||
using op_constructor_t = std::shared_ptr<qnn::ggml_qnn_op_config> (*)(const ggml_tensor *, const std::string &,
|
||||
std::shared_ptr<qnn::qnn_instance>);
|
||||
using op_dims_calc_func_t = void (*)(const std::vector<qnn::ggml_dimension_array_t> & input_dims,
|
||||
qnn::ggml_dimension_array_t & output_dims);
|
||||
|
||||
void element_wise_op_dims(const std::vector<qnn::ggml_dimension_array_t> & input_dims,
|
||||
qnn::ggml_dimension_array_t & output_dims) {
|
||||
for (size_t i = 1; i < std::size(output_dims); i++) {
|
||||
output_dims[i] = input_dims.front()[i];
|
||||
using op_description_generator_t = void (*)(const ggml_tensor * op, bool append_dimensions,
|
||||
ggml_type override_data_type, std::string & output);
|
||||
|
||||
void append_tensor_shape_and_type_impl(const ggml_tensor * tensor, ggml_type override_data_type, std::string & output) {
|
||||
char buffer[256] = {};
|
||||
const auto * type_name = qnn::get_ggml_type_name(std::min(tensor->type, override_data_type));
|
||||
int len = 0;
|
||||
switch (ggml_n_dims(tensor)) {
|
||||
case 1:
|
||||
len = snprintf(buffer, sizeof(buffer), "%ld%s", (long) tensor->ne[0], type_name);
|
||||
break;
|
||||
case 2:
|
||||
len = snprintf(buffer, sizeof(buffer), "%ldx%ld%s", (long) tensor->ne[0], (long) tensor->ne[1], type_name);
|
||||
break;
|
||||
case 3:
|
||||
len = snprintf(buffer, sizeof(buffer), "%ldx%ldx%ld%s", (long) tensor->ne[0], (long) tensor->ne[1],
|
||||
(long) tensor->ne[2], type_name);
|
||||
break;
|
||||
case 4:
|
||||
default:
|
||||
len = snprintf(buffer, sizeof(buffer), "%ldx%ldx%ldx%ld%s", (long) tensor->ne[0], (long) tensor->ne[1],
|
||||
(long) tensor->ne[2], (long) tensor->ne[3], type_name);
|
||||
break;
|
||||
}
|
||||
GGML_ASSERT(len > 0 && len < (int) sizeof(buffer));
|
||||
output.append(buffer, len);
|
||||
}
|
||||
|
||||
void get_graph_key_from_op(const ggml_tensor * op, ggml_type override_data_type, std::string & output) {
|
||||
output += ggml_op_desc(op);
|
||||
output += qnn::get_ggml_type_name(op->type);
|
||||
for (size_t i = 0; i < GGML_MAX_SRC && op->src[i]; ++i) {
|
||||
auto * src = op->src[i];
|
||||
if (!src) {
|
||||
break;
|
||||
}
|
||||
|
||||
output += '_';
|
||||
append_tensor_shape_and_type_impl(src, override_data_type, output);
|
||||
}
|
||||
}
|
||||
|
||||
void mat_mul_op_dims(const std::vector<qnn::ggml_dimension_array_t> & input_dims,
|
||||
qnn::ggml_dimension_array_t & output_dims) {
|
||||
GGML_ASSERT(input_dims.size() == 2);
|
||||
output_dims[0] = input_dims.front()[1];
|
||||
output_dims[1] = input_dims.back()[1];
|
||||
void get_op_key_with_src_op_desc(const ggml_tensor * op, std::string & output) {
    output += ggml_op_desc(op);
    output += '(';
    if (op->src[0]) {
        output += ggml_op_desc(op->src[0]);
    }
    for (size_t i = 1; i < GGML_MAX_SRC && op->src[i]; ++i) {
        output += ',';
        output += ggml_op_desc(op->src[i]);
    }
    output += ')';
}
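
// Worked example (sketch): for an ADD node whose sources are a MUL_MAT and another
// ADD, the fragment appended here is "ADD(MUL_MAT,ADD)". `add_node` is a placeholder:
//
//   std::string key;
//   get_op_key_with_src_op_desc(add_node, key);  // key == "ADD(MUL_MAT,ADD)"
//
// Joined with '#' per node plus the shape/type suffix from get_graph_key_from_op,
// this yields whole-graph keys like the example documented in graph.hpp.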
|
||||
|
||||
void generic_get_op_desc(const ggml_tensor * op, bool append_dimensions, ggml_type override_data_type,
|
||||
std::string & output) {
|
||||
if (append_dimensions) {
|
||||
get_graph_key_from_op(op, override_data_type, output);
|
||||
} else {
|
||||
get_op_key_with_src_op_desc(op, output);
|
||||
}
|
||||
}
|
||||
|
||||
struct qnn_op_caps_t {
|
||||
const char * qnn_op_name = nullptr;
|
||||
const size_t input_param_count = 0;
|
||||
op_dims_calc_func_t calc_dims_func = nullptr;
|
||||
const char * qnn_param_name = nullptr;
|
||||
const char * qnn_op_name = nullptr;
|
||||
op_description_generator_t get_desc = nullptr;
|
||||
const char * qnn_param_name = nullptr;
|
||||
};
|
||||
|
||||
constexpr const qnn_op_caps_t kOpCaps[] = {
|
||||
|
|
@ -35,41 +82,29 @@ constexpr const qnn_op_caps_t kOpCaps[] = {
|
|||
{
|
||||
// GGML_OP_ADD
|
||||
QNN_OP_ELEMENT_WISE_ADD, // qnn_op_name
|
||||
2, // input_param_count
|
||||
element_wise_op_dims, // calc_dims_func
|
||||
},
|
||||
{}, // GGML_OP_ADD1
|
||||
{}, // GGML_OP_ACC
|
||||
{
|
||||
// GGML_OP_SUB
|
||||
QNN_OP_ELEMENT_WISE_SUBTRACT, // qnn_op_name
|
||||
2, // input_param_count
|
||||
element_wise_op_dims, // calc_dims_func
|
||||
},
|
||||
{
|
||||
// GGML_OP_MUL
|
||||
QNN_OP_ELEMENT_WISE_MULTIPLY, // qnn_op_name
|
||||
2, // input_param_count
|
||||
element_wise_op_dims, // calc_dims_func
|
||||
},
|
||||
{
|
||||
// GGML_OP_DIV
|
||||
QNN_OP_ELEMENT_WISE_DIVIDE, // qnn_op_name
|
||||
2, // input_param_count
|
||||
element_wise_op_dims, // calc_dims_func
|
||||
},
|
||||
{}, // GGML_OP_SQR
|
||||
{
|
||||
// GGML_OP_SQRT
|
||||
QNN_OP_ELEMENT_WISE_SQUARE_ROOT, // qnn_op_name
|
||||
1, // input_param_count
|
||||
element_wise_op_dims, // calc_dims_func
|
||||
},
|
||||
{
|
||||
// GGML_OP_LOG
|
||||
QNN_OP_ELEMENT_WISE_LOG, // qnn_op_name
|
||||
1, // input_param_count
|
||||
element_wise_op_dims, // calc_dims_func
|
||||
},
|
||||
{}, // GGML_OP_SIN
|
||||
{}, // GGML_OP_COS
|
||||
|
|
@ -86,17 +121,14 @@ constexpr const qnn_op_caps_t kOpCaps[] = {
|
|||
{
|
||||
// GGML_OP_RMS_NORM
|
||||
QNN_OP_RMS_NORM, // qnn_op_name
|
||||
1, // input_param_count
|
||||
nullptr, // TODO: calc_dims_func
|
||||
generic_get_op_desc, // get_desc
|
||||
QNN_OP_RMS_NORM_PARAM_EPSILON, // qnn_param_name
|
||||
},
|
||||
{}, // GGML_OP_RMS_NORM_BACK
|
||||
{}, // GGML_OP_GROUP_NORM
|
||||
{
|
||||
// GGML_OP_MUL_MAT
|
||||
QNN_OP_MAT_MUL, // qnn_op_name
|
||||
2, // input_param_count
|
||||
mat_mul_op_dims, // calc_dims_func
|
||||
QNN_OP_MAT_MUL, // qnn_op_name
|
||||
},
|
||||
{}, // GGML_OP_MUL_MAT_ID
|
||||
{}, // GGML_OP_OUT_PROD
|
||||
|
|
@ -107,8 +139,6 @@ constexpr const qnn_op_caps_t kOpCaps[] = {
|
|||
{
|
||||
// GGML_OP_RESHAPE
|
||||
QNN_OP_RESHAPE, // qnn_op_name
|
||||
1, // input_param_count
|
||||
nullptr, // TODO: calc_dims_func
|
||||
},
|
||||
{}, // GGML_OP_VIEW
|
||||
{}, // GGML_OP_PERMUTE
|
||||
|
|
@ -179,8 +209,6 @@ constexpr const qnn_op_caps_t kOpCaps[] = {
|
|||
{
|
||||
// GGML_UNARY_OP_GELU
|
||||
QNN_OP_GELU, // qnn_op_name
|
||||
1, // input_param_count
|
||||
nullptr, // TODO: calc_dims_func
|
||||
},
|
||||
{}, // GGML_UNARY_OP_GELU_QUICK
|
||||
{}, // GGML_UNARY_OP_SILU
|
||||
|
|
@ -189,15 +217,11 @@ constexpr const qnn_op_caps_t kOpCaps[] = {
|
|||
{}, // GGML_UNARY_OP_EXP
|
||||
};
|
||||
|
||||
static_assert(kOpCaps[GGML_OP_NONE].calc_dims_func == nullptr, "GGML_OP_NONE should not have calc_dims_func function");
|
||||
static_assert(kOpCaps[GGML_OP_ADD].calc_dims_func == element_wise_op_dims,
|
||||
"GGML_OP_ADD does not have element_wise_op_dims function");
|
||||
static_assert(kOpCaps[GGML_OP_MUL_MAT].calc_dims_func == mat_mul_op_dims,
|
||||
"GGML_OP_ADD does not have element_wise_op_dims function");
|
||||
static_assert(kOpCaps[GGML_OP_LOG].calc_dims_func == element_wise_op_dims,
|
||||
"GGML_OP_LOG does not have element_wise_op_dims function");
|
||||
static_assert(kOpCaps[GGML_OP_COUNT + GGML_UNARY_OP_GELU].input_param_count == 1,
|
||||
"GGML_UNARY_OP_GELU does not have 1 input parameter");
|
||||
static_assert(kOpCaps[GGML_OP_NONE].get_desc == nullptr, "GGML_OP_NONE should not have get_desc function");
|
||||
static_assert(kOpCaps[GGML_OP_ADD].qnn_op_name, "GGML_OP_ADD does not have qnn_op_name in the kOpCaps table");
|
||||
static_assert(kOpCaps[GGML_OP_MUL_MAT].qnn_op_name, "GGML_OP_MUL_MAT does not have qnn_op_name in the kOpCaps table");
|
||||
static_assert(kOpCaps[GGML_OP_MUL].qnn_op_name, "GGML_OP_MUL does not have qnn_op_name in the kOpCaps table");
|
||||
static_assert(kOpCaps[GGML_OP_LOG].qnn_op_name, "GGML_OP_LOG does not have qnn_op_name in the kOpCaps table");
|
||||
static_assert(std::size(kOpCaps) == (GGML_OP_COUNT + GGML_UNARY_OP_COUNT),
|
||||
"GGML_OP_COUNT does not match the size of the kOpCaps table");
|
||||
|
||||
|
|
@ -368,6 +392,10 @@ static_assert(std::size(kOpConstructors) == (GGML_OP_COUNT + GGML_UNARY_OP_COUNT
|
|||
|
||||
namespace qnn {
|
||||
|
||||
void append_tensor_shape_and_type(const ggml_tensor * tensor, std::string & output) {
|
||||
append_tensor_shape_and_type_impl(tensor, GGML_TYPE_COUNT, output);
|
||||
}
|
||||
|
||||
size_t get_qnn_op_index(const ggml_tensor * tensor) {
|
||||
if (tensor->op == GGML_OP_UNARY) {
|
||||
return kGgmlUnaryOpStart + ggml_get_unary_op(tensor);
|
||||
|
|
@ -383,14 +411,20 @@ const char * get_qnn_op_name(const ggml_tensor * op) {
|
|||
return kOpCaps[op_index].qnn_op_name;
|
||||
}
|
||||
|
||||
size_t get_qnn_op_input_param_count(const ggml_tensor * op) {
void get_qnn_op_desc(const ggml_tensor * op, bool append_dimensions, ggml_type override_data_type,
                     std::string & output) {
    auto op_index = get_qnn_op_index(op);
    GGML_ASSERT(op_index < std::size(kOpCaps));
    return kOpCaps[op_index].input_param_count;
    auto get_desc = kOpCaps[op_index].get_desc;
    if (get_desc) {
        get_desc(op, append_dimensions, override_data_type, output);
    } else {
        generic_get_op_desc(op, append_dimensions, override_data_type, output);
    }
}
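
// Usage sketch (assumed caller, not introduced by this patch): build one key
// fragment for a node with dimensions appended and no data-type override, as the
// debug path in graph.cpp does.
//
//   std::string desc;
//   qnn::get_qnn_op_desc(node, /*append_dimensions=*/true, GGML_TYPE_COUNT, desc);
//   QNN_LOG_DEBUG("op desc: %s\n", desc.c_str());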
|
||||
|
||||
std::shared_ptr<ggml_qnn_op_config> create_op(const ggml_tensor * op, const std::string & name,
|
||||
std::shared_ptr<qnn_instance> qnn_instance) {
|
||||
qnn_instance_ptr qnn_instance) {
|
||||
auto op_index = get_qnn_op_index(op);
|
||||
GGML_ASSERT(op_index < std::size(kOpCaps));
|
||||
auto op_constructor = kOpConstructors[op_index];
|
||||
|
|
|
|||
|
|
@ -84,12 +84,12 @@ void ggml_qnn_op_config_base::set_input_tensors(qnn::qnn_tensor_array_t & tensor
|
|||
}
|
||||
|
||||
void ggml_qnn_op_config_base::set_input_tensors(qnn::qnn_tensor_array_t && tensor_inputs) {
|
||||
_tensor_inputs = tensor_inputs;
|
||||
_tensor_inputs = std::move(tensor_inputs);
|
||||
_qnn_tensor_inputs.resize(_tensor_inputs.size());
|
||||
}
|
||||
|
||||
void ggml_qnn_op_config_base::set_output_tensors(qnn::qnn_tensor_array_t & tensor_outputs) {
|
||||
_tensor_outputs = std::move(tensor_outputs);
|
||||
_tensor_outputs = tensor_outputs;
|
||||
_qnn_tensor_outputs.resize(_tensor_outputs.size());
|
||||
}
|
||||
|
||||
|
|
@ -99,10 +99,11 @@ void ggml_qnn_op_config_base::set_output_tensors(qnn::qnn_tensor_array_t && tens
|
|||
}
|
||||
|
||||
bool ggml_qnn_op_config_base::add_op_to_graph(Qnn_GraphHandle_t graph_handle) {
|
||||
QNN_LOG_DEBUG("[%s]add to graph start\n", _name.c_str());
|
||||
|
||||
GGML_ASSERT(_qnn_tensor_inputs.size() == _tensor_inputs.size());
|
||||
GGML_ASSERT(_qnn_tensor_outputs.size() == _tensor_outputs.size());
|
||||
|
||||
QNN_LOG_DEBUG("[%s]add to graph start\n", _name.c_str());
|
||||
for (size_t i = 0; i < _tensor_inputs.size(); i++) {
|
||||
auto tensor = _tensor_inputs[i];
|
||||
if (!tensor->alloc_qnn_tensor_id()) {
|
||||
|
|
@ -110,7 +111,8 @@ bool ggml_qnn_op_config_base::add_op_to_graph(Qnn_GraphHandle_t graph_handle) {
|
|||
return false;
|
||||
}
|
||||
|
||||
QNN_LOG_DEBUG("[%s]input tensor id: %d\n", _name.c_str(), tensor->get_qnn_tensor_id());
|
||||
QNN_LOG_DEBUG("[%s]input tensor(%s), id(%d)\n", _name.c_str(), tensor->get_tensor_name().c_str(),
|
||||
tensor->get_qnn_tensor_id());
|
||||
_qnn_tensor_inputs[i] = tensor->get_qnn_tensor();
|
||||
}
|
||||
|
||||
|
|
@ -121,7 +123,8 @@ bool ggml_qnn_op_config_base::add_op_to_graph(Qnn_GraphHandle_t graph_handle) {
|
|||
return false;
|
||||
}
|
||||
|
||||
QNN_LOG_DEBUG("[%s]output tensor id: %d\n", _name.c_str(), tensor->get_qnn_tensor_id());
|
||||
QNN_LOG_DEBUG("[%s]output tensor(%s), id(%d)\n", _name.c_str(), tensor->get_tensor_name().c_str(),
|
||||
tensor->get_qnn_tensor_id());
|
||||
_qnn_tensor_outputs[i] = tensor->get_qnn_tensor();
|
||||
}
|
||||
|
||||
|
|
@ -222,18 +225,30 @@ bool ggml_qnn_matmul_op_config::initialize_op_nodes(QNNBackend device, Qnn_Graph
|
|||
GGML_ASSERT(_tensor_outputs.size() == 1);
|
||||
|
||||
// create convert nodes
|
||||
const auto tensor_rank = _tensor_inputs.front()->get_rank();
|
||||
qnn_tensor_array_t mat_mul_tensor_inputs = _tensor_inputs;
|
||||
qnn_tensor_array_t mat_mul_tensor_outputs = _tensor_outputs;
|
||||
if (!create_convert_nodes(device, graph_handle, tensor_rank, mat_mul_tensor_inputs, mat_mul_tensor_outputs)) {
|
||||
QNN_LOG_ERROR("create convert nodes failed\n");
|
||||
return false;
|
||||
}
|
||||
const auto tensor_rank = _tensor_inputs.front()->get_rank();
|
||||
qnn_tensor_array_t mat_mul_tensor_inputs = _tensor_inputs;
|
||||
auto tensor_type = create_input_convert_nodes(device, graph_handle, tensor_rank, mat_mul_tensor_inputs);
|
||||
|
||||
mat_mul_tensor_inputs.front() =
|
||||
create_gather_nodes(device, graph_handle, tensor_rank, mat_mul_tensor_inputs.front(),
|
||||
mat_mul_tensor_inputs.back()->get_dimensions());
|
||||
return create_mat_mul_nodes(mat_mul_tensor_inputs, mat_mul_tensor_outputs);
|
||||
|
||||
if (device != QNN_BACKEND_GPU && _tensor_outputs.front()->get_data_type() != tensor_type) {
|
||||
auto convert_out = create_output_convert_nodes(device, graph_handle, tensor_rank, tensor_type, _tensor_outputs);
|
||||
if (!create_mat_mul_nodes(mat_mul_tensor_inputs, convert_out->get_input_tensors())) {
|
||||
QNN_LOG_ERROR("create mat_mul nodes failed\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
_operations.push_back(convert_out);
|
||||
} else {
|
||||
if (!create_mat_mul_nodes(mat_mul_tensor_inputs, _tensor_outputs)) {
|
||||
QNN_LOG_ERROR("create mat_mul nodes failed\n");
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
qnn_tensor_ptr_t ggml_qnn_matmul_op_config::create_gather_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle,
|
||||
|
|
@ -256,7 +271,7 @@ qnn_tensor_ptr_t ggml_qnn_matmul_op_config::create_gather_nodes(QNNBackend devic
|
|||
constexpr const auto create_node =
|
||||
[](const std::string & name, const int rank, const int axis, const qnn_dimension_array_t & dimensions,
|
||||
qnn_tensor_ptr_t tensor_input, QNNBackend device, Qnn_GraphHandle_t graph_handle,
|
||||
std::shared_ptr<qnn_instance> qnn_instance, qnn_tensor_ptr_t & tensor_output) -> qnn_op_config_ptr_t {
|
||||
qnn_instance_ptr qnn_instance, qnn_tensor_ptr_t & tensor_output) -> qnn_op_config_ptr_t {
|
||||
auto gather_out =
|
||||
std::make_shared<ggml_qnn_tensor>(ggml_qnn_tensor::INTERMEDIATE, name + "_out", dimensions,
|
||||
tensor_input->get_data_type(), rank, device, graph_handle, qnn_instance);
|
||||
|
|
@ -303,18 +318,16 @@ qnn_tensor_ptr_t ggml_qnn_matmul_op_config::create_gather_nodes(QNNBackend devic
|
|||
return gather1_out;
|
||||
}
|
||||
|
||||
bool ggml_qnn_matmul_op_config::create_convert_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, const int rank,
|
||||
qnn_tensor_array_t & tensor_inputs,
|
||||
qnn_tensor_array_t & tensor_outputs) {
|
||||
Qnn_DataType_t ggml_qnn_matmul_op_config::create_input_convert_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle,
|
||||
const int rank,
|
||||
qnn_tensor_array_t & tensor_inputs) {
|
||||
if (device == QNN_BACKEND_GPU) {
|
||||
// there's no convert op for GPU, so we should create matmul nodes directly.
|
||||
return true;
|
||||
return QNN_DATATYPE_UNDEFINED;
|
||||
}
|
||||
|
||||
// create tensors for convert node
|
||||
auto tensor_type = get_tensor_type(tensor_inputs);
|
||||
QNN_LOG_DEBUG("input tensor type: %s\n", qnn_datatype_to_string(tensor_type));
|
||||
|
||||
for (size_t i = 0; i < tensor_inputs.size(); ++i) {
|
||||
// create input convert nodes
|
||||
auto convert_in = tensor_inputs[i];
|
||||
|
|
@ -327,29 +340,35 @@ bool ggml_qnn_matmul_op_config::create_convert_nodes(QNNBackend device, Qnn_Grap
|
|||
convert_in->get_dimensions(), tensor_type, rank, device,
|
||||
graph_handle, _qnn_instance);
|
||||
auto convert = std::make_shared<ggml_qnn_single_op_config>(convert_name, QNN_OP_PACKAGE_NAME_QTI_AISW,
|
||||
QNN_OP_CONVERT, _qnn_instance);
|
||||
QNN_OP_CAST, _qnn_instance);
|
||||
QNN_LOG_DEBUG("[%s][MUL_MAT]create: %s, type: %s\n", get_backend_name(device), convert_name.c_str(),
|
||||
qnn_datatype_to_string(tensor_type));
|
||||
convert->set_input_tensors({ convert_in });
|
||||
convert->set_output_tensors({ convert_out });
|
||||
tensor_inputs[i] = convert_out;
|
||||
_operations.push_back(convert);
|
||||
}
|
||||
|
||||
if (tensor_outputs.front()->get_data_type() != tensor_type) {
|
||||
// create output convert node
|
||||
std::string convert_name("convert_dst");
|
||||
auto convert_out = tensor_outputs.front();
|
||||
auto convert_in = std::make_shared<ggml_qnn_tensor>(ggml_qnn_tensor::INTERMEDIATE, convert_name + "_in",
|
||||
convert_out->get_dimensions(), tensor_type, rank, device,
|
||||
graph_handle, _qnn_instance);
|
||||
auto output_convert = std::make_shared<ggml_qnn_single_op_config>(convert_name, QNN_OP_PACKAGE_NAME_QTI_AISW,
|
||||
QNN_OP_CONVERT, _qnn_instance);
|
||||
output_convert->set_input_tensors({ convert_in });
|
||||
output_convert->set_output_tensors({ convert_out });
|
||||
tensor_outputs.front() = convert_in;
|
||||
_operations.push_back(output_convert);
|
||||
}
|
||||
return tensor_type;
|
||||
}
|
||||
|
||||
    return true;

qnn_op_config_ptr_t ggml_qnn_matmul_op_config::create_output_convert_nodes(QNNBackend device,
                                                                            Qnn_GraphHandle_t graph_handle,
                                                                            const int rank, Qnn_DataType_t tensor_type,
                                                                            qnn_tensor_array_t & tensor_outputs) {
    GGML_ASSERT(tensor_outputs.size() == 1);
    // create output convert node
    std::string convert_name("convert_dst");
    auto convert_in = std::make_shared<ggml_qnn_tensor>(ggml_qnn_tensor::INTERMEDIATE, convert_name + "_in",
                                                        tensor_outputs.front()->get_dimensions(), tensor_type, rank,
                                                        device, graph_handle, _qnn_instance);
    auto output_convert = std::make_shared<ggml_qnn_single_op_config>(convert_name, QNN_OP_PACKAGE_NAME_QTI_AISW,
                                                                      QNN_OP_CAST, _qnn_instance);
    QNN_LOG_DEBUG("[%s][MUL_MAT]create: %s, type: %s\n", get_backend_name(device), convert_name.c_str(),
                  qnn_datatype_to_string(tensor_type));
    output_convert->set_input_tensors({ convert_in });
    output_convert->set_output_tensors(tensor_outputs);
    return output_convert;
}
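
// How initialize_op_nodes() above wires this together (sketch of the call site in
// this file): the CAST node's input tensor becomes MAT_MUL's output, giving the chain
//   src0/src1 -> [CAST to tensor_type] -> MAT_MUL -> [CAST back to dst type] -> dst
//
//   auto convert_out = create_output_convert_nodes(device, graph_handle, tensor_rank,
//                                                  tensor_type, _tensor_outputs);
//   if (!create_mat_mul_nodes(mat_mul_tensor_inputs, convert_out->get_input_tensors())) {
//       return false;
//   }
//   _operations.push_back(convert_out);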
|
||||
|
||||
bool ggml_qnn_matmul_op_config::create_mat_mul_nodes(qnn_tensor_array_t & tensor_inputs,
|
||||
|
|
@ -413,8 +432,7 @@ bool ggml_qnn_matmul_op_config::create_mat_mul_nodes(qnn_tensor_array_t & tensor
|
|||
mat_mul->add_scalar_param(QNN_OP_MAT_MUL_PARAM_TRANSPOSE_IN1, scalar);
|
||||
|
||||
// set tensor to mat_mul
|
||||
std::swap(tensor_inputs[0], tensor_inputs[1]);
|
||||
mat_mul->set_input_tensors(tensor_inputs);
|
||||
mat_mul->set_input_tensors({ tensor_inputs[1], tensor_inputs[0] });
|
||||
mat_mul->set_output_tensors(tensor_outputs);
|
||||
|
||||
_operations.push_back(mat_mul);
|
||||
|
|
|
|||
|
|
@ -15,7 +15,7 @@ namespace qnn {
|
|||
class ggml_qnn_op_config_base : public ggml_qnn_op_config {
|
||||
public:
|
||||
explicit ggml_qnn_op_config_base(const std::string & name, const std::string & package_name,
|
||||
const std::string & op_type, std::shared_ptr<qnn_instance> qnn_instance) :
|
||||
const std::string & op_type, qnn_instance_ptr qnn_instance) :
|
||||
_name(name),
|
||||
_package_name(package_name),
|
||||
_op_type(op_type),
|
||||
|
|
@ -36,24 +36,24 @@ class ggml_qnn_op_config_base : public ggml_qnn_op_config {
|
|||
void unbind_input_tensors() override;
|
||||
void unbind_output_tensors() override;
|
||||
|
||||
const qnn_tensor_array_t & get_input_tensors() override { return _tensor_inputs; }
|
||||
qnn_tensor_array_t & get_input_tensors() override { return _tensor_inputs; }
|
||||
|
||||
const qnn_tensor_array_t & get_output_tensors() override { return _tensor_outputs; }
|
||||
qnn_tensor_array_t & get_output_tensors() override { return _tensor_outputs; }
|
||||
|
||||
protected:
|
||||
Qnn_OpConfig_t get_op_config();
|
||||
|
||||
std::string _name;
|
||||
std::string _package_name;
|
||||
std::string _op_type;
|
||||
std::shared_ptr<qnn_instance> _qnn_instance;
|
||||
qnn_tensor_array_t _tensor_inputs;
|
||||
qnn_tensor_array_t _tensor_outputs;
|
||||
qnn_tensor_array_t _tensor_parameters;
|
||||
std::vector<Qnn_Tensor_t> _qnn_tensor_inputs;
|
||||
std::vector<Qnn_Tensor_t> _qnn_tensor_outputs;
|
||||
std::vector<Qnn_Param_t> _qnn_parameters;
|
||||
std::vector<std::string> _param_names;
|
||||
std::string _name;
|
||||
std::string _package_name;
|
||||
std::string _op_type;
|
||||
qnn_instance_ptr _qnn_instance;
|
||||
qnn_tensor_array_t _tensor_inputs;
|
||||
qnn_tensor_array_t _tensor_outputs;
|
||||
qnn_tensor_array_t _tensor_parameters;
|
||||
std::vector<Qnn_Tensor_t> _qnn_tensor_inputs;
|
||||
std::vector<Qnn_Tensor_t> _qnn_tensor_outputs;
|
||||
std::vector<Qnn_Param_t> _qnn_parameters;
|
||||
std::vector<std::string> _param_names;
|
||||
|
||||
DISABLE_COPY(ggml_qnn_op_config_base);
|
||||
DISABLE_MOVE(ggml_qnn_op_config_base);
|
||||
|
|
@ -62,7 +62,7 @@ class ggml_qnn_op_config_base : public ggml_qnn_op_config {
|
|||
class ggml_qnn_single_op_config : public ggml_qnn_op_config_base {
|
||||
public:
|
||||
explicit ggml_qnn_single_op_config(const std::string & name, const std::string & package_name,
|
||||
const std::string & op_type, std::shared_ptr<qnn_instance> qnn_instance) :
|
||||
const std::string & op_type, qnn_instance_ptr qnn_instance) :
|
||||
ggml_qnn_op_config_base(name, package_name, op_type, qnn_instance) {}
|
||||
|
||||
bool initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle) override;
|
||||
|
|
@ -75,7 +75,7 @@ class ggml_qnn_single_op_config : public ggml_qnn_op_config_base {
|
|||
class ggml_qnn_rmsnorm_op_config : public ggml_qnn_op_config_base {
|
||||
public:
|
||||
explicit ggml_qnn_rmsnorm_op_config(const std::string & name, const std::string & package_name,
|
||||
const std::string & op_type, std::shared_ptr<qnn_instance> qnn_instance) :
|
||||
const std::string & op_type, qnn_instance_ptr qnn_instance) :
|
||||
ggml_qnn_op_config_base(name, package_name, op_type, qnn_instance) {}
|
||||
|
||||
bool initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle) override;
|
||||
|
|
@ -87,7 +87,7 @@ class ggml_qnn_rmsnorm_op_config : public ggml_qnn_op_config_base {
|
|||
|
||||
class ggml_qnn_aggregate_op_config : public ggml_qnn_op_config {
|
||||
public:
|
||||
explicit ggml_qnn_aggregate_op_config(const std::string & name, std::shared_ptr<qnn_instance> qnn_instance) :
|
||||
explicit ggml_qnn_aggregate_op_config(const std::string & name, qnn_instance_ptr qnn_instance) :
|
||||
_name(name),
|
||||
_qnn_instance(qnn_instance) {}
|
||||
|
||||
|
|
@ -121,13 +121,13 @@ class ggml_qnn_aggregate_op_config : public ggml_qnn_op_config {
|
|||
}
|
||||
}
|
||||
|
||||
const qnn_tensor_array_t & get_input_tensors() override { return _tensor_inputs; }
|
||||
qnn_tensor_array_t & get_input_tensors() override { return _tensor_inputs; }
|
||||
|
||||
const qnn_tensor_array_t & get_output_tensors() override { return _tensor_outputs; }
|
||||
qnn_tensor_array_t & get_output_tensors() override { return _tensor_outputs; }
|
||||
|
||||
protected:
|
||||
std::string _name;
|
||||
std::shared_ptr<qnn_instance> _qnn_instance;
|
||||
std::string _name;
|
||||
qnn_instance_ptr _qnn_instance;
|
||||
|
||||
std::vector<qnn_op_config_ptr_t> _operations;
|
||||
qnn_tensor_array_t _tensor_inputs;
|
||||
|
|
@ -140,17 +140,19 @@ class ggml_qnn_aggregate_op_config : public ggml_qnn_op_config {
|
|||
|
||||
class ggml_qnn_matmul_op_config : public ggml_qnn_aggregate_op_config {
|
||||
public:
|
||||
ggml_qnn_matmul_op_config(const std::string & name, std::shared_ptr<qnn_instance> qnn_instance) :
|
||||
ggml_qnn_matmul_op_config(const std::string & name, qnn_instance_ptr qnn_instance) :
|
||||
ggml_qnn_aggregate_op_config(name, qnn_instance) {}
|
||||
|
||||
bool initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle) override;
|
||||
|
||||
private:
|
||||
qnn_tensor_ptr_t create_gather_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, const int rank,
|
||||
qnn_tensor_ptr_t tensor_input, qnn_dimension_array_t output_dimensions);
|
||||
bool create_convert_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, const int rank,
|
||||
qnn_tensor_array_t & tensor_inputs, qnn_tensor_array_t & tensor_outputs);
|
||||
bool create_mat_mul_nodes(qnn_tensor_array_t & tensor_inputs, qnn_tensor_array_t & tensor_outputs);
|
||||
qnn_tensor_ptr_t create_gather_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, const int rank,
|
||||
qnn_tensor_ptr_t tensor_input, qnn_dimension_array_t output_dimensions);
|
||||
Qnn_DataType_t create_input_convert_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, const int rank,
|
||||
qnn_tensor_array_t & tensor_inputs);
|
||||
qnn_op_config_ptr_t create_output_convert_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, const int rank,
|
||||
Qnn_DataType_t tensor_type, qnn_tensor_array_t & tensor_outputs);
|
||||
bool create_mat_mul_nodes(qnn_tensor_array_t & tensor_inputs, qnn_tensor_array_t & tensor_outputs);
|
||||
|
||||
DISABLE_COPY(ggml_qnn_matmul_op_config);
|
||||
DISABLE_MOVE(ggml_qnn_matmul_op_config);
|
||||
|
|
|
|||
|
|
@ -14,11 +14,16 @@ namespace qnn {
|
|||
|
||||
constexpr const size_t kGgmlUnaryOpStart = GGML_OP_COUNT;
|
||||
|
||||
size_t get_qnn_op_index(const ggml_tensor * tensor);
|
||||
const char * get_qnn_op_name(const ggml_tensor * op);
|
||||
size_t get_qnn_op_input_param_count(const ggml_tensor * op);
|
||||
// TODO: move to a better place
|
||||
void append_tensor_shape_and_type(const ggml_tensor * tensor, std::string & output);
|
||||
|
||||
size_t get_qnn_op_index(const ggml_tensor * tensor);
|
||||
const char * get_qnn_op_name(const ggml_tensor * op);
|
||||
void get_qnn_op_desc(const ggml_tensor * op, bool append_dimensions, ggml_type override_data_type,
|
||||
std::string & output);
|
||||
|
||||
std::shared_ptr<ggml_qnn_op_config> create_op(const ggml_tensor * op, const std::string & name,
|
||||
std::shared_ptr<qnn_instance> qnn_instance);
|
||||
qnn_instance_ptr qnn_instance);
|
||||
|
||||
inline bool add_op_to_graph(Qnn_GraphHandle_t graph_handle, std::vector<qnn_op_config_ptr_t> & operations) {
|
||||
for (auto & op : operations) {
|
||||
|
|
|
|||
|
|
@ -0,0 +1,170 @@
|
|||
|
||||
#include "profiler.hpp"
|
||||
|
||||
#include <HTP/QnnHtpProfile.h>
|
||||
#include <QnnProfile.h>
|
||||
|
||||
#include "logger.hpp"
|
||||
#include "qnn-lib.hpp"
|
||||
|
||||
namespace {
|
||||
|
||||
std::string get_duration_string(const QnnProfile_EventData_t & event_data) {
|
||||
char time_str[128] = {};
|
||||
switch (event_data.unit) {
|
||||
case QNN_PROFILE_EVENTUNIT_CYCLES:
|
||||
snprintf(time_str, sizeof(time_str), "cycles: %lld", (long long int) event_data.value);
|
||||
break;
|
||||
case QNN_PROFILE_EVENTUNIT_COUNT:
|
||||
snprintf(time_str, sizeof(time_str), "count: %lld", (long long int) event_data.value);
|
||||
break;
|
||||
case QNN_PROFILE_EVENTUNIT_BYTES:
|
||||
snprintf(time_str, sizeof(time_str), "size: %lld bytes", (long long int) event_data.value);
|
||||
break;
|
||||
case QNN_PROFILE_EVENTUNIT_MICROSEC:
|
||||
{
|
||||
double duration_ms = event_data.value / 1000.0;
|
||||
snprintf(time_str, sizeof(time_str), "duration: %.3f ms", duration_ms);
|
||||
}
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
return time_str;
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
namespace qnn {
|
||||
|
||||
qnn_event_tracer::qnn_event_tracer(const std::string & prefix, std::shared_ptr<qnn_interface> interface,
|
||||
Qnn_BackendHandle_t backend_handle, sdk_profile_level level) :
|
||||
_interface(interface),
|
||||
_prefix(prefix) {
|
||||
QnnProfile_Level_t qnn_profile_level = 0;
|
||||
switch (level) {
|
||||
case sdk_profile_level::PROFILE_BASIC:
|
||||
qnn_profile_level = QNN_PROFILE_LEVEL_BASIC;
|
||||
break;
|
||||
case sdk_profile_level::PROFILE_OP_TRACE:
|
||||
case sdk_profile_level::PROFILE_DETAIL:
|
||||
qnn_profile_level = QNN_PROFILE_LEVEL_DETAILED;
|
||||
break;
|
||||
case sdk_profile_level::PROFILE_OFF:
|
||||
default:
|
||||
QNN_LOG_WARN("[profiler][%s]invalid profile level %d, using PROFILE_OFF\n", _prefix.c_str(), level);
|
||||
return;
|
||||
}
|
||||
|
||||
auto error = _interface->qnn_profile_create(backend_handle, qnn_profile_level, &_handle);
|
||||
if (error != QNN_SUCCESS) {
|
||||
QNN_LOG_ERROR("[profiler][%s]failed to create QNN profile_handle. Backend ID %u, error %ld\n", _prefix.c_str(),
|
||||
_interface->get_backend_id(), (long) QNN_GET_ERROR_CODE(error));
|
||||
_handle = nullptr;
|
||||
return;
|
||||
}
|
||||
|
||||
if (level == sdk_profile_level::PROFILE_OP_TRACE) {
|
||||
QnnProfile_Config_t qnn_profile_config = QNN_PROFILE_CONFIG_INIT;
|
||||
qnn_profile_config.option = QNN_PROFILE_CONFIG_OPTION_ENABLE_OPTRACE;
|
||||
std::array<const QnnProfile_Config_t *, 2> profile_configs = { &qnn_profile_config, nullptr };
|
||||
error = _interface->qnn_profile_set_config(_handle, profile_configs.data());
|
||||
if (error != QNN_SUCCESS) {
|
||||
QNN_LOG_ERROR("[profiler][%s]failed to set QNN profile event. Backend ID %u, error %ld\n", _prefix.c_str(),
|
||||
_interface->get_backend_id(), (long) QNN_GET_ERROR_CODE(error));
|
||||
_interface->qnn_profile_free(_handle);
|
||||
_handle = nullptr;
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
QNN_LOG_DEBUG("[profiler][%s]created, Backend ID %u, level %d\n", _prefix.c_str(), _interface->get_backend_id(),
|
||||
level);
|
||||
}
|
||||
|
||||
qnn_event_tracer::~qnn_event_tracer() {
|
||||
if (_handle) {
|
||||
Qnn_ErrorHandle_t error = _interface->qnn_profile_free(_handle);
|
||||
if (error != QNN_SUCCESS) {
|
||||
QNN_LOG_ERROR("[profiler][%s]failed to free QNN profile_handle. Backend ID %u, error %ld\n",
|
||||
_prefix.c_str(), _interface->get_backend_id(), (long) QNN_GET_ERROR_CODE(error));
|
||||
}
|
||||
_handle = nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
void qnn_event_tracer::print_profile_events() {
|
||||
const QnnProfile_EventId_t * events_ptr = nullptr;
|
||||
uint32_t num_events = 0;
|
||||
auto error = _interface->qnn_profile_get_events(_handle, &events_ptr, &num_events);
|
||||
if (error != QNN_SUCCESS) {
|
||||
QNN_LOG_ERROR("[profiler][%s]failed to get QNN profile events. Backend ID %u, error %ld\n", _prefix.c_str(),
|
||||
_interface->get_backend_id(), (long) QNN_GET_ERROR_CODE(error));
|
||||
return;
|
||||
}
|
||||
|
||||
if (!num_events) {
|
||||
QNN_LOG_INFO("[profiler][%s]no QNN profile events\n", _prefix.c_str());
|
||||
return;
|
||||
}
|
||||
|
||||
QNN_LOG_INFO("[profiler][%s]print_profile_events start ----------------\n", _prefix.c_str());
|
||||
// see also: https://github.com/pytorch/executorch/blob/0ccf5093823761cf8ad98c75e5fe81f15ea42366/backends/qualcomm/runtime/backends/QnnProfiler.cpp#L73
|
||||
QnnProfile_EventData_t event_data;
|
||||
for (uint32_t i = 0; i < num_events; ++i) {
|
||||
error = _interface->qnn_profile_get_event_data(events_ptr[i], &event_data);
|
||||
if (error != QNN_SUCCESS) {
|
||||
QNN_LOG_ERROR("[profiler][%s]failed to get QNN profile event data. Backend ID %u, event[%d], error: %ld\n",
|
||||
_prefix.c_str(), _interface->get_backend_id(), i, (long) QNN_GET_ERROR_CODE(error));
|
||||
continue;
|
||||
}
|
||||
|
||||
const QnnProfile_EventId_t * sub_events_ptr = nullptr;
|
||||
uint32_t num_sub_events = 0;
|
||||
error = _interface->qnn_profile_get_sub_events(events_ptr[i], &sub_events_ptr, &num_sub_events);
|
||||
if (error != QNN_SUCCESS) {
|
||||
QNN_LOG_ERROR("[profiler][%s]failed to get QNN profile sub events. Backend ID %u, event[%d], error: %ld\n",
|
||||
_prefix.c_str(), _interface->get_backend_id(), i, (long) QNN_GET_ERROR_CODE(error));
|
||||
continue;
|
||||
}
|
||||
|
||||
auto duration = get_duration_string(event_data);
|
||||
if (!num_sub_events) {
|
||||
QNN_LOG_INFO("[profiler][%s]event[%d]: %s, %s\n", _prefix.c_str(), i, event_data.identifier,
|
||||
duration.c_str());
|
||||
continue;
|
||||
}
|
||||
|
||||
QNN_LOG_INFO("[profiler][%s]event[%d]: %s, sub_count: %d, start -------------\n", _prefix.c_str(), i,
|
||||
event_data.identifier, num_sub_events);
|
||||
QnnProfile_EventData_t sub_event_data;
|
||||
for (std::uint32_t j = 0; j < num_sub_events; ++j) {
|
||||
error = _interface->qnn_profile_get_event_data(sub_events_ptr[j], &sub_event_data);
|
||||
if (error != QNN_SUCCESS) {
|
||||
QNN_LOG_ERROR(
|
||||
"[profiler][%s]failed to get QNN profile sub event data. Backend ID %u, event[%d], sub_event[%d], "
|
||||
"error: %ld\n",
|
||||
_prefix.c_str(), _interface->get_backend_id(), i, j, (long) QNN_GET_ERROR_CODE(error));
|
||||
continue;
|
||||
}
|
||||
|
||||
if (sub_event_data.type != QNN_PROFILE_EVENTTYPE_NODE) {
|
||||
QNN_LOG_DEBUG("[profiler][%s]sub_event[%d]%s, type %d, skipping\n", _prefix.c_str(), j,
|
||||
sub_event_data.identifier, sub_event_data.type);
|
||||
continue;
|
||||
}
|
||||
|
||||
auto sub_duration = get_duration_string(sub_event_data);
|
||||
QNN_LOG_INFO("[profiler][%s]sub_event[%d]: %s, %s\n", _prefix.c_str(), j, sub_event_data.identifier,
|
||||
sub_duration.c_str());
|
||||
}
|
||||
|
||||
QNN_LOG_INFO("[profiler][%s]event[%d]: %s, %s, end --------------\n", _prefix.c_str(), i, event_data.identifier,
|
||||
duration.c_str());
|
||||
}
|
||||
|
||||
QNN_LOG_INFO("[profiler][%s]print_profile_events end -----------------\n", _prefix.c_str());
|
||||
}
|
||||
|
||||
} // namespace qnn
|
||||
|
|
@ -0,0 +1,100 @@
|
|||
#pragma once
|
||||
|
||||
#include <QnnCommon.h>
|
||||
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
|
||||
#include "logger.hpp"
|
||||
#include "qnn-types.hpp"
|
||||
|
||||
namespace qnn {
|
||||
|
||||
#ifdef GGML_QNN_ENABLE_PERFORMANCE_TRACKING
|
||||
|
||||
class qnn_scoped_timer {
|
||||
public:
|
||||
qnn_scoped_timer(const std::string & log_prefix) : _log_prefix(std::move(log_prefix)) {
|
||||
_begin_us = ggml_time_us();
|
||||
}
|
||||
|
||||
qnn_scoped_timer(qnn_scoped_timer && other) {
|
||||
_begin_us = other._begin_us;
|
||||
_log_prefix = std::move(other._log_prefix);
|
||||
}
|
||||
|
||||
~qnn_scoped_timer() { print(); }
|
||||
|
||||
void operator=(qnn_scoped_timer && other) {
|
||||
_begin_us = other._begin_us;
|
||||
_log_prefix = std::move(other._log_prefix);
|
||||
}
|
||||
|
||||
void print() const {
|
||||
auto duration = (ggml_time_us() - _begin_us) / 1000.0;
|
||||
QNN_LOG_INFO("[profiler]%s, duration: %.4f ms\n", _log_prefix.c_str(), duration);
|
||||
}
|
||||
|
||||
|
||||
private:
|
||||
int64_t _begin_us = 0LL;
|
||||
std::string _log_prefix;
|
||||
|
||||
qnn_scoped_timer(const qnn_scoped_timer &) = delete;
|
||||
void operator=(const qnn_scoped_timer &) = delete;
|
||||
};
|
||||
|
||||
inline qnn_scoped_timer make_scope_perf_timer(const char * format, ...) {
|
||||
va_list args;
|
||||
va_start(args, format);
|
||||
char buffer[4096];
|
||||
vsnprintf(buffer, sizeof(buffer), format, args);
|
||||
va_end(args);
|
||||
return qnn_scoped_timer(buffer);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
inline void make_scope_perf_timer(const char *, ...) {}
|
||||
|
||||
#endif
|
||||
|
||||
// forward declaration of qnn_interface
|
||||
class qnn_interface;
|
||||
|
||||
class qnn_event_tracer {
|
||||
public:
|
||||
// ref:
|
||||
// https://github.com/pytorch/executorch/blob/ae3d558d5e6aa04fc52a3065399fe6a773702f52/backends/qualcomm/serialization/qc_schema.py#L53
|
||||
// https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/overview.html#supported-snapdragon-devices
|
||||
enum sdk_profile_level { PROFILE_OFF = 0, PROFILE_BASIC, PROFILE_DETAIL, PROFILE_OP_TRACE };
|
||||
|
||||
explicit qnn_event_tracer(const std::string & prefix, std::shared_ptr<qnn_interface> interface,
|
||||
Qnn_BackendHandle_t backend_handle, sdk_profile_level level);
|
||||
~qnn_event_tracer();
|
||||
|
||||
Qnn_ProfileHandle_t get_handle() const { return _handle; }
|
||||
|
||||
void print_profile_events();
|
||||
|
||||
private:
|
||||
std::shared_ptr<qnn_interface> _interface;
|
||||
Qnn_ProfileHandle_t _handle = nullptr;
|
||||
std::string _prefix;
|
||||
|
||||
DISABLE_COPY(qnn_event_tracer);
|
||||
DISABLE_MOVE(qnn_event_tracer);
|
||||
};
|
||||
|
||||
using qnn_event_tracer_ptr = std::shared_ptr<qnn_event_tracer>;
|
||||
|
||||
} // namespace qnn
|
||||
|
||||
#ifdef GGML_QNN_ENABLE_PERFORMANCE_TRACKING
#    define QNN_SCOPED_PERFORMANCE_TRACKER(fmt, ...) \
        auto __qnn_timer_##__LINE__ = qnn::make_scope_perf_timer(fmt, __VA_ARGS__)
#else
#    define QNN_SCOPED_PERFORMANCE_TRACKER(fmt, ...) ((void) 0)
#endif
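
// Usage sketch: when GGML_QNN_ENABLE_PERFORMANCE_TRACKING is defined the macro
// creates a scoped timer that logs the elapsed time when the enclosing scope exits;
// otherwise it compiles to nothing. Mirrors the call sites in graph.cpp; `do_work`
// is a placeholder:
//
//   void do_work(const std::string & name) {
//       QNN_SCOPED_PERFORMANCE_TRACKER("[%s]do_work", name.c_str());
//       // ... the work measured here; its duration is printed via QNN_LOG_INFO on scope exit
//   }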
|
||||
|
|
@ -12,12 +12,51 @@ namespace {
|
|||
#ifdef _WIN32
|
||||
constexpr const char * kQnnSystemLibName = "QnnSystem.dll";
|
||||
constexpr const char * kQnnRpcLibName = "libcdsprpc.dll";
|
||||
constexpr const char * kQnnCpuLibName = "QnnCpu.dll";
|
||||
constexpr const char * kQnnGpuLibName = "QnnGpu.dll";
|
||||
constexpr const char * kQnnNpuLibName = "QnnHtp.dll";
|
||||
#else
|
||||
constexpr const char * kQnnSystemLibName = "libQnnSystem.so";
|
||||
constexpr const char * kQnnRpcLibName = "libcdsprpc.so";
|
||||
|
||||
constexpr const char * kQnnCpuLibName = "libQnnCpu.so";
|
||||
constexpr const char * kQnnGpuLibName = "libQnnGpu.so";
|
||||
constexpr const char * kQnnNpuLibName = "libQnnHtp.so";
|
||||
#endif
|
||||
|
||||
constexpr const qnn::device_caps kDeviceCaps[] = {
    {
        // https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/CpuOpDefSupplement.html#matmul
        kQnnCpuLibName, GGML_BACKEND_DEVICE_TYPE_ACCEL, (1L << GGML_TYPE_I8) | (1L << GGML_TYPE_F32),
        0xFFFFFE,  // all quantized types can be offload to CPU, at current implementation, those types will be dequantized into float32 on cpu
        0,         // 0 for no limitation
    },
    {
        // https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/GpuOpDefSupplement.html#matmul
        kQnnGpuLibName, GGML_BACKEND_DEVICE_TYPE_GPU, (1L << GGML_TYPE_F32) | (1L << GGML_TYPE_F16),
        // all quantized types can be offload to GPU, at current implementation, those types will be dequantized into float32 on cpu
        0xFFFFFE,
        (128256L * 4096 * sizeof(float)),  // tested on 8 gen 2, failed to allocate tensor with size 128256x4096 and float32
    },
    {
        // https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/HtpOpDefSupplement.html#matmul
        kQnnNpuLibName, GGML_BACKEND_DEVICE_TYPE_ACCEL,
        (1L << GGML_TYPE_F32) | (1L << GGML_TYPE_F16) | (1L << GGML_TYPE_I16),
        (1L << GGML_TYPE_Q2_K) | (1L << GGML_TYPE_Q3_K) | (1L << GGML_TYPE_Q4_K) | (1L << GGML_TYPE_Q8_K),
        (8192L * 2048 + 8192 * 512 + 2048 * 512) * sizeof(float),  // TODO: should have a better way to get this value
    },
};

static_assert(sizeof(kDeviceCaps) / sizeof(kDeviceCaps[0]) == GGML_QNN_MAX_DEVICES,
              "The number of qnn devices should be equal to GGML_QNN_MAX_DEVICES");
static_assert(kDeviceCaps[QNN_BACKEND_NPU].type == GGML_BACKEND_DEVICE_TYPE_ACCEL,
              "The NPU device should be an accelerator device");
static_assert(kDeviceCaps[QNN_BACKEND_GPU].type == GGML_BACKEND_DEVICE_TYPE_GPU,
              "The GPU device should be an GPU device");
static_assert(
    kDeviceCaps[QNN_BACKEND_CPU].type == GGML_BACKEND_DEVICE_TYPE_ACCEL,
    "The CPU device should be an accelerator device");  // we treat qnn-cpu as a supplementary accelerator device
static_assert(GGML_TYPE_Q4_0 == 2 && GGML_TYPE_Q8_K == 15, "The quantized type order is not correct");
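
// Sketch of how the per-device type masks above can be queried; the helper and the
// field name `supported_types` are assumptions for illustration, not part of this patch:
//
//   inline bool device_supports_type(QNNBackend device, ggml_type type) {
//       // each supported ggml type sets one bit, so the check is a shift-and-mask
//       return (kDeviceCaps[device].supported_types & (1L << type)) != 0;
//   }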
|
||||
|
||||
void insert_path(std::string & path, std::string insert_path, const char separator = ':') {
|
||||
if (!insert_path.empty() && !path.empty()) {
|
||||
insert_path += separator;
|
||||
|
|
@ -108,9 +147,8 @@ qnn_system_interface::~qnn_system_interface() {
    }
}

qnn_instance::qnn_instance(const std::string & lib_path, const std::string & backend_lib_name) :
    _additional_lib_load_path(lib_path),
    _backend_lib_name(std::move(backend_lib_name)) {
qnn_instance::qnn_instance(const std::string & lib_path, QNNBackend device) : _additional_lib_load_path(lib_path) {
    _backend_lib_name = kDeviceCaps[device].lib_name;
    if (set_qnn_lib_search_path(lib_path)) {
        QNN_LOG_DEBUG("[%s] set_qnn_lib_search_path succeeded\n", _backend_lib_name.c_str());
    } else {
@ -181,21 +219,27 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) {
    qnn_status = _qnn_interface->qnn_device_get_platform_info(nullptr, &p_info);
    if (qnn_status == QNN_SUCCESS) {
        QNN_LOG_INFO("device counts %d\n", p_info->v1.numHwDevices);
        QnnDevice_HardwareDeviceInfo_t * infos = p_info->v1.hwDevices;
        QnnHtpDevice_OnChipDeviceInfoExtension_t chipinfo = {};
        QnnDevice_HardwareDeviceInfo_t * infos = p_info->v1.hwDevices;
        for (uint32_t i = 0; i < p_info->v1.numHwDevices; i++) {
            QNN_LOG_INFO("deviceID:%d, deviceType:%d, numCores %d\n", (int) infos[i].v1.deviceId,
                         (int) infos[i].v1.deviceType, (int) infos[i].v1.numCores);
            QnnDevice_DeviceInfoExtension_t devinfo = infos[i].v1.deviceInfoExtension;
            chipinfo = devinfo->onChipDevice;
            size_t htp_arch = (size_t) chipinfo.arch;
            QnnDevice_DeviceInfoExtension_t devinfo = infos[i].v1.deviceInfoExtension;
            QnnHtpDevice_OnChipDeviceInfoExtension_t chipinfo = devinfo->onChipDevice;
            size_t htp_arch = (size_t) chipinfo.arch;
            QNN_LOG_INFO("htp_type:%d(%s)\n", devinfo->devType,
                         (devinfo->devType == QNN_HTP_DEVICE_TYPE_ON_CHIP) ? "ON_CHIP" : "");
            QNN_LOG_INFO("qualcomm soc_model:%d(%s), htp_arch:%d(%s), vtcm_size:%d MB\n", (int) chipinfo.socModel,
                         qnn::get_chipset_desc(chipinfo.socModel), (int) htp_arch, qnn::get_htparch_desc(htp_arch),
                         (int) chipinfo.vtcmSize);
            _soc_info = { chipinfo.socModel, htp_arch, chipinfo.vtcmSize };
            QNN_LOG_INFO("soc_model:%s(%s), htp_arch:%s(%d), vtcm_size:%d MB\n",
                         get_chipset_desc(chipinfo.socModel), get_chipset_model(chipinfo.socModel),
                         get_htparch_desc(htp_arch), (int) htp_arch, (int) chipinfo.vtcmSize);
        }

        if (p_info->v1.numHwDevices) {
            QnnDevice_DeviceInfoExtension_t devinfo = infos[p_info->v1.numHwDevices - 1].v1.deviceInfoExtension;
            QnnHtpDevice_OnChipDeviceInfoExtension_t chipinfo = devinfo->onChipDevice;
            size_t htp_arch = (size_t) chipinfo.arch;
            _soc_info = { chipinfo.socModel, htp_arch, chipinfo.vtcmSize };
        }

        _qnn_interface->qnn_device_free_platform_info(nullptr, p_info);
    } else {
        // For emulator, we can't get platform info
@ -229,20 +273,6 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) {
        QNN_LOG_INFO("create QNN device successfully\n");
    }

    if (_profile_level != sdk_profile_level::profile_off) {
        QNN_LOG_INFO("profiling turned on; level = %d\n", _profile_level);
        auto profile_level =
            _profile_level == sdk_profile_level::profile_detail ? QNN_PROFILE_LEVEL_DETAILED : QNN_PROFILE_LEVEL_BASIC;

        if (QNN_PROFILE_NO_ERROR !=
            _qnn_interface->qnn_profile_create(_qnn_backend_handle, profile_level, &_qnn_profile_handle)) {
            QNN_LOG_WARN("unable to create profile handle in the backend\n");
            return 6;
        } else {
            QNN_LOG_DEBUG("initialize qnn profile successfully\n");
        }
    }

    _rpc_lib_handle = load_lib_with_fallback(kQnnRpcLibName, _additional_lib_load_path);
    if (_rpc_lib_handle) {
        _pfn_rpc_mem_alloc = reinterpret_cast<qnn::pfn_rpc_mem_alloc>(dl_sym(_rpc_lib_handle, "rpcmem_alloc"));
@ -339,7 +369,7 @@ int qnn_instance::qnn_finalize() {
    }

    if (_qnn_context_handle) {
        error = _qnn_interface->qnn_context_free(_qnn_context_handle, _qnn_profile_handle);
        error = _qnn_interface->qnn_context_free(_qnn_context_handle, nullptr);
        if (error != QNN_SUCCESS) {
            QNN_LOG_WARN("failed to free QNN context_handle: ID %u, error %d\n", _qnn_interface->get_backend_id(),
                         (int) QNN_GET_ERROR_CODE(error));
@ -347,15 +377,6 @@ int qnn_instance::qnn_finalize() {
        _qnn_context_handle = nullptr;
    }

    if (_qnn_profile_handle) {
        error = _qnn_interface->qnn_profile_free(_qnn_profile_handle);
        if (error != QNN_SUCCESS) {
            QNN_LOG_WARN("failed to free QNN profile_handle: ID %u, error %d\n", _qnn_interface->get_backend_id(),
                         (int) QNN_GET_ERROR_CODE(error));
        }
        _qnn_profile_handle = nullptr;
    }

    if (_qnn_device_handle) {
        error = _qnn_interface->qnn_device_free(_qnn_device_handle);
        if (error != QNN_SUCCESS) {
@ -535,4 +556,8 @@ int qnn_instance::unload_backend() {
    return 0;
}

const device_caps & get_device_caps(QNNBackend device) {
    return kDeviceCaps[device];
}

} // namespace qnn

@ -82,70 +82,48 @@ class qnn_interface {

    // QnnBackend
    DEFINE_SHIM_FUNCTION_INTERFACE(backend_create, backendCreate);

    DEFINE_SHIM_FUNCTION_INTERFACE(backend_free, backendFree);

    DEFINE_SHIM_FUNCTION_INTERFACE(backend_register_op_package, backendRegisterOpPackage);

    DEFINE_SHIM_FUNCTION_INTERFACE(backend_validate_op_config, backendValidateOpConfig);

    DEFINE_SHIM_FUNCTION_INTERFACE(backend_get_api_version, backendGetApiVersion);

    // QnnDevice
    DEFINE_SHIM_FUNCTION_INTERFACE(device_create, deviceCreate);

    DEFINE_SHIM_FUNCTION_INTERFACE(device_free, deviceFree);

    DEFINE_SHIM_FUNCTION_INTERFACE(device_get_infrastructure, deviceGetInfrastructure);

    DEFINE_SHIM_FUNCTION_INTERFACE(device_get_platform_info, deviceGetPlatformInfo);

    DEFINE_SHIM_FUNCTION_INTERFACE(device_free_platform_info, deviceFreePlatformInfo);

    DEFINE_SHIM_FUNCTION_INTERFACE(device_get_info, deviceGetInfo);

    // QnnContext
    DEFINE_SHIM_FUNCTION_INTERFACE(context_create, contextCreate);

    DEFINE_SHIM_FUNCTION_INTERFACE(context_get_binary_size, contextGetBinarySize);

    DEFINE_SHIM_FUNCTION_INTERFACE(context_get_binary, contextGetBinary);

    DEFINE_SHIM_FUNCTION_INTERFACE(context_create_from_binary, contextCreateFromBinary);

    DEFINE_SHIM_FUNCTION_INTERFACE(context_free, contextFree);

    // QnnGraph
    DEFINE_SHIM_FUNCTION_INTERFACE(graph_create, graphCreate);

    DEFINE_SHIM_FUNCTION_INTERFACE(graph_add_node, graphAddNode);

    DEFINE_SHIM_FUNCTION_INTERFACE(graph_finalize, graphFinalize);

    DEFINE_SHIM_FUNCTION_INTERFACE(graph_execute, graphExecute);

    DEFINE_SHIM_FUNCTION_INTERFACE(graph_retrieve, graphRetrieve);

    // QnnLog
    DEFINE_SHIM_FUNCTION_INTERFACE(log_create, logCreate);

    DEFINE_SHIM_FUNCTION_INTERFACE(log_free, logFree);

    DEFINE_SHIM_FUNCTION_INTERFACE(log_set_log_level, logSetLogLevel);

    // QnnProfile
    DEFINE_SHIM_FUNCTION_INTERFACE(profile_create, profileCreate);

    DEFINE_SHIM_FUNCTION_INTERFACE(profile_set_config, profileSetConfig);
    DEFINE_SHIM_FUNCTION_INTERFACE(profile_get_events, profileGetEvents);

    DEFINE_SHIM_FUNCTION_INTERFACE(profile_get_sub_events, profileGetSubEvents);

    DEFINE_SHIM_FUNCTION_INTERFACE(profile_get_event_data, profileGetEventData);

    DEFINE_SHIM_FUNCTION_INTERFACE(profile_free, profileFree);

    // QnnMem
    DEFINE_SHIM_FUNCTION_INTERFACE(mem_register, memRegister);

    DEFINE_SHIM_FUNCTION_INTERFACE(mem_de_register, memDeRegister);

    // QnnProperty
@ -153,7 +131,6 @@ class qnn_interface {

    // QnnTensor
    DEFINE_SHIM_FUNCTION_INTERFACE(tensor_create_context_tensor, tensorCreateContextTensor);

    DEFINE_SHIM_FUNCTION_INTERFACE(tensor_create_graph_tensor, tensorCreateGraphTensor);

    uint32_t get_backend_id() const { return _qnn_interface.backendId; }
@ -169,18 +146,20 @@ class qnn_interface {

#pragma GCC diagnostic pop

using qnn_interface_ptr = std::shared_ptr<qnn_interface>;

class qnn_instance {
  public:
    using BackendIdType = decltype(QnnInterface_t{}.backendId);

    explicit qnn_instance(const std::string & lib_path, const std::string & backend_lib_name);
    explicit qnn_instance(const std::string & lib_path, QNNBackend device);

    ~qnn_instance() {}

    int qnn_init(const QnnSaver_Config_t ** saver_config);
    int qnn_finalize();

    std::shared_ptr<qnn_interface> get_qnn_interface() {
    qnn_interface_ptr get_qnn_interface() {
        if (!_qnn_interface) {
            QNN_LOG_WARN("please check why _qnn_interface is not loaded\n");
        }
@ -189,8 +168,6 @@ class qnn_instance {

    Qnn_LogHandle_t get_qnn_log_handle() { return _qnn_log_handle; }

    Qnn_ProfileHandle_t get_qnn_profile_handle() { return _qnn_profile_handle; }

    Qnn_DeviceHandle_t get_qnn_device_handle() { return _qnn_device_handle; }

    Qnn_BackendHandle_t get_qnn_backend_handle() { return _qnn_backend_handle; }
@ -256,7 +233,7 @@ class qnn_instance {
    }

    int set_high_performance_mode() {
        if (nullptr == _qnn_htp_perfinfra) {
        if (!_qnn_htp_perfinfra) {
            QNN_LOG_WARN("perf infra is null\n");
            return 1;
        }
@ -425,29 +402,20 @@ class qnn_instance {
    std::string _backend_lib_name;
    BackendIdType _backend_id;

    QnnLog_Level_t _qnn_log_level = QNN_LOG_LEVEL_DEBUG;

#ifdef NDEBUG
    qnn::sdk_profile_level _profile_level = qnn::sdk_profile_level::profile_off;
    QnnLog_Level_t _qnn_log_level = QNN_LOG_LEVEL_INFO;  // TODO: should we consider changing this dynamically?
#else
    qnn::sdk_profile_level _profile_level = qnn::sdk_profile_level::profile_detail;
    QnnLog_Level_t _qnn_log_level = QNN_LOG_LEVEL_DEBUG;
#endif

    std::shared_ptr<qnn::qnn_system_interface> _qnn_sys_interface;
    std::shared_ptr<qnn::qnn_interface> _qnn_interface;

    Qnn_GraphHandle_t _qnn_graph_handle = nullptr;

    Qnn_LogHandle_t _qnn_log_handle = nullptr;

    Qnn_ProfileHandle_t _qnn_profile_handle = nullptr;

    Qnn_DeviceHandle_t _qnn_device_handle = nullptr;

    Qnn_BackendHandle_t _qnn_backend_handle = nullptr;

    Qnn_ContextHandle_t _qnn_context_handle = nullptr;

    Qnn_GraphHandle_t _qnn_graph_handle = nullptr;
    Qnn_LogHandle_t _qnn_log_handle = nullptr;
    Qnn_DeviceHandle_t _qnn_device_handle = nullptr;
    Qnn_BackendHandle_t _qnn_backend_handle = nullptr;
    Qnn_ContextHandle_t _qnn_context_handle = nullptr;
    QnnHtpDevice_PerfInfrastructure_t * _qnn_htp_perfinfra = nullptr;
    uint32_t _qnn_power_configid = 1;

@ -473,4 +441,22 @@ class qnn_instance {
    qnn::qcom_socinfo _soc_info = {};
};

using qnn_instance_ptr = std::shared_ptr<qnn_instance>;

struct device_caps {
    const char * lib_name;
    enum ggml_backend_dev_type type;

    // TODO: should we get this from device?
    uint64_t supported_types;

    // TODO: should we merge this with supported_types?
    uint64_t cpu_preprocess_types;

    // TODO: should we get this from device?
    size_t max_tensor_size_in_bytes;
};

const device_caps & get_device_caps(QNNBackend device);

} // namespace qnn

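Together with the kDeviceCaps table in the .cpp file, these fields give the backend a coarse offload gate. A minimal sketch assuming only the declarations above; `fits_device_limit` is a hypothetical name, not part of this patch:

```cpp
// Illustrative only: use max_tensor_size_in_bytes as an allocation guard.
static bool fits_device_limit(QNNBackend device, const ggml_tensor * tensor) {
    const auto & caps = qnn::get_device_caps(device);
    return caps.max_tensor_size_in_bytes == 0 ||  // 0 means no limitation, as noted in kDeviceCaps
           ggml_nbytes(tensor) <= caps.max_tensor_size_in_bytes;
}
```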
@ -8,15 +8,6 @@
#include "System/QnnSystemInterface.h"

namespace qnn {
// =================================================================================================
//
//  helper data type / data structure / macros / functions of
//  Qualcomm QNN(Qualcomm Neural Network, aka Qualcomm AI Engine Direct) SDK
//  ref:
//  https://github.com/pytorch/executorch/blob/ae3d558d5e6aa04fc52a3065399fe6a773702f52/backends/qualcomm/serialization/qc_schema.py#L53
//  https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/overview.html#supported-snapdragon-devices
// =================================================================================================
enum sdk_profile_level { profile_off = 0, profile_basic, profile_detail };

enum qcom_htp_arch {
    NONE = 0,
@ -29,12 +20,15 @@ enum qcom_htp_arch {

enum qcom_chipset {
    UNKNOWN_SM = 0,
    SM8350 = 30,    // v68, SD 888/888+
    SM8450 = 36,    // v69, SD 8 Gen 1
    SA8295 = 39,    // v68
    SM8475 = 42,    // v69, SD 8+ Gen 1
    SM8550 = 43,    // v73, SD 8 Gen 2
    SSG2115P = 46,  // v73
    SM7675 = 70,    // v73, SD 7+ Gen 3
    SM8635 = 68,    // v73, SD 8s Gen 3
    SM8650 = 57,    // v75, SD 8 Gen 3
    SA8295 = 39,    // v68
    SM8750 = 69,    // v79, SD 8 Gen 4
};

@ -25,8 +25,7 @@ class ggml_qnn_tensor : public std::enable_shared_from_this<ggml_qnn_tensor> {

    explicit ggml_qnn_tensor(tensor_type_t tensor_type, const std::string & name,
                             const qnn_dimension_array_t & dimensions, Qnn_DataType_t data_type, int rank,
                             QNNBackend device, Qnn_GraphHandle_t graph_handle,
                             std::shared_ptr<qnn_instance> qnn_instance) :
                             QNNBackend device, Qnn_GraphHandle_t graph_handle, qnn_instance_ptr qnn_instance) :
        _tensor_name(name),
        _device(device),
        _qnn_instance(qnn_instance),
@ -46,8 +45,7 @@ class ggml_qnn_tensor : public std::enable_shared_from_this<ggml_qnn_tensor> {

    explicit ggml_qnn_tensor(tensor_type_t tensor_type, const std::string & name,
                             const ggml_dimension_array_t & dimensions, ggml_type data_type, int rank,
                             QNNBackend device, Qnn_GraphHandle_t graph_handle,
                             std::shared_ptr<qnn_instance> qnn_instance) :
                             QNNBackend device, Qnn_GraphHandle_t graph_handle, qnn_instance_ptr qnn_instance) :
        ggml_qnn_tensor(tensor_type, name, get_internal_dimension(dimensions, rank),
                        qnn_datatype_from_ggml_datatype(data_type), rank, device, graph_handle, qnn_instance) {}

@ -85,7 +83,7 @@ class ggml_qnn_tensor : public std::enable_shared_from_this<ggml_qnn_tensor> {
        auto qnn_interface = _qnn_instance->get_qnn_interface();
        auto error = qnn_interface->qnn_tensor_create_graph_tensor(_graph_handle, &qnn_tensor);
        if (error != QNN_SUCCESS) {
            QNN_LOG_WARN("[%s]allocate id failed , error: %d\n", _tensor_name.c_str(), (int) error);
            QNN_LOG_ERROR("[%s]allocate id failed, error: %s\n", _tensor_name.c_str(), get_qnn_error_string(error));
            return false;
        }

@ -95,7 +93,7 @@ class ggml_qnn_tensor : public std::enable_shared_from_this<ggml_qnn_tensor> {
        return true;
    }

    bool bind_ggml_tensor(ggml_tensor * tensor) {
    bool bind_ggml_tensor(ggml_tensor * tensor, qnn_buffer_ptr buffer) {
        if (!_can_unbind) {
            QNN_LOG_DEBUG("[%s]already has buffer storage, skip bind\n", _tensor_name.c_str());
            return true;
@ -111,8 +109,12 @@ class ggml_qnn_tensor : public std::enable_shared_from_this<ggml_qnn_tensor> {
        }
#endif

        auto buffer =
            std::make_shared<qnn_mem_buffer_slice>(reinterpret_cast<uint8_t *>(tensor->data), ggml_nbytes(tensor));
        if (!buffer) {
            buffer =
                std::make_shared<qnn_mem_buffer_slice>(reinterpret_cast<uint8_t *>(tensor->data), ggml_nbytes(tensor));
            QNN_LOG_DEBUG("[%s][%s]attach buffer to tensor(%s), size: %d\n", get_backend_name(_device),
                          _tensor_name.c_str(), tensor->name, (int) buffer->get_size());
        }
        if (!bind_buffer_impl(buffer)) {
            QNN_LOG_WARN("[%s]failed to bind ggml tensor(%s)\n", _tensor_name.c_str(), ggml_get_name(tensor));
            return false;
@ -154,7 +156,7 @@ class ggml_qnn_tensor : public std::enable_shared_from_this<ggml_qnn_tensor> {
        }

        QNN_LOG_DEBUG("[%s][%s]unbind from buffer: %p, size: %d\n", get_backend_name(_device), _tensor_name.c_str(),
                      (void *) _buffer.get(), (int) _buffer->get_size());
                      (void *) _buffer->get_buffer(), (int) _buffer->get_size());
        _buffer.reset();

        if (_ggml_tensor) {
@ -175,15 +177,19 @@ class ggml_qnn_tensor : public std::enable_shared_from_this<ggml_qnn_tensor> {

    uint32_t get_qnn_tensor_id() const { return QNN_TENSOR_GET_ID(_qnn_tensor); }

    const std::string & get_tensor_name() const { return _tensor_name; }

  private:
    bool bind_buffer_impl(qnn_buffer_ptr buffer) {
        if (_buffer) {
            if (_buffer != buffer) {
                QNN_LOG_WARN("[%s]has been bound to another buffer %p\n", _tensor_name.c_str(), (void *) _buffer.get());
                QNN_LOG_WARN("[%s]has been bound to another buffer %p\n", _tensor_name.c_str(),
                             (void *) _buffer->get_buffer());
                return false;
            }

            QNN_LOG_DEBUG("[%s]already bound to same ggml tensor %p\n", _tensor_name.c_str(), (void *) _buffer.get());
            QNN_LOG_DEBUG("[%s]already bound to same ggml tensor %p\n", _tensor_name.c_str(),
                          (void *) _buffer->get_buffer());
            return true;
        }

@ -221,8 +227,8 @@ class ggml_qnn_tensor : public std::enable_shared_from_this<ggml_qnn_tensor> {
            QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_RAW);
            Qnn_ClientBuffer_t client_buf = { buffer->get_buffer(), (uint32_t) buffer->get_size() };
            QNN_TENSOR_SET_CLIENT_BUF(_qnn_tensor, client_buf);
            QNN_LOG_DEBUG("[%s]use client buffer %p size %d\n", _tensor_name.c_str(), client_buf.data,
                          (int) client_buf.dataSize);
            QNN_LOG_DEBUG("[%s][%s]use client buffer %p size %d\n", get_backend_name(_device), _tensor_name.c_str(),
                          client_buf.data, (int) client_buf.dataSize);
        }

        _buffer = buffer;
@ -233,7 +239,7 @@ class ggml_qnn_tensor : public std::enable_shared_from_this<ggml_qnn_tensor> {
        }

        QNN_LOG_DEBUG("[%s][%s]bind to buffer: %p, size: %d\n", get_backend_name(_device), _tensor_name.c_str(),
                      (void *) buffer.get(), (int) buffer->get_size());
                      (void *) buffer->get_buffer(), (int) buffer->get_size());
        return true;
    }

@ -246,10 +252,11 @@ class ggml_qnn_tensor : public std::enable_shared_from_this<ggml_qnn_tensor> {

        if (_rpc_buffer) {
            memcpy(_rpc_buffer->get_buffer(), _buffer->get_buffer(), _buffer->get_size());
            // For CPU and GPU, the data is already in the tensor.
            QNN_LOG_DEBUG("[%s][%s]write buffer(%p) to rpc buffer(%p)\n", get_backend_name(_device),
                          _tensor_name.c_str(), (void *) _buffer->get_buffer(), (void *) _rpc_buffer->get_buffer());
        }

        // For CPU and GPU, the data is already in the tensor.
        QNN_LOG_DEBUG("[%s][%s]write tensor to qnn\n", get_backend_name(_device), _tensor_name.c_str());
        return true;
    }

@ -262,10 +269,11 @@ class ggml_qnn_tensor : public std::enable_shared_from_this<ggml_qnn_tensor> {

        if (_rpc_buffer) {
            memcpy(_buffer->get_buffer(), _rpc_buffer->get_buffer(), _buffer->get_size());
            // For CPU and GPU, the data is already in the tensor.
            QNN_LOG_DEBUG("[%s][%s]read buffer(%p) from rpc buffer(%p)\n", get_backend_name(_device),
                          _tensor_name.c_str(), (void *) _buffer->get_buffer(), (void *) _rpc_buffer->get_buffer());
        }

        // For CPU and GPU, the data is already in the tensor.
        QNN_LOG_DEBUG("[%s][%s]read tensor from qnn\n", get_backend_name(_device), _tensor_name.c_str());
        return true;
    }

@ -298,8 +306,8 @@ class ggml_qnn_tensor : public std::enable_shared_from_this<ggml_qnn_tensor> {
                break;
        }
        QNN_TENSOR_SET_TYPE(_qnn_tensor, new_tensor_type);
        QNN_LOG_DEBUG("[%s][%s]tensor changed to type %d\n", get_backend_name(_device), _tensor_name.c_str(),
                      new_tensor_type);
        QNN_LOG_DEBUG("[%s][%s]new_tensor_type %s\n", get_backend_name(_device), _tensor_name.c_str(),
                      get_qnn_tensor_type_name(new_tensor_type));
    }

    bool should_use_mem_handle() const {
@ -307,16 +315,16 @@ class ggml_qnn_tensor : public std::enable_shared_from_this<ggml_qnn_tensor> {
        return false;
    }

    std::string _tensor_name;
    qnn_buffer_ptr _buffer;
    bool _can_unbind = true;
    QNNBackend _device;
    std::shared_ptr<qnn_instance> _qnn_instance;
    Qnn_Tensor_t _qnn_tensor = qnn_tensor_init(kDefaultQnnTensorVersion);
    qnn_dimension_array_t _dimensions = {};
    Qnn_GraphHandle_t _graph_handle = nullptr;
    qnn_buffer_ptr _rpc_buffer;
    ggml_tensor * _ggml_tensor = nullptr;
    std::string _tensor_name;
    qnn_buffer_ptr _buffer;
    bool _can_unbind = true;
    QNNBackend _device;
    qnn_instance_ptr _qnn_instance;
    Qnn_Tensor_t _qnn_tensor = qnn_tensor_init(kDefaultQnnTensorVersion);
    qnn_dimension_array_t _dimensions = {};
    Qnn_GraphHandle_t _graph_handle = nullptr;
    qnn_buffer_ptr _rpc_buffer;
    ggml_tensor * _ggml_tensor = nullptr;

    DISABLE_COPY(ggml_qnn_tensor);
    DISABLE_MOVE(ggml_qnn_tensor);
@ -340,13 +348,33 @@ inline int get_ggml_tensors_max_rank(const qnn::ggml_tensor_array_t & tensors) {
    return max_rank;
}

inline bool bind_tensors_with_custom_buffers(const ggml_tensor_array_t & ggml_tensors,
                                             std::vector<qnn_buffer_ptr> & buffers,
                                             qnn_tensor_array_t & tensor_wrappers,
                                             std::vector<Qnn_Tensor_t> & qnn_tensors) {
    GGML_ASSERT(tensor_wrappers.size() == ggml_tensors.size());
    GGML_ASSERT(buffers.size() == ggml_tensors.size());
    qnn_tensors.resize(ggml_tensors.size());
    for (size_t i = 0; i < ggml_tensors.size(); i++) {
        auto * ggml_tensor = ggml_tensors[i];
        if (!tensor_wrappers[i]->bind_ggml_tensor(ggml_tensor, buffers[i])) {
            QNN_LOG_ERROR("bind tensor %s failed\n", ggml_get_name(ggml_tensor));
            return false;
        }

        qnn_tensors[i] = tensor_wrappers[i]->get_qnn_tensor();
    }

    return true;
}

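The overload above lets the caller supply a pre-filled buffer per tensor instead of wrapping tensor->data. A minimal usage sketch, with placeholder variable names that are not code from this patch:

```cpp
// Illustrative only: drive bind_tensors_with_custom_buffers() from graph-setup code.
std::vector<qnn::qnn_buffer_ptr> converted;    // e.g. buffers presumably holding pre-converted (dequantized) data
qnn::qnn_tensor_array_t          wrappers;     // one ggml_qnn_tensor wrapper per ggml tensor
std::vector<Qnn_Tensor_t>        qnn_tensors;  // filled by the helper

if (!qnn::bind_tensors_with_custom_buffers(ggml_tensors, converted, wrappers, qnn_tensors)) {
    // binding failed; an error was already logged per tensor
}
// Passing an empty qnn_buffer_ptr instead falls back to wrapping tensor->data,
// which is exactly what bind_tensors() below does.
```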
inline bool bind_tensors(const ggml_tensor_array_t & ggml_tensors, qnn_tensor_array_t & tensor_wrappers,
                         std::vector<Qnn_Tensor_t> & qnn_tensors) {
    GGML_ASSERT(tensor_wrappers.size() == ggml_tensors.size());
    qnn_tensors.resize(ggml_tensors.size());
    for (size_t i = 0; i < ggml_tensors.size(); i++) {
        auto * ggml_tensor = ggml_tensors[i];
        if (!tensor_wrappers[i]->bind_ggml_tensor(ggml_tensor)) {
        if (!tensor_wrappers[i]->bind_ggml_tensor(ggml_tensor, qnn_buffer_ptr())) {
            QNN_LOG_ERROR("bind tensor %s failed\n", ggml_get_name(ggml_tensor));
            return false;
        }
@ -361,7 +389,7 @@ inline bool bind_tensors(const ggml_tensor_array_t & ggml_tensors, qnn_tensor_ar
    GGML_ASSERT(tensor_wrappers.size() == ggml_tensors.size());
    for (size_t i = 0; i < ggml_tensors.size(); i++) {
        auto * ggml_tensor = ggml_tensors[i];
        if (!tensor_wrappers[i]->bind_ggml_tensor(ggml_tensor)) {
        if (!tensor_wrappers[i]->bind_ggml_tensor(ggml_tensor, qnn_buffer_ptr())) {
            QNN_LOG_ERROR("bind tensor %s failed\n", ggml_get_name(ggml_tensor));
            return false;
        }

@ -178,8 +178,8 @@ const char * get_ggml_type_name(ggml_type type) {
    return traits->type_name;
}

const char * get_backend_name(QNNBackend device_index) {
    switch (device_index) {
const char * get_backend_name(QNNBackend device) {
    switch (device) {
        case QNN_BACKEND_CPU:
            return "qnn-cpu";
        case QNN_BACKEND_GPU:
@ -192,18 +192,65 @@ const char * get_backend_name(QNNBackend device_index) {
    }
}

const char * get_chipset_desc(uint32_t chipset_id) {
    switch (chipset_id) {
const char * get_backend_desc(QNNBackend device) {
    switch (device) {
        case QNN_BACKEND_CPU:
            return "CPU";
        case QNN_BACKEND_GPU:
            return "Adreno GPU";
        case QNN_BACKEND_NPU:
            return "Hexagon NPU";
        case QNN_BACKEND_COUNT:
        default:
            return "unknown";
    }
}

const char * get_chipset_desc(uint32_t soc_model) {
    switch (soc_model) {
        case SM8350:
            return "Snapdragon 888/888+";
        case SM8450:
            return "SD 8 Gen 1 (SM8450)";
            return "Snapdragon 8 Gen 1";
        case SM8475:
            return "SD 8+ Gen 1 (SM8475)";
            return "Snapdragon 8 Gen 1+";
        case SM8550:
            return "SD 8 Gen 2 (SM8550)";
            return "Snapdragon 8 Gen 2";
        case SM7675:
            return "Snapdragon 7+ Gen 3";
        case SM8635:
            return "Snapdragon 8s Gen 3";
        case SM8650:
            return "SD 8 Gen 3 (SM8650)";
            return "Snapdragon 8 Gen 3";
        case SM8750:
            return "SD 8 Gen 4 (SM8750)";
            return "Snapdragon 8 Elite";
        default:
            return "unknown";
    }
}

const char * get_chipset_model(uint32_t soc_model) {
    switch (soc_model) {
        case SM8350:
            return "SM8350";
        case SM8450:
            return "SM8450";
        case SA8295:
            return "SA8295";
        case SM8475:
            return "SM8475";
        case SM8550:
            return "SM8550";
        case SSG2115P:
            return "SSG2115P";
        case SM7675:
            return "SM7675";
        case SM8635:
            return "SM8635";
        case SM8650:
            return "SM8650";
        case SM8750:
            return "SM8750";
        default:
            return "unknown";
    }
@ -212,15 +259,15 @@ const char * get_chipset_desc(uint32_t chipset_id) {
const char * get_htparch_desc(size_t htp_arch) {
    switch (htp_arch) {
        case V68:
            return "QCOM_HTP_V68";
            return "HTP_V68";
        case V69:
            return "QCOM_HTP_V69";
            return "HTP_V69";
        case V73:
            return "QCOM_HTP_V73";
            return "HTP_V73";
        case V75:
            return "QCOM_HTP_V75";
            return "HTP_V75";
        case V79:
            return "QCOM_HTP_V79";
            return "HTP_V79";
        default:
            return "unknown";
    }
@ -234,6 +281,29 @@ uint32_t get_ggml_tensor_data_size(const ggml_tensor * tensor) {
    return (uint32_t) ggml_nbytes(tensor);
}

const char * get_qnn_tensor_type_name(Qnn_TensorType_t type) {
    switch (type) {
        case QNN_TENSOR_TYPE_APP_WRITE:
            return "QNN_TENSOR_TYPE_APP_WRITE";
        case QNN_TENSOR_TYPE_APP_READ:
            return "QNN_TENSOR_TYPE_APP_READ";
        case QNN_TENSOR_TYPE_APP_READWRITE:
            return "QNN_TENSOR_TYPE_APP_READWRITE";
        case QNN_TENSOR_TYPE_STATIC:
            return "QNN_TENSOR_TYPE_STATIC";
        case QNN_TENSOR_TYPE_NATIVE:
            return "QNN_TENSOR_TYPE_NATIVE";
        case QNN_TENSOR_TYPE_UNDEFINED:
            return "QNN_TENSOR_TYPE_UNDEFINED";
        case QNN_TENSOR_TYPE_NULL:
            return "QNN_TENSOR_TYPE_NULL";
        default:
            break;
    }

    return "unknown";
}

#ifdef _WIN32
static void * _align_alloc(size_t alignment, size_t size) {
    return _aligned_malloc(size, alignment);
@ -265,14 +335,15 @@ void align_free(void * ptr) {
void * page_align_alloc(size_t size) {
    const size_t alignment = _get_page_size();
    size_t size_aligned = align_to_generic<size_t>(alignment, size);
    QNN_LOG_DEBUG("_align_alloc success, alignment: %ld, size: %ld, size_aligned: %ld\n", alignment, size, size_aligned);
    void * data = _align_alloc(alignment, size_aligned);
    void * data = _align_alloc(alignment, size_aligned);
    if (!data) {
        QNN_LOG_WARN("_align_alloc failed, alignment: %ld, size: %ld, size_aligned: %ld\n", alignment, size,
                     size_aligned);
        return nullptr;
    }

    QNN_LOG_DEBUG("_align_alloc success, alignment: %ld, size: %ld, size_aligned: %ld\n", alignment, size,
                  size_aligned);
    return data;
}

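page_align_alloc rounds the requested size up to a whole number of pages before allocating. align_to_generic is defined elsewhere in this backend; a typical round-up-to-multiple implementation looks like the sketch below (an assumption, not the patch's code):

```cpp
// Illustrative only: round value up to the next multiple of alignment.
template <typename T> static T round_up_to(T alignment, T value) {
    return ((value + alignment - 1) / alignment) * alignment;  // e.g. alignment 4096, value 5000 -> 8192
}
```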
@ -23,11 +23,14 @@ qnn_dimension_array_t get_view_internal_dimension(const ggml_tensor * tensor, si

uint32_t get_ggml_tensor_rank(const ggml_tensor * tensor);
const char * get_ggml_type_name(ggml_type type);
const char * get_backend_name(QNNBackend device_index);
const char * get_chipset_desc(uint32_t chipset_id);
const char * get_backend_name(QNNBackend device);
const char * get_backend_desc(QNNBackend device);
const char * get_chipset_desc(uint32_t soc_model);
const char * get_chipset_model(uint32_t soc_model);
const char * get_htparch_desc(size_t htp_arch);
intptr_t align_to(size_t alignment, intptr_t offset);
uint32_t get_ggml_tensor_data_size(const ggml_tensor * tensor);
const char * get_qnn_tensor_type_name(Qnn_TensorType_t type);

void * page_align_alloc(size_t size);
void align_free(void * ptr);
@ -199,48 +202,6 @@ const char * qnn_datatype_to_string(Qnn_DataType_t qnn_type);
size_t get_system_total_memory_in_bytes();
size_t get_system_free_memory_in_bytes();

#if ENABLE_QNNBACKEND_PERF
class qnn_perf {
  public:
    qnn_perf(const std::string & perf_name) : _perf_name(std::move(perf_name)) {};

    ~qnn_perf() { info(); }

    qnn_perf() = delete;
    qnn_perf(const qnn_perf &) = delete;
    qnn_perf & operator=(const qnn_perf &) = delete;

    void start() { _begin_time = ggml_time_us(); }

    void info() {
        _end_time = ggml_time_us();
        _duration = (_end_time - _begin_time);
        QNN_LOG_INFO("duration of %s : %lld microseconds\n", _perf_name.c_str(), _duration);
    }

  private:
    int64_t _begin_time = 0LL;
    int64_t _end_time = 0LL;
    int64_t _duration = 0LL;
    std::string _perf_name;
};
#else
class qnn_perf {
  public:
    qnn_perf(const std::string &) {}

    ~qnn_perf() { info(); }

    qnn_perf() = delete;
    qnn_perf(const qnn_perf &) = delete;
    qnn_perf & operator=(const qnn_perf &) = delete;

    void start() {}

    void info() {}
};
#endif

} // namespace qnn

#define QNN_TENSOR_GET_ID(tensor) qnn::get_qnn_tensorid(tensor)
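The last hunk drops the old ENABLE_QNNBACKEND_PERF qnn_perf helper (the hunk shrinks from 48 lines to 6). For reference, a sketch of the RAII timing pattern the removed class implemented; the scope name is a placeholder:

```cpp
// Illustrative only: with ENABLE_QNNBACKEND_PERF, the destructor calls info() at scope exit.
{
    qnn::qnn_perf perf("mul_mat");
    perf.start();           // records ggml_time_us()
    // ... run the op ...
}                           // logs: "duration of mul_mat : N microseconds"
```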