[feat] Port ggml graph to QNN graph (#16)

* more log

* split graph implementation into cpp file

* rename: ggml_qnn_graph -> qnn_graph

* add input/output tensor to graph

* fix assert

* wip

* add _ggml_tensor field in qnn tensor

* add comments

* add set_data_buffer with raw memory buffer

* use set_data_buffer

* op param buffer use qnn_buffer_ptr

* add qnn_mem_buffer_slice

* use qnn_buffer_ptr as tensor buffer

* use new set_data_buffer to reduce copy

* ggml_qnn_op_config: add function to set input/output tensor before init node

* remove ggml_qnn_connectable_op_config and use ggml_qnn_single_op_config instead

* wip

* add initialize_op_nodes without tensor params

* wip

* add op caps table

* merge kGgmlOpToQnnOp and kOpCaps tables

* wip

* add cache parameter to create_tensors

* add init_from_ggml_graph

* disable gelu for all backends

* wip

* move op index calc to op config module

* use the ggml_tensor as parameter of build_graph

* add log

* use create_operation_from_op_tensor in old build_graph function

* remove unused constructors

* fix parameter count

* remove unused member func/var

* make init_from_ggml_graph a class member: build_graph_from_ggml_graph

* move graph finalize into member function `finalize()`

* get graph key from ggml op tensor directly

* append output type

* reduce tensor key length

* add function to generate key from ggml_cgraph

* simplify graph cache insert and delete

* remove template param at get_qnn_graph_from_cache

* wip

* merge kQnnUnaryOpsTable and kQnnBinaryOpsTable

* refactor device_supports_op

* add log

* wip

* use framework function to check same shape

* wip

* extract some logic into separated function

* wip

* add execution function that runs graph

* add function to create qnn graph from ggml_cgraph with cache

* execute graph directly

* return null graph key for empty graph

* add more qualcomm chipset enums

* add cap for reshape

* disable some ops

* try to skip GGML_OP_VIEW

* more log for view tensor

* append param tensor into intermediate tensor key

* use 'ordered' set

* fix warning in release

* wip
nullname committed 2025-01-10 11:13:25 +08:00 (via GitHub)
parent 8f07b3e3f6
commit f2d8d017da
13 changed files with 1317 additions and 869 deletions
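Taken together, the series replaces per-op dispatch with whole-graph execution: a string key is derived from the ggml graph, and a finalized QNN graph is built once per key and reused from a cache afterwards. The stand-alone sketch below illustrates only that caching pattern; device_graph and get_or_build are made-up names for this example, while the real code uses qnn::qnn_graph and ctx->qnn_graph_cache as shown in the diff.

#include <memory>
#include <string>
#include <unordered_map>

struct device_graph {                        // hypothetical stand-in for qnn::qnn_graph
    explicit device_graph(std::string key) : key(std::move(key)) {}
    bool execute() { return true; }          // placeholder for graph execution
    std::string key;
};

using graph_cache_t = std::unordered_map<std::string, std::unique_ptr<device_graph>>;

device_graph *get_or_build(graph_cache_t &cache, const std::string &key) {
    auto it = cache.find(key);
    if (it != cache.end()) {
        return it->second.get();             // cache hit: reuse the already-finalized graph
    }
    auto graph = std::make_unique<device_graph>(key);  // build + finalize exactly once per key
    auto *ptr  = graph.get();
    cache[key] = std::move(graph);
    return ptr;
}

int main() {
    graph_cache_t cache;
    auto *g1 = get_or_build(cache, "MUL_MATf32_256x16x10f32_256x1x10f32#ADD");
    auto *g2 = get_or_build(cache, "MUL_MATf32_256x16x10f32_256x1x10f32#ADD");
    return (g1 == g2 && g1->execute()) ? 0 : 1;         // second lookup reuses the cached graph
}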

View File

@ -11,12 +11,10 @@
#include "tensor.hpp"
#include "utils.hpp"
#ifndef NDEBUG
namespace {
bool qnn_is_valid_params(ggml_backend_qnn_device_context *ctx, const ggml_tensor *src, ggml_tensor *dst) {
if (!ctx || !src || !dst) {
bool qnn_is_op_valid(ggml_backend_qnn_device_context *ctx, const ggml_tensor *dst) {
if (!ctx || !dst) {
QNN_LOG_WARN("invalid params");
return false;
}
@ -27,77 +25,36 @@ bool qnn_is_valid_params(ggml_backend_qnn_device_context *ctx, const ggml_tensor
return false;
}
return true;
}
bool qnn_is_valid_params(ggml_backend_qnn_device_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1,
ggml_tensor *dst) {
if (!ctx || !src0 || !src1 || !dst) {
QNN_LOG_WARN("invalid params");
return false;
const auto param_count = qnn::get_qnn_op_input_param_count(qnn::get_qnn_op_index(dst));
switch (param_count) {
case 1:
return dst->src[0];
case 2:
return dst->src[0] && dst->src[1];
default:
QNN_LOG_WARN("invalid op param count %d", (int)param_count);
break;
}
auto instance = ctx->instance;
if (!instance) {
QNN_LOG_WARN("invalid instance");
return false;
}
return true;
return false;
}
#ifndef NDEBUG
void print_ggml_tensor(const ggml_tensor *tensor) {
QNN_LOG_DEBUG("%s: type:%s ne: %ldx%ldx%ldx%ld, nb: %ldx%ldx%ldx%ld", tensor->name, ggml_type_name(tensor->type),
(long)tensor->ne[0], (long)tensor->ne[1], (long)tensor->ne[2], (long)tensor->ne[3],
(long)tensor->nb[0], (long)tensor->nb[1], (long)tensor->nb[2], (long)tensor->nb[3]);
}
#endif
} // namespace
#define CHECK_PARAMS(ctx, ...) \
if (!qnn_is_valid_params((ctx), __VA_ARGS__)) { \
return false; \
}
#else
#define CHECK_PARAMS(ctx, ...)
#endif
namespace {
bool is_tensor_dimensions_equal(const ggml_tensor *l, const ggml_tensor *r) {
const auto dim_l = ggml_n_dims(l);
if (dim_l != ggml_n_dims(r)) {
return false;
}
typedef bool (*ggml_qnn_op_t)(ggml_backend_qnn_device_context *ctx, ggml_tensor *dst);
for (int i = 0; i < dim_l; i++) {
if (l->ne[i] != r->ne[i]) {
return false;
}
}
return true;
}
typedef bool (*ggml_qnn_unary_op_t)(ggml_backend_qnn_device_context *ctx, ggml_tensor *src, ggml_tensor *dst);
typedef bool (*ggml_qnn_binary_op_t)(ggml_backend_qnn_device_context *ctx, ggml_tensor *src0, ggml_tensor *src1,
ggml_tensor *dst);
typedef const ggml_qnn_unary_op_t (&ggml_qnn_unary_op_array_t)[GGML_OP_COUNT + GGML_UNARY_OP_COUNT];
typedef const ggml_qnn_binary_op_t (&ggml_qnn_binary_op_array_t)[GGML_OP_COUNT];
constexpr const size_t kGgmlUnaryOpStart = GGML_OP_COUNT;
template <size_t _Size>
qnn::ggml_tensor_array_t to_ggml_tensor_array(const std::array<ggml_tensor *, _Size> &array) {
return qnn::ggml_tensor_array_t(array.data(), array.data() + _Size);
}
template <size_t _InputSize>
bool execute_graph(qnn::ggml_qnn_graph *graph, const std::array<ggml_tensor *, _InputSize> &inputs,
ggml_tensor *output) {
if (!graph->execute(to_ggml_tensor_array<_InputSize>(inputs), to_ggml_tensor_array<1>({output}))) {
bool execute_graph(qnn::qnn_graph *graph, ggml_tensor *output) {
if (!graph->execute(output)) {
QNN_LOG_WARN("execute failed");
return false;
}
@ -105,165 +62,114 @@ bool execute_graph(qnn::ggml_qnn_graph *graph, const std::array<ggml_tensor *, _
return true;
}
template <size_t _InputSize, size_t _OutputSize>
std::string get_graph_key(const std::string &op_name, const std::array<ggml_tensor *, _InputSize> &inputs,
const std::array<ggml_tensor *, _OutputSize> &outputs) {
constexpr static const auto append_dimensions = [](std::string &key, const ggml_tensor *tensor) {
char buffer[256] = {};
snprintf(buffer, sizeof(buffer), "_%ldx%ldx%ldx%ld%s", (long)tensor->ne[0], (long)tensor->ne[1],
(long)tensor->ne[2], (long)tensor->ne[3], qnn::get_ggml_type_name(tensor->type));
key += buffer;
};
std::string graph_key(op_name);
for (auto &input : inputs) {
append_dimensions(graph_key, input);
void append_tensor_dimensions(const ggml_tensor *tensor, std::string &output) {
char buffer[256] = {};
const auto *type_name = qnn::get_ggml_type_name(tensor->type);
int len = 0;
switch (ggml_n_dims(tensor)) {
case 1:
len = snprintf(buffer, sizeof(buffer), "%ld%s", (long)tensor->ne[0], type_name);
break;
case 2:
len = snprintf(buffer, sizeof(buffer), "%ldx%ld%s", (long)tensor->ne[0], (long)tensor->ne[1], type_name);
break;
case 3:
len = snprintf(buffer, sizeof(buffer), "%ldx%ldx%ld%s", (long)tensor->ne[0], (long)tensor->ne[1],
(long)tensor->ne[2], type_name);
break;
case 4:
default:
len = snprintf(buffer, sizeof(buffer), "%ldx%ldx%ldx%ld%s", (long)tensor->ne[0], (long)tensor->ne[1],
(long)tensor->ne[2], (long)tensor->ne[3], type_name);
break;
}
graph_key += qnn::get_ggml_type_name(outputs.front()->type);
return graph_key;
GGML_ASSERT(len > 0 && len < (int)sizeof(buffer));
output.append(buffer, len);
}
constexpr const char *kGgmlOpToQnnOp[] = {
nullptr, // GGML_OP_NONE
nullptr, // GGML_OP_DUP
QNN_OP_ELEMENT_WISE_ADD, // GGML_OP_ADD
nullptr, // GGML_OP_ADD1
nullptr, // GGML_OP_ACC
QNN_OP_ELEMENT_WISE_SUBTRACT, // GGML_OP_SUB
QNN_OP_ELEMENT_WISE_MULTIPLY, // GGML_OP_MUL
QNN_OP_ELEMENT_WISE_DIVIDE, // GGML_OP_DIV
nullptr, // GGML_OP_SQR
QNN_OP_ELEMENT_WISE_SQUARE_ROOT, // GGML_OP_SQRT
QNN_OP_ELEMENT_WISE_LOG, // GGML_OP_LOG
nullptr, // GGML_OP_SIN
nullptr, // GGML_OP_COS
nullptr, // GGML_OP_SUM
nullptr, // GGML_OP_SUM_ROWS
nullptr, // GGML_OP_MEAN
nullptr, // GGML_OP_ARGMAX
nullptr, // GGML_OP_COUNT_EQUAL
nullptr, // GGML_OP_REPEAT
nullptr, // GGML_OP_REPEAT_BACK
nullptr, // GGML_OP_CONCAT
nullptr, // GGML_OP_SILU_BACK
nullptr, // GGML_OP_NORM
nullptr, // GGML_OP_RMS_NORM
nullptr, // GGML_OP_RMS_NORM_BACK
nullptr, // GGML_OP_GROUP_NORM
void get_graph_key_from_op(const ggml_tensor *op, std::string &output) {
GGML_ASSERT(op->op != GGML_OP_NONE);
output += ggml_op_desc(op);
output += qnn::get_ggml_type_name(op->type);
const auto param_count = qnn::get_qnn_op_input_param_count(qnn::get_qnn_op_index(op));
for (size_t i = 0; i < param_count; ++i) {
auto *input = op->src[i];
output += '_';
append_tensor_dimensions(input, output);
}
}
QNN_OP_MAT_MUL, // GGML_OP_MUL_MAT
nullptr, // GGML_OP_MUL_MAT_ID
nullptr, // GGML_OP_OUT_PROD
void get_op_key_with_src_op_desc(const ggml_tensor *op, std::string &output) {
output += ggml_op_desc(op);
output += '(';
if (op->src[0]) {
output += ggml_op_desc(op->src[0]);
}
for (size_t i = 1; i < GGML_MAX_DIMS && op->src[i]; ++i) {
output += ',';
output += ggml_op_desc(op->src[i]);
}
output += ')';
}
nullptr, // GGML_OP_SCALE
nullptr, // GGML_OP_SET
nullptr, // GGML_OP_CPY
nullptr, // GGML_OP_CONT
nullptr, // GGML_OP_RESHAPE
nullptr, // GGML_OP_VIEW
QNN_OP_TRANSPOSE, // GGML_OP_PERMUTE
nullptr, // GGML_OP_TRANSPOSE
nullptr, // GGML_OP_GET_ROWS
nullptr, // GGML_OP_GET_ROWS_BACK
nullptr, // GGML_OP_DIAG
nullptr, // GGML_OP_DIAG_MASK_INF
nullptr, // GGML_OP_DIAG_MASK_ZERO
nullptr, // GGML_OP_SOFT_MAX
nullptr, // GGML_OP_SOFT_MAX_BACK
nullptr, // GGML_OP_ROPE
nullptr, // GGML_OP_ROPE_BACK
nullptr, // GGML_OP_CLAMP
nullptr, // GGML_OP_CONV_TRANSPOSE_1D
nullptr, // GGML_OP_IM2COL
nullptr, // GGML_OP_IM2COL_BACK
nullptr, // GGML_OP_CONV_TRANSPOSE_2D
nullptr, // GGML_OP_POOL_1D
nullptr, // GGML_OP_POOL_2D
nullptr, // GGML_OP_POOL_2D_BACK
nullptr, // GGML_OP_UPSCALE
nullptr, // GGML_OP_PAD
nullptr, // GGML_OP_PAD_REFLECT_1D
nullptr, // GGML_OP_ARANGE
nullptr, // GGML_OP_TIMESTEP_EMBEDDING
nullptr, // GGML_OP_ARGSORT
nullptr, // GGML_OP_LEAKY_RELU
void get_graph_key_from_cgraph(const ggml_cgraph *cgraph, std::string &output) {
// generate a key from the graph; it is used to look up the graph in the cache, e.g.:
// "MUL_MATf32_256x16x10f32_256x1x10f32#LOG#ADD#ADDf32_16x1x10f32"
if (cgraph->n_nodes == 0) {
QNN_LOG_DEBUG("empty cgraph");
return;
}
nullptr, // GGML_OP_FLASH_ATTN_EXT
nullptr, // GGML_OP_FLASH_ATTN_BACK
nullptr, // GGML_OP_SSM_CONV
nullptr, // GGML_OP_SSM_SCAN
nullptr, // GGML_OP_WIN_PART
nullptr, // GGML_OP_WIN_UNPART
nullptr, // GGML_OP_GET_REL_POS
nullptr, // GGML_OP_ADD_REL_POS
nullptr, // GGML_OP_RWKV_WKV6
{
bool is_start = true;
for (int i = 0; i < cgraph->n_nodes; ++i) {
auto *op = cgraph->nodes[i];
if (ggml_is_empty(op)) {
QNN_LOG_DEBUG("empty op in graph, skipping");
continue;
}
nullptr, // GGML_OP_UNARY
if (op->op == GGML_OP_NONE) {
QNN_LOG_DEBUG("GGML_OP_NONE in graph, skipping");
continue;
}
nullptr, // GGML_OP_MAP_UNARY
nullptr, // GGML_OP_MAP_BINARY
if (is_start) {
get_graph_key_from_op(cgraph->nodes[0], output);
is_start = false;
} else {
output += '#';
get_op_key_with_src_op_desc(op, output);
}
}
}
nullptr, // GGML_OP_MAP_CUSTOM1_F32
nullptr, // GGML_OP_MAP_CUSTOM2_F32
nullptr, // GGML_OP_MAP_CUSTOM3_F32
nullptr, // GGML_OP_MAP_CUSTOM1
nullptr, // GGML_OP_MAP_CUSTOM2
nullptr, // GGML_OP_MAP_CUSTOM3
nullptr, // GGML_OP_CROSS_ENTROPY_LOSS
nullptr, // GGML_OP_CROSS_ENTROPY_LOSS_BACK
nullptr, // GGML_OP_OPT_STEP_ADAMW
// ggml_unary_op
nullptr, // GGML_UNARY_OP_ABS
nullptr, // GGML_UNARY_OP_SGN
nullptr, // GGML_UNARY_OP_NEG
nullptr, // GGML_UNARY_OP_STEP
nullptr, // GGML_UNARY_OP_TANH
nullptr, // GGML_UNARY_OP_ELU
nullptr, // GGML_UNARY_OP_RELU
nullptr, // GGML_UNARY_OP_SIGMOID
QNN_OP_GELU, // GGML_UNARY_OP_GELU
nullptr, // GGML_UNARY_OP_GELU_QUICK
nullptr, // GGML_UNARY_OP_SILU
nullptr, // GGML_UNARY_OP_HARDSWISH
nullptr, // GGML_UNARY_OP_HARDSIGMOID
nullptr, // GGML_UNARY_OP_EXP
};
static_assert(sizeof(kGgmlOpToQnnOp) / sizeof(kGgmlOpToQnnOp[0]) == (GGML_OP_COUNT + GGML_UNARY_OP_COUNT),
"GGML_OP_COUNT does not match the size of the kGgmlOpToQnnOp table");
static_assert(kGgmlOpToQnnOp[GGML_UNARY_OP_GELU + kGgmlUnaryOpStart] != nullptr,
"GGML_UNARY_OP_GELU does not correspond to QNN_OP_GELU");
template <size_t _InputSize>
qnn::ggml_qnn_graph *get_qnn_graph_from_cache(ggml_backend_qnn_device_context *ctx, size_t op,
const std::array<ggml_tensor *, _InputSize> &inputs,
ggml_tensor *output) {
GGML_ASSERT(op < (GGML_OP_COUNT + GGML_UNARY_OP_COUNT));
if (cgraph->n_nodes > 1) {
auto *last_op = cgraph->nodes[cgraph->n_nodes - 1];
output += qnn::get_ggml_type_name(last_op->type);
output += '_';
append_tensor_dimensions(last_op, output);
}
}
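To make the key format concrete: a single-node graph holding one MUL_MAT of a 256x16x10 f32 tensor with a 256x1x10 f32 tensor would produce the key MUL_MATf32_256x16x10f32_256x1x10f32 (op descriptor, output type, then one _shape+type segment per input). The following stand-alone snippet reproduces just that formatting; it is illustrative only, not part of the backend, and the shapes are invented.

#include <cstdio>
#include <string>
#include <vector>

// mirrors append_tensor_dimensions above: "NExNEx..." followed by the type name
static void append_dims(const std::vector<long> &ne, const char *type, std::string &out) {
    char buf[32];
    for (size_t i = 0; i < ne.size(); ++i) {
        std::snprintf(buf, sizeof(buf), i == 0 ? "%ld" : "x%ld", ne[i]);
        out += buf;
    }
    out += type;
}

int main() {
    std::string key = "MUL_MAT";              // ggml_op_desc(op)
    key += "f32";                             // output type of the op
    key += '_';
    append_dims({256, 16, 10}, "f32", key);   // src0 shape + type
    key += '_';
    append_dims({256, 1, 10}, "f32", key);    // src1 shape + type
    std::printf("%s\n", key.c_str());         // MUL_MATf32_256x16x10f32_256x1x10f32
    return 0;
}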
qnn::qnn_graph *get_qnn_graph_from_cache(ggml_backend_qnn_device_context *ctx, ggml_tensor *output) {
auto &graph_cache = ctx->qnn_graph_cache;
const auto *op_name =
op < kGgmlUnaryOpStart ? ggml_op_name(ggml_op(op)) : ggml_unary_op_name(ggml_unary_op(op - kGgmlUnaryOpStart));
auto graph_key = get_graph_key<_InputSize, 1>(op_name, inputs, {output});
std::string graph_key;
get_graph_key_from_op(output, graph_key);
auto it = graph_cache.find(graph_key);
qnn::ggml_qnn_graph *graph_ptr = nullptr;
qnn::qnn_graph *graph_ptr = nullptr;
if (it != graph_cache.end()) {
QNN_LOG_DEBUG("[%s]found graph %s in cache", qnn::get_backend_name(ctx->device), graph_key.c_str());
graph_ptr = it->second.get();
} else {
auto graph =
std::make_unique<qnn::ggml_qnn_graph>(graph_key, ctx->device, ctx->instance, ctx->socinfo.vtcm_size_in_mb);
std::make_unique<qnn::qnn_graph>(graph_key, ctx->device, ctx->instance, ctx->socinfo.vtcm_size_in_mb);
if (!graph->is_valid()) {
return nullptr;
}
auto op_constructor = qnn::create_op_constructor(kGgmlOpToQnnOp[op]);
if (!graph->build_graph(op_constructor, to_ggml_tensor_array<_InputSize>(inputs),
to_ggml_tensor_array<1>({output}))) {
QNN_LOG_ERROR("[%s]build_graph failed", qnn::get_backend_name(ctx->device));
if (!graph->build_graph_from_op(output)) {
QNN_LOG_ERROR("[%s]build_graph_from_op failed", qnn::get_backend_name(ctx->device));
return nullptr;
}
@ -274,22 +180,54 @@ qnn::ggml_qnn_graph *get_qnn_graph_from_cache(ggml_backend_qnn_device_context *c
return graph_ptr;
}
template <ggml_op _GgmlOp>
bool qnn_binary_op_impl(ggml_backend_qnn_device_context *ctx, ggml_tensor *src0, ggml_tensor *src1, ggml_tensor *dst) {
static_assert(kGgmlOpToQnnOp[_GgmlOp] != nullptr, "GGML_OP does not have a corresponding QNN_OP");
CHECK_PARAMS(ctx, src0, src1, dst);
bool succeed = false;
auto *graph_ptr = get_qnn_graph_from_cache<2>(ctx, _GgmlOp, {src0, src1}, dst);
if (graph_ptr) {
succeed = execute_graph<2>(graph_ptr, {src0, src1}, dst);
qnn::qnn_graph *get_qnn_graph_from_cache(ggml_backend_qnn_device_context *ctx, const ggml_cgraph *cgraph) {
auto &graph_cache = ctx->qnn_graph_cache;
std::string graph_key;
get_graph_key_from_cgraph(cgraph, graph_key);
if (graph_key.empty()) {
QNN_LOG_DEBUG("[%s]empty graph key for cgraph: %p, size: %d", qnn::get_backend_name(ctx->device), cgraph,
(int)cgraph->n_nodes);
return nullptr;
}
auto it = graph_cache.find(graph_key);
qnn::qnn_graph *graph_ptr = nullptr;
if (it != graph_cache.end()) {
QNN_LOG_DEBUG("[%s]found graph %s in cache", qnn::get_backend_name(ctx->device), graph_key.c_str());
graph_ptr = it->second.get();
} else {
auto graph =
std::make_unique<qnn::qnn_graph>(graph_key, ctx->device, ctx->instance, ctx->socinfo.vtcm_size_in_mb);
if (!graph->is_valid()) {
return nullptr;
}
if (!graph->build_graph_from_ggml_graph(cgraph)) {
QNN_LOG_ERROR("[%s]build_graph_from_op failed", qnn::get_backend_name(ctx->device));
return nullptr;
}
graph_ptr = graph.get();
graph_cache[graph_key] = std::move(graph);
}
return graph_ptr;
}
bool qnn_generic_op_impl(ggml_backend_qnn_device_context *ctx, ggml_tensor *dst) {
if (!qnn_is_op_valid(ctx, dst)) {
return false;
}
auto *graph_ptr = get_qnn_graph_from_cache(ctx, dst);
bool succeed = graph_ptr && execute_graph(graph_ptr, dst);
#ifndef NDEBUG
if (!succeed) {
print_ggml_tensor(src0);
print_ggml_tensor(src1);
const auto param_count = qnn::get_qnn_op_input_param_count(qnn::get_qnn_op_index(dst));
for (size_t i = 0; i < param_count; ++i) {
print_ggml_tensor(dst->src[i]);
}
print_ggml_tensor(dst);
}
#endif
@ -297,219 +235,76 @@ bool qnn_binary_op_impl(ggml_backend_qnn_device_context *ctx, ggml_tensor *src0,
return succeed;
}
template <size_t _GgmlOp>
bool qnn_unary_op_impl(ggml_backend_qnn_device_context *ctx, ggml_tensor *src, ggml_tensor *dst) {
static_assert(kGgmlOpToQnnOp[_GgmlOp] != nullptr, "GGML_OP does not have a corresponding QNN_OP");
CHECK_PARAMS(ctx, src, dst);
bool succeed = false;
auto *graph_ptr = get_qnn_graph_from_cache<1>(ctx, _GgmlOp, {src}, dst);
if (graph_ptr) {
succeed = execute_graph<1>(graph_ptr, {src}, dst);
}
#ifndef NDEBUG
if (!succeed) {
print_ggml_tensor(src);
print_ggml_tensor(dst);
}
#endif
return succeed;
}
bool qnn_unary_nop_impl(ggml_backend_qnn_device_context *ctx, ggml_tensor *src, ggml_tensor *dst) {
bool qnn_nop_impl(ggml_backend_qnn_device_context *ctx, ggml_tensor *dst) {
GGML_UNUSED(ctx);
GGML_UNUSED(src);
GGML_UNUSED(dst);
return true;
}
bool qnn_binary_nop_impl(ggml_backend_qnn_device_context *ctx, ggml_tensor *src0, ggml_tensor *src1, ggml_tensor *dst) {
GGML_UNUSED(ctx);
GGML_UNUSED(src0);
GGML_UNUSED(src1);
GGML_UNUSED(dst);
return true;
}
constexpr const ggml_qnn_op_t kQnnOpsTable[] = {
qnn_nop_impl, // GGML_OP_NONE
nullptr, // GGML_OP_DUP
qnn_generic_op_impl, // GGML_OP_ADD
nullptr, // GGML_OP_ADD1
nullptr, // GGML_OP_ACC
qnn_generic_op_impl, // GGML_OP_SUB
qnn_generic_op_impl, // GGML_OP_MUL
qnn_generic_op_impl, // GGML_OP_DIV
nullptr, // GGML_OP_SQR
qnn_generic_op_impl, // GGML_OP_SQRT
qnn_generic_op_impl, // GGML_OP_LOG
nullptr, // GGML_OP_SIN
nullptr, // GGML_OP_COS
nullptr, // GGML_OP_SUM
nullptr, // GGML_OP_SUM_ROWS
nullptr, // GGML_OP_MEAN
nullptr, // GGML_OP_ARGMAX
nullptr, // GGML_OP_COUNT_EQUAL
nullptr, // GGML_OP_REPEAT
nullptr, // GGML_OP_REPEAT_BACK
nullptr, // GGML_OP_CONCAT
nullptr, // GGML_OP_SILU_BACK
nullptr, // GGML_OP_NORM
nullptr, // GGML_OP_RMS_NORM
nullptr, // GGML_OP_RMS_NORM_BACK
nullptr, // GGML_OP_GROUP_NORM
constexpr const ggml_qnn_unary_op_t kQnnUnaryOpsTable[] = {
qnn_unary_nop_impl, // GGML_OP_NONE
nullptr, // GGML_OP_DUP
nullptr, // GGML_OP_ADD
nullptr, // GGML_OP_ADD1
nullptr, // GGML_OP_ACC
nullptr, // GGML_OP_SUB
nullptr, // GGML_OP_MUL
nullptr, // GGML_OP_DIV
nullptr, // GGML_OP_SQR
qnn_unary_op_impl<GGML_OP_SQRT>, // GGML_OP_SQRT
qnn_unary_op_impl<GGML_OP_LOG>, // GGML_OP_LOG
nullptr, // GGML_OP_SIN
nullptr, // GGML_OP_COS
nullptr, // GGML_OP_SUM
nullptr, // GGML_OP_SUM_ROWS
nullptr, // GGML_OP_MEAN
nullptr, // GGML_OP_ARGMAX
nullptr, // GGML_OP_COUNT_EQUAL
nullptr, // GGML_OP_REPEAT
nullptr, // GGML_OP_REPEAT_BACK
nullptr, // GGML_OP_CONCAT
nullptr, // GGML_OP_SILU_BACK
nullptr, // GGML_OP_NORM
nullptr, // GGML_OP_RMS_NORM
nullptr, // GGML_OP_RMS_NORM_BACK
nullptr, // GGML_OP_GROUP_NORM
qnn_generic_op_impl, // GGML_OP_MUL_MAT
nullptr, // GGML_OP_MUL_MAT_ID
nullptr, // GGML_OP_OUT_PROD
nullptr, // GGML_OP_MUL_MAT
nullptr, // GGML_OP_MUL_MAT_ID
nullptr, // GGML_OP_OUT_PROD
nullptr, // GGML_OP_SCALE
nullptr, // GGML_OP_SET
nullptr, // GGML_OP_CPY
nullptr, // GGML_OP_CONT
qnn_unary_nop_impl, // GGML_OP_RESHAPE
qnn_unary_nop_impl, // GGML_OP_VIEW
qnn_unary_nop_impl, // GGML_OP_PERMUTE
qnn_unary_nop_impl, // GGML_OP_TRANSPOSE
qnn_unary_nop_impl, // GGML_OP_GET_ROWS
nullptr, // GGML_OP_GET_ROWS_BACK
nullptr, // GGML_OP_DIAG
nullptr, // GGML_OP_DIAG_MASK_INF
nullptr, // GGML_OP_DIAG_MASK_ZERO
nullptr, // GGML_OP_SOFT_MAX
nullptr, // GGML_OP_SOFT_MAX_BACK
nullptr, // GGML_OP_ROPE
nullptr, // GGML_OP_ROPE_BACK
nullptr, // GGML_OP_CLAMP
nullptr, // GGML_OP_CONV_TRANSPOSE_1D
nullptr, // GGML_OP_IM2COL
nullptr, // GGML_OP_IM2COL_BACK
nullptr, // GGML_OP_CONV_TRANSPOSE_2D
nullptr, // GGML_OP_POOL_1D
nullptr, // GGML_OP_POOL_2D
nullptr, // GGML_OP_POOL_2D_BACK
nullptr, // GGML_OP_UPSCALE
nullptr, // GGML_OP_PAD
nullptr, // GGML_OP_PAD_REFLECT_1D
nullptr, // GGML_OP_ARANGE
nullptr, // GGML_OP_TIMESTEP_EMBEDDING
nullptr, // GGML_OP_ARGSORT
nullptr, // GGML_OP_LEAKY_RELU
nullptr, // GGML_OP_FLASH_ATTN_EXT
nullptr, // GGML_OP_FLASH_ATTN_BACK
nullptr, // GGML_OP_SSM_CONV
nullptr, // GGML_OP_SSM_SCAN
nullptr, // GGML_OP_WIN_PART
nullptr, // GGML_OP_WIN_UNPART
nullptr, // GGML_OP_GET_REL_POS
nullptr, // GGML_OP_ADD_REL_POS
nullptr, // GGML_OP_RWKV_WKV6
nullptr, // GGML_OP_UNARY
nullptr, // GGML_OP_MAP_UNARY
nullptr, // GGML_OP_MAP_BINARY
nullptr, // GGML_OP_MAP_CUSTOM1_F32
nullptr, // GGML_OP_MAP_CUSTOM2_F32
nullptr, // GGML_OP_MAP_CUSTOM3_F32
nullptr, // GGML_OP_MAP_CUSTOM1
nullptr, // GGML_OP_MAP_CUSTOM2
nullptr, // GGML_OP_MAP_CUSTOM3
nullptr, // GGML_OP_CROSS_ENTROPY_LOSS
nullptr, // GGML_OP_CROSS_ENTROPY_LOSS_BACK
nullptr, // GGML_OP_OPT_STEP_ADAMW
// ggml_unary_op
nullptr, // GGML_UNARY_OP_ABS
nullptr, // GGML_UNARY_OP_SGN
nullptr, // GGML_UNARY_OP_NEG
nullptr, // GGML_UNARY_OP_STEP
nullptr, // GGML_UNARY_OP_TANH
nullptr, // GGML_UNARY_OP_ELU
nullptr, // GGML_UNARY_OP_RELU
nullptr, // GGML_UNARY_OP_SIGMOID
qnn_unary_op_impl<GGML_UNARY_OP_GELU + kGgmlUnaryOpStart>, // GGML_UNARY_OP_GELU
nullptr, // GGML_UNARY_OP_GELU_QUICK
nullptr, // GGML_UNARY_OP_SILU
nullptr, // GGML_UNARY_OP_HARDSWISH
nullptr, // GGML_UNARY_OP_HARDSIGMOID
nullptr, // GGML_UNARY_OP_EXP
};
static_assert(sizeof(kQnnUnaryOpsTable) / sizeof(kQnnUnaryOpsTable[0]) == (GGML_OP_COUNT + GGML_UNARY_OP_COUNT),
"GGML_OP_COUNT does not match the size of the kQnnUnaryOpsTable table");
constexpr const ggml_qnn_binary_op_t kQnnBinaryOpsTable[] = {
nullptr, // GGML_OP_NONE
nullptr, // GGML_OP_DUP
qnn_binary_op_impl<GGML_OP_ADD>, // GGML_OP_ADD
nullptr, // GGML_OP_ADD1
nullptr, // GGML_OP_ACC
qnn_binary_op_impl<GGML_OP_SUB>, // GGML_OP_SUB
qnn_binary_op_impl<GGML_OP_MUL>, // GGML_OP_MUL
qnn_binary_op_impl<GGML_OP_DIV>, // GGML_OP_DIV
nullptr, // GGML_OP_SQR
nullptr, // GGML_OP_SQRT
nullptr, // GGML_OP_LOG
nullptr, // GGML_OP_SIN
nullptr, // GGML_OP_COS
nullptr, // GGML_OP_SUM
nullptr, // GGML_OP_SUM_ROWS
nullptr, // GGML_OP_MEAN
nullptr, // GGML_OP_ARGMAX
nullptr, // GGML_OP_COUNT_EQUAL
nullptr, // GGML_OP_REPEAT
nullptr, // GGML_OP_REPEAT_BACK
nullptr, // GGML_OP_CONCAT
nullptr, // GGML_OP_SILU_BACK
nullptr, // GGML_OP_NORM
nullptr, // GGML_OP_RMS_NORM
nullptr, // GGML_OP_RMS_NORM_BACK
nullptr, // GGML_OP_GROUP_NORM
qnn_binary_op_impl<GGML_OP_MUL_MAT>, // GGML_OP_MUL_MAT
nullptr, // GGML_OP_MUL_MAT_ID
nullptr, // GGML_OP_OUT_PROD
nullptr, // GGML_OP_SCALE
nullptr, // GGML_OP_SET
nullptr, // GGML_OP_CPY
nullptr, // GGML_OP_CONT
nullptr, // GGML_OP_RESHAPE
nullptr, // GGML_OP_VIEW
nullptr, // GGML_OP_PERMUTE
nullptr, // GGML_OP_TRANSPOSE
nullptr, // GGML_OP_GET_ROWS
nullptr, // GGML_OP_GET_ROWS_BACK
nullptr, // GGML_OP_DIAG
nullptr, // GGML_OP_DIAG_MASK_INF
nullptr, // GGML_OP_DIAG_MASK_ZERO
nullptr, // GGML_OP_SOFT_MAX
nullptr, // GGML_OP_SOFT_MAX_BACK
nullptr, // GGML_OP_ROPE
nullptr, // GGML_OP_ROPE_BACK
nullptr, // GGML_OP_CLAMP
nullptr, // GGML_OP_CONV_TRANSPOSE_1D
nullptr, // GGML_OP_IM2COL
nullptr, // GGML_OP_IM2COL_BACK
nullptr, // GGML_OP_CONV_TRANSPOSE_2D
nullptr, // GGML_OP_POOL_1D
nullptr, // GGML_OP_POOL_2D
nullptr, // GGML_OP_POOL_2D_BACK
nullptr, // GGML_OP_UPSCALE
nullptr, // GGML_OP_PAD
nullptr, // GGML_OP_PAD_REFLECT_1D
nullptr, // GGML_OP_ARANGE
nullptr, // GGML_OP_TIMESTEP_EMBEDDING
nullptr, // GGML_OP_ARGSORT
nullptr, // GGML_OP_LEAKY_RELU
nullptr, // GGML_OP_SCALE
nullptr, // GGML_OP_SET
nullptr, // GGML_OP_CPY
nullptr, // GGML_OP_CONT
qnn_nop_impl, // GGML_OP_RESHAPE
nullptr, // GGML_OP_VIEW
nullptr, // GGML_OP_PERMUTE
nullptr, // GGML_OP_TRANSPOSE
nullptr, // GGML_OP_GET_ROWS
nullptr, // GGML_OP_GET_ROWS_BACK
nullptr, // GGML_OP_DIAG
nullptr, // GGML_OP_DIAG_MASK_INF
nullptr, // GGML_OP_DIAG_MASK_ZERO
nullptr, // GGML_OP_SOFT_MAX
nullptr, // GGML_OP_SOFT_MAX_BACK
nullptr, // GGML_OP_ROPE
nullptr, // GGML_OP_ROPE_BACK
nullptr, // GGML_OP_CLAMP
nullptr, // GGML_OP_CONV_TRANSPOSE_1D
nullptr, // GGML_OP_IM2COL
nullptr, // GGML_OP_IM2COL_BACK
nullptr, // GGML_OP_CONV_TRANSPOSE_2D
nullptr, // GGML_OP_POOL_1D
nullptr, // GGML_OP_POOL_2D
nullptr, // GGML_OP_POOL_2D_BACK
nullptr, // GGML_OP_UPSCALE
nullptr, // GGML_OP_PAD
nullptr, // GGML_OP_PAD_REFLECT_1D
nullptr, // GGML_OP_ARANGE
nullptr, // GGML_OP_TIMESTEP_EMBEDDING
nullptr, // GGML_OP_ARGSORT
nullptr, // GGML_OP_LEAKY_RELU
nullptr, // GGML_OP_FLASH_ATTN_EXT
nullptr, // GGML_OP_FLASH_ATTN_BACK
@ -537,10 +332,36 @@ constexpr const ggml_qnn_binary_op_t kQnnBinaryOpsTable[] = {
nullptr, // GGML_OP_CROSS_ENTROPY_LOSS
nullptr, // GGML_OP_CROSS_ENTROPY_LOSS_BACK
nullptr, // GGML_OP_OPT_STEP_ADAMW
// ggml_unary_op
nullptr, // GGML_UNARY_OP_ABS
nullptr, // GGML_UNARY_OP_SGN
nullptr, // GGML_UNARY_OP_NEG
nullptr, // GGML_UNARY_OP_STEP
nullptr, // GGML_UNARY_OP_TANH
nullptr, // GGML_UNARY_OP_ELU
nullptr, // GGML_UNARY_OP_RELU
nullptr, // GGML_UNARY_OP_SIGMOID
qnn_generic_op_impl, // GGML_UNARY_OP_GELU
nullptr, // GGML_UNARY_OP_GELU_QUICK
nullptr, // GGML_UNARY_OP_SILU
nullptr, // GGML_UNARY_OP_HARDSWISH
nullptr, // GGML_UNARY_OP_HARDSIGMOID
nullptr, // GGML_UNARY_OP_EXP
};
static_assert(sizeof(kQnnBinaryOpsTable) / sizeof(kQnnBinaryOpsTable[0]) == GGML_OP_COUNT,
"GGML_OP_COUNT does not match the size of the kQnnBinaryOpsTable table");
static_assert(kQnnOpsTable[GGML_OP_NONE] == qnn_nop_impl, "GGML_OP_NONE does not match the qnn_nop_impl function");
static_assert(kQnnOpsTable[GGML_OP_ADD] == qnn_generic_op_impl,
"GGML_OP_ADD does not match the qnn_generic_op_impl function");
static_assert(kQnnOpsTable[GGML_OP_MUL] == qnn_generic_op_impl,
"GGML_OP_MUL does not match the qnn_generic_op_impl function");
static_assert(kQnnOpsTable[GGML_OP_MUL_MAT] == qnn_generic_op_impl,
"GGML_OP_MUL_MAT does not match the qnn_generic_op_impl function");
static_assert(kQnnOpsTable[GGML_OP_RESHAPE] == qnn_nop_impl,
"GGML_OP_RESHAPE does not match the qnn_nop_impl function");
static_assert(kQnnOpsTable[GGML_OP_VIEW] == nullptr, "GGML_OP_VIEW is not nullptr");
static_assert(std::size(kQnnOpsTable) == (GGML_OP_COUNT + GGML_UNARY_OP_COUNT),
"GGML_OP_COUNT does not match the size of the kQnnOpsTable table");
bool ggml_qnn_supports_tensor(ggml_backend_qnn_device_context *ctx, const ggml_tensor *tensor) {
if (!tensor) {
@ -548,6 +369,7 @@ bool ggml_qnn_supports_tensor(ggml_backend_qnn_device_context *ctx, const ggml_t
return false;
}
#ifndef NDEBUG
if (tensor->view_src) {
auto *src_tensor = tensor->view_src;
QNN_LOG_DEBUG("[%s]tensor(%s_%dx%dx%dx%d) is a view, src: %s_%dx%dx%dx%d", qnn::get_backend_name(ctx->device),
@ -555,6 +377,7 @@ bool ggml_qnn_supports_tensor(ggml_backend_qnn_device_context *ctx, const ggml_t
ggml_get_name(src_tensor), src_tensor->ne[0], src_tensor->ne[1], src_tensor->ne[2],
src_tensor->ne[3]);
}
#endif
switch (tensor->type) {
case GGML_TYPE_F32:
@ -576,6 +399,25 @@ bool ggml_qnn_supports_tensor(ggml_backend_qnn_device_context *ctx, const ggml_t
return true;
}
bool ggnl_qnn_supports_op_tensor(ggml_backend_qnn_device_context *ctx, const ggml_tensor *op) {
if (op->op == GGML_OP_NONE) {
return true;
}
if (!ggml_qnn_supports_tensor(ctx, op)) {
return false;
}
const auto param_count = qnn::get_qnn_op_input_param_count(qnn::get_qnn_op_index(op));
for (size_t i = 0; i < param_count; ++i) {
if (!ggml_qnn_supports_tensor(ctx, op->src[i])) {
return false;
}
}
return true;
}
bool ggml_qnn_supports_matmul_op(ggml_backend_qnn_device_context *ctx, const ggml_tensor *op) {
constexpr const size_t kMaxNpuTensorSize = 8192L * 2048 + 8192 * 512 + 2048 * 512;
constexpr const auto get_tensor_size = [](const ggml_tensor *tensor) -> size_t {
@ -591,11 +433,11 @@ bool ggml_qnn_supports_matmul_op(ggml_backend_qnn_device_context *ctx, const ggm
* TODO: remove the blocker here when NPU backend supports mul_mat like this:
* [ne03, ne02, n, k] * [ne03 * x, ne02 * y, m, k] -> [ne03 * x, ne02 * y, m, n]
*/
QNN_LOG_DEBUG("[qnn-npu] src0 and src1 dimensions are not equal, support/unsupported: %d/%d",
QNN_LOG_DEBUG("[qnn-npu][MUL_MAT]src0 and src1 dimensions are not equal, support/unsupported: %d/%d",
ctx->support_op_count.load(), ++(ctx->unsupported_op_count));
return false;
} else if (get_tensor_size(src0) + get_tensor_size(src1) + get_tensor_size(op) >= kMaxNpuTensorSize) {
QNN_LOG_DEBUG("[qnn-npu] tensor size is too large, support/unsupported: %d/%d",
QNN_LOG_DEBUG("[qnn-npu][MUL_MAT]tensor size is too large, support/unsupported: %d/%d",
ctx->support_op_count.load(), ++(ctx->unsupported_op_count));
return false;
}
@ -604,9 +446,9 @@ bool ggml_qnn_supports_matmul_op(ggml_backend_qnn_device_context *ctx, const ggm
case QNN_BACKEND_GPU:
if (src0->type != src1->type || src0->type != op->type) {
// there's no convert op for GPU.
QNN_LOG_DEBUG("[qnn-gpu]type src0(%d), src1(%d) and op(%d) are not equal, support/unsupported: %d/%d",
src0->type, src1->type, op->type, ctx->support_op_count.load(),
++(ctx->unsupported_op_count));
QNN_LOG_DEBUG(
"[qnn-gpu][MUL_MAT]type src0(%d), src1(%d) and op(%d) are not equal, support/unsupported: %d/%d",
src0->type, src1->type, op->type, ctx->support_op_count.load(), ++(ctx->unsupported_op_count));
return false;
}
break;
@ -615,12 +457,12 @@ bool ggml_qnn_supports_matmul_op(ggml_backend_qnn_device_context *ctx, const ggm
}
if ((src1->ne[2] % src0->ne[2]) != 0 || (src1->ne[3] % src0->ne[3]) != 0) {
QNN_LOG_DEBUG("[%s] src0 and src1 dimensions are not equal, support/unsupported: %d/%d",
QNN_LOG_DEBUG("[%s][MUL_MAT]src0 and src1 dimensions are not equal, support/unsupported: %d/%d",
qnn::get_backend_name(ctx->device), ctx->support_op_count.load(), ++(ctx->unsupported_op_count));
return false;
}
QNN_LOG_DEBUG("[%s] supported matmul op, support/unsupported: %d/%d", qnn::get_backend_name(ctx->device),
QNN_LOG_DEBUG("[%s][MUL_MAT]supported matmul op, support/unsupported: %d/%d", qnn::get_backend_name(ctx->device),
++(ctx->support_op_count), ctx->unsupported_op_count.load());
return true;
}
@ -635,41 +477,30 @@ bool device_supports_op(ggml_backend_qnn_device_context *ctx, const ggml_tensor
return true;
}
auto *src0 = op->src[0];
if (!kQnnOpsTable[qnn::get_qnn_op_index(op)]) {
QNN_LOG_DEBUG("[%s]unsupported op", ggml_op_name(op->op));
return false;
}
if (!ggnl_qnn_supports_op_tensor(ctx, op)) {
QNN_LOG_DEBUG("[%s]unsupported tensor", ggml_op_name(op->op));
return false;
}
if (op->op == GGML_OP_UNARY) {
const auto unary_op = ggml_get_unary_op(op);
if (unary_op == GGML_UNARY_OP_GELU && ctx->device == QNN_BACKEND_NPU) {
// TODO: fix this when NPU supports GELU
QNN_LOG_DEBUG("unsupported unary op GGML_UNARY_OP_GELU for NPU");
return false;
}
if (!kQnnUnaryOpsTable[kGgmlUnaryOpStart + unary_op]) {
QNN_LOG_DEBUG("unsupported unary op %d", unary_op);
return false;
}
if (!ggml_qnn_supports_tensor(ctx, src0) || !ggml_qnn_supports_tensor(ctx, op)) {
QNN_LOG_DEBUG("src0 is nullptr");
if (unary_op == GGML_UNARY_OP_GELU) {
// TODO: fix this
QNN_LOG_DEBUG("[GELU]unsupported unary op GGML_UNARY_OP_GELU for NPU");
return false;
}
} else {
if (!kQnnUnaryOpsTable[op->op] && !kQnnBinaryOpsTable[op->op]) {
QNN_LOG_DEBUG("[%s] unsupported op", ggml_op_name(op->op));
return false;
}
auto *src0 = op->src[0];
auto *src1 = op->src[1];
if (!ggml_qnn_supports_tensor(ctx, src0) || !ggml_qnn_supports_tensor(ctx, op) ||
(kQnnBinaryOpsTable[op->op] && !ggml_qnn_supports_tensor(ctx, src1))) {
QNN_LOG_DEBUG("[%s] unsupported tensor", ggml_op_name(op->op));
return false;
}
switch (op->op) {
case GGML_OP_ADD:
if (!is_tensor_dimensions_equal(src0, src1)) {
QNN_LOG_DEBUG("src0 and src1 dimensions are not equal");
if (!ggml_are_same_shape(src0, src1)) {
QNN_LOG_DEBUG("[ADD] src0 and src1 dimensions are not equal");
return false;
}
break;
@ -686,34 +517,13 @@ bool device_supports_op(ggml_backend_qnn_device_context *ctx, const ggml_tensor
}
bool device_compute_graph(ggml_backend_qnn_device_context *ctx, ggml_cgraph *cgraph) {
QNN_LOG_DEBUG("[%s]compute graph, nodes count: %d", qnn::get_backend_name(ctx->device), (int)cgraph->n_nodes);
for (int i = 0; i < cgraph->n_nodes; i++) {
ggml_tensor *tensor = cgraph->nodes[i];
if (ggml_is_empty(tensor)) {
continue;
}
QNN_LOG_DEBUG("[%s]compute graph start, nodes count: %d", qnn::get_backend_name(ctx->device), (int)cgraph->n_nodes);
size_t unary_op_idx = tensor->op;
if (tensor->op == GGML_OP_UNARY) {
unary_op_idx = kGgmlUnaryOpStart + ggml_get_unary_op(tensor);
}
auto qnn_graph = get_qnn_graph_from_cache(ctx, cgraph);
bool success = qnn_graph && qnn_graph->execute(cgraph);
bool ok = false;
auto unary_op = kQnnUnaryOpsTable[unary_op_idx];
auto binary_op = kQnnBinaryOpsTable[tensor->op];
if (unary_op) {
ok = unary_op(ctx, tensor->src[0], tensor);
} else if (binary_op) {
ok = binary_op(ctx, tensor->src[0], tensor->src[1], tensor);
}
if (!ok) {
QNN_LOG_WARN("[%s]unsupported op %s", qnn::get_backend_name(ctx->device), ggml_op_desc(tensor));
return false;
}
}
return true;
QNN_LOG_DEBUG("[%s]compute graph, success: %d", qnn::get_backend_name(ctx->device), (int)success);
return success;
}
} // namespace qnn

View File

@ -19,7 +19,7 @@
#include "qnn-lib.hpp"
namespace qnn {
typedef std::unordered_map<std::string, std::unique_ptr<qnn::ggml_qnn_graph>> ggml_qnn_graph_cache_t;
typedef std::unordered_map<std::string, std::unique_ptr<qnn::qnn_graph>> qnn_graph_cache_t;
} // namespace qnn
struct ggml_backend_qnn_device_context {
@ -35,7 +35,7 @@ struct ggml_backend_qnn_device_context {
std::shared_ptr<qnn::qnn_instance> instance;
std::shared_ptr<qnn::qnn_interface> qnn_interface;
qnn::ggml_qnn_graph_cache_t qnn_graph_cache;
qnn::qnn_graph_cache_t qnn_graph_cache;
#ifndef NDEBUG
std::atomic_uint32_t support_op_count = 0;

View File

@ -8,18 +8,65 @@
namespace qnn {
/**
* @brief An interface for managing generic QNN buffers.
*
* This abstract class defines the interface for managing generic memory buffers in a QNN context.
*/
class qnn_buffer_interface {
public:
virtual ~qnn_buffer_interface() = default;
/**
* @brief Checks if the buffer is valid.
*
* This pure virtual function must be implemented by derived classes to check
* the validity of the buffer.
*
* @return true if the buffer is valid, false otherwise.
*/
virtual bool is_valid() const = 0;
/**
* @brief Gets the buffer pointer.
*
* This pure virtual function must be implemented by derived classes to return
* a pointer to the buffer.
*
* @return A pointer to the buffer.
*/
virtual uint8_t *get_buffer() = 0;
/**
* @brief Gets the buffer size.
*
* This pure virtual function must be implemented by derived classes to return
* the size of the buffer in bytes.
*
* @return The size of the buffer in bytes.
*/
virtual size_t get_size() const = 0;
/**
* @brief Gets the QNN memory handle associated with the buffer.
*
* This pure virtual function must be implemented by derived classes to return
* the memory handle associated with the buffer.
*
* @return The memory handle, or null if no valid QNN memory handle is attached.
*/
virtual Qnn_MemHandle_t get_mem_handle() const = 0;
};
using qnn_buffer_ptr = std::shared_ptr<qnn_buffer_interface>;
/**
* @brief A class for managing QNN RPC memory buffers.
*
* This class is responsible for allocating, registering, and managing a buffer in RPC memory.
* It ensures that the buffer is properly allocated and registered with the QNN instance, and
* handles cleanup of the buffer and its associated memory handle upon destruction.
*/
class qnn_rpc_buffer : public qnn_buffer_interface {
public:
qnn_rpc_buffer(std::shared_ptr<qnn_instance> qnn_instance, const size_t size, const uint32_t rank,
@ -29,7 +76,7 @@ public:
_qnn_rpc_buffer = static_cast<uint8_t *>(qnn_instance->alloc_rpcmem(size, alignof(uint8_t *)));
_qnn_rpc_mem_handle = qnn_instance->register_rpcmem(_qnn_rpc_buffer, rank, dimensions, data_type);
if (!_qnn_rpc_buffer || !_qnn_rpc_mem_handle) {
QNN_LOG_WARN("register rpc mem failure");
QNN_LOG_WARN("Failed to register RPC memory: buffer or memory handle is null");
// let the destructor free the buffer
return;
}
@ -64,6 +111,13 @@ private:
DISABLE_MOVE(qnn_rpc_buffer);
};
/**
* @brief A class for managing QNN memory buffers allocated in regular memory.
*
* This class is responsible for allocating, managing, and freeing memory buffers
* in regular (non-RPC) memory. It implements the qnn_buffer_interface to provide
* a consistent interface for buffer management.
*/
class qnn_mem_buffer : public qnn_buffer_interface {
public:
explicit qnn_mem_buffer(const uint8_t *data, size_t size) {
@ -102,4 +156,24 @@ private:
DISABLE_MOVE(qnn_mem_buffer);
};
class qnn_mem_buffer_slice : public qnn_buffer_interface {
public:
qnn_mem_buffer_slice(const uint8_t *buffer, size_t size) : _buffer(const_cast<uint8_t *>(buffer)), _size(size) {}
bool is_valid() const override { return _buffer && _size; }
uint8_t *get_buffer() override { return _buffer; }
size_t get_size() const override { return _size; }
Qnn_MemHandle_t get_mem_handle() const override { return nullptr; }
private:
uint8_t *_buffer = nullptr;
size_t _size = 0;
DISABLE_COPY(qnn_mem_buffer_slice);
DISABLE_MOVE(qnn_mem_buffer_slice);
};
} // namespace qnn
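qnn_mem_buffer_slice is a non-owning view into memory owned elsewhere, which is what allows the new set_data_buffer path to avoid copies. A minimal, self-contained sketch of the same idea follows; owning_buffer and buffer_slice are hypothetical stand-ins, not the classes above.

#include <cstddef>
#include <cstdint>
#include <cstring>
#include <vector>

struct owning_buffer {                  // hypothetical stand-in for qnn_mem_buffer
    std::vector<uint8_t> storage;
    explicit owning_buffer(size_t size) : storage(size) {}
    uint8_t *data() { return storage.data(); }
    size_t size() const { return storage.size(); }
};

struct buffer_slice {                   // hypothetical stand-in for qnn_mem_buffer_slice
    uint8_t *ptr = nullptr;             // borrowed pointer: no ownership, nothing freed on destruction
    size_t   len = 0;
    buffer_slice(uint8_t *p, size_t n) : ptr(p), len(n) {}
};

int main() {
    owning_buffer backing(1024);                    // owns 1 KiB
    buffer_slice  view(backing.data() + 256, 128);  // 128-byte window into the same memory
    std::memset(view.ptr, 0, view.len);             // writes land in the parent buffer, no copy made
    return 0;
}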

View File

@ -222,6 +222,9 @@ bool ggml_backend_qnn_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_
GGML_UNUSED(backend_dst);
GGML_UNUSED(src);
GGML_UNUSED(dst);
QNN_LOG_DEBUG("opy form %s to %s, src_is_qnn: %d, dst_is_qnn: %d", ggml_get_name(src), ggml_get_name(dst),
(int)ggml_backend_is_qnn(backend_src), (int)ggml_backend_is_qnn(backend_dst));
return false;
}
@ -317,8 +320,6 @@ ggml_guid_t ggml_backend_qnn_guid() {
return &guid;
}
bool ggml_backend_is_qnn(ggml_backend_t backend) { return ggml_guid_matches(backend->guid, ggml_backend_qnn_guid()); }
ggml_backend_t ggml_backend_qnn_init_with_device_context(ggml_backend_dev_t dev, const char *extend_lib_search_path) {
if (!extend_lib_search_path) {
extend_lib_search_path = GGML_QNN_DEFAULT_LIB_SEARCH_PATH;
@ -420,8 +421,13 @@ bool ggml_backend_qnn_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_
}
bool ggml_backend_qnn_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor *op) {
#ifdef NDEBUG
GGML_UNUSED(dev);
GGML_UNUSED(op);
#else
auto *device_ctx = get_device_context(dev);
QNN_LOG_DEBUG("[%s][%s]offload op", qnn::get_backend_name(device_ctx->device), ggml_op_name(op->op));
#endif
return false;
}
@ -509,6 +515,8 @@ const ggml_backend_reg_i ggml_backend_qnn_reg_interface = {
} // namespace
bool ggml_backend_is_qnn(ggml_backend_t backend) { return ggml_guid_matches(backend->guid, ggml_backend_qnn_guid()); }
ggml_backend_reg_t ggml_backend_qnn_reg() {
static ggml_backend_qnn_reg_impl reg{ggml_backend_qnn_reg_interface};
return &reg;

ggml/src/ggml-qnn/graph.cpp (new file, 386 lines)

View File

@ -0,0 +1,386 @@
#include "graph.hpp"
#include <set>
#include <unordered_map>
#include "ggml-impl.h"
#include "logger.hpp"
#include "op-config.hpp"
#include "tensor.hpp"
namespace {
using qnn_tensor_cache_t = std::unordered_map<ggml_tensor *, qnn::qnn_tensor_ptr_t>;
int get_op_max_rank(const ggml_tensor *op) {
int max_rank = ggml_n_dims(op);
const int count = (int)qnn::get_qnn_op_input_param_count(qnn::get_qnn_op_index(op));
for (int i = 0; i < count; ++i) {
max_rank = std::max(max_rank, ggml_n_dims(op->src[i]));
}
return max_rank;
}
qnn::qnn_tensor_ptr_t create_tensor_with_cache(ggml_tensor *tensor, qnn::ggml_qnn_tensor::tensor_type_t type, int rank,
QNNBackend device, Qnn_GraphHandle_t graph_handle,
std::shared_ptr<qnn::qnn_instance> qnn_instance,
qnn_tensor_cache_t &tensor_cache) {
GGML_ASSERT(tensor);
if (tensor_cache.count(tensor)) {
return tensor_cache[tensor];
}
auto qnn_tensor = std::make_shared<qnn::ggml_qnn_tensor>(type, tensor->name, tensor->ne, tensor->type, rank, device,
graph_handle, qnn_instance);
tensor_cache[tensor] = qnn_tensor;
return qnn_tensor;
}
qnn::qnn_tensor_array_t create_tensors_with_cache(const qnn::ggml_tensor_array_t &ggml_tensors,
qnn::ggml_qnn_tensor::tensor_type_t type, int rank, QNNBackend device,
Qnn_GraphHandle_t graph_handle,
std::shared_ptr<qnn::qnn_instance> qnn_instance,
qnn_tensor_cache_t &tensor_cache) {
qnn::qnn_tensor_array_t tensors;
for (auto *tensor : ggml_tensors) {
tensors.push_back(
create_tensor_with_cache(tensor, type, rank, device, graph_handle, qnn_instance, tensor_cache));
}
return tensors;
}
qnn::qnn_op_config_ptr_t create_operation_from_op_tensor(ggml_tensor *dst, const std::string &name, int rank,
QNNBackend device, Qnn_GraphHandle_t graph_handle,
std::shared_ptr<qnn::qnn_instance> qnn_instance,
bool is_intermediate, qnn_tensor_cache_t &tensor_cache) {
const auto op_index = qnn::get_qnn_op_index(dst);
auto qnn_op = qnn::create_op_constructor(op_index);
auto operation = qnn_op(name, qnn_instance);
// input tensors
qnn::qnn_tensor_array_t input_qnn_tensors;
auto tensor_type = is_intermediate ? qnn::ggml_qnn_tensor::INTERMEDIATE : qnn::ggml_qnn_tensor::INPUT;
for (size_t i = 0; i < qnn::get_qnn_op_input_param_count(op_index); ++i) {
auto input_qnn_tensor =
create_tensor_with_cache(dst->src[i], tensor_type, rank, device, graph_handle, qnn_instance, tensor_cache);
input_qnn_tensors.push_back(input_qnn_tensor);
}
operation->set_input_tensors(input_qnn_tensors);
// output tensor
tensor_type = is_intermediate ? qnn::ggml_qnn_tensor::INTERMEDIATE : qnn::ggml_qnn_tensor::OUTPUT;
qnn::qnn_tensor_array_t output_qnn_tensors =
create_tensors_with_cache({dst}, tensor_type, rank, device, graph_handle, qnn_instance, tensor_cache);
operation->set_output_tensors(output_qnn_tensors);
// initialize operation
if (!operation->initialize_op_nodes(device, graph_handle)) {
QNN_LOG_ERROR("[%s][%s]initialize_op_nodes failed", qnn::get_backend_name(device), name.c_str());
return nullptr;
}
return operation;
}
bool bind_src_tensors(ggml_tensor *op, qnn::qnn_tensor_array_t &tensor_wrappers,
std::vector<Qnn_Tensor_t> &qnn_tensors) {
if (op->op == GGML_OP_NONE) {
QNN_LOG_DEBUG("op %s is not a valid op", ggml_get_name(op));
return false;
}
const auto param_count = qnn::get_qnn_op_input_param_count(qnn::get_qnn_op_index(op));
GGML_ASSERT(tensor_wrappers.size() == param_count);
qnn_tensors.resize(param_count);
for (size_t i = 0; i < param_count; ++i) {
auto *ggml_tensor = op->src[i];
if (!tensor_wrappers[i]->bind_ggml_tensor(ggml_tensor)) {
QNN_LOG_ERROR("bind tensor %s failed", ggml_get_name(ggml_tensor));
return false;
}
qnn_tensors[i] = tensor_wrappers[i]->get_qnn_tensor();
}
return true;
}
int get_io_tensors_from_graph(const ggml_cgraph *cgraph, qnn::ggml_tensor_array_t &inputs,
qnn::ggml_tensor_array_t &outputs) {
using ggml_tensor_set_t = std::set<ggml_tensor *>;
ggml_tensor_set_t input_set;
ggml_tensor_set_t output_set;
ggml_tensor_set_t visited_set;
int rank = 0;
for (int i = 0; i < cgraph->n_nodes; i++) {
ggml_tensor *dst = cgraph->nodes[i];
if (ggml_is_empty(dst)) {
continue;
}
if (dst->op == GGML_OP_NONE || dst->op == GGML_OP_VIEW) {
// TODO: remove GGML_OP_VIEW after view op is supported
continue;
}
rank = std::max(rank, ggml_n_dims(dst));
input_set.erase(dst);
if (!visited_set.count(dst)) {
output_set.insert(dst);
visited_set.insert(dst);
}
for (size_t i = 0; i < GGML_MAX_DIMS && dst->src[i]; ++i) {
auto *src = dst->src[i];
rank = std::max(rank, ggml_n_dims(src));
output_set.erase(src);
if (!visited_set.count(src)) {
input_set.insert(src);
visited_set.insert(src);
}
}
}
inputs.assign(input_set.begin(), input_set.end());
outputs.assign(output_set.begin(), output_set.end());
return rank;
}
} // namespace
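get_io_tensors_from_graph classifies tensors in a single pass over the nodes in execution order: anything produced by a node stops being a graph input, and anything consumed by a later node stops being a graph output, so only externally produced tensors and final results remain. A self-contained toy version of that bookkeeping is sketched below; the node struct and the string tensor names are invented for illustration.

#include <set>
#include <string>
#include <vector>

struct node { std::string dst; std::vector<std::string> srcs; };

int main() {
    // t2 = f(t0, t1); t3 = g(t2)  ->  graph inputs {t0, t1}, graph outputs {t3}
    std::vector<node> graph = {{"t2", {"t0", "t1"}}, {"t3", {"t2"}}};
    std::set<std::string> inputs, outputs, visited;
    for (const auto &n : graph) {
        inputs.erase(n.dst);                           // a produced tensor is not a graph input
        if (visited.insert(n.dst).second) outputs.insert(n.dst);
        for (const auto &s : n.srcs) {
            outputs.erase(s);                          // a consumed tensor is not a graph output
            if (visited.insert(s).second) inputs.insert(s);
        }
    }
    return (inputs.count("t0") && inputs.count("t1") &&
            outputs.count("t3") && !outputs.count("t2")) ? 0 : 1;
}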
namespace qnn {
qnn_graph::qnn_graph(const std::string &graph_name, QNNBackend device, std::shared_ptr<qnn_instance> qnn_instance,
size_t vtcm_size_in_mb)
: _graph_name(graph_name), _device(device), _qnn_instance(qnn_instance) {
QNN_LOG_DEBUG("[%s][%s]created", get_backend_name(device), graph_name.c_str());
auto qnn_interface = qnn_instance->get_qnn_interface();
auto qnn_context = qnn_instance->get_qnn_context_handle();
Qnn_ErrorHandle_t error = QNN_SUCCESS;
Qnn_GraphHandle_t graph_handle = nullptr;
if (device == QNN_BACKEND_NPU) {
// TODO: fix graph config here for NPU
QnnHtpGraph_CustomConfig_t hvx_config;
hvx_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_NUM_HVX_THREADS;
hvx_config.numHvxThreads = 8;
QnnGraph_Config_t graph_hvx_config;
graph_hvx_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM;
graph_hvx_config.customConfig = &hvx_config;
QnnHtpGraph_CustomConfig_t dlbc_config;
dlbc_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION;
dlbc_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_DLBC;
dlbc_config.optimizationOption.floatValue = 1.0; // set to 0.0 to turn off DLBC
QnnGraph_Config_t graph_dlbc_config;
graph_dlbc_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM;
graph_dlbc_config.customConfig = &dlbc_config;
QnnHtpGraph_CustomConfig_t opt_config;
opt_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG;
opt_config.optimizationOption.floatValue = 1; // 1 / 3
QnnGraph_Config_t graph_opt_config;
graph_opt_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM;
graph_opt_config.customConfig = &opt_config;
QnnHtpGraph_CustomConfig_t vtcm_config;
vtcm_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE;
vtcm_config.vtcmSizeInMB = vtcm_size_in_mb;
QnnGraph_Config_t graph_vtcm_config;
graph_vtcm_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM;
graph_vtcm_config.customConfig = &vtcm_config;
const QnnGraph_Config_t *graph_configs[] = {&graph_hvx_config, &graph_dlbc_config, &graph_vtcm_config,
&graph_opt_config, nullptr};
error = qnn_interface->qnn_graph_create(qnn_context, graph_name.c_str(), graph_configs, &graph_handle);
} else {
error = qnn_interface->qnn_graph_create(qnn_context, graph_name.c_str(), nullptr, &graph_handle);
}
if (error != QNN_SUCCESS) {
QNN_LOG_ERROR("[%s][%s]failed to create qnn graph, error: %s", get_backend_name(device), graph_name.c_str(),
get_qnn_error_string(error));
return;
}
QNN_LOG_INFO("[%s][%s]create succeed", get_backend_name(device), graph_name.c_str());
_graph_handle = graph_handle;
_qnn_interface = qnn_interface;
}
qnn_graph::~qnn_graph() { QNN_LOG_DEBUG("[%s][%s]destroy", get_backend_name(_device), _graph_name.c_str()); }
bool qnn_graph::build_graph_from_op(ggml_tensor *op) {
if (!is_valid()) {
QNN_LOG_ERROR("Invalid graph");
return false;
}
QNN_LOG_DEBUG("[%s][%s]build start", get_backend_name(_device), _graph_name.c_str());
qnn_tensor_cache_t tensor_cache;
const auto rank = get_op_max_rank(op);
auto operation = create_operation_from_op_tensor(op, _graph_name, rank, _device, _graph_handle, _qnn_instance,
false, tensor_cache);
if (!operation) {
QNN_LOG_ERROR("[%s][%s]create_operation_from_op_tensor failed", get_backend_name(_device), _graph_name.c_str());
return false;
}
_tensor_inputs = operation->get_input_tensors();
_tensor_outputs = operation->get_output_tensors();
_operations.push_back(std::move(operation));
if (!finalize()) {
return false;
}
QNN_LOG_DEBUG("[%s][%s]build succeed", get_backend_name(_device), _graph_name.c_str());
return true;
}
bool qnn_graph::build_graph_from_ggml_graph(const ggml_cgraph *cgraph) {
QNN_LOG_DEBUG("[%s][%s]build start", get_backend_name(_device), _graph_name.c_str());
ggml_tensor_array_t inputs;
ggml_tensor_array_t outputs;
int rank = get_io_tensors_from_graph(cgraph, inputs, outputs);
QNN_LOG_DEBUG("[%s]rank: %d, input_set: %d, output_set: %d", get_backend_name(_device), rank, int(inputs.size()),
int(outputs.size()));
{
qnn_tensor_cache_t tensor_cache;
auto input_tensors = create_tensors_with_cache(inputs, ggml_qnn_tensor::INPUT, rank, _device, _graph_handle,
_qnn_instance, tensor_cache);
auto output_tensors = create_tensors_with_cache(outputs, ggml_qnn_tensor::OUTPUT, rank, _device, _graph_handle,
_qnn_instance, tensor_cache);
qnn_op_config_array_t operations;
for (int i = 0; i < cgraph->n_nodes; i++) {
ggml_tensor *dst = cgraph->nodes[i];
if (ggml_is_empty(dst)) {
continue;
}
if (dst->op == GGML_OP_NONE || dst->op == GGML_OP_VIEW) {
// TODO: remove GGML_OP_VIEW after view op is supported
continue;
}
QNN_LOG_DEBUG("[%s]create op: %s", get_backend_name(_device), get_qnn_op_name(dst->op));
auto operation = create_operation_from_op_tensor(dst, dst->name, rank, _device, _graph_handle,
_qnn_instance, true, tensor_cache); // TODO: fix op name
operations.push_back(operation);
}
_tensor_inputs = std::move(input_tensors);
_tensor_outputs = std::move(output_tensors);
_operations = std::move(operations);
if (!finalize()) {
return false;
}
}
QNN_LOG_DEBUG("[%s][%s]build succeed", get_backend_name(_device), _graph_name.c_str());
return true;
}
bool qnn_graph::execute(ggml_tensor *op) {
if (!bind_src_tensors(op, _tensor_inputs, _qnn_tensor_inputs)) {
QNN_LOG_ERROR("[%s][%s]bind input tensors failed", get_backend_name(_device), _graph_name.c_str());
return false;
}
if (!qnn::bind_tensors({op}, _tensor_outputs, _qnn_tensor_outputs)) {
QNN_LOG_ERROR("[%s][%s]bind output tensors failed", get_backend_name(_device), _graph_name.c_str());
return false;
}
auto &qnn_tensor_inputs = _qnn_tensor_inputs;
auto &qnn_tensor_outputs = _qnn_tensor_outputs;
auto error =
_qnn_interface->qnn_graph_execute(_graph_handle, qnn_tensor_inputs.data(), qnn_tensor_inputs.size(),
qnn_tensor_outputs.data(), qnn_tensor_outputs.size(), nullptr, nullptr);
unbind_tensors(_tensor_inputs);
unbind_tensors(_tensor_outputs);
if (error != QNN_SUCCESS) {
if (_device == QNN_BACKEND_NPU && error == QNN_COMMON_ERROR_SYSTEM_COMMUNICATION) {
QNN_LOG_WARN("[%s][%s]NPU crashed. SSR detected. Caused QNN graph execute error.",
get_backend_name(_device), _graph_name.c_str());
} else {
QNN_LOG_ERROR("[%s][%s]error: %s", get_backend_name(_device), _graph_name.c_str(),
get_qnn_error_string(error));
}
return false;
}
QNN_LOG_DEBUG("[%s][%s]execute succeed", get_backend_name(_device), _graph_name.c_str());
return true;
}
bool qnn_graph::execute(const ggml_cgraph *cgraph) {
ggml_tensor_array_t inputs;
ggml_tensor_array_t outputs;
#ifdef NDEBUG
get_io_tensors_from_graph(cgraph, inputs, outputs);
#else
int rank = get_io_tensors_from_graph(cgraph, inputs, outputs);
QNN_LOG_DEBUG("[%s]rank: %d, input_set: %d, output_set: %d", get_backend_name(_device), rank, int(inputs.size()),
int(outputs.size()));
#endif
{
if (!qnn::bind_tensors(inputs, _tensor_inputs, _qnn_tensor_inputs)) {
QNN_LOG_ERROR("[%s][%s]bind input tensors failed", get_backend_name(_device), _graph_name.c_str());
return false;
}
if (!qnn::bind_tensors(outputs, _tensor_outputs, _qnn_tensor_outputs)) {
QNN_LOG_ERROR("[%s][%s]bind output tensors failed", get_backend_name(_device), _graph_name.c_str());
return false;
}
auto &qnn_tensor_inputs = _qnn_tensor_inputs;
auto &qnn_tensor_outputs = _qnn_tensor_outputs;
auto error =
_qnn_interface->qnn_graph_execute(_graph_handle, qnn_tensor_inputs.data(), qnn_tensor_inputs.size(),
qnn_tensor_outputs.data(), qnn_tensor_outputs.size(), nullptr, nullptr);
unbind_tensors(_tensor_inputs);
unbind_tensors(_tensor_outputs);
if (error != QNN_SUCCESS) {
if (_device == QNN_BACKEND_NPU && error == QNN_COMMON_ERROR_SYSTEM_COMMUNICATION) {
QNN_LOG_WARN("[%s][%s]NPU crashed. SSR detected. Caused QNN graph execute error.",
get_backend_name(_device), _graph_name.c_str());
} else {
QNN_LOG_ERROR("[%s][%s]error: %s", get_backend_name(_device), _graph_name.c_str(),
get_qnn_error_string(error));
}
return false;
}
QNN_LOG_DEBUG("[%s][%s]execute succeed", get_backend_name(_device), _graph_name.c_str());
return true;
}
}
bool qnn_graph::finalize() {
if (!qnn::add_op_to_graph(_graph_handle, _operations)) {
QNN_LOG_ERROR("[%s]add nodes failed", _graph_name.c_str());
return false;
}
auto error = _qnn_interface->qnn_graph_finalize(_graph_handle, nullptr, nullptr);
if (error != QNN_SUCCESS) {
QNN_LOG_ERROR("[%s][%s]qnn_graph_finalize.error: %s", get_backend_name(_device), _graph_name.c_str(),
get_qnn_error_string(error));
return false;
}
QNN_LOG_DEBUG("[%s][%s]finalize succeed", get_backend_name(_device), _graph_name.c_str());
return true;
}
} // namespace qnn

View File

@ -1,164 +1,53 @@
#pragma once
#include <cstdio>
#include <memory>
#include <string>
#include <vector>
#include "ggml-qnn.h"
#include "logger.hpp"
#include "op-config.hpp"
#include "qnn-lib.hpp"
namespace qnn {
class ggml_qnn_graph {
class qnn_graph {
public:
explicit ggml_qnn_graph(const std::string &graph_name, QNNBackend device,
std::shared_ptr<qnn_instance> qnn_instance, size_t vtcm_size_in_mb)
: _graph_name(graph_name), _device(device), _qnn_instance(qnn_instance) {
QNN_LOG_DEBUG("[%s][%s]created", get_backend_name(device), graph_name.c_str());
explicit qnn_graph(const std::string &graph_name, QNNBackend device, std::shared_ptr<qnn_instance> qnn_instance,
size_t vtcm_size_in_mb);
~qnn_graph();
auto qnn_interface = qnn_instance->get_qnn_interface();
auto qnn_context = qnn_instance->get_qnn_context_handle();
Qnn_ErrorHandle_t error = QNN_SUCCESS;
Qnn_GraphHandle_t graph_handle = nullptr;
if (device == QNN_BACKEND_NPU) {
// TODO: fix graph config here for NPU
QnnHtpGraph_CustomConfig_t hvx_config;
hvx_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_NUM_HVX_THREADS;
hvx_config.numHvxThreads = 8;
QnnGraph_Config_t graph_hvx_config;
graph_hvx_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM;
graph_hvx_config.customConfig = &hvx_config;
QnnHtpGraph_CustomConfig_t dlbc_config;
dlbc_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION;
dlbc_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_DLBC;
dlbc_config.optimizationOption.floatValue = 1.0; // set to 0.0 to turn off DLBC
QnnGraph_Config_t graph_dlbc_config;
graph_dlbc_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM;
graph_dlbc_config.customConfig = &dlbc_config;
QnnHtpGraph_CustomConfig_t opt_config;
opt_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG;
opt_config.optimizationOption.floatValue = 1; // 1 / 3
QnnGraph_Config_t graph_opt_config;
graph_opt_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM;
graph_opt_config.customConfig = &opt_config;
QnnHtpGraph_CustomConfig_t vtcm_config;
vtcm_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE;
vtcm_config.vtcmSizeInMB = vtcm_size_in_mb;
QnnGraph_Config_t graph_vtcm_config;
graph_vtcm_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM;
graph_vtcm_config.customConfig = &vtcm_config;
const QnnGraph_Config_t *graph_configs[] = {&graph_hvx_config, &graph_dlbc_config, &graph_vtcm_config,
&graph_opt_config, nullptr};
error = qnn_interface->qnn_graph_create(qnn_context, graph_name.c_str(), graph_configs, &graph_handle);
} else {
error = qnn_interface->qnn_graph_create(qnn_context, graph_name.c_str(), nullptr, &graph_handle);
}
if (error != QNN_SUCCESS) {
QNN_LOG_ERROR("[%s][%s]failed to create qnn graph, error: %s", get_backend_name(device), graph_name.c_str(),
get_qnn_error_string(error));
return;
}
QNN_LOG_INFO("[%s][%s]create succeed", get_backend_name(device), graph_name.c_str());
_graph_handle = graph_handle;
_qnn_interface = qnn_interface;
}
~ggml_qnn_graph() { QNN_LOG_DEBUG("[%s][%s]destroy", get_backend_name(_device), _graph_name.c_str()); }
bool build_graph(ggml_op_constructor_t op_constructor, const ggml_tensor_array_t &tensor_inputs,
const ggml_tensor_array_t &tensor_outputs) {
GGML_ASSERT(op_constructor);
if (!is_valid()) {
QNN_LOG_ERROR("Invalid graph");
return false;
}
QNN_LOG_DEBUG("[%s][%s]build_graph start", get_backend_name(_device), _graph_name.c_str());
_op_config = op_constructor(_graph_name, _qnn_instance);
if (!_op_config->initialize_op_nodes(_device, _graph_handle, tensor_inputs, tensor_outputs)) {
QNN_LOG_ERROR("[%s][%s]initialize_op_nodes failed", get_backend_name(_device), _graph_name.c_str());
return false;
}
if (!_op_config->add_op_to_graph(_graph_handle)) {
QNN_LOG_ERROR("[%s]add nodes failed", _graph_name.c_str());
return false;
}
auto error = _qnn_interface->qnn_graph_finalize(_graph_handle, nullptr, nullptr);
if (error != QNN_SUCCESS) {
QNN_LOG_ERROR("[%s][%s]qnn_graph_finalize.error: %s", get_backend_name(_device), _graph_name.c_str(),
get_qnn_error_string(error));
return false;
}
QNN_LOG_DEBUG("[%s][%s]build_graph succeed", get_backend_name(_device), _graph_name.c_str());
return true;
}
bool execute(const ggml_tensor_array_t &tensor_inputs, const ggml_tensor_array_t &tensor_outputs) {
if (!_op_config->bind_input_tensors(tensor_inputs)) {
QNN_LOG_ERROR("[%s][%s]bind input tensors failed", get_backend_name(_device), _graph_name.c_str());
return false;
}
if (!_op_config->bind_output_tensors(tensor_outputs)) {
QNN_LOG_ERROR("[%s][%s]bind output tensors failed", get_backend_name(_device), _graph_name.c_str());
return false;
}
auto &qnn_tensor_inputs = _op_config->get_qnn_input_tensors();
auto &qnn_tensor_outputs = _op_config->get_qnn_output_tensors();
auto error =
_qnn_interface->qnn_graph_execute(_graph_handle, qnn_tensor_inputs.data(), qnn_tensor_inputs.size(),
qnn_tensor_outputs.data(), qnn_tensor_outputs.size(), nullptr, nullptr);
_op_config->unbind_input_tensors();
_op_config->unbind_output_tensors();
if (error != QNN_SUCCESS) {
if (_device == QNN_BACKEND_NPU && error == QNN_COMMON_ERROR_SYSTEM_COMMUNICATION) {
QNN_LOG_WARN("[%s][%s]NPU crashed. SSR detected. Caused QNN graph execute error.",
get_backend_name(_device), _graph_name.c_str());
} else {
QNN_LOG_ERROR("[%s][%s]error: %s", get_backend_name(_device), _graph_name.c_str(),
get_qnn_error_string(error));
}
return false;
}
QNN_LOG_DEBUG("[%s][%s]execute succeed", get_backend_name(_device), _graph_name.c_str());
return true;
}
bool build_graph_from_op(ggml_tensor *op);
bool build_graph_from_ggml_graph(const ggml_cgraph *cgraph);
bool execute(ggml_tensor *op);
bool execute(const ggml_cgraph *cgraph);
bool is_valid() const { return _graph_handle != nullptr; }
Qnn_GraphHandle_t get_graph_handler() const { return _graph_handle; }
std::shared_ptr<qnn_instance> get_qnn_instance() { return _qnn_instance; }
const std::string &get_name() const { return _graph_name; }
QNNBackend get_device() const { return _device; }
private:
bool finalize();
const std::string _graph_name;
const QNNBackend _device;
Qnn_GraphHandle_t _graph_handle = nullptr;
std::shared_ptr<qnn_instance> _qnn_instance;
std::shared_ptr<qnn_interface> _qnn_interface;
std::unique_ptr<ggml_qnn_op_config> _op_config;
std::vector<Qnn_Param_t> _param_types;
qnn_op_config_array_t _operations;
DISABLE_COPY(ggml_qnn_graph);
DISABLE_MOVE(ggml_qnn_graph);
qnn_tensor_array_t _tensor_inputs;
qnn_tensor_array_t _tensor_outputs;
std::vector<Qnn_Tensor_t> _qnn_tensor_inputs;
std::vector<Qnn_Tensor_t> _qnn_tensor_outputs;
DISABLE_COPY(qnn_graph);
DISABLE_MOVE(qnn_graph);
};
using qnn_graph_ptr_t = std::shared_ptr<qnn_graph>;
} // namespace qnn
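A minimal usage sketch of the reworked graph API (hypothetical driver code, not part of the commit; the constructor argument list is illustrative only, and `graph_name`, `device`, `instance`, `vtcm_size_in_mb` and `cgraph` are assumed to exist in the caller):
auto graph = std::make_shared<qnn::qnn_graph>(graph_name, device, instance, vtcm_size_in_mb);
if (graph->is_valid() && graph->build_graph_from_ggml_graph(cgraph)) {
    graph->execute(cgraph);  // binds the ggml tensors, runs the finalized QNN graph, then unbinds
}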

View File

@ -10,8 +10,6 @@
namespace qnn {
using ggml_tensor_array_t = std::vector<ggml_tensor *>;
/**
* @class ggml_qnn_op_config
* @brief Abstract base class for configuring QNN operations.
@ -23,6 +21,34 @@ class ggml_qnn_op_config {
public:
virtual ~ggml_qnn_op_config() {}
/**
* @brief Sets custom input tensors for the operation. This method should be called before `initialize_op_nodes`.
* If no custom input tensors are provided, the input tensors will be automatically created from the input ggml
* tensors.
*
* This pure virtual function must be overridden by derived classes to set
* the input tensors for the operation. The function takes a reference to a
* vector of qnn_tensor_ptr_t objects, which represent the input tensors.
*
* @param tensor_inputs A reference to a vector of qnn_tensor_ptr_t objects representing the input tensors.
*/
virtual void set_input_tensors(qnn::qnn_tensor_array_t &tensor_inputs) = 0;
virtual void set_input_tensors(qnn::qnn_tensor_array_t &&tensor_inputs) = 0;
/**
* @brief Sets custom output tensors for the operation. This method should be called before `initialize_op_nodes`.
* If no custom output tensors are provided, the output tensors will be automatically created from the output ggml
* tensors.
*
* This pure virtual function must be overridden by derived classes to set
* the output tensors for the operation. The function takes a reference to a
* vector of qnn_tensor_ptr_t objects, which represent the output tensors.
*
* @param tensor_outputs A reference to a vector of qnn_tensor_ptr_t objects representing the output tensors.
*/
virtual void set_output_tensors(qnn::qnn_tensor_array_t &tensor_outputs) = 0;
virtual void set_output_tensors(qnn::qnn_tensor_array_t &&tensor_outputs) = 0;
/**
* @brief Creates tensors and internal nodes for constructing the calculation graph.
*
@ -31,36 +57,32 @@ public:
* the internal nodes necessary for constructing the calculation graph. It takes
* input and output tensor arrays as parameters.
*
* @param device The backend device where tensors will be created.
* @param graph_handle The handle to the graph where tensors and nodes will be associated.
* @param tensor_inputs An array of input tensors.
* @param tensor_outputs An array of output tensors.
* @param device The backend device where the tensors and nodes will be created.
* @param graph_handle The handle to the graph that the tensors and nodes will be associated with.
* @return true if tensors and nodes are successfully created, false otherwise.
*/
virtual bool initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle,
const ggml_tensor_array_t &tensor_inputs,
const ggml_tensor_array_t &tensor_outputs) = 0;
virtual bool initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle) = 0;
/**
* @brief Pure virtual function to retrieve the input tensors for QNN (Quantized Neural Network).
* @brief Pure virtual function to retrieve the input tensors.
*
* This function must be overridden by derived classes to provide the specific implementation
* for retrieving the input tensors used in QNN operations.
*
* @return A reference to a vector of Qnn_Tensor_t objects representing the input tensors.
* @return A reference to a vector of qnn_tensor_ptr_t objects representing the input tensors.
*/
virtual std::vector<Qnn_Tensor_t> &get_qnn_input_tensors() = 0;
virtual const qnn_tensor_array_t &get_input_tensors() = 0;
/**
* @brief Pure virtual function to retrieve the output tensors of a QNN (Quantized Neural Network).
* @brief Pure virtual function to retrieve the output tensors of a QNN.
*
* This function must be overridden by any derived class to provide access to the
* output tensors of the QNN. The function returns a reference to a vector of
* Qnn_Tensor_t objects, which represent the output tensors.
* qnn_tensor_ptr_t objects, which represent the output tensors.
*
* @return std::vector<Qnn_Tensor_t>& Reference to a vector of Qnn_Tensor_t objects.
* @return A reference to a vector of qnn_tensor_ptr_t objects representing the output tensors.
*/
virtual std::vector<Qnn_Tensor_t> &get_qnn_output_tensors() = 0;
virtual const qnn_tensor_array_t &get_output_tensors() = 0;
/**
* @brief Adds an operation to the given graph.
@ -125,5 +147,6 @@ public:
};
using qnn_op_config_ptr_t = std::shared_ptr<ggml_qnn_op_config>;
using qnn_op_config_array_t = std::vector<qnn_op_config_ptr_t>;
} // namespace qnn
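Following the `set_input_tensors`/`set_output_tensors` contract documented above, a hedged sketch of the intended call order (assumes `op` is an op-config instance, `src0`, `src1` and `dst` are pre-built qnn_tensor_ptr_t wrappers, and `device`/`graph_handle` come from the enclosing graph; all names are illustrative):
qnn::qnn_tensor_array_t inputs = {src0, src1};
qnn::qnn_tensor_array_t outputs = {dst};
op->set_input_tensors(inputs);    // must be called before initialize_op_nodes
op->set_output_tensors(outputs);
if (op->initialize_op_nodes(device, graph_handle) && op->add_op_to_graph(graph_handle)) {
    // the op node is now part of the QNN graph, ready for qnn_graph_finalize
}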

View File

@ -0,0 +1,223 @@
#include "op-config.hpp"
namespace {
using op_dims_calc_func_t = void (*)(const std::vector<const qnn::ggml_dimension_array_t> &input_dims,
qnn::ggml_dimension_array_t &output_dims);
void element_wise_op_dims(const std::vector<const qnn::ggml_dimension_array_t> &input_dims,
qnn::ggml_dimension_array_t &output_dims) {
for (size_t i = 1; i < std::size(output_dims); i++) {
output_dims[i] = input_dims.front()[i];
}
}
void mat_mul_op_dims(const std::vector<const qnn::ggml_dimension_array_t> &input_dims,
qnn::ggml_dimension_array_t &output_dims) {
GGML_ASSERT(input_dims.size() == 2);
output_dims[0] = input_dims.front()[1];
output_dims[1] = input_dims.back()[1];
}
struct qnn_op_caps_t {
const char *qnn_op_name = nullptr;
const size_t input_param_count = 0;
op_dims_calc_func_t calc_dims_func = nullptr;
};
constexpr const qnn_op_caps_t kOpCaps[] = {
{}, // GGML_OP_NONE
{}, // GGML_OP_DUP
{
// GGML_OP_ADD
QNN_OP_ELEMENT_WISE_ADD, // qnn_op_name
2, // input_param_count
element_wise_op_dims, // calc_dims_func
},
{}, // GGML_OP_ADD1
{}, // GGML_OP_ACC
{
// GGML_OP_SUB
QNN_OP_ELEMENT_WISE_SUBTRACT, // qnn_op_name
2, // input_param_count
element_wise_op_dims, // calc_dims_func
},
{
// GGML_OP_MUL
QNN_OP_ELEMENT_WISE_MULTIPLY, // qnn_op_name
2, // input_param_count
element_wise_op_dims, // calc_dims_func
},
{
// GGML_OP_DIV
QNN_OP_ELEMENT_WISE_DIVIDE, // qnn_op_name
2, // input_param_count
element_wise_op_dims, // calc_dims_func
},
{}, // GGML_OP_SQR
{
// GGML_OP_SQRT
QNN_OP_ELEMENT_WISE_SQUARE_ROOT, // qnn_op_name
1, // input_param_count
element_wise_op_dims, // calc_dims_func
},
{
// GGML_OP_LOG
QNN_OP_ELEMENT_WISE_LOG, // qnn_op_name
1, // input_param_count
element_wise_op_dims, // calc_dims_func
},
{}, // GGML_OP_SIN
{}, // GGML_OP_COS
{}, // GGML_OP_SUM
{}, // GGML_OP_SUM_ROWS
{}, // GGML_OP_MEAN
{}, // GGML_OP_ARGMAX
{}, // GGML_OP_COUNT_EQUAL
{}, // GGML_OP_REPEAT
{}, // GGML_OP_REPEAT_BACK
{}, // GGML_OP_CONCAT
{}, // GGML_OP_SILU_BACK
{}, // GGML_OP_NORM
{}, // GGML_OP_RMS_NORM
{}, // GGML_OP_RMS_NORM_BACK
{}, // GGML_OP_GROUP_NORM
{
// GGML_OP_MUL_MAT
QNN_OP_MAT_MUL, // qnn_op_name
2, // input_param_count
mat_mul_op_dims, // calc_dims_func
},
{}, // GGML_OP_MUL_MAT_ID
{}, // GGML_OP_OUT_PROD
{}, // GGML_OP_SCALE
{}, // GGML_OP_SET
{}, // GGML_OP_CPY
{}, // GGML_OP_CONT
{
// GGML_OP_RESHAPE
QNN_OP_RESHAPE, // qnn_op_name
1, // input_param_count
nullptr, // TODO: calc_dims_func
},
{}, // GGML_OP_VIEW
{}, // GGML_OP_PERMUTE
{}, // GGML_OP_TRANSPOSE
{}, // GGML_OP_GET_ROWS
{}, // GGML_OP_GET_ROWS_BACK
{}, // GGML_OP_DIAG
{}, // GGML_OP_DIAG_MASK_INF
{}, // GGML_OP_DIAG_MASK_ZERO
{}, // GGML_OP_SOFT_MAX
{}, // GGML_OP_SOFT_MAX_BACK
{}, // GGML_OP_ROPE
{}, // GGML_OP_ROPE_BACK
{}, // GGML_OP_CLAMP
{}, // GGML_OP_CONV_TRANSPOSE_1D
{}, // GGML_OP_IM2COL
{}, // GGML_OP_IM2COL_BACK
{}, // GGML_OP_CONV_TRANSPOSE_2D
{}, // GGML_OP_POOL_1D
{}, // GGML_OP_POOL_2D
{}, // GGML_OP_POOL_2D_BACK
{}, // GGML_OP_UPSCALE
{}, // GGML_OP_PAD
{}, // GGML_OP_PAD_REFLECT_1D
{}, // GGML_OP_ARANGE
{}, // GGML_OP_TIMESTEP_EMBEDDING
{}, // GGML_OP_ARGSORT
{}, // GGML_OP_LEAKY_RELU
{}, // GGML_OP_FLASH_ATTN_EXT
{}, // GGML_OP_FLASH_ATTN_BACK
{}, // GGML_OP_SSM_CONV
{}, // GGML_OP_SSM_SCAN
{}, // GGML_OP_WIN_PART
{}, // GGML_OP_WIN_UNPART
{}, // GGML_OP_GET_REL_POS
{}, // GGML_OP_ADD_REL_POS
{}, // GGML_OP_RWKV_WKV6
{}, // GGML_OP_UNARY
{}, // GGML_OP_MAP_UNARY
{}, // GGML_OP_MAP_BINARY
{}, // GGML_OP_MAP_CUSTOM1_F32
{}, // GGML_OP_MAP_CUSTOM2_F32
{}, // GGML_OP_MAP_CUSTOM3_F32
{}, // GGML_OP_MAP_CUSTOM1
{}, // GGML_OP_MAP_CUSTOM2
{}, // GGML_OP_MAP_CUSTOM3
{}, // GGML_OP_CROSS_ENTROPY_LOSS
{}, // GGML_OP_CROSS_ENTROPY_LOSS_BACK
{}, // GGML_OP_OPT_STEP_ADAMW
// ggml_unary_op
{}, // GGML_UNARY_OP_ABS
{}, // GGML_UNARY_OP_SGN
{}, // GGML_UNARY_OP_NEG
{}, // GGML_UNARY_OP_STEP
{}, // GGML_UNARY_OP_TANH
{}, // GGML_UNARY_OP_ELU
{}, // GGML_UNARY_OP_RELU
{}, // GGML_UNARY_OP_SIGMOID
{
// GGML_UNARY_OP_GELU
QNN_OP_GELU, // qnn_op_name
1, // input_param_count
nullptr, // TODO: calc_dims_func
},
{}, // GGML_UNARY_OP_GELU_QUICK
{}, // GGML_UNARY_OP_SILU
{}, // GGML_UNARY_OP_HARDSWISH
{}, // GGML_UNARY_OP_HARDSIGMOID
{}, // GGML_UNARY_OP_EXP
};
static_assert(kOpCaps[GGML_OP_NONE].calc_dims_func == nullptr, "GGML_OP_NONE should not have calc_dims_func function");
static_assert(kOpCaps[GGML_OP_ADD].calc_dims_func == element_wise_op_dims,
"GGML_OP_ADD does not have element_wise_op_dims function");
static_assert(kOpCaps[GGML_OP_MUL_MAT].calc_dims_func == mat_mul_op_dims,
"GGML_OP_MUL_MAT does not have mat_mul_op_dims function");
static_assert(kOpCaps[GGML_OP_LOG].calc_dims_func == element_wise_op_dims,
"GGML_OP_LOG does not have element_wise_op_dims function");
static_assert(std::size(kOpCaps) == (GGML_OP_COUNT + GGML_UNARY_OP_COUNT),
"GGML_OP_COUNT does not match the size of the kOpCaps table");
} // namespace
namespace qnn {
size_t get_qnn_op_index(const ggml_tensor *tensor) {
if (tensor->op == GGML_OP_UNARY) {
return kGgmlUnaryOpStart + ggml_get_unary_op(tensor);
}
return tensor->op;
}
void get_ggml_op_output_dimensions(const std::vector<const ggml_dimension_array_t> &input_dims, size_t op,
ggml_dimension_array_t &output_dims) {
GGML_ASSERT(op < std::size(kOpCaps));
auto get_dims = kOpCaps[op].calc_dims_func;
GGML_ASSERT(get_dims);
get_dims(input_dims, output_dims);
}
const char *get_qnn_op_name(size_t op) {
GGML_ASSERT(op < std::size(kOpCaps));
GGML_ASSERT(kOpCaps[op].qnn_op_name);
return kOpCaps[op].qnn_op_name;
}
size_t get_qnn_op_input_param_count(size_t op) {
GGML_ASSERT(op < std::size(kOpCaps));
return kOpCaps[op].input_param_count;
}
} // namespace qnn
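An illustrative lookup against the kOpCaps table above (sketch only; `dst` is assumed to be the ggml op tensor being translated):
size_t op_index = qnn::get_qnn_op_index(dst);                      // unary ops land at kGgmlUnaryOpStart + ggml_get_unary_op(dst)
const char *op_name = qnn::get_qnn_op_name(op_index);              // e.g. QNN_OP_ELEMENT_WISE_ADD for GGML_OP_ADD
size_t input_count = qnn::get_qnn_op_input_param_count(op_index);  // e.g. 2 for GGML_OP_ADD, 1 for GGML_OP_SQRT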

View File

@ -24,16 +24,7 @@ qnn::qnn_dimension_array_t get_transposed_dimensions(const qnn::qnn_dimension_ar
}
int get_rank(const qnn::ggml_tensor_array_t &tensor_inputs, const qnn::ggml_tensor_array_t &tensor_outputs) {
int tensor_rank = 0;
// get the max tensor rank
for (auto tensor : tensor_inputs) {
tensor_rank = std::max(tensor_rank, ggml_n_dims(tensor));
}
for (auto tensor : tensor_outputs) {
tensor_rank = std::max(tensor_rank, ggml_n_dims(tensor));
}
return tensor_rank;
return std::max(qnn::get_ggml_tensors_max_rank(tensor_inputs), qnn::get_ggml_tensors_max_rank(tensor_outputs));
}
Qnn_DataType_t get_tensor_type(const qnn::qnn_tensor_array_t &tensors) {
@ -49,93 +40,6 @@ Qnn_DataType_t get_tensor_type(const qnn::qnn_tensor_array_t &tensors) {
return type;
}
struct tensor_common_params {
const char *name_prefix;
int tensor_rank;
bool is_input;
QNNBackend device;
Qnn_GraphHandle_t graph_handle;
std::shared_ptr<qnn::qnn_instance> qnn_instance;
};
void create_tensors_from_ggml_tensor(const tensor_common_params &params, const qnn::ggml_tensor_array_t &ggml_tensors,
qnn::qnn_tensor_array_t *tensor_wrappers, std::vector<Qnn_Tensor_t> *qnn_tensors) {
using namespace qnn;
tensor_wrappers->resize(ggml_tensors.size());
if (qnn_tensors) {
qnn_tensors->resize(ggml_tensors.size());
}
char buffer[GGML_MAX_NAME] = {};
auto tensor_type = params.is_input ? ggml_qnn_tensor::INPUT : ggml_qnn_tensor::OUTPUT;
for (size_t i = 0; i < ggml_tensors.size(); i++) {
snprintf(buffer, GGML_MAX_NAME, "%s%d", params.name_prefix, (int)i);
auto *ggml_tensor = ggml_tensors[i];
(*tensor_wrappers)[i] = std::make_shared<ggml_qnn_tensor>(tensor_type, std::string(buffer), ggml_tensor->ne,
ggml_tensor->type, params.tensor_rank, params.device,
params.graph_handle, params.qnn_instance);
}
}
bool bind_tensors(const qnn::ggml_tensor_array_t &ggml_tensors, qnn::qnn_tensor_array_t &tensor_wrappers,
std::vector<Qnn_Tensor_t> &qnn_tensors) {
for (size_t i = 0; i < ggml_tensors.size(); i++) {
auto *ggml_tensor = ggml_tensors[i];
if (!tensor_wrappers[i]->bind_ggml_tensor(ggml_tensor)) {
QNN_LOG_ERROR("bind tensor %s failed", ggml_get_name(ggml_tensor));
return false;
}
qnn_tensors[i] = tensor_wrappers[i]->get_qnn_tensor();
}
return true;
}
class ggml_qnn_connectable_op_config : public qnn::ggml_qnn_op_config_base {
public:
explicit ggml_qnn_connectable_op_config(const std::string &name, const std::string &package_name,
const std::string &op_type, std::shared_ptr<qnn::qnn_instance> qnn_instance)
: ggml_qnn_op_config_base(name, package_name, op_type, qnn_instance) {}
bool initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle,
const qnn::ggml_tensor_array_t &tensor_inputs,
const qnn::ggml_tensor_array_t &tensor_outputs) override {
GGML_UNUSED(device);
GGML_UNUSED(graph_handle);
GGML_UNUSED(tensor_inputs);
GGML_UNUSED(tensor_outputs);
return true;
}
void set_input_tensors(qnn::qnn_tensor_array_t &tensor_inputs) {
_tensor_inputs = tensor_inputs;
_qnn_tensor_inputs.resize(_tensor_inputs.size());
}
void set_input_tensors(qnn::qnn_tensor_array_t &&tensor_inputs) {
_tensor_inputs = std::move(tensor_inputs);
_qnn_tensor_inputs.resize(_tensor_inputs.size());
}
void set_output_tensors(qnn::qnn_tensor_array_t &tensor_outputs) {
_tensor_outputs = tensor_outputs;
_qnn_tensor_outputs.resize(_tensor_outputs.size());
}
void set_output_tensors(qnn::qnn_tensor_array_t &&tensor_outputs) {
_tensor_outputs = std::move(tensor_outputs);
_qnn_tensor_outputs.resize(_tensor_outputs.size());
}
qnn::qnn_tensor_array_t &get_input_tensors() { return _tensor_inputs; }
qnn::qnn_tensor_array_t &get_output_tensors() { return _tensor_outputs; }
private:
DISABLE_COPY(ggml_qnn_connectable_op_config);
DISABLE_MOVE(ggml_qnn_connectable_op_config);
};
} // namespace
namespace qnn {
@ -161,7 +65,7 @@ bool ggml_qnn_op_config_base::add_tensor_param(const std::string &name, const qn
}
GGML_ASSERT(data_size > 0);
if (!param_tensor->bind_buffer(const_cast<uint8_t *>(data), data_size)) {
if (!param_tensor->set_data_buffer(data, data_size)) {
QNN_LOG_ERROR("parameter tensor bind_buffer failed");
return false;
}
@ -181,6 +85,26 @@ bool ggml_qnn_op_config_base::add_tensor_param(const std::string &name, const qn
return true;
}
void ggml_qnn_op_config_base::set_input_tensors(qnn::qnn_tensor_array_t &tensor_inputs) {
_tensor_inputs = tensor_inputs;
_qnn_tensor_inputs.resize(_tensor_inputs.size());
}
void ggml_qnn_op_config_base::set_input_tensors(qnn::qnn_tensor_array_t &&tensor_inputs) {
_tensor_inputs = std::move(tensor_inputs);
_qnn_tensor_inputs.resize(_tensor_inputs.size());
}
void ggml_qnn_op_config_base::set_output_tensors(qnn::qnn_tensor_array_t &tensor_outputs) {
_tensor_outputs = tensor_outputs;
_qnn_tensor_outputs.resize(_tensor_outputs.size());
}
void ggml_qnn_op_config_base::set_output_tensors(qnn::qnn_tensor_array_t &&tensor_outputs) {
_tensor_outputs = std::move(tensor_outputs);
_qnn_tensor_outputs.resize(_tensor_outputs.size());
}
bool ggml_qnn_op_config_base::add_op_to_graph(Qnn_GraphHandle_t graph_handle) {
GGML_ASSERT(_qnn_tensor_inputs.size() == _tensor_inputs.size());
GGML_ASSERT(_qnn_tensor_outputs.size() == _tensor_outputs.size());
@ -221,12 +145,12 @@ bool ggml_qnn_op_config_base::add_op_to_graph(Qnn_GraphHandle_t graph_handle) {
bool ggml_qnn_op_config_base::bind_input_tensors(const ggml_tensor_array_t &tensor_inputs) {
GGML_ASSERT(tensor_inputs.size() == _tensor_inputs.size());
return bind_tensors(tensor_inputs, _tensor_inputs, _qnn_tensor_inputs);
return qnn::bind_tensors(tensor_inputs, _tensor_inputs, _qnn_tensor_inputs);
}
bool ggml_qnn_op_config_base::bind_output_tensors(const ggml_tensor_array_t &tensor_outputs) {
GGML_ASSERT(tensor_outputs.size() == _tensor_outputs.size());
return bind_tensors(tensor_outputs, _tensor_outputs, _qnn_tensor_outputs);
return qnn::bind_tensors(tensor_outputs, _tensor_outputs, _qnn_tensor_outputs);
}
void ggml_qnn_op_config_base::unbind_input_tensors() {
@ -257,55 +181,42 @@ Qnn_OpConfig_t ggml_qnn_op_config_base::get_op_config() {
return config;
}
bool ggml_qnn_single_op_config::initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle,
const ggml_tensor_array_t &tensor_inputs,
const ggml_tensor_array_t &tensor_outputs) {
const auto tensor_rank = get_rank(tensor_inputs, tensor_outputs);
tensor_common_params params = {"src", tensor_rank, true, device, graph_handle, _qnn_instance};
create_tensors_from_ggml_tensor(params, tensor_inputs, &_tensor_inputs, &_qnn_tensor_inputs);
params.name_prefix = "dst";
params.is_input = false;
create_tensors_from_ggml_tensor(params, tensor_outputs, &_tensor_outputs, &_qnn_tensor_outputs);
if (_param_buffer.size() > 0) {
// handle parameters in output tensor
auto *params = tensor_outputs.front()->op_params;
memcpy(_param_buffer.data(), params, _param_buffer.size());
const uint32_t count = uint32_t(_param_buffer.size() / qnn_datatype_size(_param_type));
const qnn_dimension_array_t param_dims = {count, 1, 1, 1};
add_tensor_param(_param_name, param_dims, 1, _param_buffer.data(), _param_type, device, graph_handle);
}
bool ggml_qnn_single_op_config::initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle) {
GGML_UNUSED(device);
GGML_UNUSED(graph_handle);
return true;
}
void ggml_qnn_aggregate_op_config::set_input_tensors(qnn::qnn_tensor_array_t &tensor_inputs) {
_tensor_inputs = tensor_inputs;
}
void ggml_qnn_aggregate_op_config::set_input_tensors(qnn::qnn_tensor_array_t &&tensor_inputs) {
_tensor_inputs = std::move(tensor_inputs);
}
void ggml_qnn_aggregate_op_config::set_output_tensors(qnn::qnn_tensor_array_t &tensor_outputs) {
_tensor_outputs = tensor_outputs;
}
void ggml_qnn_aggregate_op_config::set_output_tensors(qnn::qnn_tensor_array_t &&tensor_outputs) {
_tensor_outputs = std::move(tensor_outputs);
}
bool ggml_qnn_aggregate_op_config::bind_input_tensors(const ggml_tensor_array_t &tensor_inputs) {
return bind_tensors(tensor_inputs, _tensor_inputs, _qnn_tensor_inputs);
return qnn::bind_tensors(tensor_inputs, _tensor_inputs);
}
bool ggml_qnn_aggregate_op_config::bind_output_tensors(const ggml_tensor_array_t &tensor_outputs) {
return bind_tensors(tensor_outputs, _tensor_outputs, _qnn_tensor_outputs);
return qnn::bind_tensors(tensor_outputs, _tensor_outputs);
}
bool ggml_qnn_matmul_op_config::initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle,
const ggml_tensor_array_t &tensor_inputs,
const ggml_tensor_array_t &tensor_outputs) {
GGML_ASSERT(tensor_inputs.size() == 2);
GGML_ASSERT(tensor_outputs.size() == 1);
const auto tensor_rank = get_rank(tensor_inputs, tensor_outputs);
GGML_ASSERT(tensor_rank >= 2);
// create input tensors
tensor_common_params params = {"src", tensor_rank, true, device, graph_handle, _qnn_instance};
create_tensors_from_ggml_tensor(params, tensor_inputs, &_tensor_inputs, &_qnn_tensor_inputs);
// create output tensor
params.name_prefix = "dst";
params.is_input = false;
create_tensors_from_ggml_tensor(params, tensor_outputs, &_tensor_outputs, &_qnn_tensor_outputs);
bool ggml_qnn_matmul_op_config::initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle) {
GGML_ASSERT(_tensor_inputs.size() == 2);
GGML_ASSERT(_tensor_outputs.size() == 1);
// create convert nodes
const auto tensor_rank = _tensor_inputs.front()->get_rank();
qnn_tensor_array_t mat_mul_tensor_inputs = _tensor_inputs;
qnn_tensor_array_t mat_mul_tensor_outputs = _tensor_outputs;
if (!create_convert_nodes(device, graph_handle, tensor_rank, mat_mul_tensor_inputs, mat_mul_tensor_outputs)) {
@ -343,8 +254,8 @@ qnn_tensor_ptr_t ggml_qnn_matmul_op_config::create_gather_nodes(QNNBackend devic
auto gather_out =
std::make_shared<ggml_qnn_tensor>(ggml_qnn_tensor::INTERMEDIATE, name + "_out", dimensions,
tensor_input->get_data_type(), rank, device, graph_handle, qnn_instance);
auto gather_op = std::make_shared<ggml_qnn_connectable_op_config>(name, QNN_OP_PACKAGE_NAME_QTI_AISW,
QNN_OP_GATHER, qnn_instance);
auto gather_op = std::make_shared<ggml_qnn_single_op_config>(name, QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_GATHER,
qnn_instance);
Qnn_Scalar_t scalar = QNN_SCALAR_INIT;
scalar.dataType = QNN_DATATYPE_INT_32;
@ -355,16 +266,16 @@ qnn_tensor_ptr_t ggml_qnn_matmul_op_config::create_gather_nodes(QNNBackend devic
// here we calculate the index mapping, will generate a 1d tensor like [0, 0, 0, 1, 1, 1, 2, 2, 2, ...],
// by repeating each index [scale] times.
const auto scale = dimensions[axis] / tensor_input->get_dimensions()[axis];
std::vector<uint8_t> index_buffer(dimensions[axis] * sizeof(uint32_t));
for (uint32_t *curr = reinterpret_cast<uint32_t *>(index_buffer.data()), *end = curr + dimensions[axis];
auto index_buffer = std::make_shared<qnn_mem_buffer>(dimensions[axis] * sizeof(uint32_t));
for (uint32_t *curr = reinterpret_cast<uint32_t *>(index_buffer->get_buffer()), *end = curr + dimensions[axis];
curr < end; curr++) {
*curr = (curr - reinterpret_cast<uint32_t *>(index_buffer.data())) / scale;
*curr = (curr - reinterpret_cast<uint32_t *>(index_buffer->get_buffer())) / scale;
}
auto gather_index = std::make_shared<ggml_qnn_tensor>(
ggml_qnn_tensor::PARAMETER, name + "_index", qnn_dimension_array_t{dimensions[axis]}, QNN_DATATYPE_UINT_32,
1, device, graph_handle, qnn_instance);
gather_index->set_data_buffer(std::move(index_buffer));
gather_index->set_data_buffer(index_buffer);
gather_op->set_input_tensors({tensor_input, gather_index});
tensor_output = gather_out;
@ -409,8 +320,8 @@ bool ggml_qnn_matmul_op_config::create_convert_nodes(QNNBackend device, Qnn_Grap
auto convert_out = std::make_shared<ggml_qnn_tensor>(ggml_qnn_tensor::INTERMEDIATE, convert_name + "_out",
convert_in->get_dimensions(), tensor_type, rank, device,
graph_handle, _qnn_instance);
auto convert = std::make_shared<ggml_qnn_connectable_op_config>(convert_name, QNN_OP_PACKAGE_NAME_QTI_AISW,
QNN_OP_CONVERT, _qnn_instance);
auto convert = std::make_shared<ggml_qnn_single_op_config>(convert_name, QNN_OP_PACKAGE_NAME_QTI_AISW,
QNN_OP_CONVERT, _qnn_instance);
convert->set_input_tensors({convert_in});
convert->set_output_tensors({convert_out});
tensor_inputs[i] = convert_out;
@ -424,8 +335,8 @@ bool ggml_qnn_matmul_op_config::create_convert_nodes(QNNBackend device, Qnn_Grap
auto convert_in = std::make_shared<ggml_qnn_tensor>(ggml_qnn_tensor::INTERMEDIATE, convert_name + "_in",
convert_out->get_dimensions(), tensor_type, rank, device,
graph_handle, _qnn_instance);
auto output_convert = std::make_shared<ggml_qnn_connectable_op_config>(
convert_name, QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_CONVERT, _qnn_instance);
auto output_convert = std::make_shared<ggml_qnn_single_op_config>(convert_name, QNN_OP_PACKAGE_NAME_QTI_AISW,
QNN_OP_CONVERT, _qnn_instance);
output_convert->set_input_tensors({convert_in});
output_convert->set_output_tensors({convert_out});
tensor_outputs.front() = convert_in;
@ -495,12 +406,12 @@ bool ggml_qnn_matmul_op_config::create_mat_mul_nodes(QNNBackend device, Qnn_Grap
dst->get_data_type(), rank, device, graph_handle, _qnn_instance);
// create transpose_out
auto transpose_out = std::make_shared<ggml_qnn_connectable_op_config>(
_name + "_trans1", QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_TRANSPOSE, _qnn_instance);
auto transpose_out = std::make_shared<ggml_qnn_single_op_config>(_name + "_trans1", QNN_OP_PACKAGE_NAME_QTI_AISW,
QNN_OP_TRANSPOSE, _qnn_instance);
// create mat_mul
auto mat_mul = std::make_shared<ggml_qnn_connectable_op_config>(_name, QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_MAT_MUL,
_qnn_instance);
auto mat_mul =
std::make_shared<ggml_qnn_single_op_config>(_name, QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_MAT_MUL, _qnn_instance);
Qnn_Scalar_t scalar = QNN_SCALAR_INIT;
scalar.dataType = QNN_DATATYPE_BOOL_8;
@ -528,19 +439,20 @@ bool ggml_qnn_matmul_op_config::create_mat_mul_nodes(QNNBackend device, Qnn_Grap
return true;
}
ggml_op_constructor_t create_op_constructor(const std::string &op_name) {
ggml_op_constructor_t create_op_constructor(size_t op) {
std::string op_name = get_qnn_op_name(op);
if (op_name == QNN_OP_MAT_MUL) {
// For QNN_OP_MAT_MUL, we need to transpose the input tensor
return [](const std::string &instance_name,
std::shared_ptr<qnn::qnn_instance> qnn_instance) -> std::unique_ptr<qnn::ggml_qnn_op_config> {
std::shared_ptr<qnn::qnn_instance> qnn_instance) -> std::shared_ptr<qnn::ggml_qnn_op_config> {
QNN_LOG_DEBUG("create QNN_OP_MAT_MUL, name %s", instance_name.c_str());
return std::make_unique<qnn::ggml_qnn_matmul_op_config>(instance_name, qnn_instance);
return std::make_shared<qnn::ggml_qnn_matmul_op_config>(instance_name, qnn_instance);
};
}
return [op_name](const std::string &instance_name,
std::shared_ptr<qnn::qnn_instance> qnn_instance) -> std::unique_ptr<qnn::ggml_qnn_op_config> {
return std::make_unique<qnn::ggml_qnn_single_op_config>(instance_name, QNN_OP_PACKAGE_NAME_QTI_AISW, op_name,
std::shared_ptr<qnn::qnn_instance> qnn_instance) -> std::shared_ptr<qnn::ggml_qnn_op_config> {
return std::make_shared<qnn::ggml_qnn_single_op_config>(instance_name, QNN_OP_PACKAGE_NAME_QTI_AISW, op_name,
qnn_instance);
};
}
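A hedged sketch of how the factory above is consumed (caller code is hypothetical; assumes a valid `qnn_instance` and a supported op tensor `dst`):
auto constructor = qnn::create_op_constructor(qnn::get_qnn_op_index(dst));
auto op_config = constructor("op_0", qnn_instance);  // ggml_qnn_matmul_op_config for MAT_MUL, ggml_qnn_single_op_config otherwise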

View File

@ -1,7 +1,7 @@
#pragma once
#include <array>
#include <functional>
#include <memory>
#include <string>
#include <vector>
@ -13,9 +13,28 @@
namespace qnn {
using ggml_op_constructor_t =
std::function<std::unique_ptr<ggml_qnn_op_config>(const std::string &, std::shared_ptr<qnn_instance>)>;
std::function<std::shared_ptr<ggml_qnn_op_config>(const std::string &, std::shared_ptr<qnn_instance>)>;
ggml_op_constructor_t create_op_constructor(const std::string &op_name);
constexpr const size_t kGgmlUnaryOpStart = GGML_OP_COUNT;
size_t get_qnn_op_index(const ggml_tensor *tensor);
void get_ggml_op_output_dimensions(const std::vector<const ggml_dimension_array_t> &input_dims, size_t op,
ggml_dimension_array_t &output_dims);
const char *get_qnn_op_name(size_t op);
size_t get_qnn_op_input_param_count(size_t op);
ggml_op_constructor_t create_op_constructor(size_t op);
inline bool add_op_to_graph(Qnn_GraphHandle_t graph_handle, std::vector<qnn_op_config_ptr_t> &operations) {
for (auto &op : operations) {
if (!op->add_op_to_graph(graph_handle)) {
return false;
}
}
return true;
}
class ggml_qnn_op_config_base : public ggml_qnn_op_config {
public:
@ -27,13 +46,18 @@ public:
bool add_tensor_param(const std::string &name, const qnn_dimension_array_t &dimensions, int rank,
const uint8_t *data, const Qnn_DataType_t data_type, QNNBackend device,
Qnn_GraphHandle_t graph_handle);
void set_input_tensors(qnn::qnn_tensor_array_t &tensor_inputs) override;
void set_input_tensors(qnn::qnn_tensor_array_t &&tensor_inputs) override;
void set_output_tensors(qnn::qnn_tensor_array_t &tensor_outputs) override;
void set_output_tensors(qnn::qnn_tensor_array_t &&tensor_outputs) override;
bool add_op_to_graph(Qnn_GraphHandle_t graph_handle) override;
bool bind_input_tensors(const ggml_tensor_array_t &tensor_inputs) override;
bool bind_output_tensors(const ggml_tensor_array_t &tensor_outputs) override;
void unbind_input_tensors() override;
void unbind_output_tensors() override;
std::vector<Qnn_Tensor_t> &get_qnn_input_tensors() override { return _qnn_tensor_inputs; }
std::vector<Qnn_Tensor_t> &get_qnn_output_tensors() override { return _qnn_tensor_outputs; }
const qnn_tensor_array_t &get_input_tensors() override { return _tensor_inputs; }
const qnn_tensor_array_t &get_output_tensors() override { return _tensor_outputs; }
protected:
Qnn_OpConfig_t get_op_config();
@ -60,24 +84,9 @@ public:
const std::string &op_type, std::shared_ptr<qnn_instance> qnn_instance)
: ggml_qnn_op_config_base(name, package_name, op_type, qnn_instance) {}
explicit ggml_qnn_single_op_config(const std::string &name, const std::string &package_name,
const std::string &op_type, const std::string &param_name,
const Qnn_DataType_t param_type, const size_t param_size,
std::shared_ptr<qnn_instance> qnn_instance)
: ggml_qnn_op_config_base(name, package_name, op_type, qnn_instance),
_param_name(param_name),
_param_type(param_type),
_param_buffer(param_size) {}
bool initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle,
const ggml_tensor_array_t &tensor_inputs,
const ggml_tensor_array_t &tensor_outputs) override;
bool initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle) override;
private:
const std::string _param_name;
const Qnn_DataType_t _param_type = QNN_DATATYPE_UINT_32;
std::vector<uint8_t> _param_buffer;
DISABLE_COPY(ggml_qnn_single_op_config);
DISABLE_MOVE(ggml_qnn_single_op_config);
};
@ -88,26 +97,21 @@ public:
: _name(name), _qnn_instance(qnn_instance) {}
~ggml_qnn_aggregate_op_config() {
_qnn_tensor_inputs.clear();
_qnn_tensor_outputs.clear();
_tensor_inputs.clear();
_tensor_outputs.clear();
_operations.clear();
}
void set_input_tensors(qnn::qnn_tensor_array_t &tensor_inputs) override;
void set_input_tensors(qnn::qnn_tensor_array_t &&tensor_inputs) override;
void set_output_tensors(qnn::qnn_tensor_array_t &tensor_outputs) override;
void set_output_tensors(qnn::qnn_tensor_array_t &&tensor_outputs) override;
bool add_op_to_graph(Qnn_GraphHandle_t graph_handle) override {
for (auto &op : _operations) {
if (!op->add_op_to_graph(graph_handle)) {
return false;
}
}
return true;
return qnn::add_op_to_graph(graph_handle, _operations);
}
bool bind_input_tensors(const ggml_tensor_array_t &tensor_inputs) override;
bool bind_output_tensors(const ggml_tensor_array_t &tensor_outputs) override;
void unbind_input_tensors() override {
for (auto &tensor : _tensor_inputs) {
tensor->unbind();
@ -120,8 +124,8 @@ public:
}
}
std::vector<Qnn_Tensor_t> &get_qnn_input_tensors() override { return _qnn_tensor_inputs; }
std::vector<Qnn_Tensor_t> &get_qnn_output_tensors() override { return _qnn_tensor_outputs; }
const qnn_tensor_array_t &get_input_tensors() override { return _tensor_inputs; }
const qnn_tensor_array_t &get_output_tensors() override { return _tensor_outputs; }
protected:
std::string _name;
@ -130,8 +134,6 @@ protected:
std::vector<qnn_op_config_ptr_t> _operations;
qnn_tensor_array_t _tensor_inputs;
qnn_tensor_array_t _tensor_outputs;
std::vector<Qnn_Tensor_t> _qnn_tensor_inputs;
std::vector<Qnn_Tensor_t> _qnn_tensor_outputs;
private:
DISABLE_COPY(ggml_qnn_aggregate_op_config);
@ -143,9 +145,7 @@ public:
ggml_qnn_matmul_op_config(const std::string &name, std::shared_ptr<qnn_instance> qnn_instance)
: ggml_qnn_aggregate_op_config(name, qnn_instance) {}
bool initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle,
const ggml_tensor_array_t &tensor_inputs,
const ggml_tensor_array_t &tensor_outputs) override;
bool initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle) override;
private:
qnn_tensor_ptr_t create_gather_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, const int rank,

View File

@ -12,7 +12,9 @@ namespace qnn {
//
// helper data type / data structure / macros / functions of
// Qualcomm QNN(Qualcomm Neural Network, aka Qualcomm AI Engine Direct) SDK
// ref:https://github.com/pytorch/executorch/tree/main/backends/qualcomm
// ref:
// https://github.com/pytorch/executorch/blob/ae3d558d5e6aa04fc52a3065399fe6a773702f52/backends/qualcomm/serialization/qc_schema.py#L53
// https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/overview.html#supported-snapdragon-devices
// =================================================================================================
enum sdk_profile_level { profile_off = 0, profile_basic, profile_detail };
@ -22,14 +24,18 @@ enum qcom_htp_arch {
V69 = 69,
V73 = 73,
V75 = 75,
V79 = 79, // SD 8 Gen 4 (SM8750)
};
enum qcom_chipset {
UNKNOWN_SM = 0,
SM8450 = 36, // v69
SM8475 = 42, // v69
SM8550 = 43, // v73
SM8650 = 57, // v75
SM8450 = 36, // v69, SD 8 Gen 1
SM8475 = 42, // v69, SD 8+ Gen 1
SM8550 = 43, // v73, SD 8 Gen 2
SSG2115P = 46, // v73
SM8650 = 57, // v75, SD 8 Gen 3
SA8295 = 39, // v68
SM8750 = 69, // v79, SD 8 Gen 4
};
struct qcom_socinfo {

View File

@ -20,9 +20,9 @@ namespace qnn {
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS should be 4");
class ggml_qnn_tensor {
class ggml_qnn_tensor : public std::enable_shared_from_this<ggml_qnn_tensor> {
public:
typedef enum _tensor_type { INPUT, OUTPUT, INTERMEDIATE, PARAMETER } tensor_type_t;
typedef enum _tensor_type { INPUT, OUTPUT, INTERMEDIATE, PARAMETER, BIDIRECTION } tensor_type_t;
explicit ggml_qnn_tensor(tensor_type_t tensor_type, const std::string &name,
const qnn_dimension_array_t &dimensions, Qnn_DataType_t data_type, int rank,
@ -49,18 +49,27 @@ public:
qnn_datatype_from_ggml_datatype(data_type), rank, device, graph_handle, qnn_instance) {}
~ggml_qnn_tensor() {
_buffer_storage.clear();
unbind();
_rpc_buffer.reset();
unbind();
}
bool set_data_buffer(std::vector<uint8_t> &&buffer) {
if (!bind_buffer_impl(buffer.data(), buffer.size())) {
return false;
bool set_data_buffer(const uint8_t *buffer, const size_t buffer_size) {
auto qnn_buffer = std::make_shared<qnn_mem_buffer>(buffer, buffer_size);
if (bind_buffer_impl(qnn_buffer)) {
can_unbind = false;
return true;
}
_buffer_storage = std::move(buffer);
return true;
return false;
}
bool set_data_buffer(qnn_buffer_ptr buffer) {
if (bind_buffer_impl(buffer)) {
can_unbind = false;
return true;
}
return false;
}
bool alloc_qnn_tensor_id() {
@ -83,23 +92,32 @@ public:
return true;
}
bool bind_buffer(uint8_t *buffer, const size_t buffer_size) {
if (!_buffer_storage.empty()) {
bool bind_ggml_tensor(ggml_tensor *tensor) {
if (!can_unbind) {
QNN_LOG_DEBUG("[%s]already has buffer storage, skip bind", _tensor_name.c_str());
return true;
}
return bind_buffer_impl(buffer, buffer_size);
}
#ifndef NDEBUG
if (tensor->view_src) {
auto *src = tensor->view_src;
QNN_LOG_DEBUG("[%s]tensor(%s_%dx%dx%dx%d) is a view, src: %s_%dx%dx%dx%d", get_backend_name(_device),
tensor->name, tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], src->name,
src->ne[0], src->ne[1], src->ne[2], src->ne[3]);
}
#endif
bool bind_ggml_tensor(ggml_tensor *tensor) {
if (!bind_buffer(reinterpret_cast<uint8_t *>(tensor->data), ggml_nbytes(tensor))) {
auto buffer =
std::make_shared<qnn_mem_buffer_slice>(reinterpret_cast<uint8_t *>(tensor->data), ggml_nbytes(tensor));
if (!bind_buffer_impl(buffer)) {
QNN_LOG_WARN("[%s]failed to bind ggml tensor(%s)", _tensor_name.c_str(), ggml_get_name(tensor));
return false;
}
QNN_LOG_DEBUG("[%s][%s]bind to ggml tensor(%s)", get_backend_name(_device), _tensor_name.c_str(),
ggml_get_name(tensor));
tensor->extra = this;
_ggml_tensor = tensor;
return true;
}
@ -110,7 +128,7 @@ public:
}
if (!_buffer) {
QNN_LOG_DEBUG("[%s]bound to ggml tensor", _tensor_name.c_str());
QNN_LOG_DEBUG("[%s]unbind to ggml tensor", _tensor_name.c_str());
return true;
}
@ -119,7 +137,7 @@ public:
return false;
}
if (!_buffer_storage.empty()) {
if (!can_unbind) {
QNN_LOG_DEBUG("[%s]already has buffer storage, stop unbind", _tensor_name.c_str());
return true;
}
@ -132,26 +150,32 @@ public:
}
QNN_LOG_DEBUG("[%s][%s]unbind from buffer: %p, size: %d", get_backend_name(_device), _tensor_name.c_str(),
_buffer, (int)_buffer_size);
_buffer = nullptr;
_buffer_size = 0;
_buffer.get(), (int)_buffer->get_size());
_buffer.reset();
if (_ggml_tensor) {
_ggml_tensor->extra = nullptr;
_ggml_tensor = nullptr;
}
return true;
}
const Qnn_Tensor_t &get_qnn_tensor() const { return _qnn_tensor; }
Qnn_DataType_t get_data_type() const { return QNN_TENSOR_GET_DATA_TYPE(_qnn_tensor); }
const qnn_dimension_array_t &get_dimensions() const { return _dimensions; }
uint32_t get_rank() const { return QNN_TENSOR_GET_RANK(_qnn_tensor); }
uint32_t get_qnn_tensor_id() const { return QNN_TENSOR_GET_ID(_qnn_tensor); }
private:
bool bind_buffer_impl(uint8_t *buffer, const size_t buffer_size) {
bool bind_buffer_impl(qnn_buffer_ptr buffer) {
if (_buffer) {
if (_buffer != buffer) {
QNN_LOG_WARN("[%s]has been bound to another buffer %p", _tensor_name.c_str(), _buffer);
QNN_LOG_WARN("[%s]has been bound to another buffer %p", _tensor_name.c_str(), _buffer.get());
return false;
}
QNN_LOG_DEBUG("[%s]already bound to same ggml tensor %p", _tensor_name.c_str(), _buffer);
QNN_LOG_DEBUG("[%s]already bound to same ggml tensor %p", _tensor_name.c_str(), _buffer.get());
return true;
}
@ -164,7 +188,7 @@ private:
if (should_use_mem_handle()) {
if (!_rpc_buffer) {
auto rpc_buffer = std::make_shared<qnn_rpc_buffer>(
_qnn_instance, buffer_size, QNN_TENSOR_GET_RANK(_qnn_tensor),
_qnn_instance, buffer->get_size(), QNN_TENSOR_GET_RANK(_qnn_tensor),
QNN_TENSOR_GET_DIMENSIONS(_qnn_tensor), QNN_TENSOR_GET_DATA_TYPE(_qnn_tensor));
if (!rpc_buffer->is_valid()) {
QNN_LOG_WARN("[%s][%s]alloc rpc mem failed", get_backend_name(_device), _tensor_name.c_str());
@ -187,22 +211,21 @@ private:
QNN_TENSOR_GET_MEM_HANDLE(_qnn_tensor));
} else {
QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_RAW);
Qnn_ClientBuffer_t client_buf = {buffer, (uint32_t)buffer_size};
Qnn_ClientBuffer_t client_buf = {buffer->get_buffer(), (uint32_t)buffer->get_size()};
QNN_TENSOR_SET_CLIENT_BUF(_qnn_tensor, client_buf);
QNN_LOG_DEBUG("[%s]use client buffer %p size %d", _tensor_name.c_str(), client_buf.data,
(int)client_buf.dataSize);
}
_buffer = buffer;
_buffer_size = buffer_size;
if (!write_to_qnn_tensor()) {
QNN_LOG_WARN("[%s]write to qnn tensor failed", _tensor_name.c_str());
return false;
}
QNN_LOG_DEBUG("[%s][%s]bind to buffer: %p, size: %d", get_backend_name(_device), _tensor_name.c_str(), buffer,
(int)buffer_size);
QNN_LOG_DEBUG("[%s][%s]bind to buffer: %p, size: %d", get_backend_name(_device), _tensor_name.c_str(),
buffer.get(), (int)buffer->get_size());
return true;
}
@ -214,7 +237,7 @@ private:
}
if (_rpc_buffer) {
memcpy(_rpc_buffer->get_buffer(), _buffer, _buffer_size);
memcpy(_rpc_buffer->get_buffer(), _buffer->get_buffer(), _buffer->get_size());
}
// For CPU and GPU, the data is already in the tensor.
@ -230,7 +253,7 @@ private:
}
if (_rpc_buffer) {
memcpy(_buffer, _rpc_buffer->get_buffer(), _buffer_size);
memcpy(_buffer->get_buffer(), _rpc_buffer->get_buffer(), _buffer->get_size());
}
// For CPU and GPU, the data is already in the tensor.
@ -258,6 +281,9 @@ private:
case PARAMETER:
new_tensor_type = QNN_TENSOR_TYPE_STATIC;
break;
case BIDIRECTION:
new_tensor_type = QNN_TENSOR_TYPE_APP_READWRITE;
break;
case INTERMEDIATE:
default:
new_tensor_type = QNN_TENSOR_TYPE_NATIVE;
@ -273,15 +299,15 @@ private:
}
std::string _tensor_name;
uint8_t *_buffer = nullptr;
size_t _buffer_size = 0;
std::vector<uint8_t> _buffer_storage;
qnn_buffer_ptr _buffer;
bool can_unbind = true;
QNNBackend _device;
std::shared_ptr<qnn_instance> _qnn_instance;
Qnn_Tensor_t _qnn_tensor = qnn_tensor_init(kDefaultQnnTensorVersion);
qnn_dimension_array_t _dimensions = {};
Qnn_GraphHandle_t _graph_handle = nullptr;
qnn_buffer_ptr _rpc_buffer;
ggml_tensor *_ggml_tensor = nullptr;
DISABLE_COPY(ggml_qnn_tensor);
DISABLE_MOVE(ggml_qnn_tensor);
@ -289,5 +315,92 @@ private:
using qnn_tensor_ptr_t = std::shared_ptr<ggml_qnn_tensor>;
using qnn_tensor_array_t = std::vector<qnn_tensor_ptr_t>;
using ggml_tensor_array_t = std::vector<ggml_tensor *>;
inline qnn_tensor_ptr_t get_qnn_tensor_ptr(ggml_tensor *ggml_tensor) {
return ggml_tensor->extra ? reinterpret_cast<ggml_qnn_tensor *>(ggml_tensor->extra)->shared_from_this()
: qnn_tensor_ptr_t();
}
inline int get_ggml_tensors_max_rank(const qnn::ggml_tensor_array_t &tensors) {
int max_rank = 0;
for (auto tensor : tensors) {
max_rank = std::max(max_rank, ggml_n_dims(tensor));
}
return max_rank;
}
inline bool bind_tensors(const ggml_tensor_array_t &ggml_tensors, qnn_tensor_array_t &tensor_wrappers,
std::vector<Qnn_Tensor_t> &qnn_tensors) {
GGML_ASSERT(tensor_wrappers.size() == ggml_tensors.size());
qnn_tensors.resize(ggml_tensors.size());
for (size_t i = 0; i < ggml_tensors.size(); i++) {
auto *ggml_tensor = ggml_tensors[i];
if (!tensor_wrappers[i]->bind_ggml_tensor(ggml_tensor)) {
QNN_LOG_ERROR("bind tensor %s failed", ggml_get_name(ggml_tensor));
return false;
}
qnn_tensors[i] = tensor_wrappers[i]->get_qnn_tensor();
}
return true;
}
inline bool bind_tensors(const ggml_tensor_array_t &ggml_tensors, qnn_tensor_array_t &tensor_wrappers) {
GGML_ASSERT(tensor_wrappers.size() == ggml_tensors.size());
for (size_t i = 0; i < ggml_tensors.size(); i++) {
auto *ggml_tensor = ggml_tensors[i];
if (!tensor_wrappers[i]->bind_ggml_tensor(ggml_tensor)) {
QNN_LOG_ERROR("bind tensor %s failed", ggml_get_name(ggml_tensor));
return false;
}
}
return true;
}
inline void unbind_tensors(qnn_tensor_array_t &tensor_wrappers) {
for (auto &tensor : tensor_wrappers) {
tensor->unbind();
}
}
struct tensor_create_common_params {
const char *name_prefix;
int tensor_rank;
bool is_input;
QNNBackend device;
Qnn_GraphHandle_t graph_handle;
std::shared_ptr<qnn::qnn_instance> qnn_instance;
};
inline void create_tensors_from_ggml_tensor(const tensor_create_common_params &params,
const ggml_tensor_array_t &ggml_tensors,
qnn_tensor_array_t *tensor_wrappers,
std::vector<Qnn_Tensor_t> *qnn_tensors) {
if (qnn_tensors) {
qnn_tensors->resize(ggml_tensors.size());
}
if (!tensor_wrappers->empty()) {
QNN_LOG_DEBUG("tensor_wrappers is not empty, skip create tensors");
GGML_ASSERT(tensor_wrappers->size() == ggml_tensors.size());
return;
}
tensor_wrappers->resize(ggml_tensors.size());
char buffer[GGML_MAX_NAME] = {};
auto tensor_type = params.is_input ? ggml_qnn_tensor::INPUT : ggml_qnn_tensor::OUTPUT;
for (size_t i = 0; i < ggml_tensors.size(); i++) {
snprintf(buffer, GGML_MAX_NAME, "%s%d", params.name_prefix, (int)i);
auto *ggml_tensor = ggml_tensors[i];
(*tensor_wrappers)[i] = std::make_shared<ggml_qnn_tensor>(tensor_type, std::string(buffer), ggml_tensor->ne,
ggml_tensor->type, params.tensor_rank, params.device,
params.graph_handle, params.qnn_instance);
}
}
} // namespace qnn
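A hedged sketch tying the helpers above together for a graph's input side (assumes `ggml_inputs` was collected from the ggml graph, and `device`, `graph_handle` and `qnn_instance` come from the backend context):
qnn::tensor_create_common_params params = {"src", qnn::get_ggml_tensors_max_rank(ggml_inputs), true,
                                           device, graph_handle, qnn_instance};
qnn::qnn_tensor_array_t wrappers;
std::vector<Qnn_Tensor_t> qnn_tensors;
qnn::create_tensors_from_ggml_tensor(params, ggml_inputs, &wrappers, &qnn_tensors);
if (qnn::bind_tensors(ggml_inputs, wrappers, qnn_tensors)) {
    // ... finalize and execute the graph ...
    qnn::unbind_tensors(wrappers);
}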

View File

@ -188,13 +188,15 @@ const char *get_backend_name(QNNBackend device_index) {
const char *get_chipset_desc(uint32_t chipset_id) {
switch (chipset_id) {
case SM8450:
return "SM8450";
return "SD 8 Gen 1 (SM8450)";
case SM8475:
return "SM8475";
return "SD 8+ Gen 1 (SM8475)";
case SM8550:
return "SM8550";
return "SD 8 Gen 2 (SM8550)";
case SM8650:
return "SM8650";
return "SD 8 Gen 3 (SM8650)";
case SM8750:
return "SD 8 Gen 4 (SM8750)";
default:
return "unknown";
}
@ -210,6 +212,8 @@ const char *get_htparch_desc(size_t htp_arch) {
return "QCOM_HTP_V73";
case V75:
return "QCOM_HTP_V75";
case V79:
return "QCOM_HTP_V79";
default:
return "unknown";
}