[feat] Port ggml graph to QNN graph (#16)
* more log
* split graph implementation into cpp file
* rename: ggml_qnn_graph -> qnn_graph
* add input/output tensor to graph
* fix assert
* wip
* add _ggml_tensor field in qnn tensor
* add comments
* add set_data_buffer with raw memory buffer
* use set_data_buffer
* op param buffer uses qnn_buffer_ptr
* add qnn_mem_buffer_slice
* use qnn_buffer_ptr as tensor buffer
* use new set_data_buffer to reduce copy
* ggml_qnn_op_config: add function to set input/output tensor before init node
* remove ggml_qnn_connectable_op_config and use ggml_qnn_single_op_config instead
* wip
* add initialize_op_nodes without tensor params
* wip
* add op caps table
* merge kGgmlOpToQnnOp and kOpCaps tables
* wip
* add cache parameter to create_tensors
* add init_from_ggml_graph
* disable gelu for all backends
* wip
* move op index calculation to op config module
* use the ggml_tensor as parameter of build_graph
* add log
* use create_operation_from_op_tensor in old build_graph function
* remove unused constructors
* fix parameter count
* remove unused member func/var
* make init_from_ggml_graph a class member: build_graph_from_ggml_graph
* move graph finalize into member function `finalize()`
* get graph key from ggml op tensor directly
* append output type
* reduce tensor key length
* add function to generate key from ggml_cgraph
* simplify graph cache insert and delete
* remove template param at get_qnn_graph_from_cache
* wip
* merge kQnnUnaryOpsTable and kQnnBinaryOpsTable
* refactor device_supports_op
* add log
* wip
* use framework function to check same shape
* wip
* extract some logic into a separate function
* wip
* add execution function that runs the graph
* add function to create qnn graph from ggml_cgraph with cache
* execute graph directly
* return null graph key for empty graph
* add more Qualcomm chipset enums
* add cap for reshape
* disable some ops
* try to skip GGML_OP_VIEW
* more log for view tensor
* append param tensor into intermediate tensor key
* use 'ordered' set
* fix warning in release
* wip
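The net effect of these changes: instead of building one QNN graph per ggml op, the backend now lowers a whole ggml_cgraph into a single QNN graph and caches it by a key derived from the ops and tensor shapes (e.g. "MUL_MATf32_256x16x10f32_256x1x10f32#LOG#ADD#ADDf32_16x1x10f32"). A condensed sketch of that flow, simplified from get_qnn_graph_from_cache in this diff (error logging and the NPU-specific graph configuration are omitted):

qnn::qnn_graph *get_qnn_graph_from_cache(ggml_backend_qnn_device_context *ctx, const ggml_cgraph *cgraph) {
    std::string key;
    get_graph_key_from_cgraph(cgraph, key);     // ops + tensor shapes, e.g. "MUL_MATf32_..._#LOG#ADD..."
    if (key.empty()) {
        return nullptr;                         // empty graphs are not cached
    }

    auto it = ctx->qnn_graph_cache.find(key);
    if (it != ctx->qnn_graph_cache.end()) {
        return it->second.get();                // reuse the already finalized QNN graph
    }

    auto graph = std::make_unique<qnn::qnn_graph>(key, ctx->device, ctx->instance, ctx->socinfo.vtcm_size_in_mb);
    if (!graph->is_valid() || !graph->build_graph_from_ggml_graph(cgraph)) {
        return nullptr;
    }

    auto *graph_ptr = graph.get();
    ctx->qnn_graph_cache[key] = std::move(graph);
    return graph_ptr;                           // device_compute_graph then calls graph_ptr->execute(cgraph)
}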
Parent: 8f07b3e3f6
Commit: f2d8d017da
@@ -11,12 +11,10 @@
#include "tensor.hpp"
#include "utils.hpp"

#ifndef NDEBUG

namespace {

bool qnn_is_valid_params(ggml_backend_qnn_device_context *ctx, const ggml_tensor *src, ggml_tensor *dst) {
if (!ctx || !src || !dst) {
bool qnn_is_op_valid(ggml_backend_qnn_device_context *ctx, const ggml_tensor *dst) {
if (!ctx || !dst) {
QNN_LOG_WARN("invalid params");
return false;
}
@@ -27,77 +25,36 @@ bool qnn_is_valid_params(ggml_backend_qnn_device_context *ctx, const ggml_tensor
return false;
}

return true;
}

bool qnn_is_valid_params(ggml_backend_qnn_device_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1,
ggml_tensor *dst) {
if (!ctx || !src0 || !src1 || !dst) {
QNN_LOG_WARN("invalid params");
return false;
const auto param_count = qnn::get_qnn_op_input_param_count(qnn::get_qnn_op_index(dst));
switch (param_count) {
case 1:
return dst->src[0];
case 2:
return dst->src[0] && dst->src[1];
default:
QNN_LOG_WARN("invalid op param count %d", (int)param_count);
break;
}

auto instance = ctx->instance;
if (!instance) {
QNN_LOG_WARN("invalid instance");
return false;
}

return true;
return false;
}
|
||||
#ifndef NDEBUG
|
||||
void print_ggml_tensor(const ggml_tensor *tensor) {
|
||||
QNN_LOG_DEBUG("%s: type:%s ne: %ldx%ldx%ldx%ld, nb: %ldx%ldx%ldx%ld", tensor->name, ggml_type_name(tensor->type),
|
||||
(long)tensor->ne[0], (long)tensor->ne[1], (long)tensor->ne[2], (long)tensor->ne[3],
|
||||
(long)tensor->nb[0], (long)tensor->nb[1], (long)tensor->nb[2], (long)tensor->nb[3]);
|
||||
}
|
||||
#endif
|
||||
|
||||
} // namespace
|
||||
|
||||
#define CHECK_PARAMS(ctx, ...) \
|
||||
if (!qnn_is_valid_params((ctx), __VA_ARGS__)) { \
|
||||
return false; \
|
||||
}
|
||||
|
||||
#else
|
||||
#define CHECK_PARAMS(ctx, ...)
|
||||
#endif
|
||||
|
||||
namespace {
|
||||
|
||||
bool is_tensor_dimensions_equal(const ggml_tensor *l, const ggml_tensor *r) {
|
||||
const auto dim_l = ggml_n_dims(l);
|
||||
if (dim_l != ggml_n_dims(r)) {
|
||||
return false;
|
||||
}
|
||||
typedef bool (*ggml_qnn_op_t)(ggml_backend_qnn_device_context *ctx, ggml_tensor *dst);
|
||||
|
||||
for (int i = 0; i < dim_l; i++) {
|
||||
if (l->ne[i] != r->ne[i]) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
typedef bool (*ggml_qnn_unary_op_t)(ggml_backend_qnn_device_context *ctx, ggml_tensor *src, ggml_tensor *dst);
|
||||
typedef bool (*ggml_qnn_binary_op_t)(ggml_backend_qnn_device_context *ctx, ggml_tensor *src0, ggml_tensor *src1,
|
||||
ggml_tensor *dst);
|
||||
|
||||
typedef const ggml_qnn_unary_op_t (&ggml_qnn_unary_op_array_t)[GGML_OP_COUNT + GGML_UNARY_OP_COUNT];
|
||||
typedef const ggml_qnn_binary_op_t (&ggml_qnn_binary_op_array_t)[GGML_OP_COUNT];
|
||||
|
||||
constexpr const size_t kGgmlUnaryOpStart = GGML_OP_COUNT;
|
||||
|
||||
template <size_t _Size>
|
||||
qnn::ggml_tensor_array_t to_ggml_tensor_array(const std::array<ggml_tensor *, _Size> &array) {
|
||||
return qnn::ggml_tensor_array_t(array.data(), array.data() + _Size);
|
||||
}
|
||||
|
||||
template <size_t _InputSize>
|
||||
bool execute_graph(qnn::ggml_qnn_graph *graph, const std::array<ggml_tensor *, _InputSize> &inputs,
|
||||
ggml_tensor *output) {
|
||||
if (!graph->execute(to_ggml_tensor_array<_InputSize>(inputs), to_ggml_tensor_array<1>({output}))) {
|
||||
bool execute_graph(qnn::qnn_graph *graph, ggml_tensor *output) {
|
||||
if (!graph->execute(output)) {
|
||||
QNN_LOG_WARN("execute failed");
|
||||
return false;
|
||||
}
|
||||
|
|
@ -105,165 +62,114 @@ bool execute_graph(qnn::ggml_qnn_graph *graph, const std::array<ggml_tensor *, _
|
|||
return true;
|
||||
}
|
||||
|
||||
template <size_t _InputSize, size_t _OutputSize>
|
||||
std::string get_graph_key(const std::string &op_name, const std::array<ggml_tensor *, _InputSize> &inputs,
|
||||
const std::array<ggml_tensor *, _OutputSize> &outputs) {
|
||||
constexpr static const auto append_dimensions = [](std::string &key, const ggml_tensor *tensor) {
|
||||
char buffer[256] = {};
|
||||
snprintf(buffer, sizeof(buffer), "_%ldx%ldx%ldx%ld%s", (long)tensor->ne[0], (long)tensor->ne[1],
|
||||
(long)tensor->ne[2], (long)tensor->ne[3], qnn::get_ggml_type_name(tensor->type));
|
||||
key += buffer;
|
||||
};
|
||||
|
||||
std::string graph_key(op_name);
|
||||
for (auto &input : inputs) {
|
||||
append_dimensions(graph_key, input);
|
||||
void append_tensor_dimensions(const ggml_tensor *tensor, std::string &output) {
|
||||
char buffer[256] = {};
|
||||
const auto *type_name = qnn::get_ggml_type_name(tensor->type);
|
||||
int len = 0;
|
||||
switch (ggml_n_dims(tensor)) {
|
||||
case 1:
|
||||
len = snprintf(buffer, sizeof(buffer), "%ld%s", (long)tensor->ne[0], type_name);
|
||||
break;
|
||||
case 2:
|
||||
len = snprintf(buffer, sizeof(buffer), "%ldx%ld%s", (long)tensor->ne[0], (long)tensor->ne[1], type_name);
|
||||
break;
|
||||
case 3:
|
||||
len = snprintf(buffer, sizeof(buffer), "%ldx%ldx%ld%s", (long)tensor->ne[0], (long)tensor->ne[1],
|
||||
(long)tensor->ne[2], type_name);
|
||||
break;
|
||||
case 4:
|
||||
default:
|
||||
len = snprintf(buffer, sizeof(buffer), "%ldx%ldx%ldx%ld%s", (long)tensor->ne[0], (long)tensor->ne[1],
|
||||
(long)tensor->ne[2], (long)tensor->ne[3], type_name);
|
||||
break;
|
||||
}
|
||||
|
||||
graph_key += qnn::get_ggml_type_name(outputs.front()->type);
|
||||
return graph_key;
|
||||
GGML_ASSERT(len > 0 && len < (int)sizeof(buffer));
|
||||
output.append(buffer, len);
|
||||
}
|
||||
|
||||
constexpr const char *kGgmlOpToQnnOp[] = {
|
||||
nullptr, // GGML_OP_NONE
|
||||
nullptr, // GGML_OP_DUP
|
||||
QNN_OP_ELEMENT_WISE_ADD, // GGML_OP_ADD
|
||||
nullptr, // GGML_OP_ADD1
|
||||
nullptr, // GGML_OP_ACC
|
||||
QNN_OP_ELEMENT_WISE_SUBTRACT, // GGML_OP_SUB
|
||||
QNN_OP_ELEMENT_WISE_MULTIPLY, // GGML_OP_MUL
|
||||
QNN_OP_ELEMENT_WISE_DIVIDE, // GGML_OP_DIV
|
||||
nullptr, // GGML_OP_SQR
|
||||
QNN_OP_ELEMENT_WISE_SQUARE_ROOT, // GGML_OP_SQRT
|
||||
QNN_OP_ELEMENT_WISE_LOG, // GGML_OP_LOG
|
||||
nullptr, // GGML_OP_SIN
|
||||
nullptr, // GGML_OP_COS
|
||||
nullptr, // GGML_OP_SUM
|
||||
nullptr, // GGML_OP_SUM_ROWS
|
||||
nullptr, // GGML_OP_MEAN
|
||||
nullptr, // GGML_OP_ARGMAX
|
||||
nullptr, // GGML_OP_COUNT_EQUAL
|
||||
nullptr, // GGML_OP_REPEAT
|
||||
nullptr, // GGML_OP_REPEAT_BACK
|
||||
nullptr, // GGML_OP_CONCAT
|
||||
nullptr, // GGML_OP_SILU_BACK
|
||||
nullptr, // GGML_OP_NORM
|
||||
nullptr, // GGML_OP_RMS_NORM
|
||||
nullptr, // GGML_OP_RMS_NORM_BACK
|
||||
nullptr, // GGML_OP_GROUP_NORM
|
||||
void get_graph_key_from_op(const ggml_tensor *op, std::string &output) {
|
||||
GGML_ASSERT(op->op != GGML_OP_NONE);
|
||||
output += ggml_op_desc(op);
|
||||
output += qnn::get_ggml_type_name(op->type);
|
||||
const auto param_count = qnn::get_qnn_op_input_param_count(qnn::get_qnn_op_index(op));
|
||||
for (size_t i = 0; i < param_count; ++i) {
|
||||
auto *input = op->src[i];
|
||||
output += '_';
|
||||
append_tensor_dimensions(input, output);
|
||||
}
|
||||
}
|
||||
|
||||
QNN_OP_MAT_MUL, // GGML_OP_MUL_MAT
|
||||
nullptr, // GGML_OP_MUL_MAT_ID
|
||||
nullptr, // GGML_OP_OUT_PROD
|
||||
void get_op_key_with_src_op_desc(const ggml_tensor *op, std::string &output) {
|
||||
output += ggml_op_desc(op);
|
||||
output += '(';
|
||||
if (op->src[0]) {
|
||||
output += ggml_op_desc(op->src[0]);
|
||||
}
|
||||
for (size_t i = 1; i < GGML_MAX_DIMS && op->src[i]; ++i) {
|
||||
output += ',';
|
||||
output += ggml_op_desc(op->src[i]);
|
||||
}
|
||||
output += ')';
|
||||
}
|
||||
|
||||
nullptr, // GGML_OP_SCALE
|
||||
nullptr, // GGML_OP_SET
|
||||
nullptr, // GGML_OP_CPY
|
||||
nullptr, // GGML_OP_CONT
|
||||
nullptr, // GGML_OP_RESHAPE
|
||||
nullptr, // GGML_OP_VIEW
|
||||
QNN_OP_TRANSPOSE, // GGML_OP_PERMUTE
|
||||
nullptr, // GGML_OP_TRANSPOSE
|
||||
nullptr, // GGML_OP_GET_ROWS
|
||||
nullptr, // GGML_OP_GET_ROWS_BACK
|
||||
nullptr, // GGML_OP_DIAG
|
||||
nullptr, // GGML_OP_DIAG_MASK_INF
|
||||
nullptr, // GGML_OP_DIAG_MASK_ZERO
|
||||
nullptr, // GGML_OP_SOFT_MAX
|
||||
nullptr, // GGML_OP_SOFT_MAX_BACK
|
||||
nullptr, // GGML_OP_ROPE
|
||||
nullptr, // GGML_OP_ROPE_BACK
|
||||
nullptr, // GGML_OP_CLAMP
|
||||
nullptr, // GGML_OP_CONV_TRANSPOSE_1D
|
||||
nullptr, // GGML_OP_IM2COL
|
||||
nullptr, // GGML_OP_IM2COL_BACK
|
||||
nullptr, // GGML_OP_CONV_TRANSPOSE_2D
|
||||
nullptr, // GGML_OP_POOL_1D
|
||||
nullptr, // GGML_OP_POOL_2D
|
||||
nullptr, // GGML_OP_POOL_2D_BACK
|
||||
nullptr, // GGML_OP_UPSCALE
|
||||
nullptr, // GGML_OP_PAD
|
||||
nullptr, // GGML_OP_PAD_REFLECT_1D
|
||||
nullptr, // GGML_OP_ARANGE
|
||||
nullptr, // GGML_OP_TIMESTEP_EMBEDDING
|
||||
nullptr, // GGML_OP_ARGSORT
|
||||
nullptr, // GGML_OP_LEAKY_RELU
|
||||
void get_graph_key_from_cgraph(const ggml_cgraph *cgraph, std::string &output) {
|
||||
// generate key from the graph, the key is used to cache the graph, like:
|
||||
// "MUL_MATf32_256x16x10f32_256x1x10f32#LOG#ADD#ADDf32_16x1x10f32"
|
||||
if (cgraph->n_nodes == 0) {
|
||||
QNN_LOG_DEBUG("empty cgraph");
|
||||
return;
|
||||
}
|
||||
|
||||
nullptr, // GGML_OP_FLASH_ATTN_EXT
|
||||
nullptr, // GGML_OP_FLASH_ATTN_BACK
|
||||
nullptr, // GGML_OP_SSM_CONV
|
||||
nullptr, // GGML_OP_SSM_SCAN
|
||||
nullptr, // GGML_OP_WIN_PART
|
||||
nullptr, // GGML_OP_WIN_UNPART
|
||||
nullptr, // GGML_OP_GET_REL_POS
|
||||
nullptr, // GGML_OP_ADD_REL_POS
|
||||
nullptr, // GGML_OP_RWKV_WKV6
|
||||
{
|
||||
bool is_start = true;
|
||||
for (int i = 0; i < cgraph->n_nodes; ++i) {
|
||||
auto *op = cgraph->nodes[i];
|
||||
if (ggml_is_empty(op)) {
|
||||
QNN_LOG_DEBUG("empty op in graph, skipping");
|
||||
continue;
|
||||
}
|
||||
|
||||
nullptr, // GGML_OP_UNARY
|
||||
if (op->op == GGML_OP_NONE) {
|
||||
QNN_LOG_DEBUG("GGML_OP_NONE in graph, skipping");
|
||||
continue;
|
||||
}
|
||||
|
||||
nullptr, // GGML_OP_MAP_UNARY
|
||||
nullptr, // GGML_OP_MAP_BINARY
|
||||
if (is_start) {
|
||||
get_graph_key_from_op(cgraph->nodes[0], output);
|
||||
is_start = false;
|
||||
} else {
|
||||
output += '#';
|
||||
get_op_key_with_src_op_desc(op, output);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
nullptr, // GGML_OP_MAP_CUSTOM1_F32
|
||||
nullptr, // GGML_OP_MAP_CUSTOM2_F32
|
||||
nullptr, // GGML_OP_MAP_CUSTOM3_F32
|
||||
|
||||
nullptr, // GGML_OP_MAP_CUSTOM1
|
||||
nullptr, // GGML_OP_MAP_CUSTOM2
|
||||
nullptr, // GGML_OP_MAP_CUSTOM3
|
||||
|
||||
nullptr, // GGML_OP_CROSS_ENTROPY_LOSS
|
||||
nullptr, // GGML_OP_CROSS_ENTROPY_LOSS_BACK
|
||||
nullptr, // GGML_OP_OPT_STEP_ADAMW
|
||||
|
||||
// ggml_unary_op
|
||||
nullptr, // GGML_UNARY_OP_ABS
|
||||
nullptr, // GGML_UNARY_OP_SGN
|
||||
nullptr, // GGML_UNARY_OP_NEG
|
||||
nullptr, // GGML_UNARY_OP_STEP
|
||||
nullptr, // GGML_UNARY_OP_TANH
|
||||
nullptr, // GGML_UNARY_OP_ELU
|
||||
nullptr, // GGML_UNARY_OP_RELU
|
||||
nullptr, // GGML_UNARY_OP_SIGMOID
|
||||
QNN_OP_GELU, // GGML_UNARY_OP_GELU
|
||||
nullptr, // GGML_UNARY_OP_GELU_QUICK
|
||||
nullptr, // GGML_UNARY_OP_SILU
|
||||
nullptr, // GGML_UNARY_OP_HARDSWISH
|
||||
nullptr, // GGML_UNARY_OP_HARDSIGMOID
|
||||
nullptr, // GGML_UNARY_OP_EXP
|
||||
};
|
||||
|
||||
static_assert(sizeof(kGgmlOpToQnnOp) / sizeof(kGgmlOpToQnnOp[0]) == (GGML_OP_COUNT + GGML_UNARY_OP_COUNT),
|
||||
"GGML_OP_COUNT does not match the size of the kGgmlOpToQnnOp table");
|
||||
static_assert(kGgmlOpToQnnOp[GGML_UNARY_OP_GELU + kGgmlUnaryOpStart] != nullptr,
|
||||
"GGML_UNARY_OP_GELU does not correspond to QNN_OP_GELU");
|
||||
|
||||
template <size_t _InputSize>
|
||||
qnn::ggml_qnn_graph *get_qnn_graph_from_cache(ggml_backend_qnn_device_context *ctx, size_t op,
|
||||
const std::array<ggml_tensor *, _InputSize> &inputs,
|
||||
ggml_tensor *output) {
|
||||
GGML_ASSERT(op < (GGML_OP_COUNT + GGML_UNARY_OP_COUNT));
|
||||
if (cgraph->n_nodes > 1) {
|
||||
auto *last_op = cgraph->nodes[cgraph->n_nodes - 1];
|
||||
output += qnn::get_ggml_type_name(last_op->type);
|
||||
output += '_';
|
||||
append_tensor_dimensions(last_op, output);
|
||||
}
|
||||
}
|
||||
|
||||
qnn::qnn_graph *get_qnn_graph_from_cache(ggml_backend_qnn_device_context *ctx, ggml_tensor *output) {
|
||||
auto &graph_cache = ctx->qnn_graph_cache;
|
||||
const auto *op_name =
|
||||
op < kGgmlUnaryOpStart ? ggml_op_name(ggml_op(op)) : ggml_unary_op_name(ggml_unary_op(op - kGgmlUnaryOpStart));
|
||||
auto graph_key = get_graph_key<_InputSize, 1>(op_name, inputs, {output});
|
||||
std::string graph_key;
|
||||
get_graph_key_from_op(output, graph_key);
|
||||
auto it = graph_cache.find(graph_key);
|
||||
qnn::ggml_qnn_graph *graph_ptr = nullptr;
|
||||
qnn::qnn_graph *graph_ptr = nullptr;
|
||||
if (it != graph_cache.end()) {
|
||||
QNN_LOG_DEBUG("[%s]found graph %s in cache", qnn::get_backend_name(ctx->device), graph_key.c_str());
|
||||
graph_ptr = it->second.get();
|
||||
} else {
|
||||
auto graph =
|
||||
std::make_unique<qnn::ggml_qnn_graph>(graph_key, ctx->device, ctx->instance, ctx->socinfo.vtcm_size_in_mb);
|
||||
std::make_unique<qnn::qnn_graph>(graph_key, ctx->device, ctx->instance, ctx->socinfo.vtcm_size_in_mb);
|
||||
if (!graph->is_valid()) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
auto op_constructor = qnn::create_op_constructor(kGgmlOpToQnnOp[op]);
|
||||
if (!graph->build_graph(op_constructor, to_ggml_tensor_array<_InputSize>(inputs),
|
||||
to_ggml_tensor_array<1>({output}))) {
|
||||
QNN_LOG_ERROR("[%s]build_graph failed", qnn::get_backend_name(ctx->device));
|
||||
if (!graph->build_graph_from_op(output)) {
|
||||
QNN_LOG_ERROR("[%s]build_graph_from_op failed", qnn::get_backend_name(ctx->device));
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
|
|
@ -274,22 +180,54 @@ qnn::ggml_qnn_graph *get_qnn_graph_from_cache(ggml_backend_qnn_device_context *c
|
|||
return graph_ptr;
|
||||
}
|
||||
|
||||
template <ggml_op _GgmlOp>
|
||||
bool qnn_binary_op_impl(ggml_backend_qnn_device_context *ctx, ggml_tensor *src0, ggml_tensor *src1, ggml_tensor *dst) {
|
||||
static_assert(kGgmlOpToQnnOp[_GgmlOp] != nullptr, "GGML_OP does not have a corresponding QNN_OP");
|
||||
|
||||
CHECK_PARAMS(ctx, src0, src1, dst);
|
||||
|
||||
bool succeed = false;
|
||||
auto *graph_ptr = get_qnn_graph_from_cache<2>(ctx, _GgmlOp, {src0, src1}, dst);
|
||||
if (graph_ptr) {
|
||||
succeed = execute_graph<2>(graph_ptr, {src0, src1}, dst);
|
||||
qnn::qnn_graph *get_qnn_graph_from_cache(ggml_backend_qnn_device_context *ctx, const ggml_cgraph *cgraph) {
|
||||
auto &graph_cache = ctx->qnn_graph_cache;
|
||||
std::string graph_key;
|
||||
get_graph_key_from_cgraph(cgraph, graph_key);
|
||||
if (graph_key.empty()) {
|
||||
QNN_LOG_DEBUG("[%s]empty graph key for cgraph: %p, size: %d", qnn::get_backend_name(ctx->device), cgraph,
|
||||
(int)cgraph->n_nodes);
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
auto it = graph_cache.find(graph_key);
|
||||
qnn::qnn_graph *graph_ptr = nullptr;
|
||||
if (it != graph_cache.end()) {
|
||||
QNN_LOG_DEBUG("[%s]found graph %s in cache", qnn::get_backend_name(ctx->device), graph_key.c_str());
|
||||
graph_ptr = it->second.get();
|
||||
} else {
|
||||
auto graph =
|
||||
std::make_unique<qnn::qnn_graph>(graph_key, ctx->device, ctx->instance, ctx->socinfo.vtcm_size_in_mb);
|
||||
if (!graph->is_valid()) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
if (!graph->build_graph_from_ggml_graph(cgraph)) {
QNN_LOG_ERROR("[%s]build_graph_from_ggml_graph failed", qnn::get_backend_name(ctx->device));
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
graph_ptr = graph.get();
|
||||
graph_cache[graph_key] = std::move(graph);
|
||||
}
|
||||
|
||||
return graph_ptr;
|
||||
}
|
||||
|
||||
bool qnn_generic_op_impl(ggml_backend_qnn_device_context *ctx, ggml_tensor *dst) {
|
||||
if (!qnn_is_op_valid(ctx, dst)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
auto *graph_ptr = get_qnn_graph_from_cache(ctx, dst);
|
||||
bool succeed = graph_ptr && execute_graph(graph_ptr, dst);
|
||||
|
||||
#ifndef NDEBUG
|
||||
if (!succeed) {
|
||||
print_ggml_tensor(src0);
|
||||
print_ggml_tensor(src1);
|
||||
const auto param_count = qnn::get_qnn_op_input_param_count(qnn::get_qnn_op_index(dst));
|
||||
for (size_t i = 0; i < param_count; ++i) {
|
||||
print_ggml_tensor(dst->src[i]);
|
||||
}
|
||||
print_ggml_tensor(dst);
|
||||
}
|
||||
#endif
|
||||
|
|
@ -297,219 +235,76 @@ bool qnn_binary_op_impl(ggml_backend_qnn_device_context *ctx, ggml_tensor *src0,
|
|||
return succeed;
|
||||
}
|
||||
|
||||
template <size_t _GgmlOp>
|
||||
bool qnn_unary_op_impl(ggml_backend_qnn_device_context *ctx, ggml_tensor *src, ggml_tensor *dst) {
|
||||
static_assert(kGgmlOpToQnnOp[_GgmlOp] != nullptr, "GGML_OP does not have a corresponding QNN_OP");
|
||||
|
||||
CHECK_PARAMS(ctx, src, dst);
|
||||
|
||||
bool succeed = false;
|
||||
auto *graph_ptr = get_qnn_graph_from_cache<1>(ctx, _GgmlOp, {src}, dst);
|
||||
if (graph_ptr) {
|
||||
succeed = execute_graph<1>(graph_ptr, {src}, dst);
|
||||
}
|
||||
|
||||
#ifndef NDEBUG
|
||||
if (!succeed) {
|
||||
print_ggml_tensor(src);
|
||||
print_ggml_tensor(dst);
|
||||
}
|
||||
#endif
|
||||
|
||||
return succeed;
|
||||
}
|
||||
|
||||
bool qnn_unary_nop_impl(ggml_backend_qnn_device_context *ctx, ggml_tensor *src, ggml_tensor *dst) {
|
||||
bool qnn_nop_impl(ggml_backend_qnn_device_context *ctx, ggml_tensor *dst) {
|
||||
GGML_UNUSED(ctx);
|
||||
GGML_UNUSED(src);
|
||||
GGML_UNUSED(dst);
|
||||
return true;
|
||||
}
|
||||
|
||||
bool qnn_binary_nop_impl(ggml_backend_qnn_device_context *ctx, ggml_tensor *src0, ggml_tensor *src1, ggml_tensor *dst) {
|
||||
GGML_UNUSED(ctx);
|
||||
GGML_UNUSED(src0);
|
||||
GGML_UNUSED(src1);
|
||||
GGML_UNUSED(dst);
|
||||
return true;
|
||||
}
|
||||
constexpr const ggml_qnn_op_t kQnnOpsTable[] = {
|
||||
qnn_nop_impl, // GGML_OP_NONE
|
||||
nullptr, // GGML_OP_DUP
|
||||
qnn_generic_op_impl, // GGML_OP_ADD
|
||||
nullptr, // GGML_OP_ADD1
|
||||
nullptr, // GGML_OP_ACC
|
||||
qnn_generic_op_impl, // GGML_OP_SUB
|
||||
qnn_generic_op_impl, // GGML_OP_MUL
|
||||
qnn_generic_op_impl, // GGML_OP_DIV
|
||||
nullptr, // GGML_OP_SQR
|
||||
qnn_generic_op_impl, // GGML_OP_SQRT
|
||||
qnn_generic_op_impl, // GGML_OP_LOG
|
||||
nullptr, // GGML_OP_SIN
|
||||
nullptr, // GGML_OP_COS
|
||||
nullptr, // GGML_OP_SUM
|
||||
nullptr, // GGML_OP_SUM_ROWS
|
||||
nullptr, // GGML_OP_MEAN
|
||||
nullptr, // GGML_OP_ARGMAX
|
||||
nullptr, // GGML_OP_COUNT_EQUAL
|
||||
nullptr, // GGML_OP_REPEAT
|
||||
nullptr, // GGML_OP_REPEAT_BACK
|
||||
nullptr, // GGML_OP_CONCAT
|
||||
nullptr, // GGML_OP_SILU_BACK
|
||||
nullptr, // GGML_OP_NORM
|
||||
nullptr, // GGML_OP_RMS_NORM
|
||||
nullptr, // GGML_OP_RMS_NORM_BACK
|
||||
nullptr, // GGML_OP_GROUP_NORM
|
||||
|
||||
constexpr const ggml_qnn_unary_op_t kQnnUnaryOpsTable[] = {
|
||||
qnn_unary_nop_impl, // GGML_OP_NONE
|
||||
nullptr, // GGML_OP_DUP
|
||||
nullptr, // GGML_OP_ADD
|
||||
nullptr, // GGML_OP_ADD1
|
||||
nullptr, // GGML_OP_ACC
|
||||
nullptr, // GGML_OP_SUB
|
||||
nullptr, // GGML_OP_MUL
|
||||
nullptr, // GGML_OP_DIV
|
||||
nullptr, // GGML_OP_SQR
|
||||
qnn_unary_op_impl<GGML_OP_SQRT>, // GGML_OP_SQRT
|
||||
qnn_unary_op_impl<GGML_OP_LOG>, // GGML_OP_LOG
|
||||
nullptr, // GGML_OP_SIN
|
||||
nullptr, // GGML_OP_COS
|
||||
nullptr, // GGML_OP_SUM
|
||||
nullptr, // GGML_OP_SUM_ROWS
|
||||
nullptr, // GGML_OP_MEAN
|
||||
nullptr, // GGML_OP_ARGMAX
|
||||
nullptr, // GGML_OP_COUNT_EQUAL
|
||||
nullptr, // GGML_OP_REPEAT
|
||||
nullptr, // GGML_OP_REPEAT_BACK
|
||||
nullptr, // GGML_OP_CONCAT
|
||||
nullptr, // GGML_OP_SILU_BACK
|
||||
nullptr, // GGML_OP_NORM
|
||||
nullptr, // GGML_OP_RMS_NORM
|
||||
nullptr, // GGML_OP_RMS_NORM_BACK
|
||||
nullptr, // GGML_OP_GROUP_NORM
|
||||
qnn_generic_op_impl, // GGML_OP_MUL_MAT
|
||||
nullptr, // GGML_OP_MUL_MAT_ID
|
||||
nullptr, // GGML_OP_OUT_PROD
|
||||
|
||||
nullptr, // GGML_OP_MUL_MAT
|
||||
nullptr, // GGML_OP_MUL_MAT_ID
|
||||
nullptr, // GGML_OP_OUT_PROD
|
||||
|
||||
nullptr, // GGML_OP_SCALE
|
||||
nullptr, // GGML_OP_SET
|
||||
nullptr, // GGML_OP_CPY
|
||||
nullptr, // GGML_OP_CONT
|
||||
qnn_unary_nop_impl, // GGML_OP_RESHAPE
|
||||
qnn_unary_nop_impl, // GGML_OP_VIEW
|
||||
qnn_unary_nop_impl, // GGML_OP_PERMUTE
|
||||
qnn_unary_nop_impl, // GGML_OP_TRANSPOSE
|
||||
qnn_unary_nop_impl, // GGML_OP_GET_ROWS
|
||||
nullptr, // GGML_OP_GET_ROWS_BACK
|
||||
nullptr, // GGML_OP_DIAG
|
||||
nullptr, // GGML_OP_DIAG_MASK_INF
|
||||
nullptr, // GGML_OP_DIAG_MASK_ZERO
|
||||
nullptr, // GGML_OP_SOFT_MAX
|
||||
nullptr, // GGML_OP_SOFT_MAX_BACK
|
||||
nullptr, // GGML_OP_ROPE
|
||||
nullptr, // GGML_OP_ROPE_BACK
|
||||
nullptr, // GGML_OP_CLAMP
|
||||
nullptr, // GGML_OP_CONV_TRANSPOSE_1D
|
||||
nullptr, // GGML_OP_IM2COL
|
||||
nullptr, // GGML_OP_IM2COL_BACK
|
||||
nullptr, // GGML_OP_CONV_TRANSPOSE_2D
|
||||
nullptr, // GGML_OP_POOL_1D
|
||||
nullptr, // GGML_OP_POOL_2D
|
||||
nullptr, // GGML_OP_POOL_2D_BACK
|
||||
nullptr, // GGML_OP_UPSCALE
|
||||
nullptr, // GGML_OP_PAD
|
||||
nullptr, // GGML_OP_PAD_REFLECT_1D
|
||||
nullptr, // GGML_OP_ARANGE
|
||||
nullptr, // GGML_OP_TIMESTEP_EMBEDDING
|
||||
nullptr, // GGML_OP_ARGSORT
|
||||
nullptr, // GGML_OP_LEAKY_RELU
|
||||
|
||||
nullptr, // GGML_OP_FLASH_ATTN_EXT
|
||||
nullptr, // GGML_OP_FLASH_ATTN_BACK
|
||||
nullptr, // GGML_OP_SSM_CONV
|
||||
nullptr, // GGML_OP_SSM_SCAN
|
||||
nullptr, // GGML_OP_WIN_PART
|
||||
nullptr, // GGML_OP_WIN_UNPART
|
||||
nullptr, // GGML_OP_GET_REL_POS
|
||||
nullptr, // GGML_OP_ADD_REL_POS
|
||||
nullptr, // GGML_OP_RWKV_WKV6
|
||||
|
||||
nullptr, // GGML_OP_UNARY
|
||||
|
||||
nullptr, // GGML_OP_MAP_UNARY
|
||||
nullptr, // GGML_OP_MAP_BINARY
|
||||
|
||||
nullptr, // GGML_OP_MAP_CUSTOM1_F32
|
||||
nullptr, // GGML_OP_MAP_CUSTOM2_F32
|
||||
nullptr, // GGML_OP_MAP_CUSTOM3_F32
|
||||
|
||||
nullptr, // GGML_OP_MAP_CUSTOM1
|
||||
nullptr, // GGML_OP_MAP_CUSTOM2
|
||||
nullptr, // GGML_OP_MAP_CUSTOM3
|
||||
|
||||
nullptr, // GGML_OP_CROSS_ENTROPY_LOSS
|
||||
nullptr, // GGML_OP_CROSS_ENTROPY_LOSS_BACK
|
||||
nullptr, // GGML_OP_OPT_STEP_ADAMW
|
||||
|
||||
// ggml_unary_op
|
||||
nullptr, // GGML_UNARY_OP_ABS
|
||||
nullptr, // GGML_UNARY_OP_SGN
|
||||
nullptr, // GGML_UNARY_OP_NEG
|
||||
nullptr, // GGML_UNARY_OP_STEP
|
||||
nullptr, // GGML_UNARY_OP_TANH
|
||||
nullptr, // GGML_UNARY_OP_ELU
|
||||
nullptr, // GGML_UNARY_OP_RELU
|
||||
nullptr, // GGML_UNARY_OP_SIGMOID
|
||||
qnn_unary_op_impl<GGML_UNARY_OP_GELU + kGgmlUnaryOpStart>, // GGML_UNARY_OP_GELU
|
||||
nullptr, // GGML_UNARY_OP_GELU_QUICK
|
||||
nullptr, // GGML_UNARY_OP_SILU
|
||||
nullptr, // GGML_UNARY_OP_HARDSWISH
|
||||
nullptr, // GGML_UNARY_OP_HARDSIGMOID
|
||||
nullptr, // GGML_UNARY_OP_EXP
|
||||
};
|
||||
|
||||
static_assert(sizeof(kQnnUnaryOpsTable) / sizeof(kQnnUnaryOpsTable[0]) == (GGML_OP_COUNT + GGML_UNARY_OP_COUNT),
|
||||
"GGML_OP_COUNT does not match the size of the kQnnUnaryOpsTable table");
|
||||
|
||||
constexpr const ggml_qnn_binary_op_t kQnnBinaryOpsTable[] = {
|
||||
nullptr, // GGML_OP_NONE
|
||||
nullptr, // GGML_OP_DUP
|
||||
qnn_binary_op_impl<GGML_OP_ADD>, // GGML_OP_ADD
|
||||
nullptr, // GGML_OP_ADD1
|
||||
nullptr, // GGML_OP_ACC
|
||||
qnn_binary_op_impl<GGML_OP_SUB>, // GGML_OP_SUB
|
||||
qnn_binary_op_impl<GGML_OP_MUL>, // GGML_OP_MUL
|
||||
qnn_binary_op_impl<GGML_OP_DIV>, // GGML_OP_DIV
|
||||
nullptr, // GGML_OP_SQR
|
||||
nullptr, // GGML_OP_SQRT
|
||||
nullptr, // GGML_OP_LOG
|
||||
nullptr, // GGML_OP_SIN
|
||||
nullptr, // GGML_OP_COS
|
||||
nullptr, // GGML_OP_SUM
|
||||
nullptr, // GGML_OP_SUM_ROWS
|
||||
nullptr, // GGML_OP_MEAN
|
||||
nullptr, // GGML_OP_ARGMAX
|
||||
nullptr, // GGML_OP_COUNT_EQUAL
|
||||
nullptr, // GGML_OP_REPEAT
|
||||
nullptr, // GGML_OP_REPEAT_BACK
|
||||
nullptr, // GGML_OP_CONCAT
|
||||
nullptr, // GGML_OP_SILU_BACK
|
||||
nullptr, // GGML_OP_NORM
|
||||
nullptr, // GGML_OP_RMS_NORM
|
||||
nullptr, // GGML_OP_RMS_NORM_BACK
|
||||
nullptr, // GGML_OP_GROUP_NORM
|
||||
|
||||
qnn_binary_op_impl<GGML_OP_MUL_MAT>, // GGML_OP_MUL_MAT
|
||||
nullptr, // GGML_OP_MUL_MAT_ID
|
||||
nullptr, // GGML_OP_OUT_PROD
|
||||
|
||||
nullptr, // GGML_OP_SCALE
|
||||
nullptr, // GGML_OP_SET
|
||||
nullptr, // GGML_OP_CPY
|
||||
nullptr, // GGML_OP_CONT
|
||||
nullptr, // GGML_OP_RESHAPE
|
||||
nullptr, // GGML_OP_VIEW
|
||||
nullptr, // GGML_OP_PERMUTE
|
||||
nullptr, // GGML_OP_TRANSPOSE
|
||||
nullptr, // GGML_OP_GET_ROWS
|
||||
nullptr, // GGML_OP_GET_ROWS_BACK
|
||||
nullptr, // GGML_OP_DIAG
|
||||
nullptr, // GGML_OP_DIAG_MASK_INF
|
||||
nullptr, // GGML_OP_DIAG_MASK_ZERO
|
||||
nullptr, // GGML_OP_SOFT_MAX
|
||||
nullptr, // GGML_OP_SOFT_MAX_BACK
|
||||
nullptr, // GGML_OP_ROPE
|
||||
nullptr, // GGML_OP_ROPE_BACK
|
||||
nullptr, // GGML_OP_CLAMP
|
||||
nullptr, // GGML_OP_CONV_TRANSPOSE_1D
|
||||
nullptr, // GGML_OP_IM2COL
|
||||
nullptr, // GGML_OP_IM2COL_BACK
|
||||
nullptr, // GGML_OP_CONV_TRANSPOSE_2D
|
||||
nullptr, // GGML_OP_POOL_1D
|
||||
nullptr, // GGML_OP_POOL_2D
|
||||
nullptr, // GGML_OP_POOL_2D_BACK
|
||||
nullptr, // GGML_OP_UPSCALE
|
||||
nullptr, // GGML_OP_PAD
|
||||
nullptr, // GGML_OP_PAD_REFLECT_1D
|
||||
nullptr, // GGML_OP_ARANGE
|
||||
nullptr, // GGML_OP_TIMESTEP_EMBEDDING
|
||||
nullptr, // GGML_OP_ARGSORT
|
||||
nullptr, // GGML_OP_LEAKY_RELU
|
||||
nullptr, // GGML_OP_SCALE
|
||||
nullptr, // GGML_OP_SET
|
||||
nullptr, // GGML_OP_CPY
|
||||
nullptr, // GGML_OP_CONT
|
||||
qnn_nop_impl, // GGML_OP_RESHAPE
|
||||
nullptr, // GGML_OP_VIEW
|
||||
nullptr, // GGML_OP_PERMUTE
|
||||
nullptr, // GGML_OP_TRANSPOSE
|
||||
nullptr, // GGML_OP_GET_ROWS
|
||||
nullptr, // GGML_OP_GET_ROWS_BACK
|
||||
nullptr, // GGML_OP_DIAG
|
||||
nullptr, // GGML_OP_DIAG_MASK_INF
|
||||
nullptr, // GGML_OP_DIAG_MASK_ZERO
|
||||
nullptr, // GGML_OP_SOFT_MAX
|
||||
nullptr, // GGML_OP_SOFT_MAX_BACK
|
||||
nullptr, // GGML_OP_ROPE
|
||||
nullptr, // GGML_OP_ROPE_BACK
|
||||
nullptr, // GGML_OP_CLAMP
|
||||
nullptr, // GGML_OP_CONV_TRANSPOSE_1D
|
||||
nullptr, // GGML_OP_IM2COL
|
||||
nullptr, // GGML_OP_IM2COL_BACK
|
||||
nullptr, // GGML_OP_CONV_TRANSPOSE_2D
|
||||
nullptr, // GGML_OP_POOL_1D
|
||||
nullptr, // GGML_OP_POOL_2D
|
||||
nullptr, // GGML_OP_POOL_2D_BACK
|
||||
nullptr, // GGML_OP_UPSCALE
|
||||
nullptr, // GGML_OP_PAD
|
||||
nullptr, // GGML_OP_PAD_REFLECT_1D
|
||||
nullptr, // GGML_OP_ARANGE
|
||||
nullptr, // GGML_OP_TIMESTEP_EMBEDDING
|
||||
nullptr, // GGML_OP_ARGSORT
|
||||
nullptr, // GGML_OP_LEAKY_RELU
|
||||
|
||||
nullptr, // GGML_OP_FLASH_ATTN_EXT
|
||||
nullptr, // GGML_OP_FLASH_ATTN_BACK
|
||||
|
|
@ -537,10 +332,36 @@ constexpr const ggml_qnn_binary_op_t kQnnBinaryOpsTable[] = {
|
|||
nullptr, // GGML_OP_CROSS_ENTROPY_LOSS
|
||||
nullptr, // GGML_OP_CROSS_ENTROPY_LOSS_BACK
|
||||
nullptr, // GGML_OP_OPT_STEP_ADAMW
|
||||
|
||||
// ggml_unary_op
|
||||
nullptr, // GGML_UNARY_OP_ABS
|
||||
nullptr, // GGML_UNARY_OP_SGN
|
||||
nullptr, // GGML_UNARY_OP_NEG
|
||||
nullptr, // GGML_UNARY_OP_STEP
|
||||
nullptr, // GGML_UNARY_OP_TANH
|
||||
nullptr, // GGML_UNARY_OP_ELU
|
||||
nullptr, // GGML_UNARY_OP_RELU
|
||||
nullptr, // GGML_UNARY_OP_SIGMOID
|
||||
qnn_generic_op_impl, // GGML_UNARY_OP_GELU
|
||||
nullptr, // GGML_UNARY_OP_GELU_QUICK
|
||||
nullptr, // GGML_UNARY_OP_SILU
|
||||
nullptr, // GGML_UNARY_OP_HARDSWISH
|
||||
nullptr, // GGML_UNARY_OP_HARDSIGMOID
|
||||
nullptr, // GGML_UNARY_OP_EXP
|
||||
};
|
||||
|
||||
static_assert(sizeof(kQnnBinaryOpsTable) / sizeof(kQnnBinaryOpsTable[0]) == GGML_OP_COUNT,
|
||||
"GGML_OP_COUNT does not match the size of the kQnnBinaryOpsTable table");
|
||||
static_assert(kQnnOpsTable[GGML_OP_NONE] == qnn_nop_impl, "GGML_OP_NONE does not match the qnn_nop_impl function");
|
||||
static_assert(kQnnOpsTable[GGML_OP_ADD] == qnn_generic_op_impl,
|
||||
"GGML_OP_ADD does not match the qnn_generic_op_impl function");
|
||||
static_assert(kQnnOpsTable[GGML_OP_MUL] == qnn_generic_op_impl,
|
||||
"GGML_OP_MUL does not match the qnn_generic_op_impl function");
|
||||
static_assert(kQnnOpsTable[GGML_OP_MUL_MAT] == qnn_generic_op_impl,
|
||||
"GGML_OP_MUL_MAT does not match the qnn_generic_op_impl function");
|
||||
static_assert(kQnnOpsTable[GGML_OP_RESHAPE] == qnn_nop_impl,
|
||||
"GGML_OP_RESHAPE does not match the qnn_nop_impl function");
|
||||
static_assert(kQnnOpsTable[GGML_OP_VIEW] == nullptr, "GGML_OP_VIEW is not nullptr");
|
||||
static_assert(std::size(kQnnOpsTable) == (GGML_OP_COUNT + GGML_UNARY_OP_COUNT),
|
||||
"GGML_OP_COUNT does not match the size of the kQnnOpsTable table");
|
||||
|
||||
bool ggml_qnn_supports_tensor(ggml_backend_qnn_device_context *ctx, const ggml_tensor *tensor) {
|
||||
if (!tensor) {
|
||||
|
|
@ -548,6 +369,7 @@ bool ggml_qnn_supports_tensor(ggml_backend_qnn_device_context *ctx, const ggml_t
|
|||
return false;
|
||||
}
|
||||
|
||||
#ifndef NDEBUG
|
||||
if (tensor->view_src) {
|
||||
auto *src_tensor = tensor->view_src;
|
||||
QNN_LOG_DEBUG("[%s]tensor(%s_%dx%dx%dx%d) is a view, src: %s_%dx%dx%dx%d", qnn::get_backend_name(ctx->device),
|
||||
|
|
@ -555,6 +377,7 @@ bool ggml_qnn_supports_tensor(ggml_backend_qnn_device_context *ctx, const ggml_t
|
|||
ggml_get_name(src_tensor), src_tensor->ne[0], src_tensor->ne[1], src_tensor->ne[2],
|
||||
src_tensor->ne[3]);
|
||||
}
|
||||
#endif
|
||||
|
||||
switch (tensor->type) {
|
||||
case GGML_TYPE_F32:
|
||||
|
|
@ -576,6 +399,25 @@ bool ggml_qnn_supports_tensor(ggml_backend_qnn_device_context *ctx, const ggml_t
|
|||
return true;
|
||||
}
|
||||
|
||||
bool ggnl_qnn_supports_op_tensor(ggml_backend_qnn_device_context *ctx, const ggml_tensor *op) {
|
||||
if (op->op == GGML_OP_NONE) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if (!ggml_qnn_supports_tensor(ctx, op)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
const auto param_count = qnn::get_qnn_op_input_param_count(qnn::get_qnn_op_index(op));
|
||||
for (size_t i = 0; i < param_count; ++i) {
|
||||
if (!ggml_qnn_supports_tensor(ctx, op->src[i])) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool ggml_qnn_supports_matmul_op(ggml_backend_qnn_device_context *ctx, const ggml_tensor *op) {
|
||||
constexpr const size_t kMaxNpuTensorSize = 8192L * 2048 + 8192 * 512 + 2048 * 512;
|
||||
constexpr const auto get_tensor_size = [](const ggml_tensor *tensor) -> size_t {
|
||||
|
|
@ -591,11 +433,11 @@ bool ggml_qnn_supports_matmul_op(ggml_backend_qnn_device_context *ctx, const ggm
|
|||
* TODO: remove the blocker here when NPU backend supports mul_mat like this:
|
||||
* [ne03, ne02, n, k] * [ne03 * x, ne02 * y, m, k] -> [ne03 * x, ne02 * y, m, n]
|
||||
*/
|
||||
QNN_LOG_DEBUG("[qnn-npu] src0 and src1 dimensions are not equal, support/unsupported: %d/%d",
|
||||
QNN_LOG_DEBUG("[qnn-npu][MUL_MAT]src0 and src1 dimensions are not equal, support/unsupported: %d/%d",
|
||||
ctx->support_op_count.load(), ++(ctx->unsupported_op_count));
|
||||
return false;
|
||||
} else if (get_tensor_size(src0) + get_tensor_size(src1) + get_tensor_size(op) >= kMaxNpuTensorSize) {
|
||||
QNN_LOG_DEBUG("[qnn-npu] tensor size is too large, support/unsupported: %d/%d",
|
||||
QNN_LOG_DEBUG("[qnn-npu][MUL_MAT]tensor size is too large, support/unsupported: %d/%d",
|
||||
ctx->support_op_count.load(), ++(ctx->unsupported_op_count));
|
||||
return false;
|
||||
}
|
||||
|
|
@ -604,9 +446,9 @@ bool ggml_qnn_supports_matmul_op(ggml_backend_qnn_device_context *ctx, const ggm
|
|||
case QNN_BACKEND_GPU:
|
||||
if (src0->type != src1->type || src0->type != op->type) {
|
||||
// there's no convert op for GPU.
|
||||
QNN_LOG_DEBUG("[qnn-gpu]type src0(%d), src1(%d) and op(%d) are not equal, support/unsupported: %d/%d",
|
||||
src0->type, src1->type, op->type, ctx->support_op_count.load(),
|
||||
++(ctx->unsupported_op_count));
|
||||
QNN_LOG_DEBUG(
|
||||
"[qnn-gpu][MUL_MAT]type src0(%d), src1(%d) and op(%d) are not equal, support/unsupported: %d/%d",
|
||||
src0->type, src1->type, op->type, ctx->support_op_count.load(), ++(ctx->unsupported_op_count));
|
||||
return false;
|
||||
}
|
||||
break;
|
||||
|
|
@ -615,12 +457,12 @@ bool ggml_qnn_supports_matmul_op(ggml_backend_qnn_device_context *ctx, const ggm
|
|||
}
|
||||
|
||||
if ((src1->ne[2] % src0->ne[2]) != 0 || (src1->ne[3] % src0->ne[3]) != 0) {
|
||||
QNN_LOG_DEBUG("[%s] src0 and src1 dimensions are not equal, support/unsupported: %d/%d",
|
||||
QNN_LOG_DEBUG("[%s][MUL_MAT]src0 and src1 dimensions are not equal, support/unsupported: %d/%d",
|
||||
qnn::get_backend_name(ctx->device), ctx->support_op_count.load(), ++(ctx->unsupported_op_count));
|
||||
return false;
|
||||
}
|
||||
|
||||
QNN_LOG_DEBUG("[%s] supported matmul op, support/unsupported: %d/%d", qnn::get_backend_name(ctx->device),
|
||||
QNN_LOG_DEBUG("[%s][MUL_MAT]supported matmul op, support/unsupported: %d/%d", qnn::get_backend_name(ctx->device),
|
||||
++(ctx->support_op_count), ctx->unsupported_op_count.load());
|
||||
return true;
|
||||
}
|
||||
|
|
@ -635,41 +477,30 @@ bool device_supports_op(ggml_backend_qnn_device_context *ctx, const ggml_tensor
|
|||
return true;
|
||||
}
|
||||
|
||||
auto *src0 = op->src[0];
|
||||
if (!kQnnOpsTable[qnn::get_qnn_op_index(op)]) {
|
||||
QNN_LOG_DEBUG("[%s]unsupported op", ggml_op_name(op->op));
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!ggnl_qnn_supports_op_tensor(ctx, op)) {
|
||||
QNN_LOG_DEBUG("[%s]unsupported tensor", ggml_op_name(op->op));
|
||||
return false;
|
||||
}
|
||||
|
||||
if (op->op == GGML_OP_UNARY) {
|
||||
const auto unary_op = ggml_get_unary_op(op);
|
||||
if (unary_op == GGML_UNARY_OP_GELU && ctx->device == QNN_BACKEND_NPU) {
|
||||
// TODO: fix this when NPU supports GELU
|
||||
QNN_LOG_DEBUG("unsupported unary op GGML_UNARY_OP_GELU for NPU");
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!kQnnUnaryOpsTable[kGgmlUnaryOpStart + unary_op]) {
|
||||
QNN_LOG_DEBUG("unsupported unary op %d", unary_op);
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!ggml_qnn_supports_tensor(ctx, src0) || !ggml_qnn_supports_tensor(ctx, op)) {
|
||||
QNN_LOG_DEBUG("src0 is nullptr");
|
||||
if (unary_op == GGML_UNARY_OP_GELU) {
|
||||
// TODO: fix this
|
||||
QNN_LOG_DEBUG("[GELU]unsupported unary op GGML_UNARY_OP_GELU for NPU");
|
||||
return false;
|
||||
}
|
||||
} else {
|
||||
if (!kQnnUnaryOpsTable[op->op] && !kQnnBinaryOpsTable[op->op]) {
|
||||
QNN_LOG_DEBUG("[%s] unsupported op", ggml_op_name(op->op));
|
||||
return false;
|
||||
}
|
||||
|
||||
auto *src0 = op->src[0];
|
||||
auto *src1 = op->src[1];
|
||||
if (!ggml_qnn_supports_tensor(ctx, src0) || !ggml_qnn_supports_tensor(ctx, op) ||
|
||||
(kQnnBinaryOpsTable[op->op] && !ggml_qnn_supports_tensor(ctx, src1))) {
|
||||
QNN_LOG_DEBUG("[%s] unsupported tensor", ggml_op_name(op->op));
|
||||
return false;
|
||||
}
|
||||
|
||||
switch (op->op) {
|
||||
case GGML_OP_ADD:
|
||||
if (!is_tensor_dimensions_equal(src0, src1)) {
|
||||
QNN_LOG_DEBUG("src0 and src1 dimensions are not equal");
|
||||
if (!ggml_are_same_shape(src0, src1)) {
|
||||
QNN_LOG_DEBUG("[ADD] src0 and src1 dimensions are not equal");
|
||||
return false;
|
||||
}
|
||||
break;
|
||||
|
|
@ -686,34 +517,13 @@ bool device_supports_op(ggml_backend_qnn_device_context *ctx, const ggml_tensor
|
|||
}
|
||||
|
||||
bool device_compute_graph(ggml_backend_qnn_device_context *ctx, ggml_cgraph *cgraph) {
|
||||
QNN_LOG_DEBUG("[%s]compute graph, nodes count: %d", qnn::get_backend_name(ctx->device), (int)cgraph->n_nodes);
|
||||
for (int i = 0; i < cgraph->n_nodes; i++) {
|
||||
ggml_tensor *tensor = cgraph->nodes[i];
|
||||
if (ggml_is_empty(tensor)) {
|
||||
continue;
|
||||
}
|
||||
QNN_LOG_DEBUG("[%s]compute graph start, nodes count: %d", qnn::get_backend_name(ctx->device), (int)cgraph->n_nodes);
|
||||
|
||||
size_t unary_op_idx = tensor->op;
|
||||
if (tensor->op == GGML_OP_UNARY) {
|
||||
unary_op_idx = kGgmlUnaryOpStart + ggml_get_unary_op(tensor);
|
||||
}
|
||||
auto qnn_graph = get_qnn_graph_from_cache(ctx, cgraph);
|
||||
bool success = qnn_graph && qnn_graph->execute(cgraph);
|
||||
|
||||
bool ok = false;
|
||||
auto unary_op = kQnnUnaryOpsTable[unary_op_idx];
|
||||
auto binary_op = kQnnBinaryOpsTable[tensor->op];
|
||||
if (unary_op) {
|
||||
ok = unary_op(ctx, tensor->src[0], tensor);
|
||||
} else if (binary_op) {
|
||||
ok = binary_op(ctx, tensor->src[0], tensor->src[1], tensor);
|
||||
}
|
||||
|
||||
if (!ok) {
|
||||
QNN_LOG_WARN("[%s]unsupported op %s", qnn::get_backend_name(ctx->device), ggml_op_desc(tensor));
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
QNN_LOG_DEBUG("[%s]compute graph, success: %d", qnn::get_backend_name(ctx->device), (int)success);
|
||||
return success;
|
||||
}
|
||||
|
||||
} // namespace qnn
@ -19,7 +19,7 @@
|
|||
#include "qnn-lib.hpp"
|
||||
|
||||
namespace qnn {
|
||||
typedef std::unordered_map<std::string, std::unique_ptr<qnn::ggml_qnn_graph>> ggml_qnn_graph_cache_t;
|
||||
typedef std::unordered_map<std::string, std::unique_ptr<qnn::qnn_graph>> qnn_graph_cache_t;
|
||||
} // namespace qnn
|
||||
|
||||
struct ggml_backend_qnn_device_context {
|
||||
|
|
@ -35,7 +35,7 @@ struct ggml_backend_qnn_device_context {
|
|||
std::shared_ptr<qnn::qnn_instance> instance;
|
||||
std::shared_ptr<qnn::qnn_interface> qnn_interface;
|
||||
|
||||
qnn::ggml_qnn_graph_cache_t qnn_graph_cache;
|
||||
qnn::qnn_graph_cache_t qnn_graph_cache;
|
||||
|
||||
#ifndef NDEBUG
|
||||
std::atomic_uint32_t support_op_count = 0;
|
@ -8,18 +8,65 @@
|
|||
|
||||
namespace qnn {
|
||||
|
||||
/**
|
||||
* @brief An interface for managing generic QNN buffers.
|
||||
*
|
||||
* This abstract class defines the interface for managing generic memory buffers in a QNN context.
|
||||
*/
|
||||
class qnn_buffer_interface {
|
||||
public:
|
||||
virtual ~qnn_buffer_interface() = default;
|
||||
|
||||
/**
|
||||
* @brief Checks if the buffer is valid.
|
||||
*
|
||||
* This pure virtual function must be implemented by derived classes to check
|
||||
* the validity of the buffer.
|
||||
*
|
||||
* @return true if the buffer is valid, false otherwise.
|
||||
*/
|
||||
virtual bool is_valid() const = 0;
|
||||
|
||||
/**
|
||||
* @brief Gets the buffer pointer.
|
||||
*
|
||||
* This pure virtual function must be implemented by derived classes to return
|
||||
* a pointer to the buffer.
|
||||
*
|
||||
* @return A pointer to the buffer.
|
||||
*/
|
||||
virtual uint8_t *get_buffer() = 0;
|
||||
|
||||
/**
* @brief Gets the size of the buffer.
*
* This pure virtual function must be implemented by derived classes to return
* the size of the buffer in bytes.
*
* @return The buffer size in bytes.
*/
virtual size_t get_size() const = 0;
|
||||
|
||||
/**
|
||||
* @brief Gets the QNN memory handle associated with the buffer.
|
||||
*
|
||||
* This pure virtual function must be implemented by derived classes to return
|
||||
* the memory handle associated with the buffer.
|
||||
*
|
||||
* @return The memory handle, or null if no valid QNN memory handle is attached.
|
||||
*/
|
||||
virtual Qnn_MemHandle_t get_mem_handle() const = 0;
|
||||
};
|
||||
|
||||
using qnn_buffer_ptr = std::shared_ptr<qnn_buffer_interface>;
|
||||
|
||||
/**
|
||||
* @brief A class for managing QNN RPC memory buffers.
|
||||
*
|
||||
* This class is responsible for allocating, registering, and managing a buffer in RPC memory.
|
||||
* It ensures that the buffer is properly allocated and registered with the QNN instance, and
|
||||
* handles cleanup of the buffer and its associated memory handle upon destruction.
|
||||
*/
|
||||
class qnn_rpc_buffer : public qnn_buffer_interface {
|
||||
public:
|
||||
qnn_rpc_buffer(std::shared_ptr<qnn_instance> qnn_instance, const size_t size, const uint32_t rank,
|
||||
|
|
@ -29,7 +76,7 @@ public:
|
|||
_qnn_rpc_buffer = static_cast<uint8_t *>(qnn_instance->alloc_rpcmem(size, alignof(uint8_t *)));
|
||||
_qnn_rpc_mem_handle = qnn_instance->register_rpcmem(_qnn_rpc_buffer, rank, dimensions, data_type);
|
||||
if (!_qnn_rpc_buffer || !_qnn_rpc_mem_handle) {
|
||||
QNN_LOG_WARN("register rpc mem failure");
|
||||
QNN_LOG_WARN("Failed to register RPC memory: buffer or memory handle is null");
|
||||
// let the destructor free the buffer
|
||||
return;
|
||||
}
|
||||
|
|
@ -64,6 +111,13 @@ private:
|
|||
DISABLE_MOVE(qnn_rpc_buffer);
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief A class for managing QNN memory buffers allocated in regular memory.
|
||||
*
|
||||
* This class is responsible for allocating, managing, and freeing memory buffers
|
||||
* in regular (non-RPC) memory. It implements the qnn_buffer_interface to provide
|
||||
* a consistent interface for buffer management.
|
||||
*/
|
||||
class qnn_mem_buffer : public qnn_buffer_interface {
|
||||
public:
|
||||
explicit qnn_mem_buffer(const uint8_t *data, size_t size) {
|
||||
|
|
@ -102,4 +156,24 @@ private:
|
|||
DISABLE_MOVE(qnn_mem_buffer);
|
||||
};
|
||||
|
||||
class qnn_mem_buffer_slice : public qnn_buffer_interface {
|
||||
public:
|
||||
qnn_mem_buffer_slice(const uint8_t *buffer, size_t size) : _buffer(const_cast<uint8_t *>(buffer)), _size(size) {}
|
||||
|
||||
bool is_valid() const override { return _buffer && _size; }
|
||||
|
||||
uint8_t *get_buffer() override { return _buffer; }
|
||||
|
||||
size_t get_size() const override { return _size; }
|
||||
|
||||
Qnn_MemHandle_t get_mem_handle() const override { return nullptr; }
|
||||
|
||||
private:
|
||||
uint8_t *_buffer = nullptr;
|
||||
size_t _size = 0;
|
||||
|
||||
DISABLE_COPY(qnn_mem_buffer_slice);
|
||||
DISABLE_MOVE(qnn_mem_buffer_slice);
|
||||
};
|
||||
|
||||
} // namespace qnn
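// A minimal usage sketch of qnn_mem_buffer_slice: it wraps an existing memory region as a
// qnn_buffer_ptr without allocating or copying, which is how this commit avoids extra copies
// when binding ggml tensor data. The set_data_buffer call below is assumed from the commit
// description ("use qnn_buffer_ptr as tensor buffer"); its exact signature may differ.
//
//   ggml_tensor *t = /* tensor whose data already lives in host memory */;
//   auto slice = std::make_shared<qnn::qnn_mem_buffer_slice>(
//       reinterpret_cast<const uint8_t *>(t->data), ggml_nbytes(t));
//   if (slice->is_valid()) {
//       qnn::qnn_buffer_ptr buffer = slice;   // shared_ptr to the interface, data is not copied
//       qnn_tensor->set_data_buffer(buffer);  // hypothetical call taking a qnn_buffer_ptr
//   }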
@ -222,6 +222,9 @@ bool ggml_backend_qnn_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_
|
|||
GGML_UNUSED(backend_dst);
|
||||
GGML_UNUSED(src);
|
||||
GGML_UNUSED(dst);
|
||||
|
||||
QNN_LOG_DEBUG("opy form %s to %s, src_is_qnn: %d, dst_is_qnn: %d", ggml_get_name(src), ggml_get_name(dst),
|
||||
(int)ggml_backend_is_qnn(backend_src), (int)ggml_backend_is_qnn(backend_dst));
|
||||
return false;
|
||||
}
|
||||
|
||||
|
|
@ -317,8 +320,6 @@ ggml_guid_t ggml_backend_qnn_guid() {
|
|||
return &guid;
|
||||
}
|
||||
|
||||
bool ggml_backend_is_qnn(ggml_backend_t backend) { return ggml_guid_matches(backend->guid, ggml_backend_qnn_guid()); }
|
||||
|
||||
ggml_backend_t ggml_backend_qnn_init_with_device_context(ggml_backend_dev_t dev, const char *extend_lib_search_path) {
|
||||
if (!extend_lib_search_path) {
|
||||
extend_lib_search_path = GGML_QNN_DEFAULT_LIB_SEARCH_PATH;
|
||||
|
|
@ -420,8 +421,13 @@ bool ggml_backend_qnn_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_
|
|||
}
|
||||
|
||||
bool ggml_backend_qnn_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor *op) {
|
||||
#ifdef NDEBUG
|
||||
GGML_UNUSED(dev);
|
||||
GGML_UNUSED(op);
|
||||
#else
|
||||
auto *device_ctx = get_device_context(dev);
|
||||
QNN_LOG_DEBUG("[%s][%s]offload op", qnn::get_backend_name(device_ctx->device), ggml_op_name(op->op));
|
||||
#endif
|
||||
return false;
|
||||
}
|
||||
|
||||
|
|
@ -509,6 +515,8 @@ const ggml_backend_reg_i ggml_backend_qnn_reg_interface = {
|
|||
|
||||
} // namespace
|
||||
|
||||
bool ggml_backend_is_qnn(ggml_backend_t backend) { return ggml_guid_matches(backend->guid, ggml_backend_qnn_guid()); }
|
||||
|
||||
ggml_backend_reg_t ggml_backend_qnn_reg() {
|
||||
static ggml_backend_qnn_reg_impl reg{ggml_backend_qnn_reg_interface};
|
||||
return ®
|
||||
|
|
|
|||
|
|
@ -0,0 +1,386 @@
|
|||
|
||||
#include "graph.hpp"
|
||||
|
||||
#include <set>
|
||||
#include <unordered_map>
|
||||
|
||||
#include "ggml-impl.h"
|
||||
|
||||
#include "logger.hpp"
|
||||
#include "op-config.hpp"
|
||||
#include "tensor.hpp"
|
||||
|
||||
namespace {
|
||||
using qnn_tensor_cache_t = std::unordered_map<ggml_tensor *, qnn::qnn_tensor_ptr_t>;
|
||||
|
||||
int get_op_max_rank(const ggml_tensor *op) {
|
||||
int max_rank = ggml_n_dims(op);
|
||||
const int count = (int)qnn::get_qnn_op_input_param_count(qnn::get_qnn_op_index(op));
|
||||
for (int i = 0; i < count; ++i) {
|
||||
max_rank = std::max(max_rank, ggml_n_dims(op->src[i]));
|
||||
}
|
||||
|
||||
return max_rank;
|
||||
}
|
||||
|
||||
qnn::qnn_tensor_ptr_t create_tensor_with_cache(ggml_tensor *tensor, qnn::ggml_qnn_tensor::tensor_type_t type, int rank,
|
||||
QNNBackend device, Qnn_GraphHandle_t graph_handle,
|
||||
std::shared_ptr<qnn::qnn_instance> qnn_instance,
|
||||
qnn_tensor_cache_t &tensor_cache) {
|
||||
GGML_ASSERT(tensor);
|
||||
if (tensor_cache.count(tensor)) {
|
||||
return tensor_cache[tensor];
|
||||
}
|
||||
|
||||
auto qnn_tensor = std::make_shared<qnn::ggml_qnn_tensor>(type, tensor->name, tensor->ne, tensor->type, rank, device,
|
||||
graph_handle, qnn_instance);
|
||||
tensor_cache[tensor] = qnn_tensor;
|
||||
return qnn_tensor;
|
||||
}
|
||||
|
||||
qnn::qnn_tensor_array_t create_tensors_with_cache(const qnn::ggml_tensor_array_t &ggml_tensors,
|
||||
qnn::ggml_qnn_tensor::tensor_type_t type, int rank, QNNBackend device,
|
||||
Qnn_GraphHandle_t graph_handle,
|
||||
std::shared_ptr<qnn::qnn_instance> qnn_instance,
|
||||
qnn_tensor_cache_t &tensor_cache) {
|
||||
qnn::qnn_tensor_array_t tensors;
|
||||
for (auto *tensor : ggml_tensors) {
|
||||
tensors.push_back(
|
||||
create_tensor_with_cache(tensor, type, rank, device, graph_handle, qnn_instance, tensor_cache));
|
||||
}
|
||||
|
||||
return tensors;
|
||||
}
|
||||
|
||||
qnn::qnn_op_config_ptr_t create_operation_from_op_tensor(ggml_tensor *dst, const std::string &name, int rank,
|
||||
QNNBackend device, Qnn_GraphHandle_t graph_handle,
|
||||
std::shared_ptr<qnn::qnn_instance> qnn_instance,
|
||||
bool is_intermediate, qnn_tensor_cache_t &tensor_cache) {
|
||||
const auto op_index = qnn::get_qnn_op_index(dst);
|
||||
auto qnn_op = qnn::create_op_constructor(op_index);
|
||||
auto operation = qnn_op(name, qnn_instance);
|
||||
|
||||
// input tensors
|
||||
qnn::qnn_tensor_array_t input_qnn_tensors;
|
||||
auto tensor_type = is_intermediate ? qnn::ggml_qnn_tensor::INTERMEDIATE : qnn::ggml_qnn_tensor::INPUT;
|
||||
for (size_t i = 0; i < qnn::get_qnn_op_input_param_count(op_index); ++i) {
|
||||
auto input_qnn_tensor =
|
||||
create_tensor_with_cache(dst->src[i], tensor_type, rank, device, graph_handle, qnn_instance, tensor_cache);
|
||||
input_qnn_tensors.push_back(input_qnn_tensor);
|
||||
}
|
||||
operation->set_input_tensors(input_qnn_tensors);
|
||||
|
||||
// output tensor
|
||||
tensor_type = is_intermediate ? qnn::ggml_qnn_tensor::INTERMEDIATE : qnn::ggml_qnn_tensor::OUTPUT;
|
||||
qnn::qnn_tensor_array_t output_qnn_tensors =
|
||||
create_tensors_with_cache({dst}, tensor_type, rank, device, graph_handle, qnn_instance, tensor_cache);
|
||||
operation->set_output_tensors(output_qnn_tensors);
|
||||
|
||||
// initialize operation
|
||||
if (!operation->initialize_op_nodes(device, graph_handle)) {
|
||||
QNN_LOG_ERROR("[%s][%s]initialize_op_nodes failed", qnn::get_backend_name(device), name.c_str());
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
return operation;
|
||||
}
|
||||
|
||||
bool bind_src_tensors(ggml_tensor *op, qnn::qnn_tensor_array_t &tensor_wrappers,
|
||||
std::vector<Qnn_Tensor_t> &qnn_tensors) {
|
||||
if (op->op == GGML_OP_NONE) {
|
||||
QNN_LOG_DEBUG("op %s is not a valid op", ggml_get_name(op));
|
||||
return false;
|
||||
}
|
||||
|
||||
const auto param_count = qnn::get_qnn_op_input_param_count(qnn::get_qnn_op_index(op));
|
||||
GGML_ASSERT(tensor_wrappers.size() == param_count);
|
||||
qnn_tensors.resize(param_count);
|
||||
for (size_t i = 0; i < param_count; ++i) {
|
||||
auto *ggml_tensor = op->src[i];
|
||||
if (!tensor_wrappers[i]->bind_ggml_tensor(ggml_tensor)) {
|
||||
QNN_LOG_ERROR("bind tensor %s failed", ggml_get_name(ggml_tensor));
|
||||
return false;
|
||||
}
|
||||
|
||||
qnn_tensors[i] = tensor_wrappers[i]->get_qnn_tensor();
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
int get_io_tensors_from_graph(const ggml_cgraph *cgraph, qnn::ggml_tensor_array_t &inputs,
|
||||
qnn::ggml_tensor_array_t &outputs) {
|
||||
using ggml_tensor_set_t = std::set<ggml_tensor *>;
|
||||
|
||||
ggml_tensor_set_t input_set;
|
||||
ggml_tensor_set_t output_set;
|
||||
ggml_tensor_set_t visited_set;
|
||||
int rank = 0;
|
||||
for (int i = 0; i < cgraph->n_nodes; i++) {
|
||||
ggml_tensor *dst = cgraph->nodes[i];
|
||||
if (ggml_is_empty(dst)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (dst->op == GGML_OP_NONE || dst->op == GGML_OP_VIEW) {
|
||||
// TODO: remove GGML_OP_VIEW after view op is supported
|
||||
continue;
|
||||
}
|
||||
|
||||
rank = std::max(rank, ggml_n_dims(dst));
|
||||
input_set.erase(dst);
|
||||
if (!visited_set.count(dst)) {
|
||||
output_set.insert(dst);
|
||||
visited_set.insert(dst);
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < GGML_MAX_DIMS && dst->src[i]; ++i) {
|
||||
auto *src = dst->src[i];
|
||||
rank = std::max(rank, ggml_n_dims(src));
|
||||
output_set.erase(src);
|
||||
if (!visited_set.count(src)) {
|
||||
input_set.insert(src);
|
||||
visited_set.insert(src);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
inputs.assign(input_set.begin(), input_set.end());
|
||||
outputs.assign(output_set.begin(), output_set.end());
|
||||
return rank;
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
namespace qnn {
|
||||
|
||||
qnn_graph::qnn_graph(const std::string &graph_name, QNNBackend device, std::shared_ptr<qnn_instance> qnn_instance,
|
||||
size_t vtcm_size_in_mb)
|
||||
: _graph_name(graph_name), _device(device), _qnn_instance(qnn_instance) {
|
||||
QNN_LOG_DEBUG("[%s][%s]created", get_backend_name(device), graph_name.c_str());
|
||||
|
||||
auto qnn_interface = qnn_instance->get_qnn_interface();
|
||||
auto qnn_context = qnn_instance->get_qnn_context_handle();
|
||||
Qnn_ErrorHandle_t error = QNN_SUCCESS;
|
||||
Qnn_GraphHandle_t graph_handle = nullptr;
|
||||
if (device == QNN_BACKEND_NPU) {
|
||||
// TODO: fix graph config here for NPU
|
||||
QnnHtpGraph_CustomConfig_t hvx_config;
|
||||
hvx_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_NUM_HVX_THREADS;
|
||||
hvx_config.numHvxThreads = 8;
|
||||
QnnGraph_Config_t graph_hvx_config;
|
||||
graph_hvx_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM;
|
||||
graph_hvx_config.customConfig = &hvx_config;
|
||||
|
||||
QnnHtpGraph_CustomConfig_t dlbc_config;
|
||||
dlbc_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION;
|
||||
dlbc_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_DLBC;
|
||||
dlbc_config.optimizationOption.floatValue = 1.0; // set to 0.0 to turn off DLBC
|
||||
QnnGraph_Config_t graph_dlbc_config;
|
||||
graph_dlbc_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM;
|
||||
graph_dlbc_config.customConfig = &dlbc_config;
|
||||
|
||||
QnnHtpGraph_CustomConfig_t opt_config;
|
||||
opt_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG;
|
||||
opt_config.optimizationOption.floatValue = 1; // 1 / 3
|
||||
QnnGraph_Config_t graph_opt_config;
|
||||
graph_opt_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM;
|
||||
graph_opt_config.customConfig = &opt_config;
|
||||
|
||||
QnnHtpGraph_CustomConfig_t vtcm_config;
|
||||
vtcm_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE;
|
||||
vtcm_config.vtcmSizeInMB = vtcm_size_in_mb;
|
||||
QnnGraph_Config_t graph_vtcm_config;
|
||||
graph_vtcm_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM;
|
||||
graph_vtcm_config.customConfig = &vtcm_config;
|
||||
|
||||
const QnnGraph_Config_t *graph_configs[] = {&graph_hvx_config, &graph_dlbc_config, &graph_vtcm_config,
|
||||
&graph_opt_config, nullptr};
|
||||
error = qnn_interface->qnn_graph_create(qnn_context, graph_name.c_str(), graph_configs, &graph_handle);
|
||||
} else {
|
||||
error = qnn_interface->qnn_graph_create(qnn_context, graph_name.c_str(), nullptr, &graph_handle);
|
||||
}
|
||||
|
||||
if (error != QNN_SUCCESS) {
|
||||
QNN_LOG_ERROR("[%s][%s]failed to create qnn graph, error: %s", get_backend_name(device), graph_name.c_str(),
|
||||
get_qnn_error_string(error));
|
||||
return;
|
||||
}
|
||||
|
||||
QNN_LOG_INFO("[%s][%s]create succeed", get_backend_name(device), graph_name.c_str());
|
||||
_graph_handle = graph_handle;
|
||||
_qnn_interface = qnn_interface;
|
||||
}
|
||||
|
||||
qnn_graph::~qnn_graph() { QNN_LOG_DEBUG("[%s][%s]destroy", get_backend_name(_device), _graph_name.c_str()); }
|
||||
|
||||
bool qnn_graph::build_graph_from_op(ggml_tensor *op) {
|
||||
if (!is_valid()) {
|
||||
QNN_LOG_ERROR("Invalid graph");
|
||||
return false;
|
||||
}
|
||||
|
||||
QNN_LOG_DEBUG("[%s][%s]build start", get_backend_name(_device), _graph_name.c_str());
|
||||
qnn_tensor_cache_t tensor_cache;
|
||||
const auto rank = get_op_max_rank(op);
|
||||
auto operation = create_operation_from_op_tensor(op, _graph_name, rank, _device, _graph_handle, _qnn_instance,
|
||||
false, tensor_cache);
|
||||
if (!operation) {
|
||||
QNN_LOG_ERROR("[%s][%s]create_operation_from_op_tensor failed", get_backend_name(_device), _graph_name.c_str());
|
||||
return false;
|
||||
}
|
||||
|
||||
_tensor_inputs = operation->get_input_tensors();
|
||||
_tensor_outputs = operation->get_output_tensors();
|
||||
_operations.push_back(std::move(operation));
|
||||
if (!finalize()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
QNN_LOG_DEBUG("[%s][%s]build succeed", get_backend_name(_device), _graph_name.c_str());
|
||||
return true;
|
||||
}
|
||||
|
||||
bool qnn_graph::build_graph_from_ggml_graph(const ggml_cgraph *cgraph) {
    QNN_LOG_DEBUG("[%s][%s]build start", get_backend_name(_device), _graph_name.c_str());

    ggml_tensor_array_t inputs;
    ggml_tensor_array_t outputs;
    int rank = get_io_tensors_from_graph(cgraph, inputs, outputs);
    QNN_LOG_DEBUG("[%s]rank: %d, input_set: %d, output_set: %d", get_backend_name(_device), rank, int(inputs.size()),
                  int(outputs.size()));

    {
        qnn_tensor_cache_t tensor_cache;
        auto input_tensors = create_tensors_with_cache(inputs, ggml_qnn_tensor::INPUT, rank, _device, _graph_handle,
                                                       _qnn_instance, tensor_cache);
        auto output_tensors = create_tensors_with_cache(outputs, ggml_qnn_tensor::OUTPUT, rank, _device, _graph_handle,
                                                        _qnn_instance, tensor_cache);
        qnn_op_config_array_t operations;
        for (int i = 0; i < cgraph->n_nodes; i++) {
            ggml_tensor *dst = cgraph->nodes[i];
            if (ggml_is_empty(dst)) {
                continue;
            }

            if (dst->op == GGML_OP_NONE || dst->op == GGML_OP_VIEW) {
                // TODO: remove GGML_OP_VIEW after view op is supported
                continue;
            }

            QNN_LOG_DEBUG("[%s]create op: %s", get_backend_name(_device), get_qnn_op_name(dst->op));
            auto operation = create_operation_from_op_tensor(dst, dst->name, rank, _device, _graph_handle,
                                                             _qnn_instance, true, tensor_cache); // TODO: fix op name
            operations.push_back(operation);
        }

        _tensor_inputs = std::move(input_tensors);
        _tensor_outputs = std::move(output_tensors);
        _operations = std::move(operations);
        if (!finalize()) {
            return false;
        }
    }

    QNN_LOG_DEBUG("[%s][%s]build succeed", get_backend_name(_device), _graph_name.c_str());
    return true;
}

bool qnn_graph::execute(ggml_tensor *op) {
    if (!bind_src_tensors(op, _tensor_inputs, _qnn_tensor_inputs)) {
        QNN_LOG_ERROR("[%s][%s]bind input tensors failed", get_backend_name(_device), _graph_name.c_str());
        return false;
    }

    if (!qnn::bind_tensors({op}, _tensor_outputs, _qnn_tensor_outputs)) {
        QNN_LOG_ERROR("[%s][%s]bind output tensors failed", get_backend_name(_device), _graph_name.c_str());
        return false;
    }

    auto &qnn_tensor_inputs = _qnn_tensor_inputs;
    auto &qnn_tensor_outputs = _qnn_tensor_outputs;
    auto error =
        _qnn_interface->qnn_graph_execute(_graph_handle, qnn_tensor_inputs.data(), qnn_tensor_inputs.size(),
                                          qnn_tensor_outputs.data(), qnn_tensor_outputs.size(), nullptr, nullptr);
    unbind_tensors(_tensor_inputs);
    unbind_tensors(_tensor_outputs);

    if (error != QNN_SUCCESS) {
        if (_device == QNN_BACKEND_NPU && error == QNN_COMMON_ERROR_SYSTEM_COMMUNICATION) {
            QNN_LOG_WARN("[%s][%s]NPU crashed. SSR detected. Caused QNN graph execute error.",
                         get_backend_name(_device), _graph_name.c_str());
        } else {
            QNN_LOG_ERROR("[%s][%s]error: %s", get_backend_name(_device), _graph_name.c_str(),
                          get_qnn_error_string(error));
        }
        return false;
    }

    QNN_LOG_DEBUG("[%s][%s]execute succeed", get_backend_name(_device), _graph_name.c_str());
    return true;
}

bool qnn_graph::execute(const ggml_cgraph *cgraph) {
    ggml_tensor_array_t inputs;
    ggml_tensor_array_t outputs;
#ifdef NDEBUG
    get_io_tensors_from_graph(cgraph, inputs, outputs);
#else
    int rank = get_io_tensors_from_graph(cgraph, inputs, outputs);
    QNN_LOG_DEBUG("[%s]rank: %d, input_set: %d, output_set: %d", get_backend_name(_device), rank, int(inputs.size()),
                  int(outputs.size()));
#endif

    {
        if (!qnn::bind_tensors(inputs, _tensor_inputs, _qnn_tensor_inputs)) {
            QNN_LOG_ERROR("[%s][%s]bind input tensors failed", get_backend_name(_device), _graph_name.c_str());
            return false;
        }

        if (!qnn::bind_tensors(outputs, _tensor_outputs, _qnn_tensor_outputs)) {
            QNN_LOG_ERROR("[%s][%s]bind output tensors failed", get_backend_name(_device), _graph_name.c_str());
            return false;
        }

        auto &qnn_tensor_inputs = _qnn_tensor_inputs;
        auto &qnn_tensor_outputs = _qnn_tensor_outputs;
        auto error =
            _qnn_interface->qnn_graph_execute(_graph_handle, qnn_tensor_inputs.data(), qnn_tensor_inputs.size(),
                                              qnn_tensor_outputs.data(), qnn_tensor_outputs.size(), nullptr, nullptr);
        unbind_tensors(_tensor_inputs);
        unbind_tensors(_tensor_outputs);

        if (error != QNN_SUCCESS) {
            if (_device == QNN_BACKEND_NPU && error == QNN_COMMON_ERROR_SYSTEM_COMMUNICATION) {
                QNN_LOG_WARN("[%s][%s]NPU crashed. SSR detected. Caused QNN graph execute error.",
                             get_backend_name(_device), _graph_name.c_str());
            } else {
                QNN_LOG_ERROR("[%s][%s]error: %s", get_backend_name(_device), _graph_name.c_str(),
                              get_qnn_error_string(error));
            }
            return false;
        }

        QNN_LOG_DEBUG("[%s][%s]execute succeed", get_backend_name(_device), _graph_name.c_str());
        return true;
    }
}

bool qnn_graph::finalize() {
    if (!qnn::add_op_to_graph(_graph_handle, _operations)) {
        QNN_LOG_ERROR("[%s]add nodes failed", _graph_name.c_str());
        return false;
    }

    auto error = _qnn_interface->qnn_graph_finalize(_graph_handle, nullptr, nullptr);
    if (error != QNN_SUCCESS) {
        QNN_LOG_ERROR("[%s][%s]qnn_graph_finalize.error: %s", get_backend_name(_device), _graph_name.c_str(),
                      get_qnn_error_string(error));
        return false;
    }

    QNN_LOG_DEBUG("[%s][%s]finalize succeed", get_backend_name(_device), _graph_name.c_str());
    return true;
}

} // namespace qnn

@ -1,164 +1,53 @@
|
|||
|
||||
#pragma once
|
||||
|
||||
#include <cstdio>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "ggml-qnn.h"
|
||||
|
||||
#include "logger.hpp"
|
||||
#include "op-config.hpp"
|
||||
#include "qnn-lib.hpp"
|
||||
|
||||
namespace qnn {
|
||||
|
||||
class ggml_qnn_graph {
|
||||
class qnn_graph {
|
||||
public:
|
||||
explicit ggml_qnn_graph(const std::string &graph_name, QNNBackend device,
|
||||
std::shared_ptr<qnn_instance> qnn_instance, size_t vtcm_size_in_mb)
|
||||
: _graph_name(graph_name), _device(device), _qnn_instance(qnn_instance) {
|
||||
QNN_LOG_DEBUG("[%s][%s]created", get_backend_name(device), graph_name.c_str());
|
||||
explicit qnn_graph(const std::string &graph_name, QNNBackend device, std::shared_ptr<qnn_instance> qnn_instance,
|
||||
size_t vtcm_size_in_mb);
|
||||
~qnn_graph();
|
||||
|
||||
auto qnn_interface = qnn_instance->get_qnn_interface();
|
||||
auto qnn_context = qnn_instance->get_qnn_context_handle();
|
||||
Qnn_ErrorHandle_t error = QNN_SUCCESS;
|
||||
Qnn_GraphHandle_t graph_handle = nullptr;
|
||||
if (device == QNN_BACKEND_NPU) {
|
||||
// TODO: fix graph config here for NPU
|
||||
QnnHtpGraph_CustomConfig_t hvx_config;
|
||||
hvx_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_NUM_HVX_THREADS;
|
||||
hvx_config.numHvxThreads = 8;
|
||||
QnnGraph_Config_t graph_hvx_config;
|
||||
graph_hvx_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM;
|
||||
graph_hvx_config.customConfig = &hvx_config;
|
||||
|
||||
QnnHtpGraph_CustomConfig_t dlbc_config;
|
||||
dlbc_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION;
|
||||
dlbc_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_DLBC;
|
||||
dlbc_config.optimizationOption.floatValue = 1.0; // set to 0.0 to turn off DLBC
|
||||
QnnGraph_Config_t graph_dlbc_config;
|
||||
graph_dlbc_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM;
|
||||
graph_dlbc_config.customConfig = &dlbc_config;
|
||||
|
||||
QnnHtpGraph_CustomConfig_t opt_config;
|
||||
opt_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG;
|
||||
opt_config.optimizationOption.floatValue = 1; // 1 / 3
|
||||
QnnGraph_Config_t graph_opt_config;
|
||||
graph_opt_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM;
|
||||
graph_opt_config.customConfig = &opt_config;
|
||||
|
||||
QnnHtpGraph_CustomConfig_t vtcm_config;
|
||||
vtcm_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE;
|
||||
vtcm_config.vtcmSizeInMB = vtcm_size_in_mb;
|
||||
QnnGraph_Config_t graph_vtcm_config;
|
||||
graph_vtcm_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM;
|
||||
graph_vtcm_config.customConfig = &vtcm_config;
|
||||
|
||||
const QnnGraph_Config_t *graph_configs[] = {&graph_hvx_config, &graph_dlbc_config, &graph_vtcm_config,
|
||||
&graph_opt_config, nullptr};
|
||||
error = qnn_interface->qnn_graph_create(qnn_context, graph_name.c_str(), graph_configs, &graph_handle);
|
||||
} else {
|
||||
error = qnn_interface->qnn_graph_create(qnn_context, graph_name.c_str(), nullptr, &graph_handle);
|
||||
}
|
||||
|
||||
if (error != QNN_SUCCESS) {
|
||||
QNN_LOG_ERROR("[%s][%s]failed to create qnn graph, error: %s", get_backend_name(device), graph_name.c_str(),
|
||||
get_qnn_error_string(error));
|
||||
return;
|
||||
}
|
||||
|
||||
QNN_LOG_INFO("[%s][%s]create succeed", get_backend_name(device), graph_name.c_str());
|
||||
_graph_handle = graph_handle;
|
||||
_qnn_interface = qnn_interface;
|
||||
}
|
||||
|
||||
~ggml_qnn_graph() { QNN_LOG_DEBUG("[%s][%s]destroy", get_backend_name(_device), _graph_name.c_str()); }
|
||||
|
||||
bool build_graph(ggml_op_constructor_t op_constructor, const ggml_tensor_array_t &tensor_inputs,
|
||||
const ggml_tensor_array_t &tensor_outputs) {
|
||||
GGML_ASSERT(op_constructor);
|
||||
if (!is_valid()) {
|
||||
QNN_LOG_ERROR("Invalid graph");
|
||||
return false;
|
||||
}
|
||||
|
||||
QNN_LOG_DEBUG("[%s][%s]build_graph start", get_backend_name(_device), _graph_name.c_str());
|
||||
_op_config = op_constructor(_graph_name, _qnn_instance);
|
||||
if (!_op_config->initialize_op_nodes(_device, _graph_handle, tensor_inputs, tensor_outputs)) {
|
||||
QNN_LOG_ERROR("[%s][%s]initialize_op_nodes failed", get_backend_name(_device), _graph_name.c_str());
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!_op_config->add_op_to_graph(_graph_handle)) {
|
||||
QNN_LOG_ERROR("[%s]add nodes failed", _graph_name.c_str());
|
||||
return false;
|
||||
}
|
||||
|
||||
auto error = _qnn_interface->qnn_graph_finalize(_graph_handle, nullptr, nullptr);
|
||||
if (error != QNN_SUCCESS) {
|
||||
QNN_LOG_ERROR("[%s][%s]qnn_graph_finalize.error: %s", get_backend_name(_device), _graph_name.c_str(),
|
||||
get_qnn_error_string(error));
|
||||
return false;
|
||||
}
|
||||
|
||||
QNN_LOG_DEBUG("[%s][%s]build_graph succeed", get_backend_name(_device), _graph_name.c_str());
|
||||
return true;
|
||||
}
|
||||
|
||||
bool execute(const ggml_tensor_array_t &tensor_inputs, const ggml_tensor_array_t &tensor_outputs) {
|
||||
if (!_op_config->bind_input_tensors(tensor_inputs)) {
|
||||
QNN_LOG_ERROR("[%s][%s]bind input tensors failed", get_backend_name(_device), _graph_name.c_str());
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!_op_config->bind_output_tensors(tensor_outputs)) {
|
||||
QNN_LOG_ERROR("[%s][%s]bind output tensors failed", get_backend_name(_device), _graph_name.c_str());
|
||||
return false;
|
||||
}
|
||||
|
||||
auto &qnn_tensor_inputs = _op_config->get_qnn_input_tensors();
|
||||
auto &qnn_tensor_outputs = _op_config->get_qnn_output_tensors();
|
||||
|
||||
auto error =
|
||||
_qnn_interface->qnn_graph_execute(_graph_handle, qnn_tensor_inputs.data(), qnn_tensor_inputs.size(),
|
||||
qnn_tensor_outputs.data(), qnn_tensor_outputs.size(), nullptr, nullptr);
|
||||
_op_config->unbind_input_tensors();
|
||||
_op_config->unbind_output_tensors();
|
||||
|
||||
if (error != QNN_SUCCESS) {
|
||||
if (_device == QNN_BACKEND_NPU && error == QNN_COMMON_ERROR_SYSTEM_COMMUNICATION) {
|
||||
QNN_LOG_WARN("[%s][%s]NPU crashed. SSR detected. Caused QNN graph execute error.",
|
||||
get_backend_name(_device), _graph_name.c_str());
|
||||
} else {
|
||||
QNN_LOG_ERROR("[%s][%s]error: %s", get_backend_name(_device), _graph_name.c_str(),
|
||||
get_qnn_error_string(error));
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
QNN_LOG_DEBUG("[%s][%s]execute succeed", get_backend_name(_device), _graph_name.c_str());
|
||||
return true;
|
||||
}
|
||||
bool build_graph_from_op(ggml_tensor *op);
|
||||
bool build_graph_from_ggml_graph(const ggml_cgraph *cgraph);
|
||||
|
||||
bool execute(ggml_tensor *op);
|
||||
bool execute(const ggml_cgraph *cgraph);
|
||||
bool is_valid() const { return _graph_handle != nullptr; }
|
||||
|
||||
Qnn_GraphHandle_t get_graph_handler() const { return _graph_handle; }
|
||||
|
||||
std::shared_ptr<qnn_instance> get_qnn_instance() { return _qnn_instance; }
|
||||
const std::string &get_name() const { return _graph_name; }
|
||||
QNNBackend get_device() const { return _device; }
|
||||
|
||||
private:
|
||||
bool finalize();
|
||||
|
||||
const std::string _graph_name;
|
||||
const QNNBackend _device;
|
||||
Qnn_GraphHandle_t _graph_handle = nullptr;
|
||||
std::shared_ptr<qnn_instance> _qnn_instance;
|
||||
std::shared_ptr<qnn_interface> _qnn_interface;
|
||||
std::unique_ptr<ggml_qnn_op_config> _op_config;
|
||||
std::vector<Qnn_Param_t> _param_types;
|
||||
qnn_op_config_array_t _operations;
|
||||
|
||||
DISABLE_COPY(ggml_qnn_graph);
|
||||
DISABLE_MOVE(ggml_qnn_graph);
|
||||
qnn_tensor_array_t _tensor_inputs;
|
||||
qnn_tensor_array_t _tensor_outputs;
|
||||
std::vector<Qnn_Tensor_t> _qnn_tensor_inputs;
|
||||
std::vector<Qnn_Tensor_t> _qnn_tensor_outputs;
|
||||
|
||||
DISABLE_COPY(qnn_graph);
|
||||
DISABLE_MOVE(qnn_graph);
|
||||
};
|
||||
|
||||
using qnn_graph_ptr_t = std::shared_ptr<qnn_graph>;
|
||||
|
||||
} // namespace qnn
|
||||
|
|
|
|||
|
|
@ -10,8 +10,6 @@
|
|||
|
||||
namespace qnn {
|
||||
|
||||
using ggml_tensor_array_t = std::vector<ggml_tensor *>;
|
||||
|
||||
/**
|
||||
* @class ggml_qnn_op_config
|
||||
* @brief Abstract base class for configuring QNN operations.
|
||||
|
|
@ -23,6 +21,34 @@ class ggml_qnn_op_config {
|
|||
public:
|
||||
virtual ~ggml_qnn_op_config() {}
|
||||
|
||||
/**
|
||||
* @brief Sets custom input tensors for the operation. This method should be called before `initialize_op_nodes`.
|
||||
* If no custom input tensors are provided, the input tensors will be automatically created from the input ggml
|
||||
* tensors.
|
||||
*
|
||||
* This pure virtual function must be overridden by derived classes to set
|
||||
* the input tensors for the operation. The function takes a reference to a
|
||||
* vector of qnn_tensor_ptr_t objects, which represent the input tensors.
|
||||
*
|
||||
* @param tensor_inputs A reference to a vector of qnn_tensor_ptr_t objects representing the input tensors.
|
||||
*/
|
||||
virtual void set_input_tensors(qnn::qnn_tensor_array_t &tensor_inputs) = 0;
|
||||
virtual void set_input_tensors(qnn::qnn_tensor_array_t &&tensor_inputs) = 0;
|
||||
|
||||
/**
|
||||
* @brief Sets custom output tensors for the operation. This method should be called before `initialize_op_nodes`.
|
||||
* If no custom output tensors are provided, the output tensors will be automatically created from the output ggml
|
||||
* tensors.
|
||||
*
|
||||
* This pure virtual function must be overridden by derived classes to set
|
||||
* the output tensors for the operation. The function takes a reference to a
|
||||
* vector of qnn_tensor_ptr_t objects, which represent the output tensors.
|
||||
*
|
||||
* @param tensor_outputs A reference to a vector of qnn_tensor_ptr_t objects representing the output tensors.
|
||||
*/
|
||||
    virtual void set_output_tensors(qnn::qnn_tensor_array_t &tensor_outputs) = 0;
    virtual void set_output_tensors(qnn::qnn_tensor_array_t &&tensor_outputs) = 0;
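
    // Illustrative call-order sketch (not part of this patch; `op`, `device` and
    // `graph_handle` are placeholder names for a concrete op config and its target graph):
    //   op->set_input_tensors(inputs);    // optional, must happen before initialize_op_nodes
    //   op->set_output_tensors(outputs);  // optional, must happen before initialize_op_nodes
    //   op->initialize_op_nodes(device, graph_handle);  // creates remaining tensors/nodes
    //   op->add_op_to_graph(graph_handle);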
|
||||
|
||||
/**
|
||||
* @brief Creates tensors and internal nodes for constructing the calculation graph.
|
||||
*
|
||||
|
|
@ -31,36 +57,32 @@ public:
|
|||
* the internal nodes necessary for constructing the calculation graph. It takes
|
||||
* input and output tensor arrays as parameters.
|
||||
*
|
||||
* @param device The backend device where tensors will be created.
|
||||
* @param graph_handle The handle to the graph where tensors and nodes will be associated.
|
||||
* @param tensor_inputs An array of input tensors.
|
||||
* @param tensor_outputs An array of output tensors.
|
||||
* @param device
|
||||
* @param graph_handle
|
||||
* @return true if tensors and nodes are successfully created, false otherwise.
|
||||
*/
|
||||
virtual bool initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle,
|
||||
const ggml_tensor_array_t &tensor_inputs,
|
||||
const ggml_tensor_array_t &tensor_outputs) = 0;
|
||||
virtual bool initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle) = 0;
|
||||
|
||||
/**
|
||||
* @brief Pure virtual function to retrieve the input tensors for QNN (Quantized Neural Network).
|
||||
* @brief Pure virtual function to retrieve the input tensors.
|
||||
*
|
||||
* This function must be overridden by derived classes to provide the specific implementation
|
||||
* for retrieving the input tensors used in QNN operations.
|
||||
*
|
||||
* @return A reference to a vector of Qnn_Tensor_t objects representing the input tensors.
|
||||
* @return A reference to a vector of qnn_tensor_ptr_t objects representing the input tensors.
|
||||
*/
|
||||
virtual std::vector<Qnn_Tensor_t> &get_qnn_input_tensors() = 0;
|
||||
virtual const qnn_tensor_array_t &get_input_tensors() = 0;
|
||||
|
||||
/**
|
||||
* @brief Pure virtual function to retrieve the output tensors of a QNN (Quantized Neural Network).
|
||||
* @brief Pure virtual function to retrieve the output tensors of a QNN.
|
||||
*
|
||||
* This function must be overridden by any derived class to provide access to the
|
||||
* output tensors of the QNN. The function returns a reference to a vector of
|
||||
* Qnn_Tensor_t objects, which represent the output tensors.
|
||||
* qnn_tensor_ptr_t objects, which represent the output tensors.
|
||||
*
|
||||
* @return std::vector<Qnn_Tensor_t>& Reference to a vector of Qnn_Tensor_t objects.
|
||||
* @return A reference to a vector of qnn_tensor_ptr_t objects representing the output tensors.
|
||||
*/
|
||||
virtual std::vector<Qnn_Tensor_t> &get_qnn_output_tensors() = 0;
|
||||
virtual const qnn_tensor_array_t &get_output_tensors() = 0;
|
||||
|
||||
/**
|
||||
* @brief Adds an operation to the given graph.
|
||||
|
|
@ -125,5 +147,6 @@ public:
|
|||
};
|
||||
|
||||
using qnn_op_config_ptr_t = std::shared_ptr<ggml_qnn_op_config>;
|
||||
using qnn_op_config_array_t = std::vector<qnn_op_config_ptr_t>;
|
||||
|
||||
} // namespace qnn
|
||||
|
|
|
|||
|
|
@ -0,0 +1,223 @@
|
|||
|
||||
#include "op-config.hpp"
|
||||
|
||||
namespace {
|
||||
|
||||
using op_dims_calc_func_t = void (*)(const std::vector<const qnn::ggml_dimension_array_t> &input_dims,
                                     qnn::ggml_dimension_array_t &output_dims);

void element_wise_op_dims(const std::vector<const qnn::ggml_dimension_array_t> &input_dims,
                          qnn::ggml_dimension_array_t &output_dims) {
    for (size_t i = 1; i < std::size(output_dims); i++) {
        output_dims[i] = input_dims.front()[i];
    }
}

void mat_mul_op_dims(const std::vector<const qnn::ggml_dimension_array_t> &input_dims,
                     qnn::ggml_dimension_array_t &output_dims) {
    GGML_ASSERT(input_dims.size() == 2);
    output_dims[0] = input_dims.front()[1];
    output_dims[1] = input_dims.back()[1];
}
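
// Illustrative example (an assumption about ggml's dimension order, where ne[0] is the
// innermost axis): for mul_mat with src0 ne = {K, M} and src1 ne = {K, N}, mat_mul_op_dims
// above yields output ne[0] = M and ne[1] = N.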
|
||||
|
||||
struct qnn_op_caps_t {
|
||||
const char *qnn_op_name = nullptr;
|
||||
const size_t input_param_count = 0;
|
||||
op_dims_calc_func_t calc_dims_func = nullptr;
|
||||
};
|
||||
|
||||
constexpr const qnn_op_caps_t kOpCaps[] = {
|
||||
{}, // GGML_OP_NONE
|
||||
{}, // GGML_OP_DUP
|
||||
{
|
||||
// GGML_OP_ADD
|
||||
QNN_OP_ELEMENT_WISE_ADD, // qnn_op_name
|
||||
2, // input_param_count
|
||||
element_wise_op_dims, // calc_dims_func
|
||||
},
|
||||
{}, // GGML_OP_ADD1
|
||||
{}, // GGML_OP_ACC
|
||||
{
|
||||
// GGML_OP_SUB
|
||||
QNN_OP_ELEMENT_WISE_SUBTRACT, // qnn_op_name
|
||||
2, // input_param_count
|
||||
element_wise_op_dims, // calc_dims_func
|
||||
},
|
||||
{
|
||||
// GGML_OP_MUL
|
||||
QNN_OP_ELEMENT_WISE_MULTIPLY, // qnn_op_name
|
||||
2, // input_param_count
|
||||
element_wise_op_dims, // calc_dims_func
|
||||
},
|
||||
{
|
||||
// GGML_OP_DIV
|
||||
QNN_OP_ELEMENT_WISE_DIVIDE, // qnn_op_name
|
||||
2, // input_param_count
|
||||
element_wise_op_dims, // calc_dims_func
|
||||
},
|
||||
{}, // GGML_OP_SQR
|
||||
{
|
||||
// GGML_OP_SQRT
|
||||
QNN_OP_ELEMENT_WISE_SQUARE_ROOT, // qnn_op_name
|
||||
1, // input_param_count
|
||||
element_wise_op_dims, // calc_dims_func
|
||||
},
|
||||
{
|
||||
// GGML_OP_LOG
|
||||
QNN_OP_ELEMENT_WISE_LOG, // qnn_op_name
|
||||
1, // input_param_count
|
||||
element_wise_op_dims, // calc_dims_func
|
||||
},
|
||||
{}, // GGML_OP_SIN
|
||||
{}, // GGML_OP_COS
|
||||
{}, // GGML_OP_SUM
|
||||
{}, // GGML_OP_SUM_ROWS
|
||||
{}, // GGML_OP_MEAN
|
||||
{}, // GGML_OP_ARGMAX
|
||||
{}, // GGML_OP_COUNT_EQUAL
|
||||
{}, // GGML_OP_REPEAT
|
||||
{}, // GGML_OP_REPEAT_BACK
|
||||
{}, // GGML_OP_CONCAT
|
||||
{}, // GGML_OP_SILU_BACK
|
||||
{}, // GGML_OP_NORM
|
||||
{}, // GGML_OP_RMS_NORM
|
||||
{}, // GGML_OP_RMS_NORM_BACK
|
||||
{}, // GGML_OP_GROUP_NORM
|
||||
{
|
||||
// GGML_OP_MUL_MAT
|
||||
QNN_OP_MAT_MUL, // qnn_op_name
|
||||
2, // input_param_count
|
||||
mat_mul_op_dims, // calc_dims_func
|
||||
},
|
||||
{}, // GGML_OP_MUL_MAT_ID
|
||||
{}, // GGML_OP_OUT_PROD
|
||||
{}, // GGML_OP_SCALE
|
||||
{}, // GGML_OP_SET
|
||||
{}, // GGML_OP_CPY
|
||||
{}, // GGML_OP_CONT
|
||||
{
|
||||
// GGML_OP_RESHAPE
|
||||
QNN_OP_RESHAPE, // qnn_op_name
|
||||
1, // input_param_count
|
||||
nullptr, // TODO: calc_dims_func
|
||||
},
|
||||
{}, // GGML_OP_VIEW
|
||||
{}, // GGML_OP_PERMUTE
|
||||
{}, // GGML_OP_TRANSPOSE
|
||||
{}, // GGML_OP_GET_ROWS
|
||||
{}, // GGML_OP_GET_ROWS_BACK
|
||||
{}, // GGML_OP_DIAG
|
||||
{}, // GGML_OP_DIAG_MASK_INF
|
||||
{}, // GGML_OP_DIAG_MASK_ZERO
|
||||
{}, // GGML_OP_SOFT_MAX
|
||||
{}, // GGML_OP_SOFT_MAX_BACK
|
||||
{}, // GGML_OP_ROPE
|
||||
{}, // GGML_OP_ROPE_BACK
|
||||
{}, // GGML_OP_CLAMP
|
||||
{}, // GGML_OP_CONV_TRANSPOSE_1D
|
||||
{}, // GGML_OP_IM2COL
|
||||
{}, // GGML_OP_IM2COL_BACK
|
||||
{}, // GGML_OP_CONV_TRANSPOSE_2D
|
||||
{}, // GGML_OP_POOL_1D
|
||||
{}, // GGML_OP_POOL_2D
|
||||
{}, // GGML_OP_POOL_2D_BACK
|
||||
{}, // GGML_OP_UPSCALE
|
||||
{}, // GGML_OP_PAD
|
||||
{}, // GGML_OP_PAD_REFLECT_1D
|
||||
{}, // GGML_OP_ARANGE
|
||||
|
||||
{}, // GGML_OP_TIMESTEP_EMBEDDING
|
||||
{}, // GGML_OP_ARGSORT
|
||||
{}, // GGML_OP_LEAKY_RELU
|
||||
|
||||
{}, // GGML_OP_FLASH_ATTN_EXT
|
||||
{}, // GGML_OP_FLASH_ATTN_BACK
|
||||
{}, // GGML_OP_SSM_CONV
|
||||
{}, // GGML_OP_SSM_SCAN
|
||||
{}, // GGML_OP_WIN_PART
|
||||
{}, // GGML_OP_WIN_UNPART
|
||||
{}, // GGML_OP_GET_REL_POS
|
||||
{}, // GGML_OP_ADD_REL_POS
|
||||
{}, // GGML_OP_RWKV_WKV6
|
||||
|
||||
{}, // GGML_OP_UNARY
|
||||
|
||||
{}, // GGML_OP_MAP_UNARY
|
||||
{}, // GGML_OP_MAP_BINARY
|
||||
|
||||
{}, // GGML_OP_MAP_CUSTOM1_F32
|
||||
{}, // GGML_OP_MAP_CUSTOM2_F32
|
||||
{}, // GGML_OP_MAP_CUSTOM3_F32
|
||||
|
||||
{}, // GGML_OP_MAP_CUSTOM1
|
||||
{}, // GGML_OP_MAP_CUSTOM2
|
||||
{}, // GGML_OP_MAP_CUSTOM3
|
||||
|
||||
{}, // GGML_OP_CROSS_ENTROPY_LOSS
|
||||
{}, // GGML_OP_CROSS_ENTROPY_LOSS_BACK
|
||||
{}, // GGML_OP_OPT_STEP_ADAMW
|
||||
|
||||
// ggml_unary_op
|
||||
{}, // GGML_UNARY_OP_ABS
|
||||
{}, // GGML_UNARY_OP_SGN
|
||||
{}, // GGML_UNARY_OP_NEG
|
||||
{}, // GGML_UNARY_OP_STEP
|
||||
{}, // GGML_UNARY_OP_TANH
|
||||
{}, // GGML_UNARY_OP_ELU
|
||||
{}, // GGML_UNARY_OP_RELU
|
||||
{}, // GGML_UNARY_OP_SIGMOID
|
||||
{
|
||||
// GGML_UNARY_OP_GELU
|
||||
QNN_OP_GELU, // qnn_op_name
|
||||
1, // input_param_count
|
||||
nullptr, // TODO: calc_dims_func
|
||||
},
|
||||
{}, // GGML_UNARY_OP_GELU_QUICK
|
||||
{}, // GGML_UNARY_OP_SILU
|
||||
{}, // GGML_UNARY_OP_HARDSWISH
|
||||
{}, // GGML_UNARY_OP_HARDSIGMOID
|
||||
{}, // GGML_UNARY_OP_EXP
|
||||
};
|
||||
|
||||
static_assert(kOpCaps[GGML_OP_NONE].calc_dims_func == nullptr, "GGML_OP_NONE should not have calc_dims_func function");
static_assert(kOpCaps[GGML_OP_ADD].calc_dims_func == element_wise_op_dims,
              "GGML_OP_ADD does not have element_wise_op_dims function");
static_assert(kOpCaps[GGML_OP_MUL_MAT].calc_dims_func == mat_mul_op_dims,
              "GGML_OP_MUL_MAT does not have mat_mul_op_dims function");
static_assert(kOpCaps[GGML_OP_LOG].calc_dims_func == element_wise_op_dims,
              "GGML_OP_LOG does not have element_wise_op_dims function");
static_assert(std::size(kOpCaps) == (GGML_OP_COUNT + GGML_UNARY_OP_COUNT),
              "GGML_OP_COUNT does not match the size of the kOpCaps table");
|
||||
|
||||
} // namespace
|
||||
|
||||
namespace qnn {

size_t get_qnn_op_index(const ggml_tensor *tensor) {
    if (tensor->op == GGML_OP_UNARY) {
        return kGgmlUnaryOpStart + ggml_get_unary_op(tensor);
    }

    return tensor->op;
}
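
// For example (illustrative): a GGML_OP_ADD node maps to index GGML_OP_ADD, while a
// GGML_OP_UNARY node wrapping GGML_UNARY_OP_GELU maps to kGgmlUnaryOpStart + GGML_UNARY_OP_GELU,
// i.e. the unary section appended after the regular ops in kOpCaps.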

void get_ggml_op_output_dimensions(const std::vector<const ggml_dimension_array_t> &input_dims, size_t op,
                                   ggml_dimension_array_t &output_dims) {
    GGML_ASSERT(op < std::size(kOpCaps));
    auto get_dims = kOpCaps[op].calc_dims_func;
    GGML_ASSERT(get_dims);
    get_dims(input_dims, output_dims);
}

const char *get_qnn_op_name(size_t op) {
    GGML_ASSERT(op < std::size(kOpCaps));
    GGML_ASSERT(kOpCaps[op].qnn_op_name);
    return kOpCaps[op].qnn_op_name;
}

size_t get_qnn_op_input_param_count(size_t op) {
    GGML_ASSERT(op < std::size(kOpCaps));
    return kOpCaps[op].input_param_count;
}

} // namespace qnn

@ -24,16 +24,7 @@ qnn::qnn_dimension_array_t get_transposed_dimensions(const qnn::qnn_dimension_ar
|
|||
}
|
||||
|
||||
int get_rank(const qnn::ggml_tensor_array_t &tensor_inputs, const qnn::ggml_tensor_array_t &tensor_outputs) {
|
||||
int tensor_rank = 0;
|
||||
// get the max tensor rank
|
||||
for (auto tensor : tensor_inputs) {
|
||||
tensor_rank = std::max(tensor_rank, ggml_n_dims(tensor));
|
||||
}
|
||||
for (auto tensor : tensor_outputs) {
|
||||
tensor_rank = std::max(tensor_rank, ggml_n_dims(tensor));
|
||||
}
|
||||
|
||||
return tensor_rank;
|
||||
return std::max(qnn::get_ggml_tensors_max_rank(tensor_inputs), qnn::get_ggml_tensors_max_rank(tensor_outputs));
|
||||
}
|
||||
|
||||
Qnn_DataType_t get_tensor_type(const qnn::qnn_tensor_array_t &tensors) {
|
||||
|
|
@ -49,93 +40,6 @@ Qnn_DataType_t get_tensor_type(const qnn::qnn_tensor_array_t &tensors) {
|
|||
return type;
|
||||
}
|
||||
|
||||
struct tensor_common_params {
|
||||
const char *name_prefix;
|
||||
int tensor_rank;
|
||||
bool is_input;
|
||||
QNNBackend device;
|
||||
Qnn_GraphHandle_t graph_handle;
|
||||
std::shared_ptr<qnn::qnn_instance> qnn_instance;
|
||||
};
|
||||
|
||||
void create_tensors_from_ggml_tensor(const tensor_common_params ¶ms, const qnn::ggml_tensor_array_t &ggml_tensors,
|
||||
qnn::qnn_tensor_array_t *tensor_wrappers, std::vector<Qnn_Tensor_t> *qnn_tensors) {
|
||||
using namespace qnn;
|
||||
|
||||
tensor_wrappers->resize(ggml_tensors.size());
|
||||
if (qnn_tensors) {
|
||||
qnn_tensors->resize(ggml_tensors.size());
|
||||
}
|
||||
char buffer[GGML_MAX_NAME] = {};
|
||||
auto tensor_type = params.is_input ? ggml_qnn_tensor::INPUT : ggml_qnn_tensor::OUTPUT;
|
||||
for (size_t i = 0; i < ggml_tensors.size(); i++) {
|
||||
snprintf(buffer, GGML_MAX_NAME, "%s%d", params.name_prefix, (int)i);
|
||||
auto *ggml_tensor = ggml_tensors[i];
|
||||
(*tensor_wrappers)[i] = std::make_shared<ggml_qnn_tensor>(tensor_type, std::string(buffer), ggml_tensor->ne,
|
||||
ggml_tensor->type, params.tensor_rank, params.device,
|
||||
params.graph_handle, params.qnn_instance);
|
||||
}
|
||||
}
|
||||
|
||||
bool bind_tensors(const qnn::ggml_tensor_array_t &ggml_tensors, qnn::qnn_tensor_array_t &tensor_wrappers,
|
||||
std::vector<Qnn_Tensor_t> &qnn_tensors) {
|
||||
for (size_t i = 0; i < ggml_tensors.size(); i++) {
|
||||
auto *ggml_tensor = ggml_tensors[i];
|
||||
if (!tensor_wrappers[i]->bind_ggml_tensor(ggml_tensor)) {
|
||||
QNN_LOG_ERROR("bind tensor %s failed", ggml_get_name(ggml_tensor));
|
||||
return false;
|
||||
}
|
||||
|
||||
qnn_tensors[i] = tensor_wrappers[i]->get_qnn_tensor();
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
class ggml_qnn_connectable_op_config : public qnn::ggml_qnn_op_config_base {
|
||||
public:
|
||||
explicit ggml_qnn_connectable_op_config(const std::string &name, const std::string &package_name,
|
||||
const std::string &op_type, std::shared_ptr<qnn::qnn_instance> qnn_instance)
|
||||
: ggml_qnn_op_config_base(name, package_name, op_type, qnn_instance) {}
|
||||
|
||||
bool initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle,
|
||||
const qnn::ggml_tensor_array_t &tensor_inputs,
|
||||
const qnn::ggml_tensor_array_t &tensor_outputs) override {
|
||||
GGML_UNUSED(device);
|
||||
GGML_UNUSED(graph_handle);
|
||||
GGML_UNUSED(tensor_inputs);
|
||||
GGML_UNUSED(tensor_outputs);
|
||||
return true;
|
||||
}
|
||||
|
||||
void set_input_tensors(qnn::qnn_tensor_array_t &tensor_inputs) {
|
||||
_tensor_inputs = tensor_inputs;
|
||||
_qnn_tensor_inputs.resize(_tensor_inputs.size());
|
||||
}
|
||||
|
||||
void set_input_tensors(qnn::qnn_tensor_array_t &&tensor_inputs) {
|
||||
_tensor_inputs = std::move(tensor_inputs);
|
||||
_qnn_tensor_inputs.resize(_tensor_inputs.size());
|
||||
}
|
||||
|
||||
void set_output_tensors(qnn::qnn_tensor_array_t &tensor_outputs) {
|
||||
_tensor_outputs = tensor_outputs;
|
||||
_qnn_tensor_outputs.resize(_tensor_outputs.size());
|
||||
}
|
||||
|
||||
void set_output_tensors(qnn::qnn_tensor_array_t &&tensor_outputs) {
|
||||
_tensor_outputs = std::move(tensor_outputs);
|
||||
_qnn_tensor_outputs.resize(_tensor_outputs.size());
|
||||
}
|
||||
|
||||
qnn::qnn_tensor_array_t &get_input_tensors() { return _tensor_inputs; }
|
||||
qnn::qnn_tensor_array_t &get_output_tensors() { return _tensor_outputs; }
|
||||
|
||||
private:
|
||||
DISABLE_COPY(ggml_qnn_connectable_op_config);
|
||||
DISABLE_MOVE(ggml_qnn_connectable_op_config);
|
||||
};
|
||||
|
||||
} // namespace
|
||||
|
||||
namespace qnn {
|
||||
|
|
@ -161,7 +65,7 @@ bool ggml_qnn_op_config_base::add_tensor_param(const std::string &name, const qn
|
|||
}
|
||||
|
||||
GGML_ASSERT(data_size > 0);
|
||||
if (!param_tensor->bind_buffer(const_cast<uint8_t *>(data), data_size)) {
|
||||
if (!param_tensor->set_data_buffer(data, data_size)) {
|
||||
QNN_LOG_ERROR("parameter tensor bind_buffer failed");
|
||||
return false;
|
||||
}
|
||||
|
|
@ -181,6 +85,26 @@ bool ggml_qnn_op_config_base::add_tensor_param(const std::string &name, const qn
|
|||
return true;
|
||||
}
|
||||
|
||||
void ggml_qnn_op_config_base::set_input_tensors(qnn::qnn_tensor_array_t &tensor_inputs) {
    _tensor_inputs = tensor_inputs;
    _qnn_tensor_inputs.resize(_tensor_inputs.size());
}

void ggml_qnn_op_config_base::set_input_tensors(qnn::qnn_tensor_array_t &&tensor_inputs) {
    _tensor_inputs = std::move(tensor_inputs);
    _qnn_tensor_inputs.resize(_tensor_inputs.size());
}

void ggml_qnn_op_config_base::set_output_tensors(qnn::qnn_tensor_array_t &tensor_outputs) {
    _tensor_outputs = tensor_outputs;
    _qnn_tensor_outputs.resize(_tensor_outputs.size());
}

void ggml_qnn_op_config_base::set_output_tensors(qnn::qnn_tensor_array_t &&tensor_outputs) {
    _tensor_outputs = std::move(tensor_outputs);
    _qnn_tensor_outputs.resize(_tensor_outputs.size());
}
|
||||
|
||||
bool ggml_qnn_op_config_base::add_op_to_graph(Qnn_GraphHandle_t graph_handle) {
|
||||
GGML_ASSERT(_qnn_tensor_inputs.size() == _tensor_inputs.size());
|
||||
GGML_ASSERT(_qnn_tensor_outputs.size() == _tensor_outputs.size());
|
||||
|
|
@ -221,12 +145,12 @@ bool ggml_qnn_op_config_base::add_op_to_graph(Qnn_GraphHandle_t graph_handle) {
|
|||
|
||||
bool ggml_qnn_op_config_base::bind_input_tensors(const ggml_tensor_array_t &tensor_inputs) {
|
||||
GGML_ASSERT(tensor_inputs.size() == _tensor_inputs.size());
|
||||
return bind_tensors(tensor_inputs, _tensor_inputs, _qnn_tensor_inputs);
|
||||
return qnn::bind_tensors(tensor_inputs, _tensor_inputs, _qnn_tensor_inputs);
|
||||
}
|
||||
|
||||
bool ggml_qnn_op_config_base::bind_output_tensors(const ggml_tensor_array_t &tensor_outputs) {
|
||||
GGML_ASSERT(tensor_outputs.size() == _tensor_outputs.size());
|
||||
return bind_tensors(tensor_outputs, _tensor_outputs, _qnn_tensor_outputs);
|
||||
return qnn::bind_tensors(tensor_outputs, _tensor_outputs, _qnn_tensor_outputs);
|
||||
}
|
||||
|
||||
void ggml_qnn_op_config_base::unbind_input_tensors() {
|
||||
|
|
@ -257,55 +181,42 @@ Qnn_OpConfig_t ggml_qnn_op_config_base::get_op_config() {
|
|||
return config;
|
||||
}
|
||||
|
||||
bool ggml_qnn_single_op_config::initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle,
|
||||
const ggml_tensor_array_t &tensor_inputs,
|
||||
const ggml_tensor_array_t &tensor_outputs) {
|
||||
const auto tensor_rank = get_rank(tensor_inputs, tensor_outputs);
|
||||
tensor_common_params params = {"src", tensor_rank, true, device, graph_handle, _qnn_instance};
|
||||
create_tensors_from_ggml_tensor(params, tensor_inputs, &_tensor_inputs, &_qnn_tensor_inputs);
|
||||
params.name_prefix = "dst";
|
||||
params.is_input = false;
|
||||
create_tensors_from_ggml_tensor(params, tensor_outputs, &_tensor_outputs, &_qnn_tensor_outputs);
|
||||
|
||||
if (_param_buffer.size() > 0) {
|
||||
// handle parameters in output tensor
|
||||
auto *params = tensor_outputs.front()->op_params;
|
||||
memcpy(_param_buffer.data(), params, _param_buffer.size());
|
||||
|
||||
const uint32_t count = uint32_t(_param_buffer.size() / qnn_datatype_size(_param_type));
|
||||
const qnn_dimension_array_t param_dims = {count, 1, 1, 1};
|
||||
add_tensor_param(_param_name, param_dims, 1, _param_buffer.data(), _param_type, device, graph_handle);
|
||||
}
|
||||
|
||||
bool ggml_qnn_single_op_config::initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle) {
|
||||
GGML_UNUSED(device);
|
||||
GGML_UNUSED(graph_handle);
|
||||
return true;
|
||||
}
|
||||
|
||||
void ggml_qnn_aggregate_op_config::set_input_tensors(qnn::qnn_tensor_array_t &tensor_inputs) {
|
||||
_tensor_inputs = tensor_inputs;
|
||||
}
|
||||
|
||||
void ggml_qnn_aggregate_op_config::set_input_tensors(qnn::qnn_tensor_array_t &&tensor_inputs) {
|
||||
_tensor_inputs = std::move(tensor_inputs);
|
||||
}
|
||||
|
||||
void ggml_qnn_aggregate_op_config::set_output_tensors(qnn::qnn_tensor_array_t &tensor_outputs) {
|
||||
_tensor_outputs = tensor_outputs;
|
||||
}
|
||||
|
||||
void ggml_qnn_aggregate_op_config::set_output_tensors(qnn::qnn_tensor_array_t &&tensor_outputs) {
|
||||
_tensor_outputs = std::move(tensor_outputs);
|
||||
}
|
||||
|
||||
bool ggml_qnn_aggregate_op_config::bind_input_tensors(const ggml_tensor_array_t &tensor_inputs) {
|
||||
return bind_tensors(tensor_inputs, _tensor_inputs, _qnn_tensor_inputs);
|
||||
return qnn::bind_tensors(tensor_inputs, _tensor_inputs);
|
||||
}
|
||||
|
||||
bool ggml_qnn_aggregate_op_config::bind_output_tensors(const ggml_tensor_array_t &tensor_outputs) {
|
||||
return bind_tensors(tensor_outputs, _tensor_outputs, _qnn_tensor_outputs);
|
||||
return qnn::bind_tensors(tensor_outputs, _tensor_outputs);
|
||||
}
|
||||
|
||||
bool ggml_qnn_matmul_op_config::initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle,
|
||||
const ggml_tensor_array_t &tensor_inputs,
|
||||
const ggml_tensor_array_t &tensor_outputs) {
|
||||
GGML_ASSERT(tensor_inputs.size() == 2);
|
||||
GGML_ASSERT(tensor_outputs.size() == 1);
|
||||
const auto tensor_rank = get_rank(tensor_inputs, tensor_outputs);
|
||||
GGML_ASSERT(tensor_rank >= 2);
|
||||
|
||||
// create input tensors
|
||||
tensor_common_params params = {"src", tensor_rank, true, device, graph_handle, _qnn_instance};
|
||||
create_tensors_from_ggml_tensor(params, tensor_inputs, &_tensor_inputs, &_qnn_tensor_inputs);
|
||||
|
||||
// create output tensor
|
||||
params.name_prefix = "dst";
|
||||
params.is_input = false;
|
||||
create_tensors_from_ggml_tensor(params, tensor_outputs, &_tensor_outputs, &_qnn_tensor_outputs);
|
||||
bool ggml_qnn_matmul_op_config::initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle) {
|
||||
GGML_ASSERT(_tensor_inputs.size() == 2);
|
||||
GGML_ASSERT(_tensor_outputs.size() == 1);
|
||||
|
||||
// create convert nodes
|
||||
const auto tensor_rank = _tensor_inputs.front()->get_rank();
|
||||
qnn_tensor_array_t mat_mul_tensor_inputs = _tensor_inputs;
|
||||
qnn_tensor_array_t mat_mul_tensor_outputs = _tensor_outputs;
|
||||
if (!create_convert_nodes(device, graph_handle, tensor_rank, mat_mul_tensor_inputs, mat_mul_tensor_outputs)) {
|
||||
|
|
@ -343,8 +254,8 @@ qnn_tensor_ptr_t ggml_qnn_matmul_op_config::create_gather_nodes(QNNBackend devic
|
|||
auto gather_out =
|
||||
std::make_shared<ggml_qnn_tensor>(ggml_qnn_tensor::INTERMEDIATE, name + "_out", dimensions,
|
||||
tensor_input->get_data_type(), rank, device, graph_handle, qnn_instance);
|
||||
auto gather_op = std::make_shared<ggml_qnn_connectable_op_config>(name, QNN_OP_PACKAGE_NAME_QTI_AISW,
|
||||
QNN_OP_GATHER, qnn_instance);
|
||||
auto gather_op = std::make_shared<ggml_qnn_single_op_config>(name, QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_GATHER,
|
||||
qnn_instance);
|
||||
|
||||
Qnn_Scalar_t scalar = QNN_SCALAR_INIT;
|
||||
scalar.dataType = QNN_DATATYPE_INT_32;
|
||||
|
|
@ -355,16 +266,16 @@ qnn_tensor_ptr_t ggml_qnn_matmul_op_config::create_gather_nodes(QNNBackend devic
|
|||
// here we calculate the index mapping, will generate a 1d tensor like [0, 0, 0, 1, 1, 1, 2, 2, 2, ...],
|
||||
// by repeating each index [scale] times.
|
||||
const auto scale = dimensions[axis] / tensor_input->get_dimensions()[axis];
|
||||
std::vector<uint8_t> index_buffer(dimensions[axis] * sizeof(uint32_t));
|
||||
for (uint32_t *curr = reinterpret_cast<uint32_t *>(index_buffer.data()), *end = curr + dimensions[axis];
|
||||
auto index_buffer = std::make_shared<qnn_mem_buffer>(dimensions[axis] * sizeof(uint32_t));
|
||||
for (uint32_t *curr = reinterpret_cast<uint32_t *>(index_buffer->get_buffer()), *end = curr + dimensions[axis];
|
||||
curr < end; curr++) {
|
||||
*curr = (curr - reinterpret_cast<uint32_t *>(index_buffer.data())) / scale;
|
||||
*curr = (curr - reinterpret_cast<uint32_t *>(index_buffer->get_buffer())) / scale;
|
||||
}
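    // Worked example (illustrative): if the source tensor has 2 slices along `axis` and
    // dimensions[axis] == 6, then scale == 3 and the index buffer becomes {0, 0, 0, 1, 1, 1},
    // so the gather repeats each source slice 3 times.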
|
||||
|
||||
auto gather_index = std::make_shared<ggml_qnn_tensor>(
|
||||
ggml_qnn_tensor::PARAMETER, name + "_index", qnn_dimension_array_t{dimensions[axis]}, QNN_DATATYPE_UINT_32,
|
||||
1, device, graph_handle, qnn_instance);
|
||||
gather_index->set_data_buffer(std::move(index_buffer));
|
||||
gather_index->set_data_buffer(index_buffer);
|
||||
gather_op->set_input_tensors({tensor_input, gather_index});
|
||||
|
||||
tensor_output = gather_out;
|
||||
|
|
@ -409,8 +320,8 @@ bool ggml_qnn_matmul_op_config::create_convert_nodes(QNNBackend device, Qnn_Grap
|
|||
auto convert_out = std::make_shared<ggml_qnn_tensor>(ggml_qnn_tensor::INTERMEDIATE, convert_name + "_out",
|
||||
convert_in->get_dimensions(), tensor_type, rank, device,
|
||||
graph_handle, _qnn_instance);
|
||||
auto convert = std::make_shared<ggml_qnn_connectable_op_config>(convert_name, QNN_OP_PACKAGE_NAME_QTI_AISW,
|
||||
QNN_OP_CONVERT, _qnn_instance);
|
||||
auto convert = std::make_shared<ggml_qnn_single_op_config>(convert_name, QNN_OP_PACKAGE_NAME_QTI_AISW,
|
||||
QNN_OP_CONVERT, _qnn_instance);
|
||||
convert->set_input_tensors({convert_in});
|
||||
convert->set_output_tensors({convert_out});
|
||||
tensor_inputs[i] = convert_out;
|
||||
|
|
@ -424,8 +335,8 @@ bool ggml_qnn_matmul_op_config::create_convert_nodes(QNNBackend device, Qnn_Grap
|
|||
auto convert_in = std::make_shared<ggml_qnn_tensor>(ggml_qnn_tensor::INTERMEDIATE, convert_name + "_in",
|
||||
convert_out->get_dimensions(), tensor_type, rank, device,
|
||||
graph_handle, _qnn_instance);
|
||||
auto output_convert = std::make_shared<ggml_qnn_connectable_op_config>(
|
||||
convert_name, QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_CONVERT, _qnn_instance);
|
||||
auto output_convert = std::make_shared<ggml_qnn_single_op_config>(convert_name, QNN_OP_PACKAGE_NAME_QTI_AISW,
|
||||
QNN_OP_CONVERT, _qnn_instance);
|
||||
output_convert->set_input_tensors({convert_in});
|
||||
output_convert->set_output_tensors({convert_out});
|
||||
tensor_outputs.front() = convert_in;
|
||||
|
|
@ -495,12 +406,12 @@ bool ggml_qnn_matmul_op_config::create_mat_mul_nodes(QNNBackend device, Qnn_Grap
|
|||
dst->get_data_type(), rank, device, graph_handle, _qnn_instance);
|
||||
|
||||
// create transpose_out
|
||||
auto transpose_out = std::make_shared<ggml_qnn_connectable_op_config>(
|
||||
_name + "_trans1", QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_TRANSPOSE, _qnn_instance);
|
||||
auto transpose_out = std::make_shared<ggml_qnn_single_op_config>(_name + "_trans1", QNN_OP_PACKAGE_NAME_QTI_AISW,
|
||||
QNN_OP_TRANSPOSE, _qnn_instance);
|
||||
|
||||
// create mat_mul
|
||||
auto mat_mul = std::make_shared<ggml_qnn_connectable_op_config>(_name, QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_MAT_MUL,
|
||||
_qnn_instance);
|
||||
auto mat_mul =
|
||||
std::make_shared<ggml_qnn_single_op_config>(_name, QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_MAT_MUL, _qnn_instance);
|
||||
|
||||
Qnn_Scalar_t scalar = QNN_SCALAR_INIT;
|
||||
scalar.dataType = QNN_DATATYPE_BOOL_8;
|
||||
|
|
@ -528,19 +439,20 @@ bool ggml_qnn_matmul_op_config::create_mat_mul_nodes(QNNBackend device, Qnn_Grap
|
|||
return true;
|
||||
}
|
||||
|
||||
ggml_op_constructor_t create_op_constructor(const std::string &op_name) {
|
||||
ggml_op_constructor_t create_op_constructor(size_t op) {
|
||||
std::string op_name = get_qnn_op_name(op);
|
||||
if (op_name == QNN_OP_MAT_MUL) {
|
||||
// For QNN_OP_MAT_MUL, we need to transpose the input tensor
|
||||
return [](const std::string &instance_name,
|
||||
std::shared_ptr<qnn::qnn_instance> qnn_instance) -> std::unique_ptr<qnn::ggml_qnn_op_config> {
|
||||
std::shared_ptr<qnn::qnn_instance> qnn_instance) -> std::shared_ptr<qnn::ggml_qnn_op_config> {
|
||||
QNN_LOG_DEBUG("create QNN_OP_MAT_MUL, name %s", instance_name.c_str());
|
||||
return std::make_unique<qnn::ggml_qnn_matmul_op_config>(instance_name, qnn_instance);
|
||||
return std::make_shared<qnn::ggml_qnn_matmul_op_config>(instance_name, qnn_instance);
|
||||
};
|
||||
}
|
||||
|
||||
return [op_name](const std::string &instance_name,
|
||||
std::shared_ptr<qnn::qnn_instance> qnn_instance) -> std::unique_ptr<qnn::ggml_qnn_op_config> {
|
||||
return std::make_unique<qnn::ggml_qnn_single_op_config>(instance_name, QNN_OP_PACKAGE_NAME_QTI_AISW, op_name,
|
||||
std::shared_ptr<qnn::qnn_instance> qnn_instance) -> std::shared_ptr<qnn::ggml_qnn_op_config> {
|
||||
return std::make_shared<qnn::ggml_qnn_single_op_config>(instance_name, QNN_OP_PACKAGE_NAME_QTI_AISW, op_name,
|
||||
qnn_instance);
|
||||
};
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,7 +1,7 @@
|
|||
#pragma once
|
||||
|
||||
#include <array>
|
||||
#include <functional>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
|
|
@ -13,9 +13,28 @@
|
|||
namespace qnn {
|
||||
|
||||
using ggml_op_constructor_t =
|
||||
std::function<std::unique_ptr<ggml_qnn_op_config>(const std::string &, std::shared_ptr<qnn_instance>)>;
|
||||
std::function<std::shared_ptr<ggml_qnn_op_config>(const std::string &, std::shared_ptr<qnn_instance>)>;
|
||||
|
||||
ggml_op_constructor_t create_op_constructor(const std::string &op_name);
|
||||
constexpr const size_t kGgmlUnaryOpStart = GGML_OP_COUNT;
|
||||
|
||||
size_t get_qnn_op_index(const ggml_tensor *tensor);
|
||||
void get_ggml_op_output_dimensions(const std::vector<const ggml_dimension_array_t> &input_dims, size_t op,
|
||||
ggml_dimension_array_t &output_dims);
|
||||
|
||||
const char *get_qnn_op_name(size_t op);
|
||||
size_t get_qnn_op_input_param_count(size_t op);
|
||||
|
||||
ggml_op_constructor_t create_op_constructor(size_t op);
|
||||
|
||||
inline bool add_op_to_graph(Qnn_GraphHandle_t graph_handle, std::vector<qnn_op_config_ptr_t> &operations) {
|
||||
for (auto &op : operations) {
|
||||
if (!op->add_op_to_graph(graph_handle)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
class ggml_qnn_op_config_base : public ggml_qnn_op_config {
|
||||
public:
|
||||
|
|
@ -27,13 +46,18 @@ public:
|
|||
bool add_tensor_param(const std::string &name, const qnn_dimension_array_t &dimensions, int rank,
|
||||
const uint8_t *data, const Qnn_DataType_t data_type, QNNBackend device,
|
||||
Qnn_GraphHandle_t graph_handle);
|
||||
|
||||
    void set_input_tensors(qnn::qnn_tensor_array_t &tensor_inputs) override;
    void set_input_tensors(qnn::qnn_tensor_array_t &&tensor_inputs) override;
    void set_output_tensors(qnn::qnn_tensor_array_t &tensor_outputs) override;
    void set_output_tensors(qnn::qnn_tensor_array_t &&tensor_outputs) override;
|
||||
bool add_op_to_graph(Qnn_GraphHandle_t graph_handle) override;
|
||||
bool bind_input_tensors(const ggml_tensor_array_t &tensor_inputs) override;
|
||||
bool bind_output_tensors(const ggml_tensor_array_t &tensor_outputs) override;
|
||||
void unbind_input_tensors() override;
|
||||
void unbind_output_tensors() override;
|
||||
std::vector<Qnn_Tensor_t> &get_qnn_input_tensors() override { return _qnn_tensor_inputs; }
|
||||
std::vector<Qnn_Tensor_t> &get_qnn_output_tensors() override { return _qnn_tensor_outputs; }
|
||||
const qnn_tensor_array_t &get_input_tensors() override { return _tensor_inputs; }
|
||||
const qnn_tensor_array_t &get_output_tensors() override { return _tensor_outputs; }
|
||||
|
||||
protected:
|
||||
Qnn_OpConfig_t get_op_config();
|
||||
|
|
@ -60,24 +84,9 @@ public:
|
|||
const std::string &op_type, std::shared_ptr<qnn_instance> qnn_instance)
|
||||
: ggml_qnn_op_config_base(name, package_name, op_type, qnn_instance) {}
|
||||
|
||||
explicit ggml_qnn_single_op_config(const std::string &name, const std::string &package_name,
|
||||
const std::string &op_type, const std::string ¶m_name,
|
||||
const Qnn_DataType_t param_type, const size_t param_size,
|
||||
std::shared_ptr<qnn_instance> qnn_instance)
|
||||
: ggml_qnn_op_config_base(name, package_name, op_type, qnn_instance),
|
||||
_param_name(param_name),
|
||||
_param_type(param_type),
|
||||
_param_buffer(param_size) {}
|
||||
|
||||
bool initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle,
|
||||
const ggml_tensor_array_t &tensor_inputs,
|
||||
const ggml_tensor_array_t &tensor_outputs) override;
|
||||
bool initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle) override;
|
||||
|
||||
private:
|
||||
const std::string _param_name;
|
||||
const Qnn_DataType_t _param_type = QNN_DATATYPE_UINT_32;
|
||||
std::vector<uint8_t> _param_buffer;
|
||||
|
||||
DISABLE_COPY(ggml_qnn_single_op_config);
|
||||
DISABLE_MOVE(ggml_qnn_single_op_config);
|
||||
};
|
||||
|
|
@ -88,26 +97,21 @@ public:
|
|||
: _name(name), _qnn_instance(qnn_instance) {}
|
||||
|
||||
~ggml_qnn_aggregate_op_config() {
|
||||
_qnn_tensor_inputs.clear();
|
||||
_qnn_tensor_outputs.clear();
|
||||
_tensor_inputs.clear();
|
||||
_tensor_outputs.clear();
|
||||
_operations.clear();
|
||||
}
|
||||
|
||||
    void set_input_tensors(qnn::qnn_tensor_array_t &tensor_inputs) override;
    void set_input_tensors(qnn::qnn_tensor_array_t &&tensor_inputs) override;
    void set_output_tensors(qnn::qnn_tensor_array_t &tensor_outputs) override;
    void set_output_tensors(qnn::qnn_tensor_array_t &&tensor_outputs) override;
|
||||
bool add_op_to_graph(Qnn_GraphHandle_t graph_handle) override {
|
||||
for (auto &op : _operations) {
|
||||
if (!op->add_op_to_graph(graph_handle)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
return qnn::add_op_to_graph(graph_handle, _operations);
|
||||
}
|
||||
|
||||
bool bind_input_tensors(const ggml_tensor_array_t &tensor_inputs) override;
|
||||
|
||||
bool bind_output_tensors(const ggml_tensor_array_t &tensor_outputs) override;
|
||||
|
||||
void unbind_input_tensors() override {
|
||||
for (auto &tensor : _tensor_inputs) {
|
||||
tensor->unbind();
|
||||
|
|
@ -120,8 +124,8 @@ public:
|
|||
}
|
||||
}
|
||||
|
||||
std::vector<Qnn_Tensor_t> &get_qnn_input_tensors() override { return _qnn_tensor_inputs; }
|
||||
std::vector<Qnn_Tensor_t> &get_qnn_output_tensors() override { return _qnn_tensor_outputs; }
|
||||
const qnn_tensor_array_t &get_input_tensors() override { return _tensor_inputs; }
|
||||
const qnn_tensor_array_t &get_output_tensors() override { return _tensor_outputs; }
|
||||
|
||||
protected:
|
||||
std::string _name;
|
||||
|
|
@ -130,8 +134,6 @@ protected:
|
|||
std::vector<qnn_op_config_ptr_t> _operations;
|
||||
qnn_tensor_array_t _tensor_inputs;
|
||||
qnn_tensor_array_t _tensor_outputs;
|
||||
std::vector<Qnn_Tensor_t> _qnn_tensor_inputs;
|
||||
std::vector<Qnn_Tensor_t> _qnn_tensor_outputs;
|
||||
|
||||
private:
|
||||
DISABLE_COPY(ggml_qnn_aggregate_op_config);
|
||||
|
|
@ -143,9 +145,7 @@ public:
|
|||
ggml_qnn_matmul_op_config(const std::string &name, std::shared_ptr<qnn_instance> qnn_instance)
|
||||
: ggml_qnn_aggregate_op_config(name, qnn_instance) {}
|
||||
|
||||
bool initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle,
|
||||
const ggml_tensor_array_t &tensor_inputs,
|
||||
const ggml_tensor_array_t &tensor_outputs) override;
|
||||
bool initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle) override;
|
||||
|
||||
private:
|
||||
qnn_tensor_ptr_t create_gather_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, const int rank,
|
||||
|
|
|
|||
|
|
@ -12,7 +12,9 @@ namespace qnn {
|
|||
//
|
||||
// helper data type / data structure / macros / functions of
|
||||
// Qualcomm QNN(Qualcomm Neural Network, aka Qualcomm AI Engine Direct) SDK
|
||||
// ref:https://github.com/pytorch/executorch/tree/main/backends/qualcomm
|
||||
// ref:
|
||||
// https://github.com/pytorch/executorch/blob/ae3d558d5e6aa04fc52a3065399fe6a773702f52/backends/qualcomm/serialization/qc_schema.py#L53
|
||||
// https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/overview.html#supported-snapdragon-devices
|
||||
// =================================================================================================
|
||||
enum sdk_profile_level { profile_off = 0, profile_basic, profile_detail };
|
||||
|
||||
|
|
@ -22,14 +24,18 @@ enum qcom_htp_arch {
|
|||
V69 = 69,
|
||||
V73 = 73,
|
||||
V75 = 75,
|
||||
V79 = 79, // SD 8 Gen 4 (SM8750)
|
||||
};
|
||||
|
||||
enum qcom_chipset {
|
||||
UNKNOWN_SM = 0,
|
||||
SM8450 = 36, // v69
|
||||
SM8475 = 42, // v69
|
||||
SM8550 = 43, // v73
|
||||
SM8650 = 57, // v75
|
||||
SM8450 = 36, // v69, SD 8 Gen 1
|
||||
SM8475 = 42, // v69, SD 8+ Gen 1
|
||||
SM8550 = 43, // v73, SD 8 Gen 2
|
||||
SSG2115P = 46, // v73
|
||||
SM8650 = 57, // v75, SD 8 Gen 3
|
||||
SA8295 = 39, // v68
|
||||
SM8750 = 69, // v79, SD 8 Gen 4
|
||||
};
|
||||
|
||||
struct qcom_socinfo {
|
||||
|
|
|
|||
|
|
@ -20,9 +20,9 @@ namespace qnn {
|
|||
|
||||
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS should be 4");
|
||||
|
||||
class ggml_qnn_tensor {
|
||||
class ggml_qnn_tensor : public std::enable_shared_from_this<ggml_qnn_tensor> {
|
||||
public:
|
||||
typedef enum _tensor_type { INPUT, OUTPUT, INTERMEDIATE, PARAMETER } tensor_type_t;
|
||||
typedef enum _tensor_type { INPUT, OUTPUT, INTERMEDIATE, PARAMETER, BIDIRECTION } tensor_type_t;
|
||||
|
||||
explicit ggml_qnn_tensor(tensor_type_t tensor_type, const std::string &name,
|
||||
const qnn_dimension_array_t &dimensions, Qnn_DataType_t data_type, int rank,
|
||||
|
|
@ -49,18 +49,27 @@ public:
|
|||
qnn_datatype_from_ggml_datatype(data_type), rank, device, graph_handle, qnn_instance) {}
|
||||
|
||||
~ggml_qnn_tensor() {
|
||||
_buffer_storage.clear();
|
||||
unbind();
|
||||
_rpc_buffer.reset();
|
||||
unbind();
|
||||
}
|
||||
|
||||
bool set_data_buffer(std::vector<uint8_t> &&buffer) {
|
||||
if (!bind_buffer_impl(buffer.data(), buffer.size())) {
|
||||
return false;
|
||||
bool set_data_buffer(const uint8_t *buffer, const size_t buffer_size) {
|
||||
auto qnn_buffer = std::make_shared<qnn_mem_buffer>(buffer, buffer_size);
|
||||
if (bind_buffer_impl(qnn_buffer)) {
|
||||
return true;
|
||||
}
|
||||
|
||||
_buffer_storage = std::move(buffer);
|
||||
return true;
|
||||
can_unbind = false;
|
||||
return false;
|
||||
}
|
||||
|
||||
bool set_data_buffer(qnn_buffer_ptr buffer) {
|
||||
if (bind_buffer_impl(buffer)) {
|
||||
return true;
|
||||
}
|
||||
|
||||
can_unbind = false;
|
||||
return false;
|
||||
}
|
||||
|
||||
bool alloc_qnn_tensor_id() {
|
||||
|
|
@ -83,23 +92,32 @@ public:
|
|||
return true;
|
||||
}
|
||||
|
||||
bool bind_buffer(uint8_t *buffer, const size_t buffer_size) {
|
||||
if (!_buffer_storage.empty()) {
|
||||
bool bind_ggml_tensor(ggml_tensor *tensor) {
|
||||
if (!can_unbind) {
|
||||
QNN_LOG_DEBUG("[%s]already has buffer storage, skip bind", _tensor_name.c_str());
|
||||
return true;
|
||||
}
|
||||
|
||||
return bind_buffer_impl(buffer, buffer_size);
|
||||
}
|
||||
#ifndef NDEBUG
|
||||
if (tensor->view_src) {
|
||||
auto *src = tensor->view_src;
|
||||
QNN_LOG_DEBUG("[%s]tensor(%s_%dx%dx%dx%d) is a view, src: %s_%dx%dx%dx%d", get_backend_name(_device),
|
||||
tensor->name, tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], src->name,
|
||||
src->ne[0], src->ne[1], src->ne[2], src->ne[3]);
|
||||
}
|
||||
#endif
|
||||
|
||||
bool bind_ggml_tensor(ggml_tensor *tensor) {
|
||||
if (!bind_buffer(reinterpret_cast<uint8_t *>(tensor->data), ggml_nbytes(tensor))) {
|
||||
auto buffer =
|
||||
std::make_shared<qnn_mem_buffer_slice>(reinterpret_cast<uint8_t *>(tensor->data), ggml_nbytes(tensor));
|
||||
if (!bind_buffer_impl(buffer)) {
|
||||
QNN_LOG_WARN("[%s]failed to bind ggml tensor(%s)", _tensor_name.c_str(), ggml_get_name(tensor));
|
||||
return false;
|
||||
}
|
||||
|
||||
QNN_LOG_DEBUG("[%s][%s]bind to ggml tensor(%s)", get_backend_name(_device), _tensor_name.c_str(),
|
||||
ggml_get_name(tensor));
|
||||
tensor->extra = this;
|
||||
_ggml_tensor = tensor;
|
||||
return true;
|
||||
}
|
||||
|
||||
|
|
@ -110,7 +128,7 @@ public:
|
|||
}
|
||||
|
||||
if (!_buffer) {
|
||||
QNN_LOG_DEBUG("[%s]bound to ggml tensor", _tensor_name.c_str());
|
||||
QNN_LOG_DEBUG("[%s]unbind to ggml tensor", _tensor_name.c_str());
|
||||
return true;
|
||||
}
|
||||
|
||||
|
|
@ -119,7 +137,7 @@ public:
|
|||
return false;
|
||||
}
|
||||
|
||||
if (!_buffer_storage.empty()) {
|
||||
if (!can_unbind) {
|
||||
QNN_LOG_DEBUG("[%s]already has buffer storage, stop unbind", _tensor_name.c_str());
|
||||
return true;
|
||||
}
|
||||
|
|
@ -132,26 +150,32 @@ public:
|
|||
}
|
||||
|
||||
QNN_LOG_DEBUG("[%s][%s]unbind from buffer: %p, size: %d", get_backend_name(_device), _tensor_name.c_str(),
|
||||
_buffer, (int)_buffer_size);
|
||||
_buffer = nullptr;
|
||||
_buffer_size = 0;
|
||||
_buffer.get(), (int)_buffer->get_size());
|
||||
_buffer.reset();
|
||||
|
||||
if (_ggml_tensor) {
|
||||
_ggml_tensor->extra = nullptr;
|
||||
_ggml_tensor = nullptr;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
const Qnn_Tensor_t &get_qnn_tensor() const { return _qnn_tensor; }
|
||||
Qnn_DataType_t get_data_type() const { return QNN_TENSOR_GET_DATA_TYPE(_qnn_tensor); }
|
||||
const qnn_dimension_array_t &get_dimensions() const { return _dimensions; }
|
||||
uint32_t get_rank() const { return QNN_TENSOR_GET_RANK(_qnn_tensor); }
|
||||
uint32_t get_qnn_tensor_id() const { return QNN_TENSOR_GET_ID(_qnn_tensor); }
|
||||
|
||||
private:
|
||||
bool bind_buffer_impl(uint8_t *buffer, const size_t buffer_size) {
|
||||
bool bind_buffer_impl(qnn_buffer_ptr buffer) {
|
||||
if (_buffer) {
|
||||
if (_buffer != buffer) {
|
||||
QNN_LOG_WARN("[%s]has been bound to another buffer %p", _tensor_name.c_str(), _buffer);
|
||||
QNN_LOG_WARN("[%s]has been bound to another buffer %p", _tensor_name.c_str(), _buffer.get());
|
||||
return false;
|
||||
}
|
||||
|
||||
QNN_LOG_DEBUG("[%s]already bound to same ggml tensor %p", _tensor_name.c_str(), _buffer);
|
||||
QNN_LOG_DEBUG("[%s]already bound to same ggml tensor %p", _tensor_name.c_str(), _buffer.get());
|
||||
return true;
|
||||
}
|
||||
|
||||
|
|
@ -164,7 +188,7 @@ private:
|
|||
if (should_use_mem_handle()) {
|
||||
if (!_rpc_buffer) {
|
||||
auto rpc_buffer = std::make_shared<qnn_rpc_buffer>(
|
||||
_qnn_instance, buffer_size, QNN_TENSOR_GET_RANK(_qnn_tensor),
|
||||
_qnn_instance, buffer->get_size(), QNN_TENSOR_GET_RANK(_qnn_tensor),
|
||||
QNN_TENSOR_GET_DIMENSIONS(_qnn_tensor), QNN_TENSOR_GET_DATA_TYPE(_qnn_tensor));
|
||||
if (!rpc_buffer->is_valid()) {
|
||||
QNN_LOG_WARN("[%s][%s]alloc rpc mem failed", get_backend_name(_device), _tensor_name.c_str());
|
||||
|
|
@@ -187,22 +211,21 @@ private:
                                      QNN_TENSOR_GET_MEM_HANDLE(_qnn_tensor));
        } else {
            QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_RAW);
-           Qnn_ClientBuffer_t client_buf = {buffer, (uint32_t)buffer_size};
+           Qnn_ClientBuffer_t client_buf = {buffer->get_buffer(), (uint32_t)buffer->get_size()};
            QNN_TENSOR_SET_CLIENT_BUF(_qnn_tensor, client_buf);
            QNN_LOG_DEBUG("[%s]use client buffer %p size %d", _tensor_name.c_str(), client_buf.data,
                          (int)client_buf.dataSize);
        }

        _buffer = buffer;
-       _buffer_size = buffer_size;

        if (!write_to_qnn_tensor()) {
            QNN_LOG_WARN("[%s]write to qnn tensor failed", _tensor_name.c_str());
            return false;
        }

-       QNN_LOG_DEBUG("[%s][%s]bind to buffer: %p, size: %d", get_backend_name(_device), _tensor_name.c_str(), buffer,
-                     (int)buffer_size);
+       QNN_LOG_DEBUG("[%s][%s]bind to buffer: %p, size: %d", get_backend_name(_device), _tensor_name.c_str(),
+                     buffer.get(), (int)buffer->get_size());
        return true;
    }
@@ -214,7 +237,7 @@ private:
        }

        if (_rpc_buffer) {
-           memcpy(_rpc_buffer->get_buffer(), _buffer, _buffer_size);
+           memcpy(_rpc_buffer->get_buffer(), _buffer->get_buffer(), _buffer->get_size());
        }

        // For CPU and GPU, the data is already in the tensor.
@@ -230,7 +253,7 @@ private:
        }

        if (_rpc_buffer) {
-           memcpy(_buffer, _rpc_buffer->get_buffer(), _buffer_size);
+           memcpy(_buffer->get_buffer(), _rpc_buffer->get_buffer(), _buffer->get_size());
        }

        // For CPU and GPU, the data is already in the tensor.
@@ -258,6 +281,9 @@ private:
            case PARAMETER:
                new_tensor_type = QNN_TENSOR_TYPE_STATIC;
                break;
+           case BIDIRECTION:
+               new_tensor_type = QNN_TENSOR_TYPE_APP_READWRITE;
+               break;
            case INTERMEDIATE:
            default:
                new_tensor_type = QNN_TENSOR_TYPE_NATIVE;
@@ -273,15 +299,15 @@ private:
    }

    std::string _tensor_name;
-   uint8_t *_buffer = nullptr;
-   size_t _buffer_size = 0;
-   std::vector<uint8_t> _buffer_storage;
+   qnn_buffer_ptr _buffer;
+   bool can_unbind = true;
    QNNBackend _device;
    std::shared_ptr<qnn_instance> _qnn_instance;
    Qnn_Tensor_t _qnn_tensor = qnn_tensor_init(kDefaultQnnTensorVersion);
    qnn_dimension_array_t _dimensions = {};
    Qnn_GraphHandle_t _graph_handle = nullptr;
    qnn_buffer_ptr _rpc_buffer;
+   ggml_tensor *_ggml_tensor = nullptr;

    DISABLE_COPY(ggml_qnn_tensor);
    DISABLE_MOVE(ggml_qnn_tensor);
@@ -289,5 +315,92 @@ private:

using qnn_tensor_ptr_t = std::shared_ptr<ggml_qnn_tensor>;
using qnn_tensor_array_t = std::vector<qnn_tensor_ptr_t>;
+using ggml_tensor_array_t = std::vector<ggml_tensor *>;
+
+inline qnn_tensor_ptr_t get_qnn_tensor_ptr(ggml_tensor *ggml_tensor) {
+    return ggml_tensor->extra ? reinterpret_cast<ggml_qnn_tensor *>(ggml_tensor->extra)->shared_from_this()
+                              : qnn_tensor_ptr_t();
+}
+
+inline int get_ggml_tensors_max_rank(const qnn::ggml_tensor_array_t &tensors) {
+    int max_rank = 0;
+    for (auto tensor : tensors) {
+        max_rank = std::max(max_rank, ggml_n_dims(tensor));
+    }
+
+    return max_rank;
+}
+
+inline bool bind_tensors(const ggml_tensor_array_t &ggml_tensors, qnn_tensor_array_t &tensor_wrappers,
+                         std::vector<Qnn_Tensor_t> &qnn_tensors) {
+    GGML_ASSERT(tensor_wrappers.size() == ggml_tensors.size());
+    qnn_tensors.resize(ggml_tensors.size());
+    for (size_t i = 0; i < ggml_tensors.size(); i++) {
+        auto *ggml_tensor = ggml_tensors[i];
+        if (!tensor_wrappers[i]->bind_ggml_tensor(ggml_tensor)) {
+            QNN_LOG_ERROR("bind tensor %s failed", ggml_get_name(ggml_tensor));
+            return false;
+        }
+
+        qnn_tensors[i] = tensor_wrappers[i]->get_qnn_tensor();
+    }
+
+    return true;
+}
+
+inline bool bind_tensors(const ggml_tensor_array_t &ggml_tensors, qnn_tensor_array_t &tensor_wrappers) {
+    GGML_ASSERT(tensor_wrappers.size() == ggml_tensors.size());
+    for (size_t i = 0; i < ggml_tensors.size(); i++) {
+        auto *ggml_tensor = ggml_tensors[i];
+        if (!tensor_wrappers[i]->bind_ggml_tensor(ggml_tensor)) {
+            QNN_LOG_ERROR("bind tensor %s failed", ggml_get_name(ggml_tensor));
+            return false;
+        }
+    }
+
+    return true;
+}
+
+inline void unbind_tensors(qnn_tensor_array_t &tensor_wrappers) {
+    for (auto &tensor : tensor_wrappers) {
+        tensor->unbind();
+    }
+}
+
+struct tensor_create_common_params {
+    const char *name_prefix;
+    int tensor_rank;
+    bool is_input;
+    QNNBackend device;
+    Qnn_GraphHandle_t graph_handle;
+    std::shared_ptr<qnn::qnn_instance> qnn_instance;
+};
+
+inline void create_tensors_from_ggml_tensor(const tensor_create_common_params &params,
+                                            const ggml_tensor_array_t &ggml_tensors,
+                                            qnn_tensor_array_t *tensor_wrappers,
+                                            std::vector<Qnn_Tensor_t> *qnn_tensors) {
+    if (qnn_tensors) {
+        qnn_tensors->resize(ggml_tensors.size());
+    }
+
+    if (!tensor_wrappers->empty()) {
+        QNN_LOG_DEBUG("tensor_wrappers is not empty, skip create tensors");
+        GGML_ASSERT(tensor_wrappers->size() == ggml_tensors.size());
+        return;
+    }
+
+    tensor_wrappers->resize(ggml_tensors.size());
+
+    char buffer[GGML_MAX_NAME] = {};
+    auto tensor_type = params.is_input ? ggml_qnn_tensor::INPUT : ggml_qnn_tensor::OUTPUT;
+    for (size_t i = 0; i < ggml_tensors.size(); i++) {
+        snprintf(buffer, GGML_MAX_NAME, "%s%d", params.name_prefix, (int)i);
+        auto *ggml_tensor = ggml_tensors[i];
+        (*tensor_wrappers)[i] = std::make_shared<ggml_qnn_tensor>(tensor_type, std::string(buffer), ggml_tensor->ne,
+                                                                  ggml_tensor->type, params.tensor_rank, params.device,
+                                                                  params.graph_handle, params.qnn_instance);
+    }
+}

} // namespace qnn
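For context, a minimal sketch (not part of this diff) of how these helpers could be composed by the graph-building code. The function name bind_and_run_inputs_example and the "input_" prefix are hypothetical; the device, graph handle, and qnn_instance are assumed to come from the surrounding backend context, and the actual graph execution step is elided.

// Illustrative sketch, not the actual graph implementation.
#include <memory>
#include <vector>

#include "tensor.hpp"

static bool bind_and_run_inputs_example(const qnn::ggml_tensor_array_t &inputs, QNNBackend device,
                                        Qnn_GraphHandle_t graph_handle,
                                        std::shared_ptr<qnn::qnn_instance> instance,
                                        qnn::qnn_tensor_array_t &wrappers, std::vector<Qnn_Tensor_t> &qnn_tensors) {
    qnn::tensor_create_common_params params = {
        "input_",                                // name_prefix (hypothetical prefix)
        qnn::get_ggml_tensors_max_rank(inputs),  // tensor_rank
        true,                                    // is_input
        device,
        graph_handle,
        instance,
    };

    // Creates the wrappers only once; a cached graph keeps its existing wrappers.
    qnn::create_tensors_from_ggml_tensor(params, inputs, &wrappers, &qnn_tensors);

    // Attach the ggml buffers and refresh the Qnn_Tensor_t copies handed to QNN.
    if (!qnn::bind_tensors(inputs, wrappers, qnn_tensors)) {
        return false;
    }

    // ... execute the finalized QNN graph here ...

    qnn::unbind_tensors(wrappers);
    return true;
}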
@@ -188,13 +188,15 @@ const char *get_backend_name(QNNBackend device_index) {
const char *get_chipset_desc(uint32_t chipset_id) {
    switch (chipset_id) {
        case SM8450:
-           return "SM8450";
+           return "SD 8 Gen 1 (SM8450)";
        case SM8475:
-           return "SM8475";
+           return "SD 8+ Gen 1 (SM8475)";
        case SM8550:
-           return "SM8550";
+           return "SD 8 Gen 2 (SM8550)";
        case SM8650:
-           return "SM8650";
+           return "SD 8 Gen 3 (SM8650)";
+       case SM8750:
+           return "SD 8 Gen 4 (SM8750)";
        default:
            return "unknown";
    }
@@ -210,6 +212,8 @@ const char *get_htparch_desc(size_t htp_arch) {
            return "QCOM_HTP_V73";
        case V75:
            return "QCOM_HTP_V75";
+       case V79:
+           return "QCOM_HTP_V79";
        default:
            return "unknown";
    }