bugfix: error pre-allocated tensor (k_cache_view-0) (#12)
* fix device binding at ggml_backend_qnn_buffer_type
* merge ggml_backend_qnn_buffer_context and qnn_mem_buffer
* wip
* add log
* wip
* add qnn_buffer_ptr
* remove trailing `\n` at log
* add log
* enable GGML_OP_NONE
* wip
* wip
* disable tensor with view
* wip
* wip
* more log for view tensor
* re-enable view
* wip
* remove link android lib
* set dimension at bind function
* move graph traversal to backend-ops
* wip
* add get_view_internal_dimension to obtain the tensor view source dimension
* use _view_source_dimensions to allocate qnn tensor
* add placeholder function ggml_backend_qnn_cpy_tensor_async
* add ggml_qnn_aggregate_op_config
* make matmul based on ggml_qnn_aggregate_op_config
* wip
* manually specify the order of op destruct
* skip register qnn-cpu backend
* disable view op again
* remove _view_source_dimensions
* add nop for reshape and view ops
* add log
* add comment
This commit is contained in:
parent 0d02ee09ed
commit e36ad89528

@@ -2,8 +2,7 @@ message(STATUS "Using QNN backend")
 if(CMAKE_SYSTEM_NAME STREQUAL "Android")
     find_library(LOG_LIB log)
-    find_library(ANDROID_LIB android)
-    set(QNN_LINK_LIBRARIES ${LOG_LIB} ${ANDROID_LIB})
+    set(QNN_LINK_LIBRARIES ${LOG_LIB})
     set(QNN_DEFAULT_LIB_SEARCH_PATH "/data/local/tmp/" CACHE STRING "customized library search path for QNN backend")
 else()
     message(FATAL_ERROR "QNN now only available on Android")

@@ -3,6 +3,8 @@
 #include <memory>
 
+#include "ggml-impl.h"
+
 #include "graph.hpp"
 #include "logger.hpp"
 #include "op-config.hpp"
 
@@ -15,13 +17,13 @@ namespace {
 bool qnn_is_valid_params(ggml_backend_qnn_device_context *ctx, const ggml_tensor *src, ggml_tensor *dst) {
     if (!ctx || !src || !dst) {
-        QNN_LOG_WARN("invalid params\n");
+        QNN_LOG_WARN("invalid params");
         return false;
     }
 
     auto instance = ctx->instance;
     if (!instance) {
-        QNN_LOG_WARN("invalid instance\n");
+        QNN_LOG_WARN("invalid instance");
         return false;
     }
 
@@ -31,13 +33,13 @@ bool qnn_is_valid_params(ggml_backend_qnn_device_context *ctx, const ggml_tensor
 bool qnn_is_valid_params(ggml_backend_qnn_device_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1,
                          ggml_tensor *dst) {
     if (!ctx || !src0 || !src1 || !dst) {
-        QNN_LOG_WARN("invalid params\n");
+        QNN_LOG_WARN("invalid params");
         return false;
     }
 
     auto instance = ctx->instance;
     if (!instance) {
-        QNN_LOG_WARN("invalid instance\n");
+        QNN_LOG_WARN("invalid instance");
         return false;
     }
 
@@ -45,7 +47,7 @@ bool qnn_is_valid_params(ggml_backend_qnn_device_context *ctx, const ggml_tensor
 }
 
 void print_ggml_tensor(const ggml_tensor *tensor) {
-    QNN_LOG_DEBUG("%s: type:%s ne: %ldx%ldx%ldx%ld, nb: %ldx%ldx%ldx%ld\n", tensor->name, ggml_type_name(tensor->type),
+    QNN_LOG_DEBUG("%s: type:%s ne: %ldx%ldx%ldx%ld, nb: %ldx%ldx%ldx%ld", tensor->name, ggml_type_name(tensor->type),
                   (long)tensor->ne[0], (long)tensor->ne[1], (long)tensor->ne[2], (long)tensor->ne[3],
                   (long)tensor->nb[0], (long)tensor->nb[1], (long)tensor->nb[2], (long)tensor->nb[3]);
 }
 
@@ -96,7 +98,7 @@ template <size_t _InputSize>
 bool execute_graph(qnn::ggml_qnn_graph *graph, const std::array<ggml_tensor *, _InputSize> &inputs,
                    ggml_tensor *output) {
     if (!graph->execute(to_ggml_tensor_array<_InputSize>(inputs), to_ggml_tensor_array<1>({output}))) {
-        QNN_LOG_WARN("execute failed\n");
+        QNN_LOG_WARN("execute failed");
         return false;
     }
 
@@ -248,7 +250,7 @@ qnn::ggml_qnn_graph *get_qnn_graph_from_cache(ggml_backend_qnn_device_context *c
     auto it = graph_cache.find(graph_key);
     qnn::ggml_qnn_graph *graph_ptr = nullptr;
     if (it != graph_cache.end()) {
-        QNN_LOG_DEBUG("found graph %s in cache\n", graph_key.c_str());
+        QNN_LOG_DEBUG("[%s]found graph %s in cache", qnn::get_backend_name(ctx->device), graph_key.c_str());
         graph_ptr = it->second.get();
     } else {
         auto graph =
@@ -260,7 +262,7 @@ qnn::ggml_qnn_graph *get_qnn_graph_from_cache(ggml_backend_qnn_device_context *c
         auto op_constructor = qnn::create_op_constructor(kGgmlOpToQnnOp[op]);
         if (!graph->build_graph(op_constructor, to_ggml_tensor_array<_InputSize>(inputs),
                                 to_ggml_tensor_array<1>({output}))) {
-            QNN_LOG_ERROR("build_graph failed\n");
+            QNN_LOG_ERROR("[%s]build_graph failed", qnn::get_backend_name(ctx->device));
             return nullptr;
         }
 
@@ -332,7 +334,7 @@ bool qnn_binary_nop_impl(ggml_backend_qnn_device_context *ctx, ggml_tensor *src0
 }
 
 constexpr const ggml_qnn_unary_op_t kQnnUnaryOpsTable[] = {
-    nullptr, // GGML_OP_NONE
+    qnn_unary_nop_impl, // GGML_OP_NONE
     nullptr, // GGML_OP_DUP
     nullptr, // GGML_OP_ADD
     nullptr, // GGML_OP_ADD1
@@ -363,37 +365,37 @@ constexpr const ggml_qnn_unary_op_t kQnnUnaryOpsTable[] = {
     nullptr, // GGML_OP_MUL_MAT_ID
     nullptr, // GGML_OP_OUT_PROD
 
-    nullptr, // GGML_OP_SCALE
-    nullptr, // GGML_OP_SET
-    nullptr, // GGML_OP_CPY
-    nullptr, // GGML_OP_CONT
-    nullptr, // GGML_OP_RESHAPE
-    qnn_unary_nop_impl, // GGML_OP_VIEW
-    qnn_unary_op_impl<GGML_OP_PERMUTE>, // GGML_OP_PERMUTE
-    nullptr, // GGML_OP_TRANSPOSE
-    qnn_unary_nop_impl, // GGML_OP_GET_ROWS
-    nullptr, // GGML_OP_GET_ROWS_BACK
-    nullptr, // GGML_OP_DIAG
-    nullptr, // GGML_OP_DIAG_MASK_INF
-    nullptr, // GGML_OP_DIAG_MASK_ZERO
-    nullptr, // GGML_OP_SOFT_MAX
-    nullptr, // GGML_OP_SOFT_MAX_BACK
-    nullptr, // GGML_OP_ROPE
-    nullptr, // GGML_OP_ROPE_BACK
-    nullptr, // GGML_OP_CLAMP
-    nullptr, // GGML_OP_CONV_TRANSPOSE_1D
-    nullptr, // GGML_OP_IM2COL
-    nullptr, // GGML_OP_IM2COL_BACK
-    nullptr, // GGML_OP_CONV_TRANSPOSE_2D
-    nullptr, // GGML_OP_POOL_1D
-    nullptr, // GGML_OP_POOL_2D
-    nullptr, // GGML_OP_POOL_2D_BACK
-    nullptr, // GGML_OP_UPSCALE
-    nullptr, // GGML_OP_PAD
-    nullptr, // GGML_OP_ARANGE
-    nullptr, // GGML_OP_TIMESTEP_EMBEDDING
-    nullptr, // GGML_OP_ARGSORT
-    nullptr, // GGML_OP_LEAKY_RELU
+    nullptr, // GGML_OP_SCALE
+    nullptr, // GGML_OP_SET
+    nullptr, // GGML_OP_CPY
+    nullptr, // GGML_OP_CONT
+    qnn_unary_nop_impl, // GGML_OP_RESHAPE
+    qnn_unary_nop_impl, // GGML_OP_VIEW
+    qnn_unary_nop_impl, // GGML_OP_PERMUTE
+    qnn_unary_nop_impl, // GGML_OP_TRANSPOSE
+    qnn_unary_nop_impl, // GGML_OP_GET_ROWS
+    nullptr, // GGML_OP_GET_ROWS_BACK
+    nullptr, // GGML_OP_DIAG
+    nullptr, // GGML_OP_DIAG_MASK_INF
+    nullptr, // GGML_OP_DIAG_MASK_ZERO
+    nullptr, // GGML_OP_SOFT_MAX
+    nullptr, // GGML_OP_SOFT_MAX_BACK
+    nullptr, // GGML_OP_ROPE
+    nullptr, // GGML_OP_ROPE_BACK
+    nullptr, // GGML_OP_CLAMP
+    nullptr, // GGML_OP_CONV_TRANSPOSE_1D
+    nullptr, // GGML_OP_IM2COL
+    nullptr, // GGML_OP_IM2COL_BACK
+    nullptr, // GGML_OP_CONV_TRANSPOSE_2D
+    nullptr, // GGML_OP_POOL_1D
+    nullptr, // GGML_OP_POOL_2D
+    nullptr, // GGML_OP_POOL_2D_BACK
+    nullptr, // GGML_OP_UPSCALE
+    nullptr, // GGML_OP_PAD
+    nullptr, // GGML_OP_ARANGE
+    nullptr, // GGML_OP_TIMESTEP_EMBEDDING
+    nullptr, // GGML_OP_ARGSORT
+    nullptr, // GGML_OP_LEAKY_RELU
 
     nullptr, // GGML_OP_FLASH_ATTN_EXT
     nullptr, // GGML_OP_FLASH_ATTN_BACK
@@ -442,7 +444,7 @@ constexpr const ggml_qnn_unary_op_t kQnnUnaryOpsTable[] = {
 static_assert(sizeof(kQnnUnaryOpsTable) / sizeof(kQnnUnaryOpsTable[0]) == (GGML_OP_COUNT + GGML_UNARY_OP_COUNT),
               "GGML_OP_COUNT does not match the size of the kQnnUnaryOpsTable table");
 
-static constexpr const ggml_qnn_binary_op_t kQnnBinaryOpsTable[] = {
+constexpr const ggml_qnn_binary_op_t kQnnBinaryOpsTable[] = {
     nullptr, // GGML_OP_NONE
     nullptr, // GGML_OP_DUP
     qnn_binary_op_impl<GGML_OP_ADD>, // GGML_OP_ADD
@@ -543,22 +545,28 @@ bool ggml_qnn_supports_tensor(ggml_backend_qnn_device_context *ctx, const ggml_t
         return false;
     }
 
-#ifndef NDEBUG
-    auto *type_name = ggml_get_type_traits(tensor->type)->type_name;
-#endif
+    if (tensor->view_src) {
+        auto *src_tensor = tensor->view_src;
+        QNN_LOG_DEBUG("[%s]tensor(%s_%dx%dx%dx%d) is a view, src: %s_%dx%dx%dx%d", qnn::get_backend_name(ctx->device),
+                      ggml_get_name(tensor), tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3],
+                      ggml_get_name(src_tensor), src_tensor->ne[0], src_tensor->ne[1], src_tensor->ne[2],
+                      src_tensor->ne[3]);
+    }
+
     switch (tensor->type) {
     case GGML_TYPE_F32:
     case GGML_TYPE_F16:
    case GGML_TYPE_Q8_0:
    case GGML_TYPE_Q4_0:
        if (!(ctx->supported_types & (1 << tensor->type))) {
-            QNN_LOG_DEBUG("unsupported data type %s for backend %s, supported_types: 0x%x", type_name,
-                          qnn::get_backend_name(ctx->device), ctx->supported_types);
+            QNN_LOG_DEBUG("[%s]unsupported data type %s, supported_types: 0x%x", qnn::get_backend_name(ctx->device),
+                          ggml_type_name(tensor->type), ctx->supported_types);
            return false;
        }
        break;
    default:
-        QNN_LOG_DEBUG("unsupported data type %s", type_name);
+        QNN_LOG_DEBUG("[%s]unsupported data type %s", qnn::get_backend_name(ctx->device),
+                      ggml_type_name(tensor->type));
        return false;
    }
 
@@ -566,6 +574,7 @@ bool ggml_qnn_supports_tensor(ggml_backend_qnn_device_context *ctx, const ggml_t
 }
 
 bool ggml_qnn_supports_matmul_op(ggml_backend_qnn_device_context *ctx, const ggml_tensor *op) {
+    constexpr const size_t kMaxNpuTensorSize = 8192L * 2048 + 8192 * 512 + 2048 * 512;
     constexpr const auto get_tensor_size = [](const ggml_tensor *tensor) -> size_t {
         return tensor->ne[0] * tensor->ne[1] * tensor->ne[2] * tensor->ne[3];
     };
 
@@ -582,8 +591,7 @@ bool ggml_qnn_supports_matmul_op(ggml_backend_qnn_device_context *ctx, const ggm
             QNN_LOG_DEBUG("[qnn-npu] src0 and src1 dimensions are not equal, support/unsupported: %d/%d",
                           ctx->support_op_count.load(), ++(ctx->unsupported_op_count));
             return false;
-        } else if (get_tensor_size(src0) + get_tensor_size(src1) + get_tensor_size(op) >=
-                   (8192 * 2048 + 8192 * 512 + 2048 * 512)) {
+        } else if (get_tensor_size(src0) + get_tensor_size(src1) + get_tensor_size(op) >= kMaxNpuTensorSize) {
             QNN_LOG_DEBUG("[qnn-npu] tensor size is too large, support/unsupported: %d/%d",
                           ctx->support_op_count.load(), ++(ctx->unsupported_op_count));
             return false;
 
@@ -618,12 +626,13 @@ bool ggml_qnn_supports_matmul_op(ggml_backend_qnn_device_context *ctx, const ggm
 
 namespace qnn {
 
-bool ggml_qnn_supports_op(ggml_backend_qnn_device_context *ctx, const ggml_tensor *op) {
+bool device_supports_op(ggml_backend_qnn_device_context *ctx, const ggml_tensor *op) {
+    // Note that this function could be called before the device context is initialized
     if (op->op == GGML_OP_NONE) {
         return true;
     }
 
+    auto *src0 = op->src[0];
     if (op->op == GGML_OP_UNARY) {
         const auto unary_op = ggml_get_unary_op(op);
         if (unary_op == GGML_UNARY_OP_GELU && ctx->device == QNN_BACKEND_NPU) {
 
@@ -637,7 +646,7 @@ bool device_supports_op(ggml_backend_qnn_device_context *ctx, const ggml_tensor
             return false;
         }
 
-        if (!op->src[0]) {
+        if (!ggml_qnn_supports_tensor(ctx, src0) || !ggml_qnn_supports_tensor(ctx, op)) {
             QNN_LOG_DEBUG("src0 is nullptr");
             return false;
         }
 
@@ -647,7 +656,6 @@ bool device_supports_op(ggml_backend_qnn_device_context *ctx, const ggml_tensor
             return false;
         }
 
-        auto *src0 = op->src[0];
         auto *src1 = op->src[1];
         if (!ggml_qnn_supports_tensor(ctx, src0) || !ggml_qnn_supports_tensor(ctx, op) ||
             (kQnnBinaryOpsTable[op->op] && !ggml_qnn_supports_tensor(ctx, src1))) {
 
@@ -674,24 +682,35 @@ bool device_supports_op(ggml_backend_qnn_device_context *ctx, const ggml_tensor
     return true;
 }
 
-bool ggml_qnn_forward(ggml_backend_qnn_device_context *ctx, struct ggml_tensor *tensor) {
-    size_t unary_op_idx = tensor->op;
-    if (tensor->op == GGML_OP_UNARY) {
-        unary_op_idx = kGgmlUnaryOpStart + ggml_get_unary_op(tensor);
+bool device_compute_graph(ggml_backend_qnn_device_context *ctx, ggml_cgraph *cgraph) {
+    QNN_LOG_DEBUG("[%s]compute graph, nodes count: %d", qnn::get_backend_name(ctx->device), (int)cgraph->n_nodes);
+    for (int i = 0; i < cgraph->n_nodes; i++) {
+        ggml_tensor *tensor = cgraph->nodes[i];
+        if (ggml_is_empty(tensor)) {
+            continue;
+        }
+
+        size_t unary_op_idx = tensor->op;
+        if (tensor->op == GGML_OP_UNARY) {
+            unary_op_idx = kGgmlUnaryOpStart + ggml_get_unary_op(tensor);
+        }
+
+        bool ok = false;
+        auto unary_op = kQnnUnaryOpsTable[unary_op_idx];
+        auto binary_op = kQnnBinaryOpsTable[tensor->op];
+        if (unary_op) {
+            ok = unary_op(ctx, tensor->src[0], tensor);
+        } else if (binary_op) {
+            ok = binary_op(ctx, tensor->src[0], tensor->src[1], tensor);
+        }
+
+        if (!ok) {
+            QNN_LOG_WARN("[%s]unsupported op %s", qnn::get_backend_name(ctx->device), ggml_op_desc(tensor));
+            return false;
+        }
     }
 
-    auto unary_op = kQnnUnaryOpsTable[unary_op_idx];
-    if (unary_op) {
-        return unary_op(ctx, tensor->src[0], tensor);
-    }
-
-    auto binary_op = kQnnBinaryOpsTable[tensor->op];
-    if (binary_op) {
-        return binary_op(ctx, tensor->src[0], tensor->src[1], tensor);
-    }
-
-    QNN_LOG_WARN("[forward]unsupported op %s", ggml_op_desc(tensor));
-    return false;
+    return true;
 }
 
 } // namespace qnn

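The two tables above give O(1) op dispatch: each GGML op id indexes a handler, a nullptr entry marks an unsupported op, and the static_assert pins the table length to GGML_OP_COUNT + GGML_UNARY_OP_COUNT so a new op cannot be added silently. A self-contained sketch of the same pattern (all names here are illustrative, not from the backend):

#include <array>

enum sample_op { SAMPLE_OP_NONE, SAMPLE_OP_ADD, SAMPLE_OP_COUNT };

using sample_op_fn = bool (*)(int src, int &dst);

bool nop_impl(int src, int &dst) { dst = src; return true; }     // pass-through, like qnn_unary_nop_impl
bool add_impl(int src, int &dst) { dst = src + 1; return true; }

// One entry per op id; a nullptr entry would mark an unsupported op.
constexpr std::array<sample_op_fn, SAMPLE_OP_COUNT> kOpsTable = {nop_impl, add_impl};
static_assert(kOpsTable.size() == SAMPLE_OP_COUNT, "table must cover every op");

bool compute(sample_op op, int src, int &dst) {
    auto fn = kOpsTable[op];   // O(1) lookup instead of a switch
    return fn && fn(src, dst); // nullptr entry means "not supported"
}
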
@@ -6,7 +6,7 @@
 
 namespace qnn {
 
-bool ggml_qnn_supports_op(ggml_backend_qnn_device_context *ctx, const ggml_tensor *op);
-bool ggml_qnn_forward(ggml_backend_qnn_device_context *ctx, struct ggml_tensor *tensor);
+bool device_supports_op(ggml_backend_qnn_device_context *ctx, const ggml_tensor *op);
+bool device_compute_graph(ggml_backend_qnn_device_context *ctx, ggml_cgraph *cgraph);
 
 } // namespace qnn

@@ -1,28 +1,42 @@
 #pragma once
 
+#include <cstdint>
 #include <memory>
 
 #include "logger.hpp"
 #include "qnn-lib.hpp"
 
 namespace qnn {
-class ggml_qnn_rpc_buffer {
-public:
-    ggml_qnn_rpc_buffer(std::shared_ptr<qnn_instance> qnn_instance, const size_t size, const uint32_t rank,
-                        uint32_t *dimensions, Qnn_DataType_t data_type) :
-        _qnn_instance(qnn_instance), _size(size) {
-
-        _qnn_rpc_buffer = static_cast<uint8_t *>(qnn_instance->alloc_rpcmem(size, alignof(void *)));
+
+class qnn_buffer_interface {
+public:
+    virtual ~qnn_buffer_interface() = default;
+
+    virtual bool is_valid() const = 0;
+    virtual uint8_t *get_buffer() = 0;
+    virtual size_t get_size() const = 0;
+    virtual Qnn_MemHandle_t get_mem_handle() const = 0;
+};
+
+using qnn_buffer_ptr = std::shared_ptr<qnn_buffer_interface>;
+
+class qnn_rpc_buffer : public qnn_buffer_interface {
+public:
+    qnn_rpc_buffer(std::shared_ptr<qnn_instance> qnn_instance, const size_t size, const uint32_t rank,
+                   uint32_t *dimensions, Qnn_DataType_t data_type)
+        : _size(size), _qnn_instance(qnn_instance) {
+
+        _qnn_rpc_buffer = static_cast<uint8_t *>(qnn_instance->alloc_rpcmem(size, alignof(uint8_t *)));
         _qnn_rpc_mem_handle = qnn_instance->register_rpcmem(_qnn_rpc_buffer, rank, dimensions, data_type);
         if (!_qnn_rpc_buffer || !_qnn_rpc_mem_handle) {
-            QNN_LOG_WARN("register rpc mem failure\n");
+            QNN_LOG_WARN("register rpc mem failure");
             // let the destructor free the buffer
             return;
         }
 
-        QNN_LOG_DEBUG("alloc rpcmem(%p) successfully, size %d\n", _qnn_rpc_buffer, (int)size);
+        QNN_LOG_DEBUG("alloc rpcmem(%p) successfully, size %d", _qnn_rpc_buffer, (int)size);
     }
-    ~ggml_qnn_rpc_buffer() {
+
+    ~qnn_rpc_buffer() {
         if (_qnn_instance) {
             if (_qnn_rpc_mem_handle) {
                 _qnn_instance->unregister_rpcmem(_qnn_rpc_mem_handle);
@@ -34,22 +48,58 @@ public:
            }
        }
    }
 
-    bool is_valid() const { return _qnn_rpc_buffer && _qnn_rpc_mem_handle; }
+    bool is_valid() const override { return _qnn_rpc_buffer && _qnn_rpc_mem_handle; }
 
-    uint8_t *get_buffer() const { return _qnn_rpc_buffer; }
-    size_t get_size() const { return _size; }
-    Qnn_MemHandle_t get_mem_handle() const { return _qnn_rpc_mem_handle; }
+    uint8_t *get_buffer() override { return _qnn_rpc_buffer; }
+    size_t get_size() const override { return _size; }
+    Qnn_MemHandle_t get_mem_handle() const override { return _qnn_rpc_mem_handle; }
 
 private:
-    std::shared_ptr<qnn_instance> _qnn_instance;
     size_t _size = 0;
     uint8_t *_qnn_rpc_buffer = nullptr;
     Qnn_MemHandle_t _qnn_rpc_mem_handle = nullptr;
+    std::shared_ptr<qnn_instance> _qnn_instance;
 
-    ggml_qnn_rpc_buffer(const ggml_qnn_rpc_buffer &) = delete;
-    void operator=(const ggml_qnn_rpc_buffer &) = delete;
-    ggml_qnn_rpc_buffer(ggml_qnn_rpc_buffer &&) = delete;
-    void operator=(ggml_qnn_rpc_buffer &&) = delete;
+    DISABLE_COPY(qnn_rpc_buffer);
+    DISABLE_MOVE(qnn_rpc_buffer);
+};
+
+class qnn_mem_buffer : public qnn_buffer_interface {
+public:
+    explicit qnn_mem_buffer(const uint8_t *data, size_t size) {
+        _buffer = reinterpret_cast<uint8_t *>(qnn::page_align_alloc(size));
+
+        if (!_buffer) {
+            QNN_LOG_WARN("failed to allocate %.2f MiB", float(size / (1 << 20)));
+            return;
+        }
+
+        _size = size;
+
+        if (data) {
+            memcpy(_buffer, data, size);
+        }
+    }
+
+    explicit qnn_mem_buffer(size_t size) : qnn_mem_buffer(nullptr, size) {}
+
+    ~qnn_mem_buffer() {
+        // the free will do nothing if the _buffer is nullptr
+        qnn::align_free(_buffer);
+    }
+
+    bool is_valid() const override { return _buffer != nullptr; }
+
+    uint8_t *get_buffer() override { return _buffer; }
+    size_t get_size() const override { return _size; }
+    Qnn_MemHandle_t get_mem_handle() const override { return nullptr; }
+
+private:
+    size_t _size = 0;
+    uint8_t *_buffer = nullptr;
+
+    DISABLE_COPY(qnn_mem_buffer);
+    DISABLE_MOVE(qnn_mem_buffer);
+};
+
 } // namespace qnn

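With ggml_backend_qnn_buffer_context merged away, every backend buffer is now a qnn_buffer_interface, so host allocations (qnn_mem_buffer) and rpc allocations (qnn_rpc_buffer) share one polymorphic surface. A minimal usage sketch under that assumption (the 4096 size is arbitrary; needs <cstring> for memset):

// Sketch: allocate a plain host buffer through the shared interface.
qnn::qnn_buffer_ptr buf = std::make_shared<qnn::qnn_mem_buffer>(4096);
if (buf->is_valid()) {
    memset(buf->get_buffer(), 0, buf->get_size());
    Qnn_MemHandle_t handle = buf->get_mem_handle(); // nullptr for host memory,
    (void)handle;                                   // a real handle for qnn_rpc_buffer
}
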
@@ -1,7 +1,5 @@
 #include "ggml-qnn.h"
 
-#include <unistd.h>
-
 #include <cassert>
 #include <chrono>
 #include <condition_variable>
 
@@ -87,78 +85,44 @@ static_assert(sizeof(kDeviceCaps) / sizeof(kDeviceCaps[0]) == GGML_QNN_MAX_DEVIC
               "The number of qnn devices should be equal to GGML_QNN_MAX_DEVICES");
 static_assert(kDeviceCaps[QNN_BACKEND_NPU].type == GGML_BACKEND_DEVICE_TYPE_ACCEL,
               "The NPU device should be an accelerator device");
 static_assert(kDeviceCaps[QNN_BACKEND_GPU].type == GGML_BACKEND_DEVICE_TYPE_GPU,
               "The NPU device should be an accelerator device");
-
-class ggml_backend_qnn_buffer_context {
-public:
-    ggml_backend_qnn_buffer_context(QNNBackend device, std::shared_ptr<qnn::qnn_instance> instance, size_t size)
-        : _instance(instance), _name(QNN_BACKEND_NAME + std::to_string(device)) {
-        // TODO: fix this for other platforms
-        size_t size_page = sysconf(_SC_PAGESIZE);
-
-        // TODO: for qnn npu, a better way here is to reuse the buffer allocated by
-        //   qnn rpc, will save an extra copy
-        _buffer = qnn::align_alloc(size_page, size);
-
-        if (!_buffer) {
-            QNN_LOG_WARN("failed to allocate %.2f MiB\n", float(size / (1 << 20)));
-            return;
-        }
-
-        _buffer_size = size;
-    }
-
-    ~ggml_backend_qnn_buffer_context() {
-        // the free will do nothing if the _buffer is nullptr
-        qnn::align_free(_buffer);
-    }
-
-    bool is_valid() const { return _buffer != nullptr; }
-
-    void *get_buffer() { return _buffer; }
-    size_t get_buffer_size() { return _buffer_size; }
-
-private:
-    std::shared_ptr<qnn::qnn_instance> _instance;
-    std::string _name;
-    void *_buffer = nullptr;
-    size_t _buffer_size = 0;
-};
-
-struct ggml_backend_qnn_buffer_type_context {
-    std::string name;
-};
+static_assert(kDeviceCaps[QNN_BACKEND_CPU].type == GGML_BACKEND_DEVICE_TYPE_CPU,
+              "The NPU device should be an accelerator device");
 
 ggml_backend_qnn_device_context *get_device_context(ggml_backend_dev_t dev) {
     return reinterpret_cast<ggml_backend_qnn_device_context *>(dev->context);
 }
 
+qnn::qnn_buffer_interface *get_buffer_context(ggml_backend_buffer_t buffer) {
+    return reinterpret_cast<qnn::qnn_buffer_interface *>(buffer->context);
+}
+
 /*
  * -----------------------------------------------------------------------------------------------
  * qnn backend buffer object
  * -----------------------------------------------------------------------------------------------
  */
 void ggml_backend_qnn_buffer_free_buffer(ggml_backend_buffer_t buffer) {
-    ggml_backend_qnn_buffer_context *ctx = (ggml_backend_qnn_buffer_context *)buffer->context;
-
+    auto *ctx = get_buffer_context(buffer);
     delete ctx;
 }
 
 void *ggml_backend_qnn_buffer_get_base(ggml_backend_buffer_t buffer) {
-    ggml_backend_qnn_buffer_context *ctx = (ggml_backend_qnn_buffer_context *)buffer->context;
-
+    auto *ctx = get_buffer_context(buffer);
     return ctx->get_buffer();
 }
 
 void ggml_backend_qnn_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor *tensor) {
-    // Do nothing here, the qnn tensor will be create along with the graph.
     GGML_UNUSED(buffer);
     GGML_UNUSED(tensor);
+    // TODO: we should create the qnn tensor along with the ggml tensor
 }
 
 void ggml_backend_qnn_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor *tensor, const void *data,
                                         size_t offset, size_t size) {
     GGML_UNUSED(buffer);
 
     memcpy((char *)tensor->data + offset, data, size);
 }
 
@@ -168,8 +132,7 @@ void ggml_backend_qnn_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml
     memcpy(data, (const char *)tensor->data + offset, size);
 }
 
-bool ggml_backend_qnn_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor *src,
-                                        struct ggml_tensor *dst) {
+bool ggml_backend_qnn_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor *src, ggml_tensor *dst) {
     GGML_UNUSED(buffer);
     if (ggml_backend_buffer_is_host(src->buffer)) {
         memcpy(dst->data, src->data, ggml_nbytes(src));
 
@@ -180,12 +143,11 @@ bool ggml_backend_qnn_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml
 }
 
 void ggml_backend_qnn_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
-    ggml_backend_qnn_buffer_context *ctx = (ggml_backend_qnn_buffer_context *)buffer->context;
-
-    memset(ctx->get_buffer(), value, ctx->get_buffer_size());
+    auto *ctx = get_buffer_context(buffer);
+    memset(ctx->get_buffer(), value, ctx->get_size());
 }
 
-ggml_backend_buffer_i ggml_backend_qnn_buffer_interface = {
+constexpr const ggml_backend_buffer_i ggml_backend_qnn_buffer_interface = {
     /* .free_buffer = */ ggml_backend_qnn_buffer_free_buffer,
     /* .get_base = */ ggml_backend_qnn_buffer_get_base,
     /* .init_tensor = */ ggml_backend_qnn_buffer_init_tensor,
 
@@ -208,13 +170,13 @@ const char *ggml_backend_qnn_buffer_type_name(ggml_backend_buffer_type_t buft) {
 }
 
 ggml_backend_buffer_t ggml_backend_qnn_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
-    auto *dev_ctx = get_device_context(buft->device);
-    ggml_backend_qnn_buffer_context *ctx =
-        new ggml_backend_qnn_buffer_context((QNNBackend)dev_ctx->device, dev_ctx->instance, size);
+    qnn::qnn_buffer_interface *ctx = new qnn::qnn_mem_buffer(size);
     if (!ctx->is_valid()) {
         return nullptr;
     }
 
+    QNN_LOG_DEBUG("[%s]alloc buffer: %p, size: %ld", qnn::get_backend_name(get_device_context(buft->device)->device),
+                  ctx->get_buffer(), size);
     return ggml_backend_buffer_init(buft, ggml_backend_qnn_buffer_interface, ctx, size);
 }
 
@@ -227,7 +189,7 @@ size_t ggml_backend_qnn_buffer_type_get_alignment(ggml_backend_buffer_type_t buf
 size_t ggml_backend_qnn_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) {
     GGML_UNUSED(buft);
     // TODO: get the max size from device
-    return 1024 * 1024 * 1024;
+    return 1024L * 1024 * 1024;
 }
 
 bool ggml_backend_qnn_buffer_is_host(ggml_backend_buffer_type_t buft) {
 
@@ -254,61 +216,52 @@ void ggml_backend_qnn_free(ggml_backend_t backend) {
    }
 }
 
+bool ggml_backend_qnn_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const ggml_tensor *src,
+                                       ggml_tensor *dst) {
+    GGML_UNUSED(backend_src);
+    GGML_UNUSED(backend_dst);
+    GGML_UNUSED(src);
+    GGML_UNUSED(dst);
+    return false;
+}
+
 ggml_backend_buffer_type_t ggml_backend_qnn_buffer_type(ggml_backend_dev_t dev) {
-    static ggml_backend_qnn_buffer_type_context ggml_backend_qnn_buffer_type_contexts[GGML_QNN_MAX_DEVICES];
     static ggml_backend_buffer_type ggml_backend_qnn_buffer_types[GGML_QNN_MAX_DEVICES];
-    static bool ggml_backend_qnn_buffer_type_initialized = false;
     auto *dev_ctx = get_device_context(dev);
-    if (!ggml_backend_qnn_buffer_type_initialized) {
-        for (size_t i = 0; i < GGML_QNN_MAX_DEVICES; i++) {
-            auto &context = ggml_backend_qnn_buffer_type_contexts[i];
-            context = {std::string(QNN_BACKEND_NAME) + std::to_string(i)};
-            ggml_backend_qnn_buffer_types[i] = {
-                /* .iface = */ {
-                    /* .get_name = */ ggml_backend_qnn_buffer_type_name,
-                    /* .alloc_buffer = */
-                    ggml_backend_qnn_buffer_type_alloc_buffer,
-                    /* .get_alignment = */
-                    ggml_backend_qnn_buffer_type_get_alignment,
-                    /* .get_max_size = */
-                    ggml_backend_qnn_buffer_type_get_max_size,
-                    /* .get_alloc_size = */ nullptr, // defaults to ggml_nbytes
-                    /* .is_host = */ ggml_backend_qnn_buffer_is_host,
-                },
-                /* .device */ dev,
-                /* .context = */ &context,
-            };
-        }
-        ggml_backend_qnn_buffer_type_initialized = true;
+    if (!ggml_backend_qnn_buffer_types[dev_ctx->device].device) {
+        ggml_backend_qnn_buffer_types[dev_ctx->device] = {
+            /* .iface = */ {
+                /* .get_name = */ ggml_backend_qnn_buffer_type_name,
+                /* .alloc_buffer = */
+                ggml_backend_qnn_buffer_type_alloc_buffer,
+                /* .get_alignment = */
+                ggml_backend_qnn_buffer_type_get_alignment,
+                /* .get_max_size = */
+                ggml_backend_qnn_buffer_type_get_max_size,
+                /* .get_alloc_size = */ nullptr, // defaults to ggml_nbytes
+                /* .is_host = */ ggml_backend_qnn_buffer_is_host,
+            },
+            /* .device */ dev,
+            /* .context = */ nullptr,
+        };
+    } else {
+        GGML_ASSERT(ggml_backend_qnn_buffer_types[dev_ctx->device].device == dev);
     }
 
     return &ggml_backend_qnn_buffer_types[dev_ctx->device];
 }
 
 ggml_status ggml_backend_qnn_graph_compute(ggml_backend_t backend, ggml_cgraph *cgraph) {
-    enum ggml_status result = GGML_STATUS_SUCCESS;
-    auto *device_ctx = get_device_context(backend->device);
-    for (int i = 0; i < cgraph->n_nodes; i++) {
-        ggml_tensor *node = cgraph->nodes[i];
-        if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE ||
-            node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
-            continue;
-        }
-        bool ok = qnn::ggml_qnn_forward(device_ctx, node);
-        if (!ok) {
-            QNN_LOG_DEBUG("error: op not supported %s (%s)\n", node->name, ggml_op_name(node->op));
-        }
-    }
-
-    return result;
+    return qnn::device_compute_graph(get_device_context(backend->device), cgraph) ? GGML_STATUS_SUCCESS
+                                                                                  : GGML_STATUS_FAILED;
 }
 
-ggml_backend_i ggml_backend_qnn_interface = {
+constexpr const ggml_backend_i ggml_backend_qnn_interface = {
     /* .get_name = */ ggml_backend_qnn_name,
     /* .free = */ ggml_backend_qnn_free,
     /* .set_tensor_async = */ nullptr,
     /* .get_tensor_async = */ nullptr,
-    /* .cpy_tensor_async = */ nullptr,
+    /* .cpy_tensor_async = */ ggml_backend_qnn_cpy_tensor_async,
     /* .synchronize = */ nullptr,
     /* .graph_plan_create = */ nullptr,
     /* .graph_plan_free = */ nullptr,
 
@@ -345,7 +298,7 @@ enum ggml_backend_dev_type ggml_backend_qnn_device_get_type(ggml_backend_dev_t d
     return kDeviceCaps[get_device_context(dev)->device].type;
 }
 
-void ggml_backend_qnn_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props *props) {
+void ggml_backend_qnn_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props *props) {
     props->name = ggml_backend_qnn_device_get_name(dev);
     props->description = ggml_backend_qnn_device_get_description(dev);
     props->type = ggml_backend_qnn_device_get_type(dev);
 
@@ -364,6 +317,8 @@ ggml_guid_t ggml_backend_qnn_guid() {
     return &guid;
 }
 
+bool ggml_backend_is_qnn(ggml_backend_t backend) { return ggml_guid_matches(backend->guid, ggml_backend_qnn_guid()); }
+
 ggml_backend_t ggml_backend_qnn_init_with_device_context(ggml_backend_dev_t dev, const char *extend_lib_search_path) {
     if (!extend_lib_search_path) {
         extend_lib_search_path = GGML_QNN_DEFAULT_LIB_SEARCH_PATH;
 
@@ -401,9 +356,9 @@ ggml_backend_t ggml_backend_qnn_init_with_device_context(ggml_backend_dev_t dev,
        }
    } else {
        if (setenv("LD_LIBRARY_PATH", path.c_str(), 1) == 0) {
-            QNN_LOG_DEBUG("%s backend setenv successfully\n", qnn::get_backend_name(device));
+            QNN_LOG_DEBUG("%s backend setenv successfully", qnn::get_backend_name(device));
        } else {
-            QNN_LOG_ERROR("%s backend setenv failure\n", qnn::get_backend_name(device));
+            QNN_LOG_ERROR("%s backend setenv failure", qnn::get_backend_name(device));
        }
    }
 #endif
 
@@ -411,12 +366,12 @@ ggml_backend_t ggml_backend_qnn_init_with_device_context(ggml_backend_dev_t dev,
     auto instance = std::make_shared<qnn::qnn_instance>(path, dev_ctx->lib_name, "ggml");
     auto result = instance->qnn_init(nullptr);
     if (result != 0) {
-        QNN_LOG_WARN("init qnn subsystem failed with qnn backend %s, pls check why\n", qnn::get_backend_name(device));
+        QNN_LOG_WARN("init qnn subsystem failed with qnn backend %s, pls check why", qnn::get_backend_name(device));
         return nullptr;
     }
     auto qnn_interface = instance->get_qnn_interface();
     if (!qnn_interface) {
-        QNN_LOG_WARN("qnn subsystem failure\n");
+        QNN_LOG_WARN("qnn subsystem failure");
         return nullptr;
     }
 
@@ -453,10 +408,10 @@ ggml_backend_buffer_t ggml_backend_qnn_device_buffer_from_ptr(ggml_backend_dev_t
     return ggml_backend_cpu_buffer_from_ptr(ptr, size);
 }
 
-bool ggml_backend_qnn_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor *op) {
+bool ggml_backend_qnn_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor *op) {
     // Note that this function could be called before the device context is initialized
     auto *device_ctx = get_device_context(dev);
-    return qnn::ggml_qnn_supports_op(device_ctx, op);
+    return qnn::device_supports_op(device_ctx, op);
 }
 
 bool ggml_backend_qnn_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
 
@@ -464,7 +419,13 @@ bool ggml_backend_qnn_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_
     return ggml_backend_buft_is_host(buft);
 }
 
-const struct ggml_backend_device_i ggml_backend_qnn_device_interface = {
+bool ggml_backend_qnn_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor *op) {
+    auto *device_ctx = get_device_context(dev);
+    QNN_LOG_DEBUG("[%s][%s]offload op", qnn::get_backend_name(device_ctx->device), ggml_op_name(op->op));
+    return false;
+}
+
+constexpr const ggml_backend_device_i ggml_backend_qnn_device_interface = {
     /* .get_name = */ ggml_backend_qnn_device_get_name,
     /* .get_description = */ ggml_backend_qnn_device_get_description,
     /* .get_memory = */ ggml_backend_qnn_device_get_memory,
 
@@ -476,7 +437,7 @@ constexpr const ggml_backend_device_i ggml_backend_qnn_device_interface = {
     /* .buffer_from_host_ptr = */ ggml_backend_qnn_device_buffer_from_ptr,
     /* .supports_op = */ ggml_backend_qnn_device_supports_op,
     /* .supports_buft = */ ggml_backend_qnn_device_supports_buft,
-    /* .offload_op = */ nullptr,
+    /* .offload_op = */ ggml_backend_qnn_device_offload_op,
     /* .event_new = */ nullptr,
     /* .event_free = */ nullptr,
     /* .event_synchronize = */ nullptr,
 
@@ -489,27 +450,36 @@ constexpr const ggml_backend_device_i ggml_backend_qnn_device_interface = {
  */
 
 struct ggml_backend_qnn_reg_impl : ggml_backend_reg {
-    std::array<std::unique_ptr<ggml_backend_qnn_device_context>, GGML_QNN_MAX_DEVICES> device_contexts;
-    std::array<ggml_backend_device, GGML_QNN_MAX_DEVICES> devices;
+    std::vector<std::unique_ptr<ggml_backend_qnn_device_context>> device_contexts;
+    std::vector<ggml_backend_device> devices;
 
     explicit ggml_backend_qnn_reg_impl(ggml_backend_reg_i interface) {
         context = this;
         iface = interface;
 
         QNN_LOG_DEBUG("qnn backend registry init");
-        for (int i = 0; i < GGML_QNN_MAX_DEVICES; i++) {
-            const auto device_enum = (QNNBackend)(GGML_QNN_MAX_DEVICES - 1 - i); // init from the last device, i.e. NPU
-            device_contexts[i] = std::make_unique<ggml_backend_qnn_device_context>(
+        for (size_t i = 0; i < QNN_BACKEND_COUNT; i++) {
+            const auto device_enum = (QNNBackend)(QNN_BACKEND_COUNT - 1 - i); // init from the last device, i.e. NPU
+            if (device_enum == QNN_BACKEND_CPU) {
+                /*
+                 * here we skip the initialization of CPU device,
+                 *   cause it'll block unsupported ops fallback to ggml cpu backend
+                 */
+                continue;
+            }
+
+            device_contexts.emplace_back(std::make_unique<ggml_backend_qnn_device_context>(
                 /* .device = */ device_enum, // init from the last device, i.e. NPU
                 /* .threads = */ 1,
                 /* .name = */ qnn::get_backend_name(device_enum),
                 /* .lib_name = */ kDeviceCaps[device_enum].lib_name,
-                /* .supported_types = */ kDeviceCaps[device_enum].supported_types);
+                /* .supported_types = */ kDeviceCaps[device_enum].supported_types));
 
-            auto &device = devices[i];
-            device.iface = ggml_backend_qnn_device_interface;
-            device.reg = this;
-            device.context = device_contexts[i].get();
+            devices.emplace_back(ggml_backend_device{
+                /* iface = */ ggml_backend_qnn_device_interface,
+                /* reg = */ this,
+                /* context = */ device_contexts.back().get(),
+            });
        }
    }
 };

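The buffer-type change above swaps the one-shot `initialized` flag for lazy per-device slots keyed by the device handle, which is what fixes the device binding called out in the commit message: each buffer type stays bound to the device that first requested it. The same pattern in miniature (all names hypothetical):

#include <array>
#include <cassert>

struct device;                                   // stand-in for ggml_backend_dev_t
struct buffer_type { device *dev = nullptr; };

constexpr size_t kMaxDevices = 4;

buffer_type *get_buffer_type(device *dev, size_t device_index) {
    static std::array<buffer_type, kMaxDevices> types{}; // zero-initialized: dev == nullptr
    auto &entry = types[device_index];
    if (!entry.dev) {
        entry.dev = dev;              // first use binds the slot to this device
    } else {
        assert(entry.dev == dev);     // later calls must come from the same device
    }
    return &entry;
}
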
@@ -64,12 +64,12 @@ public:
        }
 
        if (error != QNN_SUCCESS) {
-            QNN_LOG_ERROR("[%s][%s]failed to create qnn graph, error: %s\n", get_backend_name(device),
-                          graph_name.c_str(), get_qnn_error_string(error));
+            QNN_LOG_ERROR("[%s][%s]failed to create qnn graph, error: %s", get_backend_name(device), graph_name.c_str(),
+                          get_qnn_error_string(error));
            return;
        }
 
-        QNN_LOG_INFO("[%s][%s]create succeed\n", get_backend_name(device), graph_name.c_str());
+        QNN_LOG_INFO("[%s][%s]create succeed", get_backend_name(device), graph_name.c_str());
        _graph_handle = graph_handle;
        _qnn_interface = qnn_interface;
    }
 
@@ -80,7 +80,7 @@ public:
               const ggml_tensor_array_t &tensor_outputs) {
        GGML_ASSERT(op_constructor);
        if (!is_valid()) {
-            QNN_LOG_ERROR("Invalid graph\n");
+            QNN_LOG_ERROR("Invalid graph");
            return false;
        }
 
@@ -92,7 +92,7 @@ public:
        }
 
        if (!_op_config->add_op_to_graph(_graph_handle)) {
-            QNN_LOG_ERROR("[%s]add nodes failed\n", _graph_name.c_str());
+            QNN_LOG_ERROR("[%s]add nodes failed", _graph_name.c_str());
            return false;
        }
 
@@ -109,12 +109,12 @@ public:
 
    bool execute(const ggml_tensor_array_t &tensor_inputs, const ggml_tensor_array_t &tensor_outputs) {
        if (!_op_config->bind_input_tensors(tensor_inputs)) {
-            QNN_LOG_ERROR("[%s][%s]bind input tensors failed\n", get_backend_name(_device), _graph_name.c_str());
+            QNN_LOG_ERROR("[%s][%s]bind input tensors failed", get_backend_name(_device), _graph_name.c_str());
            return false;
        }
 
        if (!_op_config->bind_output_tensors(tensor_outputs)) {
-            QNN_LOG_ERROR("[%s][%s]bind output tensors failed\n", get_backend_name(_device), _graph_name.c_str());
+            QNN_LOG_ERROR("[%s][%s]bind output tensors failed", get_backend_name(_device), _graph_name.c_str());
            return false;
        }
 

@@ -82,7 +82,7 @@ bool bind_tensors(const qnn::ggml_tensor_array_t &ggml_tensors, qnn::qnn_tensor_
     for (size_t i = 0; i < ggml_tensors.size(); i++) {
         auto *ggml_tensor = ggml_tensors[i];
         if (!tensor_wrappers[i]->bind_ggml_tensor(ggml_tensor)) {
-            QNN_LOG_ERROR("bind tensor %s failed\n", ggml_get_name(ggml_tensor));
+            QNN_LOG_ERROR("bind tensor %s failed", ggml_get_name(ggml_tensor));
             return false;
         }
 
@@ -162,12 +162,12 @@ bool ggml_qnn_op_config_base::add_tensor_param(const std::string &name, const qn
 
     GGML_ASSERT(data_size > 0);
     if (!param_tensor->bind_buffer(const_cast<uint8_t *>(data), data_size)) {
-        QNN_LOG_ERROR("parameter tensor bind_buffer failed\n");
+        QNN_LOG_ERROR("parameter tensor bind_buffer failed");
         return false;
     }
 
     if (!param_tensor->alloc_qnn_tensor_id()) {
-        QNN_LOG_ERROR("parameter tensor alloc_qnn_tensor_id failed\n");
+        QNN_LOG_ERROR("parameter tensor alloc_qnn_tensor_id failed");
         return false;
     }
 
@@ -185,26 +185,26 @@ bool ggml_qnn_op_config_base::add_op_to_graph(Qnn_GraphHandle_t graph_handle) {
     GGML_ASSERT(_qnn_tensor_inputs.size() == _tensor_inputs.size());
     GGML_ASSERT(_qnn_tensor_outputs.size() == _tensor_outputs.size());
 
-    QNN_LOG_DEBUG("[%s]add to graph start\n", _name.c_str());
+    QNN_LOG_DEBUG("[%s]add to graph start", _name.c_str());
     for (size_t i = 0; i < _tensor_inputs.size(); i++) {
         auto tensor = _tensor_inputs[i];
         if (!tensor->alloc_qnn_tensor_id()) {
-            QNN_LOG_ERROR("[%s]input tensor alloc_qnn_tensor_id failed\n", _name.c_str());
+            QNN_LOG_ERROR("[%s]input tensor alloc_qnn_tensor_id failed", _name.c_str());
             return false;
         }
 
-        QNN_LOG_DEBUG("[%s]input tensor id: %d\n", _name.c_str(), tensor->get_qnn_tensor_id());
+        QNN_LOG_DEBUG("[%s]input tensor id: %d", _name.c_str(), tensor->get_qnn_tensor_id());
         _qnn_tensor_inputs[i] = tensor->get_qnn_tensor();
     }
 
     for (size_t i = 0; i < _tensor_outputs.size(); i++) {
         auto tensor = _tensor_outputs[i];
         if (!tensor->alloc_qnn_tensor_id()) {
-            QNN_LOG_ERROR("[%s]output tensor alloc_qnn_tensor_id failed\n", _name.c_str());
+            QNN_LOG_ERROR("[%s]output tensor alloc_qnn_tensor_id failed", _name.c_str());
             return false;
         }
 
-        QNN_LOG_DEBUG("[%s]output tensor id: %d\n", _name.c_str(), tensor->get_qnn_tensor_id());
+        QNN_LOG_DEBUG("[%s]output tensor id: %d", _name.c_str(), tensor->get_qnn_tensor_id());
         _qnn_tensor_outputs[i] = tensor->get_qnn_tensor();
     }
 
@@ -215,7 +215,7 @@ bool ggml_qnn_op_config_base::add_op_to_graph(Qnn_GraphHandle_t graph_handle) {
         return false;
     }
 
-    QNN_LOG_DEBUG("[%s]added to graph succeed\n", _name.c_str());
+    QNN_LOG_DEBUG("[%s]added to graph succeed", _name.c_str());
     return true;
 }
 
@@ -280,6 +280,14 @@ bool ggml_qnn_single_op_config::initialize_op_nodes(QNNBackend device, Qnn_Graph
     return true;
 }
 
+bool ggml_qnn_aggregate_op_config::bind_input_tensors(const ggml_tensor_array_t &tensor_inputs) {
+    return bind_tensors(tensor_inputs, _tensor_inputs, _qnn_tensor_inputs);
+}
+
+bool ggml_qnn_aggregate_op_config::bind_output_tensors(const ggml_tensor_array_t &tensor_outputs) {
+    return bind_tensors(tensor_outputs, _tensor_outputs, _qnn_tensor_outputs);
+}
+
 bool ggml_qnn_matmul_op_config::initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle,
                                                     const ggml_tensor_array_t &tensor_inputs,
                                                     const ggml_tensor_array_t &tensor_outputs) {
 
@@ -293,20 +301,21 @@ bool ggml_qnn_matmul_op_config::initialize_op_nodes(QNNBackend device, Qnn_Graph
     create_tensors_from_ggml_tensor(params, tensor_inputs, &_tensor_inputs, &_qnn_tensor_inputs);
 
     // create output tensor
-    qnn_tensor_array_t mat_mul_tensor_outputs;
     params.name_prefix = "dst";
     params.is_input = false;
-    create_tensors_from_ggml_tensor(params, tensor_outputs, &mat_mul_tensor_outputs, nullptr);
+    create_tensors_from_ggml_tensor(params, tensor_outputs, &_tensor_outputs, &_qnn_tensor_outputs);
 
     // create convert nodes
     qnn_tensor_array_t mat_mul_tensor_inputs = _tensor_inputs;
+    qnn_tensor_array_t mat_mul_tensor_outputs = _tensor_outputs;
     if (!create_convert_nodes(device, graph_handle, tensor_rank, mat_mul_tensor_inputs, mat_mul_tensor_outputs)) {
-        QNN_LOG_ERROR("create convert nodes failed\n");
+        QNN_LOG_ERROR("create convert nodes failed");
         return false;
     }
 
-    mat_mul_tensor_inputs.front() = create_gather_nodes(device, graph_handle, tensor_rank, _tensor_inputs.front(),
-                                                        _tensor_inputs.back()->get_dimensions());
+    mat_mul_tensor_inputs.front() =
+        create_gather_nodes(device, graph_handle, tensor_rank, mat_mul_tensor_inputs.front(),
+                            mat_mul_tensor_inputs.back()->get_dimensions());
     return create_mat_mul_nodes(device, graph_handle, tensor_rank, mat_mul_tensor_inputs, mat_mul_tensor_outputs);
 }
 
@@ -365,15 +374,15 @@ qnn_tensor_ptr_t ggml_qnn_matmul_op_config::create_gather_nodes(QNNBackend devic
     qnn_dimension_array_t intermediate_dimensions = input_dimensions;
     intermediate_dimensions[rank - 3] = output_dimensions[rank - 3];
     qnn_tensor_ptr_t gather0_out;
-    _gather0 = create_node(_name + "_gather0", rank, rank - 3, intermediate_dimensions, tensor_input, device,
-                           graph_handle, _qnn_instance, gather0_out);
+    _operations.push_back(create_node(_name + "_gather0", rank, rank - 3, intermediate_dimensions, tensor_input, device,
+                                      graph_handle, _qnn_instance, gather0_out));
     if (rank == 3) {
         return gather0_out;
     }
 
     qnn_tensor_ptr_t gather1_out;
-    _gather1 = create_node(_name + "_gather1", rank, rank - 4, output_dimensions, gather0_out, device, graph_handle,
-                           _qnn_instance, gather1_out);
+    _operations.push_back(create_node(_name + "_gather1", rank, rank - 4, output_dimensions, gather0_out, device,
+                                      graph_handle, _qnn_instance, gather1_out));
     return gather1_out;
 }
 
@@ -387,9 +396,8 @@ bool ggml_qnn_matmul_op_config::create_convert_nodes(QNNBackend device, Qnn_Grap
 
     // create tensors for convert node
     auto tensor_type = get_tensor_type(tensor_inputs);
-    QNN_LOG_DEBUG("input tensor type: %s\n", qnn_datatype_to_string(tensor_type));
+    QNN_LOG_DEBUG("input tensor type: %s", qnn_datatype_to_string(tensor_type));
 
-    _input_converts.resize(tensor_inputs.size());
     for (size_t i = 0; i < tensor_inputs.size(); ++i) {
         // create input convert nodes
         auto convert_in = tensor_inputs[i];
 
@@ -406,7 +414,7 @@ bool ggml_qnn_matmul_op_config::create_convert_nodes(QNNBackend device, Qnn_Grap
         convert->set_input_tensors({convert_in});
         convert->set_output_tensors({convert_out});
         tensor_inputs[i] = convert_out;
-        _input_converts[i] = convert;
+        _operations.push_back(convert);
     }
 
     if (tensor_outputs.front()->get_data_type() != tensor_type) {
 
@@ -421,7 +429,7 @@ bool ggml_qnn_matmul_op_config::create_convert_nodes(QNNBackend device, Qnn_Grap
         output_convert->set_input_tensors({convert_in});
         output_convert->set_output_tensors({convert_out});
         tensor_outputs.front() = convert_in;
-        _output_convert = output_convert;
+        _operations.push_back(output_convert);
     }
 
     return true;
 
@@ -432,7 +440,7 @@ bool ggml_qnn_matmul_op_config::create_mat_mul_nodes(QNNBackend device, Qnn_Grap
                                                      qnn_tensor_array_t &tensor_outputs) {
 
     /*
-     * First, both the ggml and qnn tensor in memory are stored as row-major format. (For more details, please also:
+     * First, both the ggml and qnn tensor in memory are stored as row-major format. (For more details, please refer to:
     * https://pytorch.org/blog/tensor-memory-format-matters/#:~:text=Column%20Major%20Order:%20In%20this%20format,%20the%20matrix)
     * But the dimensions of the tensor are stored in different order.
     * For example, a 2x3 matrix:
 
@@ -515,81 +523,19 @@ bool ggml_qnn_matmul_op_config::create_mat_mul_nodes(QNNBackend device, Qnn_Grap
     transpose_out->set_input_tensors(tensors);
     transpose_out->set_output_tensors(tensor_outputs);
 
-    _mat_mul = mat_mul;
-    _transpose_out = transpose_out;
+    _operations.push_back(mat_mul);
+    _operations.push_back(transpose_out);
     return true;
 }
 
-bool ggml_qnn_matmul_op_config::add_op_to_graph(Qnn_GraphHandle_t graph_handle) {
-    for (auto &convert : _input_converts) {
-        if (convert && !convert->add_op_to_graph(graph_handle)) {
-            return false;
-        }
-    }
-
-    if (_gather0 && !_gather0->add_op_to_graph(graph_handle)) {
-        return false;
-    }
-
-    if (_gather1 && !_gather1->add_op_to_graph(graph_handle)) {
-        return false;
-    }
-
-    return _mat_mul->add_op_to_graph(graph_handle) && _transpose_out->add_op_to_graph(graph_handle) &&
-           (!_output_convert || _output_convert->add_op_to_graph(graph_handle));
-}
-
-bool ggml_qnn_matmul_op_config::bind_input_tensors(const ggml_tensor_array_t &tensor_inputs) {
-    return bind_tensors(tensor_inputs, _tensor_inputs, _qnn_tensor_inputs);
-}
-
-bool ggml_qnn_matmul_op_config::bind_output_tensors(const ggml_tensor_array_t &tensor_outputs) {
-    if (_output_convert) {
-        return _output_convert->bind_output_tensors(tensor_outputs);
-    } else {
-        return _transpose_out->bind_output_tensors(tensor_outputs);
-    }
-}
-
-void ggml_qnn_matmul_op_config::unbind_input_tensors() {
-    _mat_mul->unbind_input_tensors();
-    for (auto &convert : _input_converts) {
-        if (convert) {
-            convert->unbind_input_tensors();
-        }
-    }
-}
-
-void ggml_qnn_matmul_op_config::unbind_output_tensors() {
-    _transpose_out->unbind_output_tensors();
-    if (_output_convert) {
-        _output_convert->unbind_output_tensors();
-    }
-}
-
-std::vector<Qnn_Tensor_t> &ggml_qnn_matmul_op_config::get_qnn_output_tensors() {
-    if (_output_convert) {
-        return _output_convert->get_qnn_output_tensors();
-    } else {
-        return _transpose_out->get_qnn_output_tensors();
-    }
-}
-
 ggml_op_constructor_t create_op_constructor(const std::string &op_name) {
     if (op_name == QNN_OP_MAT_MUL) {
         // For QNN_OP_MAT_MUL, we need to transpose the input tensor
         return [](const std::string &instance_name,
                   std::shared_ptr<qnn::qnn_instance> qnn_instance) -> std::unique_ptr<qnn::ggml_qnn_op_config> {
-            QNN_LOG_DEBUG("create QNN_OP_MAT_MUL, name %s\n", instance_name.c_str());
+            QNN_LOG_DEBUG("create QNN_OP_MAT_MUL, name %s", instance_name.c_str());
             return std::make_unique<qnn::ggml_qnn_matmul_op_config>(instance_name, qnn_instance);
         };
     } else if (op_name == QNN_OP_TRANSPOSE) {
         return [](const std::string &instance_name,
                   std::shared_ptr<qnn::qnn_instance> qnn_instance) -> std::unique_ptr<qnn::ggml_qnn_op_config> {
            return std::make_unique<qnn::ggml_qnn_single_op_config>(
                instance_name, QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_TRANSPOSE, QNN_OP_TRANSPOSE_PARAM_PERM,
                QNN_DATATYPE_UINT_32, 4 * sizeof(uint32_t), qnn_instance);
        };
    }
 
    return [op_name](const std::string &instance_name,

@@ -82,21 +82,70 @@ private:
     DISABLE_MOVE(ggml_qnn_single_op_config);
 };
 
-class ggml_qnn_matmul_op_config : public ggml_qnn_op_config {
+class ggml_qnn_aggregate_op_config : public ggml_qnn_op_config {
+public:
+    explicit ggml_qnn_aggregate_op_config(const std::string &name, std::shared_ptr<qnn_instance> qnn_instance)
+        : _name(name), _qnn_instance(qnn_instance) {}
+
+    ~ggml_qnn_aggregate_op_config() {
+        _qnn_tensor_inputs.clear();
+        _qnn_tensor_outputs.clear();
+        _tensor_inputs.clear();
+        _tensor_outputs.clear();
+        _operations.clear();
+    }
+
+    bool add_op_to_graph(Qnn_GraphHandle_t graph_handle) override {
+        for (auto &op : _operations) {
+            if (!op->add_op_to_graph(graph_handle)) {
+                return false;
+            }
+        }
+        return true;
+    }
+
+    bool bind_input_tensors(const ggml_tensor_array_t &tensor_inputs) override;
+
+    bool bind_output_tensors(const ggml_tensor_array_t &tensor_outputs) override;
+
+    void unbind_input_tensors() override {
+        for (auto &tensor : _tensor_inputs) {
+            tensor->unbind();
+        }
+    }
+
+    void unbind_output_tensors() override {
+        for (auto &tensor : _tensor_outputs) {
+            tensor->unbind();
+        }
+    }
+
+    std::vector<Qnn_Tensor_t> &get_qnn_input_tensors() override { return _qnn_tensor_inputs; }
+    std::vector<Qnn_Tensor_t> &get_qnn_output_tensors() override { return _qnn_tensor_outputs; }
+
+protected:
+    std::string _name;
+    std::shared_ptr<qnn_instance> _qnn_instance;
+
+    std::vector<qnn_op_config_ptr_t> _operations;
+    qnn_tensor_array_t _tensor_inputs;
+    qnn_tensor_array_t _tensor_outputs;
+    std::vector<Qnn_Tensor_t> _qnn_tensor_inputs;
+    std::vector<Qnn_Tensor_t> _qnn_tensor_outputs;
+
+private:
+    DISABLE_COPY(ggml_qnn_aggregate_op_config);
+    DISABLE_MOVE(ggml_qnn_aggregate_op_config);
+};
+
+class ggml_qnn_matmul_op_config : public ggml_qnn_aggregate_op_config {
 public:
     ggml_qnn_matmul_op_config(const std::string &name, std::shared_ptr<qnn_instance> qnn_instance)
-        : _name(name), _qnn_instance(qnn_instance) {}
+        : ggml_qnn_aggregate_op_config(name, qnn_instance) {}
 
     bool initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle,
                              const ggml_tensor_array_t &tensor_inputs,
                              const ggml_tensor_array_t &tensor_outputs) override;
-    bool add_op_to_graph(Qnn_GraphHandle_t graph_handle) override;
-    bool bind_input_tensors(const ggml_tensor_array_t &tensor_inputs) override;
-    bool bind_output_tensors(const ggml_tensor_array_t &tensor_outputs) override;
-    void unbind_input_tensors() override;
-    void unbind_output_tensors() override;
-    std::vector<Qnn_Tensor_t> &get_qnn_input_tensors() override { return _qnn_tensor_inputs; }
-    std::vector<Qnn_Tensor_t> &get_qnn_output_tensors() override;
 
 private:
     qnn_tensor_ptr_t create_gather_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, const int rank,
 
@@ -106,17 +155,6 @@ private:
     bool create_mat_mul_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, const int rank,
                               qnn_tensor_array_t &tensor_inputs, qnn_tensor_array_t &tensor_outputs);
 
-    std::string _name;
-    std::shared_ptr<qnn_instance> _qnn_instance;
-    qnn_op_config_ptr_t _transpose_out;
-    qnn_op_config_ptr_t _mat_mul;
-    qnn_op_config_ptr_t _gather0;
-    qnn_op_config_ptr_t _gather1;
-    std::vector<qnn_op_config_ptr_t> _input_converts;
-    qnn_op_config_ptr_t _output_convert;
-    qnn_tensor_array_t _tensor_inputs;
-    std::vector<Qnn_Tensor_t> _qnn_tensor_inputs;
-
     DISABLE_COPY(ggml_qnn_matmul_op_config);
     DISABLE_MOVE(ggml_qnn_matmul_op_config);
 };

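ggml_qnn_aggregate_op_config collapses the former per-member pointers (_mat_mul, _gather0, _input_converts, ...) into a single _operations vector, and its destructor clears the tensor arrays and operations explicitly so teardown no longer depends on member declaration order ("manually specify the order of op destruct" in the commit message). A stripped-down illustration of that idea (hypothetical types):

#include <memory>
#include <vector>

struct tensor_wrapper {};                        // shared between several ops
struct op { std::shared_ptr<tensor_wrapper> in, out; };

struct aggregate {
    std::vector<std::shared_ptr<tensor_wrapper>> tensors;
    std::vector<std::shared_ptr<op>> operations;

    ~aggregate() {
        // Member destruction normally runs in reverse declaration order;
        // clearing explicitly pins the intended order regardless of layout.
        tensors.clear();
        operations.clear();
    }
};
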
@ -195,21 +195,21 @@ public:
|
|||
|
||||
int qnn_init(const QnnSaver_Config_t **saver_config) {
|
||||
BackendIdType backend_id = QNN_BACKEND_ID_NULL;
|
||||
QNN_LOG_DEBUG("enter qni_init\n");
|
||||
QNN_LOG_DEBUG("enter qnn_init");
|
||||
|
||||
std::lock_guard<std::mutex> lock(_init_mutex);
|
||||
if (load_system() != 0) {
|
||||
QNN_LOG_WARN("can not load QNN system lib, pls check why?\n");
|
||||
QNN_LOG_WARN("can not load QNN system lib, pls check why?");
|
||||
return 1;
|
||||
} else {
|
||||
QNN_LOG_DEBUG("load QNN system lib successfully\n");
|
||||
QNN_LOG_DEBUG("load QNN system lib successfully");
|
||||
}
|
||||
|
||||
std::string backend_lib_path = _lib_path + _backend_name;
|
||||
if (_lib_path_to_backend_id.count(backend_lib_path) == 0) {
|
||||
int is_load_ok = load_backend(backend_lib_path, saver_config);
|
||||
if (is_load_ok != 0) {
|
||||
QNN_LOG_WARN("failed to load QNN backend\n");
|
||||
QNN_LOG_WARN("failed to load QNN backend");
|
||||
return 2;
|
||||
}
|
||||
}
|
||||
|
|
@@ -218,7 +218,7 @@ public:
         if (_loaded_backend.count(backend_id) == 0 || _loaded_lib_handle.count(backend_id) == 0) {
             QNN_LOG_WARN(
                 "library %s is loaded but loaded backend count=%zu, "
-                "loaded lib_handle count=%zu\n",
+                "loaded lib_handle count=%zu",
                 backend_lib_path.c_str(), _loaded_backend.count(backend_id), _loaded_lib_handle.count(backend_id));
             return 3;
         }

@@ -227,28 +227,28 @@ public:
         _qnn_interface->qnn_log_create(qnn::sdk_logcallback, _qnn_log_level, &_qnn_log_handle);
         if (nullptr == _qnn_log_handle) {
             // NPU backend not work on Qualcomm SoC equipped low-end phone
-            QNN_LOG_WARN("why failed to initialize qnn log\n");
+            QNN_LOG_WARN("why failed to initialize qnn log");
             return 4;
         } else {
-            QNN_LOG_DEBUG("initialize qnn log successfully\n");
+            QNN_LOG_DEBUG("initialize qnn log successfully");
         }

         std::vector<const QnnBackend_Config_t *> temp_backend_config;
         _qnn_interface->qnn_backend_create(
             _qnn_log_handle, temp_backend_config.empty() ? nullptr : temp_backend_config.data(), &_qnn_backend_handle);
         if (nullptr == _qnn_backend_handle) {
-            QNN_LOG_WARN("why failed to initialize qnn backend\n");
+            QNN_LOG_WARN("why failed to initialize qnn backend");
             return 5;
         } else {
-            QNN_LOG_DEBUG("initialize qnn backend successfully\n");
+            QNN_LOG_DEBUG("initialize qnn backend successfully");
         }

         Qnn_ErrorHandle_t qnn_status = _qnn_interface->qnn_property_has_capability(QNN_PROPERTY_GROUP_DEVICE);
         if (QNN_PROPERTY_NOT_SUPPORTED == qnn_status) {
-            QNN_LOG_WARN("device property is not supported\n");
+            QNN_LOG_WARN("device property is not supported");
         }
         if (QNN_PROPERTY_ERROR_UNKNOWN_KEY == qnn_status) {
-            QNN_LOG_WARN("device property is not known to backend\n");
+            QNN_LOG_WARN("device property is not known to backend");
         }

         qnn_status = QNN_SUCCESS;

@@ -294,9 +294,9 @@ public:
             qnn_status = _qnn_interface->qnn_device_create(_qnn_log_handle, nullptr, &_qnn_device_handle);
         }
         if (QNN_SUCCESS != qnn_status && QNN_DEVICE_ERROR_UNSUPPORTED_FEATURE != qnn_status) {
-            QNN_LOG_WARN("failed to create QNN device\n");
+            QNN_LOG_WARN("failed to create QNN device");
         } else {
-            QNN_LOG_INFO("create QNN device successfully\n");
+            QNN_LOG_INFO("create QNN device successfully");
         }

         if (_profile_level != sdk_profile_level::profile_off) {

@@ -306,19 +306,19 @@ public:
             if (QNN_PROFILE_NO_ERROR !=
                 _qnn_interface->qnn_profile_create(_qnn_backend_handle, profile_level, &_qnn_profile_handle)) {
-                QNN_LOG_WARN("unable to create profile handle in the backend\n");
+                QNN_LOG_WARN("unable to create profile handle in the backend");
                 return 6;
             } else {
-                QNN_LOG_DEBUG("initialize qnn profile successfully\n");
+                QNN_LOG_DEBUG("initialize qnn profile successfully");
             }
         }

         _rpc_lib_handle = dl_load("libcdsprpc.so");
         if (nullptr == _rpc_lib_handle) {
-            QNN_LOG_WARN("failed to load qualcomm's rpc lib, error:%s\n", dl_error());
+            QNN_LOG_WARN("failed to load qualcomm's rpc lib, error:%s", dl_error());
             return 8;
         } else {
-            QNN_LOG_DEBUG("load rpcmem lib successfully\n");
+            QNN_LOG_DEBUG("load rpcmem lib successfully");
             set_rpcmem_initialized(true);
         }
         _pfn_rpc_mem_init = reinterpret_cast<qnn::pfn_rpc_mem_init>(dl_sym(_rpc_lib_handle, "rpcmem_init"));

@@ -343,10 +343,10 @@ public:
         */
         _qnn_interface->qnn_context_create(_qnn_backend_handle, _qnn_device_handle, nullptr, &_qnn_context_handle);
         if (nullptr == _qnn_context_handle) {
-            QNN_LOG_WARN("why failed to initialize qnn context\n");
+            QNN_LOG_WARN("why failed to initialize qnn context");
             return 10;
         } else {
-            QNN_LOG_DEBUG("initialize qnn context successfully\n");
+            QNN_LOG_DEBUG("initialize qnn context successfully");
         }

         if (_backend_name.find("Htp") != _backend_name.npos) {

@@ -359,7 +359,7 @@ public:
             for (size_t idx = 0; idx < probe_counts; idx++) {
                 rpc_buffer = static_cast<uint8_t *>(alloc_rpcmem(probe_slots[idx] * size_in_mb, sizeof(void *)));
                 if (!rpc_buffer) {
-                    QNN_LOG_DEBUG("alloc rpcmem %d (MB) failure, %s\n", probe_slots[idx], strerror(errno));
+                    QNN_LOG_DEBUG("alloc rpcmem %d (MB) failure, %s", probe_slots[idx], strerror(errno));
                     break;
                 } else {
                     candidate_size = probe_slots[idx];

@@ -369,7 +369,7 @@ public:
             }

             _rpcmem_capacity = std::max(candidate_size, _rpcmem_capacity);
-            QNN_LOG_INFO("capacity of QNN rpc ion memory is about %d MB\n", _rpcmem_capacity);
+            QNN_LOG_INFO("capacity of QNN rpc ion memory is about %d MB", _rpcmem_capacity);

             if (0 != init_htp_perfinfra()) {
                 QNN_LOG_WARN("initialize HTP performance failure");

@@ -382,7 +382,7 @@ public:
             }
         }

-        QNN_LOG_DEBUG("leave qni_init\n");
+        QNN_LOG_DEBUG("leave qnn_init");

         return 0;
     }

@@ -395,9 +395,9 @@ public:
         _pfn_rpc_mem_deinit();

         if (dl_unload(_rpc_lib_handle) != 0) {
-            QNN_LOG_WARN("failed to unload qualcomm's rpc lib, error:%s\n", dl_error());
+            QNN_LOG_WARN("failed to unload qualcomm's rpc lib, error:%s", dl_error());
         } else {
-            QNN_LOG_DEBUG("succeed to close rpcmem lib\n");
+            QNN_LOG_DEBUG("succeed to close rpcmem lib");
         }

         if (_backend_name.find("Htp") != _backend_name.npos) {

@@ -407,7 +407,7 @@ public:
         if (nullptr != _qnn_context_handle) {
             error = _qnn_interface->qnn_context_free(_qnn_context_handle, _qnn_profile_handle);
             if (error != QNN_SUCCESS) {
-                QNN_LOG_WARN("failed to free QNN context_handle: ID %u, error %d\n", _qnn_interface->get_backend_id(),
+                QNN_LOG_WARN("failed to free QNN context_handle: ID %u, error %d", _qnn_interface->get_backend_id(),
                              QNN_GET_ERROR_CODE(error));
             }
             _qnn_context_handle = nullptr;

@@ -416,7 +416,7 @@ public:
         if (nullptr != _qnn_profile_handle) {
             error = _qnn_interface->qnn_profile_free(_qnn_profile_handle);
             if (error != QNN_SUCCESS) {
-                QNN_LOG_WARN("failed to free QNN profile_handle: ID %u, error %d\n", _qnn_interface->get_backend_id(),
+                QNN_LOG_WARN("failed to free QNN profile_handle: ID %u, error %d", _qnn_interface->get_backend_id(),
                              QNN_GET_ERROR_CODE(error));
             }
             _qnn_profile_handle = nullptr;

@@ -425,7 +425,7 @@ public:
         if (nullptr != _qnn_device_handle) {
             error = _qnn_interface->qnn_device_free(_qnn_device_handle);
             if (error != QNN_SUCCESS) {
-                QNN_LOG_WARN("failed to free QNN device_handle: ID %u, error %d\n", _qnn_interface->get_backend_id(),
+                QNN_LOG_WARN("failed to free QNN device_handle: ID %u, error %d", _qnn_interface->get_backend_id(),
                              QNN_GET_ERROR_CODE(error));
             }
             _qnn_device_handle = nullptr;

@@ -434,7 +434,7 @@ public:
         if (nullptr != _qnn_backend_handle) {
             error = _qnn_interface->qnn_backend_free(_qnn_backend_handle);
             if (error != QNN_SUCCESS) {
-                QNN_LOG_WARN("failed to free QNN backend_handle: ID %u, error %d\n", _qnn_interface->get_backend_id(),
+                QNN_LOG_WARN("failed to free QNN backend_handle: ID %u, error %d", _qnn_interface->get_backend_id(),
                              QNN_GET_ERROR_CODE(error));
             }
             _qnn_backend_handle = nullptr;

@@ -443,7 +443,7 @@ public:
         if (nullptr != _qnn_log_handle) {
             error = _qnn_interface->qnn_log_free(_qnn_log_handle);
             if (error != QNN_SUCCESS) {
-                QNN_LOG_WARN("failed to free QNN log_handle: ID %u, error %d\n", _qnn_interface->get_backend_id(),
+                QNN_LOG_WARN("failed to free QNN log_handle: ID %u, error %d", _qnn_interface->get_backend_id(),
                              QNN_GET_ERROR_CODE(error));
             }
             _qnn_log_handle = nullptr;

@@ -458,7 +458,7 @@ public:
     std::shared_ptr<qnn_interface> get_qnn_interface() {
         if (!_qnn_interface) {
-            QNN_LOG_WARN("pls check why _qnn_interface is not loaded\n");
+            QNN_LOG_WARN("pls check why _qnn_interface is not loaded");
         }
         return _qnn_interface;
     }

@@ -479,10 +479,10 @@ public:
         QnnDevice_Infrastructure_t device_infra = nullptr;
         int error = _qnn_interface->qnn_device_get_infrastructure(&device_infra);
         if (error != QNN_SUCCESS) {
-            QNN_LOG_WARN("failed to get qnn device infra\n");
+            QNN_LOG_WARN("failed to get qnn device infra");
             return 1;
         } else {
-            QNN_LOG_INFO("HTP backend perf_infrastructure creation ok\n");
+            QNN_LOG_INFO("HTP backend perf_infrastructure creation ok");
         }

         QnnHtpDevice_Infrastructure_t *htp_infra = static_cast<QnnHtpDevice_Infrastructure_t *>(device_infra);

@@ -494,7 +494,7 @@ public:
         if (htp_infra->infraType != QNN_HTP_DEVICE_INFRASTRUCTURE_TYPE_PERF) {
             QNN_LOG_INFO("HTP infra type = %d, which is not perf infra type", htp_infra->infraType);
         } else {
-            QNN_LOG_INFO("HTP infra type = %d, which is perf infra type\n", htp_infra->infraType);
+            QNN_LOG_INFO("HTP infra type = %d, which is perf infra type", htp_infra->infraType);
         }
         _qnn_htp_perfinfra = htp_perfinfra;
         _qnn_power_configid = power_configid;

@@ -520,12 +520,12 @@ public:
                 nullptr};
             Qnn_ErrorHandle_t qnn_status = _qnn_htp_perfinfra->setPowerConfig(_qnn_power_configid, power_configs);
             if (qnn_status != QNN_SUCCESS) {
-                QNN_LOG_WARN("set htp perf failed\n");
+                QNN_LOG_WARN("set htp perf failed");
             } else {
-                QNN_LOG_DEBUG("set htp perf ok\n");
+                QNN_LOG_DEBUG("set htp perf ok");
             }
         } else {
-            QNN_LOG_WARN("can't set htp perf\n");
+            QNN_LOG_WARN("can't set htp perf");
         }

         return 0;

@@ -533,7 +533,7 @@ public:
     int set_high_performance_mode() {
         if (nullptr == _qnn_htp_perfinfra) {
-            QNN_LOG_WARN("perf intra is null\n");
+            QNN_LOG_WARN("perf intra is null");
             return 1;
         }

@@ -566,9 +566,9 @@ public:
         Qnn_ErrorHandle_t qnn_status = QNN_SUCCESS;
         qnn_status = _qnn_htp_perfinfra->setPowerConfig(_qnn_power_configid, power_configs);
         if (qnn_status != QNN_SUCCESS) {
-            QNN_LOG_WARN("set htp high performance mode failed\n");
+            QNN_LOG_WARN("set htp high performance mode failed");
         } else {
-            QNN_LOG_DEBUG("set htp high performance mode ok\n");
+            QNN_LOG_DEBUG("set htp high performance mode ok");
         }

         return 0;

@@ -584,21 +584,21 @@ public:
     void *alloc_rpcmem(size_t bytes, size_t alignment) {
         if (!_rpcmem_initialized) {
-            QNN_LOG_WARN("rpc memory not initialized\n");
+            QNN_LOG_WARN("rpc memory not initialized");
             return nullptr;
         }

         auto allocate_bytes = static_cast<int64_t>(bytes + alignment);
         void *buf = _pfn_rpc_mem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, (int)allocate_bytes);
         if (!buf) {
-            QNN_LOG_WARN("failed to allocate rpc memory, size: %d MB\n", (int)(allocate_bytes / (1 << 20)));
+            QNN_LOG_WARN("failed to allocate rpc memory, size: %d MB", (int)(allocate_bytes / (1 << 20)));
             return nullptr;
         }

         auto aligned_buf = reinterpret_cast<void *>(qnn::align_to(alignment, reinterpret_cast<intptr_t>(buf)));
         bool status = _rpcmem_store_map.insert(std::pair<void *, void *>(aligned_buf, buf)).second;
         if (!status) {
-            QNN_LOG_WARN("failed to allocate rpc memory\n");
+            QNN_LOG_WARN("failed to allocate rpc memory");
             _pfn_rpc_mem_free(buf);
         }
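alloc_rpcmem over-allocates by `alignment` bytes, rounds the returned pointer up with qnn::align_to, and records the aligned-to-raw mapping in _rpcmem_store_map so free_rpcmem can release the original pointer. align_to itself is only declared in utils.hpp in this commit; a sketch of a plausible body under that reading (an assumption, not the actual definition):

    // round `offset` up to the next multiple of `alignment` (no-op when already aligned)
    intptr_t align_to(size_t alignment, intptr_t offset) {
        intptr_t remainder = offset % static_cast<intptr_t>(alignment);
        return remainder == 0 ? offset : offset + static_cast<intptr_t>(alignment) - remainder;
    }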
@@ -607,9 +607,9 @@ public:

     void free_rpcmem(void *buf) {
         if (!_rpcmem_initialized) {
-            QNN_LOG_WARN("rpc memory not initialized\n");
+            QNN_LOG_WARN("rpc memory not initialized");
         } else if (_rpcmem_store_map.count(buf) == 0) {
-            QNN_LOG_WARN("no allocated tensor\n");
+            QNN_LOG_WARN("no allocated tensor");
         } else {
             _pfn_rpc_mem_free(_rpcmem_store_map[buf]);
             _rpcmem_store_map.erase(buf);

@@ -619,7 +619,7 @@ public:
     int32_t rpcmem_to_fd(void *buf) {
         int32_t mem_fd = -1;
         if (!is_rpcmem_initialized()) {
-            QNN_LOG_WARN("rpc memory not initialized\n");
+            QNN_LOG_WARN("rpc memory not initialized");
         } else {
             mem_fd = _pfn_rpc_mem_to_fd(buf);
         }

@@ -629,52 +629,51 @@ public:

     Qnn_MemHandle_t register_rpcmem(void *p_data, const uint32_t rank, uint32_t *dimensions, Qnn_DataType_t data_type) {
         if (!p_data) {
-            QNN_LOG_WARN("invalid param\n");
+            QNN_LOG_WARN("invalid param");
             return nullptr;
         }

         if (!is_rpcmem_initialized()) {
-            QNN_LOG_WARN("rpc memory not initialized\n");
+            QNN_LOG_WARN("rpc memory not initialized");
             return nullptr;
         }

         if (is_rpcmem_registered(p_data)) {
-            QNN_LOG_WARN("rpc memory already registered\n");
+            QNN_LOG_WARN("rpc memory already registered");
             return _qnn_rpc_buffer_to_handles[p_data];
         }

         auto mem_fd = rpcmem_to_fd(p_data);
         if (mem_fd == -1) {
-            QNN_LOG_WARN("failed to get file descriptor\n");
+            QNN_LOG_WARN("failed to get file descriptor");
             return nullptr;
         }

-        QNN_LOG_DEBUG("mem_fd %d\n", mem_fd);
+        QNN_LOG_DEBUG("mem_fd %d", mem_fd);
         Qnn_MemDescriptor_t descriptor = {{rank, dimensions, nullptr}, data_type, QNN_MEM_TYPE_ION, {{mem_fd}}};
         Qnn_MemHandle_t handle = nullptr;
         auto error = _qnn_interface->qnn_mem_register(_qnn_context_handle, &descriptor,
                                                       /*numDescriptors=*/1, &handle);
         if (error != QNN_SUCCESS) {
-            QNN_LOG_WARN("failed to register shared memory, error %d, %s\n", QNN_GET_ERROR_CODE(error),
-                         strerror(error));
+            QNN_LOG_WARN("failed to register shared memory, error %d, %s", QNN_GET_ERROR_CODE(error), strerror(error));
             return nullptr;
         }

         _qnn_rpc_buffer_to_handles.insert({p_data, handle});
-        QNN_LOG_DEBUG("successfully register shared memory handler: %p\n", handle);
+        QNN_LOG_DEBUG("successfully register shared memory handler: %p", handle);
         return handle;
     }

     void unregister_rpcmem(Qnn_MemHandle_t mem_handle) {
         Qnn_ErrorHandle_t error = _qnn_interface->qnn_mem_de_register(&mem_handle, 1);
         if (error != QNN_SUCCESS) {
-            QNN_LOG_WARN("failed to unregister shared memory, error %d\n", QNN_GET_ERROR_CODE(error));
+            QNN_LOG_WARN("failed to unregister shared memory, error %d", QNN_GET_ERROR_CODE(error));
         }

         auto it = std::find_if(_qnn_rpc_buffer_to_handles.begin(), _qnn_rpc_buffer_to_handles.end(),
                                [mem_handle](const auto &kv) { return kv.second == mem_handle; });
         if (it == _qnn_rpc_buffer_to_handles.end()) {
-            QNN_LOG_WARN("failed to find shared memory handler: %p\n", mem_handle);
+            QNN_LOG_WARN("failed to find shared memory handler: %p", mem_handle);
             return;
         }
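Taken together, the rpcmem helpers form an allocate → register → bind → unregister → free lifecycle. A hypothetical call sequence for illustration (the shape, size, and data type are made up; `instance` stands for a qnn_instance pointer):

    uint32_t dims[] = {128, 32, 1, 1};  // hypothetical 2-D tensor padded to rank 4
    void *buf = instance->alloc_rpcmem(128 * 32 * sizeof(float), sizeof(void *));
    Qnn_MemHandle_t handle = instance->register_rpcmem(buf, 4, dims, QNN_DATATYPE_FLOAT_32);
    // ... hand `handle` to a tensor via QNN_TENSOR_SET_MEM_HANDLE and run the graph ...
    instance->unregister_rpcmem(handle);
    instance->free_rpcmem(buf);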
@@ -691,18 +690,18 @@ private:
         Qnn_ErrorHandle_t error = QNN_SUCCESS;

         std::string system_lib_path = _lib_path + "libQnnSystem.so";
-        QNN_LOG_DEBUG("system_lib_path:%s\n", system_lib_path.c_str());
+        QNN_LOG_DEBUG("system_lib_path:%s", system_lib_path.c_str());

         auto system_lib_handle = dl_load(system_lib_path);
         if (!system_lib_handle) {
-            QNN_LOG_WARN("can not load QNN library %s, error: %s\n", system_lib_path.c_str(), dl_error());
+            QNN_LOG_WARN("can not load QNN library %s, error: %s", system_lib_path.c_str(), dl_error());
             return 1;
         }

         auto *get_providers = dl_sym_typed<qnn::pfn_qnnsysteminterface_getproviders *>(
             system_lib_handle, "QnnSystemInterface_getProviders");
         if (!get_providers) {
-            QNN_LOG_WARN("can not load QNN symbol QnnSystemInterface_getProviders: %s\n", dl_error());
+            QNN_LOG_WARN("can not load QNN symbol QnnSystemInterface_getProviders: %s", dl_error());
             return 2;
         }

@@ -710,17 +709,17 @@ private:
         const QnnSystemInterface_t **provider_list = nullptr;
         error = get_providers(&provider_list, &num_providers);
         if (error != QNN_SUCCESS) {
-            QNN_LOG_WARN("failed to get providers, error %d\n", QNN_GET_ERROR_CODE(error));
+            QNN_LOG_WARN("failed to get providers, error %d", QNN_GET_ERROR_CODE(error));
             return 3;
         }

         if (num_providers != _required_num_providers) {
-            QNN_LOG_WARN("providers is %d instead of required %d\n", num_providers, _required_num_providers);
+            QNN_LOG_WARN("providers is %d instead of required %d", num_providers, _required_num_providers);
             return 4;
         }

         if (!provider_list) {
-            QNN_LOG_WARN("can not get providers\n");
+            QNN_LOG_WARN("can not get providers");
             return 5;
         }

@@ -735,15 +734,15 @@ private:
             }
         }
         if (!found_valid_system_interface) {
-            QNN_LOG_WARN("unable to find a valid qnn system interface\n");
+            QNN_LOG_WARN("unable to find a valid qnn system interface");
             return 6;
         } else {
-            QNN_LOG_DEBUG("find a valid qnn system interface\n");
+            QNN_LOG_DEBUG("find a valid qnn system interface");
         }

         auto qnn_sys_interface = std::make_shared<qnn::qnn_system_interface>(*provider_list[0], system_lib_handle);
         if (!qnn_sys_interface->is_valid()) {
-            QNN_LOG_WARN("failed to create QNN system interface\n");
+            QNN_LOG_WARN("failed to create QNN system interface");
             return 7;
         }

@@ -753,7 +752,7 @@ private:
     int load_backend(std::string &lib_path, const QnnSaver_Config_t ** /*saver_config*/) {
         Qnn_ErrorHandle_t error = QNN_SUCCESS;
-        QNN_LOG_DEBUG("lib_path:%s\n", lib_path.c_str());
+        QNN_LOG_DEBUG("lib_path:%s", lib_path.c_str());

         auto lib_handle = dl_load(lib_path.c_str());
         if (!lib_handle) {

@@ -775,14 +774,14 @@ private:
             QNN_LOG_WARN("failed to get providers, error %d", QNN_GET_ERROR_CODE(error));
             return 3;
         }
-        QNN_LOG_DEBUG("num_providers=%d\n", num_providers);
+        QNN_LOG_DEBUG("num_providers=%d", num_providers);
         if (num_providers != _required_num_providers) {
             QNN_LOG_WARN("providers is %d instead of required %d", num_providers, _required_num_providers);
             return 4;
         }

         if (!provider_list) {
-            QNN_LOG_WARN("failed to get qnn interface providers\n");
+            QNN_LOG_WARN("failed to get qnn interface providers");
             return 5;
         }
         bool found_valid_interface = false;

@@ -797,23 +796,23 @@ private:
         }

         if (!found_valid_interface) {
-            QNN_LOG_WARN("unable to find a valid qnn interface\n");
+            QNN_LOG_WARN("unable to find a valid qnn interface");
             return 6;
         } else {
-            QNN_LOG_DEBUG("find a valid qnn interface\n");
+            QNN_LOG_DEBUG("find a valid qnn interface");
         }

         BackendIdType backend_id = provider_list[0]->backendId;
         _lib_path_to_backend_id[lib_path] = backend_id;
         if (_loaded_backend.count(backend_id) > 0) {
-            QNN_LOG_WARN("lib_path %s is loaded, but backend %d already exists\n", lib_path.c_str(), backend_id);
+            QNN_LOG_WARN("lib_path %s is loaded, but backend %d already exists", lib_path.c_str(), backend_id);
         }
         _loaded_backend[backend_id] = provider_list[0];
         if (_loaded_lib_handle.count(backend_id) > 0) {
-            QNN_LOG_WARN("closing %p\n", _loaded_lib_handle[backend_id]);
+            QNN_LOG_WARN("closing %p", _loaded_lib_handle[backend_id]);
             int dlclose_error = dl_unload(_loaded_lib_handle[backend_id]);
             if (dlclose_error != 0) {
-                QNN_LOG_WARN("fail to close %p with error %s\n", _loaded_lib_handle[backend_id], dl_error());
+                QNN_LOG_WARN("fail to close %p with error %s", _loaded_lib_handle[backend_id], dl_error());
             }
         }
         _loaded_lib_handle[backend_id] = lib_handle;
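load_system and load_backend share one discovery pattern: dlopen the library, resolve its `*_getProviders` entry point, and adopt the first provider whose interface is valid. A condensed sketch of the backend side (the dl_* wrappers are the backend's own; `pfn_qnninterface_getproviders` is an assumed typedef mirroring the system-interface one above, and error handling is trimmed):

    auto lib_handle = dl_load(lib_path);
    auto *get_providers =
        dl_sym_typed<qnn::pfn_qnninterface_getproviders *>(lib_handle, "QnnInterface_getProviders");
    const QnnInterface_t **provider_list = nullptr;
    uint32_t num_providers = 0;
    if (get_providers && QNN_SUCCESS == get_providers(&provider_list, &num_providers) && provider_list) {
        // keep provider_list[0]; its backendId keys the _loaded_backend map
    }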
@@ -827,7 +826,7 @@ private:
         for (auto &it : _loaded_lib_handle) {
             dlclose_error = dl_unload(it.second);
             if (dlclose_error != 0) {
-                QNN_LOG_WARN("failed to close QNN backend %d, error %s\n", it.first, dl_error());
+                QNN_LOG_WARN("failed to close QNN backend %d, error %s", it.first, dl_error());
             }
         }

@@ -32,10 +32,10 @@ public:
         if (!_tensor_name.empty()) {
             QNN_TENSOR_SET_NAME(_qnn_tensor, _tensor_name.c_str());
         }
-        QNN_TENSOR_SET_DIMENSIONS(_qnn_tensor, _dimensions.data());
-        QNN_TENSOR_SET_DATA_FORMAT(_qnn_tensor, QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER);
+        _dimensions = dimensions;
+        QNN_TENSOR_SET_DIMENSIONS(_qnn_tensor, _dimensions.data());
+        QNN_TENSOR_SET_DATA_FORMAT(_qnn_tensor, QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER);
         update_params_from_ggml_tensor(tensor_type, data_type, rank);
         QNN_LOG_DEBUG("[%s][%s]created, rank: %d, dims: [%d, %d, %d, %d], type: %s", get_backend_name(device),
                       _tensor_name.c_str(), rank, (int)_dimensions[0], (int)_dimensions[1], (int)_dimensions[2],
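The reordering above fixes a use-before-assignment: the old constructor handed `_dimensions.data()` to QNN while the member still held its default (all-zero) contents, so the tensor's dimension pointer described an empty shape. In miniature, assuming `dimensions` is the incoming shape argument:

    qnn_dimension_array_t _dimensions = {};                       // zeros until assigned
    _dimensions = dimensions;                                     // copy the real shape first
    QNN_TENSOR_SET_DIMENSIONS(_qnn_tensor, _dimensions.data());   // pointer now refers to valid extents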
@@ -51,7 +51,7 @@ public:
     ~ggml_qnn_tensor() {
         _buffer_storage.clear();
         unbind();
-        _qnn_rpc_buffer.reset();
+        _rpc_buffer.reset();
     }

     bool set_data_buffer(std::vector<uint8_t> &&buffer) {

@@ -73,7 +73,7 @@ public:
         auto qnn_interface = _qnn_instance->get_qnn_interface();
         auto error = qnn_interface->qnn_tensor_create_graph_tensor(_graph_handle, &qnn_tensor);
         if (error != QNN_SUCCESS) {
-            QNN_LOG_WARN("[%s]allocate id failed , error: %d\n", _tensor_name.c_str(), error);
+            QNN_LOG_WARN("[%s]allocate id failed , error: %d", _tensor_name.c_str(), error);
             return false;
         }

@@ -162,21 +162,29 @@ private:
         }

         if (should_use_mem_handle()) {
-            if (!_qnn_rpc_buffer) {
-                auto qnn_rpc_buffer = std::make_unique<ggml_qnn_rpc_buffer>(
+            if (!_rpc_buffer) {
+                auto rpc_buffer = std::make_shared<qnn_rpc_buffer>(
                     _qnn_instance, buffer_size, QNN_TENSOR_GET_RANK(_qnn_tensor),
                     QNN_TENSOR_GET_DIMENSIONS(_qnn_tensor), QNN_TENSOR_GET_DATA_TYPE(_qnn_tensor));
-                if (!qnn_rpc_buffer->is_valid()) {
-                    QNN_LOG_WARN("[%s]alloc rpc mem failed", _tensor_name.c_str());
+                if (!rpc_buffer->is_valid()) {
+                    QNN_LOG_WARN("[%s][%s]alloc rpc mem failed", get_backend_name(_device), _tensor_name.c_str());
                     return false;
                 }

-                _qnn_rpc_buffer = std::move(qnn_rpc_buffer);
+                _rpc_buffer = std::move(rpc_buffer);
             }

             QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_MEMHANDLE);
-            QNN_TENSOR_SET_MEM_HANDLE(_qnn_tensor, _qnn_rpc_buffer->get_mem_handle());
-            QNN_LOG_DEBUG("[%s]use mem handle %p", _tensor_name.c_str(), QNN_TENSOR_GET_MEM_HANDLE(_qnn_tensor));
+            auto mem_handle = _rpc_buffer->get_mem_handle();
+            if (!mem_handle) {
+                QNN_LOG_WARN("[%s][%s]can't find rpcmem from qnn mem handle", get_backend_name(_device),
+                             _tensor_name.c_str());
+                return false;
+            }
+
+            QNN_TENSOR_SET_MEM_HANDLE(_qnn_tensor, mem_handle);
+            QNN_LOG_DEBUG("[%s][%s]use mem handle %p", get_backend_name(_device), _tensor_name.c_str(),
+                          QNN_TENSOR_GET_MEM_HANDLE(_qnn_tensor));
         } else {
             QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_RAW);
             Qnn_ClientBuffer_t client_buf = {buffer, (uint32_t)buffer_size};
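The rewritten branch also validates the handle instead of assuming it: get_mem_handle() can return null when registration with the QNN context failed, and binding a null handle would only surface later as a graph-execution error. should_use_mem_handle() itself is not shown in this hunk; a plausible shape, inferred from how it gates the RPC path (an assumption, not the actual definition):

    bool should_use_mem_handle() const {
        // shared (ION/rpc) memory only pays off on the NPU;
        // CPU and GPU backends read the raw client buffer directly
        return _device == QNN_BACKEND_NPU;
    }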
@@ -205,13 +213,8 @@ private:
             return true;
         }

-        if (should_use_mem_handle()) {
-            if (_qnn_rpc_buffer) {
-                memcpy(_qnn_rpc_buffer->get_buffer(), _buffer, _buffer_size);
-            } else {
-                QNN_LOG_WARN("[%s]can't find rpcmem from qnn mem handle\n", _tensor_name.c_str());
-                return false;
-            }
+        if (_rpc_buffer) {
+            memcpy(_rpc_buffer->get_buffer(), _buffer, _buffer_size);
         }

         // For CPU and GPU, the data is already in the tensor.

@@ -226,13 +229,8 @@ private:
             return true;
         }

-        if (should_use_mem_handle()) {
-            if (_qnn_rpc_buffer) {
-                memcpy(_buffer, _qnn_rpc_buffer->get_buffer(), _buffer_size);
-            } else {
-                QNN_LOG_WARN("[%s]can't find rpcmem from qnn mem handle", _tensor_name.c_str());
-                return false;
-            }
+        if (_rpc_buffer) {
+            memcpy(_buffer, _rpc_buffer->get_buffer(), _buffer_size);
         }

         // For CPU and GPU, the data is already in the tensor.

@@ -283,7 +281,7 @@ private:
     Qnn_Tensor_t _qnn_tensor = qnn_tensor_init(kDefaultQnnTensorVersion);
     qnn_dimension_array_t _dimensions = {};
     Qnn_GraphHandle_t _graph_handle = nullptr;
-    std::unique_ptr<ggml_qnn_rpc_buffer> _qnn_rpc_buffer;
+    qnn_buffer_ptr _rpc_buffer;

     DISABLE_COPY(ggml_qnn_tensor);
     DISABLE_MOVE(ggml_qnn_tensor);

@@ -1,6 +1,8 @@
 #include "utils.hpp"

+#include <unistd.h>
+
 #include <cstdlib>

 #include "ggml-qnn.h"

@@ -37,6 +39,28 @@ qnn_dimension_array_t get_internal_dimension(const ggml_dimension_array_t &dims,
     return internal_dims;
 }

+qnn_dimension_array_t get_view_internal_dimension(const ggml_tensor *tensor, size_t &element_offset_out) {
+
+    element_offset_out = 0;
+
+    auto *parent_tensor = tensor;
+    while (parent_tensor->view_src) {
+        element_offset_out += parent_tensor->view_offs;
+        parent_tensor = parent_tensor->view_src;
+    }
+
+    const auto rank = get_ggml_tensor_rank(tensor);
+    const auto parent_rank = get_ggml_tensor_rank(parent_tensor);
+    GGML_ASSERT(parent_tensor->type == tensor->type);
+    GGML_ASSERT(parent_rank == rank);
+
+    const auto block_size = ggml_blck_size(tensor->type);
+    element_offset_out =
+        element_offset_out * block_size / tensor->nb[0]; // calculate the element offset in the view tensor
+
+    return get_internal_dimension(parent_tensor->ne, parent_rank);
+}
+
 // TODO: mapping more ggml data type to QNN data type
 // ref:explanation of k-quants, https://github.com/ggerganov/llama.cpp/pull/1684
 Qnn_DataType_t qnn_datatype_from_ggml_datatype(ggml_type ggml_type) {
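The new helper walks a view chain back to its root tensor, accumulating view_offs (which ggml keeps in bytes), then converts the byte offset into an element offset at the end. A worked example, assuming an F32 view: ggml_blck_size is 1 and nb[0] is sizeof(float), so a byte offset of 512 becomes 512 * 1 / 4 = 128 elements. A hypothetical caller:

    size_t element_offset = 0;  // filled in by the helper
    qnn::qnn_dimension_array_t dims = qnn::get_view_internal_dimension(view_tensor, element_offset);
    // for a float32 view with accumulated view_offs == 512 bytes: element_offset == 128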
@@ -199,6 +223,12 @@ intptr_t align_to(size_t alignment, intptr_t offset) {

 uint32_t get_ggml_tensor_data_size(const ggml_tensor *tensor) { return ggml_nbytes(tensor); }

+void *page_align_alloc(size_t size) {
+    // TODO: fix this for other platforms
+    const size_t alignment = sysconf(_SC_PAGESIZE);
+    return align_alloc(alignment, size);
+}
+
 void *align_alloc(size_t alignment, size_t size) {
     size_t size_aligned = size;
     if ((size_aligned % alignment) != 0) {
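page_align_alloc simply forwards the runtime page size into align_alloc; page alignment is what lets a buffer qualify for rpcmem/ION registration later, and sysconf(_SC_PAGESIZE) is POSIX-only, hence the TODO about other platforms. A hypothetical call:

    void *data = qnn::page_align_alloc(ggml_nbytes(tensor));  // page-aligned, sized for the tensor
    // ... use the buffer, then release it with the matching qnn::align_free(data)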
@@ -21,9 +21,11 @@
 namespace qnn {

 using ggml_dimension_array_t = int64_t[GGML_MAX_DIMS];
+using ggml_stride_array_t = size_t[GGML_MAX_DIMS];
 using qnn_dimension_array_t = std::array<uint32_t, GGML_MAX_DIMS>;

 qnn_dimension_array_t get_internal_dimension(const ggml_dimension_array_t &dims, uint32_t rank);
+qnn_dimension_array_t get_view_internal_dimension(const ggml_tensor *tensor, size_t &element_offser_out);

 uint32_t get_ggml_tensor_rank(const ggml_tensor *tensor);
 const char *get_ggml_type_name(ggml_type type);

@@ -33,6 +35,7 @@ const char *get_htparch_desc(size_t htp_arch);
 intptr_t align_to(size_t alignment, intptr_t offset);
 uint32_t get_ggml_tensor_data_size(const ggml_tensor *tensor);

+void *page_align_alloc(size_t size);
 void *align_alloc(size_t alignment, size_t size);
 void align_free(void *ptr);