feat: perf opt part2 (#39)
* add qurt_thread
* add thread pool
* add thread_pool obj at device ctx
* wip
* small refactoring to fit the thread pool structure
* set start/end threads for add
* init thread pool
* fix thread creation
* split complete and pending signals
* opt mulmat
* wip
* 2 threads
* back to 4 threads
* use barrier
* remove some unnecessary package
* add multi thread support for mul mat
* wip
* use qurt_barrier_t instead of qurt_signal_t
* wip
* wip
* add log
* split qnn cmake config
* create function to calculate the start and end func
* wip
* fix comment
* fix comment
* fix comment
* wip
* fix typo
This commit is contained in: parent a0e54cfc70, commit c2b6fec63f
@@ -5,11 +5,9 @@ option(GGML_QNN_ENABLE_HEXAGON_BACKEND "ggml-qnn: Enable Hexagon custom package"
if(CMAKE_SYSTEM_NAME STREQUAL "Android")
    find_library(LOG_LIB log)
    set(QNN_LINK_LIBRARIES ${LOG_LIB})
    set(QNN_DEFAULT_LIB_SEARCH_PATH "/data/local/tmp/" CACHE STRING "customized library search path for QNN backend")
    add_compile_options(-g -O0)
    set(COMMON_LINK_LIBRARIES ${LOG_LIB})
elseif(CMAKE_SYSTEM_NAME STREQUAL "Windows" OR CMAKE_SYSTEM_NAME STREQUAL "Linux")
    set(QNN_DEFAULT_LIB_SEARCH_PATH "" CACHE STRING "customized library search path for QNN backend")
    message("Building for Linux or Windows")
else()
    message(FATAL_ERROR "QNN now only available on Android, Windows and Linux")
endif()
@ -29,33 +27,15 @@ message("CMAKE_CXX_FLAGS_DEBUG: ${CMAKE_CXX_FLAGS_DEBUG}")
|
|||
message("CMAKE_CXX_FLAGS_RELEASE: ${CMAKE_CXX_FLAGS_RELEASE}")
|
||||
message("QNN_SDK_PATH: ${GGML_QNN_SDK_PATH}")
|
||||
|
||||
file(GLOB QNN_SOURCES "${CMAKE_CURRENT_LIST_DIR}/qnn/*.cpp")
|
||||
file(GLOB COMMON_SOURCES "${CMAKE_CURRENT_LIST_DIR}/*.cpp")
|
||||
message("GGML_QNN: ${GGML_QNN}")
|
||||
message("GGML_QNN_ENABLE_PERFORMANCE_TRACKING: ${GGML_QNN_ENABLE_PERFORMANCE_TRACKING}")
|
||||
message("GGML_QNN_ENABLE_HEXAGON_BACKEND: ${GGML_QNN_ENABLE_HEXAGON_BACKEND}")
|
||||
message("GGML_HEXAGON_NPU_ONLY: ${GGML_HEXAGON_NPU_ONLY}")
|
||||
|
||||
ggml_add_backend_library(ggml-qnn
|
||||
${QNN_SOURCES}
|
||||
${COMMON_SOURCES}
|
||||
../../include/ggml-qnn.h
|
||||
)
|
||||
|
||||
target_include_directories(ggml-qnn PRIVATE
|
||||
${GGML_QNN_SDK_PATH}/include/QNN
|
||||
${CMAKE_CURRENT_LIST_DIR}/qnn
|
||||
${CMAKE_CURRENT_LIST_DIR}
|
||||
)
|
||||
target_link_libraries(ggml-qnn PRIVATE ${QNN_LINK_LIBRARIES})
|
||||
|
||||
if(NOT "${QNN_DEFAULT_LIB_SEARCH_PATH}" STREQUAL "")
|
||||
string(REGEX REPLACE "/$" "" QNN_DEFAULT_LIB_SEARCH_PATH "${QNN_DEFAULT_LIB_SEARCH_PATH}")
|
||||
endif()
|
||||
|
||||
message("GGML_QNN_DEFAULT_LIB_SEARCH_PATH: ${QNN_DEFAULT_LIB_SEARCH_PATH}")
|
||||
target_compile_definitions(ggml-qnn PRIVATE GGML_QNN_DEFAULT_LIB_SEARCH_PATH="${QNN_DEFAULT_LIB_SEARCH_PATH}")
|
||||
|
||||
if(GGML_QNN_ENABLE_CPU_BACKEND)
|
||||
message("GGML_QNN_ENABLE_CPU_BACKEND is enabled")
|
||||
target_compile_definitions(ggml-qnn PRIVATE GGML_QNN_ENABLE_CPU_BACKEND)
|
||||
else()
|
||||
message("GGML_QNN_ENABLE_CPU_BACKEND is disabled")
|
||||
endif()
|
||||
target_link_libraries(ggml-qnn PRIVATE ${COMMON_LINK_LIBRARIES})
|
||||
|
||||
if(GGML_QNN_ENABLE_PERFORMANCE_TRACKING)
|
||||
message("GGML_QNN_ENABLE_PERFORMANCE_TRACKING is enabled")
|
||||
|
|
@ -72,6 +52,8 @@ if(GGML_HEXAGON_NPU_ONLY)
|
|||
set(GGML_QNN_ENABLE_HEXAGON_BACKEND ON)
|
||||
else()
|
||||
message("GGML_HEXAGON_NPU_ONLY is disabled")
|
||||
add_subdirectory(qnn)
|
||||
target_link_libraries(ggml-qnn PRIVATE qnn-backend)
|
||||
endif()
|
||||
|
||||
if(GGML_QNN_ENABLE_HEXAGON_BACKEND)
|
||||
|
|
|
|||
|
|
@@ -3,6 +3,7 @@
#include <HAP_compute_res.h>
#include <hexagon_types.h>

#include <memory>
#include <new>

#include "graph.hpp"
@@ -10,15 +11,30 @@
#include "op_impl.hpp"
#include "remote.h"
#include "tensor.hpp"
#include "thread_pool.hpp"
#include "util.hpp"

#define NPU_UNUSED(x) (void) (x)

namespace {

struct npu_device_context {
    int unused = 0;
    // TODO: should we add tensor context here?
    std::unique_ptr<hexagon::default_thread_pool> thread_pool;

    bool init_thread_pool() {
        if (thread_pool) {
            DEVICE_LOG_DEBUG("Thread pool already initialized");
            return true;
        }

        auto pool = std::make_unique<hexagon::default_thread_pool>();
        if (!pool) {
            DEVICE_LOG_ERROR("Failed to create thread pool");
            return false;
        }

        thread_pool = std::move(pool);
        DEVICE_LOG_DEBUG("Thread pool initialized");
        return true;
    }
};

inline hexagon::tensor * tensor_from_handle(npu_device_graph_handle_t h) {
@@ -37,6 +53,10 @@ inline npu_device_tensor_handle_t graph_to_handle(hexagon::graph * graph) {
    return reinterpret_cast<npu_device_tensor_handle_t>(graph);
}

inline npu_device_context * device_context_from_handle(remote_handle64 h) {
    return reinterpret_cast<npu_device_context *>(h);
}

}  // namespace

int npu_device_open(const char * uri, remote_handle64 * h) {
@@ -47,12 +67,18 @@ int npu_device_open(const char * uri, remote_handle64 * h) {
        return AEE_ENOMEMORY;
    }

    if (!context->init_thread_pool()) {
        DEVICE_LOG_ERROR("Failed to initialize thread pool");
        delete context;
        return AEE_EFAILED;
    }

    *h = reinterpret_cast<remote_handle64>(context);
    return AEE_SUCCESS;
}

int npu_device_close(remote_handle64 h) {
    auto * context = reinterpret_cast<npu_device_context *>(h);
    auto * context = device_context_from_handle(h);
    if (!context) {
        DEVICE_LOG_ERROR("Invalid npu_device_context handle");
        return AEE_EINVHANDLE;
@@ -149,13 +175,19 @@ AEEResult npu_device_graph_set_tensor(remote_handle64 _h, npu_device_graph_handl
}

AEEResult npu_device_graph_compute(remote_handle64 _h, npu_device_graph_handle_t graph_handle) {
    NPU_UNUSED(_h);
    auto * graph = graph_from_handle(graph_handle);
    if (!graph) {
    auto dev_ctx = device_context_from_handle(_h);
    if (!dev_ctx) {
        DEVICE_LOG_DEBUG("Invalid npu_device_context handle");
        return AEE_EINVHANDLE;
    }

    if (!graph->compute()) {
    auto * graph = graph_from_handle(graph_handle);
    if (!graph) {
        DEVICE_LOG_ERROR("Invalid graph handle");
        return AEE_EINVHANDLE;
    }

    if (!graph->compute(dev_ctx->thread_pool.get())) {
        return AEE_EFAILED;
    }
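Note: the entry points above are the device-side implementations behind the FastRPC stub, so the expected call order from the host is open, then compute, then close. A minimal sketch of that lifecycle, assuming the generated stub exposes the same signatures (the header names, URI string, and the way the graph handle was obtained are assumptions, not part of this hunk):

    // Host-side sketch only; assumes the FastRPC stub generated from the
    // hexagon_npu IDL and the AEE error codes from the Hexagon SDK.
    #include "hexagon_npu.h"   // stub declarations (assumed header name)
    #include "AEEStdErr.h"     // AEE_SUCCESS

    bool run_graph(const char * device_uri, npu_device_graph_handle_t graph) {
        remote_handle64 dev = 0;
        if (npu_device_open(device_uri, &dev) != AEE_SUCCESS) {
            return false;                      // context or thread-pool init failed
        }
        // Dispatches the graph onto the thread pool created in npu_device_open().
        const bool ok = npu_device_graph_compute(dev, graph) == AEE_SUCCESS;
        npu_device_close(dev);                 // frees the context and joins the pool threads
        return ok;
    }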
@@ -8,24 +8,23 @@

namespace hexagon {

graph::graph() noexcept {
    DEVICE_LOG_DEBUG("graph(%p) created\n", (void *) this);
}

graph::~graph() noexcept {
    if (_tensors) {
        delete[] _tensors;
    }
    _tensors.reset();
    DEVICE_LOG_DEBUG("graph(%p) destroyed\n", (void *) this);
}

void graph::set_tensor(const npu_device_tensor_handle_t * tensors, int tensor_count) {
    if (_tensor_count > 0) {
        delete[] _tensors;
    }

    if (tensor_count <= 0) {
        _tensors = nullptr;
        _tensors.reset();
        _tensor_count = 0;
        return;
    }

    _tensors = new (std::nothrow) tensor *[tensor_count];
    _tensors = std::make_unique<tensor *[]>(size_t(tensor_count));
    for (int i = 0; i < tensor_count; ++i) {
        auto * tensor_obj = reinterpret_cast<tensor *>(tensors[i]);
        _tensors[i] = tensor_obj;
@@ -37,31 +36,43 @@ void graph::set_tensor(const npu_device_tensor_handle_t * tensors, int tensor_co
    DEVICE_LOG_DEBUG("graph(%p) tensor count: %zu\n", (void *) this, _tensor_count);
}

bool graph::compute() {
bool graph::compute(default_thread_pool * thread_pool) {
    if (!_tensors || !_tensor_count) {
        DEVICE_LOG_DEBUG("graph(%p) no tensors to compute\n", (void *) this);
        return true;  // return success if no tensors to compute
    }

    DEVICE_LOG_DEBUG("graph(%p) compute\n", (void *) this);
    thread_pool->sync_execute(reinterpret_cast<default_thread_pool::task_type>(&graph::thread_pool_task), this);

    for (size_t i = 0; i < _tensor_count; ++i) {
        auto * dst = _tensors[i];
        auto op = dst->get_op();
        auto * func = get_compute_func(op);
        if (!func) {
            DEVICE_LOG_ERROR("graph(%p) tensor[%zu] op %d not supported\n", (void *) this, i, op);
            return false;
        }

        if (!func(dst)) {
            DEVICE_LOG_ERROR("graph(%p) tensor[%zu] op %d compute failed\n", (void *) this, i, op);
            return false;
        }

        auto * dst = _tensors[i];
        dst->flush();  // TODO: optimize this
    }

    return true;
}

void graph::thread_pool_task(default_thread_pool * pool, size_t thread_idx, size_t thread_count, graph * graph) {
    NPU_UNUSED(pool);
    graph->compute_impl(thread_idx, thread_count);
}

void graph::compute_impl(size_t thread_idx, size_t thread_count) {
    for (size_t i = 0; i < _tensor_count; ++i) {
        auto * dst = _tensors[i];
        auto op = dst->get_op();
        auto * func = get_compute_func(op);
        if (!func) {
            DEVICE_LOG_ERROR("graph(%p) tensor[%zu] op %d not supported\n", (void *) this, i, op);
            return;
        }

        if (!func(dst, thread_idx, thread_count)) {
            DEVICE_LOG_ERROR("graph(%p) tensor[%zu] op %d compute failed\n", (void *) this, i, op);
            return;
        }
    }
}

}  // namespace hexagon
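Note: graph::compute() now hands the whole graph to the pool once; every pool thread (plus the caller as index 0) runs compute_impl with its own thread index, and the per-op functions do the actual data splitting. A minimal usage sketch of that pool API, assuming the on-device QURT environment ("my_task" and "my_payload" are made-up names for illustration):

    #include "thread_pool.hpp"

    struct my_payload {
        int value = 42;
    };

    // Matches default_thread_pool::task_type; runs once on each participating thread.
    static void my_task(hexagon::default_thread_pool * pool, size_t thread_idx, size_t thread_count, void * arg) {
        NPU_UNUSED(pool);
        auto * payload = reinterpret_cast<my_payload *>(arg);
        DEVICE_LOG_DEBUG("task %zu/%zu sees %d", thread_idx, thread_count, payload->value);
    }

    bool run_once(hexagon::default_thread_pool & pool) {
        my_payload payload;
        // Blocks until every pool thread, and the calling thread as index 0,
        // has executed my_task exactly once.
        return pool.sync_execute(&my_task, &payload);
    }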
@@ -1,29 +1,32 @@
#pragma once

#include <memory>

#include "hexagon_npu.h"
#include "tensor.hpp"
#include "thread_pool.hpp"

namespace hexagon {

class graph {
  public:
    // TODO: add execute direction here
    explicit graph() noexcept {}
    explicit graph() noexcept;

    ~graph() noexcept;

    void set_tensor(const npu_device_tensor_handle_t * tensors, int tensor_count);

    bool compute();
    bool compute(default_thread_pool * thread_pool);

  private:
    tensor ** _tensors = nullptr;
    size_t _tensor_count = 0;
    static void thread_pool_task(default_thread_pool * pool, size_t thread_idx, size_t thread_count, graph * graph);
    void compute_impl(size_t thread_idx, size_t thread_count);

    graph(const graph &) = delete;
    void operator=(const graph &) = delete;
    graph(graph &&) = delete;
    void operator=(graph &&) = delete;
    std::unique_ptr<tensor *[]> _tensors;
    size_t _tensor_count = 0;

    DISABLE_COPY_AND_MOVE(graph);
};

}  // namespace hexagon
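Note: the raw `new (std::nothrow) tensor *[...]` / `delete[]` pair is replaced by `std::unique_ptr<tensor *[]>`, so the array of tensor pointers is released automatically while the pointed-to tensors remain owned elsewhere. A small self-contained illustration of that ownership change (plain structs stand in for hexagon::tensor; not code from this commit):

    #include <memory>

    struct fake_tensor { int id; };   // stand-in for hexagon::tensor

    int main() {
        // Owns only the array of pointers, mirroring how graph::_tensors
        // references tensors it does not own.
        std::unique_ptr<fake_tensor *[]> slots = std::make_unique<fake_tensor *[]>(4);

        fake_tensor a{1}, b{2};
        slots[0] = &a;
        slots[1] = &b;

        slots.reset();   // equivalent of the old `delete[] _tensors;`
        return 0;        // no manual cleanup needed if reset() is omitted
    }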
@@ -76,11 +76,12 @@ inline HVX_Vector vmul_f32_f32(HVX_Vector a, HVX_Vector b) {
}

template <typename _TySrc, typename _TyDst, void (*_RowFunc)(const _TySrc *, const _TySrc *, size_t, _TyDst *)>
bool element_wise_op(hexagon::tensor * out) {
bool element_wise_op(hexagon::tensor * out, size_t tidx, size_t tcnt) {
    if (!out) {
        return false;
    }

    static_assert(DEVICE_TENSOR_MAX_DIMS == 4, "element_wise_op requires max dims 4");
    auto * src0 = out->get_src(0);
    auto * src1 = out->get_src(1);
    if (!src0 || !src1) {
@@ -93,28 +94,24 @@ bool element_wise_op(hexagon::tensor * out) {
        return false;
    }

    static_assert(DEVICE_TENSOR_MAX_DIMS == 4, "element_wise_op requires max dims 4");

    const auto * src0_ptr = reinterpret_cast<const uint8_t *>(src0->get_data());
    const auto * src1_ptr = reinterpret_cast<const uint8_t *>(src1->get_data());
    auto * dst_ptr = reinterpret_cast<uint8_t *>(out->get_data());
    for (int64_t i3 = 0; i3 < out->get_ne(3); i3++) {
        const auto * src0_cube = src0_ptr + i3 * src0->get_nb(3);
        const auto * src1_cube = src1_ptr + (i3 % src1->get_ne(3)) * src1->get_nb(3);
        auto * dst_cube = dst_ptr + i3 * out->get_nb(3);
        for (int64_t i2 = 0; i2 < out->get_ne(2); i2++) {
            const auto * src0_plane = src0_cube + i2 * src0->get_nb(2);
            const auto * src1_plane = src1_cube + (i2 % src1->get_ne(2)) * src1->get_nb(2);
            auto * dst_plane = dst_cube + i2 * out->get_nb(2);
            for (int64_t i1 = 0; i1 < out->get_ne(1); i1++) {
                // TODO: prefetch row?
                auto * src0_row = src0_plane + i1 * src0->get_nb(1);
                auto * src1_row = src1_plane + (i1 % src1->get_ne(1)) * src1->get_nb(1);
                auto * dst_row = reinterpret_cast<float *>(dst_plane + i1 * out->get_nb(1));
                _RowFunc(reinterpret_cast<const _TySrc *>(src0_row), reinterpret_cast<const _TySrc *>(src1_row),
                         static_cast<size_t>(out->get_ne(0)), reinterpret_cast<_TyDst *>(dst_row));
            }
        }
    const auto * src0_ptr = reinterpret_cast<const uint8_t *>(src0->get_data());
    const auto * src1_ptr = reinterpret_cast<const uint8_t *>(src1->get_data());
    auto * dst_ptr = reinterpret_cast<uint8_t *>(out->get_data());
    auto total_rows = out->get_ne(3) * out->get_ne(2) * out->get_ne(1);
    const auto rows_per_box = out->get_ne(2) * out->get_ne(1);
    const auto start_end = hexagon::get_thread_work_slice(total_rows, tidx, tcnt);
    for (int64_t ir = start_end.first; ir < start_end.second; ++ir) {
        const auto i03 = ir / rows_per_box;
        const auto i02 = ir / out->get_ne(1) - i03 * out->get_ne(2);
        const auto i01 = ir % out->get_ne(1);
        const auto i13 = i03 % src1->get_ne(3);
        const auto i12 = i02 % src1->get_ne(2);
        const auto i11 = i01 % src1->get_ne(1);
        auto * src0_row = src0_ptr + i03 * src0->get_nb(3) + i02 * src0->get_nb(2) + i01 * src0->get_nb(1);
        auto * src1_row = src1_ptr + i13 * src1->get_nb(3) + i12 * src1->get_nb(2) + i11 * src1->get_nb(1);
        auto * dst_row = dst_ptr + i03 * out->get_nb(3) + i02 * out->get_nb(2) + i01 * out->get_nb(1);
        _RowFunc(reinterpret_cast<const _TySrc *>(src0_row), reinterpret_cast<const _TySrc *>(src1_row),
                 static_cast<size_t>(out->get_ne(0)), reinterpret_cast<_TyDst *>(dst_row));
    }

    return true;
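Note: the rewritten loop flattens the (i3, i2, i1) nest into a single row index so a contiguous range of rows can be handed to each thread. A standalone check of the index recovery used above (the extents are arbitrary example values, not from this commit):

    #include <cassert>
    #include <cstdint>

    int main() {
        const int64_t ne3 = 2, ne2 = 3, ne1 = 5;     // example output extents
        const int64_t total_rows = ne3 * ne2 * ne1;  // 30 rows, handed out in flat ranges
        const int64_t rows_per_box = ne2 * ne1;      // rows per outermost slice

        for (int64_t ir = 0; ir < total_rows; ++ir) {
            const int64_t i03 = ir / rows_per_box;        // outermost index
            const int64_t i02 = ir / ne1 - i03 * ne2;     // middle index
            const int64_t i01 = ir % ne1;                 // row inside the plane
            // The recovered indices recompose to the flat row index used for the split.
            assert(ir == (i03 * ne2 + i02) * ne1 + i01);
        }
        return 0;
    }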
@@ -5,7 +5,7 @@

namespace hexagon {

typedef bool (*compute_func_type)(tensor * dst);
typedef bool (*compute_func_type)(tensor * dst, size_t tidx, size_t tcnt);
typedef bool (*op_is_supported_func_type)(const npu_device_tensor_spec & src0, const npu_device_tensor_spec & src1,
                                          const npu_device_tensor_spec & dst, npu_device_tensor_op op);
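Note: every op entry point now receives its thread index and the total thread count in addition to the destination tensor, and is expected to touch only its own slice. A minimal sketch of an op written against the new compute_func_type (my_nop_op is a made-up placeholder, not an op in this commit):

    #include "tensor.hpp"
    #include "util.hpp"

    // Matches: typedef bool (*compute_func_type)(tensor * dst, size_t tidx, size_t tcnt);
    static bool my_nop_op(hexagon::tensor * dst, size_t tidx, size_t tcnt) {
        if (!dst) {
            return false;
        }

        const auto rows = dst->get_ne(3) * dst->get_ne(2) * dst->get_ne(1);
        const auto slice = hexagon::get_thread_work_slice(rows, tidx, tcnt);
        for (int64_t ir = slice.first; ir < slice.second; ++ir) {
            // ... process row `ir` of dst only ...
        }
        return true;
    }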
@@ -71,43 +71,45 @@ inline float vec_dot_product_f32_f32(const float * src0, const float * src1, siz

namespace hexagon {

bool mul_mat_f32(hexagon::tensor * out) {
bool mul_mat_f32(hexagon::tensor * out, size_t tidx, size_t tcnt) {
    if (!out) {
        return false;
    }

    static_assert(DEVICE_TENSOR_MAX_DIMS == 4, "mul_mat_f32 requires max dims 4");
    auto * src0 = out->get_src(0);
    auto * src1 = out->get_src(1);
    if (!src0 || !src1) {
        return true;  // skip if no src
    }

    static_assert(DEVICE_TENSOR_MAX_DIMS == 4, "mul_mat_f32 requires max dims 4");
    const auto r02 = src1->get_ne(2) / src0->get_ne(2);
    const auto r03 = src1->get_ne(3) / src0->get_ne(3);
    const auto * src0_ptr = reinterpret_cast<const uint8_t *>(src0->get_data());
    const auto * src1_ptr = reinterpret_cast<const uint8_t *>(src1->get_data());
    auto * dst_ptr = reinterpret_cast<uint8_t *>(out->get_data());
    const auto total_planes = out->get_ne(3) * out->get_ne(2);

    const auto r02 = src1->get_ne(2) / src0->get_ne(2);
    const auto r03 = src1->get_ne(3) / src0->get_ne(3);
    const auto * src0_ptr = reinterpret_cast<const uint8_t *>(src0->get_data());
    const auto * src1_ptr = reinterpret_cast<const uint8_t *>(src1->get_data());
    auto * dst_ptr = reinterpret_cast<uint8_t *>(out->get_data());
    for (int64_t i3 = 0; i3 < out->get_ne(3); i3++) {
        const auto * src0_cube = src0_ptr + i3 / r03 * src0->get_nb(3);
        const auto * src1_cube = src1_ptr + i3 * src1->get_nb(3);
        auto * dst_cube = dst_ptr + i3 * out->get_nb(3);
        for (int64_t i2 = 0; i2 < out->get_ne(2); i2++) {
            const auto * src0_plane = src0_cube + i2 / r02 * src0->get_nb(2);
            const auto * src1_plane = src1_cube + i2 * src1->get_nb(2);
            auto * dst_plane = dst_cube + i2 * out->get_nb(2);
            for (int64_t i1 = 0; i1 < out->get_ne(1); i1++) {
                // TODO: prefetch row?
                auto * src1_row = src1_plane + i1 * src1->get_nb(1);
                auto * dst_row = reinterpret_cast<float *>(dst_plane + i1 * out->get_nb(1));
                for (int64_t i0 = 0; i0 < out->get_ne(0); i0++) {
                    auto * src0_row = src0_plane + i0 * src0->get_nb(1);
                    // TODO: figure out how to handle an entire row
                    *dst_row++ =
                        vec_dot_product_f32_f32(reinterpret_cast<const float *>(src0_row),
                                                reinterpret_cast<const float *>(src1_row), (size_t) src0->get_ne(0));
                }
    const auto start_end_plane = (total_planes >= tcnt) ? get_thread_work_slice(total_planes, tidx, tcnt) :
                                                          std::pair<int64_t, int64_t>{ 0, total_planes };
    const auto start_end_row = (total_planes >= tcnt) ? std::pair<int64_t, int64_t>{ 0, out->get_ne(1) } :
                                                        get_thread_work_slice(out->get_ne(1), tidx, tcnt);
    for (int64_t ip = start_end_plane.first; ip < start_end_plane.second; ip++) {
        const auto i3 = ip / out->get_ne(2);
        const auto i2 = ip - i3 * out->get_ne(2);
        const auto * src0_plane = src0_ptr + i3 / r03 * src0->get_nb(3) + i2 / r02 * src0->get_nb(2);
        const auto * src1_plane = src1_ptr + i3 * src1->get_nb(3) + i2 * src1->get_nb(2);
        auto * dst_plane = dst_ptr + i3 * out->get_nb(3) + i2 * out->get_nb(2);
        for (int64_t i1 = start_end_row.first; i1 < start_end_row.second; i1++) {
            // TODO: prefetch row?
            auto * src1_row = src1_plane + i1 * src1->get_nb(1);
            auto * dst_row = reinterpret_cast<float *>(dst_plane + i1 * out->get_nb(1));
            for (int64_t i0 = 0; i0 < out->get_ne(0); i0++) {
                auto * src0_row = src0_plane + i0 * src0->get_nb(1);
                // TODO: figure out how to handle an entire row
                *dst_row++ =
                    vec_dot_product_f32_f32(reinterpret_cast<const float *>(src0_row),
                                            reinterpret_cast<const float *>(src1_row), (size_t) src0->get_ne(0));
            }
        }
    }
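Note: the new split picks the partition axis from the output shape: when there are at least as many (i3, i2) planes as threads, whole planes are distributed and each thread walks every row of its planes; otherwise the rows along dimension 1 are distributed instead. A small standalone sketch of that decision, with made-up shapes:

    #include <cstdint>
    #include <cstdio>

    // Mirrors the choice made in mul_mat_f32: split by plane when possible,
    // otherwise fall back to splitting the rows inside the few planes.
    static void describe_split(int64_t ne3, int64_t ne2, int64_t ne1, size_t tcnt) {
        const int64_t total_planes = ne3 * ne2;
        if (total_planes >= int64_t(tcnt)) {
            std::printf("%lld planes over %zu threads (each thread owns whole planes)\n",
                        (long long) total_planes, tcnt);
        } else {
            std::printf("only %lld plane(s): split the %lld rows over %zu threads instead\n",
                        (long long) total_planes, (long long) ne1, tcnt);
        }
    }

    int main() {
        describe_split(4, 8, 512, 4);   // 32 planes -> per-plane split
        describe_split(1, 1, 512, 4);   // 2D matmul -> per-row split
        return 0;
    }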
@@ -20,7 +20,7 @@ inline bool is_addr_aligned(void * addr) {
    return unaligned_bytes(addr) == 0;
}

bool mul_mat_f32(tensor * out);
bool mul_mat_f32(tensor * out, size_t tidx, size_t tcnt);
bool is_mul_mat_supported(const npu_device_tensor_spec & src0, const npu_device_tensor_spec & src1,
                          const npu_device_tensor_spec & dst, npu_device_tensor_op op);
@@ -81,10 +81,7 @@ class tensor {
    tensor * _src[kMaxTensorSrc] = {};
    uint8_t * _data = nullptr;

    tensor(const tensor &) = delete;
    void operator=(const tensor &) = delete;
    tensor(tensor &&) = delete;
    void operator=(tensor &&) = delete;
    DISABLE_COPY_AND_MOVE(tensor);
};

}  // namespace hexagon
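Note: the four hand-written deleted special members collapse into the DISABLE_COPY_AND_MOVE macro introduced in util.hpp further down. For reference, the expansion is equivalent to the following (illustrative "widget" class, not from this commit):

    class widget {
    public:
        widget() = default;

    private:
        // DISABLE_COPY_AND_MOVE(widget) expands to exactly these four deletions:
        widget(const widget &) = delete;
        void operator=(const widget &) = delete;
        widget(widget &&) = delete;
        void operator=(widget &&) = delete;
    };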
@@ -0,0 +1,190 @@
#pragma once

#include <qurt.h>

#include <array>
#include <atomic>
#include <cstdint>
#include <memory>
#include <string>

#include "util.hpp"

namespace hexagon {

constexpr const size_t kMaxThreadCount = 4;
constexpr const size_t kDefaultStackSize = 1024 * 16;  // 16KB
constexpr const unsigned long long kThreadTaskPendingBit = 1;

template <size_t _stack_size> class qurt_thread {
  public:
    typedef void (*qurt_thread_func_type)(qurt_thread * thread, void * arg);

    explicit qurt_thread(const std::string & thread_name, qurt_thread_func_type thread_func, void * arg,
                         unsigned short priority) {
        DEVICE_LOG_DEBUG("qurt_thread.create: %s", thread_name.c_str());
        qurt_thread_attr_init(&_attributes);
        qurt_thread_attr_set_name(&_attributes, (char *) thread_name.c_str());
        qurt_thread_attr_set_stack_addr(&_attributes, _stack);
        qurt_thread_attr_set_stack_size(&_attributes, _stack_size);
        qurt_thread_attr_set_priority(&_attributes, priority);

        _func = thread_func;
        _arg = arg;
        auto ret = qurt_thread_create(
            &_tid, &_attributes, reinterpret_cast<void (*)(void *)>(&qurt_thread::thread_func_impl), (void *) this);
        if (ret != QURT_EOK) {
            DEVICE_LOG_ERROR("Failed to create thread: %d", (int) ret);
            _func = nullptr;
            _arg = nullptr;
            return;
        }

        DEVICE_LOG_DEBUG("qurt_thread.created: %s, id: %d", thread_name.c_str(), (int) _tid);
    }

    ~qurt_thread() {
        DEVICE_LOG_DEBUG("qurt_thread.destroy: %d", (int) _tid);
        int thread_exit_code = QURT_EOK;
        auto ret = qurt_thread_join(_tid, &thread_exit_code);
        if (ret != QURT_EOK && ret != QURT_ENOTHREAD) {
            DEVICE_LOG_ERROR("Failed to join thread: %d", (int) ret);
            return;
        }

        if (thread_exit_code != QURT_EOK) {
            DEVICE_LOG_ERROR("Thread exit code: %d", (int) thread_exit_code);
        }
    }

    bool is_valid() const { return _tid != 0 && _func != nullptr; }

  private:
    static void thread_func_impl(qurt_thread * thread) {
        if (thread->_func) {
            thread->_func(thread, thread->_arg);
        }

        qurt_thread_exit(QURT_EOK);
    }

    uint8_t _stack[_stack_size] = {};
    qurt_thread_t _tid;
    qurt_thread_attr_t _attributes;
    qurt_thread_func_type _func = nullptr;
    void * _arg = nullptr;

    DISABLE_COPY_AND_MOVE(qurt_thread);
};

using qurt_thread_ptr = std::unique_ptr<qurt_thread<kDefaultStackSize>>;

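Note: the wrapper above can also be used on its own; the thread runs one function and is joined by the destructor. A minimal device-side sketch, assuming the QURT environment (print_arg and the payload are illustrative names):

    #include "thread_pool.hpp"

    static void print_arg(hexagon::qurt_thread<hexagon::kDefaultStackSize> * thread, void * arg) {
        NPU_UNUSED(thread);
        DEVICE_LOG_DEBUG("worker got: %d", *reinterpret_cast<int *>(arg));
    }

    void spawn_one_worker() {
        int value = 7;
        hexagon::qurt_thread_ptr worker = std::make_unique<hexagon::qurt_thread<hexagon::kDefaultStackSize>>(
            "demo_worker", &print_arg, &value, QURT_THREAD_ATTR_PRIORITY_DEFAULT);
        if (!worker || !worker->is_valid()) {
            DEVICE_LOG_ERROR("failed to start demo worker");
            return;
        }
        // `value` must stay alive until the thread finishes; the unique_ptr's
        // destructor joins the thread before this function returns.
    }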
template <size_t _thread_count> class thread_pool {
    static_assert(_thread_count > 1, "Thread count must be greater than 1");
    constexpr const static size_t kMaxThreadCount = _thread_count - 1;

  public:
    typedef qurt_thread<kDefaultStackSize> thread_type;
    typedef void (*task_type)(thread_pool * pool, size_t thread_idx, size_t thread_count, void * arg);

    thread_pool() {
        std::string thread_name_base = "thread_pool_";
        qurt_barrier_init(&_pending, kMaxThreadCount + 1);
        qurt_barrier_init(&_completed, kMaxThreadCount + 1);
        for (size_t i = 0; i < kMaxThreadCount; ++i) {
            auto & thread_arg = _thread_args[i];
            thread_arg.pool = this;
            thread_arg.thread_idx = i + 1;

            auto thread = std::make_unique<thread_type>(
                thread_name_base + std::to_string(i),
                reinterpret_cast<thread_type::qurt_thread_func_type>(&thread_pool::thread_func_impl), &thread_arg,
                QURT_THREAD_ATTR_PRIORITY_DEFAULT);
            if (!thread->is_valid()) {
                DEVICE_LOG_ERROR("Failed to create thread: %zu", i);
                // destroy all barriers and threads at destructor
                return;
            }

            _threads[i] = std::move(thread);
        }
        DEVICE_LOG_DEBUG("thread_pool.created: %zu", kMaxThreadCount);
    }

    ~thread_pool() {
        DEVICE_LOG_DEBUG("thread_pool.destroy");
        _thread_exit = true;
        qurt_barrier_wait(&_pending);  // release all task threads

        for (auto & thread : _threads) {
            thread.reset();
        }

        qurt_barrier_destroy(&_completed);
        qurt_barrier_destroy(&_pending);
    }

    bool sync_execute(task_type task, void * arg) {
        if (!task) {
            DEVICE_LOG_ERROR("Invalid task");
            return false;
        }

        _task = task;
        _arg = arg;
        qurt_barrier_wait(&_pending);

        task(this, 0, kMaxThreadCount + 1, arg);
        DEVICE_LOG_DEBUG("main_thread.task_completed: 0");

        qurt_barrier_wait(&_completed);

        _task = nullptr;
        _arg = nullptr;
        return true;
    }

  private:
    struct thread_pool_arg {
        thread_pool * pool = nullptr;
        size_t thread_idx = 0;
    };

    static void thread_func_impl(thread_type * thread, thread_pool_arg * arg) {
        NPU_UNUSED(thread);

        DEVICE_LOG_DEBUG("thread_func_impl.start: %zu", arg->thread_idx);

        auto & pool = *arg->pool;
        for (;;) {
            qurt_barrier_wait(&pool._pending);
            if (pool._thread_exit) {
                DEVICE_LOG_DEBUG("thread_func_impl.exit: %zu", arg->thread_idx);
                break;
            }

            auto task = pool._task;
            if (task) {
                task(arg->pool, arg->thread_idx, kMaxThreadCount + 1, pool._arg);
            }

            DEVICE_LOG_DEBUG("thread_func_impl.task_completed: %zu", arg->thread_idx);
            qurt_barrier_wait(&pool._completed);
        }

        DEVICE_LOG_DEBUG("thread_func_impl.end: %zu", arg->thread_idx);
    }

    std::atomic_bool _thread_exit = false;
    std::array<qurt_thread_ptr, kMaxThreadCount> _threads;
    thread_pool_arg _thread_args[kMaxThreadCount] = {};
    qurt_barrier_t _pending = {};
    qurt_barrier_t _completed = {};
    task_type _task = nullptr;
    void * _arg = nullptr;

    DISABLE_COPY_AND_MOVE(thread_pool);
};

using default_thread_pool = thread_pool<kMaxThreadCount>;

}  // namespace hexagon
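Note: the pool parks its workers on the `_pending` barrier, publishes `_task`/`_arg`, runs the task on the calling thread as index 0, then meets the workers again on `_completed`. For readers without a Hexagon toolchain, a rough host-side analogue of that contract using C++20 std::barrier (illustrative only; the real code above uses QURT threads and qurt_barrier_t):

    #include <barrier>
    #include <cstdio>
    #include <thread>
    #include <vector>

    // Every participant (workers plus the caller as index 0) runs the task once;
    // the caller only returns after all of them have arrived at the barrier.
    void sync_execute_analogue(size_t thread_count) {
        std::barrier completed(static_cast<std::ptrdiff_t>(thread_count));
        auto task = [&](size_t idx) {
            std::printf("task ran on thread %zu of %zu\n", idx, thread_count);
            completed.arrive_and_wait();    // plays the role of qurt_barrier_wait(&_completed)
        };

        std::vector<std::thread> workers;
        for (size_t i = 1; i < thread_count; ++i) {
            workers.emplace_back(task, i);
        }
        task(0);                            // the calling thread participates, like sync_execute()
        for (auto & w : workers) {
            w.join();
        }
    }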
@@ -2,6 +2,10 @@

#include <HAP_farf.h>

#include <algorithm>
#include <cstdint>
#include <utility>

#include "hexagon_npu.h"

#define DEVICE_LOG_ERROR(...) FARF(FATAL, __VA_ARGS__)

@@ -16,9 +20,24 @@
#    define DEVICE_LOG_DEBUG(...) (void) 0
#endif

// TODO: reuse the declaration at host
#define DISABLE_COPY(class_name)              \
    class_name(const class_name &) = delete;  \
    void operator=(const class_name &) = delete

#define DISABLE_MOVE(class_name)         \
    class_name(class_name &&) = delete;  \
    void operator=(class_name &&) = delete

#define DISABLE_COPY_AND_MOVE(class_name) \
    DISABLE_COPY(class_name);             \
    DISABLE_MOVE(class_name)

#define NPU_UNUSED(x) (void) (x)

namespace hexagon {

constexpr const char * op_get_name(npu_device_tensor_op op) {
inline constexpr const char * op_get_name(npu_device_tensor_op op) {
    switch (op) {
        case NPU_OP_MUL_MAT:
            return "MUL_MAT";

@@ -33,4 +52,11 @@ constexpr const char * op_get_name(npu_device_tensor_op op) {
    }
}

inline constexpr std::pair<int64_t, int64_t> get_thread_work_slice(int64_t total, size_t tidx, size_t tcnt) {
    const auto elements_per_thread = (total + tcnt - 1) / tcnt;
    const auto start = tidx * elements_per_thread;
    const auto end = std::min<int64_t>(start + elements_per_thread, total);
    return { start, end };
}

}  // namespace hexagon
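Note: one property of get_thread_work_slice worth keeping in mind is that when total < tcnt the trailing threads simply receive empty ranges, which is why callers such as mul_mat_f32 above pick an axis with enough elements before splitting. A quick standalone check, duplicating only the helper's arithmetic so it can be compiled off-device:

    #include <algorithm>
    #include <cassert>
    #include <cstdint>
    #include <utility>

    // Re-states the slicing rule of hexagon::get_thread_work_slice.
    constexpr std::pair<int64_t, int64_t> slice(int64_t total, size_t tidx, size_t tcnt) {
        const int64_t per_thread = (total + int64_t(tcnt) - 1) / int64_t(tcnt);
        const int64_t start = int64_t(tidx) * per_thread;
        return { start, std::min(start + per_thread, total) };
    }

    int main() {
        // 3 planes over 4 threads: thread 3 gets an empty [3, 3) range and does nothing.
        static_assert(slice(3, 0, 4) == std::pair<int64_t, int64_t>(0, 1));
        static_assert(slice(3, 3, 4) == std::pair<int64_t, int64_t>(3, 3));
        // 30 rows over 4 threads: ceil(30 / 4) = 8 rows per thread, the last one gets 6.
        static_assert(slice(30, 3, 4) == std::pair<int64_t, int64_t>(24, 30));
        assert(slice(30, 1, 4).second - slice(30, 1, 4).first == 8);
        return 0;
    }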
@@ -0,0 +1,42 @@

file(GLOB qnn_srcs "${CMAKE_CURRENT_LIST_DIR}/*.cpp")

add_library(qnn-backend STATIC
    ${qnn_srcs}
)

target_include_directories(qnn-backend PRIVATE
    ${GGML_QNN_SDK_PATH}/include/QNN/
    ${CMAKE_CURRENT_LIST_DIR}/
    ${CMAKE_CURRENT_LIST_DIR}/../
    ${CMAKE_CURRENT_LIST_DIR}/../../
    ${CMAKE_CURRENT_LIST_DIR}/../../../include/ # TODO: figure out how to remove this
    ${CMAKE_CURRENT_LIST_DIR}/../shared/
)

target_link_directories(qnn-backend PRIVATE
    runtime-common
)

if(GGML_QNN_ENABLE_CPU_BACKEND)
    message("GGML_QNN_ENABLE_CPU_BACKEND is enabled")
    target_compile_definitions(qnn-backend PUBLIC GGML_QNN_ENABLE_CPU_BACKEND)
else()
    message("GGML_QNN_ENABLE_CPU_BACKEND is disabled")
endif()

if(CMAKE_SYSTEM_NAME STREQUAL "Android")
    set(QNN_DEFAULT_LIB_SEARCH_PATH "/data/local/tmp/" CACHE STRING "customized library search path for QNN backend")
elseif(CMAKE_SYSTEM_NAME STREQUAL "Windows" OR CMAKE_SYSTEM_NAME STREQUAL "Linux")
    set(QNN_DEFAULT_LIB_SEARCH_PATH "" CACHE STRING "customized library search path for QNN backend")
else()
    message(FATAL_ERROR "QNN now only available on Android, Windows and Linux")
endif()

if(NOT "${QNN_DEFAULT_LIB_SEARCH_PATH}" STREQUAL "")
    string(REGEX REPLACE "/$" "" QNN_DEFAULT_LIB_SEARCH_PATH "${QNN_DEFAULT_LIB_SEARCH_PATH}")
endif()

message("GGML_QNN_DEFAULT_LIB_SEARCH_PATH: ${QNN_DEFAULT_LIB_SEARCH_PATH}")
target_compile_definitions(qnn-backend PUBLIC GGML_QNN_DEFAULT_LIB_SEARCH_PATH="${QNN_DEFAULT_LIB_SEARCH_PATH}")
@ -1,88 +0,0 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<OpDefCollection PackageName="GgmlOpPackage" Domain="ggml" Version="1.0">
|
||||
<OpDefList>
|
||||
<OpDef>
|
||||
<Name>GgmlMulMat</Name>
|
||||
<Description>
|
||||
<Content>
|
||||
GGML MulMat operator
|
||||
</Content>
|
||||
</Description>
|
||||
|
||||
<Input>
|
||||
<Name>in[0]</Name>
|
||||
<Description>
|
||||
<Content>src0</Content>
|
||||
</Description>
|
||||
<Mandatory>true</Mandatory>
|
||||
<Datatype>BACKEND_SPECIFIC</Datatype>
|
||||
<Shape>
|
||||
<Rank>4D</Rank>
|
||||
<Layout>NHWC</Layout>
|
||||
<Text>[N, C, H , W]</Text>
|
||||
</Shape>
|
||||
</Input>
|
||||
|
||||
<Input>
|
||||
<Name>in[1]</Name>
|
||||
<Description>
|
||||
<Content>src1</Content>
|
||||
</Description>
|
||||
<Mandatory>true</Mandatory>
|
||||
<Datatype>BACKEND_SPECIFIC</Datatype>
|
||||
<Shape>
|
||||
<Rank>4D</Rank>
|
||||
<Layout>NHWC</Layout>
|
||||
<Text>[N, C, H , W]</Text>
|
||||
</Shape>
|
||||
</Input>
|
||||
|
||||
<Output>
|
||||
<Name>out[0]</Name>
|
||||
<Description>
|
||||
<Content>dst</Content>
|
||||
</Description>
|
||||
<Mandatory>true</Mandatory>
|
||||
<Datatype>BACKEND_SPECIFIC</Datatype>
|
||||
<Shape>
|
||||
<Rank>4D</Rank>
|
||||
<Text> [N, C, H , W] </Text>
|
||||
</Shape>
|
||||
</Output>
|
||||
|
||||
<!--This Op is implemented on these Backends-->
|
||||
<SupportedBackend>HTP</SupportedBackend>
|
||||
</OpDef>
|
||||
|
||||
</OpDefList>
|
||||
|
||||
<SupplementalOpDefList Backend="HTP">
|
||||
<SupportedOps>
|
||||
<OpName>GgmlMulMat</OpName>
|
||||
</SupportedOps>
|
||||
|
||||
<!--ggml-mul-->
|
||||
<SupplementalOpDef>
|
||||
<Name>GgmlMulMat</Name>
|
||||
|
||||
<Input>
|
||||
<Name>in[0]</Name>
|
||||
<Datatype>QNN_DATATYPE_FLOAT_16</Datatype>
|
||||
<Datatype>QNN_DATATYPE_FLOAT_32</Datatype>
|
||||
</Input>
|
||||
<Input>
|
||||
<Name>in[1]</Name>
|
||||
<Datatype>QNN_DATATYPE_FLOAT_16</Datatype>
|
||||
<Datatype>QNN_DATATYPE_FLOAT_32</Datatype>
|
||||
</Input>
|
||||
|
||||
<Output>
|
||||
<Name>out[0]</Name>
|
||||
<Datatype>QNN_DATATYPE_FLOAT_16</Datatype>
|
||||
<Datatype>QNN_DATATYPE_FLOAT_32</Datatype>
|
||||
</Output>
|
||||
</SupplementalOpDef>
|
||||
|
||||
</SupplementalOpDefList>
|
||||
|
||||
</OpDefCollection>
|
||||
|
|
@ -1,357 +0,0 @@
|
|||
# check all setup prerequisites if the command goal is not clean
|
||||
ifneq ($(MAKECMDGOALS),clean)
|
||||
ifndef QNN_INCLUDE
|
||||
$(info "INFO: Qnn include not explicitly defined, attempting to use QNN_SDK_ROOT if it is valid")
|
||||
QNN_INCLUDE := $(QNN_SDK_ROOT)/include/QNN
|
||||
endif
|
||||
ifeq ($(wildcard $(QNN_INCLUDE)),)
|
||||
$(error "ERROR: QNN_INCLUDE path is not set. QNN include paths must be set to obtain BE headers necessary to compile the package")
|
||||
endif
|
||||
ifndef QNN_TARGET_LIB
|
||||
$(info "INFO: Qnn target not explicitly defined, attempting to use QNN_SDK_ROOT if it is valid")
|
||||
QNN_TARGET_LIB := $(QNN_SDK_ROOT)/lib/aarch64-android
|
||||
endif
|
||||
ifeq ($(wildcard $(QNN_TARGET_LIB)),)
|
||||
ifeq ($(MAKECMDGOALS),htp_aarch64)
|
||||
$(error "ERROR: QNN_TARGET_LIB is needed to compile package for aarch64")
|
||||
else ifeq ($(MAKECMDGOALS),all)
|
||||
$(info "WARNING:QNN_TARGET_LIB may need to be defined to compile packages")
|
||||
endif
|
||||
endif
|
||||
|
||||
ifndef HEXAGON_SDK_ROOT
|
||||
$(error "ERROR: HEXAGON_SDK_ROOT is not set. Hexagon-SDK path must be set to the latest hexagon-sdk-x.y.z")
|
||||
endif
|
||||
|
||||
ifeq ($(wildcard $(HEXAGON_SDK_ROOT)),)
|
||||
$(error "ERROR: HEXAGON_SDK_ROOT is not set correctly. Please set HEXAGON_SDK_ROOT to latest hexagon-sdk-X.Y.Z path")
|
||||
endif
|
||||
|
||||
HEXAGON_SDK_BASE := $(dir $(HEXAGON_SDK_ROOT))
|
||||
|
||||
$(info "HEXAGON_SDK_ROOT is [${HEXAGON_SDK_ROOT}]")
|
||||
# Users should note that the tools version may change between hexagon sdk versions
|
||||
# Following combination of SDK and Tool version is supported
|
||||
# fix the sdk root for new versions
|
||||
HEXAGON_SDK_ROOT_V68 := $(HEXAGON_SDK_ROOT)
|
||||
HEXAGON_SDK_ROOT_V69 := $(HEXAGON_SDK_ROOT)
|
||||
HEXAGON_SDK_ROOT_V73 := $(HEXAGON_SDK_ROOT)
|
||||
HEXAGON_SDK_ROOT_V75 := $(HEXAGON_SDK_ROOT)
|
||||
HEXAGON_SDK_ROOT_V79 := $(HEXAGON_SDK_ROOT)
|
||||
|
||||
#Updated to point to latest sdk to match with libQnnHtp.so
|
||||
HEXAGON_SDK_ROOT_X86 := $(HEXAGON_SDK_ROOT)
|
||||
HEXAGON_TOOLS_VERSION_V68 := 8.7.06
|
||||
HEXAGON_TOOLS_VERSION_V69 := 8.7.06
|
||||
HEXAGON_TOOLS_VERSION_V73 := 8.7.06
|
||||
HEXAGON_TOOLS_VERSION_V75 := 8.7.06
|
||||
HEXAGON_TOOLS_VERSION_V79 := 8.7.06
|
||||
|
||||
#Updated to point to latest sdk to match with libQnnHtp.so
|
||||
HEXAGON_TOOLS_VERSION_X86 := 8.7.06
|
||||
|
||||
ifndef ANDROID_NDK_ROOT
|
||||
ifeq ($(MAKECMDGOALS),htp_aarch64)
|
||||
$(error "ERROR: ANDROID_NDK_ROOT is not set. Android NDK path must be set to compile package for aarch64")
|
||||
else ifeq ($(MAKECMDGOALS),all)
|
||||
$(info "WARNING: ANDROID_NDK_ROOT is not set. Android NDK path must be set to compile package for aarch64")
|
||||
endif
|
||||
endif
|
||||
|
||||
ifndef PACKAGE_NAME
|
||||
export
|
||||
PACKAGE_NAME := $(notdir $(shell pwd))
|
||||
$(info "INFO: No package name defined. Using current directory name: $(PACKAGE_NAME) as the package name")
|
||||
endif
|
||||
|
||||
WORK := build
|
||||
SRC_DIR := src
|
||||
OP_SRC_DIR := src/ops
|
||||
OP_INCLUDE_DIR := ./include
|
||||
OP_INCLUDES = #$(wildcard $(OP_INCLUDE_DIR)/*.h) user defined if any op specific headers are needed, add -I to common flags
|
||||
LIBRARY_NAME := libQnn$(PACKAGE_NAME).so
|
||||
SUPPORTED_TARGETS = x86_64-linux-clang hexagon-v68 hexagon-v69 hexagon-v73 hexagon-v75 hexagon-v79 aarch64-android
|
||||
|
||||
|
||||
COMMON_CXX_FLAGS = -std=c++17 -I$(QNN_INCLUDE) -fPIC -Wall -Wreorder -Wno-missing-braces -Wno-unused-function
|
||||
COMMON_CXX_FLAGS += -Werror -Wno-format -Wno-unused-command-line-argument -fvisibility=default -stdlib=libc++
|
||||
COMMON_CXX_FLAGS += -DQNN_API="__attribute__((visibility(\"default\")))" -D__QAIC_HEADER_EXPORT="__attribute__((visibility(\"default\")))"
|
||||
|
||||
X86_LIBNATIVE_RELEASE_DIR := $(HEXAGON_SDK_ROOT_X86)/tools/HEXAGON_Tools/$(HEXAGON_TOOLS_VERSION_X86)/Tools
|
||||
|
||||
# Ensure hexagon sdk tool version can be retrieved
|
||||
ifeq ($(wildcard $(X86_LIBNATIVE_RELEASE_DIR)/.),)
|
||||
$(error "Cannot retrieve hexagon tools from: $(X86_LIBNATIVE_RELEASE_DIR). \
|
||||
\
|
||||
Please check that hexagon tools version is correct. Expected: $(HEXAGON_TOOLS_VERSION_X86)")
|
||||
endif
|
||||
|
||||
#Check tools for hexagon_v68 are present.
|
||||
ifeq ($(MAKECMDGOALS),htp_v68)
|
||||
ifeq ($(wildcard $(HEXAGON_SDK_ROOT_V68)),)
|
||||
$(error "ERROR: HEXAGON_SDK_ROOT_V68 is set incorrectly. Cannot retrieve $(HEXAGON_SDK_ROOT_V68)")
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(MAKECMDGOALS),htp_v69)
|
||||
ifeq ($(wildcard $(HEXAGON_SDK_ROOT_V69)),)
|
||||
$(error "ERROR: HEXAGON_SDK_ROOT_V69 is set incorrectly. Cannot retrieve $(HEXAGON_SDK_ROOT_V69)")
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(MAKECMDGOALS),htp_v73)
|
||||
ifeq ($(wildcard $(HEXAGON_SDK_ROOT_V73)),)
|
||||
$(error "ERROR: HEXAGON_SDK_ROOT_V73 is set incorrectly. Cannot retrieve $(HEXAGON_SDK_ROOT_V73)")
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(MAKECMDGOALS),htp_v75)
|
||||
ifeq ($(wildcard $(HEXAGON_SDK_ROOT_V75)),)
|
||||
$(error "ERROR: HEXAGON_SDK_ROOT_V75 is set incorrectly. Cannot retrieve $(HEXAGON_SDK_ROOT_V75)")
|
||||
endif
|
||||
endif
|
||||
|
||||
#Check tools for hexagon_v79 are present.
|
||||
ifeq ($(MAKECMDGOALS),htp_v79)
|
||||
ifeq ($(wildcard $(HEXAGON_SDK_ROOT_V79)),)
|
||||
$(error "ERROR: HEXAGON_SDK_ROOT_V79 is set incorrectly. Cannot retrieve $(HEXAGON_SDK_ROOT_V79)")
|
||||
endif
|
||||
endif
|
||||
|
||||
|
||||
|
||||
endif
|
||||
OP_SOURCES = $(wildcard $(OP_SRC_DIR)/*.cpp)
|
||||
OTHER_SOURCES = $(wildcard $(SRC_DIR)/*.cpp)
|
||||
HFILES = $(wildcard $(QNN_INCLUDE)/*.h)
|
||||
HFILES += $(wildcard $(QNN_INCLUDE)/HTP/*.h)
|
||||
HFILES += $(wildcard $(QNN_INCLUDE)/HTP/core/*.h)
|
||||
OP_OBJS = $(patsubst $(SRC_DIR)/%,%,$(patsubst %.cpp,%.o,$(OP_SOURCES)))
|
||||
OTHER_OBJS = $(patsubst $(SRC_DIR)/%,%,$(patsubst %.cpp,%.o,$(OTHER_SOURCES)))
|
||||
|
||||
#======= Assembly ========
|
||||
OP_SOURCES_ASM_X86 += $(wildcard $(OP_SRC_DIR)/x86_asm/*.S)
|
||||
OP_OBJS_ASM_X86 += $(subst /x86_asm/,/,$(patsubst $(SRC_DIR)/%,%,$(patsubst %.S,%.o,$(OP_SOURCES_ASM_X86))))
|
||||
OP_SOURCES_ASM_V68 += $(wildcard $(OP_SRC_DIR)/v68_asm/*.S)
|
||||
OP_OBJS_ASM_V68 += $(subst /v68_asm/,/,$(patsubst $(SRC_DIR)/%,%,$(patsubst %.S,%.o,$(OP_SOURCES_ASM_V68))))
|
||||
OP_SOURCES_ASM_V69 += $(wildcard $(OP_SRC_DIR)/v69_asm/*.S)
|
||||
OP_OBJS_ASM_V69 += $(subst /v69_asm/,/,$(patsubst $(SRC_DIR)/%,%,$(patsubst %.S,%.o,$(OP_SOURCES_ASM_V69))))
|
||||
OP_SOURCES_ASM_V73 += $(wildcard $(OP_SRC_DIR)/v73_asm/*.S)
|
||||
OP_OBJS_ASM_V73 += $(subst /v73_asm/,/,$(patsubst $(SRC_DIR)/%,%,$(patsubst %.S,%.o,$(OP_SOURCES_ASM_V73))))
|
||||
OP_SOURCES_ASM_V75 += $(wildcard $(OP_SRC_DIR)/v75_asm/*.S)
|
||||
OP_OBJS_ASM_V75 += $(subst /v75_asm/,/,$(patsubst $(SRC_DIR)/%,%,$(patsubst %.S,%.o,$(OP_SOURCES_ASM_V75))))
|
||||
OP_SOURCES_ASM_V79 += $(wildcard $(OP_SRC_DIR)/v79_asm/*.S)
|
||||
OP_OBJS_ASM_V79 += $(subst /v79_asm/,/,$(patsubst $(SRC_DIR)/%,%,$(patsubst %.S,%.o,$(OP_SOURCES_ASM_V79))))
|
||||
|
||||
OP_SOURCES_ASM_ANDROID += $(wildcard $(OP_SRC_DIR)/android_asm/*.S)
|
||||
OP_OBJS_ASM_ANDROID += $(subst /android_asm/,/,$(patsubst $(SRC_DIR)/%,%,$(patsubst %.S,%.o,$(OP_SOURCES_ASM_ANDROID))))
|
||||
|
||||
|
||||
all: htp_v68 htp_x86 htp_aarch64
|
||||
|
||||
#============================================================================================================
|
||||
# Setup compiler, compiler instructions and linker for x86
|
||||
X86_CXX ?= clang++-9
|
||||
# Checking if clang++-9 is present. If not switch to clang++
|
||||
ifeq ($(shell $(X86_CXX) -v 2>&1 | grep -c "clang version"), 0)
|
||||
X86_CXX := clang++
|
||||
endif
|
||||
X86_LDFLAGS:= -Wl,--whole-archive -L$(X86_LIBNATIVE_RELEASE_DIR)/libnative/lib -lnative -Wl,--no-whole-archive -lpthread -L$(QNN_SDK_ROOT)/lib/x86_64-linux-clang -lHtpPrepare
|
||||
X86_C_FLAGS := -D__HVXDBL__ -I$(X86_LIBNATIVE_RELEASE_DIR)/libnative/include -ffast-math -DUSE_OS_LINUX
|
||||
X86_CXX_FLAGS = $(COMMON_CXX_FLAGS) $(X86_C_FLAGS) -fomit-frame-pointer -Wno-invalid-offsetof
|
||||
linux_objs =
|
||||
#============================================================================================================
|
||||
# Setup compiler, compiler instructions and linker for hexagon
|
||||
HEXAGON_CXX_FLAGS := $(COMMON_CXX_FLAGS) -mhvx -mhvx-length=128B -mhmx -DUSE_OS_QURT -O2 -Wno-reorder -DPREPARE_DISABLED
|
||||
|
||||
HEXAGON_CXX_FLAGS_V68 := $(HEXAGON_CXX_FLAGS) -mv68 -I$(HEXAGON_SDK_ROOT_V68)/rtos/qurt/computev68/include/qurt -I$(HEXAGON_SDK_ROOT_V68)/rtos/qurt/computev68/include/posix -I$(HEXAGON_SDK_ROOT_V68)/incs -I$(HEXAGON_SDK_ROOT_V68)/incs/stddef
|
||||
HEXAGON_CXX_FLAGS_V69 := $(HEXAGON_CXX_FLAGS) -mv69 -I$(HEXAGON_SDK_ROOT_V69)/rtos/qurt/computev69/include/qurt -I$(HEXAGON_SDK_ROOT_V69)/rtos/qurt/computev69/include/posix -I$(HEXAGON_SDK_ROOT_V69)/incs -I$(HEXAGON_SDK_ROOT_V69)/incs/stddef
|
||||
HEXAGON_CXX_FLAGS_V73 := $(HEXAGON_CXX_FLAGS) -mv73 -I$(HEXAGON_SDK_ROOT_V73)/rtos/qurt/computev73/include/qurt -I$(HEXAGON_SDK_ROOT_V73)/rtos/qurt/computev73/include/posix -I$(HEXAGON_SDK_ROOT_V73)/incs -I$(HEXAGON_SDK_ROOT_V73)/incs/stddef
|
||||
HEXAGON_CXX_FLAGS_V75 := $(HEXAGON_CXX_FLAGS) -mv75 -I$(HEXAGON_SDK_ROOT_V75)/rtos/qurt/computev75/include/qurt -I$(HEXAGON_SDK_ROOT_V75)/rtos/qurt/computev75/include/posix -I$(HEXAGON_SDK_ROOT_V75)/incs -I$(HEXAGON_SDK_ROOT_V75)/incs/stddef
|
||||
HEXAGON_CXX_FLAGS_V79 := $(HEXAGON_CXX_FLAGS) -mv79 -I$(HEXAGON_SDK_ROOT_V79)/rtos/qurt/computev79/include/qurt -I$(HEXAGON_SDK_ROOT_V79)/rtos/qurt/computev79/include/posix -I$(HEXAGON_SDK_ROOT_V79)/incs -I$(HEXAGON_SDK_ROOT_V79)/incs/stddef
|
||||
|
||||
|
||||
HEXAGON_CXX_V68 := $(HEXAGON_SDK_ROOT_V68)/tools/HEXAGON_Tools/$(HEXAGON_TOOLS_VERSION_V68)/Tools/bin/hexagon-clang++
|
||||
HEXAGON_CXX_V69 := $(HEXAGON_SDK_ROOT_V69)/tools/HEXAGON_Tools/$(HEXAGON_TOOLS_VERSION_V69)/Tools/bin/hexagon-clang++
|
||||
HEXAGON_CXX_V73 := $(HEXAGON_SDK_ROOT_V73)/tools/HEXAGON_Tools/$(HEXAGON_TOOLS_VERSION_V73)/Tools/bin/hexagon-clang++
|
||||
HEXAGON_CXX_V75 := $(HEXAGON_SDK_ROOT_V75)/tools/HEXAGON_Tools/$(HEXAGON_TOOLS_VERSION_V75)/Tools/bin/hexagon-clang++
|
||||
HEXAGON_CXX_V79 := $(HEXAGON_SDK_ROOT_V79)/tools/HEXAGON_Tools/$(HEXAGON_TOOLS_VERSION_V79)/Tools/bin/hexagon-clang++
|
||||
|
||||
|
||||
HEX_LDFLAGS =
|
||||
hexagon_objs =
|
||||
#============================================================================================================
|
||||
# Setup compiler, compiler instructions and linker for aarch64
|
||||
AARCH64_C__FLAGS = -D__HVXDBL__ -I$(X86_LIBNATIVE_RELEASE_DIR)/libnative/include -ffast-math -DUSE_OS_LINUX -DANDROID
|
||||
AARCH64_CXX_FLAGS = $(COMMON_CXX_FLAGS) $(AARCH64_C__FLAGS) -fomit-frame-pointer -Wno-invalid-offsetof -Wno-unused-variable -Wno-unused-parameter -Wno-missing-braces -Wno-sign-compare -Wno-unused-private-field -Wno-unused-variable -Wno-ignored-qualifiers -Wno-missing-field-initializers
|
||||
ARM_CLANG_OPTS =--target=aarch64-none-linux-android21 --sysroot=$(ANDROID_NDK_ROOT)/toolchains/llvm/prebuilt/linux-x86_64/sysroot -stdlib=libc++ -static-libstdc++
|
||||
AARCH64_CXX = $(ANDROID_NDK_ROOT)/toolchains/llvm/prebuilt/linux-x86_64/bin/clang++ $(ARM_CLANG_OPTS)
|
||||
AARCH64_LDFLAGS = -L$(QNN_TARGET_LIB) -lQnnHtp -lQnnHtpPrepare
|
||||
aarch64_objs =
|
||||
#============================================================================================================
|
||||
# Setup targets and goals
|
||||
|
||||
htp_x86: X86_BUILD
|
||||
|
||||
htp_v68: HEXAGON_BUILD_V68
|
||||
|
||||
htp_v69: HEXAGON_BUILD_V69
|
||||
|
||||
htp_v73: HEXAGON_BUILD_V73
|
||||
|
||||
htp_v75: HEXAGON_BUILD_V75
|
||||
|
||||
htp_v79: HEXAGON_BUILD_V79
|
||||
|
||||
|
||||
|
||||
htp_aarch64: AARCH64_BUILD
|
||||
|
||||
AARCH64_BUILD: $(WORK)/aarch64-android/$(LIBRARY_NAME)
|
||||
|
||||
HEXAGON_BUILD_V68: $(WORK)/hexagon-v68/$(LIBRARY_NAME)
|
||||
|
||||
HEXAGON_BUILD_V69: $(WORK)/hexagon-v69/$(LIBRARY_NAME)
|
||||
|
||||
HEXAGON_BUILD_V73: $(WORK)/hexagon-v73/$(LIBRARY_NAME)
|
||||
|
||||
HEXAGON_BUILD_V75: $(WORK)/hexagon-v75/$(LIBRARY_NAME)
|
||||
|
||||
HEXAGON_BUILD_V79: $(WORK)/hexagon-v79/$(LIBRARY_NAME)
|
||||
|
||||
|
||||
|
||||
X86_BUILD: $(WORK)/x86_64-linux-clang/$(LIBRARY_NAME)
|
||||
|
||||
|
||||
define build_objs =
|
||||
ifneq ($(filter $(2),$(SUPPORTED_TARGETS)),)
|
||||
$(2)_objs += $(foreach x,$(1),$(WORK)/$(2)/$(x))
|
||||
else
|
||||
$$(error "Unknown target option provided: $(2): Supported targets are: $(SUPPORTED_TARGETS)")
|
||||
endif
|
||||
endef
|
||||
|
||||
$(eval $(call build_objs,$(OTHER_OBJS),x86_64-linux-clang))
|
||||
$(eval $(call build_objs,$(OP_OBJS),x86_64-linux-clang))
|
||||
$(eval $(call build_objs,$(OP_OBJS_ASM_X86),x86_64-linux-clang))
|
||||
$(eval $(call build_objs,$(OTHER_OBJS),hexagon-v68))
|
||||
$(eval $(call build_objs,$(OP_OBJS),hexagon-v68))
|
||||
$(eval $(call build_objs,$(OP_OBJS_ASM_V68),hexagon-v68))
|
||||
$(eval $(call build_objs,$(OTHER_OBJS),hexagon-v69))
|
||||
$(eval $(call build_objs,$(OP_OBJS),hexagon-v69))
|
||||
$(eval $(call build_objs,$(OP_OBJS_ASM_V69),hexagon-v69))
|
||||
$(eval $(call build_objs,$(OTHER_OBJS),hexagon-v73))
|
||||
$(eval $(call build_objs,$(OP_OBJS),hexagon-v73))
|
||||
$(eval $(call build_objs,$(OP_OBJS_ASM_V73),hexagon-v73))
|
||||
$(eval $(call build_objs,$(OTHER_OBJS),hexagon-v75))
|
||||
$(eval $(call build_objs,$(OP_OBJS),hexagon-v75))
|
||||
$(eval $(call build_objs,$(OP_OBJS_ASM_V75),hexagon-v75))
|
||||
$(eval $(call build_objs,$(OTHER_OBJS),hexagon-v79))
|
||||
$(eval $(call build_objs,$(OP_OBJS),hexagon-v79))
|
||||
$(eval $(call build_objs,$(OP_OBJS_ASM_V75),hexagon-v79))
|
||||
|
||||
$(eval $(call build_objs,$(OTHER_OBJS),aarch64-android))
|
||||
$(eval $(call build_objs,$(OP_OBJS),aarch64-android))
|
||||
$(eval $(call build_objs,$(OP_OBJS_ASM_ANDROID),aarch64-android))
|
||||
|
||||
# x86
|
||||
$(WORK)/x86_64-linux-clang $(WORK)/hexagon-v68 $(WORK)/hexagon-v69 $(WORK)/hexagon-v73 $(WORK)/hexagon-v75 $(WORK)/hexagon-v79 $(WORK)/aarch64-android:
|
||||
@mkdir -p $@/ops
|
||||
|
||||
$(WORK)/x86_64-linux-clang/%.o: $(SRC_DIR)/%.cpp | $(WORK)/x86_64-linux-clang
|
||||
$(X86_CXX) $(X86_CXX_FLAGS) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@
|
||||
|
||||
$(WORK)/x86_64-linux-clang/ops/%.o: $(OP_SRC_DIR)/%.cpp | $(WORK)/x86_64-linux-clang
|
||||
$(X86_CXX) $(X86_CXX_FLAGS) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@
|
||||
|
||||
$(WORK)/x86_64-linux-clang/ops/%.o: $(OP_SRC_DIR)/x86_asm/%.S | $(WORK)/x86_64-linux-clang
|
||||
$(X86_CXX) $(X86_CXX_FLAGS) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@
|
||||
|
||||
$(WORK)/x86_64-linux-clang/$(LIBRARY_NAME): $(x86_64-linux-clang_objs) | $(HFILES)
|
||||
$(X86_CXX) -fPIC -std=c++17 -g -shared -o $@ $^ $(X86_LDFLAGS)
|
||||
|
||||
# v68
|
||||
$(WORK)/hexagon-v68/%.o: $(SRC_DIR)/%.cpp | $(WORK)/hexagon-v68
|
||||
$(HEXAGON_CXX_V68) $(HEXAGON_CXX_FLAGS_V68) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@
|
||||
|
||||
$(WORK)/hexagon-v68/ops/%.o: $(OP_SRC_DIR)/%.cpp | $(WORK)/hexagon-v68
|
||||
$(HEXAGON_CXX_V68) $(HEXAGON_CXX_FLAGS_V68) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@
|
||||
|
||||
$(WORK)/hexagon-v68/ops/%.o: $(OP_SRC_DIR)/v68_asm/%.S | $(WORK)/hexagon-v68
|
||||
$(HEXAGON_CXX_V68) $(HEXAGON_CXX_FLAGS_V68) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@
|
||||
|
||||
$(WORK)/hexagon-v68/$(LIBRARY_NAME): $(hexagon-v68_objs) | $(HFILES)
|
||||
$(HEXAGON_CXX_V68) -fPIC -std=c++17 -g -shared -o $@ $^ $(HEX_LDFLAGS)
|
||||
|
||||
# v69
|
||||
$(WORK)/hexagon-v69/%.o: $(SRC_DIR)/%.cpp | $(WORK)/hexagon-v69
|
||||
$(HEXAGON_CXX_V69) $(HEXAGON_CXX_FLAGS_V69) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@
|
||||
|
||||
$(WORK)/hexagon-v69/ops/%.o: $(OP_SRC_DIR)/%.cpp | $(WORK)/hexagon-v69
|
||||
$(HEXAGON_CXX_V69) $(HEXAGON_CXX_FLAGS_V69) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@
|
||||
|
||||
$(WORK)/hexagon-v69/ops/%.o: $(OP_SRC_DIR)/v69_asm/%.S | $(WORK)/hexagon-v69
|
||||
$(HEXAGON_CXX_V69) $(HEXAGON_CXX_FLAGS_V69) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@
|
||||
|
||||
$(WORK)/hexagon-v69/$(LIBRARY_NAME): $(hexagon-v69_objs) | $(HFILES)
|
||||
$(HEXAGON_CXX_V69) -fPIC -std=c++17 -g -shared -o $@ $^ $(HEX_LDFLAGS)
|
||||
|
||||
# v73
|
||||
$(WORK)/hexagon-v73/%.o: $(SRC_DIR)/%.cpp | $(WORK)/hexagon-v73
|
||||
$(HEXAGON_CXX_V73) $(HEXAGON_CXX_FLAGS_V73) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@
|
||||
|
||||
$(WORK)/hexagon-v73/ops/%.o: $(OP_SRC_DIR)/%.cpp | $(WORK)/hexagon-v73
|
||||
$(HEXAGON_CXX_V73) $(HEXAGON_CXX_FLAGS_V73) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@
|
||||
|
||||
$(WORK)/hexagon-v73/ops/%.o: $(OP_SRC_DIR)/v73_asm/%.S | $(WORK)/hexagon-v73
|
||||
$(HEXAGON_CXX_V73) $(HEXAGON_CXX_FLAGS_V73) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@
|
||||
|
||||
$(WORK)/hexagon-v73/$(LIBRARY_NAME): $(hexagon-v73_objs) | $(HFILES)
|
||||
$(HEXAGON_CXX_V73) -fPIC -std=c++17 -g -shared -o $@ $^ $(HEX_LDFLAGS)
|
||||
|
||||
#v75
|
||||
$(WORK)/hexagon-v75/%.o: $(SRC_DIR)/%.cpp | $(WORK)/hexagon-v75
|
||||
$(HEXAGON_CXX_V75) $(HEXAGON_CXX_FLAGS_V75) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@
|
||||
|
||||
$(WORK)/hexagon-v75/ops/%.o: $(OP_SRC_DIR)/%.cpp | $(WORK)/hexagon-v75
|
||||
$(HEXAGON_CXX_V75) $(HEXAGON_CXX_FLAGS_V75) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@
|
||||
|
||||
$(WORK)/hexagon-v75/ops/%.o: $(OP_SRC_DIR)/v75_asm/%.S | $(WORK)/hexagon-v75
|
||||
$(HEXAGON_CXX_V75) $(HEXAGON_CXX_FLAGS_V75) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@
|
||||
|
||||
$(WORK)/hexagon-v75/$(LIBRARY_NAME): $(hexagon-v75_objs) | $(HFILES)
|
||||
$(HEXAGON_CXX_V75) -fPIC -std=c++17 -g -shared -o $@ $^ $(HEX_LDFLAGS)
|
||||
|
||||
#v79
|
||||
$(WORK)/hexagon-v79/%.o: $(SRC_DIR)/%.cpp | $(WORK)/hexagon-v79
|
||||
$(HEXAGON_CXX_V79) $(HEXAGON_CXX_FLAGS_V79) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@
|
||||
|
||||
$(WORK)/hexagon-v79/ops/%.o: $(OP_SRC_DIR)/%.cpp | $(WORK)/hexagon-v79
|
||||
$(HEXAGON_CXX_V79) $(HEXAGON_CXX_FLAGS_V79) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@
|
||||
|
||||
$(WORK)/hexagon-v79/ops/%.o: $(OP_SRC_DIR)/v79_asm/%.S | $(WORK)/hexagon-v79
|
||||
$(HEXAGON_CXX_V79) $(HEXAGON_CXX_FLAGS_V79) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@
|
||||
|
||||
$(WORK)/hexagon-v79/$(LIBRARY_NAME): $(hexagon-v79_objs) | $(HFILES)
|
||||
$(HEXAGON_CXX_V79) -fPIC -std=c++17 -g -shared -o $@ $^ $(HEX_LDFLAGS)
|
||||
|
||||
|
||||
|
||||
# aarch64
|
||||
$(WORK)/aarch64-android/%.o: $(SRC_DIR)/%.cpp | $(WORK)/aarch64-android
|
||||
$(AARCH64_CXX) $(AARCH64_CXX_FLAGS) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@
|
||||
|
||||
$(WORK)/aarch64-android/ops/%.o: $(OP_SRC_DIR)/%.cpp | $(WORK)/aarch64-android
|
||||
$(AARCH64_CXX) $(AARCH64_CXX_FLAGS) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@
|
||||
|
||||
$(WORK)/aarch64-android/ops/%.o: $(OP_SRC_DIR)/android_asm/%.S | $(WORK)/aarch64-android
|
||||
$(AARCH64_CXX) $(AARCH64_CXX_FLAGS) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@
|
||||
|
||||
$(WORK)/aarch64-android/$(LIBRARY_NAME): $(aarch64-android_objs) | $(HFILES)
|
||||
$(AARCH64_CXX) -fPIC -std=c++17 -g -shared -o $@ $^ $(AARCH64_LDFLAGS)
|
||||
|
||||
clean:
|
||||
-rm -rf $(WORK)
|
||||
|
||||
.PHONY: all clean
|
||||
|
|
@ -1,88 +0,0 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<OpDefCollection PackageName="GgmlOpPackage" Domain="ggml" Version="1.0">
|
||||
<OpDefList>
|
||||
<OpDef>
|
||||
<Name>GgmlMulMat</Name>
|
||||
<Description>
|
||||
<Content>
|
||||
GGML MulMat operator
|
||||
</Content>
|
||||
</Description>
|
||||
|
||||
<Input>
|
||||
<Name>in[0]</Name>
|
||||
<Description>
|
||||
<Content>src0</Content>
|
||||
</Description>
|
||||
<Mandatory>true</Mandatory>
|
||||
<Datatype>BACKEND_SPECIFIC</Datatype>
|
||||
<Shape>
|
||||
<Rank>4D</Rank>
|
||||
<Layout>NHWC</Layout>
|
||||
<Text>[N, C, H , W]</Text>
|
||||
</Shape>
|
||||
</Input>
|
||||
|
||||
<Input>
|
||||
<Name>in[1]</Name>
|
||||
<Description>
|
||||
<Content>src1</Content>
|
||||
</Description>
|
||||
<Mandatory>true</Mandatory>
|
||||
<Datatype>BACKEND_SPECIFIC</Datatype>
|
||||
<Shape>
|
||||
<Rank>4D</Rank>
|
||||
<Layout>NHWC</Layout>
|
||||
<Text>[N, C, H , W]</Text>
|
||||
</Shape>
|
||||
</Input>
|
||||
|
||||
<Output>
|
||||
<Name>out[0]</Name>
|
||||
<Description>
|
||||
<Content>dst</Content>
|
||||
</Description>
|
||||
<Mandatory>true</Mandatory>
|
||||
<Datatype>BACKEND_SPECIFIC</Datatype>
|
||||
<Shape>
|
||||
<Rank>4D</Rank>
|
||||
<Text> [N, C, H , W] </Text>
|
||||
</Shape>
|
||||
</Output>
|
||||
|
||||
<!--This Op is implemented on these Backends-->
|
||||
<SupportedBackend>HTP</SupportedBackend>
|
||||
</OpDef>
|
||||
|
||||
</OpDefList>
|
||||
|
||||
<SupplementalOpDefList Backend="HTP">
|
||||
<SupportedOps>
|
||||
<OpName>GgmlMulMat</OpName>
|
||||
</SupportedOps>
|
||||
|
||||
<!--ggml-mul-->
|
||||
<SupplementalOpDef>
|
||||
<Name>GgmlMulMat</Name>
|
||||
|
||||
<Input>
|
||||
<Name>in[0]</Name>
|
||||
<Datatype>QNN_DATATYPE_FLOAT_16</Datatype>
|
||||
<Datatype>QNN_DATATYPE_FLOAT_32</Datatype>
|
||||
</Input>
|
||||
<Input>
|
||||
<Name>in[1]</Name>
|
||||
<Datatype>QNN_DATATYPE_FLOAT_16</Datatype>
|
||||
<Datatype>QNN_DATATYPE_FLOAT_32</Datatype>
|
||||
</Input>
|
||||
|
||||
<Output>
|
||||
<Name>out[0]</Name>
|
||||
<Datatype>QNN_DATATYPE_FLOAT_16</Datatype>
|
||||
<Datatype>QNN_DATATYPE_FLOAT_32</Datatype>
|
||||
</Output>
|
||||
</SupplementalOpDef>
|
||||
|
||||
</SupplementalOpDefList>
|
||||
|
||||
</OpDefCollection>
|
||||
|
|
@ -1,274 +0,0 @@
|
|||
//==============================================================================
// Auto Generated Code for GgmlOpPackage
//==============================================================================

#include "HTP/QnnHtpCommon.h"
#include "HTP/core/constraints.h"
#include "HTP/core/op_package_feature_support.h"
#include "HTP/core/op_register_ext.h"
#include "HTP/core/optimize.h"
#include "HTP/core/simple_reg.h"
#include "HTP/core/unique_types.h"
#include "QnnOpPackage.h"
#include "QnnSdkBuildId.h"

DEFINE_UNIQ_TY()
BEGIN_PKG_OPS_OPTS_LIST()

/** Note that the order of declarations given here defines the order in which ops and graph optimizations are
 * registered to the HTP Core.
 * Append the latest OpName at the bottom.
 */
DECLARE_PKG_OPS_OPTS_LIST(PKG_GgmlMulMat)

END_PKG_OPS_OPTS_LIST()

// op package info
static constexpr auto sg_packageName = THIS_PKG_NAME_STR;  // package name passed in as a compile flag

static std::array<const char *, 1> sg_opNames{ { "GgmlMulMat" } };

static Qnn_ApiVersion_t    sg_sdkApiVersion = QNN_HTP_API_VERSION_INIT;
static QnnOpPackage_Info_t sg_packageInfo   = QNN_OP_PACKAGE_INFO_INIT;

// global data
static QnnOpPackage_GlobalInfrastructure_t sg_globalInfra =
    nullptr;  // global infrastructure not in use for now
static bool sg_packageInitialized = false;

/*
 * user provided logging callback function
 * currently only supported on linux x86-64 and nonrpc versions
 * typedef void (*QnnLog_Callback_t)(const char* fmt,
 *                                   QnnLog_Level_t level,
 *                                   uint64_t timestamp,
 *                                   va_list args);
 * usage: if(sg_logInitialized && level <= sg_maxLogLevel)
 *            sg_logCallback(fmt, level, timestamp, args);
 *
 * for cross rpc versions, the skel side user provided logging callback function
 * can be defined as part of op packages. The maximal log level sg_maxLogLevel
 * can be set via Qnn_ErrorHandle_t GgmlOpPackageLogSetLevel(QnnLog_Level_t maxLogLevel)
 */
/*
 * for the alternative logging method provided by HTP core, please refer to log.h
 */
static QnnLog_Callback_t sg_logCallback =
    nullptr;  // user provided callback function pointer for logging
static QnnLog_Level_t sg_maxLogLevel =
    (QnnLog_Level_t) 0;  // maximal log level used in user provided logging
static bool sg_logInitialized =
    false;  // tracks whether the user provided logging method has been initialized

/*
 * op initialization
 * needs to be global in the package
 * one initialization per package before any op definitions
 * syntax: INIT_PACKAGE_OP_DEF()
 */
INIT_PACKAGE_OP_DEF()

/*
 * optimization initialization
 * needs to be global in the package
 * one initialization per package before any optimization definitions
 * syntax: INIT_PACKAGE_OPTIMIZATION_DEF()
 */
INIT_PACKAGE_OPTIMIZATION_DEF()

/*
 * op parameter order initialization
 * needs to be global in the package
 * one initialization per package before any op parameter order definitions
 * syntax: INIT_PACKAGE_PARAM_ORDER_DEF()
 */
INIT_PACKAGE_PARAM_ORDER_DEF()

/*
 * axis parameter name list
 * optional
 * needs to be global in the package
 * one list per package
 * for listing axis parameter names passed into the Qnn_AddNode API
 * HTP backend auto-adjusts values in axis parameters based on HTP backfilling
 * note: HTP backend backfills tensor dimensions to 4 dimensions
 * syntax: LIST_PACKAGE_AXIS_PARAMS(...)
 * e.g. LIST_PACKAGE_AXIS_PARAMS("Axis", "AXIS", "axis")
 */
// LIST_PACKAGE_AXIS_PARAMS()

/*
 * per-channel quantized op name list
 * optional
 * needs to be global in the package
 * one list per package
 * for listing op names which support per-channel quantization
 * per-axis quantization info of an op is embedded in axisScaleOffsetEncoding
 * inside Qnn_Tensor_t types
 * HTP backend only supports per-channel scale ops
 * i.e. along the last dimension, offset is always zero
 * if an op name is marked as having per-channel scale support, and in
 * QNN_AddNode, at least one input, parameter, or output has
 * QNN_QUANTIZATION_ENCODING_AXIS_SCALE_OFFSET type:
 * then:
 * HTP backend will pass to the op implementation function the following:
 * output(s), input(s), parameter(s),
 * outputPerChannelScale(s), inputPerChannelScale(s), paramPerChannelScale(s)
 *
 * optimization rules can be used to remove extra perChannelScale tensors
 *
 * syntax: LIST_PACKAGE_PER_CHANNEL_QUANTIZED_OPS(...)
 * e.g. LIST_PACKAGE_PER_CHANNEL_QUANTIZED_OPS(sg_op1Name, sg_op2Name)
 */

// LIST_PACKAGE_PER_CHANNEL_QUANTIZED_OPS()

/*
 * Declare and define the special initialize function for the HTP backend to load
 */
INIT_PKG_CORE_INIT_FUNC()

/* op package APIs */

Qnn_ErrorHandle_t GgmlOpPackageInit(QnnOpPackage_GlobalInfrastructure_t infrastructure) {
    if (sg_packageInitialized) return QNN_OP_PACKAGE_ERROR_LIBRARY_ALREADY_INITIALIZED;

    /*
     * op parameter order registration
     * registers all defined op parameter orders in the package
     * syntax: REGISTER_PACKAGE_PARAM_ORDERS()
     */
    REGISTER_PACKAGE_PARAM_ORDERS()

    /*
     * op axis parameter name registration
     * registers all axis parameter names in the package
     * used with LIST_PACKAGE_AXIS_PARAMS(...)
     * syntax: REGISTER_PACKAGE_AXIS_PARAMS()
     */
    REGISTER_PACKAGE_AXIS_PARAMS()

    /*
     * per-channel scale op name registration
     * registers all per-channel scale op names in the package
     * used with LIST_PACKAGE_PER_CHANNEL_QUANTIZED_OPS(...)
     * syntax: REGISTER_PACKAGE_PER_CHANNEL_QUANTIZED_OPS()
     */
    REGISTER_PACKAGE_PER_CHANNEL_QUANTIZED_OPS()

    sg_globalInfra        = infrastructure;
    sg_packageInitialized = true;
    return QNN_SUCCESS;
}

Qnn_ErrorHandle_t GgmlOpPackageGetInfo(const QnnOpPackage_Info_t ** info) {
    if (!sg_packageInitialized) return QNN_OP_PACKAGE_ERROR_LIBRARY_NOT_INITIALIZED;
    if (!info) return QNN_OP_PACKAGE_ERROR_INVALID_INFO;

    sg_packageInfo                = QNN_OP_PACKAGE_INFO_INIT;
    sg_packageInfo.packageName    = sg_packageName;
    sg_packageInfo.operationNames = sg_opNames.data();
    sg_packageInfo.numOperations  = sg_opNames.size();
    sg_packageInfo.sdkBuildId     = QNN_SDK_BUILD_ID;
    sg_packageInfo.sdkApiVersion  = &sg_sdkApiVersion;

    *info = &sg_packageInfo;
    return QNN_SUCCESS;
}

Qnn_ErrorHandle_t GgmlOpPackageLogInitialize(QnnLog_Callback_t callback, QnnLog_Level_t maxLogLevel) {
    if (sg_logInitialized) return QNN_OP_PACKAGE_ERROR_LIBRARY_ALREADY_INITIALIZED;
    if (!callback) return QNN_LOG_ERROR_INVALID_ARGUMENT;
    if (maxLogLevel < QNN_LOG_LEVEL_ERROR) return QNN_LOG_ERROR_INVALID_ARGUMENT;
    sg_logCallback   = callback;
    sg_maxLogLevel   = maxLogLevel;
    sg_logInitialized = true;
    return QNN_SUCCESS;
}

Qnn_ErrorHandle_t GgmlOpPackageLogSetLevel(QnnLog_Level_t maxLogLevel) {
    if (maxLogLevel < QNN_LOG_LEVEL_ERROR) return QNN_LOG_ERROR_INVALID_ARGUMENT;
    sg_maxLogLevel = maxLogLevel;
    return QNN_SUCCESS;
}

Qnn_ErrorHandle_t GgmlOpPackageLogTerminate() {
    if (!sg_logInitialized) return QNN_OP_PACKAGE_ERROR_LIBRARY_NOT_INITIALIZED;
    sg_logCallback    = nullptr;
    sg_maxLogLevel    = (QnnLog_Level_t) 0;
    sg_logInitialized = false;
    return QNN_SUCCESS;
}
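
For reference, a host-side callback matching the QnnLog_Callback_t shape quoted in the comments above could look roughly like the sketch below; the function name and the stderr sink are illustrative assumptions, not part of this package, and it presumes <cstdio>/<cstdarg> are available.

// Hedged sketch only: a user-provided callback matching the typedef quoted above.
static void ggml_qnn_example_log_callback(const char * fmt, QnnLog_Level_t level,
                                          uint64_t timestamp, va_list args) {
    // Prefix with level and timestamp, then forward the formatted message.
    fprintf(stderr, "[qnn][level %d][%llu] ", (int) level, (unsigned long long) timestamp);
    vfprintf(stderr, fmt, args);
    fprintf(stderr, "\n");
}
// A pointer to a function like this is what would be handed to
// GgmlOpPackageLogInitialize(callback, maxLogLevel).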

Qnn_ErrorHandle_t GgmlOpPackageValidateOpConfig(Qnn_OpConfig_t opConfig) {
    if (std::string(sg_packageName) != opConfig.v1.packageName) {
        return QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE;
    }

    /* auto-generated validation code below
     * Check if the op config type matches any registered ops
     * If a match is found, check the number of inputs, outputs and params
     */
    if (std::string(opConfig.v1.typeName) == "GgmlMulMat") {
        if (opConfig.v1.numOfParams != 0 || opConfig.v1.numOfInputs != 2 || opConfig.v1.numOfOutputs != 1) {
            return QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE;
        }
    } else {
        return QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE;
    }

    /*
     * additional validation code here
     */

    return QNN_SUCCESS;
}
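
As a minimal sketch of what this validator accepts, the fields it reads could be filled in as below; only the v1 fields checked above are shown, and the version/tensor/param setup that a real graph build needs is deliberately omitted.

// Hedged sketch: a shape the validator above would accept (not a complete op config).
Qnn_OpConfig_t cfg{};                      // zero-initialized; version setup omitted here
cfg.v1.packageName  = THIS_PKG_NAME_STR;   // must equal sg_packageName
cfg.v1.typeName     = "GgmlMulMat";        // the only registered op type
cfg.v1.numOfParams  = 0;
cfg.v1.numOfInputs  = 2;
cfg.v1.numOfOutputs = 1;
// GgmlOpPackageValidateOpConfig(cfg) returns QNN_SUCCESS for this shape.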

/* The following functions in this comment are not called by the HTP backend for now,
 * so no auto-generated implementations are created. Users should refer to the example for full function signatures.
 * (version 1.3.0) Qnn_ErrorHandle_t GgmlOpPackageCreateKernels (QnnOpPackage_GraphInfrastructure_t
 *                 graphInfrastructure, QnnOpPackage_Node_t node, QnnOpPackage_Kernel_t** kernels, uint32_t*
 *                 numKernels)
 * (version 1.3.0) Qnn_ErrorHandle_t GgmlOpPackageFreeKernels (QnnOpPackage_Kernel_t* kernels)
 *
 * (version 1.4.0) Qnn_ErrorHandle_t GgmlOpPackageCreateOpImpl (QnnOpPackage_GraphInfrastructure_t
 *                 graphInfrastructure, QnnOpPackage_Node_t node, QnnOpPackage_OpImpl_t* opImpl)
 * (version 1.4.0) Qnn_ErrorHandle_t GgmlOpPackageFreeOpImpl (QnnOpPackage_OpImpl_t opImpl)
 */

Qnn_ErrorHandle_t GgmlOpPackageTerminate() {
    if (!sg_packageInitialized) return QNN_OP_PACKAGE_ERROR_LIBRARY_NOT_INITIALIZED;

    sg_globalInfra        = nullptr;
    sg_packageInitialized = false;
    return QNN_SUCCESS;
}

#ifdef __cplusplus
extern "C" {
#endif

/* latest version */
Qnn_ErrorHandle_t GgmlOpPackageInterfaceProvider(QnnOpPackage_Interface_t * interface) {
    if (!interface) return QNN_OP_PACKAGE_ERROR_INVALID_ARGUMENT;
    interface->interfaceVersion      = { 1, 4, 0 };
    interface->v1_4.init             = GgmlOpPackageInit;
    interface->v1_4.terminate        = GgmlOpPackageTerminate;
    interface->v1_4.getInfo          = GgmlOpPackageGetInfo;
    interface->v1_4.validateOpConfig = GgmlOpPackageValidateOpConfig;
    interface->v1_4.createOpImpl     = nullptr;
    interface->v1_4.freeOpImpl       = nullptr;
    interface->v1_4.logInitialize    = GgmlOpPackageLogInitialize;
    interface->v1_4.logSetLevel      = GgmlOpPackageLogSetLevel;
    interface->v1_4.logTerminate     = GgmlOpPackageLogTerminate;
    return QNN_SUCCESS;
}

#ifdef __cplusplus
}
#endif
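
To make the provider contract above concrete, a host process could resolve it from the built shared library roughly as sketched below; the library path, function name lookup string, and error handling are illustrative assumptions, not code from this package.

// Hedged sketch: loading the op package and resolving its interface provider.
#include <dlfcn.h>

typedef Qnn_ErrorHandle_t (*InterfaceProviderFn)(QnnOpPackage_Interface_t *);

static bool load_ggml_op_package(const char * path) {
    void * handle = dlopen(path, RTLD_NOW | RTLD_LOCAL);
    if (!handle) return false;

    auto provider = (InterfaceProviderFn) dlsym(handle, "GgmlOpPackageInterfaceProvider");
    if (!provider) return false;

    QnnOpPackage_Interface_t iface = {};
    if (provider(&iface) != QNN_SUCCESS) return false;

    // iface.v1_4.init / getInfo / validateOpConfig are now callable as wired up above.
    return iface.v1_4.init != nullptr;
}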

@ -1,213 +0,0 @@

//==============================================================================
// Auto Generated Code for GgmlOpPackage
//==============================================================================

#include "HTP/core/constraints.h"
#include "HTP/core/op_package_feature_support.h"
#include "HTP/core/op_register_ext.h"
#include "HTP/core/optimize.h"
#include "HTP/core/simple_reg.h"
#include "QnnOpPackage.h"

BEGIN_PKG_OP_DEFINITION(PKG_GgmlMulMat);

// op execute function declarations
template <typename TensorType>
GraphStatus ggmlmulmatImpl(TensorType & out_0, const TensorType & in_0, const TensorType & in_1);

// forward declaration of sample cost function
static float ggmlmulmatCostFunc(const Op * op);

/*
 * method 1 for defining an op, using the default cost value (i.e. GLACIAL) and default flag (Flags::RESOURCE_HVX)
 * syntax: DEF_PACKAGE_OP(F,OP)
 * e.g. DEF_PACKAGE_OP((ggmlmulmatImpl<Tensor>), "GgmlMulMat")
 */
DEF_PACKAGE_OP((ggmlmulmatImpl<Tensor>), "GgmlMulMat")

/*
 * method 2 for defining an op with a specified cost value (one of GLACIAL, SNAIL, FAST, FREE)
 * and provided flags
 * syntax: DEF_PACKAGE_OP_AND_COST_AND_FLAGS(F,OP,COST,...)
 * can use zero or more flags; FLAG options are IS_CONST, INHIBIT_CONST_PROP,
 * RESOURCE_HVX, RESOURCE_HMX (not supported in external op packages)
 * e.g. DEF_PACKAGE_OP_AND_COST_AND_FLAGS((ggmlmulmatImpl<PlainFloatTensor>), "GgmlMulMat", SNAIL)
 */

/*
 * method 3 for defining an op with a cost function pointer and provided flags
 * cost function pointer type: typedef float (*cost_function) (const Op * op);
 * syntax: DEF_PACKAGE_OP_AND_COST_F_AND_FLAGS(F,OP,COST_F,...)
 * e.g. DEF_PACKAGE_OP_AND_COST_F_AND_FLAGS((ggmlmulmatImpl<PlainFloatTensor>),
 *      "GgmlMulMat", ggmlmulmatCostFunc, Flags::RESOURCE_HVX)
 */

/*
 * optimization definitions
 * need to be global in the package
 * one definition per optimization
 * syntax: DEF_PACKAGE_OPTIMIZATION(PRIORITY,MATCHCODE,CONSTRAINTCODE,REPLACECODE)
 * PRIORITY predefined values include EARLY(2000), MIDDLE(3000), LATE(4000)
 * HTP core provides some replacement functions for op packages to use
 * for more information about optimization rules, please refer to the HTP core documentation
 */

/*
 * op parameter order definitions
 * need to be global in the package
 * one definition per op, and this is optional
 * syntax: DEF_PACKAGE_PARAM_ORDER(OP,PARAM1,MANDATORY1,DEFAULT1,PARAM2,MANDATORY2,DEFAULT2...)
 * one or more parameters can be specified for each op
 * the order of parameters listed determines the order of parameters passed into op execution functions
 * if an op does not have a parameter order definition, the parameter order passed into Qnn_addNode
 * will be passed into op execution functions
 * if an op has a parameter order definition, any parameter passed into Qnn_addNode with an unlisted
 * name will be abandoned
 * if two or more op packages with the same package name are registered, they cannot list
 * conflicting parameter orders
 * PARAM refers to the parameter name as a string literal
 * MANDATORY refers to whether this parameter is required to be provided at Qnn_addNode
 * DEFAULT is used when MANDATORY is false
 *     if provided as Qnn_Param_t*,
 *         DEFAULT will be used for graph construction when this parameter is not provided at
 *         Qnn_addNode
 *     if provided as nullptr,
 *         graph construction will skip this parameter when this parameter is not provided at
 *         Qnn_addNode
 */
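
Purely to illustrate the syntax described in the comment above: GgmlMulMat takes no parameters, so no such line exists in this package, and "transpose_b" below is a made-up parameter name used only as an example of an optional parameter with no default.

// Hypothetical illustration only (not used by this package):
// DEF_PACKAGE_PARAM_ORDER("GgmlMulMat", "transpose_b", false, nullptr)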

namespace {

constexpr const size_t kBytesPerVector  = sizeof(HVX_Vector);  // 128 for v73
constexpr const size_t kFloatsPerVector = kBytesPerVector / sizeof(float);
constexpr const size_t kAlignMask       = kBytesPerVector - 1;

inline size_t unaligned_bytes(const void * addr) {
    return ((size_t) addr) & kAlignMask;
}

inline bool is_addr_aligned(void * addr) {
    return unaligned_bytes(addr) == 0;
}

inline float vec_dot_product_f32(const float * src0, const float * src1, size_t count) {
    HVX_Vector * iptr0     = ((HVX_Vector *) src0);
    HVX_Vector * iptr0_end = ((HVX_Vector *) src0) + (count / kFloatsPerVector);
    HVX_Vector * iptr1     = ((HVX_Vector *) src1);
    HVX_Vector   prev0     = *iptr0++;
    HVX_Vector   prev1     = *iptr1++;
    HVX_Vector   sum       = Q6_V_vzero();

    // TODO: prefetch?
    while (iptr0 < iptr0_end) {
        HVX_Vector curr0 = *iptr0++;
        HVX_Vector curr1 = *iptr1++;
        HVX_Vector s0    = Q6_V_valign_VVR(curr0, prev0, (size_t) src0);
        HVX_Vector s1    = Q6_V_valign_VVR(curr1, prev1, (size_t) src1);
        sum              = Q6_Vqf32_vadd_Vqf32Vqf32(Q6_Vqf32_vmpy_VsfVsf(s0, s1), sum);
        prev0            = curr0;
        prev1            = curr1;
    }

    if ((iptr0_end - ((HVX_Vector *) src0)) > 0) {
        // handle the last vector
        // see also: https://github.com/UbiquitousLearning/mllm/blob/babf4410352ce8730824c87699c025a0d4ce3a6f/src/backends/qnn/LLaMAOpPackageHtp/LLaMAPackage/src/ops/LLaMAMul.cpp#L147
        HVX_Vector curr0 = is_addr_aligned(iptr0) ? prev0 : *iptr0++;
        HVX_Vector curr1 = is_addr_aligned(iptr1) ? prev1 : *iptr1++;
        HVX_Vector s0    = Q6_V_valign_VVR(curr0, prev0, (size_t) src0);
        HVX_Vector s1    = Q6_V_valign_VVR(curr1, prev1, (size_t) src1);
        sum              = Q6_Vqf32_vadd_Vqf32Vqf32(Q6_Vqf32_vmpy_VsfVsf(s0, s1), sum);
        prev0            = curr0;
        prev1            = curr1;
    }

    const size_t leftover       = count % kFloatsPerVector;
    const size_t leftover_bytes = leftover * sizeof(float);
    if (leftover > 0) {
        // handle the leftover elements
        HVX_Vector curr0 = (leftover_bytes + unaligned_bytes(iptr0) > kBytesPerVector) ? *iptr0 : prev0;
        curr0            = Q6_V_valign_VVR(curr0, prev0, (size_t) src0);

        HVX_Vector curr1 = (leftover_bytes + unaligned_bytes(iptr1) > kBytesPerVector) ? *iptr1 : prev1;
        curr1            = Q6_V_valign_VVR(curr1, prev1, (size_t) src1);

        sum = Q6_Vqf32_vadd_Vqf32Vqf32(
            Q6_V_valign_VVR(Q6_Vqf32_vmpy_VsfVsf(curr0, curr1), Q6_V_vzero(), leftover_bytes), sum);
    }

    // TODO: do we have a better way to do the reduction?
    for (size_t i = kFloatsPerVector / 2; i > 0; i /= 2) {
        sum = Q6_Vqf32_vadd_Vqf32Vqf32(sum, Q6_V_vror_VR(sum, i * sizeof(float)));
    }

    float result;
    q6op_vstu_variable_ARV(&result, sizeof(float), Q6_Vsf_equals_Vqf32(sum));
    return result;
}
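
For clarity, the HVX routine above computes a plain dot product; the scalar reference below (an editor sketch, not part of the package) shows the same computation without vectorization, which can be useful when validating the vector path on a host build. Note that results match only up to floating-point rounding, since the qf32 tree reduction accumulates in a different order.

// Scalar reference for the HVX dot product above; illustrative only.
inline float vec_dot_product_f32_ref(const float * src0, const float * src1, size_t count) {
    float sum = 0.0f;
    for (size_t i = 0; i < count; ++i) {
        sum += src0[i] * src1[i];  // same mathematical result as the vectorized loop
    }
    return sum;
}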

template <typename TensorType>
inline GraphStatus mul_mat_2d_f32(TensorType & out_0, const TensorType & in_0, const TensorType & in_1) {
    // TODO: handle strides?
    if (in_1.dim(1) != in_0.dim(1)) {
        return GraphStatus::ErrorDimensions;
    }

    size_t dims[4] = { in_1.dim(0), in_0.dim(0) };
    out_0.set_dims(dims);

    auto in0_ptr = (float *) in_0.raw_data_const();
    auto in1_ptr = (float *) in_1.raw_data_const();
    auto out_ptr = (float *) out_0.raw_data();

    for (size_t i = 0; i < dims[0]; i++) {
        // TODO: prefetch?
        auto * in1_row = in1_ptr + i * in_1.dim(1);
        auto * out_row = out_ptr + i * dims[1];
        for (size_t j = 0; j < dims[1]; j++) {
            *out_row++ = vec_dot_product_f32(in0_ptr + j * in_0.dim(1), in1_row, in_0.dim(1));
        }
    }

    return GraphStatus::Success;
}
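
A worked shape example for the routine above, assuming dim(0) is the row count and dim(1) the row length, as the indexing in the code implies; the concrete sizes are chosen only for illustration.

// Shape sketch for mul_mat_2d_f32:
//   in_0: (4, 16)  -> 4 rows of length 16
//   in_1: (8, 16)  -> 8 rows of length 16 (shared inner dimension K = 16)
//   out_0 dims = { in_1.dim(0), in_0.dim(0) } = (8, 4)
//   out_0[i][j] = dot(in_1 row i, in_0 row j), each of length K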

}  // namespace

/* execute functions for ops */

template <typename TensorType>
GraphStatus ggmlmulmatImpl(TensorType & out_0, const TensorType & in_0, const TensorType & in_1) {
    if (!in_0.raw_data_const() || !in_1.raw_data_const() || !out_0.raw_data()) {
        return GraphStatus::ErrorBadInput;
    }

    if (in_0.rank() != in_1.rank()) {
        return GraphStatus::ErrorRank;
    }

    auto rank = in_0.rank();
    switch (rank) {
        case 4:
        case 3:
            // TODO: add implementation
            return GraphStatus::ErrorUnsupported;
        case 2:
            return mul_mat_2d_f32(out_0, in_0, in_1);
    }

    return GraphStatus::ErrorRank;
}

__attribute__((unused)) static float ggmlmulmatCostFunc(const Op * op) {
    /*
     * add code here
     */

    float cost = 0.0;  // add cost computation here
    return cost;
}

/* At the bottom of the op file, call END_PKG_OP_DEFINITION(<name>),
 * where <name> is the same name used in BEGIN_PKG_OP_DEFINITION
 */
END_PKG_OP_DEFINITION(PKG_GgmlMulMat);