diff --git a/ggml/src/ggml-qnn/CMakeLists.txt b/ggml/src/ggml-qnn/CMakeLists.txt
index 3e8fa3a1b8..e605ce8ff2 100644
--- a/ggml/src/ggml-qnn/CMakeLists.txt
+++ b/ggml/src/ggml-qnn/CMakeLists.txt
@@ -5,11 +5,9 @@ option(GGML_QNN_ENABLE_HEXAGON_BACKEND "ggml-qnn: Enable Hexagon custom package"
 if(CMAKE_SYSTEM_NAME STREQUAL "Android")
     find_library(LOG_LIB log)
-    set(QNN_LINK_LIBRARIES ${LOG_LIB})
-    set(QNN_DEFAULT_LIB_SEARCH_PATH "/data/local/tmp/" CACHE STRING "customized library search path for QNN backend")
-    add_compile_options(-g -O0)
+    set(COMMON_LINK_LIBRARIES ${LOG_LIB})
 elseif(CMAKE_SYSTEM_NAME STREQUAL "Windows" OR CMAKE_SYSTEM_NAME STREQUAL "Linux")
-    set(QNN_DEFAULT_LIB_SEARCH_PATH "" CACHE STRING "customized library search path for QNN backend")
+    message("Building for Linux or Windows")
 else()
     message(FATAL_ERROR "QNN now only available on Android, Windows and Linux")
 endif()
@@ -29,33 +27,15 @@ message("CMAKE_CXX_FLAGS_DEBUG: ${CMAKE_CXX_FLAGS_DEBUG}")
 message("CMAKE_CXX_FLAGS_RELEASE: ${CMAKE_CXX_FLAGS_RELEASE}")
 message("QNN_SDK_PATH: ${GGML_QNN_SDK_PATH}")
 
-file(GLOB QNN_SOURCES "${CMAKE_CURRENT_LIST_DIR}/qnn/*.cpp")
-file(GLOB COMMON_SOURCES "${CMAKE_CURRENT_LIST_DIR}/*.cpp")
+message("GGML_QNN: ${GGML_QNN}")
+message("GGML_QNN_ENABLE_PERFORMANCE_TRACKING: ${GGML_QNN_ENABLE_PERFORMANCE_TRACKING}")
+message("GGML_QNN_ENABLE_HEXAGON_BACKEND: ${GGML_QNN_ENABLE_HEXAGON_BACKEND}")
+message("GGML_HEXAGON_NPU_ONLY: ${GGML_HEXAGON_NPU_ONLY}")
+
 ggml_add_backend_library(ggml-qnn
-    ${QNN_SOURCES}
-    ${COMMON_SOURCES}
+    ../../include/ggml-qnn.h
 )
-
-target_include_directories(ggml-qnn PRIVATE
-    ${GGML_QNN_SDK_PATH}/include/QNN
-    ${CMAKE_CURRENT_LIST_DIR}/qnn
-    ${CMAKE_CURRENT_LIST_DIR}
-)
-target_link_libraries(ggml-qnn PRIVATE ${QNN_LINK_LIBRARIES})
-
-if(NOT "${QNN_DEFAULT_LIB_SEARCH_PATH}" STREQUAL "")
-    string(REGEX REPLACE "/$" "" QNN_DEFAULT_LIB_SEARCH_PATH "${QNN_DEFAULT_LIB_SEARCH_PATH}")
-endif()
-
-message("GGML_QNN_DEFAULT_LIB_SEARCH_PATH: ${QNN_DEFAULT_LIB_SEARCH_PATH}")
-target_compile_definitions(ggml-qnn PRIVATE GGML_QNN_DEFAULT_LIB_SEARCH_PATH="${QNN_DEFAULT_LIB_SEARCH_PATH}")
-
-if(GGML_QNN_ENABLE_CPU_BACKEND)
-    message("GGML_QNN_ENABLE_CPU_BACKEND is enabled")
-    target_compile_definitions(ggml-qnn PRIVATE GGML_QNN_ENABLE_CPU_BACKEND)
-else()
-    message("GGML_QNN_ENABLE_CPU_BACKEND is disabled")
-endif()
+target_link_libraries(ggml-qnn PRIVATE ${COMMON_LINK_LIBRARIES})
 
 if(GGML_QNN_ENABLE_PERFORMANCE_TRACKING)
     message("GGML_QNN_ENABLE_PERFORMANCE_TRACKING is enabled")
@@ -72,6 +52,8 @@ if(GGML_HEXAGON_NPU_ONLY)
     set(GGML_QNN_ENABLE_HEXAGON_BACKEND ON)
 else()
     message("GGML_HEXAGON_NPU_ONLY is disabled")
+    add_subdirectory(qnn)
+    target_link_libraries(ggml-qnn PRIVATE qnn-backend)
 endif()
 
 if(GGML_QNN_ENABLE_HEXAGON_BACKEND)
diff --git a/ggml/src/ggml-qnn/npu/device/device.cpp b/ggml/src/ggml-qnn/npu/device/device.cpp
index 2368d44f67..7281dd48d2 100644
--- a/ggml/src/ggml-qnn/npu/device/device.cpp
+++ b/ggml/src/ggml-qnn/npu/device/device.cpp
@@ -3,6 +3,7 @@
 
 #include 
 #include 
+#include <memory>
 #include 
 
 #include "graph.hpp"
@@ -10,15 +11,30 @@
 #include "op_impl.hpp"
 #include "remote.h"
 #include "tensor.hpp"
+#include "thread_pool.hpp"
 #include "util.hpp"
 
-#define NPU_UNUSED(x) (void) (x)
-
 namespace {
 
 struct npu_device_context {
-    int unused = 0;
-    // TODO: should we add tensor context here?
+    std::unique_ptr<hexagon::default_thread_pool> thread_pool;
+
+    bool init_thread_pool() {
+        if (thread_pool) {
+            DEVICE_LOG_DEBUG("Thread pool already initialized");
+            return true;
+        }
+
+        auto pool = std::make_unique<hexagon::default_thread_pool>();
+        if (!pool) {
+            DEVICE_LOG_ERROR("Failed to create thread pool");
+            return false;
+        }
+
+        thread_pool = std::move(pool);
+        DEVICE_LOG_DEBUG("Thread pool initialized");
+        return true;
+    }
 };
 
 inline hexagon::tensor * tensor_from_handle(npu_device_tensor_handle_t h) {
@@ -37,6 +53,10 @@ inline npu_device_graph_handle_t graph_to_handle(hexagon::graph * graph) {
     return reinterpret_cast<npu_device_graph_handle_t>(graph);
 }
 
+inline npu_device_context * device_context_from_handle(remote_handle64 h) {
+    return reinterpret_cast<npu_device_context *>(h);
+}
+
 }  // namespace
 
 int npu_device_open(const char * uri, remote_handle64 * h) {
@@ -47,12 +67,18 @@ int npu_device_open(const char * uri, remote_handle64 * h) {
         return AEE_ENOMEMORY;
     }
 
+    if (!context->init_thread_pool()) {
+        DEVICE_LOG_ERROR("Failed to initialize thread pool");
+        delete context;
+        return AEE_EFAILED;
+    }
+
     *h = reinterpret_cast<remote_handle64>(context);
     return AEE_SUCCESS;
 }
 
 int npu_device_close(remote_handle64 h) {
-    auto * context = reinterpret_cast<npu_device_context *>(h);
+    auto * context = device_context_from_handle(h);
     if (!context) {
         DEVICE_LOG_ERROR("Invalid npu_device_context handle");
         return AEE_EINVHANDLE;
@@ -149,13 +175,19 @@ AEEResult npu_device_graph_set_tensor(remote_handle64 _h, npu_device_graph_handl
 }
 
 AEEResult npu_device_graph_compute(remote_handle64 _h, npu_device_graph_handle_t graph_handle) {
-    NPU_UNUSED(_h);
-    auto * graph = graph_from_handle(graph_handle);
-    if (!graph) {
+    auto dev_ctx = device_context_from_handle(_h);
+    if (!dev_ctx) {
+        DEVICE_LOG_DEBUG("Invalid npu_device_context handle");
         return AEE_EINVHANDLE;
     }
 
-    if (!graph->compute()) {
+    auto * graph = graph_from_handle(graph_handle);
+    if (!graph) {
+        DEVICE_LOG_ERROR("Invalid graph handle");
+        return AEE_EINVHANDLE;
+    }
+
+    if (!graph->compute(dev_ctx->thread_pool.get())) {
         return AEE_EFAILED;
     }
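For orientation, a sketch of the host-side call sequence these handlers serve. This is not part of the patch: graph and tensor creation go through the other npu_device_* stubs, and device_uri / graph_handle are illustrative placeholders.

// Hypothetical FastRPC driver sequence against the handlers above.
remote_handle64 handle = 0;
if (npu_device_open(device_uri /* illustrative */, &handle) != AEE_SUCCESS) {
    // AEE_ENOMEMORY (allocation failed) or AEE_EFAILED (thread pool init failed)
}
// ... create tensors and a graph via the other npu_device_* calls ...
if (npu_device_graph_compute(handle, graph_handle) != AEE_SUCCESS) {
    // AEE_EINVHANDLE (bad context or graph handle) or AEE_EFAILED (compute failed)
}
npu_device_close(handle);  // releases the device context and its thread pool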
diff --git a/ggml/src/ggml-qnn/npu/device/graph.cpp b/ggml/src/ggml-qnn/npu/device/graph.cpp
index b21b8add29..2024d15a21 100644
--- a/ggml/src/ggml-qnn/npu/device/graph.cpp
+++ b/ggml/src/ggml-qnn/npu/device/graph.cpp
@@ -8,24 +8,23 @@
 
 namespace hexagon {
 
+graph::graph() noexcept {
+    DEVICE_LOG_DEBUG("graph(%p) created\n", (void *) this);
+}
+
 graph::~graph() noexcept {
-    if (_tensors) {
-        delete[] _tensors;
-    }
+    _tensors.reset();
+    DEVICE_LOG_DEBUG("graph(%p) destroyed\n", (void *) this);
 }
 
 void graph::set_tensor(const npu_device_tensor_handle_t * tensors, int tensor_count) {
-    if (_tensor_count > 0) {
-        delete[] _tensors;
-    }
-
     if (tensor_count <= 0) {
-        _tensors = nullptr;
+        _tensors.reset();
         _tensor_count = 0;
         return;
     }
 
-    _tensors = new (std::nothrow) tensor *[tensor_count];
+    _tensors = std::make_unique<tensor *[]>(size_t(tensor_count));
     for (int i = 0; i < tensor_count; ++i) {
         auto * tensor_obj = reinterpret_cast<tensor *>(tensors[i]);
         _tensors[i]       = tensor_obj;
@@ -37,31 +36,43 @@ void graph::set_tensor(const npu_device_tensor_handle_t * tensors, int tensor_co
     DEVICE_LOG_DEBUG("graph(%p) tensor count: %zu\n", (void *) this, _tensor_count);
 }
 
-bool graph::compute() {
+bool graph::compute(default_thread_pool * thread_pool) {
     if (!_tensors || !_tensor_count) {
         DEVICE_LOG_DEBUG("graph(%p) no tensors to compute\n", (void *) this);
         return true;  // return success if no tensors to compute
     }
 
     DEVICE_LOG_DEBUG("graph(%p) compute\n", (void *) this);
+    thread_pool->sync_execute(reinterpret_cast<default_thread_pool::task_type>(&graph::thread_pool_task), this);
+
     for (size_t i = 0; i < _tensor_count; ++i) {
-        auto * dst  = _tensors[i];
-        auto   op   = dst->get_op();
-        auto * func = get_compute_func(op);
-        if (!func) {
-            DEVICE_LOG_ERROR("graph(%p) tensor[%zu] op %d not supported\n", (void *) this, i, op);
-            return false;
-        }
-
-        if (!func(dst)) {
-            DEVICE_LOG_ERROR("graph(%p) tensor[%zu] op %d compute failed\n", (void *) this, i, op);
-            return false;
-        }
-
+        auto * dst = _tensors[i];
         dst->flush();  // TODO: optimize this
     }
 
     return true;
 }
 
+void graph::thread_pool_task(default_thread_pool * pool, size_t thread_idx, size_t thread_count, graph * graph) {
+    NPU_UNUSED(pool);
+    graph->compute_impl(thread_idx, thread_count);
+}
+
+void graph::compute_impl(size_t thread_idx, size_t thread_count) {
+    for (size_t i = 0; i < _tensor_count; ++i) {
+        auto * dst  = _tensors[i];
+        auto   op   = dst->get_op();
+        auto * func = get_compute_func(op);
+        if (!func) {
+            DEVICE_LOG_ERROR("graph(%p) tensor[%zu] op %d not supported\n", (void *) this, i, op);
+            return;
+        }
+
+        if (!func(dst, thread_idx, thread_count)) {
+            DEVICE_LOG_ERROR("graph(%p) tensor[%zu] op %d compute failed\n", (void *) this, i, op);
+            return;
+        }
+    }
+}
+
 }  // namespace hexagon
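To make the new per-op contract concrete, here is a sketch (not from the patch) of a kernel obeying the (dst, tidx, tcnt) signature that compute_impl now expects; it uses hexagon::get_thread_work_slice, added to util.hpp further down, and assumes rows sit contiguously at stride get_nb(1).

// Hypothetical kernel: halves every element of dst in place. Each pool thread
// receives the same tensor but only touches the rows of its own slice.
bool scale_rows_f32(hexagon::tensor * dst, size_t tidx, size_t tcnt) {
    const int64_t total_rows = dst->get_ne(1) * dst->get_ne(2) * dst->get_ne(3);
    const auto    slice      = hexagon::get_thread_work_slice(total_rows, tidx, tcnt);
    auto *        base       = reinterpret_cast<uint8_t *>(dst->get_data());
    for (int64_t ir = slice.first; ir < slice.second; ++ir) {
        auto * row = reinterpret_cast<float *>(base + ir * dst->get_nb(1));
        for (int64_t i0 = 0; i0 < dst->get_ne(0); ++i0) {
            row[i0] *= 0.5f;  // placeholder per-row work
        }
    }
    return true;
}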
diff --git a/ggml/src/ggml-qnn/npu/device/graph.hpp b/ggml/src/ggml-qnn/npu/device/graph.hpp
index 22f6615d14..7ca2931699 100644
--- a/ggml/src/ggml-qnn/npu/device/graph.hpp
+++ b/ggml/src/ggml-qnn/npu/device/graph.hpp
@@ -1,29 +1,32 @@
 #pragma once
 
+#include <memory>
+
 #include "hexagon_npu.h"
 #include "tensor.hpp"
+#include "thread_pool.hpp"
 
 namespace hexagon {
 
 class graph {
   public:
     // TODO: add execute direction here
-    explicit graph() noexcept {}
+    explicit graph() noexcept;
 
     ~graph() noexcept;
 
     void set_tensor(const npu_device_tensor_handle_t * tensors, int tensor_count);
 
-    bool compute();
+    bool compute(default_thread_pool * thread_pool);
 
   private:
-    tensor ** _tensors      = nullptr;
-    size_t    _tensor_count = 0;
+    static void thread_pool_task(default_thread_pool * pool, size_t thread_idx, size_t thread_count, graph * graph);
+    void        compute_impl(size_t thread_idx, size_t thread_count);
 
-    graph(const graph &) = delete;
-    void operator=(const graph &) = delete;
-    graph(graph &&) = delete;
-    void operator=(graph &&) = delete;
+    std::unique_ptr<tensor *[]> _tensors;
+    size_t                      _tensor_count = 0;
+
+    DISABLE_COPY_AND_MOVE(graph);
 };
 
 }  // namespace hexagon
diff --git a/ggml/src/ggml-qnn/npu/device/op_impl.cpp b/ggml/src/ggml-qnn/npu/device/op_impl.cpp
index 7067a1d52b..8d55971a72 100644
--- a/ggml/src/ggml-qnn/npu/device/op_impl.cpp
+++ b/ggml/src/ggml-qnn/npu/device/op_impl.cpp
@@ -76,11 +76,12 @@ inline HVX_Vector vmul_f32_f32(HVX_Vector a, HVX_Vector b) {
 }
 
 template <typename _TySrc, typename _TyDst, void (*_RowFunc)(const _TySrc *, const _TySrc *, size_t, _TyDst *)>
-bool element_wise_op(hexagon::tensor * out) {
+bool element_wise_op(hexagon::tensor * out, size_t tidx, size_t tcnt) {
     if (!out) {
         return false;
     }
 
+    static_assert(DEVICE_TENSOR_MAX_DIMS == 4, "element_wise_op requires max dims 4");
     auto * src0 = out->get_src(0);
     auto * src1 = out->get_src(1);
     if (!src0 || !src1) {
@@ -93,28 +94,24 @@
         return false;
     }
 
-    static_assert(DEVICE_TENSOR_MAX_DIMS == 4, "element_wise_op requires max dims 4");
-
-    const auto * src0_ptr = reinterpret_cast<const uint8_t *>(src0->get_data());
-    const auto * src1_ptr = reinterpret_cast<const uint8_t *>(src1->get_data());
-    auto *       dst_ptr  = reinterpret_cast<uint8_t *>(out->get_data());
-    for (int64_t i3 = 0; i3 < out->get_ne(3); i3++) {
-        const auto * src0_cube = src0_ptr + i3 * src0->get_nb(3);
-        const auto * src1_cube = src1_ptr + (i3 % src1->get_ne(3)) * src1->get_nb(3);
-        auto *       dst_cube  = dst_ptr + i3 * out->get_nb(3);
-        for (int64_t i2 = 0; i2 < out->get_ne(2); i2++) {
-            const auto * src0_plane = src0_cube + i2 * src0->get_nb(2);
-            const auto * src1_plane = src1_cube + (i2 % src1->get_ne(2)) * src1->get_nb(2);
-            auto *       dst_plane  = dst_cube + i2 * out->get_nb(2);
-            for (int64_t i1 = 0; i1 < out->get_ne(1); i1++) {
-                // TODO: prefetch row?
-                auto * src0_row = src0_plane + i1 * src0->get_nb(1);
-                auto * src1_row = src1_plane + (i1 % src1->get_ne(1)) * src1->get_nb(1);
-                auto * dst_row  = reinterpret_cast<_TyDst *>(dst_plane + i1 * out->get_nb(1));
-                _RowFunc(reinterpret_cast<const _TySrc *>(src0_row), reinterpret_cast<const _TySrc *>(src1_row),
-                         static_cast<size_t>(out->get_ne(0)), reinterpret_cast<_TyDst *>(dst_row));
-            }
-        }
+    const auto * src0_ptr     = reinterpret_cast<const uint8_t *>(src0->get_data());
+    const auto * src1_ptr     = reinterpret_cast<const uint8_t *>(src1->get_data());
+    auto *       dst_ptr      = reinterpret_cast<uint8_t *>(out->get_data());
+    auto         total_rows   = out->get_ne(3) * out->get_ne(2) * out->get_ne(1);
+    const auto   rows_per_box = out->get_ne(2) * out->get_ne(1);
+    const auto   start_end    = hexagon::get_thread_work_slice(total_rows, tidx, tcnt);
+    for (int64_t ir = start_end.first; ir < start_end.second; ++ir) {
+        const auto i03      = ir / rows_per_box;
+        const auto i02      = ir / out->get_ne(1) - i03 * out->get_ne(2);
+        const auto i01      = ir % out->get_ne(1);
+        const auto i13      = i03 % src1->get_ne(3);
+        const auto i12      = i02 % src1->get_ne(2);
+        const auto i11      = i01 % src1->get_ne(1);
+        auto *     src0_row = src0_ptr + i03 * src0->get_nb(3) + i02 * src0->get_nb(2) + i01 * src0->get_nb(1);
+        auto *     src1_row = src1_ptr + i13 * src1->get_nb(3) + i12 * src1->get_nb(2) + i11 * src1->get_nb(1);
+        auto *     dst_row  = dst_ptr + i03 * out->get_nb(3) + i02 * out->get_nb(2) + i01 * out->get_nb(1);
+        _RowFunc(reinterpret_cast<const _TySrc *>(src0_row), reinterpret_cast<const _TySrc *>(src1_row),
+                 static_cast<size_t>(out->get_ne(0)), reinterpret_cast<_TyDst *>(dst_row));
     }
 
     return true;
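A worked example of the flattened-row split and index decomposition above, with values assumed purely for illustration:

// get_thread_work_slice(total = 10 rows, tidx, tcnt = 4):
//   elements_per_thread = (10 + 4 - 1) / 4 = 3
//   tidx 0 -> [0, 3)   tidx 1 -> [3, 6)   tidx 2 -> [6, 9)   tidx 3 -> [9, 10)
// Decomposing flat row ir = 7 with out->get_ne(1) = 2 and out->get_ne(2) = 2
// (so rows_per_box = 4):
//   i03 = 7 / 4         = 1
//   i02 = 7 / 2 - 1 * 2 = 1
//   i01 = 7 % 2         = 1
// Check: i03 * 4 + i02 * 2 + i01 = 4 + 2 + 1 = 7.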
diff --git a/ggml/src/ggml-qnn/npu/device/op_impl.hpp b/ggml/src/ggml-qnn/npu/device/op_impl.hpp
index 1fee7769ce..6b30d24819 100644
--- a/ggml/src/ggml-qnn/npu/device/op_impl.hpp
+++ b/ggml/src/ggml-qnn/npu/device/op_impl.hpp
@@ -5,7 +5,7 @@
 
 namespace hexagon {
 
-typedef bool (*compute_func_type)(tensor * dst);
+typedef bool (*compute_func_type)(tensor * dst, size_t tidx, size_t tcnt);
 
 typedef bool (*op_is_supported_func_type)(const npu_device_tensor_spec & src0, const npu_device_tensor_spec & src1,
                                           const npu_device_tensor_spec & dst, npu_device_tensor_op op);
diff --git a/ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp b/ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp
index fbda69d2d7..381629da34 100644
--- a/ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp
+++ b/ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp
@@ -71,43 +71,45 @@ inline float vec_dot_product_f32_f32(const float * src0, const float * src1, size_t count) {
 
 namespace hexagon {
 
-bool mul_mat_f32(hexagon::tensor * out) {
+bool mul_mat_f32(hexagon::tensor * out, size_t tidx, size_t tcnt) {
     if (!out) {
         return false;
     }
 
+    static_assert(DEVICE_TENSOR_MAX_DIMS == 4, "mul_mat_f32 requires max dims 4");
     auto * src0 = out->get_src(0);
     auto * src1 = out->get_src(1);
     if (!src0 || !src1) {
         return true;  // skip if no src
     }
 
-    static_assert(DEVICE_TENSOR_MAX_DIMS == 4, "mul_mat_f32 requires max dims 4");
+    const auto   r02          = src1->get_ne(2) / src0->get_ne(2);
+    const auto   r03          = src1->get_ne(3) / src0->get_ne(3);
+    const auto * src0_ptr     = reinterpret_cast<const uint8_t *>(src0->get_data());
+    const auto * src1_ptr     = reinterpret_cast<const uint8_t *>(src1->get_data());
+    auto *       dst_ptr      = reinterpret_cast<uint8_t *>(out->get_data());
+    const auto   total_planes = out->get_ne(3) * out->get_ne(2);
 
-    const auto r02 = src1->get_ne(2) / src0->get_ne(2);
-    const auto r03 = src1->get_ne(3) / src0->get_ne(3);
-    const auto * src0_ptr = reinterpret_cast<const uint8_t *>(src0->get_data());
-    const auto * src1_ptr = reinterpret_cast<const uint8_t *>(src1->get_data());
-    auto *       dst_ptr  = reinterpret_cast<uint8_t *>(out->get_data());
-    for (int64_t i3 = 0; i3 < out->get_ne(3); i3++) {
-        const auto * src0_cube = src0_ptr + i3 / r03 * src0->get_nb(3);
-        const auto * src1_cube = src1_ptr + i3 * src1->get_nb(3);
-        auto *       dst_cube  = dst_ptr + i3 * out->get_nb(3);
-        for (int64_t i2 = 0; i2 < out->get_ne(2); i2++) {
-            const auto * src0_plane = src0_cube + i2 / r02 * src0->get_nb(2);
-            const auto * src1_plane = src1_cube + i2 * src1->get_nb(2);
-            auto *       dst_plane  = dst_cube + i2 * out->get_nb(2);
-            for (int64_t i1 = 0; i1 < out->get_ne(1); i1++) {
-                // TODO: prefetch row?
-                auto * src1_row = src1_plane + i1 * src1->get_nb(1);
-                auto * dst_row  = reinterpret_cast<float *>(dst_plane + i1 * out->get_nb(1));
-                for (int64_t i0 = 0; i0 < out->get_ne(0); i0++) {
-                    auto * src0_row = src0_plane + i0 * src0->get_nb(1);
-                    // TODO: figure out how to handle a entire row
-                    *dst_row++ =
-                        vec_dot_product_f32_f32(reinterpret_cast<const float *>(src0_row),
-                                                reinterpret_cast<const float *>(src1_row), (size_t) src0->get_ne(0));
-                }
+    const auto start_end_plane = (total_planes >= tcnt) ? get_thread_work_slice(total_planes, tidx, tcnt) :
+                                                          std::pair<int64_t, int64_t>{ 0, total_planes };
+    const auto start_end_row   = (total_planes >= tcnt) ? std::pair<int64_t, int64_t>{ 0, out->get_ne(1) } :
+                                                          get_thread_work_slice(out->get_ne(1), tidx, tcnt);
+    for (int64_t ip = start_end_plane.first; ip < start_end_plane.second; ip++) {
+        const auto   i3         = ip / out->get_ne(2);
+        const auto   i2         = ip - i3 * out->get_ne(2);
+        const auto * src0_plane = src0_ptr + i3 / r03 * src0->get_nb(3) + i2 / r02 * src0->get_nb(2);
+        const auto * src1_plane = src1_ptr + i3 * src1->get_nb(3) + i2 * src1->get_nb(2);
+        auto *       dst_plane  = dst_ptr + i3 * out->get_nb(3) + i2 * out->get_nb(2);
+        for (int64_t i1 = start_end_row.first; i1 < start_end_row.second; i1++) {
+            // TODO: prefetch row?
+            auto * src1_row = src1_plane + i1 * src1->get_nb(1);
+            auto * dst_row  = reinterpret_cast<float *>(dst_plane + i1 * out->get_nb(1));
+            for (int64_t i0 = 0; i0 < out->get_ne(0); i0++) {
+                auto * src0_row = src0_plane + i0 * src0->get_nb(1);
+                // TODO: figure out how to handle an entire row
+                *dst_row++ =
+                    vec_dot_product_f32_f32(reinterpret_cast<const float *>(src0_row),
+                                            reinterpret_cast<const float *>(src1_row), (size_t) src0->get_ne(0));
             }
         }
     }
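The two regimes of the split heuristic above, in numbers (shapes assumed for illustration):

// With tcnt = 4 threads:
//   out ne = [k, m, 8, 2] -> total_planes = 16 >= 4: threads divide the 16
//       planes among themselves and each scans every row of its own planes.
//   out ne = [k, m, 1, 1] -> total_planes = 1 < 4: all threads visit the one
//       plane and instead divide its m rows via get_thread_work_slice.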
diff --git a/ggml/src/ggml-qnn/npu/device/op_mul_mat.hpp b/ggml/src/ggml-qnn/npu/device/op_mul_mat.hpp
index cc57d3d1fe..fc2eb2c97e 100644
--- a/ggml/src/ggml-qnn/npu/device/op_mul_mat.hpp
+++ b/ggml/src/ggml-qnn/npu/device/op_mul_mat.hpp
@@ -20,7 +20,7 @@ inline bool is_addr_aligned(void * addr) {
     return unaligned_bytes(addr) == 0;
 }
 
-bool mul_mat_f32(tensor * out);
+bool mul_mat_f32(tensor * out, size_t tidx, size_t tcnt);
 
 bool is_mul_mat_supported(const npu_device_tensor_spec & src0, const npu_device_tensor_spec & src1,
                           const npu_device_tensor_spec & dst, npu_device_tensor_op op);
diff --git a/ggml/src/ggml-qnn/npu/device/tensor.hpp b/ggml/src/ggml-qnn/npu/device/tensor.hpp
index 83aa29a609..ad1915ecb6 100644
--- a/ggml/src/ggml-qnn/npu/device/tensor.hpp
+++ b/ggml/src/ggml-qnn/npu/device/tensor.hpp
@@ -81,10 +81,7 @@ class tensor {
     tensor *  _src[kMaxTensorSrc] = {};
     uint8_t * _data               = nullptr;
 
-    tensor(const tensor &) = delete;
-    void operator=(const tensor &) = delete;
-    tensor(tensor &&) = delete;
-    void operator=(tensor &&) = delete;
+    DISABLE_COPY_AND_MOVE(tensor);
 };
 
 }  // namespace hexagon
diff --git a/ggml/src/ggml-qnn/npu/device/thread_pool.hpp b/ggml/src/ggml-qnn/npu/device/thread_pool.hpp
new file mode 100644
index 0000000000..a936ae0c4c
--- /dev/null
+++ b/ggml/src/ggml-qnn/npu/device/thread_pool.hpp
@@ -0,0 +1,190 @@
+#pragma once
+
+#include <qurt.h>
+
+#include <array>
+#include <atomic>
+#include <memory>
+#include <string>
+
+#include "util.hpp"
+
+namespace hexagon {
+
+constexpr const size_t kMaxThreadCount = 4;
+constexpr const size_t kDefaultStackSize = 1024 * 16;  // 16KB
+constexpr const unsigned long long kThreadTaskPendingBit = 1;
+
+template <size_t _stack_size> class qurt_thread {
+  public:
+    typedef void (*qurt_thread_func_type)(qurt_thread * thread, void * arg);
+
+    explicit qurt_thread(const std::string & thread_name, qurt_thread_func_type thread_func, void * arg,
+                         unsigned short priority) {
+        DEVICE_LOG_DEBUG("qurt_thread.create: %s", thread_name.c_str());
+        qurt_thread_attr_init(&_attributes);
+        qurt_thread_attr_set_name(&_attributes, (char *) thread_name.c_str());
+        qurt_thread_attr_set_stack_addr(&_attributes, _stack);
+        qurt_thread_attr_set_stack_size(&_attributes, _stack_size);
+        qurt_thread_attr_set_priority(&_attributes, priority);
+
+        _func    = thread_func;
+        _arg     = arg;
+        auto ret = qurt_thread_create(
+            &_tid, &_attributes, reinterpret_cast<void (*)(void *)>(&qurt_thread::thread_func_impl), (void *) this);
+        if (ret != QURT_EOK) {
+            DEVICE_LOG_ERROR("Failed to create thread: %d", (int) ret);
+            _func = nullptr;
+            _arg  = nullptr;
+            return;
+        }
+
+        DEVICE_LOG_DEBUG("qurt_thread.created: %s, id: %d", thread_name.c_str(), (int) _tid);
+    }
+
+    ~qurt_thread() {
+        DEVICE_LOG_DEBUG("qurt_thread.destroy: %d", (int) _tid);
+        int  thread_exit_code = QURT_EOK;
+        auto ret              = qurt_thread_join(_tid, &thread_exit_code);
+        if (ret != QURT_EOK && ret != QURT_ENOTHREAD) {
+            DEVICE_LOG_ERROR("Failed to join thread: %d", (int) ret);
+            return;
+        }
+
+        if (thread_exit_code != QURT_EOK) {
+            DEVICE_LOG_ERROR("Thread exit code: %d", (int) thread_exit_code);
+        }
+    }
+
+    bool is_valid() const { return _tid != 0 && _func != nullptr; }
+
+  private:
+    static void thread_func_impl(qurt_thread * thread) {
+        if (thread->_func) {
+            thread->_func(thread, thread->_arg);
+        }
+
+        qurt_thread_exit(QURT_EOK);
+    }
+
+    uint8_t               _stack[_stack_size] = {};
+    qurt_thread_t         _tid;
+    qurt_thread_attr_t    _attributes;
+    qurt_thread_func_type _func = nullptr;
+    void *                _arg  = nullptr;
+
+    DISABLE_COPY_AND_MOVE(qurt_thread);
+};
+
+using qurt_thread_ptr = std::unique_ptr<qurt_thread<kDefaultStackSize>>;
+
+template <size_t _thread_count> class thread_pool {
+    static_assert(_thread_count > 1, "Thread count must be greater than 1");
+    constexpr const static size_t kMaxThreadCount = _thread_count - 1;
+
+  public:
+    typedef qurt_thread<kDefaultStackSize> thread_type;
+    typedef void (*task_type)(thread_pool * pool, size_t thread_idx, size_t thread_count, void * arg);
+
+    thread_pool() {
+        std::string thread_name_base = "thread_pool_";
+        qurt_barrier_init(&_pending, kMaxThreadCount + 1);
+        qurt_barrier_init(&_completed, kMaxThreadCount + 1);
+        for (size_t i = 0; i < kMaxThreadCount; ++i) {
+            auto & thread_arg     = _thread_args[i];
+            thread_arg.pool       = this;
+            thread_arg.thread_idx = i + 1;
+
+            auto thread = std::make_unique<thread_type>(
+                thread_name_base + std::to_string(i),
+                reinterpret_cast<thread_type::qurt_thread_func_type>(&thread_pool::thread_func_impl), &thread_arg,
+                QURT_THREAD_ATTR_PRIORITY_DEFAULT);
+            if (!thread->is_valid()) {
+                DEVICE_LOG_ERROR("Failed to create thread: %zu", i);
+                // destroy all barriers and threads at destructor
+                return;
+            }
+
+            _threads[i] = std::move(thread);
+        }
+        DEVICE_LOG_DEBUG("thread_pool.created: %zu", kMaxThreadCount);
+    }
+
+    ~thread_pool() {
+        DEVICE_LOG_DEBUG("thread_pool.destroy");
+        _thread_exit = true;
+        qurt_barrier_wait(&_pending);  // release all task threads
+
+        for (auto & thread : _threads) {
+            thread.reset();
+        }
+
+        qurt_barrier_destroy(&_completed);
+        qurt_barrier_destroy(&_pending);
+    }
+
+    bool sync_execute(task_type task, void * arg) {
+        if (!task) {
+            DEVICE_LOG_ERROR("Invalid task");
+            return false;
+        }
+
+        _task = task;
+        _arg  = arg;
+        qurt_barrier_wait(&_pending);
+
+        task(this, 0, kMaxThreadCount + 1, arg);
+        DEVICE_LOG_DEBUG("main_thread.task_completed: 0");
+
+        qurt_barrier_wait(&_completed);
+
+        _task = nullptr;
+        _arg  = nullptr;
+        return true;
+    }
+
+  private:
+    struct thread_pool_arg {
+        thread_pool * pool       = nullptr;
+        size_t        thread_idx = 0;
+    };
+
+    static void thread_func_impl(thread_type * thread, thread_pool_arg * arg) {
+        NPU_UNUSED(thread);
+
+        DEVICE_LOG_DEBUG("thread_func_impl.start: %zu", arg->thread_idx);
+
+        auto & pool = *arg->pool;
+        for (;;) {
+            qurt_barrier_wait(&pool._pending);
+            if (pool._thread_exit) {
+                DEVICE_LOG_DEBUG("thread_func_impl.exit: %zu", arg->thread_idx);
+                break;
+            }
+
+            auto task = pool._task;
+            if (task) {
+                task(arg->pool, arg->thread_idx, kMaxThreadCount + 1, pool._arg);
+            }
+
+            DEVICE_LOG_DEBUG("thread_func_impl.task_completed: %zu", arg->thread_idx);
+            qurt_barrier_wait(&pool._completed);
+        }
+
+        DEVICE_LOG_DEBUG("thread_func_impl.end: %zu", arg->thread_idx);
+    }
+
+    std::atomic_bool                             _thread_exit = false;
+    std::array<qurt_thread_ptr, kMaxThreadCount> _threads;
+    thread_pool_arg                              _thread_args[kMaxThreadCount] = {};
+    qurt_barrier_t                               _pending   = {};
+    qurt_barrier_t                               _completed = {};
+    task_type                                    _task = nullptr;
+    void *                                       _arg  = nullptr;
+
+    DISABLE_COPY_AND_MOVE(thread_pool);
+};
+
+using default_thread_pool = thread_pool<kMaxThreadCount>;
+
+}  // namespace hexagon
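A usage sketch for the pool's two-barrier handshake (names from this diff; the task body and buffer are illustrative). Each sync_execute round trips both barriers: _pending releases the workers (at destruction, _thread_exit makes them break out instead), the caller runs the task as thread 0, and all participants meet again at _completed.

// Hypothetical task: every participant zeroes its slice of a shared buffer.
static void zero_fill(hexagon::default_thread_pool * pool, size_t tidx, size_t tcnt, void * arg) {
    NPU_UNUSED(pool);
    auto * buf   = static_cast<float *>(arg);
    auto   slice = hexagon::get_thread_work_slice(1024, tidx, tcnt);  // 1024 floats, assumed
    for (int64_t i = slice.first; i < slice.second; ++i) {
        buf[i] = 0.0f;
    }
}

// hexagon::default_thread_pool pool;      // spawns 3 workers; the caller is participant 0
// pool.sync_execute(&zero_fill, buffer);  // blocks until all 4 participants finish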
"hexagon_npu.h" #define DEVICE_LOG_ERROR(...) FARF(FATAL, __VA_ARGS__) @@ -16,9 +20,24 @@ # define DEVICE_LOG_DEBUG(...) (void) 0 #endif +// TODO: reuse the declaration at host +#define DISABLE_COPY(class_name) \ + class_name(const class_name &) = delete; \ + void operator=(const class_name &) = delete + +#define DISABLE_MOVE(class_name) \ + class_name(class_name &&) = delete; \ + void operator=(class_name &&) = delete + +#define DISABLE_COPY_AND_MOVE(class_name) \ + DISABLE_COPY(class_name); \ + DISABLE_MOVE(class_name) + +#define NPU_UNUSED(x) (void) (x) + namespace hexagon { -constexpr const char * op_get_name(npu_device_tensor_op op) { +inline constexpr const char * op_get_name(npu_device_tensor_op op) { switch (op) { case NPU_OP_MUL_MAT: return "MUL_MAT"; @@ -33,4 +52,11 @@ constexpr const char * op_get_name(npu_device_tensor_op op) { } } +inline constexpr std::pair get_thread_work_slice(int64_t total, size_t tidx, size_t tcnt) { + const auto elements_per_thread = (total + tcnt - 1) / tcnt; + const auto start = tidx * elements_per_thread; + const auto end = std::min(start + elements_per_thread, total); + return { start, end }; +} + } // namespace hexagon diff --git a/ggml/src/ggml-qnn/qnn/CMakeLists.txt b/ggml/src/ggml-qnn/qnn/CMakeLists.txt new file mode 100644 index 0000000000..2a9455b864 --- /dev/null +++ b/ggml/src/ggml-qnn/qnn/CMakeLists.txt @@ -0,0 +1,42 @@ + + +file(GLOB qnn_srcs "${CMAKE_CURRENT_LIST_DIR}/*.cpp") + +add_library(qnn-backend STATIC + ${qnn_srcs} +) + +target_include_directories(qnn-backend PRIVATE + ${GGML_QNN_SDK_PATH}/include/QNN/ + ${CMAKE_CURRENT_LIST_DIR}/ + ${CMAKE_CURRENT_LIST_DIR}/../ + ${CMAKE_CURRENT_LIST_DIR}/../../ + ${CMAKE_CURRENT_LIST_DIR}/../../../include/ # TODO: figure out how to remove this + ${CMAKE_CURRENT_LIST_DIR}/../shared/ +) + +target_link_directories(qnn-backend PRIVATE + runtime-common +) + +if(GGML_QNN_ENABLE_CPU_BACKEND) + message("GGML_QNN_ENABLE_CPU_BACKEND is enabled") + target_compile_definitions(qnn-backend PUBLIC GGML_QNN_ENABLE_CPU_BACKEND) +else() + message("GGML_QNN_ENABLE_CPU_BACKEND is disabled") +endif() + +if(CMAKE_SYSTEM_NAME STREQUAL "Android") + set(QNN_DEFAULT_LIB_SEARCH_PATH "/data/local/tmp/" CACHE STRING "customized library search path for QNN backend") +elseif(CMAKE_SYSTEM_NAME STREQUAL "Windows" OR CMAKE_SYSTEM_NAME STREQUAL "Linux") + set(QNN_DEFAULT_LIB_SEARCH_PATH "" CACHE STRING "customized library search path for QNN backend") +else() + message(FATAL_ERROR "QNN now only available on Android, Windows and Linux") +endif() + +if(NOT "${QNN_DEFAULT_LIB_SEARCH_PATH}" STREQUAL "") + string(REGEX REPLACE "/$" "" QNN_DEFAULT_LIB_SEARCH_PATH "${QNN_DEFAULT_LIB_SEARCH_PATH}") +endif() + +message("GGML_QNN_DEFAULT_LIB_SEARCH_PATH: ${QNN_DEFAULT_LIB_SEARCH_PATH}") +target_compile_definitions(qnn-backend PUBLIC GGML_QNN_DEFAULT_LIB_SEARCH_PATH="${QNN_DEFAULT_LIB_SEARCH_PATH}") diff --git a/ggml/src/ggml-qnn/qnn/hexagon/GgmlOpPackage.xml b/ggml/src/ggml-qnn/qnn/hexagon/GgmlOpPackage.xml deleted file mode 100644 index f4c6575902..0000000000 --- a/ggml/src/ggml-qnn/qnn/hexagon/GgmlOpPackage.xml +++ /dev/null @@ -1,88 +0,0 @@ - - - - - GgmlMulMat - - - GGML MulMat operator - - - - - in[0] - - src0 - - true - BACKEND_SPECIFIC - - 4D - NHWC - [N, C, H , W] - - - - - in[1] - - src1 - - true - BACKEND_SPECIFIC - - 4D - NHWC - [N, C, H , W] - - - - - out[0] - - dst - - true - BACKEND_SPECIFIC - - 4D - [N, C, H , W] - - - - - HTP - - - - - - - GgmlMulMat - - - - - GgmlMulMat - - - in[0] - QNN_DATATYPE_FLOAT_16 - 
QNN_DATATYPE_FLOAT_32 - - - in[1] - QNN_DATATYPE_FLOAT_16 - QNN_DATATYPE_FLOAT_32 - - - - out[0] - QNN_DATATYPE_FLOAT_16 - QNN_DATATYPE_FLOAT_32 - - - - - - diff --git a/ggml/src/ggml-qnn/qnn/hexagon/GgmlOpPackage/Makefile b/ggml/src/ggml-qnn/qnn/hexagon/GgmlOpPackage/Makefile deleted file mode 100644 index f177822d35..0000000000 --- a/ggml/src/ggml-qnn/qnn/hexagon/GgmlOpPackage/Makefile +++ /dev/null @@ -1,357 +0,0 @@ -# check all setup prerequisites if the command goal is not clean -ifneq ($(MAKECMDGOALS),clean) -ifndef QNN_INCLUDE -$(info "INFO: Qnn include not explicitly defined, attempting to use QNN_SDK_ROOT if it is valid") -QNN_INCLUDE := $(QNN_SDK_ROOT)/include/QNN -endif -ifeq ($(wildcard $(QNN_INCLUDE)),) -$(error "ERROR: QNN_INCLUDE path is not set. QNN include paths must be set to obtain BE headers necessary to compile the package") -endif -ifndef QNN_TARGET_LIB -$(info "INFO: Qnn target not explicitly defined, attempting to use QNN_SDK_ROOT if it is valid") -QNN_TARGET_LIB := $(QNN_SDK_ROOT)/lib/aarch64-android -endif -ifeq ($(wildcard $(QNN_TARGET_LIB)),) -ifeq ($(MAKECMDGOALS),htp_aarch64) -$(error "ERROR: QNN_TARGET_LIB is needed to compile package for aarch64") -else ifeq ($(MAKECMDGOALS),all) -$(info "WARNING:QNN_TARGET_LIB may need to be defined to compile packages") -endif -endif - -ifndef HEXAGON_SDK_ROOT -$(error "ERROR: HEXAGON_SDK_ROOT is not set. Hexagon-SDK path must be set to the latest hexagon-sdk-x.y.z") -endif - -ifeq ($(wildcard $(HEXAGON_SDK_ROOT)),) -$(error "ERROR: HEXAGON_SDK_ROOT is not set correctly. Please set HEXAGON_SDK_ROOT to latest hexagon-sdk-X.Y.Z path") -endif - -HEXAGON_SDK_BASE := $(dir $(HEXAGON_SDK_ROOT)) - -$(info "HEXAGON_SDK_ROOT is [${HEXAGON_SDK_ROOT}]") -# Users should note that the tools version may change between hexagon sdk versions -# Following combination of SDK and Tool version is supported -# fix the sdk root for new versions -HEXAGON_SDK_ROOT_V68 := $(HEXAGON_SDK_ROOT) -HEXAGON_SDK_ROOT_V69 := $(HEXAGON_SDK_ROOT) -HEXAGON_SDK_ROOT_V73 := $(HEXAGON_SDK_ROOT) -HEXAGON_SDK_ROOT_V75 := $(HEXAGON_SDK_ROOT) -HEXAGON_SDK_ROOT_V79 := $(HEXAGON_SDK_ROOT) - -#Updated to point to latest sdk to match with libQnnHtp.so -HEXAGON_SDK_ROOT_X86 := $(HEXAGON_SDK_ROOT) -HEXAGON_TOOLS_VERSION_V68 := 8.7.06 -HEXAGON_TOOLS_VERSION_V69 := 8.7.06 -HEXAGON_TOOLS_VERSION_V73 := 8.7.06 -HEXAGON_TOOLS_VERSION_V75 := 8.7.06 -HEXAGON_TOOLS_VERSION_V79 := 8.7.06 - -#Updated to point to latest sdk to match with libQnnHtp.so -HEXAGON_TOOLS_VERSION_X86 := 8.7.06 - -ifndef ANDROID_NDK_ROOT -ifeq ($(MAKECMDGOALS),htp_aarch64) -$(error "ERROR: ANDROID_NDK_ROOT is not set. Android NDK path must be set to compile package for aarch64") -else ifeq ($(MAKECMDGOALS),all) -$(info "WARNING: ANDROID_NDK_ROOT is not set. Android NDK path must be set to compile package for aarch64") -endif -endif - -ifndef PACKAGE_NAME -export -PACKAGE_NAME := $(notdir $(shell pwd)) -$(info "INFO: No package name defined. 
Using current directory name: $(PACKAGE_NAME) as the package name") -endif - -WORK := build -SRC_DIR := src -OP_SRC_DIR := src/ops -OP_INCLUDE_DIR := ./include -OP_INCLUDES = #$(wildcard $(OP_INCLUDE_DIR)/*.h) user defined if any op specific headers are needed, add -I to common flags -LIBRARY_NAME := libQnn$(PACKAGE_NAME).so -SUPPORTED_TARGETS = x86_64-linux-clang hexagon-v68 hexagon-v69 hexagon-v73 hexagon-v75 hexagon-v79 aarch64-android - - -COMMON_CXX_FLAGS = -std=c++17 -I$(QNN_INCLUDE) -fPIC -Wall -Wreorder -Wno-missing-braces -Wno-unused-function -COMMON_CXX_FLAGS += -Werror -Wno-format -Wno-unused-command-line-argument -fvisibility=default -stdlib=libc++ -COMMON_CXX_FLAGS += -DQNN_API="__attribute__((visibility(\"default\")))" -D__QAIC_HEADER_EXPORT="__attribute__((visibility(\"default\")))" - -X86_LIBNATIVE_RELEASE_DIR := $(HEXAGON_SDK_ROOT_X86)/tools/HEXAGON_Tools/$(HEXAGON_TOOLS_VERSION_X86)/Tools - -# Ensure hexagon sdk tool version can be retrieved -ifeq ($(wildcard $(X86_LIBNATIVE_RELEASE_DIR)/.),) -$(error "Cannot retrieve hexagon tools from: $(X86_LIBNATIVE_RELEASE_DIR). \ - \ - Please check that hexagon tools version is correct. Expected: $(HEXAGON_TOOLS_VERSION_X86)") -endif - -#Check tools for hexagon_v68 are present. -ifeq ($(MAKECMDGOALS),htp_v68) -ifeq ($(wildcard $(HEXAGON_SDK_ROOT_V68)),) -$(error "ERROR: HEXAGON_SDK_ROOT_V68 is set incorrectly. Cannot retrieve $(HEXAGON_SDK_ROOT_V68)") -endif -endif - -ifeq ($(MAKECMDGOALS),htp_v69) -ifeq ($(wildcard $(HEXAGON_SDK_ROOT_V69)),) -$(error "ERROR: HEXAGON_SDK_ROOT_V69 is set incorrectly. Cannot retrieve $(HEXAGON_SDK_ROOT_V69)") -endif -endif - -ifeq ($(MAKECMDGOALS),htp_v73) -ifeq ($(wildcard $(HEXAGON_SDK_ROOT_V73)),) -$(error "ERROR: HEXAGON_SDK_ROOT_V73 is set incorrectly. Cannot retrieve $(HEXAGON_SDK_ROOT_V73)") -endif -endif - -ifeq ($(MAKECMDGOALS),htp_v75) -ifeq ($(wildcard $(HEXAGON_SDK_ROOT_V75)),) -$(error "ERROR: HEXAGON_SDK_ROOT_V75 is set incorrectly. Cannot retrieve $(HEXAGON_SDK_ROOT_V75)") -endif -endif - -#Check tools for hexagon_v79 are present. -ifeq ($(MAKECMDGOALS),htp_v79) -ifeq ($(wildcard $(HEXAGON_SDK_ROOT_V79)),) -$(error "ERROR: HEXAGON_SDK_ROOT_V79 is set incorrectly. 
Cannot retrieve $(HEXAGON_SDK_ROOT_V79)") -endif -endif - - - -endif -OP_SOURCES = $(wildcard $(OP_SRC_DIR)/*.cpp) -OTHER_SOURCES = $(wildcard $(SRC_DIR)/*.cpp) -HFILES = $(wildcard $(QNN_INCLUDE)/*.h) -HFILES += $(wildcard $(QNN_INCLUDE)/HTP/*.h) -HFILES += $(wildcard $(QNN_INCLUDE)/HTP/core/*.h) -OP_OBJS = $(patsubst $(SRC_DIR)/%,%,$(patsubst %.cpp,%.o,$(OP_SOURCES))) -OTHER_OBJS = $(patsubst $(SRC_DIR)/%,%,$(patsubst %.cpp,%.o,$(OTHER_SOURCES))) - -#======= Assembly ======== -OP_SOURCES_ASM_X86 += $(wildcard $(OP_SRC_DIR)/x86_asm/*.S) -OP_OBJS_ASM_X86 += $(subst /x86_asm/,/,$(patsubst $(SRC_DIR)/%,%,$(patsubst %.S,%.o,$(OP_SOURCES_ASM_X86)))) -OP_SOURCES_ASM_V68 += $(wildcard $(OP_SRC_DIR)/v68_asm/*.S) -OP_OBJS_ASM_V68 += $(subst /v68_asm/,/,$(patsubst $(SRC_DIR)/%,%,$(patsubst %.S,%.o,$(OP_SOURCES_ASM_V68)))) -OP_SOURCES_ASM_V69 += $(wildcard $(OP_SRC_DIR)/v69_asm/*.S) -OP_OBJS_ASM_V69 += $(subst /v69_asm/,/,$(patsubst $(SRC_DIR)/%,%,$(patsubst %.S,%.o,$(OP_SOURCES_ASM_V69)))) -OP_SOURCES_ASM_V73 += $(wildcard $(OP_SRC_DIR)/v73_asm/*.S) -OP_OBJS_ASM_V73 += $(subst /v73_asm/,/,$(patsubst $(SRC_DIR)/%,%,$(patsubst %.S,%.o,$(OP_SOURCES_ASM_V73)))) -OP_SOURCES_ASM_V75 += $(wildcard $(OP_SRC_DIR)/v75_asm/*.S) -OP_OBJS_ASM_V75 += $(subst /v75_asm/,/,$(patsubst $(SRC_DIR)/%,%,$(patsubst %.S,%.o,$(OP_SOURCES_ASM_V75)))) -OP_SOURCES_ASM_V79 += $(wildcard $(OP_SRC_DIR)/v79_asm/*.S) -OP_OBJS_ASM_V79 += $(subst /v79_asm/,/,$(patsubst $(SRC_DIR)/%,%,$(patsubst %.S,%.o,$(OP_SOURCES_ASM_V79)))) - -OP_SOURCES_ASM_ANDROID += $(wildcard $(OP_SRC_DIR)/android_asm/*.S) -OP_OBJS_ASM_ANDROID += $(subst /android_asm/,/,$(patsubst $(SRC_DIR)/%,%,$(patsubst %.S,%.o,$(OP_SOURCES_ASM_ANDROID)))) - - -all: htp_v68 htp_x86 htp_aarch64 - -#============================================================================================================ -# Setup compiler, compiler instructions and linker for x86 -X86_CXX ?= clang++-9 -# Checking if clang++-9 is present. 
If not switch to clang++ -ifeq ($(shell $(X86_CXX) -v 2>&1 | grep -c "clang version"), 0) - X86_CXX := clang++ -endif -X86_LDFLAGS:= -Wl,--whole-archive -L$(X86_LIBNATIVE_RELEASE_DIR)/libnative/lib -lnative -Wl,--no-whole-archive -lpthread -L$(QNN_SDK_ROOT)/lib/x86_64-linux-clang -lHtpPrepare -X86_C_FLAGS := -D__HVXDBL__ -I$(X86_LIBNATIVE_RELEASE_DIR)/libnative/include -ffast-math -DUSE_OS_LINUX -X86_CXX_FLAGS = $(COMMON_CXX_FLAGS) $(X86_C_FLAGS) -fomit-frame-pointer -Wno-invalid-offsetof -linux_objs = -#============================================================================================================ -# Setup compiler, compiler instructions and linker for hexagon -HEXAGON_CXX_FLAGS := $(COMMON_CXX_FLAGS) -mhvx -mhvx-length=128B -mhmx -DUSE_OS_QURT -O2 -Wno-reorder -DPREPARE_DISABLED - -HEXAGON_CXX_FLAGS_V68 := $(HEXAGON_CXX_FLAGS) -mv68 -I$(HEXAGON_SDK_ROOT_V68)/rtos/qurt/computev68/include/qurt -I$(HEXAGON_SDK_ROOT_V68)/rtos/qurt/computev68/include/posix -I$(HEXAGON_SDK_ROOT_V68)/incs -I$(HEXAGON_SDK_ROOT_V68)/incs/stddef -HEXAGON_CXX_FLAGS_V69 := $(HEXAGON_CXX_FLAGS) -mv69 -I$(HEXAGON_SDK_ROOT_V69)/rtos/qurt/computev69/include/qurt -I$(HEXAGON_SDK_ROOT_V69)/rtos/qurt/computev69/include/posix -I$(HEXAGON_SDK_ROOT_V69)/incs -I$(HEXAGON_SDK_ROOT_V69)/incs/stddef -HEXAGON_CXX_FLAGS_V73 := $(HEXAGON_CXX_FLAGS) -mv73 -I$(HEXAGON_SDK_ROOT_V73)/rtos/qurt/computev73/include/qurt -I$(HEXAGON_SDK_ROOT_V73)/rtos/qurt/computev73/include/posix -I$(HEXAGON_SDK_ROOT_V73)/incs -I$(HEXAGON_SDK_ROOT_V73)/incs/stddef -HEXAGON_CXX_FLAGS_V75 := $(HEXAGON_CXX_FLAGS) -mv75 -I$(HEXAGON_SDK_ROOT_V75)/rtos/qurt/computev75/include/qurt -I$(HEXAGON_SDK_ROOT_V75)/rtos/qurt/computev75/include/posix -I$(HEXAGON_SDK_ROOT_V75)/incs -I$(HEXAGON_SDK_ROOT_V75)/incs/stddef -HEXAGON_CXX_FLAGS_V79 := $(HEXAGON_CXX_FLAGS) -mv79 -I$(HEXAGON_SDK_ROOT_V79)/rtos/qurt/computev79/include/qurt -I$(HEXAGON_SDK_ROOT_V79)/rtos/qurt/computev79/include/posix -I$(HEXAGON_SDK_ROOT_V79)/incs -I$(HEXAGON_SDK_ROOT_V79)/incs/stddef - - -HEXAGON_CXX_V68 := $(HEXAGON_SDK_ROOT_V68)/tools/HEXAGON_Tools/$(HEXAGON_TOOLS_VERSION_V68)/Tools/bin/hexagon-clang++ -HEXAGON_CXX_V69 := $(HEXAGON_SDK_ROOT_V69)/tools/HEXAGON_Tools/$(HEXAGON_TOOLS_VERSION_V69)/Tools/bin/hexagon-clang++ -HEXAGON_CXX_V73 := $(HEXAGON_SDK_ROOT_V73)/tools/HEXAGON_Tools/$(HEXAGON_TOOLS_VERSION_V73)/Tools/bin/hexagon-clang++ -HEXAGON_CXX_V75 := $(HEXAGON_SDK_ROOT_V75)/tools/HEXAGON_Tools/$(HEXAGON_TOOLS_VERSION_V75)/Tools/bin/hexagon-clang++ -HEXAGON_CXX_V79 := $(HEXAGON_SDK_ROOT_V79)/tools/HEXAGON_Tools/$(HEXAGON_TOOLS_VERSION_V79)/Tools/bin/hexagon-clang++ - - -HEX_LDFLAGS = -hexagon_objs = -#============================================================================================================ -# Setup compiler, compiler instructions and linker for aarch64 -AARCH64_C__FLAGS = -D__HVXDBL__ -I$(X86_LIBNATIVE_RELEASE_DIR)/libnative/include -ffast-math -DUSE_OS_LINUX -DANDROID -AARCH64_CXX_FLAGS = $(COMMON_CXX_FLAGS) $(AARCH64_C__FLAGS) -fomit-frame-pointer -Wno-invalid-offsetof -Wno-unused-variable -Wno-unused-parameter -Wno-missing-braces -Wno-sign-compare -Wno-unused-private-field -Wno-unused-variable -Wno-ignored-qualifiers -Wno-missing-field-initializers -ARM_CLANG_OPTS =--target=aarch64-none-linux-android21 --sysroot=$(ANDROID_NDK_ROOT)/toolchains/llvm/prebuilt/linux-x86_64/sysroot -stdlib=libc++ -static-libstdc++ -AARCH64_CXX = $(ANDROID_NDK_ROOT)/toolchains/llvm/prebuilt/linux-x86_64/bin/clang++ $(ARM_CLANG_OPTS) -AARCH64_LDFLAGS = -L$(QNN_TARGET_LIB) -lQnnHtp 
-lQnnHtpPrepare -aarch64_objs = -#============================================================================================================ -# Setup targets and goals - -htp_x86: X86_BUILD - -htp_v68: HEXAGON_BUILD_V68 - -htp_v69: HEXAGON_BUILD_V69 - -htp_v73: HEXAGON_BUILD_V73 - -htp_v75: HEXAGON_BUILD_V75 - -htp_v79: HEXAGON_BUILD_V79 - - - -htp_aarch64: AARCH64_BUILD - -AARCH64_BUILD: $(WORK)/aarch64-android/$(LIBRARY_NAME) - -HEXAGON_BUILD_V68: $(WORK)/hexagon-v68/$(LIBRARY_NAME) - -HEXAGON_BUILD_V69: $(WORK)/hexagon-v69/$(LIBRARY_NAME) - -HEXAGON_BUILD_V73: $(WORK)/hexagon-v73/$(LIBRARY_NAME) - -HEXAGON_BUILD_V75: $(WORK)/hexagon-v75/$(LIBRARY_NAME) - -HEXAGON_BUILD_V79: $(WORK)/hexagon-v79/$(LIBRARY_NAME) - - - -X86_BUILD: $(WORK)/x86_64-linux-clang/$(LIBRARY_NAME) - - -define build_objs = -ifneq ($(filter $(2),$(SUPPORTED_TARGETS)),) -$(2)_objs += $(foreach x,$(1),$(WORK)/$(2)/$(x)) -else -$$(error "Unknown target option provided: $(2): Supported targets are: $(SUPPORTED_TARGETS)") -endif -endef - -$(eval $(call build_objs,$(OTHER_OBJS),x86_64-linux-clang)) -$(eval $(call build_objs,$(OP_OBJS),x86_64-linux-clang)) -$(eval $(call build_objs,$(OP_OBJS_ASM_X86),x86_64-linux-clang)) -$(eval $(call build_objs,$(OTHER_OBJS),hexagon-v68)) -$(eval $(call build_objs,$(OP_OBJS),hexagon-v68)) -$(eval $(call build_objs,$(OP_OBJS_ASM_V68),hexagon-v68)) -$(eval $(call build_objs,$(OTHER_OBJS),hexagon-v69)) -$(eval $(call build_objs,$(OP_OBJS),hexagon-v69)) -$(eval $(call build_objs,$(OP_OBJS_ASM_V69),hexagon-v69)) -$(eval $(call build_objs,$(OTHER_OBJS),hexagon-v73)) -$(eval $(call build_objs,$(OP_OBJS),hexagon-v73)) -$(eval $(call build_objs,$(OP_OBJS_ASM_V73),hexagon-v73)) -$(eval $(call build_objs,$(OTHER_OBJS),hexagon-v75)) -$(eval $(call build_objs,$(OP_OBJS),hexagon-v75)) -$(eval $(call build_objs,$(OP_OBJS_ASM_V75),hexagon-v75)) -$(eval $(call build_objs,$(OTHER_OBJS),hexagon-v79)) -$(eval $(call build_objs,$(OP_OBJS),hexagon-v79)) -$(eval $(call build_objs,$(OP_OBJS_ASM_V75),hexagon-v79)) - -$(eval $(call build_objs,$(OTHER_OBJS),aarch64-android)) -$(eval $(call build_objs,$(OP_OBJS),aarch64-android)) -$(eval $(call build_objs,$(OP_OBJS_ASM_ANDROID),aarch64-android)) - -# x86 -$(WORK)/x86_64-linux-clang $(WORK)/hexagon-v68 $(WORK)/hexagon-v69 $(WORK)/hexagon-v73 $(WORK)/hexagon-v75 $(WORK)/hexagon-v79 $(WORK)/aarch64-android: - @mkdir -p $@/ops - -$(WORK)/x86_64-linux-clang/%.o: $(SRC_DIR)/%.cpp | $(WORK)/x86_64-linux-clang - $(X86_CXX) $(X86_CXX_FLAGS) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@ - -$(WORK)/x86_64-linux-clang/ops/%.o: $(OP_SRC_DIR)/%.cpp | $(WORK)/x86_64-linux-clang - $(X86_CXX) $(X86_CXX_FLAGS) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@ - -$(WORK)/x86_64-linux-clang/ops/%.o: $(OP_SRC_DIR)/x86_asm/%.S | $(WORK)/x86_64-linux-clang - $(X86_CXX) $(X86_CXX_FLAGS) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@ - -$(WORK)/x86_64-linux-clang/$(LIBRARY_NAME): $(x86_64-linux-clang_objs) | $(HFILES) - $(X86_CXX) -fPIC -std=c++17 -g -shared -o $@ $^ $(X86_LDFLAGS) - -# v68 -$(WORK)/hexagon-v68/%.o: $(SRC_DIR)/%.cpp | $(WORK)/hexagon-v68 - $(HEXAGON_CXX_V68) $(HEXAGON_CXX_FLAGS_V68) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@ - -$(WORK)/hexagon-v68/ops/%.o: $(OP_SRC_DIR)/%.cpp | $(WORK)/hexagon-v68 - $(HEXAGON_CXX_V68) $(HEXAGON_CXX_FLAGS_V68) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@ - -$(WORK)/hexagon-v68/ops/%.o: $(OP_SRC_DIR)/v68_asm/%.S | $(WORK)/hexagon-v68 - $(HEXAGON_CXX_V68) $(HEXAGON_CXX_FLAGS_V68) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@ 
- -$(WORK)/hexagon-v68/$(LIBRARY_NAME): $(hexagon-v68_objs) | $(HFILES) - $(HEXAGON_CXX_V68) -fPIC -std=c++17 -g -shared -o $@ $^ $(HEX_LDFLAGS) - -# v69 -$(WORK)/hexagon-v69/%.o: $(SRC_DIR)/%.cpp | $(WORK)/hexagon-v69 - $(HEXAGON_CXX_V69) $(HEXAGON_CXX_FLAGS_V69) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@ - -$(WORK)/hexagon-v69/ops/%.o: $(OP_SRC_DIR)/%.cpp | $(WORK)/hexagon-v69 - $(HEXAGON_CXX_V69) $(HEXAGON_CXX_FLAGS_V69) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@ - -$(WORK)/hexagon-v69/ops/%.o: $(OP_SRC_DIR)/v69_asm/%.S | $(WORK)/hexagon-v69 - $(HEXAGON_CXX_V69) $(HEXAGON_CXX_FLAGS_V69) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@ - -$(WORK)/hexagon-v69/$(LIBRARY_NAME): $(hexagon-v69_objs) | $(HFILES) - $(HEXAGON_CXX_V69) -fPIC -std=c++17 -g -shared -o $@ $^ $(HEX_LDFLAGS) - -# v73 -$(WORK)/hexagon-v73/%.o: $(SRC_DIR)/%.cpp | $(WORK)/hexagon-v73 - $(HEXAGON_CXX_V73) $(HEXAGON_CXX_FLAGS_V73) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@ - -$(WORK)/hexagon-v73/ops/%.o: $(OP_SRC_DIR)/%.cpp | $(WORK)/hexagon-v73 - $(HEXAGON_CXX_V73) $(HEXAGON_CXX_FLAGS_V73) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@ - -$(WORK)/hexagon-v73/ops/%.o: $(OP_SRC_DIR)/v73_asm/%.S | $(WORK)/hexagon-v73 - $(HEXAGON_CXX_V73) $(HEXAGON_CXX_FLAGS_V73) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@ - -$(WORK)/hexagon-v73/$(LIBRARY_NAME): $(hexagon-v73_objs) | $(HFILES) - $(HEXAGON_CXX_V73) -fPIC -std=c++17 -g -shared -o $@ $^ $(HEX_LDFLAGS) - -#v75 -$(WORK)/hexagon-v75/%.o: $(SRC_DIR)/%.cpp | $(WORK)/hexagon-v75 - $(HEXAGON_CXX_V75) $(HEXAGON_CXX_FLAGS_V75) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@ - -$(WORK)/hexagon-v75/ops/%.o: $(OP_SRC_DIR)/%.cpp | $(WORK)/hexagon-v75 - $(HEXAGON_CXX_V75) $(HEXAGON_CXX_FLAGS_V75) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@ - -$(WORK)/hexagon-v75/ops/%.o: $(OP_SRC_DIR)/v75_asm/%.S | $(WORK)/hexagon-v75 - $(HEXAGON_CXX_V75) $(HEXAGON_CXX_FLAGS_V75) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@ - -$(WORK)/hexagon-v75/$(LIBRARY_NAME): $(hexagon-v75_objs) | $(HFILES) - $(HEXAGON_CXX_V75) -fPIC -std=c++17 -g -shared -o $@ $^ $(HEX_LDFLAGS) - -#v79 -$(WORK)/hexagon-v79/%.o: $(SRC_DIR)/%.cpp | $(WORK)/hexagon-v79 - $(HEXAGON_CXX_V79) $(HEXAGON_CXX_FLAGS_V79) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@ - -$(WORK)/hexagon-v79/ops/%.o: $(OP_SRC_DIR)/%.cpp | $(WORK)/hexagon-v79 - $(HEXAGON_CXX_V79) $(HEXAGON_CXX_FLAGS_V79) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@ - -$(WORK)/hexagon-v79/ops/%.o: $(OP_SRC_DIR)/v79_asm/%.S | $(WORK)/hexagon-v79 - $(HEXAGON_CXX_V79) $(HEXAGON_CXX_FLAGS_V79) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@ - -$(WORK)/hexagon-v79/$(LIBRARY_NAME): $(hexagon-v79_objs) | $(HFILES) - $(HEXAGON_CXX_V79) -fPIC -std=c++17 -g -shared -o $@ $^ $(HEX_LDFLAGS) - - - -# aarch64 -$(WORK)/aarch64-android/%.o: $(SRC_DIR)/%.cpp | $(WORK)/aarch64-android - $(AARCH64_CXX) $(AARCH64_CXX_FLAGS) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@ - -$(WORK)/aarch64-android/ops/%.o: $(OP_SRC_DIR)/%.cpp | $(WORK)/aarch64-android - $(AARCH64_CXX) $(AARCH64_CXX_FLAGS) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@ - -$(WORK)/aarch64-android/ops/%.o: $(OP_SRC_DIR)/android_asm/%.S | $(WORK)/aarch64-android - $(AARCH64_CXX) $(AARCH64_CXX_FLAGS) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@ - -$(WORK)/aarch64-android/$(LIBRARY_NAME): $(aarch64-android_objs) | $(HFILES) - $(AARCH64_CXX) -fPIC -std=c++17 -g -shared -o $@ $^ $(AARCH64_LDFLAGS) - -clean: - -rm -rf $(WORK) - -.PHONY: all clean diff --git 
a/ggml/src/ggml-qnn/qnn/hexagon/GgmlOpPackage/config/GgmlOpPackage.xml b/ggml/src/ggml-qnn/qnn/hexagon/GgmlOpPackage/config/GgmlOpPackage.xml
deleted file mode 100644
index f4c6575902..0000000000
--- a/ggml/src/ggml-qnn/qnn/hexagon/GgmlOpPackage/config/GgmlOpPackage.xml
+++ /dev/null
@@ -1,88 +0,0 @@
-
-
-
-
-    GgmlMulMat
-
-
-    GGML MulMat operator
-
-
-
-
-    in[0]
-
-    src0
-
-    true
-    BACKEND_SPECIFIC
-
-    4D
-    NHWC
-    [N, C, H , W]
-
-
-
-
-    in[1]
-
-    src1
-
-    true
-    BACKEND_SPECIFIC
-
-    4D
-    NHWC
-    [N, C, H , W]
-
-
-
-
-    out[0]
-
-    dst
-
-    true
-    BACKEND_SPECIFIC
-
-    4D
-    [N, C, H , W]
-
-
-
-
-    HTP
-
-
-
-
-
-
-    GgmlMulMat
-
-
-
-
-    GgmlMulMat
-
-
-    in[0]
-    QNN_DATATYPE_FLOAT_16
-    QNN_DATATYPE_FLOAT_32
-
-
-    in[1]
-    QNN_DATATYPE_FLOAT_16
-    QNN_DATATYPE_FLOAT_32
-
-
-
-    out[0]
-    QNN_DATATYPE_FLOAT_16
-    QNN_DATATYPE_FLOAT_32
-
-
-
-
-
diff --git a/ggml/src/ggml-qnn/qnn/hexagon/GgmlOpPackage/src/GgmlOpPackageInterface.cpp b/ggml/src/ggml-qnn/qnn/hexagon/GgmlOpPackage/src/GgmlOpPackageInterface.cpp
deleted file mode 100644
index df9ab36420..0000000000
--- a/ggml/src/ggml-qnn/qnn/hexagon/GgmlOpPackage/src/GgmlOpPackageInterface.cpp
+++ /dev/null
@@ -1,274 +0,0 @@
-//==============================================================================
-// Auto Generated Code for GgmlOpPackage
-//==============================================================================
-
-#include "HTP/QnnHtpCommon.h"
-#include "HTP/core/constraints.h"
-#include "HTP/core/op_package_feature_support.h"
-#include "HTP/core/op_register_ext.h"
-#include "HTP/core/optimize.h"
-#include "HTP/core/simple_reg.h"
-#include "HTP/core/unique_types.h"
-#include "QnnOpPackage.h"
-#include "QnnSdkBuildId.h"
-
-DEFINE_UNIQ_TY()
-BEGIN_PKG_OPS_OPTS_LIST()
-
-/** Note that the order of declarations given here defines the order in which ops and graph optimizations are
- * registered to the HTP Core.
- * Append the latest OpName at the bottom
- */
-DECLARE_PKG_OPS_OPTS_LIST(PKG_GgmlMulMat)
-
-END_PKG_OPS_OPTS_LIST()
-
-// op package info
-static constexpr auto sg_packageName = THIS_PKG_NAME_STR;  // package name passed in as compile flag
-
-static std::array<const char *, 1> sg_opNames{{"GgmlMulMat"}};
-
-static Qnn_ApiVersion_t sg_sdkApiVersion = QNN_HTP_API_VERSION_INIT;
-static QnnOpPackage_Info_t sg_packageInfo = QNN_OP_PACKAGE_INFO_INIT;
-
-// global data
-static QnnOpPackage_GlobalInfrastructure_t sg_globalInfra =
-nullptr;  // global infrastructure not in use for now
-static bool sg_packageInitialized = false;
-
-/*
- * user provided logging call back function
- * currently only supported on linux x86-64 and nonrpc versions
- * typedef void (*QnnLog_Callback_t)(const char* fmt,
- *                                   QnnLog_Level_t level,
- *                                   uint64_t timestamp,
- *                                   va_list args);
- * usage: if(sg_logInitialized && level <= sg_maxLogLevel)
- *            sg_logCallback(fmt, level, timestamp, args);
- *
- * for cross rpc versions, skel side user provided logging call back function
- * can be defined as part of op packages.
maximal log level sg_maxLogLevel - * can be set by Qnn_ErrorHandle_t GgmlOpPackageLogSetLevel(QnnLog_Level_t maxLogLevel) - */ -/* - * for alternative logging method provided by HTP core, please refer to log.h - */ -static QnnLog_Callback_t sg_logCallback = - nullptr; // user provided call back function pointer for logging -static QnnLog_Level_t sg_maxLogLevel = - (QnnLog_Level_t)0; // maximal log level used in user provided logging -static bool sg_logInitialized = - false; // tracks whether user provided logging method has been initialized - - -/* -* op initialization -* needs to be global in the package -* one initialization per package before any op definitions -* syntax: INIT_PACKAGE_OP_DEF() -*/ -INIT_PACKAGE_OP_DEF() - -/* -* optimization initialization -* needs to be global in the package -* one initialization per package before any optimization definitions -* syntax: INIT_PACKAGE_OPTIMIZATION_DEF() -*/ -INIT_PACKAGE_OPTIMIZATION_DEF() - -/* - * op parameter order initialization - * needs to be global in the package - * one initialization per package before any op parameter order definitions - * syntax: INIT_PACKAGE_PARAM_ORDER_DEF() - */ -INIT_PACKAGE_PARAM_ORDER_DEF() - -/* - * axis parameter name list - * optional - * needs to be global in the package - * one list per package - * for listing axis parameter names passed into Qnn_AddNode API - * HTP backend auto-adjusts values in axis parameters based on HTP backfilling - * note: HTP backend backfills tensor dimensions to 4 dimensions - * syntax: LIST_PACKAGE_AXIS_PARAMS(...) - * e.g. LIST_PACKAGE_AXIS_PARAMS("Axis", "AXIS", "axis") - */ -// LIST_PACKAGE_AXIS_PARAMS() - -/* - * per-channel quantized op name list - * optional - * needs to be global in the package - * one list per package - * for listing op names which support per-channel quantization - * per-axis quantization info of an op is embeded in axisScaleOffsetEncoding - * inside Qnn_Tensor_t types - * HTP backend only supports per-channel scale ops - * i.e. along last dimension, offset is always zero - * if an op name is marked as having per-channel scale support, and in - * QNN_AddNode, at least one input, parameter, or output has - * QNN_QUANTIZATION_ENCODING_AXIS_SCALE_OFFSET type: - * then: - * HTP backend will pass to op implementation function the following: - * output(s), input(s), parameter(s), - * outputPerChannelScale(s), inputPerChannelScale(s), paramPerChannelScale(s) - * - * optimization rules can be used to remove extra perChannelScale tensors - * - * syntax: LIST_PACKAGE_PER_CHANNEL_QUANTIZED_OPS(...) - * e.g. LIST_PACKAGE_PER_CHANNEL_QUANTIZED_OPS(sg_op1Name, sg_op2Name) - */ - -// LIST_PACKAGE_PER_CHANNEL_QUANTIZED_OPS() - -/* -* Declare and define the special intialize function for HTP Backend to load -*/ -INIT_PKG_CORE_INIT_FUNC() - -/* op package API's */ - -Qnn_ErrorHandle_t GgmlOpPackageInit(QnnOpPackage_GlobalInfrastructure_t infrastructure) { - if (sg_packageInitialized) return QNN_OP_PACKAGE_ERROR_LIBRARY_ALREADY_INITIALIZED; - - /* - * op parameter order registration - * registers all defined op parameter orders in the package - * syntax: REGISTER_PACKAGE_PARAM_ORDERS() - */ - REGISTER_PACKAGE_PARAM_ORDERS() - - /* - * op axis parameter name registration - * registers all axis parameter names in the package - * used with LIST_PACKAGE_AXIS_PARAMS(...) 
- * syntax: REGISTER_PACKAGE_AXIS_PARAMS() - */ - REGISTER_PACKAGE_AXIS_PARAMS() - - /* - * per-channel scale op name registration - * registers all per-channel scale op names in the package - * used with LIST_PACKAGE_PER_CHANNEL_QUANTIZED_OPS(...) - * syntax: REGISTER_PACKAGE_PER_CHANNEL_QUANTIZED_OPS() - */ - REGISTER_PACKAGE_PER_CHANNEL_QUANTIZED_OPS() - - sg_globalInfra = infrastructure; - sg_packageInitialized = true; - return QNN_SUCCESS; -} - -Qnn_ErrorHandle_t GgmlOpPackageGetInfo(const QnnOpPackage_Info_t** info) { - if (!sg_packageInitialized) return QNN_OP_PACKAGE_ERROR_LIBRARY_NOT_INITIALIZED; - if (!info) return QNN_OP_PACKAGE_ERROR_INVALID_INFO; - - sg_packageInfo = QNN_OP_PACKAGE_INFO_INIT; - sg_packageInfo.packageName = sg_packageName; - sg_packageInfo.operationNames = sg_opNames.data(); - sg_packageInfo.numOperations = sg_opNames.size(); - sg_packageInfo.sdkBuildId = QNN_SDK_BUILD_ID; - sg_packageInfo.sdkApiVersion = &sg_sdkApiVersion; - - *info = &sg_packageInfo; - return QNN_SUCCESS; -} - -Qnn_ErrorHandle_t GgmlOpPackageLogInitialize(QnnLog_Callback_t callback, QnnLog_Level_t maxLogLevel) { - if (sg_logInitialized) return QNN_OP_PACKAGE_ERROR_LIBRARY_ALREADY_INITIALIZED; - if (!callback) return QNN_LOG_ERROR_INVALID_ARGUMENT; - if (maxLogLevel < QNN_LOG_LEVEL_ERROR) return QNN_LOG_ERROR_INVALID_ARGUMENT; - sg_logCallback = callback; - sg_maxLogLevel = maxLogLevel; - sg_logInitialized = true; - return QNN_SUCCESS; -} - -Qnn_ErrorHandle_t GgmlOpPackageLogSetLevel(QnnLog_Level_t maxLogLevel) { - if (maxLogLevel < QNN_LOG_LEVEL_ERROR) return QNN_LOG_ERROR_INVALID_ARGUMENT; - sg_maxLogLevel = maxLogLevel; - return QNN_SUCCESS; -} - -Qnn_ErrorHandle_t GgmlOpPackageLogTerminate() { - if (!sg_logInitialized) return QNN_OP_PACKAGE_ERROR_LIBRARY_NOT_INITIALIZED; - sg_logCallback = nullptr; - sg_maxLogLevel = (QnnLog_Level_t)0; - sg_logInitialized = false; - return QNN_SUCCESS; -} - -Qnn_ErrorHandle_t GgmlOpPackageValidateOpConfig (Qnn_OpConfig_t opConfig){ - if (std::string(sg_packageName) != opConfig.v1.packageName) { - return QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE; - } - - /* auto-generated validation code below - * Check if op config type matches any registered ops - * If a match is found, check number of inputs, outputs and params - */ - if (std::string(opConfig.v1.typeName) == "GgmlMulMat"){ - if (opConfig.v1.numOfParams != 0 || opConfig.v1.numOfInputs != 2 || opConfig.v1.numOfOutputs != 1){ - return QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE; - } - } - else{ - return QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE; - } - - /* - * additional validation code here - * */ - - return QNN_SUCCESS; -} - -/* The following three functions in this comment are not called by HTP backend for now, - * no auto-generated implementations are created. Users should see example for full function signatures. 
- * (version 1.3.0) Qnn_ErrorHandle_t GgmlOpPackageCreateKernels (QnnOpPackage_GraphInfrastructure_t
- * graphInfrastructure, QnnOpPackage_Node_t node, QnnOpPackage_Kernel_t** kernels, uint32_t*
- * numKernels)
- * (version 1.3.0) Qnn_ErrorHandle_t GgmlOpPackageFreeKernels (QnnOpPackage_Kernel_t* kernels)
- *
- * (version 1.4.0) Qnn_ErrorHandle_t GgmlOpPackageCreateOpImpl (QnnOpPackage_GraphInfrastructure_t
- * graphInfrastructure, QnnOpPackage_Node_t node, QnnOpPackage_OpImpl_t* opImpl)
- *(version 1.4.0) Qnn_ErrorHandle_t GgmlOpPackageFreeOpImpl (QnnOpPackage_OpImpl_t opImpl)
- */
-
-Qnn_ErrorHandle_t GgmlOpPackageTerminate() {
-if (!sg_packageInitialized) return QNN_OP_PACKAGE_ERROR_LIBRARY_NOT_INITIALIZED;
-
-sg_globalInfra = nullptr;
-sg_packageInitialized = false;
-return QNN_SUCCESS;
-}
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-
-/* latest version */
-Qnn_ErrorHandle_t GgmlOpPackageInterfaceProvider(QnnOpPackage_Interface_t* interface) {
-  if (!interface) return QNN_OP_PACKAGE_ERROR_INVALID_ARGUMENT;
-  interface->interfaceVersion      = {1, 4, 0};
-  interface->v1_4.init             = GgmlOpPackageInit;
-  interface->v1_4.terminate        = GgmlOpPackageTerminate;
-  interface->v1_4.getInfo          = GgmlOpPackageGetInfo;
-  interface->v1_4.validateOpConfig = GgmlOpPackageValidateOpConfig;
-  interface->v1_4.createOpImpl     = nullptr;
-  interface->v1_4.freeOpImpl       = nullptr;
-  interface->v1_4.logInitialize    = GgmlOpPackageLogInitialize;
-  interface->v1_4.logSetLevel      = GgmlOpPackageLogSetLevel;
-  interface->v1_4.logTerminate     = GgmlOpPackageLogTerminate;
-  return QNN_SUCCESS;
-}
-
-#ifdef __cplusplus
-}
-#endif
-
-
diff --git a/ggml/src/ggml-qnn/qnn/hexagon/GgmlOpPackage/src/ops/GgmlMulMat.cpp b/ggml/src/ggml-qnn/qnn/hexagon/GgmlOpPackage/src/ops/GgmlMulMat.cpp
deleted file mode 100644
index 137522cc80..0000000000
--- a/ggml/src/ggml-qnn/qnn/hexagon/GgmlOpPackage/src/ops/GgmlMulMat.cpp
+++ /dev/null
@@ -1,213 +0,0 @@
-//==============================================================================
-// Auto Generated Code for GgmlOpPackage
-//==============================================================================
-
-#include "HTP/core/constraints.h"
-#include "HTP/core/op_package_feature_support.h"
-#include "HTP/core/op_register_ext.h"
-#include "HTP/core/optimize.h"
-#include "HTP/core/simple_reg.h"
-#include "QnnOpPackage.h"
-
-BEGIN_PKG_OP_DEFINITION(PKG_GgmlMulMat);
-
-// op execute function declarations
-template <typename TensorType>
-GraphStatus ggmlmulmatImpl(TensorType & out_0, const TensorType & in_0, const TensorType & in_1);
-
-// forward declaration of sample cost function
-static float ggmlmulmatCostFunc(const Op * op);
-
-/*
- * method 1 for defining op, using default cost value (i.e. GLACIAL) and default flag (Flags::RESOURCE_HVX)
- * syntax: DEF_PACKAGE_OP(F,OP)
- * e.g. DEF_PACKAGE_OP((ggmlmulmatImpl), "GgmlMulMat")
- */
-DEF_PACKAGE_OP((ggmlmulmatImpl<Tensor>), "GgmlMulMat")
-
-/*
- * method 2 for defining op with specified cost value (one of GLACIAL, SNAIL, FAST, FREE)
- * and provided flags
- * syntax: DEF_PACKAGE_OP_AND_COST_AND_FLAGS(F,OP,COST,...)
- * can use zero or more flags, FLAG options are IS_CONST, INHIBIT_CONST_PROP,
- * RESOURCE_HVX, RESOURCE_HMX(not supported in external op packages)
- * e.g. DEF_PACKAGE_OP_AND_COST_AND_FLAGS((ggmlmulmatImpl), "GgmlMulMat", SNAIL)
- */
-
-/*
- * method 3 for defining op with cost function pointer and provided flags
- * cost function pointer type: typedef float (*cost_function) (const Op * op);
- * syntax: DEF_PACKAGE_OP_AND_COST_F_AND_FLAGS(F,OP,COST_F,...)
- * e.g. 
DEF_PACKAGE_OP_AND_COST_F_AND_FLAGS((ggmlmulmatImpl), - * "GgmlMulMat", ggmlmulmatCostFunc, Flags::RESOURCE_HVX) - */ - -/* - * optimization definitions - * need to be global in the package - * one definition per optimization - * syntax: DEF_PACKAGE_OPTIMIZATION(PRIORITY,MATCHCODE,CONSTRAINTCODE,REPLACECODE) - * PRIORITY predefined values include EARLY(2000), MIDDLE(3000), LATE(4000) - * HTP core provides some replacement functions for op package to use - * for more information about optimization rules, please refer to HTP core documentations - */ - -/* - * op parameter order definitions - * need to be global in the package - * one definition per op, and this is optional - * syntax: DEF_PACKAGE_PARAM_ORDER(OP,PARAM1,MANDATORY1,DEFAULT1,PARAM2,MANDATORY2,DEFAULT2...) - * one or more parameters can be specified for each op - * order of parameters listed determines the order of parameters passed into op execution functions - * if an op does not have a parameter order definition, parameter order passed into Qnn_addNode - * will be passed into op execution functions - * if an op has a parameter order definition, any parameter passed into Qnn_addNode with unlisted - * name will be abandoned - * if two or more op packages with the same package name will be registered, they cannot list - * conflicting parameter orders - * PARAM refers to parameter name as a string literal - * MANDATORY refers to whether this parameter is required to be provided at Qnn_addNode - * DEFAULT is used when MANDATORY is false - * if provided as Qnn_Param_t*, - * DEFAULT will be used for graph construction when this parameter is not provided at - * Qnn_addNode - * if provided as nullptr, - * graph construction will skip this parameter when this parameter is not provided at - * Qnn_addNode - */ - -namespace { - -constexpr const size_t kBytesPerVector = sizeof(HVX_Vector); // 128 for v73 -constexpr const size_t kFloatsPerVector = kBytesPerVector / sizeof(float); -constexpr const size_t kAlignMask = kBytesPerVector - 1; - -inline size_t unaligned_bytes(const void * addr) { - return ((size_t) addr) & kAlignMask; -} - -inline bool is_addr_aligned(void * addr) { - return unaligned_bytes(addr) == 0; -} - -inline float vec_dot_product_f32(const float * src0, const float * src1, size_t count) { - HVX_Vector * iptr0 = ((HVX_Vector *) src0); - HVX_Vector * iptr0_end = ((HVX_Vector *) src0) + (count / kFloatsPerVector); - HVX_Vector * iptr1 = ((HVX_Vector *) src1); - HVX_Vector prev0 = *iptr0++; - HVX_Vector prev1 = *iptr1++; - HVX_Vector sum = Q6_V_vzero(); - - // TODO: prefetch? - while (iptr0 < iptr0_end) { - HVX_Vector curr0 = *iptr0++; - HVX_Vector curr1 = *iptr1++; - HVX_Vector s0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0); - HVX_Vector s1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1); - sum = Q6_Vqf32_vadd_Vqf32Vqf32(Q6_Vqf32_vmpy_VsfVsf(s0, s1), sum); - prev0 = curr0; - prev1 = curr1; - } - - if ((iptr0_end - ((HVX_Vector *) src0)) > 0) { - // handle the last vector - // see also: https://github.com/UbiquitousLearning/mllm/blob/babf4410352ce8730824c87699c025a0d4ce3a6f/src/backends/qnn/LLaMAOpPackageHtp/LLaMAPackage/src/ops/LLaMAMul.cpp#L147 - HVX_Vector curr0 = is_addr_aligned(iptr0) ? prev0 : *iptr0++; - HVX_Vector curr1 = is_addr_aligned(iptr1) ? 
prev1 : *iptr1++;
-        HVX_Vector s0    = Q6_V_valign_VVR(curr0, prev0, (size_t) src0);
-        HVX_Vector s1    = Q6_V_valign_VVR(curr1, prev1, (size_t) src1);
-        sum              = Q6_Vqf32_vadd_Vqf32Vqf32(Q6_Vqf32_vmpy_VsfVsf(s0, s1), sum);
-        prev0            = curr0;
-        prev1            = curr1;
-    }
-
-    const size_t leftover       = count % kFloatsPerVector;
-    const size_t leftover_bytes = leftover * sizeof(float);
-    if (leftover > 0) {
-        // handle the leftover elements
-        HVX_Vector curr0 = (leftover_bytes + unaligned_bytes(iptr0) > kBytesPerVector) ? *iptr0 : prev0;
-        curr0            = Q6_V_valign_VVR(curr0, prev0, (size_t) src0);
-
-        HVX_Vector curr1 = (leftover_bytes + unaligned_bytes(iptr1) > kBytesPerVector) ? *iptr1 : prev1;
-        curr1            = Q6_V_valign_VVR(curr1, prev1, (size_t) src1);
-
-        sum = Q6_Vqf32_vadd_Vqf32Vqf32(
-            Q6_V_valign_VVR(Q6_Vqf32_vmpy_VsfVsf(curr0, curr1), Q6_V_vzero(), leftover_bytes), sum);
-    }
-
-    // TODO: do we have a better way to do the reduction?
-    for (size_t i = kFloatsPerVector / 2; i > 0; i /= 2) {
-        sum = Q6_Vqf32_vadd_Vqf32Vqf32(sum, Q6_V_vror_VR(sum, i * sizeof(float)));
-    }
-
-    float result;
-    q6op_vstu_variable_ARV(&result, sizeof(float), Q6_Vsf_equals_Vqf32(sum));
-    return result;
-}
-
-template <typename TensorType>
-inline GraphStatus mul_mat_2d_f32(TensorType & out_0, const TensorType & in_0, const TensorType & in_1) {
-    // TODO: handle strides?
-    if (in_1.dim(1) != in_0.dim(1)) {
-        return GraphStatus::ErrorDimensions;
-    }
-
-    size_t dims[4] = { in_1.dim(0), in_0.dim(0) };
-    out_0.set_dims(dims);
-
-    auto in0_ptr = (float *) in_0.raw_data_const();
-    auto in1_ptr = (float *) in_1.raw_data_const();
-    auto out_ptr = (float *) out_0.raw_data();
-
-    for (size_t i = 0; i < dims[0]; i++) {
-        // TODO: prefetch?
-        auto * in1_row = in1_ptr + i * in_1.dim(1);
-        auto * out_row = out_ptr + i * dims[1];
-        for (size_t j = 0; j < dims[1]; j++) {
-            *out_row++ = vec_dot_product_f32(in0_ptr + j * in_0.dim(1), in1_row, in_0.dim(1));
-        }
-    }
-
-    return GraphStatus::Success;
-}
-
-}  // namespace
-
-/* execute functions for ops */
-
-template <typename TensorType>
-GraphStatus ggmlmulmatImpl(TensorType & out_0, const TensorType & in_0, const TensorType & in_1) {
-    if (!in_0.raw_data_const() || !in_1.raw_data_const() || !out_0.raw_data()) {
-        return GraphStatus::ErrorBadInput;
-    }
-
-    if (in_0.rank() != in_1.rank()) {
-        return GraphStatus::ErrorRank;
-    }
-
-    auto rank = in_0.rank();
-    switch (rank) {
-        case 4:
-        case 3:
-            // TODO: add implementation
-            return GraphStatus::ErrorUnsupported;
-        case 2:
-            return mul_mat_2d_f32(out_0, in_0, in_1);
-    }
-
-    return GraphStatus::ErrorRank;
-}
-
-__attribute__((unused)) static float ggmlmulmatCostFunc(const Op * op) {
-    /*
-     * add code here
-     * */
-
-    float cost = 0.0;  // add cost computation here
-    return cost;
-}
-
-/* At the bottom of the op file, call END_PKG_OP_DEFINITION(<name>),
-   where <name> is as BEGIN_PKG_OP_DEFINITION
-*/
-END_PKG_OP_DEFINITION(PKG_GgmlMulMat);