diff --git a/ggml/src/ggml-qnn/npu/CMakeLists.txt b/ggml/src/ggml-qnn/npu/CMakeLists.txt index e8ce255fec..1723fd3d4f 100644 --- a/ggml/src/ggml-qnn/npu/CMakeLists.txt +++ b/ggml/src/ggml-qnn/npu/CMakeLists.txt @@ -10,6 +10,15 @@ else() message(FATAL_ERROR "HEXAGON_SDK_ROOT not defined") endif() +if(${CMAKE_SYSTEM_NAME} MATCHES "Android") + set(PREBUILT_LIB_DIR "android_aarch64") +elseif(${CMAKE_SYSTEM_NAME} MATCHES "Linux") + set(PREBUILT_LIB_DIR "UbuntuARM_aarch64") +elseif(${CMAKE_SYSTEM_NAME} MATCHES "Windows") + # Windows + set(PREBUILT_LIB_DIR "windows_aarch64") +endif() + if(HEXAGON_SDK_ROOT) include(${HEXAGON_SDK_ROOT}/build/cmake/hexagon_fun.cmake) else() @@ -138,21 +147,13 @@ if(${CMAKE_SYSTEM_NAME} MATCHES "Android|Linux|Windows") link_options(hexagon-npu-host) - if(${CMAKE_SYSTEM_NAME} MATCHES "Android") - set(PREBUILT_LIB_DIR "android_aarch64") - elseif(${CMAKE_SYSTEM_NAME} MATCHES "Linux") - set(PREBUILT_LIB_DIR "UbuntuARM_aarch64") - else() - # Windows - set(PREBUILT_LIB_DIR "windows_aarch64") - endif() - choose_dsprpc("3" dsprpc) # cdsprpc link_custom_library(hexagon-npu-host ${dsprpc}) cmake_host_system_information(RESULT BUILD_CPU_COUNT QUERY NUMBER_OF_PHYSICAL_CORES) add_dsp_targets_for_host(hexagon-npu-host "v73" ${BUILD_CPU_COUNT}) add_dsp_targets_for_host(hexagon-npu-host "v75" ${BUILD_CPU_COUNT}) + add_dsp_targets_for_host(hexagon-npu-host "v79" ${BUILD_CPU_COUNT}) list(APPEND NPU_RUNTIME_LIBS "${HEXAGON_SDK_ROOT}/tools/utils/sysmon/sysMonApp") list(APPEND NPU_RUNTIME_LIBS "${HEXAGON_SDK_ROOT}/tools/utils/sysmon/sysMonAppLE") @@ -249,6 +250,7 @@ else() target_link_libraries(hexagon_npu_skel ${HEXAGON_LIB_DIR}/${HEXAGON_ARCH}/G0/pic/libc++abi.a ${HEXAGON_LIB_DIR}/${HEXAGON_ARCH}/G0/pic/libc++.a + ${HEXAGON_LIB_DIR}/${HEXAGON_ARCH}/G0/pic/libc.a ) set_target_properties(hexagon_npu_skel PROPERTIES OUTPUT_NAME "hexagon_npu_skel_${HEXAGON_ARCH}") target_link_libraries(hexagon_npu_skel qprintf_static) diff --git a/ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp b/ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp index f8b4da8a21..ff1335ace2 100644 --- a/ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp +++ b/ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp @@ -3,7 +3,6 @@ #include "thread_pool.hpp" // TODO: remove this dependency #include "type_traits.hpp" #include "vec_ops.hpp" -#include "vtcm_mem.hpp" namespace { diff --git a/ggml/src/ggml-qnn/npu/device/thread_pool.hpp b/ggml/src/ggml-qnn/npu/device/thread_pool.hpp index 9661c00670..455d4eec30 100644 --- a/ggml/src/ggml-qnn/npu/device/thread_pool.hpp +++ b/ggml/src/ggml-qnn/npu/device/thread_pool.hpp @@ -2,6 +2,7 @@ #include +#include #include #include #include @@ -237,8 +238,8 @@ template class thread_pool { DEVICE_LOG_DEBUG("thread_func_impl.end: %zu", param->tidx); } - std::atomic_bool _thread_exit = false; - std::array _threads; + std::atomic_bool _thread_exit = false; + std::array _threads = {}; qurt_barrier_t _pending = {}; qurt_barrier_t _completed = {}; thread_params _thread_params[kMaxThreadCount] = {}; diff --git a/ggml/src/ggml-qnn/npu/device/vtcm_mem.hpp b/ggml/src/ggml-qnn/npu/device/vtcm_mem.hpp index ab1041f626..b66ea7f348 100644 --- a/ggml/src/ggml-qnn/npu/device/vtcm_mem.hpp +++ b/ggml/src/ggml-qnn/npu/device/vtcm_mem.hpp @@ -1,5 +1,6 @@ #pragma once +#include #include #include "util.hpp" @@ -9,15 +10,28 @@ namespace hexagon { class vtcm_mem { public: explicit vtcm_mem(size_t size, bool single_page) { + constexpr const unsigned int kTimeoutUs = 10000; // 10ms timeout + size_t avail_size = single_page ? get_avail_page_size() : get_avail_block_size(); if (size > avail_size) { DEVICE_LOG_ERROR("Requested VTCM size %zu exceeds available size %zu\n", size, avail_size); return; } - _vtcm_mem = HAP_request_VTCM((unsigned int) size, single_page ? 1 : 0); + compute_res_attr_t compute_res; + HAP_compute_res_attr_init(&compute_res); + HAP_compute_res_attr_set_serialize(&compute_res, false); + HAP_compute_res_attr_set_vtcm_param(&compute_res, size, single_page ? 1 : 0); + + _vtcm_context_id = HAP_compute_res_acquire(&compute_res, kTimeoutUs); // 10ms timeout + if (_vtcm_context_id == 0) { + DEVICE_LOG_ERROR("Failed to acquire VTCM context: %zu bytes, timeout %zu us\n", size, kTimeoutUs); + return; + } + + _vtcm_mem = HAP_compute_res_attr_get_vtcm_ptr(&compute_res); if (_vtcm_mem == nullptr) { - DEVICE_LOG_ERROR("Failed to allocate VTCM memory: %zu bytes\n", size); + DEVICE_LOG_ERROR("Failed to allocate VTCM memory: %zu bytes, timeout %zu us\n", size, kTimeoutUs); return; } @@ -26,7 +40,18 @@ class vtcm_mem { } explicit vtcm_mem(size_t size, bool single_page, size_t timeout_us) { - _vtcm_mem = HAP_request_async_VTCM((unsigned int) size, single_page ? 1 : 0, (unsigned int) timeout_us); + compute_res_attr_t compute_res; + HAP_compute_res_attr_init(&compute_res); + HAP_compute_res_attr_set_serialize(&compute_res, false); + HAP_compute_res_attr_set_vtcm_param(&compute_res, size, single_page ? 1 : 0); + + _vtcm_context_id = HAP_compute_res_acquire(&compute_res, timeout_us); + if (_vtcm_context_id == 0) { + DEVICE_LOG_ERROR("Failed to acquire VTCM context: %zu bytes, timeout %zu us\n", size, timeout_us); + return; + } + + _vtcm_mem = HAP_compute_res_attr_get_vtcm_ptr(&compute_res); if (_vtcm_mem == nullptr) { DEVICE_LOG_ERROR("Failed to allocate VTCM memory: %zu bytes, timeout %zu us\n", size, timeout_us); return; @@ -37,8 +62,8 @@ class vtcm_mem { } ~vtcm_mem() { - if (is_valid()) { - auto ret = HAP_release_VTCM(_vtcm_mem); + if (_vtcm_context_id != 0) { + auto ret = HAP_compute_res_release(_vtcm_context_id); if (ret != AEE_SUCCESS) { DEVICE_LOG_ERROR("Failed to release VTCM memory: %d\n", ret); } @@ -95,6 +120,8 @@ class vtcm_mem { void * _vtcm_mem = nullptr; size_t _vtcm_size = 0; + unsigned int _vtcm_context_id = 0; + DISABLE_COPY_AND_MOVE(vtcm_mem); };