feat: perf opt part3 (#42)
* add f16 support to element-wise ops
* wip
* Revert "wip" (reverts commit efa88deb0e8265614fd91db3c3dba777c00e858b)
* qf32 for mul
* wip
* Revert "wip" (reverts commit bb419f89ca4599470d61d636fe6fa1e033d62748)
* disable fp16 add/sub
* template trick
* wip
* add f16 mulmat
* add log
* fix view-like ops
* add log
* fix f16 mulmat
* add quant type
* wip
* add l2fetch
* add vtcm_mem
* wip
* fix fetch
* use vtcm cache in mulmat
* revert vtcm cache
* cache plane
* small opt for plane cache
* cache plane for some element-wise ops
* wip
* enable fetch even on vtcm
* wip
* copy sysMonApp
* small opt
* init ltu
* add compute_params
* add op common header
* move vtcm_mem allocation to compute_params
* fall back to mem_cache when vtcm allocation fails
* pre-calculate quantize type
* wip
* try to fix test failure
* try to fix mulmat nan
* fix inf in mulmat
* remove debug logs
* wip
* small refactoring of the dequant row func
* fix typo
* improve logging
* add q4_0 and q8_0
* wip
* wip
* build hexagon libs in cmake
* wip
* fix qnn-only build flag
* fix typo
* fix todo
* wip
* wip
* add to_float
* use to_float directly instead of ltu
* wip
* cache f16_to_f32 table in vtcm
* print tensor dims in log
* init device in supports_op_impl
* revert cache ltu
* wip
* wip
* fix graph calc issues by invalidating cache manually after each op
* add cache invalidate func
* enable cache fallback only for quantized tensors
* add option to disable quantized tensors
* propagate the asan flag to the npu build
* fix asan option
* wip
* invalidate tensors after finishing
* implement backend_buffer_reset
* wip
* wip
* refactor the plane cache mechanism
* wip
* split row elements across threads
* use table for f16 to f32 conversion
* sync after each op
* small refactoring to invalidate l2 cache
* wip
* optimize float fetching
* unroll for loop manually
* reduce vtcm usage
* add perf tracking for npu
* print dimensions in profiler log
* wip
* wip
* wip
* add sub-proc tracker
* fix typo
* print pcycles
* wip
* wip
* prefetch rows
* add l2fetch_row
* small tweak based on perf tracer
* optimize l2 fetching
* wip
parent db2a125438
commit 295f7f5957
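Many of the commits listed above revolve around replacing per-element fp16-to-fp32 conversion with a 64K-entry lookup table (see init_ltu and f16_to_f32_table in the diff below) and around prefetching rows with l2fetch. The following is only a minimal standalone sketch of the lookup-table idea, with hypothetical helper names, assuming a toolchain that provides the __fp16 type (as the Hexagon and ARM toolchains do):

    #include <cstdint>
    #include <memory>

    // Hypothetical sketch: build one table entry per possible fp16 bit pattern,
    // then convert rows with indexed loads instead of per-element conversions.
    static std::unique_ptr<float[]> build_f16_to_f32_table() {
        constexpr size_t kCount = 1U << 16;  // 65536 entries, 256 KiB of floats
        auto table = std::make_unique<float[]>(kCount);
        for (size_t bits = 0; bits < kCount; ++bits) {
            union {
                uint16_t u16;
                __fp16   f16;  // compiler extension; assumed available
            } cvt;
            cvt.u16     = static_cast<uint16_t>(bits);
            table[bits] = static_cast<float>(cvt.f16);
        }
        return table;
    }

    // Row conversion becomes a table lookup per element.
    static void f16_row_to_f32(const uint16_t * src, float * dst, size_t count, const float * table) {
        for (size_t i = 0; i < count; ++i) {
            dst[i] = table[src[i]];
        }
    }

The table costs 256 KiB, which is why the commits also experiment with keeping it in VTCM ("cache f16_to_f32 table in vtcm") and fall back to regular memory when the VTCM allocation fails.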
@@ -2,6 +2,8 @@ message(STATUS "Using QNN backend")
option(GGML_HEXAGON_NPU_ONLY "ggml-qnn: Only use Hexagon NPU" OFF)
option(GGML_QNN_ENABLE_HEXAGON_BACKEND "ggml-qnn: Enable Hexagon custom package" ${GGML_HEXAGON_NPU_ONLY})
option(GGML_HEXAGON_ENABLE_QUANTIZED_TENSORS "ggml-qnn: Enable quantized tensors support" OFF)
option(GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING "ggml-qnn: Enable performance tracking" OFF)

if(CMAKE_SYSTEM_NAME STREQUAL "Android")
    find_library(LOG_LIB log)
@ -17,6 +19,9 @@ if(NOT DEFINED GGML_QNN_SDK_PATH)
|
|||
# TODO: create a function to search for the SDK path
|
||||
if(DEFINED ENV{QNN_SDK_PATH})
|
||||
set(GGML_QNN_SDK_PATH $ENV{QNN_SDK_PATH})
|
||||
elseif(DEFINED ENV{QNN_SDK_ROOT})
|
||||
message("found QNN_SDK_ROOT: ${QNN_SDK_ROOT}")
|
||||
set(GGML_QNN_SDK_PATH $ENV{QNN_SDK_ROOT})
|
||||
else()
|
||||
message(FATAL_ERROR "GGML_QNN_SDK_PATH not defined")
|
||||
endif()
|
||||
|
|
@ -28,9 +33,10 @@ message("CMAKE_CXX_FLAGS_RELEASE: ${CMAKE_CXX_FLAGS_RELEASE}")
|
|||
message("QNN_SDK_PATH: ${GGML_QNN_SDK_PATH}")
|
||||
|
||||
message("GGML_QNN: ${GGML_QNN}")
|
||||
message("GGML_QNN_ENABLE_PERFORMANCE_TRACKING: ${GGML_QNN_ENABLE_PERFORMANCE_TRACKING}")
|
||||
message("GGML_QNN_ENABLE_HEXAGON_BACKEND: ${GGML_QNN_ENABLE_HEXAGON_BACKEND}")
|
||||
message("GGML_HEXAGON_NPU_ONLY: ${GGML_HEXAGON_NPU_ONLY}")
|
||||
message("GGML_HEXAGON_ENABLE_QUANTIZED_TENSORS: ${GGML_HEXAGON_ENABLE_QUANTIZED_TENSORS}")
|
||||
message("GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING: ${GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING}")
|
||||
|
||||
ggml_add_backend_library(ggml-qnn
|
||||
../../include/ggml-qnn.h
|
||||
|
|
@ -58,8 +64,8 @@ else()
|
|||
target_link_libraries(ggml-qnn PRIVATE runtime-common)
|
||||
endif()
|
||||
|
||||
# Copy QNN dynamic libraries
|
||||
set(QNN_DYNAMIC_LIBS "")
|
||||
# Copy dynamic libraries
|
||||
set(BACKEND_RUNTIME_LIBS "")
|
||||
|
||||
if(CMAKE_SYSTEM_NAME STREQUAL "Android" OR CMAKE_SYSTEM_NAME STREQUAL "Linux")
|
||||
if(CMAKE_SYSTEM_NAME STREQUAL "Android")
|
||||
|
|
@ -73,35 +79,35 @@ if(CMAKE_SYSTEM_NAME STREQUAL "Android" OR CMAKE_SYSTEM_NAME STREQUAL "Linux")
|
|||
set(QNN_SDK_LIB_PATH "${GGML_QNN_SDK_PATH}/lib/aarch64-oe-linux-gcc11.2")
|
||||
endif()
|
||||
|
||||
list(APPEND QNN_DYNAMIC_LIBS "${QNN_SDK_LIB_PATH}/libQnnSystem.so")
|
||||
list(APPEND QNN_DYNAMIC_LIBS "${QNN_SDK_LIB_PATH}/libQnnCpu.so")
|
||||
list(APPEND QNN_DYNAMIC_LIBS "${QNN_SDK_LIB_PATH}/libQnnGpu.so")
|
||||
list(APPEND QNN_DYNAMIC_LIBS "${QNN_SDK_LIB_PATH}/libQnnHtp.so")
|
||||
list(APPEND BACKEND_RUNTIME_LIBS "${QNN_SDK_LIB_PATH}/libQnnSystem.so")
|
||||
list(APPEND BACKEND_RUNTIME_LIBS "${QNN_SDK_LIB_PATH}/libQnnCpu.so")
|
||||
list(APPEND BACKEND_RUNTIME_LIBS "${QNN_SDK_LIB_PATH}/libQnnGpu.so")
|
||||
list(APPEND BACKEND_RUNTIME_LIBS "${QNN_SDK_LIB_PATH}/libQnnHtp.so")
|
||||
file(GLOB HTP_STUB_LIBS "${QNN_SDK_LIB_PATH}/libQnnHtp*.so")
|
||||
list(APPEND QNN_DYNAMIC_LIBS ${HTP_STUB_LIBS})
|
||||
list(APPEND BACKEND_RUNTIME_LIBS ${HTP_STUB_LIBS})
|
||||
|
||||
if(CMAKE_SYSTEM_NAME STREQUAL "Android")
|
||||
file(GLOB HTP_SKEL_LIBS "${GGML_QNN_SDK_PATH}/lib/hexagon-*/unsigned/libQnnHtp*Skel.so")
|
||||
list(APPEND QNN_DYNAMIC_LIBS ${HTP_SKEL_LIBS})
|
||||
list(APPEND BACKEND_RUNTIME_LIBS ${HTP_SKEL_LIBS})
|
||||
|
||||
if(CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64")
|
||||
if(EXISTS "${CMAKE_ANDROID_NDK}/prebuilt/android-arm64/gdbserver/gdbserver")
|
||||
list(APPEND QNN_DYNAMIC_LIBS "${CMAKE_ANDROID_NDK}/prebuilt/android-arm64/gdbserver/gdbserver")
|
||||
list(APPEND BACKEND_RUNTIME_LIBS "${CMAKE_ANDROID_NDK}/prebuilt/android-arm64/gdbserver/gdbserver")
|
||||
message("old ndk, copy gdbserver")
|
||||
else()
|
||||
file(GLOB LLDB_SERVER "${CMAKE_ANDROID_NDK}/toolchains/llvm/prebuilt/linux-x86_64/lib64/clang/*/lib/linux/aarch64/lldb-server")
|
||||
list(APPEND QNN_DYNAMIC_LIBS ${LLDB_SERVER})
|
||||
list(APPEND BACKEND_RUNTIME_LIBS ${LLDB_SERVER})
|
||||
message("new ndk, copy lldb-server")
|
||||
endif()
|
||||
|
||||
file(GLOB OMP_LIBS "${CMAKE_ANDROID_NDK}/toolchains/llvm/prebuilt/linux-x86_64/lib64/clang/*/lib/linux/aarch64/libomp.so")
|
||||
file(GLOB ASAN_LIBS "${CMAKE_ANDROID_NDK}/toolchains/llvm/prebuilt/linux-x86_64/lib64/clang/*/lib/linux/libclang_rt.asan-aarch64-android.so")
|
||||
list(APPEND QNN_DYNAMIC_LIBS ${OMP_LIBS})
|
||||
list(APPEND QNN_DYNAMIC_LIBS ${ASAN_LIBS})
|
||||
list(APPEND BACKEND_RUNTIME_LIBS ${OMP_LIBS})
|
||||
list(APPEND BACKEND_RUNTIME_LIBS ${ASAN_LIBS})
|
||||
endif()
|
||||
else()
|
||||
# Linux
|
||||
list(APPEND QNN_DYNAMIC_LIBS "${QNN_SDK_LIB_PATH}/libHtpPrepare.so")
|
||||
list(APPEND BACKEND_RUNTIME_LIBS "${QNN_SDK_LIB_PATH}/libHtpPrepare.so")
|
||||
endif()
|
||||
elseif(CMAKE_SYSTEM_NAME STREQUAL "Windows")
|
||||
if(CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64")
|
||||
|
|
@ -112,24 +118,24 @@ elseif(CMAKE_SYSTEM_NAME STREQUAL "Windows")
|
|||
set(QNN_SDK_LIB_PATH "${GGML_QNN_SDK_PATH}/lib/aarch64-windows-msvc")
|
||||
endif()
|
||||
|
||||
list(APPEND QNN_DYNAMIC_LIBS "${QNN_SDK_LIB_PATH}/QnnSystem.dll")
|
||||
list(APPEND QNN_DYNAMIC_LIBS "${QNN_SDK_LIB_PATH}/QnnCpu.dll")
|
||||
list(APPEND QNN_DYNAMIC_LIBS "${QNN_SDK_LIB_PATH}/QnnGpu.dll")
|
||||
list(APPEND QNN_DYNAMIC_LIBS "${QNN_SDK_LIB_PATH}/QnnHtp.dll")
|
||||
list(APPEND BACKEND_RUNTIME_LIBS "${QNN_SDK_LIB_PATH}/QnnSystem.dll")
|
||||
list(APPEND BACKEND_RUNTIME_LIBS "${QNN_SDK_LIB_PATH}/QnnCpu.dll")
|
||||
list(APPEND BACKEND_RUNTIME_LIBS "${QNN_SDK_LIB_PATH}/QnnGpu.dll")
|
||||
list(APPEND BACKEND_RUNTIME_LIBS "${QNN_SDK_LIB_PATH}/QnnHtp.dll")
|
||||
file(GLOB HTP_STUB_LIBS "${QNN_SDK_LIB_PATH}/QnnHtp*.dll")
|
||||
|
||||
if(CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64")
|
||||
list(APPEND QNN_DYNAMIC_LIBS "${QNN_SDK_LIB_PATH}/HtpPrepare.dll")
|
||||
list(APPEND BACKEND_RUNTIME_LIBS "${QNN_SDK_LIB_PATH}/HtpPrepare.dll")
|
||||
endif()
|
||||
|
||||
list(APPEND QNN_DYNAMIC_LIBS ${HTP_STUB_LIBS})
|
||||
list(APPEND BACKEND_RUNTIME_LIBS ${HTP_STUB_LIBS})
|
||||
endif()
|
||||
|
||||
foreach(QNN_DYNAMIC_LIB ${QNN_DYNAMIC_LIBS})
|
||||
message("Copy: ${QNN_DYNAMIC_LIB} -> ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}")
|
||||
foreach(RUNTIME_LIB ${BACKEND_RUNTIME_LIBS})
|
||||
message("Copy: ${RUNTIME_LIB} -> ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}")
|
||||
add_custom_command(
|
||||
TARGET ggml-qnn POST_BUILD
|
||||
COMMAND ${CMAKE_COMMAND} -E copy
|
||||
${QNN_DYNAMIC_LIB}
|
||||
${RUNTIME_LIB}
|
||||
${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
|
||||
endforeach()
@ -26,6 +26,56 @@ set(common_incs
|
|||
|
||||
include_directories(${common_incs})
|
||||
|
||||
function(add_device_target target_name DSP_ARCH IS_SIMULATOR BUILD_CPU_COUNT)
|
||||
if(${CMAKE_BUILD_TYPE} MATCHES "Debug|Dbg")
|
||||
set(HEXAGON_BUILD_CONFIG "Debug")
|
||||
set(EXTRA_BUILD_FLAGS
|
||||
VERBOSE=1
|
||||
TREE=1
|
||||
)
|
||||
else()
|
||||
set(HEXAGON_BUILD_CONFIG "Release")
|
||||
set(EXTRA_BUILD_FLAGS)
|
||||
endif()
|
||||
|
||||
if(${GGML_SANITIZE_ADDRESS} OR ${LLAMA_SANITIZE_ADDRESS})
|
||||
set(GGML_HEXAGON_NPU_SANITIZE_ADDRESS ON)
|
||||
else()
|
||||
set(GGML_HEXAGON_NPU_SANITIZE_ADDRESS OFF)
|
||||
endif()
|
||||
|
||||
set(EXTRA_BUILD_FLAGS ${EXTRA_BUILD_FLAGS} GGML_HEXAGON_ENABLE_QUANTIZED_TENSORS=${GGML_HEXAGON_ENABLE_QUANTIZED_TENSORS})
|
||||
set(EXTRA_BUILD_FLAGS ${EXTRA_BUILD_FLAGS} GGML_HEXAGON_NPU_SANITIZE_ADDRESS=${GGML_HEXAGON_NPU_SANITIZE_ADDRESS})
|
||||
set(EXTRA_BUILD_FLAGS ${EXTRA_BUILD_FLAGS} GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING=${GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING})
|
||||
|
||||
set(HEXAGON_TOOLS_VARIANT $ENV{DEFAULT_TOOLS_VARIANT})
|
||||
set(BUILD_DIR ${CMAKE_CURRENT_LIST_DIR}/hexagon_${HEXAGON_BUILD_CONFIG}_${HEXAGON_TOOLS_VARIANT}_${DSP_ARCH})
|
||||
set(BUILD_BINARY_NAME ${BUILD_DIR}/libhexagon_npu_skel_${DSP_ARCH}.so)
|
||||
|
||||
if(${IS_SIMULATOR})
|
||||
set(HEXAGON_TOOLCHAIN_TYPE "hexagonsim")
|
||||
set(OUTPUT_BINARY_NAME libhexagon_npu_skel_${DSP_ARCH}_sim.so)
|
||||
else()
|
||||
set(HEXAGON_TOOLCHAIN_TYPE "hexagon")
|
||||
set(OUTPUT_BINARY_NAME libhexagon_npu_skel_${DSP_ARCH}.so)
|
||||
endif()
|
||||
|
||||
add_custom_target(${target_name} ALL
|
||||
COMMAND ${CMAKE_COMMAND} -E remove_directory ${BUILD_DIR}
|
||||
COMMAND build_cmake ${HEXAGON_TOOLCHAIN_TYPE} DSP_ARCH=${DSP_ARCH} BUILD=${HEXAGON_BUILD_CONFIG} ${EXTRA_BUILD_FLAGS} -j${BUILD_CPU_COUNT}
|
||||
COMMAND ${CMAKE_COMMAND} -E copy ${BUILD_BINARY_NAME} ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${OUTPUT_BINARY_NAME}
|
||||
BYPRODUCTS ${BUILD_BINARY_NAME}
|
||||
WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR}
|
||||
)
|
||||
endfunction()
|
||||
|
||||
function(add_dsp_targets_for_host host_target DSP_ARCH BUILD_CPU_COUNT)
|
||||
add_device_target(hexagon-npu-device-${DSP_ARCH} ${DSP_ARCH} FALSE ${BUILD_CPU_COUNT})
|
||||
add_device_target(hexagon-npu-device-${DSP_ARCH}-sim ${DSP_ARCH} TRUE ${BUILD_CPU_COUNT})
|
||||
add_dependencies(hexagon-npu-device-${DSP_ARCH}-sim hexagon-npu-device-${DSP_ARCH})
|
||||
add_dependencies(${host_target} hexagon-npu-device-${DSP_ARCH}-sim)
|
||||
endfunction()
|
||||
|
||||
if(${CMAKE_SYSTEM_NAME} MATCHES "Android|Linux|Windows")
|
||||
# host build
|
||||
file(GLOB common_srcs "${CMAKE_CURRENT_LIST_DIR}/common/*.cpp")
|
||||
|
|
@ -52,6 +102,12 @@ if(${CMAKE_SYSTEM_NAME} MATCHES "Android|Linux|Windows")
|
|||
GGML_QNN_ENABLE_HEXAGON_BACKEND
|
||||
)
|
||||
|
||||
if(GGML_HEXAGON_ENABLE_QUANTIZED_TENSORS)
|
||||
target_compile_definitions(hexagon-npu-host PUBLIC
|
||||
GGML_HEXAGON_ENABLE_QUANTIZED_TENSORS
|
||||
)
|
||||
endif()
|
||||
|
||||
target_include_directories(hexagon-npu-host PRIVATE
|
||||
${HEXAGON_SDK_ROOT}/ipc/fastrpc/rpcmem/inc/
|
||||
${QNN_SDK_ROOT}/include/QNN/
|
||||
|
|
@ -71,6 +127,13 @@ if(${CMAKE_SYSTEM_NAME} MATCHES "Android|Linux|Windows")
|
|||
target_link_options(hexagon-npu-host PUBLIC -pie)
|
||||
endif()
|
||||
|
||||
if(GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING)
|
||||
message("GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING is enabled")
|
||||
target_compile_definitions(hexagon-npu-host PUBLIC GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING)
|
||||
else()
|
||||
message("GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING is disabled")
|
||||
endif()
|
||||
|
||||
link_options(hexagon-npu-host)
|
||||
|
||||
if(${CMAKE_SYSTEM_NAME} MATCHES "Android")
|
||||
|
|
@ -84,8 +147,24 @@ if(${CMAKE_SYSTEM_NAME} MATCHES "Android|Linux|Windows")
|
|||
|
||||
choose_dsprpc("3" dsprpc) # cdsprpc
|
||||
link_custom_library(hexagon-npu-host ${dsprpc})
|
||||
|
||||
cmake_host_system_information(RESULT BUILD_CPU_COUNT QUERY NUMBER_OF_PHYSICAL_CORES)
|
||||
add_dsp_targets_for_host(hexagon-npu-host "v73" ${BUILD_CPU_COUNT})
|
||||
add_dsp_targets_for_host(hexagon-npu-host "v75" ${BUILD_CPU_COUNT})
|
||||
|
||||
list(APPEND NPU_RUNTIME_LIBS "${HEXAGON_SDK_ROOT}/tools/utils/sysmon/sysMonApp")
|
||||
list(APPEND NPU_RUNTIME_LIBS "${HEXAGON_SDK_ROOT}/tools/utils/sysmon/sysMonAppLE")
|
||||
|
||||
foreach(RUNTIME_LIB ${NPU_RUNTIME_LIBS})
|
||||
message("Copy: ${RUNTIME_LIB} -> ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}")
|
||||
add_custom_command(
|
||||
TARGET hexagon-npu-host POST_BUILD
|
||||
COMMAND ${CMAKE_COMMAND} -E copy
|
||||
${RUNTIME_LIB}
|
||||
${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
|
||||
endforeach()
|
||||
else()
|
||||
# hexagon npu build
|
||||
# hexagon npu build, this section will run inside the `build_cmake` script
|
||||
cmake_minimum_required(VERSION 3.14.3)
|
||||
project(hexagon_npu C CXX ASM)
|
||||
|
||||
|
|
@ -96,6 +175,8 @@ else()
|
|||
|
||||
set(QNN_SDK_ROOT $ENV{QNN_SDK_ROOT})
|
||||
message("QNN_SDK_ROOT: ${QNN_SDK_ROOT}")
|
||||
message("GGML_HEXAGON_ENABLE_QUANTIZED_TENSORS: ${GGML_HEXAGON_ENABLE_QUANTIZED_TENSORS}")
|
||||
|
||||
include_directories(
|
||||
${QNN_SDK_ROOT}/include/QNN/
|
||||
)
|
||||
|
|
@ -124,6 +205,30 @@ else()
|
|||
)
|
||||
endif()
|
||||
|
||||
if(GGML_HEXAGON_ENABLE_QUANTIZED_TENSORS)
|
||||
message("GGML_HEXAGON_ENABLE_QUANTIZED_TENSORS is enabled")
|
||||
target_compile_definitions(hexagon_npu_skel_OBJS PUBLIC
|
||||
GGML_HEXAGON_ENABLE_QUANTIZED_TENSORS
|
||||
)
|
||||
endif()
|
||||
|
||||
if(GGML_HEXAGON_NPU_SANITIZE_ADDRESS)
|
||||
message("GGML_HEXAGON_NPU_SANITIZE_ADDRESS is enabled")
|
||||
target_compile_options(hexagon_npu_skel_OBJS PUBLIC
|
||||
-fsanitize=address -fno-omit-frame-pointer
|
||||
)
|
||||
target_link_libraries(hexagon_npu_skel_OBJS PUBLIC
|
||||
-fsanitize=address
|
||||
)
|
||||
endif()
|
||||
|
||||
if(GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING)
|
||||
message("GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING is enabled")
|
||||
target_compile_definitions(hexagon_npu_skel_OBJS PUBLIC
|
||||
GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING
|
||||
)
|
||||
endif()
|
||||
|
||||
build_idl(idl/hexagon_npu.idl hexagon_npu_skel_OBJS)
|
||||
|
||||
# disable warnings for the skel
@ -9,6 +9,7 @@
|
|||
#include "graph.hpp"
|
||||
#include "hexagon_npu.h"
|
||||
#include "op_impl.hpp"
|
||||
#include "quants.hpp"
|
||||
#include "remote.h"
|
||||
#include "tensor.hpp"
|
||||
#include "thread_pool.hpp"
|
||||
|
|
@@ -18,6 +19,37 @@ namespace {
struct npu_device_context {
    std::unique_ptr<hexagon::default_thread_pool> thread_pool;
    std::unique_ptr<float[]> f16_to_f32_table;  // TODO: store vtcm?

    bool init() {
        if (!init_ltu()) {
            DEVICE_LOG_ERROR("Failed to initialize LTU");
            return false;
        }

        if (!init_thread_pool()) {
            DEVICE_LOG_ERROR("Failed to initialize thread pool");
            return false;
        }

        DEVICE_LOG_DEBUG("NPU device context initialized");
        return true;
    }

  private:
    bool init_ltu() {
        constexpr const size_t kLtuCount = 1U << 16;

        f16_to_f32_table = std::make_unique<float[]>(kLtuCount);
        if (!f16_to_f32_table) {
            DEVICE_LOG_ERROR("Failed to allocate memory for f16_to_f32 table");
            return false;
        }

        hexagon::init_f16_f32_table(f16_to_f32_table.get(), kLtuCount);
        DEVICE_LOG_DEBUG("f16_to_f32 table initialized");
        return true;
    }

    bool init_thread_pool() {
        if (thread_pool) {
@ -67,8 +99,8 @@ int npu_device_open(const char * uri, remote_handle64 * h) {
|
|||
return AEE_ENOMEMORY;
|
||||
}
|
||||
|
||||
if (!context->init_thread_pool()) {
|
||||
DEVICE_LOG_ERROR("Failed to initialize thread pool");
|
||||
if (!context->init()) {
|
||||
DEVICE_LOG_ERROR("Failed to initialize npu_device_context");
|
||||
delete context;
|
||||
return AEE_EFAILED;
|
||||
}
|
||||
|
|
@ -187,7 +219,7 @@ AEEResult npu_device_graph_compute(remote_handle64 _h, npu_device_graph_handle_t
|
|||
return AEE_EINVHANDLE;
|
||||
}
|
||||
|
||||
if (!graph->compute(dev_ctx->thread_pool.get())) {
|
||||
if (!graph->compute(dev_ctx->thread_pool.get(), dev_ctx->f16_to_f32_table.get())) {
|
||||
return AEE_EFAILED;
|
||||
}
@ -5,6 +5,7 @@
|
|||
|
||||
#include "op_impl.hpp"
|
||||
#include "util.hpp"
|
||||
#include "vtcm_mem.hpp"
|
||||
|
||||
namespace hexagon {
|
||||
|
||||
|
|
@ -28,50 +29,57 @@ void graph::set_tensor(const npu_device_tensor_handle_t * tensors, int tensor_co
|
|||
for (int i = 0; i < tensor_count; ++i) {
|
||||
auto * tensor_obj = reinterpret_cast<tensor *>(tensors[i]);
|
||||
_tensors[i] = tensor_obj;
|
||||
DEVICE_LOG_DEBUG("graph(%p) set_tensor[%d]: %p(%p,%p), op: %d\n", (void *) this, i, (void *) tensor_obj,
|
||||
(void *) tensor_obj->get_src(0), (void *) tensor_obj->get_src(1), tensor_obj->get_op());
|
||||
DEVICE_LOG_DEBUG("graph(%p) set_tensor[%d]: %p(%p,%p), op: %s\n", (void *) this, i, (void *) tensor_obj,
|
||||
(void *) tensor_obj->get_src(0), (void *) tensor_obj->get_src(1),
|
||||
op_get_name(tensor_obj->get_op()));
|
||||
}
|
||||
|
||||
_tensor_count = tensor_count;
|
||||
DEVICE_LOG_DEBUG("graph(%p) tensor count: %zu\n", (void *) this, _tensor_count);
|
||||
}
|
||||
|
||||
bool graph::compute(default_thread_pool * thread_pool) {
|
||||
if (!_tensors || !_tensor_count) {
|
||||
bool graph::compute(default_thread_pool * thread_pool, const float * f16_to_f32_table) {
|
||||
if (_tensors == nullptr || !_tensor_count) {
|
||||
DEVICE_LOG_DEBUG("graph(%p) no tensors to compute\n", (void *) this);
|
||||
return true; // return success if no tensors to compute
|
||||
}
|
||||
|
||||
DEVICE_LOG_DEBUG("graph(%p) compute\n", (void *) this);
|
||||
thread_pool->sync_execute(reinterpret_cast<default_thread_pool::task_type>(&graph::thread_pool_task), this);
|
||||
|
||||
for (size_t i = 0; i < _tensor_count; ++i) {
|
||||
auto * dst = _tensors[i];
|
||||
dst->flush(); // TODO: optimize this
|
||||
_f16_to_f32_table = f16_to_f32_table;
|
||||
if (thread_pool) {
|
||||
thread_pool->sync_execute(reinterpret_cast<default_thread_pool::task_type>(&graph::thread_pool_task), this);
|
||||
} else {
|
||||
compute_impl(nullptr, 0, 1);
|
||||
}
|
||||
|
||||
_f16_to_f32_table = nullptr;
|
||||
return true;
|
||||
}
|
||||
|
||||
void graph::thread_pool_task(default_thread_pool * pool, size_t thread_idx, size_t thread_count, graph * graph) {
|
||||
NPU_UNUSED(pool);
|
||||
graph->compute_impl(thread_idx, thread_count);
|
||||
graph->compute_impl(pool, thread_idx, thread_count);
|
||||
}
|
||||
|
||||
void graph::compute_impl(size_t thread_idx, size_t thread_count) {
|
||||
void graph::compute_impl(default_thread_pool * pool, size_t thread_idx, size_t thread_count) {
|
||||
for (size_t i = 0; i < _tensor_count; ++i) {
|
||||
auto * dst = _tensors[i];
|
||||
auto op = dst->get_op();
|
||||
auto * func = get_compute_func(op);
|
||||
if (!func) {
|
||||
auto * func = get_compute_func(dst);
|
||||
if (func == nullptr) {
|
||||
DEVICE_LOG_ERROR("graph(%p) tensor[%zu] op %d not supported\n", (void *) this, i, op);
|
||||
return;
|
||||
}
|
||||
|
||||
if (!func(dst, thread_idx, thread_count)) {
|
||||
hexagon::compute_params params = { thread_idx, thread_count, _f16_to_f32_table };
|
||||
if (!func(dst, ¶ms)) {
|
||||
DEVICE_LOG_ERROR("graph(%p) tensor[%zu] op %d compute failed\n", (void *) this, i, op);
|
||||
return;
|
||||
}
|
||||
|
||||
// TODO: figure out which ops need to sync
|
||||
if (pool) {
|
||||
pool->sync_thread();
|
||||
}
|
||||
dst->invalidate();
|
||||
}
|
||||
}
@ -17,14 +17,15 @@ class graph {
|
|||
|
||||
void set_tensor(const npu_device_tensor_handle_t * tensors, int tensor_count);
|
||||
|
||||
bool compute(default_thread_pool * thread_pool);
|
||||
bool compute(default_thread_pool * thread_pool, const float * f16_to_f32_table);
|
||||
|
||||
private:
|
||||
static void thread_pool_task(default_thread_pool * pool, size_t thread_idx, size_t thread_count, graph * graph);
|
||||
void compute_impl(size_t thread_idx, size_t thread_count);
|
||||
void compute_impl(default_thread_pool * pool, size_t thread_idx, size_t thread_count);
|
||||
|
||||
std::unique_ptr<tensor *[]> _tensors;
|
||||
size_t _tensor_count = 0;
|
||||
size_t _tensor_count = 0;
|
||||
const float * _f16_to_f32_table = nullptr;
|
||||
|
||||
DISABLE_COPY_AND_MOVE(graph);
|
||||
};
@ -6,25 +6,27 @@
|
|||
#include <HTP/core/intrinsics.h>
|
||||
|
||||
#include "op_mul_mat.hpp"
|
||||
#include "quants.hpp"
|
||||
|
||||
namespace {
|
||||
|
||||
template <HVX_Vector (*_OpIntrinsic)(HVX_Vector, HVX_Vector)>
|
||||
inline void vec_op_f32_f32(const float * src0, const float * src1, size_t count, float * dst) {
|
||||
template <HVX_Vector (*_OpIntrinsic)(HVX_Vector, HVX_Vector), typename _TyData>
|
||||
inline void vec_op_impl(const _TyData * src0, const _TyData * src1, size_t count, _TyData * dst) {
|
||||
constexpr const size_t kElementsPerVector = hexagon::kBytesPerVector / sizeof(_TyData);
|
||||
|
||||
HVX_Vector * iptr0 = ((HVX_Vector *) src0);
|
||||
HVX_Vector * iptr0_end = ((HVX_Vector *) src0) + (count / hexagon::kFloatsPerVector);
|
||||
HVX_Vector * iptr0_end = ((HVX_Vector *) src0) + (count / kElementsPerVector);
|
||||
HVX_Vector * iptr1 = ((HVX_Vector *) src1);
|
||||
HVX_Vector * optr = ((HVX_Vector *) dst);
|
||||
HVX_Vector prev0 = *iptr0++;
|
||||
HVX_Vector prev1 = *iptr1++;
|
||||
|
||||
// TODO: prefetch or just use VTCM?
|
||||
while (iptr0 < iptr0_end) {
|
||||
HVX_Vector curr0 = *iptr0++;
|
||||
HVX_Vector curr1 = *iptr1++;
|
||||
HVX_Vector s0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0);
|
||||
HVX_Vector s1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1);
|
||||
*optr++ = Q6_Vsf_equals_Vqf32(_OpIntrinsic(s0, s1));
|
||||
*optr++ = _OpIntrinsic(s0, s1);
|
||||
prev0 = curr0;
|
||||
prev1 = curr1;
|
||||
}
|
||||
|
|
@ -42,13 +44,13 @@ inline void vec_op_f32_f32(const float * src0, const float * src1, size_t count,
|
|||
iptr1 = iptr1_aligned ? iptr1 : iptr1 + 1;
|
||||
HVX_Vector s0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0);
|
||||
HVX_Vector s1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1);
|
||||
*optr++ = Q6_Vsf_equals_Vqf32(_OpIntrinsic(s0, s1));
|
||||
*optr++ = _OpIntrinsic(s0, s1);
|
||||
prev0 = curr0;
|
||||
prev1 = curr1;
|
||||
}
|
||||
|
||||
const size_t leftover = count % hexagon::kFloatsPerVector;
|
||||
const size_t leftover_bytes = leftover * sizeof(float);
|
||||
const size_t leftover = count % kElementsPerVector;
|
||||
const size_t leftover_bytes = leftover * sizeof(_TyData);
|
||||
if (leftover > 0) {
|
||||
// handle the leftover elements
|
||||
HVX_Vector curr0 =
|
||||
|
|
@ -59,24 +61,56 @@ inline void vec_op_f32_f32(const float * src0, const float * src1, size_t count,
|
|||
(leftover_bytes + hexagon::unaligned_bytes(iptr1) > hexagon::kBytesPerVector) ? *iptr1 : prev1;
|
||||
curr1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1);
|
||||
|
||||
q6op_vstu_variable_ARV(optr, leftover_bytes, Q6_Vsf_equals_Vqf32(_OpIntrinsic(curr0, curr1)));
|
||||
q6op_vstu_variable_ARV(optr, leftover_bytes, _OpIntrinsic(curr0, curr1));
|
||||
}
|
||||
}
|
||||
|
||||
template <HVX_Vector (*_OpIntrinsic)(HVX_Vector, HVX_Vector)>
|
||||
inline void vec_op_f32_f32(const float * src0, const float * src1, size_t count, float * dst) {
|
||||
vec_op_impl<_OpIntrinsic, float>(src0, src1, count, dst);
|
||||
}
|
||||
|
||||
inline HVX_Vector vadd_f32_f32(HVX_Vector a, HVX_Vector b) {
|
||||
return Q6_Vqf32_vadd_VsfVsf(a, b);
|
||||
return Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(a, b));
|
||||
}
|
||||
|
||||
inline HVX_Vector vsub_f32_f32(HVX_Vector a, HVX_Vector b) {
|
||||
return Q6_Vqf32_vsub_VsfVsf(a, b);
|
||||
return Q6_Vsf_equals_Vqf32(Q6_Vqf32_vsub_VsfVsf(a, b));
|
||||
}
|
||||
|
||||
inline HVX_Vector vmul_f32_f32(HVX_Vector a, HVX_Vector b) {
|
||||
return Q6_Vqf32_vmpy_VsfVsf(a, b);
|
||||
return Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(a, b));
|
||||
}
|
||||
|
||||
template <typename _TySrc, typename _TyDst, void (*_RowFunc)(const _TySrc *, const _TySrc *, size_t, _TyDst *)>
|
||||
bool element_wise_op(hexagon::tensor * out, size_t tidx, size_t tcnt) {
|
||||
template <HVX_Vector (*_OpIntrinsic)(HVX_Vector, HVX_Vector)>
|
||||
inline void vec_op_f16_f16(const npu_device_fp16_t * src0, const npu_device_fp16_t * src1, size_t count,
|
||||
npu_device_fp16_t * dst) {
|
||||
vec_op_impl<_OpIntrinsic, npu_device_fp16_t>(src0, src1, count, dst);
|
||||
}
|
||||
|
||||
inline HVX_Vector vadd_f16_f16(HVX_Vector a, HVX_Vector b) {
|
||||
// TODO: fix this since qf16 has less precision than fp16
|
||||
return Q6_Vhf_equals_Vqf16(Q6_Vqf16_vadd_VhfVhf(a, b));
|
||||
}
|
||||
|
||||
inline HVX_Vector vsub_f16_f16(HVX_Vector a, HVX_Vector b) {
|
||||
// TODO: fix this since qf16 has less precision than fp16
|
||||
return Q6_Vhf_equals_Vqf16(Q6_Vqf16_vsub_VhfVhf(a, b));
|
||||
}
|
||||
|
||||
inline HVX_Vector vmul_f16_f16(HVX_Vector a, HVX_Vector b) {
|
||||
return Q6_Vhf_equals_Wqf32(Q6_Wqf32_vmpy_VhfVhf(a, b));
|
||||
}
|
||||
|
||||
template <typename T> struct get_data_type {};
|
||||
|
||||
template <typename _TyData> struct get_data_type<void (*)(const _TyData *, const _TyData *, size_t, _TyData *)> {
|
||||
using type = _TyData;
|
||||
};
|
||||
|
||||
template <auto _RowFunc> bool element_wise_op(hexagon::tensor * out, hexagon::compute_params * params) {
|
||||
using data_type = typename get_data_type<decltype(_RowFunc)>::type;
|
||||
|
||||
if (!out) {
|
||||
return false;
|
||||
}
|
||||
|
|
@ -94,24 +128,39 @@ bool element_wise_op(hexagon::tensor * out, size_t tidx, size_t tcnt) {
|
|||
return false;
|
||||
}
|
||||
|
||||
const auto * src0_ptr = reinterpret_cast<const uint8_t *>(src0->get_data());
|
||||
const auto * src1_ptr = reinterpret_cast<const uint8_t *>(src1->get_data());
|
||||
auto * dst_ptr = reinterpret_cast<uint8_t *>(out->get_data());
|
||||
auto total_rows = out->get_ne(3) * out->get_ne(2) * out->get_ne(1);
|
||||
const auto rows_per_box = out->get_ne(2) * out->get_ne(1);
|
||||
const auto start_end = hexagon::get_thread_work_slice(total_rows, tidx, tcnt);
|
||||
const auto * src0_ptr = reinterpret_cast<const uint8_t *>(src0->get_read_buffer());
|
||||
const auto * src1_ptr = reinterpret_cast<const uint8_t *>(src1->get_read_buffer());
|
||||
auto * dst_ptr = reinterpret_cast<uint8_t *>(out->get_write_buffer());
|
||||
auto total_rows = out->get_ne(3) * out->get_ne(2) * out->get_ne(1);
|
||||
const auto rows_per_cube = out->get_ne(2) * out->get_ne(1);
|
||||
const auto start_end = hexagon::get_thread_work_slice(total_rows, params->tidx, params->tcnt);
|
||||
|
||||
if (start_end.first >= start_end.second) {
|
||||
return true;
|
||||
}
|
||||
|
||||
DEVICE_SCOPED_OP_PERFORMANCE_TRACKER(out, params->tidx);
|
||||
|
||||
const size_t valid_row_bytes = src0->get_ne(0) * sizeof(data_type);
|
||||
for (int64_t ir = start_end.first; ir < start_end.second; ++ir) {
|
||||
const auto i03 = ir / rows_per_box;
|
||||
const auto i02 = ir / out->get_ne(1) - i03 * out->get_ne(2);
|
||||
const auto i01 = ir % out->get_ne(1);
|
||||
const auto i13 = i03 % src1->get_ne(3);
|
||||
const auto i12 = i02 % src1->get_ne(2);
|
||||
const auto i11 = i01 % src1->get_ne(1);
|
||||
auto * src0_row = src0_ptr + i03 * src0->get_nb(3) + i02 * src0->get_nb(2) + i01 * src0->get_nb(1);
|
||||
auto * src1_row = src1_ptr + i13 * src1->get_nb(3) + i12 * src1->get_nb(2) + i11 * src1->get_nb(1);
|
||||
auto * dst_row = dst_ptr + i03 * out->get_nb(3) + i02 * out->get_nb(2) + i01 * out->get_nb(1);
|
||||
_RowFunc(reinterpret_cast<const _TySrc *>(src0_row), reinterpret_cast<const _TySrc *>(src1_row),
|
||||
static_cast<size_t>(out->get_ne(0)), reinterpret_cast<_TyDst *>(dst_row));
|
||||
const auto i03 = ir / rows_per_cube;
|
||||
const auto i02 = ir / out->get_ne(1) - i03 * out->get_ne(2);
|
||||
const auto i01 = ir % out->get_ne(1); // TODO: should we use divide instead of mod?
|
||||
const auto i13 = i03 % src1->get_ne(3);
|
||||
const auto i12 = i02 % src1->get_ne(2);
|
||||
const auto i11 = i01 % src1->get_ne(1);
|
||||
|
||||
auto * src1_plane = src1_ptr + i13 * src1->get_nb(3) + i12 * src1->get_nb(2);
|
||||
auto * src0_row = src0_ptr + i03 * src0->get_nb(3) + i02 * src0->get_nb(2) + i01 * src0->get_nb(1);
|
||||
auto * src1_row = src1_plane + i11 * src1->get_nb(1);
|
||||
auto * dst_row = dst_ptr + i03 * out->get_nb(3) + i02 * out->get_nb(2) + i01 * out->get_nb(1);
|
||||
if (ir + 1 < start_end.second) {
|
||||
hexagon::l2fetch_row(src0_row + src0->get_nb(1), valid_row_bytes);
|
||||
hexagon::l2fetch_row(src1_row + src1->get_nb(1), valid_row_bytes);
|
||||
}
|
||||
|
||||
_RowFunc(reinterpret_cast<const data_type *>(src0_row), reinterpret_cast<const data_type *>(src1_row),
|
||||
static_cast<size_t>(out->get_ne(0)), reinterpret_cast<data_type *>(dst_row));
|
||||
}
|
||||
|
||||
return true;
|
||||
|
|
@ -120,19 +169,37 @@ bool element_wise_op(hexagon::tensor * out, size_t tidx, size_t tcnt) {
|
|||
bool is_element_wise_op_supported(const npu_device_tensor_spec & src0, const npu_device_tensor_spec & src1,
|
||||
const npu_device_tensor_spec & dst, npu_device_tensor_op op) {
|
||||
if (op != NPU_OP_ADD && op != NPU_OP_SUB && op != NPU_OP_MUL) {
|
||||
DEVICE_LOG_DEBUG("Unsupported element wise op: %s\n", hexagon::op_get_name(op));
|
||||
DEVICE_LOG_DEBUG("[%s]unsupported\n", hexagon::op_get_name(op));
|
||||
return false;
|
||||
}
|
||||
|
||||
if (dst.type != src0.type || dst.type != src1.type) {
|
||||
DEVICE_LOG_DEBUG("[%s]src0.type and dst.type mismatch: %s vs %s\n", hexagon::op_get_name(op),
|
||||
hexagon::get_type_name(src0.type), hexagon::get_type_name(dst.type));
|
||||
return false;
|
||||
}
|
||||
|
||||
if (dst.type != NPU_DATA_TYPE_F32 && dst.type != NPU_DATA_TYPE_F16) {
|
||||
DEVICE_LOG_DEBUG("[%s]unsupported data type: %s\n", hexagon::op_get_name(op), hexagon::get_type_name(dst.type));
|
||||
return false;
|
||||
}
|
||||
|
||||
// TODO: fix FP16 add/sub
|
||||
if (dst.type == NPU_DATA_TYPE_F16 && op != NPU_OP_MUL) {
|
||||
DEVICE_LOG_DEBUG("[%s]unsupported data type: %s\n", hexagon::op_get_name(op), hexagon::get_type_name(dst.type));
|
||||
return false;
|
||||
}
|
||||
|
||||
if (src0.ne[0] != src1.ne[0]) {
|
||||
DEVICE_LOG_DEBUG("src0.ne[0] and src1.ne[0] not match: %ld vs %ld\n", (long) src0.ne[0], (long) src1.ne[0]);
|
||||
DEVICE_LOG_DEBUG("[%s]src0.ne[0] and src1.ne[0] not match: %ld vs %ld\n", hexagon::op_get_name(op),
|
||||
(long) src0.ne[0], (long) src1.ne[0]);
|
||||
return false;
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < DEVICE_TENSOR_MAX_DIMS; ++i) {
|
||||
if (src0.ne[i] != dst.ne[i]) {
|
||||
DEVICE_LOG_DEBUG("src0.ne[%zu] and dst.ne[%zu] not match: %lld vs %lld\n", i, i, (long long) src0.ne[i],
|
||||
(long long) dst.ne[i]);
|
||||
DEVICE_LOG_DEBUG("[%s]src0.ne[%zu] and dst.ne[%zu] not match: %lld vs %lld\n", hexagon::op_get_name(op), i,
|
||||
i, (long long) src0.ne[i], (long long) dst.ne[i]);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
|
@ -142,46 +209,67 @@ bool is_element_wise_op_supported(const npu_device_tensor_spec & src0, const npu
|
|||
|
||||
struct op_capabilities {
|
||||
npu_device_tensor_op op;
|
||||
hexagon::compute_func_type compute_func;
|
||||
hexagon::op_is_supported_func_type is_supported;
|
||||
hexagon::compute_func_type compute_funcs[NPU_DATA_TYPE_COUNT];
|
||||
};
|
||||
|
||||
constexpr const op_capabilities kOpCapabilities[] = {
|
||||
{ NPU_OP_MUL_MAT, hexagon::mul_mat_f32, hexagon::is_mul_mat_supported },
|
||||
{ NPU_OP_ADD, element_wise_op<float, float, vec_op_f32_f32<vadd_f32_f32>>, is_element_wise_op_supported },
|
||||
{ NPU_OP_SUB, element_wise_op<float, float, vec_op_f32_f32<vsub_f32_f32>>, is_element_wise_op_supported },
|
||||
{ NPU_OP_MUL, element_wise_op<float, float, vec_op_f32_f32<vmul_f32_f32>>, is_element_wise_op_supported },
|
||||
{
|
||||
NPU_OP_MUL_MAT, hexagon::is_mul_mat_supported,
|
||||
{
|
||||
hexagon::mul_mat_f32, // NPU_DATA_TYPE_F32
|
||||
nullptr, // NPU_DATA_TYPE_F16
|
||||
}, },
|
||||
{ NPU_OP_ADD,
|
||||
is_element_wise_op_supported, {
|
||||
element_wise_op<vec_op_f32_f32<vadd_f32_f32>>, // NPU_DATA_TYPE_F32
|
||||
element_wise_op<vec_op_f16_f16<vadd_f16_f16>>, // NPU_DATA_TYPE_F16
|
||||
} },
|
||||
{ NPU_OP_SUB,
|
||||
is_element_wise_op_supported, {
|
||||
element_wise_op<vec_op_f32_f32<vsub_f32_f32>>, // NPU_DATA_TYPE_F32
|
||||
element_wise_op<vec_op_f16_f16<vsub_f16_f16>>, // NPU_DATA_TYPE_F16
|
||||
} },
|
||||
{ NPU_OP_MUL,
|
||||
is_element_wise_op_supported, {
|
||||
element_wise_op<vec_op_f32_f32<vmul_f32_f32>>, // NPU_DATA_TYPE_F32
|
||||
element_wise_op<vec_op_f16_f16<vmul_f16_f16>>, // NPU_DATA_TYPE_F16
|
||||
} },
|
||||
};
|
||||
|
||||
static_assert(kOpCapabilities[NPU_OP_MUL_MAT].compute_func == hexagon::mul_mat_f32,
|
||||
static_assert(kOpCapabilities[NPU_OP_MUL_MAT].compute_funcs[NPU_DATA_TYPE_F32] == hexagon::mul_mat_f32,
|
||||
"kOpArray[NPU_OP_MUL_MAT] != mul_mat_f32");
|
||||
|
||||
static_assert(std::size(kOpCapabilities) == NPU_OP_COUNT);
|
||||
static_assert(kOpCapabilities[NPU_OP_MUL_MAT].op == NPU_OP_MUL_MAT, "kOpArray[NPU_OP_MUL_MAT].op != NPU_OP_MUL_MAT");
|
||||
static_assert(kOpCapabilities[NPU_OP_MUL].op == NPU_OP_MUL, "kOpArray[NPU_OP_MUL].op != NPU_OP_MUL");
|
||||
|
||||
} // namespace
|
||||
|
||||
namespace hexagon {
|
||||
|
||||
compute_func_type get_compute_func(npu_device_tensor_op op) {
|
||||
hexagon::compute_func_type get_compute_func_impl(npu_device_tensor_op op, npu_device_tensor_data_type type) {
|
||||
if (op >= NPU_OP_COUNT) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
return kOpCapabilities[op].compute_func;
|
||||
return kOpCapabilities[op].compute_funcs[type];
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
namespace hexagon {
|
||||
|
||||
compute_func_type get_compute_func(tensor * dst) {
|
||||
return get_compute_func_impl(dst->get_op(), dst->get_type());
|
||||
}
|
||||
|
||||
bool support_op(const npu_device_tensor_spec & src0, const npu_device_tensor_spec & src1,
|
||||
const npu_device_tensor_spec & dst, npu_device_tensor_op op) {
|
||||
if (get_compute_func(op) == nullptr) {
|
||||
DEVICE_LOG_ERROR("Unsupported op: %s, get_compute_func failed\n", op_get_name(op));
|
||||
if (get_compute_func_impl(op, dst.type) == nullptr) {
|
||||
DEVICE_LOG_ERROR("[%s]unsupported, get_compute_func failed\n", op_get_name(op));
|
||||
return false;
|
||||
}
|
||||
|
||||
auto is_supported_func = kOpCapabilities[op].is_supported;
|
||||
if (!is_supported_func || !is_supported_func(src0, src1, dst, op)) {
|
||||
DEVICE_LOG_ERROR("Unsupported op: %s, is_supported_func failed\n", op_get_name(op));
|
||||
DEVICE_LOG_DEBUG("[%s]unsupported, is_supported_func failed\n", op_get_name(op));
|
||||
return false;
|
||||
}
@ -1,15 +1,10 @@
|
|||
#pragma once
|
||||
|
||||
#include "hexagon_npu.h"
|
||||
#include "tensor.hpp"
|
||||
#include "op_types.hpp"
|
||||
|
||||
namespace hexagon {
|
||||
|
||||
typedef bool (*compute_func_type)(tensor * dst, size_t tidx, size_t tcnt);
|
||||
typedef bool (*op_is_supported_func_type)(const npu_device_tensor_spec & src0, const npu_device_tensor_spec & src1,
|
||||
const npu_device_tensor_spec & dst, npu_device_tensor_op op);
|
||||
|
||||
compute_func_type get_compute_func(npu_device_tensor_op op);
|
||||
compute_func_type get_compute_func(tensor * dst);
|
||||
|
||||
bool support_op(const npu_device_tensor_spec & src0, const npu_device_tensor_spec & src1,
|
||||
const npu_device_tensor_spec & dst, npu_device_tensor_op op);
@ -2,17 +2,42 @@
|
|||
|
||||
#include <HTP/core/intrinsics.h>
|
||||
|
||||
#include "quants.hpp"
|
||||
#include "vtcm_mem.hpp"
|
||||
|
||||
namespace {
|
||||
|
||||
inline float vec_reduction_f32(HVX_Vector sums) {
|
||||
constexpr const size_t kFloatsPerVector = hexagon::kBytesPerVector / sizeof(float);
|
||||
static_assert(kFloatsPerVector == 32 || kFloatsPerVector == 16, "kFloatsPerVector should be 16 or 32");
|
||||
|
||||
// TODO: do we have a better way to do the reduction?
|
||||
switch (kFloatsPerVector) {
|
||||
default:
|
||||
case 32:
|
||||
sums = Q6_Vqf32_vadd_Vqf32Vqf32(sums, Q6_V_vror_VR(sums, 16 * sizeof(float)));
|
||||
// fallthrough
|
||||
case 16:
|
||||
sums = Q6_Vqf32_vadd_Vqf32Vqf32(sums, Q6_V_vror_VR(sums, 8 * sizeof(float)));
|
||||
sums = Q6_Vqf32_vadd_Vqf32Vqf32(sums, Q6_V_vror_VR(sums, 4 * sizeof(float)));
|
||||
sums = Q6_Vqf32_vadd_Vqf32Vqf32(sums, Q6_V_vror_VR(sums, 2 * sizeof(float)));
|
||||
sums = Q6_Vqf32_vadd_Vqf32Vqf32(sums, Q6_V_vror_VR(sums, sizeof(float)));
|
||||
break;
|
||||
}
|
||||
|
||||
return hexagon::get_flt0_from_fltv(Q6_Vsf_equals_Vqf32(sums));
|
||||
}
|
||||
|
||||
inline float vec_dot_product_f32_f32(const float * src0, const float * src1, size_t count) {
|
||||
constexpr const size_t kElementsPerVector = hexagon::kBytesPerVector / sizeof(float);
|
||||
|
||||
HVX_Vector * iptr0 = ((HVX_Vector *) src0);
|
||||
HVX_Vector * iptr0_end = ((HVX_Vector *) src0) + (count / hexagon::kFloatsPerVector);
|
||||
HVX_Vector * iptr0_end = ((HVX_Vector *) src0) + (count / kElementsPerVector);
|
||||
HVX_Vector * iptr1 = ((HVX_Vector *) src1);
|
||||
HVX_Vector prev0 = *iptr0++;
|
||||
HVX_Vector prev1 = *iptr1++;
|
||||
HVX_Vector sum = Q6_V_vzero();
|
||||
|
||||
// TODO: prefetch or just use VTCM?
|
||||
while (iptr0 < iptr0_end) {
|
||||
HVX_Vector curr0 = *iptr0++;
|
||||
HVX_Vector curr1 = *iptr1++;
|
||||
|
|
@ -41,7 +66,7 @@ inline float vec_dot_product_f32_f32(const float * src0, const float * src1, siz
|
|||
prev1 = curr1;
|
||||
}
|
||||
|
||||
const size_t leftover = count % hexagon::kFloatsPerVector;
|
||||
const size_t leftover = count % kElementsPerVector;
|
||||
const size_t leftover_bytes = leftover * sizeof(float);
|
||||
if (leftover > 0) {
|
||||
// handle the leftover elements
|
||||
|
|
@ -57,21 +82,201 @@ inline float vec_dot_product_f32_f32(const float * src0, const float * src1, siz
|
|||
Q6_V_valign_VVR(Q6_Vqf32_vmpy_VsfVsf(curr0, curr1), Q6_V_vzero(), leftover_bytes), sum);
|
||||
}
|
||||
|
||||
// TODO: do we have a better way to do the reduction?
|
||||
for (size_t i = hexagon::kFloatsPerVector / 2; i > 0; i /= 2) {
|
||||
sum = Q6_Vqf32_vadd_Vqf32Vqf32(sum, Q6_V_vror_VR(sum, i * sizeof(float)));
|
||||
return vec_reduction_f32(sum);
|
||||
}
|
||||
|
||||
// TODO: merge with vec_dot_product_f32_f32?
|
||||
inline float vec_dot_product_f16_f16(const npu_device_fp16_t * src0, const npu_device_fp16_t * src1, size_t count) {
|
||||
constexpr const size_t kElementsPerVector = hexagon::kBytesPerVector / sizeof(npu_device_fp16_t);
|
||||
constexpr const size_t kFloatsPerVector = hexagon::kBytesPerVector / sizeof(float);
|
||||
|
||||
HVX_Vector * iptr0 = ((HVX_Vector *) src0);
|
||||
HVX_Vector * iptr0_end = ((HVX_Vector *) src0) + (count / kElementsPerVector);
|
||||
HVX_Vector * iptr1 = ((HVX_Vector *) src1);
|
||||
HVX_Vector prev0 = *iptr0++;
|
||||
HVX_Vector prev1 = *iptr1++;
|
||||
HVX_Vector sum_hi = Q6_V_vzero();
|
||||
HVX_Vector sum_lo = Q6_V_vzero();
|
||||
|
||||
while (iptr0 < iptr0_end) {
|
||||
HVX_Vector curr0 = *iptr0++;
|
||||
HVX_Vector curr1 = *iptr1++;
|
||||
HVX_Vector s0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0);
|
||||
HVX_Vector s1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1);
|
||||
HVX_VectorPair result = Q6_Wqf32_vmpy_VhfVhf(s0, s1);
|
||||
sum_hi = Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_hi_W(result), sum_hi);
|
||||
sum_lo = Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(result), sum_lo);
|
||||
prev0 = curr0;
|
||||
prev1 = curr1;
|
||||
}
|
||||
|
||||
float result;
|
||||
q6op_vstu_variable_ARV(&result, sizeof(float), Q6_Vsf_equals_Vqf32(sum));
|
||||
return result;
|
||||
if ((iptr0_end - ((HVX_Vector *) src0)) > 0) {
|
||||
// handle the last vector
|
||||
// see also:
|
||||
// https://github.com/UbiquitousLearning/mllm/blob/babf4410352ce8730824c87699c025a0d4ce3a6f/src/backends/qnn/LLaMAOpPackageHtp/LLaMAPackage/src/ops/LLaMAMul.cpp#L147
|
||||
// or qualcomm sdk libs\qhl_hvx\src\qhblas_hvx\qhblas_hvx_aw_vector_add_ah.c
|
||||
bool iptr0_aligned = hexagon::is_addr_aligned(iptr0);
|
||||
HVX_Vector curr0 = iptr0_aligned ? prev0 : *iptr0;
|
||||
iptr0 = iptr0_aligned ? iptr0 : iptr0 + 1;
|
||||
bool iptr1_aligned = hexagon::is_addr_aligned(iptr1);
|
||||
HVX_Vector curr1 = iptr1_aligned ? prev1 : *iptr1;
|
||||
iptr1 = iptr1_aligned ? iptr1 : iptr1 + 1;
|
||||
HVX_Vector s0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0);
|
||||
HVX_Vector s1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1);
|
||||
HVX_VectorPair result = Q6_Wqf32_vmpy_VhfVhf(s0, s1);
|
||||
sum_hi = Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_hi_W(result), sum_hi);
|
||||
sum_lo = Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(result), sum_lo);
|
||||
prev0 = curr0;
|
||||
prev1 = curr1;
|
||||
}
|
||||
|
||||
const size_t leftover = count % kElementsPerVector;
|
||||
const size_t leftover_bytes = leftover * sizeof(npu_device_fp16_t);
|
||||
if (leftover > 0) {
|
||||
// handle the leftover elements
|
||||
HVX_Vector curr0 =
|
||||
(leftover_bytes + hexagon::unaligned_bytes(iptr0) > hexagon::kBytesPerVector) ? *iptr0 : prev0;
|
||||
curr0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0);
|
||||
|
||||
HVX_Vector curr1 =
|
||||
(leftover_bytes + hexagon::unaligned_bytes(iptr1) > hexagon::kBytesPerVector) ? *iptr1 : prev1;
|
||||
curr1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1);
|
||||
|
||||
HVX_VectorPair result = Q6_Wqf32_vmpy_VhfVhf(curr0, curr1);
|
||||
|
||||
// TODO: can we do this better?
|
||||
if (leftover > kFloatsPerVector) {
|
||||
sum_hi = Q6_Vqf32_vadd_Vqf32Vqf32(
|
||||
Q6_V_valign_VVR(Q6_V_hi_W(result), Q6_V_vzero(), (leftover % kFloatsPerVector) * sizeof(float)),
|
||||
sum_hi);
|
||||
sum_lo = Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(result), sum_lo);
|
||||
} else {
|
||||
sum_lo = Q6_Vqf32_vadd_Vqf32Vqf32(
|
||||
Q6_V_valign_VVR(Q6_V_lo_W(result), Q6_V_vzero(), leftover * sizeof(float)), sum_lo);
|
||||
}
|
||||
}
|
||||
|
||||
return vec_reduction_f32(Q6_Vqf32_vadd_Vqf32Vqf32(sum_hi, sum_lo));
|
||||
}
|
||||
|
||||
template <typename T> struct get_data_type {};
|
||||
|
||||
template <typename _TyData> struct get_data_type<float (*)(const _TyData *, const _TyData *, size_t)> {
|
||||
using type = _TyData;
|
||||
};
|
||||
|
||||
template <auto _DotFunc>
|
||||
void mul_mat_impl(hexagon::tensor * src0, hexagon::tensor * src1, hexagon::tensor * dst,
|
||||
hexagon::compute_params * params) {
|
||||
using data_type = typename get_data_type<decltype(_DotFunc)>::type;
|
||||
|
||||
const bool is_quantized = hexagon::is_quantized_type(src0->get_type());
|
||||
const auto src0_actual_row_size = hexagon::get_dequantized_row_size(src0);
|
||||
auto * dequantize_row_func = hexagon::get_type_traits(src0->get_type()).dequantize_row;
|
||||
if (is_quantized && dequantize_row_func == nullptr) {
|
||||
DEVICE_LOG_ERROR("Unsupported quantized src0 type: %d, dequantize_row_func is null\n", src0->get_type());
|
||||
return;
|
||||
}
|
||||
|
||||
const auto r02 = src1->get_ne(2) / src0->get_ne(2);
|
||||
const auto r03 = src1->get_ne(3) / src0->get_ne(3);
|
||||
const auto * src0_ptr = reinterpret_cast<const uint8_t *>(src0->get_read_buffer());
|
||||
const auto * src1_ptr = reinterpret_cast<const uint8_t *>(src1->get_read_buffer());
|
||||
auto * dst_ptr = reinterpret_cast<uint8_t *>(dst->get_write_buffer());
|
||||
const auto total_planes = dst->get_ne(3) * dst->get_ne(2);
|
||||
|
||||
auto start_end_plane = std::pair<int64_t, int64_t>{ 0, total_planes };
|
||||
auto start_end_row = std::pair<int64_t, int64_t>{ 0, dst->get_ne(1) };
|
||||
auto start_end_element = std::pair<int64_t, int64_t>{ 0, dst->get_ne(0) };
|
||||
|
||||
if (total_planes >= params->tcnt) {
|
||||
start_end_plane = hexagon::get_thread_work_slice(total_planes, params->tidx, params->tcnt);
|
||||
} else if (dst->get_ne(1) >= params->tcnt) {
|
||||
start_end_row = hexagon::get_thread_work_slice(dst->get_ne(1), params->tidx, params->tcnt);
|
||||
} else {
|
||||
start_end_element = hexagon::get_thread_work_slice(dst->get_ne(0), params->tidx, params->tcnt);
|
||||
}
|
||||
|
||||
if (start_end_plane.second <= start_end_plane.first || start_end_row.second <= start_end_row.first ||
|
||||
start_end_element.second <= start_end_element.first) {
|
||||
DEVICE_LOG_DEBUG(
|
||||
"mul_mat_impl: no work to do, start_end_plane: (%ld, %ld), start_end_row: (%ld, %ld), "
|
||||
"start_end_element: (%ld, %ld)\n",
|
||||
start_end_plane.first, start_end_plane.second, start_end_row.first, start_end_row.second,
|
||||
start_end_element.first, start_end_element.second);
|
||||
return;
|
||||
}
|
||||
|
||||
// cache the src0 plane in VTCM
|
||||
const size_t src0_plane_row_count = start_end_element.second - start_end_element.first;
|
||||
size_t src0_plane_cache_size = 0;
|
||||
uint8_t * src0_plane_cache_ptr = nullptr;
|
||||
const uint8_t * last_cached_plane_ptr = nullptr;
|
||||
if (is_quantized) {
|
||||
src0_plane_cache_size = src0_actual_row_size * src0_plane_row_count;
|
||||
src0_plane_cache_ptr = params->get_cache(src0_plane_cache_size, is_quantized);
|
||||
}
|
||||
|
||||
DEVICE_LOG_DEBUG("mul_mat_impl src0_actual_row_size: %zu, is_quantized: %d, vtcm_mem: %p(%zu)\n",
|
||||
src0_actual_row_size, is_quantized, (void *) src0_plane_cache_ptr, src0_plane_cache_size);
|
||||
|
||||
const size_t valid_row_bytes = src1->get_ne(0) * sizeof(data_type);
|
||||
DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_WITH_SUB_PROC(dst, params->tidx, dequant);
|
||||
for (int64_t ip = start_end_plane.first; ip < start_end_plane.second; ip++) {
|
||||
const auto i3 = ip / dst->get_ne(2);
|
||||
const auto i2 = ip - i3 * dst->get_ne(2);
|
||||
const auto * src0_plane = src0_ptr + i3 / r03 * src0->get_nb(3) + i2 / r02 * src0->get_nb(2) +
|
||||
start_end_element.first * src0->get_nb(1);
|
||||
const auto * src1_plane = src1_ptr + i3 * src1->get_nb(3) + i2 * src1->get_nb(2);
|
||||
auto * dst_plane = dst_ptr + i3 * dst->get_nb(3) + i2 * dst->get_nb(2);
|
||||
|
||||
if (src0_plane_cache_ptr) {
|
||||
if (last_cached_plane_ptr != src0_plane) {
|
||||
DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_ADD_SUB_PROC(dequant);
|
||||
|
||||
for (int64_t ir = 0; ir < (int64_t) src0_plane_row_count; ir++) {
|
||||
auto * src0_row = src0_plane + ir * src0->get_nb(1);
|
||||
if (ir + 1 < src0_plane_row_count) {
|
||||
hexagon::l2fetch_row(src0_row + src0->get_nb(1), src0->get_nb(1));
|
||||
}
|
||||
|
||||
auto * dst_row = reinterpret_cast<float *>(src0_plane_cache_ptr + ir * src0_actual_row_size);
|
||||
dequantize_row_func(src0_row, reinterpret_cast<float *>(dst_row), src0->get_ne(0),
|
||||
params->f16_to_f32_table);
|
||||
}
|
||||
|
||||
last_cached_plane_ptr = src0_plane;
|
||||
}
|
||||
|
||||
src0_plane = src0_plane_cache_ptr;
|
||||
}
|
||||
|
||||
for (int64_t i1 = start_end_row.first; i1 < start_end_row.second; i1++) {
|
||||
auto * src1_row = src1_plane + i1 * src1->get_nb(1);
|
||||
auto * dst_row = reinterpret_cast<float *>(dst_plane + i1 * dst->get_nb(1)) + start_end_element.first;
|
||||
for (int64_t i0 = 0; i0 < (int64_t) src0_plane_row_count; i0++) {
|
||||
auto * src0_row = src0_plane + i0 * src0_actual_row_size;
|
||||
if (i0 + 1 < src0_plane_row_count) {
|
||||
if (!src0_plane_cache_ptr) {
|
||||
hexagon::l2fetch_row(src0_row + src0_actual_row_size, valid_row_bytes);
|
||||
}
|
||||
} else if (ip + 1 < start_end_plane.second) {
|
||||
hexagon::l2fetch_row(src1_row + src1->get_nb(1), valid_row_bytes);
|
||||
}
|
||||
|
||||
// TODO: figure out how to handle an entire row
|
||||
dst_row[i0] = _DotFunc(reinterpret_cast<const data_type *>(src0_row),
|
||||
reinterpret_cast<const data_type *>(src1_row), (size_t) src0->get_ne(0));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
namespace hexagon {
|
||||
|
||||
bool mul_mat_f32(hexagon::tensor * out, size_t tidx, size_t tcnt) {
|
||||
bool mul_mat_f32(hexagon::tensor * out, compute_params * params) {
|
||||
if (!out) {
|
||||
return false;
|
||||
}
|
||||
|
|
@ -83,62 +288,80 @@ bool mul_mat_f32(hexagon::tensor * out, size_t tidx, size_t tcnt) {
|
|||
return true; // skip if no src
|
||||
}
|
||||
|
||||
const auto r02 = src1->get_ne(2) / src0->get_ne(2);
|
||||
const auto r03 = src1->get_ne(3) / src0->get_ne(3);
|
||||
const auto * src0_ptr = reinterpret_cast<const uint8_t *>(src0->get_data());
|
||||
const auto * src1_ptr = reinterpret_cast<const uint8_t *>(src1->get_data());
|
||||
auto * dst_ptr = reinterpret_cast<uint8_t *>(out->get_data());
|
||||
const auto total_planes = out->get_ne(3) * out->get_ne(2);
|
||||
// TODO: array?
|
||||
switch (src1->get_type()) {
|
||||
case NPU_DATA_TYPE_F32:
|
||||
mul_mat_impl<vec_dot_product_f32_f32>(src0, src1, out, params);
|
||||
return true;
|
||||
|
||||
const auto start_end_plane = (total_planes >= tcnt) ? get_thread_work_slice(total_planes, tidx, tcnt) :
|
||||
std::pair<int64_t, int64_t>{ 0, total_planes };
|
||||
const auto start_end_row = (total_planes >= tcnt) ? std::pair<int64_t, int64_t>{ 0, out->get_ne(1) } :
|
||||
get_thread_work_slice(out->get_ne(1), tidx, tcnt);
|
||||
for (int64_t ip = start_end_plane.first; ip < start_end_plane.second; ip++) {
|
||||
const auto i3 = ip / out->get_ne(2);
|
||||
const auto i2 = ip - i3 * out->get_ne(2);
|
||||
const auto * src0_plane = src0_ptr + i3 / r03 * src0->get_nb(3) + i2 / r02 * src0->get_nb(2);
|
||||
const auto * src1_plane = src1_ptr + i3 * src1->get_nb(3) + i2 * src1->get_nb(2);
|
||||
auto * dst_plane = dst_ptr + i3 * out->get_nb(3) + i2 * out->get_nb(2);
|
||||
for (int64_t i1 = start_end_row.first; i1 < start_end_row.second; i1++) {
|
||||
// TODO: prefetch row?
|
||||
auto * src1_row = src1_plane + i1 * src1->get_nb(1);
|
||||
auto * dst_row = reinterpret_cast<float *>(dst_plane + i1 * out->get_nb(1));
|
||||
for (int64_t i0 = 0; i0 < out->get_ne(0); i0++) {
|
||||
auto * src0_row = src0_plane + i0 * src0->get_nb(1);
|
||||
// TODO: figure out how to handle an entire row
|
||||
*dst_row++ =
|
||||
vec_dot_product_f32_f32(reinterpret_cast<const float *>(src0_row),
|
||||
reinterpret_cast<const float *>(src1_row), (size_t) src0->get_ne(0));
|
||||
}
|
||||
}
|
||||
case NPU_DATA_TYPE_F16:
|
||||
mul_mat_impl<vec_dot_product_f16_f16>(src0, src1, out, params);
|
||||
return true;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
return true;
|
||||
DEVICE_LOG_ERROR("Unsupported src1 tensor type: %s\n", get_type_name(src1->get_type()));
|
||||
return false;
|
||||
}
|
||||
|
||||
bool is_mul_mat_supported(const npu_device_tensor_spec & src0, const npu_device_tensor_spec & src1,
|
||||
const npu_device_tensor_spec & dst, npu_device_tensor_op op) {
|
||||
if (op != NPU_OP_MUL_MAT) {
|
||||
DEVICE_LOG_DEBUG("op is not NPU_OP_MUL_MAT: %d\n", op);
|
||||
DEVICE_LOG_DEBUG("op is not MUL_MAT: %d\n", op);
|
||||
return false;
|
||||
}
|
||||
|
||||
if (dst.type != NPU_DATA_TYPE_F32) {
|
||||
DEVICE_LOG_DEBUG("[%s]dst type is not F32: %s\n", op_get_name(op), get_type_name(dst.type));
|
||||
return false;
|
||||
}
|
||||
|
||||
if (src0.type != src1.type) {
|
||||
#ifdef GGML_HEXAGON_ENABLE_QUANTIZED_TENSORS
|
||||
if (src1.type != NPU_DATA_TYPE_F32) {
|
||||
DEVICE_LOG_DEBUG("[%s]src0.type(%s) and src1.type(%s) mismatch and src1 is not F32\n", op_get_name(op),
|
||||
get_type_name(src0.type), get_type_name(src1.type));
|
||||
return false;
|
||||
}
|
||||
|
||||
const auto type_traits = get_type_traits(src0.type);
|
||||
if (!type_traits.is_quantized || type_traits.dequantize_row == nullptr) {
|
||||
DEVICE_LOG_DEBUG("[%s]src0.type(%s) and src1.type(%s) mismatch and src0 is not quantized\n",
|
||||
op_get_name(op), get_type_name(src0.type), get_type_name(src1.type));
|
||||
return false;
|
||||
}
|
||||
|
||||
if (src0.ne[0] % type_traits.blck_size) {
|
||||
DEVICE_LOG_DEBUG("[%s]src0.type(%s) ne[0] is not aligned: %ld\n", op_get_name(op), get_type_name(src0.type),
|
||||
(long) src0.ne[0]);
|
||||
return false;
|
||||
}
|
||||
|
||||
DEVICE_LOG_DEBUG("[%s]supported quantized src0.type(%s) and src1.type(%s)\n", op_get_name(op),
|
||||
get_type_name(src0.type), get_type_name(src1.type));
|
||||
#else
|
||||
DEVICE_LOG_DEBUG("[%s]src0.type(%s) and src1.type(%s) mismatch and quantized tensors are not supported\n",
|
||||
op_get_name(op), get_type_name(src0.type), get_type_name(src1.type));
|
||||
return false;
|
||||
#endif
|
||||
}
|
||||
|
||||
if (src0.ne[0] != src1.ne[0] || src0.ne[1] != dst.ne[0]) {
|
||||
DEVICE_LOG_DEBUG("src0 and src1 cannot multiply: %ldx%ld vs %ldx%ld\n", (long) src0.ne[0], (long) src0.ne[1],
|
||||
(long) src1.ne[0], (long) src1.ne[1]);
|
||||
DEVICE_LOG_DEBUG("[%s]src0 and src1 cannot multiply: %ldx%ld vs %ldx%ld\n", op_get_name(op), (long) src0.ne[0],
|
||||
(long) src0.ne[1], (long) src1.ne[0], (long) src1.ne[1]);
|
||||
return false;
|
||||
}
|
||||
|
||||
if (src1.ne[1] != dst.ne[1] || src1.ne[2] != dst.ne[2] || src1.ne[3] != dst.ne[3]) {
|
||||
DEVICE_LOG_DEBUG("src1 and dst dimensions not match: %ldx%ld vs %ldx%ld\n", (long) src1.ne[2],
|
||||
(long) src1.ne[3], (long) dst.ne[2], (long) dst.ne[3]);
|
||||
DEVICE_LOG_DEBUG("[%s]src1 and dst dimensions not match: %ldx%ld vs %ldx%ld\n", op_get_name(op),
|
||||
(long) src1.ne[2], (long) src1.ne[3], (long) dst.ne[2], (long) dst.ne[3]);
|
||||
return false;
|
||||
}
|
||||
|
||||
if (src1.ne[2] % src0.ne[2] || src1.ne[3] % src0.ne[3]) {
|
||||
DEVICE_LOG_DEBUG("src0 cannot broadcast to src1: %ldx%ld vs %ldx%ld\n", (long) src0.ne[2], (long) src0.ne[3],
|
||||
(long) src1.ne[2], (long) src1.ne[3]);
|
||||
DEVICE_LOG_DEBUG("[%s]src0 cannot broadcast to src1: %ldx%ld vs %ldx%ld\n", op_get_name(op), (long) src0.ne[2],
|
||||
(long) src0.ne[3], (long) src1.ne[2], (long) src1.ne[3]);
|
||||
return false;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -2,15 +2,15 @@
|
|||
|
||||
#include <hexagon_types.h>
|
||||
|
||||
#include <cstdint>
|
||||
|
||||
#include "op_types.hpp"
|
||||
#include "tensor.hpp"
|
||||
|
||||
namespace hexagon {
|
||||
|
||||
constexpr const size_t kBytesPerVector = sizeof(HVX_Vector); // 128 for v73
|
||||
constexpr const size_t kFloatsPerVector = kBytesPerVector / sizeof(float);
|
||||
constexpr const size_t kAlignMask = kBytesPerVector - 1;
|
||||
constexpr const size_t kBytesPerVector = sizeof(HVX_Vector); // 128 for v73
|
||||
constexpr const size_t kAlignMask = kBytesPerVector - 1;
|
||||
constexpr const size_t kL2CacheSize = 8 * 1024;  // 8KB L2 cache
|
||||
constexpr const size_t kL2FetchAheadVectors = kL2CacheSize / kBytesPerVector;
|
||||
|
||||
inline size_t unaligned_bytes(const void * addr) {
|
||||
return ((size_t) addr) & kAlignMask;
|
||||
|
|
@ -20,7 +20,30 @@ inline bool is_addr_aligned(void * addr) {
|
|||
return unaligned_bytes(addr) == 0;
|
||||
}
|
||||
|
||||
bool mul_mat_f32(tensor * out, size_t tidx, size_t tcnt);
|
||||
inline void l2fetch(const void * p, uint32_t stride, uint32_t width, uint32_t height, uint32_t dir) {
|
||||
uint64_t control = HEXAGON_V64_CREATE_H(dir, stride, width, height);
|
||||
__asm__ __volatile__(" l2fetch(%0,%1) " : : "r"(p), "r"(control));
|
||||
}
|
||||
|
||||
inline void l2fetch_row(const uint8_t * curr_row, size_t bytes) {
|
||||
// TODO: should we use a smaller kL2FetchAheadVectors?
|
||||
int32_t l2fetch_vectors = Q6_R_min_RR(bytes / kBytesPerVector, kL2FetchAheadVectors);
|
||||
hexagon::l2fetch(curr_row, kBytesPerVector, kBytesPerVector, l2fetch_vectors, 0);
|
||||
}
|
||||
|
||||
inline float get_flt0_from_fltv(HVX_Vector vect) {
|
||||
// See also: tools\HEXAGON_Tools\8.6.07\Examples\StandAlone_Applications\QFloat\QFloat.c
|
||||
|
||||
union {
|
||||
int32_t i;
|
||||
float f;
|
||||
} cvt;
|
||||
|
||||
cvt.i = vect[0];
|
||||
return cvt.f;
|
||||
}
|
||||
|
||||
bool mul_mat_f32(tensor * out, compute_params * params);
|
||||
bool is_mul_mat_supported(const npu_device_tensor_spec & src0, const npu_device_tensor_spec & src1,
|
||||
const npu_device_tensor_spec & dst, npu_device_tensor_op op);
@@ -0,0 +1,58 @@
#pragma once

#include <algorithm>
#include <cstdint>
#include <memory>
#include <utility>

#include "hexagon_npu.h"
#include "tensor.hpp"
#include "util.hpp"
#include "vtcm_mem.hpp"

namespace hexagon {

struct compute_params {
    const size_t tidx;
    const size_t tcnt;
    const float * f16_to_f32_table;
    std::unique_ptr<hexagon::vtcm_mem> vtcm_cache;
    std::unique_ptr<uint8_t[]> mem_cache;
    size_t mem_cache_size = 0;

    uint8_t * get_cache(size_t size, bool fallback_to_mem) {
        if (!vtcm_cache || vtcm_cache->get_size() < size) {
            vtcm_cache = std::make_unique<hexagon::vtcm_mem>(size, false);
        }

        if (vtcm_cache->is_valid()) {
            return vtcm_cache->get_mem();
        }

        if (!fallback_to_mem) {
            DEVICE_LOG_DEBUG("vtcm_mem not valid, return nullptr\n");
            return nullptr;
        }

        DEVICE_LOG_DEBUG("vtcm_mem not valid, allocate from mem_cache\n");
        if (!mem_cache || mem_cache_size < size) {
            mem_cache = std::make_unique<uint8_t[]>(size + 256);
            mem_cache_size = mem_cache ? size : 0;
        }

        return mem_cache.get();
    }
};

typedef bool (*compute_func_type)(tensor * dst, compute_params * params);
typedef bool (*op_is_supported_func_type)(const npu_device_tensor_spec & src0, const npu_device_tensor_spec & src1,
                                          const npu_device_tensor_spec & dst, npu_device_tensor_op op);

inline constexpr std::pair<int64_t, int64_t> get_thread_work_slice(int64_t total, size_t tidx, size_t tcnt) {
    const auto elements_per_thread = (total + tcnt - 1) / tcnt;
    const auto start = tidx * elements_per_thread;
    const auto end = std::min<int64_t>(start + elements_per_thread, total);
    return { start, end };
}

} // namespace hexagon
|
||||
|
|
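A hedged sketch (not part of the diff) of how an op kernel might combine get_thread_work_slice with compute_params::get_cache; the kernel body and the scratch size are assumptions, while the tensor accessors follow the tensor class shown later in this patch.

// Sketch only: splits dst rows across threads and borrows per-thread scratch, VTCM first, heap on fallback.
inline bool example_row_kernel(hexagon::tensor * dst, hexagon::compute_params * params) {
    const auto work = hexagon::get_thread_work_slice(dst->get_ne(1), params->tidx, params->tcnt);
    if (work.first >= work.second) {
        return true;  // nothing assigned to this thread
    }

    const size_t scratch_bytes = static_cast<size_t>(dst->get_ne(0)) * sizeof(float);  // assumed row size
    uint8_t * scratch = params->get_cache(scratch_bytes, /*fallback_to_mem=*/true);
    if (scratch == nullptr) {
        return false;
    }

    for (int64_t row = work.first; row < work.second; ++row) {
        // ... per-row compute using `scratch` would go here ...
    }
    return true;
}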
@@ -0,0 +1,151 @@
#include "quants.hpp"

#include <hexagon_types.h>

#include <array>

static_assert(sizeof(npu_device_block_q4_K) ==
                  2 * sizeof(npu_device_fp16_t) + QUANT_K_SCALE_SIZE + QUANT_K_BLOCK_SIZE / 2,
              "wrong q4_K block size/padding");

static_assert(sizeof(npu_device_block_q4_0) == sizeof(npu_device_fp16_t) + QUANT_BLOCK_SIZE / 2,
              "wrong q4_0 block size/padding");

static_assert(sizeof(npu_device_block_q8_0) == sizeof(npu_device_fp16_t) + QUANT_BLOCK_SIZE,
              "wrong q8_0 block size/padding");

namespace {

inline float to_float(const npu_device_fp16_t src) {
    union {
        __fp16 f16;
        npu_device_fp16_t u16;
    } f16;

    f16.u16 = src;
    return f16.f16;
}

inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t * d, uint8_t * m) {
    if (j < 4) {
        *d = q[j] & 63;
        *m = q[j + 4] & 63;
    } else {
        *d = (q[j + 4] & 0xF) | ((q[j - 4] >> 6) << 4);
        *m = (q[j + 4] >> 4) | ((q[j - 0] >> 6) << 4);
    }
}

void dequantize_row_q8_0(const void * src, float * dst, size_t count, const float * f16_to_f32_table) {
    constexpr const int qk = QUANT_BLOCK_SIZE;
    const int nb = count / qk;
    const auto * src_ptr = reinterpret_cast<const npu_device_block_q8_0 *>(src);

    // TODO: use intrinsics
    for (int i = 0; i < nb; i++) {
        const float d = f16_to_f32_table[src_ptr[i].d];

        for (int j = 0; j < qk; ++j) {
            dst[i * qk + j] = src_ptr[i].qs[j] * d;
        }
    }
}

void dequantize_row_q4_0(const void * src, float * dst, size_t count, const float * f16_to_f32_table) {
    constexpr const int qk = QUANT_BLOCK_SIZE;
    static_assert(qk % 2 == 0, "qk must be even");

    const int nb = count / qk;
    const auto * src_ptr = reinterpret_cast<const npu_device_block_q4_0 *>(src);

    // TODO: use intrinsics
    for (int i = 0; i < nb; i++) {
        const float d = f16_to_f32_table[src_ptr[i].d];

        for (int j = 0; j < qk / 2; ++j) {
            const int x0 = (src_ptr[i].qs[j] & 0x0F) - 8;
            const int x1 = ((src_ptr[i].qs[j] >> 4) & 0xF) - 8;

            dst[i * qk + j + 0] = x0 * d;
            dst[i * qk + j + qk / 2] = x1 * d;
        }
    }
}

void dequantize_row_q4_K(const void * src, float * dst, size_t count, const float * f16_to_f32_table) {
    const int nb = count / QUANT_K_BLOCK_SIZE;
    const auto * src_ptr = reinterpret_cast<const npu_device_block_q4_K *>(src);

    // TODO: use intrinsics
    for (int i = 0; i < nb; i++) {
        const uint8_t * q = src_ptr[i].qs;

        const float d = f16_to_f32_table[src_ptr[i].d];
        const float min = f16_to_f32_table[src_ptr[i].dmin];

        int is = 0;
        uint8_t sc = 0;
        uint8_t m = 0;
        const auto * scales = src_ptr[i].scales;
        for (int j = 0; j < QUANT_K_BLOCK_SIZE; j += 64) {
            get_scale_min_k4(is + 0, scales, &sc, &m);
            const float d1 = d * sc;
            const float m1 = min * m;
            get_scale_min_k4(is + 1, scales, &sc, &m);
            const float d2 = d * sc;
            const float m2 = min * m;
            for (int l = 0; l < 32; ++l) {
                dst[0] = d1 * (q[l] & 0xF) - m1;
                dst[32] = d2 * ((q[l] >> 4) & 0xF) - m2;
                dst++;
            }
            dst += 32;
            q += 32;
            is += 2;
        }
    }
}

constexpr const hexagon::device_type_traits kDeviceTypeTraits[] = {
    { NPU_DATA_TYPE_F32, "F32", 1, false, nullptr },
    { NPU_DATA_TYPE_F16, "F16", 1, false, nullptr },
    { NPU_DATA_TYPE_Q8_0, "Q8_0", QUANT_BLOCK_SIZE, true, dequantize_row_q8_0 },
    { NPU_DATA_TYPE_Q4_0, "Q4_0", QUANT_BLOCK_SIZE, true, dequantize_row_q4_0 },
    { NPU_DATA_TYPE_Q4_K, "Q4_K", QUANT_K_BLOCK_SIZE, true, dequantize_row_q4_K },
};

static_assert(std::size(kDeviceTypeTraits) == NPU_DATA_TYPE_COUNT,
              "kDeviceTypeTraits size mismatch with npu_device_tensor_data_type enum");
static_assert(kDeviceTypeTraits[NPU_DATA_TYPE_F32].type == NPU_DATA_TYPE_F32,
              "kDeviceTypeTraits F32 type mismatch with npu_device_tensor_data_type enum");
static_assert(kDeviceTypeTraits[NPU_DATA_TYPE_F16].type == NPU_DATA_TYPE_F16,
              "kDeviceTypeTraits F16 type mismatch with npu_device_tensor_data_type enum");
static_assert(kDeviceTypeTraits[NPU_DATA_TYPE_Q8_0].type == NPU_DATA_TYPE_Q8_0,
              "kDeviceTypeTraits Q8_0 type mismatch with npu_device_tensor_data_type enum");
static_assert(kDeviceTypeTraits[NPU_DATA_TYPE_Q4_0].type == NPU_DATA_TYPE_Q4_0,
              "kDeviceTypeTraits Q4_0 type mismatch with npu_device_tensor_data_type enum");
static_assert(kDeviceTypeTraits[NPU_DATA_TYPE_Q4_K].type == NPU_DATA_TYPE_Q4_K,
              "kDeviceTypeTraits Q4_K type mismatch with npu_device_tensor_data_type enum");

}  // namespace

namespace hexagon {

bool init_f16_f32_table(float * table, size_t count) {
    constexpr const size_t kTableSize = (1U << 16);
    if (count < kTableSize) {
        return false;
    }

    for (size_t i = 0; i < count; ++i) {
        table[i] = to_float(i);
    }

    return true;
}

const device_type_traits & get_type_traits(npu_device_tensor_data_type type) {
    return kDeviceTypeTraits[type];
}

}  // namespace hexagon
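A hedged sketch (not part of the diff) of how the f16-to-f32 table and the per-type dequantize callback fit together; the 64K-entry buffer and the q8_0 input pointer are assumptions.

#include <vector>

// Sketch only: build the fp16 -> fp32 lookup table once, then dequantize one q8_0 row through the type traits.
inline bool example_dequantize_q8_0_row(const void * q8_0_row, size_t elem_count, float * out) {
    static std::vector<float> f16_table(1U << 16);
    static const bool table_ready = hexagon::init_f16_f32_table(f16_table.data(), f16_table.size());

    const auto & traits = hexagon::get_type_traits(NPU_DATA_TYPE_Q8_0);
    if (!table_ready || traits.dequantize_row == nullptr) {
        return false;
    }

    traits.dequantize_row(q8_0_row, out, elem_count, f16_table.data());  // elem_count: multiple of QUANT_BLOCK_SIZE
    return true;
}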
@@ -0,0 +1,78 @@

#include "hexagon_npu.h"
#include "tensor.hpp"
#include "util.hpp"

namespace hexagon {

bool init_f16_f32_table(float * table, size_t count);

typedef void (*dequantize_row_type)(const void * src, float * dst, size_t count, const float * f16_to_f32_table);

struct device_type_traits {
    npu_device_tensor_data_type type;
    const char * type_name;
    int64_t blck_size;
    bool is_quantized;
    dequantize_row_type dequantize_row;
};

const device_type_traits & get_type_traits(npu_device_tensor_data_type type);

inline bool is_quantized_type(npu_device_tensor_data_type type) {
    return get_type_traits(type).is_quantized;
}

inline size_t get_dequantized_row_size(tensor * tensor) {
    if (!is_quantized_type(tensor->get_type())) {
        return tensor->get_nb(1);  // for f32 and f16
    }

    auto row_elems_count = tensor->get_ne(0);
    return row_elems_count * sizeof(float);  // currently only f32 is supported
}

inline const char * get_type_name(npu_device_tensor_data_type type) {
    return get_type_traits(type).type_name;
}

}  // namespace hexagon

// TODO: move this to a common header
#ifdef GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING
namespace hexagon {

inline auto make_scoped_op_perf_timer(tensor * op, size_t tidx, const char * sub_proc_log_prefix = nullptr) {
    auto * src0 = op->get_src(0);
    auto * src1 = op->get_src(1);
    char buffer[512];
    if (src1 == nullptr) {
        snprintf(buffer, sizeof(buffer), "[%s][%lldx%lldx%lldx%lld%s], tidx: %zu", op_get_name(op->get_op()),
                 src0->get_ne(0), src0->get_ne(1), src0->get_ne(2), src0->get_ne(3), get_type_name(src0->get_type()),
                 tidx);
    } else {
        snprintf(buffer, sizeof(buffer), "[%s][%lldx%lldx%lldx%lld%s],[%lldx%lldx%lldx%lld%s], tidx: %zu",
                 op_get_name(op->get_op()), src0->get_ne(0), src0->get_ne(1), src0->get_ne(2), src0->get_ne(3),
                 get_type_name(src0->get_type()), src1->get_ne(0), src1->get_ne(1), src1->get_ne(2), src1->get_ne(3),
                 get_type_name(src1->get_type()), tidx);
    }
    return npu_scoped_timer<512>(buffer, sub_proc_log_prefix);
}

}  // namespace hexagon

#    define DEVICE_SCOPED_OP_PERFORMANCE_TRACKER(op, tidx) \
        auto __npu_op_timer_##__LINE__ = hexagon::make_scoped_op_perf_timer(op, tidx)

#    define DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_WITH_SUB_PROC(op, tidx, sub_prefix) \
        auto __npu_op_timer_##sub_prefix = hexagon::make_scoped_op_perf_timer(op, tidx, #sub_prefix)

#    define DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_ADD_SUB_PROC(sub_prefix) \
        hexagon::npu_sub_process_scoped_timer<decltype(__npu_op_timer_##sub_prefix)::kBufferCount> \
            __npu_op_sub_timer##sub_prefix(__npu_op_timer_##sub_prefix)

#else
#    define DEVICE_SCOPED_OP_PERFORMANCE_TRACKER(op, tidx) ((void) 0)
#    define DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_WITH_SUB_PROC(op, tidx, sub_prefix) ((void) 0)
#    define DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_ADD_SUB_PROC(sub_prefix) ((void) 0)
#endif
@@ -36,7 +36,14 @@ class tensor {
        DEVICE_LOG_INFO("~tensor(%p) fd: %d", (void *) this, _info.buffer_fd);
    }

    void flush() {
    void flush() const {
        if (_data) {
            qurt_mem_cache_clean((qurt_addr_t) (_data + _info.offset), (qurt_size_t) _info.size, QURT_MEM_CACHE_FLUSH,
                                 QURT_MEM_DCACHE);
        }
    }

    void invalidate() const {
        if (_data) {
            qurt_mem_cache_clean((qurt_addr_t) (_data + _info.offset), (qurt_size_t) _info.size,
                                 QURT_MEM_CACHE_INVALIDATE, QURT_MEM_DCACHE);
@@ -72,7 +79,12 @@ class tensor {

    npu_device_tensor_data_type get_type() const { return _info.type; }

    uint8_t * get_data() const { return _data + _info.offset; }
    const uint8_t * get_read_buffer() const {
        invalidate();
        return _data + _info.offset;
    }

    uint8_t * get_write_buffer() const { return _data + _info.offset; }

    bool is_valid() const { return _data != nullptr; }
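A hedged sketch (not part of the diff) of the read/write discipline these accessors suggest: invalidate before reading bytes produced outside the DSP, flush after the NPU writes its result. The wrapper function itself is an assumption.

// Sketch only: get_read_buffer() invalidates the dcache range; flush() writes the result back after compute.
inline void example_consume_and_produce(hexagon::tensor * src, hexagon::tensor * dst) {
    const uint8_t * in = src->get_read_buffer();
    uint8_t * out = dst->get_write_buffer();
    // ... compute from `in` into `out` ...
    (void) in;
    (void) out;
    dst->flush();
}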
@@ -143,6 +143,8 @@ template <size_t _thread_count> class thread_pool {
        return true;
    }

    void sync_thread() { qurt_barrier_wait(&_completed); }

  private:
    struct thread_pool_arg {
        thread_pool * pool = nullptr;
@@ -1,9 +1,10 @@
#pragma once

#include <HAP_farf.h>
#include <HAP_perf.h>

#include <algorithm>
#include <cstdint>
#include <cstring>
#include <utility>

#include "hexagon_npu.h"
@@ -52,11 +53,105 @@ inline constexpr const char * op_get_name(npu_device_tensor_op op) {
    }
}

inline constexpr std::pair<int64_t, int64_t> get_thread_work_slice(int64_t total, size_t tidx, size_t tcnt) {
    const auto elements_per_thread = (total + tcnt - 1) / tcnt;
    const auto start = tidx * elements_per_thread;
    const auto end = std::min<int64_t>(start + elements_per_thread, total);
    return { start, end };
#ifdef GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING

template <size_t _buffer_count> class npu_scoped_timer {
  public:
    enum { kBufferCount = _buffer_count };

    explicit npu_scoped_timer(const char * log_prefix, const char * sub_proc_log_prefix) {
        strncpy(_log_prefix, log_prefix, kBufferCount - 1);
        if (sub_proc_log_prefix != nullptr) {
            strncpy(_sub_proc_log_prefix, sub_proc_log_prefix, kBufferCount - 1);
        }

        _begin_cycles = HAP_perf_get_qtimer_count();
        _begin_pcycles = HAP_perf_get_pcycles();
    }

    npu_scoped_timer(npu_scoped_timer && other) { *this = std::move(other); }

    ~npu_scoped_timer() { print(); }

    void operator=(npu_scoped_timer && other) {
        strncpy(_log_prefix, other._log_prefix, kBufferCount - 1);
        strncpy(_sub_proc_log_prefix, other._sub_proc_log_prefix, kBufferCount - 1);
        _begin_cycles = other._begin_cycles;
        _sub_proc_cycles = other._sub_proc_cycles;
        _sub_proc_count = other._sub_proc_count;
    }

    void add_sub_proc_cycles(uint64_t cycles, uint64_t pcycles) {
        _sub_proc_cycles += cycles;
        _sub_proc_pcycles += pcycles;
        _sub_proc_count++;
    }

    void print() const {
        auto total_cycles = HAP_perf_get_qtimer_count() - _begin_cycles;
        auto total_pcycles = HAP_perf_get_pcycles() - _begin_pcycles;
        auto duration = HAP_perf_qtimer_count_to_us(total_cycles);

        if (_sub_proc_count > 0) {
            auto sub_proc_duration = HAP_perf_qtimer_count_to_us(_sub_proc_cycles);
            DEVICE_LOG_WARN("[profiler]%s, pcyc: %llu, dur: %lluus, [%s]cnt: %llu, pcyc: %llu, dur: %lluus\n",
                            _log_prefix, total_pcycles, duration, _sub_proc_log_prefix, _sub_proc_count,
                            _sub_proc_pcycles, sub_proc_duration);
        } else {
            DEVICE_LOG_WARN("[profiler]%s, pcyc: %llu, dur: %lluus\n", _log_prefix, total_pcycles, duration);
        }
    }

  private:
    char _log_prefix[kBufferCount] = {};
    char _sub_proc_log_prefix[kBufferCount] = {};
    uint64_t _begin_cycles = 0;
    uint64_t _begin_pcycles = 0;
    uint64_t _sub_proc_cycles = 0;
    uint64_t _sub_proc_pcycles = 0;
    uint64_t _sub_proc_count = 0;

    DISABLE_COPY(npu_scoped_timer);
};

template <size_t _buffer_count> class npu_sub_process_scoped_timer {
  public:
    using npu_scoped_timer = npu_scoped_timer<_buffer_count>;

    explicit npu_sub_process_scoped_timer(npu_scoped_timer & timer) : _timer(timer) {
        _begin_cycles = HAP_perf_get_qtimer_count();
        _begin_pcycles = HAP_perf_get_pcycles();
    }

    ~npu_sub_process_scoped_timer() {
        _timer.add_sub_proc_cycles(HAP_perf_get_qtimer_count() - _begin_cycles,
                                   HAP_perf_get_pcycles() - _begin_pcycles);
    }

  private:
    npu_scoped_timer & _timer;
    uint64_t _begin_cycles = 0;
    uint64_t _begin_pcycles = 0;

    DISABLE_COPY_AND_MOVE(npu_sub_process_scoped_timer);
};

inline auto make_scoped_perf_timer(const char * format, ...) {
    va_list args;
    va_start(args, format);
    char buffer[512];
    vsnprintf(buffer, sizeof(buffer), format, args);
    va_end(args);
    return npu_scoped_timer<512>(buffer, nullptr);
}

#endif

}  // namespace hexagon

#ifdef GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING
#    define DEVICE_SCOPED_PERFORMANCE_TRACKER(fmt, ...) \
        auto __npu_timer_##__LINE__ = hexagon::make_scoped_perf_timer(fmt, __VA_ARGS__)
#else
#    define DEVICE_SCOPED_PERFORMANCE_TRACKER(fmt, ...) ((void) 0)
#endif
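A hedged usage sketch (not part of the diff) for the scoped profiler macro above; the function and its arguments are placeholders, and the macro collapses to ((void) 0) when GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING is off.

// Sketch only: the timer prints pcycles and duration when it goes out of scope.
static void example_profiled_step(int rows, size_t tidx) {
    DEVICE_SCOPED_PERFORMANCE_TRACKER("example_step, rows: %d, tidx: %zu", rows, tidx);
    // ... the work being measured goes here ...
}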
@@ -0,0 +1,101 @@
#pragma once

#include <HAP_vtcm_mgr.h>

#include "util.hpp"

namespace hexagon {

class vtcm_mem {
  public:
    explicit vtcm_mem(size_t size, bool single_page) {
        size_t avail_size = single_page ? get_avail_page_size() : get_avail_block_size();
        if (size > avail_size) {
            DEVICE_LOG_ERROR("Requested VTCM size %zu exceeds available size %zu\n", size, avail_size);
            return;
        }

        _vtcm_mem = HAP_request_VTCM((unsigned int) size, single_page ? 1 : 0);
        if (_vtcm_mem == nullptr) {
            DEVICE_LOG_ERROR("Failed to allocate VTCM memory: %zu bytes\n", size);
            return;
        }

        _vtcm_size = size;
        DEVICE_LOG_DEBUG("VTCM allocated: %p(%zu), avail: %zu\n", _vtcm_mem, size, avail_size);
    }

    explicit vtcm_mem(size_t size, bool single_page, size_t timeout_us) {
        _vtcm_mem = HAP_request_async_VTCM((unsigned int) size, single_page ? 1 : 0, (unsigned int) timeout_us);
        if (_vtcm_mem == nullptr) {
            DEVICE_LOG_ERROR("Failed to allocate VTCM memory: %zu bytes, timeout %zu us\n", size, timeout_us);
            return;
        }

        _vtcm_size = size;
        DEVICE_LOG_DEBUG("VTCM allocated: %p(%zu), avail: %zu\n", _vtcm_mem, size, get_avail_block_size());
    }

    ~vtcm_mem() {
        if (is_valid()) {
            auto ret = HAP_release_VTCM(_vtcm_mem);
            if (ret != AEE_SUCCESS) {
                DEVICE_LOG_ERROR("Failed to release VTCM memory: %d\n", ret);
            }
        }

        DEVICE_LOG_DEBUG("VTCM released: %zu bytes at %p\n", _vtcm_size, _vtcm_mem);
    }

    bool is_valid() const { return _vtcm_mem != nullptr; }

    uint8_t * get_mem() const { return reinterpret_cast<uint8_t *>(_vtcm_mem); }

    size_t get_size() const { return _vtcm_size; }

    static size_t get_total_size() {
        unsigned int arch_page_aligned_size = 0;
        unsigned int arch_page_count = 0;
        auto ret = HAP_query_total_VTCM(&arch_page_aligned_size, &arch_page_count);
        if (ret != AEE_SUCCESS) {
            DEVICE_LOG_ERROR("Failed to query total VTCM: %d\n", ret);
            return 0;
        }

        return arch_page_aligned_size;
    }

    static size_t get_avail_block_size() {
        unsigned int avail_block_size = 0;
        unsigned int avail_page_size = 0;
        unsigned int num_pages = 0;
        auto ret = HAP_query_avail_VTCM(&avail_block_size, &avail_page_size, &num_pages);
        if (ret != AEE_SUCCESS) {
            DEVICE_LOG_ERROR("Failed to query available VTCM: %d\n", ret);
            return 0;
        }

        return avail_block_size;
    }

    static size_t get_avail_page_size() {
        unsigned int avail_block_size = 0;
        unsigned int avail_page_size = 0;
        unsigned int num_pages = 0;
        auto ret = HAP_query_avail_VTCM(&avail_block_size, &avail_page_size, &num_pages);
        if (ret != AEE_SUCCESS) {
            DEVICE_LOG_ERROR("Failed to query available VTCM: %d\n", ret);
            return 0;
        }

        return avail_page_size;
    }

  private:
    void * _vtcm_mem = nullptr;
    size_t _vtcm_size = 0;

    DISABLE_COPY_AND_MOVE(vtcm_mem);
};

}  // namespace hexagon
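A hedged sketch (not part of the diff) showing one way to size a VTCM request against the currently available block before constructing vtcm_mem; the requested byte count is an assumption.

#include <memory>

// Sketch only: clamp the request to what the VTCM block can currently provide, mirroring the constructor's check.
inline std::unique_ptr<hexagon::vtcm_mem> example_alloc_vtcm(size_t wanted_bytes) {
    const size_t avail = hexagon::vtcm_mem::get_avail_block_size();
    const size_t size = wanted_bytes < avail ? wanted_bytes : avail;

    auto mem = std::make_unique<hexagon::vtcm_mem>(size, /*single_page=*/false);
    if (!mem->is_valid()) {
        return nullptr;  // caller can fall back to regular memory, as compute_params::get_cache does
    }
    return mem;
}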
@@ -75,6 +75,12 @@ void backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
    memset(buffer_obj->get_buffer(), value, buffer_obj->get_size());
}

void backend_buffer_reset(ggml_backend_buffer_t buffer) {
    auto * buffer_obj = get_buffer_object(buffer);
    GGML_ASSERT(buffer_obj != nullptr);
    buffer_obj->clear_tensors();
}

constexpr const ggml_backend_buffer_i backend_buffer_interface = {
    /* .free_buffer = */ backend_buffer_free_buffer,
    /* .get_base = */ backend_buffer_get_base,
@@ -84,7 +90,7 @@ constexpr const ggml_backend_buffer_i backend_buffer_interface = {
    /* .get_tensor = */ backend_buffer_get_tensor,
    /* .cpy_tensor = */ backend_buffer_cpy_tensor,
    /* .clear = */ backend_buffer_clear,
    /* .reset = */ nullptr,
    /* .reset = */ backend_buffer_reset,
};

const char * backend_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
@@ -190,6 +196,11 @@ std::shared_ptr<host_tensor> host_buffer::init_tensor(ggml_tensor * tensor, remo
    return tensor_object;
}

void host_buffer::clear_tensors() {
    _tensors.clear();
    LOG_DEBUG("clear host_buffer(%p) tensors\n", (void *) _data);
}

host_buffer_type::host_buffer_type(ggml_backend_dev_t dev, const std::string & name, common::rpc_mem_ptr rpc_mem) :
    _name(name),
    _rpc_mem(rpc_mem) {
@@ -25,6 +25,8 @@ class host_buffer {

    std::shared_ptr<host_tensor> init_tensor(ggml_tensor * tensor, remote_handle64 device_handle);

    void clear_tensors();

  private:
    common::rpc_mem_ptr _allocator;
    void * _data = nullptr;
@@ -32,7 +32,8 @@ bool host_graph::update(ggml_cgraph * cgraph) {
    _tensor_handles.reserve(cgraph->n_nodes);
    for (int i = 0; i < cgraph->n_nodes; ++i) {
        auto * node = cgraph->nodes[i];
        if (node->op == GGML_OP_NONE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE) {
        if (node->op == GGML_OP_NONE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE ||
            node->op == GGML_OP_RESHAPE) {
            // skip view-like ops
            LOG_DEBUG("node[%d]%s(%s), addr: %p, type: %s, skipped\n", i, ggml_get_name(node), ggml_op_desc(node),
                      (void *) node, ggml_type_name(node->type));
@@ -55,8 +56,8 @@ bool host_graph::update(ggml_cgraph * cgraph) {
        }
    }

    LOG_DEBUG("host_graph::update, host_graph(%p), ggml_cgraph(%p), tensor count(%zu)\n", (void *) this,
              (void *) cgraph, _tensor_handles.size());
    LOG_DEBUG("host_graph::update, host_graph(%p), handle(%p), ggml_cgraph(%p), tensor count(%zu)\n", (void *) this,
              (void *) _graph_handle, (void *) cgraph, _tensor_handles.size());
    if (!_tensor_handles.empty()) {
        npu_device_graph_set_tensor(_device_handle, _graph_handle, _tensor_handles.data(),
                                    (int) _tensor_handles.size());
@@ -57,7 +57,7 @@ void backend_dev_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props
ggml_backend_t backend_dev_init_backend(ggml_backend_dev_t dev, const char * params) {
    auto * dev_obj = get_device_object(dev);
    GGML_ASSERT(dev_obj != nullptr);
    if (!dev_obj->init_device(dev, params)) {
    if (!dev_obj->init_device()) {
        LOG_ERROR("[%s]Failed to init device\n", backend_dev_get_name(dev));
        return nullptr;
    }
@@ -7,6 +7,8 @@

#include <remote.h>

#include <type_traits>

#include "graph.hpp"
#include "util.hpp"
@@ -114,11 +116,117 @@ bool npu_device::is_device_initialized() const {
    return true;
}

bool npu_device::init_device(ggml_backend_dev_t dev, const char * params) {
bool npu_device::init_device() {
    if (!init_rpc_mem()) {
        return false;
    }

    if (!init_device_lib()) {
        return false;
    }

    return true;
}

bool npu_device::supports_buft(ggml_backend_buffer_type_t buft) const {
    return buft && buft->device && buft->device->context == this;
}

bool npu_device::supports_op_impl(const ggml_tensor * op) {
    static_assert(std::is_same<npu_device_fp16_t, ggml_fp16_t>::value,
                  "npu_device_fp16_t should be same as ggml_fp16_t");

    if (op->op == GGML_OP_NONE) {
        return true;
    }

    if (op->op == GGML_OP_VIEW || op->op == GGML_OP_RESHAPE || op->op == GGML_OP_PERMUTE) {
        return true;
    }

    if (type_to_npu_type(op->type) == NPU_DATA_TYPE_COUNT) {
        LOG_DEBUG("[%s]Unsupported op tensor type: %s\n", get_name(), ggml_type_name(op->type));
        return false;
    }

    auto * src0 = op->src[0];
    if (!src0) {
        LOG_DEBUG("[%s]Unsupported inplace op: %s\n", get_name(), ggml_op_name(op->op));
        return false;
    }

    if (type_to_npu_type(src0->type) == NPU_DATA_TYPE_COUNT) {
        LOG_DEBUG("[%s]Unsupported src0 tensor type: %s\n", get_name(), ggml_type_name(src0->type));
        return false;
    }

    auto * src1 = op->src[1];
    if (src1 && type_to_npu_type(src1->type) == NPU_DATA_TYPE_COUNT) {
        LOG_DEBUG("[%s]Unsupported src1 tensor type: %s\n", get_name(), ggml_type_name(src1->type));
        return false;
    }

    auto npu_op = op_to_npu_op(op->op);
    if (npu_op == NPU_OP_COUNT) {
        LOG_DEBUG("[%s]Unsupported op: %s\n", get_name(), ggml_op_name(op->op));
        return false;
    }

    if (!_device_handle && !init_device()) {
        LOG_DEBUG("[%s]NPU device initialization failed\n", get_name());
        return false;
    }

    constexpr const auto get_spec = [](const ggml_tensor * tensor) -> npu_device_tensor_spec {
        if (!tensor) {
            return npu_device_tensor_spec{};
        }

        static_assert(DEVICE_TENSOR_MAX_DIMS == GGML_MAX_DIMS, "tensor dimensions mismatch");
        npu_device_tensor_spec spec{};
        spec.ne[0] = tensor->ne[0];
        spec.ne[1] = tensor->ne[1];
        spec.ne[2] = tensor->ne[2];
        spec.ne[3] = tensor->ne[3];
        spec.type = type_to_npu_type(tensor->type);
        return spec;
    };

    boolean supported = false;
    auto src0_spec = get_spec(src0);
    auto src1_spec = get_spec(src1);
    auto dst_spec = get_spec(op);
    auto ret = npu_device_device_support_op(_device_handle, &src0_spec, &src1_spec, &dst_spec, npu_op, &supported);
    if (ret != AEE_SUCCESS || !supported) {
        LOG_DEBUG("[%s][%s]unsupported %s(%s,%s), ret: 0x%x, supported: %d\n", get_name(), ggml_op_name(op->op),
                  ggml_type_name(op->type), ggml_type_name(src0->type), (src1 ? ggml_type_name(src1->type) : "null"),
                  ret, supported);
        return false;
    }

    return true;
}

bool npu_device::init_rpc_mem() {
    if (!_rpc_mem) {
        auto rpc_interface = std::make_shared<common::rpc_interface>();
        if (!rpc_interface->is_valid()) {
            LOG_ERROR("[%s]Failed to load rpc memory library\n", get_name());
            return false;
        }

        auto rpc_mem = std::make_shared<common::rpc_mem>(rpc_interface);
        _rpc_interface = rpc_interface;
        _rpc_mem = rpc_mem;
        LOG_DEBUG("[%s]rpc memory initialized\n", get_name());
    } else {
        LOG_DEBUG("[%s]rpc memory already initialized\n", get_name());
    }

    return true;
}

bool npu_device::init_device_lib() {
    if (!_device_handle) {
        auto arch = get_dsp_arch(_rpc_interface, _dsp_domain_id);
        const auto & device_lib_info = get_device_library_info(arch);
@@ -152,97 +260,38 @@ bool npu_device::init_device(ggml_backend_dev_t dev, const char * params) {
    return true;
}

bool npu_device::supports_buft(ggml_backend_buffer_type_t buft) const {
    return buft && buft->device && buft->device->context == this;
}

bool npu_device::supports_op_impl(const ggml_tensor * op) {
    if (op->op == GGML_OP_NONE) {
        return true;
    }

    if (type_to_npu_type(op->type) == NPU_DATA_TYPE_COUNT) {
        LOG_DEBUG("[%s]Unsupported op tensor type: %s\n", get_name(), ggml_type_name(op->type));
        return false;
    }

    auto * src0 = op->src[0];
    if (!src0) {
        LOG_DEBUG("[%s]Unsupported inplace op: %s\n", get_name(), ggml_op_name(op->op));
        return false;
    }

    if (type_to_npu_type(src0->type) == NPU_DATA_TYPE_COUNT) {
        LOG_DEBUG("[%s]Unsupported src0 tensor type: %s\n", get_name(), ggml_type_name(src0->type));
        return false;
    }

    auto * src1 = op->src[1];
    if (src1 && type_to_npu_type(src1->type) == NPU_DATA_TYPE_COUNT) {
        LOG_DEBUG("[%s]Unsupported src1 tensor type: %s\n", get_name(), ggml_type_name(src1->type));
        return false;
    }

    auto npu_op = op_to_npu_op(op->op);
    if (npu_op == NPU_OP_COUNT) {
        LOG_DEBUG("[%s]Unsupported op: %s\n", get_name(), ggml_op_name(op->op));
        return false;
    }

    constexpr const auto get_spec = [](const ggml_tensor * tensor) -> npu_device_tensor_spec {
        if (!tensor) {
            return npu_device_tensor_spec{};
        }

        static_assert(DEVICE_TENSOR_MAX_DIMS == GGML_MAX_DIMS, "tensor dimensions mismatch");
        npu_device_tensor_spec spec{};
        spec.ne[0] = tensor->ne[0];
        spec.ne[1] = tensor->ne[1];
        spec.ne[2] = tensor->ne[2];
        spec.ne[3] = tensor->ne[3];
        spec.type = type_to_npu_type(tensor->type);
        return spec;
    };

    boolean supported = false;
    auto src0_spec = get_spec(src0);
    auto src1_spec = get_spec(src1);
    auto dst_spec = get_spec(op);
    auto ret = npu_device_device_support_op(_device_handle, &src0_spec, &src1_spec, &dst_spec, npu_op, &supported);
    if (ret != AEE_SUCCESS || !supported) {
        LOG_DEBUG("[%s]Unsupported op: %s, ret: 0x%x, supported: %d\n", get_name(), ggml_op_name(op->op), ret,
                  supported);
        return false;
    }

    LOG_DEBUG("[%s]Supported op: %s\n", get_name(), ggml_op_name(op->op));
    return true;
}

bool npu_device::init_rpc_mem() {
    if (!_rpc_mem) {
        auto rpc_interface = std::make_shared<common::rpc_interface>();
        if (!rpc_interface->is_valid()) {
            LOG_ERROR("[%s]Failed to load rpc memory library\n", get_name());
            return false;
        }

        auto rpc_mem = std::make_shared<common::rpc_mem>(rpc_interface);
        _rpc_interface = rpc_interface;
        _rpc_mem = rpc_mem;
        LOG_DEBUG("[%s]rpc memory initialized\n", get_name());
    } else {
        LOG_DEBUG("[%s]rpc memory already initialized\n", get_name());
    }

    return true;
}

bool npu_device::offload_op(const ggml_tensor * op) {
    // TODO: implement this
    return false;
}

#ifndef NDEBUG
bool npu_device::supports_op(const ggml_tensor * op) {
    char op_desc[1024];
    get_op_tensor_desc(op, op_desc, sizeof(op_desc));

    if (supports_op_impl(op)) {
        if (op->op != GGML_OP_NONE && op->op != GGML_OP_VIEW && op->op != GGML_OP_RESHAPE &&
            op->op != GGML_OP_PERMUTE) {
            _supported_op++;
            LOG_DEBUG("[%s][%s]supported, %s, supported/unsupported: %u/%u\n", get_name(), ggml_op_name(op->op),
                      op_desc, _supported_op.load(), _unsupported_op.load());
        }

        return true;
    }

    _unsupported_op++;
    LOG_DEBUG("[%s][%s]unsupported, %s, supported/unsupported: %u/%u\n", get_name(), ggml_op_name(op->op), op_desc,
              _supported_op.load(), _unsupported_op.load());
    return false;
}
#else
bool npu_device::supports_op(const ggml_tensor * op) {
    return supports_op_impl(op);
}
#endif

ggml_backend_buffer_type_t npu_device::get_default_buffer_type(ggml_backend_dev_t dev) {
    // Note that this function will be called before the npu_device::init_device
    if (!init_rpc_mem()) {
@@ -31,37 +31,18 @@ class npu_device {
    ggml_backend_buffer_type_t get_default_buffer_type(ggml_backend_dev_t dev);

    bool is_device_initialized() const;
    bool init_device(ggml_backend_dev_t dev, const char * params);
    bool init_device();

    bool supports_buft(ggml_backend_buffer_type_t buft) const;
    bool offload_op(const ggml_tensor * op);

#ifndef NDEBUG
    bool supports_op(const ggml_tensor * op) {
        if (supports_op_impl(op)) {
            if (op->op != GGML_OP_NONE) {
                _supported_op++;
                LOG_DEBUG("[%s]Supported op: %s, supported/unsupported: %u/%u\n", get_name(), ggml_op_name(op->op),
                          _supported_op.load(), _unsupported_op.load());
            }

            return true;
        }

        _unsupported_op++;
        LOG_DEBUG("[%s]Unsupported op: %s, supported/unsupported: %u/%u\n", get_name(), ggml_op_name(op->op),
                  _supported_op.load(), _unsupported_op.load());
        return false;
    }
#else
    bool supports_op(const ggml_tensor * op) { return supports_op_impl(op); }
#endif
    bool supports_op(const ggml_tensor * op);

    remote_handle64 get_device_handle() const { return _device_handle; }

  private:
    bool supports_op_impl(const ggml_tensor * op);
    bool init_rpc_mem();
    bool init_device_lib();

    std::string _name = "hexagon-npu";
    std::string _description = "Hexagon NPU";
@@ -40,19 +40,17 @@ class host_tensor {

        tensor->extra = this;
        _ggml_tensor = tensor;
        LOG_DEBUG(
            "host_tensor(%p) created, ggml_tensor(%p[%ldx%ldx%ldx%ld], nb[%ld][%ld][%ld][%ld]), "
            "device_tensor_handle(%p)\n",
            (void *) this, (void *) tensor, (long) tensor->ne[0], (long) tensor->ne[1], (long) tensor->ne[2],
            (long) tensor->ne[3], (long) tensor->nb[0], (long) tensor->nb[1], (long) tensor->nb[2],
            (long) tensor->nb[3], (void *) _device_tensor_handle);
        LOG_DEBUG("host_tensor(%p), ggml_tensor(%p[%ldx%ldx%ldx%ld], nb[%ld][%ld][%ld][%ld], %s), handle(%p)\n",
                  (void *) this, (void *) tensor, (long) tensor->ne[0], (long) tensor->ne[1], (long) tensor->ne[2],
                  (long) tensor->ne[3], (long) tensor->nb[0], (long) tensor->nb[1], (long) tensor->nb[2],
                  (long) tensor->nb[3], ggml_type_name(tensor->type), (void *) _device_tensor_handle);
    }

    ~host_tensor() {
        LOG_DEBUG("host_tensor(%p) destroy, device_tensor_handle: %p\n", (void *) this, (void *) _device_tensor_handle);
        if (_device_tensor_handle) {
            npu_device_tensor_free(_device_handle, _device_tensor_handle);
            _ggml_tensor->extra = nullptr;
            // TODO: figure out why the _ggml_tensor is invalid here
        }
    }
@@ -2,6 +2,17 @@

#include <remote.h>

#define GGML_COMMON_DECL_CPP
#include "ggml-common.h"
#undef GGML_COMMON_DECL_CPP

static_assert(sizeof(npu_device_block_q4_K) == sizeof(block_q4_K), "npu_device_block_q4_K size mismatch");
static_assert(sizeof(npu_device_block_q4_0) == sizeof(block_q4_0), "npu_device_block_q4_0 size mismatch");
static_assert(sizeof(npu_device_block_q8_0) == sizeof(block_q8_0), "npu_device_block_q8_0 size mismatch");
static_assert(QUANT_K_SCALE_SIZE == K_SCALE_SIZE, "QUANT_K_SCALE_SIZE size mismatch");
static_assert(QUANT_K_BLOCK_SIZE == QK_K, "QUANT_K_BLOCK_SIZE size mismatch");
static_assert(QUANT_BLOCK_SIZE == QK4_0, "QUANT_BLOCK_SIZE size mismatch");

namespace hexagon {

enum npu_device_tensor_op op_to_npu_op(ggml_op op) {
@@ -23,6 +34,14 @@ enum npu_device_tensor_data_type type_to_npu_type(ggml_type type) {
    switch (type) {
        case GGML_TYPE_F32:
            return NPU_DATA_TYPE_F32;
        case GGML_TYPE_F16:
            return NPU_DATA_TYPE_F16;
        case GGML_TYPE_Q4_K:
            return NPU_DATA_TYPE_Q4_K;
        case GGML_TYPE_Q4_0:
            return NPU_DATA_TYPE_Q4_0;
        case GGML_TYPE_Q8_0:
            return NPU_DATA_TYPE_Q8_0;
        default:
            return NPU_DATA_TYPE_COUNT;
    }
@@ -93,4 +112,56 @@ void enable_unsigned_dsp_module(common::rpc_interface_ptr rpc_interface, uint32_
    }
}

void get_op_tensor_desc(const ggml_tensor * dst, char * out, size_t max_len) {
    if (dst == nullptr) {
        snprintf(out, max_len, "null");
        return;
    }

    constexpr const auto print_tensor = [](const ggml_tensor * tensor, char * out, size_t max_len) {
        auto dims = ggml_n_dims(tensor);

        switch (dims) {
            default:
            case 4:
                snprintf(out, max_len, "%s[%ldx%ldx%ldx%ld]", ggml_type_name(tensor->type), (long) tensor->ne[0],
                         (long) tensor->ne[1], (long) tensor->ne[2], (long) tensor->ne[3]);
                break;
            case 3:
                snprintf(out, max_len, "%s[%ldx%ldx%ld]", ggml_type_name(tensor->type), (long) tensor->ne[0],
                         (long) tensor->ne[1], (long) tensor->ne[2]);
                break;
            case 2:
                snprintf(out, max_len, "%s[%ldx%ld]", ggml_type_name(tensor->type), (long) tensor->ne[0],
                         (long) tensor->ne[1]);
                break;
            case 1:
                snprintf(out, max_len, "%s[%ld]", ggml_type_name(tensor->type), (long) tensor->ne[0]);
                break;
        }
    };

    auto * src0 = dst->src[0];
    if (src0 == nullptr) {
        print_tensor(dst, out, max_len);
        return;
    }

    char dst_desc[256];
    print_tensor(dst, dst_desc, sizeof(dst_desc));

    char src0_desc[256];
    print_tensor(src0, src0_desc, sizeof(src0_desc));

    auto * src1 = dst->src[1];
    if (src1 == nullptr) {
        snprintf(out, max_len, "dst: %s, src0: %s", dst_desc, src0_desc);
        return;
    }

    char src1_desc[256];
    print_tensor(src1, src1_desc, sizeof(src1_desc));
    snprintf(out, max_len, "dst: %s, src0: %s, src1: %s", dst_desc, src0_desc, src1_desc);
}

}  // namespace hexagon
@@ -23,4 +23,6 @@ const char * get_dsp_arch_desc(hexagon_dsp_arch arch);

void enable_unsigned_dsp_module(common::rpc_interface_ptr rpc_interface, uint32_t domain_id);

void get_op_tensor_desc(const ggml_tensor * dst, char * out, size_t max_len);

}  // namespace hexagon
@@ -4,6 +4,9 @@

const uint32_t DEVICE_TENSOR_MAX_DIMS = 4;
const uint32_t DEVICE_TENSOR_MAX_SRC = 2;
const uint32_t QUANT_BLOCK_SIZE = 32;
const uint32_t QUANT_K_BLOCK_SIZE = 256;
const uint32_t QUANT_K_SCALE_SIZE = 12;

interface npu_device : remote_handle64{
@@ -11,6 +14,25 @@ interface npu_device : remote_handle64{
    typedef uint64_t tensor_handle_t;
    typedef uint64_t graph_handle_t;

    typedef uint16_t fp16_t;

    struct block_q4_0 {
        fp16_t d;
        uint8_t qs[QUANT_BLOCK_SIZE / 2];
    };

    struct block_q4_K {
        fp16_t d;
        fp16_t dmin;
        uint8_t scales[QUANT_K_SCALE_SIZE];
        uint8_t qs[QUANT_K_BLOCK_SIZE / 2];
    };

    struct block_q8_0 {
        fp16_t d;
        int8_t qs[QUANT_BLOCK_SIZE];
    };

    enum tensor_op {
        NPU_OP_MUL_MAT,
        NPU_OP_ADD,
@@ -21,6 +43,10 @@ interface npu_device : remote_handle64{

    enum tensor_data_type {
        NPU_DATA_TYPE_F32,
        NPU_DATA_TYPE_F16,
        NPU_DATA_TYPE_Q8_0,
        NPU_DATA_TYPE_Q4_0,
        NPU_DATA_TYPE_Q4_K,
        NPU_DATA_TYPE_COUNT
    };
@@ -26,11 +26,11 @@ else()
    message("GGML_QNN_ENABLE_CPU_BACKEND is disabled")
endif()

if(GGML_QNN_ENABLE_PERFORMANCE_TRACKING)
    message("GGML_QNN_ENABLE_PERFORMANCE_TRACKING is enabled")
    target_compile_definitions(qnn-backend PUBLIC GGML_QNN_ENABLE_PERFORMANCE_TRACKING)
if(GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING)
    message("GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING is enabled")
    target_compile_definitions(qnn-backend PUBLIC GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING)
else()
    message("GGML_QNN_ENABLE_PERFORMANCE_TRACKING is disabled")
    message("GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING is disabled")
endif()

if(CMAKE_SYSTEM_NAME STREQUAL "Android")
@@ -10,7 +10,7 @@
#include "profiler.hpp"
#include "tensor.hpp"

#ifdef GGML_QNN_ENABLE_PERFORMANCE_TRACKING
#ifdef GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING
#    define GRAPH_PROFILE_HANDLE (_event_tracer ? _event_tracer->get_handle() : nullptr)
#    define GRAPH_PROFILE_PRINT() \
        if (_event_tracer) { \
@@ -381,7 +381,7 @@ qnn_graph::qnn_graph(const std::string & graph_name, backend_index_type device,
        return;
    }

#ifdef GGML_QNN_ENABLE_PERFORMANCE_TRACKING
#ifdef GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING
    if (device == QNN_BACKEND_NPU) {
        _event_tracer = std::make_shared<qnn_event_tracer>(
            graph_name, qnn_interface, qnn_instance->get_qnn_backend_handle(), qnn_event_tracer::PROFILE_OP_TRACE);
@@ -79,7 +79,7 @@ class qnn_graph {
    std::vector<Qnn_Tensor_t> _qnn_tensor_inputs;
    std::vector<Qnn_Tensor_t> _qnn_tensor_outputs;

#ifdef GGML_QNN_ENABLE_PERFORMANCE_TRACKING
#ifdef GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING
    // profiler
    qnn_event_tracer_ptr _event_tracer;
#endif
@@ -12,7 +12,7 @@

namespace qnn {

#ifdef GGML_QNN_ENABLE_PERFORMANCE_TRACKING
#ifdef GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING

class qnn_scoped_timer {
  public:
@@ -92,7 +92,7 @@ using qnn_event_tracer_ptr = std::shared_ptr<qnn_event_tracer>;

}  // namespace qnn

#ifdef GGML_QNN_ENABLE_PERFORMANCE_TRACKING
#ifdef GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING
#    define QNN_SCOPED_PERFORMANCE_TRACKER(fmt, ...) \
        auto __qnn_timer_##__LINE__ = qnn::make_scope_perf_timer(fmt, __VA_ARGS__)
#else