diff --git a/ggml/src/ggml-qnn/CMakeLists.txt b/ggml/src/ggml-qnn/CMakeLists.txt index 64c5d1a91e..bd37ada3ac 100644 --- a/ggml/src/ggml-qnn/CMakeLists.txt +++ b/ggml/src/ggml-qnn/CMakeLists.txt @@ -2,6 +2,8 @@ message(STATUS "Using QNN backend") option(GGML_HEXAGON_NPU_ONLY "ggml-qnn: Only use Hexagon NPU" OFF) option(GGML_QNN_ENABLE_HEXAGON_BACKEND "ggml-qnn: Enable Hexagon custom package" ${GGML_HEXAGON_NPU_ONLY}) +option(GGML_HEXAGON_ENABLE_QUANTIZED_TENSORS "ggml-qnn: Enable quantized tensors support" OFF) +option(GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING "ggml-qnn: Enable performance tracking" OFF) if(CMAKE_SYSTEM_NAME STREQUAL "Android") find_library(LOG_LIB log) @@ -17,6 +19,9 @@ if(NOT DEFINED GGML_QNN_SDK_PATH) # TODO: create a function to search for the SDK path if(DEFINED ENV{QNN_SDK_PATH}) set(GGML_QNN_SDK_PATH $ENV{QNN_SDK_PATH}) + elseif(DEFINED ENV{QNN_SDK_ROOT}) + message("found QNN_SDK_ROOT: ${QNN_SDK_ROOT}") + set(GGML_QNN_SDK_PATH $ENV{QNN_SDK_ROOT}) else() message(FATAL_ERROR "GGML_QNN_SDK_PATH not defined") endif() @@ -28,9 +33,10 @@ message("CMAKE_CXX_FLAGS_RELEASE: ${CMAKE_CXX_FLAGS_RELEASE}") message("QNN_SDK_PATH: ${GGML_QNN_SDK_PATH}") message("GGML_QNN: ${GGML_QNN}") -message("GGML_QNN_ENABLE_PERFORMANCE_TRACKING: ${GGML_QNN_ENABLE_PERFORMANCE_TRACKING}") message("GGML_QNN_ENABLE_HEXAGON_BACKEND: ${GGML_QNN_ENABLE_HEXAGON_BACKEND}") message("GGML_HEXAGON_NPU_ONLY: ${GGML_HEXAGON_NPU_ONLY}") +message("GGML_HEXAGON_ENABLE_QUANTIZED_TENSORS: ${GGML_HEXAGON_ENABLE_QUANTIZED_TENSORS}") +message("GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING: ${GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING}") ggml_add_backend_library(ggml-qnn ../../include/ggml-qnn.h @@ -58,8 +64,8 @@ else() target_link_libraries(ggml-qnn PRIVATE runtime-common) endif() -# Copy QNN dynamic libraries -set(QNN_DYNAMIC_LIBS "") +# Copy dynamic libraries +set(BACKEND_RUNTIME_LIBS "") if(CMAKE_SYSTEM_NAME STREQUAL "Android" OR CMAKE_SYSTEM_NAME STREQUAL "Linux") if(CMAKE_SYSTEM_NAME STREQUAL "Android") @@ -73,35 +79,35 @@ if(CMAKE_SYSTEM_NAME STREQUAL "Android" OR CMAKE_SYSTEM_NAME STREQUAL "Linux") set(QNN_SDK_LIB_PATH "${GGML_QNN_SDK_PATH}/lib/aarch64-oe-linux-gcc11.2") endif() - list(APPEND QNN_DYNAMIC_LIBS "${QNN_SDK_LIB_PATH}/libQnnSystem.so") - list(APPEND QNN_DYNAMIC_LIBS "${QNN_SDK_LIB_PATH}/libQnnCpu.so") - list(APPEND QNN_DYNAMIC_LIBS "${QNN_SDK_LIB_PATH}/libQnnGpu.so") - list(APPEND QNN_DYNAMIC_LIBS "${QNN_SDK_LIB_PATH}/libQnnHtp.so") + list(APPEND BACKEND_RUNTIME_LIBS "${QNN_SDK_LIB_PATH}/libQnnSystem.so") + list(APPEND BACKEND_RUNTIME_LIBS "${QNN_SDK_LIB_PATH}/libQnnCpu.so") + list(APPEND BACKEND_RUNTIME_LIBS "${QNN_SDK_LIB_PATH}/libQnnGpu.so") + list(APPEND BACKEND_RUNTIME_LIBS "${QNN_SDK_LIB_PATH}/libQnnHtp.so") file(GLOB HTP_STUB_LIBS "${QNN_SDK_LIB_PATH}/libQnnHtp*.so") - list(APPEND QNN_DYNAMIC_LIBS ${HTP_STUB_LIBS}) + list(APPEND BACKEND_RUNTIME_LIBS ${HTP_STUB_LIBS}) if(CMAKE_SYSTEM_NAME STREQUAL "Android") file(GLOB HTP_SKEL_LIBS "${GGML_QNN_SDK_PATH}/lib/hexagon-*/unsigned/libQnnHtp*Skel.so") - list(APPEND QNN_DYNAMIC_LIBS ${HTP_SKEL_LIBS}) + list(APPEND BACKEND_RUNTIME_LIBS ${HTP_SKEL_LIBS}) if(CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64") if(EXISTS "${CMAKE_ANDROID_NDK}/prebuilt/android-arm64/gdbserver/gdbserver") - list(APPEND QNN_DYNAMIC_LIBS "${CMAKE_ANDROID_NDK}/prebuilt/android-arm64/gdbserver/gdbserver") + list(APPEND BACKEND_RUNTIME_LIBS "${CMAKE_ANDROID_NDK}/prebuilt/android-arm64/gdbserver/gdbserver") message("old ndk, copy gdbserver") else() file(GLOB LLDB_SERVER 
"${CMAKE_ANDROID_NDK}/toolchains/llvm/prebuilt/linux-x86_64/lib64/clang/*/lib/linux/aarch64/lldb-server") - list(APPEND QNN_DYNAMIC_LIBS ${LLDB_SERVER}) + list(APPEND BACKEND_RUNTIME_LIBS ${LLDB_SERVER}) message("new ndk, copy lldb-server") endif() file(GLOB OMP_LIBS "${CMAKE_ANDROID_NDK}/toolchains/llvm/prebuilt/linux-x86_64/lib64/clang/*/lib/linux/aarch64/libomp.so") file(GLOB ASAN_LIBS "${CMAKE_ANDROID_NDK}/toolchains/llvm/prebuilt/linux-x86_64/lib64/clang/*/lib/linux/libclang_rt.asan-aarch64-android.so") - list(APPEND QNN_DYNAMIC_LIBS ${OMP_LIBS}) - list(APPEND QNN_DYNAMIC_LIBS ${ASAN_LIBS}) + list(APPEND BACKEND_RUNTIME_LIBS ${OMP_LIBS}) + list(APPEND BACKEND_RUNTIME_LIBS ${ASAN_LIBS}) endif() else() # Linux - list(APPEND QNN_DYNAMIC_LIBS "${QNN_SDK_LIB_PATH}/libHtpPrepare.so") + list(APPEND BACKEND_RUNTIME_LIBS "${QNN_SDK_LIB_PATH}/libHtpPrepare.so") endif() elseif(CMAKE_SYSTEM_NAME STREQUAL "Windows") if(CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64") @@ -112,24 +118,24 @@ elseif(CMAKE_SYSTEM_NAME STREQUAL "Windows") set(QNN_SDK_LIB_PATH "${GGML_QNN_SDK_PATH}/lib/aarch64-windows-msvc") endif() - list(APPEND QNN_DYNAMIC_LIBS "${QNN_SDK_LIB_PATH}/QnnSystem.dll") - list(APPEND QNN_DYNAMIC_LIBS "${QNN_SDK_LIB_PATH}/QnnCpu.dll") - list(APPEND QNN_DYNAMIC_LIBS "${QNN_SDK_LIB_PATH}/QnnGpu.dll") - list(APPEND QNN_DYNAMIC_LIBS "${QNN_SDK_LIB_PATH}/QnnHtp.dll") + list(APPEND BACKEND_RUNTIME_LIBS "${QNN_SDK_LIB_PATH}/QnnSystem.dll") + list(APPEND BACKEND_RUNTIME_LIBS "${QNN_SDK_LIB_PATH}/QnnCpu.dll") + list(APPEND BACKEND_RUNTIME_LIBS "${QNN_SDK_LIB_PATH}/QnnGpu.dll") + list(APPEND BACKEND_RUNTIME_LIBS "${QNN_SDK_LIB_PATH}/QnnHtp.dll") file(GLOB HTP_STUB_LIBS "${QNN_SDK_LIB_PATH}/QnnHtp*.dll") if(CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64") - list(APPEND QNN_DYNAMIC_LIBS "${QNN_SDK_LIB_PATH}/HtpPrepare.dll") + list(APPEND BACKEND_RUNTIME_LIBS "${QNN_SDK_LIB_PATH}/HtpPrepare.dll") endif() - list(APPEND QNN_DYNAMIC_LIBS ${HTP_STUB_LIBS}) + list(APPEND BACKEND_RUNTIME_LIBS ${HTP_STUB_LIBS}) endif() -foreach(QNN_DYNAMIC_LIB ${QNN_DYNAMIC_LIBS}) - message("Copy: ${QNN_DYNAMIC_LIB} -> ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}") +foreach(RUNTIME_LIB ${BACKEND_RUNTIME_LIBS}) + message("Copy: ${RUNTIME_LIB} -> ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}") add_custom_command( TARGET ggml-qnn POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy - ${QNN_DYNAMIC_LIB} + ${RUNTIME_LIB} ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) endforeach() diff --git a/ggml/src/ggml-qnn/npu/CMakeLists.txt b/ggml/src/ggml-qnn/npu/CMakeLists.txt index 4c734bb098..5f1009bb9b 100644 --- a/ggml/src/ggml-qnn/npu/CMakeLists.txt +++ b/ggml/src/ggml-qnn/npu/CMakeLists.txt @@ -26,6 +26,56 @@ set(common_incs include_directories(${common_incs}) +function(add_device_target target_name DSP_ARCH IS_SIMULATOR BUILD_CPU_COUNT) + if(${CMAKE_BUILD_TYPE} MATCHES "Debug|Dbg") + set(HEXAGON_BUILD_CONFIG "Debug") + set(EXTRA_BUILD_FLAGS + VERBOSE=1 + TREE=1 + ) + else() + set(HEXAGON_BUILD_CONFIG "Release") + set(EXTRA_BUILD_FLAGS) + endif() + + if(${GGML_SANITIZE_ADDRESS} OR ${LLAMA_SANITIZE_ADDRESS}) + set(GGML_HEXAGON_NPU_SANITIZE_ADDRESS ON) + else() + set(GGML_HEXAGON_NPU_SANITIZE_ADDRESS OFF) + endif() + + set(EXTRA_BUILD_FLAGS ${EXTRA_BUILD_FLAGS} GGML_HEXAGON_ENABLE_QUANTIZED_TENSORS=${GGML_HEXAGON_ENABLE_QUANTIZED_TENSORS}) + set(EXTRA_BUILD_FLAGS ${EXTRA_BUILD_FLAGS} GGML_HEXAGON_NPU_SANITIZE_ADDRESS=${GGML_HEXAGON_NPU_SANITIZE_ADDRESS}) + set(EXTRA_BUILD_FLAGS ${EXTRA_BUILD_FLAGS} GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING=${GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING}) + + 
set(HEXAGON_TOOLS_VARIANT $ENV{DEFAULT_TOOLS_VARIANT}) + set(BUILD_DIR ${CMAKE_CURRENT_LIST_DIR}/hexagon_${HEXAGON_BUILD_CONFIG}_${HEXAGON_TOOLS_VARIANT}_${DSP_ARCH}) + set(BUILD_BINARY_NAME ${BUILD_DIR}/libhexagon_npu_skel_${DSP_ARCH}.so) + + if(${IS_SIMULATOR}) + set(HEXAGON_TOOLCHAIN_TYPE "hexagonsim") + set(OUTPUT_BINARY_NAME libhexagon_npu_skel_${DSP_ARCH}_sim.so) + else() + set(HEXAGON_TOOLCHAIN_TYPE "hexagon") + set(OUTPUT_BINARY_NAME libhexagon_npu_skel_${DSP_ARCH}.so) + endif() + + add_custom_target(${target_name} ALL + COMMAND ${CMAKE_COMMAND} -E remove_directory ${BUILD_DIR} + COMMAND build_cmake ${HEXAGON_TOOLCHAIN_TYPE} DSP_ARCH=${DSP_ARCH} BUILD=${HEXAGON_BUILD_CONFIG} ${EXTRA_BUILD_FLAGS} -j${BUILD_CPU_COUNT} + COMMAND ${CMAKE_COMMAND} -E copy ${BUILD_BINARY_NAME} ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${OUTPUT_BINARY_NAME} + BYPRODUCTS ${BUILD_BINARY_NAME} + WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR} + ) +endfunction() + +function(add_dsp_targets_for_host host_target DSP_ARCH BUILD_CPU_COUNT) + add_device_target(hexagon-npu-device-${DSP_ARCH} ${DSP_ARCH} FALSE ${BUILD_CPU_COUNT}) + add_device_target(hexagon-npu-device-${DSP_ARCH}-sim ${DSP_ARCH} TRUE ${BUILD_CPU_COUNT}) + add_dependencies(hexagon-npu-device-${DSP_ARCH}-sim hexagon-npu-device-${DSP_ARCH}) + add_dependencies(${host_target} hexagon-npu-device-${DSP_ARCH}-sim) +endfunction() + if(${CMAKE_SYSTEM_NAME} MATCHES "Android|Linux|Windows") # host build file(GLOB common_srcs "${CMAKE_CURRENT_LIST_DIR}/common/*.cpp") @@ -52,6 +102,12 @@ if(${CMAKE_SYSTEM_NAME} MATCHES "Android|Linux|Windows") GGML_QNN_ENABLE_HEXAGON_BACKEND ) + if(GGML_HEXAGON_ENABLE_QUANTIZED_TENSORS) + target_compile_definitions(hexagon-npu-host PUBLIC + GGML_HEXAGON_ENABLE_QUANTIZED_TENSORS + ) + endif() + target_include_directories(hexagon-npu-host PRIVATE ${HEXAGON_SDK_ROOT}/ipc/fastrpc/rpcmem/inc/ ${QNN_SDK_ROOT}/include/QNN/ @@ -71,6 +127,13 @@ if(${CMAKE_SYSTEM_NAME} MATCHES "Android|Linux|Windows") target_link_options(hexagon-npu-host PUBLIC -pie) endif() + if(GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING) + message("GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING is enabled") + target_compile_definitions(hexagon-npu-host PUBLIC GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING) + else() + message("GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING is disabled") + endif() + link_options(hexagon-npu-host) if(${CMAKE_SYSTEM_NAME} MATCHES "Android") @@ -84,8 +147,24 @@ if(${CMAKE_SYSTEM_NAME} MATCHES "Android|Linux|Windows") choose_dsprpc("3" dsprpc) # cdsprpc link_custom_library(hexagon-npu-host ${dsprpc}) + + cmake_host_system_information(RESULT BUILD_CPU_COUNT QUERY NUMBER_OF_PHYSICAL_CORES) + add_dsp_targets_for_host(hexagon-npu-host "v73" ${BUILD_CPU_COUNT}) + add_dsp_targets_for_host(hexagon-npu-host "v75" ${BUILD_CPU_COUNT}) + + list(APPEND NPU_RUNTIME_LIBS "${HEXAGON_SDK_ROOT}/tools/utils/sysmon/sysMonApp") + list(APPEND NPU_RUNTIME_LIBS "${HEXAGON_SDK_ROOT}/tools/utils/sysmon/sysMonAppLE") + + foreach(RUNTIME_LIB ${NPU_RUNTIME_LIBS}) + message("Copy: ${RUNTIME_LIB} -> ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}") + add_custom_command( + TARGET hexagon-npu-host POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy + ${RUNTIME_LIB} + ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) + endforeach() else() - # hexagon npu build + # hexagon npu build, this section will run inside the `build_cmake` script cmake_minimum_required(VERSION 3.14.3) project(hexagon_npu C CXX ASM) @@ -96,6 +175,8 @@ else() set(QNN_SDK_ROOT $ENV{QNN_SDK_ROOT}) message("QNN_SDK_ROOT: ${QNN_SDK_ROOT}") + 
message("GGML_HEXAGON_ENABLE_QUANTIZED_TENSORS: ${GGML_HEXAGON_ENABLE_QUANTIZED_TENSORS}") + include_directories( ${QNN_SDK_ROOT}/include/QNN/ ) @@ -124,6 +205,30 @@ else() ) endif() + if(GGML_HEXAGON_ENABLE_QUANTIZED_TENSORS) + message("GGML_HEXAGON_ENABLE_QUANTIZED_TENSORS is enabled") + target_compile_definitions(hexagon_npu_skel_OBJS PUBLIC + GGML_HEXAGON_ENABLE_QUANTIZED_TENSORS + ) + endif() + + if(GGML_HEXAGON_NPU_SANITIZE_ADDRESS) + message("GGML_HEXAGON_NPU_SANITIZE_ADDRESS is enabled") + target_compile_options(hexagon_npu_skel_OBJS PUBLIC + -fsanitize=address -fno-omit-frame-pointer + ) + target_link_libraries(hexagon_npu_skel_OBJS PUBLIC + -fsanitize=address + ) + endif() + + if(GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING) + message("GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING is enabled") + target_compile_definitions(hexagon_npu_skel_OBJS PUBLIC + GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING + ) + endif() + build_idl(idl/hexagon_npu.idl hexagon_npu_skel_OBJS) # disable warnings for the skel diff --git a/ggml/src/ggml-qnn/npu/device/device.cpp b/ggml/src/ggml-qnn/npu/device/device.cpp index 7281dd48d2..fbed4b0a28 100644 --- a/ggml/src/ggml-qnn/npu/device/device.cpp +++ b/ggml/src/ggml-qnn/npu/device/device.cpp @@ -9,6 +9,7 @@ #include "graph.hpp" #include "hexagon_npu.h" #include "op_impl.hpp" +#include "quants.hpp" #include "remote.h" #include "tensor.hpp" #include "thread_pool.hpp" @@ -18,6 +19,37 @@ namespace { struct npu_device_context { std::unique_ptr thread_pool; + std::unique_ptr f16_to_f32_table; // TODO: store vtcm? + + bool init() { + if (!init_ltu()) { + DEVICE_LOG_ERROR("Failed to initialize LTU"); + return false; + } + + if (!init_thread_pool()) { + DEVICE_LOG_ERROR("Failed to initialize thread pool"); + return false; + } + + DEVICE_LOG_DEBUG("NPU device context initialized"); + return true; + } + + private: + bool init_ltu() { + constexpr const size_t kLtuCount = 1U << 16; + + f16_to_f32_table = std::make_unique(kLtuCount); + if (!f16_to_f32_table) { + DEVICE_LOG_ERROR("Failed to allocate memory for f16_to_f32 table"); + return false; + } + + hexagon::init_f16_f32_table(f16_to_f32_table.get(), kLtuCount); + DEVICE_LOG_DEBUG("f16_to_f32 table initialized"); + return true; + } bool init_thread_pool() { if (thread_pool) { @@ -67,8 +99,8 @@ int npu_device_open(const char * uri, remote_handle64 * h) { return AEE_ENOMEMORY; } - if (!context->init_thread_pool()) { - DEVICE_LOG_ERROR("Failed to initialize thread pool"); + if (!context->init()) { + DEVICE_LOG_ERROR("Failed to initialize npu_device_context"); delete context; return AEE_EFAILED; } @@ -187,7 +219,7 @@ AEEResult npu_device_graph_compute(remote_handle64 _h, npu_device_graph_handle_t return AEE_EINVHANDLE; } - if (!graph->compute(dev_ctx->thread_pool.get())) { + if (!graph->compute(dev_ctx->thread_pool.get(), dev_ctx->f16_to_f32_table.get())) { return AEE_EFAILED; } diff --git a/ggml/src/ggml-qnn/npu/device/graph.cpp b/ggml/src/ggml-qnn/npu/device/graph.cpp index 2024d15a21..5201edefea 100644 --- a/ggml/src/ggml-qnn/npu/device/graph.cpp +++ b/ggml/src/ggml-qnn/npu/device/graph.cpp @@ -5,6 +5,7 @@ #include "op_impl.hpp" #include "util.hpp" +#include "vtcm_mem.hpp" namespace hexagon { @@ -28,50 +29,57 @@ void graph::set_tensor(const npu_device_tensor_handle_t * tensors, int tensor_co for (int i = 0; i < tensor_count; ++i) { auto * tensor_obj = reinterpret_cast(tensors[i]); _tensors[i] = tensor_obj; - DEVICE_LOG_DEBUG("graph(%p) set_tensor[%d]: %p(%p,%p), op: %d\n", (void *) this, i, (void *) tensor_obj, - (void *) 
tensor_obj->get_src(0), (void *) tensor_obj->get_src(1), tensor_obj->get_op()); + DEVICE_LOG_DEBUG("graph(%p) set_tensor[%d]: %p(%p,%p), op: %s\n", (void *) this, i, (void *) tensor_obj, + (void *) tensor_obj->get_src(0), (void *) tensor_obj->get_src(1), + op_get_name(tensor_obj->get_op())); } _tensor_count = tensor_count; DEVICE_LOG_DEBUG("graph(%p) tensor count: %zu\n", (void *) this, _tensor_count); } -bool graph::compute(default_thread_pool * thread_pool) { - if (!_tensors || !_tensor_count) { +bool graph::compute(default_thread_pool * thread_pool, const float * f16_to_f32_table) { + if (_tensors == nullptr || !_tensor_count) { DEVICE_LOG_DEBUG("graph(%p) no tensors to compute\n", (void *) this); return true; // return success if no tensors to compute } DEVICE_LOG_DEBUG("graph(%p) compute\n", (void *) this); - thread_pool->sync_execute(reinterpret_cast(&graph::thread_pool_task), this); - - for (size_t i = 0; i < _tensor_count; ++i) { - auto * dst = _tensors[i]; - dst->flush(); // TODO: optimize this + _f16_to_f32_table = f16_to_f32_table; + if (thread_pool) { + thread_pool->sync_execute(reinterpret_cast(&graph::thread_pool_task), this); + } else { + compute_impl(nullptr, 0, 1); } + _f16_to_f32_table = nullptr; return true; } void graph::thread_pool_task(default_thread_pool * pool, size_t thread_idx, size_t thread_count, graph * graph) { - NPU_UNUSED(pool); - graph->compute_impl(thread_idx, thread_count); + graph->compute_impl(pool, thread_idx, thread_count); } -void graph::compute_impl(size_t thread_idx, size_t thread_count) { +void graph::compute_impl(default_thread_pool * pool, size_t thread_idx, size_t thread_count) { for (size_t i = 0; i < _tensor_count; ++i) { auto * dst = _tensors[i]; auto op = dst->get_op(); - auto * func = get_compute_func(op); - if (!func) { + auto * func = get_compute_func(dst); + if (func == nullptr) { DEVICE_LOG_ERROR("graph(%p) tensor[%zu] op %d not supported\n", (void *) this, i, op); return; } - if (!func(dst, thread_idx, thread_count)) { + hexagon::compute_params params = { thread_idx, thread_count, _f16_to_f32_table }; + if (!func(dst, ¶ms)) { DEVICE_LOG_ERROR("graph(%p) tensor[%zu] op %d compute failed\n", (void *) this, i, op); - return; } + + // TODO: figure out which ops need to sync + if (pool) { + pool->sync_thread(); + } + dst->invalidate(); } } diff --git a/ggml/src/ggml-qnn/npu/device/graph.hpp b/ggml/src/ggml-qnn/npu/device/graph.hpp index 7ca2931699..126d254178 100644 --- a/ggml/src/ggml-qnn/npu/device/graph.hpp +++ b/ggml/src/ggml-qnn/npu/device/graph.hpp @@ -17,14 +17,15 @@ class graph { void set_tensor(const npu_device_tensor_handle_t * tensors, int tensor_count); - bool compute(default_thread_pool * thread_pool); + bool compute(default_thread_pool * thread_pool, const float * f16_to_f32_table); private: static void thread_pool_task(default_thread_pool * pool, size_t thread_idx, size_t thread_count, graph * graph); - void compute_impl(size_t thread_idx, size_t thread_count); + void compute_impl(default_thread_pool * pool, size_t thread_idx, size_t thread_count); std::unique_ptr _tensors; - size_t _tensor_count = 0; + size_t _tensor_count = 0; + const float * _f16_to_f32_table = nullptr; DISABLE_COPY_AND_MOVE(graph); }; diff --git a/ggml/src/ggml-qnn/npu/device/op_impl.cpp b/ggml/src/ggml-qnn/npu/device/op_impl.cpp index 8d55971a72..d68fd9a53b 100644 --- a/ggml/src/ggml-qnn/npu/device/op_impl.cpp +++ b/ggml/src/ggml-qnn/npu/device/op_impl.cpp @@ -6,25 +6,27 @@ #include #include "op_mul_mat.hpp" +#include "quants.hpp" namespace { -template 
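Each worker invokes compute_impl with its (thread_idx, thread_count) pair, and every op then carves up its rows with the ceiling-division slice helper added in op_types.hpp later in this patch. A standalone sketch of that split with a small worked example (function name is illustrative):

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>
    #include <utility>

    // Same formula as hexagon::get_thread_work_slice in op_types.hpp.
    std::pair<int64_t, int64_t> work_slice(int64_t total, size_t tidx, size_t tcnt) {
        const int64_t per_thread = (total + (int64_t) tcnt - 1) / (int64_t) tcnt;
        const int64_t start      = (int64_t) tidx * per_thread;
        const int64_t end        = std::min(start + per_thread, total);
        return { start, end };
    }

    int main() {
        // 10 rows split over 4 worker threads -> [0,3) [3,6) [6,9) [9,10)
        for (size_t t = 0; t < 4; ++t) {
            auto s = work_slice(10, t, 4);
            std::printf("thread %zu: rows [%lld, %lld)\n", t, (long long) s.first, (long long) s.second);
        }
        return 0;
    }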
-inline void vec_op_f32_f32(const float * src0, const float * src1, size_t count, float * dst) { +template +inline void vec_op_impl(const _TyData * src0, const _TyData * src1, size_t count, _TyData * dst) { + constexpr const size_t kElementsPerVector = hexagon::kBytesPerVector / sizeof(_TyData); + HVX_Vector * iptr0 = ((HVX_Vector *) src0); - HVX_Vector * iptr0_end = ((HVX_Vector *) src0) + (count / hexagon::kFloatsPerVector); + HVX_Vector * iptr0_end = ((HVX_Vector *) src0) + (count / kElementsPerVector); HVX_Vector * iptr1 = ((HVX_Vector *) src1); HVX_Vector * optr = ((HVX_Vector *) dst); HVX_Vector prev0 = *iptr0++; HVX_Vector prev1 = *iptr1++; - // TODO: prefetch or just use VTCM? while (iptr0 < iptr0_end) { HVX_Vector curr0 = *iptr0++; HVX_Vector curr1 = *iptr1++; HVX_Vector s0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0); HVX_Vector s1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1); - *optr++ = Q6_Vsf_equals_Vqf32(_OpIntrinsic(s0, s1)); + *optr++ = _OpIntrinsic(s0, s1); prev0 = curr0; prev1 = curr1; } @@ -42,13 +44,13 @@ inline void vec_op_f32_f32(const float * src0, const float * src1, size_t count, iptr1 = iptr1_aligned ? iptr1 : iptr1 + 1; HVX_Vector s0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0); HVX_Vector s1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1); - *optr++ = Q6_Vsf_equals_Vqf32(_OpIntrinsic(s0, s1)); + *optr++ = _OpIntrinsic(s0, s1); prev0 = curr0; prev1 = curr1; } - const size_t leftover = count % hexagon::kFloatsPerVector; - const size_t leftover_bytes = leftover * sizeof(float); + const size_t leftover = count % kElementsPerVector; + const size_t leftover_bytes = leftover * sizeof(_TyData); if (leftover > 0) { // handle the leftover elements HVX_Vector curr0 = @@ -59,24 +61,56 @@ inline void vec_op_f32_f32(const float * src0, const float * src1, size_t count, (leftover_bytes + hexagon::unaligned_bytes(iptr1) > hexagon::kBytesPerVector) ? 
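The loop above processes full 128-byte HVX vectors and then patches up a partial tail; the bookkeeping is easy to sanity-check on the host. A small sketch of those counts, assuming the 128-byte vector width noted in op_mul_mat.hpp:

    #include <cstddef>
    #include <cstdio>

    int main() {
        const size_t kBytesPerVector = 128;   // HVX vector width (v73)
        const size_t count = 1000;            // elements in one row

        // float rows: 32 elements per vector
        const size_t per_vec_f32 = kBytesPerVector / sizeof(float);
        std::printf("f32: %zu full vectors, %zu leftover elements (%zu bytes)\n",
                    count / per_vec_f32, count % per_vec_f32,
                    (count % per_vec_f32) * sizeof(float));   // 31 vectors, 8 elems, 32 bytes

        // fp16 rows: 64 elements per vector
        const size_t per_vec_f16 = kBytesPerVector / 2;       // sizeof(npu_device_fp16_t) == 2
        std::printf("f16: %zu full vectors, %zu leftover elements (%zu bytes)\n",
                    count / per_vec_f16, count % per_vec_f16,
                    (count % per_vec_f16) * 2);               // 15 vectors, 40 elems, 80 bytes
        return 0;
    }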
*iptr1 : prev1; curr1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1); - q6op_vstu_variable_ARV(optr, leftover_bytes, Q6_Vsf_equals_Vqf32(_OpIntrinsic(curr0, curr1))); + q6op_vstu_variable_ARV(optr, leftover_bytes, _OpIntrinsic(curr0, curr1)); } } +template +inline void vec_op_f32_f32(const float * src0, const float * src1, size_t count, float * dst) { + vec_op_impl<_OpIntrinsic, float>(src0, src1, count, dst); +} + inline HVX_Vector vadd_f32_f32(HVX_Vector a, HVX_Vector b) { - return Q6_Vqf32_vadd_VsfVsf(a, b); + return Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(a, b)); } inline HVX_Vector vsub_f32_f32(HVX_Vector a, HVX_Vector b) { - return Q6_Vqf32_vsub_VsfVsf(a, b); + return Q6_Vsf_equals_Vqf32(Q6_Vqf32_vsub_VsfVsf(a, b)); } inline HVX_Vector vmul_f32_f32(HVX_Vector a, HVX_Vector b) { - return Q6_Vqf32_vmpy_VsfVsf(a, b); + return Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(a, b)); } -template -bool element_wise_op(hexagon::tensor * out, size_t tidx, size_t tcnt) { +template +inline void vec_op_f16_f16(const npu_device_fp16_t * src0, const npu_device_fp16_t * src1, size_t count, + npu_device_fp16_t * dst) { + vec_op_impl<_OpIntrinsic, npu_device_fp16_t>(src0, src1, count, dst); +} + +inline HVX_Vector vadd_f16_f16(HVX_Vector a, HVX_Vector b) { + // TODO: fix this since qf16 has less precision than fp16 + return Q6_Vhf_equals_Vqf16(Q6_Vqf16_vadd_VhfVhf(a, b)); +} + +inline HVX_Vector vsub_f16_f16(HVX_Vector a, HVX_Vector b) { + // TODO: fix this since qf16 has less precision than fp16 + return Q6_Vhf_equals_Vqf16(Q6_Vqf16_vsub_VhfVhf(a, b)); +} + +inline HVX_Vector vmul_f16_f16(HVX_Vector a, HVX_Vector b) { + return Q6_Vhf_equals_Wqf32(Q6_Wqf32_vmpy_VhfVhf(a, b)); +} + +template struct get_data_type {}; + +template struct get_data_type { + using type = _TyData; +}; + +template bool element_wise_op(hexagon::tensor * out, hexagon::compute_params * params) { + using data_type = typename get_data_type::type; + if (!out) { return false; } @@ -94,24 +128,39 @@ bool element_wise_op(hexagon::tensor * out, size_t tidx, size_t tcnt) { return false; } - const auto * src0_ptr = reinterpret_cast(src0->get_data()); - const auto * src1_ptr = reinterpret_cast(src1->get_data()); - auto * dst_ptr = reinterpret_cast(out->get_data()); - auto total_rows = out->get_ne(3) * out->get_ne(2) * out->get_ne(1); - const auto rows_per_box = out->get_ne(2) * out->get_ne(1); - const auto start_end = hexagon::get_thread_work_slice(total_rows, tidx, tcnt); + const auto * src0_ptr = reinterpret_cast(src0->get_read_buffer()); + const auto * src1_ptr = reinterpret_cast(src1->get_read_buffer()); + auto * dst_ptr = reinterpret_cast(out->get_write_buffer()); + auto total_rows = out->get_ne(3) * out->get_ne(2) * out->get_ne(1); + const auto rows_per_cube = out->get_ne(2) * out->get_ne(1); + const auto start_end = hexagon::get_thread_work_slice(total_rows, params->tidx, params->tcnt); + + if (start_end.first >= start_end.second) { + return true; + } + + DEVICE_SCOPED_OP_PERFORMANCE_TRACKER(out, params->tidx); + + const size_t valid_row_bytes = src0->get_ne(0) * sizeof(data_type); for (int64_t ir = start_end.first; ir < start_end.second; ++ir) { - const auto i03 = ir / rows_per_box; - const auto i02 = ir / out->get_ne(1) - i03 * out->get_ne(2); - const auto i01 = ir % out->get_ne(1); - const auto i13 = i03 % src1->get_ne(3); - const auto i12 = i02 % src1->get_ne(2); - const auto i11 = i01 % src1->get_ne(1); - auto * src0_row = src0_ptr + i03 * src0->get_nb(3) + i02 * src0->get_nb(2) + i01 * src0->get_nb(1); - auto * src1_row = src1_ptr 
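The get_data_type helper above recovers the element type from the row function's signature so one element_wise_op template can serve both F32 and F16. A minimal standalone version of the same trait (the trait name and the sample kernel are illustrative):

    #include <cstddef>
    #include <type_traits>

    template <typename F> struct row_func_traits;   // primary template, left undefined

    template <typename T>
    struct row_func_traits<void (*)(const T *, const T *, std::size_t, T *)> {
        using type = T;                             // element type of the row kernel
    };

    static void add_rows_f32(const float *, const float *, std::size_t, float *) {}

    static_assert(std::is_same_v<row_func_traits<decltype(&add_rows_f32)>::type, float>,
                  "trait recovers the element type from the signature");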
+ i13 * src1->get_nb(3) + i12 * src1->get_nb(2) + i11 * src1->get_nb(1); - auto * dst_row = dst_ptr + i03 * out->get_nb(3) + i02 * out->get_nb(2) + i01 * out->get_nb(1); - _RowFunc(reinterpret_cast(src0_row), reinterpret_cast(src1_row), - static_cast(out->get_ne(0)), reinterpret_cast<_TyDst *>(dst_row)); + const auto i03 = ir / rows_per_cube; + const auto i02 = ir / out->get_ne(1) - i03 * out->get_ne(2); + const auto i01 = ir % out->get_ne(1); // TODO: should we use divide instead of mod? + const auto i13 = i03 % src1->get_ne(3); + const auto i12 = i02 % src1->get_ne(2); + const auto i11 = i01 % src1->get_ne(1); + + auto * src1_plane = src1_ptr + i13 * src1->get_nb(3) + i12 * src1->get_nb(2); + auto * src0_row = src0_ptr + i03 * src0->get_nb(3) + i02 * src0->get_nb(2) + i01 * src0->get_nb(1); + auto * src1_row = src1_plane + i11 * src1->get_nb(1); + auto * dst_row = dst_ptr + i03 * out->get_nb(3) + i02 * out->get_nb(2) + i01 * out->get_nb(1); + if (ir + 1 < start_end.second) { + hexagon::l2fetch_row(src0_row + src0->get_nb(1), valid_row_bytes); + hexagon::l2fetch_row(src1_row + src1->get_nb(1), valid_row_bytes); + } + + _RowFunc(reinterpret_cast(src0_row), reinterpret_cast(src1_row), + static_cast(out->get_ne(0)), reinterpret_cast(dst_row)); } return true; @@ -120,19 +169,37 @@ bool element_wise_op(hexagon::tensor * out, size_t tidx, size_t tcnt) { bool is_element_wise_op_supported(const npu_device_tensor_spec & src0, const npu_device_tensor_spec & src1, const npu_device_tensor_spec & dst, npu_device_tensor_op op) { if (op != NPU_OP_ADD && op != NPU_OP_SUB && op != NPU_OP_MUL) { - DEVICE_LOG_DEBUG("Unsupported element wise op: %s\n", hexagon::op_get_name(op)); + DEVICE_LOG_DEBUG("[%s]unsupported\n", hexagon::op_get_name(op)); + return false; + } + + if (dst.type != src0.type || dst.type != src1.type) { + DEVICE_LOG_DEBUG("[%s]src0.type and dst.type mismatch: %s vs %s\n", hexagon::op_get_name(op), + hexagon::get_type_name(src0.type), hexagon::get_type_name(dst.type)); + return false; + } + + if (dst.type != NPU_DATA_TYPE_F32 && dst.type != NPU_DATA_TYPE_F16) { + DEVICE_LOG_DEBUG("[%s]unsupported data type: %s\n", hexagon::op_get_name(op), hexagon::get_type_name(dst.type)); + return false; + } + + // TODO: fix FP16 add/sub + if (dst.type == NPU_DATA_TYPE_F16 && op != NPU_OP_MUL) { + DEVICE_LOG_DEBUG("[%s]unsupported data type: %s\n", hexagon::op_get_name(op), hexagon::get_type_name(dst.type)); return false; } if (src0.ne[0] != src1.ne[0]) { - DEVICE_LOG_DEBUG("src0.ne[0] and src1.ne[0] not match: %ld vs %ld\n", (long) src0.ne[0], (long) src1.ne[0]); + DEVICE_LOG_DEBUG("[%s]src0.ne[0] and src1.ne[0] not match: %ld vs %ld\n", hexagon::op_get_name(op), + (long) src0.ne[0], (long) src1.ne[0]); return false; } for (size_t i = 0; i < DEVICE_TENSOR_MAX_DIMS; ++i) { if (src0.ne[i] != dst.ne[i]) { - DEVICE_LOG_DEBUG("src0.ne[%zu] and dst.ne[%zu] not match: %lld vs %lld\n", i, i, (long long) src0.ne[i], - (long long) dst.ne[i]); + DEVICE_LOG_DEBUG("[%s]src0.ne[%zu] and dst.ne[%zu] not match: %lld vs %lld\n", hexagon::op_get_name(op), i, + i, (long long) src0.ne[i], (long long) dst.ne[i]); return false; } } @@ -142,46 +209,67 @@ bool is_element_wise_op_supported(const npu_device_tensor_spec & src0, const npu struct op_capabilities { npu_device_tensor_op op; - hexagon::compute_func_type compute_func; hexagon::op_is_supported_func_type is_supported; + hexagon::compute_func_type compute_funcs[NPU_DATA_TYPE_COUNT]; }; constexpr const op_capabilities kOpCapabilities[] = { - { NPU_OP_MUL_MAT, 
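The index arithmetic above implements ggml-style broadcasting: the flat row index is split into (i3, i2, i1) for dst/src0, then wrapped into src1's possibly smaller outer dimensions with a modulo. A scalar reference of the same row mapping, assuming src1 only broadcasts over dims 1..3 and both operands share ne[0]:

    #include <cstdint>
    #include <vector>

    // Scalar reference: dst = src0 + broadcast(src1), all tensors dense row-major
    // with dims ne[0..3]; src1.ne[k] must divide src0.ne[k] for k = 1..3.
    void add_broadcast_ref(const std::vector<float> & src0, const int64_t ne0[4],
                           const std::vector<float> & src1, const int64_t ne1[4],
                           std::vector<float> & dst) {
        const int64_t rows_per_cube = ne0[2] * ne0[1];
        const int64_t total_rows    = ne0[3] * ne0[2] * ne0[1];
        for (int64_t ir = 0; ir < total_rows; ++ir) {
            const int64_t i3 = ir / rows_per_cube;
            const int64_t i2 = ir / ne0[1] - i3 * ne0[2];
            const int64_t i1 = ir % ne0[1];
            // wrap into src1's outer dims (broadcast)
            const int64_t j3 = i3 % ne1[3], j2 = i2 % ne1[2], j1 = i1 % ne1[1];
            const float * r0 = src0.data() + ((i3 * ne0[2] + i2) * ne0[1] + i1) * ne0[0];
            const float * r1 = src1.data() + ((j3 * ne1[2] + j2) * ne1[1] + j1) * ne1[0];
            float *       rd = dst.data()  + ((i3 * ne0[2] + i2) * ne0[1] + i1) * ne0[0];
            for (int64_t i0 = 0; i0 < ne0[0]; ++i0) {
                rd[i0] = r0[i0] + r1[i0];
            }
        }
    }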
hexagon::mul_mat_f32, hexagon::is_mul_mat_supported }, - { NPU_OP_ADD, element_wise_op>, is_element_wise_op_supported }, - { NPU_OP_SUB, element_wise_op>, is_element_wise_op_supported }, - { NPU_OP_MUL, element_wise_op>, is_element_wise_op_supported }, + { + NPU_OP_MUL_MAT, hexagon::is_mul_mat_supported, + { + hexagon::mul_mat_f32, // NPU_DATA_TYPE_F32 + nullptr, // NPU_DATA_TYPE_F16 + }, }, + { NPU_OP_ADD, + is_element_wise_op_supported, { + element_wise_op>, // NPU_DATA_TYPE_F32 + element_wise_op>, // NPU_DATA_TYPE_F16 + } }, + { NPU_OP_SUB, + is_element_wise_op_supported, { + element_wise_op>, // NPU_DATA_TYPE_F32 + element_wise_op>, // NPU_DATA_TYPE_F16 + } }, + { NPU_OP_MUL, + is_element_wise_op_supported, { + element_wise_op>, // NPU_DATA_TYPE_F32 + element_wise_op>, // NPU_DATA_TYPE_F16 + } }, }; -static_assert(kOpCapabilities[NPU_OP_MUL_MAT].compute_func == hexagon::mul_mat_f32, +static_assert(kOpCapabilities[NPU_OP_MUL_MAT].compute_funcs[NPU_DATA_TYPE_F32] == hexagon::mul_mat_f32, "kOpArray[NPU_OP_MUL_MAT] != mul_mat_f32"); static_assert(std::size(kOpCapabilities) == NPU_OP_COUNT); static_assert(kOpCapabilities[NPU_OP_MUL_MAT].op == NPU_OP_MUL_MAT, "kOpArray[NPU_OP_MUL_MAT].op != NPU_OP_MUL_MAT"); static_assert(kOpCapabilities[NPU_OP_MUL].op == NPU_OP_MUL, "kOpArray[NPU_OP_MUL].op != NPU_OP_MUL"); -} // namespace - -namespace hexagon { - -compute_func_type get_compute_func(npu_device_tensor_op op) { +hexagon::compute_func_type get_compute_func_impl(npu_device_tensor_op op, npu_device_tensor_data_type type) { if (op >= NPU_OP_COUNT) { return nullptr; } - return kOpCapabilities[op].compute_func; + return kOpCapabilities[op].compute_funcs[type]; +} + +} // namespace + +namespace hexagon { + +compute_func_type get_compute_func(tensor * dst) { + return get_compute_func_impl(dst->get_op(), dst->get_type()); } bool support_op(const npu_device_tensor_spec & src0, const npu_device_tensor_spec & src1, const npu_device_tensor_spec & dst, npu_device_tensor_op op) { - if (get_compute_func(op) == nullptr) { - DEVICE_LOG_ERROR("Unsupported op: %s, get_compute_func failed\n", op_get_name(op)); + if (get_compute_func_impl(op, dst.type) == nullptr) { + DEVICE_LOG_ERROR("[%s]unsupported, get_compute_func failed\n", op_get_name(op)); return false; } auto is_supported_func = kOpCapabilities[op].is_supported; if (!is_supported_func || !is_supported_func(src0, src1, dst, op)) { - DEVICE_LOG_ERROR("Unsupported op: %s, is_supported_func failed\n", op_get_name(op)); + DEVICE_LOG_DEBUG("[%s]unsupported, is_supported_func failed\n", op_get_name(op)); return false; } diff --git a/ggml/src/ggml-qnn/npu/device/op_impl.hpp b/ggml/src/ggml-qnn/npu/device/op_impl.hpp index 6b30d24819..f9a3d01187 100644 --- a/ggml/src/ggml-qnn/npu/device/op_impl.hpp +++ b/ggml/src/ggml-qnn/npu/device/op_impl.hpp @@ -1,15 +1,10 @@ #pragma once -#include "hexagon_npu.h" -#include "tensor.hpp" +#include "op_types.hpp" namespace hexagon { -typedef bool (*compute_func_type)(tensor * dst, size_t tidx, size_t tcnt); -typedef bool (*op_is_supported_func_type)(const npu_device_tensor_spec & src0, const npu_device_tensor_spec & src1, - const npu_device_tensor_spec & dst, npu_device_tensor_op op); - -compute_func_type get_compute_func(npu_device_tensor_op op); +compute_func_type get_compute_func(tensor * dst); bool support_op(const npu_device_tensor_spec & src0, const npu_device_tensor_spec & src1, const npu_device_tensor_spec & dst, npu_device_tensor_op op); diff --git a/ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp 
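kOpCapabilities turns dispatch into a table lookup keyed first by op and then by the destination data type, so adding an F16 kernel is a one-line table edit. The shape of that lookup, reduced to a standalone sketch with made-up op and type enums:

    #include <cstddef>

    enum my_op   { OP_ADD, OP_MUL, OP_COUNT };      // illustrative, not the real enums
    enum my_type { TYPE_F32, TYPE_F16, TYPE_COUNT };

    struct dummy_tensor;                            // stand-in for hexagon::tensor
    typedef bool (*compute_fn)(dummy_tensor * dst);

    static bool add_f32(dummy_tensor *) { return true; }
    static bool add_f16(dummy_tensor *) { return true; }
    static bool mul_f32(dummy_tensor *) { return true; }

    struct op_caps {
        my_op      op;
        compute_fn funcs[TYPE_COUNT];               // one slot per data type
    };

    constexpr op_caps kCaps[] = {
        { OP_ADD, { add_f32, add_f16 } },
        { OP_MUL, { mul_f32, nullptr } },           // nullptr => unsupported combination
    };

    static_assert(sizeof(kCaps) / sizeof(kCaps[0]) == OP_COUNT, "table must cover every op");

    compute_fn get_compute_fn(my_op op, my_type type) {
        if (op >= OP_COUNT) {
            return nullptr;
        }
        return kCaps[op].funcs[type];               // nullptr means "not supported"
    }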
b/ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp index 381629da34..647a5ff925 100644 --- a/ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp +++ b/ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp @@ -2,17 +2,42 @@ #include +#include "quants.hpp" +#include "vtcm_mem.hpp" + namespace { +inline float vec_reduction_f32(HVX_Vector sums) { + constexpr const size_t kFloatsPerVector = hexagon::kBytesPerVector / sizeof(float); + static_assert(kFloatsPerVector == 32 || kFloatsPerVector == 16, "kFloatsPerVector should be 16 or 32"); + + // TODO: do we have a better way to do the reduction? + switch (kFloatsPerVector) { + default: + case 32: + sums = Q6_Vqf32_vadd_Vqf32Vqf32(sums, Q6_V_vror_VR(sums, 16 * sizeof(float))); + // fallthrough + case 16: + sums = Q6_Vqf32_vadd_Vqf32Vqf32(sums, Q6_V_vror_VR(sums, 8 * sizeof(float))); + sums = Q6_Vqf32_vadd_Vqf32Vqf32(sums, Q6_V_vror_VR(sums, 4 * sizeof(float))); + sums = Q6_Vqf32_vadd_Vqf32Vqf32(sums, Q6_V_vror_VR(sums, 2 * sizeof(float))); + sums = Q6_Vqf32_vadd_Vqf32Vqf32(sums, Q6_V_vror_VR(sums, sizeof(float))); + break; + } + + return hexagon::get_flt0_from_fltv(Q6_Vsf_equals_Vqf32(sums)); +} + inline float vec_dot_product_f32_f32(const float * src0, const float * src1, size_t count) { + constexpr const size_t kElementsPerVector = hexagon::kBytesPerVector / sizeof(float); + HVX_Vector * iptr0 = ((HVX_Vector *) src0); - HVX_Vector * iptr0_end = ((HVX_Vector *) src0) + (count / hexagon::kFloatsPerVector); + HVX_Vector * iptr0_end = ((HVX_Vector *) src0) + (count / kElementsPerVector); HVX_Vector * iptr1 = ((HVX_Vector *) src1); HVX_Vector prev0 = *iptr0++; HVX_Vector prev1 = *iptr1++; HVX_Vector sum = Q6_V_vzero(); - // TODO: prefetch or just use VTCM? while (iptr0 < iptr0_end) { HVX_Vector curr0 = *iptr0++; HVX_Vector curr1 = *iptr1++; @@ -41,7 +66,7 @@ inline float vec_dot_product_f32_f32(const float * src0, const float * src1, siz prev1 = curr1; } - const size_t leftover = count % hexagon::kFloatsPerVector; + const size_t leftover = count % kElementsPerVector; const size_t leftover_bytes = leftover * sizeof(float); if (leftover > 0) { // handle the leftover elements @@ -57,21 +82,201 @@ inline float vec_dot_product_f32_f32(const float * src0, const float * src1, siz Q6_V_valign_VVR(Q6_Vqf32_vmpy_VsfVsf(curr0, curr1), Q6_V_vzero(), leftover_bytes), sum); } - // TODO: do we have a better way to do the reduction? - for (size_t i = hexagon::kFloatsPerVector / 2; i > 0; i /= 2) { - sum = Q6_Vqf32_vadd_Vqf32Vqf32(sum, Q6_V_vror_VR(sum, i * sizeof(float))); + return vec_reduction_f32(sum); +} + +// TODO: merge with vec_dot_product_f32_f32? 
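vec_reduction_f32 above collapses a whole vector of partial sums by repeatedly rotating it and adding it to itself, halving the stride each pass; the scalar equivalent of that log2(N) tree reduction looks like this (plain array instead of an HVX register):

    #include <cstddef>
    #include <cstdio>

    // Tree-reduce n floats (n a power of two, 32 for one HVX vector of f32):
    // each pass adds the upper half onto the lower half, like the vror+vadd above.
    float reduce_sum(float * v, size_t n) {
        for (size_t stride = n / 2; stride > 0; stride /= 2) {
            for (size_t i = 0; i < stride; ++i) {
                v[i] += v[i + stride];
            }
        }
        return v[0];
    }

    int main() {
        float lanes[32];
        for (int i = 0; i < 32; ++i) {
            lanes[i] = 1.0f;                          // 32 partial sums of 1.0
        }
        std::printf("%g\n", reduce_sum(lanes, 32));   // prints 32
        return 0;
    }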
+inline float vec_dot_product_f16_f16(const npu_device_fp16_t * src0, const npu_device_fp16_t * src1, size_t count) { + constexpr const size_t kElementsPerVector = hexagon::kBytesPerVector / sizeof(npu_device_fp16_t); + constexpr const size_t kFloatsPerVector = hexagon::kBytesPerVector / sizeof(float); + + HVX_Vector * iptr0 = ((HVX_Vector *) src0); + HVX_Vector * iptr0_end = ((HVX_Vector *) src0) + (count / kElementsPerVector); + HVX_Vector * iptr1 = ((HVX_Vector *) src1); + HVX_Vector prev0 = *iptr0++; + HVX_Vector prev1 = *iptr1++; + HVX_Vector sum_hi = Q6_V_vzero(); + HVX_Vector sum_lo = Q6_V_vzero(); + + while (iptr0 < iptr0_end) { + HVX_Vector curr0 = *iptr0++; + HVX_Vector curr1 = *iptr1++; + HVX_Vector s0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0); + HVX_Vector s1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1); + HVX_VectorPair result = Q6_Wqf32_vmpy_VhfVhf(s0, s1); + sum_hi = Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_hi_W(result), sum_hi); + sum_lo = Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(result), sum_lo); + prev0 = curr0; + prev1 = curr1; } - float result; - q6op_vstu_variable_ARV(&result, sizeof(float), Q6_Vsf_equals_Vqf32(sum)); - return result; + if ((iptr0_end - ((HVX_Vector *) src0)) > 0) { + // handle the last vector + // see also: + // https://github.com/UbiquitousLearning/mllm/blob/babf4410352ce8730824c87699c025a0d4ce3a6f/src/backends/qnn/LLaMAOpPackageHtp/LLaMAPackage/src/ops/LLaMAMul.cpp#L147 + // or qualcomm sdk libs\qhl_hvx\src\qhblas_hvx\qhblas_hvx_aw_vector_add_ah.c + bool iptr0_aligned = hexagon::is_addr_aligned(iptr0); + HVX_Vector curr0 = iptr0_aligned ? prev0 : *iptr0; + iptr0 = iptr0_aligned ? iptr0 : iptr0 + 1; + bool iptr1_aligned = hexagon::is_addr_aligned(iptr1); + HVX_Vector curr1 = iptr1_aligned ? prev1 : *iptr1; + iptr1 = iptr1_aligned ? iptr1 : iptr1 + 1; + HVX_Vector s0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0); + HVX_Vector s1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1); + HVX_VectorPair result = Q6_Wqf32_vmpy_VhfVhf(s0, s1); + sum_hi = Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_hi_W(result), sum_hi); + sum_lo = Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(result), sum_lo); + prev0 = curr0; + prev1 = curr1; + } + + const size_t leftover = count % kElementsPerVector; + const size_t leftover_bytes = leftover * sizeof(npu_device_fp16_t); + if (leftover > 0) { + // handle the leftover elements + HVX_Vector curr0 = + (leftover_bytes + hexagon::unaligned_bytes(iptr0) > hexagon::kBytesPerVector) ? *iptr0 : prev0; + curr0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0); + + HVX_Vector curr1 = + (leftover_bytes + hexagon::unaligned_bytes(iptr1) > hexagon::kBytesPerVector) ? *iptr1 : prev1; + curr1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1); + + HVX_VectorPair result = Q6_Wqf32_vmpy_VhfVhf(curr0, curr1); + + // TODO: can we do this better? 
+ if (leftover > kFloatsPerVector) { + sum_hi = Q6_Vqf32_vadd_Vqf32Vqf32( + Q6_V_valign_VVR(Q6_V_hi_W(result), Q6_V_vzero(), (leftover % kFloatsPerVector) * sizeof(float)), + sum_hi); + sum_lo = Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(result), sum_lo); + } else { + sum_lo = Q6_Vqf32_vadd_Vqf32Vqf32( + Q6_V_valign_VVR(Q6_V_lo_W(result), Q6_V_vzero(), leftover * sizeof(float)), sum_lo); + } + } + + return vec_reduction_f32(Q6_Vqf32_vadd_Vqf32Vqf32(sum_hi, sum_lo)); +} + +template struct get_data_type {}; + +template struct get_data_type { + using type = _TyData; +}; + +template +void mul_mat_impl(hexagon::tensor * src0, hexagon::tensor * src1, hexagon::tensor * dst, + hexagon::compute_params * params) { + using data_type = typename get_data_type::type; + + const bool is_quantized = hexagon::is_quantized_type(src0->get_type()); + const auto src0_actual_row_size = hexagon::get_dequantized_row_size(src0); + auto * dequantize_row_func = hexagon::get_type_traits(src0->get_type()).dequantize_row; + if (is_quantized && dequantize_row_func == nullptr) { + DEVICE_LOG_ERROR("Unsupported quantized src0 type: %d, dequantize_row_func is null\n", src0->get_type()); + return; + } + + const auto r02 = src1->get_ne(2) / src0->get_ne(2); + const auto r03 = src1->get_ne(3) / src0->get_ne(3); + const auto * src0_ptr = reinterpret_cast(src0->get_read_buffer()); + const auto * src1_ptr = reinterpret_cast(src1->get_read_buffer()); + auto * dst_ptr = reinterpret_cast(dst->get_write_buffer()); + const auto total_planes = dst->get_ne(3) * dst->get_ne(2); + + auto start_end_plane = std::pair{ 0, total_planes }; + auto start_end_row = std::pair{ 0, dst->get_ne(1) }; + auto start_end_element = std::pair{ 0, dst->get_ne(0) }; + + if (total_planes >= params->tcnt) { + start_end_plane = hexagon::get_thread_work_slice(total_planes, params->tidx, params->tcnt); + } else if (dst->get_ne(1) >= params->tcnt) { + start_end_row = hexagon::get_thread_work_slice(dst->get_ne(1), params->tidx, params->tcnt); + } else { + start_end_element = hexagon::get_thread_work_slice(dst->get_ne(0), params->tidx, params->tcnt); + } + + if (start_end_plane.second <= start_end_plane.first || start_end_row.second <= start_end_row.first || + start_end_element.second <= start_end_element.first) { + DEVICE_LOG_DEBUG( + "mul_mat_impl: no work to do, start_end_plane: (%ld, %ld), start_end_row: (%ld, %ld), " + "start_end_element: (%ld, %ld)\n", + start_end_plane.first, start_end_plane.second, start_end_row.first, start_end_row.second, + start_end_element.first, start_end_element.second); + return; + } + + // cache the src0 plane in VTCM + const size_t src0_plane_row_count = start_end_element.second - start_end_element.first; + size_t src0_plane_cache_size = 0; + uint8_t * src0_plane_cache_ptr = nullptr; + const uint8_t * last_cached_plane_ptr = nullptr; + if (is_quantized) { + src0_plane_cache_size = src0_actual_row_size * src0_plane_row_count; + src0_plane_cache_ptr = params->get_cache(src0_plane_cache_size, is_quantized); + } + + DEVICE_LOG_DEBUG("mul_mat_impl src0_actual_row_size: %zu, is_quantized: %d, vtcm_mem: %p(%zu)\n", + src0_actual_row_size, is_quantized, (void *) src0_plane_cache_ptr, src0_plane_cache_size); + + const size_t valid_row_bytes = src1->get_ne(0) * sizeof(data_type); + DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_WITH_SUB_PROC(dst, params->tidx, dequant); + for (int64_t ip = start_end_plane.first; ip < start_end_plane.second; ip++) { + const auto i3 = ip / dst->get_ne(2); + const auto i2 = ip - i3 * dst->get_ne(2); + const auto * src0_plane = 
src0_ptr + i3 / r03 * src0->get_nb(3) + i2 / r02 * src0->get_nb(2) + + start_end_element.first * src0->get_nb(1); + const auto * src1_plane = src1_ptr + i3 * src1->get_nb(3) + i2 * src1->get_nb(2); + auto * dst_plane = dst_ptr + i3 * dst->get_nb(3) + i2 * dst->get_nb(2); + + if (src0_plane_cache_ptr) { + if (last_cached_plane_ptr != src0_plane) { + DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_ADD_SUB_PROC(dequant); + + for (int64_t ir = 0; ir < (int64_t) src0_plane_row_count; ir++) { + auto * src0_row = src0_plane + ir * src0->get_nb(1); + if (ir + 1 < src0_plane_row_count) { + hexagon::l2fetch_row(src0_row + src0->get_nb(1), src0->get_nb(1)); + } + + auto * dst_row = reinterpret_cast(src0_plane_cache_ptr + ir * src0_actual_row_size); + dequantize_row_func(src0_row, reinterpret_cast(dst_row), src0->get_ne(0), + params->f16_to_f32_table); + } + + last_cached_plane_ptr = src0_plane; + } + + src0_plane = src0_plane_cache_ptr; + } + + for (int64_t i1 = start_end_row.first; i1 < start_end_row.second; i1++) { + auto * src1_row = src1_plane + i1 * src1->get_nb(1); + auto * dst_row = reinterpret_cast(dst_plane + i1 * dst->get_nb(1)) + start_end_element.first; + for (int64_t i0 = 0; i0 < (int64_t) src0_plane_row_count; i0++) { + auto * src0_row = src0_plane + i0 * src0_actual_row_size; + if (i0 + 1 < src0_plane_row_count) { + if (!src0_plane_cache_ptr) { + hexagon::l2fetch_row(src0_row + src0_actual_row_size, valid_row_bytes); + } + } else if (ip + 1 < start_end_plane.second) { + hexagon::l2fetch_row(src1_row + src1->get_nb(1), valid_row_bytes); + } + + // TODO: figure dst how to handle a entire row + dst_row[i0] = _DotFunc(reinterpret_cast(src0_row), + reinterpret_cast(src1_row), (size_t) src0->get_ne(0)); + } + } + } } } // namespace namespace hexagon { -bool mul_mat_f32(hexagon::tensor * out, size_t tidx, size_t tcnt) { +bool mul_mat_f32(hexagon::tensor * out, compute_params * params) { if (!out) { return false; } @@ -83,62 +288,80 @@ bool mul_mat_f32(hexagon::tensor * out, size_t tidx, size_t tcnt) { return true; // skip if no src } - const auto r02 = src1->get_ne(2) / src0->get_ne(2); - const auto r03 = src1->get_ne(3) / src0->get_ne(3); - const auto * src0_ptr = reinterpret_cast(src0->get_data()); - const auto * src1_ptr = reinterpret_cast(src1->get_data()); - auto * dst_ptr = reinterpret_cast(out->get_data()); - const auto total_planes = out->get_ne(3) * out->get_ne(2); + // TODO: array? + switch (src1->get_type()) { + case NPU_DATA_TYPE_F32: + mul_mat_impl(src0, src1, out, params); + return true; - const auto start_end_plane = (total_planes >= tcnt) ? get_thread_work_slice(total_planes, tidx, tcnt) : - std::pair{ 0, total_planes }; - const auto start_end_row = (total_planes >= tcnt) ? std::pair{ 0, out->get_ne(1) } : - get_thread_work_slice(out->get_ne(1), tidx, tcnt); - for (int64_t ip = start_end_plane.first; ip < start_end_plane.second; ip++) { - const auto i3 = ip / out->get_ne(2); - const auto i2 = ip - i3 * out->get_ne(2); - const auto * src0_plane = src0_ptr + i3 / r03 * src0->get_nb(3) + i2 / r02 * src0->get_nb(2); - const auto * src1_plane = src1_ptr + i3 * src1->get_nb(3) + i2 * src1->get_nb(2); - auto * dst_plane = dst_ptr + i3 * out->get_nb(3) + i2 * out->get_nb(2); - for (int64_t i1 = start_end_row.first; i1 < start_end_row.second; i1++) { - // TODO: prefetch row? 
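The plane cache above dequantizes each src0 plane into VTCM once and reuses it for every src1 row of that plane, which is what keeps quantized mul_mat affordable. A scalar sketch of the same reuse pattern, with a generic dequantize callback and a heap buffer standing in for VTCM (all names are illustrative):

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    typedef void (*dequant_row_fn)(const uint8_t * src, float * dst, size_t ne0);

    // One dst plane: dst[r1][r0] = dot(src0 row r0, src1 row r1). src0 rows are
    // stored quantized; they are expanded once per plane, not once per (r0, r1)
    // pair, mirroring the VTCM plane cache above.
    void mul_mat_plane_ref(const uint8_t * src0_plane, size_t src0_row_bytes, size_t src0_rows,
                           const float * src1_plane, size_t src1_rows, size_t ne0,
                           dequant_row_fn dequant, float * dst_plane) {
        std::vector<float> cache(src0_rows * ne0);    // "VTCM" scratch for this plane
        for (size_t r0 = 0; r0 < src0_rows; ++r0) {
            dequant(src0_plane + r0 * src0_row_bytes, cache.data() + r0 * ne0, ne0);
        }

        for (size_t r1 = 0; r1 < src1_rows; ++r1) {   // every src1 row reuses the cache
            const float * b = src1_plane + r1 * ne0;
            for (size_t r0 = 0; r0 < src0_rows; ++r0) {
                const float * a   = cache.data() + r0 * ne0;
                float         acc = 0.0f;
                for (size_t i = 0; i < ne0; ++i) {
                    acc += a[i] * b[i];
                }
                dst_plane[r1 * src0_rows + r0] = acc;
            }
        }
    }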
- auto * src1_row = src1_plane + i1 * src1->get_nb(1); - auto * dst_row = reinterpret_cast(dst_plane + i1 * out->get_nb(1)); - for (int64_t i0 = 0; i0 < out->get_ne(0); i0++) { - auto * src0_row = src0_plane + i0 * src0->get_nb(1); - // TODO: figure out how to handle a entire row - *dst_row++ = - vec_dot_product_f32_f32(reinterpret_cast(src0_row), - reinterpret_cast(src1_row), (size_t) src0->get_ne(0)); - } - } + case NPU_DATA_TYPE_F16: + mul_mat_impl(src0, src1, out, params); + return true; + default: + break; } - return true; + DEVICE_LOG_ERROR("Unsupported src1 tensor type: %s\n", get_type_name(src1->get_type())); + return false; } bool is_mul_mat_supported(const npu_device_tensor_spec & src0, const npu_device_tensor_spec & src1, const npu_device_tensor_spec & dst, npu_device_tensor_op op) { if (op != NPU_OP_MUL_MAT) { - DEVICE_LOG_DEBUG("op is not NPU_OP_MUL_MAT: %d\n", op); + DEVICE_LOG_DEBUG("op is not MUL_MAT: %d\n", op); return false; } + if (dst.type != NPU_DATA_TYPE_F32) { + DEVICE_LOG_DEBUG("[%s]dst type is not F32: %s\n", op_get_name(op), get_type_name(dst.type)); + return false; + } + + if (src0.type != src1.type) { +#ifdef GGML_HEXAGON_ENABLE_QUANTIZED_TENSORS + if (src1.type != NPU_DATA_TYPE_F32) { + DEVICE_LOG_DEBUG("[%s]src0.type(%s) and src1.type(%s) mismatch and src1 is not F32\n", op_get_name(op), + get_type_name(src0.type), get_type_name(src1.type)); + return false; + } + + const auto type_traits = get_type_traits(src0.type); + if (!type_traits.is_quantized || type_traits.dequantize_row == nullptr) { + DEVICE_LOG_DEBUG("[%s]src0.type(%s) and src1.type(%s) mismatch and src0 is not quantized\n", + op_get_name(op), get_type_name(src0.type), get_type_name(src1.type)); + return false; + } + + if (src0.ne[0] % type_traits.blck_size) { + DEVICE_LOG_DEBUG("[%s]src0.type(%s) ne[0] is not aligned: %ld\n", op_get_name(op), get_type_name(src0.type), + (long) src0.ne[0]); + return false; + } + + DEVICE_LOG_DEBUG("[%s]supported quantized src0.type(%s) and src1.type(%s)\n", op_get_name(op), + get_type_name(src0.type), get_type_name(src1.type)); +#else + DEVICE_LOG_DEBUG("[%s]src0.type(%s) and src1.type(%s) mismatch and quantized tensors are not supported\n", + op_get_name(op), get_type_name(src0.type), get_type_name(src1.type)); + return false; +#endif + } + if (src0.ne[0] != src1.ne[0] || src0.ne[1] != dst.ne[0]) { - DEVICE_LOG_DEBUG("src0 and src1 cannot multiply: %ldx%ld vs %ldx%ld\n", (long) src0.ne[0], (long) src0.ne[1], - (long) src1.ne[0], (long) src1.ne[1]); + DEVICE_LOG_DEBUG("[%s]src0 and src1 cannot multiply: %ldx%ld vs %ldx%ld\n", op_get_name(op), (long) src0.ne[0], + (long) src0.ne[1], (long) src1.ne[0], (long) src1.ne[1]); return false; } if (src1.ne[1] != dst.ne[1] || src1.ne[2] != dst.ne[2] || src1.ne[3] != dst.ne[3]) { - DEVICE_LOG_DEBUG("src1 and dst dimensions not match: %ldx%ld vs %ldx%ld\n", (long) src1.ne[2], - (long) src1.ne[3], (long) dst.ne[2], (long) dst.ne[3]); + DEVICE_LOG_DEBUG("[%s]src1 and dst dimensions not match: %ldx%ld vs %ldx%ld\n", op_get_name(op), + (long) src1.ne[2], (long) src1.ne[3], (long) dst.ne[2], (long) dst.ne[3]); return false; } if (src1.ne[2] % src0.ne[2] || src1.ne[3] % src0.ne[3]) { - DEVICE_LOG_DEBUG("src0 cannot broadcast to src1: %ldx%ld vs %ldx%ld\n", (long) src0.ne[2], (long) src0.ne[3], - (long) src1.ne[2], (long) src1.ne[3]); + DEVICE_LOG_DEBUG("[%s]src0 cannot broadcast to src1: %ldx%ld vs %ldx%ld\n", op_get_name(op), (long) src0.ne[2], + (long) src0.ne[3], (long) src1.ne[2], (long) src1.ne[3]); return false; } diff --git 
a/ggml/src/ggml-qnn/npu/device/op_mul_mat.hpp b/ggml/src/ggml-qnn/npu/device/op_mul_mat.hpp index fc2eb2c97e..3a97858606 100644 --- a/ggml/src/ggml-qnn/npu/device/op_mul_mat.hpp +++ b/ggml/src/ggml-qnn/npu/device/op_mul_mat.hpp @@ -2,15 +2,15 @@ #include -#include - +#include "op_types.hpp" #include "tensor.hpp" namespace hexagon { -constexpr const size_t kBytesPerVector = sizeof(HVX_Vector); // 128 for v73 -constexpr const size_t kFloatsPerVector = kBytesPerVector / sizeof(float); -constexpr const size_t kAlignMask = kBytesPerVector - 1; +constexpr const size_t kBytesPerVector = sizeof(HVX_Vector); // 128 for v73 +constexpr const size_t kAlignMask = kBytesPerVector - 1; +constexpr const size_t kL2CacheSize = 8 * 1024; // // 8KB L2 cache +constexpr const size_t kL2FetchAheadVectors = kL2CacheSize / kBytesPerVector; inline size_t unaligned_bytes(const void * addr) { return ((size_t) addr) & kAlignMask; @@ -20,7 +20,30 @@ inline bool is_addr_aligned(void * addr) { return unaligned_bytes(addr) == 0; } -bool mul_mat_f32(tensor * out, size_t tidx, size_t tcnt); +inline void l2fetch(const void * p, uint32_t stride, uint32_t width, uint32_t height, uint32_t dir) { + uint64_t control = HEXAGON_V64_CREATE_H(dir, stride, width, height); + __asm__ __volatile__(" l2fetch(%0,%1) " : : "r"(p), "r"(control)); +} + +inline void l2fetch_row(const uint8_t * curr_row, size_t bytes) { + // TODO: should we use small kL2FetchAheadVectors? + int32_t l2fetch_vectors = Q6_R_min_RR(bytes / kBytesPerVector, kL2FetchAheadVectors); + hexagon::l2fetch(curr_row, kBytesPerVector, kBytesPerVector, l2fetch_vectors, 0); +} + +inline float get_flt0_from_fltv(HVX_Vector vect) { + // See also: tools\HEXAGON_Tools\8.6.07\Examples\StandAlone_Applications\QFloat\QFloat.c + + union { + int32_t i; + float f; + } cvt; + + cvt.i = vect[0]; + return cvt.f; +} + +bool mul_mat_f32(tensor * out, compute_params * params); bool is_mul_mat_supported(const npu_device_tensor_spec & src0, const npu_device_tensor_spec & src1, const npu_device_tensor_spec & dst, npu_device_tensor_op op); diff --git a/ggml/src/ggml-qnn/npu/device/op_types.hpp b/ggml/src/ggml-qnn/npu/device/op_types.hpp new file mode 100644 index 0000000000..8bf10637db --- /dev/null +++ b/ggml/src/ggml-qnn/npu/device/op_types.hpp @@ -0,0 +1,58 @@ +#pragma once + +#include +#include +#include +#include + +#include "hexagon_npu.h" +#include "tensor.hpp" +#include "util.hpp" +#include "vtcm_mem.hpp" + +namespace hexagon { + +struct compute_params { + const size_t tidx; + const size_t tcnt; + const float * f16_to_f32_table; + std::unique_ptr vtcm_cache; + std::unique_ptr mem_cache; + size_t mem_cache_size = 0; + + uint8_t * get_cache(size_t size, bool fallback_to_mem) { + if (!vtcm_cache || vtcm_cache->get_size() < size) { + vtcm_cache = std::make_unique(size, false); + } + + if (vtcm_cache->is_valid()) { + return vtcm_cache->get_mem(); + } + + if (!fallback_to_mem) { + DEVICE_LOG_DEBUG("vtcm_mem not valid, return nullptr\n"); + return nullptr; + } + + DEVICE_LOG_DEBUG("vtcm_mem not valid, allocate from mem_cache\n"); + if (!mem_cache || mem_cache_size < size) { + mem_cache = std::make_unique(size + 256); + mem_cache_size = mem_cache ? 
size : 0; + } + + return mem_cache.get(); + } +}; + +typedef bool (*compute_func_type)(tensor * dst, compute_params * params); +typedef bool (*op_is_supported_func_type)(const npu_device_tensor_spec & src0, const npu_device_tensor_spec & src1, + const npu_device_tensor_spec & dst, npu_device_tensor_op op); + +inline constexpr std::pair get_thread_work_slice(int64_t total, size_t tidx, size_t tcnt) { + const auto elements_per_thread = (total + tcnt - 1) / tcnt; + const auto start = tidx * elements_per_thread; + const auto end = std::min(start + elements_per_thread, total); + return { start, end }; +} + +} // namespace hexagon diff --git a/ggml/src/ggml-qnn/npu/device/quants.cpp b/ggml/src/ggml-qnn/npu/device/quants.cpp new file mode 100644 index 0000000000..d873691b58 --- /dev/null +++ b/ggml/src/ggml-qnn/npu/device/quants.cpp @@ -0,0 +1,151 @@ +#include "quants.hpp" + +#include + +#include + +static_assert(sizeof(npu_device_block_q4_K) == + 2 * sizeof(npu_device_fp16_t) + QUANT_K_SCALE_SIZE + QUANT_K_BLOCK_SIZE / 2, + "wrong q4_K block size/padding"); + +static_assert(sizeof(npu_device_block_q4_0) == sizeof(npu_device_fp16_t) + QUANT_BLOCK_SIZE / 2, + "wrong q4_0 block size/padding"); + +static_assert(sizeof(npu_device_block_q8_0) == sizeof(npu_device_fp16_t) + QUANT_BLOCK_SIZE, + "wrong q8_0 block size/padding"); + +namespace { + +inline float to_float(const npu_device_fp16_t src) { + union { + __fp16 f16; + npu_device_fp16_t u16; + } f16; + + f16.u16 = src; + return f16.f16; +} + +inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t * d, uint8_t * m) { + if (j < 4) { + *d = q[j] & 63; + *m = q[j + 4] & 63; + } else { + *d = (q[j + 4] & 0xF) | ((q[j - 4] >> 6) << 4); + *m = (q[j + 4] >> 4) | ((q[j - 0] >> 6) << 4); + } +} + +void dequantize_row_q8_0(const void * src, float * dst, size_t count, const float * f16_to_f32_table) { + constexpr const int qk = QUANT_BLOCK_SIZE; + const int nb = count / qk; + const auto * src_ptr = reinterpret_cast(src); + + // TODO: use intrinsics + for (int i = 0; i < nb; i++) { + const float d = f16_to_f32_table[src_ptr[i].d]; + + for (int j = 0; j < qk; ++j) { + dst[i * qk + j] = src_ptr[i].qs[j] * d; + } + } +} + +void dequantize_row_q4_0(const void * src, float * dst, size_t count, const float * f16_to_f32_table) { + constexpr const int qk = QUANT_BLOCK_SIZE; + static_assert(qk % 2 == 0, "qk must be even"); + + const int nb = count / qk; + const auto * src_ptr = reinterpret_cast(src); + + // TODO: use intrinsics + for (int i = 0; i < nb; i++) { + const float d = f16_to_f32_table[src_ptr[i].d]; + + for (int j = 0; j < qk / 2; ++j) { + const int x0 = (src_ptr[i].qs[j] & 0x0F) - 8; + const int x1 = ((src_ptr[i].qs[j] >> 4) & 0xF) - 8; + + dst[i * qk + j + 0] = x0 * d; + dst[i * qk + j + qk / 2] = x1 * d; + } + } +} + +void dequantize_row_q4_K(const void * src, float * dst, size_t count, const float * f16_to_f32_table) { + const int nb = count / QUANT_K_BLOCK_SIZE; + const auto * src_ptr = reinterpret_cast(src); + + // TODO: use intrinsics + for (int i = 0; i < nb; i++) { + const uint8_t * q = src_ptr[i].qs; + + const float d = f16_to_f32_table[src_ptr[i].d]; + const float min = f16_to_f32_table[src_ptr[i].dmin]; + + int is = 0; + uint8_t sc = 0; + uint8_t m = 0; + const auto * scales = src_ptr[i].scales; + for (int j = 0; j < QUANT_K_BLOCK_SIZE; j += 64) { + get_scale_min_k4(is + 0, scales, &sc, &m); + const float d1 = d * sc; + const float m1 = min * m; + get_scale_min_k4(is + 1, scales, &sc, &m); + const float d2 = d * sc; + const float m2 = min * 
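As a concrete check of the q4_0 path above: a block holds one fp16 scale d plus qk/2 packed bytes, the low nibbles fill dst[0..qk/2), the high nibbles fill dst[qk/2..qk), and each nibble is re-centred by -8 before scaling. A tiny worked example with made-up values:

    #include <cstdio>

    int main() {
        const float d = 0.5f;                      // block scale (decoded from fp16)
        const unsigned char qs0 = 0x2B;            // first packed byte of the block
        const int x_lo = (qs0 & 0x0F) - 8;         // 11 - 8 =  3 -> dst[0]
        const int x_hi = ((qs0 >> 4) & 0x0F) - 8;  //  2 - 8 = -6 -> dst[16] (qk = 32)
        std::printf("dst[0] = %g, dst[16] = %g\n", x_lo * d, x_hi * d);  // 1.5, -3
        return 0;
    }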
m; + for (int l = 0; l < 32; ++l) { + dst[0] = d1 * (q[l] & 0xF) - m1; + dst[32] = d2 * ((q[l] >> 4) & 0xF) - m2; + dst++; + } + dst += 32; + q += 32; + is += 2; + } + } +} + +constexpr const hexagon::device_type_traits kDeviceTypeTraits[] = { + { NPU_DATA_TYPE_F32, "F32", 1, false, nullptr }, + { NPU_DATA_TYPE_F16, "F16", 1, false, nullptr }, + { NPU_DATA_TYPE_Q8_0, "Q8_0", QUANT_BLOCK_SIZE, true, dequantize_row_q8_0 }, + { NPU_DATA_TYPE_Q4_0, "Q4_0", QUANT_BLOCK_SIZE, true, dequantize_row_q4_0 }, + { NPU_DATA_TYPE_Q4_K, "Q4_K", QUANT_K_BLOCK_SIZE, true, dequantize_row_q4_K }, +}; + +static_assert(std::size(kDeviceTypeTraits) == NPU_DATA_TYPE_COUNT, + "kDeviceTypeTraits size mismatch with npu_device_tensor_data_type enum"); +static_assert(kDeviceTypeTraits[NPU_DATA_TYPE_F32].type == NPU_DATA_TYPE_F32, + "kDeviceTypeTraits F32 type mismatch with npu_device_tensor_data_type enum"); +static_assert(kDeviceTypeTraits[NPU_DATA_TYPE_F16].type == NPU_DATA_TYPE_F16, + "kDeviceTypeTraits F16 type mismatch with npu_device_tensor_data_type enum"); +static_assert(kDeviceTypeTraits[NPU_DATA_TYPE_Q8_0].type == NPU_DATA_TYPE_Q8_0, + "kDeviceTypeTraits Q8_0 type mismatch with npu_device_tensor_data_type enum"); +static_assert(kDeviceTypeTraits[NPU_DATA_TYPE_Q4_0].type == NPU_DATA_TYPE_Q4_0, + "kDeviceTypeTraits Q4_0 type mismatch with npu_device_tensor_data_type enum"); +static_assert(kDeviceTypeTraits[NPU_DATA_TYPE_Q4_K].type == NPU_DATA_TYPE_Q4_K, + "kDeviceTypeTraits Q4_K type mismatch with npu_device_tensor_data_type enum"); + +} // namespace + +namespace hexagon { + +bool init_f16_f32_table(float * table, size_t count) { + constexpr const size_t kTableSize = (1U << 16); + if (count < kTableSize) { + return false; + } + + for (size_t i = 0; i < count; ++i) { + table[i] = to_float(i); + } + + return true; +} + +const device_type_traits & get_type_traits(npu_device_tensor_data_type type) { + return kDeviceTypeTraits[type]; +} + +} // namespace hexagon diff --git a/ggml/src/ggml-qnn/npu/device/quants.hpp b/ggml/src/ggml-qnn/npu/device/quants.hpp new file mode 100644 index 0000000000..6ffbeb0031 --- /dev/null +++ b/ggml/src/ggml-qnn/npu/device/quants.hpp @@ -0,0 +1,78 @@ + +#include "hexagon_npu.h" +#include "tensor.hpp" +#include "util.hpp" + +namespace hexagon { + +bool init_f16_f32_table(float * table, size_t count); + +typedef void (*dequantize_row_type)(const void * src, float * dst, size_t count, const float * f16_to_f32_table); + +struct device_type_traits { + npu_device_tensor_data_type type; + const char * type_name; + int64_t blck_size; + bool is_quantized; + dequantize_row_type dequantize_row; +}; + +const device_type_traits & get_type_traits(npu_device_tensor_data_type type); + +inline bool is_quantized_type(npu_device_tensor_data_type type) { + return get_type_traits(type).is_quantized; +} + +inline size_t get_dequantized_row_size(tensor * tensor) { + if (!is_quantized_type(tensor->get_type())) { + return tensor->get_nb(1); // for f32 and f16 + } + + auto row_elems_count = tensor->get_ne(0); + return row_elems_count * sizeof(float); // currently only f32 is supported +} + +inline const char * get_type_name(npu_device_tensor_data_type type) { + return get_type_traits(type).type_name; +} + +} // namespace hexagon + +// TODO: move this to a common header +#ifdef GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING +namespace hexagon { + +inline auto make_scoped_op_perf_timer(tensor * op, size_t tidx, const char * sub_proc_log_prefix = nullptr) { + auto * src0 = op->get_src(0); + auto * src1 = 
op->get_src(1); + char buffer[512]; + if (src1 == nullptr) { + snprintf(buffer, sizeof(buffer), "[%s][%lldx%lldx%lldx%lld%s], tidx: %zu", op_get_name(op->get_op()), + src0->get_ne(0), src0->get_ne(1), src0->get_ne(2), src0->get_ne(3), get_type_name(src0->get_type()), + tidx); + } else { + snprintf(buffer, sizeof(buffer), "[%s][%lldx%lldx%lldx%lld%s],[%lldx%lldx%lldx%lld%s], tidx: %zu", + op_get_name(op->get_op()), src0->get_ne(0), src0->get_ne(1), src0->get_ne(2), src0->get_ne(3), + get_type_name(src0->get_type()), src1->get_ne(0), src1->get_ne(1), src1->get_ne(2), src1->get_ne(3), + get_type_name(src1->get_type()), tidx); + } + return npu_scoped_timer<512>(buffer, sub_proc_log_prefix); +} + +} // namespace hexagon + +# define DEVICE_SCOPED_OP_PERFORMANCE_TRACKER(op, tidx) \ + auto __npu_op_timer_##__LINE__ = hexagon::make_scoped_op_perf_timer(op, tidx) + +# define DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_WITH_SUB_PROC(op, tidx, sub_prefix) \ + auto __npu_op_timer_##sub_prefix = hexagon::make_scoped_op_perf_timer(op, tidx, #sub_prefix) + +# define DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_ADD_SUB_PROC(sub_prefix) \ + hexagon::npu_sub_process_scoped_timer \ + __npu_op_sub_timer##sub_prefix(__npu_op_timer_##sub_prefix) + +#else +# define DEVICE_SCOPED_OP_PERFORMANCE_TRACKER(op, tidx) ((void) 0) +# define DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_WITH_SUB_PROC(op, tidx, sub_prefix) ((void) 0) +# define DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_ADD_SUB_PROC(sub_prefix) ((void) 0) +#endif diff --git a/ggml/src/ggml-qnn/npu/device/tensor.hpp b/ggml/src/ggml-qnn/npu/device/tensor.hpp index ad1915ecb6..9c7f6bffef 100644 --- a/ggml/src/ggml-qnn/npu/device/tensor.hpp +++ b/ggml/src/ggml-qnn/npu/device/tensor.hpp @@ -36,7 +36,14 @@ class tensor { DEVICE_LOG_INFO("~tensor(%p) fd: %d", (void *) this, _info.buffer_fd); } - void flush() { + void flush() const { + if (_data) { + qurt_mem_cache_clean((qurt_addr_t) (_data + _info.offset), (qurt_size_t) _info.size, QURT_MEM_CACHE_FLUSH, + QURT_MEM_DCACHE); + } + } + + void invalidate() const { if (_data) { qurt_mem_cache_clean((qurt_addr_t) (_data + _info.offset), (qurt_size_t) _info.size, QURT_MEM_CACHE_INVALIDATE, QURT_MEM_DCACHE); @@ -72,7 +79,12 @@ class tensor { npu_device_tensor_data_type get_type() const { return _info.type; } - uint8_t * get_data() const { return _data + _info.offset; } + const uint8_t * get_read_buffer() const { + invalidate(); + return _data + _info.offset; + } + + uint8_t * get_write_buffer() const { return _data + _info.offset; } bool is_valid() const { return _data != nullptr; } diff --git a/ggml/src/ggml-qnn/npu/device/thread_pool.hpp b/ggml/src/ggml-qnn/npu/device/thread_pool.hpp index a936ae0c4c..bd7e83dd8a 100644 --- a/ggml/src/ggml-qnn/npu/device/thread_pool.hpp +++ b/ggml/src/ggml-qnn/npu/device/thread_pool.hpp @@ -143,6 +143,8 @@ template class thread_pool { return true; } + void sync_thread() { qurt_barrier_wait(&_completed); } + private: struct thread_pool_arg { thread_pool * pool = nullptr; diff --git a/ggml/src/ggml-qnn/npu/device/util.hpp b/ggml/src/ggml-qnn/npu/device/util.hpp index f6f5479694..a5e1ae5201 100644 --- a/ggml/src/ggml-qnn/npu/device/util.hpp +++ b/ggml/src/ggml-qnn/npu/device/util.hpp @@ -1,9 +1,10 @@ #pragma once #include +#include -#include #include +#include #include #include "hexagon_npu.h" @@ -52,11 +53,105 @@ inline constexpr const char * op_get_name(npu_device_tensor_op op) { } } -inline constexpr std::pair get_thread_work_slice(int64_t total, size_t tidx, size_t tcnt) { - const auto elements_per_thread = (total + 
tcnt - 1) / tcnt; - const auto start = tidx * elements_per_thread; - const auto end = std::min(start + elements_per_thread, total); - return { start, end }; +#ifdef GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING + +template class npu_scoped_timer { + public: + enum { kBufferCount = _buffer_count }; + + explicit npu_scoped_timer(const char * log_prefix, const char * sub_proc_log_prefix) { + strncpy(_log_prefix, log_prefix, kBufferCount - 1); + if (sub_proc_log_prefix != nullptr) { + strncpy(_sub_proc_log_prefix, sub_proc_log_prefix, kBufferCount - 1); + } + + _begin_cycles = HAP_perf_get_qtimer_count(); + _begin_pcycles = HAP_perf_get_pcycles(); + } + + npu_scoped_timer(npu_scoped_timer && other) { *this = std::move(other); } + + ~npu_scoped_timer() { print(); } + + void operator=(npu_scoped_timer && other) { + strncpy(_log_prefix, other._log_prefix, kBufferCount - 1); + strncpy(_sub_proc_log_prefix, other._sub_proc_log_prefix, kBufferCount - 1); + _begin_cycles = other._begin_cycles; + _sub_proc_cycles = other._sub_proc_cycles; + _sub_proc_count = other._sub_proc_count; + } + + void add_sub_proc_cycles(uint64_t cycles, uint64_t pcycles) { + _sub_proc_cycles += cycles; + _sub_proc_pcycles += pcycles; + _sub_proc_count++; + } + + void print() const { + auto total_cycles = HAP_perf_get_qtimer_count() - _begin_cycles; + auto total_pcycles = HAP_perf_get_pcycles() - _begin_pcycles; + auto duration = HAP_perf_qtimer_count_to_us(total_cycles); + + if (_sub_proc_count > 0) { + auto sub_proc_duration = HAP_perf_qtimer_count_to_us(_sub_proc_cycles); + DEVICE_LOG_WARN("[profiler]%s, pcyc: %llu, dur: %lluus, [%s]cnt: %llu, pcyc: %llu, dur: %lluus\n", + _log_prefix, total_pcycles, duration, _sub_proc_log_prefix, _sub_proc_count, + _sub_proc_pcycles, sub_proc_duration); + } else { + DEVICE_LOG_WARN("[profiler]%s, pcyc: %llu, dur: %lluus\n", _log_prefix, total_pcycles, duration); + } + } + + private: + char _log_prefix[kBufferCount] = {}; + char _sub_proc_log_prefix[kBufferCount] = {}; + uint64_t _begin_cycles = 0; + uint64_t _begin_pcycles = 0; + uint64_t _sub_proc_cycles = 0; + uint64_t _sub_proc_pcycles = 0; + uint64_t _sub_proc_count = 0; + + DISABLE_COPY(npu_scoped_timer); +}; + +template class npu_sub_process_scoped_timer { + public: + using npu_scoped_timer = npu_scoped_timer<_buffer_count>; + + explicit npu_sub_process_scoped_timer(npu_scoped_timer & timer) : _timer(timer) { + _begin_cycles = HAP_perf_get_qtimer_count(); + _begin_pcycles = HAP_perf_get_pcycles(); + } + + ~npu_sub_process_scoped_timer() { + _timer.add_sub_proc_cycles(HAP_perf_get_qtimer_count() - _begin_cycles, + HAP_perf_get_pcycles() - _begin_pcycles); + } + + private: + npu_scoped_timer & _timer; + uint64_t _begin_cycles = 0; + uint64_t _begin_pcycles = 0; + + DISABLE_COPY_AND_MOVE(npu_sub_process_scoped_timer); +}; + +inline auto make_scoped_perf_timer(const char * format, ...) { + va_list args; + va_start(args, format); + char buffer[512]; + vsnprintf(buffer, sizeof(buffer), format, args); + va_end(args); + return npu_scoped_timer<512>(buffer, nullptr); } +#endif + } // namespace hexagon + +#ifdef GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING +# define DEVICE_SCOPED_PERFORMANCE_TRACKER(fmt, ...) \ + auto __npu_timer_##__LINE__ = hexagon::make_scoped_perf_timer(fmt, __VA_ARGS__) +#else +# define DEVICE_SCOPED_PERFORMANCE_TRACKER(fmt, ...) 
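
// --- Editor's illustrative sketch (not part of the patch): intended usage of the
// scoped performance trackers declared above. The kernel body, the loop bound and the
// op/tidx parameter types are assumptions; only the macro names come from this patch.
// When GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING is not defined, every macro below
// expands to ((void) 0), so the instrumentation compiles away.

static void example_compute(hexagon::tensor * op, size_t tidx) {
    // One timer for the whole op invocation; it logs the op name, shapes and thread
    // index when it goes out of scope.
    DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_WITH_SUB_PROC(op, tidx, dequant);

    for (int row = 0; row < 4; ++row) {
        {
            // Cycles spent in this block are accumulated into the [dequant] counter of
            // the enclosing timer instead of producing a separate log line.
            DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_ADD_SUB_PROC(dequant);
            // ... dequantize one block of src0 into scratch memory ...
        }
        // ... multiply-accumulate against the dequantized data ...
    }
}
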
((void) 0) +#endif diff --git a/ggml/src/ggml-qnn/npu/device/vtcm_mem.hpp b/ggml/src/ggml-qnn/npu/device/vtcm_mem.hpp new file mode 100644 index 0000000000..4c2922ca87 --- /dev/null +++ b/ggml/src/ggml-qnn/npu/device/vtcm_mem.hpp @@ -0,0 +1,101 @@ +#pragma once + +#include + +#include "util.hpp" + +namespace hexagon { + +class vtcm_mem { + public: + explicit vtcm_mem(size_t size, bool single_page) { + size_t avail_size = single_page ? get_avail_page_size() : get_avail_block_size(); + if (size > avail_size) { + DEVICE_LOG_ERROR("Requested VTCM size %zu exceeds available size %zu\n", size, avail_size); + return; + } + + _vtcm_mem = HAP_request_VTCM((unsigned int) size, single_page ? 1 : 0); + if (_vtcm_mem == nullptr) { + DEVICE_LOG_ERROR("Failed to allocate VTCM memory: %zu bytes\n", size); + return; + } + + _vtcm_size = size; + DEVICE_LOG_DEBUG("VTCM allocated: %p(%zu), avail: %zu\n", _vtcm_mem, size, avail_size); + } + + explicit vtcm_mem(size_t size, bool single_page, size_t timeout_us) { + _vtcm_mem = HAP_request_async_VTCM((unsigned int) size, single_page ? 1 : 0, (unsigned int) timeout_us); + if (_vtcm_mem == nullptr) { + DEVICE_LOG_ERROR("Failed to allocate VTCM memory: %zu bytes, timeout %zu us\n", size, timeout_us); + return; + } + + _vtcm_size = size; + DEVICE_LOG_DEBUG("VTCM allocated: %p(%zu), avail: %zu\n", _vtcm_mem, size, get_avail_block_size()); + } + + ~vtcm_mem() { + if (is_valid()) { + auto ret = HAP_release_VTCM(_vtcm_mem); + if (ret != AEE_SUCCESS) { + DEVICE_LOG_ERROR("Failed to release VTCM memory: %d\n", ret); + } + } + + DEVICE_LOG_DEBUG("VTCM released: %zu bytes at %p\n", _vtcm_size, _vtcm_mem); + } + + bool is_valid() const { return _vtcm_mem != nullptr; } + + uint8_t * get_mem() const { return reinterpret_cast(_vtcm_mem); } + + size_t get_size() const { return _vtcm_size; } + + static size_t get_total_size() { + unsigned int arch_page_aligned_size = 0; + unsigned int arch_page_count = 0; + auto ret = HAP_query_total_VTCM(&arch_page_aligned_size, &arch_page_count); + if (ret != AEE_SUCCESS) { + DEVICE_LOG_ERROR("Failed to query total VTCM: %d\n", ret); + return 0; + } + + return arch_page_aligned_size; + } + + static size_t get_avail_block_size() { + unsigned int avail_block_size = 0; + unsigned int avail_page_size = 0; + unsigned int num_pages = 0; + auto ret = HAP_query_avail_VTCM(&avail_block_size, &avail_page_size, &num_pages); + if (ret != AEE_SUCCESS) { + DEVICE_LOG_ERROR("Failed to query available VTCM: %d\n", ret); + return 0; + } + + return avail_block_size; + } + + static size_t get_avail_page_size() { + unsigned int avail_block_size = 0; + unsigned int avail_page_size = 0; + unsigned int num_pages = 0; + auto ret = HAP_query_avail_VTCM(&avail_block_size, &avail_page_size, &num_pages); + if (ret != AEE_SUCCESS) { + DEVICE_LOG_ERROR("Failed to query available VTCM: %d\n", ret); + return 0; + } + + return avail_page_size; + } + + private: + void * _vtcm_mem = nullptr; + size_t _vtcm_size = 0; + + DISABLE_COPY_AND_MOVE(vtcm_mem); +}; + +} // namespace hexagon diff --git a/ggml/src/ggml-qnn/npu/host/buffer.cpp b/ggml/src/ggml-qnn/npu/host/buffer.cpp index ff5c8a320c..ace3dbee8e 100644 --- a/ggml/src/ggml-qnn/npu/host/buffer.cpp +++ b/ggml/src/ggml-qnn/npu/host/buffer.cpp @@ -75,6 +75,12 @@ void backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { memset(buffer_obj->get_buffer(), value, buffer_obj->get_size()); } +void backend_buffer_reset(ggml_backend_buffer_t buffer) { + auto * buffer_obj = get_buffer_object(buffer); + GGML_ASSERT(buffer_obj 
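
// --- Editor's illustrative sketch (not part of the patch): using the vtcm_mem RAII
// wrapper above as per-op scratch space. The function and the row-staging workload are
// assumptions; only the vtcm_mem API comes from this new file.

#include <algorithm>
#include <cstring>

#include "vtcm_mem.hpp"

static bool stage_in_vtcm(const uint8_t * src, size_t src_bytes) {
    // Never request more than a single page currently has available.
    const size_t size = std::min(src_bytes, hexagon::vtcm_mem::get_avail_page_size());
    hexagon::vtcm_mem scratch(size, /* single_page */ true);
    if (!scratch.is_valid()) {
        return false;  // HAP_request_VTCM failed; the caller falls back to DDR
    }

    // Copy as much of the source as fits, then run HVX kernels against the fast memory.
    std::memcpy(scratch.get_mem(), src, scratch.get_size());
    // ... HVX work on scratch.get_mem() ...
    return true;  // HAP_release_VTCM() runs in ~vtcm_mem() on scope exit
}
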
!= nullptr); + buffer_obj->clear_tensors(); +} + constexpr const ggml_backend_buffer_i backend_buffer_interface = { /* .free_buffer = */ backend_buffer_free_buffer, /* .get_base = */ backend_buffer_get_base, @@ -84,7 +90,7 @@ constexpr const ggml_backend_buffer_i backend_buffer_interface = { /* .get_tensor = */ backend_buffer_get_tensor, /* .cpy_tensor = */ backend_buffer_cpy_tensor, /* .clear = */ backend_buffer_clear, - /* .reset = */ nullptr, + /* .reset = */ backend_buffer_reset, }; const char * backend_buffer_type_get_name(ggml_backend_buffer_type_t buft) { @@ -190,6 +196,11 @@ std::shared_ptr host_buffer::init_tensor(ggml_tensor * tensor, remo return tensor_object; } +void host_buffer::clear_tensors() { + _tensors.clear(); + LOG_DEBUG("clear host_buffer(%p) tensors\n", (void *) _data); +} + host_buffer_type::host_buffer_type(ggml_backend_dev_t dev, const std::string & name, common::rpc_mem_ptr rpc_mem) : _name(name), _rpc_mem(rpc_mem) { diff --git a/ggml/src/ggml-qnn/npu/host/buffer.hpp b/ggml/src/ggml-qnn/npu/host/buffer.hpp index 955944bb98..38c9eed815 100644 --- a/ggml/src/ggml-qnn/npu/host/buffer.hpp +++ b/ggml/src/ggml-qnn/npu/host/buffer.hpp @@ -25,6 +25,8 @@ class host_buffer { std::shared_ptr init_tensor(ggml_tensor * tensor, remote_handle64 device_handle); + void clear_tensors(); + private: common::rpc_mem_ptr _allocator; void * _data = nullptr; diff --git a/ggml/src/ggml-qnn/npu/host/graph.cpp b/ggml/src/ggml-qnn/npu/host/graph.cpp index 9e8cf83204..72ef5cc786 100644 --- a/ggml/src/ggml-qnn/npu/host/graph.cpp +++ b/ggml/src/ggml-qnn/npu/host/graph.cpp @@ -32,7 +32,8 @@ bool host_graph::update(ggml_cgraph * cgraph) { _tensor_handles.reserve(cgraph->n_nodes); for (int i = 0; i < cgraph->n_nodes; ++i) { auto * node = cgraph->nodes[i]; - if (node->op == GGML_OP_NONE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE) { + if (node->op == GGML_OP_NONE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || + node->op == GGML_OP_RESHAPE) { // skip view liked ops LOG_DEBUG("node[%d]%s(%s), addr: %p, type: %s, skipped\n", i, ggml_get_name(node), ggml_op_desc(node), (void *) node, ggml_type_name(node->type)); @@ -55,8 +56,8 @@ bool host_graph::update(ggml_cgraph * cgraph) { } } - LOG_DEBUG("host_graph::update, host_graph(%p), ggml_cgraph(%p), tensor count(%zu)\n", (void *) this, - (void *) cgraph, _tensor_handles.size()); + LOG_DEBUG("host_graph::update, host_graph(%p), handle(%p), ggml_cgraph(%p), tensor count(%zu)\n", (void *) this, + (void *) _graph_handle, (void *) cgraph, _tensor_handles.size()); if (!_tensor_handles.empty()) { npu_device_graph_set_tensor(_device_handle, _graph_handle, _tensor_handles.data(), (int) _tensor_handles.size()); diff --git a/ggml/src/ggml-qnn/npu/host/host.cpp b/ggml/src/ggml-qnn/npu/host/host.cpp index 90c4cd29e8..28c561a49f 100644 --- a/ggml/src/ggml-qnn/npu/host/host.cpp +++ b/ggml/src/ggml-qnn/npu/host/host.cpp @@ -57,7 +57,7 @@ void backend_dev_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props ggml_backend_t backend_dev_init_backend(ggml_backend_dev_t dev, const char * params) { auto * dev_obj = get_device_object(dev); GGML_ASSERT(dev_obj != nullptr); - if (!dev_obj->init_device(dev, params)) { + if (!dev_obj->init_device()) { LOG_ERROR("[%s]Failed to init device\n", backend_dev_get_name(dev)); return nullptr; } diff --git a/ggml/src/ggml-qnn/npu/host/host_device.cpp b/ggml/src/ggml-qnn/npu/host/host_device.cpp index aa90cfa8bc..fb1ad4dfd6 100644 --- a/ggml/src/ggml-qnn/npu/host/host_device.cpp +++ 
b/ggml/src/ggml-qnn/npu/host/host_device.cpp @@ -7,6 +7,8 @@ #include +#include + #include "graph.hpp" #include "util.hpp" @@ -114,11 +116,117 @@ bool npu_device::is_device_initialized() const { return true; } -bool npu_device::init_device(ggml_backend_dev_t dev, const char * params) { +bool npu_device::init_device() { if (!init_rpc_mem()) { return false; } + if (!init_device_lib()) { + return false; + } + + return true; +} + +bool npu_device::supports_buft(ggml_backend_buffer_type_t buft) const { + return buft && buft->device && buft->device->context == this; +} + +bool npu_device::supports_op_impl(const ggml_tensor * op) { + static_assert(std::is_same::value, + "npu_device_fp16_t should be same as ggml_fp16_t"); + + if (op->op == GGML_OP_NONE) { + return true; + } + + if (op->op == GGML_OP_VIEW || op->op == GGML_OP_RESHAPE || op->op == GGML_OP_PERMUTE) { + return true; + } + + if (type_to_npu_type(op->type) == NPU_DATA_TYPE_COUNT) { + LOG_DEBUG("[%s]Unsupported op tensor type: %s\n", get_name(), ggml_type_name(op->type)); + return false; + } + + auto * src0 = op->src[0]; + if (!src0) { + LOG_DEBUG("[%s]Unsupported inplace op: %s\n", get_name(), ggml_op_name(op->op)); + return false; + } + + if (type_to_npu_type(src0->type) == NPU_DATA_TYPE_COUNT) { + LOG_DEBUG("[%s]Unsupported src0 tensor type: %s\n", get_name(), ggml_type_name(src0->type)); + return false; + } + + auto * src1 = op->src[1]; + if (src1 && type_to_npu_type(src1->type) == NPU_DATA_TYPE_COUNT) { + LOG_DEBUG("[%s]Unsupported src1 tensor type: %s\n", get_name(), ggml_type_name(src1->type)); + return false; + } + + auto npu_op = op_to_npu_op(op->op); + if (npu_op == NPU_OP_COUNT) { + LOG_DEBUG("[%s]Unsupported op: %s\n", get_name(), ggml_op_name(op->op)); + return false; + } + + if (!_device_handle && !init_device()) { + LOG_DEBUG("[%s]NPU device initialization failed\n", get_name()); + return false; + } + + constexpr const auto get_spec = [](const ggml_tensor * tensor) -> npu_device_tensor_spec { + if (!tensor) { + return npu_device_tensor_spec{}; + } + + static_assert(DEVICE_TENSOR_MAX_DIMS == GGML_MAX_DIMS, "tensor dimensions mismatch"); + npu_device_tensor_spec spec{}; + spec.ne[0] = tensor->ne[0]; + spec.ne[1] = tensor->ne[1]; + spec.ne[2] = tensor->ne[2]; + spec.ne[3] = tensor->ne[3]; + spec.type = type_to_npu_type(tensor->type); + return spec; + }; + + boolean supported = false; + auto src0_spec = get_spec(src0); + auto src1_spec = get_spec(src1); + auto dst_spec = get_spec(op); + auto ret = npu_device_device_support_op(_device_handle, &src0_spec, &src1_spec, &dst_spec, npu_op, &supported); + if (ret != AEE_SUCCESS || !supported) { + LOG_DEBUG("[%s][%s]unsupported %s(%s,%s), ret: 0x%x, supported: %d\n", get_name(), ggml_op_name(op->op), + ggml_type_name(op->type), ggml_type_name(src0->type), (src1 ? 
ggml_type_name(src1->type) : "null"), + ret, supported); + return false; + } + + return true; +} + +bool npu_device::init_rpc_mem() { + if (!_rpc_mem) { + auto rpc_interface = std::make_shared(); + if (!rpc_interface->is_valid()) { + LOG_ERROR("[%s]Failed to load rpc memory library\n", get_name()); + return false; + } + + auto rpc_mem = std::make_shared(rpc_interface); + _rpc_interface = rpc_interface; + _rpc_mem = rpc_mem; + LOG_DEBUG("[%s]rpc memory initialized\n", get_name()); + } else { + LOG_DEBUG("[%s]rpc memory already initialized\n", get_name()); + } + + return true; +} + +bool npu_device::init_device_lib() { if (!_device_handle) { auto arch = get_dsp_arch(_rpc_interface, _dsp_domain_id); const auto & device_lib_info = get_device_library_info(arch); @@ -152,97 +260,38 @@ bool npu_device::init_device(ggml_backend_dev_t dev, const char * params) { return true; } -bool npu_device::supports_buft(ggml_backend_buffer_type_t buft) const { - return buft && buft->device && buft->device->context == this; -} - -bool npu_device::supports_op_impl(const ggml_tensor * op) { - if (op->op == GGML_OP_NONE) { - return true; - } - - if (type_to_npu_type(op->type) == NPU_DATA_TYPE_COUNT) { - LOG_DEBUG("[%s]Unsupported op tensor type: %s\n", get_name(), ggml_type_name(op->type)); - return false; - } - - auto * src0 = op->src[0]; - if (!src0) { - LOG_DEBUG("[%s]Unsupported inplace op: %s\n", get_name(), ggml_op_name(op->op)); - return false; - } - - if (type_to_npu_type(src0->type) == NPU_DATA_TYPE_COUNT) { - LOG_DEBUG("[%s]Unsupported src0 tensor type: %s\n", get_name(), ggml_type_name(src0->type)); - return false; - } - - auto * src1 = op->src[1]; - if (src1 && type_to_npu_type(src1->type) == NPU_DATA_TYPE_COUNT) { - LOG_DEBUG("[%s]Unsupported src1 tensor type: %s\n", get_name(), ggml_type_name(src1->type)); - return false; - } - - auto npu_op = op_to_npu_op(op->op); - if (npu_op == NPU_OP_COUNT) { - LOG_DEBUG("[%s]Unsupported op: %s\n", get_name(), ggml_op_name(op->op)); - return false; - } - - constexpr const auto get_spec = [](const ggml_tensor * tensor) -> npu_device_tensor_spec { - if (!tensor) { - return npu_device_tensor_spec{}; - } - - static_assert(DEVICE_TENSOR_MAX_DIMS == GGML_MAX_DIMS, "tensor dimensions mismatch"); - npu_device_tensor_spec spec{}; - spec.ne[0] = tensor->ne[0]; - spec.ne[1] = tensor->ne[1]; - spec.ne[2] = tensor->ne[2]; - spec.ne[3] = tensor->ne[3]; - spec.type = type_to_npu_type(tensor->type); - return spec; - }; - - boolean supported = false; - auto src0_spec = get_spec(src0); - auto src1_spec = get_spec(src1); - auto dst_spec = get_spec(op); - auto ret = npu_device_device_support_op(_device_handle, &src0_spec, &src1_spec, &dst_spec, npu_op, &supported); - if (ret != AEE_SUCCESS || !supported) { - LOG_DEBUG("[%s]Unsupported op: %s, ret: 0x%x, supported: %d\n", get_name(), ggml_op_name(op->op), ret, - supported); - return false; - } - - LOG_DEBUG("[%s]Supported op: %s\n", get_name(), ggml_op_name(op->op)); - return true; -} - -bool npu_device::init_rpc_mem() { - if (!_rpc_mem) { - auto rpc_interface = std::make_shared(); - if (!rpc_interface->is_valid()) { - LOG_ERROR("[%s]Failed to load rpc memory library\n", get_name()); - return false; - } - - auto rpc_mem = std::make_shared(rpc_interface); - _rpc_interface = rpc_interface; - _rpc_mem = rpc_mem; - LOG_DEBUG("[%s]rpc memory initialized\n", get_name()); - } else { - LOG_DEBUG("[%s]rpc memory already initialized\n", get_name()); - } - - return true; -} - bool npu_device::offload_op(const ggml_tensor * op) { // TODO: 
implement this return false; } +#ifndef NDEBUG +bool npu_device::supports_op(const ggml_tensor * op) { + char op_desc[1024]; + get_op_tensor_desc(op, op_desc, sizeof(op_desc)); + + if (supports_op_impl(op)) { + if (op->op != GGML_OP_NONE && op->op != GGML_OP_VIEW && op->op != GGML_OP_RESHAPE && + op->op != GGML_OP_PERMUTE) { + _supported_op++; + LOG_DEBUG("[%s][%s]supported, %s, supported/unsupported: %u/%u\n", get_name(), ggml_op_name(op->op), + op_desc, _supported_op.load(), _unsupported_op.load()); + } + + return true; + } + + _unsupported_op++; + LOG_DEBUG("[%s][%s]unsupported, %s, supported/unsupported: %u/%u\n", get_name(), ggml_op_name(op->op), op_desc, + _supported_op.load(), _unsupported_op.load()); + return false; +} +#else +bool npu_device::supports_op(const ggml_tensor * op) { + return supports_op_impl(op); +} +#endif + ggml_backend_buffer_type_t npu_device::get_default_buffer_type(ggml_backend_dev_t dev) { // Note that this function will be called before the npu_device::init_device if (!init_rpc_mem()) { diff --git a/ggml/src/ggml-qnn/npu/host/host_device.hpp b/ggml/src/ggml-qnn/npu/host/host_device.hpp index efc7914f18..b2fab667d6 100644 --- a/ggml/src/ggml-qnn/npu/host/host_device.hpp +++ b/ggml/src/ggml-qnn/npu/host/host_device.hpp @@ -31,37 +31,18 @@ class npu_device { ggml_backend_buffer_type_t get_default_buffer_type(ggml_backend_dev_t dev); bool is_device_initialized() const; - bool init_device(ggml_backend_dev_t dev, const char * params); + bool init_device(); bool supports_buft(ggml_backend_buffer_type_t buft) const; bool offload_op(const ggml_tensor * op); - -#ifndef NDEBUG - bool supports_op(const ggml_tensor * op) { - if (supports_op_impl(op)) { - if (op->op != GGML_OP_NONE) { - _supported_op++; - LOG_DEBUG("[%s]Supported op: %s, supported/unsupported: %u/%u\n", get_name(), ggml_op_name(op->op), - _supported_op.load(), _unsupported_op.load()); - } - - return true; - } - - _unsupported_op++; - LOG_DEBUG("[%s]Unsupported op: %s, supported/unsupported: %u/%u\n", get_name(), ggml_op_name(op->op), - _supported_op.load(), _unsupported_op.load()); - return false; - } -#else - bool supports_op(const ggml_tensor * op) { return supports_op_impl(op); } -#endif + bool supports_op(const ggml_tensor * op); remote_handle64 get_device_handle() const { return _device_handle; } private: bool supports_op_impl(const ggml_tensor * op); bool init_rpc_mem(); + bool init_device_lib(); std::string _name = "hexagon-npu"; std::string _description = "Hexagon NPU"; diff --git a/ggml/src/ggml-qnn/npu/host/tensor.hpp b/ggml/src/ggml-qnn/npu/host/tensor.hpp index e7d5f7a88a..c5d2decbc5 100644 --- a/ggml/src/ggml-qnn/npu/host/tensor.hpp +++ b/ggml/src/ggml-qnn/npu/host/tensor.hpp @@ -40,19 +40,17 @@ class host_tensor { tensor->extra = this; _ggml_tensor = tensor; - LOG_DEBUG( - "host_tensor(%p) created, ggml_tensor(%p[%ldx%ldx%ldx%ld], nb[%ld][%ld][%ld][%ld]), " - "device_tensor_handle(%p)\n", - (void *) this, (void *) tensor, (long) tensor->ne[0], (long) tensor->ne[1], (long) tensor->ne[2], - (long) tensor->ne[3], (long) tensor->nb[0], (long) tensor->nb[1], (long) tensor->nb[2], - (long) tensor->nb[3], (void *) _device_tensor_handle); + LOG_DEBUG("host_tensor(%p), ggml_tensor(%p[%ldx%ldx%ldx%ld], nb[%ld][%ld][%ld][%ld], %s), handle(%p)\n", + (void *) this, (void *) tensor, (long) tensor->ne[0], (long) tensor->ne[1], (long) tensor->ne[2], + (long) tensor->ne[3], (long) tensor->nb[0], (long) tensor->nb[1], (long) tensor->nb[2], + (long) tensor->nb[3], ggml_type_name(tensor->type), (void *) 
_device_tensor_handle); } ~host_tensor() { LOG_DEBUG("host_tensor(%p) destroy, device_tensor_handle: %p\n", (void *) this, (void *) _device_tensor_handle); if (_device_tensor_handle) { npu_device_tensor_free(_device_handle, _device_tensor_handle); - _ggml_tensor->extra = nullptr; + // TODO: figure out why the _ggml_tensor is invalid here } } diff --git a/ggml/src/ggml-qnn/npu/host/util.cpp b/ggml/src/ggml-qnn/npu/host/util.cpp index 5db54b661e..9ce9841004 100644 --- a/ggml/src/ggml-qnn/npu/host/util.cpp +++ b/ggml/src/ggml-qnn/npu/host/util.cpp @@ -2,6 +2,17 @@ #include +#define GGML_COMMON_DECL_CPP +#include "ggml-common.h" +#undef GGML_COMMON_DECL_CPP + +static_assert(sizeof(npu_device_block_q4_K) == sizeof(block_q4_K), "npu_device_block_q4_K size mismatch"); +static_assert(sizeof(npu_device_block_q4_0) == sizeof(block_q4_0), "npu_device_block_q4_0 size mismatch"); +static_assert(sizeof(npu_device_block_q8_0) == sizeof(block_q8_0), "npu_device_block_q8_0 size mismatch"); +static_assert(QUANT_K_SCALE_SIZE == K_SCALE_SIZE, "QUANT_K_SCALE_SIZE size mismatch"); +static_assert(QUANT_K_BLOCK_SIZE == QK_K, "QUANT_K_BLOCK_SIZE size mismatch"); +static_assert(QUANT_BLOCK_SIZE == QK4_0, "QUANT_BLOCK_SIZE size mismatch"); + namespace hexagon { enum npu_device_tensor_op op_to_npu_op(ggml_op op) { @@ -23,6 +34,14 @@ enum npu_device_tensor_data_type type_to_npu_type(ggml_type type) { switch (type) { case GGML_TYPE_F32: return NPU_DATA_TYPE_F32; + case GGML_TYPE_F16: + return NPU_DATA_TYPE_F16; + case GGML_TYPE_Q4_K: + return NPU_DATA_TYPE_Q4_K; + case GGML_TYPE_Q4_0: + return NPU_DATA_TYPE_Q4_0; + case GGML_TYPE_Q8_0: + return NPU_DATA_TYPE_Q8_0; default: return NPU_DATA_TYPE_COUNT; } @@ -93,4 +112,56 @@ void enable_unsigned_dsp_module(common::rpc_interface_ptr rpc_interface, uint32_ } } +void get_op_tensor_desc(const ggml_tensor * dst, char * out, size_t max_len) { + if (dst == nullptr) { + snprintf(out, max_len, "null"); + return; + } + + constexpr const auto print_tensor = [](const ggml_tensor * tensor, char * out, size_t max_len) { + auto dims = ggml_n_dims(tensor); + + switch (dims) { + default: + case 4: + snprintf(out, max_len, "%s[%ldx%ldx%ldx%ld]", ggml_type_name(tensor->type), (long) tensor->ne[0], + (long) tensor->ne[1], (long) tensor->ne[2], (long) tensor->ne[3]); + break; + case 3: + snprintf(out, max_len, "%s[%ldx%ldx%ld]", ggml_type_name(tensor->type), (long) tensor->ne[0], + (long) tensor->ne[1], (long) tensor->ne[2]); + break; + case 2: + snprintf(out, max_len, "%s[%ldx%ld]", ggml_type_name(tensor->type), (long) tensor->ne[0], + (long) tensor->ne[1]); + break; + case 1: + snprintf(out, max_len, "%s[%ld]", ggml_type_name(tensor->type), (long) tensor->ne[0]); + break; + } + }; + + auto * src0 = dst->src[0]; + if (src0 == nullptr) { + print_tensor(dst, out, max_len); + return; + } + + char dst_desc[256]; + print_tensor(dst, dst_desc, sizeof(dst_desc)); + + char src0_desc[256]; + print_tensor(src0, src0_desc, sizeof(src0_desc)); + + auto * src1 = dst->src[1]; + if (src1 == nullptr) { + snprintf(out, max_len, "dst: %s, src0: %s", dst_desc, src0_desc); + return; + } + + char src1_desc[256]; + print_tensor(src1, src1_desc, sizeof(src1_desc)); + snprintf(out, max_len, "dst: %s, src0: %s, src1: %s", dst_desc, src0_desc, src1_desc); +} + } // namespace hexagon diff --git a/ggml/src/ggml-qnn/npu/host/util.hpp b/ggml/src/ggml-qnn/npu/host/util.hpp index c001272d4c..469e506660 100644 --- a/ggml/src/ggml-qnn/npu/host/util.hpp +++ b/ggml/src/ggml-qnn/npu/host/util.hpp @@ -23,4 +23,6 @@ const char * 
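
// --- Editor's note with a worked size check (not part of the patch): the
// static_asserts added in host/util.cpp above pin the IDL-generated structs to ggml's
// block layouts. With the usual tight packing the byte sizes work out to:
//   npu_device_block_q4_0: 2 (d) + 32/2 (qs)                           = 18 bytes
//   npu_device_block_q8_0: 2 (d) + 32 (qs)                             = 34 bytes
//   npu_device_block_q4_K: 2 (d) + 2 (dmin) + 12 (scales) + 256/2 (qs) = 144 bytes
// An equivalent host-side spot check (assuming hexagon_npu.h is already in scope):

static_assert(sizeof(npu_device_block_q4_0) == 18, "unexpected q4_0 block size");
static_assert(sizeof(npu_device_block_q8_0) == 34, "unexpected q8_0 block size");
static_assert(sizeof(npu_device_block_q4_K) == 144, "unexpected q4_K block size");
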
get_dsp_arch_desc(hexagon_dsp_arch arch); void enable_unsigned_dsp_module(common::rpc_interface_ptr rpc_interface, uint32_t domain_id); +void get_op_tensor_desc(const ggml_tensor * dst, char * out, size_t max_len); + } // namespace hexagon diff --git a/ggml/src/ggml-qnn/npu/idl/hexagon_npu.idl b/ggml/src/ggml-qnn/npu/idl/hexagon_npu.idl index d62e65b3bd..df3cdf4957 100644 --- a/ggml/src/ggml-qnn/npu/idl/hexagon_npu.idl +++ b/ggml/src/ggml-qnn/npu/idl/hexagon_npu.idl @@ -4,6 +4,9 @@ const uint32_t DEVICE_TENSOR_MAX_DIMS = 4; const uint32_t DEVICE_TENSOR_MAX_SRC = 2; +const uint32_t QUANT_BLOCK_SIZE = 32; +const uint32_t QUANT_K_BLOCK_SIZE = 256; +const uint32_t QUANT_K_SCALE_SIZE = 12; interface npu_device : remote_handle64{ @@ -11,6 +14,25 @@ interface npu_device : remote_handle64{ typedef uint64_t tensor_handle_t; typedef uint64_t graph_handle_t; + typedef uint16_t fp16_t; + + struct block_q4_0 { + fp16_t d; + uint8_t qs[QUANT_BLOCK_SIZE / 2]; + }; + + struct block_q4_K { + fp16_t d; + fp16_t dmin; + uint8_t scales[QUANT_K_SCALE_SIZE]; + uint8_t qs[QUANT_K_BLOCK_SIZE / 2]; + }; + + struct block_q8_0 { + fp16_t d; + int8_t qs[QUANT_BLOCK_SIZE]; + }; + enum tensor_op { NPU_OP_MUL_MAT, NPU_OP_ADD, @@ -21,6 +43,10 @@ interface npu_device : remote_handle64{ enum tensor_data_type { NPU_DATA_TYPE_F32, + NPU_DATA_TYPE_F16, + NPU_DATA_TYPE_Q8_0, + NPU_DATA_TYPE_Q4_0, + NPU_DATA_TYPE_Q4_K, NPU_DATA_TYPE_COUNT }; diff --git a/ggml/src/ggml-qnn/qnn/CMakeLists.txt b/ggml/src/ggml-qnn/qnn/CMakeLists.txt index 010fcf08db..8b13083998 100644 --- a/ggml/src/ggml-qnn/qnn/CMakeLists.txt +++ b/ggml/src/ggml-qnn/qnn/CMakeLists.txt @@ -26,11 +26,11 @@ else() message("GGML_QNN_ENABLE_CPU_BACKEND is disabled") endif() -if(GGML_QNN_ENABLE_PERFORMANCE_TRACKING) - message("GGML_QNN_ENABLE_PERFORMANCE_TRACKING is enabled") - target_compile_definitions(qnn-backend PUBLIC GGML_QNN_ENABLE_PERFORMANCE_TRACKING) +if(GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING) + message("GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING is enabled") + target_compile_definitions(qnn-backend PUBLIC GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING) else() - message("GGML_QNN_ENABLE_PERFORMANCE_TRACKING is disabled") + message("GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING is disabled") endif() if(CMAKE_SYSTEM_NAME STREQUAL "Android") diff --git a/ggml/src/ggml-qnn/qnn/graph.cpp b/ggml/src/ggml-qnn/qnn/graph.cpp index 70fc71c211..3094b5c3be 100644 --- a/ggml/src/ggml-qnn/qnn/graph.cpp +++ b/ggml/src/ggml-qnn/qnn/graph.cpp @@ -10,7 +10,7 @@ #include "profiler.hpp" #include "tensor.hpp" -#ifdef GGML_QNN_ENABLE_PERFORMANCE_TRACKING +#ifdef GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING # define GRAPH_PROFILE_HANDLE (_event_tracer ? 
_event_tracer->get_handle() : nullptr) # define GRAPH_PROFILE_PRINT() \ if (_event_tracer) { \ @@ -381,7 +381,7 @@ qnn_graph::qnn_graph(const std::string & graph_name, backend_index_type device, return; } -#ifdef GGML_QNN_ENABLE_PERFORMANCE_TRACKING +#ifdef GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING if (device == QNN_BACKEND_NPU) { _event_tracer = std::make_shared( graph_name, qnn_interface, qnn_instance->get_qnn_backend_handle(), qnn_event_tracer::PROFILE_OP_TRACE); diff --git a/ggml/src/ggml-qnn/qnn/graph.hpp b/ggml/src/ggml-qnn/qnn/graph.hpp index 5e862112fb..99ffeaa3d0 100644 --- a/ggml/src/ggml-qnn/qnn/graph.hpp +++ b/ggml/src/ggml-qnn/qnn/graph.hpp @@ -79,7 +79,7 @@ class qnn_graph { std::vector _qnn_tensor_inputs; std::vector _qnn_tensor_outputs; -#ifdef GGML_QNN_ENABLE_PERFORMANCE_TRACKING +#ifdef GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING // profiler qnn_event_tracer_ptr _event_tracer; #endif diff --git a/ggml/src/ggml-qnn/qnn/profiler.hpp b/ggml/src/ggml-qnn/qnn/profiler.hpp index 34db09e0bf..0d4f839fda 100644 --- a/ggml/src/ggml-qnn/qnn/profiler.hpp +++ b/ggml/src/ggml-qnn/qnn/profiler.hpp @@ -12,7 +12,7 @@ namespace qnn { -#ifdef GGML_QNN_ENABLE_PERFORMANCE_TRACKING +#ifdef GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING class qnn_scoped_timer { public: @@ -92,7 +92,7 @@ using qnn_event_tracer_ptr = std::shared_ptr; } // namespace qnn -#ifdef GGML_QNN_ENABLE_PERFORMANCE_TRACKING +#ifdef GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING # define QNN_SCOPED_PERFORMANCE_TRACKER(fmt, ...) \ auto __qnn_timer_##__LINE__ = qnn::make_scope_perf_timer(fmt, __VA_ARGS__) #else