commit c1038fad43
nullname, 2026-02-07 03:05:31 +08:00, committed by GitHub
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
80 changed files with 34642 additions and 0 deletions

View File

@@ -163,6 +163,7 @@ llama_option_depr(WARNING LLAMA_RPC GGML_RPC)
llama_option_depr(WARNING LLAMA_SYCL GGML_SYCL)
llama_option_depr(WARNING LLAMA_SYCL_F16 GGML_SYCL_F16)
llama_option_depr(WARNING LLAMA_CANN GGML_CANN)
llama_option_depr(WARNING LLAMA_QNN GGML_QNN)
include("cmake/license.cmake")
license_add_file("llama.cpp" "LICENSE")

docs/ops/hexagon-npu.csv (new file, 17663 additions)

File diff suppressed because it is too large

View File

@@ -247,6 +247,7 @@ set (GGML_SYCL_TARGET "INTEL" CACHE STRING
"ggml: sycl target device")
set (GGML_SYCL_DEVICE_ARCH "" CACHE STRING
"ggml: sycl device architecture")
option(GGML_QNN "ggml: use QNN" OFF)
option(GGML_OPENCL "ggml: use OpenCL" OFF)
option(GGML_OPENCL_PROFILING "ggml: use OpenCL profiling (increases overhead)" OFF)
@@ -326,6 +327,7 @@ set(GGML_PUBLIC_HEADERS
include/ggml-sycl.h
include/ggml-vulkan.h
include/ggml-webgpu.h
include/ggml-qnn.h
include/ggml-zendnn.h
include/gguf.h)

ggml/include/ggml-qnn.h (new file, 13 additions)
View File

@@ -0,0 +1,13 @@
#pragma once
#include "ggml-backend.h"
#ifdef __cplusplus
extern "C" {
#endif
GGML_API ggml_backend_reg_t ggml_backend_qnn_reg(void);
#ifdef __cplusplus
}
#endif
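The new public header exposes a single registration entry point for the backend. Below is a minimal usage sketch (not part of the diff) showing how a host program could enumerate the devices behind it; the registry helpers ggml_backend_reg_dev_count, ggml_backend_reg_dev_get and ggml_backend_dev_name are assumed to come from ggml-backend.h as in upstream ggml.

#include "ggml-backend.h"
#include "ggml-qnn.h"

#include <cstdio>

// Sketch: list the devices exposed by the QNN registration entry point.
// Assumes the registry helpers from ggml-backend.h; build with GGML_USE_QNN.
int main() {
    ggml_backend_reg_t reg = ggml_backend_qnn_reg();
    const size_t n_dev = ggml_backend_reg_dev_count(reg);
    for (size_t i = 0; i < n_dev; ++i) {
        ggml_backend_dev_t dev = ggml_backend_reg_dev_get(reg, i);
        std::printf("QNN device %zu: %s\n", i, ggml_backend_dev_name(dev));
    }
    return 0;
}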

View File

@@ -459,6 +459,7 @@ ggml_add_backend(WebGPU)
ggml_add_backend(zDNN)
ggml_add_backend(OpenCL)
ggml_add_backend(Hexagon)
ggml_add_backend(QNN)
ggml_add_backend(ZenDNN)
foreach (target ggml-base ggml)

View File

@@ -78,6 +78,10 @@
#include "ggml-cann.h"
#endif
#ifdef GGML_USE_QNN
#include "ggml-qnn.h"
#endif
#ifdef GGML_USE_ZENDNN
#include "ggml-zendnn.h"
#endif
@@ -154,6 +158,9 @@ struct ggml_backend_registry {
#ifdef GGML_USE_RPC
register_backend(ggml_backend_rpc_reg());
#endif
#ifdef GGML_USE_QNN
register_backend(ggml_backend_qnn_reg());
#endif
#ifdef GGML_USE_CPU
register_backend(ggml_backend_cpu_reg());
#endif

View File

@@ -0,0 +1,141 @@
message(STATUS "Using QNN backend")
option(GGML_HEXAGON_NPU_ONLY "ggml-qnn: Only use Hexagon NPU" OFF)
option(GGML_QNN_ENABLE_HEXAGON_BACKEND "ggml-qnn: Enable Hexagon custom package" ${GGML_HEXAGON_NPU_ONLY})
option(GGML_HEXAGON_ENABLE_QUANTIZED_TENSORS "ggml-qnn: Enable quantized tensors support" OFF)
option(GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING "ggml-qnn: Enable performance tracking" OFF)
if(CMAKE_SYSTEM_NAME STREQUAL "Android")
find_library(LOG_LIB log)
set(COMMON_LINK_LIBRARIES ${LOG_LIB})
elseif(CMAKE_SYSTEM_NAME STREQUAL "Windows" OR CMAKE_SYSTEM_NAME STREQUAL "Linux")
message("Building for Linux or Windows")
else()
message(FATAL_ERROR "QNN is currently only available on Android, Windows and Linux")
endif()
if(NOT DEFINED GGML_QNN_SDK_PATH)
# try to read the SDK path from environment variables
# TODO: create a function to search for the SDK path
if(DEFINED ENV{QNN_SDK_PATH})
set(GGML_QNN_SDK_PATH $ENV{QNN_SDK_PATH})
elseif(DEFINED ENV{QNN_SDK_ROOT})
message("found QNN_SDK_ROOT: $ENV{QNN_SDK_ROOT}")
set(GGML_QNN_SDK_PATH $ENV{QNN_SDK_ROOT})
else()
message(FATAL_ERROR "GGML_QNN_SDK_PATH not defined")
endif()
endif()
message("CMAKE_BUILD_TYPE: ${CMAKE_BUILD_TYPE}")
message("CMAKE_CXX_FLAGS_DEBUG: ${CMAKE_CXX_FLAGS_DEBUG}")
message("CMAKE_CXX_FLAGS_RELEASE: ${CMAKE_CXX_FLAGS_RELEASE}")
message("QNN_SDK_PATH: ${GGML_QNN_SDK_PATH}")
message("GGML_QNN: ${GGML_QNN}")
message("GGML_QNN_ENABLE_HEXAGON_BACKEND: ${GGML_QNN_ENABLE_HEXAGON_BACKEND}")
message("GGML_HEXAGON_NPU_ONLY: ${GGML_HEXAGON_NPU_ONLY}")
message("GGML_HEXAGON_ENABLE_QUANTIZED_TENSORS: ${GGML_HEXAGON_ENABLE_QUANTIZED_TENSORS}")
message("GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING: ${GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING}")
ggml_add_backend_library(ggml-qnn
../../include/ggml-qnn.h
)
target_link_libraries(ggml-qnn PRIVATE ${COMMON_LINK_LIBRARIES})
add_subdirectory(shared)
if(GGML_HEXAGON_NPU_ONLY)
message("GGML_HEXAGON_NPU_ONLY is enabled")
set(GGML_QNN_ENABLE_HEXAGON_BACKEND ON)
else()
message("GGML_HEXAGON_NPU_ONLY is disabled")
add_subdirectory(qnn)
target_link_libraries(runtime-common PUBLIC qnn-backend)
endif()
if(GGML_QNN_ENABLE_HEXAGON_BACKEND)
message("GGML_QNN_ENABLE_HEXAGON_BACKEND is enabled")
add_subdirectory(npu)
target_link_libraries(hexagon-npu-host runtime-common)
target_link_libraries(ggml-qnn PRIVATE hexagon-npu-host)
else()
message("GGML_QNN_ENABLE_HEXAGON_BACKEND is disabled")
target_link_libraries(ggml-qnn PRIVATE runtime-common)
endif()
# Copy dynamic libraries
set(BACKEND_RUNTIME_LIBS "")
if(CMAKE_SYSTEM_NAME STREQUAL "Android" OR CMAKE_SYSTEM_NAME STREQUAL "Linux")
if(CMAKE_SYSTEM_NAME STREQUAL "Android")
# Android
set(QNN_SDK_LIB_PATH "${GGML_QNN_SDK_PATH}/lib/aarch64-android")
elseif(CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64")
# Linux x86_64
set(QNN_SDK_LIB_PATH "${GGML_QNN_SDK_PATH}/lib/x86_64-linux-clang")
else()
# Linux aarch64
set(QNN_SDK_LIB_PATH "${GGML_QNN_SDK_PATH}/lib/aarch64-oe-linux-gcc11.2")
endif()
list(APPEND BACKEND_RUNTIME_LIBS "${QNN_SDK_LIB_PATH}/libQnnSystem.so")
list(APPEND BACKEND_RUNTIME_LIBS "${QNN_SDK_LIB_PATH}/libQnnCpu.so")
list(APPEND BACKEND_RUNTIME_LIBS "${QNN_SDK_LIB_PATH}/libQnnGpu.so")
list(APPEND BACKEND_RUNTIME_LIBS "${QNN_SDK_LIB_PATH}/libQnnHtp.so")
file(GLOB HTP_STUB_LIBS "${QNN_SDK_LIB_PATH}/libQnnHtp*.so")
list(APPEND BACKEND_RUNTIME_LIBS ${HTP_STUB_LIBS})
if(CMAKE_SYSTEM_NAME STREQUAL "Android")
file(GLOB HTP_SKEL_LIBS "${GGML_QNN_SDK_PATH}/lib/hexagon-*/unsigned/libQnnHtp*Skel.so")
list(APPEND BACKEND_RUNTIME_LIBS ${HTP_SKEL_LIBS})
if(CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64")
if(EXISTS "${CMAKE_ANDROID_NDK}/prebuilt/android-arm64/gdbserver/gdbserver")
list(APPEND BACKEND_RUNTIME_LIBS "${CMAKE_ANDROID_NDK}/prebuilt/android-arm64/gdbserver/gdbserver")
message("old ndk, copy gdbserver")
else()
file(GLOB LLDB_SERVER "${CMAKE_ANDROID_NDK}/toolchains/llvm/prebuilt/linux-x86_64/lib64/clang/*/lib/linux/aarch64/lldb-server")
list(APPEND BACKEND_RUNTIME_LIBS ${LLDB_SERVER})
message("new ndk, copy lldb-server")
endif()
file(GLOB OMP_LIBS "${CMAKE_ANDROID_NDK}/toolchains/llvm/prebuilt/linux-x86_64/lib64/clang/*/lib/linux/aarch64/libomp.so")
file(GLOB ASAN_LIBS "${CMAKE_ANDROID_NDK}/toolchains/llvm/prebuilt/linux-x86_64/lib64/clang/*/lib/linux/libclang_rt.asan-aarch64-android.so")
list(APPEND BACKEND_RUNTIME_LIBS ${OMP_LIBS})
list(APPEND BACKEND_RUNTIME_LIBS ${ASAN_LIBS})
endif()
else()
# Linux
list(APPEND BACKEND_RUNTIME_LIBS "${QNN_SDK_LIB_PATH}/libHtpPrepare.so")
endif()
elseif(CMAKE_SYSTEM_NAME STREQUAL "Windows")
if(CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64")
# x86_64
set(QNN_SDK_LIB_PATH "${GGML_QNN_SDK_PATH}/lib/x86_64-windows-msvc")
else()
# aarch64
set(QNN_SDK_LIB_PATH "${GGML_QNN_SDK_PATH}/lib/aarch64-windows-msvc")
endif()
list(APPEND BACKEND_RUNTIME_LIBS "${QNN_SDK_LIB_PATH}/QnnSystem.dll")
list(APPEND BACKEND_RUNTIME_LIBS "${QNN_SDK_LIB_PATH}/QnnCpu.dll")
list(APPEND BACKEND_RUNTIME_LIBS "${QNN_SDK_LIB_PATH}/QnnGpu.dll")
list(APPEND BACKEND_RUNTIME_LIBS "${QNN_SDK_LIB_PATH}/QnnHtp.dll")
file(GLOB HTP_STUB_LIBS "${QNN_SDK_LIB_PATH}/QnnHtp*.dll")
if(CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64")
list(APPEND BACKEND_RUNTIME_LIBS "${QNN_SDK_LIB_PATH}/HtpPrepare.dll")
endif()
list(APPEND BACKEND_RUNTIME_LIBS ${HTP_STUB_LIBS})
endif()
foreach(RUNTIME_LIB ${BACKEND_RUNTIME_LIBS})
message("Copy: ${RUNTIME_LIB} -> ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}")
add_custom_command(
TARGET ggml-qnn POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy
${RUNTIME_LIB}
${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
endforeach()

View File

@@ -0,0 +1,134 @@
enable_language(ASM)
cmake_policy(SET CMP0115 OLD)
if(DEFINED ENV{HEXAGON_SDK_ROOT})
set(HEXAGON_SDK_ROOT $ENV{HEXAGON_SDK_ROOT})
message("HEXAGON_SDK_ROOT (from environment): ${HEXAGON_SDK_ROOT}")
elseif(DEFINED HEXAGON_SDK_ROOT)
message("HEXAGON_SDK_ROOT: ${HEXAGON_SDK_ROOT}")
else()
message(FATAL_ERROR "HEXAGON_SDK_ROOT not defined")
endif()
if(${CMAKE_SYSTEM_NAME} MATCHES "Android")
set(PREBUILT_LIB_DIR "android_aarch64")
elseif(${CMAKE_SYSTEM_NAME} MATCHES "Linux")
set(PREBUILT_LIB_DIR "UbuntuARM_aarch64")
elseif(${CMAKE_SYSTEM_NAME} MATCHES "Windows")
# Windows
set(PREBUILT_LIB_DIR "windows_aarch64")
endif()
if(HEXAGON_SDK_ROOT)
include(${HEXAGON_SDK_ROOT}/build/cmake/hexagon_fun.cmake)
else()
include(${HEXAGON_CMAKE_ROOT}/hexagon_fun.cmake)
endif()
# Base Include dirs for the Project
set(common_incs
${CMAKE_CURRENT_BINARY_DIR}/
${HEXAGON_SDK_ROOT}/incs/
${HEXAGON_SDK_ROOT}/incs/stddef/
${HEXAGON_SDK_ROOT}/incs/HAP/
${HEXAGON_SDK_ROOT}/rtos/qurt/
${HEXAGON_SDK_ROOT}/utils/examples/
)
include_directories(${common_incs})
# host build
file(GLOB common_srcs "${CMAKE_CURRENT_LIST_DIR}/common/*.cpp")
file(GLOB host_srcs "${CMAKE_CURRENT_LIST_DIR}/host/*.cpp")
set(stub_srcs "${CMAKE_CURRENT_BINARY_DIR}/npu_device_stub.c")
add_library(hexagon-npu-host STATIC
${common_srcs}
${host_srcs}
${stub_srcs}
)
# disable warnings for the stub
set_source_files_properties(
${stub_srcs}
PROPERTIES
COMPILE_FLAGS "-w"
)
build_idl(idl/hexagon_npu.idl hexagon-npu-host)
# Add compile definitions to the target
target_compile_definitions(hexagon-npu-host PUBLIC
VERIFY_PRINT_ERROR
GGML_QNN_ENABLE_HEXAGON_BACKEND
)
if(GGML_HEXAGON_ENABLE_QUANTIZED_TENSORS)
target_compile_definitions(hexagon-npu-host PUBLIC
GGML_HEXAGON_ENABLE_QUANTIZED_TENSORS
)
endif()
target_include_directories(hexagon-npu-host PRIVATE
${HEXAGON_SDK_ROOT}/ipc/fastrpc/rpcmem/inc/
${QNN_SDK_ROOT}/include/QNN/
${CMAKE_CURRENT_LIST_DIR}/host/
${CMAKE_CURRENT_LIST_DIR}/
)
target_include_directories(hexagon-npu-host PUBLIC
${HEXAGON_SDK_ROOT}/incs/ # TODO: this is for rpc-mem
)
if(NOT ${CMAKE_SYSTEM_NAME} MATCHES "Windows")
set_target_properties(hexagon-npu-host PROPERTIES OUTPUT_NAME "hexagon_npu")
endif()
if(${CMAKE_SYSTEM_NAME} MATCHES "Android|Linux")
target_link_options(hexagon-npu-host PUBLIC -pie)
endif()
if(GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING)
message("GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING is enabled")
target_compile_definitions(hexagon-npu-host PUBLIC GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING)
else()
message("GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING is disabled")
endif()
link_options(hexagon-npu-host)
choose_dsprpc("3" dsprpc) # cdsprpc
link_custom_library(hexagon-npu-host ${dsprpc})
cmake_host_system_information(RESULT BUILD_CPU_COUNT QUERY NUMBER_OF_PHYSICAL_CORES)
# Build HTP bits
set(HTP_CMAKE_ARGS
-DCMAKE_TOOLCHAIN_FILE=${CMAKE_CURRENT_SOURCE_DIR}/../../ggml-hexagon/htp/cmake-toolchain.cmake
-DCMAKE_BUILD_TYPE=Release
-DCMAKE_INSTALL_LIBDIR=${CMAKE_CURRENT_BINARY_DIR}
-DHEXAGON_SDK_ROOT=$ENV{HEXAGON_SDK_ROOT}
-DHEXAGON_TOOLS_ROOT=$ENV{HEXAGON_TOOLS_ROOT}
-DHEXAGON_HTP_DEBUG=${GGML_HEXAGON_HTP_DEBUG}
-DGGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE=${GGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE})
ExternalProject_Add(hexagon_npu_skel_v73
SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/device BUILD_ALWAYS ON
CMAKE_ARGS ${HTP_CMAKE_ARGS} -DDSP_VERSION=v73 -DPREBUILT_LIB_DIR="toolv19_v73")
ExternalProject_Add(hexagon_npu_skel_v75
SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/device BUILD_ALWAYS ON
CMAKE_ARGS ${HTP_CMAKE_ARGS} -DDSP_VERSION=v75 -DPREBUILT_LIB_DIR="toolv19_v75")
ExternalProject_Add(hexagon_npu_skel_v79
SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/device BUILD_ALWAYS ON
CMAKE_ARGS ${HTP_CMAKE_ARGS} -DDSP_VERSION=v79 -DPREBUILT_LIB_DIR="toolv19_v79")
list(APPEND NPU_RUNTIME_LIBS "${HEXAGON_SDK_ROOT}/tools/utils/sysmon/sysMonApp")
list(APPEND NPU_RUNTIME_LIBS "${HEXAGON_SDK_ROOT}/tools/utils/sysmon/sysMonAppLE")
# Install Hexagon skels required at runtime
install(FILES
${CMAKE_CURRENT_BINARY_DIR}/libhexagon_npu_skel_v73.so
${CMAKE_CURRENT_BINARY_DIR}/libhexagon_npu_skel_v75.so
${CMAKE_CURRENT_BINARY_DIR}/libhexagon_npu_skel_v79.so
${HEXAGON_SDK_ROOT}/tools/utils/sysmon/sysMonApp
${HEXAGON_SDK_ROOT}/tools/utils/sysmon/sysMonAppLE
TYPE LIB)

View File

@@ -0,0 +1,135 @@
# hexagon npu build, this section will run inside the `build_cmake` script
cmake_minimum_required(VERSION 3.14.3)
project(hexagon_npu C CXX ASM)
enable_language(ASM)
cmake_policy(SET CMP0115 OLD)
if(DEFINED ENV{HEXAGON_SDK_ROOT})
set(HEXAGON_SDK_ROOT $ENV{HEXAGON_SDK_ROOT})
message("HEXAGON_SDK_ROOT (from environment): ${HEXAGON_SDK_ROOT}")
elseif(DEFINED HEXAGON_SDK_ROOT)
message("HEXAGON_SDK_ROOT: ${HEXAGON_SDK_ROOT}")
else()
message(FATAL_ERROR "HEXAGON_SDK_ROOT not defined")
endif()
if(${CMAKE_SYSTEM_NAME} MATCHES "Android")
set(PREBUILT_LIB_DIR "android_aarch64")
elseif(${CMAKE_SYSTEM_NAME} MATCHES "Linux")
set(PREBUILT_LIB_DIR "UbuntuARM_aarch64")
elseif(${CMAKE_SYSTEM_NAME} MATCHES "Windows")
# Windows
set(PREBUILT_LIB_DIR "windows_aarch64")
endif()
if(HEXAGON_SDK_ROOT)
include(${HEXAGON_SDK_ROOT}/build/cmake/hexagon_fun.cmake)
else()
include(${HEXAGON_CMAKE_ROOT}/hexagon_fun.cmake)
endif()
# Base Include dirs for the Project
set(common_incs
${CMAKE_CURRENT_BINARY_DIR}/
${HEXAGON_SDK_ROOT}/incs/
${HEXAGON_SDK_ROOT}/incs/stddef/
${HEXAGON_SDK_ROOT}/incs/HAP/
${HEXAGON_SDK_ROOT}/rtos/qurt/
${HEXAGON_SDK_ROOT}/utils/examples/
)
include_directories(${common_incs})
# check if QNN_SDK_ROOT is set
if(NOT DEFINED ENV{QNN_SDK_ROOT})
message(FATAL_ERROR "QNN_SDK_ROOT not defined")
endif()
set(QNN_SDK_ROOT $ENV{QNN_SDK_ROOT})
message("QNN_SDK_ROOT: ${QNN_SDK_ROOT}")
message("GGML_HEXAGON_ENABLE_QUANTIZED_TENSORS: ${GGML_HEXAGON_ENABLE_QUANTIZED_TENSORS}")
include_directories(
${QNN_SDK_ROOT}/include/QNN/
)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++17")
file(GLOB common_srcs "${CMAKE_CURRENT_LIST_DIR}/common/*.cpp")
file(GLOB device_srcs "${CMAKE_CURRENT_LIST_DIR}/device/*.cpp")
file(GLOB device_op_srcs "${CMAKE_CURRENT_LIST_DIR}/device/op/*.cpp")
set(skel_srcs "${CMAKE_CURRENT_BINARY_DIR}/npu_device_skel.c")
add_library(hexagon_npu_skel_OBJS OBJECT
${common_srcs}
${device_srcs}
${device_op_srcs}
${skel_srcs}
)
if(CMAKE_BUILD_TYPE MATCHES "Debug|Dbg")
message("Debug build, enable all logging")
target_compile_definitions(hexagon_npu_skel_OBJS PUBLIC
_DEBUG
DEBUG_LOGGING
)
else()
message("Release build, disable debug logging")
target_compile_definitions(hexagon_npu_skel_OBJS PUBLIC
NDEBUG
RELEASE_LOGGING
)
endif()
if(GGML_HEXAGON_ENABLE_QUANTIZED_TENSORS)
message("GGML_HEXAGON_ENABLE_QUANTIZED_TENSORS is enabled")
target_compile_definitions(hexagon_npu_skel_OBJS PUBLIC
GGML_HEXAGON_ENABLE_QUANTIZED_TENSORS
)
endif()
if(GGML_HEXAGON_NPU_SANITIZE_ADDRESS)
message("GGML_HEXAGON_NPU_SANITIZE_ADDRESS is enabled")
target_compile_options(hexagon_npu_skel_OBJS PUBLIC
-fsanitize=address -fno-omit-frame-pointer
)
target_link_options(hexagon_npu_skel_OBJS PUBLIC
-fsanitize=address
)
endif()
if(GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING)
message("GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING is enabled")
target_compile_definitions(hexagon_npu_skel_OBJS PUBLIC
GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING
)
endif()
build_idl(../idl/hexagon_npu.idl hexagon_npu_skel_OBJS)
add_subdirectory(${HEXAGON_SDK_ROOT}/libs/qprintf qprintf_dir)
target_include_directories(hexagon_npu_skel_OBJS PUBLIC
${HEXAGON_SDK_ROOT}/libs/qprintf/inc/
# TODO: find a better way to include these
${CMAKE_CURRENT_LIST_DIR}
${CMAKE_CURRENT_LIST_DIR}/op/
)
# disable warnings for the skel
set_source_files_properties(
${skel_srcs}
PROPERTIES
COMPILE_FLAGS "-w"
)
add_library(hexagon_npu_skel SHARED $<TARGET_OBJECTS:hexagon_npu_skel_OBJS>)
target_link_libraries(hexagon_npu_skel
${HEXAGON_LIB_DIR}/${HEXAGON_ARCH}/G0/pic/libc++abi.so.1
${HEXAGON_LIB_DIR}/${HEXAGON_ARCH}/G0/pic/libc++.so.1
${HEXAGON_LIB_DIR}/${HEXAGON_ARCH}/G0/pic/libc.so
)
set_target_properties(hexagon_npu_skel PROPERTIES OUTPUT_NAME "hexagon_npu_skel_${HEXAGON_ARCH}")
target_link_libraries(hexagon_npu_skel qprintf_static)
copy_binaries(hexagon_npu_skel)

View File

@@ -0,0 +1,287 @@
#include "graph.hpp"
#include "hexagon_npu.h"
#include "op_registry.hpp"
#include "remote.h"
#include "tensor.hpp"
#include "thread_pool.hpp"
#include "type_traits.hpp"
#include "util.hpp"
#include <AEEStdErr.h>
#include <HAP_compute_res.h>
#include <hexagon_types.h>
#include <memory>
namespace {
struct npu_device_context {
std::unique_ptr<hexagon::power_utils> power_utils; // Power management utilities
std::unique_ptr<hexagon::default_thread_pool> thread_pool;
std::unique_ptr<float[]> f16_to_f32_table; // TODO: store vtcm?
bool init() {
if (!init_ltu()) {
DEVICE_LOG_ERROR("Failed to initialize LTU\n");
return false;
}
if (!init_thread_pool()) {
DEVICE_LOG_ERROR("Failed to initialize thread pool\n");
return false;
}
power_utils = std::make_unique<hexagon::power_utils>();
if (power_utils && power_utils->is_valid()) {
power_utils->set_dvcs_performance_mode(true);
DEVICE_LOG_DEBUG("Power utilities initialized with DVCS performance mode enabled\n");
} else {
DEVICE_LOG_ERROR("Failed to initialize power utilities\n");
}
DEVICE_LOG_DEBUG("NPU device context initialized\n");
return true;
}
private:
bool init_ltu() {
constexpr const size_t kLtuCount = 1U << 16;
f16_to_f32_table = std::make_unique<float[]>(kLtuCount);
if (!f16_to_f32_table) {
DEVICE_LOG_ERROR("Failed to allocate memory for f16_to_f32 table\n");
return false;
}
hexagon::init_f16_f32_table(f16_to_f32_table.get(), kLtuCount);
DEVICE_LOG_DEBUG("f16_to_f32 table initialized\n");
return true;
}
bool init_thread_pool() {
if (thread_pool) {
DEVICE_LOG_DEBUG("Thread pool already initialized\n");
return true;
}
auto pool = std::make_unique<hexagon::default_thread_pool>();
if (!pool) {
DEVICE_LOG_ERROR("Failed to create thread pool\n");
return false;
}
thread_pool = std::move(pool);
DEVICE_LOG_DEBUG("Thread pool initialized\n");
return true;
}
};
inline hexagon::tensor * tensor_from_handle(npu_device_tensor_handle_t h) {
if (h == npu_device_INVALID_DEVICE_TENSOR_HANDLE) {
return nullptr;
}
return reinterpret_cast<hexagon::tensor *>(h);
}
inline npu_device_tensor_handle_t tensor_to_handle(hexagon::tensor * tensor) {
return reinterpret_cast<npu_device_tensor_handle_t>(tensor);
}
inline hexagon::graph * graph_from_handle(npu_device_graph_handle_t h) {
if (h == npu_device_INVALID_DEVICE_GRAPH_HANDLE) {
return nullptr;
}
return reinterpret_cast<hexagon::graph *>(h);
}
inline npu_device_graph_handle_t graph_to_handle(hexagon::graph * graph) {
return reinterpret_cast<npu_device_graph_handle_t>(graph);
}
inline npu_device_context * device_context_from_handle(remote_handle64 h) {
return reinterpret_cast<npu_device_context *>(h);
}
} // namespace
int npu_device_open(const char * uri, remote_handle64 * h) {
// TODO: should we have a device context here?
auto * context = new npu_device_context();
if (!context->init()) {
DEVICE_LOG_ERROR("Failed to initialize npu_device_context\n");
delete context;
return AEE_EFAILED;
}
*h = reinterpret_cast<remote_handle64>(context);
DEVICE_LOG_INFO("NPU device context created: %p\n", (void *) *h);
return AEE_SUCCESS;
}
int npu_device_close(remote_handle64 h) {
auto * context = device_context_from_handle(h);
if (!context) {
DEVICE_LOG_ERROR("Invalid npu_device_context handle\n");
return AEE_EINVHANDLE;
}
delete context;
DEVICE_LOG_INFO("NPU device context destroyed: %p\n", (void *) h);
return AEE_SUCCESS;
}
AEEResult npu_device_device_get_alignment(remote_handle64 _h, uint32_t * alignment) {
NPU_UNUSED(_h);
*alignment = sizeof(HVX_VectorPair);
return AEE_SUCCESS;
}
AEEResult npu_device_device_support_op(remote_handle64 _h,
const npu_device_tensor_op_spec * op_spec,
const npu_device_tensor_spec * dst,
const npu_device_tensor_spec * srcs,
int srcsLen,
boolean * is_supported) {
NPU_UNUSED(_h);
if (!srcs || srcsLen <= 0 || !dst || !is_supported) {
DEVICE_LOG_ERROR("npu_device_device_support_op: Invalid arguments\n");
return AEE_EINVARGS;
}
*is_supported = hexagon::support_op(op_spec, dst, srcs, srcsLen);
return AEE_SUCCESS;
}
AEEResult npu_device_tensor_init(remote_handle64 _h,
const npu_device_tensor_config * info,
npu_device_tensor_handle_t * tensor_handle) {
NPU_UNUSED(_h);
auto * tensor = new hexagon::tensor(*info);
*tensor_handle = tensor_to_handle(tensor);
return AEE_SUCCESS;
}
AEEResult npu_device_tensor_update_params(remote_handle64 _h,
npu_device_tensor_handle_t tensor_handle,
const npu_device_tensor_update_config * config) {
NPU_UNUSED(_h);
auto * tensor = tensor_from_handle(tensor_handle);
if (!tensor || !config) {
return AEE_EINVHANDLE;
}
tensor->update_config(*config);
return AEE_SUCCESS;
}
AEEResult npu_device_tensor_free(remote_handle64 _h, npu_device_tensor_handle_t tensor_handle) {
NPU_UNUSED(_h);
auto * tensor = tensor_from_handle(tensor_handle);
if (!tensor) {
return AEE_EINVHANDLE;
}
delete tensor;
return AEE_SUCCESS;
}
AEEResult npu_device_tensors_free(remote_handle64 _h,
const npu_device_tensor_handle_t * tensor_handles,
int tensor_handlesLen) {
NPU_UNUSED(_h);
if (!tensor_handles || tensor_handlesLen < 0) {
DEVICE_LOG_ERROR("npu_device_tensors_free: Invalid arguments\n");
return AEE_EINVARGS;
}
for (int i = 0; i < tensor_handlesLen; ++i) {
auto * tensor = tensor_from_handle(tensor_handles[i]);
if (tensor) {
delete tensor;
} else {
DEVICE_LOG_ERROR("npu_device_tensors_free: Invalid tensor handle at index %d\n", i);
}
}
return AEE_SUCCESS;
}
AEEResult npu_device_graph_init(remote_handle64 _h, npu_device_graph_handle_t * graph_handle) {
NPU_UNUSED(_h);
auto * graph = new hexagon::graph();
*graph_handle = graph_to_handle(graph);
return AEE_SUCCESS;
}
AEEResult npu_device_graph_set_tensor(remote_handle64 _h,
npu_device_graph_handle_t graph_handle,
const npu_device_tensor_handle_t * tensor_handles,
int tensor_handlesLen) {
NPU_UNUSED(_h);
auto * graph = graph_from_handle(graph_handle);
if (!graph || !tensor_handles || tensor_handlesLen <= 0) {
return AEE_EINVHANDLE;
}
graph->set_tensor(tensor_handles, tensor_handlesLen);
return AEE_SUCCESS;
}
AEEResult npu_device_graph_set_tensor_with_param(remote_handle64 _h,
npu_device_graph_handle_t graph_handle,
const npu_device_tensor_handle_t * tensor_handles,
int tensor_handlesLen,
const npu_device_tensor_update_config * tensor_params,
int tensor_paramsLen) {
NPU_UNUSED(_h);
auto * graph = graph_from_handle(graph_handle);
if (!graph || tensor_handlesLen != tensor_paramsLen || tensor_handlesLen < 0) {
return AEE_EINVHANDLE;
}
if (tensor_params && tensor_handles) {
for (int i = 0; i < tensor_handlesLen; ++i) {
auto * tensor = tensor_from_handle(tensor_handles[i]);
if (tensor) {
tensor->update_config(tensor_params[i]);
}
}
}
graph->set_tensor(tensor_handles, tensor_handlesLen);
return AEE_SUCCESS;
}
AEEResult npu_device_graph_compute(remote_handle64 _h, npu_device_graph_handle_t graph_handle) {
auto dev_ctx = device_context_from_handle(_h);
if (!dev_ctx) {
DEVICE_LOG_DEBUG("Invalid npu_device_context handle\n");
return AEE_EINVHANDLE;
}
auto * graph = graph_from_handle(graph_handle);
if (!graph) {
DEVICE_LOG_ERROR("Invalid graph handle\n");
return AEE_EINVHANDLE;
}
if (!graph->compute(dev_ctx->thread_pool.get(), dev_ctx->f16_to_f32_table.get())) {
return AEE_EFAILED;
}
return AEE_SUCCESS;
}
AEEResult npu_device_graph_free(remote_handle64 _h, npu_device_graph_handle_t graph_handle) {
NPU_UNUSED(_h);
auto * graph = graph_from_handle(graph_handle);
if (graph) {
delete graph;
}
return AEE_SUCCESS;
}
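The functions above are the device-side handlers for the FastRPC interface declared in idl/hexagon_npu.idl. Below is a host-side lifecycle sketch, assuming the IDL-generated stubs expose the same signatures and a npu_device_URI macro following the usual QAIC conventions (both of those details are assumptions, and error handling is trimmed for brevity).

#include "hexagon_npu.h"   // IDL-generated interface header (from this diff)
#include <AEEStdErr.h>

// Host-side lifecycle sketch (illustrative): open the device, build an empty
// graph, compute it, then tear everything down. Real callers create tensors
// with npu_device_tensor_init() and attach them via npu_device_graph_set_tensor().
static bool run_empty_graph() {
    remote_handle64 h = 0;
    if (npu_device_open(npu_device_URI, &h) != AEE_SUCCESS) {  // npu_device_URI: assumed IDL macro
        return false;
    }
    npu_device_graph_handle_t graph = 0;
    if (npu_device_graph_init(h, &graph) != AEE_SUCCESS) {
        npu_device_close(h);
        return false;
    }
    const bool ok = npu_device_graph_compute(h, graph) == AEE_SUCCESS;
    npu_device_graph_free(h, graph);
    npu_device_close(h);
    return ok;
}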

View File

@@ -0,0 +1,436 @@
#include "dma_transfer.hpp"
#include <qurt.h>
#include <array>
#include <cstdlib>
namespace {
// From addons/compute/libs/userdma/utils_lib/
#define DM0_STATUS_MASK 0x00000003
#define DM0_STATUS_SHIFT 0
#define DM0_STATUS_IDLE 0
#define DM0_STATUS_RUN 1
#define DM0_STATUS_ERROR 2
#define DM0_DESC_ADDR_MASK 0xFFFFFFF0
#define DM0_DESC_ADDR_SHIFT 4
#define DMA_COMPLETE 1
#define DMA_INCOMPLETE 0
#define DMA_SUCCESS 0
#define DMA_FAIL -1
#define DMA_DESC_TYPE_1D 0
#define DMA_DESC_TYPE_2D 1
#define DESC_NEXT_MASK 0xFFFFFFFF
#define DESC_NEXT_SHIFT 0
#define DESC_DSTATE_MASK 0x80000000
#define DESC_DSTATE_SHIFT 31
#define DESC_DSTATE_INCOMPLETE 0
#define DESC_DSTATE_COMPLETE 1
#define DESC_ORDER_MASK 0x40000000
#define DESC_ORDER_SHIFT 30
#define DESC_ORDER_NOORDER 0
#define DESC_ORDER_ORDER 1
#define DESC_BYPASSSRC_MASK 0x20000000
#define DESC_BYPASSSRC_SHIFT 29
#define DESC_BYPASSDST_MASK 0x10000000
#define DESC_BYPASSDST_SHIFT 28
#define DESC_BYPASS_OFF 0
#define DESC_BYPASS_ON 1
#define DESC_DESCTYPE_MASK 0x03000000
#define DESC_DESCTYPE_SHIFT 24
#define DESC_DESCTYPE_1D 0
#define DESC_DESCTYPE_2D 1
#define DESC_LENGTH_MASK 0x00FFFFFF
#define DESC_LENGTH_SHIFT 0
#define DESC_SRC_MASK 0xFFFFFFFF
#define DESC_SRC_SHIFT 0
#define DESC_DST_MASK 0xFFFFFFFF
#define DESC_DST_SHIFT 0
#define DESC_CACHEALLOC_MASK 0x03000000
#define DESC_CACHEALLOC_SHIFT 24
#define DESC_CACHEALLOC_NONE 0
#define DESC_CACHEALLOC_WRITEONLY 1
#define DESC_CACHEALLOC_READONLY 2
#define DESC_CACHEALLOC_READWRITE 3
#define DESC_ROIWIDTH_MASK 0x0000FFFF
#define DESC_ROIWIDTH_SHIFT 0
#define DESC_ROIHEIGHT_MASK 0xFFFF0000
#define DESC_ROIHEIGHT_SHIFT 16
#define DESC_SRCSTRIDE_MASK 0x0000FFFF
#define DESC_SRCSTRIDE_SHIFT 0
#define DESC_DSTSTRIDE_MASK 0xFFFF0000
#define DESC_DSTSTRIDE_SHIFT 16
#define DESC_SRCWIDTHOFFSET_MASK 0x0000FFFF
#define DESC_SRCWIDTHOFFSET_SHIFT 0
#define DESC_DSTWIDTHOFFSET_MASK 0xFFFF0000
#define DESC_DSTWIDTHOFFSET_SHIFT 16
/**************************/
/* 1D (linear) descriptor */
/**************************/
typedef struct _dma_desc_1d_t {
uint32_t next;
uint32_t dstate_order_bypass_desctype_length;
uint32_t src;
uint32_t dst;
} dma_desc_1d_t;
static_assert(sizeof(dma_desc_1d_t) == hexagon::dma::kDmaDescSize1D, "kDmaDescSize1D size incorrect");
/***********************/
/* 2D (box) descriptor */
/***********************/
typedef struct _dma_desc_2d_t {
uint32_t next;
uint32_t dstate_order_bypass_desctype_length;
uint32_t src;
uint32_t dst;
uint32_t allocation;
uint32_t roiheight_roiwidth;
uint32_t dststride_srcstride;
uint32_t dstwidthoffset_srcwidthoffset;
} dma_desc_2d_t;
static_assert(sizeof(dma_desc_2d_t) == hexagon::dma::kDmaDescSize2D, "kDmaDescSize2D size incorrect");
inline void dmstart(void * next) {
asm volatile(" release(%0):at" : : "r"(next));
asm volatile(" dmstart(%0)" : : "r"(next));
}
inline void dmlink(void * cur, void * next) {
asm volatile(" release(%0):at" : : "r"(next));
asm volatile(" dmlink(%0, %1)" : : "r"(cur), "r"(next));
}
inline unsigned int dmpoll(void) {
unsigned int ret = 0;
asm volatile(" %0 = dmpoll" : "=r"(ret) : : "memory");
return ret;
}
inline unsigned int dmwait(void) {
unsigned int ret = 0;
asm volatile(" %0 = dmwait" : "=r"(ret) : : "memory");
return ret;
}
inline void dma_desc_set_next(void * d, uint32_t v) {
(((dma_desc_1d_t *) d)->next) &= ~DESC_NEXT_MASK;
(((dma_desc_1d_t *) d)->next) |= ((v << DESC_NEXT_SHIFT) & DESC_NEXT_MASK);
}
inline uint32_t dma_desc_get_dstate(void * d) {
return (((((dma_desc_1d_t *) d)->dstate_order_bypass_desctype_length) & DESC_DSTATE_MASK) >> DESC_DSTATE_SHIFT);
}
inline void dma_desc_set_dstate(void * d, uint32_t v) {
(((dma_desc_1d_t *) d)->dstate_order_bypass_desctype_length) &= ~DESC_DSTATE_MASK;
(((dma_desc_1d_t *) d)->dstate_order_bypass_desctype_length) |= ((v << DESC_DSTATE_SHIFT) & DESC_DSTATE_MASK);
}
inline void dma_desc_set_desctype(void * d, uint32_t v) {
(((dma_desc_1d_t *) d)->dstate_order_bypass_desctype_length) &= ~DESC_DESCTYPE_MASK;
(((dma_desc_1d_t *) d)->dstate_order_bypass_desctype_length) |= ((v << DESC_DESCTYPE_SHIFT) & DESC_DESCTYPE_MASK);
}
inline void dma_desc_set_order(void * d, uint32_t v) {
(((dma_desc_1d_t *) d)->dstate_order_bypass_desctype_length) &= ~DESC_ORDER_MASK;
(((dma_desc_1d_t *) d)->dstate_order_bypass_desctype_length) |= ((v << DESC_ORDER_SHIFT) & DESC_ORDER_MASK);
}
inline void dma_desc_set_bypasssrc(void * d, uint32_t v) {
(((dma_desc_1d_t *) d)->dstate_order_bypass_desctype_length) &= ~DESC_BYPASSSRC_MASK;
(((dma_desc_1d_t *) d)->dstate_order_bypass_desctype_length) |= ((v << DESC_BYPASSSRC_SHIFT) & DESC_BYPASSSRC_MASK);
}
inline void dma_desc_set_bypassdst(void * d, uint32_t v) {
(((dma_desc_1d_t *) d)->dstate_order_bypass_desctype_length) &= ~DESC_BYPASSDST_MASK;
(((dma_desc_1d_t *) d)->dstate_order_bypass_desctype_length) |= ((v << DESC_BYPASSDST_SHIFT) & DESC_BYPASSDST_MASK);
}
inline void dma_desc_set_length(void * d, uint32_t v) {
(((dma_desc_1d_t *) d)->dstate_order_bypass_desctype_length) &= ~DESC_LENGTH_MASK;
(((dma_desc_1d_t *) d)->dstate_order_bypass_desctype_length) |= ((v << DESC_LENGTH_SHIFT) & DESC_LENGTH_MASK);
}
inline uint32_t dma_desc_get_src(void * d) {
return (((((dma_desc_1d_t *) d)->src) & DESC_SRC_MASK) >> DESC_SRC_SHIFT);
}
inline void dma_desc_set_src(void * d, uint32_t v) {
(((dma_desc_1d_t *) d)->src) &= ~DESC_SRC_MASK;
(((dma_desc_1d_t *) d)->src) |= ((v << DESC_SRC_SHIFT) & DESC_SRC_MASK);
}
inline void dma_desc_set_dst(void * d, uint32_t v) {
(((dma_desc_1d_t *) d)->dst) &= ~DESC_DST_MASK;
(((dma_desc_1d_t *) d)->dst) |= ((v << DESC_DST_SHIFT) & DESC_DST_MASK);
}
inline void dma_desc_set_roiwidth(void * d, uint32_t v) {
(((dma_desc_2d_t *) d)->roiheight_roiwidth) &= ~DESC_ROIWIDTH_MASK;
(((dma_desc_2d_t *) d)->roiheight_roiwidth) |= ((v << DESC_ROIWIDTH_SHIFT) & DESC_ROIWIDTH_MASK);
}
inline void dma_desc_set_roiheight(void * d, uint32_t v) {
(((dma_desc_2d_t *) d)->roiheight_roiwidth) &= ~DESC_ROIHEIGHT_MASK;
(((dma_desc_2d_t *) d)->roiheight_roiwidth) |= ((v << DESC_ROIHEIGHT_SHIFT) & DESC_ROIHEIGHT_MASK);
}
inline void dma_desc_set_srcstride(void * d, uint32_t v) {
(((dma_desc_2d_t *) d)->dststride_srcstride) &= ~DESC_SRCSTRIDE_MASK;
(((dma_desc_2d_t *) d)->dststride_srcstride) |= ((v << DESC_SRCSTRIDE_SHIFT) & DESC_SRCSTRIDE_MASK);
}
inline void dma_desc_set_dststride(void * d, uint32_t v) {
(((dma_desc_2d_t *) d)->dststride_srcstride) &= ~DESC_DSTSTRIDE_MASK;
(((dma_desc_2d_t *) d)->dststride_srcstride) |= ((v << DESC_DSTSTRIDE_SHIFT) & DESC_DSTSTRIDE_MASK);
}
inline void dma_desc_set_srcwidthoffset(void * d, uint32_t v) {
(((dma_desc_2d_t *) d)->dstwidthoffset_srcwidthoffset) &= ~DESC_SRCWIDTHOFFSET_MASK;
(((dma_desc_2d_t *) d)->dstwidthoffset_srcwidthoffset) |=
((v << DESC_SRCWIDTHOFFSET_SHIFT) & DESC_SRCWIDTHOFFSET_MASK);
}
inline void dma_desc_set_dstwidthoffset(void * d, uint32_t v) {
(((dma_desc_2d_t *) d)->dstwidthoffset_srcwidthoffset) &= ~DESC_DSTWIDTHOFFSET_MASK;
(((dma_desc_2d_t *) d)->dstwidthoffset_srcwidthoffset) |=
((v << DESC_DSTWIDTHOFFSET_SHIFT) & DESC_DSTWIDTHOFFSET_MASK);
}
inline void dma_desc_set_cachealloc(void * d, uint32_t v) {
(((dma_desc_2d_t *) d)->allocation) &= ~DESC_CACHEALLOC_MASK;
(((dma_desc_2d_t *) d)->allocation) |= ((v << DESC_CACHEALLOC_SHIFT) & DESC_CACHEALLOC_MASK);
}
} // namespace
namespace hexagon::dma {
dma_transfer::dma_transfer() {
dma_desc_set_next(_dma_1d_desc0, 0);
dma_desc_set_dstate(_dma_1d_desc0, DESC_DSTATE_INCOMPLETE);
dma_desc_set_desctype(_dma_1d_desc0, DMA_DESC_TYPE_1D);
dma_desc_set_order(_dma_1d_desc0, DESC_ORDER_NOORDER);
dma_desc_set_bypasssrc(_dma_1d_desc0, DESC_BYPASS_ON); // for dram
dma_desc_set_bypassdst(_dma_1d_desc0, DESC_BYPASS_OFF); // for vtcm
dma_desc_set_length(_dma_1d_desc0, 0);
dma_desc_set_next(_dma_1d_desc1, 0);
dma_desc_set_dstate(_dma_1d_desc1, DESC_DSTATE_INCOMPLETE);
dma_desc_set_desctype(_dma_1d_desc1, DMA_DESC_TYPE_1D);
dma_desc_set_order(_dma_1d_desc1, DESC_ORDER_NOORDER);
dma_desc_set_bypasssrc(_dma_1d_desc1, DESC_BYPASS_ON); // for dram
dma_desc_set_bypassdst(_dma_1d_desc1, DESC_BYPASS_OFF); // for vtcm
dma_desc_set_length(_dma_1d_desc1, 0);
dma_desc_set_next(_dma_2d_desc0, 0);
dma_desc_set_dstate(_dma_2d_desc0, DESC_DSTATE_INCOMPLETE);
dma_desc_set_desctype(_dma_2d_desc0, DMA_DESC_TYPE_2D);
dma_desc_set_order(_dma_2d_desc0, DESC_ORDER_NOORDER);
dma_desc_set_bypasssrc(_dma_2d_desc0, DESC_BYPASS_ON); // for dram
dma_desc_set_bypassdst(_dma_2d_desc0, DESC_BYPASS_OFF); // for vtcm
dma_desc_set_cachealloc(_dma_2d_desc0, DESC_CACHEALLOC_NONE);
dma_desc_set_roiwidth(_dma_2d_desc0, 0);
dma_desc_set_roiheight(_dma_2d_desc0, 0);
dma_desc_set_srcstride(_dma_2d_desc0, 0);
dma_desc_set_dststride(_dma_2d_desc0, 0);
dma_desc_set_srcwidthoffset(_dma_2d_desc0, 0);
dma_desc_set_dstwidthoffset(_dma_2d_desc0, 0);
}
dma_transfer::~dma_transfer() {
wait();
}
bool dma_transfer::submit1d(const uint8_t * src, uint8_t * dst, size_t size) {
constexpr size_t kMaxDmaTransferSize = DESC_LENGTH_MASK;
if (size > kMaxDmaTransferSize) {
// TODO: support chained descriptors for large transfers
DEVICE_LOG_ERROR("dma_transfer::submit1d, size(%zu) is too large\n", size);
return false;
}
if (!dma_transfer::is_desc_done(_dma_1d_desc0)) {
DEVICE_LOG_ERROR("Failed to initiate DMA transfer for one or more descriptors\n");
return false;
}
dma_desc_set_next(_dma_1d_desc0, 0);
dma_desc_set_dstate(_dma_1d_desc0, DESC_DSTATE_INCOMPLETE);
dma_desc_set_src(_dma_1d_desc0, reinterpret_cast<uint32_t>(src));
dma_desc_set_dst(_dma_1d_desc0, reinterpret_cast<uint32_t>(dst));
dma_desc_set_length(_dma_1d_desc0, size);
void * buffs[] = { _dma_1d_desc0 };
if (!submit_impl(buffs, std::size(buffs))) {
DEVICE_LOG_ERROR("Failed to submit DMA descriptor\n");
return false;
}
DEVICE_LOG_DEBUG("dma_transfer::submit1d, src(%p), dst(%p), size(%zu), desc(%p)\n", (void *) src, (void *) dst,
size, (void *) _dma_1d_desc0);
return true;
}
bool dma_transfer::submit1d(const uint8_t * src0, uint8_t * dst0, const uint8_t * src1, uint8_t * dst1, size_t size) {
constexpr size_t kMaxDmaTransferSize = DESC_LENGTH_MASK;
if (size > kMaxDmaTransferSize) {
// TODO: support chained descriptors for large transfers
DEVICE_LOG_ERROR("dma_transfer::submit1d, size(%zu) is too large\n", size);
return false;
}
if (!dma_transfer::is_desc_done(_dma_1d_desc0) || !dma_transfer::is_desc_done(_dma_1d_desc1)) {
DEVICE_LOG_ERROR("Failed to initiate DMA transfer for one or more descriptors\n");
return false;
}
dma_desc_set_next(_dma_1d_desc0, 0);
dma_desc_set_dstate(_dma_1d_desc0, DESC_DSTATE_INCOMPLETE);
dma_desc_set_src(_dma_1d_desc0, reinterpret_cast<uint32_t>(src0));
dma_desc_set_dst(_dma_1d_desc0, reinterpret_cast<uint32_t>(dst0));
dma_desc_set_length(_dma_1d_desc0, size);
dma_desc_set_next(_dma_1d_desc1, 0);
dma_desc_set_dstate(_dma_1d_desc1, DESC_DSTATE_INCOMPLETE);
dma_desc_set_src(_dma_1d_desc1, reinterpret_cast<uint32_t>(src1));
dma_desc_set_dst(_dma_1d_desc1, reinterpret_cast<uint32_t>(dst1));
dma_desc_set_length(_dma_1d_desc1, size);
void * buffs[] = { _dma_1d_desc0, _dma_1d_desc1 };
if (!submit_impl(buffs, std::size(buffs))) {
DEVICE_LOG_ERROR("Failed to submit DMA descriptor\n");
return false;
}
DEVICE_LOG_DEBUG(
"dma_transfer::submit1d, src0(%p), dst0(%p), src1(%p), dst1(%p), size(%zu), desc0(%p), desc1(%p)\n",
(void *) src0, (void *) dst0, (void *) src1, (void *) dst1, size, (void *) _dma_1d_desc0,
(void *) _dma_1d_desc1);
return true;
}
bool dma_transfer::submit2d(const uint8_t * src,
uint8_t * dst,
size_t width,
size_t height,
size_t src_stride,
size_t dst_stride) {
// Note that the dma only supports 16-bit width and height for 2D transfer, see also: DESC_ROIWIDTH_MASK
constexpr size_t kMaxDmaTransferSize = DESC_ROIWIDTH_MASK;
if (width > kMaxDmaTransferSize || height > kMaxDmaTransferSize || src_stride > kMaxDmaTransferSize ||
dst_stride > kMaxDmaTransferSize) {
if (src_stride != dst_stride) {
// TODO: support chained descriptors for large transfers
DEVICE_LOG_ERROR("dma_transfer::submit2d, src_stride(%zu) or dst_stride(%zu) is too large\n", src_stride,
dst_stride);
return false;
}
DEVICE_LOG_DEBUG("dma_transfer::submit2d, width(%zu) or height(%zu) is too large, fallback to 1D transfer\n",
width, height);
return submit1d(src, dst, src_stride * height);
}
if (!dma_transfer::is_desc_done(_dma_2d_desc0)) {
DEVICE_LOG_ERROR("Failed to initiate DMA transfer for one or more descriptors\n");
return false;
}
dma_desc_set_next(_dma_2d_desc0, 0);
dma_desc_set_dstate(_dma_2d_desc0, DESC_DSTATE_INCOMPLETE);
dma_desc_set_src(_dma_2d_desc0, reinterpret_cast<uint32_t>(src));
dma_desc_set_dst(_dma_2d_desc0, reinterpret_cast<uint32_t>(dst));
dma_desc_set_roiwidth(_dma_2d_desc0, width);
dma_desc_set_roiheight(_dma_2d_desc0, height);
dma_desc_set_srcstride(_dma_2d_desc0, src_stride);
dma_desc_set_dststride(_dma_2d_desc0, dst_stride);
void * buffs[] = { _dma_2d_desc0 };
if (!submit_impl(buffs, std::size(buffs))) {
DEVICE_LOG_ERROR("Failed to submit DMA descriptor\n");
return false;
}
DEVICE_LOG_DEBUG(
"dma_transfer::submit2d, src(%p), dst(%p), width(%zu), height(%zu), src_stride(%zu), dst_stride(%zu), "
"desc(%p)\n",
(void *) src, (void *) dst, width, height, src_stride, dst_stride, (void *) _dma_2d_desc0);
return true;
}
void dma_transfer::wait() {
uint32_t dm0_status = dmwait() & DM0_STATUS_MASK;
if (dm0_status != DM0_STATUS_IDLE) {
DEVICE_LOG_ERROR("dma_transfer: failed to wait for DMA idle, dm0_status: %d\n", (int) dm0_status);
}
}
bool dma_transfer::is_desc_done(uint8_t * desc) {
if (!dma_desc_get_src(desc)) {
return true;
}
if (dma_desc_get_dstate(desc) == DESC_DSTATE_COMPLETE) {
return true;
}
dmpoll();
return false;
}
bool dma_transfer::submit_impl(void ** desc_batch, size_t batch_len) {
_dma_desc_mutex.lock();
for (size_t i = 0; i < batch_len - 1; i++) {
dma_desc_set_next(desc_batch[i], (uint32_t) desc_batch[i + 1]);
}
dma_desc_set_next(desc_batch[batch_len - 1], (uint32_t) nullptr);
uint32_t dm0_status = dmpoll() & DM0_STATUS_MASK;
if (dm0_status == DM0_STATUS_IDLE) {
dmstart(desc_batch[0]);
} else if (dm0_status == DM0_STATUS_RUN) {
if (_dma_last_desc == nullptr) {
_dma_desc_mutex.unlock();
DEVICE_LOG_ERROR("dma_transfer: last descriptor not found for linking. Submission failed\n");
return false;
} else {
dmlink(_dma_last_desc, desc_batch[0]);
}
} else {
_dma_desc_mutex.unlock();
DEVICE_LOG_ERROR("dma_transfer: DMA not idle or running. Submission failed\n");
return false;
}
dmpoll();
_dma_last_desc = (void *) desc_batch[batch_len - 1];
_dma_desc_mutex.unlock();
return true;
}
qurt_mutex dma_transfer::_dma_desc_mutex;
void * dma_transfer::_dma_last_desc = nullptr;
} // namespace hexagon::dma

View File

@@ -0,0 +1,48 @@
#pragma once
#include "util.hpp"
namespace hexagon::dma {
constexpr const size_t kDmaDescSize1D = 16;
constexpr const size_t kDmaDescSize2D = 32;
class dma_transfer {
public:
dma_transfer();
~dma_transfer();
/**
* Submits a 1D DMA transfer.
*
* Limitations:
* - The maximum supported transfer size is kMaxDmaTransferSize (DESC_LENGTH_MASK, 24-bit).
* - Transfers larger than this size are not supported and will fail.
* - Large transfers must be split into multiple smaller transfers by the caller
*   (see the caller-side sketch after this file).
*/
bool submit1d(const uint8_t * src, uint8_t * dst, size_t size);
bool submit1d(const uint8_t * src0, uint8_t * dst0, const uint8_t * src1, uint8_t * dst1, size_t size);
bool submit2d(const uint8_t * src,
uint8_t * dst,
size_t width,
size_t height,
size_t src_stride,
size_t dst_stride);
void wait();
private:
static bool is_desc_done(uint8_t * desc); // TODO: should we use void * here?
static qurt_mutex _dma_desc_mutex;
static void * _dma_last_desc;
// TODO: can we avoid the void ** here?
bool submit_impl(void ** desc_batch, size_t batch_len);
alignas(kDmaDescSize1D) uint8_t _dma_1d_desc0[kDmaDescSize1D] = {};
alignas(kDmaDescSize1D) uint8_t _dma_1d_desc1[kDmaDescSize1D] = {};
alignas(kDmaDescSize2D) uint8_t _dma_2d_desc0[kDmaDescSize2D] = {};
DISABLE_COPY_AND_MOVE(dma_transfer);
};
} // namespace hexagon::dma
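As the doc comment above notes, submit1d rejects transfers that do not fit the 24-bit descriptor length field, so callers must chunk large copies themselves. Here is a minimal caller-side sketch, assuming kMaxChunk stands in for DESC_LENGTH_MASK (which is private to dma_transfer.cpp) and that waiting per chunk is acceptable for the caller.

#include "dma_transfer.hpp"

#include <algorithm>
#include <cstddef>
#include <cstdint>

// Caller-side sketch (not part of the diff): split a large linear copy into
// chunks that fit the 24-bit descriptor length field. kMaxChunk is an assumed
// stand-in for DESC_LENGTH_MASK, which is private to dma_transfer.cpp.
static bool copy_large_1d(hexagon::dma::dma_transfer & dma, const uint8_t * src, uint8_t * dst, size_t size) {
    constexpr size_t kMaxChunk = (1u << 24) - 1;
    for (size_t off = 0; off < size;) {
        const size_t chunk = std::min(kMaxChunk, size - off);
        if (!dma.submit1d(src + off, dst + off, chunk)) {
            return false;
        }
        dma.wait();  // the single 1D descriptor is reused, so finish before the next chunk
        off += chunk;
    }
    return true;
}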

View File

@@ -0,0 +1,107 @@
#include "graph.hpp"
#include "op_registry.hpp"
#include "util.hpp"
#include "vtcm_mem.hpp"
#include <new>
namespace hexagon {
graph::graph() noexcept {
DEVICE_LOG_DEBUG("graph(%p) created\n", (void *) this);
}
graph::~graph() noexcept {
_tensors.reset();
DEVICE_LOG_DEBUG("graph(%p) destroyed\n", (void *) this);
}
void graph::set_tensor(const npu_device_tensor_handle_t * tensors, int tensor_count) {
if (tensor_count <= 0 || !tensors) {
_tensors.reset();
_tensor_count = 0;
DEVICE_LOG_DEBUG("graph(%p) set_tensor: no tensors to set\n", (void *) this);
return;
}
_tensors = std::make_unique<tensor *[]>(size_t(tensor_count));
for (int i = 0; i < tensor_count; ++i) {
auto * tensor_obj = reinterpret_cast<tensor *>(tensors[i]);
_tensors[i] = tensor_obj;
DEVICE_LOG_DEBUG("graph(%p) set_tensor[%d]: %p(%p,%p), op: %s\n", (void *) this, i, (void *) tensor_obj,
(void *) tensor_obj->get_src(0), (void *) tensor_obj->get_src(1),
op_get_name(tensor_obj->get_op()));
}
_tensor_count = tensor_count;
DEVICE_LOG_DEBUG("graph(%p) tensor count: %zu\n", (void *) this, _tensor_count);
}
bool graph::compute(default_thread_pool * thread_pool, const float * f16_to_f32_table) {
if (_tensors == nullptr || !_tensor_count) {
DEVICE_LOG_DEBUG("graph(%p) no tensors to compute\n", (void *) this);
return true; // return success if no tensors to compute
}
DEVICE_LOG_DEBUG("graph(%p) compute\n", (void *) this);
DEVICE_SCOPED_PERFORMANCE_TRACKER("[%p]compute", (void *) this);
_f16_to_f32_table = f16_to_f32_table;
if (thread_pool) {
thread_pool->sync_execute(&graph::thread_pool_task, this);
} else {
default_thread_pool::thread_params param = {
0, 1, nullptr, hexagon::vtcm_mem::get_avail_block_size()
}; // TODO: should have a better way to initialize thread_params
compute_impl(nullptr, &param);
}
_tensors[_tensor_count - 1]->invalidate();
_f16_to_f32_table = nullptr;
return true;
}
void graph::thread_pool_task(default_thread_pool * pool,
default_thread_pool::thread_params * thread_params,
void * graph) {
reinterpret_cast<hexagon::graph *>(graph)->compute_impl(pool, thread_params);
}
void graph::compute_impl(default_thread_pool * pool, default_thread_pool::thread_params * thread_params) {
hexagon::compute_params params = { thread_params, _f16_to_f32_table };
npu_device_tensor_op prev_op = NPU_OP_COUNT;
npu_device_ne_type prev_ne = {};
for (size_t i = 0; i < _tensor_count; ++i) {
auto * dst = _tensors[i];
auto op = dst->get_op();
const auto & ne = dst->get_info().ne;
const auto * op_name = op_get_name(op);
auto * func = get_compute_func(dst);
if (!func) {
DEVICE_LOG_ERROR("[%p][%s]graph tensor[%zu] op not supported\n", (void *) this, op_name, i);
return;
}
const bool should_sync = requires_thread_barrier(prev_op, prev_ne, op, ne);
if (pool && should_sync) {
// For the last tensor, the thread pool will handle synchronization
DEVICE_SCOPED_PERFORMANCE_TRACKER("[%p][%s]sync_thread, tidx: %zu, tensor[%zu/%zu]", (void *) this, op_name,
params.get_thread_index(), i + 1, _tensor_count);
pool->sync_thread();
}
prev_op = op;
memcpy(&prev_ne, &ne, sizeof(prev_ne));
if (!func(dst, &params)) {
DEVICE_LOG_ERROR("[%p][%s]graph tensor[%zu] op %d compute failed\n", (void *) this, op_name, i, op);
}
}
}
} // namespace hexagon

View File

@@ -0,0 +1,35 @@
#pragma once
#include "hexagon_npu.h"
#include "tensor.hpp"
#include "thread_pool.hpp"
#include <memory>
namespace hexagon {
class graph {
public:
// TODO: add execute direction here
explicit graph() noexcept;
~graph() noexcept;
void set_tensor(const npu_device_tensor_handle_t * tensors, int tensor_count);
bool compute(default_thread_pool * thread_pool, const float * f16_to_f32_table);
private:
static void thread_pool_task(default_thread_pool * pool,
default_thread_pool::thread_params * thread_params,
void * graph);
void compute_impl(default_thread_pool * pool, default_thread_pool::thread_params * thread_params);
std::unique_ptr<tensor *[]> _tensors;
size_t _tensor_count = 0;
const float * _f16_to_f32_table = nullptr;
DISABLE_COPY_AND_MOVE(graph);
};
} // namespace hexagon

View File

@@ -0,0 +1,406 @@
#pragma once
#include "op_types.hpp"
#include "type_traits.hpp"
#include "vec_ops.hpp"
namespace hexagon {
template <HVX_Vector (*_OpBinaryTransform)(HVX_Vector, HVX_Vector)>
inline void vec_op_f32_f32(const float * src0, const float * src1, float * dst, size_t count) {
using namespace hexagon::vec;
vec_trans_impl<_OpBinaryTransform, float>(src0, src1, dst, count);
}
inline HVX_Vector vadd_f32_f32(HVX_Vector a, HVX_Vector b) {
return Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(a, b));
}
inline HVX_Vector vsub_f32_f32(HVX_Vector a, HVX_Vector b) {
return Q6_Vsf_equals_Vqf32(Q6_Vqf32_vsub_VsfVsf(a, b));
}
inline HVX_Vector vmul_f32_f32(HVX_Vector a, HVX_Vector b) {
return Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(a, b));
}
template <HVX_Vector (*_OpBinaryTransform)(HVX_Vector, HVX_Vector)>
inline void vec_op_f16_f16(const npu_device_fp16_t * src0,
const npu_device_fp16_t * src1,
npu_device_fp16_t * dst,
size_t count) {
using namespace hexagon::vec;
vec_trans_impl<_OpBinaryTransform, npu_device_fp16_t>(src0, src1, dst, count);
}
template <HVX_Vector (*_OpUnaryTransform)(HVX_VectorPair)>
inline void unary_vec_op_f16_f32(const float * src, npu_device_fp16_t * dst, size_t count, size_t) {
// TODO: remove the unused param
using namespace hexagon::vec;
vec_trans_with_half_ret_impl<_OpUnaryTransform, float, npu_device_fp16_t>(src, dst, count);
}
inline HVX_Vector vadd_f16_f16(HVX_Vector a, HVX_Vector b) {
// TODO: fix this since qf16 has less precision than fp16
return Q6_Vhf_equals_Vqf16(Q6_Vqf16_vadd_VhfVhf(a, b));
}
inline HVX_Vector vsub_f16_f16(HVX_Vector a, HVX_Vector b) {
// TODO: fix this since qf16 has less precision than fp16
return Q6_Vhf_equals_Vqf16(Q6_Vqf16_vsub_VhfVhf(a, b));
}
inline HVX_Vector vmul_f16_f16(HVX_Vector a, HVX_Vector b) {
return Q6_Vhf_equals_Wqf32(Q6_Wqf32_vmpy_VhfVhf(a, b));
}
inline HVX_Vector vequals_f16_f32(HVX_VectorPair a) {
const HVX_Vector kZeroV = Q6_V_vzero();
HVX_Vector lo = Q6_Vqf32_vadd_Vqf32Vsf(kZeroV, Q6_V_lo_W(a));
HVX_Vector hi = Q6_Vqf32_vadd_Vqf32Vsf(kZeroV, Q6_V_hi_W(a));
a = Q6_W_vcombine_VV(hi, lo);
return Q6_Vh_vdeal_Vh(Q6_Vhf_equals_Wqf32(a));
}
template <typename T> struct get_data_type {};
template <typename _TyData> struct get_data_type<void (*)(const _TyData *, const _TyData *, _TyData *, size_t)> {
using type = _TyData;
};
template <typename _TyInput, typename _TyOutput, typename _TyParam>
struct get_data_type<void (*)(const _TyInput *, _TyOutput *, size_t, _TyParam)> {
using type = _TyInput;
using output_type = _TyOutput;
using param_type = typename std::remove_cv<typename std::remove_reference<_TyParam>::type>::type;
};
template <auto _RowFunc> bool element_wise_op(hexagon::tensor * out, hexagon::compute_params * params) {
using data_type = typename get_data_type<decltype(_RowFunc)>::type;
if (!out) {
return false;
}
static_assert(DEVICE_TENSOR_MAX_DIMS == 4, "element_wise_op requires max dims 4");
auto * src0 = out->get_src(0);
auto * src1 = out->get_src(1);
if (!src0 || !src1) {
return true; // skip if no src
}
if (src0->get_ne(0) != src1->get_ne(0)) {
// TODO: handle this case
DEVICE_LOG_ERROR("src0[0] and src1[0] do not match: %ld vs %ld\n", (long) src0->get_ne(0), (long) src1->get_ne(0));
return false;
}
const auto total_rows = out->get_ne(3) * out->get_ne(2) * out->get_ne(1);
const auto start_end = params->get_work_slice(total_rows);
if (start_end.first >= start_end.second) {
return true;
}
const auto src_row_bytes = src0->get_ne(0) * sizeof(data_type);
const auto src_row_bytes_aligned = hexagon::get_aligned_size(src_row_bytes);
uint8_t * src_cache_ptr = params->get_vtcm_cache(src_row_bytes_aligned * 4);
if (!src_cache_ptr) {
DEVICE_LOG_ERROR("element_wise_op: failed to get VTCM cache, size: %zu\n", size_t(src_row_bytes_aligned * 4));
return false;
}
uint8_t * dst_ptr = out->get_write_buffer();
if (!dst_ptr) {
DEVICE_LOG_ERROR("element_wise_op: dst_ptr is not writable, tensor: %p, type: %s\n", (void *) out,
hexagon::get_type_name(out->get_type()));
return false;
}
const uint8_t * src0_ptr = src0->get_read_buffer(true); // TODO: avoid invalidation
const uint8_t * src1_ptr = src1->get_read_buffer(true); // TODO: avoid invalidation
const auto rows_per_cube = out->get_ne(2) * out->get_ne(1);
uint8_t * src0_read_cache_ptr = src_cache_ptr;
uint8_t * src0_write_cache_ptr = src_cache_ptr + src_row_bytes_aligned;
uint8_t * src1_read_cache_ptr = src_cache_ptr + src_row_bytes_aligned * 2;
uint8_t * src1_write_cache_ptr = src_cache_ptr + src_row_bytes_aligned * 3;
{
const auto i03 = start_end.first / rows_per_cube;
const auto i02 = start_end.first / out->get_ne(1) - i03 * out->get_ne(2);
const auto i01 = start_end.first % out->get_ne(1); // TODO: should we use divide instead of mod?
const auto i13 = i03 % src1->get_ne(3);
const auto i12 = i02 % src1->get_ne(2);
const auto i11 = i01 % src1->get_ne(1);
auto * src0_row = src0_ptr + i03 * src0->get_nb(3) + i02 * src0->get_nb(2) + i01 * src0->get_nb(1);
auto * src1_row = src1_ptr + i13 * src1->get_nb(3) + i12 * src1->get_nb(2) + i11 * src1->get_nb(1);
if (!params->initiate_dma_row_transfer(src0_row, src0_write_cache_ptr, src1_row, src1_write_cache_ptr,
src_row_bytes)) {
DEVICE_LOG_ERROR("element_wise_op: failed to initiate dma transfer\n");
return false;
}
}
DEVICE_SCOPED_OP_PERFORMANCE_TRACKER(out, params->get_thread_index());
for (int64_t ir = start_end.first; ir < start_end.second; ++ir) {
const auto i03 = ir / rows_per_cube;
const auto i02 = ir / out->get_ne(1) - i03 * out->get_ne(2);
const auto i01 = ir % out->get_ne(1); // TODO: should we use divide instead of mod?
const auto ir_next = ir + 1;
auto * dst_row = dst_ptr + i03 * out->get_nb(3) + i02 * out->get_nb(2) + i01 * out->get_nb(1);
{
std::swap(src0_read_cache_ptr, src0_write_cache_ptr);
std::swap(src1_read_cache_ptr, src1_write_cache_ptr);
params->wait_for_dma();
}
if (ir_next < start_end.second) {
const auto i03_next = ir_next / rows_per_cube;
const auto i02_next = ir_next / out->get_ne(1) - i03_next * out->get_ne(2);
const auto i01_next = ir_next % out->get_ne(1);
const auto i13_next = i03_next % src1->get_ne(3);
const auto i12_next = i02_next % src1->get_ne(2);
const auto i11_next = i01_next % src1->get_ne(1);
auto * src0_next_row =
src0_ptr + i03_next * src0->get_nb(3) + i02_next * src0->get_nb(2) + i01_next * src0->get_nb(1);
auto * src1_next_row =
src1_ptr + i13_next * src1->get_nb(3) + i12_next * src1->get_nb(2) + i11_next * src1->get_nb(1);
if (!params->initiate_dma_row_transfer(src0_next_row, src0_write_cache_ptr, src1_next_row,
src1_write_cache_ptr, src_row_bytes)) {
DEVICE_LOG_ERROR("element_wise_op: failed to continue DMA transfer\n");
return false;
}
}
_RowFunc(reinterpret_cast<const data_type *>(src0_read_cache_ptr),
reinterpret_cast<const data_type *>(src1_read_cache_ptr), reinterpret_cast<data_type *>(dst_row),
static_cast<size_t>(out->get_ne(0)));
}
out->release_write_buffer(); // mark the output tensor as modified
return true;
}
bool is_element_wise_op_supported(const npu_device_tensor_op_spec * op_spec,
const npu_device_tensor_spec * dst,
const npu_device_tensor_spec * srcs,
size_t src_len) {
const auto op = op_spec->op;
if (op != NPU_OP_ADD && op != NPU_OP_SUB && op != NPU_OP_MUL) {
DEVICE_LOG_DEBUG("[%s]unsupported\n", hexagon::op_get_name(op));
return false;
}
if (!dst || !srcs || src_len < 2) {
DEVICE_LOG_DEBUG("[%s]invalid dst or srcs\n", hexagon::op_get_name(op));
return false;
}
const auto & src0 = srcs[0];
const auto & src1 = srcs[1];
if (dst->type != src0.type || dst->type != src1.type) {
DEVICE_LOG_DEBUG("[%s]src0.type and dst.type mismatch: %s vs %s\n", hexagon::op_get_name(op),
hexagon::get_type_name(src0.type), hexagon::get_type_name(dst->type));
return false;
}
if (dst->type != NPU_DATA_TYPE_F32 && dst->type != NPU_DATA_TYPE_F16) {
DEVICE_LOG_DEBUG("[%s]unsupported data type: %s\n", hexagon::op_get_name(op),
hexagon::get_type_name(dst->type));
return false;
}
// TODO: fix FP16 add/sub
if (dst->type == NPU_DATA_TYPE_F16 && op != NPU_OP_MUL) {
DEVICE_LOG_DEBUG("[%s]unsupported data type: %s\n", hexagon::op_get_name(op),
hexagon::get_type_name(dst->type));
return false;
}
if (src0.ne[0] != src1.ne[0]) {
DEVICE_LOG_DEBUG("[%s]src0.ne[0] and src1.ne[0] do not match: %ld vs %ld\n", hexagon::op_get_name(op),
(long) src0.ne[0], (long) src1.ne[0]);
return false;
}
if (!hexagon::is_same_shape(src0, *dst)) {
DEVICE_LOG_DEBUG("[%s]src0 and dst have different shape\n", hexagon::op_get_name(op));
return false;
}
return true;
}
bool is_element_wise_op_required_sync(npu_device_tensor_op prev_op,
const npu_device_ne_type & prev_ne,
npu_device_tensor_op op,
const npu_device_ne_type & ne) {
NPU_UNUSED(prev_ne);
NPU_UNUSED(op);
NPU_UNUSED(ne);
return prev_op != NPU_OP_ADD && prev_op != NPU_OP_SUB && prev_op != NPU_OP_MUL && prev_op != NPU_OP_RMS_NORM &&
prev_op != NPU_OP_COUNT;
}
void rms_norm_vec_f32(const float * src, float * dst, size_t count, float eps) {
constexpr const size_t kElementsPerVector = hexagon::kBytesPerVector / sizeof(float);
HVX_Vector * src_vec_ptr = ((HVX_Vector *) src);
HVX_Vector * const src_vec_end = ((HVX_Vector *) src) + (count / kElementsPerVector);
HVX_Vector prev = *src_vec_ptr++;
HVX_Vector sum = Q6_V_vzero();
while (src_vec_ptr < src_vec_end) {
HVX_Vector curr = *src_vec_ptr++;
HVX_Vector s0 = Q6_V_valign_VVR(curr, prev, (size_t) src);
sum = Q6_Vqf32_vadd_Vqf32Vqf32(sum, Q6_Vqf32_vmpy_VsfVsf(s0, s0));
prev = curr;
}
const size_t leftover = count % kElementsPerVector;
if ((src_vec_end - ((HVX_Vector *) src)) > 0) {
// handle the last vector
bool should_fetch_src = leftover != 0 || !hexagon::is_addr_aligned(src_vec_ptr);
HVX_Vector curr = should_fetch_src ? *src_vec_ptr : prev;
src_vec_ptr += should_fetch_src ? 1 : 0;
HVX_Vector s0 = Q6_V_valign_VVR(curr, prev, (size_t) src);
sum = Q6_Vqf32_vadd_Vqf32Vqf32(sum, Q6_Vqf32_vmpy_VsfVsf(s0, s0));
prev = curr;
}
if (leftover > 0) {
// handle the leftover elements
const size_t leftover_bytes = leftover * sizeof(float);
HVX_Vector curr =
(leftover_bytes + hexagon::unaligned_bytes(src_vec_ptr) > hexagon::kBytesPerVector) ? *src_vec_ptr : prev;
curr = Q6_V_valign_VVR(curr, prev, (size_t) src);
sum = Q6_Vqf32_vadd_Vqf32Vqf32(sum,
Q6_V_valign_VVR(Q6_Vqf32_vmpy_VsfVsf(curr, curr), Q6_V_vzero(), leftover_bytes));
}
const float mean = hexagon::vec_reduction_f32_qf32(sum) / count; // TODO: figure out how to do division in vector
const float scale = 1.0f / sqrtf(mean + eps); // TODO: use a built-in BLAS sqrtf?
hexagon::vec_scale_f32(src, scale, dst, count);
}
// TODO: merge with element_wise_op?
template <auto _RowFunc> bool unary_op(hexagon::tensor * out, hexagon::compute_params * params) {
using input_type = typename get_data_type<decltype(_RowFunc)>::type;
using output_type = typename get_data_type<decltype(_RowFunc)>::output_type;
using param_type = typename get_data_type<decltype(_RowFunc)>::param_type;
if (!out) {
return false;
}
static_assert(DEVICE_TENSOR_MAX_DIMS == 4, "element_wise_op requires max dims 4");
auto * src0 = out->get_src(0);
if (!src0) {
return true; // skip if no src
}
auto * dst_ptr = out->get_write_buffer();
if (!dst_ptr) {
DEVICE_LOG_ERROR("unary_op: dst_ptr is not writable, tensor: %p, type: %s\n", (void *) out,
hexagon::get_type_name(out->get_type()));
return false;
}
const auto * src0_ptr = src0->get_read_buffer();
auto total_rows = out->get_ne(3) * out->get_ne(2) * out->get_ne(1);
const auto rows_per_cube = out->get_ne(2) * out->get_ne(1);
const auto start_end = params->get_work_slice(total_rows);
if (start_end.first >= start_end.second) {
return true;
}
DEVICE_SCOPED_OP_PERFORMANCE_TRACKER(out, params->get_thread_index());
const auto param = out->get_op_param<param_type>(0);
const size_t valid_row_bytes = src0->get_ne(0) * sizeof(input_type);
for (int64_t ir = start_end.first; ir < start_end.second; ++ir) {
const auto i03 = ir / rows_per_cube;
const auto i02 = ir / out->get_ne(1) - i03 * out->get_ne(2);
const auto i01 = ir % out->get_ne(1); // TODO: should we use divide instead of mod?
auto * src0_row = src0_ptr + i03 * src0->get_nb(3) + i02 * src0->get_nb(2) + i01 * src0->get_nb(1);
auto * dst_row = dst_ptr + i03 * out->get_nb(3) + i02 * out->get_nb(2) + i01 * out->get_nb(1);
if (ir + 1 < start_end.second) {
hexagon::l2fetch_row(src0_row + src0->get_nb(1), valid_row_bytes);
}
_RowFunc(reinterpret_cast<const input_type *>(src0_row), reinterpret_cast<output_type *>(dst_row),
static_cast<size_t>(out->get_ne(0)), param);
}
out->release_write_buffer(); // mark the output tensor as modified
return true;
}
bool is_unary_op_supported(const npu_device_tensor_op_spec * op_spec,
const npu_device_tensor_spec * dst,
const npu_device_tensor_spec * srcs,
size_t src_len) {
const auto op = op_spec->op;
if (op != NPU_OP_RMS_NORM && op != NPU_OP_CPY) {
DEVICE_LOG_DEBUG("[%s]unsupported\n", hexagon::op_get_name(op));
return false;
}
if (!dst || !srcs || src_len < 1) {
DEVICE_LOG_DEBUG("[%s]invalid dst or srcs\n", hexagon::op_get_name(op));
return false;
}
const auto & src0 = srcs[0];
if (!hexagon::is_same_shape(src0, *dst)) {
DEVICE_LOG_DEBUG("[%s]src0 and dst have different shape\n", hexagon::op_get_name(op));
return false;
}
if (op == NPU_OP_RMS_NORM) {
if (dst->type != src0.type) {
DEVICE_LOG_DEBUG("[%s]src0.type and dst.type mismatch: %s vs %s\n", hexagon::op_get_name(op),
hexagon::get_type_name(src0.type), hexagon::get_type_name(dst->type));
return false;
}
if (dst->type != NPU_DATA_TYPE_F32) {
DEVICE_LOG_DEBUG("[%s]unsupported data type: %s\n", hexagon::op_get_name(op),
hexagon::get_type_name(dst->type));
return false;
}
} else {
if (dst->nb[1] < dst->nb[0] || src0.nb[1] < src0.nb[0]) {
// TODO: support non-continuous row
DEVICE_LOG_DEBUG("[%s]unsupported non-continuous row\n", hexagon::op_get_name(op));
return false;
}
if (dst->type != NPU_DATA_TYPE_F16 || src0.type != NPU_DATA_TYPE_F32) {
// TODO: support more types
DEVICE_LOG_DEBUG("[%s]unsupported data type src:%s dst:%s\n", hexagon::op_get_name(op),
hexagon::get_type_name(src0.type), hexagon::get_type_name(dst->type));
return false;
}
}
return true;
}
bool is_unary_op_required_sync(npu_device_tensor_op prev_op,
const npu_device_ne_type & prev_ne,
npu_device_tensor_op op,
const npu_device_ne_type & ne) {
NPU_UNUSED(prev_ne);
NPU_UNUSED(op);
NPU_UNUSED(ne);
return prev_op != NPU_OP_ADD && prev_op != NPU_OP_SUB && prev_op != NPU_OP_MUL && prev_op != NPU_OP_RMS_NORM &&
prev_op != NPU_OP_COUNT;
}
} // namespace hexagon

View File

@ -0,0 +1,388 @@
#include "op_flash_attn.hpp"
#include "type_traits.hpp"
#include "util.hpp"
#include "vec_ops.hpp"
namespace {
// TODO: use a more efficient conversion
inline float f16_to_f32(const npu_device_fp16_t src) {
return reinterpret_cast<const __fp16 &>(src);
}
// From: ggml/src/ggml-cpu/ops.cpp
template <bool _IsKvF16, bool _HasMask>
void flash_attn_impl(hexagon::tensor * out,
const hexagon::tensor * q,
const hexagon::tensor * k,
const hexagon::tensor * v,
const hexagon::tensor * mask,
const hexagon::tensor * sinks,
hexagon::compute_params * params) {
static_assert(3 <= hexagon::kMaxParamsCount, "flash_attn op params count exceeds max params count");
constexpr const npu_device_tensor_data_type kKvDataType = _IsKvF16 ? NPU_DATA_TYPE_F16 : NPU_DATA_TYPE_F32;
constexpr const bool kHasMask = _HasMask;
if (k->get_type() != kKvDataType || v->get_type() != k->get_type()) {
DEVICE_LOG_ERROR("flash_attn_impl: k and v must have same type, got k: %s, v: %s\n",
hexagon::get_type_name(k->get_type()), hexagon::get_type_name(v->get_type()));
return;
}
if (kHasMask != (mask != nullptr)) {
DEVICE_LOG_ERROR("flash_attn_impl: mask is required when kHasMask is true\n");
return;
}
float scale = out->get_op_param<float>(0);
const float max_bias = out->get_op_param<float>(1);
const float logit_softcap = out->get_op_param<float>(2);
if (logit_softcap != 0) {
scale /= logit_softcap;
}
// broadcast factors
const int64_t rk2 = q->get_ne(2) / k->get_ne(2);
const int64_t rk3 = q->get_ne(3) / k->get_ne(3);
const int64_t rv2 = q->get_ne(2) / v->get_ne(2);
const int64_t rv3 = q->get_ne(3) / v->get_ne(3);
const uint32_t n_head = q->get_ne(2);
const uint32_t n_head_log2 = 1u << (uint32_t) floor(log2(n_head));
const float m0 = powf(2.0f, -(max_bias) / n_head_log2);
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
const auto & k_type_traits = hexagon::get_type_traits(kKvDataType);
const auto q_to_kv_type = k_type_traits.from_float;
constexpr const auto kq_vec_dot = _IsKvF16 ? hexagon::type_erase_dot_func<hexagon::vec_dot_product_f16_f16> :
hexagon::type_erase_dot_func<hexagon::vec_dot_product_f32_f32>;
if (!q_to_kv_type) {
DEVICE_LOG_ERROR("flash_attn_impl: unsupported data type for q, k, or v\n");
return;
}
const int64_t total_rows = q->get_ne(1) * q->get_ne(2) * q->get_ne(3); // total number of rows in Q
const auto start_end_row = params->get_work_slice(total_rows); // work slice for this thread
const auto DK = k->get_ne(0);
const auto DV = v->get_ne(0);
const auto row_bytes_q = q->get_ne(0) * hexagon::get_type_traits(q->get_type()).type_size;
const auto row_bytes_k = DK * k_type_traits.type_size;
const auto row_bytes_v = DV * hexagon::get_type_traits(v->get_type()).type_size;
constexpr const size_t kFloatsPerVectorPair = hexagon::kBytesPerVector * 2 / sizeof(float);
const auto aligned_dk = (DK + kFloatsPerVectorPair - 1) / kFloatsPerVectorPair * kFloatsPerVectorPair;
const auto aligned_dv = (DV + kFloatsPerVectorPair - 1) / kFloatsPerVectorPair * kFloatsPerVectorPair;
size_t total_cache_size = sizeof(float) * (aligned_dk + 2 * aligned_dv);
auto * cache_ptr = params->get_vtcm_cache(total_cache_size);
if (!cache_ptr) {
DEVICE_LOG_ERROR("Failed to allocate VTCM cache for flash_attn: %zu bytes\n", total_cache_size);
return;
}
// loop over n_batch and n_head
constexpr bool is_v_f16 = _IsKvF16; // check if V is in FP16 format, otherwise it is in FP32 format
const auto rows_per_batch = q->get_ne(2) * q->get_ne(1);
const auto out_rows_per_batch = out->get_ne(2) * out->get_ne(1);
uint8_t * dst_ptr = out->get_write_buffer();
if (!dst_ptr) {
DEVICE_LOG_ERROR("flash_attn_impl: dst_ptr is not writable, tensor: %p, type: %s\n", (void *) out,
hexagon::get_type_name(out->get_type()));
return;
}
DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_WITH_MULTI_SUB_PROC(out, params->get_thread_index(), flash_attn);
const uint8_t * q_ptr = q->get_read_buffer();
const uint8_t * k_ptr = k->get_read_buffer();
const uint8_t * v_ptr = v->get_read_buffer();
const uint8_t * mask_ptr = kHasMask ? mask->get_read_buffer() : nullptr;
const uint8_t * sinks_ptr = sinks ? sinks->get_read_buffer() : nullptr;
float * VKQ32 = reinterpret_cast<float *>(cache_ptr); // FP32 VKQ accumulator
auto * VKQ16 = reinterpret_cast<npu_device_fp16_t *>(VKQ32 + aligned_dv); // (temporary) FP16 VKQ accumulator
auto * Q_q = reinterpret_cast<npu_device_fp16_t *>(
VKQ32 + 2 * aligned_dv); // (temporary) buffer for Q converted to quantized/FP16
for (auto ir = start_end_row.first; ir < start_end_row.second; ++ir) {
// q indices
const auto iq3 = ir / rows_per_batch;
const auto iq2 = (ir - iq3 * rows_per_batch) / q->get_ne(1);
const auto iq1 = (ir - iq3 * rows_per_batch - iq2 * q->get_ne(1));
const auto * q_data = q_ptr + (iq1 * q->get_nb(1) + iq2 * q->get_nb(2) + iq3 * q->get_nb(3));
hexagon::l2fetch_row(q_data, row_bytes_q);
const uint32_t h = iq2; // head index
const float slope =
(max_bias > 0.0f) ? h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2 * (h - n_head_log2) + 1) : 1.0f;
float S = 0.0f; // sum
float M = -INFINITY; // maximum KQ value
if constexpr (is_v_f16) {
memset(VKQ16, 0, DV * sizeof(npu_device_fp16_t));
} else {
memset(VKQ32, 0, DV * sizeof(float));
}
const npu_device_fp16_t * mp =
kHasMask ? reinterpret_cast<const npu_device_fp16_t *>(mask_ptr + iq1 * mask->get_nb(1) +
(iq2 % mask->get_ne(2)) * mask->get_nb(2) +
(iq3 % mask->get_ne(3)) * mask->get_nb(3)) :
nullptr;
q_to_kv_type(reinterpret_cast<const float *>(q_data), Q_q, DK);
if (kHasMask) {
hexagon::l2fetch_row(reinterpret_cast<const uint8_t *>(mp), mask->get_nb(1));
}
// k indices
const int ik3 = iq3 / rk3;
const int ik2 = iq2 / rk2;
// v indices
const int iv3 = iq3 / rv3;
const int iv2 = iq2 / rv2;
// online softmax / attention
// loop over n_kv and n_head_kv
// ref: https://arxiv.org/pdf/2112.05682.pdf
const auto * k_plane_ptr = k_ptr + ik2 * k->get_nb(2) + ik3 * k->get_nb(3);
const auto * v_plane_ptr = v_ptr + iv2 * v->get_nb(2) + iv3 * v->get_nb(3);
for (int64_t ic = 0; ic < k->get_ne(1); ++ic) {
DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_ADD_ONE_SUB_PROC(flash_attn, 0, loop);
float mv = kHasMask ? (slope * f16_to_f32(mp[ic])) : 0.0f;
if (mv == -INFINITY) {
continue;
}
float s = 0.f;
{
DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_ADD_ONE_SUB_PROC(flash_attn, 1, kq_dot);
const auto * k_data = k_plane_ptr + ic * k->get_nb(1);
if (ic < k->get_ne(1) - 1) {
hexagon::l2fetch_row(k_data + k->get_nb(1), row_bytes_k);
}
s = kq_vec_dot(k_data, Q_q, DK); // KQ value
s = s * scale; // scale KQ value
if (logit_softcap != 0.0f) {
s = logit_softcap * tanhf(s); // TODO: vectorize this?
}
s += mv; // apply mask
}
const float Mold = M;
float ms = 1.0f; // upon new higher max val, scale VKQ and KQ sum with this value
float vs = 1.0f; // post-softmax KQ value, expf(s - M)
const auto * v_data = v_plane_ptr + ic * v->get_nb(1);
if (ic < v->get_ne(1)) {
hexagon::l2fetch_row(v_data, row_bytes_v);
}
if constexpr (is_v_f16) {
if (s > M) {
// s is new maximum, ms < 1.0f, vs == expf(s - s) == 1.0f
M = s;
ms = expf(Mold - M);
// V = V*expf(Mold - M)
hexagon::vec_scale_f16(VKQ16, ms, VKQ16, DV);
} else {
// no new maximum, ms == 1.0f, vs != 1.0f
vs = expf(s - M);
}
// V += v*expf(s - M)
DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_ADD_ONE_SUB_PROC(flash_attn, 2, mad);
hexagon::vec_mad_f16(reinterpret_cast<const npu_device_fp16_t *>(v_data), vs, VKQ16, DV);
} else {
if (s > M) {
// s is new maximum, ms < 1.0f, vs == expf(s - s) == 1.0f
M = s;
ms = expf(Mold - M);
// V = V*expf(Mold - M)
hexagon::vec_scale_f32(VKQ32, ms, VKQ32, DV);
} else {
// no new maximum, ms == 1.0f, vs != 1.0f
vs = expf(s - M);
}
// V += v*expf(s - M)
DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_ADD_ONE_SUB_PROC(flash_attn, 2, mad);
{
// V is F32
hexagon::vec_mad_f32(reinterpret_cast<const float *>(v_data), vs, VKQ32, DV);
}
}
S = S * ms + vs; // scale and increment sum with partial sum
}
if constexpr (is_v_f16) {
// TODO: use a more efficient conversion
for (int64_t d = 0; d < DV; ++d) {
VKQ32[d] = f16_to_f32(VKQ16[d]);
}
}
if (sinks_ptr) {
const float s = reinterpret_cast<const float *>(sinks_ptr)[h];
float ms = 1.0f;
float vs = 1.0f;
if (s > M) {
ms = expf(M - s);
hexagon::vec_scale_f32(VKQ32, ms, VKQ32, DV);
} else {
vs = expf(s - M);
}
S = S * ms + vs;
}
// V /= S
const float S_inv = 1.0f / S;
hexagon::vec_scale_f32(VKQ32, S_inv, VKQ32, DV);
// dst indices
const int i1 = iq1;
const int i2 = iq2;
const int i3 = iq3;
// permute(0, 2, 1, 3)
hexagon::vec_cpy_f32(
reinterpret_cast<const float *>(VKQ32),
reinterpret_cast<float *>(dst_ptr + (i3 * out_rows_per_batch + i2 + i1 * out->get_ne(1)) * out->get_nb(1)),
out->get_ne(0));
}
out->release_write_buffer(); // mark the output tensor as modified
}
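// Editorial sketch of the online-softmax recurrence used above (see the arxiv
// reference in the loop): given scores s[0..n), keep a running maximum M and a
// running sum S of expf(s[i] - M), rescaling S whenever M grows. The real kernel
// fuses this with the V accumulation and uses HVX vector ops.
inline void online_softmax_ref_sketch(const float * s, size_t n, float & S_out, float & M_out) {
    float M = -INFINITY; // running maximum of the scores seen so far
    float S = 0.0f;      // running sum of expf(s[i] - M)
    for (size_t i = 0; i < n; ++i) {
        if (s[i] > M) {
            S = S * expf(M - s[i]) + 1.0f; // rescale the old sum to the new maximum
            M = s[i];
        } else {
            S += expf(s[i] - M);
        }
    }
    S_out = S;
    M_out = M;
}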
} // namespace
namespace hexagon {
bool flash_attn_f32(tensor * out, compute_params * params) {
if (!out || !params) {
DEVICE_LOG_DEBUG("invalid out or params\n");
return false;
}
const auto * q = out->get_src(0);
const auto * k = out->get_src(1);
const auto * v = out->get_src(2);
if (!q || !k || !v) {
DEVICE_LOG_DEBUG("invalid src tensors: q: %p, k: %p, v: %p\n", (void *) q, (void *) k, (void *) v);
return false;
}
const auto * mask = out->get_src(3);
const auto * sinks = out->get_src(4);
if (k->get_type() == NPU_DATA_TYPE_F16) {
if (mask) {
flash_attn_impl<true, true>(out, q, k, v, mask, sinks, params);
} else {
flash_attn_impl<true, false>(out, q, k, v, mask, sinks, params);
}
} else {
if (mask) {
flash_attn_impl<false, true>(out, q, k, v, mask, sinks, params);
} else {
flash_attn_impl<false, false>(out, q, k, v, mask, sinks, params);
}
}
return true;
}
bool is_flash_attn_supported(const npu_device_tensor_op_spec * op_spec,
const npu_device_tensor_spec * dst,
const npu_device_tensor_spec * srcs,
size_t src_len) {
const auto op = op_spec->op;
if (op != NPU_OP_FLASH_ATTN) {
DEVICE_LOG_DEBUG("op is not NPU_OP_FLASH_ATTN: %d\n", op);
return false;
}
if (!dst || !srcs || src_len < 4) {
DEVICE_LOG_DEBUG("[%s]invalid dst or srcs\n", op_get_name(op));
return false;
}
if (dst->type != NPU_DATA_TYPE_F32) {
DEVICE_LOG_DEBUG("[%s]dst type is not F32: %s\n", op_get_name(op), get_type_name(dst->type));
return false;
}
const auto * q = &srcs[0];
if (q->type != NPU_DATA_TYPE_F32) {
DEVICE_LOG_DEBUG("[%s]q type is not F32: %s\n", op_get_name(op), get_type_name(q->type));
return false;
}
const auto * k = &srcs[1];
if (k->type != NPU_DATA_TYPE_F16) { // TODO: support more k types
DEVICE_LOG_DEBUG("[%s]k type is not F16: %s\n", op_get_name(op), get_type_name(k->type));
return false;
}
const auto * v = &srcs[2];
if (v->type != k->type) { // TODO: support more v types
DEVICE_LOG_DEBUG("[%s]v type is not the same as k: %s vs %s\n", op_get_name(op), get_type_name(v->type),
get_type_name(k->type));
return false;
}
const auto * mask = &srcs[3];
if (mask->type != NPU_DATA_TYPE_F16) {
DEVICE_LOG_DEBUG("[%s]mask type is not F16: %s\n", op_get_name(op), get_type_name(mask->type));
return false;
}
if (dst->ne[0] != v->ne[0] || dst->ne[2] != q->ne[1]) {
DEVICE_LOG_DEBUG(
"[%s]dst shape does not match q and v: dst ne: %lld, %lld, %lld, %lld, q ne: %lld, %lld, %lld, %lld, "
"v ne: %lld, %lld, %lld, %lld\n",
op_get_name(op), dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], q->ne[0], q->ne[1], q->ne[2], q->ne[3],
v->ne[0], v->ne[1], v->ne[2], v->ne[3]);
return false;
}
if (is_transposed_or_permuted(dst->nb)) {
DEVICE_LOG_DEBUG("[%s]dst cannot be transposed or permuted, nb: %zu, %zu, %zu, %zu\n", op_get_name(op),
(size_t) dst->nb[0], (size_t) dst->nb[1], (size_t) dst->nb[2], (size_t) dst->nb[3]);
return false;
}
if (q->ne[0] != k->ne[0]) {
DEVICE_LOG_DEBUG(
"[%s]q and k shapes do not match: q ne: %lld, %lld, %lld, %lld, k ne: %lld, %lld, %lld, %lld\n",
op_get_name(op), q->ne[0], q->ne[1], q->ne[2], q->ne[3], k->ne[0], k->ne[1], k->ne[2], k->ne[3]);
return false;
}
return true;
}
bool is_flash_attn_required_sync(npu_device_tensor_op prev_op,
const npu_device_ne_type & prev_ne,
npu_device_tensor_op op,
const npu_device_ne_type & ne) {
NPU_UNUSED(prev_ne);
NPU_UNUSED(op);
NPU_UNUSED(ne);
return prev_op != NPU_OP_COUNT;
}
} // namespace hexagon

View File

@ -0,0 +1,17 @@
#pragma once
#include "op_types.hpp"
namespace hexagon {
bool flash_attn_f32(tensor * out, compute_params * params);
bool is_flash_attn_supported(const npu_device_tensor_op_spec * op_spec,
const npu_device_tensor_spec * dst,
const npu_device_tensor_spec * srcs,
size_t src_len);
bool is_flash_attn_required_sync(npu_device_tensor_op prev_op,
const npu_device_ne_type & prev_ne,
npu_device_tensor_op op,
const npu_device_ne_type & ne);
} // namespace hexagon

View File

@ -0,0 +1,228 @@
#include "op_glu.hpp"
#include "type_traits.hpp"
#include "util.hpp"
namespace {
template <typename T> struct get_data_type {};
template <typename _TyData, typename _TyParam>
struct get_data_type<void (*)(const _TyData *, const _TyData *, _TyData *, size_t, _TyParam)> {
using type = _TyData;
using param_type = typename std::remove_cv<typename std::remove_reference<_TyParam>::type>::type;
};
inline float dummy_load_coeff() {
// This is a dummy function to satisfy the template requirements.
// In practice, this should be replaced with a proper coefficient loading function.
return 0;
}
inline float expf_f16_guard_inf(float x) {
// Avoid overflow for large values, f16: log(65504)
constexpr float kMaxExp = 11.0898664f;
if (x >= kMaxExp) {
// Avoid overflow for large values
return std::numeric_limits<float>::infinity();
}
return std::expf(x);
}
inline void glu_vec_op_f16_f16(const __fp16 * src0, const __fp16 * src1, __fp16 * dst, size_t count, float coeff) {
// TODO: use a SIMD version; for some inputs the Hexagon intrinsics generate NaN instead of inf.
for (uint32_t i = 0; i < count; ++i) {
float x = src0[i];
float g = src1[i];
dst[i] = (x / (1.0f + expf_f16_guard_inf(-x))) * g;
}
}
inline void glu_vec_op_f32_f32(const float * src0,
const float * src1,
float * dst,
size_t count,
hexagon::HVX_VectorPair_x4 coeff) {
using namespace hexagon::vec;
vec_trans_impl<hexagon::vec_swiglu_f32_f32, float, hexagon::HVX_VectorPair_x4>(src0, src1, dst, count, coeff);
}
template <auto _GluRowFunc, auto _CoeffLoadFunc>
bool glu_impl(hexagon::tensor * out, hexagon::compute_params * params) {
using data_type = typename get_data_type<decltype(_GluRowFunc)>::type;
using param_type = typename get_data_type<decltype(_GluRowFunc)>::param_type;
static_assert(DEVICE_TENSOR_MAX_DIMS == 4, "glu_impl requires max dims 4");
static_assert(std::is_same_v<param_type, decltype(_CoeffLoadFunc())>,
"GluRowFunc must have the same param type as CoeffLoadFunc");
if (!out) {
return false;
}
const bool has_src1 = out->get_src(1) != nullptr;
auto * src0 = out->get_src(0);
auto * src1 = has_src1 ? out->get_src(1) : src0;
if (!src0 || !src1) {
return true; // skip if no src
}
const auto total_cols = has_src1 ? src0->get_ne(0) : src0->get_ne(0) / 2;
if (out->get_ne(0) != total_cols) {
DEVICE_LOG_ERROR("[hexagon-npu][GLU]out.ne[0] (%ld) != total_cols (%d)\n", (long) out->get_ne(0),
(int) total_cols);
return false;
}
auto total_rows = out->get_ne(3) * out->get_ne(2) * out->get_ne(1);
const auto rows_per_cube = out->get_ne(2) * out->get_ne(1);
const auto start_end = params->get_work_slice(total_rows);
if (start_end.first >= start_end.second) {
return true;
}
uint8_t * dst_ptr = out->get_write_buffer();
if (!dst_ptr) {
DEVICE_LOG_ERROR("[hexagon-npu][GLU]glu_impl: dst_ptr is not writable, tensor: %p, type: %s\n", (void *) out,
hexagon::get_type_name(out->get_type()));
return false;
}
const int32_t swapped = out->get_op_param<int32_t>(1);
const uint8_t * src0_ptr = src0->get_read_buffer();
const uint8_t * src1_ptr = has_src1 ? src1->get_read_buffer() : (src0_ptr + total_cols * sizeof(data_type));
if (swapped) {
std::swap(src0_ptr, src1_ptr);
}
DEVICE_SCOPED_OP_PERFORMANCE_TRACKER(out, params->get_thread_index());
auto coeff = _CoeffLoadFunc();
const size_t valid_row_bytes = src0->get_ne(0) * sizeof(data_type);
for (int64_t ir = start_end.first; ir < start_end.second; ++ir) {
const auto i03 = ir / rows_per_cube;
const auto i02 = ir / out->get_ne(1) - i03 * out->get_ne(2);
const auto i01 = ir % out->get_ne(1); // TODO: should we use divide instead of mod?
const auto i13 = i03 % src1->get_ne(3);
const auto i12 = i02 % src1->get_ne(2);
const auto i11 = i01 % src1->get_ne(1);
auto * src1_plane = src1_ptr + i13 * src1->get_nb(3) + i12 * src1->get_nb(2);
auto * src0_row = src0_ptr + i03 * src0->get_nb(3) + i02 * src0->get_nb(2) + i01 * src0->get_nb(1);
auto * src1_row = src1_plane + i11 * src1->get_nb(1);
auto * dst_row = dst_ptr + i03 * out->get_nb(3) + i02 * out->get_nb(2) + i01 * out->get_nb(1);
if (ir + 1 < start_end.second) {
hexagon::l2fetch_row(src0_row + src0->get_nb(1), valid_row_bytes);
hexagon::l2fetch_row(src1_row + src1->get_nb(1), valid_row_bytes);
}
_GluRowFunc(reinterpret_cast<const data_type *>(src0_row), reinterpret_cast<const data_type *>(src1_row),
reinterpret_cast<data_type *>(dst_row), static_cast<size_t>(total_cols), coeff);
}
out->release_write_buffer(); // mark the output tensor as modified
return true;
}
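// Editorial sketch (an assumption that follows the pointer math above): for the
// single-source GLU form a row of src0 packs the value and gate halves
// back-to-back along ne[0], and op param 1 (`swapped`) exchanges the halves.
inline void glu_single_src_row_sketch(const float * src0_row, float * dst_row, size_t ne0, bool swapped) {
    const size_t nc = ne0 / 2;       // output columns: half of the packed row
    const float * x = src0_row;      // value half
    const float * g = src0_row + nc; // gate half
    if (swapped) {
        std::swap(x, g);
    }
    for (size_t i = 0; i < nc; ++i) {
        dst_row[i] = (x[i] / (1.0f + std::expf(-x[i]))) * g[i]; // SwiGLU: silu(x) * g
    }
}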
template <npu_device_tensor_data_type _DataType>
bool glu_compute(hexagon::tensor * out, hexagon::compute_params * params) {
using namespace hexagon::vec::math;
if (out->get_op_param<int32_t>(0) != NPU_GLU_OP_SWIGLU) {
DEVICE_LOG_ERROR("Invalid GLU op type: %d\n", (int) out->get_op_param<int32_t>(0));
return false;
}
if (out->get_type() != _DataType) {
DEVICE_LOG_ERROR("GLU op type mismatch: %s vs %s\n", hexagon::get_type_name(out->get_type()),
hexagon::get_type_name(_DataType));
return false;
}
if constexpr (_DataType == NPU_DATA_TYPE_F32) {
return glu_impl<glu_vec_op_f32_f32, qhmath_load_div_sf_ltu>(out, params);
} else if constexpr (_DataType == NPU_DATA_TYPE_F16) {
return glu_impl<glu_vec_op_f16_f16, dummy_load_coeff>(out, params);
}
DEVICE_LOG_ERROR("Unsupported GLU data type: %s\n", hexagon::get_type_name(out->get_type()));
return true;
}
} // namespace
namespace hexagon {
bool glu_f32(hexagon::tensor * out, hexagon::compute_params * params) {
return glu_compute<npu_device_tensor_data_type::NPU_DATA_TYPE_F32>(out, params);
}
bool glu_f16(hexagon::tensor * out, hexagon::compute_params * params) {
return glu_compute<npu_device_tensor_data_type::NPU_DATA_TYPE_F16>(out, params);
}
bool is_glu_op_supported(const npu_device_tensor_op_spec * op_spec,
const npu_device_tensor_spec * dst,
const npu_device_tensor_spec * srcs,
size_t src_len) {
const auto op = op_spec->op;
if (op != NPU_OP_GLU) {
DEVICE_LOG_DEBUG("[%s]unsupported\n", hexagon::op_get_name(op));
return false;
}
if (op_spec->params[0] != NPU_GLU_OP_SWIGLU) {
DEVICE_LOG_DEBUG("[%s]unsupported GLU op type: %d\n", hexagon::op_get_name(op), (int) op_spec->params[0]);
return false;
}
if (!dst || !srcs || src_len < 1) {
DEVICE_LOG_DEBUG("[%s]invalid dst or srcs\n", hexagon::op_get_name(op));
return false;
}
const auto & src0 = srcs[0];
if (dst->type != src0.type) {
DEVICE_LOG_DEBUG("[%s]src0.type and dst.type mismatch: %s vs %s\n", hexagon::op_get_name(op),
hexagon::get_type_name(src0.type), hexagon::get_type_name(dst->type));
return false;
}
if (dst->type != NPU_DATA_TYPE_F32 && dst->type != NPU_DATA_TYPE_F16) {
DEVICE_LOG_DEBUG("[%s]unsupported data type: %s\n", hexagon::op_get_name(op),
hexagon::get_type_name(dst->type));
return false;
}
if (src_len > 1) {
if (!hexagon::is_same_shape(src0, *dst) || !hexagon::is_same_shape(srcs[1], *dst)) {
DEVICE_LOG_DEBUG("[%s]src0, src1 and dst have different shape\n", hexagon::op_get_name(op));
return false; // src0 and src1 must have the same shape as dst
}
} else {
static_assert(DEVICE_TENSOR_MAX_DIMS == 4, "GLU requires max dims 4");
if (src0.ne[0] / 2 != dst->ne[0] || src0.ne[1] != dst->ne[1] || src0.ne[2] != dst->ne[2] ||
src0.ne[3] != dst->ne[3]) {
DEVICE_LOG_DEBUG("[%s]src0 and dst have different shape: src0.ne[0]: %ld, dst.ne[0]: %ld\n",
hexagon::op_get_name(op), (long) src0.ne[0], (long) dst->ne[0]);
return false;
}
}
return true;
}
bool is_glu_required_sync(npu_device_tensor_op prev_op,
const npu_device_ne_type & prev_ne,
npu_device_tensor_op op,
const npu_device_ne_type & ne) {
NPU_UNUSED(prev_ne);
NPU_UNUSED(op);
NPU_UNUSED(ne);
return prev_op == NPU_OP_MUL_MAT;
}
} // namespace hexagon

View File

@ -0,0 +1,19 @@
#pragma once
#include "op_types.hpp"
namespace hexagon {
bool glu_f32(hexagon::tensor * out, hexagon::compute_params * params);
bool glu_f16(hexagon::tensor * out, hexagon::compute_params * params);
bool is_glu_op_supported(const npu_device_tensor_op_spec * op_spec,
const npu_device_tensor_spec * dst,
const npu_device_tensor_spec * srcs,
size_t src_len);
bool is_glu_required_sync(npu_device_tensor_op prev_op,
const npu_device_ne_type & prev_ne,
npu_device_tensor_op op,
const npu_device_ne_type & ne);
} // namespace hexagon

View File

@ -0,0 +1,899 @@
#include "op_mul_mat.hpp"
#include "thread_pool.hpp" // TODO: remove this dependency
#include "type_traits.hpp"
#include "vec_ops.hpp"
namespace {
inline std::pair<size_t, size_t> unflatten_i3_i2(size_t idx, const hexagon::tensor * t) {
const auto i3 = idx / t->get_ne(2);
const auto i2 = idx - i3 * t->get_ne(2);
return { i3, i2 };
}
template <typename _T> struct get_data_type {};
template <typename _TData0, typename _TData1>
struct get_data_type<HVX_Vector (*)(const _TData0 *, const _TData1 *, size_t)> {
using data_type0 = _TData0;
using data_type1 = _TData1;
};
template <typename _TData0, typename _TData1>
struct get_data_type<HVX_Vector (*)(const _TData0 *, const _TData1 *, size_t, const HVX_Vector)> {
using data_type0 = _TData0;
using data_type1 = _TData1;
};
template <typename _TRet> struct convert_vector {};
template <> struct convert_vector<float> {
static float convert(HVX_Vector vec) { return hexagon::get_flt0_from_fltv(Q6_Vsf_equals_Vqf32(vec)); }
};
inline std::pair<int64_t, int64_t> unflatten_i3_i2(int64_t idx, const hexagon::tensor * t) {
const auto i3 = idx / t->get_ne(2);
const auto i2 = idx - i3 * t->get_ne(2);
return { i3, i2 };
}
template <> struct convert_vector<npu_device_fp16_t> {
static float convert(HVX_Vector vec) {
HVX_Vector vect = Q6_Vhf_equals_Vqf16(vec);
uint16_t i = (vect[0] & 0xffff);
return reinterpret_cast<__fp16 &>(i);
}
};
template <bool _IsQuantized>
inline bool init_dma_transfer(hexagon::compute_params * params,
const uint8_t * src,
uint8_t * dst,
size_t width,
size_t height,
size_t src_stride,
size_t dst_stride) {
if constexpr (_IsQuantized) {
if (!params->initiate_dma_row_transfer(src, dst, src_stride * height)) {
return false;
}
} else {
if (!params->initiate_dma_plane_transfer(src, dst, width, height, src_stride, dst_stride)) {
return false;
}
}
return true;
}
template <auto _DotFunc, typename... _TExtraArgs>
inline void batched_row_dot(const uint8_t * src0_plane,
const size_t src0_ne0,
const size_t src0_nb1,
const uint8_t * src1_row,
const size_t src1_nb1,
float * dst_row,
const size_t slice_rows,
const size_t src1_fetch_row_bytes,
_TExtraArgs... args) {
using data_type0 = typename get_data_type<decltype(_DotFunc)>::data_type0;
using data_type1 = typename get_data_type<decltype(_DotFunc)>::data_type1;
size_t i0 = 0;
for (; i0 + 1 < slice_rows; i0 += 2) {
auto * src0_row = src0_plane + i0 * src0_nb1;
// TODO: figure out how to handle an entire row
auto res0 = _DotFunc(reinterpret_cast<const data_type0 *>(src0_row),
reinterpret_cast<const data_type1 *>(src1_row), src0_ne0, args...);
// TODO: figure out how to handle an entire row
auto res1 = _DotFunc(reinterpret_cast<const data_type0 *>(src0_row + src0_nb1),
reinterpret_cast<const data_type1 *>(src1_row), src0_ne0, args...);
{
dst_row[i0] = convert_vector<data_type1>::convert(res0);
dst_row[i0 + 1] = convert_vector<data_type1>::convert(res1);
}
}
if (src1_fetch_row_bytes > 0) {
hexagon::l2fetch_row(src1_row + src1_nb1, src1_fetch_row_bytes);
}
if (i0 < slice_rows) {
auto * src0_row = src0_plane + i0 * src0_nb1;
auto res = _DotFunc(reinterpret_cast<const data_type0 *>(src0_row),
reinterpret_cast<const data_type1 *>(src1_row), src0_ne0, args...);
dst_row[i0] = convert_vector<data_type1>::convert(res);
}
}
template <auto _DotFunc, bool _IsSrcQuantized>
inline void mul_mat_impl(hexagon::tensor * src0,
hexagon::tensor * src1,
hexagon::tensor * dst,
hexagon::compute_params * params) {
using data_type0 = typename get_data_type<decltype(_DotFunc)>::data_type0;
using data_type1 = typename get_data_type<decltype(_DotFunc)>::data_type1;
const auto src0_row_stride = hexagon::get_dequantized_row_size(src0);
auto * dequantize_row_func = hexagon::get_type_traits(src0->get_type()).to_float;
auto * load_dequant_table_func = hexagon::get_type_traits(src0->get_type()).load_dequant_table;
if (_IsSrcQuantized && dequantize_row_func == nullptr) {
DEVICE_LOG_ERROR("Unsupported quantized src0 type: %d, dequantize_row_func is null\n", src0->get_type());
return;
}
const auto r02 = src1->get_ne(2) / src0->get_ne(2);
const auto r03 = src1->get_ne(3) / src0->get_ne(3);
const auto total_planes = dst->get_ne(3) * dst->get_ne(2);
auto start_end_plane = std::pair<int64_t, int64_t>{ 0, total_planes };
auto start_end_row = std::pair<int64_t, int64_t>{ 0, dst->get_ne(1) };
auto start_end_element = std::pair<int64_t, int64_t>{ 0, dst->get_ne(0) };
if (total_planes >= params->get_thread_count()) {
start_end_plane = params->get_work_slice(total_planes);
} else if (dst->get_ne(0) >= params->get_thread_count()) {
start_end_element = params->get_work_slice(dst->get_ne(0));
} else {
start_end_row = params->get_work_slice(dst->get_ne(1));
}
if (start_end_plane.second <= start_end_plane.first || start_end_row.second <= start_end_row.first ||
start_end_element.second <= start_end_element.first || start_end_plane.first < 0 || start_end_row.first < 0 ||
start_end_element.first < 0) {
DEVICE_LOG_DEBUG(
"mul_mat_impl: no work to do, start_end_plane: (%lld, %lld), start_end_row: (%lld, %lld), "
"start_end_element: (%lld, %lld)\n",
start_end_plane.first, start_end_plane.second, start_end_row.first, start_end_row.second,
start_end_element.first, start_end_element.second);
return;
}
const uint8_t * src0_ptr = src0->get_read_buffer(true); // TODO: avoid invalidation
// cache the src0 plane in VTCM
const size_t valid_src0_row_bytes = _IsSrcQuantized ? src0->get_nb(1) : (src0->get_ne(0) * sizeof(data_type0));
const size_t src1_row_stride = hexagon::get_aligned_size(src1->get_nb(1));
// TODO: figure out why we have to add padding after src0 plane cache
const size_t src0_plane_slice_row_count =
std::min<size_t>((params->get_vtcm_quota_size() - src1_row_stride) / (src0_row_stride * 2),
start_end_element.second - start_end_element.first);
uint8_t * src0_plane_read_cache_ptr = nullptr;
uint8_t * src0_plane_write_cache_ptr = nullptr;
size_t src0_plane_write_cache_offset = 0;
const uint8_t * last_write_cached_plane_ptr = nullptr;
const uint8_t * last_read_cached_plane_ptr = nullptr;
{
const size_t src0_plane_cache_size = src0_row_stride * src0_plane_slice_row_count;
src0_plane_read_cache_ptr = params->get_vtcm_cache(src0_plane_cache_size * 2);
if (!src0_plane_read_cache_ptr) {
DEVICE_LOG_ERROR(
"mul_mat_impl: failed to get VTCM cache for src0, size: %zu, src0_plane_slice_row_count: %zu, "
"src0_row_stride: %zu, will fallback to mem cache\n",
src0_plane_cache_size, src0_plane_slice_row_count, src0_row_stride);
return;
}
src0_plane_write_cache_ptr = src0_plane_read_cache_ptr + src0_plane_cache_size;
if constexpr (_IsSrcQuantized) {
src0_plane_write_cache_offset =
src0_plane_cache_size - size_t(src0->get_nb(1) * src0_plane_slice_row_count);
}
DEVICE_LOG_DEBUG(
"[%d]mul_mat_impl, src0_row_stride:%zu, valid_src0_row_bytes:%zu, src_nb0:%zu, "
"slice_row_count:%zu, write_cache_offset: %zu, "
"total_planes:%lld, planes:[%d,%d), rows:[%d,%d), elems:[%d,%d), is_quant:%d, "
"vtcm_mem:%p(%zu)\n",
(int) params->get_thread_index(), src0_row_stride, valid_src0_row_bytes, (size_t) src0->get_nb(1),
src0_plane_slice_row_count, src0_plane_write_cache_offset, total_planes, (int) start_end_plane.first,
(int) start_end_plane.second, (int) start_end_row.first, (int) start_end_row.second,
(int) start_end_element.first, (int) start_end_element.second, _IsSrcQuantized,
(void *) src0_plane_read_cache_ptr, params->get_vtcm_quota_size());
}
{
const auto [i3, i2] = unflatten_i3_i2(start_end_plane.first, dst);
const uint8_t * src0_plane = src0_ptr + i3 / r03 * src0->get_nb(3) + i2 / r02 * src0->get_nb(2) +
start_end_element.first * src0->get_nb(1);
const size_t next_row_count =
std::min<size_t>(src0_plane_slice_row_count,
start_end_element.second - start_end_element.first); // number of rows in this slice
if (!init_dma_transfer<_IsSrcQuantized>(
params, src0_plane, src0_plane_write_cache_ptr + src0_plane_write_cache_offset, valid_src0_row_bytes,
next_row_count, src0->get_nb(1), src0->get_nb(1))) {
DEVICE_LOG_ERROR("mul_mat_impl: failed to continue dma transfer for src0 plane, is_quant: %d\n",
(int) _IsSrcQuantized);
return;
}
DEVICE_LOG_DEBUG("mul_mat_impl: [i2,i3]:[%d,%d], src0_plane:%p, row_count:%zu\n", (int) i2, (int) i3,
(void *) src0_plane, next_row_count);
last_write_cached_plane_ptr = src0_plane;
}
const size_t valid_src1_row_bytes =
src0->get_ne(0) * sizeof(data_type1); // src0 and src1 should have the same element count in the 1st dimension
DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_WITH_MULTI_SUB_PROC(dst, params->get_thread_index(), mul_mat);
uint8_t * dst_ptr = dst->get_write_buffer();
if (!dst_ptr) {
DEVICE_LOG_ERROR("[%d]mul_mat_impl: dst_ptr is not writable, tensor: %p, type: %s\n",
(int) params->get_thread_index(), (void *) dst, hexagon::get_type_name(dst->get_type()));
return;
}
const uint8_t * src1_ptr = src1->get_read_buffer();
const auto dequant_table = load_dequant_table_func ? load_dequant_table_func() : HVX_Vector();
for (size_t ip = start_end_plane.first; ip < size_t(start_end_plane.second); ip++) {
const auto [i3, i2] = unflatten_i3_i2(ip, dst);
const auto * src1_plane = src1_ptr + i3 * src1->get_nb(3) + i2 * src1->get_nb(2);
auto * dst_plane = dst_ptr + i3 * dst->get_nb(3) + i2 * dst->get_nb(2);
const uint8_t * src0_plane_base = src0_ptr + i3 / r03 * src0->get_nb(3) + i2 / r02 * src0->get_nb(2);
for (size_t col_idx = start_end_element.first; col_idx < size_t(start_end_element.second);
col_idx += src0_plane_slice_row_count) {
const uint8_t * src0_plane = src0_plane_base + col_idx * src0->get_nb(1);
const size_t slice_rows =
std::min<size_t>(src0_plane_slice_row_count,
start_end_element.second - col_idx); // number of rows in this slice
{
const uint8_t * src0_next_plane = last_write_cached_plane_ptr;
size_t next_row_count = 0;
if (col_idx + src0_plane_slice_row_count < start_end_element.second) {
const auto next_col_idx = col_idx + src0_plane_slice_row_count;
src0_next_plane = src0_plane_base + next_col_idx * src0->get_nb(1);
next_row_count =
std::min<size_t>(src0_plane_slice_row_count,
start_end_element.second - next_col_idx); // number of rows in this slice
} else if (ip + 1 < start_end_plane.second) {
// prefetch the next plane's first slice
const auto [i3_next, i2_next] = unflatten_i3_i2(ip + 1, dst);
const uint8_t * src0_next_plane_base =
src0_ptr + i3_next / r03 * src0->get_nb(3) + i2_next / r02 * src0->get_nb(2);
src0_next_plane = src0_next_plane_base + start_end_element.first * src0->get_nb(1);
next_row_count = std::min<size_t>(
src0_plane_slice_row_count,
start_end_element.second - start_end_element.first); // number of rows in this slice
}
if (last_read_cached_plane_ptr != src0_plane) {
std::swap(src0_plane_read_cache_ptr, src0_plane_write_cache_ptr);
params->wait_for_dma();
}
if (last_write_cached_plane_ptr != src0_next_plane) {
DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_ADD_ONE_SUB_PROC(mul_mat, 2, dma);
if (!init_dma_transfer<_IsSrcQuantized>(
params, src0_next_plane, src0_plane_write_cache_ptr + src0_plane_write_cache_offset,
valid_src0_row_bytes, next_row_count, src0->get_nb(1), src0->get_nb(1))) {
DEVICE_LOG_ERROR("mul_mat_impl: failed to continue dma transfer for src0 plane\n");
return;
}
last_write_cached_plane_ptr = src0_next_plane;
}
}
if constexpr (_IsSrcQuantized) {
if (last_read_cached_plane_ptr != src0_plane) {
DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_ADD_ONE_SUB_PROC(mul_mat, 0, dequant);
const uint8_t * src0_quant_plane = src0_plane_read_cache_ptr + src0_plane_write_cache_offset;
for (size_t ir = 0; ir < slice_rows; ir++) {
auto * src0_row = src0_quant_plane + ir * src0->get_nb(1);
auto * cached_row_ptr = src0_plane_read_cache_ptr + ir * src0_row_stride;
dequantize_row_func(src0_row, reinterpret_cast<hexagon::dequant_output_type *>(cached_row_ptr),
src0->get_ne(0), dequant_table);
}
}
}
last_read_cached_plane_ptr = src0_plane;
if (start_end_row.second > start_end_row.first) {
hexagon::l2fetch_row(src1_plane + start_end_row.first * src1->get_nb(1), valid_src1_row_bytes);
}
for (size_t i1 = start_end_row.first; i1 < size_t(start_end_row.second); i1++) {
DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_ADD_ONE_SUB_PROC(mul_mat, 1, dot);
auto * src1_row = src1_plane + i1 * src1->get_nb(1);
auto * dst_row = reinterpret_cast<float *>(dst_plane + i1 * dst->get_nb(1)) + col_idx;
batched_row_dot<_DotFunc>(src0_plane_read_cache_ptr, src0->get_ne(0), src0_row_stride, src1_row,
src1->get_nb(1), dst_row, slice_rows,
(ip + 1 < start_end_plane.second) ? valid_src1_row_bytes : 0);
}
}
}
dst->release_write_buffer(); // mark the output tensor as modified
}
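// Editorial sketch of the double-buffered DMA pattern used above: one VTCM half
// is consumed by the dot products while the next src0 slice is DMA'd into the
// other half, then the halves are swapped. `start_fn`, `wait_fn` and `consume_fn`
// stand in for the real initiate_dma_*/wait_for_dma/dot calls; the real loop also
// skips the swap and the DMA when the slice is already cached.
template <typename StartFn, typename WaitFn, typename ConsumeFn>
inline void double_buffer_sketch(size_t n_slices, uint8_t * buf_a, uint8_t * buf_b, StartFn start_fn, WaitFn wait_fn, ConsumeFn consume_fn) {
    if (n_slices == 0) {
        return;
    }
    uint8_t * read_buf  = buf_a;
    uint8_t * write_buf = buf_b;
    start_fn(0, write_buf);             // prefetch the first slice
    for (size_t i = 0; i < n_slices; ++i) {
        std::swap(read_buf, write_buf); // the slice just requested becomes the read buffer
        wait_fn();                      // ensure it has landed in VTCM
        if (i + 1 < n_slices) {
            start_fn(i + 1, write_buf); // overlap the next transfer with compute
        }
        consume_fn(i, read_buf);        // run the dot products out of VTCM
    }
}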
template <auto _DotFunc, bool _IsSrcQuantized>
inline void mul_mat_gemv_impl(hexagon::tensor * src0,
hexagon::tensor * src1,
hexagon::tensor * dst,
hexagon::compute_params * params) {
using data_type0 = typename get_data_type<decltype(_DotFunc)>::data_type0;
using data_type1 = typename get_data_type<decltype(_DotFunc)>::data_type1;
auto * dequantize_row_func = hexagon::get_type_traits(src0->get_type()).to_float;
auto * load_dequant_table_func = hexagon::get_type_traits(src0->get_type()).load_dequant_table;
if (_IsSrcQuantized && dequantize_row_func == nullptr) {
DEVICE_LOG_ERROR("Unsupported quantized src0 type: %d, dequantize_row_func is null\n", src0->get_type());
return;
}
if (dst->get_ne(0) < params->get_thread_count()) {
DEVICE_LOG_ERROR("Unsupported src1 tensor shape for gemv: %s, ne: %lldx%lldx%lldx%lld\n",
hexagon::get_type_name(src1->get_type()), src1->get_ne(0), src1->get_ne(1), src1->get_ne(2),
src1->get_ne(3));
return;
}
const auto start_end_element = params->get_work_slice(dst->get_ne(0));
if (start_end_element.second <= start_end_element.first || start_end_element.first < 0) {
DEVICE_LOG_DEBUG(
"mul_mat_gemv_impl: no work to do, start_end_plane: [0, 1), start_end_row: [0, 1), "
"start_end_element: [%lld, %lld)\n",
start_end_element.first, start_end_element.second);
return;
}
const auto src0_row_stride = hexagon::get_dequantized_row_size(src0);
const uint8_t * src0_ptr = src0->get_read_buffer(true); // TODO: avoid invalidation
const size_t valid_src0_row_bytes = _IsSrcQuantized ? src0->get_nb(1) : (src0->get_ne(0) * sizeof(data_type0));
// cache the src0 plane in VTCM
const size_t src1_row_stride = hexagon::get_aligned_size(src1->get_nb(1));
const size_t src0_plane_slice_row_count =
std::min<size_t>((params->get_vtcm_quota_size() - src1_row_stride) / (src0_row_stride * 2),
start_end_element.second - start_end_element.first);
uint8_t * src0_plane_read_cache_ptr = nullptr;
uint8_t * src0_plane_write_cache_ptr = nullptr;
size_t src0_plane_write_cache_offset = 0;
uint8_t * src1_row_cache_ptr = nullptr;
DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_WITH_MULTI_SUB_PROC(dst, params->get_thread_index(), mul_mat);
{
const size_t src0_plane_cache_size = src0_row_stride * src0_plane_slice_row_count;
src0_plane_read_cache_ptr = params->get_vtcm_cache(src0_plane_cache_size * 2 + src1_row_stride);
if (!src0_plane_read_cache_ptr) {
DEVICE_LOG_ERROR(
"mul_mat_gemv_impl: failed to get VTCM cache for src0, size: %zu, src0_plane_slice_row_count: %zu, "
"src0_row_stride: %zu, will fallback to mem cache\n",
src0_plane_cache_size, src0_plane_slice_row_count, src0_row_stride);
return;
}
src0_plane_write_cache_ptr = src0_plane_read_cache_ptr + src0_plane_cache_size;
src1_row_cache_ptr = src0_plane_write_cache_ptr + src0_plane_cache_size;
if constexpr (_IsSrcQuantized) {
src0_plane_write_cache_offset = src0_plane_cache_size - (src0->get_nb(1) * src0_plane_slice_row_count);
}
DEVICE_LOG_DEBUG(
"mul_mat_gemv_impl: src0_row_stride: %zu, src0_plane_slice_row_count: %zu, "
"src0_plane_write_cache_offset: %zu, src0.nb[1]: %d, is_quantized: %d, vtcm_mem: %p(%zu)\n",
src0_row_stride, src0_plane_slice_row_count, src0_plane_write_cache_offset, int(src0->get_nb(1)),
_IsSrcQuantized, (void *) src0_plane_read_cache_ptr, src0_plane_cache_size);
}
uint8_t * dst_ptr = dst->get_write_buffer();
if (!dst_ptr) {
DEVICE_LOG_ERROR("mul_mat_gemv_impl: dst_ptr is not writable, tensor: %p, type: %s\n", (void *) dst,
hexagon::get_type_name(dst->get_type()));
return;
}
const uint8_t * src1_ptr = src1->get_read_buffer();
{
if (!params->initiate_dma_row_transfer(src1_ptr, src1_row_cache_ptr, src1->get_ne(0) * sizeof(data_type1))) {
DEVICE_LOG_ERROR("mul_mat_gemv_impl: failed to initiate dma transfer for src1\n");
return;
}
const uint8_t * src0_plane = src0_ptr + start_end_element.first * src0->get_nb(1);
const size_t next_row_count =
std::min<size_t>(src0_plane_slice_row_count,
start_end_element.second - start_end_element.first); // number of rows in this slice
params->wait_for_dma();
if (!init_dma_transfer<_IsSrcQuantized>(
params, src0_plane, src0_plane_write_cache_ptr + src0_plane_write_cache_offset, valid_src0_row_bytes,
next_row_count, src0->get_nb(1), src0->get_nb(1))) {
DEVICE_LOG_ERROR("mul_mat_gemv_impl: failed to initiate dma plane transfer for src0 plane, is_quant: %d\n",
(int) _IsSrcQuantized);
return;
}
}
const auto dequant_table = load_dequant_table_func ? load_dequant_table_func() : HVX_Vector();
{
for (size_t col_idx = start_end_element.first; col_idx < size_t(start_end_element.second);
col_idx += src0_plane_slice_row_count) {
const size_t slice_rows =
std::min<size_t>(src0_plane_slice_row_count,
start_end_element.second - col_idx); // number of rows in this slice
const size_t next_col_idx = col_idx + src0_plane_slice_row_count;
std::swap(src0_plane_read_cache_ptr, src0_plane_write_cache_ptr);
params->wait_for_dma();
if (next_col_idx < start_end_element.second) {
DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_ADD_ONE_SUB_PROC(mul_mat, 2, dma);
const uint8_t * src0_next_plane = src0_ptr + next_col_idx * src0->get_nb(1);
const size_t next_row_count =
std::min<size_t>(src0_plane_slice_row_count,
start_end_element.second - next_col_idx); // number of rows in this slice
if (!init_dma_transfer<_IsSrcQuantized>(
params, src0_next_plane, src0_plane_write_cache_ptr + src0_plane_write_cache_offset,
valid_src0_row_bytes, next_row_count, src0->get_nb(1), src0->get_nb(1))) {
DEVICE_LOG_ERROR(
"mul_mat_gemv_impl: failed to continue dma plane transfer for src0 plane, is_quant: %d\n",
(int) _IsSrcQuantized);
return;
}
}
if constexpr (_IsSrcQuantized) {
DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_ADD_ONE_SUB_PROC(mul_mat, 0, dequant);
const uint8_t * src0_quant_plane = src0_plane_read_cache_ptr + src0_plane_write_cache_offset;
for (size_t ir = 0; ir < slice_rows; ir++) {
auto * src0_row = src0_quant_plane + ir * src0->get_nb(1);
auto * cached_row_ptr = src0_plane_read_cache_ptr + ir * src0_row_stride;
dequantize_row_func(src0_row, reinterpret_cast<hexagon::dequant_output_type *>(cached_row_ptr),
src0->get_ne(0), dequant_table);
}
}
{
DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_ADD_ONE_SUB_PROC(mul_mat, 1, dot);
auto * dst_row = reinterpret_cast<float *>(dst_ptr) + col_idx;
batched_row_dot<_DotFunc>(src0_plane_read_cache_ptr, src0->get_ne(0), src0_row_stride,
src1_row_cache_ptr, src1->get_nb(1), dst_row, slice_rows, 0);
}
}
}
dst->release_write_buffer(); // mark the output tensor as modified
}
template <auto _DotFunc>
inline void mul_mat_gemv_quant_impl(hexagon::tensor * src0,
hexagon::tensor * src1,
hexagon::tensor * dst,
hexagon::compute_params * params) {
// TODO: merge with mul_mat_gemv_impl?
using data_type1 = typename get_data_type<decltype(_DotFunc)>::data_type1;
if (dst->get_ne(0) < params->get_thread_count()) {
DEVICE_LOG_ERROR("Unsupported src1 tensor shape for gemv: %s, ne: %lldx%lldx%lldx%lld\n",
hexagon::get_type_name(src1->get_type()), src1->get_ne(0), src1->get_ne(1), src1->get_ne(2),
src1->get_ne(3));
return;
}
const auto src0_row_stride = src0->get_nb(1);
const auto start_end_element = params->get_work_slice(dst->get_ne(0));
if (start_end_element.second <= start_end_element.first || start_end_element.first < 0) {
DEVICE_LOG_DEBUG(
"mul_mat_gemv_quant_impl: no work to do, start_end_plane: [0, 1), start_end_row: [0, 1), "
"start_end_element: [%lld, %lld)\n",
start_end_element.first, start_end_element.second);
return;
}
const uint8_t * src0_ptr = src0->get_read_buffer(true); // TODO: avoid invalidation
// cache the src0 plane in VTCM
const size_t src1_row_stride = hexagon::get_aligned_size(src1->get_nb(1));
const size_t src0_plane_slice_row_count =
std::min<size_t>((params->get_vtcm_quota_size() - src1_row_stride) / (src0_row_stride * 2),
start_end_element.second - start_end_element.first);
uint8_t * src0_plane_read_cache_ptr = nullptr;
uint8_t * src0_plane_write_cache_ptr = nullptr;
uint8_t * src1_row_cache_ptr = nullptr;
DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_WITH_MULTI_SUB_PROC(dst, params->get_thread_index(), mul_mat);
{
const size_t src0_plane_cache_size = src0_row_stride * src0_plane_slice_row_count;
src0_plane_read_cache_ptr = params->get_vtcm_cache(src0_plane_cache_size * 2 + src1_row_stride);
if (!src0_plane_read_cache_ptr) {
DEVICE_LOG_ERROR(
"mul_mat_gemv_quant_impl: failed to get VTCM cache for src0, size: %zu, src0_plane_slice_row_count: "
"%zu, "
"src0_row_stride: %zu, will fallback to mem cache\n",
src0_plane_cache_size, src0_plane_slice_row_count, src0_row_stride);
return;
}
src0_plane_write_cache_ptr = src0_plane_read_cache_ptr + src0_plane_cache_size;
src1_row_cache_ptr = src0_plane_write_cache_ptr + src0_plane_cache_size;
DEVICE_LOG_DEBUG(
"mul_mat_gemv_quant_impl: src0_row_stride: %zu, src0_plane_slice_row_count: %zu, src0.nb[1]: %d, vtcm_mem: "
"%p(%zu)\n",
src0_row_stride, src0_plane_slice_row_count, int(src0->get_nb(1)), (void *) src0_plane_read_cache_ptr,
src0_plane_cache_size);
}
uint8_t * dst_ptr = dst->get_write_buffer();
if (!dst_ptr) {
DEVICE_LOG_ERROR("mul_mat_gemv_quant_impl: dst_ptr is not writable, tensor: %p, type: %s\n", (void *) dst,
hexagon::get_type_name(dst->get_type()));
return;
}
const uint8_t * src1_ptr = src1->get_read_buffer();
{
if (!params->initiate_dma_row_transfer(src1_ptr, src1_row_cache_ptr, src1->get_ne(0) * sizeof(data_type1))) {
DEVICE_LOG_ERROR("mul_mat_gemv_quant_impl: failed to initiate dma transfer for src1\n");
return;
}
const uint8_t * src0_plane = src0_ptr + start_end_element.first * src0_row_stride;
const size_t next_row_count =
std::min<size_t>(src0_plane_slice_row_count,
start_end_element.second - start_end_element.first); // number of rows in this slice
params->wait_for_dma();
if (!init_dma_transfer<true>(params, src0_plane, src0_plane_write_cache_ptr, src0_row_stride, next_row_count,
src0_row_stride, src0_row_stride)) {
DEVICE_LOG_ERROR("mul_mat_gemv_quant_impl: failed to initiate dma plane transfer for src0 plane\n");
return;
}
}
auto * load_dequant_table_func = hexagon::get_type_traits(src0->get_type()).load_dequant_table;
const auto dequant_table = load_dequant_table_func ? load_dequant_table_func() : HVX_Vector();
{
for (size_t col_idx = start_end_element.first; col_idx < size_t(start_end_element.second);
col_idx += src0_plane_slice_row_count) {
const size_t slice_rows =
std::min<size_t>(src0_plane_slice_row_count,
start_end_element.second - col_idx); // number of rows in this slice
const size_t next_col_idx = col_idx + src0_plane_slice_row_count;
std::swap(src0_plane_read_cache_ptr, src0_plane_write_cache_ptr);
params->wait_for_dma();
if (next_col_idx < start_end_element.second) {
DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_ADD_ONE_SUB_PROC(mul_mat, 1, dma);
const uint8_t * src0_next_plane = src0_ptr + next_col_idx * src0_row_stride;
const size_t next_row_count =
std::min<size_t>(src0_plane_slice_row_count,
start_end_element.second - next_col_idx); // number of rows in this slice
if (!init_dma_transfer<true>(params, src0_next_plane, src0_plane_write_cache_ptr, src0_row_stride,
next_row_count, src0_row_stride, src0_row_stride)) {
DEVICE_LOG_ERROR("mul_mat_gemv_quant_impl: failed to continue dma plane transfer for src0 plane\n");
return;
}
}
{
DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_ADD_ONE_SUB_PROC(mul_mat, 0, dot);
auto * dst_row = reinterpret_cast<float *>(dst_ptr) + col_idx;
batched_row_dot<_DotFunc, const HVX_Vector>(src0_plane_read_cache_ptr, src0->get_ne(0), src0_row_stride,
src1_row_cache_ptr, src1->get_nb(1), dst_row, slice_rows, 0,
dequant_table);
}
}
}
dst->release_write_buffer(); // mark the output tensor as modified
}
bool is_src_cacheable(const npu_device_tensor_spec & src0, const npu_device_tensor_spec & src1) {
const auto & src0_type_traits = hexagon::get_type_traits(src0.type);
if (src0_type_traits.to_float == nullptr) {
DEVICE_LOG_DEBUG("[MUL_MAT]src0.type(%s) cannot be cached, to_float is null\n",
hexagon::get_type_name(src0.type));
return false;
}
const auto vtcm_thread_quota_size = hexagon::default_thread_pool::get_per_thread_vtcm_quota();
const size_t src0_type_size =
src0_type_traits.is_quantized ? sizeof(hexagon::dequant_output_type) : src0_type_traits.type_size;
const auto & src1_type_traits = hexagon::get_type_traits(src1.type);
const bool is_gemv = src1.ne[1] == 1 && src1.ne[2] == 1 && src1.ne[3] == 1;
size_t min_cache_size = is_gemv ? (src1.ne[0] * src1_type_traits.type_size) : 0;
min_cache_size += src0.ne[0] * src0_type_size;
if (min_cache_size > vtcm_thread_quota_size) {
DEVICE_LOG_DEBUG("[MUL_MAT]src0.type(%s) min_cache_size is too large: %ld, vtcm_thread_quota_size: %zu\n",
hexagon::get_type_name(src0.type), (long) min_cache_size, vtcm_thread_quota_size);
return false;
}
return true;
}
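// Worked example (hypothetical numbers): with an F16 src0 row of ne[0] = 4096 and
// a GEMV F32 src1 row of the same length, min_cache_size = 4096 * 4 (src1 row)
// + 4096 * 2 (src0 row) = 24576 bytes, which must fit in the per-thread VTCM
// quota for the op to be accepted.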
bool is_quantized_mul_mat_supported(const npu_device_tensor_spec & src0, const npu_device_tensor_spec & src1) {
if (src1.type != NPU_DATA_TYPE_F32 && src1.type != NPU_DATA_TYPE_F16) {
DEVICE_LOG_DEBUG("[MUL_MAT]src0.type(%s) and src1.type(%s) mismatch and src1 is not F32\n",
hexagon::get_type_name(src0.type), hexagon::get_type_name(src1.type));
return false;
}
const auto type_traits = hexagon::get_type_traits(src0.type);
if (!type_traits.is_quantized || type_traits.to_float == nullptr) {
DEVICE_LOG_DEBUG("[MUL_MAT]src0.type(%s) and src1.type(%s) mismatch and src0 is not quantized\n",
hexagon::get_type_name(src0.type), hexagon::get_type_name(src1.type));
return false;
}
if (src0.ne[0] % type_traits.blck_size) {
DEVICE_LOG_DEBUG("[MUL_MAT]src0.type(%s) ne[0] is not aligned: %ld\n", hexagon::get_type_name(src0.type),
(long) src0.ne[0]);
return false;
}
if (!is_src_cacheable(src0, src1)) {
return false;
}
DEVICE_LOG_DEBUG("[MUL_MAT]supported quantized src0.type(%s) and src1.type(%s)\n",
hexagon::get_type_name(src0.type), hexagon::get_type_name(src1.type));
return true;
}
bool is_mul_mat_f16_f32_src_tensors_aligned(hexagon::tensor * src0,
hexagon::tensor * src1,
bool is_src0_cached,
bool is_src1_cached) {
const auto * src1_ptr = is_src1_cached ? nullptr : src1->get_read_buffer_as<float>();
const auto * src0_ptr = is_src0_cached ? nullptr : src0->get_read_buffer_as<npu_device_fp16_t>();
if (!hexagon::is_f16_f32_dot_product_aligned(src0_ptr, src1_ptr, src0->get_ne(0))) {
DEVICE_LOG_DEBUG("[MUL_MAT][f16_f32]src_tensors_unaligned: ne[0]: %ld\n", (long) src0->get_ne(0));
return false;
}
DEVICE_LOG_DEBUG("[MUL_MAT][f16_f32]src_tensors_aligned: ne[0]: %ld\n", (long) src0->get_ne(0));
return true;
}
bool is_mul_mat_f16_f16_src_tensors_aligned(hexagon::tensor * src0, hexagon::tensor * src1, bool is_src0_quantized) {
const auto * src1_ptr = src1->get_read_buffer_as<npu_device_fp16_t>();
const auto * src0_ptr = is_src0_quantized ? nullptr : src0->get_read_buffer_as<npu_device_fp16_t>();
if (!hexagon::is_f16_f16_dot_product_aligned(src0_ptr, src1_ptr, src0->get_ne(0))) {
DEVICE_LOG_DEBUG("[MUL_MAT][f16_f16]src_tensors_unaligned: ne[0]: %ld\n", (long) src0->get_ne(0));
return false;
}
if (!is_src0_quantized && !hexagon::is_size_aligned(src0->get_nb(1))) {
DEVICE_LOG_DEBUG("[MUL_MAT]src0 tensor nb[1] is not aligned: %zu\n", src0->get_nb(1));
return false;
}
if (!hexagon::is_size_aligned(src1->get_nb(1))) {
DEVICE_LOG_DEBUG("[MUL_MAT]src1 tensor nb[1] is not aligned: %zu\n", src1->get_nb(1));
return false;
}
DEVICE_LOG_DEBUG("[MUL_MAT]src_tensors_aligned: ne[0]: %ld\n", (long) src0->get_ne(0));
return true;
}
bool is_mul_mat_f32_f32_src_tensors_aligned(hexagon::tensor * src0, hexagon::tensor * src1) {
const auto * src1_ptr = src1->get_read_buffer_as<float>();
const auto * src0_ptr = src0->get_read_buffer_as<float>();
if (!hexagon::is_f32_f32_dot_product_aligned(src0_ptr, src1_ptr, src0->get_ne(0))) {
DEVICE_LOG_DEBUG("[MUL_MAT][f32_f32]src_tensors_unaligned: ne[0]: %ld\n", (long) src0->get_ne(0));
return false;
}
if (!hexagon::is_size_aligned(src0->get_nb(1))) {
DEVICE_LOG_DEBUG("[MUL_MAT]src0 tensor nb[1] is not aligned: %zu\n", src0->get_nb(1));
return false;
}
if (!hexagon::is_size_aligned(src1->get_nb(1))) {
DEVICE_LOG_DEBUG("[MUL_MAT]src1 tensor nb[1] is not aligned: %zu\n", src1->get_nb(1));
return false;
}
DEVICE_LOG_DEBUG("[MUL_MAT]src_tensors_aligned: ne[0]: %ld\n", (long) src0->get_ne(0));
return true;
}
typedef void (*mul_mat_func_type)(hexagon::tensor * src0,
hexagon::tensor * src1,
hexagon::tensor * dst,
hexagon::compute_params * params);
constexpr const size_t kMulMatGemvBaseIndex = 2;
constexpr const mul_mat_func_type kMulMatF32F32Funcs[4] = {
// quantized and non-quantized
mul_mat_impl<hexagon::vec_dot_product_vqf32_f32_f32, false>, // F32 * F32 unaligned
mul_mat_impl<hexagon::vec_dot_product_aligned_vqf32_f32_f32, false>, // F32 * F32 aligned
mul_mat_gemv_impl<hexagon::vec_dot_product_vqf32_f32_f32, false>, // F32 * F32 gemv unaligned
mul_mat_gemv_impl<hexagon::vec_dot_product_aligned_vqf32_f32_f32, false>, // F32 * F32 gemv aligned
};
constexpr const mul_mat_func_type kMulMatF16F32QuantizedFuncs[4] = {
// quantized and non-quantized
mul_mat_impl<hexagon::vec_dot_product_vqf32_f16_f32, true>, // F16 * F32 quantized unaligned
mul_mat_impl<hexagon::vec_dot_product_aligned_vqf32_f16_f32, true>, // F16 * F32 quantized aligned
mul_mat_gemv_impl<hexagon::vec_dot_product_vqf32_f16_f32, true>, // F16 * F32 quantized unaligned
mul_mat_gemv_impl<hexagon::vec_dot_product_aligned_vqf32_f16_f32, true>, // F16 * F32 quantized aligned
};
constexpr const mul_mat_func_type kMulMatF16F32Funcs[4] = {
// quantized and non-quantized
mul_mat_impl<hexagon::vec_dot_product_vqf32_f16_f32, false>, // F16 * F32 unaligned
mul_mat_impl<hexagon::vec_dot_product_aligned_vqf32_f16_f32, false>, // F16 * F32 aligned
mul_mat_gemv_impl<hexagon::vec_dot_product_vqf32_f16_f32, false>, // F16 * F32 unaligned
mul_mat_gemv_impl<hexagon::vec_dot_product_aligned_vqf32_f16_f32, false>, // F16 * F32 aligned
};
constexpr const mul_mat_func_type kMulMatF16QuantizedFuncs[4] = {
mul_mat_impl<hexagon::vec_dot_product_vqf16_f16_f16, true>, // F16 * F16 quantized unaligned
mul_mat_impl<hexagon::vec_dot_product_aligned_vqf16_f16_f16, true>, // F16 * F16 quantized aligned
mul_mat_gemv_impl<hexagon::vec_dot_product_aligned_vqf16_f16_f16, true>, // F16 * F16 quantized gemv
mul_mat_gemv_impl<hexagon::vec_dot_product_aligned_vqf16_f16_f16, true>, // F16 * F16 quantized gemv
};
constexpr const mul_mat_func_type kMulMatF16Funcs[4] = {
mul_mat_impl<hexagon::vec_dot_product_vqf16_f16_f16, false>, // F16 * F16 unaligned
mul_mat_impl<hexagon::vec_dot_product_aligned_vqf16_f16_f16, false>, // F16 * F16 aligned
mul_mat_gemv_impl<hexagon::vec_dot_product_vqf16_f16_f16, false>, // F16 * F16 gemv unaligned
mul_mat_gemv_impl<hexagon::vec_dot_product_aligned_vqf16_f16_f16, false>, // F16 * F16 gemv aligned
};
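// Index layout shared by the dispatch tables above, as used in mul_mat_f32 below:
//   index = (is_gemv ? kMulMatGemvBaseIndex : 0) + (src tensors aligned ? 1 : 0)
//   [0] gemm unaligned, [1] gemm aligned, [2] gemv unaligned, [3] gemv aligned
// (the quantized F16 * F16 table points both gemv slots at the aligned kernel).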
} // namespace
namespace hexagon {
bool mul_mat_f32(hexagon::tensor * out, compute_params * params) {
static_assert(DEVICE_TENSOR_MAX_DIMS == 4, "mul_mat_f32 requires max dims 4");
static_assert(std::is_same<hexagon::dequant_output_type, float>::value ||
std::is_same<hexagon::dequant_output_type, npu_device_fp16_t>::value,
"dequant_output_type must be float or npu_device_fp16_t");
if (!out) {
return false;
}
auto * src0 = out->get_src(0);
auto * src1 = out->get_src(1);
if (!src0 || !src1) {
return true; // skip if no src
}
const bool is_src0_quantized = is_quantized_type(src0->get_type());
const bool is_gemv = src1->get_ne(1) == 1 && src1->get_ne(2) == 1 && src1->get_ne(3) == 1;
const auto base_index = is_gemv ? kMulMatGemvBaseIndex : 0;
switch (src1->get_type()) {
case NPU_DATA_TYPE_F32:
if (is_src0_quantized) {
if (is_gemv && src0->get_type() == NPU_DATA_TYPE_Q4_0) {
// TODO: move to array
mul_mat_gemv_quant_impl<hexagon::vec_dot_product_vqf32_q40_f32>(src0, src1, out, params);
} else {
kMulMatF16F32QuantizedFuncs[is_mul_mat_f16_f32_src_tensors_aligned(src0, src1, true, is_gemv) +
base_index](src0, src1, out, params);
}
} else if (src0->get_type() == NPU_DATA_TYPE_F16) {
kMulMatF16F32Funcs[is_mul_mat_f16_f32_src_tensors_aligned(src0, src1, true, is_gemv) + base_index](
src0, src1, out, params);
} else {
kMulMatF32F32Funcs[is_mul_mat_f32_f32_src_tensors_aligned(src0, src1) + base_index](src0, src1, out,
params);
}
return true;
case NPU_DATA_TYPE_F16:
if (is_src0_quantized) {
kMulMatF16QuantizedFuncs[is_mul_mat_f16_f16_src_tensors_aligned(src0, src1, is_src0_quantized) +
base_index](src0, src1, out, params);
} else {
kMulMatF16Funcs[is_mul_mat_f16_f16_src_tensors_aligned(src0, src1, is_src0_quantized) + base_index](
src0, src1, out, params);
}
return true;
default:
break;
}
DEVICE_LOG_ERROR("[MUL_MAT]Unsupported src1 tensor type: %s\n", get_type_name(src1->get_type()));
return false;
}
bool is_mul_mat_supported(const npu_device_tensor_op_spec * op_spec,
const npu_device_tensor_spec * dst,
const npu_device_tensor_spec * srcs,
size_t src_len) {
const auto op = op_spec->op;
if (op != NPU_OP_MUL_MAT) {
DEVICE_LOG_DEBUG("op is not MUL_MAT: %d\n", op);
return false;
}
if (!dst || !srcs || src_len < 2) {
DEVICE_LOG_DEBUG("[%s]invalid dst or srcs\n", hexagon::op_get_name(op));
return false;
}
if (dst->type != NPU_DATA_TYPE_F32) {
DEVICE_LOG_DEBUG("[%s]dst type is not F32: %s\n", op_get_name(op), get_type_name(dst->type));
return false;
}
const auto & src0 = srcs[0];
const auto & src1 = srcs[1];
if (src0.type != src1.type) {
if (src1.type == NPU_DATA_TYPE_F32 && src0.type == NPU_DATA_TYPE_F16) {
// F16 * F32 is supported
DEVICE_LOG_DEBUG("[%s]src0.type(%s) and src1.type(%s) mismatch, but src0 is F16 and src1 is F32\n",
op_get_name(op), get_type_name(src0.type), get_type_name(src1.type));
} else {
#ifdef GGML_HEXAGON_ENABLE_QUANTIZED_TENSORS
if (!is_quantized_mul_mat_supported(src0, src1)) {
return false;
}
#else
DEVICE_LOG_DEBUG("[%s]src0.type(%s) and src1.type(%s) mismatch and quantized tensors are not supported\n",
op_get_name(op), get_type_name(src0.type), get_type_name(src1.type));
return false;
#endif
}
}
if (is_transposed_or_permuted(src0.nb)) {
// TODO: fix permuted src0
DEVICE_LOG_DEBUG("[%s]src0 is transposed or permuted, disabled\n", op_get_name(op));
return false;
}
if (src0.ne[0] != src1.ne[0] || src0.ne[1] != dst->ne[0]) {
DEVICE_LOG_DEBUG("[%s]src0 and src1 cannot multiply: %ldx%ld vs %ldx%ld\n", op_get_name(op), (long) src0.ne[0],
(long) src0.ne[1], (long) src1.ne[0], (long) src1.ne[1]);
return false;
}
if (src1.ne[1] != dst->ne[1] || src1.ne[2] != dst->ne[2] || src1.ne[3] != dst->ne[3]) {
DEVICE_LOG_DEBUG("[%s]src1 and dst dimensions not match: %ldx%ld vs %ldx%ld\n", op_get_name(op),
(long) src1.ne[2], (long) src1.ne[3], (long) dst->ne[2], (long) dst->ne[3]);
return false;
}
if (src1.ne[2] % src0.ne[2] || src1.ne[3] % src0.ne[3]) {
DEVICE_LOG_DEBUG("[%s]src0 cannot broadcast to src1: %ldx%ld vs %ldx%ld\n", op_get_name(op), (long) src0.ne[2],
(long) src0.ne[3], (long) src1.ne[2], (long) src1.ne[3]);
return false;
}
if (src1.ne[1] == 1 && src1.ne[2] == 1 && src1.ne[3] == 1 && dst->ne[0] < hexagon::kMaxThreadCount) {
DEVICE_LOG_DEBUG("[%s]src1 is scalar and dst cannot be parallelized: %ld\n", op_get_name(op),
(long) dst->ne[0]);
return false;
}
return true;
}
bool is_mul_mat_required_sync(npu_device_tensor_op prev_op,
const npu_device_ne_type & prev_ne,
npu_device_tensor_op op,
const npu_device_ne_type & ne) {
NPU_UNUSED(prev_op);
NPU_UNUSED(prev_ne);
NPU_UNUSED(op);
NPU_UNUSED(ne);
return prev_op != NPU_OP_MUL_MAT || !is_same_shape(prev_ne, ne);
}
} // namespace hexagon

View File

@ -0,0 +1,20 @@
#pragma once
#include "op_types.hpp"
#include "tensor.hpp"
#include <hexagon_types.h>
namespace hexagon {
bool mul_mat_f32(tensor * out, compute_params * params);
bool is_mul_mat_supported(const npu_device_tensor_op_spec * op_spec,
const npu_device_tensor_spec * dst,
const npu_device_tensor_spec * srcs,
size_t src_len);
bool is_mul_mat_required_sync(npu_device_tensor_op prev_op,
const npu_device_ne_type & prev_ne,
npu_device_tensor_op op,
const npu_device_ne_type & ne);
} // namespace hexagon

View File

@ -0,0 +1,178 @@
#include "op_registry.hpp"
#include "op_eltwise.hpp"
#include "op_flash_attn.hpp"
#include "op_glu.hpp"
#include "op_mul_mat.hpp"
#include "op_rope.hpp"
#include "op_rows.hpp"
#include <cmath>
#include <cstddef>
#include <type_traits>
namespace {
struct op_capabilities {
npu_device_tensor_op op;
hexagon::op_is_supported_func_type is_supported;
hexagon::op_required_sync_func_type requires_thread_barrier_func;
hexagon::compute_func_type compute_funcs[NPU_DATA_TYPE_COUNT];
};
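// Indexed by npu_device_tensor_op; each entry's compute_funcs array is indexed by the destination
// tensor's data type, with nullptr marking unsupported combinations. The static_asserts below keep
// the table in sync with the enum ordering.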
constexpr const op_capabilities kOpCapabilities[] = {
{
NPU_OP_MUL_MAT, hexagon::is_mul_mat_supported,
hexagon::is_mul_mat_required_sync,
{
hexagon::mul_mat_f32, // NPU_DATA_TYPE_F32
nullptr, // NPU_DATA_TYPE_F16
}, },
{
NPU_OP_ADD, hexagon::is_element_wise_op_supported,
hexagon::is_element_wise_op_required_sync,
{
hexagon::element_wise_op<hexagon::vec_op_f32_f32<hexagon::vadd_f32_f32>>, // NPU_DATA_TYPE_F32
hexagon::element_wise_op<hexagon::vec_op_f16_f16<hexagon::vadd_f16_f16>>, // NPU_DATA_TYPE_F16
}, },
{
NPU_OP_SUB, hexagon::is_element_wise_op_supported,
hexagon::is_element_wise_op_required_sync,
{
hexagon::element_wise_op<hexagon::vec_op_f32_f32<hexagon::vsub_f32_f32>>, // NPU_DATA_TYPE_F32
hexagon::element_wise_op<hexagon::vec_op_f16_f16<hexagon::vsub_f16_f16>>, // NPU_DATA_TYPE_F16
}, },
{
NPU_OP_MUL, hexagon::is_element_wise_op_supported,
hexagon::is_element_wise_op_required_sync,
{
hexagon::element_wise_op<hexagon::vec_op_f32_f32<hexagon::vmul_f32_f32>>, // NPU_DATA_TYPE_F32
hexagon::element_wise_op<hexagon::vec_op_f16_f16<hexagon::vmul_f16_f16>>, // NPU_DATA_TYPE_F16
}, },
{
NPU_OP_RMS_NORM, hexagon::is_unary_op_supported,
hexagon::is_unary_op_required_sync,
{
hexagon::unary_op<hexagon::rms_norm_vec_f32>, // NPU_DATA_TYPE_F32
nullptr, // NPU_DATA_TYPE_F16
}, },
{
NPU_OP_FLASH_ATTN, hexagon::is_flash_attn_supported,
hexagon::is_flash_attn_required_sync,
{
hexagon::flash_attn_f32, // NPU_DATA_TYPE_F32
nullptr, // NPU_DATA_TYPE_F16
}, },
{
NPU_OP_ROPE, hexagon::is_rope_supported,
hexagon::is_rope_required_sync,
{
hexagon::rope_f32, // NPU_DATA_TYPE_F32
nullptr, // NPU_DATA_TYPE_F16
}, },
{
NPU_OP_GLU, hexagon::is_glu_op_supported,
hexagon::is_glu_required_sync,
{
hexagon::glu_f32, // NPU_DATA_TYPE_F32
hexagon::glu_f16, // NPU_DATA_TYPE_F16
}, },
{
NPU_OP_GET_ROWS, hexagon::is_rows_supported,
hexagon::is_rows_required_sync,
{
hexagon::get_rows_f32, // NPU_DATA_TYPE_F32
nullptr, // NPU_DATA_TYPE_F16
}, },
{
NPU_OP_SET_ROWS, hexagon::is_rows_supported,
hexagon::is_rows_required_sync,
{
hexagon::set_rows_generic, // NPU_DATA_TYPE_F32
hexagon::set_rows_generic, // NPU_DATA_TYPE_F16
nullptr, // NPU_DATA_TYPE_I32
nullptr, // NPU_DATA_TYPE_I64
hexagon::set_rows_generic, // NPU_DATA_TYPE_Q8_0
hexagon::set_rows_generic, // NPU_DATA_TYPE_Q4_0
nullptr, // TODO: figure out why failed on NPU_DATA_TYPE_Q4_K
}, },
{
NPU_OP_CPY, hexagon::is_unary_op_supported,
hexagon::is_unary_op_required_sync,
{
nullptr, // NPU_DATA_TYPE_F32
hexagon::unary_op<hexagon::unary_vec_op_f16_f32<hexagon::vequals_f16_f32>>, // NPU_DATA_TYPE_F16
}, },
};
static_assert(kOpCapabilities[NPU_OP_MUL_MAT].compute_funcs[NPU_DATA_TYPE_F32] == hexagon::mul_mat_f32,
"kOpArray[NPU_OP_MUL_MAT] != mul_mat_f32");
static_assert(std::size(kOpCapabilities) == NPU_OP_COUNT);
static_assert(kOpCapabilities[NPU_OP_MUL_MAT].op == NPU_OP_MUL_MAT, "kOpArray[NPU_OP_MUL_MAT].op != NPU_OP_MUL_MAT");
static_assert(kOpCapabilities[NPU_OP_MUL].op == NPU_OP_MUL, "kOpArray[NPU_OP_MUL].op != NPU_OP_MUL");
static_assert(kOpCapabilities[NPU_OP_RMS_NORM].op == NPU_OP_RMS_NORM,
"kOpArray[NPU_OP_RMS_NORM].op != NPU_OP_RMS_NORM");
static_assert(kOpCapabilities[NPU_OP_FLASH_ATTN].op == NPU_OP_FLASH_ATTN,
"kOpArray[NPU_OP_FLASH_ATTN].op != NPU_OP_FLASH_ATTN");
static_assert(kOpCapabilities[NPU_OP_ROPE].op == NPU_OP_ROPE, "kOpArray[NPU_OP_ROPE].op != NPU_OP_ROPE");
static_assert(kOpCapabilities[NPU_OP_GLU].op == NPU_OP_GLU, "kOpArray[NPU_OP_GLU].op != NPU_OP_GLU");
static_assert(kOpCapabilities[NPU_OP_GET_ROWS].op == NPU_OP_GET_ROWS,
"kOpArray[NPU_OP_GET_ROWS].op != NPU_OP_GET_ROWS");
static_assert(kOpCapabilities[NPU_OP_SET_ROWS].op == NPU_OP_SET_ROWS,
"kOpArray[NPU_OP_SET_ROWS].op != NPU_OP_SET_ROWS");
hexagon::compute_func_type get_compute_func_impl(npu_device_tensor_op op, npu_device_tensor_data_type type) {
if (op >= NPU_OP_COUNT) {
return nullptr;
}
return kOpCapabilities[op].compute_funcs[type];
}
} // namespace
namespace hexagon {
compute_func_type get_compute_func(tensor * dst) {
return get_compute_func_impl(dst->get_op(), dst->get_type());
}
bool requires_thread_barrier(npu_device_tensor_op prev_op,
const npu_device_ne_type & prev_ne,
npu_device_tensor_op op,
const npu_device_ne_type & ne) {
if (op >= NPU_OP_COUNT) {
return false;
}
auto requires_thread_barrier_func = kOpCapabilities[op].requires_thread_barrier_func;
return requires_thread_barrier_func && requires_thread_barrier_func(prev_op, prev_ne, op, ne);
}
bool support_op(const npu_device_tensor_op_spec * op_spec,
const npu_device_tensor_spec * dst,
const npu_device_tensor_spec * srcs,
size_t src_len) {
if (!op_spec) {
DEVICE_LOG_ERROR("[hexagon-npu]invalid op_spec\n");
return false;
}
const auto op = op_spec->op;
auto is_supported_func = kOpCapabilities[op].is_supported;
if (!is_supported_func || !is_supported_func(op_spec, dst, srcs, src_len)) {
DEVICE_LOG_DEBUG("[%s]unsupported, is_supported_func return false\n", op_get_name(op));
return false;
}
if (get_compute_func_impl(op, dst->type) == nullptr) {
DEVICE_LOG_DEBUG("[%s]unsupported, get_compute_func failed, type: %s\n", op_get_name(op),
get_type_name(dst->type));
return false;
}
return true;
}
} // namespace hexagon

View File

@ -0,0 +1,19 @@
#pragma once
#include "op_types.hpp"
namespace hexagon {
compute_func_type get_compute_func(tensor * dst);
bool requires_thread_barrier(npu_device_tensor_op prev_op,
const npu_device_ne_type & prev_ne,
npu_device_tensor_op op,
const npu_device_ne_type & ne);
bool support_op(const npu_device_tensor_op_spec * op_spec,
const npu_device_tensor_spec * dst,
const npu_device_tensor_spec * srcs,
size_t src_len);
} // namespace hexagon

View File

@ -0,0 +1,406 @@
#include "op_rope.hpp"
#include "type_traits.hpp"
#ifndef M_PI
# define M_PI (3.14159265358979323846)
#endif
namespace {
// Apparently solving `n_rot = 2pi * x * base^((2 * max_pos_emb) / n_dims)` for x, we get
// `corr_dim(n_rot) = n_dims * log(max_pos_emb / (n_rot * 2pi)) / (2 * log(base))`
float rope_yarn_corr_dim(int n_dims, int n_ctx_orig, float n_rot, float base) {
return n_dims * logf(n_ctx_orig / (n_rot * 2 * (float) M_PI)) / (2 * logf(base));
}
void rope_yarn_corr_dims(int n_dims, int n_ctx_orig, float freq_base, float beta_fast, float beta_slow, float dims[2]) {
// start and end correction dims
float start = floorf(rope_yarn_corr_dim(n_dims, n_ctx_orig, beta_fast, freq_base));
float end = ceilf(rope_yarn_corr_dim(n_dims, n_ctx_orig, beta_slow, freq_base));
dims[0] = std::max<float>(0, start);
dims[1] = std::min<float>(n_dims - 1, end);
}
float rope_yarn_ramp(const float low, const float high, const int i0) {
const float y = (i0 / 2 - low) / std::max<float>(0.001f, high - low);
return 1 - std::min<float>(1, std::max<float>(0, y));
}
// YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn
// MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng.
void rope_yarn(float theta_extrap,
float freq_scale,
float corr_dims[2],
int64_t i0,
float ext_factor,
float mscale,
float * cos_theta,
float * sin_theta) {
// Get n-d rotational scaling corrected for extrapolation
float theta_interp = freq_scale * theta_extrap;
float theta = theta_interp;
if (ext_factor != 0.0f) {
float ramp_mix = rope_yarn_ramp(corr_dims[0], corr_dims[1], i0) * ext_factor;
theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;
// Get n-d magnitude scaling corrected for interpolation
mscale *= 1.0f + 0.1f * logf(1.0f / freq_scale);
}
*cos_theta = cosf(theta) * mscale;
*sin_theta = sinf(theta) * mscale;
}
void rope_cache_init(float theta_base,
float freq_scale,
const float * freq_factors,
float corr_dims[2],
int64_t ne0,
float ext_factor,
float mscale,
float * cache,
float sin_sign,
float theta_scale) {
// ref: https://github.com/jquesnelle/yarn/blob/master/scaled_rope/LlamaYaRNScaledRotaryEmbedding.py
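    // cache[2k] and cache[2k + 1] hold the cos/sin pair for rotary dimension pair k; theta advances
    // geometrically by theta_scale between pairs, optionally divided by the per-pair frequency factor.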
float theta = theta_base;
for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
const float ff = freq_factors ? freq_factors[i0 / 2] : 1.0f;
rope_yarn(theta / ff, freq_scale, corr_dims, i0, ext_factor, mscale, &cache[i0 + 0], &cache[i0 + 1]);
cache[i0 + 1] *= sin_sign;
theta *= theta_scale;
}
}
void mrope_cache_init(float theta_base_t,
float theta_base_h,
float theta_base_w,
float theta_base_e,
const int sections[4],
bool indep_sects,
float freq_scale,
const float * freq_factors,
float corr_dims[2],
int64_t ne0,
float ext_factor,
float mscale,
float * cache,
float sin_sign,
float theta_scale) {
// ref: https://github.com/jquesnelle/yarn/blob/master/scaled_rope/LlamaYaRNScaledRotaryEmbedding.py
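    // sections = {t, h, w, e}: each rotary pair is assigned to the temporal, height, width or extra
    // (vision) position stream based on its index modulo the combined section span.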
float theta_t = theta_base_t;
float theta_h = theta_base_h;
float theta_w = theta_base_w;
float theta_e = theta_base_e; // extra position id for vision encoder
int sect_dims = sections[0] + sections[1] + sections[2] + sections[3];
int sec_w = sections[1] + sections[0];
int sec_e = sections[2] + sec_w;
for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
const float ff = freq_factors ? freq_factors[i0 / 2] : 1.0f;
int sector = (i0 / 2) % sect_dims;
if (indep_sects) {
// compute theta independently for each dim sections
// (i.e. reset corresponding theta when `i0` go from one section to another)
if (sector == 0) {
theta_t = theta_base_t;
} else if (sector == sections[0]) {
theta_h = theta_base_h;
} else if (sector == sec_w) {
theta_w = theta_base_w;
} else if (sector == sec_e) {
theta_e = theta_base_e;
}
}
float theta = theta_t;
if (sector >= sections[0] && sector < sec_w) {
theta = theta_h;
} else if (sector >= sec_w && sector < sec_w + sections[2]) {
theta = theta_w;
} else if (sector >= sec_w + sections[2]) {
theta = theta_e;
}
rope_yarn(theta / ff, freq_scale, corr_dims, i0, ext_factor, mscale, &cache[i0 + 0], &cache[i0 + 1]);
cache[i0 + 1] *= sin_sign;
theta_t *= theta_scale;
theta_w *= theta_scale;
theta_h *= theta_scale;
theta_e *= theta_scale;
}
}
template <bool _IsNeoX, bool _IsMrope, bool _IsVision>
bool rope_impl(hexagon::tensor * out, hexagon::compute_params * params) {
const auto * src0 = out->get_src(0);
const auto * src1 = out->get_src(1);
const auto * src2 = out->get_src(2);
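    // Op params follow the host-side rope layout: [1] n_dims, [2] mode, [4] n_ctx_orig,
    // [5..10] freq_base/freq_scale/ext_factor/attn_factor/beta_fast/beta_slow, [11..14] mrope sections.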
const int n_dims = out->get_op_param<int32_t>(1);
const int n_ctx_orig = out->get_op_param<int32_t>(4);
const int sections[4] = {
out->get_op_param<int32_t>(11),
out->get_op_param<int32_t>(12),
out->get_op_param<int32_t>(13),
out->get_op_param<int32_t>(14),
};
const float freq_base = out->get_op_param<float>(5);
const float freq_scale = out->get_op_param<float>(6);
const float ext_factor = out->get_op_param<float>(7);
const float attn_factor = out->get_op_param<float>(8);
const float beta_fast = out->get_op_param<float>(9);
const float beta_slow = out->get_op_param<float>(10);
const float theta_scale = powf(freq_base, -2.0f / n_dims);
float corr_dims[2];
rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);
if (_IsMrope && sections[0] <= 0 && sections[1] <= 0 && sections[2] <= 0) {
DEVICE_LOG_ERROR("[ROPE]invalid sections for MROPE: %d, %d, %d\n", sections[0], sections[1], sections[2]);
return false; // invalid sections for MROPE
}
if (n_dims % 2 || (_IsVision && n_dims != out->get_ne(0) / 2)) {
DEVICE_LOG_ERROR("[ROPE]invalid n_dims for vision ROPE: %d, expected: %lld\n", n_dims, out->get_ne(0) / 2);
return false; // invalid n_dims for vision ROPE
}
// cache size is (ne0 + CACHE_LINE_SIZE_F32)
const size_t total_cache_size = hexagon::get_aligned_size(out->get_ne(0) * sizeof(float));
auto * cache_ptr = params->get_vtcm_cache(total_cache_size);
if (!cache_ptr) {
DEVICE_LOG_ERROR("[ROPE]Failed to allocate VTCM cache for flash_attn: %zu bytes\n", total_cache_size);
return false; // failed to allocate cache
}
const float * freq_factors = nullptr;
if (src2 != nullptr) {
if (src2->get_type() != NPU_DATA_TYPE_F32 || src2->get_ne(0) < n_dims / 2) {
DEVICE_LOG_ERROR("[ROPE]src2 type is not F32 or F16: %s\n", hexagon::get_type_name(src2->get_type()));
return false; // unsupported src2 type
}
freq_factors = src2->get_read_buffer_as<float>();
}
const int64_t total_planes = out->get_ne(3) * out->get_ne(2);
const auto start_end_plane =
params->get_work_slice(total_planes); // TODO: figure out how to use row slice for inplace rope
if (start_end_plane.first >= start_end_plane.second) {
return true;
}
DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_WITH_MULTI_SUB_PROC(out, params->get_thread_index(), rope);
const float sin_sign = 1.0f;
const int32_t * pos = src1->get_read_buffer_as<int32_t>();
const uint8_t * src0_data_ptr = src0->get_read_buffer();
uint8_t * dst_data_ptr = out->get_write_buffer();
for (int64_t ip = start_end_plane.first; ip < start_end_plane.second; ip++) {
int64_t i3 = ip / out->get_ne(2); // batch
int64_t i2 = ip % out->get_ne(2); // seq-len
float * cache = reinterpret_cast<float *>(cache_ptr);
if constexpr (!_IsMrope) {
DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_ADD_ONE_SUB_PROC(rope, 0, cache);
const int64_t p = pos[i2];
rope_cache_init(p, freq_scale, freq_factors, corr_dims, out->get_ne(0), ext_factor, attn_factor, cache,
sin_sign, theta_scale);
} else {
DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_ADD_ONE_SUB_PROC(rope, 0, cache);
const int64_t p_t = pos[i2];
const int64_t p_h = pos[i2 + out->get_ne(2)];
const int64_t p_w = pos[i2 + out->get_ne(2) * 2];
const int64_t p_e = pos[i2 + out->get_ne(2) * 3];
mrope_cache_init(p_t, p_h, p_w, p_e, sections, _IsVision, freq_scale, freq_factors, corr_dims,
out->get_ne(0), ext_factor, attn_factor, cache, sin_sign, theta_scale);
}
DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_ADD_ONE_SUB_PROC(rope, 1, loop);
const uint8_t * src0_plane = src0_data_ptr + i3 * src0->get_nb(3) + i2 * src0->get_nb(2);
uint8_t * dst_plane = dst_data_ptr + i3 * out->get_nb(3) + i2 * out->get_nb(2);
for (int64_t i1 = 0; i1 < out->get_ne(1); i1++) { // attn-heads
const uint8_t * src0_row = src0_plane + i1 * src0->get_nb(1);
uint8_t * dst_row = dst_plane + i1 * out->get_nb(1);
if constexpr (_IsNeoX || _IsMrope) {
if constexpr (_IsVision) {
for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
const int64_t ic = i0 / 2;
const float cos_theta = cache[i0 + 0];
const float sin_theta = cache[i0 + 1];
const float * const src = (float *) (src0_row + ic * src0->get_nb(0));
float * dst_data = (float *) (dst_row + ic * out->get_nb(0));
const float x0 = src[0];
const float x1 = src[n_dims];
dst_data[0] = x0 * cos_theta - x1 * sin_theta;
dst_data[n_dims] = x0 * sin_theta + x1 * cos_theta;
}
} else {
for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
const int64_t ic = i0 / 2;
const float cos_theta = cache[i0 + 0];
const float sin_theta = cache[i0 + 1];
const float * const src = (float *) (src0_row + ic * src0->get_nb(0));
float * dst_data = (float *) (dst_row + ic * out->get_nb(0));
const float x0 = src[0];
const float x1 = src[n_dims / 2];
dst_data[0] = x0 * cos_theta - x1 * sin_theta;
dst_data[n_dims / 2] = x0 * sin_theta + x1 * cos_theta;
}
}
} else {
for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
const float cos_theta = cache[i0 + 0];
const float sin_theta = cache[i0 + 1];
const float * const src = (float *) (src0_row + i0 * src0->get_nb(0));
float * dst_data = (float *) (dst_row + i0 * out->get_nb(0));
const float x0 = src[0];
const float x1 = src[1];
dst_data[0] = x0 * cos_theta - x1 * sin_theta;
dst_data[1] = x0 * sin_theta + x1 * cos_theta;
}
}
if constexpr (_IsVision) {
for (int64_t i0 = n_dims; i0 < out->get_ne(0); i0 += 2) {
const int64_t ic = i0 / 2;
const float cos_theta = cache[i0 + 0];
const float sin_theta = cache[i0 + 1];
const float * const src = (float *) (src0_row + ic * src0->get_nb(0));
float * dst_data = (float *) (dst_row + ic * out->get_nb(0));
const float x0 = src[0];
const float x1 = src[n_dims];
dst_data[0] = x0 * cos_theta - x1 * sin_theta;
dst_data[n_dims] = x0 * sin_theta + x1 * cos_theta;
}
} else {
// fill the remain channels with data from src tensor
hexagon::vec_cpy_f32(reinterpret_cast<const float *>(src0_row + n_dims * src0->get_nb(0)),
reinterpret_cast<float *>(dst_row + n_dims * out->get_nb(0)),
out->get_ne(0) - n_dims);
}
}
}
out->release_write_buffer();
return true;
}
typedef bool (*rope_impl_func)(hexagon::tensor * out, hexagon::compute_params * params);
constexpr const rope_impl_func kRopeImplFuncs[8] = {
rope_impl<false, false, false>, // IsNotNeoX, IsNotMrope, IsNotVision
rope_impl<false, false, true>, // IsNotNeoX, IsNotMrope, IsVision
rope_impl<false, true, false>, // IsNotNeoX, IsMrope, IsNotVision
rope_impl<false, true, true>, // IsNotNeoX, IsMrope, IsVision
rope_impl<true, false, false>, // IsNeoX, IsNotMrope, IsNotVision
rope_impl<true, false, true>, // IsNeoX, IsNotMrope, IsVision
rope_impl<true, true, false>, // IsNeoX, IsMrope, IsNotVision
rope_impl<true, true, true>, // IsNeoX, IsMrope, IsVision
};
} // namespace
namespace hexagon {
bool rope_f32(tensor * out, compute_params * params) {
const int mode = out->get_op_param<int32_t>(2);
const bool is_neox = mode & NPU_ROPE_TYPE_NEOX;
const bool is_mrope = mode & NPU_ROPE_TYPE_MROPE; // ggml_rope_multi, multimodal rotary position embedding
const bool is_vision = mode == NPU_ROPE_TYPE_VISION;
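    // Pick one of the 8 template instantiations in kRopeImplFuncs: bit 2 = NeoX, bit 1 = MRoPE, bit 0 = vision.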
size_t impl_index = is_neox ? 4 : 0;
impl_index += is_mrope ? 2 : 0;
impl_index += is_vision ? 1 : 0;
if (impl_index >= sizeof(kRopeImplFuncs) / sizeof(kRopeImplFuncs[0])) {
DEVICE_LOG_ERROR("[ROPE]invalid impl_index: %zu\n", impl_index);
return false; // invalid impl index
}
return kRopeImplFuncs[impl_index](out, params);
}
bool is_rope_supported(const npu_device_tensor_op_spec * op_spec,
const npu_device_tensor_spec * dst,
const npu_device_tensor_spec * srcs,
size_t src_len) {
const auto op = op_spec->op;
if (op != NPU_OP_ROPE) {
DEVICE_LOG_DEBUG("[%s]op is not ROPE\n", op_get_name(op));
return false;
}
if (src_len < 2 || !dst || !srcs) {
// freq can be optional, but we require at least 2 srcs: src0 and src1
DEVICE_LOG_DEBUG("[%s]invalid dst or srcs\n", op_get_name(op));
return false;
}
if (dst->type != NPU_DATA_TYPE_F32) {
DEVICE_LOG_DEBUG("[%s]dst type is not F32: %s\n", op_get_name(op), get_type_name(dst->type));
return false; // add more dst type if needed
}
const auto & src0 = srcs[0];
if (src0.type != dst->type) {
DEVICE_LOG_DEBUG("[%s]src0 type is not the same as dst type: %s vs %s\n", op_get_name(op),
get_type_name(src0.type), get_type_name(dst->type));
return false; // unsupported src0 type
}
const auto & src1 = srcs[1];
if (src1.type != NPU_DATA_TYPE_I32) {
DEVICE_LOG_DEBUG("[%s]src1 type is not I32: %s\n", op_get_name(op), get_type_name(src1.type));
return false; // unsupported src1 type
}
if (src_len > 2) {
const auto & src2 = srcs[2];
if (src2.type != NPU_DATA_TYPE_F32) {
DEVICE_LOG_DEBUG("[%s]src2 type is not F32: %s\n", op_get_name(op), get_type_name(src2.type));
return false; // unsupported src2 type
}
DEVICE_LOG_DEBUG("[%s]freq is present\n", op_get_name(op));
}
if (!is_same_shape(src0, *dst)) {
DEVICE_LOG_DEBUG("[%s]src0 and dst have different shape\n", op_get_name(op));
return false;
}
// TODO: check the params for ROPE operation
    return true;
}
bool is_rope_required_sync(npu_device_tensor_op prev_op,
const npu_device_ne_type & prev_ne,
npu_device_tensor_op op,
const npu_device_ne_type & ne) {
NPU_UNUSED(prev_op);
NPU_UNUSED(prev_ne);
NPU_UNUSED(op);
NPU_UNUSED(ne);
return false;
}
} // namespace hexagon

View File

@ -0,0 +1,17 @@
#pragma once
#include "op_types.hpp"
namespace hexagon {
bool rope_f32(tensor * out, compute_params * params);
bool is_rope_supported(const npu_device_tensor_op_spec * op_spec,
const npu_device_tensor_spec * dst,
const npu_device_tensor_spec * srcs,
size_t src_len);
bool is_rope_required_sync(npu_device_tensor_op prev_op,
const npu_device_ne_type & prev_ne,
npu_device_tensor_op op,
const npu_device_ne_type & ne);
} // namespace hexagon

View File

@ -0,0 +1,155 @@
#include "op_rows.hpp"
#include "type_traits.hpp"
namespace {
template <typename idx_t> void set_rows_impl(hexagon::tensor * out, hexagon::compute_params * params) {
auto * src0 = out->get_src(0);
auto * src1 = out->get_src(1);
const auto total_rows = src0->get_ne(3) * src0->get_ne(2) * src0->get_ne(1);
const auto start_end = params->get_work_slice(total_rows);
if (start_end.first >= start_end.second) {
return;
}
uint8_t * dst_ptr = out->get_write_buffer();
if (!dst_ptr) {
DEVICE_LOG_ERROR("set_rows_impl: dst_ptr is not writable, tensor: %p, type: %s\n", (void *) out,
hexagon::get_type_name(out->get_type()));
return;
}
const uint8_t * src0_ptr = src0->get_read_buffer(true); // TODO: avoid invalidation
const uint8_t * src1_ptr = src1->get_read_buffer(true); // TODO: avoid invalidation
const size_t rows_per_cube = src0->get_ne(2) * src0->get_ne(1);
DEVICE_SCOPED_OP_PERFORMANCE_TRACKER(out, params->get_thread_index());
auto from_float = hexagon::get_type_traits(out->get_type()).from_float;
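    // For each source row (i01, i02, i03), src1 supplies the destination row index i1 (broadcast
    // over the outer dims); the row is converted with the destination type's from_float and stored.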
for (size_t ir = start_end.first; ir < size_t(start_end.second); ++ir) {
const size_t i03 = ir / rows_per_cube;
const size_t i02 = ir / src0->get_ne(1) - i03 * src0->get_ne(2);
const size_t i01 = ir % src0->get_ne(1);
const size_t i12 = i03 % src1->get_ne(2);
const size_t i11 = i02 % src1->get_ne(1);
const size_t i10 = i01;
const size_t i1 = *reinterpret_cast<const idx_t *>(src1_ptr + i10 * src1->get_nb(0) + i11 * src1->get_nb(1) +
i12 * src1->get_nb(2));
from_float(reinterpret_cast<const float *>(src0_ptr + i01 * src0->get_nb(1) + i02 * src0->get_nb(2) +
i03 * src0->get_nb(3)),
dst_ptr + i1 * out->get_nb(1) + i02 * out->get_nb(2) + i03 * out->get_nb(3),
size_t(src0->get_ne(0)));
}
out->release_write_buffer(); // mark the output tensor as modified
}
} // namespace
namespace hexagon {
bool get_rows_f32(tensor * out, compute_params * params) {
    NPU_UNUSED(out);
    NPU_UNUSED(params);
    // TODO: implement get_rows
    return false;
}
bool set_rows_generic(tensor * out, compute_params * params) {
if (!out) {
return false;
}
auto * src0 = out->get_src(0);
auto * src1 = out->get_src(1);
if (!src0 || !src1) {
DEVICE_LOG_ERROR("set_rows_generic: missing src0 or src1\n");
return false;
}
switch (src1->get_type()) {
case NPU_DATA_TYPE_I32:
set_rows_impl<int32_t>(out, params);
break;
case NPU_DATA_TYPE_I64:
set_rows_impl<int64_t>(out, params);
break;
default:
DEVICE_LOG_ERROR("set_rows_generic: unsupported src1 type: %s\n", hexagon::get_type_name(src1->get_type()));
return false;
}
return true;
}
bool is_rows_supported(const npu_device_tensor_op_spec * op_spec,
const npu_device_tensor_spec * dst,
const npu_device_tensor_spec * srcs,
size_t src_len) {
const auto op = op_spec->op;
if (op != NPU_OP_GET_ROWS && op != NPU_OP_SET_ROWS) {
DEVICE_LOG_DEBUG("[%s]unsupported\n", hexagon::op_get_name(op));
return false;
}
if (src_len < 2) {
DEVICE_LOG_DEBUG("[%s]invalid src_len: %zu\n", hexagon::op_get_name(op), src_len);
return false;
}
const auto & src0 = srcs[0];
const auto & src1 = srcs[1];
if (op == NPU_OP_GET_ROWS) {
if (dst->ne[0] != src0.ne[0]) {
DEVICE_LOG_DEBUG("[%s]dst.ne[0] and src0.ne[0] not match: %ld vs %ld\n", hexagon::op_get_name(op),
(long) dst->ne[0], (long) src0.ne[0]);
return false;
}
if (dst->type != src0.type) {
DEVICE_LOG_DEBUG("[%s]dst.type and src0.type mismatch: %s vs %s\n", hexagon::op_get_name(op),
hexagon::get_type_name(dst->type), hexagon::get_type_name(src0.type));
return false;
}
// TODO: remove this limitation
return false;
} else {
// NPU_OP_SET_ROWS
if (dst->ne[0] != src0.ne[0] || dst->ne[2] != src0.ne[2] || dst->ne[3] != src0.ne[3]) {
DEVICE_LOG_DEBUG("[%s]dst.ne[0], src0.ne[0] and src0.ne[2], src0.ne[3] not match: %ld vs %ld, %ld, %ld\n",
hexagon::op_get_name(op), (long) dst->ne[0], (long) src0.ne[0], (long) src0.ne[2],
(long) src0.ne[3]);
return false;
}
if (src0.type != NPU_DATA_TYPE_F32) {
DEVICE_LOG_DEBUG("[%s]src0.type is not F32: %s\n", hexagon::op_get_name(op),
hexagon::get_type_name(src0.type));
return false;
}
if (src1.type != NPU_DATA_TYPE_I32 && src1.type != NPU_DATA_TYPE_I64) {
DEVICE_LOG_DEBUG("[%s]src1.type is not I32 or I64: %s\n", hexagon::op_get_name(op),
hexagon::get_type_name(src1.type));
return false;
}
if (dst->type != src0.type && !get_type_traits(dst->type).from_float) {
DEVICE_LOG_DEBUG("[%s]dst.from_float is null: %s\n", hexagon::op_get_name(op),
hexagon::get_type_name(dst->type));
return false;
}
}
return true;
}
bool is_rows_required_sync(npu_device_tensor_op prev_op,
const npu_device_ne_type & prev_ne,
npu_device_tensor_op op,
const npu_device_ne_type & ne) {
    NPU_UNUSED(prev_op);
    NPU_UNUSED(prev_ne);
    NPU_UNUSED(op);
    NPU_UNUSED(ne);
    // TODO: implement is_rows_required_sync
    return false;
}
} // namespace hexagon

View File

@ -0,0 +1,19 @@
#pragma once
#include "op_types.hpp"
namespace hexagon {
bool get_rows_f32(tensor * out, compute_params * params);
bool set_rows_generic(tensor * out, compute_params * params);
bool is_rows_supported(const npu_device_tensor_op_spec * op_spec,
const npu_device_tensor_spec * dst,
const npu_device_tensor_spec * srcs,
size_t src_len);
bool is_rows_required_sync(npu_device_tensor_op prev_op,
const npu_device_ne_type & prev_ne,
npu_device_tensor_op op,
const npu_device_ne_type & ne);
} // namespace hexagon

View File

@ -0,0 +1,91 @@
#pragma once
#include "hexagon_npu.h"
#include "tensor.hpp"
#include "thread_pool.hpp"
#include "util.hpp"
#include "vec_ops.hpp"
#include <hexagon_types.h>
#include <algorithm>
#include <cstdint>
#include <memory>
#include <utility>
namespace hexagon {
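// Splits `total` items into contiguous per-thread slices; the first (total % tcnt) threads get one
// extra item. For example total = 10, tcnt = 4 yields [0,3), [3,6), [6,8), [8,10).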
inline constexpr std::pair<int64_t, int64_t> get_thread_work_slice(int64_t total, size_t tidx, size_t tcnt) {
if (total <= 0 || tidx >= tcnt) {
return { 0, 0 }; // No work for this thread
}
const auto elements_per_thread = total / tcnt;
const auto remainder = total % tcnt;
int64_t start = 0;
int64_t end = 0;
if (tidx < remainder) {
// First 'remainder' threads get one extra item
start = tidx * (elements_per_thread + 1);
end = start + elements_per_thread + 1;
} else {
// Remaining threads get the base number of elements
start = remainder * (elements_per_thread + 1) + (tidx - remainder) * elements_per_thread;
end = start + elements_per_thread;
}
return { start, std::min(end, total) };
}
struct compute_params {
default_thread_pool::thread_params * const thread_params;
const float * f16_to_f32_table;
uint8_t * get_vtcm_cache(size_t size) { return thread_params->get_vtcm_cache(size); }
std::pair<int64_t, int64_t> get_work_slice(int64_t total) const {
return get_thread_work_slice(total, thread_params->tidx, thread_params->tcnt);
}
size_t get_vtcm_quota_size() const { return thread_params->vtcm_quota_size; }
size_t get_thread_count() const { return thread_params->tcnt; }
size_t get_thread_index() const { return thread_params->tidx; }
bool initiate_dma_row_transfer(const uint8_t * src, uint8_t * dst, size_t size) {
return thread_params->initiate_dma_row_transfer(src, dst, size);
}
bool initiate_dma_row_transfer(const uint8_t * src0,
uint8_t * dst0,
const uint8_t * src1,
uint8_t * dst1,
size_t size) {
return thread_params->initiate_dma_row_transfer(src0, dst0, src1, dst1, size);
}
bool initiate_dma_plane_transfer(const uint8_t * src,
uint8_t * dst,
size_t width,
size_t height,
size_t src_stride,
size_t dst_stride) {
return thread_params->initiate_dma_plane_transfer(src, dst, width, height, src_stride, dst_stride);
}
void wait_for_dma() { thread_params->wait_for_dma(); }
};
typedef bool (*compute_func_type)(tensor * dst, compute_params * params);
typedef bool (*op_is_supported_func_type)(const npu_device_tensor_op_spec * op_spec,
const npu_device_tensor_spec * dst,
const npu_device_tensor_spec * srcs,
size_t src_len);
typedef bool (*op_required_sync_func_type)(npu_device_tensor_op prev_op,
const npu_device_ne_type & prev_ne,
npu_device_tensor_op op,
const npu_device_ne_type & ne);
} // namespace hexagon

View File

@ -0,0 +1,156 @@
#pragma once
#include "hexagon_npu.h"
#include "util.hpp"
#include <HAP_mem.h>
#include <qurt.h>
#include <atomic>
namespace hexagon {
constexpr const size_t kMaxTensorSrc = DEVICE_TENSOR_MAX_SRC;
constexpr const size_t kMaxParamsCount = DEVICE_TENSOR_MAX_OP_PARAMS;
class tensor {
public:
explicit tensor(const npu_device_tensor_config & info) noexcept : _info(info) {
uint64 phy_address = 0;
void * mmap_address = nullptr;
auto ret = HAP_mmap_get(_info.buffer_fd, &mmap_address, &phy_address);
if (ret != AEE_SUCCESS) {
DEVICE_LOG_ERROR("Failed to mmap tensor buffer: %d\n", (int) ret);
return;
}
_data = static_cast<uint8_t *>(mmap_address);
DEVICE_LOG_DEBUG("tensor(%p[%ldx%ldx%ldx%ld]), fd: %d, offset: %zu, mmap_addr: %p, phy_addr: 0x%lx\n",
(void *) this,
(long) _info.ne[0],
(long) _info.ne[1],
(long) _info.ne[2],
(long) _info.ne[3],
(int) _info.buffer_fd,
(size_t) _info.offset,
(void *) mmap_address,
(long) phy_address);
}
~tensor() noexcept {
auto ret = HAP_mmap_put(_info.buffer_fd);
if (ret != AEE_SUCCESS) {
DEVICE_LOG_ERROR("Failed to unmap tensor buffer: %d\n", (int) ret);
}
DEVICE_LOG_DEBUG("~tensor(%p) fd: %d\n", (void *) this, _info.buffer_fd);
}
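    // The buffer is host-shared memory mapped via HAP_mmap, so explicit dcache flush/invalidate is
    // required to keep CPU-side writes and NPU-side reads (and vice versa) coherent.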
void flush() const {
if (_data) {
qurt_mem_cache_clean(
(qurt_addr_t) (_data + _info.offset), (qurt_size_t) _info.size, QURT_MEM_CACHE_FLUSH, QURT_MEM_DCACHE);
}
}
void invalidate() const {
if (_data) {
qurt_mem_cache_clean((qurt_addr_t) (_data + _info.offset),
(qurt_size_t) _info.size,
QURT_MEM_CACHE_FLUSH_INVALIDATE_ALL,
QURT_MEM_DCACHE);
}
}
void update_config(const npu_device_tensor_update_config & config) {
static_assert(sizeof(_op_params) == sizeof(config.params), "op params size mismatch");
_op_type = config.op;
memcpy(_op_params, config.params, sizeof(_op_params));
for (size_t i = 0; i < DEVICE_TENSOR_MAX_SRC; ++i) {
auto src_handle = config.src_handles[i];
_src[i] = (src_handle != npu_device_INVALID_DEVICE_TENSOR_HANDLE ? reinterpret_cast<tensor *>(src_handle) :
nullptr);
}
}
tensor * get_src(size_t index) const {
if (index >= kMaxTensorSrc) {
return nullptr;
}
return _src[index];
}
const npu_device_tensor_config & get_info() const { return _info; }
const int64_t get_ne(size_t index) const { return _info.ne[index]; }
const size_t get_nb(size_t index) const { return _info.nb[index]; }
const bool is_permuted() const {
// Check if the tensor is permuted by comparing the nb values
return is_transposed_or_permuted(_info.nb);
}
npu_device_tensor_op get_op() const { return _op_type; }
template <typename _TyParam> const _TyParam get_op_param(size_t index) const {
static_assert(sizeof(_TyParam) <= sizeof(_op_params), "_op_param type size exceeds op params size");
        if (sizeof(_TyParam) * (index + 1) > sizeof(_op_params)) {
return 0;
}
return reinterpret_cast<const _TyParam *>(_op_params)[index];
}
const int32_t * get_op_params() const { return _op_params; }
const size_t get_op_param_count() const { return kMaxParamsCount; }
npu_device_tensor_data_type get_type() const { return _info.type; }
const uint8_t * get_read_buffer(const bool force_invalidate = false) const {
if (force_invalidate || (!_info.is_constant && _has_modified)) {
invalidate();
const_cast<tensor *>(this)->_has_modified = false; // TODO: avoid const_cast
}
return _data + _info.offset;
}
template <typename _Ty> const _Ty * get_read_buffer_as() const {
const auto * buffer = get_read_buffer();
if (!buffer) {
return nullptr;
}
return reinterpret_cast<const _Ty *>(buffer);
}
uint8_t * get_write_buffer() const {
if (_info.is_constant) {
DEVICE_LOG_ERROR("Attempt to write to a constant tensor: %p\n", (void *) this);
return nullptr; // Do not allow writing to constant tensors
}
return _data + _info.offset;
}
void release_write_buffer() { _has_modified = true; }
bool is_valid() const { return _data != nullptr; }
private:
npu_device_tensor_config _info = {};
npu_device_tensor_op _op_type = NPU_OP_COUNT;
int32_t _op_params[kMaxParamsCount] = {};
tensor * _src[kMaxTensorSrc] = {};
uint8_t * _data = nullptr;
std::atomic_bool _has_modified = false;
DISABLE_COPY_AND_MOVE(tensor);
};
} // namespace hexagon

View File

@ -0,0 +1,282 @@
#pragma once
#include "dma_transfer.hpp"
#include "util.hpp"
#include "vtcm_mem.hpp"
#include <qurt.h>
#include <array>
#include <atomic>
#include <cstdint>
#include <memory>
#include <string>
namespace hexagon {
constexpr const size_t kMaxThreadCount = 4;
constexpr const size_t kDefaultStackSize = NPU_THREAD_STACK_SIZE; // 64KB
template <size_t _stack_size> class qurt_thread {
public:
typedef void (*qurt_thread_func_type)(qurt_thread * thread, void * arg);
explicit qurt_thread(const std::string & thread_name,
qurt_thread_func_type thread_func,
void * arg,
unsigned short priority) {
DEVICE_LOG_DEBUG("qurt_thread.create: %s\n", thread_name.c_str());
qurt_thread_attr_init(&_attributes);
qurt_thread_attr_set_name(&_attributes, (char *) thread_name.c_str());
qurt_thread_attr_set_stack_addr(&_attributes, _stack);
qurt_thread_attr_set_stack_size(&_attributes, _stack_size);
qurt_thread_attr_set_priority(&_attributes, priority);
qurt_thread_attr_set_bus_priority(&_attributes, QURT_THREAD_BUS_PRIO_ENABLED);
_func = thread_func;
_arg = arg;
auto ret = qurt_thread_create(
&_tid, &_attributes, reinterpret_cast<void (*)(void *)>(&qurt_thread::thread_func_impl), (void *) this);
if (ret != QURT_EOK) {
DEVICE_LOG_ERROR("Failed to create thread: %d\n", (int) ret);
_func = nullptr;
_arg = nullptr;
return;
}
DEVICE_LOG_DEBUG("qurt_thread.created: %s, id: %d\n", thread_name.c_str(), (int) _tid);
}
~qurt_thread() {
DEVICE_LOG_DEBUG("qurt_thread.destroy: %d\n", (int) _tid);
int thread_exit_code = QURT_EOK;
auto ret = qurt_thread_join(_tid, &thread_exit_code);
if (ret != QURT_EOK && ret != QURT_ENOTHREAD) {
DEVICE_LOG_ERROR("Failed to join thread: %d\n", (int) ret);
return;
}
if (thread_exit_code != QURT_EOK) {
DEVICE_LOG_ERROR("Thread exit code: %d\n", (int) thread_exit_code);
}
}
bool is_valid() const { return _tid != 0 && _func != nullptr; }
private:
static void thread_func_impl(qurt_thread * thread) {
if (thread->_func) {
thread->_func(thread, thread->_arg);
}
qurt_thread_exit(QURT_EOK);
}
uint8_t _stack[_stack_size] = {};
qurt_thread_t _tid;
qurt_thread_attr_t _attributes;
qurt_thread_func_type _func = nullptr;
void * _arg = nullptr;
DISABLE_COPY_AND_MOVE(qurt_thread);
};
using qurt_thread_ptr = std::unique_ptr<qurt_thread<kDefaultStackSize>>;
template <size_t _ThreadCount> class thread_pool {
static_assert(_ThreadCount > 1, "Thread count must be greater than 1");
constexpr const static size_t kMaxThreadCount = _ThreadCount;
constexpr const static size_t kMaxSubThreadCount = _ThreadCount - 1;
public:
typedef qurt_thread<kDefaultStackSize> thread_type;
struct thread_params {
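        // Per-thread scratch state: a VTCM cache carved from this thread's quota plus its own DMA
        // transfer object, so workers never share scratch memory within a task.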
size_t tidx;
const size_t tcnt = kMaxThreadCount;
thread_pool<kMaxThreadCount> * pool = nullptr;
size_t vtcm_quota_size;
std::unique_ptr<vtcm_mem> vtcm_cache;
hexagon::dma::dma_transfer dma;
void init_vtcm_cache() { vtcm_cache = std::make_unique<vtcm_mem>(vtcm_quota_size, false); }
uint8_t * get_vtcm_cache(size_t size) {
if (!vtcm_cache || vtcm_cache->get_size() < size) {
DEVICE_SCOPED_PERFORMANCE_TRACKER("[thread_params]get_vtcm_cache, size: %zu, tidx: %zu", size, tidx);
vtcm_cache.reset(); // reset the cache to create a new one
vtcm_cache = std::make_unique<vtcm_mem>(size, false);
}
if (!vtcm_cache->is_valid()) {
return nullptr;
}
return vtcm_cache->get_mem();
}
bool initiate_dma_row_transfer(const uint8_t * src, uint8_t * dst, size_t size) {
return dma.submit1d(src, dst, size);
}
bool initiate_dma_row_transfer(const uint8_t * src0,
uint8_t * dst0,
const uint8_t * src1,
uint8_t * dst1,
size_t size) {
return dma.submit1d(src0, dst0, src1, dst1, size);
}
bool initiate_dma_plane_transfer(const uint8_t * src,
uint8_t * dst,
size_t width,
size_t height,
size_t src_stride,
size_t dst_stride) {
return dma.submit2d(src, dst, width, height, src_stride, dst_stride);
}
void wait_for_dma() { dma.wait(); }
};
typedef void (*task_type)(thread_pool * pool, thread_params * param, void * arg);
thread_pool() {
const auto quota_size = hexagon::vtcm_mem::get_avail_block_size() / kMaxThreadCount;
for (size_t i = 0; i < kMaxThreadCount; ++i) {
auto & thread_param = _thread_params[i];
thread_param.tidx = i;
thread_param.vtcm_quota_size = quota_size;
thread_param.pool = this;
thread_param.init_vtcm_cache();
}
qurt_barrier_init(&_pending, kMaxSubThreadCount + 1);
qurt_barrier_init(&_completed, kMaxSubThreadCount + 1);
const auto priority = qurt_thread_get_priority(qurt_thread_get_id());
std::string thread_name_base = "thread_pool_";
for (size_t i = 0; i < kMaxSubThreadCount; ++i) {
auto thread = std::make_unique<thread_type>(
thread_name_base + std::to_string(i), &thread_pool::thread_func_impl, &_thread_params[i + 1], priority);
if (!thread->is_valid()) {
DEVICE_LOG_ERROR("Failed to create thread: %zu\n", i);
// destroy all barriers and threads at destructor
return;
}
_threads[i] = std::move(thread);
}
DEVICE_LOG_DEBUG("thread_pool.created: %zu, vtcm_quota_size: %zu\n", kMaxSubThreadCount, quota_size);
}
~thread_pool() {
DEVICE_LOG_DEBUG("thread_pool.destroy\n");
_thread_exit = true;
qurt_barrier_wait(&_pending); // release all task threads
for (auto & thread : _threads) {
thread.reset();
}
qurt_barrier_destroy(&_completed);
qurt_barrier_destroy(&_pending);
}
bool sync_execute(task_type task, void * arg) {
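        // Two-barrier handshake: crossing _pending releases the workers to run _task, the main
        // thread executes slice 0 itself, and _completed is crossed once every worker is done.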
if (!task) {
DEVICE_LOG_ERROR("Invalid task\n");
return false;
}
#ifdef GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING
_task_begin_cycles = HAP_perf_get_qtimer_count();
#endif
_task = task;
_arg = arg;
qurt_barrier_wait(&_pending);
task(this, &_thread_params[0], arg);
DEVICE_LOG_DEBUG("main_thread.task_completed: 0\n");
qurt_barrier_wait(&_completed);
_task = nullptr;
_arg = nullptr;
#ifdef GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING
_task_begin_cycles = 0;
#endif
return true;
}
void sync_thread() { qurt_barrier_wait(&_completed); }
static size_t get_per_thread_vtcm_quota() { return vtcm_mem::get_total_size() / kMaxThreadCount; }
private:
static void thread_func_impl(thread_type * thread, void * arg) {
NPU_UNUSED(thread);
auto * param = reinterpret_cast<thread_params *>(arg);
DEVICE_LOG_DEBUG("thread_func_impl.start: %zu\n", param->tidx);
auto & pool = *(param->pool);
for (;;) {
qurt_barrier_wait(&pool._pending);
if (pool._thread_exit) {
DEVICE_LOG_DEBUG("thread_func_impl.exit: %zu\n", param->tidx);
break;
}
#ifdef GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING
auto task_begin_cycles = pool._task_begin_cycles.load();
DEVICE_LOG_WARN("[profiler]worker_thread, tidx: %zu, prepare: %lluus\n",
param->tidx,
static_cast<unsigned long long>(
HAP_perf_qtimer_count_to_us(HAP_perf_get_qtimer_count() - task_begin_cycles)));
#endif
auto task = pool._task;
if (task) {
task(param->pool, param, pool._arg);
}
DEVICE_LOG_DEBUG("thread_func_impl.task_completed: %zu\n", param->tidx);
qurt_barrier_wait(&pool._completed);
#ifdef GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING
DEVICE_LOG_WARN("[profiler]worker_thread, tidx: %zu, task_end: %lluus\n",
param->tidx,
static_cast<unsigned long long>(
HAP_perf_qtimer_count_to_us(HAP_perf_get_qtimer_count() - task_begin_cycles)));
#endif
}
DEVICE_LOG_DEBUG("thread_func_impl.end: %zu\n", param->tidx);
}
std::atomic_bool _thread_exit = false;
std::array<qurt_thread_ptr, kMaxSubThreadCount> _threads = {};
qurt_barrier_t _pending = {};
qurt_barrier_t _completed = {};
thread_params _thread_params[kMaxThreadCount] = {};
task_type _task = nullptr;
void * _arg = nullptr;
#ifdef GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING
std::atomic<uint64_t> _task_begin_cycles = 0;
#endif
DISABLE_COPY_AND_MOVE(thread_pool);
};
using default_thread_pool = thread_pool<kMaxThreadCount>;
} // namespace hexagon

View File

@ -0,0 +1,615 @@
#include "type_traits.hpp"
#include "op_types.hpp" // TODO: remove this include
#include "vec_ops.hpp"
static_assert(sizeof(npu_device_block_q4_k) ==
2 * sizeof(npu_device_fp16_t) + QUANT_K_SCALE_SIZE + QUANT_K_BLOCK_SIZE / 2,
"wrong q4_K block size/padding");
static_assert(sizeof(npu_device_block_q4_0) == sizeof(npu_device_fp16_t) + QUANT_BLOCK_SIZE / 2,
"wrong q4_0 block size/padding");
static_assert(sizeof(npu_device_block_q8_0) == sizeof(npu_device_fp16_t) + QUANT_BLOCK_SIZE,
"wrong q8_0 block size/padding");
namespace {
inline float to_float(const npu_device_fp16_t src) {
return reinterpret_cast<const __fp16 &>(src);
}
inline npu_device_fp16_t to_fp16(const float src) {
__fp16 f16_value = static_cast<__fp16>(src);
return reinterpret_cast<const npu_device_fp16_t &>(f16_value);
}
inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t * d, uint8_t * m) {
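    // Q4_K packs 8 six-bit (scale, min) pairs into the 12-byte scales array: pairs 0-3 live in the
    // low 6 bits of bytes 0-3 / 4-7, pairs 4-7 combine the nibbles of bytes 8-11 with the top 2 bits
    // of bytes 0-7.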
// TODO: use intrinsics
if (j < 4) {
*d = q[j] & 63;
*m = q[j + 4] & 63;
} else {
*d = (q[j + 4] & 0xF) | ((q[j - 4] >> 6) << 4);
*m = (q[j + 4] >> 4) | ((q[j - 0] >> 6) << 4);
}
}
inline int nearest_int(float fval) {
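    // Round-to-nearest via the float representation: 12582912.f == 1.5 * 2^23, so for |fval| < 2^22
    // the low 23 mantissa bits of the sum equal round(fval) + 0x00400000, recovered by mask/subtract.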
float val = fval + 12582912.f;
int i = reinterpret_cast<const int &>(val);
return (i & 0x007fffff) - 0x00400000;
}
float make_qkx2_quants(int n,
int nmax,
const float * x,
const float * weights,
uint8_t * L,
float * the_min,
uint8_t * Laux,
float rmin,
float rdelta,
int nstep,
bool use_mad) {
float min = x[0];
float max = x[0];
float sum_w = weights[0];
float sum_x = sum_w * x[0];
for (int i = 1; i < n; ++i) {
if (x[i] < min) {
min = x[i];
}
if (x[i] > max) {
max = x[i];
}
float w = weights[i];
sum_w += w;
sum_x += w * x[i];
}
if (min > 0) {
min = 0;
}
if (max == min) {
for (int i = 0; i < n; ++i) {
L[i] = 0;
}
*the_min = -min;
return 0.f;
}
float iscale = nmax / (max - min);
float scale = 1 / iscale;
float best_mad = 0;
for (int i = 0; i < n; ++i) {
int l = nearest_int(iscale * (x[i] - min));
L[i] = std::max<int>(0, std::min(nmax, l));
float diff = scale * L[i] + min - x[i];
diff = use_mad ? fabsf(diff) : diff * diff;
float w = weights[i];
best_mad += w * diff;
}
if (nstep < 1) {
*the_min = -min;
return scale;
}
for (int is = 0; is <= nstep; ++is) {
iscale = (rmin + rdelta * is + nmax) / (max - min);
float sum_l = 0, sum_l2 = 0, sum_xl = 0;
for (int i = 0; i < n; ++i) {
int l = nearest_int(iscale * (x[i] - min));
l = std::max<int>(0, std::min(nmax, l));
Laux[i] = l;
float w = weights[i];
sum_l += w * l;
sum_l2 += w * l * l;
sum_xl += w * l * x[i];
}
float D = sum_w * sum_l2 - sum_l * sum_l;
if (D > 0) {
float this_scale = (sum_w * sum_xl - sum_x * sum_l) / D;
float this_min = (sum_l2 * sum_x - sum_l * sum_xl) / D;
if (this_min > 0) {
this_min = 0;
this_scale = sum_xl / sum_l2;
}
float mad = 0;
for (int i = 0; i < n; ++i) {
float diff = this_scale * Laux[i] + this_min - x[i];
diff = use_mad ? fabsf(diff) : diff * diff;
float w = weights[i];
mad += w * diff;
}
if (mad < best_mad) {
for (int i = 0; i < n; ++i) {
L[i] = Laux[i];
}
best_mad = mad;
scale = this_scale;
min = this_min;
}
}
}
*the_min = -min;
return scale;
}
void quantize_row_fp16(const float * src, void * dst, size_t count) {
auto * out = reinterpret_cast<npu_device_fp16_t *>(dst);
// TODO: use hvx intrinsics for better performance
for (size_t i = 0; i < count; i++) {
out[i] = to_fp16(src[i]);
}
}
void quantize_row_q8_0(const float * src, void * dst, size_t count) {
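    // Symmetric per-block quantization: d = amax / 127 maps each 32-value block to signed 8-bit
    // codes; only the fp16 scale d and the 32 int8 values are stored.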
const int nb = count / QUANT_BLOCK_SIZE;
auto * out = reinterpret_cast<npu_device_block_q8_0 *>(dst);
for (int i = 0; i < nb; i++) {
float amax = 0.0f; // absolute max
for (int j = 0; j < QUANT_BLOCK_SIZE; j++) {
const float v = src[i * QUANT_BLOCK_SIZE + j];
amax = std::max(amax, fabsf(v));
}
const float d = amax / ((1 << 7) - 1);
const float id = d ? 1.0f / d : 0.0f;
out[i].d = to_fp16(d);
for (int j = 0; j < QUANT_BLOCK_SIZE; ++j) {
const float x0 = src[i * QUANT_BLOCK_SIZE + j] * id;
out[i].qs[j] = roundf(x0);
}
}
}
void quantize_row_q4_0(const float * src, void * dst, size_t count) {
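    // 4-bit quantization with a fixed zero point of 8: d = max / -8 maps the block into [-8, 7],
    // and the +8.5f shift plus clamp produces unsigned nibbles in [0, 15], packed two per byte.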
constexpr const int qk = QUANT_BLOCK_SIZE;
const int nb = count / qk;
auto * out = reinterpret_cast<npu_device_block_q4_0 *>(dst);
for (int i = 0; i < nb; i++) {
float amax = 0.0f; // absolute max
float max = 0.0f;
for (int j = 0; j < qk; j++) {
const float v = src[i * qk + j];
if (amax < fabsf(v)) {
amax = fabsf(v);
max = v;
}
}
const float d = max / -8;
const float id = d ? 1.0f / d : 0.0f;
out[i].d = to_fp16(d);
for (int j = 0; j < qk / 2; ++j) {
const float x0 = src[i * qk + 0 + j] * id;
const float x1 = src[i * qk + qk / 2 + j] * id;
const uint8_t xi0 = std::min<int8_t>(15, (x0 + 8.5f));
const uint8_t xi1 = std::min<int8_t>(15, (x1 + 8.5f));
out[i].qs[j] = xi0;
out[i].qs[j] |= xi1 << 4;
}
}
}
void quantize_row_q4_K(const float * src, void * dst, size_t count) {
const int nb = count / QUANT_K_BLOCK_SIZE;
auto * out = reinterpret_cast<npu_device_block_q4_k *>(dst);
uint8_t L[QUANT_K_BLOCK_SIZE];
uint8_t Laux[32];
float weights[32];
float mins[QUANT_K_BLOCK_SIZE / 32];
float scales[QUANT_K_BLOCK_SIZE / 32];
for (int i = 0; i < nb; i++) {
float max_scale = 0; // as we are deducting the min, scales are always positive
float max_min = 0;
for (int j = 0; j < QUANT_K_BLOCK_SIZE / 32; ++j) {
//scales[j] = make_qkx1_quants(32, 15, x + 32*j, L + 32*j, &mins[j], 9, 0.5f);
float sum_x2 = 0;
for (int l = 0; l < 32; ++l) {
sum_x2 += src[32 * j + l] * src[32 * j + l];
}
float av_x = sqrtf(sum_x2 / 32);
for (int l = 0; l < 32; ++l) {
weights[l] = av_x + fabsf(src[32 * j + l]);
}
scales[j] =
make_qkx2_quants(32, 15, src + 32 * j, weights, L + 32 * j, &mins[j], Laux, -1.f, 0.1f, 20, false);
float scale = scales[j];
if (scale > max_scale) {
max_scale = scale;
}
float min = mins[j];
if (min > max_min) {
max_min = min;
}
}
float inv_scale = max_scale > 0 ? 63.f / max_scale : 0.f;
float inv_min = max_min > 0 ? 63.f / max_min : 0.f;
for (int j = 0; j < QUANT_K_BLOCK_SIZE / 32; ++j) {
uint8_t ls = nearest_int(inv_scale * scales[j]);
uint8_t lm = nearest_int(inv_min * mins[j]);
ls = std::min<uint8_t>(63, ls);
lm = std::min<uint8_t>(63, lm);
if (j < 4) {
out[i].scales[j] = ls;
out[i].scales[j + 4] = lm;
} else {
out[i].scales[j + 4] = (ls & 0xF) | ((lm & 0xF) << 4);
out[i].scales[j - 4] |= ((ls >> 4) << 6);
out[i].scales[j - 0] |= ((lm >> 4) << 6);
}
}
out[i].d = to_fp16(max_scale / 63.f);
out[i].dmin = to_fp16(max_min / 63.f);
uint8_t sc, m;
for (int j = 0; j < QUANT_K_BLOCK_SIZE / 32; ++j) {
get_scale_min_k4(j, out[i].scales, &sc, &m);
const float d = to_float(out[i].d) * sc;
if (!d) {
continue;
}
const float dm = to_float(out[i].dmin) * m;
for (int ii = 0; ii < 32; ++ii) {
int l = nearest_int((src[32 * j + ii] + dm) / d);
l = std::max<int>(0, std::min<int>(15, l));
L[32 * j + ii] = l;
}
}
uint8_t * q = out[i].qs;
for (int j = 0; j < QUANT_K_BLOCK_SIZE; j += 64) {
for (int l = 0; l < 32; ++l) {
q[l] = L[j + l] | (L[j + l + 32] << 4);
}
q += 32;
}
src += QUANT_K_BLOCK_SIZE;
}
}
void dequantize_row_q8_0(const void * src, hexagon::dequant_output_type * dst, size_t count, HVX_Vector) {
using namespace hexagon::vec::quant;
constexpr const int qk = QUANT_BLOCK_SIZE;
static_assert(QUANT_BLOCK_SIZE == hexagon::kBytesPerVector / sizeof(float));
alignas(hexagon::kBytesPerVector) static const HVX_Vector qs_indices = make_qs_load_mask<npu_device_block_q8_0>();
alignas(hexagon::kBytesPerVector) static const HVX_Vector scale_indices =
make_scale_load_mask<npu_device_block_q8_0>();
const int nb = count / qk;
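    // Two q8_0 blocks (2 fp16 scales + 64 int8 codes) are gathered per iteration so the unpacked
    // result fills exactly one 128-byte HVX vector of fp16; a single trailing block uses the splat path.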
const auto * src_ptr = reinterpret_cast<const npu_device_block_q8_0 *>(src);
auto * dst_ptr = ((hexagon::dequant_output_type *) dst); // TODO: opt for aligned access
int i = 0;
for (; i + 1 < nb; i += 2) {
auto qs = load_dual_block_generic(src_ptr + i, qs_indices, scale_indices);
HVX_Vector q_lo = Q6_Vhf_equals_Vh(Q6_V_lo_W(Q6_Wh_vunpack_Vb(qs.val[0])));
HVX_Vector result = Q6_Vqf16_vmpy_VhfVhf(q_lo, qs.val[1]);
*reinterpret_cast<HVX_UVector *>(dst_ptr) = Q6_Vhf_equals_Vqf16(result);
dst_ptr += qk * 2;
}
if (i < nb) {
const auto & src = src_ptr[i];
HVX_Vector scales = Q6_Vh_vsplat_R(src.d);
HVX_Vector q_lo = load_block_generic(src);
q_lo = Q6_Vhf_equals_Vh(Q6_V_lo_W(Q6_Wh_vunpack_Vb(q_lo)));
HVX_Vector result = Q6_Vqf16_vmpy_VhfVhf(q_lo, scales);
hexagon::q6op_vstu_variable_ARV<hexagon::kBytesPerVector / 2>(
dst_ptr,
Q6_Vhf_equals_Vqf16(result)); // TODO: opt the store
}
}
template <bool _IsDstAligned>
void dequantize_row_q4_0_impl(const void * src, hexagon::dequant_output_type * dst, size_t count, HVX_Vector table) {
using namespace hexagon::vec::quant;
constexpr const size_t kElemsPerVec = hexagon::kBytesPerVector / sizeof(hexagon::dequant_output_type);
constexpr const uint32_t kSizeOfQs = sizeof(npu_device_block_q4_0::qs);
constexpr const int qk = QUANT_BLOCK_SIZE;
static_assert(qk % 2 == 0, "qk must be even");
static_assert(QUANT_BLOCK_SIZE == hexagon::kBytesPerVector / sizeof(float));
alignas(hexagon::kBytesPerVector) static const HVX_Vector qs_indices = make_qs_load_mask<npu_device_block_q4_0>();
alignas(hexagon::kBytesPerVector) static const HVX_Vector scale_indices =
make_scale_load_mask<npu_device_block_q4_0>();
const int nb = count / qk;
const auto * src_ptr = reinterpret_cast<const npu_device_block_q4_0 *>(src);
hexagon::dequant_output_type * dst_ptr = dst;
int i = 0;
for (; i + 5 < nb; i += 6) {
auto qs = load_hexa_block_generic(src_ptr + i, qs_indices, scale_indices);
auto res01 = dequantize_vec_q40_qf16_4blocks(qs.val[0], qs.val[1], qs.val[2], table);
HVX_Vector block45 = Q6_V_vror_VR(qs.val[0], kSizeOfQs * 4);
auto res2 = dequantize_vec_q40_qf16_2blocks(block45, qs.val[3], table);
if constexpr (_IsDstAligned) {
reinterpret_cast<HVX_Vector *>(dst_ptr)[0] = Q6_Vhf_equals_Vqf16(res01.val[0]);
reinterpret_cast<HVX_Vector *>(dst_ptr)[1] = Q6_Vhf_equals_Vqf16(res01.val[1]);
reinterpret_cast<HVX_Vector *>(dst_ptr)[2] = Q6_Vhf_equals_Vqf16(res2);
} else {
reinterpret_cast<HVX_UVector *>(dst_ptr)[0] = Q6_Vhf_equals_Vqf16(res01.val[0]);
reinterpret_cast<HVX_UVector *>(dst_ptr)[1] = Q6_Vhf_equals_Vqf16(res01.val[1]);
reinterpret_cast<HVX_UVector *>(dst_ptr)[2] = Q6_Vhf_equals_Vqf16(res2);
}
dst_ptr += kElemsPerVec * 3;
}
for (; i + 3 < nb; i += 4) {
auto qs = load_qual_block_generic(src_ptr + i, qs_indices, scale_indices);
auto res01 = dequantize_vec_q40_qf16_4blocks(qs.val[0], qs.val[1], qs.val[2], table);
if constexpr (_IsDstAligned) {
reinterpret_cast<HVX_Vector *>(dst_ptr)[0] = Q6_Vhf_equals_Vqf16(res01.val[0]);
reinterpret_cast<HVX_Vector *>(dst_ptr)[1] = Q6_Vhf_equals_Vqf16(res01.val[1]);
} else {
reinterpret_cast<HVX_UVector *>(dst_ptr)[0] = Q6_Vhf_equals_Vqf16(res01.val[0]);
reinterpret_cast<HVX_UVector *>(dst_ptr)[1] = Q6_Vhf_equals_Vqf16(res01.val[1]);
}
dst_ptr += kElemsPerVec * 2;
}
for (; i + 1 < nb; i += 2) {
auto qs = load_dual_block_generic(src_ptr + i, qs_indices, scale_indices);
auto res = dequantize_vec_q40_qf16_2blocks(qs.val[0], qs.val[1], table);
if constexpr (_IsDstAligned) {
*reinterpret_cast<HVX_Vector *>(dst_ptr) = Q6_Vhf_equals_Vqf16(res);
} else {
*reinterpret_cast<HVX_UVector *>(dst_ptr) = Q6_Vhf_equals_Vqf16(res);
}
dst_ptr += kElemsPerVec;
}
if (i < nb) {
const auto & curr_blk = src_ptr[nb - 1];
HVX_Vector scales = Q6_Vh_vsplat_R(curr_blk.d);
HVX_Vector qs = load_block_generic(curr_blk);
HVX_Vector q_lo = qs;
HVX_Vector q_hi = Q6_Vub_vlsr_VubR(qs, 4);
HVX_VectorPair qp0 = Q6_W_vshuff_VVR(q_hi, q_lo, kSizeOfQs);
q_lo = Q6_Vb_vshuff_Vb(Q6_V_lo_W(qp0));
qp0 = Q6_Wh_vlut16_VbVhR_nomatch(q_lo, table, 0);
q_lo = Q6_Vqf16_vmpy_VhfVhf(Q6_V_lo_W(qp0), scales);
q_lo = Q6_Vhf_equals_Vqf16(q_lo);
if constexpr (_IsDstAligned) {
hexagon::q6op_vstu_variable_aligned<hexagon::kBytesPerVector / 2>(dst_ptr, q_lo);
} else {
hexagon::q6op_vstu_variable_ARV<hexagon::kBytesPerVector / 2>(dst_ptr, q_lo);
}
}
}
HVX_Vector load_dequant_table_q4_0() {
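    // 16-entry fp16 lookup table for the Q6_Wh_vlut16 dequant path: nibble i maps to (i - 8), so the
    // q4_0 zero point is folded into the table instead of a separate subtract.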
constexpr const int kTableSize = 1 << 4; // 4 bits per value, 16 values
constexpr const int kQ4ZeroPoint = 8; // zero point for q4_0 quantization
static_assert(kTableSize <= hexagon::kBytesPerVector / sizeof(__fp16), "table too large");
alignas(hexagon::kBytesPerVector) static const HVX_Vector result = []() -> HVX_Vector {
alignas(hexagon::kBytesPerVector) hexagon::HVX_VectorAlias table;
table.v = Q6_V_vzero();
for (int i = 0; i < kTableSize; ++i) {
table.f16[i * 2] = i - kQ4ZeroPoint; // TODO: vectorize this?
}
return table.v;
}();
return result;
}
void dequantize_row_q4_0(const void * src, hexagon::dequant_output_type * dst, size_t count, HVX_Vector table) {
const bool dst_aligned = hexagon::is_addr_aligned(dst);
if (dst_aligned) {
dequantize_row_q4_0_impl<true>(src, dst, count, table);
} else {
dequantize_row_q4_0_impl<false>(src, dst, count, table);
}
}
HVX_Vector load_dequant_table_q4_k() {
constexpr const int kTableSize = 1 << 4; // 4 bits per value, 16 values
static_assert(kTableSize <= hexagon::kBytesPerVector / sizeof(__fp16), "table too large");
alignas(hexagon::kBytesPerVector) static const HVX_Vector result = []() -> HVX_Vector {
alignas(hexagon::kBytesPerVector) hexagon::HVX_VectorAlias table;
table.v = Q6_V_vzero();
for (int i = 0; i < kTableSize; ++i) {
table.f16[i * 2] = i; // TODO: vectorize this?
}
return table.v;
}();
return result;
}
void dequantize_row_q4_K(const void * src, hexagon::dequant_output_type * dst, size_t count, HVX_Vector table) {
constexpr const int kQuantSubBlockSize = 32;
const int nb = count / QUANT_K_BLOCK_SIZE;
const auto * src_ptr = reinterpret_cast<const npu_device_block_q4_k *>(src);
auto * dst_ptr = reinterpret_cast<npu_device_fp16_t *>(dst);
const HVX_VectorPred scale_mask = Q6_Q_vsetq_R(hexagon::kBytesPerVector / 2);
alignas(hexagon::kBytesPerVector * 4) union {
HVX_VectorPair p[2];
HVX_Vector v[4];
} dual_pair;
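    // Each 256-value super-block is expanded with two vlut16 lookups (low/high nibbles) into four
    // fp16 vectors, then scaled in 128-value halves using the per-sub-block 6-bit scales and mins.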
for (int i = 0; i < nb; i++) {
const uint8_t * q = src_ptr[i].qs;
HVX_Vector qv = *reinterpret_cast<const HVX_UVector *>(q);
HVX_Vector q_lo = qv;
HVX_Vector q_hi = Q6_Vub_vlsr_VubR(qv, 4);
HVX_VectorPair qp = Q6_W_vshuff_VVR(q_hi, q_lo, kQuantSubBlockSize * 3);
q_lo = Q6_V_lo_W(qp);
q_hi = Q6_V_hi_W(qp);
q_lo = Q6_Vb_vshuff_Vb(q_lo);
q_hi = Q6_Vb_vshuff_Vb(q_hi);
dual_pair.p[0] = Q6_Wh_vlut16_VbVhR_nomatch(q_lo, table, 0);
dual_pair.p[1] = Q6_Wh_vlut16_VbVhR_nomatch(q_hi, table, 0);
const __fp16 d = reinterpret_cast<const __fp16 &>(src_ptr[i].d);
const __fp16 min = reinterpret_cast<const __fp16 &>(src_ptr[i].dmin);
int is = 0;
uint8_t sc = 0;
uint8_t m = 0;
const auto * scales = src_ptr[i].scales;
for (int j = 0; j < QUANT_K_BLOCK_SIZE; j += 128) {
get_scale_min_k4(is + 0, scales, &sc, &m);
const __fp16 d0 = d * sc;
const __fp16 m0 = min * m;
HVX_Vector dv0 = Q6_Vh_vsplat_R(reinterpret_cast<const uint16_t &>(d0));
HVX_Vector dm0 = Q6_Vh_vsplat_R(reinterpret_cast<const uint16_t &>(m0));
get_scale_min_k4(is + 1, scales, &sc, &m);
const __fp16 d1 = d * sc;
const __fp16 m1 = min * m;
HVX_Vector dv1 = Q6_Vh_vsplat_R(reinterpret_cast<const uint16_t &>(d1));
HVX_Vector dm1 = Q6_Vh_vsplat_R(reinterpret_cast<const uint16_t &>(m1));
get_scale_min_k4(is + 2, scales, &sc, &m);
const __fp16 d2 = d * sc;
const __fp16 m2 = min * m;
HVX_Vector dv2 = Q6_Vh_vsplat_R(reinterpret_cast<const uint16_t &>(d2));
HVX_Vector dm2 = Q6_Vh_vsplat_R(reinterpret_cast<const uint16_t &>(m2));
get_scale_min_k4(is + 3, scales, &sc, &m);
const __fp16 d3 = d * sc;
const __fp16 m3 = min * m;
HVX_Vector dv3 = Q6_Vh_vsplat_R(reinterpret_cast<const uint16_t &>(d3));
HVX_Vector dm3 = Q6_Vh_vsplat_R(reinterpret_cast<const uint16_t &>(m3));
HVX_Vector dv01 = Q6_V_vmux_QVV(scale_mask, dv0, dv1);
HVX_Vector dm01 = Q6_V_vmux_QVV(scale_mask, dm0, dm1);
HVX_Vector dv23 = Q6_V_vmux_QVV(scale_mask, dv2, dv3);
HVX_Vector dm23 = Q6_V_vmux_QVV(scale_mask, dm2, dm3);
q_lo = Q6_Vqf16_vmpy_VhfVhf(dual_pair.v[j / 64], dv01);
q_lo = Q6_Vqf16_vsub_Vqf16Vhf(q_lo, dm01);
q_hi = Q6_Vqf16_vmpy_VhfVhf(dual_pair.v[j / 64 + 1], dv23);
q_hi = Q6_Vqf16_vsub_Vqf16Vhf(q_hi, dm23);
reinterpret_cast<HVX_UVector *>(dst_ptr)[0] = Q6_Vhf_equals_Vqf16(q_lo);
reinterpret_cast<HVX_UVector *>(dst_ptr)[1] = Q6_Vhf_equals_Vqf16(q_hi);
dst_ptr += 128;
is += 4;
}
}
}
void copy_row_f16(const void * src, hexagon::dequant_output_type * dst, size_t count, HVX_Vector) {
hexagon::vec_cpy_f16(reinterpret_cast<const npu_device_fp16_t *>(src), dst, count);
}
template <typename _TSrc, typename _TDst, typename... _TExtArgs>
void copy_row_f32(const _TSrc * src, _TDst * dst, size_t count, _TExtArgs...) {
hexagon::vec_cpy_f32(reinterpret_cast<const float *>(src), reinterpret_cast<float *>(dst), count);
}
constexpr const hexagon::device_type_traits kDeviceTypeTraits[] = {
{ NPU_DATA_TYPE_F32, "F32", 1, sizeof(float), false, copy_row_f32<void, hexagon::dequant_output_type, HVX_Vector>,
copy_row_f32<float, void>, hexagon::type_erase_dot_func<hexagon::vec_dot_product_f32_f32>,
hexagon::type_erase_dot_func<hexagon::vec_dot_product_aligned_f32_f32>,
hexagon::type_erase_dot_func<hexagon::is_f32_f32_dot_product_aligned> },
{ NPU_DATA_TYPE_F16, "F16", 1, sizeof(npu_device_fp16_t), false, copy_row_f16, quantize_row_fp16,
hexagon::type_erase_dot_func<hexagon::vec_dot_product_f16_f16>,
hexagon::type_erase_dot_func<hexagon::vec_dot_product_aligned_f16_f16>,
hexagon::type_erase_dot_func<hexagon::is_f16_f16_dot_product_aligned> },
{ NPU_DATA_TYPE_I32, "I32", 1, sizeof(int32_t), false },
{ NPU_DATA_TYPE_I64, "I64", 1, sizeof(int64_t), false },
{ NPU_DATA_TYPE_Q8_0, "Q8_0", QUANT_BLOCK_SIZE, sizeof(npu_device_block_q8_0), true, dequantize_row_q8_0,
quantize_row_q8_0 },
{ NPU_DATA_TYPE_Q4_0, "Q4_0", QUANT_BLOCK_SIZE, sizeof(npu_device_block_q4_0), true, dequantize_row_q4_0,
quantize_row_q4_0, nullptr, nullptr, nullptr, load_dequant_table_q4_0 },
{ NPU_DATA_TYPE_Q4_K, "Q4_K", QUANT_K_BLOCK_SIZE, sizeof(npu_device_block_q4_k), true, dequantize_row_q4_K,
quantize_row_q4_K, nullptr, nullptr, nullptr, load_dequant_table_q4_k },
};
static_assert(std::size(kDeviceTypeTraits) == NPU_DATA_TYPE_COUNT,
"kDeviceTypeTraits size mismatch with npu_device_tensor_data_type enum");
static_assert(kDeviceTypeTraits[NPU_DATA_TYPE_F32].type == NPU_DATA_TYPE_F32,
"kDeviceTypeTraits F32 type mismatch with npu_device_tensor_data_type enum");
static_assert(kDeviceTypeTraits[NPU_DATA_TYPE_F16].type == NPU_DATA_TYPE_F16,
"kDeviceTypeTraits F16 type mismatch with npu_device_tensor_data_type enum");
static_assert(kDeviceTypeTraits[NPU_DATA_TYPE_I32].type == NPU_DATA_TYPE_I32,
"kDeviceTypeTraits I32 type mismatch with npu_device_tensor_data_type enum");
static_assert(kDeviceTypeTraits[NPU_DATA_TYPE_I64].type == NPU_DATA_TYPE_I64,
"kDeviceTypeTraits I64 type mismatch with npu_device_tensor_data_type enum");
static_assert(kDeviceTypeTraits[NPU_DATA_TYPE_Q8_0].type == NPU_DATA_TYPE_Q8_0,
"kDeviceTypeTraits Q8_0 type mismatch with npu_device_tensor_data_type enum");
static_assert(kDeviceTypeTraits[NPU_DATA_TYPE_Q4_0].type == NPU_DATA_TYPE_Q4_0,
"kDeviceTypeTraits Q4_0 type mismatch with npu_device_tensor_data_type enum");
static_assert(kDeviceTypeTraits[NPU_DATA_TYPE_Q4_K].type == NPU_DATA_TYPE_Q4_K,
"kDeviceTypeTraits Q4_K type mismatch with npu_device_tensor_data_type enum");
} // namespace
namespace hexagon {
bool init_f16_f32_table(float * table, size_t count) {
constexpr const size_t kTableSize = (1U << 16);
if (count < kTableSize) {
return false;
}
for (size_t i = 0; i < count; ++i) {
table[i] = to_float(i);
}
return true;
}
const device_type_traits & get_type_traits(npu_device_tensor_data_type type) {
return kDeviceTypeTraits[type];
}
size_t get_dequantized_row_size(const tensor * tensor) {
if (!is_quantized_type(tensor->get_type())) {
return tensor->get_nb(1); // for f32 and f16
}
auto row_elems_count = tensor->get_ne(0);
return hexagon::get_aligned_size(
row_elems_count * sizeof(dequant_output_type)); // dequant_output_type is currently npu_device_fp16_t (f16)
}
} // namespace hexagon
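A hedged usage sketch of the traits table above (the caller and buffer names are hypothetical, not part of this commit): dequantize a single row of a quantized tensor into the f16 scratch format through the function pointers stored in device_type_traits.

// Hypothetical caller, shown only to illustrate how the traits table is consumed.
inline void dequantize_row_example(const hexagon::tensor * t, const void * row_src,
                                   hexagon::dequant_output_type * row_dst) {
    const auto & traits = hexagon::get_type_traits(t->get_type());
    if (!traits.is_quantized || traits.to_float == nullptr) {
        return;  // f32/f16 rows are used directly
    }
    // Q4_0/Q4_K pre-load a nibble lookup table; other types ignore the extra argument.
    const HVX_Vector table = traits.load_dequant_table ? traits.load_dequant_table() : Q6_V_vzero();
    traits.to_float(row_src, row_dst, t->get_ne(0), table);
}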

View File

@ -0,0 +1,96 @@
#include "hexagon_npu.h"
#include "tensor.hpp"
#include "util.hpp"
#include <hexagon_types.h>
namespace hexagon {
using dequant_output_type = npu_device_fp16_t;
bool init_f16_f32_table(float * table, size_t count);
typedef void (*quantize_row_type)(const float * src, void * dst, size_t count);
typedef void (*dequantize_row_type)(const void * src, dequant_output_type * dst, size_t count, HVX_Vector table);
typedef float (*vec_dot_type)(const void * src0, const void * src1, size_t count);
typedef bool (*can_use_aligned_vec_dot_type)(const void * src0, const void * src1, size_t count);
typedef HVX_Vector (*load_dequant_table_type)();
struct device_type_traits {
npu_device_tensor_data_type type;
const char * type_name;
int64_t blck_size;
size_t type_size;
bool is_quantized;
dequantize_row_type to_float = nullptr;
quantize_row_type from_float = nullptr;
vec_dot_type vec_dot = nullptr;
vec_dot_type vec_dot_aligned = nullptr;
can_use_aligned_vec_dot_type can_use_aligned_vec_dot = nullptr;
load_dequant_table_type load_dequant_table = nullptr;
};
const device_type_traits & get_type_traits(npu_device_tensor_data_type type);
inline bool is_quantized_type(npu_device_tensor_data_type type) {
return get_type_traits(type).is_quantized;
}
size_t get_dequantized_row_size(const tensor * tensor);
inline const char * get_type_name(npu_device_tensor_data_type type) {
return get_type_traits(type).type_name;
}
} // namespace hexagon
// TODO: move this to a common header
#ifdef GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING
namespace hexagon {
inline auto make_scoped_op_perf_timer(tensor * op, size_t tidx) {
auto * src0 = op->get_src(0);
auto * src1 = op->get_src(1);
char buffer[512];
if (src1 == nullptr) {
snprintf(buffer, sizeof(buffer), "[%s][%lldx%lldx%lldx%lld%s], tidx: %zu", op_get_name(op->get_op()),
src0->get_ne(0), src0->get_ne(1), src0->get_ne(2), src0->get_ne(3), get_type_name(src0->get_type()),
tidx);
} else {
snprintf(buffer, sizeof(buffer), "[%s][%lldx%lldx%lldx%lld%s],[%lldx%lldx%lldx%lld%s], tidx: %zu",
op_get_name(op->get_op()), src0->get_ne(0), src0->get_ne(1), src0->get_ne(2), src0->get_ne(3),
get_type_name(src0->get_type()), src1->get_ne(0), src1->get_ne(1), src1->get_ne(2), src1->get_ne(3),
get_type_name(src1->get_type()), tidx);
}
return npu_scoped_timer<1024>(buffer);
}
} // namespace hexagon
# define DEVICE_SCOPED_OP_PERFORMANCE_TRACKER(op, tidx) \
auto __npu_op_timer_##__LINE__ = hexagon::make_scoped_op_perf_timer(op, tidx)
# define DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_WITH_SUB_PROC(op, tidx, sub_prefix) \
auto __npu_op_timer_##sub_prefix = hexagon::make_scoped_op_perf_timer(op, tidx)
# define DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_ADD_SUB_PROC(sub_prefix) \
hexagon::npu_sub_process_scoped_timer<decltype(__npu_op_timer_##sub_prefix)::kBufferCount, 0> \
__npu_op_sub_timer##sub_prefix(__npu_op_timer_##sub_prefix, #sub_prefix)
# define DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_WITH_MULTI_SUB_PROC(op, tidx, tracker_name) \
auto __npu_op_timer_##tracker_name = hexagon::make_scoped_op_perf_timer(op, tidx)
# define DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_ADD_ONE_SUB_PROC(tracker_name, idx, sub_prefix) \
hexagon::npu_sub_process_scoped_timer< \
std::remove_reference_t<decltype(__npu_op_timer_##tracker_name)>::kBufferCount, idx> \
__npu_op_sub_timer##sub_prefix(__npu_op_timer_##tracker_name, #sub_prefix)
#else
# define DEVICE_SCOPED_OP_PERFORMANCE_TRACKER(op, tidx) ((void) 0)
# define DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_WITH_SUB_PROC(op, tidx, sub_prefix) ((void) 0)
# define DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_ADD_SUB_PROC(sub_prefix) ((void) 0)
# define DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_WITH_MULTI_SUB_PROC(op, tidx, tracker_name) ((void) 0)
# define DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_ADD_ONE_SUB_PROC(tracker_name, idx, sub_prefix) ((void) 0)
#endif
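A hedged sketch of how these macros are meant to compose inside an op implementation (the op body below is hypothetical); when GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING is off they all collapse to no-ops.

// Hypothetical op body, illustrative only.
bool mul_mat_body_example(hexagon::tensor * op, size_t tidx) {
    DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_WITH_MULTI_SUB_PROC(op, tidx, mul_mat);
    {
        DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_ADD_ONE_SUB_PROC(mul_mat, 0, dequant);
        // ... dequantize src0 rows into the scratch buffer ...
    }
    {
        DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_ADD_ONE_SUB_PROC(mul_mat, 1, vec_dot);
        // ... run the row-by-row dot products ...
    }
    return true;  // accumulated timings are printed when the tracker leaves scope
}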

View File

@ -0,0 +1,390 @@
#pragma once
#include "hexagon_npu.h"
#include <AEEStdDef.h>
#include <HAP_farf.h>
#include <HAP_perf.h>
#include <HAP_power.h>
#include <qurt.h>
#include <cstdarg>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <utility>
#define DEVICE_LOG_ERROR(...) hexagon::log_error(__VA_ARGS__)
#define DEVICE_LOG_WARN(...) hexagon::log_message(__VA_ARGS__)
#define DEVICE_LOG_INFO(...) hexagon::log_message(__VA_ARGS__)
#ifdef _DEBUG
# define DEVICE_LOG_DEBUG(...) hexagon::log_message(__VA_ARGS__)
#else
# define DEVICE_LOG_DEBUG(...) (void) 0
#endif
// TODO: reuse the declaration at host
#define DISABLE_COPY(class_name) \
class_name(const class_name &) = delete; \
void operator=(const class_name &) = delete
#define DISABLE_MOVE(class_name) \
class_name(class_name &&) = delete; \
void operator=(class_name &&) = delete
#define DISABLE_COPY_AND_MOVE(class_name) \
DISABLE_COPY(class_name); \
DISABLE_MOVE(class_name)
#define NPU_UNUSED(x) (void) (x)
namespace hexagon {
__attribute__((format(printf, 1, 2))) inline void log_error(const char * format, ...) {
va_list args;
va_start(args, format);
std::vfprintf(stderr, format, args);
va_end(args);
}
__attribute__((format(printf, 1, 2))) inline void log_message(const char * format, ...) {
va_list args;
va_start(args, format);
std::vprintf(format, args);
va_end(args);
}
inline constexpr const char * op_get_name(npu_device_tensor_op op) {
switch (op) {
case NPU_OP_MUL_MAT:
return "MUL_MAT";
case NPU_OP_ADD:
return "ADD";
case NPU_OP_SUB:
return "SUB";
case NPU_OP_MUL:
return "MUL";
case NPU_OP_RMS_NORM:
return "RMS_NORM";
case NPU_OP_FLASH_ATTN:
return "FLASH_ATTN_EXT";
case NPU_OP_ROPE:
return "ROPE";
case NPU_OP_GLU:
return "GLU";
case NPU_OP_GET_ROWS:
return "GET_ROWS";
case NPU_OP_SET_ROWS:
return "SET_ROWS";
case NPU_OP_CPY:
return "CPY";
default:
return "UNKNOWN";
}
}
inline bool is_transposed_or_permuted(const npu_device_nb_type & nb) {
// Check if the tensor is transposed or permuted
return (nb[0] > nb[1]) || (nb[1] > nb[2]) || (nb[2] > nb[3]);
}
inline bool is_same_shape(const npu_device_ne_type & src, const npu_device_ne_type & dst) {
for (size_t i = 0; i < DEVICE_TENSOR_MAX_DIMS; ++i) {
if (src[i] != dst[i]) {
return false;
}
}
return true;
}
inline bool is_same_shape(const npu_device_tensor_spec & src, const npu_device_tensor_spec & dst) {
return is_same_shape(src.ne, dst.ne);
}
class qurt_mutex {
public:
qurt_mutex() { qurt_mutex_init(&_mutex); }
~qurt_mutex() { qurt_mutex_destroy(&_mutex); }
void lock() { qurt_mutex_lock(&_mutex); }
void unlock() { qurt_mutex_unlock(&_mutex); }
private:
qurt_mutex_t _mutex;
DISABLE_COPY_AND_MOVE(qurt_mutex);
};
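Because qurt_mutex exposes lock()/unlock() it satisfies BasicLockable, so a caller that includes <mutex> can pair it with std::lock_guard; a minimal sketch (the shared counter is hypothetical):

void bump_counter_example(hexagon::qurt_mutex & mutex, int & shared_counter) {
    std::lock_guard<hexagon::qurt_mutex> guard(mutex);  // unlocks automatically on scope exit
    ++shared_counter;
}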
class power_utils {
public:
power_utils() {
_context_ptr = HAP_utils_create_context();
if (_context_ptr == nullptr) {
DEVICE_LOG_ERROR("Failed to create power context\n");
}
}
~power_utils() {
if (_context_ptr != nullptr) {
HAP_utils_destroy_context(_context_ptr);
}
}
unsigned int get_clock_speed_hz() const {
if (!is_valid()) {
DEVICE_LOG_ERROR("Power context is not initialized\n");
return 0;
}
HAP_power_response_t response = {};
response.type = HAP_power_get_clk_Freq;
auto ret = HAP_power_get(_context_ptr, &response);
if (ret != AEE_SUCCESS) {
DEVICE_LOG_ERROR("Failed to get clock speed: %d\n", ret);
return 0;
}
return response.clkFreqHz;
}
bool get_dvcs_enabled() const {
if (!is_valid()) {
DEVICE_LOG_ERROR("Power context is not initialized\n");
return false;
}
HAP_power_response_t response = {};
response.type = HAP_power_get_dcvsEnabled;
auto ret = HAP_power_get(_context_ptr, &response);
if (ret != AEE_SUCCESS) {
DEVICE_LOG_ERROR("Failed to get DVCS enabled: %d\n", ret);
return false;
}
return response.dcvsEnabled;
}
void set_dvcs_performance_mode(bool enable) {
if (!is_valid()) {
DEVICE_LOG_ERROR("Power context is not initialized\n");
return;
}
HAP_power_request_t request = {};
request.type = HAP_power_set_DCVS_v3;
request.dcvs_v3.set_dcvs_enable = enable ? TRUE : FALSE;
request.dcvs_v3.dcvs_enable = enable ? TRUE : FALSE;
request.dcvs_v3.set_core_params = TRUE;
if (enable) {
request.dcvs_v3.dcvs_option = HAP_DCVS_V2_PERFORMANCE_MODE;
request.dcvs_v3.set_bus_params = TRUE;
request.dcvs_v3.bus_params.min_corner = HAP_DCVS_VCORNER_MAX;
request.dcvs_v3.bus_params.max_corner = HAP_DCVS_VCORNER_MAX;
request.dcvs_v3.bus_params.target_corner = HAP_DCVS_VCORNER_MAX;
request.dcvs_v3.core_params.min_corner = HAP_DCVS_VCORNER_MAX;
request.dcvs_v3.core_params.max_corner = HAP_DCVS_VCORNER_MAX;
request.dcvs_v3.core_params.target_corner = HAP_DCVS_VCORNER_MAX;
request.dcvs_v3.set_sleep_disable = TRUE;
request.dcvs_v3.sleep_disable = TRUE;
}
auto ret = HAP_power_set(_context_ptr, &request);
if (ret != AEE_SUCCESS) {
DEVICE_LOG_ERROR("Failed to set DVCS performance mode: %d\n", ret);
}
}
void set_sleep_mode(bool enable) {
if (!is_valid()) {
DEVICE_LOG_ERROR("Power context is not initialized\n");
return;
}
boolean sleep_disable = enable ? FALSE : TRUE;
auto ret = HAP_power_set_sleep_mode(_context_ptr, sleep_disable);
if (ret != AEE_SUCCESS) {
DEVICE_LOG_ERROR("Failed to set sleep mode: %d\n", ret);
}
}
bool is_valid() const { return _context_ptr != nullptr; }
private:
void * _context_ptr = nullptr;
DISABLE_COPY_AND_MOVE(power_utils);
};
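A hedged sketch of the intended usage: pin the DCVS corners for the duration of a compute burst and relax them afterwards (the work callback is hypothetical, and the effect of the relaxed request depends on the HAP power defaults):

void run_with_boosted_clocks_example(void (*work)()) {
    hexagon::power_utils power;
    if (power.is_valid()) {
        power.set_dvcs_performance_mode(true);   // pin bus/core corners at the max corner
        power.set_sleep_mode(false);             // keep low-power sleep disabled while busy
    }
    work();
    if (power.is_valid()) {
        power.set_dvcs_performance_mode(false);  // request default corners again
        power.set_sleep_mode(true);
    }
}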
#ifdef GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING
struct sub_process_data {
char log_prefix[32] = {};
uint64_t proc_cycles = 0;
uint64_t proc_pcycles = 0;
uint64_t proc_count = 0;
};
template <size_t _buffer_count> class npu_scoped_timer {
public:
enum {
kBufferCount = _buffer_count,
kSubProcCount = 4,
};
explicit npu_scoped_timer(const char * log_prefix) {
strncpy(_log_prefix, log_prefix, kBufferCount - 1);
_begin_cycles = HAP_perf_get_qtimer_count();
_begin_pcycles = HAP_perf_get_pcycles();
}
npu_scoped_timer(npu_scoped_timer && other) { *this = std::move(other); }
~npu_scoped_timer() { print(); }
void operator=(npu_scoped_timer && other) {
strncpy(_log_prefix, other._log_prefix, kBufferCount - 1);
_begin_cycles = other._begin_cycles;
_begin_pcycles = other._begin_pcycles;
memcpy(&_sub_proc_data, &other._sub_proc_data, sizeof(_sub_proc_data));
}
void add_sub_proc_cycles(size_t sub_proc_idx, const char * sub_proc_prefix, uint64_t cycles, uint64_t pcycles) {
auto & sub_proc_data = _sub_proc_data[sub_proc_idx];
sub_proc_data.proc_cycles += cycles;
sub_proc_data.proc_pcycles += pcycles;
if (!sub_proc_data.proc_count) {
strncpy(sub_proc_data.log_prefix, sub_proc_prefix, sizeof(sub_proc_data.log_prefix) - 1);
}
sub_proc_data.proc_count++;
}
void print() const {
static_assert(kSubProcCount == 4, "Sub process count must be 4 for logging format");
auto total_cycles = HAP_perf_get_qtimer_count() - _begin_cycles;
auto total_pcycles = HAP_perf_get_pcycles() - _begin_pcycles;
auto duration = HAP_perf_qtimer_count_to_us(total_cycles);
int sub_proc_count = 0;
for (int i = kSubProcCount; i > 0; --i) {
if (_sub_proc_data[i - 1].proc_count > 0) {
sub_proc_count = i;
break;
}
}
auto sub_proc0_duration = HAP_perf_qtimer_count_to_us(_sub_proc_data[0].proc_cycles);
auto sub_proc1_duration = HAP_perf_qtimer_count_to_us(_sub_proc_data[1].proc_cycles);
auto sub_proc2_duration = HAP_perf_qtimer_count_to_us(_sub_proc_data[2].proc_cycles);
auto sub_proc3_duration = HAP_perf_qtimer_count_to_us(_sub_proc_data[3].proc_cycles);
switch (sub_proc_count) {
case 4:
DEVICE_LOG_WARN(
"[profiler]%s, pcyc: %llu, dur: %lluus, [%s]cnt: %llu, dur: %lluus, "
"[%s]cnt: %llu, dur: %lluus, [%s]cnt: %llu, dur: %lluus, "
"[%s]cnt: %llu, dur: %lluus\n",
_log_prefix, (unsigned long long) total_pcycles, (unsigned long long) duration,
_sub_proc_data[0].log_prefix, (unsigned long long) _sub_proc_data[0].proc_count,
(unsigned long long) sub_proc0_duration, _sub_proc_data[1].log_prefix,
(unsigned long long) _sub_proc_data[1].proc_count, (unsigned long long) sub_proc1_duration,
_sub_proc_data[2].log_prefix, (unsigned long long) _sub_proc_data[2].proc_count,
(unsigned long long) sub_proc2_duration, _sub_proc_data[3].log_prefix,
(unsigned long long) _sub_proc_data[3].proc_count, (unsigned long long) sub_proc3_duration);
break;
case 3:
DEVICE_LOG_WARN(
"[profiler]%s, pcyc: %llu, dur: %lluus, [%s]cnt: %llu, dur: %lluus, "
"[%s]cnt: %llu, dur: %lluus, [%s]cnt: %llu, dur: %lluus\n",
_log_prefix, (unsigned long long) total_pcycles, (unsigned long long) duration,
_sub_proc_data[0].log_prefix, (unsigned long long) _sub_proc_data[0].proc_count,
(unsigned long long) sub_proc0_duration, _sub_proc_data[1].log_prefix,
(unsigned long long) _sub_proc_data[1].proc_count, (unsigned long long) sub_proc1_duration,
_sub_proc_data[2].log_prefix, (unsigned long long) _sub_proc_data[2].proc_count,
(unsigned long long) sub_proc2_duration);
break;
case 2:
DEVICE_LOG_WARN(
"[profiler]%s, pcyc: %llu, dur: %lluus, [%s]cnt: %llu, dur: %lluus, "
"[%s]cnt: %llu, dur: %lluus\n",
_log_prefix, (unsigned long long) total_pcycles, (unsigned long long) duration,
_sub_proc_data[0].log_prefix, (unsigned long long) _sub_proc_data[0].proc_count,
(unsigned long long) sub_proc0_duration, _sub_proc_data[1].log_prefix,
(unsigned long long) _sub_proc_data[1].proc_count, (unsigned long long) sub_proc1_duration);
break;
case 1:
DEVICE_LOG_WARN("[profiler]%s, pcyc: %llu, dur: %lluus, [%s]cnt: %llu, dur: %lluus\n", _log_prefix,
(unsigned long long) total_pcycles, (unsigned long long) duration,
_sub_proc_data[0].log_prefix, (unsigned long long) _sub_proc_data[0].proc_count,
(unsigned long long) sub_proc0_duration);
break;
default:
case 0:
DEVICE_LOG_WARN("[profiler]%s, pcyc: %llu, dur: %lluus\n", _log_prefix,
(unsigned long long) total_pcycles, (unsigned long long) duration);
break;
}
}
private:
char _log_prefix[kBufferCount] = {};
uint64_t _begin_cycles = 0;
uint64_t _begin_pcycles = 0;
sub_process_data _sub_proc_data[kSubProcCount] = {};
DISABLE_COPY(npu_scoped_timer);
};
template <size_t _buffer_count, size_t _sub_idx> class npu_sub_process_scoped_timer {
public:
static_assert(_sub_idx < npu_scoped_timer<_buffer_count>::kSubProcCount,
"Sub process index must be less than kSubProcCount");
using npu_scoped_timer = npu_scoped_timer<_buffer_count>;
explicit npu_sub_process_scoped_timer(npu_scoped_timer & timer, const char * prefix) :
_timer(timer),
_prefix(prefix) {
_begin_cycles = HAP_perf_get_qtimer_count();
_begin_pcycles = HAP_perf_get_pcycles();
}
~npu_sub_process_scoped_timer() {
_timer.add_sub_proc_cycles(_sub_idx, _prefix, HAP_perf_get_qtimer_count() - _begin_cycles,
HAP_perf_get_pcycles() - _begin_pcycles);
}
private:
npu_scoped_timer & _timer;
const char * _prefix = nullptr;
uint64_t _begin_cycles = 0;
uint64_t _begin_pcycles = 0;
DISABLE_COPY_AND_MOVE(npu_sub_process_scoped_timer);
};
inline auto make_scoped_perf_timer(const char * format, ...) {
va_list args;
va_start(args, format);
char buffer[512];
vsnprintf(buffer, sizeof(buffer), format, args);
va_end(args);
return npu_scoped_timer<1024>(buffer);
}
#endif
} // namespace hexagon
#ifdef GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING
# define _MAKE_VARIABLE_NAME2(name, postfix) name##postfix
# define _MAKE_VARIABLE_NAME(name, postfix) _MAKE_VARIABLE_NAME2(name, postfix)
# define DEVICE_SCOPED_PERFORMANCE_TRACKER(fmt, ...) \
auto _MAKE_VARIABLE_NAME(__npu_timer_, __LINE__) = hexagon::make_scoped_perf_timer(fmt, __VA_ARGS__)
#else
# define DEVICE_SCOPED_PERFORMANCE_TRACKER(fmt, ...) ((void) 0)
#endif
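A hedged sketch of the free-form tracker (the warm-up routine is hypothetical); like the op tracker it compiles away when performance tracking is disabled:

void warm_up_rows_example(const uint8_t * base, size_t rows, size_t row_bytes) {
    DEVICE_SCOPED_PERFORMANCE_TRACKER("warm_up rows: %zu, row_bytes: %zu", rows, row_bytes);
    for (size_t i = 0; i < rows; ++i) {
        hexagon::l2fetch_row(base + i * row_bytes, row_bytes);  // defined in the HVX vector header
    }
}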

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,441 @@
#pragma once
#include "hexagon_npu.h"
#include <hexagon_types.h>
#include <cstdint>
namespace hexagon {
constexpr const size_t kBytesPerVector = sizeof(HVX_Vector); // 128 for v73
constexpr const size_t kAlignMask = kBytesPerVector - 1;
template <typename T, int N> struct HEXAGON_pack {
T val[N];
};
using HVX_Vector_x2 = HEXAGON_pack<HVX_Vector, 2>;
using HVX_Vector_x3 = HEXAGON_pack<HVX_Vector, 3>;
using HVX_Vector_x4 = HEXAGON_pack<HVX_Vector, 4>;
using HVX_Vector_x5 = HEXAGON_pack<HVX_Vector, 5>;
using HVX_VectorPair_x2 = HEXAGON_pack<HVX_VectorPair, 2>;
using HVX_VectorPair_x3 = HEXAGON_pack<HVX_VectorPair, 3>;
using HVX_VectorPair_x4 = HEXAGON_pack<HVX_VectorPair, 4>;
using HVX_VectorPred_x3 = HEXAGON_pack<HVX_VectorPred, 3>;
typedef union {
HVX_VectorPair VV;
struct {
HVX_Vector lo;
HVX_Vector hi;
} V;
} HVX_DV;
typedef union {
HVX_Vector v;
float f32[kBytesPerVector / sizeof(float)];
uint32_t u32[kBytesPerVector / sizeof(uint32_t)];
__fp16 f16[kBytesPerVector / sizeof(__fp16)];
uint16_t u16[kBytesPerVector / sizeof(uint16_t)];
uint8_t u8[kBytesPerVector];
} HVX_VectorAlias;
inline size_t get_aligned_size(size_t size) {
return (size + kAlignMask) & ~kAlignMask;
}
inline size_t unaligned_bytes(const void * addr) {
return ((size_t) addr) & kAlignMask;
}
template <typename _TyData> inline const _TyData * align_down(const _TyData * addr) {
return reinterpret_cast<const _TyData *>(reinterpret_cast<const uint8_t *>(addr) - unaligned_bytes(addr));
}
inline size_t bytes_to_vector_boundary(const void * addr) {
return kBytesPerVector - unaligned_bytes(addr);
}
inline bool is_addr_aligned(const void * addr) {
return unaligned_bytes(addr) == 0;
}
inline bool is_size_aligned(size_t size) {
return (size & kAlignMask) == 0;
}
inline float get_flt0_from_fltv(HVX_Vector vect) {
static_assert(sizeof(vect[0]) == sizeof(float), "vect[0] should be a float");
int32_t i = vect[0];
return reinterpret_cast<float &>(i);
}
inline HVX_UVector Q6_V_vmemu_R(const void * unaligned_ptr) {
return *reinterpret_cast<const HVX_UVector *>(unaligned_ptr);
}
inline HVX_Vector Q6_V_vmem_R(const void * aligned_ptr) {
return *reinterpret_cast<const HVX_Vector *>(aligned_ptr);
}
constexpr const size_t kL2CacheSize = 8 * 1024; // 8 KB L2 prefetch window
constexpr const size_t kL2FetchAheadVectors = kL2CacheSize / kBytesPerVector;
inline void l2fetch(const void * p, uint32_t stride, uint32_t width, uint32_t height, uint32_t dir) {
uint64_t control = HEXAGON_V64_CREATE_H(dir, stride, width, height);
__asm__ __volatile__(" l2fetch(%0,%1) " : : "r"(p), "r"(control));
}
inline void l2fetch_row(const uint8_t * row_ptr, size_t bytes) {
// TODO: should we use a smaller kL2FetchAheadVectors?
int32_t l2fetch_vectors = Q6_R_min_RR(bytes / kBytesPerVector, kL2FetchAheadVectors);
hexagon::l2fetch(row_ptr, kBytesPerVector, kBytesPerVector, l2fetch_vectors, 0);
}
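A hedged sketch of the intended prefetch pattern (row layout hypothetical): issue the l2fetch for the next row while the current row is being consumed, so the fetch overlaps with compute.

void process_rows_with_prefetch_example(const uint8_t * rows, size_t row_count, size_t row_bytes) {
    for (size_t i = 0; i < row_count; ++i) {
        if (i + 1 < row_count) {
            hexagon::l2fetch_row(rows + (i + 1) * row_bytes, row_bytes);
        }
        // ... consume rows + i * row_bytes while the next row streams into L2 ...
    }
}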
template <uint32_t _TyBytes> inline void q6op_vstu_variable_ARV(void * addr, HVX_Vector vin) {
vin = Q6_V_vlalign_VVR(vin, vin, (size_t) addr); //rotate as needed.
uint32_t left_off = unaligned_bytes(addr);
uint32_t right_off = left_off + _TyBytes;
HVX_VectorPred qL_not = Q6_Q_vsetq_R((size_t) addr);
HVX_VectorPred qR = Q6_Q_vsetq2_R(right_off);
if (right_off > 128) {
Q6_vmaskedstoreq_QAV(qR, (HVX_Vector *) addr + 1, vin);
qR = Q6_Q_vcmp_eq_VbVb(vin, vin); // all 1's
}
qL_not = Q6_Q_or_QQn(qL_not, qR);
Q6_vmaskedstorenq_QAV(qL_not, (HVX_Vector *) addr, vin);
}
template <uint32_t _TyBytes> inline void q6op_vstu_variable_aligned(void * addr, HVX_Vector vin) {
HVX_VectorPred qR = Q6_Q_vsetq2_R(_TyBytes);
Q6_vmaskedstorenq_QAV(qR, (HVX_Vector *) addr, vin);
}
inline void q6op_vstu_variable_ARV(void * addr, int n, HVX_Vector vin) {
vin = Q6_V_vlalign_VVR(vin, vin, (size_t) addr); //rotate as needed.
unsigned left_off = unaligned_bytes(addr);
unsigned right_off = left_off + n;
HVX_VectorPred qL_not = Q6_Q_vsetq_R((size_t) addr);
HVX_VectorPred qR = Q6_Q_vsetq2_R(right_off);
if (right_off > 128) {
Q6_vmaskedstoreq_QAV(qR, (HVX_Vector *) addr + 1, vin);
qR = Q6_Q_vcmp_eq_VbVb(vin, vin); // all 1's
}
qL_not = Q6_Q_or_QQn(qL_not, qR);
Q6_vmaskedstorenq_QAV(qL_not, (HVX_Vector *) addr, vin);
}
inline HVX_Vector vec_reduction_qf32(HVX_Vector sums) {
constexpr const size_t kFloatsPerVector = hexagon::kBytesPerVector / sizeof(float);
static_assert(kFloatsPerVector == 32, "kFloatsPerVector should be 32");
sums = Q6_Vqf32_vadd_Vqf32Vqf32(sums, Q6_V_vror_VR(sums, 16 * sizeof(float)));
sums = Q6_Vqf32_vadd_Vqf32Vqf32(sums, Q6_V_vror_VR(sums, 8 * sizeof(float)));
sums = Q6_Vqf32_vadd_Vqf32Vqf32(sums, Q6_V_vror_VR(sums, 4 * sizeof(float)));
sums = Q6_Vqf32_vadd_Vqf32Vqf32(sums, Q6_V_vror_VR(sums, 2 * sizeof(float)));
sums = Q6_Vqf32_vadd_Vqf32Vqf32(sums, Q6_V_vror_VR(sums, sizeof(float)));
return sums;
}
inline float vec_reduction_f32_qf32(HVX_Vector sums) {
return get_flt0_from_fltv(Q6_Vsf_equals_Vqf32(vec_reduction_qf32(sums)));
}
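vec_reduction_qf32 above (and vec_reduction_qf16 below) fold a whole vector into lane 0 by adding rotated copies of itself, halving the rotation each step (16, 8, 4, 2, 1 floats); only lane 0 is read back. A scalar sketch of the same log2 tree, for reference only:

inline float reduce_add_f32_tree_ref(float lanes[32]) {
    for (int span = 16; span >= 1; span /= 2) {
        for (int i = 0; i < span; ++i) {
            lanes[i] += lanes[i + span];  // fold the upper half onto the lower half
        }
    }
    return lanes[0];
}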
inline HVX_Vector vec_reduction_qf16(HVX_Vector sums) {
constexpr const size_t kFloatsPerVector = hexagon::kBytesPerVector / sizeof(npu_device_fp16_t);
static_assert(kFloatsPerVector == 64, "kFloatsPerVector should be 64");
sums = Q6_Vqf16_vadd_Vqf16Vqf16(sums, Q6_V_vror_VR(sums, 32 * sizeof(npu_device_fp16_t)));
sums = Q6_Vqf16_vadd_Vqf16Vqf16(sums, Q6_V_vror_VR(sums, 16 * sizeof(npu_device_fp16_t)));
sums = Q6_Vqf16_vadd_Vqf16Vqf16(sums, Q6_V_vror_VR(sums, 8 * sizeof(npu_device_fp16_t)));
sums = Q6_Vqf16_vadd_Vqf16Vqf16(sums, Q6_V_vror_VR(sums, 4 * sizeof(npu_device_fp16_t)));
sums = Q6_Vqf16_vadd_Vqf16Vqf16(sums, Q6_V_vror_VR(sums, 2 * sizeof(npu_device_fp16_t)));
sums = Q6_Vqf16_vadd_Vqf16Vqf16(sums, Q6_V_vror_VR(sums, sizeof(npu_device_fp16_t)));
return sums;
}
inline float vec_reduction_qf16_f32(HVX_Vector sums) {
HVX_Vector vect = Q6_Vhf_equals_Vqf16(vec_reduction_qf16(sums));
uint16_t i = (vect[0] & 0xffff);
return reinterpret_cast<__fp16 &>(i);
}
inline HVX_Vector hvx_scale_f32(float scale) {
return Q6_V_vsplat_R(reinterpret_cast<const uint32_t &>(scale));
}
inline HVX_Vector hvx_vec_scale_f32_f32(HVX_Vector src, HVX_UVector *, HVX_Vector scale_vec) {
return Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(src, scale_vec));
}
inline HVX_Vector hvx_vec_mad_f32_f32(HVX_Vector src, HVX_UVector * dst_ptr, HVX_Vector scale_vec) {
HVX_Vector dst = *dst_ptr; // TODO: opt the unaligned case?
src = Q6_Vqf32_vmpy_VsfVsf(src, scale_vec);
src = Q6_Vqf32_vadd_Vqf32Vsf(src, dst);
return Q6_Vsf_equals_Vqf32(src);
}
inline HVX_Vector hvx_scale_f16(float scale) {
__fp16 f16_scale = scale;
return Q6_Vh_vsplat_R(reinterpret_cast<const npu_device_fp16_t &>(f16_scale));
}
inline HVX_Vector hvx_vec_scale_f16_f16(HVX_Vector src, HVX_UVector *, HVX_Vector scale_vec) {
return Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(src, scale_vec));
}
inline HVX_Vector hvx_vec_mad_f16_f16(HVX_Vector src, HVX_UVector * dst_ptr, HVX_Vector scale_vec) {
HVX_Vector dst = *dst_ptr; // TODO: opt the unaligned case?
HVX_Vector scaled = Q6_Vqf16_vmpy_VhfVhf(src, scale_vec);
HVX_Vector result = Q6_Vqf16_vadd_Vqf16Vhf(scaled, dst);
return Q6_Vhf_equals_Vqf16(result);
}
inline HVX_Vector hvx_nop(float /* scale: unused */) {
return HVX_Vector();
}
inline HVX_Vector hvx_passthru(HVX_Vector src, HVX_UVector *, HVX_Vector) {
return src;
}
} // namespace hexagon
#include "vec_math.inl"
#include "vec_ops.inl"
#include "vec_quant.inl"
namespace hexagon {
inline void vec_scale_f32(const float * src, float scale, float * dst, size_t count) {
using namespace hexagon::vec;
vec_scale_impl<hvx_vec_scale_f32_f32, hvx_scale_f32, float>(src, scale, dst, count);
}
inline void vec_mad_f32(const float * src, float scale, float * dst, size_t count) {
using namespace hexagon::vec;
vec_scale_impl<hvx_vec_mad_f32_f32, hvx_scale_f32, float>(src, scale, dst, count);
}
inline void vec_cpy_f32(const float * src, float * dst, size_t count) {
using namespace hexagon::vec;
vec_scale_impl<hvx_passthru, hvx_nop, float>(src, 0, dst, count);
}
inline void vec_zero_f32(float * src, size_t count) {
using namespace hexagon::vec;
vec_zero_impl<float>(src, count);
}
inline void vec_scale_f16(const npu_device_fp16_t * src, float scale, npu_device_fp16_t * dst, size_t count) {
using namespace hexagon::vec;
vec_scale_impl<hvx_vec_scale_f16_f16, hvx_scale_f16, npu_device_fp16_t>(src, scale, dst, count);
}
inline void vec_mad_f16(const npu_device_fp16_t * src, float scale, npu_device_fp16_t * dst, size_t count) {
using namespace hexagon::vec;
vec_scale_impl<hvx_vec_mad_f16_f16, hvx_scale_f16, npu_device_fp16_t>(src, scale, dst, count);
}
inline void vec_cpy_f16(const npu_device_fp16_t * src, npu_device_fp16_t * dst, size_t count) {
using namespace hexagon::vec;
vec_scale_impl<hvx_passthru, hvx_nop, npu_device_fp16_t>(src, 0, dst, count);
}
inline void vec_zero_f16(npu_device_fp16_t * src, size_t count) {
using namespace hexagon::vec;
vec_zero_impl<npu_device_fp16_t>(src, count);
}
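A hedged usage sketch of the helpers above (buffers hypothetical): the MAD/zero/copy trio is the pattern an accumulation loop would use to build up a weighted sum of rows.

void accumulate_scaled_row_example(const float * row, float weight, float * acc, size_t n) {
    hexagon::vec_mad_f32(row, weight, acc, n);  // acc[i] += weight * row[i]
}

void reset_then_copy_example(const float * src, float * dst, size_t n) {
    hexagon::vec_zero_f32(dst, n);      // dst[i] = 0
    hexagon::vec_cpy_f32(src, dst, n);  // dst[i] = src[i]
}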
template <typename _TElem0, typename _TElem1>
inline bool is_dot_product_aligned(const _TElem0 * src0, const _TElem1 * src1, size_t count) {
static_assert(sizeof(_TElem0) <= sizeof(_TElem1), "src0 element size must not exceed src1 element size");
if ((src0 && !hexagon::is_addr_aligned(src0)) || (src1 && !hexagon::is_addr_aligned(src1))) {
return false;
}
if (count % (hexagon::kBytesPerVector / sizeof(_TElem0)) != 0) {
return false;
}
return true;
}
inline HVX_Vector vec_dot_product_vqf32_f32_f32(const float * src0, const float * src1, size_t count) {
using namespace hexagon::vec;
return vec_dot_product_impl<float, HVX_Vector, vec_mpy_qf32, vec_add_qf32, vec_reduction_qf32>(src0, src1, count);
}
inline HVX_Vector vec_dot_product_aligned_vqf32_f32_f32(const float * src0, const float * src1, size_t count) {
using namespace hexagon::vec;
return vec_dot_product_aligned_impl<float, HVX_Vector, vec_mpy_qf32, vec_add_qf32, vec_reduction_qf32>(src0, src1,
count);
}
inline float vec_dot_product_f32_f32(const float * src0, const float * src1, size_t count) {
using namespace hexagon::vec;
return vec_dot_product_impl<float, float, vec_mpy_qf32, vec_add_qf32, vec_reduction_f32_qf32>(src0, src1, count);
}
inline float vec_dot_product_aligned_f32_f32(const float * src0, const float * src1, size_t count) {
using namespace hexagon::vec;
return vec_dot_product_aligned_impl<float, float, vec_mpy_qf32, vec_add_qf32, vec_reduction_f32_qf32>(src0, src1,
count);
}
inline bool is_f32_f32_dot_product_aligned(const float * src0, const float * src1, size_t count) {
return is_dot_product_aligned<float, float>(src0, src1, count);
}
inline HVX_Vector vec_dot_product_vqf16_f16_f16(const npu_device_fp16_t * src0,
const npu_device_fp16_t * src1,
size_t count) {
using namespace hexagon::vec;
return vec_dot_product_impl<npu_device_fp16_t, HVX_Vector, vec_mpy_qf16, vec_add_qf16, vec_reduction_qf16>(
src0, src1, count);
}
inline HVX_Vector vec_dot_product_aligned_vqf16_f16_f16(const npu_device_fp16_t * src0,
const npu_device_fp16_t * src1,
size_t count) {
using namespace hexagon::vec;
return vec_dot_product_aligned_impl<npu_device_fp16_t, HVX_Vector, vec_mpy_qf16, vec_add_qf16, vec_reduction_qf16>(
src0, src1, count);
}
inline float vec_dot_product_f16_f16(const npu_device_fp16_t * src0, const npu_device_fp16_t * src1, size_t count) {
using namespace hexagon::vec;
return vec_dot_product_impl<npu_device_fp16_t, float, vec_mpy_qf16, vec_add_qf16, vec_reduction_qf16_f32>(
src0, src1, count);
}
inline float vec_dot_product_aligned_f16_f16(const npu_device_fp16_t * src0,
const npu_device_fp16_t * src1,
size_t count) {
using namespace hexagon::vec;
return vec_dot_product_aligned_impl<npu_device_fp16_t, float, vec_mpy_qf16, vec_add_qf16, vec_reduction_qf16_f32>(
src0, src1, count);
}
inline bool is_f16_f16_dot_product_aligned(const npu_device_fp16_t * src0,
const npu_device_fp16_t * src1,
size_t count) {
return is_dot_product_aligned<npu_device_fp16_t, npu_device_fp16_t>(src0, src1, count);
}
inline HVX_Vector vec_dot_product_vqf32_f16_f32(const npu_device_fp16_t * src0, const float * src1, size_t count) {
using namespace hexagon::vec;
using namespace hexagon::vec::math;
return vec_dot_product_mixed_impl<npu_device_fp16_t, float, HVX_Vector, hvx_vsf_convert_vhf, vec_mpy_qf32,
vec_add_qf32, vec_reduction_qf32>(src0, src1, count);
}
inline HVX_Vector vec_dot_product_aligned_vqf32_f16_f32(const npu_device_fp16_t * src0,
const float * src1,
size_t count) {
using namespace hexagon::vec;
using namespace hexagon::vec::math;
return vec_dot_product_mix_aligned_impl<npu_device_fp16_t, float, HVX_Vector, hvx_vsf_convert_vhf, vec_mpy_qf32,
vec_add_qf32, vec_reduction_qf32>(src0, src1, count);
}
inline float vec_dot_product_f16_f32(const npu_device_fp16_t * src0, const float * src1, size_t count) {
using namespace hexagon::vec;
using namespace hexagon::vec::math;
return vec_dot_product_mixed_impl<npu_device_fp16_t, float, float, hvx_vsf_convert_vhf, vec_mpy_qf32, vec_add_qf32,
vec_reduction_f32_qf32>(src0, src1, count);
}
inline float vec_dot_product_aligned_f16_f32(const npu_device_fp16_t * src0, const float * src1, size_t count) {
using namespace hexagon::vec;
using namespace hexagon::vec::math;
return vec_dot_product_mix_aligned_impl<npu_device_fp16_t, float, float, hvx_vsf_convert_vhf, vec_mpy_qf32,
vec_add_qf32, vec_reduction_f32_qf32>(src0, src1, count);
}
inline HVX_Vector vec_dot_product_vqf32_q40_f32(const npu_device_block_q4_0 * src0,
const float * src1,
size_t count,
const HVX_Vector table) {
using namespace hexagon::vec;
using namespace hexagon::vec::math;
using namespace hexagon::vec::quant;
alignas(hexagon::kBytesPerVector) static const HVX_Vector qs_indices =
make_qs_load_mask<npu_device_block_q4_0, q4_qs_shuff_idx>();
alignas(hexagon::kBytesPerVector) static const HVX_Vector scale_indices =
Q6_Vh_vshuff_Vh(make_scale_load_mask<npu_device_block_q4_0>());
return vec_dot_product_quant_impl<npu_device_block_q4_0, float, HVX_Vector, load_dequant_vec_q40_qf32_4blocks,
load_dequant_vec_q40_qf32_2blocks, load_dequant_vec_q40_qf32_1block,
vec_reduction_qf32>(src0, src1, count, qs_indices, scale_indices, table);
}
inline bool is_f16_f32_dot_product_aligned(const npu_device_fp16_t * src0, const float * src1, size_t count) {
return is_dot_product_aligned<npu_device_fp16_t, float>(src0, src1, count);
}
template <typename _TFunc> struct dot_func_traits {};
template <typename _TData, typename _TReturn> struct dot_func_traits<_TReturn (*)(_TData, _TData, size_t)> {
using param_type = std::remove_const_t<std::remove_pointer_t<_TData>>;
using return_type = _TReturn;
};
template <auto _DotFunc, typename _TReturn = typename dot_func_traits<decltype(_DotFunc)>::return_type>
_TReturn type_erase_dot_func(const void * src0, const void * src1, size_t count) {
using param_type = typename dot_func_traits<decltype(_DotFunc)>::param_type;
auto * src0_typed = reinterpret_cast<const param_type *>(src0);
auto * src1_typed = reinterpret_cast<const param_type *>(src1);
return _DotFunc(src0_typed, src1_typed, count);
}
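type_erase_dot_func adapts a typed kernel to the float (*)(const void *, const void *, size_t) shape stored in device_type_traits; a hedged sketch of storing and invoking such an erased pointer (names hypothetical):

using erased_dot_fn = float (*)(const void * src0, const void * src1, size_t count);

constexpr erased_dot_fn kF32DotExample = hexagon::type_erase_dot_func<hexagon::vec_dot_product_f32_f32>;

inline float dot_rows_example(const float * a, const float * b, size_t n) {
    return kF32DotExample(a, b, n);  // same result as vec_dot_product_f32_f32(a, b, n)
}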
inline HVX_Vector vec_silu_f32_f32(HVX_Vector x, HVX_VectorPair_x4 coeff) {
using namespace hexagon::vec::math;
constexpr float kMaxExp = 88.02f; // log(INF)
const HVX_Vector max_exp = Q6_V_vsplat_R(reinterpret_cast<const uint32_t &>(kMaxExp));
HVX_Vector one = Q6_V_vsplat_R(0x3F800000);
// x/(1.0f + expf(-x));
HVX_Vector exp_neg_x = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vsub_VsfVsf(Q6_V_vzero(), x));
HVX_VectorPred pred0 = Q6_Q_vcmp_gt_VsfVsf(exp_neg_x, max_exp);
HVX_Vector denom = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(qhmath_hvx_exp_vf(exp_neg_x), one));
HVX_Vector out = qhmath_hvx_div_vf(x, denom, coeff);
out = Q6_V_vmux_QVV(pred0, Q6_V_vzero(), out);
return out;
}
inline HVX_Vector vec_silu_f16_f16(HVX_Vector x, HVX_VectorPair_x4 coeff) {
using namespace hexagon::vec::math;
constexpr __fp16 kMaxExp = 11.0898664f; // log(INF)
const HVX_Vector max_exp = Q6_Vh_vsplat_R(reinterpret_cast<const uint16_t &>(kMaxExp));
HVX_Vector one = Q6_Vh_vsplat_R(0x3c00);
// x/(1.0f + expf(-x));
HVX_Vector exp_neg_x = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vsub_VhfVhf(Q6_V_vzero(), x));
HVX_VectorPred pred0 = Q6_Q_vcmp_gt_VhfVhf(exp_neg_x, max_exp);
HVX_Vector denom = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vadd_VhfVhf(qhmath_hvx_exp_vhf(exp_neg_x), one));
HVX_Vector out = qhmath_hvx_div_vhf(x, denom, coeff);
out = Q6_V_vmux_QVV(pred0, Q6_V_vzero(), out);
return out;
}
inline HVX_Vector vec_swiglu_f32_f32(HVX_Vector x, HVX_Vector g, HVX_VectorPair_x4 coeff) {
HVX_Vector silu = vec_silu_f32_f32(x, coeff);
return Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(silu, g));
}
inline HVX_Vector vec_swiglu_f16_f16(HVX_Vector x, HVX_Vector g, HVX_VectorPair_x4 coeff) {
HVX_Vector silu = vec_silu_f16_f16(x, coeff);
return Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(silu, g));
}
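Scalar reference for the vectorized SiLU/SwiGLU above (illustrative only, assumes <cmath>): silu(x) = x / (1 + exp(-x)), forced to 0 where exp(-x) would overflow, and swiglu(x, g) = silu(x) * g.

inline float silu_scalar_ref(float x) {
    return x / (1.0f + expf(-x));  // overflow of expf(-x) drives the quotient to 0, matching the vector mux
}

inline float swiglu_scalar_ref(float x, float g) {
    return silu_scalar_ref(x) * g;
}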
} // namespace hexagon

View File

@ -0,0 +1,785 @@
#pragma once
#include "hexagon_npu.h"
#include <hexagon_types.h>
#include <cassert>
#include <cstdint>
#include <type_traits>
namespace hexagon::vec {
template <typename _TElem,
typename _TRet,
HVX_Vector (*_MpyFunc)(HVX_Vector, HVX_Vector),
HVX_Vector (*_AddFunc)(HVX_Vector, HVX_Vector),
_TRet (*_ReduceFunc)(HVX_Vector)>
inline _TRet vec_dot_product_impl(const _TElem * src0, const _TElem * src1, size_t count) {
constexpr const size_t kElementsPerVector = hexagon::kBytesPerVector / sizeof(_TElem);
const HVX_Vector kZeroV = Q6_V_vzero();
HVX_Vector * src0_vec_ptr = ((HVX_Vector *) src0);
HVX_Vector * const src0_vec_ptr_end = ((HVX_Vector *) src0) + count / kElementsPerVector;
HVX_Vector * src1_vec_ptr = ((HVX_Vector *) src1);
HVX_Vector prev0 = *src0_vec_ptr++;
HVX_Vector prev1 = *src1_vec_ptr++;
HVX_Vector sum = kZeroV;
if (src0_vec_ptr_end - src0_vec_ptr > 1) {
HVX_Vector sum0 = kZeroV;
HVX_Vector sum1 = kZeroV;
do {
HVX_VectorPair curr0 = reinterpret_cast<HVX_VectorPair *>(src0_vec_ptr)[0];
HVX_VectorPair curr1 = reinterpret_cast<HVX_VectorPair *>(src1_vec_ptr)[0];
HVX_Vector l0 = Q6_V_valign_VVR(Q6_V_lo_W(curr0), prev0, (size_t) src0);
HVX_Vector l1 = Q6_V_valign_VVR(Q6_V_lo_W(curr1), prev1, (size_t) src1);
HVX_Vector h0 = Q6_V_valign_VVR(Q6_V_hi_W(curr0), Q6_V_lo_W(curr0), (size_t) src0);
HVX_Vector h1 = Q6_V_valign_VVR(Q6_V_hi_W(curr1), Q6_V_lo_W(curr1), (size_t) src1);
HVX_Vector mpy0 = _MpyFunc(l0, l1);
HVX_Vector mpy1 = _MpyFunc(h0, h1);
prev0 = Q6_V_hi_W(curr0);
prev1 = Q6_V_hi_W(curr1);
sum0 = _AddFunc(mpy0, sum0);
sum1 = _AddFunc(mpy1, sum1);
src0_vec_ptr += 2;
src1_vec_ptr += 2;
} while (src0_vec_ptr_end - src0_vec_ptr > 1);
sum = _AddFunc(sum0, sum1);
}
if (src0_vec_ptr_end - src0_vec_ptr > 0) {
HVX_Vector curr0 = *src0_vec_ptr++;
HVX_Vector curr1 = *src1_vec_ptr++;
HVX_Vector s0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0);
HVX_Vector s1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1);
prev0 = curr0;
prev1 = curr1;
sum = _AddFunc(_MpyFunc(s0, s1), sum);
}
const size_t leftover = count % kElementsPerVector;
if ((src0_vec_ptr_end - ((HVX_Vector *) src0)) > 0) {
// handle the last vector
// see also:
// https://github.com/UbiquitousLearning/mllm/blob/babf4410352ce8730824c87699c025a0d4ce3a6f/src/backends/qnn/LLaMAOpPackageHtp/LLaMAPackage/src/ops/LLaMAMul.cpp#L147
// or qualcomm sdk libs\qhl_hvx\src\qhblas_hvx\qhblas_hvx_aw_vector_add_ah.c
bool should_fetch_src0 = leftover != 0 || !hexagon::is_addr_aligned(src0_vec_ptr);
bool should_fetch_src1 = leftover != 0 || !hexagon::is_addr_aligned(src1_vec_ptr);
HVX_Vector curr0 = should_fetch_src0 ? *src0_vec_ptr : prev0;
HVX_Vector curr1 = should_fetch_src1 ? *src1_vec_ptr : prev1;
src0_vec_ptr += should_fetch_src0 ? 1 : 0;
src1_vec_ptr += should_fetch_src1 ? 1 : 0;
HVX_Vector s0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0);
HVX_Vector s1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1);
HVX_Vector mpy0 = _MpyFunc(s0, s1);
prev0 = curr0;
prev1 = curr1;
sum = _AddFunc(mpy0, sum);
}
if (leftover > 0) {
// handle the leftover elements
const size_t leftover_bytes = leftover * sizeof(_TElem);
HVX_Vector curr0 = (leftover_bytes + hexagon::unaligned_bytes(src0_vec_ptr) > hexagon::kBytesPerVector) ?
*src0_vec_ptr :
prev0;
curr0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0);
HVX_Vector curr1 = (leftover_bytes + hexagon::unaligned_bytes(src1_vec_ptr) > hexagon::kBytesPerVector) ?
*src1_vec_ptr :
prev1;
curr1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1);
sum = _AddFunc(Q6_V_valign_VVR(_MpyFunc(curr0, curr1), kZeroV, leftover_bytes), sum);
}
return _ReduceFunc(sum);
}
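The implementation above never issues unaligned vector loads: it keeps the previously loaded aligned vector and splices the logical stream out of (prev, curr) with Q6_V_valign_VVR, the same pattern used by the referenced QHL and mllm kernels. A scalar reference (f32 case) of what every variant in this file must compute:

inline float dot_product_f32_scalar_ref(const float * src0, const float * src1, size_t count) {
    float sum = 0.0f;
    for (size_t i = 0; i < count; ++i) {
        sum += src0[i] * src1[i];
    }
    return sum;
}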
template <typename _TElem,
typename _TRet,
HVX_Vector (*_MpyFunc)(HVX_Vector, HVX_Vector),
HVX_Vector (*_AddFunc)(HVX_Vector, HVX_Vector),
_TRet (*_ReduceFunc)(HVX_Vector)>
inline _TRet vec_dot_product_aligned_impl(const _TElem * src0, const _TElem * src1, size_t count) {
constexpr const size_t kElementsPerVector = hexagon::kBytesPerVector / sizeof(_TElem);
const HVX_Vector kZeroV = Q6_V_vzero();
HVX_Vector * src0_vec_ptr = ((HVX_Vector *) src0);
HVX_Vector * const src0_vec_ptr_end = ((HVX_Vector *) src0) + count / kElementsPerVector;
HVX_Vector * src1_vec_ptr = ((HVX_Vector *) src1);
HVX_Vector sum = kZeroV;
{
HVX_Vector sum0 = kZeroV;
HVX_Vector sum1 = kZeroV;
while (src0_vec_ptr_end - src0_vec_ptr > 3) {
HVX_VectorPair curr00 = reinterpret_cast<HVX_VectorPair *>(src0_vec_ptr)[0];
HVX_VectorPair curr10 = reinterpret_cast<HVX_VectorPair *>(src1_vec_ptr)[0];
HVX_VectorPair curr01 = reinterpret_cast<HVX_VectorPair *>(src0_vec_ptr)[1];
HVX_VectorPair curr11 = reinterpret_cast<HVX_VectorPair *>(src1_vec_ptr)[1];
HVX_Vector mpy0 = _MpyFunc(Q6_V_lo_W(curr00), Q6_V_lo_W(curr10));
HVX_Vector mpy1 = _MpyFunc(Q6_V_hi_W(curr00), Q6_V_hi_W(curr10));
HVX_Vector mpy2 = _MpyFunc(Q6_V_lo_W(curr01), Q6_V_lo_W(curr11));
HVX_Vector mpy3 = _MpyFunc(Q6_V_hi_W(curr01), Q6_V_hi_W(curr11));
sum0 = _AddFunc(mpy0, sum0);
sum1 = _AddFunc(mpy1, sum1);
sum0 = _AddFunc(mpy2, sum0);
sum1 = _AddFunc(mpy3, sum1);
src0_vec_ptr += 4;
src1_vec_ptr += 4;
}
if (src0_vec_ptr_end - src0_vec_ptr > 1) {
HVX_VectorPair curr0 = reinterpret_cast<HVX_VectorPair *>(src0_vec_ptr)[0];
HVX_VectorPair curr1 = reinterpret_cast<HVX_VectorPair *>(src1_vec_ptr)[0];
src0_vec_ptr += 2;
src1_vec_ptr += 2;
HVX_Vector mpy0 = _MpyFunc(Q6_V_lo_W(curr0), Q6_V_lo_W(curr1));
HVX_Vector mpy1 = _MpyFunc(Q6_V_hi_W(curr0), Q6_V_hi_W(curr1));
sum0 = _AddFunc(mpy0, sum0);
sum1 = _AddFunc(mpy1, sum1);
}
sum = _AddFunc(sum0, sum1);
}
if (src0_vec_ptr_end - src0_vec_ptr > 0) {
HVX_Vector curr0 = src0_vec_ptr[0];
HVX_Vector curr1 = src1_vec_ptr[0];
sum = _AddFunc(_MpyFunc(curr0, curr1), sum);
}
return _ReduceFunc(sum);
}
inline HVX_Vector vec_mpy_qf32(HVX_Vector src0, HVX_Vector src1) {
return Q6_Vqf32_vmpy_VsfVsf(src0, src1);
}
inline HVX_Vector vec_add_qf32(HVX_Vector sum, HVX_Vector result) {
return Q6_Vqf32_vadd_Vqf32Vqf32(sum, result);
}
inline HVX_Vector vec_mpy_qf16(HVX_Vector src0, HVX_Vector src1) {
return Q6_Vqf16_vmpy_VhfVhf(src0, src1);
}
inline HVX_Vector vec_add_qf16(HVX_Vector sum, HVX_Vector result) {
return Q6_Vqf16_vadd_Vqf16Vqf16(sum, result);
}
template <typename _TElem0,
typename _TElem1,
typename _TRet,
HVX_Vector_x2 (*_ExpandFunc)(HVX_Vector, HVX_Vector),
HVX_Vector (*_MpyFunc)(HVX_Vector, HVX_Vector),
HVX_Vector (*_AddFunc)(HVX_Vector, HVX_Vector),
_TRet (*_ReduceFunc)(HVX_Vector)>
inline _TRet vec_dot_product_mixed_impl(const _TElem0 * src0, const _TElem1 * src1, size_t count) {
static_assert(sizeof(_TElem0) < sizeof(_TElem1), "Element size mismatch: _TElem0 must be smaller than _TElem1");
static_assert((sizeof(_TElem1) / sizeof(_TElem0)) == 2,
"Element size mismatch: _TElem1 must be twice the size of _TElem0");
static_assert((sizeof(_TElem1) % sizeof(_TElem0)) == 0,
"Element size mismatch: _TElem1 must be a multiple of _TElem0");
constexpr const size_t kElementsPerVector0 = hexagon::kBytesPerVector / sizeof(_TElem0);
constexpr const size_t kElementsPerVector1 = hexagon::kBytesPerVector / sizeof(_TElem1);
constexpr const __fp16 kOne = 1.0f;
const HVX_Vector kOneV = Q6_Vh_vsplat_R(reinterpret_cast<const uint16_t &>(kOne));
const HVX_Vector kZeroV = Q6_V_vzero();
const _TElem0 * const src0_ptr_end = src0 + count;
HVX_Vector * src0_vec_ptr = ((HVX_Vector *) src0);
HVX_Vector * src1_vec_ptr = ((HVX_Vector *) src1);
HVX_Vector * const src1_vec_ptr_end = ((HVX_Vector *) src1) + count / kElementsPerVector1;
HVX_Vector prev0 = *src0_vec_ptr++;
HVX_Vector prev1 = *src1_vec_ptr++;
HVX_Vector sum = kZeroV;
if (src1_vec_ptr_end - src1_vec_ptr > 1) {
HVX_Vector sum0 = kZeroV;
HVX_Vector sum1 = kZeroV;
do {
HVX_Vector curr0 = src0_vec_ptr[0];
HVX_Vector s0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0);
HVX_Vector_x2 s0_pair = _ExpandFunc(s0, kOneV);
HVX_Vector curr10 = src1_vec_ptr[0];
HVX_Vector curr11 = src1_vec_ptr[1];
HVX_Vector l1 = Q6_V_valign_VVR(curr10, prev1, (size_t) src1);
HVX_Vector h1 = Q6_V_valign_VVR(curr11, curr10, (size_t) src1);
HVX_Vector mpy0 = _MpyFunc(s0_pair.val[0], l1);
HVX_Vector mpy1 = _MpyFunc(s0_pair.val[1], h1);
prev0 = curr0;
prev1 = curr11;
sum0 = _AddFunc(mpy0, sum0);
sum1 = _AddFunc(mpy1, sum1);
src0_vec_ptr++;
src1_vec_ptr += 2;
} while (src1_vec_ptr_end - src1_vec_ptr > 1);
sum = _AddFunc(sum0, sum1);
}
const size_t leftover1 = count % kElementsPerVector1;
if ((src1_vec_ptr_end - ((HVX_Vector *) src1)) > 0) {
// handle the last vector
const bool should_fetch_src0 =
reinterpret_cast<const _TElem0 *>(hexagon::align_down(src0_vec_ptr)) < src0_ptr_end;
HVX_Vector curr0 = should_fetch_src0 ? *src0_vec_ptr : prev0;
src0_vec_ptr += should_fetch_src0 ? 1 : 0;
HVX_Vector s0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0);
HVX_Vector_x2 s0_pair = _ExpandFunc(s0, kOneV);
const bool has_remaining_src1_vector = src1_vec_ptr_end - src1_vec_ptr > 0;
if (has_remaining_src1_vector) {
HVX_Vector curr1 = *src1_vec_ptr++;
HVX_Vector s1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1);
HVX_Vector mpy0 = _MpyFunc(s0_pair.val[0], s1);
prev1 = curr1;
sum = _AddFunc(mpy0, sum);
}
bool should_fetch_src1 = leftover1 != 0 || !hexagon::is_addr_aligned(src1_vec_ptr);
HVX_Vector curr1 = should_fetch_src1 ? *src1_vec_ptr : prev1;
src1_vec_ptr += should_fetch_src1 ? 1 : 0;
HVX_Vector s1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1);
prev0 = curr0;
HVX_Vector mpy1 = _MpyFunc(has_remaining_src1_vector ? s0_pair.val[1] : s0_pair.val[0], s1);
prev1 = curr1;
sum = _AddFunc(mpy1, sum);
}
if (leftover1 > 0) {
// handle the leftover elements
const size_t leftover0 = count % kElementsPerVector0;
const size_t leftover_bytes1 = leftover1 * sizeof(_TElem1);
HVX_Vector curr0 =
reinterpret_cast<const _TElem0 *>(hexagon::align_down(src0_vec_ptr)) < src0_ptr_end ? *src0_vec_ptr : prev0;
curr0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0);
HVX_Vector curr1 = (leftover_bytes1 + hexagon::unaligned_bytes(src1_vec_ptr) > hexagon::kBytesPerVector) ?
*src1_vec_ptr :
prev1;
curr1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1);
HVX_Vector_x2 curr0_pair = _ExpandFunc(curr0, kOneV);
curr0 = leftover1 == leftover0 ? curr0_pair.val[0] : curr0_pair.val[1];
sum = _AddFunc(Q6_V_valign_VVR(_MpyFunc(curr0, curr1), kZeroV, leftover_bytes1), sum);
}
return _ReduceFunc(sum);
}
template <typename _TElem0,
typename _TElem1,
typename _TRet,
HVX_Vector_x2 (*_ExpandFunc)(HVX_Vector, HVX_Vector),
HVX_Vector (*_MpyFunc)(HVX_Vector, HVX_Vector),
HVX_Vector (*_AddFunc)(HVX_Vector, HVX_Vector),
_TRet (*_ReduceFunc)(HVX_Vector)>
inline _TRet vec_dot_product_mix_aligned_impl(const _TElem0 * src0, const _TElem1 * src1, size_t count) {
static_assert(sizeof(_TElem0) < sizeof(_TElem1), "Element size mismatch: _TElem0 must be smaller than _TElem1");
static_assert((sizeof(_TElem1) / sizeof(_TElem0)) == 2,
"Element size mismatch: _TElem1 must be twice the size of _TElem0");
static_assert((sizeof(_TElem1) % sizeof(_TElem0)) == 0,
"Element size mismatch: _TElem1 must be a multiple of _TElem0");
constexpr const size_t kElementsPerVector1 = hexagon::kBytesPerVector / sizeof(_TElem1);
constexpr const __fp16 kOne = 1.0f;
const HVX_Vector kOneV = Q6_Vh_vsplat_R(reinterpret_cast<const uint16_t &>(kOne));
const HVX_Vector kZeroV = Q6_V_vzero();
HVX_Vector * src0_vec_ptr = ((HVX_Vector *) src0);
HVX_Vector * src1_vec_ptr = ((HVX_Vector *) src1);
HVX_Vector * const src1_vec_ptr_end = ((HVX_Vector *) src1) + count / kElementsPerVector1;
HVX_Vector sum0 = kZeroV;
HVX_Vector sum1 = kZeroV;
while (src1_vec_ptr_end - src1_vec_ptr > 3) {
HVX_Vector curr0_lo = src0_vec_ptr[0];
HVX_Vector curr10_lo = src1_vec_ptr[0];
HVX_Vector curr0_hi = src0_vec_ptr[1];
HVX_Vector_x2 curr00 = _ExpandFunc(curr0_lo, kOneV);
HVX_Vector curr10_hi = src1_vec_ptr[1];
HVX_Vector_x2 curr01 = _ExpandFunc(curr0_hi, kOneV);
HVX_Vector mpy0 = _MpyFunc(curr00.val[0], curr10_lo);
HVX_Vector mpy1 = _MpyFunc(curr00.val[1], curr10_hi);
HVX_Vector curr11_lo = src1_vec_ptr[2];
HVX_Vector curr11_hi = src1_vec_ptr[3];
sum0 = _AddFunc(mpy0, sum0);
sum1 = _AddFunc(mpy1, sum1);
HVX_Vector mpy2 = _MpyFunc(curr01.val[0], curr11_lo);
HVX_Vector mpy3 = _MpyFunc(curr01.val[1], curr11_hi);
sum0 = _AddFunc(mpy2, sum0);
sum1 = _AddFunc(mpy3, sum1);
src0_vec_ptr += 2;
src1_vec_ptr += 4;
}
if (src1_vec_ptr_end - src1_vec_ptr > 1) {
HVX_Vector curr0 = src0_vec_ptr[0];
HVX_Vector curr1_lo = src1_vec_ptr[0];
HVX_Vector_x2 s0_pair = _ExpandFunc(curr0, kOneV);
HVX_Vector curr1_hi = src1_vec_ptr[1];
HVX_Vector mpy0 = _MpyFunc(s0_pair.val[0], curr1_lo);
HVX_Vector mpy1 = _MpyFunc(s0_pair.val[1], curr1_hi);
sum0 = _AddFunc(mpy0, sum0);
sum1 = _AddFunc(mpy1, sum1);
}
return _ReduceFunc(_AddFunc(sum0, sum1));
}
inline HVX_Vector_x2 vec_dot_accum_pair(HVX_VectorPair s0,
HVX_Vector curr10,
HVX_Vector curr11,
HVX_Vector prev1,
HVX_Vector_x2 sums,
size_t offset,
HVX_Vector zero) {
HVX_Vector l0 = Q6_V_lo_W(s0);
HVX_Vector l1 = Q6_V_valign_VVR(curr10, prev1, offset);
HVX_Vector h0 = Q6_V_hi_W(s0);
HVX_Vector h1 = Q6_V_valign_VVR(curr11, curr10, offset);
l1 = Q6_Vqf32_vadd_VsfVsf(zero, l1);
h1 = Q6_Vqf32_vadd_VsfVsf(zero, h1);
HVX_Vector mpy0 = Q6_Vqf32_vmpy_Vqf32Vqf32(l0, l1);
HVX_Vector mpy1 = Q6_Vqf32_vmpy_Vqf32Vqf32(h0, h1);
HVX_Vector_x2 result;
result.val[0] = Q6_Vqf32_vadd_Vqf32Vqf32(mpy0, sums.val[0]);
result.val[1] = Q6_Vqf32_vadd_Vqf32Vqf32(mpy1, sums.val[1]);
return result;
}
template <typename _TQuantElem0,
typename _TElem1,
typename _TRet,
HVX_VectorPair_x2 (*_DequantQuadFunc)(const _TQuantElem0 * src,
const HVX_Vector qs_indices,
const HVX_Vector scale_indices,
const HVX_Vector table),
HVX_VectorPair (*_DequantDualFunc)(const _TQuantElem0 * src,
const HVX_Vector qs_indices,
const HVX_Vector scale_indices,
const HVX_Vector table),
HVX_Vector (*_DequantFunc)(const _TQuantElem0 * src,
const HVX_Vector qs_indices,
const HVX_Vector scale_indices,
const HVX_Vector table),
_TRet (*_ReduceFunc)(HVX_Vector)>
inline _TRet vec_dot_product_quant_impl(const _TQuantElem0 * src0,
const _TElem1 * src1,
size_t count,
const HVX_Vector qs_indices,
const HVX_Vector scale_indices,
const HVX_Vector table) {
constexpr const size_t kElementsPerVector = hexagon::kBytesPerVector / sizeof(_TElem1);
static_assert(std::is_same_v<_TQuantElem0, npu_device_block_q4_0> ||
std::is_same_v<_TQuantElem0, npu_device_block_q4_k> ||
std::is_same_v<_TQuantElem0, npu_device_block_q8_0>,
"Element type mismatch: _TQuantElem0 must be a supported quantization block type");
static_assert(QUANT_BLOCK_SIZE == kElementsPerVector,
"Quant block size mismatch: QUANT_BLOCK_SIZE must be equal to kElementsPerVector");
assert(count % kElementsPerVector == 0 && "Count must be a multiple of kElementsPerVector");
const HVX_Vector kZeroV = Q6_V_vzero();
const _TQuantElem0 * src0_ptr = src0;
HVX_Vector * src1_vec_ptr = ((HVX_Vector *) src1);
HVX_Vector * const src1_vec_ptr_end = ((HVX_Vector *) src1) + count / kElementsPerVector;
HVX_Vector prev1 = *src1_vec_ptr++;
HVX_Vector sum = kZeroV;
if (src1_vec_ptr_end - src1_vec_ptr > 1) {
HVX_Vector_x2 sums = { kZeroV, kZeroV };
while (src1_vec_ptr_end - src1_vec_ptr > 3) {
HVX_VectorPair_x2 s01 = _DequantQuadFunc(src0_ptr, qs_indices, scale_indices, table);
HVX_Vector curr100 = src1_vec_ptr[0];
HVX_Vector curr101 = src1_vec_ptr[1];
HVX_Vector curr110 = src1_vec_ptr[2];
HVX_Vector curr111 = src1_vec_ptr[3];
sums = vec_dot_accum_pair(s01.val[0], curr100, curr101, prev1, sums, (size_t) src1, kZeroV);
sums = vec_dot_accum_pair(s01.val[1], curr110, curr111, curr101, sums, (size_t) src1, kZeroV);
prev1 = curr111;
src0_ptr += 4;
src1_vec_ptr += 4;
}
while (src1_vec_ptr_end - src1_vec_ptr > 1) {
HVX_VectorPair s0 = _DequantDualFunc(src0_ptr, qs_indices, scale_indices, table);
HVX_Vector curr10 = src1_vec_ptr[0];
HVX_Vector curr11 = src1_vec_ptr[1];
sums = vec_dot_accum_pair(s0, curr10, curr11, prev1, sums, (size_t) src1, kZeroV);
prev1 = curr11;
src0_ptr += 2;
src1_vec_ptr += 2;
}
sum = Q6_Vqf32_vadd_Vqf32Vqf32(sums.val[0], sums.val[1]);
}
if (src1_vec_ptr_end - src1_vec_ptr > 0) {
HVX_Vector curr1 = *src1_vec_ptr++;
HVX_Vector s1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1);
HVX_Vector s0 = _DequantFunc(src0_ptr++, qs_indices, scale_indices, table);
s1 = Q6_Vqf32_vadd_VsfVsf(kZeroV, s1);
HVX_Vector mpy0 = Q6_Vqf32_vmpy_Vqf32Vqf32(s0, s1);
prev1 = curr1;
sum = Q6_Vqf32_vadd_Vqf32Vqf32(mpy0, sum);
}
if ((src1_vec_ptr_end - ((HVX_Vector *) src1)) > 0) {
// handle the last vector
// see also:
// https://github.com/UbiquitousLearning/mllm/blob/babf4410352ce8730824c87699c025a0d4ce3a6f/src/backends/qnn/LLaMAOpPackageHtp/LLaMAPackage/src/ops/LLaMAMul.cpp#L147
// or qualcomm sdk libs\qhl_hvx\src\qhblas_hvx\qhblas_hvx_aw_vector_add_ah.c
bool should_fetch_src1 = !hexagon::is_addr_aligned(src1_vec_ptr);
HVX_Vector curr1 = should_fetch_src1 ? *src1_vec_ptr : prev1;
src1_vec_ptr += should_fetch_src1 ? 1 : 0;
HVX_Vector s1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1);
HVX_Vector s0 = _DequantFunc(src0_ptr, qs_indices, scale_indices, table);
s1 = Q6_Vqf32_vadd_VsfVsf(kZeroV, s1);
HVX_Vector mpy0 = Q6_Vqf32_vmpy_Vqf32Vqf32(s0, s1);
prev1 = curr1;
sum = Q6_Vqf32_vadd_Vqf32Vqf32(mpy0, sum);
}
return _ReduceFunc(sum);
}
template <HVX_Vector (*_Func)(HVX_Vector, HVX_UVector *, HVX_Vector),
HVX_Vector (*_FuncScaleConvert)(float),
typename _TParam>
inline void vec_scale_impl(const _TParam * src, float scale, _TParam * dst, size_t count) {
constexpr const size_t kElementsPerVector = hexagon::kBytesPerVector / sizeof(_TParam);
HVX_Vector * src_vec_ptr = ((HVX_Vector *) src);
HVX_Vector * const src_vec_end = ((HVX_Vector *) src) + (count / kElementsPerVector);
HVX_UVector * dst_vec_ptr = ((HVX_UVector *) dst); // TODO: opt the unaligned case?
HVX_Vector prev = *src_vec_ptr++;
const size_t leftover = count % kElementsPerVector;
HVX_Vector scale_vec = _FuncScaleConvert(scale);
while (src_vec_end - src_vec_ptr > 1) {
HVX_VectorPair curr = reinterpret_cast<HVX_VectorPair *>(src_vec_ptr)[0];
src_vec_ptr += 2;
HVX_Vector lo = Q6_V_valign_VVR(Q6_V_lo_W(curr), prev, (size_t) src);
HVX_Vector hi = Q6_V_valign_VVR(Q6_V_hi_W(curr), Q6_V_lo_W(curr), (size_t) src);
prev = Q6_V_hi_W(curr);
dst_vec_ptr[0] = _Func(lo, dst_vec_ptr, scale_vec);
dst_vec_ptr[1] = _Func(hi, dst_vec_ptr + 1, scale_vec);
dst_vec_ptr += 2;
}
if (src_vec_end - src_vec_ptr > 0) {
HVX_Vector curr = *src_vec_ptr++;
HVX_Vector s0 = Q6_V_valign_VVR(curr, prev, (size_t) src);
dst_vec_ptr[0] = _Func(s0, dst_vec_ptr, scale_vec);
dst_vec_ptr++;
prev = curr;
}
if ((src_vec_end - ((HVX_Vector *) src)) > 0) {
// handle the last vector
bool skip_fetch = leftover == 0 && hexagon::is_addr_aligned(src_vec_ptr);
HVX_Vector curr = skip_fetch ? prev : *src_vec_ptr;
src_vec_ptr = skip_fetch ? src_vec_ptr : src_vec_ptr + 1;
HVX_Vector s0 = Q6_V_valign_VVR(curr, prev, (size_t) src);
dst_vec_ptr[0] = _Func(s0, dst_vec_ptr, scale_vec);
dst_vec_ptr++;
prev = curr;
}
if (leftover > 0) {
// handle the leftover elements
const size_t leftover_bytes = leftover * sizeof(_TParam);
HVX_Vector curr =
(leftover_bytes + hexagon::unaligned_bytes(src_vec_ptr) > hexagon::kBytesPerVector) ? *src_vec_ptr : prev;
curr = Q6_V_valign_VVR(curr, prev, (size_t) src);
q6op_vstu_variable_ARV(dst_vec_ptr, leftover_bytes, _Func(curr, dst_vec_ptr, scale_vec));
}
}
template <typename _TData> inline void vec_zero_impl(_TData * src, size_t count) {
constexpr const size_t kElementsPerVector = hexagon::kBytesPerVector / sizeof(_TData);
HVX_UVector * src_vec_ptr = ((HVX_UVector *) src);
HVX_UVector * const src_vec_end = ((HVX_UVector *) src) + (count / kElementsPerVector);
const HVX_Vector kZeroV = Q6_V_vzero();
while (src_vec_end - src_vec_ptr > 1) {
src_vec_ptr[0] = kZeroV;
src_vec_ptr[1] = kZeroV;
src_vec_ptr += 2;
}
if (src_vec_end - src_vec_ptr > 0) {
src_vec_ptr[0] = kZeroV;
src_vec_ptr++;
}
const size_t leftover = count % kElementsPerVector;
if (leftover > 0) {
// handle the leftover elements
const size_t leftover_bytes = leftover * sizeof(_TData);
q6op_vstu_variable_ARV(src_vec_ptr, leftover_bytes, kZeroV);
}
}
template <auto * _OpBinaryTransform, typename _TyData, typename... _TyParams>
inline void vec_trans_impl(const _TyData * src0,
const _TyData * src1,
_TyData * dst,
size_t count,
_TyParams... params) {
static_assert(std::is_same_v<decltype(_OpBinaryTransform), HVX_Vector (*)(HVX_Vector, HVX_Vector, _TyParams...)>,
"Function type mismatch: _OpBinaryTransform must be of type HVX_Vector (*)(HVX_Vector, HVX_Vector, "
"_TyParams...)");
constexpr const size_t kElementsPerVector = hexagon::kBytesPerVector / sizeof(_TyData);
HVX_Vector * src0_vec_ptr = ((HVX_Vector *) src0);
HVX_Vector * const src0_vec_ptr_end = ((HVX_Vector *) src0) + count / kElementsPerVector;
HVX_Vector * src1_vec_ptr = ((HVX_Vector *) src1);
HVX_Vector * dst_vec_ptr = ((HVX_Vector *) dst); // framework will ensure the dst is aligned
HVX_Vector prev0 = *src0_vec_ptr++;
HVX_Vector prev1 = *src1_vec_ptr++;
{
while (src0_vec_ptr_end - src0_vec_ptr > 1) {
HVX_VectorPair curr0 = reinterpret_cast<HVX_VectorPair *>(src0_vec_ptr)[0];
HVX_VectorPair curr1 = reinterpret_cast<HVX_VectorPair *>(src1_vec_ptr)[0];
HVX_Vector l0 = Q6_V_valign_VVR(Q6_V_lo_W(curr0), prev0, (size_t) src0);
HVX_Vector l1 = Q6_V_valign_VVR(Q6_V_lo_W(curr1), prev1, (size_t) src1);
dst_vec_ptr[0] = _OpBinaryTransform(l0, l1, params...);
HVX_Vector h0 = Q6_V_valign_VVR(Q6_V_hi_W(curr0), Q6_V_lo_W(curr0), (size_t) src0);
HVX_Vector h1 = Q6_V_valign_VVR(Q6_V_hi_W(curr1), Q6_V_lo_W(curr1), (size_t) src1);
dst_vec_ptr[1] = _OpBinaryTransform(h0, h1, params...);
prev0 = Q6_V_hi_W(curr0);
prev1 = Q6_V_hi_W(curr1);
src0_vec_ptr += 2;
src1_vec_ptr += 2;
dst_vec_ptr += 2;
}
}
if (src0_vec_ptr_end - src0_vec_ptr > 0) {
HVX_Vector curr0 = *src0_vec_ptr++;
HVX_Vector s0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0);
HVX_Vector curr1 = *src1_vec_ptr++;
HVX_Vector s1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1);
dst_vec_ptr[0] = _OpBinaryTransform(s0, s1, params...);
prev0 = curr0;
prev1 = curr1;
dst_vec_ptr++;
}
const size_t leftover = count % kElementsPerVector;
if ((src0_vec_ptr_end - ((HVX_Vector *) src0)) > 0) {
// handle the last vector
// see also:
// https://github.com/UbiquitousLearning/mllm/blob/babf4410352ce8730824c87699c025a0d4ce3a6f/src/backends/qnn/LLaMAOpPackageHtp/LLaMAPackage/src/ops/LLaMAMul.cpp#L147
// or qualcomm sdk libs\qhl_hvx\src\qhblas_hvx\qhblas_hvx_aw_vector_add_ah.c
bool should_fetch_src0 = leftover != 0 || !hexagon::is_addr_aligned(src0_vec_ptr);
bool should_fetch_src1 = leftover != 0 || !hexagon::is_addr_aligned(src1_vec_ptr);
HVX_Vector curr0 = should_fetch_src0 ? *src0_vec_ptr : prev0;
HVX_Vector s0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0);
HVX_Vector curr1 = should_fetch_src1 ? *src1_vec_ptr : prev1;
HVX_Vector s1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1);
dst_vec_ptr[0] = _OpBinaryTransform(s0, s1, params...);
src0_vec_ptr += should_fetch_src0 ? 1 : 0;
src1_vec_ptr += should_fetch_src1 ? 1 : 0;
prev0 = curr0;
prev1 = curr1;
dst_vec_ptr++;
}
if (leftover > 0) {
// handle the leftover elements
const size_t leftover_bytes = leftover * sizeof(_TyData);
HVX_Vector curr0 = (leftover_bytes + hexagon::unaligned_bytes(src0_vec_ptr) > hexagon::kBytesPerVector) ?
*src0_vec_ptr :
prev0;
curr0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0);
HVX_Vector curr1 = (leftover_bytes + hexagon::unaligned_bytes(src1_vec_ptr) > hexagon::kBytesPerVector) ?
*src1_vec_ptr :
prev1;
curr1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1);
q6op_vstu_variable_ARV(dst_vec_ptr, leftover_bytes, _OpBinaryTransform(curr0, curr1, params...));
}
}
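// Illustrative instantiation (an assumption of this sketch, not taken from the original sources):
// an element-wise fp32 add kernel plugged into vec_trans_impl. The qf32 add-and-convert sequence
// is one plausible choice on v68+ HVX; the kernels actually used by this backend may differ.
inline HVX_Vector hvx_add_f32_sketch(HVX_Vector a, HVX_Vector b) {
    return Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(a, b));
}
inline void vec_add_f32_sketch(const float * src0, const float * src1, float * dst, size_t count) {
    // src0/src1 may be unaligned; dst is expected to be vector aligned (see the comment above)
    vec_trans_impl<hvx_add_f32_sketch, float>(src0, src1, dst, count);
}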
template <auto * _OpUnaryTransform, typename _TyData, typename _TyDataRet, typename... _TyParams>
inline void vec_trans_with_half_ret_impl(const _TyData * src0, _TyDataRet * dst, size_t count, _TyParams... params) {
static_assert(std::is_same_v<decltype(_OpUnaryTransform), HVX_Vector (*)(HVX_VectorPair, _TyParams...)>,
"Function type mismatch: _OpUnaryTransform must be of type HVX_Vector (*)(HVX_Vector, HVX_Vector, "
"_TyParams...)");
static_assert(sizeof(_TyData) / sizeof(_TyDataRet) == 2,
"Element size mismatch: _TyData must be twice the size of _TyDataRet");
constexpr const size_t kElementsPerVector = hexagon::kBytesPerVector / sizeof(_TyData);
const HVX_Vector kZero = Q6_V_vzero();
HVX_Vector * src0_vec_ptr = ((HVX_Vector *) src0);
HVX_Vector * const src0_vec_ptr_end = ((HVX_Vector *) src0) + count / kElementsPerVector;
HVX_Vector * dst_vec_ptr = ((HVX_Vector *) dst); // framework will ensure the dst is aligned
HVX_Vector prev0 = *src0_vec_ptr++;
{
while (src0_vec_ptr_end - src0_vec_ptr > 1) {
HVX_VectorPair curr0 = reinterpret_cast<HVX_VectorPair *>(src0_vec_ptr)[0];
HVX_Vector l0 = Q6_V_valign_VVR(Q6_V_lo_W(curr0), prev0, (size_t) src0);
HVX_Vector h0 = Q6_V_valign_VVR(Q6_V_hi_W(curr0), Q6_V_lo_W(curr0), (size_t) src0);
dst_vec_ptr[0] = _OpUnaryTransform(Q6_W_vcombine_VV(h0, l0), params...);
prev0 = Q6_V_hi_W(curr0);
src0_vec_ptr += 2;
dst_vec_ptr++;
}
}
HVX_Vector result;
uint32_t processed_bytes = 0;
if (src0_vec_ptr_end - src0_vec_ptr > 0) {
HVX_Vector curr0 = *src0_vec_ptr++;
HVX_Vector s0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0);
prev0 = curr0;
result = _OpUnaryTransform(Q6_W_vcombine_VV(kZero, s0), params...);
processed_bytes = kElementsPerVector * sizeof(_TyDataRet);
}
static const HVX_VectorPred mask = Q6_Q_vsetq_R(hexagon::kBytesPerVector / 2);
const size_t src_leftover = count % kElementsPerVector;
if ((src0_vec_ptr_end - ((HVX_Vector *) src0)) > 0) {
// handle the last vector
// see also:
// https://github.com/UbiquitousLearning/mllm/blob/babf4410352ce8730824c87699c025a0d4ce3a6f/src/backends/qnn/LLaMAOpPackageHtp/LLaMAPackage/src/ops/LLaMAMul.cpp#L147
// or qualcomm sdk libs\qhl_hvx\src\qhblas_hvx\qhblas_hvx_aw_vector_add_ah.c
bool should_fetch_src0 = src_leftover != 0 || !hexagon::is_addr_aligned(src0_vec_ptr);
HVX_Vector curr0 = should_fetch_src0 ? *src0_vec_ptr : prev0;
HVX_Vector s0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0);
if (processed_bytes) {
s0 = _OpUnaryTransform(Q6_W_vcombine_VV(s0, kZero), params...);
dst_vec_ptr[0] = Q6_V_vmux_QVV(mask, result, s0); // only update the lower half of the result vector
dst_vec_ptr++;
} else {
result = _OpUnaryTransform(Q6_W_vcombine_VV(kZero, s0), params...);
}
src0_vec_ptr += should_fetch_src0 ? 1 : 0;
prev0 = curr0;
processed_bytes += kElementsPerVector * sizeof(_TyDataRet);
}
if (src_leftover > 0) {
// handle the leftover elements
const size_t src_leftover_bytes = src_leftover * sizeof(_TyData);
HVX_Vector curr0 = (src_leftover_bytes + hexagon::unaligned_bytes(src0_vec_ptr) > hexagon::kBytesPerVector) ?
*src0_vec_ptr :
prev0;
curr0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0);
if (processed_bytes % hexagon::kBytesPerVector) {
curr0 = _OpUnaryTransform(Q6_W_vcombine_VV(curr0, kZero), params...);
curr0 = Q6_V_vmux_QVV(mask, result, curr0);
} else {
curr0 = _OpUnaryTransform(Q6_W_vcombine_VV(kZero, curr0), params...);
}
processed_bytes += src_leftover * sizeof(_TyDataRet);
q6op_vstu_variable_ARV(dst_vec_ptr, processed_bytes % hexagon::kBytesPerVector, curr0);
} else if (processed_bytes % hexagon::kBytesPerVector) {
// TODO: This conditional write-back is suboptimal because it may result in an extra memory write.
q6op_vstu_variable_ARV(dst_vec_ptr, processed_bytes % hexagon::kBytesPerVector, result);
}
}
} // namespace hexagon::vec

View File

@ -0,0 +1,332 @@
#pragma once
#include "hexagon_npu.h"
#include <hexagon_types.h>
#include <cstdint>
namespace hexagon::vec::quant {
template <typename _TStruct, size_t _Count, auto _MemberPtr> inline HVX_Vector load_into_vector(const _TStruct * src) {
static_assert(hexagon::kBytesPerVector >= sizeof(_TStruct) * _Count, "_TStruct too large for vector load");
return *reinterpret_cast<const HVX_UVector *>(&(src->*_MemberPtr));
}
template <typename _TStruct, size_t _Count> inline HVX_Vector load_struct_into_vector(const _TStruct * src) {
static_assert(hexagon::kBytesPerVector >= sizeof(_TStruct) * _Count, "_TStruct too large for vector load");
return *reinterpret_cast<const HVX_UVector *>(src);
}
template <typename _TBlock> inline HVX_Vector load_block_generic(const _TBlock & src) {
static_assert(hexagon::kBytesPerVector >= sizeof(_TBlock), "wrong block size/padding");
return load_into_vector<_TBlock, 1, &_TBlock::qs>(&src);
}
template <typename _TBlock> inline HVX_Vector make_scale_load_mask() {
static_assert(sizeof(_TBlock) < hexagon::kBytesPerVector, "wrong block size/padding");
static_assert(std::is_same_v<decltype(_TBlock::d), npu_device_fp16_t>,
"scale field d must be of type npu_device_fp16_t");
constexpr const size_t kBytesPerScale = QUANT_BLOCK_SIZE * sizeof(_TBlock::d);
const size_t qs_start_offset = offsetof(_TBlock, d);
hexagon::HVX_VectorAlias ret;
size_t base_i = qs_start_offset;
for (size_t ret_idx = 0; ret_idx < hexagon::kBytesPerVector; ++ret_idx) {
const auto offset = ret_idx % kBytesPerScale;
const auto i = base_i + (offset % sizeof(_TBlock::d));
ret.u8[ret_idx] = (i & 1) ? (i / 2 + 64) : (i / 2);
if (offset == kBytesPerScale - 1) {
base_i += sizeof(_TBlock);
}
}
return ret.v;
}
inline size_t default_qs_shuff_idx(size_t idx) {
return idx;
}
inline size_t q4_qs_shuff_idx(size_t idx) {
// TODO: The current mask (kIndexShuffle) is hardcoded for the Q4 quantization block layout,
// where data is arranged in a specific interleaved pattern. A more general solution would
// need to programmatically generate the shuffle mask based on the quantization block's
// structure.
constexpr const size_t kIndexShuffle[] = {
0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 2, 6, 10, 14, 18, 22,
26, 30, 34, 38, 42, 46, 50, 54, 58, 62, 1, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45,
49, 53, 57, 61, 3, 7, 11, 15, 19, 23, 27, 31, 35, 39, 43, 47, 51, 55, 59, 63, 127, 127,
127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
};
return kIndexShuffle[idx];
}
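// Hypothetical alternative (not part of the original sources): the hardcoded kIndexShuffle above is
// regular enough to be generated instead of spelled out. The group order {0, 2, 1, 3} and the stride
// of 4 are read off the table itself and are assumptions of this sketch, not a documented property
// of the Q4 block layout.
inline size_t q4_qs_shuff_idx_generated(size_t idx) {
    constexpr const size_t kGroupOrder[] = { 0, 2, 1, 3 };
    if (idx >= 64) {
        return 127;  // lanes past the packed qs bytes all map to the last byte
    }
    return (idx % 16) * 4 + kGroupOrder[idx / 16];
}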
template <typename _TBlock, size_t (*_FuncGetShuffIdx)(size_t) = default_qs_shuff_idx>
inline HVX_Vector make_qs_load_mask() {
static_assert(sizeof(_TBlock) < hexagon::kBytesPerVector, "wrong block size/padding");
const size_t qs_start_offset = offsetof(_TBlock, qs);
const size_t qs_end_offset = qs_start_offset + sizeof(_TBlock::qs);
hexagon::HVX_VectorAlias ret;
size_t ret_idx = 0;
for (size_t i = 0; i < hexagon::kBytesPerVector; ++i) {
auto offset = i % sizeof(_TBlock);
if (offset >= qs_start_offset && offset < qs_end_offset) {
size_t idx = _FuncGetShuffIdx(ret_idx);
ret.u8[idx] = ((i & 1) ? (i / 2 + 64) : (i / 2));
ret_idx++;
}
}
return ret.v;
}
template <typename _TBlock>
inline hexagon::HVX_Vector_x2 load_dual_block_generic(const _TBlock * srcs,
const HVX_Vector qs_indices,
const HVX_Vector scale_indices) {
static_assert(hexagon::kBytesPerVector >= sizeof(_TBlock) * 2, "wrong block size/padding");
const HVX_Vector blocks = load_struct_into_vector<_TBlock, 2>(srcs);
HVX_Vector block01 = Q6_Vb_vlut32_VbVbI(qs_indices, blocks, 0);
HVX_Vector scale01 = Q6_Vb_vlut32_VbVbI(scale_indices, blocks, 0);
block01 = Q6_Vb_vlut32or_VbVbVbI(block01, qs_indices, blocks, 2);
scale01 = Q6_Vb_vlut32or_VbVbVbI(scale01, scale_indices, blocks, 2);
if constexpr (sizeof(_TBlock) * 4 > hexagon::kBytesPerVector) {
block01 = Q6_Vb_vlut32or_VbVbVbI(block01, qs_indices, blocks, 1);
block01 = Q6_Vb_vlut32or_VbVbVbI(block01, qs_indices, blocks, 3);
scale01 = Q6_Vb_vlut32or_VbVbVbI(scale01, scale_indices, blocks, 1);
scale01 = Q6_Vb_vlut32or_VbVbVbI(scale01, scale_indices, blocks, 3);
}
hexagon::HVX_Vector_x2 result;
result.val[0] = block01;
result.val[1] = scale01;
return result;
}
template <typename _TBlock>
inline hexagon::HVX_Vector_x3 load_qual_block_generic(const _TBlock * srcs,
const HVX_Vector qs_indices,
const HVX_Vector scale_indices) {
static_assert(hexagon::kBytesPerVector >= sizeof(_TBlock) * 4, "wrong block size/padding");
hexagon::HVX_Vector_x3 result;
const HVX_Vector blocks = load_struct_into_vector<_TBlock, 4>(srcs);
{
HVX_Vector block0123 = Q6_Vb_vlut32_VbVbI(qs_indices, blocks, 0);
block0123 = Q6_Vb_vlut32or_VbVbVbI(block0123, qs_indices, blocks, 1);
block0123 = Q6_Vb_vlut32or_VbVbVbI(block0123, qs_indices, blocks, 2);
block0123 = Q6_Vb_vlut32or_VbVbVbI(block0123, qs_indices, blocks, 3);
result.val[0] = block0123;
}
{
HVX_Vector blocks23 = Q6_V_vror_VR(blocks, sizeof(_TBlock) * 2);
HVX_Vector scale01 = Q6_Vb_vlut32_VbVbI(scale_indices, blocks, 0);
scale01 = Q6_Vb_vlut32or_VbVbVbI(scale01, scale_indices, blocks, 2);
HVX_Vector scale23 = Q6_Vb_vlut32_VbVbI(scale_indices, blocks23, 0);
scale23 = Q6_Vb_vlut32or_VbVbVbI(scale23, scale_indices, blocks23, 2);
result.val[1] = scale01;
result.val[2] = scale23;
}
return result;
}
template <typename _TBlock>
inline hexagon::HVX_Vector_x4 load_hexa_block_generic(const _TBlock * srcs,
const HVX_Vector qs_indices,
const HVX_Vector scale_indices) {
static_assert(hexagon::kBytesPerVector >= sizeof(_TBlock) * 6, "wrong block size/padding");
const HVX_Vector blocks = load_struct_into_vector<_TBlock, 6>(srcs);
hexagon::HVX_Vector_x4 result;
{
HVX_Vector block012345 = Q6_Vb_vlut32_VbVbI(qs_indices, blocks, 0);
block012345 = Q6_Vb_vlut32or_VbVbVbI(block012345, qs_indices, blocks, 1);
block012345 = Q6_Vb_vlut32or_VbVbVbI(block012345, qs_indices, blocks, 2);
block012345 = Q6_Vb_vlut32or_VbVbVbI(block012345, qs_indices, blocks, 3);
result.val[0] = block012345;
}
{
HVX_Vector blocks23 = Q6_V_vror_VR(blocks, sizeof(_TBlock) * 2);
HVX_Vector blocks45 = Q6_V_vror_VR(blocks, sizeof(_TBlock) * 4);
HVX_Vector scale01 = Q6_Vb_vlut32_VbVbI(scale_indices, blocks, 0);
scale01 = Q6_Vb_vlut32or_VbVbVbI(scale01, scale_indices, blocks, 2);
HVX_Vector scale23 = Q6_Vb_vlut32_VbVbI(scale_indices, blocks23, 0);
scale23 = Q6_Vb_vlut32or_VbVbVbI(scale23, scale_indices, blocks23, 2);
HVX_Vector scale45 = Q6_Vb_vlut32_VbVbI(scale_indices, blocks45, 0);
scale45 = Q6_Vb_vlut32or_VbVbVbI(scale45, scale_indices, blocks45, 2);
result.val[1] = scale01;
result.val[2] = scale23;
result.val[3] = scale45;
}
return result;
}
inline HVX_Vector dequantize_vec_q40_qf16_2blocks(HVX_Vector qs, HVX_Vector scale01, HVX_Vector table) {
constexpr const uint32_t kSizeOfQs = sizeof(npu_device_block_q4_0::qs);
HVX_Vector q_lo = qs;
HVX_Vector q_hi = Q6_Vub_vlsr_VubR(qs, 4);
HVX_VectorPair qp0 = Q6_W_vshuff_VVR(q_hi, q_lo, kSizeOfQs * (1 + 2));
q_lo = Q6_V_lo_W(qp0);
q_lo = Q6_Vb_vshuff_Vb(q_lo);
qp0 = Q6_Wh_vlut16_VbVhR_nomatch(q_lo, table, 0);
return Q6_Vqf16_vmpy_VhfVhf(Q6_V_lo_W(qp0), scale01);
}
inline HVX_VectorPair dequantize_vec_q40_qf32_2blocks(HVX_Vector qs, HVX_Vector scale01, HVX_Vector table) {
constexpr const uint32_t kSizeOfQs = sizeof(npu_device_block_q4_0::qs);
HVX_Vector q_lo = qs;
HVX_Vector q_hi = Q6_Vub_vlsr_VubR(qs, 4);
HVX_VectorPair qp0 = Q6_W_vshuff_VVR(q_hi, q_lo, kSizeOfQs * 4);
q_lo = Q6_V_lo_W(qp0);
qp0 = Q6_Wh_vlut16_VbVhR_nomatch(q_lo, table, 0);
q_lo = Q6_V_lo_W(qp0);
return Q6_Wqf32_vmpy_VhfVhf(q_lo, scale01);
}
inline HVX_Vector_x2 dequantize_vec_q40_qf16_4blocks(HVX_Vector qs,
HVX_Vector scale01,
HVX_Vector scale23,
HVX_Vector table) {
constexpr const uint32_t kSizeOfQs = sizeof(npu_device_block_q4_0::qs);
HVX_Vector q_lo = qs;
HVX_Vector q_hi = Q6_Vub_vlsr_VubR(qs, 4);
HVX_VectorPair qp0 = Q6_W_vshuff_VVR(q_hi, q_lo, kSizeOfQs * (1 + 2 + 4));
q_lo = Q6_V_lo_W(qp0);
q_lo = Q6_Vb_vshuff_Vb(q_lo);
qp0 = Q6_Wh_vlut16_VbVhR_nomatch(q_lo, table, 0);
q_lo = Q6_V_lo_W(qp0);
q_hi = Q6_V_hi_W(qp0);
q_lo = Q6_Vqf16_vmpy_VhfVhf(q_lo, scale01);
q_hi = Q6_Vqf16_vmpy_VhfVhf(q_hi, scale23);
hexagon::HVX_Vector_x2 result;
result.val[0] = q_lo;
result.val[1] = q_hi;
return result;
}
inline HVX_VectorPair_x2 dequantize_vec_q40_qf32_4blocks(HVX_Vector qs,
HVX_Vector scale01,
HVX_Vector scale23,
HVX_Vector table) {
constexpr const uint32_t kSizeOfQs = sizeof(npu_device_block_q4_0::qs);
HVX_Vector q_lo = qs;
HVX_Vector q_hi = Q6_Vub_vlsr_VubR(qs, 4);
HVX_VectorPair qp0 = Q6_W_vshuff_VVR(q_hi, q_lo, kSizeOfQs * 4);
q_lo = Q6_V_lo_W(qp0);
qp0 = Q6_Wh_vlut16_VbVhR_nomatch(q_lo, table, 0);
q_lo = Q6_V_lo_W(qp0);
q_hi = Q6_V_hi_W(qp0);
hexagon::HVX_VectorPair_x2 result;
result.val[0] = Q6_Wqf32_vmpy_VhfVhf(q_lo, scale01);
result.val[1] = Q6_Wqf32_vmpy_VhfVhf(q_hi, scale23);
return result;
}
inline HVX_VectorPair_x3 dequantize_vec_q40_qf32_6blocks(HVX_Vector qs,
HVX_Vector scale01,
HVX_Vector scale23,
HVX_Vector scale45,
HVX_Vector table) {
constexpr const uint32_t kSizeOfQs = sizeof(npu_device_block_q4_0::qs);
HVX_Vector q_lo = qs;
HVX_Vector q_hi = Q6_Vub_vlsr_VubR(qs, 4);
HVX_VectorPair qp0 = Q6_W_vshuff_VVR(q_hi, q_lo, kSizeOfQs * 4);
q_lo = Q6_V_lo_W(qp0);
q_hi = Q6_V_hi_W(qp0);
qp0 = Q6_Wh_vlut16_VbVhR_nomatch(q_lo, table, 0);
HVX_VectorPair qp1 = Q6_Wh_vlut16_VbVhR_nomatch(q_hi, table, 0);
q_lo = Q6_V_lo_W(qp0);
q_hi = Q6_V_hi_W(qp0);
HVX_Vector q2 = Q6_V_lo_W(qp1);
hexagon::HVX_VectorPair_x3 result;
result.val[0] = Q6_Wqf32_vmpy_VhfVhf(q_lo, scale01);
result.val[1] = Q6_Wqf32_vmpy_VhfVhf(q_hi, scale23);
result.val[2] = Q6_Wqf32_vmpy_VhfVhf(q2, scale45);
return result;
}
inline HVX_Vector load_dequant_vec_q40_qf32_1block(const npu_device_block_q4_0 * src,
const HVX_Vector qs_indices,
const HVX_Vector scale_indices,
const HVX_Vector table) {
// TODO: can we have a single-block version of load and dequantize?
auto qs = load_dual_block_generic(src, qs_indices, scale_indices);
return Q6_V_lo_W(dequantize_vec_q40_qf32_2blocks(qs.val[0], qs.val[1], table));
}
inline HVX_VectorPair load_dequant_vec_q40_qf32_2blocks(const npu_device_block_q4_0 * src,
const HVX_Vector qs_indices,
const HVX_Vector scale_indices,
const HVX_Vector table) {
auto qs = load_dual_block_generic(src, qs_indices, scale_indices);
return dequantize_vec_q40_qf32_2blocks(qs.val[0], qs.val[1], table);
}
inline HVX_VectorPair_x2 load_dequant_vec_q40_qf32_4blocks(const npu_device_block_q4_0 * src,
const HVX_Vector qs_indices,
const HVX_Vector scale_indices,
const HVX_Vector table) {
auto qs = load_qual_block_generic(src, qs_indices, scale_indices);
return dequantize_vec_q40_qf32_4blocks(qs.val[0], qs.val[1], qs.val[2], table);
}
inline HVX_VectorPair_x3 load_dequant_vec_q40_qf32_6blocks(const npu_device_block_q4_0 * src,
const HVX_Vector qs_indices,
const HVX_Vector scale_indices,
const HVX_Vector table) {
auto qs = load_hexa_block_generic(src, qs_indices, scale_indices);
return dequantize_vec_q40_qf32_6blocks(qs.val[0], qs.val[1], qs.val[2], qs.val[3], table);
}
} // namespace hexagon::vec::quant
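// Usage sketch (illustrative only, not part of the original file): the shuffle masks are built once
// per block type and reused for every pair of q4_0 blocks in a row. `dequant_table` is assumed to be
// a precomputed fp16 lookup vector for the 16 quantized levels; how it is built is outside the scope
// of this sketch.
inline void dequant_q4_0_pairs_sketch(const npu_device_block_q4_0 * blocks,
                                      size_t block_count,
                                      HVX_Vector dequant_table,
                                      HVX_VectorPair * out) {
    static const HVX_Vector qs_mask =
        hexagon::vec::quant::make_qs_load_mask<npu_device_block_q4_0, hexagon::vec::quant::q4_qs_shuff_idx>();
    static const HVX_Vector scale_mask = hexagon::vec::quant::make_scale_load_mask<npu_device_block_q4_0>();
    for (size_t i = 0; i + 1 < block_count; i += 2) {
        // each call loads and dequantizes two q4_0 blocks into a pair of qf32 vectors
        out[i / 2] = hexagon::vec::quant::load_dequant_vec_q40_qf32_2blocks(blocks + i, qs_mask, scale_mask,
                                                                            dequant_table);
    }
}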

View File

@ -0,0 +1,128 @@
#pragma once
#include "util.hpp"
#include <HAP_compute_res.h>
#include <HAP_vtcm_mgr.h>
namespace hexagon {
class vtcm_mem {
public:
explicit vtcm_mem(size_t size, bool single_page) {
constexpr const unsigned int kTimeoutUs = 10000; // 10ms timeout
size_t avail_size = single_page ? get_avail_page_size() : get_avail_block_size();
if (size > avail_size) {
DEVICE_LOG_ERROR("Requested VTCM size %zu exceeds available size %zu\n", size, avail_size);
return;
}
compute_res_attr_t compute_res;
HAP_compute_res_attr_init(&compute_res);
HAP_compute_res_attr_set_serialize(&compute_res, false);
HAP_compute_res_attr_set_vtcm_param(&compute_res, size, single_page ? 1 : 0);
_vtcm_context_id = HAP_compute_res_acquire(&compute_res, kTimeoutUs);
if (_vtcm_context_id == 0) {
DEVICE_LOG_ERROR("Failed to acquire VTCM context: %zu bytes, timeout %u us\n", size, kTimeoutUs);
return;
}
_vtcm_mem = HAP_compute_res_attr_get_vtcm_ptr(&compute_res);
if (_vtcm_mem == nullptr) {
DEVICE_LOG_ERROR("Failed to allocate VTCM memory: %zu bytes, timeout %zu us\n", size, kTimeoutUs);
return;
}
_vtcm_size = size;
DEVICE_LOG_DEBUG("VTCM allocated: %p(%zu), avail: %zu\n", _vtcm_mem, size, avail_size);
}
explicit vtcm_mem(size_t size, bool single_page, size_t timeout_us) {
compute_res_attr_t compute_res;
HAP_compute_res_attr_init(&compute_res);
HAP_compute_res_attr_set_serialize(&compute_res, false);
HAP_compute_res_attr_set_vtcm_param(&compute_res, size, single_page ? 1 : 0);
_vtcm_context_id = HAP_compute_res_acquire(&compute_res, timeout_us);
if (_vtcm_context_id == 0) {
DEVICE_LOG_ERROR("Failed to acquire VTCM context: %zu bytes, timeout %zu us\n", size, timeout_us);
return;
}
_vtcm_mem = HAP_compute_res_attr_get_vtcm_ptr(&compute_res);
if (_vtcm_mem == nullptr) {
DEVICE_LOG_ERROR("Failed to allocate VTCM memory: %zu bytes, timeout %zu us\n", size, timeout_us);
return;
}
_vtcm_size = size;
DEVICE_LOG_DEBUG("VTCM allocated: %p(%zu), avail: %zu\n", _vtcm_mem, size, get_avail_block_size());
}
~vtcm_mem() {
if (_vtcm_context_id != 0) {
auto ret = HAP_compute_res_release(_vtcm_context_id);
if (ret != AEE_SUCCESS) {
DEVICE_LOG_ERROR("Failed to release VTCM memory: %d\n", ret);
}
}
DEVICE_LOG_DEBUG("VTCM released: %zu bytes at %p\n", _vtcm_size, _vtcm_mem);
}
bool is_valid() const { return _vtcm_mem != nullptr && _vtcm_size != 0; }
uint8_t * get_mem() const { return reinterpret_cast<uint8_t *>(_vtcm_mem); }
size_t get_size() const { return _vtcm_size; }
static size_t get_total_size() {
unsigned int arch_page_aligned_size = 0;
unsigned int arch_page_count = 0;
auto ret = HAP_query_total_VTCM(&arch_page_aligned_size, &arch_page_count);
if (ret != AEE_SUCCESS) {
DEVICE_LOG_ERROR("Failed to query total VTCM: %d\n", ret);
return 0;
}
return arch_page_aligned_size;
}
static size_t get_avail_block_size() {
unsigned int avail_block_size = 0;
unsigned int avail_page_size = 0;
unsigned int num_pages = 0;
auto ret = HAP_query_avail_VTCM(&avail_block_size, &avail_page_size, &num_pages);
if (ret != AEE_SUCCESS) {
DEVICE_LOG_ERROR("Failed to query available VTCM: %d\n", ret);
return 0;
}
return avail_block_size;
}
static size_t get_avail_page_size() {
unsigned int avail_block_size = 0;
unsigned int avail_page_size = 0;
unsigned int num_pages = 0;
auto ret = HAP_query_avail_VTCM(&avail_block_size, &avail_page_size, &num_pages);
if (ret != AEE_SUCCESS) {
DEVICE_LOG_ERROR("Failed to query available VTCM: %d\n", ret);
return 0;
}
return avail_page_size;
}
private:
void * _vtcm_mem = nullptr;
size_t _vtcm_size = 0;
unsigned int _vtcm_context_id = 0;
DISABLE_COPY_AND_MOVE(vtcm_mem);
};
} // namespace hexagon
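// Usage sketch (illustrative only, not part of the original file): grab a single-page VTCM
// allocation sized from what is currently available and check it before use; the resource is
// released automatically when the object goes out of scope.
inline void vtcm_usage_example_sketch() {
    const size_t page = hexagon::vtcm_mem::get_avail_page_size();
    hexagon::vtcm_mem scratch(page, /* single_page */ true);
    if (!scratch.is_valid()) {
        DEVICE_LOG_ERROR("VTCM allocation failed, falling back to DDR\n");
        return;
    }
    uint8_t * base = scratch.get_mem();  // VTCM-backed scratch area of scratch.get_size() bytes
    (void) base;
}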

View File

@ -0,0 +1,293 @@
#include "buffer.hpp"
#include "host_device.hpp"
#include "profiler.hpp"
#include "tensor.hpp"
#include <rpcmem.h>
namespace {
constexpr const int kRpcMemDefaultHeapId = RPCMEM_HEAP_ID_SYSTEM;
constexpr const uint32_t kRpcMemDefaultFlags = RPCMEM_DEFAULT_FLAGS; // TODO: should we use a different flag?
static hexagon::host_buffer * get_buffer_object(ggml_backend_buffer_t buffer) {
return reinterpret_cast<hexagon::host_buffer *>(buffer->context);
}
static hexagon::host_buffer_type * get_buffer_type_object(ggml_backend_buffer_type_t buft) {
return reinterpret_cast<hexagon::host_buffer_type *>(buft->context);
}
void backend_buffer_free_buffer(ggml_backend_buffer_t buffer) {
SCOPED_PERFORMANCE_TRACKER("[hexagon-npu][%p]backend_buffer_free_buffer", (void *) get_buffer_object(buffer));
delete get_buffer_object(buffer);
}
void * backend_buffer_get_base(ggml_backend_buffer_t buffer) {
auto * buffer_obj = get_buffer_object(buffer);
GGML_ASSERT(buffer_obj != nullptr);
return buffer_obj->get_buffer();
}
ggml_status backend_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
auto * buffer_type_obj = get_buffer_type_object(buffer->buft);
GGML_ASSERT(buffer_type_obj != nullptr);
auto * device_object = buffer_type_obj->get_device();
GGML_ASSERT(device_object != nullptr);
auto * buffer_obj = get_buffer_object(buffer);
GGML_ASSERT(buffer_obj != nullptr);
SCOPED_PERFORMANCE_TRACKER("[hexagon-npu][%p]backend_buffer_init_tensor", (void *) buffer_obj);
auto tensor_object = buffer_obj->init_tensor(tensor, device_object->get_device_handle());
if (!tensor_object) {
LOG_ERROR("Failed to init tensor\n");
return GGML_STATUS_ALLOC_FAILED;
}
return GGML_STATUS_SUCCESS;
}
void backend_buffer_memset_tensor(ggml_backend_buffer_t buffer,
ggml_tensor * tensor,
uint8_t value,
size_t offset,
size_t size) {
SCOPED_PERFORMANCE_TRACKER("[hexagon-npu][%p]backend_buffer_memset_tensor.size.%zu",
(void *) get_buffer_object(buffer), size);
memset((char *) tensor->data + offset, value, size);
}
void backend_buffer_set_tensor(ggml_backend_buffer_t buffer,
ggml_tensor * tensor,
const void * data,
size_t offset,
size_t size) {
SCOPED_PERFORMANCE_TRACKER("[hexagon-npu][%p]backend_buffer_set_tensor.size.%zu",
(void *) get_buffer_object(buffer), size);
// TODO: use DMA instead of memcpy?
memcpy((char *) tensor->data + offset, data, size);
}
void backend_buffer_get_tensor(ggml_backend_buffer_t buffer,
const ggml_tensor * tensor,
void * data,
size_t offset,
size_t size) {
SCOPED_PERFORMANCE_TRACKER("[hexagon-npu][%p]backend_buffer_get_tensor", (void *) get_buffer_object(buffer));
// TODO: use DMA instead of memcpy?
memcpy(data, (const char *) tensor->data + offset, size);
}
bool backend_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) {
SCOPED_PERFORMANCE_TRACKER("[hexagon-npu][%p]backend_buffer_cpy_tensor", (void *) get_buffer_object(buffer));
if (ggml_backend_buffer_is_host(src->buffer)) {
// TODO: use DMA instead of memcpy?
memcpy(dst->data, src->data, ggml_nbytes(src));
return true;
}
LOG_DEBUG("[hexagon-npu][%p]backend_buffer_cpy_tensor: copy from non-host buffer not supported\n",
(void *) get_buffer_object(buffer));
return false;
}
void backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
auto * buffer_obj = get_buffer_object(buffer);
GGML_ASSERT(buffer_obj != nullptr);
SCOPED_PERFORMANCE_TRACKER("[hexagon-npu][%p]backend_buffer_clear", (void *) buffer_obj);
memset(buffer_obj->get_buffer(), value, buffer_obj->get_size());
}
void backend_buffer_reset(ggml_backend_buffer_t buffer) {
auto * buffer_obj = get_buffer_object(buffer);
GGML_ASSERT(buffer_obj != nullptr);
SCOPED_PERFORMANCE_TRACKER("[hexagon-npu][%p]backend_buffer_reset", (void *) buffer_obj);
buffer_obj->clear_tensors();
}
constexpr const ggml_backend_buffer_i backend_buffer_interface = {
/* .free_buffer = */ backend_buffer_free_buffer,
/* .get_base = */ backend_buffer_get_base,
/* .init_tensor = */ backend_buffer_init_tensor,
/* .memset_tensor = */ backend_buffer_memset_tensor,
/* .set_tensor = */ backend_buffer_set_tensor,
/* .get_tensor = */ backend_buffer_get_tensor,
/* .cpy_tensor = */ backend_buffer_cpy_tensor,
/* .clear = */ backend_buffer_clear,
/* .reset = */ backend_buffer_reset,
};
const char * backend_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
auto * buffer_type_obj = get_buffer_type_object(buft);
GGML_ASSERT(buffer_type_obj != nullptr);
return buffer_type_obj->get_name();
}
ggml_backend_buffer_t backend_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
auto * buffer_type_obj = get_buffer_type_object(buft);
GGML_ASSERT(buffer_type_obj != nullptr);
return buffer_type_obj->allocate_buffer(size);
}
size_t backend_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
auto * buffer_type_obj = get_buffer_type_object(buft);
GGML_ASSERT(buffer_type_obj != nullptr);
return buffer_type_obj->get_buffer_alignment();
}
size_t backend_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) {
auto * buffer_type_obj = get_buffer_type_object(buft);
GGML_ASSERT(buffer_type_obj != nullptr);
auto size = buffer_type_obj->get_max_buffer_size();
LOG_DEBUG("[hexagon-npu][%s]max_buffer_size: %zu\n", buffer_type_obj->get_name(), size);
return size;
}
bool backend_buffer_is_host(ggml_backend_buffer_type_t buft) {
return buft->iface.get_name == backend_buffer_type_get_name;
}
} // namespace
namespace hexagon {
host_buffer::host_buffer(common::rpc_mem_ptr allocator, size_t size, uint32_t domain_id) :
_allocator(allocator),
_size(size),
_domain_id(domain_id) {
if (!_allocator->is_valid()) {
LOG_ERROR("[hexagon-npu]rpc memory not initialized\n");
return;
}
if (size > _allocator->get_max_alloc_size()) {
LOG_ERROR("[hexagon-npu]rpc memory size %zu exceeds max alloc size %zu\n", size,
_allocator->get_max_alloc_size());
return;
}
_data = _allocator->alloc(kRpcMemDefaultHeapId, kRpcMemDefaultFlags, size);
if (!_data) {
LOG_ERROR("[hexagon-npu]failed to allocate rpc memory, size: %d MB\n", (int) (size / (1 << 20)));
return;
}
LOG_DEBUG("[hexagon-npu]create host_buffer(%p), size: %zu, domain_id: %d\n", (void *) _data, size, (int) domain_id);
}
host_buffer::~host_buffer() {
LOG_DEBUG("[hexagon-npu]destroy host_buffer(%p), size: %zu, domain_id: %d\n", (void *) _data, _size,
(int) _domain_id);
_tensors.clear();
if (_buffer_fd != -1) {
auto ret = _allocator->fastrpc_munmap((int) _domain_id, _buffer_fd, nullptr, 0);
if (ret != AEE_SUCCESS) {
LOG_ERROR("[hexagon-npu]failed to munmap rpc memory, fd: %d, ret: %d\n", _buffer_fd, ret);
return;
}
}
_allocator->free(_data);
}
std::shared_ptr<host_tensor> host_buffer::init_tensor(ggml_tensor * tensor, remote_handle64 device_handle) {
if (!_data) {
LOG_ERROR("[hexagon-npu]failed to init tensor, rpc memory not initialized\n");
return std::shared_ptr<host_tensor>();
}
if (_buffer_fd == -1) {
_buffer_fd = _allocator->to_fd(_data);
if (_buffer_fd < 0) {
LOG_ERROR("[hexagon-npu]failed to get fd from rpc memory\n");
return std::shared_ptr<host_tensor>();
}
auto ret = _allocator->fastrpc_mmap((int) _domain_id, _buffer_fd, _data, 0, _size, FASTRPC_MAP_FD);
if (ret != AEE_SUCCESS) {
LOG_ERROR("[hexagon-npu]failed to mmap rpc memory, fd: %d, size: %zu, ret: %d\n", _buffer_fd, _size, ret);
return std::shared_ptr<host_tensor>();
}
LOG_DEBUG("[hexagon-npu]mmap rpc memory(%p), fd: %d, addr: %p, size: %zu\n", (void *) _data, _buffer_fd, _data,
_size);
}
auto tensor_object = std::make_shared<host_tensor>(
tensor, _buffer_fd, (uint64_t) (reinterpret_cast<uint8_t *>(tensor->data) - reinterpret_cast<uint8_t *>(_data)),
device_handle);
if (!tensor_object->is_valid()) {
LOG_ERROR("[hexagon-npu]failed to init tensor, device handle: %p\n", (void *) device_handle);
return std::shared_ptr<host_tensor>();
}
_tensors.push_back(tensor_object);
return tensor_object;
}
void host_buffer::clear_tensors() {
LOG_DEBUG("[hexagon-npu]clear host_buffer(%p) tensors\n", (void *) _data);
host_tensor::destroy_tensors(_tensors);
}
host_buffer_type::host_buffer_type(ggml_backend_dev_t dev, const std::string & name, common::rpc_mem_ptr rpc_mem) :
_name(name),
_rpc_mem(rpc_mem) {
iface = {
/* .get_name = */ backend_buffer_type_get_name,
/* .alloc_buffer = */ backend_buffer_type_alloc_buffer,
/* .get_alignment = */ backend_buffer_type_get_alignment,
/* .get_max_size = */ backend_buffer_type_get_max_size,
/* .get_alloc_size = */ nullptr, // defaults to ggml_nbytes
/* .is_host = */ backend_buffer_is_host,
};
device = dev;
context = this;
_device = reinterpret_cast<npu_device *>(device->context);
LOG_DEBUG("[%s]create host_buffer_type %s\n", _device->get_name(), _name.c_str());
}
size_t host_buffer_type::get_buffer_alignment() const {
return _device->is_device_initialized() ? _device->get_alignment() : 128;
}
size_t host_buffer_type::get_max_buffer_size() const {
if (!_rpc_mem) {
LOG_ERROR("[%s]rpc memory not initialized\n", _device->get_name());
return 0;
}
return _rpc_mem->get_max_alloc_size();
}
ggml_backend_buffer_t host_buffer_type::allocate_buffer(size_t size) {
if (!_rpc_mem) {
LOG_ERROR("[%s]rpc memory not initialized\n", _device->get_name());
return nullptr;
}
if (!_device->is_device_initialized()) {
LOG_ERROR("[%s]device is not initialized\n", _device->get_name());
return nullptr;
}
auto * buffer = new host_buffer(_rpc_mem, size, _device->get_dsp_domain_id());
if (!buffer->is_valid()) {
delete buffer;
LOG_ERROR("[%s]Failed to allocate buffer of size %zu\n", _device->get_name(), size);
return nullptr;
}
LOG_DEBUG("[%s]allocate buffer %p, size: %zu\n", _device->get_name(), buffer->get_buffer(), size);
return ggml_backend_buffer_init(this, backend_buffer_interface, buffer, size);
}
} // namespace hexagon
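// Usage sketch (illustrative only, not part of the original file): from the ggml side the buffer
// type above is consumed through the generic backend API; the 16 MiB size is an arbitrary example
// value chosen for this sketch.
static inline ggml_backend_buffer_t alloc_npu_buffer_sketch(ggml_backend_dev_t dev) {
    ggml_backend_buffer_type_t buft = ggml_backend_dev_buffer_type(dev);
    return buft ? ggml_backend_buft_alloc_buffer(buft, 16u << 20) : nullptr;
}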

View File

@ -0,0 +1,68 @@
#pragma once
#include <list>
#include <memory>
#include "ggml-backend-impl.h"
#include "hexagon_npu.h"
#include "rpc-mem.hpp"
namespace hexagon {
class host_tensor;
class host_buffer {
public:
explicit host_buffer(common::rpc_mem_ptr allocator, size_t size, uint32_t domain_id);
~host_buffer();
bool is_valid() const { return _data != nullptr; }
void * get_buffer() { return _data; }
size_t get_size() const { return _size; }
std::shared_ptr<host_tensor> init_tensor(ggml_tensor * tensor, remote_handle64 device_handle);
void clear_tensors();
private:
common::rpc_mem_ptr _allocator;
void * _data = nullptr;
size_t _size = 0;
int _buffer_fd = -1;
uint32_t _domain_id = 0;
std::list<std::shared_ptr<host_tensor>> _tensors;
DISABLE_COPY(host_buffer);
DISABLE_MOVE(host_buffer);
};
class npu_device;
class host_buffer_type : public ggml_backend_buffer_type {
public:
explicit host_buffer_type(ggml_backend_dev_t dev, const std::string & name, common::rpc_mem_ptr rpc_mem);
const char * get_name() const { return _name.c_str(); }
size_t get_buffer_alignment() const;
size_t get_max_buffer_size() const;
ggml_backend_buffer_t allocate_buffer(size_t size);
npu_device * get_device() const { return _device; }
private:
npu_device * _device = nullptr;
std::string _name;
common::rpc_mem_ptr _rpc_mem;
DISABLE_COPY(host_buffer_type);
DISABLE_MOVE(host_buffer_type);
};
} // namespace hexagon

View File

@ -0,0 +1,106 @@
#include "graph.hpp"
#include "profiler.hpp"
#include "tensor.hpp"
namespace hexagon {
host_graph::host_graph(ggml_cgraph * cgraph, remote_handle64 device_handle) : _device_handle(device_handle) {
auto status = npu_device_graph_init(_device_handle, &_graph_handle);
if (status != AEE_SUCCESS) {
LOG_ERROR("Failed to init graph: %d", (int) status);
_graph_handle = 0;
return;
}
update(cgraph);
}
host_graph::~host_graph() {
if (_graph_handle) {
npu_device_graph_free(_device_handle, _graph_handle);
_graph_handle = 0;
}
}
bool host_graph::update(ggml_cgraph * cgraph) {
if (!_graph_handle) {
LOG_ERROR("host_graph not initialized\n");
return false;
}
PROFILER_LOG_DEBUG("[%p]host_graph::update started\n", (void *) this);
SCOPED_PERFORMANCE_TRACKER("[hexagon-npu][%p]update, handle(%p)", (void *) this, (void *) _graph_handle);
_tensor_handles.clear();
_tensor_update_configs.clear();
_tensor_handles.reserve(cgraph->n_nodes);
_tensor_update_configs.reserve(cgraph->n_nodes);
for (int i = 0; i < cgraph->n_nodes; ++i) {
auto * node = cgraph->nodes[i];
if (node->op == GGML_OP_NONE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE ||
node->op == GGML_OP_RESHAPE) {
// skip view-like ops
LOG_DEBUG("node[%d]%s(%s), addr: %p, type: %s, dims: %ldx%ldx%ldx%ld, skipped\n", i, ggml_get_name(node),
ggml_op_desc(node), (void *) node, ggml_type_name(node->type), (long) node->ne[0],
(long) node->ne[1], (long) node->ne[2], (long) node->ne[3]);
continue;
}
// TODO: move to tensor?
auto * tensor_obj = host_tensor::from_ggml_tensor(node);
if (!tensor_obj) {
LOG_DEBUG("Unable to get host tensor from ggml tensor: %p\n", (void *) node);
continue;
}
_tensor_handles.push_back(tensor_obj->get_device_tensor_handle());
_tensor_update_configs.push_back(tensor_obj->update_hosts_params_only(node));
PROFILER_LOG_DEBUG("node[%d]%s(%s), addr(%p), %ldx%ldx%ldx%ld%s, handle(%p)\n", i, ggml_get_name(node),
ggml_op_desc(node), (void *) tensor_obj, (long) tensor_obj->get_ne(0),
(long) tensor_obj->get_ne(1), (long) tensor_obj->get_ne(2), (long) tensor_obj->get_ne(3),
ggml_type_name(node->type), (void *) tensor_obj->get_device_tensor_handle());
}
GGML_ASSERT(_tensor_handles.size() == _tensor_update_configs.size());
constexpr const npu_device_tensor_handle_t kEmptyTensorHandle = 0;
constexpr const npu_device_tensor_update_config kEmptyUpdateConfig = {};
auto ret = npu_device_graph_set_tensor_with_param(
_device_handle, _graph_handle, _tensor_handles.size() ? _tensor_handles.data() : &kEmptyTensorHandle,
(int) _tensor_handles.size(),
_tensor_update_configs.size() ? _tensor_update_configs.data() : &kEmptyUpdateConfig,
(int) _tensor_update_configs.size());
if (ret != AEE_SUCCESS) {
LOG_ERROR("[%p]failed to set tensors in host_graph: 0x%x\n", (void *) this, (int) ret);
return false;
}
LOG_DEBUG("[%p]host_graph::update, handle(%p), ggml_cgraph(%p), tensor count(%zu)\n", (void *) this,
(void *) _graph_handle, (void *) cgraph, _tensor_handles.size());
return true;
}
bool host_graph::compute() {
if (!_graph_handle) {
LOG_ERROR("host_graph not initialized\n");
return false;
}
LOG_DEBUG("[%p]host_graph::compute started\n", (void *) this);
SCOPED_PERFORMANCE_TRACKER("[hexagon-npu][%p]compute, handle(%p)", (void *) this, (void *) _graph_handle);
auto status = npu_device_graph_compute(_device_handle, _graph_handle);
if (status != AEE_SUCCESS) {
LOG_ERROR("Failed to compute host_graph: 0x%x\n", (int) status);
LOG_DEBUG("[%p]host_graph::compute finished with failure\n", (void *) this);
return false;
}
LOG_DEBUG("[%p]host_graph::compute finished\n", (void *) this);
return true;
}
} // namespace hexagon

View File

@ -0,0 +1,33 @@
#pragma once
#include <vector>
#include "common.hpp"
#include "ggml-backend-impl.h"
#include "hexagon_npu.h"
namespace hexagon {
class host_graph {
public:
host_graph(ggml_cgraph * cgraph, remote_handle64 device_handle);
~host_graph();
bool is_valid() const { return _graph_handle != 0; }
bool update(ggml_cgraph * cgraph);
bool compute();
private:
remote_handle64 _device_handle = 0;
npu_device_graph_handle_t _graph_handle = npu_device_INVALID_DEVICE_GRAPH_HANDLE;
std::vector<npu_device_tensor_handle_t> _tensor_handles;
std::vector<npu_device_tensor_update_config> _tensor_update_configs;
DISABLE_COPY(host_graph);
DISABLE_MOVE(host_graph);
};
} // namespace hexagon

View File

@ -0,0 +1,164 @@
#include "buffer.hpp"
#include "common.hpp"
#include "ggml-backend-impl.h"
#include "ggml-impl.h"
#include "host_device.hpp"
#include "profiler.hpp"
#include <memory>
#include <string>
namespace {
hexagon::npu_device * get_device_object(ggml_backend_dev_t device) {
return reinterpret_cast<hexagon::npu_device *>(device->context);
}
hexagon::npu_device * get_device_object(ggml_backend_t backend) {
return get_device_object(backend->device);
}
const char * backend_dev_get_name(ggml_backend_dev_t dev) {
auto * dev_obj = get_device_object(dev);
GGML_ASSERT(dev_obj != nullptr);
return dev_obj->get_name();
}
const char * backend_dev_get_description(ggml_backend_dev_t dev) {
auto * dev_obj = get_device_object(dev);
GGML_ASSERT(dev_obj != nullptr);
return dev_obj->get_description();
}
bool backend_dev_is_npu_device(ggml_backend_dev_t dev) {
return dev->iface.get_name == backend_dev_get_name;
}
void backend_dev_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
GGML_UNUSED(dev);
*free = common::get_system_free_memory_in_bytes();
*total = common::get_system_total_memory_in_bytes();
}
enum ggml_backend_dev_type backend_dev_get_type(ggml_backend_dev_t dev) {
GGML_UNUSED(dev);
// TODO: figure out why the GGML_BACKEND_DEVICE_TYPE_ACCEL type will miss some ops
return GGML_BACKEND_DEVICE_TYPE_IGPU;
}
void backend_dev_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
GGML_ASSERT(get_device_object(dev) != nullptr);
props->name = backend_dev_get_name(dev);
props->description = backend_dev_get_description(dev);
props->type = backend_dev_get_type(dev);
backend_dev_get_memory(dev, &props->memory_free, &props->memory_total);
props->caps = {};
}
ggml_backend_t backend_dev_init_backend(ggml_backend_dev_t dev, const char * params) {
auto * dev_obj = get_device_object(dev);
GGML_ASSERT(dev_obj != nullptr);
if (!dev_obj->init_device()) {
LOG_ERROR("[%s]Failed to init device\n", backend_dev_get_name(dev));
return nullptr;
}
SCOPED_PERFORMANCE_TRACKER("[hexagon-npu][%p]backend_dev_init_backend", (void *) dev_obj);
return new hexagon::npu_backend(dev);
}
ggml_backend_buffer_type_t backend_dev_get_buffer_type(ggml_backend_dev_t dev) {
auto * dev_obj = get_device_object(dev);
GGML_ASSERT(dev_obj != nullptr);
return dev_obj->get_default_buffer_type(dev);
}
ggml_backend_buffer_t backend_dev_buffer_from_host_ptr(ggml_backend_dev_t dev,
void * ptr,
size_t size,
size_t max_tensor_size) {
// TODO: should we use the device memory here?
GGML_UNUSED(dev);
GGML_UNUSED(max_tensor_size);
return ggml_backend_cpu_buffer_from_ptr(ptr, size);
}
bool backend_dev_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
if (!backend_dev_is_npu_device(dev)) {
return false;
}
auto * dev_obj = get_device_object(dev);
GGML_ASSERT(dev_obj != nullptr);
SCOPED_PERFORMANCE_TRACKER("[hexagon-npu][%p]backend_dev_supports_op", (void *) dev_obj);
return dev_obj->supports_op(op);
}
bool backend_dev_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
if (!backend_dev_is_npu_device(dev)) {
return false;
}
auto * dev_obj = get_device_object(dev);
GGML_ASSERT(dev_obj != nullptr);
return dev_obj->supports_buft(buft);
}
bool backend_dev_offload_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
if (!backend_dev_is_npu_device(dev)) {
return false;
}
auto * dev_obj = get_device_object(dev);
GGML_ASSERT(dev_obj != nullptr);
SCOPED_PERFORMANCE_TRACKER("[hexagon-npu][%p]backend_dev_offload_op", (void *) dev_obj);
return dev_obj->offload_op(op);
}
constexpr const ggml_backend_device_i npu_device_interface = {
/* .get_name = */ backend_dev_get_name,
/* .get_description = */ backend_dev_get_description,
/* .get_memory = */ backend_dev_get_memory,
/* .get_type = */ backend_dev_get_type,
/* .get_props = */ backend_dev_get_props,
/* .init_backend = */ backend_dev_init_backend,
/* .get_buffer_type = */ backend_dev_get_buffer_type,
/* .get_host_buffer_type = */ nullptr,
/* .buffer_from_host_ptr = */ backend_dev_buffer_from_host_ptr,
/* .supports_op = */ backend_dev_supports_op,
/* .supports_buft = */ backend_dev_supports_buft,
/* .offload_op = */ backend_dev_offload_op,
/* .event_new = */ nullptr,
/* .event_free = */ nullptr,
/* .event_synchronize = */ nullptr,
};
class npu_device_proxy : public backend_device_proxy {
public:
explicit npu_device_proxy(backend_index_type device) { _device = std::make_unique<hexagon::npu_device>(device); }
const ggml_backend_device_i & get_iface() const { return npu_device_interface; }
void * get_context() { return _device.get(); }
private:
std::unique_ptr<hexagon::npu_device> _device;
DISABLE_COPY(npu_device_proxy);
DISABLE_MOVE(npu_device_proxy);
};
} // namespace
backend_device_proxy_ptr create_hexagon_backend_context(backend_index_type device) {
if (device < QNN_BACKEND_COUNT || device >= TOTAL_BACKEND_COUNT) {
return backend_device_proxy_ptr();
}
return std::make_shared<npu_device_proxy>(device);
}

View File

@ -0,0 +1,364 @@
#include "host_device.hpp"
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wmissing-prototypes"
#include <domain_default.h>
#pragma GCC diagnostic pop
#include "graph.hpp"
#include "util.hpp"
#include <remote.h>
#include <type_traits>
#define SKEL_URI_DEFINE(arch) ("file:///libhexagon_npu_skel_" arch ".so?npu_device_skel_handle_invoke&_modver=1.0")
namespace {
struct device_library_info {
hexagon::hexagon_dsp_arch arch;
const char * device_lib_uri;
};
constexpr const device_library_info kDeviceLibraryInfo[] = {
{ hexagon::NONE, SKEL_URI_DEFINE("") },
{ hexagon::V68, SKEL_URI_DEFINE("v68") },
{ hexagon::V69, SKEL_URI_DEFINE("v69") },
{ hexagon::V73, SKEL_URI_DEFINE("v73") },
{ hexagon::V75, SKEL_URI_DEFINE("v75") },
{ hexagon::V79, SKEL_URI_DEFINE("v79") },
};
const device_library_info & get_device_library_info(hexagon::hexagon_dsp_arch arch) {
for (const auto & info : kDeviceLibraryInfo) {
if (info.arch == arch) {
return info;
}
}
LOG_ERROR("Unknown DSP arch: %d, using hexagon::NONE\n", arch);
return kDeviceLibraryInfo[0];
}
const char * get_domain_param(uint32_t domain_id) {
for (const auto & domain : supported_domains) {
if ((uint32_t) domain.id == domain_id) {
return domain.uri;
}
}
return "";
}
constexpr const ggml_guid kBackendNpuGuid = { 0x7a, 0xd7, 0x59, 0x7d, 0x8f, 0x66, 0x4f, 0x35,
0x84, 0x8e, 0xf5, 0x9a, 0x9b, 0x83, 0x7d, 0x0a };
hexagon::npu_backend * get_backend_object(ggml_backend_t backend) {
return reinterpret_cast<hexagon::npu_backend *>(backend);
}
const char * backend_get_name(ggml_backend_t backend) {
auto * backend_obj = get_backend_object(backend);
GGML_ASSERT(backend_obj != nullptr);
return backend_obj->get_name();
}
void backend_free(ggml_backend_t backend) {
delete get_backend_object(backend);
}
bool backend_cpy_tensor_async(ggml_backend_t backend_src,
ggml_backend_t backend_dst,
const ggml_tensor * src,
ggml_tensor * dst) {
// TODO: implement this
return false;
}
ggml_status backend_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
auto * backend_obj = get_backend_object(backend);
GGML_ASSERT(backend_obj != nullptr);
return backend_obj->graph_compute(cgraph);
}
} // namespace
namespace hexagon {
// TODO: should we use another domain?
npu_device::npu_device(backend_index_type device) : _dsp_domain_id(CDSP_DOMAIN_ID) {
GGML_UNUSED(device);
LOG_DEBUG("[%s]NPU device created\n", _name.c_str());
}
npu_device::~npu_device() {
if (_device_handle) {
npu_device_close(_device_handle);
}
}
size_t npu_device::get_alignment() const {
uint32_t alignment = 0;
npu_device_device_get_alignment(_device_handle, &alignment);
return alignment;
}
bool npu_device::is_device_initialized() const {
if (!_device_handle) {
LOG_ERROR("[%s]NPU device not opened\n", get_name());
return false;
}
if (!_rpc_mem) {
LOG_ERROR("[%s]rpc memory not initialized\n", get_name());
return false;
}
return true;
}
bool npu_device::init_device() {
if (!init_rpc_mem()) {
return false;
}
if (!init_device_lib()) {
return false;
}
return true;
}
bool npu_device::supports_buft(ggml_backend_buffer_type_t buft) const {
return buft && buft->device && buft->device->context == this;
}
bool npu_device::supports_op_impl(const ggml_tensor * op) {
static_assert(std::is_same<npu_device_fp16_t, ggml_fp16_t>::value,
"npu_device_fp16_t should be same as ggml_fp16_t");
if (op->op == GGML_OP_NONE) {
return true;
}
if (op->op == GGML_OP_VIEW || op->op == GGML_OP_RESHAPE || op->op == GGML_OP_PERMUTE) {
LOG_DEBUG("[%s]view/reshape/permute op is always supported\n", get_name());
return true;
}
if (type_to_npu_type(op->type) == NPU_DATA_TYPE_COUNT) {
LOG_DEBUG("[%s][%s]Unsupported op tensor type: %s\n", get_name(), ggml_get_name(op), ggml_type_name(op->type));
return false;
}
auto npu_op = op_to_npu_op(op->op);
if (npu_op == NPU_OP_COUNT) {
LOG_DEBUG("[%s][%s]Unsupported op: %s\n", get_name(), ggml_get_name(op), ggml_op_desc(op));
return false;
}
int i = 0;
npu_device_tensor_spec srcs[DEVICE_TENSOR_MAX_SRC] = {};
constexpr const auto get_spec = [](const ggml_tensor * tensor) -> npu_device_tensor_spec {
if (!tensor) {
return npu_device_tensor_spec{ {}, {}, NPU_DATA_TYPE_COUNT };
}
static_assert(DEVICE_TENSOR_MAX_DIMS == GGML_MAX_DIMS, "tensor dimensions mismatch");
npu_device_tensor_spec spec{};
spec.ne[0] = tensor->ne[0];
spec.ne[1] = tensor->ne[1];
spec.ne[2] = tensor->ne[2];
spec.ne[3] = tensor->ne[3];
spec.nb[0] = tensor->nb[0];
spec.nb[1] = tensor->nb[1];
spec.nb[2] = tensor->nb[2];
spec.nb[3] = tensor->nb[3];
spec.type = type_to_npu_type(tensor->type);
return spec;
};
for (; i < (int) DEVICE_TENSOR_MAX_SRC && op->src[i]; ++i) {
auto * src = op->src[i];
if (type_to_npu_type(src->type) == NPU_DATA_TYPE_COUNT) {
LOG_DEBUG("[%s]Unsupported src%d tensor type: %s\n", get_name(), i, ggml_type_name(src->type));
return false;
}
srcs[i] = get_spec(src);
}
if (!_device_handle && !init_device()) {
LOG_DEBUG("[%s]NPU device initialization failed\n", get_name());
return false;
}
boolean supported = false;
auto dst_spec = get_spec(op);
npu_device_tensor_op_spec npu_op_spec = { npu_op, {} };
static_assert(sizeof(npu_op_spec.params) <= sizeof(op->op_params),
"npu_op_spec.params size should less than op->op_params size");
memcpy(npu_op_spec.params, op->op_params, sizeof(npu_op_spec.params));
auto ret = npu_device_device_support_op(_device_handle, &npu_op_spec, &dst_spec, srcs, i, &supported);
if (ret != AEE_SUCCESS || !supported) {
#ifndef NDEBUG
auto * src0_type = i ? ggml_type_name(op->src[0]->type) : "null";
auto * src1_type = (i > 1) ? ggml_type_name(op->src[1]->type) : "null";
LOG_DEBUG("[%s][%s]unsupported %s(%s,%s), ret: 0x%x, supported: %d\n", get_name(), ggml_op_desc(op),
ggml_type_name(op->type), src0_type, src1_type, ret, supported);
#endif
return false;
}
return true;
}
bool npu_device::init_rpc_mem() {
if (!_rpc_mem) {
auto rpc_interface = std::make_shared<common::rpc_interface>();
if (!rpc_interface->is_valid()) {
LOG_ERROR("[%s]Failed to load rpc memory library\n", get_name());
return false;
}
auto rpc_mem = std::make_shared<common::rpc_mem>(rpc_interface);
_rpc_interface = rpc_interface;
_rpc_mem = rpc_mem;
LOG_DEBUG("[%s]rpc memory initialized\n", get_name());
} else {
LOG_DEBUG("[%s]rpc memory already initialized\n", get_name());
}
return true;
}
bool npu_device::init_device_lib() {
if (!_device_handle) {
set_fast_rpc_stack_size(_rpc_interface, _dsp_domain_id, NPU_THREAD_STACK_SIZE);
auto arch = get_dsp_arch(_rpc_interface, _dsp_domain_id);
const auto & device_lib_info = get_device_library_info(arch);
std::string device_lib_uri = device_lib_info.device_lib_uri;
device_lib_uri += get_domain_param(_dsp_domain_id);
LOG_DEBUG("[%s]NPU device arch: %s, uri: %s\n", get_name(), get_dsp_arch_desc(arch), device_lib_uri.c_str());
auto err = npu_device_open(device_lib_uri.c_str(), &_device_handle);
if (err != AEE_SUCCESS) {
if (err == AEE_ECONNREFUSED) {
LOG_DEBUG("[%s]NPU device is not available, trying to enable unsigned DSP module and reopen\n",
get_name());
enable_unsigned_dsp_module(_rpc_interface, _dsp_domain_id);
err = npu_device_open(device_lib_uri.c_str(), &_device_handle);
}
if (err != AEE_SUCCESS) {
LOG_ERROR("[%s]Unable to open NPU device, err: 0x%x, uri %s\n", get_name(), err,
device_lib_uri.c_str());
_device_handle = 0;
return false;
}
}
_description += ' ';
_description += get_dsp_arch_desc(arch);
LOG_DEBUG("[%s]NPU device opened successfully\n", get_name());
} else {
LOG_DEBUG("[%s]NPU device is already opened\n", get_name());
}
return true;
}
bool npu_device::offload_op(const ggml_tensor * op) {
// TODO: implement this
return false;
}
#ifndef NDEBUG
bool npu_device::supports_op(const ggml_tensor * op) {
char op_desc[1024];
get_op_tensor_desc(op, op_desc, sizeof(op_desc));
if (supports_op_impl(op)) {
if (op->op != GGML_OP_NONE && op->op != GGML_OP_VIEW && op->op != GGML_OP_RESHAPE &&
op->op != GGML_OP_PERMUTE) {
_supported_op++;
LOG_DEBUG("[%s][%s][%s]supported, %s, supported/unsupported: %u/%u\n", get_name(), ggml_op_desc(op),
ggml_get_name(op), op_desc, _supported_op.load(), _unsupported_op.load());
}
return true;
}
_unsupported_op++;
LOG_DEBUG("[%s][%s][%s]unsupported, %s, supported/unsupported: %u/%u\n", get_name(), ggml_op_desc(op),
ggml_get_name(op), op_desc, _supported_op.load(), _unsupported_op.load());
return false;
}
#else
bool npu_device::supports_op(const ggml_tensor * op) {
return supports_op_impl(op);
}
#endif
ggml_backend_buffer_type_t npu_device::get_default_buffer_type(ggml_backend_dev_t dev) {
// Note that this function will be called before the npu_device::init_device
if (!init_rpc_mem()) {
return nullptr;
}
if (!_default_buffer_type) {
LOG_DEBUG("[%s]Creating default buffer type\n", get_name());
_default_buffer_type = std::make_unique<hexagon::host_buffer_type>(dev, _name + "_buffer_type", _rpc_mem);
if (!_default_buffer_type) {
LOG_ERROR("[%s]Default buffer type not initialized\n", get_name());
return nullptr;
}
} else {
LOG_DEBUG("[%s]Default buffer type already created\n", get_name());
}
return _default_buffer_type.get();
}
npu_backend::npu_backend(ggml_backend_dev_t dev) : ggml_backend{} {
memcpy(&_guid, &kBackendNpuGuid, sizeof(ggml_guid));
device = dev;
guid = &_guid;
iface.get_name = backend_get_name;
iface.free = backend_free;
iface.cpy_tensor_async = backend_cpy_tensor_async;
iface.graph_compute = backend_graph_compute;
_device = reinterpret_cast<npu_device *>(dev->context);
}
ggml_status npu_backend::graph_compute(ggml_cgraph * cgraph) {
if (!cgraph || !cgraph->n_nodes) {
LOG_DEBUG("[%s]Graph is empty, nothing to compute\n", get_name());
return GGML_STATUS_SUCCESS;
}
std::shared_ptr<host_graph> graph;
if (_graph_cache.count(cgraph) == 0) {
LOG_DEBUG("[%s]graph(%p) not found in cache, creating new graph\n", get_name(), (void *) cgraph);
graph = std::make_shared<host_graph>(cgraph, _device->get_device_handle());
if (!graph->is_valid()) {
LOG_ERROR("Failed to create graph\n");
return GGML_STATUS_FAILED;
}
_graph_cache[cgraph] = graph;
} else {
graph = _graph_cache[cgraph];
LOG_DEBUG("[%s]graph(%p) found in cache, using existing graph\n", get_name(), (void *) cgraph);
if (!graph->update(cgraph)) {
LOG_ERROR("[%s]Failed to update graph(%p)\n", get_name(), (void *) cgraph);
return GGML_STATUS_FAILED;
}
}
return graph->compute() ? GGML_STATUS_SUCCESS : GGML_STATUS_FAILED;
}
} // namespace hexagon

View File

@ -0,0 +1,88 @@
#pragma once
#include <memory>
#include <unordered_map>
#ifndef NDEBUG
# include <atomic>
#endif
#include "buffer.hpp"
#include "common.hpp"
#include "ggml-backend-impl.h"
#include "hexagon_npu.h"
#include "rpc-mem.hpp"
namespace hexagon {
class npu_device {
public:
explicit npu_device(backend_index_type device);
~npu_device();
const char * get_name() const { return _name.c_str(); }
const char * get_description() const { return _description.c_str(); }
size_t get_alignment() const;
uint32_t get_dsp_domain_id() const { return _dsp_domain_id; }
ggml_backend_buffer_type_t get_default_buffer_type(ggml_backend_dev_t dev);
bool is_device_initialized() const;
bool init_device();
bool supports_buft(ggml_backend_buffer_type_t buft) const;
bool offload_op(const ggml_tensor * op);
bool supports_op(const ggml_tensor * op);
remote_handle64 get_device_handle() const { return _device_handle; }
private:
bool supports_op_impl(const ggml_tensor * op);
bool init_rpc_mem();
bool init_device_lib();
std::string _name = "hexagon-npu";
std::string _description = "Hexagon NPU";
common::rpc_interface_ptr _rpc_interface;
common::rpc_mem_ptr _rpc_mem;
remote_handle64 _device_handle = 0;
std::unique_ptr<host_buffer_type> _default_buffer_type;
uint32_t _dsp_domain_id = 0;
#ifndef NDEBUG
std::atomic_uint32_t _supported_op = 0;
std::atomic_uint32_t _unsupported_op = 0;
#endif
DISABLE_COPY(npu_device);
DISABLE_MOVE(npu_device);
};
class host_graph;
class npu_backend : public ggml_backend {
public:
explicit npu_backend(ggml_backend_dev_t dev);
~npu_backend() {}
const char * get_name() const {
// TODO: should we use the device name here?
return _device->get_name();
}
ggml_status graph_compute(ggml_cgraph * cgraph);
private:
ggml_guid _guid = {};
npu_device * _device = nullptr;
std::unordered_map<ggml_cgraph *, std::shared_ptr<host_graph>> _graph_cache;
DISABLE_COPY(npu_backend);
DISABLE_MOVE(npu_backend);
};
} // namespace hexagon

View File

@ -0,0 +1,241 @@
#pragma once
#include "common.hpp"
#include "ggml-impl.h"
#include "hexagon_npu.h"
#include "util.hpp"
#include <list>
#include <type_traits>
#include <vector>
namespace hexagon {
// TODO: merge this with device tensor?
class host_tensor {
public:
static host_tensor * from_ggml_tensor(ggml_tensor * tensor) {
if (!tensor || !tensor->extra) {
return nullptr;
}
return static_cast<host_tensor *>(tensor->extra);
}
explicit host_tensor(ggml_tensor * tensor, int buffer_fd, uint64_t offset, remote_handle64 device_handle) :
_device_handle(device_handle) {
// TODO: figure out why the npu_device_tensor_config can't be larger than 100 bytes
static_assert(sizeof(npu_device_tensor_config) < kMaxNpuRpcStructSize,
"npu_device_tensor_config size too large");
_info.buffer_fd = buffer_fd;
_info.offset = offset;
_info.type = type_to_npu_type(tensor->type);
_info.size = ggml_nbytes(tensor);
_info.is_constant = false; // TODO: support constant tensors in the future
// _info.op will be updated in update_params()
_info_update.op = NPU_OP_COUNT;
static_assert(DEVICE_TENSOR_MAX_DIMS == GGML_MAX_DIMS, "tensor dimensions mismatch");
static_assert(sizeof(_info.ne) == sizeof(tensor->ne), "tensor ne size mismatch");
static_assert(sizeof(_info.nb) == sizeof(tensor->nb), "tensor nb size mismatch");
memcpy(_info.ne, tensor->ne, sizeof(_info.ne));
memcpy(_info.nb, tensor->nb, sizeof(_info.nb));
auto status = npu_device_tensor_init(_device_handle, &_info, &_device_tensor_handle);
if (status != AEE_SUCCESS) {
LOG_ERROR("Failed to init tensor: %d", (int) status);
_device_tensor_handle = npu_device_INVALID_DEVICE_TENSOR_HANDLE;
return;
}
tensor->extra = this;
_ggml_tensor = tensor;
#ifndef NDEBUG
{
char desc[1024];
get_desc(desc, sizeof(desc));
LOG_DEBUG("host_tensor(%s)\n", desc);
}
#endif
}
~host_tensor() {
LOG_DEBUG("host_tensor(%p) destroy, device_tensor_handle: %p\n", (void *) this, (void *) _device_tensor_handle);
if (_device_tensor_handle != npu_device_INVALID_DEVICE_TENSOR_HANDLE) {
npu_device_tensor_free(_device_handle, _device_tensor_handle);
// TODO: figure out why the _ggml_tensor is invalid here
}
}
static void destroy_tensors(std::list<std::shared_ptr<host_tensor>> & tensors) {
std::vector<npu_device_tensor_handle_t> handles;
handles.reserve(tensors.size());
remote_handle64 device_handle = 0;
for (auto tensor : tensors) {
if (tensor && tensor->_device_tensor_handle != npu_device_INVALID_DEVICE_TENSOR_HANDLE) {
handles.push_back(tensor->_device_tensor_handle);
tensor->_device_tensor_handle = npu_device_INVALID_DEVICE_TENSOR_HANDLE; // prevent double free
device_handle = tensor->_device_handle;
}
}
if (!handles.empty()) {
npu_device_tensors_free(device_handle, handles.data(), handles.size());
}
tensors.clear();
}
npu_device_tensor_handle_t get_device_tensor_handle() const { return _device_tensor_handle; }
void update_params(ggml_tensor * ggml_tensor) {
static_assert(sizeof(_info_update.params) <= sizeof(_ggml_tensor->op_params),
"device tensor params size mismatch");
static_assert(DEVICE_TENSOR_MAX_SRC <= GGML_MAX_SRC, "device tensor src size mismatch");
GGML_ASSERT(ggml_tensor == _ggml_tensor);
if (!_ggml_tensor) {
LOG_DEBUG("host_tensor(%p) _ggml_tensor is null\n", (void *) this);
return;
}
auto new_op = op_to_npu_op(_ggml_tensor->op);
bool params_changed = new_op != _info_update.op;
if (params_changed) {
LOG_DEBUG("host_tensor(%p) op changed: %s\n", (void *) this, get_npu_op_desc(new_op));
}
_info_update.op = new_op;
if (memcmp(_info_update.params, _ggml_tensor->op_params, sizeof(_info_update.params)) != 0) {
params_changed = true;
memcpy(_info_update.params, _ggml_tensor->op_params, sizeof(_info_update.params));
LOG_DEBUG("host_tensor(%p) op_params changed: [%x, %x, %x, %x]\n",
(void *) this,
(int) _info_update.params[0],
(int) _info_update.params[1],
(int) _info_update.params[2],
(int) _info_update.params[3]);
}
npu_device_tensor_handle_t src_tensor_handles[DEVICE_TENSOR_MAX_SRC] = {};
static_assert(std::is_same<decltype(_info_update.src_handles), decltype(src_tensor_handles)>::value,
"src tensor handles type mismatch");
for (size_t j = 0; j < DEVICE_TENSOR_MAX_SRC && _ggml_tensor->src[j]; ++j) {
auto * ggml_src = _ggml_tensor->src[j];
auto * src = host_tensor::from_ggml_tensor(ggml_src);
src_tensor_handles[j] = src->get_device_tensor_handle();
#ifndef NDEBUG
char desc[1024];
src->get_desc(desc, sizeof(desc));
LOG_DEBUG("host_tensor(%p) set_src[%zu]: (%s)\n", (void *) this, j, desc);
#endif
}
if (memcmp(_info_update.src_handles, src_tensor_handles, sizeof(_info_update.src_handles)) != 0) {
params_changed = true;
memcpy(_info_update.src_handles, src_tensor_handles, sizeof(_info_update.src_handles));
LOG_DEBUG("host_tensor(%p) src changed, handles: [%p, %p]\n",
(void *) this,
(void *) _info_update.src_handles[0],
(void *) _info_update.src_handles[1]);
}
if (params_changed) {
npu_device_tensor_update_params(_device_handle, _device_tensor_handle, &_info_update);
LOG_DEBUG("host_tensor(%p) update_params, op: %s, params: [%x, %x, %x, %x]\n",
(void *) this,
ggml_op_desc(_ggml_tensor),
(int) _info_update.params[0],
(int) _info_update.params[1],
(int) _info_update.params[2],
(int) _info_update.params[3]);
} else {
LOG_DEBUG("host_tensor(%p) update_params, no changes, op: %s, params: [%x, %x, %x, %x]\n",
(void *) this,
ggml_op_desc(_ggml_tensor),
(int) _info_update.params[0],
(int) _info_update.params[1],
(int) _info_update.params[2],
(int) _info_update.params[3]);
}
}
const npu_device_tensor_update_config & update_hosts_params_only(ggml_tensor * ggml_tensor) {
static_assert(sizeof(_info_update.params) <= sizeof(ggml_tensor->op_params),
"device tensor params size mismatch");
static_assert(DEVICE_TENSOR_MAX_SRC <= GGML_MAX_SRC, "device tensor src size mismatch");
GGML_ASSERT(ggml_tensor == _ggml_tensor);
auto new_op = op_to_npu_op(_ggml_tensor->op);
_info_update.op = new_op;
memcpy(_info_update.params, _ggml_tensor->op_params, sizeof(_info_update.params));
for (size_t j = 0; j < DEVICE_TENSOR_MAX_SRC && _ggml_tensor->src[j]; ++j) {
auto * ggml_src = _ggml_tensor->src[j];
auto * src = host_tensor::from_ggml_tensor(ggml_src);
_info_update.src_handles[j] = src->get_device_tensor_handle();
#ifndef NDEBUG
char desc[1024];
src->get_desc(desc, sizeof(desc));
LOG_DEBUG("host_tensor(%p) set_src[%zu]: (%s)\n", (void *) this, j, desc);
#endif
}
LOG_DEBUG("host_tensor(%p) update_params, op: %s, params: [%x, %x, %x, %x]\n",
(void *) this,
ggml_op_desc(_ggml_tensor),
(int) _info_update.params[0],
(int) _info_update.params[1],
(int) _info_update.params[2],
(int) _info_update.params[3]);
return _info_update;
}
bool is_valid() const { return _device_tensor_handle != npu_device_INVALID_DEVICE_TENSOR_HANDLE; }
int64_t get_ne(size_t index) const {
if (index >= DEVICE_TENSOR_MAX_DIMS) {
LOG_ERROR("host_tensor(%p) get_ne: index out of bounds: %zu\n", (void *) this, index);
return 0;
}
return _info.ne[index];
}
int get_desc(char * buffer, size_t size) const {
return snprintf(buffer,
size,
"%s[%ldx%ldx%ldx%ld], nb[%ld,%ld,%ld,%ld], %s, addr: %p, ggml: %p, handle:%p",
_ggml_tensor->name,
(long) _ggml_tensor->ne[0],
(long) _ggml_tensor->ne[1],
(long) _ggml_tensor->ne[2],
(long) _ggml_tensor->ne[3],
(long) _ggml_tensor->nb[0],
(long) _ggml_tensor->nb[1],
(long) _ggml_tensor->nb[2],
(long) _ggml_tensor->nb[3],
ggml_type_name(_ggml_tensor->type),
(void *) this,
(void *) _ggml_tensor,
(void *) _device_tensor_handle);
}
private:
remote_handle64 _device_handle = 0;
npu_device_tensor_handle_t _device_tensor_handle = 0;
npu_device_tensor_config _info = {};
npu_device_tensor_update_config _info_update = {};
ggml_tensor * _ggml_tensor = nullptr;
DISABLE_COPY(host_tensor);
DISABLE_MOVE(host_tensor);
};
} // namespace hexagon
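The calling backend code is not part of this excerpt; purely as an illustration, a minimal usage sketch of the class above (hypothetical call sites, assuming <list>/<memory> plus a from_ggml_tensor() accessor defined earlier in this file) could look like this:

// Hedged sketch, not part of the patch: how a buffer/backend object might drive host_tensor.
// Everything except hexagon::host_tensor and the ggml types is an assumption.
static std::list<std::shared_ptr<hexagon::host_tensor>> s_tensors;  // owned by the buffer object in practice

static void on_buffer_init_tensor(ggml_tensor * t, int buffer_fd, uint64_t offset, remote_handle64 dev) {
    // the constructor stores itself in t->extra and creates the device-side handle
    s_tensors.push_back(std::make_shared<hexagon::host_tensor>(t, buffer_fd, offset, dev));
}

static void before_graph_compute(ggml_cgraph * cgraph) {
    for (int i = 0; i < cgraph->n_nodes; ++i) {
        auto * t = hexagon::host_tensor::from_ggml_tensor(cgraph->nodes[i]);
        if (t != nullptr && t->is_valid()) {
            t->update_params(cgraph->nodes[i]);  // only issues an RPC when op/params/src handles changed
        }
    }
}

static void on_buffer_free() {
    hexagon::host_tensor::destroy_tensors(s_tensors);  // batches all frees into one npu_device_tensors_free call
}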

View File

@ -0,0 +1,273 @@
#include "util.hpp"
#include <remote.h>
#define GGML_COMMON_DECL_CPP
#include "ggml-common.h"
#undef GGML_COMMON_DECL_CPP
static_assert(sizeof(npu_device_block_q4_k) == sizeof(block_q4_K), "npu_device_block_q4_k size mismatch");
static_assert(sizeof(npu_device_block_q4_0) == sizeof(block_q4_0), "npu_device_block_q4_0 size mismatch");
static_assert(sizeof(npu_device_block_q8_0) == sizeof(block_q8_0), "npu_device_block_q8_0 size mismatch");
static_assert(QUANT_K_SCALE_SIZE == K_SCALE_SIZE, "QUANT_K_SCALE_SIZE size mismatch");
static_assert(QUANT_K_BLOCK_SIZE == QK_K, "QUANT_K_BLOCK_SIZE size mismatch");
static_assert(QUANT_BLOCK_SIZE == QK4_0, "QUANT_BLOCK_SIZE size mismatch");
static_assert(NPU_ROPE_TYPE_NEOX == GGML_ROPE_TYPE_NEOX, "NPU_ROPE_TYPE_NEOX mismatch");
static_assert(NPU_ROPE_TYPE_MROPE == GGML_ROPE_TYPE_MROPE, "NPU_ROPE_TYPE_MROPE mismatch");
static_assert(NPU_ROPE_TYPE_VISION == GGML_ROPE_TYPE_VISION, "NPU_ROPE_TYPE_VISION mismatch");
namespace hexagon {
enum npu_device_tensor_op op_to_npu_op(ggml_op op) {
switch (op) {
case GGML_OP_MUL_MAT:
return NPU_OP_MUL_MAT;
case GGML_OP_ADD:
return NPU_OP_ADD;
case GGML_OP_SUB:
return NPU_OP_SUB;
case GGML_OP_MUL:
return NPU_OP_MUL;
case GGML_OP_RMS_NORM:
return NPU_OP_RMS_NORM;
case GGML_OP_FLASH_ATTN_EXT:
return NPU_OP_FLASH_ATTN;
case GGML_OP_ROPE:
return NPU_OP_ROPE;
case GGML_OP_GLU:
return NPU_OP_GLU;
case GGML_OP_GET_ROWS:
return NPU_OP_GET_ROWS;
case GGML_OP_SET_ROWS:
return NPU_OP_SET_ROWS;
case GGML_OP_CPY:
return NPU_OP_CPY;
default:
return NPU_OP_COUNT;
}
}
const char * get_npu_op_desc(enum npu_device_tensor_op op) {
switch (op) {
case NPU_OP_MUL_MAT:
return ggml_op_name(GGML_OP_MUL_MAT);
case NPU_OP_ADD:
return ggml_op_name(GGML_OP_ADD);
case NPU_OP_SUB:
return ggml_op_name(GGML_OP_SUB);
case NPU_OP_MUL:
return ggml_op_name(GGML_OP_MUL);
case NPU_OP_RMS_NORM:
return ggml_op_name(GGML_OP_RMS_NORM);
case NPU_OP_FLASH_ATTN:
return ggml_op_name(GGML_OP_FLASH_ATTN_EXT);
case NPU_OP_ROPE:
return ggml_op_name(GGML_OP_ROPE);
case NPU_OP_GLU:
return ggml_op_name(GGML_OP_GLU);
case NPU_OP_GET_ROWS:
return ggml_op_name(GGML_OP_GET_ROWS);
case NPU_OP_SET_ROWS:
return ggml_op_name(GGML_OP_SET_ROWS);
case NPU_OP_CPY:
return ggml_op_name(GGML_OP_CPY);
default:
return "UNKNOWN";
}
}
enum npu_device_tensor_data_type type_to_npu_type(ggml_type type) {
switch (type) {
case GGML_TYPE_F32:
return NPU_DATA_TYPE_F32;
case GGML_TYPE_F16:
return NPU_DATA_TYPE_F16;
case GGML_TYPE_I32:
return NPU_DATA_TYPE_I32;
case GGML_TYPE_I64:
return NPU_DATA_TYPE_I64;
case GGML_TYPE_Q4_K:
return NPU_DATA_TYPE_Q4_K;
case GGML_TYPE_Q4_0:
return NPU_DATA_TYPE_Q4_0;
case GGML_TYPE_Q8_0:
return NPU_DATA_TYPE_Q8_0;
default:
return NPU_DATA_TYPE_COUNT;
}
}
hexagon_dsp_arch get_dsp_arch(common::rpc_interface_ptr rpc_interface, uint32_t domain_id) {
if (!rpc_interface || !rpc_interface->is_valid()) {
return NONE;
}
remote_dsp_capability dsp_caps = {};
dsp_caps.domain = domain_id;
dsp_caps.attribute_ID = ARCH_VER;
auto ret = rpc_interface->remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_caps, sizeof(dsp_caps));
if (ret != AEE_SUCCESS) {
LOG_ERROR("failed to get DSP arch: %d\n", ret);
return NONE;
}
LOG_DEBUG("get DSP arch: 0x%x\n", (int) dsp_caps.capability);
auto arch = dsp_caps.capability & 0xFF;
switch (arch) {
case 0x68:
return V68;
case 0x69:
return V69;
case 0x73:
return V73;
case 0x75:
return V75;
case 0x79:
return V79;
default:
LOG_ERROR("unknown DSP arch: %x\n", arch);
return NONE;
}
}
const char * get_dsp_arch_desc(hexagon_dsp_arch arch) {
switch (arch) {
case V68:
return "V68";
case V69:
return "V69";
case V73:
return "V73";
case V75:
return "V75";
case V79:
return "V79";
case NONE:
default:
return "UnknownArch";
}
}
void enable_unsigned_dsp_module(common::rpc_interface_ptr rpc_interface, uint32_t domain_id) {
if (!rpc_interface || !rpc_interface->is_valid()) {
return;
}
remote_rpc_control_unsigned_module data = {};
data.domain = domain_id;
data.enable = 1;
auto ret = rpc_interface->remote_session_control(DSPRPC_CONTROL_UNSIGNED_MODULE, &data, sizeof(data));
if (ret != AEE_SUCCESS) {
LOG_ERROR("failed to enable unsigned DSP module: 0x%x\n", ret);
}
}
void set_fast_rpc_stack_size(common::rpc_interface_ptr rpc_interface, uint32_t domain_id, uint32_t stack_size) {
constexpr const uint32_t FASTRPC_THREAD_PARAMS = 1;
if (!rpc_interface || !rpc_interface->is_valid()) {
return;
}
remote_rpc_thread_params tp = {};
tp.domain = domain_id;
tp.prio = -1;
tp.stack_size = stack_size;
auto ret = rpc_interface->remote_session_control(FASTRPC_THREAD_PARAMS, &tp, sizeof(tp));
if (ret != AEE_SUCCESS) {
LOG_ERROR("failed to set fast RPC stack size: 0x%x\n", ret);
}
}
void get_op_tensor_desc(const ggml_tensor * dst, char * out, size_t max_len) {
if (dst == nullptr) {
snprintf(out, max_len, "null");
return;
}
constexpr const auto print_tensor = [](const ggml_tensor * tensor, char * out, size_t max_len) {
auto dims = ggml_n_dims(tensor);
switch (dims) {
default:
case 4:
snprintf(out, max_len, "%s[%ldx%ldx%ldx%ld]", ggml_type_name(tensor->type), (long) tensor->ne[0],
(long) tensor->ne[1], (long) tensor->ne[2], (long) tensor->ne[3]);
break;
case 3:
snprintf(out, max_len, "%s[%ldx%ldx%ld]", ggml_type_name(tensor->type), (long) tensor->ne[0],
(long) tensor->ne[1], (long) tensor->ne[2]);
break;
case 2:
snprintf(out, max_len, "%s[%ldx%ld]", ggml_type_name(tensor->type), (long) tensor->ne[0],
(long) tensor->ne[1]);
break;
case 1:
snprintf(out, max_len, "%s[%ld]", ggml_type_name(tensor->type), (long) tensor->ne[0]);
break;
}
};
constexpr const auto get_src_tensor_count = [](const ggml_tensor * tensor) -> size_t {
for (size_t i = 0; i < GGML_MAX_SRC; ++i) {
if (!tensor->src[i]) {
return i;
}
}
return GGML_MAX_SRC;
};
char dst_desc[256];
print_tensor(dst, dst_desc, sizeof(dst_desc));
switch (get_src_tensor_count(dst)) {
case 4:
{
char src0_desc[256];
print_tensor(dst->src[0], src0_desc, sizeof(src0_desc));
char src1_desc[256];
print_tensor(dst->src[1], src1_desc, sizeof(src1_desc));
char src2_desc[256];
print_tensor(dst->src[2], src2_desc, sizeof(src2_desc));
char src3_desc[256];
print_tensor(dst->src[3], src3_desc, sizeof(src3_desc));
snprintf(out, max_len, "dst: %s, src0: %s, src1: %s, src2: %s, src3: %s", dst_desc, src0_desc,
src1_desc, src2_desc, src3_desc);
return;
}
case 3:
{
char src0_desc[256];
print_tensor(dst->src[0], src0_desc, sizeof(src0_desc));
char src1_desc[256];
print_tensor(dst->src[1], src1_desc, sizeof(src1_desc));
char src2_desc[256];
print_tensor(dst->src[2], src2_desc, sizeof(src2_desc));
snprintf(out, max_len, "dst: %s, src0: %s, src1: %s, src2: %s", dst_desc, src0_desc, src1_desc,
src2_desc);
return;
}
case 2:
{
char src0_desc[256];
print_tensor(dst->src[0], src0_desc, sizeof(src0_desc));
char src1_desc[256];
print_tensor(dst->src[1], src1_desc, sizeof(src1_desc));
snprintf(out, max_len, "dst: %s, src0: %s, src1: %s", dst_desc, src0_desc, src1_desc);
return;
}
case 1:
{
char src0_desc[256];
print_tensor(dst->src[0], src0_desc, sizeof(src0_desc));
snprintf(out, max_len, "dst: %s, src0: %s", dst_desc, src0_desc);
return;
}
default:
snprintf(out, max_len, "dst: %s", dst_desc);
return;
}
}
} // namespace hexagon

View File

@ -0,0 +1,32 @@
#include "ggml-impl.h"
#include "hexagon_npu.h"
#include "rpc-interface.hpp"
namespace hexagon {
enum npu_device_tensor_op op_to_npu_op(ggml_op op);
const char * get_npu_op_desc(enum npu_device_tensor_op op);
enum npu_device_tensor_data_type type_to_npu_type(ggml_type type);
// TODO: merge with qcom_htp_arch
enum hexagon_dsp_arch {
NONE = 0,
V68,
V69,
V73,
V75,
V79, // SD 8 Gen 4 (SM8750)
};
hexagon_dsp_arch get_dsp_arch(common::rpc_interface_ptr rpc_interface, uint32_t domain_id);
const char * get_dsp_arch_desc(hexagon_dsp_arch arch);
void enable_unsigned_dsp_module(common::rpc_interface_ptr rpc_interface, uint32_t domain_id);
void set_fast_rpc_stack_size(common::rpc_interface_ptr rpc_interface, uint32_t domain_id, uint32_t stack_size);
void get_op_tensor_desc(const ggml_tensor * dst, char * out, size_t max_len);
constexpr const size_t kMaxNpuRpcStructSize = 100; // TODO: figure out the actual size
} // namespace hexagon

View File

@ -0,0 +1,161 @@
#include "AEEStdDef.idl"
#include "AEEStdErr.idl"
#include "remote.idl"
const uint32_t DEVICE_TENSOR_MAX_DIMS = 4;
const uint32_t DEVICE_TENSOR_MAX_SRC = 5;
const uint32_t DEVICE_TENSOR_MAX_OP_PARAMS = 16;
const uint32_t QUANT_BLOCK_SIZE = 32;
const uint32_t QUANT_K_BLOCK_SIZE = 256;
const uint32_t QUANT_K_SCALE_SIZE = 12;
const uint32_t NPU_ROPE_TYPE_NEOX = 2;
const uint32_t NPU_ROPE_TYPE_MROPE = 8;
const uint32_t NPU_ROPE_TYPE_VISION = 24;
const uint32_t NPU_THREAD_STACK_SIZE = 64 * 1024;
interface npu_device : remote_handle64{
typedef int64_t ne_type[DEVICE_TENSOR_MAX_DIMS];
typedef uint64_t nb_type[DEVICE_TENSOR_MAX_DIMS];
typedef int32_t param_type[DEVICE_TENSOR_MAX_OP_PARAMS];
typedef uint64_t tensor_handle_t;
typedef uint64_t graph_handle_t;
const graph_handle_t INVALID_DEVICE_GRAPH_HANDLE = 0;
const tensor_handle_t INVALID_DEVICE_TENSOR_HANDLE = 0;
typedef uint16_t fp16_t;
struct block_q4_0 {
fp16_t d;
uint8_t qs[QUANT_BLOCK_SIZE / 2];
};
struct block_q4_k {
fp16_t d;
fp16_t dmin;
uint8_t scales[QUANT_K_SCALE_SIZE];
uint8_t qs[QUANT_K_BLOCK_SIZE / 2];
};
struct block_q8_0 {
fp16_t d;
int8_t qs[QUANT_BLOCK_SIZE];
};
enum tensor_op {
NPU_OP_MUL_MAT,
NPU_OP_ADD,
NPU_OP_SUB,
NPU_OP_MUL,
NPU_OP_RMS_NORM,
NPU_OP_FLASH_ATTN,
NPU_OP_ROPE,
NPU_OP_GLU,
NPU_OP_GET_ROWS,
NPU_OP_SET_ROWS,
NPU_OP_CPY,
NPU_OP_COUNT
};
enum glu_op {
NPU_GLU_OP_REGLU,
NPU_GLU_OP_GEGLU,
NPU_GLU_OP_SWIGLU,
NPU_GLU_OP_GEGLU_ERF,
NPU_GLU_OP_GEGLU_QUICK,
NPU_GLU_OP_COUNT
};
enum tensor_data_type {
NPU_DATA_TYPE_F32,
NPU_DATA_TYPE_F16,
NPU_DATA_TYPE_I32,
NPU_DATA_TYPE_I64,
NPU_DATA_TYPE_Q8_0,
NPU_DATA_TYPE_Q4_0,
NPU_DATA_TYPE_Q4_K,
NPU_DATA_TYPE_COUNT
};
struct tensor_spec {
ne_type ne;
nb_type nb;
tensor_data_type type;
};
struct tensor_op_spec {
tensor_op op;
param_type params;
};
struct tensor_update_config {
tensor_op op;
param_type params;
tensor_handle_t src_handles[DEVICE_TENSOR_MAX_SRC];
};
struct tensor_config {
ne_type ne;
nb_type nb;
long buffer_fd;
uint64_t offset;
uint64_t size;
tensor_data_type type;
boolean is_constant;
};
AEEResult device_get_alignment(
rout uint32_t alignment
);
AEEResult device_support_op(
in tensor_op_spec op_spec,
in tensor_spec dst,
in sequence<tensor_spec> srcs,
rout boolean is_supported
);
AEEResult tensor_init(
in tensor_config info,
rout tensor_handle_t tensor_handle
);
AEEResult tensor_update_params(
in tensor_handle_t tensor_handle,
in tensor_update_config config
);
AEEResult tensor_free(
in tensor_handle_t tensor_handle
);
AEEResult tensors_free(
in sequence<tensor_handle_t> tensor_handles
);
AEEResult graph_init(
rout graph_handle_t graph_handle
);
AEEResult graph_set_tensor(
in graph_handle_t graph_handle,
in sequence<tensor_handle_t> tensor_handles
);
AEEResult graph_set_tensor_with_param(
in graph_handle_t graph_handle,
in sequence<tensor_handle_t> tensor_handles,
in sequence<tensor_update_config> tensor_params
);
AEEResult graph_compute(
in graph_handle_t graph_handle
);
AEEResult graph_free(
in graph_handle_t graph_handle
);
};
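The QAIC compiler turns this IDL into C stubs prefixed with the interface name (the host code above already calls npu_device_tensor_init, npu_device_tensor_update_params and npu_device_tensors_free). As an illustration of the intended call order only, one graph round-trip would look roughly like the sketch below; the npu_device_graph_* names and signatures are assumed from the IDL and its sequence-to-(pointer, length) convention, not copied from the generated header.

// Hedged sketch, not part of the patch: expected host-side RPC sequence for one graph.
static void run_graph_once(remote_handle64 dev, const npu_device_tensor_handle_t * handles, int n_handles) {
    npu_device_graph_handle_t graph = npu_device_INVALID_DEVICE_GRAPH_HANDLE;  // assumed constant name
    if (npu_device_graph_init(dev, &graph) != AEE_SUCCESS) {
        return;
    }
    npu_device_graph_set_tensor(dev, graph, handles, n_handles);  // bind tensors created via tensor_init
    npu_device_graph_compute(dev, graph);                         // execute the whole graph on the NPU
    npu_device_graph_free(dev, graph);                            // release the device-side graph
}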

View File

@ -0,0 +1,49 @@
file(GLOB qnn_srcs "${CMAKE_CURRENT_LIST_DIR}/*.cpp")
add_library(qnn-backend STATIC
${qnn_srcs}
)
target_include_directories(qnn-backend PRIVATE
${GGML_QNN_SDK_PATH}/include/QNN/
${CMAKE_CURRENT_LIST_DIR}/
${CMAKE_CURRENT_LIST_DIR}/../
${CMAKE_CURRENT_LIST_DIR}/../../
${CMAKE_CURRENT_LIST_DIR}/../../../include/ # TODO: figure out how to remove this
${CMAKE_CURRENT_LIST_DIR}/../shared/
)
target_link_directories(qnn-backend PRIVATE
runtime-common
)
if(GGML_QNN_ENABLE_CPU_BACKEND)
message("GGML_QNN_ENABLE_CPU_BACKEND is enabled")
target_compile_definitions(qnn-backend PUBLIC GGML_QNN_ENABLE_CPU_BACKEND)
else()
message("GGML_QNN_ENABLE_CPU_BACKEND is disabled")
endif()
if(GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING)
message("GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING is enabled")
target_compile_definitions(qnn-backend PUBLIC GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING)
else()
message("GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING is disabled")
endif()
if(CMAKE_SYSTEM_NAME STREQUAL "Android")
set(QNN_DEFAULT_LIB_SEARCH_PATH "/data/local/tmp/" CACHE STRING "customized library search path for QNN backend")
elseif(CMAKE_SYSTEM_NAME STREQUAL "Windows" OR CMAKE_SYSTEM_NAME STREQUAL "Linux")
set(QNN_DEFAULT_LIB_SEARCH_PATH "" CACHE STRING "customized library search path for QNN backend")
else()
message(FATAL_ERROR "QNN now only available on Android, Windows and Linux")
endif()
if(NOT "${QNN_DEFAULT_LIB_SEARCH_PATH}" STREQUAL "")
string(REGEX REPLACE "/$" "" QNN_DEFAULT_LIB_SEARCH_PATH "${QNN_DEFAULT_LIB_SEARCH_PATH}")
endif()
message("GGML_QNN_DEFAULT_LIB_SEARCH_PATH: ${QNN_DEFAULT_LIB_SEARCH_PATH}")
target_compile_definitions(qnn-backend PUBLIC GGML_QNN_DEFAULT_LIB_SEARCH_PATH="${QNN_DEFAULT_LIB_SEARCH_PATH}")

View File

@ -0,0 +1,471 @@
#include "backend-ops.hpp"
#include "ggml-impl.h"
#include "graph.hpp"
#include "logger.hpp"
#include "op-config.hpp"
#include "tensor.hpp"
#include "utils.hpp"
#include <memory>
namespace {
qnn::qnn_graph * get_qnn_graph_from_cache(qnn::ggml_backend_qnn_device_context * ctx, const ggml_cgraph * cgraph) {
auto & graph_cache = ctx->qnn_graph_cache;
std::string graph_key;
auto op_data_type = qnn::qnn_graph::get_graph_key_from_cgraph(cgraph, graph_key);
if (graph_key.empty()) {
QNN_LOG_DEBUG("[%s]empty graph key for cgraph: %p, size: %d\n", qnn::get_backend_name(ctx->device),
(const void *) cgraph, (int) cgraph->n_nodes);
return nullptr;
}
auto it = graph_cache.find(graph_key);
qnn::qnn_graph * graph_ptr = nullptr;
    if (it != graph_cache.end()) {
QNN_LOG_DEBUG("[%s]found graph %s in cache, cache size: %d\n", qnn::get_backend_name(ctx->device),
graph_key.c_str(), (int) graph_cache.size());
graph_ptr = it->second.get();
} else {
auto precision = qnn::qnn_graph::kHtpDefault;
if (op_data_type == GGML_TYPE_F16) {
QNN_LOG_DEBUG("[%s][%s]set graph precision to FP16\n", qnn::get_backend_name(ctx->device),
graph_key.c_str());
precision = qnn::qnn_graph::kHtpFp16;
}
auto graph = std::make_unique<qnn::qnn_graph>(graph_key, ctx->device, ctx->instance, precision,
ctx->socinfo.vtcm_size_in_mb);
if (!graph->is_valid()) {
return nullptr;
}
if (!graph->build_graph_from_ggml_graph(cgraph)) {
QNN_LOG_ERROR("[%s]build_graph_from_op failed\n", qnn::get_backend_name(ctx->device));
return nullptr;
}
graph_ptr = graph.get();
graph_cache[graph_key] = std::move(graph);
QNN_LOG_DEBUG("[%s]add graph %s to cache, cache size: %d\n", qnn::get_backend_name(ctx->device),
graph_key.c_str(), (int) graph_cache.size());
}
return graph_ptr;
}
// TODO: could be merged into the op caps array
constexpr const bool kQnnSupportedOps[] = {
true, // GGML_OP_NONE
false, // GGML_OP_DUP
true, // GGML_OP_ADD
false, // GGML_OP_ADD_ID
false, // GGML_OP_ADD1
false, // GGML_OP_ACC
true, // GGML_OP_SUB
true, // GGML_OP_MUL
false, // GGML_OP_DIV, disabled for now cause failed on test-backend-ops
false, // GGML_OP_SQR
false, // GGML_OP_SQRT, disabled for now cause failed on test-backend-ops
true, // GGML_OP_LOG
false, // GGML_OP_SIN
false, // GGML_OP_COS
false, // GGML_OP_SUM
false, // GGML_OP_SUM_ROWS
false, // GGML_OP_CUMSUM
false, // GGML_OP_MEAN
false, // GGML_OP_ARGMAX
false, // GGML_OP_COUNT_EQUAL
false, // GGML_OP_REPEAT
false, // GGML_OP_REPEAT_BACK
false, // GGML_OP_CONCAT
false, // GGML_OP_SILU_BACK
false, // GGML_OP_NORM
false, // GGML_OP_RMS_NORM
false, // GGML_OP_RMS_NORM_BACK
false, // GGML_OP_GROUP_NORM
false, // GGML_OP_L2_NORM
true, // GGML_OP_MUL_MAT
false, // GGML_OP_MUL_MAT_ID
false, // GGML_OP_OUT_PROD
false, // GGML_OP_SCALE
false, // GGML_OP_SET
false, // GGML_OP_CPY
false, // GGML_OP_CONT
false, // GGML_OP_RESHAPE
false, // GGML_OP_VIEW
false, // GGML_OP_PERMUTE
false, // GGML_OP_TRANSPOSE
false, // GGML_OP_GET_ROWS
false, // GGML_OP_GET_ROWS_BACK
false, // GGML_OP_SET_ROWS
false, // GGML_OP_DIAG
false, // GGML_OP_DIAG_MASK_INF
false, // GGML_OP_DIAG_MASK_ZERO
false, // GGML_OP_SOFT_MAX
false, // GGML_OP_SOFT_MAX_BACK
false, // GGML_OP_ROPE
false, // GGML_OP_ROPE_BACK
false, // GGML_OP_CLAMP
false, // GGML_OP_CONV_TRANSPOSE_1D
false, // GGML_OP_IM2COL
false, // GGML_OP_IM2COL_BACK
false, // GGML_OP_IM2COL_3D
false, // GGML_OP_CONV_2D
false, // GGML_OP_CONV_3D
false, // GGML_OP_CONV_2D_DW
false, // GGML_OP_CONV_TRANSPOSE_2D
false, // GGML_OP_POOL_1D
false, // GGML_OP_POOL_2D
false, // GGML_OP_POOL_2D_BACK
false, // GGML_OP_UPSCALE
false, // GGML_OP_PAD
false, // GGML_OP_ROLL
false, // GGML_OP_PAD_REFLECT_1D
false, // GGML_OP_ARANGE
false, // GGML_OP_TIMESTEP_EMBEDDING
false, // GGML_OP_ARGSORT
false, // GGML_OP_TOP_K
false, // GGML_OP_LEAKY_RELU
false, // GGML_OP_TRI
false, // GGML_OP_FILL
false, // GGML_OP_FLASH_ATTN_EXT
false, // GGML_OP_FLASH_ATTN_BACK
false, // GGML_OP_SSM_CONV
false, // GGML_OP_SSM_SCAN
false, // GGML_OP_WIN_PART
false, // GGML_OP_WIN_UNPART
false, // GGML_OP_GET_REL_POS
false, // GGML_OP_ADD_REL_POS
false, // GGML_OP_RWKV_WKV6
false, // GGML_OP_GATED_LINEAR_ATTN
false, // GGML_OP_RWKV_WKV7
false, // GGML_OP_SOLVE_TRI
false, // GGML_OP_UNARY
false, // GGML_OP_MAP_CUSTOM1
false, // GGML_OP_MAP_CUSTOM2
false, // GGML_OP_MAP_CUSTOM3
false, // GGML_OP_CUSTOM
false, // GGML_OP_CROSS_ENTROPY_LOSS
false, // GGML_OP_CROSS_ENTROPY_LOSS_BACK
false, // GGML_OP_OPT_STEP_ADAMW
false, // GGML_OP_OPT_STEP_SGD
false, // GGML_OP_GLU
// ggml_unary_op
false, // GGML_UNARY_OP_ABS
false, // GGML_UNARY_OP_SGN
false, // GGML_UNARY_OP_NEG
false, // GGML_UNARY_OP_STEP
false, // GGML_UNARY_OP_TANH
false, // GGML_UNARY_OP_ELU
false, // GGML_UNARY_OP_RELU
false, // GGML_UNARY_OP_SIGMOID
true, // GGML_UNARY_OP_GELU
false, // GGML_UNARY_OP_GELU_QUICK
false, // GGML_UNARY_OP_SILU
false, // GGML_UNARY_OP_HARDSWISH
false, // GGML_UNARY_OP_HARDSIGMOID
false, // GGML_UNARY_OP_EXP
false, // GGML_UNARY_OP_EXPM1
false, // GGML_UNARY_OP_SOFTPLUS
false, // GGML_UNARY_OP_GELU_ERF
false, // GGML_UNARY_OP_XIELU
false, // GGML_UNARY_OP_FLOOR
false, // GGML_UNARY_OP_CEIL
false, // GGML_UNARY_OP_ROUND
false, // GGML_UNARY_OP_TRUNC
};
static_assert(kQnnSupportedOps[GGML_OP_NONE], "GGML_OP_NONE is not true");
static_assert(kQnnSupportedOps[GGML_OP_ADD], "GGML_OP_ADD is not true");
static_assert(kQnnSupportedOps[GGML_OP_MUL], "GGML_OP_MUL is not true");
static_assert(kQnnSupportedOps[GGML_OP_MUL_MAT], "GGML_OP_MUL_MAT is not true");
static_assert(!kQnnSupportedOps[GGML_OP_RESHAPE], "GGML_OP_RESHAPE should not be true");
static_assert(!kQnnSupportedOps[GGML_OP_VIEW], "GGML_OP_VIEW is not false");
static_assert(std::size(kQnnSupportedOps) == (GGML_OP_COUNT + GGML_UNARY_OP_COUNT),
"GGML_OP_COUNT does not match the size of the kQnnSupportedOps table");
inline bool is_type_bit_enabled(uint64_t bits, ggml_type type) {
return bits & (uint64_t(1) << type);
}
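supported_types and cpu_preprocess_types are per-device bitmasks indexed by ggml_type; the real values come from qnn::get_device_caps(). An illustrative mask for a device that natively handles only F32 and F16:

// Illustrative value only, not taken from the device caps table.
constexpr uint64_t kExampleSupportedTypes = (uint64_t(1) << GGML_TYPE_F32) | (uint64_t(1) << GGML_TYPE_F16);
// is_type_bit_enabled(kExampleSupportedTypes, GGML_TYPE_F16)  -> true
// is_type_bit_enabled(kExampleSupportedTypes, GGML_TYPE_Q4_0) -> false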
inline bool is_tensor_size_valid(qnn::ggml_backend_qnn_device_context * ctx, const ggml_tensor * tensor) {
constexpr const auto get_tensor_size_in_bytes = [](const ggml_tensor * tensor, ggml_type type) -> size_t {
return tensor->ne[0] * tensor->ne[1] * tensor->ne[2] * tensor->ne[3] * ggml_type_size(type);
};
auto type = tensor->type;
if (ggml_is_quantized(type) && ctx->enable_cpu_dequantize) {
type = GGML_TYPE_F32; // TODO: [quantize] fix me if plan to dequantize to other types
}
const auto tensor_size = get_tensor_size_in_bytes(tensor, type);
if (ctx->max_tensor_size_in_bytes && tensor_size >= ctx->max_tensor_size_in_bytes) {
QNN_LOG_DEBUG("[%s]tensor(%s_%dx%dx%dx%d) size(%lld) exceeds the limit(%lld)\n",
qnn::get_backend_name(ctx->device), ggml_get_name(tensor), (int) tensor->ne[0],
(int) tensor->ne[1], (int) tensor->ne[2], (int) tensor->ne[3], (long long int) tensor_size,
(long long int) ctx->max_tensor_size_in_bytes);
return false;
}
return true;
}
bool is_tensor_type_valid(qnn::ggml_backend_qnn_device_context * ctx, const ggml_tensor * tensor) {
if (!tensor) {
QNN_LOG_DEBUG("tensor is nullptr\n");
return false;
}
#ifndef NDEBUG
if (tensor->view_src) {
auto * src_tensor = tensor->view_src;
QNN_LOG_DEBUG("[%s]tensor(%s_%dx%dx%dx%d) is a view, src: %s_%dx%dx%dx%d\n", qnn::get_backend_name(ctx->device),
ggml_get_name(tensor), (int) tensor->ne[0], (int) tensor->ne[1], (int) tensor->ne[2],
(int) tensor->ne[3], ggml_get_name(src_tensor), (int) src_tensor->ne[0], (int) src_tensor->ne[1],
(int) src_tensor->ne[2], (int) src_tensor->ne[3]);
}
#endif
switch (tensor->type) {
case GGML_TYPE_F32:
case GGML_TYPE_F16:
if (!is_type_bit_enabled(ctx->supported_types, tensor->type)) {
QNN_LOG_DEBUG("[%s]unsupported data type %s, supported_types: 0x%x\n",
qnn::get_backend_name(ctx->device), ggml_type_name(tensor->type),
(unsigned int) ctx->supported_types);
return false;
}
break;
default:
QNN_LOG_DEBUG("[%s]unsupported data type %s\n", qnn::get_backend_name(ctx->device),
ggml_type_name(tensor->type));
return false;
}
return true;
}
bool is_data_reinterpretation_op(ggml_op op) {
return op == GGML_OP_VIEW || op == GGML_OP_PERMUTE;
}
bool ggnl_qnn_supports_op_tensor(qnn::ggml_backend_qnn_device_context * ctx, const ggml_tensor * op) {
if (op->op == GGML_OP_NONE) {
return true;
}
if (!is_tensor_type_valid(ctx, op) || !is_tensor_size_valid(ctx, op)) {
return false;
}
    // TODO: fix for other ops
const bool cpu_dequant = ctx->enable_cpu_dequantize && op->op == GGML_OP_MUL_MAT;
for (size_t i = 0; i < GGML_MAX_SRC && op->src[i]; ++i) {
auto * src = op->src[i];
if (!is_tensor_size_valid(ctx, src)) {
return false;
}
// passthrough the quantized tensor for CPU dequantization
if (!is_tensor_type_valid(ctx, src) && (!cpu_dequant || !ggml_is_quantized(src->type))) {
return false;
}
}
return true;
}
bool ggml_qnn_have_same_tensor_types(qnn::ggml_backend_qnn_device_context * ctx, const ggml_tensor * op) {
auto * src0 = op->src[0];
auto * src1 = op->src[1];
if (src1) {
if (src0->type != op->type || src1->type != op->type) {
QNN_LOG_DEBUG("[%s][%s]type src0(%s), src1(%s) and op(%s) are not equal\n",
qnn::get_backend_name(ctx->device), ggml_op_name(op->op), ggml_type_name(src0->type),
ggml_type_name(src1->type), ggml_type_name(op->type));
return false;
}
} else {
if (src0->type != op->type) {
QNN_LOG_DEBUG("[%s][%s]type src0(%s) and op(%s) are not equal\n", qnn::get_backend_name(ctx->device),
ggml_op_name(op->op), ggml_type_name(src0->type), ggml_type_name(op->type));
return false;
}
}
#ifdef NDEBUG
GGML_UNUSED(ctx);
#endif
return true;
}
// TODO: move to caps array?
bool ggml_qnn_supports_matmul_op(qnn::ggml_backend_qnn_device_context * ctx, const ggml_tensor * op) {
auto * src0 = op->src[0];
auto * src1 = op->src[1];
if (is_data_reinterpretation_op(src0->op) || is_data_reinterpretation_op(src1->op)) {
// TODO: remove the blocker here when we support permute op
QNN_LOG_DEBUG("[%s][MUL_MAT]data reorganization op is not supported, (%s, %s)\n",
qnn::get_backend_name(ctx->device), ggml_op_name(src0->op), ggml_op_name(src1->op));
return false;
}
switch (ctx->device) {
case QNN_BACKEND_NPU:
if (src1->ne[2] != src0->ne[2] || src1->ne[3] != src0->ne[3]) {
/*
* TODO: remove the blocker here when NPU backend supports mul_mat like this:
* [ne03, ne02, n, k] * [ne03 * x, ne02 * y, m, k] -> [ne03 * x, ne02 * y, m, n]
*/
QNN_LOG_DEBUG("[qnn-npu][MUL_MAT]src0 and src1 dimensions are not equal\n");
return false;
}
            // fall through; per the benchmark below, the convert op is very slow on the NPU:
// https://github.com/usefulsensors/qc_npu_benchmark
case QNN_BACKEND_GPU:
if (!ggml_qnn_have_same_tensor_types(ctx, op) && op->type != GGML_TYPE_F32) {
// for different tensor types and not float32, we don't support it currently, since there's no convert
QNN_LOG_DEBUG("[%s][MUL_MAT]src0 and src1 and dst types are not equal\n",
qnn::get_backend_name(ctx->device));
return false;
}
if (op->type == GGML_TYPE_F32 && ggml_is_quantized(src0->type) &&
!is_type_bit_enabled(ctx->cpu_preprocess_types, src0->type)) {
// for such cases that src0 is quantized and op is float32, check if the quant type is enabled
QNN_LOG_DEBUG("[%s][MUL_MAT]quantized src0 type %s is not enabled\n",
qnn::get_backend_name(ctx->device), ggml_type_name(src0->type));
return false;
}
break;
default:
break;
}
if ((src1->ne[2] % src0->ne[2]) != 0 || (src1->ne[3] % src0->ne[3]) != 0) {
QNN_LOG_DEBUG("[%s][MUL_MAT]src0 and src1 dimensions are not equal\n", qnn::get_backend_name(ctx->device));
return false;
}
QNN_LOG_DEBUG("[%s][MUL_MAT]supported matmul op\n", qnn::get_backend_name(ctx->device));
return true;
}
#ifndef NDEBUG
void print_tensor_info(qnn::ggml_backend_qnn_device_context * ctx, const ggml_tensor * op, bool is_supported) {
const char * supported = is_supported ? "supported" : "unsupported";
std::string op_key;
qnn::get_qnn_op_desc(op, true, GGML_TYPE_COUNT, op_key);
QNN_LOG_DEBUG("[%s][%s]op was %s, support/unsupported: %d/%d\n", qnn::get_backend_name(ctx->device), op_key.c_str(),
supported, ctx->supported_op_count.load(), ctx->unsupported_op_count.load());
}
#endif
} // namespace
namespace qnn {
bool device_supports_op(qnn::ggml_backend_qnn_device_context * ctx, const ggml_tensor * op) {
// Note that this function could be called before the device context is initialized
if (op->op == GGML_OP_NONE) {
return true;
}
if (!kQnnSupportedOps[qnn::get_qnn_op_index(op)]) {
#ifndef NDEBUG
ctx->unsupported_op_count++;
print_tensor_info(ctx, op, false);
#endif
return false;
}
if (!ggnl_qnn_supports_op_tensor(ctx, op)) {
#ifndef NDEBUG
ctx->unsupported_op_count++;
print_tensor_info(ctx, op, false);
#endif
return false;
}
bool is_op_supported = true;
if (op->op == GGML_OP_UNARY) {
const auto unary_op = ggml_get_unary_op(op);
if (unary_op == GGML_UNARY_OP_GELU) {
// TODO: fix this
QNN_LOG_DEBUG("[GELU]unsupported unary op GGML_UNARY_OP_GELU for NPU\n");
is_op_supported = false;
}
} else {
auto * src0 = op->src[0];
auto * src1 = op->src[1];
switch (op->op) {
case GGML_OP_MUL:
// TODO: fix this when we have the support for mul with rms_norm
if (ctx->enable_cpu_dequantize && (src0->op == GGML_OP_RMS_NORM || src1->op == GGML_OP_RMS_NORM)) {
QNN_LOG_DEBUG("[%s][%s]skip unsupported mul with rms norm, (%s, %s)\n",
qnn::get_backend_name(ctx->device), ggml_op_desc(op), ggml_op_desc(src0),
ggml_op_desc(src1));
is_op_supported = false;
break;
}
                // fall through; just skip the mul with rms_norm (in llama it appears at the start of each decoder block)
case GGML_OP_ADD:
case GGML_OP_SUB:
case GGML_OP_DIV:
// TODO: move to op caps array?
if (!ggml_are_same_shape(src0, src1)) {
QNN_LOG_DEBUG("[%s][%s] src0 and src1 dimensions are not equal\n",
qnn::get_backend_name(ctx->device), ggml_op_desc(op));
is_op_supported = false;
}
break;
case GGML_OP_MUL_MAT:
is_op_supported = ggml_qnn_supports_matmul_op(ctx, op);
break;
default:
is_op_supported = ggml_qnn_have_same_tensor_types(ctx, op);
break;
}
}
#ifndef NDEBUG
if (is_op_supported) {
ctx->supported_op_count++;
} else {
ctx->unsupported_op_count++;
}
print_tensor_info(ctx, op, is_op_supported);
#endif
return is_op_supported;
}
bool device_compute_graph(qnn::ggml_backend_qnn_device_context * ctx, ggml_cgraph * cgraph) {
QNN_LOG_DEBUG("[%s]compute graph start, nodes count: %d\n", qnn::get_backend_name(ctx->device),
(int) cgraph->n_nodes);
auto qnn_graph = get_qnn_graph_from_cache(ctx, cgraph);
bool success = qnn_graph && qnn_graph->execute(cgraph, ctx->convert_context);
QNN_LOG_DEBUG("[%s]compute graph, success: %d\n", qnn::get_backend_name(ctx->device), (int) success);
return success;
}
} // namespace qnn

View File

@ -0,0 +1,59 @@
#pragma once
#ifndef NDEBUG
# include <atomic>
#endif
#include <memory>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include "convert.hpp"
#include "ggml-backend.h"
#include "ggml-qnn.h"
#include "ggml.h"
#include "graph.hpp"
#include "qnn-lib.hpp"
namespace qnn {
typedef std::unordered_map<std::string, std::unique_ptr<qnn::qnn_graph>> qnn_graph_cache_t;
struct ggml_backend_qnn_device_context {
// initialize in constructor
backend_index_type device;
size_t threads;
std::string name;
std::string description;
// initialize in qnn init
qnn::qcom_socinfo socinfo = {};
size_t max_tensor_size_in_bytes;
std::shared_ptr<qnn::qnn_instance> instance;
std::shared_ptr<qnn::qnn_interface> qnn_interface;
qnn::qnn_graph_cache_t qnn_graph_cache;
std::shared_ptr<qnn::qnn_convert_context_t> convert_context = std::make_shared<qnn::qnn_convert_context_t>();
#ifndef NDEBUG
std::atomic_uint32_t supported_op_count = 0;
std::atomic_uint32_t unsupported_op_count = 0;
#endif
bool enable_cpu_dequantize = false;
uint64_t supported_types;
uint64_t cpu_preprocess_types;
explicit ggml_backend_qnn_device_context(backend_index_type device, size_t threads, const char * name,
uint64_t supported_types) :
device(device),
threads(threads),
name(name),
supported_types(supported_types) {}
};
bool device_supports_op(ggml_backend_qnn_device_context * ctx, const ggml_tensor * op);
bool device_compute_graph(ggml_backend_qnn_device_context * ctx, ggml_cgraph * cgraph);
} // namespace qnn

View File

@ -0,0 +1,187 @@
#pragma once
#include <cstdint>
#include <memory>
#include "logger.hpp"
#include "qnn-lib.hpp"
namespace qnn {
/**
* @brief An interface for managing generic QNN buffers.
*
* This abstract class defines the interface for managing generic memory buffers in a QNN context.
*/
class qnn_buffer_interface {
public:
virtual ~qnn_buffer_interface() = default;
/**
* @brief Checks if the buffer is valid.
*
* This pure virtual function must be implemented by derived classes to check
* the validity of the buffer.
*
* @return true if the buffer is valid, false otherwise.
*/
virtual bool is_valid() const = 0;
/**
* @brief Gets the buffer pointer.
*
* This pure virtual function must be implemented by derived classes to return
* a pointer to the buffer.
*
* @return A pointer to the buffer.
*/
virtual uint8_t * get_buffer() = 0;
/**
     * @brief Gets the size of the buffer.
     *
     * This pure virtual function must be implemented by derived classes to return
     * the size of the buffer in bytes.
     *
     * @return The buffer size in bytes.
*/
virtual size_t get_size() const = 0;
/**
* @brief Gets the QNN memory handle associated with the buffer.
*
* This pure virtual function must be implemented by derived classes to return
* the memory handle associated with the buffer.
*
* @return The memory handle, or null if no valid QNN memory handle is attached.
*/
virtual Qnn_MemHandle_t get_mem_handle() const = 0;
};
using qnn_buffer_ptr = std::shared_ptr<qnn_buffer_interface>;
/**
* @brief A class for managing QNN RPC memory buffers.
*
* This class is responsible for allocating, registering, and managing a buffer in RPC memory.
* It ensures that the buffer is properly allocated and registered with the QNN instance, and
* handles cleanup of the buffer and its associated memory handle upon destruction.
*/
class qnn_rpc_buffer : public qnn_buffer_interface {
public:
qnn_rpc_buffer(qnn_instance_ptr qnn_instance, const size_t size, const uint32_t rank, uint32_t * dimensions,
Qnn_DataType_t data_type) :
_size(size),
_qnn_instance(qnn_instance) {
_qnn_rpc_buffer = static_cast<uint8_t *>(qnn_instance->alloc_rpcmem(size, alignof(uint8_t *)));
_qnn_rpc_mem_handle = qnn_instance->register_rpcmem(_qnn_rpc_buffer, rank, dimensions, data_type);
if (!_qnn_rpc_buffer || !_qnn_rpc_mem_handle) {
QNN_LOG_WARN("Failed to register RPC memory: buffer or memory handle is null\n");
// let the destructor free the buffer
return;
}
QNN_LOG_DEBUG("alloc rpcmem(%p) successfully, size %d\n", (void *) _qnn_rpc_buffer, (int) size);
}
~qnn_rpc_buffer() {
if (_qnn_instance) {
if (_qnn_rpc_mem_handle) {
_qnn_instance->unregister_rpcmem(_qnn_rpc_mem_handle);
}
if (_qnn_rpc_buffer) {
_qnn_instance->free_rpcmem(_qnn_rpc_buffer);
}
}
}
bool is_valid() const override { return _qnn_rpc_buffer && _qnn_rpc_mem_handle; }
uint8_t * get_buffer() override { return _qnn_rpc_buffer; }
size_t get_size() const override { return _size; }
Qnn_MemHandle_t get_mem_handle() const override { return _qnn_rpc_mem_handle; }
private:
size_t _size = 0;
uint8_t * _qnn_rpc_buffer = nullptr;
Qnn_MemHandle_t _qnn_rpc_mem_handle = nullptr;
qnn_instance_ptr _qnn_instance;
DISABLE_COPY(qnn_rpc_buffer);
DISABLE_MOVE(qnn_rpc_buffer);
};
/**
* @brief A class for managing QNN memory buffers allocated in regular memory.
*
* This class is responsible for allocating, managing, and freeing memory buffers
* in regular (non-RPC) memory. It implements the qnn_buffer_interface to provide
* a consistent interface for buffer management.
*/
class qnn_mem_buffer : public qnn_buffer_interface {
public:
explicit qnn_mem_buffer(const uint8_t * data, size_t size) {
_buffer = reinterpret_cast<uint8_t *>(qnn::page_align_alloc(size));
if (!_buffer) {
QNN_LOG_WARN("failed to allocate %.2f MiB\n", float(size / (1 << 20)));
return;
}
_size = size;
if (data) {
memcpy(_buffer, data, size);
}
QNN_LOG_DEBUG("alloc buffer: %p, size: %ld\n", (void *) _buffer, (long) size);
}
explicit qnn_mem_buffer(size_t size) : qnn_mem_buffer(nullptr, size) {}
~qnn_mem_buffer() {
QNN_LOG_DEBUG("free buffer: %p, size: %ld\n", (void *) _buffer, (long) _size);
// the free will do nothing if the _buffer is nullptr
qnn::align_free(_buffer);
}
bool is_valid() const override { return _buffer != nullptr; }
uint8_t * get_buffer() override { return _buffer; }
size_t get_size() const override { return _size; }
Qnn_MemHandle_t get_mem_handle() const override { return nullptr; }
private:
size_t _size = 0;
uint8_t * _buffer = nullptr;
DISABLE_COPY(qnn_mem_buffer);
DISABLE_MOVE(qnn_mem_buffer);
};
class qnn_mem_buffer_slice : public qnn_buffer_interface {
public:
qnn_mem_buffer_slice(const uint8_t * buffer, size_t size) : _buffer(const_cast<uint8_t *>(buffer)), _size(size) {}
bool is_valid() const override { return _buffer && _size; }
uint8_t * get_buffer() override { return _buffer; }
size_t get_size() const override { return _size; }
Qnn_MemHandle_t get_mem_handle() const override { return nullptr; }
private:
uint8_t * _buffer = nullptr;
size_t _size = 0;
DISABLE_COPY(qnn_mem_buffer_slice);
DISABLE_MOVE(qnn_mem_buffer_slice);
};
} // namespace qnn
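A short composition example for the three buffer flavors above (a sketch using only the classes declared in this header): qnn_mem_buffer owns page-aligned host memory, qnn_mem_buffer_slice is a non-owning window into it, and qnn_rpc_buffer replaces them when a registered QNN memory handle is required.

// Hedged sketch, not part of the patch: carving a non-owning slice out of an owning buffer.
static qnn::qnn_buffer_ptr make_scratch_slice(size_t total_size, size_t slice_size, qnn::qnn_buffer_ptr & owner_out) {
    auto owner = std::make_shared<qnn::qnn_mem_buffer>(total_size);  // page-aligned allocation
    if (!owner->is_valid() || slice_size > owner->get_size()) {
        return nullptr;
    }
    owner_out = owner;  // the caller must keep the owning buffer alive while the slice is in use
    // the slice shares the memory; no extra allocation, no copy, no mem handle
    return std::make_shared<qnn::qnn_mem_buffer_slice>(owner->get_buffer(), slice_size);
}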

View File

@ -0,0 +1,155 @@
#include "convert.hpp"
#include "logger.hpp"
namespace {
size_t get_convert_buffer_size(const qnn::ggml_dimension_array_t & dimensions, ggml_type dst_type) {
GGML_ASSERT(ggml_blck_size(dst_type) == 1);
size_t nbytes = ggml_type_size(dst_type);
for (size_t i = 0; i < GGML_MAX_DIMS; ++i) {
nbytes *= dimensions[i]; // tight packing
}
return nbytes;
}
// adapted from ggml_backend_blas_mul_mat: use OpenMP when available, otherwise fall back to a std::async based solution
// TODO: remove this once the convert can be delegated to the BLAS backend
#ifdef GGML_USE_OPENMP
void convert_tensor_impl(const ggml_tensor * src, int max_threads,
std::shared_ptr<qnn::qnn_mem_buffer_slice> & output_buffer) {
const auto ne03 = src->ne[3];
const auto ne02 = src->ne[2];
const auto ne01 = src->ne[1];
const auto ne00 = src->ne[0];
const auto ne_plane = ne01 * ne00;
const auto nb03 = src->nb[3];
const auto nb02 = src->nb[2];
const auto nb01 = src->nb[1];
const int min_cols_per_thread = 4096;
void * wdata = output_buffer->get_buffer();
const auto to_float = ggml_get_type_traits(src->type)->to_float;
GGML_ASSERT(to_float);
for (int64_t i03 = 0; i03 < ne03; i03++) {
for (int64_t i02 = 0; i02 < ne02; i02++) {
const void * x = (char *) src->data + i02 * nb02 + i03 * nb03;
float * const wplane = (float *) wdata + i02 * ne_plane + i03 * ne02 * ne_plane;
const int min_rows_per_thread = std::max((int) (min_cols_per_thread / ne00), 1);
const int n_threads = std::max(std::min(max_threads, (int) (ne01 / min_rows_per_thread)), 1);
# pragma omp parallel for num_threads(n_threads)
for (int64_t i01 = 0; i01 < ne01; i01++) {
to_float((const char *) x + i01 * nb01, wplane + i01 * ne00, ne00);
}
}
}
}
#else
void convert_tensor_impl(const ggml_tensor * src, int max_threads, std::vector<std::future<void>> & tasks,
std::shared_ptr<qnn::qnn_mem_buffer_slice> & output_buffer) {
const auto ne03 = src->ne[3];
const auto ne02 = src->ne[2];
const auto ne01 = src->ne[1];
const auto ne00 = src->ne[0];
const auto ne_plane = ne01 * ne00;
const auto nb03 = src->nb[3];
const auto nb02 = src->nb[2];
const auto nb01 = src->nb[1];
const int min_cols_per_thread = 4096;
void * wdata = output_buffer->get_buffer();
const auto to_float = ggml_get_type_traits(src->type)->to_float;
GGML_ASSERT(to_float);
for (int64_t i03 = 0; i03 < ne03; i03++) {
for (int64_t i02 = 0; i02 < ne02; i02++) {
const void * x = (char *) src->data + i02 * nb02 + i03 * nb03;
float * const wplane = (float *) wdata + i02 * ne_plane + i03 * ne02 * ne_plane;
const int min_rows_per_thread = std::max((int) (min_cols_per_thread / ne00), 1);
const int n_threads = std::max(std::min(max_threads, (int) (ne01 / min_rows_per_thread)), 1);
for (int i = 1; i < n_threads; i++) {
const int64_t start = i * ne01 / n_threads;
const int64_t end = (i + 1) * ne01 / n_threads;
if (start < end) {
tasks.push_back(std::async(std::launch::async, [=]() {
for (int64_t i01 = start; i01 < end; i01++) {
to_float((const char *) x + i01 * nb01, wplane + i01 * ne00, ne00);
}
}));
}
}
{
// reuse the current thread for the first task
const int64_t start = 0;
const int64_t end = ne01 / n_threads;
for (int64_t i01 = start; i01 < end; i01++) {
to_float((const char *) x + i01 * nb01, wplane + i01 * ne00, ne00);
}
}
}
}
// wait for all tasks to finish
for (auto & task : tasks) {
task.get();
}
tasks.clear();
}
#endif
} // namespace
namespace qnn {
std::vector<qnn::qnn_buffer_ptr> convert(std::shared_ptr<qnn_convert_context_t> convert_context,
const ggml_tensor_array_t & tensors, ggml_type target_data_type) {
convert_context->buffers.resize(tensors.size());
std::vector<qnn::qnn_buffer_ptr> output_buffers(tensors.size());
for (size_t i = 0; i < tensors.size(); ++i) {
const ggml_tensor * src = tensors[i];
if (src->type == target_data_type) {
continue;
}
auto & data_buffer = convert_context->buffers[i];
const auto dst_size = get_convert_buffer_size(src->ne, target_data_type);
if (!data_buffer || data_buffer->get_size() < dst_size) {
#ifndef NDEBUG
auto old_size = data_buffer ? data_buffer->get_size() : 0;
QNN_LOG_DEBUG("create buffer[%d] for tensor %s(%s), old_size: %d, new_size: %d\n", (int) i,
ggml_get_name(src), ggml_type_name(src->type), (int) old_size, (int) dst_size);
#endif
data_buffer = std::make_shared<qnn::qnn_mem_buffer>(dst_size);
}
// TODO: add more restrictions to the buffer slice here
std::shared_ptr<qnn::qnn_mem_buffer_slice> output_buffer =
std::make_shared<qnn::qnn_mem_buffer_slice>(data_buffer->get_buffer(), dst_size);
QNN_LOG_DEBUG("convert tensor(%s) from %s to %s, size: %d, n_threads: %d\n", ggml_get_name(src),
ggml_type_name(src->type), ggml_type_name(target_data_type), (int) dst_size,
convert_context->n_threads);
#ifdef GGML_USE_OPENMP
convert_tensor_impl(src, convert_context->n_threads, output_buffer);
#else
convert_tensor_impl(src, convert_context->n_threads, convert_context->tasks, output_buffer);
#endif
output_buffers[i] = output_buffer;
}
return output_buffers;
}
} // namespace qnn
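The thread-count heuristic in convert_tensor_impl gives each worker at least min_cols_per_thread (4096) elements before spawning another one. A worked example with illustrative numbers (the dimensions below are not taken from the patch):

// Worked example of the partitioning above; values are illustrative only.
#include <algorithm>

constexpr int kMaxThreads       = 8;     // e.g. what std::thread::hardware_concurrency() reports
constexpr int kNe00             = 128;   // elements per row
constexpr int kNe01             = 512;   // rows per plane
constexpr int kMinColsPerThread = 4096;

constexpr int kMinRowsPerThread = std::max(kMinColsPerThread / kNe00, 1);                         // = 32
constexpr int kNumThreads       = std::max(std::min(kMaxThreads, kNe01 / kMinRowsPerThread), 1);  // = 8
static_assert(kNumThreads == 8, "a 128x512 plane keeps all 8 threads busy; a 128x64 plane would use only 2");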

View File

@ -0,0 +1,26 @@
#pragma once
#include <future>
#include <memory>
#include <thread>
#include "buffer.hpp"
#include "ggml-qnn.h"
#include "tensor.hpp"
#include "utils.hpp"
namespace qnn {
// see also: ggml_backend_blas_context
struct qnn_convert_context_t {
int n_threads = std::thread::hardware_concurrency();
std::vector<std::shared_ptr<qnn_mem_buffer>> buffers;
#ifndef GGML_USE_OPENMP
std::vector<std::future<void>> tasks;
#endif
};
std::vector<qnn::qnn_buffer_ptr> convert(std::shared_ptr<qnn_convert_context_t> convert_context,
const ggml_tensor_array_t & tensors, ggml_type target_data_type);
} // namespace qnn

View File

@ -0,0 +1,170 @@
#include "event_tracer.hpp"
#include <HTP/QnnHtpProfile.h>
#include <QnnProfile.h>
#include "logger.hpp"
#include "qnn-lib.hpp"
namespace {
std::string get_duration_string(const QnnProfile_EventData_t & event_data) {
char time_str[128] = {};
switch (event_data.unit) {
case QNN_PROFILE_EVENTUNIT_CYCLES:
snprintf(time_str, sizeof(time_str), "cycles: %lld", (long long int) event_data.value);
break;
case QNN_PROFILE_EVENTUNIT_COUNT:
snprintf(time_str, sizeof(time_str), "count: %lld", (long long int) event_data.value);
break;
case QNN_PROFILE_EVENTUNIT_BYTES:
snprintf(time_str, sizeof(time_str), "size: %lld bytes", (long long int) event_data.value);
break;
case QNN_PROFILE_EVENTUNIT_MICROSEC:
{
double duration_ms = event_data.value / 1000.0;
snprintf(time_str, sizeof(time_str), "duration: %.3f ms", duration_ms);
}
break;
default:
break;
}
return time_str;
}
} // namespace
namespace qnn {
qnn_event_tracer::qnn_event_tracer(const std::string & prefix, std::shared_ptr<qnn_interface> interface,
Qnn_BackendHandle_t backend_handle, sdk_profile_level level) :
_interface(interface),
_prefix(prefix) {
QnnProfile_Level_t qnn_profile_level = 0;
switch (level) {
case sdk_profile_level::PROFILE_BASIC:
qnn_profile_level = QNN_PROFILE_LEVEL_BASIC;
break;
case sdk_profile_level::PROFILE_OP_TRACE:
case sdk_profile_level::PROFILE_DETAIL:
qnn_profile_level = QNN_PROFILE_LEVEL_DETAILED;
break;
case sdk_profile_level::PROFILE_OFF:
default:
QNN_LOG_WARN("[profiler][%s]invalid profile level %d, using PROFILE_OFF\n", _prefix.c_str(), level);
return;
}
auto error = _interface->qnn_profile_create(backend_handle, qnn_profile_level, &_handle);
if (error != QNN_SUCCESS) {
QNN_LOG_ERROR("[profiler][%s]failed to create QNN profile_handle. Backend ID %u, error %ld\n", _prefix.c_str(),
_interface->get_backend_id(), (long) QNN_GET_ERROR_CODE(error));
_handle = nullptr;
return;
}
if (level == sdk_profile_level::PROFILE_OP_TRACE) {
QnnProfile_Config_t qnn_profile_config = QNN_PROFILE_CONFIG_INIT;
qnn_profile_config.option = QNN_PROFILE_CONFIG_OPTION_ENABLE_OPTRACE;
std::array<const QnnProfile_Config_t *, 2> profile_configs = { &qnn_profile_config, nullptr };
error = _interface->qnn_profile_set_config(_handle, profile_configs.data());
if (error != QNN_SUCCESS) {
QNN_LOG_ERROR("[profiler][%s]failed to set QNN profile event. Backend ID %u, error %ld\n", _prefix.c_str(),
_interface->get_backend_id(), (long) QNN_GET_ERROR_CODE(error));
_interface->qnn_profile_free(_handle);
_handle = nullptr;
return;
}
}
QNN_LOG_DEBUG("[profiler][%s]created, Backend ID %u, level %d\n", _prefix.c_str(), _interface->get_backend_id(),
level);
}
qnn_event_tracer::~qnn_event_tracer() {
if (_handle) {
Qnn_ErrorHandle_t error = _interface->qnn_profile_free(_handle);
if (error != QNN_SUCCESS) {
QNN_LOG_ERROR("[profiler][%s]failed to free QNN profile_handle. Backend ID %u, error %ld\n",
_prefix.c_str(), _interface->get_backend_id(), (long) QNN_GET_ERROR_CODE(error));
}
_handle = nullptr;
}
}
void qnn_event_tracer::print_profile_events() {
const QnnProfile_EventId_t * events_ptr = nullptr;
uint32_t num_events = 0;
auto error = _interface->qnn_profile_get_events(_handle, &events_ptr, &num_events);
if (error != QNN_SUCCESS) {
QNN_LOG_ERROR("[profiler][%s]failed to get QNN profile events. Backend ID %u, error %ld\n", _prefix.c_str(),
_interface->get_backend_id(), (long) QNN_GET_ERROR_CODE(error));
return;
}
if (!num_events) {
QNN_LOG_INFO("[profiler][%s]no QNN profile events\n", _prefix.c_str());
return;
}
QNN_LOG_INFO("[profiler][%s]print_profile_events start ----------------\n", _prefix.c_str());
// see also: https://github.com/pytorch/executorch/blob/0ccf5093823761cf8ad98c75e5fe81f15ea42366/backends/qualcomm/runtime/backends/QnnProfiler.cpp#L73
QnnProfile_EventData_t event_data;
for (uint32_t i = 0; i < num_events; ++i) {
error = _interface->qnn_profile_get_event_data(events_ptr[i], &event_data);
if (error != QNN_SUCCESS) {
QNN_LOG_ERROR("[profiler][%s]failed to get QNN profile event data. Backend ID %u, event[%d], error: %ld\n",
_prefix.c_str(), _interface->get_backend_id(), i, (long) QNN_GET_ERROR_CODE(error));
continue;
}
const QnnProfile_EventId_t * sub_events_ptr = nullptr;
uint32_t num_sub_events = 0;
error = _interface->qnn_profile_get_sub_events(events_ptr[i], &sub_events_ptr, &num_sub_events);
if (error != QNN_SUCCESS) {
QNN_LOG_ERROR("[profiler][%s]failed to get QNN profile sub events. Backend ID %u, event[%d], error: %ld\n",
_prefix.c_str(), _interface->get_backend_id(), i, (long) QNN_GET_ERROR_CODE(error));
continue;
}
auto duration = get_duration_string(event_data);
if (!num_sub_events) {
QNN_LOG_INFO("[profiler][%s]event[%d]: %s, %s\n", _prefix.c_str(), i, event_data.identifier,
duration.c_str());
continue;
}
QNN_LOG_INFO("[profiler][%s]event[%d]: %s, sub_count: %d, start -------------\n", _prefix.c_str(), i,
event_data.identifier, num_sub_events);
QnnProfile_EventData_t sub_event_data;
for (std::uint32_t j = 0; j < num_sub_events; ++j) {
error = _interface->qnn_profile_get_event_data(sub_events_ptr[j], &sub_event_data);
if (error != QNN_SUCCESS) {
QNN_LOG_ERROR(
"[profiler][%s]failed to get QNN profile sub event data. Backend ID %u, event[%d], sub_event[%d], "
"error: %ld\n",
_prefix.c_str(), _interface->get_backend_id(), i, j, (long) QNN_GET_ERROR_CODE(error));
continue;
}
if (sub_event_data.type != QNN_PROFILE_EVENTTYPE_NODE) {
QNN_LOG_DEBUG("[profiler][%s]sub_event[%d]%s, type %d, skipping\n", _prefix.c_str(), j,
sub_event_data.identifier, sub_event_data.type);
continue;
}
auto sub_duration = get_duration_string(sub_event_data);
QNN_LOG_INFO("[profiler][%s]sub_event[%d]: %s, %s\n", _prefix.c_str(), j, sub_event_data.identifier,
sub_duration.c_str());
}
QNN_LOG_INFO("[profiler][%s]event[%d]: %s, %s, end --------------\n", _prefix.c_str(), i, event_data.identifier,
duration.c_str());
}
QNN_LOG_INFO("[profiler][%s]print_profile_events end -----------------\n", _prefix.c_str());
}
} // namespace qnn

View File

@ -0,0 +1,45 @@
#pragma once
#include <QnnCommon.h>
#include <cstddef>
#include <cstdint>
#include <memory>
#include <string>
#include "logger.hpp"
#include "profiler.hpp"
#include "qnn-types.hpp"
namespace qnn {
// forward declaration of qnn_interface
class qnn_interface;
class qnn_event_tracer {
public:
// ref:
// https://github.com/pytorch/executorch/blob/ae3d558d5e6aa04fc52a3065399fe6a773702f52/backends/qualcomm/serialization/qc_schema.py#L53
// https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/overview.html#supported-snapdragon-devices
enum sdk_profile_level { PROFILE_OFF = 0, PROFILE_BASIC, PROFILE_DETAIL, PROFILE_OP_TRACE };
explicit qnn_event_tracer(const std::string & prefix, std::shared_ptr<qnn_interface> interface,
Qnn_BackendHandle_t backend_handle, sdk_profile_level level);
~qnn_event_tracer();
Qnn_ProfileHandle_t get_handle() const { return _handle; }
void print_profile_events();
private:
std::shared_ptr<qnn_interface> _interface;
Qnn_ProfileHandle_t _handle = nullptr;
std::string _prefix;
DISABLE_COPY(qnn_event_tracer);
DISABLE_MOVE(qnn_event_tracer);
};
using qnn_event_tracer_ptr = std::shared_ptr<qnn_event_tracer>;
} // namespace qnn

View File

@ -0,0 +1,408 @@
#include "backend-ops.hpp"
#include "common.hpp"
#include "ggml-backend-impl.h"
#include "ggml-impl.h"
#include "logger.hpp"
#include "tensor.hpp"
#include "utils.hpp"
#include <functional>
#include <memory>
#include <vector>
namespace {
qnn::ggml_backend_qnn_device_context * get_device_context(ggml_backend_dev_t dev) {
return reinterpret_cast<qnn::ggml_backend_qnn_device_context *>(dev->context);
}
qnn::qnn_buffer_interface * get_buffer_context(ggml_backend_buffer_t buffer) {
return reinterpret_cast<qnn::qnn_buffer_interface *>(buffer->context);
}
/*
* -----------------------------------------------------------------------------------------------
* qnn backend buffer object
* -----------------------------------------------------------------------------------------------
*/
void ggml_backend_qnn_buffer_free_buffer(ggml_backend_buffer_t buffer) {
auto * ctx = get_buffer_context(buffer);
delete ctx;
}
void * ggml_backend_qnn_buffer_get_base(ggml_backend_buffer_t buffer) {
auto * ctx = get_buffer_context(buffer);
return ctx->get_buffer();
}
ggml_status ggml_backend_qnn_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
GGML_UNUSED(buffer);
GGML_UNUSED(tensor);
return GGML_STATUS_SUCCESS;
}
void ggml_backend_qnn_buffer_set_tensor(ggml_backend_buffer_t buffer,
ggml_tensor * tensor,
const void * data,
size_t offset,
size_t size) {
GGML_UNUSED(buffer);
memcpy((char *) tensor->data + offset, data, size);
}
void ggml_backend_qnn_buffer_get_tensor(ggml_backend_buffer_t buffer,
const ggml_tensor * tensor,
void * data,
size_t offset,
size_t size) {
GGML_UNUSED(buffer);
memcpy(data, (const char *) tensor->data + offset, size);
}
bool ggml_backend_qnn_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) {
GGML_UNUSED(buffer);
if (ggml_backend_buffer_is_host(src->buffer)) {
memcpy(dst->data, src->data, ggml_nbytes(src));
return true;
}
return false;
}
void ggml_backend_qnn_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
auto * ctx = get_buffer_context(buffer);
memset(ctx->get_buffer(), value, ctx->get_size());
}
constexpr const ggml_backend_buffer_i ggml_backend_qnn_buffer_interface = {
/* .free_buffer = */ ggml_backend_qnn_buffer_free_buffer,
/* .get_base = */ ggml_backend_qnn_buffer_get_base,
/* .init_tensor = */ ggml_backend_qnn_buffer_init_tensor,
/* .memset_tensor = */ nullptr,
/* .set_tensor = */ ggml_backend_qnn_buffer_set_tensor,
/* .get_tensor = */ ggml_backend_qnn_buffer_get_tensor,
/* .cpy_tensor = */ ggml_backend_qnn_buffer_cpy_tensor,
/* .clear = */ ggml_backend_qnn_buffer_clear,
/* .reset = */ nullptr,
};
/*
* -----------------------------------------------------------------------------------------------
* qnn backend object
* -----------------------------------------------------------------------------------------------
*/
const char * ggml_backend_qnn_buffer_type_name(ggml_backend_buffer_type_t buft) {
auto * dev_ctx = get_device_context(buft->device);
return qnn::get_backend_name(dev_ctx->device);
}
ggml_backend_buffer_t ggml_backend_qnn_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
qnn::qnn_buffer_interface * ctx = new qnn::qnn_mem_buffer(size);
if (!ctx->is_valid()) {
return nullptr;
}
QNN_LOG_DEBUG("[%s]alloc buffer: %p, size: %ld\n", qnn::get_backend_name(get_device_context(buft->device)->device),
(void *) ctx->get_buffer(), (long) size);
return ggml_backend_buffer_init(buft, ggml_backend_qnn_buffer_interface, ctx, size);
}
size_t ggml_backend_qnn_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
GGML_UNUSED(buft);
// TODO: fix this
return 32;
}
size_t ggml_backend_qnn_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) {
GGML_UNUSED(buft);
// TODO: get the max size from device
return 1024L * 1024 * 1024;
}
bool ggml_backend_qnn_buffer_is_host(ggml_backend_buffer_type_t buft) {
// TODO: fix this
GGML_UNUSED(buft);
return true;
}
const char * ggml_backend_qnn_name(ggml_backend_t backend) {
auto * device_ctx = get_device_context(backend->device);
return device_ctx->name.c_str();
}
void ggml_backend_qnn_free(ggml_backend_t backend) {
auto * device_ctx = get_device_context(backend->device);
QNN_LOG_INFO("idx %d, name:%s\n", device_ctx->device, device_ctx->name.c_str());
auto & instance = device_ctx->instance;
if (instance) {
device_ctx->qnn_graph_cache.clear();
device_ctx->qnn_interface.reset();
instance->qnn_finalize();
instance.reset();
}
delete backend;
}
ggml_guid_t ggml_backend_qnn_guid() {
static ggml_guid guid = { 0x1a, 0x2b, 0x3c, 0x4d, 0x5e, 0x6f, 0x70, 0x81,
0x92, 0xa3, 0xb4, 0xc5, 0xd6, 0xe7, 0xf8, 0x09 };
return &guid;
}
bool ggml_backend_is_qnn(ggml_backend_t backend) {
return ggml_guid_matches(backend->guid, ggml_backend_qnn_guid());
}
bool ggml_backend_qnn_cpy_tensor_async(ggml_backend_t backend_src,
ggml_backend_t backend_dst,
const ggml_tensor * src,
ggml_tensor * dst) {
GGML_UNUSED(backend_src);
GGML_UNUSED(backend_dst);
GGML_UNUSED(src);
GGML_UNUSED(dst);
QNN_LOG_DEBUG("opy form %s to %s, src_is_qnn: %d, dst_is_qnn: %d\n", ggml_get_name(src), ggml_get_name(dst),
(int) ggml_backend_is_qnn(backend_src), (int) ggml_backend_is_qnn(backend_dst));
return false;
}
ggml_backend_buffer_type_t ggml_backend_qnn_buffer_type(ggml_backend_dev_t dev) {
static ggml_backend_buffer_type ggml_backend_qnn_buffer_types[QNN_BACKEND_COUNT];
auto * dev_ctx = get_device_context(dev);
if (!ggml_backend_qnn_buffer_types[dev_ctx->device].device) {
        ggml_backend_qnn_buffer_types[dev_ctx->device] = {
            /* .iface = */ {
                /* .get_name       = */ ggml_backend_qnn_buffer_type_name,
                /* .alloc_buffer   = */ ggml_backend_qnn_buffer_type_alloc_buffer,
                /* .get_alignment  = */ ggml_backend_qnn_buffer_type_get_alignment,
                /* .get_max_size   = */ ggml_backend_qnn_buffer_type_get_max_size,
                /* .get_alloc_size = */ nullptr,  // defaults to ggml_nbytes
                /* .is_host        = */ ggml_backend_qnn_buffer_is_host,
            },
            /* .device  = */ dev,
            /* .context = */ nullptr,
        };
} else {
GGML_ASSERT(ggml_backend_qnn_buffer_types[dev_ctx->device].device == dev);
}
return &ggml_backend_qnn_buffer_types[dev_ctx->device];
}
ggml_status ggml_backend_qnn_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
return qnn::device_compute_graph(get_device_context(backend->device), cgraph) ? GGML_STATUS_SUCCESS :
GGML_STATUS_FAILED;
}
constexpr const ggml_backend_i ggml_backend_qnn_interface = {
/* .get_name = */ ggml_backend_qnn_name,
/* .free = */ ggml_backend_qnn_free,
/* .set_tensor_async = */ nullptr,
/* .get_tensor_async = */ nullptr,
/* .cpy_tensor_async = */ ggml_backend_qnn_cpy_tensor_async,
/* .synchronize = */ nullptr,
/* .graph_plan_create = */ nullptr,
/* .graph_plan_free = */ nullptr,
/* .graph_plan_update = */ nullptr,
/* .graph_plan_compute = */ nullptr,
/* .graph_compute = */ ggml_backend_qnn_graph_compute,
/* .event_record = */ nullptr,
/* .event_wait = */ nullptr,
/* .graph_optimize = */ nullptr,
};
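/*
 * Editor's note: a minimal usage sketch (not part of the backend) of how the interface
 * table above is expected to be driven once the backend is registered. build_cgraph()
 * is a hypothetical helper; the other symbols are defined in this file or ggml-backend.h.
 *
 *   ggml_backend_t backend = ggml_backend_qnn_device_init(dev, nullptr);
 *   ggml_cgraph *  cgraph  = build_cgraph();                              // hypothetical helper
 *   ggml_status    status  = ggml_backend_qnn_graph_compute(backend, cgraph);
 *   ggml_backend_qnn_free(backend);
 */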
/*
* -----------------------------------------------------------------------------------------------
* qnn backend device object
* -----------------------------------------------------------------------------------------------
*/
const char * ggml_backend_qnn_device_get_name(ggml_backend_dev_t dev) {
auto * dev_ctx = get_device_context(dev);
return qnn::get_backend_name(dev_ctx->device);
}
const char * ggml_backend_qnn_device_get_description(ggml_backend_dev_t dev) {
auto * dev_ctx = get_device_context(dev);
return dev_ctx->description.empty() ? qnn::get_backend_desc(dev_ctx->device) : dev_ctx->description.c_str();
}
void ggml_backend_qnn_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
GGML_UNUSED(dev);
*free = common::get_system_free_memory_in_bytes();
*total = common::get_system_total_memory_in_bytes();
QNN_LOG_DEBUG("free memory: %ldMB, total memory: %ldMB\n", (*free / 1048576), (*total) / 1048576);
}
enum ggml_backend_dev_type ggml_backend_qnn_device_get_type(ggml_backend_dev_t dev) {
return qnn::get_device_caps(get_device_context(dev)->device).type;
}
void ggml_backend_qnn_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
props->name = ggml_backend_qnn_device_get_name(dev);
props->description = ggml_backend_qnn_device_get_description(dev);
props->type = ggml_backend_qnn_device_get_type(dev);
ggml_backend_qnn_device_get_memory(dev, &props->memory_free, &props->memory_total);
props->caps = {
/* async */ false,
/* host_buffer */ false,
/* buffer_from_host_ptr */ false,
/* events */ false,
};
}
ggml_backend_t ggml_backend_qnn_init_with_device_context(ggml_backend_dev_t dev, const char * extend_lib_search_path) {
if (!extend_lib_search_path) {
extend_lib_search_path = GGML_QNN_DEFAULT_LIB_SEARCH_PATH;
QNN_LOG_WARN(
"extend_lib_search_path is nullptr, will "
"use " GGML_QNN_DEFAULT_LIB_SEARCH_PATH " as default\n");
}
auto * dev_ctx = get_device_context(dev);
const auto device = dev_ctx->device;
QNN_LOG_DEBUG("device %s\n", qnn::get_backend_name(device));
QNN_LOG_DEBUG("extend_lib_search_path %s\n", extend_lib_search_path);
auto instance = std::make_shared<qnn::qnn_instance>(extend_lib_search_path, device);
if (!instance->qnn_init(nullptr)) {
QNN_LOG_WARN("failed to init qnn backend %s\n", qnn::get_backend_name(device));
return nullptr;
}
auto qnn_interface = instance->get_qnn_interface();
if (!qnn_interface) {
QNN_LOG_WARN("qnn subsystem failure\n");
return nullptr;
}
std::string device_name = qnn::get_backend_name(device);
QNN_LOG_INFO("qnn device name %s\n", device_name.c_str());
const auto & device_caps = qnn::get_device_caps(device);
dev_ctx->instance = instance;
dev_ctx->qnn_interface = qnn_interface;
dev_ctx->socinfo = instance->get_soc_info();
dev_ctx->supported_types = device_caps.supported_types;
dev_ctx->cpu_preprocess_types = device_caps.cpu_preprocess_types;
dev_ctx->max_tensor_size_in_bytes = device_caps.max_tensor_size_in_bytes;
{
char buffer[256];
snprintf(buffer, sizeof(buffer), "%s(%s)", qnn::get_chipset_desc(dev_ctx->socinfo.soc_model),
qnn::get_backend_desc(dev_ctx->device));
dev_ctx->description = buffer;
}
#ifdef GGML_HEXAGON_ENABLE_QUANTIZED_TENSORS
// TODO: remove npu from here if hardware quantization is supported
dev_ctx->enable_cpu_dequantize = device == QNN_BACKEND_CPU;
#endif
ggml_backend_t qnn_backend = new ggml_backend{
/* .guid = */ ggml_backend_qnn_guid(),
/* .iface = */ ggml_backend_qnn_interface,
/* .device = */ dev,
/* .context = */ nullptr,
};
return qnn_backend;
}
ggml_backend_t ggml_backend_qnn_device_init(ggml_backend_dev_t dev, const char * params) {
return ggml_backend_qnn_init_with_device_context(dev, params);
}
ggml_backend_buffer_type_t ggml_backend_qnn_device_get_buffer_type(ggml_backend_dev_t dev) {
return ggml_backend_qnn_buffer_type(dev);
}
ggml_backend_buffer_t ggml_backend_qnn_device_buffer_from_ptr(ggml_backend_dev_t dev,
void * ptr,
size_t size,
size_t max_tensor_size) {
// TODO
GGML_UNUSED(dev);
GGML_UNUSED(max_tensor_size);
return ggml_backend_cpu_buffer_from_ptr(ptr, size);
}
bool ggml_backend_qnn_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
// Note that this function could be called before the device context is initialized
auto * device_ctx = get_device_context(dev);
return qnn::device_supports_op(device_ctx, op);
}
bool ggml_backend_qnn_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
GGML_UNUSED(dev);
return ggml_backend_buft_is_host(buft);
}
bool ggml_backend_qnn_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
#ifdef NDEBUG
GGML_UNUSED(dev);
GGML_UNUSED(op);
#else
auto * device_ctx = get_device_context(dev);
QNN_LOG_DEBUG("[%s][%s]offload op\n", qnn::get_backend_name(device_ctx->device), ggml_op_name(op->op));
#endif
return false;
}
constexpr const ggml_backend_device_i ggml_backend_qnn_device_interface = {
/* .get_name = */ ggml_backend_qnn_device_get_name,
/* .get_description = */ ggml_backend_qnn_device_get_description,
/* .get_memory = */ ggml_backend_qnn_device_get_memory,
/* .get_type = */ ggml_backend_qnn_device_get_type,
/* .get_props = */ ggml_backend_qnn_device_get_props,
/* .init_backend = */ ggml_backend_qnn_device_init,
/* .get_buffer_type = */ ggml_backend_qnn_device_get_buffer_type,
/* .get_host_buffer_type = */ nullptr,
/* .buffer_from_host_ptr = */ ggml_backend_qnn_device_buffer_from_ptr,
/* .supports_op = */ ggml_backend_qnn_device_supports_op,
/* .supports_buft = */ ggml_backend_qnn_device_supports_buft,
/* .offload_op = */ ggml_backend_qnn_device_offload_op,
/* .event_new = */ nullptr,
/* .event_free = */ nullptr,
/* .event_synchronize = */ nullptr,
};
class qnn_device_proxy : public backend_device_proxy {
public:
explicit qnn_device_proxy(backend_index_type device) {
const auto & device_caps = qnn::get_device_caps(device);
_device_context = std::make_unique<qnn::ggml_backend_qnn_device_context>(
/* .device = */ device, // device index supplied by the registry (up to QNN_BACKEND_NPU, the last device)
/* .threads = */ 1, // TODO: fix this
/* .name = */ qnn::get_backend_name(device),
/* .supported_types = */ device_caps.supported_types);
}
const ggml_backend_device_i & get_iface() const { return ggml_backend_qnn_device_interface; }
void * get_context() { return _device_context.get(); }
private:
std::unique_ptr<qnn::ggml_backend_qnn_device_context> _device_context;
};
} // namespace
backend_device_proxy_ptr create_qnn_backend_context(backend_index_type device) {
if (device >= QNN_BACKEND_COUNT) {
QNN_LOG_ERROR("[qnn]invalid device %d\n", device);
return backend_device_proxy_ptr();
}
#ifndef GGML_QNN_ENABLE_CPU_BACKEND
if (device == QNN_BACKEND_CPU) {
/*
* skip initialization of the QNN CPU device here,
* otherwise it would prevent unsupported ops from falling back to the ggml CPU backend
*/
GGML_LOG_DEBUG("qnn backend registry skip CPU device\n");
return backend_device_proxy_ptr();
}
#endif
return std::make_unique<qnn_device_proxy>(device);
}
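/*
 * Editor's note: a brief consumption sketch, assuming the shared backend registry
 * enumerates backend_index_type values up to QNN_BACKEND_COUNT:
 *
 *   auto proxy = create_qnn_backend_context(QNN_BACKEND_NPU);
 *   if (proxy) {
 *       // the registry wraps proxy->get_iface() and proxy->get_context()
 *       // into the ggml_backend_dev_t that it exposes to ggml
 *   }
 */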

View File

@ -0,0 +1,549 @@
#include "graph.hpp"
#include <algorithm>
#include <unordered_map>
#include "event_tracer.hpp"
#include "ggml-impl.h"
#include "logger.hpp"
#include "op-config.hpp"
#include "tensor.hpp"
#ifdef GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING
# define GRAPH_PROFILE_HANDLE (_event_tracer ? _event_tracer->get_handle() : nullptr)
# define GRAPH_PROFILE_PRINT() \
if (_event_tracer) { \
_event_tracer->print_profile_events(); \
} \
(void) 0
#else
# define GRAPH_PROFILE_HANDLE (nullptr)
# define GRAPH_PROFILE_PRINT() (void) 0
#endif
namespace {
using qnn_tensor_cache_t = std::unordered_map<ggml_tensor *, qnn::qnn_tensor_ptr_t>;
int get_op_max_rank(const ggml_tensor * op) {
int max_rank = ggml_n_dims(op);
for (int i = 0; i < GGML_MAX_SRC && op->src[i]; ++i) {
max_rank = std::max(max_rank, ggml_n_dims(op->src[i]));
}
return max_rank;
}
qnn::qnn_tensor_ptr_t create_tensor_with_cache(ggml_tensor * tensor, qnn::ggml_qnn_tensor::tensor_type_t type, int rank,
ggml_type override_data_type, backend_index_type device,
Qnn_GraphHandle_t graph_handle,
std::shared_ptr<qnn::qnn_instance> qnn_instance,
qnn_tensor_cache_t & tensor_cache) {
GGML_ASSERT(tensor);
if (tensor_cache.count(tensor)) {
return tensor_cache[tensor];
}
QNN_LOG_DEBUG("[%s]create_tensor_with_cache, data_type: %s, override_data_type: %s\n",
qnn::get_backend_name(device), ggml_type_name(tensor->type), ggml_type_name(override_data_type));
auto data_type = override_data_type != GGML_TYPE_COUNT ? override_data_type : tensor->type;
// We've observed that tensors sharing the same name but different op types can end up in the same graph,
// which causes the graph build to fail. To avoid this, we append the op type to the tensor name.
char tensor_name[256];
snprintf(tensor_name, sizeof(tensor_name), "%s_%s", ggml_get_name(tensor), ggml_op_desc(tensor));
auto qnn_tensor = std::make_shared<qnn::ggml_qnn_tensor>(type, std::string(tensor_name), tensor->ne, data_type,
rank, device, graph_handle, qnn_instance);
tensor_cache[tensor] = qnn_tensor;
return qnn_tensor;
}
qnn::qnn_tensor_array_t create_tensors_with_cache(const qnn::ggml_tensor_array_t & ggml_tensors,
qnn::ggml_qnn_tensor::tensor_type_t type, int rank,
ggml_type override_data_type, backend_index_type device,
Qnn_GraphHandle_t graph_handle,
std::shared_ptr<qnn::qnn_instance> qnn_instance,
qnn_tensor_cache_t & tensor_cache) {
qnn::qnn_tensor_array_t tensors;
for (auto * tensor : ggml_tensors) {
tensors.push_back(create_tensor_with_cache(tensor, type, rank, override_data_type, device, graph_handle,
qnn_instance, tensor_cache));
}
return tensors;
}
qnn::qnn_op_config_ptr_t create_operation_from_op_tensor(ggml_tensor * dst, const std::string & name, int rank,
backend_index_type device, Qnn_GraphHandle_t graph_handle,
std::shared_ptr<qnn::qnn_instance> qnn_instance,
qnn_tensor_cache_t & tensor_cache) {
auto operation = qnn::create_op(dst, name, qnn_instance);
// input tensors
qnn::qnn_tensor_array_t input_qnn_tensors;
for (size_t i = 0; i < GGML_MAX_SRC && dst->src[i]; ++i) {
auto * src = dst->src[i];
auto input_qnn_tensor = create_tensor_with_cache(src, qnn::ggml_qnn_tensor::INTERMEDIATE, rank, GGML_TYPE_COUNT,
device, graph_handle, qnn_instance, tensor_cache);
input_qnn_tensors.push_back(input_qnn_tensor);
}
operation->set_input_tensors(input_qnn_tensors);
// output tensor
qnn::qnn_tensor_array_t output_qnn_tensors =
create_tensors_with_cache({ dst }, qnn::ggml_qnn_tensor::INTERMEDIATE, rank, GGML_TYPE_COUNT, device,
graph_handle, qnn_instance, tensor_cache);
operation->set_output_tensors(output_qnn_tensors);
// initialize operation
if (!operation->initialize_op_nodes(device, graph_handle)) {
QNN_LOG_ERROR("[%s][%s]initialize_op_nodes failed\n", qnn::get_backend_name(device), name.c_str());
return nullptr;
}
return operation;
}
/**
* @brief Extracts input and output tensors from a computational graph.
*
* This function identifies the input and output tensors of a computational graph by analyzing the connectivity between
* tensor nodes. It does this by iterating over each node in the graph, using a connectivity map that associates every
* tensor with its number of incoming connections (in_degree), outgoing connections (out_degree), and an insertion index
* that preserves order. The insertion index is used later to sort the tensors in their original discovery order.
*
* TODO: this algorithm is not perfect and may not work for all cases. It assumes that the tensors are
* connected in a way that allows for unambiguous categorization.
*/
int get_io_tensors_from_graph(const ggml_cgraph * cgraph, qnn::ggml_tensor_array_t & inputs,
qnn::ggml_tensor_array_t & outputs) {
struct _tensor_connectivity_info {
size_t in_degree = 0;
size_t out_degree = 0;
size_t insert_index = 0;
};
using ggml_tensor_connectivity_map_t = std::unordered_map<ggml_tensor *, _tensor_connectivity_info>;
ggml_tensor_connectivity_map_t connectivity_map;
int rank = 0;
for (int i = 0; i < cgraph->n_nodes; i++) {
ggml_tensor * dst = cgraph->nodes[i];
if (ggml_is_empty(dst)) {
continue;
}
if (dst->op == GGML_OP_NONE || dst->op == GGML_OP_VIEW || dst->op == GGML_OP_PERMUTE) {
// TODO: remove GGML_OP_VIEW after view op is supported
QNN_LOG_DEBUG("node[%d]%s(%s), type: %s, skipped\n", i, ggml_get_name(dst), ggml_op_desc(dst),
ggml_type_name(dst->type));
continue;
}
QNN_LOG_DEBUG("node[%d]%s(%s), type: %s\n", i, ggml_get_name(dst), ggml_op_desc(dst),
ggml_type_name(dst->type));
rank = std::max(rank, ggml_n_dims(dst));
if (connectivity_map.count(dst) == 0) {
connectivity_map[dst] = {
1, // in-degree, at least 1
0,
connectivity_map.size(),
};
} else {
++(connectivity_map[dst].in_degree);
}
for (size_t j = 0; j < GGML_MAX_SRC && dst->src[j]; ++j) {
auto * src = dst->src[j];
rank = std::max(rank, ggml_n_dims(src));
QNN_LOG_DEBUG("node[%d]: src[%d]: %s(%s), type: %s\n", i, (int) j, ggml_get_name(src), ggml_op_desc(src),
ggml_type_name(src->type));
if (connectivity_map.count(src) == 0) {
connectivity_map[src] = {
0,
1, // out-degree, at least 1
connectivity_map.size(),
};
} else {
++(connectivity_map[src].out_degree);
}
}
}
for (const auto & kv : connectivity_map) {
if (kv.second.in_degree == 0) {
inputs.push_back(kv.first);
}
if (kv.second.out_degree == 0) {
outputs.push_back(kv.first);
}
}
std::sort(inputs.begin(), inputs.end(), [&connectivity_map](ggml_tensor * lhs, ggml_tensor * rhs) {
return connectivity_map[lhs].insert_index < connectivity_map[rhs].insert_index;
});
std::sort(outputs.begin(), outputs.end(), [&connectivity_map](ggml_tensor * lhs, ggml_tensor * rhs) {
return connectivity_map[lhs].insert_index < connectivity_map[rhs].insert_index;
});
return rank;
}
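/*
 * Editor's note: a worked example (illustrative only) of the classification above.
 * For a two-node graph t2 = MUL_MAT(w, x); t3 = ADD(t2, b), the connectivity map is:
 *
 *   w : in_degree 0, out_degree 1  -> input
 *   x : in_degree 0, out_degree 1  -> input
 *   b : in_degree 0, out_degree 1  -> input
 *   t2: in_degree 1, out_degree 1  -> intermediate (appears in neither list)
 *   t3: in_degree 1, out_degree 0  -> output
 *
 * and the returned rank is the maximum ggml_n_dims() over all of these tensors.
 */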
/*
* for src0_F32, src1_F32, dst_F32 -> GGML_TYPE_COUNT
* for src0_F16, src1_F16, dst_F16 -> GGML_TYPE_COUNT
* for src0_F16, src1_F32, dst_F32 -> GGML_TYPE_F32
* for src0_q4, src1_F32, dst_F32 -> GGML_TYPE_F32
* for src0_q4, src1_F16, dst_F32 -> GGML_TYPE_F32
*/
ggml_type get_override_data_type(const qnn::ggml_tensor_array_t & inputs, const qnn::ggml_tensor_array_t & outputs) {
GGML_ASSERT(!inputs.empty());
ggml_type override_data_type = inputs.front()->type;
bool is_same_data_type = true;
for (auto * tensor : inputs) {
QNN_LOG_DEBUG("input_tensor: %s(%s), override_data_type(%s)\n", ggml_get_name(tensor),
ggml_type_name(tensor->type), ggml_type_name(override_data_type));
is_same_data_type = is_same_data_type && tensor->type == override_data_type;
override_data_type = std::min(override_data_type, tensor->type);
}
for (auto * tensor : outputs) {
QNN_LOG_DEBUG("output_tensor: %s(%s), override_data_type(%s)\n", ggml_get_name(tensor),
ggml_type_name(tensor->type), ggml_type_name(override_data_type));
is_same_data_type = is_same_data_type && tensor->type == override_data_type;
override_data_type = std::min(override_data_type, tensor->type);
}
return is_same_data_type ? GGML_TYPE_COUNT : override_data_type;
}
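/*
 * Editor's note: a worked example of the mapping table above. With inputs
 * {src0: F16, src1: F32} and output {dst: F32} the types differ, so the function
 * returns the smallest enum value seen, GGML_TYPE_F32, and the graph is built with
 * every tensor overridden to F32. When all tensors already share one type it returns
 * GGML_TYPE_COUNT, meaning "no override needed".
 */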
static const QnnHtpGraph_CustomConfig_t kDefaultHvxConfig = []() {
QnnHtpGraph_CustomConfig_t hvx_config = QNN_HTP_GRAPH_CUSTOM_CONFIG_INIT;
hvx_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_NUM_HVX_THREADS;
hvx_config.numHvxThreads = 8;
return hvx_config;
}();
static const QnnHtpGraph_CustomConfig_t kDefaultDlbcConfig = []() {
QnnHtpGraph_CustomConfig_t dlbc_config = QNN_HTP_GRAPH_CUSTOM_CONFIG_INIT;
dlbc_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION;
dlbc_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_DLBC;
dlbc_config.optimizationOption.floatValue = 1.0; // set to 0.0 to turn off DLBC
return dlbc_config;
}();
/*
* 1 = Faster preparation time, less optimal graph
* 2 = Longer preparation time, more optimal graph
* 3 = Longest preparation time, most likely even more optimal graph:
* QNN_HTP_DEVICE_CONFIG_OPTION_SOC configuration will be taken into account when possible, details see HTP Backend Specific Page
*/
static const QnnHtpGraph_CustomConfig_t kDefaultOptConfig = []() {
QnnHtpGraph_CustomConfig_t opt_config = QNN_HTP_GRAPH_CUSTOM_CONFIG_INIT;
opt_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION;
opt_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG;
#ifndef NDEBUG
opt_config.optimizationOption.floatValue = 3;
#else
opt_config.optimizationOption.floatValue = 1;
#endif
return opt_config;
}();
static const QnnHtpGraph_CustomConfig_t kHtpPrecisionConfigF16 = []() {
QnnHtpGraph_CustomConfig_t precision_config = QNN_HTP_GRAPH_CUSTOM_CONFIG_INIT;
precision_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_PRECISION;
precision_config.precision = QNN_PRECISION_FLOAT16;
return precision_config;
}();
constexpr QnnHtpGraph_CustomConfig_t make_vtcm_config(size_t vtcm_size_in_mb) {
QnnHtpGraph_CustomConfig_t vtcm_config = QNN_HTP_GRAPH_CUSTOM_CONFIG_INIT;
vtcm_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE;
vtcm_config.vtcmSizeInMB = (uint32_t) vtcm_size_in_mb;
return vtcm_config;
}
constexpr QnnGraph_Config_t make_graph_config(const QnnHtpGraph_CustomConfig_t * custom_config) {
QnnGraph_Config_t graph_config = QNN_GRAPH_CONFIG_INIT;
graph_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM;
graph_config.customConfig = const_cast<QnnHtpGraph_CustomConfig_t *>(custom_config);
return graph_config;
}
} // namespace
namespace qnn {
ggml_type qnn_graph::get_graph_key_from_cgraph(const ggml_cgraph * cgraph, std::string & output) {
if (cgraph->n_nodes == 0) {
QNN_LOG_DEBUG("empty cgraph\n");
return GGML_TYPE_COUNT;
}
ggml_type override_type = GGML_TYPE_COUNT;
{
// TODO: can we find a better approach to get the override_type here?
// though it is O(n) + O(m log m) complexity, our graphs are small, so it is fine
ggml_tensor_array_t inputs;
ggml_tensor_array_t outputs;
get_io_tensors_from_graph(cgraph, inputs, outputs);
if (!inputs.empty() && !outputs.empty()) {
override_type = get_override_data_type(inputs, outputs);
QNN_LOG_DEBUG("get_graph_key, override_type: %s\n", ggml_type_name(override_type));
} else {
QNN_LOG_DEBUG("get_graph_key, no input or output tensors\n");
}
}
ggml_type min_op_type = GGML_TYPE_COUNT;
{
bool is_start = true;
for (int i = 0; i < cgraph->n_nodes; ++i) {
auto * op = cgraph->nodes[i];
if (ggml_is_empty(op)) {
QNN_LOG_DEBUG("empty op in graph, skipping\n");
continue;
}
if (op->op == GGML_OP_NONE || op->op == GGML_OP_VIEW || op->op == GGML_OP_PERMUTE) {
QNN_LOG_DEBUG("%s in graph, skipping\n", ggml_op_desc(op));
continue;
}
min_op_type = std::min(min_op_type, op->type);
if (is_start) {
qnn::get_qnn_op_desc(op, is_start, override_type, output);
is_start = false;
} else {
output += '#';
qnn::get_qnn_op_desc(op, is_start, override_type, output);
}
}
}
if (cgraph->n_nodes > 1) {
auto * last_op = cgraph->nodes[cgraph->n_nodes - 1];
output += qnn::get_ggml_type_name(last_op->type);
output += '_';
qnn::append_tensor_shape_and_type(last_op, output);
}
return min_op_type;
}
qnn_graph::qnn_graph(const std::string & graph_name, backend_index_type device, qnn_instance_ptr qnn_instance,
htp_precision precision, size_t vtcm_size_in_mb) :
_graph_name(graph_name),
_device(device),
_qnn_instance(qnn_instance) {
QNN_LOG_DEBUG("[%s][%s]creating\n", get_backend_name(device), graph_name.c_str());
auto qnn_interface = qnn_instance->get_qnn_interface();
auto qnn_context = qnn_instance->get_qnn_context_handle();
Qnn_ErrorHandle_t error = QNN_SUCCESS;
Qnn_GraphHandle_t graph_handle = nullptr;
if (device == QNN_BACKEND_NPU) {
// TODO: fix graph config here for NPU
std::vector<const QnnGraph_Config_t *> graph_configs;
auto hvx_config = make_graph_config(&kDefaultHvxConfig);
graph_configs.push_back(&hvx_config);
auto dlbc_config = make_graph_config(&kDefaultDlbcConfig);
graph_configs.push_back(&dlbc_config);
auto opt_config = make_graph_config(&kDefaultOptConfig);
graph_configs.push_back(&opt_config);
auto vtcm_sub_config = make_vtcm_config(vtcm_size_in_mb);
auto vtcm_config = make_graph_config(&vtcm_sub_config);
graph_configs.push_back(&vtcm_config);
if (precision == qnn_graph::kHtpFp16) {
auto precision_config = make_graph_config(&kHtpPrecisionConfigF16);
graph_configs.push_back(&precision_config);
QNN_LOG_DEBUG("[%s][%s]set precision to F16\n", get_backend_name(device), graph_name.c_str());
}
graph_configs.push_back(nullptr);
error = qnn_interface->qnn_graph_create(qnn_context, graph_name.c_str(), graph_configs.data(), &graph_handle);
} else {
error = qnn_interface->qnn_graph_create(qnn_context, graph_name.c_str(), nullptr, &graph_handle);
}
if (error != QNN_SUCCESS) {
QNN_LOG_ERROR("[%s][%s]failed to create qnn graph, error: %s\n", get_backend_name(device), graph_name.c_str(),
get_qnn_error_string(error));
return;
}
#ifdef GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING
if (device == QNN_BACKEND_NPU) {
_event_tracer = std::make_shared<qnn_event_tracer>(
graph_name, qnn_interface, qnn_instance->get_qnn_backend_handle(), qnn_event_tracer::PROFILE_OP_TRACE);
}
#endif
_graph_handle = graph_handle;
_qnn_interface = qnn_interface;
QNN_LOG_DEBUG("[%s][%s]create succeed\n", get_backend_name(device), graph_name.c_str());
}
qnn_graph::~qnn_graph() {
QNN_LOG_DEBUG("[%s][%s]destroy\n", get_backend_name(_device), _graph_name.c_str());
}
bool qnn_graph::build_graph_from_ggml_graph(const ggml_cgraph * cgraph) {
QNN_LOG_DEBUG("[%s][%s]build start\n", get_backend_name(_device), _graph_name.c_str());
ggml_tensor_array_t inputs;
ggml_tensor_array_t outputs;
int rank = get_io_tensors_from_graph(cgraph, inputs, outputs);
QNN_LOG_DEBUG("[%s][%s]rank: %d, graph_nodes: %d, input_set: %d, output_set: %d\n", get_backend_name(_device),
_graph_name.c_str(), rank, cgraph->n_nodes, int(inputs.size()), int(outputs.size()));
{
static_assert(
GGML_TYPE_COUNT > GGML_TYPE_Q8_0 && GGML_TYPE_Q8_0 > GGML_TYPE_F16 && GGML_TYPE_F16 > GGML_TYPE_F32,
"GGML_TYPE enum order is not correct");
SCOPED_PERFORMANCE_TRACKER("[%s][%s]build_graph_from_ggml_graph", get_backend_name(_device),
_graph_name.c_str());
auto override_data_type = get_override_data_type(inputs, outputs);
if (override_data_type != GGML_TYPE_COUNT) {
QNN_LOG_DEBUG("[%s][%s]set override_data_type: %s\n", get_backend_name(_device), _graph_name.c_str(),
ggml_type_name(override_data_type));
}
qnn_tensor_cache_t tensor_cache;
auto input_tensors = create_tensors_with_cache(inputs, ggml_qnn_tensor::INPUT, rank, override_data_type,
_device, _graph_handle, _qnn_instance, tensor_cache);
auto output_tensors = create_tensors_with_cache(outputs, ggml_qnn_tensor::OUTPUT, rank, GGML_TYPE_COUNT,
_device, _graph_handle, _qnn_instance, tensor_cache);
qnn_op_config_array_t operations;
for (int i = 0; i < cgraph->n_nodes; i++) {
ggml_tensor * dst = cgraph->nodes[i];
if (ggml_is_empty(dst)) {
continue;
}
if (dst->op == GGML_OP_NONE || dst->op == GGML_OP_VIEW || dst->op == GGML_OP_PERMUTE) {
// TODO: remove GGML_OP_VIEW after view op is supported
continue;
}
#ifndef NDEBUG
{
std::string op_desc;
get_qnn_op_desc(dst, true, GGML_TYPE_COUNT, op_desc);
QNN_LOG_DEBUG("[%s]create op(%s) with qnn op(%s)\n", get_backend_name(_device), op_desc.c_str(),
get_qnn_op_name(dst));
}
#endif
auto operation = create_operation_from_op_tensor(dst, dst->name, rank, _device, _graph_handle,
_qnn_instance, tensor_cache); // TODO: fix op name
operations.push_back(operation);
}
_tensor_inputs = std::move(input_tensors);
_tensor_outputs = std::move(output_tensors);
_operations = std::move(operations);
if (!finalize()) {
return false;
}
}
QNN_LOG_DEBUG("[%s][%s]build succeed\n", get_backend_name(_device), _graph_name.c_str());
return true;
}
bool qnn_graph::execute(const ggml_cgraph * cgraph, std::shared_ptr<qnn_convert_context_t> convert_context) {
ggml_tensor_array_t inputs;
ggml_tensor_array_t outputs;
{
SCOPED_PERFORMANCE_TRACKER("[%s][%s]get_io_tensors_from_graph", get_backend_name(_device), _graph_name.c_str());
#ifdef NDEBUG
get_io_tensors_from_graph(cgraph, inputs, outputs);
#else
int rank = get_io_tensors_from_graph(cgraph, inputs, outputs);
QNN_LOG_DEBUG("[%s]rank: %d, input_set: %d, output_set: %d\n", get_backend_name(_device), rank,
int(inputs.size()), int(outputs.size()));
#endif
}
{
SCOPED_PERFORMANCE_TRACKER("[%s][%s]bind_tensors", get_backend_name(_device), _graph_name.c_str());
auto override_data_type = get_override_data_type(inputs, outputs);
if (override_data_type != GGML_TYPE_COUNT) {
QNN_LOG_DEBUG("[%s][%s]override_data_type: %s\n", get_backend_name(_device), _graph_name.c_str(),
ggml_type_name(override_data_type));
auto buffers = convert(convert_context, inputs, override_data_type);
if (!qnn::bind_tensors_with_custom_buffers(inputs, buffers, _tensor_inputs, _qnn_tensor_inputs)) {
QNN_LOG_ERROR("[%s][%s]bind input tensors failed\n", get_backend_name(_device), _graph_name.c_str());
return false;
}
} else {
if (!qnn::bind_tensors(inputs, _tensor_inputs, _qnn_tensor_inputs)) {
QNN_LOG_ERROR("[%s][%s]bind input tensors failed\n", get_backend_name(_device), _graph_name.c_str());
return false;
}
}
if (!qnn::bind_tensors(outputs, _tensor_outputs, _qnn_tensor_outputs)) {
QNN_LOG_ERROR("[%s][%s]bind output tensors failed\n", get_backend_name(_device), _graph_name.c_str());
return false;
}
}
{
SCOPED_PERFORMANCE_TRACKER("[%s][%s]execute", get_backend_name(_device), _graph_name.c_str());
auto & qnn_tensor_inputs = _qnn_tensor_inputs;
auto & qnn_tensor_outputs = _qnn_tensor_outputs;
auto error = _qnn_interface->qnn_graph_execute(_graph_handle, qnn_tensor_inputs.data(),
qnn_tensor_inputs.size(), qnn_tensor_outputs.data(),
qnn_tensor_outputs.size(), GRAPH_PROFILE_HANDLE, nullptr);
unbind_tensors(_tensor_inputs);
unbind_tensors(_tensor_outputs);
if (error != QNN_SUCCESS) {
if (_device == QNN_BACKEND_NPU && error == QNN_COMMON_ERROR_SYSTEM_COMMUNICATION) {
QNN_LOG_WARN("[%s][%s][execute]NPU crashed. SSR detected. Caused QNN graph execute error.\n",
get_backend_name(_device), _graph_name.c_str());
} else {
QNN_LOG_ERROR("[%s][%s][execute]error: %s\n", get_backend_name(_device), _graph_name.c_str(),
get_qnn_error_string(error));
}
return false;
}
QNN_LOG_DEBUG("[%s][%s]execute succeed\n", get_backend_name(_device), _graph_name.c_str());
}
GRAPH_PROFILE_PRINT();
return true;
}
bool qnn_graph::finalize() {
SCOPED_PERFORMANCE_TRACKER("[%s][%s]finalize", get_backend_name(_device), _graph_name.c_str());
if (!qnn::add_op_to_graph(_graph_handle, _operations)) {
QNN_LOG_ERROR("[%s]add nodes failed\n", _graph_name.c_str());
return false;
}
auto error = _qnn_interface->qnn_graph_finalize(_graph_handle, GRAPH_PROFILE_HANDLE, nullptr);
if (error != QNN_SUCCESS) {
QNN_LOG_ERROR("[%s][%s]qnn_graph_finalize.error: %s\n", get_backend_name(_device), _graph_name.c_str(),
get_qnn_error_string(error));
return false;
}
QNN_LOG_DEBUG("[%s][%s]finalize succeed\n", get_backend_name(_device), _graph_name.c_str());
return true;
}
} // namespace qnn

View File

@ -0,0 +1,93 @@
#pragma once
#include <memory>
#include <string>
#include <vector>
#include "convert.hpp"
#include "event_tracer.hpp"
#include "ggml-qnn.h"
#include "op-config.hpp"
#include "qnn-lib.hpp"
namespace qnn {
/**
* @class qnn_graph
* @brief Manages a QNN graph, converting a GGML graph to QNN format and handling its execution.
*
* This class is responsible for building a QNN graph from a given GGML graph,
* determining its input/output tensors, finalizing the configuration, and
* executing the graph on the specified backend device.
*/
class qnn_graph {
public:
enum htp_precision {
kHtpDefault = 0,
kHtpFp16,
};
/**
* @brief Generates a unique key for a given computation graph (cgraph).
*
* This key is used to cache the graph, enabling efficient reuse of previously
* compiled graphs. The key is constructed by concatenating the descriptions
* of the operations and their associated tensor dimensions within the graph.
*
* Example key format: "MUL_MATf32_2048x8192q4_K_2048x2f32#MUL(SILU,MUL_MAT)#MUL_MAT(NONE,MUL)#ADD(MUL_MAT,ADD)f32_2048x2f32"
*
* @param cgraph The computation graph for which the key is generated.
* @param output The string where the generated key will be stored.
* @return The smallest ggml_type enum value (i.e. the widest element type) among the ops in the graph.
*
* TODO: Improve the key generation logic to handle more complex graph structures and edge cases.
*/
static ggml_type get_graph_key_from_cgraph(const ggml_cgraph * cgraph, std::string & output);
explicit qnn_graph(const std::string & graph_name, backend_index_type device, qnn_instance_ptr qnn_instance,
htp_precision precision, size_t vtcm_size_in_mb);
~qnn_graph();
bool build_graph_from_ggml_graph(const ggml_cgraph * cgraph);
bool execute(const ggml_cgraph * cgraph, std::shared_ptr<qnn_convert_context_t> convert_context);
bool is_valid() const { return _graph_handle != nullptr; }
Qnn_GraphHandle_t get_graph_handler() const { return _graph_handle; }
qnn_instance_ptr get_qnn_instance() { return _qnn_instance; }
const std::string & get_name() const { return _graph_name; }
backend_index_type get_device() const { return _device; }
private:
bool finalize();
const std::string _graph_name;
const backend_index_type _device;
Qnn_GraphHandle_t _graph_handle = nullptr;
qnn_instance_ptr _qnn_instance;
qnn_interface_ptr _qnn_interface;
qnn_op_config_array_t _operations;
qnn_tensor_array_t _tensor_inputs;
qnn_tensor_array_t _tensor_outputs;
std::vector<Qnn_Tensor_t> _qnn_tensor_inputs;
std::vector<Qnn_Tensor_t> _qnn_tensor_outputs;
#ifdef GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING
// profiler
qnn_event_tracer_ptr _event_tracer;
#endif
DISABLE_COPY(qnn_graph);
DISABLE_MOVE(qnn_graph);
};
using qnn_graph_ptr_t = std::shared_ptr<qnn_graph>;
} // namespace qnn
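/*
 * Editor's note: a minimal lifecycle sketch, assuming `instance`, `cgraph` and
 * `convert_context` are supplied by the device context that caches these graphs:
 *
 *   auto graph = std::make_shared<qnn::qnn_graph>("graph_key", QNN_BACKEND_NPU, instance,
 *                                                 qnn::qnn_graph::kHtpFp16,
 *                                                 8);  // vtcm_size_in_mb
 *   if (graph->is_valid() && graph->build_graph_from_ggml_graph(cgraph)) {
 *       graph->execute(cgraph, convert_context);
 *   }
 */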

View File

@ -0,0 +1,50 @@
#include "logger.hpp"
#ifndef NDEBUG
# include <mutex>
# include "QnnInterface.h"
# include "QnnTypes.h"
# include "System/QnnSystemInterface.h"
void qnn::sdk_logcallback(const char * fmt, QnnLog_Level_t level, uint64_t /*timestamp*/, va_list argp) {
static std::mutex log_mutex;
static char s_ggml_qnn_logbuf[4096];
char log_level_desc;
switch (level) {
case QNN_LOG_LEVEL_ERROR:
log_level_desc = 'E';
break;
case QNN_LOG_LEVEL_WARN:
log_level_desc = 'W';
break;
case QNN_LOG_LEVEL_INFO:
log_level_desc = 'I';
break;
case QNN_LOG_LEVEL_DEBUG:
log_level_desc = 'D';
break;
case QNN_LOG_LEVEL_VERBOSE:
log_level_desc = 'V';
break;
default:
log_level_desc = 'U';
break;
}
{
std::lock_guard<std::mutex> lock(log_mutex);
int size = vsnprintf(s_ggml_qnn_logbuf, sizeof(s_ggml_qnn_logbuf), fmt, argp);
if (size >= (int) sizeof(s_ggml_qnn_logbuf)) {
size = (int) sizeof(s_ggml_qnn_logbuf) - 1; // vsnprintf reports the untruncated length on truncation
}
if (size > 0 && s_ggml_qnn_logbuf[size - 1] != '\n') {
QNN_LOG_INFO("[%c]%s\n", log_level_desc, s_ggml_qnn_logbuf);
} else {
QNN_LOG_INFO("[%c]%s", log_level_desc, s_ggml_qnn_logbuf);
}
}
}
#else
void qnn::sdk_logcallback(const char *, QnnLog_Level_t, uint64_t, va_list) {}
#endif

View File

@ -0,0 +1,22 @@
#pragma once
#include <QnnLog.h>
#include <cstdint>
#include "ggml-impl.h"
#include "ggml.h"
namespace qnn {
void sdk_logcallback(const char * fmt, QnnLog_Level_t level, uint64_t timestamp, va_list argp);
} // namespace qnn
#define QNN_LOG_ERROR(...) (GGML_LOG_ERROR(__VA_ARGS__))
#define QNN_LOG_WARN(...) (GGML_LOG_WARN(__VA_ARGS__))
#define QNN_LOG_INFO(...) (GGML_LOG_INFO(__VA_ARGS__))
#ifndef NDEBUG
# define QNN_LOG_DEBUG(...) (GGML_LOG_DEBUG(__VA_ARGS__))
#else
# define QNN_LOG_DEBUG(...)
#endif

View File

@ -0,0 +1,152 @@
#pragma once
#include <memory>
#include <vector>
#include "common.hpp"
#include "ggml-qnn.h"
#include "qnn-types.hpp"
#include "tensor.hpp"
namespace qnn {
/**
* @class ggml_qnn_op_config
* @brief Abstract base class for configuring QNN operations.
*
* This class provides an interface for creating and managing tensors,
* adding operations to a graph, and binding/unbinding input and output tensors.
*/
class ggml_qnn_op_config {
public:
virtual ~ggml_qnn_op_config() {}
/**
* @brief Sets custom input tensors for the operation. This method should be called before `initialize_op_nodes`.
* If no custom input tensors are provided, the input tensors will be automatically created from the input ggml
* tensors.
*
* This pure virtual function must be overridden by derived classes to set
* the input tensors for the operation. The function takes a reference to a
* vector of qnn_tensor_ptr_t objects, which represent the input tensors.
*
* @param tensor_inputs A reference to a vector of qnn_tensor_ptr_t objects representing the input tensors.
*/
virtual void set_input_tensors(qnn::qnn_tensor_array_t & tensor_inputs) = 0;
virtual void set_input_tensors(qnn::qnn_tensor_array_t && tensor_inputs) = 0;
/**
* @brief Sets custom output tensors for the operation. This method should be called before `initialize_op_nodes`.
* If no custom output tensors are provided, the output tensors will be automatically created from the output ggml
* tensors.
*
* This pure virtual function must be overridden by derived classes to set
* the output tensors for the operation. The function takes a reference to a
* vector of qnn_tensor_ptr_t objects, which represent the output tensors.
*
* @param tensor_outputs A reference to a vector of qnn_tensor_ptr_t objects representing the output tensors.
*/
virtual void set_output_tensors(qnn::qnn_tensor_array_t & tensor_outputs) = 0;
virtual void set_output_tensors(qnn::qnn_tensor_array_t && tensor_outputs) = 0;
/**
* @brief Creates tensors and internal nodes for constructing the calculation graph.
*
* This pure virtual function is responsible for creating tensors on the given
* backend device, associating them with the provided graph handle, and creating
* the internal nodes necessary for constructing the calculation graph. It takes
* the backend device and the graph handle as parameters.
*
* @param device The backend device on which the tensors and nodes are created.
* @param graph_handle The QNN graph handle that the tensors and nodes are attached to.
* @return true if tensors and nodes are successfully created, false otherwise.
*/
virtual bool initialize_op_nodes(backend_index_type device, Qnn_GraphHandle_t graph_handle) = 0;
/**
* @brief Pure virtual function to retrieve the input tensors.
*
* This function must be overridden by derived classes to provide the specific implementation
* for retrieving the input tensors used in QNN operations.
*
* @return A reference to a vector of qnn_tensor_ptr_t objects representing the input tensors.
*/
virtual qnn_tensor_array_t & get_input_tensors() = 0;
/**
* @brief Pure virtual function to retrieve the output tensors of a QNN operation.
*
* This function must be overridden by any derived class to provide access to the
* output tensors of the operation. The function returns a reference to a vector of
* qnn_tensor_ptr_t objects, which represent the output tensors.
*
* @return A reference to a vector of qnn_tensor_ptr_t objects representing the output tensors.
*/
virtual qnn_tensor_array_t & get_output_tensors() = 0;
/**
* @brief Adds an operation to the given graph.
*
* This pure virtual function must be implemented by derived classes to add
* a specific operation to the provided graph handle.
*
* This function will be called after `initialize_op_nodes` during initialization.
*
* @param graph_handle The handle to the graph where the operation will be added.
* @return true if the operation was successfully added to the graph, false otherwise.
*/
virtual bool add_op_to_graph(Qnn_GraphHandle_t graph_handle) = 0;
/**
* @brief Binds the input tensors to the operation.
*
* This pure virtual function must be implemented by derived classes to bind
* the provided input tensors to the operation. The function takes a constant
* reference to a ggml_tensor_array_t object, which contains the input tensors
* to be bound.
*
* @param tensor_inputs A constant reference to a ggml_tensor_array_t object
* containing the input tensors.
* @return true if the input tensors were successfully bound, false otherwise.
*/
virtual bool bind_input_tensors(const ggml_tensor_array_t & tensor_inputs) = 0;
/**
* @brief Binds the output tensors to the given tensor array.
*
* This pure virtual function must be implemented by derived classes to bind
* the output tensors to the provided array of tensors. The function is expected
* to establish the necessary connections or mappings between the output tensors
* and the elements of the given tensor array.
*
* @param tensor_outputs A constant reference to an array of ggml tensors that
* represent the output tensors to be bound.
* @return true if the binding is successful, false otherwise.
*/
virtual bool bind_output_tensors(const ggml_tensor_array_t & tensor_outputs) = 0;
/**
* @brief Unbinds the input tensors from the operation.
*
* This pure virtual function is intended to be overridden by derived classes
* to implement the logic for unbinding or detaching input tensors that were
* previously bound to the operation. This is typically used to release resources
* or reset the state of the operation.
*/
virtual void unbind_input_tensors() = 0;
/**
* @brief Unbinds the output tensors.
*
* This pure virtual function is responsible for unbinding or detaching
* the output tensors from their current bindings. Implementations of this
* function should ensure that any resources or references held by the
* output tensors are properly released or reset.
*/
virtual void unbind_output_tensors() = 0;
};
using qnn_op_config_ptr_t = std::shared_ptr<ggml_qnn_op_config>;
using qnn_op_config_array_t = std::vector<qnn_op_config_ptr_t>;
} // namespace qnn

View File

@ -0,0 +1,489 @@
#include "op-config-impl.hpp"
namespace {
using op_constructor_t = std::shared_ptr<qnn::ggml_qnn_op_config> (*)(const ggml_tensor *,
const std::string &,
std::shared_ptr<qnn::qnn_instance>);
using op_description_generator_t = void (*)(const ggml_tensor * op,
bool append_dimensions,
ggml_type override_data_type,
std::string & output);
void append_tensor_shape_and_type_impl(const ggml_tensor * tensor, ggml_type override_data_type, std::string & output) {
char buffer[256] = {};
const auto * type_name = qnn::get_ggml_type_name(std::min(tensor->type, override_data_type));
int len = 0;
switch (ggml_n_dims(tensor)) {
case 1:
len = snprintf(buffer, sizeof(buffer), "%ld%s", (long) tensor->ne[0], type_name);
break;
case 2:
len = snprintf(buffer, sizeof(buffer), "%ldx%ld%s", (long) tensor->ne[0], (long) tensor->ne[1], type_name);
break;
case 3:
len = snprintf(buffer, sizeof(buffer), "%ldx%ldx%ld%s", (long) tensor->ne[0], (long) tensor->ne[1],
(long) tensor->ne[2], type_name);
break;
case 4:
default:
len = snprintf(buffer, sizeof(buffer), "%ldx%ldx%ldx%ld%s", (long) tensor->ne[0], (long) tensor->ne[1],
(long) tensor->ne[2], (long) tensor->ne[3], type_name);
break;
}
GGML_ASSERT(len > 0 && len < (int) sizeof(buffer));
output.append(buffer, len);
}
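/*
 * Editor's note: example output (sketch). A 2-D GGML_TYPE_F16 tensor with ne = {2048, 8192}
 * is rendered as "2048x8192f16" when override_data_type is GGML_TYPE_COUNT; with an
 * override of GGML_TYPE_F32 the suffix becomes "f32", because std::min() picks the
 * smaller enum value.
 */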
void get_graph_key_from_op(const ggml_tensor * op, ggml_type override_data_type, std::string & output) {
output += ggml_op_desc(op);
output += qnn::get_ggml_type_name(op->type);
for (size_t i = 0; i < GGML_MAX_SRC && op->src[i]; ++i) {
auto * src = op->src[i];
if (!src) {
break;
}
output += '_';
append_tensor_shape_and_type_impl(src, override_data_type, output);
}
}
void get_op_key_with_src_op_desc(const ggml_tensor * op, std::string & output) {
output += ggml_op_desc(op);
output += '(';
if (op->src[0]) {
output += ggml_op_desc(op->src[0]);
}
for (size_t i = 1; i < GGML_MAX_SRC && op->src[i]; ++i) {
output += ',';
output += ggml_op_desc(op->src[i]);
}
output += ')';
}
void generic_get_op_desc(const ggml_tensor * op,
bool append_dimensions,
ggml_type override_data_type,
std::string & output) {
if (append_dimensions) {
get_graph_key_from_op(op, override_data_type, output);
} else {
get_op_key_with_src_op_desc(op, output);
}
}
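/*
 * Editor's note: example of the two flavours above for dst = ADD(t_mul_mat, bias),
 * where t_mul_mat is a {4096, 2} f32 result and bias a leaf f32 tensor of shape {4096}:
 *
 *   append_dimensions == true  -> "ADDf32_4096x2f32_4096f32"
 *   append_dimensions == false -> "ADD(MUL_MAT,NONE)"
 */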
struct qnn_op_caps_t {
const char * qnn_op_name = nullptr;
op_description_generator_t get_desc = nullptr;
const char * qnn_param_name = nullptr;
};
constexpr const qnn_op_caps_t kOpCaps[] = {
{}, // GGML_OP_NONE
{}, // GGML_OP_DUP
{
// GGML_OP_ADD
QNN_OP_ELEMENT_WISE_ADD, // qnn_op_name
},
{}, // GGML_OP_ADD_ID
{}, // GGML_OP_ADD1
{}, // GGML_OP_ACC
{
// GGML_OP_SUB
QNN_OP_ELEMENT_WISE_SUBTRACT, // qnn_op_name
},
{
// GGML_OP_MUL
QNN_OP_ELEMENT_WISE_MULTIPLY, // qnn_op_name
},
{
// GGML_OP_DIV
QNN_OP_ELEMENT_WISE_DIVIDE, // qnn_op_name
},
{}, // GGML_OP_SQR
{
// GGML_OP_SQRT
QNN_OP_ELEMENT_WISE_SQUARE_ROOT, // qnn_op_name
},
{
// GGML_OP_LOG
QNN_OP_ELEMENT_WISE_LOG, // qnn_op_name
},
{}, // GGML_OP_SIN
{}, // GGML_OP_COS
{}, // GGML_OP_SUM
{}, // GGML_OP_SUM_ROWS
{}, // GGML_OP_CUMSUM
{}, // GGML_OP_MEAN
{}, // GGML_OP_ARGMAX
{}, // GGML_OP_COUNT_EQUAL
{}, // GGML_OP_REPEAT
{}, // GGML_OP_REPEAT_BACK
{}, // GGML_OP_CONCAT
{}, // GGML_OP_SILU_BACK
{}, // GGML_OP_NORM
{
// GGML_OP_RMS_NORM
QNN_OP_RMS_NORM, // qnn_op_name
generic_get_op_desc, // get_desc
QNN_OP_RMS_NORM_PARAM_EPSILON, // qnn_param_name
},
{}, // GGML_OP_RMS_NORM_BACK
{}, // GGML_OP_GROUP_NORM
{}, // GGML_OP_L2_NORM
{
// GGML_OP_MUL_MAT
QNN_OP_MAT_MUL, // qnn_op_name
},
{}, // GGML_OP_MUL_MAT_ID
{}, // GGML_OP_OUT_PROD
{}, // GGML_OP_SCALE
{}, // GGML_OP_SET
{}, // GGML_OP_CPY
{}, // GGML_OP_CONT
{
// GGML_OP_RESHAPE
QNN_OP_RESHAPE, // qnn_op_name
},
{}, // GGML_OP_VIEW
{}, // GGML_OP_PERMUTE
{}, // GGML_OP_TRANSPOSE
{}, // GGML_OP_GET_ROWS
{}, // GGML_OP_GET_ROWS_BACK
{}, // GGML_OP_SET_ROWS
{}, // GGML_OP_DIAG
{}, // GGML_OP_DIAG_MASK_INF
{}, // GGML_OP_DIAG_MASK_ZERO
{}, // GGML_OP_SOFT_MAX
{}, // GGML_OP_SOFT_MAX_BACK
{}, // GGML_OP_ROPE
{}, // GGML_OP_ROPE_BACK
{}, // GGML_OP_CLAMP
{}, // GGML_OP_CONV_TRANSPOSE_1D
{}, // GGML_OP_IM2COL
{}, // GGML_OP_IM2COL_BACK
{}, // GGML_OP_IM2COL_3D
{}, // GGML_OP_CONV_2D
{}, // GGML_OP_CONV_3D
{}, // GGML_OP_CONV_2D_DW
{}, // GGML_OP_CONV_TRANSPOSE_2D
{}, // GGML_OP_POOL_1D
{}, // GGML_OP_POOL_2D
{}, // GGML_OP_POOL_2D_BACK
{}, // GGML_OP_UPSCALE
{}, // GGML_OP_PAD
{}, // GGML_OP_ROLL
{}, // GGML_OP_PAD_REFLECT_1D
{}, // GGML_OP_ARANGE
{}, // GGML_OP_TIMESTEP_EMBEDDING
{}, // GGML_OP_ARGSORT
{}, // GGML_OP_TOP_K
{}, // GGML_OP_LEAKY_RELU
{}, // GGML_OP_TRI
{}, // GGML_OP_FILL
{}, // GGML_OP_FLASH_ATTN_EXT
{}, // GGML_OP_FLASH_ATTN_BACK
{}, // GGML_OP_SSM_CONV
{}, // GGML_OP_SSM_SCAN
{}, // GGML_OP_WIN_PART
{}, // GGML_OP_WIN_UNPART
{}, // GGML_OP_GET_REL_POS
{}, // GGML_OP_ADD_REL_POS
{}, // GGML_OP_RWKV_WKV6
{}, // GGML_OP_GATED_LINEAR_ATTN
{}, // GGML_OP_RWKV_WKV7
{}, // GGML_OP_SOLVE_TRI
{}, // GGML_OP_UNARY
{}, // GGML_OP_MAP_CUSTOM1
{}, // GGML_OP_MAP_CUSTOM2
{}, // GGML_OP_MAP_CUSTOM3
{}, // GGML_OP_CUSTOM
{}, // GGML_OP_CROSS_ENTROPY_LOSS
{}, // GGML_OP_CROSS_ENTROPY_LOSS_BACK
{}, // GGML_OP_OPT_STEP_ADAMW
{}, // GGML_OP_OPT_STEP_SGD
{}, // GGML_OP_GLU
// ggml_unary_op
{}, // GGML_UNARY_OP_ABS
{}, // GGML_UNARY_OP_SGN
{}, // GGML_UNARY_OP_NEG
{}, // GGML_UNARY_OP_STEP
{}, // GGML_UNARY_OP_TANH
{}, // GGML_UNARY_OP_ELU
{}, // GGML_UNARY_OP_RELU
{}, // GGML_UNARY_OP_SIGMOID
{
// GGML_UNARY_OP_GELU
QNN_OP_GELU, // qnn_op_name
},
{}, // GGML_UNARY_OP_GELU_QUICK
{}, // GGML_UNARY_OP_SILU
{}, // GGML_UNARY_OP_HARDSWISH
{}, // GGML_UNARY_OP_HARDSIGMOID
{}, // GGML_UNARY_OP_EXP
{}, // GGML_UNARY_OP_EXPM1
{}, // GGML_UNARY_OP_SOFTPLUS
{}, // GGML_UNARY_OP_GELU_ERF
{}, // GGML_UNARY_OP_XIELU
{}, // GGML_UNARY_OP_FLOOR
{}, // GGML_UNARY_OP_CEIL
{}, // GGML_UNARY_OP_ROUND
{}, // GGML_UNARY_OP_TRUNC
};
static_assert(kOpCaps[GGML_OP_NONE].get_desc == nullptr, "GGML_OP_NONE should not have get_desc function");
static_assert(kOpCaps[GGML_OP_ADD].qnn_op_name, "GGML_OP_ADD does not have qnn_op_name in the kOpCaps table");
static_assert(kOpCaps[GGML_OP_MUL_MAT].qnn_op_name, "GGML_OP_MUL_MAT does not have qnn_op_name in the kOpCaps table");
static_assert(kOpCaps[GGML_OP_MUL].qnn_op_name, "GGML_OP_MUL does not have qnn_op_name in the kOpCaps table");
static_assert(kOpCaps[GGML_OP_LOG].qnn_op_name, "GGML_OP_LOG does not have qnn_op_name in the kOpCaps table");
static_assert(kOpCaps[GGML_OP_COUNT + GGML_UNARY_OP_GELU].qnn_op_name,
"GGML_UNARY_OP_GELU does not have qnn_op_name in the kOpCaps table");
static_assert(std::size(kOpCaps) == (GGML_OP_COUNT + GGML_UNARY_OP_COUNT),
"GGML_OP_COUNT does not match the size of the kOpCaps table");
std::shared_ptr<qnn::ggml_qnn_op_config> mat_mul_op_constructor(const ggml_tensor * op,
const std::string & instance_name,
qnn::qnn_instance_ptr qnn_instance) {
if (qnn_instance->has_custom_op_package() && ggml_n_dims(op) == 2) {
QNN_LOG_DEBUG("create GgmlMulMat, name %s, use GgmlOpPackage\n", instance_name.c_str());
return std::make_shared<qnn::ggml_qnn_single_op_config>(instance_name, "GgmlOpPackage", "GgmlMulMat",
qnn_instance);
}
QNN_LOG_DEBUG("create QNN_OP_MAT_MUL, name %s\n", instance_name.c_str());
return std::make_shared<qnn::ggml_qnn_matmul_op_config>(instance_name, qnn_instance);
}
template <size_t _op>
std::shared_ptr<qnn::ggml_qnn_op_config> generic_op_constructor(const ggml_tensor * op,
const std::string & instance_name,
qnn::qnn_instance_ptr qnn_instance) {
GGML_UNUSED(op);
static_assert(_op < std::size(kOpCaps));
static_assert(kOpCaps[_op].qnn_op_name != nullptr);
return std::make_shared<qnn::ggml_qnn_single_op_config>(instance_name, QNN_OP_PACKAGE_NAME_QTI_AISW,
kOpCaps[_op].qnn_op_name, qnn_instance);
}
void add_type_parameters(std::shared_ptr<qnn::ggml_qnn_op_config_base> op, const char * name, float value) {
Qnn_Scalar_t scalar = QNN_SCALAR_INIT;
scalar.dataType = QNN_DATATYPE_FLOAT_32;
scalar.floatValue = value;
op->add_scalar_param(name, scalar);
}
template <size_t _op, typename _ggml_op_param_type, typename _qnn_op_type_name>
std::shared_ptr<qnn::ggml_qnn_op_config> op_constructor_with_type_param(const ggml_tensor * op,
const std::string & instance_name,
qnn::qnn_instance_ptr qnn_instance) {
static_assert(std::is_base_of<qnn::ggml_qnn_op_config_base, _qnn_op_type_name>::value);
static_assert(_op < std::size(kOpCaps));
constexpr auto & op_caps = kOpCaps[_op];
static_assert(op_caps.qnn_op_name != nullptr);
_ggml_op_param_type op_param;
memcpy(&op_param, op->op_params, sizeof(op_param));
auto qnn_op = std::make_shared<_qnn_op_type_name>(instance_name, QNN_OP_PACKAGE_NAME_QTI_AISW, op_caps.qnn_op_name,
qnn_instance);
if (op_caps.qnn_param_name) {
add_type_parameters(qnn_op, op_caps.qnn_param_name, op_param);
}
return qnn_op;
}
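/*
 * Editor's note: a concrete instantiation of the template above, as used in the table
 * below for GGML_OP_RMS_NORM:
 *
 *   op_constructor_with_type_param<GGML_OP_RMS_NORM, float, qnn::ggml_qnn_rmsnorm_op_config>
 *
 * copies the epsilon out of op->op_params and, because kOpCaps[GGML_OP_RMS_NORM] carries
 * QNN_OP_RMS_NORM_PARAM_EPSILON, attaches it as a float scalar parameter of the QNN op.
 */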
constexpr const op_constructor_t kOpConstructors[] = {
nullptr, // GGML_OP_NONE
nullptr, // GGML_OP_DUP
generic_op_constructor<GGML_OP_ADD>, // GGML_OP_ADD
nullptr, // GGML_OP_ADD_ID
nullptr, // GGML_OP_ADD1
nullptr, // GGML_OP_ACC
generic_op_constructor<GGML_OP_SUB>, // GGML_OP_SUB
generic_op_constructor<GGML_OP_MUL>, // GGML_OP_MUL
generic_op_constructor<GGML_OP_DIV>, // GGML_OP_DIV
nullptr, // GGML_OP_SQR
generic_op_constructor<GGML_OP_SQRT>, // GGML_OP_SQRT
generic_op_constructor<GGML_OP_LOG>, // GGML_OP_LOG
nullptr, // GGML_OP_SIN
nullptr, // GGML_OP_COS
nullptr, // GGML_OP_SUM
nullptr, // GGML_OP_SUM_ROWS
nullptr, // GGML_OP_CUMSUM
nullptr, // GGML_OP_MEAN
nullptr, // GGML_OP_ARGMAX
nullptr, // GGML_OP_COUNT_EQUAL
nullptr, // GGML_OP_REPEAT
nullptr, // GGML_OP_REPEAT_BACK
nullptr, // GGML_OP_CONCAT
nullptr, // GGML_OP_SILU_BACK
nullptr, // GGML_OP_NORM
op_constructor_with_type_param<GGML_OP_RMS_NORM, float, qnn::ggml_qnn_rmsnorm_op_config>, // GGML_OP_RMS_NORM
nullptr, // GGML_OP_RMS_NORM_BACK
nullptr, // GGML_OP_GROUP_NORM
nullptr, // GGML_OP_L2_NORM
mat_mul_op_constructor, // GGML_OP_MUL_MAT
nullptr, // GGML_OP_MUL_MAT_ID
nullptr, // GGML_OP_OUT_PROD
nullptr, // GGML_OP_SCALE
nullptr, // GGML_OP_SET
nullptr, // GGML_OP_CPY
nullptr, // GGML_OP_CONT
generic_op_constructor<GGML_OP_RESHAPE>, // GGML_OP_RESHAPE
nullptr, // GGML_OP_VIEW
nullptr, // GGML_OP_PERMUTE
nullptr, // GGML_OP_TRANSPOSE
nullptr, // GGML_OP_GET_ROWS
nullptr, // GGML_OP_GET_ROWS_BACK
nullptr, // GGML_OP_SET_ROWS
nullptr, // GGML_OP_DIAG
nullptr, // GGML_OP_DIAG_MASK_INF
nullptr, // GGML_OP_DIAG_MASK_ZERO
nullptr, // GGML_OP_SOFT_MAX
nullptr, // GGML_OP_SOFT_MAX_BACK
nullptr, // GGML_OP_ROPE
nullptr, // GGML_OP_ROPE_BACK
nullptr, // GGML_OP_CLAMP
nullptr, // GGML_OP_CONV_TRANSPOSE_1D
nullptr, // GGML_OP_IM2COL
nullptr, // GGML_OP_IM2COL_BACK
nullptr, // GGML_OP_IM2COL_3D
nullptr, // GGML_OP_CONV_2D
nullptr, // GGML_OP_CONV_3D
nullptr, // GGML_OP_CONV_2D_DW
nullptr, // GGML_OP_CONV_TRANSPOSE_2D
nullptr, // GGML_OP_POOL_1D
nullptr, // GGML_OP_POOL_2D
nullptr, // GGML_OP_POOL_2D_BACK
nullptr, // GGML_OP_UPSCALE
nullptr, // GGML_OP_PAD
nullptr, // GGML_OP_ROLL
nullptr, // GGML_OP_PAD_REFLECT_1D
nullptr, // GGML_OP_ARANGE
nullptr, // GGML_OP_TIMESTEP_EMBEDDING
nullptr, // GGML_OP_ARGSORT
nullptr, // GGML_OP_TOP_K
nullptr, // GGML_OP_LEAKY_RELU
nullptr, // GGML_OP_TRI
nullptr, // GGML_OP_FILL
nullptr, // GGML_OP_FLASH_ATTN_EXT
nullptr, // GGML_OP_FLASH_ATTN_BACK
nullptr, // GGML_OP_SSM_CONV
nullptr, // GGML_OP_SSM_SCAN
nullptr, // GGML_OP_WIN_PART
nullptr, // GGML_OP_WIN_UNPART
nullptr, // GGML_OP_GET_REL_POS
nullptr, // GGML_OP_ADD_REL_POS
nullptr, // GGML_OP_RWKV_WKV6
nullptr, // GGML_OP_GATED_LINEAR_ATTN
nullptr, // GGML_OP_RWKV_WKV7
nullptr, // GGML_OP_SOLVE_TRI
nullptr, // GGML_OP_UNARY
nullptr, // GGML_OP_MAP_CUSTOM1
nullptr, // GGML_OP_MAP_CUSTOM2
nullptr, // GGML_OP_MAP_CUSTOM3
nullptr, // GGML_OP_CUSTOM
nullptr, // GGML_OP_CROSS_ENTROPY_LOSS
nullptr, // GGML_OP_CROSS_ENTROPY_LOSS_BACK
nullptr, // GGML_OP_OPT_STEP_ADAMW
nullptr, // GGML_OP_OPT_STEP_SGD
nullptr, // GGML_OP_GLU
// ggml_unary_op
nullptr, // GGML_UNARY_OP_ABS
nullptr, // GGML_UNARY_OP_SGN
nullptr, // GGML_UNARY_OP_NEG
nullptr, // GGML_UNARY_OP_STEP
nullptr, // GGML_UNARY_OP_TANH
nullptr, // GGML_UNARY_OP_ELU
nullptr, // GGML_UNARY_OP_RELU
nullptr, // GGML_UNARY_OP_SIGMOID
nullptr, // GGML_UNARY_OP_GELU
nullptr, // GGML_UNARY_OP_GELU_QUICK
nullptr, // GGML_UNARY_OP_SILU
nullptr, // GGML_UNARY_OP_HARDSWISH
nullptr, // GGML_UNARY_OP_HARDSIGMOID
nullptr, // GGML_UNARY_OP_EXP
nullptr, // GGML_UNARY_OP_EXPM1
nullptr, // GGML_UNARY_OP_SOFTPLUS
nullptr, // GGML_UNARY_OP_GELU_ERF
nullptr, // GGML_UNARY_OP_XIELU
nullptr, // GGML_UNARY_OP_FLOOR
nullptr, // GGML_UNARY_OP_CEIL
nullptr, // GGML_UNARY_OP_ROUND
nullptr, // GGML_UNARY_OP_TRUNC
};
static_assert(kOpConstructors[GGML_OP_NONE] == nullptr, "GGML_OP_NONE does not match the nullptr function");
static_assert(kOpConstructors[GGML_OP_ADD] == generic_op_constructor<GGML_OP_ADD>,
"GGML_OP_ADD does not match the generic_op_constructor<GGML_OP_ADD> function");
static_assert(kOpConstructors[GGML_OP_MUL_MAT] == mat_mul_op_constructor,
"GGML_OP_MUL_MAT does not match the mat_mul_op_constructor function");
static_assert(std::size(kOpConstructors) == (GGML_OP_COUNT + GGML_UNARY_OP_COUNT),
"GGML_OP_COUNT does not match the size of the kOpConstructors table");
} // namespace
namespace qnn {
void append_tensor_shape_and_type(const ggml_tensor * tensor, std::string & output) {
append_tensor_shape_and_type_impl(tensor, GGML_TYPE_COUNT, output);
}
size_t get_qnn_op_index(const ggml_tensor * tensor) {
if (tensor->op == GGML_OP_UNARY) {
return kGgmlUnaryOpStart + ggml_get_unary_op(tensor);
}
return tensor->op;
}
const char * get_qnn_op_name(const ggml_tensor * op) {
auto op_index = get_qnn_op_index(op);
GGML_ASSERT(op_index < std::size(kOpCaps));
GGML_ASSERT(kOpCaps[op_index].qnn_op_name);
return kOpCaps[op_index].qnn_op_name;
}
void get_qnn_op_desc(const ggml_tensor * op,
bool append_dimensions,
ggml_type override_data_type,
std::string & output) {
auto op_index = get_qnn_op_index(op);
GGML_ASSERT(op_index < std::size(kOpCaps));
auto get_desc = kOpCaps[op_index].get_desc;
if (get_desc) {
get_desc(op, append_dimensions, override_data_type, output);
} else {
generic_get_op_desc(op, append_dimensions, override_data_type, output);
}
}
std::shared_ptr<ggml_qnn_op_config> create_op(const ggml_tensor * op,
const std::string & name,
qnn_instance_ptr qnn_instance) {
auto op_index = get_qnn_op_index(op);
GGML_ASSERT(op_index < std::size(kOpCaps));
auto op_constructor = kOpConstructors[op_index];
GGML_ASSERT(op_constructor);
return op_constructor(op, name, qnn_instance);
}
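/*
 * Editor's note: dispatch example (sketch). For a MUL_MAT node, create_op() resolves
 * kOpConstructors[GGML_OP_MUL_MAT] == mat_mul_op_constructor, which returns either the
 * custom GgmlOpPackage matmul (when the instance has the custom op package and the op is
 * 2-D) or a ggml_qnn_matmul_op_config otherwise; for GGML_OP_UNARY nodes the index is
 * offset by kGgmlUnaryOpStart so the unary rows at the end of the tables are used.
 */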
} // namespace qnn

View File

@ -0,0 +1,444 @@
#include "op-config-impl.hpp"
#include <cstdint>
#include "logger.hpp"
namespace {
qnn::qnn_dimension_array_t get_transposed_dimensions(const qnn::qnn_dimension_array_t & dimensions, int rank) {
qnn::qnn_dimension_array_t transposed_dims = dimensions;
if (rank >= 2) {
transposed_dims[rank - 1] = dimensions[rank - 2];
transposed_dims[rank - 2] = dimensions[rank - 1];
}
return transposed_dims;
}
int get_rank(const qnn::ggml_tensor_array_t & tensor_inputs, const qnn::ggml_tensor_array_t & tensor_outputs) {
return std::max(qnn::get_ggml_tensors_max_rank(tensor_inputs), qnn::get_ggml_tensors_max_rank(tensor_outputs));
}
Qnn_DataType_t get_tensor_type(const qnn::qnn_tensor_array_t & tensors) {
Qnn_DataType_t type = QNN_DATATYPE_UNDEFINED;
for (auto tensor : tensors) {
auto tensor_type_size = qnn::qnn_datatype_size(tensor->get_data_type());
GGML_ASSERT(tensor_type_size > 0);
if (tensor_type_size > qnn::qnn_datatype_size(type)) {
type = tensor->get_data_type();
}
}
return type;
}
} // namespace
namespace qnn {
void ggml_qnn_op_config_base::add_scalar_param(const std::string & name, const Qnn_Scalar_t scalar) {
_param_names.push_back(name);
Qnn_Param_t param = QNN_PARAM_INIT;
param.paramType = QNN_PARAMTYPE_SCALAR;
param.name = _param_names.back().c_str();
param.scalarParam = scalar;
_qnn_parameters.push_back(param);
}
bool ggml_qnn_op_config_base::add_tensor_param(const std::string & name, const qnn_dimension_array_t & dimensions,
int rank, const uint8_t * data, const Qnn_DataType_t data_type,
backend_index_type device, Qnn_GraphHandle_t graph_handle) {
std::string tensor_name = _name + name + std::to_string(_tensor_parameters.size());
auto param_tensor = std::make_shared<ggml_qnn_tensor>(ggml_qnn_tensor::PARAMETER, tensor_name, dimensions,
data_type, rank, device, graph_handle, _qnn_instance);
size_t data_size = ggml_type_size(ggml_datatype_from_qnn_datatype(data_type));
for (int i = 0; i < rank; i++) {
data_size *= dimensions[i];
}
GGML_ASSERT(data_size > 0);
if (!param_tensor->set_data_buffer(data, data_size)) {
QNN_LOG_ERROR("parameter tensor bind_buffer failed\n");
return false;
}
if (!param_tensor->alloc_qnn_tensor_id()) {
QNN_LOG_ERROR("parameter tensor alloc_qnn_tensor_id failed\n");
return false;
}
_tensor_parameters.push_back(param_tensor);
_param_names.push_back(name);
Qnn_Param_t param = QNN_PARAM_INIT;
param.paramType = QNN_PARAMTYPE_TENSOR;
param.name = _param_names.back().c_str();
param.tensorParam = param_tensor->get_qnn_tensor();
_qnn_parameters.push_back(param);
return true;
}
void ggml_qnn_op_config_base::set_input_tensors(qnn::qnn_tensor_array_t & tensor_inputs) {
_tensor_inputs = tensor_inputs;
_qnn_tensor_inputs.resize(_tensor_inputs.size());
}
void ggml_qnn_op_config_base::set_input_tensors(qnn::qnn_tensor_array_t && tensor_inputs) {
_tensor_inputs = std::move(tensor_inputs);
_qnn_tensor_inputs.resize(_tensor_inputs.size());
}
void ggml_qnn_op_config_base::set_output_tensors(qnn::qnn_tensor_array_t & tensor_outputs) {
_tensor_outputs = tensor_outputs;
_qnn_tensor_outputs.resize(_tensor_outputs.size());
}
void ggml_qnn_op_config_base::set_output_tensors(qnn::qnn_tensor_array_t && tensor_outputs) {
_tensor_outputs = std::move(tensor_outputs);
_qnn_tensor_outputs.resize(_tensor_outputs.size());
}
bool ggml_qnn_op_config_base::add_op_to_graph(Qnn_GraphHandle_t graph_handle) {
QNN_LOG_DEBUG("[%s]add to graph start\n", _name.c_str());
GGML_ASSERT(_qnn_tensor_inputs.size() == _tensor_inputs.size());
GGML_ASSERT(_qnn_tensor_outputs.size() == _tensor_outputs.size());
for (size_t i = 0; i < _tensor_inputs.size(); i++) {
auto tensor = _tensor_inputs[i];
if (!tensor->alloc_qnn_tensor_id()) {
QNN_LOG_ERROR("[%s]input tensor alloc_qnn_tensor_id failed\n", _name.c_str());
return false;
}
QNN_LOG_DEBUG("[%s]input tensor(%s), id(%d)\n", _name.c_str(), tensor->get_tensor_name().c_str(),
tensor->get_qnn_tensor_id());
_qnn_tensor_inputs[i] = tensor->get_qnn_tensor();
}
for (size_t i = 0; i < _tensor_outputs.size(); i++) {
auto tensor = _tensor_outputs[i];
if (!tensor->alloc_qnn_tensor_id()) {
QNN_LOG_ERROR("[%s]output tensor alloc_qnn_tensor_id failed\n", _name.c_str());
return false;
}
QNN_LOG_DEBUG("[%s]output tensor(%s), id(%d)\n", _name.c_str(), tensor->get_tensor_name().c_str(),
tensor->get_qnn_tensor_id());
_qnn_tensor_outputs[i] = tensor->get_qnn_tensor();
}
auto qnn_interface = _qnn_instance->get_qnn_interface();
auto error = qnn_interface->qnn_graph_add_node(graph_handle, get_op_config());
if (error != QNN_SUCCESS) {
QNN_LOG_ERROR("[%s][%s][%s]qnn_graph_add_node.error: %s\n", _name.c_str(), _package_name.c_str(),
_op_type.c_str(), get_qnn_error_string(error));
return false;
}
QNN_LOG_DEBUG("[%s]added to graph succeed\n", _name.c_str());
return true;
}
bool ggml_qnn_op_config_base::bind_input_tensors(const ggml_tensor_array_t & tensor_inputs) {
GGML_ASSERT(tensor_inputs.size() == _tensor_inputs.size());
return qnn::bind_tensors(tensor_inputs, _tensor_inputs, _qnn_tensor_inputs);
}
bool ggml_qnn_op_config_base::bind_output_tensors(const ggml_tensor_array_t & tensor_outputs) {
GGML_ASSERT(tensor_outputs.size() == _tensor_outputs.size());
return qnn::bind_tensors(tensor_outputs, _tensor_outputs, _qnn_tensor_outputs);
}
void ggml_qnn_op_config_base::unbind_input_tensors() {
for (auto & tensor : _tensor_inputs) {
tensor->unbind();
}
}
void ggml_qnn_op_config_base::unbind_output_tensors() {
for (auto & tensor : _tensor_outputs) {
tensor->unbind();
}
}
Qnn_OpConfig_t ggml_qnn_op_config_base::get_op_config() {
GGML_ASSERT(_qnn_parameters.size() == _param_names.size());
for (size_t i = 0; i < _qnn_parameters.size(); i++) {
_qnn_parameters[i].name = _param_names[i].c_str();
}
Qnn_OpConfig_t config = QNN_OPCONFIG_INIT;
config.version = QNN_OPCONFIG_VERSION_1;
auto & op_config = config.v1;
op_config.name = _name.c_str();
op_config.packageName = _package_name.c_str();
op_config.typeName = _op_type.c_str();
op_config.numOfParams = (uint32_t) _qnn_parameters.size();
op_config.params = _qnn_parameters.data();
op_config.numOfInputs = (uint32_t) _qnn_tensor_inputs.size();
op_config.inputTensors = _qnn_tensor_inputs.data();
op_config.numOfOutputs = (uint32_t) _qnn_tensor_outputs.size();
op_config.outputTensors = _qnn_tensor_outputs.data();
return config;
}
bool ggml_qnn_single_op_config::initialize_op_nodes(backend_index_type device, Qnn_GraphHandle_t graph_handle) {
GGML_UNUSED(device);
GGML_UNUSED(graph_handle);
return true;
}
bool ggml_qnn_rmsnorm_op_config::initialize_op_nodes(backend_index_type device, Qnn_GraphHandle_t graph_handle) {
constexpr const uint32_t kAxes[] = { 0 };
return add_tensor_param(QNN_OP_RMS_NORM_PARAM_AXES, { 1 }, 1, reinterpret_cast<const uint8_t *>(kAxes),
QNN_DATATYPE_UINT_32, device, graph_handle);
}
void ggml_qnn_aggregate_op_config::set_input_tensors(qnn::qnn_tensor_array_t & tensor_inputs) {
_tensor_inputs = tensor_inputs;
}
void ggml_qnn_aggregate_op_config::set_input_tensors(qnn::qnn_tensor_array_t && tensor_inputs) {
_tensor_inputs = std::move(tensor_inputs);
}
void ggml_qnn_aggregate_op_config::set_output_tensors(qnn::qnn_tensor_array_t & tensor_outputs) {
_tensor_outputs = tensor_outputs;
}
void ggml_qnn_aggregate_op_config::set_output_tensors(qnn::qnn_tensor_array_t && tensor_outputs) {
_tensor_outputs = std::move(tensor_outputs);
}
bool ggml_qnn_aggregate_op_config::bind_input_tensors(const ggml_tensor_array_t & tensor_inputs) {
return qnn::bind_tensors(tensor_inputs, _tensor_inputs);
}
bool ggml_qnn_aggregate_op_config::bind_output_tensors(const ggml_tensor_array_t & tensor_outputs) {
return qnn::bind_tensors(tensor_outputs, _tensor_outputs);
}
bool ggml_qnn_matmul_op_config::initialize_op_nodes(backend_index_type device, Qnn_GraphHandle_t graph_handle) {
GGML_ASSERT(_tensor_inputs.size() == 2);
GGML_ASSERT(_tensor_outputs.size() == 1);
// create convert nodes
const auto tensor_rank = _tensor_inputs.front()->get_rank();
qnn_tensor_array_t mat_mul_tensor_inputs = _tensor_inputs;
auto tensor_type = create_input_convert_nodes(device, graph_handle, tensor_rank, mat_mul_tensor_inputs);
mat_mul_tensor_inputs.front() =
create_gather_nodes(device, graph_handle, tensor_rank, mat_mul_tensor_inputs.front(),
mat_mul_tensor_inputs.back()->get_dimensions());
if (device != QNN_BACKEND_GPU && _tensor_outputs.front()->get_data_type() != tensor_type) {
auto convert_out = create_output_convert_nodes(device, graph_handle, tensor_rank, tensor_type, _tensor_outputs);
if (!create_mat_mul_nodes(mat_mul_tensor_inputs, convert_out->get_input_tensors())) {
QNN_LOG_ERROR("create mat_mul nodes failed\n");
return false;
}
_operations.push_back(convert_out);
} else {
if (!create_mat_mul_nodes(mat_mul_tensor_inputs, _tensor_outputs)) {
QNN_LOG_ERROR("create mat_mul nodes failed\n");
return false;
}
}
return true;
}
qnn_tensor_ptr_t ggml_qnn_matmul_op_config::create_gather_nodes(backend_index_type device,
Qnn_GraphHandle_t graph_handle, const int rank,
qnn_tensor_ptr_t tensor_input,
qnn_dimension_array_t output_dimensions) {
if (rank <= 2) {
return tensor_input;
}
const auto & input_dimensions = tensor_input->get_dimensions();
output_dimensions[rank - 1] = input_dimensions[rank - 1];
output_dimensions[rank - 2] = input_dimensions[rank - 2];
const auto y = output_dimensions[rank - 3] / input_dimensions[rank - 3];
if (y == 1 && (rank == 3 || (rank == 4 && output_dimensions[rank - 4] == input_dimensions[rank - 4]))) {
return tensor_input;
}
// create gather nodes to expand the tensor shape from [ne03, ne02, n, k] to [ne03 * x, ne02 * y, n, k]
constexpr const auto create_node =
[](const std::string & name, const int rank, const int axis, const qnn_dimension_array_t & dimensions,
qnn_tensor_ptr_t tensor_input, backend_index_type device, Qnn_GraphHandle_t graph_handle,
qnn_instance_ptr qnn_instance, qnn_tensor_ptr_t & tensor_output) -> qnn_op_config_ptr_t {
auto gather_out =
std::make_shared<ggml_qnn_tensor>(ggml_qnn_tensor::INTERMEDIATE, name + "_out", dimensions,
tensor_input->get_data_type(), rank, device, graph_handle, qnn_instance);
auto gather_op = std::make_shared<ggml_qnn_single_op_config>(name, QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_GATHER,
qnn_instance);
Qnn_Scalar_t scalar = QNN_SCALAR_INIT;
scalar.dataType = QNN_DATATYPE_INT_32;
scalar.int32Value = axis;
gather_op->add_scalar_param(QNN_OP_GATHER_PARAM_AXIS, scalar);
gather_op->set_output_tensors({ gather_out });
// Here we build the gather index mapping: a 1-D tensor like [0, 0, 0, 1, 1, 1, 2, 2, 2, ...],
// produced by repeating each source index `scale` times (see the standalone sketch after this function).
const auto scale = dimensions[axis] / tensor_input->get_dimensions()[axis];
auto index_buffer = std::make_shared<qnn_mem_buffer>(dimensions[axis] * sizeof(uint32_t));
for (uint32_t *curr = reinterpret_cast<uint32_t *>(index_buffer->get_buffer()), *end = curr + dimensions[axis];
curr < end; curr++) {
*curr = uint32_t((curr - reinterpret_cast<uint32_t *>(index_buffer->get_buffer())) / scale);
}
auto gather_index = std::make_shared<ggml_qnn_tensor>(
ggml_qnn_tensor::PARAMETER, name + "_index", qnn_dimension_array_t{ dimensions[axis] },
QNN_DATATYPE_UINT_32, 1, device, graph_handle, qnn_instance);
gather_index->set_data_buffer(index_buffer);
gather_op->set_input_tensors({ tensor_input, gather_index });
tensor_output = gather_out;
return gather_op;
};
qnn_dimension_array_t intermediate_dimensions = input_dimensions;
intermediate_dimensions[rank - 3] = output_dimensions[rank - 3];
qnn_tensor_ptr_t gather0_out;
_operations.push_back(create_node(_name + "_gather0", rank, rank - 3, intermediate_dimensions, tensor_input, device,
graph_handle, _qnn_instance, gather0_out));
if (rank == 3) {
return gather0_out;
}
qnn_tensor_ptr_t gather1_out;
_operations.push_back(create_node(_name + "_gather1", rank, rank - 4, output_dimensions, gather0_out, device,
graph_handle, _qnn_instance, gather1_out));
return gather1_out;
}
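The index buffer built by the lambda above is easier to see in isolation. A minimal standalone sketch of the same mapping (plain `std::vector` in place of `qnn_mem_buffer`; not part of the backend source):

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

// Build the gather index table used above: each source index is repeated
// `scale` times, e.g. out_dim = 6, in_dim = 2 -> scale = 3 -> [0, 0, 0, 1, 1, 1].
static std::vector<uint32_t> make_repeat_indices(uint32_t out_dim, uint32_t in_dim) {
    const uint32_t scale = out_dim / in_dim;
    std::vector<uint32_t> indices(out_dim);
    for (uint32_t i = 0; i < out_dim; ++i) {
        indices[i] = i / scale;
    }
    return indices;
}

int main() {
    auto idx = make_repeat_indices(6, 2);
    assert((idx == std::vector<uint32_t>{ 0, 0, 0, 1, 1, 1 }));
    return 0;
}
```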
Qnn_DataType_t ggml_qnn_matmul_op_config::create_input_convert_nodes(backend_index_type device,
Qnn_GraphHandle_t graph_handle, const int rank,
qnn_tensor_array_t & tensor_inputs) {
if (device == QNN_BACKEND_GPU) {
// there's no convert op for GPU, so we should create matmul nodes directly.
return QNN_DATATYPE_UNDEFINED;
}
// create tensors for convert node
auto tensor_type = get_tensor_type(tensor_inputs);
for (size_t i = 0; i < tensor_inputs.size(); ++i) {
// create input convert nodes
auto convert_in = tensor_inputs[i];
if (convert_in->get_data_type() == tensor_type) {
continue;
}
std::string convert_name("convert_src" + std::to_string(i));
auto convert_out = std::make_shared<ggml_qnn_tensor>(ggml_qnn_tensor::INTERMEDIATE, convert_name + "_out",
convert_in->get_dimensions(), tensor_type, rank, device,
graph_handle, _qnn_instance);
auto convert = std::make_shared<ggml_qnn_single_op_config>(convert_name, QNN_OP_PACKAGE_NAME_QTI_AISW,
QNN_OP_CAST, _qnn_instance);
QNN_LOG_DEBUG("[%s][MUL_MAT]create: %s, type: %s\n", get_backend_name(device), convert_name.c_str(),
qnn_datatype_to_string(tensor_type));
convert->set_input_tensors({ convert_in });
convert->set_output_tensors({ convert_out });
tensor_inputs[i] = convert_out;
_operations.push_back(convert);
}
return tensor_type;
}
qnn_op_config_ptr_t ggml_qnn_matmul_op_config::create_output_convert_nodes(backend_index_type device,
Qnn_GraphHandle_t graph_handle,
const int rank, Qnn_DataType_t tensor_type,
qnn_tensor_array_t & tensor_outputs) {
GGML_ASSERT(tensor_outputs.size() == 1);
// create output convert node
std::string convert_name("convert_dst");
auto convert_in = std::make_shared<ggml_qnn_tensor>(ggml_qnn_tensor::INTERMEDIATE, convert_name + "_in",
tensor_outputs.front()->get_dimensions(), tensor_type, rank,
device, graph_handle, _qnn_instance);
auto output_convert = std::make_shared<ggml_qnn_single_op_config>(convert_name, QNN_OP_PACKAGE_NAME_QTI_AISW,
QNN_OP_CAST, _qnn_instance);
QNN_LOG_DEBUG("[%s][MUL_MAT]create: %s, type: %s\n", get_backend_name(device), convert_name.c_str(),
qnn_datatype_to_string(tensor_type));
output_convert->set_input_tensors({ convert_in });
output_convert->set_output_tensors(tensor_outputs);
return output_convert;
}
bool ggml_qnn_matmul_op_config::create_mat_mul_nodes(qnn_tensor_array_t & tensor_inputs,
qnn_tensor_array_t & tensor_outputs) {
/*
* First, both ggml and QNN tensors are stored in memory in row-major format. (For more details, please refer to:
* https://pytorch.org/blog/tensor-memory-format-matters/#:~:text=Column%20Major%20Order:%20In%20this%20format,%20the%20matrix)
* However, the two frameworks store the tensor dimensions in opposite order.
* For example, a 2x3 matrix:
* [
* [1, 2, 3],
* [4, 5, 6],
* ]
* The ggml tensor will have dimensions [3, 2], while the qnn tensor will have dimensions [2, 3].
*
* Second, from the ggml introduction here: https://github.com/huggingface/blog/blob/main/introduction-to-ggml.md
* Given 2 matrices A and B, the matrix multiplication C = A * B is defined as:
* ```python
* import torch
* # Create two matrices
* A = torch.tensor([
* [2, 8],
* [5, 1],
* [4, 2],
* [8, 6],
* ])
* B = torch.tensor([
* [10, 5],
* [9, 9],
* [5, 4],
* ])
* # Perform matrix multiplication
* C = torch.matmul(A, B.T)
* print(C.T)
* ```
* Here B.T is the transpose of B, so C.T = A * B.T, which is equivalent to C = B * A.T
* (a small numeric check of this identity follows at the end of this file).
* See: https://github.com/ggml-org/llama.cpp/blob/master/CONTRIBUTING.md
*
* So here we need to create graph like:
* ```mermaid
* graph TD;
* i1>ggml_tensor_in1] --src0--> mat_mul0;
* i2>ggml_tensor_in0] --src1.T--> mat_mul0;
* mat_mul0 --dst0--> o1>ggml_tensor_out];
* ```
*/
// create src0_trans tensor
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS does not match the expected value");
GGML_ASSERT(tensor_inputs.size() == 2);
GGML_ASSERT(tensor_outputs.size() == 1);
// create mat_mul
auto mat_mul =
std::make_shared<ggml_qnn_single_op_config>(_name, QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_MAT_MUL, _qnn_instance);
Qnn_Scalar_t scalar = QNN_SCALAR_INIT;
scalar.dataType = QNN_DATATYPE_BOOL_8;
scalar.bool8Value = 1;
mat_mul->add_scalar_param(QNN_OP_MAT_MUL_PARAM_TRANSPOSE_IN1, scalar);
// set tensor to mat_mul
mat_mul->set_input_tensors({ tensor_inputs[1], tensor_inputs[0] });
mat_mul->set_output_tensors(tensor_outputs);
_operations.push_back(mat_mul);
return true;
}
} // namespace qnn
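The transpose argument in the comment block of `create_mat_mul_nodes` can be checked numerically. A small self-contained sketch (plain nested loops, no QNN or torch involved) verifying that (A * B.T).T equals B * A.T for the matrices used in that comment:

```cpp
#include <cassert>
#include <cstddef>
#include <vector>

using mat = std::vector<std::vector<int>>;

// naive (m x k) * (k x n) matrix multiplication
static mat matmul(const mat & x, const mat & y) {
    const size_t m = x.size(), k = y.size(), n = y[0].size();
    mat out(m, std::vector<int>(n, 0));
    for (size_t i = 0; i < m; ++i)
        for (size_t j = 0; j < n; ++j)
            for (size_t p = 0; p < k; ++p)
                out[i][j] += x[i][p] * y[p][j];
    return out;
}

static mat transpose(const mat & x) {
    mat out(x[0].size(), std::vector<int>(x.size()));
    for (size_t i = 0; i < x.size(); ++i)
        for (size_t j = 0; j < x[0].size(); ++j)
            out[j][i] = x[i][j];
    return out;
}

int main() {
    const mat A = { { 2, 8 }, { 5, 1 }, { 4, 2 }, { 8, 6 } };  // 4x2
    const mat B = { { 10, 5 }, { 9, 9 }, { 5, 4 } };           // 3x2
    // C = A * B.T is 4x3; its transpose C.T (3x4) must equal B * A.T (3x4).
    assert(transpose(matmul(A, transpose(B))) == matmul(B, transpose(A)));
    return 0;
}
```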

View File

@ -0,0 +1,162 @@
#pragma once
#include <functional>
#include <memory>
#include <string>
#include <vector>
#include "op-config.hpp"
#include "qnn-lib.hpp"
#include "qnn-types.hpp"
#include "tensor.hpp"
namespace qnn {
class ggml_qnn_op_config_base : public ggml_qnn_op_config {
public:
explicit ggml_qnn_op_config_base(const std::string & name, const std::string & package_name,
const std::string & op_type, qnn_instance_ptr qnn_instance) :
_name(name),
_package_name(package_name),
_op_type(op_type),
_qnn_instance(qnn_instance) {}
void add_scalar_param(const std::string & name, const Qnn_Scalar_t scalar);
bool add_tensor_param(const std::string & name, const qnn_dimension_array_t & dimensions, int rank,
const uint8_t * data, const Qnn_DataType_t data_type, backend_index_type device,
Qnn_GraphHandle_t graph_handle);
void set_input_tensors(qnn::qnn_tensor_array_t & tensor_inputs) override;
void set_input_tensors(qnn::qnn_tensor_array_t && tensor_inputs) override;
void set_output_tensors(qnn::qnn_tensor_array_t & tensor_inputs) override;
void set_output_tensors(qnn::qnn_tensor_array_t && tensor_inputs) override;
bool add_op_to_graph(Qnn_GraphHandle_t graph_handle) override;
bool bind_input_tensors(const ggml_tensor_array_t & tensor_inputs) override;
bool bind_output_tensors(const ggml_tensor_array_t & tensor_outputs) override;
void unbind_input_tensors() override;
void unbind_output_tensors() override;
qnn_tensor_array_t & get_input_tensors() override { return _tensor_inputs; }
qnn_tensor_array_t & get_output_tensors() override { return _tensor_outputs; }
protected:
Qnn_OpConfig_t get_op_config();
std::string _name;
std::string _package_name;
std::string _op_type;
qnn_instance_ptr _qnn_instance;
qnn_tensor_array_t _tensor_inputs;
qnn_tensor_array_t _tensor_outputs;
qnn_tensor_array_t _tensor_parameters;
std::vector<Qnn_Tensor_t> _qnn_tensor_inputs;
std::vector<Qnn_Tensor_t> _qnn_tensor_outputs;
std::vector<Qnn_Param_t> _qnn_parameters;
std::vector<std::string> _param_names;
DISABLE_COPY(ggml_qnn_op_config_base);
DISABLE_MOVE(ggml_qnn_op_config_base);
};
class ggml_qnn_single_op_config : public ggml_qnn_op_config_base {
public:
explicit ggml_qnn_single_op_config(const std::string & name, const std::string & package_name,
const std::string & op_type, qnn_instance_ptr qnn_instance) :
ggml_qnn_op_config_base(name, package_name, op_type, qnn_instance) {}
bool initialize_op_nodes(backend_index_type device, Qnn_GraphHandle_t graph_handle) override;
private:
DISABLE_COPY(ggml_qnn_single_op_config);
DISABLE_MOVE(ggml_qnn_single_op_config);
};
class ggml_qnn_rmsnorm_op_config : public ggml_qnn_op_config_base {
public:
explicit ggml_qnn_rmsnorm_op_config(const std::string & name, const std::string & package_name,
const std::string & op_type, qnn_instance_ptr qnn_instance) :
ggml_qnn_op_config_base(name, package_name, op_type, qnn_instance) {}
bool initialize_op_nodes(backend_index_type device, Qnn_GraphHandle_t graph_handle) override;
private:
DISABLE_COPY(ggml_qnn_rmsnorm_op_config);
DISABLE_MOVE(ggml_qnn_rmsnorm_op_config);
};
class ggml_qnn_aggregate_op_config : public ggml_qnn_op_config {
public:
explicit ggml_qnn_aggregate_op_config(const std::string & name, qnn_instance_ptr qnn_instance) :
_name(name),
_qnn_instance(qnn_instance) {}
~ggml_qnn_aggregate_op_config() {
_tensor_inputs.clear();
_tensor_outputs.clear();
_operations.clear();
}
void set_input_tensors(qnn::qnn_tensor_array_t & tensor_inputs) override;
void set_input_tensors(qnn::qnn_tensor_array_t && tensor_inputs) override;
void set_output_tensors(qnn::qnn_tensor_array_t & tensor_inputs) override;
void set_output_tensors(qnn::qnn_tensor_array_t && tensor_inputs) override;
bool add_op_to_graph(Qnn_GraphHandle_t graph_handle) override {
return qnn::add_op_to_graph(graph_handle, _operations);
}
bool bind_input_tensors(const ggml_tensor_array_t & tensor_inputs) override;
bool bind_output_tensors(const ggml_tensor_array_t & tensor_outputs) override;
void unbind_input_tensors() override {
for (auto & tensor : _tensor_inputs) {
tensor->unbind();
}
}
void unbind_output_tensors() override {
for (auto & tensor : _tensor_outputs) {
tensor->unbind();
}
}
qnn_tensor_array_t & get_input_tensors() override { return _tensor_inputs; }
qnn_tensor_array_t & get_output_tensors() override { return _tensor_outputs; }
protected:
std::string _name;
qnn_instance_ptr _qnn_instance;
std::vector<qnn_op_config_ptr_t> _operations;
qnn_tensor_array_t _tensor_inputs;
qnn_tensor_array_t _tensor_outputs;
private:
DISABLE_COPY(ggml_qnn_aggregate_op_config);
DISABLE_MOVE(ggml_qnn_aggregate_op_config);
};
class ggml_qnn_matmul_op_config : public ggml_qnn_aggregate_op_config {
public:
ggml_qnn_matmul_op_config(const std::string & name, qnn_instance_ptr qnn_instance) :
ggml_qnn_aggregate_op_config(name, qnn_instance) {}
bool initialize_op_nodes(backend_index_type device, Qnn_GraphHandle_t graph_handle) override;
private:
qnn_tensor_ptr_t create_gather_nodes(backend_index_type device, Qnn_GraphHandle_t graph_handle, const int rank,
qnn_tensor_ptr_t tensor_input, qnn_dimension_array_t output_dimensions);
Qnn_DataType_t create_input_convert_nodes(backend_index_type device, Qnn_GraphHandle_t graph_handle, const int rank,
qnn_tensor_array_t & tensor_inputs);
qnn_op_config_ptr_t create_output_convert_nodes(backend_index_type device, Qnn_GraphHandle_t graph_handle,
const int rank, Qnn_DataType_t tensor_type,
qnn_tensor_array_t & tensor_outputs);
bool create_mat_mul_nodes(qnn_tensor_array_t & tensor_inputs, qnn_tensor_array_t & tensor_outputs);
DISABLE_COPY(ggml_qnn_matmul_op_config);
DISABLE_MOVE(ggml_qnn_matmul_op_config);
};
} // namespace qnn

View File

@ -0,0 +1,38 @@
#pragma once
#include <functional>
#include <memory>
#include <string>
#include <vector>
#include "op-config-base.hpp"
#include "qnn-lib.hpp"
#include "qnn-types.hpp"
#include "tensor.hpp"
namespace qnn {
constexpr const size_t kGgmlUnaryOpStart = GGML_OP_COUNT;
// TODO: move to a better place
void append_tensor_shape_and_type(const ggml_tensor * tensor, std::string & output);
size_t get_qnn_op_index(const ggml_tensor * tensor);
const char * get_qnn_op_name(const ggml_tensor * op);
void get_qnn_op_desc(const ggml_tensor * op, bool append_dimensions, ggml_type override_data_type,
std::string & output);
std::shared_ptr<ggml_qnn_op_config> create_op(const ggml_tensor * op, const std::string & name,
qnn_instance_ptr qnn_instance);
inline bool add_op_to_graph(Qnn_GraphHandle_t graph_handle, std::vector<qnn_op_config_ptr_t> & operations) {
for (auto & op : operations) {
if (!op->add_op_to_graph(graph_handle)) {
return false;
}
}
return true;
}
} // namespace qnn

View File

@ -0,0 +1,571 @@
#include "qnn-lib.hpp"
#include <filesystem>
#include "common.hpp"
#include "rpc-mem.hpp"
#if defined(__linux__)
# include <unistd.h>
#endif
namespace {
#ifdef _WIN32
# define PLATFORM_LIB_FILENAME(name) (name ".dll")
#else
# define PLATFORM_LIB_FILENAME(name) ("lib" name ".so")
#endif
#if defined(__aarch64__) || defined(_M_ARM64) // TODO: check for other platforms
# define PLATFORM_LIB_POSFIX "_aarch64"
#else
# define PLATFORM_LIB_POSFIX "_x64"
#endif
constexpr const char * kQnnSystemLibName = PLATFORM_LIB_FILENAME("QnnSystem");
constexpr const char * kQnnCpuLibName = PLATFORM_LIB_FILENAME("QnnCpu");
constexpr const char * kQnnGpuLibName = PLATFORM_LIB_FILENAME("QnnGpu");
constexpr const char * kQnnNpuLibName = PLATFORM_LIB_FILENAME("QnnHtp");
constexpr const char * kQnnCpuPackageLibName = PLATFORM_LIB_FILENAME("QnnGgmlOpPackage" PLATFORM_LIB_POSFIX);
constexpr const qnn::device_caps kDeviceCaps[] = {
{
// https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/CpuOpDefSupplement.html#matmul
kQnnCpuLibName, GGML_BACKEND_DEVICE_TYPE_ACCEL, (1L << GGML_TYPE_I8) | (1L << GGML_TYPE_F32),
#ifdef GGML_HEXAGON_ENABLE_QUANTIZED_TENSORS
// all quantized types can be offloaded to the CPU backend; in the current implementation they are dequantized into float32 on the CPU
0xFFFFFE,
#else
(1L << GGML_TYPE_F32),
#endif
0, // 0 for no limitation
},
{
// https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/GpuOpDefSupplement.html#matmul
kQnnGpuLibName, GGML_BACKEND_DEVICE_TYPE_GPU, (1L << GGML_TYPE_F32) | (1L << GGML_TYPE_F16),
#ifdef GGML_HEXAGON_ENABLE_QUANTIZED_TENSORS
// all quantized types can be offloaded to the GPU backend; in the current implementation they are dequantized into float32 on the CPU
0xFFFFFE,
#else
(1L << GGML_TYPE_F32) | (1L << GGML_TYPE_F16),
#endif
(128256L * 4096 *
sizeof(float)), // tested on SD 8 Gen 2: allocating a 128256x4096 float32 tensor failed
},
{
// https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/HtpOpDefSupplement.html#matmul
kQnnNpuLibName, GGML_BACKEND_DEVICE_TYPE_ACCEL,
(1L << GGML_TYPE_F32) | (1L << GGML_TYPE_F16) | (1L << GGML_TYPE_I16),
#ifdef GGML_HEXAGON_ENABLE_QUANTIZED_TENSORS
(1L << GGML_TYPE_Q2_K) | (1L << GGML_TYPE_Q3_K) | (1L << GGML_TYPE_Q4_K) | (1L << GGML_TYPE_Q8_K),
#else
(1L << GGML_TYPE_F32) | (1L << GGML_TYPE_F16),
#endif
(8192L * 2048 + 8192 * 512 + 2048 * 512) * sizeof(float), // TODO: should have a better way to get this value
},
};
static_assert(sizeof(kDeviceCaps) / sizeof(kDeviceCaps[0]) == QNN_BACKEND_COUNT,
"The number of qnn devices should be equal to QNN_BACKEND_COUNT");
static_assert(kDeviceCaps[QNN_BACKEND_NPU].type == GGML_BACKEND_DEVICE_TYPE_ACCEL,
"The NPU device should be an accelerator device");
static_assert(kDeviceCaps[QNN_BACKEND_GPU].type == GGML_BACKEND_DEVICE_TYPE_GPU,
"The GPU device should be an GPU device");
static_assert(
kDeviceCaps[QNN_BACKEND_CPU].type == GGML_BACKEND_DEVICE_TYPE_ACCEL,
"The CPU device should be an accelerator device"); // we treat qnn-cpu as a supplementary accelerator device
static_assert(GGML_TYPE_Q4_0 == 2 && GGML_TYPE_Q8_K == 15, "The quantized type order is not correct");
void insert_path(std::string & path, std::string insert_path, const char separator = ':') {
if (!insert_path.empty() && !path.empty()) {
insert_path += separator;
}
path.insert(0, insert_path);
}
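A quick standalone check of what `insert_path` produces; the helper is re-declared locally so the snippet compiles on its own, and the `/data/local/tmp` prefix is just an illustrative value:

```cpp
#include <cassert>
#include <string>

// same logic as insert_path above: prepend `prefix` to `path`,
// adding a separator only when both sides are non-empty
static void insert_path_demo(std::string & path, std::string prefix, const char separator = ':') {
    if (!prefix.empty() && !path.empty()) {
        prefix += separator;
    }
    path.insert(0, prefix);
}

int main() {
    std::string path = "/vendor/lib64";
    insert_path_demo(path, "/data/local/tmp");
    assert(path == "/data/local/tmp:/vendor/lib64");

    std::string empty;
    insert_path_demo(empty, "/vendor/dsp/cdsp");  // no separator when the original path was empty
    assert(empty == "/vendor/dsp/cdsp");
    return 0;
}
```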
// TODO: Fix this for other platforms, or use a more portable way to set the library search path
bool set_qnn_lib_search_path(const std::string & custom_lib_search_path) {
#if defined(__linux__)
{
auto * original = getenv("LD_LIBRARY_PATH");
std::string lib_search_path = original ? original : "";
insert_path(lib_search_path,
"/vendor/dsp/cdsp:/vendor/lib64:"
"/vendor/dsp/dsp:/vendor/dsp/images");
insert_path(lib_search_path, custom_lib_search_path);
if (setenv("LD_LIBRARY_PATH", lib_search_path.c_str(), 1)) {
return false;
}
}
# if defined(__ANDROID__) || defined(ANDROID)
{
// See also: https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-2/dsp_runtime.html
std::string adsp_lib_search_path = custom_lib_search_path +
";/vendor/dsp/cdsp;/vendor/lib/rfsa/adsp;/system/lib/"
"rfsa/adsp;/vendor/dsp/dsp;/vendor/dsp/images;/dsp";
if (setenv("ADSP_LIBRARY_PATH", adsp_lib_search_path.c_str(), 1)) {
return false;
}
QNN_LOG_DEBUG("ADSP_LIBRARY_PATH=%s", getenv("ADSP_LIBRARY_PATH\n"));
}
# endif
QNN_LOG_DEBUG("LD_LIBRARY_PATH=%s", getenv("LD_LIBRARY_PATH\n"));
#else
(void) custom_lib_search_path;
#endif
return true;
}
common::dl_handler_t load_lib_with_fallback(const std::string & lib_path, const std::string & load_directory) {
std::filesystem::path full_path(load_directory);
full_path /= std::filesystem::path(lib_path).filename();
auto handle = common::dl_load(full_path.string());
if (!handle) {
QNN_LOG_WARN("failed to load %s, fallback to %s\n", full_path.c_str(), lib_path.c_str());
handle = common::dl_load(lib_path);
}
return handle;
}
struct op_package_lib_info {
const char * lib_name;
const char * interface;
const char * type;
size_t htp_arch;
const char * extra_lib_name = nullptr;
};
const op_package_lib_info & get_op_package_lib_info(uint32_t soc_model, size_t htp_arch) {
constexpr static const op_package_lib_info kOpPackageLibInfo[] = {
{ kQnnCpuPackageLibName, "GgmlOpPackageInterfaceProvider", "CPU", qnn::NONE,
PLATFORM_LIB_FILENAME("HtpPrepare") },
{ PLATFORM_LIB_FILENAME("QnnGgmlOpPackage_v68"), "GgmlOpPackageInterfaceProvider", "HTP", qnn::V68 },
{ PLATFORM_LIB_FILENAME("QnnGgmlOpPackage_v69"), "GgmlOpPackageInterfaceProvider", "HTP", qnn::V69 },
{ PLATFORM_LIB_FILENAME("QnnGgmlOpPackage_v73"), "GgmlOpPackageInterfaceProvider", "HTP", qnn::V73 },
{ PLATFORM_LIB_FILENAME("QnnGgmlOpPackage_v75"), "GgmlOpPackageInterfaceProvider", "HTP", qnn::V75 },
{ PLATFORM_LIB_FILENAME("QnnGgmlOpPackage_v79"), "GgmlOpPackageInterfaceProvider", "HTP", qnn::V79 },
};
if (soc_model == qnn::UNKNOWN || soc_model == qnn::EMULATOR_X64 || soc_model == qnn::EMULATOR_AARCH64) {
return kOpPackageLibInfo[0];
}
switch (htp_arch) {
case qnn::V68:
static_assert(kOpPackageLibInfo[1].htp_arch == qnn::V68);
return kOpPackageLibInfo[1];
case qnn::V69:
static_assert(kOpPackageLibInfo[2].htp_arch == qnn::V69);
return kOpPackageLibInfo[2];
case qnn::V73:
static_assert(kOpPackageLibInfo[3].htp_arch == qnn::V73);
return kOpPackageLibInfo[3];
case qnn::V75:
static_assert(kOpPackageLibInfo[4].htp_arch == qnn::V75);
return kOpPackageLibInfo[4];
case qnn::V79:
default:
static_assert(kOpPackageLibInfo[5].htp_arch == qnn::V79);
return kOpPackageLibInfo[5];
}
}
} // namespace
namespace qnn {
qnn_system_interface::qnn_system_interface(const QnnSystemInterface_t & qnn_sys_interface,
common::dl_handler_t lib_handle) :
_qnn_sys_interface(qnn_sys_interface),
_lib_handle(lib_handle) {
qnn_system_context_create(&_qnn_system_handle);
if (_qnn_system_handle) {
QNN_LOG_INFO("initialize qnn system successfully\n");
} else {
QNN_LOG_WARN("can not create QNN system contenxt\n");
}
}
qnn_system_interface::~qnn_system_interface() {
if (_qnn_system_handle) {
if (qnn_system_context_free(_qnn_system_handle) != QNN_SUCCESS) {
QNN_LOG_WARN("failed to free QNN system context\n");
}
} else {
QNN_LOG_WARN("system handle is null\n");
}
if (_lib_handle) {
if (!common::dl_unload(_lib_handle)) {
QNN_LOG_WARN("failed to close QnnSystem library, error %s\n", common::dl_error());
}
} else {
QNN_LOG_WARN("system lib handle is null\n");
}
}
qnn_instance::qnn_instance(const std::string & lib_path, backend_index_type device) :
_additional_lib_load_path(lib_path) {
_backend_lib_name = kDeviceCaps[device].lib_name;
if (set_qnn_lib_search_path(lib_path)) {
QNN_LOG_DEBUG("[%s] set_qnn_lib_search_path succeed\n", _backend_lib_name.c_str());
} else {
QNN_LOG_ERROR("[%s] set_qnn_lib_search_path failed\n", _backend_lib_name.c_str());
}
}
bool qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) {
BackendIdType backend_id = QNN_BACKEND_ID_NULL;
QNN_LOG_DEBUG("enter qnn_init\n");
std::lock_guard<std::mutex> lock(_init_mutex);
if (load_system() != 0) {
QNN_LOG_WARN("failed to load QNN system lib\n");
return false;
} else {
QNN_LOG_DEBUG("load QNN system lib successfully\n");
}
std::string backend_lib_path = _backend_lib_name;
if (_lib_path_to_backend_id.count(backend_lib_path) == 0) {
if (!load_backend(backend_lib_path, saver_config)) {
QNN_LOG_WARN("failed to load QNN backend\n");
return false;
}
}
backend_id = _lib_path_to_backend_id[backend_lib_path];
if (_loaded_backend.count(backend_id) == 0 || _loaded_lib_handle.count(backend_id) == 0) {
QNN_LOG_WARN(
"library %s is loaded but loaded backend count=%zu, "
"loaded lib_handle count=%zu",
backend_lib_path.c_str(), _loaded_backend.count(backend_id), _loaded_lib_handle.count(backend_id));
return false;
}
_qnn_interface = std::make_shared<qnn_interface>(*_loaded_backend[backend_id]);
_qnn_interface->qnn_log_create(qnn::sdk_logcallback, _qnn_log_level, &_qnn_log_handle);
if (!_qnn_log_handle) {
// the NPU backend may not work on low-end phones with certain Qualcomm SoCs
QNN_LOG_WARN("failed to initialize qnn log\n");
return false;
} else {
QNN_LOG_DEBUG("initialize qnn log successfully\n");
}
std::vector<const QnnBackend_Config_t *> temp_backend_config;
_qnn_interface->qnn_backend_create(
_qnn_log_handle, temp_backend_config.empty() ? nullptr : temp_backend_config.data(), &_qnn_backend_handle);
if (!_qnn_backend_handle) {
QNN_LOG_WARN("failed to initialize qnn backend\n");
return false;
} else {
QNN_LOG_DEBUG("initialize qnn backend successfully\n");
}
auto qnn_status = _qnn_interface->qnn_property_has_capability(QNN_PROPERTY_GROUP_DEVICE);
switch (qnn_status) {
case QNN_PROPERTY_NOT_SUPPORTED:
QNN_LOG_WARN("device property is not supported\n");
break;
case QNN_PROPERTY_ERROR_UNKNOWN_KEY:
QNN_LOG_WARN("device property is unknown\n");
break;
}
{
const QnnDevice_PlatformInfo_t * p_info = nullptr;
qnn_status = _qnn_interface->qnn_device_get_platform_info(nullptr, &p_info);
if (qnn_status == QNN_SUCCESS) {
QNN_LOG_INFO("device counts %d\n", p_info->v1.numHwDevices);
QnnDevice_HardwareDeviceInfo_t * infos = p_info->v1.hwDevices;
for (uint32_t i = 0; i < p_info->v1.numHwDevices; i++) {
QNN_LOG_INFO("deviceID:%d, deviceType:%d, numCores %d\n", (int) infos[i].v1.deviceId,
(int) infos[i].v1.deviceType, (int) infos[i].v1.numCores);
QnnDevice_DeviceInfoExtension_t devinfo = infos[i].v1.deviceInfoExtension;
QnnHtpDevice_OnChipDeviceInfoExtension_t chipinfo = devinfo->onChipDevice;
size_t htp_arch = (size_t) chipinfo.arch;
QNN_LOG_INFO("htp_type:%d(%s)\n", devinfo->devType,
(devinfo->devType == QNN_HTP_DEVICE_TYPE_ON_CHIP) ? "ON_CHIP" : "");
QNN_LOG_INFO("soc_model:%s(%s), htp_arch:%s(%d), vtcm_size:%d MB\n",
get_chipset_desc(chipinfo.socModel), get_chipset_model(chipinfo.socModel),
get_htparch_desc(htp_arch), (int) htp_arch, (int) chipinfo.vtcmSize);
}
if (p_info->v1.numHwDevices) {
QnnDevice_DeviceInfoExtension_t devinfo = infos[p_info->v1.numHwDevices - 1].v1.deviceInfoExtension;
QnnHtpDevice_OnChipDeviceInfoExtension_t chipinfo = devinfo->onChipDevice;
size_t htp_arch = (size_t) chipinfo.arch;
_soc_info = { chipinfo.socModel, htp_arch, chipinfo.vtcmSize };
}
_qnn_interface->qnn_device_free_platform_info(nullptr, p_info);
} else {
// For emulator, we can't get platform info
QNN_LOG_INFO("failed to get platform info, emulator or cpu backend?\n");
#if defined(__aarch64__) || defined(_M_ARM64)
_soc_info = { EMULATOR_AARCH64, NONE, 0 };
#elif defined(__x86_64__) || defined(__amd64__) || defined(_M_X64)
_soc_info = { EMULATOR_X64, NONE, 0 };
#else
_soc_info = { UNKNOWN, NONE, 0 };
#endif
}
}
{
qnn_status = _qnn_interface->qnn_device_create(_qnn_log_handle, nullptr, &_qnn_device_handle);
if (QNN_SUCCESS != qnn_status && QNN_DEVICE_ERROR_UNSUPPORTED_FEATURE != qnn_status) {
QNN_LOG_WARN("failed to create QNN device\n");
} else {
QNN_LOG_INFO("create QNN device successfully\n");
}
}
{
auto rpc_mem = std::make_unique<common::rpc_mem>();
if (rpc_mem->is_valid()) {
_rpc_mem = std::move(rpc_mem);
}
}
_qnn_interface->qnn_context_create(_qnn_backend_handle, _qnn_device_handle, nullptr, &_qnn_context_handle);
if (!_qnn_context_handle) {
QNN_LOG_WARN("failed to initialize qnn context\n");
return false;
} else {
QNN_LOG_DEBUG("initialize qnn context successfully\n");
}
if (_backend_lib_name.find("Htp") != _backend_lib_name.npos) {
if (init_htp_perfinfra() != 0) {
QNN_LOG_WARN("initialize HTP performance failure\n");
}
if (set_rpc_polling() != 0) {
QNN_LOG_WARN("set RPC polling failure\n");
}
if (set_high_performance_mode() != 0) {
QNN_LOG_WARN("set HTP high performance mode failure\n");
}
}
QNN_LOG_DEBUG("leave qnn_init\n");
return true;
}
bool qnn_instance::qnn_finalize() {
if (_backend_lib_name.find("Htp") != _backend_lib_name.npos) {
_qnn_htp_perfinfra->destroyPowerConfigId(_qnn_power_configid);
}
if (_qnn_context_handle) {
auto error = _qnn_interface->qnn_context_free(_qnn_context_handle, nullptr);
if (error != QNN_SUCCESS) {
QNN_LOG_WARN("failed to free QNN context_handle: ID %u, error %d\n", _qnn_interface->get_backend_id(),
(int) QNN_GET_ERROR_CODE(error));
}
_qnn_context_handle = nullptr;
}
if (_qnn_device_handle) {
auto error = _qnn_interface->qnn_device_free(_qnn_device_handle);
if (error != QNN_SUCCESS) {
QNN_LOG_WARN("failed to free QNN device_handle: ID %u, error %d\n", _qnn_interface->get_backend_id(),
(int) QNN_GET_ERROR_CODE(error));
}
_qnn_device_handle = nullptr;
}
if (_qnn_backend_handle) {
auto error = _qnn_interface->qnn_backend_free(_qnn_backend_handle);
if (error != QNN_SUCCESS) {
QNN_LOG_WARN("failed to free QNN backend_handle: ID %u, error %d\n", _qnn_interface->get_backend_id(),
(int) QNN_GET_ERROR_CODE(error));
}
_qnn_backend_handle = nullptr;
}
if (_qnn_log_handle) {
auto error = _qnn_interface->qnn_log_free(_qnn_log_handle);
if (error != QNN_SUCCESS) {
QNN_LOG_WARN("failed to free QNN log_handle: ID %u, error %d\n", _qnn_interface->get_backend_id(),
(int) QNN_GET_ERROR_CODE(error));
}
_qnn_log_handle = nullptr;
}
if (_custom_op_extra_lib_handle) {
common::dl_unload(_custom_op_extra_lib_handle);
}
unload_backend();
_qnn_sys_interface.reset();
_rpc_mem.reset();
return true;
}
int qnn_instance::load_system() {
QNN_LOG_DEBUG("[%s]lib: %s\n", _backend_lib_name.c_str(), kQnnSystemLibName);
auto system_lib_handle = load_lib_with_fallback(kQnnSystemLibName, _additional_lib_load_path);
if (!system_lib_handle) {
QNN_LOG_WARN("can not load QNN library %s, error: %s\n", kQnnSystemLibName, common::dl_error());
return 1;
}
auto * get_providers = common::dl_sym_typed<qnn::pfn_qnnsysteminterface_getproviders *>(
system_lib_handle, "QnnSystemInterface_getProviders");
if (!get_providers) {
QNN_LOG_WARN("can not load QNN symbol QnnSystemInterface_getProviders: %s\n", common::dl_error());
return 2;
}
uint32_t num_providers = 0;
const QnnSystemInterface_t ** provider_list = nullptr;
Qnn_ErrorHandle_t error = get_providers(&provider_list, &num_providers);
if (error != QNN_SUCCESS) {
QNN_LOG_WARN("failed to get providers, error %d\n", (int) QNN_GET_ERROR_CODE(error));
return 3;
}
QNN_LOG_DEBUG("num_providers: %d\n", num_providers);
if (num_providers != _required_num_providers) {
QNN_LOG_WARN("providers is %d instead of required %d\n", (int) num_providers, (int) _required_num_providers);
return 4;
}
if (!provider_list) {
QNN_LOG_WARN("can not get providers\n");
return 5;
}
QNN_SYSTEM_INTERFACE_VER_TYPE qnn_system_interface;
bool found_valid_system_interface = false;
for (size_t idx = 0; idx < num_providers; idx++) {
if (QNN_SYSTEM_API_VERSION_MAJOR == provider_list[idx]->systemApiVersion.major &&
QNN_SYSTEM_API_VERSION_MINOR <= provider_list[idx]->systemApiVersion.minor) {
found_valid_system_interface = true;
qnn_system_interface = provider_list[idx]->QNN_SYSTEM_INTERFACE_VER_NAME;
break;
}
}
if (!found_valid_system_interface) {
QNN_LOG_WARN("unable to find a valid qnn system interface\n");
return 6;
} else {
QNN_LOG_DEBUG("find a valid qnn system interface\n");
}
auto qnn_sys_interface = std::make_shared<qnn::qnn_system_interface>(*provider_list[0], system_lib_handle);
if (!qnn_sys_interface->is_valid()) {
QNN_LOG_WARN("failed to create QNN system interface\n");
return 7;
}
_qnn_sys_interface = qnn_sys_interface;
return 0;
}
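The provider loop above accepts an interface whose major version matches the headers the backend was built against and whose minor version is at least as new. A self-contained sketch of that compatibility rule (toy version numbers, not the real QNN constants):

```cpp
#include <cassert>
#include <cstdint>

// same rule as the provider loop: equal major version, provider minor >= built minor
static bool api_compatible(uint32_t built_major, uint32_t built_minor,
                           uint32_t provider_major, uint32_t provider_minor) {
    return built_major == provider_major && built_minor <= provider_minor;
}

int main() {
    assert(api_compatible(2, 14, 2, 20));   // newer minor from the provider is accepted
    assert(!api_compatible(2, 14, 2, 10));  // provider minor too old
    assert(!api_compatible(2, 14, 3, 0));   // major mismatch
    return 0;
}
```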
bool qnn_instance::load_backend(std::string & lib_path, const QnnSaver_Config_t ** /*saver_config*/) {
QNN_LOG_DEBUG("lib_path:%s\n", lib_path.c_str());
auto lib_handle = load_lib_with_fallback(lib_path, _additional_lib_load_path);
if (!lib_handle) {
QNN_LOG_WARN("can not open QNN library %s, with error: %s\n", lib_path.c_str(), common::dl_error());
return false;
}
auto get_providers =
common::dl_sym_typed<qnn::pfn_qnninterface_getproviders *>(lib_handle, "QnnInterface_getProviders");
if (!get_providers) {
QNN_LOG_WARN("can not load symbol QnnInterface_getProviders : %s\n", common::dl_error());
common::dl_unload(lib_handle);
return false;
}
std::uint32_t num_providers = 0;
const QnnInterface_t ** provider_list = nullptr;
auto error = get_providers(&provider_list, &num_providers);
if (error != QNN_SUCCESS) {
QNN_LOG_WARN("failed to get providers, error %d\n", (int) QNN_GET_ERROR_CODE(error));
common::dl_unload(lib_handle);
return false;
}
QNN_LOG_DEBUG("num_providers=%d\n", num_providers);
if (num_providers != _required_num_providers) {
QNN_LOG_WARN("providers is %d instead of required %d\n", num_providers, _required_num_providers);
common::dl_unload(lib_handle);
return false;
}
if (!provider_list) {
QNN_LOG_WARN("failed to get qnn interface providers\n");
common::dl_unload(lib_handle);
return false;
}
bool found_valid_interface = false;
QNN_INTERFACE_VER_TYPE qnn_interface;
for (size_t idx = 0; idx < num_providers; idx++) {
if (QNN_API_VERSION_MAJOR == provider_list[idx]->apiVersion.coreApiVersion.major &&
QNN_API_VERSION_MINOR <= provider_list[idx]->apiVersion.coreApiVersion.minor) {
found_valid_interface = true;
qnn_interface = provider_list[idx]->QNN_INTERFACE_VER_NAME;
break;
}
}
if (!found_valid_interface) {
QNN_LOG_WARN("unable to find a valid qnn interface\n");
common::dl_unload(lib_handle);
return false;
} else {
QNN_LOG_DEBUG("find a valid qnn interface\n");
}
BackendIdType backend_id = provider_list[0]->backendId;
_lib_path_to_backend_id[lib_path] = backend_id;
if (_loaded_backend.count(backend_id) > 0) {
QNN_LOG_WARN("lib_path %s is loaded, but backend %d already exists\n", lib_path.c_str(), backend_id);
}
_loaded_backend[backend_id] = provider_list[0];
if (_loaded_lib_handle.count(backend_id) > 0) {
QNN_LOG_WARN("closing %p\n", _loaded_lib_handle[backend_id]);
if (!common::dl_unload(_loaded_lib_handle[backend_id])) {
QNN_LOG_WARN("fail to close %p with error %s\n", _loaded_lib_handle[backend_id], common::dl_error());
}
}
_loaded_lib_handle[backend_id] = lib_handle;
_backend_id = backend_id;
return true;
}
void qnn_instance::unload_backend() {
for (auto & it : _loaded_lib_handle) {
if (!common::dl_unload(it.second)) {
QNN_LOG_WARN("failed to close QNN backend %d, error %s\n", it.first, common::dl_error());
}
}
_loaded_lib_handle.clear();
_lib_path_to_backend_id.clear();
_loaded_backend.clear();
}
const device_caps & get_device_caps(backend_index_type device) {
return kDeviceCaps[device];
}
} // namespace qnn

View File

@ -0,0 +1,459 @@
#pragma once
#include <algorithm>
#include <atomic>
#include <cmath>
#include <cstring>
#include <memory>
#include <mutex>
#include <string>
#include <unordered_map>
#include <vector>
// header files of the Qualcomm QNN (Qualcomm Neural Network, aka Qualcomm AI Engine Direct) SDK
// https://qpm.qualcomm.com/#/main/tools/details/qualcomm_ai_engine_direct
#include <HTP/QnnHtpDevice.h>
#include <HTP/QnnHtpGraph.h>
#include <QnnBackend.h>
#include <QnnCommon.h>
#include <QnnContext.h>
#include <QnnGraph.h>
#include <QnnInterface.h>
#include <QnnProperty.h>
#include <QnnTensor.h>
#include <QnnTypes.h>
#include <System/QnnSystemInterface.h>
#include "dyn-lib-loader.hpp"
#include "qnn-types.hpp"
#include "rpc-mem.hpp"
#include "utils.hpp"
namespace qnn {
// =================================================================================================
//
// wrapper classes for the Qualcomm QNN (Qualcomm Neural Network, aka Qualcomm AI Engine Direct) SDK
// ref:https://github.com/pytorch/executorch/tree/main/backends/qualcomm
// =================================================================================================
// TODO: fix this for other compilers
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wextra-semi"
#pragma GCC diagnostic ignored "-Wpedantic"
class qnn_system_interface {
#define DEFINE_SHIM_FUNCTION_SYS_INTERFACE(F, pointer_name) \
template <typename... Args> inline auto qnn_##F(Args... args) const { \
return (_qnn_sys_interface.QNN_SYSTEM_INTERFACE_VER_NAME.pointer_name)(std::forward<Args>(args)...); \
}
public:
qnn_system_interface(const QnnSystemInterface_t & qnn_sys_interface, common::dl_handler_t lib_handle);
~qnn_system_interface();
bool is_valid() const { return _qnn_system_handle != nullptr; }
// QnnSystem
DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_create, systemContextCreate);
DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_get_binary_info, systemContextGetBinaryInfo);
DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_free, systemContextFree);
private:
qnn_system_interface(const qnn_system_interface &) = delete;
void operator=(const qnn_system_interface &) = delete;
qnn_system_interface(qnn_system_interface &&) = delete;
void operator=(qnn_system_interface &&) = delete;
const QnnSystemInterface_t _qnn_sys_interface = {};
common::dl_handler_t _lib_handle = nullptr;
QnnSystemContext_Handle_t _qnn_system_handle = nullptr;
};
class qnn_interface {
#define DEFINE_SHIM_FUNCTION_INTERFACE(F, pointer_name) \
template <typename... Args> inline auto qnn_##F(Args... args) const { \
return (_qnn_interface.QNN_INTERFACE_VER_NAME.pointer_name)(std::forward<Args>(args)...); \
}
public:
qnn_interface(const QnnInterface_t & qnn_interface) : _qnn_interface(qnn_interface) {}
// QnnBackend
DEFINE_SHIM_FUNCTION_INTERFACE(backend_create, backendCreate);
DEFINE_SHIM_FUNCTION_INTERFACE(backend_free, backendFree);
DEFINE_SHIM_FUNCTION_INTERFACE(backend_register_op_package, backendRegisterOpPackage);
DEFINE_SHIM_FUNCTION_INTERFACE(backend_validate_op_config, backendValidateOpConfig);
DEFINE_SHIM_FUNCTION_INTERFACE(backend_get_api_version, backendGetApiVersion);
// QnnDevice
DEFINE_SHIM_FUNCTION_INTERFACE(device_create, deviceCreate);
DEFINE_SHIM_FUNCTION_INTERFACE(device_free, deviceFree);
DEFINE_SHIM_FUNCTION_INTERFACE(device_get_infrastructure, deviceGetInfrastructure);
DEFINE_SHIM_FUNCTION_INTERFACE(device_get_platform_info, deviceGetPlatformInfo);
DEFINE_SHIM_FUNCTION_INTERFACE(device_free_platform_info, deviceFreePlatformInfo);
DEFINE_SHIM_FUNCTION_INTERFACE(device_get_info, deviceGetInfo);
// QnnContext
DEFINE_SHIM_FUNCTION_INTERFACE(context_create, contextCreate);
DEFINE_SHIM_FUNCTION_INTERFACE(context_get_binary_size, contextGetBinarySize);
DEFINE_SHIM_FUNCTION_INTERFACE(context_get_binary, contextGetBinary);
DEFINE_SHIM_FUNCTION_INTERFACE(context_create_from_binary, contextCreateFromBinary);
DEFINE_SHIM_FUNCTION_INTERFACE(context_free, contextFree);
// QnnGraph
DEFINE_SHIM_FUNCTION_INTERFACE(graph_create, graphCreate);
DEFINE_SHIM_FUNCTION_INTERFACE(graph_add_node, graphAddNode);
DEFINE_SHIM_FUNCTION_INTERFACE(graph_finalize, graphFinalize);
DEFINE_SHIM_FUNCTION_INTERFACE(graph_execute, graphExecute);
DEFINE_SHIM_FUNCTION_INTERFACE(graph_retrieve, graphRetrieve);
// QnnLog
DEFINE_SHIM_FUNCTION_INTERFACE(log_create, logCreate);
DEFINE_SHIM_FUNCTION_INTERFACE(log_free, logFree);
DEFINE_SHIM_FUNCTION_INTERFACE(log_set_log_level, logSetLogLevel);
// QnnProfile
DEFINE_SHIM_FUNCTION_INTERFACE(profile_create, profileCreate);
DEFINE_SHIM_FUNCTION_INTERFACE(profile_set_config, profileSetConfig);
DEFINE_SHIM_FUNCTION_INTERFACE(profile_get_events, profileGetEvents);
DEFINE_SHIM_FUNCTION_INTERFACE(profile_get_sub_events, profileGetSubEvents);
DEFINE_SHIM_FUNCTION_INTERFACE(profile_get_event_data, profileGetEventData);
DEFINE_SHIM_FUNCTION_INTERFACE(profile_free, profileFree);
// QnnMem
DEFINE_SHIM_FUNCTION_INTERFACE(mem_register, memRegister);
DEFINE_SHIM_FUNCTION_INTERFACE(mem_de_register, memDeRegister);
// QnnProperty
DEFINE_SHIM_FUNCTION_INTERFACE(property_has_capability, propertyHasCapability);
// QnnTensor
DEFINE_SHIM_FUNCTION_INTERFACE(tensor_create_context_tensor, tensorCreateContextTensor);
DEFINE_SHIM_FUNCTION_INTERFACE(tensor_create_graph_tensor, tensorCreateGraphTensor);
uint32_t get_backend_id() const { return _qnn_interface.backendId; }
private:
qnn_interface(const qnn_interface &) = delete;
void operator=(const qnn_interface &) = delete;
qnn_interface(qnn_interface &&) = delete;
void operator=(qnn_interface &&) = delete;
const QnnInterface_t _qnn_interface = {};
};
#pragma GCC diagnostic pop
using qnn_interface_ptr = std::shared_ptr<qnn_interface>;
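The DEFINE_SHIM_FUNCTION_INTERFACE macro above simply forwards each `qnn_*` call through the function-pointer table carried inside `QnnInterface_t`. A minimal self-contained analogue of the same shim pattern, using toy types rather than the real SDK structs:

```cpp
#include <cassert>
#include <utility>

// toy stand-in for the SDK's table of function pointers
struct toy_interface_table {
    int (*add)(int, int);
};

#define DEFINE_TOY_SHIM(F, pointer_name)                             \
    template <typename... Args> auto toy_##F(Args... args) const {   \
        return (_table.pointer_name)(std::forward<Args>(args)...);   \
    }

class toy_interface {
  public:
    explicit toy_interface(const toy_interface_table & table) : _table(table) {}

    DEFINE_TOY_SHIM(add, add)  // expands into toy_add(...), which calls _table.add

  private:
    toy_interface_table _table;
};

static int add_impl(int a, int b) {
    return a + b;
}

int main() {
    toy_interface iface(toy_interface_table{ add_impl });
    assert(iface.toy_add(2, 3) == 5);
    return 0;
}
```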
class qnn_instance {
public:
using BackendIdType = decltype(QnnInterface_t{}.backendId);
explicit qnn_instance(const std::string & lib_path, backend_index_type device);
~qnn_instance() {}
bool qnn_init(const QnnSaver_Config_t ** saver_config);
bool qnn_finalize();
qnn_interface_ptr get_qnn_interface() {
if (!_qnn_interface) {
QNN_LOG_WARN("pls check why _qnn_interface is not loaded\n");
}
return _qnn_interface;
}
Qnn_LogHandle_t get_qnn_log_handle() { return _qnn_log_handle; }
Qnn_DeviceHandle_t get_qnn_device_handle() { return _qnn_device_handle; }
Qnn_BackendHandle_t get_qnn_backend_handle() { return _qnn_backend_handle; }
Qnn_ContextHandle_t get_qnn_context_handle() { return _qnn_context_handle; }
Qnn_GraphHandle_t get_qnn_graph_handle() { return _qnn_graph_handle; }
int init_htp_perfinfra() {
QnnDevice_Infrastructure_t device_infra = nullptr;
auto error = _qnn_interface->qnn_device_get_infrastructure(&device_infra);
if (error != QNN_SUCCESS) {
QNN_LOG_WARN("failed to get qnn device infra\n");
return 1;
} else {
QNN_LOG_INFO("HTP backend perf_infrastructure creation ok\n");
}
QnnHtpDevice_Infrastructure_t * htp_infra = static_cast<QnnHtpDevice_Infrastructure_t *>(device_infra);
QnnHtpDevice_PerfInfrastructure_t * htp_perfinfra = &htp_infra->perfInfra;
uint32_t power_configid = 1;
uint32_t device_id = 0;
uint32_t core_id = 0;
htp_perfinfra->createPowerConfigId(device_id, core_id, &power_configid);
if (htp_infra->infraType != QNN_HTP_DEVICE_INFRASTRUCTURE_TYPE_PERF) {
QNN_LOG_INFO("HTP infra type = %d, which is not perf infra type\n", htp_infra->infraType);
} else {
QNN_LOG_INFO("HTP infra type = %d, which is perf infra type\n", htp_infra->infraType);
}
_qnn_htp_perfinfra = htp_perfinfra;
_qnn_power_configid = power_configid;
return 0;
}
int set_rpc_polling() {
if (_qnn_htp_perfinfra) {
QnnHtpPerfInfrastructure_PowerConfig_t rpc_polling_time;
memset(&rpc_polling_time, 0, sizeof(rpc_polling_time));
rpc_polling_time.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_POLLING_TIME;
// recommended RPC polling time range is 0-10000 us
rpc_polling_time.rpcPollingTimeConfig = 9999;
QnnHtpPerfInfrastructure_PowerConfig_t rpc_control_latency;
memset(&rpc_control_latency, 0, sizeof(rpc_control_latency));
rpc_control_latency.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_CONTROL_LATENCY;
// recommended RPC control latency is 100 us; see the Hexagon SDK docs
rpc_control_latency.rpcControlLatencyConfig = 100;
const QnnHtpPerfInfrastructure_PowerConfig_t * power_configs[] = { &rpc_polling_time, &rpc_control_latency,
nullptr };
Qnn_ErrorHandle_t qnn_status = _qnn_htp_perfinfra->setPowerConfig(_qnn_power_configid, power_configs);
if (qnn_status != QNN_SUCCESS) {
QNN_LOG_WARN("set htp perf failed\n");
} else {
QNN_LOG_DEBUG("set htp perf ok\n");
}
} else {
QNN_LOG_WARN("can't set htp perf\n");
}
return 0;
}
int set_high_performance_mode() {
if (!_qnn_htp_perfinfra) {
QNN_LOG_WARN("perf intra is null\n");
return 1;
}
QnnHtpPerfInfrastructure_PowerConfig_t power_config;
memset(&power_config, 0, sizeof(power_config));
power_config.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_DCVS_V3;
power_config.dcvsV3Config.setDcvsEnable = 1;
power_config.dcvsV3Config.dcvsEnable = 0;
power_config.dcvsV3Config.contextId = _qnn_power_configid;
power_config.dcvsV3Config.powerMode = QNN_HTP_PERF_INFRASTRUCTURE_POWERMODE_PERFORMANCE_MODE;
power_config.dcvsV3Config.setSleepLatency = 1; // true to apply the sleepLatency value below
power_config.dcvsV3Config.sleepLatency = 40;
power_config.dcvsV3Config.setBusParams = 1; // true to apply the bus clock parameters below
power_config.dcvsV3Config.setCoreParams = 1; // true to apply the core clock parameters below
power_config.dcvsV3Config.sleepDisable = 1; // disable sleep/LPM modes
power_config.dcvsV3Config.setSleepDisable = 1; // true to apply the sleepDisable value above
// set Bus Clock Parameters
power_config.dcvsV3Config.busVoltageCornerMin = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER;
power_config.dcvsV3Config.busVoltageCornerTarget = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER;
power_config.dcvsV3Config.busVoltageCornerMax = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER;
// set Core Clock Parameters
power_config.dcvsV3Config.coreVoltageCornerMin = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER;
power_config.dcvsV3Config.coreVoltageCornerTarget = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER;
power_config.dcvsV3Config.coreVoltageCornerMax = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER;
// set power config with different performance parameters
const QnnHtpPerfInfrastructure_PowerConfig_t * power_configs[] = { &power_config, nullptr };
Qnn_ErrorHandle_t qnn_status = QNN_SUCCESS;
qnn_status = _qnn_htp_perfinfra->setPowerConfig(_qnn_power_configid, power_configs);
if (qnn_status != QNN_SUCCESS) {
QNN_LOG_WARN("set htp high performance mode failed\n");
} else {
QNN_LOG_DEBUG("set htp high performance mode ok\n");
}
return 0;
}
std::string & get_qnn_graph_name() { return _graph_name; }
void * alloc_rpcmem(size_t bytes, size_t alignment) {
if (!_rpc_mem) {
QNN_LOG_WARN("rpc memory not initialized\n");
return nullptr;
}
auto allocate_bytes = static_cast<int64_t>(bytes + alignment);
void * buf = _rpc_mem->alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, (int) allocate_bytes);
if (!buf) {
QNN_LOG_WARN("failed to allocate rpc memory, size: %d MB\n", (int) (allocate_bytes / (1 << 20)));
return nullptr;
}
auto aligned_buf = reinterpret_cast<void *>(qnn::align_to(alignment, reinterpret_cast<intptr_t>(buf)));
bool status = _rpcmem_store_map.insert(std::pair<void *, void *>(aligned_buf, buf)).second;
if (!status) {
QNN_LOG_WARN("failed to track rpc buffer, aligned pointer already registered\n");
_rpc_mem->free(buf);
return nullptr;
}
return aligned_buf;
}
void free_rpcmem(void * buf) {
if (!_rpc_mem) {
QNN_LOG_WARN("rpc memory not initialized\n");
} else if (_rpcmem_store_map.count(buf) == 0) {
QNN_LOG_WARN("no allocated tensor\n");
} else {
_rpc_mem->free(_rpcmem_store_map[buf]);
_rpcmem_store_map.erase(buf);
}
}
int rpcmem_to_fd(void * buf) {
int fd = -1;
if (!_rpc_mem) {
QNN_LOG_WARN("rpc memory not initialized\n");
} else if (_rpcmem_store_map.count(buf) == 0) {
QNN_LOG_WARN("no allocated tensor\n");
} else {
buf = _rpcmem_store_map[buf];
fd = _rpc_mem->to_fd(buf);
}
return fd;
}
Qnn_MemHandle_t register_rpcmem(void * p_data, const uint32_t rank, uint32_t * dimensions,
Qnn_DataType_t data_type) {
if (!p_data) {
QNN_LOG_WARN("invalid param\n");
return nullptr;
}
if (!_rpc_mem) {
QNN_LOG_WARN("rpc memory not initialized\n");
return nullptr;
}
if (is_rpcmem_registered(p_data)) {
QNN_LOG_WARN("rpc memory already registered\n");
return _qnn_rpc_buffer_to_handles[p_data];
}
auto mem_fd = rpcmem_to_fd(p_data);
if (mem_fd == -1) {
QNN_LOG_WARN("failed to get file descriptor\n");
return nullptr;
}
QNN_LOG_DEBUG("mem_fd %d\n", mem_fd);
Qnn_MemDescriptor_t descriptor = {
{ rank, dimensions, nullptr },
data_type, QNN_MEM_TYPE_ION, { { mem_fd } }
};
Qnn_MemHandle_t handle = nullptr;
auto error = _qnn_interface->qnn_mem_register(_qnn_context_handle, &descriptor,
/*numDescriptors=*/1, &handle);
if (error != QNN_SUCCESS) {
QNN_LOG_WARN("failed to register shared memory, error %d, %s\n", (int) QNN_GET_ERROR_CODE(error),
strerror(error));
return nullptr;
}
_qnn_rpc_buffer_to_handles.insert({ p_data, handle });
QNN_LOG_DEBUG("successfully register shared memory handler: %p\n", handle);
return handle;
}
void unregister_rpcmem(Qnn_MemHandle_t mem_handle) {
auto error = _qnn_interface->qnn_mem_de_register(&mem_handle, 1);
if (error != QNN_SUCCESS) {
QNN_LOG_WARN("failed to unregister shared memory, error %d\n", (int) QNN_GET_ERROR_CODE(error));
}
auto it = std::find_if(_qnn_rpc_buffer_to_handles.begin(), _qnn_rpc_buffer_to_handles.end(),
[mem_handle](const auto & kv) { return kv.second == mem_handle; });
if (it == _qnn_rpc_buffer_to_handles.end()) {
QNN_LOG_WARN("failed to find shared memory handler: %p\n", mem_handle);
return;
}
_qnn_rpc_buffer_to_handles.erase(it);
}
bool is_rpcmem_allocated(void * buf) { return _rpcmem_store_map.count(buf) != 0; }
bool is_rpcmem_registered(void * buf) { return _qnn_rpc_buffer_to_handles.count(buf) != 0U; }
const qnn::qcom_socinfo & get_soc_info() { return _soc_info; }
bool has_custom_op_package() const { return _has_custom_op_package; }
private:
int load_system();
bool load_backend(std::string & lib_path, const QnnSaver_Config_t ** /*saver_config*/);
void unload_backend();
private:
static constexpr const int _required_num_providers = 1;
std::string _additional_lib_load_path;
std::string _backend_lib_name;
BackendIdType _backend_id;
#ifdef NDEBUG
QnnLog_Level_t _qnn_log_level = QNN_LOG_LEVEL_INFO; // TODO: should we consider changing this dynamically?
#else
QnnLog_Level_t _qnn_log_level = QNN_LOG_LEVEL_DEBUG;
#endif
std::shared_ptr<qnn::qnn_system_interface> _qnn_sys_interface;
std::shared_ptr<qnn::qnn_interface> _qnn_interface;
Qnn_GraphHandle_t _qnn_graph_handle = nullptr;
Qnn_LogHandle_t _qnn_log_handle = nullptr;
Qnn_DeviceHandle_t _qnn_device_handle = nullptr;
Qnn_BackendHandle_t _qnn_backend_handle = nullptr;
Qnn_ContextHandle_t _qnn_context_handle = nullptr;
QnnHtpDevice_PerfInfrastructure_t * _qnn_htp_perfinfra = nullptr;
uint32_t _qnn_power_configid = 1;
std::unordered_map<void *, Qnn_MemHandle_t> _qnn_rpc_buffer_to_handles;
std::mutex _init_mutex;
std::unordered_map<BackendIdType, common::dl_handler_t> _loaded_lib_handle;
std::unordered_map<std::string, BackendIdType> _lib_path_to_backend_id;
std::unordered_map<BackendIdType, const QnnInterface_t *> _loaded_backend;
std::unique_ptr<common::rpc_mem> _rpc_mem;
std::unordered_map<void *, void *> _rpcmem_store_map;
std::string _graph_name;
qnn::qcom_socinfo _soc_info = {};
bool _has_custom_op_package = false;
common::dl_handler_t _custom_op_extra_lib_handle = nullptr;
};
using qnn_instance_ptr = std::shared_ptr<qnn_instance>;
struct device_caps {
const char * lib_name;
enum ggml_backend_dev_type type;
// TODO: should we get this from device?
uint64_t supported_types;
// TODO: should we merge this with supported_types?
uint64_t cpu_preprocess_types;
// TODO: should we get this from device?
size_t max_tensor_size_in_bytes;
};
const device_caps & get_device_caps(backend_index_type device);
} // namespace qnn
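Both `supported_types` and `cpu_preprocess_types` in `device_caps` are bitmasks indexed by `ggml_type`, matching the `(1L << GGML_TYPE_*)` entries in `kDeviceCaps`. A hedged sketch of how such a mask is typically consulted (toy enum values standing in for `ggml_type`; the backend's actual check may differ):

```cpp
#include <cassert>
#include <cstdint>

// toy stand-ins for a few ggml_type values (the real enum lives in ggml.h)
enum toy_type : int { TOY_TYPE_F32 = 0, TOY_TYPE_F16 = 1, TOY_TYPE_Q4_0 = 2 };

static bool type_supported(uint64_t supported_mask, int type) {
    return (supported_mask & (1ULL << type)) != 0;
}

int main() {
    // e.g. a device that supports F32 and F16 only
    const uint64_t mask = (1ULL << TOY_TYPE_F32) | (1ULL << TOY_TYPE_F16);
    assert(type_supported(mask, TOY_TYPE_F32));
    assert(!type_supported(mask, TOY_TYPE_Q4_0));
    return 0;
}
```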

View File

@ -0,0 +1,51 @@
#pragma once
#include <QnnCommon.h>
#include <QnnInterface.h>
#include <QnnTypes.h>
#include <Saver/QnnSaver.h>
#include <System/QnnSystemInterface.h>
#include "common.hpp"
namespace qnn {
enum qcom_htp_arch {
NONE = 0,
V68 = 68,
V69 = 69,
V73 = 73,
V75 = 75,
V79 = 79, // SD 8 Gen 4 (SM8750)
};
enum qcom_chipset {
UNKNOWN = 0,
EMULATOR_X64 = 0xFF00, // x86_64 emulator
EMULATOR_AARCH64 = 0xFF01, // ARM64 emulator
SM8350 = 30, // v68, SD 888/888+
SM8450 = 36, // v69, SD 8 Gen 1
SA8295 = 39, // v68
SM8475 = 42, // v69, SD 8+ Gen 1
SM8550 = 43, // v73, SD 8 Gen 2
SSG2115P = 46, // v73
SM7675 = 70, // V73, SD 7+ Gen 3
SM8635 = 68, // v73, SD 8s Gen 3
SM8650 = 57, // v75, SD 8 Gen 3
SM8750 = 69, // v79, SD 8 Gen 4
};
struct qcom_socinfo {
uint32_t soc_model;
size_t htp_arch;
size_t vtcm_size_in_mb;
};
using pfn_qnnsaver_initialize = decltype(QnnSaver_initialize);
using pfn_qnninterface_getproviders = decltype(QnnInterface_getProviders);
using pfn_qnnsysteminterface_getproviders = decltype(QnnSystemInterface_getProviders);
} // namespace qnn
#define RPCMEM_DEFAULT_FLAGS 1
#define RPCMEM_HEAP_ID_SYSTEM 25

View File

@ -0,0 +1,443 @@
#pragma once
#include <algorithm>
#include <array>
#include <atomic>
#include <cstddef>
#include <cstdint>
#include <memory>
#include <string>
#include "buffer.hpp"
#include "ggml-qnn.h"
#include "logger.hpp"
#include "qnn-lib.hpp"
#include "utils.hpp"
namespace qnn {
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS should be 4");
class ggml_qnn_tensor : public std::enable_shared_from_this<ggml_qnn_tensor> {
public:
typedef enum _tensor_type { INPUT, OUTPUT, INTERMEDIATE, PARAMETER, BIDIRECTION } tensor_type_t;
explicit ggml_qnn_tensor(tensor_type_t tensor_type, const std::string & name,
const qnn_dimension_array_t & dimensions, Qnn_DataType_t data_type, int rank,
backend_index_type device, Qnn_GraphHandle_t graph_handle, qnn_instance_ptr qnn_instance) :
_tensor_name(name),
_device(device),
_qnn_instance(qnn_instance),
_graph_handle(graph_handle) {
if (!_tensor_name.empty()) {
QNN_TENSOR_SET_NAME(_qnn_tensor, _tensor_name.c_str());
}
_dimensions = dimensions;
QNN_TENSOR_SET_DIMENSIONS(_qnn_tensor, _dimensions.data());
QNN_TENSOR_SET_DATA_FORMAT(_qnn_tensor, QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER);
update_params_from_ggml_tensor(tensor_type, data_type, rank);
QNN_LOG_DEBUG("[%s][%s]created, rank: %d, dims: [%d, %d, %d, %d], type: %s\n", get_backend_name(device),
_tensor_name.c_str(), rank, (int) _dimensions[0], (int) _dimensions[1], (int) _dimensions[2],
(int) _dimensions[3], qnn_datatype_to_string(data_type));
}
explicit ggml_qnn_tensor(tensor_type_t tensor_type, const std::string & name,
const ggml_dimension_array_t & dimensions, ggml_type data_type, int rank,
backend_index_type device, Qnn_GraphHandle_t graph_handle, qnn_instance_ptr qnn_instance) :
ggml_qnn_tensor(tensor_type, name, get_internal_dimension(dimensions, rank),
qnn_datatype_from_ggml_datatype(data_type), rank, device, graph_handle, qnn_instance) {}
~ggml_qnn_tensor() {
_rpc_buffer.reset();
unbind();
}
bool set_data_buffer(const uint8_t * buffer, const size_t buffer_size) {
auto qnn_buffer = std::make_shared<qnn_mem_buffer>(buffer, buffer_size);
if (bind_buffer_impl(qnn_buffer)) {
return true;
}
_can_unbind = false;
return false;
}
bool set_data_buffer(qnn_buffer_ptr buffer) {
if (bind_buffer_impl(buffer)) {
return true;
}
_can_unbind = false;
return false;
}
bool alloc_qnn_tensor_id() {
if (QNN_TENSOR_GET_ID(_qnn_tensor)) {
QNN_LOG_DEBUG("[%s]tensor already has a id: %d\n", _tensor_name.c_str(), QNN_TENSOR_GET_ID(_qnn_tensor));
return true;
}
Qnn_Tensor_t qnn_tensor = _qnn_tensor;
auto qnn_interface = _qnn_instance->get_qnn_interface();
auto error = qnn_interface->qnn_tensor_create_graph_tensor(_graph_handle, &qnn_tensor);
if (error != QNN_SUCCESS) {
QNN_LOG_ERROR("[%s]allocate id failed, error: %s\n", _tensor_name.c_str(), get_qnn_error_string(error));
return false;
}
QNN_TENSOR_SET_ID(_qnn_tensor, QNN_TENSOR_GET_ID(qnn_tensor));
QNN_LOG_DEBUG("[%s][%s]allocated id: %d, rank: %d\n", get_backend_name(_device), _tensor_name.c_str(),
QNN_TENSOR_GET_ID(qnn_tensor), QNN_TENSOR_GET_RANK(qnn_tensor));
return true;
}
bool bind_ggml_tensor(ggml_tensor * tensor, qnn_buffer_ptr buffer) {
if (!_can_unbind) {
QNN_LOG_DEBUG("[%s]already has buffer storage, skip bind\n", _tensor_name.c_str());
return true;
}
#ifndef NDEBUG
if (tensor->view_src) {
auto * src = tensor->view_src;
QNN_LOG_DEBUG("[%s]tensor(%s_%dx%dx%dx%d) is a view, src: %s_%dx%dx%dx%d\n", get_backend_name(_device),
tensor->name, (int) tensor->ne[0], (int) tensor->ne[1], (int) tensor->ne[2],
(int) tensor->ne[3], src->name, (int) src->ne[0], (int) src->ne[1], (int) src->ne[2],
(int) src->ne[3]);
}
#endif
if (!buffer) {
buffer =
std::make_shared<qnn_mem_buffer_slice>(reinterpret_cast<uint8_t *>(tensor->data), ggml_nbytes(tensor));
QNN_LOG_DEBUG("[%s][%s]attach buffer to tensor(%s), size: %d\n", get_backend_name(_device),
_tensor_name.c_str(), tensor->name, (int) buffer->get_size());
}
if (!bind_buffer_impl(buffer)) {
QNN_LOG_WARN("[%s]failed to bind ggml tensor(%s)\n", _tensor_name.c_str(), ggml_get_name(tensor));
return false;
}
QNN_LOG_DEBUG("[%s][%s]bind to ggml tensor(%s)\n", get_backend_name(_device), _tensor_name.c_str(),
ggml_get_name(tensor));
tensor->extra = this;
_ggml_tensor = tensor;
return true;
}
bool unbind() {
if (!_graph_handle) {
QNN_LOG_WARN("[%s]not bound to any graph\n", _tensor_name.c_str());
return false;
}
if (!_buffer) {
QNN_LOG_DEBUG("[%s]unbind to ggml tensor\n", _tensor_name.c_str());
return true;
}
if (!read_from_qnn_tensor()) {
QNN_LOG_WARN("[%s]read from qnn tensor failed\n", _tensor_name.c_str());
return false;
}
if (!_can_unbind) {
QNN_LOG_DEBUG("[%s]already has buffer storage, stop unbind\n", _tensor_name.c_str());
return true;
}
if (!should_use_mem_handle()) {
QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_RAW);
Qnn_ClientBuffer_t client_buf = {};
QNN_TENSOR_SET_CLIENT_BUF(_qnn_tensor, client_buf);
QNN_LOG_DEBUG("[%s]clear client buffer\n", _tensor_name.c_str());
}
QNN_LOG_DEBUG("[%s][%s]unbind from buffer: %p, size: %d\n", get_backend_name(_device), _tensor_name.c_str(),
(void *) _buffer->get_buffer(), (int) _buffer->get_size());
_buffer.reset();
if (_ggml_tensor) {
_ggml_tensor->extra = nullptr;
_ggml_tensor = nullptr;
}
return true;
}
const Qnn_Tensor_t & get_qnn_tensor() const { return _qnn_tensor; }
Qnn_DataType_t get_data_type() const { return QNN_TENSOR_GET_DATA_TYPE(_qnn_tensor); }
const qnn_dimension_array_t & get_dimensions() const { return _dimensions; }
uint32_t get_rank() const { return QNN_TENSOR_GET_RANK(_qnn_tensor); }
uint32_t get_qnn_tensor_id() const { return QNN_TENSOR_GET_ID(_qnn_tensor); }
const std::string & get_tensor_name() const { return _tensor_name; }
private:
bool bind_buffer_impl(qnn_buffer_ptr buffer) {
if (_buffer) {
if (_buffer != buffer) {
QNN_LOG_WARN("[%s]has been bound to another buffer %p\n", _tensor_name.c_str(),
(void *) _buffer->get_buffer());
return false;
}
QNN_LOG_DEBUG("[%s]already bound to same ggml tensor %p\n", _tensor_name.c_str(),
(void *) _buffer->get_buffer());
return true;
}
if (QNN_TENSOR_GET_TYPE(_qnn_tensor) == QNN_TENSOR_TYPE_NATIVE) {
QNN_LOG_DEBUG("[%s]tensor type(%d) not READ/WRITE, skipping\n", _tensor_name.c_str(),
(int) QNN_TENSOR_TYPE_NATIVE);
return true;
}
if (should_use_mem_handle()) {
if (!_rpc_buffer) {
auto rpc_buffer = std::make_shared<qnn_rpc_buffer>(
_qnn_instance, buffer->get_size(), QNN_TENSOR_GET_RANK(_qnn_tensor),
QNN_TENSOR_GET_DIMENSIONS(_qnn_tensor), QNN_TENSOR_GET_DATA_TYPE(_qnn_tensor));
if (!rpc_buffer->is_valid()) {
QNN_LOG_WARN("[%s][%s]alloc rpc mem failed\n", get_backend_name(_device), _tensor_name.c_str());
return false;
}
_rpc_buffer = std::move(rpc_buffer);
}
QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_MEMHANDLE);
auto mem_handle = _rpc_buffer->get_mem_handle();
if (!mem_handle) {
QNN_LOG_WARN("[%s][%s]can't find rpcmem from qnn mem handle\n", get_backend_name(_device),
_tensor_name.c_str());
return false;
}
QNN_TENSOR_SET_MEM_HANDLE(_qnn_tensor, mem_handle);
QNN_LOG_DEBUG("[%s][%s]use mem handle %p\n", get_backend_name(_device), _tensor_name.c_str(),
QNN_TENSOR_GET_MEM_HANDLE(_qnn_tensor));
} else {
QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_RAW);
Qnn_ClientBuffer_t client_buf = { buffer->get_buffer(), (uint32_t) buffer->get_size() };
QNN_TENSOR_SET_CLIENT_BUF(_qnn_tensor, client_buf);
QNN_LOG_DEBUG("[%s][%s]use client buffer %p size %d\n", get_backend_name(_device), _tensor_name.c_str(),
client_buf.data, (int) client_buf.dataSize);
}
_buffer = buffer;
if (!write_to_qnn_tensor()) {
QNN_LOG_WARN("[%s]write to qnn tensor failed\n", _tensor_name.c_str());
return false;
}
QNN_LOG_DEBUG("[%s][%s]bind to buffer: %p, size: %d\n", get_backend_name(_device), _tensor_name.c_str(),
(void *) buffer->get_buffer(), (int) buffer->get_size());
return true;
}
bool write_to_qnn_tensor() {
auto tensor_type = QNN_TENSOR_GET_TYPE(_qnn_tensor);
if (tensor_type != QNN_TENSOR_TYPE_APP_WRITE && tensor_type != QNN_TENSOR_TYPE_APP_READWRITE) {
QNN_LOG_DEBUG("[%s]tensor type(%d) not WRITE\n", _tensor_name.c_str(), (int) tensor_type);
return true;
}
if (_rpc_buffer) {
memcpy(_rpc_buffer->get_buffer(), _buffer->get_buffer(), _buffer->get_size());
// For CPU and GPU, the data is already in the tensor.
QNN_LOG_DEBUG("[%s][%s]write buffer(%p) to rpc buffer(%p)\n", get_backend_name(_device),
_tensor_name.c_str(), (void *) _buffer->get_buffer(), (void *) _rpc_buffer->get_buffer());
}
return true;
}
bool read_from_qnn_tensor() {
auto tensor_type = QNN_TENSOR_GET_TYPE(_qnn_tensor);
if (tensor_type != QNN_TENSOR_TYPE_APP_READ && tensor_type != QNN_TENSOR_TYPE_APP_READWRITE) {
QNN_LOG_DEBUG("[%s]tensor type(%d) not READ\n", _tensor_name.c_str(), (int) tensor_type);
return true;
}
if (_rpc_buffer) {
memcpy(_buffer->get_buffer(), _rpc_buffer->get_buffer(), _buffer->get_size());
// For CPU and GPU, the data is already in the tensor.
QNN_LOG_DEBUG("[%s][%s]read buffer(%p) from rpc buffer(%p)\n", get_backend_name(_device),
_tensor_name.c_str(), (void *) _buffer->get_buffer(), (void *) _rpc_buffer->get_buffer());
}
return true;
}
void update_params_from_ggml_tensor(tensor_type_t tensor_type, Qnn_DataType_t data_type, int rank) {
QNN_TENSOR_SET_DATA_TYPE(_qnn_tensor, data_type);
// TODO: set the quantizeParams base on the tensor type
QNN_TENSOR_SET_RANK(_qnn_tensor, (uint32_t) rank);
QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_RAW);
Qnn_ClientBuffer_t client_buf = {};
QNN_TENSOR_SET_CLIENT_BUF(_qnn_tensor, client_buf);
Qnn_TensorType_t new_tensor_type;
switch (tensor_type) {
case INPUT:
new_tensor_type = QNN_TENSOR_TYPE_APP_WRITE;
break;
case OUTPUT:
new_tensor_type = QNN_TENSOR_TYPE_APP_READ;
break;
case PARAMETER:
new_tensor_type = QNN_TENSOR_TYPE_STATIC;
break;
case BIDIRECTION:
new_tensor_type = QNN_TENSOR_TYPE_APP_READWRITE;
break;
case INTERMEDIATE:
default:
new_tensor_type = QNN_TENSOR_TYPE_NATIVE;
break;
}
QNN_TENSOR_SET_TYPE(_qnn_tensor, new_tensor_type);
QNN_LOG_DEBUG("[%s][%s]new_tensor_type %s\n", get_backend_name(_device), _tensor_name.c_str(),
get_qnn_tensor_type_name(new_tensor_type));
}
bool should_use_mem_handle() const {
// TODO: figure out how to set rpc mem to multiple tensor
return false;
}
std::string _tensor_name;
qnn_buffer_ptr _buffer;
bool _can_unbind = true;
backend_index_type _device;
qnn_instance_ptr _qnn_instance;
Qnn_Tensor_t _qnn_tensor = qnn_tensor_init(kDefaultQnnTensorVersion);
qnn_dimension_array_t _dimensions = {};
Qnn_GraphHandle_t _graph_handle = nullptr;
qnn_buffer_ptr _rpc_buffer;
ggml_tensor * _ggml_tensor = nullptr;
DISABLE_COPY(ggml_qnn_tensor);
DISABLE_MOVE(ggml_qnn_tensor);
};
using qnn_tensor_ptr_t = std::shared_ptr<ggml_qnn_tensor>;
using qnn_tensor_array_t = std::vector<qnn_tensor_ptr_t>;
using ggml_tensor_array_t = std::vector<ggml_tensor *>;
inline qnn_tensor_ptr_t get_qnn_tensor_ptr(ggml_tensor * ggml_tensor) {
return ggml_tensor->extra ? reinterpret_cast<ggml_qnn_tensor *>(ggml_tensor->extra)->shared_from_this() :
qnn_tensor_ptr_t();
}
inline int get_ggml_tensors_max_rank(const qnn::ggml_tensor_array_t & tensors) {
int max_rank = 0;
for (auto tensor : tensors) {
max_rank = std::max(max_rank, ggml_n_dims(tensor));
}
return max_rank;
}
inline bool bind_tensors_with_custom_buffers(const ggml_tensor_array_t & ggml_tensors,
std::vector<qnn_buffer_ptr> & buffers,
qnn_tensor_array_t & tensor_wrappers,
std::vector<Qnn_Tensor_t> & qnn_tensors) {
GGML_ASSERT(tensor_wrappers.size() == ggml_tensors.size());
GGML_ASSERT(buffers.size() == ggml_tensors.size());
qnn_tensors.resize(ggml_tensors.size());
for (size_t i = 0; i < ggml_tensors.size(); i++) {
auto * ggml_tensor = ggml_tensors[i];
if (!tensor_wrappers[i]->bind_ggml_tensor(ggml_tensor, buffers[i])) {
QNN_LOG_ERROR("bind tensor %s failed\n", ggml_get_name(ggml_tensor));
return false;
}
qnn_tensors[i] = tensor_wrappers[i]->get_qnn_tensor();
}
return true;
}
inline bool bind_tensors(const ggml_tensor_array_t & ggml_tensors, qnn_tensor_array_t & tensor_wrappers,
std::vector<Qnn_Tensor_t> & qnn_tensors) {
GGML_ASSERT(tensor_wrappers.size() == ggml_tensors.size());
qnn_tensors.resize(ggml_tensors.size());
for (size_t i = 0; i < ggml_tensors.size(); i++) {
auto * ggml_tensor = ggml_tensors[i];
if (!tensor_wrappers[i]->bind_ggml_tensor(ggml_tensor, qnn_buffer_ptr())) {
QNN_LOG_ERROR("bind tensor %s failed\n", ggml_get_name(ggml_tensor));
return false;
}
qnn_tensors[i] = tensor_wrappers[i]->get_qnn_tensor();
}
return true;
}
inline bool bind_tensors(const ggml_tensor_array_t & ggml_tensors, qnn_tensor_array_t & tensor_wrappers) {
GGML_ASSERT(tensor_wrappers.size() == ggml_tensors.size());
for (size_t i = 0; i < ggml_tensors.size(); i++) {
auto * ggml_tensor = ggml_tensors[i];
if (!tensor_wrappers[i]->bind_ggml_tensor(ggml_tensor, qnn_buffer_ptr())) {
QNN_LOG_ERROR("bind tensor %s failed\n", ggml_get_name(ggml_tensor));
return false;
}
}
return true;
}
inline void unbind_tensors(qnn_tensor_array_t & tensor_wrappers) {
for (auto & tensor : tensor_wrappers) {
tensor->unbind();
}
}
struct tensor_create_common_params {
const char * name_prefix;
int tensor_rank;
bool is_input;
backend_index_type device;
Qnn_GraphHandle_t graph_handle;
std::shared_ptr<qnn::qnn_instance> qnn_instance;
};
inline void create_tensors_from_ggml_tensor(const tensor_create_common_params & params,
const ggml_tensor_array_t & ggml_tensors,
qnn_tensor_array_t * tensor_wrappers,
std::vector<Qnn_Tensor_t> * qnn_tensors) {
if (qnn_tensors) {
qnn_tensors->resize(ggml_tensors.size());
}
if (!tensor_wrappers->empty()) {
QNN_LOG_DEBUG("tensor_wrappers is not empty, skip create tensors\n");
GGML_ASSERT(tensor_wrappers->size() == ggml_tensors.size());
return;
}
tensor_wrappers->resize(ggml_tensors.size());
char buffer[GGML_MAX_NAME] = {};
auto tensor_type = params.is_input ? ggml_qnn_tensor::INPUT : ggml_qnn_tensor::OUTPUT;
for (size_t i = 0; i < ggml_tensors.size(); i++) {
snprintf(buffer, GGML_MAX_NAME, "%s%d", params.name_prefix, (int) i);
auto * ggml_tensor = ggml_tensors[i];
(*tensor_wrappers)[i] = std::make_shared<ggml_qnn_tensor>(tensor_type, std::string(buffer), ggml_tensor->ne,
ggml_tensor->type, params.tensor_rank, params.device,
params.graph_handle, params.qnn_instance);
}
}
} // namespace qnn
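// ---------------------------------------------------------------------------
// Illustrative usage sketch (not part of the backend): how a wrapper is
// typically created, registered with a graph and bound to a ggml tensor.
// It assumes an already initialized qnn::qnn_instance_ptr and a valid
// Qnn_GraphHandle_t; the helper name below is hypothetical.
// ---------------------------------------------------------------------------
inline qnn::qnn_tensor_ptr_t example_wrap_ggml_tensor(ggml_tensor * src, backend_index_type device,
Qnn_GraphHandle_t graph, qnn::qnn_instance_ptr instance) {
// create an INPUT wrapper that mirrors the ggml tensor's shape and data type
auto wrapper = std::make_shared<qnn::ggml_qnn_tensor>(qnn::ggml_qnn_tensor::INPUT, std::string(src->name),
src->ne, src->type, ggml_n_dims(src), device, graph, instance);
// register the tensor with the graph, then attach the ggml data buffer
if (!wrapper->alloc_qnn_tensor_id() || !wrapper->bind_ggml_tensor(src, qnn::qnn_buffer_ptr())) {
return qnn::qnn_tensor_ptr_t();
}
// call unbind() once the graph execution that uses this tensor has finished
return wrapper;
}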

View File

@ -0,0 +1,467 @@
#include "utils.hpp"
#include <cstdlib>
#include "ggml-qnn.h"
#include "qnn-types.hpp"
#include "QnnGraph.h"
#ifdef _WIN32
# include <windows.h>
#else
# include <sys/sysinfo.h>
# include <unistd.h>
#endif
namespace {
template <typename _Ty> _Ty align_to_generic(size_t alignment, _Ty offset) {
return offset % alignment == 0 ? offset :
offset + (static_cast<_Ty>(alignment) - (offset % static_cast<_Ty>(alignment)));
}
} // namespace
namespace qnn {
qnn_dimension_array_t get_internal_dimension(const ggml_dimension_array_t & dims, uint32_t rank) {
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS should be 4");
GGML_ASSERT(rank <= GGML_MAX_DIMS && rank > 0);
qnn_dimension_array_t internal_dims = {};
/*
* Both the ggml and qnn tensor in memory are stored as row-major format.
* But the dimensions of the tensor are stored in different order.
* For example, a 2x3 matrix:
* [
* [1, 2, 3],
* [4, 5, 6],
* ]
* The ggml tensor will have dimensions [3, 2], while the qnn tensor will have dimensions [2, 3].
*/
for (uint32_t i = 0; i < rank; i++) {
internal_dims[i] = std::max<uint32_t>((uint32_t) dims[rank - 1 - i], 1);
}
return internal_dims;
}
qnn_dimension_array_t get_view_internal_dimension(const ggml_tensor * tensor, size_t & element_offset_out) {
element_offset_out = 0;
auto * parent_tensor = tensor;
while (parent_tensor->view_src) {
element_offset_out += parent_tensor->view_offs;
parent_tensor = parent_tensor->view_src;
}
const auto rank = get_ggml_tensor_rank(tensor);
const auto parent_rank = get_ggml_tensor_rank(parent_tensor);
GGML_ASSERT(parent_tensor->type == tensor->type);
GGML_ASSERT(parent_rank == rank);
const auto block_size = ggml_blck_size(tensor->type);
element_offset_out =
element_offset_out * block_size / tensor->nb[0]; // calculate the element offset in the view tensor
return get_internal_dimension(parent_tensor->ne, parent_rank);
}
// TODO: mapping more ggml data type to QNN data type
// ref:explanation of k-quants, https://github.com/ggerganov/llama.cpp/pull/1684
Qnn_DataType_t qnn_datatype_from_ggml_datatype(ggml_type ggml_type) {
switch (ggml_type) {
case GGML_TYPE_F32:
return QNN_DATATYPE_FLOAT_32;
case GGML_TYPE_F16:
return QNN_DATATYPE_FLOAT_16;
case GGML_TYPE_I32:
return QNN_DATATYPE_INT_32;
case GGML_TYPE_I16:
return QNN_DATATYPE_INT_16;
case GGML_TYPE_I8:
return QNN_DATATYPE_INT_8;
case GGML_TYPE_Q8_0:
return QNN_DATATYPE_SFIXED_POINT_8;
case GGML_TYPE_Q4_0:
return QNN_DATATYPE_SFIXED_POINT_4;
default:
break;
}
return QNN_DATATYPE_UNDEFINED;
}
ggml_type ggml_datatype_from_qnn_datatype(Qnn_DataType_t qnn_type) {
switch (qnn_type) {
case QNN_DATATYPE_FLOAT_32:
return GGML_TYPE_F32;
case QNN_DATATYPE_FLOAT_16:
return GGML_TYPE_F16;
case QNN_DATATYPE_UINT_32:
case QNN_DATATYPE_INT_32:
return GGML_TYPE_I32;
case QNN_DATATYPE_INT_16:
return GGML_TYPE_I16;
case QNN_DATATYPE_INT_8:
return GGML_TYPE_I8;
case QNN_DATATYPE_SFIXED_POINT_8:
return GGML_TYPE_Q8_0;
case QNN_DATATYPE_SFIXED_POINT_4:
return GGML_TYPE_Q4_0;
default:
break;
}
return GGML_TYPE_COUNT;
}
size_t qnn_datatype_size(Qnn_DataType_t qnn_type) {
switch (qnn_type) {
case QNN_DATATYPE_FLOAT_32:
return sizeof(float);
case QNN_DATATYPE_FLOAT_16:
return sizeof(uint16_t);
case QNN_DATATYPE_UINT_32:
case QNN_DATATYPE_INT_32:
return sizeof(int32_t);
case QNN_DATATYPE_INT_16:
return sizeof(int16_t);
case QNN_DATATYPE_INT_8:
return sizeof(int8_t);
case QNN_DATATYPE_SFIXED_POINT_8:
return sizeof(int8_t);
case QNN_DATATYPE_SFIXED_POINT_4:
return sizeof(int8_t);
default:
break;
}
return 0;
}
const char * qnn_datatype_to_string(Qnn_DataType_t qnn_type) {
switch (qnn_type) {
case QNN_DATATYPE_FLOAT_32:
return "QNN_DATATYPE_FLOAT_32";
case QNN_DATATYPE_FLOAT_16:
return "QNN_DATATYPE_FLOAT_16";
case QNN_DATATYPE_UINT_32:
return "QNN_DATATYPE_UINT_32";
case QNN_DATATYPE_INT_32:
return "QNN_DATATYPE_INT_32";
case QNN_DATATYPE_INT_16:
return "QNN_DATATYPE_INT_16";
case QNN_DATATYPE_INT_8:
return "QNN_DATATYPE_INT_8";
case QNN_DATATYPE_SFIXED_POINT_8:
return "QNN_DATATYPE_SFIXED_POINT_8";
case QNN_DATATYPE_SFIXED_POINT_4:
return "QNN_DATATYPE_SFIXED_POINT_4";
default:
break;
}
return "QNN_DATATYPE_UNDEFINED";
}
uint32_t get_ggml_tensor_rank(const ggml_tensor * tensor) {
uint32_t rank = 0;
for (int i = 0; i < GGML_MAX_DIMS; i++) {
if ((0 != tensor->ne[i]) && (1 != tensor->ne[i])) {
rank++;
}
}
return rank;
}
const char * get_ggml_type_name(ggml_type type) {
const auto * traits = ggml_get_type_traits(type);
return traits->type_name;
}
const char * get_backend_name(backend_index_type device) {
switch (device) {
case QNN_BACKEND_CPU:
return "qnn-cpu";
case QNN_BACKEND_GPU:
return "qnn-gpu";
case QNN_BACKEND_NPU:
return "qnn-npu";
case QNN_BACKEND_COUNT:
default:
return "unknown";
}
}
const char * get_backend_desc(backend_index_type device) {
switch (device) {
case QNN_BACKEND_CPU:
return "CPU";
case QNN_BACKEND_GPU:
return "Adreno GPU";
case QNN_BACKEND_NPU:
return "Hexagon NPU";
case QNN_BACKEND_COUNT:
default:
return "unknown";
}
}
const char * get_chipset_desc(uint32_t soc_model) {
switch (soc_model) {
case SM8350:
return "Snapdragon 888/888+";
case SM8450:
return "Snapdragon 8 Gen 1";
case SM8475:
return "Snapdragon 8 Gen 1+";
case SM8550:
return "Snapdragon 8 Gen 2";
case SM7675:
return "Snapdragon 7+ Gen 3";
case SM8635:
return "Snapdragon 8s Gen 3";
case SM8650:
return "Snapdragon 8 Gen 3";
case SM8750:
return "Snapdragon 8 Elite";
case EMULATOR_AARCH64:
return "AArch64 Emulator";
case EMULATOR_X64:
return "x86_64 Emulator";
default:
return "unknown";
}
}
const char * get_chipset_model(uint32_t soc_model) {
switch (soc_model) {
case SM8350:
return "SM8350";
case SM8450:
return "SM8450";
case SA8295:
return "SA8295";
case SM8475:
return "SM8475";
case SM8550:
return "SM8550";
case SSG2115P:
return "SSG2115P";
case SM7675:
return "SM7675";
case SM8635:
return "SM8635";
case SM8650:
return "SM8650";
case SM8750:
return "SM8750";
case EMULATOR_AARCH64:
return "AARCH64EMU";
case EMULATOR_X64:
return "X64EMU";
default:
return "unknown";
}
}
const char * get_htparch_desc(size_t htp_arch) {
switch (htp_arch) {
case V68:
return "HTP_V68";
case V69:
return "HTP_V69";
case V73:
return "HTP_V73";
case V75:
return "HTP_V75";
case V79:
return "HTP_V79";
default:
return "unknown";
}
}
intptr_t align_to(size_t alignment, intptr_t offset) {
return align_to_generic<intptr_t>(alignment, offset);
}
uint32_t get_ggml_tensor_data_size(const ggml_tensor * tensor) {
return (uint32_t) ggml_nbytes(tensor);
}
const char * get_qnn_tensor_type_name(Qnn_TensorType_t type) {
switch (type) {
case QNN_TENSOR_TYPE_APP_WRITE:
return "QNN_TENSOR_TYPE_APP_WRITE";
case QNN_TENSOR_TYPE_APP_READ:
return "QNN_TENSOR_TYPE_APP_READ";
case QNN_TENSOR_TYPE_APP_READWRITE:
return "QNN_TENSOR_TYPE_APP_READWRITE";
case QNN_TENSOR_TYPE_STATIC:
return "QNN_TENSOR_TYPE_STATIC";
case QNN_TENSOR_TYPE_NATIVE:
return "QNN_TENSOR_TYPE_NATIVE";
case QNN_TENSOR_TYPE_UNDEFINED:
return "QNN_TENSOR_TYPE_UNDEFINED";
case QNN_TENSOR_TYPE_NULL:
return "QNN_TENSOR_TYPE_NULL";
default:
break;
}
return "unknown";
}
#ifdef _WIN32
static void * _align_alloc(size_t alignment, size_t size) {
return _aligned_malloc(size, alignment);
}
static size_t _get_page_size() {
SYSTEM_INFO si;
GetSystemInfo(&si);
return si.dwPageSize;
}
void align_free(void * ptr) {
_aligned_free(ptr);
}
#else
static void * _align_alloc(size_t alignment, size_t size) {
return std::aligned_alloc(alignment, size);
}
static size_t _get_page_size() {
return sysconf(_SC_PAGESIZE);
}
void align_free(void * ptr) {
std::free(ptr);
}
#endif
void * page_align_alloc(size_t size) {
const size_t alignment = _get_page_size();
size_t size_aligned = align_to_generic<size_t>(alignment, size);
void * data = _align_alloc(alignment, size_aligned);
if (!data) {
QNN_LOG_WARN("_align_alloc failed, alignment: %ld, size: %ld, size_aligned: %ld\n", alignment, size,
size_aligned);
return nullptr;
}
QNN_LOG_DEBUG("_align_alloc success, alignment: %ld, size: %ld, size_aligned: %ld\n", alignment, size,
size_aligned);
return data;
}
// =================================================================================================
//
// QNN backend internal helper functions
//
// =================================================================================================
// TODO: only support GGML_OP_ADD/GGML_OP_MUL/GGML_OP_MUL_MAT
const char * opname_from_ggmlop(enum ggml_op ggmlop) {
switch (ggmlop) {
case GGML_OP_ADD:
return QNN_OP_ELEMENT_WISE_ADD;
case GGML_OP_MUL:
return QNN_OP_ELEMENT_WISE_MULTIPLY;
case GGML_OP_MUL_MAT:
return QNN_OP_MAT_MUL;
default:
break;
}
return nullptr;
}
const char * get_qnn_error_string(Qnn_ErrorHandle_t error) {
// A complete list of error codes can be found at here:
// https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/api_error_codes.html
thread_local static char error_code[128] = {};
switch (error) {
case QNN_SUCCESS:
return "QNN_SUCCESS";
case QNN_COMMON_ERROR_GENERAL:
return "QNN_COMMON_ERROR_GENERAL";
// QnnGraph_Error_t
case QNN_GRAPH_ERROR_UNSUPPORTED_FEATURE:
return "QNN_GRAPH_ERROR_UNSUPPORTED_FEATURE";
case QNN_GRAPH_ERROR_MEM_ALLOC:
return "QNN_GRAPH_ERROR_MEM_ALLOC";
case QNN_GRAPH_ERROR_INVALID_ARGUMENT:
return "QNN_GRAPH_ERROR_INVALID_ARGUMENT";
case QNN_GRAPH_ERROR_INVALID_HANDLE:
return "QNN_GRAPH_ERROR_INVALID_HANDLE";
case QNN_GRAPH_ERROR_GRAPH_DOES_NOT_EXIST:
return "QNN_GRAPH_ERROR_GRAPH_DOES_NOT_EXIST";
case QNN_GRAPH_ERROR_INVALID_NAME:
return "QNN_GRAPH_ERROR_INVALID_NAME";
case QNN_GRAPH_ERROR_INVALID_TENSOR:
return "QNN_GRAPH_ERROR_INVALID_TENSOR";
case QNN_GRAPH_ERROR_INVALID_OP_CONFIG:
return "QNN_GRAPH_ERROR_INVALID_OP_CONFIG";
case QNN_GRAPH_ERROR_SET_PROFILE:
return "QNN_GRAPH_ERROR_SET_PROFILE";
case QNN_GRAPH_ERROR_UNCONNECTED_NODE:
return "QNN_GRAPH_ERROR_UNCONNECTED_NODE";
case QNN_GRAPH_ERROR_CREATE_FAILED:
return "QNN_GRAPH_ERROR_CREATE_FAILED";
case QNN_GRAPH_ERROR_OPTIMIZATION_FAILED:
return "QNN_GRAPH_ERROR_OPTIMIZATION_FAILED";
case QNN_GRAPH_ERROR_FINALIZE_FAILED:
return "QNN_GRAPH_ERROR_FINALIZE_FAILED";
case QNN_GRAPH_ERROR_GRAPH_NOT_FINALIZED:
return "QNN_GRAPH_ERROR_GRAPH_NOT_FINALIZED";
case QNN_GRAPH_ERROR_GRAPH_FINALIZED:
return "QNN_GRAPH_ERROR_GRAPH_FINALIZED";
case QNN_GRAPH_ERROR_EXECUTION_ASYNC_FIFO_FULL:
return "QNN_GRAPH_ERROR_EXECUTION_ASYNC_FIFO_FULL";
case QNN_GRAPH_ERROR_SIGNAL_IN_USE:
return "QNN_GRAPH_ERROR_SIGNAL_IN_USE";
case QNN_GRAPH_ERROR_ABORTED:
return "QNN_GRAPH_ERROR_ABORTED";
case QNN_GRAPH_ERROR_PROFILE_IN_USE:
return "QNN_GRAPH_ERROR_PROFILE_IN_USE";
case QNN_GRAPH_ERROR_TIMED_OUT:
return "QNN_GRAPH_ERROR_TIMED_OUT";
case QNN_GRAPH_ERROR_SUBGRAPH:
return "QNN_GRAPH_ERROR_SUBGRAPH";
case QNN_GRAPH_ERROR_DISABLED:
return "QNN_GRAPH_ERROR_DISABLED";
case QNN_GRAPH_ERROR_DYNAMIC_TENSOR_SHAPE:
return "QNN_GRAPH_ERROR_DYNAMIC_TENSOR_SHAPE";
case QNN_GRAPH_ERROR_TENSOR_SPARSITY:
return "QNN_GRAPH_ERROR_TENSOR_SPARSITY";
case QNN_GRAPH_ERROR_EARLY_TERMINATION:
return "QNN_GRAPH_ERROR_EARLY_TERMINATION";
case QNN_GRAPH_ERROR_INVALID_CONTEXT:
return "QNN_GRAPH_ERROR_INVALID_CONTEXT";
// QnnOpPackage_Error_t
case QNN_OP_PACKAGE_ERROR_LIBRARY_ALREADY_INITIALIZED:
return "QNN_OP_PACKAGE_ERROR_LIBRARY_ALREADY_INITIALIZED";
case QNN_OP_PACKAGE_ERROR_LIBRARY_NOT_INITIALIZED:
return "QNN_OP_PACKAGE_ERROR_LIBRARY_NOT_INITIALIZED";
case QNN_OP_PACKAGE_ERROR_INVALID_HANDLE:
return "QNN_OP_PACKAGE_ERROR_INVALID_HANDLE";
case QNN_OP_PACKAGE_ERROR_INVALID_INFRASTRUCTURE:
return "QNN_OP_PACKAGE_ERROR_INVALID_INFRASTRUCTURE";
case QNN_OP_PACKAGE_ERROR_INVALID_INFO:
return "QNN_OP_PACKAGE_ERROR_INVALID_INFO";
case QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE:
return "QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE";
case QNN_OP_PACKAGE_ERROR_INVALID_ARGUMENT:
return "QNN_OP_PACKAGE_ERROR_INVALID_ARGUMENT";
default:
if (error >= QNN_GRAPH_MIN_ERROR && error < QNN_GRAPH_MAX_ERROR) {
snprintf(error_code, sizeof(error_code), "UNKNOWN_GRAPH_ERROR_%d", int(error - QNN_GRAPH_MIN_ERROR));
} else {
snprintf(error_code, sizeof(error_code), "%d", int(error));
}
return error_code;
}
}
} // namespace qnn
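// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the backend): the helpers above map a ggml
// shape/type pair to its QNN equivalent. For a 2x3 F32 matrix ggml stores
// ne = {3, 2, 1, 1}, while QNN expects the dimensions in reversed order. The
// function below is hypothetical and only demonstrates the expected results.
// ---------------------------------------------------------------------------
[[maybe_unused]] static void example_dimension_and_type_mapping() {
const qnn::ggml_dimension_array_t ne = { 3, 2, 1, 1 };
const auto qnn_dims = qnn::get_internal_dimension(ne, 2);
GGML_ASSERT(qnn_dims[0] == 2 && qnn_dims[1] == 3); // rows first, then columns
GGML_ASSERT(qnn::qnn_datatype_from_ggml_datatype(GGML_TYPE_F32) == QNN_DATATYPE_FLOAT_32);
GGML_ASSERT(qnn::ggml_datatype_from_qnn_datatype(QNN_DATATYPE_FLOAT_16) == GGML_TYPE_F16);
}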

View File

@ -0,0 +1,228 @@
#pragma once
#include <array>
#include <cstddef>
#include <cstdint>
#include <string>
#include "common.hpp"
#include "ggml-qnn.h"
#include "ggml.h"
#include "logger.hpp"
#include "QnnTypes.h"
#define QNN_TENSOR_VER(x) ((x).v1)
namespace qnn {
using ggml_dimension_array_t = int64_t[GGML_MAX_DIMS];
using ggml_stride_array_t = size_t[GGML_MAX_DIMS];
using qnn_dimension_array_t = std::array<uint32_t, GGML_MAX_DIMS>;
qnn_dimension_array_t get_internal_dimension(const ggml_dimension_array_t & dims, uint32_t rank);
qnn_dimension_array_t get_view_internal_dimension(const ggml_tensor * tensor, size_t & element_offset_out);
uint32_t get_ggml_tensor_rank(const ggml_tensor * tensor);
const char * get_ggml_type_name(ggml_type type);
const char * get_backend_name(backend_index_type device);
const char * get_backend_desc(backend_index_type device);
const char * get_chipset_desc(uint32_t soc_model);
const char * get_chipset_model(uint32_t soc_model);
const char * get_htparch_desc(size_t htp_arch);
intptr_t align_to(size_t alignment, intptr_t offset);
uint32_t get_ggml_tensor_data_size(const ggml_tensor * tensor);
const char * get_qnn_tensor_type_name(Qnn_TensorType_t type);
void * page_align_alloc(size_t size);
void align_free(void * ptr);
const char * opname_from_ggmlop(enum ggml_op ggmlop);
const char * get_qnn_error_string(Qnn_ErrorHandle_t error);
constexpr const Qnn_TensorVersion_t kDefaultQnnTensorVersion = QNN_TENSOR_VERSION_1;
inline Qnn_Tensor_t qnn_tensor_init(Qnn_TensorVersion_t version) {
Qnn_Tensor_t tensor;
tensor.version = version;
if (version == QNN_TENSOR_VERSION_1) {
tensor.v1 = QNN_TENSOR_V1_INIT;
} else if (version == QNN_TENSOR_VERSION_2) {
tensor.v2 = QNN_TENSOR_V2_INIT;
}
return tensor;
}
inline uint32_t get_qnn_tensorid(const Qnn_Tensor_t & tensor) {
if (tensor.version == kDefaultQnnTensorVersion) {
return QNN_TENSOR_VER(tensor).id;
}
return 0u;
}
inline const char * get_qnn_tensorname(const Qnn_Tensor_t & tensor) {
if (tensor.version == kDefaultQnnTensorVersion) {
return QNN_TENSOR_VER(tensor).name;
}
return nullptr;
}
inline Qnn_TensorType_t get_qnn_tensortype(const Qnn_Tensor_t & tensor) {
if (tensor.version == kDefaultQnnTensorVersion) {
return QNN_TENSOR_VER(tensor).type;
}
return QNN_TENSOR_TYPE_UNDEFINED;
}
inline Qnn_TensorDataFormat_t get_qnn_tensor_dataformat(const Qnn_Tensor_t & tensor) {
if (tensor.version == kDefaultQnnTensorVersion) {
return QNN_TENSOR_VER(tensor).dataFormat;
}
return QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER;
}
inline Qnn_DataType_t get_qnn_tensor_datatype(const Qnn_Tensor_t & tensor) {
if (tensor.version == kDefaultQnnTensorVersion) {
return QNN_TENSOR_VER(tensor).dataType;
}
return QNN_DATATYPE_UNDEFINED;
}
inline Qnn_QuantizeParams_t get_qnn_tensor_quantparams(const Qnn_Tensor_t & tensor) {
if (tensor.version == kDefaultQnnTensorVersion) {
return QNN_TENSOR_VER(tensor).quantizeParams;
}
return QNN_QUANTIZE_PARAMS_INIT;
}
inline uint32_t get_qnn_tensor_rank(const Qnn_Tensor_t & tensor) {
if (tensor.version == kDefaultQnnTensorVersion) {
return QNN_TENSOR_VER(tensor).rank;
}
return 0u;
}
inline uint32_t * get_qnn_tensor_dimensions(const Qnn_Tensor_t & tensor) {
if (tensor.version == kDefaultQnnTensorVersion) {
return QNN_TENSOR_VER(tensor).dimensions;
}
return nullptr;
}
inline Qnn_TensorMemType_t get_qnn_tensor_memtype(const Qnn_Tensor_t & tensor) {
if (tensor.version == kDefaultQnnTensorVersion) {
return QNN_TENSOR_VER(tensor).memType;
}
return QNN_TENSORMEMTYPE_UNDEFINED;
}
inline Qnn_MemHandle_t get_qnn_tensor_memhandle(const Qnn_Tensor_t & tensor) {
if (tensor.version == kDefaultQnnTensorVersion) {
return QNN_TENSOR_VER(tensor).memHandle;
}
return nullptr;
}
inline void set_qnn_tensor_id(Qnn_Tensor_t & tensor, uint32_t id) {
if (tensor.version == kDefaultQnnTensorVersion) {
QNN_TENSOR_VER(tensor).id = id;
}
}
inline void set_qnn_tensor_name(Qnn_Tensor_t & tensor, const char * name) {
if (tensor.version == kDefaultQnnTensorVersion) {
QNN_TENSOR_VER(tensor).name = name;
}
}
inline void set_qnn_tensor_type(Qnn_Tensor_t & tensor, Qnn_TensorType_t type) {
if (tensor.version == kDefaultQnnTensorVersion) {
QNN_TENSOR_VER(tensor).type = type;
}
}
inline void set_qnn_tensor_dataformat(Qnn_Tensor_t & tensor, Qnn_TensorDataFormat_t format) {
if (tensor.version == kDefaultQnnTensorVersion) {
QNN_TENSOR_VER(tensor).dataFormat = format;
}
}
inline void set_qnn_tensor_datatype(Qnn_Tensor_t & tensor, Qnn_DataType_t dataType) {
if (tensor.version == kDefaultQnnTensorVersion) {
QNN_TENSOR_VER(tensor).dataType = dataType;
}
}
inline void set_qnn_tensor_quantparams(Qnn_Tensor_t & tensor, Qnn_QuantizeParams_t params) {
if (tensor.version == kDefaultQnnTensorVersion) {
QNN_TENSOR_VER(tensor).quantizeParams = params;
}
}
inline void set_qnn_tensor_rank(Qnn_Tensor_t & tensor, uint32_t rank) {
if (tensor.version == kDefaultQnnTensorVersion) {
QNN_TENSOR_VER(tensor).rank = rank;
}
}
inline void set_qnn_tensor_dimensions(Qnn_Tensor_t & tensor, uint32_t * dims) {
if (tensor.version == kDefaultQnnTensorVersion) {
QNN_TENSOR_VER(tensor).dimensions = dims;
}
}
inline void set_qnn_tensor_memtype(Qnn_Tensor_t & tensor, Qnn_TensorMemType_t mem_type) {
if (tensor.version == kDefaultQnnTensorVersion) {
QNN_TENSOR_VER(tensor).memType = mem_type;
}
}
inline void set_qnn_tensor_clientbuf(Qnn_Tensor_t & tensor, Qnn_ClientBuffer_t client_buf) {
if (tensor.version == kDefaultQnnTensorVersion) {
QNN_TENSOR_VER(tensor).clientBuf = client_buf;
}
}
inline void set_qnn_tensor_memhandle(Qnn_Tensor_t & tensor, Qnn_MemHandle_t handle) {
if (tensor.version == kDefaultQnnTensorVersion) {
QNN_TENSOR_VER(tensor).memHandle = handle;
}
}
inline void set_qnn_tensor_dyn_dimensions(Qnn_Tensor_t & tensor, uint8_t * isDynamicDimensions) {
if (tensor.version == QNN_TENSOR_VERSION_2) {
tensor.v2.isDynamicDimensions = isDynamicDimensions;
}
}
Qnn_DataType_t qnn_datatype_from_ggml_datatype(ggml_type ggml_type);
ggml_type ggml_datatype_from_qnn_datatype(Qnn_DataType_t qnn_type);
size_t qnn_datatype_size(Qnn_DataType_t qnn_type);
const char * qnn_datatype_to_string(Qnn_DataType_t qnn_type);
} // namespace qnn
#define QNN_TENSOR_GET_ID(tensor) qnn::get_qnn_tensorid(tensor)
#define QNN_TENSOR_GET_NAME(tensor) qnn::get_qnn_tensorname(tensor)
#define QNN_TENSOR_GET_TYPE(tensor) qnn::get_qnn_tensortype(tensor)
#define QNN_TENSOR_GET_DATA_FORMAT(tensor) qnn::get_qnn_tensor_dataformat(tensor)
#define QNN_TENSOR_GET_DATA_TYPE(tensor) qnn::get_qnn_tensor_datatype(tensor)
#define QNN_TENSOR_GET_QUANT_PARAMS(tensor) qnn::get_qnn_tensor_quantparams(tensor)
#define QNN_TENSOR_GET_RANK(tensor) qnn::get_qnn_tensor_rank(tensor)
#define QNN_TENSOR_GET_DIMENSIONS(tensor) qnn::get_qnn_tensor_dimensions(tensor)
#define QNN_TENSOR_GET_MEM_TYPE(tensor) qnn::get_qnn_tensor_memtype(tensor)
#define QNN_TENSOR_GET_MEM_HANDLE(tensor) qnn::get_qnn_tensor_memhandle(tensor)
#define QNN_TENSOR_SET_ID(tensor, value) qnn::set_qnn_tensor_id(tensor, value)
#define QNN_TENSOR_SET_NAME(tensor, value) qnn::set_qnn_tensor_name(tensor, value)
#define QNN_TENSOR_SET_TYPE(tensor, value) qnn::set_qnn_tensor_type(tensor, value)
#define QNN_TENSOR_SET_DATA_FORMAT(tensor, value) qnn::set_qnn_tensor_dataformat(tensor, value)
#define QNN_TENSOR_SET_DATA_TYPE(tensor, value) qnn::set_qnn_tensor_datatype(tensor, value)
#define QNN_TENSOR_SET_QUANT_PARAMS(tensor, value) qnn::set_qnn_tensor_quantparams(tensor, value)
#define QNN_TENSOR_SET_RANK(tensor, value) qnn::set_qnn_tensor_rank(tensor, value)
#define QNN_TENSOR_SET_DIMENSIONS(tensor, value) qnn::set_qnn_tensor_dimensions(tensor, value)
#define QNN_TENSOR_SET_MEM_TYPE(tensor, value) qnn::set_qnn_tensor_memtype(tensor, value)
#define QNN_TENSOR_SET_CLIENT_BUF(tensor, value) qnn::set_qnn_tensor_clientbuf(tensor, value)
#define QNN_TENSOR_SET_MEM_HANDLE(tensor, value) qnn::set_qnn_tensor_memhandle(tensor, value)
#define QNN_TENSOR_SET_DYN_DIMENSIONS(tensor, value) qnn::set_qnn_tensor_dyn_dimensions(tensor, value)
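// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the backend): the accessor macros above only
// touch the v1 payload when the tensor carries the matching version, so reads
// on other versions fall back to safe defaults. Hypothetical example:
// ---------------------------------------------------------------------------
inline void example_qnn_tensor_accessors() {
Qnn_Tensor_t tensor = qnn::qnn_tensor_init(qnn::kDefaultQnnTensorVersion);
QNN_TENSOR_SET_RANK(tensor, 2);
QNN_TENSOR_SET_DATA_TYPE(tensor, QNN_DATATYPE_FLOAT_32);
GGML_ASSERT(QNN_TENSOR_GET_RANK(tensor) == 2u);
GGML_ASSERT(QNN_TENSOR_GET_DATA_TYPE(tensor) == QNN_DATATYPE_FLOAT_32);
}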

View File

@ -0,0 +1,41 @@
file(GLOB common_srcs "${CMAKE_CURRENT_LIST_DIR}/*.cpp")
add_library(runtime-common STATIC
${common_srcs}
)
target_include_directories(runtime-common PUBLIC
${CMAKE_CURRENT_LIST_DIR}/
${CMAKE_CURRENT_LIST_DIR}/../
${CMAKE_CURRENT_LIST_DIR}/../../
${CMAKE_CURRENT_LIST_DIR}/../../../include/ # TODO: figure out how to remove this
)
if(GGML_HEXAGON_NPU_ONLY)
add_compile_definitions(GGML_HEXAGON_NPU_ONLY)
endif()
if(GGML_QNN_ENABLE_HEXAGON_BACKEND)
if(DEFINED ENV{HEXAGON_SDK_ROOT})
set(HEXAGON_SDK_ROOT $ENV{HEXAGON_SDK_ROOT})
message("found HEXAGON_SDK_ROOT, setting to ${HEXAGON_SDK_ROOT}")
elseif(EXISTS ${HEXAGON_SDK_ROOT})
message("HEXAGON_SDK_ROOT: ${HEXAGON_SDK_ROOT}")
else()
message(FATAL_ERROR "HEXAGON_SDK_ROOT not defined")
endif()
target_include_directories(runtime-common PUBLIC
${HEXAGON_SDK_ROOT}/incs/
${HEXAGON_SDK_ROOT}/incs/stddef/
${HEXAGON_SDK_ROOT}/incs/HAP/
${HEXAGON_SDK_ROOT}/rtos/qurt/
${HEXAGON_SDK_ROOT}/utils/examples/
)
target_compile_definitions(runtime-common PRIVATE
GGML_QNN_ENABLE_HEXAGON_BACKEND
)
else()
message("HEXAGON_SDK_ROOT not defined, not appending to include directories")
endif()

View File

@ -0,0 +1,146 @@
#include "common.hpp"
#include <memory>
#include "ggml-backend-impl.h"
#include "ggml-impl.h"
#include "ggml-qnn.h"
#ifdef _WIN32
# include <windows.h>
#else
# include <sys/sysinfo.h>
# include <unistd.h>
#endif
namespace {
struct ggml_backend_qnn_reg_impl : ggml_backend_reg {
std::vector<backend_device_proxy_ptr> device_proxies;
std::vector<ggml_backend_device> devices;
explicit ggml_backend_qnn_reg_impl(ggml_backend_reg_i backend_iface) {
context = this;
iface = backend_iface;
LOG_INFO("backend registry init\n");
for (size_t i = 0; i < TOTAL_BACKEND_COUNT; i++) {
const auto device_enum =
(backend_index_type) (TOTAL_BACKEND_COUNT - 1 - i); // init from the last device, i.e. NPU
backend_device_proxy_ptr device_proxy;
if (device_enum < QNN_BACKEND_COUNT) {
#ifndef GGML_HEXAGON_NPU_ONLY
device_proxy = create_qnn_backend_context(device_enum);
#else
LOG_DEBUG("skip qnn device %d\n", (int) device_enum);
continue;
#endif
} else {
#ifdef GGML_QNN_ENABLE_HEXAGON_BACKEND
device_proxy = create_hexagon_backend_context(device_enum);
#else
LOG_DEBUG("skip hexagon device %d\n", (int) device_enum);
continue;
#endif
}
if (!device_proxy) {
LOG_DEBUG("skip device %d\n", (int) device_enum);
continue;
}
devices.emplace_back(ggml_backend_device{
/* iface = */ device_proxy->get_iface(),
/* reg = */ this,
/* context = */ device_proxy->get_context(),
});
device_proxies.emplace_back(device_proxy);
}
}
};
const char * ggml_backend_qnn_reg_get_name(ggml_backend_reg_t reg) {
GGML_UNUSED(reg);
// TODO: should we use a different name?
return "qualcomm";
}
size_t ggml_backend_qnn_reg_get_device_count(ggml_backend_reg_t reg) {
auto * ctx = (ggml_backend_qnn_reg_impl *) reg->context;
return ctx->devices.size();
}
ggml_backend_dev_t ggml_backend_qnn_reg_get_device(ggml_backend_reg_t reg, size_t index) {
auto * ctx = (ggml_backend_qnn_reg_impl *) reg->context;
GGML_ASSERT(index < ctx->devices.size());
return &(ctx->devices[index]);
}
const ggml_backend_reg_i ggml_backend_qnn_reg_interface = {
/* .get_name = */ ggml_backend_qnn_reg_get_name,
/* .get_device_count = */ ggml_backend_qnn_reg_get_device_count,
/* .get_device = */ ggml_backend_qnn_reg_get_device,
/* .get_proc_address = */ nullptr,
};
} // namespace
ggml_backend_reg_t ggml_backend_qnn_reg() {
static ggml_backend_qnn_reg_impl reg{ ggml_backend_qnn_reg_interface };
return &reg;
}
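// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the backend): enumerating the devices this
// registry exposes through the public ggml-backend API. The function name is
// hypothetical and only shows the intended call pattern.
// ---------------------------------------------------------------------------
[[maybe_unused]] static void example_list_qnn_devices() {
ggml_backend_reg_t reg = ggml_backend_qnn_reg();
for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); i++) {
ggml_backend_dev_t dev = ggml_backend_reg_dev_get(reg, i);
LOG_INFO("device %zu: %s (%s)\n", i, ggml_backend_dev_name(dev), ggml_backend_dev_description(dev));
}
}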
namespace common {
#ifdef _WIN32
size_t get_system_total_memory_in_bytes() {
MEMORYSTATUSEX mem = {};
mem.dwLength = sizeof(mem);
if (GlobalMemoryStatusEx(&mem)) {
return mem.ullTotalPhys;
}
return 0;
}
size_t get_system_free_memory_in_bytes() {
MEMORYSTATUSEX mem = {};
mem.dwLength = sizeof(mem);
if (GlobalMemoryStatusEx(&mem)) {
return mem.ullAvailPhys;
}
return 0;
}
#else
size_t get_system_total_memory_in_bytes() {
struct sysinfo info = {};
if (sysinfo(&info) == 0) {
return (info.totalram + info.totalswap) * info.mem_unit;
}
auto pages = (size_t) sysconf(_SC_PHYS_PAGES);
auto page_size = (size_t) sysconf(_SC_PAGE_SIZE);
return pages * page_size;
}
size_t get_system_free_memory_in_bytes() {
struct sysinfo info = {};
if (sysinfo(&info) == 0) {
return (info.freeram + info.freeswap) * info.mem_unit;
}
auto avail_pages = (size_t) sysconf(_SC_AVPHYS_PAGES);
auto page_size = (size_t) sysconf(_SC_PAGE_SIZE);
return avail_pages * page_size;
}
#endif
} // namespace common

View File

@ -0,0 +1,60 @@
#pragma once
#include <cstdint>
#include <memory>
#include "ggml-backend-impl.h"
#include "ggml-impl.h"
enum backend_index_type {
QNN_BACKEND_CPU = 0,
QNN_BACKEND_GPU,
QNN_BACKEND_NPU,
HEXAGON_BACKEND,
TOTAL_BACKEND_COUNT,
QNN_BACKEND_COUNT = HEXAGON_BACKEND,
};
class backend_device_proxy {
public:
virtual ~backend_device_proxy() = default;
virtual const ggml_backend_device_i & get_iface() const = 0;
virtual void * get_context() = 0;
};
using backend_device_proxy_ptr = std::shared_ptr<backend_device_proxy>;
backend_device_proxy_ptr create_qnn_backend_context(backend_index_type device);
backend_device_proxy_ptr create_hexagon_backend_context(backend_index_type device);
namespace common {
size_t get_system_total_memory_in_bytes();
size_t get_system_free_memory_in_bytes();
} // namespace common
#define DISABLE_COPY(class_name) \
class_name(const class_name &) = delete; \
void operator=(const class_name &) = delete
#define DISABLE_MOVE(class_name) \
class_name(class_name &&) = delete; \
void operator=(class_name &&) = delete
#define DISABLE_COPY_AND_MOVE(class_name) \
DISABLE_COPY(class_name); \
DISABLE_MOVE(class_name)
#define LOG_ERROR(...) (GGML_LOG_ERROR(__VA_ARGS__))
#define LOG_WARN(...) (GGML_LOG_WARN(__VA_ARGS__))
#define LOG_INFO(...) (GGML_LOG_INFO(__VA_ARGS__))
#ifndef NDEBUG
# define LOG_DEBUG(...) (GGML_LOG_DEBUG(__VA_ARGS__))
#else
# define LOG_DEBUG(...)
#endif

View File

@ -0,0 +1,76 @@
#pragma once
#ifdef __linux__
# include <dlfcn.h>
# include <fcntl.h>
#elif defined(_WIN32)
# define WIN32_LEAN_AND_MEAN
# ifndef NOMINMAX
# define NOMINMAX
# endif
# include <windows.h>
#endif
#include <string>
namespace common {
#ifdef __linux__
typedef void * dl_handler_t;
inline dl_handler_t dl_load(const std::string & lib_path) {
return dlopen(lib_path.c_str(), RTLD_NOW | RTLD_GLOBAL);
}
inline void * dl_sym(dl_handler_t handle, const std::string & symbol) {
return dlsym(handle, symbol.c_str());
}
inline bool dl_unload(dl_handler_t handle) {
return dlclose(handle) == 0;
}
inline const char * dl_error() {
return dlerror();
}
#elif defined(_WIN32)
using dl_handler_t = HMODULE;
inline dl_handler_t dl_load(const std::string & lib_path) {
// suppress error dialogs for missing DLLs
auto old_mode = SetErrorMode(SEM_FAILCRITICALERRORS);
SetErrorMode(old_mode | SEM_FAILCRITICALERRORS);
auto handle = LoadLibraryA(lib_path.c_str()); // TODO: use wstring version for unicode paths
SetErrorMode(old_mode);
return handle;
}
inline void * dl_sym(dl_handler_t handle, const std::string & symbol) {
auto old_mode = SetErrorMode(SEM_FAILCRITICALERRORS);
SetErrorMode(old_mode | SEM_FAILCRITICALERRORS);
void * p = (void *) GetProcAddress(handle, symbol.c_str());
SetErrorMode(old_mode);
return p;
}
inline bool dl_unload(dl_handler_t handle) {
return FreeLibrary(handle) != 0;
}
inline const char * dl_error() {
// TODO: implement dl_error for Windows
return nullptr;
}
#endif
template <typename Fn> Fn dl_sym_typed(dl_handler_t handle, const std::string & function_name) {
return reinterpret_cast<Fn>(dl_sym(handle, function_name));
}
} // namespace common
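// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the backend): loading a shared library and
// resolving a typed symbol with the helpers above. The library and symbol
// names here are placeholders, not real dependencies.
// ---------------------------------------------------------------------------
inline bool example_load_symbol() {
common::dl_handler_t handle = common::dl_load("libexample.so");
if (!handle) {
return false;
}
using example_fn_t = int (*)(int);
auto fn = common::dl_sym_typed<example_fn_t>(handle, "example_symbol");
bool ok = (fn != nullptr);
common::dl_unload(handle);
return ok;
}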

View File

@ -0,0 +1,63 @@
#pragma once
#include <cstdarg>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <string>
#include "common.hpp"
#include "ggml-impl.h"
namespace profiler {
#ifdef GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING
class scoped_timer {
public:
scoped_timer(std::string log_prefix) : _log_prefix(std::move(log_prefix)) { _begin_us = ggml_time_us(); }
scoped_timer(scoped_timer && other) {
_begin_us = other._begin_us;
_log_prefix = std::move(other._log_prefix);
}
~scoped_timer() { print(); }
void operator=(scoped_timer && other) {
_begin_us = other._begin_us;
_log_prefix = std::move(other._log_prefix);
}
void print() const {
auto duration = ggml_time_us() - _begin_us;
GGML_LOG_INFO("[profiler]%s, dur: %lld us\n", _log_prefix.c_str(), (long long) duration);
}
private:
int64_t _begin_us = 0LL;
std::string _log_prefix;
DISABLE_COPY(scoped_timer);
};
inline scoped_timer make_scope_perf_timer(const char * format, ...) {
va_list args;
va_start(args, format);
char buffer[4096];
vsnprintf(buffer, sizeof(buffer), format, args);
va_end(args);
return scoped_timer(buffer);
}
#endif
} // namespace profiler
#ifdef GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING
# define PROFILER_CONCAT_IMPL(a, b) a##b
# define PROFILER_CONCAT(a, b) PROFILER_CONCAT_IMPL(a, b)
// indirection through PROFILER_CONCAT so that __LINE__ is expanded before pasting
# define SCOPED_PERFORMANCE_TRACKER(fmt, ...) \
auto PROFILER_CONCAT(__scoped_timer_, __LINE__) = profiler::make_scope_perf_timer(fmt, __VA_ARGS__)
# define PROFILER_LOG_DEBUG(fmt, ...) GGML_LOG_INFO("[profiler]" fmt, __VA_ARGS__)
#else
# define SCOPED_PERFORMANCE_TRACKER(fmt, ...) ((void) 0)
# define PROFILER_LOG_DEBUG(...) ((void) 0)
#endif
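// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the backend): when
// GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING is defined, the macro creates a
// scoped_timer that logs the wall-clock duration of the enclosing scope;
// otherwise it compiles away. Hypothetical example:
// ---------------------------------------------------------------------------
inline void example_profiled_scope(int op_index) {
SCOPED_PERFORMANCE_TRACKER("example_op, idx: %d", op_index);
// ... body of the operation being measured ...
}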

View File

@ -0,0 +1,223 @@
#pragma once
#include <memory>
#include "common.hpp"
#include "dyn-lib-loader.hpp"
#ifdef GGML_QNN_ENABLE_HEXAGON_BACKEND
# include <remote.h>
#else
// TODO: remove this when not needed
/**
* @enum fastrpc_map_flags for fastrpc_mmap and fastrpc_munmap
* @brief Types of maps with cache maintenance
*/
enum fastrpc_map_flags {
/**
* Map memory pages with RW- permission and CACHE WRITEBACK.
* Driver will clean cache when buffer passed in a FastRPC call.
* Same remote virtual address will be assigned for subsequent
* FastRPC calls.
*/
FASTRPC_MAP_STATIC,
/** Reserved for compatibility with deprecated flag */
FASTRPC_MAP_RESERVED,
/**
* Map memory pages with RW- permission and CACHE WRITEBACK.
* Mapping tagged with a file descriptor. User is responsible for
* maintenance of CPU and DSP caches for the buffer. Get virtual address
* of buffer on DSP using HAP_mmap_get() and HAP_mmap_put() functions.
*/
FASTRPC_MAP_FD,
/**
* Mapping delayed until user calls HAP_mmap() and HAP_munmap()
* functions on DSP. User is responsible for maintenance of CPU and DSP
* caches for the buffer. Delayed mapping is useful for users to map
* buffer on DSP with other than default permissions and cache modes
* using HAP_mmap() and HAP_munmap() functions.
*/
FASTRPC_MAP_FD_DELAYED,
/** Reserved for compatibility **/
FASTRPC_MAP_RESERVED_4,
FASTRPC_MAP_RESERVED_5,
FASTRPC_MAP_RESERVED_6,
FASTRPC_MAP_RESERVED_7,
FASTRPC_MAP_RESERVED_8,
FASTRPC_MAP_RESERVED_9,
FASTRPC_MAP_RESERVED_10,
FASTRPC_MAP_RESERVED_11,
FASTRPC_MAP_RESERVED_12,
FASTRPC_MAP_RESERVED_13,
FASTRPC_MAP_RESERVED_14,
FASTRPC_MAP_RESERVED_15,
/**
* This flag is used to skip CPU mapping,
* otherwise behaves similar to FASTRPC_MAP_FD_DELAYED flag.
*/
FASTRPC_MAP_FD_NOMAP,
/** Update FASTRPC_MAP_MAX when adding new value to this enum **/
};
#endif
namespace common {
#ifdef _WIN32
constexpr const char * kQnnRpcLibName = "libcdsprpc.dll";
#else
constexpr const char * kQnnRpcLibName = "libcdsprpc.so";
#endif
class rpc_interface {
using rpc_mem_init_t = void (*)();
using rpc_mem_deinit_t = void (*)();
using rpc_mem_alloc_t = void * (*) (int heapid, uint32_t flags, int size);
using rpc_mem_alloc2_t = void * (*) (int heapid, uint32_t flags, size_t size);
using rpc_mem_free_t = void (*)(void * po);
using rpc_mem_to_fd_t = int (*)(void * po);
using rpc_mem_fastrpc_mmap_t = int (*)(int domain, int fd, void * addr, int offset, size_t length,
enum fastrpc_map_flags flags);
using rpc_mem_fastrpc_munmap_t = int (*)(int domain, int fd, void * addr, size_t length);
using remote_handle_control_t = int (*)(uint32_t req, void * data, uint32_t datalen);
using remote_session_control_t = int (*)(uint32_t req, void * data, uint32_t datalen);
public:
rpc_interface(const std::string & rpc_lib_path = kQnnRpcLibName) {
_rpc_lib_handle = dl_load(rpc_lib_path);
if (!_rpc_lib_handle) {
LOG_ERROR("failed to load %s, error: %s\n", rpc_lib_path.c_str(), dl_error());
return;
}
_rpc_mem_init = reinterpret_cast<rpc_mem_init_t>(dl_sym(_rpc_lib_handle, "rpcmem_init"));
_rpc_mem_deinit = reinterpret_cast<rpc_mem_deinit_t>(dl_sym(_rpc_lib_handle, "rpcmem_deinit"));
_rpc_mem_alloc = reinterpret_cast<rpc_mem_alloc_t>(dl_sym(_rpc_lib_handle, "rpcmem_alloc"));
_rpc_mem_alloc2 = reinterpret_cast<rpc_mem_alloc2_t>(dl_sym(_rpc_lib_handle, "rpcmem_alloc2"));
_rpc_mem_free = reinterpret_cast<rpc_mem_free_t>(dl_sym(_rpc_lib_handle, "rpcmem_free"));
_rpc_mem_to_fd = reinterpret_cast<rpc_mem_to_fd_t>(dl_sym(_rpc_lib_handle, "rpcmem_to_fd"));
_rpc_mem_fastrpc_mmap = reinterpret_cast<rpc_mem_fastrpc_mmap_t>(dl_sym(_rpc_lib_handle, "fastrpc_mmap"));
_rpc_mem_fastrpc_munmap = reinterpret_cast<rpc_mem_fastrpc_munmap_t>(dl_sym(_rpc_lib_handle, "fastrpc_munmap"));
_remote_handle_control =
reinterpret_cast<remote_handle_control_t>(dl_sym(_rpc_lib_handle, "remote_handle_control"));
_remote_session_control =
reinterpret_cast<remote_session_control_t>(dl_sym(_rpc_lib_handle, "remote_session_control"));
}
bool is_valid() const { return _rpc_lib_handle != nullptr; }
bool is_alloc2_available() const { return _rpc_mem_alloc2 != nullptr; }
void rpcmem_init() {
if (_rpc_mem_init) {
_rpc_mem_init();
}
}
void rpcmem_deinit() {
if (_rpc_mem_deinit) {
_rpc_mem_deinit();
}
}
void * rpcmem_alloc(int heapid, uint32_t flags, int size) {
if (!is_valid()) {
return nullptr;
}
return _rpc_mem_alloc(heapid, flags, size);
}
void * rpcmem_alloc2(int heapid, uint32_t flags, size_t size) {
if (!is_valid()) {
return nullptr;
}
return _rpc_mem_alloc2(heapid, flags, size);
}
void rpcmem_free(void * buf) {
if (is_valid()) {
_rpc_mem_free(buf);
}
}
int rpcmem_to_fd(void * buf) {
int mem_fd = -1;
if (is_valid()) {
mem_fd = _rpc_mem_to_fd(buf);
}
return mem_fd;
}
int fastrpc_mmap(int domain, int fd, void * addr, int offset, size_t length, enum fastrpc_map_flags flags) {
if (!is_valid()) {
return -1;
}
return _rpc_mem_fastrpc_mmap(domain, fd, addr, offset, length, flags);
}
int fastrpc_munmap(int domain, int fd, void * addr, size_t length) {
if (!is_valid()) {
return -1;
}
return _rpc_mem_fastrpc_munmap(domain, fd, addr, length);
}
int remote_handle_control(uint32_t req, void * data, uint32_t datalen) {
if (!is_valid()) {
return -1;
}
return _remote_handle_control(req, data, datalen);
}
int remote_session_control(uint32_t req, void * data, uint32_t datalen) {
if (!is_valid()) {
return -1;
}
return _remote_session_control(req, data, datalen);
}
~rpc_interface() {
if (_rpc_lib_handle) {
if (_rpc_mem_deinit) {
_rpc_mem_deinit();
}
dl_unload(_rpc_lib_handle);
}
}
private:
dl_handler_t _rpc_lib_handle = nullptr;
rpc_mem_init_t _rpc_mem_init = nullptr;
rpc_mem_deinit_t _rpc_mem_deinit = nullptr;
rpc_mem_alloc_t _rpc_mem_alloc = nullptr;
rpc_mem_alloc2_t _rpc_mem_alloc2 = nullptr;
rpc_mem_free_t _rpc_mem_free = nullptr;
rpc_mem_to_fd_t _rpc_mem_to_fd = nullptr;
rpc_mem_fastrpc_mmap_t _rpc_mem_fastrpc_mmap = nullptr;
rpc_mem_fastrpc_munmap_t _rpc_mem_fastrpc_munmap = nullptr;
remote_handle_control_t _remote_handle_control = nullptr;
remote_session_control_t _remote_session_control = nullptr;
rpc_interface(const rpc_interface &) = delete;
rpc_interface & operator=(const rpc_interface &) = delete;
rpc_interface(rpc_interface &&) = delete;
rpc_interface & operator=(rpc_interface &&) = delete;
};
using rpc_interface_ptr = std::shared_ptr<rpc_interface>;
} // namespace common

View File

@ -0,0 +1,131 @@
#pragma once
#include <limits>
#include <memory>
#include "common.hpp"
#include "dyn-lib-loader.hpp"
#include "rpc-interface.hpp"
namespace common {
class rpc_mem {
public:
rpc_mem() {
auto interface = std::make_shared<rpc_interface>();
if (!interface->is_valid()) {
LOG_ERROR("failed to load rpcmem lib\n");
return;
}
interface->rpcmem_init();
_rpc_interface = interface;
LOG_DEBUG("load rpcmem lib successfully\n");
}
explicit rpc_mem(rpc_interface_ptr interface) {
if (!interface->is_valid()) {
LOG_ERROR("failed to load rpcmem lib\n");
return;
}
interface->rpcmem_init();
_rpc_interface = interface;
LOG_DEBUG("load rpcmem lib successfully\n");
}
~rpc_mem() {
if (!is_valid()) {
LOG_DEBUG("rpc memory not initialized\n");
return;
}
if (_rpc_interface) {
_rpc_interface->rpcmem_deinit();
_rpc_interface.reset();
}
LOG_DEBUG("unload rpcmem lib successfully\n");
}
bool is_valid() const { return (bool) _rpc_interface; }
void * alloc(int heapid, uint32_t flags, size_t size) {
if (!is_valid()) {
LOG_ERROR("rpc memory not initialized\n");
return nullptr;
}
if (size > get_max_alloc_size()) {
LOG_ERROR("rpc memory size %zu exceeds max alloc size %zu\n", size, get_max_alloc_size());
return nullptr;
}
void * buf = nullptr;
if (_rpc_interface->is_alloc2_available()) {
LOG_DEBUG("rpcmem_alloc2 available, using it\n");
buf = _rpc_interface->rpcmem_alloc2(heapid, flags, size);
} else {
LOG_DEBUG("rpcmem_alloc2 not available, using rpcmem_alloc\n");
buf = _rpc_interface->rpcmem_alloc(heapid, flags, size);
}
if (!buf) {
LOG_ERROR("failed to allocate rpc memory, size: %d MB\n", (int) (size / (1 << 20)));
return nullptr;
}
LOG_DEBUG("rpc buffer allocated, heapid: %d, flags: 0x%x, size: %zu\n", heapid, flags, size);
return buf;
}
void free(void * buf) {
if (!is_valid()) {
LOG_ERROR("rpc memory not initialized\n");
} else {
_rpc_interface->rpcmem_free(buf);
}
}
int to_fd(void * buf) {
int mem_fd = -1;
if (!is_valid()) {
LOG_ERROR("rpc memory not initialized\n");
} else {
mem_fd = _rpc_interface->rpcmem_to_fd(buf);
}
return mem_fd;
}
size_t get_max_alloc_size() {
return _rpc_interface->is_alloc2_available() ? std::numeric_limits<size_t>::max() :
std::numeric_limits<int>::max();
}
int fastrpc_mmap(int domain, int fd, void * addr, int offset, size_t length, enum fastrpc_map_flags flags) {
if (!is_valid()) {
LOG_ERROR("rpc memory not initialized\n");
return -1;
}
return _rpc_interface->fastrpc_mmap(domain, fd, addr, offset, length, flags);
}
int fastrpc_munmap(int domain, int fd, void * addr, size_t length) {
if (!is_valid()) {
LOG_ERROR("rpc memory not initialized\n");
return -1;
}
return _rpc_interface->fastrpc_munmap(domain, fd, addr, length);
}
private:
rpc_interface_ptr _rpc_interface;
};
using rpc_mem_ptr = std::shared_ptr<rpc_mem>;
} // namespace common
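// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the backend): a typical allocate/share/free
// cycle with rpc_mem. The heap id and flags mirror the RPCMEM_HEAP_ID_SYSTEM /
// RPCMEM_DEFAULT_FLAGS constants used elsewhere in the backend; the literal
// values below are repeated here only for the sake of the example.
// ---------------------------------------------------------------------------
inline bool example_rpc_mem_roundtrip(size_t size) {
common::rpc_mem mem;
if (!mem.is_valid()) {
return false;
}
void * buf = mem.alloc(25 /* RPCMEM_HEAP_ID_SYSTEM */, 1 /* RPCMEM_DEFAULT_FLAGS */, size);
if (!buf) {
return false;
}
int fd = mem.to_fd(buf); // fd can be handed to fastrpc_mmap() for DSP access
mem.free(buf);
return fd >= 0;
}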