feat: op perf opt (#38)
* add op define xml
* copy qnn libs in cmake
* fix htp skel path
* add windows copy file list
* wip
* add generated package
* remove unused params
* add cmake list
* set qnn sdk and hexagon sdk path
* wip
* wip
* fix tools version
* fix compiling error
* fix dims calc
* wip
* add mulmat 2d
* wip
* reduction
* wip
* wip
* fix compiling error in x64
* wip
* fix device description in emulator
* wip
* add flag
* copy necessary libs
* wip
* load HtpPrepare first for emulator
* enable custom op for 2d matrix
* verify op config before add to node
* Revert "verify op config before add to node"
  This reverts commit 206dec826e560625e053c4c78e023994f993526e.
* wip
* wip
* wip
* revert tool version change
* use hexagon sdk version 5.5.0
  https://docs.qualcomm.com/bundle/publicresource/topics/80-77512-2/release-notes-wrapper.html?product=1601111740010422#5.5.0
* wip
* move to sub dir
* add hexagon npu device and server lib
* fix npu lib build
* refactoring: rename QNNBackend enum
* fix compiling error
* wip
* remove qnn/backend.hpp
* add hexagon dsp host layer
* extract rpc_mem from qnn submodule
* fix dsp compiling error
* wip
* wip
* open and close npu device
* split objects into separate files
* fix linking error
* add npu_tensor
* add host graph
* map rpc buffer before usage
* fix some todos
* add shared module
* split rpc_interface from rpc_mem
* get get_dsp_arch from device
* wip
* rename host classes
* fix hexagon sdk arch getter
* fix device open
* fix linking error
* fix crash
* use tensor_data_type
* fix npu lib crash
* fix debug log print
* skip empty graph
* wip
* add log
* fix unmap fail
* fix tensor set
* remove some logs
* flush back memory after finished
* fix nb
* wip
* wip
* add helper function
* impl add op
* fix some add in test-backend-ops
* add elt wise sub and mul
* fix crash on some inplace op
* wip
* fix elt wise op calc
* wip
* split mul_mat into file
* add caps array
* wip
* wip
* print supported/unsupported op
* copy lldb-server for newer android sdk
* add tensor_spec
* add assert
* fix crash when loading model
* rename cmake option
* fix name
* fix device memory and description
* fix compiling error on qnn only build
* fix some potential UBs
* fix comments
This commit is contained in:
parent
9e41f79403
commit
beff5c4b78
@@ -1,24 +1,11 @@
#pragma once

#include "ggml-backend.h"
#include "ggml.h"

#ifdef __cplusplus
extern "C" {
#endif

#define GGML_QNN_NAME "qnn"
#define GGML_QNN_MAX_DEVICES QNN_BACKEND_COUNT

enum QNNBackend {
    QNN_BACKEND_CPU = 0,
    QNN_BACKEND_GPU,
    QNN_BACKEND_NPU,
    QNN_BACKEND_COUNT,
};

GGML_API bool ggml_backend_is_qnn(ggml_backend_t backend);

GGML_API ggml_backend_reg_t ggml_backend_qnn_reg(void);

#ifdef __cplusplus
@@ -1,9 +1,13 @@
message(STATUS "Using QNN backend")

option(GGML_HEXAGON_NPU_ONLY "ggml-qnn: Only use Hexagon NPU" OFF)
option(GGML_QNN_ENABLE_HEXAGON_BACKEND "ggml-qnn: Enable Hexagon custom package" ${GGML_HEXAGON_NPU_ONLY})

if(CMAKE_SYSTEM_NAME STREQUAL "Android")
    find_library(LOG_LIB log)
    set(QNN_LINK_LIBRARIES ${LOG_LIB})
    set(QNN_DEFAULT_LIB_SEARCH_PATH "/data/local/tmp/" CACHE STRING "customized library search path for QNN backend")
    add_compile_options(-g -O0)
elseif(CMAKE_SYSTEM_NAME STREQUAL "Windows" OR CMAKE_SYSTEM_NAME STREQUAL "Linux")
    set(QNN_DEFAULT_LIB_SEARCH_PATH "" CACHE STRING "customized library search path for QNN backend")
else()
@@ -21,15 +25,22 @@ if(NOT DEFINED GGML_QNN_SDK_PATH)
endif()

message("CMAKE_BUILD_TYPE: ${CMAKE_BUILD_TYPE}")
message("CMAKE_CXX_FLAGS_DEBUG: ${CMAKE_CXX_FLAGS_DEBUG}")
message("CMAKE_CXX_FLAGS_RELEASE: ${CMAKE_CXX_FLAGS_RELEASE}")
message("QNN_SDK_PATH: ${GGML_QNN_SDK_PATH}")

file(GLOB QNN_SOURCES "${CMAKE_CURRENT_LIST_DIR}/*.cpp")
file(GLOB QNN_SOURCES "${CMAKE_CURRENT_LIST_DIR}/qnn/*.cpp")
file(GLOB COMMON_SOURCES "${CMAKE_CURRENT_LIST_DIR}/*.cpp")
ggml_add_backend_library(ggml-qnn
    ${QNN_SOURCES}
    ${COMMON_SOURCES}
)

target_include_directories(ggml-qnn PRIVATE ${GGML_QNN_SDK_PATH}/include/QNN ${CMAKE_CURRENT_LIST_DIR})
target_include_directories(ggml-qnn PRIVATE
    ${GGML_QNN_SDK_PATH}/include/QNN
    ${CMAKE_CURRENT_LIST_DIR}/qnn
    ${CMAKE_CURRENT_LIST_DIR}
)
target_link_libraries(ggml-qnn PRIVATE ${QNN_LINK_LIBRARIES})

if(NOT "${QNN_DEFAULT_LIB_SEARCH_PATH}" STREQUAL "")
@@ -52,3 +63,99 @@ if(GGML_QNN_ENABLE_PERFORMANCE_TRACKING)
else()
    message("GGML_QNN_ENABLE_PERFORMANCE_TRACKING is disabled")
endif()

add_subdirectory(shared)

if(GGML_HEXAGON_NPU_ONLY)
    message("GGML_HEXAGON_NPU_ONLY is enabled")
    add_compile_definitions(GGML_HEXAGON_NPU_ONLY)
    set(GGML_QNN_ENABLE_HEXAGON_BACKEND ON)
else()
    message("GGML_HEXAGON_NPU_ONLY is disabled")
endif()

if(GGML_QNN_ENABLE_HEXAGON_BACKEND)
    message("GGML_QNN_ENABLE_HEXAGON_BACKEND is enabled")
    add_subdirectory(npu)
    target_link_libraries(hexagon-npu-host runtime-common)
    target_link_libraries(ggml-qnn PRIVATE hexagon-npu-host)
else()
    message("GGML_QNN_ENABLE_HEXAGON_BACKEND is disabled")
    target_link_libraries(ggml-qnn PRIVATE runtime-common)
endif()

# Copy QNN dynamic libraries
set(QNN_DYNAMIC_LIBS "")

if(CMAKE_SYSTEM_NAME STREQUAL "Android" OR CMAKE_SYSTEM_NAME STREQUAL "Linux")
    if(CMAKE_SYSTEM_NAME STREQUAL "Android")
        # Android
        set(QNN_SDK_LIB_PATH "${GGML_QNN_SDK_PATH}/lib/aarch64-android")
    elseif(CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64")
        # Linux x86_64
        set(QNN_SDK_LIB_PATH "${GGML_QNN_SDK_PATH}/lib/x86_64-linux-clang")
    else()
        # Linux aarch64
        set(QNN_SDK_LIB_PATH "${GGML_QNN_SDK_PATH}/lib/aarch64-oe-linux-gcc11.2")
    endif()

    list(APPEND QNN_DYNAMIC_LIBS "${QNN_SDK_LIB_PATH}/libQnnSystem.so")
    list(APPEND QNN_DYNAMIC_LIBS "${QNN_SDK_LIB_PATH}/libQnnCpu.so")
    list(APPEND QNN_DYNAMIC_LIBS "${QNN_SDK_LIB_PATH}/libQnnGpu.so")
    list(APPEND QNN_DYNAMIC_LIBS "${QNN_SDK_LIB_PATH}/libQnnHtp.so")
    file(GLOB HTP_STUB_LIBS "${QNN_SDK_LIB_PATH}/libQnnHtp*.so")
    list(APPEND QNN_DYNAMIC_LIBS ${HTP_STUB_LIBS})

    if(CMAKE_SYSTEM_NAME STREQUAL "Android")
        file(GLOB HTP_SKEL_LIBS "${GGML_QNN_SDK_PATH}/lib/hexagon-*/unsigned/libQnnHtp*Skel.so")
        list(APPEND QNN_DYNAMIC_LIBS ${HTP_SKEL_LIBS})

        if(CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64")
            if(EXISTS "${CMAKE_ANDROID_NDK}/prebuilt/android-arm64/gdbserver/gdbserver")
                list(APPEND QNN_DYNAMIC_LIBS "${CMAKE_ANDROID_NDK}/prebuilt/android-arm64/gdbserver/gdbserver")
                message("old ndk, copy gdbserver")
            else()
                file(GLOB LLDB_SERVER "${CMAKE_ANDROID_NDK}/toolchains/llvm/prebuilt/linux-x86_64/lib64/clang/*/lib/linux/aarch64/lldb-server")
                list(APPEND QNN_DYNAMIC_LIBS ${LLDB_SERVER})
                message("new ndk, copy lldb-server")
            endif()

            file(GLOB OMP_LIBS "${CMAKE_ANDROID_NDK}/toolchains/llvm/prebuilt/linux-x86_64/lib64/clang/*/lib/linux/aarch64/libomp.so")
            file(GLOB ASAN_LIBS "${CMAKE_ANDROID_NDK}/toolchains/llvm/prebuilt/linux-x86_64/lib64/clang/*/lib/linux/libclang_rt.asan-aarch64-android.so")
            list(APPEND QNN_DYNAMIC_LIBS ${OMP_LIBS})
            list(APPEND QNN_DYNAMIC_LIBS ${ASAN_LIBS})
        endif()
    else()
        # Linux
        list(APPEND QNN_DYNAMIC_LIBS "${QNN_SDK_LIB_PATH}/libHtpPrepare.so")
    endif()
elseif(CMAKE_SYSTEM_NAME STREQUAL "Windows")
    if(CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64")
        # x86_64
        set(QNN_SDK_LIB_PATH "${GGML_QNN_SDK_PATH}/lib/x86_64-windows-msvc")
    else()
        # aarch64
        set(QNN_SDK_LIB_PATH "${GGML_QNN_SDK_PATH}/lib/aarch64-windows-msvc")
    endif()

    list(APPEND QNN_DYNAMIC_LIBS "${QNN_SDK_LIB_PATH}/QnnSystem.dll")
    list(APPEND QNN_DYNAMIC_LIBS "${QNN_SDK_LIB_PATH}/QnnCpu.dll")
    list(APPEND QNN_DYNAMIC_LIBS "${QNN_SDK_LIB_PATH}/QnnGpu.dll")
    list(APPEND QNN_DYNAMIC_LIBS "${QNN_SDK_LIB_PATH}/QnnHtp.dll")
    file(GLOB HTP_STUB_LIBS "${QNN_SDK_LIB_PATH}/QnnHtp*.dll")

    if(CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64")
        list(APPEND QNN_DYNAMIC_LIBS "${QNN_SDK_LIB_PATH}/HtpPrepare.dll")
    endif()

    list(APPEND QNN_DYNAMIC_LIBS ${HTP_STUB_LIBS})
endif()

foreach(QNN_DYNAMIC_LIB ${QNN_DYNAMIC_LIBS})
    message("Copy: ${QNN_DYNAMIC_LIB} -> ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}")
    add_custom_command(
        TARGET ggml-qnn POST_BUILD
        COMMAND ${CMAKE_COMMAND} -E copy
        ${QNN_DYNAMIC_LIB}
        ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
endforeach()
@@ -1,11 +0,0 @@
#pragma once

#include "backend.hpp"
#include "ggml.h"

namespace qnn {

bool device_supports_op(ggml_backend_qnn_device_context * ctx, const ggml_tensor * op);
bool device_compute_graph(ggml_backend_qnn_device_context * ctx, ggml_cgraph * cgraph);

} // namespace qnn
@@ -0,0 +1,147 @@
enable_language(ASM)
cmake_policy(SET CMP0115 OLD)

if(DEFINED ENV{HEXAGON_SDK_ROOT})
    set(HEXAGON_SDK_ROOT $ENV{HEXAGON_SDK_ROOT})
    message("HEXAGON_SDK_ROOT: ${HEXAGON_SDK_ROOT}")
else()
    message(FATAL_ERROR "HEXAGON_SDK_ROOT not defined")
endif()

if(HEXAGON_SDK_ROOT)
    include(${HEXAGON_SDK_ROOT}/build/cmake/hexagon_fun.cmake)
else()
    include(${HEXAGON_CMAKE_ROOT}/hexagon_fun.cmake)
endif()

# Base Include dirs for the Project
set(common_incs
    ${CMAKE_CURRENT_BINARY_DIR}/
    ${HEXAGON_SDK_ROOT}/incs/
    ${HEXAGON_SDK_ROOT}/incs/stddef/
    ${HEXAGON_SDK_ROOT}/incs/HAP/
    ${HEXAGON_SDK_ROOT}/rtos/qurt/
    ${HEXAGON_SDK_ROOT}/utils/examples/
)

include_directories(${common_incs})

if(${CMAKE_SYSTEM_NAME} MATCHES "Android|Linux|Windows")
    # host build
    file(GLOB common_srcs "${CMAKE_CURRENT_LIST_DIR}/common/*.cpp")
    file(GLOB host_srcs "${CMAKE_CURRENT_LIST_DIR}/host/*.cpp")
    set(stub_srcs "${CMAKE_CURRENT_BINARY_DIR}/npu_device_stub.c")
    add_library(hexagon-npu-host STATIC
        ${common_srcs}
        ${host_srcs}
        ${stub_srcs}
    )

    # disable warnings for the stub
    set_source_files_properties(
        ${stub_srcs}
        PROPERTIES
        COMPILE_FLAGS "-w"
    )

    build_idl(idl/hexagon_npu.idl hexagon-npu-host)

    # Add compile definitions to the target
    target_compile_definitions(hexagon-npu-host PUBLIC
        VERIFY_PRINT_ERROR
        GGML_QNN_ENABLE_HEXAGON_BACKEND
    )

    target_include_directories(hexagon-npu-host PRIVATE
        ${HEXAGON_SDK_ROOT}/ipc/fastrpc/rpcmem/inc/
        ${QNN_SDK_ROOT}/include/QNN/
        ${CMAKE_CURRENT_LIST_DIR}/host/
        ${CMAKE_CURRENT_LIST_DIR}/
    )

    target_include_directories(hexagon-npu-host PUBLIC
        ${HEXAGON_SDK_ROOT}/incs/ # TODO: this is for rpc-mem
    )

    if(NOT ${CMAKE_SYSTEM_NAME} MATCHES "Windows")
        set_target_properties(hexagon-npu-host PROPERTIES OUTPUT_NAME "hexagon_npu")
    endif()

    if(${CMAKE_SYSTEM_NAME} MATCHES "Android|Linux")
        target_link_options(hexagon-npu-host PUBLIC -pie)
    endif()

    link_options(hexagon-npu-host)

    if(${CMAKE_SYSTEM_NAME} MATCHES "Android")
        set(PREBUILT_LIB_DIR "android_aarch64")
    elseif(${CMAKE_SYSTEM_NAME} MATCHES "Linux")
        set(PREBUILT_LIB_DIR "UbuntuARM_aarch64")
    else()
        # Windows
        set(PREBUILT_LIB_DIR "windows_aarch64")
    endif()

    choose_dsprpc("3" dsprpc) # cdsprpc
    link_custom_library(hexagon-npu-host ${dsprpc})
else()
    # hexagon npu build
    cmake_minimum_required(VERSION 3.14.3)
    project(hexagon_npu C CXX ASM)

    # check if QNN_SDK_ROOT is set
    if(NOT DEFINED ENV{QNN_SDK_ROOT})
        message(FATAL_ERROR "QNN_SDK_ROOT not defined")
    endif()

    set(QNN_SDK_ROOT $ENV{QNN_SDK_ROOT})
    message("QNN_SDK_ROOT: ${QNN_SDK_ROOT}")
    include_directories(
        ${QNN_SDK_ROOT}/include/QNN/
    )
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++17")

    file(GLOB common_srcs "${CMAKE_CURRENT_LIST_DIR}/common/*.cpp")
    file(GLOB device_srcs "${CMAKE_CURRENT_LIST_DIR}/device/*.cpp")
    set(skel_srcs "${CMAKE_CURRENT_BINARY_DIR}/npu_device_skel.c")
    add_library(hexagon_npu_skel_OBJS OBJECT
        ${common_srcs}
        ${device_srcs}
        ${skel_srcs}
    )

    if(CMAKE_BUILD_TYPE MATCHES "Debug|Dbg")
        message("Debug build, enable all logging")
        target_compile_definitions(hexagon_npu_skel_OBJS PUBLIC
            _DEBUG
            DEBUG_LOGGING
        )
    else()
        message("Release build, disable debug logging")
        target_compile_definitions(hexagon_npu_skel_OBJS PUBLIC
            NDEBUG
            RELEASE_LOGGING
        )
    endif()

    build_idl(idl/hexagon_npu.idl hexagon_npu_skel_OBJS)

    # disable warnings for the skel
    set_source_files_properties(
        ${skel_srcs}
        PROPERTIES
        COMPILE_FLAGS "-w"
    )

    add_library(hexagon_npu_skel SHARED $<TARGET_OBJECTS:hexagon_npu_skel_OBJS>)

    target_link_libraries(hexagon_npu_skel
        ${HEXAGON_LIB_DIR}/${HEXAGON_ARCH}/G0/pic/libc++abi.a
        ${HEXAGON_LIB_DIR}/${HEXAGON_ARCH}/G0/pic/libc++.a
    )
    set_target_properties(hexagon_npu_skel PROPERTIES OUTPUT_NAME "hexagon_npu_skel_${HEXAGON_ARCH}")

    copy_binaries(hexagon_npu_skel)
endif()

# vim: set noet fenc=utf-8 ff=unix ft=cmake :
@@ -0,0 +1,173 @@

#include <AEEStdErr.h>
#include <HAP_compute_res.h>
#include <hexagon_types.h>

#include <new>

#include "graph.hpp"
#include "hexagon_npu.h"
#include "op_impl.hpp"
#include "remote.h"
#include "tensor.hpp"
#include "util.hpp"

#define NPU_UNUSED(x) (void) (x)

namespace {

struct npu_device_context {
    int unused = 0;
    // TODO: should we add tensor context here?
};

inline hexagon::tensor * tensor_from_handle(npu_device_tensor_handle_t h) {
    return reinterpret_cast<hexagon::tensor *>(h);
}

inline npu_device_tensor_handle_t tensor_to_handle(hexagon::tensor * tensor) {
    return reinterpret_cast<npu_device_tensor_handle_t>(tensor);
}

inline hexagon::graph * graph_from_handle(npu_device_graph_handle_t h) {
    return reinterpret_cast<hexagon::graph *>(h);
}

inline npu_device_graph_handle_t graph_to_handle(hexagon::graph * graph) {
    return reinterpret_cast<npu_device_graph_handle_t>(graph);
}

} // namespace

int npu_device_open(const char * uri, remote_handle64 * h) {
    // TODO: should we have a device context here?
    auto * context = new (std::nothrow) npu_device_context();
    if (!context) {
        DEVICE_LOG_ERROR("Failed to allocate memory for the npu_device_context");
        return AEE_ENOMEMORY;
    }

    *h = reinterpret_cast<remote_handle64>(context);
    return AEE_SUCCESS;
}

int npu_device_close(remote_handle64 h) {
    auto * context = reinterpret_cast<npu_device_context *>(h);
    if (!context) {
        DEVICE_LOG_ERROR("Invalid npu_device_context handle");
        return AEE_EINVHANDLE;
    }

    delete context;
    return AEE_SUCCESS;
}

AEEResult npu_device_device_get_alignment(remote_handle64 _h, uint32_t * alignment) {
    NPU_UNUSED(_h);
    *alignment = sizeof(HVX_Vector);
    return AEE_SUCCESS;
}

AEEResult npu_device_device_support_op(remote_handle64 _h, const npu_device_tensor_spec * src0,
                                       const npu_device_tensor_spec * src1, const npu_device_tensor_spec * dst,
                                       npu_device_tensor_op op, boolean * is_supported) {
    NPU_UNUSED(_h);
    *is_supported = hexagon::support_op(*src0, *src1, *dst, op);
    return AEE_SUCCESS;
}

AEEResult npu_device_tensor_init(remote_handle64 _h, const npu_device_tensor_config * info,
                                 npu_device_tensor_handle_t * tensor_handle) {
    NPU_UNUSED(_h);
    auto * tensor = new (std::nothrow) hexagon::tensor(*info);
    if (!tensor) {
        DEVICE_LOG_ERROR("Failed to allocate memory for the tensor");
        return AEE_ENOMEMORY;
    }

    *tensor_handle = tensor_to_handle(tensor);
    return AEE_SUCCESS;
}

AEEResult npu_device_tensor_set_src(remote_handle64 _h, npu_device_tensor_handle_t tensor_handle, uint64_t index,
                                    npu_device_tensor_handle_t src) {
    NPU_UNUSED(_h);
    auto * tensor = tensor_from_handle(tensor_handle);
    if (!tensor) {
        return AEE_EINVHANDLE;
    }

    auto * src_tensor = tensor_from_handle(src);
    tensor->set_src(index, src_tensor);
    return AEE_SUCCESS;
}

AEEResult npu_device_tensor_set_op(remote_handle64 _h, npu_device_tensor_handle_t tensor_handle,
                                   npu_device_tensor_op op) {
    NPU_UNUSED(_h);
    auto * tensor = tensor_from_handle(tensor_handle);
    if (!tensor) {
        return AEE_EINVHANDLE;
    }

    tensor->set_op(op);
    return AEE_SUCCESS;
}

AEEResult npu_device_tensor_free(remote_handle64 _h, npu_device_tensor_handle_t tensor_handle) {
    NPU_UNUSED(_h);
    auto * tensor = tensor_from_handle(tensor_handle);
    if (!tensor) {
        return AEE_EINVHANDLE;
    }

    delete tensor;
    return AEE_SUCCESS;
}

AEEResult npu_device_graph_init(remote_handle64 _h, npu_device_graph_handle_t * graph_handle) {
    NPU_UNUSED(_h);
    auto * graph = new (std::nothrow) hexagon::graph();
    if (!graph) {
        return AEE_ENOMEMORY;
    }

    *graph_handle = graph_to_handle(graph);
    return AEE_SUCCESS;
}

AEEResult npu_device_graph_set_tensor(remote_handle64 _h, npu_device_graph_handle_t graph_handle,
                                      const npu_device_tensor_handle_t * tensor_handles, int tensor_handlesLen) {
    NPU_UNUSED(_h);
    auto * graph = graph_from_handle(graph_handle);
    if (!graph || !tensor_handles || tensor_handlesLen <= 0) {
        return AEE_EINVHANDLE;
    }

    graph->set_tensor(tensor_handles, tensor_handlesLen);
    return AEE_SUCCESS;
}

AEEResult npu_device_graph_compute(remote_handle64 _h, npu_device_graph_handle_t graph_handle) {
    NPU_UNUSED(_h);
    auto * graph = graph_from_handle(graph_handle);
    if (!graph) {
        return AEE_EINVHANDLE;
    }

    if (!graph->compute()) {
        return AEE_EFAILED;
    }

    return AEE_SUCCESS;
}

AEEResult npu_device_graph_free(remote_handle64 _h, npu_device_graph_handle_t graph_handle) {
    NPU_UNUSED(_h);
    auto * graph = graph_from_handle(graph_handle);
    if (graph) {
        delete graph;
    }

    return AEE_SUCCESS;
}
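For context, a minimal sketch of the host-side call sequence these stubs expect (mirroring the host_graph implementation later in this diff; error handling elided; `device_handle` and a `std::vector` named `tensor_handles` are assumed to exist):

    // Hypothetical driver code, not part of the patch:
    npu_device_graph_handle_t graph_handle = 0;
    npu_device_graph_init(device_handle, &graph_handle);
    // handles come from earlier npu_device_tensor_init calls, one per non-view node
    npu_device_graph_set_tensor(device_handle, graph_handle, tensor_handles.data(), (int) tensor_handles.size());
    if (npu_device_graph_compute(device_handle, graph_handle) != AEE_SUCCESS) {
        // fall back to another backend or report the failure
    }
    npu_device_graph_free(device_handle, graph_handle);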
@@ -0,0 +1,67 @@

#include "graph.hpp"

#include <new>

#include "op_impl.hpp"
#include "util.hpp"

namespace hexagon {

graph::~graph() noexcept {
    if (_tensors) {
        delete[] _tensors;
    }
}

void graph::set_tensor(const npu_device_tensor_handle_t * tensors, int tensor_count) {
    if (_tensor_count > 0) {
        delete[] _tensors;
    }

    if (tensor_count <= 0) {
        _tensors      = nullptr;
        _tensor_count = 0;
        return;
    }

    _tensors = new (std::nothrow) tensor *[tensor_count];
    if (!_tensors) {
        // allocation failed; leave the graph empty instead of dereferencing a null array below
        _tensor_count = 0;
        return;
    }

    for (int i = 0; i < tensor_count; ++i) {
        auto * tensor_obj = reinterpret_cast<tensor *>(tensors[i]);
        _tensors[i]       = tensor_obj;
        DEVICE_LOG_DEBUG("graph(%p) set_tensor[%d]: %p(%p,%p), op: %d\n", (void *) this, i, (void *) tensor_obj,
                         (void *) tensor_obj->get_src(0), (void *) tensor_obj->get_src(1), tensor_obj->get_op());
    }

    _tensor_count = tensor_count;
    DEVICE_LOG_DEBUG("graph(%p) tensor count: %zu\n", (void *) this, _tensor_count);
}

bool graph::compute() {
    if (!_tensors || !_tensor_count) {
        DEVICE_LOG_DEBUG("graph(%p) no tensors to compute\n", (void *) this);
        return true;  // return success if no tensors to compute
    }

    DEVICE_LOG_DEBUG("graph(%p) compute\n", (void *) this);
    for (size_t i = 0; i < _tensor_count; ++i) {
        auto * dst  = _tensors[i];
        auto   op   = dst->get_op();
        auto * func = get_compute_func(op);
        if (!func) {
            DEVICE_LOG_ERROR("graph(%p) tensor[%zu] op %d not supported\n", (void *) this, i, op);
            return false;
        }

        if (!func(dst)) {
            DEVICE_LOG_ERROR("graph(%p) tensor[%zu] op %d compute failed\n", (void *) this, i, op);
            return false;
        }

        dst->flush();  // TODO: optimize this
    }

    return true;
}

} // namespace hexagon
@@ -0,0 +1,29 @@
#pragma once

#include "hexagon_npu.h"
#include "tensor.hpp"

namespace hexagon {

class graph {
  public:
    // TODO: add execute direction here
    explicit graph() noexcept {}

    ~graph() noexcept;

    void set_tensor(const npu_device_tensor_handle_t * tensors, int tensor_count);

    bool compute();

  private:
    tensor ** _tensors      = nullptr;
    size_t    _tensor_count = 0;

    graph(const graph &) = delete;
    void operator=(const graph &) = delete;
    graph(graph &&) = delete;
    void operator=(graph &&) = delete;
};

} // namespace hexagon
@@ -0,0 +1,194 @@

#include "op_impl.hpp"

#include <hexagon_types.h>
#include <HTP/core/intrinsics.h>

#include <iterator>

#include "op_mul_mat.hpp"

namespace {

template <HVX_Vector (*_OpIntrinsic)(HVX_Vector, HVX_Vector)>
inline void vec_op_f32_f32(const float * src0, const float * src1, size_t count, float * dst) {
    HVX_Vector * iptr0     = ((HVX_Vector *) src0);
    HVX_Vector * iptr0_end = ((HVX_Vector *) src0) + (count / hexagon::kFloatsPerVector);
    HVX_Vector * iptr1     = ((HVX_Vector *) src1);
    HVX_Vector * optr      = ((HVX_Vector *) dst);
    HVX_Vector   prev0     = *iptr0++;
    HVX_Vector   prev1     = *iptr1++;

    // TODO: prefetch or just use VTCM?
    while (iptr0 < iptr0_end) {
        HVX_Vector curr0 = *iptr0++;
        HVX_Vector curr1 = *iptr1++;
        HVX_Vector s0    = Q6_V_valign_VVR(curr0, prev0, (size_t) src0);
        HVX_Vector s1    = Q6_V_valign_VVR(curr1, prev1, (size_t) src1);
        *optr++          = Q6_Vsf_equals_Vqf32(_OpIntrinsic(s0, s1));
        prev0            = curr0;
        prev1            = curr1;
    }

    if ((iptr0_end - ((HVX_Vector *) src0)) > 0) {
        // handle the last vector
        // see also:
        //   https://github.com/UbiquitousLearning/mllm/blob/babf4410352ce8730824c87699c025a0d4ce3a6f/src/backends/qnn/LLaMAOpPackageHtp/LLaMAPackage/src/ops/LLaMAMul.cpp#L147
        //   or qualcomm sdk libs\qhl_hvx\src\qhblas_hvx\qhblas_hvx_aw_vector_add_ah.c
        bool       iptr0_aligned = hexagon::is_addr_aligned(iptr0);
        HVX_Vector curr0         = iptr0_aligned ? prev0 : *iptr0;
        iptr0                    = iptr0_aligned ? iptr0 : iptr0 + 1;
        bool       iptr1_aligned = hexagon::is_addr_aligned(iptr1);
        HVX_Vector curr1         = iptr1_aligned ? prev1 : *iptr1;
        iptr1                    = iptr1_aligned ? iptr1 : iptr1 + 1;
        HVX_Vector s0            = Q6_V_valign_VVR(curr0, prev0, (size_t) src0);
        HVX_Vector s1            = Q6_V_valign_VVR(curr1, prev1, (size_t) src1);
        *optr++                  = Q6_Vsf_equals_Vqf32(_OpIntrinsic(s0, s1));
        prev0                    = curr0;
        prev1                    = curr1;
    }

    const size_t leftover       = count % hexagon::kFloatsPerVector;
    const size_t leftover_bytes = leftover * sizeof(float);
    if (leftover > 0) {
        // handle the leftover elements
        HVX_Vector curr0 =
            (leftover_bytes + hexagon::unaligned_bytes(iptr0) > hexagon::kBytesPerVector) ? *iptr0 : prev0;
        curr0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0);

        HVX_Vector curr1 =
            (leftover_bytes + hexagon::unaligned_bytes(iptr1) > hexagon::kBytesPerVector) ? *iptr1 : prev1;
        curr1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1);

        q6op_vstu_variable_ARV(optr, leftover_bytes, Q6_Vsf_equals_Vqf32(_OpIntrinsic(curr0, curr1)));
    }
}

inline HVX_Vector vadd_f32_f32(HVX_Vector a, HVX_Vector b) {
    return Q6_Vqf32_vadd_VsfVsf(a, b);
}

inline HVX_Vector vsub_f32_f32(HVX_Vector a, HVX_Vector b) {
    return Q6_Vqf32_vsub_VsfVsf(a, b);
}

inline HVX_Vector vmul_f32_f32(HVX_Vector a, HVX_Vector b) {
    return Q6_Vqf32_vmpy_VsfVsf(a, b);
}

template <typename _TySrc, typename _TyDst, void (*_RowFunc)(const _TySrc *, const _TySrc *, size_t, _TyDst *)>
bool element_wise_op(hexagon::tensor * out) {
    if (!out) {
        return false;
    }

    auto * src0 = out->get_src(0);
    auto * src1 = out->get_src(1);
    if (!src0 || !src1) {
        return true;  // skip if no src
    }

    if (src0->get_ne(0) != src1->get_ne(0)) {
        // TODO: handle this case
        DEVICE_LOG_ERROR("src0[0] and src1[0] not match: %ld vs %ld\n", (long) src0->get_ne(0), (long) src1->get_ne(0));
        return false;
    }

    static_assert(DEVICE_TENSOR_MAX_DIMS == 4, "element_wise_op requires max dims 4");

    const auto * src0_ptr = reinterpret_cast<const uint8_t *>(src0->get_data());
    const auto * src1_ptr = reinterpret_cast<const uint8_t *>(src1->get_data());
    auto *       dst_ptr  = reinterpret_cast<uint8_t *>(out->get_data());
    for (int64_t i3 = 0; i3 < out->get_ne(3); i3++) {
        const auto * src0_cube = src0_ptr + i3 * src0->get_nb(3);
        const auto * src1_cube = src1_ptr + (i3 % src1->get_ne(3)) * src1->get_nb(3);
        auto *       dst_cube  = dst_ptr + i3 * out->get_nb(3);
        for (int64_t i2 = 0; i2 < out->get_ne(2); i2++) {
            const auto * src0_plane = src0_cube + i2 * src0->get_nb(2);
            const auto * src1_plane = src1_cube + (i2 % src1->get_ne(2)) * src1->get_nb(2);
            auto *       dst_plane  = dst_cube + i2 * out->get_nb(2);
            for (int64_t i1 = 0; i1 < out->get_ne(1); i1++) {
                // TODO: prefetch row?
                auto * src0_row = src0_plane + i1 * src0->get_nb(1);
                auto * src1_row = src1_plane + (i1 % src1->get_ne(1)) * src1->get_nb(1);
                auto * dst_row  = reinterpret_cast<float *>(dst_plane + i1 * out->get_nb(1));
                _RowFunc(reinterpret_cast<const _TySrc *>(src0_row), reinterpret_cast<const _TySrc *>(src1_row),
                         static_cast<size_t>(out->get_ne(0)), reinterpret_cast<_TyDst *>(dst_row));
            }
        }
    }

    return true;
}

bool is_element_wise_op_supported(const npu_device_tensor_spec & src0, const npu_device_tensor_spec & src1,
                                  const npu_device_tensor_spec & dst, npu_device_tensor_op op) {
    if (op != NPU_OP_ADD && op != NPU_OP_SUB && op != NPU_OP_MUL) {
        DEVICE_LOG_DEBUG("Unsupported element wise op: %s\n", hexagon::op_get_name(op));
        return false;
    }

    if (src0.ne[0] != src1.ne[0]) {
        DEVICE_LOG_DEBUG("src0.ne[0] and src1.ne[0] not match: %ld vs %ld\n", (long) src0.ne[0], (long) src1.ne[0]);
        return false;
    }

    for (size_t i = 0; i < DEVICE_TENSOR_MAX_DIMS; ++i) {
        if (src0.ne[i] != dst.ne[i]) {
            DEVICE_LOG_DEBUG("src0.ne[%zu] and dst.ne[%zu] not match: %lld vs %lld\n", i, i, (long long) src0.ne[i],
                             (long long) dst.ne[i]);
            return false;
        }
    }

    return true;
}

struct op_capabilities {
    npu_device_tensor_op                op;
    hexagon::compute_func_type          compute_func;
    hexagon::op_is_supported_func_type  is_supported;
};

constexpr const op_capabilities kOpCapabilities[] = {
    { NPU_OP_MUL_MAT, hexagon::mul_mat_f32, hexagon::is_mul_mat_supported },
    { NPU_OP_ADD, element_wise_op<float, float, vec_op_f32_f32<vadd_f32_f32>>, is_element_wise_op_supported },
    { NPU_OP_SUB, element_wise_op<float, float, vec_op_f32_f32<vsub_f32_f32>>, is_element_wise_op_supported },
    { NPU_OP_MUL, element_wise_op<float, float, vec_op_f32_f32<vmul_f32_f32>>, is_element_wise_op_supported },
};

static_assert(kOpCapabilities[NPU_OP_MUL_MAT].compute_func == hexagon::mul_mat_f32,
              "kOpArray[NPU_OP_MUL_MAT] != mul_mat_f32");

static_assert(std::size(kOpCapabilities) == NPU_OP_COUNT);
static_assert(kOpCapabilities[NPU_OP_MUL_MAT].op == NPU_OP_MUL_MAT, "kOpArray[NPU_OP_MUL_MAT].op != NPU_OP_MUL_MAT");
static_assert(kOpCapabilities[NPU_OP_MUL].op == NPU_OP_MUL, "kOpArray[NPU_OP_MUL].op != NPU_OP_MUL");

} // namespace

namespace hexagon {

compute_func_type get_compute_func(npu_device_tensor_op op) {
    if (op >= NPU_OP_COUNT) {
        return nullptr;
    }

    return kOpCapabilities[op].compute_func;
}

bool support_op(const npu_device_tensor_spec & src0, const npu_device_tensor_spec & src1,
                const npu_device_tensor_spec & dst, npu_device_tensor_op op) {
    if (get_compute_func(op) == nullptr) {
        DEVICE_LOG_ERROR("Unsupported op: %s, get_compute_func failed\n", op_get_name(op));
        return false;
    }

    auto is_supported_func = kOpCapabilities[op].is_supported;
    if (!is_supported_func || !is_supported_func(src0, src1, dst, op)) {
        DEVICE_LOG_ERROR("Unsupported op: %s, is_supported_func failed\n", op_get_name(op));
        return false;
    }

    return true;
}

} // namespace hexagon
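For reference, each `_RowFunc` above reduces to a plain per-element loop once the HVX plumbing (aligned loads, valign shuffles, qf32 conversion) is stripped away; a scalar sketch of the add case, illustrative only and not part of the patch:

    // Scalar equivalent of one row of element_wise_op<float, float, vec_op_f32_f32<vadd_f32_f32>>:
    void vec_add_f32_f32_scalar(const float * src0, const float * src1, size_t count, float * dst) {
        for (size_t i = 0; i < count; ++i) {
            dst[i] = src0[i] + src1[i];  // the HVX path computes this in qf32 and converts back to IEEE float
        }
    }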
@@ -0,0 +1,17 @@
#pragma once

#include "hexagon_npu.h"
#include "tensor.hpp"

namespace hexagon {

typedef bool (*compute_func_type)(tensor * dst);
typedef bool (*op_is_supported_func_type)(const npu_device_tensor_spec & src0, const npu_device_tensor_spec & src1,
                                          const npu_device_tensor_spec & dst, npu_device_tensor_op op);

compute_func_type get_compute_func(npu_device_tensor_op op);

bool support_op(const npu_device_tensor_spec & src0, const npu_device_tensor_spec & src1,
                const npu_device_tensor_spec & dst, npu_device_tensor_op op);

} // namespace hexagon
@@ -0,0 +1,146 @@
#include "op_mul_mat.hpp"

#include <HTP/core/intrinsics.h>

namespace {

inline float vec_dot_product_f32_f32(const float * src0, const float * src1, size_t count) {
    HVX_Vector * iptr0     = ((HVX_Vector *) src0);
    HVX_Vector * iptr0_end = ((HVX_Vector *) src0) + (count / hexagon::kFloatsPerVector);
    HVX_Vector * iptr1     = ((HVX_Vector *) src1);
    HVX_Vector   prev0     = *iptr0++;
    HVX_Vector   prev1     = *iptr1++;
    HVX_Vector   sum       = Q6_V_vzero();

    // TODO: prefetch or just use VTCM?
    while (iptr0 < iptr0_end) {
        HVX_Vector curr0 = *iptr0++;
        HVX_Vector curr1 = *iptr1++;
        HVX_Vector s0    = Q6_V_valign_VVR(curr0, prev0, (size_t) src0);
        HVX_Vector s1    = Q6_V_valign_VVR(curr1, prev1, (size_t) src1);
        sum              = Q6_Vqf32_vadd_Vqf32Vqf32(Q6_Vqf32_vmpy_VsfVsf(s0, s1), sum);
        prev0            = curr0;
        prev1            = curr1;
    }

    if ((iptr0_end - ((HVX_Vector *) src0)) > 0) {
        // handle the last vector
        // see also:
        //   https://github.com/UbiquitousLearning/mllm/blob/babf4410352ce8730824c87699c025a0d4ce3a6f/src/backends/qnn/LLaMAOpPackageHtp/LLaMAPackage/src/ops/LLaMAMul.cpp#L147
        //   or qualcomm sdk libs\qhl_hvx\src\qhblas_hvx\qhblas_hvx_aw_vector_add_ah.c
        bool       iptr0_aligned = hexagon::is_addr_aligned(iptr0);
        HVX_Vector curr0         = iptr0_aligned ? prev0 : *iptr0;
        iptr0                    = iptr0_aligned ? iptr0 : iptr0 + 1;
        bool       iptr1_aligned = hexagon::is_addr_aligned(iptr1);
        HVX_Vector curr1         = iptr1_aligned ? prev1 : *iptr1;
        iptr1                    = iptr1_aligned ? iptr1 : iptr1 + 1;
        HVX_Vector s0            = Q6_V_valign_VVR(curr0, prev0, (size_t) src0);
        HVX_Vector s1            = Q6_V_valign_VVR(curr1, prev1, (size_t) src1);
        sum                      = Q6_Vqf32_vadd_Vqf32Vqf32(Q6_Vqf32_vmpy_VsfVsf(s0, s1), sum);
        prev0                    = curr0;
        prev1                    = curr1;
    }

    const size_t leftover       = count % hexagon::kFloatsPerVector;
    const size_t leftover_bytes = leftover * sizeof(float);
    if (leftover > 0) {
        // handle the leftover elements
        HVX_Vector curr0 =
            (leftover_bytes + hexagon::unaligned_bytes(iptr0) > hexagon::kBytesPerVector) ? *iptr0 : prev0;
        curr0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0);

        HVX_Vector curr1 =
            (leftover_bytes + hexagon::unaligned_bytes(iptr1) > hexagon::kBytesPerVector) ? *iptr1 : prev1;
        curr1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1);

        sum = Q6_Vqf32_vadd_Vqf32Vqf32(
            Q6_V_valign_VVR(Q6_Vqf32_vmpy_VsfVsf(curr0, curr1), Q6_V_vzero(), leftover_bytes), sum);
    }

    // TODO: do we have a better way to do the reduction?
    for (size_t i = hexagon::kFloatsPerVector / 2; i > 0; i /= 2) {
        sum = Q6_Vqf32_vadd_Vqf32Vqf32(sum, Q6_V_vror_VR(sum, i * sizeof(float)));
    }

    float result;
    q6op_vstu_variable_ARV(&result, sizeof(float), Q6_Vsf_equals_Vqf32(sum));
    return result;
}

} // namespace

namespace hexagon {

bool mul_mat_f32(hexagon::tensor * out) {
    if (!out) {
        return false;
    }

    auto * src0 = out->get_src(0);
    auto * src1 = out->get_src(1);
    if (!src0 || !src1) {
        return true;  // skip if no src
    }

    static_assert(DEVICE_TENSOR_MAX_DIMS == 4, "mul_mat_f32 requires max dims 4");

    const auto   r02      = src1->get_ne(2) / src0->get_ne(2);
    const auto   r03      = src1->get_ne(3) / src0->get_ne(3);
    const auto * src0_ptr = reinterpret_cast<const uint8_t *>(src0->get_data());
    const auto * src1_ptr = reinterpret_cast<const uint8_t *>(src1->get_data());
    auto *       dst_ptr  = reinterpret_cast<uint8_t *>(out->get_data());
    for (int64_t i3 = 0; i3 < out->get_ne(3); i3++) {
        const auto * src0_cube = src0_ptr + i3 / r03 * src0->get_nb(3);
        const auto * src1_cube = src1_ptr + i3 * src1->get_nb(3);
        auto *       dst_cube  = dst_ptr + i3 * out->get_nb(3);
        for (int64_t i2 = 0; i2 < out->get_ne(2); i2++) {
            const auto * src0_plane = src0_cube + i2 / r02 * src0->get_nb(2);
            const auto * src1_plane = src1_cube + i2 * src1->get_nb(2);
            auto *       dst_plane  = dst_cube + i2 * out->get_nb(2);
            for (int64_t i1 = 0; i1 < out->get_ne(1); i1++) {
                // TODO: prefetch row?
                auto * src1_row = src1_plane + i1 * src1->get_nb(1);
                auto * dst_row  = reinterpret_cast<float *>(dst_plane + i1 * out->get_nb(1));
                for (int64_t i0 = 0; i0 < out->get_ne(0); i0++) {
                    auto * src0_row = src0_plane + i0 * src0->get_nb(1);
                    // TODO: figure out how to handle an entire row
                    *dst_row++ =
                        vec_dot_product_f32_f32(reinterpret_cast<const float *>(src0_row),
                                                reinterpret_cast<const float *>(src1_row), (size_t) src0->get_ne(0));
                }
            }
        }
    }

    return true;
}

bool is_mul_mat_supported(const npu_device_tensor_spec & src0, const npu_device_tensor_spec & src1,
                          const npu_device_tensor_spec & dst, npu_device_tensor_op op) {
    if (op != NPU_OP_MUL_MAT) {
        DEVICE_LOG_DEBUG("op is not NPU_OP_MUL_MAT: %d\n", op);
        return false;
    }

    if (src0.ne[0] != src1.ne[0] || src0.ne[1] != dst.ne[0]) {
        DEVICE_LOG_DEBUG("src0 and src1 cannot multiply: %ldx%ld vs %ldx%ld\n", (long) src0.ne[0], (long) src0.ne[1],
                         (long) src1.ne[0], (long) src1.ne[1]);
        return false;
    }

    if (src1.ne[1] != dst.ne[1] || src1.ne[2] != dst.ne[2] || src1.ne[3] != dst.ne[3]) {
        DEVICE_LOG_DEBUG("src1 and dst dimensions not match: %ldx%ld vs %ldx%ld\n", (long) src1.ne[2],
                         (long) src1.ne[3], (long) dst.ne[2], (long) dst.ne[3]);
        return false;
    }

    if (src1.ne[2] % src0.ne[2] || src1.ne[3] % src0.ne[3]) {
        DEVICE_LOG_DEBUG("src0 cannot broadcast to src1: %ldx%ld vs %ldx%ld\n", (long) src0.ne[2], (long) src0.ne[3],
                         (long) src1.ne[2], (long) src1.ne[3]);
        return false;
    }

    return true;
}

} // namespace hexagon
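The closing vror loop in vec_dot_product_f32_f32 is the usual log2 tree reduction: each pass adds the accumulator to itself rotated by half the remaining span, so after log2(kFloatsPerVector) passes every lane holds the full sum and a single lane is stored out. A scalar sketch of what the whole helper computes (illustrative only, not part of the patch):

    float vec_dot_product_f32_f32_scalar(const float * src0, const float * src1, size_t count) {
        float sum = 0.0f;
        for (size_t i = 0; i < count; ++i) {
            sum += src0[i] * src1[i];  // the HVX path accumulates in qf32 and converts once at the end
        }
        return sum;
    }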
@@ -0,0 +1,27 @@
#pragma once

#include <hexagon_types.h>

#include <cstdint>

#include "tensor.hpp"

namespace hexagon {

constexpr const size_t kBytesPerVector  = sizeof(HVX_Vector);  // 128 for v73
constexpr const size_t kFloatsPerVector = kBytesPerVector / sizeof(float);
constexpr const size_t kAlignMask       = kBytesPerVector - 1;

inline size_t unaligned_bytes(const void * addr) {
    return ((size_t) addr) & kAlignMask;
}

inline bool is_addr_aligned(void * addr) {
    return unaligned_bytes(addr) == 0;
}

bool mul_mat_f32(tensor * out);
bool is_mul_mat_supported(const npu_device_tensor_spec & src0, const npu_device_tensor_spec & src1,
                          const npu_device_tensor_spec & dst, npu_device_tensor_op op);

} // namespace hexagon
@@ -0,0 +1,90 @@
#pragma once

#include <HAP_mem.h>
#include <qurt.h>

#include "hexagon_npu.h"
#include "util.hpp"

namespace hexagon {

constexpr const size_t kMaxTensorSrc = DEVICE_TENSOR_MAX_SRC;

class tensor {
  public:
    explicit tensor(const npu_device_tensor_config & info) noexcept : _info(info) {
        uint64 phy_address  = 0;
        void * mmap_address = nullptr;
        auto   ret          = HAP_mmap_get(_info.buffer_fd, &mmap_address, &phy_address);
        if (ret != AEE_SUCCESS) {
            DEVICE_LOG_ERROR("Failed to mmap tensor buffer: %d", (int) ret);
            return;
        }

        _data = static_cast<uint8_t *>(mmap_address);
        DEVICE_LOG_INFO("tensor(%p[%ldx%ldx%ldx%ld]), fd: %d, offset: %zu, mmap_address: %p, phy_address: 0x%lx\n",
                        (void *) this, (long) _info.ne[0], (long) _info.ne[1], (long) _info.ne[2], (long) _info.ne[3],
                        _info.buffer_fd, _info.offset, (void *) mmap_address, phy_address);
    }

    ~tensor() noexcept {
        auto ret = HAP_mmap_put(_info.buffer_fd);
        if (ret != AEE_SUCCESS) {
            DEVICE_LOG_ERROR("Failed to unmap tensor buffer: %d", (int) ret);
        }

        DEVICE_LOG_INFO("~tensor(%p) fd: %d", (void *) this, _info.buffer_fd);
    }

    void flush() {
        if (_data) {
            qurt_mem_cache_clean((qurt_addr_t) (_data + _info.offset), (qurt_size_t) _info.size,
                                 QURT_MEM_CACHE_INVALIDATE, QURT_MEM_DCACHE);
        }
    }

    bool set_src(size_t index, tensor * src) {
        if (index >= kMaxTensorSrc) {
            return false;
        }

        _src[index] = src;
        return true;
    }

    void set_op(npu_device_tensor_op op) { _info.op = op; }

    tensor * get_src(size_t index) const {
        if (index >= kMaxTensorSrc) {
            return nullptr;
        }

        return _src[index];
    }

    const npu_device_tensor_config & get_info() const { return _info; }

    int64_t get_ne(size_t index) const { return _info.ne[index]; }

    size_t get_nb(size_t index) const { return _info.nb[index]; }

    npu_device_tensor_op get_op() const { return _info.op; }

    npu_device_tensor_data_type get_type() const { return _info.type; }

    uint8_t * get_data() const { return _data + _info.offset; }

    bool is_valid() const { return _data != nullptr; }

  private:
    npu_device_tensor_config _info;
    tensor *                 _src[kMaxTensorSrc] = {};
    uint8_t *                _data               = nullptr;

    tensor(const tensor &) = delete;
    void operator=(const tensor &) = delete;
    tensor(tensor &&) = delete;
    void operator=(tensor &&) = delete;
};

} // namespace hexagon
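The host fills an npu_device_tensor_config and hands it over through npu_device_tensor_init; the constructor above then maps the backing rpcmem fd into the DSP address space. A rough sketch of that handshake, with field names taken from the accessors above (the exact layout lives in hexagon_npu.idl; illustrative only):

    // Hypothetical host-side setup, not part of the patch:
    npu_device_tensor_config info = {};
    // info.ne / info.nb       -> element counts and byte strides per dimension
    // info.buffer_fd          -> rpcmem fd of the backing host buffer (mapped via HAP_mmap_get above)
    // info.offset / info.size -> this tensor's window inside that buffer
    // info.type / info.op     -> data type and the op this tensor is the destination of
    npu_device_tensor_handle_t handle = 0;
    npu_device_tensor_init(device_handle, &info, &handle);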
@@ -0,0 +1,36 @@
#pragma once

#include <HAP_farf.h>

#include "hexagon_npu.h"

#define DEVICE_LOG_ERROR(...) FARF(FATAL, __VA_ARGS__)
#define DEVICE_LOG_WARN(...)  FARF(ERROR, __VA_ARGS__)
#define DEVICE_LOG_INFO(...)  FARF(HIGH, __VA_ARGS__)

#ifdef _DEBUG
#    undef FARF_LOW
#    define FARF_LOW 1
#    define DEVICE_LOG_DEBUG(...) FARF(LOW, __VA_ARGS__)
#else
#    define DEVICE_LOG_DEBUG(...) (void) 0
#endif

namespace hexagon {

constexpr const char * op_get_name(npu_device_tensor_op op) {
    switch (op) {
        case NPU_OP_MUL_MAT:
            return "MUL_MAT";
        case NPU_OP_ADD:
            return "ADD";
        case NPU_OP_SUB:
            return "SUB";
        case NPU_OP_MUL:
            return "MUL";
        default:
            return "UNKNOWN";
    }
}

} // namespace hexagon
@@ -0,0 +1,246 @@
#include "buffer.hpp"

#include <rpcmem.h>

#include "host_device.hpp"
#include "tensor.hpp"

namespace {

constexpr const int      kRpcMemDefaultHeapId = RPCMEM_HEAP_ID_SYSTEM;
constexpr const uint32_t kRpcMemDefaultFlags  = RPCMEM_DEFAULT_FLAGS;  // TODO: should we use a different flag?

static hexagon::host_buffer * get_buffer_object(ggml_backend_buffer_t buffer) {
    return reinterpret_cast<hexagon::host_buffer *>(buffer->context);
}

static hexagon::host_buffer_type * get_buffer_type_object(ggml_backend_buffer_type_t buft) {
    return reinterpret_cast<hexagon::host_buffer_type *>(buft->context);
}

void backend_buffer_free_buffer(ggml_backend_buffer_t buffer) {
    delete get_buffer_object(buffer);
}

void * backend_buffer_get_base(ggml_backend_buffer_t buffer) {
    auto * buffer_obj = get_buffer_object(buffer);
    GGML_ASSERT(buffer_obj != nullptr);
    return buffer_obj->get_buffer();
}

ggml_status backend_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
    auto * buffer_type_obj = get_buffer_type_object(buffer->buft);
    GGML_ASSERT(buffer_type_obj != nullptr);

    auto * device_object = buffer_type_obj->get_device();
    GGML_ASSERT(device_object != nullptr);

    auto * buffer_obj = get_buffer_object(buffer);
    GGML_ASSERT(buffer_obj != nullptr);

    auto tensor_object = buffer_obj->init_tensor(tensor, device_object->get_device_handle());
    if (!tensor_object) {
        LOG_ERROR("Failed to init tensor\n");
        return GGML_STATUS_ALLOC_FAILED;
    }

    return GGML_STATUS_SUCCESS;
}

void backend_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset,
                               size_t size) {
    GGML_UNUSED(buffer);
    memcpy((char *) tensor->data + offset, data, size);
}

void backend_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset,
                               size_t size) {
    GGML_UNUSED(buffer);
    memcpy(data, (const char *) tensor->data + offset, size);
}

bool backend_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) {
    GGML_UNUSED(buffer);
    if (ggml_backend_buffer_is_host(src->buffer)) {
        memcpy(dst->data, src->data, ggml_nbytes(src));
        return true;
    }

    return false;
}

void backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
    auto * buffer_obj = get_buffer_object(buffer);
    GGML_ASSERT(buffer_obj != nullptr);
    memset(buffer_obj->get_buffer(), value, buffer_obj->get_size());
}

constexpr const ggml_backend_buffer_i backend_buffer_interface = {
    /* .free_buffer   = */ backend_buffer_free_buffer,
    /* .get_base      = */ backend_buffer_get_base,
    /* .init_tensor   = */ backend_buffer_init_tensor,
    /* .memset_tensor = */ nullptr,
    /* .set_tensor    = */ backend_buffer_set_tensor,
    /* .get_tensor    = */ backend_buffer_get_tensor,
    /* .cpy_tensor    = */ backend_buffer_cpy_tensor,
    /* .clear         = */ backend_buffer_clear,
    /* .reset         = */ nullptr,
};

const char * backend_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
    auto * buffer_type_obj = get_buffer_type_object(buft);
    GGML_ASSERT(buffer_type_obj != nullptr);
    return buffer_type_obj->get_name();
}

ggml_backend_buffer_t backend_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
    auto * buffer_type_obj = get_buffer_type_object(buft);
    GGML_ASSERT(buffer_type_obj != nullptr);
    return buffer_type_obj->allocate_buffer(size);
}

size_t backend_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
    auto * buffer_type_obj = get_buffer_type_object(buft);
    GGML_ASSERT(buffer_type_obj != nullptr);
    return buffer_type_obj->get_buffer_alignment();
}

size_t backend_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) {
    auto * buffer_type_obj = get_buffer_type_object(buft);
    GGML_ASSERT(buffer_type_obj != nullptr);
    return buffer_type_obj->get_max_buffer_size();
}

bool backend_buffer_is_host(ggml_backend_buffer_type_t buft) {
    return buft->iface.get_name == backend_buffer_type_get_name;
}

} // namespace

namespace hexagon {

host_buffer::host_buffer(common::rpc_mem_ptr allocator, size_t size, uint32_t domain_id) :
    _allocator(allocator),
    _size(size),
    _domain_id(domain_id) {
    if (!_allocator->is_valid()) {
        LOG_ERROR("rpc memory not initialized\n");
        return;
    }

    if (size > _allocator->get_max_alloc_size()) {
        LOG_ERROR("rpc memory size %zu exceeds max alloc size %zu\n", size, _allocator->get_max_alloc_size());
        return;
    }

    _data = _allocator->alloc(kRpcMemDefaultHeapId, kRpcMemDefaultFlags, size);
    if (!_data) {
        LOG_ERROR("failed to allocate rpc memory, size: %d MB\n", (int) (size / (1 << 20)));
        return;
    }

    LOG_DEBUG("create host_buffer(%p), size: %zu, domain_id: %d\n", (void *) _data, size, (int) domain_id);
}

host_buffer::~host_buffer() {
    LOG_DEBUG("destroy host_buffer(%p), size: %zu, domain_id: %d\n", (void *) _data, _size, (int) _domain_id);
    _tensors.clear();
    if (_buffer_fd != -1) {
        auto ret = _allocator->fastrpc_munmap((int) _domain_id, _buffer_fd, nullptr, 0);
        if (ret != AEE_SUCCESS) {
            LOG_ERROR("failed to munmap rpc memory, fd: %d, ret: %d\n", _buffer_fd, ret);
            return;
        }
    }

    _allocator->free(_data);
}

std::shared_ptr<host_tensor> host_buffer::init_tensor(ggml_tensor * tensor, remote_handle64 device_handle) {
    if (!_data) {
        LOG_ERROR("failed to init tensor, rpc memory not initialized\n");
        return std::shared_ptr<host_tensor>();
    }

    if (_buffer_fd == -1) {
        _buffer_fd = _allocator->to_fd(_data);
        if (_buffer_fd < 0) {
            LOG_ERROR("failed to get fd from rpc memory\n");
            return std::shared_ptr<host_tensor>();
        }

        auto ret = _allocator->fastrpc_mmap((int) _domain_id, _buffer_fd, _data, 0, _size, FASTRPC_MAP_FD);
        if (ret != AEE_SUCCESS) {
            LOG_ERROR("failed to mmap rpc memory, fd: %d, ret: %d\n", _buffer_fd, ret);
            return std::shared_ptr<host_tensor>();
        }

        LOG_DEBUG("mmap rpc memory(%p), fd: %d, addr: %p, size: %zu\n", (void *) _data, _buffer_fd, _data, _size);
    }

    auto tensor_object = std::make_shared<host_tensor>(
        tensor, _buffer_fd, (uint64_t) (reinterpret_cast<uint8_t *>(tensor->data) - reinterpret_cast<uint8_t *>(_data)),
        device_handle);
    if (!tensor_object->is_valid()) {
        LOG_ERROR("failed to init tensor, device handle: %p\n", (void *) device_handle);
        return std::shared_ptr<host_tensor>();
    }

    _tensors.push_back(tensor_object);
    return tensor_object;
}

host_buffer_type::host_buffer_type(ggml_backend_dev_t dev, const std::string & name, common::rpc_mem_ptr rpc_mem) :
    _name(name),
    _rpc_mem(rpc_mem) {
    iface = {
        /* .get_name       = */ backend_buffer_type_get_name,
        /* .alloc_buffer   = */ backend_buffer_type_alloc_buffer,
        /* .get_alignment  = */ backend_buffer_type_get_alignment,
        /* .get_max_size   = */ backend_buffer_type_get_max_size,
        /* .get_alloc_size = */ nullptr,  // defaults to ggml_nbytes
        /* .is_host        = */ backend_buffer_is_host,
    };
    device  = dev;
    context = this;

    _device = reinterpret_cast<npu_device *>(device->context);
    LOG_DEBUG("[%s]create host_buffer_type %s\n", _device->get_name(), _name.c_str());
}

size_t host_buffer_type::get_buffer_alignment() const {
    return _device->is_device_initialized() ? _device->get_alignment() : 128;
}

size_t host_buffer_type::get_max_buffer_size() const {
    if (!_rpc_mem) {
        LOG_ERROR("rpc memory not initialized\n");
        return 0;
    }

    return _rpc_mem->get_max_alloc_size();
}

ggml_backend_buffer_t host_buffer_type::allocate_buffer(size_t size) {
    if (!_rpc_mem) {
        LOG_ERROR("rpc memory not initialized\n");
        return nullptr;
    }

    if (!_device->is_device_initialized()) {
        LOG_ERROR("device is not initialized\n");
        return nullptr;
    }

    auto * buffer = new host_buffer(_rpc_mem, size, _device->get_dsp_domain_id());
    if (!buffer->is_valid()) {
        delete buffer;
        LOG_ERROR("Failed to allocate buffer of size %zu\n", size);
        return nullptr;
    }

    LOG_DEBUG("[%s]allocate buffer %p, size: %zu\n", _device->get_name(), buffer->get_buffer(), size);
    return ggml_backend_buffer_init(this, backend_buffer_interface, buffer, size);
}

} // namespace hexagon
@@ -0,0 +1,66 @@
#pragma once

#include <list>
#include <memory>
#include <string>

#include "ggml-backend-impl.h"
#include "hexagon_npu.h"
#include "rpc-mem.hpp"

namespace hexagon {

class host_tensor;

class host_buffer {
  public:
    explicit host_buffer(common::rpc_mem_ptr allocator, size_t size, uint32_t domain_id);

    ~host_buffer();

    bool is_valid() const { return _data != nullptr; }

    void * get_buffer() { return _data; }

    size_t get_size() const { return _size; }

    std::shared_ptr<host_tensor> init_tensor(ggml_tensor * tensor, remote_handle64 device_handle);

  private:
    common::rpc_mem_ptr _allocator;
    void *              _data      = nullptr;
    size_t              _size      = 0;
    int                 _buffer_fd = -1;
    uint32_t            _domain_id = 0;

    std::list<std::shared_ptr<host_tensor>> _tensors;

    DISABLE_COPY(host_buffer);
    DISABLE_MOVE(host_buffer);
};

class npu_device;

class host_buffer_type : public ggml_backend_buffer_type {
  public:
    explicit host_buffer_type(ggml_backend_dev_t dev, const std::string & name, common::rpc_mem_ptr rpc_mem);

    const char * get_name() const { return _name.c_str(); }

    size_t get_buffer_alignment() const;

    size_t get_max_buffer_size() const;

    ggml_backend_buffer_t allocate_buffer(size_t size);

    npu_device * get_device() const { return _device; }

  private:
    npu_device *        _device = nullptr;
    std::string         _name;
    common::rpc_mem_ptr _rpc_mem;

    DISABLE_COPY(host_buffer_type);
    DISABLE_MOVE(host_buffer_type);
};

} // namespace hexagon
@@ -0,0 +1,82 @@
#include "graph.hpp"

#include "tensor.hpp"

namespace hexagon {

host_graph::host_graph(ggml_cgraph * cgraph, remote_handle64 device_handle) : _device_handle(device_handle) {
    auto status = npu_device_graph_init(_device_handle, &_graph_handle);
    if (status != AEE_SUCCESS) {
        LOG_ERROR("Failed to init graph: %d", (int) status);
        _graph_handle = 0;
        return;
    }

    update(cgraph);
}

host_graph::~host_graph() {
    if (_graph_handle) {
        npu_device_graph_free(_device_handle, _graph_handle);
        _graph_handle = 0;
    }
}

bool host_graph::update(ggml_cgraph * cgraph) {
    if (!_graph_handle) {
        LOG_ERROR("host_graph not initialized\n");
        return false;
    }

    _tensor_handles.clear();
    _tensor_handles.reserve(cgraph->n_nodes);
    for (int i = 0; i < cgraph->n_nodes; ++i) {
        auto * node = cgraph->nodes[i];
        if (node->op == GGML_OP_NONE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE) {
            // skip view-like ops
            LOG_DEBUG("node[%d]%s(%s), addr: %p, type: %s, skipped\n", i, ggml_get_name(node), ggml_op_desc(node),
                      (void *) node, ggml_type_name(node->type));
            continue;
        }

        auto * tensor_obj = host_tensor::from_ggml_tensor(node);
        if (!tensor_obj) {
            LOG_DEBUG("Unable to get host tensor from ggml tensor: %p\n", (void *) node);
            continue;
        }

        tensor_obj->set_op(node->op);
        _tensor_handles.push_back(tensor_obj->get_device_tensor_handle());
        LOG_DEBUG("node[%d]%s(%s), addr: %p, type: %s, tensor_handle: %p\n", i, ggml_get_name(node), ggml_op_desc(node),
                  (void *) node, ggml_type_name(node->type), (void *) tensor_obj->get_device_tensor_handle());
        for (size_t j = 0; j < GGML_MAX_SRC && node->src[j]; ++j) {
            auto * src = host_tensor::from_ggml_tensor(node->src[j]);
            tensor_obj->set_src(j, src);
        }
    }

    LOG_DEBUG("host_graph::update, host_graph(%p), ggml_cgraph(%p), tensor count(%zu)\n", (void *) this,
              (void *) cgraph, _tensor_handles.size());
    if (!_tensor_handles.empty()) {
        npu_device_graph_set_tensor(_device_handle, _graph_handle, _tensor_handles.data(),
                                    (int) _tensor_handles.size());
    }
    return true;
}

bool host_graph::compute() {
    if (!_graph_handle) {
        LOG_ERROR("host_graph not initialized\n");
        return false;
    }

    auto status = npu_device_graph_compute(_device_handle, _graph_handle);
    if (status != AEE_SUCCESS) {
        LOG_ERROR("Failed to compute host_graph: 0x%x\n", (int) status);
        return false;
    }

    return true;
}

} // namespace hexagon
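A minimal usage sketch for this class (assuming a backend graph_compute hook that already holds `cgraph` and `device_handle`; error handling elided, illustrative only):

    // Hypothetical caller, not part of the patch:
    hexagon::host_graph graph(cgraph, device_handle);  // npu_device_graph_init + update()
    bool ok = graph.is_valid() && graph.compute();     // runs npu_device_graph_compute on the DSP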
@@ -0,0 +1,32 @@
#pragma once

#include <vector>

#include "common.hpp"
#include "ggml-backend-impl.h"
#include "hexagon_npu.h"

namespace hexagon {

class host_graph {
  public:
    host_graph(ggml_cgraph * cgraph, remote_handle64 device_handle);

    ~host_graph();

    bool is_valid() const { return _graph_handle != 0; }

    bool update(ggml_cgraph * cgraph);

    bool compute();

  private:
    remote_handle64                         _device_handle = 0;
    npu_device_graph_handle_t               _graph_handle  = 0;
    std::vector<npu_device_tensor_handle_t> _tensor_handles;

    DISABLE_COPY(host_graph);
    DISABLE_MOVE(host_graph);
};

} // namespace hexagon
@@ -0,0 +1,153 @@
#include <memory>
#include <string>

#include "buffer.hpp"
#include "common.hpp"
#include "ggml-backend-impl.h"
#include "ggml-impl.h"
#include "host_device.hpp"

namespace {

hexagon::npu_device * get_device_object(ggml_backend_dev_t device) {
    return reinterpret_cast<hexagon::npu_device *>(device->context);
}

hexagon::npu_device * get_device_object(ggml_backend_t backend) {
    return get_device_object(backend->device);
}

const char * backend_dev_get_name(ggml_backend_dev_t dev) {
    auto * dev_obj = get_device_object(dev);
    GGML_ASSERT(dev_obj != nullptr);
    return dev_obj->get_name();
}

const char * backend_dev_get_description(ggml_backend_dev_t dev) {
    auto * dev_obj = get_device_object(dev);
    GGML_ASSERT(dev_obj != nullptr);
    return dev_obj->get_description();
}

bool backend_dev_is_npu_device(ggml_backend_dev_t dev) {
    return dev->iface.get_name == backend_dev_get_name;
}

void backend_dev_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
    GGML_UNUSED(dev);
    *free  = common::get_system_free_memory_in_bytes();
    *total = common::get_system_total_memory_in_bytes();
}

enum ggml_backend_dev_type backend_dev_get_type(ggml_backend_dev_t dev) {
    GGML_UNUSED(dev);
    return GGML_BACKEND_DEVICE_TYPE_ACCEL;
}

void backend_dev_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
    GGML_ASSERT(get_device_object(dev) != nullptr);
    props->name        = backend_dev_get_name(dev);
    props->description = backend_dev_get_description(dev);
    props->type        = backend_dev_get_type(dev);
    backend_dev_get_memory(dev, &props->memory_free, &props->memory_total);
    props->caps = {};
}

ggml_backend_t backend_dev_init_backend(ggml_backend_dev_t dev, const char * params) {
    auto * dev_obj = get_device_object(dev);
    GGML_ASSERT(dev_obj != nullptr);
    if (!dev_obj->init_device(dev, params)) {
        LOG_ERROR("[%s]Failed to init device\n", backend_dev_get_name(dev));
        return nullptr;
    }

    return new hexagon::npu_backend(dev);
}

ggml_backend_buffer_type_t backend_dev_get_buffer_type(ggml_backend_dev_t dev) {
    auto * dev_obj = get_device_object(dev);
    GGML_ASSERT(dev_obj != nullptr);
    return dev_obj->get_default_buffer_type(dev);
}

ggml_backend_buffer_t backend_dev_buffer_from_host_ptr(ggml_backend_dev_t dev, void * ptr, size_t size,
                                                       size_t max_tensor_size) {
    // TODO: should we use the device memory here?
    GGML_UNUSED(dev);
    GGML_UNUSED(max_tensor_size);
    return ggml_backend_cpu_buffer_from_ptr(ptr, size);
}

bool backend_dev_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
    if (!backend_dev_is_npu_device(dev)) {
        return false;
    }

    auto * dev_obj = get_device_object(dev);
    GGML_ASSERT(dev_obj != nullptr);
    return dev_obj->supports_op(op);
}

bool backend_dev_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
    if (!backend_dev_is_npu_device(dev)) {
        return false;
    }

    auto * dev_obj = get_device_object(dev);
    GGML_ASSERT(dev_obj != nullptr);
    return dev_obj->supports_buft(buft);
}

bool backend_dev_offload_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
    if (!backend_dev_is_npu_device(dev)) {
        return false;
    }

    auto * dev_obj = get_device_object(dev);
    GGML_ASSERT(dev_obj != nullptr);
    return dev_obj->offload_op(op);
}

constexpr const ggml_backend_device_i npu_device_interface = {
    /* .get_name             = */ backend_dev_get_name,
    /* .get_description      = */ backend_dev_get_description,
    /* .get_memory           = */ backend_dev_get_memory,
    /* .get_type             = */ backend_dev_get_type,
    /* .get_props            = */ backend_dev_get_props,
    /* .init_backend         = */ backend_dev_init_backend,
    /* .get_buffer_type      = */ backend_dev_get_buffer_type,
    /* .get_host_buffer_type = */ nullptr,
    /* .buffer_from_host_ptr = */ backend_dev_buffer_from_host_ptr,
    /* .supports_op          = */ backend_dev_supports_op,
    /* .supports_buft        = */ backend_dev_supports_buft,
    /* .offload_op           = */ backend_dev_offload_op,
    /* .event_new            = */ nullptr,
    /* .event_free           = */ nullptr,
    /* .event_synchronize    = */ nullptr,
};

class npu_device_proxy : public backend_device_proxy {
  public:
    explicit npu_device_proxy(backend_index_type device) { _device = std::make_unique<hexagon::npu_device>(device); }

    const ggml_backend_device_i & get_iface() const { return npu_device_interface; }

    void * get_context() { return _device.get(); }

  private:
    std::unique_ptr<hexagon::npu_device> _device;

    DISABLE_COPY(npu_device_proxy);
    DISABLE_MOVE(npu_device_proxy);
};

} // namespace

backend_device_proxy_ptr create_hexagon_backend_context(backend_index_type device) {
    if (device < QNN_BACKEND_COUNT || device >= TOTAL_BACKEND_COUNT) {
        return backend_device_proxy_ptr();
    }

    return std::make_shared<npu_device_proxy>(device);
}

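// Editor's note (a sketch; the exact enum layout lives in common.hpp, which is
// not part of this hunk): the hexagon device indices are assumed to sit in the
// [QNN_BACKEND_COUNT, TOTAL_BACKEND_COUNT) range, so the guard above rejects
// plain QNN indices, e.g.:
//
//   auto proxy = create_hexagon_backend_context(QNN_BACKEND_COUNT);  // first hexagon slot -> valid
//   auto none  = create_hexagon_backend_context(QNN_BACKEND_NPU);    // a QNN slot -> empty ptr
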
@@ -0,0 +1,305 @@
#include "host_device.hpp"

#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wmissing-prototypes"
#include <domain_default.h>
#pragma GCC diagnostic pop

#include <remote.h>

#include "graph.hpp"
#include "util.hpp"

#define SKEL_URI_DEFINE(arch) ("file:///libhexagon_npu_skel_" arch ".so?npu_device_skel_handle_invoke&_modver=1.0")

namespace {

struct device_library_info {
    hexagon::hexagon_dsp_arch arch;
    const char *              device_lib_uri;
};

constexpr const device_library_info kDeviceLibraryInfo[] = {
    { hexagon::NONE, SKEL_URI_DEFINE("")    },
    { hexagon::V68,  SKEL_URI_DEFINE("v68") },
    { hexagon::V69,  SKEL_URI_DEFINE("v69") },
    { hexagon::V73,  SKEL_URI_DEFINE("v73") },
    { hexagon::V75,  SKEL_URI_DEFINE("v75") },
    { hexagon::V79,  SKEL_URI_DEFINE("v79") },
};

const device_library_info & get_device_library_info(hexagon::hexagon_dsp_arch arch) {
    for (const auto & info : kDeviceLibraryInfo) {
        if (info.arch == arch) {
            return info;
        }
    }

    LOG_ERROR("Unknown DSP arch: %d, using hexagon::NONE\n", arch);
    return kDeviceLibraryInfo[0];
}

const char * get_domain_param(uint32_t domain_id) {
    for (const auto & domain : supported_domains) {
        if ((uint32_t) domain.id == domain_id) {
            return domain.uri;
        }
    }

    return "";
}

constexpr const ggml_guid kBackendNpuGuid = { 0x7a, 0xd7, 0x59, 0x7d, 0x8f, 0x66, 0x4f, 0x35,
                                              0x84, 0x8e, 0xf5, 0x9a, 0x9b, 0x83, 0x7d, 0x0a };

hexagon::npu_backend * get_backend_object(ggml_backend_t backend) {
    return reinterpret_cast<hexagon::npu_backend *>(backend);
}

const char * backend_get_name(ggml_backend_t backend) {
    auto * backend_obj = get_backend_object(backend);
    GGML_ASSERT(backend_obj != nullptr);
    return backend_obj->get_name();
}

void backend_free(ggml_backend_t backend) {
    delete get_backend_object(backend);
}

bool backend_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const ggml_tensor * src,
                              ggml_tensor * dst) {
    // TODO: implement this
    return false;
}

ggml_status backend_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
    auto * backend_obj = get_backend_object(backend);
    GGML_ASSERT(backend_obj != nullptr);
    return backend_obj->graph_compute(cgraph);
}

} // namespace

namespace hexagon {

// TODO: should we use another domain?
npu_device::npu_device(backend_index_type device) : _dsp_domain_id(CDSP_DOMAIN_ID) {
    GGML_UNUSED(device);
    LOG_DEBUG("[%s]NPU device created\n", _name.c_str());
}

npu_device::~npu_device() {
    if (_device_handle) {
        npu_device_close(_device_handle);
    }
}

size_t npu_device::get_alignment() const {
    uint32_t alignment = 0;
    npu_device_device_get_alignment(_device_handle, &alignment);
    return alignment;
}

bool npu_device::is_device_initialized() const {
    if (!_device_handle) {
        LOG_ERROR("[%s]NPU device not opened\n", get_name());
        return false;
    }

    if (!_rpc_mem) {
        LOG_ERROR("[%s]rpc memory not initialized\n", get_name());
        return false;
    }

    return true;
}

bool npu_device::init_device(ggml_backend_dev_t dev, const char * params) {
    if (!init_rpc_mem()) {
        return false;
    }

    if (!_device_handle) {
        auto         arch            = get_dsp_arch(_rpc_interface, _dsp_domain_id);
        const auto & device_lib_info = get_device_library_info(arch);
        std::string  device_lib_uri  = device_lib_info.device_lib_uri;
        device_lib_uri += get_domain_param(_dsp_domain_id);
        LOG_DEBUG("[%s]NPU device arch: %s, uri: %s\n", get_name(), get_dsp_arch_desc(arch), device_lib_uri.c_str());
        auto err = npu_device_open(device_lib_uri.c_str(), &_device_handle);
        if (err != AEE_SUCCESS) {
            if (err == AEE_ECONNREFUSED) {
                LOG_DEBUG("[%s]NPU device is not available, trying to enable unsigned DSP module and reopen\n",
                          get_name());
                enable_unsigned_dsp_module(_rpc_interface, _dsp_domain_id);
                err = npu_device_open(device_lib_uri.c_str(), &_device_handle);
            }

            if (err != AEE_SUCCESS) {
                LOG_ERROR("[%s]Unable to open NPU device, err: 0x%x, uri %s\n", get_name(), err,
                          device_lib_uri.c_str());
                _device_handle = 0;
                return false;
            }
        }

        _description += ' ';
        _description += get_dsp_arch_desc(arch);
        LOG_DEBUG("[%s]NPU device opened successfully\n", get_name());
    } else {
        LOG_DEBUG("[%s]NPU device is already opened\n", get_name());
    }

    return true;
}

bool npu_device::supports_buft(ggml_backend_buffer_type_t buft) const {
    return buft && buft->device && buft->device->context == this;
}

bool npu_device::supports_op_impl(const ggml_tensor * op) {
    if (op->op == GGML_OP_NONE) {
        return true;
    }

    if (type_to_npu_type(op->type) == NPU_DATA_TYPE_COUNT) {
        LOG_DEBUG("[%s]Unsupported op tensor type: %s\n", get_name(), ggml_type_name(op->type));
        return false;
    }

    auto * src0 = op->src[0];
    if (!src0) {
        LOG_DEBUG("[%s]Unsupported inplace op: %s\n", get_name(), ggml_op_name(op->op));
        return false;
    }

    if (type_to_npu_type(src0->type) == NPU_DATA_TYPE_COUNT) {
        LOG_DEBUG("[%s]Unsupported src0 tensor type: %s\n", get_name(), ggml_type_name(src0->type));
        return false;
    }

    auto * src1 = op->src[1];
    if (src1 && type_to_npu_type(src1->type) == NPU_DATA_TYPE_COUNT) {
        LOG_DEBUG("[%s]Unsupported src1 tensor type: %s\n", get_name(), ggml_type_name(src1->type));
        return false;
    }

    auto npu_op = op_to_npu_op(op->op);
    if (npu_op == NPU_OP_COUNT) {
        LOG_DEBUG("[%s]Unsupported op: %s\n", get_name(), ggml_op_name(op->op));
        return false;
    }

    constexpr const auto get_spec = [](const ggml_tensor * tensor) -> npu_device_tensor_spec {
        if (!tensor) {
            return npu_device_tensor_spec{};
        }

        static_assert(DEVICE_TENSOR_MAX_DIMS == GGML_MAX_DIMS, "tensor dimensions mismatch");
        npu_device_tensor_spec spec{};
        spec.ne[0] = tensor->ne[0];
        spec.ne[1] = tensor->ne[1];
        spec.ne[2] = tensor->ne[2];
        spec.ne[3] = tensor->ne[3];
        spec.type  = type_to_npu_type(tensor->type);
        return spec;
    };

    boolean supported = false;
    auto    src0_spec = get_spec(src0);
    auto    src1_spec = get_spec(src1);
    auto    dst_spec  = get_spec(op);
    auto    ret = npu_device_device_support_op(_device_handle, &src0_spec, &src1_spec, &dst_spec, npu_op, &supported);
    if (ret != AEE_SUCCESS || !supported) {
        LOG_DEBUG("[%s]Unsupported op: %s, ret: 0x%x, supported: %d\n", get_name(), ggml_op_name(op->op), ret,
                  supported);
        return false;
    }

    LOG_DEBUG("[%s]Supported op: %s\n", get_name(), ggml_op_name(op->op));
    return true;
}

bool npu_device::init_rpc_mem() {
    if (!_rpc_mem) {
        auto rpc_interface = std::make_shared<common::rpc_interface>();
        if (!rpc_interface->is_valid()) {
            LOG_ERROR("[%s]Failed to load rpc memory library\n", get_name());
            return false;
        }

        auto rpc_mem   = std::make_shared<common::rpc_mem>(rpc_interface);
        _rpc_interface = rpc_interface;
        _rpc_mem       = rpc_mem;
        LOG_DEBUG("[%s]rpc memory initialized\n", get_name());
    } else {
        LOG_DEBUG("[%s]rpc memory already initialized\n", get_name());
    }

    return true;
}

bool npu_device::offload_op(const ggml_tensor * op) {
    // TODO: implement this
    return false;
}

ggml_backend_buffer_type_t npu_device::get_default_buffer_type(ggml_backend_dev_t dev) {
    // Note that this function will be called before npu_device::init_device
    if (!init_rpc_mem()) {
        return nullptr;
    }

    if (!_default_buffer_type) {
        LOG_DEBUG("[%s]Creating default buffer type\n", get_name());
        _default_buffer_type = std::make_unique<hexagon::host_buffer_type>(dev, _name + "_buffer_type", _rpc_mem);
        if (!_default_buffer_type) {
            LOG_ERROR("[%s]Default buffer type not initialized\n", get_name());
            return nullptr;
        }
    } else {
        LOG_DEBUG("[%s]Default buffer type already created\n", get_name());
    }

    return _default_buffer_type.get();
}

npu_backend::npu_backend(ggml_backend_dev_t dev) : ggml_backend{} {
    // plain memcpy here: the guid is a fixed 16-byte blob, memccpy was a bug
    memcpy(&_guid, &kBackendNpuGuid, sizeof(ggml_guid));
    device                 = dev;
    guid                   = &_guid;
    iface.get_name         = backend_get_name;
    iface.free             = backend_free;
    iface.cpy_tensor_async = backend_cpy_tensor_async;
    iface.graph_compute    = backend_graph_compute;
    _device                = reinterpret_cast<npu_device *>(dev->context);
}

ggml_status npu_backend::graph_compute(ggml_cgraph * cgraph) {
    if (!cgraph || !cgraph->n_nodes) {
        LOG_DEBUG("[%s]Graph is empty, nothing to compute\n", get_name());
        return GGML_STATUS_SUCCESS;
    }

    std::shared_ptr<host_graph> graph;
    if (_graph_cache.count(cgraph) == 0) {
        LOG_DEBUG("[%s]graph(%p) not found in cache, creating new graph\n", get_name(), (void *) cgraph);
        graph = std::make_shared<host_graph>(cgraph, _device->get_device_handle());
        if (!graph->is_valid()) {
            LOG_ERROR("Failed to create graph\n");
            return GGML_STATUS_FAILED;
        }

        _graph_cache[cgraph] = graph;
    } else {
        graph = _graph_cache[cgraph];
        LOG_DEBUG("[%s]graph(%p) found in cache, using existing graph\n", get_name(), (void *) cgraph);
        if (!graph->update(cgraph)) {
            LOG_ERROR("[%s]Failed to update graph(%p)\n", get_name(), (void *) cgraph);
            return GGML_STATUS_FAILED;
        }
    }

    return graph->compute() ? GGML_STATUS_SUCCESS : GGML_STATUS_FAILED;
}

} // namespace hexagon

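// Editor's sketch of the open sequence (URI shape inferred from
// SKEL_URI_DEFINE and get_domain_param above): for a v73 cDSP the composed
// URI looks roughly like
//
//   file:///libhexagon_npu_skel_v73.so?npu_device_skel_handle_invoke&_modver=1.0&_dom=cdsp
//
// On a device without a signing setup the first npu_device_open() may fail
// with AEE_ECONNREFUSED, which is why init_device() enables the unsigned DSP
// module and retries once.
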
@@ -0,0 +1,107 @@
#pragma once

#include <memory>
#include <unordered_map>
#ifndef NDEBUG
#    include <atomic>
#endif

#include "buffer.hpp"
#include "common.hpp"
#include "ggml-backend-impl.h"
#include "hexagon_npu.h"
#include "rpc-mem.hpp"

namespace hexagon {

class npu_device {
  public:
    explicit npu_device(backend_index_type device);

    ~npu_device();

    const char * get_name() const { return _name.c_str(); }

    const char * get_description() const { return _description.c_str(); }

    size_t get_alignment() const;

    uint32_t get_dsp_domain_id() const { return _dsp_domain_id; }

    ggml_backend_buffer_type_t get_default_buffer_type(ggml_backend_dev_t dev);

    bool is_device_initialized() const;
    bool init_device(ggml_backend_dev_t dev, const char * params);

    bool supports_buft(ggml_backend_buffer_type_t buft) const;
    bool offload_op(const ggml_tensor * op);

#ifndef NDEBUG
    bool supports_op(const ggml_tensor * op) {
        if (supports_op_impl(op)) {
            if (op->op != GGML_OP_NONE) {
                _supported_op++;
                LOG_DEBUG("[%s]Supported op: %s, supported/unsupported: %u/%u\n", get_name(), ggml_op_name(op->op),
                          _supported_op.load(), _unsupported_op.load());
            }

            return true;
        }

        _unsupported_op++;
        LOG_DEBUG("[%s]Unsupported op: %s, supported/unsupported: %u/%u\n", get_name(), ggml_op_name(op->op),
                  _supported_op.load(), _unsupported_op.load());
        return false;
    }
#else
    bool supports_op(const ggml_tensor * op) { return supports_op_impl(op); }
#endif

    remote_handle64 get_device_handle() const { return _device_handle; }

  private:
    bool supports_op_impl(const ggml_tensor * op);
    bool init_rpc_mem();

    std::string                       _name        = "hexagon-npu";
    std::string                       _description = "Hexagon NPU";
    common::rpc_interface_ptr         _rpc_interface;
    common::rpc_mem_ptr               _rpc_mem;
    remote_handle64                   _device_handle = 0;
    std::unique_ptr<host_buffer_type> _default_buffer_type;
    uint32_t                          _dsp_domain_id = 0;

#ifndef NDEBUG
    std::atomic_uint32_t _supported_op   = 0;
    std::atomic_uint32_t _unsupported_op = 0;
#endif

    DISABLE_COPY(npu_device);
    DISABLE_MOVE(npu_device);
};

class host_graph;

class npu_backend : public ggml_backend {
  public:
    explicit npu_backend(ggml_backend_dev_t dev);

    ~npu_backend() {}

    const char * get_name() const {
        // TODO: should we use the device name here?
        return _device->get_name();
    }

    ggml_status graph_compute(ggml_cgraph * cgraph);

  private:
    ggml_guid    _guid   = {};
    npu_device * _device = nullptr;

    std::unordered_map<ggml_cgraph *, std::shared_ptr<host_graph>> _graph_cache;

    DISABLE_COPY(npu_backend);
    DISABLE_MOVE(npu_backend);
};

} // namespace hexagon

@@ -0,0 +1,88 @@
#pragma once

#include <cstring>

#include "common.hpp"
#include "ggml-impl.h"
#include "hexagon_npu.h"
#include "util.hpp"

namespace hexagon {

// TODO: merge this with device tensor?
class host_tensor {
  public:
    static host_tensor * from_ggml_tensor(ggml_tensor * tensor) {
        if (!tensor || !tensor->extra) {
            return nullptr;
        }
        return static_cast<host_tensor *>(tensor->extra);
    }

    explicit host_tensor(ggml_tensor * tensor, int buffer_fd, uint64_t offset, remote_handle64 device_handle) :
        _device_handle(device_handle) {
        _info.buffer_fd = buffer_fd;
        _info.offset    = offset;
        _info.type      = type_to_npu_type(tensor->type);
        _info.op        = op_to_npu_op(tensor->op);
        _info.size      = ggml_nbytes(tensor);

        static_assert(DEVICE_TENSOR_MAX_DIMS == GGML_MAX_DIMS, "tensor dimensions mismatch");
        static_assert(sizeof(_info.ne) == sizeof(tensor->ne), "tensor ne size mismatch");
        static_assert(sizeof(_info.nb) == sizeof(tensor->nb), "tensor nb size mismatch");
        memcpy(_info.ne, tensor->ne, sizeof(_info.ne));
        memcpy(_info.nb, tensor->nb, sizeof(_info.nb));

        auto status = npu_device_tensor_init(_device_handle, &_info, &_device_tensor_handle);
        if (status != AEE_SUCCESS) {
            LOG_ERROR("Failed to init tensor: %d", (int) status);
            _device_tensor_handle = 0;
            return;
        }

        tensor->extra = this;
        _ggml_tensor  = tensor;
        LOG_DEBUG(
            "host_tensor(%p) created, ggml_tensor(%p[%ldx%ldx%ldx%ld], nb[%ld][%ld][%ld][%ld]), "
            "device_tensor_handle(%p)\n",
            (void *) this, (void *) tensor, (long) tensor->ne[0], (long) tensor->ne[1], (long) tensor->ne[2],
            (long) tensor->ne[3], (long) tensor->nb[0], (long) tensor->nb[1], (long) tensor->nb[2],
            (long) tensor->nb[3], (void *) _device_tensor_handle);
    }

    ~host_tensor() {
        LOG_DEBUG("host_tensor(%p) destroy, device_tensor_handle: %p\n", (void *) this,
                  (void *) _device_tensor_handle);
        if (_device_tensor_handle) {
            npu_device_tensor_free(_device_handle, _device_tensor_handle);
            _ggml_tensor->extra = nullptr;
        }
    }

    npu_device_tensor_handle_t get_device_tensor_handle() const { return _device_tensor_handle; }

    void set_src(size_t index, host_tensor * src) {
        if (index >= DEVICE_TENSOR_MAX_SRC) {
            LOG_ERROR("host_tensor(%p) set_src[%zu] out of range\n", (void *) this, index);
            return;
        }

        if (!src) {
            // a source without an attached host_tensor cannot be forwarded to the device
            LOG_ERROR("host_tensor(%p) set_src[%zu] is null\n", (void *) this, index);
            return;
        }

        LOG_DEBUG("host_tensor(%p) set_src[%zu]: %p\n", (void *) this, index, (void *) src);
        npu_device_tensor_set_src(_device_handle, _device_tensor_handle, index, src->get_device_tensor_handle());
    }

    void set_op(ggml_op op) {
        _info.op = op_to_npu_op(op);
        npu_device_tensor_set_op(_device_handle, _device_tensor_handle, _info.op);
    }

    bool is_valid() const { return _device_tensor_handle != 0; }

  private:
    remote_handle64            _device_handle        = 0;
    npu_device_tensor_handle_t _device_tensor_handle = 0;
    npu_device_tensor_config   _info                 = {};
    ggml_tensor *              _ggml_tensor          = nullptr;

    DISABLE_COPY(host_tensor);
    DISABLE_MOVE(host_tensor);
};

} // namespace hexagon

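// Editor's sketch of the intended lifecycle (buffer_fd/offset are illustrative
// values provided by the owning rpc buffer): the buffer code creates the
// host_tensor, and graph construction finds it again through tensor->extra:
//
//   auto * t = new hexagon::host_tensor(tensor, buffer_fd, offset, device_handle);
//   if (t->is_valid()) {
//       GGML_ASSERT(hexagon::host_tensor::from_ggml_tensor(tensor) == t);
//   }
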
@@ -0,0 +1,96 @@
#include "util.hpp"

#include <remote.h>

namespace hexagon {

enum npu_device_tensor_op op_to_npu_op(ggml_op op) {
    switch (op) {
        case GGML_OP_MUL_MAT:
            return NPU_OP_MUL_MAT;
        case GGML_OP_ADD:
            return NPU_OP_ADD;
        case GGML_OP_SUB:
            return NPU_OP_SUB;
        case GGML_OP_MUL:
            return NPU_OP_MUL;
        default:
            return NPU_OP_COUNT;
    }
}

enum npu_device_tensor_data_type type_to_npu_type(ggml_type type) {
    switch (type) {
        case GGML_TYPE_F32:
            return NPU_DATA_TYPE_F32;
        default:
            return NPU_DATA_TYPE_COUNT;
    }
}

hexagon_dsp_arch get_dsp_arch(common::rpc_interface_ptr rpc_interface, uint32_t domain_id) {
    if (!rpc_interface || !rpc_interface->is_valid()) {
        return NONE;
    }

    remote_dsp_capability dsp_caps = {};
    dsp_caps.domain                = domain_id;
    dsp_caps.attribute_ID          = ARCH_VER;
    auto ret = rpc_interface->remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_caps, sizeof(dsp_caps));
    if (ret != AEE_SUCCESS) {
        LOG_ERROR("failed to get DSP arch: %d\n", ret);
        return NONE;
    }

    LOG_DEBUG("get DSP arch: 0x%x\n", (int) dsp_caps.capability);
    auto arch = dsp_caps.capability & 0xFF;
    switch (arch) {
        case 0x68:
            return V68;
        case 0x69:
            return V69;
        case 0x73:
            return V73;
        case 0x75:
            return V75;
        case 0x79:
            return V79;
        default:
            LOG_ERROR("unknown DSP arch: %x\n", arch);
            return NONE;
    }
}

const char * get_dsp_arch_desc(hexagon_dsp_arch arch) {
    switch (arch) {
        case V68:
            return "V68";
        case V69:
            return "V69";
        case V73:
            return "V73";
        case V75:
            return "V75";
        case V79:
            return "V79";
        case NONE:
        default:
            return "UnknownArch";
    }
}

void enable_unsigned_dsp_module(common::rpc_interface_ptr rpc_interface, uint32_t domain_id) {
    if (!rpc_interface || !rpc_interface->is_valid()) {
        return;
    }

    remote_rpc_control_unsigned_module data = {};
    data.domain                             = domain_id;
    data.enable                             = 1;
    auto ret = rpc_interface->remote_session_control(DSPRPC_CONTROL_UNSIGNED_MODULE, &data, sizeof(data));
    if (ret != AEE_SUCCESS) {
        LOG_ERROR("failed to enable unsigned DSP module: 0x%x\n", ret);
    }
}

} // namespace hexagon

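// Editor's note (a sketch of the decoding above): ARCH_VER packs the DSP
// architecture into the low byte of the returned capability, e.g.
//
//   (0x00000173 & 0xFF) == 0x73  ->  hexagon::V73
//
// and anything unrecognized falls back to NONE, which maps to the arch-less
// skel library name in kDeviceLibraryInfo.
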
@@ -0,0 +1,26 @@
#pragma once

#include "ggml-impl.h"
#include "hexagon_npu.h"
#include "rpc-interface.hpp"

namespace hexagon {

enum npu_device_tensor_op op_to_npu_op(ggml_op op);
enum npu_device_tensor_data_type type_to_npu_type(ggml_type type);

// TODO: merge with qcom_htp_arch
enum hexagon_dsp_arch {
    NONE = 0,
    V68,
    V69,
    V73,
    V75,
    V79,  // SD 8 Gen 4 (SM8750)
};

hexagon_dsp_arch get_dsp_arch(common::rpc_interface_ptr rpc_interface, uint32_t domain_id);

const char * get_dsp_arch_desc(hexagon_dsp_arch arch);

void enable_unsigned_dsp_module(common::rpc_interface_ptr rpc_interface, uint32_t domain_id);

} // namespace hexagon

@@ -0,0 +1,90 @@
#include "AEEStdDef.idl"
#include "AEEStdErr.idl"
#include "remote.idl"

const uint32_t DEVICE_TENSOR_MAX_DIMS = 4;
const uint32_t DEVICE_TENSOR_MAX_SRC  = 2;

interface npu_device : remote_handle64 {

    typedef int64_t  ne_type[DEVICE_TENSOR_MAX_DIMS];
    typedef uint64_t tensor_handle_t;
    typedef uint64_t graph_handle_t;

    enum tensor_op {
        NPU_OP_MUL_MAT,
        NPU_OP_ADD,
        NPU_OP_SUB,
        NPU_OP_MUL,
        NPU_OP_COUNT
    };

    enum tensor_data_type {
        NPU_DATA_TYPE_F32,
        NPU_DATA_TYPE_COUNT
    };

    struct tensor_spec {
        ne_type          ne;
        tensor_data_type type;
    };

    struct tensor_config {
        ne_type          ne;
        uint64_t         nb[DEVICE_TENSOR_MAX_DIMS];
        long             buffer_fd;
        uint64_t         offset;
        uint64_t         size;
        tensor_data_type type;
        tensor_op        op;
    };

    AEEResult device_get_alignment(
        rout uint32_t alignment
    );

    AEEResult device_support_op(
        in tensor_spec src0,
        in tensor_spec src1,
        in tensor_spec dst,
        in tensor_op op,
        rout boolean is_supported
    );

    AEEResult tensor_init(
        in tensor_config info,
        rout tensor_handle_t tensor_handle
    );

    AEEResult tensor_set_src(
        in tensor_handle_t tensor_handle,
        in uint64_t index,
        in tensor_handle_t src
    );

    AEEResult tensor_set_op(
        in tensor_handle_t tensor_handle,
        in tensor_op op
    );

    AEEResult tensor_free(
        in tensor_handle_t tensor_handle
    );

    AEEResult graph_init(
        rout graph_handle_t graph_handle
    );

    AEEResult graph_set_tensor(
        in graph_handle_t graph_handle,
        in sequence<tensor_handle_t> tensor_handles
    );

    AEEResult graph_compute(
        in graph_handle_t graph_handle
    );

    AEEResult graph_free(
        in graph_handle_t graph_handle
    );
};

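// Editor's note (a sketch of the qaic-generated interface, inferred from the
// host-side calls above): each IDL method becomes a C function prefixed with
// the interface name, taking the remote_handle64 first, e.g.
//
//   AEEResult npu_device_graph_init(remote_handle64 h, npu_device_graph_handle_t * graph_handle);
//   AEEResult npu_device_graph_compute(remote_handle64 h, npu_device_graph_handle_t graph_handle);
//
// which matches the npu_device_* symbols used by host_graph and npu_device.
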
@@ -1,61 +0,0 @@
#pragma once

#include "QnnCommon.h"
#include "QnnInterface.h"
#include "QnnTypes.h"
#include "Saver/QnnSaver.h"
#include "System/QnnSystemInterface.h"

namespace qnn {

enum qcom_htp_arch {
    NONE = 0,
    V68  = 68,
    V69  = 69,
    V73  = 73,
    V75  = 75,
    V79  = 79,  // SD 8 Gen 4 (SM8750)
};

enum qcom_chipset {
    UNKNOWN_SM = 0,
    SM8350   = 30,  // v68, SD 888/888+
    SM8450   = 36,  // v69, SD 8 Gen 1
    SA8295   = 39,  // v68
    SM8475   = 42,  // v69, SD 8+ Gen 1
    SM8550   = 43,  // v73, SD 8 Gen 2
    SSG2115P = 46,  // v73
    SM7675   = 70,  // v73, SD 7+ Gen 3
    SM8635   = 68,  // v73, SD 8s Gen 3
    SM8650   = 57,  // v75, SD 8 Gen 3
    SM8750   = 69,  // v79, SD 8 Gen 4
};

struct qcom_socinfo {
    uint32_t soc_model;
    size_t   htp_arch;
    size_t   vtcm_size_in_mb;
};

using pfn_rpc_mem_init   = void (*)(void);
using pfn_rpc_mem_deinit = void (*)(void);
using pfn_rpc_mem_alloc  = void * (*)(int, uint32_t, int);
using pfn_rpc_mem_free   = void (*)(void *);
using pfn_rpc_mem_to_fd  = int (*)(void *);

using pfn_qnnsaver_initialize             = decltype(QnnSaver_initialize);
using pfn_qnninterface_getproviders       = decltype(QnnInterface_getProviders);
using pfn_qnnsysteminterface_getproviders = decltype(QnnSystemInterface_getProviders);
} // namespace qnn

#define RPCMEM_DEFAULT_FLAGS 1
#define RPCMEM_HEAP_ID_SYSTEM 25

#define DISABLE_COPY(class_name)              \
    class_name(const class_name &) = delete;  \
    void operator=(const class_name &) = delete

#define DISABLE_MOVE(class_name)         \
    class_name(class_name &&) = delete;  \
    void operator=(class_name &&) = delete

@@ -12,7 +12,7 @@

namespace {

-qnn::qnn_graph * get_qnn_graph_from_cache(ggml_backend_qnn_device_context * ctx, const ggml_cgraph * cgraph) {
+qnn::qnn_graph * get_qnn_graph_from_cache(qnn::ggml_backend_qnn_device_context * ctx, const ggml_cgraph * cgraph) {
    auto & graph_cache = ctx->qnn_graph_cache;
    std::string graph_key;
    auto op_data_type = qnn::qnn_graph::get_graph_key_from_cgraph(cgraph, graph_key);

@@ -178,7 +178,7 @@ inline bool is_type_bit_enabled(uint64_t bits, ggml_type type) {
    return bits & (uint64_t(1) << type);
}

-inline bool is_tensor_size_valid(ggml_backend_qnn_device_context * ctx, const ggml_tensor * tensor) {
+inline bool is_tensor_size_valid(qnn::ggml_backend_qnn_device_context * ctx, const ggml_tensor * tensor) {
    constexpr const auto get_tensor_size_in_bytes = [](const ggml_tensor * tensor, ggml_type type) -> size_t {
        return tensor->ne[0] * tensor->ne[1] * tensor->ne[2] * tensor->ne[3] * ggml_type_size(type);
    };

@@ -200,7 +200,7 @@ inline bool is_tensor_size_valid(ggml_backend_qnn_device_context * ctx, const gg
    return true;
}

-bool is_tensor_type_valid(ggml_backend_qnn_device_context * ctx, const ggml_tensor * tensor) {
+bool is_tensor_type_valid(qnn::ggml_backend_qnn_device_context * ctx, const ggml_tensor * tensor) {
    if (!tensor) {
        QNN_LOG_DEBUG("tensor is nullptr\n");
        return false;

@@ -239,7 +239,7 @@ bool is_data_reinterpretation_op(ggml_op op) {
    return op == GGML_OP_VIEW || op == GGML_OP_PERMUTE;
}

-bool ggnl_qnn_supports_op_tensor(ggml_backend_qnn_device_context * ctx, const ggml_tensor * op) {
+bool ggnl_qnn_supports_op_tensor(qnn::ggml_backend_qnn_device_context * ctx, const ggml_tensor * op) {
    if (op->op == GGML_OP_NONE) {
        return true;
    }

@@ -265,7 +265,7 @@ bool ggnl_qnn_supports_op_tensor(ggml_backend_qnn_device_context * ctx, const gg
    return true;
}

-bool ggml_qnn_have_same_tensor_types(ggml_backend_qnn_device_context * ctx, const ggml_tensor * op) {
+bool ggml_qnn_have_same_tensor_types(qnn::ggml_backend_qnn_device_context * ctx, const ggml_tensor * op) {
    auto * src0 = op->src[0];
    auto * src1 = op->src[1];
    if (src1) {

@@ -291,7 +291,7 @@ bool ggml_qnn_have_same_tensor_types(ggml_backend_qnn_device_context * ctx, cons
}

// TODO: move to caps array?
-bool ggml_qnn_supports_matmul_op(ggml_backend_qnn_device_context * ctx, const ggml_tensor * op) {
+bool ggml_qnn_supports_matmul_op(qnn::ggml_backend_qnn_device_context * ctx, const ggml_tensor * op) {
    auto * src0 = op->src[0];
    auto * src1 = op->src[1];
    if (is_data_reinterpretation_op(src0->op) || is_data_reinterpretation_op(src1->op)) {

@@ -343,7 +343,7 @@ bool ggml_qnn_supports_matmul_op(ggml_backend_qnn_device_context * ctx, const gg

#ifndef NDEBUG

-void print_tensor_info(ggml_backend_qnn_device_context * ctx, const ggml_tensor * op, bool is_supported) {
+void print_tensor_info(qnn::ggml_backend_qnn_device_context * ctx, const ggml_tensor * op, bool is_supported) {
    const char * supported = is_supported ? "supported" : "unsupported";
    std::string op_key;
    qnn::get_qnn_op_desc(op, true, GGML_TYPE_COUNT, op_key);

@@ -358,7 +358,7 @@ void print_tensor_info(ggml_backend_qnn_device_context * ctx, const ggml_tensor

namespace qnn {

-bool device_supports_op(ggml_backend_qnn_device_context * ctx, const ggml_tensor * op) {
+bool device_supports_op(qnn::ggml_backend_qnn_device_context * ctx, const ggml_tensor * op) {
    // Note that this function could be called before the device context is initialized
    if (op->op == GGML_OP_NONE) {
        return true;

@@ -435,7 +435,7 @@ bool device_supports_op(ggml_backend_qnn_device_context * ctx, const ggml_tensor
    return is_op_supported;
}

-bool device_compute_graph(ggml_backend_qnn_device_context * ctx, ggml_cgraph * cgraph) {
+bool device_compute_graph(qnn::ggml_backend_qnn_device_context * ctx, ggml_cgraph * cgraph) {
    QNN_LOG_DEBUG("[%s]compute graph start, nodes count: %d\n", qnn::get_backend_name(ctx->device),
                  (int) cgraph->n_nodes);

@@ -1,4 +1,3 @@
-
#pragma once

#ifndef NDEBUG

@@ -18,15 +17,15 @@
#include "qnn-lib.hpp"

namespace qnn {
+
typedef std::unordered_map<std::string, std::unique_ptr<qnn::qnn_graph>> qnn_graph_cache_t;
-} // namespace qnn

struct ggml_backend_qnn_device_context {
    // initialize in constructor
-    QNNBackend  device;
-    size_t      threads;
-    std::string name;
-    std::string description;
+    backend_index_type device;
+    size_t             threads;
+    std::string        name;
+    std::string        description;

    // initialize in qnn init
    qnn::qcom_socinfo socinfo = {};

@@ -46,10 +45,15 @@ struct ggml_backend_qnn_device_context {
    uint64_t supported_types;
    uint64_t cpu_preprocess_types;

-    explicit ggml_backend_qnn_device_context(QNNBackend device, size_t threads, const char * name,
+    explicit ggml_backend_qnn_device_context(backend_index_type device, size_t threads, const char * name,
                                             uint64_t supported_types) :
        device(device),
        threads(threads),
        name(name),
        supported_types(supported_types) {}
};
+
+bool device_supports_op(ggml_backend_qnn_device_context * ctx, const ggml_tensor * op);
+bool device_compute_graph(ggml_backend_qnn_device_context * ctx, ggml_cgraph * cgraph);
+
+} // namespace qnn

@@ -1,11 +1,9 @@
#include "ggml-qnn.h"

#include <functional>
#include <memory>
#include <vector>

#include "backend-ops.hpp"
#include "backend.hpp"
#include "common.hpp"
#include "ggml-backend-impl.h"
#include "ggml-impl.h"
#include "logger.hpp"

@@ -14,8 +12,8 @@

namespace {

-ggml_backend_qnn_device_context * get_device_context(ggml_backend_dev_t dev) {
-    return reinterpret_cast<ggml_backend_qnn_device_context *>(dev->context);
+qnn::ggml_backend_qnn_device_context * get_device_context(ggml_backend_dev_t dev) {
+    return reinterpret_cast<qnn::ggml_backend_qnn_device_context *>(dev->context);
}

qnn::qnn_buffer_interface * get_buffer_context(ggml_backend_buffer_t buffer) {

@@ -141,6 +139,16 @@ void ggml_backend_qnn_free(ggml_backend_t backend) {
    delete backend;
}

+ggml_guid_t ggml_backend_qnn_guid() {
+    static ggml_guid guid = { 0x1a, 0x2b, 0x3c, 0x4d, 0x5e, 0x6f, 0x70, 0x81,
+                              0x92, 0xa3, 0xb4, 0xc5, 0xd6, 0xe7, 0xf8, 0x09 };
+    return &guid;
+}
+
+bool ggml_backend_is_qnn(ggml_backend_t backend) {
+    return ggml_guid_matches(backend->guid, ggml_backend_qnn_guid());
+}
+
bool ggml_backend_qnn_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const ggml_tensor * src,
                                       ggml_tensor * dst) {
    GGML_UNUSED(backend_src);

@@ -154,7 +162,7 @@ bool ggml_backend_qnn_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_
}

ggml_backend_buffer_type_t ggml_backend_qnn_buffer_type(ggml_backend_dev_t dev) {
-    static ggml_backend_buffer_type ggml_backend_qnn_buffer_types[GGML_QNN_MAX_DEVICES];
+    static ggml_backend_buffer_type ggml_backend_qnn_buffer_types[QNN_BACKEND_COUNT];
    auto * dev_ctx = get_device_context(dev);
    if (!ggml_backend_qnn_buffer_types[dev_ctx->device].device) {
        ggml_backend_qnn_buffer_types[dev_ctx->device] = {

@@ -215,8 +223,8 @@ const char * ggml_backend_qnn_device_get_description(ggml_backend_dev_t dev) {

void ggml_backend_qnn_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
    GGML_UNUSED(dev);
-    *free  = qnn::get_system_free_memory_in_bytes();
-    *total = qnn::get_system_total_memory_in_bytes();
+    *free  = common::get_system_free_memory_in_bytes();
+    *total = common::get_system_total_memory_in_bytes();
    QNN_LOG_DEBUG("free memory: %ldMB, total memory: %ldMB\n", (*free / 1048576), (*total) / 1048576);
}

@@ -237,12 +245,6 @@ void ggml_backend_qnn_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_
    };
}

-ggml_guid_t ggml_backend_qnn_guid() {
-    static ggml_guid guid = { 0x1a, 0x2b, 0x3c, 0x4d, 0x5e, 0x6f, 0x70, 0x81,
-                              0x92, 0xa3, 0xb4, 0xc5, 0xd6, 0xe7, 0xf8, 0x09 };
-    return &guid;
-}
-
ggml_backend_t ggml_backend_qnn_init_with_device_context(ggml_backend_dev_t dev, const char * extend_lib_search_path) {
    if (!extend_lib_search_path) {
        extend_lib_search_path = GGML_QNN_DEFAULT_LIB_SEARCH_PATH;

@@ -256,8 +258,7 @@ ggml_backend_t ggml_backend_qnn_init_with_device_context(ggml_backend_dev_t dev,
    QNN_LOG_DEBUG("device %s\n", qnn::get_backend_name(device));
    QNN_LOG_DEBUG("extend_lib_search_path %s\n", extend_lib_search_path);
    auto instance = std::make_shared<qnn::qnn_instance>(extend_lib_search_path, device);
-    auto result = instance->qnn_init(nullptr);
-    if (result != 0) {
+    if (!instance->qnn_init(nullptr)) {
        QNN_LOG_WARN("failed to init qnn backend %s\n", qnn::get_backend_name(device));
        return nullptr;
    }

@@ -351,80 +352,43 @@ constexpr const ggml_backend_device_i ggml_backend_qnn_device_interface = {
    /* .event_synchronize = */ nullptr,
};

-/*
- * -----------------------------------------------------------------------------------------------
- * qnn backend registry object
- * -----------------------------------------------------------------------------------------------
- */
-
-struct ggml_backend_qnn_reg_impl : ggml_backend_reg {
-    std::vector<std::unique_ptr<ggml_backend_qnn_device_context>> device_contexts;
-    std::vector<ggml_backend_device>                              devices;
-
-    explicit ggml_backend_qnn_reg_impl(ggml_backend_reg_i interface) {
-        context = this;
-        iface   = interface;
-
-        QNN_LOG_DEBUG("qnn backend registry init\n");
-        for (size_t i = 0; i < QNN_BACKEND_COUNT; i++) {
-            const auto device_enum = (QNNBackend) (QNN_BACKEND_COUNT - 1 - i);  // init from the last device, i.e. NPU
-#ifndef GGML_QNN_ENABLE_CPU_BACKEND
-            if (device_enum == QNN_BACKEND_CPU) {
-                /*
-                 * here we skip the initialization of CPU device,
-                 * cause it'll block unsupported ops fallback to ggml cpu backend
-                 */
-                QNN_LOG_DEBUG("qnn backend registry skip CPU device\n");
-                continue;
-            }
-#endif
-
-            const auto & device_caps = qnn::get_device_caps(device_enum);
-            device_contexts.emplace_back(std::make_unique<ggml_backend_qnn_device_context>(
-                /* .device = */ device_enum,  // init from the last device, i.e. NPU
-                /* .threads = */ 1,
-                /* .name = */ qnn::get_backend_name(device_enum),
-                /* .supported_types = */ device_caps.supported_types));
-
-            devices.emplace_back(ggml_backend_device{
-                /* iface = */ ggml_backend_qnn_device_interface,
-                /* reg = */ this,
-                /* context = */ device_contexts.back().get(),
-            });
-        }
-    }
-};
-
-const char * ggml_backend_qnn_reg_get_name(ggml_backend_reg_t reg) {
-    GGML_UNUSED(reg);
-    return GGML_QNN_NAME;
-}
-
-size_t ggml_backend_qnn_reg_get_device_count(ggml_backend_reg_t reg) {
-    auto * ctx = (ggml_backend_qnn_reg_impl *) reg->context;
-    return ctx->devices.size();
-}
-
-ggml_backend_dev_t ggml_backend_qnn_reg_get_device(ggml_backend_reg_t reg, size_t index) {
-    auto * ctx = (ggml_backend_qnn_reg_impl *) reg->context;
-    GGML_ASSERT(index < ctx->devices.size());
-    return &(ctx->devices[index]);
-}
-
-const ggml_backend_reg_i ggml_backend_qnn_reg_interface = {
-    /* .get_name         = */ ggml_backend_qnn_reg_get_name,
-    /* .get_device_count = */ ggml_backend_qnn_reg_get_device_count,
-    /* .get_device_get   = */ ggml_backend_qnn_reg_get_device,
-    /* .get_proc_address = */ nullptr,
-};
+class qnn_device_proxy : public backend_device_proxy {
+  public:
+    explicit qnn_device_proxy(backend_index_type device) {
+        const auto & device_caps = qnn::get_device_caps(device);
+        _device_context          = std::make_unique<qnn::ggml_backend_qnn_device_context>(
+            /* .device = */ device,
+            /* .threads = */ 1,  // TODO: fix this
+            /* .name = */ qnn::get_backend_name(device),
+            /* .supported_types = */ device_caps.supported_types);
+    }
+
+    const ggml_backend_device_i & get_iface() const { return ggml_backend_qnn_device_interface; }
+
+    void * get_context() { return _device_context.get(); }
+
+  private:
+    std::unique_ptr<qnn::ggml_backend_qnn_device_context> _device_context;
+};

} // namespace

-bool ggml_backend_is_qnn(ggml_backend_t backend) {
-    return ggml_guid_matches(backend->guid, ggml_backend_qnn_guid());
-}
-
-ggml_backend_reg_t ggml_backend_qnn_reg() {
-    static ggml_backend_qnn_reg_impl reg{ ggml_backend_qnn_reg_interface };
-    return &reg;
-}
+backend_device_proxy_ptr create_qnn_backend_context(backend_index_type device) {
+    if (device >= QNN_BACKEND_COUNT) {
+        QNN_LOG_ERROR("[qnn]invalid device %d\n", device);
+        return backend_device_proxy_ptr();
+    }
+
+#ifndef GGML_QNN_ENABLE_CPU_BACKEND
+    if (device == QNN_BACKEND_CPU) {
+        /*
+         * here we skip the initialization of CPU device,
+         * cause it'll block unsupported ops fallback to ggml cpu backend
+         */
+        GGML_LOG_DEBUG("qnn backend registry skip CPU device\n");
+        return backend_device_proxy_ptr();
+    }
+#endif
+
+    return std::make_unique<qnn_device_proxy>(device);
+}

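// Editor's sketch (types as named in this hunk): both QNN and hexagon devices
// now come out of factory functions as proxies owning their own context, so
// the shared registry only has to do something like:
//
//   auto proxy = create_qnn_backend_context(QNN_BACKEND_NPU);
//   if (proxy) {
//       ggml_backend_device dev{ proxy->get_iface(), /* reg */ nullptr, proxy->get_context() };
//   }
//
// The actual reg wiring is assumed to live in the shared registry code, which
// is outside this hunk.
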
@@ -35,7 +35,7 @@ int get_op_max_rank(const ggml_tensor * op) {
}

qnn::qnn_tensor_ptr_t create_tensor_with_cache(ggml_tensor * tensor, qnn::ggml_qnn_tensor::tensor_type_t type, int rank,
-                                               ggml_type override_data_type, QNNBackend device,
+                                               ggml_type override_data_type, backend_index_type device,
                                               Qnn_GraphHandle_t graph_handle,
                                               std::shared_ptr<qnn::qnn_instance> qnn_instance,
                                               qnn_tensor_cache_t & tensor_cache) {

@@ -60,7 +60,7 @@ qnn::qnn_tensor_ptr_t create_tensor_with_cache(ggml_tensor * tensor, qnn::ggml_q

qnn::qnn_tensor_array_t create_tensors_with_cache(const qnn::ggml_tensor_array_t & ggml_tensors,
                                                  qnn::ggml_qnn_tensor::tensor_type_t type, int rank,
-                                                 ggml_type override_data_type, QNNBackend device,
+                                                 ggml_type override_data_type, backend_index_type device,
                                                  Qnn_GraphHandle_t graph_handle,
                                                  std::shared_ptr<qnn::qnn_instance> qnn_instance,
                                                  qnn_tensor_cache_t & tensor_cache) {

@@ -74,7 +74,7 @@ qnn::qnn_tensor_array_t create_tensors_with_cache(const qnn::ggml_tensor_array_t
}

qnn::qnn_op_config_ptr_t create_operation_from_op_tensor(ggml_tensor * dst, const std::string & name, int rank,
-                                                         QNNBackend device, Qnn_GraphHandle_t graph_handle,
+                                                         backend_index_type device, Qnn_GraphHandle_t graph_handle,
                                                         std::shared_ptr<qnn::qnn_instance> qnn_instance,
                                                         qnn_tensor_cache_t & tensor_cache) {
    auto operation = qnn::create_op(dst, name, qnn_instance);

@@ -335,7 +335,7 @@ ggml_type qnn_graph::get_graph_key_from_cgraph(const ggml_cgraph * cgraph, std::
    return min_op_type;
}

-qnn_graph::qnn_graph(const std::string & graph_name, QNNBackend device, qnn_instance_ptr qnn_instance,
+qnn_graph::qnn_graph(const std::string & graph_name, backend_index_type device, qnn_instance_ptr qnn_instance,
                     htp_precision precision, size_t vtcm_size_in_mb) :
    _graph_name(graph_name),
    _device(device),

@@ -45,7 +45,7 @@ class qnn_graph {
     */
    static ggml_type get_graph_key_from_cgraph(const ggml_cgraph * cgraph, std::string & output);

-    explicit qnn_graph(const std::string & graph_name, QNNBackend device, qnn_instance_ptr qnn_instance,
+    explicit qnn_graph(const std::string & graph_name, backend_index_type device, qnn_instance_ptr qnn_instance,
                       htp_precision precision, size_t vtcm_size_in_mb);

    ~qnn_graph();

@@ -62,17 +62,17 @@ class qnn_graph {

    const std::string & get_name() const { return _graph_name; }

-    QNNBackend get_device() const { return _device; }
+    backend_index_type get_device() const { return _device; }

  private:
    bool finalize();

-    const std::string     _graph_name;
-    const QNNBackend      _device;
-    Qnn_GraphHandle_t     _graph_handle = nullptr;
-    qnn_instance_ptr      _qnn_instance;
-    qnn_interface_ptr     _qnn_interface;
-    qnn_op_config_array_t _operations;
+    const std::string        _graph_name;
+    const backend_index_type _device;
+    Qnn_GraphHandle_t        _graph_handle = nullptr;
+    qnn_instance_ptr         _qnn_instance;
+    qnn_interface_ptr        _qnn_interface;
+    qnn_op_config_array_t    _operations;

    qnn_tensor_array_t _tensor_inputs;
    qnn_tensor_array_t _tensor_outputs;

@@ -0,0 +1,88 @@
<?xml version="1.0" encoding="UTF-8"?>
<OpDefCollection PackageName="GgmlOpPackage" Domain="ggml" Version="1.0">
    <OpDefList>
        <OpDef>
            <Name>GgmlMulMat</Name>
            <Description>
                <Content>
                    GGML MulMat operator
                </Content>
            </Description>

            <Input>
                <Name>in[0]</Name>
                <Description>
                    <Content>src0</Content>
                </Description>
                <Mandatory>true</Mandatory>
                <Datatype>BACKEND_SPECIFIC</Datatype>
                <Shape>
                    <Rank>4D</Rank>
                    <Layout>NHWC</Layout>
                    <Text>[N, C, H, W]</Text>
                </Shape>
            </Input>

            <Input>
                <Name>in[1]</Name>
                <Description>
                    <Content>src1</Content>
                </Description>
                <Mandatory>true</Mandatory>
                <Datatype>BACKEND_SPECIFIC</Datatype>
                <Shape>
                    <Rank>4D</Rank>
                    <Layout>NHWC</Layout>
                    <Text>[N, C, H, W]</Text>
                </Shape>
            </Input>

            <Output>
                <Name>out[0]</Name>
                <Description>
                    <Content>dst</Content>
                </Description>
                <Mandatory>true</Mandatory>
                <Datatype>BACKEND_SPECIFIC</Datatype>
                <Shape>
                    <Rank>4D</Rank>
                    <Text>[N, C, H, W]</Text>
                </Shape>
            </Output>

            <!--This Op is implemented on these Backends-->
            <SupportedBackend>HTP</SupportedBackend>
        </OpDef>

    </OpDefList>

    <SupplementalOpDefList Backend="HTP">
        <SupportedOps>
            <OpName>GgmlMulMat</OpName>
        </SupportedOps>

        <!--ggml-mul-mat-->
        <SupplementalOpDef>
            <Name>GgmlMulMat</Name>

            <Input>
                <Name>in[0]</Name>
                <Datatype>QNN_DATATYPE_FLOAT_16</Datatype>
                <Datatype>QNN_DATATYPE_FLOAT_32</Datatype>
            </Input>
            <Input>
                <Name>in[1]</Name>
                <Datatype>QNN_DATATYPE_FLOAT_16</Datatype>
                <Datatype>QNN_DATATYPE_FLOAT_32</Datatype>
            </Input>

            <Output>
                <Name>out[0]</Name>
                <Datatype>QNN_DATATYPE_FLOAT_16</Datatype>
                <Datatype>QNN_DATATYPE_FLOAT_32</Datatype>
            </Output>
        </SupplementalOpDef>

    </SupplementalOpDefList>

</OpDefCollection>

@ -0,0 +1,357 @@
|
|||
# check all setup prerequisites if the command goal is not clean
|
||||
ifneq ($(MAKECMDGOALS),clean)
|
||||
ifndef QNN_INCLUDE
|
||||
$(info "INFO: Qnn include not explicitly defined, attempting to use QNN_SDK_ROOT if it is valid")
|
||||
QNN_INCLUDE := $(QNN_SDK_ROOT)/include/QNN
|
||||
endif
|
||||
ifeq ($(wildcard $(QNN_INCLUDE)),)
|
||||
$(error "ERROR: QNN_INCLUDE path is not set. QNN include paths must be set to obtain BE headers necessary to compile the package")
|
||||
endif
|
||||
ifndef QNN_TARGET_LIB
|
||||
$(info "INFO: Qnn target not explicitly defined, attempting to use QNN_SDK_ROOT if it is valid")
|
||||
QNN_TARGET_LIB := $(QNN_SDK_ROOT)/lib/aarch64-android
|
||||
endif
|
||||
ifeq ($(wildcard $(QNN_TARGET_LIB)),)
|
||||
ifeq ($(MAKECMDGOALS),htp_aarch64)
|
||||
$(error "ERROR: QNN_TARGET_LIB is needed to compile package for aarch64")
|
||||
else ifeq ($(MAKECMDGOALS),all)
|
||||
$(info "WARNING:QNN_TARGET_LIB may need to be defined to compile packages")
|
||||
endif
|
||||
endif
|
||||
|
||||
ifndef HEXAGON_SDK_ROOT
|
||||
$(error "ERROR: HEXAGON_SDK_ROOT is not set. Hexagon-SDK path must be set to the latest hexagon-sdk-x.y.z")
|
||||
endif
|
||||
|
||||
ifeq ($(wildcard $(HEXAGON_SDK_ROOT)),)
|
||||
$(error "ERROR: HEXAGON_SDK_ROOT is not set correctly. Please set HEXAGON_SDK_ROOT to latest hexagon-sdk-X.Y.Z path")
|
||||
endif
|
||||
|
||||
HEXAGON_SDK_BASE := $(dir $(HEXAGON_SDK_ROOT))
|
||||
|
||||
$(info "HEXAGON_SDK_ROOT is [${HEXAGON_SDK_ROOT}]")
|
||||
# Users should note that the tools version may change between hexagon sdk versions
|
||||
# Following combination of SDK and Tool version is supported
|
||||
# fix the sdk root for new versions
|
||||
HEXAGON_SDK_ROOT_V68 := $(HEXAGON_SDK_ROOT)
|
||||
HEXAGON_SDK_ROOT_V69 := $(HEXAGON_SDK_ROOT)
|
||||
HEXAGON_SDK_ROOT_V73 := $(HEXAGON_SDK_ROOT)
|
||||
HEXAGON_SDK_ROOT_V75 := $(HEXAGON_SDK_ROOT)
|
||||
HEXAGON_SDK_ROOT_V79 := $(HEXAGON_SDK_ROOT)
|
||||
|
||||
#Updated to point to latest sdk to match with libQnnHtp.so
|
||||
HEXAGON_SDK_ROOT_X86 := $(HEXAGON_SDK_ROOT)
|
||||
HEXAGON_TOOLS_VERSION_V68 := 8.7.06
|
||||
HEXAGON_TOOLS_VERSION_V69 := 8.7.06
|
||||
HEXAGON_TOOLS_VERSION_V73 := 8.7.06
|
||||
HEXAGON_TOOLS_VERSION_V75 := 8.7.06
|
||||
HEXAGON_TOOLS_VERSION_V79 := 8.7.06
|
||||
|
||||
#Updated to point to latest sdk to match with libQnnHtp.so
|
HEXAGON_TOOLS_VERSION_X86 := 8.7.06

ifndef ANDROID_NDK_ROOT
ifeq ($(MAKECMDGOALS),htp_aarch64)
$(error "ERROR: ANDROID_NDK_ROOT is not set. Android NDK path must be set to compile package for aarch64")
else ifeq ($(MAKECMDGOALS),all)
$(info "WARNING: ANDROID_NDK_ROOT is not set. Android NDK path must be set to compile package for aarch64")
endif
endif

ifndef PACKAGE_NAME
export
PACKAGE_NAME := $(notdir $(shell pwd))
$(info "INFO: No package name defined. Using current directory name: $(PACKAGE_NAME) as the package name")
endif

WORK := build
SRC_DIR := src
OP_SRC_DIR := src/ops
OP_INCLUDE_DIR := ./include
OP_INCLUDES = #$(wildcard $(OP_INCLUDE_DIR)/*.h) user defined if any op specific headers are needed, add -I to common flags
LIBRARY_NAME := libQnn$(PACKAGE_NAME).so
SUPPORTED_TARGETS = x86_64-linux-clang hexagon-v68 hexagon-v69 hexagon-v73 hexagon-v75 hexagon-v79 aarch64-android


COMMON_CXX_FLAGS = -std=c++17 -I$(QNN_INCLUDE) -fPIC -Wall -Wreorder -Wno-missing-braces -Wno-unused-function
COMMON_CXX_FLAGS += -Werror -Wno-format -Wno-unused-command-line-argument -fvisibility=default -stdlib=libc++
COMMON_CXX_FLAGS += -DQNN_API="__attribute__((visibility(\"default\")))" -D__QAIC_HEADER_EXPORT="__attribute__((visibility(\"default\")))"

X86_LIBNATIVE_RELEASE_DIR := $(HEXAGON_SDK_ROOT_X86)/tools/HEXAGON_Tools/$(HEXAGON_TOOLS_VERSION_X86)/Tools

# Ensure hexagon sdk tool version can be retrieved
ifeq ($(wildcard $(X86_LIBNATIVE_RELEASE_DIR)/.),)
$(error "Cannot retrieve hexagon tools from: $(X86_LIBNATIVE_RELEASE_DIR). \
\
Please check that hexagon tools version is correct. Expected: $(HEXAGON_TOOLS_VERSION_X86)")
endif

# Check tools for hexagon_v68 are present.
ifeq ($(MAKECMDGOALS),htp_v68)
ifeq ($(wildcard $(HEXAGON_SDK_ROOT_V68)),)
$(error "ERROR: HEXAGON_SDK_ROOT_V68 is set incorrectly. Cannot retrieve $(HEXAGON_SDK_ROOT_V68)")
endif
endif

ifeq ($(MAKECMDGOALS),htp_v69)
ifeq ($(wildcard $(HEXAGON_SDK_ROOT_V69)),)
$(error "ERROR: HEXAGON_SDK_ROOT_V69 is set incorrectly. Cannot retrieve $(HEXAGON_SDK_ROOT_V69)")
endif
endif

ifeq ($(MAKECMDGOALS),htp_v73)
ifeq ($(wildcard $(HEXAGON_SDK_ROOT_V73)),)
$(error "ERROR: HEXAGON_SDK_ROOT_V73 is set incorrectly. Cannot retrieve $(HEXAGON_SDK_ROOT_V73)")
endif
endif

ifeq ($(MAKECMDGOALS),htp_v75)
ifeq ($(wildcard $(HEXAGON_SDK_ROOT_V75)),)
$(error "ERROR: HEXAGON_SDK_ROOT_V75 is set incorrectly. Cannot retrieve $(HEXAGON_SDK_ROOT_V75)")
endif
endif

# Check tools for hexagon_v79 are present.
ifeq ($(MAKECMDGOALS),htp_v79)
ifeq ($(wildcard $(HEXAGON_SDK_ROOT_V79)),)
$(error "ERROR: HEXAGON_SDK_ROOT_V79 is set incorrectly. Cannot retrieve $(HEXAGON_SDK_ROOT_V79)")
endif
endif

endif

OP_SOURCES = $(wildcard $(OP_SRC_DIR)/*.cpp)
OTHER_SOURCES = $(wildcard $(SRC_DIR)/*.cpp)
HFILES = $(wildcard $(QNN_INCLUDE)/*.h)
HFILES += $(wildcard $(QNN_INCLUDE)/HTP/*.h)
HFILES += $(wildcard $(QNN_INCLUDE)/HTP/core/*.h)
OP_OBJS = $(patsubst $(SRC_DIR)/%,%,$(patsubst %.cpp,%.o,$(OP_SOURCES)))
OTHER_OBJS = $(patsubst $(SRC_DIR)/%,%,$(patsubst %.cpp,%.o,$(OTHER_SOURCES)))

#======= Assembly ========
OP_SOURCES_ASM_X86 += $(wildcard $(OP_SRC_DIR)/x86_asm/*.S)
OP_OBJS_ASM_X86 += $(subst /x86_asm/,/,$(patsubst $(SRC_DIR)/%,%,$(patsubst %.S,%.o,$(OP_SOURCES_ASM_X86))))
OP_SOURCES_ASM_V68 += $(wildcard $(OP_SRC_DIR)/v68_asm/*.S)
OP_OBJS_ASM_V68 += $(subst /v68_asm/,/,$(patsubst $(SRC_DIR)/%,%,$(patsubst %.S,%.o,$(OP_SOURCES_ASM_V68))))
OP_SOURCES_ASM_V69 += $(wildcard $(OP_SRC_DIR)/v69_asm/*.S)
OP_OBJS_ASM_V69 += $(subst /v69_asm/,/,$(patsubst $(SRC_DIR)/%,%,$(patsubst %.S,%.o,$(OP_SOURCES_ASM_V69))))
OP_SOURCES_ASM_V73 += $(wildcard $(OP_SRC_DIR)/v73_asm/*.S)
OP_OBJS_ASM_V73 += $(subst /v73_asm/,/,$(patsubst $(SRC_DIR)/%,%,$(patsubst %.S,%.o,$(OP_SOURCES_ASM_V73))))
OP_SOURCES_ASM_V75 += $(wildcard $(OP_SRC_DIR)/v75_asm/*.S)
OP_OBJS_ASM_V75 += $(subst /v75_asm/,/,$(patsubst $(SRC_DIR)/%,%,$(patsubst %.S,%.o,$(OP_SOURCES_ASM_V75))))
OP_SOURCES_ASM_V79 += $(wildcard $(OP_SRC_DIR)/v79_asm/*.S)
OP_OBJS_ASM_V79 += $(subst /v79_asm/,/,$(patsubst $(SRC_DIR)/%,%,$(patsubst %.S,%.o,$(OP_SOURCES_ASM_V79))))

OP_SOURCES_ASM_ANDROID += $(wildcard $(OP_SRC_DIR)/android_asm/*.S)
OP_OBJS_ASM_ANDROID += $(subst /android_asm/,/,$(patsubst $(SRC_DIR)/%,%,$(patsubst %.S,%.o,$(OP_SOURCES_ASM_ANDROID))))


all: htp_v68 htp_x86 htp_aarch64

#============================================================================================================
# Setup compiler, compiler instructions and linker for x86
X86_CXX ?= clang++-9
# Checking if clang++-9 is present. If not switch to clang++
ifeq ($(shell $(X86_CXX) -v 2>&1 | grep -c "clang version"), 0)
X86_CXX := clang++
endif
X86_LDFLAGS := -Wl,--whole-archive -L$(X86_LIBNATIVE_RELEASE_DIR)/libnative/lib -lnative -Wl,--no-whole-archive -lpthread -L$(QNN_SDK_ROOT)/lib/x86_64-linux-clang -lHtpPrepare
X86_C_FLAGS := -D__HVXDBL__ -I$(X86_LIBNATIVE_RELEASE_DIR)/libnative/include -ffast-math -DUSE_OS_LINUX
X86_CXX_FLAGS = $(COMMON_CXX_FLAGS) $(X86_C_FLAGS) -fomit-frame-pointer -Wno-invalid-offsetof
linux_objs =
#============================================================================================================
# Setup compiler, compiler instructions and linker for hexagon
HEXAGON_CXX_FLAGS := $(COMMON_CXX_FLAGS) -mhvx -mhvx-length=128B -mhmx -DUSE_OS_QURT -O2 -Wno-reorder -DPREPARE_DISABLED

HEXAGON_CXX_FLAGS_V68 := $(HEXAGON_CXX_FLAGS) -mv68 -I$(HEXAGON_SDK_ROOT_V68)/rtos/qurt/computev68/include/qurt -I$(HEXAGON_SDK_ROOT_V68)/rtos/qurt/computev68/include/posix -I$(HEXAGON_SDK_ROOT_V68)/incs -I$(HEXAGON_SDK_ROOT_V68)/incs/stddef
HEXAGON_CXX_FLAGS_V69 := $(HEXAGON_CXX_FLAGS) -mv69 -I$(HEXAGON_SDK_ROOT_V69)/rtos/qurt/computev69/include/qurt -I$(HEXAGON_SDK_ROOT_V69)/rtos/qurt/computev69/include/posix -I$(HEXAGON_SDK_ROOT_V69)/incs -I$(HEXAGON_SDK_ROOT_V69)/incs/stddef
HEXAGON_CXX_FLAGS_V73 := $(HEXAGON_CXX_FLAGS) -mv73 -I$(HEXAGON_SDK_ROOT_V73)/rtos/qurt/computev73/include/qurt -I$(HEXAGON_SDK_ROOT_V73)/rtos/qurt/computev73/include/posix -I$(HEXAGON_SDK_ROOT_V73)/incs -I$(HEXAGON_SDK_ROOT_V73)/incs/stddef
HEXAGON_CXX_FLAGS_V75 := $(HEXAGON_CXX_FLAGS) -mv75 -I$(HEXAGON_SDK_ROOT_V75)/rtos/qurt/computev75/include/qurt -I$(HEXAGON_SDK_ROOT_V75)/rtos/qurt/computev75/include/posix -I$(HEXAGON_SDK_ROOT_V75)/incs -I$(HEXAGON_SDK_ROOT_V75)/incs/stddef
HEXAGON_CXX_FLAGS_V79 := $(HEXAGON_CXX_FLAGS) -mv79 -I$(HEXAGON_SDK_ROOT_V79)/rtos/qurt/computev79/include/qurt -I$(HEXAGON_SDK_ROOT_V79)/rtos/qurt/computev79/include/posix -I$(HEXAGON_SDK_ROOT_V79)/incs -I$(HEXAGON_SDK_ROOT_V79)/incs/stddef


HEXAGON_CXX_V68 := $(HEXAGON_SDK_ROOT_V68)/tools/HEXAGON_Tools/$(HEXAGON_TOOLS_VERSION_V68)/Tools/bin/hexagon-clang++
HEXAGON_CXX_V69 := $(HEXAGON_SDK_ROOT_V69)/tools/HEXAGON_Tools/$(HEXAGON_TOOLS_VERSION_V69)/Tools/bin/hexagon-clang++
HEXAGON_CXX_V73 := $(HEXAGON_SDK_ROOT_V73)/tools/HEXAGON_Tools/$(HEXAGON_TOOLS_VERSION_V73)/Tools/bin/hexagon-clang++
HEXAGON_CXX_V75 := $(HEXAGON_SDK_ROOT_V75)/tools/HEXAGON_Tools/$(HEXAGON_TOOLS_VERSION_V75)/Tools/bin/hexagon-clang++
HEXAGON_CXX_V79 := $(HEXAGON_SDK_ROOT_V79)/tools/HEXAGON_Tools/$(HEXAGON_TOOLS_VERSION_V79)/Tools/bin/hexagon-clang++


HEX_LDFLAGS =
hexagon_objs =
#============================================================================================================
# Setup compiler, compiler instructions and linker for aarch64
AARCH64_C__FLAGS = -D__HVXDBL__ -I$(X86_LIBNATIVE_RELEASE_DIR)/libnative/include -ffast-math -DUSE_OS_LINUX -DANDROID
AARCH64_CXX_FLAGS = $(COMMON_CXX_FLAGS) $(AARCH64_C__FLAGS) -fomit-frame-pointer -Wno-invalid-offsetof -Wno-unused-variable -Wno-unused-parameter -Wno-missing-braces -Wno-sign-compare -Wno-unused-private-field -Wno-unused-variable -Wno-ignored-qualifiers -Wno-missing-field-initializers
ARM_CLANG_OPTS = --target=aarch64-none-linux-android21 --sysroot=$(ANDROID_NDK_ROOT)/toolchains/llvm/prebuilt/linux-x86_64/sysroot -stdlib=libc++ -static-libstdc++
AARCH64_CXX = $(ANDROID_NDK_ROOT)/toolchains/llvm/prebuilt/linux-x86_64/bin/clang++ $(ARM_CLANG_OPTS)
AARCH64_LDFLAGS = -L$(QNN_TARGET_LIB) -lQnnHtp -lQnnHtpPrepare
aarch64_objs =
#============================================================================================================
# Setup targets and goals

htp_x86: X86_BUILD

htp_v68: HEXAGON_BUILD_V68

htp_v69: HEXAGON_BUILD_V69

htp_v73: HEXAGON_BUILD_V73

htp_v75: HEXAGON_BUILD_V75

htp_v79: HEXAGON_BUILD_V79


htp_aarch64: AARCH64_BUILD

AARCH64_BUILD: $(WORK)/aarch64-android/$(LIBRARY_NAME)

HEXAGON_BUILD_V68: $(WORK)/hexagon-v68/$(LIBRARY_NAME)

HEXAGON_BUILD_V69: $(WORK)/hexagon-v69/$(LIBRARY_NAME)

HEXAGON_BUILD_V73: $(WORK)/hexagon-v73/$(LIBRARY_NAME)

HEXAGON_BUILD_V75: $(WORK)/hexagon-v75/$(LIBRARY_NAME)

HEXAGON_BUILD_V79: $(WORK)/hexagon-v79/$(LIBRARY_NAME)


X86_BUILD: $(WORK)/x86_64-linux-clang/$(LIBRARY_NAME)


define build_objs =
ifneq ($(filter $(2),$(SUPPORTED_TARGETS)),)
$(2)_objs += $(foreach x,$(1),$(WORK)/$(2)/$(x))
else
$$(error "Unknown target option provided: $(2): Supported targets are: $(SUPPORTED_TARGETS)")
endif
endef

$(eval $(call build_objs,$(OTHER_OBJS),x86_64-linux-clang))
$(eval $(call build_objs,$(OP_OBJS),x86_64-linux-clang))
$(eval $(call build_objs,$(OP_OBJS_ASM_X86),x86_64-linux-clang))
$(eval $(call build_objs,$(OTHER_OBJS),hexagon-v68))
$(eval $(call build_objs,$(OP_OBJS),hexagon-v68))
$(eval $(call build_objs,$(OP_OBJS_ASM_V68),hexagon-v68))
$(eval $(call build_objs,$(OTHER_OBJS),hexagon-v69))
$(eval $(call build_objs,$(OP_OBJS),hexagon-v69))
$(eval $(call build_objs,$(OP_OBJS_ASM_V69),hexagon-v69))
$(eval $(call build_objs,$(OTHER_OBJS),hexagon-v73))
$(eval $(call build_objs,$(OP_OBJS),hexagon-v73))
$(eval $(call build_objs,$(OP_OBJS_ASM_V73),hexagon-v73))
$(eval $(call build_objs,$(OTHER_OBJS),hexagon-v75))
$(eval $(call build_objs,$(OP_OBJS),hexagon-v75))
$(eval $(call build_objs,$(OP_OBJS_ASM_V75),hexagon-v75))
$(eval $(call build_objs,$(OTHER_OBJS),hexagon-v79))
$(eval $(call build_objs,$(OP_OBJS),hexagon-v79))
$(eval $(call build_objs,$(OP_OBJS_ASM_V79),hexagon-v79))

$(eval $(call build_objs,$(OTHER_OBJS),aarch64-android))
$(eval $(call build_objs,$(OP_OBJS),aarch64-android))
$(eval $(call build_objs,$(OP_OBJS_ASM_ANDROID),aarch64-android))

# x86
$(WORK)/x86_64-linux-clang $(WORK)/hexagon-v68 $(WORK)/hexagon-v69 $(WORK)/hexagon-v73 $(WORK)/hexagon-v75 $(WORK)/hexagon-v79 $(WORK)/aarch64-android:
	@mkdir -p $@/ops

$(WORK)/x86_64-linux-clang/%.o: $(SRC_DIR)/%.cpp | $(WORK)/x86_64-linux-clang
	$(X86_CXX) $(X86_CXX_FLAGS) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@

$(WORK)/x86_64-linux-clang/ops/%.o: $(OP_SRC_DIR)/%.cpp | $(WORK)/x86_64-linux-clang
	$(X86_CXX) $(X86_CXX_FLAGS) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@

$(WORK)/x86_64-linux-clang/ops/%.o: $(OP_SRC_DIR)/x86_asm/%.S | $(WORK)/x86_64-linux-clang
	$(X86_CXX) $(X86_CXX_FLAGS) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@

$(WORK)/x86_64-linux-clang/$(LIBRARY_NAME): $(x86_64-linux-clang_objs) | $(HFILES)
	$(X86_CXX) -fPIC -std=c++17 -g -shared -o $@ $^ $(X86_LDFLAGS)

# v68
$(WORK)/hexagon-v68/%.o: $(SRC_DIR)/%.cpp | $(WORK)/hexagon-v68
	$(HEXAGON_CXX_V68) $(HEXAGON_CXX_FLAGS_V68) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@

$(WORK)/hexagon-v68/ops/%.o: $(OP_SRC_DIR)/%.cpp | $(WORK)/hexagon-v68
	$(HEXAGON_CXX_V68) $(HEXAGON_CXX_FLAGS_V68) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@

$(WORK)/hexagon-v68/ops/%.o: $(OP_SRC_DIR)/v68_asm/%.S | $(WORK)/hexagon-v68
	$(HEXAGON_CXX_V68) $(HEXAGON_CXX_FLAGS_V68) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@

$(WORK)/hexagon-v68/$(LIBRARY_NAME): $(hexagon-v68_objs) | $(HFILES)
	$(HEXAGON_CXX_V68) -fPIC -std=c++17 -g -shared -o $@ $^ $(HEX_LDFLAGS)

# v69
$(WORK)/hexagon-v69/%.o: $(SRC_DIR)/%.cpp | $(WORK)/hexagon-v69
	$(HEXAGON_CXX_V69) $(HEXAGON_CXX_FLAGS_V69) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@

$(WORK)/hexagon-v69/ops/%.o: $(OP_SRC_DIR)/%.cpp | $(WORK)/hexagon-v69
	$(HEXAGON_CXX_V69) $(HEXAGON_CXX_FLAGS_V69) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@

$(WORK)/hexagon-v69/ops/%.o: $(OP_SRC_DIR)/v69_asm/%.S | $(WORK)/hexagon-v69
	$(HEXAGON_CXX_V69) $(HEXAGON_CXX_FLAGS_V69) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@

$(WORK)/hexagon-v69/$(LIBRARY_NAME): $(hexagon-v69_objs) | $(HFILES)
	$(HEXAGON_CXX_V69) -fPIC -std=c++17 -g -shared -o $@ $^ $(HEX_LDFLAGS)

# v73
$(WORK)/hexagon-v73/%.o: $(SRC_DIR)/%.cpp | $(WORK)/hexagon-v73
	$(HEXAGON_CXX_V73) $(HEXAGON_CXX_FLAGS_V73) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@

$(WORK)/hexagon-v73/ops/%.o: $(OP_SRC_DIR)/%.cpp | $(WORK)/hexagon-v73
	$(HEXAGON_CXX_V73) $(HEXAGON_CXX_FLAGS_V73) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@

$(WORK)/hexagon-v73/ops/%.o: $(OP_SRC_DIR)/v73_asm/%.S | $(WORK)/hexagon-v73
	$(HEXAGON_CXX_V73) $(HEXAGON_CXX_FLAGS_V73) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@

$(WORK)/hexagon-v73/$(LIBRARY_NAME): $(hexagon-v73_objs) | $(HFILES)
	$(HEXAGON_CXX_V73) -fPIC -std=c++17 -g -shared -o $@ $^ $(HEX_LDFLAGS)

# v75
$(WORK)/hexagon-v75/%.o: $(SRC_DIR)/%.cpp | $(WORK)/hexagon-v75
	$(HEXAGON_CXX_V75) $(HEXAGON_CXX_FLAGS_V75) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@

$(WORK)/hexagon-v75/ops/%.o: $(OP_SRC_DIR)/%.cpp | $(WORK)/hexagon-v75
	$(HEXAGON_CXX_V75) $(HEXAGON_CXX_FLAGS_V75) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@

$(WORK)/hexagon-v75/ops/%.o: $(OP_SRC_DIR)/v75_asm/%.S | $(WORK)/hexagon-v75
	$(HEXAGON_CXX_V75) $(HEXAGON_CXX_FLAGS_V75) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@

$(WORK)/hexagon-v75/$(LIBRARY_NAME): $(hexagon-v75_objs) | $(HFILES)
	$(HEXAGON_CXX_V75) -fPIC -std=c++17 -g -shared -o $@ $^ $(HEX_LDFLAGS)

# v79
$(WORK)/hexagon-v79/%.o: $(SRC_DIR)/%.cpp | $(WORK)/hexagon-v79
	$(HEXAGON_CXX_V79) $(HEXAGON_CXX_FLAGS_V79) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@

$(WORK)/hexagon-v79/ops/%.o: $(OP_SRC_DIR)/%.cpp | $(WORK)/hexagon-v79
	$(HEXAGON_CXX_V79) $(HEXAGON_CXX_FLAGS_V79) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@

$(WORK)/hexagon-v79/ops/%.o: $(OP_SRC_DIR)/v79_asm/%.S | $(WORK)/hexagon-v79
	$(HEXAGON_CXX_V79) $(HEXAGON_CXX_FLAGS_V79) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@

$(WORK)/hexagon-v79/$(LIBRARY_NAME): $(hexagon-v79_objs) | $(HFILES)
	$(HEXAGON_CXX_V79) -fPIC -std=c++17 -g -shared -o $@ $^ $(HEX_LDFLAGS)


# aarch64
$(WORK)/aarch64-android/%.o: $(SRC_DIR)/%.cpp | $(WORK)/aarch64-android
	$(AARCH64_CXX) $(AARCH64_CXX_FLAGS) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@

$(WORK)/aarch64-android/ops/%.o: $(OP_SRC_DIR)/%.cpp | $(WORK)/aarch64-android
	$(AARCH64_CXX) $(AARCH64_CXX_FLAGS) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@

$(WORK)/aarch64-android/ops/%.o: $(OP_SRC_DIR)/android_asm/%.S | $(WORK)/aarch64-android
	$(AARCH64_CXX) $(AARCH64_CXX_FLAGS) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@

$(WORK)/aarch64-android/$(LIBRARY_NAME): $(aarch64-android_objs) | $(HFILES)
	$(AARCH64_CXX) -fPIC -std=c++17 -g -shared -o $@ $^ $(AARCH64_LDFLAGS)

clean:
	-rm -rf $(WORK)

.PHONY: all clean
@ -0,0 +1,88 @@
<?xml version="1.0" encoding="UTF-8"?>
<OpDefCollection PackageName="GgmlOpPackage" Domain="ggml" Version="1.0">
    <OpDefList>
        <OpDef>
            <Name>GgmlMulMat</Name>
            <Description>
                <Content>
                    GGML MulMat operator
                </Content>
            </Description>

            <Input>
                <Name>in[0]</Name>
                <Description>
                    <Content>src0</Content>
                </Description>
                <Mandatory>true</Mandatory>
                <Datatype>BACKEND_SPECIFIC</Datatype>
                <Shape>
                    <Rank>4D</Rank>
                    <Layout>NHWC</Layout>
                    <Text>[N, C, H, W]</Text>
                </Shape>
            </Input>

            <Input>
                <Name>in[1]</Name>
                <Description>
                    <Content>src1</Content>
                </Description>
                <Mandatory>true</Mandatory>
                <Datatype>BACKEND_SPECIFIC</Datatype>
                <Shape>
                    <Rank>4D</Rank>
                    <Layout>NHWC</Layout>
                    <Text>[N, C, H, W]</Text>
                </Shape>
            </Input>

            <Output>
                <Name>out[0]</Name>
                <Description>
                    <Content>dst</Content>
                </Description>
                <Mandatory>true</Mandatory>
                <Datatype>BACKEND_SPECIFIC</Datatype>
                <Shape>
                    <Rank>4D</Rank>
                    <Text>[N, C, H, W]</Text>
                </Shape>
            </Output>

            <!--This Op is implemented on these Backends-->
            <SupportedBackend>HTP</SupportedBackend>
        </OpDef>

    </OpDefList>

    <SupplementalOpDefList Backend="HTP">
        <SupportedOps>
            <OpName>GgmlMulMat</OpName>
        </SupportedOps>

        <!--ggml-mul-mat-->
        <SupplementalOpDef>
            <Name>GgmlMulMat</Name>

            <Input>
                <Name>in[0]</Name>
                <Datatype>QNN_DATATYPE_FLOAT_16</Datatype>
                <Datatype>QNN_DATATYPE_FLOAT_32</Datatype>
            </Input>
            <Input>
                <Name>in[1]</Name>
                <Datatype>QNN_DATATYPE_FLOAT_16</Datatype>
                <Datatype>QNN_DATATYPE_FLOAT_32</Datatype>
            </Input>

            <Output>
                <Name>out[0]</Name>
                <Datatype>QNN_DATATYPE_FLOAT_16</Datatype>
                <Datatype>QNN_DATATYPE_FLOAT_32</Datatype>
            </Output>
        </SupplementalOpDef>

    </SupplementalOpDefList>

</OpDefCollection>
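Editor's note: the supplemental definition above narrows GgmlMulMat to FP16/FP32 on HTP. A minimal host-side gate mirroring that contract could look like the sketch below (the helper name is an assumption, not part of this commit):

#include "ggml.h"

// Hypothetical helper mirroring the XML contract above: the custom GgmlMulMat
// op only accepts FP16/FP32 tensors, anything else must take the generic
// QNN_OP_MAT_MUL fallback path.
static bool ggml_mul_mat_custom_op_supported(const ggml_tensor * src0, const ggml_tensor * src1) {
    auto is_fp = [](const ggml_tensor * t) {
        return t->type == GGML_TYPE_F32 || t->type == GGML_TYPE_F16;
    };
    return is_fp(src0) && is_fp(src1);
}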
@ -0,0 +1,274 @@
//==============================================================================
// Auto Generated Code for GgmlOpPackage
//==============================================================================

#include "HTP/QnnHtpCommon.h"
#include "HTP/core/constraints.h"
#include "HTP/core/op_package_feature_support.h"
#include "HTP/core/op_register_ext.h"
#include "HTP/core/optimize.h"
#include "HTP/core/simple_reg.h"
#include "HTP/core/unique_types.h"
#include "QnnOpPackage.h"
#include "QnnSdkBuildId.h"

DEFINE_UNIQ_TY()
BEGIN_PKG_OPS_OPTS_LIST()

/** Note that the order of declarations given here defines the order in which ops and graph optimizations are
 * registered to the HTP Core.
 * Append the latest OpName at the bottom
 */
DECLARE_PKG_OPS_OPTS_LIST(PKG_GgmlMulMat)

END_PKG_OPS_OPTS_LIST()

// op package info
static constexpr auto sg_packageName = THIS_PKG_NAME_STR;  // package name passed in as compile flag

static std::array<const char*, 1> sg_opNames{{"GgmlMulMat"}};

static Qnn_ApiVersion_t sg_sdkApiVersion = QNN_HTP_API_VERSION_INIT;
static QnnOpPackage_Info_t sg_packageInfo = QNN_OP_PACKAGE_INFO_INIT;

// global data
static QnnOpPackage_GlobalInfrastructure_t sg_globalInfra =
    nullptr;  // global infrastructure not in use for now
static bool sg_packageInitialized = false;

/*
 * user provided logging call back function
 * currently only supported on linux x86-64 and nonrpc versions
 * typedef void (*QnnLog_Callback_t)(const char* fmt,
 *                                   QnnLog_Level_t level,
 *                                   uint64_t timestamp,
 *                                   va_list args);
 * usage: if(sg_logInitialized && level <= sg_maxLogLevel)
 *            sg_logCallback(fmt, level, timestamp, args);
 *
 * for cross rpc versions, skel side user provided logging call back function
 * can be defined as part of op packages. maximal log level sg_maxLogLevel
 * can be set by Qnn_ErrorHandle_t GgmlOpPackageLogSetLevel(QnnLog_Level_t maxLogLevel)
 */
/*
 * for alternative logging method provided by HTP core, please refer to log.h
 */
static QnnLog_Callback_t sg_logCallback =
    nullptr;  // user provided call back function pointer for logging
static QnnLog_Level_t sg_maxLogLevel =
    (QnnLog_Level_t)0;  // maximal log level used in user provided logging
static bool sg_logInitialized =
    false;  // tracks whether user provided logging method has been initialized


/*
 * op initialization
 * needs to be global in the package
 * one initialization per package before any op definitions
 * syntax: INIT_PACKAGE_OP_DEF()
 */
INIT_PACKAGE_OP_DEF()

/*
 * optimization initialization
 * needs to be global in the package
 * one initialization per package before any optimization definitions
 * syntax: INIT_PACKAGE_OPTIMIZATION_DEF()
 */
INIT_PACKAGE_OPTIMIZATION_DEF()

/*
 * op parameter order initialization
 * needs to be global in the package
 * one initialization per package before any op parameter order definitions
 * syntax: INIT_PACKAGE_PARAM_ORDER_DEF()
 */
INIT_PACKAGE_PARAM_ORDER_DEF()

/*
 * axis parameter name list
 * optional
 * needs to be global in the package
 * one list per package
 * for listing axis parameter names passed into Qnn_AddNode API
 * HTP backend auto-adjusts values in axis parameters based on HTP backfilling
 * note: HTP backend backfills tensor dimensions to 4 dimensions
 * syntax: LIST_PACKAGE_AXIS_PARAMS(...)
 * e.g. LIST_PACKAGE_AXIS_PARAMS("Axis", "AXIS", "axis")
 */
// LIST_PACKAGE_AXIS_PARAMS()

/*
 * per-channel quantized op name list
 * optional
 * needs to be global in the package
 * one list per package
 * for listing op names which support per-channel quantization
 * per-axis quantization info of an op is embedded in axisScaleOffsetEncoding
 * inside Qnn_Tensor_t types
 * HTP backend only supports per-channel scale ops
 * i.e. along last dimension, offset is always zero
 * if an op name is marked as having per-channel scale support, and in
 * QNN_AddNode, at least one input, parameter, or output has
 * QNN_QUANTIZATION_ENCODING_AXIS_SCALE_OFFSET type:
 * then:
 *   HTP backend will pass to op implementation function the following:
 *     output(s), input(s), parameter(s),
 *     outputPerChannelScale(s), inputPerChannelScale(s), paramPerChannelScale(s)
 *
 * optimization rules can be used to remove extra perChannelScale tensors
 *
 * syntax: LIST_PACKAGE_PER_CHANNEL_QUANTIZED_OPS(...)
 * e.g. LIST_PACKAGE_PER_CHANNEL_QUANTIZED_OPS(sg_op1Name, sg_op2Name)
 */

// LIST_PACKAGE_PER_CHANNEL_QUANTIZED_OPS()

/*
 * Declare and define the special initialize function for the HTP backend to load
 */
INIT_PKG_CORE_INIT_FUNC()

/* op package APIs */

Qnn_ErrorHandle_t GgmlOpPackageInit(QnnOpPackage_GlobalInfrastructure_t infrastructure) {
    if (sg_packageInitialized) return QNN_OP_PACKAGE_ERROR_LIBRARY_ALREADY_INITIALIZED;

    /*
     * op parameter order registration
     * registers all defined op parameter orders in the package
     * syntax: REGISTER_PACKAGE_PARAM_ORDERS()
     */
    REGISTER_PACKAGE_PARAM_ORDERS()

    /*
     * op axis parameter name registration
     * registers all axis parameter names in the package
     * used with LIST_PACKAGE_AXIS_PARAMS(...)
     * syntax: REGISTER_PACKAGE_AXIS_PARAMS()
     */
    REGISTER_PACKAGE_AXIS_PARAMS()

    /*
     * per-channel scale op name registration
     * registers all per-channel scale op names in the package
     * used with LIST_PACKAGE_PER_CHANNEL_QUANTIZED_OPS(...)
     * syntax: REGISTER_PACKAGE_PER_CHANNEL_QUANTIZED_OPS()
     */
    REGISTER_PACKAGE_PER_CHANNEL_QUANTIZED_OPS()

    sg_globalInfra = infrastructure;
    sg_packageInitialized = true;
    return QNN_SUCCESS;
}

Qnn_ErrorHandle_t GgmlOpPackageGetInfo(const QnnOpPackage_Info_t** info) {
    if (!sg_packageInitialized) return QNN_OP_PACKAGE_ERROR_LIBRARY_NOT_INITIALIZED;
    if (!info) return QNN_OP_PACKAGE_ERROR_INVALID_INFO;

    sg_packageInfo = QNN_OP_PACKAGE_INFO_INIT;
    sg_packageInfo.packageName = sg_packageName;
    sg_packageInfo.operationNames = sg_opNames.data();
    sg_packageInfo.numOperations = sg_opNames.size();
    sg_packageInfo.sdkBuildId = QNN_SDK_BUILD_ID;
    sg_packageInfo.sdkApiVersion = &sg_sdkApiVersion;

    *info = &sg_packageInfo;
    return QNN_SUCCESS;
}

Qnn_ErrorHandle_t GgmlOpPackageLogInitialize(QnnLog_Callback_t callback, QnnLog_Level_t maxLogLevel) {
    if (sg_logInitialized) return QNN_OP_PACKAGE_ERROR_LIBRARY_ALREADY_INITIALIZED;
    if (!callback) return QNN_LOG_ERROR_INVALID_ARGUMENT;
    if (maxLogLevel < QNN_LOG_LEVEL_ERROR) return QNN_LOG_ERROR_INVALID_ARGUMENT;
    sg_logCallback = callback;
    sg_maxLogLevel = maxLogLevel;
    sg_logInitialized = true;
    return QNN_SUCCESS;
}

Qnn_ErrorHandle_t GgmlOpPackageLogSetLevel(QnnLog_Level_t maxLogLevel) {
    if (maxLogLevel < QNN_LOG_LEVEL_ERROR) return QNN_LOG_ERROR_INVALID_ARGUMENT;
    sg_maxLogLevel = maxLogLevel;
    return QNN_SUCCESS;
}

Qnn_ErrorHandle_t GgmlOpPackageLogTerminate() {
    if (!sg_logInitialized) return QNN_OP_PACKAGE_ERROR_LIBRARY_NOT_INITIALIZED;
    sg_logCallback = nullptr;
    sg_maxLogLevel = (QnnLog_Level_t)0;
    sg_logInitialized = false;
    return QNN_SUCCESS;
}

Qnn_ErrorHandle_t GgmlOpPackageValidateOpConfig(Qnn_OpConfig_t opConfig) {
    if (std::string(sg_packageName) != opConfig.v1.packageName) {
        return QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE;
    }

    /* auto-generated validation code below
     * Check if op config type matches any registered ops
     * If a match is found, check number of inputs, outputs and params
     */
    if (std::string(opConfig.v1.typeName) == "GgmlMulMat") {
        if (opConfig.v1.numOfParams != 0 || opConfig.v1.numOfInputs != 2 || opConfig.v1.numOfOutputs != 1) {
            return QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE;
        }
    } else {
        return QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE;
    }

    /*
     * additional validation code here
     */

    return QNN_SUCCESS;
}

/* The following functions are not called by the HTP backend for now, so no
 * auto-generated implementations are created. Users should see the example for full function signatures.
 * (version 1.3.0) Qnn_ErrorHandle_t GgmlOpPackageCreateKernels (QnnOpPackage_GraphInfrastructure_t
 *                 graphInfrastructure, QnnOpPackage_Node_t node, QnnOpPackage_Kernel_t** kernels, uint32_t*
 *                 numKernels)
 * (version 1.3.0) Qnn_ErrorHandle_t GgmlOpPackageFreeKernels (QnnOpPackage_Kernel_t* kernels)
 *
 * (version 1.4.0) Qnn_ErrorHandle_t GgmlOpPackageCreateOpImpl (QnnOpPackage_GraphInfrastructure_t
 *                 graphInfrastructure, QnnOpPackage_Node_t node, QnnOpPackage_OpImpl_t* opImpl)
 * (version 1.4.0) Qnn_ErrorHandle_t GgmlOpPackageFreeOpImpl (QnnOpPackage_OpImpl_t opImpl)
 */

Qnn_ErrorHandle_t GgmlOpPackageTerminate() {
    if (!sg_packageInitialized) return QNN_OP_PACKAGE_ERROR_LIBRARY_NOT_INITIALIZED;

    sg_globalInfra = nullptr;
    sg_packageInitialized = false;
    return QNN_SUCCESS;
}

#ifdef __cplusplus
extern "C" {
#endif

/* latest version */
Qnn_ErrorHandle_t GgmlOpPackageInterfaceProvider(QnnOpPackage_Interface_t* interface) {
    if (!interface) return QNN_OP_PACKAGE_ERROR_INVALID_ARGUMENT;
    interface->interfaceVersion = {1, 4, 0};
    interface->v1_4.init = GgmlOpPackageInit;
    interface->v1_4.terminate = GgmlOpPackageTerminate;
    interface->v1_4.getInfo = GgmlOpPackageGetInfo;
    interface->v1_4.validateOpConfig = GgmlOpPackageValidateOpConfig;
    interface->v1_4.createOpImpl = nullptr;
    interface->v1_4.freeOpImpl = nullptr;
    interface->v1_4.logInitialize = GgmlOpPackageLogInitialize;
    interface->v1_4.logSetLevel = GgmlOpPackageLogSetLevel;
    interface->v1_4.logTerminate = GgmlOpPackageLogTerminate;
    return QNN_SUCCESS;
}

#ifdef __cplusplus
}
#endif

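Editor's note: a host that consumes this package resolves GgmlOpPackageInterfaceProvider from the shared library and works through the v1.4 function table. A minimal sketch using POSIX dlopen/dlsym (error handling trimmed; the call site is an assumption, not code from this commit):

#include <dlfcn.h>

#include "QnnOpPackage.h"

// Editor's sketch: fetch the package info through the interface exported above.
typedef Qnn_ErrorHandle_t (*interface_provider_fn)(QnnOpPackage_Interface_t *);

static const QnnOpPackage_Info_t * query_package_info(const char * lib_path) {
    void * handle = dlopen(lib_path, RTLD_NOW | RTLD_LOCAL);
    if (!handle) return nullptr;

    auto provider = (interface_provider_fn) dlsym(handle, "GgmlOpPackageInterfaceProvider");
    if (!provider) return nullptr;

    QnnOpPackage_Interface_t iface = {};
    if (provider(&iface) != QNN_SUCCESS) return nullptr;

    const QnnOpPackage_Info_t * info = nullptr;
    iface.v1_4.init(nullptr);   // global infrastructure is unused, see GgmlOpPackageInit above
    iface.v1_4.getInfo(&info);  // fills package name, op list and SDK build id
    return info;
}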
@ -0,0 +1,213 @@
//==============================================================================
// Auto Generated Code for GgmlOpPackage
//==============================================================================

#include "HTP/core/constraints.h"
#include "HTP/core/op_package_feature_support.h"
#include "HTP/core/op_register_ext.h"
#include "HTP/core/optimize.h"
#include "HTP/core/simple_reg.h"
#include "QnnOpPackage.h"

BEGIN_PKG_OP_DEFINITION(PKG_GgmlMulMat);

// op execute function declarations
template <typename TensorType>
GraphStatus ggmlmulmatImpl(TensorType & out_0, const TensorType & in_0, const TensorType & in_1);

// forward declaration of sample cost function
static float ggmlmulmatCostFunc(const Op * op);

/*
 * method 1 for defining op, using default cost value (i.e. GLACIAL) and default flag (Flags::RESOURCE_HVX)
 * syntax: DEF_PACKAGE_OP(F,OP)
 * e.g. DEF_PACKAGE_OP((ggmlmulmatImpl<Tensor>), "GgmlMulMat")
 */
DEF_PACKAGE_OP((ggmlmulmatImpl<Tensor>), "GgmlMulMat")

/*
 * method 2 for defining op with specified cost value (one of GLACIAL, SNAIL, FAST, FREE)
 * and provided flags
 * syntax: DEF_PACKAGE_OP_AND_COST_AND_FLAGS(F,OP,COST,...)
 * can use zero or more flags, FLAG options are IS_CONST, INHIBIT_CONST_PROP,
 * RESOURCE_HVX, RESOURCE_HMX(not supported in external op packages)
 * e.g. DEF_PACKAGE_OP_AND_COST_AND_FLAGS((ggmlmulmatImpl<PlainFloatTensor>), "GgmlMulMat", SNAIL)
 */

/*
 * method 3 for defining op with cost function pointer and provided flags
 * cost function pointer type: typedef float (*cost_function) (const Op * op);
 * syntax: DEF_PACKAGE_OP_AND_COST_F_AND_FLAGS(F,OP,COST_F,...)
 * e.g. DEF_PACKAGE_OP_AND_COST_F_AND_FLAGS((ggmlmulmatImpl<PlainFloatTensor>),
 *      "GgmlMulMat", ggmlmulmatCostFunc, Flags::RESOURCE_HVX)
 */

/*
 * optimization definitions
 * need to be global in the package
 * one definition per optimization
 * syntax: DEF_PACKAGE_OPTIMIZATION(PRIORITY,MATCHCODE,CONSTRAINTCODE,REPLACECODE)
 * PRIORITY predefined values include EARLY(2000), MIDDLE(3000), LATE(4000)
 * HTP core provides some replacement functions for op package to use
 * for more information about optimization rules, please refer to HTP core documentations
 */

/*
 * op parameter order definitions
 * need to be global in the package
 * one definition per op, and this is optional
 * syntax: DEF_PACKAGE_PARAM_ORDER(OP,PARAM1,MANDATORY1,DEFAULT1,PARAM2,MANDATORY2,DEFAULT2...)
 * one or more parameters can be specified for each op
 * order of parameters listed determines the order of parameters passed into op execution functions
 * if an op does not have a parameter order definition, parameter order passed into Qnn_addNode
 *   will be passed into op execution functions
 * if an op has a parameter order definition, any parameter passed into Qnn_addNode with unlisted
 *   name will be abandoned
 * if two or more op packages with the same package name will be registered, they cannot list
 *   conflicting parameter orders
 * PARAM refers to parameter name as a string literal
 * MANDATORY refers to whether this parameter is required to be provided at Qnn_addNode
 * DEFAULT is used when MANDATORY is false
 *   if provided as Qnn_Param_t*,
 *     DEFAULT will be used for graph construction when this parameter is not provided at
 *     Qnn_addNode
 *   if provided as nullptr,
 *     graph construction will skip this parameter when this parameter is not provided at
 *     Qnn_addNode
 */

namespace {

constexpr const size_t kBytesPerVector = sizeof(HVX_Vector);  // 128 for v73
constexpr const size_t kFloatsPerVector = kBytesPerVector / sizeof(float);
constexpr const size_t kAlignMask = kBytesPerVector - 1;

inline size_t unaligned_bytes(const void * addr) {
    return ((size_t) addr) & kAlignMask;
}

inline bool is_addr_aligned(void * addr) {
    return unaligned_bytes(addr) == 0;
}

inline float vec_dot_product_f32(const float * src0, const float * src1, size_t count) {
    HVX_Vector * iptr0 = ((HVX_Vector *) src0);
    HVX_Vector * iptr0_end = ((HVX_Vector *) src0) + (count / kFloatsPerVector);
    HVX_Vector * iptr1 = ((HVX_Vector *) src1);
    HVX_Vector prev0 = *iptr0++;
    HVX_Vector prev1 = *iptr1++;
    HVX_Vector sum = Q6_V_vzero();

    // TODO: prefetch?
    while (iptr0 < iptr0_end) {
        HVX_Vector curr0 = *iptr0++;
        HVX_Vector curr1 = *iptr1++;
        HVX_Vector s0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0);
        HVX_Vector s1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1);
        sum = Q6_Vqf32_vadd_Vqf32Vqf32(Q6_Vqf32_vmpy_VsfVsf(s0, s1), sum);
        prev0 = curr0;
        prev1 = curr1;
    }

    if ((iptr0_end - ((HVX_Vector *) src0)) > 0) {
        // handle the last vector
        // see also: https://github.com/UbiquitousLearning/mllm/blob/babf4410352ce8730824c87699c025a0d4ce3a6f/src/backends/qnn/LLaMAOpPackageHtp/LLaMAPackage/src/ops/LLaMAMul.cpp#L147
        HVX_Vector curr0 = is_addr_aligned(iptr0) ? prev0 : *iptr0++;
        HVX_Vector curr1 = is_addr_aligned(iptr1) ? prev1 : *iptr1++;
        HVX_Vector s0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0);
        HVX_Vector s1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1);
        sum = Q6_Vqf32_vadd_Vqf32Vqf32(Q6_Vqf32_vmpy_VsfVsf(s0, s1), sum);
        prev0 = curr0;
        prev1 = curr1;
    }

    const size_t leftover = count % kFloatsPerVector;
    const size_t leftover_bytes = leftover * sizeof(float);
    if (leftover > 0) {
        // handle the leftover elements
        HVX_Vector curr0 = (leftover_bytes + unaligned_bytes(iptr0) > kBytesPerVector) ? *iptr0 : prev0;
        curr0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0);

        HVX_Vector curr1 = (leftover_bytes + unaligned_bytes(iptr1) > kBytesPerVector) ? *iptr1 : prev1;
        curr1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1);

        sum = Q6_Vqf32_vadd_Vqf32Vqf32(
            Q6_V_valign_VVR(Q6_Vqf32_vmpy_VsfVsf(curr0, curr1), Q6_V_vzero(), leftover_bytes), sum);
    }

    // TODO: do we have a better way to do the reduction?
    for (size_t i = kFloatsPerVector / 2; i > 0; i /= 2) {
        sum = Q6_Vqf32_vadd_Vqf32Vqf32(sum, Q6_V_vror_VR(sum, i * sizeof(float)));
    }

    float result;
    q6op_vstu_variable_ARV(&result, sizeof(float), Q6_Vsf_equals_Vqf32(sum));
    return result;
}
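// Editor's sketch (not part of the generated package): a scalar reference
// equivalent of vec_dot_product_f32 above, handy for validating the HVX path
// on the x86 emulator build.
inline float vec_dot_product_f32_reference(const float * src0, const float * src1, size_t count) {
    float sum = 0.0f;
    for (size_t i = 0; i < count; ++i) {
        sum += src0[i] * src1[i];
    }
    return sum;
}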

template <typename TensorType>
inline GraphStatus mul_mat_2d_f32(TensorType & out_0, const TensorType & in_0, const TensorType & in_1) {
    // TODO: handle strides?
    if (in_1.dim(1) != in_0.dim(1)) {
        return GraphStatus::ErrorDimensions;
    }

    size_t dims[4] = { in_1.dim(0), in_0.dim(0) };
    out_0.set_dims(dims);

    auto in0_ptr = (float *) in_0.raw_data_const();
    auto in1_ptr = (float *) in_1.raw_data_const();
    auto out_ptr = (float *) out_0.raw_data();

    for (size_t i = 0; i < dims[0]; i++) {
        // TODO: prefetch?
        auto * in1_row = in1_ptr + i * in_1.dim(1);
        auto * out_row = out_ptr + i * dims[1];
        for (size_t j = 0; j < dims[1]; j++) {
            *out_row++ = vec_dot_product_f32(in0_ptr + j * in_0.dim(1), in1_row, in_0.dim(1));
        }
    }

    return GraphStatus::Success;
}
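// Editor's note (comment added for clarity, not in the generated template):
// with row-major rows of length dim(1), the loops above compute
//   out[i][j] = dot(in_1.row(i), in_0.row(j)),
// i.e. dst = src1 * src0^T, which appears to match ggml's GGML_OP_MUL_MAT
// convention of treating src0 as the transposed operand.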

}  // namespace

/* execute functions for ops */

template <typename TensorType>
GraphStatus ggmlmulmatImpl(TensorType & out_0, const TensorType & in_0, const TensorType & in_1) {
    if (!in_0.raw_data_const() || !in_1.raw_data_const() || !out_0.raw_data()) {
        return GraphStatus::ErrorBadInput;
    }

    if (in_0.rank() != in_1.rank()) {
        return GraphStatus::ErrorRank;
    }

    auto rank = in_0.rank();
    switch (rank) {
        case 4:
        case 3:
            // TODO: add implementation
            return GraphStatus::ErrorUnsupported;
        case 2:
            return mul_mat_2d_f32(out_0, in_0, in_1);
    }

    return GraphStatus::ErrorRank;
}

__attribute__((unused)) static float ggmlmulmatCostFunc(const Op * op) {
    /*
     * add code here
     */

    float cost = 0.0;  // add cost computation here
    return cost;
}

/* At the bottom of the op file, call END_PKG_OP_DEFINITION(<name>),
 * where <name> is the same name passed to BEGIN_PKG_OP_DEFINITION
 */
END_PKG_OP_DEFINITION(PKG_GgmlMulMat);

@ -3,6 +3,7 @@
#include <memory>
#include <vector>

#include "common.hpp"
#include "ggml-qnn.h"
#include "qnn-types.hpp"
#include "tensor.hpp"

@ -60,7 +61,7 @@ class ggml_qnn_op_config {
     * @param graph_handle
     * @return true if tensors and nodes are successfully created, false otherwise.
     */
    virtual bool initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle) = 0;
    virtual bool initialize_op_nodes(backend_index_type device, Qnn_GraphHandle_t graph_handle) = 0;

    /**
     * @brief Pure virtual function to retrieve the input tensors.

@ -224,18 +224,23 @@ static_assert(kOpCaps[GGML_OP_COUNT + GGML_UNARY_OP_GELU].qnn_op_name,
static_assert(std::size(kOpCaps) == (GGML_OP_COUNT + GGML_UNARY_OP_COUNT),
              "GGML_OP_COUNT does not match the size of the kOpCaps table");

std::shared_ptr<qnn::ggml_qnn_op_config> mat_mul_op_constructor(const ggml_tensor * op,
                                                                const std::string & instance_name,
                                                                std::shared_ptr<qnn::qnn_instance> qnn_instance) {
    GGML_UNUSED(op);
std::shared_ptr<qnn::ggml_qnn_op_config> mat_mul_op_constructor(const ggml_tensor * op,
                                                                const std::string & instance_name,
                                                                qnn::qnn_instance_ptr qnn_instance) {
    if (qnn_instance->has_custom_op_package() && ggml_n_dims(op) == 2) {
        QNN_LOG_DEBUG("create GgmlMulMat, name %s, use GgmlOpPackage\n", instance_name.c_str());
        return std::make_shared<qnn::ggml_qnn_single_op_config>(instance_name, "GgmlOpPackage", "GgmlMulMat",
                                                                qnn_instance);
    }

    QNN_LOG_DEBUG("create QNN_OP_MAT_MUL, name %s\n", instance_name.c_str());
    return std::make_shared<qnn::ggml_qnn_matmul_op_config>(instance_name, qnn_instance);
}

template <size_t _op>
std::shared_ptr<qnn::ggml_qnn_op_config> generic_op_constructor(const ggml_tensor * op,
                                                                const std::string & instance_name,
                                                                std::shared_ptr<qnn::qnn_instance> qnn_instance) {
std::shared_ptr<qnn::ggml_qnn_op_config> generic_op_constructor(const ggml_tensor * op,
                                                                const std::string & instance_name,
                                                                qnn::qnn_instance_ptr qnn_instance) {
    GGML_UNUSED(op);
    static_assert(_op < std::size(kOpCaps));
    static_assert(kOpCaps[_op].qnn_op_name != nullptr);

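Editor's note: this hunk is where the custom HTP op package gets wired in. A 2D mul_mat is dispatched to the GgmlMulMat custom op when the package is available, everything else keeps using the built-in QNN_OP_MAT_MUL graph. A hypothetical call site for the constructor (names are assumptions, not taken from the repository):

// Editor's sketch:
auto op_config = mat_mul_op_constructor(dst, "mat_mul_0", instance);
if (op_config && !op_config->initialize_op_nodes(device, graph_handle)) {
    QNN_LOG_ERROR("failed to initialize op nodes for %s\n", "mat_mul_0");
}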
@ -251,8 +256,9 @@ void add_type_parameters(std::shared_ptr<qnn::ggml_qnn_op_config_base> op, const
}

template <size_t _op, typename _ggml_op_param_type, typename _qnn_op_type_name>
std::shared_ptr<qnn::ggml_qnn_op_config> op_constructor_with_type_param(
    const ggml_tensor * op, const std::string & instance_name, std::shared_ptr<qnn::qnn_instance> qnn_instance) {
std::shared_ptr<qnn::ggml_qnn_op_config> op_constructor_with_type_param(const ggml_tensor * op,
                                                                        const std::string & instance_name,
                                                                        qnn::qnn_instance_ptr qnn_instance) {
    static_assert(std::is_base_of<qnn::ggml_qnn_op_config_base, _qnn_op_type_name>::value);
    static_assert(_op < std::size(kOpCaps));

@ -48,7 +48,7 @@ void ggml_qnn_op_config_base::add_scalar_param(const std::string & name, const Q

bool ggml_qnn_op_config_base::add_tensor_param(const std::string & name, const qnn_dimension_array_t & dimensions,
                                               int rank, const uint8_t * data, const Qnn_DataType_t data_type,
                                               QNNBackend device, Qnn_GraphHandle_t graph_handle) {
                                               backend_index_type device, Qnn_GraphHandle_t graph_handle) {
    std::string tensor_name = _name + name + std::to_string(_tensor_parameters.size());
    auto param_tensor = std::make_shared<ggml_qnn_tensor>(ggml_qnn_tensor::PARAMETER, tensor_name, dimensions,
                                                          data_type, rank, device, graph_handle, _qnn_instance);

@ -131,7 +131,8 @@ bool ggml_qnn_op_config_base::add_op_to_graph(Qnn_GraphHandle_t graph_handle) {
    auto qnn_interface = _qnn_instance->get_qnn_interface();
    auto error = qnn_interface->qnn_graph_add_node(graph_handle, get_op_config());
    if (error != QNN_SUCCESS) {
        QNN_LOG_ERROR("[%s]qnn_graph_add_node.error: %s\n", _name.c_str(), get_qnn_error_string(error));
        QNN_LOG_ERROR("[%s][%s][%s]qnn_graph_add_node.error: %s\n", _name.c_str(), _package_name.c_str(),
                      _op_type.c_str(), get_qnn_error_string(error));
        return false;
    }

@ -183,13 +184,13 @@ Qnn_OpConfig_t ggml_qnn_op_config_base::get_op_config() {
    return config;
}

bool ggml_qnn_single_op_config::initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle) {
bool ggml_qnn_single_op_config::initialize_op_nodes(backend_index_type device, Qnn_GraphHandle_t graph_handle) {
    GGML_UNUSED(device);
    GGML_UNUSED(graph_handle);
    return true;
}

bool ggml_qnn_rmsnorm_op_config::initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle) {
bool ggml_qnn_rmsnorm_op_config::initialize_op_nodes(backend_index_type device, Qnn_GraphHandle_t graph_handle) {
    constexpr const uint32_t kAxes[] = { 0 };
    add_tensor_param(QNN_OP_RMS_NORM_PARAM_AXES, { 1 }, 1, reinterpret_cast<const uint8_t *>(kAxes),
                     QNN_DATATYPE_UINT_32, device, graph_handle);

@ -220,7 +221,7 @@ bool ggml_qnn_aggregate_op_config::bind_output_tensors(const ggml_tensor_array_t
    return qnn::bind_tensors(tensor_outputs, _tensor_outputs);
}

bool ggml_qnn_matmul_op_config::initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle) {
bool ggml_qnn_matmul_op_config::initialize_op_nodes(backend_index_type device, Qnn_GraphHandle_t graph_handle) {
    GGML_ASSERT(_tensor_inputs.size() == 2);
    GGML_ASSERT(_tensor_outputs.size() == 1);

@ -251,8 +252,9 @@ bool ggml_qnn_matmul_op_config::initialize_op_nodes(QNNBackend device, Qnn_Graph
    return true;
}

qnn_tensor_ptr_t ggml_qnn_matmul_op_config::create_gather_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle,
                                                                const int rank, qnn_tensor_ptr_t tensor_input,
qnn_tensor_ptr_t ggml_qnn_matmul_op_config::create_gather_nodes(backend_index_type device,
                                                                Qnn_GraphHandle_t graph_handle, const int rank,
                                                                qnn_tensor_ptr_t tensor_input,
                                                                qnn_dimension_array_t output_dimensions) {
    if (rank <= 2) {
        return tensor_input;

@ -270,7 +272,7 @@ qnn_tensor_ptr_t ggml_qnn_matmul_op_config::create_gather_nodes(QNNBackend devic
    // create concat nodes, to convert tensor shape from [ne03, ne02, n, k] to [ne03 * x, ne02 * y, n, k]
    constexpr const auto create_node =
        [](const std::string & name, const int rank, const int axis, const qnn_dimension_array_t & dimensions,
           qnn_tensor_ptr_t tensor_input, QNNBackend device, Qnn_GraphHandle_t graph_handle,
           qnn_tensor_ptr_t tensor_input, backend_index_type device, Qnn_GraphHandle_t graph_handle,
           qnn_instance_ptr qnn_instance, qnn_tensor_ptr_t & tensor_output) -> qnn_op_config_ptr_t {
        auto gather_out =
            std::make_shared<ggml_qnn_tensor>(ggml_qnn_tensor::INTERMEDIATE, name + "_out", dimensions,

@ -318,8 +320,8 @@ qnn_tensor_ptr_t ggml_qnn_matmul_op_config::create_gather_nodes(QNNBackend devic
    return gather1_out;
}

Qnn_DataType_t ggml_qnn_matmul_op_config::create_input_convert_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle,
                                                                     const int rank,
Qnn_DataType_t ggml_qnn_matmul_op_config::create_input_convert_nodes(backend_index_type device,
                                                                     Qnn_GraphHandle_t graph_handle, const int rank,
                                                                     qnn_tensor_array_t & tensor_inputs) {
    if (device == QNN_BACKEND_GPU) {
        // there's no convert op for GPU, so we should create matmul nodes directly.

@ -352,8 +354,8 @@ Qnn_DataType_t ggml_qnn_matmul_op_config::create_input_convert_nodes(QNNBackend
    return tensor_type;
}

qnn_op_config_ptr_t ggml_qnn_matmul_op_config::create_output_convert_nodes(QNNBackend device,
                                                                           Qnn_GraphHandle_t graph_handle,
qnn_op_config_ptr_t ggml_qnn_matmul_op_config::create_output_convert_nodes(backend_index_type device,
                                                                           Qnn_GraphHandle_t graph_handle,
                                                                           const int rank, Qnn_DataType_t tensor_type,
                                                                           qnn_tensor_array_t & tensor_outputs) {
    GGML_ASSERT(tensor_outputs.size() == 1);

@ -23,7 +23,7 @@ class ggml_qnn_op_config_base : public ggml_qnn_op_config {

    void add_scalar_param(const std::string & name, const Qnn_Scalar_t scalar);
    bool add_tensor_param(const std::string & name, const qnn_dimension_array_t & dimensions, int rank,
                          const uint8_t * data, const Qnn_DataType_t data_type, QNNBackend device,
                          const uint8_t * data, const Qnn_DataType_t data_type, backend_index_type device,
                          Qnn_GraphHandle_t graph_handle);

    void set_input_tensors(qnn::qnn_tensor_array_t & tensor_inputs) override;

@ -65,7 +65,7 @@ class ggml_qnn_single_op_config : public ggml_qnn_op_config_base {
                              const std::string & op_type, qnn_instance_ptr qnn_instance) :
        ggml_qnn_op_config_base(name, package_name, op_type, qnn_instance) {}

    bool initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle) override;
    bool initialize_op_nodes(backend_index_type device, Qnn_GraphHandle_t graph_handle) override;

  private:
    DISABLE_COPY(ggml_qnn_single_op_config);

@ -78,7 +78,7 @@ class ggml_qnn_rmsnorm_op_config : public ggml_qnn_op_config_base {
                               const std::string & op_type, qnn_instance_ptr qnn_instance) :
        ggml_qnn_op_config_base(name, package_name, op_type, qnn_instance) {}

    bool initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle) override;
    bool initialize_op_nodes(backend_index_type device, Qnn_GraphHandle_t graph_handle) override;

  private:
    DISABLE_COPY(ggml_qnn_rmsnorm_op_config);

@ -143,15 +143,16 @@ class ggml_qnn_matmul_op_config : public ggml_qnn_aggregate_op_config {
    ggml_qnn_matmul_op_config(const std::string & name, qnn_instance_ptr qnn_instance) :
        ggml_qnn_aggregate_op_config(name, qnn_instance) {}

    bool initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle) override;
    bool initialize_op_nodes(backend_index_type device, Qnn_GraphHandle_t graph_handle) override;

  private:
    qnn_tensor_ptr_t create_gather_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, const int rank,
                                         qnn_tensor_ptr_t tensor_input, qnn_dimension_array_t output_dimensions);
    Qnn_DataType_t create_input_convert_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, const int rank,
                                              qnn_tensor_array_t & tensor_inputs);
    qnn_op_config_ptr_t create_output_convert_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, const int rank,
                                                    Qnn_DataType_t tensor_type, qnn_tensor_array_t & tensor_outputs);
    qnn_tensor_ptr_t create_gather_nodes(backend_index_type device, Qnn_GraphHandle_t graph_handle, const int rank,
                                         qnn_tensor_ptr_t tensor_input, qnn_dimension_array_t output_dimensions);
    Qnn_DataType_t create_input_convert_nodes(backend_index_type device, Qnn_GraphHandle_t graph_handle, const int rank,
                                              qnn_tensor_array_t & tensor_inputs);
    qnn_op_config_ptr_t create_output_convert_nodes(backend_index_type device, Qnn_GraphHandle_t graph_handle,
                                                    const int rank, Qnn_DataType_t tensor_type,
                                                    qnn_tensor_array_t & tensor_outputs);
    bool create_mat_mul_nodes(qnn_tensor_array_t & tensor_inputs, qnn_tensor_array_t & tensor_outputs);

    DISABLE_COPY(ggml_qnn_matmul_op_config);

@ -3,6 +3,9 @@

#include <filesystem>

#include "common.hpp"
#include "rpc-mem.hpp"

#if defined(__linux__)
# include <unistd.h>
#endif

@ -10,19 +13,23 @@
namespace {

#ifdef _WIN32
constexpr const char * kQnnSystemLibName = "QnnSystem.dll";
constexpr const char * kQnnRpcLibName = "libcdsprpc.dll";
constexpr const char * kQnnCpuLibName = "QnnCpu.dll";
constexpr const char * kQnnGpuLibName = "QnnGpu.dll";
constexpr const char * kQnnNpuLibName = "QnnHtp.dll";
# define PLATFORM_LIB_FILENAME(name) (name ".dll")
#else
constexpr const char * kQnnSystemLibName = "libQnnSystem.so";
constexpr const char * kQnnRpcLibName = "libcdsprpc.so";
constexpr const char * kQnnCpuLibName = "libQnnCpu.so";
constexpr const char * kQnnGpuLibName = "libQnnGpu.so";
constexpr const char * kQnnNpuLibName = "libQnnHtp.so";
# define PLATFORM_LIB_FILENAME(name) ("lib" name ".so")
#endif

#if defined(__aarch64__) || defined(_M_ARM64)  // TODO: check for other platforms
# define PLATFORM_LIB_POSFIX "_aarch64"
#else
# define PLATFORM_LIB_POSFIX "_x64"
#endif

constexpr const char * kQnnSystemLibName = PLATFORM_LIB_FILENAME("QnnSystem");
constexpr const char * kQnnCpuLibName = PLATFORM_LIB_FILENAME("QnnCpu");
constexpr const char * kQnnGpuLibName = PLATFORM_LIB_FILENAME("QnnGpu");
constexpr const char * kQnnNpuLibName = PLATFORM_LIB_FILENAME("QnnHtp");
constexpr const char * kQnnCpuPackageLibName = PLATFORM_LIB_FILENAME("QnnGgmlOpPackage" PLATFORM_LIB_POSFIX);

constexpr const qnn::device_caps kDeviceCaps[] = {
    {
        // https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/CpuOpDefSupplement.html#matmul

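Editor's note: as a quick sanity check on the filename macros introduced above, this is how they expand on an Android aarch64 build (a sketch added for illustration; the asserts assume that target and C++17):

#include <string_view>

static_assert(std::string_view(PLATFORM_LIB_FILENAME("QnnHtp")) == "libQnnHtp.so");
static_assert(std::string_view(PLATFORM_LIB_FILENAME("QnnGgmlOpPackage" PLATFORM_LIB_POSFIX)) ==
              "libQnnGgmlOpPackage_aarch64.so");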
@ -46,8 +53,8 @@ constexpr const qnn::device_caps kDeviceCaps[] = {
    },
};

static_assert(sizeof(kDeviceCaps) / sizeof(kDeviceCaps[0]) == GGML_QNN_MAX_DEVICES,
              "The number of qnn devices should be equal to GGML_QNN_MAX_DEVICES");
static_assert(sizeof(kDeviceCaps) / sizeof(kDeviceCaps[0]) == QNN_BACKEND_COUNT,
              "The number of qnn devices should be equal to QNN_BACKEND_COUNT");
static_assert(kDeviceCaps[QNN_BACKEND_NPU].type == GGML_BACKEND_DEVICE_TYPE_ACCEL,
              "The NPU device should be an accelerator device");
static_assert(kDeviceCaps[QNN_BACKEND_GPU].type == GGML_BACKEND_DEVICE_TYPE_GPU,

@ -102,23 +109,67 @@ bool set_qnn_lib_search_path(const std::string & custom_lib_search_path) {
    return true;
}

qnn::dl_handler_t load_lib_with_fallback(const std::string & lib_path, const std::string & load_directory) {
common::dl_handler_t load_lib_with_fallback(const std::string & lib_path, const std::string & load_directory) {
    std::filesystem::path full_path(load_directory);
    full_path /= std::filesystem::path(lib_path).filename();
    auto handle = qnn::dl_load(full_path.string());
    auto handle = common::dl_load(full_path.string());
    if (!handle) {
        QNN_LOG_WARN("failed to load %s, fallback to %s\n", full_path.c_str(), lib_path.c_str());
        handle = qnn::dl_load(lib_path);
        handle = common::dl_load(lib_path);
    }

    return handle;
}

struct op_package_lib_info {
    const char * lib_name;
    const char * interface;
    const char * type;
    size_t htp_arch;
    const char * extra_lib_name = nullptr;
};

const op_package_lib_info & get_op_package_lib_info(uint32_t soc_model, size_t htp_arch) {
    constexpr static const op_package_lib_info kOpPackageLibInfo[] = {
        { kQnnCpuPackageLibName, "GgmlOpPackageInterfaceProvider", "CPU", qnn::NONE,
          PLATFORM_LIB_FILENAME("HtpPrepare") },
        { PLATFORM_LIB_FILENAME("QnnGgmlOpPackage_v68"), "GgmlOpPackageInterfaceProvider", "HTP", qnn::V68 },
        { PLATFORM_LIB_FILENAME("QnnGgmlOpPackage_v69"), "GgmlOpPackageInterfaceProvider", "HTP", qnn::V69 },
        { PLATFORM_LIB_FILENAME("QnnGgmlOpPackage_v73"), "GgmlOpPackageInterfaceProvider", "HTP", qnn::V73 },
        { PLATFORM_LIB_FILENAME("QnnGgmlOpPackage_v75"), "GgmlOpPackageInterfaceProvider", "HTP", qnn::V75 },
        { PLATFORM_LIB_FILENAME("QnnGgmlOpPackage_v79"), "GgmlOpPackageInterfaceProvider", "HTP", qnn::V79 },
    };

    if (soc_model == qnn::UNKNOWN || soc_model == qnn::EMULATOR_X64 || soc_model == qnn::EMULATOR_AARCH64) {
        return kOpPackageLibInfo[0];
    }

    switch (htp_arch) {
        case qnn::V68:
            static_assert(kOpPackageLibInfo[1].htp_arch == qnn::V68);
            return kOpPackageLibInfo[1];
        case qnn::V69:
            static_assert(kOpPackageLibInfo[2].htp_arch == qnn::V69);
            return kOpPackageLibInfo[2];
        case qnn::V73:
            static_assert(kOpPackageLibInfo[3].htp_arch == qnn::V73);
            return kOpPackageLibInfo[3];
        case qnn::V75:
            static_assert(kOpPackageLibInfo[4].htp_arch == qnn::V75);
            return kOpPackageLibInfo[4];
        case qnn::V79:
        default:
            static_assert(kOpPackageLibInfo[5].htp_arch == qnn::V79);
            return kOpPackageLibInfo[5];
    }
}

}  // namespace

namespace qnn {

qnn_system_interface::qnn_system_interface(const QnnSystemInterface_t & qnn_sys_interface, dl_handler_t lib_handle) :
qnn_system_interface::qnn_system_interface(const QnnSystemInterface_t & qnn_sys_interface,
                                           common::dl_handler_t lib_handle) :
    _qnn_sys_interface(qnn_sys_interface),
    _lib_handle(lib_handle) {
    qnn_system_context_create(&_qnn_system_handle);

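Editor's note: putting the pieces together, the selection table above is presumably consumed along these lines when the device is opened (a sketch, variable names are assumptions):

// Editor's sketch: pick the op package lib for the probed SoC and load it
// with the fallback helper defined earlier in this file.
const auto & info = get_op_package_lib_info(_soc_info.soc_model, _soc_info.htp_arch);
if (info.extra_lib_name) {
    // the emulator/CPU path needs HtpPrepare loaded before the package itself
    load_lib_with_fallback(info.extra_lib_name, _additional_lib_load_path);
}
auto handle = load_lib_with_fallback(info.lib_name, _additional_lib_load_path);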
@ -139,15 +190,16 @@ qnn_system_interface::~qnn_system_interface() {
    }

    if (_lib_handle) {
        if (!dl_unload(_lib_handle)) {
            QNN_LOG_WARN("failed to close QnnSystem library, error %s\n", dl_error());
        if (!common::dl_unload(_lib_handle)) {
            QNN_LOG_WARN("failed to close QnnSystem library, error %s\n", common::dl_error());
        }
    } else {
        QNN_LOG_WARN("system lib handle is null\n");
    }
}

qnn_instance::qnn_instance(const std::string & lib_path, QNNBackend device) : _additional_lib_load_path(lib_path) {
qnn_instance::qnn_instance(const std::string & lib_path, backend_index_type device) :
    _additional_lib_load_path(lib_path) {
    _backend_lib_name = kDeviceCaps[device].lib_name;
    if (set_qnn_lib_search_path(lib_path)) {
        QNN_LOG_DEBUG("[%s] set_qnn_lib_search_path succeed\n", _backend_lib_name.c_str());

@ -156,23 +208,23 @@ qnn_instance::qnn_instance(const std::string & lib_path, QNNBackend device) : _a
    }
}

int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) {
bool qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) {
    BackendIdType backend_id = QNN_BACKEND_ID_NULL;
    QNN_LOG_DEBUG("enter qnn_init\n");

    std::lock_guard<std::mutex> lock(_init_mutex);
    if (load_system() != 0) {
        QNN_LOG_WARN("failed to load QNN system lib\n");
        return 1;
        return false;
    } else {
        QNN_LOG_DEBUG("load QNN system lib successfully\n");
    }

    std::string backend_lib_path = _backend_lib_name;
    if (_lib_path_to_backend_id.count(backend_lib_path) == 0) {
        if (load_backend(backend_lib_path, saver_config) != 0) {
        if (!load_backend(backend_lib_path, saver_config)) {
            QNN_LOG_WARN("failed to load QNN backend\n");
            return 2;
            return false;
        }
    }

@ -182,15 +234,15 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) {
                     "library %s is loaded but loaded backend count=%zu, "
                     "loaded lib_handle count=%zu",
                     backend_lib_path.c_str(), _loaded_backend.count(backend_id), _loaded_lib_handle.count(backend_id));
        return 3;
        return false;
    }

    _qnn_interface = std::make_shared<qnn_interface>(*_loaded_backend[backend_id]);
    _qnn_interface->qnn_log_create(qnn::sdk_logcallback, _qnn_log_level, &_qnn_log_handle);
    if (!_qnn_log_handle) {
        // NPU backend not work on Qualcomm SoC equipped low-end phone
        QNN_LOG_WARN("why failed to initialize qnn log\n");
        return 4;
        QNN_LOG_WARN("failed to initialize qnn log\n");
        return false;
    } else {
        QNN_LOG_DEBUG("initialize qnn log successfully\n");
    }

@ -199,22 +251,23 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) {
    _qnn_interface->qnn_backend_create(
        _qnn_log_handle, temp_backend_config.empty() ? nullptr : temp_backend_config.data(), &_qnn_backend_handle);
    if (!_qnn_backend_handle) {
        QNN_LOG_WARN("why failed to initialize qnn backend\n");
        return 5;
        QNN_LOG_WARN("failed to initialize qnn backend\n");
        return false;
    } else {
        QNN_LOG_DEBUG("initialize qnn backend successfully\n");
    }

    auto qnn_status = _qnn_interface->qnn_property_has_capability(QNN_PROPERTY_GROUP_DEVICE);
    if (QNN_PROPERTY_NOT_SUPPORTED == qnn_status) {
        QNN_LOG_WARN("device property is not supported\n");
    }
    if (QNN_PROPERTY_ERROR_UNKNOWN_KEY == qnn_status) {
        QNN_LOG_WARN("device property is not known to backend\n");
    switch (qnn_status) {
        case QNN_PROPERTY_NOT_SUPPORTED:
            QNN_LOG_WARN("device property is not supported\n");
            break;
        case QNN_PROPERTY_ERROR_UNKNOWN_KEY:
            QNN_LOG_WARN("device property is unknown\n");
            break;
    }

    qnn_status = QNN_SUCCESS;
    if (_backend_lib_name.find("Htp") != _backend_lib_name.npos) {
        {
            const QnnDevice_PlatformInfo_t * p_info = nullptr;
            qnn_status = _qnn_interface->qnn_device_get_platform_info(nullptr, &p_info);
            if (qnn_status == QNN_SUCCESS) {

@ -243,57 +296,50 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) {
|
|||
_qnn_interface->qnn_device_free_platform_info(nullptr, p_info);
|
||||
} else {
|
||||
// For emulator, we can't get platform info
|
||||
QNN_LOG_WARN("failed to get platform info, are we in emulator?\n");
|
||||
_soc_info = { NONE, UNKNOWN_SM, 0 };
|
||||
QNN_LOG_INFO("failed to get platform info, emulator or cpu backend?\n");
|
||||
#if defined(__aarch64__) || defined(_M_ARM64)
|
||||
_soc_info = { EMULATOR_AARCH64, NONE, 0 };
|
||||
#elif defined(__x86_64__) || defined(__amd64__) || defined(_M_X64)
|
||||
_soc_info = { EMULATOR_X64, NONE, 0 };
|
||||
#else
|
||||
_soc_info = { UNKNOWN, NONE, 0 };
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
QnnHtpDevice_CustomConfig_t soc_customconfig;
|
||||
soc_customconfig.option = QNN_HTP_DEVICE_CONFIG_OPTION_SOC;
|
||||
soc_customconfig.socModel = _soc_info.soc_model;
|
||||
QnnDevice_Config_t soc_devconfig;
|
||||
soc_devconfig.option = QNN_DEVICE_CONFIG_OPTION_CUSTOM;
|
||||
soc_devconfig.customConfig = &soc_customconfig;
|
||||
|
||||
QnnHtpDevice_CustomConfig_t arch_customconfig;
|
||||
arch_customconfig.option = QNN_HTP_DEVICE_CONFIG_OPTION_ARCH;
|
||||
arch_customconfig.arch.arch = (QnnHtpDevice_Arch_t) _soc_info.htp_arch;
|
||||
arch_customconfig.arch.deviceId = 0; // Id of device to be used. 0 will use by default.
|
||||
QnnDevice_Config_t arch_devconfig;
|
||||
arch_devconfig.option = QNN_DEVICE_CONFIG_OPTION_CUSTOM;
|
||||
arch_devconfig.customConfig = &arch_customconfig;
|
||||
|
||||
const QnnDevice_Config_t * p_deviceconfig[] = { &soc_devconfig, &arch_devconfig, nullptr };
|
||||
qnn_status = _qnn_interface->qnn_device_create(_qnn_log_handle, p_deviceconfig, &_qnn_device_handle);
|
||||
} else {
|
||||
{
|
||||
qnn_status = _qnn_interface->qnn_device_create(_qnn_log_handle, nullptr, &_qnn_device_handle);
|
||||
}
|
||||
if (QNN_SUCCESS != qnn_status && QNN_DEVICE_ERROR_UNSUPPORTED_FEATURE != qnn_status) {
|
||||
QNN_LOG_WARN("failed to create QNN device\n");
|
||||
} else {
|
||||
QNN_LOG_INFO("create QNN device successfully\n");
|
||||
if (QNN_SUCCESS != qnn_status && QNN_DEVICE_ERROR_UNSUPPORTED_FEATURE != qnn_status) {
|
||||
QNN_LOG_WARN("failed to create QNN device\n");
|
||||
} else {
|
||||
QNN_LOG_INFO("create QNN device successfully\n");
|
||||
}
|
||||
}
|
||||
|
||||
_rpc_lib_handle = load_lib_with_fallback(kQnnRpcLibName, _additional_lib_load_path);
|
||||
if (_rpc_lib_handle) {
|
||||
_pfn_rpc_mem_alloc = reinterpret_cast<qnn::pfn_rpc_mem_alloc>(dl_sym(_rpc_lib_handle, "rpcmem_alloc"));
|
||||
_pfn_rpc_mem_free = reinterpret_cast<qnn::pfn_rpc_mem_free>(dl_sym(_rpc_lib_handle, "rpcmem_free"));
|
||||
_pfn_rpc_mem_to_fd = reinterpret_cast<qnn::pfn_rpc_mem_to_fd>(dl_sym(_rpc_lib_handle, "rpcmem_to_fd"));
|
||||
if (!_pfn_rpc_mem_alloc || !_pfn_rpc_mem_free || !_pfn_rpc_mem_to_fd) {
|
||||
QNN_LOG_WARN("unable to access symbols in QNN RPC lib. error: %s\n", dl_error());
|
||||
dl_unload(_rpc_lib_handle);
|
||||
return 9;
|
||||
{
|
||||
auto rpc_mem = std::make_unique<common::rpc_mem>();
|
||||
if (rpc_mem->is_valid()) {
|
||||
_rpc_mem = std::move(rpc_mem);
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
auto & op_package_info = get_op_package_lib_info(_soc_info.soc_model, _soc_info.htp_arch);
|
||||
if (op_package_info.extra_lib_name) {
|
||||
_custom_op_extra_lib_handle =
|
||||
load_lib_with_fallback(op_package_info.extra_lib_name, _additional_lib_load_path);
|
||||
}
|
||||
|
||||
_pfn_rpc_mem_init = reinterpret_cast<qnn::pfn_rpc_mem_init>(dl_sym(_rpc_lib_handle, "rpcmem_init"));
|
||||
_pfn_rpc_mem_deinit = reinterpret_cast<qnn::pfn_rpc_mem_deinit>(dl_sym(_rpc_lib_handle, "rpcmem_deinit"));
|
||||
if (_pfn_rpc_mem_init) {
|
||||
_pfn_rpc_mem_init();
|
||||
qnn_status = _qnn_interface->qnn_backend_register_op_package(_qnn_backend_handle, op_package_info.lib_name,
|
||||
op_package_info.interface, op_package_info.type);
|
||||
if (qnn_status != QNN_SUCCESS) {
|
||||
QNN_LOG_WARN("failed to register op package %s, interface: %s, error: %s\n", op_package_info.lib_name,
|
||||
op_package_info.interface, qnn::get_qnn_error_string(qnn_status));
|
||||
} else {
|
||||
QNN_LOG_DEBUG("register op package %s successfully, ID %u\n", op_package_info.lib_name,
|
||||
_qnn_interface->get_backend_id());
|
||||
_has_custom_op_package = true;
|
||||
}
|
||||
|
||||
_rpcmem_initialized = true;
|
||||
QNN_LOG_DEBUG("load rpcmem lib successfully\n");
|
||||
} else {
|
||||
QNN_LOG_WARN("failed to load qualcomm rpc lib, skipping, error:%s\n", dl_error());
|
||||
}
|
||||
|
||||
/* TODO: not used, keep it for further usage
|
||||
|
|
@ -302,35 +348,14 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) {
|
|||
const QnnContext_Config_t * context_configs[] = {&qnn_context_config, nullptr};
|
||||
*/
|
||||
_qnn_interface->qnn_context_create(_qnn_backend_handle, _qnn_device_handle, nullptr, &_qnn_context_handle);
|
||||
if (nullptr == _qnn_context_handle) {
|
||||
QNN_LOG_WARN("why failed to initialize qnn context\n");
|
||||
return 10;
|
||||
if (!_qnn_context_handle) {
|
||||
QNN_LOG_WARN("failed to initialize qnn context\n");
|
||||
return false;
|
||||
} else {
|
||||
QNN_LOG_DEBUG("initialize qnn context successfully\n");
|
||||
}
|
||||
|
||||
if (_backend_lib_name.find("Htp") != _backend_lib_name.npos) {
|
||||
// TODO: faster approach to probe the accurate capacity of rpc ion memory
|
||||
size_t candidate_size = 0;
|
||||
uint8_t * rpc_buffer = nullptr;
|
||||
const int size_in_mb = (1 << 20);
|
||||
size_t probe_slots[] = { 1024, 1536, 2048 - 48, 2048 };
|
||||
size_t probe_counts = sizeof(probe_slots) / sizeof(size_t);
|
||||
for (size_t idx = 0; idx < probe_counts; idx++) {
|
||||
rpc_buffer = static_cast<uint8_t *>(alloc_rpcmem(probe_slots[idx] * size_in_mb, sizeof(void *)));
|
||||
if (!rpc_buffer) {
|
||||
QNN_LOG_DEBUG("alloc rpcmem %d (MB) failure, %s\n", (int) probe_slots[idx], strerror(errno));
|
||||
break;
|
||||
} else {
|
||||
candidate_size = probe_slots[idx];
|
||||
free_rpcmem(rpc_buffer);
|
||||
rpc_buffer = nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
_rpcmem_capacity = std::max(candidate_size, _rpcmem_capacity);
|
||||
QNN_LOG_INFO("capacity of QNN rpc ion memory is about %d MB\n", (int) _rpcmem_capacity);
|
||||
|
||||
if (init_htp_perfinfra() != 0) {
|
||||
QNN_LOG_WARN("initialize HTP performance failure\n");
|
||||
}
|
||||
|
|
@ -343,33 +368,16 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) {
|
|||
}
|
||||
|
||||
QNN_LOG_DEBUG("leave qnn_init\n");
|
||||
|
||||
return 0;
|
||||
return true;
|
||||
}
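
With qnn_init and qnn_finalize now returning bool instead of numeric status codes, call sites collapse to a single check. A minimal sketch of the intended caller pattern, assuming a qnn_instance constructed as in this diff (the surrounding code and the lib path are illustrative, not part of the commit):

    // Hypothetical call site; lib path and device are placeholders.
    auto instance = std::make_shared<qnn::qnn_instance>("/data/local/tmp/", QNN_BACKEND_NPU);
    if (!instance->qnn_init(nullptr)) {  // previously: qnn_init(...) != 0
        QNN_LOG_WARN("failed to init qnn instance\n");
        return;                          // callers no longer branch on codes 1..10
    }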

int qnn_instance::qnn_finalize() {
    int ret_status = 0;
    Qnn_ErrorHandle_t error = QNN_SUCCESS;

    if (_rpc_lib_handle) {
        if (_pfn_rpc_mem_deinit) {
            _pfn_rpc_mem_deinit();
            _pfn_rpc_mem_deinit = nullptr;
        }

        if (dl_unload(_rpc_lib_handle)) {
            QNN_LOG_DEBUG("succeed to close rpcmem lib\n");
        } else {
            QNN_LOG_WARN("failed to unload qualcomm's rpc lib, error:%s\n", dl_error());
        }
    }

bool qnn_instance::qnn_finalize() {
    if (_backend_lib_name.find("Htp") != _backend_lib_name.npos) {
        _qnn_htp_perfinfra->destroyPowerConfigId(_qnn_power_configid);
    }

    if (_qnn_context_handle) {
        error = _qnn_interface->qnn_context_free(_qnn_context_handle, nullptr);
        auto error = _qnn_interface->qnn_context_free(_qnn_context_handle, nullptr);
        if (error != QNN_SUCCESS) {
            QNN_LOG_WARN("failed to free QNN context_handle: ID %u, error %d\n", _qnn_interface->get_backend_id(),
                         (int) QNN_GET_ERROR_CODE(error));

@@ -378,7 +386,7 @@ int qnn_instance::qnn_finalize() {
    }

    if (_qnn_device_handle) {
        error = _qnn_interface->qnn_device_free(_qnn_device_handle);
        auto error = _qnn_interface->qnn_device_free(_qnn_device_handle);
        if (error != QNN_SUCCESS) {
            QNN_LOG_WARN("failed to free QNN device_handle: ID %u, error %d\n", _qnn_interface->get_backend_id(),
                         (int) QNN_GET_ERROR_CODE(error));

@@ -387,7 +395,7 @@ int qnn_instance::qnn_finalize() {
    }

    if (_qnn_backend_handle) {
        error = _qnn_interface->qnn_backend_free(_qnn_backend_handle);
        auto error = _qnn_interface->qnn_backend_free(_qnn_backend_handle);
        if (error != QNN_SUCCESS) {
            QNN_LOG_WARN("failed to free QNN backend_handle: ID %u, error %d\n", _qnn_interface->get_backend_id(),
                         (int) QNN_GET_ERROR_CODE(error));

@@ -396,7 +404,7 @@ int qnn_instance::qnn_finalize() {
    }

    if (_qnn_log_handle) {
        error = _qnn_interface->qnn_log_free(_qnn_log_handle);
        auto error = _qnn_interface->qnn_log_free(_qnn_log_handle);
        if (error != QNN_SUCCESS) {
            QNN_LOG_WARN("failed to free QNN log_handle: ID %u, error %d\n", _qnn_interface->get_backend_id(),
                         (int) QNN_GET_ERROR_CODE(error));

@@ -404,25 +412,31 @@ int qnn_instance::qnn_finalize() {
        _qnn_log_handle = nullptr;
    }

    if (_custom_op_extra_lib_handle) {
        common::dl_unload(_custom_op_extra_lib_handle);
    }

    unload_backend();

    _qnn_sys_interface.reset();

    return ret_status;
    _rpc_mem.reset();

    return true;
}

int qnn_instance::load_system() {
    QNN_LOG_DEBUG("[%s]lib: %s\n", _backend_lib_name.c_str(), kQnnSystemLibName);
    auto system_lib_handle = load_lib_with_fallback(kQnnSystemLibName, _additional_lib_load_path);
    if (!system_lib_handle) {
        QNN_LOG_WARN("can not load QNN library %s, error: %s\n", kQnnSystemLibName, dl_error());
        QNN_LOG_WARN("can not load QNN library %s, error: %s\n", kQnnSystemLibName, common::dl_error());
        return 1;
    }

    auto * get_providers =
        dl_sym_typed<qnn::pfn_qnnsysteminterface_getproviders *>(system_lib_handle, "QnnSystemInterface_getProviders");
    auto * get_providers = common::dl_sym_typed<qnn::pfn_qnnsysteminterface_getproviders *>(
        system_lib_handle, "QnnSystemInterface_getProviders");
    if (!get_providers) {
        QNN_LOG_WARN("can not load QNN symbol QnnSystemInterface_getProviders: %s\n", dl_error());
        QNN_LOG_WARN("can not load QNN symbol QnnSystemInterface_getProviders: %s\n", common::dl_error());
        return 2;
    }

@@ -473,38 +487,42 @@ int qnn_instance::load_system() {
    return 0;
}

int qnn_instance::load_backend(std::string & lib_path, const QnnSaver_Config_t ** /*saver_config*/) {
    Qnn_ErrorHandle_t error = QNN_SUCCESS;
bool qnn_instance::load_backend(std::string & lib_path, const QnnSaver_Config_t ** /*saver_config*/) {
    QNN_LOG_DEBUG("lib_path:%s\n", lib_path.c_str());

    auto lib_handle = load_lib_with_fallback(lib_path, _additional_lib_load_path);
    if (!lib_handle) {
        QNN_LOG_WARN("can not open QNN library %s, with error: %s\n", lib_path.c_str(), dl_error());
        return 1;
        QNN_LOG_WARN("can not open QNN library %s, with error: %s\n", lib_path.c_str(), common::dl_error());
        return false;
    }

    auto get_providers = dl_sym_typed<qnn::pfn_qnninterface_getproviders *>(lib_handle, "QnnInterface_getProviders");
    auto get_providers =
        common::dl_sym_typed<qnn::pfn_qnninterface_getproviders *>(lib_handle, "QnnInterface_getProviders");
    if (!get_providers) {
        QNN_LOG_WARN("can not load symbol QnnInterface_getProviders : %s\n", dl_error());
        return 2;
        QNN_LOG_WARN("can not load symbol QnnInterface_getProviders : %s\n", common::dl_error());
        common::dl_unload(lib_handle);
        return false;
    }

    std::uint32_t num_providers = 0;
    const QnnInterface_t ** provider_list = nullptr;
    error = get_providers(&provider_list, &num_providers);
    auto error = get_providers(&provider_list, &num_providers);
    if (error != QNN_SUCCESS) {
        QNN_LOG_WARN("failed to get providers, error %d\n", (int) QNN_GET_ERROR_CODE(error));
        return 3;
        common::dl_unload(lib_handle);
        return false;
    }
    QNN_LOG_DEBUG("num_providers=%d\n", num_providers);
    if (num_providers != _required_num_providers) {
        QNN_LOG_WARN("providers is %d instead of required %d\n", num_providers, _required_num_providers);
        return 4;
        common::dl_unload(lib_handle);
        return false;
    }

    if (!provider_list) {
        QNN_LOG_WARN("failed to get qnn interface providers\n");
        return 5;
        common::dl_unload(lib_handle);
        return false;
    }
    bool found_valid_interface = false;
    QNN_INTERFACE_VER_TYPE qnn_interface;

@@ -519,7 +537,8 @@ int qnn_instance::load_backend(std::string & lib_path, const QnnSaver_Config_t *

    if (!found_valid_interface) {
        QNN_LOG_WARN("unable to find a valid qnn interface\n");
        return 6;
        common::dl_unload(lib_handle);
        return false;
    } else {
        QNN_LOG_DEBUG("find a valid qnn interface\n");
    }

@@ -532,31 +551,29 @@ int qnn_instance::load_backend(std::string & lib_path, const QnnSaver_Config_t *
    _loaded_backend[backend_id] = provider_list[0];
    if (_loaded_lib_handle.count(backend_id) > 0) {
        QNN_LOG_WARN("closing %p\n", _loaded_lib_handle[backend_id]);
        if (!dl_unload(_loaded_lib_handle[backend_id])) {
            QNN_LOG_WARN("fail to close %p with error %s\n", _loaded_lib_handle[backend_id], dl_error());
        if (!common::dl_unload(_loaded_lib_handle[backend_id])) {
            QNN_LOG_WARN("fail to close %p with error %s\n", _loaded_lib_handle[backend_id], common::dl_error());
        }
    }
    _loaded_lib_handle[backend_id] = lib_handle;
    _backend_id = backend_id;

    return 0;
    return true;
}
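
Note that every early return in the reworked load_backend now pairs with a common::dl_unload(lib_handle), which the old int-returning version leaked. A scope guard could factor out that repetition; this is an illustrative alternative under the same types, not something this commit introduces:

    // Sketch: RAII guard so each failure path unloads the library automatically.
    struct lib_guard {
        common::dl_handler_t handle = nullptr;
        bool                 keep   = false;  // set true once ownership is stored elsewhere
        ~lib_guard() {
            if (!keep && handle) {
                common::dl_unload(handle);
            }
        }
    };
    // usage inside load_backend: lib_guard guard{ lib_handle };
    // ... on success: guard.keep = true;  // handle now owned by _loaded_lib_handle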

int qnn_instance::unload_backend() {
void qnn_instance::unload_backend() {
    for (auto & it : _loaded_lib_handle) {
        if (!dl_unload(it.second)) {
            QNN_LOG_WARN("failed to close QNN backend %d, error %s\n", it.first, dl_error());
        if (!common::dl_unload(it.second)) {
            QNN_LOG_WARN("failed to close QNN backend %d, error %s\n", it.first, common::dl_error());
        }
    }

    _loaded_lib_handle.clear();
    _lib_path_to_backend_id.clear();
    _loaded_backend.clear();

    return 0;
}

const device_caps & get_device_caps(QNNBackend device) {
const device_caps & get_device_caps(backend_index_type device) {
    return kDeviceCaps[device];
}

@@ -24,8 +24,9 @@
#include <QnnTypes.h>
#include <System/QnnSystemInterface.h>

#include "dl-loader.hpp"
#include "dyn-lib-loader.hpp"
#include "qnn-types.hpp"
#include "rpc-mem.hpp"
#include "utils.hpp"

namespace qnn {

@@ -48,7 +49,7 @@ class qnn_system_interface {
    }

  public:
    qnn_system_interface(const QnnSystemInterface_t & qnn_sys_interface, dl_handler_t lib_handle);
    qnn_system_interface(const QnnSystemInterface_t & qnn_sys_interface, common::dl_handler_t lib_handle);
    ~qnn_system_interface();

    bool is_valid() const { return _qnn_system_handle != nullptr; }

@@ -67,7 +68,7 @@ class qnn_system_interface {
    void operator=(qnn_system_interface &&) = delete;

    const QnnSystemInterface_t _qnn_sys_interface = {};
    dl_handler_t _lib_handle = nullptr;
    common::dl_handler_t _lib_handle = nullptr;
    QnnSystemContext_Handle_t _qnn_system_handle = nullptr;
};

@@ -152,12 +153,12 @@ class qnn_instance {
  public:
    using BackendIdType = decltype(QnnInterface_t{}.backendId);

    explicit qnn_instance(const std::string & lib_path, QNNBackend device);
    explicit qnn_instance(const std::string & lib_path, backend_index_type device);

    ~qnn_instance() {}

    int qnn_init(const QnnSaver_Config_t ** saver_config);
    int qnn_finalize();
    bool qnn_init(const QnnSaver_Config_t ** saver_config);
    bool qnn_finalize();

    qnn_interface_ptr get_qnn_interface() {
        if (!_qnn_interface) {

@@ -277,18 +278,14 @@ class qnn_instance {

    std::string & get_qnn_graph_name() { return _graph_name; }

    bool is_rpcmem_initialized() { return _rpcmem_initialized; }

    size_t get_rpcmem_capacity() { return _rpcmem_capacity; }

    void * alloc_rpcmem(size_t bytes, size_t alignment) {
        if (!_rpcmem_initialized) {
        if (!_rpc_mem) {
            QNN_LOG_WARN("rpc memory not initialized\n");
            return nullptr;
        }

        auto allocate_bytes = static_cast<int64_t>(bytes + alignment);
        void * buf = _pfn_rpc_mem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, (int) allocate_bytes);
        void * buf = _rpc_mem->alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, (int) allocate_bytes);
        if (!buf) {
            QNN_LOG_WARN("failed to allocate rpc memory, size: %d MB\n", (int) (allocate_bytes / (1 << 20)));
            return nullptr;

@@ -298,32 +295,34 @@ class qnn_instance {
        bool status = _rpcmem_store_map.insert(std::pair<void *, void *>(aligned_buf, buf)).second;
        if (!status) {
            QNN_LOG_WARN("failed to allocate rpc memory\n");
            _pfn_rpc_mem_free(buf);
            _rpc_mem->free(buf);
        }

        return aligned_buf;
    }

    void free_rpcmem(void * buf) {
        if (!_rpcmem_initialized) {
        if (!_rpc_mem) {
            QNN_LOG_WARN("rpc memory not initialized\n");
        } else if (_rpcmem_store_map.count(buf) == 0) {
            QNN_LOG_WARN("no allocated tensor\n");
        } else {
            _pfn_rpc_mem_free(_rpcmem_store_map[buf]);
            _rpc_mem->free(_rpcmem_store_map[buf]);
            _rpcmem_store_map.erase(buf);
        }
    }

    int32_t rpcmem_to_fd(void * buf) {
        int32_t mem_fd = -1;
        if (!is_rpcmem_initialized()) {
    int rpcmem_to_fd(void * buf) {
        int fd = -1;
        if (!_rpc_mem) {
            QNN_LOG_WARN("rpc memory not initialized\n");
        } else if (_rpcmem_store_map.count(buf) == 0) {
            QNN_LOG_WARN("no allocated tensor\n");
        } else {
            mem_fd = _pfn_rpc_mem_to_fd(buf);
            buf = _rpcmem_store_map[buf];
            fd = _rpc_mem->to_fd(buf);
        }

        return mem_fd;
        return fd;
    }
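
alloc_rpcmem over-allocates by alignment bytes, aligns the returned pointer, and keys _rpcmem_store_map by the aligned pointer so free_rpcmem and rpcmem_to_fd can recover the raw buffer. The aligned_buf computation itself sits in an elided hunk; the usual align-up arithmetic would look like this (an assumption about the hidden line, not a quote of it):

    // Round buf up to the next multiple of alignment; the extra bytes
    // allocated above guarantee the result stays inside the buffer.
    void * aligned_buf = reinterpret_cast<void *>(
        (reinterpret_cast<uintptr_t>(buf) + alignment - 1) & ~(uintptr_t) (alignment - 1));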

    Qnn_MemHandle_t register_rpcmem(void * p_data, const uint32_t rank, uint32_t * dimensions,

@@ -333,7 +332,7 @@ class qnn_instance {
            return nullptr;
        }

        if (!is_rpcmem_initialized()) {
        if (!_rpc_mem) {
            QNN_LOG_WARN("rpc memory not initialized\n");
            return nullptr;
        }

@@ -390,10 +389,12 @@ class qnn_instance {

    const qnn::qcom_socinfo & get_soc_info() { return _soc_info; }

    bool has_custom_op_package() const { return _has_custom_op_package; }

  private:
    int load_system();
    int load_backend(std::string & lib_path, const QnnSaver_Config_t ** /*saver_config*/);
    int unload_backend();
    int load_system();
    bool load_backend(std::string & lib_path, const QnnSaver_Config_t ** /*saver_config*/);
    void unload_backend();

  private:
    static constexpr const int _required_num_providers = 1;

@@ -422,23 +423,19 @@ class qnn_instance {
    std::unordered_map<void *, Qnn_MemHandle_t> _qnn_rpc_buffer_to_handles;

    std::mutex _init_mutex;
    std::unordered_map<BackendIdType, dl_handler_t> _loaded_lib_handle;
    std::unordered_map<BackendIdType, common::dl_handler_t> _loaded_lib_handle;
    std::unordered_map<std::string, BackendIdType> _lib_path_to_backend_id;
    std::unordered_map<BackendIdType, const QnnInterface_t *> _loaded_backend;

    dl_handler_t _rpc_lib_handle = nullptr;
    std::atomic_bool _rpcmem_initialized{ false };
    qnn::pfn_rpc_mem_alloc _pfn_rpc_mem_alloc = nullptr;
    qnn::pfn_rpc_mem_free _pfn_rpc_mem_free = nullptr;
    qnn::pfn_rpc_mem_to_fd _pfn_rpc_mem_to_fd = nullptr;
    qnn::pfn_rpc_mem_init _pfn_rpc_mem_init = nullptr;
    qnn::pfn_rpc_mem_deinit _pfn_rpc_mem_deinit = nullptr;
    std::unique_ptr<common::rpc_mem> _rpc_mem;
    std::unordered_map<void *, void *> _rpcmem_store_map;
    size_t _rpcmem_capacity = 512;

    std::string _graph_name;

    qnn::qcom_socinfo _soc_info = {};

    bool _has_custom_op_package = false;
    common::dl_handler_t _custom_op_extra_lib_handle = nullptr;
};

using qnn_instance_ptr = std::shared_ptr<qnn_instance>;

@@ -457,6 +454,6 @@ struct device_caps {
    size_t max_tensor_size_in_bytes;
};

const device_caps & get_device_caps(QNNBackend device);
const device_caps & get_device_caps(backend_index_type device);

} // namespace qnn

@@ -0,0 +1,51 @@
#pragma once

#include <QnnCommon.h>
#include <QnnInterface.h>
#include <QnnTypes.h>
#include <Saver/QnnSaver.h>
#include <System/QnnSystemInterface.h>

#include "common.hpp"

namespace qnn {

enum qcom_htp_arch {
    NONE = 0,
    V68 = 68,
    V69 = 69,
    V73 = 73,
    V75 = 75,
    V79 = 79,  // SD 8 Gen 4 (SM8750)
};

enum qcom_chipset {
    UNKNOWN = 0,
    EMULATOR_X64 = 0xFF00,      // x86_64 emulator
    EMULATOR_AARCH64 = 0xFF01,  // ARM64 emulator
    SM8350 = 30,                // v68, SD 888/888+
    SM8450 = 36,                // v69, SD 8 Gen 1
    SA8295 = 39,                // v68
    SM8475 = 42,                // v69, SD 8+ Gen 1
    SM8550 = 43,                // v73, SD 8 Gen 2
    SSG2115P = 46,              // v73
    SM7675 = 70,                // V73, SD 7+ Gen 3
    SM8635 = 68,                // v73, SD 8s Gen 3
    SM8650 = 57,                // v75, SD 8 Gen 3
    SM8750 = 69,                // v79, SD 8 Gen 4
};

struct qcom_socinfo {
    uint32_t soc_model;
    size_t htp_arch;
    size_t vtcm_size_in_mb;
};

using pfn_qnnsaver_initialize = decltype(QnnSaver_initialize);
using pfn_qnninterface_getproviders = decltype(QnnInterface_getProviders);
using pfn_qnnsysteminterface_getproviders = decltype(QnnSystemInterface_getProviders);
} // namespace qnn

#define RPCMEM_DEFAULT_FLAGS 1
#define RPCMEM_HEAP_ID_SYSTEM 25
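
The comments on qcom_chipset pair each SoC with an HTP arch; a helper derived purely from those comments could look like the following (hypothetical, not part of the commit):

    // Sketch: map a chipset to its HTP arch using the pairings documented above.
    inline qnn::qcom_htp_arch htp_arch_from_chipset(qnn::qcom_chipset chip) {
        switch (chip) {
            case qnn::SM8350:
            case qnn::SA8295:
                return qnn::V68;
            case qnn::SM8450:
            case qnn::SM8475:
                return qnn::V69;
            case qnn::SM8550:
            case qnn::SSG2115P:
            case qnn::SM7675:
            case qnn::SM8635:
                return qnn::V73;
            case qnn::SM8650:
                return qnn::V75;
            case qnn::SM8750:
                return qnn::V79;
            default:
                return qnn::NONE;
        }
    }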

@@ -25,7 +25,7 @@ class ggml_qnn_tensor : public std::enable_shared_from_this<ggml_qnn_tensor> {

    explicit ggml_qnn_tensor(tensor_type_t tensor_type, const std::string & name,
                             const qnn_dimension_array_t & dimensions, Qnn_DataType_t data_type, int rank,
                             QNNBackend device, Qnn_GraphHandle_t graph_handle, qnn_instance_ptr qnn_instance) :
                             backend_index_type device, Qnn_GraphHandle_t graph_handle, qnn_instance_ptr qnn_instance) :
        _tensor_name(name),
        _device(device),
        _qnn_instance(qnn_instance),

@@ -45,7 +45,7 @@ class ggml_qnn_tensor : public std::enable_shared_from_this<ggml_qnn_tensor> {

    explicit ggml_qnn_tensor(tensor_type_t tensor_type, const std::string & name,
                             const ggml_dimension_array_t & dimensions, ggml_type data_type, int rank,
                             QNNBackend device, Qnn_GraphHandle_t graph_handle, qnn_instance_ptr qnn_instance) :
                             backend_index_type device, Qnn_GraphHandle_t graph_handle, qnn_instance_ptr qnn_instance) :
        ggml_qnn_tensor(tensor_type, name, get_internal_dimension(dimensions, rank),
                        qnn_datatype_from_ggml_datatype(data_type), rank, device, graph_handle, qnn_instance) {}

@@ -318,7 +318,7 @@ class ggml_qnn_tensor : public std::enable_shared_from_this<ggml_qnn_tensor> {
    std::string _tensor_name;
    qnn_buffer_ptr _buffer;
    bool _can_unbind = true;
    QNNBackend _device;
    backend_index_type _device;
    qnn_instance_ptr _qnn_instance;
    Qnn_Tensor_t _qnn_tensor = qnn_tensor_init(kDefaultQnnTensorVersion);
    qnn_dimension_array_t _dimensions = {};

@@ -408,7 +408,7 @@ struct tensor_create_common_params {
    const char * name_prefix;
    int tensor_rank;
    bool is_input;
    QNNBackend device;
    backend_index_type device;
    Qnn_GraphHandle_t graph_handle;
    std::shared_ptr<qnn::qnn_instance> qnn_instance;
};

@@ -178,7 +178,7 @@ const char * get_ggml_type_name(ggml_type type) {
    return traits->type_name;
}

const char * get_backend_name(QNNBackend device) {
const char * get_backend_name(backend_index_type device) {
    switch (device) {
        case QNN_BACKEND_CPU:
            return "qnn-cpu";

@@ -192,7 +192,7 @@ const char * get_backend_name(QNNBackend device) {
    }
}

const char * get_backend_desc(QNNBackend device) {
const char * get_backend_desc(backend_index_type device) {
    switch (device) {
        case QNN_BACKEND_CPU:
            return "CPU";

@@ -224,6 +224,10 @@ const char * get_chipset_desc(uint32_t soc_model) {
            return "Snapdragon 8 Gen 3";
        case SM8750:
            return "Snapdragon 8 Elite";
        case EMULATOR_AARCH64:
            return "AArch64 Emulator";
        case EMULATOR_X64:
            return "x86_64 Emulator";
        default:
            return "unknown";
    }

@@ -251,6 +255,10 @@ const char * get_chipset_model(uint32_t soc_model) {
            return "SM8650";
        case SM8750:
            return "SM8750";
        case EMULATOR_AARCH64:
            return "AARCH64EMU";
        case EMULATOR_X64:
            return "X64EMU";
        default:
            return "unknown";
    }

@@ -456,52 +464,4 @@ const char * get_qnn_error_string(Qnn_ErrorHandle_t error) {
    }
}

#ifdef _WIN32

size_t get_system_total_memory_in_bytes() {
    MEMORYSTATUSEX mem = {};
    mem.dwLength = sizeof(mem);
    if (GlobalMemoryStatusEx(&mem)) {
        return mem.ullTotalPhys;
    }

    return 0;
}

size_t get_system_free_memory_in_bytes() {
    MEMORYSTATUSEX mem = {};
    mem.dwLength = sizeof(mem);
    if (GlobalMemoryStatusEx(&mem)) {
        return mem.ullAvailPhys;
    }

    return 0;
}

#else

size_t get_system_total_memory_in_bytes() {
    struct sysinfo info = {};
    if (sysinfo(&info) == 0) {
        return (info.totalram + info.totalswap) * info.mem_unit;
    }

    auto pages = (size_t) sysconf(_SC_PHYS_PAGES);
    auto page_size = (size_t) sysconf(_SC_PAGE_SIZE);
    return pages * page_size;
}

size_t get_system_free_memory_in_bytes() {
    struct sysinfo info = {};
    if (sysinfo(&info) == 0) {
        return (info.freeram + info.freeswap) * info.mem_unit;
    }

    auto avail_pages = (size_t) sysconf(_SC_AVPHYS_PAGES);
    auto page_size = (size_t) sysconf(_SC_PAGE_SIZE);
    return avail_pages * page_size;
}

#endif

} // namespace qnn

@@ -5,6 +5,7 @@
#include <cstdint>
#include <string>

#include "common.hpp"
#include "ggml-qnn.h"
#include "ggml.h"
#include "logger.hpp"

@@ -23,8 +24,8 @@ qnn_dimension_array_t get_view_internal_dimension(const ggml_tensor * tensor, si

uint32_t get_ggml_tensor_rank(const ggml_tensor * tensor);
const char * get_ggml_type_name(ggml_type type);
const char * get_backend_name(QNNBackend device);
const char * get_backend_desc(QNNBackend device);
const char * get_backend_name(backend_index_type device);
const char * get_backend_desc(backend_index_type device);
const char * get_chipset_desc(uint32_t soc_model);
const char * get_chipset_model(uint32_t soc_model);
const char * get_htparch_desc(size_t htp_arch);

@@ -199,8 +200,6 @@ Qnn_DataType_t qnn_datatype_from_ggml_datatype(ggml_type ggml_type);
ggml_type ggml_datatype_from_qnn_datatype(Qnn_DataType_t qnn_type);
size_t qnn_datatype_size(Qnn_DataType_t qnn_type);
const char * qnn_datatype_to_string(Qnn_DataType_t qnn_type);
size_t get_system_total_memory_in_bytes();
size_t get_system_free_memory_in_bytes();

} // namespace qnn
@@ -0,0 +1,35 @@

file(GLOB common_srcs "${CMAKE_CURRENT_LIST_DIR}/*.cpp")

add_library(runtime-common STATIC
    ${common_srcs}
)

target_include_directories(runtime-common PUBLIC
    ${CMAKE_CURRENT_LIST_DIR}/
    ${CMAKE_CURRENT_LIST_DIR}/../
    ${CMAKE_CURRENT_LIST_DIR}/../../
    ${CMAKE_CURRENT_LIST_DIR}/../../../include/ # TODO: figure out how to remove this
)

if(GGML_QNN_ENABLE_HEXAGON_BACKEND)
    if(DEFINED ENV{HEXAGON_SDK_ROOT})
        set(HEXAGON_SDK_ROOT $ENV{HEXAGON_SDK_ROOT})
        message("found HEXAGON_SDK_ROOT, setting to ${HEXAGON_SDK_ROOT}")
    else()
        message(FATAL_ERROR "HEXAGON_SDK_ROOT not defined")
    endif()

    target_include_directories(runtime-common PUBLIC
        ${HEXAGON_SDK_ROOT}/incs/
        ${HEXAGON_SDK_ROOT}/incs/stddef/
        ${HEXAGON_SDK_ROOT}/incs/HAP/
        ${HEXAGON_SDK_ROOT}/rtos/qurt/
        ${HEXAGON_SDK_ROOT}/utils/examples/
    )
    target_compile_definitions(runtime-common PRIVATE
        GGML_QNN_ENABLE_HEXAGON_BACKEND
    )
else()
    message("GGML_QNN_ENABLE_HEXAGON_BACKEND not set, skipping Hexagon SDK include directories")
endif()

@@ -0,0 +1,146 @@

#include "common.hpp"

#include <memory>

#include "ggml-backend-impl.h"
#include "ggml-impl.h"
#include "ggml-qnn.h"

#ifdef _WIN32
# include <windows.h>
#else
# include <sys/sysinfo.h>
# include <unistd.h>
#endif

namespace {

struct ggml_backend_qnn_reg_impl : ggml_backend_reg {
    std::vector<backend_device_proxy_ptr> device_proxies;
    std::vector<ggml_backend_device> devices;

    explicit ggml_backend_qnn_reg_impl(ggml_backend_reg_i backend_iface) {
        context = this;
        iface = backend_iface;

        LOG_INFO("backend registry init\n");
        for (size_t i = 0; i < TOTAL_BACKEND_COUNT; i++) {
            const auto device_enum =
                (backend_index_type) (TOTAL_BACKEND_COUNT - 1 - i);  // init from the last device, i.e. NPU

            backend_device_proxy_ptr device_proxy;
            if (device_enum < QNN_BACKEND_COUNT) {
#ifndef GGML_HEXAGON_NPU_ONLY
                device_proxy = create_qnn_backend_context(device_enum);
#else
                LOG_DEBUG("skip qnn device %d\n", (int) device_enum);
                continue;
#endif
            } else {
#ifdef GGML_QNN_ENABLE_HEXAGON_BACKEND
                device_proxy = create_hexagon_backend_context(device_enum);
#else
                LOG_DEBUG("skip hexagon device %d\n", (int) device_enum);
                continue;
#endif
            }

            if (!device_proxy) {
                LOG_DEBUG("skip device %d\n", (int) device_enum);
                continue;
            }

            devices.emplace_back(ggml_backend_device{
                /* iface   = */ device_proxy->get_iface(),
                /* reg     = */ this,
                /* context = */ device_proxy->get_context(),
            });

            device_proxies.emplace_back(device_proxy);
        }
    }
};

const char * ggml_backend_qnn_reg_get_name(ggml_backend_reg_t reg) {
    GGML_UNUSED(reg);
    // TODO: should we use a different name?
    return "qualcomm";
}

size_t ggml_backend_qnn_reg_get_device_count(ggml_backend_reg_t reg) {
    auto * ctx = (ggml_backend_qnn_reg_impl *) reg->context;
    return ctx->devices.size();
}

ggml_backend_dev_t ggml_backend_qnn_reg_get_device(ggml_backend_reg_t reg, size_t index) {
    auto * ctx = (ggml_backend_qnn_reg_impl *) reg->context;
    GGML_ASSERT(index < ctx->devices.size());
    return &(ctx->devices[index]);
}

const ggml_backend_reg_i ggml_backend_qnn_reg_interface = {
    /* .get_name         = */ ggml_backend_qnn_reg_get_name,
    /* .get_device_count = */ ggml_backend_qnn_reg_get_device_count,
    /* .get_device       = */ ggml_backend_qnn_reg_get_device,
    /* .get_proc_address = */ nullptr,
};

} // namespace

ggml_backend_reg_t ggml_backend_qnn_reg() {
    static ggml_backend_qnn_reg_impl reg{ ggml_backend_qnn_reg_interface };
    return &reg;
}
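
The registry exposes devices only through the iface table populated above, so a smoke test can enumerate them without any new API (sketch; assumes the ggml_backend_reg layout from ggml-backend-impl.h):

    // Illustrative: walk the devices the Qualcomm registry registered.
    ggml_backend_reg_t reg = ggml_backend_qnn_reg();
    for (size_t i = 0; i < reg->iface.get_device_count(reg); i++) {
        ggml_backend_dev_t dev = reg->iface.get_device(reg, i);
        (void) dev;  // e.g. query its name via dev->iface.get_name(dev)
    }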

namespace common {

#ifdef _WIN32

size_t get_system_total_memory_in_bytes() {
    MEMORYSTATUSEX mem = {};
    mem.dwLength = sizeof(mem);
    if (GlobalMemoryStatusEx(&mem)) {
        return mem.ullTotalPhys;
    }

    return 0;
}

size_t get_system_free_memory_in_bytes() {
    MEMORYSTATUSEX mem = {};
    mem.dwLength = sizeof(mem);
    if (GlobalMemoryStatusEx(&mem)) {
        return mem.ullAvailPhys;
    }

    return 0;
}

#else

size_t get_system_total_memory_in_bytes() {
    struct sysinfo info = {};
    if (sysinfo(&info) == 0) {
        return (info.totalram + info.totalswap) * info.mem_unit;
    }

    auto pages = (size_t) sysconf(_SC_PHYS_PAGES);
    auto page_size = (size_t) sysconf(_SC_PAGE_SIZE);
    return pages * page_size;
}

size_t get_system_free_memory_in_bytes() {
    struct sysinfo info = {};
    if (sysinfo(&info) == 0) {
        return (info.freeram + info.freeswap) * info.mem_unit;
    }

    auto avail_pages = (size_t) sysconf(_SC_AVPHYS_PAGES);
    auto page_size = (size_t) sysconf(_SC_PAGE_SIZE);
    return avail_pages * page_size;
}

#endif

} // namespace common
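
A trivial use of the helpers that just moved into namespace common (sketch):

    // Illustrative: log system memory before sizing large device buffers.
    size_t total = common::get_system_total_memory_in_bytes();
    size_t free_bytes = common::get_system_free_memory_in_bytes();
    LOG_INFO("system memory: %zu MB total, %zu MB free\n", total >> 20, free_bytes >> 20);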

@@ -0,0 +1,56 @@
#pragma once

#include <cstdint>
#include <memory>

#include "ggml-backend-impl.h"
#include "ggml-impl.h"

enum backend_index_type {
    QNN_BACKEND_CPU = 0,
    QNN_BACKEND_GPU,
    QNN_BACKEND_NPU,

    HEXAGON_BACKEND,

    TOTAL_BACKEND_COUNT,
    QNN_BACKEND_COUNT = HEXAGON_BACKEND,
};

class backend_device_proxy {
  public:
    virtual ~backend_device_proxy() = default;

    virtual const ggml_backend_device_i & get_iface() const = 0;
    virtual void * get_context() = 0;
};

using backend_device_proxy_ptr = std::shared_ptr<backend_device_proxy>;

backend_device_proxy_ptr create_qnn_backend_context(backend_index_type device);
backend_device_proxy_ptr create_hexagon_backend_context(backend_index_type device);

namespace common {

size_t get_system_total_memory_in_bytes();
size_t get_system_free_memory_in_bytes();

} // namespace common

#define DISABLE_COPY(class_name)             \
    class_name(const class_name &) = delete; \
    void operator=(const class_name &) = delete

#define DISABLE_MOVE(class_name)        \
    class_name(class_name &&) = delete; \
    void operator=(class_name &&) = delete

#define LOG_ERROR(...) (GGML_LOG_ERROR(__VA_ARGS__))
#define LOG_WARN(...) (GGML_LOG_WARN(__VA_ARGS__))
#define LOG_INFO(...) (GGML_LOG_INFO(__VA_ARGS__))

#ifndef NDEBUG
# define LOG_DEBUG(...) (GGML_LOG_DEBUG(__VA_ARGS__))
#else
# define LOG_DEBUG(...)
#endif
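
DISABLE_COPY and DISABLE_MOVE replace the hand-written deleted members scattered through the qnn classes (rpc_interface below still spells them out). Typical use (sketch):

    // Illustrative: a non-copyable, non-movable holder declared with the macros.
    class device_session {
      public:
        device_session() = default;

        DISABLE_COPY(device_session);
        DISABLE_MOVE(device_session);
    };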

@@ -13,20 +13,20 @@

#include <string>

namespace qnn {
namespace common {

#ifdef __linux__
typedef void * dl_handler_t;

inline qnn::dl_handler_t dl_load(const std::string & lib_path) {
    return dlopen(lib_path.c_str(), RTLD_NOW | RTLD_LOCAL);
inline dl_handler_t dl_load(const std::string & lib_path) {
    return dlopen(lib_path.c_str(), RTLD_NOW | RTLD_GLOBAL);
}

inline void * dl_sym(qnn::dl_handler_t handle, const std::string & symbol) {
inline void * dl_sym(dl_handler_t handle, const std::string & symbol) {
    return dlsym(handle, symbol.c_str());
}

inline bool dl_unload(qnn::dl_handler_t handle) {
inline bool dl_unload(dl_handler_t handle) {
    return dlclose(handle) == 0;
}

@@ -36,7 +36,7 @@ inline const char * dl_error() {
#elif defined(_WIN32)
using dl_handler_t = HMODULE;

inline qnn::dl_handler_t dl_load(const std::string & lib_path) {
inline dl_handler_t dl_load(const std::string & lib_path) {
    // suppress error dialogs for missing DLLs
    auto old_mode = SetErrorMode(SEM_FAILCRITICALERRORS);
    SetErrorMode(old_mode | SEM_FAILCRITICALERRORS);

@@ -47,7 +47,7 @@ inline qnn::dl_handler_t dl_load(const std::string & lib_path) {
    return handle;
}

inline void * dl_sym(qnn::dl_handler_t handle, const std::string & symbol) {
inline void * dl_sym(dl_handler_t handle, const std::string & symbol) {
    auto old_mode = SetErrorMode(SEM_FAILCRITICALERRORS);
    SetErrorMode(old_mode | SEM_FAILCRITICALERRORS);

@@ -57,7 +57,7 @@ inline void * dl_sym(qnn::dl_handler_t handle, const std::string & symbol) {
    return p;
}

inline bool dl_unload(qnn::dl_handler_t handle) {
inline bool dl_unload(dl_handler_t handle) {
    FreeLibrary(handle);
    return true;
}

@@ -69,8 +69,8 @@ inline const char * dl_error() {

#endif

template <typename Fn> Fn dl_sym_typed(qnn::dl_handler_t handle, const std::string & function_name) {
template <typename Fn> Fn dl_sym_typed(dl_handler_t handle, const std::string & function_name) {
    return reinterpret_cast<Fn>(dl_sym(handle, function_name));
}

} // namespace qnn
} // namespace common
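
One behavioral change rides along with the rename into namespace common: on Linux, dl_load now opens libraries with RTLD_GLOBAL instead of RTLD_LOCAL, presumably so their symbols stay visible to libraries loaded afterwards. End-to-end use of the relocated loader (sketch; the library and symbol names are placeholders):

    // Illustrative: load a library and resolve a typed symbol.
    auto handle = common::dl_load("libexample.so");  // placeholder name
    if (handle) {
        using example_fn_t = int (*)(int);
        auto * fn = common::dl_sym_typed<example_fn_t>(handle, "example_fn");  // hypothetical symbol
        if (fn) {
            int result = fn(42);
            (void) result;
        }
        common::dl_unload(handle);
    }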

@@ -0,0 +1,223 @@
#pragma once

#include <memory>

#include "common.hpp"
#include "dyn-lib-loader.hpp"

#ifdef GGML_QNN_ENABLE_HEXAGON_BACKEND
# include <remote.h>
#else
// TODO: remove this when not needed

/**
 * @enum fastrpc_map_flags for fastrpc_mmap and fastrpc_munmap
 * @brief Types of maps with cache maintenance
 */
enum fastrpc_map_flags {
    /**
     * Map memory pages with RW- permission and CACHE WRITEBACK.
     * Driver will clean cache when buffer passed in a FastRPC call.
     * Same remote virtual address will be assigned for subsequent
     * FastRPC calls.
     */
    FASTRPC_MAP_STATIC,

    /** Reserved for compatibility with deprecated flag */
    FASTRPC_MAP_RESERVED,

    /**
     * Map memory pages with RW- permission and CACHE WRITEBACK.
     * Mapping tagged with a file descriptor. User is responsible for
     * maintenance of CPU and DSP caches for the buffer. Get virtual address
     * of buffer on DSP using HAP_mmap_get() and HAP_mmap_put() functions.
     */
    FASTRPC_MAP_FD,

    /**
     * Mapping delayed until user calls HAP_mmap() and HAP_munmap()
     * functions on DSP. User is responsible for maintenance of CPU and DSP
     * caches for the buffer. Delayed mapping is useful for users to map
     * buffer on DSP with other than default permissions and cache modes
     * using HAP_mmap() and HAP_munmap() functions.
     */
    FASTRPC_MAP_FD_DELAYED,

    /** Reserved for compatibility **/
    FASTRPC_MAP_RESERVED_4,
    FASTRPC_MAP_RESERVED_5,
    FASTRPC_MAP_RESERVED_6,
    FASTRPC_MAP_RESERVED_7,
    FASTRPC_MAP_RESERVED_8,
    FASTRPC_MAP_RESERVED_9,
    FASTRPC_MAP_RESERVED_10,
    FASTRPC_MAP_RESERVED_11,
    FASTRPC_MAP_RESERVED_12,
    FASTRPC_MAP_RESERVED_13,
    FASTRPC_MAP_RESERVED_14,
    FASTRPC_MAP_RESERVED_15,

    /**
     * This flag is used to skip CPU mapping,
     * otherwise behaves similar to FASTRPC_MAP_FD_DELAYED flag.
     */
    FASTRPC_MAP_FD_NOMAP,

    /** Update FASTRPC_MAP_MAX when adding new value to this enum **/
};

#endif

namespace common {

#ifdef _WIN32
constexpr const char * kQnnRpcLibName = "libcdsprpc.dll";
#else
constexpr const char * kQnnRpcLibName = "libcdsprpc.so";
#endif

class rpc_interface {
    using rpc_mem_init_t = void (*)();
    using rpc_mem_deinit_t = void (*)();
    using rpc_mem_alloc_t = void * (*) (int heapid, uint32_t flags, int size);
    using rpc_mem_alloc2_t = void * (*) (int heapid, uint32_t flags, size_t size);
    using rpc_mem_free_t = void (*)(void * po);
    using rpc_mem_to_fd_t = int (*)(void * po);
    using rpc_mem_fastrpc_mmap_t = int (*)(int domain, int fd, void * addr, int offset, size_t length,
                                           enum fastrpc_map_flags flags);
    using rpc_mem_fastrpc_munmap_t = int (*)(int domain, int fd, void * addr, size_t length);
    using remote_handle_control_t = int (*)(uint32_t req, void * data, uint32_t datalen);
    using remote_session_control_t = int (*)(uint32_t req, void * data, uint32_t datalen);

  public:
    rpc_interface(const std::string & rpc_lib_path = kQnnRpcLibName) {
        _rpc_lib_handle = dl_load(rpc_lib_path);
        if (!_rpc_lib_handle) {
            LOG_ERROR("failed to load %s, error: %s\n", rpc_lib_path.c_str(), dl_error());
            return;
        }

        _rpc_mem_init = reinterpret_cast<rpc_mem_init_t>(dl_sym(_rpc_lib_handle, "rpcmem_init"));
        _rpc_mem_deinit = reinterpret_cast<rpc_mem_deinit_t>(dl_sym(_rpc_lib_handle, "rpcmem_deinit"));
        _rpc_mem_alloc = reinterpret_cast<rpc_mem_alloc_t>(dl_sym(_rpc_lib_handle, "rpcmem_alloc"));
        _rpc_mem_alloc2 = reinterpret_cast<rpc_mem_alloc2_t>(dl_sym(_rpc_lib_handle, "rpcmem_alloc2"));
        _rpc_mem_free = reinterpret_cast<rpc_mem_free_t>(dl_sym(_rpc_lib_handle, "rpcmem_free"));
        _rpc_mem_to_fd = reinterpret_cast<rpc_mem_to_fd_t>(dl_sym(_rpc_lib_handle, "rpcmem_to_fd"));
        _rpc_mem_fastrpc_mmap = reinterpret_cast<rpc_mem_fastrpc_mmap_t>(dl_sym(_rpc_lib_handle, "fastrpc_mmap"));
        _rpc_mem_fastrpc_munmap = reinterpret_cast<rpc_mem_fastrpc_munmap_t>(dl_sym(_rpc_lib_handle, "fastrpc_munmap"));
        _remote_handle_control =
            reinterpret_cast<remote_handle_control_t>(dl_sym(_rpc_lib_handle, "remote_handle_control"));
        _remote_session_control =
            reinterpret_cast<remote_session_control_t>(dl_sym(_rpc_lib_handle, "remote_session_control"));
    }

    bool is_valid() const { return _rpc_lib_handle != nullptr; }

    bool is_alloc2_available() const { return _rpc_mem_alloc2 != nullptr; }

    void rpcmem_init() {
        if (_rpc_mem_init) {
            _rpc_mem_init();
        }
    }

    void rpcmem_deinit() {
        if (_rpc_mem_deinit) {
            _rpc_mem_deinit();
        }
    }

    void * rpcmem_alloc(int heapid, uint32_t flags, int size) {
        if (!is_valid()) {
            return nullptr;
        }

        return _rpc_mem_alloc(heapid, flags, size);
    }

    void * rpcmem_alloc2(int heapid, uint32_t flags, size_t size) {
        if (!is_valid()) {
            return nullptr;
        }

        return _rpc_mem_alloc2(heapid, flags, size);
    }

    void rpcmem_free(void * buf) {
        if (is_valid()) {
            _rpc_mem_free(buf);
        }
    }

    int rpcmem_to_fd(void * buf) {
        int mem_fd = -1;
        if (is_valid()) {
            mem_fd = _rpc_mem_to_fd(buf);
        }

        return mem_fd;
    }

    int fastrpc_mmap(int domain, int fd, void * addr, int offset, size_t length, enum fastrpc_map_flags flags) {
        if (!is_valid()) {
            return -1;
        }

        return _rpc_mem_fastrpc_mmap(domain, fd, addr, offset, length, flags);
    }

    int fastrpc_munmap(int domain, int fd, void * addr, size_t length) {
        if (!is_valid()) {
            return -1;
        }

        return _rpc_mem_fastrpc_munmap(domain, fd, addr, length);
    }

    int remote_handle_control(uint32_t req, void * data, uint32_t datalen) {
        if (!is_valid()) {
            return -1;
        }

        return _remote_handle_control(req, data, datalen);
    }

    int remote_session_control(uint32_t req, void * data, uint32_t datalen) {
        if (!is_valid()) {
            return -1;
        }

        return _remote_session_control(req, data, datalen);
    }

    ~rpc_interface() {
        if (_rpc_lib_handle) {
            if (_rpc_mem_deinit) {
                _rpc_mem_deinit();
            }

            dl_unload(_rpc_lib_handle);
        }
    }

  private:
    dl_handler_t _rpc_lib_handle = nullptr;
    rpc_mem_init_t _rpc_mem_init = nullptr;
    rpc_mem_deinit_t _rpc_mem_deinit = nullptr;
    rpc_mem_alloc_t _rpc_mem_alloc = nullptr;
    rpc_mem_alloc2_t _rpc_mem_alloc2 = nullptr;
    rpc_mem_free_t _rpc_mem_free = nullptr;
    rpc_mem_to_fd_t _rpc_mem_to_fd = nullptr;
    rpc_mem_fastrpc_mmap_t _rpc_mem_fastrpc_mmap = nullptr;
    rpc_mem_fastrpc_munmap_t _rpc_mem_fastrpc_munmap = nullptr;
    remote_handle_control_t _remote_handle_control = nullptr;
    remote_session_control_t _remote_session_control = nullptr;

    rpc_interface(const rpc_interface &) = delete;
    rpc_interface & operator=(const rpc_interface &) = delete;
    rpc_interface(rpc_interface &&) = delete;
    rpc_interface & operator=(rpc_interface &&) = delete;
};

using rpc_interface_ptr = std::shared_ptr<rpc_interface>;

} // namespace common
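
rpc_interface wraps every libcdsprpc entry point behind null-checked members, so callers degrade gracefully when a symbol is missing. A minimal usage sketch (heap id 25 and flags 1 mirror the RPCMEM_HEAP_ID_SYSTEM and RPCMEM_DEFAULT_FLAGS constants defined with the qnn types earlier in this change):

    // Illustrative: allocate, export, and free one rpc buffer directly.
    common::rpc_interface rpc;  // loads kQnnRpcLibName
    if (rpc.is_valid()) {
        void * buf = rpc.is_alloc2_available() ? rpc.rpcmem_alloc2(25, 1, 4096)
                                               : rpc.rpcmem_alloc(25, 1, 4096);
        if (buf) {
            int fd = rpc.rpcmem_to_fd(buf);  // fd for fastrpc_mmap / QNN mem registration
            (void) fd;
            rpc.rpcmem_free(buf);
        }
    }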

@@ -0,0 +1,129 @@

#pragma once

#include <limits>
#include <memory>

#include "common.hpp"
#include "dyn-lib-loader.hpp"
#include "rpc-interface.hpp"

namespace common {

class rpc_mem {
  public:
    rpc_mem() {
        auto interface = std::make_shared<rpc_interface>();
        if (!interface->is_valid()) {
            LOG_ERROR("failed to load rpcmem lib\n");
            return;
        }

        interface->rpcmem_init();
        _rpc_interface = interface;
        LOG_DEBUG("load rpcmem lib successfully\n");
    }

    explicit rpc_mem(rpc_interface_ptr interface) {
        if (!interface->is_valid()) {
            LOG_ERROR("failed to load rpcmem lib\n");
            return;
        }

        interface->rpcmem_init();
        _rpc_interface = interface;
        LOG_DEBUG("load rpcmem lib successfully\n");
    }

    ~rpc_mem() {
        if (!is_valid()) {
            LOG_DEBUG("rpc memory not initialized\n");
            return;
        }

        if (_rpc_interface) {
            _rpc_interface->rpcmem_deinit();
            _rpc_interface.reset();
        }

        LOG_DEBUG("unload rpcmem lib successfully\n");
    }

    bool is_valid() const { return (bool) _rpc_interface; }

    void * alloc(int heapid, uint32_t flags, size_t size) {
        if (!is_valid()) {
            LOG_ERROR("rpc memory not initialized\n");
            return nullptr;
        }

        if (size > get_max_alloc_size()) {
            LOG_ERROR("rpc memory size %zu exceeds max alloc size %zu\n", size, get_max_alloc_size());
            return nullptr;
        }

        void * buf = nullptr;
        if (_rpc_interface->is_alloc2_available()) {
            buf = _rpc_interface->rpcmem_alloc2(heapid, flags, size);
        } else {
            buf = _rpc_interface->rpcmem_alloc(heapid, flags, size);
        }

        if (!buf) {
            LOG_ERROR("failed to allocate rpc memory, size: %d MB\n", (int) (size / (1 << 20)));
            return nullptr;
        }

        LOG_DEBUG("rpc buffer allocated, heapid: %d, flags: 0x%x, size: %zu\n", heapid, flags, size);
        return buf;
    }

    void free(void * buf) {
        if (!is_valid()) {
            LOG_ERROR("rpc memory not initialized\n");
        } else {
            _rpc_interface->rpcmem_free(buf);
        }
    }

    int to_fd(void * buf) {
        int mem_fd = -1;
        if (!is_valid()) {
            LOG_ERROR("rpc memory not initialized\n");
        } else {
            mem_fd = _rpc_interface->rpcmem_to_fd(buf);
        }

        return mem_fd;
    }

    size_t get_max_alloc_size() {
        return _rpc_interface->is_alloc2_available() ? std::numeric_limits<size_t>::max() :
                                                       std::numeric_limits<int>::max();
    }

    int fastrpc_mmap(int domain, int fd, void * addr, int offset, size_t length, enum fastrpc_map_flags flags) {
        if (!is_valid()) {
            LOG_ERROR("rpc memory not initialized\n");
            return -1;
        }

        return _rpc_interface->fastrpc_mmap(domain, fd, addr, offset, length, flags);
    }

    int fastrpc_munmap(int domain, int fd, void * addr, size_t length) {
        if (!is_valid()) {
            LOG_ERROR("rpc memory not initialized\n");
            return -1;
        }

        return _rpc_interface->fastrpc_munmap(domain, fd, addr, length);
    }

  private:
    rpc_interface_ptr _rpc_interface;
};

using rpc_mem_ptr = std::shared_ptr<rpc_mem>;

} // namespace common
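
rpc_mem layers ownership on top of rpc_interface: construction loads the library and calls rpcmem_init, destruction deinitializes and unloads, and alloc transparently prefers rpcmem_alloc2 where the size_t-sized allocator exists. Used the way qnn_instance now holds it (sketch; heap id and flags as in the note above):

    // Illustrative: shared rpc_mem, as stored in qnn_instance::_rpc_mem.
    auto mem = std::make_shared<common::rpc_mem>();
    if (mem->is_valid()) {
        void * buf = mem->alloc(25, 1, 1 << 20);  // 1 MB from the system heap
        if (buf) {
            int fd = mem->to_fd(buf);  // descriptor for DSP-side mapping
            (void) fd;
            mem->free(buf);
        }
    }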