feat: perf opt part4 (#43)
* wip
* refactor: rewrite dequantize_row_q4_0 by intrinsic
* log for debug
* fix q4 intrinsic
* small opt
* wip
* wip
* add vtcm_quota_size
* add perf log for hexagon-npu backend
* wip
* add log
* sync after a specific op
* increase worker thread priority
* fix unbalanced thread slice (see the sketch below)
* small slice to fit in vtcm cache
* limit the supported row element size
* opt 4_0 dequant
* fix q4 dequant
* add power_utils
* add rms_norm
* wip
* enable rms_norm f32
* fix rms_norm with param
* fix compiling flags
* use float
* fix small row size
* vectorized rms norm
* wip
* read 2 vectors
* rename
* add perf log on update
* set empty tensors handle also
* merge some rpc functions
* opt param update
* wip
* print more log
* add struct for update param config
* add npu_device_graph_set_tensor_with_param
* merge tensor and params update
* wip
* wip
* make as template to reuse
* vectorize dequantize_row_q8_0
* opt
* avoid using union to store q data
* wip
* wip
* wip
parent 2306f82a58
commit c23ab465c0
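One of the items above, "fix unbalanced thread slice", replaces a ceil-based row split with a remainder-aware split (see the `get_thread_work_slice` hunk later in this diff). The snippet below is a minimal, self-contained sketch that restates that logic outside the codebase; the row/thread counts in `main` are illustrative assumptions, not values taken from the commit.

```cpp
// Standalone sketch of the balanced work-slice logic introduced by this commit.
// Not the actual backend code: names and example values are illustrative only.
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <utility>

static std::pair<int64_t, int64_t> work_slice(int64_t total, size_t tidx, size_t tcnt) {
    if (total <= 0 || tidx >= tcnt) {
        return { 0, 0 };  // no work for this thread
    }
    const int64_t base      = total / tcnt;  // rows every thread gets
    const int64_t remainder = total % tcnt;  // first `remainder` threads get one extra row
    int64_t start = 0;
    int64_t end   = 0;
    if ((int64_t) tidx < remainder) {
        start = tidx * (base + 1);
        end   = start + base + 1;
    } else {
        start = remainder * (base + 1) + (tidx - remainder) * base;
        end   = start + base;
    }
    return { start, std::min(end, total) };
}

int main() {
    // 10 rows over 4 threads -> [0,3) [3,6) [6,8) [8,10).
    // The previous ceil-based split produced [0,3) [3,6) [6,9) [9,10),
    // leaving the last thread with a single row.
    for (size_t t = 0; t < 4; ++t) {
        const auto s = work_slice(10, t, 4);
        std::printf("thread %zu: [%lld, %lld)\n", t, (long long) s.first, (long long) s.second);
    }
    return 0;
}
```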
@@ -231,6 +231,11 @@ else()
    build_idl(idl/hexagon_npu.idl hexagon_npu_skel_OBJS)

    add_subdirectory(${HEXAGON_SDK_ROOT}/libs/qprintf qprintf_dir)
    target_include_directories(hexagon_npu_skel_OBJS PUBLIC
        ${HEXAGON_SDK_ROOT}/libs/qprintf/inc/
    )

    # disable warnings for the skel
    set_source_files_properties(
        ${skel_srcs}
@@ -239,12 +244,12 @@ else()
    )

    add_library(hexagon_npu_skel SHARED $<TARGET_OBJECTS:hexagon_npu_skel_OBJS>)

    target_link_libraries(hexagon_npu_skel
        ${HEXAGON_LIB_DIR}/${HEXAGON_ARCH}/G0/pic/libc++abi.a
        ${HEXAGON_LIB_DIR}/${HEXAGON_ARCH}/G0/pic/libc++.a
    )
    set_target_properties(hexagon_npu_skel PROPERTIES OUTPUT_NAME "hexagon_npu_skel_${HEXAGON_ARCH}")
    target_link_libraries(hexagon_npu_skel qprintf_static)

    copy_binaries(hexagon_npu_skel)
endif()

@@ -106,6 +106,7 @@ int npu_device_open(const char * uri, remote_handle64 * h) {
    }

    *h = reinterpret_cast<remote_handle64>(context);
    DEVICE_LOG_INFO("NPU device context created: %p", (void *) *h);
    return AEE_SUCCESS;
}

@@ -117,6 +118,7 @@ int npu_device_close(remote_handle64 h) {
    }

    delete context;
    DEVICE_LOG_INFO("NPU device context destroyed: %p", (void *) h);
    return AEE_SUCCESS;
}

@@ -130,6 +132,12 @@ AEEResult npu_device_device_support_op(remote_handle64 _h, const npu_device_tens
                                       const npu_device_tensor_spec * src1, const npu_device_tensor_spec * dst,
                                       npu_device_tensor_op op, boolean * is_supported) {
    NPU_UNUSED(_h);

    if (!src0 || !src1 || !dst || !is_supported) {
        DEVICE_LOG_ERROR("npu_device_device_support_op: Invalid arguments");
        return AEE_EINVARGS;
    }

    *is_supported = hexagon::support_op(*src0, *src1, *dst, op);
    return AEE_SUCCESS;
}

@@ -147,28 +155,15 @@ AEEResult npu_device_tensor_init(remote_handle64 _h, const npu_device_tensor_con
    return AEE_SUCCESS;
}

AEEResult npu_device_tensor_set_src(remote_handle64 _h, npu_device_tensor_handle_t tensor_handle, uint64_t index,
                                    npu_device_tensor_handle_t src) {
AEEResult npu_device_tensor_update_params(remote_handle64 _h, npu_device_tensor_handle_t tensor_handle,
                                          const npu_device_tensor_update_config * config) {
    NPU_UNUSED(_h);
    auto * tensor = tensor_from_handle(tensor_handle);
    if (!tensor) {
    if (!tensor || !config) {
        return AEE_EINVHANDLE;
    }

    auto * src_tensor = tensor_from_handle(src);
    tensor->set_src(index, src_tensor);
    return AEE_SUCCESS;
}

AEEResult npu_device_tensor_set_op(remote_handle64 _h, npu_device_tensor_handle_t tensor_handle,
                                   npu_device_tensor_op op) {
    NPU_UNUSED(_h);
    auto * tensor = tensor_from_handle(tensor_handle);
    if (!tensor) {
        return AEE_EINVHANDLE;
    }

    tensor->set_op(op);
    tensor->update_config(*config);
    return AEE_SUCCESS;
}

@@ -206,6 +201,29 @@ AEEResult npu_device_graph_set_tensor(remote_handle64 _h, npu_device_graph_handl
    return AEE_SUCCESS;
}

AEEResult npu_device_graph_set_tensor_with_param(remote_handle64 _h, npu_device_graph_handle_t graph_handle,
                                                 const npu_device_tensor_handle_t * tensor_handles,
                                                 int tensor_handlesLen,
                                                 const npu_device_tensor_update_config * tensor_params,
                                                 int tensor_paramsLen) {
    NPU_UNUSED(_h);
    auto * graph = graph_from_handle(graph_handle);
    if (!graph || !tensor_handles || tensor_handlesLen <= 0 || !tensor_params ||
        tensor_handlesLen != tensor_paramsLen) {
        return AEE_EINVHANDLE;
    }

    graph->set_tensor(tensor_handles, tensor_handlesLen);
    for (int i = 0; i < tensor_handlesLen; ++i) {
        auto * tensor = tensor_from_handle(tensor_handles[i]);
        if (tensor) {
            tensor->update_config(tensor_params[i]);
        }
    }

    return AEE_SUCCESS;
}

AEEResult npu_device_graph_compute(remote_handle64 _h, npu_device_graph_handle_t graph_handle) {
    auto dev_ctx = device_context_from_handle(_h);
    if (!dev_ctx) {

@@ -10,7 +10,8 @@
namespace hexagon {

graph::graph() noexcept {
    DEVICE_LOG_DEBUG("graph(%p) created\n", (void *) this);
    _vtcm_quota_size = hexagon::vtcm_mem::get_avail_block_size(); // TODO: move to device init?
    DEVICE_LOG_DEBUG("graph(%p) created: vtcm quota size: %zu\n", (void *) this, _vtcm_quota_size);
}

graph::~graph() noexcept {

@@ -45,6 +46,8 @@ bool graph::compute(default_thread_pool * thread_pool, const float * f16_to_f32_
    }

    DEVICE_LOG_DEBUG("graph(%p) compute\n", (void *) this);

    DEVICE_SCOPED_PERFORMANCE_TRACKER("[%p]compute", (void *) this);
    _f16_to_f32_table = f16_to_f32_table;
    if (thread_pool) {
        thread_pool->sync_execute(reinterpret_cast<default_thread_pool::task_type>(&graph::thread_pool_task), this);

@@ -61,6 +64,8 @@ void graph::thread_pool_task(default_thread_pool * pool, size_t thread_idx, size
}

void graph::compute_impl(default_thread_pool * pool, size_t thread_idx, size_t thread_count) {
    hexagon::compute_params params = { thread_idx, thread_count, _vtcm_quota_size / thread_count, _f16_to_f32_table };

    for (size_t i = 0; i < _tensor_count; ++i) {
        auto * dst = _tensors[i];
        auto op = dst->get_op();

@@ -69,14 +74,14 @@ void graph::compute_impl(default_thread_pool * pool, size_t thread_idx, size_t t
            DEVICE_LOG_ERROR("graph(%p) tensor[%zu] op %d not supported\n", (void *) this, i, op);
            return;
        }

        hexagon::compute_params params = { thread_idx, thread_count, _f16_to_f32_table };
        if (!func(dst, &params)) {
            DEVICE_LOG_ERROR("graph(%p) tensor[%zu] op %d compute failed\n", (void *) this, i, op);
        }

        // TODO: figure out which ops need to sync
        if (pool) {
            DEVICE_SCOPED_PERFORMANCE_TRACKER("[%p]sync_thread, tidx: %zu", (void *) this, thread_idx);

        const bool should_sync = requires_thread_barrier(op);
        if (pool && should_sync && i < _tensor_count - 1) {
            pool->sync_thread();
        }
        dst->invalidate();

@@ -25,6 +25,7 @@ class graph {

    std::unique_ptr<tensor *[]> _tensors;
    size_t _tensor_count = 0;
    size_t _vtcm_quota_size = 0;
    const float * _f16_to_f32_table = nullptr;

    DISABLE_COPY_AND_MOVE(graph);

@@ -5,6 +5,8 @@
#include <hexagon_types.h>
#include <HTP/core/intrinsics.h>

#include <type_traits>

#include "op_mul_mat.hpp"
#include "quants.hpp"

@@ -17,7 +19,7 @@ inline void vec_op_impl(const _TyData * src0, const _TyData * src1, size_t count
    HVX_Vector * iptr0     = ((HVX_Vector *) src0);
    HVX_Vector * iptr0_end = ((HVX_Vector *) src0) + (count / kElementsPerVector);
    HVX_Vector * iptr1     = ((HVX_Vector *) src1);
    HVX_Vector * optr      = ((HVX_Vector *) dst);
    HVX_Vector * optr      = ((HVX_Vector *) dst); // framework will ensure the dst is aligned
    HVX_Vector prev0       = *iptr0++;
    HVX_Vector prev1       = *iptr1++;

@@ -108,6 +110,12 @@ template <typename _TyData> struct get_data_type<void (*)(const _TyData *, const
    using type = _TyData;
};

template <typename _TyData, typename _TyParam>
struct get_data_type<void (*)(const _TyData *, size_t, _TyParam, _TyData *)> {
    using type       = _TyData;
    using param_type = typename std::remove_cv<typename std::remove_reference<_TyData>::type>::type;
};

template <auto _RowFunc> bool element_wise_op(hexagon::tensor * out, hexagon::compute_params * params) {
    using data_type = typename get_data_type<decltype(_RowFunc)>::type;

@@ -166,6 +174,16 @@ template <auto _RowFunc> bool element_wise_op(hexagon::tensor * out, hexagon::co
    return true;
}

bool is_same_shape(const npu_device_tensor_spec & src, const npu_device_tensor_spec & dst) {
    for (size_t i = 0; i < DEVICE_TENSOR_MAX_DIMS; ++i) {
        if (src.ne[i] != dst.ne[i]) {
            return false;
        }
    }

    return true;
}

bool is_element_wise_op_supported(const npu_device_tensor_spec & src0, const npu_device_tensor_spec & src1,
                                  const npu_device_tensor_spec & dst, npu_device_tensor_op op) {
    if (op != NPU_OP_ADD && op != NPU_OP_SUB && op != NPU_OP_MUL) {

@@ -196,12 +214,149 @@ bool is_element_wise_op_supported(const npu_device_tensor_spec & src0, const npu
        return false;
    }

    for (size_t i = 0; i < DEVICE_TENSOR_MAX_DIMS; ++i) {
        if (src0.ne[i] != dst.ne[i]) {
            DEVICE_LOG_DEBUG("[%s]src0.ne[%zu] and dst.ne[%zu] not match: %lld vs %lld\n", hexagon::op_get_name(op), i,
                             i, (long long) src0.ne[i], (long long) dst.ne[i]);
            return false;
    if (!is_same_shape(src0, dst)) {
        DEVICE_LOG_DEBUG("[%s]src0 and dst have different shape\n", hexagon::op_get_name(op));
        return false;
    }

    return true;
}

void rms_norm_vec_f32(const float * src, size_t count, float eps, float * dst) {
    constexpr const size_t kElementsPerVector = hexagon::kBytesPerVector / sizeof(float);

    HVX_Vector * src_vec_ptr = ((HVX_Vector *) src);
    HVX_Vector * src_vec_end = ((HVX_Vector *) src) + (count / kElementsPerVector);
    HVX_Vector prev = *src_vec_ptr++;
    HVX_Vector sum = Q6_V_vzero();
    while (src_vec_ptr < src_vec_end) {
        HVX_Vector curr = *src_vec_ptr++;
        HVX_Vector s0 = Q6_V_valign_VVR(curr, prev, (size_t) src);
        sum = Q6_Vqf32_vadd_Vqf32Vqf32(sum, Q6_Vqf32_vmpy_VsfVsf(s0, s0));
        prev = curr;
    }

    if ((src_vec_end - ((HVX_Vector *) src)) > 0) {
        // handle the last vector
        bool src_ptr_aligned = hexagon::is_addr_aligned(src_vec_ptr);
        HVX_Vector curr = src_ptr_aligned ? prev : *src_vec_ptr;
        src_vec_ptr = src_ptr_aligned ? src_vec_ptr : src_vec_ptr + 1;
        HVX_Vector s0 = Q6_V_valign_VVR(curr, prev, (size_t) src);
        sum = Q6_Vqf32_vadd_Vqf32Vqf32(sum, Q6_Vqf32_vmpy_VsfVsf(s0, s0));
        prev = curr;
    }

    const size_t leftover = count % kElementsPerVector;
    const size_t leftover_bytes = leftover * sizeof(float);
    if (leftover > 0) {
        // handle the leftover elements
        HVX_Vector curr =
            (leftover_bytes + hexagon::unaligned_bytes(src_vec_ptr) > hexagon::kBytesPerVector) ? *src_vec_ptr : prev;
        curr = Q6_V_valign_VVR(curr, prev, (size_t) src);
        sum = Q6_Vqf32_vadd_Vqf32Vqf32(sum,
                                       Q6_V_valign_VVR(Q6_Vqf32_vmpy_VsfVsf(curr, curr), Q6_V_vzero(), leftover_bytes));
    }

    const float mean = hexagon::vec_reduction_f32(sum) / count; // TODO: figure out how to do division in vector
    const float scale = 1.0f / sqrtf(mean + eps); // TODO: use buildin blas sqrtf?

    HVX_Vector scale_vec = Q6_V_vsplat_R(reinterpret_cast<const uint32_t &>(scale));
    src_vec_ptr = ((HVX_Vector *) src);
    prev = *src_vec_ptr++;
    HVX_Vector * dst_vec_ptr = ((HVX_Vector *) dst); // framework will ensure the dst is aligned
    while (src_vec_ptr < src_vec_end) {
        HVX_Vector curr = *src_vec_ptr++;
        HVX_Vector s0 = Q6_V_valign_VVR(curr, prev, (size_t) src);
        *dst_vec_ptr++ = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(s0, scale_vec));
        prev = curr;
    }

    if ((src_vec_end - ((HVX_Vector *) src)) > 0) {
        // handle the last vector
        bool src_ptr_aligned = hexagon::is_addr_aligned(src_vec_ptr);
        HVX_Vector curr = src_ptr_aligned ? prev : *src_vec_ptr;
        src_vec_ptr = src_ptr_aligned ? src_vec_ptr : src_vec_ptr + 1;
        HVX_Vector s0 = Q6_V_valign_VVR(curr, prev, (size_t) src);
        *dst_vec_ptr++ = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(s0, scale_vec));
        prev = curr;
    }

    if (leftover > 0) {
        // handle the leftover elements
        HVX_Vector curr =
            (leftover_bytes + hexagon::unaligned_bytes(src_vec_ptr) > hexagon::kBytesPerVector) ? *src_vec_ptr : prev;
        curr = Q6_V_valign_VVR(curr, prev, (size_t) src);
        q6op_vstu_variable_ARV(dst_vec_ptr, leftover_bytes, Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(curr, scale_vec)));
    }
}

// TODO: merge with element_wise_op?
template <auto _RowFunc> bool unary_op(hexagon::tensor * out, hexagon::compute_params * params) {
    using data_type  = typename get_data_type<decltype(_RowFunc)>::type;
    using param_type = typename get_data_type<decltype(_RowFunc)>::param_type;

    if (!out) {
        return false;
    }

    static_assert(DEVICE_TENSOR_MAX_DIMS == 4, "element_wise_op requires max dims 4");
    auto * src0 = out->get_src(0);
    if (!src0) {
        return true; // skip if no src
    }

    const auto * src0_ptr = reinterpret_cast<const uint8_t *>(src0->get_read_buffer());
    auto * dst_ptr = reinterpret_cast<uint8_t *>(out->get_write_buffer());
    auto total_rows = out->get_ne(3) * out->get_ne(2) * out->get_ne(1);
    const auto rows_per_cube = out->get_ne(2) * out->get_ne(1);
    const auto start_end = hexagon::get_thread_work_slice(total_rows, params->tidx, params->tcnt);
    if (start_end.first >= start_end.second) {
        return true;
    }

    DEVICE_SCOPED_OP_PERFORMANCE_TRACKER(out, params->tidx);

    const auto param = out->get_op_param<param_type>(0);
    const size_t valid_row_bytes = src0->get_ne(0) * sizeof(data_type);
    for (int64_t ir = start_end.first; ir < start_end.second; ++ir) {
        const auto i03 = ir / rows_per_cube;
        const auto i02 = ir / out->get_ne(1) - i03 * out->get_ne(2);
        const auto i01 = ir % out->get_ne(1); // TODO: should we use divide instead of mod?

        auto * src0_row = src0_ptr + i03 * src0->get_nb(3) + i02 * src0->get_nb(2) + i01 * src0->get_nb(1);
        auto * dst_row = dst_ptr + i03 * out->get_nb(3) + i02 * out->get_nb(2) + i01 * out->get_nb(1);
        if (ir + 1 < start_end.second) {
            hexagon::l2fetch_row(src0_row + src0->get_nb(1), valid_row_bytes);
        }

        _RowFunc(reinterpret_cast<const data_type *>(src0_row), static_cast<size_t>(out->get_ne(0)), param,
                 reinterpret_cast<data_type *>(dst_row));
    }

    return true;
}

bool is_unary_op_supported(const npu_device_tensor_spec & src0, const npu_device_tensor_spec & src1,
                           const npu_device_tensor_spec & dst, npu_device_tensor_op op) {
    if (op != NPU_OP_RMS_NORM) {
        DEVICE_LOG_DEBUG("[%s]unsupported\n", hexagon::op_get_name(op));
        return false;
    }

    if (dst.type != src0.type) {
        DEVICE_LOG_DEBUG("[%s]src0.type and dst.type mismatch: %s vs %s\n", hexagon::op_get_name(op),
                         hexagon::get_type_name(src0.type), hexagon::get_type_name(dst.type));
        return false;
    }

    if (dst.type != NPU_DATA_TYPE_F32) {
        DEVICE_LOG_DEBUG("[%s]unsupported data type: %s\n", hexagon::op_get_name(op), hexagon::get_type_name(dst.type));
        return false;
    }

    if (!is_same_shape(src0, dst)) {
        DEVICE_LOG_DEBUG("[%s]src0 and dst have different shape\n", hexagon::op_get_name(op));
        return false;
    }

    return true;

@@ -211,6 +366,7 @@ struct op_capabilities {
    npu_device_tensor_op op;
    hexagon::op_is_supported_func_type is_supported;
    hexagon::compute_func_type compute_funcs[NPU_DATA_TYPE_COUNT];
    bool requires_thread_barrier = false;
};

constexpr const op_capabilities kOpCapabilities[] = {

@@ -219,22 +375,36 @@ constexpr const op_capabilities kOpCapabilities[] = {
     {
         hexagon::mul_mat_f32, // NPU_DATA_TYPE_F32
         nullptr, // NPU_DATA_TYPE_F16
     }, },
    { NPU_OP_ADD,
     is_element_wise_op_supported, {
         element_wise_op<vec_op_f32_f32<vadd_f32_f32>>, // NPU_DATA_TYPE_F32
         element_wise_op<vec_op_f16_f16<vadd_f16_f16>>, // NPU_DATA_TYPE_F16
     } },
    { NPU_OP_SUB,
     is_element_wise_op_supported, {
         element_wise_op<vec_op_f32_f32<vsub_f32_f32>>, // NPU_DATA_TYPE_F32
         element_wise_op<vec_op_f16_f16<vsub_f16_f16>>, // NPU_DATA_TYPE_F16
     } },
    { NPU_OP_MUL,
     is_element_wise_op_supported, {
         element_wise_op<vec_op_f32_f32<vmul_f32_f32>>, // NPU_DATA_TYPE_F32
         element_wise_op<vec_op_f16_f16<vmul_f16_f16>>, // NPU_DATA_TYPE_F16
     } },
     }, true,
    },
    {
     NPU_OP_ADD, is_element_wise_op_supported,
     {
         element_wise_op<vec_op_f32_f32<vadd_f32_f32>>, // NPU_DATA_TYPE_F32
         element_wise_op<vec_op_f16_f16<vadd_f16_f16>>, // NPU_DATA_TYPE_F16
     }, false,
    },
    {
     NPU_OP_SUB, is_element_wise_op_supported,
     {
         element_wise_op<vec_op_f32_f32<vsub_f32_f32>>, // NPU_DATA_TYPE_F32
         element_wise_op<vec_op_f16_f16<vsub_f16_f16>>, // NPU_DATA_TYPE_F16
     }, false,
    },
    {
     NPU_OP_MUL, is_element_wise_op_supported,
     {
         element_wise_op<vec_op_f32_f32<vmul_f32_f32>>, // NPU_DATA_TYPE_F32
         element_wise_op<vec_op_f16_f16<vmul_f16_f16>>, // NPU_DATA_TYPE_F16
     }, false,
    },
    {
     NPU_OP_RMS_NORM, is_unary_op_supported,
     {
         unary_op<rms_norm_vec_f32>, // NPU_DATA_TYPE_F32
         nullptr, // NPU_DATA_TYPE_F16
     }, false,
    },
};

static_assert(kOpCapabilities[NPU_OP_MUL_MAT].compute_funcs[NPU_DATA_TYPE_F32] == hexagon::mul_mat_f32,

@@ -243,6 +413,8 @@ static_assert(kOpCapabilities[NPU_OP_MUL_MAT].compute_funcs[NPU_DATA_TYPE_F32] =
static_assert(std::size(kOpCapabilities) == NPU_OP_COUNT);
static_assert(kOpCapabilities[NPU_OP_MUL_MAT].op == NPU_OP_MUL_MAT, "kOpArray[NPU_OP_MUL_MAT].op != NPU_OP_MUL_MAT");
static_assert(kOpCapabilities[NPU_OP_MUL].op == NPU_OP_MUL, "kOpArray[NPU_OP_MUL].op != NPU_OP_MUL");
static_assert(kOpCapabilities[NPU_OP_RMS_NORM].op == NPU_OP_RMS_NORM,
              "kOpArray[NPU_OP_RMS_NORM].op != NPU_OP_RMS_NORM");

hexagon::compute_func_type get_compute_func_impl(npu_device_tensor_op op, npu_device_tensor_data_type type) {
    if (op >= NPU_OP_COUNT) {

@@ -260,6 +432,14 @@ compute_func_type get_compute_func(tensor * dst) {
    return get_compute_func_impl(dst->get_op(), dst->get_type());
}

bool requires_thread_barrier(npu_device_tensor_op op) {
    if (op >= NPU_OP_COUNT) {
        return false;
    }

    return kOpCapabilities[op].requires_thread_barrier;
}

bool support_op(const npu_device_tensor_spec & src0, const npu_device_tensor_spec & src1,
                const npu_device_tensor_spec & dst, npu_device_tensor_op op) {
    if (get_compute_func_impl(op, dst.type) == nullptr) {

@@ -6,6 +6,8 @@ namespace hexagon {

compute_func_type get_compute_func(tensor * dst);

bool requires_thread_barrier(npu_device_tensor_op op);

bool support_op(const npu_device_tensor_spec & src0, const npu_device_tensor_spec & src1,
                const npu_device_tensor_spec & dst, npu_device_tensor_op op);

@@ -3,44 +3,43 @@
#include <HTP/core/intrinsics.h>

#include "quants.hpp"
#include "thread_pool.hpp" // TODO: remove this dependency
#include "vtcm_mem.hpp"

namespace {

inline float vec_reduction_f32(HVX_Vector sums) {
    constexpr const size_t kFloatsPerVector = hexagon::kBytesPerVector / sizeof(float);
    static_assert(kFloatsPerVector == 32 || kFloatsPerVector == 16, "kFloatsPerVector should be 16 or 32");

    // TODO: do we have a better way to do the reduction?
    switch (kFloatsPerVector) {
        default:
        case 32:
            sums = Q6_Vqf32_vadd_Vqf32Vqf32(sums, Q6_V_vror_VR(sums, 16 * sizeof(float)));
            // fallthrough
        case 16:
            sums = Q6_Vqf32_vadd_Vqf32Vqf32(sums, Q6_V_vror_VR(sums, 8 * sizeof(float)));
            sums = Q6_Vqf32_vadd_Vqf32Vqf32(sums, Q6_V_vror_VR(sums, 4 * sizeof(float)));
            sums = Q6_Vqf32_vadd_Vqf32Vqf32(sums, Q6_V_vror_VR(sums, 2 * sizeof(float)));
            sums = Q6_Vqf32_vadd_Vqf32Vqf32(sums, Q6_V_vror_VR(sums, sizeof(float)));
            break;
    }

    return hexagon::get_flt0_from_fltv(Q6_Vsf_equals_Vqf32(sums));
}

inline float vec_dot_product_f32_f32(const float * src0, const float * src1, size_t count) {
    constexpr const size_t kElementsPerVector = hexagon::kBytesPerVector / sizeof(float);

    HVX_Vector * iptr0     = ((HVX_Vector *) src0);
    HVX_Vector * iptr0_end = ((HVX_Vector *) src0) + (count / kElementsPerVector);
    HVX_Vector * iptr1     = ((HVX_Vector *) src1);
    HVX_Vector prev0       = *iptr0++;
    HVX_Vector prev1       = *iptr1++;
    HVX_Vector sum         = Q6_V_vzero();
    HVX_Vector * src0_vec_ptr     = ((HVX_Vector *) src0);
    HVX_Vector * src0_vec_ptr_end = ((HVX_Vector *) src0) + count / kElementsPerVector;
    HVX_Vector * src1_vec_ptr     = ((HVX_Vector *) src1);
    HVX_Vector prev0              = *src0_vec_ptr++;
    HVX_Vector prev1              = *src1_vec_ptr++;
    HVX_Vector sum                = Q6_V_vzero();

    while (iptr0 < iptr0_end) {
        HVX_Vector curr0 = *iptr0++;
        HVX_Vector curr1 = *iptr1++;
    while (src0_vec_ptr_end - src0_vec_ptr > 1) {
        HVX_Vector curr0_lo = src0_vec_ptr[0];
        HVX_Vector curr0_hi = src0_vec_ptr[1];
        HVX_Vector curr1_lo = src1_vec_ptr[0];
        HVX_Vector curr1_hi = src1_vec_ptr[1];

        HVX_Vector l0 = Q6_V_valign_VVR(curr0_lo, prev0, (size_t) src0);
        HVX_Vector l1 = Q6_V_valign_VVR(curr1_lo, prev1, (size_t) src1);
        HVX_Vector h0 = Q6_V_valign_VVR(curr0_hi, curr0_lo, (size_t) src0);
        HVX_Vector h1 = Q6_V_valign_VVR(curr1_hi, curr1_lo, (size_t) src1);
        sum = Q6_Vqf32_vadd_Vqf32Vqf32(Q6_Vqf32_vmpy_VsfVsf(l0, l1), sum);
        sum = Q6_Vqf32_vadd_Vqf32Vqf32(Q6_Vqf32_vmpy_VsfVsf(h0, h1), sum);

        prev0 = curr0_hi;
        prev1 = curr1_hi;
        src0_vec_ptr += 2;
        src1_vec_ptr += 2;
    }

    if (src0_vec_ptr_end - src0_vec_ptr > 0) {
        HVX_Vector curr0 = *src0_vec_ptr++;
        HVX_Vector curr1 = *src1_vec_ptr++;
        HVX_Vector s0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0);
        HVX_Vector s1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1);
        sum = Q6_Vqf32_vadd_Vqf32Vqf32(Q6_Vqf32_vmpy_VsfVsf(s0, s1), sum);

@@ -48,17 +47,17 @@ inline float vec_dot_product_f32_f32(const float * src0, const float * src1, siz
        prev1 = curr1;
    }

    if ((iptr0_end - ((HVX_Vector *) src0)) > 0) {
    if ((src0_vec_ptr_end - ((HVX_Vector *) src0)) > 0) {
        // handle the last vector
        // see also:
        // https://github.com/UbiquitousLearning/mllm/blob/babf4410352ce8730824c87699c025a0d4ce3a6f/src/backends/qnn/LLaMAOpPackageHtp/LLaMAPackage/src/ops/LLaMAMul.cpp#L147
        // or qualcomm sdk libs\qhl_hvx\src\qhblas_hvx\qhblas_hvx_aw_vector_add_ah.c
        bool iptr0_aligned = hexagon::is_addr_aligned(iptr0);
        HVX_Vector curr0 = iptr0_aligned ? prev0 : *iptr0;
        iptr0 = iptr0_aligned ? iptr0 : iptr0 + 1;
        bool iptr1_aligned = hexagon::is_addr_aligned(iptr1);
        HVX_Vector curr1 = iptr1_aligned ? prev1 : *iptr1;
        iptr1 = iptr1_aligned ? iptr1 : iptr1 + 1;
        bool iptr0_aligned = hexagon::is_addr_aligned(src0_vec_ptr);
        HVX_Vector curr0 = iptr0_aligned ? prev0 : *src0_vec_ptr;
        src0_vec_ptr = iptr0_aligned ? src0_vec_ptr : src0_vec_ptr + 1;
        bool iptr1_aligned = hexagon::is_addr_aligned(src1_vec_ptr);
        HVX_Vector curr1 = iptr1_aligned ? prev1 : *src1_vec_ptr;
        src1_vec_ptr = iptr1_aligned ? src1_vec_ptr : src1_vec_ptr + 1;
        HVX_Vector s0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0);
        HVX_Vector s1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1);
        sum = Q6_Vqf32_vadd_Vqf32Vqf32(Q6_Vqf32_vmpy_VsfVsf(s0, s1), sum);

@@ -70,19 +69,21 @@ inline float vec_dot_product_f32_f32(const float * src0, const float * src1, siz
    const size_t leftover_bytes = leftover * sizeof(float);
    if (leftover > 0) {
        // handle the leftover elements
        HVX_Vector curr0 =
            (leftover_bytes + hexagon::unaligned_bytes(iptr0) > hexagon::kBytesPerVector) ? *iptr0 : prev0;
        curr0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0);
        HVX_Vector curr0 = (leftover_bytes + hexagon::unaligned_bytes(src0_vec_ptr) > hexagon::kBytesPerVector) ?
                               *src0_vec_ptr :
                               prev0;
        curr0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0);

        HVX_Vector curr1 =
            (leftover_bytes + hexagon::unaligned_bytes(iptr1) > hexagon::kBytesPerVector) ? *iptr1 : prev1;
        curr1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1);
        HVX_Vector curr1 = (leftover_bytes + hexagon::unaligned_bytes(src1_vec_ptr) > hexagon::kBytesPerVector) ?
                               *src1_vec_ptr :
                               prev1;
        curr1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1);

        sum = Q6_Vqf32_vadd_Vqf32Vqf32(
            Q6_V_valign_VVR(Q6_Vqf32_vmpy_VsfVsf(curr0, curr1), Q6_V_vzero(), leftover_bytes), sum);
    }

    return vec_reduction_f32(sum);
    return hexagon::vec_reduction_f32(sum);
}

// TODO: merge with vec_dot_product_f32_f32?

@@ -90,17 +91,17 @@ inline float vec_dot_product_f16_f16(const npu_device_fp16_t * src0, const npu_d
    constexpr const size_t kElementsPerVector = hexagon::kBytesPerVector / sizeof(npu_device_fp16_t);
    constexpr const size_t kFloatsPerVector = hexagon::kBytesPerVector / sizeof(float);

    HVX_Vector * iptr0     = ((HVX_Vector *) src0);
    HVX_Vector * iptr0_end = ((HVX_Vector *) src0) + (count / kElementsPerVector);
    HVX_Vector * iptr1     = ((HVX_Vector *) src1);
    HVX_Vector prev0       = *iptr0++;
    HVX_Vector prev1       = *iptr1++;
    HVX_Vector sum_hi      = Q6_V_vzero();
    HVX_Vector sum_lo      = Q6_V_vzero();
    HVX_Vector * src0_vec_ptr     = ((HVX_Vector *) src0);
    HVX_Vector * src0_vec_ptr_end = ((HVX_Vector *) src0) + (count / kElementsPerVector);
    HVX_Vector * src1_vec_ptr     = ((HVX_Vector *) src1);
    HVX_Vector prev0              = *src0_vec_ptr++;
    HVX_Vector prev1              = *src1_vec_ptr++;
    HVX_Vector sum_hi             = Q6_V_vzero();
    HVX_Vector sum_lo             = Q6_V_vzero();

    while (iptr0 < iptr0_end) {
        HVX_Vector curr0 = *iptr0++;
        HVX_Vector curr1 = *iptr1++;
    while (src0_vec_ptr < src0_vec_ptr_end) {
        HVX_Vector curr0 = *src0_vec_ptr++;
        HVX_Vector curr1 = *src1_vec_ptr++;
        HVX_Vector s0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0);
        HVX_Vector s1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1);
        HVX_VectorPair result = Q6_Wqf32_vmpy_VhfVhf(s0, s1);

@@ -110,17 +111,17 @@ inline float vec_dot_product_f16_f16(const npu_device_fp16_t * src0, const npu_d
        prev1 = curr1;
    }

    if ((iptr0_end - ((HVX_Vector *) src0)) > 0) {
    if ((src0_vec_ptr_end - ((HVX_Vector *) src0)) > 0) {
        // handle the last vector
        // see also:
        // https://github.com/UbiquitousLearning/mllm/blob/babf4410352ce8730824c87699c025a0d4ce3a6f/src/backends/qnn/LLaMAOpPackageHtp/LLaMAPackage/src/ops/LLaMAMul.cpp#L147
        // or qualcomm sdk libs\qhl_hvx\src\qhblas_hvx\qhblas_hvx_aw_vector_add_ah.c
        bool iptr0_aligned = hexagon::is_addr_aligned(iptr0);
        HVX_Vector curr0 = iptr0_aligned ? prev0 : *iptr0;
        iptr0 = iptr0_aligned ? iptr0 : iptr0 + 1;
        bool iptr1_aligned = hexagon::is_addr_aligned(iptr1);
        HVX_Vector curr1 = iptr1_aligned ? prev1 : *iptr1;
        iptr1 = iptr1_aligned ? iptr1 : iptr1 + 1;
        bool iptr0_aligned = hexagon::is_addr_aligned(src0_vec_ptr);
        HVX_Vector curr0 = iptr0_aligned ? prev0 : *src0_vec_ptr;
        src0_vec_ptr = iptr0_aligned ? src0_vec_ptr : src0_vec_ptr + 1;
        bool iptr1_aligned = hexagon::is_addr_aligned(src1_vec_ptr);
        HVX_Vector curr1 = iptr1_aligned ? prev1 : *src1_vec_ptr;
        src1_vec_ptr = iptr1_aligned ? src1_vec_ptr : src1_vec_ptr + 1;
        HVX_Vector s0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0);
        HVX_Vector s1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1);
        HVX_VectorPair result = Q6_Wqf32_vmpy_VhfVhf(s0, s1);

@@ -134,13 +135,15 @@ inline float vec_dot_product_f16_f16(const npu_device_fp16_t * src0, const npu_d
    const size_t leftover_bytes = leftover * sizeof(npu_device_fp16_t);
    if (leftover > 0) {
        // handle the leftover elements
        HVX_Vector curr0 =
            (leftover_bytes + hexagon::unaligned_bytes(iptr0) > hexagon::kBytesPerVector) ? *iptr0 : prev0;
        curr0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0);
        HVX_Vector curr0 = (leftover_bytes + hexagon::unaligned_bytes(src0_vec_ptr) > hexagon::kBytesPerVector) ?
                               *src0_vec_ptr :
                               prev0;
        curr0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0);

        HVX_Vector curr1 =
            (leftover_bytes + hexagon::unaligned_bytes(iptr1) > hexagon::kBytesPerVector) ? *iptr1 : prev1;
        curr1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1);
        HVX_Vector curr1 = (leftover_bytes + hexagon::unaligned_bytes(src1_vec_ptr) > hexagon::kBytesPerVector) ?
                               *src1_vec_ptr :
                               prev1;
        curr1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1);

        HVX_VectorPair result = Q6_Wqf32_vmpy_VhfVhf(curr0, curr1);

@@ -156,7 +159,7 @@ inline float vec_dot_product_f16_f16(const npu_device_fp16_t * src0, const npu_d
        }
    }

    return vec_reduction_f32(Q6_Vqf32_vadd_Vqf32Vqf32(sum_hi, sum_lo));
    return hexagon::vec_reduction_f32(Q6_Vqf32_vadd_Vqf32Vqf32(sum_hi, sum_lo));
}

template <typename T> struct get_data_type {};

@@ -208,70 +211,118 @@ void mul_mat_impl(hexagon::tensor * src0, hexagon::tensor * src1, hexagon::tenso
    }

    // cache the src0 plane in VTCM
    const size_t src0_plane_row_count = start_end_element.second - start_end_element.first;
    size_t src0_plane_cache_size = 0;
    uint8_t * src0_plane_cache_ptr = nullptr;
    const uint8_t * last_cached_plane_ptr = nullptr;
    size_t src0_plane_slice_row_count = start_end_element.second - start_end_element.first;
    size_t src0_plane_cache_size = 0;
    uint8_t * src0_plane_cache_ptr = nullptr;
    const uint8_t * last_cached_plane_ptr = nullptr;
    bool is_mem_cache = false;
    if (is_quantized) {
        src0_plane_cache_size = src0_actual_row_size * src0_plane_row_count;
        src0_plane_cache_ptr = params->get_cache(src0_plane_cache_size, is_quantized);
        src0_plane_slice_row_count =
            std::min(params->vtcm_quota_size / src0_actual_row_size, src0_plane_slice_row_count);
        src0_plane_cache_size = src0_actual_row_size * src0_plane_slice_row_count;
        src0_plane_cache_ptr = params->get_vtcm_cache(src0_plane_cache_size);
        if (src0_plane_cache_ptr == nullptr) {
            DEVICE_LOG_DEBUG(
                "mul_mat_impl: failed to get VTCM cache for src0, size: %zu, src0_plane_slice_row_count: %zu, "
                "src0_actual_row_size: %zu, will fallback to mem cache\n",
                src0_plane_cache_size, src0_plane_slice_row_count, src0_actual_row_size);
            src0_plane_cache_ptr = params->get_mem_cache(src0_plane_cache_size);
            is_mem_cache = true;
        }
    }

    DEVICE_LOG_DEBUG("mul_mat_impl src0_actual_row_size: %zu, is_quantized: %d, vtcm_mem: %p(%zu)\n",
                     src0_actual_row_size, is_quantized, (void *) src0_plane_cache_ptr, src0_plane_cache_size);
    DEVICE_LOG_DEBUG(
        "mul_mat_impl src0_actual_row_size: %zu, src0_plane_slice_row_count: %zu, is_quantized: %d, vtcm_mem: "
        "%p(%zu)\n",
        src0_actual_row_size, src0_plane_slice_row_count, is_quantized, (void *) src0_plane_cache_ptr,
        src0_plane_cache_size);

    const size_t valid_row_bytes = src1->get_ne(0) * sizeof(data_type);
    DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_WITH_SUB_PROC(dst, params->tidx, dequant);
    for (int64_t ip = start_end_plane.first; ip < start_end_plane.second; ip++) {
        const auto i3 = ip / dst->get_ne(2);
        const auto i2 = ip - i3 * dst->get_ne(2);
        const auto * src0_plane = src0_ptr + i3 / r03 * src0->get_nb(3) + i2 / r02 * src0->get_nb(2) +
                                  start_end_element.first * src0->get_nb(1);
        const auto * src1_plane = src1_ptr + i3 * src1->get_nb(3) + i2 * src1->get_nb(2);
        auto * dst_plane = dst_ptr + i3 * dst->get_nb(3) + i2 * dst->get_nb(2);
        for (int64_t col_idx = start_end_element.first; col_idx < start_end_element.second;
             col_idx += src0_plane_slice_row_count) {
            const auto * src0_plane =
                src0_ptr + i3 / r03 * src0->get_nb(3) + i2 / r02 * src0->get_nb(2) + col_idx * src0->get_nb(1);
        if (src0_plane_cache_ptr) {
            if (last_cached_plane_ptr != src0_plane) {
                DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_ADD_SUB_PROC(dequant);

            if (src0_plane_cache_ptr) {
                if (last_cached_plane_ptr != src0_plane) {
                    DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_ADD_SUB_PROC(dequant);
                    for (int64_t ir = 0; ir < (int64_t) src0_plane_slice_row_count; ir++) {
                        auto * src0_row = src0_plane + ir * src0->get_nb(1);
                        if (ir + 1 < src0_plane_slice_row_count) {
                            hexagon::l2fetch_row(src0_row + src0->get_nb(1), src0->get_nb(1));
                        }

                for (int64_t ir = 0; ir < (int64_t) src0_plane_row_count; ir++) {
                    auto * src0_row = src0_plane + ir * src0->get_nb(1);
                    if (ir + 1 < src0_plane_row_count) {
                        hexagon::l2fetch_row(src0_row + src0->get_nb(1), src0->get_nb(1));
                        auto * dst_row = reinterpret_cast<float *>(src0_plane_cache_ptr + ir * src0_actual_row_size);
                        dequantize_row_func(src0_row, reinterpret_cast<float *>(dst_row), src0->get_ne(0),
                                            params->f16_to_f32_table);
                    }

                    auto * dst_row = reinterpret_cast<float *>(src0_plane_cache_ptr + ir * src0_actual_row_size);
                    dequantize_row_func(src0_row, reinterpret_cast<float *>(dst_row), src0->get_ne(0),
                                        params->f16_to_f32_table);
                    last_cached_plane_ptr = src0_plane;
                }

                last_cached_plane_ptr = src0_plane;
                src0_plane = src0_plane_cache_ptr;
            }

            src0_plane = src0_plane_cache_ptr;
        }

        for (int64_t i1 = start_end_row.first; i1 < start_end_row.second; i1++) {
            auto * src1_row = src1_plane + i1 * src1->get_nb(1);
            auto * dst_row = reinterpret_cast<float *>(dst_plane + i1 * dst->get_nb(1)) + start_end_element.first;
            for (int64_t i0 = 0; i0 < (int64_t) src0_plane_row_count; i0++) {
                auto * src0_row = src0_plane + i0 * src0_actual_row_size;
                if (i0 + 1 < src0_plane_row_count) {
                    if (!src0_plane_cache_ptr) {
                        hexagon::l2fetch_row(src0_row + src0_actual_row_size, valid_row_bytes);
            for (int64_t i1 = start_end_row.first; i1 < start_end_row.second; i1++) {
                auto * src1_row = src1_plane + i1 * src1->get_nb(1);
                auto * dst_row = reinterpret_cast<float *>(dst_plane + i1 * dst->get_nb(1)) + col_idx;
                for (int64_t i0 = 0; i0 < (int64_t) src0_plane_slice_row_count; i0++) {
                    auto * src0_row = src0_plane + i0 * src0_actual_row_size;
                    if (i0 + 1 < src0_plane_slice_row_count) {
                        if (!src0_plane_cache_ptr || is_mem_cache) {
                            hexagon::l2fetch_row(src0_row + src0_actual_row_size, valid_row_bytes);
                        }
                    } else if (ip + 1 < start_end_plane.second) {
                        hexagon::l2fetch_row(src1_row + src1->get_nb(1), valid_row_bytes);
                    }
                } else if (ip + 1 < start_end_plane.second) {
                    hexagon::l2fetch_row(src1_row + src1->get_nb(1), valid_row_bytes);
                }

                // TODO: figure dst how to handle a entire row
                dst_row[i0] = _DotFunc(reinterpret_cast<const data_type *>(src0_row),
                                       reinterpret_cast<const data_type *>(src1_row), (size_t) src0->get_ne(0));
                    // TODO: figure dst how to handle a entire row
                    dst_row[i0] = _DotFunc(reinterpret_cast<const data_type *>(src0_row),
                                           reinterpret_cast<const data_type *>(src1_row), (size_t) src0->get_ne(0));
                }
            }
        }
    }
}

bool is_quantized_mul_mat_supported(const npu_device_tensor_spec & src0, const npu_device_tensor_spec & src1) {
    if (src1.type != NPU_DATA_TYPE_F32) {
        DEVICE_LOG_DEBUG("[MUL_MAT]src0.type(%s) and src1.type(%s) mismatch and src1 is not F32\n",
                         hexagon::get_type_name(src0.type), hexagon::get_type_name(src1.type));
        return false;
    }

    const auto type_traits = hexagon::get_type_traits(src0.type);
    if (!type_traits.is_quantized || type_traits.dequantize_row == nullptr) {
        DEVICE_LOG_DEBUG("[MUL_MAT]src0.type(%s) and src1.type(%s) mismatch and src0 is not quantized\n",
                         hexagon::get_type_name(src0.type), hexagon::get_type_name(src1.type));
        return false;
    }

    if (src0.ne[0] % type_traits.blck_size) {
        DEVICE_LOG_DEBUG("[MUL_MAT]src0.type(%s) ne[0] is not aligned: %ld\n", hexagon::get_type_name(src0.type),
                         (long) src0.ne[0]);
        return false;
    }

    const auto vtcm_thread_quota_size = hexagon::vtcm_mem::get_total_size() / hexagon::kMaxThreadCount;
    if (src0.ne[0] * sizeof(hexagon::dequantized_element_type) > vtcm_thread_quota_size) {
        DEVICE_LOG_DEBUG("[MUL_MAT]src0.type(%s) ne[0] is too large: %ld, vtcm_thread_quota_size: %zu\n",
                         hexagon::get_type_name(src0.type), (long) src0.ne[0], vtcm_thread_quota_size);
        return false;
    }

    DEVICE_LOG_DEBUG("[MUL_MAT]supported quantized src0.type(%s) and src1.type(%s)\n",
                     hexagon::get_type_name(src0.type), hexagon::get_type_name(src1.type));
    return true;
}

} // namespace

namespace hexagon {

@@ -319,27 +370,9 @@ bool is_mul_mat_supported(const npu_device_tensor_spec & src0, const npu_device_

    if (src0.type != src1.type) {
#ifdef GGML_HEXAGON_ENABLE_QUANTIZED_TENSORS
        if (src1.type != NPU_DATA_TYPE_F32) {
            DEVICE_LOG_DEBUG("[%s]src0.type(%s) and src1.type(%s) mismatch and src1 is not F32\n", op_get_name(op),
                             get_type_name(src0.type), get_type_name(src1.type));
        if (!is_quantized_mul_mat_supported(src0, src1)) {
            return false;
        }

        const auto type_traits = get_type_traits(src0.type);
        if (!type_traits.is_quantized || type_traits.dequantize_row == nullptr) {
            DEVICE_LOG_DEBUG("[%s]src0.type(%s) and src1.type(%s) mismatch and src0 is not quantized\n",
                             op_get_name(op), get_type_name(src0.type), get_type_name(src1.type));
            return false;
        }

        if (src0.ne[0] % type_traits.blck_size) {
            DEVICE_LOG_DEBUG("[%s]src0.type(%s) ne[0] is not aligned: %ld\n", op_get_name(op), get_type_name(src0.type),
                             (long) src0.ne[0]);
            return false;
        }

        DEVICE_LOG_DEBUG("[%s]supported quantized src0.type(%s) and src1.type(%s)\n", op_get_name(op),
                         get_type_name(src0.type), get_type_name(src1.type));
#else
        DEVICE_LOG_DEBUG("[%s]src0.type(%s) and src1.type(%s) mismatch and quantized tensors are not supported\n",
                         op_get_name(op), get_type_name(src0.type), get_type_name(src1.type));

@@ -7,11 +7,6 @@

namespace hexagon {

constexpr const size_t kBytesPerVector = sizeof(HVX_Vector); // 128 for v73
constexpr const size_t kAlignMask = kBytesPerVector - 1;
constexpr const size_t kL2CacheSize = 8 * 1024; // 8KB L2 cache
constexpr const size_t kL2FetchAheadVectors = kL2CacheSize / kBytesPerVector;

inline size_t unaligned_bytes(const void * addr) {
    return ((size_t) addr) & kAlignMask;
}

@@ -43,6 +38,31 @@ inline float get_flt0_from_fltv(HVX_Vector vect) {
    return cvt.f;
}

inline HVX_Vector vec_reduction_qf32(HVX_Vector sums) {
    constexpr const size_t kFloatsPerVector = hexagon::kBytesPerVector / sizeof(float);
    static_assert(kFloatsPerVector == 32 || kFloatsPerVector == 16, "kFloatsPerVector should be 16 or 32");

    // TODO: do we have a better way to do the reduction?
    switch (kFloatsPerVector) {
        default:
        case 32:
            sums = Q6_Vqf32_vadd_Vqf32Vqf32(sums, Q6_V_vror_VR(sums, 16 * sizeof(float)));
            // fallthrough
        case 16:
            sums = Q6_Vqf32_vadd_Vqf32Vqf32(sums, Q6_V_vror_VR(sums, 8 * sizeof(float)));
            sums = Q6_Vqf32_vadd_Vqf32Vqf32(sums, Q6_V_vror_VR(sums, 4 * sizeof(float)));
            sums = Q6_Vqf32_vadd_Vqf32Vqf32(sums, Q6_V_vror_VR(sums, 2 * sizeof(float)));
            sums = Q6_Vqf32_vadd_Vqf32Vqf32(sums, Q6_V_vror_VR(sums, sizeof(float)));
            break;
    }

    return sums;
}

inline float vec_reduction_f32(HVX_Vector sums) {
    return hexagon::get_flt0_from_fltv(Q6_Vsf_equals_Vqf32(vec_reduction_qf32(sums)));
}

bool mul_mat_f32(tensor * out, compute_params * params);
bool is_mul_mat_supported(const npu_device_tensor_spec & src0, const npu_device_tensor_spec & src1,
                          const npu_device_tensor_spec & dst, npu_device_tensor_op op);

@@ -1,5 +1,7 @@
#pragma once

#include <hexagon_types.h>

#include <algorithm>
#include <cstdint>
#include <memory>

@@ -15,26 +17,25 @@ namespace hexagon {
struct compute_params {
    const size_t tidx;
    const size_t tcnt;
    const size_t vtcm_quota_size;
    const float * f16_to_f32_table;
    std::unique_ptr<hexagon::vtcm_mem> vtcm_cache;
    std::unique_ptr<uint8_t[]> mem_cache;
    size_t mem_cache_size = 0;

    uint8_t * get_cache(size_t size, bool fallback_to_mem) {
    uint8_t * get_vtcm_cache(size_t size) {
        if (!vtcm_cache || vtcm_cache->get_size() < size) {
            vtcm_cache = std::make_unique<hexagon::vtcm_mem>(size, false);
        }

        if (vtcm_cache->is_valid()) {
            return vtcm_cache->get_mem();
        }

        if (!fallback_to_mem) {
            DEVICE_LOG_DEBUG("vtcm_mem not valid, return nullptr\n");
        if (!vtcm_cache->is_valid()) {
            return nullptr;
        }

        DEVICE_LOG_DEBUG("vtcm_mem not valid, allocate from mem_cache\n");
        return vtcm_cache->get_mem();
    }

    uint8_t * get_mem_cache(size_t size) {
        if (!mem_cache || mem_cache_size < size) {
            mem_cache = std::make_unique<uint8_t[]>(size + 256);
            mem_cache_size = mem_cache ? size : 0;

@@ -49,10 +50,31 @@ typedef bool (*op_is_supported_func_type)(const npu_device_tensor_spec & src0, c
                                          const npu_device_tensor_spec & dst, npu_device_tensor_op op);

inline constexpr std::pair<int64_t, int64_t> get_thread_work_slice(int64_t total, size_t tidx, size_t tcnt) {
    const auto elements_per_thread = (total + tcnt - 1) / tcnt;
    const auto start = tidx * elements_per_thread;
    const auto end = std::min<int64_t>(start + elements_per_thread, total);
    return { start, end };
    if (total <= 0 || tidx >= tcnt) {
        return { 0, 0 }; // No work for this thread
    }

    const auto elements_per_thread = total / tcnt;
    const auto remainder = total % tcnt;

    int64_t start = 0;
    int64_t end = 0;
    if (tidx < remainder) {
        // First 'remainder' threads get one extra item
        start = tidx * (elements_per_thread + 1);
        end = start + elements_per_thread + 1;
    } else {
        // Remaining threads get the base number of elements
        start = remainder * (elements_per_thread + 1) + (tidx - remainder) * elements_per_thread;
        end = start + elements_per_thread;
    }

    return { start, std::min(end, total) };
}

constexpr const size_t kBytesPerVector = sizeof(HVX_Vector); // 128 for v73
constexpr const size_t kAlignMask = kBytesPerVector - 1;
constexpr const size_t kL2CacheSize = 8 * 1024; // 8KB L2 cache
constexpr const size_t kL2FetchAheadVectors = kL2CacheSize / kBytesPerVector;

} // namespace hexagon

@@ -4,6 +4,8 @@

#include <array>

#include "op_types.hpp" // TODO: remove this include

static_assert(sizeof(npu_device_block_q4_K) ==
                  2 * sizeof(npu_device_fp16_t) + QUANT_K_SCALE_SIZE + QUANT_K_BLOCK_SIZE / 2,
              "wrong q4_K block size/padding");

@@ -16,14 +18,34 @@ static_assert(sizeof(npu_device_block_q8_0) == sizeof(npu_device_fp16_t) + QUANT

namespace {

inline float to_float(const npu_device_fp16_t src) {
    union {
        __fp16 f16;
        npu_device_fp16_t u16;
    } f16;
inline HVX_Vector vmemu(const void * unaligned_ptr) {
    HVX_Vector ret = *reinterpret_cast<const HVX_UVector *>(unaligned_ptr);
    return ret;
}

    f16.u16 = src;
    return f16.f16;
inline float to_float(const npu_device_fp16_t src) {
    return reinterpret_cast<const __fp16 &>(src);
}

template <typename _TBlock> inline HVX_Vector load_block_generic(const _TBlock & src) {
    uint8_t buffer[hexagon::kBytesPerVector];

    static_assert(sizeof(buffer) == sizeof(HVX_Vector), "wrong cvt size/padding");
    static_assert(sizeof(buffer) >= sizeof(src.qs), "wrong q4_0 block size/padding");

    memcpy(&buffer[0], src.qs, sizeof(src.qs));
    return *reinterpret_cast<HVX_UVector *>(buffer);
}

template <typename _TBlock> inline HVX_Vector load_dual_block_generic(const _TBlock & src1, const _TBlock & src2) {
    uint8_t buffer[hexagon::kBytesPerVector];

    static_assert(sizeof(buffer) == sizeof(HVX_Vector), "wrong cvt size/padding");
    static_assert(sizeof(buffer) >= sizeof(src1.qs) * 2, "wrong q4_0 block size/padding");

    memcpy(&buffer[0], src1.qs, sizeof(src1.qs));
    memcpy(&buffer[sizeof(src1.qs)], src2.qs, sizeof(src2.qs));
    return *reinterpret_cast<HVX_UVector *>(buffer);
}

inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t * d, uint8_t * m) {

@@ -37,38 +59,78 @@ inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t * d, uint8_t * m)
}

void dequantize_row_q8_0(const void * src, float * dst, size_t count, const float * f16_to_f32_table) {
    constexpr const int qk = QUANT_BLOCK_SIZE;
    const int nb = count / qk;
    const auto * src_ptr = reinterpret_cast<const npu_device_block_q8_0 *>(src);
    constexpr const int qk = QUANT_BLOCK_SIZE;
    static_assert(QUANT_BLOCK_SIZE == hexagon::kBytesPerVector / sizeof(float));

    const int nb = count / qk;
    const auto * src_ptr = reinterpret_cast<const npu_device_block_q8_0 *>(src);
    HVX_UVector * out = ((HVX_UVector *) dst); // TODO: opt for aligned access

    // TODO: use intrinsics
    for (int i = 0; i < nb; i++) {
        const float d = f16_to_f32_table[src_ptr[i].d];
        const auto & src = src_ptr[i];
        HVX_Vector d = Q6_Vh_vsplat_R(src.d);

        for (int j = 0; j < qk; ++j) {
            dst[i * qk + j] = src_ptr[i].qs[j] * d;
        }
        HVX_Vector q_lo = load_block_generic(src);
        HVX_VectorPair q = Q6_Wh_vunpack_Vb(q_lo);
        q = Q6_Wh_vunpack_Vb(Q6_V_lo_W(q));
        q_lo = Q6_Vhf_equals_Vh(Q6_V_lo_W(q));
        q = Q6_Wqf32_vmpy_VhfVhf(q_lo, d);
        out[i] = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(q));
    }
}

void dequantize_row_q4_0(const void * src, float * dst, size_t count, const float * f16_to_f32_table) {
    constexpr const int qk = QUANT_BLOCK_SIZE;
    static_assert(qk % 2 == 0, "qk must be even");
    static_assert(QUANT_BLOCK_SIZE == hexagon::kBytesPerVector / sizeof(float));
    constexpr const uint32_t kSizeOfQs = sizeof(npu_device_block_q4_0::qs);

    const int nb = count / qk;
    const auto * src_ptr = reinterpret_cast<const npu_device_block_q4_0 *>(src);
    const int nb = count / qk;
    const auto * src_ptr = reinterpret_cast<const npu_device_block_q4_0 *>(src);
    HVX_Vector mask = Q6_Vb_vsplat_R(0x0F);
    HVX_Vector minus = Q6_Vb_vsplat_R(8);
    HVX_UVector * out = ((HVX_UVector *) dst); // TODO: opt for aligned access

    // TODO: use intrinsics
    for (int i = 0; i < nb; i++) {
        const float d = f16_to_f32_table[src_ptr[i].d];
    const int loop_count = nb - (nb % 2);
    for (int i = 0; i < loop_count; i += 2) {
        const auto & src1 = src_ptr[i];
        const auto & src2 = src_ptr[i + 1];

        for (int j = 0; j < qk / 2; ++j) {
            const int x0 = (src_ptr[i].qs[j] & 0x0F) - 8;
            const int x1 = ((src_ptr[i].qs[j] >> 4) & 0xF) - 8;
        HVX_Vector d1 = Q6_Vh_vsplat_R(src1.d);
        HVX_Vector d2 = Q6_Vh_vsplat_R(src2.d);
        d1 = Q6_V_valign_VVR(d1, Q6_V_vzero(), hexagon::kBytesPerVector / 2);
        d1 = Q6_V_valign_VVR(d2, d1, hexagon::kBytesPerVector / 2);
        HVX_Vector d = Q6_Vh_vshuff_Vh(d1);

            dst[i * qk + j + 0] = x0 * d;
            dst[i * qk + j + qk / 2] = x1 * d;
        }
        HVX_Vector q_lo = load_dual_block_generic(src1, src2);
        HVX_Vector q_hi = Q6_Vub_vlsr_VubR(q_lo, 4);
        HVX_VectorPair q = Q6_W_vshuff_VVR(q_hi, Q6_V_vand_VV(q_lo, mask), kSizeOfQs);
        q_lo = Q6_V_valign_VVR(Q6_V_lo_W(q), Q6_V_vzero(), hexagon::kBytesPerVector / 2);
        q_lo = Q6_V_valign_VVR(Q6_V_hi_W(q), q_lo, hexagon::kBytesPerVector / 2);
        q_lo = Q6_Vb_vshuff_Vb(q_lo);
        q_lo = Q6_Vb_vsub_VbVb(q_lo, minus);
        q = Q6_Wh_vunpack_Vb(q_lo);
        q_lo = Q6_Vhf_equals_Vh(Q6_V_lo_W(q));
        q = Q6_Wqf32_vmpy_VhfVhf(q_lo, d);
        out[i] = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(q));
        out[i + 1] = Q6_Vsf_equals_Vqf32(Q6_V_hi_W(q));
    }

    if (loop_count < nb) {
        const auto & curr_blk = src_ptr[nb - 1];
        HVX_Vector d = Q6_Vh_vsplat_R(curr_blk.d);

        HVX_Vector q_lo = load_block_generic(curr_blk);
        HVX_Vector q_hi = Q6_Vub_vlsr_VubR(q_lo, 4);
        q_lo = Q6_V_valign_VVR(Q6_V_vand_VV(q_lo, mask), Q6_V_vzero(), sizeof(curr_blk.qs));
        q_lo = Q6_V_valign_VVR(q_hi, q_lo, hexagon::kBytesPerVector - sizeof(curr_blk.qs));
        q_lo = Q6_Vb_vsub_VbVb(q_lo, minus);

        HVX_VectorPair q = Q6_Wh_vunpack_Vb(q_lo);
        q = Q6_Wh_vunpack_Vb(Q6_V_lo_W(q));
        q_lo = Q6_Vhf_equals_Vh(Q6_V_lo_W(q));
        q = Q6_Wqf32_vmpy_VhfVhf(q_lo, d);
        out[nb - 1] = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(q));
    }
}

@@ -23,13 +23,15 @@ inline bool is_quantized_type(npu_device_tensor_data_type type) {
    return get_type_traits(type).is_quantized;
}

inline size_t get_dequantized_row_size(tensor * tensor) {
using dequantized_element_type = float;

inline size_t get_dequantized_row_size(const tensor * tensor) {
    if (!is_quantized_type(tensor->get_type())) {
        return tensor->get_nb(1); // for f32 and f16
    }

    auto row_elems_count = tensor->get_ne(0);
    return row_elems_count * sizeof(float); // currently only f32 is supported
    return row_elems_count * sizeof(dequantized_element_type); // currently only f32 is supported
}

inline const char * get_type_name(npu_device_tensor_data_type type) {

@@ -8,7 +8,8 @@

namespace hexagon {

constexpr const size_t kMaxTensorSrc = DEVICE_TENSOR_MAX_SRC;
constexpr const size_t kMaxTensorSrc = DEVICE_TENSOR_MAX_SRC;
constexpr const size_t kMaxParamsCount = DEVICE_TENSOR_MAX_OP_PARAMS;

class tensor {
  public:

@@ -50,17 +51,17 @@ class tensor {
        }
    }

    bool set_src(size_t index, tensor * src) {
        if (index >= kMaxTensorSrc) {
            return false;
    void update_config(const npu_device_tensor_update_config & config) {
        static_assert(sizeof(_op_params) == sizeof(config.params), "op params size mismatch");

        _info.op = config.op;
        memcpy(_op_params, config.params, sizeof(_op_params));
        for (size_t i = 0; i < DEVICE_TENSOR_MAX_SRC; ++i) {
            auto src_handle = config.src_handles[i];
            _src[i] = (src_handle ? reinterpret_cast<tensor *>(src_handle) : nullptr);
        }

        _src[index] = src;
        return true;
    }

    void set_op(npu_device_tensor_op op) { _info.op = op; }

    tensor * get_src(size_t index) const {
        if (index >= kMaxTensorSrc) {
            return nullptr;

@@ -77,6 +78,20 @@ class tensor {

    npu_device_tensor_op get_op() const { return _info.op; }

    template <typename _TyParam> const _TyParam get_op_param(size_t index) const {
        static_assert(sizeof(_TyParam) <= sizeof(_op_params), "_op_param type size exceeds op params size");

        if (sizeof(_TyParam) * (index + 1) >= sizeof(_op_params)) {
            return 0;
        }

        return reinterpret_cast<const _TyParam *>(_op_params)[index];
    }

    const int32_t * get_op_params() const { return _op_params; }

    const size_t get_op_param_count() const { return kMaxParamsCount; }

    npu_device_tensor_data_type get_type() const { return _info.type; }

    const uint8_t * get_read_buffer() const {

@@ -89,9 +104,10 @@ class tensor {
    bool is_valid() const { return _data != nullptr; }

  private:
    npu_device_tensor_config _info;
    tensor * _src[kMaxTensorSrc] = {};
    uint8_t * _data = nullptr;
    npu_device_tensor_config _info = {};
    int32_t _op_params[kMaxParamsCount] = {};
    tensor * _src[kMaxTensorSrc] = {};
    uint8_t * _data = nullptr;

    DISABLE_COPY_AND_MOVE(tensor);
};

@@ -12,7 +12,7 @@

namespace hexagon {

constexpr const size_t kMaxThreadCount = 4;
constexpr const size_t kDefaultStackSize = 1024 * 16; // 16KB
constexpr const size_t kDefaultStackSize = 1024 * 32; // 32KB
constexpr const unsigned long long kThreadTaskPendingBit = 1;

template <size_t _stack_size> class qurt_thread {

@@ -80,7 +80,7 @@ using qurt_thread_ptr = std::unique_ptr<qurt_thread<kDefaultStackSize>>;

template <size_t _thread_count> class thread_pool {
    static_assert(_thread_count > 1, "Thread count must be greater than 1");
    constexpr const static size_t kMaxThreadCount = _thread_count - 1;
    constexpr const static size_t kMaxSubThreadCount = _thread_count - 1;

  public:
    typedef qurt_thread<kDefaultStackSize> thread_type;

@@ -88,9 +88,10 @@ template <size_t _thread_count> class thread_pool {

    thread_pool() {
        std::string thread_name_base = "thread_pool_";
        qurt_barrier_init(&_pending, kMaxThreadCount + 1);
        qurt_barrier_init(&_completed, kMaxThreadCount + 1);
        for (size_t i = 0; i < kMaxThreadCount; ++i) {
        qurt_barrier_init(&_pending, kMaxSubThreadCount + 1);
        qurt_barrier_init(&_completed, kMaxSubThreadCount + 1);
        const auto priority = qurt_thread_get_priority(qurt_thread_get_id());
        for (size_t i = 0; i < kMaxSubThreadCount; ++i) {
            auto & thread_arg = _thread_args[i];
            thread_arg.pool = this;
            thread_arg.thread_idx = i + 1;

@ -98,7 +99,7 @@ template <size_t _thread_count> class thread_pool {
|
|||
auto thread = std::make_unique<thread_type>(
|
||||
thread_name_base + std::to_string(i),
|
||||
reinterpret_cast<thread_type::qurt_thread_func_type>(&thread_pool::thread_func_impl), &thread_arg,
|
||||
QURT_THREAD_ATTR_PRIORITY_DEFAULT);
|
||||
priority);
|
||||
if (!thread->is_valid()) {
|
||||
DEVICE_LOG_ERROR("Failed to create thread: %zu", i);
|
||||
// destroy all barriers and threads at destructor
|
||||
|
|
@ -107,7 +108,7 @@ template <size_t _thread_count> class thread_pool {
|
|||
|
||||
_threads[i] = std::move(thread);
|
||||
}
|
||||
DEVICE_LOG_DEBUG("thread_pool.created: %zu", kMaxThreadCount);
|
||||
DEVICE_LOG_DEBUG("thread_pool.created: %zu", kMaxSubThreadCount);
|
||||
}
|
||||
|
||||
~thread_pool() {
|
||||
|
|
@ -133,7 +134,7 @@ template <size_t _thread_count> class thread_pool {
|
|||
_arg = arg;
|
||||
qurt_barrier_wait(&_pending);
|
||||
|
||||
task(this, 0, kMaxThreadCount + 1, arg);
|
||||
task(this, 0, kMaxSubThreadCount + 1, arg);
|
||||
DEVICE_LOG_DEBUG("main_thread.task_completed: 0");
|
||||
|
||||
qurt_barrier_wait(&_completed);
|
||||
|
|
@ -166,7 +167,7 @@ template <size_t _thread_count> class thread_pool {
|
|||
|
||||
auto task = pool._task;
|
||||
if (task) {
|
||||
task(arg->pool, arg->thread_idx, kMaxThreadCount + 1, pool._arg);
|
||||
task(arg->pool, arg->thread_idx, kMaxSubThreadCount + 1, pool._arg);
|
||||
}
|
||||
|
||||
DEVICE_LOG_DEBUG("thread_func_impl.task_completed: %zu", arg->thread_idx);
|
||||
|
|
@ -176,13 +177,13 @@ template <size_t _thread_count> class thread_pool {
|
|||
DEVICE_LOG_DEBUG("thread_func_impl.end: %zu", arg->thread_idx);
|
||||
}
|
||||
|
||||
std::atomic_bool _thread_exit = false;
|
||||
std::array<qurt_thread_ptr, kMaxThreadCount> _threads;
|
||||
thread_pool_arg _thread_args[kMaxThreadCount] = {};
|
||||
qurt_barrier_t _pending = {};
|
||||
qurt_barrier_t _completed = {};
|
||||
task_type _task = nullptr;
|
||||
void * _arg = nullptr;
|
||||
std::atomic_bool _thread_exit = false;
|
||||
std::array<qurt_thread_ptr, kMaxSubThreadCount> _threads;
|
||||
thread_pool_arg _thread_args[kMaxSubThreadCount] = {};
|
||||
qurt_barrier_t _pending = {};
|
||||
qurt_barrier_t _completed = {};
|
||||
task_type _task = nullptr;
|
||||
void * _arg = nullptr;
|
||||
|
||||
DISABLE_COPY_AND_MOVE(thread_pool);
|
||||
};
|
||||
|
|
|
|||
|
|
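The pool hands each task the pair (thread_idx, thread_count) seen in the calls above, with the caller acting as thread 0 and the workers now inheriting the caller's QuRT priority. A sketch of a task written against that contract; the dispatch entry point name and the exact task_type signature are assumptions, only the call shape above comes from the diff:

    // Illustrative only: even row split across the caller plus worker threads.
    static void fill_rows_example(hexagon::thread_pool<hexagon::kMaxThreadCount> * pool, size_t thread_idx,
                                  size_t thread_count, void * arg) {
        (void) pool;  // not needed for this sketch
        auto * data = static_cast<float *>(arg);
        constexpr size_t kRows = 1024;
        for (size_t r = thread_idx; r < kRows; r += thread_count) {
            data[r] = static_cast<float>(r);  // each thread touches a disjoint stride of rows
        }
    }
    // pool->sync_execute(fill_rows_example, data);  // assumed dispatch call, not shown in this hunk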
@@ -1,7 +1,9 @@
 #pragma once

+#include <AEEStdDef.h>
 #include <HAP_farf.h>
 #include <HAP_perf.h>
+#include <HAP_power.h>

 #include <cstdint>
 #include <cstring>
@@ -48,11 +50,114 @@ inline constexpr const char * op_get_name(npu_device_tensor_op op) {
             return "SUB";
         case NPU_OP_MUL:
             return "MUL";
+        case NPU_OP_RMS_NORM:
+            return "RMS_NORM";
         default:
             return "UNKNOWN";
     }
 }

+class power_utils {
+  public:
+    power_utils() {
+        _context_ptr = HAP_utils_create_context();
+        if (_context_ptr == nullptr) {
+            DEVICE_LOG_ERROR("Failed to create power context\n");
+        }
+    }
+
+    ~power_utils() {
+        if (_context_ptr != nullptr) {
+            HAP_utils_destroy_context(_context_ptr);
+        }
+    }
+
+    unsigned int get_clock_speed_hz() const {
+        if (!is_valid()) {
+            DEVICE_LOG_ERROR("Power context is not initialized\n");
+            return 0;
+        }
+
+        HAP_power_response_t response = {};
+        response.type = HAP_power_get_clk_Freq;
+        auto ret = HAP_power_get(_context_ptr, &response);
+        if (ret != AEE_SUCCESS) {
+            DEVICE_LOG_ERROR("Failed to get clock speed: %d\n", ret);
+            return 0;
+        }
+
+        return response.clkFreqHz;
+    }
+
+    bool get_dvcs_enabled() const {
+        if (!is_valid()) {
+            DEVICE_LOG_ERROR("Power context is not initialized\n");
+            return false;
+        }
+
+        HAP_power_response_t response = {};
+        response.type = HAP_power_get_dcvsEnabled;
+        auto ret = HAP_power_get(_context_ptr, &response);
+        if (ret != AEE_SUCCESS) {
+            DEVICE_LOG_ERROR("Failed to get DVCS enabled: %d\n", ret);
+            return false;
+        }
+
+        return response.dcvsEnabled;
+    }
+
+    void set_dvcs_performance_mode(bool enable) {
+        if (!is_valid()) {
+            DEVICE_LOG_ERROR("Power context is not initialized\n");
+            return;
+        }
+
+        HAP_power_request_t request = {};
+        request.type = HAP_power_set_DCVS_v3;
+        request.dcvs_v3.dcvs_enable = enable ? TRUE : FALSE;
+        if (enable) {
+            request.dcvs_v3.dcvs_option = HAP_DCVS_V2_PERFORMANCE_MODE;
+            /*
+             * sleep_latency : To request for sleep latency in micro-seconds.
+             *     Sleep latency is the minimum time before which the DSP sleeps.
+             *     Set latency to 65535 to reset it to the default value.
+             */
+            request.dcvs_v3.set_latency = TRUE;
+            request.dcvs_v3.latency = 1000;

+            request.dcvs_v3.set_bus_params = TRUE;
+            request.dcvs_v3.bus_params.min_corner = HAP_DCVS_VCORNER_SVS;
+            request.dcvs_v3.bus_params.max_corner = HAP_DCVS_VCORNER_TURBO;
+            request.dcvs_v3.bus_params.target_corner = HAP_DCVS_VCORNER_NOM;
+        }
+
+        auto ret = HAP_power_set(_context_ptr, &request);
+        if (ret != AEE_SUCCESS) {
+            DEVICE_LOG_ERROR("Failed to set DVCS performance mode: %d\n", ret);
+        }
+    }
+
+    void set_sleep_mode(bool enable) {
+        if (!is_valid()) {
+            DEVICE_LOG_ERROR("Power context is not initialized\n");
+            return;
+        }
+
+        boolean sleep_disable = enable ? FALSE : TRUE;
+        auto ret = HAP_power_set_sleep_mode(_context_ptr, sleep_disable);
+        if (ret != AEE_SUCCESS) {
+            DEVICE_LOG_ERROR("Failed to set sleep mode: %d\n", ret);
+        }
+    }
+
+    bool is_valid() const { return _context_ptr != nullptr; }
+
+  private:
+    void * _context_ptr = nullptr;
+
+    DISABLE_COPY_AND_MOVE(power_utils);
+};
+
 #ifdef GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING

 template <size_t _buffer_count> class npu_scoped_timer {
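A short usage sketch for the power_utils class above; where the backend actually creates and keeps the instance is not shown in this hunk, so the call site below is an assumption:

    // Illustrative only: raise clocks while the NPU session does work.
    static void boost_npu_clocks_example() {
        hexagon::power_utils power;
        if (!power.is_valid()) {
            return;
        }
        power.set_dvcs_performance_mode(true);  // DCVS performance mode, 1000 us sleep-latency cap
        power.set_sleep_mode(false);            // keep the DSP awake between ops
        DEVICE_LOG_INFO("NPU clock: %u Hz", power.get_clock_speed_hz());
    }  // the HAP power context is released when `power` goes out of scope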
@@ -47,7 +47,7 @@ class vtcm_mem {
         DEVICE_LOG_DEBUG("VTCM released: %zu bytes at %p\n", _vtcm_size, _vtcm_mem);
     }

-    bool is_valid() const { return _vtcm_mem != nullptr; }
+    bool is_valid() const { return _vtcm_mem != nullptr && _vtcm_size != 0; }

     uint8_t * get_mem() const { return reinterpret_cast<uint8_t *>(_vtcm_mem); }

@@ -177,7 +177,7 @@ std::shared_ptr<host_tensor> host_buffer::init_tensor(ggml_tensor * tensor, remo

     auto ret = _allocator->fastrpc_mmap((int) _domain_id, _buffer_fd, _data, 0, _size, FASTRPC_MAP_FD);
     if (ret != AEE_SUCCESS) {
-        LOG_ERROR("failed to mmap rpc memory, fd: %d, ret: %d\n", _buffer_fd, ret);
+        LOG_ERROR("failed to mmap rpc memory, fd: %d, size: %zu, ret: %d\n", _buffer_fd, _size, ret);
         return std::shared_ptr<host_tensor>();
     }

@@ -1,5 +1,6 @@
 #include "graph.hpp"

+#include "profiler.hpp"
 #include "tensor.hpp"

 namespace hexagon {
@@ -28,8 +29,12 @@ bool host_graph::update(ggml_cgraph * cgraph) {
         return false;
     }

+    SCOPED_PERFORMANCE_TRACKER("[hexagon-npu][%p]update, handle(%p)", (void *) this, (void *) _graph_handle);
+
     _tensor_handles.clear();
+    _tensor_update_configs.clear();
     _tensor_handles.reserve(cgraph->n_nodes);
+    _tensor_update_configs.reserve(cgraph->n_nodes);
     for (int i = 0; i < cgraph->n_nodes; ++i) {
         auto * node = cgraph->nodes[i];
         if (node->op == GGML_OP_NONE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE ||
@@ -40,28 +45,38 @@ bool host_graph::update(ggml_cgraph * cgraph) {
             continue;
         }

+        // TODO: move to tensor?
         auto * tensor_obj = host_tensor::from_ggml_tensor(node);
         if (!tensor_obj) {
             LOG_DEBUG("Unable to get host tensor from ggml tensor: %p\n", (void *) node);
             continue;
         }

-        tensor_obj->set_op(node->op);
         _tensor_handles.push_back(tensor_obj->get_device_tensor_handle());
-        LOG_DEBUG("node[%d]%s(%s), addr: %p, type: %s, tensor_handle: %p\n", i, ggml_get_name(node), ggml_op_desc(node),
-                  (void *) node, ggml_type_name(node->type), (void *) tensor_obj->get_device_tensor_handle());
-        for (size_t j = 0; j < GGML_MAX_SRC && node->src[j]; ++j) {
-            auto * src = host_tensor::from_ggml_tensor(node->src[j]);
-            tensor_obj->set_src(j, src);
-        }
+        _tensor_update_configs.push_back(tensor_obj->update_hosts_params_only(node));
+        LOG_DEBUG("[%p]node[%d]%s(%s), addr: %p, type: %s, tensor_handle: %p\n", (void *) this, i, ggml_get_name(node),
+                  ggml_op_desc(node), (void *) node, ggml_type_name(node->type),
+                  (void *) tensor_obj->get_device_tensor_handle());
     }

-    LOG_DEBUG("host_graph::update, host_graph(%p), handle(%p), ggml_cgraph(%p), tensor count(%zu)\n", (void *) this,
-              (void *) _graph_handle, (void *) cgraph, _tensor_handles.size());
-    if (!_tensor_handles.empty()) {
-        npu_device_graph_set_tensor(_device_handle, _graph_handle, _tensor_handles.data(),
-                                    (int) _tensor_handles.size());
+    GGML_ASSERT(_tensor_handles.size() == _tensor_update_configs.size());
+
+    constexpr const npu_device_tensor_handle_t kEmptyTensorHandle = 0;
+    constexpr const npu_device_tensor_update_config kEmptyUpdateConfig = {};
+
+    auto ret = npu_device_graph_set_tensor_with_param(
+        _device_handle, _graph_handle, _tensor_handles.size() ? _tensor_handles.data() : &kEmptyTensorHandle,
+        (int) _tensor_handles.size(),
+        _tensor_update_configs.size() ? _tensor_update_configs.data() : &kEmptyUpdateConfig,
+        (int) _tensor_update_configs.size());
+
+    if (ret != AEE_SUCCESS) {
+        LOG_ERROR("Failed to set tensors in host_graph: 0x%x\n", (int) ret);
+        return false;
     }
+
+    LOG_DEBUG("[%p]host_graph::update, handle(%p), ggml_cgraph(%p), tensor count(%zu)\n", (void *) this,
+              (void *) _graph_handle, (void *) cgraph, _tensor_handles.size());
     return true;
 }
@@ -71,6 +86,7 @@ bool host_graph::compute() {
         return false;
     }

+    SCOPED_PERFORMANCE_TRACKER("[hexagon-npu][%p]compute, handle(%p)", (void *) this, (void *) _graph_handle);
     auto status = npu_device_graph_compute(_device_handle, _graph_handle);
     if (status != AEE_SUCCESS) {
         LOG_ERROR("Failed to compute host_graph: 0x%x\n", (int) status);
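The update path above now ships the tensor handles and their per-node op/params/src configs in a single RPC instead of one call per tensor attribute. A sketch of the resulting calling pattern; how the ggml backend actually drives it is assumed, not shown in this hunk:

    // Illustrative only: one update + one compute call per ggml graph evaluation.
    static bool evaluate_example(hexagon::host_graph & graph, ggml_cgraph * cgraph) {
        if (!graph.update(cgraph)) {   // single npu_device_graph_set_tensor_with_param RPC
            return false;
        }
        return graph.compute();        // single npu_device_graph_compute RPC
    }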
@@ -21,9 +21,10 @@ class host_graph {
     bool compute();

   private:
     remote_handle64 _device_handle = 0;
     npu_device_graph_handle_t _graph_handle = 0;
     std::vector<npu_device_tensor_handle_t> _tensor_handles;
+    std::vector<npu_device_tensor_update_config> _tensor_update_configs;

     DISABLE_COPY(host_graph);
     DISABLE_MOVE(host_graph);
@@ -151,7 +151,7 @@ bool npu_device::supports_op_impl(const ggml_tensor * op) {

     auto * src0 = op->src[0];
     if (!src0) {
-        LOG_DEBUG("[%s]Unsupported inplace op: %s\n", get_name(), ggml_op_name(op->op));
+        LOG_DEBUG("[%s]Unsupported inplace op: %s\n", get_name(), ggml_op_desc(op));
         return false;
     }

@@ -168,7 +168,7 @@ bool npu_device::supports_op_impl(const ggml_tensor * op) {

     auto npu_op = op_to_npu_op(op->op);
     if (npu_op == NPU_OP_COUNT) {
-        LOG_DEBUG("[%s]Unsupported op: %s\n", get_name(), ggml_op_name(op->op));
+        LOG_DEBUG("[%s]Unsupported op: %s\n", get_name(), ggml_op_desc(op));
         return false;
     }

@@ -179,7 +179,7 @@ bool npu_device::supports_op_impl(const ggml_tensor * op) {

     constexpr const auto get_spec = [](const ggml_tensor * tensor) -> npu_device_tensor_spec {
         if (!tensor) {
-            return npu_device_tensor_spec{};
+            return npu_device_tensor_spec{ {}, NPU_DATA_TYPE_COUNT };
         }

         static_assert(DEVICE_TENSOR_MAX_DIMS == GGML_MAX_DIMS, "tensor dimensions mismatch");
@@ -1,5 +1,7 @@
 #pragma once

+#include <type_traits>
+
 #include "common.hpp"
 #include "ggml-impl.h"
 #include "hexagon_npu.h"
@@ -19,11 +21,15 @@ class host_tensor {

     explicit host_tensor(ggml_tensor * tensor, int buffer_fd, uint64_t offset, remote_handle64 device_handle) :
         _device_handle(device_handle) {
+        // TODO: figure out why the npu_device_tensor_config can't be larger than 100 bytes
+        static_assert(sizeof(npu_device_tensor_config) < 100, "npu_device_tensor_config size too large");
+
         _info.buffer_fd = buffer_fd;
         _info.offset = offset;
         _info.type = type_to_npu_type(tensor->type);
-        _info.op = op_to_npu_op(tensor->op);
         _info.size = ggml_nbytes(tensor);
+        // _info.op will be updated in update_params()

         static_assert(DEVICE_TENSOR_MAX_DIMS == GGML_MAX_DIMS, "tensor dimensions mismatch");
         static_assert(sizeof(_info.ne) == sizeof(tensor->ne), "tensor ne size mismatch");
@@ -56,28 +62,96 @@ class host_tensor {

     npu_device_tensor_handle_t get_device_tensor_handle() const { return _device_tensor_handle; }

-    void set_src(size_t index, host_tensor * src) {
-        if (index >= DEVICE_TENSOR_MAX_SRC) {
-            LOG_ERROR("host_tensor(%p) set_src[%zu] out of range\n", (void *) this, index);
-            return;
-        }
-
-        LOG_DEBUG("host_tensor(%p) set_src[%zu]: %p\n", (void *) this, index, (void *) src);
-        npu_device_tensor_set_src(_device_handle, _device_tensor_handle, index, src->get_device_tensor_handle());
-    }
-
-    void set_op(ggml_op op) {
-        _info.op = op_to_npu_op(op);
-        npu_device_tensor_set_op(_device_handle, _device_tensor_handle, _info.op);
+    void update_params(ggml_tensor * ggml_tensor) {
+        static_assert(sizeof(_info_update.params) <= sizeof(_ggml_tensor->op_params),
+                      "device tensor params size mismatch");
+        static_assert(DEVICE_TENSOR_MAX_SRC <= GGML_MAX_SRC, "device tensor src size mismatch");
+
+        GGML_ASSERT(ggml_tensor == _ggml_tensor);
+        if (!_ggml_tensor) {
+            LOG_DEBUG("host_tensor(%p) _ggml_tensor is null\n", (void *) this);
+            return;
+        }
+
+        auto new_op = op_to_npu_op(_ggml_tensor->op);
+        bool params_changed = new_op != _info_update.op;
+        if (params_changed) {
+            LOG_DEBUG("host_tensor(%p) op changed: %s -> %s\n", (void *) this, get_npu_op_desc(_info.op),
+                      get_npu_op_desc(new_op));
+        }
+
+        _info.op = new_op;
+        _info_update.op = new_op;
+
+        if (memcmp(_info_update.params, _ggml_tensor->op_params, sizeof(_info_update.params)) != 0) {
+            params_changed = true;
+            memcpy(_info_update.params, _ggml_tensor->op_params, sizeof(_info_update.params));
+            LOG_DEBUG("host_tensor(%p) op_params changed: [%x, %x, %x, %x]\n", (void *) this,
+                      (int) _info_update.params[0], (int) _info_update.params[1], (int) _info_update.params[2],
+                      (int) _info_update.params[3]);
+        }
+
+        npu_device_tensor_handle_t src_tensor_handles[DEVICE_TENSOR_MAX_SRC] = {};
+        for (size_t j = 0; j < DEVICE_TENSOR_MAX_SRC && _ggml_tensor->src[j]; ++j) {
+            auto * src = host_tensor::from_ggml_tensor(_ggml_tensor->src[j]);
+            src_tensor_handles[j] = src->get_device_tensor_handle();
+            LOG_DEBUG("host_tensor(%p) set_src[%zu]: %p\n", (void *) this, j, (void *) src);
+        }
+
+        static_assert(std::is_same<decltype(_info_update.src_handles), decltype(src_tensor_handles)>::value,
+                      "src tensor handles type mismatch");
+
+        if (memcmp(_info_update.src_handles, src_tensor_handles, sizeof(_info_update.src_handles)) != 0) {
+            params_changed = true;
+            memcpy(_info_update.src_handles, src_tensor_handles, sizeof(_info_update.src_handles));
+            LOG_DEBUG("host_tensor(%p) src changed, handles: [%p, %p]\n", (void *) this,
+                      (void *) _info_update.src_handles[0], (void *) _info_update.src_handles[1]);
+        }
+
+        if (params_changed) {
+            npu_device_tensor_update_params(_device_handle, _device_tensor_handle, &_info_update);
+            LOG_DEBUG("host_tensor(%p) update_params, op: %s, params: [%x, %x, %x, %x]\n", (void *) this,
+                      ggml_op_desc(_ggml_tensor), (int) _info_update.params[0], (int) _info_update.params[1],
+                      (int) _info_update.params[2], (int) _info_update.params[3]);
+        } else {
+            LOG_DEBUG("host_tensor(%p) update_params, no changes, op: %s, params: [%x, %x, %x, %x]\n", (void *) this,
+                      ggml_op_desc(_ggml_tensor), (int) _info_update.params[0], (int) _info_update.params[1],
+                      (int) _info_update.params[2], (int) _info_update.params[3]);
+        }
+    }
+
+    const npu_device_tensor_update_config & update_hosts_params_only(ggml_tensor * ggml_tensor) {
+        static_assert(sizeof(_info_update.params) <= sizeof(ggml_tensor->op_params),
+                      "device tensor params size mismatch");
+        static_assert(DEVICE_TENSOR_MAX_SRC <= GGML_MAX_SRC, "device tensor src size mismatch");
+
+        GGML_ASSERT(ggml_tensor == _ggml_tensor);
+
+        auto new_op = op_to_npu_op(_ggml_tensor->op);
+        _info.op = new_op;
+        _info_update.op = new_op;
+        memcpy(_info_update.params, _ggml_tensor->op_params, sizeof(_info_update.params));
+
+        for (size_t j = 0; j < DEVICE_TENSOR_MAX_SRC && _ggml_tensor->src[j]; ++j) {
+            auto * src = host_tensor::from_ggml_tensor(_ggml_tensor->src[j]);
+            _info_update.src_handles[j] = src->get_device_tensor_handle();
+            LOG_DEBUG("host_tensor(%p) set_src[%zu]: %p\n", (void *) this, j, (void *) src);
+        }
+
+        LOG_DEBUG("host_tensor(%p) update_params, op: %s, params: [%x, %x, %x, %x]\n", (void *) this,
+                  ggml_op_desc(_ggml_tensor), (int) _info_update.params[0], (int) _info_update.params[1],
+                  (int) _info_update.params[2], (int) _info_update.params[3]);
+        return _info_update;
     }

     bool is_valid() const { return _device_tensor_handle != 0; }

   private:
     remote_handle64 _device_handle = 0;
     npu_device_tensor_handle_t _device_tensor_handle = 0;
     npu_device_tensor_config _info = {};
+    npu_device_tensor_update_config _info_update = {};
     ggml_tensor * _ggml_tensor = nullptr;

     DISABLE_COPY(host_tensor);
     DISABLE_MOVE(host_tensor);
@@ -25,11 +25,30 @@ enum npu_device_tensor_op op_to_npu_op(ggml_op op) {
             return NPU_OP_SUB;
         case GGML_OP_MUL:
             return NPU_OP_MUL;
+        case GGML_OP_RMS_NORM:
+            return NPU_OP_RMS_NORM;
         default:
             return NPU_OP_COUNT;
     }
 }

+const char * get_npu_op_desc(enum npu_device_tensor_op op) {
+    switch (op) {
+        case NPU_OP_MUL_MAT:
+            return ggml_op_name(GGML_OP_MUL_MAT);
+        case NPU_OP_ADD:
+            return ggml_op_name(GGML_OP_ADD);
+        case NPU_OP_SUB:
+            return ggml_op_name(GGML_OP_SUB);
+        case NPU_OP_MUL:
+            return ggml_op_name(GGML_OP_MUL);
+        case NPU_OP_RMS_NORM:
+            return ggml_op_name(GGML_OP_RMS_NORM);
+        default:
+            return "UNKNOWN";
+    }
+}
+
 enum npu_device_tensor_data_type type_to_npu_type(ggml_type type) {
     switch (type) {
         case GGML_TYPE_F32:
@@ -5,6 +5,7 @@
 namespace hexagon {

 enum npu_device_tensor_op op_to_npu_op(ggml_op op);
+const char * get_npu_op_desc(enum npu_device_tensor_op op);
 enum npu_device_tensor_data_type type_to_npu_type(ggml_type type);

 // TODO: merge with qcom_htp_arch
@@ -4,6 +4,7 @@

 const uint32_t DEVICE_TENSOR_MAX_DIMS = 4;
 const uint32_t DEVICE_TENSOR_MAX_SRC = 2;
+const uint32_t DEVICE_TENSOR_MAX_OP_PARAMS = 4;
 const uint32_t QUANT_BLOCK_SIZE = 32;
 const uint32_t QUANT_K_BLOCK_SIZE = 256;
 const uint32_t QUANT_K_SCALE_SIZE = 12;
@@ -38,6 +39,7 @@ interface npu_device : remote_handle64{
         NPU_OP_ADD,
         NPU_OP_SUB,
         NPU_OP_MUL,
+        NPU_OP_RMS_NORM,
         NPU_OP_COUNT
     };

@@ -55,6 +57,12 @@ interface npu_device : remote_handle64{
         tensor_data_type type;
     };

+    struct tensor_update_config {
+        tensor_op op;
+        int32_t params[DEVICE_TENSOR_MAX_OP_PARAMS];
+        tensor_handle_t src_handles[DEVICE_TENSOR_MAX_SRC];
+    };
+
     struct tensor_config {
         ne_type ne;
         uint64_t nb[DEVICE_TENSOR_MAX_DIMS];
@@ -82,15 +90,9 @@ interface npu_device : remote_handle64{
         rout tensor_handle_t tensor_handle
     );

-    AEEResult tensor_set_src(
+    AEEResult tensor_update_params(
         in tensor_handle_t tensor_handle,
-        in uint64_t index,
-        in tensor_handle_t src
-    );
-
-    AEEResult tensor_set_op(
-        in tensor_handle_t tensor_handle,
-        in tensor_op op
+        in tensor_update_config config
     );

     AEEResult tensor_free(
@@ -106,6 +108,12 @@ interface npu_device : remote_handle64{
         in sequence<tensor_handle_t> tensor_handles
     );

+    AEEResult graph_set_tensor_with_param(
+        in graph_handle_t graph_handle,
+        in sequence<tensor_handle_t> tensor_handles,
+        in sequence<tensor_update_config> tensor_params
+    );
+
     AEEResult graph_compute(
         in graph_handle_t graph_handle
     );
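For reference, a sketch of how the host fills the new tensor_update_config before calling the generated tensor_update_params stub; the handle variables and the source tensor below are placeholders, not names from this diff:

    // Illustrative only: one per-tensor update pushed through the new RPC.
    npu_device_tensor_update_config config = {};
    config.op = NPU_OP_RMS_NORM;                                    // op enum value added above
    memcpy(config.params, node->op_params, sizeof(config.params));  // first 4 int32 op params
    config.src_handles[0] = src0_handle;                            // device handle of src0
    config.src_handles[1] = 0;                                      // unused slot -> cleared on the device
    npu_device_tensor_update_params(device_handle, tensor_handle, &config);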
@@ -1,5 +1,5 @@
-#include "profiler.hpp"
+#include "event_tracer.hpp"

 #include <HTP/QnnHtpProfile.h>
 #include <QnnProfile.h>

@@ -0,0 +1,45 @@
+#pragma once
+
+#include <QnnCommon.h>
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <string>
+
+#include "logger.hpp"
+#include "profiler.hpp"
+#include "qnn-types.hpp"
+
+namespace qnn {
+
+// forward declaration of qnn_interface
+class qnn_interface;
+
+class qnn_event_tracer {
+  public:
+    // ref:
+    //   https://github.com/pytorch/executorch/blob/ae3d558d5e6aa04fc52a3065399fe6a773702f52/backends/qualcomm/serialization/qc_schema.py#L53
+    //   https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/overview.html#supported-snapdragon-devices
+    enum sdk_profile_level { PROFILE_OFF = 0, PROFILE_BASIC, PROFILE_DETAIL, PROFILE_OP_TRACE };
+
+    explicit qnn_event_tracer(const std::string & prefix, std::shared_ptr<qnn_interface> interface,
+                              Qnn_BackendHandle_t backend_handle, sdk_profile_level level);
+    ~qnn_event_tracer();
+
+    Qnn_ProfileHandle_t get_handle() const { return _handle; }
+
+    void print_profile_events();
+
+  private:
+    std::shared_ptr<qnn_interface> _interface;
+    Qnn_ProfileHandle_t _handle = nullptr;
+    std::string _prefix;
+
+    DISABLE_COPY(qnn_event_tracer);
+    DISABLE_MOVE(qnn_event_tracer);
+};
+
+using qnn_event_tracer_ptr = std::shared_ptr<qnn_event_tracer>;
+
+} // namespace qnn
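A brief usage sketch for the header above; where the interface and backend handle come from (the qnn_instance wiring) is assumed rather than shown in this diff:

    // Illustrative only: trace one graph execution and dump the collected events.
    auto tracer = std::make_shared<qnn::qnn_event_tracer>(
        "graph_exec", qnn_interface_ptr, backend_handle, qnn::qnn_event_tracer::PROFILE_DETAIL);
    // ... pass tracer->get_handle() as the Qnn_ProfileHandle_t to the execute call ...
    tracer->print_profile_events();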
@@ -4,10 +4,10 @@
 #include <algorithm>
 #include <unordered_map>

+#include "event_tracer.hpp"
 #include "ggml-impl.h"
 #include "logger.hpp"
 #include "op-config.hpp"
-#include "profiler.hpp"
 #include "tensor.hpp"

 #ifdef GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING
@@ -411,8 +411,8 @@ bool qnn_graph::build_graph_from_ggml_graph(const ggml_cgraph * cgraph) {
         GGML_TYPE_COUNT > GGML_TYPE_Q8_0 && GGML_TYPE_Q8_0 > GGML_TYPE_F16 && GGML_TYPE_F16 > GGML_TYPE_F32,
         "GGML_TYPE enum order is not correct");

-    QNN_SCOPED_PERFORMANCE_TRACKER("[%s][%s]build_graph_from_ggml_graph", get_backend_name(_device),
-                                   _graph_name.c_str());
+    SCOPED_PERFORMANCE_TRACKER("[%s][%s]build_graph_from_ggml_graph", get_backend_name(_device),
+                               _graph_name.c_str());

     auto override_data_type = get_override_data_type(inputs, outputs);
     if (override_data_type != GGML_TYPE_COUNT) {
@@ -466,8 +466,7 @@ bool qnn_graph::execute(const ggml_cgraph * cgraph, std::shared_ptr<qnn_convert_
     ggml_tensor_array_t inputs;
     ggml_tensor_array_t outputs;
     {
-        QNN_SCOPED_PERFORMANCE_TRACKER("[%s][%s]get_io_tensors_from_graph", get_backend_name(_device),
-                                       _graph_name.c_str());
+        SCOPED_PERFORMANCE_TRACKER("[%s][%s]get_io_tensors_from_graph", get_backend_name(_device), _graph_name.c_str());
 #ifdef NDEBUG
         get_io_tensors_from_graph(cgraph, inputs, outputs);
 #else
@@ -478,7 +477,7 @@ bool qnn_graph::execute(const ggml_cgraph * cgraph, std::shared_ptr<qnn_convert_
     }

     {
-        QNN_SCOPED_PERFORMANCE_TRACKER("[%s][%s]bind_tensors", get_backend_name(_device), _graph_name.c_str());
+        SCOPED_PERFORMANCE_TRACKER("[%s][%s]bind_tensors", get_backend_name(_device), _graph_name.c_str());
         auto override_data_type = get_override_data_type(inputs, outputs);
         if (override_data_type != GGML_TYPE_COUNT) {
             QNN_LOG_DEBUG("[%s][%s]override_data_type: %s\n", get_backend_name(_device), _graph_name.c_str(),
@@ -502,7 +501,7 @@ bool qnn_graph::execute(const ggml_cgraph * cgraph, std::shared_ptr<qnn_convert_
     }

     {
-        QNN_SCOPED_PERFORMANCE_TRACKER("[%s][%s]execute", get_backend_name(_device), _graph_name.c_str());
+        SCOPED_PERFORMANCE_TRACKER("[%s][%s]execute", get_backend_name(_device), _graph_name.c_str());
         auto & qnn_tensor_inputs = _qnn_tensor_inputs;
         auto & qnn_tensor_outputs = _qnn_tensor_outputs;
         auto error = _qnn_interface->qnn_graph_execute(_graph_handle, qnn_tensor_inputs.data(),
@@ -529,7 +528,7 @@ bool qnn_graph::execute(const ggml_cgraph * cgraph, std::shared_ptr<qnn_convert_
 }

 bool qnn_graph::finalize() {
-    QNN_SCOPED_PERFORMANCE_TRACKER("[%s][%s]finalize", get_backend_name(_device), _graph_name.c_str());
+    SCOPED_PERFORMANCE_TRACKER("[%s][%s]finalize", get_backend_name(_device), _graph_name.c_str());

     if (!qnn::add_op_to_graph(_graph_handle, _operations)) {
         QNN_LOG_ERROR("[%s]add nodes failed\n", _graph_name.c_str());
@@ -6,9 +6,9 @@
 #include <vector>

 #include "convert.hpp"
+#include "event_tracer.hpp"
 #include "ggml-qnn.h"
 #include "op-config.hpp"
-#include "profiler.hpp"
 #include "qnn-lib.hpp"

 namespace qnn {
@@ -1,100 +0,0 @@
-#pragma once
-
-#include <QnnCommon.h>
-
-#include <cstddef>
-#include <cstdint>
-#include <memory>
-#include <string>
-
-#include "logger.hpp"
-#include "qnn-types.hpp"
-
-namespace qnn {
-
-#ifdef GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING
-
-class qnn_scoped_timer {
-  public:
-    qnn_scoped_timer(const std::string & log_prefix) : _log_prefix(std::move(log_prefix)) {
-        _begin_us = ggml_time_us();
-    }
-
-    qnn_scoped_timer(qnn_scoped_timer && other) {
-        _begin_us = other._begin_us;
-        _log_prefix = std::move(other._log_prefix);
-    }
-
-    ~qnn_scoped_timer() { print(); }
-
-    void operator=(qnn_scoped_timer && other) {
-        _begin_us = other._begin_us;
-        _log_prefix = std::move(other._log_prefix);
-    }
-
-    void print() const {
-        auto duration = (ggml_time_us() - _begin_us) / 1000.0;
-        QNN_LOG_INFO("[profiler]%s, duration: %.4f ms\n", _log_prefix.c_str(), duration);
-    }
-
-  private:
-    int64_t _begin_us = 0LL;
-    std::string _log_prefix;
-
-    qnn_scoped_timer(const qnn_scoped_timer &) = delete;
-    void operator=(const qnn_scoped_timer &) = delete;
-};
-
-inline qnn_scoped_timer make_scope_perf_timer(const char * format, ...) {
-    va_list args;
-    va_start(args, format);
-    char buffer[4096];
-    vsnprintf(buffer, sizeof(buffer), format, args);
-    va_end(args);
-    return qnn_scoped_timer(buffer);
-}
-
-#else
-
-inline void make_scope_perf_timer(const char *, ...) {}
-
-#endif
-
-// forward declaration of qnn_interface
-class qnn_interface;
-
-class qnn_event_tracer {
-  public:
-    // ref:
-    //   https://github.com/pytorch/executorch/blob/ae3d558d5e6aa04fc52a3065399fe6a773702f52/backends/qualcomm/serialization/qc_schema.py#L53
-    //   https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/overview.html#supported-snapdragon-devices
-    enum sdk_profile_level { PROFILE_OFF = 0, PROFILE_BASIC, PROFILE_DETAIL, PROFILE_OP_TRACE };
-
-    explicit qnn_event_tracer(const std::string & prefix, std::shared_ptr<qnn_interface> interface,
-                              Qnn_BackendHandle_t backend_handle, sdk_profile_level level);
-    ~qnn_event_tracer();
-
-    Qnn_ProfileHandle_t get_handle() const { return _handle; }
-
-    void print_profile_events();
-
-  private:
-    std::shared_ptr<qnn_interface> _interface;
-    Qnn_ProfileHandle_t _handle = nullptr;
-    std::string _prefix;
-
-    DISABLE_COPY(qnn_event_tracer);
-    DISABLE_MOVE(qnn_event_tracer);
-};
-
-using qnn_event_tracer_ptr = std::shared_ptr<qnn_event_tracer>;
-
-} // namespace qnn
-
-#ifdef GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING
-#    define QNN_SCOPED_PERFORMANCE_TRACKER(fmt, ...) \
-        auto __qnn_timer_##__LINE__ = qnn::make_scope_perf_timer(fmt, __VA_ARGS__)
-#else
-#    define QNN_SCOPED_PERFORMANCE_TRACKER(fmt, ...) ((void) 0)
-#endif
@@ -34,21 +34,36 @@ constexpr const qnn::device_caps kDeviceCaps[] = {
     {
         // https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/CpuOpDefSupplement.html#matmul
         kQnnCpuLibName, GGML_BACKEND_DEVICE_TYPE_ACCEL, (1L << GGML_TYPE_I8) | (1L << GGML_TYPE_F32),
-        0xFFFFFE,  // all quantized types can be offload to CPU, at current implementation, those types will be dequantized into float32 on cpu
-        0,         // 0 for no limitation
+#ifdef GGML_HEXAGON_ENABLE_QUANTIZED_TENSORS
+        // all quantized types can be offload to CPU, at current implementation, those types will be dequantized into float32 on cpu
+        0xFFFFFE,
+#else
+        0,
+#endif
+
+        0,  // 0 for no limitation
     },
     {
         // https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/GpuOpDefSupplement.html#matmul
         kQnnGpuLibName, GGML_BACKEND_DEVICE_TYPE_GPU, (1L << GGML_TYPE_F32) | (1L << GGML_TYPE_F16),
-        0xFFFFFE, (128256L * 4096 *
+#ifdef GGML_HEXAGON_ENABLE_QUANTIZED_TENSORS
+        // all quantized types can be offload to GPU, at current implementation, those types will be dequantized into float32 on cpu
+        0xFFFFFE,
+#else
+        0,
+#endif
+        (128256L * 4096 *
             sizeof(float)),  // tested on 8 gen 2, failed to allocate tensor with size 128256x4096 and float32
     },
     {
         // https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/HtpOpDefSupplement.html#matmul
         kQnnNpuLibName, GGML_BACKEND_DEVICE_TYPE_ACCEL,
+#ifdef GGML_HEXAGON_ENABLE_QUANTIZED_TENSORS
         (1L << GGML_TYPE_F32) | (1L << GGML_TYPE_F16) | (1L << GGML_TYPE_I16),
         (1L << GGML_TYPE_Q2_K) | (1L << GGML_TYPE_Q3_K) | (1L << GGML_TYPE_Q4_K) | (1L << GGML_TYPE_Q8_K),
+#else
+        0,
+#endif
         (8192L * 2048 + 8192 * 512 + 2048 * 512) * sizeof(float),  // TODO: should have a better way to get this value
     },
 };
@@ -45,6 +45,10 @@ size_t get_system_free_memory_in_bytes();
     class_name(class_name &&) = delete;      \
     void operator=(class_name &&) = delete

+#define DISABLE_COPY_AND_MOVE(class_name) \
+    DISABLE_COPY(class_name);             \
+    DISABLE_MOVE(class_name)
+
 #define LOG_ERROR(...) (GGML_LOG_ERROR(__VA_ARGS__))
 #define LOG_WARN(...)  (GGML_LOG_WARN(__VA_ARGS__))
 #define LOG_INFO(...)  (GGML_LOG_INFO(__VA_ARGS__))
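The combined macro above is what the new device-side classes (tensor, thread_pool, power_utils) place in their private sections. A trivial sketch of its use; the class here is invented purely for illustration:

    // Illustrative only: the macro deletes both the copy and the move operations.
    class raii_handle_example {
      public:
        raii_handle_example() = default;

      private:
        DISABLE_COPY_AND_MOVE(raii_handle_example);
    };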
@@ -0,0 +1,61 @@
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+#include <string>
+
+#include "common.hpp"
+#include "ggml-impl.h"
+
+namespace profiler {
+
+#ifdef GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING
+
+class scoped_timer {
+  public:
+    scoped_timer(const std::string & log_prefix) : _log_prefix(std::move(log_prefix)) { _begin_us = ggml_time_us(); }
+
+    scoped_timer(scoped_timer && other) {
+        _begin_us = other._begin_us;
+        _log_prefix = std::move(other._log_prefix);
+    }
+
+    ~scoped_timer() { print(); }
+
+    void operator=(scoped_timer && other) {
+        _begin_us = other._begin_us;
+        _log_prefix = std::move(other._log_prefix);
+    }
+
+    void print() const {
+        auto duration = ggml_time_us() - _begin_us;
+        GGML_LOG_INFO("[profiler]%s, dur: %lld us\n", _log_prefix.c_str(), (long long) duration);
+    }
+
+  private:
+    int64_t _begin_us = 0LL;
+    std::string _log_prefix;
+
+    DISABLE_COPY(scoped_timer);
+};
+
+inline scoped_timer make_scope_perf_timer(const char * format, ...) {
+    va_list args;
+    va_start(args, format);
+    char buffer[4096];
+    vsnprintf(buffer, sizeof(buffer), format, args);
+    va_end(args);
+    return scoped_timer(buffer);
+}
+
+#endif
+
+} // namespace profiler
+
+#ifdef GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING
+#    define SCOPED_PERFORMANCE_TRACKER(fmt, ...) \
+        auto __scoped_timer_##__LINE__ = profiler::make_scope_perf_timer(fmt, __VA_ARGS__)
+#else
+#    define SCOPED_PERFORMANCE_TRACKER(fmt, ...) ((void) 0)
+#endif
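This shared macro is what the QNN and hexagon-npu call sites above switched to. A minimal usage sketch; the function and label strings are placeholders:

    // Illustrative only: the timer logs the scope's duration on exit and the
    // macro compiles to a no-op when GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING
    // is not defined.
    void run_step_example() {
        SCOPED_PERFORMANCE_TRACKER("[%s]run_step", "example-backend");
        // ... work to measure ...
    }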
@@ -64,8 +64,10 @@ class rpc_mem {

         void * buf = nullptr;
         if (_rpc_interface->is_alloc2_available()) {
+            LOG_DEBUG("rpcmem_alloc2 available, using it\n");
             buf = _rpc_interface->rpcmem_alloc2(heapid, flags, size);
         } else {
+            LOG_DEBUG("rpcmem_alloc2 not available, using rpcmem_alloc\n");
             buf = _rpc_interface->rpcmem_alloc(heapid, flags, size);
         }
