feat: perf opt part4 (#43)

* wip

* refactor: rewrite dequantize_row_q4_0 by intrinsic

* log for debug

* fix q4 intrinsic

* small opt

* wip

* wip

* add vtcm_quota_size

* add perf log for hexagon-npu backend

* wip

* add log

* sync after a specific op

* increase worker thread priority

* fix unbalanced thread slice

* small slice to fit in VTCM cache

* limit the supported row element size

* opt 4_0 dequant

* fix q4 dequant

* add power_utils

* add rms_norm

* wip

* enable rms_norm f32

* fix rms_norm with param

* fix compiling flags

* use float

* fix small row size

* vectorized rms norm

* wip

* read 2 vectors

* rename

* add perf log on update

* also set empty tensor handles

* merge some rpc functions

* opt param update

* wip

* print more log

* add struct for update param config

* add npu_device_graph_set_tensor_with_param

* merge tensor and params update

* wip

* wip

* make as template to reuse

* vectorize dequantize_row_q8_0

* opt

* avoid using union to store q data

* wip

* wip

* wip
Committed by nullname on 2025-05-28 00:00:42 +08:00 (via GitHub)
parent 2306f82a58
commit c23ab465c0
32 changed files with 1014 additions and 397 deletions


@ -231,6 +231,11 @@ else()
build_idl(idl/hexagon_npu.idl hexagon_npu_skel_OBJS)
add_subdirectory(${HEXAGON_SDK_ROOT}/libs/qprintf qprintf_dir)
target_include_directories(hexagon_npu_skel_OBJS PUBLIC
${HEXAGON_SDK_ROOT}/libs/qprintf/inc/
)
# disable warnings for the skel
set_source_files_properties(
${skel_srcs}
@ -239,12 +244,12 @@ else()
)
add_library(hexagon_npu_skel SHARED $<TARGET_OBJECTS:hexagon_npu_skel_OBJS>)
target_link_libraries(hexagon_npu_skel
${HEXAGON_LIB_DIR}/${HEXAGON_ARCH}/G0/pic/libc++abi.a
${HEXAGON_LIB_DIR}/${HEXAGON_ARCH}/G0/pic/libc++.a
)
set_target_properties(hexagon_npu_skel PROPERTIES OUTPUT_NAME "hexagon_npu_skel_${HEXAGON_ARCH}")
target_link_libraries(hexagon_npu_skel qprintf_static)
copy_binaries(hexagon_npu_skel)
endif()


@ -106,6 +106,7 @@ int npu_device_open(const char * uri, remote_handle64 * h) {
}
*h = reinterpret_cast<remote_handle64>(context);
DEVICE_LOG_INFO("NPU device context created: %p", (void *) *h);
return AEE_SUCCESS;
}
@ -117,6 +118,7 @@ int npu_device_close(remote_handle64 h) {
}
delete context;
DEVICE_LOG_INFO("NPU device context destroyed: %p", (void *) h);
return AEE_SUCCESS;
}
@ -130,6 +132,12 @@ AEEResult npu_device_device_support_op(remote_handle64 _h, const npu_device_tens
const npu_device_tensor_spec * src1, const npu_device_tensor_spec * dst,
npu_device_tensor_op op, boolean * is_supported) {
NPU_UNUSED(_h);
if (!src0 || !src1 || !dst || !is_supported) {
DEVICE_LOG_ERROR("npu_device_device_support_op: Invalid arguments");
return AEE_EINVARGS;
}
*is_supported = hexagon::support_op(*src0, *src1, *dst, op);
return AEE_SUCCESS;
}
@ -147,28 +155,15 @@ AEEResult npu_device_tensor_init(remote_handle64 _h, const npu_device_tensor_con
return AEE_SUCCESS;
}
AEEResult npu_device_tensor_set_src(remote_handle64 _h, npu_device_tensor_handle_t tensor_handle, uint64_t index,
npu_device_tensor_handle_t src) {
AEEResult npu_device_tensor_update_params(remote_handle64 _h, npu_device_tensor_handle_t tensor_handle,
const npu_device_tensor_update_config * config) {
NPU_UNUSED(_h);
auto * tensor = tensor_from_handle(tensor_handle);
if (!tensor) {
if (!tensor || !config) {
return AEE_EINVHANDLE;
}
auto * src_tensor = tensor_from_handle(src);
tensor->set_src(index, src_tensor);
return AEE_SUCCESS;
}
AEEResult npu_device_tensor_set_op(remote_handle64 _h, npu_device_tensor_handle_t tensor_handle,
npu_device_tensor_op op) {
NPU_UNUSED(_h);
auto * tensor = tensor_from_handle(tensor_handle);
if (!tensor) {
return AEE_EINVHANDLE;
}
tensor->set_op(op);
tensor->update_config(*config);
return AEE_SUCCESS;
}
@ -206,6 +201,29 @@ AEEResult npu_device_graph_set_tensor(remote_handle64 _h, npu_device_graph_handl
return AEE_SUCCESS;
}
AEEResult npu_device_graph_set_tensor_with_param(remote_handle64 _h, npu_device_graph_handle_t graph_handle,
const npu_device_tensor_handle_t * tensor_handles,
int tensor_handlesLen,
const npu_device_tensor_update_config * tensor_params,
int tensor_paramsLen) {
NPU_UNUSED(_h);
auto * graph = graph_from_handle(graph_handle);
if (!graph || !tensor_handles || tensor_handlesLen <= 0 || !tensor_params ||
tensor_handlesLen != tensor_paramsLen) {
return AEE_EINVHANDLE;
}
graph->set_tensor(tensor_handles, tensor_handlesLen);
for (int i = 0; i < tensor_handlesLen; ++i) {
auto * tensor = tensor_from_handle(tensor_handles[i]);
if (tensor) {
tensor->update_config(tensor_params[i]);
}
}
return AEE_SUCCESS;
}
AEEResult npu_device_graph_compute(remote_handle64 _h, npu_device_graph_handle_t graph_handle) {
auto dev_ctx = device_context_from_handle(_h);
if (!dev_ctx) {


@ -10,7 +10,8 @@
namespace hexagon {
graph::graph() noexcept {
DEVICE_LOG_DEBUG("graph(%p) created\n", (void *) this);
_vtcm_quota_size = hexagon::vtcm_mem::get_avail_block_size(); // TODO: move to device init?
DEVICE_LOG_DEBUG("graph(%p) created: vtcm quota size: %zu\n", (void *) this, _vtcm_quota_size);
}
graph::~graph() noexcept {
@ -45,6 +46,8 @@ bool graph::compute(default_thread_pool * thread_pool, const float * f16_to_f32_
}
DEVICE_LOG_DEBUG("graph(%p) compute\n", (void *) this);
DEVICE_SCOPED_PERFORMANCE_TRACKER("[%p]compute", (void *) this);
_f16_to_f32_table = f16_to_f32_table;
if (thread_pool) {
thread_pool->sync_execute(reinterpret_cast<default_thread_pool::task_type>(&graph::thread_pool_task), this);
@ -61,6 +64,8 @@ void graph::thread_pool_task(default_thread_pool * pool, size_t thread_idx, size
}
void graph::compute_impl(default_thread_pool * pool, size_t thread_idx, size_t thread_count) {
hexagon::compute_params params = { thread_idx, thread_count, _vtcm_quota_size / thread_count, _f16_to_f32_table };
for (size_t i = 0; i < _tensor_count; ++i) {
auto * dst = _tensors[i];
auto op = dst->get_op();
@ -69,14 +74,14 @@ void graph::compute_impl(default_thread_pool * pool, size_t thread_idx, size_t t
DEVICE_LOG_ERROR("graph(%p) tensor[%zu] op %d not supported\n", (void *) this, i, op);
return;
}
hexagon::compute_params params = { thread_idx, thread_count, _f16_to_f32_table };
if (!func(dst, &params)) {
DEVICE_LOG_ERROR("graph(%p) tensor[%zu] op %d compute failed\n", (void *) this, i, op);
}
// TODO: figure out which ops need to sync
if (pool) {
DEVICE_SCOPED_PERFORMANCE_TRACKER("[%p]sync_thread, tidx: %zu", (void *) this, thread_idx);
const bool should_sync = requires_thread_barrier(op);
if (pool && should_sync && i < _tensor_count - 1) {
pool->sync_thread();
}
dst->invalidate();


@ -25,6 +25,7 @@ class graph {
std::unique_ptr<tensor *[]> _tensors;
size_t _tensor_count = 0;
size_t _vtcm_quota_size = 0;
const float * _f16_to_f32_table = nullptr;
DISABLE_COPY_AND_MOVE(graph);


@ -5,6 +5,8 @@
#include <hexagon_types.h>
#include <HTP/core/intrinsics.h>
#include <type_traits>
#include "op_mul_mat.hpp"
#include "quants.hpp"
@ -17,7 +19,7 @@ inline void vec_op_impl(const _TyData * src0, const _TyData * src1, size_t count
HVX_Vector * iptr0 = ((HVX_Vector *) src0);
HVX_Vector * iptr0_end = ((HVX_Vector *) src0) + (count / kElementsPerVector);
HVX_Vector * iptr1 = ((HVX_Vector *) src1);
HVX_Vector * optr = ((HVX_Vector *) dst);
HVX_Vector * optr = ((HVX_Vector *) dst); // framework will ensure the dst is aligned
HVX_Vector prev0 = *iptr0++;
HVX_Vector prev1 = *iptr1++;
@ -108,6 +110,12 @@ template <typename _TyData> struct get_data_type<void (*)(const _TyData *, const
using type = _TyData;
};
template <typename _TyData, typename _TyParam>
struct get_data_type<void (*)(const _TyData *, size_t, _TyParam, _TyData *)> {
using type = _TyData;
using param_type = typename std::remove_cv<typename std::remove_reference<_TyData>::type>::type;
};
template <auto _RowFunc> bool element_wise_op(hexagon::tensor * out, hexagon::compute_params * params) {
using data_type = typename get_data_type<decltype(_RowFunc)>::type;
@ -166,6 +174,16 @@ template <auto _RowFunc> bool element_wise_op(hexagon::tensor * out, hexagon::co
return true;
}
bool is_same_shape(const npu_device_tensor_spec & src, const npu_device_tensor_spec & dst) {
for (size_t i = 0; i < DEVICE_TENSOR_MAX_DIMS; ++i) {
if (src.ne[i] != dst.ne[i]) {
return false;
}
}
return true;
}
bool is_element_wise_op_supported(const npu_device_tensor_spec & src0, const npu_device_tensor_spec & src1,
const npu_device_tensor_spec & dst, npu_device_tensor_op op) {
if (op != NPU_OP_ADD && op != NPU_OP_SUB && op != NPU_OP_MUL) {
@ -196,12 +214,149 @@ bool is_element_wise_op_supported(const npu_device_tensor_spec & src0, const npu
return false;
}
for (size_t i = 0; i < DEVICE_TENSOR_MAX_DIMS; ++i) {
if (src0.ne[i] != dst.ne[i]) {
DEVICE_LOG_DEBUG("[%s]src0.ne[%zu] and dst.ne[%zu] not match: %lld vs %lld\n", hexagon::op_get_name(op), i,
i, (long long) src0.ne[i], (long long) dst.ne[i]);
return false;
if (!is_same_shape(src0, dst)) {
DEVICE_LOG_DEBUG("[%s]src0 and dst have different shape\n", hexagon::op_get_name(op));
return false;
}
return true;
}
void rms_norm_vec_f32(const float * src, size_t count, float eps, float * dst) {
constexpr const size_t kElementsPerVector = hexagon::kBytesPerVector / sizeof(float);
HVX_Vector * src_vec_ptr = ((HVX_Vector *) src);
HVX_Vector * src_vec_end = ((HVX_Vector *) src) + (count / kElementsPerVector);
HVX_Vector prev = *src_vec_ptr++;
HVX_Vector sum = Q6_V_vzero();
while (src_vec_ptr < src_vec_end) {
HVX_Vector curr = *src_vec_ptr++;
HVX_Vector s0 = Q6_V_valign_VVR(curr, prev, (size_t) src);
sum = Q6_Vqf32_vadd_Vqf32Vqf32(sum, Q6_Vqf32_vmpy_VsfVsf(s0, s0));
prev = curr;
}
if ((src_vec_end - ((HVX_Vector *) src)) > 0) {
// handle the last vector
bool src_ptr_aligned = hexagon::is_addr_aligned(src_vec_ptr);
HVX_Vector curr = src_ptr_aligned ? prev : *src_vec_ptr;
src_vec_ptr = src_ptr_aligned ? src_vec_ptr : src_vec_ptr + 1;
HVX_Vector s0 = Q6_V_valign_VVR(curr, prev, (size_t) src);
sum = Q6_Vqf32_vadd_Vqf32Vqf32(sum, Q6_Vqf32_vmpy_VsfVsf(s0, s0));
prev = curr;
}
const size_t leftover = count % kElementsPerVector;
const size_t leftover_bytes = leftover * sizeof(float);
if (leftover > 0) {
// handle the leftover elements
HVX_Vector curr =
(leftover_bytes + hexagon::unaligned_bytes(src_vec_ptr) > hexagon::kBytesPerVector) ? *src_vec_ptr : prev;
curr = Q6_V_valign_VVR(curr, prev, (size_t) src);
sum = Q6_Vqf32_vadd_Vqf32Vqf32(sum,
Q6_V_valign_VVR(Q6_Vqf32_vmpy_VsfVsf(curr, curr), Q6_V_vzero(), leftover_bytes));
}
const float mean = hexagon::vec_reduction_f32(sum) / count; // TODO: figure out how to do division in vector
const float scale = 1.0f / sqrtf(mean + eps); // TODO: use built-in BLAS sqrtf?
HVX_Vector scale_vec = Q6_V_vsplat_R(reinterpret_cast<const uint32_t &>(scale));
src_vec_ptr = ((HVX_Vector *) src);
prev = *src_vec_ptr++;
HVX_Vector * dst_vec_ptr = ((HVX_Vector *) dst); // framework will ensure the dst is aligned
while (src_vec_ptr < src_vec_end) {
HVX_Vector curr = *src_vec_ptr++;
HVX_Vector s0 = Q6_V_valign_VVR(curr, prev, (size_t) src);
*dst_vec_ptr++ = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(s0, scale_vec));
prev = curr;
}
if ((src_vec_end - ((HVX_Vector *) src)) > 0) {
// handle the last vector
bool src_ptr_aligned = hexagon::is_addr_aligned(src_vec_ptr);
HVX_Vector curr = src_ptr_aligned ? prev : *src_vec_ptr;
src_vec_ptr = src_ptr_aligned ? src_vec_ptr : src_vec_ptr + 1;
HVX_Vector s0 = Q6_V_valign_VVR(curr, prev, (size_t) src);
*dst_vec_ptr++ = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(s0, scale_vec));
prev = curr;
}
if (leftover > 0) {
// handle the leftover elements
HVX_Vector curr =
(leftover_bytes + hexagon::unaligned_bytes(src_vec_ptr) > hexagon::kBytesPerVector) ? *src_vec_ptr : prev;
curr = Q6_V_valign_VVR(curr, prev, (size_t) src);
q6op_vstu_variable_ARV(dst_vec_ptr, leftover_bytes, Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(curr, scale_vec)));
}
}
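The HVX routine above is the vectorized form of plain RMS normalization: each element is scaled by 1/sqrtf(mean(x^2) + eps), computed in two passes over the row. A scalar reference sketch for comparison, assuming the same (src, count, eps, dst) contract (editorial illustration only, not part of the commit):

// Scalar reference for rms_norm_vec_f32 (illustrative sketch, not in the commit).
#include <cmath>
#include <cstddef>
static void rms_norm_ref_f32(const float * src, size_t count, float eps, float * dst) {
    float sum_sq = 0.0f;
    for (size_t i = 0; i < count; ++i) {
        sum_sq += src[i] * src[i];            // pass 1: accumulate x^2 (qf32 adds in the HVX code)
    }
    const float mean  = sum_sq / count;       // mean of squares
    const float scale = 1.0f / sqrtf(mean + eps);
    for (size_t i = 0; i < count; ++i) {
        dst[i] = src[i] * scale;              // pass 2: scale, matching the second HVX loop
    }
}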
// TODO: merge with element_wise_op?
template <auto _RowFunc> bool unary_op(hexagon::tensor * out, hexagon::compute_params * params) {
using data_type = typename get_data_type<decltype(_RowFunc)>::type;
using param_type = typename get_data_type<decltype(_RowFunc)>::param_type;
if (!out) {
return false;
}
static_assert(DEVICE_TENSOR_MAX_DIMS == 4, "element_wise_op requires max dims 4");
auto * src0 = out->get_src(0);
if (!src0) {
return true; // skip if no src
}
const auto * src0_ptr = reinterpret_cast<const uint8_t *>(src0->get_read_buffer());
auto * dst_ptr = reinterpret_cast<uint8_t *>(out->get_write_buffer());
auto total_rows = out->get_ne(3) * out->get_ne(2) * out->get_ne(1);
const auto rows_per_cube = out->get_ne(2) * out->get_ne(1);
const auto start_end = hexagon::get_thread_work_slice(total_rows, params->tidx, params->tcnt);
if (start_end.first >= start_end.second) {
return true;
}
DEVICE_SCOPED_OP_PERFORMANCE_TRACKER(out, params->tidx);
const auto param = out->get_op_param<param_type>(0);
const size_t valid_row_bytes = src0->get_ne(0) * sizeof(data_type);
for (int64_t ir = start_end.first; ir < start_end.second; ++ir) {
const auto i03 = ir / rows_per_cube;
const auto i02 = ir / out->get_ne(1) - i03 * out->get_ne(2);
const auto i01 = ir % out->get_ne(1); // TODO: should we use divide instead of mod?
auto * src0_row = src0_ptr + i03 * src0->get_nb(3) + i02 * src0->get_nb(2) + i01 * src0->get_nb(1);
auto * dst_row = dst_ptr + i03 * out->get_nb(3) + i02 * out->get_nb(2) + i01 * out->get_nb(1);
if (ir + 1 < start_end.second) {
hexagon::l2fetch_row(src0_row + src0->get_nb(1), valid_row_bytes);
}
_RowFunc(reinterpret_cast<const data_type *>(src0_row), static_cast<size_t>(out->get_ne(0)), param,
reinterpret_cast<data_type *>(dst_row));
}
return true;
}
bool is_unary_op_supported(const npu_device_tensor_spec & src0, const npu_device_tensor_spec & src1,
const npu_device_tensor_spec & dst, npu_device_tensor_op op) {
if (op != NPU_OP_RMS_NORM) {
DEVICE_LOG_DEBUG("[%s]unsupported\n", hexagon::op_get_name(op));
return false;
}
if (dst.type != src0.type) {
DEVICE_LOG_DEBUG("[%s]src0.type and dst.type mismatch: %s vs %s\n", hexagon::op_get_name(op),
hexagon::get_type_name(src0.type), hexagon::get_type_name(dst.type));
return false;
}
if (dst.type != NPU_DATA_TYPE_F32) {
DEVICE_LOG_DEBUG("[%s]unsupported data type: %s\n", hexagon::op_get_name(op), hexagon::get_type_name(dst.type));
return false;
}
if (!is_same_shape(src0, dst)) {
DEVICE_LOG_DEBUG("[%s]src0 and dst have different shape\n", hexagon::op_get_name(op));
return false;
}
return true;
@ -211,6 +366,7 @@ struct op_capabilities {
npu_device_tensor_op op;
hexagon::op_is_supported_func_type is_supported;
hexagon::compute_func_type compute_funcs[NPU_DATA_TYPE_COUNT];
bool requires_thread_barrier = false;
};
constexpr const op_capabilities kOpCapabilities[] = {
@ -219,22 +375,36 @@ constexpr const op_capabilities kOpCapabilities[] = {
{
hexagon::mul_mat_f32, // NPU_DATA_TYPE_F32
nullptr, // NPU_DATA_TYPE_F16
}, },
{ NPU_OP_ADD,
is_element_wise_op_supported, {
element_wise_op<vec_op_f32_f32<vadd_f32_f32>>, // NPU_DATA_TYPE_F32
element_wise_op<vec_op_f16_f16<vadd_f16_f16>>, // NPU_DATA_TYPE_F16
} },
{ NPU_OP_SUB,
is_element_wise_op_supported, {
element_wise_op<vec_op_f32_f32<vsub_f32_f32>>, // NPU_DATA_TYPE_F32
element_wise_op<vec_op_f16_f16<vsub_f16_f16>>, // NPU_DATA_TYPE_F16
} },
{ NPU_OP_MUL,
is_element_wise_op_supported, {
element_wise_op<vec_op_f32_f32<vmul_f32_f32>>, // NPU_DATA_TYPE_F32
element_wise_op<vec_op_f16_f16<vmul_f16_f16>>, // NPU_DATA_TYPE_F16
} },
}, true,
},
{
NPU_OP_ADD, is_element_wise_op_supported,
{
element_wise_op<vec_op_f32_f32<vadd_f32_f32>>, // NPU_DATA_TYPE_F32
element_wise_op<vec_op_f16_f16<vadd_f16_f16>>, // NPU_DATA_TYPE_F16
}, false,
},
{
NPU_OP_SUB, is_element_wise_op_supported,
{
element_wise_op<vec_op_f32_f32<vsub_f32_f32>>, // NPU_DATA_TYPE_F32
element_wise_op<vec_op_f16_f16<vsub_f16_f16>>, // NPU_DATA_TYPE_F16
}, false,
},
{
NPU_OP_MUL, is_element_wise_op_supported,
{
element_wise_op<vec_op_f32_f32<vmul_f32_f32>>, // NPU_DATA_TYPE_F32
element_wise_op<vec_op_f16_f16<vmul_f16_f16>>, // NPU_DATA_TYPE_F16
}, false,
},
{
NPU_OP_RMS_NORM, is_unary_op_supported,
{
unary_op<rms_norm_vec_f32>, // NPU_DATA_TYPE_F32
nullptr, // NPU_DATA_TYPE_F16
}, false,
},
};
static_assert(kOpCapabilities[NPU_OP_MUL_MAT].compute_funcs[NPU_DATA_TYPE_F32] == hexagon::mul_mat_f32,
@ -243,6 +413,8 @@ static_assert(kOpCapabilities[NPU_OP_MUL_MAT].compute_funcs[NPU_DATA_TYPE_F32] =
static_assert(std::size(kOpCapabilities) == NPU_OP_COUNT);
static_assert(kOpCapabilities[NPU_OP_MUL_MAT].op == NPU_OP_MUL_MAT, "kOpArray[NPU_OP_MUL_MAT].op != NPU_OP_MUL_MAT");
static_assert(kOpCapabilities[NPU_OP_MUL].op == NPU_OP_MUL, "kOpArray[NPU_OP_MUL].op != NPU_OP_MUL");
static_assert(kOpCapabilities[NPU_OP_RMS_NORM].op == NPU_OP_RMS_NORM,
"kOpArray[NPU_OP_RMS_NORM].op != NPU_OP_RMS_NORM");
hexagon::compute_func_type get_compute_func_impl(npu_device_tensor_op op, npu_device_tensor_data_type type) {
if (op >= NPU_OP_COUNT) {
@ -260,6 +432,14 @@ compute_func_type get_compute_func(tensor * dst) {
return get_compute_func_impl(dst->get_op(), dst->get_type());
}
bool requires_thread_barrier(npu_device_tensor_op op) {
if (op >= NPU_OP_COUNT) {
return false;
}
return kOpCapabilities[op].requires_thread_barrier;
}
bool support_op(const npu_device_tensor_spec & src0, const npu_device_tensor_spec & src1,
const npu_device_tensor_spec & dst, npu_device_tensor_op op) {
if (get_compute_func_impl(op, dst.type) == nullptr) {


@ -6,6 +6,8 @@ namespace hexagon {
compute_func_type get_compute_func(tensor * dst);
bool requires_thread_barrier(npu_device_tensor_op op);
bool support_op(const npu_device_tensor_spec & src0, const npu_device_tensor_spec & src1,
const npu_device_tensor_spec & dst, npu_device_tensor_op op);


@ -3,44 +3,43 @@
#include <HTP/core/intrinsics.h>
#include "quants.hpp"
#include "thread_pool.hpp" // TODO: remove this dependency
#include "vtcm_mem.hpp"
namespace {
inline float vec_reduction_f32(HVX_Vector sums) {
constexpr const size_t kFloatsPerVector = hexagon::kBytesPerVector / sizeof(float);
static_assert(kFloatsPerVector == 32 || kFloatsPerVector == 16, "kFloatsPerVector should be 16 or 32");
// TODO: do we have a better way to do the reduction?
switch (kFloatsPerVector) {
default:
case 32:
sums = Q6_Vqf32_vadd_Vqf32Vqf32(sums, Q6_V_vror_VR(sums, 16 * sizeof(float)));
// fallthrough
case 16:
sums = Q6_Vqf32_vadd_Vqf32Vqf32(sums, Q6_V_vror_VR(sums, 8 * sizeof(float)));
sums = Q6_Vqf32_vadd_Vqf32Vqf32(sums, Q6_V_vror_VR(sums, 4 * sizeof(float)));
sums = Q6_Vqf32_vadd_Vqf32Vqf32(sums, Q6_V_vror_VR(sums, 2 * sizeof(float)));
sums = Q6_Vqf32_vadd_Vqf32Vqf32(sums, Q6_V_vror_VR(sums, sizeof(float)));
break;
}
return hexagon::get_flt0_from_fltv(Q6_Vsf_equals_Vqf32(sums));
}
inline float vec_dot_product_f32_f32(const float * src0, const float * src1, size_t count) {
constexpr const size_t kElementsPerVector = hexagon::kBytesPerVector / sizeof(float);
HVX_Vector * iptr0 = ((HVX_Vector *) src0);
HVX_Vector * iptr0_end = ((HVX_Vector *) src0) + (count / kElementsPerVector);
HVX_Vector * iptr1 = ((HVX_Vector *) src1);
HVX_Vector prev0 = *iptr0++;
HVX_Vector prev1 = *iptr1++;
HVX_Vector sum = Q6_V_vzero();
HVX_Vector * src0_vec_ptr = ((HVX_Vector *) src0);
HVX_Vector * src0_vec_ptr_end = ((HVX_Vector *) src0) + count / kElementsPerVector;
HVX_Vector * src1_vec_ptr = ((HVX_Vector *) src1);
HVX_Vector prev0 = *src0_vec_ptr++;
HVX_Vector prev1 = *src1_vec_ptr++;
HVX_Vector sum = Q6_V_vzero();
while (iptr0 < iptr0_end) {
HVX_Vector curr0 = *iptr0++;
HVX_Vector curr1 = *iptr1++;
while (src0_vec_ptr_end - src0_vec_ptr > 1) {
HVX_Vector curr0_lo = src0_vec_ptr[0];
HVX_Vector curr0_hi = src0_vec_ptr[1];
HVX_Vector curr1_lo = src1_vec_ptr[0];
HVX_Vector curr1_hi = src1_vec_ptr[1];
HVX_Vector l0 = Q6_V_valign_VVR(curr0_lo, prev0, (size_t) src0);
HVX_Vector l1 = Q6_V_valign_VVR(curr1_lo, prev1, (size_t) src1);
HVX_Vector h0 = Q6_V_valign_VVR(curr0_hi, curr0_lo, (size_t) src0);
HVX_Vector h1 = Q6_V_valign_VVR(curr1_hi, curr1_lo, (size_t) src1);
sum = Q6_Vqf32_vadd_Vqf32Vqf32(Q6_Vqf32_vmpy_VsfVsf(l0, l1), sum);
sum = Q6_Vqf32_vadd_Vqf32Vqf32(Q6_Vqf32_vmpy_VsfVsf(h0, h1), sum);
prev0 = curr0_hi;
prev1 = curr1_hi;
src0_vec_ptr += 2;
src1_vec_ptr += 2;
}
if (src0_vec_ptr_end - src0_vec_ptr > 0) {
HVX_Vector curr0 = *src0_vec_ptr++;
HVX_Vector curr1 = *src1_vec_ptr++;
HVX_Vector s0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0);
HVX_Vector s1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1);
sum = Q6_Vqf32_vadd_Vqf32Vqf32(Q6_Vqf32_vmpy_VsfVsf(s0, s1), sum);
@ -48,17 +47,17 @@ inline float vec_dot_product_f32_f32(const float * src0, const float * src1, siz
prev1 = curr1;
}
if ((iptr0_end - ((HVX_Vector *) src0)) > 0) {
if ((src0_vec_ptr_end - ((HVX_Vector *) src0)) > 0) {
// handle the last vector
// see also:
// https://github.com/UbiquitousLearning/mllm/blob/babf4410352ce8730824c87699c025a0d4ce3a6f/src/backends/qnn/LLaMAOpPackageHtp/LLaMAPackage/src/ops/LLaMAMul.cpp#L147
// or qualcomm sdk libs\qhl_hvx\src\qhblas_hvx\qhblas_hvx_aw_vector_add_ah.c
bool iptr0_aligned = hexagon::is_addr_aligned(iptr0);
HVX_Vector curr0 = iptr0_aligned ? prev0 : *iptr0;
iptr0 = iptr0_aligned ? iptr0 : iptr0 + 1;
bool iptr1_aligned = hexagon::is_addr_aligned(iptr1);
HVX_Vector curr1 = iptr1_aligned ? prev1 : *iptr1;
iptr1 = iptr1_aligned ? iptr1 : iptr1 + 1;
bool iptr0_aligned = hexagon::is_addr_aligned(src0_vec_ptr);
HVX_Vector curr0 = iptr0_aligned ? prev0 : *src0_vec_ptr;
src0_vec_ptr = iptr0_aligned ? src0_vec_ptr : src0_vec_ptr + 1;
bool iptr1_aligned = hexagon::is_addr_aligned(src1_vec_ptr);
HVX_Vector curr1 = iptr1_aligned ? prev1 : *src1_vec_ptr;
src1_vec_ptr = iptr1_aligned ? src1_vec_ptr : src1_vec_ptr + 1;
HVX_Vector s0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0);
HVX_Vector s1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1);
sum = Q6_Vqf32_vadd_Vqf32Vqf32(Q6_Vqf32_vmpy_VsfVsf(s0, s1), sum);
@ -70,19 +69,21 @@ inline float vec_dot_product_f32_f32(const float * src0, const float * src1, siz
const size_t leftover_bytes = leftover * sizeof(float);
if (leftover > 0) {
// handle the leftover elements
HVX_Vector curr0 =
(leftover_bytes + hexagon::unaligned_bytes(iptr0) > hexagon::kBytesPerVector) ? *iptr0 : prev0;
curr0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0);
HVX_Vector curr0 = (leftover_bytes + hexagon::unaligned_bytes(src0_vec_ptr) > hexagon::kBytesPerVector) ?
*src0_vec_ptr :
prev0;
curr0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0);
HVX_Vector curr1 =
(leftover_bytes + hexagon::unaligned_bytes(iptr1) > hexagon::kBytesPerVector) ? *iptr1 : prev1;
curr1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1);
HVX_Vector curr1 = (leftover_bytes + hexagon::unaligned_bytes(src1_vec_ptr) > hexagon::kBytesPerVector) ?
*src1_vec_ptr :
prev1;
curr1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1);
sum = Q6_Vqf32_vadd_Vqf32Vqf32(
Q6_V_valign_VVR(Q6_Vqf32_vmpy_VsfVsf(curr0, curr1), Q6_V_vzero(), leftover_bytes), sum);
}
return vec_reduction_f32(sum);
return hexagon::vec_reduction_f32(sum);
}
// TODO: merge with vec_dot_product_f32_f32?
@ -90,17 +91,17 @@ inline float vec_dot_product_f16_f16(const npu_device_fp16_t * src0, const npu_d
constexpr const size_t kElementsPerVector = hexagon::kBytesPerVector / sizeof(npu_device_fp16_t);
constexpr const size_t kFloatsPerVector = hexagon::kBytesPerVector / sizeof(float);
HVX_Vector * iptr0 = ((HVX_Vector *) src0);
HVX_Vector * iptr0_end = ((HVX_Vector *) src0) + (count / kElementsPerVector);
HVX_Vector * iptr1 = ((HVX_Vector *) src1);
HVX_Vector prev0 = *iptr0++;
HVX_Vector prev1 = *iptr1++;
HVX_Vector sum_hi = Q6_V_vzero();
HVX_Vector sum_lo = Q6_V_vzero();
HVX_Vector * src0_vec_ptr = ((HVX_Vector *) src0);
HVX_Vector * src0_vec_ptr_end = ((HVX_Vector *) src0) + (count / kElementsPerVector);
HVX_Vector * src1_vec_ptr = ((HVX_Vector *) src1);
HVX_Vector prev0 = *src0_vec_ptr++;
HVX_Vector prev1 = *src1_vec_ptr++;
HVX_Vector sum_hi = Q6_V_vzero();
HVX_Vector sum_lo = Q6_V_vzero();
while (iptr0 < iptr0_end) {
HVX_Vector curr0 = *iptr0++;
HVX_Vector curr1 = *iptr1++;
while (src0_vec_ptr < src0_vec_ptr_end) {
HVX_Vector curr0 = *src0_vec_ptr++;
HVX_Vector curr1 = *src1_vec_ptr++;
HVX_Vector s0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0);
HVX_Vector s1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1);
HVX_VectorPair result = Q6_Wqf32_vmpy_VhfVhf(s0, s1);
@ -110,17 +111,17 @@ inline float vec_dot_product_f16_f16(const npu_device_fp16_t * src0, const npu_d
prev1 = curr1;
}
if ((iptr0_end - ((HVX_Vector *) src0)) > 0) {
if ((src0_vec_ptr_end - ((HVX_Vector *) src0)) > 0) {
// handle the last vector
// see also:
// https://github.com/UbiquitousLearning/mllm/blob/babf4410352ce8730824c87699c025a0d4ce3a6f/src/backends/qnn/LLaMAOpPackageHtp/LLaMAPackage/src/ops/LLaMAMul.cpp#L147
// or qualcomm sdk libs\qhl_hvx\src\qhblas_hvx\qhblas_hvx_aw_vector_add_ah.c
bool iptr0_aligned = hexagon::is_addr_aligned(iptr0);
HVX_Vector curr0 = iptr0_aligned ? prev0 : *iptr0;
iptr0 = iptr0_aligned ? iptr0 : iptr0 + 1;
bool iptr1_aligned = hexagon::is_addr_aligned(iptr1);
HVX_Vector curr1 = iptr1_aligned ? prev1 : *iptr1;
iptr1 = iptr1_aligned ? iptr1 : iptr1 + 1;
bool iptr0_aligned = hexagon::is_addr_aligned(src0_vec_ptr);
HVX_Vector curr0 = iptr0_aligned ? prev0 : *src0_vec_ptr;
src0_vec_ptr = iptr0_aligned ? src0_vec_ptr : src0_vec_ptr + 1;
bool iptr1_aligned = hexagon::is_addr_aligned(src1_vec_ptr);
HVX_Vector curr1 = iptr1_aligned ? prev1 : *src1_vec_ptr;
src1_vec_ptr = iptr1_aligned ? src1_vec_ptr : src1_vec_ptr + 1;
HVX_Vector s0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0);
HVX_Vector s1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1);
HVX_VectorPair result = Q6_Wqf32_vmpy_VhfVhf(s0, s1);
@ -134,13 +135,15 @@ inline float vec_dot_product_f16_f16(const npu_device_fp16_t * src0, const npu_d
const size_t leftover_bytes = leftover * sizeof(npu_device_fp16_t);
if (leftover > 0) {
// handle the leftover elements
HVX_Vector curr0 =
(leftover_bytes + hexagon::unaligned_bytes(iptr0) > hexagon::kBytesPerVector) ? *iptr0 : prev0;
curr0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0);
HVX_Vector curr0 = (leftover_bytes + hexagon::unaligned_bytes(src0_vec_ptr) > hexagon::kBytesPerVector) ?
*src0_vec_ptr :
prev0;
curr0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0);
HVX_Vector curr1 =
(leftover_bytes + hexagon::unaligned_bytes(iptr1) > hexagon::kBytesPerVector) ? *iptr1 : prev1;
curr1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1);
HVX_Vector curr1 = (leftover_bytes + hexagon::unaligned_bytes(src1_vec_ptr) > hexagon::kBytesPerVector) ?
*src1_vec_ptr :
prev1;
curr1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1);
HVX_VectorPair result = Q6_Wqf32_vmpy_VhfVhf(curr0, curr1);
@ -156,7 +159,7 @@ inline float vec_dot_product_f16_f16(const npu_device_fp16_t * src0, const npu_d
}
}
return vec_reduction_f32(Q6_Vqf32_vadd_Vqf32Vqf32(sum_hi, sum_lo));
return hexagon::vec_reduction_f32(Q6_Vqf32_vadd_Vqf32Vqf32(sum_hi, sum_lo));
}
template <typename T> struct get_data_type {};
@ -208,70 +211,118 @@ void mul_mat_impl(hexagon::tensor * src0, hexagon::tensor * src1, hexagon::tenso
}
// cache the src0 plane in VTCM
const size_t src0_plane_row_count = start_end_element.second - start_end_element.first;
size_t src0_plane_cache_size = 0;
uint8_t * src0_plane_cache_ptr = nullptr;
const uint8_t * last_cached_plane_ptr = nullptr;
size_t src0_plane_slice_row_count = start_end_element.second - start_end_element.first;
size_t src0_plane_cache_size = 0;
uint8_t * src0_plane_cache_ptr = nullptr;
const uint8_t * last_cached_plane_ptr = nullptr;
bool is_mem_cache = false;
if (is_quantized) {
src0_plane_cache_size = src0_actual_row_size * src0_plane_row_count;
src0_plane_cache_ptr = params->get_cache(src0_plane_cache_size, is_quantized);
src0_plane_slice_row_count =
std::min(params->vtcm_quota_size / src0_actual_row_size, src0_plane_slice_row_count);
src0_plane_cache_size = src0_actual_row_size * src0_plane_slice_row_count;
src0_plane_cache_ptr = params->get_vtcm_cache(src0_plane_cache_size);
if (src0_plane_cache_ptr == nullptr) {
DEVICE_LOG_DEBUG(
"mul_mat_impl: failed to get VTCM cache for src0, size: %zu, src0_plane_slice_row_count: %zu, "
"src0_actual_row_size: %zu, will fallback to mem cache\n",
src0_plane_cache_size, src0_plane_slice_row_count, src0_actual_row_size);
src0_plane_cache_ptr = params->get_mem_cache(src0_plane_cache_size);
is_mem_cache = true;
}
}
DEVICE_LOG_DEBUG("mul_mat_impl src0_actual_row_size: %zu, is_quantized: %d, vtcm_mem: %p(%zu)\n",
src0_actual_row_size, is_quantized, (void *) src0_plane_cache_ptr, src0_plane_cache_size);
DEVICE_LOG_DEBUG(
"mul_mat_impl src0_actual_row_size: %zu, src0_plane_slice_row_count: %zu, is_quantized: %d, vtcm_mem: "
"%p(%zu)\n",
src0_actual_row_size, src0_plane_slice_row_count, is_quantized, (void *) src0_plane_cache_ptr,
src0_plane_cache_size);
const size_t valid_row_bytes = src1->get_ne(0) * sizeof(data_type);
DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_WITH_SUB_PROC(dst, params->tidx, dequant);
for (int64_t ip = start_end_plane.first; ip < start_end_plane.second; ip++) {
const auto i3 = ip / dst->get_ne(2);
const auto i2 = ip - i3 * dst->get_ne(2);
const auto * src0_plane = src0_ptr + i3 / r03 * src0->get_nb(3) + i2 / r02 * src0->get_nb(2) +
start_end_element.first * src0->get_nb(1);
const auto * src1_plane = src1_ptr + i3 * src1->get_nb(3) + i2 * src1->get_nb(2);
auto * dst_plane = dst_ptr + i3 * dst->get_nb(3) + i2 * dst->get_nb(2);
for (int64_t col_idx = start_end_element.first; col_idx < start_end_element.second;
col_idx += src0_plane_slice_row_count) {
const auto * src0_plane =
src0_ptr + i3 / r03 * src0->get_nb(3) + i2 / r02 * src0->get_nb(2) + col_idx * src0->get_nb(1);
if (src0_plane_cache_ptr) {
if (last_cached_plane_ptr != src0_plane) {
DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_ADD_SUB_PROC(dequant);
if (src0_plane_cache_ptr) {
if (last_cached_plane_ptr != src0_plane) {
DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_ADD_SUB_PROC(dequant);
for (int64_t ir = 0; ir < (int64_t) src0_plane_slice_row_count; ir++) {
auto * src0_row = src0_plane + ir * src0->get_nb(1);
if (ir + 1 < src0_plane_slice_row_count) {
hexagon::l2fetch_row(src0_row + src0->get_nb(1), src0->get_nb(1));
}
for (int64_t ir = 0; ir < (int64_t) src0_plane_row_count; ir++) {
auto * src0_row = src0_plane + ir * src0->get_nb(1);
if (ir + 1 < src0_plane_row_count) {
hexagon::l2fetch_row(src0_row + src0->get_nb(1), src0->get_nb(1));
auto * dst_row = reinterpret_cast<float *>(src0_plane_cache_ptr + ir * src0_actual_row_size);
dequantize_row_func(src0_row, reinterpret_cast<float *>(dst_row), src0->get_ne(0),
params->f16_to_f32_table);
}
auto * dst_row = reinterpret_cast<float *>(src0_plane_cache_ptr + ir * src0_actual_row_size);
dequantize_row_func(src0_row, reinterpret_cast<float *>(dst_row), src0->get_ne(0),
params->f16_to_f32_table);
last_cached_plane_ptr = src0_plane;
}
last_cached_plane_ptr = src0_plane;
src0_plane = src0_plane_cache_ptr;
}
src0_plane = src0_plane_cache_ptr;
}
for (int64_t i1 = start_end_row.first; i1 < start_end_row.second; i1++) {
auto * src1_row = src1_plane + i1 * src1->get_nb(1);
auto * dst_row = reinterpret_cast<float *>(dst_plane + i1 * dst->get_nb(1)) + start_end_element.first;
for (int64_t i0 = 0; i0 < (int64_t) src0_plane_row_count; i0++) {
auto * src0_row = src0_plane + i0 * src0_actual_row_size;
if (i0 + 1 < src0_plane_row_count) {
if (!src0_plane_cache_ptr) {
hexagon::l2fetch_row(src0_row + src0_actual_row_size, valid_row_bytes);
for (int64_t i1 = start_end_row.first; i1 < start_end_row.second; i1++) {
auto * src1_row = src1_plane + i1 * src1->get_nb(1);
auto * dst_row = reinterpret_cast<float *>(dst_plane + i1 * dst->get_nb(1)) + col_idx;
for (int64_t i0 = 0; i0 < (int64_t) src0_plane_slice_row_count; i0++) {
auto * src0_row = src0_plane + i0 * src0_actual_row_size;
if (i0 + 1 < src0_plane_slice_row_count) {
if (!src0_plane_cache_ptr || is_mem_cache) {
hexagon::l2fetch_row(src0_row + src0_actual_row_size, valid_row_bytes);
}
} else if (ip + 1 < start_end_plane.second) {
hexagon::l2fetch_row(src1_row + src1->get_nb(1), valid_row_bytes);
}
} else if (ip + 1 < start_end_plane.second) {
hexagon::l2fetch_row(src1_row + src1->get_nb(1), valid_row_bytes);
}
// TODO: figure out how to handle an entire row
dst_row[i0] = _DotFunc(reinterpret_cast<const data_type *>(src0_row),
reinterpret_cast<const data_type *>(src1_row), (size_t) src0->get_ne(0));
// TODO: figure out how to handle an entire row
dst_row[i0] = _DotFunc(reinterpret_cast<const data_type *>(src0_row),
reinterpret_cast<const data_type *>(src1_row), (size_t) src0->get_ne(0));
}
}
}
}
}
bool is_quantized_mul_mat_supported(const npu_device_tensor_spec & src0, const npu_device_tensor_spec & src1) {
if (src1.type != NPU_DATA_TYPE_F32) {
DEVICE_LOG_DEBUG("[MUL_MAT]src0.type(%s) and src1.type(%s) mismatch and src1 is not F32\n",
hexagon::get_type_name(src0.type), hexagon::get_type_name(src1.type));
return false;
}
const auto type_traits = hexagon::get_type_traits(src0.type);
if (!type_traits.is_quantized || type_traits.dequantize_row == nullptr) {
DEVICE_LOG_DEBUG("[MUL_MAT]src0.type(%s) and src1.type(%s) mismatch and src0 is not quantized\n",
hexagon::get_type_name(src0.type), hexagon::get_type_name(src1.type));
return false;
}
if (src0.ne[0] % type_traits.blck_size) {
DEVICE_LOG_DEBUG("[MUL_MAT]src0.type(%s) ne[0] is not aligned: %ld\n", hexagon::get_type_name(src0.type),
(long) src0.ne[0]);
return false;
}
const auto vtcm_thread_quota_size = hexagon::vtcm_mem::get_total_size() / hexagon::kMaxThreadCount;
if (src0.ne[0] * sizeof(hexagon::dequantized_element_type) > vtcm_thread_quota_size) {
DEVICE_LOG_DEBUG("[MUL_MAT]src0.type(%s) ne[0] is too large: %ld, vtcm_thread_quota_size: %zu\n",
hexagon::get_type_name(src0.type), (long) src0.ne[0], vtcm_thread_quota_size);
return false;
}
DEVICE_LOG_DEBUG("[MUL_MAT]supported quantized src0.type(%s) and src1.type(%s)\n",
hexagon::get_type_name(src0.type), hexagon::get_type_name(src1.type));
return true;
}
} // namespace
namespace hexagon {
@ -319,27 +370,9 @@ bool is_mul_mat_supported(const npu_device_tensor_spec & src0, const npu_device_
if (src0.type != src1.type) {
#ifdef GGML_HEXAGON_ENABLE_QUANTIZED_TENSORS
if (src1.type != NPU_DATA_TYPE_F32) {
DEVICE_LOG_DEBUG("[%s]src0.type(%s) and src1.type(%s) mismatch and src1 is not F32\n", op_get_name(op),
get_type_name(src0.type), get_type_name(src1.type));
if (!is_quantized_mul_mat_supported(src0, src1)) {
return false;
}
const auto type_traits = get_type_traits(src0.type);
if (!type_traits.is_quantized || type_traits.dequantize_row == nullptr) {
DEVICE_LOG_DEBUG("[%s]src0.type(%s) and src1.type(%s) mismatch and src0 is not quantized\n",
op_get_name(op), get_type_name(src0.type), get_type_name(src1.type));
return false;
}
if (src0.ne[0] % type_traits.blck_size) {
DEVICE_LOG_DEBUG("[%s]src0.type(%s) ne[0] is not aligned: %ld\n", op_get_name(op), get_type_name(src0.type),
(long) src0.ne[0]);
return false;
}
DEVICE_LOG_DEBUG("[%s]supported quantized src0.type(%s) and src1.type(%s)\n", op_get_name(op),
get_type_name(src0.type), get_type_name(src1.type));
#else
DEVICE_LOG_DEBUG("[%s]src0.type(%s) and src1.type(%s) mismatch and quantized tensors are not supported\n",
op_get_name(op), get_type_name(src0.type), get_type_name(src1.type));


@ -7,11 +7,6 @@
namespace hexagon {
constexpr const size_t kBytesPerVector = sizeof(HVX_Vector); // 128 for v73
constexpr const size_t kAlignMask = kBytesPerVector - 1;
constexpr const size_t kL2CacheSize = 8 * 1024; // 8KB L2 cache
constexpr const size_t kL2FetchAheadVectors = kL2CacheSize / kBytesPerVector;
inline size_t unaligned_bytes(const void * addr) {
return ((size_t) addr) & kAlignMask;
}
@ -43,6 +38,31 @@ inline float get_flt0_from_fltv(HVX_Vector vect) {
return cvt.f;
}
inline HVX_Vector vec_reduction_qf32(HVX_Vector sums) {
constexpr const size_t kFloatsPerVector = hexagon::kBytesPerVector / sizeof(float);
static_assert(kFloatsPerVector == 32 || kFloatsPerVector == 16, "kFloatsPerVector should be 16 or 32");
// TODO: do we have a better way to do the reduction?
switch (kFloatsPerVector) {
default:
case 32:
sums = Q6_Vqf32_vadd_Vqf32Vqf32(sums, Q6_V_vror_VR(sums, 16 * sizeof(float)));
// fallthrough
case 16:
sums = Q6_Vqf32_vadd_Vqf32Vqf32(sums, Q6_V_vror_VR(sums, 8 * sizeof(float)));
sums = Q6_Vqf32_vadd_Vqf32Vqf32(sums, Q6_V_vror_VR(sums, 4 * sizeof(float)));
sums = Q6_Vqf32_vadd_Vqf32Vqf32(sums, Q6_V_vror_VR(sums, 2 * sizeof(float)));
sums = Q6_Vqf32_vadd_Vqf32Vqf32(sums, Q6_V_vror_VR(sums, sizeof(float)));
break;
}
return sums;
}
inline float vec_reduction_f32(HVX_Vector sums) {
return hexagon::get_flt0_from_fltv(Q6_Vsf_equals_Vqf32(vec_reduction_qf32(sums)));
}
bool mul_mat_f32(tensor * out, compute_params * params);
bool is_mul_mat_supported(const npu_device_tensor_spec & src0, const npu_device_tensor_spec & src1,
const npu_device_tensor_spec & dst, npu_device_tensor_op op);


@ -1,5 +1,7 @@
#pragma once
#include <hexagon_types.h>
#include <algorithm>
#include <cstdint>
#include <memory>
@ -15,26 +17,25 @@ namespace hexagon {
struct compute_params {
const size_t tidx;
const size_t tcnt;
const size_t vtcm_quota_size;
const float * f16_to_f32_table;
std::unique_ptr<hexagon::vtcm_mem> vtcm_cache;
std::unique_ptr<uint8_t[]> mem_cache;
size_t mem_cache_size = 0;
uint8_t * get_cache(size_t size, bool fallback_to_mem) {
uint8_t * get_vtcm_cache(size_t size) {
if (!vtcm_cache || vtcm_cache->get_size() < size) {
vtcm_cache = std::make_unique<hexagon::vtcm_mem>(size, false);
}
if (vtcm_cache->is_valid()) {
return vtcm_cache->get_mem();
}
if (!fallback_to_mem) {
DEVICE_LOG_DEBUG("vtcm_mem not valid, return nullptr\n");
if (!vtcm_cache->is_valid()) {
return nullptr;
}
DEVICE_LOG_DEBUG("vtcm_mem not valid, allocate from mem_cache\n");
return vtcm_cache->get_mem();
}
uint8_t * get_mem_cache(size_t size) {
if (!mem_cache || mem_cache_size < size) {
mem_cache = std::make_unique<uint8_t[]>(size + 256);
mem_cache_size = mem_cache ? size : 0;
@ -49,10 +50,31 @@ typedef bool (*op_is_supported_func_type)(const npu_device_tensor_spec & src0, c
const npu_device_tensor_spec & dst, npu_device_tensor_op op);
inline constexpr std::pair<int64_t, int64_t> get_thread_work_slice(int64_t total, size_t tidx, size_t tcnt) {
const auto elements_per_thread = (total + tcnt - 1) / tcnt;
const auto start = tidx * elements_per_thread;
const auto end = std::min<int64_t>(start + elements_per_thread, total);
return { start, end };
if (total <= 0 || tidx >= tcnt) {
return { 0, 0 }; // No work for this thread
}
const auto elements_per_thread = total / tcnt;
const auto remainder = total % tcnt;
int64_t start = 0;
int64_t end = 0;
if (tidx < remainder) {
// First 'remainder' threads get one extra item
start = tidx * (elements_per_thread + 1);
end = start + elements_per_thread + 1;
} else {
// Remaining threads get the base number of elements
start = remainder * (elements_per_thread + 1) + (tidx - remainder) * elements_per_thread;
end = start + elements_per_thread;
}
return { start, std::min(end, total) };
}
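Worked example for the new slicing (editorial note, not part of the commit): the old code gave every thread ceil(total / tcnt) elements, so 10 rows over 4 threads split as 3, 3, 3, 1; the remainder-aware version above yields 3, 3, 2, 2. A standalone sketch that mirrors the logic:

// Standalone mirror of get_thread_work_slice (illustrative sketch, not in the commit).
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <utility>

static std::pair<int64_t, int64_t> slice(int64_t total, size_t tidx, size_t tcnt) {
    if (total <= 0 || tidx >= tcnt) {
        return { 0, 0 };
    }
    const int64_t base = total / (int64_t) tcnt;
    const int64_t rem  = total % (int64_t) tcnt;
    const int64_t start = (int64_t) tidx < rem ? (int64_t) tidx * (base + 1)
                                               : rem * (base + 1) + ((int64_t) tidx - rem) * base;
    const int64_t end = start + base + ((int64_t) tidx < rem ? 1 : 0);
    return { start, std::min(end, total) };
}

int main() {
    for (size_t t = 0; t < 4; ++t) {
        const auto s = slice(10, t, 4);
        printf("thread %zu: rows [%lld, %lld)\n", t, (long long) s.first, (long long) s.second);
    }
    return 0; // prints [0,3) [3,6) [6,8) [8,10): 3, 3, 2, 2 rows per thread
}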
constexpr const size_t kBytesPerVector = sizeof(HVX_Vector); // 128 for v73
constexpr const size_t kAlignMask = kBytesPerVector - 1;
constexpr const size_t kL2CacheSize = 8 * 1024; // 8KB L2 cache
constexpr const size_t kL2FetchAheadVectors = kL2CacheSize / kBytesPerVector;
} // namespace hexagon


@ -4,6 +4,8 @@
#include <array>
#include "op_types.hpp" // TODO: remove this include
static_assert(sizeof(npu_device_block_q4_K) ==
2 * sizeof(npu_device_fp16_t) + QUANT_K_SCALE_SIZE + QUANT_K_BLOCK_SIZE / 2,
"wrong q4_K block size/padding");
@ -16,14 +18,34 @@ static_assert(sizeof(npu_device_block_q8_0) == sizeof(npu_device_fp16_t) + QUANT
namespace {
inline float to_float(const npu_device_fp16_t src) {
union {
__fp16 f16;
npu_device_fp16_t u16;
} f16;
inline HVX_Vector vmemu(const void * unaligned_ptr) {
HVX_Vector ret = *reinterpret_cast<const HVX_UVector *>(unaligned_ptr);
return ret;
}
f16.u16 = src;
return f16.f16;
inline float to_float(const npu_device_fp16_t src) {
return reinterpret_cast<const __fp16 &>(src);
}
template <typename _TBlock> inline HVX_Vector load_block_generic(const _TBlock & src) {
uint8_t buffer[hexagon::kBytesPerVector];
static_assert(sizeof(buffer) == sizeof(HVX_Vector), "wrong cvt size/padding");
static_assert(sizeof(buffer) >= sizeof(src.qs), "wrong q4_0 block size/padding");
memcpy(&buffer[0], src.qs, sizeof(src.qs));
return *reinterpret_cast<HVX_UVector *>(buffer);
}
template <typename _TBlock> inline HVX_Vector load_dual_block_generic(const _TBlock & src1, const _TBlock & src2) {
uint8_t buffer[hexagon::kBytesPerVector];
static_assert(sizeof(buffer) == sizeof(HVX_Vector), "wrong cvt size/padding");
static_assert(sizeof(buffer) >= sizeof(src1.qs) * 2, "wrong q4_0 block size/padding");
memcpy(&buffer[0], src1.qs, sizeof(src1.qs));
memcpy(&buffer[sizeof(src1.qs)], src2.qs, sizeof(src2.qs));
return *reinterpret_cast<HVX_UVector *>(buffer);
}
inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t * d, uint8_t * m) {
@ -37,38 +59,78 @@ inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t * d, uint8_t * m)
}
void dequantize_row_q8_0(const void * src, float * dst, size_t count, const float * f16_to_f32_table) {
constexpr const int qk = QUANT_BLOCK_SIZE;
const int nb = count / qk;
const auto * src_ptr = reinterpret_cast<const npu_device_block_q8_0 *>(src);
constexpr const int qk = QUANT_BLOCK_SIZE;
static_assert(QUANT_BLOCK_SIZE == hexagon::kBytesPerVector / sizeof(float));
const int nb = count / qk;
const auto * src_ptr = reinterpret_cast<const npu_device_block_q8_0 *>(src);
HVX_UVector * out = ((HVX_UVector *) dst); // TODO: opt for aligned access
// TODO: use intrinsics
for (int i = 0; i < nb; i++) {
const float d = f16_to_f32_table[src_ptr[i].d];
const auto & src = src_ptr[i];
HVX_Vector d = Q6_Vh_vsplat_R(src.d);
for (int j = 0; j < qk; ++j) {
dst[i * qk + j] = src_ptr[i].qs[j] * d;
}
HVX_Vector q_lo = load_block_generic(src);
HVX_VectorPair q = Q6_Wh_vunpack_Vb(q_lo);
q = Q6_Wh_vunpack_Vb(Q6_V_lo_W(q));
q_lo = Q6_Vhf_equals_Vh(Q6_V_lo_W(q));
q = Q6_Wqf32_vmpy_VhfVhf(q_lo, d);
out[i] = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(q));
}
}
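The static_assert above pins QUANT_BLOCK_SIZE to kBytesPerVector / sizeof(float), i.e. 32 elements per block on a 128-byte HVX vector, so each q8_0 block dequantizes into exactly one output vector. The per-element math is unchanged from the deleted loop (qs[j] * d); a scalar reference sketch with an assumed block layout, for comparison only (not part of the commit):

// Scalar reference for dequantize_row_q8_0 (illustrative sketch, not in the commit).
// The block layout below is assumed from the static_asserts; the real type is npu_device_block_q8_0.
#include <cstddef>
#include <cstdint>

struct q8_0_block_ref {
    uint16_t d;        // fp16 scale, raw bits (indexes f16_to_f32_table)
    int8_t   qs[32];   // 32 signed 8-bit quantized values
};

static void dequantize_row_q8_0_ref(const q8_0_block_ref * blocks, float * dst, size_t count,
                                    const float * f16_to_f32_table) {
    const size_t nb = count / 32;
    for (size_t i = 0; i < nb; ++i) {
        const float d = f16_to_f32_table[blocks[i].d];   // same table lookup as the removed scalar code
        for (int j = 0; j < 32; ++j) {
            dst[i * 32 + j] = blocks[i].qs[j] * d;
        }
    }
}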
void dequantize_row_q4_0(const void * src, float * dst, size_t count, const float * f16_to_f32_table) {
constexpr const int qk = QUANT_BLOCK_SIZE;
static_assert(qk % 2 == 0, "qk must be even");
static_assert(QUANT_BLOCK_SIZE == hexagon::kBytesPerVector / sizeof(float));
constexpr const uint32_t kSizeOfQs = sizeof(npu_device_block_q4_0::qs);
const int nb = count / qk;
const auto * src_ptr = reinterpret_cast<const npu_device_block_q4_0 *>(src);
const int nb = count / qk;
const auto * src_ptr = reinterpret_cast<const npu_device_block_q4_0 *>(src);
HVX_Vector mask = Q6_Vb_vsplat_R(0x0F);
HVX_Vector minus = Q6_Vb_vsplat_R(8);
HVX_UVector * out = ((HVX_UVector *) dst); // TODO: opt for aligned access
// TODO: use intrinsics
for (int i = 0; i < nb; i++) {
const float d = f16_to_f32_table[src_ptr[i].d];
const int loop_count = nb - (nb % 2);
for (int i = 0; i < loop_count; i += 2) {
const auto & src1 = src_ptr[i];
const auto & src2 = src_ptr[i + 1];
for (int j = 0; j < qk / 2; ++j) {
const int x0 = (src_ptr[i].qs[j] & 0x0F) - 8;
const int x1 = ((src_ptr[i].qs[j] >> 4) & 0xF) - 8;
HVX_Vector d1 = Q6_Vh_vsplat_R(src1.d);
HVX_Vector d2 = Q6_Vh_vsplat_R(src2.d);
d1 = Q6_V_valign_VVR(d1, Q6_V_vzero(), hexagon::kBytesPerVector / 2);
d1 = Q6_V_valign_VVR(d2, d1, hexagon::kBytesPerVector / 2);
HVX_Vector d = Q6_Vh_vshuff_Vh(d1);
dst[i * qk + j + 0] = x0 * d;
dst[i * qk + j + qk / 2] = x1 * d;
}
HVX_Vector q_lo = load_dual_block_generic(src1, src2);
HVX_Vector q_hi = Q6_Vub_vlsr_VubR(q_lo, 4);
HVX_VectorPair q = Q6_W_vshuff_VVR(q_hi, Q6_V_vand_VV(q_lo, mask), kSizeOfQs);
q_lo = Q6_V_valign_VVR(Q6_V_lo_W(q), Q6_V_vzero(), hexagon::kBytesPerVector / 2);
q_lo = Q6_V_valign_VVR(Q6_V_hi_W(q), q_lo, hexagon::kBytesPerVector / 2);
q_lo = Q6_Vb_vshuff_Vb(q_lo);
q_lo = Q6_Vb_vsub_VbVb(q_lo, minus);
q = Q6_Wh_vunpack_Vb(q_lo);
q_lo = Q6_Vhf_equals_Vh(Q6_V_lo_W(q));
q = Q6_Wqf32_vmpy_VhfVhf(q_lo, d);
out[i] = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(q));
out[i + 1] = Q6_Vsf_equals_Vqf32(Q6_V_hi_W(q));
}
if (loop_count < nb) {
const auto & curr_blk = src_ptr[nb - 1];
HVX_Vector d = Q6_Vh_vsplat_R(curr_blk.d);
HVX_Vector q_lo = load_block_generic(curr_blk);
HVX_Vector q_hi = Q6_Vub_vlsr_VubR(q_lo, 4);
q_lo = Q6_V_valign_VVR(Q6_V_vand_VV(q_lo, mask), Q6_V_vzero(), sizeof(curr_blk.qs));
q_lo = Q6_V_valign_VVR(q_hi, q_lo, hexagon::kBytesPerVector - sizeof(curr_blk.qs));
q_lo = Q6_Vb_vsub_VbVb(q_lo, minus);
HVX_VectorPair q = Q6_Wh_vunpack_Vb(q_lo);
q = Q6_Wh_vunpack_Vb(Q6_V_lo_W(q));
q_lo = Q6_Vhf_equals_Vh(Q6_V_lo_W(q));
q = Q6_Wqf32_vmpy_VhfVhf(q_lo, d);
out[nb - 1] = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(q));
}
}
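The vector path above handles two q4_0 blocks per iteration (both fp16 scales shuffled into one half-float vector) plus a single-block tail, but the per-element math matches the deleted scalar loop: each byte packs two 4-bit values, offset by 8 and scaled by the block's d. A scalar reference sketch with an assumed block layout (illustration only, not part of the commit):

// Scalar reference for dequantize_row_q4_0 (illustrative sketch, not in the commit).
// Layout assumed: fp16 scale plus 16 packed bytes (32 4-bit values); the real type is npu_device_block_q4_0.
#include <cstddef>
#include <cstdint>

struct q4_0_block_ref {
    uint16_t d;        // fp16 scale, raw bits (indexes f16_to_f32_table)
    uint8_t  qs[16];   // 32 packed 4-bit values
};

static void dequantize_row_q4_0_ref(const q4_0_block_ref * blocks, float * dst, size_t count,
                                    const float * f16_to_f32_table) {
    const size_t nb = count / 32;
    for (size_t i = 0; i < nb; ++i) {
        const float d = f16_to_f32_table[blocks[i].d];
        for (int j = 0; j < 16; ++j) {
            const int x0 = (blocks[i].qs[j] & 0x0F) - 8;   // low nibble -> first half of the block
            const int x1 = (blocks[i].qs[j] >> 4) - 8;     // high nibble -> second half
            dst[i * 32 + j]      = x0 * d;
            dst[i * 32 + j + 16] = x1 * d;
        }
    }
}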


@ -23,13 +23,15 @@ inline bool is_quantized_type(npu_device_tensor_data_type type) {
return get_type_traits(type).is_quantized;
}
inline size_t get_dequantized_row_size(tensor * tensor) {
using dequantized_element_type = float;
inline size_t get_dequantized_row_size(const tensor * tensor) {
if (!is_quantized_type(tensor->get_type())) {
return tensor->get_nb(1); // for f32 and f16
}
auto row_elems_count = tensor->get_ne(0);
return row_elems_count * sizeof(float); // currently only f32 is supported
return row_elems_count * sizeof(dequantized_element_type); // currently only f32 is supported
}
inline const char * get_type_name(npu_device_tensor_data_type type) {


@ -8,7 +8,8 @@
namespace hexagon {
constexpr const size_t kMaxTensorSrc = DEVICE_TENSOR_MAX_SRC;
constexpr const size_t kMaxTensorSrc = DEVICE_TENSOR_MAX_SRC;
constexpr const size_t kMaxParamsCount = DEVICE_TENSOR_MAX_OP_PARAMS;
class tensor {
public:
@ -50,17 +51,17 @@ class tensor {
}
}
bool set_src(size_t index, tensor * src) {
if (index >= kMaxTensorSrc) {
return false;
void update_config(const npu_device_tensor_update_config & config) {
static_assert(sizeof(_op_params) == sizeof(config.params), "op params size mismatch");
_info.op = config.op;
memcpy(_op_params, config.params, sizeof(_op_params));
for (size_t i = 0; i < DEVICE_TENSOR_MAX_SRC; ++i) {
auto src_handle = config.src_handles[i];
_src[i] = (src_handle ? reinterpret_cast<tensor *>(src_handle) : nullptr);
}
_src[index] = src;
return true;
}
void set_op(npu_device_tensor_op op) { _info.op = op; }
tensor * get_src(size_t index) const {
if (index >= kMaxTensorSrc) {
return nullptr;
@ -77,6 +78,20 @@ class tensor {
npu_device_tensor_op get_op() const { return _info.op; }
template <typename _TyParam> const _TyParam get_op_param(size_t index) const {
static_assert(sizeof(_TyParam) <= sizeof(_op_params), "_op_param type size exceeds op params size");
if (sizeof(_TyParam) * (index + 1) >= sizeof(_op_params)) {
return 0;
}
return reinterpret_cast<const _TyParam *>(_op_params)[index];
}
const int32_t * get_op_params() const { return _op_params; }
const size_t get_op_param_count() const { return kMaxParamsCount; }
npu_device_tensor_data_type get_type() const { return _info.type; }
const uint8_t * get_read_buffer() const {
@ -89,9 +104,10 @@ class tensor {
bool is_valid() const { return _data != nullptr; }
private:
npu_device_tensor_config _info;
tensor * _src[kMaxTensorSrc] = {};
uint8_t * _data = nullptr;
npu_device_tensor_config _info = {};
int32_t _op_params[kMaxParamsCount] = {};
tensor * _src[kMaxTensorSrc] = {};
uint8_t * _data = nullptr;
DISABLE_COPY_AND_MOVE(tensor);
};


@ -12,7 +12,7 @@
namespace hexagon {
constexpr const size_t kMaxThreadCount = 4;
constexpr const size_t kDefaultStackSize = 1024 * 16; // 16KB
constexpr const size_t kDefaultStackSize = 1024 * 32; // 32KB
constexpr const unsigned long long kThreadTaskPendingBit = 1;
template <size_t _stack_size> class qurt_thread {
@ -80,7 +80,7 @@ using qurt_thread_ptr = std::unique_ptr<qurt_thread<kDefaultStackSize>>;
template <size_t _thread_count> class thread_pool {
static_assert(_thread_count > 1, "Thread count must be greater than 1");
constexpr const static size_t kMaxThreadCount = _thread_count - 1;
constexpr const static size_t kMaxSubThreadCount = _thread_count - 1;
public:
typedef qurt_thread<kDefaultStackSize> thread_type;
@ -88,9 +88,10 @@ template <size_t _thread_count> class thread_pool {
thread_pool() {
std::string thread_name_base = "thread_pool_";
qurt_barrier_init(&_pending, kMaxThreadCount + 1);
qurt_barrier_init(&_completed, kMaxThreadCount + 1);
for (size_t i = 0; i < kMaxThreadCount; ++i) {
qurt_barrier_init(&_pending, kMaxSubThreadCount + 1);
qurt_barrier_init(&_completed, kMaxSubThreadCount + 1);
const auto priority = qurt_thread_get_priority(qurt_thread_get_id());
for (size_t i = 0; i < kMaxSubThreadCount; ++i) {
auto & thread_arg = _thread_args[i];
thread_arg.pool = this;
thread_arg.thread_idx = i + 1;
@ -98,7 +99,7 @@ template <size_t _thread_count> class thread_pool {
auto thread = std::make_unique<thread_type>(
thread_name_base + std::to_string(i),
reinterpret_cast<thread_type::qurt_thread_func_type>(&thread_pool::thread_func_impl), &thread_arg,
QURT_THREAD_ATTR_PRIORITY_DEFAULT);
priority);
if (!thread->is_valid()) {
DEVICE_LOG_ERROR("Failed to create thread: %zu", i);
// destroy all barriers and threads at destructor
@ -107,7 +108,7 @@ template <size_t _thread_count> class thread_pool {
_threads[i] = std::move(thread);
}
DEVICE_LOG_DEBUG("thread_pool.created: %zu", kMaxThreadCount);
DEVICE_LOG_DEBUG("thread_pool.created: %zu", kMaxSubThreadCount);
}
~thread_pool() {
@ -133,7 +134,7 @@ template <size_t _thread_count> class thread_pool {
_arg = arg;
qurt_barrier_wait(&_pending);
task(this, 0, kMaxThreadCount + 1, arg);
task(this, 0, kMaxSubThreadCount + 1, arg);
DEVICE_LOG_DEBUG("main_thread.task_completed: 0");
qurt_barrier_wait(&_completed);
@ -166,7 +167,7 @@ template <size_t _thread_count> class thread_pool {
auto task = pool._task;
if (task) {
task(arg->pool, arg->thread_idx, kMaxThreadCount + 1, pool._arg);
task(arg->pool, arg->thread_idx, kMaxSubThreadCount + 1, pool._arg);
}
DEVICE_LOG_DEBUG("thread_func_impl.task_completed: %zu", arg->thread_idx);
@ -176,13 +177,13 @@ template <size_t _thread_count> class thread_pool {
DEVICE_LOG_DEBUG("thread_func_impl.end: %zu", arg->thread_idx);
}
std::atomic_bool _thread_exit = false;
std::array<qurt_thread_ptr, kMaxThreadCount> _threads;
thread_pool_arg _thread_args[kMaxThreadCount] = {};
qurt_barrier_t _pending = {};
qurt_barrier_t _completed = {};
task_type _task = nullptr;
void * _arg = nullptr;
std::atomic_bool _thread_exit = false;
std::array<qurt_thread_ptr, kMaxSubThreadCount> _threads;
thread_pool_arg _thread_args[kMaxSubThreadCount] = {};
qurt_barrier_t _pending = {};
qurt_barrier_t _completed = {};
task_type _task = nullptr;
void * _arg = nullptr;
DISABLE_COPY_AND_MOVE(thread_pool);
};


@ -1,7 +1,9 @@
#pragma once
#include <AEEStdDef.h>
#include <HAP_farf.h>
#include <HAP_perf.h>
#include <HAP_power.h>
#include <cstdint>
#include <cstring>
@ -48,11 +50,114 @@ inline constexpr const char * op_get_name(npu_device_tensor_op op) {
return "SUB";
case NPU_OP_MUL:
return "MUL";
case NPU_OP_RMS_NORM:
return "RMS_NORM";
default:
return "UNKNOWN";
}
}
class power_utils {
public:
power_utils() {
_context_ptr = HAP_utils_create_context();
if (_context_ptr == nullptr) {
DEVICE_LOG_ERROR("Failed to create power context\n");
}
}
~power_utils() {
if (_context_ptr != nullptr) {
HAP_utils_destroy_context(_context_ptr);
}
}
unsigned int get_clock_speed_hz() const {
if (!is_valid()) {
DEVICE_LOG_ERROR("Power context is not initialized\n");
return 0;
}
HAP_power_response_t response = {};
response.type = HAP_power_get_clk_Freq;
auto ret = HAP_power_get(_context_ptr, &response);
if (ret != AEE_SUCCESS) {
DEVICE_LOG_ERROR("Failed to get clock speed: %d\n", ret);
return 0;
}
return response.clkFreqHz;
}
bool get_dvcs_enabled() const {
if (!is_valid()) {
DEVICE_LOG_ERROR("Power context is not initialized\n");
return false;
}
HAP_power_response_t response = {};
response.type = HAP_power_get_dcvsEnabled;
auto ret = HAP_power_get(_context_ptr, &response);
if (ret != AEE_SUCCESS) {
DEVICE_LOG_ERROR("Failed to get DVCS enabled: %d\n", ret);
return false;
}
return response.dcvsEnabled;
}
void set_dvcs_performance_mode(bool enable) {
if (!is_valid()) {
DEVICE_LOG_ERROR("Power context is not initialized\n");
return;
}
HAP_power_request_t request = {};
request.type = HAP_power_set_DCVS_v3;
request.dcvs_v3.dcvs_enable = enable ? TRUE : FALSE;
if (enable) {
request.dcvs_v3.dcvs_option = HAP_DCVS_V2_PERFORMANCE_MODE;
/*
* sleep_latency : To request for sleep latency in micro-seconds.
* Sleep latency is the minimum time before which the DSP sleeps
* Set latency to 65535 to reset it to the default value
*/
request.dcvs_v3.set_latency = TRUE;
request.dcvs_v3.latency = 1000;
request.dcvs_v3.set_bus_params = TRUE;
request.dcvs_v3.bus_params.min_corner = HAP_DCVS_VCORNER_SVS;
request.dcvs_v3.bus_params.max_corner = HAP_DCVS_VCORNER_TURBO;
request.dcvs_v3.bus_params.target_corner = HAP_DCVS_VCORNER_NOM;
}
auto ret = HAP_power_set(_context_ptr, &request);
if (ret != AEE_SUCCESS) {
DEVICE_LOG_ERROR("Failed to set DVCS performance mode: %d\n", ret);
}
}
void set_sleep_mode(bool enable) {
if (!is_valid()) {
DEVICE_LOG_ERROR("Power context is not initialized\n");
return;
}
boolean sleep_disable = enable ? FALSE : TRUE;
auto ret = HAP_power_set_sleep_mode(_context_ptr, sleep_disable);
if (ret != AEE_SUCCESS) {
DEVICE_LOG_ERROR("Failed to set sleep mode: %d\n", ret);
}
}
bool is_valid() const { return _context_ptr != nullptr; }
private:
void * _context_ptr = nullptr;
DISABLE_COPY_AND_MOVE(power_utils);
};
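A possible call pattern for the new helper (editorial sketch based only on the methods shown above, not part of the commit): create it once, check that the HAP power context exists, then request performance-mode DCVS and keep the DSP awake during compute. Note that passing false to set_sleep_mode maps to sleep_disable = TRUE above.

// Possible use of hexagon::power_utils at backend init (illustrative sketch, not in the commit).
static void boost_npu_for_compute() {
    static hexagon::power_utils power;        // owns one HAP power context for the session
    if (!power.is_valid()) {
        return;                               // context creation failed, keep default clocks
    }
    power.set_dvcs_performance_mode(true);    // DCVS performance mode, SVS..TURBO corners as configured above
    power.set_sleep_mode(false);              // enable == false -> sleep_disable = TRUE -> DSP stays awake
}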
#ifdef GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING
template <size_t _buffer_count> class npu_scoped_timer {


@ -47,7 +47,7 @@ class vtcm_mem {
DEVICE_LOG_DEBUG("VTCM released: %zu bytes at %p\n", _vtcm_size, _vtcm_mem);
}
bool is_valid() const { return _vtcm_mem != nullptr; }
bool is_valid() const { return _vtcm_mem != nullptr && _vtcm_size != 0; }
uint8_t * get_mem() const { return reinterpret_cast<uint8_t *>(_vtcm_mem); }


@ -177,7 +177,7 @@ std::shared_ptr<host_tensor> host_buffer::init_tensor(ggml_tensor * tensor, remo
auto ret = _allocator->fastrpc_mmap((int) _domain_id, _buffer_fd, _data, 0, _size, FASTRPC_MAP_FD);
if (ret != AEE_SUCCESS) {
LOG_ERROR("failed to mmap rpc memory, fd: %d, ret: %d\n", _buffer_fd, ret);
LOG_ERROR("failed to mmap rpc memory, fd: %d, size: %zu, ret: %d\n", _buffer_fd, _size, ret);
return std::shared_ptr<host_tensor>();
}


@ -1,5 +1,6 @@
#include "graph.hpp"
#include "profiler.hpp"
#include "tensor.hpp"
namespace hexagon {
@ -28,8 +29,12 @@ bool host_graph::update(ggml_cgraph * cgraph) {
return false;
}
SCOPED_PERFORMANCE_TRACKER("[hexagon-npu][%p]update, handle(%p)", (void *) this, (void *) _graph_handle);
_tensor_handles.clear();
_tensor_update_configs.clear();
_tensor_handles.reserve(cgraph->n_nodes);
_tensor_update_configs.reserve(cgraph->n_nodes);
for (int i = 0; i < cgraph->n_nodes; ++i) {
auto * node = cgraph->nodes[i];
if (node->op == GGML_OP_NONE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE ||
@ -40,28 +45,38 @@ bool host_graph::update(ggml_cgraph * cgraph) {
continue;
}
// TODO: move to tensor?
auto * tensor_obj = host_tensor::from_ggml_tensor(node);
if (!tensor_obj) {
LOG_DEBUG("Unable to get host tensor from ggml tensor: %p\n", (void *) node);
continue;
}
tensor_obj->set_op(node->op);
_tensor_handles.push_back(tensor_obj->get_device_tensor_handle());
LOG_DEBUG("node[%d]%s(%s), addr: %p, type: %s, tensor_handle: %p\n", i, ggml_get_name(node), ggml_op_desc(node),
(void *) node, ggml_type_name(node->type), (void *) tensor_obj->get_device_tensor_handle());
for (size_t j = 0; j < GGML_MAX_SRC && node->src[j]; ++j) {
auto * src = host_tensor::from_ggml_tensor(node->src[j]);
tensor_obj->set_src(j, src);
}
_tensor_update_configs.push_back(tensor_obj->update_hosts_params_only(node));
LOG_DEBUG("[%p]node[%d]%s(%s), addr: %p, type: %s, tensor_handle: %p\n", (void *) this, i, ggml_get_name(node),
ggml_op_desc(node), (void *) node, ggml_type_name(node->type),
(void *) tensor_obj->get_device_tensor_handle());
}
LOG_DEBUG("host_graph::update, host_graph(%p), handle(%p), ggml_cgraph(%p), tensor count(%zu)\n", (void *) this,
(void *) _graph_handle, (void *) cgraph, _tensor_handles.size());
if (!_tensor_handles.empty()) {
npu_device_graph_set_tensor(_device_handle, _graph_handle, _tensor_handles.data(),
(int) _tensor_handles.size());
GGML_ASSERT(_tensor_handles.size() == _tensor_update_configs.size());
constexpr const npu_device_tensor_handle_t kEmptyTensorHandle = 0;
constexpr const npu_device_tensor_update_config kEmptyUpdateConfig = {};
auto ret = npu_device_graph_set_tensor_with_param(
_device_handle, _graph_handle, _tensor_handles.size() ? _tensor_handles.data() : &kEmptyTensorHandle,
(int) _tensor_handles.size(),
_tensor_update_configs.size() ? _tensor_update_configs.data() : &kEmptyUpdateConfig,
(int) _tensor_update_configs.size());
if (ret != AEE_SUCCESS) {
LOG_ERROR("Failed to set tensors in host_graph: 0x%x\n", (int) ret);
return false;
}
LOG_DEBUG("[%p]host_graph::update, handle(%p), ggml_cgraph(%p), tensor count(%zu)\n", (void *) this,
(void *) _graph_handle, (void *) cgraph, _tensor_handles.size());
return true;
}
@ -71,6 +86,7 @@ bool host_graph::compute() {
return false;
}
SCOPED_PERFORMANCE_TRACKER("[hexagon-npu][%p]compute, handle(%p)", (void *) this, (void *) _graph_handle);
auto status = npu_device_graph_compute(_device_handle, _graph_handle);
if (status != AEE_SUCCESS) {
LOG_ERROR("Failed to compute host_graph: 0x%x\n", (int) status);

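Note on the update path above: the host now collects one device tensor handle and one update config per compute node into two parallel vectors and ships them to the NPU in a single RPC, falling back to a dummy element when the graph is empty. A minimal, self-contained sketch of that batching pattern, with stand-in types in place of the generated npu_device_* ones:

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    using tensor_handle_t = uint64_t;             // stand-in for npu_device_tensor_handle_t
    struct update_config { int32_t params[4]; };  // trimmed stand-in for npu_device_tensor_update_config

    // Stand-in for the generated stub: both sequences arrive as pointer/length pairs and
    // are expected to carry the same element count.
    int set_tensors_with_params(const tensor_handle_t * handles, int n_handles,
                                const update_config * configs, int n_configs) {
        std::printf("sending %d handles, %d configs\n", n_handles, n_configs);
        return 0;
    }

    int main() {
        std::vector<tensor_handle_t> handles;
        std::vector<update_config>   configs;
        // one entry per compute node, skipping no-op nodes just like host_graph::update does
        handles.push_back(1);
        configs.push_back({ { 0, 0, 0, 0 } });

        // mirror the kEmptyTensorHandle/kEmptyUpdateConfig fallback so the stub never sees null pointers
        static constexpr tensor_handle_t kEmptyHandle = 0;
        static constexpr update_config   kEmptyConfig = {};
        return set_tensors_with_params(handles.empty() ? &kEmptyHandle : handles.data(), (int) handles.size(),
                                       configs.empty() ? &kEmptyConfig : configs.data(), (int) configs.size());
    }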

@ -21,9 +21,10 @@ class host_graph {
bool compute();
private:
remote_handle64 _device_handle = 0;
npu_device_graph_handle_t _graph_handle = 0;
std::vector<npu_device_tensor_handle_t> _tensor_handles;
remote_handle64 _device_handle = 0;
npu_device_graph_handle_t _graph_handle = 0;
std::vector<npu_device_tensor_handle_t> _tensor_handles;
std::vector<npu_device_tensor_update_config> _tensor_update_configs;
DISABLE_COPY(host_graph);
DISABLE_MOVE(host_graph);


@ -151,7 +151,7 @@ bool npu_device::supports_op_impl(const ggml_tensor * op) {
auto * src0 = op->src[0];
if (!src0) {
LOG_DEBUG("[%s]Unsupported inplace op: %s\n", get_name(), ggml_op_name(op->op));
LOG_DEBUG("[%s]Unsupported inplace op: %s\n", get_name(), ggml_op_desc(op));
return false;
}
@ -168,7 +168,7 @@ bool npu_device::supports_op_impl(const ggml_tensor * op) {
auto npu_op = op_to_npu_op(op->op);
if (npu_op == NPU_OP_COUNT) {
LOG_DEBUG("[%s]Unsupported op: %s\n", get_name(), ggml_op_name(op->op));
LOG_DEBUG("[%s]Unsupported op: %s\n", get_name(), ggml_op_desc(op));
return false;
}
@ -179,7 +179,7 @@ bool npu_device::supports_op_impl(const ggml_tensor * op) {
constexpr const auto get_spec = [](const ggml_tensor * tensor) -> npu_device_tensor_spec {
if (!tensor) {
return npu_device_tensor_spec{};
return npu_device_tensor_spec{ {}, NPU_DATA_TYPE_COUNT };
}
static_assert(DEVICE_TENSOR_MAX_DIMS == GGML_MAX_DIMS, "tensor dimensions mismatch");


@ -1,5 +1,7 @@
#pragma once
#include <type_traits>
#include "common.hpp"
#include "ggml-impl.h"
#include "hexagon_npu.h"
@ -19,11 +21,15 @@ class host_tensor {
explicit host_tensor(ggml_tensor * tensor, int buffer_fd, uint64_t offset, remote_handle64 device_handle) :
_device_handle(device_handle) {
// TODO: figure out why the npu_device_tensor_config can't be larger than 100 bytes
static_assert(sizeof(npu_device_tensor_config) < 100, "npu_device_tensor_config size too large");
_info.buffer_fd = buffer_fd;
_info.offset = offset;
_info.type = type_to_npu_type(tensor->type);
_info.op = op_to_npu_op(tensor->op);
_info.size = ggml_nbytes(tensor);
// _info.op will be updated in update_params()
static_assert(DEVICE_TENSOR_MAX_DIMS == GGML_MAX_DIMS, "tensor dimensions mismatch");
static_assert(sizeof(_info.ne) == sizeof(tensor->ne), "tensor ne size mismatch");
@ -56,28 +62,96 @@ class host_tensor {
npu_device_tensor_handle_t get_device_tensor_handle() const { return _device_tensor_handle; }
void set_src(size_t index, host_tensor * src) {
if (index >= DEVICE_TENSOR_MAX_SRC) {
LOG_ERROR("host_tensor(%p) set_src[%zu] out of range\n", (void *) this, index);
void update_params(ggml_tensor * ggml_tensor) {
static_assert(sizeof(_info_update.params) <= sizeof(_ggml_tensor->op_params),
"device tensor params size mismatch");
static_assert(DEVICE_TENSOR_MAX_SRC <= GGML_MAX_SRC, "device tensor src size mismatch");
GGML_ASSERT(ggml_tensor == _ggml_tensor);
if (!_ggml_tensor) {
LOG_DEBUG("host_tensor(%p) _ggml_tensor is null\n", (void *) this);
return;
}
LOG_DEBUG("host_tensor(%p) set_src[%zu]: %p\n", (void *) this, index, (void *) src);
npu_device_tensor_set_src(_device_handle, _device_tensor_handle, index, src->get_device_tensor_handle());
auto new_op = op_to_npu_op(_ggml_tensor->op);
bool params_changed = new_op != _info_update.op;
if (params_changed) {
LOG_DEBUG("host_tensor(%p) op changed: %s -> %s\n", (void *) this, get_npu_op_desc(_info.op),
get_npu_op_desc(new_op));
}
_info.op = new_op;
_info_update.op = new_op;
if (memcmp(_info_update.params, _ggml_tensor->op_params, sizeof(_info_update.params)) != 0) {
params_changed = true;
memcpy(_info_update.params, _ggml_tensor->op_params, sizeof(_info_update.params));
LOG_DEBUG("host_tensor(%p) op_params changed: [%x, %x, %x, %x]\n", (void *) this,
(int) _info_update.params[0], (int) _info_update.params[1], (int) _info_update.params[2],
(int) _info_update.params[3]);
}
npu_device_tensor_handle_t src_tensor_handles[DEVICE_TENSOR_MAX_SRC] = {};
for (size_t j = 0; j < DEVICE_TENSOR_MAX_SRC && _ggml_tensor->src[j]; ++j) {
auto * src = host_tensor::from_ggml_tensor(_ggml_tensor->src[j]);
src_tensor_handles[j] = src->get_device_tensor_handle();
LOG_DEBUG("host_tensor(%p) set_src[%zu]: %p\n", (void *) this, j, (void *) src);
}
static_assert(std::is_same<decltype(_info_update.src_handles), decltype(src_tensor_handles)>::value,
"src tensor handles type mismatch");
if (memcmp(_info_update.src_handles, src_tensor_handles, sizeof(_info_update.src_handles)) != 0) {
params_changed = true;
memcpy(_info_update.src_handles, src_tensor_handles, sizeof(_info_update.src_handles));
LOG_DEBUG("host_tensor(%p) src changed, handles: [%p, %p]\n", (void *) this,
(void *) _info_update.src_handles[0], (void *) _info_update.src_handles[1]);
}
if (params_changed) {
npu_device_tensor_update_params(_device_handle, _device_tensor_handle, &_info_update);
LOG_DEBUG("host_tensor(%p) update_params, op: %s, params: [%x, %x, %x, %x]\n", (void *) this,
ggml_op_desc(_ggml_tensor), (int) _info_update.params[0], (int) _info_update.params[1],
(int) _info_update.params[2], (int) _info_update.params[3]);
} else {
LOG_DEBUG("host_tensor(%p) update_params, no changes, op: %s, params: [%x, %x, %x, %x]\n", (void *) this,
ggml_op_desc(_ggml_tensor), (int) _info_update.params[0], (int) _info_update.params[1],
(int) _info_update.params[2], (int) _info_update.params[3]);
}
}
void set_op(ggml_op op) {
_info.op = op_to_npu_op(op);
npu_device_tensor_set_op(_device_handle, _device_tensor_handle, _info.op);
const npu_device_tensor_update_config & update_hosts_params_only(ggml_tensor * ggml_tensor) {
static_assert(sizeof(_info_update.params) <= sizeof(ggml_tensor->op_params),
"device tensor params size mismatch");
static_assert(DEVICE_TENSOR_MAX_SRC <= GGML_MAX_SRC, "device tensor src size mismatch");
GGML_ASSERT(ggml_tensor == _ggml_tensor);
auto new_op = op_to_npu_op(_ggml_tensor->op);
_info.op = new_op;
_info_update.op = new_op;
memcpy(_info_update.params, _ggml_tensor->op_params, sizeof(_info_update.params));
for (size_t j = 0; j < DEVICE_TENSOR_MAX_SRC && _ggml_tensor->src[j]; ++j) {
auto * src = host_tensor::from_ggml_tensor(_ggml_tensor->src[j]);
_info_update.src_handles[j] = src->get_device_tensor_handle();
LOG_DEBUG("host_tensor(%p) set_src[%zu]: %p\n", (void *) this, j, (void *) src);
}
LOG_DEBUG("host_tensor(%p) update_params, op: %s, params: [%x, %x, %x, %x]\n", (void *) this,
ggml_op_desc(_ggml_tensor), (int) _info_update.params[0], (int) _info_update.params[1],
(int) _info_update.params[2], (int) _info_update.params[3]);
return _info_update;
}
bool is_valid() const { return _device_tensor_handle != 0; }
private:
remote_handle64 _device_handle = 0;
npu_device_tensor_handle_t _device_tensor_handle = 0;
npu_device_tensor_config _info = {};
ggml_tensor * _ggml_tensor = nullptr;
remote_handle64 _device_handle = 0;
npu_device_tensor_handle_t _device_tensor_handle = 0;
npu_device_tensor_config _info = {};
npu_device_tensor_update_config _info_update = {};
ggml_tensor * _ggml_tensor = nullptr;
DISABLE_COPY(host_tensor);
DISABLE_MOVE(host_tensor);

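The update_params() path above avoids redundant device traffic: the host keeps the last pushed op, op_params and source handles in _info_update and only issues npu_device_tensor_update_params when one of them actually changed. A minimal, self-contained sketch of that dirty-checking idea (struct shape simplified; the real one is generated from the IDL):

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    struct update_config {
        int32_t  op;
        int32_t  params[4];
        uint64_t src_handles[2];
    };

    // Returns true (and refreshes the cached copy) only when something changed,
    // i.e. only then would the caller pay for the RPC round-trip.
    bool refresh_if_changed(update_config & cached, const update_config & fresh) {
        bool changed = cached.op != fresh.op;
        changed = changed || std::memcmp(cached.params, fresh.params, sizeof(fresh.params)) != 0;
        changed = changed || std::memcmp(cached.src_handles, fresh.src_handles, sizeof(fresh.src_handles)) != 0;
        if (changed) {
            cached = fresh;
        }
        return changed;
    }

    int main() {
        update_config cached = {};
        update_config fresh  = { 1, { 0, 0, 0, 0 }, { 0x10, 0 } };
        std::printf("first push needed: %d\n", (int) refresh_if_changed(cached, fresh));   // prints 1
        std::printf("second push needed: %d\n", (int) refresh_if_changed(cached, fresh));  // prints 0
        return 0;
    }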

@ -25,11 +25,30 @@ enum npu_device_tensor_op op_to_npu_op(ggml_op op) {
return NPU_OP_SUB;
case GGML_OP_MUL:
return NPU_OP_MUL;
case GGML_OP_RMS_NORM:
return NPU_OP_RMS_NORM;
default:
return NPU_OP_COUNT;
}
}
const char * get_npu_op_desc(enum npu_device_tensor_op op) {
switch (op) {
case NPU_OP_MUL_MAT:
return ggml_op_name(GGML_OP_MUL_MAT);
case NPU_OP_ADD:
return ggml_op_name(GGML_OP_ADD);
case NPU_OP_SUB:
return ggml_op_name(GGML_OP_SUB);
case NPU_OP_MUL:
return ggml_op_name(GGML_OP_MUL);
case NPU_OP_RMS_NORM:
return ggml_op_name(GGML_OP_RMS_NORM);
default:
return "UNKNOWN";
}
}
enum npu_device_tensor_data_type type_to_npu_type(ggml_type type) {
switch (type) {
case GGML_TYPE_F32:


@ -5,6 +5,7 @@
namespace hexagon {
enum npu_device_tensor_op op_to_npu_op(ggml_op op);
const char * get_npu_op_desc(enum npu_device_tensor_op op);
enum npu_device_tensor_data_type type_to_npu_type(ggml_type type);
// TODO: merge with qcom_htp_arch


@ -4,6 +4,7 @@
const uint32_t DEVICE_TENSOR_MAX_DIMS = 4;
const uint32_t DEVICE_TENSOR_MAX_SRC = 2;
const uint32_t DEVICE_TENSOR_MAX_OP_PARAMS = 4;
const uint32_t QUANT_BLOCK_SIZE = 32;
const uint32_t QUANT_K_BLOCK_SIZE = 256;
const uint32_t QUANT_K_SCALE_SIZE = 12;
@ -38,6 +39,7 @@ interface npu_device : remote_handle64{
NPU_OP_ADD,
NPU_OP_SUB,
NPU_OP_MUL,
NPU_OP_RMS_NORM,
NPU_OP_COUNT
};
@ -55,6 +57,12 @@ interface npu_device : remote_handle64{
tensor_data_type type;
};
struct tensor_update_config {
tensor_op op;
int32_t params[DEVICE_TENSOR_MAX_OP_PARAMS];
tensor_handle_t src_handles[DEVICE_TENSOR_MAX_SRC];
};
struct tensor_config {
ne_type ne;
uint64_t nb[DEVICE_TENSOR_MAX_DIMS];
@ -82,15 +90,9 @@ interface npu_device : remote_handle64{
rout tensor_handle_t tensor_handle
);
AEEResult tensor_set_src(
AEEResult tensor_update_params(
in tensor_handle_t tensor_handle,
in uint64_t index,
in tensor_handle_t src
);
AEEResult tensor_set_op(
in tensor_handle_t tensor_handle,
in tensor_op op
in tensor_update_config config
);
AEEResult tensor_free(
@ -106,6 +108,12 @@ interface npu_device : remote_handle64{
in sequence<tensor_handle_t> tensor_handles
);
AEEResult graph_set_tensor_with_param(
in graph_handle_t graph_handle,
in sequence<tensor_handle_t> tensor_handles,
in sequence<tensor_update_config> tensor_params
);
AEEResult graph_compute(
in graph_handle_t graph_handle
);

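For orientation, the sequence and struct parameters in the IDL above flatten into pointer/length pairs in the QAIC-generated C stubs; judging from the call sites in this change, the prototypes come out roughly as sketched below (stand-in typedefs, not the generated header):

    #include <cstdint>

    // Stand-ins so the sketch parses on its own; the real definitions live in the generated hexagon_npu.h.
    typedef int      AEEResult;
    typedef uint64_t remote_handle64;
    typedef uint64_t npu_device_graph_handle_t;
    typedef uint64_t npu_device_tensor_handle_t;
    struct npu_device_tensor_update_config;  // generated from tensor_update_config

    AEEResult npu_device_tensor_update_params(remote_handle64 h, npu_device_tensor_handle_t tensor_handle,
                                              const npu_device_tensor_update_config * config);

    AEEResult npu_device_graph_set_tensor_with_param(remote_handle64 h, npu_device_graph_handle_t graph_handle,
                                                     const npu_device_tensor_handle_t * tensor_handles,
                                                     int tensor_handles_len,
                                                     const npu_device_tensor_update_config * tensor_params,
                                                     int tensor_params_len);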

@ -1,5 +1,5 @@
#include "profiler.hpp"
#include "event_tracer.hpp"
#include <HTP/QnnHtpProfile.h>
#include <QnnProfile.h>


@ -0,0 +1,45 @@
#pragma once
#include <QnnCommon.h>
#include <cstddef>
#include <cstdint>
#include <memory>
#include <string>
#include "logger.hpp"
#include "profiler.hpp"
#include "qnn-types.hpp"
namespace qnn {
// forward declaration of qnn_interface
class qnn_interface;
class qnn_event_tracer {
public:
// ref:
// https://github.com/pytorch/executorch/blob/ae3d558d5e6aa04fc52a3065399fe6a773702f52/backends/qualcomm/serialization/qc_schema.py#L53
// https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/overview.html#supported-snapdragon-devices
enum sdk_profile_level { PROFILE_OFF = 0, PROFILE_BASIC, PROFILE_DETAIL, PROFILE_OP_TRACE };
explicit qnn_event_tracer(const std::string & prefix, std::shared_ptr<qnn_interface> interface,
Qnn_BackendHandle_t backend_handle, sdk_profile_level level);
~qnn_event_tracer();
Qnn_ProfileHandle_t get_handle() const { return _handle; }
void print_profile_events();
private:
std::shared_ptr<qnn_interface> _interface;
Qnn_ProfileHandle_t _handle = nullptr;
std::string _prefix;
DISABLE_COPY(qnn_event_tracer);
DISABLE_MOVE(qnn_event_tracer);
};
using qnn_event_tracer_ptr = std::shared_ptr<qnn_event_tracer>;
} // namespace qnn


@ -4,10 +4,10 @@
#include <algorithm>
#include <unordered_map>
#include "event_tracer.hpp"
#include "ggml-impl.h"
#include "logger.hpp"
#include "op-config.hpp"
#include "profiler.hpp"
#include "tensor.hpp"
#ifdef GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING
@ -411,8 +411,8 @@ bool qnn_graph::build_graph_from_ggml_graph(const ggml_cgraph * cgraph) {
GGML_TYPE_COUNT > GGML_TYPE_Q8_0 && GGML_TYPE_Q8_0 > GGML_TYPE_F16 && GGML_TYPE_F16 > GGML_TYPE_F32,
"GGML_TYPE enum order is not correct");
QNN_SCOPED_PERFORMANCE_TRACKER("[%s][%s]build_graph_from_ggml_graph", get_backend_name(_device),
_graph_name.c_str());
SCOPED_PERFORMANCE_TRACKER("[%s][%s]build_graph_from_ggml_graph", get_backend_name(_device),
_graph_name.c_str());
auto override_data_type = get_override_data_type(inputs, outputs);
if (override_data_type != GGML_TYPE_COUNT) {
@ -466,8 +466,7 @@ bool qnn_graph::execute(const ggml_cgraph * cgraph, std::shared_ptr<qnn_convert_
ggml_tensor_array_t inputs;
ggml_tensor_array_t outputs;
{
QNN_SCOPED_PERFORMANCE_TRACKER("[%s][%s]get_io_tensors_from_graph", get_backend_name(_device),
_graph_name.c_str());
SCOPED_PERFORMANCE_TRACKER("[%s][%s]get_io_tensors_from_graph", get_backend_name(_device), _graph_name.c_str());
#ifdef NDEBUG
get_io_tensors_from_graph(cgraph, inputs, outputs);
#else
@ -478,7 +477,7 @@ bool qnn_graph::execute(const ggml_cgraph * cgraph, std::shared_ptr<qnn_convert_
}
{
QNN_SCOPED_PERFORMANCE_TRACKER("[%s][%s]bind_tensors", get_backend_name(_device), _graph_name.c_str());
SCOPED_PERFORMANCE_TRACKER("[%s][%s]bind_tensors", get_backend_name(_device), _graph_name.c_str());
auto override_data_type = get_override_data_type(inputs, outputs);
if (override_data_type != GGML_TYPE_COUNT) {
QNN_LOG_DEBUG("[%s][%s]override_data_type: %s\n", get_backend_name(_device), _graph_name.c_str(),
@ -502,7 +501,7 @@ bool qnn_graph::execute(const ggml_cgraph * cgraph, std::shared_ptr<qnn_convert_
}
{
QNN_SCOPED_PERFORMANCE_TRACKER("[%s][%s]execute", get_backend_name(_device), _graph_name.c_str());
SCOPED_PERFORMANCE_TRACKER("[%s][%s]execute", get_backend_name(_device), _graph_name.c_str());
auto & qnn_tensor_inputs = _qnn_tensor_inputs;
auto & qnn_tensor_outputs = _qnn_tensor_outputs;
auto error = _qnn_interface->qnn_graph_execute(_graph_handle, qnn_tensor_inputs.data(),
@ -529,7 +528,7 @@ bool qnn_graph::execute(const ggml_cgraph * cgraph, std::shared_ptr<qnn_convert_
}
bool qnn_graph::finalize() {
QNN_SCOPED_PERFORMANCE_TRACKER("[%s][%s]finalize", get_backend_name(_device), _graph_name.c_str());
SCOPED_PERFORMANCE_TRACKER("[%s][%s]finalize", get_backend_name(_device), _graph_name.c_str());
if (!qnn::add_op_to_graph(_graph_handle, _operations)) {
QNN_LOG_ERROR("[%s]add nodes failed\n", _graph_name.c_str());


@ -6,9 +6,9 @@
#include <vector>
#include "convert.hpp"
#include "event_tracer.hpp"
#include "ggml-qnn.h"
#include "op-config.hpp"
#include "profiler.hpp"
#include "qnn-lib.hpp"
namespace qnn {


@ -1,100 +0,0 @@
#pragma once
#include <QnnCommon.h>
#include <cstddef>
#include <cstdint>
#include <memory>
#include <string>
#include "logger.hpp"
#include "qnn-types.hpp"
namespace qnn {
#ifdef GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING
class qnn_scoped_timer {
public:
qnn_scoped_timer(const std::string & log_prefix) : _log_prefix(std::move(log_prefix)) {
_begin_us = ggml_time_us();
}
qnn_scoped_timer(qnn_scoped_timer && other) {
_begin_us = other._begin_us;
_log_prefix = std::move(other._log_prefix);
}
~qnn_scoped_timer() { print(); }
void operator=(qnn_scoped_timer && other) {
_begin_us = other._begin_us;
_log_prefix = std::move(other._log_prefix);
}
void print() const {
auto duration = (ggml_time_us() - _begin_us) / 1000.0;
QNN_LOG_INFO("[profiler]%s, duration: %.4f ms\n", _log_prefix.c_str(), duration);
}
private:
int64_t _begin_us = 0LL;
std::string _log_prefix;
qnn_scoped_timer(const qnn_scoped_timer &) = delete;
void operator=(const qnn_scoped_timer &) = delete;
};
inline qnn_scoped_timer make_scope_perf_timer(const char * format, ...) {
va_list args;
va_start(args, format);
char buffer[4096];
vsnprintf(buffer, sizeof(buffer), format, args);
va_end(args);
return qnn_scoped_timer(buffer);
}
#else
inline void make_scope_perf_timer(const char *, ...) {}
#endif
// forward declaration of qnn_interface
class qnn_interface;
class qnn_event_tracer {
public:
// ref:
// https://github.com/pytorch/executorch/blob/ae3d558d5e6aa04fc52a3065399fe6a773702f52/backends/qualcomm/serialization/qc_schema.py#L53
// https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/overview.html#supported-snapdragon-devices
enum sdk_profile_level { PROFILE_OFF = 0, PROFILE_BASIC, PROFILE_DETAIL, PROFILE_OP_TRACE };
explicit qnn_event_tracer(const std::string & prefix, std::shared_ptr<qnn_interface> interface,
Qnn_BackendHandle_t backend_handle, sdk_profile_level level);
~qnn_event_tracer();
Qnn_ProfileHandle_t get_handle() const { return _handle; }
void print_profile_events();
private:
std::shared_ptr<qnn_interface> _interface;
Qnn_ProfileHandle_t _handle = nullptr;
std::string _prefix;
DISABLE_COPY(qnn_event_tracer);
DISABLE_MOVE(qnn_event_tracer);
};
using qnn_event_tracer_ptr = std::shared_ptr<qnn_event_tracer>;
} // namespace qnn
#ifdef GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING
# define QNN_SCOPED_PERFORMANCE_TRACKER(fmt, ...) \
auto __qnn_timer_##__LINE__ = qnn::make_scope_perf_timer(fmt, __VA_ARGS__)
#else
# define QNN_SCOPED_PERFORMANCE_TRACKER(fmt, ...) ((void) 0)
#endif


@ -34,21 +34,36 @@ constexpr const qnn::device_caps kDeviceCaps[] = {
{
// https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/CpuOpDefSupplement.html#matmul
kQnnCpuLibName, GGML_BACKEND_DEVICE_TYPE_ACCEL, (1L << GGML_TYPE_I8) | (1L << GGML_TYPE_F32),
0xFFFFFE, // all quantized types can be offload to CPU, at current implementation, those types will be dequantized into float32 on cpu
0, // 0 for no limitation
#ifdef GGML_HEXAGON_ENABLE_QUANTIZED_TENSORS
// all quantized types can be offloaded to the CPU backend; in the current implementation they are dequantized into float32 on the CPU
0xFFFFFE,
#else
0,
#endif
0, // 0 for no limitation
},
{
// https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/GpuOpDefSupplement.html#matmul
kQnnGpuLibName, GGML_BACKEND_DEVICE_TYPE_GPU, (1L << GGML_TYPE_F32) | (1L << GGML_TYPE_F16),
#ifdef GGML_HEXAGON_ENABLE_QUANTIZED_TENSORS
// all quantized types can be offloaded to the GPU backend; in the current implementation they are dequantized into float32 on the CPU
0xFFFFFE, (128256L * 4096 *
0xFFFFFE,
#else
0,
#endif
(128256L * 4096 *
sizeof(float)), // tested on 8 gen 2, failed to allocate tensor with size 128256x4096 and float32
},
{
// https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/HtpOpDefSupplement.html#matmul
kQnnNpuLibName, GGML_BACKEND_DEVICE_TYPE_ACCEL,
#ifdef GGML_HEXAGON_ENABLE_QUANTIZED_TENSORS
(1L << GGML_TYPE_F32) | (1L << GGML_TYPE_F16) | (1L << GGML_TYPE_I16),
(1L << GGML_TYPE_Q2_K) | (1L << GGML_TYPE_Q3_K) | (1L << GGML_TYPE_Q4_K) | (1L << GGML_TYPE_Q8_K),
#else
0,
#endif
(8192L * 2048 + 8192 * 512 + 2048 * 512) * sizeof(float), // TODO: should have a better way to get this value
},
};

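The capability table above now gates the quantized-type mask behind GGML_HEXAGON_ENABLE_QUANTIZED_TENSORS. A small sketch of how such per-type bitmasks are typically consulted (field names here are illustrative, not the actual qnn::device_caps layout):

    #include <cstdint>

    struct device_caps_stub {
        uint64_t supported_types;        // bit i set => ggml type i is handled natively
        uint64_t supported_quant_types;  // bit i set => quantized ggml type i can be offloaded
    };

    inline bool type_supported(const device_caps_stub & caps, int ggml_type_index) {
        const uint64_t bit = 1ULL << ggml_type_index;
        return ((caps.supported_types | caps.supported_quant_types) & bit) != 0;
    }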

@ -45,6 +45,10 @@ size_t get_system_free_memory_in_bytes();
class_name(class_name &&) = delete; \
void operator=(class_name &&) = delete
#define DISABLE_COPY_AND_MOVE(class_name) \
DISABLE_COPY(class_name); \
DISABLE_MOVE(class_name)
#define LOG_ERROR(...) (GGML_LOG_ERROR(__VA_ARGS__))
#define LOG_WARN(...) (GGML_LOG_WARN(__VA_ARGS__))
#define LOG_INFO(...) (GGML_LOG_INFO(__VA_ARGS__))

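DISABLE_COPY_AND_MOVE simply stacks the two existing macros, matching how power_utils uses it earlier in this change. A usage sketch with an illustrative class name:

    #include "common.hpp"  // assumed home of the DISABLE_* macros in this backend

    class scoped_resource {
    public:
        scoped_resource() = default;

    private:
        DISABLE_COPY_AND_MOVE(scoped_resource);  // deletes copy constructor/assignment and move constructor/assignment
    };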

@ -0,0 +1,61 @@
#pragma once
#include <cstddef>
#include <cstdint>
#include <string>
#include "common.hpp"
#include "ggml-impl.h"
namespace profiler {
#ifdef GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING
class scoped_timer {
public:
scoped_timer(const std::string & log_prefix) : _log_prefix(std::move(log_prefix)) { _begin_us = ggml_time_us(); }
scoped_timer(scoped_timer && other) {
_begin_us = other._begin_us;
_log_prefix = std::move(other._log_prefix);
}
~scoped_timer() { print(); }
void operator=(scoped_timer && other) {
_begin_us = other._begin_us;
_log_prefix = std::move(other._log_prefix);
}
void print() const {
auto duration = ggml_time_us() - _begin_us;
GGML_LOG_INFO("[profiler]%s, dur: %lld us\n", _log_prefix.c_str(), (long long) duration);
}
private:
int64_t _begin_us = 0LL;
std::string _log_prefix;
DISABLE_COPY(scoped_timer);
};
inline scoped_timer make_scope_perf_timer(const char * format, ...) {
va_list args;
va_start(args, format);
char buffer[4096];
vsnprintf(buffer, sizeof(buffer), format, args);
va_end(args);
return scoped_timer(buffer);
}
#endif
} // namespace profiler
#ifdef GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING
# define SCOPED_PERFORMANCE_TRACKER(fmt, ...) \
auto __scoped_timer_##__LINE__ = profiler::make_scope_perf_timer(fmt, __VA_ARGS__)
#else
# define SCOPED_PERFORMANCE_TRACKER(fmt, ...) ((void) 0)
#endif

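Usage is the same as the QNN-side tracker this replaces: declare the macro at the top of a scope and the elapsed time is logged when the scope exits. A small usage sketch, assuming profiler.hpp is included and GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING is defined (the function here is illustrative):

    void compute_once(void * graph_handle) {
        // the underlying profiler::scoped_timer lives until the closing brace, then logs its duration
        SCOPED_PERFORMANCE_TRACKER("[hexagon-npu]compute_once, handle(%p)", graph_handle);
        // ... work being measured ...
    }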

@ -64,8 +64,10 @@ class rpc_mem {
void * buf = nullptr;
if (_rpc_interface->is_alloc2_available()) {
LOG_DEBUG("rpcmem_alloc2 available, using it\n");
buf = _rpc_interface->rpcmem_alloc2(heapid, flags, size);
} else {
LOG_DEBUG("rpcmem_alloc2 not available, using rpcmem_alloc\n");
buf = _rpc_interface->rpcmem_alloc(heapid, flags, size);
}
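The allocator now probes for rpcmem_alloc2 and only falls back to the legacy rpcmem_alloc when it is unavailable. A minimal sketch of that probe-and-fallback pattern with stand-in function-pointer types (the real signatures come from the SDK's rpcmem headers):

    #include <cstddef>
    #include <cstdio>

    // Stand-ins for the two allocator entry points resolved from the RPC library at runtime.
    using alloc2_fn = void * (*)(int heapid, unsigned flags, size_t size);
    using alloc_fn  = void * (*)(int heapid, unsigned flags, int size);

    void * alloc_rpc_buffer(alloc2_fn alloc2, alloc_fn alloc, int heapid, unsigned flags, size_t size) {
        if (alloc2 != nullptr) {
            std::printf("rpcmem_alloc2 available, using it\n");
            return alloc2(heapid, flags, size);   // size_t-sized allocations
        }
        std::printf("rpcmem_alloc2 not available, using rpcmem_alloc\n");
        return alloc(heapid, flags, (int) size);  // legacy path, int-sized
    }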