feat: dequant use lut (#55)

* Add power management utilities to NPU device context and update DCVS settings

* Update DCVS settings in power_utils to use v3 API and enhance power management

* wip

* Enhance dequantization functions by adding load_dequant_table support and updating signatures for improved performance

* use lut for dequantization (see the scalar sketch after the commit metadata below)

* wip

* fix test failure

* wip

* Refactor load_qual_block_generic to improve block handling and optimize vector operations

* Enhance load_dual_block_generic and load_qual_block_generic to accept a mask parameter for improved block handling

* Refactor flash_attn_impl to optimize mask l2 prefetch

* wip

* wip

* wip

* wip

* add log

* link against shared libraries instead of static ones

* fix swiglu

* wip

* refactor expf_fix to handle overflow for different data types

* enhance is_glu_op_supported to validate shapes for multiple sources

* wip

* refactor logging macros to use hexagon namespace and improve formatting

* fix printf format error

* wip

* refactor: update static_assert messages for block size validation and add HVX_VectorPred_x3 type alias

* rename

* feat: enhance fa with mask

* wip

* wip

* refactor: replace instances of Q6_V_vzero() with kZeroV for consistency

* wip

* wip

* wip

* fix: improve address alignment check in HVX_Vector handling

* refactor: streamline vector dot product implementations for improved readability

* refactor: q4k add hvx intrinsic impl

* refactor: enhance dequantize_row_q4_K for clarity and performance

* refactor: optimize scale mask usage in dequantization functions for improved performance

* refactor: optimize dequantize_row_q4_K for intrinsic usage and performance improvements

* refactor: move GLU operation implementation into separated file

* sync after swiglu

* wip

* wip

* wip

* feat: increase prc main thread stack size

* fix: replace hardcoded stack size with NPU_THREAD_STACK_SIZE constant

* wip

* feat: add optimized vector operations for exponential and division with overflow handling

* wip

* feat: refactor exponential function to handle overflow and underflow with improved logic

* wip

* wip

* feat: add vector loading and scaling functions for improved performance in block processing

* wip

* feat: optimize block loading by refactoring scale index handling for improved performance

* use Q6_Vb_vlut32_VbVbR_nomatch instead

* feat: enhance scale loading by adding static assertion and restructuring block handling

* wip

* feat: refactor vec_dot_product_mixed_impl for improved clarity and performance

* wip

* feat: simplify vector loading functions and improve alignment handling

* wip

* feat: enhance scale loading mask with quantization block size validation

* wip

* feat: implement make_scale_load_mask function and refactor vector handling in vec_ops

* feat: enhance load_dual_block_generic to include scale indices for improved vector loading

* revert q8 dequant

* wip

* feat: optimize dequantization functions by removing unnecessary masking and updating lookup methods

* wip

* wip
nullname 2025-08-29 21:42:57 +08:00 committed by GitHub
parent 0979133ea8
commit 5ef9b98869
23 changed files with 1025 additions and 533 deletions
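
The core of this change is to replace arithmetic nibble expansion in the dequantization kernels with a table lookup: a small fp16 table is built once per type (load_dequant_table_q4_0 / load_dequant_table_q4_k), passed into the row dequantizers, and applied with Q6_Wh_vlut16_VbVhR_nomatch. Below is a minimal scalar sketch of the same idea for Q4_0, assuming the standard block layout (an fp16 scale d plus 16 bytes of packed low/high nibbles); the helper name is illustrative.

#include <cstddef>
#include <cstdint>

// Scalar sketch of LUT-based Q4_0 dequantization (illustrative). The table is the
// same one load_dequant_table_q4_0 builds: entry q holds (q - 8) as fp16, with 8
// being the Q4_0 zero point. The HVX path in the diff performs this lookup with
// Q6_Wh_vlut16_VbVhR_nomatch instead of the inner loop.
static void dequant_q4_0_scalar_sketch(const npu_device_block_q4_0 * blocks,
                                       __fp16 * dst,
                                       size_t block_count) {
    __fp16 table[16];
    for (int q = 0; q < 16; ++q) {
        table[q] = (__fp16) (q - 8);
    }

    for (size_t b = 0; b < block_count; ++b) {
        const __fp16 d = reinterpret_cast<const __fp16 &>(blocks[b].d);
        for (int j = 0; j < QUANT_BLOCK_SIZE / 2; ++j) {
            const uint8_t packed = blocks[b].qs[j];
            dst[j]                        = table[packed & 0x0F] * d; // low nibble
            dst[j + QUANT_BLOCK_SIZE / 2] = table[packed >> 4] * d;   // high nibble
        }
        dst += QUANT_BLOCK_SIZE;
    }
}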

View File

@ -220,7 +220,7 @@ else()
target_compile_options(hexagon_npu_skel_OBJS PUBLIC
-fsanitize=address -fno-omit-frame-pointer
)
target_link_libraries(hexagon_npu_skel_OBJS PUBLIC
target_link_options(hexagon_npu_skel_OBJS PUBLIC
-fsanitize=address
)
endif()
@ -248,9 +248,9 @@ else()
add_library(hexagon_npu_skel SHARED $<TARGET_OBJECTS:hexagon_npu_skel_OBJS>)
target_link_libraries(hexagon_npu_skel
${HEXAGON_LIB_DIR}/${HEXAGON_ARCH}/G0/pic/libc++abi.a
${HEXAGON_LIB_DIR}/${HEXAGON_ARCH}/G0/pic/libc++.a
${HEXAGON_LIB_DIR}/${HEXAGON_ARCH}/G0/pic/libc.a
${HEXAGON_LIB_DIR}/${HEXAGON_ARCH}/G0/pic/libc++abi.so.1
${HEXAGON_LIB_DIR}/${HEXAGON_ARCH}/G0/pic/libc++.so.1
${HEXAGON_LIB_DIR}/${HEXAGON_ARCH}/G0/pic/libc.so
)
set_target_properties(hexagon_npu_skel PROPERTIES OUTPUT_NAME "hexagon_npu_skel_${HEXAGON_ARCH}")
target_link_libraries(hexagon_npu_skel qprintf_static)

View File

@ -17,21 +17,30 @@
namespace {
struct npu_device_context {
std::unique_ptr<hexagon::power_utils> power_utils; // Power management utilities
std::unique_ptr<hexagon::default_thread_pool> thread_pool;
std::unique_ptr<float[]> f16_to_f32_table; // TODO: store vtcm?
bool init() {
if (!init_ltu()) {
DEVICE_LOG_ERROR("Failed to initialize LTU");
DEVICE_LOG_ERROR("Failed to initialize LTU\n");
return false;
}
if (!init_thread_pool()) {
DEVICE_LOG_ERROR("Failed to initialize thread pool");
DEVICE_LOG_ERROR("Failed to initialize thread pool\n");
return false;
}
DEVICE_LOG_DEBUG("NPU device context initialized");
power_utils = std::make_unique<hexagon::power_utils>();
if (power_utils && power_utils->is_valid()) {
power_utils->set_dvcs_performance_mode(true);
DEVICE_LOG_DEBUG("Power utilities initialized with DVCS performance mode enabled\n");
} else {
DEVICE_LOG_ERROR("Failed to initialize power utilities\n");
}
DEVICE_LOG_DEBUG("NPU device context initialized\n");
return true;
}
@ -41,29 +50,29 @@ struct npu_device_context {
f16_to_f32_table = std::make_unique<float[]>(kLtuCount);
if (!f16_to_f32_table) {
DEVICE_LOG_ERROR("Failed to allocate memory for f16_to_f32 table");
DEVICE_LOG_ERROR("Failed to allocate memory for f16_to_f32 table\n");
return false;
}
hexagon::init_f16_f32_table(f16_to_f32_table.get(), kLtuCount);
DEVICE_LOG_DEBUG("f16_to_f32 table initialized");
DEVICE_LOG_DEBUG("f16_to_f32 table initialized\n");
return true;
}
bool init_thread_pool() {
if (thread_pool) {
DEVICE_LOG_DEBUG("Thread pool already initialized");
DEVICE_LOG_DEBUG("Thread pool already initialized\n");
return true;
}
auto pool = std::make_unique<hexagon::default_thread_pool>();
if (!pool) {
DEVICE_LOG_ERROR("Failed to create thread pool");
DEVICE_LOG_ERROR("Failed to create thread pool\n");
return false;
}
thread_pool = std::move(pool);
DEVICE_LOG_DEBUG("Thread pool initialized");
DEVICE_LOG_DEBUG("Thread pool initialized\n");
return true;
}
};
@ -102,25 +111,25 @@ int npu_device_open(const char * uri, remote_handle64 * h) {
// TODO: should we have a device context here?
auto * context = new npu_device_context();
if (!context->init()) {
DEVICE_LOG_ERROR("Failed to initialize npu_device_context");
DEVICE_LOG_ERROR("Failed to initialize npu_device_context\n");
delete context;
return AEE_EFAILED;
}
*h = reinterpret_cast<remote_handle64>(context);
DEVICE_LOG_INFO("NPU device context created: %p", (void *) *h);
DEVICE_LOG_INFO("NPU device context created: %p\n", (void *) *h);
return AEE_SUCCESS;
}
int npu_device_close(remote_handle64 h) {
auto * context = device_context_from_handle(h);
if (!context) {
DEVICE_LOG_ERROR("Invalid npu_device_context handle");
DEVICE_LOG_ERROR("Invalid npu_device_context handle\n");
return AEE_EINVHANDLE;
}
delete context;
DEVICE_LOG_INFO("NPU device context destroyed: %p", (void *) h);
DEVICE_LOG_INFO("NPU device context destroyed: %p\n", (void *) h);
return AEE_SUCCESS;
}
@ -139,7 +148,7 @@ AEEResult npu_device_device_support_op(remote_handle64 _h,
NPU_UNUSED(_h);
if (!srcs || srcsLen <= 0 || !dst || !is_supported) {
DEVICE_LOG_ERROR("npu_device_device_support_op: Invalid arguments");
DEVICE_LOG_ERROR("npu_device_device_support_op: Invalid arguments\n");
return AEE_EINVARGS;
}
@ -185,7 +194,7 @@ AEEResult npu_device_tensors_free(remote_handle64 _h,
int tensor_handlesLen) {
NPU_UNUSED(_h);
if (!tensor_handles || tensor_handlesLen < 0) {
DEVICE_LOG_ERROR("npu_device_tensors_free: Invalid arguments");
DEVICE_LOG_ERROR("npu_device_tensors_free: Invalid arguments\n");
return AEE_EINVARGS;
}
@ -194,7 +203,7 @@ AEEResult npu_device_tensors_free(remote_handle64 _h,
if (tensor) {
delete tensor;
} else {
DEVICE_LOG_ERROR("npu_device_tensors_free: Invalid tensor handle at index %d", i);
DEVICE_LOG_ERROR("npu_device_tensors_free: Invalid tensor handle at index %d\n", i);
}
}
@ -250,13 +259,13 @@ AEEResult npu_device_graph_set_tensor_with_param(remote_handle64
AEEResult npu_device_graph_compute(remote_handle64 _h, npu_device_graph_handle_t graph_handle) {
auto dev_ctx = device_context_from_handle(_h);
if (!dev_ctx) {
DEVICE_LOG_DEBUG("Invalid npu_device_context handle");
DEVICE_LOG_DEBUG("Invalid npu_device_context handle\n");
return AEE_EINVHANDLE;
}
auto * graph = graph_from_handle(graph_handle);
if (!graph) {
DEVICE_LOG_ERROR("Invalid graph handle");
DEVICE_LOG_ERROR("Invalid graph handle\n");
return AEE_EINVHANDLE;
}

View File

@ -91,6 +91,7 @@ void graph::compute_impl(default_thread_pool * pool, default_thread_pool::thread
const bool should_sync = requires_thread_barrier(op);
if (pool && should_sync && i < _tensor_count - 1) {
// For the last tensor, the thread pool will handle synchronization
DEVICE_SCOPED_PERFORMANCE_TRACKER("[%p]sync_thread, tidx: %zu, tensor[%zu/%zu]",
(void *) this,
params.get_thread_index(),

View File

@ -13,7 +13,7 @@ inline float f16_to_f32(const npu_device_fp16_t src) {
}
// From: ggml/src/ggml-cpu/ops.cpp
template <bool _IsKvF16>
template <bool _IsKvF16, bool _HasMask>
void flash_attn_impl(hexagon::tensor * out,
const hexagon::tensor * q,
const hexagon::tensor * k,
@ -24,6 +24,7 @@ void flash_attn_impl(hexagon::tensor * out,
static_assert(3 <= hexagon::kMaxParamsCount, "flash_attn op params count exceeds max params count");
constexpr const npu_device_tensor_data_type kKvDataType = _IsKvF16 ? NPU_DATA_TYPE_F16 : NPU_DATA_TYPE_F32;
constexpr const bool kHasMask = _HasMask;
if (k->get_type() != kKvDataType || v->get_type() != k->get_type()) {
DEVICE_LOG_ERROR("flash_attn_impl: k and v must have same type, got k: %s, v: %s\n",
@ -32,6 +33,11 @@ void flash_attn_impl(hexagon::tensor * out,
return;
}
if (kHasMask != (mask != nullptr)) {
DEVICE_LOG_ERROR("flash_attn_impl: mask is required when kHasMask is true\n");
return;
}
float scale = out->get_op_param<float>(0);
const float max_bias = out->get_op_param<float>(1);
const float logit_softcap = out->get_op_param<float>(2);
@ -96,7 +102,7 @@ void flash_attn_impl(hexagon::tensor * out,
const uint8_t * q_ptr = q->get_read_buffer();
const uint8_t * k_ptr = k->get_read_buffer();
const uint8_t * v_ptr = v->get_read_buffer();
const uint8_t * mask_ptr = mask ? mask->get_read_buffer() : nullptr;
const uint8_t * mask_ptr = kHasMask ? mask->get_read_buffer() : nullptr;
const uint8_t * sinks_ptr = sinks ? sinks->get_read_buffer() : nullptr;
float * VKQ32 = reinterpret_cast<float *>(cache_ptr); // FP32 VKQ accumulator
auto * VKQ16 = reinterpret_cast<npu_device_fp16_t *>(VKQ32 + aligned_dv); // (temporary) FP16 VKQ accumulator
@ -125,11 +131,17 @@ void flash_attn_impl(hexagon::tensor * out,
}
const npu_device_fp16_t * mp =
mask_ptr ? reinterpret_cast<const npu_device_fp16_t *>(mask_ptr + iq1 * mask->get_nb(1) +
kHasMask ? reinterpret_cast<const npu_device_fp16_t *>(mask_ptr + iq1 * mask->get_nb(1) +
(iq2 % mask->get_ne(2)) * mask->get_nb(2) +
(iq3 % mask->get_ne(3)) * mask->get_nb(3)) :
nullptr;
q_to_vec_dot(reinterpret_cast<const float *>(q_data), Q_q, DK);
if (kHasMask) {
hexagon::l2fetch_row(reinterpret_cast<const uint8_t *>(mp), mask->get_nb(1));
}
// k indices
const int ik3 = iq3 / rk3;
const int ik2 = iq2 / rk2;
@ -138,8 +150,6 @@ void flash_attn_impl(hexagon::tensor * out,
const int iv3 = iq3 / rv3;
const int iv2 = iq2 / rv2;
q_to_vec_dot(reinterpret_cast<const float *>(q_data), Q_q, DK);
// online softmax / attention
// loop over n_kv and n_head_kv
// ref: https://arxiv.org/pdf/2112.05682.pdf
@ -147,7 +157,7 @@ void flash_attn_impl(hexagon::tensor * out,
const auto * v_plane_ptr = v_ptr + iv2 * v->get_nb(2) + iv3 * v->get_nb(3);
for (int64_t ic = 0; ic < k->get_ne(1); ++ic) {
DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_ADD_ONE_SUB_PROC(flash_attn, 0, loop);
float mv = mp ? (slope * f16_to_f32(mp[ic])) : 0.0f;
float mv = kHasMask ? (slope * f16_to_f32(mp[ic])) : 0.0f;
if (mv == -INFINITY) {
continue;
}
@ -282,9 +292,17 @@ bool flash_attn_f32(tensor * out, compute_params * params) {
const auto * mask = out->get_src(3);
const auto * sinks = out->get_src(4);
if (k->get_type() == NPU_DATA_TYPE_F16) {
flash_attn_impl<true>(out, q, k, v, mask, sinks, params);
if (mask) {
flash_attn_impl<true, true>(out, q, k, v, mask, sinks, params);
} else {
flash_attn_impl<true, false>(out, q, k, v, mask, sinks, params);
}
} else {
flash_attn_impl<false>(out, q, k, v, mask, sinks, params);
if (mask) {
flash_attn_impl<false, true>(out, q, k, v, mask, sinks, params);
} else {
flash_attn_impl<false, false>(out, q, k, v, mask, sinks, params);
}
}
return true;
}
@ -338,8 +356,8 @@ bool is_flash_attn_supported(const npu_device_tensor_op_spec * op_spec,
if (dst->ne[0] != v->ne[0] || dst->ne[2] != q->ne[1]) {
DEVICE_LOG_DEBUG(
"[%s]dst shape does not match q and v: dst ne: %ld, %ld, %ld, %ld, q ne: %ld, %ld, %ld, %ld, "
"v ne: %ld, %ld, %ld, %ld\n",
"[%s]dst shape does not match q and v: dst ne: %lld, %lld, %lld, %lld, q ne: %lld, %lld, %lld, %lld, "
"v ne: %lld, %lld, %lld, %lld\n",
op_get_name(op),
dst->ne[0],
dst->ne[1],
@ -359,24 +377,25 @@ bool is_flash_attn_supported(const npu_device_tensor_op_spec * op_spec,
if (is_transposed_or_permuted(dst->nb)) {
DEVICE_LOG_DEBUG("[%s]dst cannot be transposed or permuted, nb: %zu, %zu, %zu, %zu\n",
op_get_name(op),
dst->nb[0],
dst->nb[1],
dst->nb[2],
dst->nb[3]);
(size_t) dst->nb[0],
(size_t) dst->nb[1],
(size_t) dst->nb[2],
(size_t) dst->nb[3]);
return false;
}
if (q->ne[0] != k->ne[0]) {
DEVICE_LOG_DEBUG("[%s]q and k shapes do not match: q ne: %ld, %ld, %ld, %ld, k ne: %ld, %ld, %ld, %ld\n",
op_get_name(op),
q->ne[0],
q->ne[1],
q->ne[2],
q->ne[3],
k->ne[0],
k->ne[1],
k->ne[2],
k->ne[3]);
DEVICE_LOG_DEBUG(
"[%s]q and k shapes do not match: q ne: %lld, %lld, %lld, %lld, k ne: %lld, %lld, %lld, %lld\n",
op_get_name(op),
q->ne[0],
q->ne[1],
q->ne[2],
q->ne[3],
k->ne[0],
k->ne[1],
k->ne[2],
k->ne[3]);
return false;
}

View File

@ -0,0 +1,228 @@
#include "op_glu.hpp"
#include "type_traits.hpp"
#include "util.hpp"
namespace {
template <typename T> struct get_data_type {};
template <typename _TyData, typename _TyParam>
struct get_data_type<void (*)(const _TyData *, const _TyData *, _TyData *, size_t, _TyParam)> {
using type = _TyData;
using param_type = typename std::remove_cv<typename std::remove_reference<_TyParam>::type>::type;
};
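// This specialization deduces both the element type and the coefficient type from
// the row function's signature: glu_vec_op_f32_f32 takes a hexagon::HVX_VectorPair_x4
// coefficient (returned by qhmath_load_div_sf_ltu), while glu_vec_op_f16_f16 takes a
// plain float (returned by dummy_load_coeff below). glu_impl's static_assert checks
// that pairing at compile time.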
inline float dummy_load_coeff() {
// This is a dummy function to satisfy the template requirements.
// In practice, this should be replaced with a proper coefficient loading function.
return 0;
}
inline float expf_f16_guard_inf(float x) {
// Avoid overflow for large values; 11.0898664 ≈ ln(65504), the largest finite f16 value
constexpr float kMaxExp = 11.0898664f;
if (x >= kMaxExp) {
// Avoid overflow for large values
return std::numeric_limits<float>::infinity();
}
return std::expf(x);
}
inline void glu_vec_op_f16_f16(const __fp16 * src0, const __fp16 * src1, __fp16 * dst, size_t count, float coeff) {
// TODO: use simd version, for some input hexagon intrinsics will generate nan instead of inf.
for (uint32_t i = 0; i < count; ++i) {
float x = src0[i];
float g = src1[i];
dst[i] = (x / (1.0f + expf_f16_guard_inf(-x))) * g;
}
}
inline void glu_vec_op_f32_f32(const float * src0,
const float * src1,
float * dst,
size_t count,
hexagon::HVX_VectorPair_x4 coeff) {
using namespace hexagon::vec;
vec_trans_with_param_impl<float, hexagon::HVX_VectorPair_x4, hexagon::vec_swiglu_f32_f32>(
src0, src1, dst, count, coeff);
}
template <auto _GluRowFunc, auto _CoeffLoadFunc>
bool glu_impl(hexagon::tensor * out, hexagon::compute_params * params) {
using data_type = typename get_data_type<decltype(_GluRowFunc)>::type;
using param_type = typename get_data_type<decltype(_GluRowFunc)>::param_type;
static_assert(DEVICE_TENSOR_MAX_DIMS == 4, "element_wise_op requires max dims 4");
static_assert(std::is_same_v<param_type, decltype(_CoeffLoadFunc())>,
"GluRowFunc must have the same param type as CoeffLoadFunc");
if (!out) {
return false;
}
const bool has_src1 = out->get_src(1) != nullptr;
auto * src0 = out->get_src(0);
auto * src1 = has_src1 ? out->get_src(1) : src0;
if (!src0 || !src1) {
return true; // skip if no src
}
const auto total_cols = has_src1 ? src0->get_ne(0) : src0->get_ne(0) / 2;
if (out->get_ne(0) != total_cols) {
DEVICE_LOG_ERROR(
"[hexagon-npu][GLU]out.ne[0] (%ld) != total_cols (%d)\n", (long) out->get_ne(0), (int) total_cols);
return false;
}
auto total_rows = out->get_ne(3) * out->get_ne(2) * out->get_ne(1);
const auto rows_per_cube = out->get_ne(2) * out->get_ne(1);
const auto start_end = params->get_work_slice(total_rows);
if (start_end.first >= start_end.second) {
return true;
}
uint8_t * dst_ptr = out->get_write_buffer();
if (!dst_ptr) {
DEVICE_LOG_ERROR("[hexagon-npu][GLU]glu_impl: dst_ptr is not writable, tensor: %p, type: %s\n",
(void *) out,
hexagon::get_type_name(out->get_type()));
return false;
}
const int32_t swapped = out->get_op_param<int32_t>(1);
const uint8_t * src0_ptr = src0->get_read_buffer();
const uint8_t * src1_ptr = has_src1 ? src1->get_read_buffer() : (src0_ptr + total_cols * sizeof(data_type));
if (swapped) {
std::swap(src0_ptr, src1_ptr);
}
DEVICE_SCOPED_OP_PERFORMANCE_TRACKER(out, params->get_thread_index());
auto coeff = _CoeffLoadFunc();
const size_t valid_row_bytes = src0->get_ne(0) * sizeof(data_type);
for (int64_t ir = start_end.first; ir < start_end.second; ++ir) {
const auto i03 = ir / rows_per_cube;
const auto i02 = ir / out->get_ne(1) - i03 * out->get_ne(2);
const auto i01 = ir % out->get_ne(1); // TODO: should we use divide instead of mod?
const auto i13 = i03 % src1->get_ne(3);
const auto i12 = i02 % src1->get_ne(2);
const auto i11 = i01 % src1->get_ne(1);
auto * src1_plane = src1_ptr + i13 * src1->get_nb(3) + i12 * src1->get_nb(2);
auto * src0_row = src0_ptr + i03 * src0->get_nb(3) + i02 * src0->get_nb(2) + i01 * src0->get_nb(1);
auto * src1_row = src1_plane + i11 * src1->get_nb(1);
auto * dst_row = dst_ptr + i03 * out->get_nb(3) + i02 * out->get_nb(2) + i01 * out->get_nb(1);
if (ir + 1 < start_end.second) {
hexagon::l2fetch_row(src0_row + src0->get_nb(1), valid_row_bytes);
hexagon::l2fetch_row(src1_row + src1->get_nb(1), valid_row_bytes);
}
_GluRowFunc(reinterpret_cast<const data_type *>(src0_row),
reinterpret_cast<const data_type *>(src1_row),
reinterpret_cast<data_type *>(dst_row),
static_cast<size_t>(total_cols),
coeff);
}
out->release_write_buffer(); // mark the output tensor as modified
return true;
}
template <npu_device_tensor_data_type _DataType>
bool glu_compute(hexagon::tensor * out, hexagon::compute_params * params) {
using namespace hexagon::vec::math;
if (out->get_op_param<int32_t>(0) != NPU_GLU_OP_SWIGLU) {
DEVICE_LOG_ERROR("Invalid GLU op type: %d\n", (int) out->get_op_param<int32_t>(0));
return false;
}
if (out->get_type() != _DataType) {
DEVICE_LOG_ERROR("GLU op type mismatch: %s vs %s\n",
hexagon::get_type_name(out->get_type()),
hexagon::get_type_name(_DataType));
return false;
}
if constexpr (_DataType == NPU_DATA_TYPE_F32) {
return glu_impl<glu_vec_op_f32_f32, qhmath_load_div_sf_ltu>(out, params);
} else if constexpr (_DataType == NPU_DATA_TYPE_F16) {
return glu_impl<glu_vec_op_f16_f16, dummy_load_coeff>(out, params);
}
DEVICE_LOG_ERROR("Unsupported GLU data type: %s\n", hexagon::get_type_name(out->get_type()));
return true;
}
} // namespace
namespace hexagon {
bool glu_f32(hexagon::tensor * out, hexagon::compute_params * params) {
return glu_compute<npu_device_tensor_data_type::NPU_DATA_TYPE_F32>(out, params);
}
bool glu_f16(hexagon::tensor * out, hexagon::compute_params * params) {
return glu_compute<npu_device_tensor_data_type::NPU_DATA_TYPE_F16>(out, params);
}
bool is_glu_op_supported(const npu_device_tensor_op_spec * op_spec,
const npu_device_tensor_spec * dst,
const npu_device_tensor_spec * srcs,
size_t src_len) {
const auto op = op_spec->op;
if (op != NPU_OP_GLU) {
DEVICE_LOG_DEBUG("[%s]unsupported\n", hexagon::op_get_name(op));
return false;
}
if (op_spec->params[0] != NPU_GLU_OP_SWIGLU) {
DEVICE_LOG_DEBUG("[%s]unsupported GLU op type: %d\n", hexagon::op_get_name(op), (int) op_spec->params[0]);
return false;
}
if (!dst || !srcs || src_len < 1) {
DEVICE_LOG_DEBUG("[%s]invalid dst or srcs\n", hexagon::op_get_name(op));
return false;
}
const auto & src0 = srcs[0];
if (dst->type != src0.type) {
DEVICE_LOG_DEBUG("[%s]src0.type and dst.type mismatch: %s vs %s\n",
hexagon::op_get_name(op),
hexagon::get_type_name(src0.type),
hexagon::get_type_name(dst->type));
return false;
}
if (dst->type != NPU_DATA_TYPE_F32 && dst->type != NPU_DATA_TYPE_F16) {
DEVICE_LOG_DEBUG(
"[%s]unsupported data type: %s\n", hexagon::op_get_name(op), hexagon::get_type_name(dst->type));
return false;
}
if (src_len > 1) {
if (!hexagon::is_same_shape(src0, *dst) || !hexagon::is_same_shape(srcs[1], *dst)) {
DEVICE_LOG_DEBUG("[%s]src0, src1 and dst have different shape\n", hexagon::op_get_name(op));
return false; // src0 and src1 have the same shape as dst
}
} else {
static_assert(DEVICE_TENSOR_MAX_DIMS == 4, "GLU requires max dims 4");
if (src0.ne[0] / 2 != dst->ne[0] || src0.ne[1] != dst->ne[1] || src0.ne[2] != dst->ne[2] ||
src0.ne[3] != dst->ne[3]) {
DEVICE_LOG_DEBUG("[%s]src0 and dst have different shape: src0.ne[0]: %ld, dst.ne[0]: %ld\n",
hexagon::op_get_name(op),
(long) src0.ne[0],
(long) dst->ne[0]);
return false;
}
}
return true;
}
} // namespace hexagon

View File

@ -0,0 +1,15 @@
#pragma once
#include "op_types.hpp"
namespace hexagon {
bool glu_f32(hexagon::tensor * out, hexagon::compute_params * params);
bool glu_f16(hexagon::tensor * out, hexagon::compute_params * params);
bool is_glu_op_supported(const npu_device_tensor_op_spec * op_spec,
const npu_device_tensor_spec * dst,
const npu_device_tensor_spec * srcs,
size_t src_len);
} // namespace hexagon

View File

@ -3,11 +3,13 @@
#include "op_impl.hpp"
#include "op_flash_attn.hpp"
#include "op_glu.hpp"
#include "op_mul_mat.hpp"
#include "op_rope.hpp"
#include "type_traits.hpp"
#include "vec_ops.hpp"
#include <cmath>
#include <type_traits>
namespace {
@ -59,15 +61,10 @@ template <typename _TyData> struct get_data_type<void (*)(const _TyData *, const
using type = _TyData;
};
template <typename _TyData>
struct get_data_type<void (*)(const _TyData *, const _TyData *, _TyData *, size_t, hexagon::HVX_VectorPair_x4)> {
using type = _TyData;
};
template <typename _TyData, typename _TyParam>
struct get_data_type<void (*)(const _TyData *, _TyData *, size_t, _TyParam)> {
using type = _TyData;
using param_type = typename std::remove_cv<typename std::remove_reference<_TyData>::type>::type;
using param_type = typename std::remove_cv<typename std::remove_reference<_TyParam>::type>::type;
};
template <auto _RowFunc> bool element_wise_op(hexagon::tensor * out, hexagon::compute_params * params) {
@ -325,171 +322,6 @@ bool is_unary_op_supported(const npu_device_tensor_op_spec * op_spec,
return true;
}
inline void glu_vec_op_f32_f32(const float * src0,
const float * src1,
float * dst,
size_t count,
hexagon::HVX_VectorPair_x4 coeff) {
using namespace hexagon::vec;
vec_trans_with_param_impl<float, hexagon::HVX_VectorPair_x4, hexagon::vec_swiglu_f32_f32>(
src0, src1, dst, count, coeff);
}
inline void glu_vec_op_f16_f16(const npu_device_fp16_t * src0,
const npu_device_fp16_t * src1,
npu_device_fp16_t * dst,
size_t count,
hexagon::HVX_VectorPair_x4 coeff) {
using namespace hexagon::vec;
vec_trans_with_param_impl<npu_device_fp16_t, hexagon::HVX_VectorPair_x4, hexagon::vec_swiglu_f16_f16>(
src0, src1, dst, count, coeff);
}
template <auto _GluRowFunc, hexagon::HVX_VectorPair_x4 (*_CoeffLoadFunc)()>
bool glu_impl(hexagon::tensor * out, hexagon::compute_params * params) {
using data_type = typename get_data_type<decltype(_GluRowFunc)>::type;
static_assert(DEVICE_TENSOR_MAX_DIMS == 4, "element_wise_op requires max dims 4");
if (!out) {
return false;
}
const bool has_src1 = out->get_src(1) != nullptr;
auto * src0 = out->get_src(0);
auto * src1 = has_src1 ? out->get_src(1) : src0;
if (!src0 || !src1) {
return true; // skip if no src
}
const auto total_cols = has_src1 ? src0->get_ne(0) : src0->get_ne(0) / 2;
if (out->get_ne(0) != total_cols) {
DEVICE_LOG_ERROR("out.ne[0] (%ld) != total_cols (%d)\n", (long) out->get_ne(0), (int) total_cols);
return false;
}
auto total_rows = out->get_ne(3) * out->get_ne(2) * out->get_ne(1);
const auto rows_per_cube = out->get_ne(2) * out->get_ne(1);
const auto start_end = params->get_work_slice(total_rows);
if (start_end.first >= start_end.second) {
return true;
}
uint8_t * dst_ptr = out->get_write_buffer();
if (!dst_ptr) {
DEVICE_LOG_ERROR("element_wise_op: dst_ptr is not writable, tensor: %p, type: %s\n",
(void *) out,
hexagon::get_type_name(out->get_type()));
return false;
}
const int32_t swapped = out->get_op_param<int32_t>(1);
const uint8_t * src0_ptr = src0->get_read_buffer();
const uint8_t * src1_ptr = has_src1 ? src1->get_read_buffer() : (src0_ptr + total_cols * sizeof(data_type));
if (swapped) {
std::swap(src0_ptr, src1_ptr);
}
DEVICE_SCOPED_OP_PERFORMANCE_TRACKER(out, params->get_thread_index());
auto coeff = _CoeffLoadFunc();
const size_t valid_row_bytes = src0->get_ne(0) * sizeof(data_type);
for (int64_t ir = start_end.first; ir < start_end.second; ++ir) {
const auto i03 = ir / rows_per_cube;
const auto i02 = ir / out->get_ne(1) - i03 * out->get_ne(2);
const auto i01 = ir % out->get_ne(1); // TODO: should we use divide instead of mod?
const auto i13 = i03 % src1->get_ne(3);
const auto i12 = i02 % src1->get_ne(2);
const auto i11 = i01 % src1->get_ne(1);
auto * src1_plane = src1_ptr + i13 * src1->get_nb(3) + i12 * src1->get_nb(2);
auto * src0_row = src0_ptr + i03 * src0->get_nb(3) + i02 * src0->get_nb(2) + i01 * src0->get_nb(1);
auto * src1_row = src1_plane + i11 * src1->get_nb(1);
auto * dst_row = dst_ptr + i03 * out->get_nb(3) + i02 * out->get_nb(2) + i01 * out->get_nb(1);
if (ir + 1 < start_end.second) {
hexagon::l2fetch_row(src0_row + src0->get_nb(1), valid_row_bytes);
hexagon::l2fetch_row(src1_row + src1->get_nb(1), valid_row_bytes);
}
_GluRowFunc(reinterpret_cast<const data_type *>(src0_row),
reinterpret_cast<const data_type *>(src1_row),
reinterpret_cast<data_type *>(dst_row),
static_cast<size_t>(total_cols),
coeff);
}
out->release_write_buffer(); // mark the output tensor as modified
return true;
}
template <npu_device_tensor_data_type _DataType>
bool glu_compute(hexagon::tensor * out, hexagon::compute_params * params) {
using namespace hexagon::vec::math;
if (out->get_op_param<int32_t>(0) != NPU_GLU_OP_SWIGLU) {
DEVICE_LOG_ERROR("Invalid GLU op type: %d\n", out->get_op_param<int32_t>(0));
return false;
}
if (out->get_type() != _DataType) {
DEVICE_LOG_ERROR("GLU op type mismatch: %s vs %s\n",
hexagon::get_type_name(out->get_type()),
hexagon::get_type_name(_DataType));
return false;
}
if constexpr (_DataType == NPU_DATA_TYPE_F32) {
return glu_impl<glu_vec_op_f32_f32, qhmath_load_div_sf_ltu>(out, params);
} else if constexpr (_DataType == NPU_DATA_TYPE_F16) {
return glu_impl<glu_vec_op_f16_f16, qhmath_load_div_hf_ltu>(out, params);
}
DEVICE_LOG_ERROR("Unsupported GLU data type: %s\n", hexagon::get_type_name(out->get_type()));
return true;
}
bool is_glu_op_supported(const npu_device_tensor_op_spec * op_spec,
const npu_device_tensor_spec * dst,
const npu_device_tensor_spec * srcs,
size_t src_len) {
const auto op = op_spec->op;
if (op != NPU_OP_GLU) {
DEVICE_LOG_DEBUG("[%s]unsupported\n", hexagon::op_get_name(op));
return false;
}
if (op_spec->params[0] != NPU_GLU_OP_SWIGLU) {
DEVICE_LOG_DEBUG("[%s]unsupported GLU op type: %d\n", hexagon::op_get_name(op), op_spec->params[0]);
return false;
}
if (!dst || !srcs || src_len < 1) {
DEVICE_LOG_DEBUG("[%s]invalid dst or srcs\n", hexagon::op_get_name(op));
return false;
}
const auto & src0 = srcs[0];
if (dst->type != src0.type) {
DEVICE_LOG_DEBUG("[%s]src0.type and dst.type mismatch: %s vs %s\n",
hexagon::op_get_name(op),
hexagon::get_type_name(src0.type),
hexagon::get_type_name(dst->type));
return false;
}
if (dst->type != NPU_DATA_TYPE_F32 && dst->type != NPU_DATA_TYPE_F16) {
DEVICE_LOG_DEBUG(
"[%s]unsupported data type: %s\n", hexagon::op_get_name(op), hexagon::get_type_name(dst->type));
return false;
}
if (!hexagon::is_same_shape(src0, *dst)) {
DEVICE_LOG_DEBUG("[%s]src0 and dst have different shape\n", hexagon::op_get_name(op));
return false;
}
return false; // TODO: fix: for some input hexagon intrinsics will generate nan instead of inf.
}
struct op_capabilities {
npu_device_tensor_op op;
hexagon::op_is_supported_func_type is_supported;
@ -499,60 +331,60 @@ struct op_capabilities {
constexpr const op_capabilities kOpCapabilities[] = {
{
NPU_OP_MUL_MAT, hexagon::is_mul_mat_supported,
NPU_OP_MUL_MAT, hexagon::is_mul_mat_supported,
{
hexagon::mul_mat_f32, // NPU_DATA_TYPE_F32
nullptr, // NPU_DATA_TYPE_F16
}, true, // requires_thread_barrier
}, true, // requires_thread_barrier
},
{
NPU_OP_ADD, is_element_wise_op_supported,
NPU_OP_ADD, is_element_wise_op_supported,
{
element_wise_op<vec_op_f32_f32<vadd_f32_f32>>, // NPU_DATA_TYPE_F32
element_wise_op<vec_op_f16_f16<vadd_f16_f16>>, // NPU_DATA_TYPE_F16
}, false, // requires_thread_barrier
},
}, false,
},
{
NPU_OP_SUB, is_element_wise_op_supported,
{
element_wise_op<vec_op_f32_f32<vsub_f32_f32>>, // NPU_DATA_TYPE_F32
element_wise_op<vec_op_f16_f16<vsub_f16_f16>>, // NPU_DATA_TYPE_F16
}, false, // requires_thread_barrier
},
}, false,
},
{
NPU_OP_MUL, is_element_wise_op_supported,
NPU_OP_MUL, is_element_wise_op_supported,
{
element_wise_op<vec_op_f32_f32<vmul_f32_f32>>, // NPU_DATA_TYPE_F32
element_wise_op<vec_op_f16_f16<vmul_f16_f16>>, // NPU_DATA_TYPE_F16
}, false, // requires_thread_barrier
},
}, false,
},
{
NPU_OP_RMS_NORM, is_unary_op_supported,
NPU_OP_RMS_NORM, is_unary_op_supported,
{
unary_op<rms_norm_vec_f32>, // NPU_DATA_TYPE_F32
nullptr, // NPU_DATA_TYPE_F16
}, false, // requires_thread_barrier
},
}, false,
},
{
NPU_OP_FLASH_ATTN,hexagon::is_flash_attn_supported,
NPU_OP_FLASH_ATTN, hexagon::is_flash_attn_supported,
{
hexagon::flash_attn_f32, // NPU_DATA_TYPE_F32
nullptr, // NPU_DATA_TYPE_F16
}, true, // requires_thread_barrier
},
{
NPU_OP_ROPE, hexagon::is_rope_supported,
NPU_OP_ROPE, hexagon::is_rope_supported,
{
hexagon::rope_f32, // NPU_DATA_TYPE_F32
nullptr, // NPU_DATA_TYPE_F16
}, false, // requires_thread_barrier
},
}, false,
},
{
NPU_OP_GLU, is_glu_op_supported,
NPU_OP_GLU, hexagon::is_glu_op_supported,
{
glu_compute<NPU_DATA_TYPE_F32>, // NPU_DATA_TYPE_F32
glu_compute<NPU_DATA_TYPE_F16>, // NPU_DATA_TYPE_F16
}, false, // requires_thread_barrier
hexagon::glu_f32, // NPU_DATA_TYPE_F32
hexagon::glu_f16, // NPU_DATA_TYPE_F16
}, true, // TODO: should we avoid using thread barrier?
},
};

View File

@ -36,8 +36,9 @@ void mul_mat_impl(hexagon::tensor * src0,
using data_type0 = typename get_data_type<decltype(_DotFunc)>::data_type0;
using data_type1 = typename get_data_type<decltype(_DotFunc)>::data_type1;
const auto src0_actual_row_size = hexagon::get_dequantized_row_size(src0);
auto * dequantize_row_func = hexagon::get_type_traits(src0->get_type()).to_float;
const auto src0_actual_row_size = hexagon::get_dequantized_row_size(src0);
auto * dequantize_row_func = hexagon::get_type_traits(src0->get_type()).to_float;
auto * load_dequant_table_func = hexagon::get_type_traits(src0->get_type()).load_dequant_table;
if (_ShouldCacheSrc0 && dequantize_row_func == nullptr) {
DEVICE_LOG_ERROR("Unsupported quantized src0 type: %d, dequantize_row_func is null\n", src0->get_type());
return;
@ -62,8 +63,8 @@ void mul_mat_impl(hexagon::tensor * src0,
if (start_end_plane.second <= start_end_plane.first || start_end_row.second <= start_end_row.first ||
start_end_element.second <= start_end_element.first) {
DEVICE_LOG_DEBUG(
"mul_mat_impl: no work to do, start_end_plane: (%ld, %ld), start_end_row: (%ld, %ld), "
"start_end_element: (%ld, %ld)\n",
"mul_mat_impl: no work to do, start_end_plane: (%lld, %lld), start_end_row: (%lld, %lld), "
"start_end_element: (%lld, %lld)\n",
start_end_plane.first,
start_end_plane.second,
start_end_row.first,
@ -116,6 +117,7 @@ void mul_mat_impl(hexagon::tensor * src0,
return;
}
auto dequant_table = load_dequant_table_func ? load_dequant_table_func() : HVX_Vector();
constexpr bool should_fetch_src0_row = !_ShouldCacheSrc0;
const uint8_t * src0_ptr = src0->get_read_buffer();
const uint8_t * src1_ptr = src1->get_read_buffer();
@ -146,7 +148,8 @@ void mul_mat_impl(hexagon::tensor * src0,
auto * cached_row_ptr = src0_plane_cache_ptr + ir * src0_actual_row_size;
dequantize_row_func(src0_row,
reinterpret_cast<hexagon::dequant_output_type *>(cached_row_ptr),
src0->get_ne(0));
src0->get_ne(0),
dequant_table);
}
last_cached_plane_ptr = src0_plane;
@ -218,8 +221,9 @@ void mul_mat_gemv_impl(hexagon::tensor * src0,
using data_type0 = typename get_data_type<decltype(_DotFunc)>::data_type0;
using data_type1 = typename get_data_type<decltype(_DotFunc)>::data_type1;
const auto src0_actual_row_size = hexagon::get_dequantized_row_size(src0);
auto * dequantize_row_func = hexagon::get_type_traits(src0->get_type()).to_float;
const auto src0_actual_row_size = hexagon::get_dequantized_row_size(src0);
auto * dequantize_row_func = hexagon::get_type_traits(src0->get_type()).to_float;
auto * load_dequant_table_func = hexagon::get_type_traits(src0->get_type()).load_dequant_table;
if (_ShouldCacheSrc0 && dequantize_row_func == nullptr) {
DEVICE_LOG_ERROR("Unsupported quantized src0 type: %d, dequantize_row_func is null\n", src0->get_type());
return;
@ -229,7 +233,7 @@ void mul_mat_gemv_impl(hexagon::tensor * src0,
if (dst->get_ne(0) >= params->get_thread_count()) {
start_end_element = params->get_work_slice(dst->get_ne(0));
} else {
DEVICE_LOG_ERROR("Unsupported src1 tensor shape for gemv: %s, ne: %ldx%ldx%ldx%ld\n",
DEVICE_LOG_ERROR("Unsupported src1 tensor shape for gemv: %s, ne: %lldx%lldx%lldx%lld\n",
hexagon::get_type_name(src1->get_type()),
src1->get_ne(0),
src1->get_ne(1),
@ -241,7 +245,7 @@ void mul_mat_gemv_impl(hexagon::tensor * src0,
if (start_end_element.second <= start_end_element.first) {
DEVICE_LOG_DEBUG(
"mul_mat_impl: no work to do, start_end_plane: [0, 1), start_end_row: [0, 1), "
"start_end_element: [%ld, %ld)\n",
"start_end_element: [%lld, %lld)\n",
start_end_element.first,
start_end_element.second);
return;
@ -297,6 +301,7 @@ void mul_mat_gemv_impl(hexagon::tensor * src0,
return;
}
auto dequant_table = load_dequant_table_func ? load_dequant_table_func() : HVX_Vector();
constexpr bool should_fetch_src0_row = !_ShouldCacheSrc0;
const uint8_t * src0_ptr = src0->get_read_buffer();
const uint8_t * src1_ptr = src1->get_read_buffer();
@ -325,8 +330,10 @@ void mul_mat_gemv_impl(hexagon::tensor * src0,
}
auto * cached_row_ptr = src0_plane_cache_ptr + ir * src0_actual_row_size;
dequantize_row_func(
src0_row, reinterpret_cast<hexagon::dequant_output_type *>(cached_row_ptr), src0->get_ne(0));
dequantize_row_func(src0_row,
reinterpret_cast<hexagon::dequant_output_type *>(cached_row_ptr),
src0->get_ne(0),
dequant_table);
}
src0_plane = src0_plane_cache_ptr;

View File

@ -165,7 +165,7 @@ bool rope_impl(hexagon::tensor * out, hexagon::compute_params * params) {
}
if (n_dims % 2 || (_IsVision && n_dims != out->get_ne(0) / 2)) {
DEVICE_LOG_ERROR("[ROPE]invalid n_dims for vision ROPE: %d, expected: %d\n", n_dims, out->get_ne(0) / 2);
DEVICE_LOG_ERROR("[ROPE]invalid n_dims for vision ROPE: %d, expected: %lld\n", n_dims, out->get_ne(0) / 2);
return false; // invalid n_dims for vision ROPE
}

View File

@ -20,30 +20,30 @@ class tensor {
void * mmap_address = nullptr;
auto ret = HAP_mmap_get(_info.buffer_fd, &mmap_address, &phy_address);
if (ret != AEE_SUCCESS) {
DEVICE_LOG_ERROR("Failed to mmap tensor buffer: %d", (int) ret);
DEVICE_LOG_ERROR("Failed to mmap tensor buffer: %d\n", (int) ret);
return;
}
_data = static_cast<uint8_t *>(mmap_address);
DEVICE_LOG_INFO("tensor(%p[%ldx%ldx%ldx%ld]), fd: %d, offset: %zu, mmap_addr: %p, phy_addr: 0x%lx\n",
(void *) this,
(long) _info.ne[0],
(long) _info.ne[1],
(long) _info.ne[2],
(long) _info.ne[3],
_info.buffer_fd,
_info.offset,
(void *) mmap_address,
phy_address);
DEVICE_LOG_DEBUG("tensor(%p[%ldx%ldx%ldx%ld]), fd: %d, offset: %zu, mmap_addr: %p, phy_addr: 0x%lx\n",
(void *) this,
(long) _info.ne[0],
(long) _info.ne[1],
(long) _info.ne[2],
(long) _info.ne[3],
(int) _info.buffer_fd,
(size_t) _info.offset,
(void *) mmap_address,
(long) phy_address);
}
~tensor() noexcept {
auto ret = HAP_mmap_put(_info.buffer_fd);
if (ret != AEE_SUCCESS) {
DEVICE_LOG_ERROR("Failed to unmap tensor buffer: %d", (int) ret);
DEVICE_LOG_ERROR("Failed to unmap tensor buffer: %d\n", (int) ret);
}
DEVICE_LOG_INFO("~tensor(%p) fd: %d", (void *) this, _info.buffer_fd);
DEVICE_LOG_DEBUG("~tensor(%p) fd: %d\n", (void *) this, _info.buffer_fd);
}
void flush() const {
@ -131,7 +131,7 @@ class tensor {
uint8_t * get_write_buffer() const {
if (_info.is_constant) {
DEVICE_LOG_ERROR("Attempt to write to a constant tensor: %p", (void *) this);
DEVICE_LOG_ERROR("Attempt to write to a constant tensor: %p\n", (void *) this);
return nullptr; // Do not allow writing to constant tensors
}

View File

@ -14,7 +14,7 @@
namespace hexagon {
constexpr const size_t kMaxThreadCount = 4;
constexpr const size_t kDefaultStackSize = 1024 * 64; // 64KB
constexpr const size_t kDefaultStackSize = NPU_THREAD_STACK_SIZE; // 64KB
template <size_t _stack_size> class qurt_thread {
public:
@ -24,7 +24,7 @@ template <size_t _stack_size> class qurt_thread {
qurt_thread_func_type thread_func,
void * arg,
unsigned short priority) {
DEVICE_LOG_DEBUG("qurt_thread.create: %s", thread_name.c_str());
DEVICE_LOG_DEBUG("qurt_thread.create: %s\n", thread_name.c_str());
qurt_thread_attr_init(&_attributes);
qurt_thread_attr_set_name(&_attributes, (char *) thread_name.c_str());
qurt_thread_attr_set_stack_addr(&_attributes, _stack);
@ -37,26 +37,26 @@ template <size_t _stack_size> class qurt_thread {
auto ret = qurt_thread_create(
&_tid, &_attributes, reinterpret_cast<void (*)(void *)>(&qurt_thread::thread_func_impl), (void *) this);
if (ret != QURT_EOK) {
DEVICE_LOG_ERROR("Failed to create thread: %d", (int) ret);
DEVICE_LOG_ERROR("Failed to create thread: %d\n", (int) ret);
_func = nullptr;
_arg = nullptr;
return;
}
DEVICE_LOG_DEBUG("qurt_thread.created: %s, id: %d", thread_name.c_str(), (int) _tid);
DEVICE_LOG_DEBUG("qurt_thread.created: %s, id: %d\n", thread_name.c_str(), (int) _tid);
}
~qurt_thread() {
DEVICE_LOG_DEBUG("qurt_thread.destroy: %d", (int) _tid);
DEVICE_LOG_DEBUG("qurt_thread.destroy: %d\n", (int) _tid);
int thread_exit_code = QURT_EOK;
auto ret = qurt_thread_join(_tid, &thread_exit_code);
if (ret != QURT_EOK && ret != QURT_ENOTHREAD) {
DEVICE_LOG_ERROR("Failed to join thread: %d", (int) ret);
DEVICE_LOG_ERROR("Failed to join thread: %d\n", (int) ret);
return;
}
if (thread_exit_code != QURT_EOK) {
DEVICE_LOG_ERROR("Thread exit code: %d", (int) thread_exit_code);
DEVICE_LOG_ERROR("Thread exit code: %d\n", (int) thread_exit_code);
}
}
@ -135,7 +135,7 @@ template <size_t _ThreadCount> class thread_pool {
auto thread = std::make_unique<thread_type>(
thread_name_base + std::to_string(i), &thread_pool::thread_func_impl, &_thread_params[i + 1], priority);
if (!thread->is_valid()) {
DEVICE_LOG_ERROR("Failed to create thread: %zu", i);
DEVICE_LOG_ERROR("Failed to create thread: %zu\n", i);
// destroy all barriers and threads at destructor
return;
}
@ -143,11 +143,11 @@ template <size_t _ThreadCount> class thread_pool {
_threads[i] = std::move(thread);
}
DEVICE_LOG_DEBUG("thread_pool.created: %zu", kMaxSubThreadCount);
DEVICE_LOG_DEBUG("thread_pool.created: %zu\n", kMaxSubThreadCount);
}
~thread_pool() {
DEVICE_LOG_DEBUG("thread_pool.destroy");
DEVICE_LOG_DEBUG("thread_pool.destroy\n");
_thread_exit = true;
qurt_barrier_wait(&_pending); // release all task threads
@ -161,7 +161,7 @@ template <size_t _ThreadCount> class thread_pool {
bool sync_execute(task_type task, void * arg) {
if (!task) {
DEVICE_LOG_ERROR("Invalid task");
DEVICE_LOG_ERROR("Invalid task\n");
return false;
}
@ -174,7 +174,7 @@ template <size_t _ThreadCount> class thread_pool {
qurt_barrier_wait(&_pending);
task(this, &_thread_params[0], arg);
DEVICE_LOG_DEBUG("main_thread.task_completed: 0");
DEVICE_LOG_DEBUG("main_thread.task_completed: 0\n");
qurt_barrier_wait(&_completed);
@ -198,19 +198,19 @@ template <size_t _ThreadCount> class thread_pool {
auto * param = reinterpret_cast<thread_params *>(arg);
DEVICE_LOG_DEBUG("thread_func_impl.start: %zu", param->tidx);
DEVICE_LOG_DEBUG("thread_func_impl.start: %zu\n", param->tidx);
auto & pool = *(param->pool);
for (;;) {
qurt_barrier_wait(&pool._pending);
if (pool._thread_exit) {
DEVICE_LOG_DEBUG("thread_func_impl.exit: %zu", param->tidx);
DEVICE_LOG_DEBUG("thread_func_impl.exit: %zu\n", param->tidx);
break;
}
#ifdef GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING
auto task_begin_cycles = pool._task_begin_cycles.load();
DEVICE_LOG_WARN("[profiler]worker_thread, tidx: %zu, prepare: %lluus",
DEVICE_LOG_WARN("[profiler]worker_thread, tidx: %zu, prepare: %lluus\n",
param->tidx,
static_cast<unsigned long long>(
HAP_perf_qtimer_count_to_us(HAP_perf_get_qtimer_count() - task_begin_cycles)));
@ -221,18 +221,18 @@ template <size_t _ThreadCount> class thread_pool {
task(param->pool, param, pool._arg);
}
DEVICE_LOG_DEBUG("thread_func_impl.task_completed: %zu", param->tidx);
DEVICE_LOG_DEBUG("thread_func_impl.task_completed: %zu\n", param->tidx);
qurt_barrier_wait(&pool._completed);
#ifdef GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING
DEVICE_LOG_WARN("[profiler]worker_thread, tidx: %zu, task_end: %lluus",
DEVICE_LOG_WARN("[profiler]worker_thread, tidx: %zu, task_end: %lluus\n",
param->tidx,
static_cast<unsigned long long>(
HAP_perf_qtimer_count_to_us(HAP_perf_get_qtimer_count() - task_begin_cycles)));
#endif
}
DEVICE_LOG_DEBUG("thread_func_impl.end: %zu", param->tidx);
DEVICE_LOG_DEBUG("thread_func_impl.end: %zu\n", param->tidx);
}
std::atomic_bool _thread_exit = false;

View File

@ -3,8 +3,6 @@
#include "op_types.hpp" // TODO: remove this include
#include "vec_ops.hpp"
#include <hexagon_types.h>
#include <array>
static_assert(sizeof(npu_device_block_q4_k) ==
@ -31,42 +29,122 @@ inline npu_device_fp16_t to_fp16(const float src) {
template <typename _TStruct, size_t _Count, auto _MemberPtr> inline HVX_Vector load_into_vector(const _TStruct * src) {
static_assert(hexagon::kBytesPerVector >= sizeof(_TStruct) * _Count, "_TStruct too large for vector load");
const HVX_Vector * qs0 = reinterpret_cast<const HVX_Vector *>(&(src->*_MemberPtr));
HVX_Vector prev = *qs0;
HVX_Vector curr = hexagon::is_addr_aligned(qs0) ? Q6_V_vzero() : *(qs0 + 1);
return Q6_V_valign_VVR(curr, prev, (size_t) qs0);
return *reinterpret_cast<const HVX_UVector *>(&(src->*_MemberPtr));
}
template <typename _TStruct, size_t _Count> inline HVX_Vector load_struct_into_vector(const _TStruct * src) {
static_assert(hexagon::kBytesPerVector >= sizeof(_TStruct) * _Count, "_TStruct too large for vector load");
return *reinterpret_cast<const HVX_UVector *>(src);
}
template <typename _TBlock> inline HVX_Vector load_block_generic(const _TBlock & src) {
static_assert(hexagon::kBytesPerVector >= sizeof(_TBlock), "wrong q4_0 block size/padding");
static_assert(hexagon::kBytesPerVector >= sizeof(_TBlock), "wrong block size/padding");
return load_into_vector<_TBlock, 1, &_TBlock::qs>(&src);
}
template <typename _TBlock> inline HVX_Vector load_dual_block_generic(const _TBlock * srcs) {
static_assert(hexagon::kBytesPerVector >= sizeof(_TBlock) * 2, "wrong q4_0 block size/padding");
constexpr const uint32_t kSizeOfQs = sizeof(_TBlock::qs);
template <typename _TBlock> inline HVX_Vector make_scale_load_mask() {
static_assert(sizeof(_TBlock) < 32, "wrong block size/padding");
static_assert(sizeof(_TBlock::qs) == 16 || sizeof(_TBlock::qs) == 32, "wrong quantization block size");
HVX_Vector blocks = load_into_vector<_TBlock, 2, &_TBlock::qs>(srcs);
HVX_Vector block1 = Q6_V_valign_VVR(Q6_V_vzero(), blocks, sizeof(_TBlock));
return Q6_V_lo_W(Q6_W_vshuff_VVR(block1, blocks, kSizeOfQs));
constexpr const size_t kScaleBlockSize = QUANT_BLOCK_SIZE * sizeof(hexagon::dequant_output_type);
// TODO: handle the case that scale not at the start of struct
hexagon::HVX_VectorAlias ret;
for (size_t i = 0; i < QUANT_BLOCK_SIZE; ++i) {
size_t base = i * 2;
ret.u8[base] = 0;
ret.u8[base + 1] = 1;
ret.u8[base + kScaleBlockSize] = sizeof(_TBlock);
ret.u8[base + kScaleBlockSize + 1] = sizeof(_TBlock) + 1;
}
return ret.v;
}
template <typename _TBlock> inline HVX_Vector load_qual_block_generic(const _TBlock * srcs) {
static_assert(hexagon::kBytesPerVector >= sizeof(_TBlock) * 4, "wrong q4_0 block size/padding");
template <typename _TBlock> inline HVX_Vector load_dual_block_generic(const _TBlock * srcs, HVX_VectorPred mask) {
static_assert(hexagon::kBytesPerVector >= sizeof(_TBlock) * 2, "wrong block size/padding");
constexpr const uint32_t kSizeOfQs = sizeof(_TBlock::qs);
constexpr const uint32_t kSizeOfScale = sizeof(_TBlock) - kSizeOfQs;
HVX_Vector blocks = load_into_vector<_TBlock, 2, &_TBlock::qs>(srcs);
HVX_Vector block1 = Q6_V_vror_VR(blocks, kSizeOfScale);
return Q6_V_vmux_QVV(mask, blocks, block1);
}
template <typename _TBlock>
inline hexagon::HVX_Vector_x2 load_dual_block_generic(const _TBlock * srcs,
HVX_VectorPred mask,
const HVX_Vector scale_indices) {
static_assert(hexagon::kBytesPerVector >= sizeof(_TBlock) * 2, "wrong block size/padding");
constexpr const uint32_t kSizeOfQs = sizeof(_TBlock::qs);
constexpr const uint32_t kSizeOfScale = sizeof(_TBlock) - kSizeOfQs;
hexagon::HVX_Vector_x2 result;
HVX_Vector blocks = load_into_vector<_TBlock, 2, &_TBlock::qs>(srcs);
HVX_Vector block1 = Q6_V_vror_VR(blocks, kSizeOfScale);
HVX_Vector scale01 = Q6_Vb_vshuff_Vb(blocks);
result.val[0] = Q6_V_vmux_QVV(mask, blocks, block1);
result.val[1] = Q6_Vb_vlut32_VbVbR_nomatch(scale_indices, scale01, 0);
return result;
}
template <typename _TBlock> inline hexagon::HVX_VectorPred_x3 make_quad_block_mask() {
static_assert(hexagon::kBytesPerVector >= sizeof(_TBlock) * 4, "wrong block size/padding");
constexpr const uint32_t kSizeOfQs = sizeof(_TBlock::qs);
HVX_Vector blocks = load_into_vector<_TBlock, 4, &_TBlock::qs>(srcs);
HVX_Vector block1 = Q6_V_valign_VVR(Q6_V_vzero(), blocks, sizeof(_TBlock));
HVX_VectorPair qp0 = Q6_W_vshuff_VVR(block1, blocks, kSizeOfQs);
hexagon::HVX_VectorPred_x3 mask;
mask.val[0] = Q6_Q_vsetq_R(kSizeOfQs);
mask.val[1] = Q6_Q_vsetq_R(kSizeOfQs * 3);
mask.val[2] = Q6_Q_vsetq_R(kSizeOfQs * 2);
return mask;
}
HVX_Vector block2 = Q6_V_valign_VVR(Q6_V_vzero(), blocks, sizeof(_TBlock) * 2);
HVX_Vector block3 = Q6_V_valign_VVR(Q6_V_vzero(), blocks, sizeof(_TBlock) * 3);
HVX_VectorPair qp1 = Q6_W_vshuff_VVR(block3, block2, kSizeOfQs);
template <typename _TBlock>
inline hexagon::HVX_Vector_x3 load_qual_block_generic(const _TBlock * srcs,
const hexagon::HVX_VectorPred_x3 mask,
const HVX_Vector scale_indices) {
static_assert(hexagon::kBytesPerVector >= sizeof(_TBlock) * 4, "wrong block size/padding");
constexpr const uint32_t kSizeOfQs = sizeof(_TBlock::qs);
constexpr const uint32_t kSizeOfScale = sizeof(_TBlock) - kSizeOfQs;
return Q6_V_lo_W(Q6_W_vshuff_VVR(Q6_V_lo_W(qp1), Q6_V_lo_W(qp0), kSizeOfQs * 2));
hexagon::HVX_Vector_x3 result;
const HVX_Vector blocks = load_struct_into_vector<_TBlock, 4>(srcs);
{
HVX_Vector block0 = Q6_V_vror_VR(blocks, kSizeOfScale);
HVX_Vector block1 = Q6_V_vror_VR(blocks, kSizeOfScale * 2);
HVX_Vector block2 = Q6_V_vror_VR(blocks, kSizeOfScale * 3);
HVX_Vector block3 = Q6_V_vror_VR(blocks, kSizeOfScale * 4);
HVX_Vector block01 = Q6_V_vmux_QVV(mask.val[0], block0, block1);
HVX_Vector block23 = Q6_V_vmux_QVV(mask.val[1], block2, block3);
result.val[0] = Q6_V_vmux_QVV(mask.val[2], block01, block23);
}
{
HVX_Vector scale23 = Q6_V_vror_VR(blocks, sizeof(_TBlock) * 2);
HVX_Vector scale01 = Q6_Vb_vshuff_Vb(blocks);
scale23 = Q6_Vb_vshuff_Vb(scale23);
result.val[1] = Q6_Vb_vlut32_VbVbR_nomatch(scale_indices, scale01, 0);
result.val[2] = Q6_Vb_vlut32_VbVbR_nomatch(scale_indices, scale23, 0);
}
return result;
}
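// Q4_K packs eight 6-bit (scale, min) pairs into the 12-byte `scales` array of a
// super-block; get_scale_min_k4 unpacks pair j. For j < 4 the scale lives in the low
// 6 bits of bytes 0..3 and the min in the low 6 bits of bytes 4..7; the remaining
// pairs reuse the high bits (handled by the branch not shown in this hunk).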
inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t * d, uint8_t * m) {
// TODO: use intrinsics
if (j < 4) {
*d = q[j] & 63;
*m = q[j + 4] & 63;
@ -324,23 +402,24 @@ void quantize_row_q4_K(const float * src, void * dst, size_t count) {
}
}
void dequantize_row_q8_0(const void * src, hexagon::dequant_output_type * dst, size_t count) {
void dequantize_row_q8_0(const void * src, hexagon::dequant_output_type * dst, size_t count, HVX_Vector) {
constexpr const int qk = QUANT_BLOCK_SIZE;
static_assert(QUANT_BLOCK_SIZE == hexagon::kBytesPerVector / sizeof(float));
const int nb = count / qk;
const auto * src_ptr = reinterpret_cast<const npu_device_block_q8_0 *>(src);
auto * dst_ptr = ((hexagon::dequant_output_type *) dst); // TODO: opt for aligned access
const int nb = count / qk;
const auto * src_ptr = reinterpret_cast<const npu_device_block_q8_0 *>(src);
auto * dst_ptr = ((hexagon::dequant_output_type *) dst); // TODO: opt for aligned access
const HVX_VectorPred mask = Q6_Q_vsetq_R(sizeof(npu_device_block_q8_0::qs));
const HVX_VectorPred scale_mask = Q6_Q_vsetq_R(hexagon::kBytesPerVector / 2);
int i = 0;
for (; i + 1 < nb; i += 2) {
const auto & src0 = src_ptr[i];
const auto & src1 = src_ptr[i + 1];
HVX_Vector scales01 =
Q6_V_valign_VVR(Q6_Vh_vsplat_R(src1.d), Q6_Vh_vsplat_R(src0.d), hexagon::kBytesPerVector / 2);
HVX_Vector scales01 = Q6_V_vmux_QVV(scale_mask, Q6_Vh_vsplat_R(src0.d), Q6_Vh_vsplat_R(src1.d));
HVX_Vector qs = load_dual_block_generic(src_ptr + i);
HVX_Vector qs = load_dual_block_generic(src_ptr + i, mask);
HVX_Vector q_lo = Q6_Vhf_equals_Vh(Q6_V_lo_W(Q6_Wh_vunpack_Vb(qs)));
HVX_Vector result = Q6_Vqf16_vmpy_VhfVhf(q_lo, scales01);
@ -363,44 +442,39 @@ void dequantize_row_q8_0(const void * src, hexagon::dequant_output_type * dst, s
}
template <bool _IsDstAligned>
void dequantize_row_q4_0_impl(const void * src, hexagon::dequant_output_type * dst, size_t count) {
void dequantize_row_q4_0_impl(const void * src, hexagon::dequant_output_type * dst, size_t count, HVX_Vector table) {
constexpr const int qk = QUANT_BLOCK_SIZE;
static_assert(qk % 2 == 0, "qk must be even");
static_assert(QUANT_BLOCK_SIZE == hexagon::kBytesPerVector / sizeof(float));
constexpr const uint32_t kSizeOfQs = sizeof(npu_device_block_q4_0::qs);
static const auto load_masks = make_quad_block_mask<npu_device_block_q4_0>();
static const HVX_Vector scale_indices __attribute__((aligned(hexagon::kBytesPerVector))) =
make_scale_load_mask<npu_device_block_q4_0>();
const int nb = count / qk;
const auto * src_ptr = reinterpret_cast<const npu_device_block_q4_0 *>(src);
const HVX_Vector mask = Q6_Vb_vsplat_R(0x0F);
const HVX_Vector minus = Q6_Vb_vsplat_R(8);
hexagon::dequant_output_type * dst_ptr = dst; // TODO: opt for aligned access
int i = 0;
for (; i + 3 < nb; i += 4) {
const auto & src0 = src_ptr[i];
const auto & src1 = src_ptr[i + 1];
const auto & src2 = src_ptr[i + 2];
const auto & src3 = src_ptr[i + 3];
auto qs = load_qual_block_generic(src_ptr + i, load_masks, scale_indices);
HVX_Vector scales01 =
Q6_V_valign_VVR(Q6_Vh_vsplat_R(src1.d), Q6_Vh_vsplat_R(src0.d), hexagon::kBytesPerVector / 2);
HVX_Vector scales23 =
Q6_V_valign_VVR(Q6_Vh_vsplat_R(src3.d), Q6_Vh_vsplat_R(src2.d), hexagon::kBytesPerVector / 2);
HVX_Vector q_lo = qs.val[0];
HVX_Vector q_hi = Q6_Vub_vlsr_VubR(qs.val[0], 4);
HVX_Vector qs = load_qual_block_generic(src_ptr + i);
HVX_Vector q_lo = Q6_V_vand_VV(qs, mask);
HVX_Vector q_hi = Q6_Vub_vlsr_VubR(qs, 4);
HVX_VectorPair qp0 = Q6_W_vshuff_VVR(q_hi, q_lo, kSizeOfQs * (1 + 2 + 4));
q_lo = Q6_Vb_vsub_VbVb(Q6_V_lo_W(qp0), minus);
qp0 = Q6_Wh_vunpack_Vb(q_lo);
HVX_VectorPair qp0 = Q6_W_vshuff_VVR(q_hi, q_lo, kSizeOfQs * (1 + 2 + 4));
q_lo = Q6_Vhf_equals_Vh(Q6_V_lo_W(qp0));
q_hi = Q6_Vhf_equals_Vh(Q6_V_hi_W(qp0));
q_lo = Q6_Vb_vshuff_Vb(Q6_V_lo_W(qp0));
qp0 = Q6_Wh_vlut16_VbVhR_nomatch(q_lo, table, 0);
q_lo = Q6_V_lo_W(qp0);
q_hi = Q6_V_hi_W(qp0);
q_lo = Q6_Vqf16_vmpy_VhfVhf(q_lo, qs.val[1]);
q_hi = Q6_Vqf16_vmpy_VhfVhf(q_hi, qs.val[2]);
q_lo = Q6_Vqf16_vmpy_VhfVhf(q_lo, scales01);
q_lo = Q6_Vhf_equals_Vqf16(q_lo);
q_hi = Q6_Vqf16_vmpy_VhfVhf(q_hi, scales23);
q_hi = Q6_Vhf_equals_Vqf16(q_hi);
if constexpr (_IsDstAligned) {
@ -415,21 +489,16 @@ void dequantize_row_q4_0_impl(const void * src, hexagon::dequant_output_type * d
}
for (; i + 1 < nb; i += 2) {
const auto & src0 = src_ptr[i];
const auto & src1 = src_ptr[i + 1];
HVX_Vector scales01 =
Q6_V_valign_VVR(Q6_Vh_vsplat_R(src1.d), Q6_Vh_vsplat_R(src0.d), hexagon::kBytesPerVector / 2);
HVX_Vector qs = load_dual_block_generic(src_ptr + i);
HVX_Vector q_lo = Q6_V_vand_VV(qs, mask);
HVX_Vector q_hi = Q6_Vub_vlsr_VubR(qs, 4);
auto qs = load_dual_block_generic(src_ptr + i, load_masks.val[0], scale_indices);
HVX_Vector q_lo = qs.val[0];
HVX_Vector q_hi = Q6_Vub_vlsr_VubR(qs.val[0], 4);
HVX_VectorPair qp0 = Q6_W_vshuff_VVR(q_hi, q_lo, kSizeOfQs * (1 + 2));
q_lo = Q6_Vb_vsub_VbVb(Q6_V_lo_W(qp0), minus);
qp0 = Q6_Wh_vunpack_Vb(q_lo);
q_lo = Q6_Vhf_equals_Vh(Q6_V_lo_W(qp0));
q_lo = Q6_Vqf16_vmpy_VhfVhf(q_lo, scales01);
q_lo = Q6_Vhf_equals_Vqf16(q_lo);
q_lo = Q6_Vb_vshuff_Vb(Q6_V_lo_W(qp0));
qp0 = Q6_Wh_vlut16_VbVhR_nomatch(q_lo, table, 0);
q_lo = Q6_Vqf16_vmpy_VhfVhf(Q6_V_lo_W(qp0), qs.val[1]);
q_lo = Q6_Vhf_equals_Vqf16(q_lo);
if constexpr (_IsDstAligned) {
*reinterpret_cast<HVX_Vector *>(dst_ptr) = q_lo;
@ -445,14 +514,15 @@ void dequantize_row_q4_0_impl(const void * src, hexagon::dequant_output_type * d
HVX_Vector scales = Q6_Vh_vsplat_R(curr_blk.d);
HVX_Vector qs = load_block_generic(curr_blk);
HVX_Vector q_lo = Q6_V_vand_VV(qs, mask);
HVX_Vector q_lo = qs;
HVX_Vector q_hi = Q6_Vub_vlsr_VubR(qs, 4);
HVX_VectorPair qp0 = Q6_W_vshuff_VVR(q_hi, q_lo, kSizeOfQs);
q_lo = Q6_Vb_vsub_VbVb(Q6_V_lo_W(qp0), minus);
qp0 = Q6_Wh_vunpack_Vb(q_lo);
q_lo = Q6_Vhf_equals_Vh(Q6_V_lo_W(qp0));
q_lo = Q6_Vqf16_vmpy_VhfVhf(q_lo, scales);
q_lo = Q6_Vhf_equals_Vqf16(q_lo);
q_lo = Q6_Vb_vshuff_Vb(Q6_V_lo_W(qp0));
qp0 = Q6_Wh_vlut16_VbVhR_nomatch(q_lo, table, 0);
q_lo = Q6_Vqf16_vmpy_VhfVhf(Q6_V_lo_W(qp0), scales);
q_lo = Q6_Vhf_equals_Vqf16(q_lo);
if constexpr (_IsDstAligned) {
hexagon::q6op_vstu_variable_aligned<hexagon::kBytesPerVector / 2>(dst_ptr, q_lo);
@ -462,24 +532,82 @@ void dequantize_row_q4_0_impl(const void * src, hexagon::dequant_output_type * d
}
}
void dequantize_row_q4_0(const void * src, hexagon::dequant_output_type * dst, size_t count) {
HVX_Vector load_dequant_table_q4_0() {
constexpr const int kTableSize = 1 << 4; // 4 bits per value, 16 values
constexpr const int kQ4ZeroPoint = 8; // zero point for q4_0 quantization
static_assert(kTableSize <= hexagon::kBytesPerVector / sizeof(__fp16), "table too large");
static const HVX_Vector result = []() -> HVX_Vector {
union {
HVX_Vector v;
__fp16 f16[sizeof(HVX_Vector) / sizeof(__fp16)];
} table __attribute__((aligned(hexagon::kBytesPerVector)));
table.v = Q6_V_vzero();
for (int i = 0; i < kTableSize; ++i) {
table.f16[i * 2] = i - kQ4ZeroPoint; // TODO: vectorize this?
}
return table.v;
}();
return result;
}
void dequantize_row_q4_0(const void * src, hexagon::dequant_output_type * dst, size_t count, HVX_Vector table) {
const bool dst_aligned = hexagon::is_addr_aligned(dst);
if (dst_aligned) {
dequantize_row_q4_0_impl<true>(src, dst, count);
dequantize_row_q4_0_impl<true>(src, dst, count, table);
} else {
dequantize_row_q4_0_impl<false>(src, dst, count);
dequantize_row_q4_0_impl<false>(src, dst, count, table);
}
}
void dequantize_row_q4_K(const void * src, hexagon::dequant_output_type * dst, size_t count) {
HVX_Vector load_dequant_table_q4_k() {
constexpr const int kTableSize = 1 << 4; // 4 bits per value, 16 values
static_assert(kTableSize <= hexagon::kBytesPerVector / sizeof(__fp16), "table too large");
const static HVX_Vector result = []() -> HVX_Vector {
union {
HVX_Vector v;
__fp16 f16[sizeof(HVX_Vector) / sizeof(__fp16)];
} table __attribute__((aligned(hexagon::kBytesPerVector)));
table.v = Q6_V_vzero();
for (int i = 0; i < kTableSize; ++i) {
table.f16[i * 2] = i; // TODO: vectorize this?
}
return table.v;
}();
return result;
}
void dequantize_row_q4_K(const void * src, hexagon::dequant_output_type * dst, size_t count, HVX_Vector table) {
constexpr const int kQuantSubBlockSize = 32;
const int nb = count / QUANT_K_BLOCK_SIZE;
const auto * src_ptr = reinterpret_cast<const npu_device_block_q4_k *>(src);
auto * dst_ptr = reinterpret_cast<__fp16 *>(dst);
auto * dst_ptr = reinterpret_cast<npu_device_fp16_t *>(dst);
const HVX_VectorPred scale_mask = Q6_Q_vsetq_R(hexagon::kBytesPerVector / 2);
union {
HVX_VectorPair p[2];
HVX_Vector v[4];
} dual_pair __attribute__((aligned(hexagon::kBytesPerVector * 4)));
// TODO: use intrinsics
for (int i = 0; i < nb; i++) {
const uint8_t * q = src_ptr[i].qs;
HVX_Vector qv = *reinterpret_cast<const HVX_UVector *>(q);
HVX_Vector q_lo = qv;
HVX_Vector q_hi = Q6_Vub_vlsr_VubR(qv, 4);
HVX_VectorPair qp = Q6_W_vshuff_VVR(q_hi, q_lo, kQuantSubBlockSize * 3);
dual_pair.p[0] = Q6_Wh_vlut16_VbVhR_nomatch(Q6_Vb_vshuff_Vb(Q6_V_lo_W(qp)), table, 0);
dual_pair.p[1] = Q6_Wh_vlut16_VbVhR_nomatch(Q6_Vb_vshuff_Vb(Q6_V_hi_W(qp)), table, 0);
const __fp16 d = reinterpret_cast<const __fp16 &>(src_ptr[i].d);
const __fp16 min = reinterpret_cast<const __fp16 &>(src_ptr[i].dmin);
@ -487,30 +615,61 @@ void dequantize_row_q4_K(const void * src, hexagon::dequant_output_type * dst, s
uint8_t sc = 0;
uint8_t m = 0;
const auto * scales = src_ptr[i].scales;
for (int j = 0; j < QUANT_K_BLOCK_SIZE; j += 64) {
for (int j = 0; j < QUANT_K_BLOCK_SIZE; j += 128) {
get_scale_min_k4(is + 0, scales, &sc, &m);
const __fp16 d0 = d * sc;
const __fp16 m0 = min * m;
HVX_Vector dv0 = Q6_Vh_vsplat_R(reinterpret_cast<const uint16_t &>(d0));
HVX_Vector dm0 = Q6_Vh_vsplat_R(reinterpret_cast<const uint16_t &>(m0));
get_scale_min_k4(is + 1, scales, &sc, &m);
const __fp16 d1 = d * sc;
const __fp16 m1 = min * m;
get_scale_min_k4(is + 1, scales, &sc, &m);
HVX_Vector dv1 = Q6_Vh_vsplat_R(reinterpret_cast<const uint16_t &>(d1));
HVX_Vector dm1 = Q6_Vh_vsplat_R(reinterpret_cast<const uint16_t &>(m1));
get_scale_min_k4(is + 2, scales, &sc, &m);
const __fp16 d2 = d * sc;
const __fp16 m2 = min * m;
for (int l = 0; l < 32; ++l) {
dst_ptr[0] = d1 * (q[l] & 0xF) - m1;
dst_ptr[32] = d2 * ((q[l] >> 4) & 0xF) - m2;
dst_ptr++;
}
dst_ptr += 32;
q += 32;
is += 2;
HVX_Vector dv2 = Q6_Vh_vsplat_R(reinterpret_cast<const uint16_t &>(d2));
HVX_Vector dm2 = Q6_Vh_vsplat_R(reinterpret_cast<const uint16_t &>(m2));
get_scale_min_k4(is + 3, scales, &sc, &m);
const __fp16 d3 = d * sc;
const __fp16 m3 = min * m;
HVX_Vector dv3 = Q6_Vh_vsplat_R(reinterpret_cast<const uint16_t &>(d3));
HVX_Vector dm3 = Q6_Vh_vsplat_R(reinterpret_cast<const uint16_t &>(m3));
HVX_Vector dv01 = Q6_V_vmux_QVV(scale_mask, dv0, dv1);
HVX_Vector dm01 = Q6_V_vmux_QVV(scale_mask, dm0, dm1);
HVX_Vector dv23 = Q6_V_vmux_QVV(scale_mask, dv2, dv3);
HVX_Vector dm23 = Q6_V_vmux_QVV(scale_mask, dm2, dm3);
q_lo = Q6_Vqf16_vmpy_VhfVhf(dual_pair.v[j / 64], dv01);
q_lo = Q6_Vqf16_vsub_Vqf16Vhf(q_lo, dm01);
q_hi = Q6_Vqf16_vmpy_VhfVhf(dual_pair.v[j / 64 + 1], dv23);
q_hi = Q6_Vqf16_vsub_Vqf16Vhf(q_hi, dm23);
reinterpret_cast<HVX_UVector *>(dst_ptr)[0] = Q6_Vhf_equals_Vqf16(q_lo);
reinterpret_cast<HVX_UVector *>(dst_ptr)[1] = Q6_Vhf_equals_Vqf16(q_hi);
dst_ptr += 128;
is += 4;
}
}
}
void copy_row_f16(const void * src, hexagon::dequant_output_type * dst, size_t count) {
void copy_row_f16(const void * src, hexagon::dequant_output_type * dst, size_t count, HVX_Vector) {
hexagon::vec_cpy_f16(reinterpret_cast<const npu_device_fp16_t *>(src), dst, count);
}
void copy_row_f32(const void * src, hexagon::dequant_output_type * dst, size_t count) {
void copy_row_f32(const void * src, hexagon::dequant_output_type * dst, size_t count, HVX_Vector) {
hexagon::vec_cpy_f32(reinterpret_cast<const float *>(src), reinterpret_cast<float *>(dst), count);
}
@ -539,12 +698,16 @@ constexpr const hexagon::device_type_traits kDeviceTypeTraits[] = {
"Q4_0", QUANT_BLOCK_SIZE,
sizeof(npu_device_block_q4_0),
true, dequantize_row_q4_0,
quantize_row_q4_0 },
quantize_row_q4_0, nullptr,
nullptr, nullptr,
load_dequant_table_q4_0 },
{ NPU_DATA_TYPE_Q4_K,
"Q4_K", QUANT_K_BLOCK_SIZE,
sizeof(npu_device_block_q4_k),
true, dequantize_row_q4_K,
quantize_row_q4_K },
quantize_row_q4_K, nullptr,
nullptr, nullptr,
load_dequant_table_q4_k },
};
static_assert(std::size(kDeviceTypeTraits) == NPU_DATA_TYPE_COUNT,

View File

@ -3,6 +3,8 @@
#include "tensor.hpp"
#include "util.hpp"
#include <hexagon_types.h>
namespace hexagon {
using dequant_output_type = npu_device_fp16_t;
@ -10,9 +12,10 @@ using dequant_output_type = npu_device_fp16_t;
bool init_f16_f32_table(float * table, size_t count);
typedef void (*quantize_row_type)(const float * src, void * dst, size_t count);
typedef void (*dequantize_row_type)(const void * src, dequant_output_type * dst, size_t count);
typedef void (*dequantize_row_type)(const void * src, dequant_output_type * dst, size_t count, HVX_Vector table);
typedef float (*vec_dot_type)(const void * src0, const void * src1, size_t count);
typedef bool (*can_use_aligned_vec_dot_type)(const void * src0, const void * src1, size_t count);
typedef HVX_Vector (*load_dequant_table_type)();
struct device_type_traits {
npu_device_tensor_data_type type;
@ -21,11 +24,12 @@ struct device_type_traits {
size_t type_size;
bool is_quantized;
dequantize_row_type to_float;
quantize_row_type from_float;
vec_dot_type vec_dot;
vec_dot_type vec_dot_aligned;
can_use_aligned_vec_dot_type can_use_aligned_vec_dot;
dequantize_row_type to_float = nullptr;
quantize_row_type from_float = nullptr;
vec_dot_type vec_dot = nullptr;
vec_dot_type vec_dot_aligned = nullptr;
can_use_aligned_vec_dot_type can_use_aligned_vec_dot = nullptr;
load_dequant_table_type load_dequant_table = nullptr;
};
const device_type_traits & get_type_traits(npu_device_tensor_data_type type);
@ -49,7 +53,7 @@ namespace hexagon {
inline auto make_scoped_op_perf_timer(tensor * op, size_t tidx) {
auto * src0 = op->get_src(0);
auto * src1 = op->get_src(1);
char buffer[1024];
char buffer[512];
if (src1 == nullptr) {
snprintf(buffer,
sizeof(buffer),
@ -96,8 +100,10 @@ inline auto make_scoped_op_perf_timer(tensor * op, size_t tidx) {
# define DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_WITH_MULTI_SUB_PROC(op, tidx, tracker_name) \
auto __npu_op_timer_##tracker_name = hexagon::make_scoped_op_perf_timer(op, tidx)
# define DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_ADD_ONE_SUB_PROC(tracker_name, idx, sub_prefix) \
hexagon::npu_sub_process_scoped_timer<decltype(__npu_op_timer_##tracker_name)::kBufferCount, idx> \
# define DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_ADD_ONE_SUB_PROC(tracker_name, idx, sub_prefix) \
hexagon::npu_sub_process_scoped_timer< \
std::remove_reference_t<decltype(__npu_op_timer_##tracker_name)>::kBufferCount, \
idx> \
__npu_op_sub_timer##sub_prefix(__npu_op_timer_##tracker_name, #sub_prefix)
#else

View File

@ -8,17 +8,16 @@
#include <HAP_power.h>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <utility>
#define DEVICE_LOG_ERROR(...) FARF(FATAL, __VA_ARGS__)
#define DEVICE_LOG_WARN(...) FARF(ERROR, __VA_ARGS__)
#define DEVICE_LOG_INFO(...) FARF(HIGH, __VA_ARGS__)
#define DEVICE_LOG_ERROR(...) hexagon::log_error(__VA_ARGS__)
#define DEVICE_LOG_WARN(...) hexagon::log_message(__VA_ARGS__)
#define DEVICE_LOG_INFO(...) hexagon::log_message(__VA_ARGS__)
#ifdef _DEBUG
# undef FARF_LOW
# define FARF_LOW 1
# define DEVICE_LOG_DEBUG(...) FARF(LOW, __VA_ARGS__)
# define DEVICE_LOG_DEBUG(...) hexagon::log_message(__VA_ARGS__)
#else
# define DEVICE_LOG_DEBUG(...) (void) 0
#endif
@ -40,6 +39,20 @@
namespace hexagon {
__attribute__((format(printf, 1, 2))) inline void log_error(const char * format, ...) {
va_list args;
va_start(args, format);
std::vfprintf(stderr, format, args);
va_end(args);
}
__attribute__((format(printf, 1, 2))) inline void log_message(const char * format, ...) {
va_list args;
va_start(args, format);
std::vprintf(format, args);
va_end(args);
}
inline constexpr const char * op_get_name(npu_device_tensor_op op) {
switch (op) {
case NPU_OP_MUL_MAT:
@ -137,23 +150,22 @@ class power_utils {
return;
}
HAP_power_request_t request = {};
request.type = HAP_power_set_DCVS_v3;
request.dcvs_v3.dcvs_enable = enable ? TRUE : FALSE;
HAP_power_request_t request = {};
request.type = HAP_power_set_DCVS_v3;
request.dcvs_v3.set_dcvs_enable = enable ? TRUE : FALSE;
request.dcvs_v3.dcvs_enable = enable ? TRUE : FALSE;
request.dcvs_v3.set_core_params = TRUE;
if (enable) {
request.dcvs_v3.dcvs_option = HAP_DCVS_V2_PERFORMANCE_MODE;
/*
* sleep_latency : To request for sleep latency in micro-seconds.
* Sleep latency is the minimum time before which the DSP sleeps
* Set latency to 65535 to reset it to the default value
*/
request.dcvs_v3.set_latency = TRUE;
request.dcvs_v3.latency = 1000;
request.dcvs_v3.set_bus_params = TRUE;
request.dcvs_v3.bus_params.min_corner = HAP_DCVS_VCORNER_SVS;
request.dcvs_v3.bus_params.max_corner = HAP_DCVS_VCORNER_TURBO;
request.dcvs_v3.bus_params.target_corner = HAP_DCVS_VCORNER_NOM;
request.dcvs_v3.dcvs_option = HAP_DCVS_V2_PERFORMANCE_MODE;
request.dcvs_v3.set_bus_params = TRUE;
request.dcvs_v3.bus_params.min_corner = HAP_DCVS_VCORNER_MAX;
request.dcvs_v3.bus_params.max_corner = HAP_DCVS_VCORNER_MAX;
request.dcvs_v3.bus_params.target_corner = HAP_DCVS_VCORNER_MAX;
request.dcvs_v3.core_params.min_corner = HAP_DCVS_VCORNER_MAX;
request.dcvs_v3.core_params.max_corner = HAP_DCVS_VCORNER_MAX;
request.dcvs_v3.core_params.target_corner = HAP_DCVS_VCORNER_MAX;
request.dcvs_v3.set_sleep_disable = TRUE;
request.dcvs_v3.sleep_disable = TRUE;
}
auto ret = HAP_power_set(_context_ptr, &request);
@ -359,7 +371,7 @@ template <size_t _buffer_count, size_t _sub_idx> class npu_sub_process_scoped_ti
inline auto make_scoped_perf_timer(const char * format, ...) {
va_list args;
va_start(args, format);
char buffer[1024];
char buffer[512];
vsnprintf(buffer, sizeof(buffer), format, args);
va_end(args);
return npu_scoped_timer<1024>(buffer);

View File

@ -1120,10 +1120,75 @@ inline HVX_VectorPair hvx_vqf32_convert_vhf(HVX_Vector vxl) {
inline HVX_Vector_x2 hvx_vsf_convert_vhf(HVX_Vector vxl, HVX_Vector one) {
HVX_VectorPair res = Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(vxl), one);
return {
Q6_Vsf_equals_Vqf32(Q6_V_lo_W(res)),
Q6_Vsf_equals_Vqf32(Q6_V_hi_W(res)),
};
HVX_Vector_x2 ret;
ret.val[0] = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(res));
ret.val[1] = Q6_Vsf_equals_Vqf32(Q6_V_hi_W(res));
return ret;
}
/**
* @brief Calculates exponential (e^x) for vector elements with infinity guard
*
* This function computes the exponential value for each element in the input vector.
* For input values greater than kMaxExp (88.02f), the function returns the provided
* infinity value instead of attempting to calculate an exponential that would overflow.
*
* @param sline The input vector containing values to compute exponential for
* @param inf The vector containing the infinity representation to use for guarded values
* @return HVX_Vector containing exponential values, with values > kMaxExp replaced by inf
*
 * @note For input values greater than 88.02f, the function returns the specified infinity value
*/
inline HVX_Vector qhmath_hvx_exp_vf_guard_inf(HVX_Vector sline, const HVX_Vector inf) {
constexpr float kMaxExp = 88.02f;
const HVX_Vector max_exp = Q6_V_vsplat_R(reinterpret_cast<const uint32_t &>(kMaxExp));
HVX_VectorPred pred_gt_max_exp = Q6_Q_vcmp_gt_VsfVsf(sline, max_exp);
HVX_Vector out = qhmath_hvx_exp_vf(sline);
out = Q6_V_vmux_QVV(pred_gt_max_exp, inf, out);
return out;
}
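A scalar analogue of the guard, as an illustrative sketch only (it assumes a plain expf baseline rather than the HVX polynomial used by qhmath_hvx_exp_vf):

#include <cmath>

static inline float exp_guard_inf_scalar(float x) {
    constexpr float kMaxExp = 88.02f;
    // Matches the vmux above: inputs beyond the threshold return the supplied infinity.
    return (x > kMaxExp) ? INFINITY : std::exp(x);
}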
/**
* @brief Vectorized division with guard for infinite denominators on HVX.
*
* Performs element-wise division num/denom using qhmath_hvx_div_vf and then
* masks out lanes where denom equals the provided inf value, forcing those
* lanes of the result to zero. This is a temporary guard until proper INF
* handling is implemented in the underlying division routine.
*
* @param num Numerator vector (per-lane).
* @param denom Denominator vector (per-lane); lanes equal to inf are zeroed in the output.
* @param coeffs Coefficients used by qhmath_hvx_div_vf for the reciprocal/division approximation.
* @param inf Lane value representing +INF to compare against denom.
* @return Vector of num/denom with lanes set to zero where denom == inf.
*
* @note NaNs, negative infinity, zero denominators, and subnormals are not explicitly handled.
* @see qhmath_hvx_div_vf
*/
inline HVX_Vector qhmath_hvx_div_vf_guard_inf(HVX_Vector num,
HVX_Vector denom,
HVX_VectorPair_x4 coeffs,
const HVX_Vector inf) {
HVX_VectorPred pred_inf = Q6_Q_vcmp_eq_VwVw(denom, inf);
// TODO: fix the inf in div
HVX_Vector out = qhmath_hvx_div_vf(num, denom, coeffs);
out = Q6_V_vmux_QVV(pred_inf, Q6_V_vzero(), out);
return out;
}
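// Propagates a guarded overflow through an addition: lanes where num0 already equals the
// supplied inf stay at inf, all other lanes get the qf32 sum num0 + num1 converted back to sf.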
inline HVX_Vector Q6_Vsf_vadd_VsfVsf_guard_inf(HVX_Vector num0, HVX_Vector num1, const HVX_Vector inf) {
HVX_VectorPred pred0 = Q6_Q_vcmp_eq_VwVw(num0, inf);
HVX_Vector out = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(num0, num1));
out = Q6_V_vmux_QVV(pred0, inf, out);
return out;
}
} // namespace hexagon::vec::math

View File

@ -8,12 +8,18 @@
namespace hexagon {
constexpr const size_t kBytesPerVector = sizeof(HVX_Vector); // 128 for v73
constexpr const size_t kAlignMask = kBytesPerVector - 1;
template <typename T, int N> struct HEXAGON_pack {
T val[N];
};
using HVX_Vector_x2 = std::pair<HVX_Vector, HVX_Vector>;
using HVX_Vector_x2 = HEXAGON_pack<HVX_Vector, 2>;
using HVX_Vector_x3 = HEXAGON_pack<HVX_Vector, 3>;
using HVX_Vector_x4 = HEXAGON_pack<HVX_Vector, 4>;
using HVX_VectorPair_x4 = HEXAGON_pack<HVX_VectorPair, 4>;
using HVX_VectorPred_x3 = HEXAGON_pack<HVX_VectorPred, 3>;
typedef union {
HVX_VectorPair VV;
@ -24,8 +30,14 @@ typedef union {
} V;
} HVX_DV;
constexpr const size_t kBytesPerVector = sizeof(HVX_Vector); // 128 for v73
constexpr const size_t kAlignMask = kBytesPerVector - 1;
typedef union {
HVX_Vector v;
float f32[kBytesPerVector / sizeof(float)];
uint32_t u32[kBytesPerVector / sizeof(uint32_t)];
__fp16 f16[kBytesPerVector / sizeof(__fp16)];
uint16_t u16[kBytesPerVector / sizeof(uint16_t)];
uint8_t u8[kBytesPerVector];
} HVX_VectorAlias;
inline size_t get_aligned_size(size_t size) {
return (size + kAlignMask) & ~kAlignMask;
@ -383,22 +395,35 @@ _TReturn type_erase_dot_func(const void * src0, const void * src1, size_t count)
inline HVX_Vector vec_silu_f32_f32(HVX_Vector x, HVX_VectorPair_x4 coeff) {
using namespace hexagon::vec::math;
HVX_Vector one = Q6_V_vsplat_R(0x3F800000);
constexpr float kMaxExp = 88.02f; // log(INF)
const HVX_Vector max_exp = Q6_V_vsplat_R(reinterpret_cast<const uint32_t &>(kMaxExp));
HVX_Vector one = Q6_V_vsplat_R(0x3F800000);
// x/(1.0f + expf(-x));
HVX_Vector exp_neg_x = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vsub_VsfVsf(Q6_V_vzero(), x));
HVX_Vector denom = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(qhmath_hvx_exp_vf(exp_neg_x), one));
return qhmath_hvx_div_vf(x, denom, coeff);
HVX_Vector exp_neg_x = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vsub_VsfVsf(Q6_V_vzero(), x));
HVX_VectorPred pred0 = Q6_Q_vcmp_gt_VsfVsf(exp_neg_x, max_exp);
HVX_Vector denom = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(qhmath_hvx_exp_vf(exp_neg_x), one));
HVX_Vector out = qhmath_hvx_div_vf(x, denom, coeff);
out = Q6_V_vmux_QVV(pred0, Q6_V_vzero(), out);
return out;
}
inline HVX_Vector vec_silu_f16_f16(HVX_Vector x, HVX_VectorPair_x4 coeff) {
using namespace hexagon::vec::math;
HVX_Vector one = Q6_Vh_vsplat_R(0x3c00);
constexpr __fp16 kMaxExp = 11.0898664f; // log(INF)
const HVX_Vector max_exp = Q6_Vh_vsplat_R(reinterpret_cast<const uint16_t &>(kMaxExp));
HVX_Vector one = Q6_Vh_vsplat_R(0x3c00);
// x/(1.0f + expf(-x));
HVX_Vector exp_neg_x = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vsub_VhfVhf(Q6_V_vzero(), x));
HVX_Vector denom = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vadd_VhfVhf(qhmath_hvx_exp_vhf(exp_neg_x), one));
return qhmath_hvx_div_vhf(x, denom, coeff);
HVX_Vector exp_neg_x = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vsub_VhfVhf(Q6_V_vzero(), x));
HVX_VectorPred pred0 = Q6_Q_vcmp_gt_VhfVhf(exp_neg_x, max_exp);
HVX_Vector denom = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vadd_VhfVhf(qhmath_hvx_exp_vhf(exp_neg_x), one));
HVX_Vector out = qhmath_hvx_div_vhf(x, denom, coeff);
out = Q6_V_vmux_QVV(pred0, Q6_V_vzero(), out);
return out;
}
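The zero-mux in both SiLU variants follows the limit of the function; a minimal scalar sketch under that assumption (not part of the diff):

#include <cmath>

// When -x exceeds ~log(max representable), exp(-x) would overflow; since
// x / (1 + exp(-x)) tends to 0 for large negative x, those lanes are forced to zero.
static inline float silu_guarded_scalar_ref(float x) {
    constexpr float kMaxExp = 88.02f;
    if (-x > kMaxExp) {
        return 0.0f;
    }
    return x / (1.0f + std::exp(-x));
}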
inline HVX_Vector vec_swiglu_f32_f32(HVX_Vector x, HVX_Vector g, HVX_VectorPair_x4 coeff) {

View File

@ -16,16 +16,18 @@ template <typename _TElem,
inline _TRet vec_dot_product_impl(const _TElem * src0, const _TElem * src1, size_t count) {
constexpr const size_t kElementsPerVector = hexagon::kBytesPerVector / sizeof(_TElem);
const HVX_Vector kZeroV = Q6_V_vzero();
HVX_Vector * src0_vec_ptr = ((HVX_Vector *) src0);
HVX_Vector * const src0_vec_ptr_end = ((HVX_Vector *) src0) + count / kElementsPerVector;
HVX_Vector * src1_vec_ptr = ((HVX_Vector *) src1);
HVX_Vector prev0 = *src0_vec_ptr++;
HVX_Vector prev1 = *src1_vec_ptr++;
HVX_Vector sum = Q6_V_vzero();
HVX_Vector sum = kZeroV;
if (src0_vec_ptr_end - src0_vec_ptr > 1) {
HVX_Vector sum0 = Q6_V_vzero();
HVX_Vector sum1 = Q6_V_vzero();
HVX_Vector sum0 = kZeroV;
HVX_Vector sum1 = kZeroV;
do {
HVX_VectorPair curr0 = reinterpret_cast<HVX_VectorPair *>(src0_vec_ptr)[0];
@ -33,14 +35,19 @@ inline _TRet vec_dot_product_impl(const _TElem * src0, const _TElem * src1, size
HVX_Vector l0 = Q6_V_valign_VVR(Q6_V_lo_W(curr0), prev0, (size_t) src0);
HVX_Vector l1 = Q6_V_valign_VVR(Q6_V_lo_W(curr1), prev1, (size_t) src1);
sum0 = _AddFunc(_MpyFunc(l0, l1), sum0);
HVX_Vector h0 = Q6_V_valign_VVR(Q6_V_hi_W(curr0), Q6_V_lo_W(curr0), (size_t) src0);
HVX_Vector h1 = Q6_V_valign_VVR(Q6_V_hi_W(curr1), Q6_V_lo_W(curr1), (size_t) src1);
sum1 = _AddFunc(_MpyFunc(h0, h1), sum1);
HVX_Vector mpy0 = _MpyFunc(l0, l1);
HVX_Vector mpy1 = _MpyFunc(h0, h1);
prev0 = Q6_V_hi_W(curr0);
prev1 = Q6_V_hi_W(curr1);
sum0 = _AddFunc(mpy0, sum0);
sum1 = _AddFunc(mpy1, sum1);
src0_vec_ptr += 2;
src1_vec_ptr += 2;
} while (src0_vec_ptr_end - src0_vec_ptr > 1);
@ -73,10 +80,11 @@ inline _TRet vec_dot_product_impl(const _TElem * src0, const _TElem * src1, size
src1_vec_ptr += should_fetch_src1 ? 1 : 0;
HVX_Vector s0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0);
HVX_Vector s1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1);
prev0 = curr0;
prev1 = curr1;
sum = _AddFunc(_MpyFunc(s0, s1), sum);
HVX_Vector mpy0 = _MpyFunc(s0, s1);
prev0 = curr0;
prev1 = curr1;
sum = _AddFunc(mpy0, sum);
}
if (leftover > 0) {
@ -92,7 +100,7 @@ inline _TRet vec_dot_product_impl(const _TElem * src0, const _TElem * src1, size
prev1;
curr1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1);
sum = _AddFunc(Q6_V_valign_VVR(_MpyFunc(curr0, curr1), Q6_V_vzero(), leftover_bytes), sum);
sum = _AddFunc(Q6_V_valign_VVR(_MpyFunc(curr0, curr1), kZeroV, leftover_bytes), sum);
}
return _ReduceFunc(sum);
@ -106,36 +114,38 @@ template <typename _TElem,
inline _TRet vec_dot_product_aligned_impl(const _TElem * src0, const _TElem * src1, size_t count) {
constexpr const size_t kElementsPerVector = hexagon::kBytesPerVector / sizeof(_TElem);
const HVX_Vector kZeroV = Q6_V_vzero();
HVX_Vector * src0_vec_ptr = ((HVX_Vector *) src0);
HVX_Vector * const src0_vec_ptr_end = ((HVX_Vector *) src0) + count / kElementsPerVector;
HVX_Vector * src1_vec_ptr = ((HVX_Vector *) src1);
HVX_Vector sum = Q6_V_vzero();
HVX_Vector sum = kZeroV;
{
HVX_Vector sum0 = Q6_V_vzero();
HVX_Vector sum1 = Q6_V_vzero();
if (src0_vec_ptr_end - src0_vec_ptr > 3) {
HVX_Vector sum2 = Q6_V_vzero();
HVX_Vector sum3 = Q6_V_vzero();
HVX_Vector sum0 = kZeroV;
HVX_Vector sum1 = kZeroV;
while (src0_vec_ptr_end - src0_vec_ptr > 3) {
HVX_VectorPair curr00 = reinterpret_cast<HVX_VectorPair *>(src0_vec_ptr)[0];
HVX_VectorPair curr10 = reinterpret_cast<HVX_VectorPair *>(src1_vec_ptr)[0];
do {
HVX_VectorPair curr00 = reinterpret_cast<HVX_VectorPair *>(src0_vec_ptr)[0];
HVX_VectorPair curr10 = reinterpret_cast<HVX_VectorPair *>(src1_vec_ptr)[0];
sum0 = _AddFunc(_MpyFunc(Q6_V_lo_W(curr00), Q6_V_lo_W(curr10)), sum0);
sum1 = _AddFunc(_MpyFunc(Q6_V_hi_W(curr00), Q6_V_hi_W(curr10)), sum1);
HVX_VectorPair curr01 = reinterpret_cast<HVX_VectorPair *>(src0_vec_ptr)[1];
HVX_VectorPair curr11 = reinterpret_cast<HVX_VectorPair *>(src1_vec_ptr)[1];
HVX_VectorPair curr01 = reinterpret_cast<HVX_VectorPair *>(src0_vec_ptr)[1];
HVX_VectorPair curr11 = reinterpret_cast<HVX_VectorPair *>(src1_vec_ptr)[1];
sum2 = _AddFunc(_MpyFunc(Q6_V_lo_W(curr01), Q6_V_lo_W(curr11)), sum2);
sum3 = _AddFunc(_MpyFunc(Q6_V_hi_W(curr01), Q6_V_hi_W(curr11)), sum3);
HVX_Vector mpy0 = _MpyFunc(Q6_V_lo_W(curr00), Q6_V_lo_W(curr10));
HVX_Vector mpy1 = _MpyFunc(Q6_V_hi_W(curr00), Q6_V_hi_W(curr10));
src0_vec_ptr += 4;
src1_vec_ptr += 4;
} while (src0_vec_ptr_end - src0_vec_ptr > 3);
HVX_Vector mpy2 = _MpyFunc(Q6_V_lo_W(curr01), Q6_V_lo_W(curr11));
HVX_Vector mpy3 = _MpyFunc(Q6_V_hi_W(curr01), Q6_V_hi_W(curr11));
sum0 = _AddFunc(sum2, sum0);
sum1 = _AddFunc(sum3, sum1);
}
sum0 = _AddFunc(mpy0, sum0);
sum1 = _AddFunc(mpy1, sum1);
sum0 = _AddFunc(mpy2, sum0);
sum1 = _AddFunc(mpy3, sum1);
src0_vec_ptr += 4;
src1_vec_ptr += 4;
};
if (src0_vec_ptr_end - src0_vec_ptr > 1) {
HVX_VectorPair curr0 = reinterpret_cast<HVX_VectorPair *>(src0_vec_ptr)[0];
@ -143,8 +153,11 @@ inline _TRet vec_dot_product_aligned_impl(const _TElem * src0, const _TElem * sr
src0_vec_ptr += 2;
src1_vec_ptr += 2;
sum0 = _AddFunc(_MpyFunc(Q6_V_lo_W(curr0), Q6_V_lo_W(curr1)), sum0);
sum1 = _AddFunc(_MpyFunc(Q6_V_hi_W(curr0), Q6_V_hi_W(curr1)), sum1);
HVX_Vector mpy0 = _MpyFunc(Q6_V_lo_W(curr0), Q6_V_lo_W(curr1));
HVX_Vector mpy1 = _MpyFunc(Q6_V_hi_W(curr0), Q6_V_hi_W(curr1));
sum0 = _AddFunc(mpy0, sum0);
sum1 = _AddFunc(mpy1, sum1);
}
sum = _AddFunc(sum0, sum1);
@ -195,6 +208,7 @@ inline _TRet vec_dot_product_mixed_impl(const _TElem0 * src0, const _TElem1 * sr
constexpr const __fp16 kOne = 1.0f;
const HVX_Vector kOneV = Q6_Vh_vsplat_R(reinterpret_cast<const uint16_t &>(kOne));
const HVX_Vector kZeroV = Q6_V_vzero();
const _TElem0 * const src0_ptr_end = src0 + count;
HVX_Vector * src0_vec_ptr = ((HVX_Vector *) src0);
@ -202,27 +216,33 @@ inline _TRet vec_dot_product_mixed_impl(const _TElem0 * src0, const _TElem1 * sr
HVX_Vector * const src1_vec_ptr_end = ((HVX_Vector *) src1) + count / kElementsPerVector1;
HVX_Vector prev0 = *src0_vec_ptr++;
HVX_Vector prev1 = *src1_vec_ptr++;
HVX_Vector sum = Q6_V_vzero();
HVX_Vector sum = kZeroV;
if (src1_vec_ptr_end - src1_vec_ptr > 1) {
HVX_Vector sum0 = Q6_V_vzero();
HVX_Vector sum1 = Q6_V_vzero();
HVX_Vector sum0 = kZeroV;
HVX_Vector sum1 = kZeroV;
do {
HVX_Vector curr0 = src0_vec_ptr[0];
HVX_VectorPair curr1 = reinterpret_cast<HVX_VectorPair *>(src1_vec_ptr)[0];
HVX_Vector curr0 = src0_vec_ptr[0];
HVX_Vector s0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0);
HVX_Vector_x2 s0_pair = _ExpandFunc(s0, kOneV);
HVX_Vector l1 = Q6_V_valign_VVR(Q6_V_lo_W(curr1), prev1, (size_t) src1);
sum0 = _AddFunc(_MpyFunc(s0_pair.first, l1), sum0);
HVX_Vector curr10 = src1_vec_ptr[0];
HVX_Vector curr11 = src1_vec_ptr[1];
HVX_Vector h1 = Q6_V_valign_VVR(Q6_V_hi_W(curr1), Q6_V_lo_W(curr1), (size_t) src1);
sum1 = _AddFunc(_MpyFunc(s0_pair.second, h1), sum1);
HVX_Vector l1 = Q6_V_valign_VVR(curr10, prev1, (size_t) src1);
HVX_Vector h1 = Q6_V_valign_VVR(curr11, curr10, (size_t) src1);
HVX_Vector mpy0 = _MpyFunc(s0_pair.val[0], l1);
HVX_Vector mpy1 = _MpyFunc(s0_pair.val[1], h1);
prev0 = curr0;
prev1 = Q6_V_hi_W(curr1);
prev1 = curr11;
sum0 = _AddFunc(mpy0, sum0);
sum1 = _AddFunc(mpy1, sum1);
src0_vec_ptr++;
src1_vec_ptr += 2;
} while (src1_vec_ptr_end - src1_vec_ptr > 1);
@ -245,8 +265,11 @@ inline _TRet vec_dot_product_mixed_impl(const _TElem0 * src0, const _TElem1 * sr
if (has_remaining_src1_vector) {
HVX_Vector curr1 = *src1_vec_ptr++;
HVX_Vector s1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1);
sum = _AddFunc(_MpyFunc(s0_pair.first, s1), sum);
prev1 = curr1;
HVX_Vector mpy0 = _MpyFunc(s0_pair.val[0], s1);
prev1 = curr1;
sum = _AddFunc(mpy0, sum);
}
bool should_fetch_src1 = leftover1 != 0 || !hexagon::is_addr_aligned(src1_vec_ptr);
@ -254,9 +277,11 @@ inline _TRet vec_dot_product_mixed_impl(const _TElem0 * src0, const _TElem1 * sr
src1_vec_ptr += should_fetch_src1 ? 1 : 0;
HVX_Vector s1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1);
prev0 = curr0;
prev1 = curr1;
sum = _AddFunc(_MpyFunc(has_remaining_src1_vector ? s0_pair.second : s0_pair.first, s1), sum);
HVX_Vector mpy1 = _MpyFunc(has_remaining_src1_vector ? s0_pair.val[1] : s0_pair.val[0], s1);
prev1 = curr1;
sum = _AddFunc(mpy1, sum);
}
if (leftover1 > 0) {
@ -274,8 +299,8 @@ inline _TRet vec_dot_product_mixed_impl(const _TElem0 * src0, const _TElem1 * sr
HVX_Vector_x2 curr0_pair = _ExpandFunc(curr0, kOneV);
curr0 = leftover1 == leftover0 ? curr0_pair.first : curr0_pair.second;
sum = _AddFunc(Q6_V_valign_VVR(_MpyFunc(curr0, curr1), Q6_V_vzero(), leftover_bytes1), sum);
curr0 = leftover1 == leftover0 ? curr0_pair.val[0] : curr0_pair.val[1];
sum = _AddFunc(Q6_V_valign_VVR(_MpyFunc(curr0, curr1), kZeroV, leftover_bytes1), sum);
}
return _ReduceFunc(sum);
@ -299,44 +324,55 @@ inline _TRet vec_dot_product_mix_aligned_impl(const _TElem0 * src0, const _TElem
constexpr const __fp16 kOne = 1.0f;
const HVX_Vector kOneV = Q6_Vh_vsplat_R(reinterpret_cast<const uint16_t &>(kOne));
const HVX_Vector kZeroV = Q6_V_vzero();
HVX_Vector * src0_vec_ptr = ((HVX_Vector *) src0);
HVX_Vector * src1_vec_ptr = ((HVX_Vector *) src1);
HVX_Vector * const src1_vec_ptr_end = ((HVX_Vector *) src1) + count / kElementsPerVector1;
HVX_Vector sum0 = Q6_V_vzero();
HVX_Vector sum1 = Q6_V_vzero();
HVX_Vector sum0 = kZeroV;
HVX_Vector sum1 = kZeroV;
if (src1_vec_ptr_end - src1_vec_ptr > 3) {
HVX_Vector sum2 = Q6_V_vzero();
HVX_Vector sum3 = Q6_V_vzero();
while (src1_vec_ptr_end - src1_vec_ptr > 3) {
HVX_Vector curr0_lo = src0_vec_ptr[0];
HVX_Vector curr10_lo = src1_vec_ptr[0];
do {
HVX_VectorPair curr0 = reinterpret_cast<HVX_VectorPair *>(src0_vec_ptr)[0];
HVX_Vector_x2 curr00 = _ExpandFunc(Q6_V_lo_W(curr0), kOneV);
HVX_VectorPair curr10 = reinterpret_cast<HVX_VectorPair *>(src1_vec_ptr)[0];
sum0 = _AddFunc(_MpyFunc(curr00.first, Q6_V_lo_W(curr10)), sum0);
sum1 = _AddFunc(_MpyFunc(curr00.second, Q6_V_hi_W(curr10)), sum1);
HVX_Vector curr0_hi = src0_vec_ptr[1];
HVX_Vector_x2 curr00 = _ExpandFunc(curr0_lo, kOneV);
HVX_Vector_x2 curr01 = _ExpandFunc(Q6_V_hi_W(curr0), kOneV);
HVX_VectorPair curr11 = reinterpret_cast<HVX_VectorPair *>(src1_vec_ptr)[1];
sum2 = _AddFunc(_MpyFunc(curr01.first, Q6_V_lo_W(curr11)), sum2);
sum3 = _AddFunc(_MpyFunc(curr01.second, Q6_V_hi_W(curr11)), sum3);
HVX_Vector curr10_hi = src1_vec_ptr[1];
HVX_Vector_x2 curr01 = _ExpandFunc(curr0_hi, kOneV);
src0_vec_ptr += 2;
src1_vec_ptr += 4;
} while (src1_vec_ptr_end - src1_vec_ptr > 3);
HVX_Vector mpy0 = _MpyFunc(curr00.val[0], curr10_lo);
HVX_Vector mpy1 = _MpyFunc(curr00.val[1], curr10_hi);
sum0 = _AddFunc(sum0, sum2);
sum1 = _AddFunc(sum1, sum3);
}
HVX_Vector curr11_lo = src1_vec_ptr[2];
HVX_Vector curr11_hi = src1_vec_ptr[3];
sum0 = _AddFunc(mpy0, sum0);
sum1 = _AddFunc(mpy1, sum1);
HVX_Vector mpy2 = _MpyFunc(curr01.val[0], curr11_lo);
HVX_Vector mpy3 = _MpyFunc(curr01.val[1], curr11_hi);
sum0 = _AddFunc(mpy2, sum0);
sum1 = _AddFunc(mpy3, sum1);
src0_vec_ptr += 2;
src1_vec_ptr += 4;
};
if (src1_vec_ptr_end - src1_vec_ptr > 1) {
HVX_Vector curr0 = src0_vec_ptr[0];
HVX_Vector_x2 s0_pair = _ExpandFunc(curr0, kOneV);
HVX_Vector curr0 = src0_vec_ptr[0];
HVX_Vector curr1_lo = src1_vec_ptr[0];
HVX_VectorPair curr1 = reinterpret_cast<HVX_VectorPair *>(src1_vec_ptr)[0];
sum0 = _AddFunc(_MpyFunc(s0_pair.first, Q6_V_lo_W(curr1)), sum0);
sum1 = _AddFunc(_MpyFunc(s0_pair.second, Q6_V_hi_W(curr1)), sum1);
HVX_Vector_x2 s0_pair = _ExpandFunc(curr0, kOneV);
HVX_Vector curr1_hi = src1_vec_ptr[1];
HVX_Vector mpy0 = _MpyFunc(s0_pair.val[0], curr1_lo);
HVX_Vector mpy1 = _MpyFunc(s0_pair.val[1], curr1_hi);
sum0 = _AddFunc(mpy0, sum0);
sum1 = _AddFunc(mpy1, sum1);
}
return _ReduceFunc(_AddFunc(sum0, sum1));
@ -360,14 +396,14 @@ inline void vec_scale_impl(const _TParam * src, float scale, _TParam * dst, size
HVX_VectorPair curr = reinterpret_cast<HVX_VectorPair *>(src_vec_ptr)[0];
src_vec_ptr += 2;
HVX_Vector lo = Q6_V_valign_VVR(Q6_V_lo_W(curr), prev, (size_t) src);
dst_vec_ptr[0] = _Func(lo, dst_vec_ptr, scale_vec);
HVX_Vector lo = Q6_V_valign_VVR(Q6_V_lo_W(curr), prev, (size_t) src);
HVX_Vector hi = Q6_V_valign_VVR(Q6_V_hi_W(curr), Q6_V_lo_W(curr), (size_t) src);
prev = Q6_V_hi_W(curr);
HVX_Vector hi = Q6_V_valign_VVR(Q6_V_hi_W(curr), Q6_V_lo_W(curr), (size_t) src);
dst_vec_ptr[0] = _Func(lo, dst_vec_ptr, scale_vec);
dst_vec_ptr[1] = _Func(hi, dst_vec_ptr + 1, scale_vec);
dst_vec_ptr += 2;
prev = Q6_V_hi_W(curr);
}
if (src_vec_end - src_vec_ptr > 0) {
@ -405,14 +441,16 @@ template <typename _TData> inline void vec_zero_impl(_TData * src, size_t count)
HVX_UVector * src_vec_ptr = ((HVX_UVector *) src);
HVX_UVector * const src_vec_end = ((HVX_UVector *) src) + (count / kElementsPerVector);
const HVX_Vector kZeroV = Q6_V_vzero();
while (src_vec_end - src_vec_ptr > 1) {
src_vec_ptr[0] = Q6_V_vzero();
src_vec_ptr[1] = Q6_V_vzero();
src_vec_ptr[0] = kZeroV;
src_vec_ptr[1] = kZeroV;
src_vec_ptr += 2;
}
if (src_vec_end - src_vec_ptr > 0) {
src_vec_ptr[0] = Q6_V_vzero();
src_vec_ptr[0] = kZeroV;
src_vec_ptr++;
}
@ -420,7 +458,7 @@ template <typename _TData> inline void vec_zero_impl(_TData * src, size_t count)
if (leftover > 0) {
// handle the leftover elements
const size_t leftover_bytes = leftover * sizeof(_TData);
q6op_vstu_variable_ARV(src_vec_ptr, leftover_bytes, Q6_V_vzero());
q6op_vstu_variable_ARV(src_vec_ptr, leftover_bytes, kZeroV);
}
}

View File

@ -90,13 +90,16 @@ bool host_graph::compute() {
return false;
}
LOG_DEBUG("[%p]host_graph::compute started\n", (void *) this);
SCOPED_PERFORMANCE_TRACKER("[hexagon-npu][%p]compute, handle(%p)", (void *) this, (void *) _graph_handle);
auto status = npu_device_graph_compute(_device_handle, _graph_handle);
if (status != AEE_SUCCESS) {
LOG_ERROR("Failed to compute host_graph: 0x%x\n", (int) status);
LOG_DEBUG("[%p]host_graph::compute finished with failure\n", (void *) this);
return false;
}
LOG_DEBUG("[%p]host_graph::compute finished\n", (void *) this);
return true;
}

View File

@ -242,6 +242,7 @@ bool npu_device::init_rpc_mem() {
bool npu_device::init_device_lib() {
if (!_device_handle) {
set_fast_rpc_stack_size(_rpc_interface, _dsp_domain_id, NPU_THREAD_STACK_SIZE);
auto arch = get_dsp_arch(_rpc_interface, _dsp_domain_id);
const auto & device_lib_info = get_device_library_info(arch);
std::string device_lib_uri = device_lib_info.device_lib_uri;

View File

@ -1,14 +1,14 @@
#pragma once
#include <list>
#include <type_traits>
#include <vector>
#include "common.hpp"
#include "ggml-impl.h"
#include "hexagon_npu.h"
#include "util.hpp"
#include <list>
#include <type_traits>
#include <vector>
namespace hexagon {
// TODO: merge this with device tensor?
@ -62,7 +62,7 @@ class host_tensor {
~host_tensor() {
LOG_DEBUG("host_tensor(%p) destroy, device_tensor_handle: %p\n", (void *) this, (void *) _device_tensor_handle);
if (_device_tensor_handle) {
if (_device_tensor_handle != npu_device_INVALID_DEVICE_TENSOR_HANDLE) {
npu_device_tensor_free(_device_handle, _device_tensor_handle);
// TODO: figure out why the _ggml_tensor is invalid here
}
@ -113,8 +113,11 @@ class host_tensor {
if (memcmp(_info_update.params, _ggml_tensor->op_params, sizeof(_info_update.params)) != 0) {
params_changed = true;
memcpy(_info_update.params, _ggml_tensor->op_params, sizeof(_info_update.params));
LOG_DEBUG("host_tensor(%p) op_params changed: [%x, %x, %x, %x]\n", (void *) this,
(int) _info_update.params[0], (int) _info_update.params[1], (int) _info_update.params[2],
LOG_DEBUG("host_tensor(%p) op_params changed: [%x, %x, %x, %x]\n",
(void *) this,
(int) _info_update.params[0],
(int) _info_update.params[1],
(int) _info_update.params[2],
(int) _info_update.params[3]);
}
@ -136,19 +139,29 @@ class host_tensor {
if (memcmp(_info_update.src_handles, src_tensor_handles, sizeof(_info_update.src_handles)) != 0) {
params_changed = true;
memcpy(_info_update.src_handles, src_tensor_handles, sizeof(_info_update.src_handles));
LOG_DEBUG("host_tensor(%p) src changed, handles: [%p, %p]\n", (void *) this,
(void *) _info_update.src_handles[0], (void *) _info_update.src_handles[1]);
LOG_DEBUG("host_tensor(%p) src changed, handles: [%p, %p]\n",
(void *) this,
(void *) _info_update.src_handles[0],
(void *) _info_update.src_handles[1]);
}
if (params_changed) {
npu_device_tensor_update_params(_device_handle, _device_tensor_handle, &_info_update);
LOG_DEBUG("host_tensor(%p) update_params, op: %s, params: [%x, %x, %x, %x]\n", (void *) this,
ggml_op_desc(_ggml_tensor), (int) _info_update.params[0], (int) _info_update.params[1],
(int) _info_update.params[2], (int) _info_update.params[3]);
LOG_DEBUG("host_tensor(%p) update_params, op: %s, params: [%x, %x, %x, %x]\n",
(void *) this,
ggml_op_desc(_ggml_tensor),
(int) _info_update.params[0],
(int) _info_update.params[1],
(int) _info_update.params[2],
(int) _info_update.params[3]);
} else {
LOG_DEBUG("host_tensor(%p) update_params, no changes, op: %s, params: [%x, %x, %x, %x]\n", (void *) this,
ggml_op_desc(_ggml_tensor), (int) _info_update.params[0], (int) _info_update.params[1],
(int) _info_update.params[2], (int) _info_update.params[3]);
LOG_DEBUG("host_tensor(%p) update_params, no changes, op: %s, params: [%x, %x, %x, %x]\n",
(void *) this,
ggml_op_desc(_ggml_tensor),
(int) _info_update.params[0],
(int) _info_update.params[1],
(int) _info_update.params[2],
(int) _info_update.params[3]);
}
}
@ -174,9 +187,13 @@ class host_tensor {
#endif
}
LOG_DEBUG("host_tensor(%p) update_params, op: %s, params: [%x, %x, %x, %x]\n", (void *) this,
ggml_op_desc(_ggml_tensor), (int) _info_update.params[0], (int) _info_update.params[1],
(int) _info_update.params[2], (int) _info_update.params[3]);
LOG_DEBUG("host_tensor(%p) update_params, op: %s, params: [%x, %x, %x, %x]\n",
(void *) this,
ggml_op_desc(_ggml_tensor),
(int) _info_update.params[0],
(int) _info_update.params[1],
(int) _info_update.params[2],
(int) _info_update.params[3]);
return _info_update;
}
@ -192,11 +209,21 @@ class host_tensor {
}
int get_desc(char * buffer, size_t size) const {
return snprintf(buffer, size, "%s[%ldx%ldx%ldx%ld], nb[%ld,%ld,%ld,%ld], %s, addr: %p, ggml: %p, handle:%p",
_ggml_tensor->name, (long) _ggml_tensor->ne[0], (long) _ggml_tensor->ne[1],
(long) _ggml_tensor->ne[2], (long) _ggml_tensor->ne[3], (long) _ggml_tensor->nb[0],
(long) _ggml_tensor->nb[1], (long) _ggml_tensor->nb[2], (long) _ggml_tensor->nb[3],
ggml_type_name(_ggml_tensor->type), (void *) this, (void *) _ggml_tensor,
return snprintf(buffer,
size,
"%s[%ldx%ldx%ldx%ld], nb[%ld,%ld,%ld,%ld], %s, addr: %p, ggml: %p, handle:%p",
_ggml_tensor->name,
(long) _ggml_tensor->ne[0],
(long) _ggml_tensor->ne[1],
(long) _ggml_tensor->ne[2],
(long) _ggml_tensor->ne[3],
(long) _ggml_tensor->nb[0],
(long) _ggml_tensor->nb[1],
(long) _ggml_tensor->nb[2],
(long) _ggml_tensor->nb[3],
ggml_type_name(_ggml_tensor->type),
(void *) this,
(void *) _ggml_tensor,
(void *) _device_tensor_handle);
}

View File

@ -149,6 +149,23 @@ void enable_unsigned_dsp_module(common::rpc_interface_ptr rpc_interface, uint32_
}
}
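// Requests a larger stack for the FastRPC worker threads of this DSP session.
// prio = -1 is assumed to leave the thread priority at its default (a hedged reading of
// the remote_session_control thread-params request, not something stated in this diff).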
void set_fast_rpc_stack_size(common::rpc_interface_ptr rpc_interface, uint32_t domain_id, uint32_t stack_size) {
constexpr const uint32_t FASTRPC_THREAD_PARAMS = 1;
if (!rpc_interface || !rpc_interface->is_valid()) {
return;
}
remote_rpc_thread_params tp = {};
tp.domain = domain_id;
tp.prio = -1;
tp.stack_size = stack_size;
auto ret = rpc_interface->remote_session_control(FASTRPC_THREAD_PARAMS, &tp, sizeof(tp));
if (ret != AEE_SUCCESS) {
LOG_ERROR("failed to set fast RPC stack size: 0x%x\n", ret);
}
}
void get_op_tensor_desc(const ggml_tensor * dst, char * out, size_t max_len) {
if (dst == nullptr) {
snprintf(out, max_len, "null");
@ -161,15 +178,30 @@ void get_op_tensor_desc(const ggml_tensor * dst, char * out, size_t max_len) {
switch (dims) {
default:
case 4:
snprintf(out, max_len, "%s[%ldx%ldx%ldx%ld]", ggml_type_name(tensor->type), (long) tensor->ne[0],
(long) tensor->ne[1], (long) tensor->ne[2], (long) tensor->ne[3]);
snprintf(out,
max_len,
"%s[%ldx%ldx%ldx%ld]",
ggml_type_name(tensor->type),
(long) tensor->ne[0],
(long) tensor->ne[1],
(long) tensor->ne[2],
(long) tensor->ne[3]);
break;
case 3:
snprintf(out, max_len, "%s[%ldx%ldx%ld]", ggml_type_name(tensor->type), (long) tensor->ne[0],
(long) tensor->ne[1], (long) tensor->ne[2]);
snprintf(out,
max_len,
"%s[%ldx%ldx%ld]",
ggml_type_name(tensor->type),
(long) tensor->ne[0],
(long) tensor->ne[1],
(long) tensor->ne[2]);
break;
case 2:
snprintf(out, max_len, "%s[%ldx%ld]", ggml_type_name(tensor->type), (long) tensor->ne[0],
snprintf(out,
max_len,
"%s[%ldx%ld]",
ggml_type_name(tensor->type),
(long) tensor->ne[0],
(long) tensor->ne[1]);
break;
case 1:
@ -201,8 +233,14 @@ void get_op_tensor_desc(const ggml_tensor * dst, char * out, size_t max_len) {
print_tensor(dst->src[2], src2_desc, sizeof(src2_desc));
char src3_desc[256];
print_tensor(dst->src[3], src3_desc, sizeof(src3_desc));
snprintf(out, max_len, "dst: %s, src0: %s, src1: %s, src2: %s, src3: %s", dst_desc, src0_desc,
src1_desc, src2_desc, src3_desc);
snprintf(out,
max_len,
"dst: %s, src0: %s, src1: %s, src2: %s, src3: %s",
dst_desc,
src0_desc,
src1_desc,
src2_desc,
src3_desc);
return;
}
case 3:
@ -213,8 +251,8 @@ void get_op_tensor_desc(const ggml_tensor * dst, char * out, size_t max_len) {
print_tensor(dst->src[1], src1_desc, sizeof(src1_desc));
char src2_desc[256];
print_tensor(dst->src[2], src2_desc, sizeof(src2_desc));
snprintf(out, max_len, "dst: %s, src0: %s, src1: %s, src2: %s", dst_desc, src0_desc, src1_desc,
src2_desc);
snprintf(
out, max_len, "dst: %s, src0: %s, src1: %s, src2: %s", dst_desc, src0_desc, src1_desc, src2_desc);
return;
}
case 2:

View File

@ -23,6 +23,7 @@ hexagon_dsp_arch get_dsp_arch(common::rpc_interface_ptr rpc_interface, uint32_t
const char * get_dsp_arch_desc(hexagon_dsp_arch arch);
void enable_unsigned_dsp_module(common::rpc_interface_ptr rpc_interface, uint32_t domain_id);
void set_fast_rpc_stack_size(common::rpc_interface_ptr rpc_interface, uint32_t domain_id, uint32_t stack_size);
void get_op_tensor_desc(const ggml_tensor * dst, char * out, size_t max_len);

View File

@ -13,6 +13,8 @@ const uint32_t NPU_ROPE_TYPE_NEOX = 2;
const uint32_t NPU_ROPE_TYPE_MROPE = 8;
const uint32_t NPU_ROPE_TYPE_VISION = 24;
const uint32_t NPU_THREAD_STACK_SIZE = 64 * 1024;
interface npu_device : remote_handle64{
typedef int64_t ne_type[DEVICE_TENSOR_MAX_DIMS];