hexagon: general DMA and Binary Op fixes for large strides (#20918)

* hex-dma: make chained dma the default to handle newer models

This also includes some new instrumentation that we can remove later.

* hexagon: add uint32 dump helper

* hexagon: use single-page VTCM allocation to avoid issues with large gather ops in ssm-conv

ssm-conv uses the HVX gather instruction, which cannot handle cases where base+offset
spans a page boundary.
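For illustration, a minimal guard for that constraint could look like the sketch below; the function and page parameters are assumptions for the example, not part of this change. With a single-page VTCM allocation the whole scratch region is one mapping, so the condition holds for any in-bounds offset.

#include <stdbool.h>
#include <stdint.h>

// Hypothetical check: every gathered element (base + offset) must stay
// inside the same page mapping as the base address.
static bool gather_region_in_page(uint32_t base, uint32_t max_offset,
                                  uint32_t page_base, uint32_t page_size) {
    return base >= page_base && (base + max_offset) < (page_base + page_size);
}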

* hexagon: update ssm-conv to make base-addr compute a bit easier to read

* hex-dma: use 1d mode for reshaping; it supports 24-bit sizes (up to 16 MB)
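As a sketch of why the 1d path covers reshaping (dma_queue_push_single_1d is the helper this change introduces; the wrapper below is illustrative): a contiguous transfer where row_size == src_stride == dst_stride collapses into a single descriptor as long as the total byte count fits in the 24-bit size field.

// Illustrative wrapper around the helper added in this change.
static bool push_contig(dma_queue * q, dma_ptr dptr, size_t row_size, size_t nrows) {
    size_t total = row_size * nrows;          // e.g. 4096 B * 2048 rows = 8 MB
    if (total > (1u << 24) - 1) return false; // 24-bit size field: max 16 MB
    return dma_queue_push_single_1d(q, dptr, total);
}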

* hex-bin: fix incorrect stride logic

* hexagon: make sure repack buffs are dumped for verbose > 2

* hex-bin: consistently use dma_queue_push even for dummy dst transactions

* hex-dma: start using 2d-wide mode on v75 and up

This removes the need to deal with the 16-bit limitation on the strides.
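For example, a 70,000-byte row stride overflows the pre-v75 16-bit stride fields (max 65,535) and forces the row-by-row 1d fallback, while the v75 2d-wide descriptor holds strides up to 16,777,215 bytes in its 24-bit fields and handles the same transfer with a single descriptor.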

* hex-bin: cleanup kernel selection logic

* hex-bin: cleanup binary op core and fix transposed tensor handling

* snapdragon: update run-bench to use larger ubatch and fa-on
Max Krasnyansky 2026-03-23 15:33:49 -07:00 committed by GitHub
parent 1fb2290a51
commit 7cadbfce10
10 changed files with 369 additions and 330 deletions

View File

@ -461,7 +461,7 @@ static void repack_row_q4x4x2(uint8_t * y, const block_q4_0 * x, int64_t k) {
d[7] = x[i * 8 + 7].d;
}
if (opt_verbose > 1) {
if (opt_verbose > 2) {
for (int i = 0; i < nb; i++) {
dump_packed_block_q4x4x2(y, i, k);
}
@ -480,7 +480,7 @@ static void unpack_row_q4x4x2(block_q4_0 * x, const uint8_t * y, int64_t k) {
const uint8_t * y_q = y + 0; // quants first
const uint8_t * y_d = y + qrow_size; // then scales
if (opt_verbose > 1) {
if (opt_verbose > 2) {
for (int i = 0; i < nb; i++) {
dump_packed_block_q4x4x2(y, i, k);
}
@ -796,7 +796,7 @@ static void repack_row_q8x4x2(uint8_t * y, const block_q8_0 * x, int64_t k) {
d[7] = x[i * 8 + 7].d;
}
if (opt_verbose > 1) {
if (opt_verbose > 2) {
for (int i = 0; i < nb; i++) {
dump_packed_block_q8x4x2(y, i, k);
}
@ -814,7 +814,7 @@ static void unpack_row_q8x4x2(block_q8_0 * x, const uint8_t * y, int64_t k) {
const uint8_t * y_q = y + 0; // quants first
const uint8_t * y_d = y + qrow_size; // then scales
if (opt_verbose > 1) {
if (opt_verbose > 2) {
for (int i = 0; i < nb; i++) {
dump_packed_block_q8x4x2(y, i, k);
}
@ -1149,7 +1149,7 @@ static void repack_row_mxfp4x4x2(uint8_t * y, const block_mxfp4 * x, int64_t k)
e[7] = x[i * 8 + 7].e;
}
if (opt_verbose > 1) {
if (opt_verbose > 2) {
for (int i = 0; i < nb; i++) {
dump_packed_block_mxfp4x4x2(y, i, k);
}
@ -1168,7 +1168,7 @@ static void unpack_row_mxfp4x4x2(block_mxfp4 * x, const uint8_t * y, int64_t k)
const uint8_t * y_q = y + 0; // quants first
const uint8_t * y_e = y + qrow_size; // then scales
if (opt_verbose > 1) {
if (opt_verbose > 2) {
for (int i = 0; i < nb; i++) {
dump_packed_block_mxfp4x4x2(y, i, k);
}

View File

@ -24,28 +24,26 @@
// Context for binary operations
struct htp_binary_context {
struct htp_ops_context * octx;
struct fastdiv_values dim1_div;
struct fastdiv_values dim2_div;
struct fastdiv_values dim12_div;
struct fastdiv_values src0_dim1_div; // ne01
struct fastdiv_values src0_dim2_div; // ne02
struct fastdiv_values src0_dim12_div;// ne01*ne02
struct fastdiv_values src1_dim1_div; // ne11
struct fastdiv_values src1_dim2_div; // ne12
struct fastdiv_values src1_dim3_div; // ne13
uint32_t nrows_per_thread;
bool split_at_ne01;
bool split_at_ne02;
// Precomputed values
uint32_t block_max;
uint32_t nrows_per_thread;
size_t src0_row_size_aligned;
size_t src1_row_size_aligned;
size_t dst_row_size_aligned;
uint32_t src1_fetch_rows; // 1 or block_max
uint32_t src1_dma_stride; // 0 or stride
bool split_at_ne01;
bool split_at_ne02;
};
#define htp_binary_preamble \
#define htp_binary_preamble \
const struct htp_tensor * src0 = &octx->src0; \
const struct htp_tensor * src1 = &octx->src1; \
struct htp_tensor * dst = &octx->dst; \
@ -72,12 +70,11 @@ struct htp_binary_context {
const uint32_t nb2 = dst->nb[2]; \
const uint32_t nb3 = dst->nb[3];
static inline uint32_t calc_block_size(struct htp_binary_context * bctx, uint32_t ir, uint32_t end_row,
uint32_t ne01, uint32_t ne02) {
static inline uint32_t calc_block_size(struct htp_binary_context * bctx, uint32_t ir, uint32_t end_row, uint32_t ne01, uint32_t ne02) {
uint32_t i03, i02, i01, rem;
i03 = fastdiv(ir, &bctx->dim12_div);
i03 = fastdiv(ir, &bctx->src0_dim12_div);
rem = ir - i03 * (ne02 * ne01);
i02 = fastdiv(rem, &bctx->dim1_div);
i02 = fastdiv(rem, &bctx->src0_dim1_div);
i01 = rem - i02 * ne01;
uint32_t rows_left = end_row - ir;
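This flat-index decomposition recurs throughout the file. For reference, the plain-division equivalent of what the precomputed fastdiv multiply+shift replaces (the helper name here is hypothetical):

// Reference version of the index math used in these kernels.
static inline void row_to_coords(uint32_t ir, uint32_t ne01, uint32_t ne02,
                                 uint32_t * i01, uint32_t * i02, uint32_t * i03) {
    *i03 = ir / (ne02 * ne01);                 // outermost dim
    uint32_t rem = ir - *i03 * (ne02 * ne01);
    *i02 = rem / ne01;
    *i01 = rem - *i02 * ne01;                  // row within the i02 plane
}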
@ -191,6 +188,8 @@ static void binary_job_scalar(unsigned int nth, unsigned int ith, void * data) {
const uint32_t end_row = MIN(start_row + bctx->nrows_per_thread, total_rows);
if (start_row >= end_row) return;
FARF(HIGH, "binary-scalar: %d/%d (%u:%u) row-size %u (%u)", ith, nth, start_row, end_row, nb01, bctx->dst_row_size_aligned);
uint8_t * src0_spad_base = octx->src0_spad.data + (ith * octx->src0_spad.size_per_thread);
uint8_t * dst_spad_base = octx->dst_spad.data + (ith * octx->dst_spad.size_per_thread);
size_t src0_spad_half = octx->src0_spad.size_per_thread / 2;
@ -204,9 +203,9 @@ static void binary_job_scalar(unsigned int nth, unsigned int ith, void * data) {
for (int k = 0; k < 2 && ir_prefetch < end_row; k++) {
uint32_t current_block_size = calc_block_size(bctx, ir_prefetch, end_row, ne01, ne02);
uint32_t i03, i02, i01, rem;
i03 = fastdiv(ir_prefetch, &bctx->dim12_div);
i03 = fastdiv(ir_prefetch, &bctx->src0_dim12_div);
rem = ir_prefetch - i03 * (ne02 * ne01);
i02 = fastdiv(rem, &bctx->dim1_div);
i02 = fastdiv(rem, &bctx->src0_dim1_div);
i01 = rem - i02 * ne01;
uint8_t * src0_curr = (uint8_t *)src0->data + i03 * nb03 + i02 * nb02 + i01 * nb01;
@ -215,7 +214,7 @@ static void binary_job_scalar(unsigned int nth, unsigned int ith, void * data) {
uint8_t * s0_spad = src0_spad_base + spad_idx * src0_spad_half;
uint8_t * d_spad = dst_spad_base + spad_idx * dst_spad_half;
dma_queue_push_vtcm_to_ddr(q, dma_make_ptr(dst_curr, d_spad), nb1, bctx->dst_row_size_aligned, 0);
dma_queue_push(q, dma_make_ptr(dst_curr, d_spad), nb1, bctx->dst_row_size_aligned, row_size_bytes, 0);
dma_queue_push(q, dma_make_ptr(s0_spad, src0_curr), bctx->src0_row_size_aligned, nb01, row_size_bytes, current_block_size);
ir_prefetch += current_block_size;
spad_idx ^= 1;
@ -229,9 +228,9 @@ static void binary_job_scalar(unsigned int nth, unsigned int ith, void * data) {
uint8_t * s0_spad = (uint8_t *) dma_queue_pop(q).dst;
uint32_t i03, i02, i01, rem;
i03 = fastdiv(ir, &bctx->dim12_div);
i03 = fastdiv(ir, &bctx->src0_dim12_div);
rem = ir - i03 * (ne02 * ne01);
i02 = fastdiv(rem, &bctx->dim1_div);
i02 = fastdiv(rem, &bctx->src0_dim1_div);
i01 = rem - i02 * ne01;
// src1 indices (broadcast/repeat)
@ -255,9 +254,9 @@ static void binary_job_scalar(unsigned int nth, unsigned int ith, void * data) {
if (ir_prefetch < end_row) {
uint32_t next_block_size = calc_block_size(bctx, ir_prefetch, end_row, ne01, ne02);
uint32_t p03, p02, p01, prem;
p03 = fastdiv(ir_prefetch, &bctx->dim12_div);
p03 = fastdiv(ir_prefetch, &bctx->src0_dim12_div);
prem = ir_prefetch - p03 * (ne02 * ne01);
p02 = fastdiv(prem, &bctx->dim1_div);
p02 = fastdiv(prem, &bctx->src0_dim1_div);
p01 = prem - p02 * ne01;
uint8_t * s0_next = (uint8_t *)src0->data + p03 * nb03 + p02 * nb02 + p01 * nb01;
@ -282,6 +281,8 @@ static void binary_job_vector_same_shape(unsigned int nth, unsigned int ith, voi
const uint32_t end_row = MIN(start_row + bctx->nrows_per_thread, total_rows);
if (start_row >= end_row) return;
FARF(HIGH, "binary-same-shape: %d/%d (%u:%u) row-size %u (%u)", ith, nth, start_row, end_row, nb01, bctx->dst_row_size_aligned);
uint8_t * src0_spad_base = octx->src0_spad.data + (ith * octx->src0_spad.size_per_thread);
uint8_t * src1_spad_base = octx->src1_spad.data + (ith * octx->src1_spad.size_per_thread);
uint8_t * dst_spad_base = octx->dst_spad.data + (ith * octx->dst_spad.size_per_thread);
@ -297,9 +298,9 @@ static void binary_job_vector_same_shape(unsigned int nth, unsigned int ith, voi
for (int k = 0; k < 2 && ir_prefetch < end_row; k++) {
uint32_t current_block_size = calc_block_size(bctx, ir_prefetch, end_row, ne01, ne02);
uint32_t i03, i02, i01, rem;
i03 = fastdiv(ir_prefetch, &bctx->dim12_div);
i03 = fastdiv(ir_prefetch, &bctx->src0_dim12_div);
rem = ir_prefetch - i03 * (ne02 * ne01);
i02 = fastdiv(rem, &bctx->dim1_div);
i02 = fastdiv(rem, &bctx->src0_dim1_div);
i01 = rem - i02 * ne01;
uint32_t i13 = (ne13 == 1) ? 0 : i03;
@ -307,23 +308,23 @@ static void binary_job_vector_same_shape(unsigned int nth, unsigned int ith, voi
uint32_t i11 = (ne11 == 1) ? 0 : i01;
uint8_t * src0_curr = (uint8_t *)src0->data + i03 * nb03 + i02 * nb02 + i01 * nb01;
uint8_t * src1_base = (uint8_t *)src1->data + i13 * nb13 + i12 * nb12 + i11 * nb11;
uint8_t * src1_curr = (uint8_t *)src1->data + i13 * nb13 + i12 * nb12 + i11 * nb11;
uint8_t * dst_curr = (uint8_t *)dst->data + i03 * nb3 + i02 * nb2 + i01 * nb1;
uint8_t * s0_spad = src0_spad_base + spad_idx * src0_spad_half;
uint8_t * s1_spad = src1_spad_base + spad_idx * src1_spad_half;
uint8_t * d_spad = dst_spad_base + spad_idx * dst_spad_half;
dma_queue_push_vtcm_to_ddr(q, dma_make_ptr(dst_curr, d_spad), nb1, bctx->dst_row_size_aligned, 0);
dma_queue_push(q, dma_make_ptr(dst_curr, d_spad), nb1, bctx->dst_row_size_aligned, row_size_bytes, 0);
dma_queue_push(q, dma_make_ptr(s0_spad, src0_curr), bctx->src0_row_size_aligned, nb01, row_size_bytes, current_block_size);
dma_queue_push(q, dma_make_ptr(s1_spad, src1_base), bctx->src1_row_size_aligned, bctx->src1_dma_stride, row_size_bytes, current_block_size);
dma_queue_push(q, dma_make_ptr(s1_spad, src1_curr), bctx->src1_row_size_aligned, nb11, row_size_bytes, current_block_size);
ir_prefetch += current_block_size;
spad_idx ^= 1;
}
for (uint32_t ir = start_row; ir < end_row; ) {
uint32_t current_block_size = calc_block_size(bctx, ir, end_row, ne01, ne02);
uint8_t * d_spad = (uint8_t *) dma_queue_pop(q).src;
uint8_t * d_spad = (uint8_t *) dma_queue_pop(q).src;
uint8_t * s0_spad = (uint8_t *) dma_queue_pop(q).dst;
uint8_t * s1_spad = (uint8_t *) dma_queue_pop(q).dst;
@ -335,9 +336,9 @@ static void binary_job_vector_same_shape(unsigned int nth, unsigned int ith, voi
}
uint32_t i03, i02, i01, rem;
i03 = fastdiv(ir, &bctx->dim12_div);
i03 = fastdiv(ir, &bctx->src0_dim12_div);
rem = ir - i03 * (ne02 * ne01);
i02 = fastdiv(rem, &bctx->dim1_div);
i02 = fastdiv(rem, &bctx->src0_dim1_div);
i01 = rem - i02 * ne01;
uint8_t * dst_curr = (uint8_t *)dst->data + i03 * nb3 + i02 * nb2 + i01 * nb1;
dma_queue_push(q, dma_make_ptr(dst_curr, d_spad), nb1, bctx->dst_row_size_aligned, row_size_bytes, current_block_size);
@ -345,9 +346,9 @@ static void binary_job_vector_same_shape(unsigned int nth, unsigned int ith, voi
if (ir_prefetch < end_row) {
uint32_t next_block_size = calc_block_size(bctx, ir_prefetch, end_row, ne01, ne02);
uint32_t p03, p02, p01, prem;
p03 = fastdiv(ir_prefetch, &bctx->dim12_div);
p03 = fastdiv(ir_prefetch, &bctx->src0_dim12_div);
prem = ir_prefetch - p03 * (ne02 * ne01);
p02 = fastdiv(prem, &bctx->dim1_div);
p02 = fastdiv(prem, &bctx->src0_dim1_div);
p01 = prem - p02 * ne01;
uint32_t p13 = (ne13 == 1) ? 0 : p03;
@ -358,7 +359,7 @@ static void binary_job_vector_same_shape(unsigned int nth, unsigned int ith, voi
uint8_t * s1_next = (uint8_t *)src1->data + p13 * nb13 + p12 * nb12 + p11 * nb11;
dma_queue_push(q, dma_make_ptr(s0_spad, s0_next), bctx->src0_row_size_aligned, nb01, row_size_bytes, next_block_size);
dma_queue_push(q, dma_make_ptr(s1_spad, s1_next), bctx->src1_row_size_aligned, bctx->src1_dma_stride, row_size_bytes, next_block_size);
dma_queue_push(q, dma_make_ptr(s1_spad, s1_next), bctx->src1_row_size_aligned, nb11, row_size_bytes, next_block_size);
ir_prefetch += next_block_size;
}
@ -373,15 +374,17 @@ static void binary_job_vector_row_broadcast(unsigned int nth, unsigned int ith,
struct htp_ops_context * octx = bctx->octx;
htp_binary_preamble;
const uint32_t src0_type = octx->src0.type;
const uint32_t src0_type = octx->src0.type;
const uint32_t row_size_bytes = (src0_type == HTP_TYPE_F32) ? ne00 * sizeof(float) : ne00 * sizeof(_Float16);
const uint32_t total_rows = ne01 * ne02 * ne03;
const uint32_t start_row = bctx->nrows_per_thread * ith;
const uint32_t end_row = MIN(start_row + bctx->nrows_per_thread, total_rows);
const uint32_t start_row = bctx->nrows_per_thread * ith;
const uint32_t end_row = MIN(start_row + bctx->nrows_per_thread, total_rows);
if (start_row >= end_row) return;
FARF(HIGH, "binary-row-bcast: %d/%d (%u:%u) row-size %u (%u)", ith, nth, start_row, end_row, nb01, bctx->dst_row_size_aligned);
uint8_t * src0_spad_base = octx->src0_spad.data + (ith * octx->src0_spad.size_per_thread);
uint8_t * src1_spad = octx->src1_spad.data + (ith * octx->src1_spad.size_per_thread);
uint8_t * src1_spad_base = octx->src1_spad.data + (ith * octx->src1_spad.size_per_thread);
uint8_t * dst_spad_base = octx->dst_spad.data + (ith * octx->dst_spad.size_per_thread);
size_t src0_spad_half = octx->src0_spad.size_per_thread / 2;
@ -391,15 +394,14 @@ static void binary_job_vector_row_broadcast(unsigned int nth, unsigned int ith,
uint32_t ir_prefetch = start_row;
int spad_idx = 0;
void * s1_ptr = (void *) src1_spad;
void * s1_ptr = (void *) src1_spad_base;
for (int k = 0; k < 2 && ir_prefetch < end_row; k++) {
uint32_t current_block_size = calc_block_size(bctx, ir_prefetch, end_row, ne01, ne02);
uint32_t i03, i02, i01, rem;
i03 = fastdiv(ir_prefetch, &bctx->dim12_div);
rem = ir_prefetch - i03 * (ne02 * ne01);
i02 = fastdiv(rem, &bctx->dim1_div);
i01 = rem - i02 * ne01;
uint32_t i03 = fastdiv(ir_prefetch, &bctx->src0_dim12_div);
uint32_t rem = ir_prefetch - i03 * (ne02 * ne01);
uint32_t i02 = fastdiv(rem, &bctx->src0_dim1_div);
uint32_t i01 = rem - i02 * ne01;
uint8_t * src0_curr = (uint8_t *)src0->data + i03 * nb03 + i02 * nb02 + i01 * nb01;
uint8_t * dst_curr = (uint8_t *)dst->data + i03 * nb3 + i02 * nb2 + i01 * nb1;
@ -407,7 +409,7 @@ static void binary_job_vector_row_broadcast(unsigned int nth, unsigned int ith,
uint8_t * s0_spad = src0_spad_base + spad_idx * src0_spad_half;
uint8_t * d_spad = dst_spad_base + spad_idx * dst_spad_half;
dma_queue_push_vtcm_to_ddr(q, dma_make_ptr(dst_curr, d_spad), nb1, bctx->dst_row_size_aligned, 0);
dma_queue_push(q, dma_make_ptr(dst_curr, d_spad), nb1, bctx->dst_row_size_aligned, row_size_bytes, 0);
dma_queue_push(q, dma_make_ptr(s0_spad, src0_curr), bctx->src0_row_size_aligned, nb01, row_size_bytes, current_block_size);
ir_prefetch += current_block_size;
spad_idx ^= 1;
@ -415,7 +417,7 @@ static void binary_job_vector_row_broadcast(unsigned int nth, unsigned int ith,
for (uint32_t ir = start_row; ir < end_row; ) {
uint32_t current_block_size = calc_block_size(bctx, ir, end_row, ne01, ne02);
uint8_t * d_spad = (uint8_t *) dma_queue_pop(q).src;
uint8_t * d_spad = (uint8_t *) dma_queue_pop(q).src;
uint8_t * s0_spad = (uint8_t *) dma_queue_pop(q).dst;
for (uint32_t r = 0; r < current_block_size; r++) {
@ -425,21 +427,19 @@ static void binary_job_vector_row_broadcast(unsigned int nth, unsigned int ith,
COMPUTE_VECTOR_OP_AAA(r_dst, r_src0, r_src1, src0_type, ne00);
}
uint32_t i03, i02, i01, rem;
i03 = fastdiv(ir, &bctx->dim12_div);
rem = ir - i03 * (ne02 * ne01);
i02 = fastdiv(rem, &bctx->dim1_div);
i01 = rem - i02 * ne01;
uint32_t i03 = fastdiv(ir, &bctx->src0_dim12_div);
uint32_t rem = ir - i03 * (ne02 * ne01);
uint32_t i02 = fastdiv(rem, &bctx->src0_dim1_div);
uint32_t i01 = rem - i02 * ne01;
uint8_t * dst_curr = (uint8_t *)dst->data + i03 * nb3 + i02 * nb2 + i01 * nb1;
dma_queue_push(q, dma_make_ptr(dst_curr, d_spad), nb1, bctx->dst_row_size_aligned, row_size_bytes, current_block_size);
if (ir_prefetch < end_row) {
uint32_t next_block_size = calc_block_size(bctx, ir_prefetch, end_row, ne01, ne02);
uint32_t p03, p02, p01, prem;
p03 = fastdiv(ir_prefetch, &bctx->dim12_div);
prem = ir_prefetch - p03 * (ne02 * ne01);
p02 = fastdiv(prem, &bctx->dim1_div);
p01 = prem - p02 * ne01;
uint32_t p03 = fastdiv(ir_prefetch, &bctx->src0_dim12_div);
uint32_t prem = ir_prefetch - p03 * (ne02 * ne01);
uint32_t p02 = fastdiv(prem, &bctx->src0_dim1_div);
uint32_t p01 = prem - p02 * ne01;
uint8_t * s0_next = (uint8_t *)src0->data + p03 * nb03 + p02 * nb02 + p01 * nb01;
dma_queue_push(q, dma_make_ptr(s0_spad, s0_next), bctx->src0_row_size_aligned, nb01, row_size_bytes, next_block_size);
ir_prefetch += next_block_size;
@ -458,14 +458,16 @@ static void binary_job_vector_complex(unsigned int nth, unsigned int ith, void *
const uint32_t src0_type = octx->src0.type;
const uint32_t row_size_bytes = (src0_type == HTP_TYPE_F32) ? ne00 * sizeof(float) : ne00 * sizeof(_Float16);
const uint32_t total_rows = ne01 * ne02 * ne03;
const uint32_t start_row = bctx->nrows_per_thread * ith;
const uint32_t end_row = MIN(start_row + bctx->nrows_per_thread, total_rows);
const uint32_t start_row = bctx->nrows_per_thread * ith;
const uint32_t end_row = MIN(start_row + bctx->nrows_per_thread, total_rows);
if (start_row >= end_row) return;
FARF(HIGH, "binary-complex: %d/%d (%u:%u) row-size %u (%u)", ith, nth, start_row, end_row, nb01, bctx->dst_row_size_aligned);
uint8_t * src0_spad_base = octx->src0_spad.data + (ith * octx->src0_spad.size_per_thread);
uint8_t * dst_spad_base = octx->dst_spad.data + (ith * octx->dst_spad.size_per_thread);
size_t src0_spad_half = octx->src0_spad.size_per_thread / 2;
size_t dst_spad_half = octx->dst_spad.size_per_thread / 2;
size_t src0_spad_half = octx->src0_spad.size_per_thread / 2;
size_t dst_spad_half = octx->dst_spad.size_per_thread / 2;
dma_queue * q = octx->ctx->dma[ith];
uint32_t ir_prefetch = start_row;
@ -473,11 +475,10 @@ static void binary_job_vector_complex(unsigned int nth, unsigned int ith, void *
for (int k = 0; k < 2 && ir_prefetch < end_row; k++) {
uint32_t current_block_size = calc_block_size(bctx, ir_prefetch, end_row, ne01, ne02);
uint32_t i03, i02, i01, rem;
i03 = fastdiv(ir_prefetch, &bctx->dim12_div);
rem = ir_prefetch - i03 * (ne02 * ne01);
i02 = fastdiv(rem, &bctx->dim1_div);
i01 = rem - i02 * ne01;
uint32_t i03 = fastdiv(ir_prefetch, &bctx->src0_dim12_div);
uint32_t rem = ir_prefetch - i03 * (ne02 * ne01);
uint32_t i02 = fastdiv(rem, &bctx->src0_dim1_div);
uint32_t i01 = rem - i02 * ne01;
uint8_t * src0_curr = (uint8_t *)src0->data + i03 * nb03 + i02 * nb02 + i01 * nb01;
uint8_t * dst_curr = (uint8_t *)dst->data + i03 * nb3 + i02 * nb2 + i01 * nb1;
@ -485,7 +486,7 @@ static void binary_job_vector_complex(unsigned int nth, unsigned int ith, void *
uint8_t * s0_spad = src0_spad_base + spad_idx * src0_spad_half;
uint8_t * d_spad = dst_spad_base + spad_idx * dst_spad_half;
dma_queue_push_vtcm_to_ddr(q, dma_make_ptr(dst_curr, d_spad), nb1, bctx->dst_row_size_aligned, 0);
dma_queue_push(q, dma_make_ptr(dst_curr, d_spad), nb1, bctx->dst_row_size_aligned, row_size_bytes, 0);
dma_queue_push(q, dma_make_ptr(s0_spad, src0_curr), bctx->src0_row_size_aligned, nb01, row_size_bytes, current_block_size);
ir_prefetch += current_block_size;
spad_idx ^= 1;
@ -496,11 +497,10 @@ static void binary_job_vector_complex(unsigned int nth, unsigned int ith, void *
uint8_t * d_spad = (uint8_t *) dma_queue_pop(q).src;
uint8_t * s0_spad = (uint8_t *) dma_queue_pop(q).dst;
uint32_t i03, i02, i01, rem;
i03 = fastdiv(ir, &bctx->dim12_div);
rem = ir - i03 * (ne02 * ne01);
i02 = fastdiv(rem, &bctx->dim1_div);
i01 = rem - i02 * ne01;
uint32_t i03 = fastdiv(ir, &bctx->src0_dim12_div);
uint32_t rem = ir - i03 * (ne02 * ne01);
uint32_t i02 = fastdiv(rem, &bctx->src0_dim1_div);
uint32_t i01 = rem - i02 * ne01;
for (uint32_t r = 0; r < current_block_size; r++) {
uint32_t r_i01 = i01 + r;
@ -521,11 +521,10 @@ static void binary_job_vector_complex(unsigned int nth, unsigned int ith, void *
if (ir_prefetch < end_row) {
uint32_t next_block_size = calc_block_size(bctx, ir_prefetch, end_row, ne01, ne02);
uint32_t p03, p02, p01, prem;
p03 = fastdiv(ir_prefetch, &bctx->dim12_div);
prem = ir_prefetch - p03 * (ne02 * ne01);
p02 = fastdiv(prem, &bctx->dim1_div);
p01 = prem - p02 * ne01;
uint32_t p03 = fastdiv(ir_prefetch, &bctx->src0_dim12_div);
uint32_t prem = ir_prefetch - p03 * (ne02 * ne01);
uint32_t p02 = fastdiv(prem, &bctx->src0_dim1_div);
uint32_t p01 = prem - p02 * ne01;
uint8_t * s0_next = (uint8_t *)src0->data + p03 * nb03 + p02 * nb02 + p01 * nb01;
dma_queue_push(q, dma_make_ptr(s0_spad, s0_next), bctx->src0_row_size_aligned, nb01, row_size_bytes, next_block_size);
ir_prefetch += next_block_size;
@ -545,14 +544,16 @@ static void binary_job_element_repeat(unsigned int nth, unsigned int ith, void *
const uint32_t elem_size_bytes = (src0_type == HTP_TYPE_F32) ? sizeof(float) : sizeof(_Float16);
const uint32_t row_size_bytes = ne00 * elem_size_bytes;
const uint32_t total_rows = ne01 * ne02 * ne03;
const uint32_t start_row = bctx->nrows_per_thread * ith;
const uint32_t end_row = MIN(start_row + bctx->nrows_per_thread, total_rows);
const uint32_t start_row = bctx->nrows_per_thread * ith;
const uint32_t end_row = MIN(start_row + bctx->nrows_per_thread, total_rows);
if (start_row >= end_row) return;
uint8_t * src0_spad_base = octx->src0_spad.data + (ith * octx->src0_spad.size_per_thread);
uint8_t * dst_spad_base = octx->dst_spad.data + (ith * octx->dst_spad.size_per_thread);
size_t src0_spad_half = octx->src0_spad.size_per_thread / 2;
size_t dst_spad_half = octx->dst_spad.size_per_thread / 2;
size_t src0_spad_half = octx->src0_spad.size_per_thread / 2;
size_t dst_spad_half = octx->dst_spad.size_per_thread / 2;
FARF(HIGH, "binary-repeat: %d/%d (%u:%u) row-size %u (%u)", ith, nth, start_row, end_row, nb01, bctx->dst_row_size_aligned);
dma_queue * q = octx->ctx->dma[ith];
uint32_t ir_prefetch = start_row;
@ -560,11 +561,10 @@ static void binary_job_element_repeat(unsigned int nth, unsigned int ith, void *
for (int k = 0; k < 2 && ir_prefetch < end_row; k++) {
uint32_t current_block_size = calc_block_size(bctx, ir_prefetch, end_row, ne01, ne02);
uint32_t i03, i02, i01, rem;
i03 = fastdiv(ir_prefetch, &bctx->dim12_div);
rem = ir_prefetch - i03 * (ne02 * ne01);
i02 = fastdiv(rem, &bctx->dim1_div);
i01 = rem - i02 * ne01;
uint32_t i03 = fastdiv(ir_prefetch, &bctx->src0_dim12_div);
uint32_t rem = ir_prefetch - i03 * (ne02 * ne01);
uint32_t i02 = fastdiv(rem, &bctx->src0_dim1_div);
uint32_t i01 = rem - i02 * ne01;
uint8_t * src0_curr = (uint8_t *)src0->data + i03 * nb03 + i02 * nb02 + i01 * nb01;
uint8_t * dst_curr = (uint8_t *)dst->data + i03 * nb3 + i02 * nb2 + i01 * nb1;
@ -572,7 +572,7 @@ static void binary_job_element_repeat(unsigned int nth, unsigned int ith, void *
uint8_t * s0_spad = src0_spad_base + spad_idx * src0_spad_half;
uint8_t * d_spad = dst_spad_base + spad_idx * dst_spad_half;
dma_queue_push_vtcm_to_ddr(q, dma_make_ptr(dst_curr, d_spad), nb1, bctx->dst_row_size_aligned, 0);
dma_queue_push(q, dma_make_ptr(dst_curr, d_spad), nb1, bctx->dst_row_size_aligned, row_size_bytes, 0);
dma_queue_push(q, dma_make_ptr(s0_spad, src0_curr), bctx->src0_row_size_aligned, nb01, row_size_bytes, current_block_size);
ir_prefetch += current_block_size;
spad_idx ^= 1;
@ -583,11 +583,10 @@ static void binary_job_element_repeat(unsigned int nth, unsigned int ith, void *
uint8_t * d_spad = (uint8_t *) dma_queue_pop(q).src;
uint8_t * s0_spad = (uint8_t *) dma_queue_pop(q).dst;
uint32_t i03, i02, i01, rem;
i03 = fastdiv(ir, &bctx->dim12_div);
rem = ir - i03 * (ne02 * ne01);
i02 = fastdiv(rem, &bctx->dim1_div);
i01 = rem - i02 * ne01;
uint32_t i03 = fastdiv(ir, &bctx->src0_dim12_div);
uint32_t rem = ir - i03 * (ne02 * ne01);
uint32_t i02 = fastdiv(rem, &bctx->src0_dim1_div);
uint32_t i01 = rem - i02 * ne01;
for (uint32_t r = 0; r < current_block_size; r++) {
uint32_t r_i01 = i01 + r;
@ -612,11 +611,10 @@ static void binary_job_element_repeat(unsigned int nth, unsigned int ith, void *
if (ir_prefetch < end_row) {
uint32_t next_block_size = calc_block_size(bctx, ir_prefetch, end_row, ne01, ne02);
uint32_t p03, p02, p01, prem;
p03 = fastdiv(ir_prefetch, &bctx->dim12_div);
prem = ir_prefetch - p03 * (ne02 * ne01);
p02 = fastdiv(prem, &bctx->dim1_div);
p01 = prem - p02 * ne01;
uint32_t p03 = fastdiv(ir_prefetch, &bctx->src0_dim12_div);
uint32_t prem = ir_prefetch - p03 * (ne02 * ne01);
uint32_t p02 = fastdiv(prem, &bctx->src0_dim1_div);
uint32_t p01 = prem - p02 * ne01;
uint8_t * s0_next = (uint8_t *)src0->data + p03 * nb03 + p02 * nb02 + p01 * nb01;
dma_queue_push(q, dma_make_ptr(s0_spad, s0_next), bctx->src0_row_size_aligned, nb01, row_size_bytes, next_block_size);
ir_prefetch += next_block_size;
@ -646,6 +644,7 @@ static void binary_job_add_id(unsigned int nth, unsigned int ith, void * data) {
const uint32_t nb02 = src0->nb[2];
const uint32_t nb03 = src0->nb[3];
const uint32_t nb11 = src1->nb[1]; // src1 row stride
const uint32_t nb1 = dst->nb[1];
const uint32_t nb2 = dst->nb[2];
const uint32_t nb3 = dst->nb[3];
@ -657,8 +656,8 @@ static void binary_job_add_id(unsigned int nth, unsigned int ith, void * data) {
uint8_t * src0_spad_base = octx->src0_spad.data + (ith * octx->src0_spad.size_per_thread);
uint8_t * dst_spad_base = octx->dst_spad.data + (ith * octx->dst_spad.size_per_thread);
size_t src0_spad_half = octx->src0_spad.size_per_thread / 2;
size_t dst_spad_half = octx->dst_spad.size_per_thread / 2;
size_t src0_spad_half = octx->src0_spad.size_per_thread / 2;
size_t dst_spad_half = octx->dst_spad.size_per_thread / 2;
dma_queue * q = octx->ctx->dma[ith];
uint32_t ir_prefetch = start_row;
@ -666,11 +665,10 @@ static void binary_job_add_id(unsigned int nth, unsigned int ith, void * data) {
for (int k = 0; k < 2 && ir_prefetch < end_row; k++) {
uint32_t current_block_size = calc_block_size(bctx, ir_prefetch, end_row, ne01, ne02);
uint32_t i03, i02, i01, rem;
i03 = fastdiv(ir_prefetch, &bctx->dim12_div);
rem = ir_prefetch - i03 * (ne02 * ne01);
i02 = fastdiv(rem, &bctx->dim1_div);
i01 = rem - i02 * ne01;
uint32_t i03 = fastdiv(ir_prefetch, &bctx->src0_dim12_div);
uint32_t rem = ir_prefetch - i03 * (ne02 * ne01);
uint32_t i02 = fastdiv(rem, &bctx->src0_dim1_div);
uint32_t i01 = rem - i02 * ne01;
uint8_t * src0_curr = (uint8_t *)src0->data + i03 * nb03 + i02 * nb02 + i01 * nb01;
uint8_t * dst_curr = (uint8_t *)dst->data + i03 * nb3 + i02 * nb2 + i01 * nb1;
@ -678,7 +676,7 @@ static void binary_job_add_id(unsigned int nth, unsigned int ith, void * data) {
uint8_t * s0_spad = src0_spad_base + spad_idx * src0_spad_half;
uint8_t * d_spad = dst_spad_base + spad_idx * dst_spad_half;
dma_queue_push_vtcm_to_ddr(q, dma_make_ptr(dst_curr, d_spad), nb1, bctx->dst_row_size_aligned, 0);
dma_queue_push(q, dma_make_ptr(dst_curr, d_spad), nb1, bctx->dst_row_size_aligned, ne00 * sizeof(float), 0);
dma_queue_push(q, dma_make_ptr(s0_spad, src0_curr), bctx->src0_row_size_aligned, nb01, ne00 * sizeof(float), current_block_size);
ir_prefetch += current_block_size;
spad_idx ^= 1;
@ -689,11 +687,10 @@ static void binary_job_add_id(unsigned int nth, unsigned int ith, void * data) {
uint8_t * d_spad = (uint8_t *) dma_queue_pop(q).src;
uint8_t * s0_spad = (uint8_t *) dma_queue_pop(q).dst;
uint32_t i03, i02, i01, rem;
i03 = fastdiv(ir, &bctx->dim12_div);
rem = ir - i03 * (ne02 * ne01);
i02 = fastdiv(rem, &bctx->dim1_div);
i01 = rem - i02 * ne01;
uint32_t i03 = fastdiv(ir, &bctx->src0_dim12_div);
uint32_t rem = ir - i03 * (ne02 * ne01);
uint32_t i02 = fastdiv(rem, &bctx->src0_dim1_div);
uint32_t i01 = rem - i02 * ne01;
for (uint32_t r = 0; r < current_block_size; r++) {
uint32_t r_i01 = i01 + r; // linear within block since we split at ne01
@ -712,11 +709,10 @@ static void binary_job_add_id(unsigned int nth, unsigned int ith, void * data) {
if (ir_prefetch < end_row) {
uint32_t next_block_size = calc_block_size(bctx, ir_prefetch, end_row, ne01, ne02);
uint32_t p03, p02, p01, prem;
p03 = fastdiv(ir_prefetch, &bctx->dim12_div);
prem = ir_prefetch - p03 * (ne02 * ne01);
p02 = fastdiv(prem, &bctx->dim1_div);
p01 = prem - p02 * ne01;
uint32_t p03 = fastdiv(ir_prefetch, &bctx->src0_dim12_div);
uint32_t prem = ir_prefetch - p03 * (ne02 * ne01);
uint32_t p02 = fastdiv(prem, &bctx->src0_dim1_div);
uint32_t p01 = prem - p02 * ne01;
uint8_t * s0_next = (uint8_t *)src0->data + p03 * nb03 + p02 * nb02 + p01 * nb01;
dma_queue_push(q, dma_make_ptr(s0_spad, s0_next), bctx->src0_row_size_aligned, nb01, ne00 * sizeof(float), next_block_size);
ir_prefetch += next_block_size;
@ -739,40 +735,36 @@ static int execute_op_binary(struct htp_ops_context * octx) {
const size_t elem_size = (src0_type == HTP_TYPE_F32) ? sizeof(float) : sizeof(_Float16);
const size_t src0_row_size = src0->ne[0] * elem_size;
const size_t src1_row_size = src1->ne[0] * elem_size;
const size_t dst_row_size = dst->ne[0] * elem_size;
const size_t dst_row_size = dst->ne[0] * elem_size;
// Align to VLEN
const size_t src0_row_size_aligned = hex_round_up(src0_row_size, VLEN);
const size_t dst_row_size_aligned = hex_round_up(dst_row_size, VLEN);
size_t src0_row_size_aligned = hex_round_up(src0_row_size, VLEN);
size_t src1_row_size_aligned = hex_round_up(src1_row_size, VLEN);
size_t dst_row_size_aligned = hex_round_up(dst_row_size, VLEN);
bool is_add_id = (octx->op == HTP_OP_ADD_ID);
bool is_scalar = !is_add_id && (src1->ne[0] == 1);
// Determine which kernel we will use to alloc memory and dispatch
bool use_vector_same = !is_add_id && !is_scalar && ((src0->nb[1] % VLEN) == 0) && (src1->ne[0] == src0->ne[0]) &&
bool is_transposed = (src0->nb[1] < src0_row_size || src1->nb[1] < src1_row_size || dst->nb[1] < dst_row_size);
bool is_same_shape = !is_add_id && !is_scalar && !is_transposed &&
(src1->ne[0] == src0->ne[0] && src0->ne[0] % VLEN == 0) &&
(src1->ne[1] == src0->ne[1] || src1->ne[1] == 1) &&
(src1->ne[2] == src0->ne[2] || src1->ne[2] == 1) &&
(src1->ne[3] == src0->ne[3] || src1->ne[3] == 1);
bool is_row_bcast = use_vector_same && (src1->ne[1] == 1 && src1->ne[2] == 1 && src1->ne[3] == 1);
bool use_complex = !is_add_id && !is_scalar && !use_vector_same && (src1->ne[0] == src0->ne[0]);
bool use_repeat = !is_add_id && !is_scalar && !use_vector_same && (src1->ne[0] != src0->ne[0]);
bool is_row_bcast = is_same_shape && (src1->ne[1] == 1 && src1->ne[2] == 1 && src1->ne[3] == 1);
bool is_complex = !is_add_id && !is_scalar && !is_same_shape && (src1->ne[0] == src0->ne[0]);
bool is_repeat = !is_add_id && !is_scalar && !is_same_shape && (src1->ne[0] != src0->ne[0]);
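// Worked example (illustrative, not from this change): src0 = [4096,32,1,1] F32
// with contiguous strides and src1 = [4096,1,1,1]. Then is_scalar is false
// (src1->ne[0] != 1), is_transposed is false, is_same_shape holds (the ne[0]
// values match and 4096 % VLEN == 0), and is_row_bcast holds as well, so
// binary_job_vector_row_broadcast is dispatched with one static src1 row in VTCM.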
size_t spad_row_total;
if (is_scalar) {
spad_row_total = 2 * (src0_row_size_aligned + dst_row_size_aligned);
} else if (is_row_bcast) {
spad_row_total = 2 * (src0_row_size_aligned + dst_row_size_aligned);
} else if (use_vector_same) {
if (is_same_shape) {
spad_row_total = 2 * (src0_row_size_aligned + src1_row_size_aligned + dst_row_size_aligned);
} else if (is_add_id) {
spad_row_total = 2 * (src0_row_size_aligned + dst_row_size_aligned); // src1 read directly
} else {
spad_row_total = 2 * (src0_row_size_aligned + dst_row_size_aligned);
}
size_t rows_per_buffer = octx->ctx->vtcm_size / (n_threads * spad_row_total);
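// Worked example (illustrative): vtcm_size = 8 MB, n_threads = 4, F32 rows
// with ne[0] = 4096, so each VLEN-aligned row is 16 KB. For the same-shape
// kernel spad_row_total = 2 * (16K + 16K + 16K) = 96 KB and
// rows_per_buffer = 8 MB / (4 * 96 KB) = 21 rows per scratchpad half.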
// Adjust for static src1 in row_bcast case
if (is_row_bcast) {
size_t needed_static = src1_row_size_aligned;
@ -782,28 +774,26 @@ static int execute_op_binary(struct htp_ops_context * octx) {
}
if (rows_per_buffer < 1) {
FARF(ERROR, "binary: VTCM too small\n");
return HTP_STATUS_VTCM_TOO_SMALL;
FARF(ERROR, "binary: VTCM too small\n");
return HTP_STATUS_VTCM_TOO_SMALL;
}
octx->src0_spad.size_per_thread = rows_per_buffer * 2 * src0_row_size_aligned;
octx->dst_spad.size_per_thread = rows_per_buffer * 2 * dst_row_size_aligned;
if (is_scalar || use_complex || use_repeat || is_add_id) {
octx->src1_spad.size_per_thread = 0;
} else if (is_row_bcast) {
if (is_add_id || is_scalar || is_complex || is_repeat || is_row_bcast) {
octx->src1_spad.size_per_thread = 0;
} else {
octx->src1_spad.size_per_thread = rows_per_buffer * 2 * src1_row_size_aligned;
}
octx->dst_spad.size = n_threads * octx->dst_spad.size_per_thread;
octx->src0_spad.size = n_threads * octx->src0_spad.size_per_thread;
if (is_row_bcast) {
octx->src1_spad.size = src1_row_size_aligned;
} else {
octx->src1_spad.size = n_threads * octx->src1_spad.size_per_thread;
}
octx->dst_spad.size = n_threads * octx->dst_spad.size_per_thread;
if (octx->ctx->vtcm_size < (octx->src0_spad.size + octx->src1_spad.size + octx->dst_spad.size)) {
return HTP_STATUS_VTCM_TOO_SMALL;
@ -823,46 +813,37 @@ static int execute_op_binary(struct htp_ops_context * octx) {
}
struct htp_binary_context bctx;
bctx.octx = octx;
bctx.nrows_per_thread = (src0_nrows + n_threads - 1) / n_threads;
bctx.block_max = rows_per_buffer;
bctx.octx = octx;
bctx.nrows_per_thread = (src0_nrows + n_threads - 1) / n_threads;
bctx.block_max = rows_per_buffer;
bctx.src0_row_size_aligned = src0_row_size_aligned;
bctx.src1_row_size_aligned = src1_row_size_aligned;
bctx.dst_row_size_aligned = dst_row_size_aligned;
bctx.dim1_div = init_fastdiv_values(src0->ne[1]);
bctx.dim2_div = init_fastdiv_values(src0->ne[2]);
bctx.dim12_div = init_fastdiv_values(src0->ne[1] * src0->ne[2]);
bctx.src0_dim1_div = init_fastdiv_values(src0->ne[1]);
bctx.src0_dim2_div = init_fastdiv_values(src0->ne[2]);
bctx.src0_dim12_div = init_fastdiv_values(src0->ne[1] * src0->ne[2]);
bctx.src1_dim1_div = init_fastdiv_values(src1->ne[1]);
bctx.src1_dim2_div = init_fastdiv_values(src1->ne[2]);
bctx.src1_dim3_div = init_fastdiv_values(src1->ne[3]);
bctx.src1_dim1_div = init_fastdiv_values(src1->ne[1]);
bctx.src1_dim2_div = init_fastdiv_values(src1->ne[2]);
bctx.src1_dim3_div = init_fastdiv_values(src1->ne[3]);
bool src0_contig_dim1 = (src0->nb[2] == src0->ne[1] * src0->nb[1]);
bool dst_contig_dim1 = (dst->nb[2] == src0->ne[1] * dst->nb[1]);
bool dst_contig_dim1 = (dst->nb[2] == src0->ne[1] * dst->nb[1]);
bool src0_contig_dim2 = (src0->nb[3] == src0->ne[2] * src0->nb[2]);
bool dst_contig_dim2 = (dst->nb[3] == src0->ne[2] * dst->nb[2]);
bool dst_contig_dim2 = (dst->nb[3] == src0->ne[2] * dst->nb[2]);
bctx.split_at_ne01 = (src0->ne[2] > 1) &&
((src1->ne[1] > 1) || (src1->ne[2] > 1) || !src0_contig_dim1 || !dst_contig_dim1);
bctx.split_at_ne02 = (src0->ne[3] > 1) &&
((src1->ne[2] > 1) || (src1->ne[3] > 1) || !src0_contig_dim2 || !dst_contig_dim2);
// Precompute specific kernel parameters
if (use_vector_same) {
bctx.src1_dma_stride = (src1->ne[1] == 1) ? 0 : src1->nb[1];
bctx.src1_fetch_rows = (src1->ne[1] == 1) ? 1 : rows_per_buffer;
}
bctx.split_at_ne01 = (src0->ne[2] > 1) && ((src1->ne[1] > 1) || (src1->ne[2] > 1) || !src0_contig_dim1 || !dst_contig_dim1);
bctx.split_at_ne02 = (src0->ne[3] > 1) && ((src1->ne[2] > 1) || (src1->ne[3] > 1) || !src0_contig_dim2 || !dst_contig_dim2);
worker_callback_t worker_func;
if (is_add_id) worker_func = binary_job_add_id;
else if (is_scalar) worker_func = binary_job_scalar;
else if (is_row_bcast) worker_func = binary_job_vector_row_broadcast;
else if (use_vector_same) worker_func = binary_job_vector_same_shape;
else if (use_complex) worker_func = binary_job_vector_complex;
else worker_func = binary_job_element_repeat;
if (is_add_id) worker_func = binary_job_add_id;
else if (is_scalar) worker_func = binary_job_scalar;
else if (is_row_bcast) worker_func = binary_job_vector_row_broadcast;
else if (is_same_shape) worker_func = binary_job_vector_same_shape;
else if (is_complex) worker_func = binary_job_vector_complex;
else worker_func = binary_job_element_repeat;
if (is_row_bcast) {
dma_queue_pop(q);

View File

@ -31,8 +31,8 @@ dma_queue * dma_queue_create(size_t capacity) {
q->capacity = capacity;
q->idx_mask = capacity - 1;
q->desc = (hexagon_udma_descriptor_type1_t *) memalign(64, capacity * sizeof(hexagon_udma_descriptor_type1_t));
memset(q->desc, 0, capacity * sizeof(hexagon_udma_descriptor_type1_t));
q->desc = (dma_descriptor_2d *) memalign(64, capacity * sizeof(dma_descriptor_2d));
memset(q->desc, 0, capacity * sizeof(dma_descriptor_2d));
q->dptr = (dma_ptr *) memalign(4, capacity * sizeof(dma_ptr));
memset(q->dptr, 0, capacity * sizeof(dma_ptr));

View File

@ -10,19 +10,84 @@
extern "C" {
#endif
// Define the HW descriptor structs here since the ones in HexSDK are a bit out of date
typedef struct dma_descriptor_1d_s {
void * next;
uint32_t size:24;
uint32_t desc_size:2;
uint32_t dst_comp:1;
uint32_t src_comp:1;
uint32_t dst_bypass:1;
uint32_t src_bypass:1;
uint32_t order:1;
uint32_t done:1;
void * src;
void * dst;
} dma_descriptor_1d;
#if __HVX_ARCH__ < 75
typedef struct dma_descriptor_2d_s {
void * next;
uint32_t reserved0:24;
uint32_t desc_size:2;
uint32_t dst_comp:1;
uint32_t src_comp:1;
uint32_t dst_bypass:1;
uint32_t src_bypass:1;
uint32_t order:1;
uint32_t done:1;
void * src;
void * dst;
uint32_t desc_type:8;
uint32_t reserved1:24;
uint32_t row_size:16;
uint32_t nrows:16;
uint32_t src_stride:16;
uint32_t dst_stride:16;
uint32_t src_offset:16;
uint32_t dst_offset:16;
} dma_descriptor_2d;
#else
typedef struct dma_descriptor_2d_s {
void * next;
uint32_t dst_stride:24;
uint32_t desc_size:2;
uint32_t dst_comp:1;
uint32_t src_comp:1;
uint32_t dst_bypass:1;
uint32_t src_bypass:1;
uint32_t order:1;
uint32_t done:1;
void * src;
void * dst;
uint32_t desc_type:8;
uint32_t reserved0:24;
uint32_t row_size:24;
uint32_t nrows_lo:8;
uint32_t nrows_hi:8;
uint32_t src_stride:24;
uint32_t offset:24;
uint32_t reserved1:8;
} dma_descriptor_2d;
#endif
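// To make the v75 packing concrete, a sketch (illustrative helper, not part
// of this change; assumes the v75 layout above): nrows is split across two
// 8-bit fields and stays 16-bit, while row_size and the strides gained
// 24-bit ranges.
static inline void pack_wide_nrows(dma_descriptor_2d * d, uint32_t nrows) {
    d->desc_type = 9;             // 2d (24-bit) mode
    d->nrows_lo  = nrows & 0xff;  // e.g. 300 & 0xff = 44
    d->nrows_hi  = nrows >> 8;    // e.g. 300 >> 8  = 1
}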
typedef struct {
void *dst;
void *dst;
const void *src;
} dma_ptr;
typedef struct {
hexagon_udma_descriptor_type1_t * desc; // descriptor pointers
hexagon_udma_descriptor_type1_t * tail; // tail pointer
dma_ptr * dptr; // dst/src pointers
uint32_t push_idx;
uint32_t pop_idx;
uint32_t capacity;
uint32_t idx_mask;
dma_descriptor_2d * desc; // descriptor pointers
dma_descriptor_2d * tail; // tail pointer
dma_ptr * dptr; // dst/src pointers
uint32_t push_idx;
uint32_t pop_idx;
uint32_t capacity;
uint32_t idx_mask;
} dma_queue;
dma_queue * dma_queue_create(size_t capacity);
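The push/pop indices form a power-of-two ring: idx_mask = capacity - 1 only works as a wrap mask when capacity is a power of two, which the 64 and 128 values used in this change satisfy. A sketch of the occupancy checks (helper names hypothetical):

// The ring is full when advancing push_idx would land on pop_idx,
// so it holds at most capacity - 1 in-flight descriptors.
static inline bool dma_ring_full(uint32_t push_idx, uint32_t pop_idx, uint32_t idx_mask) {
    return ((push_idx + 1) & idx_mask) == pop_idx;
}
static inline bool dma_ring_empty(uint32_t push_idx, uint32_t pop_idx) {
    return push_idx == pop_idx;
}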
@ -59,71 +124,87 @@ static inline dma_ptr dma_make_ptr(void *dst, const void *src)
return p;
}
static inline bool dma_queue_push(dma_queue * q,
dma_ptr dptr,
size_t dst_row_size,
size_t src_row_size,
size_t width, // width in bytes. number of bytes to transfer per row
size_t nrows) {
#if __HVX_ARCH__ < 73
static const uint32_t dma_src_l2_bypass_on = 1;
static const uint32_t dma_dst_l2_bypass_on = 0;
#else
static const uint32_t dma_src_l2_bypass_on = 1;
static const uint32_t dma_dst_l2_bypass_on = 1;
#endif
static inline bool dma_queue_push_single_1d(dma_queue * q, dma_ptr dptr, size_t size) {
if (((q->push_idx + 1) & q->idx_mask) == q->pop_idx) {
FARF(ERROR, "dma-push: queue full\n");
FARF(HIGH, "dma-push: queue full\n");
return false;
}
hexagon_udma_descriptor_type1_t * desc = &q->desc[q->push_idx];
dma_descriptor_1d * desc = (dma_descriptor_1d *) &q->desc[q->push_idx];
desc->next = NULL;
desc->desc_size = 0; // 1D mode
desc->src_bypass = dma_src_l2_bypass_on;
desc->dst_bypass = dma_dst_l2_bypass_on;
desc->order = 1;
desc->done = 0;
desc->src = (void *) dptr.src;
desc->dst = (void *) dptr.dst;
desc->size = size;
q->dptr[q->push_idx] = dptr;
dmlink(q->tail, desc);
q->tail = (dma_descriptor_2d *) desc;
// FARF(ERROR, "dma-push: i %u row-size %u nrows %d dst %p src %p\n", q->push_idx, row_size, nrows, dptr.dst, dptr.src);
q->push_idx = (q->push_idx + 1) & q->idx_mask;
return true;
}
static inline bool dma_queue_push_single_2d(dma_queue * q, dma_ptr dptr, size_t dst_stride, size_t src_stride, size_t row_size, size_t nrows) {
if (((q->push_idx + 1) & q->idx_mask) == q->pop_idx) {
FARF(HIGH, "dma-push: queue full\n");
return false;
}
dma_descriptor_2d * desc = &q->desc[q->push_idx];
desc->next = NULL;
desc->length = 0;
desc->desctype = HEXAGON_UDMA_DESC_DESCTYPE_TYPE1;
desc->dstbypass = 1;
desc->srcbypass = 1;
#if __HVX_ARCH__ >= 73
desc->dstbypass = 1;
desc->srcbypass = 1;
#else
desc->dstbypass = 0;
desc->srcbypass = 1;
#endif
desc->order = 0;
desc->dstate = HEXAGON_UDMA_DESC_DSTATE_INCOMPLETE;
desc->reserved0 = 0;
desc->reserved1 = 0;
desc->desc_size = 1; // 2d mode
desc->src_bypass = dma_src_l2_bypass_on;
desc->dst_bypass = dma_dst_l2_bypass_on;
desc->src_comp = 0;
desc->dst_comp = 0;
desc->order = 1;
desc->done = 0;
desc->src_stride = src_stride;
desc->dst_stride = dst_stride;
desc->src = (void *) dptr.src;
desc->dst = (void *) dptr.dst;
desc->allocation = 0;
desc->padding = 0;
desc->roiwidth = width;
desc->roiheight = nrows;
desc->srcstride = src_row_size;
desc->dststride = dst_row_size;
desc->srcwidthoffset = 0;
desc->dstwidthoffset = 0;
desc->row_size = row_size;
#if __HVX_ARCH__ < 75
desc->desc_type = 0; // 2d (16-bit) mode
desc->nrows = nrows;
desc->src_offset = 0;
desc->dst_offset = 0;
#else
desc->desc_type = 9; // 2d (24-bit) mode
desc->nrows_lo = (nrows & 0xff);
desc->nrows_hi = (nrows >> 8);
desc->offset = 0;
#endif
q->dptr[q->push_idx] = dptr;
dmlink(q->tail, desc);
q->tail = desc;
// FARF(ERROR, "dma-push: i %u width %u nrows %d dst %p src %p\n", q->push_idx, width, nrows, dptr.dst, dptr.src);
// FARF(ERROR, "dma-push: i %u row-size %u nrows %d dst %p src %p\n", q->push_idx, row_size, nrows, dptr.dst, dptr.src);
q->push_idx = (q->push_idx + 1) & q->idx_mask;
return true;
}
static inline bool dma_queue_push_ddr_to_vtcm(dma_queue * q,
dma_ptr dptr,
size_t dst_row_size,
size_t src_row_size,
size_t nrows) {
return dma_queue_push(q, dptr, dst_row_size, src_row_size, src_row_size, nrows);
}
static inline bool dma_queue_push_vtcm_to_ddr(dma_queue * q,
dma_ptr dptr,
size_t dst_row_size,
size_t src_row_size,
size_t nrows) {
return dma_queue_push(q, dptr, dst_row_size, src_row_size, dst_row_size, nrows);
}
static inline dma_ptr dma_queue_pop(dma_queue * q) {
dma_ptr dptr = { NULL };
@ -131,12 +212,12 @@ static inline dma_ptr dma_queue_pop(dma_queue * q) {
return dptr;
}
hexagon_udma_descriptor_type1_t * desc = &q->desc[q->pop_idx];
dma_descriptor_2d * desc = &q->desc[q->pop_idx];
// Wait for desc to complete
while (1) {
dmpoll();
if (desc->dstate == HEXAGON_UDMA_DESC_DSTATE_COMPLETE) {
if (desc->done) {
break;
}
// FARF(ERROR, "dma-pop: waiting for DMA : %u\n", q->pop_idx);
@ -175,86 +256,62 @@ static inline uint32_t dma_queue_capacity(dma_queue * q) {
return q->capacity;
}
// ---------------------------------------------------------------------------
// Overflow-safe DMA push: all UDMA type1 descriptor fields (roiwidth,
// roiheight, srcstride, dststride) are 16-bit, max 65535. This helper
// transparently handles values that exceed the 16-bit limit and submits
// chained DMA transactions.
//
// Case 1 (fast path): all params fit in 16 bits -> direct dma_queue_push.
// Case 2 (contiguous block): width == srcstride == dststride. Reshape the
// flat transfer into a 2D descriptor with sub_width <= 65535. Produces a
// single descriptor, preserving async DMA behavior.
// Case 3 (stride overflow): srcstride or dststride > 65535. Issue rows
// one at a time. The first N-1 rows are pushed+popped synchronously;
// the last row is left async so the caller can pop it.
// ---------------------------------------------------------------------------
#define UDMA_MAX_FIELD_VAL 65535u
#if __HVX_ARCH__ < 75
static inline bool dma_queue_push_chained(dma_queue *q, dma_ptr dptr, size_t dst_stride, size_t src_stride, size_t width, size_t nrows) {
// Fast path: everything fits in 16 bits.
if (__builtin_expect(
width <= UDMA_MAX_FIELD_VAL &&
nrows <= UDMA_MAX_FIELD_VAL &&
src_stride <= UDMA_MAX_FIELD_VAL &&
dst_stride <= UDMA_MAX_FIELD_VAL, 1)) {
return dma_queue_push(q, dptr, dst_stride, src_stride, width, nrows);
// Overflow-safe DMA push: all 2d descriptor fields (row_size, nrows, src_stride, dst_stride) are 16-bit, max 65535.
// This version transparently handles values that exceed the 16-bit limit and submits chained DMA transactions.
#define DMA_MAX_FIELD_VAL 65535u
static inline bool dma_queue_push(dma_queue *q, dma_ptr dptr, size_t dst_stride, size_t src_stride, size_t row_size, size_t nrows) {
// Fast path: everything fits in 16 bits
if (nrows == 0 || __builtin_expect(
row_size <= DMA_MAX_FIELD_VAL &&
nrows <= DMA_MAX_FIELD_VAL &&
src_stride <= DMA_MAX_FIELD_VAL &&
dst_stride <= DMA_MAX_FIELD_VAL, 1)) {
return dma_queue_push_single_2d(q, dptr, dst_stride, src_stride, row_size, nrows);
}
// Case 2: contiguous block (width == src_stride == dst_stride).
// Reshape total bytes into sub_width * sub_nrows where sub_width <= 65535.
if (width == src_stride && width == dst_stride) {
size_t total = width * nrows;
// Pick the largest 128-byte-aligned sub_width that divides total evenly.
size_t sub_width = UDMA_MAX_FIELD_VAL & ~(size_t)127; // 65408
while (sub_width > 0 && total % sub_width != 0) {
sub_width -= 128;
}
if (sub_width == 0) {
// Fallback: use original width (must fit) with adjusted nrows.
// This shouldn't happen for 128-aligned DMA sizes.
sub_width = width;
}
size_t sub_nrows = total / sub_width;
// Handle sub_nrows > 65535 by issuing chunked descriptors.
const uint8_t *src = (const uint8_t *)dptr.src;
uint8_t *dst = (uint8_t *)dptr.dst;
size_t rows_done = 0;
while (rows_done < sub_nrows) {
size_t chunk = sub_nrows - rows_done;
if (chunk > UDMA_MAX_FIELD_VAL) chunk = UDMA_MAX_FIELD_VAL;
dma_ptr p = dma_make_ptr(dst + rows_done * sub_width, src + rows_done * sub_width);
if (!dma_queue_push(q, p, sub_width, sub_width, sub_width, chunk))
return false;
rows_done += chunk;
// Complete all chunks without waiting except the last one, so the
// caller's single dma_queue_pop drains the final descriptor.
if (rows_done < sub_nrows)
dma_queue_pop_nowait(q);
}
return true;
// Contiguous block
// Use 1d DMA mode which supports sizes up to 24-bits (16MB)
if (nrows == 1 || (row_size == src_stride && row_size == dst_stride)) {
size_t total = row_size * nrows;
return dma_queue_push_single_1d(q, dptr, total);
}
// Case 3: stride overflow — fall back to row-by-row.
// Stride overflow — fall back to row-by-row.
{
const uint8_t *src = (const uint8_t *)dptr.src;
uint8_t *dst = (uint8_t *)dptr.dst;
const uint8_t *src = (const uint8_t *) dptr.src;
uint8_t *dst = (uint8_t *) dptr.dst;
for (size_t r = 0; r < nrows; ++r) {
dma_ptr p = dma_make_ptr(dst + r * dst_stride,
src + r * src_stride);
if (!dma_queue_push(q, p, 0, 0, width, 1))
return false;
if (r + 1 < nrows)
dma_queue_pop_nowait(q);
dma_ptr p = dma_make_ptr(dst + r * dst_stride, src + r * src_stride);
if (!dma_queue_push_single_1d(q, p, row_size))
return false;
if (r + 1 < nrows)
dma_queue_pop(q);
}
return true;
}
}
#else // HVX_ARCH >= 75
static inline bool dma_queue_push(dma_queue *q, dma_ptr dptr, size_t dst_stride, size_t src_stride, size_t row_size, size_t nrows) {
// On v75 and up we always use 2d 24-bit mode
return dma_queue_push_single_2d(q, dptr, dst_stride, src_stride, row_size, nrows);
}
#endif
static inline bool dma_queue_push_ddr_to_vtcm(dma_queue * q, dma_ptr dptr, size_t dst_row_size, size_t src_row_size, size_t nrows) {
return dma_queue_push(q, dptr, dst_row_size, src_row_size, src_row_size, nrows);
}
static inline bool dma_queue_push_vtcm_to_ddr(dma_queue * q, dma_ptr dptr, size_t dst_row_size, size_t src_row_size, size_t nrows) {
return dma_queue_push(q, dptr, dst_row_size, src_row_size, dst_row_size, nrows);
}
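// Typical usage of these wrappers, as a sketch; the function and buffer
// names below are placeholders, not from this change.
static inline void process_block_sketch(dma_queue * q,
                                        uint8_t * vtcm_in, uint8_t * vtcm_out,
                                        const uint8_t * ddr_src, uint8_t * ddr_dst,
                                        size_t row_size_aligned, size_t nb01,
                                        size_t nb1, size_t nrows) {
    dma_queue_push_ddr_to_vtcm(q, dma_make_ptr(vtcm_in, ddr_src),
                               row_size_aligned, nb01, nrows);
    dma_queue_pop(q); // wait for the fetch
    // ... HVX compute from vtcm_in into vtcm_out ...
    dma_queue_push_vtcm_to_ddr(q, dma_make_ptr(ddr_dst, vtcm_out),
                               nb1, row_size_aligned, nrows);
    dma_queue_pop(q); // wait for the writeback
}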
#ifdef __cplusplus
} // extern "C"
#endif

View File

@ -21,6 +21,15 @@ static inline void hex_dump_uint8_line(char * pref, const uint8_t * x, uint32_t
FARF(HIGH, "%s\n", str);
}
static inline void hex_dump_uint32_line(char * pref, const uint32_t * x, uint32_t n) {
char str[1024], *p = str, *p_end = str + sizeof(str);
p += snprintf(p, p_end - p, "%s: ", pref);
for (int i = 0; i < n; i++) {
p += snprintf(p, p_end - p, "%u, ", (unsigned int) x[i]);
}
FARF(HIGH, "%s\n", str);
}
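// Example call (illustrative): dump a 32-entry gather-offset table.
//   uint32_t offs[32] = { 0, 4, 8, 12 /* ... */ };
//   hex_dump_uint32_line("gather-offs", offs, 32);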
static inline void hex_dump_int32_line(char * pref, const int32_t * x, uint32_t n) {
char str[1024], *p = str, *p_end = str + sizeof(str);
p += snprintf(p, p_end - p, "%s: ", pref);

View File

@ -727,7 +727,7 @@ int hmx_mat_mul_permuted_w16a32_batched(struct htp_context *ctx, const hmx_matmu
if (use_dma_activation) {
const size_t row_bytes = (size_t) params->k * sizeof(float);
const size_t stride_bytes = (size_t) params->act_stride * sizeof(float);
dma_queue_push_chained(ctx->dma[0],
dma_queue_push(ctx->dma[0],
dma_make_ptr(vtcm_f32_act, activation_chunk),
row_bytes, stride_bytes, row_bytes, n_rows);
dma_queue_pop(ctx->dma[0]);
@ -747,7 +747,7 @@ int hmx_mat_mul_permuted_w16a32_batched(struct htp_context *ctx, const hmx_matmu
{
const size_t n_cols_first = hex_smin((size_t) params->n, n_chunk_n_cols);
dma_queue_push_chained(ctx->dma[0], dma_make_ptr(buf_curr, weight_group),
dma_queue_push(ctx->dma[0], dma_make_ptr(buf_curr, weight_group),
fp16_row_bytes, weight_row_bytes, fp16_row_bytes, n_cols_first);
}
@ -765,7 +765,7 @@ int hmx_mat_mul_permuted_w16a32_batched(struct htp_context *ctx, const hmx_matmu
const size_t n_cols_next = hex_smin((size_t) params->n - nc_next, n_chunk_n_cols);
const __fp16 *next_weight_chunk = weight_group + nc_next * params->weight_stride;
dma_queue_push_chained(ctx->dma[0], dma_make_ptr(buf_next, next_weight_chunk),
dma_queue_push(ctx->dma[0], dma_make_ptr(buf_next, next_weight_chunk),
fp16_row_bytes, weight_row_bytes, fp16_row_bytes, n_cols_next);
}
@ -891,7 +891,7 @@ int hmx_mat_mul_permuted_w16a32(struct htp_context *ctx, float *restrict dst, co
if (use_dma_activation) {
const size_t row_bytes = (size_t) k * sizeof(float);
const size_t stride_bytes = (size_t) act_stride * sizeof(float);
dma_queue_push_chained(ctx->dma[0],
dma_queue_push(ctx->dma[0],
dma_make_ptr(vtcm_f32_act, activation_chunk),
row_bytes, stride_bytes, row_bytes, n_rows);
dma_queue_pop(ctx->dma[0]);
@ -916,7 +916,7 @@ int hmx_mat_mul_permuted_w16a32(struct htp_context *ctx, float *restrict dst, co
{
const size_t n_cols_first = hex_smin(n, n_chunk_n_cols);
dma_queue_push_chained(ctx->dma[0], dma_make_ptr(buf_curr, permuted_weight),
dma_queue_push(ctx->dma[0], dma_make_ptr(buf_curr, permuted_weight),
fp16_row_bytes, weight_row_bytes, fp16_row_bytes, n_cols_first);
}
@ -933,7 +933,7 @@ int hmx_mat_mul_permuted_w16a32(struct htp_context *ctx, float *restrict dst, co
const size_t n_cols_next = hex_smin(n - nc_next, n_chunk_n_cols);
const __fp16 *next_weight_chunk = permuted_weight + nc_next * weight_stride;
dma_queue_push_chained(ctx->dma[0], dma_make_ptr(buf_next, next_weight_chunk),
dma_queue_push(ctx->dma[0], dma_make_ptr(buf_next, next_weight_chunk),
fp16_row_bytes, weight_row_bytes, fp16_row_bytes, n_cols_next);
}
@ -1104,7 +1104,7 @@ int hmx_mat_mul_permuted_qk_0_d16a32(struct htp_context *ctx, float *restrict ds
// because UDMA roiwidth is 16-bit and total size can exceed 65535.
{
const size_t n_cols_first = hex_smin(n, n_chunk_n_cols);
dma_queue_push_chained(ctx->dma[0], dma_make_ptr(buf_curr, permuted_weight), row_stride, row_stride, row_stride, n_cols_first);
dma_queue_push(ctx->dma[0], dma_make_ptr(buf_curr, permuted_weight), row_stride, row_stride, row_stride, n_cols_first);
}
for (size_t nc = 0; nc < n; nc += n_chunk_n_cols) {
@ -1120,7 +1120,7 @@ int hmx_mat_mul_permuted_qk_0_d16a32(struct htp_context *ctx, float *restrict ds
const uint8_t *next_weight_chunk = permuted_weight + nc_next * row_stride;
dma_queue_push_chained(ctx->dma[0], dma_make_ptr(buf_next, next_weight_chunk), row_stride, row_stride, row_stride, n_cols_next);
dma_queue_push(ctx->dma[0], dma_make_ptr(buf_next, next_weight_chunk), row_stride, row_stride, row_stride, n_cols_next);
}
// Dequant + vscatter writes directly to [K, N] transposed tiles.
@ -1173,7 +1173,7 @@ int hmx_mat_mul_permuted_qk_0_d16a32(struct htp_context *ctx, float *restrict ds
{
// Use 2D DMA (n_cols rows x row_stride) to avoid 16-bit roiwidth overflow.
const uint8_t *qweight_chunk_A0 = permuted_weight;
dma_queue_push_chained(ctx->dma[0], dma_make_ptr(vtcm_qweight, qweight_chunk_A0), row_stride, row_stride, row_stride, n_cols_A0);
dma_queue_push(ctx->dma[0], dma_make_ptr(vtcm_qweight, qweight_chunk_A0), row_stride, row_stride, row_stride, n_cols_A0);
}
{
@ -1191,7 +1191,7 @@ int hmx_mat_mul_permuted_qk_0_d16a32(struct htp_context *ctx, float *restrict ds
const size_t n_cols_A1 = hex_smin(n - 1 * n_chunk_n_cols, n_chunk_n_cols);
if (1 < n_chunk_cnt) {
const uint8_t *qweight_chunk_A1 = permuted_weight + n_chunk_n_cols * row_stride;
dma_queue_push_chained(ctx->dma[0], dma_make_ptr(vtcm_qweight, qweight_chunk_A1), row_stride, row_stride, row_stride, n_cols_A1);
dma_queue_push(ctx->dma[0], dma_make_ptr(vtcm_qweight, qweight_chunk_A1), row_stride, row_stride, row_stride, n_cols_A1);
}
// C0
@ -1218,7 +1218,7 @@ int hmx_mat_mul_permuted_qk_0_d16a32(struct htp_context *ctx, float *restrict ds
// issue A_{i+2}
if (i + 2 < n_chunk_cnt) {
const uint8_t *qweight_chunk_p2 = permuted_weight + nc_p2 * row_stride;
dma_queue_push_chained(ctx->dma[0], dma_make_ptr(vtcm_qweight, qweight_chunk_p2), row_stride, row_stride, row_stride, n_cols_p2);
dma_queue_push(ctx->dma[0], dma_make_ptr(vtcm_qweight, qweight_chunk_p2), row_stride, row_stride, row_stride, n_cols_p2);
}
// wait for HMX (C_{i}) -- C_{i} is done
@ -1443,7 +1443,7 @@ int mat_mul_qk_0_d16a32_out_stationary(struct htp_context *ctx, float *restrict
{
const float *activation_block = x + mr * k + kk;
dma_queue_push_chained(ctx->dma[0],
dma_queue_push(ctx->dma[0],
dma_make_ptr(vtcm_scratch1, activation_block),
k_blk_sz * sizeof(float),
k * sizeof(float),
@ -1472,10 +1472,10 @@ int mat_mul_qk_0_d16a32_out_stationary(struct htp_context *ctx, float *restrict
s.scale_width = nb_sub * HMX_X4X2_DBLK_SIZE;
// 2D DMA: quants sub-range
dma_queue_push_chained(ctx->dma[0], dma_make_ptr(s.dst, s.src + s.quant_off),
dma_queue_push(ctx->dma[0], dma_make_ptr(s.dst, s.src + s.quant_off),
s.dst_stride, s.src_stride, s.quant_width, s.n_rows);
// 2D DMA: scales sub-range
dma_queue_push_chained(ctx->dma[0], dma_make_ptr(s.dst + s.quant_width, s.src + s.scale_off),
dma_queue_push(ctx->dma[0], dma_make_ptr(s.dst + s.quant_width, s.src + s.scale_off),
s.dst_stride, s.src_stride, s.scale_width, s.n_rows);
}
TIMER_STOP(fetch);

View File

@ -15,12 +15,4 @@
#include "hvx-div.h"
#include "hvx-base.h"
#ifndef GATHER_TYPE
# if defined(__hexagon__)
# define GATHER_TYPE(_a) (intptr_t) _a
# else
# define GATHER_TYPE(_a) (HVX_Vector *) _a
# endif
#endif
#endif /* HVX_UTILS_H */
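The GATHER_TYPE cast macro is no longer needed: the ssm-conv gather call sites (updated later in this diff) now pass a plain uint32_t base address instead of a casted pointer.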

View File

@ -214,7 +214,7 @@ static int vtcm_alloc(struct htp_context * ctx) {
HAP_compute_res_attr_init(&attr);
HAP_compute_res_attr_set_serialize(&attr, 0);
HAP_compute_res_attr_set_cache_mode(&attr, 1);
HAP_compute_res_attr_set_vtcm_param_v2(&attr, vtcm_size, 0, vtcm_size);
HAP_compute_res_attr_set_vtcm_param_v2(&attr, vtcm_size, vtcm_size, vtcm_size); // single page
HAP_compute_res_attr_set_release_callback(&attr, vtcm_release_callback, (void *) ctx);
HAP_compute_res_attr_set_hmx_param(&attr, 1);
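Raising the minimum-page-size argument from 0 to the full VTCM size asks the compute-resource manager to map the whole region as a single page (assuming the usual HAP_compute_res_attr_set_vtcm_param_v2(attr, size, min_page_size, min_size) parameter order), which is what keeps the base+offset of HVX gathers within one page.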
@ -319,7 +319,7 @@ AEEResult htp_iface_start(remote_handle64 handle, uint32 sess_id, uint64 dsp_que
ctx->n_threads = n_hvx;
for (int i = 0; i < ctx->n_threads; i++) {
// see discussion https://github.com/ggml-org/llama.cpp/pull/18151#discussion_r2632388541
ctx->dma[i] = dma_queue_create(64);
ctx->dma[i] = dma_queue_create(128);
}
// init worker pool

View File

@ -151,7 +151,7 @@ static void ssm_conv_thread_f32_f32_hvx(unsigned int nth, unsigned int ith, void
const int dr = scctx->nrows_per_thread;
const uint32_t ir0 = dr * ith;
const uint32_t ir1 = MIN(ir0 + dr, d_inner);
const int ir = ir1 - ir0;
const uint32_t ir = ir1 - ir0;
if (ir0 >= ir1) {
return; // No work for this thread
@ -205,10 +205,10 @@ static void ssm_conv_thread_f32_f32_hvx(unsigned int nth, unsigned int ith, void
HVX_Vector acc_vec = Q6_V_vsplat_R(0);
for (uint32_t i0 = 0; i0 < d_conv; ++i0) {
Q6_vgather_ARMVw(src0_vec, GATHER_TYPE(spad_src0 + (i0 + i1 * ncs) * sizeof(float) + i2 * (src0->nb[0])),
src0_gather_len, (*(const HVX_Vector *) src0_offsets));
Q6_vgather_ARMVw(src1_vec, GATHER_TYPE(spad_src1 + (i0 + i1 * nc) * sizeof(float)),
src1_gather_len, (*(const HVX_Vector *) src1_offsets));
uint32_t src0_base = (uint32_t) spad_src0 + (i0 + i1 * ncs) * sizeof(float) + i2 * (src0->nb[0]);
uint32_t src1_base = (uint32_t) spad_src1 + (i0 + i1 * nc) * sizeof(float);
Q6_vgather_ARMVw(src0_vec, src0_base, src0_gather_len, (*(const HVX_Vector *) src0_offsets));
Q6_vgather_ARMVw(src1_vec, src1_base, src1_gather_len, (*(const HVX_Vector *) src1_offsets));
HVX_Vector prod = Q6_Vqf32_vmpy_VsfVsf(*(const HVX_Vector *) src0_vec, *(const HVX_Vector *) src1_vec);
acc_vec = Q6_Vqf32_vadd_Vqf32Vqf32(acc_vec, prod);
@ -222,10 +222,10 @@ static void ssm_conv_thread_f32_f32_hvx(unsigned int nth, unsigned int ith, void
HVX_Vector acc_vec = Q6_V_vsplat_R(0);
for (uint32_t i0 = 0; i0 < d_conv; ++i0) {
Q6_vgather_ARMVw(src0_vec, GATHER_TYPE(spad_src0 + (i0 + i1 * ncs) * sizeof(float) + i2 * (src0->nb[0])),
src0_gather_len, (*(const HVX_Vector *) src0_offsets));
Q6_vgather_ARMVw(src1_vec, GATHER_TYPE(spad_src1 + (i0 + i1 * nc) * sizeof(float)),
src1_gather_len, (*(const HVX_Vector *) src1_offsets));
uint32_t src0_base = (uint32_t) spad_src0 + (i0 + i1 * ncs) * sizeof(float) + i2 * (src0->nb[0]);
uint32_t src1_base = (uint32_t) spad_src1 + (i0 + i1 * nc) * sizeof(float);
Q6_vgather_ARMVw(src0_vec, src0_base, src0_gather_len, (*(const HVX_Vector *) src0_offsets));
Q6_vgather_ARMVw(src1_vec, src1_base, src1_gather_len, (*(const HVX_Vector *) src1_offsets));
HVX_Vector prod = Q6_Vqf32_vmpy_VsfVsf(*(const HVX_Vector *) src0_vec, *(const HVX_Vector *) src1_vec);
acc_vec = Q6_Vqf32_vadd_Vqf32Vqf32(acc_vec, prod);

View File

@ -48,5 +48,5 @@ adb $adbserial $adbhost shell " \
ADSP_LIBRARY_PATH=$basedir/$branch/lib \
$ndev $nhvx $opmask $verbose $experimental $profile $hb ./$branch/bin/llama-bench --device $device --mmap 0 -m $basedir/../gguf/$model \
--poll 1000 -t 6 --cpu-mask 0xfc --cpu-strict 1 \
--batch-size 128 -ngl 99 $cli_opts $@ \
--ubatch-size 256 -fa 1 -ngl 99 $cli_opts $@ \
"