hexagon: vectorize partial f32 loads

This commit is contained in:
Todor Boinovski 2026-04-22 11:51:32 -07:00
parent 82f2809742
commit 976b9d9d9f
1 changed file with 3 additions and 6 deletions

View File

@ -63,10 +63,9 @@ static inline void solve_tri_row_scalar(const float * A_row,
} }
static inline HVX_Vector hvx_load_partial_f32(const float * src, uint32_t n) { static inline HVX_Vector hvx_load_partial_f32(const float * src, uint32_t n) {
HVX_VectorAlias tmp; HVX_Vector v = *((const HVX_UVector *) src);
memset(&tmp, 0, sizeof(tmp)); HVX_VectorPred mask = Q6_Q_vsetq2_R(n * sizeof(float));
memcpy(tmp.fp32, src, n * sizeof(float)); return Q6_V_vmux_QVV(mask, v, Q6_V_vzero());
return tmp.v;
} }
static inline void solve_tri_row_hvx(const float * A_row, static inline void solve_tri_row_hvx(const float * A_row,
@ -100,7 +99,6 @@ static inline void solve_tri_row_hvx(const float * A_row,
} }
// Batch-level thread: each job is one full batch. // Batch-level thread: each job is one full batch.
// Processes all column chunks within each row for better A_row cache reuse.
static void solve_tri_batch_thread_f32(unsigned int nth, unsigned int ith, void * data) { static void solve_tri_batch_thread_f32(unsigned int nth, unsigned int ith, void * data) {
struct htp_solve_tri_context * sctx = (struct htp_solve_tri_context *) data; struct htp_solve_tri_context * sctx = (struct htp_solve_tri_context *) data;
struct htp_ops_context * octx = sctx->octx; struct htp_ops_context * octx = sctx->octx;
@ -163,7 +161,6 @@ static void solve_tri_batch_thread_f32(unsigned int nth, unsigned int ith, void
} }
// Chunk-level thread: each job is one (batch, col_chunk) pair. // Chunk-level thread: each job is one (batch, col_chunk) pair.
// Used when there are fewer batches than threads to maintain parallelism.
static void solve_tri_chunk_thread_f32(unsigned int nth, unsigned int ith, void * data) { static void solve_tri_chunk_thread_f32(unsigned int nth, unsigned int ith, void * data) {
struct htp_solve_tri_context * sctx = (struct htp_solve_tri_context *) data; struct htp_solve_tri_context * sctx = (struct htp_solve_tri_context *) data;
struct htp_ops_context * octx = sctx->octx; struct htp_ops_context * octx = sctx->octx;