From 976b9d9d9fee40e2c5dd0e4f05ea0834e89a6f60 Mon Sep 17 00:00:00 2001
From: Todor Boinovski
Date: Wed, 22 Apr 2026 11:51:32 -0700
Subject: [PATCH] hexagon: vectorize partial f32 loads

---
 ggml/src/ggml-hexagon/htp/solve-tri-ops.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/ggml/src/ggml-hexagon/htp/solve-tri-ops.c b/ggml/src/ggml-hexagon/htp/solve-tri-ops.c
index 157561ba7e..9241f600b7 100644
--- a/ggml/src/ggml-hexagon/htp/solve-tri-ops.c
+++ b/ggml/src/ggml-hexagon/htp/solve-tri-ops.c
@@ -63,10 +63,9 @@ static inline void solve_tri_row_scalar(const float * A_row,
 }
 
 static inline HVX_Vector hvx_load_partial_f32(const float * src, uint32_t n) {
-    HVX_VectorAlias tmp;
-    memset(&tmp, 0, sizeof(tmp));
-    memcpy(tmp.fp32, src, n * sizeof(float));
-    return tmp.v;
+    HVX_Vector v = *((const HVX_UVector *) src);
+    HVX_VectorPred mask = Q6_Q_vsetq2_R(n * sizeof(float));
+    return Q6_V_vmux_QVV(mask, v, Q6_V_vzero());
 }
 
 static inline void solve_tri_row_hvx(const float * A_row,
@@ -100,7 +99,6 @@ static inline void solve_tri_row_hvx(const float * A_row,
 }
 
 // Batch-level thread: each job is one full batch.
-// Processes all column chunks within each row for better A_row cache reuse.
 static void solve_tri_batch_thread_f32(unsigned int nth, unsigned int ith, void * data) {
     struct htp_solve_tri_context * sctx = (struct htp_solve_tri_context *) data;
     struct htp_ops_context * octx = sctx->octx;
@@ -163,7 +161,6 @@ static void solve_tri_batch_thread_f32(unsigned int nth, unsigned int ith, void
 }
 
 // Chunk-level thread: each job is one (batch, col_chunk) pair.
-// Used when there are fewer batches than threads to maintain parallelism.
 static void solve_tri_chunk_thread_f32(unsigned int nth, unsigned int ith, void * data) {
     struct htp_solve_tri_context * sctx = (struct htp_solve_tri_context *) data;
     struct htp_ops_context * octx = sctx->octx;