From fc2289dc96e2c2622922189b0e04bb30302c3aa1 Mon Sep 17 00:00:00 2001
From: shouyud
Date: Fri, 12 Dec 2025 11:58:45 -0500
Subject: [PATCH] Feat: optimized unaligned sigmoid_f32

---
 ggml/src/ggml-hexagon/htp/act-ops.c   |  5 +-
 ggml/src/ggml-hexagon/htp/hvx-utils.h | 70 +++++++++++++++++++++++++++
 2 files changed, 71 insertions(+), 4 deletions(-)

diff --git a/ggml/src/ggml-hexagon/htp/act-ops.c b/ggml/src/ggml-hexagon/htp/act-ops.c
index 2db4a2a35b..5266567d37 100644
--- a/ggml/src/ggml-hexagon/htp/act-ops.c
+++ b/ggml/src/ggml-hexagon/htp/act-ops.c
@@ -317,10 +317,7 @@ static void unary_gelu_fp32_per_thread(const struct htp_tensor * src0,
     } else {
         hvx_mul_scalar_f32( (const uint8_t *) src0, (float)1.702, (uint8_t *) src0_spad_data, ne0);
         // sigmoid
-        hvx_exp_f32((const uint8_t *) src0_spad_data, src0_spad_data, ne0, true);
-        hvx_add_scalar_f32(src0_spad_data, 1.0, dst_spad_data, ne0);
-        hvx_inverse_f32(dst_spad_data, src0_spad_data, ne0);
-
+        hvx_sigmoid_f32((const uint8_t *) src0_spad_data, (uint8_t *) src0_spad_data, ne0);
         hvx_mul_f32((const uint8_t *) src0, src0_spad_data, (uint8_t *) dst, ne0);
     }
 }
diff --git a/ggml/src/ggml-hexagon/htp/hvx-utils.h b/ggml/src/ggml-hexagon/htp/hvx-utils.h
index 80658105c5..6c713b40eb 100644
--- a/ggml/src/ggml-hexagon/htp/hvx-utils.h
+++ b/ggml/src/ggml-hexagon/htp/hvx-utils.h
@@ -265,12 +265,16 @@ static inline void hvx_bcast_fp32_a(uint8_t * restrict dst, float elem, uint32_t
     }
 }
 
+
+/* Returns non-zero if the 'n' bytes starting at 'addr' fit within a single aligned chunk of 'chunk_size' bytes. */
 static __attribute__((always_inline)) int32_t is_in_one_chunk(void * addr, uint32_t n, uint32_t chunk_size) {
     uint32_t left_off  = (size_t) addr & (chunk_size - 1);
     uint32_t right_off = left_off + n;
     return right_off <= chunk_size;
 }
+
+
 
 static void hvx_vec_dump_fp16_n(char * pref, HVX_Vector v, uint32_t n) {
     HVX_VectorAlias u = { .v = v };
@@ -994,6 +998,72 @@ static inline void hvx_fast_sigmoid_f32(const uint8_t * restrict src, uint8_t *
     }
 }
 
+/* Sigmoid over num_elems fp32 values: dst[i] = 1 / (1 + exp(-src[i])).
+ * Neither src nor dst has to be 128-byte aligned: the input is read with aligned
+ * vector loads and realigned with Q6_V_valign_VVR, the output is stored unaligned.
+ */
+static inline void hvx_sigmoid_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems) {
+    int step_of_1 = num_elems >> 5;  // full vectors: 32 fp32 = 128 bytes = one HVX vector
+    int leftover  = num_elems - (step_of_1 * VLEN_FP32);
+
+    int32_t leftover_size = leftover * sizeof(float);
+
+    // Clamp bounds used by hvx_vec_fast_sigmoid_fp32_guard: beyond +/-87, expf()
+    // would under/overflow in fp32 and the sigmoid has already saturated.
+    static const float kMinExp = -87.f;  // sigmoid ~ 0
+    static const float kMaxExp = 87.f;   // sigmoid ~ 1
+
+    const HVX_Vector one     = hvx_vec_splat_fp32(1.f);
+    const HVX_Vector max_exp = hvx_vec_splat_fp32(kMaxExp);
+    const HVX_Vector min_exp = hvx_vec_splat_fp32(kMinExp);
+
+    const float * input  = (const float *) src;
+    float *       output = (float *) dst;
+
+    HVX_Vector *  input_v_ptr  = (HVX_Vector *) input;
+    HVX_UVector * output_v_ptr = (HVX_UVector *) output;
+
+    HVX_Vector slinep;  // previous aligned vector
+    HVX_Vector slinec;  // current aligned vector
+    HVX_Vector sline;   // realigned vector holding 32 contiguous input values
+
+    // Prologue: aligned load of the 128-byte block containing the first element
+    // (aligned HVX loads drop the low address bits).
+    slinep = *input_v_ptr++;
+
+    // Steady state: load the next aligned block, stitch it with the previous one via
+    // valign, apply the sigmoid and store one full output vector (unaligned store).
+    // The last full vector is handled separately to avoid loading past the input.
+    #pragma unroll(4)
+    for (int32_t i = step_of_1 - 1; i > 0; i--) {
+        slinec = *input_v_ptr++;
+        sline  = Q6_V_valign_VVR(slinec, slinep, (size_t) input);
+        *output_v_ptr++ = hvx_vec_fast_sigmoid_fp32_guard(sline, one, max_exp, min_exp);
+        slinep = slinec;
+    }
+
+    // Last full vector: only load another aligned block if it is actually needed.
+    if (step_of_1 > 0) {
+        slinec = (htp_is_aligned(input_v_ptr, 128) && leftover == 0) ? slinep : *input_v_ptr++;
+        sline  = Q6_V_valign_VVR(slinec, slinep, (size_t) input);
+        *output_v_ptr++ = hvx_vec_fast_sigmoid_fp32_guard(sline, one, max_exp, min_exp);
+        slinep = slinec;
+    }
+
+    // Tail: fewer than 32 elements left. Reuse the previously loaded block when the
+    // remaining bytes do not spill into the next aligned chunk, then do a partial store.
+    if (leftover > 0) {
+        slinec = is_in_one_chunk(input_v_ptr, leftover_size, 128) ? slinep : *input_v_ptr++;
+        sline  = Q6_V_valign_VVR(slinec, slinep, (size_t) input);
+
+        HVX_Vector sout = hvx_vec_fast_sigmoid_fp32_guard(sline, one, max_exp, min_exp);
+        hvx_vec_store_u(output_v_ptr, leftover_size, sout);
+    }
+}
+
+// Note: a simpler variant using unaligned (HVX_UVector) loads for the input as well
+// was also tried; in that variant #pragma unroll(4) actually made things slower.
+
 float hvx_sum_of_squares_f32(const uint8_t * restrict src, const int num_elems);
 
 void hvx_mul_f32(const uint8_t * restrict src0, const uint8_t * restrict src1,
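
A minimal scalar sketch of what hvx_sigmoid_f32 is expected to compute, useful for spot-checking the HVX path against a CPU reference. It assumes the guarded fast sigmoid behaves like a clamp to [kMinExp, kMaxExp] followed by 1/(1 + expf(-x)); the test driver below, its buffer size, and the suggested tolerance are illustrative assumptions, not values taken from the patch.

// Scalar reference for hvx_sigmoid_f32 (sketch; assumes clamp-then-sigmoid semantics).
#include <math.h>
#include <stdio.h>

static void ref_sigmoid_f32(const float * src, float * dst, int n) {
    for (int i = 0; i < n; i++) {
        float x = src[i];
        if (x < -87.f) x = -87.f;   // mirrors kMinExp: result ~ 0
        if (x >  87.f) x =  87.f;   // mirrors kMaxExp: result ~ 1
        dst[i] = 1.f / (1.f + expf(-x));
    }
}

int main(void) {
    // 77 elements = 2 full HVX vectors of 32 floats plus a 13-element tail, so both
    // the steady-state loop and the leftover path of hvx_sigmoid_f32 would be exercised.
    enum { N = 77 };
    float src[N], ref[N];
    for (int i = 0; i < N; i++) {
        src[i] = -10.f + 20.f * (float) i / (float) (N - 1);
    }
    ref_sigmoid_f32(src, ref, N);
    // On the HTP side the call would be:
    //   hvx_sigmoid_f32((const uint8_t *) src, (uint8_t *) out, N);
    // and out[] compared against ref[] with a small tolerance, since the fast exp
    // approximation is not expected to be bit-exact.
    printf("ref[0]=%.6f ref[%d]=%.6f\n", ref[0], N - 1, ref[N - 1]);
    return 0;
}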