From fc2289dc96e2c2622922189b0e04bb30302c3aa1 Mon Sep 17 00:00:00 2001
From: shouyud
Date: Fri, 12 Dec 2025 11:58:45 -0500
Subject: [PATCH] Feat: optimized unaligned sigmoid_f32

---
 ggml/src/ggml-hexagon/htp/act-ops.c   |  5 +-
 ggml/src/ggml-hexagon/htp/hvx-utils.h | 70 +++++++++++++++++++++++++++
 2 files changed, 71 insertions(+), 4 deletions(-)

diff --git a/ggml/src/ggml-hexagon/htp/act-ops.c b/ggml/src/ggml-hexagon/htp/act-ops.c
index 2db4a2a35b..5266567d37 100644
--- a/ggml/src/ggml-hexagon/htp/act-ops.c
+++ b/ggml/src/ggml-hexagon/htp/act-ops.c
@@ -317,10 +317,7 @@ static void unary_gelu_fp32_per_thread(const struct htp_tensor * src0,
     } else {
         hvx_mul_scalar_f32( (const uint8_t *) src0, (float)1.702, (uint8_t *) src0_spad_data, ne0);
         // sigmoid
-        hvx_exp_f32((const uint8_t *) src0_spad_data, src0_spad_data, ne0, true);
-        hvx_add_scalar_f32(src0_spad_data, 1.0, dst_spad_data, ne0);
-        hvx_inverse_f32(dst_spad_data, src0_spad_data, ne0);
-
+        hvx_sigmoid_f32((const uint8_t *) src0_spad_data, (uint8_t *) src0_spad_data, ne0);
         hvx_mul_f32((const uint8_t *) src0, src0_spad_data, (uint8_t *) dst, ne0);
     }
 }
diff --git a/ggml/src/ggml-hexagon/htp/hvx-utils.h b/ggml/src/ggml-hexagon/htp/hvx-utils.h
index 80658105c5..6c713b40eb 100644
--- a/ggml/src/ggml-hexagon/htp/hvx-utils.h
+++ b/ggml/src/ggml-hexagon/htp/hvx-utils.h
@@ -265,12 +265,16 @@ static inline void hvx_bcast_fp32_a(uint8_t * restrict dst, float elem, uint32_t
     }
 }
 
+
+/* Returns non-zero if the 'n' bytes starting at 'addr' fit within a single aligned chunk of 'chunk_size' bytes. */
 static __attribute__((always_inline)) int32_t is_in_one_chunk(void * addr, uint32_t n, uint32_t chunk_size) {
     uint32_t left_off  = (size_t) addr & (chunk_size - 1);
     uint32_t right_off = left_off + n;
     return right_off <= chunk_size;
 }
+
+
 
 static void hvx_vec_dump_fp16_n(char * pref, HVX_Vector v, uint32_t n) {
     HVX_VectorAlias u = { .v = v };
@@ -994,6 +998,72 @@ static inline void hvx_fast_sigmoid_f32(const uint8_t * restrict src, uint8_t *
     }
 }
 
+/* Sigmoid over num_elems fp32 values: dst[i] = 1 / (1 + exp(-src[i])).
+ * Neither src nor dst has to be 128-byte aligned: the input is read with aligned
+ * vector loads and realigned with Q6_V_valign_VVR, the output is stored unaligned.
+ */
+static inline void hvx_sigmoid_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems) {
+    int step_of_1 = num_elems >> 5;  // full vectors: 32 fp32 = 128 bytes = one HVX vector
+    int leftover  = num_elems - (step_of_1 * VLEN_FP32);
+
+    int32_t leftover_size = leftover * sizeof(float);
+
+    // Clamp bounds used by hvx_vec_fast_sigmoid_fp32_guard: beyond +/-87, expf()
+    // would under/overflow in fp32 and the sigmoid has already saturated.
+    static const float kMinExp = -87.f;  // sigmoid ~ 0
+    static const float kMaxExp = 87.f;   // sigmoid ~ 1
+
+    const HVX_Vector one     = hvx_vec_splat_fp32(1.f);
+    const HVX_Vector max_exp = hvx_vec_splat_fp32(kMaxExp);
+    const HVX_Vector min_exp = hvx_vec_splat_fp32(kMinExp);
+
+    const float * input  = (const float *) src;
+    float *       output = (float *) dst;
+
+    HVX_Vector *  input_v_ptr  = (HVX_Vector *) input;
+    HVX_UVector * output_v_ptr = (HVX_UVector *) output;
+
+    HVX_Vector slinep;  // previous aligned vector
+    HVX_Vector slinec;  // current aligned vector
+    HVX_Vector sline;   // realigned vector holding 32 contiguous input values
+
+    // Prologue: aligned load of the 128-byte block containing the first element
+    // (aligned HVX loads drop the low address bits).
+    slinep = *input_v_ptr++;
+
+    // Steady state: load the next aligned block, stitch it with the previous one via
+    // valign, apply the sigmoid and store one full output vector (unaligned store).
+    // The last full vector is handled separately to avoid loading past the input.
+    #pragma unroll(4)
+    for (int32_t i = step_of_1 - 1; i > 0; i--) {
+        slinec = *input_v_ptr++;
+        sline  = Q6_V_valign_VVR(slinec, slinep, (size_t) input);
+        *output_v_ptr++ = hvx_vec_fast_sigmoid_fp32_guard(sline, one, max_exp, min_exp);
+        slinep = slinec;
+    }
+
+    // Last full vector: only load another aligned block if it is actually needed.
+    if (step_of_1 > 0) {
+        slinec = (htp_is_aligned(input_v_ptr, 128) && leftover == 0) ? slinep : *input_v_ptr++;
+        sline  = Q6_V_valign_VVR(slinec, slinep, (size_t) input);
+        *output_v_ptr++ = hvx_vec_fast_sigmoid_fp32_guard(sline, one, max_exp, min_exp);
+        slinep = slinec;
+    }
+
+    // Tail: fewer than 32 elements left. Reuse the previously loaded block when the
+    // remaining bytes do not spill into the next aligned chunk, then do a partial store.
+    if (leftover > 0) {
+        slinec = is_in_one_chunk(input_v_ptr, leftover_size, 128) ? slinep : *input_v_ptr++;
+        sline  = Q6_V_valign_VVR(slinec, slinep, (size_t) input);
+
+        HVX_Vector sout = hvx_vec_fast_sigmoid_fp32_guard(sline, one, max_exp, min_exp);
+        hvx_vec_store_u(output_v_ptr, leftover_size, sout);
+    }
+}
+
+// Note: a simpler variant using unaligned (HVX_UVector) loads for the input as well
+// was also tried; in that variant #pragma unroll(4) actually made things slower.
+
 float hvx_sum_of_squares_f32(const uint8_t * restrict src, const int num_elems);
 
 void hvx_mul_f32(const uint8_t * restrict src0, const uint8_t * restrict src1,
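
A minimal scalar sketch of what hvx_sigmoid_f32 is expected to compute, useful for spot-checking the HVX path against a CPU reference. It assumes the guarded fast sigmoid behaves like a clamp to [kMinExp, kMaxExp] followed by 1/(1 + expf(-x)); the test driver below, its buffer size, and the suggested tolerance are illustrative assumptions, not values taken from the patch.

// Scalar reference for hvx_sigmoid_f32 (sketch; assumes clamp-then-sigmoid semantics).
#include <math.h>
#include <stdio.h>

static void ref_sigmoid_f32(const float * src, float * dst, int n) {
    for (int i = 0; i < n; i++) {
        float x = src[i];
        if (x < -87.f) x = -87.f;   // mirrors kMinExp: result ~ 0
        if (x >  87.f) x =  87.f;   // mirrors kMaxExp: result ~ 1
        dst[i] = 1.f / (1.f + expf(-x));
    }
}

int main(void) {
    // 77 elements = 2 full HVX vectors of 32 floats plus a 13-element tail, so both
    // the steady-state loop and the leftover path of hvx_sigmoid_f32 would be exercised.
    enum { N = 77 };
    float src[N], ref[N];
    for (int i = 0; i < N; i++) {
        src[i] = -10.f + 20.f * (float) i / (float) (N - 1);
    }
    ref_sigmoid_f32(src, ref, N);
    // On the HTP side the call would be:
    //   hvx_sigmoid_f32((const uint8_t *) src, (uint8_t *) out, N);
    // and out[] compared against ref[] with a small tolerance, since the fast exp
    // approximation is not expected to be bit-exact.
    printf("ref[0]=%.6f ref[%d]=%.6f\n", ref[0], N - 1, ref[N - 1]);
    return 0;
}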