diff --git a/ggml/src/ggml-hexagon/htp/act-ops.c b/ggml/src/ggml-hexagon/htp/act-ops.c
index 97823575c1..9d3e584a84 100644
--- a/ggml/src/ggml-hexagon/htp/act-ops.c
+++ b/ggml/src/ggml-hexagon/htp/act-ops.c
@@ -313,17 +313,14 @@ static void unary_gelu_fp32_per_thread(const struct htp_tensor * src0,
         const float * restrict src0 = (float *) (data_src0 + (ib * src0_row_size));
         float * restrict       dst  = (float *) (data_dst + (ib * dst_row_size));
 
-        // gelu = 0.5 * x * (1.0 + tanh( sqrt(2/pi) * (x + 0.044715 * x^3) )) // gelu_tanh
         // gelu = x * sigmoid(1.702 * x) // current implementation
         if (1 == opt_path) {
             hvx_mul_scalar_f32( (const uint8_t *) src0, (float)1.702, (uint8_t *) src0_spad_data, ne0);
             hvx_fast_sigmoid_f32((const uint8_t *) src0_spad_data, (uint8_t *) src0_spad_data, ne0);
-
             hvx_mul_f32_opt((const uint8_t *) src0, src0_spad_data, (uint8_t *) dst, ne0);
         } else {
             hvx_mul_scalar_f32( (const uint8_t *) src0, (float)1.702, (uint8_t *) src0_spad_data, ne0);
-            // sigmoid
             hvx_sigmoid_f32((const uint8_t *) src0_spad_data, (uint8_t *) src0_spad_data, ne0);
             hvx_mul_f32((const uint8_t *) src0, src0_spad_data, (uint8_t *) dst, ne0);
         }
 
diff --git a/ggml/src/ggml-hexagon/htp/hvx-utils.c b/ggml/src/ggml-hexagon/htp/hvx-utils.c
index 63c7c85427..e7ee589f34 100644
--- a/ggml/src/ggml-hexagon/htp/hvx-utils.c
+++ b/ggml/src/ggml-hexagon/htp/hvx-utils.c
@@ -62,16 +62,6 @@ void hvx_mul_f32(const uint8_t * restrict src0,
             *vec_out++ = Q6_Vsf_equals_Vqf32(v);
         }
     } else {
-        // #pragma unroll(4)
-        // for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
-        //     HVX_Vector in1 = *(HVX_UVector *) (src0 + i * SIZEOF_FP32);
-        //     HVX_Vector in2 = *(HVX_UVector *) (src1 + i * SIZEOF_FP32);
-
-        //     HVX_Vector out = Q6_Vqf32_vmpy_VsfVsf(in1, in2);
-
-        //     *(HVX_UVector *) (dst + i * SIZEOF_FP32) = Q6_Vsf_equals_Vqf32(out);
-        // }
-
         int step_of_1 = num_elems_whole >> 5; // divby 32, because 32 float = 128 bytes per HVX vector
         int leftover_size = left_over * sizeof(float);
 
@@ -98,7 +88,6 @@ void hvx_mul_f32(const uint8_t * restrict src0,
             sline2 = Q6_V_valign_VVR(sline2c, sline2p, (size_t) src1);
             *((HVX_UVector *)(vec_out++)) =Q6_Vsf_equals_Vqf32( Q6_Vqf32_vmpy_VsfVsf(sline, sline2));
-            /* Prepare slinep for next iteration */
             slinep = slinec;
             sline2p = sline2c;
         }
 
@@ -109,7 +98,6 @@ void hvx_mul_f32(const uint8_t * restrict src0,
         sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src0);
         sline2 = Q6_V_valign_VVR(sline2c, sline2p, (size_t) src1);
         *((HVX_UVector *)(vec_out++)) =Q6_Vsf_equals_Vqf32( Q6_Vqf32_vmpy_VsfVsf(sline, sline2));
-        /* Prepare slinep for next iteration */
        slinep = slinec;
         sline2p = sline2c;
     }
@@ -131,17 +119,6 @@ void hvx_mul_f32(const uint8_t * restrict src0,
         }
     }
 
-    // if (left_over > 0 ) {
-    //     const float * src0f = (const float *) src0 + num_elems_whole;
-    //     const float * src1f = (const float *) src1 + num_elems_whole;
-    //     float * dstf = (float *) dst + num_elems_whole;
-
-    //     HVX_Vector in1 = *(HVX_UVector *) src0f;
-    //     HVX_Vector in2 = *(HVX_UVector *) src1f;
-
-    //     HVX_Vector out = Q6_Vqf32_vmpy_VsfVsf(in1, in2);
-    //     hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, Q6_Vsf_equals_Vqf32(out));
-    // }
 
     if (left_over > 0 && !handled_leftover) {
         const float * src0f = (const float *) src0 + num_elems_whole;
@@ -547,15 +524,6 @@ void hvx_mul_scalar_f32(const uint8_t * restrict src, const float val, uint8_t *
             *vec_out++ = Q6_Vsf_equals_Vqf32(v);
         }
     } else {
-        // #pragma unroll(4)
-        // for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
-        //     HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32);
-
-        //     HVX_Vector out = Q6_Vqf32_vmpy_VsfVsf(in, val_vec);
-
-        //     *(HVX_UVector *) (dst + i * SIZEOF_FP32) = Q6_Vsf_equals_Vqf32(out);
-        // }
-
         int step_of_1 = num_elems >> 5; // divby 32, because 32 float = 128 bytes per HVX vector
         int leftover_size = left_over * sizeof(float);
 
@@ -597,22 +565,11 @@ void hvx_mul_scalar_f32(const uint8_t * restrict src, const float val, uint8_t *
 
         sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src);
         HVX_Vector sout = Q6_Vsf_equals_Vqf32( Q6_Vqf32_vmpy_VsfVsf(sline, val_vec));
-        /* Store output */
         hvx_vec_store_u(output_v_ptr, leftover_size, sout);
         handled_leftover = true;
     }
 }
 
-    // if (left_over > 0 ) {
-    //     const float * srcf = (const float *) src + num_elems_whole;
-    //     float * dstf = (float *) dst + num_elems_whole;
-
-    //     HVX_Vector in = *(HVX_UVector *) srcf;
-
-    //     HVX_Vector out = Q6_Vqf32_vmpy_VsfVsf(in, val_vec);
-    //     hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, Q6_Vsf_equals_Vqf32(out));
-    // }
-
    if (left_over > 0 && !handled_leftover) {
        const float * srcf = (const float *) src + num_elems_whole;
        float * dstf = (float *) dst + num_elems_whole;
diff --git a/ggml/src/ggml-hexagon/htp/hvx-utils.h b/ggml/src/ggml-hexagon/htp/hvx-utils.h
index 6c713b40eb..0b24786391 100644
--- a/ggml/src/ggml-hexagon/htp/hvx-utils.h
+++ b/ggml/src/ggml-hexagon/htp/hvx-utils.h
@@ -1003,9 +1003,6 @@ static inline void hvx_sigmoid_f32(const uint8_t * restrict src, uint8_t * restr
     int step_of_1 = num_elems >> 5; // divby 32, because 32 float = 128 bytes per HVX vector
     int leftover = num_elems - (step_of_1 * VLEN_FP32);
 
-    // assert(remaining == 0);//TODO: handle remaining elements later
-
-
     int32_t leftover_size = leftover * sizeof(float);
 
     static const float kMinExp = -87.f; // 0
@@ -1053,7 +1050,6 @@ static inline void hvx_sigmoid_f32(const uint8_t * restrict src, uint8_t * restr
 
         sline = Q6_V_valign_VVR(slinec, slinep, (size_t) input);
         HVX_Vector sout = hvx_vec_fast_sigmoid_fp32_guard(sline, one, max_exp, min_exp);
-        /* Store output */
         hvx_vec_store_u(output_v_ptr, leftover_size, sout);
     }
 
@@ -1061,47 +1057,6 @@ static inline void hvx_sigmoid_f32(const uint8_t * restrict src, uint8_t * restr
 }
-
-// static inline void hvx_sigmoid_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems){
-//     int step_of_1 = num_elems >> 5; // divby 32, because 32 float = 128 bytes per HVX vector
-//     int leftover = num_elems - (step_of_1 * VLEN_FP32);
-
-//     // assert(remaining == 0);//TODO: handle remaining elements later
-
-
-//     int32_t leftover_size = leftover * sizeof(float);
-
-//     static const float kMinExp = -87.f; // 0
-//     static const float kMaxExp = 87.f; // 1
-
-//     const HVX_Vector one = hvx_vec_splat_fp32(1.f);
-//     const HVX_Vector max_exp = hvx_vec_splat_fp32(kMaxExp);
-//     const HVX_Vector min_exp = hvx_vec_splat_fp32(kMinExp);
-
-//     const float *input = (float *)src;
-//     float *output = (float *)dst;
-
-//     HVX_UVector * input_v_ptr = (HVX_UVector *) input;
-//     HVX_UVector * output_v_ptr = (HVX_UVector *) output;
-
-//     // #pragma unroll(4) NOTE: this actual got slower
-//     for(uint32_t i = step_of_1; i> 0; i--){
-//         *((HVX_UVector *)(output_v_ptr++)) = hvx_vec_fast_sigmoid_fp32_guard(*(input_v_ptr++), one, max_exp, min_exp);
-
-//     }
-
-
-//     if(leftover> 0){
-
-
-//         HVX_Vector sout = hvx_vec_fast_sigmoid_fp32_guard(*(input_v_ptr++), one, max_exp, min_exp);
-//         /* Store output */
-//         hvx_vec_store_u(output_v_ptr, leftover_size, sout);
-//     }
-
-
-// }
-
 
 float hvx_sum_of_squares_f32(const uint8_t * restrict src, const int num_elems);
 void hvx_mul_f32(const uint8_t * restrict src0,
                  const uint8_t * restrict src1,
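
Note on the activation math the patch leaves in place: the kept code path implements the "quick" GELU, gelu(x) ~= x * sigmoid(1.702 * x), not the tanh form named in the deleted comment, and hvx_sigmoid_f32 clamps the sigmoid argument to [kMinExp, kMaxExp] = [-87, 87] before exponentiation. Since ln(FLT_MAX) is about 88.7, the clamp keeps expf() inside normal fp32 range, which is what the "// 0" and "// 1" comments on kMinExp/kMaxExp allude to. A minimal scalar sketch of that math follows; the ref_* names are hypothetical and not in the tree.

/*
 * Hypothetical scalar reference for the kept code path, not part of the
 * patch: gelu_quick(x) = x * sigmoid(1.702 * x), with the same +/-87
 * clamp that hvx_sigmoid_f32 applies via kMinExp/kMaxExp.
 */
#include <math.h>

static float ref_sigmoid_f32(float x) {
    if (x < -87.f) x = -87.f; // kMinExp: sigmoid saturates to ~0
    if (x >  87.f) x =  87.f; // kMaxExp: sigmoid saturates to ~1
    return 1.f / (1.f + expf(-x));
}

static float ref_gelu_quick_f32(float x) {
    return x * ref_sigmoid_f32(1.702f * x);
}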
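The "left_over > 0 && !handled_leftover" tails that replace the deleted comment blocks follow the usual split: process whole 128-byte vectors (32 floats each), let the unaligned path store its own tail and set handled_leftover, and otherwise finish with one partial store. A scalar sketch of just that loop shape, assuming nothing beyond what the diff shows (ref_mul_f32 is hypothetical):

/*
 * Scalar sketch of the loop structure only; the real code runs HVX
 * vectors through the main loop and uses hvx_vec_store_u for the tail.
 */
static void ref_mul_f32(const float * src0, const float * src1,
                        float * dst, int num_elems) {
    const int vlen_fp32 = 32;                       // 128-byte HVX vector = 32 fp32 lanes
    int num_elems_whole  = num_elems & ~(vlen_fp32 - 1);
    int left_over        = num_elems - num_elems_whole;
    int handled_leftover = 0;                       // set by the unaligned path in the HVX code

    for (int i = 0; i < num_elems_whole; i++) {     // stands in for the vector loop
        dst[i] = src0[i] * src1[i];
    }
    if (left_over > 0 && !handled_leftover) {       // stands in for the final partial store
        for (int i = num_elems_whole; i < num_elems; i++) {
            dst[i] = src0[i] * src1[i];
        }
    }
}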
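The slinep/slinec rotation kept in the unaligned paths is the standard HVX idiom: hold the previous and current aligned loads and stitch the unaligned vector with Q6_V_valign_VVR, keyed by the source pointer. A rough scalar model of that stitching, under the assumption that valign selects bytes by the pointer's low 7 bits (ref_valign_128 is illustrative, not an SDK API):

/*
 * Rough scalar model of the unaligned-load idiom, an assumption about
 * Q6_V_valign_VVR semantics rather than SDK documentation: the unaligned
 * 128-byte vector at addr is assembled from the two aligned loads that
 * straddle it, shifted by the pointer's alignment phase.
 */
static void ref_valign_128(const unsigned char * prev,  // aligned 128B block containing addr
                           const unsigned char * cur,   // next aligned 128B block
                           unsigned long addr,          // original (unaligned) pointer
                           unsigned char * out) {
    unsigned off = (unsigned) (addr & 127);              // alignment phase within 128 bytes
    for (int i = 0; i < 128; i++) {
        out[i] = (off + i < 128) ? prev[off + i] : cur[off + i - 128];
    }
}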