refactor: cleanup commented unused code

parent e51b6bf2b9
commit 05693357c8

@@ -313,17 +313,14 @@ static void unary_gelu_fp32_per_thread(const struct htp_tensor * src0,
     const float * restrict src0 = (float *) (data_src0 + (ib * src0_row_size));
     float * restrict dst = (float *) (data_dst + (ib * dst_row_size));
 
-    // gelu = 0.5 * x * (1.0 + tanh( sqrt(2/pi) * (x + 0.044715 * x^3) )) // gelu_tanh
     // gelu = x * sigmoid(1.702 * x) // current implementation
     if (1 == opt_path) {
         hvx_mul_scalar_f32( (const uint8_t *) src0, (float)1.702, (uint8_t *) src0_spad_data, ne0);
         hvx_fast_sigmoid_f32((const uint8_t *) src0_spad_data, (uint8_t *) src0_spad_data, ne0);
-
         hvx_mul_f32_opt((const uint8_t *) src0, src0_spad_data, (uint8_t *) dst, ne0);
     }
     else {
         hvx_mul_scalar_f32( (const uint8_t *) src0, (float)1.702, (uint8_t *) src0_spad_data, ne0);
-        // sigmoid
         hvx_sigmoid_f32((const uint8_t *) src0_spad_data, (uint8_t *) src0_spad_data, ne0);
         hvx_mul_f32((const uint8_t *) src0, src0_spad_data, (uint8_t *) dst, ne0);
     }
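
For reference, the approximation this kernel keeps, gelu(x) ≈ x * sigmoid(1.702 * x), maps one-to-one onto the three HVX calls retained in both branches: scale by 1.702, apply sigmoid, then multiply by the original input. A minimal scalar sketch of the same pipeline (plain C, illustrative only; gelu_sigmoid_ref is a hypothetical name, not part of this patch):

#include <math.h>

// Scalar reference for gelu(x) ~= x * sigmoid(1.702 * x).
// Mirrors the HVX pipeline above: mul-by-scalar, sigmoid, elementwise mul.
static void gelu_sigmoid_ref(const float * src, float * dst, int n) {
    for (int i = 0; i < n; i++) {
        float t = 1.702f * src[i];          // cf. hvx_mul_scalar_f32
        float s = 1.0f / (1.0f + expf(-t)); // cf. hvx_sigmoid_f32 / hvx_fast_sigmoid_f32
        dst[i] = src[i] * s;                // cf. hvx_mul_f32 / hvx_mul_f32_opt
    }
}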
@@ -62,16 +62,6 @@ void hvx_mul_f32(const uint8_t * restrict src0,
             *vec_out++ = Q6_Vsf_equals_Vqf32(v);
         }
     } else {
-        // #pragma unroll(4)
-        // for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
-        // HVX_Vector in1 = *(HVX_UVector *) (src0 + i * SIZEOF_FP32);
-        // HVX_Vector in2 = *(HVX_UVector *) (src1 + i * SIZEOF_FP32);
-
-        // HVX_Vector out = Q6_Vqf32_vmpy_VsfVsf(in1, in2);
-
-        // *(HVX_UVector *) (dst + i * SIZEOF_FP32) = Q6_Vsf_equals_Vqf32(out);
-        // }
-
         int step_of_1 = num_elems_whole >> 5; // divby 32, because 32 float = 128 bytes per HVX vector
         int leftover_size = left_over * sizeof(float);
 
@@ -98,7 +88,6 @@ void hvx_mul_f32(const uint8_t * restrict src0,
         sline2 = Q6_V_valign_VVR(sline2c, sline2p, (size_t) src1);
 
         *((HVX_UVector *)(vec_out++)) =Q6_Vsf_equals_Vqf32( Q6_Vqf32_vmpy_VsfVsf(sline, sline2));
-        /* Prepare slinep for next iteration */
         slinep = slinec;
         sline2p = sline2c;
     }
@@ -109,7 +98,6 @@ void hvx_mul_f32(const uint8_t * restrict src0,
         sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src0);
         sline2 = Q6_V_valign_VVR(sline2c, sline2p, (size_t) src1);
         *((HVX_UVector *)(vec_out++)) =Q6_Vsf_equals_Vqf32( Q6_Vqf32_vmpy_VsfVsf(sline, sline2));
-        /* Prepare slinep for next iteration */
         slinep = slinec;
         sline2p = sline2c;
     }
@@ -131,17 +119,6 @@ void hvx_mul_f32(const uint8_t * restrict src0,
         }
     }
 
-    // if (left_over > 0 ) {
-    // const float * src0f = (const float *) src0 + num_elems_whole;
-    // const float * src1f = (const float *) src1 + num_elems_whole;
-    // float * dstf = (float *) dst + num_elems_whole;
-
-    // HVX_Vector in1 = *(HVX_UVector *) src0f;
-    // HVX_Vector in2 = *(HVX_UVector *) src1f;
-
-    // HVX_Vector out = Q6_Vqf32_vmpy_VsfVsf(in1, in2);
-    // hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, Q6_Vsf_equals_Vqf32(out));
-    // }
 
     if (left_over > 0 && !handled_leftover) {
         const float * src0f = (const float *) src0 + num_elems_whole;
@@ -547,15 +524,6 @@ void hvx_mul_scalar_f32(const uint8_t * restrict src, const float val, uint8_t *
             *vec_out++ = Q6_Vsf_equals_Vqf32(v);
         }
     } else {
-        // #pragma unroll(4)
-        // for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
-        // HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32);
-
-        // HVX_Vector out = Q6_Vqf32_vmpy_VsfVsf(in, val_vec);
-
-        // *(HVX_UVector *) (dst + i * SIZEOF_FP32) = Q6_Vsf_equals_Vqf32(out);
-        // }
-
         int step_of_1 = num_elems >> 5; // divby 32, because 32 float = 128 bytes per HVX vector
         int leftover_size = left_over * sizeof(float);
 
@@ -597,22 +565,11 @@ void hvx_mul_scalar_f32(const uint8_t * restrict src, const float val, uint8_t *
         sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src);
 
         HVX_Vector sout = Q6_Vsf_equals_Vqf32( Q6_Vqf32_vmpy_VsfVsf(sline, val_vec));
-        /* Store output */
         hvx_vec_store_u(output_v_ptr, leftover_size, sout);
         handled_leftover = true;
         }
     }
 
-    // if (left_over > 0 ) {
-    // const float * srcf = (const float *) src + num_elems_whole;
-    // float * dstf = (float *) dst + num_elems_whole;
-
-    // HVX_Vector in = *(HVX_UVector *) srcf;
-
-    // HVX_Vector out = Q6_Vqf32_vmpy_VsfVsf(in, val_vec);
-    // hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, Q6_Vsf_equals_Vqf32(out));
-    // }
-
     if (left_over > 0 && !handled_leftover) {
         const float * srcf = (const float *) src + num_elems_whole;
         float * dstf = (float *) dst + num_elems_whole;
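
The leftover handling that survives the cleanup processes whole 32-float HVX vectors first and then writes the sub-vector tail exactly once through hvx_vec_store_u, with handled_leftover guarding against a duplicate store. A scalar sketch of that control flow (illustrative only; mul_scalar_ref and VLEN_FP32_REF are hypothetical names standing in for the real HVX loop):

// Whole-vectors-then-tail pattern, scalar stand-in for the HVX loops.
enum { VLEN_FP32_REF = 32 }; // floats per 128-byte HVX vector

static void mul_scalar_ref(const float * src, float val, float * dst, int num_elems) {
    int num_elems_whole = (num_elems / VLEN_FP32_REF) * VLEN_FP32_REF;
    for (int i = 0; i < num_elems_whole; i++) {
        dst[i] = src[i] * val; // stands in for the full-vector loop
    }
    int left_over = num_elems - num_elems_whole;
    if (left_over > 0) { // stands in for the guarded hvx_vec_store_u tail store
        for (int i = num_elems_whole; i < num_elems; i++) {
            dst[i] = src[i] * val;
        }
    }
}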
@@ -1003,9 +1003,6 @@ static inline void hvx_sigmoid_f32(const uint8_t * restrict src, uint8_t * restr
     int step_of_1 = num_elems >> 5; // divby 32, because 32 float = 128 bytes per HVX vector
     int leftover = num_elems - (step_of_1 * VLEN_FP32);
 
-    // assert(remaining == 0);//TODO: handle remaining elements later
-
-
     int32_t leftover_size = leftover * sizeof(float);
 
     static const float kMinExp = -87.f; // 0
@@ -1053,7 +1050,6 @@ static inline void hvx_sigmoid_f32(const uint8_t * restrict src, uint8_t * restr
         sline = Q6_V_valign_VVR(slinec, slinep, (size_t) input);
 
         HVX_Vector sout = hvx_vec_fast_sigmoid_fp32_guard(sline, one, max_exp, min_exp);
-        /* Store output */
         hvx_vec_store_u(output_v_ptr, leftover_size, sout);
     }
 
@@ -1061,47 +1057,6 @@ static inline void hvx_sigmoid_f32(const uint8_t * restrict src, uint8_t * restr
 }
 
 
-
-// static inline void hvx_sigmoid_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems){
-// int step_of_1 = num_elems >> 5; // divby 32, because 32 float = 128 bytes per HVX vector
-// int leftover = num_elems - (step_of_1 * VLEN_FP32);
-
-// // assert(remaining == 0);//TODO: handle remaining elements later
-
-
-// int32_t leftover_size = leftover * sizeof(float);
-
-// static const float kMinExp = -87.f; // 0
-// static const float kMaxExp = 87.f; // 1
-
-// const HVX_Vector one = hvx_vec_splat_fp32(1.f);
-// const HVX_Vector max_exp = hvx_vec_splat_fp32(kMaxExp);
-// const HVX_Vector min_exp = hvx_vec_splat_fp32(kMinExp);
-
-// const float *input = (float *)src;
-// float *output = (float *)dst;
-
-// HVX_UVector * input_v_ptr = (HVX_UVector *) input;
-// HVX_UVector * output_v_ptr = (HVX_UVector *) output;
-
-// // #pragma unroll(4) NOTE: this actual got slower
-// for(uint32_t i = step_of_1; i> 0; i--){
-// *((HVX_UVector *)(output_v_ptr++)) = hvx_vec_fast_sigmoid_fp32_guard(*(input_v_ptr++), one, max_exp, min_exp);
-
-// }
-
-
-// if(leftover> 0){
-
-
-// HVX_Vector sout = hvx_vec_fast_sigmoid_fp32_guard(*(input_v_ptr++), one, max_exp, min_exp);
-// /* Store output */
-// hvx_vec_store_u(output_v_ptr, leftover_size, sout);
-// }
-
-
-// }
-
 float hvx_sum_of_squares_f32(const uint8_t * restrict src, const int num_elems);
 void hvx_mul_f32(const uint8_t * restrict src0,
                  const uint8_t * restrict src1,