diff --git a/ggml/src/ggml-hexagon/htp/hvx-utils.c b/ggml/src/ggml-hexagon/htp/hvx-utils.c index d6e928c96f..b0099991cd 100644 --- a/ggml/src/ggml-hexagon/htp/hvx-utils.c +++ b/ggml/src/ggml-hexagon/htp/hvx-utils.c @@ -49,6 +49,8 @@ void hvx_mul_f32(const uint8_t * restrict src0, //FARF(HIGH, "hvx_mul_f32: unaligned loop in hvx op, possibly slower execution\n"); } + + bool handled_leftover = false; if (0 == unaligned_loop) { HVX_Vector * restrict vec_in1 = (HVX_Vector *) src0; HVX_Vector * restrict vec_in2 = (HVX_Vector *) src1; @@ -60,18 +62,88 @@ void hvx_mul_f32(const uint8_t * restrict src0, *vec_out++ = Q6_Vsf_equals_Vqf32(v); } } else { + // #pragma unroll(4) + // for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { + // HVX_Vector in1 = *(HVX_UVector *) (src0 + i * SIZEOF_FP32); + // HVX_Vector in2 = *(HVX_UVector *) (src1 + i * SIZEOF_FP32); + + // HVX_Vector out = Q6_Vqf32_vmpy_VsfVsf(in1, in2); + + // *(HVX_UVector *) (dst + i * SIZEOF_FP32) = Q6_Vsf_equals_Vqf32(out); + // } + + int step_of_1 = num_elems_whole >> 5; // divby 32, because 32 float = 128 bytes per HVX vector + int leftover_size = left_over * sizeof(float); + + + HVX_Vector * restrict vec_in1 = (HVX_Vector *) src0; + HVX_Vector * restrict vec_in2 = (HVX_Vector *) src1; + HVX_UVector * restrict vec_out = (HVX_UVector *) dst; + + + HVX_Vector slinep; + HVX_Vector slinec; + HVX_Vector sline; + HVX_Vector sline2p; + HVX_Vector sline2c; + HVX_Vector sline2; + + slinep = *vec_in1++; + sline2p = *vec_in2++; #pragma unroll(4) - for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { - HVX_Vector in1 = *(HVX_UVector *) (src0 + i * SIZEOF_FP32); - HVX_Vector in2 = *(HVX_UVector *) (src1 + i * SIZEOF_FP32); + for(uint32_t i = step_of_1 -1; i> 0; i--){ + slinec = *vec_in1++; + sline2c = *vec_in2++; + sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src0); + sline2 = Q6_V_valign_VVR(sline2c, sline2p, (size_t) src1); + + *((HVX_UVector *)(vec_out++)) =Q6_Vsf_equals_Vqf32( Q6_Vqf32_vmpy_VsfVsf(sline, sline2)); + /* Prepare slinep for next iteration */ + slinep = slinec; + sline2p = sline2c; + } + if(step_of_1 > 1){ + slinec = htp_is_aligned(vec_in1, VLEN) && left_over == 0 ? slinep : *vec_in1++; + sline2c = htp_is_aligned(vec_in2, VLEN) && left_over == 0 ? sline2p : *vec_in2++; - HVX_Vector out = Q6_Vqf32_vmpy_VsfVsf(in1, in2); + sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src0); + sline2 = Q6_V_valign_VVR(sline2c, sline2p, (size_t) src1); + *((HVX_UVector *)(vec_out++)) =Q6_Vsf_equals_Vqf32( Q6_Vqf32_vmpy_VsfVsf(sline, sline2)); + /* Prepare slinep for next iteration */ + slinep = slinec; + sline2p = sline2c; + } + if(left_over > 0 ){ - *(HVX_UVector *) (dst + i * SIZEOF_FP32) = Q6_Vsf_equals_Vqf32(out); + slinec = (is_in_one_chunk(vec_in1, leftover_size, VLEN) + ? slinep + : *vec_in1++); + + sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src0); + sline2c = (is_in_one_chunk(vec_in2, leftover_size, VLEN) + ? sline2p + : *vec_in2++); + sline2 = Q6_V_valign_VVR(sline2c, sline2p, (size_t) src1); + + HVX_Vector out = Q6_Vqf32_vmpy_VsfVsf(sline, sline2); + hvx_vec_store_u(vec_out, leftover_size, Q6_Vsf_equals_Vqf32(out)); + handled_leftover = true; } } - if (left_over > 0) { + // if (left_over > 0 ) { + // const float * src0f = (const float *) src0 + num_elems_whole; + // const float * src1f = (const float *) src1 + num_elems_whole; + // float * dstf = (float *) dst + num_elems_whole; + + // HVX_Vector in1 = *(HVX_UVector *) src0f; + // HVX_Vector in2 = *(HVX_UVector *) src1f; + + // HVX_Vector out = Q6_Vqf32_vmpy_VsfVsf(in1, in2); + // hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, Q6_Vsf_equals_Vqf32(out)); + // } + + if (left_over > 0 && !handled_leftover) { const float * src0f = (const float *) src0 + num_elems_whole; const float * src1f = (const float *) src1 + num_elems_whole; float * dstf = (float *) dst + num_elems_whole; @@ -464,7 +536,7 @@ void hvx_mul_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * } HVX_Vector val_vec = hvx_vec_splat_fp32(val); - + bool handled_leftover = false; if (0 == unaligned_loop) { HVX_Vector * restrict vec_in1 = (HVX_Vector *) src; HVX_Vector * restrict vec_out = (HVX_Vector *) dst; @@ -475,17 +547,73 @@ void hvx_mul_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * *vec_out++ = Q6_Vsf_equals_Vqf32(v); } } else { + // #pragma unroll(4) + // for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { + // HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32); + + // HVX_Vector out = Q6_Vqf32_vmpy_VsfVsf(in, val_vec); + + // *(HVX_UVector *) (dst + i * SIZEOF_FP32) = Q6_Vsf_equals_Vqf32(out); + // } + + int step_of_1 = num_elems >> 5; // divby 32, because 32 float = 128 bytes per HVX vector + int leftover_size = left_over * sizeof(float); + + + + HVX_Vector * input_v_ptr = (HVX_Vector *) src; + HVX_UVector * output_v_ptr = (HVX_UVector *) dst; + + + HVX_Vector slinep; + HVX_Vector slinec; + HVX_Vector sline; + + slinep = *input_v_ptr++; + #pragma unroll(4) - for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { - HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32); + for(uint32_t i = step_of_1 - 1; i > 0; i--){ + slinec = *input_v_ptr++; + sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src); + *((HVX_UVector *)(output_v_ptr++)) = Q6_Vsf_equals_Vqf32( Q6_Vqf32_vmpy_VsfVsf(sline, val_vec)); + /* Prepare slinep for next iteration */ + slinep = slinec; + } - HVX_Vector out = Q6_Vqf32_vmpy_VsfVsf(in, val_vec); + if(step_of_1 > 0){ - *(HVX_UVector *) (dst + i * SIZEOF_FP32) = Q6_Vsf_equals_Vqf32(out); + slinec = htp_is_aligned(input_v_ptr, VLEN) && left_over == 0 ? slinep : *input_v_ptr++; + sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src); + *((HVX_UVector *)(output_v_ptr++)) = Q6_Vsf_equals_Vqf32( Q6_Vqf32_vmpy_VsfVsf(sline, val_vec)); + + slinep = slinec; + } + + if(leftover_size > 0){ + slinec = (is_in_one_chunk(input_v_ptr, leftover_size, VLEN) + ? slinep + : *input_v_ptr++); + + sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src); + + HVX_Vector sout = Q6_Vsf_equals_Vqf32( Q6_Vqf32_vmpy_VsfVsf(sline, val_vec)); + /* Store output */ + hvx_vec_store_u(output_v_ptr, leftover_size, sout); + handled_leftover = true; } } - if (left_over > 0) { + // if (left_over > 0 ) { + // const float * srcf = (const float *) src + num_elems_whole; + // float * dstf = (float *) dst + num_elems_whole; + + // HVX_Vector in = *(HVX_UVector *) srcf; + + // HVX_Vector out = Q6_Vqf32_vmpy_VsfVsf(in, val_vec); + // hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, Q6_Vsf_equals_Vqf32(out)); + // } + + if (left_over > 0 && !handled_leftover) { const float * srcf = (const float *) src + num_elems_whole; float * dstf = (float *) dst + num_elems_whole;