#pragma clang diagnostic ignored "-Wunused-variable"
#pragma clang diagnostic ignored "-Wunused-function"
#pragma clang diagnostic ignored "-Wunused-but-set-variable"

#ifdef HTP_DEBUG
#    define FARF_HIGH 1
#endif

#include <HAP_farf.h>
#include <hexagon_protos.h>
#include <hexagon_types.h>
#include <assert.h>
#include <math.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

#define GGML_COMMON_DECL_C
#include "ggml-common.h"

#include "hvx-utils.h"

// Shared preamble for the unrolled binary ops: split num_elems into blocks of
// 4, 2 and 1 HVX vectors (128, 64 and 32 fp32 values) plus a sub-vector remainder.
#define htp_binary_ops_preamble                                                                   \
    int step_of_4 = num_elems >> 7;                                                              \
    int step_of_2 = (num_elems - step_of_4 * VLEN_FP32 * 4) >> 6;                                \
    int step_of_1 = (num_elems - step_of_4 * VLEN_FP32 * 4 - step_of_2 * VLEN_FP32 * 2) >> 5;    \
    int remaining = num_elems - step_of_4 * VLEN_FP32 * 4 - step_of_2 * VLEN_FP32 * 2 - step_of_1 * VLEN_FP32; \
                                                                                                  \
    const uint8_t * restrict src0_curr = src0;                                                    \
    const uint8_t * restrict src1_curr = src1;                                                    \
    uint8_t * restrict dst_curr        = dst;

void hvx_mul_f32(const uint8_t * restrict src0, const uint8_t * restrict src1, uint8_t * restrict dst, const int num_elems) {
    int left_over       = num_elems & (VLEN_FP32 - 1);
    int num_elems_whole = num_elems - left_over;

    int unaligned_addr = 0;
    int unaligned_loop = 0;
    if ((0 == htp_is_aligned((void *) src0, VLEN)) || (0 == htp_is_aligned((void *) src1, VLEN)) ||
        (0 == htp_is_aligned((void *) dst, VLEN))) {
        FARF(HIGH, "hvx_mul_f32: unaligned address in hvx op, possibly slower execution\n");
        unaligned_addr = 1;
    }

    if ((1 == unaligned_addr) && (num_elems_whole != 0)) {
        unaligned_loop = 1;
        FARF(HIGH, "hvx_mul_f32: unaligned loop in hvx op, possibly slower execution\n");
    }

    bool handled_leftover = false;

    if (0 == unaligned_loop) {
        HVX_Vector * restrict vec_in1 = (HVX_Vector *) src0;
        HVX_Vector * restrict vec_in2 = (HVX_Vector *) src1;
        HVX_Vector * restrict vec_out = (HVX_Vector *) dst;

#pragma unroll(4)
        for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
            HVX_Vector v = Q6_Vqf32_vmpy_VsfVsf(*vec_in1++, *vec_in2++);
            *vec_out++   = Q6_Vsf_equals_Vqf32(v);
        }
    } else {
        // Unaligned path: do aligned vector loads and realign the data with
        // Q6_V_valign_VVR using the source pointer's offset within a vector.
        int step_of_1     = num_elems_whole >> 5;  // divby 32, because 32 float = 128 bytes per HVX vector
        int leftover_size = left_over * sizeof(float);

        HVX_Vector * restrict  vec_in1 = (HVX_Vector *) src0;
        HVX_Vector * restrict  vec_in2 = (HVX_Vector *) src1;
        HVX_UVector * restrict vec_out = (HVX_UVector *) dst;

        HVX_Vector slinep;
        HVX_Vector slinec;
        HVX_Vector sline;

        HVX_Vector sline2p;
        HVX_Vector sline2c;
        HVX_Vector sline2;

        slinep  = *vec_in1++;
        sline2p = *vec_in2++;

#pragma unroll(4)
        for (int i = step_of_1 - 1; i > 0; i--) {
            slinec  = *vec_in1++;
            sline2c = *vec_in2++;

            sline  = Q6_V_valign_VVR(slinec, slinep, (size_t) src0);
            sline2 = Q6_V_valign_VVR(sline2c, sline2p, (size_t) src1);

            *((HVX_UVector *) (vec_out++)) = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(sline, sline2));

            slinep  = slinec;
            sline2p = sline2c;
        }

        if (step_of_1 > 1) {
            slinec  = htp_is_aligned(vec_in1, VLEN) && left_over == 0 ? slinep : *vec_in1++;
            sline2c = htp_is_aligned(vec_in2, VLEN) && left_over == 0 ? sline2p : *vec_in2++;

            sline  = Q6_V_valign_VVR(slinec, slinep, (size_t) src0);
            sline2 = Q6_V_valign_VVR(sline2c, sline2p, (size_t) src1);

            *((HVX_UVector *) (vec_out++)) = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(sline, sline2));

            slinep  = slinec;
            sline2p = sline2c;
        }

        if (left_over > 0) {
            slinec = (is_in_one_chunk(vec_in1, leftover_size, VLEN) ? slinep : *vec_in1++);
            sline  = Q6_V_valign_VVR(slinec, slinep, (size_t) src0);

            sline2c = (is_in_one_chunk(vec_in2, leftover_size, VLEN) ?
                           sline2p : *vec_in2++);
            sline2  = Q6_V_valign_VVR(sline2c, sline2p, (size_t) src1);

            HVX_Vector out = Q6_Vqf32_vmpy_VsfVsf(sline, sline2);
            hvx_vec_store_u(vec_out, leftover_size, Q6_Vsf_equals_Vqf32(out));

            handled_leftover = true;
        }
    }

    if (left_over > 0 && !handled_leftover) {
        const float * src0f = (const float *) src0 + num_elems_whole;
        const float * src1f = (const float *) src1 + num_elems_whole;
        float *       dstf  = (float *) dst + num_elems_whole;

        HVX_Vector in1 = *(HVX_UVector *) src0f;
        HVX_Vector in2 = *(HVX_UVector *) src1f;
        HVX_Vector out = Q6_Vqf32_vmpy_VsfVsf(in1, in2);

        hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, Q6_Vsf_equals_Vqf32(out));
    }
}

// Optimized variant: assumes VLEN-aligned src0/src1/dst and uses a software-pipelined
// 4x/2x/1x vector schedule; only the sub-vector remainder uses an unaligned store.
void hvx_mul_f32_opt(const uint8_t * restrict src0, const uint8_t * restrict src1, uint8_t * restrict dst, const int num_elems) {
    htp_binary_ops_preamble;

    for (int i = 0; i < step_of_4; i++) {
        HVX_Vector v1a = *(HVX_Vector *) src0_curr;
        HVX_Vector v1b = *(HVX_Vector *) src1_curr;

        HVX_Vector v2a = *(HVX_Vector *) (src0_curr + VLEN);
        HVX_Vector v1  = Q6_Vqf32_vmpy_VsfVsf(v1a, v1b);
        HVX_Vector v2b = *(HVX_Vector *) (src1_curr + VLEN);

        HVX_Vector v3a = *(HVX_Vector *) (src0_curr + 2 * VLEN);
        HVX_Vector v2  = Q6_Vqf32_vmpy_VsfVsf(v2a, v2b);

        *(HVX_Vector *) dst_curr = Q6_Vsf_equals_Vqf32(v1);

        HVX_Vector v3b = *(HVX_Vector *) (src1_curr + 2 * VLEN);
        HVX_Vector v4a = *(HVX_Vector *) (src0_curr + 3 * VLEN);

        src0_curr += 4 * VLEN;

        HVX_Vector v3 = Q6_Vqf32_vmpy_VsfVsf(v3a, v3b);

        *(HVX_Vector *) (dst_curr + VLEN) = Q6_Vsf_equals_Vqf32(v2);

        HVX_Vector v4b = *(HVX_Vector *) (src1_curr + 3 * VLEN);

        *(HVX_Vector *) (dst_curr + 2 * VLEN) = Q6_Vsf_equals_Vqf32(v3);

        HVX_Vector v4 = Q6_Vqf32_vmpy_VsfVsf(v4a, v4b);

        src1_curr += 4 * VLEN;

        *(HVX_Vector *) (dst_curr + 3 * VLEN) = Q6_Vsf_equals_Vqf32(v4);

        dst_curr += 4 * VLEN;
    }

    for (int i = 0; i < step_of_2; i++) {
        HVX_Vector v1a = *(HVX_Vector *) src0_curr;
        HVX_Vector v1b = *(HVX_Vector *) src1_curr;

        HVX_Vector v2a = *(HVX_Vector *) (src0_curr + VLEN);
        HVX_Vector v1  = Q6_Vqf32_vmpy_VsfVsf(v1a, v1b);
        HVX_Vector v2b = *(HVX_Vector *) (src1_curr + VLEN);

        *(HVX_Vector *) dst_curr = Q6_Vsf_equals_Vqf32(v1);

        src0_curr += 2 * VLEN;

        HVX_Vector v2 = Q6_Vqf32_vmpy_VsfVsf(v2a, v2b);

        src1_curr += 2 * VLEN;

        *(HVX_Vector *) (dst_curr + VLEN) = Q6_Vsf_equals_Vqf32(v2);

        dst_curr += 2 * VLEN;
    }

    for (int i = 0; i < step_of_1; i++) {
        HVX_Vector va = *(HVX_Vector *) src0_curr;
        src0_curr += VLEN;

        HVX_Vector vb = *(HVX_Vector *) src1_curr;
        src1_curr += VLEN;

        HVX_Vector v = Q6_Vqf32_vmpy_VsfVsf(va, vb);

        *(HVX_Vector *) dst_curr = Q6_Vsf_equals_Vqf32(v);
        dst_curr += VLEN;
    }

    if (remaining > 0) {
        HVX_Vector v = Q6_Vqf32_vmpy_VsfVsf(*(HVX_Vector *) src0_curr, *(HVX_Vector *) src1_curr);
        hvx_vec_store_u((void *) dst_curr, remaining * SIZEOF_FP32, Q6_Vsf_equals_Vqf32(v));
    }
}

// dst = src0 * src1 * src2 (elementwise); aligned pointers assumed, as in the other _opt ops.
void hvx_mul_mul_f32_opt(const uint8_t * restrict src0, const uint8_t * restrict src1, const uint8_t * restrict src2, uint8_t * restrict dst, const int num_elems) {
    const uint8_t * restrict src0_curr = src0;
    const uint8_t * restrict src1_curr = src1;
    const uint8_t * restrict src2_curr = src2;
    uint8_t * restrict       dst_curr  = dst;

    int step_of_2 = num_elems >> 6;
    int step_of_1 = (num_elems - step_of_2 * VLEN_FP32 * 2) >> 5;
    int remaining = num_elems - step_of_2 * VLEN_FP32 * 2 - step_of_1 * VLEN_FP32;

    for (int i = 0; i < step_of_2; i++) {
        HVX_Vector v1a = *(HVX_Vector *) src0_curr;
        HVX_Vector v1b = *(HVX_Vector *) src1_curr;
        HVX_Vector v1c = *(HVX_Vector *) src2_curr;

        HVX_Vector v2a = *(HVX_Vector *) (src0_curr + VLEN);
        HVX_Vector v1_ = Q6_Vqf32_vmpy_VsfVsf(v1a, v1b);
        HVX_Vector v1  = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(v1_), v1c);
        HVX_Vector v2b = *(HVX_Vector *) (src1_curr + VLEN);

        *(HVX_Vector *) dst_curr = Q6_Vsf_equals_Vqf32(v1);

        HVX_Vector v2c = *(HVX_Vector *) (src2_curr + VLEN);

        src0_curr += 2 * VLEN;

        HVX_Vector v2_ = Q6_Vqf32_vmpy_VsfVsf(v2a, v2b);
        HVX_Vector v2  = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(v2_), v2c);

        src1_curr += 2 * VLEN;
        src2_curr += 2 * VLEN;

        *(HVX_Vector *) (dst_curr + VLEN) = Q6_Vsf_equals_Vqf32(v2);

        dst_curr += 2 * VLEN;
    }

    for (int i = 0; i < step_of_1; i++) {
        HVX_Vector va = *(HVX_Vector *) src0_curr;
        src0_curr += VLEN;

        HVX_Vector vb = *(HVX_Vector *) src1_curr;
        src1_curr += VLEN;

        HVX_Vector vc = *(HVX_Vector *) src2_curr;
        src2_curr += VLEN;

        HVX_Vector v1 = Q6_Vqf32_vmpy_VsfVsf(va, vb);
        HVX_Vector v2 = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(v1), vc);

        *(HVX_Vector *) dst_curr = Q6_Vsf_equals_Vqf32(v2);
        dst_curr += VLEN;
    }

    if (remaining > 0) {
        HVX_Vector v1 = Q6_Vqf32_vmpy_VsfVsf(*(HVX_Vector *) src0_curr, *(HVX_Vector *) src1_curr);
        HVX_Vector v2 = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(v1), *(HVX_Vector *) src2_curr);
        hvx_vec_store_u((void *) dst_curr, remaining * SIZEOF_FP32, Q6_Vsf_equals_Vqf32(v2));
    }
}

void hvx_add_f32(const uint8_t * restrict src0, const uint8_t * restrict src1, uint8_t * restrict dst, const int num_elems) {
    int left_over       = num_elems & (VLEN_FP32 - 1);
    int num_elems_whole = num_elems - left_over;

    int unaligned_addr = 0;
    int unaligned_loop = 0;
    if ((0 == htp_is_aligned((void *) src0, VLEN)) || (0 == htp_is_aligned((void *) src1, VLEN)) ||
        (0 == htp_is_aligned((void *) dst, VLEN))) {
        FARF(HIGH, "hvx_add_f32: unaligned address in hvx op, possibly slower execution\n");
        unaligned_addr = 1;
    }

    if ((1 == unaligned_addr) && (num_elems_whole != 0)) {
        unaligned_loop = 1;
        FARF(HIGH, "hvx_add_f32: unaligned loop in hvx op, possibly slower execution\n");
    }

    if (0 == unaligned_loop) {
        HVX_Vector * restrict vec_in1 = (HVX_Vector *) src0;
        HVX_Vector * restrict vec_in2 = (HVX_Vector *) src1;
        HVX_Vector * restrict vec_out = (HVX_Vector *) dst;

#pragma unroll(4)
        for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
            HVX_Vector v = Q6_Vqf32_vadd_VsfVsf(*vec_in1++, *vec_in2++);
            *vec_out++   = Q6_Vsf_equals_Vqf32(v);
        }
    } else {
#pragma unroll(4)
        for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
            HVX_Vector in1 = *(HVX_UVector *) (src0 + i * SIZEOF_FP32);
            HVX_Vector in2 = *(HVX_UVector *) (src1 + i * SIZEOF_FP32);
            HVX_Vector out = Q6_Vqf32_vadd_VsfVsf(in1, in2);

            *(HVX_UVector *) (dst + i * SIZEOF_FP32) = Q6_Vsf_equals_Vqf32(out);
        }
    }

    if (left_over > 0) {
        const float * src0f = (const float *) src0 + num_elems_whole;
        const float * src1f = (const float *) src1 + num_elems_whole;
        float *       dstf  = (float *) dst + num_elems_whole;

        HVX_Vector in1 = *(HVX_UVector *) src0f;
        HVX_Vector in2 = *(HVX_UVector *) src1f;
        HVX_Vector out = Q6_Vqf32_vadd_VsfVsf(in1, in2);

        hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, Q6_Vsf_equals_Vqf32(out));
    }
}

void hvx_add_f32_opt(const uint8_t * restrict src0, const uint8_t * restrict src1, uint8_t * restrict dst, const int num_elems) {
    htp_binary_ops_preamble;

    for (int i = 0; i < step_of_4; i++) {
        HVX_Vector v1a = *(HVX_Vector *) src0_curr;
        HVX_Vector v1b = *(HVX_Vector *) src1_curr;

        HVX_Vector v2a = *(HVX_Vector *) (src0_curr + VLEN);
        HVX_Vector v1  = Q6_Vqf32_vadd_VsfVsf(v1a, v1b);
        HVX_Vector v2b = *(HVX_Vector *) (src1_curr + VLEN);

        HVX_Vector v3a = *(HVX_Vector *) (src0_curr + 2 * VLEN);
        HVX_Vector v2  = Q6_Vqf32_vadd_VsfVsf(v2a, v2b);

        *(HVX_Vector *) dst_curr =
            Q6_Vsf_equals_Vqf32(v1);

        HVX_Vector v3b = *(HVX_Vector *) (src1_curr + 2 * VLEN);
        HVX_Vector v4a = *(HVX_Vector *) (src0_curr + 3 * VLEN);

        src0_curr += 4 * VLEN;

        HVX_Vector v3 = Q6_Vqf32_vadd_VsfVsf(v3a, v3b);

        *(HVX_Vector *) (dst_curr + VLEN) = Q6_Vsf_equals_Vqf32(v2);

        HVX_Vector v4b = *(HVX_Vector *) (src1_curr + 3 * VLEN);

        *(HVX_Vector *) (dst_curr + 2 * VLEN) = Q6_Vsf_equals_Vqf32(v3);

        HVX_Vector v4 = Q6_Vqf32_vadd_VsfVsf(v4a, v4b);

        src1_curr += 4 * VLEN;

        *(HVX_Vector *) (dst_curr + 3 * VLEN) = Q6_Vsf_equals_Vqf32(v4);

        dst_curr += 4 * VLEN;
    }

    for (int i = 0; i < step_of_2; i++) {
        HVX_Vector v1a = *(HVX_Vector *) src0_curr;
        HVX_Vector v1b = *(HVX_Vector *) src1_curr;

        HVX_Vector v2a = *(HVX_Vector *) (src0_curr + VLEN);
        HVX_Vector v1  = Q6_Vqf32_vadd_VsfVsf(v1a, v1b);
        HVX_Vector v2b = *(HVX_Vector *) (src1_curr + VLEN);

        *(HVX_Vector *) dst_curr = Q6_Vsf_equals_Vqf32(v1);

        src0_curr += 2 * VLEN;

        HVX_Vector v2 = Q6_Vqf32_vadd_VsfVsf(v2a, v2b);

        src1_curr += 2 * VLEN;

        *(HVX_Vector *) (dst_curr + VLEN) = Q6_Vsf_equals_Vqf32(v2);

        dst_curr += 2 * VLEN;
    }

    for (int i = 0; i < step_of_1; i++) {
        HVX_Vector va = *(HVX_Vector *) src0_curr;
        src0_curr += VLEN;

        HVX_Vector vb = *(HVX_Vector *) src1_curr;
        src1_curr += VLEN;

        HVX_Vector v = Q6_Vqf32_vadd_VsfVsf(va, vb);

        *(HVX_Vector *) dst_curr = Q6_Vsf_equals_Vqf32(v);
        dst_curr += VLEN;
    }

    if (remaining > 0) {
        HVX_Vector v = Q6_Vqf32_vadd_VsfVsf(*(HVX_Vector *) src0_curr, *(HVX_Vector *) src1_curr);
        hvx_vec_store_u((void *) dst_curr, remaining * SIZEOF_FP32, Q6_Vsf_equals_Vqf32(v));
    }
}

void hvx_add_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * restrict dst, const int num_elems) {
    size_t left_over       = num_elems & (VLEN_FP32 - 1);
    size_t num_elems_whole = num_elems - left_over;

    int unaligned_addr = 0;
    int unaligned_loop = 0;
    if ((0 == htp_is_aligned((void *) src, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) {
        FARF(HIGH, "hvx_add_scalar_f32: unaligned address in hvx op, possibly slower execution\n");
        unaligned_addr = 1;
    }

    if ((1 == unaligned_addr) && (num_elems_whole != 0)) {
        unaligned_loop = 1;
        FARF(HIGH, "hvx_add_scalar_f32: unaligned loop in hvx op, possibly slower execution\n");
    }

    static const float kInf = INFINITY;

    const HVX_Vector inf     = hvx_vec_splat_fp32(kInf);
    HVX_Vector       val_vec = hvx_vec_splat_fp32(val);

    if (0 == unaligned_loop) {
        HVX_Vector * restrict vec_in1 = (HVX_Vector *) src;
        HVX_Vector * restrict vec_out = (HVX_Vector *) dst;

#pragma unroll(4)
        for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
            HVX_Vector in = *vec_in1++;

            const HVX_VectorPred pred_inf = Q6_Q_vcmp_eq_VwVw(inf, in);

            HVX_Vector v = Q6_Vqf32_vadd_VsfVsf(in, val_vec);
            v            = Q6_Vsf_equals_Vqf32(v);
            v            = Q6_V_vmux_QVV(pred_inf, inf, v);

            *vec_out++ = v;
        }
    } else {
#pragma unroll(4)
        for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
            HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32);

            const HVX_VectorPred pred_inf = Q6_Q_vcmp_eq_VwVw(inf, in);

            HVX_Vector out = Q6_Vqf32_vadd_VsfVsf(in, val_vec);
            out            = Q6_Vsf_equals_Vqf32(out);
            out            = Q6_V_vmux_QVV(pred_inf, inf, out);

            *(HVX_UVector *) (dst + i * SIZEOF_FP32) = out;
        }
    }

    if (left_over > 0) {
        const float * srcf = (const float *) src + num_elems_whole;
        float *       dstf = (float *) dst + num_elems_whole;

        HVX_Vector in = *(HVX_UVector *) srcf;

        const HVX_VectorPred pred_inf = Q6_Q_vcmp_eq_VwVw(inf, in);

        HVX_Vector out = Q6_Vqf32_vadd_VsfVsf(in, val_vec);
        out            = Q6_Vsf_equals_Vqf32(out);
        out            = Q6_V_vmux_QVV(pred_inf, inf, out);

        hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, out);
    }
}

void
hvx_mul_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * restrict dst, const int num_elems) {
    size_t left_over       = num_elems & (VLEN_FP32 - 1);
    size_t num_elems_whole = num_elems - left_over;

    int unaligned_addr = 0;
    int unaligned_loop = 0;
    if ((0 == htp_is_aligned((void *) src, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) {
        FARF(HIGH, "hvx_mul_scalar_f32: unaligned address in hvx op, possibly slower execution\n");
        unaligned_addr = 1;
    }

    if ((1 == unaligned_addr) && (num_elems_whole != 0)) {
        unaligned_loop = 1;
        FARF(HIGH, "hvx_mul_scalar_f32: unaligned loop in hvx op, possibly slower execution\n");
    }

    HVX_Vector val_vec = hvx_vec_splat_fp32(val);

    bool handled_leftover = false;

    if (0 == unaligned_loop) {
        HVX_Vector * restrict vec_in1 = (HVX_Vector *) src;
        HVX_Vector * restrict vec_out = (HVX_Vector *) dst;

#pragma unroll(4)
        for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
            HVX_Vector v = Q6_Vqf32_vmpy_VsfVsf(*vec_in1++, val_vec);
            *vec_out++   = Q6_Vsf_equals_Vqf32(v);
        }
    } else {
        int step_of_1     = num_elems >> 5;  // divby 32, because 32 float = 128 bytes per HVX vector
        int leftover_size = left_over * sizeof(float);

        HVX_Vector *  input_v_ptr  = (HVX_Vector *) src;
        HVX_UVector * output_v_ptr = (HVX_UVector *) dst;

        HVX_Vector slinep;
        HVX_Vector slinec;
        HVX_Vector sline;

        slinep = *input_v_ptr++;

#pragma unroll(4)
        for (int i = step_of_1 - 1; i > 0; i--) {
            slinec = *input_v_ptr++;
            sline  = Q6_V_valign_VVR(slinec, slinep, (size_t) src);

            *((HVX_UVector *) (output_v_ptr++)) = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(sline, val_vec));

            /* Prepare slinep for next iteration */
            slinep = slinec;
        }

        if (step_of_1 > 0) {
            slinec = htp_is_aligned(input_v_ptr, VLEN) && left_over == 0 ? slinep : *input_v_ptr++;
            sline  = Q6_V_valign_VVR(slinec, slinep, (size_t) src);

            *((HVX_UVector *) (output_v_ptr++)) = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(sline, val_vec));

            slinep = slinec;
        }

        if (leftover_size > 0) {
            slinec = (is_in_one_chunk(input_v_ptr, leftover_size, VLEN) ?
                          slinep : *input_v_ptr++);
            sline  = Q6_V_valign_VVR(slinec, slinep, (size_t) src);

            HVX_Vector sout = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(sline, val_vec));
            hvx_vec_store_u(output_v_ptr, leftover_size, sout);

            handled_leftover = true;
        }
    }

    if (left_over > 0 && !handled_leftover) {
        const float * srcf = (const float *) src + num_elems_whole;
        float *       dstf = (float *) dst + num_elems_whole;

        HVX_Vector in  = *(HVX_UVector *) srcf;
        HVX_Vector out = Q6_Vqf32_vmpy_VsfVsf(in, val_vec);

        hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, Q6_Vsf_equals_Vqf32(out));
    }
}

void hvx_sub_f32(const uint8_t * restrict src0, const uint8_t * restrict src1, uint8_t * restrict dst, const int num_elems) {
    size_t left_over       = num_elems & (VLEN_FP32 - 1);
    size_t num_elems_whole = num_elems - left_over;

    int unaligned_addr = 0;
    int unaligned_loop = 0;
    if ((0 == htp_is_aligned((void *) src0, VLEN)) || (0 == htp_is_aligned((void *) src1, VLEN)) ||
        (0 == htp_is_aligned((void *) dst, VLEN))) {
        FARF(HIGH, "hvx_sub_f32: unaligned address in hvx op, possibly slower execution\n");
        unaligned_addr = 1;
    }

    if ((1 == unaligned_addr) && (num_elems_whole != 0)) {
        unaligned_loop = 1;
        FARF(HIGH, "hvx_sub_f32: unaligned loop in hvx op, possibly slower execution\n");
    }

    if (0 == unaligned_loop) {
        HVX_Vector * restrict vec_in1 = (HVX_Vector *) src0;
        HVX_Vector * restrict vec_in2 = (HVX_Vector *) src1;
        HVX_Vector * restrict vec_out = (HVX_Vector *) dst;

#pragma unroll(4)
        for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
            HVX_Vector v = Q6_Vqf32_vsub_VsfVsf(*vec_in1++, *vec_in2++);
            *vec_out++   = Q6_Vsf_equals_Vqf32(v);
        }
    } else {
#pragma unroll(4)
        for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
            HVX_Vector in1 = *(HVX_UVector *) (src0 + i * SIZEOF_FP32);
            HVX_Vector in2 = *(HVX_UVector *) (src1 + i * SIZEOF_FP32);
            HVX_Vector out = Q6_Vqf32_vsub_VsfVsf(in1, in2);

            *(HVX_UVector *) (dst + i * SIZEOF_FP32) = Q6_Vsf_equals_Vqf32(out);
        }
    }

    if (left_over > 0) {
        const float * src0f = (const float *) src0 + num_elems_whole;
        const float * src1f = (const float *) src1 + num_elems_whole;
        float *       dstf  = (float *) dst + num_elems_whole;

        HVX_Vector in1 = *(HVX_UVector *) src0f;
        HVX_Vector in2 = *(HVX_UVector *) src1f;
        HVX_Vector out = Q6_Vqf32_vsub_VsfVsf(in1, in2);

        hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, Q6_Vsf_equals_Vqf32(out));
    }
}

void hvx_sub_f32_opt(const uint8_t * restrict src0, const uint8_t * restrict src1, uint8_t * restrict dst, const int num_elems) {
    htp_binary_ops_preamble;

    for (int i = 0; i < step_of_4; i++) {
        HVX_Vector v1a = *(HVX_Vector *) src0_curr;
        HVX_Vector v1b = *(HVX_Vector *) src1_curr;

        HVX_Vector v2a = *(HVX_Vector *) (src0_curr + VLEN);
        HVX_Vector v1  = Q6_Vqf32_vsub_VsfVsf(v1a, v1b);
        HVX_Vector v2b = *(HVX_Vector *) (src1_curr + VLEN);

        HVX_Vector v3a = *(HVX_Vector *) (src0_curr + 2 * VLEN);
        HVX_Vector v2  = Q6_Vqf32_vsub_VsfVsf(v2a, v2b);

        *(HVX_Vector *) dst_curr = Q6_Vsf_equals_Vqf32(v1);

        HVX_Vector v3b = *(HVX_Vector *) (src1_curr + 2 * VLEN);
        HVX_Vector v4a = *(HVX_Vector *) (src0_curr + 3 * VLEN);

        src0_curr += 4 * VLEN;

        HVX_Vector v3 = Q6_Vqf32_vsub_VsfVsf(v3a, v3b);

        *(HVX_Vector *) (dst_curr + VLEN) = Q6_Vsf_equals_Vqf32(v2);

        HVX_Vector v4b = *(HVX_Vector *) (src1_curr + 3 * VLEN);

        *(HVX_Vector *) (dst_curr + 2 * VLEN) = Q6_Vsf_equals_Vqf32(v3);

        HVX_Vector v4 = Q6_Vqf32_vsub_VsfVsf(v4a, v4b);

        src1_curr += 4 * VLEN;

        *(HVX_Vector *) (dst_curr + 3 * VLEN) = Q6_Vsf_equals_Vqf32(v4);

        dst_curr += 4 * VLEN;
    }

    for (int i = 0; i < step_of_2; i++) {
        HVX_Vector v1a = *(HVX_Vector *) src0_curr;
        HVX_Vector v1b
            = *(HVX_Vector *) src1_curr;

        HVX_Vector v2a = *(HVX_Vector *) (src0_curr + VLEN);
        HVX_Vector v1  = Q6_Vqf32_vsub_VsfVsf(v1a, v1b);
        HVX_Vector v2b = *(HVX_Vector *) (src1_curr + VLEN);

        *(HVX_Vector *) dst_curr = Q6_Vsf_equals_Vqf32(v1);

        src0_curr += 2 * VLEN;

        HVX_Vector v2 = Q6_Vqf32_vsub_VsfVsf(v2a, v2b);

        src1_curr += 2 * VLEN;

        *(HVX_Vector *) (dst_curr + VLEN) = Q6_Vsf_equals_Vqf32(v2);

        dst_curr += 2 * VLEN;
    }

    for (int i = 0; i < step_of_1; i++) {
        HVX_Vector va = *(HVX_Vector *) src0_curr;
        src0_curr += VLEN;

        HVX_Vector vb = *(HVX_Vector *) src1_curr;
        src1_curr += VLEN;

        HVX_Vector v = Q6_Vqf32_vsub_VsfVsf(va, vb);

        *(HVX_Vector *) dst_curr = Q6_Vsf_equals_Vqf32(v);
        dst_curr += VLEN;
    }

    if (remaining > 0) {
        HVX_Vector v = Q6_Vqf32_vsub_VsfVsf(*(HVX_Vector *) src0_curr, *(HVX_Vector *) src1_curr);
        hvx_vec_store_u((void *) dst_curr, remaining * SIZEOF_FP32, Q6_Vsf_equals_Vqf32(v));
    }
}

void hvx_sub_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * restrict dst, const int num_elems) {
    size_t left_over       = num_elems & (VLEN_FP32 - 1);
    size_t num_elems_whole = num_elems - left_over;

    int unaligned_addr = 0;
    int unaligned_loop = 0;
    if ((0 == htp_is_aligned((void *) src, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) {
        FARF(HIGH, "hvx_sub_scalar_f32: unaligned address in hvx op, possibly slower execution\n");
        unaligned_addr = 1;
    }

    if ((1 == unaligned_addr) && (num_elems_whole != 0)) {
        unaligned_loop = 1;
        FARF(HIGH, "hvx_sub_scalar_f32: unaligned loop in hvx op, possibly slower execution\n");
    }

    HVX_Vector val_vec = hvx_vec_splat_fp32(val);

    if (0 == unaligned_loop) {
        HVX_Vector * restrict vec_in1 = (HVX_Vector *) src;
        HVX_Vector * restrict vec_out = (HVX_Vector *) dst;

#pragma unroll(4)
        for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
            HVX_Vector v = Q6_Vqf32_vsub_VsfVsf(*vec_in1++, val_vec);
            *vec_out++   = Q6_Vsf_equals_Vqf32(v);
        }
    } else {
#pragma unroll(4)
        for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
            HVX_Vector in  = *(HVX_UVector *) (src + i * SIZEOF_FP32);
            HVX_Vector out = Q6_Vqf32_vsub_VsfVsf(in, val_vec);

            *(HVX_UVector *) (dst + i * SIZEOF_FP32) = Q6_Vsf_equals_Vqf32(out);
        }
    }

    if (left_over > 0) {
        const float * srcf = (const float *) src + num_elems_whole;
        float *       dstf = (float *) dst + num_elems_whole;

        HVX_Vector in  = *(HVX_UVector *) srcf;
        HVX_Vector out = Q6_Vqf32_vsub_VsfVsf(in, val_vec);

        hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, Q6_Vsf_equals_Vqf32(out));
    }
}

float hvx_sum_of_squares_f32(const uint8_t * restrict src, const int num_elems) {
    int left_over       = num_elems & (VLEN_FP32 - 1);
    int num_elems_whole = num_elems - left_over;

    if (0 == htp_is_aligned((void *) src, VLEN)) {
        FARF(HIGH, "hvx_sum_of_squares_f32: unaligned address in hvx op, possibly slower execution\n");
    }

    assert((1 == htp_is_aligned((void *) src, VLEN)) || (0 == num_elems_whole));

    HVX_Vector * restrict vec_in1 = (HVX_Vector *) src;

    HVX_Vector sum_vec_acc = Q6_V_vsplat_R(0x00000000);
    HVX_Vector zero_vec    = Q6_V_vsplat_R(0x00000000);

#pragma unroll(4)
    for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
        HVX_Vector v = Q6_Vqf32_vmpy_VsfVsf(*vec_in1, *vec_in1);
        sum_vec_acc  = Q6_Vqf32_vadd_Vqf32Vqf32(sum_vec_acc, v);
        vec_in1++;
    }

    if (left_over > 0) {
        const float * srcf = (const float *) src + num_elems_whole;

        HVX_Vector vec_left    = *(HVX_UVector *) srcf;
        HVX_Vector vec_left_sq = Q6_Vqf32_vmpy_VsfVsf(vec_left, vec_left);
        HVX_Vector vec_tmp     = Q6_V_valign_VVR(vec_left_sq, zero_vec, left_over * SIZEOF_FP32);

        sum_vec_acc = Q6_Vqf32_vadd_Vqf32Vqf32(sum_vec_acc, vec_tmp);
    }

    HVX_Vector v =
        hvx_vec_qf32_reduce_sum(sum_vec_acc);
    return hvx_vec_get_fp32(Q6_Vsf_equals_Vqf32(v));
}

float hvx_self_sum_f32(const uint8_t * restrict src, const int num_elems) {
    int left_over       = num_elems & (VLEN_FP32 - 1);
    int num_elems_whole = num_elems - left_over;

    int unaligned_addr = 0;
    int unaligned_loop = 0;
    if (0 == htp_is_aligned((void *) src, VLEN)) {
        FARF(HIGH, "hvx_self_sum_f32: unaligned address in hvx op, possibly slower execution\n");
        unaligned_addr = 1;
    }

    if ((1 == unaligned_addr) && (num_elems_whole != 0)) {
        unaligned_loop = 1;
        FARF(HIGH, "hvx_self_sum_f32: unaligned loop in hvx op, possibly slower execution\n");
    }

    HVX_Vector sum_vec  = Q6_V_vsplat_R(0x00000000);
    HVX_Vector zero_vec = Q6_V_vsplat_R(0x00000000);

    if (0 == unaligned_loop) {
        HVX_Vector * vec_in = (HVX_Vector *) src;

#pragma unroll(4)
        for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
            // sum_vec = Q6_Vqf32_vadd_Vqf32Vsf(sum_vec, *vec_in++);
            sum_vec = Q6_Vqf32_vadd_VsfVsf(Q6_Vsf_equals_Vqf32(sum_vec), *vec_in++);
        }
    } else {
#pragma unroll(4)
        for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
            HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32);
            sum_vec       = Q6_Vqf32_vadd_VsfVsf(Q6_Vsf_equals_Vqf32(sum_vec), in);
        }
    }

    if (left_over > 0) {
        const float * srcf = (const float *) src + num_elems_whole;

        HVX_Vector vec_left = *(HVX_UVector *) srcf;
        HVX_Vector vec_tmp  = Q6_V_valign_VVR(vec_left, zero_vec, left_over * SIZEOF_FP32);

        // sum_vec = Q6_Vqf32_vadd_Vqf32Vsf(sum_vec, vec_tmp);
        sum_vec = Q6_Vqf32_vadd_VsfVsf(Q6_Vsf_equals_Vqf32(sum_vec), vec_tmp);
    }

    HVX_Vector v = hvx_vec_qf32_reduce_sum(sum_vec);
    return hvx_vec_get_fp32(Q6_Vsf_equals_Vqf32(v));
}

void hvx_scale_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems, const float scale) {
    int left_over       = num_elems & (VLEN_FP32 - 1);
    int num_elems_whole = num_elems - left_over;

    int unaligned_addr = 0;
    int unaligned_loop = 0;
    if ((0 == htp_is_aligned((void *) src, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) {
        FARF(HIGH, "hvx_scale_f32: unaligned address in hvx op, possibly slower execution\n");
        unaligned_addr = 1;
    }

    if ((1 == unaligned_addr) && (num_elems_whole != 0)) {
        unaligned_loop = 1;
        FARF(HIGH, "hvx_scale_f32: unaligned loop in hvx op, possibly slower execution\n");
    }

    HVX_Vector scale_vec = hvx_vec_splat_fp32(scale);

    if (0 == unaligned_loop) {
        HVX_Vector * vec_in1 = (HVX_Vector *) src;
        HVX_Vector * vec_out = (HVX_Vector *) dst;

#pragma unroll(4)
        for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
            HVX_Vector v = Q6_Vqf32_vmpy_VsfVsf(*vec_in1++, scale_vec);
            *vec_out++   = Q6_Vsf_equals_Vqf32(v);
        }
    } else {
#pragma unroll(4)
        for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
            HVX_Vector in  = *(HVX_UVector *) (src + i * SIZEOF_FP32);
            HVX_Vector out = Q6_Vqf32_vmpy_VsfVsf(in, scale_vec);

            *(HVX_UVector *) (dst + i * SIZEOF_FP32) = Q6_Vsf_equals_Vqf32(out);
        }
    }

    if (left_over > 0) {
        const float * srcf = (const float *) src + num_elems_whole;
        float *       dstf = (float *) dst + num_elems_whole;

        HVX_Vector in  = *(HVX_UVector *) srcf;
        HVX_Vector out = Q6_Vqf32_vmpy_VsfVsf(in, scale_vec);

        hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, Q6_Vsf_equals_Vqf32(out));
    }
}

float hvx_self_max_f32(const uint8_t * restrict src, const int num_elems) {
    int left_over       = num_elems & (VLEN_FP32 - 1);
    int num_elems_whole = num_elems - left_over;

    int unaligned_addr = 0;
    int unaligned_loop = 0;
    if (0 == htp_is_aligned((void *) src, VLEN)) {
        FARF(HIGH, "hvx_self_max_f32: unaligned address in hvx op, possibly slower execution\n");
        unaligned_addr = 1;
    }

    if ((1 ==
         unaligned_addr) && (num_elems_whole != 0)) {
        unaligned_loop = 1;
        FARF(HIGH, "hvx_self_max_f32: unaligned loop in hvx op, possibly slower execution\n");
    }

    HVX_Vector vec_max   = hvx_vec_splat_fp32(((const float *) src)[0]);
    HVX_Vector vec_first = hvx_vec_splat_fp32(((const float *) src)[0]);

    if (0 == unaligned_loop) {
        HVX_Vector * restrict vec_in = (HVX_Vector *) src;

#pragma unroll(4)
        for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
            vec_max = Q6_Vsf_vmax_VsfVsf(vec_max, *vec_in++);
        }
    } else {
#pragma unroll(4)
        for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
            HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32);
            vec_max       = Q6_Vsf_vmax_VsfVsf(vec_max, in);
        }
    }

    if (left_over > 0) {
        const float * srcf = (const float *) src + num_elems_whole;

        HVX_Vector in   = *(HVX_UVector *) srcf;
        HVX_Vector temp = Q6_V_valign_VVR(in, vec_first, left_over * SIZEOF_FP32);

        vec_max = Q6_Vsf_vmax_VsfVsf(vec_max, temp);
    }

    HVX_Vector v = hvx_vec_reduce_max_fp32(vec_max);
    return hvx_vec_get_fp32(v);
}

void hvx_min_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * restrict dst, const int num_elems) {
    size_t left_over       = num_elems & (VLEN_FP32 - 1);
    size_t num_elems_whole = num_elems - left_over;

    int unalign_address = 0;
    if ((0 == htp_is_aligned((void *) src, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) {
        FARF(HIGH, "hvx_min_scalar_f32: unaligned address in hvx op, possibly slower execution\n");
        unalign_address = 1;
    }

    const float * src_f = (const float *) src;

    HVX_Vector vec_min = hvx_vec_splat_fp32(val);

    if (unalign_address == 0) {
        HVX_Vector * restrict vec_in  = (HVX_Vector *) src;
        HVX_Vector * restrict vec_out = (HVX_Vector *) dst;

#pragma unroll(4)
        for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
            HVX_Vector min_clamp = Q6_Vsf_vmin_VsfVsf(vec_min, *vec_in++);
            *vec_out++           = (min_clamp);
        }
    } else {
        HVX_UVector * restrict vec_in  = (HVX_UVector *) src;
        HVX_UVector * restrict vec_out = (HVX_UVector *) dst;

#pragma unroll(4)
        for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
            HVX_Vector min_clamp = Q6_Vsf_vmin_VsfVsf(vec_min, *vec_in++);
            *vec_out++           = (min_clamp);
        }
    }

    if (left_over > 0) {
        const float * srcf = (const float *) src + num_elems_whole;
        float *       dstf = (float *) dst + num_elems_whole;

        HVX_Vector in        = *(HVX_UVector *) srcf;
        HVX_Vector min_clamp = Q6_Vsf_vmin_VsfVsf(vec_min, in);

        hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, (min_clamp));
    }
}

void hvx_clamp_scalar_f32(const uint8_t * restrict src, const float limit_left, const float limit_right, uint8_t * restrict dst, const int num_elems) {
    size_t left_over       = num_elems & (VLEN_FP32 - 1);
    size_t num_elems_whole = num_elems - left_over;

    int unalign_address = 0;
    if ((0 == htp_is_aligned((void *) src, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) {
        FARF(HIGH, "hvx_clamp_scalar_f32: unaligned address in hvx op, possibly slower execution\n");
        unalign_address = 1;
    }

    HVX_Vector range_left  = hvx_vec_splat_fp32(limit_left);
    HVX_Vector range_right = hvx_vec_splat_fp32(limit_right);

    if (unalign_address == 0) {
        HVX_Vector * restrict vec_in  = (HVX_Vector *) src;
        HVX_Vector * restrict vec_out = (HVX_Vector *) dst;

#pragma unroll(4)
        for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
            HVX_Vector in_vec = *vec_in++;
            HVX_Vector temp_v = in_vec;

            HVX_VectorPred pred_cap_right = Q6_Q_vcmp_gt_VsfVsf(in_vec, range_right);
            HVX_VectorPred pred_cap_left  = Q6_Q_vcmp_gt_VsfVsf(range_left, in_vec);

            in_vec = Q6_V_vmux_QVV(pred_cap_right, range_right, temp_v);
            in_vec = Q6_V_vmux_QVV(pred_cap_left, range_left, in_vec);

            *vec_out++ =
                in_vec;
        }
    } else {
        HVX_UVector * restrict vec_in  = (HVX_UVector *) src;
        HVX_UVector * restrict vec_out = (HVX_UVector *) dst;

#pragma unroll(4)
        for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
            HVX_Vector in_vec = *vec_in++;
            HVX_Vector temp_v = in_vec;

            HVX_VectorPred pred_cap_right = Q6_Q_vcmp_gt_VsfVsf(in_vec, range_right);
            HVX_VectorPred pred_cap_left  = Q6_Q_vcmp_gt_VsfVsf(range_left, in_vec);

            in_vec = Q6_V_vmux_QVV(pred_cap_right, range_right, temp_v);
            in_vec = Q6_V_vmux_QVV(pred_cap_left, range_left, in_vec);

            *vec_out++ = in_vec;
        }
    }

    if (left_over > 0) {
        const float * srcf = (const float *) src + num_elems_whole;
        float *       dstf = (float *) dst + num_elems_whole;

        HVX_Vector in_vec = *(HVX_UVector *) srcf;
        HVX_Vector temp_v = in_vec;

        HVX_VectorPred pred_cap_right = Q6_Q_vcmp_gt_VsfVsf(in_vec, range_right);
        HVX_VectorPred pred_cap_left  = Q6_Q_vcmp_gt_VsfVsf(range_left, in_vec);

        in_vec = Q6_V_vmux_QVV(pred_cap_right, range_right, temp_v);
        in_vec = Q6_V_vmux_QVV(pred_cap_left, range_left, in_vec);

        hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, in_vec);
    }
}
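
/*
 * Minimal usage sketch (illustrative only, not part of the library itself): how a
 * caller might drive hvx_add_f32 on VLEN-aligned buffers so the fast aligned loop is
 * taken. The buffer size and the use of aligned_alloc() are assumptions for this
 * example; any allocator that returns VLEN-aligned memory works equally well.
 */
#if 0
static void hvx_add_f32_example(void) {
    const int n = 1024;  // number of fp32 elements; 1024 * 4 bytes is a multiple of VLEN

    // VLEN-aligned buffers keep hvx_add_f32 on its aligned vector path.
    float * a   = (float *) aligned_alloc(VLEN, n * sizeof(float));
    float * b   = (float *) aligned_alloc(VLEN, n * sizeof(float));
    float * out = (float *) aligned_alloc(VLEN, n * sizeof(float));

    for (int i = 0; i < n; i++) {
        a[i] = (float) i;
        b[i] = 1.0f;
    }

    hvx_add_f32((const uint8_t *) a, (const uint8_t *) b, (uint8_t *) out, n);  // out[i] = a[i] + b[i]

    free(a);
    free(b);
    free(out);
}
#endif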