From 83412e0b6a20733afc9fbe2ad4785420c1bd2239 Mon Sep 17 00:00:00 2001 From: shouyud Date: Wed, 10 Dec 2025 18:33:00 -0500 Subject: [PATCH 01/14] feat: inital support for gelu using sigmoid approximation --- ggml/src/ggml-hexagon/ggml-hexagon.cpp | 21 +++++- ggml/src/ggml-hexagon/htp/act-ops.c | 89 +++++++++++++++++++++++++- ggml/src/ggml-hexagon/htp/htp-msg.h | 11 ++-- ggml/src/ggml-hexagon/htp/main.c | 1 + 4 files changed, 113 insertions(+), 9 deletions(-) diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp index 72a82a8911..c45b292a52 100644 --- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -2164,8 +2164,14 @@ static bool ggml_hexagon_supported_activations(const struct ggml_hexagon_session } // src0, src1 & dst must be mapped to the same session - if (!hex_supported_buffer(sess, src0, src1, dst)) { - return false; + if(src1){ + if (!hex_supported_buffer(sess, src0, src1, dst)) { + return false; + } + }else{ + if (!hex_supported_buffer(sess, src0, dst)) { + return false; + } } return true; @@ -2665,6 +2671,10 @@ static void ggml_hexagon_unary(const struct ggml_tensor * op, uint32_t flags) { req.op = HTP_OP_UNARY_SILU; supported = true; } + else if (ggml_get_unary_op(dst) == GGML_UNARY_OP_GELU){ + req.op = HTP_OP_UNARY_GELU; + supported = true; + } break; case GGML_OP_GLU: @@ -2680,6 +2690,7 @@ static void ggml_hexagon_unary(const struct ggml_tensor * op, uint32_t flags) { case GGML_OP_SOFT_MAX: req.op = HTP_OP_SOFTMAX; supported = true; + break; default: break; @@ -2959,6 +2970,8 @@ static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, gg case GGML_OP_UNARY: if (ggml_get_unary_op(node) == GGML_UNARY_OP_SILU) { ggml_hexagon_unary(node, flags); + } else if (ggml_get_unary_op(node) == GGML_UNARY_OP_GELU) { + ggml_hexagon_unary(node, flags); } break; case GGML_OP_GLU: @@ -3257,7 +3270,6 @@ static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, cons auto sess = static_cast(dev->context); bool supp = false; - switch (op->op) { case GGML_OP_NONE: case GGML_OP_RESHAPE: @@ -3297,6 +3309,9 @@ static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, cons if (ggml_get_unary_op(op) == GGML_UNARY_OP_SILU) { supp = ggml_hexagon_supported_activations(sess, op); } + else if (ggml_get_unary_op(op) == GGML_UNARY_OP_GELU){ + supp = ggml_hexagon_supported_activations(sess, op); + } break; case GGML_OP_GLU: diff --git a/ggml/src/ggml-hexagon/htp/act-ops.c b/ggml/src/ggml-hexagon/htp/act-ops.c index 87b09cca3a..2db4a2a35b 100644 --- a/ggml/src/ggml-hexagon/htp/act-ops.c +++ b/ggml/src/ggml-hexagon/htp/act-ops.c @@ -255,6 +255,90 @@ static void glu_swiglu_oai_fp32_per_thread(const struct htp_tensor * src0, src1->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1)); } + +static void unary_gelu_fp32_per_thread(const struct htp_tensor * src0, + struct htp_tensor * dst, + const int32_t * op_params, + struct htp_spad * src0_spad, + struct htp_spad * dst_spad, + uint32_t nth, + uint32_t ith, + uint32_t src0_nrows_per_thread) { + htp_act_preamble2; + + uint64_t t1, t2; + t1 = HAP_perf_get_qtimer_count(); + + const size_t src0_row_size = nb01; + const size_t dst_row_size = nb1; + + const uint32_t src0_nrows = ne01 * ne02 * ne03; + + const uint32_t src0_start_row = src0_nrows_per_thread * ith; + const uint32_t src0_end_row = MIN(src0_start_row + src0_nrows_per_thread, src0_nrows); + + // no work for this thread + if 
(src0_start_row >= src0_end_row) { + return; + } + + int is_aligned = 1; + int opt_path = 0; + if (!htp_is_aligned((void *) src0->data, VLEN) || !htp_is_aligned((void *) dst->data, VLEN)) { + is_aligned = 0; + FARF(HIGH, "silu-f32: unaligned addresses in elementwise op, possibly slower execution\n"); + } + if ((1 == is_aligned) && !(nb01 & (VLEN - 1))) { + opt_path = 1; + } + + const uint8_t * restrict data_src0 = (const uint8_t *) src0->data; + uint8_t * restrict data_dst = (uint8_t *) dst->data; + + uint8_t * restrict src0_spad_data = src0_spad->data + (ith * src0_row_size); + uint8_t * restrict dst_spad_data = dst_spad->data + (ith * dst_row_size); + + for (uint32_t ir = src0_start_row; ir < src0_end_row; ir++) { + const float * restrict src0 = (float *) (data_src0 + (ir * src0_row_size)); + float * restrict dst = (float *) (data_dst + (ir * dst_row_size)); + + if (ir + 1 < src0_end_row) { + htp_l2fetch(src0 + src0_row_size, 1, src0_row_size, src0_row_size); + } + + + // gelu = 0.5 * x * (1.0 + tanh( sqrt(2/pi) * (x + 0.044715 * x^3) )) // gelu_tanh + // gelu = x * sigmoid(1.702 * x) // current implementation + if (1 == opt_path) { + hvx_mul_scalar_f32( (const uint8_t *) src0, (float)1.702, (uint8_t *) src0_spad_data, ne0); + hvx_fast_sigmoid_f32((const uint8_t *) src0_spad_data, (uint8_t *) src0_spad_data, ne0); + + hvx_mul_f32_opt((const uint8_t *) src0, src0_spad_data, (uint8_t *) dst, ne0); + } else { + hvx_mul_scalar_f32( (const uint8_t *) src0, (float)1.702, (uint8_t *) src0_spad_data, ne0); + // sigmoid + hvx_exp_f32((const uint8_t *) src0_spad_data, src0_spad_data, ne0, true); + hvx_add_scalar_f32(src0_spad_data, 1.0, dst_spad_data, ne0); + hvx_inverse_f32(dst_spad_data, src0_spad_data, ne0); + + hvx_mul_f32((const uint8_t *) src0, src0_spad_data, (uint8_t *) dst, ne0); + } + } + + t2 = HAP_perf_get_qtimer_count(); + + FARF(HIGH, "gelu-f32 %d/%d/%d: %ux%ux%ux%u (%u:%u) -> %ux%ux%ux%u usec %u\n", ith, nth, opt_path, ne00, ne01, ne02, + ne03, src0_start_row, src0_end_row, ne0, ne1, ne2, ne3, (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1)); +} + +static void unary_gelu_fp32(unsigned int n, unsigned int i, void * data) { + struct htp_ops_context * octx = (struct htp_ops_context *) data; + unary_gelu_fp32_per_thread(&octx->src0, &octx->dst, octx->op_params, &octx->src0_spad, &octx->dst_spad, n, i, + octx->src0_nrows_per_thread); +} + + + static void unary_silu_fp32_per_thread(const struct htp_tensor * src0, struct htp_tensor * dst, const int32_t * op_params, @@ -371,7 +455,10 @@ static int execute_op_activations_fp32(struct htp_ops_context * octx) { act_op_func = glu_swiglu_oai_fp32; op_type = "swiglu-oai-f32"; break; - + case HTP_OP_UNARY_GELU: + act_op_func = unary_gelu_fp32; + op_type = "gelu-f32"; + break; default: FARF(ERROR, "Unsupported activations Op %u\n", octx->op); return HTP_STATUS_NO_SUPPORT; diff --git a/ggml/src/ggml-hexagon/htp/htp-msg.h b/ggml/src/ggml-hexagon/htp/htp-msg.h index 9278f41f4e..a61652304a 100644 --- a/ggml/src/ggml-hexagon/htp/htp-msg.h +++ b/ggml/src/ggml-hexagon/htp/htp-msg.h @@ -51,11 +51,12 @@ enum htp_op { HTP_OP_MUL_MAT_ID = 5, HTP_OP_RMS_NORM = 6, HTP_OP_UNARY_SILU = 7, - HTP_OP_GLU_SWIGLU = 8, - HTP_OP_GLU_SWIGLU_OAI = 9, - HTP_OP_SOFTMAX = 10, - HTP_OP_ADD_ID = 11, - HTP_OP_ROPE = 12, + HTP_OP_UNARY_GELU = 8, + HTP_OP_GLU_SWIGLU = 9, + HTP_OP_GLU_SWIGLU_OAI = 10, + HTP_OP_SOFTMAX = 11, + HTP_OP_ADD_ID = 12, + HTP_OP_ROPE = 13, INVALID }; diff --git a/ggml/src/ggml-hexagon/htp/main.c b/ggml/src/ggml-hexagon/htp/main.c index 
b60b352a7b..e30ae69502 100644 --- a/ggml/src/ggml-hexagon/htp/main.c +++ b/ggml/src/ggml-hexagon/htp/main.c @@ -798,6 +798,7 @@ static void htp_packet_callback(dspqueue_t queue, int error, void * context) { break; case HTP_OP_UNARY_SILU: + case HTP_OP_UNARY_GELU: if (n_bufs != 2) { FARF(ERROR, "Bad act-req buffer list"); continue; From 2a787a61d11f9e63e5943a2e6d134b2f0c402ace Mon Sep 17 00:00:00 2001 From: shouyud Date: Thu, 11 Dec 2025 16:15:46 -0500 Subject: [PATCH 02/14] snapshot: faster gelu using polynomial approximation --- ggml/src/ggml-hexagon/htp/CMakeLists.txt | 1 + ggml/src/ggml-hexagon/htp/act-ops.c | 16 +- ggml/src/ggml-hexagon/htp/hvx-exp.c | 4 +- ggml/src/ggml-hexagon/htp/hvx-utils.c | 36 +- .../src/ggml-hexagon/htp/qhcg_approximation.c | 518 ++++++++++++++++++ .../src/ggml-hexagon/htp/qhcg_approximation.h | 21 + ggml/src/ggml-hexagon/htp/qhcg_internal.h | 91 +++ 7 files changed, 666 insertions(+), 21 deletions(-) create mode 100644 ggml/src/ggml-hexagon/htp/qhcg_approximation.c create mode 100644 ggml/src/ggml-hexagon/htp/qhcg_approximation.h create mode 100644 ggml/src/ggml-hexagon/htp/qhcg_internal.h diff --git a/ggml/src/ggml-hexagon/htp/CMakeLists.txt b/ggml/src/ggml-hexagon/htp/CMakeLists.txt index 22e3fea11d..fa350be19e 100644 --- a/ggml/src/ggml-hexagon/htp/CMakeLists.txt +++ b/ggml/src/ggml-hexagon/htp/CMakeLists.txt @@ -28,6 +28,7 @@ add_library(${HTP_LIB} SHARED softmax-ops.c act-ops.c rope-ops.c + qhcg_approximation.c ) target_compile_definitions(${HTP_LIB} PRIVATE diff --git a/ggml/src/ggml-hexagon/htp/act-ops.c b/ggml/src/ggml-hexagon/htp/act-ops.c index 2db4a2a35b..b0d4d1a477 100644 --- a/ggml/src/ggml-hexagon/htp/act-ops.c +++ b/ggml/src/ggml-hexagon/htp/act-ops.c @@ -24,6 +24,9 @@ #include "hvx-utils.h" #include "ops-utils.h" + +#include "qhcg_approximation.h" + #define htp_act_preamble3 \ const uint32_t ne00 = src0->ne[0]; \ const uint32_t ne01 = src0->ne[1]; \ @@ -306,7 +309,7 @@ static void unary_gelu_fp32_per_thread(const struct htp_tensor * src0, htp_l2fetch(src0 + src0_row_size, 1, src0_row_size, src0_row_size); } - + #if 0 // gelu = 0.5 * x * (1.0 + tanh( sqrt(2/pi) * (x + 0.044715 * x^3) )) // gelu_tanh // gelu = x * sigmoid(1.702 * x) // current implementation if (1 == opt_path) { @@ -323,6 +326,17 @@ static void unary_gelu_fp32_per_thread(const struct htp_tensor * src0, hvx_mul_f32((const uint8_t *) src0, src0_spad_data, (uint8_t *) dst, ne0); } + #else + + // alternative method + float low_bound = -6.0f; + float up_bound = 6.0f; + + qhcg_approximation( (float*)src0, (float*)dst, ne0, low_bound, up_bound ); + + + #endif + } t2 = HAP_perf_get_qtimer_count(); diff --git a/ggml/src/ggml-hexagon/htp/hvx-exp.c b/ggml/src/ggml-hexagon/htp/hvx-exp.c index 21bf46a542..1f5e9e476c 100644 --- a/ggml/src/ggml-hexagon/htp/hvx-exp.c +++ b/ggml/src/ggml-hexagon/htp/hvx-exp.c @@ -31,13 +31,13 @@ void hvx_exp_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int int unaligned_addr = 0; int unaligned_loop = 0; if ((0 == htp_is_aligned((void *) src, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) { - FARF(HIGH, "hvx_exp_f32: unaligned address in hvx op, possibly slower execution\n"); + //FARF(HIGH, "hvx_exp_f32: unaligned address in hvx op, possibly slower execution\n"); unaligned_addr = 1; } // assert((0 == unaligned_addr) || (0 == num_elems_whole)); if ((1 == unaligned_addr) && (num_elems_whole != 0)) { unaligned_loop = 1; - FARF(HIGH, "hvx_exp_f32: unaligned loop in hvx op, possibly slower execution\n"); + //FARF(HIGH, "hvx_exp_f32: unaligned 
loop in hvx op, possibly slower execution\n"); } HVX_Vector vec_out = Q6_V_vzero(); diff --git a/ggml/src/ggml-hexagon/htp/hvx-utils.c b/ggml/src/ggml-hexagon/htp/hvx-utils.c index e02b1d9099..4597423eb9 100644 --- a/ggml/src/ggml-hexagon/htp/hvx-utils.c +++ b/ggml/src/ggml-hexagon/htp/hvx-utils.c @@ -40,13 +40,13 @@ void hvx_mul_f32(const uint8_t * restrict src0, int unaligned_loop = 0; if ((0 == htp_is_aligned((void *) src0, VLEN)) || (0 == htp_is_aligned((void *) src1, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) { - FARF(HIGH, "hvx_mul_f32: unaligned address in hvx op, possibly slower execution\n"); + //FARF(HIGH, "hvx_mul_f32: unaligned address in hvx op, possibly slower execution\n"); unaligned_addr = 1; } if ((1 == unaligned_addr) && (num_elems_whole != 0)) { unaligned_loop = 1; - FARF(HIGH, "hvx_mul_f32: unaligned loop in hvx op, possibly slower execution\n"); + //FARF(HIGH, "hvx_mul_f32: unaligned loop in hvx op, possibly slower execution\n"); } if (0 == unaligned_loop) { @@ -252,13 +252,13 @@ void hvx_add_f32(const uint8_t * restrict src0, int unaligned_loop = 0; if ((0 == htp_is_aligned((void *) src0, VLEN)) || (0 == htp_is_aligned((void *) src1, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) { - FARF(HIGH, "hvx_add_f32: unaligned address in hvx op, possibly slower execution\n"); + //FARF(HIGH, "hvx_add_f32: unaligned address in hvx op, possibly slower execution\n"); unaligned_addr = 1; } if ((1 == unaligned_addr) && (num_elems_whole != 0)) { unaligned_loop = 1; - FARF(HIGH, "hvx_add_f32: unaligned loop in hvx op, possibly slower execution\n"); + //FARF(HIGH, "hvx_add_f32: unaligned loop in hvx op, possibly slower execution\n"); } if (0 == unaligned_loop) { @@ -392,13 +392,13 @@ void hvx_add_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * int unaligned_addr = 0; int unaligned_loop = 0; if ((0 == htp_is_aligned((void *) src, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) { - FARF(HIGH, "hvx_add_scalar_f32: unaligned address in hvx op, possibly slower execution\n"); + //FARF(HIGH, "hvx_add_scalar_f32: unaligned address in hvx op, possibly slower execution\n"); unaligned_addr = 1; } if ((1 == unaligned_addr) && (num_elems_whole != 0)) { unaligned_loop = 1; - FARF(HIGH, "hvx_add_scalar_f32: unaligned loop in hvx op, possibly slower execution\n"); + //FARF(HIGH, "hvx_add_scalar_f32: unaligned loop in hvx op, possibly slower execution\n"); } static const float kInf = INFINITY; @@ -454,13 +454,13 @@ void hvx_mul_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * int unaligned_addr = 0; int unaligned_loop = 0; if ((0 == htp_is_aligned((void *) src, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) { - FARF(HIGH, "hvx_mul_scalar_f32: unaligned address in hvx op, possibly slower execution\n"); + //FARF(HIGH, "hvx_mul_scalar_f32: unaligned address in hvx op, possibly slower execution\n"); unaligned_addr = 1; } if ((1 == unaligned_addr) && (num_elems_whole != 0)) { unaligned_loop = 1; - FARF(HIGH, "hvx_mul_scalar_f32: unaligned loop in hvx op, possibly slower execution\n"); + //FARF(HIGH, "hvx_mul_scalar_f32: unaligned loop in hvx op, possibly slower execution\n"); } HVX_Vector val_vec = hvx_vec_splat_fp32(val); @@ -507,13 +507,13 @@ void hvx_sub_f32(const uint8_t * restrict src0, int unaligned_loop = 0; if ((0 == htp_is_aligned((void *) src0, VLEN)) || (0 == htp_is_aligned((void *) src1, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) { - FARF(HIGH, "hvx_sub_f32: unaligned address in hvx op, possibly slower execution\n"); + 
//FARF(HIGH, "hvx_sub_f32: unaligned address in hvx op, possibly slower execution\n"); unaligned_addr = 1; } if ((1 == unaligned_addr) && (num_elems_whole != 0)) { unaligned_loop = 1; - FARF(HIGH, "hvx_sub_f32: unaligned loop in hvx op, possibly slower execution\n"); + //FARF(HIGH, "hvx_sub_f32: unaligned loop in hvx op, possibly slower execution\n"); } if (0 == unaligned_loop) { @@ -647,13 +647,13 @@ void hvx_sub_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * int unaligned_addr = 0; int unaligned_loop = 0; if ((0 == htp_is_aligned((void *) src, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) { - FARF(HIGH, "hvx_sub_scalar_f32: unaligned address in hvx op, possibly slower execution\n"); + //FARF(HIGH, "hvx_sub_scalar_f32: unaligned address in hvx op, possibly slower execution\n"); unaligned_addr = 1; } if ((1 == unaligned_addr) && (num_elems_whole != 0)) { unaligned_loop = 1; - FARF(HIGH, "hvx_sub_scalar_f32: unaligned loop in hvx op, possibly slower execution\n"); + //FARF(HIGH, "hvx_sub_scalar_f32: unaligned loop in hvx op, possibly slower execution\n"); } HVX_Vector val_vec = hvx_vec_splat_fp32(val); @@ -733,13 +733,13 @@ float hvx_self_sum_f32(const uint8_t * restrict src, const int num_elems) { int unaligned_addr = 0; int unaligned_loop = 0; if (0 == htp_is_aligned((void *) src, VLEN)) { - FARF(HIGH, "hvx_self_sum_f32: unaligned address in hvx op, possibly slower execution\n"); + //FARF(HIGH, "hvx_self_sum_f32: unaligned address in hvx op, possibly slower execution\n"); unaligned_addr = 1; } if ((1 == unaligned_addr) && (num_elems_whole != 0)) { unaligned_loop = 1; - FARF(HIGH, "hvx_self_sum_f32: unaligned loop in hvx op, possibly slower execution\n"); + //FARF(HIGH, "hvx_self_sum_f32: unaligned loop in hvx op, possibly slower execution\n"); } HVX_Vector sum_vec = Q6_V_vsplat_R(0x00000000); @@ -782,13 +782,13 @@ void hvx_scale_f32(const uint8_t * restrict src, uint8_t * restrict dst, const i int unaligned_addr = 0; int unaligned_loop = 0; if ((0 == htp_is_aligned((void *) src, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) { - FARF(HIGH, "hvx_scale_f32: unaligned address in hvx op, possibly slower execution\n"); + //FARF(HIGH, "hvx_scale_f32: unaligned address in hvx op, possibly slower execution\n"); unaligned_addr = 1; } if ((1 == unaligned_addr) && (num_elems_whole != 0)) { unaligned_loop = 1; - FARF(HIGH, "hvx_scale_f32: unaligned loop in hvx op, possibly slower execution\n"); + //FARF(HIGH, "hvx_scale_f32: unaligned loop in hvx op, possibly slower execution\n"); } HVX_Vector scale_vec = hvx_vec_splat_fp32(scale); @@ -831,13 +831,13 @@ float hvx_self_max_f32(const uint8_t * restrict src, const int num_elems) { int unaligned_addr = 0; int unaligned_loop = 0; if (0 == htp_is_aligned((void *) src, VLEN)) { - FARF(HIGH, "hvx_self_max_f32: unaligned address in hvx op, possibly slower execution\n"); + //FARF(HIGH, "hvx_self_max_f32: unaligned address in hvx op, possibly slower execution\n"); unaligned_addr = 1; } if ((1 == unaligned_addr) && (num_elems_whole != 0)) { unaligned_loop = 1; - FARF(HIGH, "hvx_self_max_f32: unaligned loop in hvx op, possibly slower execution\n"); + //FARF(HIGH, "hvx_self_max_f32: unaligned loop in hvx op, possibly slower execution\n"); } HVX_Vector vec_max = hvx_vec_splat_fp32(((const float *) src)[0]); diff --git a/ggml/src/ggml-hexagon/htp/qhcg_approximation.c b/ggml/src/ggml-hexagon/htp/qhcg_approximation.c new file mode 100644 index 0000000000..5068c5fa34 --- /dev/null +++ b/ggml/src/ggml-hexagon/htp/qhcg_approximation.c @@ 
-0,0 +1,518 @@ +/**============================================================================= +@file + qhcg_approximation.c + +@brief + Calculate polynomial approximation of the function below in + floating-point arithmetic using HVX instructions. + + Function: gelu(x) + + Function is approximated in specified input range from -6.0 to 6.0, + where inputs and outputs are arrays of 32-bit float values. + + Approximation is performed using the following method: + + 1) Input range is split into 16 equidistant segments + 2) For each segment, Numpy's polynomial package is used to find the best + polynomial approximation of order N with the corresponding C0, C1, ..., Cn. + 3) VLUT instructions are used to select appropriate coefficients for each input sample + 4) Horner's method is used to compute polynomial values: + f(x) = ((((Cn*x + Cn-1)*x + Cn-2)*x + ...)*x + C1)*x + C0 + +Copyright (c) 2020 Qualcomm Technologies Incorporated. +All Rights Reserved. Qualcomm Proprietary and Confidential. +=============================================================================**/ + +#if __HVX_ARCH__ >= 68 + +#include "qhcg_approximation.h" +#include "qhcg_internal.h" + +#define BLOCK_SIZE (8*1024/128) /* vector chunks */ +#define L2FETCH_AHEAD (BLOCK_SIZE) + +/* Polynomial coefficients */ +static const float c0_coeffs[32] __attribute__((aligned(VLEN))) = +{ + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, +-0.1025868397073178,-1.1184356646199394,-1.9705895994321767,0.11469604839384463,0.40991447569341943,0.00424292239610935,-0.0017846707638177889,4.125901398310816e-09, +9.718309490480692e-11,-0.0015488336803479719,0.001064556481209511,0.3906162486717146,0.19084584900320978,-1.911422745140333,-1.1879384314707315,-0.10823562636002611, +}; +static const float c1_coeffs[32] __attribute__((aligned(VLEN))) = +{ + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, +-0.1234196807250312,-1.5042580469229814,-2.7701977888429816,1.1561921948215528,1.73891533063333,0.49580124294548433,0.4867587290479026,0.500000435462697, +0.4999997919981341,0.5116842338641109,0.5163606020356294,-0.6867154811454343,-0.31551789326265844,3.6694157536939014,2.6042137343731855,1.1304321895807614, +}; +static const float c2_coeffs[32] __attribute__((aligned(VLEN))) = +{ + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, +-0.06367500510012546,-0.8689061926460069,-1.674005553705795,1.5013658408230053,1.9798213609930566,0.33731544915026324,0.35673778512915555,0.398953295788538, +0.3989496120997857,0.3611051680040998,0.31742994078248077,1.9193992198306873,1.6441036493618186,-1.600477678714911,-0.9304878890577859,-0.06740463140212431, +}; +static const float c3_coeffs[32] __attribute__((aligned(VLEN))) = +{ + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, +-0.01826132753437031,-0.27938965958246625,-0.56347555462781,0.8662803078586866,1.0748504969694623,-0.13840364789720844,-0.07490683960610874,0.00011501805987770841, +-8.89610380930177e-05,0.06815977365648013,0.1564140217086786,-1.036053072449464,-0.9372597866516783,0.5336910940777527,0.3004584208315817,0.019362956684359556, +}; +static const float c4_coeffs[32] __attribute__((aligned(VLEN))) = +{ + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, +-0.003143995202268695,-0.05400134785573118,-0.11405541169720136,0.2730279204613817,0.3235877725936627,-0.21757221119731435,-0.14645680741997966,-0.06620698974306806, 
+-0.06630082288474698,-0.14025595963442758,-0.22733077791076023,0.30866276792496655,0.29418673249390104,-0.10682071320119783,-0.05832443091378418,-0.003339162830362702, +}; +static const float c5_coeffs[32] __attribute__((aligned(VLEN))) = +{ + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, +-0.0003249391771954532,-0.006273424264889337,-0.01387698764918236,0.04913223606829913,0.05545197372332761,-0.09031312961799431,-0.04973154630108704,0.001825035162087615, +-0.0016453620553813022,0.046340833813673266,0.09347637717015225,-0.0520121796723486,-0.0529133082948728,0.012823220702040979,0.00680542921713579,0.00034567793526667706, +}; +static const float c6_coeffs[32] __attribute__((aligned(VLEN))) = +{ + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, +-1.866614032852734e-05,-0.0004055443985444202,-0.0009392734765841164,0.004770459893478637,0.00504881095326808,-0.016904688419800747,-0.0049839929986692675,0.013321931926939602, +0.01314779303860721,-0.003962384692377502,-0.017472688915232914,0.004609031329816739,0.005145502689303376,-0.0008540539868357813,-0.0004419008815610675,-1.989001488248583e-05, +}; +static const float c7_coeffs[32] __attribute__((aligned(VLEN))) = +{ + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, +-4.5974928158662533e-07,-1.1252676844375516e-05,-2.7270249210246306e-05,0.0001949129249064408,0.00018774479051508836,-0.001238293494672944,0.000199977799551612,0.0029325624122584024, +-0.0028654073893250895,-0.00033083897498689484,0.0012818786224478341,-0.00016368340082111905,-0.0002108418119120288,2.431836142521883e-05,1.2317036094266618e-05,4.906925630164402e-07, +}; + +/** + * @brief Polynomial approximation of gelu(x) function. + * @param[in] input Input array of elements in IEEE 32-bit floating-point format. + * @param[out] output Output array of elements in IEEE 32-bit floating-point format. + * @param[in] length Number of elements in input/output arrays. + * @return Returns 0 on successful execution. Otherwise -1. 
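[Editor's aside, not part of the patch] As a scalar reference for the routine below: the segment index is recovered from the float's bit pattern (after mapping the input into [16, 32) the exponent is constant, so the top four mantissa bits equal floor(x') - 16), the per-segment order-7 polynomial is then evaluated with Horner's method, and outputs are clamped to 0 below the lower bound and passed through unchanged above the upper bound, as the vmux code below does. The helper names gelu_poly_ref and segment_index_ref are illustrative only; the coefficient tables c0_coeffs..c7_coeffs are the ones defined above, with entries 16..31 holding the 16 segments.

#include <stdint.h>
#include <string.h>

static uint32_t segment_index_ref(float x, float lo, float hi)
{
    /* map [lo, hi) -> [0, 16), then shift to [16, 32) so the exponent is fixed
       (the HVX code slightly reduces this scale to avoid an out-of-range index) */
    float t = (x - lo) * (16.0f / (hi - lo)) + 16.0f;
    uint32_t bits;
    memcpy(&bits, &t, sizeof(bits));
    /* for a float in [16, 32) the top 4 mantissa bits are floor(t) - 16 */
    return (bits >> 19) & 0xF;
}

static float gelu_poly_ref(float x, float lo, float hi)
{
    if (x < lo) return 0.0f;  /* below the fitted range: gelu(x) ~ 0 */
    if (x > hi) return x;     /* above the fitted range: gelu(x) ~ x */

    const float * C[8] = { c0_coeffs, c1_coeffs, c2_coeffs, c3_coeffs,
                           c4_coeffs, c5_coeffs, c6_coeffs, c7_coeffs };
    uint32_t seg = 16 + segment_index_ref(x, lo, hi);

    /* Horner: f(x) = ((((C7*x + C6)*x + C5)*x + ...)*x + C1)*x + C0 */
    float y = C[7][seg];
    for (int k = 6; k >= 0; --k) {
        y = y * x + C[k][seg];
    }
    return y;
}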
+ */ +int32_t qhcg_approximation(float *restrict input, float *restrict output, uint32_t size, + float limit_left, float limit_right) + +{ + HVX_Vector *input_v_ptr; + HVX_UVector *output_v_ptr; + HVX_Vector input_min_v_f; + HVX_Vector input_max_v_f; + + HVX_Vector input_shifted_v_qf32; + HVX_Vector input_scaled_v_qf32; + HVX_Vector scale_v; + HVX_Vector input_v_qf32; + HVX_Vector const16_0_v_sf; + HVX_Vector zero_v_sf; + HVX_Vector mask_idx1_v, mask_idx2_v; + HVX_Vector tmp_v, idx1_v, idx2_v; + HVX_Vector output_v; + HVX_Vector slinep; + HVX_Vector slinec; + HVX_Vector sline; + HVX_Vector sline_tmp; + HVX_Vector sout; + int32_t block, l2fetch_block; + int32_t leftover = size & 31; + int32_t vectors_in_rounddown = size / 32; + int32_t leftover_size = leftover * sizeof(float); + HVX_DV c0_coeff_dv; + HVX_VectorPair c0_coeff_vp; + HVX_Vector c0_coeff_v; + HVX_DV c1_coeff_dv; + HVX_VectorPair c1_coeff_vp; + HVX_Vector c1_coeff_v; + HVX_DV c2_coeff_dv; + HVX_VectorPair c2_coeff_vp; + HVX_Vector c2_coeff_v; + HVX_DV c3_coeff_dv; + HVX_VectorPair c3_coeff_vp; + HVX_Vector c3_coeff_v; + HVX_DV c4_coeff_dv; + HVX_VectorPair c4_coeff_vp; + HVX_Vector c4_coeff_v; + HVX_DV c5_coeff_dv; + HVX_VectorPair c5_coeff_vp; + HVX_Vector c5_coeff_v; + HVX_DV c6_coeff_dv; + HVX_VectorPair c6_coeff_vp; + HVX_Vector c6_coeff_v; + HVX_DV c7_coeff_dv; + HVX_VectorPair c7_coeff_vp; + HVX_Vector c7_coeff_v; + + HVX_Vector zero_vec = Q6_V_vsplat_R(0x00000000); + + /* Check input arguments. Return error status if some argument has invalid value */ + if ((input == 0) || (output == 0) || (size == 0)) + { + return -1; + } + + input_v_ptr = (HVX_Vector *) input; + output_v_ptr = (HVX_UVector *) output; + + /* + * If input data is not aligned to HVX vector size, compose aligned vectors + * from data loaded in slinep and slinec + */ + slinep = *input_v_ptr++; + + /* + * Splat scale factor in order to be used later for finding indexes of coefficients. + * Scale factor is represented in IEEE 16-bit floating-point format and it is + * calculated using the following formula: + * scale_factor = (16.0 / (b0 - a0)) + * NOTE: Calculated value is slightly decreased in order to avoid out of bound + * indexes during VLUT lookup. + */ + scale_v = Q6_V_vsplat_R(0x3faaaaa9); + + /* + * Vector of zeroes used as neutral element in sf to qf32 conversions. + * NOTE: Some of conversions (i.e conversion of scale factor and coefficients) + * can be avoided in real-time, but this is not done in order to don't + * sacrify code readibility in expense of insignificant performance improvement. + */ + zero_v_sf = Q6_V_vzero(); + + /* Mask for extracting only 4 bits of mantissa */ + mask_idx1_v = Q6_V_vsplat_R(0x0000000F); + mask_idx2_v = Q6_V_vsplat_R(0x00000010); + + /* 16.0 in IEEE 16-bit floating-point representation */ + const16_0_v_sf = Q6_V_vsplat_R(0x41800000); + + /* + * Prepare vector of input_min values, that is used later in shifting input range. + * input_min is low boundary of specified input range. + */ + int32_t input_min_bits = *((int32_t *) &limit_left); + int32_t input_max_bits = *((int32_t *) &limit_right); + + input_min_v_f = Q6_V_vsplat_R(input_min_bits); + input_max_v_f = Q6_V_vsplat_R(input_max_bits); + + /* Convert scale factor from sf to q32. 
Use the same vector for both formats */ + scale_v = Q6_Vqf32_vadd_VsfVsf(scale_v, zero_v_sf); + + /* Load coefficients */ + c0_coeff_v = *((HVX_Vector *)(c0_coeffs)); + c1_coeff_v = *((HVX_Vector *)(c1_coeffs)); + c2_coeff_v = *((HVX_Vector *)(c2_coeffs)); + c3_coeff_v = *((HVX_Vector *)(c3_coeffs)); + c4_coeff_v = *((HVX_Vector *)(c4_coeffs)); + c5_coeff_v = *((HVX_Vector *)(c5_coeffs)); + c6_coeff_v = *((HVX_Vector *)(c6_coeffs)); + c7_coeff_v = *((HVX_Vector *)(c7_coeffs)); + + /* Convert coefficients from sf to qf32 format. Use the same vector for both representations */ + c0_coeff_v = Q6_Vqf32_vadd_VsfVsf(c0_coeff_v, zero_v_sf); + c1_coeff_v = Q6_Vqf32_vadd_VsfVsf(c1_coeff_v, zero_v_sf); + c2_coeff_v = Q6_Vqf32_vadd_VsfVsf(c2_coeff_v, zero_v_sf); + c3_coeff_v = Q6_Vqf32_vadd_VsfVsf(c3_coeff_v, zero_v_sf); + c4_coeff_v = Q6_Vqf32_vadd_VsfVsf(c4_coeff_v, zero_v_sf); + c5_coeff_v = Q6_Vqf32_vadd_VsfVsf(c5_coeff_v, zero_v_sf); + c6_coeff_v = Q6_Vqf32_vadd_VsfVsf(c6_coeff_v, zero_v_sf); + c7_coeff_v = Q6_Vqf32_vadd_VsfVsf(c7_coeff_v, zero_v_sf); + + /* Split 32-bit coefficients to lower and upper part in order to obtain them later with VLUT16. */ + c0_coeff_dv.VV = Q6_Wuw_vzxt_Vuh(c0_coeff_v); + c1_coeff_dv.VV = Q6_Wuw_vzxt_Vuh(c1_coeff_v); + c2_coeff_dv.VV = Q6_Wuw_vzxt_Vuh(c2_coeff_v); + c3_coeff_dv.VV = Q6_Wuw_vzxt_Vuh(c3_coeff_v); + c4_coeff_dv.VV = Q6_Wuw_vzxt_Vuh(c4_coeff_v); + c5_coeff_dv.VV = Q6_Wuw_vzxt_Vuh(c5_coeff_v); + c6_coeff_dv.VV = Q6_Wuw_vzxt_Vuh(c6_coeff_v); + c7_coeff_dv.VV = Q6_Wuw_vzxt_Vuh(c7_coeff_v); + + /* + * Handle number of whole vectors in input data. + * Don't process last vector in order to avoid out-of-boundary load. + */ + for (int32_t i = vectors_in_rounddown - 1; i > 0; i -= BLOCK_SIZE) + { + block = Q6_R_min_RR(i, BLOCK_SIZE); + l2fetch_block = Q6_R_min_RR(i - L2FETCH_AHEAD, BLOCK_SIZE); + + if (l2fetch_block > 0) + { + l2fetch(input_v_ptr + L2FETCH_AHEAD, 128, 128, l2fetch_block, 0); + } + + /* Process one vector at a time */ + for (int32_t j = 0; j < block; ++j) + { + slinec = *input_v_ptr++; + + /* Compose vector of input data from slinec and slinep */ + sline = Q6_V_valign_VVR(slinec, slinep, (size_t) input); + sline_tmp = sline; + /* Shift input range from [input_min, input_max] to [0, input_max - input_min] */ + input_shifted_v_qf32 = Q6_Vqf32_vsub_VsfVsf(sline, input_min_v_f); + + /* + * Scale shifted input range from [0, input_max - input_min] to [0,16.0) + * in order to get corresponding coefficient indexes + */ + input_scaled_v_qf32 = Q6_Vqf32_vmpy_Vqf32Vqf32(input_shifted_v_qf32, scale_v); + + /* + * VLUT 16 requires integer indexes. Shift scaled input range from [0,16.0) + * to [16.0,32.0) in order to convert float indexes to integer values. + * Float values, represented in IEEE 754, in range [16.0,32.0] have the + * same exponent, which means 4 MSB of mantissa carry information about + * integer index. 
+ */ + input_scaled_v_qf32 = Q6_Vqf32_vadd_Vqf32Vsf(input_scaled_v_qf32, const16_0_v_sf); + + /* Convert back from qf32 to sf in order to extract integer index */ + tmp_v = Q6_Vsf_equals_Vqf32(input_scaled_v_qf32); + + /* Only 4 MSB bits of mantissa represent segment index */ + idx1_v = Q6_Vuw_vlsr_VuwR(tmp_v, 19); + + idx1_v = Q6_V_vand_VV(idx1_v, mask_idx1_v); + idx1_v = Q6_V_vor_VV(idx1_v, mask_idx2_v); + idx2_v = Q6_Vw_vasl_VwR(idx1_v, 16); + + /* Obtain the polynomial coefficients from lookup table */ + c0_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c0_coeff_dv.VV), 1); + c0_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c0_coeff_vp, idx2_v, Q6_V_hi_W(c0_coeff_dv.VV), 1); + c1_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c1_coeff_dv.VV), 1); + c1_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c1_coeff_vp, idx2_v, Q6_V_hi_W(c1_coeff_dv.VV), 1); + c2_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c2_coeff_dv.VV), 1); + c2_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c2_coeff_vp, idx2_v, Q6_V_hi_W(c2_coeff_dv.VV), 1); + c3_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c3_coeff_dv.VV), 1); + c3_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c3_coeff_vp, idx2_v, Q6_V_hi_W(c3_coeff_dv.VV), 1); + c4_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c4_coeff_dv.VV), 1); + c4_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c4_coeff_vp, idx2_v, Q6_V_hi_W(c4_coeff_dv.VV), 1); + c5_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c5_coeff_dv.VV), 1); + c5_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c5_coeff_vp, idx2_v, Q6_V_hi_W(c5_coeff_dv.VV), 1); + c6_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c6_coeff_dv.VV), 1); + c6_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c6_coeff_vp, idx2_v, Q6_V_hi_W(c6_coeff_dv.VV), 1); + c7_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c7_coeff_dv.VV), 1); + c7_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c7_coeff_vp, idx2_v, Q6_V_hi_W(c7_coeff_dv.VV), 1); + + /* Convert input from sf vector to qf32 vector for Horner's method*/ + input_v_qf32 = Q6_Vqf32_vadd_VsfVsf(sline, zero_v_sf); + + /* Perform evaluation of polynomial using Horner's method */ + output_v = Q6_Vqf32_vmpy_Vqf32Vqf32(Q6_V_lo_W(c7_coeff_vp), input_v_qf32); + output_v = Q6_Vqf32_vadd_Vqf32Vqf32(output_v, Q6_V_lo_W(c6_coeff_vp)); + output_v = Q6_Vqf32_vmpy_Vqf32Vqf32(output_v, input_v_qf32); + output_v = Q6_Vqf32_vadd_Vqf32Vqf32(output_v, Q6_V_lo_W(c5_coeff_vp)); + output_v = Q6_Vqf32_vmpy_Vqf32Vqf32(output_v, input_v_qf32); + output_v = Q6_Vqf32_vadd_Vqf32Vqf32(output_v, Q6_V_lo_W(c4_coeff_vp)); + output_v = Q6_Vqf32_vmpy_Vqf32Vqf32(output_v, input_v_qf32); + output_v = Q6_Vqf32_vadd_Vqf32Vqf32(output_v, Q6_V_lo_W(c3_coeff_vp)); + output_v = Q6_Vqf32_vmpy_Vqf32Vqf32(output_v, input_v_qf32); + output_v = Q6_Vqf32_vadd_Vqf32Vqf32(output_v, Q6_V_lo_W(c2_coeff_vp)); + output_v = Q6_Vqf32_vmpy_Vqf32Vqf32(output_v, input_v_qf32); + output_v = Q6_Vqf32_vadd_Vqf32Vqf32(output_v, Q6_V_lo_W(c1_coeff_vp)); + output_v = Q6_Vqf32_vmpy_Vqf32Vqf32(output_v, input_v_qf32); + output_v = Q6_Vqf32_vadd_Vqf32Vqf32(output_v, Q6_V_lo_W(c0_coeff_vp)); + + // /* Store results to the output buffer and convert from qf32 to sf */ + // *((HVX_UVector *)(output_v_ptr++)) = Q6_Vsf_equals_Vqf32(output_v); + + + /* Convert from qf32 to sf, store output and go to handle leftover */ + HVX_Vector output_v_f32 = Q6_Vsf_equals_Vqf32(output_v); + HVX_VectorPred pred_cap_left = Q6_Q_vcmp_gt_VsfVsf(input_min_v_f, sline_tmp); // 1 if input_min_v_f > sline_tmp + output_v_f32 = Q6_V_vmux_QVV(pred_cap_left, zero_vec, output_v_f32); // if sline_tmp> input_min_v_f, set to zero + + HVX_VectorPred pred_cap_right = 
Q6_Q_vcmp_gt_VsfVsf(sline_tmp, input_max_v_f); // 1 if sline_tmp > input_max_v_f + output_v_f32 = Q6_V_vmux_QVV(pred_cap_right, sline_tmp, output_v_f32); // if sline_tmp> input_max_v_f, set to whatever the sline_tmp was + + *((HVX_UVector *)(output_v_ptr++)) = output_v_f32; + + + /* Prepare slinep for next iteration */ + slinep = slinec; + } + } + + /* Handle last whole vector from input data */ + if (vectors_in_rounddown > 0) + { + slinec = is_aligned(input_v_ptr, 128) && leftover == 0 ? slinep : *input_v_ptr++; + sline = Q6_V_valign_VVR(slinec, slinep, (size_t) input); + sline_tmp = sline; + + /* Shift input range from [input_min, input_max] to [0, input_max - input_min] */ + input_shifted_v_qf32 = Q6_Vqf32_vsub_VsfVsf(sline, input_min_v_f); + + /* Scale shifted input range from [0, input_max - input_min] to [0,16.0) */ + input_scaled_v_qf32 = Q6_Vqf32_vmpy_Vqf32Vqf32(input_shifted_v_qf32, scale_v); + + /* + * VLUT 16 requires integer indexes. Shift scaled input range from [0,16.0) + * to [16.0,32.0) in order to convert float indexes to integer values. + * Float values, represented in IEEE 754, in range [16.0,32.0] have the + * same exponent, which means 4 MSB of mantissa carry information about + * integer index. + */ + input_scaled_v_qf32 = Q6_Vqf32_vadd_Vqf32Vsf(input_scaled_v_qf32, const16_0_v_sf); + + /* Convert back from qf32 to sf in order to extract integer index */ + tmp_v = Q6_Vsf_equals_Vqf32(input_scaled_v_qf32); + + /* Only 4 MSB bits of mantissa represent segment index */ + idx1_v = Q6_Vuw_vlsr_VuwR(tmp_v, 19); + + /* Ensure only 4 MSB bits of mantissa are used as indexes */ + idx1_v = Q6_V_vand_VV(idx1_v, mask_idx1_v); + idx1_v = Q6_V_vor_VV(idx1_v, mask_idx2_v); + idx2_v = Q6_Vw_vasl_VwR(idx1_v, 16); + + /* Obtain the polynomial coefficients from lookup table */ + c0_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c0_coeff_dv.VV), 1); + c0_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c0_coeff_vp, idx2_v, Q6_V_hi_W(c0_coeff_dv.VV), 1); + c1_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c1_coeff_dv.VV), 1); + c1_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c1_coeff_vp, idx2_v, Q6_V_hi_W(c1_coeff_dv.VV), 1); + c2_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c2_coeff_dv.VV), 1); + c2_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c2_coeff_vp, idx2_v, Q6_V_hi_W(c2_coeff_dv.VV), 1); + c3_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c3_coeff_dv.VV), 1); + c3_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c3_coeff_vp, idx2_v, Q6_V_hi_W(c3_coeff_dv.VV), 1); + c4_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c4_coeff_dv.VV), 1); + c4_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c4_coeff_vp, idx2_v, Q6_V_hi_W(c4_coeff_dv.VV), 1); + c5_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c5_coeff_dv.VV), 1); + c5_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c5_coeff_vp, idx2_v, Q6_V_hi_W(c5_coeff_dv.VV), 1); + c6_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c6_coeff_dv.VV), 1); + c6_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c6_coeff_vp, idx2_v, Q6_V_hi_W(c6_coeff_dv.VV), 1); + c7_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c7_coeff_dv.VV), 1); + c7_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c7_coeff_vp, idx2_v, Q6_V_hi_W(c7_coeff_dv.VV), 1); + + /* Convert input from sf vector to qf32 vector for Horner's method*/ + input_v_qf32 = Q6_Vqf32_vadd_VsfVsf(sline, zero_v_sf); + + /* Perform evaluation of polynomial using Horner's method */ + output_v = Q6_Vqf32_vmpy_Vqf32Vqf32(Q6_V_lo_W(c7_coeff_vp), input_v_qf32); + output_v = Q6_Vqf32_vadd_Vqf32Vqf32(output_v, Q6_V_lo_W(c6_coeff_vp)); + output_v = Q6_Vqf32_vmpy_Vqf32Vqf32(output_v, input_v_qf32); + output_v = 
Q6_Vqf32_vadd_Vqf32Vqf32(output_v, Q6_V_lo_W(c5_coeff_vp)); + output_v = Q6_Vqf32_vmpy_Vqf32Vqf32(output_v, input_v_qf32); + output_v = Q6_Vqf32_vadd_Vqf32Vqf32(output_v, Q6_V_lo_W(c4_coeff_vp)); + output_v = Q6_Vqf32_vmpy_Vqf32Vqf32(output_v, input_v_qf32); + output_v = Q6_Vqf32_vadd_Vqf32Vqf32(output_v, Q6_V_lo_W(c3_coeff_vp)); + output_v = Q6_Vqf32_vmpy_Vqf32Vqf32(output_v, input_v_qf32); + output_v = Q6_Vqf32_vadd_Vqf32Vqf32(output_v, Q6_V_lo_W(c2_coeff_vp)); + output_v = Q6_Vqf32_vmpy_Vqf32Vqf32(output_v, input_v_qf32); + output_v = Q6_Vqf32_vadd_Vqf32Vqf32(output_v, Q6_V_lo_W(c1_coeff_vp)); + output_v = Q6_Vqf32_vmpy_Vqf32Vqf32(output_v, input_v_qf32); + output_v = Q6_Vqf32_vadd_Vqf32Vqf32(output_v, Q6_V_lo_W(c0_coeff_vp)); + + /* Convert from qf32 to sf, store output and go to handle leftover */ + HVX_Vector output_v_f32 = Q6_Vsf_equals_Vqf32(output_v); + HVX_VectorPred pred_cap_left = Q6_Q_vcmp_gt_VsfVsf(input_min_v_f, sline_tmp); // 1 if input_min_v_f > sline_tmp + output_v_f32 = Q6_V_vmux_QVV(pred_cap_left, zero_vec, output_v_f32); // if sline_tmp> input_min_v_f, set to zero + + HVX_VectorPred pred_cap_right = Q6_Q_vcmp_gt_VsfVsf(sline_tmp, input_max_v_f); // 1 if sline_tmp > input_max_v_f + output_v_f32 = Q6_V_vmux_QVV(pred_cap_right, sline_tmp, output_v_f32); // if sline_tmp> input_max_v_f, set to whatever the sline_tmp was + + *((HVX_UVector *)(output_v_ptr++)) = output_v_f32; + + slinep = slinec; + } + + /* Handle leftover elements */ + if (leftover > 0) + { + slinec = (is_in_one_chunk(input_v_ptr, leftover_size, 128) + ? slinep + : *input_v_ptr++); + + sline = Q6_V_valign_VVR(slinec, slinep, (size_t) input); + sline_tmp = sline; + + /* Shift input range from [input_min, input_max] to [0, input_max - input_min] */ + input_shifted_v_qf32 = Q6_Vqf32_vsub_VsfVsf(sline, input_min_v_f); + + /* Scale shifted input range from [0, input_max - input_min] to [0,16.0) */ + input_scaled_v_qf32 = Q6_Vqf32_vmpy_Vqf32Vqf32(input_shifted_v_qf32, scale_v); + + /* + * VLUT 16 requires integer indexes. Shift scaled input range from [0,16.0) + * to [16.0,32.0) in order to convert float indexes to integer values. + * Float values, represented in IEEE 754, in range [16.0,32.0] have the + * same exponent, which means 4 MSB of mantissa carry information about + * integer index. 
+ */ + input_scaled_v_qf32 = Q6_Vqf32_vadd_Vqf32Vsf(input_scaled_v_qf32, const16_0_v_sf); + + /* Convert back from qf32 to sf in order to extract integer index */ + tmp_v = Q6_Vsf_equals_Vqf32(input_scaled_v_qf32); + + /* Only 4 MSB bits of mantissa represent segment index */ + idx1_v = Q6_Vuw_vlsr_VuwR(tmp_v, 19); + + /* Ensure only 4 MSB bits of mantissa are used as indexes */ + idx1_v = Q6_V_vand_VV(idx1_v, mask_idx1_v); + idx1_v = Q6_V_vor_VV(idx1_v, mask_idx2_v); + idx2_v = Q6_Vw_vasl_VwR(idx1_v, 16); + + /* Obtain the polynomial coefficients from lookup table */ + c0_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c0_coeff_dv.VV), 1); + c0_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c0_coeff_vp, idx2_v, Q6_V_hi_W(c0_coeff_dv.VV), 1); + c1_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c1_coeff_dv.VV), 1); + c1_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c1_coeff_vp, idx2_v, Q6_V_hi_W(c1_coeff_dv.VV), 1); + c2_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c2_coeff_dv.VV), 1); + c2_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c2_coeff_vp, idx2_v, Q6_V_hi_W(c2_coeff_dv.VV), 1); + c3_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c3_coeff_dv.VV), 1); + c3_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c3_coeff_vp, idx2_v, Q6_V_hi_W(c3_coeff_dv.VV), 1); + c4_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c4_coeff_dv.VV), 1); + c4_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c4_coeff_vp, idx2_v, Q6_V_hi_W(c4_coeff_dv.VV), 1); + c5_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c5_coeff_dv.VV), 1); + c5_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c5_coeff_vp, idx2_v, Q6_V_hi_W(c5_coeff_dv.VV), 1); + c6_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c6_coeff_dv.VV), 1); + c6_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c6_coeff_vp, idx2_v, Q6_V_hi_W(c6_coeff_dv.VV), 1); + c7_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c7_coeff_dv.VV), 1); + c7_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c7_coeff_vp, idx2_v, Q6_V_hi_W(c7_coeff_dv.VV), 1); + + /* Convert input from sf vector to qf32 vector for Horner's method*/ + input_v_qf32 = Q6_Vqf32_vadd_VsfVsf(sline, zero_v_sf); + + /* Perform evaluation of polynomial using Horner's method */ + output_v = Q6_Vqf32_vmpy_Vqf32Vqf32(Q6_V_lo_W(c7_coeff_vp), input_v_qf32); + output_v = Q6_Vqf32_vadd_Vqf32Vqf32(output_v, Q6_V_lo_W(c6_coeff_vp)); + output_v = Q6_Vqf32_vmpy_Vqf32Vqf32(output_v, input_v_qf32); + output_v = Q6_Vqf32_vadd_Vqf32Vqf32(output_v, Q6_V_lo_W(c5_coeff_vp)); + output_v = Q6_Vqf32_vmpy_Vqf32Vqf32(output_v, input_v_qf32); + output_v = Q6_Vqf32_vadd_Vqf32Vqf32(output_v, Q6_V_lo_W(c4_coeff_vp)); + output_v = Q6_Vqf32_vmpy_Vqf32Vqf32(output_v, input_v_qf32); + output_v = Q6_Vqf32_vadd_Vqf32Vqf32(output_v, Q6_V_lo_W(c3_coeff_vp)); + output_v = Q6_Vqf32_vmpy_Vqf32Vqf32(output_v, input_v_qf32); + output_v = Q6_Vqf32_vadd_Vqf32Vqf32(output_v, Q6_V_lo_W(c2_coeff_vp)); + output_v = Q6_Vqf32_vmpy_Vqf32Vqf32(output_v, input_v_qf32); + output_v = Q6_Vqf32_vadd_Vqf32Vqf32(output_v, Q6_V_lo_W(c1_coeff_vp)); + output_v = Q6_Vqf32_vmpy_Vqf32Vqf32(output_v, input_v_qf32); + output_v = Q6_Vqf32_vadd_Vqf32Vqf32(output_v, Q6_V_lo_W(c0_coeff_vp)); + + /* Convert from qf32 to sf */ + // sout = Q6_Vsf_equals_Vqf32(output_v); + HVX_Vector output_v_f32 = Q6_Vsf_equals_Vqf32(output_v); + HVX_VectorPred pred_cap_left = Q6_Q_vcmp_gt_VsfVsf(input_min_v_f, sline_tmp); // 1 if input_min_v_f > sline_tmp + output_v_f32 = Q6_V_vmux_QVV(pred_cap_left, zero_vec, output_v_f32); // if sline_tmp> input_min_v_f, set to zero + + + HVX_VectorPred pred_cap_right = Q6_Q_vcmp_gt_VsfVsf(sline_tmp, input_max_v_f); // 1 if sline_tmp > input_max_v_f + output_v_f32 
= Q6_V_vmux_QVV(pred_cap_right, sline_tmp, output_v_f32); // if sline_tmp> input_max_v_f, set to whatever the sline_tmp was + + sout = output_v_f32; + /* Store output */ + vstu_variable(output_v_ptr, leftover_size, sout); + } + + return 0; +} + +#endif /* __HVX_ARCH__ >= 68 */ diff --git a/ggml/src/ggml-hexagon/htp/qhcg_approximation.h b/ggml/src/ggml-hexagon/htp/qhcg_approximation.h new file mode 100644 index 0000000000..6f70e209ff --- /dev/null +++ b/ggml/src/ggml-hexagon/htp/qhcg_approximation.h @@ -0,0 +1,21 @@ +/**============================================================================= +@file + qhcg_approximation.h + +@brief + Header file of polynomial approximation generated by QHCG + +Copyright (c) 2020 Qualcomm Technologies Incorporated. +All Rights Reserved. Qualcomm Proprietary and Confidential. +=============================================================================**/ + +#ifndef __qhcg_approximation__ +#define __qhcg_approximation__ + +#include + +int32_t qhcg_approximation(float *inputs, float *outputs, uint32_t length, + float limit_left, float limit_right +); + +#endif /* __qhcg_approximation__ */ diff --git a/ggml/src/ggml-hexagon/htp/qhcg_internal.h b/ggml/src/ggml-hexagon/htp/qhcg_internal.h new file mode 100644 index 0000000000..618610dc88 --- /dev/null +++ b/ggml/src/ggml-hexagon/htp/qhcg_internal.h @@ -0,0 +1,91 @@ +/**============================================================================= +@file + hvx_internal.h + +@brief + Header file for HVX routines. + +Copyright (c) 2020 Qualcomm Technologies Incorporated. +All Rights Reserved. Qualcomm Proprietary and Confidential. +=============================================================================**/ + +#ifndef _HVX_INTERNAL_H +#define _HVX_INTERNAL_H + +#include // size_t +#include + +#define HVX_INLINE_ALWAYS inline __attribute__((unused,always_inline)) + +#ifndef LOG2VLEN +#define LOG2VLEN 7 +#endif +#define VLEN (1<>1 // HVX vector - number of int16_t elements +#define VLEN_WORD (1<>2 // HVX vector - number of int32_t elements + +typedef union +{ + HVX_VectorPair VV; + struct + { + HVX_Vector lo; + HVX_Vector hi; + } V; +} HVX_DV; + +static HVX_INLINE_ALWAYS void l2fetch(const void *p, uint32_t stride, + uint32_t width, uint32_t height, + uint32_t dir) +{ + uint64_t control = HEXAGON_V64_CREATE_H(dir, stride, width, height); + __asm__ __volatile__ (" l2fetch(%0,%1) " : :"r"(p),"r"(control)); +} + +/* Return whether address is aligned. */ + +static HVX_INLINE_ALWAYS int32_t is_aligned(void *addr, uint32_t align) +{ + return ((size_t) addr & (align - 1)) == 0; +} + +/* Return whether 'n' elements from vector are in the one chunk of 'chunk_size'. */ + +static HVX_INLINE_ALWAYS int32_t is_in_one_chunk(void *addr, uint32_t n, + uint32_t chunk_size) +{ + uint32_t left_off = (size_t) addr & (chunk_size - 1); + uint32_t right_off = left_off + n; + return right_off <= chunk_size; +} + +/* + * This function stores the first n bytes from vector vin to address 'addr'. + * n must be in range 1..128 and addr may have any alignment. Does one or + * two masked stores. + */ + +static HVX_INLINE_ALWAYS void vstu_variable(void *addr, uint32_t n, + HVX_Vector vin) +{ + /* Rotate as needed. 
*/ + vin = Q6_V_vlalign_VVR(vin, vin, (size_t) addr); + + uint32_t left_off = (size_t) addr & 127; + uint32_t right_off = left_off + n; + + HVX_VectorPred ql_not = Q6_Q_vsetq_R((size_t) addr); + HVX_VectorPred qr = Q6_Q_vsetq2_R(right_off); + + if (right_off > 128) + { + Q6_vmem_QRIV(qr, (HVX_Vector*) addr + 1, vin); + /* all 1's */ + qr = Q6_Q_vcmp_eq_VbVb(vin, vin); + } + + ql_not = Q6_Q_or_QQn(ql_not, qr); + Q6_vmem_QnRIV(ql_not, (HVX_Vector*) addr, vin); +} + +#endif /* _HVX_INTERNAL_H */ From 72339994d45b2bed887e79994403c378d90b62b5 Mon Sep 17 00:00:00 2001 From: shouyud Date: Thu, 11 Dec 2025 16:59:06 -0500 Subject: [PATCH 03/14] test: disable l2-block prefetch in polynomail approximation --- ggml/src/ggml-hexagon/htp/qhcg_approximation.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/qhcg_approximation.c b/ggml/src/ggml-hexagon/htp/qhcg_approximation.c index 5068c5fa34..c2ecfff4fe 100644 --- a/ggml/src/ggml-hexagon/htp/qhcg_approximation.c +++ b/ggml/src/ggml-hexagon/htp/qhcg_approximation.c @@ -120,7 +120,7 @@ int32_t qhcg_approximation(float *restrict input, float *restrict output, uint32 HVX_Vector sline; HVX_Vector sline_tmp; HVX_Vector sout; - int32_t block, l2fetch_block; + int32_t block; //l2fetch_block; int32_t leftover = size & 31; int32_t vectors_in_rounddown = size / 32; int32_t leftover_size = leftover * sizeof(float); @@ -241,12 +241,12 @@ int32_t qhcg_approximation(float *restrict input, float *restrict output, uint32 for (int32_t i = vectors_in_rounddown - 1; i > 0; i -= BLOCK_SIZE) { block = Q6_R_min_RR(i, BLOCK_SIZE); - l2fetch_block = Q6_R_min_RR(i - L2FETCH_AHEAD, BLOCK_SIZE); + //l2fetch_block = Q6_R_min_RR(i - L2FETCH_AHEAD, BLOCK_SIZE); - if (l2fetch_block > 0) - { - l2fetch(input_v_ptr + L2FETCH_AHEAD, 128, 128, l2fetch_block, 0); - } + // if (l2fetch_block > 0) + // { + // l2fetch(input_v_ptr + L2FETCH_AHEAD, 128, 128, l2fetch_block, 0); + // } /* Process one vector at a time */ for (int32_t j = 0; j < block; ++j) From 470b499130e46e7aa3acc7022240775278cbcf1f Mon Sep 17 00:00:00 2001 From: shouyud Date: Fri, 12 Dec 2025 10:03:51 -0500 Subject: [PATCH 04/14] Revert "test: disable l2-block prefetch in polynomail approximation" This reverts commit 72339994d45b2bed887e79994403c378d90b62b5. 
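[Editor's aside, not part of the patch] This revert re-enables the l2 prefetch in the polynomial routine; the next one removes the polynomial experiment entirely and falls back to the sigmoid-based GELU path from the first patch. For reference, a scalar sketch of the two approximations named in the act-ops.c comments; the function names here are illustrative only:

#include <math.h>

/* restored path: gelu(x) ~= x * sigmoid(1.702 * x) */
static inline float gelu_sigmoid_ref(float x)
{
    return x / (1.0f + expf(-1.702f * x));
}

/* tanh form quoted in the comment, for comparison:
   gelu(x) ~= 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3))) */
static inline float gelu_tanh_ref(float x)
{
    const float k = 0.7978845608f; /* sqrt(2/pi) */
    return 0.5f * x * (1.0f + tanhf(k * (x + 0.044715f * x * x * x)));
}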
--- ggml/src/ggml-hexagon/htp/qhcg_approximation.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/qhcg_approximation.c b/ggml/src/ggml-hexagon/htp/qhcg_approximation.c index c2ecfff4fe..5068c5fa34 100644 --- a/ggml/src/ggml-hexagon/htp/qhcg_approximation.c +++ b/ggml/src/ggml-hexagon/htp/qhcg_approximation.c @@ -120,7 +120,7 @@ int32_t qhcg_approximation(float *restrict input, float *restrict output, uint32 HVX_Vector sline; HVX_Vector sline_tmp; HVX_Vector sout; - int32_t block; //l2fetch_block; + int32_t block, l2fetch_block; int32_t leftover = size & 31; int32_t vectors_in_rounddown = size / 32; int32_t leftover_size = leftover * sizeof(float); @@ -241,12 +241,12 @@ int32_t qhcg_approximation(float *restrict input, float *restrict output, uint32 for (int32_t i = vectors_in_rounddown - 1; i > 0; i -= BLOCK_SIZE) { block = Q6_R_min_RR(i, BLOCK_SIZE); - //l2fetch_block = Q6_R_min_RR(i - L2FETCH_AHEAD, BLOCK_SIZE); + l2fetch_block = Q6_R_min_RR(i - L2FETCH_AHEAD, BLOCK_SIZE); - // if (l2fetch_block > 0) - // { - // l2fetch(input_v_ptr + L2FETCH_AHEAD, 128, 128, l2fetch_block, 0); - // } + if (l2fetch_block > 0) + { + l2fetch(input_v_ptr + L2FETCH_AHEAD, 128, 128, l2fetch_block, 0); + } /* Process one vector at a time */ for (int32_t j = 0; j < block; ++j) From 999492fe9b1149df30df57ce91ad08bd8b96c0c0 Mon Sep 17 00:00:00 2001 From: shouyud Date: Fri, 12 Dec 2025 10:04:05 -0500 Subject: [PATCH 05/14] Revert "snapshot: faster gelu using polynomial approximation" This reverts commit 2a787a61d11f9e63e5943a2e6d134b2f0c402ace. --- ggml/src/ggml-hexagon/htp/CMakeLists.txt | 1 - ggml/src/ggml-hexagon/htp/act-ops.c | 16 +- ggml/src/ggml-hexagon/htp/hvx-exp.c | 4 +- ggml/src/ggml-hexagon/htp/hvx-utils.c | 36 +- .../src/ggml-hexagon/htp/qhcg_approximation.c | 518 ------------------ .../src/ggml-hexagon/htp/qhcg_approximation.h | 21 - ggml/src/ggml-hexagon/htp/qhcg_internal.h | 91 --- 7 files changed, 21 insertions(+), 666 deletions(-) delete mode 100644 ggml/src/ggml-hexagon/htp/qhcg_approximation.c delete mode 100644 ggml/src/ggml-hexagon/htp/qhcg_approximation.h delete mode 100644 ggml/src/ggml-hexagon/htp/qhcg_internal.h diff --git a/ggml/src/ggml-hexagon/htp/CMakeLists.txt b/ggml/src/ggml-hexagon/htp/CMakeLists.txt index fa350be19e..22e3fea11d 100644 --- a/ggml/src/ggml-hexagon/htp/CMakeLists.txt +++ b/ggml/src/ggml-hexagon/htp/CMakeLists.txt @@ -28,7 +28,6 @@ add_library(${HTP_LIB} SHARED softmax-ops.c act-ops.c rope-ops.c - qhcg_approximation.c ) target_compile_definitions(${HTP_LIB} PRIVATE diff --git a/ggml/src/ggml-hexagon/htp/act-ops.c b/ggml/src/ggml-hexagon/htp/act-ops.c index b0d4d1a477..2db4a2a35b 100644 --- a/ggml/src/ggml-hexagon/htp/act-ops.c +++ b/ggml/src/ggml-hexagon/htp/act-ops.c @@ -24,9 +24,6 @@ #include "hvx-utils.h" #include "ops-utils.h" - -#include "qhcg_approximation.h" - #define htp_act_preamble3 \ const uint32_t ne00 = src0->ne[0]; \ const uint32_t ne01 = src0->ne[1]; \ @@ -309,7 +306,7 @@ static void unary_gelu_fp32_per_thread(const struct htp_tensor * src0, htp_l2fetch(src0 + src0_row_size, 1, src0_row_size, src0_row_size); } - #if 0 + // gelu = 0.5 * x * (1.0 + tanh( sqrt(2/pi) * (x + 0.044715 * x^3) )) // gelu_tanh // gelu = x * sigmoid(1.702 * x) // current implementation if (1 == opt_path) { @@ -326,17 +323,6 @@ static void unary_gelu_fp32_per_thread(const struct htp_tensor * src0, hvx_mul_f32((const uint8_t *) src0, src0_spad_data, (uint8_t *) dst, ne0); } - #else - - // alternative method - float low_bound = 
-6.0f; - float up_bound = 6.0f; - - qhcg_approximation( (float*)src0, (float*)dst, ne0, low_bound, up_bound ); - - - #endif - } t2 = HAP_perf_get_qtimer_count(); diff --git a/ggml/src/ggml-hexagon/htp/hvx-exp.c b/ggml/src/ggml-hexagon/htp/hvx-exp.c index 1f5e9e476c..21bf46a542 100644 --- a/ggml/src/ggml-hexagon/htp/hvx-exp.c +++ b/ggml/src/ggml-hexagon/htp/hvx-exp.c @@ -31,13 +31,13 @@ void hvx_exp_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int int unaligned_addr = 0; int unaligned_loop = 0; if ((0 == htp_is_aligned((void *) src, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) { - //FARF(HIGH, "hvx_exp_f32: unaligned address in hvx op, possibly slower execution\n"); + FARF(HIGH, "hvx_exp_f32: unaligned address in hvx op, possibly slower execution\n"); unaligned_addr = 1; } // assert((0 == unaligned_addr) || (0 == num_elems_whole)); if ((1 == unaligned_addr) && (num_elems_whole != 0)) { unaligned_loop = 1; - //FARF(HIGH, "hvx_exp_f32: unaligned loop in hvx op, possibly slower execution\n"); + FARF(HIGH, "hvx_exp_f32: unaligned loop in hvx op, possibly slower execution\n"); } HVX_Vector vec_out = Q6_V_vzero(); diff --git a/ggml/src/ggml-hexagon/htp/hvx-utils.c b/ggml/src/ggml-hexagon/htp/hvx-utils.c index 4597423eb9..e02b1d9099 100644 --- a/ggml/src/ggml-hexagon/htp/hvx-utils.c +++ b/ggml/src/ggml-hexagon/htp/hvx-utils.c @@ -40,13 +40,13 @@ void hvx_mul_f32(const uint8_t * restrict src0, int unaligned_loop = 0; if ((0 == htp_is_aligned((void *) src0, VLEN)) || (0 == htp_is_aligned((void *) src1, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) { - //FARF(HIGH, "hvx_mul_f32: unaligned address in hvx op, possibly slower execution\n"); + FARF(HIGH, "hvx_mul_f32: unaligned address in hvx op, possibly slower execution\n"); unaligned_addr = 1; } if ((1 == unaligned_addr) && (num_elems_whole != 0)) { unaligned_loop = 1; - //FARF(HIGH, "hvx_mul_f32: unaligned loop in hvx op, possibly slower execution\n"); + FARF(HIGH, "hvx_mul_f32: unaligned loop in hvx op, possibly slower execution\n"); } if (0 == unaligned_loop) { @@ -252,13 +252,13 @@ void hvx_add_f32(const uint8_t * restrict src0, int unaligned_loop = 0; if ((0 == htp_is_aligned((void *) src0, VLEN)) || (0 == htp_is_aligned((void *) src1, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) { - //FARF(HIGH, "hvx_add_f32: unaligned address in hvx op, possibly slower execution\n"); + FARF(HIGH, "hvx_add_f32: unaligned address in hvx op, possibly slower execution\n"); unaligned_addr = 1; } if ((1 == unaligned_addr) && (num_elems_whole != 0)) { unaligned_loop = 1; - //FARF(HIGH, "hvx_add_f32: unaligned loop in hvx op, possibly slower execution\n"); + FARF(HIGH, "hvx_add_f32: unaligned loop in hvx op, possibly slower execution\n"); } if (0 == unaligned_loop) { @@ -392,13 +392,13 @@ void hvx_add_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * int unaligned_addr = 0; int unaligned_loop = 0; if ((0 == htp_is_aligned((void *) src, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) { - //FARF(HIGH, "hvx_add_scalar_f32: unaligned address in hvx op, possibly slower execution\n"); + FARF(HIGH, "hvx_add_scalar_f32: unaligned address in hvx op, possibly slower execution\n"); unaligned_addr = 1; } if ((1 == unaligned_addr) && (num_elems_whole != 0)) { unaligned_loop = 1; - //FARF(HIGH, "hvx_add_scalar_f32: unaligned loop in hvx op, possibly slower execution\n"); + FARF(HIGH, "hvx_add_scalar_f32: unaligned loop in hvx op, possibly slower execution\n"); } static const float kInf = INFINITY; @@ -454,13 +454,13 @@ 
void hvx_mul_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * int unaligned_addr = 0; int unaligned_loop = 0; if ((0 == htp_is_aligned((void *) src, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) { - //FARF(HIGH, "hvx_mul_scalar_f32: unaligned address in hvx op, possibly slower execution\n"); + FARF(HIGH, "hvx_mul_scalar_f32: unaligned address in hvx op, possibly slower execution\n"); unaligned_addr = 1; } if ((1 == unaligned_addr) && (num_elems_whole != 0)) { unaligned_loop = 1; - //FARF(HIGH, "hvx_mul_scalar_f32: unaligned loop in hvx op, possibly slower execution\n"); + FARF(HIGH, "hvx_mul_scalar_f32: unaligned loop in hvx op, possibly slower execution\n"); } HVX_Vector val_vec = hvx_vec_splat_fp32(val); @@ -507,13 +507,13 @@ void hvx_sub_f32(const uint8_t * restrict src0, int unaligned_loop = 0; if ((0 == htp_is_aligned((void *) src0, VLEN)) || (0 == htp_is_aligned((void *) src1, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) { - //FARF(HIGH, "hvx_sub_f32: unaligned address in hvx op, possibly slower execution\n"); + FARF(HIGH, "hvx_sub_f32: unaligned address in hvx op, possibly slower execution\n"); unaligned_addr = 1; } if ((1 == unaligned_addr) && (num_elems_whole != 0)) { unaligned_loop = 1; - //FARF(HIGH, "hvx_sub_f32: unaligned loop in hvx op, possibly slower execution\n"); + FARF(HIGH, "hvx_sub_f32: unaligned loop in hvx op, possibly slower execution\n"); } if (0 == unaligned_loop) { @@ -647,13 +647,13 @@ void hvx_sub_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * int unaligned_addr = 0; int unaligned_loop = 0; if ((0 == htp_is_aligned((void *) src, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) { - //FARF(HIGH, "hvx_sub_scalar_f32: unaligned address in hvx op, possibly slower execution\n"); + FARF(HIGH, "hvx_sub_scalar_f32: unaligned address in hvx op, possibly slower execution\n"); unaligned_addr = 1; } if ((1 == unaligned_addr) && (num_elems_whole != 0)) { unaligned_loop = 1; - //FARF(HIGH, "hvx_sub_scalar_f32: unaligned loop in hvx op, possibly slower execution\n"); + FARF(HIGH, "hvx_sub_scalar_f32: unaligned loop in hvx op, possibly slower execution\n"); } HVX_Vector val_vec = hvx_vec_splat_fp32(val); @@ -733,13 +733,13 @@ float hvx_self_sum_f32(const uint8_t * restrict src, const int num_elems) { int unaligned_addr = 0; int unaligned_loop = 0; if (0 == htp_is_aligned((void *) src, VLEN)) { - //FARF(HIGH, "hvx_self_sum_f32: unaligned address in hvx op, possibly slower execution\n"); + FARF(HIGH, "hvx_self_sum_f32: unaligned address in hvx op, possibly slower execution\n"); unaligned_addr = 1; } if ((1 == unaligned_addr) && (num_elems_whole != 0)) { unaligned_loop = 1; - //FARF(HIGH, "hvx_self_sum_f32: unaligned loop in hvx op, possibly slower execution\n"); + FARF(HIGH, "hvx_self_sum_f32: unaligned loop in hvx op, possibly slower execution\n"); } HVX_Vector sum_vec = Q6_V_vsplat_R(0x00000000); @@ -782,13 +782,13 @@ void hvx_scale_f32(const uint8_t * restrict src, uint8_t * restrict dst, const i int unaligned_addr = 0; int unaligned_loop = 0; if ((0 == htp_is_aligned((void *) src, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) { - //FARF(HIGH, "hvx_scale_f32: unaligned address in hvx op, possibly slower execution\n"); + FARF(HIGH, "hvx_scale_f32: unaligned address in hvx op, possibly slower execution\n"); unaligned_addr = 1; } if ((1 == unaligned_addr) && (num_elems_whole != 0)) { unaligned_loop = 1; - //FARF(HIGH, "hvx_scale_f32: unaligned loop in hvx op, possibly slower execution\n"); + FARF(HIGH, 
"hvx_scale_f32: unaligned loop in hvx op, possibly slower execution\n"); } HVX_Vector scale_vec = hvx_vec_splat_fp32(scale); @@ -831,13 +831,13 @@ float hvx_self_max_f32(const uint8_t * restrict src, const int num_elems) { int unaligned_addr = 0; int unaligned_loop = 0; if (0 == htp_is_aligned((void *) src, VLEN)) { - //FARF(HIGH, "hvx_self_max_f32: unaligned address in hvx op, possibly slower execution\n"); + FARF(HIGH, "hvx_self_max_f32: unaligned address in hvx op, possibly slower execution\n"); unaligned_addr = 1; } if ((1 == unaligned_addr) && (num_elems_whole != 0)) { unaligned_loop = 1; - //FARF(HIGH, "hvx_self_max_f32: unaligned loop in hvx op, possibly slower execution\n"); + FARF(HIGH, "hvx_self_max_f32: unaligned loop in hvx op, possibly slower execution\n"); } HVX_Vector vec_max = hvx_vec_splat_fp32(((const float *) src)[0]); diff --git a/ggml/src/ggml-hexagon/htp/qhcg_approximation.c b/ggml/src/ggml-hexagon/htp/qhcg_approximation.c deleted file mode 100644 index 5068c5fa34..0000000000 --- a/ggml/src/ggml-hexagon/htp/qhcg_approximation.c +++ /dev/null @@ -1,518 +0,0 @@ -/**============================================================================= -@file - qhcg_approximation.c - -@brief - Calculate polynomial approximation of the function below in - floating-point arithmetic using HVX instructions. - - Function: gelu(x) - - Function is approximated in specified input range from -6.0 to 6.0, - where inputs and outputs are arrays of 32-bit float values. - - Approximation is performed using the following method: - - 1) Input range is split into 16 equidistant segments - 2) For each segment, Numpy's polynomial package is used to find the best - polynomial approximation of order N with the corresponding C0, C1, ..., Cn. - 3) VLUT instructions are used to select appropriate coefficients for each input sample - 4) Horner's method is used to compute polynomial values: - f(x) = ((((Cn*x + Cn-1)*x + Cn-2)*x + ...)*x + C1)*x + C0 - -Copyright (c) 2020 Qualcomm Technologies Incorporated. -All Rights Reserved. Qualcomm Proprietary and Confidential. 
-=============================================================================**/ - -#if __HVX_ARCH__ >= 68 - -#include "qhcg_approximation.h" -#include "qhcg_internal.h" - -#define BLOCK_SIZE (8*1024/128) /* vector chunks */ -#define L2FETCH_AHEAD (BLOCK_SIZE) - -/* Polynomial coefficients */ -static const float c0_coeffs[32] __attribute__((aligned(VLEN))) = -{ - 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, - 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, --0.1025868397073178,-1.1184356646199394,-1.9705895994321767,0.11469604839384463,0.40991447569341943,0.00424292239610935,-0.0017846707638177889,4.125901398310816e-09, -9.718309490480692e-11,-0.0015488336803479719,0.001064556481209511,0.3906162486717146,0.19084584900320978,-1.911422745140333,-1.1879384314707315,-0.10823562636002611, -}; -static const float c1_coeffs[32] __attribute__((aligned(VLEN))) = -{ - 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, - 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, --0.1234196807250312,-1.5042580469229814,-2.7701977888429816,1.1561921948215528,1.73891533063333,0.49580124294548433,0.4867587290479026,0.500000435462697, -0.4999997919981341,0.5116842338641109,0.5163606020356294,-0.6867154811454343,-0.31551789326265844,3.6694157536939014,2.6042137343731855,1.1304321895807614, -}; -static const float c2_coeffs[32] __attribute__((aligned(VLEN))) = -{ - 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, - 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, --0.06367500510012546,-0.8689061926460069,-1.674005553705795,1.5013658408230053,1.9798213609930566,0.33731544915026324,0.35673778512915555,0.398953295788538, -0.3989496120997857,0.3611051680040998,0.31742994078248077,1.9193992198306873,1.6441036493618186,-1.600477678714911,-0.9304878890577859,-0.06740463140212431, -}; -static const float c3_coeffs[32] __attribute__((aligned(VLEN))) = -{ - 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, - 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, --0.01826132753437031,-0.27938965958246625,-0.56347555462781,0.8662803078586866,1.0748504969694623,-0.13840364789720844,-0.07490683960610874,0.00011501805987770841, --8.89610380930177e-05,0.06815977365648013,0.1564140217086786,-1.036053072449464,-0.9372597866516783,0.5336910940777527,0.3004584208315817,0.019362956684359556, -}; -static const float c4_coeffs[32] __attribute__((aligned(VLEN))) = -{ - 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, - 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, --0.003143995202268695,-0.05400134785573118,-0.11405541169720136,0.2730279204613817,0.3235877725936627,-0.21757221119731435,-0.14645680741997966,-0.06620698974306806, --0.06630082288474698,-0.14025595963442758,-0.22733077791076023,0.30866276792496655,0.29418673249390104,-0.10682071320119783,-0.05832443091378418,-0.003339162830362702, -}; -static const float c5_coeffs[32] __attribute__((aligned(VLEN))) = -{ - 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, - 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, --0.0003249391771954532,-0.006273424264889337,-0.01387698764918236,0.04913223606829913,0.05545197372332761,-0.09031312961799431,-0.04973154630108704,0.001825035162087615, --0.0016453620553813022,0.046340833813673266,0.09347637717015225,-0.0520121796723486,-0.0529133082948728,0.012823220702040979,0.00680542921713579,0.00034567793526667706, -}; -static const float c6_coeffs[32] __attribute__((aligned(VLEN))) = -{ - 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, - 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, --1.866614032852734e-05,-0.0004055443985444202,-0.0009392734765841164,0.004770459893478637,0.00504881095326808,-0.016904688419800747,-0.0049839929986692675,0.013321931926939602, 
-0.01314779303860721,-0.003962384692377502,-0.017472688915232914,0.004609031329816739,0.005145502689303376,-0.0008540539868357813,-0.0004419008815610675,-1.989001488248583e-05, -}; -static const float c7_coeffs[32] __attribute__((aligned(VLEN))) = -{ - 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, - 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, --4.5974928158662533e-07,-1.1252676844375516e-05,-2.7270249210246306e-05,0.0001949129249064408,0.00018774479051508836,-0.001238293494672944,0.000199977799551612,0.0029325624122584024, --0.0028654073893250895,-0.00033083897498689484,0.0012818786224478341,-0.00016368340082111905,-0.0002108418119120288,2.431836142521883e-05,1.2317036094266618e-05,4.906925630164402e-07, -}; - -/** - * @brief Polynomial approximation of gelu(x) function. - * @param[in] input Input array of elements in IEEE 32-bit floating-point format. - * @param[out] output Output array of elements in IEEE 32-bit floating-point format. - * @param[in] length Number of elements in input/output arrays. - * @return Returns 0 on successful execution. Otherwise -1. - */ -int32_t qhcg_approximation(float *restrict input, float *restrict output, uint32_t size, - float limit_left, float limit_right) - -{ - HVX_Vector *input_v_ptr; - HVX_UVector *output_v_ptr; - HVX_Vector input_min_v_f; - HVX_Vector input_max_v_f; - - HVX_Vector input_shifted_v_qf32; - HVX_Vector input_scaled_v_qf32; - HVX_Vector scale_v; - HVX_Vector input_v_qf32; - HVX_Vector const16_0_v_sf; - HVX_Vector zero_v_sf; - HVX_Vector mask_idx1_v, mask_idx2_v; - HVX_Vector tmp_v, idx1_v, idx2_v; - HVX_Vector output_v; - HVX_Vector slinep; - HVX_Vector slinec; - HVX_Vector sline; - HVX_Vector sline_tmp; - HVX_Vector sout; - int32_t block, l2fetch_block; - int32_t leftover = size & 31; - int32_t vectors_in_rounddown = size / 32; - int32_t leftover_size = leftover * sizeof(float); - HVX_DV c0_coeff_dv; - HVX_VectorPair c0_coeff_vp; - HVX_Vector c0_coeff_v; - HVX_DV c1_coeff_dv; - HVX_VectorPair c1_coeff_vp; - HVX_Vector c1_coeff_v; - HVX_DV c2_coeff_dv; - HVX_VectorPair c2_coeff_vp; - HVX_Vector c2_coeff_v; - HVX_DV c3_coeff_dv; - HVX_VectorPair c3_coeff_vp; - HVX_Vector c3_coeff_v; - HVX_DV c4_coeff_dv; - HVX_VectorPair c4_coeff_vp; - HVX_Vector c4_coeff_v; - HVX_DV c5_coeff_dv; - HVX_VectorPair c5_coeff_vp; - HVX_Vector c5_coeff_v; - HVX_DV c6_coeff_dv; - HVX_VectorPair c6_coeff_vp; - HVX_Vector c6_coeff_v; - HVX_DV c7_coeff_dv; - HVX_VectorPair c7_coeff_vp; - HVX_Vector c7_coeff_v; - - HVX_Vector zero_vec = Q6_V_vsplat_R(0x00000000); - - /* Check input arguments. Return error status if some argument has invalid value */ - if ((input == 0) || (output == 0) || (size == 0)) - { - return -1; - } - - input_v_ptr = (HVX_Vector *) input; - output_v_ptr = (HVX_UVector *) output; - - /* - * If input data is not aligned to HVX vector size, compose aligned vectors - * from data loaded in slinep and slinec - */ - slinep = *input_v_ptr++; - - /* - * Splat scale factor in order to be used later for finding indexes of coefficients. - * Scale factor is represented in IEEE 16-bit floating-point format and it is - * calculated using the following formula: - * scale_factor = (16.0 / (b0 - a0)) - * NOTE: Calculated value is slightly decreased in order to avoid out of bound - * indexes during VLUT lookup. - */ - scale_v = Q6_V_vsplat_R(0x3faaaaa9); - - /* - * Vector of zeroes used as neutral element in sf to qf32 conversions. 
- * NOTE: Some of conversions (i.e conversion of scale factor and coefficients) - * can be avoided in real-time, but this is not done in order to don't - * sacrify code readibility in expense of insignificant performance improvement. - */ - zero_v_sf = Q6_V_vzero(); - - /* Mask for extracting only 4 bits of mantissa */ - mask_idx1_v = Q6_V_vsplat_R(0x0000000F); - mask_idx2_v = Q6_V_vsplat_R(0x00000010); - - /* 16.0 in IEEE 16-bit floating-point representation */ - const16_0_v_sf = Q6_V_vsplat_R(0x41800000); - - /* - * Prepare vector of input_min values, that is used later in shifting input range. - * input_min is low boundary of specified input range. - */ - int32_t input_min_bits = *((int32_t *) &limit_left); - int32_t input_max_bits = *((int32_t *) &limit_right); - - input_min_v_f = Q6_V_vsplat_R(input_min_bits); - input_max_v_f = Q6_V_vsplat_R(input_max_bits); - - /* Convert scale factor from sf to q32. Use the same vector for both formats */ - scale_v = Q6_Vqf32_vadd_VsfVsf(scale_v, zero_v_sf); - - /* Load coefficients */ - c0_coeff_v = *((HVX_Vector *)(c0_coeffs)); - c1_coeff_v = *((HVX_Vector *)(c1_coeffs)); - c2_coeff_v = *((HVX_Vector *)(c2_coeffs)); - c3_coeff_v = *((HVX_Vector *)(c3_coeffs)); - c4_coeff_v = *((HVX_Vector *)(c4_coeffs)); - c5_coeff_v = *((HVX_Vector *)(c5_coeffs)); - c6_coeff_v = *((HVX_Vector *)(c6_coeffs)); - c7_coeff_v = *((HVX_Vector *)(c7_coeffs)); - - /* Convert coefficients from sf to qf32 format. Use the same vector for both representations */ - c0_coeff_v = Q6_Vqf32_vadd_VsfVsf(c0_coeff_v, zero_v_sf); - c1_coeff_v = Q6_Vqf32_vadd_VsfVsf(c1_coeff_v, zero_v_sf); - c2_coeff_v = Q6_Vqf32_vadd_VsfVsf(c2_coeff_v, zero_v_sf); - c3_coeff_v = Q6_Vqf32_vadd_VsfVsf(c3_coeff_v, zero_v_sf); - c4_coeff_v = Q6_Vqf32_vadd_VsfVsf(c4_coeff_v, zero_v_sf); - c5_coeff_v = Q6_Vqf32_vadd_VsfVsf(c5_coeff_v, zero_v_sf); - c6_coeff_v = Q6_Vqf32_vadd_VsfVsf(c6_coeff_v, zero_v_sf); - c7_coeff_v = Q6_Vqf32_vadd_VsfVsf(c7_coeff_v, zero_v_sf); - - /* Split 32-bit coefficients to lower and upper part in order to obtain them later with VLUT16. */ - c0_coeff_dv.VV = Q6_Wuw_vzxt_Vuh(c0_coeff_v); - c1_coeff_dv.VV = Q6_Wuw_vzxt_Vuh(c1_coeff_v); - c2_coeff_dv.VV = Q6_Wuw_vzxt_Vuh(c2_coeff_v); - c3_coeff_dv.VV = Q6_Wuw_vzxt_Vuh(c3_coeff_v); - c4_coeff_dv.VV = Q6_Wuw_vzxt_Vuh(c4_coeff_v); - c5_coeff_dv.VV = Q6_Wuw_vzxt_Vuh(c5_coeff_v); - c6_coeff_dv.VV = Q6_Wuw_vzxt_Vuh(c6_coeff_v); - c7_coeff_dv.VV = Q6_Wuw_vzxt_Vuh(c7_coeff_v); - - /* - * Handle number of whole vectors in input data. - * Don't process last vector in order to avoid out-of-boundary load. - */ - for (int32_t i = vectors_in_rounddown - 1; i > 0; i -= BLOCK_SIZE) - { - block = Q6_R_min_RR(i, BLOCK_SIZE); - l2fetch_block = Q6_R_min_RR(i - L2FETCH_AHEAD, BLOCK_SIZE); - - if (l2fetch_block > 0) - { - l2fetch(input_v_ptr + L2FETCH_AHEAD, 128, 128, l2fetch_block, 0); - } - - /* Process one vector at a time */ - for (int32_t j = 0; j < block; ++j) - { - slinec = *input_v_ptr++; - - /* Compose vector of input data from slinec and slinep */ - sline = Q6_V_valign_VVR(slinec, slinep, (size_t) input); - sline_tmp = sline; - /* Shift input range from [input_min, input_max] to [0, input_max - input_min] */ - input_shifted_v_qf32 = Q6_Vqf32_vsub_VsfVsf(sline, input_min_v_f); - - /* - * Scale shifted input range from [0, input_max - input_min] to [0,16.0) - * in order to get corresponding coefficient indexes - */ - input_scaled_v_qf32 = Q6_Vqf32_vmpy_Vqf32Vqf32(input_shifted_v_qf32, scale_v); - - /* - * VLUT 16 requires integer indexes. 
Shift scaled input range from [0,16.0) - * to [16.0,32.0) in order to convert float indexes to integer values. - * Float values, represented in IEEE 754, in range [16.0,32.0] have the - * same exponent, which means 4 MSB of mantissa carry information about - * integer index. - */ - input_scaled_v_qf32 = Q6_Vqf32_vadd_Vqf32Vsf(input_scaled_v_qf32, const16_0_v_sf); - - /* Convert back from qf32 to sf in order to extract integer index */ - tmp_v = Q6_Vsf_equals_Vqf32(input_scaled_v_qf32); - - /* Only 4 MSB bits of mantissa represent segment index */ - idx1_v = Q6_Vuw_vlsr_VuwR(tmp_v, 19); - - idx1_v = Q6_V_vand_VV(idx1_v, mask_idx1_v); - idx1_v = Q6_V_vor_VV(idx1_v, mask_idx2_v); - idx2_v = Q6_Vw_vasl_VwR(idx1_v, 16); - - /* Obtain the polynomial coefficients from lookup table */ - c0_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c0_coeff_dv.VV), 1); - c0_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c0_coeff_vp, idx2_v, Q6_V_hi_W(c0_coeff_dv.VV), 1); - c1_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c1_coeff_dv.VV), 1); - c1_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c1_coeff_vp, idx2_v, Q6_V_hi_W(c1_coeff_dv.VV), 1); - c2_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c2_coeff_dv.VV), 1); - c2_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c2_coeff_vp, idx2_v, Q6_V_hi_W(c2_coeff_dv.VV), 1); - c3_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c3_coeff_dv.VV), 1); - c3_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c3_coeff_vp, idx2_v, Q6_V_hi_W(c3_coeff_dv.VV), 1); - c4_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c4_coeff_dv.VV), 1); - c4_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c4_coeff_vp, idx2_v, Q6_V_hi_W(c4_coeff_dv.VV), 1); - c5_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c5_coeff_dv.VV), 1); - c5_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c5_coeff_vp, idx2_v, Q6_V_hi_W(c5_coeff_dv.VV), 1); - c6_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c6_coeff_dv.VV), 1); - c6_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c6_coeff_vp, idx2_v, Q6_V_hi_W(c6_coeff_dv.VV), 1); - c7_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c7_coeff_dv.VV), 1); - c7_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c7_coeff_vp, idx2_v, Q6_V_hi_W(c7_coeff_dv.VV), 1); - - /* Convert input from sf vector to qf32 vector for Horner's method*/ - input_v_qf32 = Q6_Vqf32_vadd_VsfVsf(sline, zero_v_sf); - - /* Perform evaluation of polynomial using Horner's method */ - output_v = Q6_Vqf32_vmpy_Vqf32Vqf32(Q6_V_lo_W(c7_coeff_vp), input_v_qf32); - output_v = Q6_Vqf32_vadd_Vqf32Vqf32(output_v, Q6_V_lo_W(c6_coeff_vp)); - output_v = Q6_Vqf32_vmpy_Vqf32Vqf32(output_v, input_v_qf32); - output_v = Q6_Vqf32_vadd_Vqf32Vqf32(output_v, Q6_V_lo_W(c5_coeff_vp)); - output_v = Q6_Vqf32_vmpy_Vqf32Vqf32(output_v, input_v_qf32); - output_v = Q6_Vqf32_vadd_Vqf32Vqf32(output_v, Q6_V_lo_W(c4_coeff_vp)); - output_v = Q6_Vqf32_vmpy_Vqf32Vqf32(output_v, input_v_qf32); - output_v = Q6_Vqf32_vadd_Vqf32Vqf32(output_v, Q6_V_lo_W(c3_coeff_vp)); - output_v = Q6_Vqf32_vmpy_Vqf32Vqf32(output_v, input_v_qf32); - output_v = Q6_Vqf32_vadd_Vqf32Vqf32(output_v, Q6_V_lo_W(c2_coeff_vp)); - output_v = Q6_Vqf32_vmpy_Vqf32Vqf32(output_v, input_v_qf32); - output_v = Q6_Vqf32_vadd_Vqf32Vqf32(output_v, Q6_V_lo_W(c1_coeff_vp)); - output_v = Q6_Vqf32_vmpy_Vqf32Vqf32(output_v, input_v_qf32); - output_v = Q6_Vqf32_vadd_Vqf32Vqf32(output_v, Q6_V_lo_W(c0_coeff_vp)); - - // /* Store results to the output buffer and convert from qf32 to sf */ - // *((HVX_UVector *)(output_v_ptr++)) = Q6_Vsf_equals_Vqf32(output_v); - - - /* Convert from qf32 to sf, store output and go to handle leftover */ - HVX_Vector output_v_f32 = Q6_Vsf_equals_Vqf32(output_v); - 
HVX_VectorPred pred_cap_left = Q6_Q_vcmp_gt_VsfVsf(input_min_v_f, sline_tmp); // 1 if input_min_v_f > sline_tmp - output_v_f32 = Q6_V_vmux_QVV(pred_cap_left, zero_vec, output_v_f32); // if sline_tmp> input_min_v_f, set to zero - - HVX_VectorPred pred_cap_right = Q6_Q_vcmp_gt_VsfVsf(sline_tmp, input_max_v_f); // 1 if sline_tmp > input_max_v_f - output_v_f32 = Q6_V_vmux_QVV(pred_cap_right, sline_tmp, output_v_f32); // if sline_tmp> input_max_v_f, set to whatever the sline_tmp was - - *((HVX_UVector *)(output_v_ptr++)) = output_v_f32; - - - /* Prepare slinep for next iteration */ - slinep = slinec; - } - } - - /* Handle last whole vector from input data */ - if (vectors_in_rounddown > 0) - { - slinec = is_aligned(input_v_ptr, 128) && leftover == 0 ? slinep : *input_v_ptr++; - sline = Q6_V_valign_VVR(slinec, slinep, (size_t) input); - sline_tmp = sline; - - /* Shift input range from [input_min, input_max] to [0, input_max - input_min] */ - input_shifted_v_qf32 = Q6_Vqf32_vsub_VsfVsf(sline, input_min_v_f); - - /* Scale shifted input range from [0, input_max - input_min] to [0,16.0) */ - input_scaled_v_qf32 = Q6_Vqf32_vmpy_Vqf32Vqf32(input_shifted_v_qf32, scale_v); - - /* - * VLUT 16 requires integer indexes. Shift scaled input range from [0,16.0) - * to [16.0,32.0) in order to convert float indexes to integer values. - * Float values, represented in IEEE 754, in range [16.0,32.0] have the - * same exponent, which means 4 MSB of mantissa carry information about - * integer index. - */ - input_scaled_v_qf32 = Q6_Vqf32_vadd_Vqf32Vsf(input_scaled_v_qf32, const16_0_v_sf); - - /* Convert back from qf32 to sf in order to extract integer index */ - tmp_v = Q6_Vsf_equals_Vqf32(input_scaled_v_qf32); - - /* Only 4 MSB bits of mantissa represent segment index */ - idx1_v = Q6_Vuw_vlsr_VuwR(tmp_v, 19); - - /* Ensure only 4 MSB bits of mantissa are used as indexes */ - idx1_v = Q6_V_vand_VV(idx1_v, mask_idx1_v); - idx1_v = Q6_V_vor_VV(idx1_v, mask_idx2_v); - idx2_v = Q6_Vw_vasl_VwR(idx1_v, 16); - - /* Obtain the polynomial coefficients from lookup table */ - c0_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c0_coeff_dv.VV), 1); - c0_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c0_coeff_vp, idx2_v, Q6_V_hi_W(c0_coeff_dv.VV), 1); - c1_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c1_coeff_dv.VV), 1); - c1_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c1_coeff_vp, idx2_v, Q6_V_hi_W(c1_coeff_dv.VV), 1); - c2_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c2_coeff_dv.VV), 1); - c2_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c2_coeff_vp, idx2_v, Q6_V_hi_W(c2_coeff_dv.VV), 1); - c3_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c3_coeff_dv.VV), 1); - c3_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c3_coeff_vp, idx2_v, Q6_V_hi_W(c3_coeff_dv.VV), 1); - c4_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c4_coeff_dv.VV), 1); - c4_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c4_coeff_vp, idx2_v, Q6_V_hi_W(c4_coeff_dv.VV), 1); - c5_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c5_coeff_dv.VV), 1); - c5_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c5_coeff_vp, idx2_v, Q6_V_hi_W(c5_coeff_dv.VV), 1); - c6_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c6_coeff_dv.VV), 1); - c6_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c6_coeff_vp, idx2_v, Q6_V_hi_W(c6_coeff_dv.VV), 1); - c7_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c7_coeff_dv.VV), 1); - c7_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c7_coeff_vp, idx2_v, Q6_V_hi_W(c7_coeff_dv.VV), 1); - - /* Convert input from sf vector to qf32 vector for Horner's method*/ - input_v_qf32 = Q6_Vqf32_vadd_VsfVsf(sline, zero_v_sf); - - /* Perform evaluation of 
polynomial using Horner's method */ - output_v = Q6_Vqf32_vmpy_Vqf32Vqf32(Q6_V_lo_W(c7_coeff_vp), input_v_qf32); - output_v = Q6_Vqf32_vadd_Vqf32Vqf32(output_v, Q6_V_lo_W(c6_coeff_vp)); - output_v = Q6_Vqf32_vmpy_Vqf32Vqf32(output_v, input_v_qf32); - output_v = Q6_Vqf32_vadd_Vqf32Vqf32(output_v, Q6_V_lo_W(c5_coeff_vp)); - output_v = Q6_Vqf32_vmpy_Vqf32Vqf32(output_v, input_v_qf32); - output_v = Q6_Vqf32_vadd_Vqf32Vqf32(output_v, Q6_V_lo_W(c4_coeff_vp)); - output_v = Q6_Vqf32_vmpy_Vqf32Vqf32(output_v, input_v_qf32); - output_v = Q6_Vqf32_vadd_Vqf32Vqf32(output_v, Q6_V_lo_W(c3_coeff_vp)); - output_v = Q6_Vqf32_vmpy_Vqf32Vqf32(output_v, input_v_qf32); - output_v = Q6_Vqf32_vadd_Vqf32Vqf32(output_v, Q6_V_lo_W(c2_coeff_vp)); - output_v = Q6_Vqf32_vmpy_Vqf32Vqf32(output_v, input_v_qf32); - output_v = Q6_Vqf32_vadd_Vqf32Vqf32(output_v, Q6_V_lo_W(c1_coeff_vp)); - output_v = Q6_Vqf32_vmpy_Vqf32Vqf32(output_v, input_v_qf32); - output_v = Q6_Vqf32_vadd_Vqf32Vqf32(output_v, Q6_V_lo_W(c0_coeff_vp)); - - /* Convert from qf32 to sf, store output and go to handle leftover */ - HVX_Vector output_v_f32 = Q6_Vsf_equals_Vqf32(output_v); - HVX_VectorPred pred_cap_left = Q6_Q_vcmp_gt_VsfVsf(input_min_v_f, sline_tmp); // 1 if input_min_v_f > sline_tmp - output_v_f32 = Q6_V_vmux_QVV(pred_cap_left, zero_vec, output_v_f32); // if sline_tmp> input_min_v_f, set to zero - - HVX_VectorPred pred_cap_right = Q6_Q_vcmp_gt_VsfVsf(sline_tmp, input_max_v_f); // 1 if sline_tmp > input_max_v_f - output_v_f32 = Q6_V_vmux_QVV(pred_cap_right, sline_tmp, output_v_f32); // if sline_tmp> input_max_v_f, set to whatever the sline_tmp was - - *((HVX_UVector *)(output_v_ptr++)) = output_v_f32; - - slinep = slinec; - } - - /* Handle leftover elements */ - if (leftover > 0) - { - slinec = (is_in_one_chunk(input_v_ptr, leftover_size, 128) - ? slinep - : *input_v_ptr++); - - sline = Q6_V_valign_VVR(slinec, slinep, (size_t) input); - sline_tmp = sline; - - /* Shift input range from [input_min, input_max] to [0, input_max - input_min] */ - input_shifted_v_qf32 = Q6_Vqf32_vsub_VsfVsf(sline, input_min_v_f); - - /* Scale shifted input range from [0, input_max - input_min] to [0,16.0) */ - input_scaled_v_qf32 = Q6_Vqf32_vmpy_Vqf32Vqf32(input_shifted_v_qf32, scale_v); - - /* - * VLUT 16 requires integer indexes. Shift scaled input range from [0,16.0) - * to [16.0,32.0) in order to convert float indexes to integer values. - * Float values, represented in IEEE 754, in range [16.0,32.0] have the - * same exponent, which means 4 MSB of mantissa carry information about - * integer index. 
- */ - input_scaled_v_qf32 = Q6_Vqf32_vadd_Vqf32Vsf(input_scaled_v_qf32, const16_0_v_sf); - - /* Convert back from qf32 to sf in order to extract integer index */ - tmp_v = Q6_Vsf_equals_Vqf32(input_scaled_v_qf32); - - /* Only 4 MSB bits of mantissa represent segment index */ - idx1_v = Q6_Vuw_vlsr_VuwR(tmp_v, 19); - - /* Ensure only 4 MSB bits of mantissa are used as indexes */ - idx1_v = Q6_V_vand_VV(idx1_v, mask_idx1_v); - idx1_v = Q6_V_vor_VV(idx1_v, mask_idx2_v); - idx2_v = Q6_Vw_vasl_VwR(idx1_v, 16); - - /* Obtain the polynomial coefficients from lookup table */ - c0_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c0_coeff_dv.VV), 1); - c0_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c0_coeff_vp, idx2_v, Q6_V_hi_W(c0_coeff_dv.VV), 1); - c1_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c1_coeff_dv.VV), 1); - c1_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c1_coeff_vp, idx2_v, Q6_V_hi_W(c1_coeff_dv.VV), 1); - c2_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c2_coeff_dv.VV), 1); - c2_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c2_coeff_vp, idx2_v, Q6_V_hi_W(c2_coeff_dv.VV), 1); - c3_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c3_coeff_dv.VV), 1); - c3_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c3_coeff_vp, idx2_v, Q6_V_hi_W(c3_coeff_dv.VV), 1); - c4_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c4_coeff_dv.VV), 1); - c4_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c4_coeff_vp, idx2_v, Q6_V_hi_W(c4_coeff_dv.VV), 1); - c5_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c5_coeff_dv.VV), 1); - c5_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c5_coeff_vp, idx2_v, Q6_V_hi_W(c5_coeff_dv.VV), 1); - c6_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c6_coeff_dv.VV), 1); - c6_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c6_coeff_vp, idx2_v, Q6_V_hi_W(c6_coeff_dv.VV), 1); - c7_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c7_coeff_dv.VV), 1); - c7_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c7_coeff_vp, idx2_v, Q6_V_hi_W(c7_coeff_dv.VV), 1); - - /* Convert input from sf vector to qf32 vector for Horner's method*/ - input_v_qf32 = Q6_Vqf32_vadd_VsfVsf(sline, zero_v_sf); - - /* Perform evaluation of polynomial using Horner's method */ - output_v = Q6_Vqf32_vmpy_Vqf32Vqf32(Q6_V_lo_W(c7_coeff_vp), input_v_qf32); - output_v = Q6_Vqf32_vadd_Vqf32Vqf32(output_v, Q6_V_lo_W(c6_coeff_vp)); - output_v = Q6_Vqf32_vmpy_Vqf32Vqf32(output_v, input_v_qf32); - output_v = Q6_Vqf32_vadd_Vqf32Vqf32(output_v, Q6_V_lo_W(c5_coeff_vp)); - output_v = Q6_Vqf32_vmpy_Vqf32Vqf32(output_v, input_v_qf32); - output_v = Q6_Vqf32_vadd_Vqf32Vqf32(output_v, Q6_V_lo_W(c4_coeff_vp)); - output_v = Q6_Vqf32_vmpy_Vqf32Vqf32(output_v, input_v_qf32); - output_v = Q6_Vqf32_vadd_Vqf32Vqf32(output_v, Q6_V_lo_W(c3_coeff_vp)); - output_v = Q6_Vqf32_vmpy_Vqf32Vqf32(output_v, input_v_qf32); - output_v = Q6_Vqf32_vadd_Vqf32Vqf32(output_v, Q6_V_lo_W(c2_coeff_vp)); - output_v = Q6_Vqf32_vmpy_Vqf32Vqf32(output_v, input_v_qf32); - output_v = Q6_Vqf32_vadd_Vqf32Vqf32(output_v, Q6_V_lo_W(c1_coeff_vp)); - output_v = Q6_Vqf32_vmpy_Vqf32Vqf32(output_v, input_v_qf32); - output_v = Q6_Vqf32_vadd_Vqf32Vqf32(output_v, Q6_V_lo_W(c0_coeff_vp)); - - /* Convert from qf32 to sf */ - // sout = Q6_Vsf_equals_Vqf32(output_v); - HVX_Vector output_v_f32 = Q6_Vsf_equals_Vqf32(output_v); - HVX_VectorPred pred_cap_left = Q6_Q_vcmp_gt_VsfVsf(input_min_v_f, sline_tmp); // 1 if input_min_v_f > sline_tmp - output_v_f32 = Q6_V_vmux_QVV(pred_cap_left, zero_vec, output_v_f32); // if sline_tmp> input_min_v_f, set to zero - - - HVX_VectorPred pred_cap_right = Q6_Q_vcmp_gt_VsfVsf(sline_tmp, input_max_v_f); // 1 if sline_tmp > input_max_v_f - output_v_f32 
= Q6_V_vmux_QVV(pred_cap_right, sline_tmp, output_v_f32); // if sline_tmp> input_max_v_f, set to whatever the sline_tmp was - - sout = output_v_f32; - /* Store output */ - vstu_variable(output_v_ptr, leftover_size, sout); - } - - return 0; -} - -#endif /* __HVX_ARCH__ >= 68 */ diff --git a/ggml/src/ggml-hexagon/htp/qhcg_approximation.h b/ggml/src/ggml-hexagon/htp/qhcg_approximation.h deleted file mode 100644 index 6f70e209ff..0000000000 --- a/ggml/src/ggml-hexagon/htp/qhcg_approximation.h +++ /dev/null @@ -1,21 +0,0 @@ -/**============================================================================= -@file - qhcg_approximation.h - -@brief - Header file of polynomial approximation generated by QHCG - -Copyright (c) 2020 Qualcomm Technologies Incorporated. -All Rights Reserved. Qualcomm Proprietary and Confidential. -=============================================================================**/ - -#ifndef __qhcg_approximation__ -#define __qhcg_approximation__ - -#include - -int32_t qhcg_approximation(float *inputs, float *outputs, uint32_t length, - float limit_left, float limit_right -); - -#endif /* __qhcg_approximation__ */ diff --git a/ggml/src/ggml-hexagon/htp/qhcg_internal.h b/ggml/src/ggml-hexagon/htp/qhcg_internal.h deleted file mode 100644 index 618610dc88..0000000000 --- a/ggml/src/ggml-hexagon/htp/qhcg_internal.h +++ /dev/null @@ -1,91 +0,0 @@ -/**============================================================================= -@file - hvx_internal.h - -@brief - Header file for HVX routines. - -Copyright (c) 2020 Qualcomm Technologies Incorporated. -All Rights Reserved. Qualcomm Proprietary and Confidential. -=============================================================================**/ - -#ifndef _HVX_INTERNAL_H -#define _HVX_INTERNAL_H - -#include // size_t -#include - -#define HVX_INLINE_ALWAYS inline __attribute__((unused,always_inline)) - -#ifndef LOG2VLEN -#define LOG2VLEN 7 -#endif -#define VLEN (1<>1 // HVX vector - number of int16_t elements -#define VLEN_WORD (1<>2 // HVX vector - number of int32_t elements - -typedef union -{ - HVX_VectorPair VV; - struct - { - HVX_Vector lo; - HVX_Vector hi; - } V; -} HVX_DV; - -static HVX_INLINE_ALWAYS void l2fetch(const void *p, uint32_t stride, - uint32_t width, uint32_t height, - uint32_t dir) -{ - uint64_t control = HEXAGON_V64_CREATE_H(dir, stride, width, height); - __asm__ __volatile__ (" l2fetch(%0,%1) " : :"r"(p),"r"(control)); -} - -/* Return whether address is aligned. */ - -static HVX_INLINE_ALWAYS int32_t is_aligned(void *addr, uint32_t align) -{ - return ((size_t) addr & (align - 1)) == 0; -} - -/* Return whether 'n' elements from vector are in the one chunk of 'chunk_size'. */ - -static HVX_INLINE_ALWAYS int32_t is_in_one_chunk(void *addr, uint32_t n, - uint32_t chunk_size) -{ - uint32_t left_off = (size_t) addr & (chunk_size - 1); - uint32_t right_off = left_off + n; - return right_off <= chunk_size; -} - -/* - * This function stores the first n bytes from vector vin to address 'addr'. - * n must be in range 1..128 and addr may have any alignment. Does one or - * two masked stores. - */ - -static HVX_INLINE_ALWAYS void vstu_variable(void *addr, uint32_t n, - HVX_Vector vin) -{ - /* Rotate as needed. 
*/ - vin = Q6_V_vlalign_VVR(vin, vin, (size_t) addr); - - uint32_t left_off = (size_t) addr & 127; - uint32_t right_off = left_off + n; - - HVX_VectorPred ql_not = Q6_Q_vsetq_R((size_t) addr); - HVX_VectorPred qr = Q6_Q_vsetq2_R(right_off); - - if (right_off > 128) - { - Q6_vmem_QRIV(qr, (HVX_Vector*) addr + 1, vin); - /* all 1's */ - qr = Q6_Q_vcmp_eq_VbVb(vin, vin); - } - - ql_not = Q6_Q_or_QQn(ql_not, qr); - Q6_vmem_QnRIV(ql_not, (HVX_Vector*) addr, vin); -} - -#endif /* _HVX_INTERNAL_H */ From 84f2f23aa9f17e2fa826db969cd825d0ab192995 Mon Sep 17 00:00:00 2001 From: shouyud Date: Fri, 12 Dec 2025 10:11:23 -0500 Subject: [PATCH 06/14] debug: temporarily disable unnecessary log message for debug purpose --- ggml/src/ggml-hexagon/htp/hvx-utils.c | 42 +++++++++++++-------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/hvx-utils.c b/ggml/src/ggml-hexagon/htp/hvx-utils.c index e02b1d9099..d6e928c96f 100644 --- a/ggml/src/ggml-hexagon/htp/hvx-utils.c +++ b/ggml/src/ggml-hexagon/htp/hvx-utils.c @@ -40,13 +40,13 @@ void hvx_mul_f32(const uint8_t * restrict src0, int unaligned_loop = 0; if ((0 == htp_is_aligned((void *) src0, VLEN)) || (0 == htp_is_aligned((void *) src1, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) { - FARF(HIGH, "hvx_mul_f32: unaligned address in hvx op, possibly slower execution\n"); + //FARF(HIGH, "hvx_mul_f32: unaligned address in hvx op, possibly slower execution\n"); unaligned_addr = 1; } if ((1 == unaligned_addr) && (num_elems_whole != 0)) { unaligned_loop = 1; - FARF(HIGH, "hvx_mul_f32: unaligned loop in hvx op, possibly slower execution\n"); + //FARF(HIGH, "hvx_mul_f32: unaligned loop in hvx op, possibly slower execution\n"); } if (0 == unaligned_loop) { @@ -252,13 +252,13 @@ void hvx_add_f32(const uint8_t * restrict src0, int unaligned_loop = 0; if ((0 == htp_is_aligned((void *) src0, VLEN)) || (0 == htp_is_aligned((void *) src1, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) { - FARF(HIGH, "hvx_add_f32: unaligned address in hvx op, possibly slower execution\n"); + //FARF(HIGH, "hvx_add_f32: unaligned address in hvx op, possibly slower execution\n"); unaligned_addr = 1; } if ((1 == unaligned_addr) && (num_elems_whole != 0)) { unaligned_loop = 1; - FARF(HIGH, "hvx_add_f32: unaligned loop in hvx op, possibly slower execution\n"); + //FARF(HIGH, "hvx_add_f32: unaligned loop in hvx op, possibly slower execution\n"); } if (0 == unaligned_loop) { @@ -392,13 +392,13 @@ void hvx_add_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * int unaligned_addr = 0; int unaligned_loop = 0; if ((0 == htp_is_aligned((void *) src, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) { - FARF(HIGH, "hvx_add_scalar_f32: unaligned address in hvx op, possibly slower execution\n"); + //FARF(HIGH, "hvx_add_scalar_f32: unaligned address in hvx op, possibly slower execution\n"); unaligned_addr = 1; } if ((1 == unaligned_addr) && (num_elems_whole != 0)) { unaligned_loop = 1; - FARF(HIGH, "hvx_add_scalar_f32: unaligned loop in hvx op, possibly slower execution\n"); + //FARF(HIGH, "hvx_add_scalar_f32: unaligned loop in hvx op, possibly slower execution\n"); } static const float kInf = INFINITY; @@ -454,13 +454,13 @@ void hvx_mul_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * int unaligned_addr = 0; int unaligned_loop = 0; if ((0 == htp_is_aligned((void *) src, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) { - FARF(HIGH, "hvx_mul_scalar_f32: unaligned address in hvx op, possibly slower 
execution\n"); + //FARF(HIGH, "hvx_mul_scalar_f32: unaligned address in hvx op, possibly slower execution\n"); unaligned_addr = 1; } if ((1 == unaligned_addr) && (num_elems_whole != 0)) { unaligned_loop = 1; - FARF(HIGH, "hvx_mul_scalar_f32: unaligned loop in hvx op, possibly slower execution\n"); + //FARF(HIGH, "hvx_mul_scalar_f32: unaligned loop in hvx op, possibly slower execution\n"); } HVX_Vector val_vec = hvx_vec_splat_fp32(val); @@ -507,13 +507,13 @@ void hvx_sub_f32(const uint8_t * restrict src0, int unaligned_loop = 0; if ((0 == htp_is_aligned((void *) src0, VLEN)) || (0 == htp_is_aligned((void *) src1, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) { - FARF(HIGH, "hvx_sub_f32: unaligned address in hvx op, possibly slower execution\n"); + //FARF(HIGH, "hvx_sub_f32: unaligned address in hvx op, possibly slower execution\n"); unaligned_addr = 1; } if ((1 == unaligned_addr) && (num_elems_whole != 0)) { unaligned_loop = 1; - FARF(HIGH, "hvx_sub_f32: unaligned loop in hvx op, possibly slower execution\n"); + //FARF(HIGH, "hvx_sub_f32: unaligned loop in hvx op, possibly slower execution\n"); } if (0 == unaligned_loop) { @@ -647,13 +647,13 @@ void hvx_sub_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * int unaligned_addr = 0; int unaligned_loop = 0; if ((0 == htp_is_aligned((void *) src, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) { - FARF(HIGH, "hvx_sub_scalar_f32: unaligned address in hvx op, possibly slower execution\n"); + //FARF(HIGH, "hvx_sub_scalar_f32: unaligned address in hvx op, possibly slower execution\n"); unaligned_addr = 1; } if ((1 == unaligned_addr) && (num_elems_whole != 0)) { unaligned_loop = 1; - FARF(HIGH, "hvx_sub_scalar_f32: unaligned loop in hvx op, possibly slower execution\n"); + //FARF(HIGH, "hvx_sub_scalar_f32: unaligned loop in hvx op, possibly slower execution\n"); } HVX_Vector val_vec = hvx_vec_splat_fp32(val); @@ -694,7 +694,7 @@ float hvx_sum_of_squares_f32(const uint8_t * restrict src, const int num_elems) int num_elems_whole = num_elems - left_over; if (0 == htp_is_aligned((void *) src, VLEN)) { - FARF(HIGH, "hvx_sum_of_squares_f32: unaligned address in hvx op, possibly slower execution\n"); + //FARF(HIGH, "hvx_sum_of_squares_f32: unaligned address in hvx op, possibly slower execution\n"); } assert((1 == htp_is_aligned((void *) src, VLEN)) || (0 == num_elems_whole)); @@ -733,13 +733,13 @@ float hvx_self_sum_f32(const uint8_t * restrict src, const int num_elems) { int unaligned_addr = 0; int unaligned_loop = 0; if (0 == htp_is_aligned((void *) src, VLEN)) { - FARF(HIGH, "hvx_self_sum_f32: unaligned address in hvx op, possibly slower execution\n"); + //FARF(HIGH, "hvx_self_sum_f32: unaligned address in hvx op, possibly slower execution\n"); unaligned_addr = 1; } if ((1 == unaligned_addr) && (num_elems_whole != 0)) { unaligned_loop = 1; - FARF(HIGH, "hvx_self_sum_f32: unaligned loop in hvx op, possibly slower execution\n"); + //FARF(HIGH, "hvx_self_sum_f32: unaligned loop in hvx op, possibly slower execution\n"); } HVX_Vector sum_vec = Q6_V_vsplat_R(0x00000000); @@ -782,13 +782,13 @@ void hvx_scale_f32(const uint8_t * restrict src, uint8_t * restrict dst, const i int unaligned_addr = 0; int unaligned_loop = 0; if ((0 == htp_is_aligned((void *) src, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) { - FARF(HIGH, "hvx_scale_f32: unaligned address in hvx op, possibly slower execution\n"); + //FARF(HIGH, "hvx_scale_f32: unaligned address in hvx op, possibly slower execution\n"); unaligned_addr = 1; } if ((1 == 
unaligned_addr) && (num_elems_whole != 0)) { unaligned_loop = 1; - FARF(HIGH, "hvx_scale_f32: unaligned loop in hvx op, possibly slower execution\n"); + //FARF(HIGH, "hvx_scale_f32: unaligned loop in hvx op, possibly slower execution\n"); } HVX_Vector scale_vec = hvx_vec_splat_fp32(scale); @@ -831,13 +831,13 @@ float hvx_self_max_f32(const uint8_t * restrict src, const int num_elems) { int unaligned_addr = 0; int unaligned_loop = 0; if (0 == htp_is_aligned((void *) src, VLEN)) { - FARF(HIGH, "hvx_self_max_f32: unaligned address in hvx op, possibly slower execution\n"); + //FARF(HIGH, "hvx_self_max_f32: unaligned address in hvx op, possibly slower execution\n"); unaligned_addr = 1; } if ((1 == unaligned_addr) && (num_elems_whole != 0)) { unaligned_loop = 1; - FARF(HIGH, "hvx_self_max_f32: unaligned loop in hvx op, possibly slower execution\n"); + //FARF(HIGH, "hvx_self_max_f32: unaligned loop in hvx op, possibly slower execution\n"); } HVX_Vector vec_max = hvx_vec_splat_fp32(((const float *) src)[0]); @@ -877,7 +877,7 @@ void hvx_min_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * size_t num_elems_whole = num_elems - left_over; if ((0 == htp_is_aligned((void *) src, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) { - FARF(HIGH, "hvx_min_scalar_f32: unaligned address in hvx op, possibly slower execution\n"); + //FARF(HIGH, "hvx_min_scalar_f32: unaligned address in hvx op, possibly slower execution\n"); } assert((1 == htp_is_aligned((void *) src, VLEN)) || (0 == num_elems_whole)); @@ -916,7 +916,7 @@ void hvx_clamp_scalar_f32(const uint8_t * restrict src, size_t num_elems_whole = num_elems - left_over; if ((0 == htp_is_aligned((void *) src, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) { - FARF(HIGH, "hvx_clamp_scalar_f32: unaligned address in hvx op, possibly slower execution\n"); + //FARF(HIGH, "hvx_clamp_scalar_f32: unaligned address in hvx op, possibly slower execution\n"); } assert((1 == htp_is_aligned((void *) src, VLEN)) || (0 == num_elems_whole)); From fc2289dc96e2c2622922189b0e04bb30302c3aa1 Mon Sep 17 00:00:00 2001 From: shouyud Date: Fri, 12 Dec 2025 11:58:45 -0500 Subject: [PATCH 07/14] Feat: optimized unaligned sigmoid_f32 --- ggml/src/ggml-hexagon/htp/act-ops.c | 5 +- ggml/src/ggml-hexagon/htp/hvx-utils.h | 108 ++++++++++++++++++++++++++ 2 files changed, 109 insertions(+), 4 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/act-ops.c b/ggml/src/ggml-hexagon/htp/act-ops.c index 2db4a2a35b..5266567d37 100644 --- a/ggml/src/ggml-hexagon/htp/act-ops.c +++ b/ggml/src/ggml-hexagon/htp/act-ops.c @@ -317,10 +317,7 @@ static void unary_gelu_fp32_per_thread(const struct htp_tensor * src0, } else { hvx_mul_scalar_f32( (const uint8_t *) src0, (float)1.702, (uint8_t *) src0_spad_data, ne0); // sigmoid - hvx_exp_f32((const uint8_t *) src0_spad_data, src0_spad_data, ne0, true); - hvx_add_scalar_f32(src0_spad_data, 1.0, dst_spad_data, ne0); - hvx_inverse_f32(dst_spad_data, src0_spad_data, ne0); - + hvx_sigmoid_f32((const uint8_t *) src0_spad_data, (uint8_t *) src0_spad_data, ne0); hvx_mul_f32((const uint8_t *) src0, src0_spad_data, (uint8_t *) dst, ne0); } } diff --git a/ggml/src/ggml-hexagon/htp/hvx-utils.h b/ggml/src/ggml-hexagon/htp/hvx-utils.h index 80658105c5..6c713b40eb 100644 --- a/ggml/src/ggml-hexagon/htp/hvx-utils.h +++ b/ggml/src/ggml-hexagon/htp/hvx-utils.h @@ -265,12 +265,16 @@ static inline void hvx_bcast_fp32_a(uint8_t * restrict dst, float elem, uint32_t } } + +/* Return whether 'n' elements from vector are in the one chunk of 'chunk_size'.
*/ static __attribute__((always_inline)) int32_t is_in_one_chunk(void * addr, uint32_t n, uint32_t chunk_size) { uint32_t left_off = (size_t) addr & (chunk_size - 1); uint32_t right_off = left_off + n; return right_off <= chunk_size; } + + static void hvx_vec_dump_fp16_n(char * pref, HVX_Vector v, uint32_t n) { HVX_VectorAlias u = { .v = v }; @@ -994,6 +998,110 @@ static inline void hvx_fast_sigmoid_f32(const uint8_t * restrict src, uint8_t * } } + +static inline void hvx_sigmoid_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems){ + int step_of_1 = num_elems >> 5; // divby 32, because 32 float = 128 bytes per HVX vector + int leftover = num_elems - (step_of_1 * VLEN_FP32); + + // assert(remaining == 0);//TODO: handle remaining elements later + + + int32_t leftover_size = leftover * sizeof(float); + + static const float kMinExp = -87.f; // 0 + static const float kMaxExp = 87.f; // 1 + + const HVX_Vector one = hvx_vec_splat_fp32(1.f); + const HVX_Vector max_exp = hvx_vec_splat_fp32(kMaxExp); + const HVX_Vector min_exp = hvx_vec_splat_fp32(kMinExp); + + const float *input = (float *)src; + float *output = (float *)dst; + + HVX_Vector * input_v_ptr = (HVX_Vector *) input; + HVX_UVector * output_v_ptr = (HVX_UVector *) output; + + + HVX_Vector slinep; + HVX_Vector slinec; + HVX_Vector sline; + + + slinep = *input_v_ptr++; + #pragma unroll(4) + for(uint32_t i = step_of_1 -1; i> 0; i--){ + slinec = *input_v_ptr++; + sline = Q6_V_valign_VVR(slinec, slinep, (size_t) input); + *((HVX_UVector *)(output_v_ptr++)) = hvx_vec_fast_sigmoid_fp32_guard(sline, one, max_exp, min_exp); + /* Prepare slinep for next iteration */ + slinep = slinec; + } + + if(step_of_1> 0){ + + slinec = htp_is_aligned(input_v_ptr, 128) && leftover == 0 ? slinep : *input_v_ptr++; + sline = Q6_V_valign_VVR(slinec, slinep, (size_t) input); + *((HVX_UVector *)(output_v_ptr++)) = hvx_vec_fast_sigmoid_fp32_guard(sline, one, max_exp, min_exp);; + + slinep = slinec; + } + if(leftover> 0){ + slinec = (is_in_one_chunk(input_v_ptr, leftover_size, 128) + ? 
slinep + : *input_v_ptr++); + + sline = Q6_V_valign_VVR(slinec, slinep, (size_t) input); + + HVX_Vector sout = hvx_vec_fast_sigmoid_fp32_guard(sline, one, max_exp, min_exp); + /* Store output */ + hvx_vec_store_u(output_v_ptr, leftover_size, sout); + } + + +} + + + +// static inline void hvx_sigmoid_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems){ +// int step_of_1 = num_elems >> 5; // divby 32, because 32 float = 128 bytes per HVX vector +// int leftover = num_elems - (step_of_1 * VLEN_FP32); + +// // assert(remaining == 0);//TODO: handle remaining elements later + + +// int32_t leftover_size = leftover * sizeof(float); + +// static const float kMinExp = -87.f; // 0 +// static const float kMaxExp = 87.f; // 1 + +// const HVX_Vector one = hvx_vec_splat_fp32(1.f); +// const HVX_Vector max_exp = hvx_vec_splat_fp32(kMaxExp); +// const HVX_Vector min_exp = hvx_vec_splat_fp32(kMinExp); + +// const float *input = (float *)src; +// float *output = (float *)dst; + +// HVX_UVector * input_v_ptr = (HVX_UVector *) input; +// HVX_UVector * output_v_ptr = (HVX_UVector *) output; + +// // #pragma unroll(4) NOTE: this actual got slower +// for(uint32_t i = step_of_1; i> 0; i--){ +// *((HVX_UVector *)(output_v_ptr++)) = hvx_vec_fast_sigmoid_fp32_guard(*(input_v_ptr++), one, max_exp, min_exp); + +// } + + +// if(leftover> 0){ + + +// HVX_Vector sout = hvx_vec_fast_sigmoid_fp32_guard(*(input_v_ptr++), one, max_exp, min_exp); +// /* Store output */ +// hvx_vec_store_u(output_v_ptr, leftover_size, sout); +// } + + +// } + float hvx_sum_of_squares_f32(const uint8_t * restrict src, const int num_elems); void hvx_mul_f32(const uint8_t * restrict src0, const uint8_t * restrict src1, From 8bc299ddefc2a46bbc0a3800f2bed7ae390ffd49 Mon Sep 17 00:00:00 2001 From: shouyud Date: Fri, 12 Dec 2025 17:15:53 -0500 Subject: [PATCH 08/14] Feat: larger l2prefetch block --- ggml/src/ggml-hexagon/htp/act-ops.c | 42 +++++++++++++++++------------ 1 file changed, 25 insertions(+), 17 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/act-ops.c b/ggml/src/ggml-hexagon/htp/act-ops.c index 5266567d37..97823575c1 100644 --- a/ggml/src/ggml-hexagon/htp/act-ops.c +++ b/ggml/src/ggml-hexagon/htp/act-ops.c @@ -298,27 +298,35 @@ static void unary_gelu_fp32_per_thread(const struct htp_tensor * src0, uint8_t * restrict src0_spad_data = src0_spad->data + (ith * src0_row_size); uint8_t * restrict dst_spad_data = dst_spad->data + (ith * dst_row_size); - for (uint32_t ir = src0_start_row; ir < src0_end_row; ir++) { - const float * restrict src0 = (float *) (data_src0 + (ir * src0_row_size)); - float * restrict dst = (float *) (data_dst + (ir * dst_row_size)); - - if (ir + 1 < src0_end_row) { - htp_l2fetch(src0 + src0_row_size, 1, src0_row_size, src0_row_size); + const int BLOCK = 8; + for (uint32_t ir = src0_start_row; ir < src0_end_row; ir += BLOCK) { + const uint32_t block_end = MIN(ir + BLOCK, src0_end_row); + + // Prefetch next block + if (block_end < src0_end_row) { + const float * restrict prefetch_ptr = (float *) (data_src0 + (block_end * src0_row_size)); + htp_l2fetch(prefetch_ptr, 1, block_end * src0_row_size, src0_row_size); } + // Process rows in current block + for (uint32_t ib = ir; ib < block_end; ib++) { + const float * restrict src0 = (float *) (data_src0 + (ib * src0_row_size)); + float * restrict dst = (float *) (data_dst + (ib * dst_row_size)); - // gelu = 0.5 * x * (1.0 + tanh( sqrt(2/pi) * (x + 0.044715 * x^3) )) // gelu_tanh - // gelu = x * sigmoid(1.702 * x) // current implementation - if (1 
== opt_path) { - hvx_mul_scalar_f32( (const uint8_t *) src0, (float)1.702, (uint8_t *) src0_spad_data, ne0); - hvx_fast_sigmoid_f32((const uint8_t *) src0_spad_data, (uint8_t *) src0_spad_data, ne0); + // gelu = 0.5 * x * (1.0 + tanh( sqrt(2/pi) * (x + 0.044715 * x^3) )) // gelu_tanh + // gelu = x * sigmoid(1.702 * x) // current implementation + if (1 == opt_path) { + hvx_mul_scalar_f32( (const uint8_t *) src0, (float)1.702, (uint8_t *) src0_spad_data, ne0); + hvx_fast_sigmoid_f32((const uint8_t *) src0_spad_data, (uint8_t *) src0_spad_data, ne0); - hvx_mul_f32_opt((const uint8_t *) src0, src0_spad_data, (uint8_t *) dst, ne0); - } else { - hvx_mul_scalar_f32( (const uint8_t *) src0, (float)1.702, (uint8_t *) src0_spad_data, ne0); - // sigmoid - hvx_sigmoid_f32((const uint8_t *) src0_spad_data, (uint8_t *) src0_spad_data, ne0); - hvx_mul_f32((const uint8_t *) src0, src0_spad_data, (uint8_t *) dst, ne0); + hvx_mul_f32_opt((const uint8_t *) src0, src0_spad_data, (uint8_t *) dst, ne0); + } + else { + hvx_mul_scalar_f32( (const uint8_t *) src0, (float)1.702, (uint8_t *) src0_spad_data, ne0); + // sigmoid + hvx_sigmoid_f32((const uint8_t *) src0_spad_data, (uint8_t *) src0_spad_data, ne0); + hvx_mul_f32((const uint8_t *) src0, src0_spad_data, (uint8_t *) dst, ne0); + } } } From cbd4e93296f64314cab9cbbc1f587437ed262ba1 Mon Sep 17 00:00:00 2001 From: shouyud Date: Fri, 12 Dec 2025 17:17:43 -0500 Subject: [PATCH 09/14] feat: apply unaligned-load optimization on mul and mul_scalar --- ggml/src/ggml-hexagon/htp/hvx-utils.c | 152 ++++++++++++++++++++++++-- 1 file changed, 140 insertions(+), 12 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/hvx-utils.c b/ggml/src/ggml-hexagon/htp/hvx-utils.c index d6e928c96f..b0099991cd 100644 --- a/ggml/src/ggml-hexagon/htp/hvx-utils.c +++ b/ggml/src/ggml-hexagon/htp/hvx-utils.c @@ -49,6 +49,8 @@ void hvx_mul_f32(const uint8_t * restrict src0, //FARF(HIGH, "hvx_mul_f32: unaligned loop in hvx op, possibly slower execution\n"); } + + bool handled_leftover = false; if (0 == unaligned_loop) { HVX_Vector * restrict vec_in1 = (HVX_Vector *) src0; HVX_Vector * restrict vec_in2 = (HVX_Vector *) src1; @@ -60,18 +62,88 @@ void hvx_mul_f32(const uint8_t * restrict src0, *vec_out++ = Q6_Vsf_equals_Vqf32(v); } } else { + // #pragma unroll(4) + // for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { + // HVX_Vector in1 = *(HVX_UVector *) (src0 + i * SIZEOF_FP32); + // HVX_Vector in2 = *(HVX_UVector *) (src1 + i * SIZEOF_FP32); + + // HVX_Vector out = Q6_Vqf32_vmpy_VsfVsf(in1, in2); + + // *(HVX_UVector *) (dst + i * SIZEOF_FP32) = Q6_Vsf_equals_Vqf32(out); + // } + + int step_of_1 = num_elems_whole >> 5; // divby 32, because 32 float = 128 bytes per HVX vector + int leftover_size = left_over * sizeof(float); + + + HVX_Vector * restrict vec_in1 = (HVX_Vector *) src0; + HVX_Vector * restrict vec_in2 = (HVX_Vector *) src1; + HVX_UVector * restrict vec_out = (HVX_UVector *) dst; + + + HVX_Vector slinep; + HVX_Vector slinec; + HVX_Vector sline; + HVX_Vector sline2p; + HVX_Vector sline2c; + HVX_Vector sline2; + + slinep = *vec_in1++; + sline2p = *vec_in2++; #pragma unroll(4) - for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { - HVX_Vector in1 = *(HVX_UVector *) (src0 + i * SIZEOF_FP32); - HVX_Vector in2 = *(HVX_UVector *) (src1 + i * SIZEOF_FP32); + for(uint32_t i = step_of_1 -1; i> 0; i--){ + slinec = *vec_in1++; + sline2c = *vec_in2++; + sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src0); + sline2 = Q6_V_valign_VVR(sline2c, sline2p, (size_t) src1); + + *((HVX_UVector 
*)(vec_out++)) =Q6_Vsf_equals_Vqf32( Q6_Vqf32_vmpy_VsfVsf(sline, sline2)); + /* Prepare slinep for next iteration */ + slinep = slinec; + sline2p = sline2c; + } + if(step_of_1 > 1){ + slinec = htp_is_aligned(vec_in1, VLEN) && left_over == 0 ? slinep : *vec_in1++; + sline2c = htp_is_aligned(vec_in2, VLEN) && left_over == 0 ? sline2p : *vec_in2++; - HVX_Vector out = Q6_Vqf32_vmpy_VsfVsf(in1, in2); + sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src0); + sline2 = Q6_V_valign_VVR(sline2c, sline2p, (size_t) src1); + *((HVX_UVector *)(vec_out++)) =Q6_Vsf_equals_Vqf32( Q6_Vqf32_vmpy_VsfVsf(sline, sline2)); + /* Prepare slinep for next iteration */ + slinep = slinec; + sline2p = sline2c; + } + if(left_over > 0 ){ - *(HVX_UVector *) (dst + i * SIZEOF_FP32) = Q6_Vsf_equals_Vqf32(out); + slinec = (is_in_one_chunk(vec_in1, leftover_size, VLEN) + ? slinep + : *vec_in1++); + + sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src0); + sline2c = (is_in_one_chunk(vec_in2, leftover_size, VLEN) + ? sline2p + : *vec_in2++); + sline2 = Q6_V_valign_VVR(sline2c, sline2p, (size_t) src1); + + HVX_Vector out = Q6_Vqf32_vmpy_VsfVsf(sline, sline2); + hvx_vec_store_u(vec_out, leftover_size, Q6_Vsf_equals_Vqf32(out)); + handled_leftover = true; } } - if (left_over > 0) { + // if (left_over > 0 ) { + // const float * src0f = (const float *) src0 + num_elems_whole; + // const float * src1f = (const float *) src1 + num_elems_whole; + // float * dstf = (float *) dst + num_elems_whole; + + // HVX_Vector in1 = *(HVX_UVector *) src0f; + // HVX_Vector in2 = *(HVX_UVector *) src1f; + + // HVX_Vector out = Q6_Vqf32_vmpy_VsfVsf(in1, in2); + // hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, Q6_Vsf_equals_Vqf32(out)); + // } + + if (left_over > 0 && !handled_leftover) { const float * src0f = (const float *) src0 + num_elems_whole; const float * src1f = (const float *) src1 + num_elems_whole; float * dstf = (float *) dst + num_elems_whole; @@ -464,7 +536,7 @@ void hvx_mul_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * } HVX_Vector val_vec = hvx_vec_splat_fp32(val); - + bool handled_leftover = false; if (0 == unaligned_loop) { HVX_Vector * restrict vec_in1 = (HVX_Vector *) src; HVX_Vector * restrict vec_out = (HVX_Vector *) dst; @@ -475,17 +547,73 @@ void hvx_mul_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * *vec_out++ = Q6_Vsf_equals_Vqf32(v); } } else { + // #pragma unroll(4) + // for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { + // HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32); + + // HVX_Vector out = Q6_Vqf32_vmpy_VsfVsf(in, val_vec); + + // *(HVX_UVector *) (dst + i * SIZEOF_FP32) = Q6_Vsf_equals_Vqf32(out); + // } + + int step_of_1 = num_elems >> 5; // divby 32, because 32 float = 128 bytes per HVX vector + int leftover_size = left_over * sizeof(float); + + + + HVX_Vector * input_v_ptr = (HVX_Vector *) src; + HVX_UVector * output_v_ptr = (HVX_UVector *) dst; + + + HVX_Vector slinep; + HVX_Vector slinec; + HVX_Vector sline; + + slinep = *input_v_ptr++; + #pragma unroll(4) - for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { - HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32); + for(uint32_t i = step_of_1 - 1; i > 0; i--){ + slinec = *input_v_ptr++; + sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src); + *((HVX_UVector *)(output_v_ptr++)) = Q6_Vsf_equals_Vqf32( Q6_Vqf32_vmpy_VsfVsf(sline, val_vec)); + /* Prepare slinep for next iteration */ + slinep = slinec; + } - HVX_Vector out = Q6_Vqf32_vmpy_VsfVsf(in, val_vec); + if(step_of_1 > 0){ - 
*(HVX_UVector *) (dst + i * SIZEOF_FP32) = Q6_Vsf_equals_Vqf32(out); + slinec = htp_is_aligned(input_v_ptr, VLEN) && left_over == 0 ? slinep : *input_v_ptr++; + sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src); + *((HVX_UVector *)(output_v_ptr++)) = Q6_Vsf_equals_Vqf32( Q6_Vqf32_vmpy_VsfVsf(sline, val_vec)); + + slinep = slinec; + } + + if(leftover_size > 0){ + slinec = (is_in_one_chunk(input_v_ptr, leftover_size, VLEN) + ? slinep + : *input_v_ptr++); + + sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src); + + HVX_Vector sout = Q6_Vsf_equals_Vqf32( Q6_Vqf32_vmpy_VsfVsf(sline, val_vec)); + /* Store output */ + hvx_vec_store_u(output_v_ptr, leftover_size, sout); + handled_leftover = true; } } - if (left_over > 0) { + // if (left_over > 0 ) { + // const float * srcf = (const float *) src + num_elems_whole; + // float * dstf = (float *) dst + num_elems_whole; + + // HVX_Vector in = *(HVX_UVector *) srcf; + + // HVX_Vector out = Q6_Vqf32_vmpy_VsfVsf(in, val_vec); + // hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, Q6_Vsf_equals_Vqf32(out)); + // } + + if (left_over > 0 && !handled_leftover) { const float * srcf = (const float *) src + num_elems_whole; float * dstf = (float *) dst + num_elems_whole; From e51b6bf2b94a6f0addbe18f1b822efcf4ad4b498 Mon Sep 17 00:00:00 2001 From: shouyud Date: Tue, 16 Dec 2025 08:27:57 -0500 Subject: [PATCH 10/14] Revert "debug: temporarily disable unnecessary log message for debug purpose" This reverts commit 84f2f23aa9f17e2fa826db969cd825d0ab192995. --- ggml/src/ggml-hexagon/htp/hvx-utils.c | 42 +++++++++++++-------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/hvx-utils.c b/ggml/src/ggml-hexagon/htp/hvx-utils.c index b0099991cd..63c7c85427 100644 --- a/ggml/src/ggml-hexagon/htp/hvx-utils.c +++ b/ggml/src/ggml-hexagon/htp/hvx-utils.c @@ -40,13 +40,13 @@ void hvx_mul_f32(const uint8_t * restrict src0, int unaligned_loop = 0; if ((0 == htp_is_aligned((void *) src0, VLEN)) || (0 == htp_is_aligned((void *) src1, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) { - //FARF(HIGH, "hvx_mul_f32: unaligned address in hvx op, possibly slower execution\n"); + FARF(HIGH, "hvx_mul_f32: unaligned address in hvx op, possibly slower execution\n"); unaligned_addr = 1; } if ((1 == unaligned_addr) && (num_elems_whole != 0)) { unaligned_loop = 1; - //FARF(HIGH, "hvx_mul_f32: unaligned loop in hvx op, possibly slower execution\n"); + FARF(HIGH, "hvx_mul_f32: unaligned loop in hvx op, possibly slower execution\n"); } @@ -324,13 +324,13 @@ void hvx_add_f32(const uint8_t * restrict src0, int unaligned_loop = 0; if ((0 == htp_is_aligned((void *) src0, VLEN)) || (0 == htp_is_aligned((void *) src1, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) { - //FARF(HIGH, "hvx_add_f32: unaligned address in hvx op, possibly slower execution\n"); + FARF(HIGH, "hvx_add_f32: unaligned address in hvx op, possibly slower execution\n"); unaligned_addr = 1; } if ((1 == unaligned_addr) && (num_elems_whole != 0)) { unaligned_loop = 1; - //FARF(HIGH, "hvx_add_f32: unaligned loop in hvx op, possibly slower execution\n"); + FARF(HIGH, "hvx_add_f32: unaligned loop in hvx op, possibly slower execution\n"); } if (0 == unaligned_loop) { @@ -464,13 +464,13 @@ void hvx_add_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * int unaligned_addr = 0; int unaligned_loop = 0; if ((0 == htp_is_aligned((void *) src, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) { - //FARF(HIGH, "hvx_add_scalar_f32: unaligned address in hvx op, 
possibly slower execution\n"); + FARF(HIGH, "hvx_add_scalar_f32: unaligned address in hvx op, possibly slower execution\n"); unaligned_addr = 1; } if ((1 == unaligned_addr) && (num_elems_whole != 0)) { unaligned_loop = 1; - //FARF(HIGH, "hvx_add_scalar_f32: unaligned loop in hvx op, possibly slower execution\n"); + FARF(HIGH, "hvx_add_scalar_f32: unaligned loop in hvx op, possibly slower execution\n"); } static const float kInf = INFINITY; @@ -526,13 +526,13 @@ void hvx_mul_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * int unaligned_addr = 0; int unaligned_loop = 0; if ((0 == htp_is_aligned((void *) src, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) { - //FARF(HIGH, "hvx_mul_scalar_f32: unaligned address in hvx op, possibly slower execution\n"); + FARF(HIGH, "hvx_mul_scalar_f32: unaligned address in hvx op, possibly slower execution\n"); unaligned_addr = 1; } if ((1 == unaligned_addr) && (num_elems_whole != 0)) { unaligned_loop = 1; - //FARF(HIGH, "hvx_mul_scalar_f32: unaligned loop in hvx op, possibly slower execution\n"); + FARF(HIGH, "hvx_mul_scalar_f32: unaligned loop in hvx op, possibly slower execution\n"); } HVX_Vector val_vec = hvx_vec_splat_fp32(val); @@ -635,13 +635,13 @@ void hvx_sub_f32(const uint8_t * restrict src0, int unaligned_loop = 0; if ((0 == htp_is_aligned((void *) src0, VLEN)) || (0 == htp_is_aligned((void *) src1, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) { - //FARF(HIGH, "hvx_sub_f32: unaligned address in hvx op, possibly slower execution\n"); + FARF(HIGH, "hvx_sub_f32: unaligned address in hvx op, possibly slower execution\n"); unaligned_addr = 1; } if ((1 == unaligned_addr) && (num_elems_whole != 0)) { unaligned_loop = 1; - //FARF(HIGH, "hvx_sub_f32: unaligned loop in hvx op, possibly slower execution\n"); + FARF(HIGH, "hvx_sub_f32: unaligned loop in hvx op, possibly slower execution\n"); } if (0 == unaligned_loop) { @@ -775,13 +775,13 @@ void hvx_sub_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * int unaligned_addr = 0; int unaligned_loop = 0; if ((0 == htp_is_aligned((void *) src, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) { - //FARF(HIGH, "hvx_sub_scalar_f32: unaligned address in hvx op, possibly slower execution\n"); + FARF(HIGH, "hvx_sub_scalar_f32: unaligned address in hvx op, possibly slower execution\n"); unaligned_addr = 1; } if ((1 == unaligned_addr) && (num_elems_whole != 0)) { unaligned_loop = 1; - //FARF(HIGH, "hvx_sub_scalar_f32: unaligned loop in hvx op, possibly slower execution\n"); + FARF(HIGH, "hvx_sub_scalar_f32: unaligned loop in hvx op, possibly slower execution\n"); } HVX_Vector val_vec = hvx_vec_splat_fp32(val); @@ -822,7 +822,7 @@ float hvx_sum_of_squares_f32(const uint8_t * restrict src, const int num_elems) int num_elems_whole = num_elems - left_over; if (0 == htp_is_aligned((void *) src, VLEN)) { - //FARF(HIGH, "hvx_sum_of_squares_f32: unaligned address in hvx op, possibly slower execution\n"); + FARF(HIGH, "hvx_sum_of_squares_f32: unaligned address in hvx op, possibly slower execution\n"); } assert((1 == htp_is_aligned((void *) src, VLEN)) || (0 == num_elems_whole)); @@ -861,13 +861,13 @@ float hvx_self_sum_f32(const uint8_t * restrict src, const int num_elems) { int unaligned_addr = 0; int unaligned_loop = 0; if (0 == htp_is_aligned((void *) src, VLEN)) { - //FARF(HIGH, "hvx_self_sum_f32: unaligned address in hvx op, possibly slower execution\n"); + FARF(HIGH, "hvx_self_sum_f32: unaligned address in hvx op, possibly slower execution\n"); unaligned_addr = 1; } if 
((1 == unaligned_addr) && (num_elems_whole != 0)) { unaligned_loop = 1; - //FARF(HIGH, "hvx_self_sum_f32: unaligned loop in hvx op, possibly slower execution\n"); + FARF(HIGH, "hvx_self_sum_f32: unaligned loop in hvx op, possibly slower execution\n"); } HVX_Vector sum_vec = Q6_V_vsplat_R(0x00000000); @@ -910,13 +910,13 @@ void hvx_scale_f32(const uint8_t * restrict src, uint8_t * restrict dst, const i int unaligned_addr = 0; int unaligned_loop = 0; if ((0 == htp_is_aligned((void *) src, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) { - //FARF(HIGH, "hvx_scale_f32: unaligned address in hvx op, possibly slower execution\n"); + FARF(HIGH, "hvx_scale_f32: unaligned address in hvx op, possibly slower execution\n"); unaligned_addr = 1; } if ((1 == unaligned_addr) && (num_elems_whole != 0)) { unaligned_loop = 1; - //FARF(HIGH, "hvx_scale_f32: unaligned loop in hvx op, possibly slower execution\n"); + FARF(HIGH, "hvx_scale_f32: unaligned loop in hvx op, possibly slower execution\n"); } HVX_Vector scale_vec = hvx_vec_splat_fp32(scale); @@ -959,13 +959,13 @@ float hvx_self_max_f32(const uint8_t * restrict src, const int num_elems) { int unaligned_addr = 0; int unaligned_loop = 0; if (0 == htp_is_aligned((void *) src, VLEN)) { - //FARF(HIGH, "hvx_self_max_f32: unaligned address in hvx op, possibly slower execution\n"); + FARF(HIGH, "hvx_self_max_f32: unaligned address in hvx op, possibly slower execution\n"); unaligned_addr = 1; } if ((1 == unaligned_addr) && (num_elems_whole != 0)) { unaligned_loop = 1; - //FARF(HIGH, "hvx_self_max_f32: unaligned loop in hvx op, possibly slower execution\n"); + FARF(HIGH, "hvx_self_max_f32: unaligned loop in hvx op, possibly slower execution\n"); } HVX_Vector vec_max = hvx_vec_splat_fp32(((const float *) src)[0]); @@ -1005,7 +1005,7 @@ void hvx_min_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * size_t num_elems_whole = num_elems - left_over; if ((0 == htp_is_aligned((void *) src, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) { - //FARF(HIGH, "hvx_min_scalar_f32: unaligned address in hvx op, possibly slower execution\n"); + FARF(HIGH, "hvx_min_scalar_f32: unaligned address in hvx op, possibly slower execution\n"); } assert((1 == htp_is_aligned((void *) src, VLEN)) || (0 == num_elems_whole)); @@ -1044,7 +1044,7 @@ void hvx_clamp_scalar_f32(const uint8_t * restrict src, size_t num_elems_whole = num_elems - left_over; if ((0 == htp_is_aligned((void *) src, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) { - //FARF(HIGH, "hvx_clamp_scalar_f32: unaligned address in hvx op, possibly slower execution\n"); + FARF(HIGH, "hvx_clamp_scalar_f32: unaligned address in hvx op, possibly slower execution\n"); } assert((1 == htp_is_aligned((void *) src, VLEN)) || (0 == num_elems_whole)); From 05693357c8d60daa9694df44d1797b9e6256becd Mon Sep 17 00:00:00 2001 From: shouyud Date: Tue, 16 Dec 2025 08:31:16 -0500 Subject: [PATCH 11/14] refactor: cleanup commented unused code --- ggml/src/ggml-hexagon/htp/act-ops.c | 3 -- ggml/src/ggml-hexagon/htp/hvx-utils.c | 43 ------------------------- ggml/src/ggml-hexagon/htp/hvx-utils.h | 45 --------------------------- 3 files changed, 91 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/act-ops.c b/ggml/src/ggml-hexagon/htp/act-ops.c index 97823575c1..9d3e584a84 100644 --- a/ggml/src/ggml-hexagon/htp/act-ops.c +++ b/ggml/src/ggml-hexagon/htp/act-ops.c @@ -313,17 +313,14 @@ static void unary_gelu_fp32_per_thread(const struct htp_tensor * src0, const float * restrict src0 = (float *) (data_src0 + (ib * 
src0_row_size)); float * restrict dst = (float *) (data_dst + (ib * dst_row_size)); - // gelu = 0.5 * x * (1.0 + tanh( sqrt(2/pi) * (x + 0.044715 * x^3) )) // gelu_tanh // gelu = x * sigmoid(1.702 * x) // current implementation if (1 == opt_path) { hvx_mul_scalar_f32( (const uint8_t *) src0, (float)1.702, (uint8_t *) src0_spad_data, ne0); hvx_fast_sigmoid_f32((const uint8_t *) src0_spad_data, (uint8_t *) src0_spad_data, ne0); - hvx_mul_f32_opt((const uint8_t *) src0, src0_spad_data, (uint8_t *) dst, ne0); } else { hvx_mul_scalar_f32( (const uint8_t *) src0, (float)1.702, (uint8_t *) src0_spad_data, ne0); - // sigmoid hvx_sigmoid_f32((const uint8_t *) src0_spad_data, (uint8_t *) src0_spad_data, ne0); hvx_mul_f32((const uint8_t *) src0, src0_spad_data, (uint8_t *) dst, ne0); } diff --git a/ggml/src/ggml-hexagon/htp/hvx-utils.c b/ggml/src/ggml-hexagon/htp/hvx-utils.c index 63c7c85427..e7ee589f34 100644 --- a/ggml/src/ggml-hexagon/htp/hvx-utils.c +++ b/ggml/src/ggml-hexagon/htp/hvx-utils.c @@ -62,16 +62,6 @@ void hvx_mul_f32(const uint8_t * restrict src0, *vec_out++ = Q6_Vsf_equals_Vqf32(v); } } else { - // #pragma unroll(4) - // for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { - // HVX_Vector in1 = *(HVX_UVector *) (src0 + i * SIZEOF_FP32); - // HVX_Vector in2 = *(HVX_UVector *) (src1 + i * SIZEOF_FP32); - - // HVX_Vector out = Q6_Vqf32_vmpy_VsfVsf(in1, in2); - - // *(HVX_UVector *) (dst + i * SIZEOF_FP32) = Q6_Vsf_equals_Vqf32(out); - // } - int step_of_1 = num_elems_whole >> 5; // divby 32, because 32 float = 128 bytes per HVX vector int leftover_size = left_over * sizeof(float); @@ -98,7 +88,6 @@ void hvx_mul_f32(const uint8_t * restrict src0, sline2 = Q6_V_valign_VVR(sline2c, sline2p, (size_t) src1); *((HVX_UVector *)(vec_out++)) =Q6_Vsf_equals_Vqf32( Q6_Vqf32_vmpy_VsfVsf(sline, sline2)); - /* Prepare slinep for next iteration */ slinep = slinec; sline2p = sline2c; } @@ -109,7 +98,6 @@ void hvx_mul_f32(const uint8_t * restrict src0, sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src0); sline2 = Q6_V_valign_VVR(sline2c, sline2p, (size_t) src1); *((HVX_UVector *)(vec_out++)) =Q6_Vsf_equals_Vqf32( Q6_Vqf32_vmpy_VsfVsf(sline, sline2)); - /* Prepare slinep for next iteration */ slinep = slinec; sline2p = sline2c; } @@ -131,17 +119,6 @@ void hvx_mul_f32(const uint8_t * restrict src0, } } - // if (left_over > 0 ) { - // const float * src0f = (const float *) src0 + num_elems_whole; - // const float * src1f = (const float *) src1 + num_elems_whole; - // float * dstf = (float *) dst + num_elems_whole; - - // HVX_Vector in1 = *(HVX_UVector *) src0f; - // HVX_Vector in2 = *(HVX_UVector *) src1f; - - // HVX_Vector out = Q6_Vqf32_vmpy_VsfVsf(in1, in2); - // hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, Q6_Vsf_equals_Vqf32(out)); - // } if (left_over > 0 && !handled_leftover) { const float * src0f = (const float *) src0 + num_elems_whole; @@ -547,15 +524,6 @@ void hvx_mul_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * *vec_out++ = Q6_Vsf_equals_Vqf32(v); } } else { - // #pragma unroll(4) - // for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { - // HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32); - - // HVX_Vector out = Q6_Vqf32_vmpy_VsfVsf(in, val_vec); - - // *(HVX_UVector *) (dst + i * SIZEOF_FP32) = Q6_Vsf_equals_Vqf32(out); - // } - int step_of_1 = num_elems >> 5; // divby 32, because 32 float = 128 bytes per HVX vector int leftover_size = left_over * sizeof(float); @@ -597,22 +565,11 @@ void hvx_mul_scalar_f32(const uint8_t * restrict src, const 
float val, uint8_t * sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src); HVX_Vector sout = Q6_Vsf_equals_Vqf32( Q6_Vqf32_vmpy_VsfVsf(sline, val_vec)); - /* Store output */ hvx_vec_store_u(output_v_ptr, leftover_size, sout); handled_leftover = true; } } - // if (left_over > 0 ) { - // const float * srcf = (const float *) src + num_elems_whole; - // float * dstf = (float *) dst + num_elems_whole; - - // HVX_Vector in = *(HVX_UVector *) srcf; - - // HVX_Vector out = Q6_Vqf32_vmpy_VsfVsf(in, val_vec); - // hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, Q6_Vsf_equals_Vqf32(out)); - // } - if (left_over > 0 && !handled_leftover) { const float * srcf = (const float *) src + num_elems_whole; float * dstf = (float *) dst + num_elems_whole; diff --git a/ggml/src/ggml-hexagon/htp/hvx-utils.h b/ggml/src/ggml-hexagon/htp/hvx-utils.h index 6c713b40eb..0b24786391 100644 --- a/ggml/src/ggml-hexagon/htp/hvx-utils.h +++ b/ggml/src/ggml-hexagon/htp/hvx-utils.h @@ -1003,9 +1003,6 @@ static inline void hvx_sigmoid_f32(const uint8_t * restrict src, uint8_t * restr int step_of_1 = num_elems >> 5; // divby 32, because 32 float = 128 bytes per HVX vector int leftover = num_elems - (step_of_1 * VLEN_FP32); - // assert(remaining == 0);//TODO: handle remaining elements later - - int32_t leftover_size = leftover * sizeof(float); static const float kMinExp = -87.f; // 0 @@ -1053,7 +1050,6 @@ static inline void hvx_sigmoid_f32(const uint8_t * restrict src, uint8_t * restr sline = Q6_V_valign_VVR(slinec, slinep, (size_t) input); HVX_Vector sout = hvx_vec_fast_sigmoid_fp32_guard(sline, one, max_exp, min_exp); - /* Store output */ hvx_vec_store_u(output_v_ptr, leftover_size, sout); } @@ -1061,47 +1057,6 @@ static inline void hvx_sigmoid_f32(const uint8_t * restrict src, uint8_t * restr } - -// static inline void hvx_sigmoid_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems){ -// int step_of_1 = num_elems >> 5; // divby 32, because 32 float = 128 bytes per HVX vector -// int leftover = num_elems - (step_of_1 * VLEN_FP32); - -// // assert(remaining == 0);//TODO: handle remaining elements later - - -// int32_t leftover_size = leftover * sizeof(float); - -// static const float kMinExp = -87.f; // 0 -// static const float kMaxExp = 87.f; // 1 - -// const HVX_Vector one = hvx_vec_splat_fp32(1.f); -// const HVX_Vector max_exp = hvx_vec_splat_fp32(kMaxExp); -// const HVX_Vector min_exp = hvx_vec_splat_fp32(kMinExp); - -// const float *input = (float *)src; -// float *output = (float *)dst; - -// HVX_UVector * input_v_ptr = (HVX_UVector *) input; -// HVX_UVector * output_v_ptr = (HVX_UVector *) output; - -// // #pragma unroll(4) NOTE: this actual got slower -// for(uint32_t i = step_of_1; i> 0; i--){ -// *((HVX_UVector *)(output_v_ptr++)) = hvx_vec_fast_sigmoid_fp32_guard(*(input_v_ptr++), one, max_exp, min_exp); - -// } - - -// if(leftover> 0){ - - -// HVX_Vector sout = hvx_vec_fast_sigmoid_fp32_guard(*(input_v_ptr++), one, max_exp, min_exp); -// /* Store output */ -// hvx_vec_store_u(output_v_ptr, leftover_size, sout); -// } - - -// } - float hvx_sum_of_squares_f32(const uint8_t * restrict src, const int num_elems); void hvx_mul_f32(const uint8_t * restrict src0, const uint8_t * restrict src1, From 952877ec24732b12010c7fa7ed3fc8de4b74e718 Mon Sep 17 00:00:00 2001 From: shouyud Date: Tue, 16 Dec 2025 08:41:54 -0500 Subject: [PATCH 12/14] chore: reformat code with clang-formatter to pass cli test --- ggml/src/ggml-hexagon/ggml-hexagon.cpp | 207 ++++++++++++------------- 
ggml/src/ggml-hexagon/htp/act-ops.c | 12 +- ggml/src/ggml-hexagon/htp/htp-msg.h | 8 +- ggml/src/ggml-hexagon/htp/hvx-utils.c | 145 ++++++++--------- ggml/src/ggml-hexagon/htp/hvx-utils.h | 87 +++++------ ggml/src/ggml-hexagon/htp/main.c | 22 +-- 6 files changed, 221 insertions(+), 260 deletions(-) diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp index c45b292a52..781db7facf 100644 --- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -8,8 +8,8 @@ #include #include #include -#include #include +#include #ifdef _WIN32 # include @@ -53,10 +53,12 @@ static int opt_opmask = HTP_OPMASK_QUEUE | HTP_OPMASK_QUANTIZE | HTP_OPMASK_COMP static int opt_opsync = 0; // synchronous ops #define HEX_VERBOSE(...) \ - if (opt_verbose) GGML_LOG_DEBUG(__VA_ARGS__) + if (opt_verbose) \ + GGML_LOG_DEBUG(__VA_ARGS__) #define HEX_PROFILE(...) \ - if (opt_profile) GGML_LOG_INFO(__VA_ARGS__) + if (opt_profile) \ + GGML_LOG_INFO(__VA_ARGS__) static inline uint64_t hex_is_aligned(void * addr, uint32_t align) { return ((size_t) addr & (align - 1)) == 0; @@ -218,7 +220,7 @@ struct ggml_hexagon_session { void allocate(int dev_id) noexcept(false); void release() noexcept(true); - void enqueue(struct htp_general_req &req, struct dspqueue_buffer *bufs, uint32_t n_bufs, bool sync = false); + void enqueue(struct htp_general_req & req, struct dspqueue_buffer * bufs, uint32_t n_bufs, bool sync = false); void flush(); ggml_backend_buffer_type buffer_type; @@ -258,7 +260,10 @@ static inline void hex_print_op_info(const ggml_tensor * op, ggml_hexagon_sessio names, dims, types, strides, buffs, req_flags); } -void ggml_hexagon_session::enqueue(struct htp_general_req &req, struct dspqueue_buffer *bufs, uint32_t n_bufs, bool sync) { +void ggml_hexagon_session::enqueue(struct htp_general_req & req, + struct dspqueue_buffer * bufs, + uint32_t n_bufs, + bool sync) { // Bump pending flag (cleared in the session::flush once we get the responce) this->op_pending++; // atomic inc @@ -298,13 +303,13 @@ void ggml_hexagon_session::flush() { // Read response packet from queue int err = dspqueue_read(q, &flags, - HTP_MAX_PACKET_BUFFERS, // Maximum number of buffer references - &n_bufs, // Number of buffer references - bufs, // Buffer references - sizeof(rsp), // Max message length - &rsp_size, // Message length - (uint8_t *) &rsp, - 1000000); // Timeout + HTP_MAX_PACKET_BUFFERS, // Maximum number of buffer references + &n_bufs, // Number of buffer references + bufs, // Buffer references + sizeof(rsp), // Max message length + &rsp_size, // Message length + (uint8_t *) &rsp, + 1000000); // Timeout if (err == AEE_EEXPIRED) { // TODO: might need to bail out if the HTP is stuck on something @@ -354,8 +359,8 @@ struct ggml_backend_hexagon_buffer_context { int err = fastrpc_mmap(s->domain_id, this->fd, (void *) this->base, 0, this->size, FASTRPC_MAP_FD); if (err != 0) { - GGML_LOG_ERROR("ggml-hex: buffer mapping failed : domain_id %d size %zu fd %d error 0x%08x\n", - s->domain_id, this->size, this->fd, (unsigned) err); + GGML_LOG_ERROR("ggml-hex: buffer mapping failed : domain_id %d size %zu fd %d error 0x%08x\n", s->domain_id, + this->size, this->fd, (unsigned) err); return false; } @@ -386,10 +391,12 @@ struct ggml_backend_hexagon_buffer_context { size += 4 * 1024; // extra page for padding if (rpcmem_alloc2) { - this->base = (uint8_t *) rpcmem_alloc2(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS | RPCMEM_HEAP_NOREG, size); + this->base = + (uint8_t *) 
rpcmem_alloc2(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS | RPCMEM_HEAP_NOREG, size); } else { GGML_LOG_INFO("ggml-hex: %s rpcmem_alloc2 not found, falling back to rpcmem_alloc\n", sess->name.c_str()); - this->base = (uint8_t *) rpcmem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS | RPCMEM_HEAP_NOREG, size); + this->base = + (uint8_t *) rpcmem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS | RPCMEM_HEAP_NOREG, size); } if (!this->base) { @@ -453,7 +460,7 @@ static enum ggml_status ggml_backend_hexagon_buffer_init_tensor(ggml_backend_buf (int) ctx->repack); if (tensor->view_src != NULL && tensor->view_offs == 0) { - ; // nothing to do for the view + ; // nothing to do for the view } else { if (!ctx->mapped) { ctx->mmap(); @@ -702,8 +709,8 @@ static void repack_q4_0_q4x4x2(ggml_tensor * t, const void * data, size_t size) // Ensure we don't try to read more data than is available in the source buffer 'data' // or write more than the tensor can hold. - const size_t total_tensor_size = (size_t)nrows * row_size; - const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size; + const size_t total_tensor_size = (size_t) nrows * row_size; + const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size; // Calculate how many full rows and how many remaining bytes we need to process. const int64_t n_full_rows = n_bytes_to_copy / row_size; @@ -732,7 +739,7 @@ static void repack_q4_0_q4x4x2(ggml_tensor * t, const void * data, size_t size) // 2. Process the final, potentially partial, row if (n_rem_bytes > 0) { - const int64_t i = n_full_rows; + const int64_t i = n_full_rows; const uint8_t * src = (const uint8_t *) data + (i * row_size); uint8_t * dst = (uint8_t *) t->data + (i * row_size); @@ -762,8 +769,8 @@ static void repack_q4x4x2_q4_0(void * data, const ggml_tensor * t, size_t size) size_t row_size_rp = row_size * 2; // extra space for tmp pad (if any) // Ensure we don't try to copy more data than the tensor actually contains. - const size_t total_tensor_size = (size_t)nrows * row_size; - const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size; + const size_t total_tensor_size = (size_t) nrows * row_size; + const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size; // Calculate how many full rows and how many remaining bytes we need to process. const int64_t n_full_rows = n_bytes_to_copy / row_size; @@ -792,7 +799,7 @@ static void repack_q4x4x2_q4_0(void * data, const ggml_tensor * t, size_t size) // 2. Process the final, potentially partial, row if (n_rem_bytes > 0) { - const int64_t i = n_full_rows; + const int64_t i = n_full_rows; const uint8_t * src = (const uint8_t *) t->data + (i * row_size); uint8_t * dst = (uint8_t *) data + (i * row_size); @@ -1028,8 +1035,8 @@ static void repack_q8_0_q8x4x2(ggml_tensor * t, const void * data, size_t size) // Ensure we don't try to read more data than is available in the source buffer 'data' // or write more than the tensor can hold. - const size_t total_tensor_size = (size_t)nrows * row_size; - const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size; + const size_t total_tensor_size = (size_t) nrows * row_size; + const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size; // Calculate how many full rows and how many remaining bytes we need to process. 
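// A quick worked example of the split computed just below (illustrative numbers only,
// not values taken from this backend): with a hypothetical row_size of 100 bytes and
// n_bytes_to_copy of 250 bytes,
//     n_full_rows = 250 / 100 = 2    // whole rows repacked by the main loop
//     n_rem_bytes = 250 % 100 = 50   // tail handled by the partial-row path
// The same arithmetic as a standalone helper (hypothetical sketch, not part of this file):
//     static inline void split_rows(size_t nbytes, size_t row_size, int64_t * full, int64_t * rem) {
//         *full = (int64_t) (nbytes / row_size);   // complete rows
//         *rem  = (int64_t) (nbytes % row_size);   // leftover bytes of the final partial row
//     }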
const int64_t n_full_rows = n_bytes_to_copy / row_size; @@ -1058,7 +1065,7 @@ static void repack_q8_0_q8x4x2(ggml_tensor * t, const void * data, size_t size) // 2. Process the final, potentially partial, row if (n_rem_bytes > 0) { - const int64_t i = n_full_rows; + const int64_t i = n_full_rows; const uint8_t * src = (const uint8_t *) data + (i * row_size); uint8_t * dst = (uint8_t *) t->data + (i * row_size); @@ -1088,8 +1095,8 @@ static void repack_q8x4x2_q8_0(void * data, const ggml_tensor * t, size_t size) size_t row_size_rp = row_size * 2; // extra space for tmp pad (if any) // Ensure we don't try to copy more data than the tensor actually contains. - const size_t total_tensor_size = (size_t)nrows * row_size; - const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size; + const size_t total_tensor_size = (size_t) nrows * row_size; + const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size; // Calculate how many full rows and how many remaining bytes we need to process. const int64_t n_full_rows = n_bytes_to_copy / row_size; @@ -1118,7 +1125,7 @@ static void repack_q8x4x2_q8_0(void * data, const ggml_tensor * t, size_t size) // 2. Process the final, potentially partial, row if (n_rem_bytes > 0) { - const int64_t i = n_full_rows; + const int64_t i = n_full_rows; const uint8_t * src = (const uint8_t *) t->data + (i * row_size); uint8_t * dst = (uint8_t *) data + (i * row_size); @@ -1379,8 +1386,8 @@ static void repack_mxfp4_mxfp4x4x2(ggml_tensor * t, const void * data, size_t si // Ensure we don't try to read more data than is available in the source buffer 'data' // or write more than the tensor can hold. - const size_t total_tensor_size = (size_t)nrows * row_size; - const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size; + const size_t total_tensor_size = (size_t) nrows * row_size; + const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size; // Calculate how many full rows and how many remaining bytes we need to process. const int64_t n_full_rows = n_bytes_to_copy / row_size; @@ -1409,7 +1416,7 @@ static void repack_mxfp4_mxfp4x4x2(ggml_tensor * t, const void * data, size_t si // 2. Process the final, potentially partial, row if (n_rem_bytes > 0) { - const int64_t i = n_full_rows; + const int64_t i = n_full_rows; const uint8_t * src = (const uint8_t *) data + (i * row_size); uint8_t * dst = (uint8_t *) t->data + (i * row_size); @@ -1439,8 +1446,8 @@ static void repack_mxfp4x4x2_mxfp4(void * data, const ggml_tensor * t, size_t si size_t row_size_rp = row_size * 2; // extra space for tmp pad (if any) // Ensure we don't try to copy more data than the tensor actually contains. - const size_t total_tensor_size = (size_t)nrows * row_size; - const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size; + const size_t total_tensor_size = (size_t) nrows * row_size; + const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size; // Calculate how many full rows and how many remaining bytes we need to process. const int64_t n_full_rows = n_bytes_to_copy / row_size; @@ -1469,7 +1476,7 @@ static void repack_mxfp4x4x2_mxfp4(void * data, const ggml_tensor * t, size_t si // 2. 
Process the final, potentially partial, row if (n_rem_bytes > 0) { - const int64_t i = n_full_rows; + const int64_t i = n_full_rows; const uint8_t * src = (const uint8_t *) t->data + (i * row_size); uint8_t * dst = (uint8_t *) data + (i * row_size); @@ -1592,25 +1599,28 @@ static const char * ggml_backend_hexagon_buffer_type_name(ggml_backend_buffer_ty return static_cast(buffer_type->context)->name.c_str(); } -static ggml_backend_buffer_t ggml_backend_hexagon_buffer_type_alloc_buffer( - ggml_backend_buffer_type_t buffer_type, size_t size) { +static ggml_backend_buffer_t ggml_backend_hexagon_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buffer_type, + size_t size) { auto sess = static_cast(buffer_type->context)->sess; try { - ggml_backend_hexagon_buffer_context * ctx = new ggml_backend_hexagon_buffer_context(sess, size, false /*repack*/); + ggml_backend_hexagon_buffer_context * ctx = + new ggml_backend_hexagon_buffer_context(sess, size, false /*repack*/); return ggml_backend_buffer_init(buffer_type, ggml_backend_hexagon_buffer_interface, ctx, size); - } catch (std::exception const &exc) { + } catch (const std::exception & exc) { GGML_LOG_ERROR("ggml-hex: %s failed to allocate buffer context: %s\n", sess->name.c_str(), exc.what()); return nullptr; } } static ggml_backend_buffer_t ggml_backend_hexagon_repack_buffer_type_alloc_buffer( - ggml_backend_buffer_type_t buffer_type, size_t size) { + ggml_backend_buffer_type_t buffer_type, + size_t size) { auto sess = static_cast(buffer_type->context)->sess; try { - ggml_backend_hexagon_buffer_context * ctx = new ggml_backend_hexagon_buffer_context(sess, size, true /*repack*/); + ggml_backend_hexagon_buffer_context * ctx = + new ggml_backend_hexagon_buffer_context(sess, size, true /*repack*/); return ggml_backend_buffer_init(buffer_type, ggml_backend_hexagon_buffer_interface, ctx, size); - } catch (std::exception const &exc) { + } catch (const std::exception & exc) { GGML_LOG_ERROR("ggml-hex: %s failed to allocate buffer context: %s\n", sess->name.c_str(), exc.what()); return nullptr; } @@ -1621,7 +1631,8 @@ static size_t ggml_backend_hexagon_buffer_type_get_alignment(ggml_backend_buffer GGML_UNUSED(buffer_type); } -static size_t ggml_backend_hexagon_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const struct ggml_tensor * t) { +static size_t ggml_backend_hexagon_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, + const struct ggml_tensor * t) { return ggml_nbytes(t); } @@ -1697,8 +1708,8 @@ void ggml_hexagon_session::allocate(int dev_id) noexcept(false) { } // Save the IDs - this->session_id = n.session_id; - this->domain_id = n.effective_domain_id; + this->session_id = n.session_id; + this->domain_id = n.effective_domain_id; this->valid_session = true; } @@ -1707,16 +1718,17 @@ void ggml_hexagon_session::allocate(int dev_id) noexcept(false) { char session_uri[256]; { char htp_uri[256]; - snprintf(htp_uri, sizeof(htp_uri), "file:///libggml-htp-v%u.so?htp_iface_skel_handle_invoke&_modver=1.0", opt_arch); + snprintf(htp_uri, sizeof(htp_uri), "file:///libggml-htp-v%u.so?htp_iface_skel_handle_invoke&_modver=1.0", + opt_arch); struct remote_rpc_get_uri u = {}; - u.session_id = this->session_id; - u.domain_name = const_cast(CDSP_DOMAIN_NAME); - u.domain_name_len = strlen(CDSP_DOMAIN_NAME); - u.module_uri = const_cast(htp_uri); - u.module_uri_len = strlen(htp_uri); - u.uri = session_uri; - u.uri_len = sizeof(session_uri); + u.session_id = this->session_id; + u.domain_name = const_cast(CDSP_DOMAIN_NAME); + u.domain_name_len = 
strlen(CDSP_DOMAIN_NAME); + u.module_uri = const_cast(htp_uri); + u.module_uri_len = strlen(htp_uri); + u.uri = session_uri; + u.uri_len = sizeof(session_uri); int err = remote_session_control(FASTRPC_GET_URI, (void *) &u, sizeof(u)); if (err != AEE_SUCCESS) { @@ -1725,7 +1737,9 @@ void ggml_hexagon_session::allocate(int dev_id) noexcept(false) { snprintf(session_uri, htp_URI_domain_len, "%s%s", htp_uri, my_domain->uri); - GGML_LOG_WARN("ggml-hex: failed to get URI for session %d : error 0x%x. Falling back to single session URI: %s\n", dev_id, err, session_uri); + GGML_LOG_WARN( + "ggml-hex: failed to get URI for session %d : error 0x%x. Falling back to single session URI: %s\n", + dev_id, err, session_uri); } } @@ -1751,7 +1765,7 @@ void ggml_hexagon_session::allocate(int dev_id) noexcept(false) { this->valid_handle = true; GGML_LOG_INFO("ggml-hex: new session: %s : session-id %d domain-id %d uri %s handle 0x%lx\n", this->name.c_str(), - this->session_id, this->domain_id, session_uri, (unsigned long) this->handle); + this->session_id, this->domain_id, session_uri, (unsigned long) this->handle); // Enable FastRPC QoS mode { @@ -1841,8 +1855,8 @@ ggml_hexagon_session::ggml_hexagon_session(int dev_id, ggml_backend_dev_t dev) n buffer_type.context = nullptr; repack_buffer_type.context = nullptr; - buffer_type.device = dev; - repack_buffer_type.device = dev; + buffer_type.device = dev; + repack_buffer_type.device = dev; try { allocate(dev_id); @@ -1852,7 +1866,7 @@ ggml_hexagon_session::ggml_hexagon_session(int dev_id, ggml_backend_dev_t dev) n repack_buffer_type.iface = ggml_backend_hexagon_repack_buffer_type_interface; repack_buffer_type.context = new ggml_backend_hexagon_buffer_type_context(this->name + "-REPACK", this); - } catch (std::exception const &exc) { + } catch (const std::exception & exc) { release(); throw; } @@ -1861,8 +1875,8 @@ ggml_hexagon_session::ggml_hexagon_session(int dev_id, ggml_backend_dev_t dev) n ggml_hexagon_session::~ggml_hexagon_session() noexcept(true) { release(); - delete static_cast(buffer_type.context); - delete static_cast(repack_buffer_type.context); + delete static_cast(buffer_type.context); + delete static_cast(repack_buffer_type.context); } // ** backend interface @@ -2164,11 +2178,11 @@ static bool ggml_hexagon_supported_activations(const struct ggml_hexagon_session } // src0, src1 & dst must be mapped to the same session - if(src1){ + if (src1) { if (!hex_supported_buffer(sess, src0, src1, dst)) { return false; } - }else{ + } else { if (!hex_supported_buffer(sess, src0, dst)) { return false; } @@ -2306,11 +2320,11 @@ static size_t dspqueue_buffers_init(dspqueue_buffer * buf, const ggml_tensor * t memset(buf, 0, sizeof(*buf)); auto tensor_buf = static_cast(t->buffer->context); - buf->fd = tensor_buf->fd; - buf->ptr = t->data; - buf->offset = (uint8_t *) t->data - tensor_buf->base; - buf->size = ggml_nbytes(t); - buf->flags = (flush_host ? DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER : 0); // Flush CPU + buf->fd = tensor_buf->fd; + buf->ptr = t->data; + buf->offset = (uint8_t *) t->data - tensor_buf->base; + buf->size = ggml_nbytes(t); + buf->flags = (flush_host ? DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER : 0); // Flush CPU buf->flags |= (flush_htp ? 
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT : 0); // Invalidate DSP return 1; } @@ -2670,8 +2684,7 @@ static void ggml_hexagon_unary(const struct ggml_tensor * op, uint32_t flags) { if (ggml_get_unary_op(dst) == GGML_UNARY_OP_SILU) { req.op = HTP_OP_UNARY_SILU; supported = true; - } - else if (ggml_get_unary_op(dst) == GGML_UNARY_OP_GELU){ + } else if (ggml_get_unary_op(dst) == GGML_UNARY_OP_GELU) { req.op = HTP_OP_UNARY_GELU; supported = true; } @@ -2902,8 +2915,7 @@ static inline bool op_reuse_src1(const ggml_tensor * op1, const ggml_tensor * op return (op0 && op0->src[1] == op1->src[1]); } -static inline bool is_compute_op(ggml_tensor *node) -{ +static inline bool is_compute_op(ggml_tensor * node) { return !(ggml_op_is_empty(node->op) || ggml_is_empty(node)); } @@ -3013,29 +3025,17 @@ struct node_info { std::vector fused; - ggml_op op() const { - return node->op; - } + ggml_op op() const { return node->op; } - const ggml_tensor * dst() const { - return fused.empty() ? node : fused.back(); - } + const ggml_tensor * dst() const { return fused.empty() ? node : fused.back(); } - const ggml_tensor * src0() const { - return node->src[0]; - } + const ggml_tensor * src0() const { return node->src[0]; } - const ggml_tensor * src1() const { - return node->src[1]; - } + const ggml_tensor * src1() const { return node->src[1]; } - bool is_empty() const { - return ggml_op_is_empty(node->op); - } + bool is_empty() const { return ggml_op_is_empty(node->op); } - void add_fused(ggml_tensor * t) { - fused.push_back(t); - } + void add_fused(ggml_tensor * t) { fused.push_back(t); } bool stackable() const { switch (this->op()) { @@ -3047,9 +3047,7 @@ struct node_info { } } - bool same_input(const node_info& n) const { - return n.src1() == this->src1(); - } + bool same_input(const node_info & n) const { return n.src1() == this->src1(); } }; static std::vector ggml_hexagon_graph_optimize_reorder(const std::vector & nodes) { @@ -3114,25 +3112,21 @@ static void ggml_backend_hexagon_graph_optimize(ggml_backend_t backend, ggml_cgr // and perform the reorder over the fused nodes. 
after the reorder is done, we unfuse for (int i = 0; i < n; i++) { node_info node = { - /*.node =*/ gf->nodes[i], - /*.fused =*/ {}, + /*.node =*/gf->nodes[i], + /*.fused =*/{}, }; // fuse only ops that start with these operations // can be expanded when needed - if (node.op() == GGML_OP_ADD || - node.op() == GGML_OP_NORM || - node.op() == GGML_OP_RMS_NORM) { + if (node.op() == GGML_OP_ADD || node.op() == GGML_OP_NORM || node.op() == GGML_OP_RMS_NORM) { ops[0] = node.op(); int f = i + 1; while (f < n && f < i + MAX_FUSE) { // conservatively allow fusing only these ops // can be expanded when needed - if (gf->nodes[f]->op != GGML_OP_ADD && - gf->nodes[f]->op != GGML_OP_MUL && - gf->nodes[f]->op != GGML_OP_NORM && - gf->nodes[f]->op != GGML_OP_RMS_NORM) { + if (gf->nodes[f]->op != GGML_OP_ADD && gf->nodes[f]->op != GGML_OP_MUL && + gf->nodes[f]->op != GGML_OP_NORM && gf->nodes[f]->op != GGML_OP_RMS_NORM) { break; } ops[f - i] = gf->nodes[f]->op; @@ -3308,8 +3302,7 @@ static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, cons case GGML_OP_UNARY: if (ggml_get_unary_op(op) == GGML_UNARY_OP_SILU) { supp = ggml_hexagon_supported_activations(sess, op); - } - else if (ggml_get_unary_op(op) == GGML_UNARY_OP_GELU){ + } else if (ggml_get_unary_op(op) == GGML_UNARY_OP_GELU) { supp = ggml_hexagon_supported_activations(sess, op); } break; @@ -3416,7 +3409,7 @@ ggml_hexagon_registry::ggml_hexagon_registry(ggml_backend_reg_t reg) { } } - if(opt_arch < 75) { + if (opt_arch < 75) { opt_ndev = 1; GGML_LOG_WARN("ggml-hex: forcing ndev to 1 for SoCs archs lower than v75.\n"); } @@ -3425,11 +3418,11 @@ ggml_hexagon_registry::ggml_hexagon_registry(ggml_backend_reg_t reg) { // Create devices / sessions for (size_t i = 0; i < opt_ndev; i++) { - devices[i].iface = ggml_backend_hexagon_device_i; - devices[i].reg = reg; + devices[i].iface = ggml_backend_hexagon_device_i; + devices[i].reg = reg; try { devices[i].context = new ggml_hexagon_session(i, &devices[i]); - } catch (std::exception const &exc) { + } catch (const std::exception & exc) { GGML_LOG_ERROR("ggml-hex: failed to create device/session %zu\n", i); devices[i].context = nullptr; } diff --git a/ggml/src/ggml-hexagon/htp/act-ops.c b/ggml/src/ggml-hexagon/htp/act-ops.c index 9d3e584a84..273179ae2f 100644 --- a/ggml/src/ggml-hexagon/htp/act-ops.c +++ b/ggml/src/ggml-hexagon/htp/act-ops.c @@ -255,7 +255,6 @@ static void glu_swiglu_oai_fp32_per_thread(const struct htp_tensor * src0, src1->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1)); } - static void unary_gelu_fp32_per_thread(const struct htp_tensor * src0, struct htp_tensor * dst, const int32_t * op_params, @@ -301,7 +300,7 @@ static void unary_gelu_fp32_per_thread(const struct htp_tensor * src0, const int BLOCK = 8; for (uint32_t ir = src0_start_row; ir < src0_end_row; ir += BLOCK) { const uint32_t block_end = MIN(ir + BLOCK, src0_end_row); - + // Prefetch next block if (block_end < src0_end_row) { const float * restrict prefetch_ptr = (float *) (data_src0 + (block_end * src0_row_size)); @@ -315,12 +314,11 @@ static void unary_gelu_fp32_per_thread(const struct htp_tensor * src0, // gelu = x * sigmoid(1.702 * x) // current implementation if (1 == opt_path) { - hvx_mul_scalar_f32( (const uint8_t *) src0, (float)1.702, (uint8_t *) src0_spad_data, ne0); + hvx_mul_scalar_f32((const uint8_t *) src0, (float) 1.702, (uint8_t *) src0_spad_data, ne0); hvx_fast_sigmoid_f32((const uint8_t *) src0_spad_data, (uint8_t *) src0_spad_data, ne0); 
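// At this point src0_spad_data holds sigmoid(1.702 * x) for the current row; the multiply
// below forms x * sigmoid(1.702 * x), i.e. the sigmoid approximation of GELU.
// Scalar reference of the same approximation (a sketch for illustration only, not code
// used by this kernel):
//     static inline float gelu_sigmoid_ref(float x) {
//         return x / (1.0f + expf(-1.702f * x));   // == x * sigmoid(1.702f * x)
//     }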
hvx_mul_f32_opt((const uint8_t *) src0, src0_spad_data, (uint8_t *) dst, ne0); - } - else { - hvx_mul_scalar_f32( (const uint8_t *) src0, (float)1.702, (uint8_t *) src0_spad_data, ne0); + } else { + hvx_mul_scalar_f32((const uint8_t *) src0, (float) 1.702, (uint8_t *) src0_spad_data, ne0); hvx_sigmoid_f32((const uint8_t *) src0_spad_data, (uint8_t *) src0_spad_data, ne0); hvx_mul_f32((const uint8_t *) src0, src0_spad_data, (uint8_t *) dst, ne0); } @@ -339,8 +337,6 @@ static void unary_gelu_fp32(unsigned int n, unsigned int i, void * data) { octx->src0_nrows_per_thread); } - - static void unary_silu_fp32_per_thread(const struct htp_tensor * src0, struct htp_tensor * dst, const int32_t * op_params, diff --git a/ggml/src/ggml-hexagon/htp/htp-msg.h b/ggml/src/ggml-hexagon/htp/htp-msg.h index a61652304a..0e893c1d96 100644 --- a/ggml/src/ggml-hexagon/htp/htp-msg.h +++ b/ggml/src/ggml-hexagon/htp/htp-msg.h @@ -120,10 +120,10 @@ static const char * htp_type_name(uint32_t t) { #define HTP_MAX_DIMS 4 struct htp_tensor { - uint32_t data; // Buffer offset in the messages, and data pointer on the NSP - uint32_t type; // Data type - uint32_t ne[HTP_MAX_DIMS]; // Number of elements - uint32_t nb[HTP_MAX_DIMS]; // Stride in bytes (see ggml.h ggml_tensor) + uint32_t data; // Buffer offset in the messages, and data pointer on the NSP + uint32_t type; // Data type + uint32_t ne[HTP_MAX_DIMS]; // Number of elements + uint32_t nb[HTP_MAX_DIMS]; // Stride in bytes (see ggml.h ggml_tensor) }; #define HTP_MAX_OP_PARAMS 64 diff --git a/ggml/src/ggml-hexagon/htp/hvx-utils.c b/ggml/src/ggml-hexagon/htp/hvx-utils.c index e7ee589f34..2ac4cfb263 100644 --- a/ggml/src/ggml-hexagon/htp/hvx-utils.c +++ b/ggml/src/ggml-hexagon/htp/hvx-utils.c @@ -49,28 +49,25 @@ void hvx_mul_f32(const uint8_t * restrict src0, FARF(HIGH, "hvx_mul_f32: unaligned loop in hvx op, possibly slower execution\n"); } - bool handled_leftover = false; if (0 == unaligned_loop) { HVX_Vector * restrict vec_in1 = (HVX_Vector *) src0; HVX_Vector * restrict vec_in2 = (HVX_Vector *) src1; HVX_Vector * restrict vec_out = (HVX_Vector *) dst; - #pragma unroll(4) +#pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { HVX_Vector v = Q6_Vqf32_vmpy_VsfVsf(*vec_in1++, *vec_in2++); *vec_out++ = Q6_Vsf_equals_Vqf32(v); } } else { - int step_of_1 = num_elems_whole >> 5; // divby 32, because 32 float = 128 bytes per HVX vector + int step_of_1 = num_elems_whole >> 5; // divby 32, because 32 float = 128 bytes per HVX vector int leftover_size = left_over * sizeof(float); - - HVX_Vector * restrict vec_in1 = (HVX_Vector *) src0; - HVX_Vector * restrict vec_in2 = (HVX_Vector *) src1; + HVX_Vector * restrict vec_in1 = (HVX_Vector *) src0; + HVX_Vector * restrict vec_in2 = (HVX_Vector *) src1; HVX_UVector * restrict vec_out = (HVX_UVector *) dst; - HVX_Vector slinep; HVX_Vector slinec; HVX_Vector sline; @@ -78,48 +75,42 @@ void hvx_mul_f32(const uint8_t * restrict src0, HVX_Vector sline2c; HVX_Vector sline2; - slinep = *vec_in1++; + slinep = *vec_in1++; sline2p = *vec_in2++; - #pragma unroll(4) - for(uint32_t i = step_of_1 -1; i> 0; i--){ - slinec = *vec_in1++; +#pragma unroll(4) + for (uint32_t i = step_of_1 - 1; i > 0; i--) { + slinec = *vec_in1++; sline2c = *vec_in2++; - sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src0); - sline2 = Q6_V_valign_VVR(sline2c, sline2p, (size_t) src1); - - *((HVX_UVector *)(vec_out++)) =Q6_Vsf_equals_Vqf32( Q6_Vqf32_vmpy_VsfVsf(sline, sline2)); - slinep = slinec; - sline2p = sline2c; + sline = Q6_V_valign_VVR(slinec, 
slinep, (size_t) src0); + sline2 = Q6_V_valign_VVR(sline2c, sline2p, (size_t) src1); + + *((HVX_UVector *) (vec_out++)) = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(sline, sline2)); + slinep = slinec; + sline2p = sline2c; } - if(step_of_1 > 1){ - slinec = htp_is_aligned(vec_in1, VLEN) && left_over == 0 ? slinep : *vec_in1++; + if (step_of_1 > 1) { + slinec = htp_is_aligned(vec_in1, VLEN) && left_over == 0 ? slinep : *vec_in1++; sline2c = htp_is_aligned(vec_in2, VLEN) && left_over == 0 ? sline2p : *vec_in2++; - sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src0); - sline2 = Q6_V_valign_VVR(sline2c, sline2p, (size_t) src1); - *((HVX_UVector *)(vec_out++)) =Q6_Vsf_equals_Vqf32( Q6_Vqf32_vmpy_VsfVsf(sline, sline2)); - slinep = slinec; - sline2p = sline2c; + sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src0); + sline2 = Q6_V_valign_VVR(sline2c, sline2p, (size_t) src1); + *((HVX_UVector *) (vec_out++)) = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(sline, sline2)); + slinep = slinec; + sline2p = sline2c; } - if(left_over > 0 ){ + if (left_over > 0) { + slinec = (is_in_one_chunk(vec_in1, leftover_size, VLEN) ? slinep : *vec_in1++); - slinec = (is_in_one_chunk(vec_in1, leftover_size, VLEN) - ? slinep - : *vec_in1++); - - sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src0); - sline2c = (is_in_one_chunk(vec_in2, leftover_size, VLEN) - ? sline2p - : *vec_in2++); - sline2 = Q6_V_valign_VVR(sline2c, sline2p, (size_t) src1); + sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src0); + sline2c = (is_in_one_chunk(vec_in2, leftover_size, VLEN) ? sline2p : *vec_in2++); + sline2 = Q6_V_valign_VVR(sline2c, sline2p, (size_t) src1); HVX_Vector out = Q6_Vqf32_vmpy_VsfVsf(sline, sline2); - hvx_vec_store_u(vec_out, leftover_size, Q6_Vsf_equals_Vqf32(out)); + hvx_vec_store_u(vec_out, leftover_size, Q6_Vsf_equals_Vqf32(out)); handled_leftover = true; } } - if (left_over > 0 && !handled_leftover) { const float * src0f = (const float *) src0 + num_elems_whole; const float * src1f = (const float *) src1 + num_elems_whole; @@ -315,13 +306,13 @@ void hvx_add_f32(const uint8_t * restrict src0, HVX_Vector * restrict vec_in2 = (HVX_Vector *) src1; HVX_Vector * restrict vec_out = (HVX_Vector *) dst; - #pragma unroll(4) +#pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { HVX_Vector v = Q6_Vqf32_vadd_VsfVsf(*vec_in1++, *vec_in2++); *vec_out++ = Q6_Vsf_equals_Vqf32(v); } } else { - #pragma unroll(4) +#pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { HVX_Vector in1 = *(HVX_UVector *) (src0 + i * SIZEOF_FP32); HVX_Vector in2 = *(HVX_UVector *) (src1 + i * SIZEOF_FP32); @@ -458,7 +449,7 @@ void hvx_add_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * HVX_Vector * restrict vec_in1 = (HVX_Vector *) src; HVX_Vector * restrict vec_out = (HVX_Vector *) dst; - #pragma unroll(4) +#pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { HVX_Vector in = *vec_in1++; const HVX_VectorPred pred_inf = Q6_Q_vcmp_eq_VwVw(inf, in); @@ -468,7 +459,7 @@ void hvx_add_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * *vec_out++ = v; } } else { - #pragma unroll(4) +#pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32); @@ -512,60 +503,54 @@ void hvx_mul_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * FARF(HIGH, "hvx_mul_scalar_f32: unaligned loop in hvx op, possibly slower execution\n"); } - HVX_Vector val_vec = hvx_vec_splat_fp32(val); - bool 
handled_leftover = false; + HVX_Vector val_vec = hvx_vec_splat_fp32(val); + bool handled_leftover = false; if (0 == unaligned_loop) { HVX_Vector * restrict vec_in1 = (HVX_Vector *) src; HVX_Vector * restrict vec_out = (HVX_Vector *) dst; - #pragma unroll(4) +#pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { HVX_Vector v = Q6_Vqf32_vmpy_VsfVsf(*vec_in1++, val_vec); *vec_out++ = Q6_Vsf_equals_Vqf32(v); } } else { - int step_of_1 = num_elems >> 5; // divby 32, because 32 float = 128 bytes per HVX vector + int step_of_1 = num_elems >> 5; // divby 32, because 32 float = 128 bytes per HVX vector int leftover_size = left_over * sizeof(float); - - - HVX_Vector * input_v_ptr = (HVX_Vector *) src; - HVX_UVector * output_v_ptr = (HVX_UVector *) dst; - + HVX_Vector * input_v_ptr = (HVX_Vector *) src; + HVX_UVector * output_v_ptr = (HVX_UVector *) dst; HVX_Vector slinep; HVX_Vector slinec; HVX_Vector sline; - - slinep = *input_v_ptr++; - #pragma unroll(4) - for(uint32_t i = step_of_1 - 1; i > 0; i--){ - slinec = *input_v_ptr++; - sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src); - *((HVX_UVector *)(output_v_ptr++)) = Q6_Vsf_equals_Vqf32( Q6_Vqf32_vmpy_VsfVsf(sline, val_vec)); + slinep = *input_v_ptr++; + +#pragma unroll(4) + for (uint32_t i = step_of_1 - 1; i > 0; i--) { + slinec = *input_v_ptr++; + sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src); + *((HVX_UVector *) (output_v_ptr++)) = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(sline, val_vec)); /* Prepare slinep for next iteration */ - slinep = slinec; + slinep = slinec; } - if(step_of_1 > 0){ - + if (step_of_1 > 0) { slinec = htp_is_aligned(input_v_ptr, VLEN) && left_over == 0 ? slinep : *input_v_ptr++; - sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src); - *((HVX_UVector *)(output_v_ptr++)) = Q6_Vsf_equals_Vqf32( Q6_Vqf32_vmpy_VsfVsf(sline, val_vec)); + sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src); + *((HVX_UVector *) (output_v_ptr++)) = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(sline, val_vec)); slinep = slinec; } - if(leftover_size > 0){ - slinec = (is_in_one_chunk(input_v_ptr, leftover_size, VLEN) - ? slinep - : *input_v_ptr++); + if (leftover_size > 0) { + slinec = (is_in_one_chunk(input_v_ptr, leftover_size, VLEN) ? 
slinep : *input_v_ptr++); sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src); - HVX_Vector sout = Q6_Vsf_equals_Vqf32( Q6_Vqf32_vmpy_VsfVsf(sline, val_vec)); - hvx_vec_store_u(output_v_ptr, leftover_size, sout); + HVX_Vector sout = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(sline, val_vec)); + hvx_vec_store_u(output_v_ptr, leftover_size, sout); handled_leftover = true; } } @@ -606,13 +591,13 @@ void hvx_sub_f32(const uint8_t * restrict src0, HVX_Vector * restrict vec_in2 = (HVX_Vector *) src1; HVX_Vector * restrict vec_out = (HVX_Vector *) dst; - #pragma unroll(4) +#pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { HVX_Vector v = Q6_Vqf32_vsub_VsfVsf(*vec_in1++, *vec_in2++); *vec_out++ = Q6_Vsf_equals_Vqf32(v); } } else { - #pragma unroll(4) +#pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { HVX_Vector in1 = *(HVX_UVector *) (src0 + i * SIZEOF_FP32); HVX_Vector in2 = *(HVX_UVector *) (src1 + i * SIZEOF_FP32); @@ -747,13 +732,13 @@ void hvx_sub_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * HVX_Vector * restrict vec_in1 = (HVX_Vector *) src; HVX_Vector * restrict vec_out = (HVX_Vector *) dst; - #pragma unroll(4) +#pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { HVX_Vector v = Q6_Vqf32_vsub_VsfVsf(*vec_in1++, val_vec); *vec_out++ = Q6_Vsf_equals_Vqf32(v); } } else { - #pragma unroll(4) +#pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32); @@ -789,7 +774,7 @@ float hvx_sum_of_squares_f32(const uint8_t * restrict src, const int num_elems) HVX_Vector sum_vec_acc = Q6_V_vsplat_R(0x00000000); HVX_Vector zero_vec = Q6_V_vsplat_R(0x00000000); - #pragma unroll(4) +#pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { HVX_Vector v = Q6_Vqf32_vmpy_VsfVsf(*vec_in1, *vec_in1); sum_vec_acc = Q6_Vqf32_vadd_Vqf32Vqf32(sum_vec_acc, v); @@ -833,13 +818,13 @@ float hvx_self_sum_f32(const uint8_t * restrict src, const int num_elems) { if (0 == unaligned_loop) { HVX_Vector * vec_in = (HVX_Vector *) src; - #pragma unroll(4) +#pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { // sum_vec = Q6_Vqf32_vadd_Vqf32Vsf(sum_vec, *vec_in++); sum_vec = Q6_Vqf32_vadd_VsfVsf(Q6_Vsf_equals_Vqf32(sum_vec), *vec_in++); } } else { - #pragma unroll(4) +#pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32); @@ -882,13 +867,13 @@ void hvx_scale_f32(const uint8_t * restrict src, uint8_t * restrict dst, const i HVX_Vector * vec_in1 = (HVX_Vector *) src; HVX_Vector * vec_out = (HVX_Vector *) dst; - #pragma unroll(4) +#pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { HVX_Vector v = Q6_Vqf32_vmpy_VsfVsf(*vec_in1++, scale_vec); *vec_out++ = Q6_Vsf_equals_Vqf32(v); } } else { - #pragma unroll(4) +#pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32); @@ -931,12 +916,12 @@ float hvx_self_max_f32(const uint8_t * restrict src, const int num_elems) { if (0 == unaligned_loop) { HVX_Vector * restrict vec_in = (HVX_Vector *) src; - #pragma unroll(4) +#pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { vec_max = Q6_Vsf_vmax_VsfVsf(vec_max, *vec_in++); } } else { - #pragma unroll(4) +#pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32); @@ -974,7 +959,7 @@ void 
hvx_min_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * HVX_Vector * restrict vec_in = (HVX_Vector *) src; HVX_Vector * restrict vec_out = (HVX_Vector *) dst; - #pragma unroll(4) +#pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { vec_min = Q6_Vsf_vmin_VsfVsf(vec_min, *vec_in++); *vec_out++ = Q6_Vsf_equals_Vqf32(vec_min); @@ -1012,7 +997,7 @@ void hvx_clamp_scalar_f32(const uint8_t * restrict src, HVX_Vector range_left = hvx_vec_splat_fp32(limit_left); HVX_Vector range_right = hvx_vec_splat_fp32(limit_right); - #pragma unroll(4) +#pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { HVX_Vector in_vec = *vec_in++; HVX_Vector temp_v = in_vec; diff --git a/ggml/src/ggml-hexagon/htp/hvx-utils.h b/ggml/src/ggml-hexagon/htp/hvx-utils.h index 0b24786391..c5da167d49 100644 --- a/ggml/src/ggml-hexagon/htp/hvx-utils.h +++ b/ggml/src/ggml-hexagon/htp/hvx-utils.h @@ -23,20 +23,18 @@ typedef union { /* Q6_Vsf_equals_Vw is only available on v73+.*/ #if __HVX_ARCH__ < 73 -static inline HVX_Vector int32_to_qfloat(HVX_Vector const in) -{ - HVX_Vector const vzero = Q6_V_vzero(); - HVX_VectorPred is_zero = Q6_Q_vcmp_eq_VwVw(in, vzero); - HVX_Vector lshift = Q6_Vw_vnormamt_Vw(in); - HVX_Vector normalized = Q6_Vw_vasl_VwVw(in, lshift); - HVX_Vector vexp = Q6_Vw_vsub_VwVw(Q6_V_vsplat_R(0x7f + 30), lshift); - HVX_Vector mant = Q6_V_vand_VV(Q6_V_vsplat_R(0xFFFFFF00), normalized); - HVX_Vector ret = Q6_V_vmux_QVV(is_zero, vzero, Q6_Vw_vadd_VwVw(mant, vexp)); +static inline HVX_Vector int32_to_qfloat(const HVX_Vector in) { + const HVX_Vector vzero = Q6_V_vzero(); + HVX_VectorPred is_zero = Q6_Q_vcmp_eq_VwVw(in, vzero); + HVX_Vector lshift = Q6_Vw_vnormamt_Vw(in); + HVX_Vector normalized = Q6_Vw_vasl_VwVw(in, lshift); + HVX_Vector vexp = Q6_Vw_vsub_VwVw(Q6_V_vsplat_R(0x7f + 30), lshift); + HVX_Vector mant = Q6_V_vand_VV(Q6_V_vsplat_R(0xFFFFFF00), normalized); + HVX_Vector ret = Q6_V_vmux_QVV(is_zero, vzero, Q6_Vw_vadd_VwVw(mant, vexp)); return ret; } -static inline HVX_Vector Q6_Vsf_equals_Vw(HVX_Vector const in) -{ +static inline HVX_Vector Q6_Vsf_equals_Vw(const HVX_Vector in) { return Q6_Vsf_equals_Vqf32(int32_to_qfloat(in)); } #endif @@ -109,7 +107,7 @@ static inline void hvx_copy_fp16_aa(uint8_t * restrict dst, const uint8_t * rest uint32_t i = 0; - #pragma unroll(4) +#pragma unroll(4) for (; i < nvec; i++) { HVX_Vector v = vsrc[i]; vdst[i] = v; @@ -133,7 +131,7 @@ static inline void hvx_copy_fp16_ua(uint8_t * restrict dst, const uint8_t * rest uint32_t i = 0; - #pragma unroll(4) +#pragma unroll(4) for (; i < nvec; i++) { HVX_Vector v = vsrc[i]; vdst[i] = v; @@ -157,7 +155,7 @@ static inline void hvx_copy_fp16_au(uint8_t * restrict dst, const uint8_t * rest uint32_t i = 0; - #pragma unroll(4) +#pragma unroll(4) for (; i < nvec; i++) { HVX_Vector v = vsrc[i]; vdst[i] = v; @@ -182,7 +180,7 @@ static inline void hvx_copy_fp32_aa(uint8_t * restrict dst, const uint8_t * rest uint32_t i = 0; - #pragma unroll(4) +#pragma unroll(4) for (; i < nvec; i++) { HVX_Vector v = vsrc[i]; vdst[i] = v; @@ -206,7 +204,7 @@ static inline void hvx_copy_fp32_ua(uint8_t * restrict dst, const uint8_t * rest uint32_t i = 0; - #pragma unroll(4) +#pragma unroll(4) for (; i < nvec; i++) { HVX_Vector v = vsrc[i]; vdst[i] = v; @@ -230,7 +228,7 @@ static inline void hvx_copy_fp32_au(uint8_t * restrict dst, const uint8_t * rest uint32_t i = 0; - #pragma unroll(4) +#pragma unroll(4) for (; i < nvec; i++) { HVX_Vector v = vsrc[i]; vdst[i] = v; @@ -255,7 +253,7 @@ static inline void 
hvx_bcast_fp32_a(uint8_t * restrict dst, float elem, uint32_t uint32_t i = 0; - #pragma unroll(4) +#pragma unroll(4) for (; i < nvec; i++) { vdst[i] = velem; } @@ -265,7 +263,6 @@ static inline void hvx_bcast_fp32_a(uint8_t * restrict dst, float elem, uint32_t } } - /* Return whether 'n' elements from vector are in the one chunk of 'chunk_size'. */ static __attribute__((always_inline)) int32_t is_in_one_chunk(void * addr, uint32_t n, uint32_t chunk_size) { uint32_t left_off = (size_t) addr & (chunk_size - 1); @@ -273,8 +270,6 @@ static __attribute__((always_inline)) int32_t is_in_one_chunk(void * addr, uint3 return right_off <= chunk_size; } - - static void hvx_vec_dump_fp16_n(char * pref, HVX_Vector v, uint32_t n) { HVX_VectorAlias u = { .v = v }; @@ -992,16 +987,15 @@ static inline void hvx_fast_sigmoid_f32(const uint8_t * restrict src, uint8_t * const HVX_Vector max_exp = hvx_vec_splat_fp32(kMaxExp); const HVX_Vector min_exp = hvx_vec_splat_fp32(kMinExp); - #pragma unroll(4) +#pragma unroll(4) for (int i = 0; i < step_of_1; i++) { v_dst[i] = hvx_vec_fast_sigmoid_fp32_guard(v_src[i], one, max_exp, min_exp); } } - -static inline void hvx_sigmoid_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems){ +static inline void hvx_sigmoid_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems) { int step_of_1 = num_elems >> 5; // divby 32, because 32 float = 128 bytes per HVX vector - int leftover = num_elems - (step_of_1 * VLEN_FP32); + int leftover = num_elems - (step_of_1 * VLEN_FP32); int32_t leftover_size = leftover * sizeof(float); @@ -1012,51 +1006,44 @@ static inline void hvx_sigmoid_f32(const uint8_t * restrict src, uint8_t * restr const HVX_Vector max_exp = hvx_vec_splat_fp32(kMaxExp); const HVX_Vector min_exp = hvx_vec_splat_fp32(kMinExp); - const float *input = (float *)src; - float *output = (float *)dst; - - HVX_Vector * input_v_ptr = (HVX_Vector *) input; - HVX_UVector * output_v_ptr = (HVX_UVector *) output; + const float * input = (float *) src; + float * output = (float *) dst; + HVX_Vector * input_v_ptr = (HVX_Vector *) input; + HVX_UVector * output_v_ptr = (HVX_UVector *) output; HVX_Vector slinep; HVX_Vector slinec; HVX_Vector sline; - - slinep = *input_v_ptr++; - #pragma unroll(4) - for(uint32_t i = step_of_1 -1; i> 0; i--){ - slinec = *input_v_ptr++; - sline = Q6_V_valign_VVR(slinec, slinep, (size_t) input); - *((HVX_UVector *)(output_v_ptr++)) = hvx_vec_fast_sigmoid_fp32_guard(sline, one, max_exp, min_exp); + slinep = *input_v_ptr++; +#pragma unroll(4) + for (uint32_t i = step_of_1 - 1; i > 0; i--) { + slinec = *input_v_ptr++; + sline = Q6_V_valign_VVR(slinec, slinep, (size_t) input); + *((HVX_UVector *) (output_v_ptr++)) = hvx_vec_fast_sigmoid_fp32_guard(sline, one, max_exp, min_exp); /* Prepare slinep for next iteration */ - slinep = slinec; + slinep = slinec; } - if(step_of_1> 0){ - + if (step_of_1 > 0) { slinec = htp_is_aligned(input_v_ptr, 128) && leftover == 0 ? slinep : *input_v_ptr++; - sline = Q6_V_valign_VVR(slinec, slinep, (size_t) input); - *((HVX_UVector *)(output_v_ptr++)) = hvx_vec_fast_sigmoid_fp32_guard(sline, one, max_exp, min_exp);; + sline = Q6_V_valign_VVR(slinec, slinep, (size_t) input); + *((HVX_UVector *) (output_v_ptr++)) = hvx_vec_fast_sigmoid_fp32_guard(sline, one, max_exp, min_exp); + ; slinep = slinec; } - if(leftover> 0){ - slinec = (is_in_one_chunk(input_v_ptr, leftover_size, 128) - ? slinep - : *input_v_ptr++); + if (leftover > 0) { + slinec = (is_in_one_chunk(input_v_ptr, leftover_size, 128) ? 
slinep : *input_v_ptr++); sline = Q6_V_valign_VVR(slinec, slinep, (size_t) input); HVX_Vector sout = hvx_vec_fast_sigmoid_fp32_guard(sline, one, max_exp, min_exp); - hvx_vec_store_u(output_v_ptr, leftover_size, sout); + hvx_vec_store_u(output_v_ptr, leftover_size, sout); } - - } - float hvx_sum_of_squares_f32(const uint8_t * restrict src, const int num_elems); void hvx_mul_f32(const uint8_t * restrict src0, const uint8_t * restrict src1, diff --git a/ggml/src/ggml-hexagon/htp/main.c b/ggml/src/ggml-hexagon/htp/main.c index e30ae69502..cbfdd0472f 100644 --- a/ggml/src/ggml-hexagon/htp/main.c +++ b/ggml/src/ggml-hexagon/htp/main.c @@ -151,7 +151,7 @@ static int vtcm_acquire(struct htp_context * ctx) { qurt_thread_set_priority(qurt_thread_get_id(), ctx->thread_prio - 10); err = HAP_compute_res_acquire_cached(ctx->vtcm_rctx, 1000000); if (err != 0) { - FARF(ERROR, "Failed to acquire VTCM: 0x%08x", (unsigned)err); + FARF(ERROR, "Failed to acquire VTCM: 0x%08x", (unsigned) err); abort(); } HAP_compute_res_release_cached(ctx->vtcm_rctx); @@ -159,7 +159,7 @@ static int vtcm_acquire(struct htp_context * ctx) { err = HAP_compute_res_acquire_cached(ctx->vtcm_rctx, 1000000); if (err != 0) { - FARF(ERROR, "Failed to acquire VTCM: 0x%08x", (unsigned)err); + FARF(ERROR, "Failed to acquire VTCM: 0x%08x", (unsigned) err); abort(); } ctx->vtcm_valid = true; @@ -411,7 +411,7 @@ static void proc_matmul_req(struct htp_context * ctx, rsp_bufs[0].ptr = bufs[2].ptr; rsp_bufs[0].size = bufs[2].size; rsp_bufs[0].offset = bufs[2].offset; - rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP + rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU // Setup Op context @@ -453,7 +453,7 @@ static void proc_matmul_id_req(struct htp_context * ctx, rsp_bufs[0].ptr = bufs[3].ptr; rsp_bufs[0].size = bufs[3].size; rsp_bufs[0].offset = bufs[3].offset; - rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP + rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU // Setup Op context @@ -494,7 +494,7 @@ static void proc_binary_req(struct htp_context * ctx, struct htp_general_req * r rsp_bufs[0].ptr = bufs[2].ptr; rsp_bufs[0].offset = bufs[2].offset; rsp_bufs[0].size = bufs[2].size; - rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP + rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU // Setup Op context @@ -533,7 +533,7 @@ static void proc_add_id_req(struct htp_context * ctx, struct htp_general_req * r rsp_bufs[0].ptr = bufs[3].ptr; rsp_bufs[0].offset = bufs[3].offset; rsp_bufs[0].size = bufs[3].size; - rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP + rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU // Setup Op context @@ -574,7 +574,7 @@ static void proc_unary_req(struct htp_context * ctx, struct htp_general_req * re rsp_bufs[0].ptr = bufs[1].ptr; rsp_bufs[0].offset = bufs[1].offset; rsp_bufs[0].size = bufs[1].size; - rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP + rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU // Setup Op context @@ -618,8 +618,8 @@ static void proc_activations_req(struct htp_context * ctx, rsp_bufs[0].ptr = 
bufs[write_idx].ptr; rsp_bufs[0].offset = bufs[write_idx].offset; rsp_bufs[0].size = bufs[write_idx].size; - rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP - DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU + rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP + DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU // Setup Op context struct htp_ops_context octx = { 0 }; @@ -674,8 +674,8 @@ static void proc_rope_req(struct htp_context * ctx, rsp_bufs[0].ptr = bufs[write_idx].ptr; rsp_bufs[0].offset = bufs[write_idx].offset; rsp_bufs[0].size = bufs[write_idx].size; - rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP - DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU + rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP + DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU // Setup Op context struct htp_ops_context octx = { 0 }; From cf3a65fb73b21b108b9f586add04f5250ece934d Mon Sep 17 00:00:00 2001 From: shouyud Date: Tue, 16 Dec 2025 14:28:34 -0500 Subject: [PATCH 13/14] Revert "chore: reformat code with clang-formatter to pass cli test" This reverts commit 952877ec24732b12010c7fa7ed3fc8de4b74e718. --- ggml/src/ggml-hexagon/ggml-hexagon.cpp | 207 +++++++++++++------------ ggml/src/ggml-hexagon/htp/act-ops.c | 12 +- ggml/src/ggml-hexagon/htp/htp-msg.h | 8 +- ggml/src/ggml-hexagon/htp/hvx-utils.c | 145 +++++++++-------- ggml/src/ggml-hexagon/htp/hvx-utils.h | 87 ++++++----- ggml/src/ggml-hexagon/htp/main.c | 22 +-- 6 files changed, 260 insertions(+), 221 deletions(-) diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp index 781db7facf..c45b292a52 100644 --- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -8,8 +8,8 @@ #include #include #include -#include #include +#include #ifdef _WIN32 # include @@ -53,12 +53,10 @@ static int opt_opmask = HTP_OPMASK_QUEUE | HTP_OPMASK_QUANTIZE | HTP_OPMASK_COMP static int opt_opsync = 0; // synchronous ops #define HEX_VERBOSE(...) \ - if (opt_verbose) \ - GGML_LOG_DEBUG(__VA_ARGS__) + if (opt_verbose) GGML_LOG_DEBUG(__VA_ARGS__) #define HEX_PROFILE(...) 
\ - if (opt_profile) \ - GGML_LOG_INFO(__VA_ARGS__) + if (opt_profile) GGML_LOG_INFO(__VA_ARGS__) static inline uint64_t hex_is_aligned(void * addr, uint32_t align) { return ((size_t) addr & (align - 1)) == 0; @@ -220,7 +218,7 @@ struct ggml_hexagon_session { void allocate(int dev_id) noexcept(false); void release() noexcept(true); - void enqueue(struct htp_general_req & req, struct dspqueue_buffer * bufs, uint32_t n_bufs, bool sync = false); + void enqueue(struct htp_general_req &req, struct dspqueue_buffer *bufs, uint32_t n_bufs, bool sync = false); void flush(); ggml_backend_buffer_type buffer_type; @@ -260,10 +258,7 @@ static inline void hex_print_op_info(const ggml_tensor * op, ggml_hexagon_sessio names, dims, types, strides, buffs, req_flags); } -void ggml_hexagon_session::enqueue(struct htp_general_req & req, - struct dspqueue_buffer * bufs, - uint32_t n_bufs, - bool sync) { +void ggml_hexagon_session::enqueue(struct htp_general_req &req, struct dspqueue_buffer *bufs, uint32_t n_bufs, bool sync) { // Bump pending flag (cleared in the session::flush once we get the responce) this->op_pending++; // atomic inc @@ -303,13 +298,13 @@ void ggml_hexagon_session::flush() { // Read response packet from queue int err = dspqueue_read(q, &flags, - HTP_MAX_PACKET_BUFFERS, // Maximum number of buffer references - &n_bufs, // Number of buffer references - bufs, // Buffer references - sizeof(rsp), // Max message length - &rsp_size, // Message length - (uint8_t *) &rsp, - 1000000); // Timeout + HTP_MAX_PACKET_BUFFERS, // Maximum number of buffer references + &n_bufs, // Number of buffer references + bufs, // Buffer references + sizeof(rsp), // Max message length + &rsp_size, // Message length + (uint8_t *) &rsp, + 1000000); // Timeout if (err == AEE_EEXPIRED) { // TODO: might need to bail out if the HTP is stuck on something @@ -359,8 +354,8 @@ struct ggml_backend_hexagon_buffer_context { int err = fastrpc_mmap(s->domain_id, this->fd, (void *) this->base, 0, this->size, FASTRPC_MAP_FD); if (err != 0) { - GGML_LOG_ERROR("ggml-hex: buffer mapping failed : domain_id %d size %zu fd %d error 0x%08x\n", s->domain_id, - this->size, this->fd, (unsigned) err); + GGML_LOG_ERROR("ggml-hex: buffer mapping failed : domain_id %d size %zu fd %d error 0x%08x\n", + s->domain_id, this->size, this->fd, (unsigned) err); return false; } @@ -391,12 +386,10 @@ struct ggml_backend_hexagon_buffer_context { size += 4 * 1024; // extra page for padding if (rpcmem_alloc2) { - this->base = - (uint8_t *) rpcmem_alloc2(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS | RPCMEM_HEAP_NOREG, size); + this->base = (uint8_t *) rpcmem_alloc2(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS | RPCMEM_HEAP_NOREG, size); } else { GGML_LOG_INFO("ggml-hex: %s rpcmem_alloc2 not found, falling back to rpcmem_alloc\n", sess->name.c_str()); - this->base = - (uint8_t *) rpcmem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS | RPCMEM_HEAP_NOREG, size); + this->base = (uint8_t *) rpcmem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS | RPCMEM_HEAP_NOREG, size); } if (!this->base) { @@ -460,7 +453,7 @@ static enum ggml_status ggml_backend_hexagon_buffer_init_tensor(ggml_backend_buf (int) ctx->repack); if (tensor->view_src != NULL && tensor->view_offs == 0) { - ; // nothing to do for the view + ; // nothing to do for the view } else { if (!ctx->mapped) { ctx->mmap(); @@ -709,8 +702,8 @@ static void repack_q4_0_q4x4x2(ggml_tensor * t, const void * data, size_t size) // Ensure we don't try to read more data than is available in the source buffer 'data' // or 
write more than the tensor can hold. - const size_t total_tensor_size = (size_t) nrows * row_size; - const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size; + const size_t total_tensor_size = (size_t)nrows * row_size; + const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size; // Calculate how many full rows and how many remaining bytes we need to process. const int64_t n_full_rows = n_bytes_to_copy / row_size; @@ -739,7 +732,7 @@ static void repack_q4_0_q4x4x2(ggml_tensor * t, const void * data, size_t size) // 2. Process the final, potentially partial, row if (n_rem_bytes > 0) { - const int64_t i = n_full_rows; + const int64_t i = n_full_rows; const uint8_t * src = (const uint8_t *) data + (i * row_size); uint8_t * dst = (uint8_t *) t->data + (i * row_size); @@ -769,8 +762,8 @@ static void repack_q4x4x2_q4_0(void * data, const ggml_tensor * t, size_t size) size_t row_size_rp = row_size * 2; // extra space for tmp pad (if any) // Ensure we don't try to copy more data than the tensor actually contains. - const size_t total_tensor_size = (size_t) nrows * row_size; - const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size; + const size_t total_tensor_size = (size_t)nrows * row_size; + const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size; // Calculate how many full rows and how many remaining bytes we need to process. const int64_t n_full_rows = n_bytes_to_copy / row_size; @@ -799,7 +792,7 @@ static void repack_q4x4x2_q4_0(void * data, const ggml_tensor * t, size_t size) // 2. Process the final, potentially partial, row if (n_rem_bytes > 0) { - const int64_t i = n_full_rows; + const int64_t i = n_full_rows; const uint8_t * src = (const uint8_t *) t->data + (i * row_size); uint8_t * dst = (uint8_t *) data + (i * row_size); @@ -1035,8 +1028,8 @@ static void repack_q8_0_q8x4x2(ggml_tensor * t, const void * data, size_t size) // Ensure we don't try to read more data than is available in the source buffer 'data' // or write more than the tensor can hold. - const size_t total_tensor_size = (size_t) nrows * row_size; - const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size; + const size_t total_tensor_size = (size_t)nrows * row_size; + const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size; // Calculate how many full rows and how many remaining bytes we need to process. const int64_t n_full_rows = n_bytes_to_copy / row_size; @@ -1065,7 +1058,7 @@ static void repack_q8_0_q8x4x2(ggml_tensor * t, const void * data, size_t size) // 2. Process the final, potentially partial, row if (n_rem_bytes > 0) { - const int64_t i = n_full_rows; + const int64_t i = n_full_rows; const uint8_t * src = (const uint8_t *) data + (i * row_size); uint8_t * dst = (uint8_t *) t->data + (i * row_size); @@ -1095,8 +1088,8 @@ static void repack_q8x4x2_q8_0(void * data, const ggml_tensor * t, size_t size) size_t row_size_rp = row_size * 2; // extra space for tmp pad (if any) // Ensure we don't try to copy more data than the tensor actually contains. - const size_t total_tensor_size = (size_t) nrows * row_size; - const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size; + const size_t total_tensor_size = (size_t)nrows * row_size; + const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size; // Calculate how many full rows and how many remaining bytes we need to process. 
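[Editor's note: illustrative aside, not part of the patch. It restates the clamp-and-split arithmetic used by the repack helpers in this hunk (total_tensor_size, n_bytes_to_copy, n_full_rows, n_rem_bytes) as a standalone C helper; the function name split_rows is hypothetical.]

#include <stddef.h>
#include <stdint.h>

/* Clamp the incoming byte count to the tensor's capacity, then split it into
 * whole rows plus a trailing partial row, mirroring the pattern in the repack
 * helpers around this hunk. */
static void split_rows(size_t size, int64_t nrows, size_t row_size,
                       int64_t * n_full_rows, size_t * n_rem_bytes) {
    const size_t total_tensor_size = (size_t) nrows * row_size;
    const size_t n_bytes_to_copy   = size < total_tensor_size ? size : total_tensor_size;

    *n_full_rows = (int64_t) (n_bytes_to_copy / row_size);  /* complete rows to repack */
    *n_rem_bytes = n_bytes_to_copy % row_size;              /* bytes of the final partial row */
}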
const int64_t n_full_rows = n_bytes_to_copy / row_size; @@ -1125,7 +1118,7 @@ static void repack_q8x4x2_q8_0(void * data, const ggml_tensor * t, size_t size) // 2. Process the final, potentially partial, row if (n_rem_bytes > 0) { - const int64_t i = n_full_rows; + const int64_t i = n_full_rows; const uint8_t * src = (const uint8_t *) t->data + (i * row_size); uint8_t * dst = (uint8_t *) data + (i * row_size); @@ -1386,8 +1379,8 @@ static void repack_mxfp4_mxfp4x4x2(ggml_tensor * t, const void * data, size_t si // Ensure we don't try to read more data than is available in the source buffer 'data' // or write more than the tensor can hold. - const size_t total_tensor_size = (size_t) nrows * row_size; - const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size; + const size_t total_tensor_size = (size_t)nrows * row_size; + const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size; // Calculate how many full rows and how many remaining bytes we need to process. const int64_t n_full_rows = n_bytes_to_copy / row_size; @@ -1416,7 +1409,7 @@ static void repack_mxfp4_mxfp4x4x2(ggml_tensor * t, const void * data, size_t si // 2. Process the final, potentially partial, row if (n_rem_bytes > 0) { - const int64_t i = n_full_rows; + const int64_t i = n_full_rows; const uint8_t * src = (const uint8_t *) data + (i * row_size); uint8_t * dst = (uint8_t *) t->data + (i * row_size); @@ -1446,8 +1439,8 @@ static void repack_mxfp4x4x2_mxfp4(void * data, const ggml_tensor * t, size_t si size_t row_size_rp = row_size * 2; // extra space for tmp pad (if any) // Ensure we don't try to copy more data than the tensor actually contains. - const size_t total_tensor_size = (size_t) nrows * row_size; - const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size; + const size_t total_tensor_size = (size_t)nrows * row_size; + const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size; // Calculate how many full rows and how many remaining bytes we need to process. const int64_t n_full_rows = n_bytes_to_copy / row_size; @@ -1476,7 +1469,7 @@ static void repack_mxfp4x4x2_mxfp4(void * data, const ggml_tensor * t, size_t si // 2. 
Process the final, potentially partial, row if (n_rem_bytes > 0) { - const int64_t i = n_full_rows; + const int64_t i = n_full_rows; const uint8_t * src = (const uint8_t *) t->data + (i * row_size); uint8_t * dst = (uint8_t *) data + (i * row_size); @@ -1599,28 +1592,25 @@ static const char * ggml_backend_hexagon_buffer_type_name(ggml_backend_buffer_ty return static_cast(buffer_type->context)->name.c_str(); } -static ggml_backend_buffer_t ggml_backend_hexagon_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buffer_type, - size_t size) { +static ggml_backend_buffer_t ggml_backend_hexagon_buffer_type_alloc_buffer( + ggml_backend_buffer_type_t buffer_type, size_t size) { auto sess = static_cast(buffer_type->context)->sess; try { - ggml_backend_hexagon_buffer_context * ctx = - new ggml_backend_hexagon_buffer_context(sess, size, false /*repack*/); + ggml_backend_hexagon_buffer_context * ctx = new ggml_backend_hexagon_buffer_context(sess, size, false /*repack*/); return ggml_backend_buffer_init(buffer_type, ggml_backend_hexagon_buffer_interface, ctx, size); - } catch (const std::exception & exc) { + } catch (std::exception const &exc) { GGML_LOG_ERROR("ggml-hex: %s failed to allocate buffer context: %s\n", sess->name.c_str(), exc.what()); return nullptr; } } static ggml_backend_buffer_t ggml_backend_hexagon_repack_buffer_type_alloc_buffer( - ggml_backend_buffer_type_t buffer_type, - size_t size) { + ggml_backend_buffer_type_t buffer_type, size_t size) { auto sess = static_cast(buffer_type->context)->sess; try { - ggml_backend_hexagon_buffer_context * ctx = - new ggml_backend_hexagon_buffer_context(sess, size, true /*repack*/); + ggml_backend_hexagon_buffer_context * ctx = new ggml_backend_hexagon_buffer_context(sess, size, true /*repack*/); return ggml_backend_buffer_init(buffer_type, ggml_backend_hexagon_buffer_interface, ctx, size); - } catch (const std::exception & exc) { + } catch (std::exception const &exc) { GGML_LOG_ERROR("ggml-hex: %s failed to allocate buffer context: %s\n", sess->name.c_str(), exc.what()); return nullptr; } @@ -1631,8 +1621,7 @@ static size_t ggml_backend_hexagon_buffer_type_get_alignment(ggml_backend_buffer GGML_UNUSED(buffer_type); } -static size_t ggml_backend_hexagon_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, - const struct ggml_tensor * t) { +static size_t ggml_backend_hexagon_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const struct ggml_tensor * t) { return ggml_nbytes(t); } @@ -1708,8 +1697,8 @@ void ggml_hexagon_session::allocate(int dev_id) noexcept(false) { } // Save the IDs - this->session_id = n.session_id; - this->domain_id = n.effective_domain_id; + this->session_id = n.session_id; + this->domain_id = n.effective_domain_id; this->valid_session = true; } @@ -1718,17 +1707,16 @@ void ggml_hexagon_session::allocate(int dev_id) noexcept(false) { char session_uri[256]; { char htp_uri[256]; - snprintf(htp_uri, sizeof(htp_uri), "file:///libggml-htp-v%u.so?htp_iface_skel_handle_invoke&_modver=1.0", - opt_arch); + snprintf(htp_uri, sizeof(htp_uri), "file:///libggml-htp-v%u.so?htp_iface_skel_handle_invoke&_modver=1.0", opt_arch); struct remote_rpc_get_uri u = {}; - u.session_id = this->session_id; - u.domain_name = const_cast(CDSP_DOMAIN_NAME); - u.domain_name_len = strlen(CDSP_DOMAIN_NAME); - u.module_uri = const_cast(htp_uri); - u.module_uri_len = strlen(htp_uri); - u.uri = session_uri; - u.uri_len = sizeof(session_uri); + u.session_id = this->session_id; + u.domain_name = const_cast(CDSP_DOMAIN_NAME); + u.domain_name_len = 
strlen(CDSP_DOMAIN_NAME); + u.module_uri = const_cast(htp_uri); + u.module_uri_len = strlen(htp_uri); + u.uri = session_uri; + u.uri_len = sizeof(session_uri); int err = remote_session_control(FASTRPC_GET_URI, (void *) &u, sizeof(u)); if (err != AEE_SUCCESS) { @@ -1737,9 +1725,7 @@ void ggml_hexagon_session::allocate(int dev_id) noexcept(false) { snprintf(session_uri, htp_URI_domain_len, "%s%s", htp_uri, my_domain->uri); - GGML_LOG_WARN( - "ggml-hex: failed to get URI for session %d : error 0x%x. Falling back to single session URI: %s\n", - dev_id, err, session_uri); + GGML_LOG_WARN("ggml-hex: failed to get URI for session %d : error 0x%x. Falling back to single session URI: %s\n", dev_id, err, session_uri); } } @@ -1765,7 +1751,7 @@ void ggml_hexagon_session::allocate(int dev_id) noexcept(false) { this->valid_handle = true; GGML_LOG_INFO("ggml-hex: new session: %s : session-id %d domain-id %d uri %s handle 0x%lx\n", this->name.c_str(), - this->session_id, this->domain_id, session_uri, (unsigned long) this->handle); + this->session_id, this->domain_id, session_uri, (unsigned long) this->handle); // Enable FastRPC QoS mode { @@ -1855,8 +1841,8 @@ ggml_hexagon_session::ggml_hexagon_session(int dev_id, ggml_backend_dev_t dev) n buffer_type.context = nullptr; repack_buffer_type.context = nullptr; - buffer_type.device = dev; - repack_buffer_type.device = dev; + buffer_type.device = dev; + repack_buffer_type.device = dev; try { allocate(dev_id); @@ -1866,7 +1852,7 @@ ggml_hexagon_session::ggml_hexagon_session(int dev_id, ggml_backend_dev_t dev) n repack_buffer_type.iface = ggml_backend_hexagon_repack_buffer_type_interface; repack_buffer_type.context = new ggml_backend_hexagon_buffer_type_context(this->name + "-REPACK", this); - } catch (const std::exception & exc) { + } catch (std::exception const &exc) { release(); throw; } @@ -1875,8 +1861,8 @@ ggml_hexagon_session::ggml_hexagon_session(int dev_id, ggml_backend_dev_t dev) n ggml_hexagon_session::~ggml_hexagon_session() noexcept(true) { release(); - delete static_cast(buffer_type.context); - delete static_cast(repack_buffer_type.context); + delete static_cast(buffer_type.context); + delete static_cast(repack_buffer_type.context); } // ** backend interface @@ -2178,11 +2164,11 @@ static bool ggml_hexagon_supported_activations(const struct ggml_hexagon_session } // src0, src1 & dst must be mapped to the same session - if (src1) { + if(src1){ if (!hex_supported_buffer(sess, src0, src1, dst)) { return false; } - } else { + }else{ if (!hex_supported_buffer(sess, src0, dst)) { return false; } @@ -2320,11 +2306,11 @@ static size_t dspqueue_buffers_init(dspqueue_buffer * buf, const ggml_tensor * t memset(buf, 0, sizeof(*buf)); auto tensor_buf = static_cast(t->buffer->context); - buf->fd = tensor_buf->fd; - buf->ptr = t->data; - buf->offset = (uint8_t *) t->data - tensor_buf->base; - buf->size = ggml_nbytes(t); - buf->flags = (flush_host ? DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER : 0); // Flush CPU + buf->fd = tensor_buf->fd; + buf->ptr = t->data; + buf->offset = (uint8_t *) t->data - tensor_buf->base; + buf->size = ggml_nbytes(t); + buf->flags = (flush_host ? DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER : 0); // Flush CPU buf->flags |= (flush_htp ? 
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT : 0); // Invalidate DSP return 1; } @@ -2684,7 +2670,8 @@ static void ggml_hexagon_unary(const struct ggml_tensor * op, uint32_t flags) { if (ggml_get_unary_op(dst) == GGML_UNARY_OP_SILU) { req.op = HTP_OP_UNARY_SILU; supported = true; - } else if (ggml_get_unary_op(dst) == GGML_UNARY_OP_GELU) { + } + else if (ggml_get_unary_op(dst) == GGML_UNARY_OP_GELU){ req.op = HTP_OP_UNARY_GELU; supported = true; } @@ -2915,7 +2902,8 @@ static inline bool op_reuse_src1(const ggml_tensor * op1, const ggml_tensor * op return (op0 && op0->src[1] == op1->src[1]); } -static inline bool is_compute_op(ggml_tensor * node) { +static inline bool is_compute_op(ggml_tensor *node) +{ return !(ggml_op_is_empty(node->op) || ggml_is_empty(node)); } @@ -3025,17 +3013,29 @@ struct node_info { std::vector fused; - ggml_op op() const { return node->op; } + ggml_op op() const { + return node->op; + } - const ggml_tensor * dst() const { return fused.empty() ? node : fused.back(); } + const ggml_tensor * dst() const { + return fused.empty() ? node : fused.back(); + } - const ggml_tensor * src0() const { return node->src[0]; } + const ggml_tensor * src0() const { + return node->src[0]; + } - const ggml_tensor * src1() const { return node->src[1]; } + const ggml_tensor * src1() const { + return node->src[1]; + } - bool is_empty() const { return ggml_op_is_empty(node->op); } + bool is_empty() const { + return ggml_op_is_empty(node->op); + } - void add_fused(ggml_tensor * t) { fused.push_back(t); } + void add_fused(ggml_tensor * t) { + fused.push_back(t); + } bool stackable() const { switch (this->op()) { @@ -3047,7 +3047,9 @@ struct node_info { } } - bool same_input(const node_info & n) const { return n.src1() == this->src1(); } + bool same_input(const node_info& n) const { + return n.src1() == this->src1(); + } }; static std::vector ggml_hexagon_graph_optimize_reorder(const std::vector & nodes) { @@ -3112,21 +3114,25 @@ static void ggml_backend_hexagon_graph_optimize(ggml_backend_t backend, ggml_cgr // and perform the reorder over the fused nodes. 
after the reorder is done, we unfuse for (int i = 0; i < n; i++) { node_info node = { - /*.node =*/gf->nodes[i], - /*.fused =*/{}, + /*.node =*/ gf->nodes[i], + /*.fused =*/ {}, }; // fuse only ops that start with these operations // can be expanded when needed - if (node.op() == GGML_OP_ADD || node.op() == GGML_OP_NORM || node.op() == GGML_OP_RMS_NORM) { + if (node.op() == GGML_OP_ADD || + node.op() == GGML_OP_NORM || + node.op() == GGML_OP_RMS_NORM) { ops[0] = node.op(); int f = i + 1; while (f < n && f < i + MAX_FUSE) { // conservatively allow fusing only these ops // can be expanded when needed - if (gf->nodes[f]->op != GGML_OP_ADD && gf->nodes[f]->op != GGML_OP_MUL && - gf->nodes[f]->op != GGML_OP_NORM && gf->nodes[f]->op != GGML_OP_RMS_NORM) { + if (gf->nodes[f]->op != GGML_OP_ADD && + gf->nodes[f]->op != GGML_OP_MUL && + gf->nodes[f]->op != GGML_OP_NORM && + gf->nodes[f]->op != GGML_OP_RMS_NORM) { break; } ops[f - i] = gf->nodes[f]->op; @@ -3302,7 +3308,8 @@ static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, cons case GGML_OP_UNARY: if (ggml_get_unary_op(op) == GGML_UNARY_OP_SILU) { supp = ggml_hexagon_supported_activations(sess, op); - } else if (ggml_get_unary_op(op) == GGML_UNARY_OP_GELU) { + } + else if (ggml_get_unary_op(op) == GGML_UNARY_OP_GELU){ supp = ggml_hexagon_supported_activations(sess, op); } break; @@ -3409,7 +3416,7 @@ ggml_hexagon_registry::ggml_hexagon_registry(ggml_backend_reg_t reg) { } } - if (opt_arch < 75) { + if(opt_arch < 75) { opt_ndev = 1; GGML_LOG_WARN("ggml-hex: forcing ndev to 1 for SoCs archs lower than v75.\n"); } @@ -3418,11 +3425,11 @@ ggml_hexagon_registry::ggml_hexagon_registry(ggml_backend_reg_t reg) { // Create devices / sessions for (size_t i = 0; i < opt_ndev; i++) { - devices[i].iface = ggml_backend_hexagon_device_i; - devices[i].reg = reg; + devices[i].iface = ggml_backend_hexagon_device_i; + devices[i].reg = reg; try { devices[i].context = new ggml_hexagon_session(i, &devices[i]); - } catch (const std::exception & exc) { + } catch (std::exception const &exc) { GGML_LOG_ERROR("ggml-hex: failed to create device/session %zu\n", i); devices[i].context = nullptr; } diff --git a/ggml/src/ggml-hexagon/htp/act-ops.c b/ggml/src/ggml-hexagon/htp/act-ops.c index 273179ae2f..9d3e584a84 100644 --- a/ggml/src/ggml-hexagon/htp/act-ops.c +++ b/ggml/src/ggml-hexagon/htp/act-ops.c @@ -255,6 +255,7 @@ static void glu_swiglu_oai_fp32_per_thread(const struct htp_tensor * src0, src1->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1)); } + static void unary_gelu_fp32_per_thread(const struct htp_tensor * src0, struct htp_tensor * dst, const int32_t * op_params, @@ -300,7 +301,7 @@ static void unary_gelu_fp32_per_thread(const struct htp_tensor * src0, const int BLOCK = 8; for (uint32_t ir = src0_start_row; ir < src0_end_row; ir += BLOCK) { const uint32_t block_end = MIN(ir + BLOCK, src0_end_row); - + // Prefetch next block if (block_end < src0_end_row) { const float * restrict prefetch_ptr = (float *) (data_src0 + (block_end * src0_row_size)); @@ -314,11 +315,12 @@ static void unary_gelu_fp32_per_thread(const struct htp_tensor * src0, // gelu = x * sigmoid(1.702 * x) // current implementation if (1 == opt_path) { - hvx_mul_scalar_f32((const uint8_t *) src0, (float) 1.702, (uint8_t *) src0_spad_data, ne0); + hvx_mul_scalar_f32( (const uint8_t *) src0, (float)1.702, (uint8_t *) src0_spad_data, ne0); hvx_fast_sigmoid_f32((const uint8_t *) src0_spad_data, (uint8_t *) src0_spad_data, ne0); 
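[Editor's note: illustrative sketch, not part of the patch. This is a plain scalar reference for the approximation the HVX path in this hunk implements, gelu(x) ~= x * sigmoid(1.702 * x); it can be useful for spot-checking the vectorized kernel on the host. The function name is hypothetical.]

#include <math.h>

/* Scalar reference for the sigmoid-based GELU approximation:
 * gelu(x) ~= x * sigmoid(1.702 * x), the same formula the HVX kernel
 * evaluates via hvx_mul_scalar_f32 + hvx_fast_sigmoid_f32 + hvx_mul_f32. */
static inline float gelu_sigmoid_ref(float x) {
    const float k = 1.702f;
    const float s = 1.0f / (1.0f + expf(-k * x));  /* sigmoid(1.702 * x) */
    return x * s;
}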
hvx_mul_f32_opt((const uint8_t *) src0, src0_spad_data, (uint8_t *) dst, ne0); - } else { - hvx_mul_scalar_f32((const uint8_t *) src0, (float) 1.702, (uint8_t *) src0_spad_data, ne0); + } + else { + hvx_mul_scalar_f32( (const uint8_t *) src0, (float)1.702, (uint8_t *) src0_spad_data, ne0); hvx_sigmoid_f32((const uint8_t *) src0_spad_data, (uint8_t *) src0_spad_data, ne0); hvx_mul_f32((const uint8_t *) src0, src0_spad_data, (uint8_t *) dst, ne0); } @@ -337,6 +339,8 @@ static void unary_gelu_fp32(unsigned int n, unsigned int i, void * data) { octx->src0_nrows_per_thread); } + + static void unary_silu_fp32_per_thread(const struct htp_tensor * src0, struct htp_tensor * dst, const int32_t * op_params, diff --git a/ggml/src/ggml-hexagon/htp/htp-msg.h b/ggml/src/ggml-hexagon/htp/htp-msg.h index 0e893c1d96..a61652304a 100644 --- a/ggml/src/ggml-hexagon/htp/htp-msg.h +++ b/ggml/src/ggml-hexagon/htp/htp-msg.h @@ -120,10 +120,10 @@ static const char * htp_type_name(uint32_t t) { #define HTP_MAX_DIMS 4 struct htp_tensor { - uint32_t data; // Buffer offset in the messages, and data pointer on the NSP - uint32_t type; // Data type - uint32_t ne[HTP_MAX_DIMS]; // Number of elements - uint32_t nb[HTP_MAX_DIMS]; // Stride in bytes (see ggml.h ggml_tensor) + uint32_t data; // Buffer offset in the messages, and data pointer on the NSP + uint32_t type; // Data type + uint32_t ne[HTP_MAX_DIMS]; // Number of elements + uint32_t nb[HTP_MAX_DIMS]; // Stride in bytes (see ggml.h ggml_tensor) }; #define HTP_MAX_OP_PARAMS 64 diff --git a/ggml/src/ggml-hexagon/htp/hvx-utils.c b/ggml/src/ggml-hexagon/htp/hvx-utils.c index 2ac4cfb263..e7ee589f34 100644 --- a/ggml/src/ggml-hexagon/htp/hvx-utils.c +++ b/ggml/src/ggml-hexagon/htp/hvx-utils.c @@ -49,25 +49,28 @@ void hvx_mul_f32(const uint8_t * restrict src0, FARF(HIGH, "hvx_mul_f32: unaligned loop in hvx op, possibly slower execution\n"); } + bool handled_leftover = false; if (0 == unaligned_loop) { HVX_Vector * restrict vec_in1 = (HVX_Vector *) src0; HVX_Vector * restrict vec_in2 = (HVX_Vector *) src1; HVX_Vector * restrict vec_out = (HVX_Vector *) dst; -#pragma unroll(4) + #pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { HVX_Vector v = Q6_Vqf32_vmpy_VsfVsf(*vec_in1++, *vec_in2++); *vec_out++ = Q6_Vsf_equals_Vqf32(v); } } else { - int step_of_1 = num_elems_whole >> 5; // divby 32, because 32 float = 128 bytes per HVX vector + int step_of_1 = num_elems_whole >> 5; // divby 32, because 32 float = 128 bytes per HVX vector int leftover_size = left_over * sizeof(float); - HVX_Vector * restrict vec_in1 = (HVX_Vector *) src0; - HVX_Vector * restrict vec_in2 = (HVX_Vector *) src1; + + HVX_Vector * restrict vec_in1 = (HVX_Vector *) src0; + HVX_Vector * restrict vec_in2 = (HVX_Vector *) src1; HVX_UVector * restrict vec_out = (HVX_UVector *) dst; + HVX_Vector slinep; HVX_Vector slinec; HVX_Vector sline; @@ -75,42 +78,48 @@ void hvx_mul_f32(const uint8_t * restrict src0, HVX_Vector sline2c; HVX_Vector sline2; - slinep = *vec_in1++; + slinep = *vec_in1++; sline2p = *vec_in2++; -#pragma unroll(4) - for (uint32_t i = step_of_1 - 1; i > 0; i--) { - slinec = *vec_in1++; + #pragma unroll(4) + for(uint32_t i = step_of_1 -1; i> 0; i--){ + slinec = *vec_in1++; sline2c = *vec_in2++; - sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src0); - sline2 = Q6_V_valign_VVR(sline2c, sline2p, (size_t) src1); - - *((HVX_UVector *) (vec_out++)) = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(sline, sline2)); - slinep = slinec; - sline2p = sline2c; + sline = Q6_V_valign_VVR(slinec, 
slinep, (size_t) src0); + sline2 = Q6_V_valign_VVR(sline2c, sline2p, (size_t) src1); + + *((HVX_UVector *)(vec_out++)) =Q6_Vsf_equals_Vqf32( Q6_Vqf32_vmpy_VsfVsf(sline, sline2)); + slinep = slinec; + sline2p = sline2c; } - if (step_of_1 > 1) { - slinec = htp_is_aligned(vec_in1, VLEN) && left_over == 0 ? slinep : *vec_in1++; + if(step_of_1 > 1){ + slinec = htp_is_aligned(vec_in1, VLEN) && left_over == 0 ? slinep : *vec_in1++; sline2c = htp_is_aligned(vec_in2, VLEN) && left_over == 0 ? sline2p : *vec_in2++; - sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src0); - sline2 = Q6_V_valign_VVR(sline2c, sline2p, (size_t) src1); - *((HVX_UVector *) (vec_out++)) = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(sline, sline2)); - slinep = slinec; - sline2p = sline2c; + sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src0); + sline2 = Q6_V_valign_VVR(sline2c, sline2p, (size_t) src1); + *((HVX_UVector *)(vec_out++)) =Q6_Vsf_equals_Vqf32( Q6_Vqf32_vmpy_VsfVsf(sline, sline2)); + slinep = slinec; + sline2p = sline2c; } - if (left_over > 0) { - slinec = (is_in_one_chunk(vec_in1, leftover_size, VLEN) ? slinep : *vec_in1++); + if(left_over > 0 ){ - sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src0); - sline2c = (is_in_one_chunk(vec_in2, leftover_size, VLEN) ? sline2p : *vec_in2++); - sline2 = Q6_V_valign_VVR(sline2c, sline2p, (size_t) src1); + slinec = (is_in_one_chunk(vec_in1, leftover_size, VLEN) + ? slinep + : *vec_in1++); + + sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src0); + sline2c = (is_in_one_chunk(vec_in2, leftover_size, VLEN) + ? sline2p + : *vec_in2++); + sline2 = Q6_V_valign_VVR(sline2c, sline2p, (size_t) src1); HVX_Vector out = Q6_Vqf32_vmpy_VsfVsf(sline, sline2); - hvx_vec_store_u(vec_out, leftover_size, Q6_Vsf_equals_Vqf32(out)); + hvx_vec_store_u(vec_out, leftover_size, Q6_Vsf_equals_Vqf32(out)); handled_leftover = true; } } + if (left_over > 0 && !handled_leftover) { const float * src0f = (const float *) src0 + num_elems_whole; const float * src1f = (const float *) src1 + num_elems_whole; @@ -306,13 +315,13 @@ void hvx_add_f32(const uint8_t * restrict src0, HVX_Vector * restrict vec_in2 = (HVX_Vector *) src1; HVX_Vector * restrict vec_out = (HVX_Vector *) dst; -#pragma unroll(4) + #pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { HVX_Vector v = Q6_Vqf32_vadd_VsfVsf(*vec_in1++, *vec_in2++); *vec_out++ = Q6_Vsf_equals_Vqf32(v); } } else { -#pragma unroll(4) + #pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { HVX_Vector in1 = *(HVX_UVector *) (src0 + i * SIZEOF_FP32); HVX_Vector in2 = *(HVX_UVector *) (src1 + i * SIZEOF_FP32); @@ -449,7 +458,7 @@ void hvx_add_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * HVX_Vector * restrict vec_in1 = (HVX_Vector *) src; HVX_Vector * restrict vec_out = (HVX_Vector *) dst; -#pragma unroll(4) + #pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { HVX_Vector in = *vec_in1++; const HVX_VectorPred pred_inf = Q6_Q_vcmp_eq_VwVw(inf, in); @@ -459,7 +468,7 @@ void hvx_add_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * *vec_out++ = v; } } else { -#pragma unroll(4) + #pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32); @@ -503,54 +512,60 @@ void hvx_mul_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * FARF(HIGH, "hvx_mul_scalar_f32: unaligned loop in hvx op, possibly slower execution\n"); } - HVX_Vector val_vec = hvx_vec_splat_fp32(val); - bool handled_leftover 
= false; + HVX_Vector val_vec = hvx_vec_splat_fp32(val); + bool handled_leftover = false; if (0 == unaligned_loop) { HVX_Vector * restrict vec_in1 = (HVX_Vector *) src; HVX_Vector * restrict vec_out = (HVX_Vector *) dst; -#pragma unroll(4) + #pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { HVX_Vector v = Q6_Vqf32_vmpy_VsfVsf(*vec_in1++, val_vec); *vec_out++ = Q6_Vsf_equals_Vqf32(v); } } else { - int step_of_1 = num_elems >> 5; // divby 32, because 32 float = 128 bytes per HVX vector + int step_of_1 = num_elems >> 5; // divby 32, because 32 float = 128 bytes per HVX vector int leftover_size = left_over * sizeof(float); - HVX_Vector * input_v_ptr = (HVX_Vector *) src; - HVX_UVector * output_v_ptr = (HVX_UVector *) dst; + + + HVX_Vector * input_v_ptr = (HVX_Vector *) src; + HVX_UVector * output_v_ptr = (HVX_UVector *) dst; + HVX_Vector slinep; HVX_Vector slinec; HVX_Vector sline; + + slinep = *input_v_ptr++; - slinep = *input_v_ptr++; - -#pragma unroll(4) - for (uint32_t i = step_of_1 - 1; i > 0; i--) { - slinec = *input_v_ptr++; - sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src); - *((HVX_UVector *) (output_v_ptr++)) = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(sline, val_vec)); + #pragma unroll(4) + for(uint32_t i = step_of_1 - 1; i > 0; i--){ + slinec = *input_v_ptr++; + sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src); + *((HVX_UVector *)(output_v_ptr++)) = Q6_Vsf_equals_Vqf32( Q6_Vqf32_vmpy_VsfVsf(sline, val_vec)); /* Prepare slinep for next iteration */ - slinep = slinec; + slinep = slinec; } - if (step_of_1 > 0) { + if(step_of_1 > 0){ + slinec = htp_is_aligned(input_v_ptr, VLEN) && left_over == 0 ? slinep : *input_v_ptr++; - sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src); - *((HVX_UVector *) (output_v_ptr++)) = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(sline, val_vec)); + sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src); + *((HVX_UVector *)(output_v_ptr++)) = Q6_Vsf_equals_Vqf32( Q6_Vqf32_vmpy_VsfVsf(sline, val_vec)); slinep = slinec; } - if (leftover_size > 0) { - slinec = (is_in_one_chunk(input_v_ptr, leftover_size, VLEN) ? slinep : *input_v_ptr++); + if(leftover_size > 0){ + slinec = (is_in_one_chunk(input_v_ptr, leftover_size, VLEN) + ? 
slinep + : *input_v_ptr++); sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src); - HVX_Vector sout = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(sline, val_vec)); - hvx_vec_store_u(output_v_ptr, leftover_size, sout); + HVX_Vector sout = Q6_Vsf_equals_Vqf32( Q6_Vqf32_vmpy_VsfVsf(sline, val_vec)); + hvx_vec_store_u(output_v_ptr, leftover_size, sout); handled_leftover = true; } } @@ -591,13 +606,13 @@ void hvx_sub_f32(const uint8_t * restrict src0, HVX_Vector * restrict vec_in2 = (HVX_Vector *) src1; HVX_Vector * restrict vec_out = (HVX_Vector *) dst; -#pragma unroll(4) + #pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { HVX_Vector v = Q6_Vqf32_vsub_VsfVsf(*vec_in1++, *vec_in2++); *vec_out++ = Q6_Vsf_equals_Vqf32(v); } } else { -#pragma unroll(4) + #pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { HVX_Vector in1 = *(HVX_UVector *) (src0 + i * SIZEOF_FP32); HVX_Vector in2 = *(HVX_UVector *) (src1 + i * SIZEOF_FP32); @@ -732,13 +747,13 @@ void hvx_sub_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * HVX_Vector * restrict vec_in1 = (HVX_Vector *) src; HVX_Vector * restrict vec_out = (HVX_Vector *) dst; -#pragma unroll(4) + #pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { HVX_Vector v = Q6_Vqf32_vsub_VsfVsf(*vec_in1++, val_vec); *vec_out++ = Q6_Vsf_equals_Vqf32(v); } } else { -#pragma unroll(4) + #pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32); @@ -774,7 +789,7 @@ float hvx_sum_of_squares_f32(const uint8_t * restrict src, const int num_elems) HVX_Vector sum_vec_acc = Q6_V_vsplat_R(0x00000000); HVX_Vector zero_vec = Q6_V_vsplat_R(0x00000000); -#pragma unroll(4) + #pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { HVX_Vector v = Q6_Vqf32_vmpy_VsfVsf(*vec_in1, *vec_in1); sum_vec_acc = Q6_Vqf32_vadd_Vqf32Vqf32(sum_vec_acc, v); @@ -818,13 +833,13 @@ float hvx_self_sum_f32(const uint8_t * restrict src, const int num_elems) { if (0 == unaligned_loop) { HVX_Vector * vec_in = (HVX_Vector *) src; -#pragma unroll(4) + #pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { // sum_vec = Q6_Vqf32_vadd_Vqf32Vsf(sum_vec, *vec_in++); sum_vec = Q6_Vqf32_vadd_VsfVsf(Q6_Vsf_equals_Vqf32(sum_vec), *vec_in++); } } else { -#pragma unroll(4) + #pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32); @@ -867,13 +882,13 @@ void hvx_scale_f32(const uint8_t * restrict src, uint8_t * restrict dst, const i HVX_Vector * vec_in1 = (HVX_Vector *) src; HVX_Vector * vec_out = (HVX_Vector *) dst; -#pragma unroll(4) + #pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { HVX_Vector v = Q6_Vqf32_vmpy_VsfVsf(*vec_in1++, scale_vec); *vec_out++ = Q6_Vsf_equals_Vqf32(v); } } else { -#pragma unroll(4) + #pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32); @@ -916,12 +931,12 @@ float hvx_self_max_f32(const uint8_t * restrict src, const int num_elems) { if (0 == unaligned_loop) { HVX_Vector * restrict vec_in = (HVX_Vector *) src; -#pragma unroll(4) + #pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { vec_max = Q6_Vsf_vmax_VsfVsf(vec_max, *vec_in++); } } else { -#pragma unroll(4) + #pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32); @@ -959,7 +974,7 @@ void 
hvx_min_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * HVX_Vector * restrict vec_in = (HVX_Vector *) src; HVX_Vector * restrict vec_out = (HVX_Vector *) dst; -#pragma unroll(4) + #pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { vec_min = Q6_Vsf_vmin_VsfVsf(vec_min, *vec_in++); *vec_out++ = Q6_Vsf_equals_Vqf32(vec_min); @@ -997,7 +1012,7 @@ void hvx_clamp_scalar_f32(const uint8_t * restrict src, HVX_Vector range_left = hvx_vec_splat_fp32(limit_left); HVX_Vector range_right = hvx_vec_splat_fp32(limit_right); -#pragma unroll(4) + #pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { HVX_Vector in_vec = *vec_in++; HVX_Vector temp_v = in_vec; diff --git a/ggml/src/ggml-hexagon/htp/hvx-utils.h b/ggml/src/ggml-hexagon/htp/hvx-utils.h index c5da167d49..0b24786391 100644 --- a/ggml/src/ggml-hexagon/htp/hvx-utils.h +++ b/ggml/src/ggml-hexagon/htp/hvx-utils.h @@ -23,18 +23,20 @@ typedef union { /* Q6_Vsf_equals_Vw is only available on v73+.*/ #if __HVX_ARCH__ < 73 -static inline HVX_Vector int32_to_qfloat(const HVX_Vector in) { - const HVX_Vector vzero = Q6_V_vzero(); - HVX_VectorPred is_zero = Q6_Q_vcmp_eq_VwVw(in, vzero); - HVX_Vector lshift = Q6_Vw_vnormamt_Vw(in); - HVX_Vector normalized = Q6_Vw_vasl_VwVw(in, lshift); - HVX_Vector vexp = Q6_Vw_vsub_VwVw(Q6_V_vsplat_R(0x7f + 30), lshift); - HVX_Vector mant = Q6_V_vand_VV(Q6_V_vsplat_R(0xFFFFFF00), normalized); - HVX_Vector ret = Q6_V_vmux_QVV(is_zero, vzero, Q6_Vw_vadd_VwVw(mant, vexp)); +static inline HVX_Vector int32_to_qfloat(HVX_Vector const in) +{ + HVX_Vector const vzero = Q6_V_vzero(); + HVX_VectorPred is_zero = Q6_Q_vcmp_eq_VwVw(in, vzero); + HVX_Vector lshift = Q6_Vw_vnormamt_Vw(in); + HVX_Vector normalized = Q6_Vw_vasl_VwVw(in, lshift); + HVX_Vector vexp = Q6_Vw_vsub_VwVw(Q6_V_vsplat_R(0x7f + 30), lshift); + HVX_Vector mant = Q6_V_vand_VV(Q6_V_vsplat_R(0xFFFFFF00), normalized); + HVX_Vector ret = Q6_V_vmux_QVV(is_zero, vzero, Q6_Vw_vadd_VwVw(mant, vexp)); return ret; } -static inline HVX_Vector Q6_Vsf_equals_Vw(const HVX_Vector in) { +static inline HVX_Vector Q6_Vsf_equals_Vw(HVX_Vector const in) +{ return Q6_Vsf_equals_Vqf32(int32_to_qfloat(in)); } #endif @@ -107,7 +109,7 @@ static inline void hvx_copy_fp16_aa(uint8_t * restrict dst, const uint8_t * rest uint32_t i = 0; -#pragma unroll(4) + #pragma unroll(4) for (; i < nvec; i++) { HVX_Vector v = vsrc[i]; vdst[i] = v; @@ -131,7 +133,7 @@ static inline void hvx_copy_fp16_ua(uint8_t * restrict dst, const uint8_t * rest uint32_t i = 0; -#pragma unroll(4) + #pragma unroll(4) for (; i < nvec; i++) { HVX_Vector v = vsrc[i]; vdst[i] = v; @@ -155,7 +157,7 @@ static inline void hvx_copy_fp16_au(uint8_t * restrict dst, const uint8_t * rest uint32_t i = 0; -#pragma unroll(4) + #pragma unroll(4) for (; i < nvec; i++) { HVX_Vector v = vsrc[i]; vdst[i] = v; @@ -180,7 +182,7 @@ static inline void hvx_copy_fp32_aa(uint8_t * restrict dst, const uint8_t * rest uint32_t i = 0; -#pragma unroll(4) + #pragma unroll(4) for (; i < nvec; i++) { HVX_Vector v = vsrc[i]; vdst[i] = v; @@ -204,7 +206,7 @@ static inline void hvx_copy_fp32_ua(uint8_t * restrict dst, const uint8_t * rest uint32_t i = 0; -#pragma unroll(4) + #pragma unroll(4) for (; i < nvec; i++) { HVX_Vector v = vsrc[i]; vdst[i] = v; @@ -228,7 +230,7 @@ static inline void hvx_copy_fp32_au(uint8_t * restrict dst, const uint8_t * rest uint32_t i = 0; -#pragma unroll(4) + #pragma unroll(4) for (; i < nvec; i++) { HVX_Vector v = vsrc[i]; vdst[i] = v; @@ -253,7 +255,7 @@ static inline void 
hvx_bcast_fp32_a(uint8_t * restrict dst, float elem, uint32_t uint32_t i = 0; -#pragma unroll(4) + #pragma unroll(4) for (; i < nvec; i++) { vdst[i] = velem; } @@ -263,6 +265,7 @@ static inline void hvx_bcast_fp32_a(uint8_t * restrict dst, float elem, uint32_t } } + /* Return whether 'n' elements from vector are in the one chunk of 'chunk_size'. */ static __attribute__((always_inline)) int32_t is_in_one_chunk(void * addr, uint32_t n, uint32_t chunk_size) { uint32_t left_off = (size_t) addr & (chunk_size - 1); @@ -270,6 +273,8 @@ static __attribute__((always_inline)) int32_t is_in_one_chunk(void * addr, uint3 return right_off <= chunk_size; } + + static void hvx_vec_dump_fp16_n(char * pref, HVX_Vector v, uint32_t n) { HVX_VectorAlias u = { .v = v }; @@ -987,15 +992,16 @@ static inline void hvx_fast_sigmoid_f32(const uint8_t * restrict src, uint8_t * const HVX_Vector max_exp = hvx_vec_splat_fp32(kMaxExp); const HVX_Vector min_exp = hvx_vec_splat_fp32(kMinExp); -#pragma unroll(4) + #pragma unroll(4) for (int i = 0; i < step_of_1; i++) { v_dst[i] = hvx_vec_fast_sigmoid_fp32_guard(v_src[i], one, max_exp, min_exp); } } -static inline void hvx_sigmoid_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems) { + +static inline void hvx_sigmoid_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems){ int step_of_1 = num_elems >> 5; // divby 32, because 32 float = 128 bytes per HVX vector - int leftover = num_elems - (step_of_1 * VLEN_FP32); + int leftover = num_elems - (step_of_1 * VLEN_FP32); int32_t leftover_size = leftover * sizeof(float); @@ -1006,44 +1012,51 @@ static inline void hvx_sigmoid_f32(const uint8_t * restrict src, uint8_t * restr const HVX_Vector max_exp = hvx_vec_splat_fp32(kMaxExp); const HVX_Vector min_exp = hvx_vec_splat_fp32(kMinExp); - const float * input = (float *) src; - float * output = (float *) dst; + const float *input = (float *)src; + float *output = (float *)dst; + + HVX_Vector * input_v_ptr = (HVX_Vector *) input; + HVX_UVector * output_v_ptr = (HVX_UVector *) output; - HVX_Vector * input_v_ptr = (HVX_Vector *) input; - HVX_UVector * output_v_ptr = (HVX_UVector *) output; HVX_Vector slinep; HVX_Vector slinec; HVX_Vector sline; + - slinep = *input_v_ptr++; -#pragma unroll(4) - for (uint32_t i = step_of_1 - 1; i > 0; i--) { - slinec = *input_v_ptr++; - sline = Q6_V_valign_VVR(slinec, slinep, (size_t) input); - *((HVX_UVector *) (output_v_ptr++)) = hvx_vec_fast_sigmoid_fp32_guard(sline, one, max_exp, min_exp); + slinep = *input_v_ptr++; + #pragma unroll(4) + for(uint32_t i = step_of_1 -1; i> 0; i--){ + slinec = *input_v_ptr++; + sline = Q6_V_valign_VVR(slinec, slinep, (size_t) input); + *((HVX_UVector *)(output_v_ptr++)) = hvx_vec_fast_sigmoid_fp32_guard(sline, one, max_exp, min_exp); /* Prepare slinep for next iteration */ - slinep = slinec; + slinep = slinec; } - if (step_of_1 > 0) { + if(step_of_1> 0){ + slinec = htp_is_aligned(input_v_ptr, 128) && leftover == 0 ? slinep : *input_v_ptr++; - sline = Q6_V_valign_VVR(slinec, slinep, (size_t) input); - *((HVX_UVector *) (output_v_ptr++)) = hvx_vec_fast_sigmoid_fp32_guard(sline, one, max_exp, min_exp); - ; + sline = Q6_V_valign_VVR(slinec, slinep, (size_t) input); + *((HVX_UVector *)(output_v_ptr++)) = hvx_vec_fast_sigmoid_fp32_guard(sline, one, max_exp, min_exp);; slinep = slinec; } - if (leftover > 0) { - slinec = (is_in_one_chunk(input_v_ptr, leftover_size, 128) ? slinep : *input_v_ptr++); + if(leftover> 0){ + slinec = (is_in_one_chunk(input_v_ptr, leftover_size, 128) + ? 
slinep + : *input_v_ptr++); sline = Q6_V_valign_VVR(slinec, slinep, (size_t) input); HVX_Vector sout = hvx_vec_fast_sigmoid_fp32_guard(sline, one, max_exp, min_exp); - hvx_vec_store_u(output_v_ptr, leftover_size, sout); + hvx_vec_store_u(output_v_ptr, leftover_size, sout); } + + } + float hvx_sum_of_squares_f32(const uint8_t * restrict src, const int num_elems); void hvx_mul_f32(const uint8_t * restrict src0, const uint8_t * restrict src1, diff --git a/ggml/src/ggml-hexagon/htp/main.c b/ggml/src/ggml-hexagon/htp/main.c index cbfdd0472f..e30ae69502 100644 --- a/ggml/src/ggml-hexagon/htp/main.c +++ b/ggml/src/ggml-hexagon/htp/main.c @@ -151,7 +151,7 @@ static int vtcm_acquire(struct htp_context * ctx) { qurt_thread_set_priority(qurt_thread_get_id(), ctx->thread_prio - 10); err = HAP_compute_res_acquire_cached(ctx->vtcm_rctx, 1000000); if (err != 0) { - FARF(ERROR, "Failed to acquire VTCM: 0x%08x", (unsigned) err); + FARF(ERROR, "Failed to acquire VTCM: 0x%08x", (unsigned)err); abort(); } HAP_compute_res_release_cached(ctx->vtcm_rctx); @@ -159,7 +159,7 @@ static int vtcm_acquire(struct htp_context * ctx) { err = HAP_compute_res_acquire_cached(ctx->vtcm_rctx, 1000000); if (err != 0) { - FARF(ERROR, "Failed to acquire VTCM: 0x%08x", (unsigned) err); + FARF(ERROR, "Failed to acquire VTCM: 0x%08x", (unsigned)err); abort(); } ctx->vtcm_valid = true; @@ -411,7 +411,7 @@ static void proc_matmul_req(struct htp_context * ctx, rsp_bufs[0].ptr = bufs[2].ptr; rsp_bufs[0].size = bufs[2].size; rsp_bufs[0].offset = bufs[2].offset; - rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP + rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU // Setup Op context @@ -453,7 +453,7 @@ static void proc_matmul_id_req(struct htp_context * ctx, rsp_bufs[0].ptr = bufs[3].ptr; rsp_bufs[0].size = bufs[3].size; rsp_bufs[0].offset = bufs[3].offset; - rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP + rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU // Setup Op context @@ -494,7 +494,7 @@ static void proc_binary_req(struct htp_context * ctx, struct htp_general_req * r rsp_bufs[0].ptr = bufs[2].ptr; rsp_bufs[0].offset = bufs[2].offset; rsp_bufs[0].size = bufs[2].size; - rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP + rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU // Setup Op context @@ -533,7 +533,7 @@ static void proc_add_id_req(struct htp_context * ctx, struct htp_general_req * r rsp_bufs[0].ptr = bufs[3].ptr; rsp_bufs[0].offset = bufs[3].offset; rsp_bufs[0].size = bufs[3].size; - rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP + rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU // Setup Op context @@ -574,7 +574,7 @@ static void proc_unary_req(struct htp_context * ctx, struct htp_general_req * re rsp_bufs[0].ptr = bufs[1].ptr; rsp_bufs[0].offset = bufs[1].offset; rsp_bufs[0].size = bufs[1].size; - rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP + rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU // Setup Op context @@ -618,8 +618,8 @@ static void proc_activations_req(struct htp_context * ctx, rsp_bufs[0].ptr = 
 bufs[write_idx].ptr;
     rsp_bufs[0].offset = bufs[write_idx].offset;
     rsp_bufs[0].size   = bufs[write_idx].size;
-    rsp_bufs[0].flags  = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |          // Flush HTP
-                          DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT);  // Invalidate CPU
+    rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |         // Flush HTP
+                         DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT);  // Invalidate CPU
 
     // Setup Op context
     struct htp_ops_context octx = { 0 };
@@ -674,8 +674,8 @@ static void proc_rope_req(struct htp_context * ctx,
     rsp_bufs[0].ptr    = bufs[write_idx].ptr;
     rsp_bufs[0].offset = bufs[write_idx].offset;
     rsp_bufs[0].size   = bufs[write_idx].size;
-    rsp_bufs[0].flags  = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |          // Flush HTP
-                          DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT);  // Invalidate CPU
+    rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |         // Flush HTP
+                         DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT);  // Invalidate CPU
 
     // Setup Op context
     struct htp_ops_context octx = { 0 };

From 52f43fb962f86975bfff2873f4ac96f671c9a51a Mon Sep 17 00:00:00 2001
From: shouyud
Date: Tue, 16 Dec 2025 16:58:52 -0500
Subject: [PATCH 14/14] fix: fix loop overflow

---
 ggml/src/ggml-hexagon/htp/hvx-utils.c | 4 ++--
 ggml/src/ggml-hexagon/htp/hvx-utils.h | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/ggml/src/ggml-hexagon/htp/hvx-utils.c b/ggml/src/ggml-hexagon/htp/hvx-utils.c
index e7ee589f34..8e5c2c6983 100644
--- a/ggml/src/ggml-hexagon/htp/hvx-utils.c
+++ b/ggml/src/ggml-hexagon/htp/hvx-utils.c
@@ -81,7 +81,7 @@ void hvx_mul_f32(const uint8_t * restrict src0,
     slinep  = *vec_in1++;
     sline2p = *vec_in2++;
     #pragma unroll(4)
-    for(uint32_t i = step_of_1 -1; i> 0; i--){
+    for(int i = step_of_1 -1; i> 0; i--){
         slinec  = *vec_in1++;
         sline2c = *vec_in2++;
         sline   = Q6_V_valign_VVR(slinec, slinep, (size_t) src0);
@@ -540,7 +540,7 @@ void hvx_mul_scalar_f32(const uint8_t * restrict src, const float val, uint8_t *
 
     slinep = *input_v_ptr++;
     #pragma unroll(4)
-    for(uint32_t i = step_of_1 - 1; i > 0; i--){
+    for(int i = step_of_1 - 1; i > 0; i--){
         slinec = *input_v_ptr++;
         sline  = Q6_V_valign_VVR(slinec, slinep, (size_t) src);
         *((HVX_UVector *)(output_v_ptr++)) = Q6_Vsf_equals_Vqf32( Q6_Vqf32_vmpy_VsfVsf(sline, val_vec));
diff --git a/ggml/src/ggml-hexagon/htp/hvx-utils.h b/ggml/src/ggml-hexagon/htp/hvx-utils.h
index 0b24786391..6b5b65a29e 100644
--- a/ggml/src/ggml-hexagon/htp/hvx-utils.h
+++ b/ggml/src/ggml-hexagon/htp/hvx-utils.h
@@ -1026,7 +1026,7 @@ static inline void hvx_sigmoid_f32(const uint8_t * restrict src, uint8_t * restr
 
     slinep = *input_v_ptr++;
     #pragma unroll(4)
-    for(uint32_t i = step_of_1 -1; i> 0; i--){
+    for(int i = step_of_1 -1; i> 0; i--){
         slinec = *input_v_ptr++;
         sline  = Q6_V_valign_VVR(slinec, slinep, (size_t) input);
         *((HVX_UVector *)(output_v_ptr++)) = hvx_vec_fast_sigmoid_fp32_guard(sline, one, max_exp, min_exp);
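[Editor's note: illustrative demo, not part of the patch. PATCH 14/14 switches the HVX loop counters from uint32_t to int because step_of_1 - 1 wraps to UINT32_MAX when step_of_1 is 0 (fewer than 32 floats to process), so a loop that should not execute at all instead runs for billions of iterations and walks off the buffers. The standalone program below reproduces the wrap-around; it is a sketch under that assumption, not project code.]

#include <stdint.h>
#include <stdio.h>

int main(void) {
    const int step_of_1 = 0;  /* e.g. fewer than 32 floats in the row */

    uint32_t bad_iters = 0;
    for (uint32_t i = step_of_1 - 1; i > 0; i--) {  /* wraps: i starts at 0xFFFFFFFF */
        bad_iters++;
        if (bad_iters > 10) break;  /* cut the demo short; the real kernel would overrun its buffers */
    }

    uint32_t good_iters = 0;
    for (int i = step_of_1 - 1; i > 0; i--) {  /* starts at -1, condition is false, never runs */
        good_iters++;
    }

    printf("unsigned counter: %u iterations (runaway), int counter: %u iterations\n",
           bad_iters, good_iters);
    return 0;
}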