From 925a83ac7067e69748a9dd9d97e341404cae767e Mon Sep 17 00:00:00 2001
From: shouyud
Date: Tue, 16 Dec 2025 14:12:43 -0500
Subject: [PATCH 1/3] snapshot: debug ggml-hexagon swiglu-oai

---
 ggml/src/ggml-hexagon/ggml-hexagon.cpp |  2 +-
 ggml/src/ggml-hexagon/htp/act-ops.c    | 26 ++++++++++++++++++---
 ggml/src/ggml-hexagon/htp/hvx-utils.c  | 32 +++++++++++++++++++++-----
 ggml/src/ggml-hexagon/htp/main.c       |  1 +
 4 files changed, 51 insertions(+), 10 deletions(-)

diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp
index 514f086f68..fed4fcf111 100644
--- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp
+++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp
@@ -3297,7 +3297,7 @@ static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, cons
             break;

         case GGML_OP_GLU:
-            if ((ggml_get_glu_op(op) == GGML_GLU_OP_SWIGLU) /* || (ggml_get_glu_op(op) == GGML_GLU_OP_SWIGLU_OAI) */) {
+            if ((ggml_get_glu_op(op) == GGML_GLU_OP_SWIGLU) || (ggml_get_glu_op(op) == GGML_GLU_OP_SWIGLU_OAI)) {
                 supp = ggml_hexagon_supported_activations(sess, op);
             }
             break;
diff --git a/ggml/src/ggml-hexagon/htp/act-ops.c b/ggml/src/ggml-hexagon/htp/act-ops.c
index 87b09cca3a..0d7997ded1 100644
--- a/ggml/src/ggml-hexagon/htp/act-ops.c
+++ b/ggml/src/ggml-hexagon/htp/act-ops.c
@@ -229,9 +229,29 @@ static void glu_swiglu_oai_fp32_per_thread(const struct htp_tensor * src0,
         }

         // x (src0_spad_data) = std::min(src0_p[k], limit);
-        hvx_min_scalar_f32((const uint8_t *) src0, limit, src0_spad_data, nc);
-        // y1 (src1_spad_data) = std::clamp(src1_p[k], -limit, limit);
-        hvx_clamp_scalar_f32((const uint8_t *) src1, limit, limit, src1_spad_data, nc);
+        //hvx_min_scalar_f32((const uint8_t *) src0, limit, src0_spad_data, nc);
+        // // y1 (src1_spad_data) = std::clamp(src1_p[k], -limit, limit);
+        // hvx_clamp_scalar_f32((const uint8_t *) src1, limit, limit, src1_spad_data, nc);
+
+        // do manual limit
+        for (int i = 0; i < nc; i++) {
+            if (src0[i] > limit) {
+                ((float *) src0_spad_data)[i] = limit;
+            } else {
+                ((float *) src0_spad_data)[i] = src0[i];
+            }
+        }
+
+        for (int i = 0; i < nc; i++) {
+            if (src1[i] > limit) {
+                ((float *) src1_spad_data)[i] = limit;
+            } else if (src1[i] < -limit) {
+                ((float *) src1_spad_data)[i] = -limit;
+            } else {
+                ((float *) src1_spad_data)[i] = src1[i];
+            }
+        }
+
         // y (src1_spad_data) = y1 + 1.f
         hvx_add_scalar_f32(src1_spad_data, 1.0, src1_spad_data, nc);
         // x1 (dst_spad_data) = alpha * (x)
diff --git a/ggml/src/ggml-hexagon/htp/hvx-utils.c b/ggml/src/ggml-hexagon/htp/hvx-utils.c
index e02b1d9099..ddf4206f6a 100644
--- a/ggml/src/ggml-hexagon/htp/hvx-utils.c
+++ b/ggml/src/ggml-hexagon/htp/hvx-utils.c
@@ -886,14 +886,34 @@ void hvx_min_scalar_f32(const uint8_t * restrict src, const float val, uint8_t *

     HVX_Vector vec_min = Q6_V_vsplat_R(val);

-    HVX_Vector * restrict vec_in  = (HVX_Vector *) src;
-    HVX_Vector * restrict vec_out = (HVX_Vector *) dst;
+    int unaligned_input_addr  = 0;
+    int unaligned_output_addr = 0;

-    #pragma unroll(4)
-    for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
-        vec_min    = Q6_Vsf_vmin_VsfVsf(vec_min, *vec_in++);
-        *vec_out++ = Q6_Vsf_equals_Vqf32(vec_min);
+    if (htp_is_aligned((void *) src, VLEN) == 0) {
+        unaligned_input_addr = 1;
     }
+    if (htp_is_aligned((void *) dst, VLEN) == 0) {
+        unaligned_output_addr = 1;
+    }
+
+    if (unaligned_input_addr == 0 && unaligned_output_addr == 0) {
+        HVX_Vector * restrict vec_in  = (HVX_Vector *) src;
+        HVX_Vector * restrict vec_out = (HVX_Vector *) dst;
+
+        #pragma unroll(4)
+        for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
+            vec_min    = Q6_Vsf_vmin_VsfVsf(vec_min, *vec_in++);
+            *vec_out++ = Q6_Vsf_equals_Vqf32(vec_min);
+        }
+    }
+    else if (unaligned_output_addr == 0) {
+
+    } else if ()
+

     if (left_over > 0) {
         const float * srcf = (const float *) src + num_elems_whole;
diff --git a/ggml/src/ggml-hexagon/htp/main.c b/ggml/src/ggml-hexagon/htp/main.c
index b60b352a7b..549f2a07bb 100644
--- a/ggml/src/ggml-hexagon/htp/main.c
+++ b/ggml/src/ggml-hexagon/htp/main.c
@@ -806,6 +806,7 @@ static void htp_packet_callback(dspqueue_t queue, int error, void * context) {
             break;

         case HTP_OP_GLU_SWIGLU:
+        case HTP_OP_GLU_SWIGLU_OAI:
         case HTP_OP_SOFTMAX:
             if ((n_bufs != 2) && (n_bufs != 3)) {
                 FARF(ERROR, "Bad act-req buffer list");
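Note: for reference while reading the patches below, this is the activation the HTP kernels are meant to compute, written out in scalar C. It is a sketch pieced together from the step comments in act-ops.c (min, clamp, +1, alpha scale, sigmoid, multiply); the helper name and signature are illustrative, not part of ggml:

    #include <math.h>

    // Scalar sketch of SWIGLU_OAI per the act-ops.c comments:
    //   x = min(x, limit); y = clamp(g, -limit, limit) + 1;
    //   out = x * sigmoid(alpha * x) * y
    static void swiglu_oai_ref(const float * x, const float * g, float * out,
                               int n, float alpha, float limit) {
        for (int k = 0; k < n; k++) {
            float xk  = x[k] < limit ? x[k] : limit;
            float gk  = g[k] > limit ? limit : (g[k] < -limit ? -limit : g[k]);
            float sig = 1.0f / (1.0f + expf(-alpha * xk));
            out[k]    = (xk * sig) * (gk + 1.0f);
        }
    }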
From 946f1a2037e7df3524de51a394dd4245f4eb7ac2 Mon Sep 17 00:00:00 2001
From: shouyud
Date: Tue, 16 Dec 2025 15:20:44 -0500
Subject: [PATCH 2/3] fix: correct hvx_min_scalar_f32 and handle unaligned buffers

---
 ggml/src/ggml-hexagon/htp/act-ops.c   | 12 +--------
 ggml/src/ggml-hexagon/htp/hvx-utils.c | 44 +++++++++++++--------------
 2 files changed, 22 insertions(+), 34 deletions(-)

diff --git a/ggml/src/ggml-hexagon/htp/act-ops.c b/ggml/src/ggml-hexagon/htp/act-ops.c
index 0d7997ded1..8273e05835 100644
--- a/ggml/src/ggml-hexagon/htp/act-ops.c
+++ b/ggml/src/ggml-hexagon/htp/act-ops.c
@@ -229,19 +229,9 @@ static void glu_swiglu_oai_fp32_per_thread(const struct htp_tensor * src0,
         }

         // x (src0_spad_data) = std::min(src0_p[k], limit);
-        //hvx_min_scalar_f32((const uint8_t *) src0, limit, src0_spad_data, nc);
+        hvx_min_scalar_f32((const uint8_t *) src0, limit, src0_spad_data, nc);
         // // y1 (src1_spad_data) = std::clamp(src1_p[k], -limit, limit);
         // hvx_clamp_scalar_f32((const uint8_t *) src1, limit, limit, src1_spad_data, nc);
-
-        // do manual limit
-        for (int i = 0; i < nc; i++) {
-            if (src0[i] > limit) {
-                ((float *) src0_spad_data)[i] = limit;
-            } else {
-                ((float *) src0_spad_data)[i] = src0[i];
-            }
-        }
-
         for (int i = 0; i < nc; i++) {
             if (src1[i] > limit) {
                 ((float *) src1_spad_data)[i] = limit;
diff --git a/ggml/src/ggml-hexagon/htp/hvx-utils.c b/ggml/src/ggml-hexagon/htp/hvx-utils.c
index ddf4206f6a..1eea76cba7 100644
--- a/ggml/src/ggml-hexagon/htp/hvx-utils.c
+++ b/ggml/src/ggml-hexagon/htp/hvx-utils.c
@@ -884,46 +884,44 @@ void hvx_min_scalar_f32(const uint8_t * restrict src, const float val, uint8_t *

     const float * src_f = (const float *) src;

-    HVX_Vector vec_min = Q6_V_vsplat_R(val);
+    HVX_Vector vec_min = hvx_vec_splat_fp32(val);

-    int unaligned_input_addr  = 0;
-    int unaligned_output_addr = 0;
+    int unalign_address = 0;

-    if (htp_is_aligned((void *) src, VLEN) == 0) {
-        unaligned_input_addr = 1;
-    }
-    if (htp_is_aligned((void *) dst, VLEN) == 0) {
-        unaligned_output_addr = 1;
+    if (htp_is_aligned((void *) src, VLEN) == 0 || htp_is_aligned((void *) dst, VLEN) == 0) {
+        unalign_address = 1;
     }

-    if (unaligned_input_addr == 0 && unaligned_output_addr == 0) {
+    if (unalign_address == 0) {
         HVX_Vector * restrict vec_in  = (HVX_Vector *) src;
         HVX_Vector * restrict vec_out = (HVX_Vector *) dst;

         #pragma unroll(4)
         for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
-            vec_min    = Q6_Vsf_vmin_VsfVsf(vec_min, *vec_in++);
-            *vec_out++ = Q6_Vsf_equals_Vqf32(vec_min);
+            HVX_Vector min_clamp = Q6_Vsf_vmin_VsfVsf(vec_min, *vec_in++);
+            *vec_out++ = min_clamp;
+        }
+    } else {
+        HVX_UVector * restrict vec_in  = (HVX_UVector *) src;
+        HVX_UVector * restrict vec_out = (HVX_UVector *) dst;
+
+        #pragma unroll(4)
+        for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
+            HVX_Vector min_clamp = Q6_Vsf_vmin_VsfVsf(vec_min, *vec_in++);
+            *vec_out++ = min_clamp;
         }
     }
-    else if (unaligned_output_addr == 0) {
-
-    } else if ()
-

     if (left_over > 0) {
         const float * srcf = (const float *) src + num_elems_whole;
         float *       dstf = (float *) dst + num_elems_whole;

         HVX_Vector in = *(HVX_UVector *) srcf;

-        vec_min = Q6_Vsf_vmin_VsfVsf(vec_min, in);
+        HVX_Vector min_clamp = Q6_Vsf_vmin_VsfVsf(vec_min, in);

-        hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, Q6_Vsf_equals_Vqf32(vec_min));
+        hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, min_clamp);
     }
 }
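Note: the loop from patch 1 fed each Q6_Vsf_vmin_VsfVsf result back into vec_min, so the kernel computed a running minimum over the row rather than min(in[i], val); it also routed the sf result through Q6_Vsf_equals_Vqf32 and splatted the float limit with the integer Q6_V_vsplat_R. Patch 2 keeps vec_min as the splatted constant and writes a fresh vector per iteration. A minimal standalone C demonstration of the feedback bug (values are illustrative):

    #include <stdio.h>

    int main(void) {
        const float val   = 5.0f;
        const float in[4] = { 7.0f, 2.0f, 9.0f, 6.0f };

        // Patch 1 pattern: the result feeds the next comparison, so the
        // minimum "sticks" once any element drops below val.
        float acc = val;
        printf("running min : ");
        for (int i = 0; i < 4; i++) {
            acc = in[i] < acc ? in[i] : acc;
            printf("%g ", acc);            // prints 5 2 2 2
        }

        // Patch 2 pattern: each element is limited independently.
        printf("\nper-elem min: ");
        for (int i = 0; i < 4; i++) {
            float m = in[i] < val ? in[i] : val;
            printf("%g ", m);              // prints 5 2 5 5
        }
        printf("\n");
        return 0;
    }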
From e54390f9b4956f83b2c1288eb8e193312b1ad1eb Mon Sep 17 00:00:00 2001
From: shouyud
Date: Tue, 16 Dec 2025 16:17:25 -0500
Subject: [PATCH 3/3] feat: working swiglu-oai

---
 ggml/src/ggml-hexagon/htp/act-ops.c   | 14 +----
 ggml/src/ggml-hexagon/htp/hvx-utils.c | 80 ++++++++++++++++-----------
 2 files changed, 51 insertions(+), 43 deletions(-)

diff --git a/ggml/src/ggml-hexagon/htp/act-ops.c b/ggml/src/ggml-hexagon/htp/act-ops.c
index 8273e05835..5af67b80a5 100644
--- a/ggml/src/ggml-hexagon/htp/act-ops.c
+++ b/ggml/src/ggml-hexagon/htp/act-ops.c
@@ -230,18 +230,8 @@ static void glu_swiglu_oai_fp32_per_thread(const struct htp_tensor * src0,

         // x (src0_spad_data) = std::min(src0_p[k], limit);
         hvx_min_scalar_f32((const uint8_t *) src0, limit, src0_spad_data, nc);
-        // // y1 (src1_spad_data) = std::clamp(src1_p[k], -limit, limit);
-        // hvx_clamp_scalar_f32((const uint8_t *) src1, limit, limit, src1_spad_data, nc);
-        for (int i = 0; i < nc; i++) {
-            if (src1[i] > limit) {
-                ((float *) src1_spad_data)[i] = limit;
-            } else if (src1[i] < -limit) {
-                ((float *) src1_spad_data)[i] = -limit;
-            } else {
-                ((float *) src1_spad_data)[i] = src1[i];
-            }
-        }
-
+        // y1 (src1_spad_data) = std::clamp(src1_p[k], -limit, limit);
+        hvx_clamp_scalar_f32((const uint8_t *) src1, -limit, limit, src1_spad_data, nc);
         // y (src1_spad_data) = y1 + 1.f
         hvx_add_scalar_f32(src1_spad_data, 1.0, src1_spad_data, nc);
         // x1 (dst_spad_data) = alpha * (x)
diff --git a/ggml/src/ggml-hexagon/htp/hvx-utils.c b/ggml/src/ggml-hexagon/htp/hvx-utils.c
index 1eea76cba7..9635c83859 100644
--- a/ggml/src/ggml-hexagon/htp/hvx-utils.c
+++ b/ggml/src/ggml-hexagon/htp/hvx-utils.c
@@ -875,24 +875,17 @@ float hvx_self_max_f32(const uint8_t * restrict src, const int num_elems) {
 void hvx_min_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * restrict dst, const int num_elems) {
     size_t left_over       = num_elems & (VLEN_FP32 - 1);
     size_t num_elems_whole = num_elems - left_over;
-
+    int unalign_address = 0;
     if ((0 == htp_is_aligned((void *) src, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) {
         FARF(HIGH, "hvx_min_scalar_f32: unaligned address in hvx op, possibly slower execution\n");
+        unalign_address = 1;
     }
-    assert((1 == htp_is_aligned((void *) src, VLEN)) || (0 == num_elems_whole));
-
     const float * src_f = (const float *) src;

     HVX_Vector vec_min = hvx_vec_splat_fp32(val);

-    int unalign_address = 0;
-
-    if (htp_is_aligned((void *) src, VLEN) == 0 || htp_is_aligned((void *) dst, VLEN) == 0) {
-        unalign_address = 1;
-    }
-
+
     if (unalign_address == 0) {
         HVX_Vector * restrict vec_in  = (HVX_Vector *) src;
         HVX_Vector * restrict vec_out = (HVX_Vector *) dst;
@@ -932,47 +925,72 @@ void hvx_clamp_scalar_f32(const uint8_t * restrict src,
                           const int num_elems) {
     size_t left_over       = num_elems & (VLEN_FP32 - 1);
     size_t num_elems_whole = num_elems - left_over;
-
+
+    int unalign_address = 0;
     if ((0 == htp_is_aligned((void *) src, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) {
         FARF(HIGH, "hvx_clamp_scalar_f32: unaligned address in hvx op, possibly slower execution\n");
+        unalign_address = 1;
     }
-    assert((1 == htp_is_aligned((void *) src, VLEN)) || (0 == num_elems_whole));
-
-    HVX_Vector * restrict vec_in  = (HVX_Vector *) src;
-    HVX_Vector * restrict vec_out = (HVX_Vector *) dst;

     HVX_Vector range_left  = hvx_vec_splat_fp32(limit_left);
     HVX_Vector range_right = hvx_vec_splat_fp32(limit_right);

-    #pragma unroll(4)
-    for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
-        HVX_Vector in_vec = *vec_in++;
-        HVX_Vector temp_v = in_vec;
+    if (unalign_address == 0) {
+        HVX_Vector * restrict vec_in  = (HVX_Vector *) src;
+        HVX_Vector * restrict vec_out = (HVX_Vector *) dst;

-        HVX_VectorPred pred_cap_right = Q6_Q_vcmp_gt_VsfVsf(in_vec, range_right);
-        HVX_VectorPred pred_cap_left  = Q6_Q_vcmp_gt_VsfVsf(range_left, in_vec);
-        in_vec     = Q6_V_vmux_QVV(pred_cap_right, range_right, temp_v);
-        in_vec     = Q6_V_vmux_QVV(pred_cap_left, range_left, temp_v);
-        *vec_out++ = Q6_Vsf_equals_Vqf32(in_vec);
+        #pragma unroll(4)
+        for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
+            HVX_Vector in_vec = *vec_in++;
+            HVX_Vector temp_v = in_vec;
+
+            HVX_VectorPred pred_cap_right = Q6_Q_vcmp_gt_VsfVsf(in_vec, range_right);
+            HVX_VectorPred pred_cap_left  = Q6_Q_vcmp_gt_VsfVsf(range_left, in_vec);
+
+            in_vec = Q6_V_vmux_QVV(pred_cap_right, range_right, temp_v);
+            in_vec = Q6_V_vmux_QVV(pred_cap_left, range_left, in_vec);
+
+            *vec_out++ = in_vec;
+        }
+    } else {
+        HVX_UVector * restrict vec_in  = (HVX_UVector *) src;
+        HVX_UVector * restrict vec_out = (HVX_UVector *) dst;
+
+        #pragma unroll(4)
+        for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
+            HVX_Vector in_vec = *vec_in++;
+            HVX_Vector temp_v = in_vec;
+
+            HVX_VectorPred pred_cap_right = Q6_Q_vcmp_gt_VsfVsf(in_vec, range_right);
+            HVX_VectorPred pred_cap_left  = Q6_Q_vcmp_gt_VsfVsf(range_left, in_vec);
+
+            in_vec = Q6_V_vmux_QVV(pred_cap_right, range_right, temp_v);
+            in_vec = Q6_V_vmux_QVV(pred_cap_left, range_left, in_vec);
+
+            *vec_out++ = in_vec;
+        }
     }

     if (left_over > 0) {
         const float * srcf = (const float *) src + num_elems_whole;
         float *       dstf = (float *) dst + num_elems_whole;

-        HVX_Vector in = *(HVX_UVector *) srcf;
+        HVX_Vector in_vec = *(HVX_UVector *) srcf;

-        HVX_Vector temp_v = in;
+        HVX_Vector temp_v = in_vec;

-        HVX_VectorPred pred_cap_right = Q6_Q_vcmp_gt_VsfVsf(in, range_right);
-        HVX_VectorPred pred_cap_left  = Q6_Q_vcmp_gt_VsfVsf(range_left, in);
+        HVX_VectorPred pred_cap_right = Q6_Q_vcmp_gt_VsfVsf(in_vec, range_right);
+        HVX_VectorPred pred_cap_left  = Q6_Q_vcmp_gt_VsfVsf(range_left, in_vec);

-        in = Q6_V_vmux_QVV(pred_cap_right, range_right, temp_v);
-        in = Q6_V_vmux_QVV(pred_cap_left, range_left, temp_v);
+        in_vec = Q6_V_vmux_QVV(pred_cap_right, range_right, temp_v);
+        in_vec = Q6_V_vmux_QVV(pred_cap_left, range_left, in_vec);

-        hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, Q6_Vsf_equals_Vqf32(in));
+        hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, in_vec);
     }
 }
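Note: two separate fixes land in patch 3. At the call site, hvx_clamp_scalar_f32 previously received (limit, limit) as its range, an empty [limit, limit] interval instead of [-limit, limit]. Inside the kernel, the second Q6_V_vmux_QVV selected against the unmodified temp_v, so any lane above limit_right lost its upper cap again; the two selects must be chained. A scalar model of both select orders (helper names are illustrative):

    #include <stdio.h>

    // Pre-patch order: the second select falls back to the raw input v,
    // silently discarding the upper cap applied by the first select.
    static float clamp_buggy(float v, float lo, float hi) {
        float r = (v > hi) ? hi : v;
        r       = (lo > v) ? lo : v;   // BUG: should start from r, not v
        return r;
    }

    // Patched order: the second select starts from the capped value r.
    static float clamp_fixed(float v, float lo, float hi) {
        float r = (v > hi) ? hi : v;
        r       = (lo > v) ? lo : r;
        return r;
    }

    int main(void) {
        // 9 is above the upper limit 5: the buggy order returns 9.
        printf("buggy: %g  fixed: %g\n",
               clamp_buggy(9.0f, -5.0f, 5.0f),    // prints buggy: 9
               clamp_fixed(9.0f, -5.0f, 5.0f));   // prints fixed: 5
        return 0;
    }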