diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp
index 514f086f68..fed4fcf111 100644
--- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp
+++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp
@@ -3297,7 +3297,7 @@ static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, cons
             break;
 
         case GGML_OP_GLU:
-            if ((ggml_get_glu_op(op) == GGML_GLU_OP_SWIGLU) /* || (ggml_get_glu_op(op) == GGML_GLU_OP_SWIGLU_OAI) */) {
+            if ((ggml_get_glu_op(op) == GGML_GLU_OP_SWIGLU) || (ggml_get_glu_op(op) == GGML_GLU_OP_SWIGLU_OAI)) {
                 supp = ggml_hexagon_supported_activations(sess, op);
             }
             break;
diff --git a/ggml/src/ggml-hexagon/htp/act-ops.c b/ggml/src/ggml-hexagon/htp/act-ops.c
index 87b09cca3a..0d7997ded1 100644
--- a/ggml/src/ggml-hexagon/htp/act-ops.c
+++ b/ggml/src/ggml-hexagon/htp/act-ops.c
@@ -229,9 +229,29 @@ static void glu_swiglu_oai_fp32_per_thread(const struct htp_tensor * src0,
         }
 
         // x (src0_spad_data) = std::min(src0_p[k], limit);
-        hvx_min_scalar_f32((const uint8_t *) src0, limit, src0_spad_data, nc);
-        // y1 (src1_spad_data) = std::clamp(src1_p[k], -limit, limit);
-        hvx_clamp_scalar_f32((const uint8_t *) src1, limit, limit, src1_spad_data, nc);
+        // hvx_min_scalar_f32((const uint8_t *) src0, limit, src0_spad_data, nc);
+        // // y1 (src1_spad_data) = std::clamp(src1_p[k], -limit, limit);
+        // hvx_clamp_scalar_f32((const uint8_t *) src1, limit, limit, src1_spad_data, nc);
+
+        // apply the min/clamp limits manually for now
+        for (int i = 0; i < nc; i++) {
+            if (src0[i] > limit) {
+                ((float *) src0_spad_data)[i] = limit;
+            } else {
+                ((float *) src0_spad_data)[i] = src0[i];
+            }
+        }
+
+        for (int i = 0; i < nc; i++) {
+            if (src1[i] > limit) {
+                ((float *) src1_spad_data)[i] = limit;
+            } else if (src1[i] < -limit) {
+                ((float *) src1_spad_data)[i] = -limit;
+            } else {
+                ((float *) src1_spad_data)[i] = src1[i];
+            }
+        }
+
         // y (src1_spad_data) = y1 + 1.f
         hvx_add_scalar_f32(src1_spad_data, 1.0, src1_spad_data, nc);
         // x1 (dst_spad_data) = alpha * (x)
diff --git a/ggml/src/ggml-hexagon/htp/hvx-utils.c b/ggml/src/ggml-hexagon/htp/hvx-utils.c
index e02b1d9099..ddf4206f6a 100644
--- a/ggml/src/ggml-hexagon/htp/hvx-utils.c
+++ b/ggml/src/ggml-hexagon/htp/hvx-utils.c
@@ -886,14 +886,38 @@ void hvx_min_scalar_f32(const uint8_t * restrict src, const float val, uint8_t *
     HVX_Vector vec_min = Q6_V_vsplat_R(val);
 
-    HVX_Vector * restrict vec_in  = (HVX_Vector *) src;
-    HVX_Vector * restrict vec_out = (HVX_Vector *) dst;
+    int unaligned_input_addr  = 0;
+    int unaligned_output_addr = 0;
 
-    #pragma unroll(4)
-    for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
-        vec_min = Q6_Vsf_vmin_VsfVsf(vec_min, *vec_in++);
-        *vec_out++ = Q6_Vsf_equals_Vqf32(vec_min);
+    if (htp_is_aligned((void *) src, VLEN) == 0) {
+        unaligned_input_addr = 1;
     }
+    if (htp_is_aligned((void *) dst, VLEN) == 0) {
+        unaligned_output_addr = 1;
+    }
+
+    if (unaligned_input_addr == 0 && unaligned_output_addr == 0) {
+        // fast path: both addresses are VLEN-aligned; use a temp so vec_min
+        // keeps the splatted limit (elementwise min, not a running min)
+        HVX_Vector * restrict vec_in  = (HVX_Vector *) src;
+        HVX_Vector * restrict vec_out = (HVX_Vector *) dst;
+
+        #pragma unroll(4)
+        for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
+            HVX_Vector v = Q6_Vsf_vmin_VsfVsf(vec_min, *vec_in++);
+            *vec_out++   = Q6_Vsf_equals_Vqf32(v);
+        }
+    } else {
+        // slow path: at least one address is unaligned; fall back to
+        // unaligned vector loads/stores via HVX_UVector
+        HVX_UVector * restrict vec_in  = (HVX_UVector *) src;
+        HVX_UVector * restrict vec_out = (HVX_UVector *) dst;
+
+        for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
+            HVX_Vector v = Q6_Vsf_vmin_VsfVsf(vec_min, *vec_in++);
+            *vec_out++   = Q6_Vsf_equals_Vqf32(v);
+        }
+    }
 
     if (left_over > 0) {
         const float * srcf = (const float *) src + num_elems_whole;
         float *       dstf = (float *) dst + num_elems_whole;
diff --git a/ggml/src/ggml-hexagon/htp/main.c b/ggml/src/ggml-hexagon/htp/main.c
index b60b352a7b..549f2a07bb 100644
--- a/ggml/src/ggml-hexagon/htp/main.c
+++ b/ggml/src/ggml-hexagon/htp/main.c
@@ -806,6 +806,7 @@ static void htp_packet_callback(dspqueue_t queue, int error, void * context) {
            break;
 
        case HTP_OP_GLU_SWIGLU:
+       case HTP_OP_GLU_SWIGLU_OAI:
        case HTP_OP_SOFTMAX:
            if ((n_bufs != 2) && (n_bufs != 3)) {
                FARF(ERROR, "Bad act-req buffer list");
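
Note (not part of the patch): a minimal scalar reference for the min/clamp semantics exercised above, useful for cross-checking the HVX path on the host. ref_min_scalar_f32 and ref_clamp_scalar_f32 are hypothetical helpers, assuming float32 buffers, the same (src, val, dst, n) shape as the hvx_* routines, and a two-sided clamp to [-lo, hi]:

#include <math.h>

/* reference for hvx_min_scalar_f32: dst[i] = min(src[i], val) */
static void ref_min_scalar_f32(const float * src, float val, float * dst, int n) {
    for (int i = 0; i < n; i++) {
        dst[i] = fminf(src[i], val);
    }
}

/* reference for hvx_clamp_scalar_f32: dst[i] = clamp(src[i], -lo, hi),
 * matching the manual loop in act-ops.c when lo == hi == limit */
static void ref_clamp_scalar_f32(const float * src, float lo, float hi, float * dst, int n) {
    for (int i = 0; i < n; i++) {
        dst[i] = fminf(fmaxf(src[i], -lo), hi);
    }
}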