snapshot: debug ggml-hexagon swiglu-oai

This commit is contained in:
shouyud 2025-12-16 14:12:43 -05:00
parent ec98e20021
commit 925a83ac70
4 changed files with 51 additions and 10 deletions

View File

@ -3297,7 +3297,7 @@ static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, cons
break; break;
case GGML_OP_GLU: case GGML_OP_GLU:
if ((ggml_get_glu_op(op) == GGML_GLU_OP_SWIGLU) /* || (ggml_get_glu_op(op) == GGML_GLU_OP_SWIGLU_OAI) */) { if ((ggml_get_glu_op(op) == GGML_GLU_OP_SWIGLU) || (ggml_get_glu_op(op) == GGML_GLU_OP_SWIGLU_OAI) ) {
supp = ggml_hexagon_supported_activations(sess, op); supp = ggml_hexagon_supported_activations(sess, op);
} }
break; break;

View File

@ -229,9 +229,29 @@ static void glu_swiglu_oai_fp32_per_thread(const struct htp_tensor * src0,
} }
// x (src0_spad_data) = std::min(src0_p[k], limit); // x (src0_spad_data) = std::min(src0_p[k], limit);
hvx_min_scalar_f32((const uint8_t *) src0, limit, src0_spad_data, nc); //hvx_min_scalar_f32((const uint8_t *) src0, limit, src0_spad_data, nc);
// y1 (src1_spad_data) = std::clamp(src1_p[k], -limit, limit); // // y1 (src1_spad_data) = std::clamp(src1_p[k], -limit, limit);
hvx_clamp_scalar_f32((const uint8_t *) src1, limit, limit, src1_spad_data, nc); // hvx_clamp_scalar_f32((const uint8_t *) src1, limit, limit, src1_spad_data, nc);
// Debug: apply the limit manually with scalar loops instead of the HVX min/clamp calls commented out above
for (int i = 0; i < nc; i++) {
if (src0[i] > limit) {
((float *) src0_spad_data)[i] = limit;
} else {
((float *) src0_spad_data)[i] = src0[i];
}
}
for (int i = 0; i < nc; i++) {
if (src1[i] > limit) {
((float *) src1_spad_data)[i] = limit;
} else if (src1[i] < -limit) {
((float *) src1_spad_data)[i] = -limit;
} else {
((float *) src1_spad_data)[i] = src1[i];
}
}
// y (src1_spad_data) = y1 + 1.f // y (src1_spad_data) = y1 + 1.f
hvx_add_scalar_f32(src1_spad_data, 1.0, src1_spad_data, nc); hvx_add_scalar_f32(src1_spad_data, 1.0, src1_spad_data, nc);
// x1 (dst_spad_data) = alpha * (x) // x1 (dst_spad_data) = alpha * (x)

View File

@ -886,14 +886,34 @@ void hvx_min_scalar_f32(const uint8_t * restrict src, const float val, uint8_t *
HVX_Vector vec_min = Q6_V_vsplat_R(val); HVX_Vector vec_min = Q6_V_vsplat_R(val);
HVX_Vector * restrict vec_in = (HVX_Vector *) src; int unaligned_input_addr = 0;
HVX_Vector * restrict vec_out = (HVX_Vector *) dst; int unaligned_output_addr = 0;
#pragma unroll(4) if(htp_is_aligned((void *) src, VLEN) == 0) {
for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { unaligned_input_addr = 1;
vec_min = Q6_Vsf_vmin_VsfVsf(vec_min, *vec_in++);
*vec_out++ = Q6_Vsf_equals_Vqf32(vec_min);
} }
if(htp_is_aligned((void *) dst, VLEN) == 0) {
unaligned_output_addr = 1;
}
if(unaligned_input_addr == 0 && unaligned_output_addr == 0){
HVX_Vector * restrict vec_in = (HVX_Vector *) src;
HVX_Vector * restrict vec_out = (HVX_Vector *) dst;
#pragma unroll(4)
for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
vec_min = Q6_Vsf_vmin_VsfVsf(vec_min, *vec_in++);
*vec_out++ = Q6_Vsf_equals_Vqf32(vec_min);
}
}
else if(unaligned_output_addr == 0){
}else if()
if (left_over > 0) { if (left_over > 0) {
const float * srcf = (const float *) src + num_elems_whole; const float * srcf = (const float *) src + num_elems_whole;

View File

@ -806,6 +806,7 @@ static void htp_packet_callback(dspqueue_t queue, int error, void * context) {
break; break;
case HTP_OP_GLU_SWIGLU: case HTP_OP_GLU_SWIGLU:
case HTP_OP_GLU_SWIGLU_OAI:
case HTP_OP_SOFTMAX: case HTP_OP_SOFTMAX:
if ((n_bufs != 2) && (n_bufs != 3)) { if ((n_bufs != 2) && (n_bufs != 3)) {
FARF(ERROR, "Bad act-req buffer list"); FARF(ERROR, "Bad act-req buffer list");