From 20cc625edc2264aae2779e71bef1593e6a4e8c43 Mon Sep 17 00:00:00 2001 From: sirus20x6 Date: Sun, 12 Oct 2025 00:15:00 -0500 Subject: [PATCH] ggml: Correct SVE implementation in ggml_vec_dot_f16_unroll (#16518) The previous SVE implementation for `ggml_vec_dot_f16_unroll` contained a bug due to a copy-paste error. The wrong variable was used in an FMA instruction, leading to incorrect results. This commit corrects the variable usage and improves the clarity of the code by renaming variables to avoid confusion. Co-authored-by: Aaron --- ggml/src/ggml-cpu/vec.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-cpu/vec.h b/ggml/src/ggml-cpu/vec.h index 2751359ce4..d3834182a6 100644 --- a/ggml/src/ggml-cpu/vec.h +++ b/ggml/src/ggml-cpu/vec.h @@ -144,14 +144,14 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GG for (int i = 0; i < np; i += ggml_f16_step) { ay1 = GGML_F16x_VEC_LOAD(y + i + 0 * ggml_f16_epr, 0); // 8 elements - ax1 = GGML_F16x_VEC_LOAD(x[0] + i + 0*ggml_f16_epr, 0); // 8 elemnst + ax1 = GGML_F16x_VEC_LOAD(x[0] + i + 0*ggml_f16_epr, 0); // 8 elements sum_00 = GGML_F16x_VEC_FMA(sum_00, ax1, ay1); // sum_00 = sum_00+ax1*ay1 ax1 = GGML_F16x_VEC_LOAD(x[1] + i + 0*ggml_f16_epr, 0); // 8 elements sum_10 = GGML_F16x_VEC_FMA(sum_10, ax1, ay1); ay2 = GGML_F16x_VEC_LOAD(y + i + 1 * ggml_f16_epr, 1); // next 8 elements - ax2 = GGML_F16x_VEC_LOAD(x[0] + i + 1*ggml_f16_epr, 1); // next 8 ekements + ax2 = GGML_F16x_VEC_LOAD(x[0] + i + 1*ggml_f16_epr, 1); // next 8 elements sum_01 = GGML_F16x_VEC_FMA(sum_01, ax2, ay2); ax2 = GGML_F16x_VEC_LOAD(x[1] + i + 1*ggml_f16_epr, 1); sum_11 = GGML_F16x_VEC_FMA(sum_11, ax2, ay2); @@ -160,7 +160,7 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GG ax3 = GGML_F16x_VEC_LOAD(x[0] + i + 2*ggml_f16_epr, 2); sum_02 = GGML_F16x_VEC_FMA(sum_02, ax3, ay3); - ax1 = GGML_F16x_VEC_LOAD(x[1] + i + 2*ggml_f16_epr, 2); + ax3 = GGML_F16x_VEC_LOAD(x[1] + i + 2*ggml_f16_epr, 2); sum_12 = GGML_F16x_VEC_FMA(sum_12, ax3, ay3); ay4 = GGML_F16x_VEC_LOAD(y + i + 3 * ggml_f16_epr, 3);