diff --git a/ggml/src/ggml-cpu/vec.cpp b/ggml/src/ggml-cpu/vec.cpp index abb93ee1ba..427e63245b 100644 --- a/ggml/src/ggml-cpu/vec.cpp +++ b/ggml/src/ggml-cpu/vec.cpp @@ -232,11 +232,8 @@ void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t * } // reduce - vl = __riscv_vsetvlmax_e32m2(); - vfloat32m2_t acc0 = __riscv_vfadd_vv_f32m2(__riscv_vget_v_f32m4_f32m2(vsum0, 0), __riscv_vget_v_f32m4_f32m2(vsum0, 1), vl); - vl = __riscv_vsetvlmax_e32m1(); - vfloat32m1_t acc1 = __riscv_vfadd_vv_f32m1(__riscv_vget_v_f32m2_f32m1(acc0, 0), __riscv_vget_v_f32m2_f32m1(acc0, 1), vl); - vfloat32m1_t redsum = __riscv_vfredusum_vs_f32m1_f32m1(acc1, __riscv_vfmv_v_f_f32m1(0.0f, 1), vl); + vl = __riscv_vsetvlmax_e32m4(); + vfloat32m1_t redsum = __riscv_vfredusum_vs_f32m4_f32m1(vsum0, __riscv_vfmv_v_f_f32m1(0.0f, 1), vl); sumf += __riscv_vfmv_f_s_f32m1_f32(redsum); #endif diff --git a/ggml/src/ggml-cpu/vec.h b/ggml/src/ggml-cpu/vec.h index 12b71a4898..3198b33b50 100644 --- a/ggml/src/ggml-cpu/vec.h +++ b/ggml/src/ggml-cpu/vec.h @@ -539,7 +539,7 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * GGML_RESTRICT y, // calculate step size const int epr = __riscv_vsetvlmax_e16m4(); const int step = epr * 2; - const int np = (n & ~(step - 1)); + int np = (n & ~(step - 1)); // unroll by 2 for (int i = 0; i < np; i += step) { @@ -560,11 +560,12 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * GGML_RESTRICT y, int vl; for (int i = np; i < n; i += vl) { vl = __riscv_vsetvl_e16m4(n - i); - vfloat16m4_t ax0 = __riscv_vle16_v_f16m4((const _Float16*)x + i , vl); + vfloat16m4_t ax0 = __riscv_vle16_v_f16m4((const _Float16*)x + i, vl); vfloat16m4_t ay0 = __riscv_vle16_v_f16m4((const _Float16*)y + i, vl); ay0 = __riscv_vfmacc_vf_f16m4(ay0, scale, ax0, vl); __riscv_vse16_v_f16m4((_Float16*)y + i, ay0, vl); } + np = n; #elif defined(GGML_SIMD) const int np = (n & ~(GGML_F16_STEP - 1));