ggml-cpu: change redsum to lmul 4, fix leftover

2025-12-15 13:29:11 +05:00 · 2025-12-15 13:29:11 +05:00 · e5c8adbbce
parent 2786a97ef0
commit e5c8adbbce
2 changed files with 5 additions and 7 deletions
--- a/ggml/src/ggml-cpu/vec.cpp
+++ b/ggml/src/ggml-cpu/vec.cpp
@@ -232,11 +232,8 @@ void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t *
    }
    // reduce
-    vl = __riscv_vsetvlmax_e32m2();
+    vl = __riscv_vsetvlmax_e32m4();
-    vfloat32m2_t acc0 = __riscv_vfadd_vv_f32m2(__riscv_vget_v_f32m4_f32m2(vsum0, 0), __riscv_vget_v_f32m4_f32m2(vsum0, 1), vl);
+    vfloat32m1_t redsum = __riscv_vfredusum_vs_f32m4_f32m1(vsum0, __riscv_vfmv_v_f_f32m1(0.0f, 1), vl);
-    vl = __riscv_vsetvlmax_e32m1();
-    vfloat32m1_t acc1 = __riscv_vfadd_vv_f32m1(__riscv_vget_v_f32m2_f32m1(acc0, 0), __riscv_vget_v_f32m2_f32m1(acc0, 1), vl);
-    vfloat32m1_t redsum = __riscv_vfredusum_vs_f32m1_f32m1(acc1, __riscv_vfmv_v_f_f32m1(0.0f, 1), vl);
    sumf += __riscv_vfmv_f_s_f32m1_f32(redsum);
 #endif
--- a/ggml/src/ggml-cpu/vec.h
+++ b/ggml/src/ggml-cpu/vec.h
@@ -539,7 +539,7 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * GGML_RESTRICT y,
    // calculate step size
    const int epr = __riscv_vsetvlmax_e16m4();
    const int step = epr * 2;
-    const int np = (n & ~(step - 1));
+    int np = (n & ~(step - 1));
    // unroll by 2
    for (int i = 0; i < np; i += step) {
@@ -565,6 +565,7 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * GGML_RESTRICT y,
        ay0 = __riscv_vfmacc_vf_f16m4(ay0, scale, ax0, vl);
        __riscv_vse16_v_f16m4((_Float16*)y + i, ay0, vl);
    }
+    np = n;
 #elif defined(GGML_SIMD)
    const int np = (n & ~(GGML_F16_STEP - 1));