ggml-cpu: change redsum to LMUL=4, fix leftover handling
This commit is contained in:
parent
2786a97ef0
commit
e5c8adbbce
|
|
@ -232,11 +232,8 @@ void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t *
|
||||||
}
|
}
|
||||||
|
|
||||||
// reduce
|
// reduce
|
||||||
vl = __riscv_vsetvlmax_e32m2();
|
vl = __riscv_vsetvlmax_e32m4();
|
||||||
vfloat32m2_t acc0 = __riscv_vfadd_vv_f32m2(__riscv_vget_v_f32m4_f32m2(vsum0, 0), __riscv_vget_v_f32m4_f32m2(vsum0, 1), vl);
|
vfloat32m1_t redsum = __riscv_vfredusum_vs_f32m4_f32m1(vsum0, __riscv_vfmv_v_f_f32m1(0.0f, 1), vl);
|
||||||
vl = __riscv_vsetvlmax_e32m1();
|
|
||||||
vfloat32m1_t acc1 = __riscv_vfadd_vv_f32m1(__riscv_vget_v_f32m2_f32m1(acc0, 0), __riscv_vget_v_f32m2_f32m1(acc0, 1), vl);
|
|
||||||
vfloat32m1_t redsum = __riscv_vfredusum_vs_f32m1_f32m1(acc1, __riscv_vfmv_v_f_f32m1(0.0f, 1), vl);
|
|
||||||
sumf += __riscv_vfmv_f_s_f32m1_f32(redsum);
|
sumf += __riscv_vfmv_f_s_f32m1_f32(redsum);
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
|
||||||
|
|
@ -539,7 +539,7 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * GGML_RESTRICT y,
|
||||||
// calculate step size
|
// calculate step size
|
||||||
const int epr = __riscv_vsetvlmax_e16m4();
|
const int epr = __riscv_vsetvlmax_e16m4();
|
||||||
const int step = epr * 2;
|
const int step = epr * 2;
|
||||||
const int np = (n & ~(step - 1));
|
int np = (n & ~(step - 1));
|
||||||
|
|
||||||
// unroll by 2
|
// unroll by 2
|
||||||
for (int i = 0; i < np; i += step) {
|
for (int i = 0; i < np; i += step) {
|
||||||
|
|
@ -565,6 +565,7 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * GGML_RESTRICT y,
|
||||||
ay0 = __riscv_vfmacc_vf_f16m4(ay0, scale, ax0, vl);
|
ay0 = __riscv_vfmacc_vf_f16m4(ay0, scale, ax0, vl);
|
||||||
__riscv_vse16_v_f16m4((_Float16*)y + i, ay0, vl);
|
__riscv_vse16_v_f16m4((_Float16*)y + i, ay0, vl);
|
||||||
}
|
}
|
||||||
|
np = n;
|
||||||
#elif defined(GGML_SIMD)
|
#elif defined(GGML_SIMD)
|
||||||
const int np = (n & ~(GGML_F16_STEP - 1));
|
const int np = (n & ~(GGML_F16_STEP - 1));
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue