ggml-cpu : add RISC-V Zvfh impl for ggml_vec_mad_f16 (#17448)
* ggml-cpu : add RISC-V Zvfh impl for ggml_vec_mad_f16

* ggml-cpu : dedup scalar impl

* Update ggml/src/ggml-cpu/vec.h

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
parent 2336cc4784
commit 6ab4e50d9c
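For context: ggml_vec_mad_f16 computes y[i] += x[i]*v over fp16 data, with the scalar path doing the arithmetic in fp32 via the GGML conversion macros. A minimal reference sketch of the operation the diff below vectorizes, using plain floats instead of the fp16 storage conversions (the name vec_mad_ref is hypothetical, for illustration only):

// Reference semantics of ggml_vec_mad_f16, fp16 storage elided:
// y[i] += x[i] * v  for i in [0, n)
static void vec_mad_ref(const int n, float * y, const float * x, const float v) {
    for (int i = 0; i < n; ++i) {
        y[i] += x[i]*v;
    }
}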
ggml/src/ggml-cpu/vec.h

@@ -397,15 +397,14 @@ inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const
 }
 
 inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * GGML_RESTRICT y, const ggml_fp16_t * GGML_RESTRICT x, const float v) {
-#if defined(GGML_SIMD)
-    #if defined(__ARM_FEATURE_SVE)
+#if defined(GGML_SIMD) && defined(__ARM_FEATURE_SVE)
     const int sve_register_length = svcntb() * 8;
     const int ggml_f16_epr = sve_register_length / 16;
     const int ggml_f16_step = 8 * ggml_f16_epr;
 
     GGML_F16x_VEC vx = GGML_F16x_VEC_SET1(v);
 
-    const int np= (n & ~(ggml_f16_step - 1));
+    int np = (n & ~(ggml_f16_step - 1));
 
     svfloat16_t ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8;
     svfloat16_t ay1, ay2, ay3, ay4, ay5, ay6, ay7, ay8;
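Aside on the SVE constants kept as context here: svcntb() returns the SVE vector length in bytes, so sve_register_length is in bits and ggml_f16_epr is elements per register at 16 bits per fp16 value. Worked example, assuming a 256-bit SVE implementation:

// svcntb() == 32 bytes on a 256-bit SVE machine
// sve_register_length = 32 * 8   = 256 bits
// ggml_f16_epr        = 256 / 16 = 16 fp16 lanes per register
// ggml_f16_step       = 8 * 16   = 128 elements per unrolled iteration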
@@ -474,14 +473,18 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * GGML_RESTRICT y,
             hy = svmad_f16_x(pg, hx, vx, hy);
             svst1_f16(pg, (__fp16 *)(y + np2), hy);
         }
-    #elif defined(__riscv_v_intrinsic)
-    // todo: RVV impl
-    // scalar
-    for (int i = 0; i < n; ++i) {
-        y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v);
+        np = n;
+#elif defined(__riscv_zvfh) // implies __riscv_v_intrinsic
+    const int np = n;
+    _Float16 hv = (_Float16)v;
+    for (int i = 0, avl; i < n; i += avl) {
+        avl = __riscv_vsetvl_e16m8(n - i);
+        vfloat16m8_t ax = __riscv_vle16_v_f16m8((const _Float16 *)&x[i], avl);
+        vfloat16m8_t ay = __riscv_vle16_v_f16m8((_Float16 *)&y[i], avl);
+        vfloat16m8_t ny = __riscv_vfmadd_vf_f16m8(ax, hv, ay, avl);
+        __riscv_vse16_v_f16m8((_Float16 *)&y[i], ny, avl);
     }
-    #else
+#elif defined(GGML_SIMD)
     const int np = (n & ~(GGML_F16_STEP - 1));
 
     GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);
 
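The new branch is the standard RVV strip-mining idiom: __riscv_vsetvl_e16m8(n - i) returns how many fp16 elements the hardware will process this pass (at LMUL=8), so the loop consumes the whole array with no scalar tail, and vfmadd.vf computes ax*hv + ay in a single fused multiply-add. A self-contained fp32 analogue of the same pattern, for illustration only (not code from this commit; assumes an RVV-intrinsics toolchain):

#include <riscv_vector.h>

// y[i] += x[i] * v, strip-mined: vsetvl picks the element count per pass.
static void axpy_f32_rvv(int n, float * y, const float * x, float v) {
    for (int i = 0, vl; i < n; i += vl) {
        vl = __riscv_vsetvl_e32m8(n - i);                        // elements this pass
        vfloat32m8_t ax = __riscv_vle32_v_f32m8(&x[i], vl);      // load x
        vfloat32m8_t ay = __riscv_vle32_v_f32m8(&y[i], vl);      // load y
        vfloat32m8_t ny = __riscv_vfmadd_vf_f32m8(ax, v, ay, vl); // ax*v + ay
        __riscv_vse32_v_f32m8(&y[i], ny, vl);                    // store back
    }
}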
@@ -498,18 +501,14 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * GGML_RESTRICT y,
             GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j);
         }
     }
+#else
+    const int np = 0;
+#endif
 
     // leftovers
     for (int i = np; i < n; ++i) {
         y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v);
     }
-    #endif
-#else
-    // scalar
-    for (int i = 0; i < n; ++i) {
-        y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v);
-    }
-#endif
 }
 
 // xs and vs are byte strides of x and v
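The dedup works because every branch now leaves np equal to the number of elements it has already processed, so one shared leftovers loop replaces the two duplicated scalar loops the old code carried. A schematic of the resulting control flow, with branch bodies elided and float stand-ins for the fp16 macros (the name vec_mad_shape is hypothetical; only the #else branch compiles in a standalone file):

static void vec_mad_shape(const int n, float * y, const float * x, const float v) {
#if defined(GGML_SIMD) && defined(__ARM_FEATURE_SVE)
    int np = 0;
    // ... SVE bulk loop + masked tail elided ...
    np = n;                                    // SVE path consumes everything
#elif defined(__riscv_zvfh)
    const int np = n;                          // strip-mining leaves no tail
#elif defined(GGML_SIMD)
    const int np = (n & ~(GGML_F16_STEP - 1)); // whole SIMD steps only
#else
    const int np = 0;                          // scalar build: tail loop does all
#endif
    // leftovers (shared scalar tail)
    for (int i = np; i < n; ++i) {
        y[i] += x[i]*v;
    }
}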