ggml-cpu : add RISC-V Zvfh impl for ggml_vec_mad_f16 (#17448)

* ggml-cpu : add RISC-V Zvfh impl for ggml_vec_mad_f16

* ggml-cpu : dedup scalar impl

* Update ggml/src/ggml-cpu/vec.h

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
This commit is contained in:
xctan 2025-11-26 21:33:05 +08:00 committed by GitHub
parent 2336cc4784
commit 6ab4e50d9c
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
1 changed file with 90 additions and 91 deletions

View File

@ -397,15 +397,14 @@ inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const
} }
inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * GGML_RESTRICT y, const ggml_fp16_t * GGML_RESTRICT x, const float v) { inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * GGML_RESTRICT y, const ggml_fp16_t * GGML_RESTRICT x, const float v) {
#if defined(GGML_SIMD) #if defined(GGML_SIMD) && defined(__ARM_FEATURE_SVE)
#if defined(__ARM_FEATURE_SVE)
const int sve_register_length = svcntb() * 8; const int sve_register_length = svcntb() * 8;
const int ggml_f16_epr = sve_register_length / 16; const int ggml_f16_epr = sve_register_length / 16;
const int ggml_f16_step = 8 * ggml_f16_epr; const int ggml_f16_step = 8 * ggml_f16_epr;
GGML_F16x_VEC vx = GGML_F16x_VEC_SET1(v); GGML_F16x_VEC vx = GGML_F16x_VEC_SET1(v);
const int np= (n & ~(ggml_f16_step - 1)); int np = (n & ~(ggml_f16_step - 1));
svfloat16_t ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8; svfloat16_t ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8;
svfloat16_t ay1, ay2, ay3, ay4, ay5, ay6, ay7, ay8; svfloat16_t ay1, ay2, ay3, ay4, ay5, ay6, ay7, ay8;
@ -474,14 +473,18 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * GGML_RESTRICT y,
hy = svmad_f16_x(pg, hx, vx, hy); hy = svmad_f16_x(pg, hx, vx, hy);
svst1_f16(pg, (__fp16 *)(y + np2), hy); svst1_f16(pg, (__fp16 *)(y + np2), hy);
} }
np = n;
#elif defined(__riscv_v_intrinsic) #elif defined(__riscv_zvfh) // implies __riscv_v_intrinsic
// todo: RVV impl const int np = n;
// scalar _Float16 hv = (_Float16)v;
for (int i = 0; i < n; ++i) { for (int i = 0, avl; i < n; i += avl) {
y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v); avl = __riscv_vsetvl_e16m8(n - i);
vfloat16m8_t ax = __riscv_vle16_v_f16m8((const _Float16 *)&x[i], avl);
vfloat16m8_t ay = __riscv_vle16_v_f16m8((_Float16 *)&y[i], avl);
vfloat16m8_t ny = __riscv_vfmadd_vf_f16m8(ax, hv, ay, avl);
__riscv_vse16_v_f16m8((_Float16 *)&y[i], ny, avl);
} }
#else #elif defined(GGML_SIMD)
const int np = (n & ~(GGML_F16_STEP - 1)); const int np = (n & ~(GGML_F16_STEP - 1));
GGML_F16_VEC vx = GGML_F16_VEC_SET1(v); GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);
@ -498,18 +501,14 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * GGML_RESTRICT y,
GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j); GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j);
} }
} }
#else
const int np = 0;
#endif
// leftovers // leftovers
for (int i = np; i < n; ++i) { for (int i = np; i < n; ++i) {
y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v); y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v);
} }
#endif
#else
// scalar
for (int i = 0; i < n; ++i) {
y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v);
}
#endif
} }
// xs and vs are byte strides of x and v // xs and vs are byte strides of x and v