From 8cc0ba957be158406dee261cee78bcea605c7ed4 Mon Sep 17 00:00:00 2001
From: shalinib-ibm
Date: Thu, 15 Jan 2026 15:01:18 +0530
Subject: [PATCH] ggml-cpu: optimize ggml_vec_dot_bf16 for Power9 (#18837)

---
 ggml/src/ggml-cpu/simd-mappings.h | 31 +++++++++++++++++++++++++++++++
 ggml/src/ggml-cpu/vec.cpp         | 18 ++++++++++++++++++
 2 files changed, 49 insertions(+)

diff --git a/ggml/src/ggml-cpu/simd-mappings.h b/ggml/src/ggml-cpu/simd-mappings.h
index a7a8272205..e367f110b4 100644
--- a/ggml/src/ggml-cpu/simd-mappings.h
+++ b/ggml/src/ggml-cpu/simd-mappings.h
@@ -654,6 +654,14 @@ static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
           vec_extract(x[0], 2) +                  \
           vec_extract(x[0], 3);                   \
 }
+#define GGML_F32x4_REDUCE_4(res, s0, s1, s2, s3)  \
+{                                                 \
+    vector float v = vec_add(vec_add(s0, s1),     \
+                             vec_add(s2, s3));    \
+    v = vec_add(v, vec_sld(v, v, 8));             \
+    v = vec_add(v, vec_sld(v, v, 4));             \
+    res += (ggml_float) vec_extract(v, 0);        \
+}
 
 #define GGML_F32_VEC        GGML_F32x4
 #define GGML_F32_VEC_ZERO   GGML_F32x4_ZERO
@@ -690,6 +698,29 @@ static inline unsigned char ggml_endian_byte(int i) {
                                      r[i - GGML_ENDIAN_BYTE(0)]), \
               0, p - GGML_F16_EPR)
 
+//BF16 POWER9
+#define GGML_BF16_STEP 16
+#define GGML_BF16_EPR  8
+
+#define GGML_BF16x8         vector unsigned short
+#define GGML_BF16x8_ZERO    vec_splats((unsigned short)0)
+#define GGML_BF16x8_LOAD(p) vec_xl(0, (const unsigned short *)(p))
+
+#define GGML_BF16_VEC      GGML_BF16x8
+#define GGML_BF16_VEC_ZERO GGML_BF16x8_ZERO
+#define GGML_BF16_VEC_LOAD GGML_BF16x8_LOAD
+#if defined(__LITTLE_ENDIAN__)
+#define GGML_BF16_TO_F32_LO(v) ((vector float) vec_mergel(GGML_BF16_VEC_ZERO, (v)))
+#define GGML_BF16_TO_F32_HI(v) ((vector float) vec_mergeh(GGML_BF16_VEC_ZERO, (v)))
+#else
+#define GGML_BF16_TO_F32_LO(v) ((vector float) vec_mergel((v), GGML_BF16_VEC_ZERO))
+#define GGML_BF16_TO_F32_HI(v) ((vector float) vec_mergeh((v), GGML_BF16_VEC_ZERO))
+#endif
+#define GGML_BF16_FMA_LO(acc, x, y) \
+    (acc) = GGML_F32x4_FMA((acc), GGML_BF16_TO_F32_LO(x), GGML_BF16_TO_F32_LO(y))
+#define GGML_BF16_FMA_HI(acc, x, y) \
+    (acc) = GGML_F32x4_FMA((acc), GGML_BF16_TO_F32_HI(x), GGML_BF16_TO_F32_HI(y))
+
 #elif defined(__wasm_simd128__)
 
 #define GGML_SIMD
diff --git a/ggml/src/ggml-cpu/vec.cpp b/ggml/src/ggml-cpu/vec.cpp
index 427e63245b..8708cd4e92 100644
--- a/ggml/src/ggml-cpu/vec.cpp
+++ b/ggml/src/ggml-cpu/vec.cpp
@@ -237,6 +237,24 @@ void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t *
     sumf += __riscv_vfmv_f_s_f32m1_f32(redsum);
 #endif
 
+#if defined(__POWER9_VECTOR__)
+    const int np = (n & ~(GGML_BF16_STEP - 1));
+    if (np > 0) {
+        GGML_F32_VEC sum[4] = {GGML_F32_VEC_ZERO};
+        for (; i < np; i += GGML_BF16_STEP) {
+            GGML_BF16_VEC vx0 = GGML_BF16_VEC_LOAD(x + i);
+            GGML_BF16_VEC vx1 = GGML_BF16_VEC_LOAD(x + i + 8);
+            GGML_BF16_VEC vy0 = GGML_BF16_VEC_LOAD(y + i);
+            GGML_BF16_VEC vy1 = GGML_BF16_VEC_LOAD(y + i + 8);
+            GGML_BF16_FMA_LO(sum[0], vx0, vy0);
+            GGML_BF16_FMA_HI(sum[1], vx0, vy0);
+            GGML_BF16_FMA_LO(sum[2], vx1, vy1);
+            GGML_BF16_FMA_HI(sum[3], vx1, vy1);
+        }
+        GGML_F32x4_REDUCE_4(sumf, sum[0], sum[1], sum[2], sum[3]);
+    }
+#endif
+
     for (; i < n; ++i) {
         sumf += (ggml_float)(GGML_BF16_TO_FP32(x[i]) * GGML_BF16_TO_FP32(y[i]));
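
The POWER9 path above processes 16 bf16 elements per iteration: each pair of
8-element bf16 vectors is widened to f32 by merging with a zero vector (a bf16
value is exactly the upper 16 bits of the corresponding IEEE-754 f32), the
products are accumulated into four independent f32 accumulators, and the
accumulators are collapsed once at the end by GGML_F32x4_REDUCE_4. A minimal
scalar sketch of the same computation, handy when checking the vector path
against a reference result; the names bf16_to_f32_ref and vec_dot_bf16_ref are
illustrative only and not part of ggml:

    #include <stdint.h>
    #include <string.h>

    /* Widen one bf16 value (stored as uint16_t) to float by placing its 16
     * bits in the upper half of a 32-bit word; this is the scalar equivalent
     * of the merge-with-zeros used by GGML_BF16_TO_F32_HI/LO. */
    static inline float bf16_to_f32_ref(uint16_t h) {
        uint32_t bits = (uint32_t) h << 16;
        float f;
        memcpy(&f, &bits, sizeof(f));
        return f;
    }

    /* Reference dot product over n bf16 elements; the float product is
     * accumulated into a double, as in the scalar tail loop of
     * ggml_vec_dot_bf16. */
    static double vec_dot_bf16_ref(int n, const uint16_t * x, const uint16_t * y) {
        double sum = 0.0;
        for (int i = 0; i < n; ++i) {
            sum += (double) (bf16_to_f32_ref(x[i]) * bf16_to_f32_ref(y[i]));
        }
        return sum;
    }

Note that the vectorized path reduces its partial sums in f32 before the final
double-precision add, so results may differ from this reference by ordinary
floating-point rounding.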