From 5fb1bb99fe34ac59750f9255e1f226ecd74547bf Mon Sep 17 00:00:00 2001 From: Aaron Teo Date: Tue, 23 Sep 2025 14:36:40 +0800 Subject: [PATCH] ggml-cpu: expand to 2 blocks per loop Signed-off-by: Aaron Teo --- ggml/src/ggml-cpu/arch/s390/quants.c | 44 ++++++++++++++++++++++++++-- 1 file changed, 41 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-cpu/arch/s390/quants.c b/ggml/src/ggml-cpu/arch/s390/quants.c index 257963ee03..2638162bf1 100644 --- a/ggml/src/ggml-cpu/arch/s390/quants.c +++ b/ggml/src/ggml-cpu/arch/s390/quants.c @@ -284,11 +284,49 @@ void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo float32x4_t v_acc = vec_splats(0.0f); + for (; ib + 1 < nb; ib += 2) { + const block_mxfp4 * GGML_RESTRICT x0 = &x[ib + 0]; + const block_mxfp4 * GGML_RESTRICT x1 = &x[ib + 1]; + const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0]; + const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1]; + + const uint8x16_t v_x0 = vec_xl(0, x0->qs); + const uint8x16_t v_x1 = vec_xl(0, x1->qs); + + int8x16_t v_x0l = (int8x16_t)vec_and(v_x0, v_m); + int8x16_t v_x0h = (int8x16_t)vec_sr(v_x0, 4); + int8x16_t v_x1l = (int8x16_t)vec_and(v_x1, v_m); + int8x16_t v_x1h = (int8x16_t)vec_sr(v_x1, 4); + + v_x0l = vec_perm(v_k, v_k, (uchar8x16_t)v_x0l); + v_x0h = vec_perm(v_k, v_k, (uchar8x16_t)v_x0h); + v_x1l = vec_perm(v_k, v_k, (uchar8x16_t)v_x1l); + v_x1h = vec_perm(v_k, v_k, (uchar8x16_t)v_x1h); + + const int8x16_t v_y0l = vec_xl(0, y0->qs); + const int8x16_t v_y0h = vec_xl(QK8_0/2, y0->qs); + const int8x16_t v_y1l = vec_xl(0, y1->qs); + const int8x16_t v_y1h = vec_xl(QK8_0/2, y1->qs); + + const int32x4_t v_xy0 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x0l, v_y0l), v_x0h, v_y0h); + const int32x4_t v_xy1 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x1l, v_y1l), v_x1h, v_y1h); + + const float32x4_t v_xy0f = vec_float(v_xy0); + const float32x4_t v_xy1f = vec_float(v_xy1); + + const float32x4_t v_d0 = vec_splats(GGML_E8M0_TO_FP32_HALF(x0->e) * GGML_CPU_FP16_TO_FP32(y0->d)); + const float32x4_t v_d1 = vec_splats(GGML_E8M0_TO_FP32_HALF(x1->e) * GGML_CPU_FP16_TO_FP32(y1->d)); + + v_acc = vec_madd(v_xy0f, v_d0, v_acc); + v_acc = vec_madd(v_xy1f, v_d1, v_acc); + } + for (; ib < nb; ++ib) { - const block_mxfp4 * GGML_RESTRICT x0 = &x[ib]; - const block_q8_0 * GGML_RESTRICT y0 = &y[ib]; + const block_mxfp4 * GGML_RESTRICT x0 = &x[ib + 0]; + const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0]; const uint8x16_t v_x = vec_xl(0, x0->qs); + int8x16_t v_xl = (int8x16_t)vec_and(v_x, v_m); int8x16_t v_xh = (int8x16_t)vec_sr(v_x, 4); @@ -300,8 +338,8 @@ void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo const int32x4_t v_xy = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xl, v_yl), v_xh, v_yh); const float32x4_t v_xyf = vec_float(v_xy); - const float32x4_t v_d = vec_splats(GGML_E8M0_TO_FP32_HALF(x0->e) * GGML_CPU_FP16_TO_FP32(y0->d)); + const float32x4_t v_d = vec_splats(GGML_E8M0_TO_FP32_HALF(x0->e) * GGML_CPU_FP16_TO_FP32(y0->d)); v_acc = vec_madd(v_xyf, v_d, v_acc); }