From d0b79aaa2f6e7b7d3c26b1845b43cef158697540 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrien=20Gallou=C3=ABt?= Date: Sat, 14 Mar 2026 10:06:14 +0100 Subject: [PATCH] ggml : add native AVX512-FP16 support for F16 operations (#20529) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The overall benchmark speed remains almost the same because the CPU is now calculating faster than the RAM can deliver the data. (See perf stat results below showing 2.7 billion fewer instructions). Also note that this path will only be enabled for native builds or with custom flags. now: ``` Performance counter stats for 'build/bin/llama-bench -m Qwen3-0.6B-f16.gguf -p 512 -n 128': 189,073.52 msec task-clock # 14.658 CPUs utilized 404 context-switches # 2.137 /sec 19 cpu-migrations # 0.100 /sec 372,390 page-faults # 1.970 K/sec 310,877,195,595 instructions # 0.54 insn per cycle 581,071,530,602 cycles # 3.073 GHz 19,352,107,994 branches # 102.352 M/sec 48,304,438 branch-misses # 0.25% of all branches 84,998,431,152 L1-dcache-loads # 449.552 M/sec 12,186,410,279 L1-dcache-load-misses # 14.34% of all L1-dcache accesses 12.899358742 seconds time elapsed 187.823044000 seconds user 1.253416000 seconds sys ``` before: ``` Performance counter stats for 'build/bin/llama-bench -m Qwen3-0.6B-f16.gguf -p 512 -n 128': 190,594.56 msec task-clock # 14.652 CPUs utilized 436 context-switches # 2.288 /sec 22 cpu-migrations # 0.115 /sec 372,782 page-faults # 1.956 K/sec 313,574,921,966 instructions # 0.54 insn per cycle 586,064,970,425 cycles # 3.075 GHz 19,585,778,563 branches # 102.761 M/sec 48,437,488 branch-misses # 0.25% of all branches 86,219,336,628 L1-dcache-loads # 452.370 M/sec 12,232,085,771 L1-dcache-load-misses # 14.19% of all L1-dcache accesses 13.007923164 seconds time elapsed 189.395316000 seconds user 1.202612000 seconds sys ``` Signed-off-by: Adrien Gallouët --- ggml/src/ggml-cpu/simd-mappings.h | 46 +++++++++++++++++++++++++++++-- 1 file changed,
43 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-cpu/simd-mappings.h b/ggml/src/ggml-cpu/simd-mappings.h index 22de55700d..0deda93098 100644 --- a/ggml/src/ggml-cpu/simd-mappings.h +++ b/ggml/src/ggml-cpu/simd-mappings.h @@ -479,13 +479,51 @@ do { \ // F16 AVX512 -// F16 AVX +#if defined(__AVX512FP16__) + +#define GGML_F16_STEP 128 +#define GGML_F16_EPR 32 + +#define GGML_F16x32 __m512h +#define GGML_F16x32_ZERO _mm512_setzero_ph() +#define GGML_F16x32_SET1(x) _mm512_set1_ph(__extension__(_Float16)(x)) +#define GGML_F16x32_LOAD(x) _mm512_loadu_ph(x) +#define GGML_F16x32_STORE(x, y) _mm512_storeu_ph(x, y) +#define GGML_F16x32_FMA(a, b, c) _mm512_fmadd_ph(b, c, a) +#define GGML_F16x32_ADD _mm512_add_ph +#define GGML_F16x32_MUL _mm512_mul_ph +#define GGML_F16x32_REDUCE(res, x) \ +do { \ + int offset = GGML_F16_ARR >> 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = _mm512_add_ph(x[i], x[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = _mm512_add_ph(x[i], x[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = _mm512_add_ph(x[i], x[offset+i]); \ + } \ + res = (ggml_float) _mm512_reduce_add_ph(x[0]); \ +} while (0) + +#define GGML_F16_VEC GGML_F16x32 +#define GGML_F16_VEC_ZERO GGML_F16x32_ZERO +#define GGML_F16_VEC_SET1 GGML_F16x32_SET1 +#define GGML_F16_VEC_LOAD(p, i) GGML_F16x32_LOAD(p) +#define GGML_F16_VEC_STORE(p, r, i) GGML_F16x32_STORE(p, r[i]) +#define GGML_F16_VEC_FMA GGML_F16x32_FMA +#define GGML_F16_VEC_ADD GGML_F16x32_ADD +#define GGML_F16_VEC_MUL GGML_F16x32_MUL +#define GGML_F16_VEC_REDUCE GGML_F16x32_REDUCE + +#else // Fallback FP16 <-> FP32 #define GGML_F16_STEP 64 #define GGML_F16_EPR 16 -// AVX512 has FP16 extension (AVX512_FP16) but I don't have it on my machine so I use FP32 instead - #define GGML_F32Cx16 __m512 #define GGML_F32Cx16_ZERO _mm512_setzero_ps() #define GGML_F32Cx16_SET1(x) _mm512_set1_ps(x) @@ -525,6 +563,8 @@ do { \ #define GGML_F16_VEC_MUL 
GGML_F32Cx16_MUL #define GGML_F16_VEC_REDUCE GGML_F32Cx16_REDUCE + +#endif // __AVX512FP16__ #elif defined(__AVX__) #define GGML_SIMD