ggml : add native AVX512-FP16 support for F16 operations (#20529)

The overall benchmark speed remains almost the same because the CPU is
now calculating faster than the RAM can deliver the data. (See perf stat
results below showing 2.7 billion fewer instructions).

Also note that this path will be only enabled for native build or with
custom flags.

now:
```
 Performance counter stats for 'build/bin/llama-bench -m Qwen3-0.6B-f16.gguf -p 512 -n 128':

        189,073.52 msec task-clock                       #   14.658 CPUs utilized
               404      context-switches                 #    2.137 /sec
                19      cpu-migrations                   #    0.100 /sec
           372,390      page-faults                      #    1.970 K/sec
   310,877,195,595      instructions                     #    0.54  insn per cycle
   581,071,530,602      cycles                           #    3.073 GHz
    19,352,107,994      branches                         #  102.352 M/sec
        48,304,438      branch-misses                    #    0.25% of all branches
    84,998,431,152      L1-dcache-loads                  #  449.552 M/sec
    12,186,410,279      L1-dcache-load-misses            #   14.34% of all L1-dcache accesses

      12.899358742 seconds time elapsed

     187.823044000 seconds user
       1.253416000 seconds sys
```

before:
```
 Performance counter stats for 'build/bin/llama-bench -m Qwen3-0.6B-f16.gguf -p 512 -n 128':

        190,594.56 msec task-clock                       #   14.652 CPUs utilized
               436      context-switches                 #    2.288 /sec
                22      cpu-migrations                   #    0.115 /sec
           372,782      page-faults                      #    1.956 K/sec
   313,574,921,966      instructions                     #    0.54  insn per cycle
   586,064,970,425      cycles                           #    3.075 GHz
    19,585,778,563      branches                         #  102.761 M/sec
        48,437,488      branch-misses                    #    0.25% of all branches
    86,219,336,628      L1-dcache-loads                  #  452.370 M/sec
    12,232,085,771      L1-dcache-load-misses            #   14.19% of all L1-dcache accesses

      13.007923164 seconds time elapsed

     189.395316000 seconds user
       1.202612000 seconds sys
```

Signed-off-by: Adrien Gallouët <angt@huggingface.co>
This commit is contained in:
Adrien Gallouët 2026-03-14 10:06:14 +01:00 committed by GitHub
parent f2c0dfb739
commit d0b79aaa2f
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
1 changed files with 43 additions and 3 deletions

View File

@ -479,13 +479,51 @@ do { \
// F16 AVX512
// F16 AVX
#if defined(__AVX512FP16__)
#define GGML_F16_STEP 128
#define GGML_F16_EPR 32
#define GGML_F16x32 __m512h
#define GGML_F16x32_ZERO _mm512_setzero_ph()
#define GGML_F16x32_SET1(x) _mm512_set1_ph(__extension__(_Float16)(x))
#define GGML_F16x32_LOAD(x) _mm512_loadu_ph(x)
#define GGML_F16x32_STORE(x, y) _mm512_storeu_ph(x, y)
#define GGML_F16x32_FMA(a, b, c) _mm512_fmadd_ph(b, c, a)
#define GGML_F16x32_ADD _mm512_add_ph
#define GGML_F16x32_MUL _mm512_mul_ph
#define GGML_F16x32_REDUCE(res, x) \
do { \
int offset = GGML_F16_ARR >> 1; \
for (int i = 0; i < offset; ++i) { \
x[i] = _mm512_add_ph(x[i], x[offset+i]); \
} \
offset >>= 1; \
for (int i = 0; i < offset; ++i) { \
x[i] = _mm512_add_ph(x[i], x[offset+i]); \
} \
offset >>= 1; \
for (int i = 0; i < offset; ++i) { \
x[i] = _mm512_add_ph(x[i], x[offset+i]); \
} \
res = (ggml_float) _mm512_reduce_add_ph(x[0]); \
} while (0)
#define GGML_F16_VEC GGML_F16x32
#define GGML_F16_VEC_ZERO GGML_F16x32_ZERO
#define GGML_F16_VEC_SET1 GGML_F16x32_SET1
#define GGML_F16_VEC_LOAD(p, i) GGML_F16x32_LOAD(p)
#define GGML_F16_VEC_STORE(p, r, i) GGML_F16x32_STORE(p, r[i])
#define GGML_F16_VEC_FMA GGML_F16x32_FMA
#define GGML_F16_VEC_ADD GGML_F16x32_ADD
#define GGML_F16_VEC_MUL GGML_F16x32_MUL
#define GGML_F16_VEC_REDUCE GGML_F16x32_REDUCE
#else // Fallback FP16 <-> FP32
#define GGML_F16_STEP 64
#define GGML_F16_EPR 16
// AVX512 has FP16 extension (AVX512_FP16) but I don't have it on my machine so I use FP32 instead
#define GGML_F32Cx16 __m512
#define GGML_F32Cx16_ZERO _mm512_setzero_ps()
#define GGML_F32Cx16_SET1(x) _mm512_set1_ps(x)
@ -525,6 +563,8 @@ do { \
#define GGML_F16_VEC_MUL GGML_F32Cx16_MUL
#define GGML_F16_VEC_REDUCE GGML_F32Cx16_REDUCE
#endif // __AVX512FP16__
#elif defined(__AVX__)
#define GGML_SIMD