ggml : add native AVX512-FP16 support for F16 operations (#20529)
The overall benchmark speed remains almost the same because the CPU now
computes faster than the RAM can deliver the data. (See the perf stat
results below, showing about 2.7 billion fewer instructions executed.)
Also note that this path will only be enabled for native builds or with
custom compiler flags.
now:
```
Performance counter stats for 'build/bin/llama-bench -m Qwen3-0.6B-f16.gguf -p 512 -n 128':
189,073.52 msec task-clock # 14.658 CPUs utilized
404 context-switches # 2.137 /sec
19 cpu-migrations # 0.100 /sec
372,390 page-faults # 1.970 K/sec
310,877,195,595 instructions # 0.54 insn per cycle
581,071,530,602 cycles # 3.073 GHz
19,352,107,994 branches # 102.352 M/sec
48,304,438 branch-misses # 0.25% of all branches
84,998,431,152 L1-dcache-loads # 449.552 M/sec
12,186,410,279 L1-dcache-load-misses # 14.34% of all L1-dcache accesses
12.899358742 seconds time elapsed
187.823044000 seconds user
1.253416000 seconds sys
```
before:
```
Performance counter stats for 'build/bin/llama-bench -m Qwen3-0.6B-f16.gguf -p 512 -n 128':
190,594.56 msec task-clock # 14.652 CPUs utilized
436 context-switches # 2.288 /sec
22 cpu-migrations # 0.115 /sec
372,782 page-faults # 1.956 K/sec
313,574,921,966 instructions # 0.54 insn per cycle
586,064,970,425 cycles # 3.075 GHz
19,585,778,563 branches # 102.761 M/sec
48,437,488 branch-misses # 0.25% of all branches
86,219,336,628 L1-dcache-loads # 452.370 M/sec
12,232,085,771 L1-dcache-load-misses # 14.19% of all L1-dcache accesses
13.007923164 seconds time elapsed
189.395316000 seconds user
1.202612000 seconds sys
```
Signed-off-by: Adrien Gallouët <angt@huggingface.co>
This commit is contained in:
parent
f2c0dfb739
commit
d0b79aaa2f
|
|
@ -479,13 +479,51 @@ do { \
|
|||
|
||||
// F16 AVX512
|
||||
|
||||
// F16 AVX
|
||||
#if defined(__AVX512FP16__)
|
||||
|
||||
#define GGML_F16_STEP 128
|
||||
#define GGML_F16_EPR 32
|
||||
|
||||
#define GGML_F16x32 __m512h
|
||||
#define GGML_F16x32_ZERO _mm512_setzero_ph()
|
||||
#define GGML_F16x32_SET1(x) _mm512_set1_ph(__extension__(_Float16)(x))
|
||||
#define GGML_F16x32_LOAD(x) _mm512_loadu_ph(x)
|
||||
#define GGML_F16x32_STORE(x, y) _mm512_storeu_ph(x, y)
|
||||
#define GGML_F16x32_FMA(a, b, c) _mm512_fmadd_ph(b, c, a)
|
||||
#define GGML_F16x32_ADD _mm512_add_ph
|
||||
#define GGML_F16x32_MUL _mm512_mul_ph
|
||||
#define GGML_F16x32_REDUCE(res, x) \
|
||||
do { \
|
||||
int offset = GGML_F16_ARR >> 1; \
|
||||
for (int i = 0; i < offset; ++i) { \
|
||||
x[i] = _mm512_add_ph(x[i], x[offset+i]); \
|
||||
} \
|
||||
offset >>= 1; \
|
||||
for (int i = 0; i < offset; ++i) { \
|
||||
x[i] = _mm512_add_ph(x[i], x[offset+i]); \
|
||||
} \
|
||||
offset >>= 1; \
|
||||
for (int i = 0; i < offset; ++i) { \
|
||||
x[i] = _mm512_add_ph(x[i], x[offset+i]); \
|
||||
} \
|
||||
res = (ggml_float) _mm512_reduce_add_ph(x[0]); \
|
||||
} while (0)
|
||||
|
||||
#define GGML_F16_VEC GGML_F16x32
|
||||
#define GGML_F16_VEC_ZERO GGML_F16x32_ZERO
|
||||
#define GGML_F16_VEC_SET1 GGML_F16x32_SET1
|
||||
#define GGML_F16_VEC_LOAD(p, i) GGML_F16x32_LOAD(p)
|
||||
#define GGML_F16_VEC_STORE(p, r, i) GGML_F16x32_STORE(p, r[i])
|
||||
#define GGML_F16_VEC_FMA GGML_F16x32_FMA
|
||||
#define GGML_F16_VEC_ADD GGML_F16x32_ADD
|
||||
#define GGML_F16_VEC_MUL GGML_F16x32_MUL
|
||||
#define GGML_F16_VEC_REDUCE GGML_F16x32_REDUCE
|
||||
|
||||
#else // Fallback FP16 <-> FP32
|
||||
|
||||
#define GGML_F16_STEP 64
|
||||
#define GGML_F16_EPR 16
|
||||
|
||||
// Fallback when AVX512-FP16 is unavailable: perform F16 computations in F32
// (values are converted on load/store).
|
||||
|
||||
#define GGML_F32Cx16 __m512
|
||||
#define GGML_F32Cx16_ZERO _mm512_setzero_ps()
|
||||
#define GGML_F32Cx16_SET1(x) _mm512_set1_ps(x)
|
||||
|
|
@ -525,6 +563,8 @@ do { \
|
|||
#define GGML_F16_VEC_MUL GGML_F32Cx16_MUL
|
||||
|
||||
#define GGML_F16_VEC_REDUCE GGML_F32Cx16_REDUCE
|
||||
|
||||
#endif // __AVX512FP16__
|
||||
#elif defined(__AVX__)
|
||||
|
||||
#define GGML_SIMD
|
||||
|
|
|
|||
Loading…
Reference in New Issue