# DGX Spark

## System info

```
$ uname --all
Linux spark-17ed 6.11.0-1016-nvidia #16-Ubuntu SMP PREEMPT_DYNAMIC Sun Sep 21 16:52:46 UTC 2025 aarch64 aarch64 aarch64 GNU/Linux

$ g++ --version
g++ (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0

$ nvidia-smi
Thu Feb  5 13:49:40 2026
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 580.95.05              Driver Version: 580.95.05      CUDA Version: 13.0     |
+-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|=========================================+========================+======================|
|   0  NVIDIA GB10                    On  |   0000000F:01:00.0 Off |                  N/A |
| N/A   47C    P0             13W /  N/A  | Not Supported          |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
```
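
The benchmark binaries below were built from llama.cpp with the CUDA backend. For reference, a minimal CUDA build on this platform would look roughly like the sketch below; the exact configure flags used for these runs are not recorded in this file.

```sh
# Minimal CUDA build sketch (standard llama.cpp flags; any extra
# machine-specific tuning used for these benchmarks is not recorded here).
cmake -B build -DGGML_CUDA=ON
cmake --build build --config Release -j
```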

## ggml-org/gpt-oss-20b-GGUF

Model: https://huggingface.co/ggml-org/gpt-oss-20b-GGUF

- llama-batched-bench

```
main: n_kv_max = 270336, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_shared = 0, is_tg_separate = 0, n_gpu_layers = -1, n_threads = 20, n_threads_batch = 20
```
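
The header above maps onto llama-batched-bench's standard flags. A plausible reconstruction of the invocation behind the table below (the model path is a placeholder; the exact command line is not recorded in this file):

```sh
# Reconstructed sketch: -c/-b/-ub mirror n_kv_max/n_batch/n_ubatch above;
# -npp/-ntg/-npl produce the PP/TG/B combinations in the table below.
# The flash-attention flag spelling varies across builds (-fa / -fa 1 / -fa on).
llama-batched-bench -m ./models/gpt-oss-20b.gguf \
    -c 270336 -b 2048 -ub 2048 -fa 1 \
    -npp 512,4096,8192 -ntg 32 -npl 1,2,4,8,16,32
```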

| PP | TG | B | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s | T s | S t/s |
|---:|---:|--:|-----:|-------:|---------:|-------:|---------:|----:|------:|
| 512 | 32 | 1 | 544 | 0.270 | 1895.57 | 0.399 | 80.13 | 0.669 | 812.60 |
| 512 | 32 | 2 | 1088 | 0.230 | 4451.23 | 0.583 | 109.71 | 0.813 | 1337.56 |
| 512 | 32 | 4 | 2176 | 0.437 | 4688.87 | 0.820 | 156.03 | 1.257 | 1730.91 |
| 512 | 32 | 8 | 4352 | 0.863 | 4744.23 | 0.942 | 271.79 | 1.805 | 2410.73 |
| 512 | 32 | 16 | 8704 | 1.725 | 4748.19 | 1.173 | 436.38 | 2.899 | 3002.85 |
| 512 | 32 | 32 | 17408 | 3.437 | 4767.38 | 1.503 | 681.49 | 4.939 | 3524.40 |
| 4096 | 32 | 1 | 4128 | 0.907 | 4513.91 | 0.407 | 78.54 | 1.315 | 3139.56 |
| 4096 | 32 | 2 | 8256 | 1.796 | 4560.42 | 0.625 | 102.37 | 2.422 | 3409.45 |
| 4096 | 32 | 4 | 16512 | 3.596 | 4555.66 | 0.888 | 144.11 | 4.485 | 3681.93 |
| 4096 | 32 | 8 | 33024 | 7.184 | 4561.44 | 1.098 | 233.11 | 8.282 | 3987.51 |
| 4096 | 32 | 16 | 66048 | 14.369 | 4560.82 | 1.503 | 340.74 | 15.872 | 4161.30 |
| 4096 | 32 | 32 | 132096 | 28.760 | 4557.52 | 2.162 | 473.59 | 30.922 | 4271.95 |
| 8192 | 32 | 1 | 8224 | 1.859 | 4405.59 | 0.430 | 74.36 | 2.290 | 3591.61 |
| 8192 | 32 | 2 | 16448 | 3.698 | 4430.02 | 0.656 | 97.59 | 4.354 | 3777.47 |
| 8192 | 32 | 4 | 32896 | 7.403 | 4426.10 | 0.957 | 133.82 | 8.360 | 3934.97 |
| 8192 | 32 | 8 | 65792 | 14.802 | 4427.63 | 1.222 | 209.44 | 16.024 | 4105.87 |
| 8192 | 32 | 16 | 131584 | 29.596 | 4428.67 | 1.741 | 294.13 | 31.337 | 4199.00 |
| 8192 | 32 | 32 | 263168 | 59.169 | 4430.42 | 2.619 | 390.92 | 61.789 | 4259.17 |

- llama-bench

| model | size | params | backend | ngl | n_ubatch | fa | mmap | dio | test | t/s |
| ----- | ---: | -----: | ------- | --: | -------: | -: | ---: | --: | ---- | ---: |
| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 | 4505.82 ± 12.90 |
| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 | 83.43 ± 0.59 |
| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d4096 | 4158.34 ± 18.84 |
| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d4096 | 79.22 ± 0.60 |
| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d8192 | 3993.81 ± 17.55 |
| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d8192 | 75.22 ± 1.05 |
| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d16384 | 3449.98 ± 12.13 |
| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d16384 | 70.36 ± 0.37 |
| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d32768 | 2689.42 ± 18.89 |
| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d32768 | 61.65 ± 0.30 |
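
The llama-bench rows above (and in the sections that follow) fit a single sweep over context depth. A sketch of the presumed invocation, inferred from the column headers; the dio column indicates direct I/O was enabled, but the exact flag for it is not recorded here.

```sh
# Inferred sketch: -p 2048 / -n 32 give the pp2048/tg32 tests and
# -d adds the "@ dN" context-depth variants; -ub/-fa/-mmp mirror the
# n_ubatch/fa/mmap columns. The model path is a placeholder.
llama-bench -m ./models/gpt-oss-20b.gguf \
    -ub 2048 -fa 1 -mmp 0 \
    -p 2048 -n 32 -d 0,4096,8192,16384,32768
```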

build: 11fb327bf (7941)

## ggml-org/gpt-oss-120b-GGUF

Model: https://huggingface.co/ggml-org/gpt-oss-120b-GGUF

- llama-batched-bench

```
main: n_kv_max = 270336, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_shared = 0, is_tg_separate = 0, n_gpu_layers = -1, n_threads = 20, n_threads_batch = 20
```

| PP | TG | B | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s | T s | S t/s |
|---:|---:|--:|-----:|-------:|---------:|-------:|---------:|----:|------:|
| 512 | 32 | 1 | 544 | 0.445 | 1151.80 | 0.560 | 57.14 | 1.005 | 541.53 |
| 512 | 32 | 2 | 1088 | 0.472 | 2169.85 | 0.874 | 73.27 | 1.345 | 808.65 |
| 512 | 32 | 4 | 2176 | 0.826 | 2480.33 | 1.299 | 98.51 | 2.125 | 1023.94 |
| 512 | 32 | 8 | 4352 | 1.644 | 2491.67 | 1.608 | 159.18 | 3.252 | 1338.20 |
| 512 | 32 | 16 | 8704 | 3.292 | 2488.35 | 2.117 | 241.85 | 5.409 | 1609.13 |
| 512 | 32 | 32 | 17408 | 6.604 | 2481.07 | 2.898 | 353.31 | 9.502 | 1832.04 |
| 4096 | 32 | 1 | 4128 | 1.698 | 2412.65 | 0.580 | 55.21 | 2.277 | 1812.66 |
| 4096 | 32 | 2 | 8256 | 3.399 | 2409.88 | 0.934 | 68.53 | 4.333 | 1905.27 |
| 4096 | 32 | 4 | 16512 | 6.823 | 2401.21 | 1.411 | 90.72 | 8.234 | 2005.30 |
| 4096 | 32 | 8 | 33024 | 13.574 | 2413.97 | 1.841 | 139.07 | 15.415 | 2142.31 |
| 4096 | 32 | 16 | 66048 | 27.176 | 2411.52 | 2.609 | 196.26 | 29.785 | 2217.49 |
| 4096 | 32 | 32 | 132096 | 54.359 | 2411.23 | 3.905 | 262.20 | 58.264 | 2267.19 |
| 8192 | 32 | 1 | 8224 | 3.491 | 2346.81 | 0.613 | 52.23 | 4.103 | 2004.21 |
| 8192 | 32 | 2 | 16448 | 6.939 | 2361.03 | 0.981 | 65.21 | 7.921 | 2076.56 |
| 8192 | 32 | 4 | 32896 | 13.888 | 2359.40 | 1.511 | 84.71 | 15.399 | 2136.21 |
| 8192 | 32 | 8 | 65792 | 27.756 | 2361.18 | 2.034 | 125.86 | 29.790 | 2208.56 |
| 8192 | 32 | 16 | 131584 | 55.554 | 2359.34 | 3.021 | 169.49 | 58.575 | 2246.41 |
| 8192 | 32 | 32 | 263168 | 111.036 | 2360.89 | 4.537 | 225.72 | 115.573 | 2277.08 |

- llama-bench

| model | size | params | backend | ngl | n_ubatch | fa | mmap | dio | test | t/s |
| ----- | ---: | -----: | ------- | --: | -------: | -: | ---: | --: | ---- | ---: |
| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 | 2443.91 ± 7.47 |
| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 | 58.72 ± 0.20 |
| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d4096 | 2309.84 ± 3.63 |
| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d4096 | 55.67 ± 0.35 |
| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d8192 | 2216.68 ± 10.16 |
| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d8192 | 52.87 ± 0.43 |
| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d16384 | 1956.31 ± 6.39 |
| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d16384 | 49.45 ± 0.20 |
| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d32768 | 1567.08 ± 11.79 |
| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d32768 | 42.76 ± 0.14 |

build: 11fb327bf (7941)

## ggml-org/Qwen3-Coder-30B-A3B-Instruct-Q8_0-GGUF

Model: https://huggingface.co/ggml-org/Qwen3-Coder-30B-A3B-Instruct-Q8_0-GGUF

- llama-batched-bench

```
main: n_kv_max = 270336, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_shared = 0, is_tg_separate = 0, n_gpu_layers = -1, n_threads = 20, n_threads_batch = 20
```

| PP | TG | B | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s | T s | S t/s |
|---:|---:|--:|-----:|-------:|---------:|-------:|---------:|----:|------:|
| 512 | 32 | 1 | 544 | 0.393 | 1303.73 | 0.548 | 58.36 | 0.941 | 578.10 |
| 512 | 32 | 2 | 1088 | 0.387 | 2648.68 | 0.910 | 70.35 | 1.296 | 839.27 |
| 512 | 32 | 4 | 2176 | 0.659 | 3107.63 | 1.302 | 98.33 | 1.961 | 1109.77 |
| 512 | 32 | 8 | 4352 | 1.322 | 3099.35 | 1.669 | 153.42 | 2.990 | 1455.43 |
| 512 | 32 | 16 | 8704 | 2.639 | 3104.63 | 2.212 | 231.44 | 4.851 | 1794.32 |
| 512 | 32 | 32 | 17408 | 5.284 | 3100.80 | 2.955 | 346.53 | 8.239 | 2112.93 |
| 4096 | 32 | 1 | 4128 | 1.417 | 2890.36 | 0.598 | 53.51 | 2.015 | 2048.45 |
| 4096 | 32 | 2 | 8256 | 2.829 | 2895.62 | 1.019 | 62.82 | 3.848 | 2145.60 |
| 4096 | 32 | 4 | 16512 | 5.656 | 2896.96 | 1.528 | 83.79 | 7.183 | 2298.71 |
| 4096 | 32 | 8 | 33024 | 11.338 | 2890.02 | 2.127 | 120.36 | 13.465 | 2452.53 |
| 4096 | 32 | 16 | 66048 | 22.709 | 2885.96 | 3.104 | 164.97 | 25.812 | 2558.79 |
| 4096 | 32 | 32 | 132096 | 45.301 | 2893.35 | 4.723 | 216.80 | 50.024 | 2640.63 |
| 8192 | 32 | 1 | 8224 | 3.022 | 2711.09 | 0.678 | 47.20 | 3.700 | 2222.89 |
| 8192 | 32 | 2 | 16448 | 6.039 | 2713.01 | 1.149 | 55.70 | 7.188 | 2288.21 |
| 8192 | 32 | 4 | 32896 | 12.050 | 2719.35 | 1.785 | 71.69 | 13.835 | 2377.67 |
| 8192 | 32 | 8 | 65792 | 24.113 | 2717.90 | 2.629 | 97.39 | 26.741 | 2460.31 |
| 8192 | 32 | 16 | 131584 | 48.178 | 2720.58 | 4.099 | 124.91 | 52.277 | 2517.06 |
| 8192 | 32 | 32 | 263168 | 96.401 | 2719.31 | 6.696 | 152.93 | 103.097 | 2552.63 |

- llama-bench

| model | size | params | backend | ngl | n_ubatch | fa | mmap | dio | test | t/s |
| ----- | ---: | -----: | ------- | --: | -------: | -: | ---: | --: | ---- | ---: |
| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 | 2986.97 ± 18.87 |
| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 | 61.06 ± 0.23 |
| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d4096 | 2633.45 ± 6.26 |
| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d4096 | 54.77 ± 0.28 |
| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d8192 | 2354.14 ± 3.84 |
| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d8192 | 48.02 ± 0.40 |
| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d16384 | 1908.86 ± 4.25 |
| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d16384 | 40.23 ± 0.10 |
| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d32768 | 1348.17 ± 2.00 |
| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d32768 | 30.21 ± 0.04 |

build: 11fb327bf (7941)

## ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF

Model: https://huggingface.co/ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF

- llama-batched-bench

```
main: n_kv_max = 270336, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_shared = 0, is_tg_separate = 0, n_gpu_layers = -1, n_threads = 20, n_threads_batch = 20
```

| PP | TG | B | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s | T s | S t/s |
|---:|---:|--:|-----:|-------:|---------:|-------:|---------:|----:|------:|
| 512 | 32 | 1 | 544 | 0.212 | 2420.12 | 1.100 | 29.10 | 1.311 | 414.85 |
| 512 | 32 | 2 | 1088 | 0.428 | 2393.89 | 1.185 | 54.00 | 1.613 | 674.56 |
| 512 | 32 | 4 | 2176 | 0.894 | 2290.41 | 1.229 | 104.17 | 2.123 | 1025.02 |
| 512 | 32 | 8 | 4352 | 1.758 | 2330.36 | 1.319 | 194.15 | 3.076 | 1414.70 |
| 512 | 32 | 16 | 8704 | 3.508 | 2335.21 | 1.543 | 331.90 | 5.051 | 1723.33 |
| 512 | 32 | 32 | 17408 | 7.035 | 2328.93 | 1.738 | 589.21 | 8.773 | 1984.29 |
| 4096 | 32 | 1 | 4128 | 1.831 | 2237.25 | 1.125 | 28.44 | 2.956 | 1396.42 |
| 4096 | 32 | 2 | 8256 | 3.642 | 2249.48 | 1.253 | 51.07 | 4.895 | 1686.64 |
| 4096 | 32 | 4 | 16512 | 7.274 | 2252.26 | 1.380 | 92.72 | 8.655 | 1907.81 |
| 4096 | 32 | 8 | 33024 | 14.576 | 2248.09 | 1.617 | 158.29 | 16.193 | 2039.37 |
| 4096 | 32 | 16 | 66048 | 29.138 | 2249.17 | 2.081 | 246.01 | 31.219 | 2115.63 |
| 4096 | 32 | 32 | 132096 | 58.275 | 2249.19 | 2.814 | 363.87 | 61.089 | 2162.34 |
| 8192 | 32 | 1 | 8224 | 3.757 | 2180.26 | 1.184 | 27.03 | 4.941 | 1664.37 |
| 8192 | 32 | 2 | 16448 | 7.522 | 2178.05 | 1.341 | 47.73 | 8.863 | 1855.77 |
| 8192 | 32 | 4 | 32896 | 15.043 | 2178.25 | 1.548 | 82.69 | 16.591 | 1982.74 |
| 8192 | 32 | 8 | 65792 | 30.111 | 2176.49 | 1.937 | 132.13 | 32.048 | 2052.90 |
| 8192 | 32 | 16 | 131584 | 60.405 | 2169.90 | 2.706 | 189.21 | 63.111 | 2084.97 |
| 8192 | 32 | 32 | 263168 | 120.439 | 2176.58 | 3.993 | 256.46 | 124.432 | 2114.96 |

- llama-bench

| model | size | params | backend | ngl | n_ubatch | fa | mmap | dio | test | t/s |
| ----- | ---: | -----: | ------- | --: | -------: | -: | ---: | --: | ---- | ---: |
| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 | 2250.28 ± 6.41 |
| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 | 29.43 ± 0.02 |
| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d4096 | 2100.19 ± 8.96 |
| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d4096 | 28.61 ± 0.02 |
| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d8192 | 2007.56 ± 4.16 |
| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d8192 | 27.38 ± 0.09 |
| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d16384 | 1779.11 ± 6.42 |
| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d16384 | 25.72 ± 0.03 |
| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d32768 | 1471.23 ± 1.71 |
| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d32768 | 22.51 ± 0.02 |

build: 11fb327bf (7941)

## ggml-org/gemma-3-4b-it-qat-GGUF

Model: https://huggingface.co/ggml-org/gemma-3-4b-it-qat-GGUF

- llama-batched-bench

```
main: n_kv_max = 270336, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_shared = 0, is_tg_separate = 0, n_gpu_layers = -1, n_threads = 20, n_threads_batch = 20
```

| PP | TG | B | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s | T s | S t/s |
|---:|---:|--:|-----:|-------:|---------:|-------:|---------:|----:|------:|
| 512 | 32 | 1 | 544 | 0.092 | 5566.97 | 0.412 | 77.63 | 0.504 | 1078.95 |
| 512 | 32 | 2 | 1088 | 0.161 | 6345.67 | 0.522 | 122.70 | 0.683 | 1593.06 |
| 512 | 32 | 4 | 2176 | 0.325 | 6309.87 | 0.562 | 227.68 | 0.887 | 2453.87 |
| 512 | 32 | 8 | 4352 | 0.643 | 6374.42 | 0.685 | 373.67 | 1.328 | 3277.94 |
| 512 | 32 | 16 | 8704 | 1.277 | 6413.64 | 0.915 | 559.47 | 2.192 | 3970.01 |
| 512 | 32 | 32 | 17408 | 2.518 | 6506.57 | 1.249 | 819.61 | 3.767 | 4620.64 |
| 4096 | 32 | 1 | 4128 | 0.674 | 6079.68 | 0.453 | 70.60 | 1.127 | 3662.88 |
| 4096 | 32 | 2 | 8256 | 1.335 | 6137.82 | 0.627 | 102.03 | 1.962 | 4208.11 |
| 4096 | 32 | 4 | 16512 | 2.657 | 6167.35 | 0.749 | 170.92 | 3.405 | 4848.71 |
| 4096 | 32 | 8 | 33024 | 5.307 | 6173.91 | 0.974 | 262.89 | 6.281 | 5257.53 |
| 4096 | 32 | 16 | 66048 | 10.610 | 6176.96 | 1.379 | 371.42 | 11.988 | 5509.40 |
| 4096 | 32 | 32 | 132096 | 21.213 | 6178.89 | 2.122 | 482.50 | 23.335 | 5660.82 |
| 8192 | 32 | 1 | 8224 | 1.359 | 6027.34 | 0.467 | 68.52 | 1.826 | 4503.48 |
| 8192 | 32 | 2 | 16448 | 2.699 | 6069.68 | 0.653 | 98.03 | 3.352 | 4906.68 |
| 8192 | 32 | 4 | 32896 | 5.366 | 6106.74 | 0.818 | 156.55 | 6.184 | 5319.96 |
| 8192 | 32 | 8 | 65792 | 10.755 | 6093.50 | 1.174 | 218.04 | 11.929 | 5515.22 |
| 8192 | 32 | 16 | 131584 | 21.484 | 6100.82 | 1.829 | 279.90 | 23.314 | 5644.11 |
| 8192 | 32 | 32 | 263168 | 42.950 | 6103.40 | 3.058 | 334.91 | 46.008 | 5720.05 |

- llama-bench

| model | size | params | backend | ngl | n_ubatch | fa | mmap | dio | test | t/s |
| ----- | ---: | -----: | ------- | --: | -------: | -: | ---: | --: | ---- | ---: |
| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 | 5948.74 ± 10.61 |
| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 | 81.05 ± 0.20 |
| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d4096 | 5652.69 ± 34.29 |
| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d4096 | 76.37 ± 0.58 |
| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d8192 | 5509.57 ± 40.69 |
| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d8192 | 71.61 ± 0.80 |
| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d16384 | 5340.86 ± 36.92 |
| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d16384 | 70.89 ± 0.34 |
| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d32768 | 5023.30 ± 13.52 |
| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d32768 | 62.28 ± 0.30 |

build: 11fb327bf (7941)

## ggml-org/GLM-4.7-Flash-GGUF

Model: https://huggingface.co/ggml-org/GLM-4.7-Flash-GGUF

- llama-batched-bench

```
main: n_kv_max = 270336, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_shared = 0, is_tg_separate = 0, n_gpu_layers = -1, n_threads = 20, n_threads_batch = 20
```

| PP | TG | B | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s | T s | S t/s |
|---:|---:|--:|-----:|-------:|---------:|-------:|---------:|----:|------:|
| 512 | 32 | 1 | 544 | 0.433 | 1181.83 | 0.693 | 46.16 | 1.126 | 482.94 |
| 512 | 32 | 2 | 1088 | 0.439 | 2334.46 | 1.034 | 61.89 | 1.473 | 738.75 |
| 512 | 32 | 4 | 2176 | 0.772 | 2654.46 | 1.459 | 87.76 | 2.230 | 975.77 |
| 512 | 32 | 8 | 4352 | 1.541 | 2658.78 | 2.043 | 125.31 | 3.583 | 1214.47 |
| 512 | 32 | 16 | 8704 | 3.083 | 2656.91 | 2.675 | 191.42 | 5.758 | 1511.62 |
| 512 | 32 | 32 | 17408 | 6.159 | 2660.12 | 3.615 | 283.24 | 9.774 | 1780.98 |
| 4096 | 32 | 1 | 4128 | 1.915 | 2139.30 | 0.725 | 44.14 | 2.640 | 1563.83 |
| 4096 | 32 | 2 | 8256 | 3.834 | 2136.40 | 1.119 | 57.21 | 4.953 | 1666.81 |
| 4096 | 32 | 4 | 16512 | 7.636 | 2145.72 | 1.631 | 78.49 | 9.266 | 1781.93 |
| 4096 | 32 | 8 | 33024 | 15.295 | 2142.40 | 2.344 | 109.21 | 17.639 | 1872.20 |
| 4096 | 32 | 16 | 66048 | 30.573 | 2143.62 | 3.773 | 135.70 | 34.346 | 1923.04 |
| 4096 | 32 | 32 | 132096 | 61.282 | 2138.82 | 5.795 | 176.71 | 67.077 | 1969.31 |
| 8192 | 32 | 1 | 8224 | 4.510 | 1816.24 | 0.760 | 42.11 | 5.270 | 1560.44 |
| 8192 | 32 | 2 | 16448 | 9.036 | 1813.19 | 1.206 | 53.06 | 10.242 | 1605.91 |
| 8192 | 32 | 4 | 32896 | 18.070 | 1813.43 | 1.783 | 71.80 | 19.852 | 1657.03 |
| 8192 | 32 | 8 | 65792 | 36.125 | 1814.15 | 2.635 | 97.14 | 38.760 | 1697.41 |
| 8192 | 32 | 16 | 131584 | 72.367 | 1811.20 | 4.954 | 103.34 | 77.322 | 1701.77 |
| 8192 | 32 | 32 | 263168 | 144.501 | 1814.13 | 8.103 | 126.37 | 152.604 | 1724.51 |

- llama-bench

| model | size | params | backend | ngl | n_ubatch | fa | dio | test | t/s |
| ----- | ---: | -----: | ------- | --: | -------: | -: | --: | ---- | ---: |
| deepseek2 30B.A3B Q8_0 | 29.65 GiB | 29.94 B | CUDA | 99 | 2048 | 1 | 1 | pp2048 | 2364.18 ± 11.43 |
| deepseek2 30B.A3B Q8_0 | 29.65 GiB | 29.94 B | CUDA | 99 | 2048 | 1 | 1 | tg32 | 48.68 ± 0.12 |
| deepseek2 30B.A3B Q8_0 | 29.65 GiB | 29.94 B | CUDA | 99 | 2048 | 1 | 1 | pp2048 @ d4096 | 1684.13 ± 1.24 |
| deepseek2 30B.A3B Q8_0 | 29.65 GiB | 29.94 B | CUDA | 99 | 2048 | 1 | 1 | tg32 @ d4096 | 44.62 ± 0.22 |
| deepseek2 30B.A3B Q8_0 | 29.65 GiB | 29.94 B | CUDA | 99 | 2048 | 1 | 1 | pp2048 @ d8192 | 1314.68 ± 1.41 |
| deepseek2 30B.A3B Q8_0 | 29.65 GiB | 29.94 B | CUDA | 99 | 2048 | 1 | 1 | tg32 @ d8192 | 42.59 ± 0.11 |
| deepseek2 30B.A3B Q8_0 | 29.65 GiB | 29.94 B | CUDA | 99 | 2048 | 1 | 1 | pp2048 @ d16384 | 914.05 ± 3.32 |
| deepseek2 30B.A3B Q8_0 | 29.65 GiB | 29.94 B | CUDA | 99 | 2048 | 1 | 1 | tg32 @ d16384 | 38.72 ± 0.13 |
| deepseek2 30B.A3B Q8_0 | 29.65 GiB | 29.94 B | CUDA | 99 | 2048 | 1 | 1 | pp2048 @ d32768 | 567.20 ± 0.90 |
| deepseek2 30B.A3B Q8_0 | 29.65 GiB | 29.94 B | CUDA | 99 | 2048 | 1 | 1 | tg32 @ d32768 | 32.65 ± 0.09 |

build: 11fb327bf (7941)