# DGX Spark

## System info

```
$ uname --all
Linux spark-17ed 6.11.0-1016-nvidia #16-Ubuntu SMP PREEMPT_DYNAMIC Sun Sep 21 16:52:46 UTC 2025 aarch64 aarch64 aarch64 GNU/Linux

$ g++ --version
g++ (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0

$ nvidia-smi
Thu Feb  5 13:49:40 2026
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 580.95.05              Driver Version: 580.95.05      CUDA Version: 13.0     |
+-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|=========================================+========================+======================|
|   0  NVIDIA GB10                    On  |   0000000F:01:00.0 Off |                  N/A |
| N/A   47C    P0             13W /  N/A  | Not Supported          |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
```
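
The benchmark binaries below were built from llama.cpp with the CUDA backend. For reference, a minimal CUDA build on this platform would look roughly like the sketch below; the exact configure flags used for these runs are not recorded in this file.

```sh
# Minimal CUDA build sketch (standard llama.cpp flags; any extra
# machine-specific tuning used for these benchmarks is not recorded here).
cmake -B build -DGGML_CUDA=ON
cmake --build build --config Release -j
```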

## ggml-org/gpt-oss-20b-GGUF

Model: https://huggingface.co/ggml-org/gpt-oss-20b-GGUF

- llama-batched-bench

```
main: n_kv_max = 270336, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_shared = 0, is_tg_separate = 0, n_gpu_layers = -1, n_threads = 20, n_threads_batch = 20
```
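
The header above maps onto llama-batched-bench's standard flags. A plausible reconstruction of the invocation behind the table below (the model path is a placeholder; the exact command line is not recorded in this file):

```sh
# Reconstructed sketch: -c/-b/-ub mirror n_kv_max/n_batch/n_ubatch above;
# -npp/-ntg/-npl produce the PP/TG/B combinations in the table below.
# The flash-attention flag spelling varies across builds (-fa / -fa 1 / -fa on).
llama-batched-bench -m ./models/gpt-oss-20b.gguf \
    -c 270336 -b 2048 -ub 2048 -fa 1 \
    -npp 512,4096,8192 -ntg 32 -npl 1,2,4,8,16,32
```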

| PP | TG | B | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s | T s | S t/s |
|---:|---:|--:|-----:|-------:|---------:|-------:|---------:|----:|------:|
| 512 | 32 | 1 | 544 | 0.270 | 1895.57 | 0.399 | 80.13 | 0.669 | 812.60 |
| 512 | 32 | 2 | 1088 | 0.230 | 4451.23 | 0.583 | 109.71 | 0.813 | 1337.56 |
| 512 | 32 | 4 | 2176 | 0.437 | 4688.87 | 0.820 | 156.03 | 1.257 | 1730.91 |
| 512 | 32 | 8 | 4352 | 0.863 | 4744.23 | 0.942 | 271.79 | 1.805 | 2410.73 |
| 512 | 32 | 16 | 8704 | 1.725 | 4748.19 | 1.173 | 436.38 | 2.899 | 3002.85 |
| 512 | 32 | 32 | 17408 | 3.437 | 4767.38 | 1.503 | 681.49 | 4.939 | 3524.40 |
| 4096 | 32 | 1 | 4128 | 0.907 | 4513.91 | 0.407 | 78.54 | 1.315 | 3139.56 |
| 4096 | 32 | 2 | 8256 | 1.796 | 4560.42 | 0.625 | 102.37 | 2.422 | 3409.45 |
| 4096 | 32 | 4 | 16512 | 3.596 | 4555.66 | 0.888 | 144.11 | 4.485 | 3681.93 |
| 4096 | 32 | 8 | 33024 | 7.184 | 4561.44 | 1.098 | 233.11 | 8.282 | 3987.51 |
| 4096 | 32 | 16 | 66048 | 14.369 | 4560.82 | 1.503 | 340.74 | 15.872 | 4161.30 |
| 4096 | 32 | 32 | 132096 | 28.760 | 4557.52 | 2.162 | 473.59 | 30.922 | 4271.95 |
| 8192 | 32 | 1 | 8224 | 1.859 | 4405.59 | 0.430 | 74.36 | 2.290 | 3591.61 |
| 8192 | 32 | 2 | 16448 | 3.698 | 4430.02 | 0.656 | 97.59 | 4.354 | 3777.47 |
| 8192 | 32 | 4 | 32896 | 7.403 | 4426.10 | 0.957 | 133.82 | 8.360 | 3934.97 |
| 8192 | 32 | 8 | 65792 | 14.802 | 4427.63 | 1.222 | 209.44 | 16.024 | 4105.87 |
| 8192 | 32 | 16 | 131584 | 29.596 | 4428.67 | 1.741 | 294.13 | 31.337 | 4199.00 |
| 8192 | 32 | 32 | 263168 | 59.169 | 4430.42 | 2.619 | 390.92 | 61.789 | 4259.17 |

- llama-bench

| model | size | params | backend | ngl | n_ubatch | fa | mmap | dio | test | t/s |
| ----- | ---: | -----: | ------- | --: | -------: | -: | ---: | --: | ---- | ---: |
| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 | 4505.82 ± 12.90 |
| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 | 83.43 ± 0.59 |
| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d4096 | 4158.34 ± 18.84 |
| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d4096 | 79.22 ± 0.60 |
| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d8192 | 3993.81 ± 17.55 |
| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d8192 | 75.22 ± 1.05 |
| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d16384 | 3449.98 ± 12.13 |
| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d16384 | 70.36 ± 0.37 |
| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d32768 | 2689.42 ± 18.89 |
| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d32768 | 61.65 ± 0.30 |
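
The llama-bench rows above (and in the sections that follow) fit a single sweep over context depth. A sketch of the presumed invocation, inferred from the column headers; the dio column indicates direct I/O was enabled, but the exact flag for it is not recorded here.

```sh
# Inferred sketch: -p 2048 / -n 32 give the pp2048/tg32 tests and
# -d adds the "@ dN" context-depth variants; -ub/-fa/-mmp mirror the
# n_ubatch/fa/mmap columns. The model path is a placeholder.
llama-bench -m ./models/gpt-oss-20b.gguf \
    -ub 2048 -fa 1 -mmp 0 \
    -p 2048 -n 32 -d 0,4096,8192,16384,32768
```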

build: 11fb327bf (7941)

## ggml-org/gpt-oss-120b-GGUF

Model: https://huggingface.co/ggml-org/gpt-oss-120b-GGUF

- llama-batched-bench

```
main: n_kv_max = 270336, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_shared = 0, is_tg_separate = 0, n_gpu_layers = -1, n_threads = 20, n_threads_batch = 20
```

| PP | TG | B | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s | T s | S t/s |
|---:|---:|--:|-----:|-------:|---------:|-------:|---------:|----:|------:|
| 512 | 32 | 1 | 544 | 0.445 | 1151.80 | 0.560 | 57.14 | 1.005 | 541.53 |
| 512 | 32 | 2 | 1088 | 0.472 | 2169.85 | 0.874 | 73.27 | 1.345 | 808.65 |
| 512 | 32 | 4 | 2176 | 0.826 | 2480.33 | 1.299 | 98.51 | 2.125 | 1023.94 |
| 512 | 32 | 8 | 4352 | 1.644 | 2491.67 | 1.608 | 159.18 | 3.252 | 1338.20 |
| 512 | 32 | 16 | 8704 | 3.292 | 2488.35 | 2.117 | 241.85 | 5.409 | 1609.13 |
| 512 | 32 | 32 | 17408 | 6.604 | 2481.07 | 2.898 | 353.31 | 9.502 | 1832.04 |
| 4096 | 32 | 1 | 4128 | 1.698 | 2412.65 | 0.580 | 55.21 | 2.277 | 1812.66 |
| 4096 | 32 | 2 | 8256 | 3.399 | 2409.88 | 0.934 | 68.53 | 4.333 | 1905.27 |
| 4096 | 32 | 4 | 16512 | 6.823 | 2401.21 | 1.411 | 90.72 | 8.234 | 2005.30 |
| 4096 | 32 | 8 | 33024 | 13.574 | 2413.97 | 1.841 | 139.07 | 15.415 | 2142.31 |
| 4096 | 32 | 16 | 66048 | 27.176 | 2411.52 | 2.609 | 196.26 | 29.785 | 2217.49 |
| 4096 | 32 | 32 | 132096 | 54.359 | 2411.23 | 3.905 | 262.20 | 58.264 | 2267.19 |
| 8192 | 32 | 1 | 8224 | 3.491 | 2346.81 | 0.613 | 52.23 | 4.103 | 2004.21 |
| 8192 | 32 | 2 | 16448 | 6.939 | 2361.03 | 0.981 | 65.21 | 7.921 | 2076.56 |
| 8192 | 32 | 4 | 32896 | 13.888 | 2359.40 | 1.511 | 84.71 | 15.399 | 2136.21 |
| 8192 | 32 | 8 | 65792 | 27.756 | 2361.18 | 2.034 | 125.86 | 29.790 | 2208.56 |
| 8192 | 32 | 16 | 131584 | 55.554 | 2359.34 | 3.021 | 169.49 | 58.575 | 2246.41 |
| 8192 | 32 | 32 | 263168 | 111.036 | 2360.89 | 4.537 | 225.72 | 115.573 | 2277.08 |

- llama-bench

| model | size | params | backend | ngl | n_ubatch | fa | mmap | dio | test | t/s |
| ----- | ---: | -----: | ------- | --: | -------: | -: | ---: | --: | ---- | ---: |
| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 | 2443.91 ± 7.47 |
| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 | 58.72 ± 0.20 |
| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d4096 | 2309.84 ± 3.63 |
| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d4096 | 55.67 ± 0.35 |
| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d8192 | 2216.68 ± 10.16 |
| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d8192 | 52.87 ± 0.43 |
| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d16384 | 1956.31 ± 6.39 |
| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d16384 | 49.45 ± 0.20 |
| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d32768 | 1567.08 ± 11.79 |
| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d32768 | 42.76 ± 0.14 |

build: 11fb327bf (7941)

## ggml-org/Qwen3-Coder-30B-A3B-Instruct-Q8_0-GGUF

Model: https://huggingface.co/ggml-org/Qwen3-Coder-30B-A3B-Instruct-Q8_0-GGUF

- llama-batched-bench

```
main: n_kv_max = 270336, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_shared = 0, is_tg_separate = 0, n_gpu_layers = -1, n_threads = 20, n_threads_batch = 20
```

| PP | TG | B | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s | T s | S t/s |
|---:|---:|--:|-----:|-------:|---------:|-------:|---------:|----:|------:|
| 512 | 32 | 1 | 544 | 0.393 | 1303.73 | 0.548 | 58.36 | 0.941 | 578.10 |
| 512 | 32 | 2 | 1088 | 0.387 | 2648.68 | 0.910 | 70.35 | 1.296 | 839.27 |
| 512 | 32 | 4 | 2176 | 0.659 | 3107.63 | 1.302 | 98.33 | 1.961 | 1109.77 |
| 512 | 32 | 8 | 4352 | 1.322 | 3099.35 | 1.669 | 153.42 | 2.990 | 1455.43 |
| 512 | 32 | 16 | 8704 | 2.639 | 3104.63 | 2.212 | 231.44 | 4.851 | 1794.32 |
| 512 | 32 | 32 | 17408 | 5.284 | 3100.80 | 2.955 | 346.53 | 8.239 | 2112.93 |
| 4096 | 32 | 1 | 4128 | 1.417 | 2890.36 | 0.598 | 53.51 | 2.015 | 2048.45 |
| 4096 | 32 | 2 | 8256 | 2.829 | 2895.62 | 1.019 | 62.82 | 3.848 | 2145.60 |
| 4096 | 32 | 4 | 16512 | 5.656 | 2896.96 | 1.528 | 83.79 | 7.183 | 2298.71 |
| 4096 | 32 | 8 | 33024 | 11.338 | 2890.02 | 2.127 | 120.36 | 13.465 | 2452.53 |
| 4096 | 32 | 16 | 66048 | 22.709 | 2885.96 | 3.104 | 164.97 | 25.812 | 2558.79 |
| 4096 | 32 | 32 | 132096 | 45.301 | 2893.35 | 4.723 | 216.80 | 50.024 | 2640.63 |
| 8192 | 32 | 1 | 8224 | 3.022 | 2711.09 | 0.678 | 47.20 | 3.700 | 2222.89 |
| 8192 | 32 | 2 | 16448 | 6.039 | 2713.01 | 1.149 | 55.70 | 7.188 | 2288.21 |
| 8192 | 32 | 4 | 32896 | 12.050 | 2719.35 | 1.785 | 71.69 | 13.835 | 2377.67 |
| 8192 | 32 | 8 | 65792 | 24.113 | 2717.90 | 2.629 | 97.39 | 26.741 | 2460.31 |
| 8192 | 32 | 16 | 131584 | 48.178 | 2720.58 | 4.099 | 124.91 | 52.277 | 2517.06 |
| 8192 | 32 | 32 | 263168 | 96.401 | 2719.31 | 6.696 | 152.93 | 103.097 | 2552.63 |

- llama-bench

| model | size | params | backend | ngl | n_ubatch | fa | mmap | dio | test | t/s |
| ----- | ---: | -----: | ------- | --: | -------: | -: | ---: | --: | ---- | ---: |
| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 | 2986.97 ± 18.87 |
| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 | 61.06 ± 0.23 |
| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d4096 | 2633.45 ± 6.26 |
| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d4096 | 54.77 ± 0.28 |
| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d8192 | 2354.14 ± 3.84 |
| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d8192 | 48.02 ± 0.40 |
| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d16384 | 1908.86 ± 4.25 |
| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d16384 | 40.23 ± 0.10 |
| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d32768 | 1348.17 ± 2.00 |
| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d32768 | 30.21 ± 0.04 |

build: 11fb327bf (7941)

## ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF

Model: https://huggingface.co/ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF

- llama-batched-bench

```
main: n_kv_max = 270336, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_shared = 0, is_tg_separate = 0, n_gpu_layers = -1, n_threads = 20, n_threads_batch = 20
```

| PP | TG | B | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s | T s | S t/s |
|---:|---:|--:|-----:|-------:|---------:|-------:|---------:|----:|------:|
| 512 | 32 | 1 | 544 | 0.212 | 2420.12 | 1.100 | 29.10 | 1.311 | 414.85 |
| 512 | 32 | 2 | 1088 | 0.428 | 2393.89 | 1.185 | 54.00 | 1.613 | 674.56 |
| 512 | 32 | 4 | 2176 | 0.894 | 2290.41 | 1.229 | 104.17 | 2.123 | 1025.02 |
| 512 | 32 | 8 | 4352 | 1.758 | 2330.36 | 1.319 | 194.15 | 3.076 | 1414.70 |
| 512 | 32 | 16 | 8704 | 3.508 | 2335.21 | 1.543 | 331.90 | 5.051 | 1723.33 |
| 512 | 32 | 32 | 17408 | 7.035 | 2328.93 | 1.738 | 589.21 | 8.773 | 1984.29 |
| 4096 | 32 | 1 | 4128 | 1.831 | 2237.25 | 1.125 | 28.44 | 2.956 | 1396.42 |
| 4096 | 32 | 2 | 8256 | 3.642 | 2249.48 | 1.253 | 51.07 | 4.895 | 1686.64 |
| 4096 | 32 | 4 | 16512 | 7.274 | 2252.26 | 1.380 | 92.72 | 8.655 | 1907.81 |
| 4096 | 32 | 8 | 33024 | 14.576 | 2248.09 | 1.617 | 158.29 | 16.193 | 2039.37 |
| 4096 | 32 | 16 | 66048 | 29.138 | 2249.17 | 2.081 | 246.01 | 31.219 | 2115.63 |
| 4096 | 32 | 32 | 132096 | 58.275 | 2249.19 | 2.814 | 363.87 | 61.089 | 2162.34 |
| 8192 | 32 | 1 | 8224 | 3.757 | 2180.26 | 1.184 | 27.03 | 4.941 | 1664.37 |
| 8192 | 32 | 2 | 16448 | 7.522 | 2178.05 | 1.341 | 47.73 | 8.863 | 1855.77 |
| 8192 | 32 | 4 | 32896 | 15.043 | 2178.25 | 1.548 | 82.69 | 16.591 | 1982.74 |
| 8192 | 32 | 8 | 65792 | 30.111 | 2176.49 | 1.937 | 132.13 | 32.048 | 2052.90 |
| 8192 | 32 | 16 | 131584 | 60.405 | 2169.90 | 2.706 | 189.21 | 63.111 | 2084.97 |
| 8192 | 32 | 32 | 263168 | 120.439 | 2176.58 | 3.993 | 256.46 | 124.432 | 2114.96 |

- llama-bench

| model | size | params | backend | ngl | n_ubatch | fa | mmap | dio | test | t/s |
| ----- | ---: | -----: | ------- | --: | -------: | -: | ---: | --: | ---- | ---: |
| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 | 2250.28 ± 6.41 |
| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 | 29.43 ± 0.02 |
| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d4096 | 2100.19 ± 8.96 |
| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d4096 | 28.61 ± 0.02 |
| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d8192 | 2007.56 ± 4.16 |
| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d8192 | 27.38 ± 0.09 |
| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d16384 | 1779.11 ± 6.42 |
| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d16384 | 25.72 ± 0.03 |
| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d32768 | 1471.23 ± 1.71 |
| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d32768 | 22.51 ± 0.02 |

build: 11fb327bf (7941)

## ggml-org/gemma-3-4b-it-qat-GGUF

Model: https://huggingface.co/ggml-org/gemma-3-4b-it-qat-GGUF

- llama-batched-bench

```
main: n_kv_max = 270336, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_shared = 0, is_tg_separate = 0, n_gpu_layers = -1, n_threads = 20, n_threads_batch = 20
```

| PP | TG | B | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s | T s | S t/s |
|---:|---:|--:|-----:|-------:|---------:|-------:|---------:|----:|------:|
| 512 | 32 | 1 | 544 | 0.092 | 5566.97 | 0.412 | 77.63 | 0.504 | 1078.95 |
| 512 | 32 | 2 | 1088 | 0.161 | 6345.67 | 0.522 | 122.70 | 0.683 | 1593.06 |
| 512 | 32 | 4 | 2176 | 0.325 | 6309.87 | 0.562 | 227.68 | 0.887 | 2453.87 |
| 512 | 32 | 8 | 4352 | 0.643 | 6374.42 | 0.685 | 373.67 | 1.328 | 3277.94 |
| 512 | 32 | 16 | 8704 | 1.277 | 6413.64 | 0.915 | 559.47 | 2.192 | 3970.01 |
| 512 | 32 | 32 | 17408 | 2.518 | 6506.57 | 1.249 | 819.61 | 3.767 | 4620.64 |
| 4096 | 32 | 1 | 4128 | 0.674 | 6079.68 | 0.453 | 70.60 | 1.127 | 3662.88 |
| 4096 | 32 | 2 | 8256 | 1.335 | 6137.82 | 0.627 | 102.03 | 1.962 | 4208.11 |
| 4096 | 32 | 4 | 16512 | 2.657 | 6167.35 | 0.749 | 170.92 | 3.405 | 4848.71 |
| 4096 | 32 | 8 | 33024 | 5.307 | 6173.91 | 0.974 | 262.89 | 6.281 | 5257.53 |
| 4096 | 32 | 16 | 66048 | 10.610 | 6176.96 | 1.379 | 371.42 | 11.988 | 5509.40 |
| 4096 | 32 | 32 | 132096 | 21.213 | 6178.89 | 2.122 | 482.50 | 23.335 | 5660.82 |
| 8192 | 32 | 1 | 8224 | 1.359 | 6027.34 | 0.467 | 68.52 | 1.826 | 4503.48 |
| 8192 | 32 | 2 | 16448 | 2.699 | 6069.68 | 0.653 | 98.03 | 3.352 | 4906.68 |
| 8192 | 32 | 4 | 32896 | 5.366 | 6106.74 | 0.818 | 156.55 | 6.184 | 5319.96 |
| 8192 | 32 | 8 | 65792 | 10.755 | 6093.50 | 1.174 | 218.04 | 11.929 | 5515.22 |
| 8192 | 32 | 16 | 131584 | 21.484 | 6100.82 | 1.829 | 279.90 | 23.314 | 5644.11 |
| 8192 | 32 | 32 | 263168 | 42.950 | 6103.40 | 3.058 | 334.91 | 46.008 | 5720.05 |

- llama-bench

| model | size | params | backend | ngl | n_ubatch | fa | mmap | dio | test | t/s |
| ----- | ---: | -----: | ------- | --: | -------: | -: | ---: | --: | ---- | ---: |
| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 | 5948.74 ± 10.61 |
| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 | 81.05 ± 0.20 |
| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d4096 | 5652.69 ± 34.29 |
| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d4096 | 76.37 ± 0.58 |
| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d8192 | 5509.57 ± 40.69 |
| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d8192 | 71.61 ± 0.80 |
| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d16384 | 5340.86 ± 36.92 |
| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d16384 | 70.89 ± 0.34 |
| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d32768 | 5023.30 ± 13.52 |
| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d32768 | 62.28 ± 0.30 |

build: 11fb327bf (7941)

## ggml-org/GLM-4.7-Flash-GGUF

Model: https://huggingface.co/ggml-org/GLM-4.7-Flash-GGUF

- llama-batched-bench

```
main: n_kv_max = 270336, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_shared = 0, is_tg_separate = 0, n_gpu_layers = -1, n_threads = 20, n_threads_batch = 20
```

| PP | TG | B | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s | T s | S t/s |
|---:|---:|--:|-----:|-------:|---------:|-------:|---------:|----:|------:|
| 512 | 32 | 1 | 544 | 0.433 | 1181.83 | 0.693 | 46.16 | 1.126 | 482.94 |
| 512 | 32 | 2 | 1088 | 0.439 | 2334.46 | 1.034 | 61.89 | 1.473 | 738.75 |
| 512 | 32 | 4 | 2176 | 0.772 | 2654.46 | 1.459 | 87.76 | 2.230 | 975.77 |
| 512 | 32 | 8 | 4352 | 1.541 | 2658.78 | 2.043 | 125.31 | 3.583 | 1214.47 |
| 512 | 32 | 16 | 8704 | 3.083 | 2656.91 | 2.675 | 191.42 | 5.758 | 1511.62 |
| 512 | 32 | 32 | 17408 | 6.159 | 2660.12 | 3.615 | 283.24 | 9.774 | 1780.98 |
| 4096 | 32 | 1 | 4128 | 1.915 | 2139.30 | 0.725 | 44.14 | 2.640 | 1563.83 |
| 4096 | 32 | 2 | 8256 | 3.834 | 2136.40 | 1.119 | 57.21 | 4.953 | 1666.81 |
| 4096 | 32 | 4 | 16512 | 7.636 | 2145.72 | 1.631 | 78.49 | 9.266 | 1781.93 |
| 4096 | 32 | 8 | 33024 | 15.295 | 2142.40 | 2.344 | 109.21 | 17.639 | 1872.20 |
| 4096 | 32 | 16 | 66048 | 30.573 | 2143.62 | 3.773 | 135.70 | 34.346 | 1923.04 |
| 4096 | 32 | 32 | 132096 | 61.282 | 2138.82 | 5.795 | 176.71 | 67.077 | 1969.31 |
| 8192 | 32 | 1 | 8224 | 4.510 | 1816.24 | 0.760 | 42.11 | 5.270 | 1560.44 |
| 8192 | 32 | 2 | 16448 | 9.036 | 1813.19 | 1.206 | 53.06 | 10.242 | 1605.91 |
| 8192 | 32 | 4 | 32896 | 18.070 | 1813.43 | 1.783 | 71.80 | 19.852 | 1657.03 |
| 8192 | 32 | 8 | 65792 | 36.125 | 1814.15 | 2.635 | 97.14 | 38.760 | 1697.41 |
| 8192 | 32 | 16 | 131584 | 72.367 | 1811.20 | 4.954 | 103.34 | 77.322 | 1701.77 |
| 8192 | 32 | 32 | 263168 | 144.501 | 1814.13 | 8.103 | 126.37 | 152.604 | 1724.51 |

- llama-bench

| model | size | params | backend | ngl | n_ubatch | fa | dio | test | t/s |
| ----- | ---: | -----: | ------- | --: | -------: | -: | --: | ---- | ---: |
| deepseek2 30B.A3B Q8_0 | 29.65 GiB | 29.94 B | CUDA | 99 | 2048 | 1 | 1 | pp2048 | 2364.18 ± 11.43 |
| deepseek2 30B.A3B Q8_0 | 29.65 GiB | 29.94 B | CUDA | 99 | 2048 | 1 | 1 | tg32 | 48.68 ± 0.12 |
| deepseek2 30B.A3B Q8_0 | 29.65 GiB | 29.94 B | CUDA | 99 | 2048 | 1 | 1 | pp2048 @ d4096 | 1684.13 ± 1.24 |
| deepseek2 30B.A3B Q8_0 | 29.65 GiB | 29.94 B | CUDA | 99 | 2048 | 1 | 1 | tg32 @ d4096 | 44.62 ± 0.22 |
| deepseek2 30B.A3B Q8_0 | 29.65 GiB | 29.94 B | CUDA | 99 | 2048 | 1 | 1 | pp2048 @ d8192 | 1314.68 ± 1.41 |
| deepseek2 30B.A3B Q8_0 | 29.65 GiB | 29.94 B | CUDA | 99 | 2048 | 1 | 1 | tg32 @ d8192 | 42.59 ± 0.11 |
| deepseek2 30B.A3B Q8_0 | 29.65 GiB | 29.94 B | CUDA | 99 | 2048 | 1 | 1 | pp2048 @ d16384 | 914.05 ± 3.32 |
| deepseek2 30B.A3B Q8_0 | 29.65 GiB | 29.94 B | CUDA | 99 | 2048 | 1 | 1 | tg32 @ d16384 | 38.72 ± 0.13 |
| deepseek2 30B.A3B Q8_0 | 29.65 GiB | 29.94 B | CUDA | 99 | 2048 | 1 | 1 | pp2048 @ d32768 | 567.20 ± 0.90 |
| deepseek2 30B.A3B Q8_0 | 29.65 GiB | 29.94 B | CUDA | 99 | 2048 | 1 | 1 | tg32 @ d32768 | 32.65 ± 0.09 |

build: 11fb327bf (7941)