# mac-m2-ultra

## System info

```sh
$ uname -a
Darwin gg-studio 25.2.0 Darwin Kernel Version 25.2.0: Tue Nov 18 21:07:05 PST 2025; root:xnu-12377.61.12~1/RELEASE_ARM64_T6020 arm64

$ g++ --version
Apple clang version 17.0.0 (clang-1700.3.19.1)
Target: arm64-apple-darwin25.2.0
```

## ggml-org/gpt-oss-20b-GGUF

Model: https://huggingface.co/ggml-org/gpt-oss-20b-GGUF

### llama-batched-bench

`main: n_kv_max = 270336, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_shared = 0, is_tg_separate = 0, n_gpu_layers = -1, n_threads = 16, n_threads_batch = 16`

| PP | TG | B | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s | T s | S t/s |
|---:|---:|--:|-----:|-------:|---------:|-------:|---------:|----:|------:|
| 512 | 32 | 1 | 544 | 0.215 | 2381.35 | 0.245 | 130.45 | 0.460 | 1181.81 |
| 512 | 32 | 2 | 1088 | 0.379 | 2701.43 | 0.382 | 167.56 | 0.761 | 1429.67 |
| 512 | 32 | 4 | 2176 | 0.721 | 2839.27 | 0.604 | 211.76 | 1.326 | 1641.32 |
| 512 | 32 | 8 | 4352 | 1.433 | 2858.30 | 1.033 | 247.75 | 2.466 | 1764.57 |
| 512 | 32 | 16 | 8704 | 2.853 | 2871.12 | 1.570 | 326.11 | 4.423 | 1967.77 |
| 512 | 32 | 32 | 17408 | 5.699 | 2874.95 | 1.910 | 536.15 | 7.609 | 2287.88 |
| 4096 | 32 | 1 | 4128 | 1.552 | 2638.56 | 0.334 | 95.72 | 1.887 | 2188.00 |
| 4096 | 32 | 2 | 8256 | 3.084 | 2655.88 | 0.404 | 158.54 | 3.488 | 2366.86 |
| 4096 | 32 | 4 | 16512 | 6.151 | 2663.78 | 0.652 | 196.39 | 6.802 | 2427.37 |
| 4096 | 32 | 8 | 33024 | 12.288 | 2666.77 | 1.135 | 225.47 | 13.423 | 2460.27 |
| 4096 | 32 | 16 | 66048 | 24.563 | 2668.12 | 1.762 | 290.55 | 26.325 | 2508.97 |
| 4096 | 32 | 32 | 132096 | 49.114 | 2668.73 | 2.398 | 426.94 | 51.512 | 2564.35 |
| 8192 | 32 | 1 | 8224 | 3.345 | 2448.78 | 0.275 | 116.46 | 3.620 | 2271.76 |
| 8192 | 32 | 2 | 16448 | 6.665 | 2458.11 | 0.425 | 150.71 | 7.090 | 2319.91 |
| 8192 | 32 | 4 | 32896 | 13.315 | 2460.92 | 0.691 | 185.21 | 14.006 | 2348.63 |
| 8192 | 32 | 8 | 65792 | 26.611 | 2462.73 | 1.212 | 211.16 | 27.823 | 2364.62 |
| 8192 | 32 | 16 | 131584 | 53.232 | 2462.27 | 1.919 | 266.83 | 55.151 | 2385.88 |
| 8192 | 32 | 32 | 263168 | 110.455 | 2373.30 | 2.752 | 372.03 | 113.208 | 2324.64 |
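Runs like the one above are produced by the `llama-batched-bench` tool. A minimal sketch, where the model path is a placeholder and the `-fa` flag syntax may differ between llama.cpp versions:

```shell
# Sketch: a llama-batched-bench invocation matching the header line above.
# The model path is a placeholder -- point it at your local GGUF file.
# -c/-b/-ub mirror n_kv_max/n_batch/n_ubatch; -fa 1 enables flash attention;
# -npp/-ntg/-npl correspond to the PP, TG and B columns of the table.
./llama-batched-bench \
  -m models/gpt-oss-20b-mxfp4.gguf \
  -c 270336 -b 2048 -ub 2048 -fa 1 -t 16 \
  -npp 512,4096,8192 -ntg 32 -npl 1,2,4,8,16,32
```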

### llama-bench

| model | size | params | backend | threads | n_ubatch | fa | test | t/s |
| ----- | ---: | -----: | ------- | ------: | -------: | -: | ---- | --: |
| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 | 2713.40 ± 3.56 |
| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | MTL,BLAS | 16 | 2048 | 1 | tg32 | 129.97 ± 3.90 |
| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 @ d4096 | 2324.59 ± 3.01 |
| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | MTL,BLAS | 16 | 2048 | 1 | tg32 @ d4096 | 123.38 ± 0.17 |
| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 @ d8192 | 1989.82 ± 30.11 |
| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | MTL,BLAS | 16 | 2048 | 1 | tg32 @ d8192 | 117.39 ± 0.33 |
| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 @ d16384 | 1556.54 ± 6.22 |
| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | MTL,BLAS | 16 | 2048 | 1 | tg32 @ d16384 | 109.75 ± 0.42 |
| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 @ d32768 | 1122.63 ± 1.45 |
| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | MTL,BLAS | 16 | 2048 | 1 | tg32 @ d32768 | 98.25 ± 0.08 |
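All rows of a table like the one above come from a single `llama-bench` invocation. A sketch under the same assumptions (placeholder model path; flag syntax may vary by llama.cpp version):

```shell
# Sketch: a llama-bench invocation matching the table above.
# -p/-n select the pp2048 and tg32 tests; -d adds the "@ dN" KV-cache depths;
# -t/-ub/-fa mirror the threads, n_ubatch and fa columns.
./llama-bench \
  -m models/gpt-oss-20b-mxfp4.gguf \
  -t 16 -ub 2048 -fa 1 \
  -p 2048 -n 32 -d 0,4096,8192,16384,32768
```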

build: b828e18c7 (7948)

## ggml-org/gpt-oss-120b-GGUF

Model: https://huggingface.co/ggml-org/gpt-oss-120b-GGUF

### llama-batched-bench

`main: n_kv_max = 270336, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_shared = 0, is_tg_separate = 0, n_gpu_layers = -1, n_threads = 16, n_threads_batch = 16`

| PP | TG | B | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s | T s | S t/s |
|---:|---:|--:|-----:|-------:|---------:|-------:|---------:|----:|------:|
| 512 | 32 | 1 | 544 | 0.426 | 1200.92 | 0.361 | 88.56 | 0.788 | 690.64 |
| 512 | 32 | 2 | 1088 | 0.683 | 1500.14 | 0.545 | 117.35 | 1.228 | 886.02 |
| 512 | 32 | 4 | 2176 | 1.204 | 1701.56 | 0.847 | 151.19 | 2.050 | 1061.34 |
| 512 | 32 | 8 | 4352 | 2.402 | 1705.20 | 1.455 | 176.00 | 3.857 | 1128.45 |
| 512 | 32 | 16 | 8704 | 4.802 | 1705.90 | 2.349 | 217.93 | 7.152 | 1217.08 |
| 512 | 32 | 32 | 17408 | 9.593 | 1707.85 | 3.665 | 279.42 | 13.258 | 1313.01 |
| 4096 | 32 | 1 | 4128 | 2.581 | 1587.08 | 0.390 | 82.12 | 2.970 | 1389.67 |
| 4096 | 32 | 2 | 8256 | 5.124 | 1598.79 | 0.589 | 108.62 | 5.713 | 1445.10 |
| 4096 | 32 | 4 | 16512 | 10.231 | 1601.47 | 0.928 | 137.98 | 11.158 | 1479.80 |
| 4096 | 32 | 8 | 33024 | 20.468 | 1600.94 | 1.606 | 159.38 | 22.074 | 1496.04 |
| 4096 | 32 | 16 | 66048 | 40.924 | 1601.42 | 2.639 | 193.99 | 43.563 | 1516.15 |
| 4096 | 32 | 32 | 132096 | 81.819 | 1601.98 | 4.466 | 229.29 | 86.284 | 1530.94 |
| 8192 | 32 | 1 | 8224 | 5.517 | 1484.74 | 0.409 | 78.16 | 5.927 | 1387.58 |
| 8192 | 32 | 2 | 16448 | 11.008 | 1488.43 | 0.622 | 102.92 | 11.629 | 1414.34 |
| 8192 | 32 | 4 | 32896 | 22.002 | 1489.29 | 0.987 | 129.66 | 22.990 | 1430.90 |
| 8192 | 32 | 8 | 65792 | 46.051 | 1423.11 | 1.858 | 137.79 | 47.909 | 1373.27 |
| 8192 | 32 | 16 | 131584 | 97.680 | 1341.85 | 2.872 | 178.28 | 100.552 | 1308.62 |
| 8192 | 32 | 32 | 263168 | 176.407 | 1486.02 | 5.048 | 202.85 | 181.455 | 1450.32 |

### llama-bench

| model | size | params | backend | threads | n_ubatch | fa | test | t/s |
| ----- | ---: | -----: | ------- | ------: | -------: | -: | ---- | --: |
| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 | 1648.69 ± 1.80 |
| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | MTL,BLAS | 16 | 2048 | 1 | tg32 | 85.60 ± 0.52 |
| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 @ d4096 | 1429.86 ± 1.01 |
| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | MTL,BLAS | 16 | 2048 | 1 | tg32 @ d4096 | 82.03 ± 0.12 |
| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 @ d8192 | 1257.90 ± 1.81 |
| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | MTL,BLAS | 16 | 2048 | 1 | tg32 @ d8192 | 78.23 ± 0.33 |
| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 @ d16384 | 1013.49 ± 0.70 |
| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | MTL,BLAS | 16 | 2048 | 1 | tg32 @ d16384 | 73.20 ± 0.28 |
| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 @ d32768 | 721.11 ± 0.58 |
| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | MTL,BLAS | 16 | 2048 | 1 | tg32 @ d32768 | 65.52 ± 0.10 |

build: b828e18c7 (7948)

## ggml-org/Qwen3-Coder-30B-A3B-Instruct-Q8_0-GGUF

Model: https://huggingface.co/ggml-org/Qwen3-Coder-30B-A3B-Instruct-Q8_0-GGUF

### llama-batched-bench

`main: n_kv_max = 270336, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_shared = 0, is_tg_separate = 0, n_gpu_layers = -1, n_threads = 16, n_threads_batch = 16`

| PP | TG | B | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s | T s | S t/s |
|---:|---:|--:|-----:|-------:|---------:|-------:|---------:|----:|------:|
| 512 | 32 | 1 | 544 | 0.243 | 2109.23 | 0.419 | 76.34 | 0.662 | 821.84 |
| 512 | 32 | 2 | 1088 | 0.406 | 2521.40 | 0.575 | 111.36 | 0.981 | 1109.27 |
| 512 | 32 | 4 | 2176 | 0.744 | 2751.65 | 0.841 | 152.22 | 1.585 | 1372.71 |
| 512 | 32 | 8 | 4352 | 1.479 | 2770.20 | 1.330 | 192.48 | 2.809 | 1549.53 |
| 512 | 32 | 16 | 8704 | 2.951 | 2776.20 | 2.572 | 199.05 | 5.523 | 1575.93 |
| 512 | 32 | 32 | 17408 | 5.899 | 2777.64 | 2.603 | 393.34 | 8.502 | 2047.54 |
| 4096 | 32 | 1 | 4128 | 1.901 | 2154.15 | 0.474 | 67.58 | 2.375 | 1738.14 |
| 4096 | 32 | 2 | 8256 | 3.788 | 2162.89 | 0.652 | 98.17 | 4.439 | 1859.69 |
| 4096 | 32 | 4 | 16512 | 7.564 | 2166.18 | 0.990 | 129.24 | 8.554 | 1930.34 |
| 4096 | 32 | 8 | 33024 | 15.121 | 2166.98 | 1.632 | 156.82 | 16.754 | 1971.12 |
| 4096 | 32 | 16 | 66048 | 30.241 | 2167.09 | 3.166 | 161.72 | 33.407 | 1977.04 |
| 4096 | 32 | 32 | 132096 | 60.474 | 2167.42 | 3.780 | 270.93 | 64.254 | 2055.86 |
| 8192 | 32 | 1 | 8224 | 4.733 | 1730.92 | 0.483 | 66.29 | 5.215 | 1576.85 |
| 8192 | 32 | 2 | 16448 | 9.459 | 1732.09 | 0.722 | 88.58 | 10.182 | 1615.46 |
| 8192 | 32 | 4 | 32896 | 18.912 | 1732.65 | 1.120 | 114.26 | 20.032 | 1642.14 |
| 8192 | 32 | 8 | 65792 | 37.797 | 1733.91 | 1.873 | 136.67 | 39.670 | 1658.49 |
| 8192 | 32 | 16 | 131584 | 84.133 | 1557.92 | 3.718 | 137.72 | 87.850 | 1497.82 |
| 8192 | 32 | 32 | 263168 | 157.550 | 1663.88 | 4.854 | 210.98 | 162.403 | 1620.46 |

### llama-bench

| model | size | params | backend | threads | n_ubatch | fa | test | t/s |
| ----- | ---: | -----: | ------- | ------: | -------: | -: | ---- | --: |
| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 | 2453.11 ± 1.70 |
| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | MTL,BLAS | 16 | 2048 | 1 | tg32 | 78.97 ± 0.46 |
| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 @ d4096 | 1569.46 ± 1.97 |
| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | MTL,BLAS | 16 | 2048 | 1 | tg32 @ d4096 | 71.18 ± 0.37 |
| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 @ d8192 | 1145.51 ± 1.16 |
| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | MTL,BLAS | 16 | 2048 | 1 | tg32 @ d8192 | 65.11 ± 0.36 |
| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 @ d16384 | 741.04 ± 0.74 |
| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | MTL,BLAS | 16 | 2048 | 1 | tg32 @ d16384 | 56.87 ± 0.14 |
| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 @ d32768 | 431.31 ± 0.31 |
| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | MTL,BLAS | 16 | 2048 | 1 | tg32 @ d32768 | 45.26 ± 0.11 |

build: b828e18c7 (7948)

## ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF

Model: https://huggingface.co/ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF

### llama-batched-bench

`main: n_kv_max = 270336, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_shared = 0, is_tg_separate = 0, n_gpu_layers = -1, n_threads = 16, n_threads_batch = 16`

| PP | TG | B | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s | T s | S t/s |
|---:|---:|--:|-----:|-------:|---------:|-------:|---------:|----:|------:|
| 512 | 32 | 1 | 544 | 0.339 | 1509.22 | 0.409 | 78.17 | 0.749 | 726.67 |
| 512 | 32 | 2 | 1088 | 0.646 | 1584.93 | 0.483 | 132.45 | 1.129 | 963.45 |
| 512 | 32 | 4 | 2176 | 1.258 | 1627.50 | 0.585 | 218.67 | 1.844 | 1180.21 |
| 512 | 32 | 8 | 4352 | 2.506 | 1634.41 | 1.005 | 254.83 | 3.511 | 1239.64 |
| 512 | 32 | 16 | 8704 | 5.007 | 1635.99 | 1.595 | 321.07 | 6.602 | 1318.38 |
| 512 | 32 | 32 | 17408 | 10.007 | 1637.19 | 1.676 | 611.12 | 11.683 | 1490.03 |
| 4096 | 32 | 1 | 4128 | 2.730 | 1500.46 | 0.431 | 74.31 | 3.160 | 1306.12 |
| 4096 | 32 | 2 | 8256 | 5.446 | 1504.33 | 0.524 | 122.04 | 5.970 | 1382.91 |
| 4096 | 32 | 4 | 16512 | 10.875 | 1506.59 | 0.662 | 193.45 | 11.537 | 1431.28 |
| 4096 | 32 | 8 | 33024 | 21.749 | 1506.61 | 1.158 | 221.11 | 22.907 | 1441.64 |
| 4096 | 32 | 16 | 66048 | 43.477 | 1507.36 | 1.901 | 269.32 | 45.378 | 1455.49 |
| 4096 | 32 | 32 | 132096 | 86.954 | 1507.37 | 2.325 | 440.42 | 89.279 | 1479.59 |
| 8192 | 32 | 1 | 8224 | 5.940 | 1379.21 | 0.449 | 71.20 | 6.389 | 1287.20 |
| 8192 | 32 | 2 | 16448 | 11.865 | 1380.84 | 0.559 | 114.59 | 12.424 | 1323.92 |
| 8192 | 32 | 4 | 32896 | 23.723 | 1381.25 | 0.728 | 175.80 | 24.452 | 1345.35 |
| 8192 | 32 | 8 | 65792 | 47.434 | 1381.63 | 1.279 | 200.09 | 48.713 | 1350.60 |
| 8192 | 32 | 16 | 131584 | 94.864 | 1381.69 | 2.198 | 232.97 | 97.061 | 1355.68 |
| 8192 | 32 | 32 | 263168 | 189.743 | 1381.57 | 3.052 | 335.50 | 192.795 | 1365.01 |

### llama-bench

| model | size | params | backend | threads | n_ubatch | fa | test | t/s |
| ----- | ---: | -----: | ------- | ------: | -------: | -: | ---- | --: |
| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 | 1565.91 ± 0.86 |
| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | MTL,BLAS | 16 | 2048 | 1 | tg32 | 79.68 ± 0.39 |
| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 @ d4096 | 1317.41 ± 1.02 |
| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | MTL,BLAS | 16 | 2048 | 1 | tg32 @ d4096 | 74.70 ± 0.04 |
| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 @ d8192 | 1134.65 ± 0.76 |
| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | MTL,BLAS | 16 | 2048 | 1 | tg32 @ d8192 | 71.31 ± 0.12 |
| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 @ d16384 | 886.46 ± 0.78 |
| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | MTL,BLAS | 16 | 2048 | 1 | tg32 @ d16384 | 65.93 ± 0.06 |
| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 @ d32768 | 612.21 ± 0.30 |
| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | MTL,BLAS | 16 | 2048 | 1 | tg32 @ d32768 | 56.83 ± 0.02 |

build: b828e18c7 (7948)

## ggml-org/gemma-3-4b-it-qat-GGUF

Model: https://huggingface.co/ggml-org/gemma-3-4b-it-qat-GGUF

### llama-batched-bench

`main: n_kv_max = 270336, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_shared = 0, is_tg_separate = 0, n_gpu_layers = -1, n_threads = 16, n_threads_batch = 16`

| PP | TG | B | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s | T s | S t/s |
|---:|---:|--:|-----:|-------:|---------:|-------:|---------:|----:|------:|
| 512 | 32 | 1 | 544 | 0.186 | 2748.06 | 0.235 | 136.28 | 0.421 | 1291.78 |
| 512 | 32 | 2 | 1088 | 0.342 | 2990.95 | 0.312 | 204.99 | 0.655 | 1662.15 |
| 512 | 32 | 4 | 2176 | 0.662 | 3092.69 | 0.404 | 316.97 | 1.066 | 2041.21 |
| 512 | 32 | 8 | 4352 | 1.317 | 3110.41 | 0.579 | 441.80 | 1.896 | 2294.97 |
| 512 | 32 | 16 | 8704 | 2.625 | 3120.23 | 1.207 | 424.08 | 3.833 | 2270.93 |
| 512 | 32 | 32 | 17408 | 5.242 | 3125.34 | 1.299 | 788.23 | 6.541 | 2661.19 |
| 4096 | 32 | 1 | 4128 | 1.408 | 2909.90 | 0.296 | 108.07 | 1.704 | 2422.95 |
| 4096 | 32 | 2 | 8256 | 2.793 | 2933.40 | 0.325 | 197.00 | 3.118 | 2648.25 |
| 4096 | 32 | 4 | 16512 | 5.567 | 2943.22 | 0.440 | 291.07 | 6.006 | 2749.05 |
| 4096 | 32 | 8 | 33024 | 11.114 | 2948.23 | 0.640 | 400.26 | 11.754 | 2809.59 |
| 4096 | 32 | 16 | 66048 | 22.217 | 2949.76 | 1.327 | 385.83 | 23.544 | 2805.26 |
| 4096 | 32 | 32 | 132096 | 44.420 | 2950.77 | 1.553 | 659.30 | 45.973 | 2873.36 |
| 8192 | 32 | 1 | 8224 | 2.860 | 2864.58 | 0.250 | 127.90 | 3.110 | 2644.42 |
| 8192 | 32 | 2 | 16448 | 5.702 | 2873.63 | 0.335 | 191.07 | 6.036 | 2724.77 |
| 8192 | 32 | 4 | 32896 | 11.383 | 2878.69 | 0.456 | 280.72 | 11.839 | 2778.63 |
| 8192 | 32 | 8 | 65792 | 22.750 | 2880.75 | 0.671 | 381.48 | 23.421 | 2809.14 |
| 8192 | 32 | 16 | 131584 | 45.484 | 2881.74 | 1.406 | 364.04 | 46.890 | 2806.22 |
| 8192 | 32 | 32 | 263168 | 90.956 | 2882.10 | 1.793 | 570.98 | 92.749 | 2837.41 |

### llama-bench

| model | size | params | backend | threads | n_ubatch | fa | test | t/s |
| ----- | ---: | -----: | ------- | ------: | -------: | -: | ---- | --: |
| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 | 2923.59 ± 3.10 |
| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | MTL,BLAS | 16 | 2048 | 1 | tg32 | 134.28 ± 1.29 |
| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 @ d4096 | 2748.21 ± 3.05 |
| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | MTL,BLAS | 16 | 2048 | 1 | tg32 @ d4096 | 133.11 ± 0.08 |
| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 @ d8192 | 2641.45 ± 2.31 |
| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | MTL,BLAS | 16 | 2048 | 1 | tg32 @ d8192 | 125.85 ± 0.35 |
| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 @ d16384 | 2446.20 ± 2.94 |
| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | MTL,BLAS | 16 | 2048 | 1 | tg32 @ d16384 | 125.00 ± 0.12 |
| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 @ d32768 | 2129.18 ± 7.43 |
| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | MTL,BLAS | 16 | 2048 | 1 | tg32 @ d32768 | 113.14 ± 0.10 |

build: b828e18c7 (7948)

## ggml-org/GLM-4.7-Flash-GGUF

Model: https://huggingface.co/ggml-org/GLM-4.7-Flash-GGUF

### llama-batched-bench

`main: n_kv_max = 270336, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_shared = 0, is_tg_separate = 0, n_gpu_layers = -1, n_threads = 16, n_threads_batch = 16`

| PP | TG | B | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s | T s | S t/s |
|---:|---:|--:|-----:|-------:|---------:|-------:|---------:|----:|------:|
| 512 | 32 | 1 | 544 | 0.326 | 1568.69 | 0.522 | 61.28 | 0.849 | 641.09 |
| 512 | 32 | 2 | 1088 | 0.528 | 1939.42 | 0.744 | 86.07 | 1.272 | 855.63 |
| 512 | 32 | 4 | 2176 | 0.968 | 2114.85 | 1.105 | 115.85 | 2.073 | 1049.56 |
| 512 | 32 | 8 | 4352 | 1.928 | 2124.62 | 1.684 | 151.99 | 3.612 | 1204.82 |
| 512 | 32 | 16 | 8704 | 3.844 | 2131.34 | 3.141 | 162.99 | 6.985 | 1246.11 |
| 512 | 32 | 32 | 17408 | 7.683 | 2132.38 | 3.924 | 260.95 | 11.608 | 1499.71 |
| 4096 | 32 | 1 | 4128 | 3.280 | 1248.75 | 0.723 | 44.29 | 4.003 | 1031.33 |
| 4096 | 32 | 2 | 8256 | 6.545 | 1251.63 | 0.930 | 68.85 | 7.475 | 1104.53 |
| 4096 | 32 | 4 | 16512 | 13.080 | 1252.64 | 1.454 | 88.03 | 14.534 | 1136.12 |
| 4096 | 32 | 8 | 33024 | 26.154 | 1252.90 | 2.388 | 107.20 | 28.542 | 1157.04 |
| 4096 | 32 | 16 | 66048 | 52.297 | 1253.14 | 4.724 | 108.37 | 57.022 | 1158.30 |
| 4096 | 32 | 32 | 132096 | 104.578 | 1253.34 | 7.266 | 140.93 | 111.844 | 1181.08 |
| 8192 | 32 | 1 | 8224 | 9.623 | 851.31 | 0.767 | 41.72 | 10.390 | 791.54 |
| 8192 | 32 | 2 | 16448 | 20.916 | 783.32 | 1.148 | 55.74 | 22.064 | 745.45 |
| 8192 | 32 | 4 | 32896 | 43.509 | 753.14 | 1.833 | 69.82 | 45.342 | 725.51 |
| 8192 | 32 | 8 | 65792 | 79.621 | 823.10 | 3.180 | 80.50 | 82.801 | 794.58 |
| 8192 | 32 | 16 | 131584 | 153.770 | 852.39 | 6.502 | 78.74 | 160.272 | 821.00 |
| 8192 | 32 | 32 | 263168 | 307.539 | 852.39 | 10.839 | 94.48 | 318.378 | 826.59 |

### llama-bench

| model | size | params | backend | threads | n_ubatch | fa | test | t/s |
| ----- | ---: | -----: | ------- | ------: | -------: | -: | ---- | --: |
| deepseek2 30B.A3B Q8_0 | 29.65 GiB | 29.94 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 | 1629.33 ± 0.27 |
| deepseek2 30B.A3B Q8_0 | 29.65 GiB | 29.94 B | MTL,BLAS | 16 | 2048 | 1 | tg32 | 59.58 ± 0.13 |
| deepseek2 30B.A3B Q8_0 | 29.65 GiB | 29.94 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 @ d4096 | 732.67 ± 0.42 |
| deepseek2 30B.A3B Q8_0 | 29.65 GiB | 29.94 B | MTL,BLAS | 16 | 2048 | 1 | tg32 @ d4096 | 47.44 ± 0.15 |
| deepseek2 30B.A3B Q8_0 | 29.65 GiB | 29.94 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 @ d8192 | 474.33 ± 0.33 |
| deepseek2 30B.A3B Q8_0 | 29.65 GiB | 29.94 B | MTL,BLAS | 16 | 2048 | 1 | tg32 @ d8192 | 40.20 ± 0.20 |
| deepseek2 30B.A3B Q8_0 | 29.65 GiB | 29.94 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 @ d16384 | 277.46 ± 0.09 |
| deepseek2 30B.A3B Q8_0 | 29.65 GiB | 29.94 B | MTL,BLAS | 16 | 2048 | 1 | tg32 @ d16384 | 31.50 ± 0.93 |
| deepseek2 30B.A3B Q8_0 | 29.65 GiB | 29.94 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 @ d32768 | 151.44 ± 0.05 |
| deepseek2 30B.A3B Q8_0 | 29.65 GiB | 29.94 B | MTL,BLAS | 16 | 2048 | 1 | tg32 @ d32768 | 21.81 ± 0.01 |

build: b828e18c7 (7948)