#!/usr/bin/env python3
"""
analyze_stats.py -- Summarize expert_stats.json and model size projections.
Usage: python analyze_stats.py [stats_file] [--keep 0.5]
"""
import argparse
import json
import statistics

parser = argparse.ArgumentParser()
parser.add_argument("stats", nargs="?", default="expert_stats_reap.json")
parser.add_argument("--keep", type=float, default=0.5, help="Fraction of experts to keep (default 0.5)")
args = parser.parse_args()

with open(args.stats) as f:
    data = json.load(f)

# Layer keys are stringified ints; sort numerically so output follows layer order.
layers = sorted(data.keys(), key=int)
n_layers = len(layers)
keep_ratio = args.keep

# Detect which scoring field is available (new REAP vs old importance_score).
sample_layer = data[layers[0]]
if "reap" in sample_layer:
    score_field = "reap"
    score_label = "REAP (gate_weight × ||expert_out||₂)"
elif "importance_score" in sample_layer:
    score_field = "importance_score"
    score_label = "importance_score (freq × avg_gate_weight) [legacy, no EAN]"
else:
    raise ValueError(f"No recognised score field in stats. Keys: {list(sample_layer.keys())}")
# ── Model architecture constants (Nemotron-3-Nano-30B-A3B) ──────────────────
N_EXPERTS = 128
N_EXPERT_USED = 6  # top-k per token
N_MOE_LAYERS = 23
N_TOTAL_LAYERS = 53
# Approximate parameter counts (bf16, billions)
PARAMS_TOTAL_B = 30.0
PARAMS_MOE_EXPERTS_B = 22.0  # bulk of MoE weight is in expert FFNs
PARAMS_NON_MOE_B = PARAMS_TOTAL_B - PARAMS_MOE_EXPERTS_B

# ── Header ──────────────────────────────────────────────────────────────────
print("=" * 70)
print(f" Expert Stats Analysis | file: {args.stats}")
print("=" * 70)

# ── Profiling completeness ───────────────────────────────────────────────────
# Use the first (numerically lowest) layer as the sample layer throughout so
# token counts and activation counts come from the SAME layer.
# (Fix: previously this read `list(data.values())[0]` here but
# `data[layers[0]]` below — mismatched layers if JSON order isn't numeric.)
sample_tokens = data[layers[0]]["total_tokens"]
# Each token activates N_EXPERT_USED experts, sum(activation_counts) = total*top_k
# Approximate samples: total_tokens / avg_tokens_per_sample
# We don't know avg, but can infer: total_tokens / (total_tokens / ctx) ≈ ctx chunks
# Better: just report tokens and note the user knows sample count
print(f"\n── Profiling progress ──────────────────────────────────────────────────")
print(f" MoE layers profiled : {n_layers} / {N_MOE_LAYERS}")
print(f" Tokens processed : {sample_tokens:,} (per layer)")
act_sum = sum(data[layers[0]]["activation_counts"])
# Sanity-check top-k with an explicit raise (an `assert` would be stripped
# under `python -O` and silently skip the check).
if abs(act_sum / sample_tokens - N_EXPERT_USED) >= 0.01:
    raise ValueError("unexpected top-k")
print(f" top-k confirmed : {N_EXPERT_USED} (sum activations / tokens = {act_sum/sample_tokens:.1f})")
# ── Per-layer importance score stats ────────────────────────────────────────
print(f"\n── Per-layer score distribution [{score_label}]")
print(f" {'Layer':>5} {'Min':>9} {'Max':>9} {'Range':>9} {'CV%':>6} {'Never':>5}")
global_cvs = []
for layer_key in layers:
    layer = data[layer_key]
    values = layer[score_field]
    lo = min(values)
    hi = max(values)
    # Coefficient of variation: stdev as a percentage of the mean.
    cv_pct = statistics.stdev(values) / statistics.mean(values) * 100
    global_cvs.append(cv_pct)
    print(f" {layer_key:>5} {lo:>9.5f} {hi:>9.5f} {hi-lo:>9.5f} {cv_pct:>6.3f}% {layer['never_activated']:>5}")
print(f"\n Mean CV across layers : {statistics.mean(global_cvs):.3f}%")
print(f" (CV < 1% = near-uniform; load-balancing is working as designed)")
# ── Capacity loss sweep across pruning levels ────────────────────────────────
# Paper (observer.py): REAP[i] = mean(ean_norm * softmax_router_weight) over tokens
# routed to expert i, averaged via OnlineStatsTracker weighted by expert_frequency.
# Our implementation (llama.cpp): same formula but routing weights are the top-k
# gate weights (post-softmax within top-k), not the full softmax over all 128.
# Impact: our weights are slightly higher than the paper's (renormalized to top-k
# only), but relative expert ranking within a layer should be preserved.
#
# IMPORTANT CAVEAT for this model (Nemotron-3-Nano-30B-A3B):
# The model was trained with a strong load-balancing auxiliary loss, so all 128
# experts have nearly identical activation frequency (~4.69%) AND nearly identical
# REAP scores (Gini ~0.015, top/bottom ratio ~1.1-1.35x). The score distribution
# is a smooth monotone curve with NO natural elbow or gap.
#
# This means:
# - REAP ranking beats random pruning by only ~1pp in mass terms at keep=33%
# - The cut point boundary (rank 42 vs 43) has near-zero gap in most layers
# - REAP paper results on Qwen3-30B-A3B likely had higher Gini (less tight
# load-balancing or more expert specialization in pre-training)
# - For this model, actual quality loss must be measured via eval, not predicted
# from REAP score variance
#
# Metrics reported:
# - kept_mass%: REAP mass in the KEPT experts as % of total (> keep_ratio% = good)
# - vs_random%: how much more mass the REAP-selected set retains vs a random set
# of the same size (= kept_mass% - keep_ratio%). Positive = REAP wins.
# - Rel.gap: score gap at cut / layer score range. Near 0 = no natural cut point.
# - Gini: inequality of score distribution. ~0.015 here = near-uniform.
def gini(scores):
    """Gini coefficient of a list of non-negative values.

    Returns 0.0 for an empty or all-zero input (treated as perfectly
    uniform rather than undefined).
    """
    total = sum(scores)
    if total == 0:
        return 0.0
    n = len(scores)
    ordered = sorted(scores)
    # Standard rank-weighted form: sum_i (2i - n - 1) * x_(i) / (n * total).
    weighted = sum((2 * rank - n - 1) * v for rank, v in enumerate(ordered, start=1))
    return weighted / (n * total)
def layer_stats(scores, n_keep):
    """Return capacity metrics for a single layer at a given keep count.

    Args:
        scores: per-expert scores for one layer.
        n_keep: number of top-scored experts that would be kept.

    Returns:
        Tuple (kept_mass_pct, vs_random_pct, rel_gap) where kept_mass_pct is
        the percentage of total score mass retained by the kept experts,
        vs_random_pct is that minus the uniform-random baseline, and rel_gap
        is the score gap at the cut divided by the layer's score range.
    """
    n = len(scores)
    # Expert indices ordered best-to-worst by score.
    order = sorted(range(n), key=lambda i: scores[i], reverse=True)
    total = sum(scores)
    kept_mass = sum(scores[i] for i in order[:n_keep])
    kept_frac = kept_mass / total if total > 0 else 0.0
    # A uniformly random keep of the same size retains n_keep/n of the mass.
    random_frac = n_keep / n
    vs_random = kept_frac - random_frac
    score_range = scores[order[0]] - scores[order[-1]]
    # Gap between the weakest kept expert and the strongest pruned one
    # (0 boundary when everything is kept).
    best_pruned = scores[order[n_keep]] if n_keep < n else 0
    gap = scores[order[n_keep - 1]] - best_pruned
    rel_gap = gap / score_range if score_range > 0 else 0.0
    return kept_frac * 100, vs_random * 100, rel_gap
# Keep ratios to sweep; the user-requested ratio is always included
# (the set union also deduplicates).
default_ratios = [0.10, 0.20, 0.25, 0.33, 0.40, 0.50, 0.60, 0.75]
sweep_ratios = sorted({*default_ratios, keep_ratio})
# Per-layer Gini coefficients (fixed — they do not depend on the keep ratio).
layer_ginis = {layer_key: gini(data[layer_key][score_field]) for layer_key in layers}
mean_gini = statistics.mean(layer_ginis.values())
worst_gini_layer = max(layer_ginis, key=layer_ginis.get)
print(f"\n── Score distribution inequality (Gini coefficient) ────────────────────")
print(f" Gini measures how non-uniform REAP scores are within each layer.")
print(f" Gini=0: all experts identical. Gini=1: one expert dominates.")
print(f" With load-balanced MoE, Gini is small — but any Gini > 0 means")
print(f" REAP ranking beats random pruning.")
print(f"")
print(f" {'Layer':>5} {'Gini':>8} {'Score range':>13} {'Max/Min ratio':>14}")
print(f" {'-'*5} {'-'*8} {'-'*13} {'-'*14}")
for layer_key in layers:
    values = data[layer_key][score_field]
    lo = min(values)
    hi = max(values)
    # Guard against a zero minimum (division by zero → report infinity).
    spread_ratio = hi / lo if lo > 0 else float('inf')
    print(f" {layer_key:>5} {layer_ginis[layer_key]:>8.5f} {hi-lo:>13.5f} {spread_ratio:>13.3f}x")
print(f"")
print(f" Mean Gini : {mean_gini:.5f} (worst layer: {worst_gini_layer})")
print(f"\n── Capacity retention sweep ─────────────────────────────────────────────")
print(f" Kept mass% = REAP mass in KEPT experts as % of total (higher = better)")
print(f" vs.rand% = Kept mass% minus uniform baseline (keep_ratio%)")
print(f" Positive = REAP beats random. Magnitude = advantage in pp.")
print(f" Rel.gap = score gap at cut / layer score range (higher = cleaner cut)")
print(f" WARNING: near-zero rel.gap and small vs.rand mean eval is the only ground truth.")
print(f"")
print(f" {'Keep':>5} {'Experts':>7} {'Kept mass%':>11} {'vs.rand%':>9} {'Rel.gap avg':>12} {'Worst layer':>11}")
print(f" {'-'*5} {'-'*7} {'-'*11} {'-'*9} {'-'*12} {'-'*11}")
sweep_results = {}
for ratio in sweep_ratios:
    nk = max(1, round(N_EXPERTS * ratio))
    mass_fracs, excesses, rel_gaps = [], [], []
    # Track the layer with the SMALLEST advantage over random — that is the
    # worst layer for pruning. (Fix: previously this tracked the maximum,
    # i.e. the best layer, while labelling it "Worst layer".)
    worst_excess, worst_layer_id = float("inf"), None
    for k in layers:
        scores = data[k][score_field]
        mf, exc, rg = layer_stats(scores, nk)
        mass_fracs.append(mf)
        excesses.append(exc)
        rel_gaps.append(rg)
        if exc < worst_excess:
            worst_excess = exc
            worst_layer_id = k
    avg_mf = statistics.mean(mass_fracs)
    avg_exc = statistics.mean(excesses)
    avg_rg = statistics.mean(rel_gaps)
    # Mark the row matching the user-requested keep ratio.
    marker = " <--" if abs(ratio - keep_ratio) < 1e-9 else ""
    print(f" {ratio:>5.0%} {nk:>7d} {avg_mf:>10.2f}% {avg_exc:>+9.2f}% {avg_rg:>11.4f} layer {worst_layer_id:>3}{marker}")
    sweep_results[ratio] = {
        "n_keep": nk, "avg_kept_mass": avg_mf, "avg_vs_random": avg_exc,
        "avg_rel_gap": avg_rg, "worst_layer_id": worst_layer_id, "worst_vs_random": worst_excess,
    }
print(f"")
print(f" vs.rand% quantifies REAP's advantage over random pruning in REAP-mass terms.")
print(f" For this model it is small (+0.7 to +1.5pp) due to tight load-balancing.")
print(f" Rel.gap near zero means scores are smooth with no natural cut — any threshold")
print(f" is as defensible as another. Actual quality delta requires empirical eval.")
# ── Expert keep/prune detail at selected keep_ratio ──────────────────────────
n_keep = max(1, round(N_EXPERTS * keep_ratio))
n_prune = N_EXPERTS - n_keep
print(f"\n── Expert pruning detail at keep_ratio={keep_ratio:.0%} ({n_keep} keep / {n_prune} prune per layer) ──")
print(f" {'Layer':>5} {'Kept mass%':>11} {'vs.rand%':>9} {'Rel.gap':>9} {'Min kept':>10} {'Max pruned':>11}")
print(f" {'-'*5} {'-'*11} {'-'*9} {'-'*9} {'-'*10} {'-'*11}")
layer_results = {}
for layer_key in layers:
    layer_scores = data[layer_key][score_field]
    # Scores in descending order; position n_keep-1 is the weakest kept
    # expert, position n_keep the strongest pruned one.
    descending = sorted(layer_scores, reverse=True)
    mf, exc, rg = layer_stats(layer_scores, n_keep)
    min_kept = descending[n_keep - 1]
    max_pruned = descending[n_keep] if n_prune > 0 else 0
    layer_results[layer_key] = {"mass_frac": mf, "excess": exc, "rel_gap": rg,
                                "min_kept": min_kept, "max_pruned": max_pruned}
    print(f" {layer_key:>5} {mf:>10.2f}% {exc:>+9.2f}% {rg:>9.4f} {min_kept:>10.5f} {max_pruned:>11.5f}")
avg_mf = statistics.mean(r["mass_frac"] for r in layer_results.values())
avg_exc = statistics.mean(r["excess"] for r in layer_results.values())
avg_rg = statistics.mean(r["rel_gap"] for r in layer_results.values())
print(f" {'AVG':>5} {avg_mf:>10.2f}% {avg_exc:>+9.2f}% {avg_rg:>9.4f}")
# ── Model size projections ───────────────────────────────────────────────────
print(f"\n── Model size projections ──────────────────────────────────────────────")
def model_size(keep):
    """Total parameter count (billions) when keeping `keep` fraction of experts."""
    return PARAMS_NON_MOE_B + PARAMS_MOE_EXPERTS_B * keep
original_b = model_size(1.0)
pruned_b = model_size(keep_ratio)
reduction_pct = (1 - pruned_b / original_b) * 100
# GGUF sizes at common quant levels (rough: 1B params ≈ quant_bpw/8 GB)
quants = [("Q8_0", 8.0), ("Q5_K_M", 5.5), ("Q4_K_M", 4.5), ("Q3_K_M", 3.35), ("Q2_K", 2.63)]
print(f" {'':20} {'Original':>10} {'Pruned':>10} {'Saved':>8}")
print(f" {'Parameters (B)':20} {original_b:>10.1f} {pruned_b:>10.1f} {original_b-pruned_b:>8.1f}B")
print(f" {'Reduction':20} {'':>10} {reduction_pct:>9.1f}%")
print()
print(f" Estimated GGUF sizes:")
print(f" {'Quant':10} {'Original':>10} {'Pruned':>10} {'Fits in':>12}")
for name, bpw in quants:
    orig_gb = original_b * bpw / 8
    prune_gb = pruned_b * bpw / 8
    # Which device class the pruned file fits on (16GB GPU with headroom,
    # then 32GB GPU, else CPU/system RAM).
    if prune_gb <= 15.5:
        fits = "16GB GPU"
    elif prune_gb <= 31:
        fits = "32GB GPU"
    else:
        fits = "CPU/RAM"
    print(f" {name:10} {orig_gb:>9.1f}G {prune_gb:>9.1f}G {fits:>12}")
# ── Active params per token (inference cost) ─────────────────────────────────
print(f"\n── Inference cost (active params per token) ────────────────────────────")
# Pruning removes whole experts but does not shrink them, so the per-expert
# parameter count — and hence the per-token active compute — is unchanged:
# the router still fires N_EXPERT_USED experts, just from a smaller pool.
params_per_expert_orig = PARAMS_MOE_EXPERTS_B / N_EXPERTS  # B per expert
params_per_expert_pruned = (PARAMS_MOE_EXPERTS_B * keep_ratio) / n_keep  # same, just fewer experts
active_orig = PARAMS_NON_MOE_B + N_EXPERT_USED * params_per_expert_orig * N_MOE_LAYERS / N_TOTAL_LAYERS
active_pruned = PARAMS_NON_MOE_B + N_EXPERT_USED * params_per_expert_pruned * N_MOE_LAYERS / N_TOTAL_LAYERS
print(f" Original : {active_orig:.2f}B active params/token (same expert size, more choice)")
print(f" Pruned : {active_pruned:.2f}B active params/token (same — top-k still fires {N_EXPERT_USED} experts)")
print(f" Note: active params per token are IDENTICAL — pruning only reduces")
print(f" model file size and memory footprint, not per-token compute.")
# ── Consistently low-importance experts ──────────────────────────────────────
print(f"\n── Experts consistently ranked low across all layers ───────────────────")
bottom_n = max(1, round(N_EXPERTS * 0.10))  # bottom 10%
# Count, per expert id, how many layers rank it in the bottom 10% by score.
low_count = {}
for k in layers:
    scores = data[k][score_field]
    ranked = sorted(range(len(scores)), key=lambda i: scores[i])
    for eid in ranked[:bottom_n]:
        low_count[eid] = low_count.get(eid, 0) + 1
consistent = sorted(low_count.items(), key=lambda x: -x[1])
consistent = [(eid, cnt) for eid, cnt in consistent if cnt >= 3]
print(f" (bottom 10% in >= 3 layers — most dispensable experts globally)")
print(f" Expert ID : layers in bottom 10%")
for eid, cnt in consistent[:20]:
    # Fix: the bar glyph was lost ("" * cnt always rendered nothing);
    # restore a visible block-character histogram bar.
    bar = "█" * cnt
    print(f" Expert {eid:>3} : {cnt:>2}/{n_layers} {bar}")
print()
print("=" * 70)