#!/usr/bin/env python3
"""
analyze_stats.py -- Summarize expert_stats.json and model size projections.
Usage: python analyze_stats.py [stats_file] [--keep 0.5]
"""
import argparse
import json
import statistics

parser = argparse.ArgumentParser()
parser.add_argument("stats", nargs="?", default="expert_stats_reap.json")
parser.add_argument("--keep", type=float, default=0.5, help="Fraction of experts to keep (default 0.5)")
args = parser.parse_args()

with open(args.stats) as f:
    data = json.load(f)

# Layer keys are stringified ints; sort numerically so output follows layer order.
layers = sorted(data.keys(), key=int)
n_layers = len(layers)
keep_ratio = args.keep

# Detect which scoring field is available (new REAP vs old importance_score).
sample_layer = data[layers[0]]
if "reap" in sample_layer:
    score_field = "reap"
    score_label = "REAP (gate_weight × ||expert_out||₂)"
elif "importance_score" in sample_layer:
    score_field = "importance_score"
    score_label = "importance_score (freq × avg_gate_weight) [legacy, no EAN]"
else:
    raise ValueError(f"No recognised score field in stats. Keys: {list(sample_layer.keys())}")
# ── Model architecture constants (Nemotron-3-Nano-30B-A3B) ──────────────────
N_EXPERTS = 128
N_EXPERT_USED = 6  # top-k per token
N_MOE_LAYERS = 23
N_TOTAL_LAYERS = 53
# Approximate parameter counts (bf16, billions)
PARAMS_TOTAL_B = 30.0
PARAMS_MOE_EXPERTS_B = 22.0  # bulk of MoE weight is in expert FFNs
PARAMS_NON_MOE_B = PARAMS_TOTAL_B - PARAMS_MOE_EXPERTS_B

# ── Header ──────────────────────────────────────────────────────────────────
print("=" * 70)
print(f" Expert Stats Analysis | file: {args.stats}")
print("=" * 70)

# ── Profiling completeness ───────────────────────────────────────────────────
# Use the first (numerically lowest) layer as the sample layer throughout so
# token counts and activation counts come from the SAME layer.
# (Fix: previously this read `list(data.values())[0]` here but
# `data[layers[0]]` below — mismatched layers if JSON order isn't numeric.)
sample_tokens = data[layers[0]]["total_tokens"]
# Each token activates N_EXPERT_USED experts, sum(activation_counts) = total*top_k
# Approximate samples: total_tokens / avg_tokens_per_sample
# We don't know avg, but can infer: total_tokens / (total_tokens / ctx) ≈ ctx chunks
# Better: just report tokens and note the user knows sample count
print(f"\n── Profiling progress ──────────────────────────────────────────────────")
print(f" MoE layers profiled : {n_layers} / {N_MOE_LAYERS}")
print(f" Tokens processed : {sample_tokens:,} (per layer)")
act_sum = sum(data[layers[0]]["activation_counts"])
# Sanity-check top-k with an explicit raise (an `assert` would be stripped
# under `python -O` and silently skip the check).
if abs(act_sum / sample_tokens - N_EXPERT_USED) >= 0.01:
    raise ValueError("unexpected top-k")
print(f" top-k confirmed : {N_EXPERT_USED} (sum activations / tokens = {act_sum/sample_tokens:.1f})")
# ── Per-layer importance score stats ────────────────────────────────────────
print(f"\n── Per-layer score distribution [{score_label}]")
print(f" {'Layer':>5} {'Min':>9} {'Max':>9} {'Range':>9} {'CV%':>6} {'Never':>5}")
global_cvs = []
for layer_key in layers:
    layer = data[layer_key]
    values = layer[score_field]
    lo = min(values)
    hi = max(values)
    # Coefficient of variation: stdev as a percentage of the mean.
    cv_pct = statistics.stdev(values) / statistics.mean(values) * 100
    global_cvs.append(cv_pct)
    print(f" {layer_key:>5} {lo:>9.5f} {hi:>9.5f} {hi-lo:>9.5f} {cv_pct:>6.3f}% {layer['never_activated']:>5}")
print(f"\n Mean CV across layers : {statistics.mean(global_cvs):.3f}%")
print(f" (CV < 1% = near-uniform; load-balancing is working as designed)")
# ── Capacity loss sweep across pruning levels ────────────────────────────────
# Paper (observer.py): REAP[i] = mean(ean_norm * softmax_router_weight) over tokens
# routed to expert i, averaged via OnlineStatsTracker weighted by expert_frequency.
# Our implementation (llama.cpp): same formula but routing weights are the top-k
# gate weights (post-softmax within top-k), not the full softmax over all 128.
# Impact: our weights are slightly higher than the paper's (renormalized to top-k
# only), but relative expert ranking within a layer should be preserved.
#
# IMPORTANT CAVEAT for this model (Nemotron-3-Nano-30B-A3B):
# The model was trained with a strong load-balancing auxiliary loss, so all 128
# experts have nearly identical activation frequency (~4.69%) AND nearly identical
# REAP scores (Gini ~0.015, top/bottom ratio ~1.1-1.35x). The score distribution
# is a smooth monotone curve with NO natural elbow or gap.
#
# This means:
# - REAP ranking beats random pruning by only ~1pp in mass terms at keep=33%
# - The cut point boundary (rank 42 vs 43) has near-zero gap in most layers
# - REAP paper results on Qwen3-30B-A3B likely had higher Gini (less tight
# load-balancing or more expert specialization in pre-training)
# - For this model, actual quality loss must be measured via eval, not predicted
# from REAP score variance
#
# Metrics reported:
# - kept_mass%: REAP mass in the KEPT experts as % of total (> keep_ratio% = good)
# - vs_random%: how much more mass the REAP-selected set retains vs a random set
# of the same size (= kept_mass% - keep_ratio%). Positive = REAP wins.
# - Rel.gap: score gap at cut / layer score range. Near 0 = no natural cut point.
# - Gini: inequality of score distribution. ~0.015 here = near-uniform.
def gini(scores):
    """Gini coefficient of a list of non-negative values.

    Returns 0.0 for an empty or all-zero input (treated as perfectly
    uniform rather than undefined).
    """
    total = sum(scores)
    if total == 0:
        return 0.0
    n = len(scores)
    ordered = sorted(scores)
    # Standard rank-weighted form: sum_i (2i - n - 1) * x_(i) / (n * total).
    weighted = sum((2 * rank - n - 1) * v for rank, v in enumerate(ordered, start=1))
    return weighted / (n * total)
def layer_stats(scores, n_keep):
    """Return capacity metrics for a single layer at a given keep count.

    Args:
        scores: per-expert scores for one layer.
        n_keep: number of top-scored experts that would be kept.

    Returns:
        Tuple (kept_mass_pct, vs_random_pct, rel_gap) where kept_mass_pct is
        the percentage of total score mass retained by the kept experts,
        vs_random_pct is that minus the uniform-random baseline, and rel_gap
        is the score gap at the cut divided by the layer's score range.
    """
    n = len(scores)
    # Expert indices ordered best-to-worst by score.
    order = sorted(range(n), key=lambda i: scores[i], reverse=True)
    total = sum(scores)
    kept_mass = sum(scores[i] for i in order[:n_keep])
    kept_frac = kept_mass / total if total > 0 else 0.0
    # A uniformly random keep of the same size retains n_keep/n of the mass.
    random_frac = n_keep / n
    vs_random = kept_frac - random_frac
    score_range = scores[order[0]] - scores[order[-1]]
    # Gap between the weakest kept expert and the strongest pruned one
    # (0 boundary when everything is kept).
    best_pruned = scores[order[n_keep]] if n_keep < n else 0
    gap = scores[order[n_keep - 1]] - best_pruned
    rel_gap = gap / score_range if score_range > 0 else 0.0
    return kept_frac * 100, vs_random * 100, rel_gap
# Keep ratios to sweep; the user-requested ratio is always included
# (the set union also deduplicates).
default_ratios = [0.10, 0.20, 0.25, 0.33, 0.40, 0.50, 0.60, 0.75]
sweep_ratios = sorted({*default_ratios, keep_ratio})
# Per-layer Gini coefficients (fixed — they do not depend on the keep ratio).
layer_ginis = {layer_key: gini(data[layer_key][score_field]) for layer_key in layers}
mean_gini = statistics.mean(layer_ginis.values())
worst_gini_layer = max(layer_ginis, key=layer_ginis.get)
print(f"\n── Score distribution inequality (Gini coefficient) ────────────────────")
print(f" Gini measures how non-uniform REAP scores are within each layer.")
print(f" Gini=0: all experts identical. Gini=1: one expert dominates.")
print(f" With load-balanced MoE, Gini is small — but any Gini > 0 means")
print(f" REAP ranking beats random pruning.")
print(f"")
print(f" {'Layer':>5} {'Gini':>8} {'Score range':>13} {'Max/Min ratio':>14}")
print(f" {'-'*5} {'-'*8} {'-'*13} {'-'*14}")
for layer_key in layers:
    values = data[layer_key][score_field]
    lo = min(values)
    hi = max(values)
    # Guard against a zero minimum (division by zero → report infinity).
    spread_ratio = hi / lo if lo > 0 else float('inf')
    print(f" {layer_key:>5} {layer_ginis[layer_key]:>8.5f} {hi-lo:>13.5f} {spread_ratio:>13.3f}x")
print(f"")
print(f" Mean Gini : {mean_gini:.5f} (worst layer: {worst_gini_layer})")
print(f"\n── Capacity retention sweep ─────────────────────────────────────────────")
print(f" Kept mass% = REAP mass in KEPT experts as % of total (higher = better)")
print(f" vs.rand% = Kept mass% minus uniform baseline (keep_ratio%)")
print(f" Positive = REAP beats random. Magnitude = advantage in pp.")
print(f" Rel.gap = score gap at cut / layer score range (higher = cleaner cut)")
print(f" WARNING: near-zero rel.gap and small vs.rand mean eval is the only ground truth.")
print(f"")
print(f" {'Keep':>5} {'Experts':>7} {'Kept mass%':>11} {'vs.rand%':>9} {'Rel.gap avg':>12} {'Worst layer':>11}")
print(f" {'-'*5} {'-'*7} {'-'*11} {'-'*9} {'-'*12} {'-'*11}")
sweep_results = {}
for ratio in sweep_ratios:
    nk = max(1, round(N_EXPERTS * ratio))
    mass_fracs, excesses, rel_gaps = [], [], []
    # Track the layer with the SMALLEST advantage over random — that is the
    # worst layer for pruning. (Fix: previously this tracked the maximum,
    # i.e. the best layer, while labelling it "Worst layer".)
    worst_excess, worst_layer_id = float("inf"), None
    for k in layers:
        scores = data[k][score_field]
        mf, exc, rg = layer_stats(scores, nk)
        mass_fracs.append(mf)
        excesses.append(exc)
        rel_gaps.append(rg)
        if exc < worst_excess:
            worst_excess = exc
            worst_layer_id = k
    avg_mf = statistics.mean(mass_fracs)
    avg_exc = statistics.mean(excesses)
    avg_rg = statistics.mean(rel_gaps)
    # Mark the row matching the user-requested keep ratio.
    marker = " <--" if abs(ratio - keep_ratio) < 1e-9 else ""
    print(f" {ratio:>5.0%} {nk:>7d} {avg_mf:>10.2f}% {avg_exc:>+9.2f}% {avg_rg:>11.4f} layer {worst_layer_id:>3}{marker}")
    sweep_results[ratio] = {
        "n_keep": nk, "avg_kept_mass": avg_mf, "avg_vs_random": avg_exc,
        "avg_rel_gap": avg_rg, "worst_layer_id": worst_layer_id, "worst_vs_random": worst_excess,
    }
print(f"")
print(f" vs.rand% quantifies REAP's advantage over random pruning in REAP-mass terms.")
print(f" For this model it is small (+0.7 to +1.5pp) due to tight load-balancing.")
print(f" Rel.gap near zero means scores are smooth with no natural cut — any threshold")
print(f" is as defensible as another. Actual quality delta requires empirical eval.")
# ── Expert keep/prune detail at selected keep_ratio ──────────────────────────
n_keep = max(1, round(N_EXPERTS * keep_ratio))
n_prune = N_EXPERTS - n_keep
print(f"\n── Expert pruning detail at keep_ratio={keep_ratio:.0%} ({n_keep} keep / {n_prune} prune per layer) ──")
print(f" {'Layer':>5} {'Kept mass%':>11} {'vs.rand%':>9} {'Rel.gap':>9} {'Min kept':>10} {'Max pruned':>11}")
print(f" {'-'*5} {'-'*11} {'-'*9} {'-'*9} {'-'*10} {'-'*11}")
layer_results = {}
for layer_key in layers:
    layer_scores = data[layer_key][score_field]
    # Scores in descending order; position n_keep-1 is the weakest kept
    # expert, position n_keep the strongest pruned one.
    descending = sorted(layer_scores, reverse=True)
    mf, exc, rg = layer_stats(layer_scores, n_keep)
    min_kept = descending[n_keep - 1]
    max_pruned = descending[n_keep] if n_prune > 0 else 0
    layer_results[layer_key] = {"mass_frac": mf, "excess": exc, "rel_gap": rg,
                                "min_kept": min_kept, "max_pruned": max_pruned}
    print(f" {layer_key:>5} {mf:>10.2f}% {exc:>+9.2f}% {rg:>9.4f} {min_kept:>10.5f} {max_pruned:>11.5f}")
avg_mf = statistics.mean(r["mass_frac"] for r in layer_results.values())
avg_exc = statistics.mean(r["excess"] for r in layer_results.values())
avg_rg = statistics.mean(r["rel_gap"] for r in layer_results.values())
print(f" {'AVG':>5} {avg_mf:>10.2f}% {avg_exc:>+9.2f}% {avg_rg:>9.4f}")
# ── Model size projections ───────────────────────────────────────────────────
print(f"\n── Model size projections ──────────────────────────────────────────────")
def model_size(keep):
    """Total parameter count (billions) when keeping `keep` fraction of experts."""
    return PARAMS_NON_MOE_B + PARAMS_MOE_EXPERTS_B * keep
original_b = model_size(1.0)
pruned_b = model_size(keep_ratio)
reduction_pct = (1 - pruned_b / original_b) * 100
# GGUF sizes at common quant levels (rough: 1B params ≈ quant_bpw/8 GB)
quants = [("Q8_0", 8.0), ("Q5_K_M", 5.5), ("Q4_K_M", 4.5), ("Q3_K_M", 3.35), ("Q2_K", 2.63)]
print(f" {'':20} {'Original':>10} {'Pruned':>10} {'Saved':>8}")
print(f" {'Parameters (B)':20} {original_b:>10.1f} {pruned_b:>10.1f} {original_b-pruned_b:>8.1f}B")
print(f" {'Reduction':20} {'':>10} {reduction_pct:>9.1f}%")
print()
print(f" Estimated GGUF sizes:")
print(f" {'Quant':10} {'Original':>10} {'Pruned':>10} {'Fits in':>12}")
for name, bpw in quants:
    orig_gb = original_b * bpw / 8
    prune_gb = pruned_b * bpw / 8
    # Which device class the pruned file fits on (16GB GPU with headroom,
    # then 32GB GPU, else CPU/system RAM).
    if prune_gb <= 15.5:
        fits = "16GB GPU"
    elif prune_gb <= 31:
        fits = "32GB GPU"
    else:
        fits = "CPU/RAM"
    print(f" {name:10} {orig_gb:>9.1f}G {prune_gb:>9.1f}G {fits:>12}")
# ── Active params per token (inference cost) ─────────────────────────────────
print(f"\n── Inference cost (active params per token) ────────────────────────────")
# Pruning removes whole experts but does not shrink them, so the per-expert
# parameter count — and hence the per-token active compute — is unchanged:
# the router still fires N_EXPERT_USED experts, just from a smaller pool.
params_per_expert_orig = PARAMS_MOE_EXPERTS_B / N_EXPERTS  # B per expert
params_per_expert_pruned = (PARAMS_MOE_EXPERTS_B * keep_ratio) / n_keep  # same, just fewer experts
active_orig = PARAMS_NON_MOE_B + N_EXPERT_USED * params_per_expert_orig * N_MOE_LAYERS / N_TOTAL_LAYERS
active_pruned = PARAMS_NON_MOE_B + N_EXPERT_USED * params_per_expert_pruned * N_MOE_LAYERS / N_TOTAL_LAYERS
print(f" Original : {active_orig:.2f}B active params/token (same expert size, more choice)")
print(f" Pruned : {active_pruned:.2f}B active params/token (same — top-k still fires {N_EXPERT_USED} experts)")
print(f" Note: active params per token are IDENTICAL — pruning only reduces")
print(f" model file size and memory footprint, not per-token compute.")
# ── Consistently low-importance experts ──────────────────────────────────────
print(f"\n── Experts consistently ranked low across all layers ───────────────────")
bottom_n = max(1, round(N_EXPERTS * 0.10))  # bottom 10%
# Count, per expert id, how many layers rank it in the bottom 10% by score.
low_count = {}
for k in layers:
    scores = data[k][score_field]
    ranked = sorted(range(len(scores)), key=lambda i: scores[i])
    for eid in ranked[:bottom_n]:
        low_count[eid] = low_count.get(eid, 0) + 1
consistent = sorted(low_count.items(), key=lambda x: -x[1])
consistent = [(eid, cnt) for eid, cnt in consistent if cnt >= 3]
print(f" (bottom 10% in >= 3 layers — most dispensable experts globally)")
print(f" Expert ID : layers in bottom 10%")
for eid, cnt in consistent[:20]:
    # Fix: the bar glyph was lost ("" * cnt always rendered nothing);
    # restore a visible block-character histogram bar.
    bar = "█" * cnt
    print(f" Expert {eid:>3} : {cnt:>2}/{n_layers} {bar}")
print()
print("=" * 70)