#!/usr/bin/env python3
"""
analyze_stats.py -- Summarize expert_stats.json and model size projections.

Usage:
    python analyze_stats.py [stats_file] [--keep 0.5]
"""
import argparse
import json
import statistics

# ── Model architecture constants (Nemotron-3-Nano-30B-A3B) ──────────────────
N_EXPERTS = 128
N_EXPERT_USED = 6  # top-k per token
N_MOE_LAYERS = 23
N_TOTAL_LAYERS = 53

# Approximate parameter counts (bf16, billions)
PARAMS_TOTAL_B = 30.0
PARAMS_MOE_EXPERTS_B = 22.0  # bulk of MoE weight is in expert FFNs
PARAMS_NON_MOE_B = PARAMS_TOTAL_B - PARAMS_MOE_EXPERTS_B

# Keep ratios always shown in the capacity sweep (the --keep value is added).
SWEEP_RATIOS = (0.10, 0.20, 0.25, 0.33, 0.40, 0.50, 0.60, 0.75)

# GGUF sizes at common quant levels (rough: 1B params ≈ quant_bpw/8 GB)
QUANTS = (("Q8_0", 8.0), ("Q5_K_M", 5.5), ("Q4_K_M", 4.5), ("Q3_K_M", 3.35), ("Q2_K", 2.63))


# ── Capacity loss sweep across pruning levels ────────────────────────────────
# Paper (observer.py): REAP[i] = mean(ean_norm * softmax_router_weight) over tokens
# routed to expert i, averaged via OnlineStatsTracker weighted by expert_frequency.
# Our implementation (llama.cpp): same formula but routing weights are the top-k
# gate weights (post-softmax within top-k), not the full softmax over all 128.
# Impact: our weights are slightly higher than the paper's (renormalized to top-k
# only), but relative expert ranking within a layer should be preserved.
#
# IMPORTANT CAVEAT for this model (Nemotron-3-Nano-30B-A3B):
# The model was trained with a strong load-balancing auxiliary loss, so all 128
# experts have nearly identical activation frequency (~4.69%) AND nearly identical
# REAP scores (Gini ~0.015, top/bottom ratio ~1.1-1.35x). The score distribution
# is a smooth monotone curve with NO natural elbow or gap.
#
# This means:
#   - REAP ranking beats random pruning by only ~1pp in mass terms at keep=33%
#   - The cut point boundary (rank 42 vs 43) has near-zero gap in most layers
#   - REAP paper results on Qwen3-30B-A3B likely had higher Gini (less tight
#     load-balancing or more expert specialization in pre-training)
#   - For this model, actual quality loss must be measured via eval, not predicted
#     from REAP score variance
#
# Metrics reported:
#   - kept_mass%: REAP mass in the KEPT experts as % of total (> keep_ratio% = good)
#   - vs_random%: how much more mass the REAP-selected set retains vs a random set
#     of the same size (= kept_mass% - keep_ratio%). Positive = REAP wins.
#   - Rel.gap: score gap at cut / layer score range. Near 0 = no natural cut point.
#   - Gini: inequality of score distribution. ~0.015 here = near-uniform.


def gini(scores):
    """Return the Gini coefficient of a list of non-negative values.

    Returns 0.0 when the list is empty or all values sum to zero.
    """
    n = len(scores)
    ordered = sorted(scores)
    total = sum(ordered)
    if total == 0:
        return 0.0
    # Rank-weighted sum over the ascending values: sum((2*rank - n - 1) * v).
    weighted = sum((2 * rank - n - 1) * v for rank, v in enumerate(ordered, start=1))
    return weighted / (n * total)


def layer_stats(scores, n_keep):
    """Return capacity metrics for a single layer at a given keep count.

    Args:
        scores: per-expert scores for one layer.
        n_keep: number of top-scoring experts to keep. Values outside
            ``[0, len(scores)]`` are clamped instead of raising IndexError.

    Returns:
        ``(kept_mass_pct, vs_random_pct, rel_gap)`` where
        ``kept_mass_pct`` is the share of total score held by the kept experts,
        ``vs_random_pct`` is that share minus the uniform expectation
        ``n_keep / n`` (both in percent), and ``rel_gap`` is the score gap at
        the cut divided by the layer's score range.  When every expert is kept
        the gap is measured against 0 (original behaviour); when none is kept
        there is no cut and the gap is 0.  An empty layer yields all zeros.
    """
    n = len(scores)
    if n == 0:
        return 0.0, 0.0, 0.0
    n_keep = max(0, min(n_keep, n))

    ordered = sorted(scores, reverse=True)
    total = sum(scores)
    kept_mass = sum(ordered[:n_keep])
    kept_frac = kept_mass / total if total > 0 else 0.0  # fraction of REAP mass kept
    random_frac = n_keep / n                             # uniform expectation
    vs_random = kept_frac - random_frac                  # positive = REAP beats random

    score_range = ordered[0] - ordered[-1]
    if n_keep == 0:
        gap = 0.0
    else:
        best_pruned = ordered[n_keep] if n_keep < n else 0
        gap = ordered[n_keep - 1] - best_pruned
    rel_gap = gap / score_range if score_range > 0 else 0.0
    return kept_frac * 100, vs_random * 100, rel_gap


def model_size(keep):
    """Return the approximate parameter count (billions) when ``keep`` of the
    expert weights are retained; non-expert weights are always kept in full."""
    expert_params = PARAMS_MOE_EXPERTS_B * keep
    return PARAMS_NON_MOE_B + expert_params


def _keep_count(n_experts, ratio):
    """Number of experts kept at ``ratio``: rounded, clamped to [1, n_experts]."""
    return max(1, min(n_experts, round(n_experts * ratio)))


def _cv_percent(scores):
    """Coefficient of variation (sample stdev / mean) in percent.

    Returns 0.0 where it is undefined (fewer than two values or zero mean)
    instead of raising.
    """
    if len(scores) < 2:
        return 0.0
    mean = statistics.mean(scores)
    if mean == 0:
        return 0.0
    return statistics.stdev(scores) / mean * 100


def _parse_args(argv=None):
    """Parse command-line arguments; exits with a usage error on a bad --keep."""
    parser = argparse.ArgumentParser()
    parser.add_argument("stats", nargs="?", default="expert_stats_reap.json")
    parser.add_argument("--keep", type=float, default=0.5,
                        help="Fraction of experts to keep (default 0.5)")
    args = parser.parse_args(argv)
    if not 0 < args.keep <= 1:
        parser.error(f"--keep must be in (0, 1], got {args.keep}")
    return args


def _load_stats(path):
    """Load the stats JSON and return ``(data, layer_keys_sorted_numerically)``."""
    with open(path, encoding="utf-8") as f:
        data = json.load(f)
    if not isinstance(data, dict) or not data:
        raise ValueError(f"{path}: expected a non-empty JSON object keyed by layer index")
    return data, sorted(data, key=int)


def _detect_score_field(sample_layer):
    """Detect which scoring field is available (new REAP vs old importance_score)."""
    if "reap" in sample_layer:
        return "reap", "REAP (gate_weight × ||expert_out||₂)"
    if "importance_score" in sample_layer:
        return ("importance_score",
                "importance_score (freq × avg_gate_weight) [legacy, no EAN]")
    raise ValueError(f"No recognised score field in stats. Keys: {list(sample_layer.keys())}")


def _report_progress(sample_layer, n_layers):
    """Print profiling completeness and verify the observed top-k."""
    # Each token activates N_EXPERT_USED experts, so
    # sum(activation_counts) = total_tokens * top_k.  The number of profiled
    # samples is not recoverable from the stats, so only tokens are reported.
    sample_tokens = sample_layer["total_tokens"]
    print("\n── Profiling progress ──────────────────────────────────────────────────")
    print(f" MoE layers profiled : {n_layers} / {N_MOE_LAYERS}")
    print(f" Tokens processed : {sample_tokens:,} (per layer)")
    act_sum = sum(sample_layer["activation_counts"])
    if sample_tokens <= 0:
        raise ValueError(f"total_tokens must be positive, got {sample_tokens}")
    observed_top_k = act_sum / sample_tokens
    # Explicit check rather than `assert`, which is stripped under `python -O`.
    if not abs(observed_top_k - N_EXPERT_USED) < 0.01:
        raise ValueError(
            f"unexpected top-k: sum activations / tokens = {observed_top_k:.3f}, "
            f"expected {N_EXPERT_USED}"
        )
    print(f" top-k confirmed : {N_EXPERT_USED} (sum activations / tokens = {act_sum/sample_tokens:.1f})")


def _report_score_distribution(data, scores_by_layer, score_label):
    """Print min/max/range/CV of the scores and never-activated count per layer."""
    print(f"\n── Per-layer score distribution [{score_label}]")
    print(f" {'Layer':>5} {'Min':>9} {'Max':>9} {'Range':>9} {'CV%':>6} {'Never':>5}")
    layer_cvs = []
    for k, s in scores_by_layer.items():
        mn, mx = min(s), max(s)
        cv = _cv_percent(s)
        layer_cvs.append(cv)
        print(f" {k:>5} {mn:>9.5f} {mx:>9.5f} {mx-mn:>9.5f} {cv:>6.3f}% {data[k]['never_activated']:>5}")
    print(f"\n Mean CV across layers : {statistics.mean(layer_cvs):.3f}%")
    print(" (CV < 1% = near-uniform; load-balancing is working as designed)")


def _report_gini(scores_by_layer):
    """Print the per-layer Gini coefficient (fixed, independent of keep ratio)."""
    layer_ginis = {k: gini(s) for k, s in scores_by_layer.items()}
    mean_gini = statistics.mean(layer_ginis.values())
    worst_gini_layer = max(layer_ginis, key=layer_ginis.get)  # layer with the highest Gini

    print("\n── Score distribution inequality (Gini coefficient) ────────────────────")
    print(" Gini measures how non-uniform REAP scores are within each layer.")
    print(" Gini=0: all experts identical. Gini=1: one expert dominates.")
    print(" With load-balanced MoE, Gini is small — but any Gini > 0 means")
    print(" REAP ranking beats random pruning.")
    print()
    print(f" {'Layer':>5} {'Gini':>8} {'Score range':>13} {'Max/Min ratio':>14}")
    print(f" {'-'*5} {'-'*8} {'-'*13} {'-'*14}")
    for k, s in scores_by_layer.items():
        mn, mx = min(s), max(s)
        g = layer_ginis[k]
        ratio_mm = mx / mn if mn > 0 else float('inf')
        print(f" {k:>5} {g:>8.5f} {mx-mn:>13.5f} {ratio_mm:>13.3f}x")
    print()
    print(f" Mean Gini : {mean_gini:.5f} (worst layer: {worst_gini_layer})")


def _report_sweep(scores_by_layer, keep_ratio, n_experts):
    """Print layer-averaged capacity metrics over a range of keep ratios."""
    sweep_ratios = sorted(set(SWEEP_RATIOS) | {keep_ratio})

    print("\n── Capacity retention sweep ─────────────────────────────────────────────")
    print(" Kept mass% = REAP mass in KEPT experts as % of total (higher = better)")
    print(" vs.rand% = Kept mass% minus uniform baseline (keep_ratio%)")
    print(" Positive = REAP beats random. Magnitude = advantage in pp.")
    print(" Rel.gap = score gap at cut / layer score range (higher = cleaner cut)")
    print(" WARNING: near-zero rel.gap and small vs.rand mean eval is the only ground truth.")
    print()
    print(f" {'Keep':>5} {'Experts':>7} {'Kept mass%':>11} {'vs.rand%':>9} {'Rel.gap avg':>12} {'Worst layer':>11}")
    print(f" {'-'*5} {'-'*7} {'-'*11} {'-'*9} {'-'*12} {'-'*11}")

    for ratio in sweep_ratios:
        nk = _keep_count(n_experts, ratio)
        mass_fracs, excesses, rel_gaps = [], [], []
        # NOTE(review): this selects the layer with the LARGEST vs.rand excess,
        # yet a larger excess is the better outcome per the legend above.
        # Kept as in the original -- confirm whether min was intended.
        worst_excess, worst_layer_id = -999.0, None
        for k, scores in scores_by_layer.items():
            # Keep count follows the layer's actual expert count.
            mf, exc, rg = layer_stats(scores, _keep_count(len(scores), ratio))
            mass_fracs.append(mf)
            excesses.append(exc)
            rel_gaps.append(rg)
            if exc > worst_excess:
                worst_excess = exc
                worst_layer_id = k
        avg_mf = statistics.mean(mass_fracs)
        avg_exc = statistics.mean(excesses)
        avg_rg = statistics.mean(rel_gaps)
        marker = " <--" if abs(ratio - keep_ratio) < 1e-9 else ""
        print(f" {ratio:>5.0%} {nk:>7d} {avg_mf:>10.2f}% {avg_exc:>+9.2f}% {avg_rg:>11.4f} layer {worst_layer_id:>3}{marker}")

    print()
    print(" vs.rand% quantifies REAP's advantage over random pruning in REAP-mass terms.")
    print(" For this model it is small (+0.7 to +1.5pp) due to tight load-balancing.")
    print(" Rel.gap near zero means scores are smooth with no natural cut — any threshold")
    print(" is as defensible as another. Actual quality delta requires empirical eval.")


def _report_pruning_detail(scores_by_layer, keep_ratio, n_experts):
    """Print per-layer keep/prune metrics at the selected keep ratio."""
    n_keep = _keep_count(n_experts, keep_ratio)
    n_prune = n_experts - n_keep
    print(f"\n── Expert pruning detail at keep_ratio={keep_ratio:.0%} ({n_keep} keep / {n_prune} prune per layer) ──")
    print(f" {'Layer':>5} {'Kept mass%':>11} {'vs.rand%':>9} {'Rel.gap':>9} {'Min kept':>10} {'Max pruned':>11}")
    print(f" {'-'*5} {'-'*11} {'-'*9} {'-'*9} {'-'*10} {'-'*11}")

    mass_fracs, excesses, rel_gaps = [], [], []
    for k, scores in scores_by_layer.items():
        layer_keep = _keep_count(len(scores), keep_ratio)
        ordered = sorted(scores, reverse=True)
        mf, exc, rg = layer_stats(scores, layer_keep)
        min_kept = ordered[layer_keep - 1]
        max_pruned = ordered[layer_keep] if layer_keep < len(ordered) else 0
        mass_fracs.append(mf)
        excesses.append(exc)
        rel_gaps.append(rg)
        print(f" {k:>5} {mf:>10.2f}% {exc:>+9.2f}% {rg:>9.4f} {min_kept:>10.5f} {max_pruned:>11.5f}")

    avg_mf = statistics.mean(mass_fracs)
    avg_exc = statistics.mean(excesses)
    avg_rg = statistics.mean(rel_gaps)
    print(f" {'AVG':>5} {avg_mf:>10.2f}% {avg_exc:>+9.2f}% {avg_rg:>9.4f}")


def _report_size_projections(keep_ratio):
    """Print parameter counts and rough GGUF sizes before/after pruning."""
    print("\n── Model size projections ──────────────────────────────────────────────")
    # Experts are pruned in whole units, so the retained fraction is
    # n_keep / N_EXPERTS rather than the raw requested ratio.
    effective_keep = _keep_count(N_EXPERTS, keep_ratio) / N_EXPERTS
    original_b = model_size(1.0)
    pruned_b = model_size(effective_keep)
    reduction_pct = (1 - pruned_b / original_b) * 100

    print(f" {'':20} {'Original':>10} {'Pruned':>10} {'Saved':>8}")
    print(f" {'Parameters (B)':20} {original_b:>10.1f} {pruned_b:>10.1f} {original_b-pruned_b:>8.1f}B")
    print(f" {'Reduction':20} {'':>10} {reduction_pct:>9.1f}%")
    print()
    print(" Estimated GGUF sizes:")
    print(f" {'Quant':10} {'Original':>10} {'Pruned':>10} {'Fits in':>12}")
    for name, bpw in QUANTS:
        orig_gb = original_b * bpw / 8
        prune_gb = pruned_b * bpw / 8
        # VRAM fit (16GB GPU)
        if prune_gb <= 15.5:
            fits = "16GB GPU"
        elif prune_gb <= 31:
            fits = "32GB GPU"
        else:
            fits = "CPU/RAM"
        print(f" {name:10} {orig_gb:>9.1f}G {prune_gb:>9.1f}G {fits:>12}")


def _report_inference_cost():
    """Print the estimated active parameters per token before/after pruning."""
    print("\n── Inference cost (active params per token) ────────────────────────────")
    # After pruning the router still picks top-k, just from a smaller pool, and
    # pruning removes whole experts without shrinking the remaining ones.
    # Active params = non-MoE + N_EXPERT_USED * (params per single expert),
    # so the estimate is the same before and after pruning.
    params_per_expert = PARAMS_MOE_EXPERTS_B / N_EXPERTS  # B per expert
    # NOTE(review): the N_MOE_LAYERS / N_TOTAL_LAYERS factor is kept from the
    # original; PARAMS_MOE_EXPERTS_B already appears to be a whole-model total,
    # so confirm this scaling is intended.
    active_orig = PARAMS_NON_MOE_B + N_EXPERT_USED * params_per_expert * N_MOE_LAYERS / N_TOTAL_LAYERS
    active_pruned = active_orig
    print(f" Original : {active_orig:.2f}B active params/token (same expert size, more choice)")
    print(f" Pruned : {active_pruned:.2f}B active params/token (same — top-k still fires {N_EXPERT_USED} experts)")
    print(" Note: active params per token are IDENTICAL — pruning only reduces")
    print(" model file size and memory footprint, not per-token compute.")


def _report_low_experts(scores_by_layer):
    """Print expert IDs that fall in the bottom 10% of at least three layers."""
    print("\n── Experts consistently ranked low across all layers ───────────────────")
    n_layers = len(scores_by_layer)
    low_count = {}
    for scores in scores_by_layer.values():
        bottom_n = _keep_count(len(scores), 0.10)  # bottom 10%
        ranked = sorted(range(len(scores)), key=lambda i: scores[i])
        for eid in ranked[:bottom_n]:
            low_count[eid] = low_count.get(eid, 0) + 1

    consistent = sorted(low_count.items(), key=lambda item: -item[1])
    consistent = [(eid, cnt) for eid, cnt in consistent if cnt >= 3]
    print(" (bottom 10% in >= 3 layers — most dispensable experts globally)")
    print(" Expert ID : layers in bottom 10%")
    for eid, cnt in consistent[:20]:
        bar = "█" * cnt
        print(f" Expert {eid:>3} : {cnt:>2}/{n_layers} {bar}")


def main(argv=None):
    """Run the full report.

    Args:
        argv: argument list (without the program name); defaults to
            ``sys.argv[1:]`` when None.

    Raises:
        ValueError: if the stats file is empty or malformed, has no recognised
            score field, or its activation counts do not match the expected
            top-k.
        SystemExit: on invalid command-line arguments.
    """
    args = _parse_args(argv)
    keep_ratio = args.keep

    data, layers = _load_stats(args.stats)
    # The numerically first layer is the reference for field detection,
    # token count and the top-k sanity check.
    sample_layer = data[layers[0]]
    score_field, score_label = _detect_score_field(sample_layer)

    scores_by_layer = {}
    for k in layers:
        scores = data[k][score_field]
        if not scores:
            raise ValueError(f"layer {k}: empty '{score_field}' list")
        scores_by_layer[k] = scores
    # Expert count as found in the data; equals N_EXPERTS for the target model.
    n_experts = len(scores_by_layer[layers[0]])

    # ── Header ──────────────────────────────────────────────────────────────
    print("=" * 70)
    print(f" Expert Stats Analysis | file: {args.stats}")
    print("=" * 70)

    _report_progress(sample_layer, len(layers))
    _report_score_distribution(data, scores_by_layer, score_label)
    _report_gini(scores_by_layer)
    _report_sweep(scores_by_layer, keep_ratio, n_experts)
    _report_pruning_detail(scores_by_layer, keep_ratio, n_experts)
    _report_size_projections(keep_ratio)
    _report_inference_cost()
    _report_low_experts(scores_by_layer)

    print()
    print("=" * 70)


if __name__ == "__main__":
    main()