diff --git a/tools/moe-pruning/analyze_stats.py b/tools/moe-pruning/analyze_stats.py index e7641a3bb6..2e0821f323 100644 --- a/tools/moe-pruning/analyze_stats.py +++ b/tools/moe-pruning/analyze_stats.py @@ -3,7 +3,7 @@ analyze_stats.py -- Summarize expert_stats.json and model size projections. Usage: python analyze_stats.py [stats_file] [--keep 0.5] """ -import json, sys, statistics, argparse +import json, statistics, argparse parser = argparse.ArgumentParser() parser.add_argument("stats", nargs="?", default="expert_stats_reap.json") diff --git a/tools/moe-pruning/gguf_prune.py b/tools/moe-pruning/gguf_prune.py index 2a36d5cf53..df3e638ab4 100644 --- a/tools/moe-pruning/gguf_prune.py +++ b/tools/moe-pruning/gguf_prune.py @@ -31,14 +31,15 @@ Usage: --keep_n 32 """ +from __future__ import annotations + import argparse import json import re -import sys from pathlib import Path import numpy as np -from gguf import GGUFReader, GGUFWriter, GGMLQuantizationType, GGUFValueType +from gguf import GGUFReader, GGUFWriter, GGUFValueType # ── Constants ───────────────────────────────────────────────────────────────── @@ -187,7 +188,7 @@ def main(): kept: dict[int, list[int]] = {} for tensor in reader.tensors: il, suffix = layer_and_suffix(tensor.name) - if il is None or not is_expert_suffix(suffix): + if il is None or suffix is None or not is_expert_suffix(suffix): continue if il in kept: continue # already computed for this layer @@ -222,9 +223,10 @@ def main(): n_pruned = 0 for tensor in reader.tensors: il, suffix = layer_and_suffix(tensor.name) - is_expert = il is not None and is_expert_suffix(suffix) + is_expert = il is not None and suffix is not None and is_expert_suffix(suffix) if is_expert: + assert il is not None k = kept[il] data = slice_expert_axis(tensor.data, k) writer.add_tensor( diff --git a/tools/moe-pruning/nemotron_reap.py b/tools/moe-pruning/nemotron_reap.py deleted file mode 100644 index fac5831d3c..0000000000 --- a/tools/moe-pruning/nemotron_reap.py +++ /dev/null @@ -1,296 +0,0 @@ -""" -NemotronH Expert Activation Profiler + Pruner -Two-phase: profile with 4-bit on GPU, prune bf16 on CPU. - -Usage: - # Phase 1 - profile - python nemotron_reap.py profile \ - --model unsloth/Nemotron-3-Nano-30B-A3B \ - --prompts training-data.jsonl \ - --output expert_stats.json - - # Phase 2 - prune - python nemotron_reap.py prune \ - --model unsloth/Nemotron-3-Nano-30B-A3B \ - --stats expert_stats.json \ - --keep_ratio 0.20 \ - --output ./nemotron-pruned-25e -""" - -import os -os.environ["TORCH_COMPILE_DISABLE"] = "1" # prevent inductor hang during save_pretrained - -import json -import argparse -import torch -import numpy as np -from collections import defaultdict -from transformers import AutoTokenizer, AutoModelForCausalLM - -try: - from transformers import BitsAndBytesConfig - import patch_bnb # noqa: F401 — patches Params4bit.__new__ for transformers 5.x compat - HAS_BNB = True -except ImportError: - HAS_BNB = False - - -# ── Tracker ─────────────────────────────────────────────────────────────────── - -class ExpertActivationTracker: - def __init__(self, n_experts: int = 128): - self.n_experts = n_experts - self.activation_counts = defaultdict(lambda: np.zeros(n_experts, dtype=np.int64)) - self.activation_weights = defaultdict(lambda: np.zeros(n_experts, dtype=np.float64)) - self.total_tokens = defaultdict(int) - self._hooks = [] - - def register_hooks(self, model): - count = 0 - for layer_idx, block in enumerate(model.backbone.layers): - if block.block_type == "moe": - h = block.mixer.gate.register_forward_hook(self._make_hook(layer_idx)) - self._hooks.append(h) - count += 1 - print(f" Hooks attached to {count} MoE layers") - - def _make_hook(self, layer_idx): - def hook(module, input, output): - topk_indices, topk_weights = output - idx = topk_indices.detach().cpu().numpy() # [T, 6] - wgt = topk_weights.detach().float().cpu().numpy() # [T, 6] - T = idx.shape[0] - self.total_tokens[layer_idx] += T - np.add.at(self.activation_counts[layer_idx], idx.flatten(), 1) - np.add.at(self.activation_weights[layer_idx], idx.flatten(), wgt.flatten()) - return hook - - def remove_hooks(self): - for h in self._hooks: - h.remove() - self._hooks.clear() - - def get_stats(self) -> dict: - stats = {} - for layer_idx in sorted(self.activation_counts): - counts = self.activation_counts[layer_idx] - weights = self.activation_weights[layer_idx] - total = self.total_tokens[layer_idx] - freq = counts / (total + 1e-9) - avg_w = np.where(counts > 0, weights / counts, 0.0) - importance = freq * avg_w - stats[layer_idx] = { - "total_tokens": int(total), - "activation_counts": counts.tolist(), - "activation_frequency": freq.tolist(), - "avg_weight": avg_w.tolist(), - "importance_score": importance.tolist(), - "never_activated": int((counts == 0).sum()), - } - return stats - - def print_summary(self, stats, keep_ratio): - keep_n = max(1, int(self.n_experts * keep_ratio)) - print(f"\n{'='*70}") - print(f" PROFILING SUMMARY | keep_ratio={keep_ratio:.0%} | keeping {keep_n}/128 experts/layer") - print(f"{'='*70}") - for li, s in stats.items(): - imp = np.array(s['importance_score']) - threshold = np.sort(imp)[self.n_experts - keep_n] - print( - f" Layer {li:3d}: " - f"never_activated={s['never_activated']:3d}/128 " - f"top_freq={max(s['activation_frequency']):.3f} " - f"threshold={threshold:.4f}" - ) - total_moe = len(stats) - print(f"\n MoE layers : {total_moe}") - print(f" Kept : {total_moe * keep_n} experts total") - print(f" Pruned : {total_moe * (self.n_experts - keep_n)} experts total") - print(f"{'='*70}\n") - - -# ── Phase 1: Profile ────────────────────────────────────────────────────────── - -def cmd_profile(args): - # Mamba2 layers use Triton kernels — CUDA required. - # 4-bit NF4 fits in 16GB VRAM (~15GB). We must keep ALL layers on GPU - # (no CPU spillover) otherwise PCIe transfers make inference unusably slow. - print(f"\n[Phase 1] Profiling — 4-bit NF4, GPU only") - print(f" Model : {args.model}") - print(f" Prompts: {args.prompts}") - - bnb_config = BitsAndBytesConfig( - load_in_4bit=True, - bnb_4bit_quant_type="nf4", - bnb_4bit_compute_dtype=torch.bfloat16, - bnb_4bit_use_double_quant=True, - ) - - tokenizer = AutoTokenizer.from_pretrained(args.model, trust_remote_code=True) - print(" Loading model in 4-bit NF4...") - model = AutoModelForCausalLM.from_pretrained( - args.model, - trust_remote_code=True, - quantization_config=bnb_config, - device_map={"": 0}, # force ALL layers onto GPU 0, no CPU spillover - ) - model.eval() - print(" Model loaded on GPU.") - - # Load prompt+response pairs - pairs = [] - with open(args.prompts) as f: - for line in f: - line = line.strip() - if not line: - continue - obj = json.loads(line) - text = obj.get("prompt", "") + "\n" + obj.get("response", "") - pairs.append(text) - print(f" Loaded {len(pairs)} prompt+response pairs") - - tracker = ExpertActivationTracker(n_experts=128) - tracker.register_hooks(model) - - with torch.no_grad(): - for i, text in enumerate(pairs): - if i % 100 == 0: - print(f" [{i+1}/{len(pairs)}] processing...") - inputs = tokenizer( - text, - return_tensors="pt", - truncation=True, - max_length=args.max_length, - ).to("cuda") - try: - model(**inputs) - except torch.cuda.OutOfMemoryError: - print(f" [{i+1}] OOM — skipping") - torch.cuda.empty_cache() - - tracker.remove_hooks() - stats = tracker.get_stats() - tracker.print_summary(stats, keep_ratio=args.keep_ratio) - - stats_out = {str(k): v for k, v in stats.items()} - with open(args.output, "w") as f: - json.dump(stats_out, f, indent=2) - print(f" Stats saved → {args.output}") - print(f"\n[Phase 1] Done. Run 'prune' next (CPU only).") - - -# ── Phase 2: Prune ──────────────────────────────────────────────────────────── - -def cmd_prune(args): - print(f"\n[Phase 2] Pruning — bf16 on CPU") - print(f" Model : {args.model}") - print(f" Stats : {args.stats}") - print(f" Keep ratio : {args.keep_ratio:.0%}") - print(f" Output : {args.output}") - - with open(args.stats) as f: - stats = {int(k): v for k, v in json.load(f).items()} - - tokenizer = AutoTokenizer.from_pretrained(args.model, trust_remote_code=True) - - print(" Loading model in bf16 on CPU — this takes a few minutes...") - model = AutoModelForCausalLM.from_pretrained( - args.model, - trust_remote_code=True, - dtype=torch.bfloat16, - device_map="cpu", - ) - - keep_n = max(1, int(128 * args.keep_ratio)) - print(f"\n Pruning to top-{keep_n} experts per MoE layer...\n") - - for layer_idx, block in enumerate(model.backbone.layers): - if block.block_type != "moe": - continue - - if layer_idx not in stats: - print(f" Layer {layer_idx:3d}: no profiling data — skipping") - continue - - # Use REAP score if available (from llama.cpp profiler), else fall back to legacy importance_score - layer_stats = stats[layer_idx] - if "reap" in layer_stats: - importance = np.array(layer_stats["reap"]) - else: - importance = np.array(layer_stats["importance_score"]) - keep_sorted = sorted(np.argsort(importance)[-keep_n:].tolist()) - prune_count = 128 - len(keep_sorted) - - # Prune expert list - block.mixer.experts = torch.nn.ModuleList( - [block.mixer.experts[i] for i in keep_sorted] - ) - - # Prune router weights to match new expert indices - keep_t = torch.tensor(keep_sorted, dtype=torch.long) - block.mixer.gate.weight = torch.nn.Parameter( - block.mixer.gate.weight.data[keep_t].clone() - ) - old_bias = block.mixer.gate.e_score_correction_bias.data[keep_t].clone() - block.mixer.gate.register_buffer("e_score_correction_bias", old_bias) - block.mixer.gate.n_routed_experts = keep_n - - never = stats[layer_idx]["never_activated"] - print(f" Layer {layer_idx:3d}: kept {keep_n}, pruned {prune_count} (was {never} never-activated)") - - # Patch top-level config - model.config.n_routed_experts = keep_n - - # Fix transformers 5.x incompatibility: _tied_weights_keys must be a list of dicts, - # but the custom NemotronH modeling code sets it as a plain list of strings. - # _get_tied_weight_keys() calls .keys() on each element → AttributeError. - # Clear it — lm_head weight tying is not needed for inference on the pruned model. - for mod in model.modules(): - if isinstance(getattr(mod, '_tied_weights_keys', None), list): - mod._tied_weights_keys = None - - # Disable torch.compile / inductor before saving — transformers 5.x can trigger - # torch._inductor.compile_worker during save_pretrained, causing an indefinite hang. - import os - os.environ["TORCH_COMPILE_DISABLE"] = "1" - torch._dynamo.reset() - - print(f"\n Saving pruned model → {args.output}") - with torch.no_grad(): - model.save_pretrained(args.output, safe_serialization=True) - tokenizer.save_pretrained(args.output) - print(f"\n[Phase 2] Done.") - print(f" Experts per MoE layer : {keep_n}/128") - print(f" Next: fine-tune with Unsloth from {args.output}") - - -# ── Entry point ─────────────────────────────────────────────────────────────── - -def main(): - parser = argparse.ArgumentParser(description="NemotronH Expert Pruner (REAP-style)") - sub = parser.add_subparsers(dest="cmd", required=True) - - p1 = sub.add_parser("profile", help="Phase 1: profile expert activations (GPU, 4-bit)") - p1.add_argument("--model", default="unsloth/Nemotron-3-Nano-30B-A3B") - p1.add_argument("--prompts", required=True) - p1.add_argument("--output", default="expert_stats.json") - p1.add_argument("--keep_ratio", type=float, default=0.20, - help="Preview ratio for summary only — does not affect saved stats") - p1.add_argument("--max_length", type=int, default=2048) - - p2 = sub.add_parser("prune", help="Phase 2: prune model using saved stats (CPU, bf16)") - p2.add_argument("--model", default="unsloth/Nemotron-3-Nano-30B-A3B") - p2.add_argument("--stats", default="expert_stats.json") - p2.add_argument("--keep_ratio", type=float, default=0.20) - p2.add_argument("--output", default="./nemotron-pruned") - - args = parser.parse_args() - if args.cmd == "profile": - cmd_profile(args) - elif args.cmd == "prune": - cmd_prune(args) - - -if __name__ == "__main__": - main()