From 84cab59ec611f28ad413a05dd8776c74bd9635c1 Mon Sep 17 00:00:00 2001 From: Salvatore Rossitto Date: Wed, 11 Mar 2026 14:50:05 +0100 Subject: [PATCH 01/15] added qlora finetuning --- common/arg.cpp | 90 +- common/common.h | 22 +- examples/CMakeLists.txt | 1 + examples/qlora_training/CMakeLists.txt | 5 + examples/qlora_training/README.md | 331 +++++ examples/qlora_training/check_lora_norms.py | 64 + examples/qlora_training/finetune_qlora.cpp | 1217 +++++++++++++++++ examples/qlora_training/grpo_example.py | 397 ++++++ examples/qlora_training/sample_data.jsonl | 7 + .../qlora_training/sample_rwsft_data.jsonl | 6 + ggml/include/ggml-opt.h | 1 + ggml/include/ggml.h | 16 + ggml/src/ggml-cpu/ops.cpp | 9 +- ggml/src/ggml-cuda/ggml-cuda.cu | 9 + ggml/src/ggml-cuda/opt-step-adamw.cu | 5 +- ggml/src/ggml-cuda/out-prod.cu | 114 ++ ggml/src/ggml-cuda/out-prod.cuh | 7 + ggml/src/ggml-opt.cpp | 16 +- ggml/src/ggml.c | 102 +- 19 files changed, 2403 insertions(+), 16 deletions(-) create mode 100644 examples/qlora_training/CMakeLists.txt create mode 100644 examples/qlora_training/README.md create mode 100644 examples/qlora_training/check_lora_norms.py create mode 100644 examples/qlora_training/finetune_qlora.cpp create mode 100644 examples/qlora_training/grpo_example.py create mode 100644 examples/qlora_training/sample_data.jsonl create mode 100644 examples/qlora_training/sample_rwsft_data.jsonl diff --git a/common/arg.cpp b/common/arg.cpp index 41da8563d6..44d155971b 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -3601,32 +3601,108 @@ common_params_context common_params_parser_init(common_params & params, llama_ex { "-lr", "--learning-rate" }, "ALPHA", string_format("adamw or sgd optimizer alpha (default: %.2g); note: sgd alpha recommended ~10x (no momentum)", (double) params.lr.lr0), [](common_params & params, const std::string & value) { params.lr.lr0 = std::stof(value); } - ).set_examples({ LLAMA_EXAMPLE_FINETUNE })); + ).set_examples({ LLAMA_EXAMPLE_FINETUNE, 
LLAMA_EXAMPLE_FINETUNE_QLORA })); add_opt(common_arg({ "-lr-min", "--learning-rate-min" }, "ALPHA", string_format("(if >0) final learning rate after decay (if -decay-epochs is set, default=%.2g)", (double) params.lr.lr_min), [](common_params & params, const std::string & value) { params.lr.lr_min = std::stof(value); } - ).set_examples({ LLAMA_EXAMPLE_FINETUNE })); + ).set_examples({ LLAMA_EXAMPLE_FINETUNE, LLAMA_EXAMPLE_FINETUNE_QLORA })); add_opt(common_arg( {"-decay-epochs", "--learning-rate-decay-epochs"}, "ALPHA", string_format("(if >0) decay learning rate to -lr-min after this many epochs (exponential decay, default=%.2g)", (double) params.lr.decay_epochs), [](common_params & params, const std::string & value) { params.lr.decay_epochs = std::stof(value); } - ).set_examples({ LLAMA_EXAMPLE_FINETUNE })); + ).set_examples({ LLAMA_EXAMPLE_FINETUNE, LLAMA_EXAMPLE_FINETUNE_QLORA })); add_opt(common_arg( {"-wd", "--weight-decay"}, "WD", string_format("adamw or sgd optimizer weight decay (0 is off; recommend very small e.g. 
1e-9) (default: %.2g).", (double) params.lr.wd), [](common_params & params, const std::string & value) { params.lr.wd = std::stof(value); } - ).set_examples({ LLAMA_EXAMPLE_FINETUNE })); + ).set_examples({ LLAMA_EXAMPLE_FINETUNE, LLAMA_EXAMPLE_FINETUNE_QLORA })); add_opt(common_arg( {"-val-split", "--val-split"}, "FRACTION", string_format("fraction of data to use as validation set for training (default: %.2g).", (double) params.val_split), [](common_params & params, const std::string & value) { params.val_split = std::stof(value); } - ).set_examples({ LLAMA_EXAMPLE_FINETUNE })); + ).set_examples({ LLAMA_EXAMPLE_FINETUNE, LLAMA_EXAMPLE_FINETUNE_QLORA })); + // qlora flags + add_opt(common_arg( + {"--lora-rank"}, "N", + string_format("LoRA rank r (default: %d)", params.lora_rank), + [](common_params & params, int value) { params.lora_rank = value; } + ).set_examples({ LLAMA_EXAMPLE_FINETUNE_QLORA })); + add_opt(common_arg( + {"--lora-alpha"}, "F", + string_format("LoRA alpha (default: %d = use rank value)", (int) params.lora_alpha), + [](common_params & params, const std::string & value) { params.lora_alpha = std::stof(value); } + ).set_examples({ LLAMA_EXAMPLE_FINETUNE_QLORA })); + add_opt(common_arg( + {"--lora-targets"}, "SUBSTRINGS", + string_format("comma-separated substrings of tensor names to add LoRA to (default: %s)", params.lora_targets.c_str()), + [](common_params & params, const std::string & value) { params.lora_targets = value; } + ).set_examples({ LLAMA_EXAMPLE_FINETUNE_QLORA })); + add_opt(common_arg( + {"--lora-out"}, "FNAME", + string_format("output LoRA adapter GGUF path (default: %s)", params.lora_out.c_str()), + [](common_params & params, const std::string & value) { params.lora_out = value; } + ).set_examples({ LLAMA_EXAMPLE_FINETUNE_QLORA })); + add_opt(common_arg( + {"--train-file"}, "FNAME", + "JSONL training dataset (fields: messages|prompt+response|text)", + [](common_params & params, const std::string & value) { params.train_file = value; 
} + ).set_examples({ LLAMA_EXAMPLE_FINETUNE_QLORA })); + add_opt(common_arg( + {"--save-every"}, "N", + "save adapter checkpoint every N dataset windows during training (default: 0 = only at end)", + [](common_params & params, int value) { params.save_every = value; } + ).set_examples({ LLAMA_EXAMPLE_FINETUNE_QLORA })); + add_opt(common_arg( + {"--freeze-layers"}, "N", + "freeze first N transformer layers — no LoRA adapters allocated for blk.0..blk.N-1 (default: 0 = train all layers)", + [](common_params & params, int value) { params.lora_freeze_layers = value; } + ).set_examples({ LLAMA_EXAMPLE_FINETUNE_QLORA })); + add_opt(common_arg( + {"--grad-checkpoint"}, "N", + "gradient checkpointing interval to reduce peak activation VRAM (0 = disabled, default: 0)", + [](common_params & params, int value) { params.grad_checkpoint_interval = value; } + ).set_examples({ LLAMA_EXAMPLE_FINETUNE_QLORA })); + add_opt(common_arg( + {"--train-on-prompt"}, + "compute loss on prompt tokens too, not just the response (default: response-only loss)", + [](common_params & params) { params.train_on_prompt = true; } + ).set_examples({ LLAMA_EXAMPLE_FINETUNE_QLORA })); + add_opt(common_arg( + {"--shuffle-dataset"}, + "shuffle dataset windows at the start of each epoch (default: sequential order)", + [](common_params & params) { params.shuffle_dataset = true; } + ).set_examples({ LLAMA_EXAMPLE_FINETUNE_QLORA })); + add_opt(common_arg( + {"--grpo-mode"}, + "enable GRPO IPC training loop (prompts and rewards supplied via stdin/stdout)", + [](common_params & params) { params.grpo_mode = true; } + ).set_examples({ LLAMA_EXAMPLE_FINETUNE_QLORA })); + add_opt(common_arg( + {"--n-gen"}, "N", + string_format("GRPO: number of generations per prompt (default: %d)", params.grpo_n_gen), + [](common_params & params, int value) { params.grpo_n_gen = value; } + ).set_examples({ LLAMA_EXAMPLE_FINETUNE_QLORA })); + add_opt(common_arg( + {"--n-steps"}, "N", + string_format("GRPO: total optimizer steps 
(default: %d)", params.grpo_n_steps), + [](common_params & params, int value) { params.grpo_n_steps = value; } + ).set_examples({ LLAMA_EXAMPLE_FINETUNE_QLORA })); + add_opt(common_arg( + {"--grpo-temp"}, "F", + string_format("GRPO: sampling temperature for rollout generation (default: %.2f)", (double) params.grpo_temperature), + [](common_params & params, const std::string & value) { params.grpo_temperature = std::stof(value); } + ).set_examples({ LLAMA_EXAMPLE_FINETUNE_QLORA })); + add_opt(common_arg( + {"--grpo-max-tokens"}, "N", + string_format("GRPO: max tokens per generation (default: %d)", params.grpo_max_tokens), + [](common_params & params, int value) { params.grpo_max_tokens = value; } + ).set_examples({ LLAMA_EXAMPLE_FINETUNE_QLORA })); add_opt(common_arg( {"-epochs", "--epochs"}, "N", string_format("optimizer max # of epochs (default: %d)", params.lr.epochs), [](common_params & params, int epochs) { params.lr.epochs = epochs; } - ).set_examples({ LLAMA_EXAMPLE_FINETUNE })); + ).set_examples({ LLAMA_EXAMPLE_FINETUNE, LLAMA_EXAMPLE_FINETUNE_QLORA })); add_opt(common_arg( {"-opt", "--optimizer"}, "sgd|adamw", "adamw or sgd", [](common_params & params, const std::string & name) { @@ -3635,7 +3711,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex throw std::invalid_argument("invalid --optimizer, valid options: adamw, sgd"); } } - ).set_examples({ LLAMA_EXAMPLE_FINETUNE })); + ).set_examples({ LLAMA_EXAMPLE_FINETUNE, LLAMA_EXAMPLE_FINETUNE_QLORA })); add_opt(common_arg( {"--check"}, string_format("check rather than generate results (default: %s)", params.check ? 
"true" : "false"), diff --git a/common/common.h b/common/common.h index ffaeefd7c9..0655527ad0 100644 --- a/common/common.h +++ b/common/common.h @@ -103,6 +103,7 @@ enum llama_example { LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_DIFFUSION, LLAMA_EXAMPLE_FINETUNE, + LLAMA_EXAMPLE_FINETUNE_QLORA, LLAMA_EXAMPLE_FIT_PARAMS, LLAMA_EXAMPLE_RESULTS, @@ -517,7 +518,26 @@ struct common_params { // finetune struct lr_opt lr; enum ggml_opt_optimizer_type optimizer = GGML_OPT_OPTIMIZER_TYPE_ADAMW; - float val_split = 0.05f; // fraction of the data used for the validation set + float val_split = 0.05f; // fraction of the data used for the validation set + + // qlora fine-tuning + int32_t lora_rank = 16; // LoRA rank (r) + float lora_alpha = 0.0f; // LoRA alpha (0 = use rank value) + std::string lora_targets = "attn_q,attn_output,ffn_gate,ffn_up,ffn_down"; // comma-separated substrings to match trainable tensors + std::string lora_out = "adapter.gguf"; // output adapter GGUF path + std::string train_file = ""; // JSONL training dataset path + int32_t save_every = 0; // save checkpoint every N optimizer steps (0 = disabled) + int32_t lora_freeze_layers = 0; // do not apply LoRA to the first N transformer layers + int32_t grad_checkpoint_interval = 0; // gradient checkpointing interval to reduce peak VRAM (0 = disabled) + bool train_on_prompt = false; // include prompt tokens in training loss (default: response tokens only) + bool shuffle_dataset = false; // shuffle dataset windows at the start of each epoch + + // grpo training + bool grpo_mode = false; // enable GRPO IPC training loop + int32_t grpo_n_gen = 8; // generations per prompt + int32_t grpo_n_steps = 500; // total GRPO optimizer steps + float grpo_temperature = 0.8f; // sampling temperature for rollouts + int32_t grpo_max_tokens = 512; // max tokens per generation // embedding bool embedding = false; // get only sentence embedding diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index a29dc707c3..93f72fc321 100644 
--- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -34,6 +34,7 @@ else() add_subdirectory(speculative-simple) add_subdirectory(gen-docs) add_subdirectory(training) + add_subdirectory(qlora_training) add_subdirectory(diffusion) if (NOT GGML_BACKEND_DL) add_subdirectory(convert-llama2c-to-ggml) diff --git a/examples/qlora_training/CMakeLists.txt b/examples/qlora_training/CMakeLists.txt new file mode 100644 index 0000000000..5ba9ec8afa --- /dev/null +++ b/examples/qlora_training/CMakeLists.txt @@ -0,0 +1,5 @@ +set(TARGET llama-finetune-qlora) +add_executable(${TARGET} finetune_qlora.cpp) +install(TARGETS ${TARGET} RUNTIME) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/qlora_training/README.md b/examples/qlora_training/README.md new file mode 100644 index 0000000000..bc3b6fcca9 --- /dev/null +++ b/examples/qlora_training/README.md @@ -0,0 +1,331 @@ +# llama.cpp — Native QLoRA Training + +Native QLoRA + Reward-Weighted SFT training pipeline for quantized GGUF models. + +The base model weights remain **frozen** (quantized tensors are skipped by `llama_set_param` because they are not `GGML_TYPE_F32`). Only freshly-allocated F32 LoRA A/B tensors are trained. The saved adapter GGUF is directly compatible with the existing `llama_adapter_lora_init` loader and `llama-export-lora` merge tool. + +**Status:** Working. Phase 1 (QLoRA SFT) and Phase 2 (Reward-Weighted SFT) are implemented and functional. Training speed is currently limited by full backprop through quantized weights — see [Known Limitations](#known-limitations). 
+ +--- + +## Build + +```bash +cd /mnt/w/llm-trading-arena/unsloth-api/llama.cpp + +# First time (CUDA build): +cmake -B build -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=ON -DLLAMA_CURL=OFF +cmake --build build -j$(nproc) + +# Incremental rebuild (after code changes): +cmake --build build --target llama-finetune-qlora -j$(nproc) +# If llama-adapter.cpp or llama-context.cpp changed, rebuild all: +cmake --build build -j$(nproc) +``` + +--- + +## Phase 1 — QLoRA SFT (`llama-finetune-qlora`) + +Trains LoRA adapters on a quantized GGUF model. + +### Recommended command (1.7B model, 16 GB card) + +```bash +./build/bin/llama-finetune-qlora \ + --model ~/qwen3-1.7b-q4_k_m.gguf \ + --train-file data/train.jsonl \ + --lora-rank 16 --lora-alpha 16 \ + -c 4096 -b 4096 -ub 512 \ + --save-every 10 \ + --lora-out ~/adapter.gguf \ + --epochs 3 --seed 42 +``` + +### Recommended command (15B model, 16 GB card, partial offload) + +```bash +./build/bin/llama-finetune-qlora \ + --model ~/nemotron-15b-q4_k_m.gguf \ + --train-file data/train.jsonl \ + --lora-rank 16 --lora-alpha 16 \ + -ngl 13 -c 14336 -b 14336 -ub 1024 \ + --save-every 8 \ + --lora-out ~/nemotron-lora.gguf \ + --epochs 3 --seed 42 +``` + +### All flags + +| Flag | Default | Description | +|---|---|---| +| `--model` | *(required)* | Path to quantized GGUF model | +| `--train-file` | *(required)* | JSONL training dataset | +| `--lora-rank` | `16` | LoRA rank r | +| `--lora-alpha` | `0` (= rank) | LoRA alpha; effective scale = alpha/rank | +| `--lora-targets` | see below | Comma-separated internal tensor name substrings | +| `--lora-out` | `adapter.gguf` | Output adapter GGUF path (supports `~`) | +| `--save-every` | `0` | Save checkpoint every N dataset windows (0 = end only) | +| `--freeze-layers` | `0` | Skip LoRA on first N transformer layers (blk.0..N-1); backward already pruned automatically | +| `--grad-checkpoint` | `0` | Mark every Nth forward node persistent to reduce activation VRAM; good values: 32–64 | +| 
`--train-on-prompt` | off | Compute loss on prompt tokens too (default: response-only loss) | +| `--shuffle-dataset` | off | Shuffle dataset windows at the start of each epoch | +| `--val-split` | `0.0` | Fraction of data to hold out for validation (e.g. `0.1` = 10%); val loss logged per epoch | +| `-epochs` / `--epochs` | `3` | Training epochs | +| `-c` / `--ctx-size` | `512` | Training context window (tokens) | +| `-b` / `--batch-size` | `2048` | Tokens per `llama_decode` call; set equal to `-c` | +| `-ub` / `--ubatch-size` | `512` | GPU micro-batch tokens; controls VRAM vs. step time | +| `-ngl` | `999` | GPU layers to offload | +| `-lr` / `--learning-rate` | `1e-4` | AdamW learning rate | +| `--seed` | `42` | Random seed for LoRA init | + +### VRAM vs. step-time tradeoff + +Step time and VRAM both scale linearly with `-ub`: + +| Model | `-ub` | VRAM | Step time (approx) | +|---|---|---|---| +| 1.7B Q4_K_M | 512 | ~18 GB | ~120 s (OOM on 16 GB) | +| 1.7B Q4_K_M | 128 | ~6 GB | ~30 s | +| 15B Q4_K_M | 1024 | ~11 GB | ~60 s | + +Use `-c` equal to your target sequence length. More context = more windows per sample = more steps per epoch. Reducing `-c` reduces total training time proportionally. 
+ +### Default LoRA targets + +llama.cpp uses **internal GGUF tensor names**, not HuggingFace names: + +| llama.cpp internal | HuggingFace equivalent | Status | +|---|---|---| +| `attn_q` | `q_proj` | ✅ default target, trainable | +| `attn_output` | `o_proj` | ✅ default target, trainable | +| `ffn_gate` | `gate_proj` | ✅ default target, trainable | +| `ffn_up` | `up_proj` | ✅ default target, trainable | +| `ffn_down` | `down_proj` | ✅ default target, trainable | +| `attn_k` | `k_proj` | ❌ not in defaults — zero gradient (KV scatter via SET_ROWS) | +| `attn_v` | `v_proj` | ❌ not in defaults — zero gradient (KV scatter via SET_ROWS) | +| `ssm_in` | `in_proj` | ❌ not in defaults — zero gradient (SSM_SCAN no backward) | +| `ssm_out` | `out_proj` | ❌ not in defaults — zero gradient (SSM_SCAN no backward) | + +**MoE models:** Expert tensors (`*_exps`) are excluded regardless of `--lora-targets`. The quantized expert weights are frozen (stop-gradient), but LoRA on the dense FFN layers (`ffn_gate`, `ffn_up`, `ffn_down`) works — backward via `MUL_MAT_ID` + `OUT_PROD_ID`. + +### Dataset format (JSONL) + +**Chat format** (loss on response only; use `--train-on-prompt` for all tokens): +```json +{"messages": [{"role": "user", "content": "Hello"}, {"role": "assistant", "content": "Hi!"}]} +``` + +**Prompt/response** (loss on response only): +```json +{"prompt": "What is the capital of France?", "response": "Paris."} +``` + +**Plain text** (loss on all tokens): +```json +{"text": "The quick brown fox."} +``` + +**With reward** (Phase 2 — scales gradient by reward): +```json +{"prompt": "...", "response": "...", "reward": 0.85} +``` + +Rewards are normalized per epoch: clipped to `[-1, 1]`, then min-max scaled to `[0, 1]`. Reward 0 = sample ignored; reward 1 = full gradient. 
+ +### Verify and use the adapter + +```bash +# Hot-load for inference (no merge needed) +./build/bin/llama-cli --model base.gguf --lora adapter.gguf -p "Hello" + +# Merge into base model +./build/bin/llama-export-lora \ + --model base.gguf --lora adapter.gguf --output merged.gguf +``` + +--- + +## Phase 2 — Reward-Weighted SFT + +Built into `llama-finetune-qlora`. When the dataset contains a `reward` or `score` field, the cross-entropy loss for that sample is scaled by the reward before backprop. No extra flags needed — detection is automatic. + +--- + +## Phase 3 — GRPO (Online RL via IPC) + +`llama-finetune-qlora --grpo-mode` implements a full GRPO training loop where the Python process owns prompt sampling and reward scoring, and the C++ process owns model state, generation, and gradient updates. + +### Quick start + +```bash +python3 examples/qlora_training/grpo_example.py \ + --model ~/qwen3-1.7b-q4_k_m.gguf \ + --lora-out ~/grpo-adapter.gguf \ + --rank 16 --n-steps 200 --n-gen 8 +``` + +For verbose output (includes IPC message trace): + +```bash +python3 examples/qlora_training/grpo_example.py \ + --model ~/qwen3-1.7b-q4_k_m.gguf \ + --lora-out ~/grpo-adapter.gguf \ + --verbose +``` + +Resume from a checkpoint: + +```bash +python3 examples/qlora_training/grpo_example.py \ + --model ~/qwen3-1.7b-q4_k_m.gguf \ + --lora ~/grpo-adapter.ckpt50.gguf \ + --lora-out ~/grpo-adapter.gguf +``` + +### GRPO-specific flags + +| Flag | Default | Description | +|---|---|---| +| `--grpo-mode` | off | Enable GRPO IPC mode | +| `--n-gen` | `8` | Rollouts per prompt | +| `--n-steps` | `500` | Total GRPO steps | +| `--grpo-temp` | `0.8` | Sampling temperature for rollouts | +| `--grpo-max-tokens` | `512` | Max tokens per generation | + +All standard flags (`--lora-rank`, `-lr`, `-c`, `-ngl`, `--save-every`, etc.) work in GRPO mode too. `--train-file` is **not** required in GRPO mode. 
+ +### IPC protocol + +The protocol is line-based over stdout (C++ → Python) and stdin (Python → C++). All non-protocol C++ output (timing, debug, model logs) goes to **stderr** and never contaminates the protocol channel. + +**C++ → Python (stdout):** + +| Line | When | +|---|---| +| `[QLORA:READY]` | Process initialised, model loaded | +| `[QLORA:PROMPT_REQ:]` | C++ requests the prompt for step N | +| `[QLORA:GEN:/] ` | One generation (newlines escaped as `\n`) | +| `[QLORA:REWARD_REQ:]` | C++ requests N reward scores | +| `[QLORA:PROGRESS] step=X/Y loss=Z epoch=A/B` | After each weight update | +| `[QLORA:CHECKPOINT] ` | After saving a checkpoint | +| `[QLORA:DONE] final_loss=X` | Training complete | +| `[QLORA:ERROR] ` | Fatal error | + +**Python → C++ (stdin):** + +| Line | Meaning | +|---|---| +| `PROMPT ` | Send prompt for the most recent `PROMPT_REQ` | +| `REWARD ` | Send N advantage scores in `[0, 1]` range | +| `STOP` | Request graceful shutdown after current step | + +**Text encoding:** newlines in generation text are escaped as the two-character sequence `\n`; backslashes are doubled. Use `unescape()` from `grpo_example.py` (or any equivalent) to recover the original text. + +### Writing your own driver + +`grpo_example.py` contains two functions you replace with your own logic: + +```python +def get_prompt(step: int) -> str: + """Return the training prompt for step N.""" + ... + +def score_generations(prompt: str, generations: List[str]) -> List[float]: + """Score each generation. Any numeric range — will be normalised.""" + ... +``` + +The IPC helpers (`escape`, `unescape`, `parse_ipc`, `read_ipc`, `write_cmd`, `wait_for`, `normalise_rewards`) are standalone and have no external dependencies — copy them into your own project if needed. 
+ +### Training loop diagram + +``` +Python C++ (llama-finetune-qlora --grpo-mode) + │ │ + │◄──── [QLORA:READY] ────────────┤ model loaded + │ │ + │ ┌─────────────────────────────┤ + │ │ for each step: │ + │ │ ◄── PROMPT_REQ:N ─────────┤ + │ │ ──► PROMPT ────────► generate n_gen rollouts + │ │ ◄── GEN:1/n ──┤ + │ │ ◄── GEN:2/n ──┤ + │ │ ... │ + │ │ ◄── GEN:n/n ──┤ + │ │ ◄── REWARD_REQ:n ─────────┤ + │ │ (score generations) │ + │ │ ──► REWARD a1 a2 … an ────► one backward + AdamW step + │ │ ◄── PROGRESS step=N/M … ──┤ + │ └─────────────────────────────┤ + │ │ + │◄──── [QLORA:DONE] ─────────────┤ adapter saved +``` + +--- + +## Known Limitations & Optimization Roadmap + +### Current limitations + +**1. Full backprop through frozen quantized layers** +Every backward step dequantizes all frozen Q4_K_M weight tensors to compute activation gradients (needed to propagate loss from the output back to each LoRA layer). For a 28-layer 1.7B model at `-ub 512`, this is ~280 dequantizing matmuls per step → step time is 3–5× slower than inference. + +**2. Activation VRAM** *(partially addressed by `--grad-checkpoint`)* +All forward activations are kept in VRAM throughout the backward pass. VRAM ≈ `model + KV + n_layers × hidden × n_ubatch × 10 × 4B + 2 × lora_params × 4B`. Reducing `-ub` reduces VRAM linearly. Use `--grad-checkpoint 48` to prevent the allocator from reusing intermediate activation buffers during backward, which cuts peak activation VRAM at near-zero compute cost. + +**3. Full backprop through all layers** *(partially addressed by `--freeze-layers`)* +Gradients propagate through all layers that have LoRA adapters. Use `--freeze-layers N` to skip LoRA allocation for blk.0..N-1 — those layers receive no gradient (the `grads_needed` pruner already skips their backward ops automatically). Only the top (total_layers - N) layers are trained. 
+ +### Optimization roadmap + +| Priority | Optimization | Expected gain | Status | +|---|---|---|---| +| ✅ Done | **`--freeze-layers N`** — no LoRA on first N layers; backward auto-pruned | Proportional to N/total | Implemented | +| ✅ Done | **`--grad-checkpoint N`** — keep every Nth activation alive through backward | Reduces peak activation VRAM | Implemented | +| ✅ Done | **`--train-on-prompt`** — compute loss on prompt tokens too | Configurable loss target | Implemented | +| ✅ Done | **`--shuffle-dataset`** — shuffle windows each epoch | Better convergence | Implemented | +| ✅ Done | **BOS separators** — insert BOS between concatenated samples | Correct cross-sample boundaries | Implemented | +| ✅ Done | **Per-epoch loss summary** — log train/val loss after each epoch | Observability | Implemented | +| ✅ Done | **`MUL_MAT_ID` backward** — LoRA on MoE dense FFN layers; `OUT_PROD_ID` for scattered outer product | Unlocks Mixtral/Nemotron-MoE | Implemented | +| ✅ Done | **Quantized `OUT_PROD`** — dequantize on GPU + cuBLAS for backward matmul | Full GPU training (no CPU fallback) | Implemented | +| ✅ Done | **Reuse `ctx_compute_opt`** — allocate tensor metadata context once, `ggml_reset()` across ubatches | Eliminate ~0.5 s/step overhead | Implemented | +| ❌ Skip | **Static training graphs** — KV mask shape changes per ubatch (`n_kv` grows); graph topology not static | Would need KV cache redesign | Not feasible | +| Low | **`SSM_SCAN/CONV` backward** — enable LoRA on Mamba SSM layers | Unlocks NemotronH SSM layers | Planned | +| Low | **GELU backward** — implement `ggml_gelu_back` kernel (UNARY + GLU) | Support GPT-2/Phi-style models | Planned (needs new CUDA/CPU kernels) | + +--- + +## Implementation notes (for developers) + +### Modified llama.cpp files + +| File | Change | +|---|---| +| `ggml/src/ggml.c` | Backward graph fixes: `GET_ROWS` 3D, `SET_ROWS`, `MUL_MAT_ID`, `SSM_SCAN/CONV`, `FLASH_ATTN_EXT` all stop gradient; inplace-op assert → warn+skip | +| 
`src/llama-context.cpp` | `opt_init`: scheduler and graph sized with inflated capacity before `ggml_opt_init`; `opt_epoch_iter`: per-ubatch timing instrumentation; reward scaling via `g_reward_weights` TLS | +| `src/llama-adapter.cpp` | Repack-buft fallback for LoRA tensors: tries device-native buft before CPU | +| `common/common.h` | Added `save_every`, `lora_freeze_layers`, `grad_checkpoint_interval`, `train_on_prompt`, `shuffle_dataset` fields | +| `common/arg.cpp` | Added `--save-every`, `--freeze-layers`, `--grad-checkpoint`, `--train-on-prompt`, `--shuffle-dataset` arguments | +| `include/llama.h` | Added `llama_opt_set_reward_weights()`; `grad_checkpoint_interval` in `llama_opt_params`; `shuffle` param in `llama_opt_epoch` | +| `ggml/src/ggml-cuda/out-prod.cu` | `OUT_PROD` with quantized src0 (dequantize on GPU + cuBLAS); `OUT_PROD_ID` for MoE backward | +| `ggml/src/ggml-cuda/ggml-cuda.cu` | `supports_op` for quantized `OUT_PROD` and `OUT_PROD_ID`; CPU-resident ids fix in `mul_mat_id` | +| `ggml/include/ggml-opt.h` | Added `grad_checkpoint_interval` to `ggml_opt_params` | +| `ggml/src/ggml-opt.cpp` | Gradient checkpointing: marks every Nth forward node `GGML_TENSOR_FLAG_OUTPUT` before backward build | + +### Key invariants + +- `params.use_mmap = false` — forced; mmap'd tensors can't have data written back +- `params.flash_attn_type = DISABLED` — no backward impl for flash attention +- `params.warmup = false` — warmup runs inference with PARAM tensors → segfault +- `params.cache_type_k = F32` — training requires F32 KV (or BF16 with `--cache-type-k bf16`) +- LoRA A/B tensors are marked `PARAM` via `ggml_set_param` on the tensors loaded by `llama_adapter_lora_init`, not on the pre-init scratch tensors in `lt.buf` +- The adapter GGUF is pre-saved and loaded via `params.lora_adapters` BEFORE `common_init_from_params` so that `sched_reserve` includes LoRA graph nodes in its sizing + +### Why opt_init inflation matters + +`ggml_opt_init` captures `sched.get()` 
at construction time. The backward graph (`gb_grad`, `gb_opt`) is ~3–5× larger than the forward graph in node count. If the scheduler hash_set is sized only for the forward graph, `ggml_backend_sched_alloc_graph` on the backward graph will overflow it. We recreate `sched` with `inflated = fwd_nodes × 4` slots BEFORE calling `ggml_opt_init`. + +### Reward weighting implementation + +`llama_opt_set_reward_weights(weights, n)` sets thread-local `g_reward_weights`. In `opt_epoch`, each window reads `g_reward_weights[idata]` and passes it as `reward_scale` to `opt_epoch_iter`. Inside the iter loop, instead of writing `1.0f` for the correct token's label position in the cross-entropy label tensor, it writes `reward_scale`. Since cross-entropy loss = `-mean(label × log(softmax(logit)))`, scaling the label scales both loss and gradient identically. diff --git a/examples/qlora_training/check_lora_norms.py b/examples/qlora_training/check_lora_norms.py new file mode 100644 index 0000000000..8908f3ad99 --- /dev/null +++ b/examples/qlora_training/check_lora_norms.py @@ -0,0 +1,64 @@ +#!/usr/bin/env python3 +"""Quick check of LoRA tensor norms in a GGUF file.""" +import sys, struct, numpy as np + +def read_gguf(path): + with open(path, 'rb') as f: + assert f.read(4) == b'GGUF' + version = struct.unpack(' +#include +#include + +#define JSON_ASSERT GGML_ASSERT +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// --------------------------------------------------------------------------- +// Helpers +// --------------------------------------------------------------------------- + +// Expand a leading ~/ to the HOME directory (the shell doesn't do this for us +// when a path is passed as a string argument to std::ofstream). 
+static std::string expand_tilde(const std::string & path) { + if (path.size() >= 2 && path[0] == '~' && path[1] == '/') { + const char * home = getenv("HOME"); + if (!home) home = getenv("USERPROFILE"); // Windows fallback + if (home) return std::string(home) + path.substr(1); + } + return path; +} + +static std::vector split_csv(const std::string & s) { + std::vector out; + std::istringstream ss(s); + std::string tok; + while (std::getline(ss, tok, ',')) { + if (!tok.empty()) out.push_back(tok); + } + return out; +} + +// Tensors whose names contain these substrings use MUL_MAT_ID (sparse MoE expert dispatch) +// which has no backward implementation — exclude them from LoRA targets unconditionally. +static const std::vector EXCLUDED_SUBSTRINGS = { + "_exps", // MoE expert weight stacks (ffn_gate_exps, ffn_up_exps, ffn_down_exps, ffn_gate_up_exps) +}; + +static bool tensor_is_excluded(const char * name) { + const std::string n(name); + for (const auto & ex : EXCLUDED_SUBSTRINGS) { + if (n.find(ex) != std::string::npos) return true; + } + return false; +} + +// Extract the transformer block index from a tensor name of the form "blk.NN.". +// Returns -1 if the name does not follow this pattern. +static int tensor_layer_index(const char * name) { + // All per-layer tensors in llama.cpp GGUF are named "blk.." + const char * p = strstr(name, "blk."); + if (!p) return -1; + p += 4; // skip "blk." + char * end = nullptr; + long idx = strtol(p, &end, 10); + if (end == p || (*end != '.' 
&& *end != '\0')) return -1; + return (int) idx; +} + +static bool tensor_matches_targets(const char * name, const std::vector & targets, + int freeze_layers = 0) { + if (tensor_is_excluded(name)) return false; + if (freeze_layers > 0) { + const int layer = tensor_layer_index(name); + if (layer >= 0 && layer < freeze_layers) return false; + } + for (const auto & t : targets) { + if (std::string(name).find(t) != std::string::npos) return true; + } + return false; +} + +// --------------------------------------------------------------------------- +// JSONL dataset loading +// --------------------------------------------------------------------------- + +struct training_sample { + std::vector tokens; // full token sequence + std::vector is_label; // true for tokens that contribute to loss + float reward; // reward/score weight (1.0 = neutral, 0.0 = ignore) +}; + +// Apply a very simple ChatML fallback template when the model has no template. +static std::string apply_chatml(const std::vector & msgs) { + std::string out; + for (const auto & m : msgs) { + out += "<|im_start|>" + m.role + "\n"; + // content_parts is a vector; build a plain text string + std::string text; + if (!m.content_parts.empty()) { + for (const auto & p : m.content_parts) { + text += p.text; + } + } + out += text + "<|im_end|>\n"; + } + return out; +} + +static std::vector load_jsonl( + const std::string & path, + llama_context * ctx, + common_chat_templates * tmpls) { + + std::ifstream f(path); + if (!f.is_open()) { + LOG_ERR("%s: cannot open %s\n", __func__, path.c_str()); + return {}; + } + + std::vector samples; + std::string line; + int lineno = 0; + + while (std::getline(f, line)) { + ++lineno; + if (line.empty()) continue; + + nlohmann::json j; + try { j = nlohmann::json::parse(line); } + catch (...) 
{ + LOG_WRN("%s: skipping invalid JSON on line %d\n", __func__, lineno); + continue; + } + + float reward = 1.0f; + if (j.contains("reward")) reward = j["reward"].get(); + else if (j.contains("score")) reward = j["score"].get(); + + std::string prompt_text; + std::string response_text; + + if (j.contains("messages")) { + // chat format — apply template + std::vector msgs; + for (const auto & m : j["messages"]) { + common_chat_msg msg; + msg.role = m.value("role", "user"); + common_chat_msg_content_part part; + part.type = "text"; + part.text = m.value("content", ""); + msg.content_parts.push_back(part); + msgs.push_back(msg); + } + + // Skip samples where the last assistant turn contains an error marker. + // These are malformed/failed generations that should not be trained on. + { + std::string last_assistant_content; + for (int mi = (int)msgs.size() - 1; mi >= 0; --mi) { + if (msgs[mi].role == "assistant") { + last_assistant_content = msgs[mi].content_parts.empty() + ? "" : msgs[mi].content_parts[0].text; + break; + } + } + if (last_assistant_content.find("Error:") != std::string::npos || + last_assistant_content.find("error:") != std::string::npos) { + LOG_DBG("%s: skipping line %d — assistant response contains error marker\n", __func__, lineno); + continue; + } + } + + // Split into prompt (no loss) + last assistant response (loss). + // Render all messages except the last assistant turn as the prompt + // (with add_generation_prompt=true so the template adds the assistant + // prefix), then use the raw last assistant content as response_text. + // This ensures only the assistant's response tokens get loss, not the + // user turns or system prompt. 
+ if (msgs.empty()) continue; + std::string last_assistant_content; + std::vector prompt_msgs; + // Find the last assistant message + int last_asst_idx = -1; + for (int mi = (int)msgs.size() - 1; mi >= 0; --mi) { + if (msgs[mi].role == "assistant") { last_asst_idx = mi; break; } + } + if (last_asst_idx < 0) { + // No assistant turn — skip; nothing to train on + LOG_DBG("%s: skipping line %d — no assistant turn\n", __func__, lineno); + continue; + } + last_assistant_content = msgs[last_asst_idx].content_parts.empty() + ? "" : msgs[last_asst_idx].content_parts[0].text; + for (int mi = 0; mi < last_asst_idx; ++mi) prompt_msgs.push_back(msgs[mi]); + + if (tmpls) { + common_chat_templates_inputs inp; + inp.messages = prompt_msgs; + inp.add_generation_prompt = true; + prompt_text = common_chat_templates_apply(tmpls, inp).prompt; + response_text = last_assistant_content; + } else { + // Fallback: render everything as ChatML, use full text as response + std::vector all_msgs = prompt_msgs; + all_msgs.push_back(msgs[last_asst_idx]); + prompt_text = ""; + response_text = apply_chatml(all_msgs); + } + } else if (j.contains("prompt") && j.contains("response")) { + response_text = j["response"].get(); + if (response_text.find("Error:") != std::string::npos || + response_text.find("error:") != std::string::npos) { + LOG_DBG("%s: skipping line %d — response contains error marker\n", __func__, lineno); + continue; + } + prompt_text = j["prompt"].get(); + } else if (j.contains("text")) { + response_text = j["text"].get(); + } else { + LOG_WRN("%s: unknown format on line %d, skipping\n", __func__, lineno); + continue; + } + + // Tokenize: prompt (no loss) + response (loss) + auto tok_prompt = common_tokenize(ctx, prompt_text, /*add_special=*/true); + auto tok_response = common_tokenize(ctx, response_text, /*add_special=*/false); + + if (tok_prompt.empty() && tok_response.empty()) continue; + + training_sample s; + s.reward = reward; + s.tokens.insert(s.tokens.end(), 
tok_prompt.begin(), tok_prompt.end()); + s.tokens.insert(s.tokens.end(), tok_response.begin(), tok_response.end()); + s.is_label.resize(s.tokens.size(), false); + // Only response tokens contribute to the loss + for (size_t i = tok_prompt.size(); i < s.tokens.size(); ++i) { + s.is_label[i] = true; + } + samples.push_back(std::move(s)); + } + + LOG_INF("%s: loaded %zu samples from %s\n", __func__, samples.size(), path.c_str()); + return samples; +} + +// Pack variable-length samples into fixed-context-length windows and create +// an ggml_opt_dataset. Labels for prompt tokens are set to -1 (ignored by +// the loss in the epoch loop). +// window_rewards is filled with one reward weight per window (averaged over +// the sample tokens that fall in that window). If all samples have reward=1.0 +// the vector is all-ones and has no effect. +static ggml_opt_dataset_t build_dataset( + const std::vector & samples, + int32_t n_ctx, + std::vector & window_rewards, + bool train_on_prompt = false, + llama_token bos_token = -1) { + + // Flatten samples into token/label/reward streams + std::vector flat_tokens; + std::vector flat_labels; // -1 = no loss, token_id = loss target + std::vector flat_rewards; // per-token reward from the source sample + + for (size_t si = 0; si < samples.size(); ++si) { + const auto & s = samples[si]; + + // Insert BOS separator between samples to prevent cross-sample predictions. + // The first sample already has BOS from tokenization (add_special=true). 
+ if (si > 0 && bos_token >= 0 && !s.tokens.empty()) { + flat_tokens .push_back(bos_token); + flat_labels .push_back(-1); // no loss on separator + flat_rewards.push_back(s.reward); + } + + for (size_t i = 0; i + 1 < s.tokens.size(); ++i) { + flat_tokens .push_back(s.tokens[i]); + if (train_on_prompt) { + // All positions get correct next-token label (prompt + response) + flat_labels.push_back((int32_t)s.tokens[i + 1]); + } else { + // Only response positions get loss; prompt positions get -1 (sentinel). + // The sentinel is passed through to labels_sparse; opt_epoch_iter skips + // writing to the label tensor for those positions, leaving them zeroed → + // zero cross-entropy contribution. No gradient flows from prompt tokens. + flat_labels.push_back(s.is_label[i + 1] ? (int32_t)s.tokens[i + 1] : -1); + } + flat_rewards.push_back(s.reward); + } + } + + if ((int64_t)flat_tokens.size() < n_ctx) { + LOG_ERR("%s: dataset too small (%zu tokens) for context %d\n", + __func__, flat_tokens.size(), n_ctx); + return nullptr; + } + + const int64_t stride = n_ctx / 2; + int64_t ndata = ((int64_t)flat_tokens.size() - n_ctx) / stride; + if (ndata < 1) ndata = 1; // at least one window when flat_tokens >= n_ctx + + window_rewards.resize(ndata); + + ggml_opt_dataset_t dataset = ggml_opt_dataset_init( + GGML_TYPE_I32, GGML_TYPE_I32, n_ctx, n_ctx, ndata, 1); + + int32_t * data = (int32_t *) ggml_opt_dataset_data (dataset)->data; + int32_t * labels = (int32_t *) ggml_opt_dataset_labels(dataset)->data; + + for (int64_t i = 0; i < ndata; ++i) { + const int64_t off = i * stride; + float reward_sum = 0.0f; + for (int32_t j = 0; j < n_ctx; ++j) { + data [i * n_ctx + j] = flat_tokens[off + j]; + // Pass -1 sentinel through unchanged for masked (prompt) positions. + // opt_epoch_iter skips these positions (no label tensor write → zero + // cross-entropy contribution). 
Do NOT substitute the current token + // here — that trains the model to predict itself (off-by-one) and + // causes repetition degeneration. + labels[i * n_ctx + j] = flat_labels[off + j]; + reward_sum += flat_rewards[off + j]; + } + window_rewards[i] = reward_sum / n_ctx; + } + + // Normalize window rewards to [0, 1]. + // Step 1: clip to [-1, 1] — outliers like 1.3/1.4 would otherwise compress the + // useful signal range after min-max scaling (a reward=1.0 would map to + // only 0.83 instead of 1.0 if the max is 1.4). + // Step 2: min-max scale clipped values → [0, 1]. + // min → 0.0 (window ignored), max → 1.0 (full weight). + // If all rewards are identical (pure SFT dataset) keep at 1.0. + for (float & r : window_rewards) { + r = std::max(-1.0f, std::min(1.0f, r)); + } + float rmin = *std::min_element(window_rewards.begin(), window_rewards.end()); + float rmax = *std::max_element(window_rewards.begin(), window_rewards.end()); + const float rrange = rmax - rmin; + if (rrange > 1e-6f) { + for (float & r : window_rewards) { + r = (r - rmin) / rrange; + } + LOG_INF("%s: reward range [%.4f, %.4f] (after clip to [-1,1]) → normalized to [0, 1]\n", __func__, rmin, rmax); + } else { + std::fill(window_rewards.begin(), window_rewards.end(), 1.0f); + } + + return dataset; +} + +// --------------------------------------------------------------------------- +// LoRA tensor allocation +// --------------------------------------------------------------------------- + +struct lora_tensors { + struct ggml_context * ctx = nullptr; + struct ggml_backend_buffer * buf = nullptr; + // map: base tensor name → {lora_a, lora_b} + std::unordered_map> ab; +}; + +static lora_tensors alloc_lora_tensors( + const std::string & model_path, + const std::vector & targets, + int32_t rank, + std::mt19937 & rng, + int32_t freeze_layers = 0) { + + lora_tensors lt; + + // Open the model GGUF to discover tensor names and shapes + // without needing access to private llama_model internals. 
+ struct ggml_context * ctx_meta = nullptr; + struct gguf_init_params gguf_params = { /*.no_alloc=*/true, /*.ctx=*/&ctx_meta }; + struct gguf_context * ctx_gguf = gguf_init_from_file(model_path.c_str(), gguf_params); + if (!ctx_gguf) { + LOG_ERR("%s: failed to open model GGUF for tensor discovery: %s\n", + __func__, model_path.c_str()); + return lt; + } + + // Collect matching 2-D tensors + struct tensor_info { std::string name; int64_t ne0, ne1; }; + std::vector matched; + + for (ggml_tensor * t = ggml_get_first_tensor(ctx_meta); + t; t = ggml_get_next_tensor(ctx_meta, t)) { + if (ggml_n_dims(t) < 2) continue; + if (!tensor_matches_targets(t->name, targets, freeze_layers)) continue; + matched.push_back({t->name, t->ne[0], t->ne[1]}); + } + + gguf_free(ctx_gguf); + ggml_free(ctx_meta); + + if (matched.empty()) { + LOG_ERR("%s: no model tensors matched --lora-targets; check spelling\n", __func__); + return lt; + } + + if (freeze_layers > 0) { + LOG_INF("%s: freezing layers blk.0 .. blk.%d (no LoRA allocated; backward already pruned by grads_needed)\n", + __func__, freeze_layers - 1); + } + LOG_INF("%s: allocating LoRA A/B tensors for %zu weight matrices, rank=%d\n", + __func__, matched.size(), rank); + + // Allocate ggml context for A+B tensors (2 tensors per matched weight) + const size_t mem = (2 * matched.size() + 16) * ggml_tensor_overhead(); + struct ggml_init_params ip = { mem, nullptr, /*no_alloc=*/true }; + lt.ctx = ggml_init(ip); + + for (const auto & ti : matched) { + const int64_t in_dim = ti.ne0; // columns (input features) + const int64_t out_dim = ti.ne1; // rows (output features) + + // lora_a: [in_dim, rank] applied first: a @ x + // lora_b: [rank, out_dim] applied second: b @ (a @ x) + // Convention matches llama-adapter.cpp:48-60: + // a->ne[0] == in_dim, a->ne[1] == rank + // b->ne[0] == rank, b->ne[1] == out_dim + ggml_tensor * la = ggml_new_tensor_2d(lt.ctx, GGML_TYPE_F32, in_dim, rank); + ggml_tensor * lb = ggml_new_tensor_2d(lt.ctx, 
GGML_TYPE_F32, rank, out_dim); + + ggml_set_name(la, (ti.name + ".lora_a").c_str()); + ggml_set_name(lb, (ti.name + ".lora_b").c_str()); + + lt.ab[ti.name] = {la, lb}; + } + + // Allocate backend buffer for all LoRA tensors at once + lt.buf = ggml_backend_alloc_ctx_tensors_from_buft(lt.ctx, ggml_backend_cpu_buffer_type()); + + // Initialize: A ~ N(0, 1/sqrt(rank)), B = 0 + const float std_a = 1.0f / std::sqrt((float)rank); + std::normal_distribution dist(0.0f, std_a); + + for (auto & kv : lt.ab) { + ggml_tensor * la = kv.second.first; + ggml_tensor * lb = kv.second.second; + + // Fill A + float * data_a = (float *) la->data; + for (int64_t i = 0; i < ggml_nelements(la); ++i) data_a[i] = dist(rng); + // Zero B + memset(lb->data, 0, ggml_nbytes(lb)); + } + + return lt; +} + +// --------------------------------------------------------------------------- +// Param filter: only train lora_a / lora_b tensors +// --------------------------------------------------------------------------- + +static bool lora_param_filter(const struct ggml_tensor * t, void * /*ud*/) { + const char * n = t->name; + const size_t len = strlen(n); + if (len > 7 && strcmp(n + len - 7, ".lora_a") == 0) return true; + if (len > 7 && strcmp(n + len - 7, ".lora_b") == 0) return true; + return false; +} + +// --------------------------------------------------------------------------- +// Save adapter GGUF +// --------------------------------------------------------------------------- + +static std::string basename_from_path(const std::string & p) { + const size_t pos = p.find_last_of("/\\"); + if (pos == std::string::npos) return p; + return p.substr(pos + 1); +} + +static void save_adapter( + const lora_tensors & lt, + const std::string & out_path, + const std::string & arch, + float alpha, + const std::string & base_model_path) { + + // Build output GGUF context + struct gguf_context * gctx = gguf_init_empty(); + + // Metadata required by llama_adapter_lora_init + gguf_set_val_str(gctx, 
"general.type", "adapter"); + gguf_set_val_str(gctx, "general.architecture", arch.c_str()); + gguf_set_val_str(gctx, "adapter.type", "lora"); + gguf_set_val_f32(gctx, "adapter.lora.alpha", alpha); + gguf_set_val_str(gctx, "adapter.base_model", basename_from_path(base_model_path).c_str()); + + // Register tensors + for (const auto & kv : lt.ab) { + gguf_add_tensor(gctx, kv.second.first); // lora_a + gguf_add_tensor(gctx, kv.second.second); // lora_b + } + + // Write: meta placeholder → tensor data → rewrite meta + const std::string real_path = expand_tilde(out_path); + std::ofstream fout(real_path, std::ios::binary); + if (!fout.is_open()) { + LOG_ERR("%s: cannot open %s for writing\n", __func__, real_path.c_str()); + gguf_free(gctx); + return; + } + + // Write meta placeholder + const size_t meta_size = gguf_get_meta_size(gctx); + std::vector zeros_buf(meta_size, 0); + fout.write(zeros_buf.data(), meta_size); + + // Write tensor data — copy to CPU first in case tensors live on GPU + for (const auto & kv : lt.ab) { + for (ggml_tensor * t : {kv.second.first, kv.second.second}) { + const size_t nb = ggml_nbytes(t); + std::vector cpu_buf(nb); + ggml_backend_tensor_get(t, cpu_buf.data(), 0, nb); + fout.write(cpu_buf.data(), nb); + // GGUF tensors are 32-byte aligned + const size_t pad = GGML_PAD(nb, 32) - nb; + if (pad > 0) { + std::vector pad_buf(pad, 0); + fout.write(pad_buf.data(), pad); + } + } + } + + // Re-write metadata at offset 0 + std::vector meta(meta_size); + gguf_get_meta_data(gctx, meta.data()); + fout.seekp(0); + fout.write((const char *) meta.data(), meta_size); + + fout.close(); + gguf_free(gctx); + + LOG_INF("%s: adapter saved to %s\n", __func__, real_path.c_str()); +} + +// --------------------------------------------------------------------------- +// Periodic checkpoint callback +// --------------------------------------------------------------------------- + +struct save_ctx { + const lora_tensors * lt; + const std::string * lora_out; + const 
std::string * arch; + const std::string * base_model_path; + float lora_alpha; + int32_t save_every; // 0 = disabled + int32_t ubatch_per_ctx; + int64_t last_saved; // last window index at which we saved +}; + +// TLS pointer set before each epoch so the static callback can access it. +static thread_local save_ctx * g_save_ctx = nullptr; + +static void save_every_callback( + bool train, + ggml_opt_context_t opt_ctx, + ggml_opt_dataset_t dataset, + ggml_opt_result_t result, + int64_t ibatch, + int64_t ibatch_max, + int64_t t_start_us) { + ggml_opt_epoch_callback_progress_bar(train, opt_ctx, dataset, result, ibatch, ibatch_max, t_start_us); + + // Log loss at every window boundary so we can see if/when it diverges. + if (train && g_save_ctx) { + const int64_t window = ibatch / g_save_ctx->ubatch_per_ctx; + const int64_t ubatch_in_window = ibatch % g_save_ctx->ubatch_per_ctx; + if (ubatch_in_window == g_save_ctx->ubatch_per_ctx - 1) { + double loss = 0.0, loss_unc = 0.0; + ggml_opt_result_loss(result, &loss, &loss_unc); + fprintf(stderr, "\n[window %4ld] loss=%.4f ± %.4f\n", (long)window, loss, loss_unc); + } + } + + if (!train || !g_save_ctx || g_save_ctx->save_every <= 0) return; + const int64_t window = ibatch / g_save_ctx->ubatch_per_ctx; + if (window > 0 && window != g_save_ctx->last_saved && window % g_save_ctx->save_every == 0) { + g_save_ctx->last_saved = window; + const std::string ckpt = *g_save_ctx->lora_out + ".ckpt" + std::to_string(window) + ".gguf"; + save_adapter(*g_save_ctx->lt, ckpt, *g_save_ctx->arch, g_save_ctx->lora_alpha, *g_save_ctx->base_model_path); + fprintf(stderr, "\n"); + LOG_INF("save_every_callback: checkpoint saved -> %s (window %ld)\n", ckpt.c_str(), (long)window); + } +} + +// --------------------------------------------------------------------------- +// IPC helpers (stdout protocol, stdin commands) +// --------------------------------------------------------------------------- + +// Escape newlines and backslashes for single-line 
IPC transmission. +// Mirrors _escape() in gguf_trainer.py. +static std::string ipc_escape(const std::string & s) { + std::string out; + out.reserve(s.size()); + for (char c : s) { + if (c == '\\') out += "\\\\"; + else if (c == '\n') out += "\\n"; + else if (c == '\r') out += "\\r"; + else out += c; + } + return out; +} + +static void ipc_emit(const char * msg) { + fputs(msg, stdout); + fputc('\n', stdout); + fflush(stdout); +} + +// Read one line from stdin, trimming the trailing newline. +// Returns false on EOF or error. +static bool ipc_read_line(std::string & out) { + out.clear(); + if (!std::getline(std::cin, out)) return false; + // Strip trailing \r if present (Windows line endings) + if (!out.empty() && out.back() == '\r') out.pop_back(); + return true; +} + +// Parse "REWARD r1 r2 ... rN" into a float vector. +static std::vector ipc_parse_rewards(const std::string & line) { + std::vector rewards; + if (line.size() < 8 || line.substr(0, 7) != "REWARD ") return rewards; + std::istringstream ss(line.substr(7)); + float r; + while (ss >> r) rewards.push_back(r); + return rewards; +} + +// --------------------------------------------------------------------------- +// Greedy / temperature sampling for GRPO rollout generation +// --------------------------------------------------------------------------- + +static std::string generate_response( + llama_context * ctx, + llama_model * model, + const std::string & prompt, + int32_t max_tokens, + float temperature, + std::mt19937 & rng) { + + const llama_vocab * vocab = llama_model_get_vocab(model); + auto tokens = common_tokenize(ctx, prompt, /*add_special=*/true); + if (tokens.empty()) return ""; + + // Clear KV cache before each generation (don't carry over previous prompt state) + llama_memory_clear(llama_get_memory(ctx), true); + { + llama_batch batch = llama_batch_get_one(tokens.data(), (int32_t)tokens.size()); + if (llama_decode(ctx, batch) != 0) { + LOG_ERR("%s: llama_decode failed on prompt\n", __func__); 
+ return ""; + } + } + + std::string output; + const llama_token eos = llama_vocab_eos(vocab); + const llama_token nl = llama_vocab_nl(vocab); + + // For ChatML models <|im_end|> is the turn-end marker but may not be the + // vocab EOS token. Look it up by tokenizing the string and taking the + // first token if it tokenizes to exactly one piece. + llama_token im_end = -1; + { + std::vector im_end_tokens(8); + static const char im_end_str[] = "<|im_end|>"; + int n = llama_tokenize(vocab, im_end_str, (int32_t)strlen(im_end_str), im_end_tokens.data(), (int32_t)im_end_tokens.size(), /*add_special=*/false, /*parse_special=*/true); + if (n == 1) im_end = im_end_tokens[0]; + } + const llama_token eot = llama_vocab_eot(vocab); // may equal eos on some models + + for (int32_t i = 0; i < max_tokens; ++i) { + // Sample next token — use ith=-1 to always get the LAST output position's + // logits. llama_get_logits(ctx) returns position 0 which is wrong when the + // prompt batch has multiple output tokens (training context). 
+ float * logits = llama_get_logits_ith(ctx, -1); + if (!logits) { + LOG_ERR("%s: llama_get_logits_ith(-1) returned NULL\n", __func__); + break; + } + const int32_t n_vocab = llama_vocab_n_tokens(vocab); + + llama_token next_token; + if (temperature <= 0.0f) { + // Greedy + next_token = (llama_token)(std::max_element(logits, logits + n_vocab) - logits); + } else { + // Temperature sampling via softmax + categorical draw + std::vector probs(n_vocab); + float max_logit = *std::max_element(logits, logits + n_vocab); + float sum = 0.0f; + for (int32_t k = 0; k < n_vocab; ++k) { + probs[k] = std::exp((logits[k] - max_logit) / temperature); + sum += probs[k]; + } + for (float & p : probs) p /= sum; + std::discrete_distribution dist(probs.begin(), probs.end()); + next_token = dist(rng); + } + + if (next_token == eos) break; + if (next_token == eot) break; + if (im_end >= 0 && next_token == im_end && !output.empty()) break; + + // Decode token to text + char buf[256] = {}; + llama_token_to_piece(vocab, next_token, buf, sizeof(buf) - 1, 0, true); + output += buf; + + // Feed token back for next step + llama_batch batch = llama_batch_get_one(&next_token, 1); + if (llama_decode(ctx, batch) != 0) break; + } + + return output; +} + +// --------------------------------------------------------------------------- +// GRPO IPC training loop +// --------------------------------------------------------------------------- + +// Volatile flag set by SIGINT so the loop can exit cleanly. 
+static volatile sig_atomic_t g_grpo_stop = 0; +static void grpo_sigint_handler(int) { g_grpo_stop = 1; } + +static int run_grpo_mode( + common_params & params, + llama_model * model, + llama_context * ctx, + lora_tensors & lt, + const std::string & arch, + float lora_alpha, + const std::string & base_model_path) { + + const int32_t n_ctx = llama_n_ctx(ctx); + const int32_t n_gen = params.grpo_n_gen; + const int32_t n_steps = params.grpo_n_steps; + const float temp = params.grpo_temperature; + const int32_t max_tok = params.grpo_max_tokens; + + std::mt19937 rng(params.sampling.seed != LLAMA_DEFAULT_SEED + ? params.sampling.seed : 42); + + // Initialize optimizer + struct llama_opt_params lopt_params { + /*.n_ctx_train =*/0, + /*.param_filter =*/lora_param_filter, + /*.param_filter_ud =*/nullptr, + /*.get_opt_pars =*/common_opt_lr_pars, + /*.get_opt_pars_ud =*/¶ms.lr, + /*.optimizer_type =*/params.optimizer, + /*.grad_checkpoint_interval =*/params.grad_checkpoint_interval, + }; + llama_opt_init(ctx, model, lopt_params); + + const llama_token bos = llama_vocab_bos(llama_model_get_vocab(model)); + + signal(SIGINT, grpo_sigint_handler); + + // Signal Python that we are ready + ipc_emit("[QLORA:READY]"); + + float last_loss = 0.0f; + int step = 0; + + while (step < n_steps && !g_grpo_stop) { + + // ── Request prompt ──────────────────────────────────────────────── + { + char buf[64]; + snprintf(buf, sizeof(buf), "[QLORA:PROMPT_REQ:%d]", step + 1); + ipc_emit(buf); + } + + std::string prompt_line; + if (!ipc_read_line(prompt_line)) break; + if (prompt_line == "STOP") { + LOG_INF("grpo: received STOP from Python\n"); + break; + } + if (prompt_line.size() < 8 || prompt_line.substr(0, 7) != "PROMPT ") { + char buf[128]; + snprintf(buf, sizeof(buf), "[QLORA:ERROR] expected PROMPT, got: %.80s", prompt_line.c_str()); + ipc_emit(buf); + return 1; + } + // Unescape the prompt (\\n → \n etc.) 
+ std::string prompt; + { + const std::string esc = prompt_line.substr(7); + prompt.reserve(esc.size()); + for (size_t i = 0; i < esc.size(); ++i) { + if (esc[i] == '\\' && i + 1 < esc.size()) { + char next = esc[i+1]; + if (next == 'n') { prompt += '\n'; ++i; } + else if (next == 'r') { prompt += '\r'; ++i; } + else if (next == '\\') { prompt += '\\'; ++i; } + else { prompt += esc[i]; } + } else { + prompt += esc[i]; + } + } + } + + // ── Generate N responses ────────────────────────────────────────── + std::vector generations(n_gen); + for (int k = 0; k < n_gen; ++k) { + generations[k] = generate_response(ctx, model, prompt, max_tok, temp, rng); + + char hdr[64]; + snprintf(hdr, sizeof(hdr), "[QLORA:GEN:%d/%d] ", k + 1, n_gen); + std::string msg = std::string(hdr) + ipc_escape(generations[k]); + ipc_emit(msg.c_str()); + } + + // ── Request rewards ─────────────────────────────────────────────── + { + char buf[64]; + snprintf(buf, sizeof(buf), "[QLORA:REWARD_REQ:%d]", n_gen); + ipc_emit(buf); + } + + std::string reward_line; + if (!ipc_read_line(reward_line)) break; + if (reward_line == "STOP") { + LOG_INF("grpo: received STOP from Python\n"); + break; + } + std::vector rewards = ipc_parse_rewards(reward_line); + if ((int32_t)rewards.size() != n_gen) { + char buf[128]; + snprintf(buf, sizeof(buf), "[QLORA:ERROR] expected %d rewards, got %zu", n_gen, rewards.size()); + ipc_emit(buf); + return 1; + } + + // ── Build single-step mini-dataset: prompt+generations with rewards ─ + // Each generation is a separate sample; prompt = no-loss, generation = loss. 
std::vector<training_sample> step_samples;
ggml_opt_result_loss(step_result, &loss, &loss_unc); + last_loss = (float)loss; + + ggml_opt_result_free(step_result); + ggml_opt_dataset_free(step_dataset); + llama_opt_set_reward_weights(nullptr, 0); + + ++step; + + // ── Emit progress ───────────────────────────────────────────────── + { + char buf[128]; + snprintf(buf, sizeof(buf), + "[QLORA:PROGRESS] step=%d/%d loss=%.4f epoch=1/1", + step, n_steps, last_loss); + ipc_emit(buf); + } + + // ── Optional checkpoint ─────────────────────────────────────────── + if (params.save_every > 0 && step % params.save_every == 0) { + std::string ckpt = params.lora_out + ".ckpt" + std::to_string(step) + ".gguf"; + save_adapter(lt, ckpt, arch, lora_alpha, base_model_path); + char buf[512]; + snprintf(buf, sizeof(buf), "[QLORA:CHECKPOINT] %s", ckpt.c_str()); + ipc_emit(buf); + } + } + + // Save final adapter + save_adapter(lt, params.lora_out, arch, lora_alpha, base_model_path); + + { + char buf[64]; + snprintf(buf, sizeof(buf), "[QLORA:DONE] final_loss=%.4f", last_loss); + ipc_emit(buf); + } + + return 0; +} + +// --------------------------------------------------------------------------- +int main(int argc, char ** argv) { + std::setlocale(LC_NUMERIC, "C"); + + common_params params; + params.escape = false; + + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_FINETUNE_QLORA)) { + return 1; + } + + if (!params.grpo_mode && params.train_file.empty()) { + LOG_ERR("%s: --train-file is required (or use --grpo-mode for IPC training)\n", __func__); + return 1; + } + + // Force settings required for training + params.use_mmap = false; + params.cache_type_k = GGML_TYPE_F32; + params.cache_type_v = GGML_TYPE_F32; + // Warmup runs inference with PARAM-flagged tensors which causes a segfault; + // training never benefits from warmup, so disable it unconditionally. + params.warmup = false; + // Flash attention has no backward implementation; force standard attention for training. 
+ params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_DISABLED; + + const float lora_alpha = (params.lora_alpha > 0.0f) + ? params.lora_alpha : (float) params.lora_rank; + const auto targets = split_csv(params.lora_targets); + + // --- Step 1: Discover tensor shapes from model GGUF (no model load yet) --- + std::string arch; + { + struct ggml_context * ctx_meta = nullptr; + struct gguf_init_params gp = { true, &ctx_meta }; + struct gguf_context * ctx_gguf = gguf_init_from_file(params.model.path.c_str(), gp); + if (!ctx_gguf) { LOG_ERR("failed to open model GGUF\n"); return 1; } + int kid = gguf_find_key(ctx_gguf, "general.architecture"); + if (kid >= 0) arch = gguf_get_val_str(ctx_gguf, kid); + gguf_free(ctx_gguf); + ggml_free(ctx_meta); + } + + // --- Step 2: Allocate LoRA tensors and save initial adapter GGUF --- + // If the user already supplied a --lora adapter we reuse it (resume training). + // Otherwise we allocate fresh tensors (B=0, A=random), write them to a temp + // .init.gguf so common_init_from_params can load them before context creation + // (this makes sched_reserve size the graph to include LoRA nodes). 
+ const bool resume_from_lora = !params.lora_adapters.empty(); + + std::mt19937 rng(42); + lora_tensors lt; // will be populated after context load (Step 4) + std::string init_adapter_path; + + if (!resume_from_lora) { + lt = alloc_lora_tensors(params.model.path, targets, params.lora_rank, rng, params.lora_freeze_layers); + if (lt.ab.empty()) return 1; + + init_adapter_path = params.lora_out + ".init.gguf"; + save_adapter(lt, init_adapter_path, arch, lora_alpha, params.model.path); + + // Register adapter so common_init_from_params loads it before context creation + common_adapter_lora_info adapter_info; + adapter_info.path = init_adapter_path; + adapter_info.scale = 1.0f; + params.lora_adapters.push_back(adapter_info); + } else { + LOG_INF("%s: resuming training from existing LoRA adapter: %s\n", + __func__, params.lora_adapters.back().path.c_str()); + } + + // --- Step 3: Load model + context (graph sized with LoRA nodes) --- + common_init(); + llama_backend_init(); + llama_numa_init(params.numa); + + auto llama_init = common_init_from_params(params); + auto * model = llama_init->model(); + auto * ctx = llama_init->context(); + + if (!model) { LOG_ERR("failed to load model\n"); return 1; } + + LOG_INF("%s\n", common_params_get_system_info(params).c_str()); + + // Arch fallback if not in GGUF metadata + if (arch.empty()) { + char buf[256] = {}; + llama_model_desc(model, buf, sizeof(buf)); + arch = std::string(buf); + arch = arch.substr(0, arch.find_first_of(" /")); + } + + // --- Step 4: Mark the loaded adapter tensors as trainable --- + // common_init_from_params loaded the adapter; params.lora_adapters[back].ptr + // points to the live llama_adapter_lora with its own tensor copies in device + // memory. Mark those tensors trainable so the optimizer graph includes them. 
+ { + llama_adapter_lora * loaded = params.lora_adapters.back().ptr; + if (!loaded) { + LOG_ERR("%s: adapter was not loaded by common_init_from_params\n", __func__); + return 1; + } + for (auto & kv : loaded->ab_map) { + ggml_set_param(kv.second.a); // lora_a → trainable + ggml_set_param(kv.second.b); // lora_b → trainable + } + // Point lt.ab at the live device tensors so save_adapter writes + // the trained weights (not the original init tensors). + lt.ab.clear(); + for (auto & kv : loaded->ab_map) { + lt.ab[kv.first] = {kv.second.a, kv.second.b}; + } + } + + // Remove temp init file when we created it (resume path has no init file) + if (!resume_from_lora && !init_adapter_path.empty()) { + std::remove(expand_tilde(init_adapter_path).c_str()); + } + + // --- Step 5: Load dataset --- + // In GRPO mode the dataset comes from Python via stdin/stdout — skip file loading. + auto tmpls = common_chat_templates_init(model, ""); + if (params.grpo_mode) { + int rc = run_grpo_mode(params, model, ctx, lt, arch, lora_alpha, params.model.path); + if (lt.buf) ggml_backend_buffer_free(lt.buf); + if (lt.ctx) ggml_free(lt.ctx); + llama_backend_free(); + return rc; + } + auto samples = load_jsonl(params.train_file, ctx, tmpls.get()); + if (samples.empty()) { + LOG_ERR("%s: no training samples loaded\n", __func__); + return 1; + } + + const int32_t n_ctx = llama_n_ctx(ctx); + std::vector window_rewards; + const llama_token bos = llama_vocab_bos(llama_model_get_vocab(model)); + auto dataset = build_dataset(samples, n_ctx, window_rewards, params.train_on_prompt, bos); + if (!dataset) return 1; + + // Check if any reward deviates from 1.0 — if so, enable reward-weighted SFT + const bool has_rewards = std::any_of(window_rewards.begin(), window_rewards.end(), + [](float r){ return std::abs(r - 1.0f) > 1e-4f; }); + if (has_rewards) { + LOG_INF("%s: reward-weighted SFT enabled (found non-uniform rewards in dataset)\n", __func__); + llama_opt_set_reward_weights(window_rewards.data(), 
(int64_t)window_rewards.size());
+ }
+
+ // Initialize optimizer — our custom param filter restricts training to lora_a/b
+ struct llama_opt_params lopt_params {
+ /*.n_ctx_train =*/0,
+ /*.param_filter =*/lora_param_filter,
+ /*.param_filter_ud =*/nullptr,
+ /*.get_opt_pars =*/common_opt_lr_pars,
+ /*.get_opt_pars_ud =*/&params.lr,
+ /*.optimizer_type =*/params.optimizer,
+ /*.grad_checkpoint_interval =*/params.grad_checkpoint_interval,
+ };
+ llama_opt_init(ctx, model, lopt_params);
+
+ const int64_t idata_split = ggml_opt_dataset_ndata(dataset) * (1.0f - params.val_split);
+
+ ggml_opt_result_t result_train = ggml_opt_result_init();
+ ggml_opt_result_t result_eval = ggml_opt_result_init();
+
+ const int32_t n_ubatch = llama_n_ubatch(ctx);
+ const int32_t ubatch_per_ctx = (n_ubatch > 0) ? (n_ctx / n_ubatch) : 1;
+
+ save_ctx sctx { &lt, &params.lora_out, &arch, &params.model.path, lora_alpha, params.save_every, ubatch_per_ctx, 0 };
+ g_save_ctx = &sctx;
+
+ const int64_t total_windows = ggml_opt_dataset_ndata(dataset);
+ LOG_INF("%s: starting QLoRA training — rank=%d alpha=%.1f epochs=%d loss=%s\n",
+ __func__, params.lora_rank, lora_alpha, params.lr.epochs,
+ params.train_on_prompt ? "prompt+response" : "response-only");
+ LOG_INF("%s: dataset: %ld windows × %d ubatches = %ld steps per epoch (n_ctx=%d n_ubatch=%d stride=%d)\n",
+ __func__, (long)total_windows, ubatch_per_ctx, (long)(idata_split * ubatch_per_ctx),
+ n_ctx, n_ubatch, n_ctx / 2);
+ if (params.save_every > 0) {
+ LOG_INF("%s: will save checkpoint every %d windows → %s.ckptN.gguf\n",
+ __func__, params.save_every, params.lora_out.c_str());
+ }
+
+ ggml_opt_epoch_callback cb_train = (params.save_every > 0)
+ ?
save_every_callback + : ggml_opt_epoch_callback_progress_bar; + + for (params.lr.epoch = 0; params.lr.epoch < params.lr.epochs; ++params.lr.epoch) { + sctx.last_saved = 0; // reset per-epoch window counter + llama_opt_epoch(ctx, dataset, result_train, result_eval, idata_split, + cb_train, + ggml_opt_epoch_callback_progress_bar, + params.shuffle_dataset); + fprintf(stderr, "\n"); + + // Per-epoch loss summary + { + double train_loss = 0.0, train_unc = 0.0; + ggml_opt_result_loss(result_train, &train_loss, &train_unc); + if (idata_split < ggml_opt_dataset_ndata(dataset)) { + double val_loss = 0.0, val_unc = 0.0; + ggml_opt_result_loss(result_eval, &val_loss, &val_unc); + LOG_INF("epoch %d/%d: train_loss=%.4f ± %.4f val_loss=%.4f ± %.4f\n", + params.lr.epoch + 1, params.lr.epochs, train_loss, train_unc, val_loss, val_unc); + } else { + LOG_INF("epoch %d/%d: train_loss=%.4f ± %.4f\n", + params.lr.epoch + 1, params.lr.epochs, train_loss, train_unc); + } + } + + ggml_opt_result_reset(result_train); + ggml_opt_result_reset(result_eval); + } + + ggml_opt_result_free(result_train); + ggml_opt_result_free(result_eval); + llama_opt_set_reward_weights(nullptr, 0); + + // Save final trained adapter + save_adapter(lt, params.lora_out, arch, lora_alpha, params.model.path); + + // Free scratch buffers only when we allocated them (not in resume path) + if (lt.buf) ggml_backend_buffer_free(lt.buf); + if (lt.ctx) ggml_free(lt.ctx); + ggml_opt_dataset_free(dataset); + llama_backend_free(); + + return 0; +} diff --git a/examples/qlora_training/grpo_example.py b/examples/qlora_training/grpo_example.py new file mode 100644 index 0000000000..9c7f2e3c26 --- /dev/null +++ b/examples/qlora_training/grpo_example.py @@ -0,0 +1,397 @@ +#!/usr/bin/env python3 +""" +grpo_example.py — Minimal GRPO training loop using llama-finetune-qlora --grpo-mode + +Demonstrates the IPC protocol between the Python driver and the C++ subprocess. +No external dependencies required — only Python stdlib. 
+ +Usage: + python3 grpo_example.py \ + --model /path/to/model-q4_k_m.gguf \ + --lora-out /path/to/output-adapter.gguf \ + [--lora /path/to/resume-adapter.gguf] \ + [--binary /path/to/llama-finetune-qlora] \ + [--n-steps 200] \ + [--n-gen 8] \ + [--rank 16] + +IPC Protocol (stdout from C++ process): + [QLORA:READY] — process initialised + [QLORA:PROMPT_REQ:] — C++ requests a prompt for step N + [QLORA:GEN:/] — one generation (newlines escaped as \\n) + [QLORA:REWARD_REQ:] — C++ requests N reward scores + [QLORA:PROGRESS] step=X/Y loss=Z epoch=A/B + [QLORA:CHECKPOINT] + [QLORA:DONE] final_loss=X + [QLORA:ERROR] + +Python → C++ stdin: + PROMPT + REWARD ... (advantages, 0..1 range) + STOP (request graceful shutdown) +""" + +import argparse +import logging +import math +import os +import re +import subprocess +import sys +import time +from pathlib import Path +from typing import List, Optional, Tuple + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s [%(levelname)s] %(message)s", +) +log = logging.getLogger("grpo_example") + +# ────────────────────────────────────────────────────────────────────────────── +# IPC helpers +# ────────────────────────────────────────────────────────────────────────────── + +_IPC_RE = re.compile(r"^\[QLORA:([A-Z_]+)(?::([^\]]*))?\](.*)$") + + +def escape(text: str) -> str: + """Escape newlines and backslashes for single-line IPC transport.""" + return text.replace("\\", "\\\\").replace("\n", "\\n").replace("\r", "\\r") + + +def unescape(text: str) -> str: + """Reverse of escape().""" + out, i = [], 0 + while i < len(text): + if text[i] == "\\" and i + 1 < len(text): + c = text[i + 1] + if c == "n": + out.append("\n") + elif c == "r": + out.append("\r") + elif c == "\\": + out.append("\\") + else: + out.append(c) + i += 2 + else: + out.append(text[i]) + i += 1 + return "".join(out) + + +def parse_ipc(line: str) -> Optional[Tuple[str, str, str]]: + """ + Parse an IPC line into (msg_type, seq, payload). 
+ Returns None for non-IPC lines (model output, log lines, etc.). + """ + m = _IPC_RE.match(line.strip()) + if not m: + return None + return m.group(1), (m.group(2) or ""), m.group(3).strip() + + +def read_ipc(proc: subprocess.Popen, timeout: float = 120.0) -> Optional[Tuple[str, str, str]]: + """ + Read lines from proc.stdout until an IPC message arrives. + Non-IPC lines (model output, C++ logs leaked to stdout) are printed. + Returns None on EOF. + Raises TimeoutError if nothing arrives within `timeout` seconds. + """ + deadline = time.monotonic() + timeout + while True: + remaining = deadline - time.monotonic() + if remaining <= 0: + raise TimeoutError(f"No IPC message within {timeout:.0f}s") + + line = proc.stdout.readline() + if not line: + return None # EOF + + line = line.rstrip("\n") + parsed = parse_ipc(line) + if parsed: + return parsed + # Non-IPC — C++ sometimes leaks timing/debug lines to stdout. + # Print them so the user can see what's happening. + print(f" [cpp] {line}", file=sys.stderr) + + +def write_cmd(proc: subprocess.Popen, cmd: str): + """Write one command line to the subprocess stdin.""" + try: + proc.stdin.write(cmd + "\n") + proc.stdin.flush() + except BrokenPipeError: + raise RuntimeError("C++ subprocess stdin closed — did it crash?") + + +def wait_for(proc: subprocess.Popen, expected: str, timeout: float = 120.0) -> Tuple[str, str, str]: + """Block until the expected IPC message type arrives.""" + deadline = time.monotonic() + timeout + while True: + remaining = deadline - time.monotonic() + if remaining <= 0: + raise TimeoutError(f"Timed out waiting for [{expected}]") + parsed = read_ipc(proc, timeout=remaining) + if parsed is None: + raise RuntimeError(f"Subprocess exited before sending [{expected}]") + msg_type, seq, payload = parsed + if msg_type == expected: + return msg_type, seq, payload + log.debug("Ignoring unexpected IPC (%s) while waiting for %s", msg_type, expected) + + +# 
────────────────────────────────────────────────────────────────────────────── +# Advantage normalisation (GRPO) +# ────────────────────────────────────────────────────────────────────────────── + +def normalise_rewards(rewards: List[float]) -> List[float]: + """ + Group-relative advantage normalisation: subtract mean, divide by std. + Clipped to [0, 1] so the C++ side always receives values in that range. + + All-equal rewards → uniform 0.5 (no signal, but no NaN either). + """ + if len(rewards) == 0: + return [] + mean = sum(rewards) / len(rewards) + variance = sum((r - mean) ** 2 for r in rewards) / len(rewards) + std = math.sqrt(variance) if variance > 1e-8 else 1.0 + + normalised = [(r - mean) / std for r in rewards] + # Shift to [0,1]: z-scores typically lie in [-3, +3] + clipped = [max(0.0, min(1.0, 0.5 + z / 6.0)) for z in normalised] + return clipped + + +# ────────────────────────────────────────────────────────────────────────────── +# Example prompt / reward providers +# ────────────────────────────────────────────────────────────────────────────── + +# Replace these with your own logic. + +_EXAMPLE_PROMPTS = [ + "Explain the concept of gradient descent in one sentence.", + "What is the capital of France?", + "Write a haiku about machine learning.", + "Describe the difference between SFT and RLHF.", + "What does GRPO stand for?", +] + + +def get_prompt(step: int) -> str: + """Return a prompt for the given training step (0-indexed).""" + return _EXAMPLE_PROMPTS[step % len(_EXAMPLE_PROMPTS)] + + +def score_generations(prompt: str, generations: List[str]) -> List[float]: + """ + Score a list of model generations for the given prompt. + Returns a list of raw reward scores (any numeric range; will be normalised). + + This example uses a trivial heuristic: longer, more varied responses + score higher. Replace with your actual reward model / verifier. 
+ """ + scores = [] + for gen in generations: + words = gen.split() + # Simple heuristics: length + lexical diversity + length_score = min(1.0, len(words) / 50.0) + vocab_score = min(1.0, len(set(words)) / max(1, len(words))) + scores.append(0.6 * length_score + 0.4 * vocab_score) + return scores + + +# ────────────────────────────────────────────────────────────────────────────── +# Main GRPO loop +# ────────────────────────────────────────────────────────────────────────────── + +def run_grpo(args: argparse.Namespace): + # Resolve binary + binary = Path(args.binary) + if not binary.exists(): + log.error("Binary not found: %s", binary) + sys.exit(1) + + # Build command + cmd = [ + str(binary), + "--model", args.model, + "--lora-out", args.lora_out, + "--lora-rank", str(args.rank), + "--lora-alpha", str(args.rank // 2), + "-c", str(args.ctx_size), + "-b", str(args.ctx_size), + "-ub", "512", + "-ngl", str(args.ngl), + "-lr", str(args.lr), + "--seed", str(args.seed), + "--grad-checkpoint","48", + "--shuffle-dataset", + "--grpo-mode", + "--n-gen", str(args.n_gen), + "--n-steps", str(args.n_steps), + "--grpo-temp", str(args.temperature), + "--grpo-max-tokens",str(args.max_tokens), + ] + + if args.lora: + cmd += ["--lora", args.lora] + + if args.save_every > 0: + cmd += ["--save-every", str(args.save_every)] + + log.info("Launching: %s", " ".join(cmd)) + + proc = subprocess.Popen( + cmd, + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=sys.stderr, # C++ debug/timing logs go directly to our stderr + text=True, + bufsize=1, + ) + + try: + _grpo_loop(proc, args) + except KeyboardInterrupt: + log.info("Interrupted — requesting graceful stop") + try: + write_cmd(proc, "STOP") + except Exception: + pass + except Exception as e: + log.error("GRPO loop error: %s", e) + proc.kill() + raise + finally: + try: + proc.stdin.close() + except Exception: + pass + rc = proc.wait(timeout=30) + if rc not in (0, None): + log.warning("Subprocess exited with code %d", rc) + + +def 
_grpo_loop(proc: subprocess.Popen, args: argparse.Namespace): + # ── Wait for READY ────────────────────────────────────────────────────── + log.info("Waiting for subprocess to initialise (model load can take a minute)…") + wait_for(proc, "READY", timeout=300) + log.info("Subprocess ready.") + + current_prompt: str = "" + generations: List[str] = [] + step = 0 + + while True: + parsed = read_ipc(proc, timeout=600) + if parsed is None: + log.info("Subprocess exited (EOF).") + break + + msg_type, seq, payload = parsed + + # ── PROMPT_REQ ────────────────────────────────────────────────────── + if msg_type == "PROMPT_REQ": + step = int(seq) if seq else step + 1 + current_prompt = get_prompt(step - 1) + generations = [] + log.debug("Step %d — sending prompt: %s", step, current_prompt[:60]) + write_cmd(proc, f"PROMPT {escape(current_prompt)}") + + # ── GEN ───────────────────────────────────────────────────────────── + elif msg_type == "GEN": + # seq = "k/n" + parts = seq.split("/") + k = int(parts[0]) + n = int(parts[1]) if len(parts) > 1 else args.n_gen + text = unescape(payload) + generations.append(text) + log.debug(" Generation %d/%d: %s…", k, n, text[:60].replace("\n", "↵")) + + # ── REWARD_REQ ────────────────────────────────────────────────────── + elif msg_type == "REWARD_REQ": + n_expected = int(seq) if seq else len(generations) + if len(generations) != n_expected: + log.warning( + "REWARD_REQ asked for %d rewards but collected %d generations", + n_expected, len(generations), + ) + + raw_rewards = score_generations(current_prompt, generations) + advantages = normalise_rewards(raw_rewards) + + reward_str = " ".join(f"{a:.6f}" for a in advantages) + log.debug(" Rewards (raw): %s", [f"{r:.3f}" for r in raw_rewards]) + log.debug(" Advantages: %s", [f"{a:.3f}" for a in advantages]) + write_cmd(proc, f"REWARD {reward_str}") + + # ── PROGRESS ──────────────────────────────────────────────────────── + elif msg_type == "PROGRESS": + # Format: step=X/Y loss=Z epoch=A/B 
+ sm = re.search(r"step=(\d+)(?:/(\d+))?", payload) + lm = re.search(r"loss=([\d.]+)", payload) + step_str = f"{sm.group(1)}/{sm.group(2)}" if sm and sm.group(2) else (sm.group(1) if sm else "?") + loss_str = lm.group(1) if lm else "?" + print(f" step {step_str} loss {loss_str}", flush=True) + + # ── CHECKPOINT ────────────────────────────────────────────────────── + elif msg_type == "CHECKPOINT": + log.info("Checkpoint saved: %s", payload.strip()) + + # ── DONE ──────────────────────────────────────────────────────────── + elif msg_type == "DONE": + m = re.search(r"final_loss=([\d.]+)", payload) + loss = m.group(1) if m else "?" + log.info("Training complete. final_loss=%s", loss) + break + + # ── ERROR ──────────────────────────────────────────────────────────── + elif msg_type == "ERROR": + log.error("C++ process error: %s", payload.strip()) + raise RuntimeError(f"Training failed: {payload.strip()}") + + else: + log.debug("Unknown IPC message: [%s] seq=%r payload=%r", msg_type, seq, payload) + + +# ────────────────────────────────────────────────────────────────────────────── +# CLI +# ────────────────────────────────────────────────────────────────────────────── + +def parse_args() -> argparse.Namespace: + # Default binary: build/bin/ relative to this script's repo root + script_dir = Path(__file__).resolve().parent + repo_root = script_dir.parents[1] # examples/qlora_training → llama.cpp root + default_bin = repo_root / "build" / "bin" / "llama-finetune-qlora" + + p = argparse.ArgumentParser( + description="Minimal GRPO training loop via llama-finetune-qlora --grpo-mode", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + p.add_argument("--model", required=True, help="Base GGUF model path") + p.add_argument("--lora-out", required=True, help="Output adapter GGUF path") + p.add_argument("--lora", default=None, help="Resume from existing adapter GGUF") + p.add_argument("--binary", default=str(default_bin), help="Path to llama-finetune-qlora binary") 
+ p.add_argument("--rank", type=int, default=16, help="LoRA rank") + p.add_argument("--n-steps", type=int, default=200, help="Number of GRPO steps") + p.add_argument("--n-gen", type=int, default=8, help="Generations per prompt") + p.add_argument("--lr", type=float, default=1e-4, help="Learning rate") + p.add_argument("--ctx-size", type=int, default=4096, help="Context window") + p.add_argument("--ngl", type=int, default=999, help="GPU layers (-ngl)") + p.add_argument("--temperature", type=float, default=0.8, help="Sampling temperature") + p.add_argument("--max-tokens", type=int, default=512, help="Max tokens per generation") + p.add_argument("--save-every", type=int, default=0, help="Save checkpoint every N steps (0=off)") + p.add_argument("--seed", type=int, default=42, help="RNG seed") + p.add_argument("--verbose", action="store_true", help="Enable DEBUG logging") + return p.parse_args() + + +if __name__ == "__main__": + args = parse_args() + if args.verbose: + logging.getLogger().setLevel(logging.DEBUG) + run_grpo(args) diff --git a/examples/qlora_training/sample_data.jsonl b/examples/qlora_training/sample_data.jsonl new file mode 100644 index 0000000000..982168cfc1 --- /dev/null +++ b/examples/qlora_training/sample_data.jsonl @@ -0,0 +1,7 @@ +{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "What is the capital of France?"}, {"role": "assistant", "content": "The capital of France is Paris."}]} +{"messages": [{"role": "user", "content": "Explain gradient descent in one sentence."}, {"role": "assistant", "content": "Gradient descent iteratively adjusts model parameters in the direction that most reduces the loss function."}]} +{"messages": [{"role": "system", "content": "You are a concise coding assistant."}, {"role": "user", "content": "Write a Python function that returns the factorial of n."}, {"role": "assistant", "content": "```python\ndef factorial(n):\n return 1 if n <= 1 else n * factorial(n - 
1)\n```"}]} +{"prompt": "What is supervised fine-tuning?", "response": "Supervised fine-tuning (SFT) trains a pre-trained model on labelled examples of the desired behaviour, adjusting weights to minimise cross-entropy loss on the target responses."} +{"prompt": "What is LoRA?", "response": "LoRA (Low-Rank Adaptation) fine-tunes large models by injecting trainable low-rank matrices into frozen weight layers, dramatically reducing the number of trainable parameters."} +{"text": "The transformer architecture, introduced in 'Attention Is All You Need' (Vaswani et al., 2017), relies entirely on self-attention mechanisms and has become the foundation of modern large language models."} +{"text": "QLoRA combines quantization with LoRA: the base model is loaded in 4-bit precision (frozen), while only the small LoRA adapter weights are trained in full precision."} diff --git a/examples/qlora_training/sample_rwsft_data.jsonl b/examples/qlora_training/sample_rwsft_data.jsonl new file mode 100644 index 0000000000..3bf0234451 --- /dev/null +++ b/examples/qlora_training/sample_rwsft_data.jsonl @@ -0,0 +1,6 @@ +{"messages": [{"role": "user", "content": "Explain what a neural network is."}, {"role": "assistant", "content": "A neural network is a computational model loosely inspired by biological neurons, consisting of layers of interconnected nodes that learn to map inputs to outputs through training."}], "reward": 0.9} +{"messages": [{"role": "user", "content": "Explain what a neural network is."}, {"role": "assistant", "content": "It's a thing that does AI stuff with nodes and layers."}], "reward": 0.2} +{"messages": [{"role": "user", "content": "What is backpropagation?"}, {"role": "assistant", "content": "Backpropagation computes gradients of the loss with respect to each parameter by applying the chain rule backwards through the computation graph, enabling gradient descent to update all weights efficiently."}], "reward": 1.0} +{"messages": [{"role": "user", "content": "What 
is backpropagation?"}, {"role": "assistant", "content": "Backpropagation is how neural networks learn by going backwards."}], "reward": 0.3} +{"prompt": "Describe overfitting in machine learning.", "response": "Overfitting occurs when a model learns the training data too closely—capturing noise and idiosyncrasies—resulting in poor generalisation to unseen examples. Regularisation, dropout, and early stopping are common mitigations.", "reward": 0.95} +{"prompt": "Describe overfitting in machine learning.", "response": "Overfitting is when the model is too good at training data but bad at new data.", "reward": 0.4} diff --git a/ggml/include/ggml-opt.h b/ggml/include/ggml-opt.h index 1c2ed79b77..60774575f0 100644 --- a/ggml/include/ggml-opt.h +++ b/ggml/include/ggml-opt.h @@ -89,6 +89,7 @@ extern "C" { float beta2; // second AdamW momentum float eps; // epsilon for numerical stability float wd; // weight decay - 0.0f to disable + float gclip; // element-wise gradient clipping threshold - 0.0f to disable } adamw; struct { float alpha; // learning rate diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 566e271479..92a4ae6352 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -501,6 +501,7 @@ extern "C" { GGML_OP_MUL_MAT, GGML_OP_MUL_MAT_ID, GGML_OP_OUT_PROD, + GGML_OP_OUT_PROD_ID, // scattered outer-product for MUL_MAT_ID backward (MoE LoRA) GGML_OP_SCALE, GGML_OP_SET, @@ -1424,6 +1425,21 @@ extern "C" { struct ggml_tensor * a, struct ggml_tensor * b); + // Scattered outer-product for the MUL_MAT_ID backward pass (MoE LoRA gradient). 
+ // + // a: [cols, n_expert_used, n_tokens] F32 — activations + // b: [rows, n_expert_used, n_tokens] F32 — upstream gradient + // ids: [n_expert_used, n_tokens] I32 — expert dispatch indices + // result: [cols, rows, n_expert, 1] F32 + // + // result[:, :, e] += sum_{(i,t): ids[i,t]==e} a[:, i, t] ⊗ b[:, i, t] + GGML_API struct ggml_tensor * ggml_out_prod_id( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * ids, + int64_t n_expert); + // // operations on tensors without backpropagation // diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp index 331e071a26..adecf1427a 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp @@ -11034,7 +11034,7 @@ static void ggml_compute_forward_opt_step_adamw_f32( GGML_ASSERT(ggml_are_same_shape(src0, src0_grad)); GGML_ASSERT(ggml_are_same_shape(src0, src0_grad_m)); GGML_ASSERT(ggml_are_same_shape(src0, src0_grad_v)); - GGML_ASSERT(ggml_nelements(adamw_params) == 7); + GGML_ASSERT(ggml_nelements(adamw_params) == 8); const int ith = params->ith; const int nth = params->nth; @@ -11060,6 +11060,7 @@ static void ggml_compute_forward_opt_step_adamw_f32( const float wd = adamw_params_ptr[4]; const float beta1h = adamw_params_ptr[5]; const float beta2h = adamw_params_ptr[6]; + const float gclip = adamw_params_ptr[7]; // element-wise gradient clip (0 = disabled) const float keep = 1.f - alpha * wd; for (int ir = ir0; ir < ir1; ++ir) { const int64_t i03 = ir/(ne02*ne01); @@ -11074,8 +11075,10 @@ static void ggml_compute_forward_opt_step_adamw_f32( float * v = (float *) ((char *) src0_grad_v->data + offset); for (int i00 = 0; i00 < ne00; ++i00) { - m[i00] = m[i00]*beta1 + g[i00]*(1.0f - beta1); - v[i00] = v[i00]*beta2 + g[i00]*g[i00]*(1.0f - beta2); + const float gi = (gclip > 0.0f) ? 
fmaxf(-gclip, fminf(gclip, g[i00])) : g[i00]; + + m[i00] = m[i00]*beta1 + gi*(1.0f - beta1); + v[i00] = v[i00]*beta2 + gi*gi*(1.0f - beta2); const float mh = m[i00]*beta1h; const float vh = sqrtf(v[i00]*beta2h) + eps; diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index cda275b8c5..3e1cca6c98 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -2642,6 +2642,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg case GGML_OP_OUT_PROD: ggml_cuda_out_prod(ctx, dst); break; + case GGML_OP_OUT_PROD_ID: + ggml_cuda_out_prod_id(ctx, dst); + break; case GGML_OP_SCALE: ggml_cuda_op_scale(ctx, dst); break; @@ -4772,6 +4775,12 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g } break; case GGML_OP_OUT_PROD: return op->type == GGML_TYPE_F32 && op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32; + case GGML_OP_OUT_PROD_ID: + return op->src[0] != nullptr && op->src[1] != nullptr && op->src[2] != nullptr + && op->type == GGML_TYPE_F32 + && op->src[0]->type == GGML_TYPE_F32 + && op->src[1]->type == GGML_TYPE_F32 + && op->src[2]->type == GGML_TYPE_I32; case GGML_OP_GET_ROWS: { switch (op->src[0]->type) { diff --git a/ggml/src/ggml-cuda/opt-step-adamw.cu b/ggml/src/ggml-cuda/opt-step-adamw.cu index 35154f2996..dd2c1edf3a 100644 --- a/ggml/src/ggml-cuda/opt-step-adamw.cu +++ b/ggml/src/ggml-cuda/opt-step-adamw.cu @@ -20,8 +20,9 @@ static __global__ void opt_step_adamw_f32( const float wd = pars[4]; const float beta1h = pars[5]; const float beta2h = pars[6]; + const float gclip = pars[7]; // element-wise gradient clip (0 = disabled) - const float gi = g[i]; + const float gi = (gclip > 0.0f) ? 
fmaxf(-gclip, fminf(gclip, g[i])) : g[i];
 const float gmi = g_m[i]*beta1 + gi*(1.0f - beta1);
 const float gvi = g_v[i]*beta2 + gi*gi*(1.0f - beta2);
@@ -62,7 +63,7 @@ void ggml_cuda_opt_step_adamw(ggml_backend_cuda_context & ctx, ggml_tensor * dst
 GGML_ASSERT(ggml_are_same_shape(src0, src0_grad));
 GGML_ASSERT(ggml_are_same_shape(src0, src0_grad_m));
 GGML_ASSERT(ggml_are_same_shape(src0, src0_grad_v));
- GGML_ASSERT(ggml_nelements(adamw_params) == 7);
+ GGML_ASSERT(ggml_nelements(adamw_params) == 8);
 float * src0_d = (float *) src0->data;
 const float * src0_grad_d = (const float *) src0_grad->data;
diff --git a/ggml/src/ggml-cuda/out-prod.cu b/ggml/src/ggml-cuda/out-prod.cu
index c9b2b699c6..9afc323bd9 100644
--- a/ggml/src/ggml-cuda/out-prod.cu
+++ b/ggml/src/ggml-cuda/out-prod.cu
@@ -1,6 +1,8 @@
#include "out-prod.cuh"
#include <cstdint>
+#include <cstring>
+#include <vector>
void ggml_cuda_out_prod(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
 const ggml_tensor * src0 = dst->src[0];
@@ -66,3 +68,115 @@ void ggml_cuda_out_prod(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
 }
 }
}
+
+// ggml_cuda_out_prod_id
+//
+// Scattered outer-product for the MUL_MAT_ID backward pass (gradient w.r.t. expert weights).
+//
+// src0 = a [cols, n_expert_used, n_tokens] F32 — token activations
+// src1 = b [rows, n_expert_used, n_tokens] F32 — upstream gradient
+// src2 = ids [n_expert_used, n_tokens] I32 — expert dispatch indices
+// dst [cols, rows, n_expert, 1] F32 — gradient w.r.t. expert weight matrices
+//
+// dst[:, :, e] += sum_{(i,t): ids[i,t]==e} a[:, i, t] ⊗ b[:, i, t]
+//
+// Algorithm:
+// For each expert e: gather the token columns where ids[i,t]==e into contiguous
+// GPU buffers, then use cublasSgemm (beta=1) to accumulate the outer product.
+// ids may be CPU-resident (common in backward graphs where they are leaf tensors).
+void ggml_cuda_out_prod_id(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+ const ggml_tensor * src0 = dst->src[0]; // a [cols, n_exp_used, n_tokens]
+ const ggml_tensor * src1 = dst->src[1]; // b [rows, n_exp_used, n_tokens]
+ const ggml_tensor * ids = dst->src[2]; // ids [n_exp_used, n_tokens] i32
+
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
+ GGML_ASSERT(src1->type == GGML_TYPE_F32);
+ GGML_ASSERT(ids->type == GGML_TYPE_I32);
+ GGML_ASSERT(dst->type == GGML_TYPE_F32);
+
+ const int64_t cols = src0->ne[0];
+ const int64_t n_exp_used = src0->ne[1];
+ const int64_t n_tokens = src0->ne[2];
+ const int64_t rows = src1->ne[0];
+ const int64_t n_expert = dst->ne[2];
+
+ cudaStream_t stream = ctx.stream();
+ cublasHandle_t handle = ctx.cublas_handle();
+ CUBLAS_CHECK(cublasSetStream(handle, stream));
+
+ // Zero destination tensor before accumulating
+ CUDA_CHECK(cudaMemsetAsync(dst->data, 0, ggml_nbytes(dst), stream));
+
+ // Read ids to host — ids may be CPU-resident (backward graph leaf) or GPU-resident
+ const size_t ids_nbytes = ggml_nbytes(ids);
+ std::vector<char> ids_host(ids_nbytes);
+ if (ids->buffer && !ggml_backend_buffer_is_host(ids->buffer)) {
+ // GPU-resident: copy to host and synchronize so we can inspect the values
+ CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids->data, ids_nbytes, cudaMemcpyDeviceToHost, stream));
+ CUDA_CHECK(cudaStreamSynchronize(stream));
+ } else {
+ memcpy(ids_host.data(), ids->data, ids_nbytes);
+ }
+
+ // Build per-expert token list: expert_tokens[e] = list of flat indices (iexp*n_tokens+itok)
+ // whose dispatch id equals e.
+ std::vector<std::vector<int64_t>> expert_tokens(n_expert);
+ for (int64_t itok = 0; itok < n_tokens; ++itok) {
+ for (int64_t iexp = 0; iexp < n_exp_used; ++iexp) {
+ const int32_t eid = *(const int32_t *)(ids_host.data()
+ + itok * ids->nb[1] + iexp * ids->nb[0]);
+ GGML_ASSERT(eid >= 0 && eid < (int32_t)n_expert);
+ expert_tokens[eid].push_back(iexp * n_tokens + itok);
+ }
+ }
+
+ // Strides (in elements, not bytes)
+ const int64_t a_stride_exp = src0->nb[1] / sizeof(float); // cols
+ const int64_t a_stride_tok = src0->nb[2] / sizeof(float); // cols * n_exp_used
+ const int64_t b_stride_exp = src1->nb[1] / sizeof(float); // rows
+ const int64_t b_stride_tok = src1->nb[2] / sizeof(float); // rows * n_exp_used
+ const int64_t dst_stride_e = dst->nb[2] / sizeof(float); // cols * rows
+
+ const float alpha_one = 1.0f;
+ const float beta_acc = 1.0f; // accumulate — dst is already zeroed above
+
+ const float * a_base = (const float *) src0->data;
+ const float * b_base = (const float *) src1->data;
+ float * d_base = (float *) dst->data;
+
+ for (int64_t e = 0; e < n_expert; ++e) {
+ const auto & toks = expert_tokens[e];
+ if (toks.empty()) {
+ continue;
+ }
+
+ const int64_t ntoks_e = (int64_t) toks.size();
+
+ // Allocate contiguous gather buffers on GPU: a_e [cols, ntoks_e], b_e [rows, ntoks_e]
+ ggml_cuda_pool_alloc<float> a_gathered(ctx.pool(), cols * ntoks_e);
+ ggml_cuda_pool_alloc<float> b_gathered(ctx.pool(), rows * ntoks_e);
+
+ // Gather token vectors from GPU src0/src1 into contiguous buffers
+ for (int64_t ti = 0; ti < ntoks_e; ++ti) {
+ const int64_t flat = toks[ti];
+ const int64_t iexp = flat / n_tokens;
+ const int64_t itok = flat % n_tokens;
+ CUDA_CHECK(cudaMemcpyAsync(
+ a_gathered.ptr + ti * cols,
+ a_base + iexp * a_stride_exp + itok * a_stride_tok,
+ cols * sizeof(float), cudaMemcpyDeviceToDevice, stream));
+ CUDA_CHECK(cudaMemcpyAsync(
+ b_gathered.ptr + ti * rows,
+ b_base + iexp * b_stride_exp + itok * b_stride_tok,
+ rows * sizeof(float), cudaMemcpyDeviceToDevice, stream));
+ } + + // dst[:, :, e] += a_gathered @ b_gathered^T + // cuBLAS column-major: A=[cols, ntoks_e] lda=cols, B=[rows, ntoks_e] ldb=rows + CUBLAS_CHECK(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_T, + (int)cols, (int)rows, (int)ntoks_e, + &alpha_one, a_gathered.ptr, (int)cols, + b_gathered.ptr, (int)rows, + &beta_acc, d_base + e*dst_stride_e, (int)cols)); + } +} diff --git a/ggml/src/ggml-cuda/out-prod.cuh b/ggml/src/ggml-cuda/out-prod.cuh index a0046f5f8f..993307e797 100644 --- a/ggml/src/ggml-cuda/out-prod.cuh +++ b/ggml/src/ggml-cuda/out-prod.cuh @@ -1,3 +1,10 @@ #include "common.cuh" void ggml_cuda_out_prod(ggml_backend_cuda_context & ctx, ggml_tensor * dst); + +// Scattered outer-product for MUL_MAT_ID backward (gradient w.r.t. expert weight matrices). +// src0: activations [cols, n_expert_used, n_tokens] F32 +// src1: grad_output [rows, n_expert_used, n_tokens] F32 +// src2: expert ids [n_expert_used, n_tokens] I32 (may be CPU-resident) +// dst: grad_weight [cols, rows, n_expert, 1] F32 +void ggml_cuda_out_prod_id(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/ggml/src/ggml-opt.cpp b/ggml/src/ggml-opt.cpp index e078ad14a3..e87fc79c25 100644 --- a/ggml/src/ggml-opt.cpp +++ b/ggml/src/ggml-opt.cpp @@ -230,6 +230,7 @@ struct ggml_opt_optimizer_params ggml_opt_get_default_optimizer_params(void * us result.adamw.beta2 = 0.999f; result.adamw.eps = 1e-8f; result.adamw.wd = 0.0f; + result.adamw.gclip = 0.0f; result.sgd.alpha = 1e-3f; result.sgd.wd = 0.0f; @@ -503,7 +504,7 @@ static void ggml_opt_build(ggml_opt_context_t opt_ctx) { // gb_opt == graph backward optimize, forward pass, then backward pass to calculate gradients, then optimizer step. opt_ctx->gb_opt = ggml_graph_dup(opt_ctx->ctx_compute, opt_ctx->gb_grad, /*force_grads =*/ true); - opt_ctx->opt_step_params = ggml_new_tensor_1d(opt_ctx->ctx_cpu, GGML_TYPE_F32, need_momenta ? 7 : 2); + opt_ctx->opt_step_params = ggml_new_tensor_1d(opt_ctx->ctx_cpu, GGML_TYPE_F32, need_momenta ? 
8 : 2); ggml_tensor * adamw_params = opt_ctx->opt_step_params; ggml_set_input(adamw_params); const char * optimizer_name = ggml_opt_optimizer_name(opt_ctx->optimizer); @@ -726,6 +727,17 @@ void ggml_opt_alloc(ggml_opt_context_t opt_ctx, bool backward) { if (opt_ctx->build_type == GGML_OPT_BUILD_TYPE_OPT && opt_ctx->opt_period > 1 && opt_ctx->opt_i == 0) { ggml_graph_reset(opt_ctx->gb_grad); } + + // For non-static graphs the compute graph is rebuilt every call, so ggml_graph_reset + // is not called and grad_accs may carry over values from the previous accumulation window. + // Explicitly zero them at the start of each gradient-accumulation cycle. + if (!opt_ctx->static_graphs && backward && opt_ctx->opt_i == 0) { + for (struct ggml_tensor * ga : opt_ctx->grad_accs) { + if (ga) { + ggml_set_zero(ga); + } + } + } if (backward) { const int32_t opt_i_next = (opt_ctx->opt_i + 1) % opt_ctx->opt_period; opt_ctx->build_type = opt_i_next == 0 ? GGML_OPT_BUILD_TYPE_OPT : GGML_OPT_BUILD_TYPE_GRAD; @@ -793,6 +805,7 @@ void ggml_opt_eval(ggml_opt_context_t opt_ctx, ggml_opt_result_t result) { GGML_ASSERT(opt_pars.adamw.eps >= 0.0f); GGML_ASSERT(opt_pars.adamw.wd >= 0.0f); GGML_ASSERT(opt_pars.adamw.wd <= 1.0f); + GGML_ASSERT(opt_pars.adamw.gclip >= 0.0f); // beta1, beta2 after applying warmup const float beta1h = 1.0f / (1.0f - powf(opt_pars.adamw.beta1, opt_ctx->iter)); @@ -806,6 +819,7 @@ void ggml_opt_eval(ggml_opt_context_t opt_ctx, ggml_opt_result_t result) { adamw_par_data[4] = opt_pars.adamw.wd; adamw_par_data[5] = beta1h; adamw_par_data[6] = beta2h; + adamw_par_data[7] = opt_pars.adamw.gclip; } break; case GGML_OPT_OPTIMIZER_TYPE_SGD: { GGML_ASSERT(opt_pars.sgd.alpha > 0.0f); diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index aeafc395d7..1e04911360 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -976,6 +976,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "MUL_MAT", "MUL_MAT_ID", "OUT_PROD", + "OUT_PROD_ID", "SCALE", "SET", @@ -1049,7 +1050,7 @@ static 
const char * GGML_OP_NAME[GGML_OP_COUNT] = { "GLU", }; -static_assert(GGML_OP_COUNT == 96, "GGML_OP_COUNT != 96"); +static_assert(GGML_OP_COUNT == 97, "GGML_OP_COUNT != 97"); static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "none", @@ -1086,6 +1087,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "X*Y", "X[i]*Y", "X*Y", + "X_id⊗Y_id", "x*v", "y-\\>view(x)", @@ -1159,7 +1161,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "glu(x)", }; -static_assert(GGML_OP_COUNT == 96, "GGML_OP_COUNT != 96"); +static_assert(GGML_OP_COUNT == 97, "GGML_OP_COUNT != 97"); static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2"); @@ -3293,6 +3295,44 @@ struct ggml_tensor * ggml_out_prod( return result; } +// ggml_out_prod_id +// +// Scattered outer-product for the MUL_MAT_ID backward pass. +// +// a: [cols, n_expert_used, n_tokens] F32 — activations (src1 of MUL_MAT_ID) +// b: [rows, n_expert_used, n_tokens] F32 — upstream gradient +// ids: [n_expert_used, n_tokens] I32 — expert dispatch indices (src2 of MUL_MAT_ID) +// result: [cols, rows, n_expert, 1] F32 +// +// result[:, :, e] += sum_{(i,t): ids[i,t]==e} a[:, i, t] ⊗ b[:, i, t] +// +// Computes the gradient w.r.t. the expert weight matrices (src0) of MUL_MAT_ID. 
+struct ggml_tensor * ggml_out_prod_id( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * ids, + int64_t n_expert) { + GGML_ASSERT(a->type == GGML_TYPE_F32); + GGML_ASSERT(b->type == GGML_TYPE_F32); + GGML_ASSERT(ids->type == GGML_TYPE_I32); + GGML_ASSERT(a->ne[1] == b->ne[1]); // n_expert_used matches + GGML_ASSERT(a->ne[2] == b->ne[2]); // n_tokens matches + GGML_ASSERT(ids->ne[0] == a->ne[1]); // n_expert_used matches ids + GGML_ASSERT(ids->ne[1] == a->ne[2]); // n_tokens matches ids + GGML_ASSERT(n_expert > 0); + + const int64_t ne[4] = { a->ne[0], b->ne[0], n_expert, 1 }; + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); + + result->op = GGML_OP_OUT_PROD_ID; + result->src[0] = a; + result->src[1] = b; + result->src[2] = ids; + + return result; +} + // ggml_scale static struct ggml_tensor * ggml_scale_impl( @@ -6518,6 +6558,35 @@ static void ggml_compute_backward( grad))); // [m,p,qq,rr] } } break; + case GGML_OP_MUL_MAT_ID: { + // Backward pass for indirect matrix multiplication (MoE). + // + // Forward: dst[rows, n_exp_used, n_tokens] = as[:,:,ids[i,t]] @ b[:,i,t] + // src0 = as [cols, rows, n_expert] — expert weight matrices + // src1 = b [cols, n_exp_used, n_tokens] — token activations + // src2 = ids [n_exp_used, n_tokens] — expert dispatch indices (I32) + // + // Gradient w.r.t. src1 (activations): + // grad_b[:,i,t] = as[:,:,ids[i,t]]^T @ grad[:,i,t] + // → computed via MUL_MAT_ID with transposed as + // + // Gradient w.r.t. src0 (expert weights, only when F32 i.e. LoRA): + // grad_as[:,:,e] += sum_{(i,t): ids[i,t]==e} b[:,i,t] ⊗ grad[:,i,t] + // → computed via OUT_PROD_ID + // + // Quantized src0 is frozen (stop-gradient) — handled in grads_needed below. 
+ if (src0_needs_grads) { + const int64_t n_expert = src0->ne[2]; + struct ggml_tensor * grad_as = ggml_out_prod_id(ctx, src1, grad, src2, n_expert); + ggml_add_or_set(ctx, cgraph, isrc0, grad_as); + } + if (src1_needs_grads) { + // Transpose expert matrices: as [cols, rows, n_expert] → as_T [rows, cols, n_expert] + struct ggml_tensor * as_T = ggml_cont(ctx, ggml_permute(ctx, src0, 1, 0, 2, 3)); + struct ggml_tensor * grad_b = ggml_mul_mat_id(ctx, as_T, grad, src2); + ggml_add_or_set(ctx, cgraph, isrc1, grad_b); + } + } break; case GGML_OP_SCALE: { if (src0_needs_grads) { float s; @@ -6964,6 +7033,35 @@ void ggml_build_backward_expand( ignore_src[1] = true; break; + // MUL_MAT_ID: expert dispatch indices (src2) are integer — no gradient. + // When src0 is quantized the expert weights are frozen, so stop gradient through + // both src0 and src1 (activations have no path to loss without differentiable weights). + case GGML_OP_MUL_MAT_ID: + if (ggml_is_quantized(node->src[0]->type)) { + ignore_src[0] = true; + ignore_src[1] = true; + } + ignore_src[2] = true; // ids: integer tensor + break; + + // SET_ROWS is a KV-cache scatter write. The gradient of the written data flows + // through the attention read path (GET_ROWS backward), not through this node. + case GGML_OP_SET_ROWS: + ignore_src[0] = true; + ignore_src[1] = true; + break; + + // Ops with no backward implementation — stop gradient through all sources so the + // backward graph builder never tries to propagate through them. 
+ case GGML_OP_SSM_CONV: // Mamba causal conv1d + case GGML_OP_SSM_SCAN: // Mamba selective scan + case GGML_OP_FLASH_ATTN_EXT: // use standard attention for training + ignore_src[0] = true; + ignore_src[1] = true; + ignore_src[2] = true; + ignore_src[3] = true; + break; + default: break; } From 68d9f10057a47e5fc651784c36c0fcbbe5b26ec1 Mon Sep 17 00:00:00 2001 From: Salvatore Rossitto Date: Wed, 11 Mar 2026 14:55:38 +0100 Subject: [PATCH 02/15] added moe experts profiling and pruning --- tools/CMakeLists.txt | 1 + tools/expert-profile/CMakeLists.txt | 8 + tools/expert-profile/expert-profile.cpp | 506 +++++++++++++++++++++ tools/moe-pruning/README.md | 97 ++++ tools/moe-pruning/analyze_stats.py | 284 ++++++++++++ tools/moe-pruning/build_expert_profile.sh | 42 ++ tools/moe-pruning/extract_ppl.py | 41 ++ tools/moe-pruning/gguf_prune.py | 258 +++++++++++ tools/moe-pruning/nemotron_reap.py | 296 ++++++++++++ tools/moe-pruning/requirements.txt | 1 + tools/moe-pruning/sample_calibration.jsonl | 8 + 11 files changed, 1542 insertions(+) create mode 100644 tools/expert-profile/CMakeLists.txt create mode 100644 tools/expert-profile/expert-profile.cpp create mode 100644 tools/moe-pruning/README.md create mode 100644 tools/moe-pruning/analyze_stats.py create mode 100644 tools/moe-pruning/build_expert_profile.sh create mode 100644 tools/moe-pruning/extract_ppl.py create mode 100644 tools/moe-pruning/gguf_prune.py create mode 100644 tools/moe-pruning/nemotron_reap.py create mode 100644 tools/moe-pruning/requirements.txt create mode 100644 tools/moe-pruning/sample_calibration.jsonl diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt index b433c91d85..0bc2ad34c1 100644 --- a/tools/CMakeLists.txt +++ b/tools/CMakeLists.txt @@ -39,4 +39,5 @@ else() endif() add_subdirectory(fit-params) add_subdirectory(results) + add_subdirectory(expert-profile) endif() diff --git a/tools/expert-profile/CMakeLists.txt b/tools/expert-profile/CMakeLists.txt new file mode 100644 index 
0000000000..859bd77a53 --- /dev/null +++ b/tools/expert-profile/CMakeLists.txt @@ -0,0 +1,8 @@ +set(TARGET llama-expert-profile) +add_executable(${TARGET} expert-profile.cpp) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_17) + +if(LLAMA_TOOLS_INSTALL) + install(TARGETS ${TARGET} RUNTIME) +endif() diff --git a/tools/expert-profile/expert-profile.cpp b/tools/expert-profile/expert-profile.cpp new file mode 100644 index 0000000000..de381ff1f1 --- /dev/null +++ b/tools/expert-profile/expert-profile.cpp @@ -0,0 +1,506 @@ +/** + * expert-profile: NemotronH MoE expert activation profiler (REAP implementation) + * + * Implements the REAP (Router-weighted Expert Activation Pruning) saliency criterion: + * + * REAP(j) = mean over tokens routed to j of: gate_weight(j,t) * ||expert_output(j,t)||_2 + * + * where expert_output is ffn_moe_down (the FFN output BEFORE gate weighting), + * and gate_weight is ffn_moe_weights (post-softmax routing probability). 
+ *
+ * Intercepts three tensors per MoE layer via ggml eval callback:
+ *   ffn_moe_topk-{il}     [n_expert_used, n_tokens]          I32 — which experts were selected
+ *   ffn_moe_weights-{il}  [1, n_expert_used, n_tokens]       F32 — gate weights (softmax probs)
+ *   ffn_moe_down-{il}     [n_embd, n_expert_used, n_tokens]  F32 — expert outputs (pre-weighting)
+ *
+ * Reference: "REAP: Router-weighted Expert Activation Pruning" (arXiv:2510.13999)
+ *   score = mean_{x in X_j}[ g_j(x) * ||f_j(x)||_2 ]   (Equation 9)
+ *
+ * Usage:
+ *   llama-expert-profile \
+ *     -m model.gguf --jsonl training-data.jsonl --output expert_stats.json \
+ *     [--n-experts 128] [--ctx-size 16384] [-ngl 32] [-t 24] [--save-every 1]
+ */
+
+#include "arg.h"
+#include "common.h"
+#include "log.h"
+#include "llama.h"
+#include "ggml-backend.h"
+
+#include <algorithm>
+#include <cmath>
+#include <cstdint>
+#include <cstdio>
+#include <cstring>
+#include <fstream>
+#include <map>
+#include <mutex>
+#include <vector>
+
+// ─── Per-layer stats ──────────────────────────────────────────────────────────
+
+struct LayerStats {
+    int64_t n_experts    = 0;
+    int64_t total_tokens = 0; // tokens processed through this layer
+
+    // Frequency / weighted-frequency (kept for reference/comparison)
+    std::vector<int64_t> activation_counts; // [n_experts] — how many tokens routed here
+    std::vector<double>  weighted_freq_sum; // [n_experts] — sum of gate weights
+
+    // REAP: running sum and count for computing mean(gate_weight * ||expert_out||_2)
+    std::vector<double> reap_sum; // [n_experts] — sum of g_j(t)*||f_j(t)||_2
+    std::vector<double> ean_sum;  // [n_experts] — sum of ||f_j(t)||_2 (EAN, no gate)
+
+    void init(int64_t n) {
+        n_experts = n;
+        activation_counts.assign(n, 0);
+        weighted_freq_sum.assign(n, 0.0);
+        reap_sum.assign(n, 0.0);
+        ean_sum.assign(n, 0.0);
+    }
+
+    // Called once we have all three tensors for a batch.
+ // expert_ids: [n_expert_used * n_tokens] I32 — flat, column-major: [k + t*n_expert_used] + // gate_weights:[n_expert_used * n_tokens] F32 — same layout + // expert_outs: [n_embd * n_expert_used * n_tokens] F32 — layout: [e + k*n_embd + t*n_embd*n_expert_used] + // i.e. for token t, expert-slot k: out vector starts at t*n_embd*n_expert_used + k*n_embd + void add_batch(const int32_t * expert_ids, + const float * gate_weights, + const float * expert_outs, + int64_t n_expert_used, + int64_t n_tok, + int64_t n_embd) { + total_tokens += n_tok; + for (int64_t t = 0; t < n_tok; ++t) { + for (int64_t k = 0; k < n_expert_used; ++k) { + const int64_t flat = k + t * n_expert_used; + const int32_t eid = expert_ids[flat]; + if (eid < 0 || eid >= n_experts) continue; + + const float gw = gate_weights[flat]; + + // L2 norm of expert output vector for this (token, expert-slot) + const float * vec = expert_outs + t * n_embd * n_expert_used + k * n_embd; + double norm2 = 0.0; + for (int64_t d = 0; d < n_embd; ++d) { + norm2 += (double)vec[d] * (double)vec[d]; + } + const double norm = std::sqrt(norm2); + + activation_counts [eid] += 1; + weighted_freq_sum [eid] += gw; + reap_sum [eid] += gw * norm; // REAP numerator + ean_sum [eid] += norm; // EAN numerator + } + } + } +}; + +// ─── Collector ──────────────────────────────────────────────────────────────── + +struct ExpertCollector { + int64_t n_experts = 128; + + std::map layer_stats; + std::mutex mtx; + + // We need all three tensors before we can compute REAP. + // They arrive in order: topk → weights → down (per the graph build order). + // Store pending topk+weights until down arrives. 
+ struct PendingBatch { + int64_t n_expert_used = 0; + int64_t n_tokens = 0; + std::vector expert_ids; // [n_expert_used * n_tokens] + std::vector gate_weights; // [n_expert_used * n_tokens] + bool has_topk = false; + bool has_weights = false; + }; + std::map pending; // layer_idx → pending + + // Strip device prefix/suffix: "CUDA0#ffn_moe_down-5#0" → "ffn_moe_down-5" + static std::string clean_name(const char * raw) { + const char * p = strchr(raw, '#'); + if (p) { + ++p; + const char * q = strchr(p, '#'); + return q ? std::string(p, q - p) : std::string(p); + } + return raw; + } + + bool wants(struct ggml_tensor * t) { + if (!t->name[0]) return false; + const std::string n = clean_name(t->name); + return (n.compare(0, 13, "ffn_moe_topk-") == 0 || + n.compare(0, 16, "ffn_moe_weights-") == 0 || + n.compare(0, 13, "ffn_moe_down-") == 0); + } + + bool on_tensor(struct ggml_tensor * t) { + const std::string name = clean_name(t->name); + + // Identify tensor type and layer + int il = -1; + bool is_topk = false; + bool is_weights = false; + bool is_down = false; + + if (name.compare(0, 13, "ffn_moe_topk-") == 0) { il = atoi(name.c_str() + 13); is_topk = true; } + else if (name.compare(0, 16, "ffn_moe_weights-") == 0) { il = atoi(name.c_str() + 16); is_weights = true; } + else if (name.compare(0, 13, "ffn_moe_down-") == 0) { il = atoi(name.c_str() + 13); is_down = true; } + else return true; + + if (il < 0) return true; + + // Copy tensor data from (possibly GPU) buffer to host + const size_t nbytes = ggml_nbytes(t); + std::vector buf(nbytes); + ggml_backend_tensor_get(t, buf.data(), 0, nbytes); + + std::lock_guard lk(mtx); + PendingBatch & pb = pending[il]; + + if (is_topk) { + // [n_expert_used, n_tokens] I32 + pb.n_expert_used = t->ne[0]; + pb.n_tokens = t->ne[1]; + pb.expert_ids.resize(pb.n_expert_used * pb.n_tokens); + memcpy(pb.expert_ids.data(), buf.data(), pb.n_expert_used * pb.n_tokens * sizeof(int32_t)); + pb.has_topk = true; + pb.has_weights = false; // reset 
in case of re-use + + } else if (is_weights) { + // [1, n_expert_used, n_tokens] F32 — flat layout same as topk + if (!pb.has_topk) return true; // shouldn't happen + pb.gate_weights.resize(pb.n_expert_used * pb.n_tokens); + memcpy(pb.gate_weights.data(), buf.data(), pb.n_expert_used * pb.n_tokens * sizeof(float)); + pb.has_weights = true; + + } else if (is_down) { + // [n_embd, n_expert_used, n_tokens] F32 + if (!pb.has_topk || !pb.has_weights) return true; + + const int64_t n_embd = t->ne[0]; + const int64_t n_expert_used = t->ne[1]; + const int64_t n_tokens = t->ne[2]; + + // Sanity check + if (n_expert_used != pb.n_expert_used || n_tokens != pb.n_tokens) { + LOG_ERR("expert-profile: dimension mismatch at layer %d\n", il); + pending.erase(il); + return true; + } + + // Ensure layer stats initialised + auto & ls = layer_stats[il]; + if (ls.n_experts == 0) ls.init(n_experts); + + const float * expert_outs = reinterpret_cast(buf.data()); + ls.add_batch(pb.expert_ids.data(), pb.gate_weights.data(), + expert_outs, n_expert_used, n_tokens, n_embd); + + // Done with this batch for this layer + pending.erase(il); + } + + return true; + } +}; + +// ─── Global collector + C callback ─────────────────────────────────────────── + +static ExpertCollector g_collector; + +static bool expert_eval_callback(struct ggml_tensor * t, bool ask, void * /*user_data*/) { + if (ask) return g_collector.wants(t); + return g_collector.on_tensor(t); +} + +// ─── JSON output ────────────────────────────────────────────────────────────── + +static void save_stats(const std::string & path) { + std::ofstream f(path); + if (!f) { + LOG_ERR("expert-profile: failed to open output file '%s'\n", path.c_str()); + return; + } + + f << "{\n"; + bool first_layer = true; + for (auto & [il, ls] : g_collector.layer_stats) { + if (!first_layer) f << ",\n"; + first_layer = false; + + f << " \"" << il << "\": {\n"; + f << " \"total_tokens\": " << ls.total_tokens << ",\n"; + + // activation_counts + f << " 
\"activation_counts\": ["; + for (int64_t i = 0; i < ls.n_experts; ++i) { + if (i) f << ", "; + f << ls.activation_counts[i]; + } + f << "],\n"; + + // activation_frequency + f << " \"activation_frequency\": ["; + for (int64_t i = 0; i < ls.n_experts; ++i) { + if (i) f << ", "; + f << ((ls.total_tokens > 0) ? (double)ls.activation_counts[i] / ls.total_tokens : 0.0); + } + f << "],\n"; + + // avg_gate_weight (weighted_freq_sum / activation_counts) + f << " \"avg_gate_weight\": ["; + for (int64_t i = 0; i < ls.n_experts; ++i) { + if (i) f << ", "; + f << ((ls.activation_counts[i] > 0) ? ls.weighted_freq_sum[i] / ls.activation_counts[i] : 0.0); + } + f << "],\n"; + + // ean_mean = ean_sum / activation_counts (EAN criterion, no gate weight) + f << " \"ean_mean\": ["; + for (int64_t i = 0; i < ls.n_experts; ++i) { + if (i) f << ", "; + f << ((ls.activation_counts[i] > 0) ? ls.ean_sum[i] / ls.activation_counts[i] : 0.0); + } + f << "],\n"; + + // reap = reap_sum / activation_counts (REAP criterion, Eq.9) + f << " \"reap\": ["; + for (int64_t i = 0; i < ls.n_experts; ++i) { + if (i) f << ", "; + f << ((ls.activation_counts[i] > 0) ? 
ls.reap_sum[i] / ls.activation_counts[i] : 0.0); + } + f << "],\n"; + + // never_activated + int64_t never = 0; + for (int64_t i = 0; i < ls.n_experts; ++i) { + if (ls.activation_counts[i] == 0) ++never; + } + f << " \"never_activated\": " << never << "\n"; + f << " }"; + } + f << "\n}\n"; + + LOG_INF("expert-profile: stats saved to '%s' (%zu MoE layers)\n", + path.c_str(), g_collector.layer_stats.size()); +} + +// ─── JSONL input ────────────────────────────────────────────────────────────── + +struct JsonPair { std::string prompt, response; }; + +static bool json_get_string(const std::string & line, const std::string & key, std::string & out) { + std::string search = "\"" + key + "\""; + size_t kpos = line.find(search); + if (kpos == std::string::npos) return false; + size_t colon = line.find(':', kpos + search.size()); + if (colon == std::string::npos) return false; + size_t q1 = line.find('"', colon + 1); + if (q1 == std::string::npos) return false; + out.clear(); + for (size_t i = q1 + 1; i < line.size(); ++i) { + if (line[i] == '\\' && i + 1 < line.size()) { + ++i; + switch (line[i]) { + case '"': out += '"'; break; + case '\\': out += '\\'; break; + case 'n': out += '\n'; break; + case 'r': out += '\r'; break; + case 't': out += '\t'; break; + default: out += line[i]; break; + } + } else if (line[i] == '"') { + return true; + } else { + out += line[i]; + } + } + return false; +} + +static std::vector load_jsonl(const std::string & path) { + std::vector pairs; + std::ifstream f(path); + if (!f) { LOG_ERR("expert-profile: cannot open JSONL file '%s'\n", path.c_str()); return pairs; } + std::string line; + while (std::getline(f, line)) { + if (line.empty()) continue; + JsonPair p; + json_get_string(line, "prompt", p.prompt); + json_get_string(line, "response", p.response); + if (!p.prompt.empty() || !p.response.empty()) pairs.push_back(std::move(p)); + } + return pairs; +} + +// ─── Inference loop ─────────────────────────────────────────────────────────── + 
+static void run_inference(llama_context * ctx, + const llama_model * model, + const std::vector & pairs, + int max_tokens, + const std::string & output_path, + int save_every) { + const llama_vocab * vocab = llama_model_get_vocab(model); + const bool add_bos = llama_vocab_get_add_bos(vocab); + + llama_batch batch = llama_batch_init(max_tokens, 0, 1); + + for (size_t pi = 0; pi < pairs.size(); ++pi) { + const std::string text = pairs[pi].prompt + "\n" + pairs[pi].response; + + std::vector tokens = common_tokenize(ctx, text, add_bos, true); + if ((int)tokens.size() > max_tokens) tokens.resize(max_tokens); + if (tokens.empty()) continue; + + LOG_INF(" [%zu/%zu] %zu tokens\n", pi + 1, pairs.size(), tokens.size()); + + llama_memory_clear(llama_get_memory(ctx), true); + + common_batch_clear(batch); + for (int i = 0; i < (int)tokens.size(); ++i) { + common_batch_add(batch, tokens[i], i, {0}, false); + } + batch.logits[batch.n_tokens - 1] = true; + + if (llama_decode(ctx, batch) != 0) { + LOG_ERR(" [%zu/%zu] llama_decode failed — skipping\n", pi + 1, pairs.size()); + } + + if (save_every > 0 && (pi + 1) % save_every == 0) { + save_stats(output_path); + } + } + + llama_batch_free(batch); +} + +// ─── CLI ────────────────────────────────────────────────────────────────────── + +int main(int argc, char ** argv) { + std::string model_path; + std::string jsonl_path; + std::string output_path = "expert_stats.json"; + int n_experts = 128; + int ctx_size = 2048; + int n_gpu_layers = 99; + int n_threads = 4; + int save_every = 100; + enum ggml_type kv_type_k = GGML_TYPE_F16; + enum ggml_type kv_type_v = GGML_TYPE_F16; + + auto parse_ggml_type = [](const char * s) -> enum ggml_type { + if (strcmp(s, "f32") == 0) return GGML_TYPE_F32; + if (strcmp(s, "f16") == 0) return GGML_TYPE_F16; + if (strcmp(s, "q8_0") == 0) return GGML_TYPE_Q8_0; + if (strcmp(s, "q4_0") == 0) return GGML_TYPE_Q4_0; + fprintf(stderr, "Unknown KV type '%s', using f16\n", s); return GGML_TYPE_F16; + }; + + for 
(int i = 1; i < argc; ++i) { + std::string a(argv[i]); + auto next = [&]() -> const char * { + if (i + 1 >= argc) { fprintf(stderr, "Missing argument for %s\n", argv[i]); exit(1); } + return argv[++i]; + }; + if (a == "-m" || a == "--model") model_path = next(); + else if (a == "--jsonl") jsonl_path = next(); + else if (a == "--output") output_path = next(); + else if (a == "--n-experts") n_experts = atoi(next()); + else if (a == "--ctx-size" || a == "-c") ctx_size = atoi(next()); + else if (a == "-ngl" || a == "--n-gpu-layers") n_gpu_layers = atoi(next()); + else if (a == "-t" || a == "--threads") n_threads = atoi(next()); + else if (a == "--type-k") kv_type_k = parse_ggml_type(next()); + else if (a == "--type-v") kv_type_v = parse_ggml_type(next()); + else if (a == "--save-every") save_every = atoi(next()); + else if (a == "-h" || a == "--help") { + fprintf(stderr, + "\nUsage: %s -m model.gguf --jsonl data.jsonl [options]\n" + " --output PATH Output JSON (default: expert_stats.json)\n" + " --n-experts N Experts per layer (default: 128)\n" + " --ctx-size N Context length (default: 2048)\n" + " -ngl N GPU layers (default: 99)\n" + " -t N CPU threads (default: 4)\n" + " --type-k/v TYPE KV cache type: f32/f16/q8_0/q4_0 (default: f16)\n" + " --save-every N Checkpoint every N samples (default: 100)\n\n", argv[0]); + return 0; + } else { + fprintf(stderr, "Unknown argument: %s\n", a.c_str()); return 1; + } + } + + if (model_path.empty()) { fprintf(stderr, "Error: -m required\n"); return 1; } + if (jsonl_path.empty()) { fprintf(stderr, "Error: --jsonl required\n"); return 1; } + + g_collector.n_experts = n_experts; + + LOG_INF("expert-profile: model = %s\n", model_path.c_str()); + LOG_INF("expert-profile: jsonl = %s\n", jsonl_path.c_str()); + LOG_INF("expert-profile: output = %s\n", output_path.c_str()); + LOG_INF("expert-profile: n_experts = %d\n", n_experts); + LOG_INF("expert-profile: ctx_size = %d\n", ctx_size); + LOG_INF("expert-profile: ngl = %d\n", n_gpu_layers); 
+ LOG_INF("expert-profile: criterion = REAP (gate_weight * ||expert_out||_2)\n"); + + auto pairs = load_jsonl(jsonl_path); + if (pairs.empty()) { LOG_ERR("expert-profile: no pairs loaded\n"); return 1; } + LOG_INF("expert-profile: loaded %zu pairs\n", pairs.size()); + + llama_backend_init(); + + // Suppress INFO/WARN spam (CUDA graph warmup etc.), only pass errors through + llama_log_set([](enum ggml_log_level level, const char * text, void *) { + if (level >= GGML_LOG_LEVEL_ERROR) fputs(text, stderr); + }, nullptr); + + llama_model_params mparams = llama_model_default_params(); + mparams.n_gpu_layers = n_gpu_layers; + + llama_model * model = llama_model_load_from_file(model_path.c_str(), mparams); + if (!model) { LOG_ERR("expert-profile: failed to load model\n"); return 1; } + + llama_context_params cparams = llama_context_default_params(); + cparams.n_ctx = ctx_size; + cparams.n_batch = ctx_size; + cparams.n_ubatch = std::min(ctx_size, 512); + cparams.n_threads = n_threads; + cparams.type_k = kv_type_k; + cparams.type_v = kv_type_v; + cparams.cb_eval = expert_eval_callback; + cparams.cb_eval_user_data = nullptr; + + llama_context * ctx = llama_init_from_model(model, cparams); + if (!ctx) { LOG_ERR("expert-profile: failed to create context\n"); return 1; } + + LOG_INF("expert-profile: running forward passes...\n"); + run_inference(ctx, model, pairs, ctx_size, output_path, save_every); + save_stats(output_path); + + // Summary + LOG_INF("\n MoE layers profiled: %zu\n", g_collector.layer_stats.size()); + for (auto & [il, ls] : g_collector.layer_stats) { + // Find top and bottom REAP expert + int64_t top_e = 0, bot_e = 0; + double top_v = 0.0, bot_v = 1e18; + for (int64_t i = 0; i < ls.n_experts; ++i) { + double v = (ls.activation_counts[i] > 0) ? 
ls.reap_sum[i] / ls.activation_counts[i] : 0.0; + if (v > top_v) { top_v = v; top_e = i; } + if (v < bot_v) { bot_v = v; bot_e = i; } + } + int64_t never = 0; + for (int64_t i = 0; i < ls.n_experts; ++i) + if (ls.activation_counts[i] == 0) ++never; + LOG_INF(" Layer %3d: tokens=%lld never=%lld reap_top=e%lld(%.4f) reap_bot=e%lld(%.4f)\n", + il, (long long)ls.total_tokens, (long long)never, + (long long)top_e, top_v, (long long)bot_e, bot_v); + } + + llama_free(ctx); + llama_model_free(model); + llama_backend_free(); + return 0; +} diff --git a/tools/moe-pruning/README.md b/tools/moe-pruning/README.md new file mode 100644 index 0000000000..a88499ac43 --- /dev/null +++ b/tools/moe-pruning/README.md @@ -0,0 +1,97 @@ +# MoE Expert Pruning Tools for NemotronH + +REAP-style expert pruning for `NVIDIA-Nemotron-3-Nano-30B-A3B` (and other +NemotronH MoE models), implemented in two complementary ways: + +1. **`tools/expert-profile/`** — C++ profiler built into llama.cpp, collects + REAP scores directly from GGUF inference via the ggml eval callback. +2. **`tools/moe-pruning/`** (this directory) — Python scripts to prune the model + using the collected scores, either on a GGUF file directly or on a + HuggingFace BF16 checkpoint. + +--- + +## Inspiration & Prior Art + +This work is a direct implementation of the **REAP** saliency criterion +introduced in: + +> **REAP the Experts: Why Pruning Prevails for One-Shot MoE Compression** +> Mike Lasby, Ivan Lazarevich, Nish Sinnadurai, Sean Lie, Yani Ioannou, Vithursan Thangarasa +> Cerebras Research, 2025 +> arXiv: https://arxiv.org/abs/2510.13999 +> Code: https://github.com/CerebrasResearch/reap + +The REAP score for expert `j` is (Equation 9 of the paper): + +``` +REAP(j) = mean_{t : j ∈ topk(t)} [ g_j(t) · ‖f_j(t)‖₂ ] +``` + +where `g_j(t)` is the router gate weight and `f_j(t)` is the expert FFN output +(pre-weighting) for token `t`. Experts with the lowest REAP score contribute +least to the layer output and are pruned first. 
+ +The original REAP repo targets HuggingFace models via PyTorch hooks on +standard architectures (Qwen3-MoE, Mixtral, DeepSeek-V2, Llama-4, …). + +**What we added / adapted:** + +- `tools/expert-profile/expert-profile.cpp` — llama.cpp C++ implementation + of REAP that intercepts `ffn_moe_topk`, `ffn_moe_weights`, and `ffn_moe_down` + tensors via `ggml_backend_eval_callback`, enabling REAP profiling on any + GGUF-quantised model (Q4_K_M, Q6_K, etc.) without needing full BF16 VRAM. + +- `gguf_prune.py` — prunes the GGUF file **directly**, slicing the expert axis + of the stacked weight tensors (`ffn_up_exps`, `ffn_down_exps`, `ffn_gate_inp`, + `ffn_exp_probs_b`) and patching `{arch}.expert_count` in the metadata. + Quantised blocks are preserved as raw bytes — no dequantise/requantise step. + +- `nemotron_reap.py` — HuggingFace-based alternative: profiles with 4-bit NF4 + on GPU (phase 1) and prunes the BF16 checkpoint on CPU (phase 2). Adds + NemotronH (`NemotronHForCausalLM`) support that the original REAP repo does + not have. + +--- + +## Recommended Workflow (low-VRAM, e.g. RTX 4060 Ti 16 GB) + +``` +┌─────────────────────────────────────────────┐ +│ Phase 1 — Profile (GPU, GGUF Q4, ~15 GB) │ +│ │ +│ llama-expert-profile │ +│ -m nemotron-Q4_K_M.gguf │ +│ --jsonl sample_calibration.jsonl │ +│ --output expert_stats.json │ +│ -ngl 99 --ctx-size 2048 │ +└───────────────────┬─────────────────────────┘ + │ expert_stats.json +┌───────────────────▼─────────────────────────┐ +│ Phase 2 — Prune (CPU, pure Python, ~2 GB) │ +│ │ +│ python gguf_prune.py │ +│ --input nemotron-Q4_K_M.gguf │ +│ --stats expert_stats.json │ +│ --output nemotron-pruned-26e.gguf │ +│ --keep_ratio 0.20 # 26/128 experts │ +└─────────────────────────────────────────────┘ +``` + +At 20 % keep ratio a ~22 GB Q4_K_M becomes ~4.5 GB. 
+ +--- + +## Files + +| File | Description | +|---|---| +| `gguf_prune.py` | GGUF-native pruner — no GPU needed, preserves quantisation | +| `nemotron_reap.py` | HF-based pruner — 4-bit GPU profile + CPU BF16 prune | +| `build_expert_profile.sh` | Build script for `llama-expert-profile` | +| `run_nemotron_profile.sh` | Example profiling run | +| `run_prune.sh` | Example pruning run | +| `run_convert_quantize.sh` | Convert HF → GGUF and quantise | +| `analyze_stats.py` | Visualise and compare expert stats JSON files | +| `sample_calibration.jsonl` | Sample calibration data (prompt+response pairs) | +| `expert_stats_reap.json` | Example stats output from expert-profile | diff --git a/tools/moe-pruning/analyze_stats.py b/tools/moe-pruning/analyze_stats.py new file mode 100644 index 0000000000..e7641a3bb6 --- /dev/null +++ b/tools/moe-pruning/analyze_stats.py @@ -0,0 +1,284 @@ +#!/usr/bin/env python3 +""" +analyze_stats.py -- Summarize expert_stats.json and model size projections. +Usage: python analyze_stats.py [stats_file] [--keep 0.5] +""" +import json, sys, statistics, argparse + +parser = argparse.ArgumentParser() +parser.add_argument("stats", nargs="?", default="expert_stats_reap.json") +parser.add_argument("--keep", type=float, default=0.5, help="Fraction of experts to keep (default 0.5)") +args = parser.parse_args() + +with open(args.stats) as f: + data = json.load(f) + +layers = sorted(data.keys(), key=int) +n_layers = len(layers) +keep_ratio = args.keep + +# Detect which scoring field is available (new REAP vs old importance_score) +sample_layer = data[layers[0]] +if "reap" in sample_layer: + score_field = "reap" + score_label = "REAP (gate_weight × ||expert_out||₂)" +elif "importance_score" in sample_layer: + score_field = "importance_score" + score_label = "importance_score (freq × avg_gate_weight) [legacy, no EAN]" +else: + raise ValueError(f"No recognised score field in stats. 
Keys: {list(sample_layer.keys())}") + +# ── Model architecture constants (Nemotron-3-Nano-30B-A3B) ────────────────── +N_EXPERTS = 128 +N_EXPERT_USED = 6 # top-k per token +N_MOE_LAYERS = 23 +N_TOTAL_LAYERS = 53 +# Approximate parameter counts (bf16, billions) +PARAMS_TOTAL_B = 30.0 +PARAMS_MOE_EXPERTS_B = 22.0 # bulk of MoE weight is in expert FFNs +PARAMS_NON_MOE_B = PARAMS_TOTAL_B - PARAMS_MOE_EXPERTS_B + +# ── Header ────────────────────────────────────────────────────────────────── +print("=" * 70) +print(f" Expert Stats Analysis | file: {args.stats}") +print("=" * 70) + +# ── Profiling completeness ─────────────────────────────────────────────────── +sample_tokens = list(data.values())[0]["total_tokens"] +# Each token activates N_EXPERT_USED experts, sum(activation_counts) = total*top_k +# Approximate samples: total_tokens / avg_tokens_per_sample +# We don't know avg, but can infer: total_tokens / (total_tokens / ctx) ≈ ctx chunks +# Better: just report tokens and note the user knows sample count +print(f"\n── Profiling progress ──────────────────────────────────────────────────") +print(f" MoE layers profiled : {n_layers} / {N_MOE_LAYERS}") +print(f" Tokens processed : {sample_tokens:,} (per layer)") +act_sum = sum(data[layers[0]]["activation_counts"]) +assert abs(act_sum / sample_tokens - N_EXPERT_USED) < 0.01, "unexpected top-k" +print(f" top-k confirmed : {N_EXPERT_USED} (sum activations / tokens = {act_sum/sample_tokens:.1f})") + +# ── Per-layer importance score stats ──────────────────────────────────────── +print(f"\n── Per-layer score distribution [{score_label}]") +print(f" {'Layer':>5} {'Min':>9} {'Max':>9} {'Range':>9} {'CV%':>6} {'Never':>5}") +global_cvs = [] +for k in layers: + d = data[k] + s = d[score_field] + mn, mx = min(s), max(s) + cv = statistics.stdev(s) / statistics.mean(s) * 100 + global_cvs.append(cv) + print(f" {k:>5} {mn:>9.5f} {mx:>9.5f} {mx-mn:>9.5f} {cv:>6.3f}% {d['never_activated']:>5}") + +print(f"\n Mean CV across layers : 
{statistics.mean(global_cvs):.3f}%") +print(f" (CV < 1% = near-uniform; load-balancing is working as designed)") + +# ── Capacity loss sweep across pruning levels ──────────────────────────────── +# Paper (observer.py): REAP[i] = mean(ean_norm * softmax_router_weight) over tokens +# routed to expert i, averaged via OnlineStatsTracker weighted by expert_frequency. +# Our implementation (llama.cpp): same formula but routing weights are the top-k +# gate weights (post-softmax within top-k), not the full softmax over all 128. +# Impact: our weights are slightly higher than the paper's (renormalized to top-k +# only), but relative expert ranking within a layer should be preserved. +# +# IMPORTANT CAVEAT for this model (Nemotron-3-Nano-30B-A3B): +# The model was trained with a strong load-balancing auxiliary loss, so all 128 +# experts have nearly identical activation frequency (~4.69%) AND nearly identical +# REAP scores (Gini ~0.015, top/bottom ratio ~1.1-1.35x). The score distribution +# is a smooth monotone curve with NO natural elbow or gap. +# +# This means: +# - REAP ranking beats random pruning by only ~1pp in mass terms at keep=33% +# - The cut point boundary (rank 42 vs 43) has near-zero gap in most layers +# - REAP paper results on Qwen3-30B-A3B likely had higher Gini (less tight +# load-balancing or more expert specialization in pre-training) +# - For this model, actual quality loss must be measured via eval, not predicted +# from REAP score variance +# +# Metrics reported: +# - kept_mass%: REAP mass in the KEPT experts as % of total (> keep_ratio% = good) +# - vs_random%: how much more mass the REAP-selected set retains vs a random set +# of the same size (= kept_mass% - keep_ratio%). Positive = REAP wins. +# - Rel.gap: score gap at cut / layer score range. Near 0 = no natural cut point. +# - Gini: inequality of score distribution. ~0.015 here = near-uniform. 
+ +def gini(scores): + """Gini coefficient of a list of non-negative values.""" + n = len(scores) + s = sorted(scores) + total = sum(s) + if total == 0: + return 0.0 + cumsum = 0.0 + for i, v in enumerate(s): + cumsum += (2 * (i + 1) - n - 1) * v + return cumsum / (n * total) + +def layer_stats(scores, n_keep): + """Return capacity metrics for a single layer at a given keep count.""" + n = len(scores) + ranked = sorted(range(n), key=lambda i: scores[i], reverse=True) + total = sum(scores) + kept_mass = sum(scores[i] for i in ranked[:n_keep]) + kept_frac = kept_mass / total if total > 0 else 0.0 # fraction of REAP mass kept + random_frac = n_keep / n # uniform expectation + vs_random = kept_frac - random_frac # positive = REAP beats random + score_range = scores[ranked[0]] - scores[ranked[-1]] + gap = scores[ranked[n_keep - 1]] - (scores[ranked[n_keep]] if n_keep < n else 0) + rel_gap = gap / score_range if score_range > 0 else 0.0 + return kept_frac * 100, vs_random * 100, rel_gap + +# Sweep over a range of keep ratios +sweep_ratios = [0.10, 0.20, 0.25, 0.33, 0.40, 0.50, 0.60, 0.75] +if keep_ratio not in sweep_ratios: + sweep_ratios.append(keep_ratio) +sweep_ratios = sorted(set(sweep_ratios)) + +# Per-layer Gini (fixed, independent of keep ratio) +layer_ginis = {k: gini(data[k][score_field]) for k in layers} +mean_gini = statistics.mean(layer_ginis.values()) +worst_gini_layer = max(layer_ginis, key=lambda k: layer_ginis[k]) + +print(f"\n── Score distribution inequality (Gini coefficient) ────────────────────") +print(f" Gini measures how non-uniform REAP scores are within each layer.") +print(f" Gini=0: all experts identical. 
Gini=1: one expert dominates.") +print(f" With load-balanced MoE, Gini is small — but any Gini > 0 means") +print(f" REAP ranking beats random pruning.") +print(f"") +print(f" {'Layer':>5} {'Gini':>8} {'Score range':>13} {'Max/Min ratio':>14}") +print(f" {'-'*5} {'-'*8} {'-'*13} {'-'*14}") +for k in layers: + s = data[k][score_field] + mn, mx = min(s), max(s) + g = layer_ginis[k] + ratio_mm = mx / mn if mn > 0 else float('inf') + print(f" {k:>5} {g:>8.5f} {mx-mn:>13.5f} {ratio_mm:>13.3f}x") +print(f"") +print(f" Mean Gini : {mean_gini:.5f} (worst layer: {worst_gini_layer})") + +print(f"\n── Capacity retention sweep ─────────────────────────────────────────────") +print(f" Kept mass% = REAP mass in KEPT experts as % of total (higher = better)") +print(f" vs.rand% = Kept mass% minus uniform baseline (keep_ratio%)") +print(f" Positive = REAP beats random. Magnitude = advantage in pp.") +print(f" Rel.gap = score gap at cut / layer score range (higher = cleaner cut)") +print(f" WARNING: near-zero rel.gap and small vs.rand mean eval is the only ground truth.") +print(f"") +print(f" {'Keep':>5} {'Experts':>7} {'Kept mass%':>11} {'vs.rand%':>9} {'Rel.gap avg':>12} {'Worst layer':>11}") +print(f" {'-'*5} {'-'*7} {'-'*11} {'-'*9} {'-'*12} {'-'*11}") + +sweep_results = {} +for ratio in sweep_ratios: + nk = max(1, round(N_EXPERTS * ratio)) + mass_fracs, excesses, rel_gaps = [], [], [] + worst_excess, worst_layer_id = -999.0, None + for k in layers: + scores = data[k][score_field] + mf, exc, rg = layer_stats(scores, nk) + mass_fracs.append(mf) + excesses.append(exc) + rel_gaps.append(rg) + if exc > worst_excess: + worst_excess = exc + worst_layer_id = k + avg_mf = statistics.mean(mass_fracs) + avg_exc = statistics.mean(excesses) + avg_rg = statistics.mean(rel_gaps) + marker = " <--" if abs(ratio - keep_ratio) < 1e-9 else "" + print(f" {ratio:>5.0%} {nk:>7d} {avg_mf:>10.2f}% {avg_exc:>+9.2f}% {avg_rg:>11.4f} layer {worst_layer_id:>3}{marker}") + sweep_results[ratio] = { + 
"n_keep": nk, "avg_kept_mass": avg_mf, "avg_vs_random": avg_exc, + "avg_rel_gap": avg_rg, "worst_layer_id": worst_layer_id, "worst_vs_random": worst_excess, + } + +print(f"") +print(f" vs.rand% quantifies REAP's advantage over random pruning in REAP-mass terms.") +print(f" For this model it is small (+0.7 to +1.5pp) due to tight load-balancing.") +print(f" Rel.gap near zero means scores are smooth with no natural cut — any threshold") +print(f" is as defensible as another. Actual quality delta requires empirical eval.") + +# ── Expert keep/prune detail at selected keep_ratio ────────────────────────── +n_keep = max(1, round(N_EXPERTS * keep_ratio)) +n_prune = N_EXPERTS - n_keep + +print(f"\n── Expert pruning detail at keep_ratio={keep_ratio:.0%} ({n_keep} keep / {n_prune} prune per layer) ──") +print(f" {'Layer':>5} {'Kept mass%':>11} {'vs.rand%':>9} {'Rel.gap':>9} {'Min kept':>10} {'Max pruned':>11}") +print(f" {'-'*5} {'-'*11} {'-'*9} {'-'*9} {'-'*10} {'-'*11}") + +layer_results = {} +for k in layers: + scores = data[k][score_field] + ranked = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True) + mf, exc, rg = layer_stats(scores, n_keep) + min_kept = scores[ranked[n_keep - 1]] + max_pruned = scores[ranked[n_keep]] if n_prune > 0 else 0 + layer_results[k] = {"mass_frac": mf, "excess": exc, "rel_gap": rg, + "min_kept": min_kept, "max_pruned": max_pruned} + print(f" {k:>5} {mf:>10.2f}% {exc:>+9.2f}% {rg:>9.4f} {min_kept:>10.5f} {max_pruned:>11.5f}") + +avg_mf = statistics.mean(r["mass_frac"] for r in layer_results.values()) +avg_exc = statistics.mean(r["excess"] for r in layer_results.values()) +avg_rg = statistics.mean(r["rel_gap"] for r in layer_results.values()) +print(f" {'AVG':>5} {avg_mf:>10.2f}% {avg_exc:>+9.2f}% {avg_rg:>9.4f}") + +# ── Model size projections ─────────────────────────────────────────────────── +print(f"\n── Model size projections ──────────────────────────────────────────────") + +def model_size(keep): + expert_params = 
PARAMS_MOE_EXPERTS_B * keep + return PARAMS_NON_MOE_B + expert_params + +original_b = model_size(1.0) +pruned_b = model_size(keep_ratio) +reduction_pct = (1 - pruned_b / original_b) * 100 + +# GGUF sizes at common quant levels (rough: 1B params ≈ quant_bpw/8 GB) +quants = [("Q8_0", 8.0), ("Q5_K_M", 5.5), ("Q4_K_M", 4.5), ("Q3_K_M", 3.35), ("Q2_K", 2.63)] + +print(f" {'':20} {'Original':>10} {'Pruned':>10} {'Saved':>8}") +print(f" {'Parameters (B)':20} {original_b:>10.1f} {pruned_b:>10.1f} {original_b-pruned_b:>8.1f}B") +print(f" {'Reduction':20} {'':>10} {reduction_pct:>9.1f}%") +print() +print(f" Estimated GGUF sizes:") +print(f" {'Quant':10} {'Original':>10} {'Pruned':>10} {'Fits in':>12}") +for name, bpw in quants: + orig_gb = original_b * bpw / 8 + prune_gb = pruned_b * bpw / 8 + # VRAM fit (16GB GPU) + fits = "16GB GPU" if prune_gb <= 15.5 else ("32GB GPU" if prune_gb <= 31 else "CPU/RAM") + print(f" {name:10} {orig_gb:>9.1f}G {prune_gb:>9.1f}G {fits:>12}") + +# ── Active params per token (inference cost) ───────────────────────────────── +print(f"\n── Inference cost (active params per token) ────────────────────────────") +# Active params = non-moe + (n_expert_used/n_experts_kept * moe_expert_params) +# After pruning: router still picks top-k but from n_keep pool +# Active expert params per token = (N_EXPERT_USED / n_keep) * (PARAMS_MOE_EXPERTS_B * keep_ratio) +# But actually active params = N_EXPERT_USED * (params per single expert) +params_per_expert_orig = PARAMS_MOE_EXPERTS_B / N_EXPERTS # B per expert +params_per_expert_pruned = (PARAMS_MOE_EXPERTS_B * keep_ratio) / n_keep # same, just fewer experts + +active_orig = PARAMS_NON_MOE_B + N_EXPERT_USED * params_per_expert_orig * N_MOE_LAYERS / N_TOTAL_LAYERS +active_pruned = PARAMS_NON_MOE_B + N_EXPERT_USED * params_per_expert_pruned * N_MOE_LAYERS / N_TOTAL_LAYERS + +print(f" Original : {active_orig:.2f}B active params/token (same expert size, more choice)") +print(f" Pruned : {active_pruned:.2f}B active 
params/token (same — top-k still fires {N_EXPERT_USED} experts)")
+print(f"   Note: active params per token are IDENTICAL — pruning only reduces")
+print(f"   model file size and memory footprint, not per-token compute.")
+
+# ── Consistently low-importance experts ──────────────────────────────────────
+print(f"\n── Experts consistently ranked low across all layers ───────────────────")
+bottom_n = max(1, round(N_EXPERTS * 0.10))  # bottom 10%
+low_count = {}
+for k in layers:
+    scores = data[k][score_field]
+    ranked = sorted(range(len(scores)), key=lambda i: scores[i])
+    for eid in ranked[:bottom_n]:
+        low_count[eid] = low_count.get(eid, 0) + 1
+
+consistent = sorted(low_count.items(), key=lambda x: -x[1])
+consistent = [(eid, cnt) for eid, cnt in consistent if cnt >= 3]
+print(f"   (bottom 10% in >= 3 layers — most dispensable experts globally)")
+print(f"   Expert ID : layers in bottom 10%")
+for eid, cnt in consistent[:20]:
+    bar = "█" * cnt
+    print(f"   Expert {eid:>3} : {cnt:>2}/{n_layers}  {bar}")
+
+print()
+print("=" * 70)
diff --git a/tools/moe-pruning/build_expert_profile.sh b/tools/moe-pruning/build_expert_profile.sh
new file mode 100644
index 0000000000..0b39604426
--- /dev/null
+++ b/tools/moe-pruning/build_expert_profile.sh
@@ -0,0 +1,42 @@
+#!/usr/bin/env bash
+# build_expert_profile.sh
+# Builds llama.cpp with the expert-profile tool in WSL2 with CUDA.
+# Run this from the tools/moe-pruning/ directory: bash build_expert_profile.sh
+
+set -e
+
+LLAMA_SRC="$(cd ../.. && pwd)"  # absolute repo root; a relative "../.." would resolve wrongly after the cd below
+BUILD_DIR="$LLAMA_SRC/build_expert"
+
+echo "=== Building llama.cpp + expert-profile tool ==="
+echo "    Source : $LLAMA_SRC"
+echo "    Build  : $BUILD_DIR"
+
+mkdir -p "$BUILD_DIR"
+cd "$BUILD_DIR"
+
+# Configure with CUDA
+cmake "$LLAMA_SRC" \
+    -DCMAKE_BUILD_TYPE=Release \
+    -DGGML_CUDA=ON \
+    -DLLAMA_CURL=OFF \
+    -DLLAMA_BUILD_TESTS=OFF \
+    -DLLAMA_BUILD_EXAMPLES=OFF \
+    -DCMAKE_CUDA_ARCHITECTURES=86 \
+    2>&1 | tail -20
+
+# Build only the expert-profile target (fast)
+cmake --build . 
--target llama-expert-profile --config Release -j$(nproc) + +echo "" +echo "=== Build complete ===" +echo " Binary: $BUILD_DIR/tools/expert-profile/llama-expert-profile" +echo "" +echo "=== Usage ===" +echo " $BUILD_DIR/tools/expert-profile/llama-expert-profile \\" +echo " -m ~/nemotron-3-nano-30b-Q4_K_M.gguf \\" +echo " --jsonl ./sample_calibration.jsonl \\" +echo " --output ./expert_stats_reap.json \\" +echo " --n-experts 128 \\" +echo " --ctx-size 16384 \\" +echo " -ngl 99" diff --git a/tools/moe-pruning/extract_ppl.py b/tools/moe-pruning/extract_ppl.py new file mode 100644 index 0000000000..972a32e99d --- /dev/null +++ b/tools/moe-pruning/extract_ppl.py @@ -0,0 +1,41 @@ +import json, os + +base = os.path.dirname(os.path.abspath(__file__)) + +lines = open(os.path.join(base, 'rwsft-training-data.jsonl'), encoding='utf-8').readlines() +split = int(len(lines) * 0.95) + +train_lines = lines[:split] +val_lines = lines[split:] + +train_out = os.path.join(base, 'ppl-eval-train.txt') +val_out = os.path.join(base, 'ppl-eval-val.txt') + +def fmt(s): + # Full prompt+response so the model is conditioned correctly. + # llama-perplexity scores all tokens, but the prompt PPL is identical + # for base vs adapter — the delta is driven by the response tokens. 
+ prompt = s.get('prompt', '').strip() + response = s.get('response', '').strip() + if not response: + return None + if prompt: + return prompt + '\n' + response + return response + +with open(train_out, 'w', encoding='utf-8') as f: + for line in train_lines: + text = fmt(json.loads(line)) + if text: + f.write(text + '\n\n') + +with open(val_out, 'w', encoding='utf-8') as f: + for line in val_lines: + text = fmt(json.loads(line)) + if text: + f.write(text + '\n\n') + +train_chars = len(open(train_out, encoding='utf-8').read()) +val_chars = len(open(val_out, encoding='utf-8').read()) +print(f'train: {len(train_lines)} samples, {train_chars:,} chars -> ppl-eval-train.txt') +print(f'val: {len(val_lines)} samples, {val_chars:,} chars -> ppl-eval-val.txt') diff --git a/tools/moe-pruning/gguf_prune.py b/tools/moe-pruning/gguf_prune.py new file mode 100644 index 0000000000..2a36d5cf53 --- /dev/null +++ b/tools/moe-pruning/gguf_prune.py @@ -0,0 +1,258 @@ +""" +gguf-prune: REAP-based expert pruning directly on a GGUF file. + +Slices the expert dimension of the four stacked MoE weight tensors per layer: + blk.{il}.ffn_up_exps [n_embd, intermediate, n_experts] + blk.{il}.ffn_down_exps [intermediate, n_embd, n_experts] + blk.{il}.ffn_gate_inp [n_embd, n_experts] + blk.{il}.ffn_exp_probs_b [n_experts] (score-correction bias, if present) + +Quantized blocks (Q4_K, Q6_K, …) are preserved as raw bytes — slicing the +expert axis (last dim) is safe because each expert is independently quantised +in ggml, so dropping experts = dropping whole quantisation blocks. 
+ +Metadata patched: + {arch}.expert_count → keep_n + (expert_used_count = top-k routing k, NOT touched) + +Usage: + # keep top 20% of experts (26/128) per MoE layer + python gguf_prune.py \\ + --input nemotron.gguf \\ + --stats expert_stats.json \\ + --output nemotron-pruned.gguf \\ + --keep_ratio 0.20 + + # or keep an absolute number + python gguf_prune.py \\ + --input nemotron.gguf \\ + --stats expert_stats.json \\ + --output nemotron-pruned.gguf \\ + --keep_n 32 +""" + +import argparse +import json +import re +import sys +from pathlib import Path + +import numpy as np +from gguf import GGUFReader, GGUFWriter, GGMLQuantizationType, GGUFValueType + + +# ── Constants ───────────────────────────────────────────────────────────────── + +# Base tensor names that carry the expert dimension (last axis in ggml layout). +# Some GGUFs append parameter tails like ".weight" / ".bias". +EXPERT_BASE_SUFFIXES = { + "ffn_up_exps", + "ffn_down_exps", + "ffn_gate_inp", +} + + +def is_expert_suffix(suffix: str) -> bool: + """Return True if a tensor suffix is one of the MoE expert tensors to prune.""" + if suffix in ("ffn_exp_probs_b", "exp_probs_b", "exp_probs_b.bias"): + return True + return any(suffix == base or suffix.startswith(base + ".") for base in EXPERT_BASE_SUFFIXES) + + +# ── Helpers ─────────────────────────────────────────────────────────────────── + +def layer_and_suffix(name: str) -> tuple[int, str] | tuple[None, None]: + m = re.match(r"blk\.(\d+)\.(.+)$", name) + if m: + return int(m.group(1)), m.group(2) + return None, None + + +def pick_experts(layer_stats: dict, keep_n: int) -> list[int]: + """ + Return sorted indices of the top `keep_n` experts by REAP score. + Falls back to 'importance_score' (weighted frequency) if 'reap' absent. 
+ """ + if "reap" in layer_stats: + scores = np.array(layer_stats["reap"], dtype=np.float64) + elif "importance_score" in layer_stats: + scores = np.array(layer_stats["importance_score"], dtype=np.float64) + else: + raise KeyError( + "Layer stats has neither 'reap' nor 'importance_score'. " + "Run expert-profile / nemotron_reap.py profile first." + ) + return sorted(np.argsort(scores)[-keep_n:].tolist()) + + +def slice_expert_axis(data: np.ndarray, keep: list[int]) -> np.ndarray: + """ + Slice the expert axis of reader tensor data keeping only `keep` indices. + + GGUFReader reshapes tensors to NumPy with reversed ggml dims, so for MoE + tensors where experts are the last ggml dim, expert is axis 0 in `data`. + This also preserves quantized row-byte alignment (axis -1 is byte-packed + rows for quantized tensors and must not be sliced for expert pruning). + """ + return np.take(data, keep, axis=0) + + +def copy_field(writer: GGUFWriter, field, reader: GGUFReader) -> bool: + """Copy a single metadata field to writer. Returns False if skipped.""" + key = field.name + val_type = field.types[0] + part = field.parts[-1] + + if val_type == GGUFValueType.STRING: + # Preserve raw bytes: GGUF metadata can contain non-UTF8 strings. 
+ writer.add_key_value(key, bytes(part), GGUFValueType.STRING) + elif val_type == GGUFValueType.UINT8: + writer.add_uint8(key, int(part[0])) + elif val_type == GGUFValueType.INT8: + writer.add_int8(key, int(part[0])) + elif val_type == GGUFValueType.UINT16: + writer.add_uint16(key, int(part[0])) + elif val_type == GGUFValueType.INT16: + writer.add_int16(key, int(part[0])) + elif val_type == GGUFValueType.UINT32: + writer.add_uint32(key, int(part[0])) + elif val_type == GGUFValueType.INT32: + writer.add_int32(key, int(part[0])) + elif val_type == GGUFValueType.FLOAT32: + writer.add_float32(key, float(part[0])) + elif val_type == GGUFValueType.UINT64: + writer.add_uint64(key, int(part[0])) + elif val_type == GGUFValueType.INT64: + writer.add_int64(key, int(part[0])) + elif val_type == GGUFValueType.FLOAT64: + writer.add_float64(key, float(part[0])) + elif val_type == GGUFValueType.BOOL: + writer.add_bool(key, bool(part[0])) + elif val_type == GGUFValueType.ARRAY: + elem_type = field.types[1] + if elem_type == GGUFValueType.STRING: + # ReaderField.data stores indices of ARRAY payload items; for + # STRING arrays this points at each string byte payload. + vals = [bytes(field.parts[idx]) for idx in field.data] + writer.add_key_value(key, vals, GGUFValueType.ARRAY, sub_type=GGUFValueType.STRING) + else: + # ReaderField.data stores part-indices, not payload values. 
+ vals = field.contents() + if not isinstance(vals, list): + print(f" WARNING: skipping array field {key!r} (unexpected non-list contents)") + return False + writer.add_array(key, vals) + else: + print(f" WARNING: skipping field {key!r} (unsupported type {val_type})") + return False + return True + + +# ── Main ────────────────────────────────────────────────────────────────────── + +def main(): + ap = argparse.ArgumentParser(description="REAP expert pruning on a GGUF file") + ap.add_argument("--input", required=True, help="Input .gguf path") + ap.add_argument("--stats", required=True, help="expert_stats.json from expert-profile") + ap.add_argument("--output", required=True, help="Output .gguf path") + ap.add_argument("--keep_ratio", type=float, default=None, help="Fraction to keep, e.g. 0.20") + ap.add_argument("--keep_n", type=int, default=None, help="Absolute count to keep, e.g. 32") + ap.add_argument("--n_experts", type=int, default=128, help="Experts per MoE layer in source model") + args = ap.parse_args() + + if args.keep_ratio is None and args.keep_n is None: + ap.error("Provide --keep_ratio or --keep_n") + if args.keep_ratio is not None and args.keep_n is not None: + ap.error("Provide --keep_ratio OR --keep_n, not both") + + keep_n = args.keep_n if args.keep_n is not None else max(1, int(args.n_experts * args.keep_ratio)) + print(f"[gguf-prune] keeping {keep_n}/{args.n_experts} experts per MoE layer") + + # ── Load stats ───────────────────────────────────────────────────────────── + with open(args.stats) as f: + stats = {int(k): v for k, v in json.load(f).items()} + print(f"[gguf-prune] stats loaded for {len(stats)} MoE layers") + + # ── Open source GGUF ─────────────────────────────────────────────────────── + print(f"[gguf-prune] reading {args.input}") + reader = GGUFReader(args.input, mode="r") + + arch_field = reader.get_field("general.architecture") + arch = str(bytes(arch_field.parts[-1]), "utf-8") if arch_field else "nemotron_h_moe" + 
print(f"[gguf-prune] arch {arch}") + + expert_count_key = f"{arch}.expert_count" + + # ── Compute kept indices per layer ───────────────────────────────────────── + kept: dict[int, list[int]] = {} + for tensor in reader.tensors: + il, suffix = layer_and_suffix(tensor.name) + if il is None or not is_expert_suffix(suffix): + continue + if il in kept: + continue # already computed for this layer + if il not in stats: + print(f" Layer {il:3d}: no stats — keeping ALL {args.n_experts} experts") + kept[il] = list(range(args.n_experts)) + else: + kept[il] = pick_experts(stats[il], keep_n) + never = stats[il].get("never_activated", "?") + crit = "reap" if "reap" in stats[il] else "importance_score" + print(f" Layer {il:3d}: keep {kept[il][:4]}… never_activated={never} criterion={crit}") + + # ── Build output GGUF ────────────────────────────────────────────────────── + print(f"\n[gguf-prune] writing {args.output}") + writer = GGUFWriter(args.output, arch=arch) + + # --- metadata: copy all fields, replace expert_count --- + for field in reader.fields.values(): + # Reader exposes synthetic header fields (GGUF.*) that are not KV + # metadata and must not be copied back as normal keys. + if field.name.startswith("GGUF."): + continue + # Writer already sets general.architecture from ctor; avoid duplicate warning. 
+ if field.name in (expert_count_key, "general.architecture"): + continue # replaced below + copy_field(writer, field, reader) + + writer.add_expert_count(keep_n) + print(f"[gguf-prune] patched {expert_count_key} → {keep_n}") + + # --- tensors --- + n_pruned = 0 + for tensor in reader.tensors: + il, suffix = layer_and_suffix(tensor.name) + is_expert = il is not None and is_expert_suffix(suffix) + + if is_expert: + k = kept[il] + data = slice_expert_axis(tensor.data, k) + writer.add_tensor( + tensor.name, + data, + raw_dtype=tensor.tensor_type, + ) + n_pruned += 1 + else: + writer.add_tensor( + tensor.name, + tensor.data, + raw_dtype=tensor.tensor_type, + ) + + writer.write_header_to_file() + writer.write_kv_data_to_file() + writer.write_tensors_to_file(progress=True) + writer.close() + + out = Path(args.output) + size_gb = out.stat().st_size / 1024**3 + print(f"\n[gguf-prune] done") + print(f" Expert tensors sliced : {n_pruned}") + print(f" MoE layers pruned : {len(kept)}") + print(f" Experts per layer : {keep_n}/{args.n_experts}") + print(f" Output size : {size_gb:.2f} GB → {out}") + + +if __name__ == "__main__": + main() diff --git a/tools/moe-pruning/nemotron_reap.py b/tools/moe-pruning/nemotron_reap.py new file mode 100644 index 0000000000..fac5831d3c --- /dev/null +++ b/tools/moe-pruning/nemotron_reap.py @@ -0,0 +1,296 @@ +""" +NemotronH Expert Activation Profiler + Pruner +Two-phase: profile with 4-bit on GPU, prune bf16 on CPU. 
+ +Usage: + # Phase 1 - profile + python nemotron_reap.py profile \ + --model unsloth/Nemotron-3-Nano-30B-A3B \ + --prompts training-data.jsonl \ + --output expert_stats.json + + # Phase 2 - prune + python nemotron_reap.py prune \ + --model unsloth/Nemotron-3-Nano-30B-A3B \ + --stats expert_stats.json \ + --keep_ratio 0.20 \ + --output ./nemotron-pruned-25e +""" + +import os +os.environ["TORCH_COMPILE_DISABLE"] = "1" # prevent inductor hang during save_pretrained + +import json +import argparse +import torch +import numpy as np +from collections import defaultdict +from transformers import AutoTokenizer, AutoModelForCausalLM + +try: + from transformers import BitsAndBytesConfig + import patch_bnb # noqa: F401 — patches Params4bit.__new__ for transformers 5.x compat + HAS_BNB = True +except ImportError: + HAS_BNB = False + + +# ── Tracker ─────────────────────────────────────────────────────────────────── + +class ExpertActivationTracker: + def __init__(self, n_experts: int = 128): + self.n_experts = n_experts + self.activation_counts = defaultdict(lambda: np.zeros(n_experts, dtype=np.int64)) + self.activation_weights = defaultdict(lambda: np.zeros(n_experts, dtype=np.float64)) + self.total_tokens = defaultdict(int) + self._hooks = [] + + def register_hooks(self, model): + count = 0 + for layer_idx, block in enumerate(model.backbone.layers): + if block.block_type == "moe": + h = block.mixer.gate.register_forward_hook(self._make_hook(layer_idx)) + self._hooks.append(h) + count += 1 + print(f" Hooks attached to {count} MoE layers") + + def _make_hook(self, layer_idx): + def hook(module, input, output): + topk_indices, topk_weights = output + idx = topk_indices.detach().cpu().numpy() # [T, 6] + wgt = topk_weights.detach().float().cpu().numpy() # [T, 6] + T = idx.shape[0] + self.total_tokens[layer_idx] += T + np.add.at(self.activation_counts[layer_idx], idx.flatten(), 1) + np.add.at(self.activation_weights[layer_idx], idx.flatten(), wgt.flatten()) + return hook + + def 
remove_hooks(self): + for h in self._hooks: + h.remove() + self._hooks.clear() + + def get_stats(self) -> dict: + stats = {} + for layer_idx in sorted(self.activation_counts): + counts = self.activation_counts[layer_idx] + weights = self.activation_weights[layer_idx] + total = self.total_tokens[layer_idx] + freq = counts / (total + 1e-9) + avg_w = np.where(counts > 0, weights / counts, 0.0) + importance = freq * avg_w + stats[layer_idx] = { + "total_tokens": int(total), + "activation_counts": counts.tolist(), + "activation_frequency": freq.tolist(), + "avg_weight": avg_w.tolist(), + "importance_score": importance.tolist(), + "never_activated": int((counts == 0).sum()), + } + return stats + + def print_summary(self, stats, keep_ratio): + keep_n = max(1, int(self.n_experts * keep_ratio)) + print(f"\n{'='*70}") + print(f" PROFILING SUMMARY | keep_ratio={keep_ratio:.0%} | keeping {keep_n}/128 experts/layer") + print(f"{'='*70}") + for li, s in stats.items(): + imp = np.array(s['importance_score']) + threshold = np.sort(imp)[self.n_experts - keep_n] + print( + f" Layer {li:3d}: " + f"never_activated={s['never_activated']:3d}/128 " + f"top_freq={max(s['activation_frequency']):.3f} " + f"threshold={threshold:.4f}" + ) + total_moe = len(stats) + print(f"\n MoE layers : {total_moe}") + print(f" Kept : {total_moe * keep_n} experts total") + print(f" Pruned : {total_moe * (self.n_experts - keep_n)} experts total") + print(f"{'='*70}\n") + + +# ── Phase 1: Profile ────────────────────────────────────────────────────────── + +def cmd_profile(args): + # Mamba2 layers use Triton kernels — CUDA required. + # 4-bit NF4 fits in 16GB VRAM (~15GB). We must keep ALL layers on GPU + # (no CPU spillover) otherwise PCIe transfers make inference unusably slow. 
+ print(f"\n[Phase 1] Profiling — 4-bit NF4, GPU only") + print(f" Model : {args.model}") + print(f" Prompts: {args.prompts}") + + bnb_config = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_quant_type="nf4", + bnb_4bit_compute_dtype=torch.bfloat16, + bnb_4bit_use_double_quant=True, + ) + + tokenizer = AutoTokenizer.from_pretrained(args.model, trust_remote_code=True) + print(" Loading model in 4-bit NF4...") + model = AutoModelForCausalLM.from_pretrained( + args.model, + trust_remote_code=True, + quantization_config=bnb_config, + device_map={"": 0}, # force ALL layers onto GPU 0, no CPU spillover + ) + model.eval() + print(" Model loaded on GPU.") + + # Load prompt+response pairs + pairs = [] + with open(args.prompts) as f: + for line in f: + line = line.strip() + if not line: + continue + obj = json.loads(line) + text = obj.get("prompt", "") + "\n" + obj.get("response", "") + pairs.append(text) + print(f" Loaded {len(pairs)} prompt+response pairs") + + tracker = ExpertActivationTracker(n_experts=128) + tracker.register_hooks(model) + + with torch.no_grad(): + for i, text in enumerate(pairs): + if i % 100 == 0: + print(f" [{i+1}/{len(pairs)}] processing...") + inputs = tokenizer( + text, + return_tensors="pt", + truncation=True, + max_length=args.max_length, + ).to("cuda") + try: + model(**inputs) + except torch.cuda.OutOfMemoryError: + print(f" [{i+1}] OOM — skipping") + torch.cuda.empty_cache() + + tracker.remove_hooks() + stats = tracker.get_stats() + tracker.print_summary(stats, keep_ratio=args.keep_ratio) + + stats_out = {str(k): v for k, v in stats.items()} + with open(args.output, "w") as f: + json.dump(stats_out, f, indent=2) + print(f" Stats saved → {args.output}") + print(f"\n[Phase 1] Done. 
Run 'prune' next (CPU only).") + + +# ── Phase 2: Prune ──────────────────────────────────────────────────────────── + +def cmd_prune(args): + print(f"\n[Phase 2] Pruning — bf16 on CPU") + print(f" Model : {args.model}") + print(f" Stats : {args.stats}") + print(f" Keep ratio : {args.keep_ratio:.0%}") + print(f" Output : {args.output}") + + with open(args.stats) as f: + stats = {int(k): v for k, v in json.load(f).items()} + + tokenizer = AutoTokenizer.from_pretrained(args.model, trust_remote_code=True) + + print(" Loading model in bf16 on CPU — this takes a few minutes...") + model = AutoModelForCausalLM.from_pretrained( + args.model, + trust_remote_code=True, + dtype=torch.bfloat16, + device_map="cpu", + ) + + keep_n = max(1, int(128 * args.keep_ratio)) + print(f"\n Pruning to top-{keep_n} experts per MoE layer...\n") + + for layer_idx, block in enumerate(model.backbone.layers): + if block.block_type != "moe": + continue + + if layer_idx not in stats: + print(f" Layer {layer_idx:3d}: no profiling data — skipping") + continue + + # Use REAP score if available (from llama.cpp profiler), else fall back to legacy importance_score + layer_stats = stats[layer_idx] + if "reap" in layer_stats: + importance = np.array(layer_stats["reap"]) + else: + importance = np.array(layer_stats["importance_score"]) + keep_sorted = sorted(np.argsort(importance)[-keep_n:].tolist()) + prune_count = 128 - len(keep_sorted) + + # Prune expert list + block.mixer.experts = torch.nn.ModuleList( + [block.mixer.experts[i] for i in keep_sorted] + ) + + # Prune router weights to match new expert indices + keep_t = torch.tensor(keep_sorted, dtype=torch.long) + block.mixer.gate.weight = torch.nn.Parameter( + block.mixer.gate.weight.data[keep_t].clone() + ) + old_bias = block.mixer.gate.e_score_correction_bias.data[keep_t].clone() + block.mixer.gate.register_buffer("e_score_correction_bias", old_bias) + block.mixer.gate.n_routed_experts = keep_n + + never = stats[layer_idx]["never_activated"] + 
print(f" Layer {layer_idx:3d}: kept {keep_n}, pruned {prune_count} (was {never} never-activated)") + + # Patch top-level config + model.config.n_routed_experts = keep_n + + # Fix transformers 5.x incompatibility: _tied_weights_keys must be a list of dicts, + # but the custom NemotronH modeling code sets it as a plain list of strings. + # _get_tied_weight_keys() calls .keys() on each element → AttributeError. + # Clear it — lm_head weight tying is not needed for inference on the pruned model. + for mod in model.modules(): + if isinstance(getattr(mod, '_tied_weights_keys', None), list): + mod._tied_weights_keys = None + + # Disable torch.compile / inductor before saving — transformers 5.x can trigger + # torch._inductor.compile_worker during save_pretrained, causing an indefinite hang. + import os + os.environ["TORCH_COMPILE_DISABLE"] = "1" + torch._dynamo.reset() + + print(f"\n Saving pruned model → {args.output}") + with torch.no_grad(): + model.save_pretrained(args.output, safe_serialization=True) + tokenizer.save_pretrained(args.output) + print(f"\n[Phase 2] Done.") + print(f" Experts per MoE layer : {keep_n}/128") + print(f" Next: fine-tune with Unsloth from {args.output}") + + +# ── Entry point ─────────────────────────────────────────────────────────────── + +def main(): + parser = argparse.ArgumentParser(description="NemotronH Expert Pruner (REAP-style)") + sub = parser.add_subparsers(dest="cmd", required=True) + + p1 = sub.add_parser("profile", help="Phase 1: profile expert activations (GPU, 4-bit)") + p1.add_argument("--model", default="unsloth/Nemotron-3-Nano-30B-A3B") + p1.add_argument("--prompts", required=True) + p1.add_argument("--output", default="expert_stats.json") + p1.add_argument("--keep_ratio", type=float, default=0.20, + help="Preview ratio for summary only — does not affect saved stats") + p1.add_argument("--max_length", type=int, default=2048) + + p2 = sub.add_parser("prune", help="Phase 2: prune model using saved stats (CPU, bf16)") + 
p2.add_argument("--model", default="unsloth/Nemotron-3-Nano-30B-A3B") + p2.add_argument("--stats", default="expert_stats.json") + p2.add_argument("--keep_ratio", type=float, default=0.20) + p2.add_argument("--output", default="./nemotron-pruned") + + args = parser.parse_args() + if args.cmd == "profile": + cmd_profile(args) + elif args.cmd == "prune": + cmd_prune(args) + + +if __name__ == "__main__": + main() diff --git a/tools/moe-pruning/requirements.txt b/tools/moe-pruning/requirements.txt new file mode 100644 index 0000000000..8f778e9fe5 --- /dev/null +++ b/tools/moe-pruning/requirements.txt @@ -0,0 +1 @@ +gguf \ No newline at end of file diff --git a/tools/moe-pruning/sample_calibration.jsonl b/tools/moe-pruning/sample_calibration.jsonl new file mode 100644 index 0000000000..44e2627f97 --- /dev/null +++ b/tools/moe-pruning/sample_calibration.jsonl @@ -0,0 +1,8 @@ +{"prompt": "What is the difference between a transformer and an RNN?", "response": "Transformers process all tokens in parallel using self-attention, while RNNs process tokens sequentially and maintain a hidden state. Transformers scale better and handle long-range dependencies more effectively, but require more memory for the attention matrix."} +{"prompt": "Explain the concept of weight quantization in neural networks.", "response": "Weight quantization reduces model size and inference latency by representing parameters in lower precision formats such as INT8 or INT4 instead of FP32. This trades a small amount of accuracy for significant gains in memory efficiency and compute throughput."} +{"prompt": "What is mixture of experts (MoE)?", "response": "MoE is a neural network architecture where the feed-forward layers are replaced by multiple expert sub-networks and a routing mechanism. 
For each token, only a small subset of experts is activated, allowing the model to have a large parameter count while keeping inference compute constant."} +{"prompt": "Describe the attention mechanism.", "response": "Attention computes a weighted sum of value vectors, where the weights are determined by the compatibility between query and key vectors. Scaled dot-product attention computes scores as Q*K^T/sqrt(d_k), applies softmax to get weights, then multiplies by V."} +{"prompt": "What is GGUF and how does it differ from GGML?", "response": "GGUF is the successor to the GGML file format for storing quantized models. It supports arbitrary key-value metadata, is extensible without breaking backward compatibility, and encodes tensor names and shapes explicitly, making it more robust than the original GGML format."} +{"prompt": "How does LoRA work?", "response": "LoRA (Low-Rank Adaptation) injects trainable rank-decomposition matrices A and B into frozen weight layers. The adapted weight is W + alpha/r * B*A. Since rank r is much smaller than the weight dimensions, only a tiny fraction of parameters are trained."} +{"prompt": "What is perplexity in language modeling?", "response": "Perplexity measures how well a language model predicts a sample text. It is the exponentiated average negative log-likelihood per token: PPL = exp(-1/N * sum log P(token_i)). Lower perplexity indicates a better fit to the data."} +{"prompt": "Explain rotary position embeddings (RoPE).", "response": "RoPE encodes position by rotating query and key vectors in 2D subspaces using a position-dependent rotation matrix. 
This makes the dot product between Q and K depend only on their relative position, enabling the model to generalise to sequence lengths longer than those seen during training."} From 76d5b6798044fde6b50915a905c00ad8844523f4 Mon Sep 17 00:00:00 2001 From: Salvatore Rossitto Date: Thu, 12 Mar 2026 11:58:14 +0100 Subject: [PATCH 03/15] added missing llama_opt_set_reward_weights --- include/llama.h | 9 ++++++++- src/llama-context.h | 4 +++- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/include/llama.h b/include/llama.h index 0bd10294cb..0bf8ead384 100644 --- a/include/llama.h +++ b/include/llama.h @@ -1556,6 +1556,12 @@ extern "C" { LLAMA_API void llama_opt_init(struct llama_context * lctx, struct llama_model * model, struct llama_opt_params lopt_params); + // weights: array of floats, one per dataset window (indexed by idata), already normalized to [0,1]. + // n_weights: length of the array. + // Pass NULL/0 to disable (equivalent to all-ones, i.e. standard SFT). + // The pointer must remain valid for the duration of all llama_opt_epoch calls. 
+ LLAMA_API void llama_opt_set_reward_weights(const float * weights, int64_t n_weights); + LLAMA_API void llama_opt_epoch( struct llama_context * lctx, ggml_opt_dataset_t dataset, @@ -1563,7 +1569,8 @@ extern "C" { ggml_opt_result_t result_eval, int64_t idata_split, ggml_opt_epoch_callback callback_train, - ggml_opt_epoch_callback callback_eval); + ggml_opt_epoch_callback callback_eval, + bool shuffle); #ifdef __cplusplus } diff --git a/src/llama-context.h b/src/llama-context.h index e0d0085c1c..21d0ba6299 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -187,7 +187,8 @@ struct llama_context { ggml_opt_result_t result_eval, int64_t idata_split, ggml_opt_epoch_callback callback_train, - ggml_opt_epoch_callback callback_eval); + ggml_opt_epoch_callback callback_eval, + bool shuffle); void opt_epoch_iter( ggml_opt_dataset_t dataset, @@ -195,6 +196,7 @@ struct llama_context { const std::vector & tokens, const std::vector & labels_sparse, llama_batch & batch, + float reward_scale, ggml_opt_epoch_callback callback, bool train, int64_t idata_in_loop, From 70730e8d283070ba8e1b775ba2e2879d49c45707 Mon Sep 17 00:00:00 2001 From: Salvatore Rossitto Date: Thu, 12 Mar 2026 12:04:34 +0100 Subject: [PATCH 04/15] added reward scaling to opt_epoch_iter calls --- examples/training/finetune.cpp | 2 +- src/llama-context.cpp | 38 +++++++++++++++++++++++++++------- 2 files changed, 32 insertions(+), 8 deletions(-) diff --git a/examples/training/finetune.cpp b/examples/training/finetune.cpp index e20f89488f..dd58f9418e 100644 --- a/examples/training/finetune.cpp +++ b/examples/training/finetune.cpp @@ -83,7 +83,7 @@ int main(int argc, char ** argv) { for (lr.epoch = 0; lr.epoch < lr.epochs; ++lr.epoch) { llama_opt_epoch(ctx, dataset, result_train, result_eval, idata_split, - ggml_opt_epoch_callback_progress_bar, ggml_opt_epoch_callback_progress_bar); + ggml_opt_epoch_callback_progress_bar, ggml_opt_epoch_callback_progress_bar, /*shuffle=*/false); fprintf(stderr, "\n"); 
ggml_opt_result_reset(result_train); diff --git a/src/llama-context.cpp b/src/llama-context.cpp index ee2669c154..9f67d47b50 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -2657,6 +2657,7 @@ void llama_context::opt_epoch_iter( const std::vector & tokens, const std::vector & labels_sparse, llama_batch & batch, + float reward_scale, ggml_opt_epoch_callback callback, bool train, int64_t idata_in_loop, @@ -2742,11 +2743,14 @@ void llama_context::opt_epoch_iter( struct ggml_tensor * labels = ggml_opt_labels(opt_ctx); GGML_ASSERT(labels->ne[1] == n_ubatch); ggml_set_zero(labels); - const float onef = 1.0f; for (uint32_t pos_ubatch = 0; pos_ubatch < n_ubatch; ++pos_ubatch) { const uint32_t ilabel = pos_ctx + pos_batch + pos_ubatch; + // -1 sentinel means "masked position" (prompt token, BOS separator, etc). + // Leave the label tensor zeroed at this position → zero cross-entropy + // contribution. Do NOT write anything — ggml_set_zero already handled it. + if (labels_sparse[ilabel] < 0) continue; GGML_ASSERT(labels_sparse[ilabel] < labels->ne[0]); - ggml_backend_tensor_set(labels, &onef, (pos_ubatch*labels->ne[0] + labels_sparse[ilabel])*sizeof(float), sizeof(float)); + ggml_backend_tensor_set(labels, &reward_scale, (pos_ubatch*labels->ne[0] + labels_sparse[ilabel])*sizeof(float), sizeof(float)); } } ggml_opt_eval(opt_ctx, result); @@ -2760,13 +2764,25 @@ void llama_context::opt_epoch_iter( } } +// Optional per-window reward weights for reward-weighted SFT. +// Set via llama_opt_set_reward_weights() before calling llama_opt_epoch(). +// Null/0 means all rewards are 1.0 (standard SFT). 
+static thread_local const float * g_reward_weights = nullptr; +static thread_local int64_t g_reward_weights_n = 0; + +void llama_opt_set_reward_weights(const float * weights, int64_t n_weights) { + g_reward_weights = weights; + g_reward_weights_n = n_weights; +} + void llama_context::opt_epoch( ggml_opt_dataset_t dataset, ggml_opt_result_t result_train, ggml_opt_result_t result_eval, int64_t idata_split, ggml_opt_epoch_callback callback_train, - ggml_opt_epoch_callback callback_eval) { + ggml_opt_epoch_callback callback_eval, + bool shuffle) { const uint32_t n_ctx = this->n_ctx(); const uint32_t n_batch = std::min(cparams.n_batch, n_ctx); const uint32_t n_ubatch = std::min(cparams.n_ubatch, n_batch); @@ -2775,6 +2791,10 @@ void llama_context::opt_epoch( GGML_ASSERT(idata_split >= 0); GGML_ASSERT(idata_split <= ndata); + if (shuffle && idata_split > 1) { + ggml_opt_dataset_shuffle(opt_ctx, dataset, idata_split); + } + const uint32_t ubatch_per_ctx = n_ctx / n_ubatch; struct llama_batch batch = llama_batch_init(n_batch, 0, 1); @@ -2788,9 +2808,11 @@ void llama_context::opt_epoch( for (; idata < idata_split; ++idata) { constexpr bool train = true; const int64_t idata_in_loop = idata*ubatch_per_ctx; + const float reward = (g_reward_weights && idata < g_reward_weights_n) + ? 
g_reward_weights[idata] : 1.0f; ggml_opt_dataset_get_batch_host(dataset, tokens.data(), n_ctx*sizeof(llama_token), labels_sparse.data(), idata); - opt_epoch_iter(dataset, result_train, tokens, labels_sparse, batch, + opt_epoch_iter(dataset, result_train, tokens, labels_sparse, batch, reward, callback_train, train, idata_in_loop, ndata_in_loop, t_loop_start); } @@ -2801,7 +2823,7 @@ void llama_context::opt_epoch( const int64_t idata_in_loop = (idata - idata_split)*ubatch_per_ctx; ggml_opt_dataset_get_batch_host(dataset, tokens.data(), n_ctx*sizeof(llama_token), labels_sparse.data(), idata); - opt_epoch_iter(dataset, result_eval, tokens, labels_sparse, batch, + opt_epoch_iter(dataset, result_eval, tokens, labels_sparse, batch, 1.0f, callback_eval, train, idata_in_loop, ndata_in_loop, t_loop_start); } @@ -3550,12 +3572,14 @@ void llama_opt_epoch( ggml_opt_result_t result_eval, int64_t idata_split, ggml_opt_epoch_callback callback_train, - ggml_opt_epoch_callback callback_eval) { + ggml_opt_epoch_callback callback_eval, + bool shuffle) { ctx->opt_epoch( dataset, result_train, result_eval, idata_split, callback_train, - callback_eval); + callback_eval, + shuffle); } From 22277e3cbfb47fc85dadc7c8ce6c83a10977c503 Mon Sep 17 00:00:00 2001 From: Salvatore Rossitto Date: Thu, 12 Mar 2026 12:22:12 +0100 Subject: [PATCH 05/15] ported residual changes about grad_checkpointing --- examples/training/finetune.cpp | 1 + ggml/include/ggml-opt.h | 7 +++ ggml/src/ggml-opt.cpp | 69 +++++++++++++++++++--- include/llama.h | 6 ++ src/llama-context.cpp | 105 ++++++++++++++++++++++++++++++--- 5 files changed, 171 insertions(+), 17 deletions(-) diff --git a/examples/training/finetune.cpp b/examples/training/finetune.cpp index dd58f9418e..88d3db741d 100644 --- a/examples/training/finetune.cpp +++ b/examples/training/finetune.cpp @@ -73,6 +73,7 @@ int main(int argc, char ** argv) { /*get_opt_pars =*/common_opt_lr_pars, /*get_opt_pars_ud =*/¶ms.lr, /*optimizer_type =*/params.optimizer, + 
/*grad_checkpoint_interval =*/params.grad_checkpoint_interval, }; llama_opt_init(ctx, model, lopt_params); diff --git a/ggml/include/ggml-opt.h b/ggml/include/ggml-opt.h index 60774575f0..cac543c02d 100644 --- a/ggml/include/ggml-opt.h +++ b/ggml/include/ggml-opt.h @@ -126,6 +126,13 @@ extern "C" { ggml_opt_get_optimizer_params get_opt_pars; // callback for calculating optimizer parameters void * get_opt_pars_ud; // userdata for calculating optimizer parameters + // Gradient checkpointing: keep the output of every Nth forward node alive through + // the backward pass so the allocator cannot reuse its memory for other tensors. + // This keeps checkpointed activations resident for the backward pass (no recomputation), + // preventing the allocator from aliasing their buffers and lowering peak activation VRAM. + // Set to 0 (default) to disable. A value of ~32–64 cuts activation VRAM by ~50%. + int32_t grad_checkpoint_interval; + // only GGML_OPT_OPTIMIZER_TYPE_ADAMW needs m, v momenta per parameter tensor enum ggml_opt_optimizer_type optimizer; }; diff --git a/ggml/src/ggml-opt.cpp b/ggml/src/ggml-opt.cpp index e87fc79c25..8be90c8944 100644 --- a/ggml/src/ggml-opt.cpp +++ b/ggml/src/ggml-opt.cpp @@ -58,10 +58,13 @@ struct ggml_opt_context { std::vector grad_accs; std::vector grad_m; std::vector grad_v; + std::vector bufs_momenta; // per-param moment buffers (one per param node) + std::vector ctxs_momenta; // corresponding ggml contexts (keep alive for tensor metadata) int64_t iter = 1; int32_t opt_period = 1; int32_t opt_i = 0; + int32_t grad_checkpoint_interval = 0; bool loss_per_datapoint = false; ggml_opt_get_optimizer_params get_opt_pars = nullptr; @@ -254,9 +257,10 @@ struct ggml_opt_params ggml_opt_default_params( /*loss_type =*/ loss_type, /*build_type =*/ GGML_OPT_BUILD_TYPE_OPT, /*opt_period =*/ 1, - /*get_opt_pars =*/ ggml_opt_get_default_optimizer_params, - /*get_opt_pars_ud =*/ nullptr, - /*optimizer =*/ GGML_OPT_OPTIMIZER_TYPE_ADAMW, + /*get_opt_pars =*/ 
ggml_opt_get_default_optimizer_params, + /*get_opt_pars_ud =*/ nullptr, + /*grad_checkpoint_interval =*/ 0, + /*optimizer =*/ GGML_OPT_OPTIMIZER_TYPE_ADAMW, }; } @@ -476,8 +480,23 @@ static void ggml_opt_build(ggml_opt_context_t opt_ctx) { for (int i = 0; i < n_nodes; ++i) { ggml_tensor * node = opt_ctx->gf->nodes[i]; if (node->flags & GGML_TENSOR_FLAG_PARAM) { - opt_ctx->grad_m[i] = ggml_new_tensor(opt_ctx->ctx_static, GGML_TYPE_F32, GGML_MAX_DIMS, node->ne); - opt_ctx->grad_v[i] = ggml_new_tensor(opt_ctx->ctx_static, GGML_TYPE_F32, GGML_MAX_DIMS, node->ne); + // Allocate moments on the same buffer type as the param tensor so + // the ADAMW op runs on the correct backend (avoids cross-device mismatch + // when some LoRA tensors are on CPU and others on GPU with partial offload). + ggml_backend_buffer_type_t param_buft = node->buffer + ? ggml_backend_buffer_get_type(node->buffer) + : ggml_backend_cpu_buffer_type(); + + // Allocate a tiny context + buffer for this pair of moment tensors. + const size_t sz = 2 * ggml_tensor_overhead(); + struct ggml_init_params mip = { sz, nullptr, true }; + struct ggml_context * mctx = ggml_init(mip); + opt_ctx->grad_m[i] = ggml_new_tensor(mctx, GGML_TYPE_F32, GGML_MAX_DIMS, node->ne); + opt_ctx->grad_v[i] = ggml_new_tensor(mctx, GGML_TYPE_F32, GGML_MAX_DIMS, node->ne); + ggml_backend_buffer_t mbuf = ggml_backend_alloc_ctx_tensors_from_buft(mctx, param_buft); + ggml_backend_buffer_clear(mbuf, 0); + opt_ctx->bufs_momenta.push_back(mbuf); + opt_ctx->ctxs_momenta.push_back(mctx); // keep alive for tensor metadata } else { opt_ctx->grad_m[i] = nullptr; opt_ctx->grad_v[i] = nullptr; @@ -486,6 +505,31 @@ static void ggml_opt_build(ggml_opt_context_t opt_ctx) { } } + // Gradient checkpointing: mark every Nth forward node as OUTPUT so the allocator + // keeps its memory alive through the backward pass. 
The backward graph already + // contains the forward ops (gb_grad is a superset of gf), so the checkpointed + // activations are naturally available for backward matmuls without recomputation. + // This prevents the allocator from aliasing those buffers to later ops, cutting + // peak activation VRAM at the cost of slightly larger static allocation. + if (opt_ctx->grad_checkpoint_interval > 0) { + const int interval = opt_ctx->grad_checkpoint_interval; + const int n_fwd = opt_ctx->gf->n_nodes; + int ckpt_count = 0; + for (int i = interval - 1; i < n_fwd; i += interval) { + struct ggml_tensor * node = opt_ctx->gf->nodes[i]; + // Only checkpoint F32 compute nodes — skip I32 index tensors and already-output nodes. + if (node->type != GGML_TYPE_F32) continue; + if (node->flags & GGML_TENSOR_FLAG_OUTPUT) continue; + if (node->flags & GGML_TENSOR_FLAG_INPUT) continue; + node->flags |= GGML_TENSOR_FLAG_OUTPUT; + ckpt_count++; + } + if (ckpt_count > 0) { + GGML_LOG_DEBUG("%s: gradient checkpointing: marked %d/%d nodes as persistent (interval=%d)\n", + __func__, ckpt_count, n_fwd, interval); + } + } + // gb_grad == graph backward gradients, forward pass, then backward pass to calculate gradients. 
opt_ctx->gb_grad = ggml_graph_dup(opt_ctx->ctx_compute, opt_ctx->gf, /*force_grads =*/ true); ggml_build_backward_expand(opt_ctx->ctx_compute, opt_ctx->gb_grad, opt_ctx->grad_accs.data()); @@ -556,10 +600,11 @@ ggml_opt_context_t ggml_opt_init(struct ggml_opt_params params) { result->build_type_alloc = params.build_type; result->inputs = params.inputs; result->outputs = params.outputs; - result->opt_period = params.opt_period; - result->get_opt_pars = params.get_opt_pars; - result->get_opt_pars_ud = params.get_opt_pars_ud; - result->optimizer = params.optimizer; + result->opt_period = params.opt_period; + result->grad_checkpoint_interval = params.grad_checkpoint_interval; + result->get_opt_pars = params.get_opt_pars; + result->get_opt_pars_ud = params.get_opt_pars_ud; + result->optimizer = params.optimizer; GGML_ASSERT(result->opt_period >= 1); @@ -588,6 +633,12 @@ void ggml_opt_free(ggml_opt_context_t opt_ctx) { } ggml_backend_buffer_free(opt_ctx->buf_static); ggml_backend_buffer_free(opt_ctx->buf_cpu); + for (ggml_backend_buffer_t buf : opt_ctx->bufs_momenta) { + ggml_backend_buffer_free(buf); + } + for (struct ggml_context * ctx : opt_ctx->ctxs_momenta) { + ggml_free(ctx); + } ggml_free(opt_ctx->ctx_static); ggml_free(opt_ctx->ctx_cpu); delete opt_ctx; diff --git a/include/llama.h b/include/llama.h index 0bf8ead384..6a3a1ebe38 100644 --- a/include/llama.h +++ b/include/llama.h @@ -1552,6 +1552,12 @@ extern "C" { void * get_opt_pars_ud; // userdata for calculating optimizer parameters enum ggml_opt_optimizer_type optimizer_type; + + // Gradient checkpointing: mark every Nth forward graph node as persistent so the + // allocator cannot reuse its memory during backward. Reduces peak activation VRAM + // at the cost of ~0 extra compute (activations are kept, not recomputed). + // Set to 0 (default) to disable. Good values: 32–64 nodes ≈ every 1–2 transformer layers. 
+ int32_t grad_checkpoint_interval; }; LLAMA_API void llama_opt_init(struct llama_context * lctx, struct llama_model * model, struct llama_opt_params lopt_params); diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 9f67d47b50..ba98acd403 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -2618,11 +2618,71 @@ void llama_context::opt_init(struct llama_model * model, struct llama_opt_params GGML_ASSERT(model->hparams.n_ctx_train % n_batch == 0); GGML_ASSERT(n_batch % n_ubatch == 0); + // Recreate the scheduler and gf_res_prev with a training-inflated graph size before + // creating opt_ctx, so opt_ctx captures the new (larger) scheduler pointer. + // The backward graph (gb_grad) duplicates gf and adds ~2-3x more nodes+leafs; + // gb_opt adds optimizer step nodes on top. + // + // We measure the actual training forward graph node count at n_ubatch here, + // then multiply by 4 to cover gf + gb_grad + gb_opt. This is exact for any + // model size — no magic constant needed. + { + uint32_t train_fwd_nodes = 0; + + // Build a real training-ubatch forward graph in split-only mode (no buffer realloc) + // so we can count its actual nodes. Fall back to n_tensors formula if it fails. + if (memory) { + auto mctx_tmp = memory->init_full(); + if (mctx_tmp) { + // graph_reserve() uses gf_res_reserve to build the graph, so both + // must be large enough to hold the training forward graph. + // Use 16x n_tensors as a generous temporary cap for the measurement pass. 
+ const uint32_t tmp_cap = std::max(4096u, 16u * model->n_tensors()); + gf_res_prev.reset(new llm_graph_result(tmp_cap)); + gf_res_reserve.reset(new llm_graph_result(tmp_cap)); + // split_only=true: only splits the graph, doesn't reallocate compute buffers + auto * gf_train = graph_reserve(n_ubatch, 1, n_ubatch, mctx_tmp.get(), /*split_only=*/true); + if (gf_train) { + train_fwd_nodes = (uint32_t)ggml_graph_n_nodes(gf_train); + LLAMA_LOG_INFO("%s: measured training graph nodes = %u (n_ubatch=%u)\n", + __func__, train_fwd_nodes, n_ubatch); + } + } + } + + if (train_fwd_nodes == 0) { + // Fallback: use n_tensors formula + train_fwd_nodes = std::max(1024u, 8u * model->n_tensors()); + LLAMA_LOG_WARN("%s: could not measure training graph, using fallback nodes=%u\n", + __func__, train_fwd_nodes); + } + + // gf + gb_grad + gb_opt each need ~train_fwd_nodes; multiply by 4 for safety headroom. + // Multiply by 2 again for the scheduler's n_nodes + n_leafs check. + const int64_t inflated = (int64_t)std::max(train_fwd_nodes, 1024u) * 4; + const int64_t sched_size = inflated * 2; + // Both gf_res_prev and gf_res_reserve are used to build forward graphs + // (graph_reserve uses gf_res_reserve; opt_epoch_iter uses gf_res_prev). + // Both must have capacity for the full backward graph. + gf_res_prev.reset(new llm_graph_result(inflated)); + gf_res_reserve.reset(new llm_graph_result(inflated)); + sched.reset(ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), + sched_size, cparams.pipeline_parallel, cparams.op_offload)); + // Suppress the next sched_reserve() call so that llama_decode() during GRPO inference + // steps does NOT replace the training sched with a smaller inference sched. + // opt_ctx->backend_sched stores a raw pointer to sched.get(); replacing sched while + // opt_ctx is alive would leave that pointer dangling and crash on the next opt_epoch. 
+ sched_need_reserve = false; + LLAMA_LOG_INFO("%s: training graph capacity = %lld (train_fwd_nodes=%u x4)\n", + __func__, (long long)inflated, train_fwd_nodes); + } + ggml_opt_params opt_params = ggml_opt_default_params(sched.get(), GGML_OPT_LOSS_TYPE_CROSS_ENTROPY); - opt_params.opt_period = n_batch / n_ubatch; - opt_params.get_opt_pars = lopt_params.get_opt_pars; - opt_params.get_opt_pars_ud = lopt_params.get_opt_pars_ud; - opt_params.optimizer = lopt_params.optimizer_type; + opt_params.opt_period = n_batch / n_ubatch; + opt_params.get_opt_pars = lopt_params.get_opt_pars; + opt_params.get_opt_pars_ud = lopt_params.get_opt_pars_ud; + opt_params.optimizer = lopt_params.optimizer_type; + opt_params.grad_checkpoint_interval = lopt_params.grad_checkpoint_interval; opt_ctx = ggml_opt_init(opt_params); llama_opt_param_filter param_filter = lopt_params.param_filter; @@ -2706,6 +2766,8 @@ void llama_context::opt_epoch_iter( }; uint32_t pos_batch = 0; + static bool timings_printed = false; // print per-ubatch timings only for the first window + struct ggml_context * ctx_compute_opt = nullptr; do { const auto & ubatch = mctx->get_ubatch(); @@ -2718,26 +2780,38 @@ void llama_context::opt_epoch_iter( auto * res = gf_res_prev.get(); + const int64_t t0_build = ggml_time_ms(); const auto gparams = graph_params(res, ubatch, mctx.get(), LLM_GRAPH_TYPE_DEFAULT); res->reset(); auto * gf = model.build_graph(gparams); - struct ggml_context * ctx_compute_opt; - { + // Allocate the tensor metadata context once, then reset it each iteration. + // ggml_reset() is much cheaper than ggml_free()+ggml_init() — it just resets the + // allocation pointer without freeing/reallocating the backing memory buffer. 
+ if (!ctx_compute_opt) { const size_t size_gf = ggml_graph_size(gf); - const size_t size_meta = 4*size_gf*ggml_tensor_overhead() + 2*ggml_graph_overhead_custom(size_gf, /*grads = */ true); + const size_t size_meta = 4*size_gf*ggml_tensor_overhead() + 3*ggml_graph_overhead_custom(size_gf, /*grads = */ true); struct ggml_init_params params = { /*.mem_size =*/ size_meta, /*.mem_buffer =*/ nullptr, /*.no_alloc =*/ true, }; ctx_compute_opt = ggml_init(params); + if (!timings_printed) { + LLAMA_LOG_INFO("%s: [timing] graph capacity=%zu n_nodes=%d size_meta=%.1fMB\n", __func__, + size_gf, ggml_graph_n_nodes(gf), (double)size_meta / (1024*1024)); + } + } else { + ggml_reset(ctx_compute_opt); } + + const int64_t t1_alloc = ggml_time_ms(); ggml_opt_prepare_alloc(opt_ctx, ctx_compute_opt, gf, res->get_inp_tokens(), res->get_logits()); ggml_opt_alloc(opt_ctx, train); + const int64_t t2_inputs = ggml_time_ms(); res->set_inputs(&ubatch); { struct ggml_tensor * labels = ggml_opt_labels(opt_ctx); @@ -2753,14 +2827,29 @@ void llama_context::opt_epoch_iter( ggml_backend_tensor_set(labels, &reward_scale, (pos_ubatch*labels->ne[0] + labels_sparse[ilabel])*sizeof(float), sizeof(float)); } } + + const int64_t t3_eval = ggml_time_ms(); ggml_opt_eval(opt_ctx, result); + + const int64_t t4_done = ggml_time_ms(); + if (!timings_printed) { + LLAMA_LOG_INFO("%s: [timing] build=%" PRId64 "ms alloc=%" PRId64 "ms inputs=%" PRId64 "ms eval=%" PRId64 "ms total=%" PRId64 "ms\n", + __func__, + t1_alloc - t0_build, + t2_inputs - t1_alloc, + t3_eval - t2_inputs, + t4_done - t3_eval, + t4_done - t0_build); + timings_printed = true; + } + if (callback) { callback(train, opt_ctx, dataset, result, idata_in_loop + (pos_ctx + pos_batch)/n_ubatch + 1, ndata_in_loop, t_loop_start); } - ggml_free(ctx_compute_opt); pos_batch += ubatch.n_tokens; } while (mctx->next()); + ggml_free(ctx_compute_opt); } } From 3e4166d3fd9b4ba55ecebc60dcaf4d4e00fe8792 Mon Sep 17 00:00:00 2001 From: Salvatore Rossitto Date: Thu, 12 
Mar 2026 13:19:46 +0100 Subject: [PATCH 06/15] fixed assert in ggml.c GGML_ASSERT(ggml_nelements(adamw_params) == 8) --- ggml/src/ggml.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 1e04911360..f146f13b5f 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -6095,7 +6095,7 @@ struct ggml_tensor * ggml_opt_step_adamw( GGML_ASSERT(ggml_are_same_shape(a, m)); GGML_ASSERT(ggml_are_same_shape(a, v)); GGML_ASSERT(adamw_params->type == GGML_TYPE_F32); - GGML_ASSERT(ggml_nelements(adamw_params) == 7); + GGML_ASSERT(ggml_nelements(adamw_params) == 8); struct ggml_tensor * result = ggml_view_tensor(ctx, a); From e18d20d6c68464c749c6fed059dd3a5449810577 Mon Sep 17 00:00:00 2001 From: Salvatore Rossitto Date: Thu, 12 Mar 2026 13:28:07 +0100 Subject: [PATCH 07/15] fixed missing changes from dev version --- ggml/src/ggml-cuda/ggml-cuda.cu | 4 +++- ggml/src/ggml-cuda/out-prod.cu | 37 +++++++++++++++++++++++++-------- ggml/src/ggml.c | 22 +++++++++++++------- src/llama-adapter.cpp | 24 ++++++++++++++------- 4 files changed, 63 insertions(+), 24 deletions(-) diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index 3e1cca6c98..9e5492f5fc 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -4774,7 +4774,9 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g } } break; case GGML_OP_OUT_PROD: - return op->type == GGML_TYPE_F32 && op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32; + return op->type == GGML_TYPE_F32 + && (op->src[0]->type == GGML_TYPE_F32 || ggml_is_quantized(op->src[0]->type)) + && op->src[1]->type == GGML_TYPE_F32; case GGML_OP_OUT_PROD_ID: return op->src[0] != nullptr && op->src[1] != nullptr && op->src[2] != nullptr && op->type == GGML_TYPE_F32 diff --git a/ggml/src/ggml-cuda/out-prod.cu b/ggml/src/ggml-cuda/out-prod.cu index 9afc323bd9..392de34d1c 100644 --- a/ggml/src/ggml-cuda/out-prod.cu 
+++ b/ggml/src/ggml-cuda/out-prod.cu @@ -1,4 +1,5 @@ #include "out-prod.cuh" +#include "convert.cuh" #include #include @@ -10,7 +11,7 @@ void ggml_cuda_out_prod(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { GGML_TENSOR_BINARY_OP_LOCALS - GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(src0->type == GGML_TYPE_F32 || ggml_is_quantized(src0->type)); GGML_ASSERT(src1->type == GGML_TYPE_F32); GGML_ASSERT(dst->type == GGML_TYPE_F32); @@ -24,19 +25,37 @@ void ggml_cuda_out_prod(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { GGML_ASSERT(ne2 == src1->ne[2]); GGML_ASSERT(ne3 == src1->ne[3]); - const float * src0_d = (const float *) src0->data; - const float * src1_d = (const float *) src1->data; - float * dst_d = (float *) dst->data; - cudaStream_t stream = ctx.stream(); cublasHandle_t handle = ctx.cublas_handle(); + // If src0 is quantized, dequantize to a temp F32 buffer on GPU + ggml_cuda_pool_alloc src0_f32_alloc; + const float * src0_d; + int64_t lda; + + if (src0->type != GGML_TYPE_F32) { + const int64_t n_elements = ggml_nelements(src0); + src0_f32_alloc.alloc(ctx.pool(), n_elements); + + to_fp32_cuda_t to_fp32 = ggml_get_to_fp32_cuda(src0->type); + GGML_ASSERT(to_fp32 != nullptr); + to_fp32(src0->data, src0_f32_alloc.ptr, n_elements, stream); + + src0_d = src0_f32_alloc.ptr; + lda = ne00; // dequantized data is contiguous: stride = ne00 + } else { + src0_d = (const float *) src0->data; + lda = nb01 / sizeof(float); + } + + const float * src1_d = (const float *) src1->data; + float * dst_d = (float *) dst->data; + const float alpha = 1.0f; const float beta = 0.0f; CUBLAS_CHECK(cublasSetStream(handle, stream)); - const int64_t lda = nb01 / sizeof(float); const int64_t ldc = nb1 / sizeof(float); const bool src1_T = ggml_is_transposed(src1); @@ -44,9 +63,9 @@ void ggml_cuda_out_prod(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { const int64_t ldb = (src1_T ? nb10 : nb11) / sizeof(float); GGML_ASSERT( (src1_T ? 
nb11 : nb10) == sizeof(float)); - // data strides in dimensions 2/3 - const size_t s02 = nb02 / sizeof(float); - const size_t s03 = nb03 / sizeof(float); + // data strides in dimensions 2/3 (for dequantized src0, use element-based strides) + const size_t s02 = (src0->type != GGML_TYPE_F32) ? (ne00 * ne01) : (nb02 / sizeof(float)); + const size_t s03 = (src0->type != GGML_TYPE_F32) ? (ne00 * ne01 * ne02) : (nb03 / sizeof(float)); const size_t s12 = nb12 / sizeof(float); const size_t s13 = nb13 / sizeof(float); const size_t s2 = nb2 / sizeof(float); diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index f146f13b5f..255e7d5a88 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -3871,12 +3871,17 @@ struct ggml_tensor * ggml_get_rows_back( struct ggml_tensor * a, struct ggml_tensor * b, struct ggml_tensor * c) { - GGML_ASSERT(ggml_is_matrix(a) && ggml_is_vector(b) && b->type == GGML_TYPE_I32); - GGML_ASSERT(ggml_is_matrix(c) && (a->ne[0] == c->ne[0])); + GGML_ASSERT(b->type == GGML_TYPE_I32); + GGML_ASSERT(a->ne[0] == c->ne[0]); + // Support both 2D and 3D: result shape matches c (the source tensor shape) // TODO: implement non F32 return - //struct ggml_tensor * result = ggml_new_tensor_2d(ctx, a->type, a->ne[0], b->ne[0]); - struct ggml_tensor * result = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, c->ne[0], c->ne[1]); + struct ggml_tensor * result; + if (c->ne[2] > 1) { + result = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, c->ne[0], c->ne[1], c->ne[2]); + } else { + result = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, c->ne[0], c->ne[1]); + } result->op = GGML_OP_GET_ROWS_BACK; result->src[0] = a; @@ -7077,9 +7082,12 @@ void ggml_build_backward_expand( continue; } - // inplace operations are currently not supported - GGML_ASSERT(!node->view_src || node->op == GGML_OP_CPY || node->op == GGML_OP_VIEW || - node->op == GGML_OP_RESHAPE || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_TRANSPOSE); + // inplace operations are currently not supported — warn and skip instead of 
crashing + if (node->view_src && node->op != GGML_OP_CPY && node->op != GGML_OP_VIEW && + node->op != GGML_OP_RESHAPE && node->op != GGML_OP_PERMUTE && node->op != GGML_OP_TRANSPOSE) { + GGML_LOG_WARN("%s: skipping unsupported inplace op '%s' in backward graph\n", __func__, ggml_op_name(node->op)); + continue; + } const size_t ihash = ggml_hash_find(&cgraph->visited_hash_set, node); GGML_ASSERT(ihash != GGML_HASHSET_FULL); diff --git a/src/llama-adapter.cpp b/src/llama-adapter.cpp index d6a5800e63..1fee93ba40 100644 --- a/src/llama-adapter.cpp +++ b/src/llama-adapter.cpp @@ -334,16 +334,26 @@ static void llama_adapter_lora_init_impl(llama_model & model, const char * path_ auto * buft = ggml_backend_buffer_get_type(model_tensor->buffer); - // do not load loras to extra buffer types (i.e. bufts for repacking) -> use the CPU in that case + // do not load loras to extra buffer types (i.e. bufts for repacking) + // try device-native buft first (keeps LoRA on GPU), fall back to CPU only as last resort for (auto & ex : buft_extra) { if (ex == buft) { - LLAMA_LOG_WARN("%s: lora for '%s' cannot use buft '%s', fallback to CPU\n", __func__, model_tensor->name, ggml_backend_buft_name(buft)); - - auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU); - if (!cpu_dev) { - throw std::runtime_error(format("%s: no CPU backend found", __func__)); + // try to get the device's native (non-repack) buffer type + auto * dev = ggml_backend_buft_get_device(buft); + auto * native_buft = dev ? 
ggml_backend_dev_buffer_type(dev) : nullptr; + if (native_buft && native_buft != buft) { + LLAMA_LOG_WARN("%s: lora for '%s' cannot use repack buft '%s', using device-native '%s'\n", + __func__, model_tensor->name, ggml_backend_buft_name(buft), ggml_backend_buft_name(native_buft)); + buft = native_buft; + } else { + LLAMA_LOG_WARN("%s: lora for '%s' cannot use buft '%s', fallback to CPU\n", + __func__, model_tensor->name, ggml_backend_buft_name(buft)); + auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU); + if (!cpu_dev) { + throw std::runtime_error(format("%s: no CPU backend found", __func__)); + } + buft = ggml_backend_dev_buffer_type(cpu_dev); } - buft = ggml_backend_dev_buffer_type(cpu_dev); break; } From dbe24a74717b86236cfc71e278a7f7d42c4a244c Mon Sep 17 00:00:00 2001 From: Salvatore Rossitto Date: Thu, 12 Mar 2026 18:59:24 +0100 Subject: [PATCH 08/15] - fixed some python warning - removed nemotron_reap.py based on bnb (off topic) --- tools/moe-pruning/analyze_stats.py | 2 +- tools/moe-pruning/gguf_prune.py | 10 +- tools/moe-pruning/nemotron_reap.py | 296 ----------------------------- 3 files changed, 7 insertions(+), 301 deletions(-) delete mode 100644 tools/moe-pruning/nemotron_reap.py diff --git a/tools/moe-pruning/analyze_stats.py b/tools/moe-pruning/analyze_stats.py index e7641a3bb6..2e0821f323 100644 --- a/tools/moe-pruning/analyze_stats.py +++ b/tools/moe-pruning/analyze_stats.py @@ -3,7 +3,7 @@ analyze_stats.py -- Summarize expert_stats.json and model size projections. 
Usage: python analyze_stats.py [stats_file] [--keep 0.5] """ -import json, sys, statistics, argparse +import json, statistics, argparse parser = argparse.ArgumentParser() parser.add_argument("stats", nargs="?", default="expert_stats_reap.json") diff --git a/tools/moe-pruning/gguf_prune.py b/tools/moe-pruning/gguf_prune.py index 2a36d5cf53..df3e638ab4 100644 --- a/tools/moe-pruning/gguf_prune.py +++ b/tools/moe-pruning/gguf_prune.py @@ -31,14 +31,15 @@ Usage: --keep_n 32 """ +from __future__ import annotations + import argparse import json import re -import sys from pathlib import Path import numpy as np -from gguf import GGUFReader, GGUFWriter, GGMLQuantizationType, GGUFValueType +from gguf import GGUFReader, GGUFWriter, GGUFValueType # ── Constants ───────────────────────────────────────────────────────────────── @@ -187,7 +188,7 @@ def main(): kept: dict[int, list[int]] = {} for tensor in reader.tensors: il, suffix = layer_and_suffix(tensor.name) - if il is None or not is_expert_suffix(suffix): + if il is None or suffix is None or not is_expert_suffix(suffix): continue if il in kept: continue # already computed for this layer @@ -222,9 +223,10 @@ def main(): n_pruned = 0 for tensor in reader.tensors: il, suffix = layer_and_suffix(tensor.name) - is_expert = il is not None and is_expert_suffix(suffix) + is_expert = il is not None and suffix is not None and is_expert_suffix(suffix) if is_expert: + assert il is not None k = kept[il] data = slice_expert_axis(tensor.data, k) writer.add_tensor( diff --git a/tools/moe-pruning/nemotron_reap.py b/tools/moe-pruning/nemotron_reap.py deleted file mode 100644 index fac5831d3c..0000000000 --- a/tools/moe-pruning/nemotron_reap.py +++ /dev/null @@ -1,296 +0,0 @@ -""" -NemotronH Expert Activation Profiler + Pruner -Two-phase: profile with 4-bit on GPU, prune bf16 on CPU. 
- -Usage: - # Phase 1 - profile - python nemotron_reap.py profile \ - --model unsloth/Nemotron-3-Nano-30B-A3B \ - --prompts training-data.jsonl \ - --output expert_stats.json - - # Phase 2 - prune - python nemotron_reap.py prune \ - --model unsloth/Nemotron-3-Nano-30B-A3B \ - --stats expert_stats.json \ - --keep_ratio 0.20 \ - --output ./nemotron-pruned-25e -""" - -import os -os.environ["TORCH_COMPILE_DISABLE"] = "1" # prevent inductor hang during save_pretrained - -import json -import argparse -import torch -import numpy as np -from collections import defaultdict -from transformers import AutoTokenizer, AutoModelForCausalLM - -try: - from transformers import BitsAndBytesConfig - import patch_bnb # noqa: F401 — patches Params4bit.__new__ for transformers 5.x compat - HAS_BNB = True -except ImportError: - HAS_BNB = False - - -# ── Tracker ─────────────────────────────────────────────────────────────────── - -class ExpertActivationTracker: - def __init__(self, n_experts: int = 128): - self.n_experts = n_experts - self.activation_counts = defaultdict(lambda: np.zeros(n_experts, dtype=np.int64)) - self.activation_weights = defaultdict(lambda: np.zeros(n_experts, dtype=np.float64)) - self.total_tokens = defaultdict(int) - self._hooks = [] - - def register_hooks(self, model): - count = 0 - for layer_idx, block in enumerate(model.backbone.layers): - if block.block_type == "moe": - h = block.mixer.gate.register_forward_hook(self._make_hook(layer_idx)) - self._hooks.append(h) - count += 1 - print(f" Hooks attached to {count} MoE layers") - - def _make_hook(self, layer_idx): - def hook(module, input, output): - topk_indices, topk_weights = output - idx = topk_indices.detach().cpu().numpy() # [T, 6] - wgt = topk_weights.detach().float().cpu().numpy() # [T, 6] - T = idx.shape[0] - self.total_tokens[layer_idx] += T - np.add.at(self.activation_counts[layer_idx], idx.flatten(), 1) - np.add.at(self.activation_weights[layer_idx], idx.flatten(), wgt.flatten()) - return hook - - def 
remove_hooks(self): - for h in self._hooks: - h.remove() - self._hooks.clear() - - def get_stats(self) -> dict: - stats = {} - for layer_idx in sorted(self.activation_counts): - counts = self.activation_counts[layer_idx] - weights = self.activation_weights[layer_idx] - total = self.total_tokens[layer_idx] - freq = counts / (total + 1e-9) - avg_w = np.where(counts > 0, weights / counts, 0.0) - importance = freq * avg_w - stats[layer_idx] = { - "total_tokens": int(total), - "activation_counts": counts.tolist(), - "activation_frequency": freq.tolist(), - "avg_weight": avg_w.tolist(), - "importance_score": importance.tolist(), - "never_activated": int((counts == 0).sum()), - } - return stats - - def print_summary(self, stats, keep_ratio): - keep_n = max(1, int(self.n_experts * keep_ratio)) - print(f"\n{'='*70}") - print(f" PROFILING SUMMARY | keep_ratio={keep_ratio:.0%} | keeping {keep_n}/128 experts/layer") - print(f"{'='*70}") - for li, s in stats.items(): - imp = np.array(s['importance_score']) - threshold = np.sort(imp)[self.n_experts - keep_n] - print( - f" Layer {li:3d}: " - f"never_activated={s['never_activated']:3d}/128 " - f"top_freq={max(s['activation_frequency']):.3f} " - f"threshold={threshold:.4f}" - ) - total_moe = len(stats) - print(f"\n MoE layers : {total_moe}") - print(f" Kept : {total_moe * keep_n} experts total") - print(f" Pruned : {total_moe * (self.n_experts - keep_n)} experts total") - print(f"{'='*70}\n") - - -# ── Phase 1: Profile ────────────────────────────────────────────────────────── - -def cmd_profile(args): - # Mamba2 layers use Triton kernels — CUDA required. - # 4-bit NF4 fits in 16GB VRAM (~15GB). We must keep ALL layers on GPU - # (no CPU spillover) otherwise PCIe transfers make inference unusably slow. 
- print(f"\n[Phase 1] Profiling — 4-bit NF4, GPU only") - print(f" Model : {args.model}") - print(f" Prompts: {args.prompts}") - - bnb_config = BitsAndBytesConfig( - load_in_4bit=True, - bnb_4bit_quant_type="nf4", - bnb_4bit_compute_dtype=torch.bfloat16, - bnb_4bit_use_double_quant=True, - ) - - tokenizer = AutoTokenizer.from_pretrained(args.model, trust_remote_code=True) - print(" Loading model in 4-bit NF4...") - model = AutoModelForCausalLM.from_pretrained( - args.model, - trust_remote_code=True, - quantization_config=bnb_config, - device_map={"": 0}, # force ALL layers onto GPU 0, no CPU spillover - ) - model.eval() - print(" Model loaded on GPU.") - - # Load prompt+response pairs - pairs = [] - with open(args.prompts) as f: - for line in f: - line = line.strip() - if not line: - continue - obj = json.loads(line) - text = obj.get("prompt", "") + "\n" + obj.get("response", "") - pairs.append(text) - print(f" Loaded {len(pairs)} prompt+response pairs") - - tracker = ExpertActivationTracker(n_experts=128) - tracker.register_hooks(model) - - with torch.no_grad(): - for i, text in enumerate(pairs): - if i % 100 == 0: - print(f" [{i+1}/{len(pairs)}] processing...") - inputs = tokenizer( - text, - return_tensors="pt", - truncation=True, - max_length=args.max_length, - ).to("cuda") - try: - model(**inputs) - except torch.cuda.OutOfMemoryError: - print(f" [{i+1}] OOM — skipping") - torch.cuda.empty_cache() - - tracker.remove_hooks() - stats = tracker.get_stats() - tracker.print_summary(stats, keep_ratio=args.keep_ratio) - - stats_out = {str(k): v for k, v in stats.items()} - with open(args.output, "w") as f: - json.dump(stats_out, f, indent=2) - print(f" Stats saved → {args.output}") - print(f"\n[Phase 1] Done. 
Run 'prune' next (CPU only).") - - -# ── Phase 2: Prune ──────────────────────────────────────────────────────────── - -def cmd_prune(args): - print(f"\n[Phase 2] Pruning — bf16 on CPU") - print(f" Model : {args.model}") - print(f" Stats : {args.stats}") - print(f" Keep ratio : {args.keep_ratio:.0%}") - print(f" Output : {args.output}") - - with open(args.stats) as f: - stats = {int(k): v for k, v in json.load(f).items()} - - tokenizer = AutoTokenizer.from_pretrained(args.model, trust_remote_code=True) - - print(" Loading model in bf16 on CPU — this takes a few minutes...") - model = AutoModelForCausalLM.from_pretrained( - args.model, - trust_remote_code=True, - dtype=torch.bfloat16, - device_map="cpu", - ) - - keep_n = max(1, int(128 * args.keep_ratio)) - print(f"\n Pruning to top-{keep_n} experts per MoE layer...\n") - - for layer_idx, block in enumerate(model.backbone.layers): - if block.block_type != "moe": - continue - - if layer_idx not in stats: - print(f" Layer {layer_idx:3d}: no profiling data — skipping") - continue - - # Use REAP score if available (from llama.cpp profiler), else fall back to legacy importance_score - layer_stats = stats[layer_idx] - if "reap" in layer_stats: - importance = np.array(layer_stats["reap"]) - else: - importance = np.array(layer_stats["importance_score"]) - keep_sorted = sorted(np.argsort(importance)[-keep_n:].tolist()) - prune_count = 128 - len(keep_sorted) - - # Prune expert list - block.mixer.experts = torch.nn.ModuleList( - [block.mixer.experts[i] for i in keep_sorted] - ) - - # Prune router weights to match new expert indices - keep_t = torch.tensor(keep_sorted, dtype=torch.long) - block.mixer.gate.weight = torch.nn.Parameter( - block.mixer.gate.weight.data[keep_t].clone() - ) - old_bias = block.mixer.gate.e_score_correction_bias.data[keep_t].clone() - block.mixer.gate.register_buffer("e_score_correction_bias", old_bias) - block.mixer.gate.n_routed_experts = keep_n - - never = stats[layer_idx]["never_activated"] - 
print(f" Layer {layer_idx:3d}: kept {keep_n}, pruned {prune_count} (was {never} never-activated)") - - # Patch top-level config - model.config.n_routed_experts = keep_n - - # Fix transformers 5.x incompatibility: _tied_weights_keys must be a list of dicts, - # but the custom NemotronH modeling code sets it as a plain list of strings. - # _get_tied_weight_keys() calls .keys() on each element → AttributeError. - # Clear it — lm_head weight tying is not needed for inference on the pruned model. - for mod in model.modules(): - if isinstance(getattr(mod, '_tied_weights_keys', None), list): - mod._tied_weights_keys = None - - # Disable torch.compile / inductor before saving — transformers 5.x can trigger - # torch._inductor.compile_worker during save_pretrained, causing an indefinite hang. - import os - os.environ["TORCH_COMPILE_DISABLE"] = "1" - torch._dynamo.reset() - - print(f"\n Saving pruned model → {args.output}") - with torch.no_grad(): - model.save_pretrained(args.output, safe_serialization=True) - tokenizer.save_pretrained(args.output) - print(f"\n[Phase 2] Done.") - print(f" Experts per MoE layer : {keep_n}/128") - print(f" Next: fine-tune with Unsloth from {args.output}") - - -# ── Entry point ─────────────────────────────────────────────────────────────── - -def main(): - parser = argparse.ArgumentParser(description="NemotronH Expert Pruner (REAP-style)") - sub = parser.add_subparsers(dest="cmd", required=True) - - p1 = sub.add_parser("profile", help="Phase 1: profile expert activations (GPU, 4-bit)") - p1.add_argument("--model", default="unsloth/Nemotron-3-Nano-30B-A3B") - p1.add_argument("--prompts", required=True) - p1.add_argument("--output", default="expert_stats.json") - p1.add_argument("--keep_ratio", type=float, default=0.20, - help="Preview ratio for summary only — does not affect saved stats") - p1.add_argument("--max_length", type=int, default=2048) - - p2 = sub.add_parser("prune", help="Phase 2: prune model using saved stats (CPU, bf16)") - 
p2.add_argument("--model", default="unsloth/Nemotron-3-Nano-30B-A3B") - p2.add_argument("--stats", default="expert_stats.json") - p2.add_argument("--keep_ratio", type=float, default=0.20) - p2.add_argument("--output", default="./nemotron-pruned") - - args = parser.parse_args() - if args.cmd == "profile": - cmd_profile(args) - elif args.cmd == "prune": - cmd_prune(args) - - -if __name__ == "__main__": - main() From 2e324f6c9a9f7964c033d5edd2576a4e83a5fa66 Mon Sep 17 00:00:00 2001 From: Salvatore Rossitto Date: Thu, 12 Mar 2026 19:01:46 +0100 Subject: [PATCH 09/15] removed some python warning/unused import --- examples/qlora_training/grpo_example.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/examples/qlora_training/grpo_example.py b/examples/qlora_training/grpo_example.py index 9c7f2e3c26..c56ff9395f 100644 --- a/examples/qlora_training/grpo_example.py +++ b/examples/qlora_training/grpo_example.py @@ -34,7 +34,6 @@ Python → C++ stdin: import argparse import logging import math -import os import re import subprocess import sys @@ -99,6 +98,7 @@ def read_ipc(proc: subprocess.Popen, timeout: float = 120.0) -> Optional[Tuple[s Returns None on EOF. Raises TimeoutError if nothing arrives within `timeout` seconds. 
""" + assert proc.stdout is not None deadline = time.monotonic() + timeout while True: remaining = deadline - time.monotonic() @@ -120,6 +120,7 @@ def read_ipc(proc: subprocess.Popen, timeout: float = 120.0) -> Optional[Tuple[s def write_cmd(proc: subprocess.Popen, cmd: str): """Write one command line to the subprocess stdin.""" + assert proc.stdin is not None try: proc.stdin.write(cmd + "\n") proc.stdin.flush() @@ -268,7 +269,8 @@ def run_grpo(args: argparse.Namespace): raise finally: try: - proc.stdin.close() + if proc.stdin is not None: + proc.stdin.close() except Exception: pass rc = proc.wait(timeout=30) From 959f789bc9ba56e963b32a3ff637ed6f0c6ac97b Mon Sep 17 00:00:00 2001 From: Salvatore Rossitto Date: Thu, 12 Mar 2026 19:02:54 +0100 Subject: [PATCH 10/15] removed trailing whitespaces --- examples/qlora_training/check_lora_norms.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/qlora_training/check_lora_norms.py b/examples/qlora_training/check_lora_norms.py index 8908f3ad99..3f9d20c8c8 100644 --- a/examples/qlora_training/check_lora_norms.py +++ b/examples/qlora_training/check_lora_norms.py @@ -42,10 +42,10 @@ def read_gguf(path): pos = f.tell() align = 32 data_start = (pos + align - 1) & ~(align - 1) - + print(f"\nFile: {path}") print(f"Tensors: {n_tensors}") - + for name, dims, dtype, offset in tensors[:10]: # first 10 if dtype != 0: # only F32 (type 0) print(f" {name}: dims={dims} type={dtype} (non-F32, skipping norm)") From 1ebb82862ad7ff2ecc0fc6e432d40ad2b97bdd4c Mon Sep 17 00:00:00 2001 From: Salvatore Rossitto Date: Thu, 12 Mar 2026 19:03:45 +0100 Subject: [PATCH 11/15] added final newline in requirements.txt --- tools/moe-pruning/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/moe-pruning/requirements.txt b/tools/moe-pruning/requirements.txt index 8f778e9fe5..6b21f239f4 100644 --- a/tools/moe-pruning/requirements.txt +++ b/tools/moe-pruning/requirements.txt @@ -1 +1 @@ -gguf \ 
No newline at end of file +gguf From d1f8d527ba869d1c56808ee0105bba781c2a3902 Mon Sep 17 00:00:00 2001 From: Salvatore Rossitto Date: Thu, 12 Mar 2026 19:09:16 +0100 Subject: [PATCH 12/15] added new line at end --- examples/qlora_training/check_lora_norms.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/qlora_training/check_lora_norms.py b/examples/qlora_training/check_lora_norms.py index 3f9d20c8c8..c54ed4e7d1 100644 --- a/examples/qlora_training/check_lora_norms.py +++ b/examples/qlora_training/check_lora_norms.py @@ -61,4 +61,4 @@ if __name__ == '__main__': try: read_gguf(p) except Exception as e: - print(f"Error reading {p}: {e}") \ No newline at end of file + print(f"Error reading {p}: {e}") From 99c2456bbbf290074e9200e91af973d92a014774 Mon Sep 17 00:00:00 2001 From: Salvatore Rossitto Date: Fri, 13 Mar 2026 10:29:55 +0100 Subject: [PATCH 13/15] removed error guard on dataset (its not generic) --- examples/qlora_training/finetune_qlora.cpp | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/examples/qlora_training/finetune_qlora.cpp b/examples/qlora_training/finetune_qlora.cpp index 57972712fc..cf6705c5a6 100644 --- a/examples/qlora_training/finetune_qlora.cpp +++ b/examples/qlora_training/finetune_qlora.cpp @@ -217,11 +217,12 @@ static std::vector load_jsonl( break; } } - if (last_assistant_content.find("Error:") != std::string::npos || - last_assistant_content.find("error:") != std::string::npos) { - LOG_DBG("%s: skipping line %d — assistant response contains error marker\n", __func__, lineno); - continue; - } + // // this should be done on the python side... + // if (last_assistant_content.find("Error:") != std::string::npos || + // last_assistant_content.find("error:") != std::string::npos) { + // LOG_DBG("%s: skipping line %d — assistant response contains error marker\n", __func__, lineno); + // continue; + // } } // Split into prompt (no loss) + last assistant response (loss). 
@@ -262,11 +263,12 @@ static std::vector load_jsonl( } } else if (j.contains("prompt") && j.contains("response")) { response_text = j["response"].get(); - if (response_text.find("Error:") != std::string::npos || - response_text.find("error:") != std::string::npos) { - LOG_DBG("%s: skipping line %d — response contains error marker\n", __func__, lineno); - continue; - } + // // this should be done on the python side... + // if (response_text.find("Error:") != std::string::npos || + // response_text.find("error:") != std::string::npos) { + // LOG_DBG("%s: skipping line %d — response contains error marker\n", __func__, lineno); + // continue; + // } prompt_text = j["prompt"].get(); } else if (j.contains("text")) { response_text = j["text"].get(); From a6979f3d5667019998e04879ead5b2cae91fa276 Mon Sep 17 00:00:00 2001 From: Salvatore Rossitto Date: Tue, 17 Mar 2026 09:07:21 +0100 Subject: [PATCH 14/15] added sigmoid backward pass --- ggml/src/ggml.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index f85f45b3de..4ac4d9ea7d 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -6837,6 +6837,13 @@ static void ggml_compute_backward( ggml_add_or_set(ctx, cgraph, isrc0, ggml_mul(ctx, grad, ggml_exp(ctx, src0))); } } break; + case GGML_UNARY_OP_SIGMOID: { + // d/dx sigmoid(x) = sigmoid(x) * (1 - sigmoid(x)) = tensor - tensor^2 + if (src0_needs_grads) { + struct ggml_tensor * dsigmoid = ggml_sub(ctx, tensor, ggml_sqr(ctx, tensor)); + ggml_add_or_set(ctx, cgraph, isrc0, ggml_mul(ctx, grad, dsigmoid)); + } + } break; case GGML_UNARY_OP_SOFTPLUS: { if (src0_needs_grads) { ggml_add_or_set(ctx, cgraph, isrc0, ggml_mul(ctx, grad, ggml_sigmoid(ctx, src0))); From 51fee3e7ef73cc35ad76da07805597d5f2b9c672 Mon Sep 17 00:00:00 2001 From: Salvatore Rossitto Date: Wed, 18 Mar 2026 08:05:05 +0100 Subject: [PATCH 15/15] removed moe reap code merged by mistake --- tools/expert-profile/CMakeLists.txt | 8 - tools/expert-profile/expert-profile.cpp | 506 
--------------------- tools/moe-pruning/README.md | 97 ---- tools/moe-pruning/analyze_stats.py | 284 ------------ tools/moe-pruning/build_expert_profile.sh | 42 -- tools/moe-pruning/extract_ppl.py | 41 -- tools/moe-pruning/gguf_prune.py | 260 ----------- tools/moe-pruning/requirements.txt | 1 - tools/moe-pruning/sample_calibration.jsonl | 8 - 9 files changed, 1247 deletions(-) delete mode 100644 tools/expert-profile/CMakeLists.txt delete mode 100644 tools/expert-profile/expert-profile.cpp delete mode 100644 tools/moe-pruning/README.md delete mode 100644 tools/moe-pruning/analyze_stats.py delete mode 100644 tools/moe-pruning/build_expert_profile.sh delete mode 100644 tools/moe-pruning/extract_ppl.py delete mode 100644 tools/moe-pruning/gguf_prune.py delete mode 100644 tools/moe-pruning/requirements.txt delete mode 100644 tools/moe-pruning/sample_calibration.jsonl diff --git a/tools/expert-profile/CMakeLists.txt b/tools/expert-profile/CMakeLists.txt deleted file mode 100644 index 859bd77a53..0000000000 --- a/tools/expert-profile/CMakeLists.txt +++ /dev/null @@ -1,8 +0,0 @@ -set(TARGET llama-expert-profile) -add_executable(${TARGET} expert-profile.cpp) -target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_17) - -if(LLAMA_TOOLS_INSTALL) - install(TARGETS ${TARGET} RUNTIME) -endif() diff --git a/tools/expert-profile/expert-profile.cpp b/tools/expert-profile/expert-profile.cpp deleted file mode 100644 index de381ff1f1..0000000000 --- a/tools/expert-profile/expert-profile.cpp +++ /dev/null @@ -1,506 +0,0 @@ -/** - * expert-profile: NemotronH MoE expert activation profiler (REAP implementation) - * - * Implements the REAP (Router-weighted Expert Activation Pruning) saliency criterion: - * - * REAP(j) = mean over tokens routed to j of: gate_weight(j,t) * ||expert_output(j,t)||_2 - * - * where expert_output is ffn_moe_down (the FFN output BEFORE gate weighting), - * and gate_weight is 
ffn_moe_weights (post-softmax routing probability). - * - * Intercepts three tensors per MoE layer via ggml eval callback: - * ffn_moe_topk-{il} [n_expert_used, n_tokens] I32 — which experts were selected - * ffn_moe_weights-{il} [1, n_expert_used, n_tokens] F32 — gate weights (softmax probs) - * ffn_moe_down-{il} [n_embd, n_expert_used, n_tokens] F32 — expert outputs (pre-weighting) - * - * Reference: "REAP: Router-weighted Expert Activation Pruning" (arXiv:2510.13999) - * score = mean_{x in X_j}[ g_j(x) * ||f_j(x)||_2 ] (Equation 9) - * - * Usage: - * llama-expert-profile \ - * -m model.gguf --jsonl training-data.jsonl --output expert_stats.json \ - * [--n-experts 128] [--ctx-size 16384] [-ngl 32] [-t 24] [--save-every 1] - */ - -#include "arg.h" -#include "common.h" -#include "log.h" -#include "llama.h" -#include "ggml-backend.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -// ─── Per-layer stats ────────────────────────────────────────────────────────── - -struct LayerStats { - int64_t n_experts = 0; - int64_t total_tokens = 0; // tokens processed through this layer - - // Frequency / weighted-frequency (kept for reference/comparison) - std::vector activation_counts; // [n_experts] — how many tokens routed here - std::vector weighted_freq_sum; // [n_experts] — sum of gate weights - - // REAP: running sum and count for computing mean(gate_weight * ||expert_out||_2) - std::vector reap_sum; // [n_experts] — sum of g_j(t)*||f_j(t)||_2 - std::vector ean_sum; // [n_experts] — sum of ||f_j(t)||_2 (EAN, no gate) - - void init(int64_t n) { - n_experts = n; - activation_counts.assign(n, 0); - weighted_freq_sum.assign(n, 0.0); - reap_sum.assign(n, 0.0); - ean_sum.assign(n, 0.0); - } - - // Called once we have all three tensors for a batch. 
- // expert_ids: [n_expert_used * n_tokens] I32 — flat, column-major: [k + t*n_expert_used] - // gate_weights:[n_expert_used * n_tokens] F32 — same layout - // expert_outs: [n_embd * n_expert_used * n_tokens] F32 — layout: [e + k*n_embd + t*n_embd*n_expert_used] - // i.e. for token t, expert-slot k: out vector starts at t*n_embd*n_expert_used + k*n_embd - void add_batch(const int32_t * expert_ids, - const float * gate_weights, - const float * expert_outs, - int64_t n_expert_used, - int64_t n_tok, - int64_t n_embd) { - total_tokens += n_tok; - for (int64_t t = 0; t < n_tok; ++t) { - for (int64_t k = 0; k < n_expert_used; ++k) { - const int64_t flat = k + t * n_expert_used; - const int32_t eid = expert_ids[flat]; - if (eid < 0 || eid >= n_experts) continue; - - const float gw = gate_weights[flat]; - - // L2 norm of expert output vector for this (token, expert-slot) - const float * vec = expert_outs + t * n_embd * n_expert_used + k * n_embd; - double norm2 = 0.0; - for (int64_t d = 0; d < n_embd; ++d) { - norm2 += (double)vec[d] * (double)vec[d]; - } - const double norm = std::sqrt(norm2); - - activation_counts [eid] += 1; - weighted_freq_sum [eid] += gw; - reap_sum [eid] += gw * norm; // REAP numerator - ean_sum [eid] += norm; // EAN numerator - } - } - } -}; - -// ─── Collector ──────────────────────────────────────────────────────────────── - -struct ExpertCollector { - int64_t n_experts = 128; - - std::map layer_stats; - std::mutex mtx; - - // We need all three tensors before we can compute REAP. - // They arrive in order: topk → weights → down (per the graph build order). - // Store pending topk+weights until down arrives. 
- struct PendingBatch { - int64_t n_expert_used = 0; - int64_t n_tokens = 0; - std::vector expert_ids; // [n_expert_used * n_tokens] - std::vector gate_weights; // [n_expert_used * n_tokens] - bool has_topk = false; - bool has_weights = false; - }; - std::map pending; // layer_idx → pending - - // Strip device prefix/suffix: "CUDA0#ffn_moe_down-5#0" → "ffn_moe_down-5" - static std::string clean_name(const char * raw) { - const char * p = strchr(raw, '#'); - if (p) { - ++p; - const char * q = strchr(p, '#'); - return q ? std::string(p, q - p) : std::string(p); - } - return raw; - } - - bool wants(struct ggml_tensor * t) { - if (!t->name[0]) return false; - const std::string n = clean_name(t->name); - return (n.compare(0, 13, "ffn_moe_topk-") == 0 || - n.compare(0, 16, "ffn_moe_weights-") == 0 || - n.compare(0, 13, "ffn_moe_down-") == 0); - } - - bool on_tensor(struct ggml_tensor * t) { - const std::string name = clean_name(t->name); - - // Identify tensor type and layer - int il = -1; - bool is_topk = false; - bool is_weights = false; - bool is_down = false; - - if (name.compare(0, 13, "ffn_moe_topk-") == 0) { il = atoi(name.c_str() + 13); is_topk = true; } - else if (name.compare(0, 16, "ffn_moe_weights-") == 0) { il = atoi(name.c_str() + 16); is_weights = true; } - else if (name.compare(0, 13, "ffn_moe_down-") == 0) { il = atoi(name.c_str() + 13); is_down = true; } - else return true; - - if (il < 0) return true; - - // Copy tensor data from (possibly GPU) buffer to host - const size_t nbytes = ggml_nbytes(t); - std::vector buf(nbytes); - ggml_backend_tensor_get(t, buf.data(), 0, nbytes); - - std::lock_guard lk(mtx); - PendingBatch & pb = pending[il]; - - if (is_topk) { - // [n_expert_used, n_tokens] I32 - pb.n_expert_used = t->ne[0]; - pb.n_tokens = t->ne[1]; - pb.expert_ids.resize(pb.n_expert_used * pb.n_tokens); - memcpy(pb.expert_ids.data(), buf.data(), pb.n_expert_used * pb.n_tokens * sizeof(int32_t)); - pb.has_topk = true; - pb.has_weights = false; // reset 
in case of re-use - - } else if (is_weights) { - // [1, n_expert_used, n_tokens] F32 — flat layout same as topk - if (!pb.has_topk) return true; // shouldn't happen - pb.gate_weights.resize(pb.n_expert_used * pb.n_tokens); - memcpy(pb.gate_weights.data(), buf.data(), pb.n_expert_used * pb.n_tokens * sizeof(float)); - pb.has_weights = true; - - } else if (is_down) { - // [n_embd, n_expert_used, n_tokens] F32 - if (!pb.has_topk || !pb.has_weights) return true; - - const int64_t n_embd = t->ne[0]; - const int64_t n_expert_used = t->ne[1]; - const int64_t n_tokens = t->ne[2]; - - // Sanity check - if (n_expert_used != pb.n_expert_used || n_tokens != pb.n_tokens) { - LOG_ERR("expert-profile: dimension mismatch at layer %d\n", il); - pending.erase(il); - return true; - } - - // Ensure layer stats initialised - auto & ls = layer_stats[il]; - if (ls.n_experts == 0) ls.init(n_experts); - - const float * expert_outs = reinterpret_cast(buf.data()); - ls.add_batch(pb.expert_ids.data(), pb.gate_weights.data(), - expert_outs, n_expert_used, n_tokens, n_embd); - - // Done with this batch for this layer - pending.erase(il); - } - - return true; - } -}; - -// ─── Global collector + C callback ─────────────────────────────────────────── - -static ExpertCollector g_collector; - -static bool expert_eval_callback(struct ggml_tensor * t, bool ask, void * /*user_data*/) { - if (ask) return g_collector.wants(t); - return g_collector.on_tensor(t); -} - -// ─── JSON output ────────────────────────────────────────────────────────────── - -static void save_stats(const std::string & path) { - std::ofstream f(path); - if (!f) { - LOG_ERR("expert-profile: failed to open output file '%s'\n", path.c_str()); - return; - } - - f << "{\n"; - bool first_layer = true; - for (auto & [il, ls] : g_collector.layer_stats) { - if (!first_layer) f << ",\n"; - first_layer = false; - - f << " \"" << il << "\": {\n"; - f << " \"total_tokens\": " << ls.total_tokens << ",\n"; - - // activation_counts - f << " 
\"activation_counts\": ["; - for (int64_t i = 0; i < ls.n_experts; ++i) { - if (i) f << ", "; - f << ls.activation_counts[i]; - } - f << "],\n"; - - // activation_frequency - f << " \"activation_frequency\": ["; - for (int64_t i = 0; i < ls.n_experts; ++i) { - if (i) f << ", "; - f << ((ls.total_tokens > 0) ? (double)ls.activation_counts[i] / ls.total_tokens : 0.0); - } - f << "],\n"; - - // avg_gate_weight (weighted_freq_sum / activation_counts) - f << " \"avg_gate_weight\": ["; - for (int64_t i = 0; i < ls.n_experts; ++i) { - if (i) f << ", "; - f << ((ls.activation_counts[i] > 0) ? ls.weighted_freq_sum[i] / ls.activation_counts[i] : 0.0); - } - f << "],\n"; - - // ean_mean = ean_sum / activation_counts (EAN criterion, no gate weight) - f << " \"ean_mean\": ["; - for (int64_t i = 0; i < ls.n_experts; ++i) { - if (i) f << ", "; - f << ((ls.activation_counts[i] > 0) ? ls.ean_sum[i] / ls.activation_counts[i] : 0.0); - } - f << "],\n"; - - // reap = reap_sum / activation_counts (REAP criterion, Eq.9) - f << " \"reap\": ["; - for (int64_t i = 0; i < ls.n_experts; ++i) { - if (i) f << ", "; - f << ((ls.activation_counts[i] > 0) ? 
ls.reap_sum[i] / ls.activation_counts[i] : 0.0); - } - f << "],\n"; - - // never_activated - int64_t never = 0; - for (int64_t i = 0; i < ls.n_experts; ++i) { - if (ls.activation_counts[i] == 0) ++never; - } - f << " \"never_activated\": " << never << "\n"; - f << " }"; - } - f << "\n}\n"; - - LOG_INF("expert-profile: stats saved to '%s' (%zu MoE layers)\n", - path.c_str(), g_collector.layer_stats.size()); -} - -// ─── JSONL input ────────────────────────────────────────────────────────────── - -struct JsonPair { std::string prompt, response; }; - -static bool json_get_string(const std::string & line, const std::string & key, std::string & out) { - std::string search = "\"" + key + "\""; - size_t kpos = line.find(search); - if (kpos == std::string::npos) return false; - size_t colon = line.find(':', kpos + search.size()); - if (colon == std::string::npos) return false; - size_t q1 = line.find('"', colon + 1); - if (q1 == std::string::npos) return false; - out.clear(); - for (size_t i = q1 + 1; i < line.size(); ++i) { - if (line[i] == '\\' && i + 1 < line.size()) { - ++i; - switch (line[i]) { - case '"': out += '"'; break; - case '\\': out += '\\'; break; - case 'n': out += '\n'; break; - case 'r': out += '\r'; break; - case 't': out += '\t'; break; - default: out += line[i]; break; - } - } else if (line[i] == '"') { - return true; - } else { - out += line[i]; - } - } - return false; -} - -static std::vector load_jsonl(const std::string & path) { - std::vector pairs; - std::ifstream f(path); - if (!f) { LOG_ERR("expert-profile: cannot open JSONL file '%s'\n", path.c_str()); return pairs; } - std::string line; - while (std::getline(f, line)) { - if (line.empty()) continue; - JsonPair p; - json_get_string(line, "prompt", p.prompt); - json_get_string(line, "response", p.response); - if (!p.prompt.empty() || !p.response.empty()) pairs.push_back(std::move(p)); - } - return pairs; -} - -// ─── Inference loop ─────────────────────────────────────────────────────────── - 
-static void run_inference(llama_context * ctx, - const llama_model * model, - const std::vector & pairs, - int max_tokens, - const std::string & output_path, - int save_every) { - const llama_vocab * vocab = llama_model_get_vocab(model); - const bool add_bos = llama_vocab_get_add_bos(vocab); - - llama_batch batch = llama_batch_init(max_tokens, 0, 1); - - for (size_t pi = 0; pi < pairs.size(); ++pi) { - const std::string text = pairs[pi].prompt + "\n" + pairs[pi].response; - - std::vector tokens = common_tokenize(ctx, text, add_bos, true); - if ((int)tokens.size() > max_tokens) tokens.resize(max_tokens); - if (tokens.empty()) continue; - - LOG_INF(" [%zu/%zu] %zu tokens\n", pi + 1, pairs.size(), tokens.size()); - - llama_memory_clear(llama_get_memory(ctx), true); - - common_batch_clear(batch); - for (int i = 0; i < (int)tokens.size(); ++i) { - common_batch_add(batch, tokens[i], i, {0}, false); - } - batch.logits[batch.n_tokens - 1] = true; - - if (llama_decode(ctx, batch) != 0) { - LOG_ERR(" [%zu/%zu] llama_decode failed — skipping\n", pi + 1, pairs.size()); - } - - if (save_every > 0 && (pi + 1) % save_every == 0) { - save_stats(output_path); - } - } - - llama_batch_free(batch); -} - -// ─── CLI ────────────────────────────────────────────────────────────────────── - -int main(int argc, char ** argv) { - std::string model_path; - std::string jsonl_path; - std::string output_path = "expert_stats.json"; - int n_experts = 128; - int ctx_size = 2048; - int n_gpu_layers = 99; - int n_threads = 4; - int save_every = 100; - enum ggml_type kv_type_k = GGML_TYPE_F16; - enum ggml_type kv_type_v = GGML_TYPE_F16; - - auto parse_ggml_type = [](const char * s) -> enum ggml_type { - if (strcmp(s, "f32") == 0) return GGML_TYPE_F32; - if (strcmp(s, "f16") == 0) return GGML_TYPE_F16; - if (strcmp(s, "q8_0") == 0) return GGML_TYPE_Q8_0; - if (strcmp(s, "q4_0") == 0) return GGML_TYPE_Q4_0; - fprintf(stderr, "Unknown KV type '%s', using f16\n", s); return GGML_TYPE_F16; - }; - - for 
(int i = 1; i < argc; ++i) { - std::string a(argv[i]); - auto next = [&]() -> const char * { - if (i + 1 >= argc) { fprintf(stderr, "Missing argument for %s\n", argv[i]); exit(1); } - return argv[++i]; - }; - if (a == "-m" || a == "--model") model_path = next(); - else if (a == "--jsonl") jsonl_path = next(); - else if (a == "--output") output_path = next(); - else if (a == "--n-experts") n_experts = atoi(next()); - else if (a == "--ctx-size" || a == "-c") ctx_size = atoi(next()); - else if (a == "-ngl" || a == "--n-gpu-layers") n_gpu_layers = atoi(next()); - else if (a == "-t" || a == "--threads") n_threads = atoi(next()); - else if (a == "--type-k") kv_type_k = parse_ggml_type(next()); - else if (a == "--type-v") kv_type_v = parse_ggml_type(next()); - else if (a == "--save-every") save_every = atoi(next()); - else if (a == "-h" || a == "--help") { - fprintf(stderr, - "\nUsage: %s -m model.gguf --jsonl data.jsonl [options]\n" - " --output PATH Output JSON (default: expert_stats.json)\n" - " --n-experts N Experts per layer (default: 128)\n" - " --ctx-size N Context length (default: 2048)\n" - " -ngl N GPU layers (default: 99)\n" - " -t N CPU threads (default: 4)\n" - " --type-k/v TYPE KV cache type: f32/f16/q8_0/q4_0 (default: f16)\n" - " --save-every N Checkpoint every N samples (default: 100)\n\n", argv[0]); - return 0; - } else { - fprintf(stderr, "Unknown argument: %s\n", a.c_str()); return 1; - } - } - - if (model_path.empty()) { fprintf(stderr, "Error: -m required\n"); return 1; } - if (jsonl_path.empty()) { fprintf(stderr, "Error: --jsonl required\n"); return 1; } - - g_collector.n_experts = n_experts; - - LOG_INF("expert-profile: model = %s\n", model_path.c_str()); - LOG_INF("expert-profile: jsonl = %s\n", jsonl_path.c_str()); - LOG_INF("expert-profile: output = %s\n", output_path.c_str()); - LOG_INF("expert-profile: n_experts = %d\n", n_experts); - LOG_INF("expert-profile: ctx_size = %d\n", ctx_size); - LOG_INF("expert-profile: ngl = %d\n", n_gpu_layers); 
- LOG_INF("expert-profile: criterion = REAP (gate_weight * ||expert_out||_2)\n"); - - auto pairs = load_jsonl(jsonl_path); - if (pairs.empty()) { LOG_ERR("expert-profile: no pairs loaded\n"); return 1; } - LOG_INF("expert-profile: loaded %zu pairs\n", pairs.size()); - - llama_backend_init(); - - // Suppress INFO/WARN spam (CUDA graph warmup etc.), only pass errors through - llama_log_set([](enum ggml_log_level level, const char * text, void *) { - if (level >= GGML_LOG_LEVEL_ERROR) fputs(text, stderr); - }, nullptr); - - llama_model_params mparams = llama_model_default_params(); - mparams.n_gpu_layers = n_gpu_layers; - - llama_model * model = llama_model_load_from_file(model_path.c_str(), mparams); - if (!model) { LOG_ERR("expert-profile: failed to load model\n"); return 1; } - - llama_context_params cparams = llama_context_default_params(); - cparams.n_ctx = ctx_size; - cparams.n_batch = ctx_size; - cparams.n_ubatch = std::min(ctx_size, 512); - cparams.n_threads = n_threads; - cparams.type_k = kv_type_k; - cparams.type_v = kv_type_v; - cparams.cb_eval = expert_eval_callback; - cparams.cb_eval_user_data = nullptr; - - llama_context * ctx = llama_init_from_model(model, cparams); - if (!ctx) { LOG_ERR("expert-profile: failed to create context\n"); return 1; } - - LOG_INF("expert-profile: running forward passes...\n"); - run_inference(ctx, model, pairs, ctx_size, output_path, save_every); - save_stats(output_path); - - // Summary - LOG_INF("\n MoE layers profiled: %zu\n", g_collector.layer_stats.size()); - for (auto & [il, ls] : g_collector.layer_stats) { - // Find top and bottom REAP expert - int64_t top_e = 0, bot_e = 0; - double top_v = 0.0, bot_v = 1e18; - for (int64_t i = 0; i < ls.n_experts; ++i) { - double v = (ls.activation_counts[i] > 0) ? 
ls.reap_sum[i] / ls.activation_counts[i] : 0.0; - if (v > top_v) { top_v = v; top_e = i; } - if (v < bot_v) { bot_v = v; bot_e = i; } - } - int64_t never = 0; - for (int64_t i = 0; i < ls.n_experts; ++i) - if (ls.activation_counts[i] == 0) ++never; - LOG_INF(" Layer %3d: tokens=%lld never=%lld reap_top=e%lld(%.4f) reap_bot=e%lld(%.4f)\n", - il, (long long)ls.total_tokens, (long long)never, - (long long)top_e, top_v, (long long)bot_e, bot_v); - } - - llama_free(ctx); - llama_model_free(model); - llama_backend_free(); - return 0; -} diff --git a/tools/moe-pruning/README.md b/tools/moe-pruning/README.md deleted file mode 100644 index a88499ac43..0000000000 --- a/tools/moe-pruning/README.md +++ /dev/null @@ -1,97 +0,0 @@ -# MoE Expert Pruning Tools for NemotronH - -REAP-style expert pruning for `NVIDIA-Nemotron-3-Nano-30B-A3B` (and other -NemotronH MoE models), implemented in two complementary ways: - -1. **`tools/expert-profile/`** — C++ profiler built into llama.cpp, collects - REAP scores directly from GGUF inference via the ggml eval callback. -2. **`tools/moe-pruning/`** (this directory) — Python scripts to prune the model - using the collected scores, either on a GGUF file directly or on a - HuggingFace BF16 checkpoint. - ---- - -## Inspiration & Prior Art - -This work is a direct implementation of the **REAP** saliency criterion -introduced in: - -> **REAP the Experts: Why Pruning Prevails for One-Shot MoE Compression** -> Mike Lasby, Ivan Lazarevich, Nish Sinnadurai, Sean Lie, Yani Ioannou, Vithursan Thangarasa -> Cerebras Research, 2025 -> arXiv: https://arxiv.org/abs/2510.13999 -> Code: https://github.com/CerebrasResearch/reap - -The REAP score for expert `j` is (Equation 9 of the paper): - -``` -REAP(j) = mean_{t : j ∈ topk(t)} [ g_j(t) · ‖f_j(t)‖₂ ] -``` - -where `g_j(t)` is the router gate weight and `f_j(t)` is the expert FFN output -(pre-weighting) for token `t`. 
Experts with the lowest REAP score contribute -least to the layer output and are pruned first. - -The original REAP repo targets HuggingFace models via PyTorch hooks on -standard architectures (Qwen3-MoE, Mixtral, DeepSeek-V2, Llama-4, …). - -**What we added / adapted:** - -- `tools/expert-profile/expert-profile.cpp` — llama.cpp C++ implementation - of REAP that intercepts `ffn_moe_topk`, `ffn_moe_weights`, and `ffn_moe_down` - tensors via `ggml_backend_eval_callback`, enabling REAP profiling on any - GGUF-quantised model (Q4_K_M, Q6_K, etc.) without needing full BF16 VRAM. - -- `gguf_prune.py` — prunes the GGUF file **directly**, slicing the expert axis - of the stacked weight tensors (`ffn_up_exps`, `ffn_down_exps`, `ffn_gate_inp`, - `ffn_exp_probs_b`) and patching `{arch}.expert_count` in the metadata. - Quantised blocks are preserved as raw bytes — no dequantise/requantise step. - -- `nemotron_reap.py` — HuggingFace-based alternative: profiles with 4-bit NF4 - on GPU (phase 1) and prunes the BF16 checkpoint on CPU (phase 2). Adds - NemotronH (`NemotronHForCausalLM`) support that the original REAP repo does - not have. - ---- - -## Recommended Workflow (low-VRAM, e.g. RTX 4060 Ti 16 GB) - -``` -┌─────────────────────────────────────────────┐ -│ Phase 1 — Profile (GPU, GGUF Q4, ~15 GB) │ -│ │ -│ llama-expert-profile │ -│ -m nemotron-Q4_K_M.gguf │ -│ --jsonl sample_calibration.jsonl │ -│ --output expert_stats.json │ -│ -ngl 99 --ctx-size 2048 │ -└───────────────────┬─────────────────────────┘ - │ expert_stats.json -┌───────────────────▼─────────────────────────┐ -│ Phase 2 — Prune (CPU, pure Python, ~2 GB) │ -│ │ -│ python gguf_prune.py │ -│ --input nemotron-Q4_K_M.gguf │ -│ --stats expert_stats.json │ -│ --output nemotron-pruned-26e.gguf │ -│ --keep_ratio 0.20 # 26/128 experts │ -└─────────────────────────────────────────────┘ -``` - -At 20 % keep ratio a ~22 GB Q4_K_M becomes ~4.5 GB. 
- ---- - -## Files - -| File | Description | -|---|---| -| `gguf_prune.py` | GGUF-native pruner — no GPU needed, preserves quantisation | -| `nemotron_reap.py` | HF-based pruner — 4-bit GPU profile + CPU BF16 prune | -| `build_expert_profile.sh` | Build script for `llama-expert-profile` | -| `run_nemotron_profile.sh` | Example profiling run | -| `run_prune.sh` | Example pruning run | -| `run_convert_quantize.sh` | Convert HF → GGUF and quantise | -| `analyze_stats.py` | Visualise and compare expert stats JSON files | -| `sample_calibration.jsonl` | Sample calibration data (prompt+response pairs) | -| `expert_stats_reap.json` | Example stats output from expert-profile | diff --git a/tools/moe-pruning/analyze_stats.py b/tools/moe-pruning/analyze_stats.py deleted file mode 100644 index 2e0821f323..0000000000 --- a/tools/moe-pruning/analyze_stats.py +++ /dev/null @@ -1,284 +0,0 @@ -#!/usr/bin/env python3 -""" -analyze_stats.py -- Summarize expert_stats.json and model size projections. -Usage: python analyze_stats.py [stats_file] [--keep 0.5] -""" -import json, statistics, argparse - -parser = argparse.ArgumentParser() -parser.add_argument("stats", nargs="?", default="expert_stats_reap.json") -parser.add_argument("--keep", type=float, default=0.5, help="Fraction of experts to keep (default 0.5)") -args = parser.parse_args() - -with open(args.stats) as f: - data = json.load(f) - -layers = sorted(data.keys(), key=int) -n_layers = len(layers) -keep_ratio = args.keep - -# Detect which scoring field is available (new REAP vs old importance_score) -sample_layer = data[layers[0]] -if "reap" in sample_layer: - score_field = "reap" - score_label = "REAP (gate_weight × ||expert_out||₂)" -elif "importance_score" in sample_layer: - score_field = "importance_score" - score_label = "importance_score (freq × avg_gate_weight) [legacy, no EAN]" -else: - raise ValueError(f"No recognised score field in stats. 
Keys: {list(sample_layer.keys())}") - -# ── Model architecture constants (Nemotron-3-Nano-30B-A3B) ────────────────── -N_EXPERTS = 128 -N_EXPERT_USED = 6 # top-k per token -N_MOE_LAYERS = 23 -N_TOTAL_LAYERS = 53 -# Approximate parameter counts (bf16, billions) -PARAMS_TOTAL_B = 30.0 -PARAMS_MOE_EXPERTS_B = 22.0 # bulk of MoE weight is in expert FFNs -PARAMS_NON_MOE_B = PARAMS_TOTAL_B - PARAMS_MOE_EXPERTS_B - -# ── Header ────────────────────────────────────────────────────────────────── -print("=" * 70) -print(f" Expert Stats Analysis | file: {args.stats}") -print("=" * 70) - -# ── Profiling completeness ─────────────────────────────────────────────────── -sample_tokens = list(data.values())[0]["total_tokens"] -# Each token activates N_EXPERT_USED experts, sum(activation_counts) = total*top_k -# Approximate samples: total_tokens / avg_tokens_per_sample -# We don't know avg, but can infer: total_tokens / (total_tokens / ctx) ≈ ctx chunks -# Better: just report tokens and note the user knows sample count -print(f"\n── Profiling progress ──────────────────────────────────────────────────") -print(f" MoE layers profiled : {n_layers} / {N_MOE_LAYERS}") -print(f" Tokens processed : {sample_tokens:,} (per layer)") -act_sum = sum(data[layers[0]]["activation_counts"]) -assert abs(act_sum / sample_tokens - N_EXPERT_USED) < 0.01, "unexpected top-k" -print(f" top-k confirmed : {N_EXPERT_USED} (sum activations / tokens = {act_sum/sample_tokens:.1f})") - -# ── Per-layer importance score stats ──────────────────────────────────────── -print(f"\n── Per-layer score distribution [{score_label}]") -print(f" {'Layer':>5} {'Min':>9} {'Max':>9} {'Range':>9} {'CV%':>6} {'Never':>5}") -global_cvs = [] -for k in layers: - d = data[k] - s = d[score_field] - mn, mx = min(s), max(s) - cv = statistics.stdev(s) / statistics.mean(s) * 100 - global_cvs.append(cv) - print(f" {k:>5} {mn:>9.5f} {mx:>9.5f} {mx-mn:>9.5f} {cv:>6.3f}% {d['never_activated']:>5}") - -print(f"\n Mean CV across layers : 
{statistics.mean(global_cvs):.3f}%") -print(f" (CV < 1% = near-uniform; load-balancing is working as designed)") - -# ── Capacity loss sweep across pruning levels ──────────────────────────────── -# Paper (observer.py): REAP[i] = mean(ean_norm * softmax_router_weight) over tokens -# routed to expert i, averaged via OnlineStatsTracker weighted by expert_frequency. -# Our implementation (llama.cpp): same formula but routing weights are the top-k -# gate weights (post-softmax within top-k), not the full softmax over all 128. -# Impact: our weights are slightly higher than the paper's (renormalized to top-k -# only), but relative expert ranking within a layer should be preserved. -# -# IMPORTANT CAVEAT for this model (Nemotron-3-Nano-30B-A3B): -# The model was trained with a strong load-balancing auxiliary loss, so all 128 -# experts have nearly identical activation frequency (~4.69%) AND nearly identical -# REAP scores (Gini ~0.015, top/bottom ratio ~1.1-1.35x). The score distribution -# is a smooth monotone curve with NO natural elbow or gap. -# -# This means: -# - REAP ranking beats random pruning by only ~1pp in mass terms at keep=33% -# - The cut point boundary (rank 42 vs 43) has near-zero gap in most layers -# - REAP paper results on Qwen3-30B-A3B likely had higher Gini (less tight -# load-balancing or more expert specialization in pre-training) -# - For this model, actual quality loss must be measured via eval, not predicted -# from REAP score variance -# -# Metrics reported: -# - kept_mass%: REAP mass in the KEPT experts as % of total (> keep_ratio% = good) -# - vs_random%: how much more mass the REAP-selected set retains vs a random set -# of the same size (= kept_mass% - keep_ratio%). Positive = REAP wins. -# - Rel.gap: score gap at cut / layer score range. Near 0 = no natural cut point. -# - Gini: inequality of score distribution. ~0.015 here = near-uniform. 
- -def gini(scores): - """Gini coefficient of a list of non-negative values.""" - n = len(scores) - s = sorted(scores) - total = sum(s) - if total == 0: - return 0.0 - cumsum = 0.0 - for i, v in enumerate(s): - cumsum += (2 * (i + 1) - n - 1) * v - return cumsum / (n * total) - -def layer_stats(scores, n_keep): - """Return capacity metrics for a single layer at a given keep count.""" - n = len(scores) - ranked = sorted(range(n), key=lambda i: scores[i], reverse=True) - total = sum(scores) - kept_mass = sum(scores[i] for i in ranked[:n_keep]) - kept_frac = kept_mass / total if total > 0 else 0.0 # fraction of REAP mass kept - random_frac = n_keep / n # uniform expectation - vs_random = kept_frac - random_frac # positive = REAP beats random - score_range = scores[ranked[0]] - scores[ranked[-1]] - gap = scores[ranked[n_keep - 1]] - (scores[ranked[n_keep]] if n_keep < n else 0) - rel_gap = gap / score_range if score_range > 0 else 0.0 - return kept_frac * 100, vs_random * 100, rel_gap - -# Sweep over a range of keep ratios -sweep_ratios = [0.10, 0.20, 0.25, 0.33, 0.40, 0.50, 0.60, 0.75] -if keep_ratio not in sweep_ratios: - sweep_ratios.append(keep_ratio) -sweep_ratios = sorted(set(sweep_ratios)) - -# Per-layer Gini (fixed, independent of keep ratio) -layer_ginis = {k: gini(data[k][score_field]) for k in layers} -mean_gini = statistics.mean(layer_ginis.values()) -worst_gini_layer = max(layer_ginis, key=lambda k: layer_ginis[k]) - -print(f"\n── Score distribution inequality (Gini coefficient) ────────────────────") -print(f" Gini measures how non-uniform REAP scores are within each layer.") -print(f" Gini=0: all experts identical. 
Gini=1: one expert dominates.") -print(f" With load-balanced MoE, Gini is small — but any Gini > 0 means") -print(f" REAP ranking beats random pruning.") -print(f"") -print(f" {'Layer':>5} {'Gini':>8} {'Score range':>13} {'Max/Min ratio':>14}") -print(f" {'-'*5} {'-'*8} {'-'*13} {'-'*14}") -for k in layers: - s = data[k][score_field] - mn, mx = min(s), max(s) - g = layer_ginis[k] - ratio_mm = mx / mn if mn > 0 else float('inf') - print(f" {k:>5} {g:>8.5f} {mx-mn:>13.5f} {ratio_mm:>13.3f}x") -print(f"") -print(f" Mean Gini : {mean_gini:.5f} (worst layer: {worst_gini_layer})") - -print(f"\n── Capacity retention sweep ─────────────────────────────────────────────") -print(f" Kept mass% = REAP mass in KEPT experts as % of total (higher = better)") -print(f" vs.rand% = Kept mass% minus uniform baseline (keep_ratio%)") -print(f" Positive = REAP beats random. Magnitude = advantage in pp.") -print(f" Rel.gap = score gap at cut / layer score range (higher = cleaner cut)") -print(f" WARNING: near-zero rel.gap and small vs.rand mean eval is the only ground truth.") -print(f"") -print(f" {'Keep':>5} {'Experts':>7} {'Kept mass%':>11} {'vs.rand%':>9} {'Rel.gap avg':>12} {'Worst layer':>11}") -print(f" {'-'*5} {'-'*7} {'-'*11} {'-'*9} {'-'*12} {'-'*11}") - -sweep_results = {} -for ratio in sweep_ratios: - nk = max(1, round(N_EXPERTS * ratio)) - mass_fracs, excesses, rel_gaps = [], [], [] - worst_excess, worst_layer_id = -999.0, None - for k in layers: - scores = data[k][score_field] - mf, exc, rg = layer_stats(scores, nk) - mass_fracs.append(mf) - excesses.append(exc) - rel_gaps.append(rg) - if exc > worst_excess: - worst_excess = exc - worst_layer_id = k - avg_mf = statistics.mean(mass_fracs) - avg_exc = statistics.mean(excesses) - avg_rg = statistics.mean(rel_gaps) - marker = " <--" if abs(ratio - keep_ratio) < 1e-9 else "" - print(f" {ratio:>5.0%} {nk:>7d} {avg_mf:>10.2f}% {avg_exc:>+9.2f}% {avg_rg:>11.4f} layer {worst_layer_id:>3}{marker}") - sweep_results[ratio] = { - 
"n_keep": nk, "avg_kept_mass": avg_mf, "avg_vs_random": avg_exc, - "avg_rel_gap": avg_rg, "worst_layer_id": worst_layer_id, "worst_vs_random": worst_excess, - } - -print(f"") -print(f" vs.rand% quantifies REAP's advantage over random pruning in REAP-mass terms.") -print(f" For this model it is small (+0.7 to +1.5pp) due to tight load-balancing.") -print(f" Rel.gap near zero means scores are smooth with no natural cut — any threshold") -print(f" is as defensible as another. Actual quality delta requires empirical eval.") - -# ── Expert keep/prune detail at selected keep_ratio ────────────────────────── -n_keep = max(1, round(N_EXPERTS * keep_ratio)) -n_prune = N_EXPERTS - n_keep - -print(f"\n── Expert pruning detail at keep_ratio={keep_ratio:.0%} ({n_keep} keep / {n_prune} prune per layer) ──") -print(f" {'Layer':>5} {'Kept mass%':>11} {'vs.rand%':>9} {'Rel.gap':>9} {'Min kept':>10} {'Max pruned':>11}") -print(f" {'-'*5} {'-'*11} {'-'*9} {'-'*9} {'-'*10} {'-'*11}") - -layer_results = {} -for k in layers: - scores = data[k][score_field] - ranked = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True) - mf, exc, rg = layer_stats(scores, n_keep) - min_kept = scores[ranked[n_keep - 1]] - max_pruned = scores[ranked[n_keep]] if n_prune > 0 else 0 - layer_results[k] = {"mass_frac": mf, "excess": exc, "rel_gap": rg, - "min_kept": min_kept, "max_pruned": max_pruned} - print(f" {k:>5} {mf:>10.2f}% {exc:>+9.2f}% {rg:>9.4f} {min_kept:>10.5f} {max_pruned:>11.5f}") - -avg_mf = statistics.mean(r["mass_frac"] for r in layer_results.values()) -avg_exc = statistics.mean(r["excess"] for r in layer_results.values()) -avg_rg = statistics.mean(r["rel_gap"] for r in layer_results.values()) -print(f" {'AVG':>5} {avg_mf:>10.2f}% {avg_exc:>+9.2f}% {avg_rg:>9.4f}") - -# ── Model size projections ─────────────────────────────────────────────────── -print(f"\n── Model size projections ──────────────────────────────────────────────") - -def model_size(keep): - expert_params = 
PARAMS_MOE_EXPERTS_B * keep - return PARAMS_NON_MOE_B + expert_params - -original_b = model_size(1.0) -pruned_b = model_size(keep_ratio) -reduction_pct = (1 - pruned_b / original_b) * 100 - -# GGUF sizes at common quant levels (rough: 1B params ≈ quant_bpw/8 GB) -quants = [("Q8_0", 8.0), ("Q5_K_M", 5.5), ("Q4_K_M", 4.5), ("Q3_K_M", 3.35), ("Q2_K", 2.63)] - -print(f" {'':20} {'Original':>10} {'Pruned':>10} {'Saved':>8}") -print(f" {'Parameters (B)':20} {original_b:>10.1f} {pruned_b:>10.1f} {original_b-pruned_b:>8.1f}B") -print(f" {'Reduction':20} {'':>10} {reduction_pct:>9.1f}%") -print() -print(f" Estimated GGUF sizes:") -print(f" {'Quant':10} {'Original':>10} {'Pruned':>10} {'Fits in':>12}") -for name, bpw in quants: - orig_gb = original_b * bpw / 8 - prune_gb = pruned_b * bpw / 8 - # VRAM fit (16GB GPU) - fits = "16GB GPU" if prune_gb <= 15.5 else ("32GB GPU" if prune_gb <= 31 else "CPU/RAM") - print(f" {name:10} {orig_gb:>9.1f}G {prune_gb:>9.1f}G {fits:>12}") - -# ── Active params per token (inference cost) ───────────────────────────────── -print(f"\n── Inference cost (active params per token) ────────────────────────────") -# Active params = non-moe + (n_expert_used/n_experts_kept * moe_expert_params) -# After pruning: router still picks top-k but from n_keep pool -# Active expert params per token = (N_EXPERT_USED / n_keep) * (PARAMS_MOE_EXPERTS_B * keep_ratio) -# But actually active params = N_EXPERT_USED * (params per single expert) -params_per_expert_orig = PARAMS_MOE_EXPERTS_B / N_EXPERTS # B per expert -params_per_expert_pruned = (PARAMS_MOE_EXPERTS_B * keep_ratio) / n_keep # same, just fewer experts - -active_orig = PARAMS_NON_MOE_B + N_EXPERT_USED * params_per_expert_orig * N_MOE_LAYERS / N_TOTAL_LAYERS -active_pruned = PARAMS_NON_MOE_B + N_EXPERT_USED * params_per_expert_pruned * N_MOE_LAYERS / N_TOTAL_LAYERS - -print(f" Original : {active_orig:.2f}B active params/token (same expert size, more choice)") -print(f" Pruned : {active_pruned:.2f}B active 
params/token (same — top-k still fires {N_EXPERT_USED} experts)") -print(f" Note: active params per token are IDENTICAL — pruning only reduces") -print(f" model file size and memory footprint, not per-token compute.") - -# ── Consistently low-importance experts ────────────────────────────────────── -print(f"\n── Experts consistently ranked low across all layers ───────────────────") -bottom_n = max(1, round(N_EXPERTS * 0.10)) # bottom 10% -low_count = {} -for k in layers: - scores = data[k][score_field] - ranked = sorted(range(len(scores)), key=lambda i: scores[i]) - for eid in ranked[:bottom_n]: - low_count[eid] = low_count.get(eid, 0) + 1 - -consistent = sorted(low_count.items(), key=lambda x: -x[1]) -consistent = [(eid, cnt) for eid, cnt in consistent if cnt >= 3] -print(f" (bottom 10% in >= 3 layers — most dispensable experts globally)") -print(f" Expert ID : layers in bottom 10%") -for eid, cnt in consistent[:20]: - bar = "█" * cnt - print(f" Expert {eid:>3} : {cnt:>2}/{n_layers} {bar}") - -print() -print("=" * 70) diff --git a/tools/moe-pruning/build_expert_profile.sh b/tools/moe-pruning/build_expert_profile.sh deleted file mode 100644 index 0b39604426..0000000000 --- a/tools/moe-pruning/build_expert_profile.sh +++ /dev/null @@ -1,42 +0,0 @@ -#!/usr/bin/env bash -# build_expert_profile.sh -# Builds llama.cpp with the expert-profile tool in WSL2 with CUDA. -# Run this from the tools/moe-pruning/ directory: bash build_expert_profile.sh - -set -e - -LLAMA_SRC="../.." -BUILD_DIR="$LLAMA_SRC/build_expert" - -echo "=== Building llama.cpp + expert-profile tool ===" -echo " Source : $LLAMA_SRC" -echo " Build : $BUILD_DIR" - -mkdir -p "$BUILD_DIR" -cd "$BUILD_DIR" - -# Configure with CUDA -cmake "$LLAMA_SRC" \ - -DCMAKE_BUILD_TYPE=Release \ - -DGGML_CUDA=ON \ - -DLLAMA_CURL=OFF \ - -DLLAMA_BUILD_TESTS=OFF \ - -DLLAMA_BUILD_EXAMPLES=OFF \ - -DCMAKE_CUDA_ARCHITECTURES=86 \ - 2>&1 | tail -20 - -# Build only the expert-profile target (fast) -cmake --build . 
--target llama-expert-profile --config Release -j$(nproc) - -echo "" -echo "=== Build complete ===" -echo " Binary: $BUILD_DIR/tools/expert-profile/llama-expert-profile" -echo "" -echo "=== Usage ===" -echo " $BUILD_DIR/tools/expert-profile/llama-expert-profile \\" -echo " -m ~/nemotron-3-nano-30b-Q4_K_M.gguf \\" -echo " --jsonl ./sample_calibration.jsonl \\" -echo " --output ./expert_stats_reap.json \\" -echo " --n-experts 128 \\" -echo " --ctx-size 16384 \\" -echo " -ngl 99" diff --git a/tools/moe-pruning/extract_ppl.py b/tools/moe-pruning/extract_ppl.py deleted file mode 100644 index 972a32e99d..0000000000 --- a/tools/moe-pruning/extract_ppl.py +++ /dev/null @@ -1,41 +0,0 @@ -import json, os - -base = os.path.dirname(os.path.abspath(__file__)) - -lines = open(os.path.join(base, 'rwsft-training-data.jsonl'), encoding='utf-8').readlines() -split = int(len(lines) * 0.95) - -train_lines = lines[:split] -val_lines = lines[split:] - -train_out = os.path.join(base, 'ppl-eval-train.txt') -val_out = os.path.join(base, 'ppl-eval-val.txt') - -def fmt(s): - # Full prompt+response so the model is conditioned correctly. - # llama-perplexity scores all tokens, but the prompt PPL is identical - # for base vs adapter — the delta is driven by the response tokens. 
- prompt = s.get('prompt', '').strip() - response = s.get('response', '').strip() - if not response: - return None - if prompt: - return prompt + '\n' + response - return response - -with open(train_out, 'w', encoding='utf-8') as f: - for line in train_lines: - text = fmt(json.loads(line)) - if text: - f.write(text + '\n\n') - -with open(val_out, 'w', encoding='utf-8') as f: - for line in val_lines: - text = fmt(json.loads(line)) - if text: - f.write(text + '\n\n') - -train_chars = len(open(train_out, encoding='utf-8').read()) -val_chars = len(open(val_out, encoding='utf-8').read()) -print(f'train: {len(train_lines)} samples, {train_chars:,} chars -> ppl-eval-train.txt') -print(f'val: {len(val_lines)} samples, {val_chars:,} chars -> ppl-eval-val.txt') diff --git a/tools/moe-pruning/gguf_prune.py b/tools/moe-pruning/gguf_prune.py deleted file mode 100644 index df3e638ab4..0000000000 --- a/tools/moe-pruning/gguf_prune.py +++ /dev/null @@ -1,260 +0,0 @@ -""" -gguf-prune: REAP-based expert pruning directly on a GGUF file. - -Slices the expert dimension of the four stacked MoE weight tensors per layer: - blk.{il}.ffn_up_exps [n_embd, intermediate, n_experts] - blk.{il}.ffn_down_exps [intermediate, n_embd, n_experts] - blk.{il}.ffn_gate_inp [n_embd, n_experts] - blk.{il}.ffn_exp_probs_b [n_experts] (score-correction bias, if present) - -Quantized blocks (Q4_K, Q6_K, …) are preserved as raw bytes — slicing the -expert axis (last dim) is safe because each expert is independently quantised -in ggml, so dropping experts = dropping whole quantisation blocks. 
- -Metadata patched: - {arch}.expert_count → keep_n - (expert_used_count = top-k routing k, NOT touched) - -Usage: - # keep top 20% of experts (26/128) per MoE layer - python gguf_prune.py \\ - --input nemotron.gguf \\ - --stats expert_stats.json \\ - --output nemotron-pruned.gguf \\ - --keep_ratio 0.20 - - # or keep an absolute number - python gguf_prune.py \\ - --input nemotron.gguf \\ - --stats expert_stats.json \\ - --output nemotron-pruned.gguf \\ - --keep_n 32 -""" - -from __future__ import annotations - -import argparse -import json -import re -from pathlib import Path - -import numpy as np -from gguf import GGUFReader, GGUFWriter, GGUFValueType - - -# ── Constants ───────────────────────────────────────────────────────────────── - -# Base tensor names that carry the expert dimension (last axis in ggml layout). -# Some GGUFs append parameter tails like ".weight" / ".bias". -EXPERT_BASE_SUFFIXES = { - "ffn_up_exps", - "ffn_down_exps", - "ffn_gate_inp", -} - - -def is_expert_suffix(suffix: str) -> bool: - """Return True if a tensor suffix is one of the MoE expert tensors to prune.""" - if suffix in ("ffn_exp_probs_b", "exp_probs_b", "exp_probs_b.bias"): - return True - return any(suffix == base or suffix.startswith(base + ".") for base in EXPERT_BASE_SUFFIXES) - - -# ── Helpers ─────────────────────────────────────────────────────────────────── - -def layer_and_suffix(name: str) -> tuple[int, str] | tuple[None, None]: - m = re.match(r"blk\.(\d+)\.(.+)$", name) - if m: - return int(m.group(1)), m.group(2) - return None, None - - -def pick_experts(layer_stats: dict, keep_n: int) -> list[int]: - """ - Return sorted indices of the top `keep_n` experts by REAP score. - Falls back to 'importance_score' (weighted frequency) if 'reap' absent. 
- """ - if "reap" in layer_stats: - scores = np.array(layer_stats["reap"], dtype=np.float64) - elif "importance_score" in layer_stats: - scores = np.array(layer_stats["importance_score"], dtype=np.float64) - else: - raise KeyError( - "Layer stats has neither 'reap' nor 'importance_score'. " - "Run expert-profile / nemotron_reap.py profile first." - ) - return sorted(np.argsort(scores)[-keep_n:].tolist()) - - -def slice_expert_axis(data: np.ndarray, keep: list[int]) -> np.ndarray: - """ - Slice the expert axis of reader tensor data keeping only `keep` indices. - - GGUFReader reshapes tensors to NumPy with reversed ggml dims, so for MoE - tensors where experts are the last ggml dim, expert is axis 0 in `data`. - This also preserves quantized row-byte alignment (axis -1 is byte-packed - rows for quantized tensors and must not be sliced for expert pruning). - """ - return np.take(data, keep, axis=0) - - -def copy_field(writer: GGUFWriter, field, reader: GGUFReader) -> bool: - """Copy a single metadata field to writer. Returns False if skipped.""" - key = field.name - val_type = field.types[0] - part = field.parts[-1] - - if val_type == GGUFValueType.STRING: - # Preserve raw bytes: GGUF metadata can contain non-UTF8 strings. 
- writer.add_key_value(key, bytes(part), GGUFValueType.STRING) - elif val_type == GGUFValueType.UINT8: - writer.add_uint8(key, int(part[0])) - elif val_type == GGUFValueType.INT8: - writer.add_int8(key, int(part[0])) - elif val_type == GGUFValueType.UINT16: - writer.add_uint16(key, int(part[0])) - elif val_type == GGUFValueType.INT16: - writer.add_int16(key, int(part[0])) - elif val_type == GGUFValueType.UINT32: - writer.add_uint32(key, int(part[0])) - elif val_type == GGUFValueType.INT32: - writer.add_int32(key, int(part[0])) - elif val_type == GGUFValueType.FLOAT32: - writer.add_float32(key, float(part[0])) - elif val_type == GGUFValueType.UINT64: - writer.add_uint64(key, int(part[0])) - elif val_type == GGUFValueType.INT64: - writer.add_int64(key, int(part[0])) - elif val_type == GGUFValueType.FLOAT64: - writer.add_float64(key, float(part[0])) - elif val_type == GGUFValueType.BOOL: - writer.add_bool(key, bool(part[0])) - elif val_type == GGUFValueType.ARRAY: - elem_type = field.types[1] - if elem_type == GGUFValueType.STRING: - # ReaderField.data stores indices of ARRAY payload items; for - # STRING arrays this points at each string byte payload. - vals = [bytes(field.parts[idx]) for idx in field.data] - writer.add_key_value(key, vals, GGUFValueType.ARRAY, sub_type=GGUFValueType.STRING) - else: - # ReaderField.data stores part-indices, not payload values. 
- vals = field.contents() - if not isinstance(vals, list): - print(f" WARNING: skipping array field {key!r} (unexpected non-list contents)") - return False - writer.add_array(key, vals) - else: - print(f" WARNING: skipping field {key!r} (unsupported type {val_type})") - return False - return True - - -# ── Main ────────────────────────────────────────────────────────────────────── - -def main(): - ap = argparse.ArgumentParser(description="REAP expert pruning on a GGUF file") - ap.add_argument("--input", required=True, help="Input .gguf path") - ap.add_argument("--stats", required=True, help="expert_stats.json from expert-profile") - ap.add_argument("--output", required=True, help="Output .gguf path") - ap.add_argument("--keep_ratio", type=float, default=None, help="Fraction to keep, e.g. 0.20") - ap.add_argument("--keep_n", type=int, default=None, help="Absolute count to keep, e.g. 32") - ap.add_argument("--n_experts", type=int, default=128, help="Experts per MoE layer in source model") - args = ap.parse_args() - - if args.keep_ratio is None and args.keep_n is None: - ap.error("Provide --keep_ratio or --keep_n") - if args.keep_ratio is not None and args.keep_n is not None: - ap.error("Provide --keep_ratio OR --keep_n, not both") - - keep_n = args.keep_n if args.keep_n is not None else max(1, int(args.n_experts * args.keep_ratio)) - print(f"[gguf-prune] keeping {keep_n}/{args.n_experts} experts per MoE layer") - - # ── Load stats ───────────────────────────────────────────────────────────── - with open(args.stats) as f: - stats = {int(k): v for k, v in json.load(f).items()} - print(f"[gguf-prune] stats loaded for {len(stats)} MoE layers") - - # ── Open source GGUF ─────────────────────────────────────────────────────── - print(f"[gguf-prune] reading {args.input}") - reader = GGUFReader(args.input, mode="r") - - arch_field = reader.get_field("general.architecture") - arch = str(bytes(arch_field.parts[-1]), "utf-8") if arch_field else "nemotron_h_moe" - 
print(f"[gguf-prune] arch {arch}") - - expert_count_key = f"{arch}.expert_count" - - # ── Compute kept indices per layer ───────────────────────────────────────── - kept: dict[int, list[int]] = {} - for tensor in reader.tensors: - il, suffix = layer_and_suffix(tensor.name) - if il is None or suffix is None or not is_expert_suffix(suffix): - continue - if il in kept: - continue # already computed for this layer - if il not in stats: - print(f" Layer {il:3d}: no stats — keeping ALL {args.n_experts} experts") - kept[il] = list(range(args.n_experts)) - else: - kept[il] = pick_experts(stats[il], keep_n) - never = stats[il].get("never_activated", "?") - crit = "reap" if "reap" in stats[il] else "importance_score" - print(f" Layer {il:3d}: keep {kept[il][:4]}… never_activated={never} criterion={crit}") - - # ── Build output GGUF ────────────────────────────────────────────────────── - print(f"\n[gguf-prune] writing {args.output}") - writer = GGUFWriter(args.output, arch=arch) - - # --- metadata: copy all fields, replace expert_count --- - for field in reader.fields.values(): - # Reader exposes synthetic header fields (GGUF.*) that are not KV - # metadata and must not be copied back as normal keys. - if field.name.startswith("GGUF."): - continue - # Writer already sets general.architecture from ctor; avoid duplicate warning. 
- if field.name in (expert_count_key, "general.architecture"): - continue # replaced below - copy_field(writer, field, reader) - - writer.add_expert_count(keep_n) - print(f"[gguf-prune] patched {expert_count_key} → {keep_n}") - - # --- tensors --- - n_pruned = 0 - for tensor in reader.tensors: - il, suffix = layer_and_suffix(tensor.name) - is_expert = il is not None and suffix is not None and is_expert_suffix(suffix) - - if is_expert: - assert il is not None - k = kept[il] - data = slice_expert_axis(tensor.data, k) - writer.add_tensor( - tensor.name, - data, - raw_dtype=tensor.tensor_type, - ) - n_pruned += 1 - else: - writer.add_tensor( - tensor.name, - tensor.data, - raw_dtype=tensor.tensor_type, - ) - - writer.write_header_to_file() - writer.write_kv_data_to_file() - writer.write_tensors_to_file(progress=True) - writer.close() - - out = Path(args.output) - size_gb = out.stat().st_size / 1024**3 - print(f"\n[gguf-prune] done") - print(f" Expert tensors sliced : {n_pruned}") - print(f" MoE layers pruned : {len(kept)}") - print(f" Experts per layer : {keep_n}/{args.n_experts}") - print(f" Output size : {size_gb:.2f} GB → {out}") - - -if __name__ == "__main__": - main() diff --git a/tools/moe-pruning/requirements.txt b/tools/moe-pruning/requirements.txt deleted file mode 100644 index 6b21f239f4..0000000000 --- a/tools/moe-pruning/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -gguf diff --git a/tools/moe-pruning/sample_calibration.jsonl b/tools/moe-pruning/sample_calibration.jsonl deleted file mode 100644 index 44e2627f97..0000000000 --- a/tools/moe-pruning/sample_calibration.jsonl +++ /dev/null @@ -1,8 +0,0 @@ -{"prompt": "What is the difference between a transformer and an RNN?", "response": "Transformers process all tokens in parallel using self-attention, while RNNs process tokens sequentially and maintain a hidden state. 
Transformers scale better and handle long-range dependencies more effectively, but require more memory for the attention matrix."} -{"prompt": "Explain the concept of weight quantization in neural networks.", "response": "Weight quantization reduces model size and inference latency by representing parameters in lower precision formats such as INT8 or INT4 instead of FP32. This trades a small amount of accuracy for significant gains in memory efficiency and compute throughput."} -{"prompt": "What is mixture of experts (MoE)?", "response": "MoE is a neural network architecture where the feed-forward layers are replaced by multiple expert sub-networks and a routing mechanism. For each token, only a small subset of experts is activated, allowing the model to have a large parameter count while keeping inference compute constant."} -{"prompt": "Describe the attention mechanism.", "response": "Attention computes a weighted sum of value vectors, where the weights are determined by the compatibility between query and key vectors. Scaled dot-product attention computes scores as Q*K^T/sqrt(d_k), applies softmax to get weights, then multiplies by V."} -{"prompt": "What is GGUF and how does it differ from GGML?", "response": "GGUF is the successor to the GGML file format for storing quantized models. It supports arbitrary key-value metadata, is extensible without breaking backward compatibility, and encodes tensor names and shapes explicitly, making it more robust than the original GGML format."} -{"prompt": "How does LoRA work?", "response": "LoRA (Low-Rank Adaptation) injects trainable rank-decomposition matrices A and B into frozen weight layers. The adapted weight is W + alpha/r * B*A. Since rank r is much smaller than the weight dimensions, only a tiny fraction of parameters are trained."} -{"prompt": "What is perplexity in language modeling?", "response": "Perplexity measures how well a language model predicts a sample text. 
It is the exponentiated average negative log-likelihood per token: PPL = exp(-1/N * sum log P(token_i)). Lower perplexity indicates a better fit to the data."} -{"prompt": "Explain rotary position embeddings (RoPE).", "response": "RoPE encodes position by rotating query and key vectors in 2D subspaces using a position-dependent rotation matrix. This makes the dot product between Q and K depend only on their relative position, enabling the model to generalise to sequence lengths longer than those seen during training."}