From 84cab59ec611f28ad413a05dd8776c74bd9635c1 Mon Sep 17 00:00:00 2001 From: Salvatore Rossitto Date: Wed, 11 Mar 2026 14:50:05 +0100 Subject: [PATCH 01/15] added qlora finetuning --- common/arg.cpp | 90 +- common/common.h | 22 +- examples/CMakeLists.txt | 1 + examples/qlora_training/CMakeLists.txt | 5 + examples/qlora_training/README.md | 331 +++++ examples/qlora_training/check_lora_norms.py | 64 + examples/qlora_training/finetune_qlora.cpp | 1217 +++++++++++++++++ examples/qlora_training/grpo_example.py | 397 ++++++ examples/qlora_training/sample_data.jsonl | 7 + .../qlora_training/sample_rwsft_data.jsonl | 6 + ggml/include/ggml-opt.h | 1 + ggml/include/ggml.h | 16 + ggml/src/ggml-cpu/ops.cpp | 9 +- ggml/src/ggml-cuda/ggml-cuda.cu | 9 + ggml/src/ggml-cuda/opt-step-adamw.cu | 5 +- ggml/src/ggml-cuda/out-prod.cu | 114 ++ ggml/src/ggml-cuda/out-prod.cuh | 7 + ggml/src/ggml-opt.cpp | 16 +- ggml/src/ggml.c | 102 +- 19 files changed, 2403 insertions(+), 16 deletions(-) create mode 100644 examples/qlora_training/CMakeLists.txt create mode 100644 examples/qlora_training/README.md create mode 100644 examples/qlora_training/check_lora_norms.py create mode 100644 examples/qlora_training/finetune_qlora.cpp create mode 100644 examples/qlora_training/grpo_example.py create mode 100644 examples/qlora_training/sample_data.jsonl create mode 100644 examples/qlora_training/sample_rwsft_data.jsonl diff --git a/common/arg.cpp b/common/arg.cpp index 41da8563d6..44d155971b 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -3601,32 +3601,108 @@ common_params_context common_params_parser_init(common_params & params, llama_ex { "-lr", "--learning-rate" }, "ALPHA", string_format("adamw or sgd optimizer alpha (default: %.2g); note: sgd alpha recommended ~10x (no momentum)", (double) params.lr.lr0), [](common_params & params, const std::string & value) { params.lr.lr0 = std::stof(value); } - ).set_examples({ LLAMA_EXAMPLE_FINETUNE })); + ).set_examples({ LLAMA_EXAMPLE_FINETUNE, 
LLAMA_EXAMPLE_FINETUNE_QLORA })); add_opt(common_arg({ "-lr-min", "--learning-rate-min" }, "ALPHA", string_format("(if >0) final learning rate after decay (if -decay-epochs is set, default=%.2g)", (double) params.lr.lr_min), [](common_params & params, const std::string & value) { params.lr.lr_min = std::stof(value); } - ).set_examples({ LLAMA_EXAMPLE_FINETUNE })); + ).set_examples({ LLAMA_EXAMPLE_FINETUNE, LLAMA_EXAMPLE_FINETUNE_QLORA })); add_opt(common_arg( {"-decay-epochs", "--learning-rate-decay-epochs"}, "ALPHA", string_format("(if >0) decay learning rate to -lr-min after this many epochs (exponential decay, default=%.2g)", (double) params.lr.decay_epochs), [](common_params & params, const std::string & value) { params.lr.decay_epochs = std::stof(value); } - ).set_examples({ LLAMA_EXAMPLE_FINETUNE })); + ).set_examples({ LLAMA_EXAMPLE_FINETUNE, LLAMA_EXAMPLE_FINETUNE_QLORA })); add_opt(common_arg( {"-wd", "--weight-decay"}, "WD", string_format("adamw or sgd optimizer weight decay (0 is off; recommend very small e.g. 
1e-9) (default: %.2g).", (double) params.lr.wd), [](common_params & params, const std::string & value) { params.lr.wd = std::stof(value); } - ).set_examples({ LLAMA_EXAMPLE_FINETUNE })); + ).set_examples({ LLAMA_EXAMPLE_FINETUNE, LLAMA_EXAMPLE_FINETUNE_QLORA })); add_opt(common_arg( {"-val-split", "--val-split"}, "FRACTION", string_format("fraction of data to use as validation set for training (default: %.2g).", (double) params.val_split), [](common_params & params, const std::string & value) { params.val_split = std::stof(value); } - ).set_examples({ LLAMA_EXAMPLE_FINETUNE })); + ).set_examples({ LLAMA_EXAMPLE_FINETUNE, LLAMA_EXAMPLE_FINETUNE_QLORA })); + // qlora flags + add_opt(common_arg( + {"--lora-rank"}, "N", + string_format("LoRA rank r (default: %d)", params.lora_rank), + [](common_params & params, int value) { params.lora_rank = value; } + ).set_examples({ LLAMA_EXAMPLE_FINETUNE_QLORA })); + add_opt(common_arg( + {"--lora-alpha"}, "F", + string_format("LoRA alpha (default: %d = use rank value)", (int) params.lora_alpha), + [](common_params & params, const std::string & value) { params.lora_alpha = std::stof(value); } + ).set_examples({ LLAMA_EXAMPLE_FINETUNE_QLORA })); + add_opt(common_arg( + {"--lora-targets"}, "SUBSTRINGS", + string_format("comma-separated substrings of tensor names to add LoRA to (default: %s)", params.lora_targets.c_str()), + [](common_params & params, const std::string & value) { params.lora_targets = value; } + ).set_examples({ LLAMA_EXAMPLE_FINETUNE_QLORA })); + add_opt(common_arg( + {"--lora-out"}, "FNAME", + string_format("output LoRA adapter GGUF path (default: %s)", params.lora_out.c_str()), + [](common_params & params, const std::string & value) { params.lora_out = value; } + ).set_examples({ LLAMA_EXAMPLE_FINETUNE_QLORA })); + add_opt(common_arg( + {"--train-file"}, "FNAME", + "JSONL training dataset (fields: messages|prompt+response|text)", + [](common_params & params, const std::string & value) { params.train_file = value; 
} + ).set_examples({ LLAMA_EXAMPLE_FINETUNE_QLORA })); + add_opt(common_arg( + {"--save-every"}, "N", + "save adapter checkpoint every N dataset windows during training (default: 0 = only at end)", + [](common_params & params, int value) { params.save_every = value; } + ).set_examples({ LLAMA_EXAMPLE_FINETUNE_QLORA })); + add_opt(common_arg( + {"--freeze-layers"}, "N", + "freeze first N transformer layers — no LoRA adapters allocated for blk.0..blk.N-1 (default: 0 = train all layers)", + [](common_params & params, int value) { params.lora_freeze_layers = value; } + ).set_examples({ LLAMA_EXAMPLE_FINETUNE_QLORA })); + add_opt(common_arg( + {"--grad-checkpoint"}, "N", + "gradient checkpointing interval to reduce peak activation VRAM (0 = disabled, default: 0)", + [](common_params & params, int value) { params.grad_checkpoint_interval = value; } + ).set_examples({ LLAMA_EXAMPLE_FINETUNE_QLORA })); + add_opt(common_arg( + {"--train-on-prompt"}, + "compute loss on prompt tokens too, not just the response (default: response-only loss)", + [](common_params & params) { params.train_on_prompt = true; } + ).set_examples({ LLAMA_EXAMPLE_FINETUNE_QLORA })); + add_opt(common_arg( + {"--shuffle-dataset"}, + "shuffle dataset windows at the start of each epoch (default: sequential order)", + [](common_params & params) { params.shuffle_dataset = true; } + ).set_examples({ LLAMA_EXAMPLE_FINETUNE_QLORA })); + add_opt(common_arg( + {"--grpo-mode"}, + "enable GRPO IPC training loop (prompts and rewards supplied via stdin/stdout)", + [](common_params & params) { params.grpo_mode = true; } + ).set_examples({ LLAMA_EXAMPLE_FINETUNE_QLORA })); + add_opt(common_arg( + {"--n-gen"}, "N", + string_format("GRPO: number of generations per prompt (default: %d)", params.grpo_n_gen), + [](common_params & params, int value) { params.grpo_n_gen = value; } + ).set_examples({ LLAMA_EXAMPLE_FINETUNE_QLORA })); + add_opt(common_arg( + {"--n-steps"}, "N", + string_format("GRPO: total optimizer steps 
(default: %d)", params.grpo_n_steps), + [](common_params & params, int value) { params.grpo_n_steps = value; } + ).set_examples({ LLAMA_EXAMPLE_FINETUNE_QLORA })); + add_opt(common_arg( + {"--grpo-temp"}, "F", + string_format("GRPO: sampling temperature for rollout generation (default: %.2f)", (double) params.grpo_temperature), + [](common_params & params, const std::string & value) { params.grpo_temperature = std::stof(value); } + ).set_examples({ LLAMA_EXAMPLE_FINETUNE_QLORA })); + add_opt(common_arg( + {"--grpo-max-tokens"}, "N", + string_format("GRPO: max tokens per generation (default: %d)", params.grpo_max_tokens), + [](common_params & params, int value) { params.grpo_max_tokens = value; } + ).set_examples({ LLAMA_EXAMPLE_FINETUNE_QLORA })); add_opt(common_arg( {"-epochs", "--epochs"}, "N", string_format("optimizer max # of epochs (default: %d)", params.lr.epochs), [](common_params & params, int epochs) { params.lr.epochs = epochs; } - ).set_examples({ LLAMA_EXAMPLE_FINETUNE })); + ).set_examples({ LLAMA_EXAMPLE_FINETUNE, LLAMA_EXAMPLE_FINETUNE_QLORA })); add_opt(common_arg( {"-opt", "--optimizer"}, "sgd|adamw", "adamw or sgd", [](common_params & params, const std::string & name) { @@ -3635,7 +3711,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex throw std::invalid_argument("invalid --optimizer, valid options: adamw, sgd"); } } - ).set_examples({ LLAMA_EXAMPLE_FINETUNE })); + ).set_examples({ LLAMA_EXAMPLE_FINETUNE, LLAMA_EXAMPLE_FINETUNE_QLORA })); add_opt(common_arg( {"--check"}, string_format("check rather than generate results (default: %s)", params.check ? 
"true" : "false"), diff --git a/common/common.h b/common/common.h index ffaeefd7c9..0655527ad0 100644 --- a/common/common.h +++ b/common/common.h @@ -103,6 +103,7 @@ enum llama_example { LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_DIFFUSION, LLAMA_EXAMPLE_FINETUNE, + LLAMA_EXAMPLE_FINETUNE_QLORA, LLAMA_EXAMPLE_FIT_PARAMS, LLAMA_EXAMPLE_RESULTS, @@ -517,7 +518,26 @@ struct common_params { // finetune struct lr_opt lr; enum ggml_opt_optimizer_type optimizer = GGML_OPT_OPTIMIZER_TYPE_ADAMW; - float val_split = 0.05f; // fraction of the data used for the validation set + float val_split = 0.05f; // fraction of the data used for the validation set + + // qlora fine-tuning + int32_t lora_rank = 16; // LoRA rank (r) + float lora_alpha = 0.0f; // LoRA alpha (0 = use rank value) + std::string lora_targets = "attn_q,attn_output,ffn_gate,ffn_up,ffn_down"; // comma-separated substrings to match trainable tensors + std::string lora_out = "adapter.gguf"; // output adapter GGUF path + std::string train_file = ""; // JSONL training dataset path + int32_t save_every = 0; // save checkpoint every N optimizer steps (0 = disabled) + int32_t lora_freeze_layers = 0; // do not apply LoRA to the first N transformer layers + int32_t grad_checkpoint_interval = 0; // gradient checkpointing interval to reduce peak VRAM (0 = disabled) + bool train_on_prompt = false; // include prompt tokens in training loss (default: response tokens only) + bool shuffle_dataset = false; // shuffle dataset windows at the start of each epoch + + // grpo training + bool grpo_mode = false; // enable GRPO IPC training loop + int32_t grpo_n_gen = 8; // generations per prompt + int32_t grpo_n_steps = 500; // total GRPO optimizer steps + float grpo_temperature = 0.8f; // sampling temperature for rollouts + int32_t grpo_max_tokens = 512; // max tokens per generation // embedding bool embedding = false; // get only sentence embedding diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index a29dc707c3..93f72fc321 100644 
--- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -34,6 +34,7 @@ else() add_subdirectory(speculative-simple) add_subdirectory(gen-docs) add_subdirectory(training) + add_subdirectory(qlora_training) add_subdirectory(diffusion) if (NOT GGML_BACKEND_DL) add_subdirectory(convert-llama2c-to-ggml) diff --git a/examples/qlora_training/CMakeLists.txt b/examples/qlora_training/CMakeLists.txt new file mode 100644 index 0000000000..5ba9ec8afa --- /dev/null +++ b/examples/qlora_training/CMakeLists.txt @@ -0,0 +1,5 @@ +set(TARGET llama-finetune-qlora) +add_executable(${TARGET} finetune_qlora.cpp) +install(TARGETS ${TARGET} RUNTIME) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/qlora_training/README.md b/examples/qlora_training/README.md new file mode 100644 index 0000000000..bc3b6fcca9 --- /dev/null +++ b/examples/qlora_training/README.md @@ -0,0 +1,331 @@ +# llama.cpp — Native QLoRA Training + +Native QLoRA + Reward-Weighted SFT training pipeline for quantized GGUF models. + +The base model weights remain **frozen** (quantized tensors are skipped by `llama_set_param` because they are not `GGML_TYPE_F32`). Only freshly-allocated F32 LoRA A/B tensors are trained. The saved adapter GGUF is directly compatible with the existing `llama_adapter_lora_init` loader and `llama-export-lora` merge tool. + +**Status:** Working. Phase 1 (QLoRA SFT) and Phase 2 (Reward-Weighted SFT) are implemented and functional. Training speed is currently limited by full backprop through quantized weights — see [Known Limitations](#known-limitations). 
+ +--- + +## Build + +```bash +cd /mnt/w/llm-trading-arena/unsloth-api/llama.cpp + +# First time (CUDA build): +cmake -B build -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=ON -DLLAMA_CURL=OFF +cmake --build build -j$(nproc) + +# Incremental rebuild (after code changes): +cmake --build build --target llama-finetune-qlora -j$(nproc) +# If llama-adapter.cpp or llama-context.cpp changed, rebuild all: +cmake --build build -j$(nproc) +``` + +--- + +## Phase 1 — QLoRA SFT (`llama-finetune-qlora`) + +Trains LoRA adapters on a quantized GGUF model. + +### Recommended command (1.7B model, 16 GB card) + +```bash +./build/bin/llama-finetune-qlora \ + --model ~/qwen3-1.7b-q4_k_m.gguf \ + --train-file data/train.jsonl \ + --lora-rank 16 --lora-alpha 16 \ + -c 4096 -b 4096 -ub 512 \ + --save-every 10 \ + --lora-out ~/adapter.gguf \ + --epochs 3 --seed 42 +``` + +### Recommended command (15B model, 16 GB card, partial offload) + +```bash +./build/bin/llama-finetune-qlora \ + --model ~/nemotron-15b-q4_k_m.gguf \ + --train-file data/train.jsonl \ + --lora-rank 16 --lora-alpha 16 \ + -ngl 13 -c 14336 -b 14336 -ub 1024 \ + --save-every 8 \ + --lora-out ~/nemotron-lora.gguf \ + --epochs 3 --seed 42 +``` + +### All flags + +| Flag | Default | Description | +|---|---|---| +| `--model` | *(required)* | Path to quantized GGUF model | +| `--train-file` | *(required)* | JSONL training dataset | +| `--lora-rank` | `16` | LoRA rank r | +| `--lora-alpha` | `0` (= rank) | LoRA alpha; effective scale = alpha/rank | +| `--lora-targets` | see below | Comma-separated internal tensor name substrings | +| `--lora-out` | `adapter.gguf` | Output adapter GGUF path (supports `~`) | +| `--save-every` | `0` | Save checkpoint every N dataset windows (0 = end only) | +| `--freeze-layers` | `0` | Skip LoRA on first N transformer layers (blk.0..N-1); backward already pruned automatically | +| `--grad-checkpoint` | `0` | Mark every Nth forward node persistent to reduce activation VRAM; good values: 32–64 | +| 
`--train-on-prompt` | off | Compute loss on prompt tokens too (default: response-only loss) | +| `--shuffle-dataset` | off | Shuffle dataset windows at the start of each epoch | +| `--val-split` | `0.0` | Fraction of data to hold out for validation (e.g. `0.1` = 10%); val loss logged per epoch | +| `-epochs` / `--epochs` | `3` | Training epochs | +| `-c` / `--ctx-size` | `512` | Training context window (tokens) | +| `-b` / `--batch-size` | `2048` | Tokens per `llama_decode` call; set equal to `-c` | +| `-ub` / `--ubatch-size` | `512` | GPU micro-batch tokens; controls VRAM vs. step time | +| `-ngl` | `999` | GPU layers to offload | +| `-lr` / `--learning-rate` | `1e-4` | AdamW learning rate | +| `--seed` | `42` | Random seed for LoRA init | + +### VRAM vs. step-time tradeoff + +Step time and VRAM both scale linearly with `-ub`: + +| Model | `-ub` | VRAM | Step time (approx) | +|---|---|---|---| +| 1.7B Q4_K_M | 512 | ~18 GB | ~120 s (OOM on 16 GB) | +| 1.7B Q4_K_M | 128 | ~6 GB | ~30 s | +| 15B Q4_K_M | 1024 | ~11 GB | ~60 s | + +Use `-c` equal to your target sequence length. More context = more windows per sample = more steps per epoch. Reducing `-c` reduces total training time proportionally. 
+ +### Default LoRA targets + +llama.cpp uses **internal GGUF tensor names**, not HuggingFace names: + +| llama.cpp internal | HuggingFace equivalent | Status | +|---|---|---| +| `attn_q` | `q_proj` | ✅ default target, trainable | +| `attn_output` | `o_proj` | ✅ default target, trainable | +| `ffn_gate` | `gate_proj` | ✅ default target, trainable | +| `ffn_up` | `up_proj` | ✅ default target, trainable | +| `ffn_down` | `down_proj` | ✅ default target, trainable | +| `attn_k` | `k_proj` | ❌ not in defaults — zero gradient (KV scatter via SET_ROWS) | +| `attn_v` | `v_proj` | ❌ not in defaults — zero gradient (KV scatter via SET_ROWS) | +| `ssm_in` | `in_proj` | ❌ not in defaults — zero gradient (SSM_SCAN no backward) | +| `ssm_out` | `out_proj` | ❌ not in defaults — zero gradient (SSM_SCAN no backward) | + +**MoE models:** Expert tensors (`*_exps`) are excluded regardless of `--lora-targets`. The quantized expert weights are frozen (stop-gradient), but LoRA on the dense FFN layers (`ffn_gate`, `ffn_up`, `ffn_down`) works — backward via `MUL_MAT_ID` + `OUT_PROD_ID`. + +### Dataset format (JSONL) + +**Chat format** (loss on response only; use `--train-on-prompt` for all tokens): +```json +{"messages": [{"role": "user", "content": "Hello"}, {"role": "assistant", "content": "Hi!"}]} +``` + +**Prompt/response** (loss on response only): +```json +{"prompt": "What is the capital of France?", "response": "Paris."} +``` + +**Plain text** (loss on all tokens): +```json +{"text": "The quick brown fox."} +``` + +**With reward** (Phase 2 — scales gradient by reward): +```json +{"prompt": "...", "response": "...", "reward": 0.85} +``` + +Rewards are normalized per epoch: clipped to `[-1, 1]`, then min-max scaled to `[0, 1]`. Reward 0 = sample ignored; reward 1 = full gradient. 
+ +### Verify and use the adapter + +```bash +# Hot-load for inference (no merge needed) +./build/bin/llama-cli --model base.gguf --lora adapter.gguf -p "Hello" + +# Merge into base model +./build/bin/llama-export-lora \ + --model base.gguf --lora adapter.gguf --output merged.gguf +``` + +--- + +## Phase 2 — Reward-Weighted SFT + +Built into `llama-finetune-qlora`. When the dataset contains a `reward` or `score` field, the cross-entropy loss for that sample is scaled by the reward before backprop. No extra flags needed — detection is automatic. + +--- + +## Phase 3 — GRPO (Online RL via IPC) + +`llama-finetune-qlora --grpo-mode` implements a full GRPO training loop where the Python process owns prompt sampling and reward scoring, and the C++ process owns model state, generation, and gradient updates. + +### Quick start + +```bash +python3 examples/qlora_training/grpo_example.py \ + --model ~/qwen3-1.7b-q4_k_m.gguf \ + --lora-out ~/grpo-adapter.gguf \ + --rank 16 --n-steps 200 --n-gen 8 +``` + +For verbose output (includes IPC message trace): + +```bash +python3 examples/qlora_training/grpo_example.py \ + --model ~/qwen3-1.7b-q4_k_m.gguf \ + --lora-out ~/grpo-adapter.gguf \ + --verbose +``` + +Resume from a checkpoint: + +```bash +python3 examples/qlora_training/grpo_example.py \ + --model ~/qwen3-1.7b-q4_k_m.gguf \ + --lora ~/grpo-adapter.ckpt50.gguf \ + --lora-out ~/grpo-adapter.gguf +``` + +### GRPO-specific flags + +| Flag | Default | Description | +|---|---|---| +| `--grpo-mode` | off | Enable GRPO IPC mode | +| `--n-gen` | `8` | Rollouts per prompt | +| `--n-steps` | `500` | Total GRPO steps | +| `--grpo-temp` | `0.8` | Sampling temperature for rollouts | +| `--grpo-max-tokens` | `512` | Max tokens per generation | + +All standard flags (`--lora-rank`, `-lr`, `-c`, `-ngl`, `--save-every`, etc.) work in GRPO mode too. `--train-file` is **not** required in GRPO mode. 
+ +### IPC protocol + +The protocol is line-based over stdout (C++ → Python) and stdin (Python → C++). All non-protocol C++ output (timing, debug, model logs) goes to **stderr** and never contaminates the protocol channel. + +**C++ → Python (stdout):** + +| Line | When | +|---|---| +| `[QLORA:READY]` | Process initialised, model loaded | +| `[QLORA:PROMPT_REQ:]` | C++ requests the prompt for step N | +| `[QLORA:GEN:/] ` | One generation (newlines escaped as `\n`) | +| `[QLORA:REWARD_REQ:]` | C++ requests N reward scores | +| `[QLORA:PROGRESS] step=X/Y loss=Z epoch=A/B` | After each weight update | +| `[QLORA:CHECKPOINT] ` | After saving a checkpoint | +| `[QLORA:DONE] final_loss=X` | Training complete | +| `[QLORA:ERROR] ` | Fatal error | + +**Python → C++ (stdin):** + +| Line | Meaning | +|---|---| +| `PROMPT ` | Send prompt for the most recent `PROMPT_REQ` | +| `REWARD ` | Send N advantage scores in `[0, 1]` range | +| `STOP` | Request graceful shutdown after current step | + +**Text encoding:** newlines in generation text are escaped as the two-character sequence `\n`; backslashes are doubled. Use `unescape()` from `grpo_example.py` (or any equivalent) to recover the original text. + +### Writing your own driver + +`grpo_example.py` contains two functions you replace with your own logic: + +```python +def get_prompt(step: int) -> str: + """Return the training prompt for step N.""" + ... + +def score_generations(prompt: str, generations: List[str]) -> List[float]: + """Score each generation. Any numeric range — will be normalised.""" + ... +``` + +The IPC helpers (`escape`, `unescape`, `parse_ipc`, `read_ipc`, `write_cmd`, `wait_for`, `normalise_rewards`) are standalone and have no external dependencies — copy them into your own project if needed. 
+ +### Training loop diagram + +``` +Python C++ (llama-finetune-qlora --grpo-mode) + │ │ + │◄──── [QLORA:READY] ────────────┤ model loaded + │ │ + │ ┌─────────────────────────────┤ + │ │ for each step: │ + │ │ ◄── PROMPT_REQ:N ─────────┤ + │ │ ──► PROMPT ────────► generate n_gen rollouts + │ │ ◄── GEN:1/n ──┤ + │ │ ◄── GEN:2/n ──┤ + │ │ ... │ + │ │ ◄── GEN:n/n ──┤ + │ │ ◄── REWARD_REQ:n ─────────┤ + │ │ (score generations) │ + │ │ ──► REWARD a1 a2 … an ────► one backward + AdamW step + │ │ ◄── PROGRESS step=N/M … ──┤ + │ └─────────────────────────────┤ + │ │ + │◄──── [QLORA:DONE] ─────────────┤ adapter saved +``` + +--- + +## Known Limitations & Optimization Roadmap + +### Current limitations + +**1. Full backprop through frozen quantized layers** +Every backward step dequantizes all frozen Q4_K_M weight tensors to compute activation gradients (needed to propagate loss from the output back to each LoRA layer). For a 28-layer 1.7B model at `-ub 512`, this is ~280 dequantizing matmuls per step → step time is 3–5× slower than inference. + +**2. Activation VRAM** *(partially addressed by `--grad-checkpoint`)* +All forward activations are kept in VRAM throughout the backward pass. VRAM ≈ `model + KV + n_layers × hidden × n_ubatch × 10 × 4B + 2 × lora_params × 4B`. Reducing `-ub` reduces VRAM linearly. Use `--grad-checkpoint 48` to prevent the allocator from reusing intermediate activation buffers during backward, which cuts peak activation VRAM at near-zero compute cost. + +**3. Full backprop through all layers** *(partially addressed by `--freeze-layers`)* +Gradients propagate through all layers that have LoRA adapters. Use `--freeze-layers N` to skip LoRA allocation for blk.0..N-1 — those layers receive no gradient (the `grads_needed` pruner already skips their backward ops automatically). Only the top (total_layers - N) layers are trained. 
+ +### Optimization roadmap + +| Priority | Optimization | Expected gain | Status | +|---|---|---|---| +| ✅ Done | **`--freeze-layers N`** — no LoRA on first N layers; backward auto-pruned | Proportional to N/total | Implemented | +| ✅ Done | **`--grad-checkpoint N`** — keep every Nth activation alive through backward | Reduces peak activation VRAM | Implemented | +| ✅ Done | **`--train-on-prompt`** — compute loss on prompt tokens too | Configurable loss target | Implemented | +| ✅ Done | **`--shuffle-dataset`** — shuffle windows each epoch | Better convergence | Implemented | +| ✅ Done | **BOS separators** — insert BOS between concatenated samples | Correct cross-sample boundaries | Implemented | +| ✅ Done | **Per-epoch loss summary** — log train/val loss after each epoch | Observability | Implemented | +| ✅ Done | **`MUL_MAT_ID` backward** — LoRA on MoE dense FFN layers; `OUT_PROD_ID` for scattered outer product | Unlocks Mixtral/Nemotron-MoE | Implemented | +| ✅ Done | **Quantized `OUT_PROD`** — dequantize on GPU + cuBLAS for backward matmul | Full GPU training (no CPU fallback) | Implemented | +| ✅ Done | **Reuse `ctx_compute_opt`** — allocate tensor metadata context once, `ggml_reset()` across ubatches | Eliminate ~0.5 s/step overhead | Implemented | +| ❌ Skip | **Static training graphs** — KV mask shape changes per ubatch (`n_kv` grows); graph topology not static | Would need KV cache redesign | Not feasible | +| Low | **`SSM_SCAN/CONV` backward** — enable LoRA on Mamba SSM layers | Unlocks NemotronH SSM layers | Planned | +| Low | **GELU backward** — implement `ggml_gelu_back` kernel (UNARY + GLU) | Support GPT-2/Phi-style models | Planned (needs new CUDA/CPU kernels) | + +--- + +## Implementation notes (for developers) + +### Modified llama.cpp files + +| File | Change | +|---|---| +| `ggml/src/ggml.c` | Backward graph fixes: `GET_ROWS` 3D, `SET_ROWS`, `MUL_MAT_ID`, `SSM_SCAN/CONV`, `FLASH_ATTN_EXT` all stop gradient; inplace-op assert → warn+skip | +| 
`src/llama-context.cpp` | `opt_init`: scheduler and graph sized with inflated capacity before `ggml_opt_init`; `opt_epoch_iter`: per-ubatch timing instrumentation; reward scaling via `g_reward_weights` TLS | +| `src/llama-adapter.cpp` | Repack-buft fallback for LoRA tensors: tries device-native buft before CPU | +| `common/common.h` | Added `save_every`, `lora_freeze_layers`, `grad_checkpoint_interval`, `train_on_prompt`, `shuffle_dataset` fields | +| `common/arg.cpp` | Added `--save-every`, `--freeze-layers`, `--grad-checkpoint`, `--train-on-prompt`, `--shuffle-dataset` arguments | +| `include/llama.h` | Added `llama_opt_set_reward_weights()`; `grad_checkpoint_interval` in `llama_opt_params`; `shuffle` param in `llama_opt_epoch` | +| `ggml/src/ggml-cuda/out-prod.cu` | `OUT_PROD` with quantized src0 (dequantize on GPU + cuBLAS); `OUT_PROD_ID` for MoE backward | +| `ggml/src/ggml-cuda/ggml-cuda.cu` | `supports_op` for quantized `OUT_PROD` and `OUT_PROD_ID`; CPU-resident ids fix in `mul_mat_id` | +| `ggml/include/ggml-opt.h` | Added `grad_checkpoint_interval` to `ggml_opt_params` | +| `ggml/src/ggml-opt.cpp` | Gradient checkpointing: marks every Nth forward node `GGML_TENSOR_FLAG_OUTPUT` before backward build | + +### Key invariants + +- `params.use_mmap = false` — forced; mmap'd tensors can't have data written back +- `params.flash_attn_type = DISABLED` — no backward impl for flash attention +- `params.warmup = false` — warmup runs inference with PARAM tensors → segfault +- `params.cache_type_k = F32` — training requires F32 KV (or BF16 with `--cache-type-k bf16`) +- LoRA A/B tensors are marked `PARAM` via `ggml_set_param` on the tensors loaded by `llama_adapter_lora_init`, not on the pre-init scratch tensors in `lt.buf` +- The adapter GGUF is pre-saved and loaded via `params.lora_adapters` BEFORE `common_init_from_params` so that `sched_reserve` includes LoRA graph nodes in its sizing + +### Why opt_init inflation matters + +`ggml_opt_init` captures `sched.get()` 
at construction time. The backward graph (`gb_grad`, `gb_opt`) is ~3–5× larger than the forward graph in node count. If the scheduler hash_set is sized only for the forward graph, `ggml_backend_sched_alloc_graph` on the backward graph will overflow it. We recreate `sched` with `inflated = fwd_nodes × 4` slots BEFORE calling `ggml_opt_init`. + +### Reward weighting implementation + +`llama_opt_set_reward_weights(weights, n)` sets thread-local `g_reward_weights`. In `opt_epoch`, each window reads `g_reward_weights[idata]` and passes it as `reward_scale` to `opt_epoch_iter`. Inside the iter loop, instead of writing `1.0f` for the correct token's label position in the cross-entropy label tensor, it writes `reward_scale`. Since cross-entropy loss = `-mean(label × log(softmax(logit)))`, scaling the label scales both loss and gradient identically. diff --git a/examples/qlora_training/check_lora_norms.py b/examples/qlora_training/check_lora_norms.py new file mode 100644 index 0000000000..8908f3ad99 --- /dev/null +++ b/examples/qlora_training/check_lora_norms.py @@ -0,0 +1,64 @@ +#!/usr/bin/env python3 +"""Quick check of LoRA tensor norms in a GGUF file.""" +import sys, struct, numpy as np + +def read_gguf(path): + with open(path, 'rb') as f: + assert f.read(4) == b'GGUF' + version = struct.unpack(' +#include +#include + +#define JSON_ASSERT GGML_ASSERT +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// --------------------------------------------------------------------------- +// Helpers +// --------------------------------------------------------------------------- + +// Expand a leading ~/ to the HOME directory (the shell doesn't do this for us +// when a path is passed as a string argument to std::ofstream). 
+static std::string expand_tilde(const std::string & path) { + if (path.size() >= 2 && path[0] == '~' && path[1] == '/') { + const char * home = getenv("HOME"); + if (!home) home = getenv("USERPROFILE"); // Windows fallback + if (home) return std::string(home) + path.substr(1); + } + return path; +} + +static std::vector split_csv(const std::string & s) { + std::vector out; + std::istringstream ss(s); + std::string tok; + while (std::getline(ss, tok, ',')) { + if (!tok.empty()) out.push_back(tok); + } + return out; +} + +// Tensors whose names contain these substrings use MUL_MAT_ID (sparse MoE expert dispatch) +// which has no backward implementation — exclude them from LoRA targets unconditionally. +static const std::vector EXCLUDED_SUBSTRINGS = { + "_exps", // MoE expert weight stacks (ffn_gate_exps, ffn_up_exps, ffn_down_exps, ffn_gate_up_exps) +}; + +static bool tensor_is_excluded(const char * name) { + const std::string n(name); + for (const auto & ex : EXCLUDED_SUBSTRINGS) { + if (n.find(ex) != std::string::npos) return true; + } + return false; +} + +// Extract the transformer block index from a tensor name of the form "blk.NN.". +// Returns -1 if the name does not follow this pattern. +static int tensor_layer_index(const char * name) { + // All per-layer tensors in llama.cpp GGUF are named "blk.." + const char * p = strstr(name, "blk."); + if (!p) return -1; + p += 4; // skip "blk." + char * end = nullptr; + long idx = strtol(p, &end, 10); + if (end == p || (*end != '.' 
&& *end != '\0')) return -1; + return (int) idx; +} + +static bool tensor_matches_targets(const char * name, const std::vector & targets, + int freeze_layers = 0) { + if (tensor_is_excluded(name)) return false; + if (freeze_layers > 0) { + const int layer = tensor_layer_index(name); + if (layer >= 0 && layer < freeze_layers) return false; + } + for (const auto & t : targets) { + if (std::string(name).find(t) != std::string::npos) return true; + } + return false; +} + +// --------------------------------------------------------------------------- +// JSONL dataset loading +// --------------------------------------------------------------------------- + +struct training_sample { + std::vector tokens; // full token sequence + std::vector is_label; // true for tokens that contribute to loss + float reward; // reward/score weight (1.0 = neutral, 0.0 = ignore) +}; + +// Apply a very simple ChatML fallback template when the model has no template. +static std::string apply_chatml(const std::vector & msgs) { + std::string out; + for (const auto & m : msgs) { + out += "<|im_start|>" + m.role + "\n"; + // content_parts is a vector; build a plain text string + std::string text; + if (!m.content_parts.empty()) { + for (const auto & p : m.content_parts) { + text += p.text; + } + } + out += text + "<|im_end|>\n"; + } + return out; +} + +static std::vector load_jsonl( + const std::string & path, + llama_context * ctx, + common_chat_templates * tmpls) { + + std::ifstream f(path); + if (!f.is_open()) { + LOG_ERR("%s: cannot open %s\n", __func__, path.c_str()); + return {}; + } + + std::vector samples; + std::string line; + int lineno = 0; + + while (std::getline(f, line)) { + ++lineno; + if (line.empty()) continue; + + nlohmann::json j; + try { j = nlohmann::json::parse(line); } + catch (...) 
{ + LOG_WRN("%s: skipping invalid JSON on line %d\n", __func__, lineno); + continue; + } + + float reward = 1.0f; + if (j.contains("reward")) reward = j["reward"].get(); + else if (j.contains("score")) reward = j["score"].get(); + + std::string prompt_text; + std::string response_text; + + if (j.contains("messages")) { + // chat format — apply template + std::vector msgs; + for (const auto & m : j["messages"]) { + common_chat_msg msg; + msg.role = m.value("role", "user"); + common_chat_msg_content_part part; + part.type = "text"; + part.text = m.value("content", ""); + msg.content_parts.push_back(part); + msgs.push_back(msg); + } + + // Skip samples where the last assistant turn contains an error marker. + // These are malformed/failed generations that should not be trained on. + { + std::string last_assistant_content; + for (int mi = (int)msgs.size() - 1; mi >= 0; --mi) { + if (msgs[mi].role == "assistant") { + last_assistant_content = msgs[mi].content_parts.empty() + ? "" : msgs[mi].content_parts[0].text; + break; + } + } + if (last_assistant_content.find("Error:") != std::string::npos || + last_assistant_content.find("error:") != std::string::npos) { + LOG_DBG("%s: skipping line %d — assistant response contains error marker\n", __func__, lineno); + continue; + } + } + + // Split into prompt (no loss) + last assistant response (loss). + // Render all messages except the last assistant turn as the prompt + // (with add_generation_prompt=true so the template adds the assistant + // prefix), then use the raw last assistant content as response_text. + // This ensures only the assistant's response tokens get loss, not the + // user turns or system prompt. 
+ if (msgs.empty()) continue; + std::string last_assistant_content; + std::vector prompt_msgs; + // Find the last assistant message + int last_asst_idx = -1; + for (int mi = (int)msgs.size() - 1; mi >= 0; --mi) { + if (msgs[mi].role == "assistant") { last_asst_idx = mi; break; } + } + if (last_asst_idx < 0) { + // No assistant turn — skip; nothing to train on + LOG_DBG("%s: skipping line %d — no assistant turn\n", __func__, lineno); + continue; + } + last_assistant_content = msgs[last_asst_idx].content_parts.empty() + ? "" : msgs[last_asst_idx].content_parts[0].text; + for (int mi = 0; mi < last_asst_idx; ++mi) prompt_msgs.push_back(msgs[mi]); + + if (tmpls) { + common_chat_templates_inputs inp; + inp.messages = prompt_msgs; + inp.add_generation_prompt = true; + prompt_text = common_chat_templates_apply(tmpls, inp).prompt; + response_text = last_assistant_content; + } else { + // Fallback: render everything as ChatML, use full text as response + std::vector all_msgs = prompt_msgs; + all_msgs.push_back(msgs[last_asst_idx]); + prompt_text = ""; + response_text = apply_chatml(all_msgs); + } + } else if (j.contains("prompt") && j.contains("response")) { + response_text = j["response"].get(); + if (response_text.find("Error:") != std::string::npos || + response_text.find("error:") != std::string::npos) { + LOG_DBG("%s: skipping line %d — response contains error marker\n", __func__, lineno); + continue; + } + prompt_text = j["prompt"].get(); + } else if (j.contains("text")) { + response_text = j["text"].get(); + } else { + LOG_WRN("%s: unknown format on line %d, skipping\n", __func__, lineno); + continue; + } + + // Tokenize: prompt (no loss) + response (loss) + auto tok_prompt = common_tokenize(ctx, prompt_text, /*add_special=*/true); + auto tok_response = common_tokenize(ctx, response_text, /*add_special=*/false); + + if (tok_prompt.empty() && tok_response.empty()) continue; + + training_sample s; + s.reward = reward; + s.tokens.insert(s.tokens.end(), 
tok_prompt.begin(), tok_prompt.end()); + s.tokens.insert(s.tokens.end(), tok_response.begin(), tok_response.end()); + s.is_label.resize(s.tokens.size(), false); + // Only response tokens contribute to the loss + for (size_t i = tok_prompt.size(); i < s.tokens.size(); ++i) { + s.is_label[i] = true; + } + samples.push_back(std::move(s)); + } + + LOG_INF("%s: loaded %zu samples from %s\n", __func__, samples.size(), path.c_str()); + return samples; +} + +// Pack variable-length samples into fixed-context-length windows and create +// an ggml_opt_dataset. Labels for prompt tokens are set to -1 (ignored by +// the loss in the epoch loop). +// window_rewards is filled with one reward weight per window (averaged over +// the sample tokens that fall in that window). If all samples have reward=1.0 +// the vector is all-ones and has no effect. +static ggml_opt_dataset_t build_dataset( + const std::vector & samples, + int32_t n_ctx, + std::vector & window_rewards, + bool train_on_prompt = false, + llama_token bos_token = -1) { + + // Flatten samples into token/label/reward streams + std::vector flat_tokens; + std::vector flat_labels; // -1 = no loss, token_id = loss target + std::vector flat_rewards; // per-token reward from the source sample + + for (size_t si = 0; si < samples.size(); ++si) { + const auto & s = samples[si]; + + // Insert BOS separator between samples to prevent cross-sample predictions. + // The first sample already has BOS from tokenization (add_special=true). 
+ if (si > 0 && bos_token >= 0 && !s.tokens.empty()) { + flat_tokens .push_back(bos_token); + flat_labels .push_back(-1); // no loss on separator + flat_rewards.push_back(s.reward); + } + + for (size_t i = 0; i + 1 < s.tokens.size(); ++i) { + flat_tokens .push_back(s.tokens[i]); + if (train_on_prompt) { + // All positions get correct next-token label (prompt + response) + flat_labels.push_back((int32_t)s.tokens[i + 1]); + } else { + // Only response positions get loss; prompt positions get -1 (sentinel). + // The sentinel is passed through to labels_sparse; opt_epoch_iter skips + // writing to the label tensor for those positions, leaving them zeroed → + // zero cross-entropy contribution. No gradient flows from prompt tokens. + flat_labels.push_back(s.is_label[i + 1] ? (int32_t)s.tokens[i + 1] : -1); + } + flat_rewards.push_back(s.reward); + } + } + + if ((int64_t)flat_tokens.size() < n_ctx) { + LOG_ERR("%s: dataset too small (%zu tokens) for context %d\n", + __func__, flat_tokens.size(), n_ctx); + return nullptr; + } + + const int64_t stride = n_ctx / 2; + int64_t ndata = ((int64_t)flat_tokens.size() - n_ctx) / stride; + if (ndata < 1) ndata = 1; // at least one window when flat_tokens >= n_ctx + + window_rewards.resize(ndata); + + ggml_opt_dataset_t dataset = ggml_opt_dataset_init( + GGML_TYPE_I32, GGML_TYPE_I32, n_ctx, n_ctx, ndata, 1); + + int32_t * data = (int32_t *) ggml_opt_dataset_data (dataset)->data; + int32_t * labels = (int32_t *) ggml_opt_dataset_labels(dataset)->data; + + for (int64_t i = 0; i < ndata; ++i) { + const int64_t off = i * stride; + float reward_sum = 0.0f; + for (int32_t j = 0; j < n_ctx; ++j) { + data [i * n_ctx + j] = flat_tokens[off + j]; + // Pass -1 sentinel through unchanged for masked (prompt) positions. + // opt_epoch_iter skips these positions (no label tensor write → zero + // cross-entropy contribution). 
Do NOT substitute the current token + // here — that trains the model to predict itself (off-by-one) and + // causes repetition degeneration. + labels[i * n_ctx + j] = flat_labels[off + j]; + reward_sum += flat_rewards[off + j]; + } + window_rewards[i] = reward_sum / n_ctx; + } + + // Normalize window rewards to [0, 1]. + // Step 1: clip to [-1, 1] — outliers like 1.3/1.4 would otherwise compress the + // useful signal range after min-max scaling (a reward=1.0 would map to + // only 0.83 instead of 1.0 if the max is 1.4). + // Step 2: min-max scale clipped values → [0, 1]. + // min → 0.0 (window ignored), max → 1.0 (full weight). + // If all rewards are identical (pure SFT dataset) keep at 1.0. + for (float & r : window_rewards) { + r = std::max(-1.0f, std::min(1.0f, r)); + } + float rmin = *std::min_element(window_rewards.begin(), window_rewards.end()); + float rmax = *std::max_element(window_rewards.begin(), window_rewards.end()); + const float rrange = rmax - rmin; + if (rrange > 1e-6f) { + for (float & r : window_rewards) { + r = (r - rmin) / rrange; + } + LOG_INF("%s: reward range [%.4f, %.4f] (after clip to [-1,1]) → normalized to [0, 1]\n", __func__, rmin, rmax); + } else { + std::fill(window_rewards.begin(), window_rewards.end(), 1.0f); + } + + return dataset; +} + +// --------------------------------------------------------------------------- +// LoRA tensor allocation +// --------------------------------------------------------------------------- + +struct lora_tensors { + struct ggml_context * ctx = nullptr; + struct ggml_backend_buffer * buf = nullptr; + // map: base tensor name → {lora_a, lora_b} + std::unordered_map> ab; +}; + +static lora_tensors alloc_lora_tensors( + const std::string & model_path, + const std::vector & targets, + int32_t rank, + std::mt19937 & rng, + int32_t freeze_layers = 0) { + + lora_tensors lt; + + // Open the model GGUF to discover tensor names and shapes + // without needing access to private llama_model internals. 
+ struct ggml_context * ctx_meta = nullptr; + struct gguf_init_params gguf_params = { /*.no_alloc=*/true, /*.ctx=*/&ctx_meta }; + struct gguf_context * ctx_gguf = gguf_init_from_file(model_path.c_str(), gguf_params); + if (!ctx_gguf) { + LOG_ERR("%s: failed to open model GGUF for tensor discovery: %s\n", + __func__, model_path.c_str()); + return lt; + } + + // Collect matching 2-D tensors + struct tensor_info { std::string name; int64_t ne0, ne1; }; + std::vector matched; + + for (ggml_tensor * t = ggml_get_first_tensor(ctx_meta); + t; t = ggml_get_next_tensor(ctx_meta, t)) { + if (ggml_n_dims(t) < 2) continue; + if (!tensor_matches_targets(t->name, targets, freeze_layers)) continue; + matched.push_back({t->name, t->ne[0], t->ne[1]}); + } + + gguf_free(ctx_gguf); + ggml_free(ctx_meta); + + if (matched.empty()) { + LOG_ERR("%s: no model tensors matched --lora-targets; check spelling\n", __func__); + return lt; + } + + if (freeze_layers > 0) { + LOG_INF("%s: freezing layers blk.0 .. blk.%d (no LoRA allocated; backward already pruned by grads_needed)\n", + __func__, freeze_layers - 1); + } + LOG_INF("%s: allocating LoRA A/B tensors for %zu weight matrices, rank=%d\n", + __func__, matched.size(), rank); + + // Allocate ggml context for A+B tensors (2 tensors per matched weight) + const size_t mem = (2 * matched.size() + 16) * ggml_tensor_overhead(); + struct ggml_init_params ip = { mem, nullptr, /*no_alloc=*/true }; + lt.ctx = ggml_init(ip); + + for (const auto & ti : matched) { + const int64_t in_dim = ti.ne0; // columns (input features) + const int64_t out_dim = ti.ne1; // rows (output features) + + // lora_a: [in_dim, rank] applied first: a @ x + // lora_b: [rank, out_dim] applied second: b @ (a @ x) + // Convention matches llama-adapter.cpp:48-60: + // a->ne[0] == in_dim, a->ne[1] == rank + // b->ne[0] == rank, b->ne[1] == out_dim + ggml_tensor * la = ggml_new_tensor_2d(lt.ctx, GGML_TYPE_F32, in_dim, rank); + ggml_tensor * lb = ggml_new_tensor_2d(lt.ctx, 
GGML_TYPE_F32, rank, out_dim); + + ggml_set_name(la, (ti.name + ".lora_a").c_str()); + ggml_set_name(lb, (ti.name + ".lora_b").c_str()); + + lt.ab[ti.name] = {la, lb}; + } + + // Allocate backend buffer for all LoRA tensors at once + lt.buf = ggml_backend_alloc_ctx_tensors_from_buft(lt.ctx, ggml_backend_cpu_buffer_type()); + + // Initialize: A ~ N(0, 1/sqrt(rank)), B = 0 + const float std_a = 1.0f / std::sqrt((float)rank); + std::normal_distribution dist(0.0f, std_a); + + for (auto & kv : lt.ab) { + ggml_tensor * la = kv.second.first; + ggml_tensor * lb = kv.second.second; + + // Fill A + float * data_a = (float *) la->data; + for (int64_t i = 0; i < ggml_nelements(la); ++i) data_a[i] = dist(rng); + // Zero B + memset(lb->data, 0, ggml_nbytes(lb)); + } + + return lt; +} + +// --------------------------------------------------------------------------- +// Param filter: only train lora_a / lora_b tensors +// --------------------------------------------------------------------------- + +static bool lora_param_filter(const struct ggml_tensor * t, void * /*ud*/) { + const char * n = t->name; + const size_t len = strlen(n); + if (len > 7 && strcmp(n + len - 7, ".lora_a") == 0) return true; + if (len > 7 && strcmp(n + len - 7, ".lora_b") == 0) return true; + return false; +} + +// --------------------------------------------------------------------------- +// Save adapter GGUF +// --------------------------------------------------------------------------- + +static std::string basename_from_path(const std::string & p) { + const size_t pos = p.find_last_of("/\\"); + if (pos == std::string::npos) return p; + return p.substr(pos + 1); +} + +static void save_adapter( + const lora_tensors & lt, + const std::string & out_path, + const std::string & arch, + float alpha, + const std::string & base_model_path) { + + // Build output GGUF context + struct gguf_context * gctx = gguf_init_empty(); + + // Metadata required by llama_adapter_lora_init + gguf_set_val_str(gctx, 
"general.type", "adapter"); + gguf_set_val_str(gctx, "general.architecture", arch.c_str()); + gguf_set_val_str(gctx, "adapter.type", "lora"); + gguf_set_val_f32(gctx, "adapter.lora.alpha", alpha); + gguf_set_val_str(gctx, "adapter.base_model", basename_from_path(base_model_path).c_str()); + + // Register tensors + for (const auto & kv : lt.ab) { + gguf_add_tensor(gctx, kv.second.first); // lora_a + gguf_add_tensor(gctx, kv.second.second); // lora_b + } + + // Write: meta placeholder → tensor data → rewrite meta + const std::string real_path = expand_tilde(out_path); + std::ofstream fout(real_path, std::ios::binary); + if (!fout.is_open()) { + LOG_ERR("%s: cannot open %s for writing\n", __func__, real_path.c_str()); + gguf_free(gctx); + return; + } + + // Write meta placeholder + const size_t meta_size = gguf_get_meta_size(gctx); + std::vector zeros_buf(meta_size, 0); + fout.write(zeros_buf.data(), meta_size); + + // Write tensor data — copy to CPU first in case tensors live on GPU + for (const auto & kv : lt.ab) { + for (ggml_tensor * t : {kv.second.first, kv.second.second}) { + const size_t nb = ggml_nbytes(t); + std::vector cpu_buf(nb); + ggml_backend_tensor_get(t, cpu_buf.data(), 0, nb); + fout.write(cpu_buf.data(), nb); + // GGUF tensors are 32-byte aligned + const size_t pad = GGML_PAD(nb, 32) - nb; + if (pad > 0) { + std::vector pad_buf(pad, 0); + fout.write(pad_buf.data(), pad); + } + } + } + + // Re-write metadata at offset 0 + std::vector meta(meta_size); + gguf_get_meta_data(gctx, meta.data()); + fout.seekp(0); + fout.write((const char *) meta.data(), meta_size); + + fout.close(); + gguf_free(gctx); + + LOG_INF("%s: adapter saved to %s\n", __func__, real_path.c_str()); +} + +// --------------------------------------------------------------------------- +// Periodic checkpoint callback +// --------------------------------------------------------------------------- + +struct save_ctx { + const lora_tensors * lt; + const std::string * lora_out; + const 
std::string * arch; + const std::string * base_model_path; + float lora_alpha; + int32_t save_every; // 0 = disabled + int32_t ubatch_per_ctx; + int64_t last_saved; // last window index at which we saved +}; + +// TLS pointer set before each epoch so the static callback can access it. +static thread_local save_ctx * g_save_ctx = nullptr; + +static void save_every_callback( + bool train, + ggml_opt_context_t opt_ctx, + ggml_opt_dataset_t dataset, + ggml_opt_result_t result, + int64_t ibatch, + int64_t ibatch_max, + int64_t t_start_us) { + ggml_opt_epoch_callback_progress_bar(train, opt_ctx, dataset, result, ibatch, ibatch_max, t_start_us); + + // Log loss at every window boundary so we can see if/when it diverges. + if (train && g_save_ctx) { + const int64_t window = ibatch / g_save_ctx->ubatch_per_ctx; + const int64_t ubatch_in_window = ibatch % g_save_ctx->ubatch_per_ctx; + if (ubatch_in_window == g_save_ctx->ubatch_per_ctx - 1) { + double loss = 0.0, loss_unc = 0.0; + ggml_opt_result_loss(result, &loss, &loss_unc); + fprintf(stderr, "\n[window %4ld] loss=%.4f ± %.4f\n", (long)window, loss, loss_unc); + } + } + + if (!train || !g_save_ctx || g_save_ctx->save_every <= 0) return; + const int64_t window = ibatch / g_save_ctx->ubatch_per_ctx; + if (window > 0 && window != g_save_ctx->last_saved && window % g_save_ctx->save_every == 0) { + g_save_ctx->last_saved = window; + const std::string ckpt = *g_save_ctx->lora_out + ".ckpt" + std::to_string(window) + ".gguf"; + save_adapter(*g_save_ctx->lt, ckpt, *g_save_ctx->arch, g_save_ctx->lora_alpha, *g_save_ctx->base_model_path); + fprintf(stderr, "\n"); + LOG_INF("save_every_callback: checkpoint saved -> %s (window %ld)\n", ckpt.c_str(), (long)window); + } +} + +// --------------------------------------------------------------------------- +// IPC helpers (stdout protocol, stdin commands) +// --------------------------------------------------------------------------- + +// Escape newlines and backslashes for single-line 
IPC transmission. +// Mirrors _escape() in gguf_trainer.py. +static std::string ipc_escape(const std::string & s) { + std::string out; + out.reserve(s.size()); + for (char c : s) { + if (c == '\\') out += "\\\\"; + else if (c == '\n') out += "\\n"; + else if (c == '\r') out += "\\r"; + else out += c; + } + return out; +} + +static void ipc_emit(const char * msg) { + fputs(msg, stdout); + fputc('\n', stdout); + fflush(stdout); +} + +// Read one line from stdin, trimming the trailing newline. +// Returns false on EOF or error. +static bool ipc_read_line(std::string & out) { + out.clear(); + if (!std::getline(std::cin, out)) return false; + // Strip trailing \r if present (Windows line endings) + if (!out.empty() && out.back() == '\r') out.pop_back(); + return true; +} + +// Parse "REWARD r1 r2 ... rN" into a float vector. +static std::vector ipc_parse_rewards(const std::string & line) { + std::vector rewards; + if (line.size() < 8 || line.substr(0, 7) != "REWARD ") return rewards; + std::istringstream ss(line.substr(7)); + float r; + while (ss >> r) rewards.push_back(r); + return rewards; +} + +// --------------------------------------------------------------------------- +// Greedy / temperature sampling for GRPO rollout generation +// --------------------------------------------------------------------------- + +static std::string generate_response( + llama_context * ctx, + llama_model * model, + const std::string & prompt, + int32_t max_tokens, + float temperature, + std::mt19937 & rng) { + + const llama_vocab * vocab = llama_model_get_vocab(model); + auto tokens = common_tokenize(ctx, prompt, /*add_special=*/true); + if (tokens.empty()) return ""; + + // Clear KV cache before each generation (don't carry over previous prompt state) + llama_memory_clear(llama_get_memory(ctx), true); + { + llama_batch batch = llama_batch_get_one(tokens.data(), (int32_t)tokens.size()); + if (llama_decode(ctx, batch) != 0) { + LOG_ERR("%s: llama_decode failed on prompt\n", __func__); 
+ return ""; + } + } + + std::string output; + const llama_token eos = llama_vocab_eos(vocab); + const llama_token nl = llama_vocab_nl(vocab); + + // For ChatML models <|im_end|> is the turn-end marker but may not be the + // vocab EOS token. Look it up by tokenizing the string and taking the + // first token if it tokenizes to exactly one piece. + llama_token im_end = -1; + { + std::vector im_end_tokens(8); + static const char im_end_str[] = "<|im_end|>"; + int n = llama_tokenize(vocab, im_end_str, (int32_t)strlen(im_end_str), im_end_tokens.data(), (int32_t)im_end_tokens.size(), /*add_special=*/false, /*parse_special=*/true); + if (n == 1) im_end = im_end_tokens[0]; + } + const llama_token eot = llama_vocab_eot(vocab); // may equal eos on some models + + for (int32_t i = 0; i < max_tokens; ++i) { + // Sample next token — use ith=-1 to always get the LAST output position's + // logits. llama_get_logits(ctx) returns position 0 which is wrong when the + // prompt batch has multiple output tokens (training context). 
+ float * logits = llama_get_logits_ith(ctx, -1); + if (!logits) { + LOG_ERR("%s: llama_get_logits_ith(-1) returned NULL\n", __func__); + break; + } + const int32_t n_vocab = llama_vocab_n_tokens(vocab); + + llama_token next_token; + if (temperature <= 0.0f) { + // Greedy + next_token = (llama_token)(std::max_element(logits, logits + n_vocab) - logits); + } else { + // Temperature sampling via softmax + categorical draw + std::vector probs(n_vocab); + float max_logit = *std::max_element(logits, logits + n_vocab); + float sum = 0.0f; + for (int32_t k = 0; k < n_vocab; ++k) { + probs[k] = std::exp((logits[k] - max_logit) / temperature); + sum += probs[k]; + } + for (float & p : probs) p /= sum; + std::discrete_distribution dist(probs.begin(), probs.end()); + next_token = dist(rng); + } + + if (next_token == eos) break; + if (next_token == eot) break; + if (im_end >= 0 && next_token == im_end && !output.empty()) break; + + // Decode token to text + char buf[256] = {}; + llama_token_to_piece(vocab, next_token, buf, sizeof(buf) - 1, 0, true); + output += buf; + + // Feed token back for next step + llama_batch batch = llama_batch_get_one(&next_token, 1); + if (llama_decode(ctx, batch) != 0) break; + } + + return output; +} + +// --------------------------------------------------------------------------- +// GRPO IPC training loop +// --------------------------------------------------------------------------- + +// Volatile flag set by SIGINT so the loop can exit cleanly. 
+static volatile sig_atomic_t g_grpo_stop = 0; +static void grpo_sigint_handler(int) { g_grpo_stop = 1; } + +static int run_grpo_mode( + common_params & params, + llama_model * model, + llama_context * ctx, + lora_tensors & lt, + const std::string & arch, + float lora_alpha, + const std::string & base_model_path) { + + const int32_t n_ctx = llama_n_ctx(ctx); + const int32_t n_gen = params.grpo_n_gen; + const int32_t n_steps = params.grpo_n_steps; + const float temp = params.grpo_temperature; + const int32_t max_tok = params.grpo_max_tokens; + + std::mt19937 rng(params.sampling.seed != LLAMA_DEFAULT_SEED + ? params.sampling.seed : 42); + + // Initialize optimizer + struct llama_opt_params lopt_params { + /*.n_ctx_train =*/0, + /*.param_filter =*/lora_param_filter, + /*.param_filter_ud =*/nullptr, + /*.get_opt_pars =*/common_opt_lr_pars, + /*.get_opt_pars_ud =*/¶ms.lr, + /*.optimizer_type =*/params.optimizer, + /*.grad_checkpoint_interval =*/params.grad_checkpoint_interval, + }; + llama_opt_init(ctx, model, lopt_params); + + const llama_token bos = llama_vocab_bos(llama_model_get_vocab(model)); + + signal(SIGINT, grpo_sigint_handler); + + // Signal Python that we are ready + ipc_emit("[QLORA:READY]"); + + float last_loss = 0.0f; + int step = 0; + + while (step < n_steps && !g_grpo_stop) { + + // ── Request prompt ──────────────────────────────────────────────── + { + char buf[64]; + snprintf(buf, sizeof(buf), "[QLORA:PROMPT_REQ:%d]", step + 1); + ipc_emit(buf); + } + + std::string prompt_line; + if (!ipc_read_line(prompt_line)) break; + if (prompt_line == "STOP") { + LOG_INF("grpo: received STOP from Python\n"); + break; + } + if (prompt_line.size() < 8 || prompt_line.substr(0, 7) != "PROMPT ") { + char buf[128]; + snprintf(buf, sizeof(buf), "[QLORA:ERROR] expected PROMPT, got: %.80s", prompt_line.c_str()); + ipc_emit(buf); + return 1; + } + // Unescape the prompt (\\n → \n etc.) 
+ std::string prompt; + { + const std::string esc = prompt_line.substr(7); + prompt.reserve(esc.size()); + for (size_t i = 0; i < esc.size(); ++i) { + if (esc[i] == '\\' && i + 1 < esc.size()) { + char next = esc[i+1]; + if (next == 'n') { prompt += '\n'; ++i; } + else if (next == 'r') { prompt += '\r'; ++i; } + else if (next == '\\') { prompt += '\\'; ++i; } + else { prompt += esc[i]; } + } else { + prompt += esc[i]; + } + } + } + + // ── Generate N responses ────────────────────────────────────────── + std::vector generations(n_gen); + for (int k = 0; k < n_gen; ++k) { + generations[k] = generate_response(ctx, model, prompt, max_tok, temp, rng); + + char hdr[64]; + snprintf(hdr, sizeof(hdr), "[QLORA:GEN:%d/%d] ", k + 1, n_gen); + std::string msg = std::string(hdr) + ipc_escape(generations[k]); + ipc_emit(msg.c_str()); + } + + // ── Request rewards ─────────────────────────────────────────────── + { + char buf[64]; + snprintf(buf, sizeof(buf), "[QLORA:REWARD_REQ:%d]", n_gen); + ipc_emit(buf); + } + + std::string reward_line; + if (!ipc_read_line(reward_line)) break; + if (reward_line == "STOP") { + LOG_INF("grpo: received STOP from Python\n"); + break; + } + std::vector rewards = ipc_parse_rewards(reward_line); + if ((int32_t)rewards.size() != n_gen) { + char buf[128]; + snprintf(buf, sizeof(buf), "[QLORA:ERROR] expected %d rewards, got %zu", n_gen, rewards.size()); + ipc_emit(buf); + return 1; + } + + // ── Build single-step mini-dataset: prompt+generations with rewards ─ + // Each generation is a separate sample; prompt = no-loss, generation = loss. 
std::vector<training_sample> step_samples;
ggml_opt_result_loss(step_result, &loss, &loss_unc); + last_loss = (float)loss; + + ggml_opt_result_free(step_result); + ggml_opt_dataset_free(step_dataset); + llama_opt_set_reward_weights(nullptr, 0); + + ++step; + + // ── Emit progress ───────────────────────────────────────────────── + { + char buf[128]; + snprintf(buf, sizeof(buf), + "[QLORA:PROGRESS] step=%d/%d loss=%.4f epoch=1/1", + step, n_steps, last_loss); + ipc_emit(buf); + } + + // ── Optional checkpoint ─────────────────────────────────────────── + if (params.save_every > 0 && step % params.save_every == 0) { + std::string ckpt = params.lora_out + ".ckpt" + std::to_string(step) + ".gguf"; + save_adapter(lt, ckpt, arch, lora_alpha, base_model_path); + char buf[512]; + snprintf(buf, sizeof(buf), "[QLORA:CHECKPOINT] %s", ckpt.c_str()); + ipc_emit(buf); + } + } + + // Save final adapter + save_adapter(lt, params.lora_out, arch, lora_alpha, base_model_path); + + { + char buf[64]; + snprintf(buf, sizeof(buf), "[QLORA:DONE] final_loss=%.4f", last_loss); + ipc_emit(buf); + } + + return 0; +} + +// --------------------------------------------------------------------------- +int main(int argc, char ** argv) { + std::setlocale(LC_NUMERIC, "C"); + + common_params params; + params.escape = false; + + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_FINETUNE_QLORA)) { + return 1; + } + + if (!params.grpo_mode && params.train_file.empty()) { + LOG_ERR("%s: --train-file is required (or use --grpo-mode for IPC training)\n", __func__); + return 1; + } + + // Force settings required for training + params.use_mmap = false; + params.cache_type_k = GGML_TYPE_F32; + params.cache_type_v = GGML_TYPE_F32; + // Warmup runs inference with PARAM-flagged tensors which causes a segfault; + // training never benefits from warmup, so disable it unconditionally. + params.warmup = false; + // Flash attention has no backward implementation; force standard attention for training. 
+ params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_DISABLED; + + const float lora_alpha = (params.lora_alpha > 0.0f) + ? params.lora_alpha : (float) params.lora_rank; + const auto targets = split_csv(params.lora_targets); + + // --- Step 1: Discover tensor shapes from model GGUF (no model load yet) --- + std::string arch; + { + struct ggml_context * ctx_meta = nullptr; + struct gguf_init_params gp = { true, &ctx_meta }; + struct gguf_context * ctx_gguf = gguf_init_from_file(params.model.path.c_str(), gp); + if (!ctx_gguf) { LOG_ERR("failed to open model GGUF\n"); return 1; } + int kid = gguf_find_key(ctx_gguf, "general.architecture"); + if (kid >= 0) arch = gguf_get_val_str(ctx_gguf, kid); + gguf_free(ctx_gguf); + ggml_free(ctx_meta); + } + + // --- Step 2: Allocate LoRA tensors and save initial adapter GGUF --- + // If the user already supplied a --lora adapter we reuse it (resume training). + // Otherwise we allocate fresh tensors (B=0, A=random), write them to a temp + // .init.gguf so common_init_from_params can load them before context creation + // (this makes sched_reserve size the graph to include LoRA nodes). 
+ const bool resume_from_lora = !params.lora_adapters.empty(); + + std::mt19937 rng(42); + lora_tensors lt; // will be populated after context load (Step 4) + std::string init_adapter_path; + + if (!resume_from_lora) { + lt = alloc_lora_tensors(params.model.path, targets, params.lora_rank, rng, params.lora_freeze_layers); + if (lt.ab.empty()) return 1; + + init_adapter_path = params.lora_out + ".init.gguf"; + save_adapter(lt, init_adapter_path, arch, lora_alpha, params.model.path); + + // Register adapter so common_init_from_params loads it before context creation + common_adapter_lora_info adapter_info; + adapter_info.path = init_adapter_path; + adapter_info.scale = 1.0f; + params.lora_adapters.push_back(adapter_info); + } else { + LOG_INF("%s: resuming training from existing LoRA adapter: %s\n", + __func__, params.lora_adapters.back().path.c_str()); + } + + // --- Step 3: Load model + context (graph sized with LoRA nodes) --- + common_init(); + llama_backend_init(); + llama_numa_init(params.numa); + + auto llama_init = common_init_from_params(params); + auto * model = llama_init->model(); + auto * ctx = llama_init->context(); + + if (!model) { LOG_ERR("failed to load model\n"); return 1; } + + LOG_INF("%s\n", common_params_get_system_info(params).c_str()); + + // Arch fallback if not in GGUF metadata + if (arch.empty()) { + char buf[256] = {}; + llama_model_desc(model, buf, sizeof(buf)); + arch = std::string(buf); + arch = arch.substr(0, arch.find_first_of(" /")); + } + + // --- Step 4: Mark the loaded adapter tensors as trainable --- + // common_init_from_params loaded the adapter; params.lora_adapters[back].ptr + // points to the live llama_adapter_lora with its own tensor copies in device + // memory. Mark those tensors trainable so the optimizer graph includes them. 
+ { + llama_adapter_lora * loaded = params.lora_adapters.back().ptr; + if (!loaded) { + LOG_ERR("%s: adapter was not loaded by common_init_from_params\n", __func__); + return 1; + } + for (auto & kv : loaded->ab_map) { + ggml_set_param(kv.second.a); // lora_a → trainable + ggml_set_param(kv.second.b); // lora_b → trainable + } + // Point lt.ab at the live device tensors so save_adapter writes + // the trained weights (not the original init tensors). + lt.ab.clear(); + for (auto & kv : loaded->ab_map) { + lt.ab[kv.first] = {kv.second.a, kv.second.b}; + } + } + + // Remove temp init file when we created it (resume path has no init file) + if (!resume_from_lora && !init_adapter_path.empty()) { + std::remove(expand_tilde(init_adapter_path).c_str()); + } + + // --- Step 5: Load dataset --- + // In GRPO mode the dataset comes from Python via stdin/stdout — skip file loading. + auto tmpls = common_chat_templates_init(model, ""); + if (params.grpo_mode) { + int rc = run_grpo_mode(params, model, ctx, lt, arch, lora_alpha, params.model.path); + if (lt.buf) ggml_backend_buffer_free(lt.buf); + if (lt.ctx) ggml_free(lt.ctx); + llama_backend_free(); + return rc; + } + auto samples = load_jsonl(params.train_file, ctx, tmpls.get()); + if (samples.empty()) { + LOG_ERR("%s: no training samples loaded\n", __func__); + return 1; + } + + const int32_t n_ctx = llama_n_ctx(ctx); + std::vector window_rewards; + const llama_token bos = llama_vocab_bos(llama_model_get_vocab(model)); + auto dataset = build_dataset(samples, n_ctx, window_rewards, params.train_on_prompt, bos); + if (!dataset) return 1; + + // Check if any reward deviates from 1.0 — if so, enable reward-weighted SFT + const bool has_rewards = std::any_of(window_rewards.begin(), window_rewards.end(), + [](float r){ return std::abs(r - 1.0f) > 1e-4f; }); + if (has_rewards) { + LOG_INF("%s: reward-weighted SFT enabled (found non-uniform rewards in dataset)\n", __func__); + llama_opt_set_reward_weights(window_rewards.data(), 
(int64_t)window_rewards.size());
+ }
+
+ // Initialize optimizer — our custom param filter restricts training to lora_a/b
+ struct llama_opt_params lopt_params {
+ /*.n_ctx_train =*/0,
+ /*.param_filter =*/lora_param_filter,
+ /*.param_filter_ud =*/nullptr,
+ /*.get_opt_pars =*/common_opt_lr_pars,
+ /*.get_opt_pars_ud =*/&params.lr,
+ /*.optimizer_type =*/params.optimizer,
+ /*.grad_checkpoint_interval =*/params.grad_checkpoint_interval,
+ };
+ llama_opt_init(ctx, model, lopt_params);
+
+ const int64_t idata_split = ggml_opt_dataset_ndata(dataset) * (1.0f - params.val_split);
+
+ ggml_opt_result_t result_train = ggml_opt_result_init();
+ ggml_opt_result_t result_eval = ggml_opt_result_init();
+
+ const int32_t n_ubatch = llama_n_ubatch(ctx);
+ const int32_t ubatch_per_ctx = (n_ubatch > 0) ? (n_ctx / n_ubatch) : 1;
+
+ save_ctx sctx { &lt, &params.lora_out, &arch, &params.model.path, lora_alpha, params.save_every, ubatch_per_ctx, 0 };
+ g_save_ctx = &sctx;
+
+ const int64_t total_windows = ggml_opt_dataset_ndata(dataset);
+ LOG_INF("%s: starting QLoRA training — rank=%d alpha=%.1f epochs=%d loss=%s\n",
+ __func__, params.lora_rank, lora_alpha, params.lr.epochs,
+ params.train_on_prompt ? "prompt+response" : "response-only");
+ LOG_INF("%s: dataset: %ld windows × %d ubatches = %ld steps per epoch (n_ctx=%d n_ubatch=%d stride=%d)\n",
+ __func__, (long)total_windows, ubatch_per_ctx, (long)(idata_split * ubatch_per_ctx),
+ n_ctx, n_ubatch, n_ctx / 2);
+ if (params.save_every > 0) {
+ LOG_INF("%s: will save checkpoint every %d windows → %s.ckptN.gguf\n",
+ __func__, params.save_every, params.lora_out.c_str());
+ }
+
+ ggml_opt_epoch_callback cb_train = (params.save_every > 0)
+ ?
save_every_callback + : ggml_opt_epoch_callback_progress_bar; + + for (params.lr.epoch = 0; params.lr.epoch < params.lr.epochs; ++params.lr.epoch) { + sctx.last_saved = 0; // reset per-epoch window counter + llama_opt_epoch(ctx, dataset, result_train, result_eval, idata_split, + cb_train, + ggml_opt_epoch_callback_progress_bar, + params.shuffle_dataset); + fprintf(stderr, "\n"); + + // Per-epoch loss summary + { + double train_loss = 0.0, train_unc = 0.0; + ggml_opt_result_loss(result_train, &train_loss, &train_unc); + if (idata_split < ggml_opt_dataset_ndata(dataset)) { + double val_loss = 0.0, val_unc = 0.0; + ggml_opt_result_loss(result_eval, &val_loss, &val_unc); + LOG_INF("epoch %d/%d: train_loss=%.4f ± %.4f val_loss=%.4f ± %.4f\n", + params.lr.epoch + 1, params.lr.epochs, train_loss, train_unc, val_loss, val_unc); + } else { + LOG_INF("epoch %d/%d: train_loss=%.4f ± %.4f\n", + params.lr.epoch + 1, params.lr.epochs, train_loss, train_unc); + } + } + + ggml_opt_result_reset(result_train); + ggml_opt_result_reset(result_eval); + } + + ggml_opt_result_free(result_train); + ggml_opt_result_free(result_eval); + llama_opt_set_reward_weights(nullptr, 0); + + // Save final trained adapter + save_adapter(lt, params.lora_out, arch, lora_alpha, params.model.path); + + // Free scratch buffers only when we allocated them (not in resume path) + if (lt.buf) ggml_backend_buffer_free(lt.buf); + if (lt.ctx) ggml_free(lt.ctx); + ggml_opt_dataset_free(dataset); + llama_backend_free(); + + return 0; +} diff --git a/examples/qlora_training/grpo_example.py b/examples/qlora_training/grpo_example.py new file mode 100644 index 0000000000..9c7f2e3c26 --- /dev/null +++ b/examples/qlora_training/grpo_example.py @@ -0,0 +1,397 @@ +#!/usr/bin/env python3 +""" +grpo_example.py — Minimal GRPO training loop using llama-finetune-qlora --grpo-mode + +Demonstrates the IPC protocol between the Python driver and the C++ subprocess. +No external dependencies required — only Python stdlib. 
+ +Usage: + python3 grpo_example.py \ + --model /path/to/model-q4_k_m.gguf \ + --lora-out /path/to/output-adapter.gguf \ + [--lora /path/to/resume-adapter.gguf] \ + [--binary /path/to/llama-finetune-qlora] \ + [--n-steps 200] \ + [--n-gen 8] \ + [--rank 16] + +IPC Protocol (stdout from C++ process): + [QLORA:READY] — process initialised + [QLORA:PROMPT_REQ:] — C++ requests a prompt for step N + [QLORA:GEN:/] — one generation (newlines escaped as \\n) + [QLORA:REWARD_REQ:] — C++ requests N reward scores + [QLORA:PROGRESS] step=X/Y loss=Z epoch=A/B + [QLORA:CHECKPOINT] + [QLORA:DONE] final_loss=X + [QLORA:ERROR] + +Python → C++ stdin: + PROMPT + REWARD ... (advantages, 0..1 range) + STOP (request graceful shutdown) +""" + +import argparse +import logging +import math +import os +import re +import subprocess +import sys +import time +from pathlib import Path +from typing import List, Optional, Tuple + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s [%(levelname)s] %(message)s", +) +log = logging.getLogger("grpo_example") + +# ────────────────────────────────────────────────────────────────────────────── +# IPC helpers +# ────────────────────────────────────────────────────────────────────────────── + +_IPC_RE = re.compile(r"^\[QLORA:([A-Z_]+)(?::([^\]]*))?\](.*)$") + + +def escape(text: str) -> str: + """Escape newlines and backslashes for single-line IPC transport.""" + return text.replace("\\", "\\\\").replace("\n", "\\n").replace("\r", "\\r") + + +def unescape(text: str) -> str: + """Reverse of escape().""" + out, i = [], 0 + while i < len(text): + if text[i] == "\\" and i + 1 < len(text): + c = text[i + 1] + if c == "n": + out.append("\n") + elif c == "r": + out.append("\r") + elif c == "\\": + out.append("\\") + else: + out.append(c) + i += 2 + else: + out.append(text[i]) + i += 1 + return "".join(out) + + +def parse_ipc(line: str) -> Optional[Tuple[str, str, str]]: + """ + Parse an IPC line into (msg_type, seq, payload). 
+ Returns None for non-IPC lines (model output, log lines, etc.). + """ + m = _IPC_RE.match(line.strip()) + if not m: + return None + return m.group(1), (m.group(2) or ""), m.group(3).strip() + + +def read_ipc(proc: subprocess.Popen, timeout: float = 120.0) -> Optional[Tuple[str, str, str]]: + """ + Read lines from proc.stdout until an IPC message arrives. + Non-IPC lines (model output, C++ logs leaked to stdout) are printed. + Returns None on EOF. + Raises TimeoutError if nothing arrives within `timeout` seconds. + """ + deadline = time.monotonic() + timeout + while True: + remaining = deadline - time.monotonic() + if remaining <= 0: + raise TimeoutError(f"No IPC message within {timeout:.0f}s") + + line = proc.stdout.readline() + if not line: + return None # EOF + + line = line.rstrip("\n") + parsed = parse_ipc(line) + if parsed: + return parsed + # Non-IPC — C++ sometimes leaks timing/debug lines to stdout. + # Print them so the user can see what's happening. + print(f" [cpp] {line}", file=sys.stderr) + + +def write_cmd(proc: subprocess.Popen, cmd: str): + """Write one command line to the subprocess stdin.""" + try: + proc.stdin.write(cmd + "\n") + proc.stdin.flush() + except BrokenPipeError: + raise RuntimeError("C++ subprocess stdin closed — did it crash?") + + +def wait_for(proc: subprocess.Popen, expected: str, timeout: float = 120.0) -> Tuple[str, str, str]: + """Block until the expected IPC message type arrives.""" + deadline = time.monotonic() + timeout + while True: + remaining = deadline - time.monotonic() + if remaining <= 0: + raise TimeoutError(f"Timed out waiting for [{expected}]") + parsed = read_ipc(proc, timeout=remaining) + if parsed is None: + raise RuntimeError(f"Subprocess exited before sending [{expected}]") + msg_type, seq, payload = parsed + if msg_type == expected: + return msg_type, seq, payload + log.debug("Ignoring unexpected IPC (%s) while waiting for %s", msg_type, expected) + + +# 
────────────────────────────────────────────────────────────────────────────── +# Advantage normalisation (GRPO) +# ────────────────────────────────────────────────────────────────────────────── + +def normalise_rewards(rewards: List[float]) -> List[float]: + """ + Group-relative advantage normalisation: subtract mean, divide by std. + Clipped to [0, 1] so the C++ side always receives values in that range. + + All-equal rewards → uniform 0.5 (no signal, but no NaN either). + """ + if len(rewards) == 0: + return [] + mean = sum(rewards) / len(rewards) + variance = sum((r - mean) ** 2 for r in rewards) / len(rewards) + std = math.sqrt(variance) if variance > 1e-8 else 1.0 + + normalised = [(r - mean) / std for r in rewards] + # Shift to [0,1]: z-scores typically lie in [-3, +3] + clipped = [max(0.0, min(1.0, 0.5 + z / 6.0)) for z in normalised] + return clipped + + +# ────────────────────────────────────────────────────────────────────────────── +# Example prompt / reward providers +# ────────────────────────────────────────────────────────────────────────────── + +# Replace these with your own logic. + +_EXAMPLE_PROMPTS = [ + "Explain the concept of gradient descent in one sentence.", + "What is the capital of France?", + "Write a haiku about machine learning.", + "Describe the difference between SFT and RLHF.", + "What does GRPO stand for?", +] + + +def get_prompt(step: int) -> str: + """Return a prompt for the given training step (0-indexed).""" + return _EXAMPLE_PROMPTS[step % len(_EXAMPLE_PROMPTS)] + + +def score_generations(prompt: str, generations: List[str]) -> List[float]: + """ + Score a list of model generations for the given prompt. + Returns a list of raw reward scores (any numeric range; will be normalised). + + This example uses a trivial heuristic: longer, more varied responses + score higher. Replace with your actual reward model / verifier. 
+ """ + scores = [] + for gen in generations: + words = gen.split() + # Simple heuristics: length + lexical diversity + length_score = min(1.0, len(words) / 50.0) + vocab_score = min(1.0, len(set(words)) / max(1, len(words))) + scores.append(0.6 * length_score + 0.4 * vocab_score) + return scores + + +# ────────────────────────────────────────────────────────────────────────────── +# Main GRPO loop +# ────────────────────────────────────────────────────────────────────────────── + +def run_grpo(args: argparse.Namespace): + # Resolve binary + binary = Path(args.binary) + if not binary.exists(): + log.error("Binary not found: %s", binary) + sys.exit(1) + + # Build command + cmd = [ + str(binary), + "--model", args.model, + "--lora-out", args.lora_out, + "--lora-rank", str(args.rank), + "--lora-alpha", str(args.rank // 2), + "-c", str(args.ctx_size), + "-b", str(args.ctx_size), + "-ub", "512", + "-ngl", str(args.ngl), + "-lr", str(args.lr), + "--seed", str(args.seed), + "--grad-checkpoint","48", + "--shuffle-dataset", + "--grpo-mode", + "--n-gen", str(args.n_gen), + "--n-steps", str(args.n_steps), + "--grpo-temp", str(args.temperature), + "--grpo-max-tokens",str(args.max_tokens), + ] + + if args.lora: + cmd += ["--lora", args.lora] + + if args.save_every > 0: + cmd += ["--save-every", str(args.save_every)] + + log.info("Launching: %s", " ".join(cmd)) + + proc = subprocess.Popen( + cmd, + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=sys.stderr, # C++ debug/timing logs go directly to our stderr + text=True, + bufsize=1, + ) + + try: + _grpo_loop(proc, args) + except KeyboardInterrupt: + log.info("Interrupted — requesting graceful stop") + try: + write_cmd(proc, "STOP") + except Exception: + pass + except Exception as e: + log.error("GRPO loop error: %s", e) + proc.kill() + raise + finally: + try: + proc.stdin.close() + except Exception: + pass + rc = proc.wait(timeout=30) + if rc not in (0, None): + log.warning("Subprocess exited with code %d", rc) + + +def 
_grpo_loop(proc: subprocess.Popen, args: argparse.Namespace): + # ── Wait for READY ────────────────────────────────────────────────────── + log.info("Waiting for subprocess to initialise (model load can take a minute)…") + wait_for(proc, "READY", timeout=300) + log.info("Subprocess ready.") + + current_prompt: str = "" + generations: List[str] = [] + step = 0 + + while True: + parsed = read_ipc(proc, timeout=600) + if parsed is None: + log.info("Subprocess exited (EOF).") + break + + msg_type, seq, payload = parsed + + # ── PROMPT_REQ ────────────────────────────────────────────────────── + if msg_type == "PROMPT_REQ": + step = int(seq) if seq else step + 1 + current_prompt = get_prompt(step - 1) + generations = [] + log.debug("Step %d — sending prompt: %s", step, current_prompt[:60]) + write_cmd(proc, f"PROMPT {escape(current_prompt)}") + + # ── GEN ───────────────────────────────────────────────────────────── + elif msg_type == "GEN": + # seq = "k/n" + parts = seq.split("/") + k = int(parts[0]) + n = int(parts[1]) if len(parts) > 1 else args.n_gen + text = unescape(payload) + generations.append(text) + log.debug(" Generation %d/%d: %s…", k, n, text[:60].replace("\n", "↵")) + + # ── REWARD_REQ ────────────────────────────────────────────────────── + elif msg_type == "REWARD_REQ": + n_expected = int(seq) if seq else len(generations) + if len(generations) != n_expected: + log.warning( + "REWARD_REQ asked for %d rewards but collected %d generations", + n_expected, len(generations), + ) + + raw_rewards = score_generations(current_prompt, generations) + advantages = normalise_rewards(raw_rewards) + + reward_str = " ".join(f"{a:.6f}" for a in advantages) + log.debug(" Rewards (raw): %s", [f"{r:.3f}" for r in raw_rewards]) + log.debug(" Advantages: %s", [f"{a:.3f}" for a in advantages]) + write_cmd(proc, f"REWARD {reward_str}") + + # ── PROGRESS ──────────────────────────────────────────────────────── + elif msg_type == "PROGRESS": + # Format: step=X/Y loss=Z epoch=A/B 
+ sm = re.search(r"step=(\d+)(?:/(\d+))?", payload) + lm = re.search(r"loss=([\d.]+)", payload) + step_str = f"{sm.group(1)}/{sm.group(2)}" if sm and sm.group(2) else (sm.group(1) if sm else "?") + loss_str = lm.group(1) if lm else "?" + print(f" step {step_str} loss {loss_str}", flush=True) + + # ── CHECKPOINT ────────────────────────────────────────────────────── + elif msg_type == "CHECKPOINT": + log.info("Checkpoint saved: %s", payload.strip()) + + # ── DONE ──────────────────────────────────────────────────────────── + elif msg_type == "DONE": + m = re.search(r"final_loss=([\d.]+)", payload) + loss = m.group(1) if m else "?" + log.info("Training complete. final_loss=%s", loss) + break + + # ── ERROR ──────────────────────────────────────────────────────────── + elif msg_type == "ERROR": + log.error("C++ process error: %s", payload.strip()) + raise RuntimeError(f"Training failed: {payload.strip()}") + + else: + log.debug("Unknown IPC message: [%s] seq=%r payload=%r", msg_type, seq, payload) + + +# ────────────────────────────────────────────────────────────────────────────── +# CLI +# ────────────────────────────────────────────────────────────────────────────── + +def parse_args() -> argparse.Namespace: + # Default binary: build/bin/ relative to this script's repo root + script_dir = Path(__file__).resolve().parent + repo_root = script_dir.parents[1] # examples/qlora_training → llama.cpp root + default_bin = repo_root / "build" / "bin" / "llama-finetune-qlora" + + p = argparse.ArgumentParser( + description="Minimal GRPO training loop via llama-finetune-qlora --grpo-mode", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + p.add_argument("--model", required=True, help="Base GGUF model path") + p.add_argument("--lora-out", required=True, help="Output adapter GGUF path") + p.add_argument("--lora", default=None, help="Resume from existing adapter GGUF") + p.add_argument("--binary", default=str(default_bin), help="Path to llama-finetune-qlora binary") 
+ p.add_argument("--rank", type=int, default=16, help="LoRA rank") + p.add_argument("--n-steps", type=int, default=200, help="Number of GRPO steps") + p.add_argument("--n-gen", type=int, default=8, help="Generations per prompt") + p.add_argument("--lr", type=float, default=1e-4, help="Learning rate") + p.add_argument("--ctx-size", type=int, default=4096, help="Context window") + p.add_argument("--ngl", type=int, default=999, help="GPU layers (-ngl)") + p.add_argument("--temperature", type=float, default=0.8, help="Sampling temperature") + p.add_argument("--max-tokens", type=int, default=512, help="Max tokens per generation") + p.add_argument("--save-every", type=int, default=0, help="Save checkpoint every N steps (0=off)") + p.add_argument("--seed", type=int, default=42, help="RNG seed") + p.add_argument("--verbose", action="store_true", help="Enable DEBUG logging") + return p.parse_args() + + +if __name__ == "__main__": + args = parse_args() + if args.verbose: + logging.getLogger().setLevel(logging.DEBUG) + run_grpo(args) diff --git a/examples/qlora_training/sample_data.jsonl b/examples/qlora_training/sample_data.jsonl new file mode 100644 index 0000000000..982168cfc1 --- /dev/null +++ b/examples/qlora_training/sample_data.jsonl @@ -0,0 +1,7 @@ +{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "What is the capital of France?"}, {"role": "assistant", "content": "The capital of France is Paris."}]} +{"messages": [{"role": "user", "content": "Explain gradient descent in one sentence."}, {"role": "assistant", "content": "Gradient descent iteratively adjusts model parameters in the direction that most reduces the loss function."}]} +{"messages": [{"role": "system", "content": "You are a concise coding assistant."}, {"role": "user", "content": "Write a Python function that returns the factorial of n."}, {"role": "assistant", "content": "```python\ndef factorial(n):\n return 1 if n <= 1 else n * factorial(n - 
1)\n```"}]} +{"prompt": "What is supervised fine-tuning?", "response": "Supervised fine-tuning (SFT) trains a pre-trained model on labelled examples of the desired behaviour, adjusting weights to minimise cross-entropy loss on the target responses."} +{"prompt": "What is LoRA?", "response": "LoRA (Low-Rank Adaptation) fine-tunes large models by injecting trainable low-rank matrices into frozen weight layers, dramatically reducing the number of trainable parameters."} +{"text": "The transformer architecture, introduced in 'Attention Is All You Need' (Vaswani et al., 2017), relies entirely on self-attention mechanisms and has become the foundation of modern large language models."} +{"text": "QLoRA combines quantization with LoRA: the base model is loaded in 4-bit precision (frozen), while only the small LoRA adapter weights are trained in full precision."} diff --git a/examples/qlora_training/sample_rwsft_data.jsonl b/examples/qlora_training/sample_rwsft_data.jsonl new file mode 100644 index 0000000000..3bf0234451 --- /dev/null +++ b/examples/qlora_training/sample_rwsft_data.jsonl @@ -0,0 +1,6 @@ +{"messages": [{"role": "user", "content": "Explain what a neural network is."}, {"role": "assistant", "content": "A neural network is a computational model loosely inspired by biological neurons, consisting of layers of interconnected nodes that learn to map inputs to outputs through training."}], "reward": 0.9} +{"messages": [{"role": "user", "content": "Explain what a neural network is."}, {"role": "assistant", "content": "It's a thing that does AI stuff with nodes and layers."}], "reward": 0.2} +{"messages": [{"role": "user", "content": "What is backpropagation?"}, {"role": "assistant", "content": "Backpropagation computes gradients of the loss with respect to each parameter by applying the chain rule backwards through the computation graph, enabling gradient descent to update all weights efficiently."}], "reward": 1.0} +{"messages": [{"role": "user", "content": "What 
is backpropagation?"}, {"role": "assistant", "content": "Backpropagation is how neural networks learn by going backwards."}], "reward": 0.3} +{"prompt": "Describe overfitting in machine learning.", "response": "Overfitting occurs when a model learns the training data too closely—capturing noise and idiosyncrasies—resulting in poor generalisation to unseen examples. Regularisation, dropout, and early stopping are common mitigations.", "reward": 0.95} +{"prompt": "Describe overfitting in machine learning.", "response": "Overfitting is when the model is too good at training data but bad at new data.", "reward": 0.4} diff --git a/ggml/include/ggml-opt.h b/ggml/include/ggml-opt.h index 1c2ed79b77..60774575f0 100644 --- a/ggml/include/ggml-opt.h +++ b/ggml/include/ggml-opt.h @@ -89,6 +89,7 @@ extern "C" { float beta2; // second AdamW momentum float eps; // epsilon for numerical stability float wd; // weight decay - 0.0f to disable + float gclip; // element-wise gradient clipping threshold - 0.0f to disable } adamw; struct { float alpha; // learning rate diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 566e271479..92a4ae6352 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -501,6 +501,7 @@ extern "C" { GGML_OP_MUL_MAT, GGML_OP_MUL_MAT_ID, GGML_OP_OUT_PROD, + GGML_OP_OUT_PROD_ID, // scattered outer-product for MUL_MAT_ID backward (MoE LoRA) GGML_OP_SCALE, GGML_OP_SET, @@ -1424,6 +1425,21 @@ extern "C" { struct ggml_tensor * a, struct ggml_tensor * b); + // Scattered outer-product for the MUL_MAT_ID backward pass (MoE LoRA gradient). 
+ // + // a: [cols, n_expert_used, n_tokens] F32 — activations + // b: [rows, n_expert_used, n_tokens] F32 — upstream gradient + // ids: [n_expert_used, n_tokens] I32 — expert dispatch indices + // result: [cols, rows, n_expert, 1] F32 + // + // result[:, :, e] += sum_{(i,t): ids[i,t]==e} a[:, i, t] ⊗ b[:, i, t] + GGML_API struct ggml_tensor * ggml_out_prod_id( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * ids, + int64_t n_expert); + // // operations on tensors without backpropagation // diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp index 331e071a26..adecf1427a 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp @@ -11034,7 +11034,7 @@ static void ggml_compute_forward_opt_step_adamw_f32( GGML_ASSERT(ggml_are_same_shape(src0, src0_grad)); GGML_ASSERT(ggml_are_same_shape(src0, src0_grad_m)); GGML_ASSERT(ggml_are_same_shape(src0, src0_grad_v)); - GGML_ASSERT(ggml_nelements(adamw_params) == 7); + GGML_ASSERT(ggml_nelements(adamw_params) == 8); const int ith = params->ith; const int nth = params->nth; @@ -11060,6 +11060,7 @@ static void ggml_compute_forward_opt_step_adamw_f32( const float wd = adamw_params_ptr[4]; const float beta1h = adamw_params_ptr[5]; const float beta2h = adamw_params_ptr[6]; + const float gclip = adamw_params_ptr[7]; // element-wise gradient clip (0 = disabled) const float keep = 1.f - alpha * wd; for (int ir = ir0; ir < ir1; ++ir) { const int64_t i03 = ir/(ne02*ne01); @@ -11074,8 +11075,10 @@ static void ggml_compute_forward_opt_step_adamw_f32( float * v = (float *) ((char *) src0_grad_v->data + offset); for (int i00 = 0; i00 < ne00; ++i00) { - m[i00] = m[i00]*beta1 + g[i00]*(1.0f - beta1); - v[i00] = v[i00]*beta2 + g[i00]*g[i00]*(1.0f - beta2); + const float gi = (gclip > 0.0f) ? 
fmaxf(-gclip, fminf(gclip, g[i00])) : g[i00]; + + m[i00] = m[i00]*beta1 + gi*(1.0f - beta1); + v[i00] = v[i00]*beta2 + gi*gi*(1.0f - beta2); const float mh = m[i00]*beta1h; const float vh = sqrtf(v[i00]*beta2h) + eps; diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index cda275b8c5..3e1cca6c98 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -2642,6 +2642,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg case GGML_OP_OUT_PROD: ggml_cuda_out_prod(ctx, dst); break; + case GGML_OP_OUT_PROD_ID: + ggml_cuda_out_prod_id(ctx, dst); + break; case GGML_OP_SCALE: ggml_cuda_op_scale(ctx, dst); break; @@ -4772,6 +4775,12 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g } break; case GGML_OP_OUT_PROD: return op->type == GGML_TYPE_F32 && op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32; + case GGML_OP_OUT_PROD_ID: + return op->src[0] != nullptr && op->src[1] != nullptr && op->src[2] != nullptr + && op->type == GGML_TYPE_F32 + && op->src[0]->type == GGML_TYPE_F32 + && op->src[1]->type == GGML_TYPE_F32 + && op->src[2]->type == GGML_TYPE_I32; case GGML_OP_GET_ROWS: { switch (op->src[0]->type) { diff --git a/ggml/src/ggml-cuda/opt-step-adamw.cu b/ggml/src/ggml-cuda/opt-step-adamw.cu index 35154f2996..dd2c1edf3a 100644 --- a/ggml/src/ggml-cuda/opt-step-adamw.cu +++ b/ggml/src/ggml-cuda/opt-step-adamw.cu @@ -20,8 +20,9 @@ static __global__ void opt_step_adamw_f32( const float wd = pars[4]; const float beta1h = pars[5]; const float beta2h = pars[6]; + const float gclip = pars[7]; // element-wise gradient clip (0 = disabled) - const float gi = g[i]; + const float gi = (gclip > 0.0f) ? 
fmaxf(-gclip, fminf(gclip, g[i])) : g[i];
 const float gmi = g_m[i]*beta1 + gi*(1.0f - beta1);
 const float gvi = g_v[i]*beta2 + gi*gi*(1.0f - beta2);
@@ -62,7 +63,7 @@ void ggml_cuda_opt_step_adamw(ggml_backend_cuda_context & ctx, ggml_tensor * dst
 GGML_ASSERT(ggml_are_same_shape(src0, src0_grad));
 GGML_ASSERT(ggml_are_same_shape(src0, src0_grad_m));
 GGML_ASSERT(ggml_are_same_shape(src0, src0_grad_v));
- GGML_ASSERT(ggml_nelements(adamw_params) == 7);
+ GGML_ASSERT(ggml_nelements(adamw_params) == 8);
 float * src0_d = (float *) src0->data;
 const float * src0_grad_d = (const float *) src0_grad->data;
diff --git a/ggml/src/ggml-cuda/out-prod.cu b/ggml/src/ggml-cuda/out-prod.cu
index c9b2b699c6..9afc323bd9 100644
--- a/ggml/src/ggml-cuda/out-prod.cu
+++ b/ggml/src/ggml-cuda/out-prod.cu
@@ -1,6 +1,8 @@
#include "out-prod.cuh"
#include <cstdint>
+#include <cstring>
+#include <vector>
void ggml_cuda_out_prod(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
 const ggml_tensor * src0 = dst->src[0];
@@ -66,3 +68,115 @@ void ggml_cuda_out_prod(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
 }
 }
}
+
+// ggml_cuda_out_prod_id
+//
+// Scattered outer-product for the MUL_MAT_ID backward pass (gradient w.r.t. expert weights).
+//
+// src0 = a [cols, n_expert_used, n_tokens] F32 — token activations
+// src1 = b [rows, n_expert_used, n_tokens] F32 — upstream gradient
+// src2 = ids [n_expert_used, n_tokens] I32 — expert dispatch indices
+// dst [cols, rows, n_expert, 1] F32 — gradient w.r.t. expert weight matrices
+//
+// dst[:, :, e] += sum_{(i,t): ids[i,t]==e} a[:, i, t] ⊗ b[:, i, t]
+//
+// Algorithm:
+// For each expert e: gather the token columns where ids[i,t]==e into contiguous
+// GPU buffers, then use cublasSgemm (beta=1) to accumulate the outer product.
+// ids may be CPU-resident (common in backward graphs where they are leaf tensors).
+void ggml_cuda_out_prod_id(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+ const ggml_tensor * src0 = dst->src[0]; // a [cols, n_exp_used, n_tokens]
+ const ggml_tensor * src1 = dst->src[1]; // b [rows, n_exp_used, n_tokens]
+ const ggml_tensor * ids = dst->src[2]; // ids [n_exp_used, n_tokens] i32
+
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
+ GGML_ASSERT(src1->type == GGML_TYPE_F32);
+ GGML_ASSERT(ids->type == GGML_TYPE_I32);
+ GGML_ASSERT(dst->type == GGML_TYPE_F32);
+
+ const int64_t cols = src0->ne[0];
+ const int64_t n_exp_used = src0->ne[1];
+ const int64_t n_tokens = src0->ne[2];
+ const int64_t rows = src1->ne[0];
+ const int64_t n_expert = dst->ne[2];
+
+ cudaStream_t stream = ctx.stream();
+ cublasHandle_t handle = ctx.cublas_handle();
+ CUBLAS_CHECK(cublasSetStream(handle, stream));
+
+ // Zero destination tensor before accumulating
+ CUDA_CHECK(cudaMemsetAsync(dst->data, 0, ggml_nbytes(dst), stream));
+
+ // Read ids to host — ids may be CPU-resident (backward graph leaf) or GPU-resident
+ const size_t ids_nbytes = ggml_nbytes(ids);
+ std::vector<char> ids_host(ids_nbytes);
+ if (ids->buffer && !ggml_backend_buffer_is_host(ids->buffer)) {
+ // GPU-resident: copy to host and synchronize so we can inspect the values
+ CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids->data, ids_nbytes, cudaMemcpyDeviceToHost, stream));
+ CUDA_CHECK(cudaStreamSynchronize(stream));
+ } else {
+ memcpy(ids_host.data(), ids->data, ids_nbytes);
+ }
+
+ // Build per-expert token list: expert_tokens[e] = list of flat indices (iexp*n_tokens+itok)
+ // whose dispatch id equals e.
+ std::vector<std::vector<int64_t>> expert_tokens(n_expert);
+ for (int64_t itok = 0; itok < n_tokens; ++itok) {
+ for (int64_t iexp = 0; iexp < n_exp_used; ++iexp) {
+ const int32_t eid = *(const int32_t *)(ids_host.data()
+ + itok * ids->nb[1] + iexp * ids->nb[0]);
+ GGML_ASSERT(eid >= 0 && eid < (int32_t)n_expert);
+ expert_tokens[eid].push_back(iexp * n_tokens + itok);
+ }
+ }
+
+ // Strides (in elements, not bytes)
+ const int64_t a_stride_exp = src0->nb[1] / sizeof(float); // cols
+ const int64_t a_stride_tok = src0->nb[2] / sizeof(float); // cols * n_exp_used
+ const int64_t b_stride_exp = src1->nb[1] / sizeof(float); // rows
+ const int64_t b_stride_tok = src1->nb[2] / sizeof(float); // rows * n_exp_used
+ const int64_t dst_stride_e = dst->nb[2] / sizeof(float); // cols * rows
+
+ const float alpha_one = 1.0f;
+ const float beta_acc = 1.0f; // accumulate — dst is already zeroed above
+
+ const float * a_base = (const float *) src0->data;
+ const float * b_base = (const float *) src1->data;
+ float * d_base = (float *) dst->data;
+
+ for (int64_t e = 0; e < n_expert; ++e) {
+ const auto & toks = expert_tokens[e];
+ if (toks.empty()) {
+ continue;
+ }
+
+ const int64_t ntoks_e = (int64_t) toks.size();
+
+ // Allocate contiguous gather buffers on GPU: a_e [cols, ntoks_e], b_e [rows, ntoks_e]
+ ggml_cuda_pool_alloc<float> a_gathered(ctx.pool(), cols * ntoks_e);
+ ggml_cuda_pool_alloc<float> b_gathered(ctx.pool(), rows * ntoks_e);
+
+ // Gather token vectors from GPU src0/src1 into contiguous buffers
+ for (int64_t ti = 0; ti < ntoks_e; ++ti) {
+ const int64_t flat = toks[ti];
+ const int64_t iexp = flat / n_tokens;
+ const int64_t itok = flat % n_tokens;
+ CUDA_CHECK(cudaMemcpyAsync(
+ a_gathered.ptr + ti * cols,
+ a_base + iexp * a_stride_exp + itok * a_stride_tok,
+ cols * sizeof(float), cudaMemcpyDeviceToDevice, stream));
+ CUDA_CHECK(cudaMemcpyAsync(
+ b_gathered.ptr + ti * rows,
+ b_base + iexp * b_stride_exp + itok * b_stride_tok,
+ rows * sizeof(float), cudaMemcpyDeviceToDevice, stream));
+ } + + // dst[:, :, e] += a_gathered @ b_gathered^T + // cuBLAS column-major: A=[cols, ntoks_e] lda=cols, B=[rows, ntoks_e] ldb=rows + CUBLAS_CHECK(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_T, + (int)cols, (int)rows, (int)ntoks_e, + &alpha_one, a_gathered.ptr, (int)cols, + b_gathered.ptr, (int)rows, + &beta_acc, d_base + e*dst_stride_e, (int)cols)); + } +} diff --git a/ggml/src/ggml-cuda/out-prod.cuh b/ggml/src/ggml-cuda/out-prod.cuh index a0046f5f8f..993307e797 100644 --- a/ggml/src/ggml-cuda/out-prod.cuh +++ b/ggml/src/ggml-cuda/out-prod.cuh @@ -1,3 +1,10 @@ #include "common.cuh" void ggml_cuda_out_prod(ggml_backend_cuda_context & ctx, ggml_tensor * dst); + +// Scattered outer-product for MUL_MAT_ID backward (gradient w.r.t. expert weight matrices). +// src0: activations [cols, n_expert_used, n_tokens] F32 +// src1: grad_output [rows, n_expert_used, n_tokens] F32 +// src2: expert ids [n_expert_used, n_tokens] I32 (may be CPU-resident) +// dst: grad_weight [cols, rows, n_expert, 1] F32 +void ggml_cuda_out_prod_id(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/ggml/src/ggml-opt.cpp b/ggml/src/ggml-opt.cpp index e078ad14a3..e87fc79c25 100644 --- a/ggml/src/ggml-opt.cpp +++ b/ggml/src/ggml-opt.cpp @@ -230,6 +230,7 @@ struct ggml_opt_optimizer_params ggml_opt_get_default_optimizer_params(void * us result.adamw.beta2 = 0.999f; result.adamw.eps = 1e-8f; result.adamw.wd = 0.0f; + result.adamw.gclip = 0.0f; result.sgd.alpha = 1e-3f; result.sgd.wd = 0.0f; @@ -503,7 +504,7 @@ static void ggml_opt_build(ggml_opt_context_t opt_ctx) { // gb_opt == graph backward optimize, forward pass, then backward pass to calculate gradients, then optimizer step. opt_ctx->gb_opt = ggml_graph_dup(opt_ctx->ctx_compute, opt_ctx->gb_grad, /*force_grads =*/ true); - opt_ctx->opt_step_params = ggml_new_tensor_1d(opt_ctx->ctx_cpu, GGML_TYPE_F32, need_momenta ? 7 : 2); + opt_ctx->opt_step_params = ggml_new_tensor_1d(opt_ctx->ctx_cpu, GGML_TYPE_F32, need_momenta ? 
8 : 2); ggml_tensor * adamw_params = opt_ctx->opt_step_params; ggml_set_input(adamw_params); const char * optimizer_name = ggml_opt_optimizer_name(opt_ctx->optimizer); @@ -726,6 +727,17 @@ void ggml_opt_alloc(ggml_opt_context_t opt_ctx, bool backward) { if (opt_ctx->build_type == GGML_OPT_BUILD_TYPE_OPT && opt_ctx->opt_period > 1 && opt_ctx->opt_i == 0) { ggml_graph_reset(opt_ctx->gb_grad); } + + // For non-static graphs the compute graph is rebuilt every call, so ggml_graph_reset + // is not called and grad_accs may carry over values from the previous accumulation window. + // Explicitly zero them at the start of each gradient-accumulation cycle. + if (!opt_ctx->static_graphs && backward && opt_ctx->opt_i == 0) { + for (struct ggml_tensor * ga : opt_ctx->grad_accs) { + if (ga) { + ggml_set_zero(ga); + } + } + } if (backward) { const int32_t opt_i_next = (opt_ctx->opt_i + 1) % opt_ctx->opt_period; opt_ctx->build_type = opt_i_next == 0 ? GGML_OPT_BUILD_TYPE_OPT : GGML_OPT_BUILD_TYPE_GRAD; @@ -793,6 +805,7 @@ void ggml_opt_eval(ggml_opt_context_t opt_ctx, ggml_opt_result_t result) { GGML_ASSERT(opt_pars.adamw.eps >= 0.0f); GGML_ASSERT(opt_pars.adamw.wd >= 0.0f); GGML_ASSERT(opt_pars.adamw.wd <= 1.0f); + GGML_ASSERT(opt_pars.adamw.gclip >= 0.0f); // beta1, beta2 after applying warmup const float beta1h = 1.0f / (1.0f - powf(opt_pars.adamw.beta1, opt_ctx->iter)); @@ -806,6 +819,7 @@ void ggml_opt_eval(ggml_opt_context_t opt_ctx, ggml_opt_result_t result) { adamw_par_data[4] = opt_pars.adamw.wd; adamw_par_data[5] = beta1h; adamw_par_data[6] = beta2h; + adamw_par_data[7] = opt_pars.adamw.gclip; } break; case GGML_OPT_OPTIMIZER_TYPE_SGD: { GGML_ASSERT(opt_pars.sgd.alpha > 0.0f); diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index aeafc395d7..1e04911360 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -976,6 +976,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "MUL_MAT", "MUL_MAT_ID", "OUT_PROD", + "OUT_PROD_ID", "SCALE", "SET", @@ -1049,7 +1050,7 @@ static 
const char * GGML_OP_NAME[GGML_OP_COUNT] = { "GLU", }; -static_assert(GGML_OP_COUNT == 96, "GGML_OP_COUNT != 96"); +static_assert(GGML_OP_COUNT == 97, "GGML_OP_COUNT != 97"); static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "none", @@ -1086,6 +1087,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "X*Y", "X[i]*Y", "X*Y", + "X_id⊗Y_id", "x*v", "y-\\>view(x)", @@ -1159,7 +1161,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "glu(x)", }; -static_assert(GGML_OP_COUNT == 96, "GGML_OP_COUNT != 96"); +static_assert(GGML_OP_COUNT == 97, "GGML_OP_COUNT != 97"); static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2"); @@ -3293,6 +3295,44 @@ struct ggml_tensor * ggml_out_prod( return result; } +// ggml_out_prod_id +// +// Scattered outer-product for the MUL_MAT_ID backward pass. +// +// a: [cols, n_expert_used, n_tokens] F32 — activations (src1 of MUL_MAT_ID) +// b: [rows, n_expert_used, n_tokens] F32 — upstream gradient +// ids: [n_expert_used, n_tokens] I32 — expert dispatch indices (src2 of MUL_MAT_ID) +// result: [cols, rows, n_expert, 1] F32 +// +// result[:, :, e] += sum_{(i,t): ids[i,t]==e} a[:, i, t] ⊗ b[:, i, t] +// +// Computes the gradient w.r.t. the expert weight matrices (src0) of MUL_MAT_ID. 
+struct ggml_tensor * ggml_out_prod_id( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * ids, + int64_t n_expert) { + GGML_ASSERT(a->type == GGML_TYPE_F32); + GGML_ASSERT(b->type == GGML_TYPE_F32); + GGML_ASSERT(ids->type == GGML_TYPE_I32); + GGML_ASSERT(a->ne[1] == b->ne[1]); // n_expert_used matches + GGML_ASSERT(a->ne[2] == b->ne[2]); // n_tokens matches + GGML_ASSERT(ids->ne[0] == a->ne[1]); // n_expert_used matches ids + GGML_ASSERT(ids->ne[1] == a->ne[2]); // n_tokens matches ids + GGML_ASSERT(n_expert > 0); + + const int64_t ne[4] = { a->ne[0], b->ne[0], n_expert, 1 }; + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); + + result->op = GGML_OP_OUT_PROD_ID; + result->src[0] = a; + result->src[1] = b; + result->src[2] = ids; + + return result; +} + // ggml_scale static struct ggml_tensor * ggml_scale_impl( @@ -6518,6 +6558,35 @@ static void ggml_compute_backward( grad))); // [m,p,qq,rr] } } break; + case GGML_OP_MUL_MAT_ID: { + // Backward pass for indirect matrix multiplication (MoE). + // + // Forward: dst[rows, n_exp_used, n_tokens] = as[:,:,ids[i,t]] @ b[:,i,t] + // src0 = as [cols, rows, n_expert] — expert weight matrices + // src1 = b [cols, n_exp_used, n_tokens] — token activations + // src2 = ids [n_exp_used, n_tokens] — expert dispatch indices (I32) + // + // Gradient w.r.t. src1 (activations): + // grad_b[:,i,t] = as[:,:,ids[i,t]]^T @ grad[:,i,t] + // → computed via MUL_MAT_ID with transposed as + // + // Gradient w.r.t. src0 (expert weights, only when F32 i.e. LoRA): + // grad_as[:,:,e] += sum_{(i,t): ids[i,t]==e} b[:,i,t] ⊗ grad[:,i,t] + // → computed via OUT_PROD_ID + // + // Quantized src0 is frozen (stop-gradient) — handled in grads_needed below. 
+ if (src0_needs_grads) { + const int64_t n_expert = src0->ne[2]; + struct ggml_tensor * grad_as = ggml_out_prod_id(ctx, src1, grad, src2, n_expert); + ggml_add_or_set(ctx, cgraph, isrc0, grad_as); + } + if (src1_needs_grads) { + // Transpose expert matrices: as [cols, rows, n_expert] → as_T [rows, cols, n_expert] + struct ggml_tensor * as_T = ggml_cont(ctx, ggml_permute(ctx, src0, 1, 0, 2, 3)); + struct ggml_tensor * grad_b = ggml_mul_mat_id(ctx, as_T, grad, src2); + ggml_add_or_set(ctx, cgraph, isrc1, grad_b); + } + } break; case GGML_OP_SCALE: { if (src0_needs_grads) { float s; @@ -6964,6 +7033,35 @@ void ggml_build_backward_expand( ignore_src[1] = true; break; + // MUL_MAT_ID: expert dispatch indices (src2) are integer — no gradient. + // When src0 is quantized the expert weights are frozen, so stop gradient through + // both src0 and src1 (activations have no path to loss without differentiable weights). + case GGML_OP_MUL_MAT_ID: + if (ggml_is_quantized(node->src[0]->type)) { + ignore_src[0] = true; + ignore_src[1] = true; + } + ignore_src[2] = true; // ids: integer tensor + break; + + // SET_ROWS is a KV-cache scatter write. The gradient of the written data flows + // through the attention read path (GET_ROWS backward), not through this node. + case GGML_OP_SET_ROWS: + ignore_src[0] = true; + ignore_src[1] = true; + break; + + // Ops with no backward implementation — stop gradient through all sources so the + // backward graph builder never tries to propagate through them. 
+ case GGML_OP_SSM_CONV: // Mamba causal conv1d + case GGML_OP_SSM_SCAN: // Mamba selective scan + case GGML_OP_FLASH_ATTN_EXT: // use standard attention for training + ignore_src[0] = true; + ignore_src[1] = true; + ignore_src[2] = true; + ignore_src[3] = true; + break; + default: break; } From 68d9f10057a47e5fc651784c36c0fcbbe5b26ec1 Mon Sep 17 00:00:00 2001 From: Salvatore Rossitto Date: Wed, 11 Mar 2026 14:55:38 +0100 Subject: [PATCH 02/15] added moe experts profiling and pruning --- tools/CMakeLists.txt | 1 + tools/expert-profile/CMakeLists.txt | 8 + tools/expert-profile/expert-profile.cpp | 506 +++++++++++++++++++++ tools/moe-pruning/README.md | 97 ++++ tools/moe-pruning/analyze_stats.py | 284 ++++++++++++ tools/moe-pruning/build_expert_profile.sh | 42 ++ tools/moe-pruning/extract_ppl.py | 41 ++ tools/moe-pruning/gguf_prune.py | 258 +++++++++++ tools/moe-pruning/nemotron_reap.py | 296 ++++++++++++ tools/moe-pruning/requirements.txt | 1 + tools/moe-pruning/sample_calibration.jsonl | 8 + 11 files changed, 1542 insertions(+) create mode 100644 tools/expert-profile/CMakeLists.txt create mode 100644 tools/expert-profile/expert-profile.cpp create mode 100644 tools/moe-pruning/README.md create mode 100644 tools/moe-pruning/analyze_stats.py create mode 100644 tools/moe-pruning/build_expert_profile.sh create mode 100644 tools/moe-pruning/extract_ppl.py create mode 100644 tools/moe-pruning/gguf_prune.py create mode 100644 tools/moe-pruning/nemotron_reap.py create mode 100644 tools/moe-pruning/requirements.txt create mode 100644 tools/moe-pruning/sample_calibration.jsonl diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt index b433c91d85..0bc2ad34c1 100644 --- a/tools/CMakeLists.txt +++ b/tools/CMakeLists.txt @@ -39,4 +39,5 @@ else() endif() add_subdirectory(fit-params) add_subdirectory(results) + add_subdirectory(expert-profile) endif() diff --git a/tools/expert-profile/CMakeLists.txt b/tools/expert-profile/CMakeLists.txt new file mode 100644 index 
0000000000..859bd77a53 --- /dev/null +++ b/tools/expert-profile/CMakeLists.txt @@ -0,0 +1,8 @@ +set(TARGET llama-expert-profile) +add_executable(${TARGET} expert-profile.cpp) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_17) + +if(LLAMA_TOOLS_INSTALL) + install(TARGETS ${TARGET} RUNTIME) +endif() diff --git a/tools/expert-profile/expert-profile.cpp b/tools/expert-profile/expert-profile.cpp new file mode 100644 index 0000000000..de381ff1f1 --- /dev/null +++ b/tools/expert-profile/expert-profile.cpp @@ -0,0 +1,506 @@ +/** + * expert-profile: NemotronH MoE expert activation profiler (REAP implementation) + * + * Implements the REAP (Router-weighted Expert Activation Pruning) saliency criterion: + * + * REAP(j) = mean over tokens routed to j of: gate_weight(j,t) * ||expert_output(j,t)||_2 + * + * where expert_output is ffn_moe_down (the FFN output BEFORE gate weighting), + * and gate_weight is ffn_moe_weights (post-softmax routing probability). 
+ *
+ * Intercepts three tensors per MoE layer via ggml eval callback:
+ *   ffn_moe_topk-{il}     [n_expert_used, n_tokens]          I32 — which experts were selected
+ *   ffn_moe_weights-{il}  [1, n_expert_used, n_tokens]       F32 — gate weights (softmax probs)
+ *   ffn_moe_down-{il}     [n_embd, n_expert_used, n_tokens]  F32 — expert outputs (pre-weighting)
+ *
+ * Reference: "REAP: Router-weighted Expert Activation Pruning" (arXiv:2510.13999)
+ *   score = mean_{x in X_j}[ g_j(x) * ||f_j(x)||_2 ]   (Equation 9)
+ *
+ * Usage:
+ *   llama-expert-profile \
+ *     -m model.gguf --jsonl training-data.jsonl --output expert_stats.json \
+ *     [--n-experts 128] [--ctx-size 16384] [-ngl 32] [-t 24] [--save-every 1]
+ */
+
+#include "arg.h"
+#include "common.h"
+#include "log.h"
+#include "llama.h"
+#include "ggml-backend.h"
+
+#include <algorithm>
+#include <cmath>
+#include <cstdint>
+#include <cstdio>
+#include <cstring>
+#include <fstream>
+#include <map>
+#include <mutex>
+#include <vector>
+
+// ─── Per-layer stats ──────────────────────────────────────────────────────────
+
+struct LayerStats {
+    int64_t n_experts    = 0;
+    int64_t total_tokens = 0; // tokens processed through this layer
+
+    // Frequency / weighted-frequency (kept for reference/comparison)
+    std::vector<int64_t> activation_counts; // [n_experts] — how many tokens routed here
+    std::vector<double>  weighted_freq_sum; // [n_experts] — sum of gate weights
+
+    // REAP: running sum and count for computing mean(gate_weight * ||expert_out||_2)
+    std::vector<double> reap_sum; // [n_experts] — sum of g_j(t)*||f_j(t)||_2
+    std::vector<double> ean_sum;  // [n_experts] — sum of ||f_j(t)||_2 (EAN, no gate)
+
+    void init(int64_t n) {
+        n_experts = n;
+        activation_counts.assign(n, 0);
+        weighted_freq_sum.assign(n, 0.0);
+        reap_sum.assign(n, 0.0);
+        ean_sum.assign(n, 0.0);
+    }
+
+    // Called once we have all three tensors for a batch.
+ // expert_ids: [n_expert_used * n_tokens] I32 — flat, column-major: [k + t*n_expert_used] + // gate_weights:[n_expert_used * n_tokens] F32 — same layout + // expert_outs: [n_embd * n_expert_used * n_tokens] F32 — layout: [e + k*n_embd + t*n_embd*n_expert_used] + // i.e. for token t, expert-slot k: out vector starts at t*n_embd*n_expert_used + k*n_embd + void add_batch(const int32_t * expert_ids, + const float * gate_weights, + const float * expert_outs, + int64_t n_expert_used, + int64_t n_tok, + int64_t n_embd) { + total_tokens += n_tok; + for (int64_t t = 0; t < n_tok; ++t) { + for (int64_t k = 0; k < n_expert_used; ++k) { + const int64_t flat = k + t * n_expert_used; + const int32_t eid = expert_ids[flat]; + if (eid < 0 || eid >= n_experts) continue; + + const float gw = gate_weights[flat]; + + // L2 norm of expert output vector for this (token, expert-slot) + const float * vec = expert_outs + t * n_embd * n_expert_used + k * n_embd; + double norm2 = 0.0; + for (int64_t d = 0; d < n_embd; ++d) { + norm2 += (double)vec[d] * (double)vec[d]; + } + const double norm = std::sqrt(norm2); + + activation_counts [eid] += 1; + weighted_freq_sum [eid] += gw; + reap_sum [eid] += gw * norm; // REAP numerator + ean_sum [eid] += norm; // EAN numerator + } + } + } +}; + +// ─── Collector ──────────────────────────────────────────────────────────────── + +struct ExpertCollector { + int64_t n_experts = 128; + + std::map layer_stats; + std::mutex mtx; + + // We need all three tensors before we can compute REAP. + // They arrive in order: topk → weights → down (per the graph build order). + // Store pending topk+weights until down arrives. 
+ struct PendingBatch { + int64_t n_expert_used = 0; + int64_t n_tokens = 0; + std::vector expert_ids; // [n_expert_used * n_tokens] + std::vector gate_weights; // [n_expert_used * n_tokens] + bool has_topk = false; + bool has_weights = false; + }; + std::map pending; // layer_idx → pending + + // Strip device prefix/suffix: "CUDA0#ffn_moe_down-5#0" → "ffn_moe_down-5" + static std::string clean_name(const char * raw) { + const char * p = strchr(raw, '#'); + if (p) { + ++p; + const char * q = strchr(p, '#'); + return q ? std::string(p, q - p) : std::string(p); + } + return raw; + } + + bool wants(struct ggml_tensor * t) { + if (!t->name[0]) return false; + const std::string n = clean_name(t->name); + return (n.compare(0, 13, "ffn_moe_topk-") == 0 || + n.compare(0, 16, "ffn_moe_weights-") == 0 || + n.compare(0, 13, "ffn_moe_down-") == 0); + } + + bool on_tensor(struct ggml_tensor * t) { + const std::string name = clean_name(t->name); + + // Identify tensor type and layer + int il = -1; + bool is_topk = false; + bool is_weights = false; + bool is_down = false; + + if (name.compare(0, 13, "ffn_moe_topk-") == 0) { il = atoi(name.c_str() + 13); is_topk = true; } + else if (name.compare(0, 16, "ffn_moe_weights-") == 0) { il = atoi(name.c_str() + 16); is_weights = true; } + else if (name.compare(0, 13, "ffn_moe_down-") == 0) { il = atoi(name.c_str() + 13); is_down = true; } + else return true; + + if (il < 0) return true; + + // Copy tensor data from (possibly GPU) buffer to host + const size_t nbytes = ggml_nbytes(t); + std::vector buf(nbytes); + ggml_backend_tensor_get(t, buf.data(), 0, nbytes); + + std::lock_guard lk(mtx); + PendingBatch & pb = pending[il]; + + if (is_topk) { + // [n_expert_used, n_tokens] I32 + pb.n_expert_used = t->ne[0]; + pb.n_tokens = t->ne[1]; + pb.expert_ids.resize(pb.n_expert_used * pb.n_tokens); + memcpy(pb.expert_ids.data(), buf.data(), pb.n_expert_used * pb.n_tokens * sizeof(int32_t)); + pb.has_topk = true; + pb.has_weights = false; // reset 
in case of re-use + + } else if (is_weights) { + // [1, n_expert_used, n_tokens] F32 — flat layout same as topk + if (!pb.has_topk) return true; // shouldn't happen + pb.gate_weights.resize(pb.n_expert_used * pb.n_tokens); + memcpy(pb.gate_weights.data(), buf.data(), pb.n_expert_used * pb.n_tokens * sizeof(float)); + pb.has_weights = true; + + } else if (is_down) { + // [n_embd, n_expert_used, n_tokens] F32 + if (!pb.has_topk || !pb.has_weights) return true; + + const int64_t n_embd = t->ne[0]; + const int64_t n_expert_used = t->ne[1]; + const int64_t n_tokens = t->ne[2]; + + // Sanity check + if (n_expert_used != pb.n_expert_used || n_tokens != pb.n_tokens) { + LOG_ERR("expert-profile: dimension mismatch at layer %d\n", il); + pending.erase(il); + return true; + } + + // Ensure layer stats initialised + auto & ls = layer_stats[il]; + if (ls.n_experts == 0) ls.init(n_experts); + + const float * expert_outs = reinterpret_cast(buf.data()); + ls.add_batch(pb.expert_ids.data(), pb.gate_weights.data(), + expert_outs, n_expert_used, n_tokens, n_embd); + + // Done with this batch for this layer + pending.erase(il); + } + + return true; + } +}; + +// ─── Global collector + C callback ─────────────────────────────────────────── + +static ExpertCollector g_collector; + +static bool expert_eval_callback(struct ggml_tensor * t, bool ask, void * /*user_data*/) { + if (ask) return g_collector.wants(t); + return g_collector.on_tensor(t); +} + +// ─── JSON output ────────────────────────────────────────────────────────────── + +static void save_stats(const std::string & path) { + std::ofstream f(path); + if (!f) { + LOG_ERR("expert-profile: failed to open output file '%s'\n", path.c_str()); + return; + } + + f << "{\n"; + bool first_layer = true; + for (auto & [il, ls] : g_collector.layer_stats) { + if (!first_layer) f << ",\n"; + first_layer = false; + + f << " \"" << il << "\": {\n"; + f << " \"total_tokens\": " << ls.total_tokens << ",\n"; + + // activation_counts + f << " 
\"activation_counts\": ["; + for (int64_t i = 0; i < ls.n_experts; ++i) { + if (i) f << ", "; + f << ls.activation_counts[i]; + } + f << "],\n"; + + // activation_frequency + f << " \"activation_frequency\": ["; + for (int64_t i = 0; i < ls.n_experts; ++i) { + if (i) f << ", "; + f << ((ls.total_tokens > 0) ? (double)ls.activation_counts[i] / ls.total_tokens : 0.0); + } + f << "],\n"; + + // avg_gate_weight (weighted_freq_sum / activation_counts) + f << " \"avg_gate_weight\": ["; + for (int64_t i = 0; i < ls.n_experts; ++i) { + if (i) f << ", "; + f << ((ls.activation_counts[i] > 0) ? ls.weighted_freq_sum[i] / ls.activation_counts[i] : 0.0); + } + f << "],\n"; + + // ean_mean = ean_sum / activation_counts (EAN criterion, no gate weight) + f << " \"ean_mean\": ["; + for (int64_t i = 0; i < ls.n_experts; ++i) { + if (i) f << ", "; + f << ((ls.activation_counts[i] > 0) ? ls.ean_sum[i] / ls.activation_counts[i] : 0.0); + } + f << "],\n"; + + // reap = reap_sum / activation_counts (REAP criterion, Eq.9) + f << " \"reap\": ["; + for (int64_t i = 0; i < ls.n_experts; ++i) { + if (i) f << ", "; + f << ((ls.activation_counts[i] > 0) ? 
ls.reap_sum[i] / ls.activation_counts[i] : 0.0); + } + f << "],\n"; + + // never_activated + int64_t never = 0; + for (int64_t i = 0; i < ls.n_experts; ++i) { + if (ls.activation_counts[i] == 0) ++never; + } + f << " \"never_activated\": " << never << "\n"; + f << " }"; + } + f << "\n}\n"; + + LOG_INF("expert-profile: stats saved to '%s' (%zu MoE layers)\n", + path.c_str(), g_collector.layer_stats.size()); +} + +// ─── JSONL input ────────────────────────────────────────────────────────────── + +struct JsonPair { std::string prompt, response; }; + +static bool json_get_string(const std::string & line, const std::string & key, std::string & out) { + std::string search = "\"" + key + "\""; + size_t kpos = line.find(search); + if (kpos == std::string::npos) return false; + size_t colon = line.find(':', kpos + search.size()); + if (colon == std::string::npos) return false; + size_t q1 = line.find('"', colon + 1); + if (q1 == std::string::npos) return false; + out.clear(); + for (size_t i = q1 + 1; i < line.size(); ++i) { + if (line[i] == '\\' && i + 1 < line.size()) { + ++i; + switch (line[i]) { + case '"': out += '"'; break; + case '\\': out += '\\'; break; + case 'n': out += '\n'; break; + case 'r': out += '\r'; break; + case 't': out += '\t'; break; + default: out += line[i]; break; + } + } else if (line[i] == '"') { + return true; + } else { + out += line[i]; + } + } + return false; +} + +static std::vector load_jsonl(const std::string & path) { + std::vector pairs; + std::ifstream f(path); + if (!f) { LOG_ERR("expert-profile: cannot open JSONL file '%s'\n", path.c_str()); return pairs; } + std::string line; + while (std::getline(f, line)) { + if (line.empty()) continue; + JsonPair p; + json_get_string(line, "prompt", p.prompt); + json_get_string(line, "response", p.response); + if (!p.prompt.empty() || !p.response.empty()) pairs.push_back(std::move(p)); + } + return pairs; +} + +// ─── Inference loop ─────────────────────────────────────────────────────────── + 
+static void run_inference(llama_context * ctx, + const llama_model * model, + const std::vector & pairs, + int max_tokens, + const std::string & output_path, + int save_every) { + const llama_vocab * vocab = llama_model_get_vocab(model); + const bool add_bos = llama_vocab_get_add_bos(vocab); + + llama_batch batch = llama_batch_init(max_tokens, 0, 1); + + for (size_t pi = 0; pi < pairs.size(); ++pi) { + const std::string text = pairs[pi].prompt + "\n" + pairs[pi].response; + + std::vector tokens = common_tokenize(ctx, text, add_bos, true); + if ((int)tokens.size() > max_tokens) tokens.resize(max_tokens); + if (tokens.empty()) continue; + + LOG_INF(" [%zu/%zu] %zu tokens\n", pi + 1, pairs.size(), tokens.size()); + + llama_memory_clear(llama_get_memory(ctx), true); + + common_batch_clear(batch); + for (int i = 0; i < (int)tokens.size(); ++i) { + common_batch_add(batch, tokens[i], i, {0}, false); + } + batch.logits[batch.n_tokens - 1] = true; + + if (llama_decode(ctx, batch) != 0) { + LOG_ERR(" [%zu/%zu] llama_decode failed — skipping\n", pi + 1, pairs.size()); + } + + if (save_every > 0 && (pi + 1) % save_every == 0) { + save_stats(output_path); + } + } + + llama_batch_free(batch); +} + +// ─── CLI ────────────────────────────────────────────────────────────────────── + +int main(int argc, char ** argv) { + std::string model_path; + std::string jsonl_path; + std::string output_path = "expert_stats.json"; + int n_experts = 128; + int ctx_size = 2048; + int n_gpu_layers = 99; + int n_threads = 4; + int save_every = 100; + enum ggml_type kv_type_k = GGML_TYPE_F16; + enum ggml_type kv_type_v = GGML_TYPE_F16; + + auto parse_ggml_type = [](const char * s) -> enum ggml_type { + if (strcmp(s, "f32") == 0) return GGML_TYPE_F32; + if (strcmp(s, "f16") == 0) return GGML_TYPE_F16; + if (strcmp(s, "q8_0") == 0) return GGML_TYPE_Q8_0; + if (strcmp(s, "q4_0") == 0) return GGML_TYPE_Q4_0; + fprintf(stderr, "Unknown KV type '%s', using f16\n", s); return GGML_TYPE_F16; + }; + + for 
(int i = 1; i < argc; ++i) { + std::string a(argv[i]); + auto next = [&]() -> const char * { + if (i + 1 >= argc) { fprintf(stderr, "Missing argument for %s\n", argv[i]); exit(1); } + return argv[++i]; + }; + if (a == "-m" || a == "--model") model_path = next(); + else if (a == "--jsonl") jsonl_path = next(); + else if (a == "--output") output_path = next(); + else if (a == "--n-experts") n_experts = atoi(next()); + else if (a == "--ctx-size" || a == "-c") ctx_size = atoi(next()); + else if (a == "-ngl" || a == "--n-gpu-layers") n_gpu_layers = atoi(next()); + else if (a == "-t" || a == "--threads") n_threads = atoi(next()); + else if (a == "--type-k") kv_type_k = parse_ggml_type(next()); + else if (a == "--type-v") kv_type_v = parse_ggml_type(next()); + else if (a == "--save-every") save_every = atoi(next()); + else if (a == "-h" || a == "--help") { + fprintf(stderr, + "\nUsage: %s -m model.gguf --jsonl data.jsonl [options]\n" + " --output PATH Output JSON (default: expert_stats.json)\n" + " --n-experts N Experts per layer (default: 128)\n" + " --ctx-size N Context length (default: 2048)\n" + " -ngl N GPU layers (default: 99)\n" + " -t N CPU threads (default: 4)\n" + " --type-k/v TYPE KV cache type: f32/f16/q8_0/q4_0 (default: f16)\n" + " --save-every N Checkpoint every N samples (default: 100)\n\n", argv[0]); + return 0; + } else { + fprintf(stderr, "Unknown argument: %s\n", a.c_str()); return 1; + } + } + + if (model_path.empty()) { fprintf(stderr, "Error: -m required\n"); return 1; } + if (jsonl_path.empty()) { fprintf(stderr, "Error: --jsonl required\n"); return 1; } + + g_collector.n_experts = n_experts; + + LOG_INF("expert-profile: model = %s\n", model_path.c_str()); + LOG_INF("expert-profile: jsonl = %s\n", jsonl_path.c_str()); + LOG_INF("expert-profile: output = %s\n", output_path.c_str()); + LOG_INF("expert-profile: n_experts = %d\n", n_experts); + LOG_INF("expert-profile: ctx_size = %d\n", ctx_size); + LOG_INF("expert-profile: ngl = %d\n", n_gpu_layers); 
+ LOG_INF("expert-profile: criterion = REAP (gate_weight * ||expert_out||_2)\n"); + + auto pairs = load_jsonl(jsonl_path); + if (pairs.empty()) { LOG_ERR("expert-profile: no pairs loaded\n"); return 1; } + LOG_INF("expert-profile: loaded %zu pairs\n", pairs.size()); + + llama_backend_init(); + + // Suppress INFO/WARN spam (CUDA graph warmup etc.), only pass errors through + llama_log_set([](enum ggml_log_level level, const char * text, void *) { + if (level >= GGML_LOG_LEVEL_ERROR) fputs(text, stderr); + }, nullptr); + + llama_model_params mparams = llama_model_default_params(); + mparams.n_gpu_layers = n_gpu_layers; + + llama_model * model = llama_model_load_from_file(model_path.c_str(), mparams); + if (!model) { LOG_ERR("expert-profile: failed to load model\n"); return 1; } + + llama_context_params cparams = llama_context_default_params(); + cparams.n_ctx = ctx_size; + cparams.n_batch = ctx_size; + cparams.n_ubatch = std::min(ctx_size, 512); + cparams.n_threads = n_threads; + cparams.type_k = kv_type_k; + cparams.type_v = kv_type_v; + cparams.cb_eval = expert_eval_callback; + cparams.cb_eval_user_data = nullptr; + + llama_context * ctx = llama_init_from_model(model, cparams); + if (!ctx) { LOG_ERR("expert-profile: failed to create context\n"); return 1; } + + LOG_INF("expert-profile: running forward passes...\n"); + run_inference(ctx, model, pairs, ctx_size, output_path, save_every); + save_stats(output_path); + + // Summary + LOG_INF("\n MoE layers profiled: %zu\n", g_collector.layer_stats.size()); + for (auto & [il, ls] : g_collector.layer_stats) { + // Find top and bottom REAP expert + int64_t top_e = 0, bot_e = 0; + double top_v = 0.0, bot_v = 1e18; + for (int64_t i = 0; i < ls.n_experts; ++i) { + double v = (ls.activation_counts[i] > 0) ? 
ls.reap_sum[i] / ls.activation_counts[i] : 0.0; + if (v > top_v) { top_v = v; top_e = i; } + if (v < bot_v) { bot_v = v; bot_e = i; } + } + int64_t never = 0; + for (int64_t i = 0; i < ls.n_experts; ++i) + if (ls.activation_counts[i] == 0) ++never; + LOG_INF(" Layer %3d: tokens=%lld never=%lld reap_top=e%lld(%.4f) reap_bot=e%lld(%.4f)\n", + il, (long long)ls.total_tokens, (long long)never, + (long long)top_e, top_v, (long long)bot_e, bot_v); + } + + llama_free(ctx); + llama_model_free(model); + llama_backend_free(); + return 0; +} diff --git a/tools/moe-pruning/README.md b/tools/moe-pruning/README.md new file mode 100644 index 0000000000..a88499ac43 --- /dev/null +++ b/tools/moe-pruning/README.md @@ -0,0 +1,97 @@ +# MoE Expert Pruning Tools for NemotronH + +REAP-style expert pruning for `NVIDIA-Nemotron-3-Nano-30B-A3B` (and other +NemotronH MoE models), implemented in two complementary ways: + +1. **`tools/expert-profile/`** — C++ profiler built into llama.cpp, collects + REAP scores directly from GGUF inference via the ggml eval callback. +2. **`tools/moe-pruning/`** (this directory) — Python scripts to prune the model + using the collected scores, either on a GGUF file directly or on a + HuggingFace BF16 checkpoint. + +--- + +## Inspiration & Prior Art + +This work is a direct implementation of the **REAP** saliency criterion +introduced in: + +> **REAP the Experts: Why Pruning Prevails for One-Shot MoE Compression** +> Mike Lasby, Ivan Lazarevich, Nish Sinnadurai, Sean Lie, Yani Ioannou, Vithursan Thangarasa +> Cerebras Research, 2025 +> arXiv: https://arxiv.org/abs/2510.13999 +> Code: https://github.com/CerebrasResearch/reap + +The REAP score for expert `j` is (Equation 9 of the paper): + +``` +REAP(j) = mean_{t : j ∈ topk(t)} [ g_j(t) · ‖f_j(t)‖₂ ] +``` + +where `g_j(t)` is the router gate weight and `f_j(t)` is the expert FFN output +(pre-weighting) for token `t`. Experts with the lowest REAP score contribute +least to the layer output and are pruned first. 
+ +The original REAP repo targets HuggingFace models via PyTorch hooks on +standard architectures (Qwen3-MoE, Mixtral, DeepSeek-V2, Llama-4, …). + +**What we added / adapted:** + +- `tools/expert-profile/expert-profile.cpp` — llama.cpp C++ implementation + of REAP that intercepts `ffn_moe_topk`, `ffn_moe_weights`, and `ffn_moe_down` + tensors via `ggml_backend_eval_callback`, enabling REAP profiling on any + GGUF-quantised model (Q4_K_M, Q6_K, etc.) without needing full BF16 VRAM. + +- `gguf_prune.py` — prunes the GGUF file **directly**, slicing the expert axis + of the stacked weight tensors (`ffn_up_exps`, `ffn_down_exps`, `ffn_gate_inp`, + `ffn_exp_probs_b`) and patching `{arch}.expert_count` in the metadata. + Quantised blocks are preserved as raw bytes — no dequantise/requantise step. + +- `nemotron_reap.py` — HuggingFace-based alternative: profiles with 4-bit NF4 + on GPU (phase 1) and prunes the BF16 checkpoint on CPU (phase 2). Adds + NemotronH (`NemotronHForCausalLM`) support that the original REAP repo does + not have. + +--- + +## Recommended Workflow (low-VRAM, e.g. RTX 4060 Ti 16 GB) + +``` +┌─────────────────────────────────────────────┐ +│ Phase 1 — Profile (GPU, GGUF Q4, ~15 GB) │ +│ │ +│ llama-expert-profile │ +│ -m nemotron-Q4_K_M.gguf │ +│ --jsonl sample_calibration.jsonl │ +│ --output expert_stats.json │ +│ -ngl 99 --ctx-size 2048 │ +└───────────────────┬─────────────────────────┘ + │ expert_stats.json +┌───────────────────▼─────────────────────────┐ +│ Phase 2 — Prune (CPU, pure Python, ~2 GB) │ +│ │ +│ python gguf_prune.py │ +│ --input nemotron-Q4_K_M.gguf │ +│ --stats expert_stats.json │ +│ --output nemotron-pruned-26e.gguf │ +│ --keep_ratio 0.20 # 26/128 experts │ +└─────────────────────────────────────────────┘ +``` + +At 20 % keep ratio a ~22 GB Q4_K_M becomes ~4.5 GB. 
+ +--- + +## Files + +| File | Description | +|---|---| +| `gguf_prune.py` | GGUF-native pruner — no GPU needed, preserves quantisation | +| `nemotron_reap.py` | HF-based pruner — 4-bit GPU profile + CPU BF16 prune | +| `build_expert_profile.sh` | Build script for `llama-expert-profile` | +| `run_nemotron_profile.sh` | Example profiling run | +| `run_prune.sh` | Example pruning run | +| `run_convert_quantize.sh` | Convert HF → GGUF and quantise | +| `analyze_stats.py` | Visualise and compare expert stats JSON files | +| `sample_calibration.jsonl` | Sample calibration data (prompt+response pairs) | +| `expert_stats_reap.json` | Example stats output from expert-profile | diff --git a/tools/moe-pruning/analyze_stats.py b/tools/moe-pruning/analyze_stats.py new file mode 100644 index 0000000000..e7641a3bb6 --- /dev/null +++ b/tools/moe-pruning/analyze_stats.py @@ -0,0 +1,284 @@ +#!/usr/bin/env python3 +""" +analyze_stats.py -- Summarize expert_stats.json and model size projections. +Usage: python analyze_stats.py [stats_file] [--keep 0.5] +""" +import json, sys, statistics, argparse + +parser = argparse.ArgumentParser() +parser.add_argument("stats", nargs="?", default="expert_stats_reap.json") +parser.add_argument("--keep", type=float, default=0.5, help="Fraction of experts to keep (default 0.5)") +args = parser.parse_args() + +with open(args.stats) as f: + data = json.load(f) + +layers = sorted(data.keys(), key=int) +n_layers = len(layers) +keep_ratio = args.keep + +# Detect which scoring field is available (new REAP vs old importance_score) +sample_layer = data[layers[0]] +if "reap" in sample_layer: + score_field = "reap" + score_label = "REAP (gate_weight × ||expert_out||₂)" +elif "importance_score" in sample_layer: + score_field = "importance_score" + score_label = "importance_score (freq × avg_gate_weight) [legacy, no EAN]" +else: + raise ValueError(f"No recognised score field in stats. 
Keys: {list(sample_layer.keys())}") + +# ── Model architecture constants (Nemotron-3-Nano-30B-A3B) ────────────────── +N_EXPERTS = 128 +N_EXPERT_USED = 6 # top-k per token +N_MOE_LAYERS = 23 +N_TOTAL_LAYERS = 53 +# Approximate parameter counts (bf16, billions) +PARAMS_TOTAL_B = 30.0 +PARAMS_MOE_EXPERTS_B = 22.0 # bulk of MoE weight is in expert FFNs +PARAMS_NON_MOE_B = PARAMS_TOTAL_B - PARAMS_MOE_EXPERTS_B + +# ── Header ────────────────────────────────────────────────────────────────── +print("=" * 70) +print(f" Expert Stats Analysis | file: {args.stats}") +print("=" * 70) + +# ── Profiling completeness ─────────────────────────────────────────────────── +sample_tokens = list(data.values())[0]["total_tokens"] +# Each token activates N_EXPERT_USED experts, sum(activation_counts) = total*top_k +# Approximate samples: total_tokens / avg_tokens_per_sample +# We don't know avg, but can infer: total_tokens / (total_tokens / ctx) ≈ ctx chunks +# Better: just report tokens and note the user knows sample count +print(f"\n── Profiling progress ──────────────────────────────────────────────────") +print(f" MoE layers profiled : {n_layers} / {N_MOE_LAYERS}") +print(f" Tokens processed : {sample_tokens:,} (per layer)") +act_sum = sum(data[layers[0]]["activation_counts"]) +assert abs(act_sum / sample_tokens - N_EXPERT_USED) < 0.01, "unexpected top-k" +print(f" top-k confirmed : {N_EXPERT_USED} (sum activations / tokens = {act_sum/sample_tokens:.1f})") + +# ── Per-layer importance score stats ──────────────────────────────────────── +print(f"\n── Per-layer score distribution [{score_label}]") +print(f" {'Layer':>5} {'Min':>9} {'Max':>9} {'Range':>9} {'CV%':>6} {'Never':>5}") +global_cvs = [] +for k in layers: + d = data[k] + s = d[score_field] + mn, mx = min(s), max(s) + cv = statistics.stdev(s) / statistics.mean(s) * 100 + global_cvs.append(cv) + print(f" {k:>5} {mn:>9.5f} {mx:>9.5f} {mx-mn:>9.5f} {cv:>6.3f}% {d['never_activated']:>5}") + +print(f"\n Mean CV across layers : 
{statistics.mean(global_cvs):.3f}%") +print(f" (CV < 1% = near-uniform; load-balancing is working as designed)") + +# ── Capacity loss sweep across pruning levels ──────────────────────────────── +# Paper (observer.py): REAP[i] = mean(ean_norm * softmax_router_weight) over tokens +# routed to expert i, averaged via OnlineStatsTracker weighted by expert_frequency. +# Our implementation (llama.cpp): same formula but routing weights are the top-k +# gate weights (post-softmax within top-k), not the full softmax over all 128. +# Impact: our weights are slightly higher than the paper's (renormalized to top-k +# only), but relative expert ranking within a layer should be preserved. +# +# IMPORTANT CAVEAT for this model (Nemotron-3-Nano-30B-A3B): +# The model was trained with a strong load-balancing auxiliary loss, so all 128 +# experts have nearly identical activation frequency (~4.69%) AND nearly identical +# REAP scores (Gini ~0.015, top/bottom ratio ~1.1-1.35x). The score distribution +# is a smooth monotone curve with NO natural elbow or gap. +# +# This means: +# - REAP ranking beats random pruning by only ~1pp in mass terms at keep=33% +# - The cut point boundary (rank 42 vs 43) has near-zero gap in most layers +# - REAP paper results on Qwen3-30B-A3B likely had higher Gini (less tight +# load-balancing or more expert specialization in pre-training) +# - For this model, actual quality loss must be measured via eval, not predicted +# from REAP score variance +# +# Metrics reported: +# - kept_mass%: REAP mass in the KEPT experts as % of total (> keep_ratio% = good) +# - vs_random%: how much more mass the REAP-selected set retains vs a random set +# of the same size (= kept_mass% - keep_ratio%). Positive = REAP wins. +# - Rel.gap: score gap at cut / layer score range. Near 0 = no natural cut point. +# - Gini: inequality of score distribution. ~0.015 here = near-uniform. 
+ +def gini(scores): + """Gini coefficient of a list of non-negative values.""" + n = len(scores) + s = sorted(scores) + total = sum(s) + if total == 0: + return 0.0 + cumsum = 0.0 + for i, v in enumerate(s): + cumsum += (2 * (i + 1) - n - 1) * v + return cumsum / (n * total) + +def layer_stats(scores, n_keep): + """Return capacity metrics for a single layer at a given keep count.""" + n = len(scores) + ranked = sorted(range(n), key=lambda i: scores[i], reverse=True) + total = sum(scores) + kept_mass = sum(scores[i] for i in ranked[:n_keep]) + kept_frac = kept_mass / total if total > 0 else 0.0 # fraction of REAP mass kept + random_frac = n_keep / n # uniform expectation + vs_random = kept_frac - random_frac # positive = REAP beats random + score_range = scores[ranked[0]] - scores[ranked[-1]] + gap = scores[ranked[n_keep - 1]] - (scores[ranked[n_keep]] if n_keep < n else 0) + rel_gap = gap / score_range if score_range > 0 else 0.0 + return kept_frac * 100, vs_random * 100, rel_gap + +# Sweep over a range of keep ratios +sweep_ratios = [0.10, 0.20, 0.25, 0.33, 0.40, 0.50, 0.60, 0.75] +if keep_ratio not in sweep_ratios: + sweep_ratios.append(keep_ratio) +sweep_ratios = sorted(set(sweep_ratios)) + +# Per-layer Gini (fixed, independent of keep ratio) +layer_ginis = {k: gini(data[k][score_field]) for k in layers} +mean_gini = statistics.mean(layer_ginis.values()) +worst_gini_layer = max(layer_ginis, key=lambda k: layer_ginis[k]) + +print(f"\n── Score distribution inequality (Gini coefficient) ────────────────────") +print(f" Gini measures how non-uniform REAP scores are within each layer.") +print(f" Gini=0: all experts identical. 
Gini=1: one expert dominates.") +print(f" With load-balanced MoE, Gini is small — but any Gini > 0 means") +print(f" REAP ranking beats random pruning.") +print(f"") +print(f" {'Layer':>5} {'Gini':>8} {'Score range':>13} {'Max/Min ratio':>14}") +print(f" {'-'*5} {'-'*8} {'-'*13} {'-'*14}") +for k in layers: + s = data[k][score_field] + mn, mx = min(s), max(s) + g = layer_ginis[k] + ratio_mm = mx / mn if mn > 0 else float('inf') + print(f" {k:>5} {g:>8.5f} {mx-mn:>13.5f} {ratio_mm:>13.3f}x") +print(f"") +print(f" Mean Gini : {mean_gini:.5f} (worst layer: {worst_gini_layer})") + +print(f"\n── Capacity retention sweep ─────────────────────────────────────────────") +print(f" Kept mass% = REAP mass in KEPT experts as % of total (higher = better)") +print(f" vs.rand% = Kept mass% minus uniform baseline (keep_ratio%)") +print(f" Positive = REAP beats random. Magnitude = advantage in pp.") +print(f" Rel.gap = score gap at cut / layer score range (higher = cleaner cut)") +print(f" WARNING: near-zero rel.gap and small vs.rand mean eval is the only ground truth.") +print(f"") +print(f" {'Keep':>5} {'Experts':>7} {'Kept mass%':>11} {'vs.rand%':>9} {'Rel.gap avg':>12} {'Worst layer':>11}") +print(f" {'-'*5} {'-'*7} {'-'*11} {'-'*9} {'-'*12} {'-'*11}") + +sweep_results = {} +for ratio in sweep_ratios: + nk = max(1, round(N_EXPERTS * ratio)) + mass_fracs, excesses, rel_gaps = [], [], [] + worst_excess, worst_layer_id = -999.0, None + for k in layers: + scores = data[k][score_field] + mf, exc, rg = layer_stats(scores, nk) + mass_fracs.append(mf) + excesses.append(exc) + rel_gaps.append(rg) + if exc > worst_excess: + worst_excess = exc + worst_layer_id = k + avg_mf = statistics.mean(mass_fracs) + avg_exc = statistics.mean(excesses) + avg_rg = statistics.mean(rel_gaps) + marker = " <--" if abs(ratio - keep_ratio) < 1e-9 else "" + print(f" {ratio:>5.0%} {nk:>7d} {avg_mf:>10.2f}% {avg_exc:>+9.2f}% {avg_rg:>11.4f} layer {worst_layer_id:>3}{marker}") + sweep_results[ratio] = { + 
"n_keep": nk, "avg_kept_mass": avg_mf, "avg_vs_random": avg_exc, + "avg_rel_gap": avg_rg, "worst_layer_id": worst_layer_id, "worst_vs_random": worst_excess, + } + +print(f"") +print(f" vs.rand% quantifies REAP's advantage over random pruning in REAP-mass terms.") +print(f" For this model it is small (+0.7 to +1.5pp) due to tight load-balancing.") +print(f" Rel.gap near zero means scores are smooth with no natural cut — any threshold") +print(f" is as defensible as another. Actual quality delta requires empirical eval.") + +# ── Expert keep/prune detail at selected keep_ratio ────────────────────────── +n_keep = max(1, round(N_EXPERTS * keep_ratio)) +n_prune = N_EXPERTS - n_keep + +print(f"\n── Expert pruning detail at keep_ratio={keep_ratio:.0%} ({n_keep} keep / {n_prune} prune per layer) ──") +print(f" {'Layer':>5} {'Kept mass%':>11} {'vs.rand%':>9} {'Rel.gap':>9} {'Min kept':>10} {'Max pruned':>11}") +print(f" {'-'*5} {'-'*11} {'-'*9} {'-'*9} {'-'*10} {'-'*11}") + +layer_results = {} +for k in layers: + scores = data[k][score_field] + ranked = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True) + mf, exc, rg = layer_stats(scores, n_keep) + min_kept = scores[ranked[n_keep - 1]] + max_pruned = scores[ranked[n_keep]] if n_prune > 0 else 0 + layer_results[k] = {"mass_frac": mf, "excess": exc, "rel_gap": rg, + "min_kept": min_kept, "max_pruned": max_pruned} + print(f" {k:>5} {mf:>10.2f}% {exc:>+9.2f}% {rg:>9.4f} {min_kept:>10.5f} {max_pruned:>11.5f}") + +avg_mf = statistics.mean(r["mass_frac"] for r in layer_results.values()) +avg_exc = statistics.mean(r["excess"] for r in layer_results.values()) +avg_rg = statistics.mean(r["rel_gap"] for r in layer_results.values()) +print(f" {'AVG':>5} {avg_mf:>10.2f}% {avg_exc:>+9.2f}% {avg_rg:>9.4f}") + +# ── Model size projections ─────────────────────────────────────────────────── +print(f"\n── Model size projections ──────────────────────────────────────────────") + +def model_size(keep): + expert_params = 
PARAMS_MOE_EXPERTS_B * keep + return PARAMS_NON_MOE_B + expert_params + +original_b = model_size(1.0) +pruned_b = model_size(keep_ratio) +reduction_pct = (1 - pruned_b / original_b) * 100 + +# GGUF sizes at common quant levels (rough: 1B params ≈ quant_bpw/8 GB) +quants = [("Q8_0", 8.0), ("Q5_K_M", 5.5), ("Q4_K_M", 4.5), ("Q3_K_M", 3.35), ("Q2_K", 2.63)] + +print(f" {'':20} {'Original':>10} {'Pruned':>10} {'Saved':>8}") +print(f" {'Parameters (B)':20} {original_b:>10.1f} {pruned_b:>10.1f} {original_b-pruned_b:>8.1f}B") +print(f" {'Reduction':20} {'':>10} {reduction_pct:>9.1f}%") +print() +print(f" Estimated GGUF sizes:") +print(f" {'Quant':10} {'Original':>10} {'Pruned':>10} {'Fits in':>12}") +for name, bpw in quants: + orig_gb = original_b * bpw / 8 + prune_gb = pruned_b * bpw / 8 + # VRAM fit (16GB GPU) + fits = "16GB GPU" if prune_gb <= 15.5 else ("32GB GPU" if prune_gb <= 31 else "CPU/RAM") + print(f" {name:10} {orig_gb:>9.1f}G {prune_gb:>9.1f}G {fits:>12}") + +# ── Active params per token (inference cost) ───────────────────────────────── +print(f"\n── Inference cost (active params per token) ────────────────────────────") +# Active params = non-moe + (n_expert_used/n_experts_kept * moe_expert_params) +# After pruning: router still picks top-k but from n_keep pool +# Active expert params per token = (N_EXPERT_USED / n_keep) * (PARAMS_MOE_EXPERTS_B * keep_ratio) +# But actually active params = N_EXPERT_USED * (params per single expert) +params_per_expert_orig = PARAMS_MOE_EXPERTS_B / N_EXPERTS # B per expert +params_per_expert_pruned = (PARAMS_MOE_EXPERTS_B * keep_ratio) / n_keep # same, just fewer experts + +active_orig = PARAMS_NON_MOE_B + N_EXPERT_USED * params_per_expert_orig * N_MOE_LAYERS / N_TOTAL_LAYERS +active_pruned = PARAMS_NON_MOE_B + N_EXPERT_USED * params_per_expert_pruned * N_MOE_LAYERS / N_TOTAL_LAYERS + +print(f" Original : {active_orig:.2f}B active params/token (same expert size, more choice)") +print(f" Pruned : {active_pruned:.2f}B active 
params/token (same — top-k still fires {N_EXPERT_USED} experts)")
+print(f"   Note: active params per token are IDENTICAL — pruning only reduces")
+print(f"   model file size and memory footprint, not per-token compute.")
+
+# ── Consistently low-importance experts ──────────────────────────────────────
+print(f"\n── Experts consistently ranked low across all layers ───────────────────")
+bottom_n = max(1, round(N_EXPERTS * 0.10))  # bottom 10%
+low_count = {}
+for k in layers:
+    scores = data[k][score_field]
+    ranked = sorted(range(len(scores)), key=lambda i: scores[i])
+    for eid in ranked[:bottom_n]:
+        low_count[eid] = low_count.get(eid, 0) + 1
+
+consistent = sorted(low_count.items(), key=lambda x: -x[1])
+consistent = [(eid, cnt) for eid, cnt in consistent if cnt >= 3]
+print(f"   (bottom 10% in >= 3 layers — most dispensable experts globally)")
+print(f"   Expert ID : layers in bottom 10%")
+for eid, cnt in consistent[:20]:
+    bar = "█" * cnt
+    print(f"   Expert {eid:>3} : {cnt:>2}/{n_layers}  {bar}")
+
+print()
+print("=" * 70)
diff --git a/tools/moe-pruning/build_expert_profile.sh b/tools/moe-pruning/build_expert_profile.sh
new file mode 100644
index 0000000000..0b39604426
--- /dev/null
+++ b/tools/moe-pruning/build_expert_profile.sh
@@ -0,0 +1,42 @@
+#!/usr/bin/env bash
+# build_expert_profile.sh
+# Builds llama.cpp with the expert-profile tool in WSL2 with CUDA.
+# Run this from the tools/moe-pruning/ directory: bash build_expert_profile.sh
+
+set -e
+
+LLAMA_SRC="$(cd ../.. && pwd)"  # absolute repo root; a relative "../.." would resolve wrongly after the cd below
+BUILD_DIR="$LLAMA_SRC/build_expert"
+
+echo "=== Building llama.cpp + expert-profile tool ==="
+echo "    Source : $LLAMA_SRC"
+echo "    Build  : $BUILD_DIR"
+
+mkdir -p "$BUILD_DIR"
+cd "$BUILD_DIR"
+
+# Configure with CUDA
+cmake "$LLAMA_SRC" \
+    -DCMAKE_BUILD_TYPE=Release \
+    -DGGML_CUDA=ON \
+    -DLLAMA_CURL=OFF \
+    -DLLAMA_BUILD_TESTS=OFF \
+    -DLLAMA_BUILD_EXAMPLES=OFF \
+    -DCMAKE_CUDA_ARCHITECTURES=86 \
+    2>&1 | tail -20
+
+# Build only the expert-profile target (fast)
+cmake --build . 
--target llama-expert-profile --config Release -j$(nproc) + +echo "" +echo "=== Build complete ===" +echo " Binary: $BUILD_DIR/tools/expert-profile/llama-expert-profile" +echo "" +echo "=== Usage ===" +echo " $BUILD_DIR/tools/expert-profile/llama-expert-profile \\" +echo " -m ~/nemotron-3-nano-30b-Q4_K_M.gguf \\" +echo " --jsonl ./sample_calibration.jsonl \\" +echo " --output ./expert_stats_reap.json \\" +echo " --n-experts 128 \\" +echo " --ctx-size 16384 \\" +echo " -ngl 99" diff --git a/tools/moe-pruning/extract_ppl.py b/tools/moe-pruning/extract_ppl.py new file mode 100644 index 0000000000..972a32e99d --- /dev/null +++ b/tools/moe-pruning/extract_ppl.py @@ -0,0 +1,41 @@ +import json, os + +base = os.path.dirname(os.path.abspath(__file__)) + +lines = open(os.path.join(base, 'rwsft-training-data.jsonl'), encoding='utf-8').readlines() +split = int(len(lines) * 0.95) + +train_lines = lines[:split] +val_lines = lines[split:] + +train_out = os.path.join(base, 'ppl-eval-train.txt') +val_out = os.path.join(base, 'ppl-eval-val.txt') + +def fmt(s): + # Full prompt+response so the model is conditioned correctly. + # llama-perplexity scores all tokens, but the prompt PPL is identical + # for base vs adapter — the delta is driven by the response tokens. 
+ prompt = s.get('prompt', '').strip() + response = s.get('response', '').strip() + if not response: + return None + if prompt: + return prompt + '\n' + response + return response + +with open(train_out, 'w', encoding='utf-8') as f: + for line in train_lines: + text = fmt(json.loads(line)) + if text: + f.write(text + '\n\n') + +with open(val_out, 'w', encoding='utf-8') as f: + for line in val_lines: + text = fmt(json.loads(line)) + if text: + f.write(text + '\n\n') + +train_chars = len(open(train_out, encoding='utf-8').read()) +val_chars = len(open(val_out, encoding='utf-8').read()) +print(f'train: {len(train_lines)} samples, {train_chars:,} chars -> ppl-eval-train.txt') +print(f'val: {len(val_lines)} samples, {val_chars:,} chars -> ppl-eval-val.txt') diff --git a/tools/moe-pruning/gguf_prune.py b/tools/moe-pruning/gguf_prune.py new file mode 100644 index 0000000000..2a36d5cf53 --- /dev/null +++ b/tools/moe-pruning/gguf_prune.py @@ -0,0 +1,258 @@ +""" +gguf-prune: REAP-based expert pruning directly on a GGUF file. + +Slices the expert dimension of the four stacked MoE weight tensors per layer: + blk.{il}.ffn_up_exps [n_embd, intermediate, n_experts] + blk.{il}.ffn_down_exps [intermediate, n_embd, n_experts] + blk.{il}.ffn_gate_inp [n_embd, n_experts] + blk.{il}.ffn_exp_probs_b [n_experts] (score-correction bias, if present) + +Quantized blocks (Q4_K, Q6_K, …) are preserved as raw bytes — slicing the +expert axis (last dim) is safe because each expert is independently quantised +in ggml, so dropping experts = dropping whole quantisation blocks. 
+ +Metadata patched: + {arch}.expert_count → keep_n + (expert_used_count = top-k routing k, NOT touched) + +Usage: + # keep top 20% of experts (26/128) per MoE layer + python gguf_prune.py \\ + --input nemotron.gguf \\ + --stats expert_stats.json \\ + --output nemotron-pruned.gguf \\ + --keep_ratio 0.20 + + # or keep an absolute number + python gguf_prune.py \\ + --input nemotron.gguf \\ + --stats expert_stats.json \\ + --output nemotron-pruned.gguf \\ + --keep_n 32 +""" + +import argparse +import json +import re +import sys +from pathlib import Path + +import numpy as np +from gguf import GGUFReader, GGUFWriter, GGMLQuantizationType, GGUFValueType + + +# ── Constants ───────────────────────────────────────────────────────────────── + +# Base tensor names that carry the expert dimension (last axis in ggml layout). +# Some GGUFs append parameter tails like ".weight" / ".bias". +EXPERT_BASE_SUFFIXES = { + "ffn_up_exps", + "ffn_down_exps", + "ffn_gate_inp", +} + + +def is_expert_suffix(suffix: str) -> bool: + """Return True if a tensor suffix is one of the MoE expert tensors to prune.""" + if suffix in ("ffn_exp_probs_b", "exp_probs_b", "exp_probs_b.bias"): + return True + return any(suffix == base or suffix.startswith(base + ".") for base in EXPERT_BASE_SUFFIXES) + + +# ── Helpers ─────────────────────────────────────────────────────────────────── + +def layer_and_suffix(name: str) -> tuple[int, str] | tuple[None, None]: + m = re.match(r"blk\.(\d+)\.(.+)$", name) + if m: + return int(m.group(1)), m.group(2) + return None, None + + +def pick_experts(layer_stats: dict, keep_n: int) -> list[int]: + """ + Return sorted indices of the top `keep_n` experts by REAP score. + Falls back to 'importance_score' (weighted frequency) if 'reap' absent. 
+ """ + if "reap" in layer_stats: + scores = np.array(layer_stats["reap"], dtype=np.float64) + elif "importance_score" in layer_stats: + scores = np.array(layer_stats["importance_score"], dtype=np.float64) + else: + raise KeyError( + "Layer stats has neither 'reap' nor 'importance_score'. " + "Run expert-profile / nemotron_reap.py profile first." + ) + return sorted(np.argsort(scores)[-keep_n:].tolist()) + + +def slice_expert_axis(data: np.ndarray, keep: list[int]) -> np.ndarray: + """ + Slice the expert axis of reader tensor data keeping only `keep` indices. + + GGUFReader reshapes tensors to NumPy with reversed ggml dims, so for MoE + tensors where experts are the last ggml dim, expert is axis 0 in `data`. + This also preserves quantized row-byte alignment (axis -1 is byte-packed + rows for quantized tensors and must not be sliced for expert pruning). + """ + return np.take(data, keep, axis=0) + + +def copy_field(writer: GGUFWriter, field, reader: GGUFReader) -> bool: + """Copy a single metadata field to writer. Returns False if skipped.""" + key = field.name + val_type = field.types[0] + part = field.parts[-1] + + if val_type == GGUFValueType.STRING: + # Preserve raw bytes: GGUF metadata can contain non-UTF8 strings. 
+ writer.add_key_value(key, bytes(part), GGUFValueType.STRING) + elif val_type == GGUFValueType.UINT8: + writer.add_uint8(key, int(part[0])) + elif val_type == GGUFValueType.INT8: + writer.add_int8(key, int(part[0])) + elif val_type == GGUFValueType.UINT16: + writer.add_uint16(key, int(part[0])) + elif val_type == GGUFValueType.INT16: + writer.add_int16(key, int(part[0])) + elif val_type == GGUFValueType.UINT32: + writer.add_uint32(key, int(part[0])) + elif val_type == GGUFValueType.INT32: + writer.add_int32(key, int(part[0])) + elif val_type == GGUFValueType.FLOAT32: + writer.add_float32(key, float(part[0])) + elif val_type == GGUFValueType.UINT64: + writer.add_uint64(key, int(part[0])) + elif val_type == GGUFValueType.INT64: + writer.add_int64(key, int(part[0])) + elif val_type == GGUFValueType.FLOAT64: + writer.add_float64(key, float(part[0])) + elif val_type == GGUFValueType.BOOL: + writer.add_bool(key, bool(part[0])) + elif val_type == GGUFValueType.ARRAY: + elem_type = field.types[1] + if elem_type == GGUFValueType.STRING: + # ReaderField.data stores indices of ARRAY payload items; for + # STRING arrays this points at each string byte payload. + vals = [bytes(field.parts[idx]) for idx in field.data] + writer.add_key_value(key, vals, GGUFValueType.ARRAY, sub_type=GGUFValueType.STRING) + else: + # ReaderField.data stores part-indices, not payload values. 
+ vals = field.contents() + if not isinstance(vals, list): + print(f" WARNING: skipping array field {key!r} (unexpected non-list contents)") + return False + writer.add_array(key, vals) + else: + print(f" WARNING: skipping field {key!r} (unsupported type {val_type})") + return False + return True + + +# ── Main ────────────────────────────────────────────────────────────────────── + +def main(): + ap = argparse.ArgumentParser(description="REAP expert pruning on a GGUF file") + ap.add_argument("--input", required=True, help="Input .gguf path") + ap.add_argument("--stats", required=True, help="expert_stats.json from expert-profile") + ap.add_argument("--output", required=True, help="Output .gguf path") + ap.add_argument("--keep_ratio", type=float, default=None, help="Fraction to keep, e.g. 0.20") + ap.add_argument("--keep_n", type=int, default=None, help="Absolute count to keep, e.g. 32") + ap.add_argument("--n_experts", type=int, default=128, help="Experts per MoE layer in source model") + args = ap.parse_args() + + if args.keep_ratio is None and args.keep_n is None: + ap.error("Provide --keep_ratio or --keep_n") + if args.keep_ratio is not None and args.keep_n is not None: + ap.error("Provide --keep_ratio OR --keep_n, not both") + + keep_n = args.keep_n if args.keep_n is not None else max(1, int(args.n_experts * args.keep_ratio)) + print(f"[gguf-prune] keeping {keep_n}/{args.n_experts} experts per MoE layer") + + # ── Load stats ───────────────────────────────────────────────────────────── + with open(args.stats) as f: + stats = {int(k): v for k, v in json.load(f).items()} + print(f"[gguf-prune] stats loaded for {len(stats)} MoE layers") + + # ── Open source GGUF ─────────────────────────────────────────────────────── + print(f"[gguf-prune] reading {args.input}") + reader = GGUFReader(args.input, mode="r") + + arch_field = reader.get_field("general.architecture") + arch = str(bytes(arch_field.parts[-1]), "utf-8") if arch_field else "nemotron_h_moe" + 
print(f"[gguf-prune] arch {arch}") + + expert_count_key = f"{arch}.expert_count" + + # ── Compute kept indices per layer ───────────────────────────────────────── + kept: dict[int, list[int]] = {} + for tensor in reader.tensors: + il, suffix = layer_and_suffix(tensor.name) + if il is None or not is_expert_suffix(suffix): + continue + if il in kept: + continue # already computed for this layer + if il not in stats: + print(f" Layer {il:3d}: no stats — keeping ALL {args.n_experts} experts") + kept[il] = list(range(args.n_experts)) + else: + kept[il] = pick_experts(stats[il], keep_n) + never = stats[il].get("never_activated", "?") + crit = "reap" if "reap" in stats[il] else "importance_score" + print(f" Layer {il:3d}: keep {kept[il][:4]}… never_activated={never} criterion={crit}") + + # ── Build output GGUF ────────────────────────────────────────────────────── + print(f"\n[gguf-prune] writing {args.output}") + writer = GGUFWriter(args.output, arch=arch) + + # --- metadata: copy all fields, replace expert_count --- + for field in reader.fields.values(): + # Reader exposes synthetic header fields (GGUF.*) that are not KV + # metadata and must not be copied back as normal keys. + if field.name.startswith("GGUF."): + continue + # Writer already sets general.architecture from ctor; avoid duplicate warning. 
+ if field.name in (expert_count_key, "general.architecture"): + continue # replaced below + copy_field(writer, field, reader) + + writer.add_expert_count(keep_n) + print(f"[gguf-prune] patched {expert_count_key} → {keep_n}") + + # --- tensors --- + n_pruned = 0 + for tensor in reader.tensors: + il, suffix = layer_and_suffix(tensor.name) + is_expert = il is not None and is_expert_suffix(suffix) + + if is_expert: + k = kept[il] + data = slice_expert_axis(tensor.data, k) + writer.add_tensor( + tensor.name, + data, + raw_dtype=tensor.tensor_type, + ) + n_pruned += 1 + else: + writer.add_tensor( + tensor.name, + tensor.data, + raw_dtype=tensor.tensor_type, + ) + + writer.write_header_to_file() + writer.write_kv_data_to_file() + writer.write_tensors_to_file(progress=True) + writer.close() + + out = Path(args.output) + size_gb = out.stat().st_size / 1024**3 + print(f"\n[gguf-prune] done") + print(f" Expert tensors sliced : {n_pruned}") + print(f" MoE layers pruned : {len(kept)}") + print(f" Experts per layer : {keep_n}/{args.n_experts}") + print(f" Output size : {size_gb:.2f} GB → {out}") + + +if __name__ == "__main__": + main() diff --git a/tools/moe-pruning/nemotron_reap.py b/tools/moe-pruning/nemotron_reap.py new file mode 100644 index 0000000000..fac5831d3c --- /dev/null +++ b/tools/moe-pruning/nemotron_reap.py @@ -0,0 +1,296 @@ +""" +NemotronH Expert Activation Profiler + Pruner +Two-phase: profile with 4-bit on GPU, prune bf16 on CPU. 
+ +Usage: + # Phase 1 - profile + python nemotron_reap.py profile \ + --model unsloth/Nemotron-3-Nano-30B-A3B \ + --prompts training-data.jsonl \ + --output expert_stats.json + + # Phase 2 - prune + python nemotron_reap.py prune \ + --model unsloth/Nemotron-3-Nano-30B-A3B \ + --stats expert_stats.json \ + --keep_ratio 0.20 \ + --output ./nemotron-pruned-25e +""" + +import os +os.environ["TORCH_COMPILE_DISABLE"] = "1" # prevent inductor hang during save_pretrained + +import json +import argparse +import torch +import numpy as np +from collections import defaultdict +from transformers import AutoTokenizer, AutoModelForCausalLM + +try: + from transformers import BitsAndBytesConfig + import patch_bnb # noqa: F401 — patches Params4bit.__new__ for transformers 5.x compat + HAS_BNB = True +except ImportError: + HAS_BNB = False + + +# ── Tracker ─────────────────────────────────────────────────────────────────── + +class ExpertActivationTracker: + def __init__(self, n_experts: int = 128): + self.n_experts = n_experts + self.activation_counts = defaultdict(lambda: np.zeros(n_experts, dtype=np.int64)) + self.activation_weights = defaultdict(lambda: np.zeros(n_experts, dtype=np.float64)) + self.total_tokens = defaultdict(int) + self._hooks = [] + + def register_hooks(self, model): + count = 0 + for layer_idx, block in enumerate(model.backbone.layers): + if block.block_type == "moe": + h = block.mixer.gate.register_forward_hook(self._make_hook(layer_idx)) + self._hooks.append(h) + count += 1 + print(f" Hooks attached to {count} MoE layers") + + def _make_hook(self, layer_idx): + def hook(module, input, output): + topk_indices, topk_weights = output + idx = topk_indices.detach().cpu().numpy() # [T, 6] + wgt = topk_weights.detach().float().cpu().numpy() # [T, 6] + T = idx.shape[0] + self.total_tokens[layer_idx] += T + np.add.at(self.activation_counts[layer_idx], idx.flatten(), 1) + np.add.at(self.activation_weights[layer_idx], idx.flatten(), wgt.flatten()) + return hook + + def 
remove_hooks(self): + for h in self._hooks: + h.remove() + self._hooks.clear() + + def get_stats(self) -> dict: + stats = {} + for layer_idx in sorted(self.activation_counts): + counts = self.activation_counts[layer_idx] + weights = self.activation_weights[layer_idx] + total = self.total_tokens[layer_idx] + freq = counts / (total + 1e-9) + avg_w = np.where(counts > 0, weights / counts, 0.0) + importance = freq * avg_w + stats[layer_idx] = { + "total_tokens": int(total), + "activation_counts": counts.tolist(), + "activation_frequency": freq.tolist(), + "avg_weight": avg_w.tolist(), + "importance_score": importance.tolist(), + "never_activated": int((counts == 0).sum()), + } + return stats + + def print_summary(self, stats, keep_ratio): + keep_n = max(1, int(self.n_experts * keep_ratio)) + print(f"\n{'='*70}") + print(f" PROFILING SUMMARY | keep_ratio={keep_ratio:.0%} | keeping {keep_n}/128 experts/layer") + print(f"{'='*70}") + for li, s in stats.items(): + imp = np.array(s['importance_score']) + threshold = np.sort(imp)[self.n_experts - keep_n] + print( + f" Layer {li:3d}: " + f"never_activated={s['never_activated']:3d}/128 " + f"top_freq={max(s['activation_frequency']):.3f} " + f"threshold={threshold:.4f}" + ) + total_moe = len(stats) + print(f"\n MoE layers : {total_moe}") + print(f" Kept : {total_moe * keep_n} experts total") + print(f" Pruned : {total_moe * (self.n_experts - keep_n)} experts total") + print(f"{'='*70}\n") + + +# ── Phase 1: Profile ────────────────────────────────────────────────────────── + +def cmd_profile(args): + # Mamba2 layers use Triton kernels — CUDA required. + # 4-bit NF4 fits in 16GB VRAM (~15GB). We must keep ALL layers on GPU + # (no CPU spillover) otherwise PCIe transfers make inference unusably slow. 
+ print(f"\n[Phase 1] Profiling — 4-bit NF4, GPU only") + print(f" Model : {args.model}") + print(f" Prompts: {args.prompts}") + + bnb_config = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_quant_type="nf4", + bnb_4bit_compute_dtype=torch.bfloat16, + bnb_4bit_use_double_quant=True, + ) + + tokenizer = AutoTokenizer.from_pretrained(args.model, trust_remote_code=True) + print(" Loading model in 4-bit NF4...") + model = AutoModelForCausalLM.from_pretrained( + args.model, + trust_remote_code=True, + quantization_config=bnb_config, + device_map={"": 0}, # force ALL layers onto GPU 0, no CPU spillover + ) + model.eval() + print(" Model loaded on GPU.") + + # Load prompt+response pairs + pairs = [] + with open(args.prompts) as f: + for line in f: + line = line.strip() + if not line: + continue + obj = json.loads(line) + text = obj.get("prompt", "") + "\n" + obj.get("response", "") + pairs.append(text) + print(f" Loaded {len(pairs)} prompt+response pairs") + + tracker = ExpertActivationTracker(n_experts=128) + tracker.register_hooks(model) + + with torch.no_grad(): + for i, text in enumerate(pairs): + if i % 100 == 0: + print(f" [{i+1}/{len(pairs)}] processing...") + inputs = tokenizer( + text, + return_tensors="pt", + truncation=True, + max_length=args.max_length, + ).to("cuda") + try: + model(**inputs) + except torch.cuda.OutOfMemoryError: + print(f" [{i+1}] OOM — skipping") + torch.cuda.empty_cache() + + tracker.remove_hooks() + stats = tracker.get_stats() + tracker.print_summary(stats, keep_ratio=args.keep_ratio) + + stats_out = {str(k): v for k, v in stats.items()} + with open(args.output, "w") as f: + json.dump(stats_out, f, indent=2) + print(f" Stats saved → {args.output}") + print(f"\n[Phase 1] Done. 
Run 'prune' next (CPU only).") + + +# ── Phase 2: Prune ──────────────────────────────────────────────────────────── + +def cmd_prune(args): + print(f"\n[Phase 2] Pruning — bf16 on CPU") + print(f" Model : {args.model}") + print(f" Stats : {args.stats}") + print(f" Keep ratio : {args.keep_ratio:.0%}") + print(f" Output : {args.output}") + + with open(args.stats) as f: + stats = {int(k): v for k, v in json.load(f).items()} + + tokenizer = AutoTokenizer.from_pretrained(args.model, trust_remote_code=True) + + print(" Loading model in bf16 on CPU — this takes a few minutes...") + model = AutoModelForCausalLM.from_pretrained( + args.model, + trust_remote_code=True, + dtype=torch.bfloat16, + device_map="cpu", + ) + + keep_n = max(1, int(128 * args.keep_ratio)) + print(f"\n Pruning to top-{keep_n} experts per MoE layer...\n") + + for layer_idx, block in enumerate(model.backbone.layers): + if block.block_type != "moe": + continue + + if layer_idx not in stats: + print(f" Layer {layer_idx:3d}: no profiling data — skipping") + continue + + # Use REAP score if available (from llama.cpp profiler), else fall back to legacy importance_score + layer_stats = stats[layer_idx] + if "reap" in layer_stats: + importance = np.array(layer_stats["reap"]) + else: + importance = np.array(layer_stats["importance_score"]) + keep_sorted = sorted(np.argsort(importance)[-keep_n:].tolist()) + prune_count = 128 - len(keep_sorted) + + # Prune expert list + block.mixer.experts = torch.nn.ModuleList( + [block.mixer.experts[i] for i in keep_sorted] + ) + + # Prune router weights to match new expert indices + keep_t = torch.tensor(keep_sorted, dtype=torch.long) + block.mixer.gate.weight = torch.nn.Parameter( + block.mixer.gate.weight.data[keep_t].clone() + ) + old_bias = block.mixer.gate.e_score_correction_bias.data[keep_t].clone() + block.mixer.gate.register_buffer("e_score_correction_bias", old_bias) + block.mixer.gate.n_routed_experts = keep_n + + never = stats[layer_idx]["never_activated"] + 
print(f" Layer {layer_idx:3d}: kept {keep_n}, pruned {prune_count} (was {never} never-activated)") + + # Patch top-level config + model.config.n_routed_experts = keep_n + + # Fix transformers 5.x incompatibility: _tied_weights_keys must be a list of dicts, + # but the custom NemotronH modeling code sets it as a plain list of strings. + # _get_tied_weight_keys() calls .keys() on each element → AttributeError. + # Clear it — lm_head weight tying is not needed for inference on the pruned model. + for mod in model.modules(): + if isinstance(getattr(mod, '_tied_weights_keys', None), list): + mod._tied_weights_keys = None + + # Disable torch.compile / inductor before saving — transformers 5.x can trigger + # torch._inductor.compile_worker during save_pretrained, causing an indefinite hang. + import os + os.environ["TORCH_COMPILE_DISABLE"] = "1" + torch._dynamo.reset() + + print(f"\n Saving pruned model → {args.output}") + with torch.no_grad(): + model.save_pretrained(args.output, safe_serialization=True) + tokenizer.save_pretrained(args.output) + print(f"\n[Phase 2] Done.") + print(f" Experts per MoE layer : {keep_n}/128") + print(f" Next: fine-tune with Unsloth from {args.output}") + + +# ── Entry point ─────────────────────────────────────────────────────────────── + +def main(): + parser = argparse.ArgumentParser(description="NemotronH Expert Pruner (REAP-style)") + sub = parser.add_subparsers(dest="cmd", required=True) + + p1 = sub.add_parser("profile", help="Phase 1: profile expert activations (GPU, 4-bit)") + p1.add_argument("--model", default="unsloth/Nemotron-3-Nano-30B-A3B") + p1.add_argument("--prompts", required=True) + p1.add_argument("--output", default="expert_stats.json") + p1.add_argument("--keep_ratio", type=float, default=0.20, + help="Preview ratio for summary only — does not affect saved stats") + p1.add_argument("--max_length", type=int, default=2048) + + p2 = sub.add_parser("prune", help="Phase 2: prune model using saved stats (CPU, bf16)") + 
p2.add_argument("--model", default="unsloth/Nemotron-3-Nano-30B-A3B") + p2.add_argument("--stats", default="expert_stats.json") + p2.add_argument("--keep_ratio", type=float, default=0.20) + p2.add_argument("--output", default="./nemotron-pruned") + + args = parser.parse_args() + if args.cmd == "profile": + cmd_profile(args) + elif args.cmd == "prune": + cmd_prune(args) + + +if __name__ == "__main__": + main() diff --git a/tools/moe-pruning/requirements.txt b/tools/moe-pruning/requirements.txt new file mode 100644 index 0000000000..8f778e9fe5 --- /dev/null +++ b/tools/moe-pruning/requirements.txt @@ -0,0 +1 @@ +gguf \ No newline at end of file diff --git a/tools/moe-pruning/sample_calibration.jsonl b/tools/moe-pruning/sample_calibration.jsonl new file mode 100644 index 0000000000..44e2627f97 --- /dev/null +++ b/tools/moe-pruning/sample_calibration.jsonl @@ -0,0 +1,8 @@ +{"prompt": "What is the difference between a transformer and an RNN?", "response": "Transformers process all tokens in parallel using self-attention, while RNNs process tokens sequentially and maintain a hidden state. Transformers scale better and handle long-range dependencies more effectively, but require more memory for the attention matrix."} +{"prompt": "Explain the concept of weight quantization in neural networks.", "response": "Weight quantization reduces model size and inference latency by representing parameters in lower precision formats such as INT8 or INT4 instead of FP32. This trades a small amount of accuracy for significant gains in memory efficiency and compute throughput."} +{"prompt": "What is mixture of experts (MoE)?", "response": "MoE is a neural network architecture where the feed-forward layers are replaced by multiple expert sub-networks and a routing mechanism. 
For each token, only a small subset of experts is activated, allowing the model to have a large parameter count while keeping inference compute constant."} +{"prompt": "Describe the attention mechanism.", "response": "Attention computes a weighted sum of value vectors, where the weights are determined by the compatibility between query and key vectors. Scaled dot-product attention computes scores as Q*K^T/sqrt(d_k), applies softmax to get weights, then multiplies by V."} +{"prompt": "What is GGUF and how does it differ from GGML?", "response": "GGUF is the successor to the GGML file format for storing quantized models. It supports arbitrary key-value metadata, is extensible without breaking backward compatibility, and encodes tensor names and shapes explicitly, making it more robust than the original GGML format."} +{"prompt": "How does LoRA work?", "response": "LoRA (Low-Rank Adaptation) injects trainable rank-decomposition matrices A and B into frozen weight layers. The adapted weight is W + alpha/r * B*A. Since rank r is much smaller than the weight dimensions, only a tiny fraction of parameters are trained."} +{"prompt": "What is perplexity in language modeling?", "response": "Perplexity measures how well a language model predicts a sample text. It is the exponentiated average negative log-likelihood per token: PPL = exp(-1/N * sum log P(token_i)). Lower perplexity indicates a better fit to the data."} +{"prompt": "Explain rotary position embeddings (RoPE).", "response": "RoPE encodes position by rotating query and key vectors in 2D subspaces using a position-dependent rotation matrix. 
This makes the dot product between Q and K depend only on their relative position, enabling the model to generalise to sequence lengths longer than those seen during training."} From 76d5b6798044fde6b50915a905c00ad8844523f4 Mon Sep 17 00:00:00 2001 From: Salvatore Rossitto Date: Thu, 12 Mar 2026 11:58:14 +0100 Subject: [PATCH 03/15] added missing llama_opt_set_reward_weights --- include/llama.h | 9 ++++++++- src/llama-context.h | 4 +++- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/include/llama.h b/include/llama.h index 0bd10294cb..0bf8ead384 100644 --- a/include/llama.h +++ b/include/llama.h @@ -1556,6 +1556,12 @@ extern "C" { LLAMA_API void llama_opt_init(struct llama_context * lctx, struct llama_model * model, struct llama_opt_params lopt_params); + // weights: array of floats, one per dataset window (indexed by idata), already normalized to [0,1]. + // n_weights: length of the array. + // Pass NULL/0 to disable (equivalent to all-ones, i.e. standard SFT). + // The pointer must remain valid for the duration of all llama_opt_epoch calls. 
+ LLAMA_API void llama_opt_set_reward_weights(const float * weights, int64_t n_weights); + LLAMA_API void llama_opt_epoch( struct llama_context * lctx, ggml_opt_dataset_t dataset, @@ -1563,7 +1569,8 @@ extern "C" { ggml_opt_result_t result_eval, int64_t idata_split, ggml_opt_epoch_callback callback_train, - ggml_opt_epoch_callback callback_eval); + ggml_opt_epoch_callback callback_eval, + bool shuffle); #ifdef __cplusplus } diff --git a/src/llama-context.h b/src/llama-context.h index e0d0085c1c..21d0ba6299 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -187,7 +187,8 @@ struct llama_context { ggml_opt_result_t result_eval, int64_t idata_split, ggml_opt_epoch_callback callback_train, - ggml_opt_epoch_callback callback_eval); + ggml_opt_epoch_callback callback_eval, + bool shuffle); void opt_epoch_iter( ggml_opt_dataset_t dataset, @@ -195,6 +196,7 @@ struct llama_context { const std::vector & tokens, const std::vector & labels_sparse, llama_batch & batch, + float reward_scale, ggml_opt_epoch_callback callback, bool train, int64_t idata_in_loop, From 70730e8d283070ba8e1b775ba2e2879d49c45707 Mon Sep 17 00:00:00 2001 From: Salvatore Rossitto Date: Thu, 12 Mar 2026 12:04:34 +0100 Subject: [PATCH 04/15] added reward scaling to opt_epoch_iter calls --- examples/training/finetune.cpp | 2 +- src/llama-context.cpp | 38 +++++++++++++++++++++++++++------- 2 files changed, 32 insertions(+), 8 deletions(-) diff --git a/examples/training/finetune.cpp b/examples/training/finetune.cpp index e20f89488f..dd58f9418e 100644 --- a/examples/training/finetune.cpp +++ b/examples/training/finetune.cpp @@ -83,7 +83,7 @@ int main(int argc, char ** argv) { for (lr.epoch = 0; lr.epoch < lr.epochs; ++lr.epoch) { llama_opt_epoch(ctx, dataset, result_train, result_eval, idata_split, - ggml_opt_epoch_callback_progress_bar, ggml_opt_epoch_callback_progress_bar); + ggml_opt_epoch_callback_progress_bar, ggml_opt_epoch_callback_progress_bar, /*shuffle=*/false); fprintf(stderr, "\n"); 
ggml_opt_result_reset(result_train); diff --git a/src/llama-context.cpp b/src/llama-context.cpp index ee2669c154..9f67d47b50 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -2657,6 +2657,7 @@ void llama_context::opt_epoch_iter( const std::vector & tokens, const std::vector & labels_sparse, llama_batch & batch, + float reward_scale, ggml_opt_epoch_callback callback, bool train, int64_t idata_in_loop, @@ -2742,11 +2743,14 @@ void llama_context::opt_epoch_iter( struct ggml_tensor * labels = ggml_opt_labels(opt_ctx); GGML_ASSERT(labels->ne[1] == n_ubatch); ggml_set_zero(labels); - const float onef = 1.0f; for (uint32_t pos_ubatch = 0; pos_ubatch < n_ubatch; ++pos_ubatch) { const uint32_t ilabel = pos_ctx + pos_batch + pos_ubatch; + // -1 sentinel means "masked position" (prompt token, BOS separator, etc). + // Leave the label tensor zeroed at this position → zero cross-entropy + // contribution. Do NOT write anything — ggml_set_zero already handled it. + if (labels_sparse[ilabel] < 0) continue; GGML_ASSERT(labels_sparse[ilabel] < labels->ne[0]); - ggml_backend_tensor_set(labels, &onef, (pos_ubatch*labels->ne[0] + labels_sparse[ilabel])*sizeof(float), sizeof(float)); + ggml_backend_tensor_set(labels, &reward_scale, (pos_ubatch*labels->ne[0] + labels_sparse[ilabel])*sizeof(float), sizeof(float)); } } ggml_opt_eval(opt_ctx, result); @@ -2760,13 +2764,25 @@ void llama_context::opt_epoch_iter( } } +// Optional per-window reward weights for reward-weighted SFT. +// Set via llama_opt_set_reward_weights() before calling llama_opt_epoch(). +// Null/0 means all rewards are 1.0 (standard SFT). 
+static thread_local const float * g_reward_weights = nullptr; +static thread_local int64_t g_reward_weights_n = 0; + +void llama_opt_set_reward_weights(const float * weights, int64_t n_weights) { + g_reward_weights = weights; + g_reward_weights_n = n_weights; +} + void llama_context::opt_epoch( ggml_opt_dataset_t dataset, ggml_opt_result_t result_train, ggml_opt_result_t result_eval, int64_t idata_split, ggml_opt_epoch_callback callback_train, - ggml_opt_epoch_callback callback_eval) { + ggml_opt_epoch_callback callback_eval, + bool shuffle) { const uint32_t n_ctx = this->n_ctx(); const uint32_t n_batch = std::min(cparams.n_batch, n_ctx); const uint32_t n_ubatch = std::min(cparams.n_ubatch, n_batch); @@ -2775,6 +2791,10 @@ void llama_context::opt_epoch( GGML_ASSERT(idata_split >= 0); GGML_ASSERT(idata_split <= ndata); + if (shuffle && idata_split > 1) { + ggml_opt_dataset_shuffle(opt_ctx, dataset, idata_split); + } + const uint32_t ubatch_per_ctx = n_ctx / n_ubatch; struct llama_batch batch = llama_batch_init(n_batch, 0, 1); @@ -2788,9 +2808,11 @@ void llama_context::opt_epoch( for (; idata < idata_split; ++idata) { constexpr bool train = true; const int64_t idata_in_loop = idata*ubatch_per_ctx; + const float reward = (g_reward_weights && idata < g_reward_weights_n) + ? 
g_reward_weights[idata] : 1.0f; ggml_opt_dataset_get_batch_host(dataset, tokens.data(), n_ctx*sizeof(llama_token), labels_sparse.data(), idata); - opt_epoch_iter(dataset, result_train, tokens, labels_sparse, batch, + opt_epoch_iter(dataset, result_train, tokens, labels_sparse, batch, reward, callback_train, train, idata_in_loop, ndata_in_loop, t_loop_start); } @@ -2801,7 +2823,7 @@ void llama_context::opt_epoch( const int64_t idata_in_loop = (idata - idata_split)*ubatch_per_ctx; ggml_opt_dataset_get_batch_host(dataset, tokens.data(), n_ctx*sizeof(llama_token), labels_sparse.data(), idata); - opt_epoch_iter(dataset, result_eval, tokens, labels_sparse, batch, + opt_epoch_iter(dataset, result_eval, tokens, labels_sparse, batch, 1.0f, callback_eval, train, idata_in_loop, ndata_in_loop, t_loop_start); } @@ -3550,12 +3572,14 @@ void llama_opt_epoch( ggml_opt_result_t result_eval, int64_t idata_split, ggml_opt_epoch_callback callback_train, - ggml_opt_epoch_callback callback_eval) { + ggml_opt_epoch_callback callback_eval, + bool shuffle) { ctx->opt_epoch( dataset, result_train, result_eval, idata_split, callback_train, - callback_eval); + callback_eval, + shuffle); } From 22277e3cbfb47fc85dadc7c8ce6c83a10977c503 Mon Sep 17 00:00:00 2001 From: Salvatore Rossitto Date: Thu, 12 Mar 2026 12:22:12 +0100 Subject: [PATCH 05/15] ported residual changes about grad_checkpointing --- examples/training/finetune.cpp | 1 + ggml/include/ggml-opt.h | 7 +++ ggml/src/ggml-opt.cpp | 69 +++++++++++++++++++--- include/llama.h | 6 ++ src/llama-context.cpp | 105 ++++++++++++++++++++++++++++++--- 5 files changed, 171 insertions(+), 17 deletions(-) diff --git a/examples/training/finetune.cpp b/examples/training/finetune.cpp index dd58f9418e..88d3db741d 100644 --- a/examples/training/finetune.cpp +++ b/examples/training/finetune.cpp @@ -73,6 +73,7 @@ int main(int argc, char ** argv) { /*get_opt_pars =*/common_opt_lr_pars, /*get_opt_pars_ud =*/¶ms.lr, /*optimizer_type =*/params.optimizer, + 
/*grad_checkpoint_interval =*/params.grad_checkpoint_interval, }; llama_opt_init(ctx, model, lopt_params); diff --git a/ggml/include/ggml-opt.h b/ggml/include/ggml-opt.h index 60774575f0..cac543c02d 100644 --- a/ggml/include/ggml-opt.h +++ b/ggml/include/ggml-opt.h @@ -126,6 +126,13 @@ extern "C" { ggml_opt_get_optimizer_params get_opt_pars; // callback for calculating optimizer parameters void * get_opt_pars_ud; // userdata for calculating optimizer parameters + // Gradient checkpointing: keep the output of every Nth forward node alive through + // the backward pass so the allocator cannot reuse its memory for other tensors. + // This keeps checkpointed activations resident for the backward pass (no recomputation), + // preventing the allocator from aliasing their buffers and lowering peak activation VRAM. + // Set to 0 (default) to disable. A value of ~32–64 cuts activation VRAM by ~50%. + int32_t grad_checkpoint_interval; + // only GGML_OPT_OPTIMIZER_TYPE_ADAMW needs m, v momenta per parameter tensor enum ggml_opt_optimizer_type optimizer; }; diff --git a/ggml/src/ggml-opt.cpp b/ggml/src/ggml-opt.cpp index e87fc79c25..8be90c8944 100644 --- a/ggml/src/ggml-opt.cpp +++ b/ggml/src/ggml-opt.cpp @@ -58,10 +58,13 @@ struct ggml_opt_context { std::vector grad_accs; std::vector grad_m; std::vector grad_v; + std::vector bufs_momenta; // per-param moment buffers (one per param node) + std::vector ctxs_momenta; // corresponding ggml contexts (keep alive for tensor metadata) int64_t iter = 1; int32_t opt_period = 1; int32_t opt_i = 0; + int32_t grad_checkpoint_interval = 0; bool loss_per_datapoint = false; ggml_opt_get_optimizer_params get_opt_pars = nullptr; @@ -254,9 +257,10 @@ struct ggml_opt_params ggml_opt_default_params( /*loss_type =*/ loss_type, /*build_type =*/ GGML_OPT_BUILD_TYPE_OPT, /*opt_period =*/ 1, - /*get_opt_pars =*/ ggml_opt_get_default_optimizer_params, - /*get_opt_pars_ud =*/ nullptr, - /*optimizer =*/ GGML_OPT_OPTIMIZER_TYPE_ADAMW, + /*get_opt_pars =*/ 
ggml_opt_get_default_optimizer_params, + /*get_opt_pars_ud =*/ nullptr, + /*grad_checkpoint_interval =*/ 0, + /*optimizer =*/ GGML_OPT_OPTIMIZER_TYPE_ADAMW, }; } @@ -476,8 +480,23 @@ static void ggml_opt_build(ggml_opt_context_t opt_ctx) { for (int i = 0; i < n_nodes; ++i) { ggml_tensor * node = opt_ctx->gf->nodes[i]; if (node->flags & GGML_TENSOR_FLAG_PARAM) { - opt_ctx->grad_m[i] = ggml_new_tensor(opt_ctx->ctx_static, GGML_TYPE_F32, GGML_MAX_DIMS, node->ne); - opt_ctx->grad_v[i] = ggml_new_tensor(opt_ctx->ctx_static, GGML_TYPE_F32, GGML_MAX_DIMS, node->ne); + // Allocate moments on the same buffer type as the param tensor so + // the ADAMW op runs on the correct backend (avoids cross-device mismatch + // when some LoRA tensors are on CPU and others on GPU with partial offload). + ggml_backend_buffer_type_t param_buft = node->buffer + ? ggml_backend_buffer_get_type(node->buffer) + : ggml_backend_cpu_buffer_type(); + + // Allocate a tiny context + buffer for this pair of moment tensors. + const size_t sz = 2 * ggml_tensor_overhead(); + struct ggml_init_params mip = { sz, nullptr, true }; + struct ggml_context * mctx = ggml_init(mip); + opt_ctx->grad_m[i] = ggml_new_tensor(mctx, GGML_TYPE_F32, GGML_MAX_DIMS, node->ne); + opt_ctx->grad_v[i] = ggml_new_tensor(mctx, GGML_TYPE_F32, GGML_MAX_DIMS, node->ne); + ggml_backend_buffer_t mbuf = ggml_backend_alloc_ctx_tensors_from_buft(mctx, param_buft); + ggml_backend_buffer_clear(mbuf, 0); + opt_ctx->bufs_momenta.push_back(mbuf); + opt_ctx->ctxs_momenta.push_back(mctx); // keep alive for tensor metadata } else { opt_ctx->grad_m[i] = nullptr; opt_ctx->grad_v[i] = nullptr; @@ -486,6 +505,31 @@ static void ggml_opt_build(ggml_opt_context_t opt_ctx) { } } + // Gradient checkpointing: mark every Nth forward node as OUTPUT so the allocator + // keeps its memory alive through the backward pass. 
The backward graph already + // contains the forward ops (gb_grad is a superset of gf), so the checkpointed + // activations are naturally available for backward matmuls without recomputation. + // This prevents the allocator from aliasing those buffers to later ops, cutting + // peak activation VRAM at the cost of slightly larger static allocation. + if (opt_ctx->grad_checkpoint_interval > 0) { + const int interval = opt_ctx->grad_checkpoint_interval; + const int n_fwd = opt_ctx->gf->n_nodes; + int ckpt_count = 0; + for (int i = interval - 1; i < n_fwd; i += interval) { + struct ggml_tensor * node = opt_ctx->gf->nodes[i]; + // Only checkpoint F32 compute nodes — skip I32 index tensors and already-output nodes. + if (node->type != GGML_TYPE_F32) continue; + if (node->flags & GGML_TENSOR_FLAG_OUTPUT) continue; + if (node->flags & GGML_TENSOR_FLAG_INPUT) continue; + node->flags |= GGML_TENSOR_FLAG_OUTPUT; + ckpt_count++; + } + if (ckpt_count > 0) { + GGML_LOG_DEBUG("%s: gradient checkpointing: marked %d/%d nodes as persistent (interval=%d)\n", + __func__, ckpt_count, n_fwd, interval); + } + } + // gb_grad == graph backward gradients, forward pass, then backward pass to calculate gradients. 
opt_ctx->gb_grad = ggml_graph_dup(opt_ctx->ctx_compute, opt_ctx->gf, /*force_grads =*/ true); ggml_build_backward_expand(opt_ctx->ctx_compute, opt_ctx->gb_grad, opt_ctx->grad_accs.data()); @@ -556,10 +600,11 @@ ggml_opt_context_t ggml_opt_init(struct ggml_opt_params params) { result->build_type_alloc = params.build_type; result->inputs = params.inputs; result->outputs = params.outputs; - result->opt_period = params.opt_period; - result->get_opt_pars = params.get_opt_pars; - result->get_opt_pars_ud = params.get_opt_pars_ud; - result->optimizer = params.optimizer; + result->opt_period = params.opt_period; + result->grad_checkpoint_interval = params.grad_checkpoint_interval; + result->get_opt_pars = params.get_opt_pars; + result->get_opt_pars_ud = params.get_opt_pars_ud; + result->optimizer = params.optimizer; GGML_ASSERT(result->opt_period >= 1); @@ -588,6 +633,12 @@ void ggml_opt_free(ggml_opt_context_t opt_ctx) { } ggml_backend_buffer_free(opt_ctx->buf_static); ggml_backend_buffer_free(opt_ctx->buf_cpu); + for (ggml_backend_buffer_t buf : opt_ctx->bufs_momenta) { + ggml_backend_buffer_free(buf); + } + for (struct ggml_context * ctx : opt_ctx->ctxs_momenta) { + ggml_free(ctx); + } ggml_free(opt_ctx->ctx_static); ggml_free(opt_ctx->ctx_cpu); delete opt_ctx; diff --git a/include/llama.h b/include/llama.h index 0bf8ead384..6a3a1ebe38 100644 --- a/include/llama.h +++ b/include/llama.h @@ -1552,6 +1552,12 @@ extern "C" { void * get_opt_pars_ud; // userdata for calculating optimizer parameters enum ggml_opt_optimizer_type optimizer_type; + + // Gradient checkpointing: mark every Nth forward graph node as persistent so the + // allocator cannot reuse its memory during backward. Reduces peak activation VRAM + // at the cost of ~0 extra compute (activations are kept, not recomputed). + // Set to 0 (default) to disable. Good values: 32–64 nodes ≈ every 1–2 transformer layers. 
+ int32_t grad_checkpoint_interval; }; LLAMA_API void llama_opt_init(struct llama_context * lctx, struct llama_model * model, struct llama_opt_params lopt_params); diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 9f67d47b50..ba98acd403 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -2618,11 +2618,71 @@ void llama_context::opt_init(struct llama_model * model, struct llama_opt_params GGML_ASSERT(model->hparams.n_ctx_train % n_batch == 0); GGML_ASSERT(n_batch % n_ubatch == 0); + // Recreate the scheduler and gf_res_prev with a training-inflated graph size before + // creating opt_ctx, so opt_ctx captures the new (larger) scheduler pointer. + // The backward graph (gb_grad) duplicates gf and adds ~2-3x more nodes+leafs; + // gb_opt adds optimizer step nodes on top. + // + // We measure the actual training forward graph node count at n_ubatch here, + // then multiply by 4 to cover gf + gb_grad + gb_opt. This is exact for any + // model size — no magic constant needed. + { + uint32_t train_fwd_nodes = 0; + + // Build a real training-ubatch forward graph in split-only mode (no buffer realloc) + // so we can count its actual nodes. Fall back to n_tensors formula if it fails. + if (memory) { + auto mctx_tmp = memory->init_full(); + if (mctx_tmp) { + // graph_reserve() uses gf_res_reserve to build the graph, so both + // must be large enough to hold the training forward graph. + // Use 16x n_tensors as a generous temporary cap for the measurement pass. 
+ const uint32_t tmp_cap = std::max(4096u, 16u * model->n_tensors()); + gf_res_prev.reset(new llm_graph_result(tmp_cap)); + gf_res_reserve.reset(new llm_graph_result(tmp_cap)); + // split_only=true: only splits the graph, doesn't reallocate compute buffers + auto * gf_train = graph_reserve(n_ubatch, 1, n_ubatch, mctx_tmp.get(), /*split_only=*/true); + if (gf_train) { + train_fwd_nodes = (uint32_t)ggml_graph_n_nodes(gf_train); + LLAMA_LOG_INFO("%s: measured training graph nodes = %u (n_ubatch=%u)\n", + __func__, train_fwd_nodes, n_ubatch); + } + } + } + + if (train_fwd_nodes == 0) { + // Fallback: use n_tensors formula + train_fwd_nodes = std::max(1024u, 8u * model->n_tensors()); + LLAMA_LOG_WARN("%s: could not measure training graph, using fallback nodes=%u\n", + __func__, train_fwd_nodes); + } + + // gf + gb_grad + gb_opt each need ~train_fwd_nodes; multiply by 4 for safety headroom. + // Multiply by 2 again for the scheduler's n_nodes + n_leafs check. + const int64_t inflated = (int64_t)std::max(train_fwd_nodes, 1024u) * 4; + const int64_t sched_size = inflated * 2; + // Both gf_res_prev and gf_res_reserve are used to build forward graphs + // (graph_reserve uses gf_res_reserve; opt_epoch_iter uses gf_res_prev). + // Both must have capacity for the full backward graph. + gf_res_prev.reset(new llm_graph_result(inflated)); + gf_res_reserve.reset(new llm_graph_result(inflated)); + sched.reset(ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), + sched_size, cparams.pipeline_parallel, cparams.op_offload)); + // Suppress the next sched_reserve() call so that llama_decode() during GRPO inference + // steps does NOT replace the training sched with a smaller inference sched. + // opt_ctx->backend_sched stores a raw pointer to sched.get(); replacing sched while + // opt_ctx is alive would leave that pointer dangling and crash on the next opt_epoch. 
+ sched_need_reserve = false; + LLAMA_LOG_INFO("%s: training graph capacity = %lld (train_fwd_nodes=%u x4)\n", + __func__, (long long)inflated, train_fwd_nodes); + } + ggml_opt_params opt_params = ggml_opt_default_params(sched.get(), GGML_OPT_LOSS_TYPE_CROSS_ENTROPY); - opt_params.opt_period = n_batch / n_ubatch; - opt_params.get_opt_pars = lopt_params.get_opt_pars; - opt_params.get_opt_pars_ud = lopt_params.get_opt_pars_ud; - opt_params.optimizer = lopt_params.optimizer_type; + opt_params.opt_period = n_batch / n_ubatch; + opt_params.get_opt_pars = lopt_params.get_opt_pars; + opt_params.get_opt_pars_ud = lopt_params.get_opt_pars_ud; + opt_params.optimizer = lopt_params.optimizer_type; + opt_params.grad_checkpoint_interval = lopt_params.grad_checkpoint_interval; opt_ctx = ggml_opt_init(opt_params); llama_opt_param_filter param_filter = lopt_params.param_filter; @@ -2706,6 +2766,8 @@ void llama_context::opt_epoch_iter( }; uint32_t pos_batch = 0; + static bool timings_printed = false; // print per-ubatch timings only for the first window + struct ggml_context * ctx_compute_opt = nullptr; do { const auto & ubatch = mctx->get_ubatch(); @@ -2718,26 +2780,38 @@ void llama_context::opt_epoch_iter( auto * res = gf_res_prev.get(); + const int64_t t0_build = ggml_time_ms(); const auto gparams = graph_params(res, ubatch, mctx.get(), LLM_GRAPH_TYPE_DEFAULT); res->reset(); auto * gf = model.build_graph(gparams); - struct ggml_context * ctx_compute_opt; - { + // Allocate the tensor metadata context once, then reset it each iteration. + // ggml_reset() is much cheaper than ggml_free()+ggml_init() — it just resets the + // allocation pointer without freeing/reallocating the backing memory buffer. 
+ if (!ctx_compute_opt) { const size_t size_gf = ggml_graph_size(gf); - const size_t size_meta = 4*size_gf*ggml_tensor_overhead() + 2*ggml_graph_overhead_custom(size_gf, /*grads = */ true); + const size_t size_meta = 4*size_gf*ggml_tensor_overhead() + 3*ggml_graph_overhead_custom(size_gf, /*grads = */ true); struct ggml_init_params params = { /*.mem_size =*/ size_meta, /*.mem_buffer =*/ nullptr, /*.no_alloc =*/ true, }; ctx_compute_opt = ggml_init(params); + if (!timings_printed) { + LLAMA_LOG_INFO("%s: [timing] graph capacity=%zu n_nodes=%d size_meta=%.1fMB\n", __func__, + size_gf, ggml_graph_n_nodes(gf), (double)size_meta / (1024*1024)); + } + } else { + ggml_reset(ctx_compute_opt); } + + const int64_t t1_alloc = ggml_time_ms(); ggml_opt_prepare_alloc(opt_ctx, ctx_compute_opt, gf, res->get_inp_tokens(), res->get_logits()); ggml_opt_alloc(opt_ctx, train); + const int64_t t2_inputs = ggml_time_ms(); res->set_inputs(&ubatch); { struct ggml_tensor * labels = ggml_opt_labels(opt_ctx); @@ -2753,14 +2827,29 @@ void llama_context::opt_epoch_iter( ggml_backend_tensor_set(labels, &reward_scale, (pos_ubatch*labels->ne[0] + labels_sparse[ilabel])*sizeof(float), sizeof(float)); } } + + const int64_t t3_eval = ggml_time_ms(); ggml_opt_eval(opt_ctx, result); + + const int64_t t4_done = ggml_time_ms(); + if (!timings_printed) { + LLAMA_LOG_INFO("%s: [timing] build=%" PRId64 "ms alloc=%" PRId64 "ms inputs=%" PRId64 "ms eval=%" PRId64 "ms total=%" PRId64 "ms\n", + __func__, + t1_alloc - t0_build, + t2_inputs - t1_alloc, + t3_eval - t2_inputs, + t4_done - t3_eval, + t4_done - t0_build); + timings_printed = true; + } + if (callback) { callback(train, opt_ctx, dataset, result, idata_in_loop + (pos_ctx + pos_batch)/n_ubatch + 1, ndata_in_loop, t_loop_start); } - ggml_free(ctx_compute_opt); pos_batch += ubatch.n_tokens; } while (mctx->next()); + ggml_free(ctx_compute_opt); } } From 3e4166d3fd9b4ba55ecebc60dcaf4d4e00fe8792 Mon Sep 17 00:00:00 2001 From: Salvatore Rossitto Date: Thu, 12 
Mar 2026 13:19:46 +0100 Subject: [PATCH 06/15] fixed assert in ggml.c GGML_ASSERT(ggml_nelements(adamw_params) == 8) --- ggml/src/ggml.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 1e04911360..f146f13b5f 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -6095,7 +6095,7 @@ struct ggml_tensor * ggml_opt_step_adamw( GGML_ASSERT(ggml_are_same_shape(a, m)); GGML_ASSERT(ggml_are_same_shape(a, v)); GGML_ASSERT(adamw_params->type == GGML_TYPE_F32); - GGML_ASSERT(ggml_nelements(adamw_params) == 7); + GGML_ASSERT(ggml_nelements(adamw_params) == 8); struct ggml_tensor * result = ggml_view_tensor(ctx, a); From e18d20d6c68464c749c6fed059dd3a5449810577 Mon Sep 17 00:00:00 2001 From: Salvatore Rossitto Date: Thu, 12 Mar 2026 13:28:07 +0100 Subject: [PATCH 07/15] fixed missing changes from dev version --- ggml/src/ggml-cuda/ggml-cuda.cu | 4 +++- ggml/src/ggml-cuda/out-prod.cu | 37 +++++++++++++++++++++++++-------- ggml/src/ggml.c | 22 +++++++++++++------- src/llama-adapter.cpp | 24 ++++++++++++++------- 4 files changed, 63 insertions(+), 24 deletions(-) diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index 3e1cca6c98..9e5492f5fc 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -4774,7 +4774,9 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g } } break; case GGML_OP_OUT_PROD: - return op->type == GGML_TYPE_F32 && op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32; + return op->type == GGML_TYPE_F32 + && (op->src[0]->type == GGML_TYPE_F32 || ggml_is_quantized(op->src[0]->type)) + && op->src[1]->type == GGML_TYPE_F32; case GGML_OP_OUT_PROD_ID: return op->src[0] != nullptr && op->src[1] != nullptr && op->src[2] != nullptr && op->type == GGML_TYPE_F32 diff --git a/ggml/src/ggml-cuda/out-prod.cu b/ggml/src/ggml-cuda/out-prod.cu index 9afc323bd9..392de34d1c 100644 --- a/ggml/src/ggml-cuda/out-prod.cu 
+++ b/ggml/src/ggml-cuda/out-prod.cu @@ -1,4 +1,5 @@ #include "out-prod.cuh" +#include "convert.cuh" #include #include @@ -10,7 +11,7 @@ void ggml_cuda_out_prod(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { GGML_TENSOR_BINARY_OP_LOCALS - GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(src0->type == GGML_TYPE_F32 || ggml_is_quantized(src0->type)); GGML_ASSERT(src1->type == GGML_TYPE_F32); GGML_ASSERT(dst->type == GGML_TYPE_F32); @@ -24,19 +25,37 @@ void ggml_cuda_out_prod(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { GGML_ASSERT(ne2 == src1->ne[2]); GGML_ASSERT(ne3 == src1->ne[3]); - const float * src0_d = (const float *) src0->data; - const float * src1_d = (const float *) src1->data; - float * dst_d = (float *) dst->data; - cudaStream_t stream = ctx.stream(); cublasHandle_t handle = ctx.cublas_handle(); + // If src0 is quantized, dequantize to a temp F32 buffer on GPU + ggml_cuda_pool_alloc src0_f32_alloc; + const float * src0_d; + int64_t lda; + + if (src0->type != GGML_TYPE_F32) { + const int64_t n_elements = ggml_nelements(src0); + src0_f32_alloc.alloc(ctx.pool(), n_elements); + + to_fp32_cuda_t to_fp32 = ggml_get_to_fp32_cuda(src0->type); + GGML_ASSERT(to_fp32 != nullptr); + to_fp32(src0->data, src0_f32_alloc.ptr, n_elements, stream); + + src0_d = src0_f32_alloc.ptr; + lda = ne00; // dequantized data is contiguous: stride = ne00 + } else { + src0_d = (const float *) src0->data; + lda = nb01 / sizeof(float); + } + + const float * src1_d = (const float *) src1->data; + float * dst_d = (float *) dst->data; + const float alpha = 1.0f; const float beta = 0.0f; CUBLAS_CHECK(cublasSetStream(handle, stream)); - const int64_t lda = nb01 / sizeof(float); const int64_t ldc = nb1 / sizeof(float); const bool src1_T = ggml_is_transposed(src1); @@ -44,9 +63,9 @@ void ggml_cuda_out_prod(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { const int64_t ldb = (src1_T ? nb10 : nb11) / sizeof(float); GGML_ASSERT( (src1_T ? 
nb11 : nb10) == sizeof(float)); - // data strides in dimensions 2/3 - const size_t s02 = nb02 / sizeof(float); - const size_t s03 = nb03 / sizeof(float); + // data strides in dimensions 2/3 (for dequantized src0, use element-based strides) + const size_t s02 = (src0->type != GGML_TYPE_F32) ? (ne00 * ne01) : (nb02 / sizeof(float)); + const size_t s03 = (src0->type != GGML_TYPE_F32) ? (ne00 * ne01 * ne02) : (nb03 / sizeof(float)); const size_t s12 = nb12 / sizeof(float); const size_t s13 = nb13 / sizeof(float); const size_t s2 = nb2 / sizeof(float); diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index f146f13b5f..255e7d5a88 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -3871,12 +3871,17 @@ struct ggml_tensor * ggml_get_rows_back( struct ggml_tensor * a, struct ggml_tensor * b, struct ggml_tensor * c) { - GGML_ASSERT(ggml_is_matrix(a) && ggml_is_vector(b) && b->type == GGML_TYPE_I32); - GGML_ASSERT(ggml_is_matrix(c) && (a->ne[0] == c->ne[0])); + GGML_ASSERT(b->type == GGML_TYPE_I32); + GGML_ASSERT(a->ne[0] == c->ne[0]); + // Support both 2D and 3D: result shape matches c (the source tensor shape) // TODO: implement non F32 return - //struct ggml_tensor * result = ggml_new_tensor_2d(ctx, a->type, a->ne[0], b->ne[0]); - struct ggml_tensor * result = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, c->ne[0], c->ne[1]); + struct ggml_tensor * result; + if (c->ne[2] > 1) { + result = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, c->ne[0], c->ne[1], c->ne[2]); + } else { + result = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, c->ne[0], c->ne[1]); + } result->op = GGML_OP_GET_ROWS_BACK; result->src[0] = a; @@ -7077,9 +7082,12 @@ void ggml_build_backward_expand( continue; } - // inplace operations are currently not supported - GGML_ASSERT(!node->view_src || node->op == GGML_OP_CPY || node->op == GGML_OP_VIEW || - node->op == GGML_OP_RESHAPE || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_TRANSPOSE); + // inplace operations are currently not supported — warn and skip instead of 
crashing + if (node->view_src && node->op != GGML_OP_CPY && node->op != GGML_OP_VIEW && + node->op != GGML_OP_RESHAPE && node->op != GGML_OP_PERMUTE && node->op != GGML_OP_TRANSPOSE) { + GGML_LOG_WARN("%s: skipping unsupported inplace op '%s' in backward graph\n", __func__, ggml_op_name(node->op)); + continue; + } const size_t ihash = ggml_hash_find(&cgraph->visited_hash_set, node); GGML_ASSERT(ihash != GGML_HASHSET_FULL); diff --git a/src/llama-adapter.cpp b/src/llama-adapter.cpp index d6a5800e63..1fee93ba40 100644 --- a/src/llama-adapter.cpp +++ b/src/llama-adapter.cpp @@ -334,16 +334,26 @@ static void llama_adapter_lora_init_impl(llama_model & model, const char * path_ auto * buft = ggml_backend_buffer_get_type(model_tensor->buffer); - // do not load loras to extra buffer types (i.e. bufts for repacking) -> use the CPU in that case + // do not load loras to extra buffer types (i.e. bufts for repacking) + // try device-native buft first (keeps LoRA on GPU), fall back to CPU only as last resort for (auto & ex : buft_extra) { if (ex == buft) { - LLAMA_LOG_WARN("%s: lora for '%s' cannot use buft '%s', fallback to CPU\n", __func__, model_tensor->name, ggml_backend_buft_name(buft)); - - auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU); - if (!cpu_dev) { - throw std::runtime_error(format("%s: no CPU backend found", __func__)); + // try to get the device's native (non-repack) buffer type + auto * dev = ggml_backend_buft_get_device(buft); + auto * native_buft = dev ? 
ggml_backend_dev_buffer_type(dev) : nullptr; + if (native_buft && native_buft != buft) { + LLAMA_LOG_WARN("%s: lora for '%s' cannot use repack buft '%s', using device-native '%s'\n", + __func__, model_tensor->name, ggml_backend_buft_name(buft), ggml_backend_buft_name(native_buft)); + buft = native_buft; + } else { + LLAMA_LOG_WARN("%s: lora for '%s' cannot use buft '%s', fallback to CPU\n", + __func__, model_tensor->name, ggml_backend_buft_name(buft)); + auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU); + if (!cpu_dev) { + throw std::runtime_error(format("%s: no CPU backend found", __func__)); + } + buft = ggml_backend_dev_buffer_type(cpu_dev); } - buft = ggml_backend_dev_buffer_type(cpu_dev); break; } From dbe24a74717b86236cfc71e278a7f7d42c4a244c Mon Sep 17 00:00:00 2001 From: Salvatore Rossitto Date: Thu, 12 Mar 2026 18:59:24 +0100 Subject: [PATCH 08/15] - fixed some python warning - removed nemotron_reap.py based on bnb (off topic) --- tools/moe-pruning/analyze_stats.py | 2 +- tools/moe-pruning/gguf_prune.py | 10 +- tools/moe-pruning/nemotron_reap.py | 296 ----------------------------- 3 files changed, 7 insertions(+), 301 deletions(-) delete mode 100644 tools/moe-pruning/nemotron_reap.py diff --git a/tools/moe-pruning/analyze_stats.py b/tools/moe-pruning/analyze_stats.py index e7641a3bb6..2e0821f323 100644 --- a/tools/moe-pruning/analyze_stats.py +++ b/tools/moe-pruning/analyze_stats.py @@ -3,7 +3,7 @@ analyze_stats.py -- Summarize expert_stats.json and model size projections. 
Usage: python analyze_stats.py [stats_file] [--keep 0.5] """ -import json, sys, statistics, argparse +import json, statistics, argparse parser = argparse.ArgumentParser() parser.add_argument("stats", nargs="?", default="expert_stats_reap.json") diff --git a/tools/moe-pruning/gguf_prune.py b/tools/moe-pruning/gguf_prune.py index 2a36d5cf53..df3e638ab4 100644 --- a/tools/moe-pruning/gguf_prune.py +++ b/tools/moe-pruning/gguf_prune.py @@ -31,14 +31,15 @@ Usage: --keep_n 32 """ +from __future__ import annotations + import argparse import json import re -import sys from pathlib import Path import numpy as np -from gguf import GGUFReader, GGUFWriter, GGMLQuantizationType, GGUFValueType +from gguf import GGUFReader, GGUFWriter, GGUFValueType # ── Constants ───────────────────────────────────────────────────────────────── @@ -187,7 +188,7 @@ def main(): kept: dict[int, list[int]] = {} for tensor in reader.tensors: il, suffix = layer_and_suffix(tensor.name) - if il is None or not is_expert_suffix(suffix): + if il is None or suffix is None or not is_expert_suffix(suffix): continue if il in kept: continue # already computed for this layer @@ -222,9 +223,10 @@ def main(): n_pruned = 0 for tensor in reader.tensors: il, suffix = layer_and_suffix(tensor.name) - is_expert = il is not None and is_expert_suffix(suffix) + is_expert = il is not None and suffix is not None and is_expert_suffix(suffix) if is_expert: + assert il is not None k = kept[il] data = slice_expert_axis(tensor.data, k) writer.add_tensor( diff --git a/tools/moe-pruning/nemotron_reap.py b/tools/moe-pruning/nemotron_reap.py deleted file mode 100644 index fac5831d3c..0000000000 --- a/tools/moe-pruning/nemotron_reap.py +++ /dev/null @@ -1,296 +0,0 @@ -""" -NemotronH Expert Activation Profiler + Pruner -Two-phase: profile with 4-bit on GPU, prune bf16 on CPU. 
- -Usage: - # Phase 1 - profile - python nemotron_reap.py profile \ - --model unsloth/Nemotron-3-Nano-30B-A3B \ - --prompts training-data.jsonl \ - --output expert_stats.json - - # Phase 2 - prune - python nemotron_reap.py prune \ - --model unsloth/Nemotron-3-Nano-30B-A3B \ - --stats expert_stats.json \ - --keep_ratio 0.20 \ - --output ./nemotron-pruned-25e -""" - -import os -os.environ["TORCH_COMPILE_DISABLE"] = "1" # prevent inductor hang during save_pretrained - -import json -import argparse -import torch -import numpy as np -from collections import defaultdict -from transformers import AutoTokenizer, AutoModelForCausalLM - -try: - from transformers import BitsAndBytesConfig - import patch_bnb # noqa: F401 — patches Params4bit.__new__ for transformers 5.x compat - HAS_BNB = True -except ImportError: - HAS_BNB = False - - -# ── Tracker ─────────────────────────────────────────────────────────────────── - -class ExpertActivationTracker: - def __init__(self, n_experts: int = 128): - self.n_experts = n_experts - self.activation_counts = defaultdict(lambda: np.zeros(n_experts, dtype=np.int64)) - self.activation_weights = defaultdict(lambda: np.zeros(n_experts, dtype=np.float64)) - self.total_tokens = defaultdict(int) - self._hooks = [] - - def register_hooks(self, model): - count = 0 - for layer_idx, block in enumerate(model.backbone.layers): - if block.block_type == "moe": - h = block.mixer.gate.register_forward_hook(self._make_hook(layer_idx)) - self._hooks.append(h) - count += 1 - print(f" Hooks attached to {count} MoE layers") - - def _make_hook(self, layer_idx): - def hook(module, input, output): - topk_indices, topk_weights = output - idx = topk_indices.detach().cpu().numpy() # [T, 6] - wgt = topk_weights.detach().float().cpu().numpy() # [T, 6] - T = idx.shape[0] - self.total_tokens[layer_idx] += T - np.add.at(self.activation_counts[layer_idx], idx.flatten(), 1) - np.add.at(self.activation_weights[layer_idx], idx.flatten(), wgt.flatten()) - return hook - - def 
remove_hooks(self): - for h in self._hooks: - h.remove() - self._hooks.clear() - - def get_stats(self) -> dict: - stats = {} - for layer_idx in sorted(self.activation_counts): - counts = self.activation_counts[layer_idx] - weights = self.activation_weights[layer_idx] - total = self.total_tokens[layer_idx] - freq = counts / (total + 1e-9) - avg_w = np.where(counts > 0, weights / counts, 0.0) - importance = freq * avg_w - stats[layer_idx] = { - "total_tokens": int(total), - "activation_counts": counts.tolist(), - "activation_frequency": freq.tolist(), - "avg_weight": avg_w.tolist(), - "importance_score": importance.tolist(), - "never_activated": int((counts == 0).sum()), - } - return stats - - def print_summary(self, stats, keep_ratio): - keep_n = max(1, int(self.n_experts * keep_ratio)) - print(f"\n{'='*70}") - print(f" PROFILING SUMMARY | keep_ratio={keep_ratio:.0%} | keeping {keep_n}/128 experts/layer") - print(f"{'='*70}") - for li, s in stats.items(): - imp = np.array(s['importance_score']) - threshold = np.sort(imp)[self.n_experts - keep_n] - print( - f" Layer {li:3d}: " - f"never_activated={s['never_activated']:3d}/128 " - f"top_freq={max(s['activation_frequency']):.3f} " - f"threshold={threshold:.4f}" - ) - total_moe = len(stats) - print(f"\n MoE layers : {total_moe}") - print(f" Kept : {total_moe * keep_n} experts total") - print(f" Pruned : {total_moe * (self.n_experts - keep_n)} experts total") - print(f"{'='*70}\n") - - -# ── Phase 1: Profile ────────────────────────────────────────────────────────── - -def cmd_profile(args): - # Mamba2 layers use Triton kernels — CUDA required. - # 4-bit NF4 fits in 16GB VRAM (~15GB). We must keep ALL layers on GPU - # (no CPU spillover) otherwise PCIe transfers make inference unusably slow. 
- print(f"\n[Phase 1] Profiling — 4-bit NF4, GPU only") - print(f" Model : {args.model}") - print(f" Prompts: {args.prompts}") - - bnb_config = BitsAndBytesConfig( - load_in_4bit=True, - bnb_4bit_quant_type="nf4", - bnb_4bit_compute_dtype=torch.bfloat16, - bnb_4bit_use_double_quant=True, - ) - - tokenizer = AutoTokenizer.from_pretrained(args.model, trust_remote_code=True) - print(" Loading model in 4-bit NF4...") - model = AutoModelForCausalLM.from_pretrained( - args.model, - trust_remote_code=True, - quantization_config=bnb_config, - device_map={"": 0}, # force ALL layers onto GPU 0, no CPU spillover - ) - model.eval() - print(" Model loaded on GPU.") - - # Load prompt+response pairs - pairs = [] - with open(args.prompts) as f: - for line in f: - line = line.strip() - if not line: - continue - obj = json.loads(line) - text = obj.get("prompt", "") + "\n" + obj.get("response", "") - pairs.append(text) - print(f" Loaded {len(pairs)} prompt+response pairs") - - tracker = ExpertActivationTracker(n_experts=128) - tracker.register_hooks(model) - - with torch.no_grad(): - for i, text in enumerate(pairs): - if i % 100 == 0: - print(f" [{i+1}/{len(pairs)}] processing...") - inputs = tokenizer( - text, - return_tensors="pt", - truncation=True, - max_length=args.max_length, - ).to("cuda") - try: - model(**inputs) - except torch.cuda.OutOfMemoryError: - print(f" [{i+1}] OOM — skipping") - torch.cuda.empty_cache() - - tracker.remove_hooks() - stats = tracker.get_stats() - tracker.print_summary(stats, keep_ratio=args.keep_ratio) - - stats_out = {str(k): v for k, v in stats.items()} - with open(args.output, "w") as f: - json.dump(stats_out, f, indent=2) - print(f" Stats saved → {args.output}") - print(f"\n[Phase 1] Done. 
Run 'prune' next (CPU only).") - - -# ── Phase 2: Prune ──────────────────────────────────────────────────────────── - -def cmd_prune(args): - print(f"\n[Phase 2] Pruning — bf16 on CPU") - print(f" Model : {args.model}") - print(f" Stats : {args.stats}") - print(f" Keep ratio : {args.keep_ratio:.0%}") - print(f" Output : {args.output}") - - with open(args.stats) as f: - stats = {int(k): v for k, v in json.load(f).items()} - - tokenizer = AutoTokenizer.from_pretrained(args.model, trust_remote_code=True) - - print(" Loading model in bf16 on CPU — this takes a few minutes...") - model = AutoModelForCausalLM.from_pretrained( - args.model, - trust_remote_code=True, - dtype=torch.bfloat16, - device_map="cpu", - ) - - keep_n = max(1, int(128 * args.keep_ratio)) - print(f"\n Pruning to top-{keep_n} experts per MoE layer...\n") - - for layer_idx, block in enumerate(model.backbone.layers): - if block.block_type != "moe": - continue - - if layer_idx not in stats: - print(f" Layer {layer_idx:3d}: no profiling data — skipping") - continue - - # Use REAP score if available (from llama.cpp profiler), else fall back to legacy importance_score - layer_stats = stats[layer_idx] - if "reap" in layer_stats: - importance = np.array(layer_stats["reap"]) - else: - importance = np.array(layer_stats["importance_score"]) - keep_sorted = sorted(np.argsort(importance)[-keep_n:].tolist()) - prune_count = 128 - len(keep_sorted) - - # Prune expert list - block.mixer.experts = torch.nn.ModuleList( - [block.mixer.experts[i] for i in keep_sorted] - ) - - # Prune router weights to match new expert indices - keep_t = torch.tensor(keep_sorted, dtype=torch.long) - block.mixer.gate.weight = torch.nn.Parameter( - block.mixer.gate.weight.data[keep_t].clone() - ) - old_bias = block.mixer.gate.e_score_correction_bias.data[keep_t].clone() - block.mixer.gate.register_buffer("e_score_correction_bias", old_bias) - block.mixer.gate.n_routed_experts = keep_n - - never = stats[layer_idx]["never_activated"] - 
print(f" Layer {layer_idx:3d}: kept {keep_n}, pruned {prune_count} (was {never} never-activated)") - - # Patch top-level config - model.config.n_routed_experts = keep_n - - # Fix transformers 5.x incompatibility: _tied_weights_keys must be a list of dicts, - # but the custom NemotronH modeling code sets it as a plain list of strings. - # _get_tied_weight_keys() calls .keys() on each element → AttributeError. - # Clear it — lm_head weight tying is not needed for inference on the pruned model. - for mod in model.modules(): - if isinstance(getattr(mod, '_tied_weights_keys', None), list): - mod._tied_weights_keys = None - - # Disable torch.compile / inductor before saving — transformers 5.x can trigger - # torch._inductor.compile_worker during save_pretrained, causing an indefinite hang. - import os - os.environ["TORCH_COMPILE_DISABLE"] = "1" - torch._dynamo.reset() - - print(f"\n Saving pruned model → {args.output}") - with torch.no_grad(): - model.save_pretrained(args.output, safe_serialization=True) - tokenizer.save_pretrained(args.output) - print(f"\n[Phase 2] Done.") - print(f" Experts per MoE layer : {keep_n}/128") - print(f" Next: fine-tune with Unsloth from {args.output}") - - -# ── Entry point ─────────────────────────────────────────────────────────────── - -def main(): - parser = argparse.ArgumentParser(description="NemotronH Expert Pruner (REAP-style)") - sub = parser.add_subparsers(dest="cmd", required=True) - - p1 = sub.add_parser("profile", help="Phase 1: profile expert activations (GPU, 4-bit)") - p1.add_argument("--model", default="unsloth/Nemotron-3-Nano-30B-A3B") - p1.add_argument("--prompts", required=True) - p1.add_argument("--output", default="expert_stats.json") - p1.add_argument("--keep_ratio", type=float, default=0.20, - help="Preview ratio for summary only — does not affect saved stats") - p1.add_argument("--max_length", type=int, default=2048) - - p2 = sub.add_parser("prune", help="Phase 2: prune model using saved stats (CPU, bf16)") - 
p2.add_argument("--model", default="unsloth/Nemotron-3-Nano-30B-A3B") - p2.add_argument("--stats", default="expert_stats.json") - p2.add_argument("--keep_ratio", type=float, default=0.20) - p2.add_argument("--output", default="./nemotron-pruned") - - args = parser.parse_args() - if args.cmd == "profile": - cmd_profile(args) - elif args.cmd == "prune": - cmd_prune(args) - - -if __name__ == "__main__": - main() From 2e324f6c9a9f7964c033d5edd2576a4e83a5fa66 Mon Sep 17 00:00:00 2001 From: Salvatore Rossitto Date: Thu, 12 Mar 2026 19:01:46 +0100 Subject: [PATCH 09/15] removed some python warning/unused import --- examples/qlora_training/grpo_example.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/examples/qlora_training/grpo_example.py b/examples/qlora_training/grpo_example.py index 9c7f2e3c26..c56ff9395f 100644 --- a/examples/qlora_training/grpo_example.py +++ b/examples/qlora_training/grpo_example.py @@ -34,7 +34,6 @@ Python → C++ stdin: import argparse import logging import math -import os import re import subprocess import sys @@ -99,6 +98,7 @@ def read_ipc(proc: subprocess.Popen, timeout: float = 120.0) -> Optional[Tuple[s Returns None on EOF. Raises TimeoutError if nothing arrives within `timeout` seconds. 
""" + assert proc.stdout is not None deadline = time.monotonic() + timeout while True: remaining = deadline - time.monotonic() @@ -120,6 +120,7 @@ def read_ipc(proc: subprocess.Popen, timeout: float = 120.0) -> Optional[Tuple[s def write_cmd(proc: subprocess.Popen, cmd: str): """Write one command line to the subprocess stdin.""" + assert proc.stdin is not None try: proc.stdin.write(cmd + "\n") proc.stdin.flush() @@ -268,7 +269,8 @@ def run_grpo(args: argparse.Namespace): raise finally: try: - proc.stdin.close() + if proc.stdin is not None: + proc.stdin.close() except Exception: pass rc = proc.wait(timeout=30) From 959f789bc9ba56e963b32a3ff637ed6f0c6ac97b Mon Sep 17 00:00:00 2001 From: Salvatore Rossitto Date: Thu, 12 Mar 2026 19:02:54 +0100 Subject: [PATCH 10/15] removed trailing whitespaces --- examples/qlora_training/check_lora_norms.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/qlora_training/check_lora_norms.py b/examples/qlora_training/check_lora_norms.py index 8908f3ad99..3f9d20c8c8 100644 --- a/examples/qlora_training/check_lora_norms.py +++ b/examples/qlora_training/check_lora_norms.py @@ -42,10 +42,10 @@ def read_gguf(path): pos = f.tell() align = 32 data_start = (pos + align - 1) & ~(align - 1) - + print(f"\nFile: {path}") print(f"Tensors: {n_tensors}") - + for name, dims, dtype, offset in tensors[:10]: # first 10 if dtype != 0: # only F32 (type 0) print(f" {name}: dims={dims} type={dtype} (non-F32, skipping norm)") From 1ebb82862ad7ff2ecc0fc6e432d40ad2b97bdd4c Mon Sep 17 00:00:00 2001 From: Salvatore Rossitto Date: Thu, 12 Mar 2026 19:03:45 +0100 Subject: [PATCH 11/15] added final newline in requirements.txt --- tools/moe-pruning/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/moe-pruning/requirements.txt b/tools/moe-pruning/requirements.txt index 8f778e9fe5..6b21f239f4 100644 --- a/tools/moe-pruning/requirements.txt +++ b/tools/moe-pruning/requirements.txt @@ -1 +1 @@ -gguf \ 
No newline at end of file +gguf From d1f8d527ba869d1c56808ee0105bba781c2a3902 Mon Sep 17 00:00:00 2001 From: Salvatore Rossitto Date: Thu, 12 Mar 2026 19:09:16 +0100 Subject: [PATCH 12/15] added new line at end --- examples/qlora_training/check_lora_norms.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/qlora_training/check_lora_norms.py b/examples/qlora_training/check_lora_norms.py index 3f9d20c8c8..c54ed4e7d1 100644 --- a/examples/qlora_training/check_lora_norms.py +++ b/examples/qlora_training/check_lora_norms.py @@ -61,4 +61,4 @@ if __name__ == '__main__': try: read_gguf(p) except Exception as e: - print(f"Error reading {p}: {e}") \ No newline at end of file + print(f"Error reading {p}: {e}") From 99c2456bbbf290074e9200e91af973d92a014774 Mon Sep 17 00:00:00 2001 From: Salvatore Rossitto Date: Fri, 13 Mar 2026 10:29:55 +0100 Subject: [PATCH 13/15] removed error guard on dataset (its not generic) --- examples/qlora_training/finetune_qlora.cpp | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/examples/qlora_training/finetune_qlora.cpp b/examples/qlora_training/finetune_qlora.cpp index 57972712fc..cf6705c5a6 100644 --- a/examples/qlora_training/finetune_qlora.cpp +++ b/examples/qlora_training/finetune_qlora.cpp @@ -217,11 +217,12 @@ static std::vector load_jsonl( break; } } - if (last_assistant_content.find("Error:") != std::string::npos || - last_assistant_content.find("error:") != std::string::npos) { - LOG_DBG("%s: skipping line %d — assistant response contains error marker\n", __func__, lineno); - continue; - } + // // this should be done on the python side... + // if (last_assistant_content.find("Error:") != std::string::npos || + // last_assistant_content.find("error:") != std::string::npos) { + // LOG_DBG("%s: skipping line %d — assistant response contains error marker\n", __func__, lineno); + // continue; + // } } // Split into prompt (no loss) + last assistant response (loss). 
@@ -262,11 +263,12 @@ static std::vector load_jsonl( } } else if (j.contains("prompt") && j.contains("response")) { response_text = j["response"].get(); - if (response_text.find("Error:") != std::string::npos || - response_text.find("error:") != std::string::npos) { - LOG_DBG("%s: skipping line %d — response contains error marker\n", __func__, lineno); - continue; - } + // // this should be done on the python side... + // if (response_text.find("Error:") != std::string::npos || + // response_text.find("error:") != std::string::npos) { + // LOG_DBG("%s: skipping line %d — response contains error marker\n", __func__, lineno); + // continue; + // } prompt_text = j["prompt"].get(); } else if (j.contains("text")) { response_text = j["text"].get(); From a6979f3d5667019998e04879ead5b2cae91fa276 Mon Sep 17 00:00:00 2001 From: Salvatore Rossitto Date: Tue, 17 Mar 2026 09:07:21 +0100 Subject: [PATCH 14/15] added sigmoid backward pass --- ggml/src/ggml.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index f85f45b3de..4ac4d9ea7d 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -6837,6 +6837,13 @@ static void ggml_compute_backward( ggml_add_or_set(ctx, cgraph, isrc0, ggml_mul(ctx, grad, ggml_exp(ctx, src0))); } } break; + case GGML_UNARY_OP_SIGMOID: { + // d/dx sigmoid(x) = sigmoid(x) * (1 - sigmoid(x)) = tensor - tensor^2 + if (src0_needs_grads) { + struct ggml_tensor * dsigmoid = ggml_sub(ctx, tensor, ggml_sqr(ctx, tensor)); + ggml_add_or_set(ctx, cgraph, isrc0, ggml_mul(ctx, grad, dsigmoid)); + } + } break; case GGML_UNARY_OP_SOFTPLUS: { if (src0_needs_grads) { ggml_add_or_set(ctx, cgraph, isrc0, ggml_mul(ctx, grad, ggml_sigmoid(ctx, src0))); From 51fee3e7ef73cc35ad76da07805597d5f2b9c672 Mon Sep 17 00:00:00 2001 From: Salvatore Rossitto Date: Wed, 18 Mar 2026 08:05:05 +0100 Subject: [PATCH 15/15] removed moe reap code merged by mistake --- tools/expert-profile/CMakeLists.txt | 8 - tools/expert-profile/expert-profile.cpp | 506 
--------------------- tools/moe-pruning/README.md | 97 ---- tools/moe-pruning/analyze_stats.py | 284 ------------ tools/moe-pruning/build_expert_profile.sh | 42 -- tools/moe-pruning/extract_ppl.py | 41 -- tools/moe-pruning/gguf_prune.py | 260 ----------- tools/moe-pruning/requirements.txt | 1 - tools/moe-pruning/sample_calibration.jsonl | 8 - 9 files changed, 1247 deletions(-) delete mode 100644 tools/expert-profile/CMakeLists.txt delete mode 100644 tools/expert-profile/expert-profile.cpp delete mode 100644 tools/moe-pruning/README.md delete mode 100644 tools/moe-pruning/analyze_stats.py delete mode 100644 tools/moe-pruning/build_expert_profile.sh delete mode 100644 tools/moe-pruning/extract_ppl.py delete mode 100644 tools/moe-pruning/gguf_prune.py delete mode 100644 tools/moe-pruning/requirements.txt delete mode 100644 tools/moe-pruning/sample_calibration.jsonl diff --git a/tools/expert-profile/CMakeLists.txt b/tools/expert-profile/CMakeLists.txt deleted file mode 100644 index 859bd77a53..0000000000 --- a/tools/expert-profile/CMakeLists.txt +++ /dev/null @@ -1,8 +0,0 @@ -set(TARGET llama-expert-profile) -add_executable(${TARGET} expert-profile.cpp) -target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_17) - -if(LLAMA_TOOLS_INSTALL) - install(TARGETS ${TARGET} RUNTIME) -endif() diff --git a/tools/expert-profile/expert-profile.cpp b/tools/expert-profile/expert-profile.cpp deleted file mode 100644 index de381ff1f1..0000000000 --- a/tools/expert-profile/expert-profile.cpp +++ /dev/null @@ -1,506 +0,0 @@ -/** - * expert-profile: NemotronH MoE expert activation profiler (REAP implementation) - * - * Implements the REAP (Router-weighted Expert Activation Pruning) saliency criterion: - * - * REAP(j) = mean over tokens routed to j of: gate_weight(j,t) * ||expert_output(j,t)||_2 - * - * where expert_output is ffn_moe_down (the FFN output BEFORE gate weighting), - * and gate_weight is 
ffn_moe_weights (post-softmax routing probability). - * - * Intercepts three tensors per MoE layer via ggml eval callback: - * ffn_moe_topk-{il} [n_expert_used, n_tokens] I32 — which experts were selected - * ffn_moe_weights-{il} [1, n_expert_used, n_tokens] F32 — gate weights (softmax probs) - * ffn_moe_down-{il} [n_embd, n_expert_used, n_tokens] F32 — expert outputs (pre-weighting) - * - * Reference: "REAP: Router-weighted Expert Activation Pruning" (arXiv:2510.13999) - * score = mean_{x in X_j}[ g_j(x) * ||f_j(x)||_2 ] (Equation 9) - * - * Usage: - * llama-expert-profile \ - * -m model.gguf --jsonl training-data.jsonl --output expert_stats.json \ - * [--n-experts 128] [--ctx-size 16384] [-ngl 32] [-t 24] [--save-every 1] - */ - -#include "arg.h" -#include "common.h" -#include "log.h" -#include "llama.h" -#include "ggml-backend.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -// ─── Per-layer stats ────────────────────────────────────────────────────────── - -struct LayerStats { - int64_t n_experts = 0; - int64_t total_tokens = 0; // tokens processed through this layer - - // Frequency / weighted-frequency (kept for reference/comparison) - std::vector activation_counts; // [n_experts] — how many tokens routed here - std::vector weighted_freq_sum; // [n_experts] — sum of gate weights - - // REAP: running sum and count for computing mean(gate_weight * ||expert_out||_2) - std::vector reap_sum; // [n_experts] — sum of g_j(t)*||f_j(t)||_2 - std::vector ean_sum; // [n_experts] — sum of ||f_j(t)||_2 (EAN, no gate) - - void init(int64_t n) { - n_experts = n; - activation_counts.assign(n, 0); - weighted_freq_sum.assign(n, 0.0); - reap_sum.assign(n, 0.0); - ean_sum.assign(n, 0.0); - } - - // Called once we have all three tensors for a batch. 
- // expert_ids: [n_expert_used * n_tokens] I32 — flat, column-major: [k + t*n_expert_used] - // gate_weights:[n_expert_used * n_tokens] F32 — same layout - // expert_outs: [n_embd * n_expert_used * n_tokens] F32 — layout: [e + k*n_embd + t*n_embd*n_expert_used] - // i.e. for token t, expert-slot k: out vector starts at t*n_embd*n_expert_used + k*n_embd - void add_batch(const int32_t * expert_ids, - const float * gate_weights, - const float * expert_outs, - int64_t n_expert_used, - int64_t n_tok, - int64_t n_embd) { - total_tokens += n_tok; - for (int64_t t = 0; t < n_tok; ++t) { - for (int64_t k = 0; k < n_expert_used; ++k) { - const int64_t flat = k + t * n_expert_used; - const int32_t eid = expert_ids[flat]; - if (eid < 0 || eid >= n_experts) continue; - - const float gw = gate_weights[flat]; - - // L2 norm of expert output vector for this (token, expert-slot) - const float * vec = expert_outs + t * n_embd * n_expert_used + k * n_embd; - double norm2 = 0.0; - for (int64_t d = 0; d < n_embd; ++d) { - norm2 += (double)vec[d] * (double)vec[d]; - } - const double norm = std::sqrt(norm2); - - activation_counts [eid] += 1; - weighted_freq_sum [eid] += gw; - reap_sum [eid] += gw * norm; // REAP numerator - ean_sum [eid] += norm; // EAN numerator - } - } - } -}; - -// ─── Collector ──────────────────────────────────────────────────────────────── - -struct ExpertCollector { - int64_t n_experts = 128; - - std::map layer_stats; - std::mutex mtx; - - // We need all three tensors before we can compute REAP. - // They arrive in order: topk → weights → down (per the graph build order). - // Store pending topk+weights until down arrives. 
- struct PendingBatch { - int64_t n_expert_used = 0; - int64_t n_tokens = 0; - std::vector expert_ids; // [n_expert_used * n_tokens] - std::vector gate_weights; // [n_expert_used * n_tokens] - bool has_topk = false; - bool has_weights = false; - }; - std::map pending; // layer_idx → pending - - // Strip device prefix/suffix: "CUDA0#ffn_moe_down-5#0" → "ffn_moe_down-5" - static std::string clean_name(const char * raw) { - const char * p = strchr(raw, '#'); - if (p) { - ++p; - const char * q = strchr(p, '#'); - return q ? std::string(p, q - p) : std::string(p); - } - return raw; - } - - bool wants(struct ggml_tensor * t) { - if (!t->name[0]) return false; - const std::string n = clean_name(t->name); - return (n.compare(0, 13, "ffn_moe_topk-") == 0 || - n.compare(0, 16, "ffn_moe_weights-") == 0 || - n.compare(0, 13, "ffn_moe_down-") == 0); - } - - bool on_tensor(struct ggml_tensor * t) { - const std::string name = clean_name(t->name); - - // Identify tensor type and layer - int il = -1; - bool is_topk = false; - bool is_weights = false; - bool is_down = false; - - if (name.compare(0, 13, "ffn_moe_topk-") == 0) { il = atoi(name.c_str() + 13); is_topk = true; } - else if (name.compare(0, 16, "ffn_moe_weights-") == 0) { il = atoi(name.c_str() + 16); is_weights = true; } - else if (name.compare(0, 13, "ffn_moe_down-") == 0) { il = atoi(name.c_str() + 13); is_down = true; } - else return true; - - if (il < 0) return true; - - // Copy tensor data from (possibly GPU) buffer to host - const size_t nbytes = ggml_nbytes(t); - std::vector buf(nbytes); - ggml_backend_tensor_get(t, buf.data(), 0, nbytes); - - std::lock_guard lk(mtx); - PendingBatch & pb = pending[il]; - - if (is_topk) { - // [n_expert_used, n_tokens] I32 - pb.n_expert_used = t->ne[0]; - pb.n_tokens = t->ne[1]; - pb.expert_ids.resize(pb.n_expert_used * pb.n_tokens); - memcpy(pb.expert_ids.data(), buf.data(), pb.n_expert_used * pb.n_tokens * sizeof(int32_t)); - pb.has_topk = true; - pb.has_weights = false; // reset 
in case of re-use - - } else if (is_weights) { - // [1, n_expert_used, n_tokens] F32 — flat layout same as topk - if (!pb.has_topk) return true; // shouldn't happen - pb.gate_weights.resize(pb.n_expert_used * pb.n_tokens); - memcpy(pb.gate_weights.data(), buf.data(), pb.n_expert_used * pb.n_tokens * sizeof(float)); - pb.has_weights = true; - - } else if (is_down) { - // [n_embd, n_expert_used, n_tokens] F32 - if (!pb.has_topk || !pb.has_weights) return true; - - const int64_t n_embd = t->ne[0]; - const int64_t n_expert_used = t->ne[1]; - const int64_t n_tokens = t->ne[2]; - - // Sanity check - if (n_expert_used != pb.n_expert_used || n_tokens != pb.n_tokens) { - LOG_ERR("expert-profile: dimension mismatch at layer %d\n", il); - pending.erase(il); - return true; - } - - // Ensure layer stats initialised - auto & ls = layer_stats[il]; - if (ls.n_experts == 0) ls.init(n_experts); - - const float * expert_outs = reinterpret_cast(buf.data()); - ls.add_batch(pb.expert_ids.data(), pb.gate_weights.data(), - expert_outs, n_expert_used, n_tokens, n_embd); - - // Done with this batch for this layer - pending.erase(il); - } - - return true; - } -}; - -// ─── Global collector + C callback ─────────────────────────────────────────── - -static ExpertCollector g_collector; - -static bool expert_eval_callback(struct ggml_tensor * t, bool ask, void * /*user_data*/) { - if (ask) return g_collector.wants(t); - return g_collector.on_tensor(t); -} - -// ─── JSON output ────────────────────────────────────────────────────────────── - -static void save_stats(const std::string & path) { - std::ofstream f(path); - if (!f) { - LOG_ERR("expert-profile: failed to open output file '%s'\n", path.c_str()); - return; - } - - f << "{\n"; - bool first_layer = true; - for (auto & [il, ls] : g_collector.layer_stats) { - if (!first_layer) f << ",\n"; - first_layer = false; - - f << " \"" << il << "\": {\n"; - f << " \"total_tokens\": " << ls.total_tokens << ",\n"; - - // activation_counts - f << " 
\"activation_counts\": ["; - for (int64_t i = 0; i < ls.n_experts; ++i) { - if (i) f << ", "; - f << ls.activation_counts[i]; - } - f << "],\n"; - - // activation_frequency - f << " \"activation_frequency\": ["; - for (int64_t i = 0; i < ls.n_experts; ++i) { - if (i) f << ", "; - f << ((ls.total_tokens > 0) ? (double)ls.activation_counts[i] / ls.total_tokens : 0.0); - } - f << "],\n"; - - // avg_gate_weight (weighted_freq_sum / activation_counts) - f << " \"avg_gate_weight\": ["; - for (int64_t i = 0; i < ls.n_experts; ++i) { - if (i) f << ", "; - f << ((ls.activation_counts[i] > 0) ? ls.weighted_freq_sum[i] / ls.activation_counts[i] : 0.0); - } - f << "],\n"; - - // ean_mean = ean_sum / activation_counts (EAN criterion, no gate weight) - f << " \"ean_mean\": ["; - for (int64_t i = 0; i < ls.n_experts; ++i) { - if (i) f << ", "; - f << ((ls.activation_counts[i] > 0) ? ls.ean_sum[i] / ls.activation_counts[i] : 0.0); - } - f << "],\n"; - - // reap = reap_sum / activation_counts (REAP criterion, Eq.9) - f << " \"reap\": ["; - for (int64_t i = 0; i < ls.n_experts; ++i) { - if (i) f << ", "; - f << ((ls.activation_counts[i] > 0) ? 
ls.reap_sum[i] / ls.activation_counts[i] : 0.0); - } - f << "],\n"; - - // never_activated - int64_t never = 0; - for (int64_t i = 0; i < ls.n_experts; ++i) { - if (ls.activation_counts[i] == 0) ++never; - } - f << " \"never_activated\": " << never << "\n"; - f << " }"; - } - f << "\n}\n"; - - LOG_INF("expert-profile: stats saved to '%s' (%zu MoE layers)\n", - path.c_str(), g_collector.layer_stats.size()); -} - -// ─── JSONL input ────────────────────────────────────────────────────────────── - -struct JsonPair { std::string prompt, response; }; - -static bool json_get_string(const std::string & line, const std::string & key, std::string & out) { - std::string search = "\"" + key + "\""; - size_t kpos = line.find(search); - if (kpos == std::string::npos) return false; - size_t colon = line.find(':', kpos + search.size()); - if (colon == std::string::npos) return false; - size_t q1 = line.find('"', colon + 1); - if (q1 == std::string::npos) return false; - out.clear(); - for (size_t i = q1 + 1; i < line.size(); ++i) { - if (line[i] == '\\' && i + 1 < line.size()) { - ++i; - switch (line[i]) { - case '"': out += '"'; break; - case '\\': out += '\\'; break; - case 'n': out += '\n'; break; - case 'r': out += '\r'; break; - case 't': out += '\t'; break; - default: out += line[i]; break; - } - } else if (line[i] == '"') { - return true; - } else { - out += line[i]; - } - } - return false; -} - -static std::vector load_jsonl(const std::string & path) { - std::vector pairs; - std::ifstream f(path); - if (!f) { LOG_ERR("expert-profile: cannot open JSONL file '%s'\n", path.c_str()); return pairs; } - std::string line; - while (std::getline(f, line)) { - if (line.empty()) continue; - JsonPair p; - json_get_string(line, "prompt", p.prompt); - json_get_string(line, "response", p.response); - if (!p.prompt.empty() || !p.response.empty()) pairs.push_back(std::move(p)); - } - return pairs; -} - -// ─── Inference loop ─────────────────────────────────────────────────────────── - 
-static void run_inference(llama_context * ctx, - const llama_model * model, - const std::vector & pairs, - int max_tokens, - const std::string & output_path, - int save_every) { - const llama_vocab * vocab = llama_model_get_vocab(model); - const bool add_bos = llama_vocab_get_add_bos(vocab); - - llama_batch batch = llama_batch_init(max_tokens, 0, 1); - - for (size_t pi = 0; pi < pairs.size(); ++pi) { - const std::string text = pairs[pi].prompt + "\n" + pairs[pi].response; - - std::vector tokens = common_tokenize(ctx, text, add_bos, true); - if ((int)tokens.size() > max_tokens) tokens.resize(max_tokens); - if (tokens.empty()) continue; - - LOG_INF(" [%zu/%zu] %zu tokens\n", pi + 1, pairs.size(), tokens.size()); - - llama_memory_clear(llama_get_memory(ctx), true); - - common_batch_clear(batch); - for (int i = 0; i < (int)tokens.size(); ++i) { - common_batch_add(batch, tokens[i], i, {0}, false); - } - batch.logits[batch.n_tokens - 1] = true; - - if (llama_decode(ctx, batch) != 0) { - LOG_ERR(" [%zu/%zu] llama_decode failed — skipping\n", pi + 1, pairs.size()); - } - - if (save_every > 0 && (pi + 1) % save_every == 0) { - save_stats(output_path); - } - } - - llama_batch_free(batch); -} - -// ─── CLI ────────────────────────────────────────────────────────────────────── - -int main(int argc, char ** argv) { - std::string model_path; - std::string jsonl_path; - std::string output_path = "expert_stats.json"; - int n_experts = 128; - int ctx_size = 2048; - int n_gpu_layers = 99; - int n_threads = 4; - int save_every = 100; - enum ggml_type kv_type_k = GGML_TYPE_F16; - enum ggml_type kv_type_v = GGML_TYPE_F16; - - auto parse_ggml_type = [](const char * s) -> enum ggml_type { - if (strcmp(s, "f32") == 0) return GGML_TYPE_F32; - if (strcmp(s, "f16") == 0) return GGML_TYPE_F16; - if (strcmp(s, "q8_0") == 0) return GGML_TYPE_Q8_0; - if (strcmp(s, "q4_0") == 0) return GGML_TYPE_Q4_0; - fprintf(stderr, "Unknown KV type '%s', using f16\n", s); return GGML_TYPE_F16; - }; - - for 
(int i = 1; i < argc; ++i) { - std::string a(argv[i]); - auto next = [&]() -> const char * { - if (i + 1 >= argc) { fprintf(stderr, "Missing argument for %s\n", argv[i]); exit(1); } - return argv[++i]; - }; - if (a == "-m" || a == "--model") model_path = next(); - else if (a == "--jsonl") jsonl_path = next(); - else if (a == "--output") output_path = next(); - else if (a == "--n-experts") n_experts = atoi(next()); - else if (a == "--ctx-size" || a == "-c") ctx_size = atoi(next()); - else if (a == "-ngl" || a == "--n-gpu-layers") n_gpu_layers = atoi(next()); - else if (a == "-t" || a == "--threads") n_threads = atoi(next()); - else if (a == "--type-k") kv_type_k = parse_ggml_type(next()); - else if (a == "--type-v") kv_type_v = parse_ggml_type(next()); - else if (a == "--save-every") save_every = atoi(next()); - else if (a == "-h" || a == "--help") { - fprintf(stderr, - "\nUsage: %s -m model.gguf --jsonl data.jsonl [options]\n" - " --output PATH Output JSON (default: expert_stats.json)\n" - " --n-experts N Experts per layer (default: 128)\n" - " --ctx-size N Context length (default: 2048)\n" - " -ngl N GPU layers (default: 99)\n" - " -t N CPU threads (default: 4)\n" - " --type-k/v TYPE KV cache type: f32/f16/q8_0/q4_0 (default: f16)\n" - " --save-every N Checkpoint every N samples (default: 100)\n\n", argv[0]); - return 0; - } else { - fprintf(stderr, "Unknown argument: %s\n", a.c_str()); return 1; - } - } - - if (model_path.empty()) { fprintf(stderr, "Error: -m required\n"); return 1; } - if (jsonl_path.empty()) { fprintf(stderr, "Error: --jsonl required\n"); return 1; } - - g_collector.n_experts = n_experts; - - LOG_INF("expert-profile: model = %s\n", model_path.c_str()); - LOG_INF("expert-profile: jsonl = %s\n", jsonl_path.c_str()); - LOG_INF("expert-profile: output = %s\n", output_path.c_str()); - LOG_INF("expert-profile: n_experts = %d\n", n_experts); - LOG_INF("expert-profile: ctx_size = %d\n", ctx_size); - LOG_INF("expert-profile: ngl = %d\n", n_gpu_layers); 
- LOG_INF("expert-profile: criterion = REAP (gate_weight * ||expert_out||_2)\n"); - - auto pairs = load_jsonl(jsonl_path); - if (pairs.empty()) { LOG_ERR("expert-profile: no pairs loaded\n"); return 1; } - LOG_INF("expert-profile: loaded %zu pairs\n", pairs.size()); - - llama_backend_init(); - - // Suppress INFO/WARN spam (CUDA graph warmup etc.), only pass errors through - llama_log_set([](enum ggml_log_level level, const char * text, void *) { - if (level >= GGML_LOG_LEVEL_ERROR) fputs(text, stderr); - }, nullptr); - - llama_model_params mparams = llama_model_default_params(); - mparams.n_gpu_layers = n_gpu_layers; - - llama_model * model = llama_model_load_from_file(model_path.c_str(), mparams); - if (!model) { LOG_ERR("expert-profile: failed to load model\n"); return 1; } - - llama_context_params cparams = llama_context_default_params(); - cparams.n_ctx = ctx_size; - cparams.n_batch = ctx_size; - cparams.n_ubatch = std::min(ctx_size, 512); - cparams.n_threads = n_threads; - cparams.type_k = kv_type_k; - cparams.type_v = kv_type_v; - cparams.cb_eval = expert_eval_callback; - cparams.cb_eval_user_data = nullptr; - - llama_context * ctx = llama_init_from_model(model, cparams); - if (!ctx) { LOG_ERR("expert-profile: failed to create context\n"); return 1; } - - LOG_INF("expert-profile: running forward passes...\n"); - run_inference(ctx, model, pairs, ctx_size, output_path, save_every); - save_stats(output_path); - - // Summary - LOG_INF("\n MoE layers profiled: %zu\n", g_collector.layer_stats.size()); - for (auto & [il, ls] : g_collector.layer_stats) { - // Find top and bottom REAP expert - int64_t top_e = 0, bot_e = 0; - double top_v = 0.0, bot_v = 1e18; - for (int64_t i = 0; i < ls.n_experts; ++i) { - double v = (ls.activation_counts[i] > 0) ? 
ls.reap_sum[i] / ls.activation_counts[i] : 0.0; - if (v > top_v) { top_v = v; top_e = i; } - if (v < bot_v) { bot_v = v; bot_e = i; } - } - int64_t never = 0; - for (int64_t i = 0; i < ls.n_experts; ++i) - if (ls.activation_counts[i] == 0) ++never; - LOG_INF(" Layer %3d: tokens=%lld never=%lld reap_top=e%lld(%.4f) reap_bot=e%lld(%.4f)\n", - il, (long long)ls.total_tokens, (long long)never, - (long long)top_e, top_v, (long long)bot_e, bot_v); - } - - llama_free(ctx); - llama_model_free(model); - llama_backend_free(); - return 0; -} diff --git a/tools/moe-pruning/README.md b/tools/moe-pruning/README.md deleted file mode 100644 index a88499ac43..0000000000 --- a/tools/moe-pruning/README.md +++ /dev/null @@ -1,97 +0,0 @@ -# MoE Expert Pruning Tools for NemotronH - -REAP-style expert pruning for `NVIDIA-Nemotron-3-Nano-30B-A3B` (and other -NemotronH MoE models), implemented in two complementary ways: - -1. **`tools/expert-profile/`** — C++ profiler built into llama.cpp, collects - REAP scores directly from GGUF inference via the ggml eval callback. -2. **`tools/moe-pruning/`** (this directory) — Python scripts to prune the model - using the collected scores, either on a GGUF file directly or on a - HuggingFace BF16 checkpoint. - ---- - -## Inspiration & Prior Art - -This work is a direct implementation of the **REAP** saliency criterion -introduced in: - -> **REAP the Experts: Why Pruning Prevails for One-Shot MoE Compression** -> Mike Lasby, Ivan Lazarevich, Nish Sinnadurai, Sean Lie, Yani Ioannou, Vithursan Thangarasa -> Cerebras Research, 2025 -> arXiv: https://arxiv.org/abs/2510.13999 -> Code: https://github.com/CerebrasResearch/reap - -The REAP score for expert `j` is (Equation 9 of the paper): - -``` -REAP(j) = mean_{t : j ∈ topk(t)} [ g_j(t) · ‖f_j(t)‖₂ ] -``` - -where `g_j(t)` is the router gate weight and `f_j(t)` is the expert FFN output -(pre-weighting) for token `t`. 
Experts with the lowest REAP score contribute -least to the layer output and are pruned first. - -The original REAP repo targets HuggingFace models via PyTorch hooks on -standard architectures (Qwen3-MoE, Mixtral, DeepSeek-V2, Llama-4, …). - -**What we added / adapted:** - -- `tools/expert-profile/expert-profile.cpp` — llama.cpp C++ implementation - of REAP that intercepts `ffn_moe_topk`, `ffn_moe_weights`, and `ffn_moe_down` - tensors via `ggml_backend_eval_callback`, enabling REAP profiling on any - GGUF-quantised model (Q4_K_M, Q6_K, etc.) without needing full BF16 VRAM. - -- `gguf_prune.py` — prunes the GGUF file **directly**, slicing the expert axis - of the stacked weight tensors (`ffn_up_exps`, `ffn_down_exps`, `ffn_gate_inp`, - `ffn_exp_probs_b`) and patching `{arch}.expert_count` in the metadata. - Quantised blocks are preserved as raw bytes — no dequantise/requantise step. - -- `nemotron_reap.py` — HuggingFace-based alternative: profiles with 4-bit NF4 - on GPU (phase 1) and prunes the BF16 checkpoint on CPU (phase 2). Adds - NemotronH (`NemotronHForCausalLM`) support that the original REAP repo does - not have. - ---- - -## Recommended Workflow (low-VRAM, e.g. RTX 4060 Ti 16 GB) - -``` -┌─────────────────────────────────────────────┐ -│ Phase 1 — Profile (GPU, GGUF Q4, ~15 GB) │ -│ │ -│ llama-expert-profile │ -│ -m nemotron-Q4_K_M.gguf │ -│ --jsonl sample_calibration.jsonl │ -│ --output expert_stats.json │ -│ -ngl 99 --ctx-size 2048 │ -└───────────────────┬─────────────────────────┘ - │ expert_stats.json -┌───────────────────▼─────────────────────────┐ -│ Phase 2 — Prune (CPU, pure Python, ~2 GB) │ -│ │ -│ python gguf_prune.py │ -│ --input nemotron-Q4_K_M.gguf │ -│ --stats expert_stats.json │ -│ --output nemotron-pruned-26e.gguf │ -│ --keep_ratio 0.20 # 26/128 experts │ -└─────────────────────────────────────────────┘ -``` - -At 20 % keep ratio a ~22 GB Q4_K_M becomes ~4.5 GB. 
- ---- - -## Files - -| File | Description | -|---|---| -| `gguf_prune.py` | GGUF-native pruner — no GPU needed, preserves quantisation | -| `nemotron_reap.py` | HF-based pruner — 4-bit GPU profile + CPU BF16 prune | -| `build_expert_profile.sh` | Build script for `llama-expert-profile` | -| `run_nemotron_profile.sh` | Example profiling run | -| `run_prune.sh` | Example pruning run | -| `run_convert_quantize.sh` | Convert HF → GGUF and quantise | -| `analyze_stats.py` | Visualise and compare expert stats JSON files | -| `sample_calibration.jsonl` | Sample calibration data (prompt+response pairs) | -| `expert_stats_reap.json` | Example stats output from expert-profile | diff --git a/tools/moe-pruning/analyze_stats.py b/tools/moe-pruning/analyze_stats.py deleted file mode 100644 index 2e0821f323..0000000000 --- a/tools/moe-pruning/analyze_stats.py +++ /dev/null @@ -1,284 +0,0 @@ -#!/usr/bin/env python3 -""" -analyze_stats.py -- Summarize expert_stats.json and model size projections. -Usage: python analyze_stats.py [stats_file] [--keep 0.5] -""" -import json, statistics, argparse - -parser = argparse.ArgumentParser() -parser.add_argument("stats", nargs="?", default="expert_stats_reap.json") -parser.add_argument("--keep", type=float, default=0.5, help="Fraction of experts to keep (default 0.5)") -args = parser.parse_args() - -with open(args.stats) as f: - data = json.load(f) - -layers = sorted(data.keys(), key=int) -n_layers = len(layers) -keep_ratio = args.keep - -# Detect which scoring field is available (new REAP vs old importance_score) -sample_layer = data[layers[0]] -if "reap" in sample_layer: - score_field = "reap" - score_label = "REAP (gate_weight × ||expert_out||₂)" -elif "importance_score" in sample_layer: - score_field = "importance_score" - score_label = "importance_score (freq × avg_gate_weight) [legacy, no EAN]" -else: - raise ValueError(f"No recognised score field in stats. 
Keys: {list(sample_layer.keys())}") - -# ── Model architecture constants (Nemotron-3-Nano-30B-A3B) ────────────────── -N_EXPERTS = 128 -N_EXPERT_USED = 6 # top-k per token -N_MOE_LAYERS = 23 -N_TOTAL_LAYERS = 53 -# Approximate parameter counts (bf16, billions) -PARAMS_TOTAL_B = 30.0 -PARAMS_MOE_EXPERTS_B = 22.0 # bulk of MoE weight is in expert FFNs -PARAMS_NON_MOE_B = PARAMS_TOTAL_B - PARAMS_MOE_EXPERTS_B - -# ── Header ────────────────────────────────────────────────────────────────── -print("=" * 70) -print(f" Expert Stats Analysis | file: {args.stats}") -print("=" * 70) - -# ── Profiling completeness ─────────────────────────────────────────────────── -sample_tokens = list(data.values())[0]["total_tokens"] -# Each token activates N_EXPERT_USED experts, sum(activation_counts) = total*top_k -# Approximate samples: total_tokens / avg_tokens_per_sample -# We don't know avg, but can infer: total_tokens / (total_tokens / ctx) ≈ ctx chunks -# Better: just report tokens and note the user knows sample count -print(f"\n── Profiling progress ──────────────────────────────────────────────────") -print(f" MoE layers profiled : {n_layers} / {N_MOE_LAYERS}") -print(f" Tokens processed : {sample_tokens:,} (per layer)") -act_sum = sum(data[layers[0]]["activation_counts"]) -assert abs(act_sum / sample_tokens - N_EXPERT_USED) < 0.01, "unexpected top-k" -print(f" top-k confirmed : {N_EXPERT_USED} (sum activations / tokens = {act_sum/sample_tokens:.1f})") - -# ── Per-layer importance score stats ──────────────────────────────────────── -print(f"\n── Per-layer score distribution [{score_label}]") -print(f" {'Layer':>5} {'Min':>9} {'Max':>9} {'Range':>9} {'CV%':>6} {'Never':>5}") -global_cvs = [] -for k in layers: - d = data[k] - s = d[score_field] - mn, mx = min(s), max(s) - cv = statistics.stdev(s) / statistics.mean(s) * 100 - global_cvs.append(cv) - print(f" {k:>5} {mn:>9.5f} {mx:>9.5f} {mx-mn:>9.5f} {cv:>6.3f}% {d['never_activated']:>5}") - -print(f"\n Mean CV across layers : 
{statistics.mean(global_cvs):.3f}%") -print(f" (CV < 1% = near-uniform; load-balancing is working as designed)") - -# ── Capacity loss sweep across pruning levels ──────────────────────────────── -# Paper (observer.py): REAP[i] = mean(ean_norm * softmax_router_weight) over tokens -# routed to expert i, averaged via OnlineStatsTracker weighted by expert_frequency. -# Our implementation (llama.cpp): same formula but routing weights are the top-k -# gate weights (post-softmax within top-k), not the full softmax over all 128. -# Impact: our weights are slightly higher than the paper's (renormalized to top-k -# only), but relative expert ranking within a layer should be preserved. -# -# IMPORTANT CAVEAT for this model (Nemotron-3-Nano-30B-A3B): -# The model was trained with a strong load-balancing auxiliary loss, so all 128 -# experts have nearly identical activation frequency (~4.69%) AND nearly identical -# REAP scores (Gini ~0.015, top/bottom ratio ~1.1-1.35x). The score distribution -# is a smooth monotone curve with NO natural elbow or gap. -# -# This means: -# - REAP ranking beats random pruning by only ~1pp in mass terms at keep=33% -# - The cut point boundary (rank 42 vs 43) has near-zero gap in most layers -# - REAP paper results on Qwen3-30B-A3B likely had higher Gini (less tight -# load-balancing or more expert specialization in pre-training) -# - For this model, actual quality loss must be measured via eval, not predicted -# from REAP score variance -# -# Metrics reported: -# - kept_mass%: REAP mass in the KEPT experts as % of total (> keep_ratio% = good) -# - vs_random%: how much more mass the REAP-selected set retains vs a random set -# of the same size (= kept_mass% - keep_ratio%). Positive = REAP wins. -# - Rel.gap: score gap at cut / layer score range. Near 0 = no natural cut point. -# - Gini: inequality of score distribution. ~0.015 here = near-uniform. 
- -def gini(scores): - """Gini coefficient of a list of non-negative values.""" - n = len(scores) - s = sorted(scores) - total = sum(s) - if total == 0: - return 0.0 - cumsum = 0.0 - for i, v in enumerate(s): - cumsum += (2 * (i + 1) - n - 1) * v - return cumsum / (n * total) - -def layer_stats(scores, n_keep): - """Return capacity metrics for a single layer at a given keep count.""" - n = len(scores) - ranked = sorted(range(n), key=lambda i: scores[i], reverse=True) - total = sum(scores) - kept_mass = sum(scores[i] for i in ranked[:n_keep]) - kept_frac = kept_mass / total if total > 0 else 0.0 # fraction of REAP mass kept - random_frac = n_keep / n # uniform expectation - vs_random = kept_frac - random_frac # positive = REAP beats random - score_range = scores[ranked[0]] - scores[ranked[-1]] - gap = scores[ranked[n_keep - 1]] - (scores[ranked[n_keep]] if n_keep < n else 0) - rel_gap = gap / score_range if score_range > 0 else 0.0 - return kept_frac * 100, vs_random * 100, rel_gap - -# Sweep over a range of keep ratios -sweep_ratios = [0.10, 0.20, 0.25, 0.33, 0.40, 0.50, 0.60, 0.75] -if keep_ratio not in sweep_ratios: - sweep_ratios.append(keep_ratio) -sweep_ratios = sorted(set(sweep_ratios)) - -# Per-layer Gini (fixed, independent of keep ratio) -layer_ginis = {k: gini(data[k][score_field]) for k in layers} -mean_gini = statistics.mean(layer_ginis.values()) -worst_gini_layer = max(layer_ginis, key=lambda k: layer_ginis[k]) - -print(f"\n── Score distribution inequality (Gini coefficient) ────────────────────") -print(f" Gini measures how non-uniform REAP scores are within each layer.") -print(f" Gini=0: all experts identical. 
Gini=1: one expert dominates.") -print(f" With load-balanced MoE, Gini is small — but any Gini > 0 means") -print(f" REAP ranking beats random pruning.") -print(f"") -print(f" {'Layer':>5} {'Gini':>8} {'Score range':>13} {'Max/Min ratio':>14}") -print(f" {'-'*5} {'-'*8} {'-'*13} {'-'*14}") -for k in layers: - s = data[k][score_field] - mn, mx = min(s), max(s) - g = layer_ginis[k] - ratio_mm = mx / mn if mn > 0 else float('inf') - print(f" {k:>5} {g:>8.5f} {mx-mn:>13.5f} {ratio_mm:>13.3f}x") -print(f"") -print(f" Mean Gini : {mean_gini:.5f} (worst layer: {worst_gini_layer})") - -print(f"\n── Capacity retention sweep ─────────────────────────────────────────────") -print(f" Kept mass% = REAP mass in KEPT experts as % of total (higher = better)") -print(f" vs.rand% = Kept mass% minus uniform baseline (keep_ratio%)") -print(f" Positive = REAP beats random. Magnitude = advantage in pp.") -print(f" Rel.gap = score gap at cut / layer score range (higher = cleaner cut)") -print(f" WARNING: near-zero rel.gap and small vs.rand mean eval is the only ground truth.") -print(f"") -print(f" {'Keep':>5} {'Experts':>7} {'Kept mass%':>11} {'vs.rand%':>9} {'Rel.gap avg':>12} {'Worst layer':>11}") -print(f" {'-'*5} {'-'*7} {'-'*11} {'-'*9} {'-'*12} {'-'*11}") - -sweep_results = {} -for ratio in sweep_ratios: - nk = max(1, round(N_EXPERTS * ratio)) - mass_fracs, excesses, rel_gaps = [], [], [] - worst_excess, worst_layer_id = -999.0, None - for k in layers: - scores = data[k][score_field] - mf, exc, rg = layer_stats(scores, nk) - mass_fracs.append(mf) - excesses.append(exc) - rel_gaps.append(rg) - if exc > worst_excess: - worst_excess = exc - worst_layer_id = k - avg_mf = statistics.mean(mass_fracs) - avg_exc = statistics.mean(excesses) - avg_rg = statistics.mean(rel_gaps) - marker = " <--" if abs(ratio - keep_ratio) < 1e-9 else "" - print(f" {ratio:>5.0%} {nk:>7d} {avg_mf:>10.2f}% {avg_exc:>+9.2f}% {avg_rg:>11.4f} layer {worst_layer_id:>3}{marker}") - sweep_results[ratio] = { - 
"n_keep": nk, "avg_kept_mass": avg_mf, "avg_vs_random": avg_exc, - "avg_rel_gap": avg_rg, "worst_layer_id": worst_layer_id, "worst_vs_random": worst_excess, - } - -print(f"") -print(f" vs.rand% quantifies REAP's advantage over random pruning in REAP-mass terms.") -print(f" For this model it is small (+0.7 to +1.5pp) due to tight load-balancing.") -print(f" Rel.gap near zero means scores are smooth with no natural cut — any threshold") -print(f" is as defensible as another. Actual quality delta requires empirical eval.") - -# ── Expert keep/prune detail at selected keep_ratio ────────────────────────── -n_keep = max(1, round(N_EXPERTS * keep_ratio)) -n_prune = N_EXPERTS - n_keep - -print(f"\n── Expert pruning detail at keep_ratio={keep_ratio:.0%} ({n_keep} keep / {n_prune} prune per layer) ──") -print(f" {'Layer':>5} {'Kept mass%':>11} {'vs.rand%':>9} {'Rel.gap':>9} {'Min kept':>10} {'Max pruned':>11}") -print(f" {'-'*5} {'-'*11} {'-'*9} {'-'*9} {'-'*10} {'-'*11}") - -layer_results = {} -for k in layers: - scores = data[k][score_field] - ranked = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True) - mf, exc, rg = layer_stats(scores, n_keep) - min_kept = scores[ranked[n_keep - 1]] - max_pruned = scores[ranked[n_keep]] if n_prune > 0 else 0 - layer_results[k] = {"mass_frac": mf, "excess": exc, "rel_gap": rg, - "min_kept": min_kept, "max_pruned": max_pruned} - print(f" {k:>5} {mf:>10.2f}% {exc:>+9.2f}% {rg:>9.4f} {min_kept:>10.5f} {max_pruned:>11.5f}") - -avg_mf = statistics.mean(r["mass_frac"] for r in layer_results.values()) -avg_exc = statistics.mean(r["excess"] for r in layer_results.values()) -avg_rg = statistics.mean(r["rel_gap"] for r in layer_results.values()) -print(f" {'AVG':>5} {avg_mf:>10.2f}% {avg_exc:>+9.2f}% {avg_rg:>9.4f}") - -# ── Model size projections ─────────────────────────────────────────────────── -print(f"\n── Model size projections ──────────────────────────────────────────────") - -def model_size(keep): - expert_params = 
PARAMS_MOE_EXPERTS_B * keep - return PARAMS_NON_MOE_B + expert_params - -original_b = model_size(1.0) -pruned_b = model_size(keep_ratio) -reduction_pct = (1 - pruned_b / original_b) * 100 - -# GGUF sizes at common quant levels (rough: 1B params ≈ quant_bpw/8 GB) -quants = [("Q8_0", 8.0), ("Q5_K_M", 5.5), ("Q4_K_M", 4.5), ("Q3_K_M", 3.35), ("Q2_K", 2.63)] - -print(f" {'':20} {'Original':>10} {'Pruned':>10} {'Saved':>8}") -print(f" {'Parameters (B)':20} {original_b:>10.1f} {pruned_b:>10.1f} {original_b-pruned_b:>8.1f}B") -print(f" {'Reduction':20} {'':>10} {reduction_pct:>9.1f}%") -print() -print(f" Estimated GGUF sizes:") -print(f" {'Quant':10} {'Original':>10} {'Pruned':>10} {'Fits in':>12}") -for name, bpw in quants: - orig_gb = original_b * bpw / 8 - prune_gb = pruned_b * bpw / 8 - # VRAM fit (16GB GPU) - fits = "16GB GPU" if prune_gb <= 15.5 else ("32GB GPU" if prune_gb <= 31 else "CPU/RAM") - print(f" {name:10} {orig_gb:>9.1f}G {prune_gb:>9.1f}G {fits:>12}") - -# ── Active params per token (inference cost) ───────────────────────────────── -print(f"\n── Inference cost (active params per token) ────────────────────────────") -# Active params = non-moe + (n_expert_used/n_experts_kept * moe_expert_params) -# After pruning: router still picks top-k but from n_keep pool -# Active expert params per token = (N_EXPERT_USED / n_keep) * (PARAMS_MOE_EXPERTS_B * keep_ratio) -# But actually active params = N_EXPERT_USED * (params per single expert) -params_per_expert_orig = PARAMS_MOE_EXPERTS_B / N_EXPERTS # B per expert -params_per_expert_pruned = (PARAMS_MOE_EXPERTS_B * keep_ratio) / n_keep # same, just fewer experts - -active_orig = PARAMS_NON_MOE_B + N_EXPERT_USED * params_per_expert_orig * N_MOE_LAYERS / N_TOTAL_LAYERS -active_pruned = PARAMS_NON_MOE_B + N_EXPERT_USED * params_per_expert_pruned * N_MOE_LAYERS / N_TOTAL_LAYERS - -print(f" Original : {active_orig:.2f}B active params/token (same expert size, more choice)") -print(f" Pruned : {active_pruned:.2f}B active 
params/token (same — top-k still fires {N_EXPERT_USED} experts)") -print(f" Note: active params per token are IDENTICAL — pruning only reduces") -print(f" model file size and memory footprint, not per-token compute.") - -# ── Consistently low-importance experts ────────────────────────────────────── -print(f"\n── Experts consistently ranked low across all layers ───────────────────") -bottom_n = max(1, round(N_EXPERTS * 0.10)) # bottom 10% -low_count = {} -for k in layers: - scores = data[k][score_field] - ranked = sorted(range(len(scores)), key=lambda i: scores[i]) - for eid in ranked[:bottom_n]: - low_count[eid] = low_count.get(eid, 0) + 1 - -consistent = sorted(low_count.items(), key=lambda x: -x[1]) -consistent = [(eid, cnt) for eid, cnt in consistent if cnt >= 3] -print(f" (bottom 10% in >= 3 layers — most dispensable experts globally)") -print(f" Expert ID : layers in bottom 10%") -for eid, cnt in consistent[:20]: - bar = "█" * cnt - print(f" Expert {eid:>3} : {cnt:>2}/{n_layers} {bar}") - -print() -print("=" * 70) diff --git a/tools/moe-pruning/build_expert_profile.sh b/tools/moe-pruning/build_expert_profile.sh deleted file mode 100644 index 0b39604426..0000000000 --- a/tools/moe-pruning/build_expert_profile.sh +++ /dev/null @@ -1,42 +0,0 @@ -#!/usr/bin/env bash -# build_expert_profile.sh -# Builds llama.cpp with the expert-profile tool in WSL2 with CUDA. -# Run this from the tools/moe-pruning/ directory: bash build_expert_profile.sh - -set -e - -LLAMA_SRC="../.." -BUILD_DIR="$LLAMA_SRC/build_expert" - -echo "=== Building llama.cpp + expert-profile tool ===" -echo " Source : $LLAMA_SRC" -echo " Build : $BUILD_DIR" - -mkdir -p "$BUILD_DIR" -cd "$BUILD_DIR" - -# Configure with CUDA -cmake "$LLAMA_SRC" \ - -DCMAKE_BUILD_TYPE=Release \ - -DGGML_CUDA=ON \ - -DLLAMA_CURL=OFF \ - -DLLAMA_BUILD_TESTS=OFF \ - -DLLAMA_BUILD_EXAMPLES=OFF \ - -DCMAKE_CUDA_ARCHITECTURES=86 \ - 2>&1 | tail -20 - -# Build only the expert-profile target (fast) -cmake --build . 
--target llama-expert-profile --config Release -j$(nproc) - -echo "" -echo "=== Build complete ===" -echo " Binary: $BUILD_DIR/tools/expert-profile/llama-expert-profile" -echo "" -echo "=== Usage ===" -echo " $BUILD_DIR/tools/expert-profile/llama-expert-profile \\" -echo " -m ~/nemotron-3-nano-30b-Q4_K_M.gguf \\" -echo " --jsonl ./sample_calibration.jsonl \\" -echo " --output ./expert_stats_reap.json \\" -echo " --n-experts 128 \\" -echo " --ctx-size 16384 \\" -echo " -ngl 99" diff --git a/tools/moe-pruning/extract_ppl.py b/tools/moe-pruning/extract_ppl.py deleted file mode 100644 index 972a32e99d..0000000000 --- a/tools/moe-pruning/extract_ppl.py +++ /dev/null @@ -1,41 +0,0 @@ -import json, os - -base = os.path.dirname(os.path.abspath(__file__)) - -lines = open(os.path.join(base, 'rwsft-training-data.jsonl'), encoding='utf-8').readlines() -split = int(len(lines) * 0.95) - -train_lines = lines[:split] -val_lines = lines[split:] - -train_out = os.path.join(base, 'ppl-eval-train.txt') -val_out = os.path.join(base, 'ppl-eval-val.txt') - -def fmt(s): - # Full prompt+response so the model is conditioned correctly. - # llama-perplexity scores all tokens, but the prompt PPL is identical - # for base vs adapter — the delta is driven by the response tokens. 
- prompt = s.get('prompt', '').strip() - response = s.get('response', '').strip() - if not response: - return None - if prompt: - return prompt + '\n' + response - return response - -with open(train_out, 'w', encoding='utf-8') as f: - for line in train_lines: - text = fmt(json.loads(line)) - if text: - f.write(text + '\n\n') - -with open(val_out, 'w', encoding='utf-8') as f: - for line in val_lines: - text = fmt(json.loads(line)) - if text: - f.write(text + '\n\n') - -train_chars = len(open(train_out, encoding='utf-8').read()) -val_chars = len(open(val_out, encoding='utf-8').read()) -print(f'train: {len(train_lines)} samples, {train_chars:,} chars -> ppl-eval-train.txt') -print(f'val: {len(val_lines)} samples, {val_chars:,} chars -> ppl-eval-val.txt') diff --git a/tools/moe-pruning/gguf_prune.py b/tools/moe-pruning/gguf_prune.py deleted file mode 100644 index df3e638ab4..0000000000 --- a/tools/moe-pruning/gguf_prune.py +++ /dev/null @@ -1,260 +0,0 @@ -""" -gguf-prune: REAP-based expert pruning directly on a GGUF file. - -Slices the expert dimension of the four stacked MoE weight tensors per layer: - blk.{il}.ffn_up_exps [n_embd, intermediate, n_experts] - blk.{il}.ffn_down_exps [intermediate, n_embd, n_experts] - blk.{il}.ffn_gate_inp [n_embd, n_experts] - blk.{il}.ffn_exp_probs_b [n_experts] (score-correction bias, if present) - -Quantized blocks (Q4_K, Q6_K, …) are preserved as raw bytes — slicing the -expert axis (last dim) is safe because each expert is independently quantised -in ggml, so dropping experts = dropping whole quantisation blocks. 
- -Metadata patched: - {arch}.expert_count → keep_n - (expert_used_count = top-k routing k, NOT touched) - -Usage: - # keep top 20% of experts (26/128) per MoE layer - python gguf_prune.py \\ - --input nemotron.gguf \\ - --stats expert_stats.json \\ - --output nemotron-pruned.gguf \\ - --keep_ratio 0.20 - - # or keep an absolute number - python gguf_prune.py \\ - --input nemotron.gguf \\ - --stats expert_stats.json \\ - --output nemotron-pruned.gguf \\ - --keep_n 32 -""" - -from __future__ import annotations - -import argparse -import json -import re -from pathlib import Path - -import numpy as np -from gguf import GGUFReader, GGUFWriter, GGUFValueType - - -# ── Constants ───────────────────────────────────────────────────────────────── - -# Base tensor names that carry the expert dimension (last axis in ggml layout). -# Some GGUFs append parameter tails like ".weight" / ".bias". -EXPERT_BASE_SUFFIXES = { - "ffn_up_exps", - "ffn_down_exps", - "ffn_gate_inp", -} - - -def is_expert_suffix(suffix: str) -> bool: - """Return True if a tensor suffix is one of the MoE expert tensors to prune.""" - if suffix in ("ffn_exp_probs_b", "exp_probs_b", "exp_probs_b.bias"): - return True - return any(suffix == base or suffix.startswith(base + ".") for base in EXPERT_BASE_SUFFIXES) - - -# ── Helpers ─────────────────────────────────────────────────────────────────── - -def layer_and_suffix(name: str) -> tuple[int, str] | tuple[None, None]: - m = re.match(r"blk\.(\d+)\.(.+)$", name) - if m: - return int(m.group(1)), m.group(2) - return None, None - - -def pick_experts(layer_stats: dict, keep_n: int) -> list[int]: - """ - Return sorted indices of the top `keep_n` experts by REAP score. - Falls back to 'importance_score' (weighted frequency) if 'reap' absent. 
- """ - if "reap" in layer_stats: - scores = np.array(layer_stats["reap"], dtype=np.float64) - elif "importance_score" in layer_stats: - scores = np.array(layer_stats["importance_score"], dtype=np.float64) - else: - raise KeyError( - "Layer stats has neither 'reap' nor 'importance_score'. " - "Run expert-profile / nemotron_reap.py profile first." - ) - return sorted(np.argsort(scores)[-keep_n:].tolist()) - - -def slice_expert_axis(data: np.ndarray, keep: list[int]) -> np.ndarray: - """ - Slice the expert axis of reader tensor data keeping only `keep` indices. - - GGUFReader reshapes tensors to NumPy with reversed ggml dims, so for MoE - tensors where experts are the last ggml dim, expert is axis 0 in `data`. - This also preserves quantized row-byte alignment (axis -1 is byte-packed - rows for quantized tensors and must not be sliced for expert pruning). - """ - return np.take(data, keep, axis=0) - - -def copy_field(writer: GGUFWriter, field, reader: GGUFReader) -> bool: - """Copy a single metadata field to writer. Returns False if skipped.""" - key = field.name - val_type = field.types[0] - part = field.parts[-1] - - if val_type == GGUFValueType.STRING: - # Preserve raw bytes: GGUF metadata can contain non-UTF8 strings. 
- writer.add_key_value(key, bytes(part), GGUFValueType.STRING) - elif val_type == GGUFValueType.UINT8: - writer.add_uint8(key, int(part[0])) - elif val_type == GGUFValueType.INT8: - writer.add_int8(key, int(part[0])) - elif val_type == GGUFValueType.UINT16: - writer.add_uint16(key, int(part[0])) - elif val_type == GGUFValueType.INT16: - writer.add_int16(key, int(part[0])) - elif val_type == GGUFValueType.UINT32: - writer.add_uint32(key, int(part[0])) - elif val_type == GGUFValueType.INT32: - writer.add_int32(key, int(part[0])) - elif val_type == GGUFValueType.FLOAT32: - writer.add_float32(key, float(part[0])) - elif val_type == GGUFValueType.UINT64: - writer.add_uint64(key, int(part[0])) - elif val_type == GGUFValueType.INT64: - writer.add_int64(key, int(part[0])) - elif val_type == GGUFValueType.FLOAT64: - writer.add_float64(key, float(part[0])) - elif val_type == GGUFValueType.BOOL: - writer.add_bool(key, bool(part[0])) - elif val_type == GGUFValueType.ARRAY: - elem_type = field.types[1] - if elem_type == GGUFValueType.STRING: - # ReaderField.data stores indices of ARRAY payload items; for - # STRING arrays this points at each string byte payload. - vals = [bytes(field.parts[idx]) for idx in field.data] - writer.add_key_value(key, vals, GGUFValueType.ARRAY, sub_type=GGUFValueType.STRING) - else: - # ReaderField.data stores part-indices, not payload values. 
- vals = field.contents() - if not isinstance(vals, list): - print(f" WARNING: skipping array field {key!r} (unexpected non-list contents)") - return False - writer.add_array(key, vals) - else: - print(f" WARNING: skipping field {key!r} (unsupported type {val_type})") - return False - return True - - -# ── Main ────────────────────────────────────────────────────────────────────── - -def main(): - ap = argparse.ArgumentParser(description="REAP expert pruning on a GGUF file") - ap.add_argument("--input", required=True, help="Input .gguf path") - ap.add_argument("--stats", required=True, help="expert_stats.json from expert-profile") - ap.add_argument("--output", required=True, help="Output .gguf path") - ap.add_argument("--keep_ratio", type=float, default=None, help="Fraction to keep, e.g. 0.20") - ap.add_argument("--keep_n", type=int, default=None, help="Absolute count to keep, e.g. 32") - ap.add_argument("--n_experts", type=int, default=128, help="Experts per MoE layer in source model") - args = ap.parse_args() - - if args.keep_ratio is None and args.keep_n is None: - ap.error("Provide --keep_ratio or --keep_n") - if args.keep_ratio is not None and args.keep_n is not None: - ap.error("Provide --keep_ratio OR --keep_n, not both") - - keep_n = args.keep_n if args.keep_n is not None else max(1, int(args.n_experts * args.keep_ratio)) - print(f"[gguf-prune] keeping {keep_n}/{args.n_experts} experts per MoE layer") - - # ── Load stats ───────────────────────────────────────────────────────────── - with open(args.stats) as f: - stats = {int(k): v for k, v in json.load(f).items()} - print(f"[gguf-prune] stats loaded for {len(stats)} MoE layers") - - # ── Open source GGUF ─────────────────────────────────────────────────────── - print(f"[gguf-prune] reading {args.input}") - reader = GGUFReader(args.input, mode="r") - - arch_field = reader.get_field("general.architecture") - arch = str(bytes(arch_field.parts[-1]), "utf-8") if arch_field else "nemotron_h_moe" - 
print(f"[gguf-prune] arch {arch}") - - expert_count_key = f"{arch}.expert_count" - - # ── Compute kept indices per layer ───────────────────────────────────────── - kept: dict[int, list[int]] = {} - for tensor in reader.tensors: - il, suffix = layer_and_suffix(tensor.name) - if il is None or suffix is None or not is_expert_suffix(suffix): - continue - if il in kept: - continue # already computed for this layer - if il not in stats: - print(f" Layer {il:3d}: no stats — keeping ALL {args.n_experts} experts") - kept[il] = list(range(args.n_experts)) - else: - kept[il] = pick_experts(stats[il], keep_n) - never = stats[il].get("never_activated", "?") - crit = "reap" if "reap" in stats[il] else "importance_score" - print(f" Layer {il:3d}: keep {kept[il][:4]}… never_activated={never} criterion={crit}") - - # ── Build output GGUF ────────────────────────────────────────────────────── - print(f"\n[gguf-prune] writing {args.output}") - writer = GGUFWriter(args.output, arch=arch) - - # --- metadata: copy all fields, replace expert_count --- - for field in reader.fields.values(): - # Reader exposes synthetic header fields (GGUF.*) that are not KV - # metadata and must not be copied back as normal keys. - if field.name.startswith("GGUF."): - continue - # Writer already sets general.architecture from ctor; avoid duplicate warning. 
- if field.name in (expert_count_key, "general.architecture"): - continue # replaced below - copy_field(writer, field, reader) - - writer.add_expert_count(keep_n) - print(f"[gguf-prune] patched {expert_count_key} → {keep_n}") - - # --- tensors --- - n_pruned = 0 - for tensor in reader.tensors: - il, suffix = layer_and_suffix(tensor.name) - is_expert = il is not None and suffix is not None and is_expert_suffix(suffix) - - if is_expert: - assert il is not None - k = kept[il] - data = slice_expert_axis(tensor.data, k) - writer.add_tensor( - tensor.name, - data, - raw_dtype=tensor.tensor_type, - ) - n_pruned += 1 - else: - writer.add_tensor( - tensor.name, - tensor.data, - raw_dtype=tensor.tensor_type, - ) - - writer.write_header_to_file() - writer.write_kv_data_to_file() - writer.write_tensors_to_file(progress=True) - writer.close() - - out = Path(args.output) - size_gb = out.stat().st_size / 1024**3 - print(f"\n[gguf-prune] done") - print(f" Expert tensors sliced : {n_pruned}") - print(f" MoE layers pruned : {len(kept)}") - print(f" Experts per layer : {keep_n}/{args.n_experts}") - print(f" Output size : {size_gb:.2f} GB → {out}") - - -if __name__ == "__main__": - main() diff --git a/tools/moe-pruning/requirements.txt b/tools/moe-pruning/requirements.txt deleted file mode 100644 index 6b21f239f4..0000000000 --- a/tools/moe-pruning/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -gguf diff --git a/tools/moe-pruning/sample_calibration.jsonl b/tools/moe-pruning/sample_calibration.jsonl deleted file mode 100644 index 44e2627f97..0000000000 --- a/tools/moe-pruning/sample_calibration.jsonl +++ /dev/null @@ -1,8 +0,0 @@ -{"prompt": "What is the difference between a transformer and an RNN?", "response": "Transformers process all tokens in parallel using self-attention, while RNNs process tokens sequentially and maintain a hidden state. 
Transformers scale better and handle long-range dependencies more effectively, but require more memory for the attention matrix."} -{"prompt": "Explain the concept of weight quantization in neural networks.", "response": "Weight quantization reduces model size and inference latency by representing parameters in lower precision formats such as INT8 or INT4 instead of FP32. This trades a small amount of accuracy for significant gains in memory efficiency and compute throughput."} -{"prompt": "What is mixture of experts (MoE)?", "response": "MoE is a neural network architecture where the feed-forward layers are replaced by multiple expert sub-networks and a routing mechanism. For each token, only a small subset of experts is activated, allowing the model to have a large parameter count while keeping inference compute constant."} -{"prompt": "Describe the attention mechanism.", "response": "Attention computes a weighted sum of value vectors, where the weights are determined by the compatibility between query and key vectors. Scaled dot-product attention computes scores as Q*K^T/sqrt(d_k), applies softmax to get weights, then multiplies by V."} -{"prompt": "What is GGUF and how does it differ from GGML?", "response": "GGUF is the successor to the GGML file format for storing quantized models. It supports arbitrary key-value metadata, is extensible without breaking backward compatibility, and encodes tensor names and shapes explicitly, making it more robust than the original GGML format."} -{"prompt": "How does LoRA work?", "response": "LoRA (Low-Rank Adaptation) injects trainable rank-decomposition matrices A and B into frozen weight layers. The adapted weight is W + alpha/r * B*A. Since rank r is much smaller than the weight dimensions, only a tiny fraction of parameters are trained."} -{"prompt": "What is perplexity in language modeling?", "response": "Perplexity measures how well a language model predicts a sample text. 
It is the exponentiated average negative log-likelihood per token: PPL = exp(-1/N * sum log P(token_i)). Lower perplexity indicates a better fit to the data."} -{"prompt": "Explain rotary position embeddings (RoPE).", "response": "RoPE encodes position by rotating query and key vectors in 2D subspaces using a position-dependent rotation matrix. This makes the dot product between Q and K depend only on their relative position, enabling the model to generalise to sequence lengths longer than those seen during training."}