added qlora finetuning
This commit is contained in:
parent
b5fe4559ae
commit
84cab59ec6
|
|
@ -3601,32 +3601,108 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|||
{ "-lr", "--learning-rate" }, "ALPHA",
|
||||
string_format("adamw or sgd optimizer alpha (default: %.2g); note: sgd alpha recommended ~10x (no momentum)", (double) params.lr.lr0),
|
||||
[](common_params & params, const std::string & value) { params.lr.lr0 = std::stof(value); }
|
||||
).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
|
||||
).set_examples({ LLAMA_EXAMPLE_FINETUNE, LLAMA_EXAMPLE_FINETUNE_QLORA }));
|
||||
add_opt(common_arg({ "-lr-min", "--learning-rate-min" }, "ALPHA",
|
||||
string_format("(if >0) final learning rate after decay (if -decay-epochs is set, default=%.2g)",
|
||||
(double) params.lr.lr_min),
|
||||
[](common_params & params, const std::string & value) { params.lr.lr_min = std::stof(value); }
|
||||
).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
|
||||
).set_examples({ LLAMA_EXAMPLE_FINETUNE, LLAMA_EXAMPLE_FINETUNE_QLORA }));
|
||||
add_opt(common_arg(
|
||||
{"-decay-epochs", "--learning-rate-decay-epochs"}, "ALPHA",
|
||||
string_format("(if >0) decay learning rate to -lr-min after this many epochs (exponential decay, default=%.2g)", (double) params.lr.decay_epochs),
|
||||
[](common_params & params, const std::string & value) { params.lr.decay_epochs = std::stof(value); }
|
||||
).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
|
||||
).set_examples({ LLAMA_EXAMPLE_FINETUNE, LLAMA_EXAMPLE_FINETUNE_QLORA }));
|
||||
add_opt(common_arg(
|
||||
{"-wd", "--weight-decay"}, "WD",
|
||||
string_format("adamw or sgd optimizer weight decay (0 is off; recommend very small e.g. 1e-9) (default: %.2g).", (double) params.lr.wd),
|
||||
[](common_params & params, const std::string & value) { params.lr.wd = std::stof(value); }
|
||||
).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
|
||||
).set_examples({ LLAMA_EXAMPLE_FINETUNE, LLAMA_EXAMPLE_FINETUNE_QLORA }));
|
||||
add_opt(common_arg(
|
||||
{"-val-split", "--val-split"}, "FRACTION",
|
||||
string_format("fraction of data to use as validation set for training (default: %.2g).", (double) params.val_split),
|
||||
[](common_params & params, const std::string & value) { params.val_split = std::stof(value); }
|
||||
).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
|
||||
).set_examples({ LLAMA_EXAMPLE_FINETUNE, LLAMA_EXAMPLE_FINETUNE_QLORA }));
|
||||
// qlora flags
|
||||
add_opt(common_arg(
|
||||
{"--lora-rank"}, "N",
|
||||
string_format("LoRA rank r (default: %d)", params.lora_rank),
|
||||
[](common_params & params, int value) { params.lora_rank = value; }
|
||||
).set_examples({ LLAMA_EXAMPLE_FINETUNE_QLORA }));
|
||||
add_opt(common_arg(
|
||||
{"--lora-alpha"}, "F",
|
||||
string_format("LoRA alpha (default: %d = use rank value)", (int) params.lora_alpha),
|
||||
[](common_params & params, const std::string & value) { params.lora_alpha = std::stof(value); }
|
||||
).set_examples({ LLAMA_EXAMPLE_FINETUNE_QLORA }));
|
||||
add_opt(common_arg(
|
||||
{"--lora-targets"}, "SUBSTRINGS",
|
||||
string_format("comma-separated substrings of tensor names to add LoRA to (default: %s)", params.lora_targets.c_str()),
|
||||
[](common_params & params, const std::string & value) { params.lora_targets = value; }
|
||||
).set_examples({ LLAMA_EXAMPLE_FINETUNE_QLORA }));
|
||||
add_opt(common_arg(
|
||||
{"--lora-out"}, "FNAME",
|
||||
string_format("output LoRA adapter GGUF path (default: %s)", params.lora_out.c_str()),
|
||||
[](common_params & params, const std::string & value) { params.lora_out = value; }
|
||||
).set_examples({ LLAMA_EXAMPLE_FINETUNE_QLORA }));
|
||||
add_opt(common_arg(
|
||||
{"--train-file"}, "FNAME",
|
||||
"JSONL training dataset (fields: messages|prompt+response|text)",
|
||||
[](common_params & params, const std::string & value) { params.train_file = value; }
|
||||
).set_examples({ LLAMA_EXAMPLE_FINETUNE_QLORA }));
|
||||
add_opt(common_arg(
|
||||
{"--save-every"}, "N",
|
||||
"save adapter checkpoint every N dataset windows during training (default: 0 = only at end)",
|
||||
[](common_params & params, int value) { params.save_every = value; }
|
||||
).set_examples({ LLAMA_EXAMPLE_FINETUNE_QLORA }));
|
||||
add_opt(common_arg(
|
||||
{"--freeze-layers"}, "N",
|
||||
"freeze first N transformer layers — no LoRA adapters allocated for blk.0..blk.N-1 (default: 0 = train all layers)",
|
||||
[](common_params & params, int value) { params.lora_freeze_layers = value; }
|
||||
).set_examples({ LLAMA_EXAMPLE_FINETUNE_QLORA }));
|
||||
add_opt(common_arg(
|
||||
{"--grad-checkpoint"}, "N",
|
||||
"gradient checkpointing interval to reduce peak activation VRAM (0 = disabled, default: 0)",
|
||||
[](common_params & params, int value) { params.grad_checkpoint_interval = value; }
|
||||
).set_examples({ LLAMA_EXAMPLE_FINETUNE_QLORA }));
|
||||
add_opt(common_arg(
|
||||
{"--train-on-prompt"},
|
||||
"compute loss on prompt tokens too, not just the response (default: response-only loss)",
|
||||
[](common_params & params) { params.train_on_prompt = true; }
|
||||
).set_examples({ LLAMA_EXAMPLE_FINETUNE_QLORA }));
|
||||
add_opt(common_arg(
|
||||
{"--shuffle-dataset"},
|
||||
"shuffle dataset windows at the start of each epoch (default: sequential order)",
|
||||
[](common_params & params) { params.shuffle_dataset = true; }
|
||||
).set_examples({ LLAMA_EXAMPLE_FINETUNE_QLORA }));
|
||||
add_opt(common_arg(
|
||||
{"--grpo-mode"},
|
||||
"enable GRPO IPC training loop (prompts and rewards supplied via stdin/stdout)",
|
||||
[](common_params & params) { params.grpo_mode = true; }
|
||||
).set_examples({ LLAMA_EXAMPLE_FINETUNE_QLORA }));
|
||||
add_opt(common_arg(
|
||||
{"--n-gen"}, "N",
|
||||
string_format("GRPO: number of generations per prompt (default: %d)", params.grpo_n_gen),
|
||||
[](common_params & params, int value) { params.grpo_n_gen = value; }
|
||||
).set_examples({ LLAMA_EXAMPLE_FINETUNE_QLORA }));
|
||||
add_opt(common_arg(
|
||||
{"--n-steps"}, "N",
|
||||
string_format("GRPO: total optimizer steps (default: %d)", params.grpo_n_steps),
|
||||
[](common_params & params, int value) { params.grpo_n_steps = value; }
|
||||
).set_examples({ LLAMA_EXAMPLE_FINETUNE_QLORA }));
|
||||
add_opt(common_arg(
|
||||
{"--grpo-temp"}, "F",
|
||||
string_format("GRPO: sampling temperature for rollout generation (default: %.2f)", (double) params.grpo_temperature),
|
||||
[](common_params & params, const std::string & value) { params.grpo_temperature = std::stof(value); }
|
||||
).set_examples({ LLAMA_EXAMPLE_FINETUNE_QLORA }));
|
||||
add_opt(common_arg(
|
||||
{"--grpo-max-tokens"}, "N",
|
||||
string_format("GRPO: max tokens per generation (default: %d)", params.grpo_max_tokens),
|
||||
[](common_params & params, int value) { params.grpo_max_tokens = value; }
|
||||
).set_examples({ LLAMA_EXAMPLE_FINETUNE_QLORA }));
|
||||
add_opt(common_arg(
|
||||
{"-epochs", "--epochs"}, "N",
|
||||
string_format("optimizer max # of epochs (default: %d)", params.lr.epochs),
|
||||
[](common_params & params, int epochs) { params.lr.epochs = epochs; }
|
||||
).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
|
||||
).set_examples({ LLAMA_EXAMPLE_FINETUNE, LLAMA_EXAMPLE_FINETUNE_QLORA }));
|
||||
add_opt(common_arg(
|
||||
{"-opt", "--optimizer"}, "sgd|adamw", "adamw or sgd",
|
||||
[](common_params & params, const std::string & name) {
|
||||
|
|
@ -3635,7 +3711,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|||
throw std::invalid_argument("invalid --optimizer, valid options: adamw, sgd");
|
||||
}
|
||||
}
|
||||
).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
|
||||
).set_examples({ LLAMA_EXAMPLE_FINETUNE, LLAMA_EXAMPLE_FINETUNE_QLORA }));
|
||||
add_opt(common_arg(
|
||||
{"--check"},
|
||||
string_format("check rather than generate results (default: %s)", params.check ? "true" : "false"),
|
||||
|
|
|
|||
|
|
@ -103,6 +103,7 @@ enum llama_example {
|
|||
LLAMA_EXAMPLE_TTS,
|
||||
LLAMA_EXAMPLE_DIFFUSION,
|
||||
LLAMA_EXAMPLE_FINETUNE,
|
||||
LLAMA_EXAMPLE_FINETUNE_QLORA,
|
||||
LLAMA_EXAMPLE_FIT_PARAMS,
|
||||
LLAMA_EXAMPLE_RESULTS,
|
||||
|
||||
|
|
@ -517,7 +518,26 @@ struct common_params {
|
|||
// finetune
|
||||
struct lr_opt lr;
|
||||
enum ggml_opt_optimizer_type optimizer = GGML_OPT_OPTIMIZER_TYPE_ADAMW;
|
||||
float val_split = 0.05f; // fraction of the data used for the validation set
|
||||
float val_split = 0.05f; // fraction of the data used for the validation set
|
||||
|
||||
// qlora fine-tuning
|
||||
int32_t lora_rank = 16; // LoRA rank (r)
|
||||
float lora_alpha = 0.0f; // LoRA alpha (0 = use rank value)
|
||||
std::string lora_targets = "attn_q,attn_output,ffn_gate,ffn_up,ffn_down"; // comma-separated substrings to match trainable tensors
|
||||
std::string lora_out = "adapter.gguf"; // output adapter GGUF path
|
||||
std::string train_file = ""; // JSONL training dataset path
|
||||
int32_t save_every = 0; // save checkpoint every N optimizer steps (0 = disabled)
|
||||
int32_t lora_freeze_layers = 0; // do not apply LoRA to the first N transformer layers
|
||||
int32_t grad_checkpoint_interval = 0; // gradient checkpointing interval to reduce peak VRAM (0 = disabled)
|
||||
bool train_on_prompt = false; // include prompt tokens in training loss (default: response tokens only)
|
||||
bool shuffle_dataset = false; // shuffle dataset windows at the start of each epoch
|
||||
|
||||
// grpo training
|
||||
bool grpo_mode = false; // enable GRPO IPC training loop
|
||||
int32_t grpo_n_gen = 8; // generations per prompt
|
||||
int32_t grpo_n_steps = 500; // total GRPO optimizer steps
|
||||
float grpo_temperature = 0.8f; // sampling temperature for rollouts
|
||||
int32_t grpo_max_tokens = 512; // max tokens per generation
|
||||
|
||||
// embedding
|
||||
bool embedding = false; // get only sentence embedding
|
||||
|
|
|
|||
|
|
@ -34,6 +34,7 @@ else()
|
|||
add_subdirectory(speculative-simple)
|
||||
add_subdirectory(gen-docs)
|
||||
add_subdirectory(training)
|
||||
add_subdirectory(qlora_training)
|
||||
add_subdirectory(diffusion)
|
||||
if (NOT GGML_BACKEND_DL)
|
||||
add_subdirectory(convert-llama2c-to-ggml)
|
||||
|
|
|
|||
|
|
@ -0,0 +1,5 @@
|
|||
# Build the llama-finetune-qlora example binary.
set(TARGET llama-finetune-qlora)
add_executable(${TARGET} finetune_qlora.cpp)
# Install the binary alongside the other llama.cpp tools.
install(TARGETS ${TARGET} RUNTIME)
# Link against the project's common helpers, the core llama library, and the
# platform thread library.
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
# The example requires a C++17-capable compiler.
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
|
@ -0,0 +1,331 @@
|
|||
# llama.cpp — Native QLoRA Training
|
||||
|
||||
Native QLoRA + Reward-Weighted SFT training pipeline for quantized GGUF models.
|
||||
|
||||
The base model weights remain **frozen** (quantized tensors are skipped by `llama_set_param` because they are not `GGML_TYPE_F32`). Only freshly-allocated F32 LoRA A/B tensors are trained. The saved adapter GGUF is directly compatible with the existing `llama_adapter_lora_init` loader and `llama-export-lora` merge tool.
|
||||
|
||||
**Status:** Working. Phase 1 (QLoRA SFT) and Phase 2 (Reward-Weighted SFT) are implemented and functional. Training speed is currently limited by full backprop through quantized weights — see [Known Limitations](#known-limitations).
|
||||
|
||||
---
|
||||
|
||||
## Build
|
||||
|
||||
```bash
|
||||
cd /path/to/llama.cpp
|
||||
|
||||
# First time (CUDA build):
|
||||
cmake -B build -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=ON -DLLAMA_CURL=OFF
|
||||
cmake --build build -j$(nproc)
|
||||
|
||||
# Incremental rebuild (after code changes):
|
||||
cmake --build build --target llama-finetune-qlora -j$(nproc)
|
||||
# If llama-adapter.cpp or llama-context.cpp changed, rebuild all:
|
||||
cmake --build build -j$(nproc)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Phase 1 — QLoRA SFT (`llama-finetune-qlora`)
|
||||
|
||||
Trains LoRA adapters on a quantized GGUF model.
|
||||
|
||||
### Recommended command (1.7B model, 16 GB card)
|
||||
|
||||
```bash
|
||||
./build/bin/llama-finetune-qlora \
|
||||
--model ~/qwen3-1.7b-q4_k_m.gguf \
|
||||
--train-file data/train.jsonl \
|
||||
--lora-rank 16 --lora-alpha 16 \
|
||||
-c 4096 -b 4096 -ub 512 \
|
||||
--save-every 10 \
|
||||
--lora-out ~/adapter.gguf \
|
||||
--epochs 3 --seed 42
|
||||
```
|
||||
|
||||
### Recommended command (15B model, 16 GB card, partial offload)
|
||||
|
||||
```bash
|
||||
./build/bin/llama-finetune-qlora \
|
||||
--model ~/nemotron-15b-q4_k_m.gguf \
|
||||
--train-file data/train.jsonl \
|
||||
--lora-rank 16 --lora-alpha 16 \
|
||||
-ngl 13 -c 14336 -b 14336 -ub 1024 \
|
||||
--save-every 8 \
|
||||
--lora-out ~/nemotron-lora.gguf \
|
||||
--epochs 3 --seed 42
|
||||
```
|
||||
|
||||
### All flags
|
||||
|
||||
| Flag | Default | Description |
|
||||
|---|---|---|
|
||||
| `--model` | *(required)* | Path to quantized GGUF model |
|
||||
| `--train-file` | *(required)* | JSONL training dataset |
|
||||
| `--lora-rank` | `16` | LoRA rank r |
|
||||
| `--lora-alpha` | `0` (= rank) | LoRA alpha; effective scale = alpha/rank |
|
||||
| `--lora-targets` | see below | Comma-separated internal tensor name substrings |
|
||||
| `--lora-out` | `adapter.gguf` | Output adapter GGUF path (supports `~`) |
|
||||
| `--save-every` | `0` | Save checkpoint every N dataset windows (0 = end only) |
|
||||
| `--freeze-layers` | `0` | Skip LoRA on first N transformer layers (blk.0..N-1); backward already pruned automatically |
|
||||
| `--grad-checkpoint` | `0` | Mark every Nth forward node persistent to reduce activation VRAM; good values: 32–64 |
|
||||
| `--train-on-prompt` | off | Compute loss on prompt tokens too (default: response-only loss) |
|
||||
| `--shuffle-dataset` | off | Shuffle dataset windows at the start of each epoch |
|
||||
| `--val-split` | `0.0` | Fraction of data to hold out for validation (e.g. `0.1` = 10%); val loss logged per epoch |
|
||||
| `-epochs` / `--epochs` | `3` | Training epochs |
|
||||
| `-c` / `--ctx-size` | `512` | Training context window (tokens) |
|
||||
| `-b` / `--batch-size` | `2048` | Tokens per `llama_decode` call; set equal to `-c` |
|
||||
| `-ub` / `--ubatch-size` | `512` | GPU micro-batch tokens; controls VRAM vs. step time |
|
||||
| `-ngl` | `999` | GPU layers to offload |
|
||||
| `-lr` / `--learning-rate` | `1e-4` | AdamW learning rate |
|
||||
| `--seed` | `42` | Random seed for LoRA init |
|
||||
|
||||
### VRAM vs. step-time tradeoff
|
||||
|
||||
Step time and VRAM both scale linearly with `-ub`:
|
||||
|
||||
| Model | `-ub` | VRAM | Step time (approx) |
|
||||
|---|---|---|---|
|
||||
| 1.7B Q4_K_M | 512 | ~18 GB | ~120 s (OOM on 16 GB) |
|
||||
| 1.7B Q4_K_M | 128 | ~6 GB | ~30 s |
|
||||
| 15B Q4_K_M | 1024 | ~11 GB | ~60 s |
|
||||
|
||||
Use `-c` equal to your target sequence length. More context = more windows per sample = more steps per epoch. Reducing `-c` reduces total training time proportionally.
|
||||
|
||||
### Default LoRA targets
|
||||
|
||||
llama.cpp uses **internal GGUF tensor names**, not HuggingFace names:
|
||||
|
||||
| llama.cpp internal | HuggingFace equivalent | Status |
|
||||
|---|---|---|
|
||||
| `attn_q` | `q_proj` | ✅ default target, trainable |
|
||||
| `attn_output` | `o_proj` | ✅ default target, trainable |
|
||||
| `ffn_gate` | `gate_proj` | ✅ default target, trainable |
|
||||
| `ffn_up` | `up_proj` | ✅ default target, trainable |
|
||||
| `ffn_down` | `down_proj` | ✅ default target, trainable |
|
||||
| `attn_k` | `k_proj` | ❌ not in defaults — zero gradient (KV scatter via SET_ROWS) |
|
||||
| `attn_v` | `v_proj` | ❌ not in defaults — zero gradient (KV scatter via SET_ROWS) |
|
||||
| `ssm_in` | `in_proj` | ❌ not in defaults — zero gradient (SSM_SCAN no backward) |
|
||||
| `ssm_out` | `out_proj` | ❌ not in defaults — zero gradient (SSM_SCAN no backward) |
|
||||
|
||||
**MoE models:** Expert tensors (`*_exps`) are excluded regardless of `--lora-targets`. The quantized expert weights are frozen (stop-gradient), but LoRA on the dense FFN layers (`ffn_gate`, `ffn_up`, `ffn_down`) works — backward via `MUL_MAT_ID` + `OUT_PROD_ID`.
|
||||
|
||||
### Dataset format (JSONL)
|
||||
|
||||
**Chat format** (loss on response only; use `--train-on-prompt` for all tokens):
|
||||
```json
|
||||
{"messages": [{"role": "user", "content": "Hello"}, {"role": "assistant", "content": "Hi!"}]}
|
||||
```
|
||||
|
||||
**Prompt/response** (loss on response only):
|
||||
```json
|
||||
{"prompt": "What is the capital of France?", "response": "Paris."}
|
||||
```
|
||||
|
||||
**Plain text** (loss on all tokens):
|
||||
```json
|
||||
{"text": "The quick brown fox."}
|
||||
```
|
||||
|
||||
**With reward** (Phase 2 — scales gradient by reward):
|
||||
```json
|
||||
{"prompt": "...", "response": "...", "reward": 0.85}
|
||||
```
|
||||
|
||||
Rewards are normalized per epoch: clipped to `[-1, 1]`, then min-max scaled to `[0, 1]`. Reward 0 = sample ignored; reward 1 = full gradient.
|
||||
|
||||
### Verify and use the adapter
|
||||
|
||||
```bash
|
||||
# Hot-load for inference (no merge needed)
|
||||
./build/bin/llama-cli --model base.gguf --lora adapter.gguf -p "Hello"
|
||||
|
||||
# Merge into base model
|
||||
./build/bin/llama-export-lora \
|
||||
--model base.gguf --lora adapter.gguf --output merged.gguf
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Phase 2 — Reward-Weighted SFT
|
||||
|
||||
Built into `llama-finetune-qlora`. When the dataset contains a `reward` or `score` field, the cross-entropy loss for that sample is scaled by the reward before backprop. No extra flags needed — detection is automatic.
|
||||
|
||||
---
|
||||
|
||||
## Phase 3 — GRPO (Online RL via IPC)
|
||||
|
||||
`llama-finetune-qlora --grpo-mode` implements a full GRPO training loop where the Python process owns prompt sampling and reward scoring, and the C++ process owns model state, generation, and gradient updates.
|
||||
|
||||
### Quick start
|
||||
|
||||
```bash
|
||||
python3 examples/qlora_training/grpo_example.py \
|
||||
--model ~/qwen3-1.7b-q4_k_m.gguf \
|
||||
--lora-out ~/grpo-adapter.gguf \
|
||||
--rank 16 --n-steps 200 --n-gen 8
|
||||
```
|
||||
|
||||
For verbose output (includes IPC message trace):
|
||||
|
||||
```bash
|
||||
python3 examples/qlora_training/grpo_example.py \
|
||||
--model ~/qwen3-1.7b-q4_k_m.gguf \
|
||||
--lora-out ~/grpo-adapter.gguf \
|
||||
--verbose
|
||||
```
|
||||
|
||||
Resume from a checkpoint:
|
||||
|
||||
```bash
|
||||
python3 examples/qlora_training/grpo_example.py \
|
||||
--model ~/qwen3-1.7b-q4_k_m.gguf \
|
||||
--lora ~/grpo-adapter.ckpt50.gguf \
|
||||
--lora-out ~/grpo-adapter.gguf
|
||||
```
|
||||
|
||||
### GRPO-specific flags
|
||||
|
||||
| Flag | Default | Description |
|
||||
|---|---|---|
|
||||
| `--grpo-mode` | off | Enable GRPO IPC mode |
|
||||
| `--n-gen` | `8` | Rollouts per prompt |
|
||||
| `--n-steps` | `500` | Total GRPO steps |
|
||||
| `--grpo-temp` | `0.8` | Sampling temperature for rollouts |
|
||||
| `--grpo-max-tokens` | `512` | Max tokens per generation |
|
||||
|
||||
All standard flags (`--lora-rank`, `-lr`, `-c`, `-ngl`, `--save-every`, etc.) work in GRPO mode too. `--train-file` is **not** required in GRPO mode.
|
||||
|
||||
### IPC protocol
|
||||
|
||||
The protocol is line-based over stdout (C++ → Python) and stdin (Python → C++). All non-protocol C++ output (timing, debug, model logs) goes to **stderr** and never contaminates the protocol channel.
|
||||
|
||||
**C++ → Python (stdout):**
|
||||
|
||||
| Line | When |
|
||||
|---|---|
|
||||
| `[QLORA:READY]` | Process initialised, model loaded |
|
||||
| `[QLORA:PROMPT_REQ:<step>]` | C++ requests the prompt for step N |
|
||||
| `[QLORA:GEN:<k>/<n>] <text>` | One generation (newlines escaped as `\n`) |
|
||||
| `[QLORA:REWARD_REQ:<n>]` | C++ requests N reward scores |
|
||||
| `[QLORA:PROGRESS] step=X/Y loss=Z epoch=A/B` | After each weight update |
|
||||
| `[QLORA:CHECKPOINT] <path>` | After saving a checkpoint |
|
||||
| `[QLORA:DONE] final_loss=X` | Training complete |
|
||||
| `[QLORA:ERROR] <message>` | Fatal error |
|
||||
|
||||
**Python → C++ (stdin):**
|
||||
|
||||
| Line | Meaning |
|
||||
|---|---|
|
||||
| `PROMPT <escaped_text>` | Send prompt for the most recent `PROMPT_REQ` |
|
||||
| `REWARD <r1> <r2> … <rN>` | Send N advantage scores in `[0, 1]` range |
|
||||
| `STOP` | Request graceful shutdown after current step |
|
||||
|
||||
**Text encoding:** newlines in generation text are escaped as the two-character sequence `\n`; backslashes are doubled. Use `unescape()` from `grpo_example.py` (or any equivalent) to recover the original text.
|
||||
|
||||
### Writing your own driver
|
||||
|
||||
`grpo_example.py` contains two functions you replace with your own logic:
|
||||
|
||||
```python
|
||||
def get_prompt(step: int) -> str:
|
||||
"""Return the training prompt for step N."""
|
||||
...
|
||||
|
||||
def score_generations(prompt: str, generations: List[str]) -> List[float]:
|
||||
"""Score each generation. Any numeric range — will be normalised."""
|
||||
...
|
||||
```
|
||||
|
||||
The IPC helpers (`escape`, `unescape`, `parse_ipc`, `read_ipc`, `write_cmd`, `wait_for`, `normalise_rewards`) are standalone and have no external dependencies — copy them into your own project if needed.
|
||||
|
||||
### Training loop diagram
|
||||
|
||||
```
|
||||
Python C++ (llama-finetune-qlora --grpo-mode)
|
||||
│ │
|
||||
│◄──── [QLORA:READY] ────────────┤ model loaded
|
||||
│ │
|
||||
│ ┌─────────────────────────────┤
|
||||
│ │ for each step: │
|
||||
│ │ ◄── PROMPT_REQ:N ─────────┤
|
||||
│ │ ──► PROMPT <text> ────────► generate n_gen rollouts
|
||||
│ │ ◄── GEN:1/n <text> ──┤
|
||||
│ │ ◄── GEN:2/n <text> ──┤
|
||||
│ │ ... │
|
||||
│ │ ◄── GEN:n/n <text> ──┤
|
||||
│ │ ◄── REWARD_REQ:n ─────────┤
|
||||
│ │ (score generations) │
|
||||
│ │ ──► REWARD a1 a2 … an ────► one backward + AdamW step
|
||||
│ │ ◄── PROGRESS step=N/M … ──┤
|
||||
│ └─────────────────────────────┤
|
||||
│ │
|
||||
│◄──── [QLORA:DONE] ─────────────┤ adapter saved
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Known Limitations & Optimization Roadmap
|
||||
|
||||
### Current limitations
|
||||
|
||||
**1. Full backprop through frozen quantized layers**
|
||||
Every backward step dequantizes all frozen Q4_K_M weight tensors to compute activation gradients (needed to propagate loss from the output back to each LoRA layer). For a 28-layer 1.7B model at `-ub 512`, this is ~280 dequantizing matmuls per step → step time is 3–5× slower than inference.
|
||||
|
||||
**2. Activation VRAM** *(partially addressed by `--grad-checkpoint`)*
|
||||
All forward activations are kept in VRAM throughout the backward pass. VRAM ≈ `model + KV + n_layers × hidden × n_ubatch × 10 × 4B + 2 × lora_params × 4B`. Reducing `-ub` reduces VRAM linearly. Use `--grad-checkpoint 48` to prevent the allocator from reusing intermediate activation buffers during backward, which cuts peak activation VRAM at near-zero compute cost.
|
||||
|
||||
**3. Full backprop through all layers** *(partially addressed by `--freeze-layers`)*
|
||||
Gradients propagate through all layers that have LoRA adapters. Use `--freeze-layers N` to skip LoRA allocation for blk.0..N-1 — those layers receive no gradient (the `grads_needed` pruner already skips their backward ops automatically). Only the top (total_layers - N) layers are trained.
|
||||
|
||||
### Optimization roadmap
|
||||
|
||||
| Priority | Optimization | Expected gain | Status |
|
||||
|---|---|---|---|
|
||||
| ✅ Done | **`--freeze-layers N`** — no LoRA on first N layers; backward auto-pruned | Proportional to N/total | Implemented |
|
||||
| ✅ Done | **`--grad-checkpoint N`** — keep every Nth activation alive through backward | Reduces peak activation VRAM | Implemented |
|
||||
| ✅ Done | **`--train-on-prompt`** — compute loss on prompt tokens too | Configurable loss target | Implemented |
|
||||
| ✅ Done | **`--shuffle-dataset`** — shuffle windows each epoch | Better convergence | Implemented |
|
||||
| ✅ Done | **BOS separators** — insert BOS between concatenated samples | Correct cross-sample boundaries | Implemented |
|
||||
| ✅ Done | **Per-epoch loss summary** — log train/val loss after each epoch | Observability | Implemented |
|
||||
| ✅ Done | **`MUL_MAT_ID` backward** — LoRA on MoE dense FFN layers; `OUT_PROD_ID` for scattered outer product | Unlocks Mixtral/Nemotron-MoE | Implemented |
|
||||
| ✅ Done | **Quantized `OUT_PROD`** — dequantize on GPU + cuBLAS for backward matmul | Full GPU training (no CPU fallback) | Implemented |
|
||||
| ✅ Done | **Reuse `ctx_compute_opt`** — allocate tensor metadata context once, `ggml_reset()` across ubatches | Eliminate ~0.5 s/step overhead | Implemented |
|
||||
| ❌ Skip | **Static training graphs** — KV mask shape changes per ubatch (`n_kv` grows); graph topology not static | Would need KV cache redesign | Not feasible |
|
||||
| Low | **`SSM_SCAN/CONV` backward** — enable LoRA on Mamba SSM layers | Unlocks NemotronH SSM layers | Planned |
|
||||
| Low | **GELU backward** — implement `ggml_gelu_back` kernel (UNARY + GLU) | Support GPT-2/Phi-style models | Planned (needs new CUDA/CPU kernels) |
|
||||
|
||||
---
|
||||
|
||||
## Implementation notes (for developers)
|
||||
|
||||
### Modified llama.cpp files
|
||||
|
||||
| File | Change |
|
||||
|---|---|
|
||||
| `ggml/src/ggml.c` | Backward graph fixes: `GET_ROWS` 3D, `SET_ROWS`, `MUL_MAT_ID`, `SSM_SCAN/CONV`, `FLASH_ATTN_EXT` all stop gradient; inplace-op assert → warn+skip |
|
||||
| `src/llama-context.cpp` | `opt_init`: scheduler and graph sized with inflated capacity before `ggml_opt_init`; `opt_epoch_iter`: per-ubatch timing instrumentation; reward scaling via `g_reward_weights` TLS |
|
||||
| `src/llama-adapter.cpp` | Repack-buft fallback for LoRA tensors: tries device-native buft before CPU |
|
||||
| `common/common.h` | Added `save_every`, `lora_freeze_layers`, `grad_checkpoint_interval`, `train_on_prompt`, `shuffle_dataset` fields |
|
||||
| `common/arg.cpp` | Added `--save-every`, `--freeze-layers`, `--grad-checkpoint`, `--train-on-prompt`, `--shuffle-dataset` arguments |
|
||||
| `include/llama.h` | Added `llama_opt_set_reward_weights()`; `grad_checkpoint_interval` in `llama_opt_params`; `shuffle` param in `llama_opt_epoch` |
|
||||
| `ggml/src/ggml-cuda/out-prod.cu` | `OUT_PROD` with quantized src0 (dequantize on GPU + cuBLAS); `OUT_PROD_ID` for MoE backward |
|
||||
| `ggml/src/ggml-cuda/ggml-cuda.cu` | `supports_op` for quantized `OUT_PROD` and `OUT_PROD_ID`; CPU-resident ids fix in `mul_mat_id` |
|
||||
| `ggml/include/ggml-opt.h` | Added `grad_checkpoint_interval` to `ggml_opt_params` |
|
||||
| `ggml/src/ggml-opt.cpp` | Gradient checkpointing: marks every Nth forward node `GGML_TENSOR_FLAG_OUTPUT` before backward build |
|
||||
|
||||
### Key invariants
|
||||
|
||||
- `params.use_mmap = false` — forced; mmap'd tensors can't have data written back
|
||||
- `params.flash_attn_type = DISABLED` — no backward impl for flash attention
|
||||
- `params.warmup = false` — warmup runs inference with PARAM tensors → segfault
|
||||
- `params.cache_type_k = F32` — training requires F32 KV (or BF16 with `--cache-type-k bf16`)
|
||||
- LoRA A/B tensors are marked `PARAM` via `ggml_set_param` on the tensors loaded by `llama_adapter_lora_init`, not on the pre-init scratch tensors in `lt.buf`
|
||||
- The adapter GGUF is pre-saved and loaded via `params.lora_adapters` BEFORE `common_init_from_params` so that `sched_reserve` includes LoRA graph nodes in its sizing
|
||||
|
||||
### Why opt_init inflation matters
|
||||
|
||||
`ggml_opt_init` captures `sched.get()` at construction time. The backward graph (`gb_grad`, `gb_opt`) is ~3–5× larger than the forward graph in node count. If the scheduler hash_set is sized only for the forward graph, `ggml_backend_sched_alloc_graph` on the backward graph will overflow it. We recreate `sched` with `inflated = fwd_nodes × 4` slots BEFORE calling `ggml_opt_init`.
|
||||
|
||||
### Reward weighting implementation
|
||||
|
||||
`llama_opt_set_reward_weights(weights, n)` sets thread-local `g_reward_weights`. In `opt_epoch`, each window reads `g_reward_weights[idata]` and passes it as `reward_scale` to `opt_epoch_iter`. Inside the iter loop, instead of writing `1.0f` for the correct token's label position in the cross-entropy label tensor, it writes `reward_scale`. Since cross-entropy loss = `-mean(label × log(softmax(logit)))`, scaling the label scales both loss and gradient identically.
|
||||
|
|
@ -0,0 +1,64 @@
|
|||
#!/usr/bin/env python3
"""Quick check of LoRA tensor norms in a GGUF file."""
import sys
import struct
import numpy as np

# GGUF metadata value types (per the GGUF spec):
#   0=uint8  1=int8  2=uint16  3=int16  4=uint32  5=int32  6=float32  7=bool
#   8=string 9=array 10=uint64 11=int64 12=float64
_SCALAR_SIZES = {0: 1, 1: 1, 2: 2, 3: 2, 4: 4, 5: 4, 6: 4, 7: 1, 10: 8, 11: 8, 12: 8}


def _skip_value(f, vtype):
    """Skip one GGUF metadata value of type `vtype` in stream `f`.

    Returns True if the value was consumed, False on an unknown type
    (in which case the stream position is undefined for further parsing).
    """
    if vtype == 8:  # string: u64 byte-length followed by the bytes
        slen = struct.unpack('<Q', f.read(8))[0]
        f.read(slen)
        return True
    if vtype == 9:  # array: u32 element type + u64 count + packed elements
        etype, count = struct.unpack('<IQ', f.read(12))
        for _ in range(count):
            if not _skip_value(f, etype):
                return False
        return True
    size = _SCALAR_SIZES.get(vtype)
    if size is None:
        return False
    f.read(size)
    return True


def read_gguf(path):
    """Print dims/dtype and (for F32 tensors) norm statistics for the first
    few tensors in the GGUF file at `path`.

    Only the tensor-info directory and the F32 tensor data are interpreted;
    KV metadata is skipped. Assumes the default data alignment of 32 bytes
    (note: a `general.alignment` KV entry, if present, is not honored).
    """
    with open(path, 'rb') as f:
        assert f.read(4) == b'GGUF'
        version = struct.unpack('<I', f.read(4))[0]
        n_tensors, n_kv = struct.unpack('<QQ', f.read(16))

        # Skip the KV metadata section.
        for _ in range(n_kv):
            klen = struct.unpack('<Q', f.read(8))[0]
            key = f.read(klen).decode()
            vtype = struct.unpack('<I', f.read(4))[0]
            if not _skip_value(f, vtype):
                print(f"unknown kv type {vtype} for key {key}, stopping")
                break

        # Read tensor infos.
        tensors = []
        for _ in range(n_tensors):
            nlen = struct.unpack('<Q', f.read(8))[0]
            name = f.read(nlen).decode()
            ndims = struct.unpack('<I', f.read(4))[0]
            dims = struct.unpack('<' + 'Q' * ndims, f.read(8 * ndims))
            dtype = struct.unpack('<I', f.read(4))[0]
            offset = struct.unpack('<Q', f.read(8))[0]
            tensors.append((name, dims, dtype, offset))

        # Data section starts at the next 32-byte boundary after the infos.
        pos = f.tell()
        align = 32
        data_start = (pos + align - 1) & ~(align - 1)

        print(f"\nFile: {path}")
        print(f"Tensors: {n_tensors}")

        for name, dims, dtype, offset in tensors[:10]:  # first 10
            if dtype != 0:  # only F32 (type 0) data is interpreted
                print(f"  {name}: dims={dims} type={dtype} (non-F32, skipping norm)")
                continue
            f.seek(data_start + offset)
            n = 1
            for d in dims:
                n *= d
            data = np.frombuffer(f.read(n * 4), dtype=np.float32)
            print(f"  {name}: dims={dims} norm={np.linalg.norm(data):.4f} max={np.abs(data).max():.4f} mean={np.abs(data).mean():.6f}")


if __name__ == '__main__':
    for p in sys.argv[1:]:
        try:
            read_gguf(p)
        except Exception as e:
            print(f"Error reading {p}: {e}")
|
||||
File diff suppressed because it is too large
Load Diff
|
|
@ -0,0 +1,397 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
grpo_example.py — Minimal GRPO training loop using llama-finetune-qlora --grpo-mode
|
||||
|
||||
Demonstrates the IPC protocol between the Python driver and the C++ subprocess.
|
||||
No external dependencies required — only Python stdlib.
|
||||
|
||||
Usage:
|
||||
python3 grpo_example.py \
|
||||
--model /path/to/model-q4_k_m.gguf \
|
||||
--lora-out /path/to/output-adapter.gguf \
|
||||
[--lora /path/to/resume-adapter.gguf] \
|
||||
[--binary /path/to/llama-finetune-qlora] \
|
||||
[--n-steps 200] \
|
||||
[--n-gen 8] \
|
||||
[--rank 16]
|
||||
|
||||
IPC Protocol (stdout from C++ process):
|
||||
[QLORA:READY] — process initialised
|
||||
[QLORA:PROMPT_REQ:<step>] — C++ requests a prompt for step N
|
||||
[QLORA:GEN:<k>/<n>] <text> — one generation (newlines escaped as \\n)
|
||||
[QLORA:REWARD_REQ:<n>] — C++ requests N reward scores
|
||||
[QLORA:PROGRESS] step=X/Y loss=Z epoch=A/B
|
||||
[QLORA:CHECKPOINT] <path>
|
||||
[QLORA:DONE] final_loss=X
|
||||
[QLORA:ERROR] <message>
|
||||
|
||||
Python → C++ stdin:
|
||||
PROMPT <escaped_text>
|
||||
REWARD <r1> <r2> ... <rN> (advantages, 0..1 range)
|
||||
STOP (request graceful shutdown)
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import logging
|
||||
import math
|
||||
import os
|
||||
import re
|
||||
import subprocess
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import List, Optional, Tuple
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format="%(asctime)s [%(levelname)s] %(message)s",
|
||||
)
|
||||
log = logging.getLogger("grpo_example")
|
||||
|
||||
# ──────────────────────────────────────────────────────────────────────────────
|
||||
# IPC helpers
|
||||
# ──────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
_IPC_RE = re.compile(r"^\[QLORA:([A-Z_]+)(?::([^\]]*))?\](.*)$")
|
||||
|
||||
|
||||
def escape(text: str) -> str:
    """Escape newlines and backslashes for single-line IPC transport."""
    # Backslashes must be doubled first; otherwise the '\\n' / '\\r'
    # sequences introduced below would themselves get re-escaped.
    for raw, encoded in (("\\", "\\\\"), ("\n", "\\n"), ("\r", "\\r")):
        text = text.replace(raw, encoded)
    return text
|
||||
|
||||
|
||||
def unescape(text: str) -> str:
    """Reverse of escape()."""
    # Map each recognised escape character to its decoded form; an
    # unrecognised escape drops the backslash and keeps the character,
    # and a lone trailing backslash is passed through unchanged.
    decoded = {"n": "\n", "r": "\r", "\\": "\\"}
    pieces = []
    idx = 0
    length = len(text)
    while idx < length:
        ch = text[idx]
        if ch == "\\" and idx + 1 < length:
            nxt = text[idx + 1]
            pieces.append(decoded.get(nxt, nxt))
            idx += 2
        else:
            pieces.append(ch)
            idx += 1
    return "".join(pieces)
|
||||
|
||||
|
||||
def parse_ipc(line: str) -> Optional[Tuple[str, str, str]]:
    """
    Parse an IPC line into (msg_type, seq, payload).
    Returns None for non-IPC lines (model output, log lines, etc.).
    """
    # re caches compiled patterns, so matching the literal pattern here is
    # equivalent to a module-level precompiled regex.
    match = re.match(r"^\[QLORA:([A-Z_]+)(?::([^\]]*))?\](.*)$", line.strip())
    if match is None:
        return None
    msg_type = match.group(1)
    seq = match.group(2) or ""
    payload = match.group(3).strip()
    return msg_type, seq, payload
|
||||
|
||||
|
||||
def read_ipc(proc: subprocess.Popen, timeout: float = 120.0) -> Optional[Tuple[str, str, str]]:
    """
    Read lines from proc.stdout until an IPC message arrives.
    Non-IPC lines (model output, C++ logs leaked to stdout) are printed.
    Returns None on EOF.
    Raises TimeoutError if nothing arrives within `timeout` seconds.

    NOTE(review): readline() below is a blocking call, so the timeout is
    only checked *between* lines — a subprocess that hangs without emitting
    any output will block past the deadline. Confirm this is acceptable, or
    move reads to a watchdog thread / selector if hard timeouts are needed.
    """
    deadline = time.monotonic() + timeout
    while True:
        # Check the remaining budget before each blocking read (see NOTE).
        remaining = deadline - time.monotonic()
        if remaining <= 0:
            raise TimeoutError(f"No IPC message within {timeout:.0f}s")

        line = proc.stdout.readline()
        if not line:
            return None  # EOF — the subprocess closed stdout (usually on exit)

        line = line.rstrip("\n")
        parsed = parse_ipc(line)
        if parsed:
            return parsed
        # Non-IPC — C++ sometimes leaks timing/debug lines to stdout.
        # Print them so the user can see what's happening.
        print(f" [cpp] {line}", file=sys.stderr)
|
||||
|
||||
|
||||
def write_cmd(proc: subprocess.Popen, cmd: str):
    """Send one newline-terminated command line to the subprocess stdin."""
    payload = cmd + "\n"
    try:
        proc.stdin.write(payload)
        proc.stdin.flush()
    except BrokenPipeError:
        # The child closed its stdin end — almost certainly it exited/crashed.
        raise RuntimeError("C++ subprocess stdin closed — did it crash?")
|
||||
|
||||
|
||||
def wait_for(proc: subprocess.Popen, expected: str, timeout: float = 120.0) -> Tuple[str, str, str]:
    """Block until the expected IPC message type arrives, skipping others."""
    deadline = time.monotonic() + timeout
    remaining = timeout
    while remaining > 0:
        parsed = read_ipc(proc, timeout=remaining)
        if parsed is None:
            raise RuntimeError(f"Subprocess exited before sending [{expected}]")
        msg_type, seq, payload = parsed
        if msg_type == expected:
            return msg_type, seq, payload
        # Anything else that arrives first is logged and discarded.
        log.debug("Ignoring unexpected IPC (%s) while waiting for %s", msg_type, expected)
        remaining = deadline - time.monotonic()
    raise TimeoutError(f"Timed out waiting for [{expected}]")
|
||||
|
||||
|
||||
# ──────────────────────────────────────────────────────────────────────────────
|
||||
# Advantage normalisation (GRPO)
|
||||
# ──────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
def normalise_rewards(rewards: List[float]) -> List[float]:
    """
    Group-relative advantage normalisation: subtract mean, divide by std.
    Clipped to [0, 1] so the C++ side always receives values in that range.

    All-equal rewards → uniform 0.5 (no signal, but no NaN either).
    """
    count = len(rewards)
    if count == 0:
        return []

    mean = sum(rewards) / count
    variance = sum((r - mean) ** 2 for r in rewards) / count
    # Degenerate (all-equal) groups divide by 1.0 so every z-score is 0.
    std = math.sqrt(variance) if variance > 1e-8 else 1.0

    clipped = []
    for r in rewards:
        z = (r - mean) / std
        # z-scores typically lie in [-3, +3]; map that span onto [0, 1].
        clipped.append(max(0.0, min(1.0, 0.5 + z / 6.0)))
    return clipped
|
||||
|
||||
|
||||
# ──────────────────────────────────────────────────────────────────────────────
|
||||
# Example prompt / reward providers
|
||||
# ──────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
# Replace these with your own logic.
|
||||
|
||||
_EXAMPLE_PROMPTS = [
    "Explain the concept of gradient descent in one sentence.",
    "What is the capital of France?",
    "Write a haiku about machine learning.",
    "Describe the difference between SFT and RLHF.",
    "What does GRPO stand for?",
]


def get_prompt(step: int) -> str:
    """Return a prompt for the given training step (0-indexed)."""
    # Cycle through the fixed demo prompts so any step count is valid.
    idx = step % len(_EXAMPLE_PROMPTS)
    return _EXAMPLE_PROMPTS[idx]
|
||||
|
||||
|
||||
def score_generations(prompt: str, generations: List[str]) -> List[float]:
    """
    Score a list of model generations for the given prompt.
    Returns a list of raw reward scores (any numeric range; will be normalised).

    Demo heuristic only: rewards longer, more lexically varied responses.
    Replace with your actual reward model / verifier.
    """
    def _heuristic(gen: str) -> float:
        # Length component saturates at 50 words; diversity is the
        # unique-word ratio (max(1, …) avoids division by zero on empty text).
        words = gen.split()
        length_score = min(1.0, len(words) / 50.0)
        vocab_score = min(1.0, len(set(words)) / max(1, len(words)))
        return 0.6 * length_score + 0.4 * vocab_score

    return [_heuristic(gen) for gen in generations]
|
||||
|
||||
|
||||
# ──────────────────────────────────────────────────────────────────────────────
|
||||
# Main GRPO loop
|
||||
# ──────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
def run_grpo(args: argparse.Namespace):
    """
    Launch llama-finetune-qlora in --grpo-mode and drive it to completion.

    Resolves the binary path, builds the subprocess command line from `args`,
    starts the child with piped stdin/stdout (stderr passes straight through),
    then hands control to _grpo_loop(). Ctrl-C sends a best-effort STOP; the
    finally-block always closes stdin and reaps the child.
    """
    # Resolve binary
    binary = Path(args.binary)
    if not binary.exists():
        log.error("Binary not found: %s", binary)
        sys.exit(1)

    # Build command.
    # NOTE(review): -b mirrors -c, -ub is fixed at 512, --grad-checkpoint at
    # 48, and --lora-alpha is derived as rank // 2 — confirm these are the
    # intended defaults for the target models.
    cmd = [
        str(binary),
        "--model", args.model,
        "--lora-out", args.lora_out,
        "--lora-rank", str(args.rank),
        "--lora-alpha", str(args.rank // 2),
        "-c", str(args.ctx_size),
        "-b", str(args.ctx_size),
        "-ub", "512",
        "-ngl", str(args.ngl),
        "-lr", str(args.lr),
        "--seed", str(args.seed),
        "--grad-checkpoint","48",
        "--shuffle-dataset",
        "--grpo-mode",
        "--n-gen", str(args.n_gen),
        "--n-steps", str(args.n_steps),
        "--grpo-temp", str(args.temperature),
        "--grpo-max-tokens",str(args.max_tokens),
    ]

    # Optional: resume from an existing adapter.
    if args.lora:
        cmd += ["--lora", args.lora]

    # Optional: periodic checkpointing.
    if args.save_every > 0:
        cmd += ["--save-every", str(args.save_every)]

    log.info("Launching: %s", " ".join(cmd))

    proc = subprocess.Popen(
        cmd,
        stdin=subprocess.PIPE,   # PROMPT/REWARD/STOP commands go here
        stdout=subprocess.PIPE,  # [QLORA:*] IPC messages arrive here
        stderr=sys.stderr,  # C++ debug/timing logs go directly to our stderr
        text=True,
        bufsize=1,               # line-buffered to match the line-based protocol
    )

    try:
        _grpo_loop(proc, args)
    except KeyboardInterrupt:
        log.info("Interrupted — requesting graceful stop")
        try:
            write_cmd(proc, "STOP")
        except Exception:
            pass  # best-effort: the child may already be gone
    except Exception as e:
        log.error("GRPO loop error: %s", e)
        proc.kill()
        raise
    finally:
        try:
            # Closing stdin gives the child EOF so it can exit cleanly.
            proc.stdin.close()
        except Exception:
            pass
        # NOTE(review): wait(timeout=30) can raise subprocess.TimeoutExpired
        # here, which would mask any in-flight exception — consider wrapping.
        rc = proc.wait(timeout=30)
        if rc not in (0, None):
            log.warning("Subprocess exited with code %d", rc)
|
||||
|
||||
|
||||
def _grpo_loop(proc: subprocess.Popen, args: argparse.Namespace):
    """
    Drive one full GRPO run over the stdout/stdin IPC protocol.

    Waits for READY, then loops: answer PROMPT_REQ with a prompt, collect
    GEN messages, answer REWARD_REQ with normalised advantages, and report
    PROGRESS/CHECKPOINT until DONE, ERROR, or EOF.
    """
    # ── Wait for READY ──────────────────────────────────────────────────────
    log.info("Waiting for subprocess to initialise (model load can take a minute)…")
    wait_for(proc, "READY", timeout=300)
    log.info("Subprocess ready.")

    # Per-step state: the prompt in flight and the generations gathered for it.
    current_prompt: str = ""
    generations: List[str] = []
    step = 0

    while True:
        parsed = read_ipc(proc, timeout=600)
        if parsed is None:
            log.info("Subprocess exited (EOF).")
            break

        msg_type, seq, payload = parsed

        # ── PROMPT_REQ ──────────────────────────────────────────────────────
        if msg_type == "PROMPT_REQ":
            # seq carries the step number; fall back to a local counter.
            step = int(seq) if seq else step + 1
            current_prompt = get_prompt(step - 1)
            generations = []
            log.debug("Step %d — sending prompt: %s", step, current_prompt[:60])
            write_cmd(proc, f"PROMPT {escape(current_prompt)}")

        # ── GEN ─────────────────────────────────────────────────────────────
        elif msg_type == "GEN":
            # seq = "k/n"
            parts = seq.split("/")
            k = int(parts[0])
            n = int(parts[1]) if len(parts) > 1 else args.n_gen
            text = unescape(payload)
            generations.append(text)
            log.debug(" Generation %d/%d: %s…", k, n, text[:60].replace("\n", "↵"))

        # ── REWARD_REQ ──────────────────────────────────────────────────────
        elif msg_type == "REWARD_REQ":
            # Sanity check: the child says how many scores it expects.
            n_expected = int(seq) if seq else len(generations)
            if len(generations) != n_expected:
                log.warning(
                    "REWARD_REQ asked for %d rewards but collected %d generations",
                    n_expected, len(generations),
                )

            raw_rewards = score_generations(current_prompt, generations)
            advantages = normalise_rewards(raw_rewards)

            reward_str = " ".join(f"{a:.6f}" for a in advantages)
            log.debug(" Rewards (raw): %s", [f"{r:.3f}" for r in raw_rewards])
            log.debug(" Advantages: %s", [f"{a:.3f}" for a in advantages])
            write_cmd(proc, f"REWARD {reward_str}")

        # ── PROGRESS ────────────────────────────────────────────────────────
        elif msg_type == "PROGRESS":
            # Format: step=X/Y loss=Z epoch=A/B
            sm = re.search(r"step=(\d+)(?:/(\d+))?", payload)
            lm = re.search(r"loss=([\d.]+)", payload)
            step_str = f"{sm.group(1)}/{sm.group(2)}" if sm and sm.group(2) else (sm.group(1) if sm else "?")
            loss_str = lm.group(1) if lm else "?"
            print(f" step {step_str} loss {loss_str}", flush=True)

        # ── CHECKPOINT ──────────────────────────────────────────────────────
        elif msg_type == "CHECKPOINT":
            log.info("Checkpoint saved: %s", payload.strip())

        # ── DONE ────────────────────────────────────────────────────────────
        elif msg_type == "DONE":
            m = re.search(r"final_loss=([\d.]+)", payload)
            loss = m.group(1) if m else "?"
            log.info("Training complete. final_loss=%s", loss)
            break

        # ── ERROR ────────────────────────────────────────────────────────────
        elif msg_type == "ERROR":
            log.error("C++ process error: %s", payload.strip())
            raise RuntimeError(f"Training failed: {payload.strip()}")

        else:
            # Forward-compat: unknown message types are logged and ignored.
            log.debug("Unknown IPC message: [%s] seq=%r payload=%r", msg_type, seq, payload)
|
||||
|
||||
|
||||
# ──────────────────────────────────────────────────────────────────────────────
|
||||
# CLI
|
||||
# ──────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
def parse_args() -> argparse.Namespace:
    """Build and parse the CLI for the GRPO example driver."""
    # Default binary: build/bin/ relative to this script's repo root
    script_dir = Path(__file__).resolve().parent
    repo_root = script_dir.parents[1]  # examples/qlora_training → llama.cpp root
    default_bin = repo_root / "build" / "bin" / "llama-finetune-qlora"

    parser = argparse.ArgumentParser(
        description="Minimal GRPO training loop via llama-finetune-qlora --grpo-mode",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    # Required I/O paths.
    parser.add_argument("--model", required=True, help="Base GGUF model path")
    parser.add_argument("--lora-out", required=True, help="Output adapter GGUF path")
    parser.add_argument("--lora", default=None, help="Resume from existing adapter GGUF")
    parser.add_argument("--binary", default=str(default_bin), help="Path to llama-finetune-qlora binary")
    # Training hyperparameters.
    parser.add_argument("--rank", type=int, default=16, help="LoRA rank")
    parser.add_argument("--n-steps", type=int, default=200, help="Number of GRPO steps")
    parser.add_argument("--n-gen", type=int, default=8, help="Generations per prompt")
    parser.add_argument("--lr", type=float, default=1e-4, help="Learning rate")
    parser.add_argument("--ctx-size", type=int, default=4096, help="Context window")
    parser.add_argument("--ngl", type=int, default=999, help="GPU layers (-ngl)")
    # Sampling / checkpointing / misc.
    parser.add_argument("--temperature", type=float, default=0.8, help="Sampling temperature")
    parser.add_argument("--max-tokens", type=int, default=512, help="Max tokens per generation")
    parser.add_argument("--save-every", type=int, default=0, help="Save checkpoint every N steps (0=off)")
    parser.add_argument("--seed", type=int, default=42, help="RNG seed")
    parser.add_argument("--verbose", action="store_true", help="Enable DEBUG logging")
    return parser.parse_args()
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # CLI entry point: parse arguments, optionally raise log verbosity,
    # then run the GRPO driver loop.
    args = parse_args()
    if args.verbose:
        logging.getLogger().setLevel(logging.DEBUG)
    run_grpo(args)
|
||||
|
|
@ -0,0 +1,7 @@
|
|||
{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "What is the capital of France?"}, {"role": "assistant", "content": "The capital of France is Paris."}]}
|
||||
{"messages": [{"role": "user", "content": "Explain gradient descent in one sentence."}, {"role": "assistant", "content": "Gradient descent iteratively adjusts model parameters in the direction that most reduces the loss function."}]}
|
||||
{"messages": [{"role": "system", "content": "You are a concise coding assistant."}, {"role": "user", "content": "Write a Python function that returns the factorial of n."}, {"role": "assistant", "content": "```python\ndef factorial(n):\n return 1 if n <= 1 else n * factorial(n - 1)\n```"}]}
|
||||
{"prompt": "What is supervised fine-tuning?", "response": "Supervised fine-tuning (SFT) trains a pre-trained model on labelled examples of the desired behaviour, adjusting weights to minimise cross-entropy loss on the target responses."}
|
||||
{"prompt": "What is LoRA?", "response": "LoRA (Low-Rank Adaptation) fine-tunes large models by injecting trainable low-rank matrices into frozen weight layers, dramatically reducing the number of trainable parameters."}
|
||||
{"text": "The transformer architecture, introduced in 'Attention Is All You Need' (Vaswani et al., 2017), relies entirely on self-attention mechanisms and has become the foundation of modern large language models."}
|
||||
{"text": "QLoRA combines quantization with LoRA: the base model is loaded in 4-bit precision (frozen), while only the small LoRA adapter weights are trained in full precision."}
|
||||
|
|
@ -0,0 +1,6 @@
|
|||
{"messages": [{"role": "user", "content": "Explain what a neural network is."}, {"role": "assistant", "content": "A neural network is a computational model loosely inspired by biological neurons, consisting of layers of interconnected nodes that learn to map inputs to outputs through training."}], "reward": 0.9}
|
||||
{"messages": [{"role": "user", "content": "Explain what a neural network is."}, {"role": "assistant", "content": "It's a thing that does AI stuff with nodes and layers."}], "reward": 0.2}
|
||||
{"messages": [{"role": "user", "content": "What is backpropagation?"}, {"role": "assistant", "content": "Backpropagation computes gradients of the loss with respect to each parameter by applying the chain rule backwards through the computation graph, enabling gradient descent to update all weights efficiently."}], "reward": 1.0}
|
||||
{"messages": [{"role": "user", "content": "What is backpropagation?"}, {"role": "assistant", "content": "Backpropagation is how neural networks learn by going backwards."}], "reward": 0.3}
|
||||
{"prompt": "Describe overfitting in machine learning.", "response": "Overfitting occurs when a model learns the training data too closely—capturing noise and idiosyncrasies—resulting in poor generalisation to unseen examples. Regularisation, dropout, and early stopping are common mitigations.", "reward": 0.95}
|
||||
{"prompt": "Describe overfitting in machine learning.", "response": "Overfitting is when the model is too good at training data but bad at new data.", "reward": 0.4}
|
||||
|
|
@ -89,6 +89,7 @@ extern "C" {
|
|||
float beta2; // second AdamW momentum
|
||||
float eps; // epsilon for numerical stability
|
||||
float wd; // weight decay - 0.0f to disable
|
||||
float gclip; // element-wise gradient clipping threshold - 0.0f to disable
|
||||
} adamw;
|
||||
struct {
|
||||
float alpha; // learning rate
|
||||
|
|
|
|||
|
|
@ -501,6 +501,7 @@ extern "C" {
|
|||
GGML_OP_MUL_MAT,
|
||||
GGML_OP_MUL_MAT_ID,
|
||||
GGML_OP_OUT_PROD,
|
||||
GGML_OP_OUT_PROD_ID, // scattered outer-product for MUL_MAT_ID backward (MoE LoRA)
|
||||
|
||||
GGML_OP_SCALE,
|
||||
GGML_OP_SET,
|
||||
|
|
@ -1424,6 +1425,21 @@ extern "C" {
|
|||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b);
|
||||
|
||||
// Scattered outer-product for the MUL_MAT_ID backward pass (MoE LoRA gradient).
|
||||
//
|
||||
// a: [cols, n_expert_used, n_tokens] F32 — activations
|
||||
// b: [rows, n_expert_used, n_tokens] F32 — upstream gradient
|
||||
// ids: [n_expert_used, n_tokens] I32 — expert dispatch indices
|
||||
// result: [cols, rows, n_expert, 1] F32
|
||||
//
|
||||
// result[:, :, e] += sum_{(i,t): ids[i,t]==e} a[:, i, t] ⊗ b[:, i, t]
|
||||
GGML_API struct ggml_tensor * ggml_out_prod_id(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b,
|
||||
struct ggml_tensor * ids,
|
||||
int64_t n_expert);
|
||||
|
||||
//
|
||||
// operations on tensors without backpropagation
|
||||
//
|
||||
|
|
|
|||
|
|
@ -11034,7 +11034,7 @@ static void ggml_compute_forward_opt_step_adamw_f32(
|
|||
GGML_ASSERT(ggml_are_same_shape(src0, src0_grad));
|
||||
GGML_ASSERT(ggml_are_same_shape(src0, src0_grad_m));
|
||||
GGML_ASSERT(ggml_are_same_shape(src0, src0_grad_v));
|
||||
GGML_ASSERT(ggml_nelements(adamw_params) == 7);
|
||||
GGML_ASSERT(ggml_nelements(adamw_params) == 8);
|
||||
|
||||
const int ith = params->ith;
|
||||
const int nth = params->nth;
|
||||
|
|
@ -11060,6 +11060,7 @@ static void ggml_compute_forward_opt_step_adamw_f32(
|
|||
const float wd = adamw_params_ptr[4];
|
||||
const float beta1h = adamw_params_ptr[5];
|
||||
const float beta2h = adamw_params_ptr[6];
|
||||
const float gclip = adamw_params_ptr[7]; // element-wise gradient clip (0 = disabled)
|
||||
const float keep = 1.f - alpha * wd;
|
||||
for (int ir = ir0; ir < ir1; ++ir) {
|
||||
const int64_t i03 = ir/(ne02*ne01);
|
||||
|
|
@ -11074,8 +11075,10 @@ static void ggml_compute_forward_opt_step_adamw_f32(
|
|||
float * v = (float *) ((char *) src0_grad_v->data + offset);
|
||||
|
||||
for (int i00 = 0; i00 < ne00; ++i00) {
|
||||
m[i00] = m[i00]*beta1 + g[i00]*(1.0f - beta1);
|
||||
v[i00] = v[i00]*beta2 + g[i00]*g[i00]*(1.0f - beta2);
|
||||
const float gi = (gclip > 0.0f) ? fmaxf(-gclip, fminf(gclip, g[i00])) : g[i00];
|
||||
|
||||
m[i00] = m[i00]*beta1 + gi*(1.0f - beta1);
|
||||
v[i00] = v[i00]*beta2 + gi*gi*(1.0f - beta2);
|
||||
|
||||
const float mh = m[i00]*beta1h;
|
||||
const float vh = sqrtf(v[i00]*beta2h) + eps;
|
||||
|
|
|
|||
|
|
@ -2642,6 +2642,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
|
|||
case GGML_OP_OUT_PROD:
|
||||
ggml_cuda_out_prod(ctx, dst);
|
||||
break;
|
||||
case GGML_OP_OUT_PROD_ID:
|
||||
ggml_cuda_out_prod_id(ctx, dst);
|
||||
break;
|
||||
case GGML_OP_SCALE:
|
||||
ggml_cuda_op_scale(ctx, dst);
|
||||
break;
|
||||
|
|
@ -4772,6 +4775,12 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
|
|||
} break;
|
||||
case GGML_OP_OUT_PROD:
|
||||
return op->type == GGML_TYPE_F32 && op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32;
|
||||
case GGML_OP_OUT_PROD_ID:
|
||||
return op->src[0] != nullptr && op->src[1] != nullptr && op->src[2] != nullptr
|
||||
&& op->type == GGML_TYPE_F32
|
||||
&& op->src[0]->type == GGML_TYPE_F32
|
||||
&& op->src[1]->type == GGML_TYPE_F32
|
||||
&& op->src[2]->type == GGML_TYPE_I32;
|
||||
case GGML_OP_GET_ROWS:
|
||||
{
|
||||
switch (op->src[0]->type) {
|
||||
|
|
|
|||
|
|
@ -20,8 +20,9 @@ static __global__ void opt_step_adamw_f32(
|
|||
const float wd = pars[4];
|
||||
const float beta1h = pars[5];
|
||||
const float beta2h = pars[6];
|
||||
const float gclip = pars[7]; // element-wise gradient clip (0 = disabled)
|
||||
|
||||
const float gi = g[i];
|
||||
const float gi = (gclip > 0.0f) ? fmaxf(-gclip, fminf(gclip, g[i])) : g[i];
|
||||
const float gmi = g_m[i]*beta1 + gi*(1.0f - beta1);
|
||||
const float gvi = g_v[i]*beta2 + gi*gi*(1.0f - beta2);
|
||||
|
||||
|
|
@ -62,7 +63,7 @@ void ggml_cuda_opt_step_adamw(ggml_backend_cuda_context & ctx, ggml_tensor * dst
|
|||
GGML_ASSERT(ggml_are_same_shape(src0, src0_grad));
|
||||
GGML_ASSERT(ggml_are_same_shape(src0, src0_grad_m));
|
||||
GGML_ASSERT(ggml_are_same_shape(src0, src0_grad_v));
|
||||
GGML_ASSERT(ggml_nelements(adamw_params) == 7);
|
||||
GGML_ASSERT(ggml_nelements(adamw_params) == 8);
|
||||
|
||||
float * src0_d = (float *) src0->data;
|
||||
const float * src0_grad_d = (const float *) src0_grad->data;
|
||||
|
|
|
|||
|
|
@ -1,6 +1,8 @@
|
|||
#include "out-prod.cuh"
|
||||
|
||||
#include <cstdint>
|
||||
#include <cstring>
|
||||
#include <vector>
|
||||
|
||||
void ggml_cuda_out_prod(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
|
|
@ -66,3 +68,115 @@ void ggml_cuda_out_prod(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ggml_cuda_out_prod_id
//
// Scattered outer-product for the MUL_MAT_ID backward pass (gradient w.r.t. expert weights).
//
// src0 = a   [cols, n_expert_used, n_tokens] F32 — token activations
// src1 = b   [rows, n_expert_used, n_tokens] F32 — upstream gradient
// src2 = ids [n_expert_used, n_tokens]       I32 — expert dispatch indices
// dst        [cols, rows, n_expert, 1]       F32 — gradient w.r.t. expert weight matrices
//
// dst[:, :, e] += sum_{(i,t): ids[i,t]==e} a[:, i, t] ⊗ b[:, i, t]
//
// Algorithm:
//   For each expert e: gather the token columns where ids[i,t]==e into contiguous
//   GPU buffers, then use cublasSgemm (beta=1) to accumulate the outer product.
//   ids may be CPU-resident (common in backward graphs where they are leaf tensors).
void ggml_cuda_out_prod_id(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * src0 = dst->src[0]; // a   [cols, n_exp_used, n_tokens]
    const ggml_tensor * src1 = dst->src[1]; // b   [rows, n_exp_used, n_tokens]
    const ggml_tensor * ids  = dst->src[2]; // ids [n_exp_used, n_tokens] i32

    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT(src1->type == GGML_TYPE_F32);
    GGML_ASSERT(ids->type  == GGML_TYPE_I32);
    GGML_ASSERT(dst->type  == GGML_TYPE_F32);

    const int64_t cols       = src0->ne[0];
    const int64_t n_exp_used = src0->ne[1];
    const int64_t n_tokens   = src0->ne[2];
    const int64_t rows       = src1->ne[0];
    const int64_t n_expert   = dst->ne[2];

    cudaStream_t   stream = ctx.stream();
    cublasHandle_t handle = ctx.cublas_handle();
    CUBLAS_CHECK(cublasSetStream(handle, stream));

    // Zero destination tensor before accumulating
    CUDA_CHECK(cudaMemsetAsync(dst->data, 0, ggml_nbytes(dst), stream));

    // Read ids to host — ids may be CPU-resident (backward graph leaf) or GPU-resident
    const size_t ids_nbytes = ggml_nbytes(ids);
    std::vector<char> ids_host(ids_nbytes);
    if (ids->buffer && !ggml_backend_buffer_is_host(ids->buffer)) {
        // GPU-resident: copy to host and synchronize so we can inspect the values
        CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids->data, ids_nbytes, cudaMemcpyDeviceToHost, stream));
        CUDA_CHECK(cudaStreamSynchronize(stream));
    } else {
        memcpy(ids_host.data(), ids->data, ids_nbytes);
    }

    // Build per-expert token list: expert_tokens[e] = list of flat indices (iexp*n_tokens+itok)
    // whose dispatch id equals e.
    std::vector<std::vector<int64_t>> expert_tokens(n_expert);
    for (int64_t itok = 0; itok < n_tokens; ++itok) {
        for (int64_t iexp = 0; iexp < n_exp_used; ++iexp) {
            // ids is indexed via its byte strides (nb) since it need not be contiguous.
            const int32_t eid = *(const int32_t *)(ids_host.data()
                + itok * ids->nb[1] + iexp * ids->nb[0]);
            GGML_ASSERT(eid >= 0 && eid < (int32_t)n_expert);
            expert_tokens[eid].push_back(iexp * n_tokens + itok);
        }
    }

    // Strides (in elements, not bytes)
    const int64_t a_stride_exp = src0->nb[1] / sizeof(float); // cols
    const int64_t a_stride_tok = src0->nb[2] / sizeof(float); // cols * n_exp_used
    const int64_t b_stride_exp = src1->nb[1] / sizeof(float); // rows
    const int64_t b_stride_tok = src1->nb[2] / sizeof(float); // rows * n_exp_used
    const int64_t dst_stride_e = dst->nb[2]  / sizeof(float); // cols * rows

    const float alpha_one = 1.0f;
    const float beta_acc  = 1.0f; // accumulate — dst is already zeroed above

    const float * a_base = (const float *) src0->data;
    const float * b_base = (const float *) src1->data;
    float       * d_base = (float *)       dst->data;

    for (int64_t e = 0; e < n_expert; ++e) {
        const auto & toks = expert_tokens[e];
        if (toks.empty()) {
            continue; // expert received no tokens this batch — gradient stays zero
        }

        const int64_t ntoks_e = (int64_t) toks.size();

        // Allocate contiguous gather buffers on GPU: a_e [cols, ntoks_e], b_e [rows, ntoks_e]
        ggml_cuda_pool_alloc<float> a_gathered(ctx.pool(), cols * ntoks_e);
        ggml_cuda_pool_alloc<float> b_gathered(ctx.pool(), rows * ntoks_e);

        // Gather token vectors from GPU src0/src1 into contiguous buffers.
        // NOTE(review): this issues one small D2D memcpy per (token, tensor)
        // pair — presumably fine for typical batch sizes, but a gather kernel
        // would cut launch overhead for large n_tokens; confirm if hot.
        for (int64_t ti = 0; ti < ntoks_e; ++ti) {
            const int64_t flat = toks[ti];
            const int64_t iexp = flat / n_tokens;
            const int64_t itok = flat % n_tokens;
            CUDA_CHECK(cudaMemcpyAsync(
                a_gathered.ptr + ti * cols,
                a_base + iexp * a_stride_exp + itok * a_stride_tok,
                cols * sizeof(float), cudaMemcpyDeviceToDevice, stream));
            CUDA_CHECK(cudaMemcpyAsync(
                b_gathered.ptr + ti * rows,
                b_base + iexp * b_stride_exp + itok * b_stride_tok,
                rows * sizeof(float), cudaMemcpyDeviceToDevice, stream));
        }

        // dst[:, :, e] += a_gathered @ b_gathered^T
        // cuBLAS column-major: A=[cols, ntoks_e] lda=cols, B=[rows, ntoks_e] ldb=rows
        CUBLAS_CHECK(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_T,
            (int)cols, (int)rows, (int)ntoks_e,
            &alpha_one, a_gathered.ptr, (int)cols,
                        b_gathered.ptr, (int)rows,
            &beta_acc,  d_base + e*dst_stride_e, (int)cols));
    }
}
|
||||
|
|
|
|||
|
|
@ -1,3 +1,10 @@
|
|||
#include "common.cuh"
|
||||
|
||||
void ggml_cuda_out_prod(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
|
||||
// Scattered outer-product for MUL_MAT_ID backward (gradient w.r.t. expert weight matrices).
|
||||
// src0: activations [cols, n_expert_used, n_tokens] F32
|
||||
// src1: grad_output [rows, n_expert_used, n_tokens] F32
|
||||
// src2: expert ids [n_expert_used, n_tokens] I32 (may be CPU-resident)
|
||||
// dst: grad_weight [cols, rows, n_expert, 1] F32
|
||||
void ggml_cuda_out_prod_id(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
|
|
|
|||
|
|
@ -230,6 +230,7 @@ struct ggml_opt_optimizer_params ggml_opt_get_default_optimizer_params(void * us
|
|||
result.adamw.beta2 = 0.999f;
|
||||
result.adamw.eps = 1e-8f;
|
||||
result.adamw.wd = 0.0f;
|
||||
result.adamw.gclip = 0.0f;
|
||||
|
||||
result.sgd.alpha = 1e-3f;
|
||||
result.sgd.wd = 0.0f;
|
||||
|
|
@ -503,7 +504,7 @@ static void ggml_opt_build(ggml_opt_context_t opt_ctx) {
|
|||
// gb_opt == graph backward optimize, forward pass, then backward pass to calculate gradients, then optimizer step.
|
||||
opt_ctx->gb_opt = ggml_graph_dup(opt_ctx->ctx_compute, opt_ctx->gb_grad, /*force_grads =*/ true);
|
||||
|
||||
opt_ctx->opt_step_params = ggml_new_tensor_1d(opt_ctx->ctx_cpu, GGML_TYPE_F32, need_momenta ? 7 : 2);
|
||||
opt_ctx->opt_step_params = ggml_new_tensor_1d(opt_ctx->ctx_cpu, GGML_TYPE_F32, need_momenta ? 8 : 2);
|
||||
ggml_tensor * adamw_params = opt_ctx->opt_step_params;
|
||||
ggml_set_input(adamw_params);
|
||||
const char * optimizer_name = ggml_opt_optimizer_name(opt_ctx->optimizer);
|
||||
|
|
@ -726,6 +727,17 @@ void ggml_opt_alloc(ggml_opt_context_t opt_ctx, bool backward) {
|
|||
if (opt_ctx->build_type == GGML_OPT_BUILD_TYPE_OPT && opt_ctx->opt_period > 1 && opt_ctx->opt_i == 0) {
|
||||
ggml_graph_reset(opt_ctx->gb_grad);
|
||||
}
|
||||
|
||||
// For non-static graphs the compute graph is rebuilt every call, so ggml_graph_reset
|
||||
// is not called and grad_accs may carry over values from the previous accumulation window.
|
||||
// Explicitly zero them at the start of each gradient-accumulation cycle.
|
||||
if (!opt_ctx->static_graphs && backward && opt_ctx->opt_i == 0) {
|
||||
for (struct ggml_tensor * ga : opt_ctx->grad_accs) {
|
||||
if (ga) {
|
||||
ggml_set_zero(ga);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (backward) {
|
||||
const int32_t opt_i_next = (opt_ctx->opt_i + 1) % opt_ctx->opt_period;
|
||||
opt_ctx->build_type = opt_i_next == 0 ? GGML_OPT_BUILD_TYPE_OPT : GGML_OPT_BUILD_TYPE_GRAD;
|
||||
|
|
@ -793,6 +805,7 @@ void ggml_opt_eval(ggml_opt_context_t opt_ctx, ggml_opt_result_t result) {
|
|||
GGML_ASSERT(opt_pars.adamw.eps >= 0.0f);
|
||||
GGML_ASSERT(opt_pars.adamw.wd >= 0.0f);
|
||||
GGML_ASSERT(opt_pars.adamw.wd <= 1.0f);
|
||||
GGML_ASSERT(opt_pars.adamw.gclip >= 0.0f);
|
||||
|
||||
// beta1, beta2 after applying warmup
|
||||
const float beta1h = 1.0f / (1.0f - powf(opt_pars.adamw.beta1, opt_ctx->iter));
|
||||
|
|
@ -806,6 +819,7 @@ void ggml_opt_eval(ggml_opt_context_t opt_ctx, ggml_opt_result_t result) {
|
|||
adamw_par_data[4] = opt_pars.adamw.wd;
|
||||
adamw_par_data[5] = beta1h;
|
||||
adamw_par_data[6] = beta2h;
|
||||
adamw_par_data[7] = opt_pars.adamw.gclip;
|
||||
} break;
|
||||
case GGML_OPT_OPTIMIZER_TYPE_SGD: {
|
||||
GGML_ASSERT(opt_pars.sgd.alpha > 0.0f);
|
||||
|
|
|
|||
102
ggml/src/ggml.c
102
ggml/src/ggml.c
|
|
@ -976,6 +976,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
|
|||
"MUL_MAT",
|
||||
"MUL_MAT_ID",
|
||||
"OUT_PROD",
|
||||
"OUT_PROD_ID",
|
||||
|
||||
"SCALE",
|
||||
"SET",
|
||||
|
|
@ -1049,7 +1050,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
|
|||
"GLU",
|
||||
};
|
||||
|
||||
static_assert(GGML_OP_COUNT == 96, "GGML_OP_COUNT != 96");
|
||||
static_assert(GGML_OP_COUNT == 97, "GGML_OP_COUNT != 97");
|
||||
|
||||
static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
||||
"none",
|
||||
|
|
@ -1086,6 +1087,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
|||
"X*Y",
|
||||
"X[i]*Y",
|
||||
"X*Y",
|
||||
"X_id⊗Y_id",
|
||||
|
||||
"x*v",
|
||||
"y-\\>view(x)",
|
||||
|
|
@ -1159,7 +1161,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
|||
"glu(x)",
|
||||
};
|
||||
|
||||
static_assert(GGML_OP_COUNT == 96, "GGML_OP_COUNT != 96");
|
||||
static_assert(GGML_OP_COUNT == 97, "GGML_OP_COUNT != 97");
|
||||
|
||||
static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
|
||||
|
||||
|
|
@ -3293,6 +3295,44 @@ struct ggml_tensor * ggml_out_prod(
|
|||
return result;
|
||||
}
|
||||
|
||||
// ggml_out_prod_id
|
||||
//
|
||||
// Scattered outer-product for the MUL_MAT_ID backward pass.
|
||||
//
|
||||
// a: [cols, n_expert_used, n_tokens] F32 — activations (src1 of MUL_MAT_ID)
|
||||
// b: [rows, n_expert_used, n_tokens] F32 — upstream gradient
|
||||
// ids: [n_expert_used, n_tokens] I32 — expert dispatch indices (src2 of MUL_MAT_ID)
|
||||
// result: [cols, rows, n_expert, 1] F32
|
||||
//
|
||||
// result[:, :, e] += sum_{(i,t): ids[i,t]==e} a[:, i, t] ⊗ b[:, i, t]
|
||||
//
|
||||
// Computes the gradient w.r.t. the expert weight matrices (src0) of MUL_MAT_ID.
|
||||
struct ggml_tensor * ggml_out_prod_id(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b,
|
||||
struct ggml_tensor * ids,
|
||||
int64_t n_expert) {
|
||||
GGML_ASSERT(a->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(b->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(ids->type == GGML_TYPE_I32);
|
||||
GGML_ASSERT(a->ne[1] == b->ne[1]); // n_expert_used matches
|
||||
GGML_ASSERT(a->ne[2] == b->ne[2]); // n_tokens matches
|
||||
GGML_ASSERT(ids->ne[0] == a->ne[1]); // n_expert_used matches ids
|
||||
GGML_ASSERT(ids->ne[1] == a->ne[2]); // n_tokens matches ids
|
||||
GGML_ASSERT(n_expert > 0);
|
||||
|
||||
const int64_t ne[4] = { a->ne[0], b->ne[0], n_expert, 1 };
|
||||
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
|
||||
|
||||
result->op = GGML_OP_OUT_PROD_ID;
|
||||
result->src[0] = a;
|
||||
result->src[1] = b;
|
||||
result->src[2] = ids;
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
// ggml_scale
|
||||
|
||||
static struct ggml_tensor * ggml_scale_impl(
|
||||
|
|
@ -6518,6 +6558,35 @@ static void ggml_compute_backward(
|
|||
grad))); // [m,p,qq,rr]
|
||||
}
|
||||
} break;
|
||||
case GGML_OP_MUL_MAT_ID: {
|
||||
// Backward pass for indirect matrix multiplication (MoE).
|
||||
//
|
||||
// Forward: dst[rows, n_exp_used, n_tokens] = as[:,:,ids[i,t]] @ b[:,i,t]
|
||||
// src0 = as [cols, rows, n_expert] — expert weight matrices
|
||||
// src1 = b [cols, n_exp_used, n_tokens] — token activations
|
||||
// src2 = ids [n_exp_used, n_tokens] — expert dispatch indices (I32)
|
||||
//
|
||||
// Gradient w.r.t. src1 (activations):
|
||||
// grad_b[:,i,t] = as[:,:,ids[i,t]]^T @ grad[:,i,t]
|
||||
// → computed via MUL_MAT_ID with transposed as
|
||||
//
|
||||
// Gradient w.r.t. src0 (expert weights, only when F32 i.e. LoRA):
|
||||
// grad_as[:,:,e] += sum_{(i,t): ids[i,t]==e} b[:,i,t] ⊗ grad[:,i,t]
|
||||
// → computed via OUT_PROD_ID
|
||||
//
|
||||
// Quantized src0 is frozen (stop-gradient) — handled in grads_needed below.
|
||||
if (src0_needs_grads) {
|
||||
const int64_t n_expert = src0->ne[2];
|
||||
struct ggml_tensor * grad_as = ggml_out_prod_id(ctx, src1, grad, src2, n_expert);
|
||||
ggml_add_or_set(ctx, cgraph, isrc0, grad_as);
|
||||
}
|
||||
if (src1_needs_grads) {
|
||||
// Transpose expert matrices: as [cols, rows, n_expert] → as_T [rows, cols, n_expert]
|
||||
struct ggml_tensor * as_T = ggml_cont(ctx, ggml_permute(ctx, src0, 1, 0, 2, 3));
|
||||
struct ggml_tensor * grad_b = ggml_mul_mat_id(ctx, as_T, grad, src2);
|
||||
ggml_add_or_set(ctx, cgraph, isrc1, grad_b);
|
||||
}
|
||||
} break;
|
||||
case GGML_OP_SCALE: {
|
||||
if (src0_needs_grads) {
|
||||
float s;
|
||||
|
|
@ -6964,6 +7033,35 @@ void ggml_build_backward_expand(
|
|||
ignore_src[1] = true;
|
||||
break;
|
||||
|
||||
// MUL_MAT_ID: expert dispatch indices (src2) are integer — no gradient.
|
||||
// When src0 is quantized the expert weights are frozen, so stop gradient through
|
||||
// both src0 and src1 (activations have no path to loss without differentiable weights).
|
||||
case GGML_OP_MUL_MAT_ID:
|
||||
if (ggml_is_quantized(node->src[0]->type)) {
|
||||
ignore_src[0] = true;
|
||||
ignore_src[1] = true;
|
||||
}
|
||||
ignore_src[2] = true; // ids: integer tensor
|
||||
break;
|
||||
|
||||
// SET_ROWS is a KV-cache scatter write. The gradient of the written data flows
|
||||
// through the attention read path (GET_ROWS backward), not through this node.
|
||||
case GGML_OP_SET_ROWS:
|
||||
ignore_src[0] = true;
|
||||
ignore_src[1] = true;
|
||||
break;
|
||||
|
||||
// Ops with no backward implementation — stop gradient through all sources so the
|
||||
// backward graph builder never tries to propagate through them.
|
||||
case GGML_OP_SSM_CONV: // Mamba causal conv1d
|
||||
case GGML_OP_SSM_SCAN: // Mamba selective scan
|
||||
case GGML_OP_FLASH_ATTN_EXT: // use standard attention for training
|
||||
ignore_src[0] = true;
|
||||
ignore_src[1] = true;
|
||||
ignore_src[2] = true;
|
||||
ignore_src[3] = true;
|
||||
break;
|
||||
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue