diff --git a/common/arg.cpp b/common/arg.cpp index 10aa1b5e4f..4ab6114c49 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -3607,32 +3607,108 @@ common_params_context common_params_parser_init(common_params & params, llama_ex { "-lr", "--learning-rate" }, "ALPHA", string_format("adamw or sgd optimizer alpha (default: %.2g); note: sgd alpha recommended ~10x (no momentum)", (double) params.lr.lr0), [](common_params & params, const std::string & value) { params.lr.lr0 = std::stof(value); } - ).set_examples({ LLAMA_EXAMPLE_FINETUNE })); + ).set_examples({ LLAMA_EXAMPLE_FINETUNE, LLAMA_EXAMPLE_FINETUNE_QLORA })); add_opt(common_arg({ "-lr-min", "--learning-rate-min" }, "ALPHA", string_format("(if >0) final learning rate after decay (if -decay-epochs is set, default=%.2g)", (double) params.lr.lr_min), [](common_params & params, const std::string & value) { params.lr.lr_min = std::stof(value); } - ).set_examples({ LLAMA_EXAMPLE_FINETUNE })); + ).set_examples({ LLAMA_EXAMPLE_FINETUNE, LLAMA_EXAMPLE_FINETUNE_QLORA })); add_opt(common_arg( {"-decay-epochs", "--learning-rate-decay-epochs"}, "ALPHA", string_format("(if >0) decay learning rate to -lr-min after this many epochs (exponential decay, default=%.2g)", (double) params.lr.decay_epochs), [](common_params & params, const std::string & value) { params.lr.decay_epochs = std::stof(value); } - ).set_examples({ LLAMA_EXAMPLE_FINETUNE })); + ).set_examples({ LLAMA_EXAMPLE_FINETUNE, LLAMA_EXAMPLE_FINETUNE_QLORA })); add_opt(common_arg( {"-wd", "--weight-decay"}, "WD", string_format("adamw or sgd optimizer weight decay (0 is off; recommend very small e.g. 
1e-9) (default: %.2g).", (double) params.lr.wd), [](common_params & params, const std::string & value) { params.lr.wd = std::stof(value); } - ).set_examples({ LLAMA_EXAMPLE_FINETUNE })); + ).set_examples({ LLAMA_EXAMPLE_FINETUNE, LLAMA_EXAMPLE_FINETUNE_QLORA })); add_opt(common_arg( {"-val-split", "--val-split"}, "FRACTION", string_format("fraction of data to use as validation set for training (default: %.2g).", (double) params.val_split), [](common_params & params, const std::string & value) { params.val_split = std::stof(value); } - ).set_examples({ LLAMA_EXAMPLE_FINETUNE })); + ).set_examples({ LLAMA_EXAMPLE_FINETUNE, LLAMA_EXAMPLE_FINETUNE_QLORA })); + // qlora flags + add_opt(common_arg( + {"--lora-rank"}, "N", + string_format("LoRA rank r (default: %d)", params.lora_rank), + [](common_params & params, int value) { params.lora_rank = value; } + ).set_examples({ LLAMA_EXAMPLE_FINETUNE_QLORA })); + add_opt(common_arg( + {"--lora-alpha"}, "F", + string_format("LoRA alpha (default: %d = use rank value)", (int) params.lora_alpha), + [](common_params & params, const std::string & value) { params.lora_alpha = std::stof(value); } + ).set_examples({ LLAMA_EXAMPLE_FINETUNE_QLORA })); + add_opt(common_arg( + {"--lora-targets"}, "SUBSTRINGS", + string_format("comma-separated substrings of tensor names to add LoRA to (default: %s)", params.lora_targets.c_str()), + [](common_params & params, const std::string & value) { params.lora_targets = value; } + ).set_examples({ LLAMA_EXAMPLE_FINETUNE_QLORA })); + add_opt(common_arg( + {"--lora-out"}, "FNAME", + string_format("output LoRA adapter GGUF path (default: %s)", params.lora_out.c_str()), + [](common_params & params, const std::string & value) { params.lora_out = value; } + ).set_examples({ LLAMA_EXAMPLE_FINETUNE_QLORA })); + add_opt(common_arg( + {"--train-file"}, "FNAME", + "JSONL training dataset (fields: messages|prompt+response|text)", + [](common_params & params, const std::string & value) { params.train_file = value; 
} + ).set_examples({ LLAMA_EXAMPLE_FINETUNE_QLORA })); + add_opt(common_arg( + {"--save-every"}, "N", + "save adapter checkpoint every N dataset windows during training (default: 0 = only at end)", + [](common_params & params, int value) { params.save_every = value; } + ).set_examples({ LLAMA_EXAMPLE_FINETUNE_QLORA })); + add_opt(common_arg( + {"--freeze-layers"}, "N", + "freeze first N transformer layers — no LoRA adapters allocated for blk.0..blk.N-1 (default: 0 = train all layers)", + [](common_params & params, int value) { params.lora_freeze_layers = value; } + ).set_examples({ LLAMA_EXAMPLE_FINETUNE_QLORA })); + add_opt(common_arg( + {"--grad-checkpoint"}, "N", + "gradient checkpointing interval to reduce peak activation VRAM (0 = disabled, default: 0)", + [](common_params & params, int value) { params.grad_checkpoint_interval = value; } + ).set_examples({ LLAMA_EXAMPLE_FINETUNE_QLORA })); + add_opt(common_arg( + {"--train-on-prompt"}, + "compute loss on prompt tokens too, not just the response (default: response-only loss)", + [](common_params & params) { params.train_on_prompt = true; } + ).set_examples({ LLAMA_EXAMPLE_FINETUNE_QLORA })); + add_opt(common_arg( + {"--shuffle-dataset"}, + "shuffle dataset windows at the start of each epoch (default: sequential order)", + [](common_params & params) { params.shuffle_dataset = true; } + ).set_examples({ LLAMA_EXAMPLE_FINETUNE_QLORA })); + add_opt(common_arg( + {"--grpo-mode"}, + "enable GRPO IPC training loop (prompts and rewards supplied via stdin/stdout)", + [](common_params & params) { params.grpo_mode = true; } + ).set_examples({ LLAMA_EXAMPLE_FINETUNE_QLORA })); + add_opt(common_arg( + {"--n-gen"}, "N", + string_format("GRPO: number of generations per prompt (default: %d)", params.grpo_n_gen), + [](common_params & params, int value) { params.grpo_n_gen = value; } + ).set_examples({ LLAMA_EXAMPLE_FINETUNE_QLORA })); + add_opt(common_arg( + {"--n-steps"}, "N", + string_format("GRPO: total optimizer steps 
(default: %d)", params.grpo_n_steps), + [](common_params & params, int value) { params.grpo_n_steps = value; } + ).set_examples({ LLAMA_EXAMPLE_FINETUNE_QLORA })); + add_opt(common_arg( + {"--grpo-temp"}, "F", + string_format("GRPO: sampling temperature for rollout generation (default: %.2f)", (double) params.grpo_temperature), + [](common_params & params, const std::string & value) { params.grpo_temperature = std::stof(value); } + ).set_examples({ LLAMA_EXAMPLE_FINETUNE_QLORA })); + add_opt(common_arg( + {"--grpo-max-tokens"}, "N", + string_format("GRPO: max tokens per generation (default: %d)", params.grpo_max_tokens), + [](common_params & params, int value) { params.grpo_max_tokens = value; } + ).set_examples({ LLAMA_EXAMPLE_FINETUNE_QLORA })); add_opt(common_arg( {"-epochs", "--epochs"}, "N", string_format("optimizer max # of epochs (default: %d)", params.lr.epochs), [](common_params & params, int epochs) { params.lr.epochs = epochs; } - ).set_examples({ LLAMA_EXAMPLE_FINETUNE })); + ).set_examples({ LLAMA_EXAMPLE_FINETUNE, LLAMA_EXAMPLE_FINETUNE_QLORA })); add_opt(common_arg( {"-opt", "--optimizer"}, "sgd|adamw", "adamw or sgd", [](common_params & params, const std::string & name) { @@ -3641,7 +3717,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex throw std::invalid_argument("invalid --optimizer, valid options: adamw, sgd"); } } - ).set_examples({ LLAMA_EXAMPLE_FINETUNE })); + ).set_examples({ LLAMA_EXAMPLE_FINETUNE, LLAMA_EXAMPLE_FINETUNE_QLORA })); add_opt(common_arg( {"--check"}, string_format("check rather than generate results (default: %s)", params.check ? 
"true" : "false"), diff --git a/common/common.h b/common/common.h index ee7a2d805e..4fdf6f4cdb 100644 --- a/common/common.h +++ b/common/common.h @@ -103,6 +103,7 @@ enum llama_example { LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_DIFFUSION, LLAMA_EXAMPLE_FINETUNE, + LLAMA_EXAMPLE_FINETUNE_QLORA, LLAMA_EXAMPLE_FIT_PARAMS, LLAMA_EXAMPLE_RESULTS, LLAMA_EXAMPLE_EXPORT_GRAPH_OPS, @@ -518,7 +519,26 @@ struct common_params { // finetune struct lr_opt lr; enum ggml_opt_optimizer_type optimizer = GGML_OPT_OPTIMIZER_TYPE_ADAMW; - float val_split = 0.05f; // fraction of the data used for the validation set + float val_split = 0.05f; // fraction of the data used for the validation set + + // qlora fine-tuning + int32_t lora_rank = 16; // LoRA rank (r) + float lora_alpha = 0.0f; // LoRA alpha (0 = use rank value) + std::string lora_targets = "attn_q,attn_output,ffn_gate,ffn_up,ffn_down"; // comma-separated substrings to match trainable tensors + std::string lora_out = "adapter.gguf"; // output adapter GGUF path + std::string train_file = ""; // JSONL training dataset path + int32_t save_every = 0; // save checkpoint every N dataset windows (0 = save only at end) + int32_t lora_freeze_layers = 0; // do not apply LoRA to the first N transformer layers + int32_t grad_checkpoint_interval = 0; // gradient checkpointing interval to reduce peak VRAM (0 = disabled) + bool train_on_prompt = false; // include prompt tokens in training loss (default: response tokens only) + bool shuffle_dataset = false; // shuffle dataset windows at the start of each epoch + + // grpo training + bool grpo_mode = false; // enable GRPO IPC training loop + int32_t grpo_n_gen = 8; // generations per prompt + int32_t grpo_n_steps = 500; // total GRPO optimizer steps + float grpo_temperature = 0.8f; // sampling temperature for rollouts + int32_t grpo_max_tokens = 512; // max tokens per generation // embedding bool embedding = false; // get only sentence embedding diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt 
index a29dc707c3..93f72fc321 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -34,6 +34,7 @@ else() add_subdirectory(speculative-simple) add_subdirectory(gen-docs) add_subdirectory(training) + add_subdirectory(qlora_training) add_subdirectory(diffusion) if (NOT GGML_BACKEND_DL) add_subdirectory(convert-llama2c-to-ggml) diff --git a/examples/qlora_training/CMakeLists.txt b/examples/qlora_training/CMakeLists.txt new file mode 100644 index 0000000000..5ba9ec8afa --- /dev/null +++ b/examples/qlora_training/CMakeLists.txt @@ -0,0 +1,5 @@ +set(TARGET llama-finetune-qlora) +add_executable(${TARGET} finetune_qlora.cpp) +install(TARGETS ${TARGET} RUNTIME) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/qlora_training/README.md b/examples/qlora_training/README.md new file mode 100644 index 0000000000..bc3b6fcca9 --- /dev/null +++ b/examples/qlora_training/README.md @@ -0,0 +1,331 @@ +# llama.cpp — Native QLoRA Training + +Native QLoRA + Reward-Weighted SFT training pipeline for quantized GGUF models. + +The base model weights remain **frozen** (quantized tensors are skipped by `llama_set_param` because they are not `GGML_TYPE_F32`). Only freshly-allocated F32 LoRA A/B tensors are trained. The saved adapter GGUF is directly compatible with the existing `llama_adapter_lora_init` loader and `llama-export-lora` merge tool. + +**Status:** Working. Phase 1 (QLoRA SFT) and Phase 2 (Reward-Weighted SFT) are implemented and functional. Training speed is currently limited by full backprop through quantized weights — see [Known Limitations](#known-limitations). 
+ +--- + +## Build + +```bash +cd /mnt/w/llm-trading-arena/unsloth-api/llama.cpp + +# First time (CUDA build): +cmake -B build -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=ON -DLLAMA_CURL=OFF +cmake --build build -j$(nproc) + +# Incremental rebuild (after code changes): +cmake --build build --target llama-finetune-qlora -j$(nproc) +# If llama-adapter.cpp or llama-context.cpp changed, rebuild all: +cmake --build build -j$(nproc) +``` + +--- + +## Phase 1 — QLoRA SFT (`llama-finetune-qlora`) + +Trains LoRA adapters on a quantized GGUF model. + +### Recommended command (1.7B model, 16 GB card) + +```bash +./build/bin/llama-finetune-qlora \ + --model ~/qwen3-1.7b-q4_k_m.gguf \ + --train-file data/train.jsonl \ + --lora-rank 16 --lora-alpha 16 \ + -c 4096 -b 4096 -ub 512 \ + --save-every 10 \ + --lora-out ~/adapter.gguf \ + --epochs 3 --seed 42 +``` + +### Recommended command (15B model, 16 GB card, partial offload) + +```bash +./build/bin/llama-finetune-qlora \ + --model ~/nemotron-15b-q4_k_m.gguf \ + --train-file data/train.jsonl \ + --lora-rank 16 --lora-alpha 16 \ + -ngl 13 -c 14336 -b 14336 -ub 1024 \ + --save-every 8 \ + --lora-out ~/nemotron-lora.gguf \ + --epochs 3 --seed 42 +``` + +### All flags + +| Flag | Default | Description | +|---|---|---| +| `--model` | *(required)* | Path to quantized GGUF model | +| `--train-file` | *(required)* | JSONL training dataset | +| `--lora-rank` | `16` | LoRA rank r | +| `--lora-alpha` | `0` (= rank) | LoRA alpha; effective scale = alpha/rank | +| `--lora-targets` | see below | Comma-separated internal tensor name substrings | +| `--lora-out` | `adapter.gguf` | Output adapter GGUF path (supports `~`) | +| `--save-every` | `0` | Save checkpoint every N dataset windows (0 = end only) | +| `--freeze-layers` | `0` | Skip LoRA on first N transformer layers (blk.0..N-1); backward already pruned automatically | +| `--grad-checkpoint` | `0` | Mark every Nth forward node persistent to reduce activation VRAM; good values: 32–64 | +| 
`--train-on-prompt` | off | Compute loss on prompt tokens too (default: response-only loss) | +| `--shuffle-dataset` | off | Shuffle dataset windows at the start of each epoch | +| `--val-split` | `0.0` | Fraction of data to hold out for validation (e.g. `0.1` = 10%); val loss logged per epoch | +| `-epochs` / `--epochs` | `3` | Training epochs | +| `-c` / `--ctx-size` | `512` | Training context window (tokens) | +| `-b` / `--batch-size` | `2048` | Tokens per `llama_decode` call; set equal to `-c` | +| `-ub` / `--ubatch-size` | `512` | GPU micro-batch tokens; controls VRAM vs. step time | +| `-ngl` | `999` | GPU layers to offload | +| `-lr` / `--learning-rate` | `1e-4` | AdamW learning rate | +| `--seed` | `42` | Random seed for LoRA init | + +### VRAM vs. step-time tradeoff + +Step time and VRAM both scale linearly with `-ub`: + +| Model | `-ub` | VRAM | Step time (approx) | +|---|---|---|---| +| 1.7B Q4_K_M | 512 | ~18 GB | ~120 s (OOM on 16 GB) | +| 1.7B Q4_K_M | 128 | ~6 GB | ~30 s | +| 15B Q4_K_M | 1024 | ~11 GB | ~60 s | + +Use `-c` equal to your target sequence length. More context = more windows per sample = more steps per epoch. Reducing `-c` reduces total training time proportionally. 
+ +### Default LoRA targets + +llama.cpp uses **internal GGUF tensor names**, not HuggingFace names: + +| llama.cpp internal | HuggingFace equivalent | Status | +|---|---|---| +| `attn_q` | `q_proj` | ✅ default target, trainable | +| `attn_output` | `o_proj` | ✅ default target, trainable | +| `ffn_gate` | `gate_proj` | ✅ default target, trainable | +| `ffn_up` | `up_proj` | ✅ default target, trainable | +| `ffn_down` | `down_proj` | ✅ default target, trainable | +| `attn_k` | `k_proj` | ❌ not in defaults — zero gradient (KV scatter via SET_ROWS) | +| `attn_v` | `v_proj` | ❌ not in defaults — zero gradient (KV scatter via SET_ROWS) | +| `ssm_in` | `in_proj` | ❌ not in defaults — zero gradient (SSM_SCAN no backward) | +| `ssm_out` | `out_proj` | ❌ not in defaults — zero gradient (SSM_SCAN no backward) | + +**MoE models:** Expert tensors (`*_exps`) are excluded regardless of `--lora-targets`. The quantized expert weights are frozen (stop-gradient), but LoRA on the dense FFN layers (`ffn_gate`, `ffn_up`, `ffn_down`) works — backward via `MUL_MAT_ID` + `OUT_PROD_ID`. + +### Dataset format (JSONL) + +**Chat format** (loss on response only; use `--train-on-prompt` for all tokens): +```json +{"messages": [{"role": "user", "content": "Hello"}, {"role": "assistant", "content": "Hi!"}]} +``` + +**Prompt/response** (loss on response only): +```json +{"prompt": "What is the capital of France?", "response": "Paris."} +``` + +**Plain text** (loss on all tokens): +```json +{"text": "The quick brown fox."} +``` + +**With reward** (Phase 2 — scales gradient by reward): +```json +{"prompt": "...", "response": "...", "reward": 0.85} +``` + +Rewards are normalized per epoch: clipped to `[-1, 1]`, then min-max scaled to `[0, 1]`. Reward 0 = sample ignored; reward 1 = full gradient. 
+ +### Verify and use the adapter + +```bash +# Hot-load for inference (no merge needed) +./build/bin/llama-cli --model base.gguf --lora adapter.gguf -p "Hello" + +# Merge into base model +./build/bin/llama-export-lora \ + --model base.gguf --lora adapter.gguf --output merged.gguf +``` + +--- + +## Phase 2 — Reward-Weighted SFT + +Built into `llama-finetune-qlora`. When the dataset contains a `reward` or `score` field, the cross-entropy loss for that sample is scaled by the reward before backprop. No extra flags needed — detection is automatic. + +--- + +## Phase 3 — GRPO (Online RL via IPC) + +`llama-finetune-qlora --grpo-mode` implements a full GRPO training loop where the Python process owns prompt sampling and reward scoring, and the C++ process owns model state, generation, and gradient updates. + +### Quick start + +```bash +python3 examples/qlora_training/grpo_example.py \ + --model ~/qwen3-1.7b-q4_k_m.gguf \ + --lora-out ~/grpo-adapter.gguf \ + --rank 16 --n-steps 200 --n-gen 8 +``` + +For verbose output (includes IPC message trace): + +```bash +python3 examples/qlora_training/grpo_example.py \ + --model ~/qwen3-1.7b-q4_k_m.gguf \ + --lora-out ~/grpo-adapter.gguf \ + --verbose +``` + +Resume from a checkpoint: + +```bash +python3 examples/qlora_training/grpo_example.py \ + --model ~/qwen3-1.7b-q4_k_m.gguf \ + --lora ~/grpo-adapter.ckpt50.gguf \ + --lora-out ~/grpo-adapter.gguf +``` + +### GRPO-specific flags + +| Flag | Default | Description | +|---|---|---| +| `--grpo-mode` | off | Enable GRPO IPC mode | +| `--n-gen` | `8` | Rollouts per prompt | +| `--n-steps` | `500` | Total GRPO steps | +| `--grpo-temp` | `0.8` | Sampling temperature for rollouts | +| `--grpo-max-tokens` | `512` | Max tokens per generation | + +All standard flags (`--lora-rank`, `-lr`, `-c`, `-ngl`, `--save-every`, etc.) work in GRPO mode too. `--train-file` is **not** required in GRPO mode. 
+ +### IPC protocol + +The protocol is line-based over stdout (C++ → Python) and stdin (Python → C++). All non-protocol C++ output (timing, debug, model logs) goes to **stderr** and never contaminates the protocol channel. + +**C++ → Python (stdout):** + +| Line | When | +|---|---| +| `[QLORA:READY]` | Process initialised, model loaded | +| `[QLORA:PROMPT_REQ:<step>]` | C++ requests the prompt for step N | +| `[QLORA:GEN:<i>/<n>] <text>` | One generation (newlines escaped as `\n`) | +| `[QLORA:REWARD_REQ:<n>]` | C++ requests N reward scores | +| `[QLORA:PROGRESS] step=X/Y loss=Z epoch=A/B` | After each weight update | +| `[QLORA:CHECKPOINT] <path>` | After saving a checkpoint | +| `[QLORA:DONE] final_loss=X` | Training complete | +| `[QLORA:ERROR] <message>` | Fatal error | + +**Python → C++ (stdin):** + +| Line | Meaning | +|---|---| +| `PROMPT <text>` | Send prompt for the most recent `PROMPT_REQ` | +| `REWARD <r1> <r2> … <rN>` | Send N advantage scores in `[0, 1]` range | +| `STOP` | Request graceful shutdown after current step | + +**Text encoding:** newlines in generation text are escaped as the two-character sequence `\n`; backslashes are doubled. Use `unescape()` from `grpo_example.py` (or any equivalent) to recover the original text. + +### Writing your own driver + +`grpo_example.py` contains two functions you replace with your own logic: + +```python +def get_prompt(step: int) -> str: + """Return the training prompt for step N.""" + ... + +def score_generations(prompt: str, generations: List[str]) -> List[float]: + """Score each generation. Any numeric range — will be normalised.""" + ... +``` + +The IPC helpers (`escape`, `unescape`, `parse_ipc`, `read_ipc`, `write_cmd`, `wait_for`, `normalise_rewards`) are standalone and have no external dependencies — copy them into your own project if needed. 
+ +### Training loop diagram + +``` +Python C++ (llama-finetune-qlora --grpo-mode) + │ │ + │◄──── [QLORA:READY] ────────────┤ model loaded + │ │ + │ ┌─────────────────────────────┤ + │ │ for each step: │ + │ │ ◄── PROMPT_REQ:N ─────────┤ + │ │ ──► PROMPT ────────► generate n_gen rollouts + │ │ ◄── GEN:1/n ──┤ + │ │ ◄── GEN:2/n ──┤ + │ │ ... │ + │ │ ◄── GEN:n/n ──┤ + │ │ ◄── REWARD_REQ:n ─────────┤ + │ │ (score generations) │ + │ │ ──► REWARD a1 a2 … an ────► one backward + AdamW step + │ │ ◄── PROGRESS step=N/M … ──┤ + │ └─────────────────────────────┤ + │ │ + │◄──── [QLORA:DONE] ─────────────┤ adapter saved +``` + +--- + +## Known Limitations & Optimization Roadmap + +### Current limitations + +**1. Full backprop through frozen quantized layers** +Every backward step dequantizes all frozen Q4_K_M weight tensors to compute activation gradients (needed to propagate loss from the output back to each LoRA layer). For a 28-layer 1.7B model at `-ub 512`, this is ~280 dequantizing matmuls per step → step time is 3–5× slower than inference. + +**2. Activation VRAM** *(partially addressed by `--grad-checkpoint`)* +All forward activations are kept in VRAM throughout the backward pass. VRAM ≈ `model + KV + n_layers × hidden × n_ubatch × 10 × 4B + 2 × lora_params × 4B`. Reducing `-ub` reduces VRAM linearly. Use `--grad-checkpoint 48` to prevent the allocator from reusing intermediate activation buffers during backward, which cuts peak activation VRAM at near-zero compute cost. + +**3. Full backprop through all layers** *(partially addressed by `--freeze-layers`)* +Gradients propagate through all layers that have LoRA adapters. Use `--freeze-layers N` to skip LoRA allocation for blk.0..N-1 — those layers receive no gradient (the `grads_needed` pruner already skips their backward ops automatically). Only the top (total_layers - N) layers are trained. 
+ +### Optimization roadmap + +| Priority | Optimization | Expected gain | Status | +|---|---|---|---| +| ✅ Done | **`--freeze-layers N`** — no LoRA on first N layers; backward auto-pruned | Proportional to N/total | Implemented | +| ✅ Done | **`--grad-checkpoint N`** — keep every Nth activation alive through backward | Reduces peak activation VRAM | Implemented | +| ✅ Done | **`--train-on-prompt`** — compute loss on prompt tokens too | Configurable loss target | Implemented | +| ✅ Done | **`--shuffle-dataset`** — shuffle windows each epoch | Better convergence | Implemented | +| ✅ Done | **BOS separators** — insert BOS between concatenated samples | Correct cross-sample boundaries | Implemented | +| ✅ Done | **Per-epoch loss summary** — log train/val loss after each epoch | Observability | Implemented | +| ✅ Done | **`MUL_MAT_ID` backward** — LoRA on MoE dense FFN layers; `OUT_PROD_ID` for scattered outer product | Unlocks Mixtral/Nemotron-MoE | Implemented | +| ✅ Done | **Quantized `OUT_PROD`** — dequantize on GPU + cuBLAS for backward matmul | Full GPU training (no CPU fallback) | Implemented | +| ✅ Done | **Reuse `ctx_compute_opt`** — allocate tensor metadata context once, `ggml_reset()` across ubatches | Eliminate ~0.5 s/step overhead | Implemented | +| ❌ Skip | **Static training graphs** — KV mask shape changes per ubatch (`n_kv` grows); graph topology not static | Would need KV cache redesign | Not feasible | +| Low | **`SSM_SCAN/CONV` backward** — enable LoRA on Mamba SSM layers | Unlocks NemotronH SSM layers | Planned | +| Low | **GELU backward** — implement `ggml_gelu_back` kernel (UNARY + GLU) | Support GPT-2/Phi-style models | Planned (needs new CUDA/CPU kernels) | + +--- + +## Implementation notes (for developers) + +### Modified llama.cpp files + +| File | Change | +|---|---| +| `ggml/src/ggml.c` | Backward graph fixes: `GET_ROWS` 3D, `SET_ROWS`, `MUL_MAT_ID`, `SSM_SCAN/CONV`, `FLASH_ATTN_EXT` all stop gradient; inplace-op assert → warn+skip | +| 
`src/llama-context.cpp` | `opt_init`: scheduler and graph sized with inflated capacity before `ggml_opt_init`; `opt_epoch_iter`: per-ubatch timing instrumentation; reward scaling via `g_reward_weights` TLS | +| `src/llama-adapter.cpp` | Repack-buft fallback for LoRA tensors: tries device-native buft before CPU | +| `common/common.h` | Added `save_every`, `lora_freeze_layers`, `grad_checkpoint_interval`, `train_on_prompt`, `shuffle_dataset` fields | +| `common/arg.cpp` | Added `--save-every`, `--freeze-layers`, `--grad-checkpoint`, `--train-on-prompt`, `--shuffle-dataset` arguments | +| `include/llama.h` | Added `llama_opt_set_reward_weights()`; `grad_checkpoint_interval` in `llama_opt_params`; `shuffle` param in `llama_opt_epoch` | +| `ggml/src/ggml-cuda/out-prod.cu` | `OUT_PROD` with quantized src0 (dequantize on GPU + cuBLAS); `OUT_PROD_ID` for MoE backward | +| `ggml/src/ggml-cuda/ggml-cuda.cu` | `supports_op` for quantized `OUT_PROD` and `OUT_PROD_ID`; CPU-resident ids fix in `mul_mat_id` | +| `ggml/include/ggml-opt.h` | Added `grad_checkpoint_interval` to `ggml_opt_params` | +| `ggml/src/ggml-opt.cpp` | Gradient checkpointing: marks every Nth forward node `GGML_TENSOR_FLAG_OUTPUT` before backward build | + +### Key invariants + +- `params.use_mmap = false` — forced; mmap'd tensors can't have data written back +- `params.flash_attn_type = DISABLED` — no backward impl for flash attention +- `params.warmup = false` — warmup runs inference with PARAM tensors → segfault +- `params.cache_type_k = F32` — training requires F32 KV (or BF16 with `--cache-type-k bf16`) +- LoRA A/B tensors are marked `PARAM` via `ggml_set_param` on the tensors loaded by `llama_adapter_lora_init`, not on the pre-init scratch tensors in `lt.buf` +- The adapter GGUF is pre-saved and loaded via `params.lora_adapters` BEFORE `common_init_from_params` so that `sched_reserve` includes LoRA graph nodes in its sizing + +### Why opt_init inflation matters + +`ggml_opt_init` captures `sched.get()` 
at construction time. The backward graph (`gb_grad`, `gb_opt`) is ~3–5× larger than the forward graph in node count. If the scheduler hash_set is sized only for the forward graph, `ggml_backend_sched_alloc_graph` on the backward graph will overflow it. We recreate `sched` with `inflated = fwd_nodes × 4` slots BEFORE calling `ggml_opt_init`. + +### Reward weighting implementation + +`llama_opt_set_reward_weights(weights, n)` sets thread-local `g_reward_weights`. In `opt_epoch`, each window reads `g_reward_weights[idata]` and passes it as `reward_scale` to `opt_epoch_iter`. Inside the iter loop, instead of writing `1.0f` for the correct token's label position in the cross-entropy label tensor, it writes `reward_scale`. Since cross-entropy loss = `-mean(label × log(softmax(logit)))`, scaling the label scales both loss and gradient identically. diff --git a/examples/qlora_training/check_lora_norms.py b/examples/qlora_training/check_lora_norms.py new file mode 100644 index 0000000000..c54ed4e7d1 --- /dev/null +++ b/examples/qlora_training/check_lora_norms.py @@ -0,0 +1,64 @@ +#!/usr/bin/env python3 +"""Quick check of LoRA tensor norms in a GGUF file.""" +import sys, struct, numpy as np + +def read_gguf(path): + with open(path, 'rb') as f: + assert f.read(4) == b'GGUF' + version = struct.unpack(' +#include +#include + +#define JSON_ASSERT GGML_ASSERT +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// --------------------------------------------------------------------------- +// Helpers +// --------------------------------------------------------------------------- + +// Expand a leading ~/ to the HOME directory (the shell doesn't do this for us +// when a path is passed as a string argument to std::ofstream). 
+static std::string expand_tilde(const std::string & path) { + if (path.size() >= 2 && path[0] == '~' && path[1] == '/') { + const char * home = getenv("HOME"); + if (!home) home = getenv("USERPROFILE"); // Windows fallback + if (home) return std::string(home) + path.substr(1); + } + return path; +} + +static std::vector<std::string> split_csv(const std::string & s) { + std::vector<std::string> out; + std::istringstream ss(s); + std::string tok; + while (std::getline(ss, tok, ',')) { + if (!tok.empty()) out.push_back(tok); + } + return out; +} + +// Tensors whose names contain these substrings use MUL_MAT_ID (sparse MoE expert dispatch) +// which has no backward implementation — exclude them from LoRA targets unconditionally. +static const std::vector<std::string> EXCLUDED_SUBSTRINGS = { + "_exps", // MoE expert weight stacks (ffn_gate_exps, ffn_up_exps, ffn_down_exps, ffn_gate_up_exps) +}; + +static bool tensor_is_excluded(const char * name) { + const std::string n(name); + for (const auto & ex : EXCLUDED_SUBSTRINGS) { + if (n.find(ex) != std::string::npos) return true; + } + return false; +} + +// Extract the transformer block index from a tensor name of the form "blk.NN.". +// Returns -1 if the name does not follow this pattern. +static int tensor_layer_index(const char * name) { + // All per-layer tensors in llama.cpp GGUF are named "blk.<idx>.<name>" + const char * p = strstr(name, "blk."); + if (!p) return -1; + p += 4; // skip "blk." + char * end = nullptr; + long idx = strtol(p, &end, 10); + if (end == p || (*end != '.' 
&& *end != '\0')) return -1; + return (int) idx; +} + +static bool tensor_matches_targets(const char * name, const std::vector<std::string> & targets, + int freeze_layers = 0) { + if (tensor_is_excluded(name)) return false; + if (freeze_layers > 0) { + const int layer = tensor_layer_index(name); + if (layer >= 0 && layer < freeze_layers) return false; + } + for (const auto & t : targets) { + if (std::string(name).find(t) != std::string::npos) return true; + } + return false; +} + +// --------------------------------------------------------------------------- +// JSONL dataset loading +// --------------------------------------------------------------------------- + +struct training_sample { + std::vector<llama_token> tokens; // full token sequence + std::vector<bool> is_label; // true for tokens that contribute to loss + float reward; // reward/score weight (1.0 = neutral, 0.0 = ignore) +}; + +// Apply a very simple ChatML fallback template when the model has no template. +static std::string apply_chatml(const std::vector<common_chat_msg> & msgs) { + std::string out; + for (const auto & m : msgs) { + out += "<|im_start|>" + m.role + "\n"; + // content_parts is a vector<common_chat_msg_content_part>; build a plain text string + std::string text; + if (!m.content_parts.empty()) { + for (const auto & p : m.content_parts) { + text += p.text; + } + } + out += text + "<|im_end|>\n"; + } + return out; +} + +static std::vector<training_sample> load_jsonl( + const std::string & path, + llama_context * ctx, + common_chat_templates * tmpls) { + + std::ifstream f(path); + if (!f.is_open()) { + LOG_ERR("%s: cannot open %s\n", __func__, path.c_str()); + return {}; + } + + std::vector<training_sample> samples; + std::string line; + int lineno = 0; + + while (std::getline(f, line)) { + ++lineno; + if (line.empty()) continue; + + nlohmann::json j; + try { j = nlohmann::json::parse(line); } + catch (...) 
{ + LOG_WRN("%s: skipping invalid JSON on line %d\n", __func__, lineno); + continue; + } + + float reward = 1.0f; + if (j.contains("reward")) reward = j["reward"].get<float>(); + else if (j.contains("score")) reward = j["score"].get<float>(); + + std::string prompt_text; + std::string response_text; + + if (j.contains("messages")) { + // chat format — apply template + std::vector<common_chat_msg> msgs; + for (const auto & m : j["messages"]) { + common_chat_msg msg; + msg.role = m.value("role", "user"); + common_chat_msg_content_part part; + part.type = "text"; + part.text = m.value("content", ""); + msg.content_parts.push_back(part); + msgs.push_back(msg); + } + + // Skip samples where the last assistant turn contains an error marker. + // These are malformed/failed generations that should not be trained on. + { + std::string last_assistant_content; + for (int mi = (int)msgs.size() - 1; mi >= 0; --mi) { + if (msgs[mi].role == "assistant") { + last_assistant_content = msgs[mi].content_parts.empty() + ? "" : msgs[mi].content_parts[0].text; + break; + } + } + // // this should be done on the python side... + // if (last_assistant_content.find("Error:") != std::string::npos || + // last_assistant_content.find("error:") != std::string::npos) { + // LOG_DBG("%s: skipping line %d — assistant response contains error marker\n", __func__, lineno); + // continue; + // } + } + + // Split into prompt (no loss) + last assistant response (loss). + // Render all messages except the last assistant turn as the prompt + // (with add_generation_prompt=true so the template adds the assistant + // prefix), then use the raw last assistant content as response_text. + // This ensures only the assistant's response tokens get loss, not the + // user turns or system prompt. 
+ if (msgs.empty()) continue; + std::string last_assistant_content; + std::vector prompt_msgs; + // Find the last assistant message + int last_asst_idx = -1; + for (int mi = (int)msgs.size() - 1; mi >= 0; --mi) { + if (msgs[mi].role == "assistant") { last_asst_idx = mi; break; } + } + if (last_asst_idx < 0) { + // No assistant turn — skip; nothing to train on + LOG_DBG("%s: skipping line %d — no assistant turn\n", __func__, lineno); + continue; + } + last_assistant_content = msgs[last_asst_idx].content_parts.empty() + ? "" : msgs[last_asst_idx].content_parts[0].text; + for (int mi = 0; mi < last_asst_idx; ++mi) prompt_msgs.push_back(msgs[mi]); + + if (tmpls) { + common_chat_templates_inputs inp; + inp.messages = prompt_msgs; + inp.add_generation_prompt = true; + prompt_text = common_chat_templates_apply(tmpls, inp).prompt; + response_text = last_assistant_content; + } else { + // Fallback: render everything as ChatML, use full text as response + std::vector all_msgs = prompt_msgs; + all_msgs.push_back(msgs[last_asst_idx]); + prompt_text = ""; + response_text = apply_chatml(all_msgs); + } + } else if (j.contains("prompt") && j.contains("response")) { + response_text = j["response"].get(); + // // this should be done on the python side... 
+ // if (response_text.find("Error:") != std::string::npos || + // response_text.find("error:") != std::string::npos) { + // LOG_DBG("%s: skipping line %d — response contains error marker\n", __func__, lineno); + // continue; + // } + prompt_text = j["prompt"].get(); + } else if (j.contains("text")) { + response_text = j["text"].get(); + } else { + LOG_WRN("%s: unknown format on line %d, skipping\n", __func__, lineno); + continue; + } + + // Tokenize: prompt (no loss) + response (loss) + auto tok_prompt = common_tokenize(ctx, prompt_text, /*add_special=*/true); + auto tok_response = common_tokenize(ctx, response_text, /*add_special=*/false); + + if (tok_prompt.empty() && tok_response.empty()) continue; + + training_sample s; + s.reward = reward; + s.tokens.insert(s.tokens.end(), tok_prompt.begin(), tok_prompt.end()); + s.tokens.insert(s.tokens.end(), tok_response.begin(), tok_response.end()); + s.is_label.resize(s.tokens.size(), false); + // Only response tokens contribute to the loss + for (size_t i = tok_prompt.size(); i < s.tokens.size(); ++i) { + s.is_label[i] = true; + } + samples.push_back(std::move(s)); + } + + LOG_INF("%s: loaded %zu samples from %s\n", __func__, samples.size(), path.c_str()); + return samples; +} + +// Pack variable-length samples into fixed-context-length windows and create +// an ggml_opt_dataset. Labels for prompt tokens are set to -1 (ignored by +// the loss in the epoch loop). +// window_rewards is filled with one reward weight per window (averaged over +// the sample tokens that fall in that window). If all samples have reward=1.0 +// the vector is all-ones and has no effect. 
+static ggml_opt_dataset_t build_dataset( + const std::vector & samples, + int32_t n_ctx, + std::vector & window_rewards, + bool train_on_prompt = false, + llama_token bos_token = -1) { + + // Flatten samples into token/label/reward streams + std::vector flat_tokens; + std::vector flat_labels; // -1 = no loss, token_id = loss target + std::vector flat_rewards; // per-token reward from the source sample + + for (size_t si = 0; si < samples.size(); ++si) { + const auto & s = samples[si]; + + // Insert BOS separator between samples to prevent cross-sample predictions. + // The first sample already has BOS from tokenization (add_special=true). + if (si > 0 && bos_token >= 0 && !s.tokens.empty()) { + flat_tokens .push_back(bos_token); + flat_labels .push_back(-1); // no loss on separator + flat_rewards.push_back(s.reward); + } + + for (size_t i = 0; i + 1 < s.tokens.size(); ++i) { + flat_tokens .push_back(s.tokens[i]); + if (train_on_prompt) { + // All positions get correct next-token label (prompt + response) + flat_labels.push_back((int32_t)s.tokens[i + 1]); + } else { + // Only response positions get loss; prompt positions get -1 (sentinel). + // The sentinel is passed through to labels_sparse; opt_epoch_iter skips + // writing to the label tensor for those positions, leaving them zeroed → + // zero cross-entropy contribution. No gradient flows from prompt tokens. + flat_labels.push_back(s.is_label[i + 1] ? 
(int32_t)s.tokens[i + 1] : -1); + } + flat_rewards.push_back(s.reward); + } + } + + if ((int64_t)flat_tokens.size() < n_ctx) { + LOG_ERR("%s: dataset too small (%zu tokens) for context %d\n", + __func__, flat_tokens.size(), n_ctx); + return nullptr; + } + + const int64_t stride = n_ctx / 2; + int64_t ndata = ((int64_t)flat_tokens.size() - n_ctx) / stride; + if (ndata < 1) ndata = 1; // at least one window when flat_tokens >= n_ctx + + window_rewards.resize(ndata); + + ggml_opt_dataset_t dataset = ggml_opt_dataset_init( + GGML_TYPE_I32, GGML_TYPE_I32, n_ctx, n_ctx, ndata, 1); + + int32_t * data = (int32_t *) ggml_opt_dataset_data (dataset)->data; + int32_t * labels = (int32_t *) ggml_opt_dataset_labels(dataset)->data; + + for (int64_t i = 0; i < ndata; ++i) { + const int64_t off = i * stride; + float reward_sum = 0.0f; + for (int32_t j = 0; j < n_ctx; ++j) { + data [i * n_ctx + j] = flat_tokens[off + j]; + // Pass -1 sentinel through unchanged for masked (prompt) positions. + // opt_epoch_iter skips these positions (no label tensor write → zero + // cross-entropy contribution). Do NOT substitute the current token + // here — that trains the model to predict itself (off-by-one) and + // causes repetition degeneration. + labels[i * n_ctx + j] = flat_labels[off + j]; + reward_sum += flat_rewards[off + j]; + } + window_rewards[i] = reward_sum / n_ctx; + } + + // Normalize window rewards to [0, 1]. + // Step 1: clip to [-1, 1] — outliers like 1.3/1.4 would otherwise compress the + // useful signal range after min-max scaling (a reward=1.0 would map to + // only 0.83 instead of 1.0 if the max is 1.4). + // Step 2: min-max scale clipped values → [0, 1]. + // min → 0.0 (window ignored), max → 1.0 (full weight). + // If all rewards are identical (pure SFT dataset) keep at 1.0. 
+ for (float & r : window_rewards) { + r = std::max(-1.0f, std::min(1.0f, r)); + } + float rmin = *std::min_element(window_rewards.begin(), window_rewards.end()); + float rmax = *std::max_element(window_rewards.begin(), window_rewards.end()); + const float rrange = rmax - rmin; + if (rrange > 1e-6f) { + for (float & r : window_rewards) { + r = (r - rmin) / rrange; + } + LOG_INF("%s: reward range [%.4f, %.4f] (after clip to [-1,1]) → normalized to [0, 1]\n", __func__, rmin, rmax); + } else { + std::fill(window_rewards.begin(), window_rewards.end(), 1.0f); + } + + return dataset; +} + +// --------------------------------------------------------------------------- +// LoRA tensor allocation +// --------------------------------------------------------------------------- + +struct lora_tensors { + struct ggml_context * ctx = nullptr; + struct ggml_backend_buffer * buf = nullptr; + // map: base tensor name → {lora_a, lora_b} + std::unordered_map> ab; +}; + +static lora_tensors alloc_lora_tensors( + const std::string & model_path, + const std::vector & targets, + int32_t rank, + std::mt19937 & rng, + int32_t freeze_layers = 0) { + + lora_tensors lt; + + // Open the model GGUF to discover tensor names and shapes + // without needing access to private llama_model internals. 
+ struct ggml_context * ctx_meta = nullptr; + struct gguf_init_params gguf_params = { /*.no_alloc=*/true, /*.ctx=*/&ctx_meta }; + struct gguf_context * ctx_gguf = gguf_init_from_file(model_path.c_str(), gguf_params); + if (!ctx_gguf) { + LOG_ERR("%s: failed to open model GGUF for tensor discovery: %s\n", + __func__, model_path.c_str()); + return lt; + } + + // Collect matching 2-D tensors + struct tensor_info { std::string name; int64_t ne0, ne1; }; + std::vector matched; + + for (ggml_tensor * t = ggml_get_first_tensor(ctx_meta); + t; t = ggml_get_next_tensor(ctx_meta, t)) { + if (ggml_n_dims(t) < 2) continue; + if (!tensor_matches_targets(t->name, targets, freeze_layers)) continue; + matched.push_back({t->name, t->ne[0], t->ne[1]}); + } + + gguf_free(ctx_gguf); + ggml_free(ctx_meta); + + if (matched.empty()) { + LOG_ERR("%s: no model tensors matched --lora-targets; check spelling\n", __func__); + return lt; + } + + if (freeze_layers > 0) { + LOG_INF("%s: freezing layers blk.0 .. blk.%d (no LoRA allocated; backward already pruned by grads_needed)\n", + __func__, freeze_layers - 1); + } + LOG_INF("%s: allocating LoRA A/B tensors for %zu weight matrices, rank=%d\n", + __func__, matched.size(), rank); + + // Allocate ggml context for A+B tensors (2 tensors per matched weight) + const size_t mem = (2 * matched.size() + 16) * ggml_tensor_overhead(); + struct ggml_init_params ip = { mem, nullptr, /*no_alloc=*/true }; + lt.ctx = ggml_init(ip); + + for (const auto & ti : matched) { + const int64_t in_dim = ti.ne0; // columns (input features) + const int64_t out_dim = ti.ne1; // rows (output features) + + // lora_a: [in_dim, rank] applied first: a @ x + // lora_b: [rank, out_dim] applied second: b @ (a @ x) + // Convention matches llama-adapter.cpp:48-60: + // a->ne[0] == in_dim, a->ne[1] == rank + // b->ne[0] == rank, b->ne[1] == out_dim + ggml_tensor * la = ggml_new_tensor_2d(lt.ctx, GGML_TYPE_F32, in_dim, rank); + ggml_tensor * lb = ggml_new_tensor_2d(lt.ctx, 
GGML_TYPE_F32, rank, out_dim); + + ggml_set_name(la, (ti.name + ".lora_a").c_str()); + ggml_set_name(lb, (ti.name + ".lora_b").c_str()); + + lt.ab[ti.name] = {la, lb}; + } + + // Allocate backend buffer for all LoRA tensors at once + lt.buf = ggml_backend_alloc_ctx_tensors_from_buft(lt.ctx, ggml_backend_cpu_buffer_type()); + + // Initialize: A ~ N(0, 1/sqrt(rank)), B = 0 + const float std_a = 1.0f / std::sqrt((float)rank); + std::normal_distribution dist(0.0f, std_a); + + for (auto & kv : lt.ab) { + ggml_tensor * la = kv.second.first; + ggml_tensor * lb = kv.second.second; + + // Fill A + float * data_a = (float *) la->data; + for (int64_t i = 0; i < ggml_nelements(la); ++i) data_a[i] = dist(rng); + // Zero B + memset(lb->data, 0, ggml_nbytes(lb)); + } + + return lt; +} + +// --------------------------------------------------------------------------- +// Param filter: only train lora_a / lora_b tensors +// --------------------------------------------------------------------------- + +static bool lora_param_filter(const struct ggml_tensor * t, void * /*ud*/) { + const char * n = t->name; + const size_t len = strlen(n); + if (len > 7 && strcmp(n + len - 7, ".lora_a") == 0) return true; + if (len > 7 && strcmp(n + len - 7, ".lora_b") == 0) return true; + return false; +} + +// --------------------------------------------------------------------------- +// Save adapter GGUF +// --------------------------------------------------------------------------- + +static std::string basename_from_path(const std::string & p) { + const size_t pos = p.find_last_of("/\\"); + if (pos == std::string::npos) return p; + return p.substr(pos + 1); +} + +static void save_adapter( + const lora_tensors & lt, + const std::string & out_path, + const std::string & arch, + float alpha, + const std::string & base_model_path) { + + // Build output GGUF context + struct gguf_context * gctx = gguf_init_empty(); + + // Metadata required by llama_adapter_lora_init + gguf_set_val_str(gctx, 
"general.type", "adapter"); + gguf_set_val_str(gctx, "general.architecture", arch.c_str()); + gguf_set_val_str(gctx, "adapter.type", "lora"); + gguf_set_val_f32(gctx, "adapter.lora.alpha", alpha); + gguf_set_val_str(gctx, "adapter.base_model", basename_from_path(base_model_path).c_str()); + + // Register tensors + for (const auto & kv : lt.ab) { + gguf_add_tensor(gctx, kv.second.first); // lora_a + gguf_add_tensor(gctx, kv.second.second); // lora_b + } + + // Write: meta placeholder → tensor data → rewrite meta + const std::string real_path = expand_tilde(out_path); + std::ofstream fout(real_path, std::ios::binary); + if (!fout.is_open()) { + LOG_ERR("%s: cannot open %s for writing\n", __func__, real_path.c_str()); + gguf_free(gctx); + return; + } + + // Write meta placeholder + const size_t meta_size = gguf_get_meta_size(gctx); + std::vector zeros_buf(meta_size, 0); + fout.write(zeros_buf.data(), meta_size); + + // Write tensor data — copy to CPU first in case tensors live on GPU + for (const auto & kv : lt.ab) { + for (ggml_tensor * t : {kv.second.first, kv.second.second}) { + const size_t nb = ggml_nbytes(t); + std::vector cpu_buf(nb); + ggml_backend_tensor_get(t, cpu_buf.data(), 0, nb); + fout.write(cpu_buf.data(), nb); + // GGUF tensors are 32-byte aligned + const size_t pad = GGML_PAD(nb, 32) - nb; + if (pad > 0) { + std::vector pad_buf(pad, 0); + fout.write(pad_buf.data(), pad); + } + } + } + + // Re-write metadata at offset 0 + std::vector meta(meta_size); + gguf_get_meta_data(gctx, meta.data()); + fout.seekp(0); + fout.write((const char *) meta.data(), meta_size); + + fout.close(); + gguf_free(gctx); + + LOG_INF("%s: adapter saved to %s\n", __func__, real_path.c_str()); +} + +// --------------------------------------------------------------------------- +// Periodic checkpoint callback +// --------------------------------------------------------------------------- + +struct save_ctx { + const lora_tensors * lt; + const std::string * lora_out; + const 
std::string * arch; + const std::string * base_model_path; + float lora_alpha; + int32_t save_every; // 0 = disabled + int32_t ubatch_per_ctx; + int64_t last_saved; // last window index at which we saved +}; + +// TLS pointer set before each epoch so the static callback can access it. +static thread_local save_ctx * g_save_ctx = nullptr; + +static void save_every_callback( + bool train, + ggml_opt_context_t opt_ctx, + ggml_opt_dataset_t dataset, + ggml_opt_result_t result, + int64_t ibatch, + int64_t ibatch_max, + int64_t t_start_us) { + ggml_opt_epoch_callback_progress_bar(train, opt_ctx, dataset, result, ibatch, ibatch_max, t_start_us); + + // Log loss at every window boundary so we can see if/when it diverges. + if (train && g_save_ctx) { + const int64_t window = ibatch / g_save_ctx->ubatch_per_ctx; + const int64_t ubatch_in_window = ibatch % g_save_ctx->ubatch_per_ctx; + if (ubatch_in_window == g_save_ctx->ubatch_per_ctx - 1) { + double loss = 0.0, loss_unc = 0.0; + ggml_opt_result_loss(result, &loss, &loss_unc); + fprintf(stderr, "\n[window %4ld] loss=%.4f ± %.4f\n", (long)window, loss, loss_unc); + } + } + + if (!train || !g_save_ctx || g_save_ctx->save_every <= 0) return; + const int64_t window = ibatch / g_save_ctx->ubatch_per_ctx; + if (window > 0 && window != g_save_ctx->last_saved && window % g_save_ctx->save_every == 0) { + g_save_ctx->last_saved = window; + const std::string ckpt = *g_save_ctx->lora_out + ".ckpt" + std::to_string(window) + ".gguf"; + save_adapter(*g_save_ctx->lt, ckpt, *g_save_ctx->arch, g_save_ctx->lora_alpha, *g_save_ctx->base_model_path); + fprintf(stderr, "\n"); + LOG_INF("save_every_callback: checkpoint saved -> %s (window %ld)\n", ckpt.c_str(), (long)window); + } +} + +// --------------------------------------------------------------------------- +// IPC helpers (stdout protocol, stdin commands) +// --------------------------------------------------------------------------- + +// Escape newlines and backslashes for single-line 
IPC transmission. +// Mirrors _escape() in gguf_trainer.py. +static std::string ipc_escape(const std::string & s) { + std::string out; + out.reserve(s.size()); + for (char c : s) { + if (c == '\\') out += "\\\\"; + else if (c == '\n') out += "\\n"; + else if (c == '\r') out += "\\r"; + else out += c; + } + return out; +} + +static void ipc_emit(const char * msg) { + fputs(msg, stdout); + fputc('\n', stdout); + fflush(stdout); +} + +// Read one line from stdin, trimming the trailing newline. +// Returns false on EOF or error. +static bool ipc_read_line(std::string & out) { + out.clear(); + if (!std::getline(std::cin, out)) return false; + // Strip trailing \r if present (Windows line endings) + if (!out.empty() && out.back() == '\r') out.pop_back(); + return true; +} + +// Parse "REWARD r1 r2 ... rN" into a float vector. +static std::vector ipc_parse_rewards(const std::string & line) { + std::vector rewards; + if (line.size() < 8 || line.substr(0, 7) != "REWARD ") return rewards; + std::istringstream ss(line.substr(7)); + float r; + while (ss >> r) rewards.push_back(r); + return rewards; +} + +// --------------------------------------------------------------------------- +// Greedy / temperature sampling for GRPO rollout generation +// --------------------------------------------------------------------------- + +static std::string generate_response( + llama_context * ctx, + llama_model * model, + const std::string & prompt, + int32_t max_tokens, + float temperature, + std::mt19937 & rng) { + + const llama_vocab * vocab = llama_model_get_vocab(model); + auto tokens = common_tokenize(ctx, prompt, /*add_special=*/true); + if (tokens.empty()) return ""; + + // Clear KV cache before each generation (don't carry over previous prompt state) + llama_memory_clear(llama_get_memory(ctx), true); + { + llama_batch batch = llama_batch_get_one(tokens.data(), (int32_t)tokens.size()); + if (llama_decode(ctx, batch) != 0) { + LOG_ERR("%s: llama_decode failed on prompt\n", __func__); 
+ return ""; + } + } + + std::string output; + const llama_token eos = llama_vocab_eos(vocab); + const llama_token nl = llama_vocab_nl(vocab); + + // For ChatML models <|im_end|> is the turn-end marker but may not be the + // vocab EOS token. Look it up by tokenizing the string and taking the + // first token if it tokenizes to exactly one piece. + llama_token im_end = -1; + { + std::vector im_end_tokens(8); + static const char im_end_str[] = "<|im_end|>"; + int n = llama_tokenize(vocab, im_end_str, (int32_t)strlen(im_end_str), im_end_tokens.data(), (int32_t)im_end_tokens.size(), /*add_special=*/false, /*parse_special=*/true); + if (n == 1) im_end = im_end_tokens[0]; + } + const llama_token eot = llama_vocab_eot(vocab); // may equal eos on some models + + for (int32_t i = 0; i < max_tokens; ++i) { + // Sample next token — use ith=-1 to always get the LAST output position's + // logits. llama_get_logits(ctx) returns position 0 which is wrong when the + // prompt batch has multiple output tokens (training context). 
+ float * logits = llama_get_logits_ith(ctx, -1); + if (!logits) { + LOG_ERR("%s: llama_get_logits_ith(-1) returned NULL\n", __func__); + break; + } + const int32_t n_vocab = llama_vocab_n_tokens(vocab); + + llama_token next_token; + if (temperature <= 0.0f) { + // Greedy + next_token = (llama_token)(std::max_element(logits, logits + n_vocab) - logits); + } else { + // Temperature sampling via softmax + categorical draw + std::vector probs(n_vocab); + float max_logit = *std::max_element(logits, logits + n_vocab); + float sum = 0.0f; + for (int32_t k = 0; k < n_vocab; ++k) { + probs[k] = std::exp((logits[k] - max_logit) / temperature); + sum += probs[k]; + } + for (float & p : probs) p /= sum; + std::discrete_distribution dist(probs.begin(), probs.end()); + next_token = dist(rng); + } + + if (next_token == eos) break; + if (next_token == eot) break; + if (im_end >= 0 && next_token == im_end && !output.empty()) break; + + // Decode token to text + char buf[256] = {}; + llama_token_to_piece(vocab, next_token, buf, sizeof(buf) - 1, 0, true); + output += buf; + + // Feed token back for next step + llama_batch batch = llama_batch_get_one(&next_token, 1); + if (llama_decode(ctx, batch) != 0) break; + } + + return output; +} + +// --------------------------------------------------------------------------- +// GRPO IPC training loop +// --------------------------------------------------------------------------- + +// Volatile flag set by SIGINT so the loop can exit cleanly. 
+static volatile sig_atomic_t g_grpo_stop = 0; +static void grpo_sigint_handler(int) { g_grpo_stop = 1; } + +static int run_grpo_mode( + common_params & params, + llama_model * model, + llama_context * ctx, + lora_tensors & lt, + const std::string & arch, + float lora_alpha, + const std::string & base_model_path) { + + const int32_t n_ctx = llama_n_ctx(ctx); + const int32_t n_gen = params.grpo_n_gen; + const int32_t n_steps = params.grpo_n_steps; + const float temp = params.grpo_temperature; + const int32_t max_tok = params.grpo_max_tokens; + + std::mt19937 rng(params.sampling.seed != LLAMA_DEFAULT_SEED + ? params.sampling.seed : 42); + + // Initialize optimizer + struct llama_opt_params lopt_params { + /*.n_ctx_train =*/0, + /*.param_filter =*/lora_param_filter, + /*.param_filter_ud =*/nullptr, + /*.get_opt_pars =*/common_opt_lr_pars, + /*.get_opt_pars_ud =*/¶ms.lr, + /*.optimizer_type =*/params.optimizer, + /*.grad_checkpoint_interval =*/params.grad_checkpoint_interval, + }; + llama_opt_init(ctx, model, lopt_params); + + const llama_token bos = llama_vocab_bos(llama_model_get_vocab(model)); + + signal(SIGINT, grpo_sigint_handler); + + // Signal Python that we are ready + ipc_emit("[QLORA:READY]"); + + float last_loss = 0.0f; + int step = 0; + + while (step < n_steps && !g_grpo_stop) { + + // ── Request prompt ──────────────────────────────────────────────── + { + char buf[64]; + snprintf(buf, sizeof(buf), "[QLORA:PROMPT_REQ:%d]", step + 1); + ipc_emit(buf); + } + + std::string prompt_line; + if (!ipc_read_line(prompt_line)) break; + if (prompt_line == "STOP") { + LOG_INF("grpo: received STOP from Python\n"); + break; + } + if (prompt_line.size() < 8 || prompt_line.substr(0, 7) != "PROMPT ") { + char buf[128]; + snprintf(buf, sizeof(buf), "[QLORA:ERROR] expected PROMPT, got: %.80s", prompt_line.c_str()); + ipc_emit(buf); + return 1; + } + // Unescape the prompt (\\n → \n etc.) 
+ std::string prompt; + { + const std::string esc = prompt_line.substr(7); + prompt.reserve(esc.size()); + for (size_t i = 0; i < esc.size(); ++i) { + if (esc[i] == '\\' && i + 1 < esc.size()) { + char next = esc[i+1]; + if (next == 'n') { prompt += '\n'; ++i; } + else if (next == 'r') { prompt += '\r'; ++i; } + else if (next == '\\') { prompt += '\\'; ++i; } + else { prompt += esc[i]; } + } else { + prompt += esc[i]; + } + } + } + + // ── Generate N responses ────────────────────────────────────────── + std::vector generations(n_gen); + for (int k = 0; k < n_gen; ++k) { + generations[k] = generate_response(ctx, model, prompt, max_tok, temp, rng); + + char hdr[64]; + snprintf(hdr, sizeof(hdr), "[QLORA:GEN:%d/%d] ", k + 1, n_gen); + std::string msg = std::string(hdr) + ipc_escape(generations[k]); + ipc_emit(msg.c_str()); + } + + // ── Request rewards ─────────────────────────────────────────────── + { + char buf[64]; + snprintf(buf, sizeof(buf), "[QLORA:REWARD_REQ:%d]", n_gen); + ipc_emit(buf); + } + + std::string reward_line; + if (!ipc_read_line(reward_line)) break; + if (reward_line == "STOP") { + LOG_INF("grpo: received STOP from Python\n"); + break; + } + std::vector rewards = ipc_parse_rewards(reward_line); + if ((int32_t)rewards.size() != n_gen) { + char buf[128]; + snprintf(buf, sizeof(buf), "[QLORA:ERROR] expected %d rewards, got %zu", n_gen, rewards.size()); + ipc_emit(buf); + return 1; + } + + // ── Build single-step mini-dataset: prompt+generations with rewards ─ + // Each generation is a separate sample; prompt = no-loss, generation = loss. 
+ std::vector step_samples; + step_samples.reserve(n_gen); + for (int k = 0; k < n_gen; ++k) { + training_sample s; + s.reward = rewards[k]; + + auto tok_prompt = common_tokenize(ctx, prompt, /*add_special=*/true); + auto tok_gen = common_tokenize(ctx, generations[k], /*add_special=*/false); + + s.tokens.insert(s.tokens.end(), tok_prompt.begin(), tok_prompt.end()); + s.tokens.insert(s.tokens.end(), tok_gen.begin(), tok_gen.end()); + s.is_label.resize(s.tokens.size(), false); + for (size_t i = tok_prompt.size(); i < s.tokens.size(); ++i) { + s.is_label[i] = true; + } + step_samples.push_back(std::move(s)); + } + + // Ensure minimum token count for one context window. + // build_dataset drops the last token per sample during flattening, + // so we need total raw tokens > n_ctx to guarantee ndata >= 1. + while (true) { + size_t total = 0; + for (const auto & s : step_samples) total += s.tokens.size(); + if ((int64_t)total > n_ctx + (int64_t)step_samples.size()) break; + step_samples.push_back(step_samples.back()); + } + + std::vector window_rewards; + ggml_opt_dataset_t step_dataset = build_dataset( + step_samples, n_ctx, window_rewards, /*train_on_prompt=*/false, bos); + if (!step_dataset) { + ipc_emit("[QLORA:ERROR] build_dataset failed for step"); + return 1; + } + + // Apply reward weights for this step + const bool has_rewards = std::any_of(window_rewards.begin(), window_rewards.end(), + [](float r){ return std::abs(r - 1.0f) > 1e-4f; }); + if (has_rewards) { + llama_opt_set_reward_weights(window_rewards.data(), (int64_t)window_rewards.size()); + } + + // ── One optimizer step (full dataset = one mini-epoch) ──────────── + const int64_t idata_all = ggml_opt_dataset_ndata(step_dataset); + ggml_opt_result_t step_result = ggml_opt_result_init(); + + llama_opt_epoch(ctx, step_dataset, step_result, nullptr, idata_all, + nullptr, // no progress bar callback — clean stdout + nullptr, + false); // no shuffle for single-step + + double loss = 0.0, loss_unc = 0.0; + 
ggml_opt_result_loss(step_result, &loss, &loss_unc); + last_loss = (float)loss; + + ggml_opt_result_free(step_result); + ggml_opt_dataset_free(step_dataset); + llama_opt_set_reward_weights(nullptr, 0); + + ++step; + + // ── Emit progress ───────────────────────────────────────────────── + { + char buf[128]; + snprintf(buf, sizeof(buf), + "[QLORA:PROGRESS] step=%d/%d loss=%.4f epoch=1/1", + step, n_steps, last_loss); + ipc_emit(buf); + } + + // ── Optional checkpoint ─────────────────────────────────────────── + if (params.save_every > 0 && step % params.save_every == 0) { + std::string ckpt = params.lora_out + ".ckpt" + std::to_string(step) + ".gguf"; + save_adapter(lt, ckpt, arch, lora_alpha, base_model_path); + char buf[512]; + snprintf(buf, sizeof(buf), "[QLORA:CHECKPOINT] %s", ckpt.c_str()); + ipc_emit(buf); + } + } + + // Save final adapter + save_adapter(lt, params.lora_out, arch, lora_alpha, base_model_path); + + { + char buf[64]; + snprintf(buf, sizeof(buf), "[QLORA:DONE] final_loss=%.4f", last_loss); + ipc_emit(buf); + } + + return 0; +} + +// --------------------------------------------------------------------------- +int main(int argc, char ** argv) { + std::setlocale(LC_NUMERIC, "C"); + + common_params params; + params.escape = false; + + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_FINETUNE_QLORA)) { + return 1; + } + + if (!params.grpo_mode && params.train_file.empty()) { + LOG_ERR("%s: --train-file is required (or use --grpo-mode for IPC training)\n", __func__); + return 1; + } + + // Force settings required for training + params.use_mmap = false; + params.cache_type_k = GGML_TYPE_F32; + params.cache_type_v = GGML_TYPE_F32; + // Warmup runs inference with PARAM-flagged tensors which causes a segfault; + // training never benefits from warmup, so disable it unconditionally. + params.warmup = false; + // Flash attention has no backward implementation; force standard attention for training. 
+ params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_DISABLED; + + const float lora_alpha = (params.lora_alpha > 0.0f) + ? params.lora_alpha : (float) params.lora_rank; + const auto targets = split_csv(params.lora_targets); + + // --- Step 1: Discover tensor shapes from model GGUF (no model load yet) --- + std::string arch; + { + struct ggml_context * ctx_meta = nullptr; + struct gguf_init_params gp = { true, &ctx_meta }; + struct gguf_context * ctx_gguf = gguf_init_from_file(params.model.path.c_str(), gp); + if (!ctx_gguf) { LOG_ERR("failed to open model GGUF\n"); return 1; } + int kid = gguf_find_key(ctx_gguf, "general.architecture"); + if (kid >= 0) arch = gguf_get_val_str(ctx_gguf, kid); + gguf_free(ctx_gguf); + ggml_free(ctx_meta); + } + + // --- Step 2: Allocate LoRA tensors and save initial adapter GGUF --- + // If the user already supplied a --lora adapter we reuse it (resume training). + // Otherwise we allocate fresh tensors (B=0, A=random), write them to a temp + // .init.gguf so common_init_from_params can load them before context creation + // (this makes sched_reserve size the graph to include LoRA nodes). 
+ const bool resume_from_lora = !params.lora_adapters.empty(); + + std::mt19937 rng(42); + lora_tensors lt; // will be populated after context load (Step 4) + std::string init_adapter_path; + + if (!resume_from_lora) { + lt = alloc_lora_tensors(params.model.path, targets, params.lora_rank, rng, params.lora_freeze_layers); + if (lt.ab.empty()) return 1; + + init_adapter_path = params.lora_out + ".init.gguf"; + save_adapter(lt, init_adapter_path, arch, lora_alpha, params.model.path); + + // Register adapter so common_init_from_params loads it before context creation + common_adapter_lora_info adapter_info; + adapter_info.path = init_adapter_path; + adapter_info.scale = 1.0f; + params.lora_adapters.push_back(adapter_info); + } else { + LOG_INF("%s: resuming training from existing LoRA adapter: %s\n", + __func__, params.lora_adapters.back().path.c_str()); + } + + // --- Step 3: Load model + context (graph sized with LoRA nodes) --- + common_init(); + llama_backend_init(); + llama_numa_init(params.numa); + + auto llama_init = common_init_from_params(params); + auto * model = llama_init->model(); + auto * ctx = llama_init->context(); + + if (!model) { LOG_ERR("failed to load model\n"); return 1; } + + LOG_INF("%s\n", common_params_get_system_info(params).c_str()); + + // Arch fallback if not in GGUF metadata + if (arch.empty()) { + char buf[256] = {}; + llama_model_desc(model, buf, sizeof(buf)); + arch = std::string(buf); + arch = arch.substr(0, arch.find_first_of(" /")); + } + + // --- Step 4: Mark the loaded adapter tensors as trainable --- + // common_init_from_params loaded the adapter; params.lora_adapters[back].ptr + // points to the live llama_adapter_lora with its own tensor copies in device + // memory. Mark those tensors trainable so the optimizer graph includes them. 
+ { + llama_adapter_lora * loaded = params.lora_adapters.back().ptr; + if (!loaded) { + LOG_ERR("%s: adapter was not loaded by common_init_from_params\n", __func__); + return 1; + } + for (auto & kv : loaded->ab_map) { + ggml_set_param(kv.second.a); // lora_a → trainable + ggml_set_param(kv.second.b); // lora_b → trainable + } + // Point lt.ab at the live device tensors so save_adapter writes + // the trained weights (not the original init tensors). + lt.ab.clear(); + for (auto & kv : loaded->ab_map) { + lt.ab[kv.first] = {kv.second.a, kv.second.b}; + } + } + + // Remove temp init file when we created it (resume path has no init file) + if (!resume_from_lora && !init_adapter_path.empty()) { + std::remove(expand_tilde(init_adapter_path).c_str()); + } + + // --- Step 5: Load dataset --- + // In GRPO mode the dataset comes from Python via stdin/stdout — skip file loading. + auto tmpls = common_chat_templates_init(model, ""); + if (params.grpo_mode) { + int rc = run_grpo_mode(params, model, ctx, lt, arch, lora_alpha, params.model.path); + if (lt.buf) ggml_backend_buffer_free(lt.buf); + if (lt.ctx) ggml_free(lt.ctx); + llama_backend_free(); + return rc; + } + auto samples = load_jsonl(params.train_file, ctx, tmpls.get()); + if (samples.empty()) { + LOG_ERR("%s: no training samples loaded\n", __func__); + return 1; + } + + const int32_t n_ctx = llama_n_ctx(ctx); + std::vector window_rewards; + const llama_token bos = llama_vocab_bos(llama_model_get_vocab(model)); + auto dataset = build_dataset(samples, n_ctx, window_rewards, params.train_on_prompt, bos); + if (!dataset) return 1; + + // Check if any reward deviates from 1.0 — if so, enable reward-weighted SFT + const bool has_rewards = std::any_of(window_rewards.begin(), window_rewards.end(), + [](float r){ return std::abs(r - 1.0f) > 1e-4f; }); + if (has_rewards) { + LOG_INF("%s: reward-weighted SFT enabled (found non-uniform rewards in dataset)\n", __func__); + llama_opt_set_reward_weights(window_rewards.data(), 
(int64_t)window_rewards.size()); + } + + // Initialize optimizer — our custom param filter restricts training to lora_a/b + struct llama_opt_params lopt_params { + /*.n_ctx_train =*/0, + /*.param_filter =*/lora_param_filter, + /*.param_filter_ud =*/nullptr, + /*.get_opt_pars =*/common_opt_lr_pars, + /*.get_opt_pars_ud =*/&params.lr, + /*.optimizer_type =*/params.optimizer, + /*.grad_checkpoint_interval =*/params.grad_checkpoint_interval, + }; + llama_opt_init(ctx, model, lopt_params); + + const int64_t idata_split = ggml_opt_dataset_ndata(dataset) * (1.0f - params.val_split); + + ggml_opt_result_t result_train = ggml_opt_result_init(); + ggml_opt_result_t result_eval = ggml_opt_result_init(); + + const int32_t n_ubatch = llama_n_ubatch(ctx); + const int32_t ubatch_per_ctx = (n_ubatch > 0) ? (n_ctx / n_ubatch) : 1; + + save_ctx sctx { &lt, &params.lora_out, &arch, &params.model.path, lora_alpha, params.save_every, ubatch_per_ctx, 0 }; + g_save_ctx = &sctx; + + const int64_t total_windows = ggml_opt_dataset_ndata(dataset); + LOG_INF("%s: starting QLoRA training — rank=%d alpha=%.1f epochs=%d loss=%s\n", + __func__, params.lora_rank, lora_alpha, params.lr.epochs, + params.train_on_prompt ? "prompt+response" : "response-only"); + LOG_INF("%s: dataset: %ld windows × %d ubatches = %ld steps per epoch (n_ctx=%d n_ubatch=%d stride=%d)\n", + __func__, (long)total_windows, ubatch_per_ctx, (long)(idata_split * ubatch_per_ctx), + n_ctx, n_ubatch, n_ctx / 2); + if (params.save_every > 0) { + LOG_INF("%s: will save checkpoint every %d windows → %s.ckptN.gguf\n", + __func__, params.save_every, params.lora_out.c_str()); + } + + ggml_opt_epoch_callback cb_train = (params.save_every > 0) + ?
save_every_callback + : ggml_opt_epoch_callback_progress_bar; + + for (params.lr.epoch = 0; params.lr.epoch < params.lr.epochs; ++params.lr.epoch) { + sctx.last_saved = 0; // reset per-epoch window counter + llama_opt_epoch(ctx, dataset, result_train, result_eval, idata_split, + cb_train, + ggml_opt_epoch_callback_progress_bar, + params.shuffle_dataset); + fprintf(stderr, "\n"); + + // Per-epoch loss summary + { + double train_loss = 0.0, train_unc = 0.0; + ggml_opt_result_loss(result_train, &train_loss, &train_unc); + if (idata_split < ggml_opt_dataset_ndata(dataset)) { + double val_loss = 0.0, val_unc = 0.0; + ggml_opt_result_loss(result_eval, &val_loss, &val_unc); + LOG_INF("epoch %d/%d: train_loss=%.4f ± %.4f val_loss=%.4f ± %.4f\n", + params.lr.epoch + 1, params.lr.epochs, train_loss, train_unc, val_loss, val_unc); + } else { + LOG_INF("epoch %d/%d: train_loss=%.4f ± %.4f\n", + params.lr.epoch + 1, params.lr.epochs, train_loss, train_unc); + } + } + + ggml_opt_result_reset(result_train); + ggml_opt_result_reset(result_eval); + } + + ggml_opt_result_free(result_train); + ggml_opt_result_free(result_eval); + llama_opt_set_reward_weights(nullptr, 0); + + // Save final trained adapter + save_adapter(lt, params.lora_out, arch, lora_alpha, params.model.path); + + // Free scratch buffers only when we allocated them (not in resume path) + if (lt.buf) ggml_backend_buffer_free(lt.buf); + if (lt.ctx) ggml_free(lt.ctx); + ggml_opt_dataset_free(dataset); + llama_backend_free(); + + return 0; +} diff --git a/examples/qlora_training/grpo_example.py b/examples/qlora_training/grpo_example.py new file mode 100644 index 0000000000..c56ff9395f --- /dev/null +++ b/examples/qlora_training/grpo_example.py @@ -0,0 +1,399 @@ +#!/usr/bin/env python3 +""" +grpo_example.py — Minimal GRPO training loop using llama-finetune-qlora --grpo-mode + +Demonstrates the IPC protocol between the Python driver and the C++ subprocess. +No external dependencies required — only Python stdlib. 
+ +Usage: + python3 grpo_example.py \ + --model /path/to/model-q4_k_m.gguf \ + --lora-out /path/to/output-adapter.gguf \ + [--lora /path/to/resume-adapter.gguf] \ + [--binary /path/to/llama-finetune-qlora] \ + [--n-steps 200] \ + [--n-gen 8] \ + [--rank 16] + +IPC Protocol (stdout from C++ process): + [QLORA:READY] — process initialised + [QLORA:PROMPT_REQ:<step>] — C++ requests a prompt for step N + [QLORA:GEN:<k>/<n>] — one generation (newlines escaped as \\n) + [QLORA:REWARD_REQ:<n>] — C++ requests N reward scores + [QLORA:PROGRESS] step=X/Y loss=Z epoch=A/B + [QLORA:CHECKPOINT] <path> + [QLORA:DONE] final_loss=X + [QLORA:ERROR] <message> + +Python → C++ stdin: + PROMPT <text> + REWARD <r1> <r2> ... (advantages, 0..1 range) + STOP (request graceful shutdown) +""" + +import argparse +import logging +import math +import re +import subprocess +import sys +import time +from pathlib import Path +from typing import List, Optional, Tuple + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s [%(levelname)s] %(message)s", +) +log = logging.getLogger("grpo_example") + +# ────────────────────────────────────────────────────────────────────────────── +# IPC helpers +# ────────────────────────────────────────────────────────────────────────────── + +_IPC_RE = re.compile(r"^\[QLORA:([A-Z_]+)(?::([^\]]*))?\](.*)$") + + +def escape(text: str) -> str: + """Escape newlines and backslashes for single-line IPC transport.""" + return text.replace("\\", "\\\\").replace("\n", "\\n").replace("\r", "\\r") + + +def unescape(text: str) -> str: + """Reverse of escape().""" + out, i = [], 0 + while i < len(text): + if text[i] == "\\" and i + 1 < len(text): + c = text[i + 1] + if c == "n": + out.append("\n") + elif c == "r": + out.append("\r") + elif c == "\\": + out.append("\\") + else: + out.append(c) + i += 2 + else: + out.append(text[i]) + i += 1 + return "".join(out) + + +def parse_ipc(line: str) -> Optional[Tuple[str, str, str]]: + """ + Parse an IPC line into (msg_type, seq, payload).
+ Returns None for non-IPC lines (model output, log lines, etc.). + """ + m = _IPC_RE.match(line.strip()) + if not m: + return None + return m.group(1), (m.group(2) or ""), m.group(3).strip() + + +def read_ipc(proc: subprocess.Popen, timeout: float = 120.0) -> Optional[Tuple[str, str, str]]: + """ + Read lines from proc.stdout until an IPC message arrives. + Non-IPC lines (model output, C++ logs leaked to stdout) are printed. + Returns None on EOF. + Raises TimeoutError if nothing arrives within `timeout` seconds. + """ + assert proc.stdout is not None + deadline = time.monotonic() + timeout + while True: + remaining = deadline - time.monotonic() + if remaining <= 0: + raise TimeoutError(f"No IPC message within {timeout:.0f}s") + + line = proc.stdout.readline() + if not line: + return None # EOF + + line = line.rstrip("\n") + parsed = parse_ipc(line) + if parsed: + return parsed + # Non-IPC — C++ sometimes leaks timing/debug lines to stdout. + # Print them so the user can see what's happening. 
+ print(f" [cpp] {line}", file=sys.stderr) + + +def write_cmd(proc: subprocess.Popen, cmd: str): + """Write one command line to the subprocess stdin.""" + assert proc.stdin is not None + try: + proc.stdin.write(cmd + "\n") + proc.stdin.flush() + except BrokenPipeError: + raise RuntimeError("C++ subprocess stdin closed — did it crash?") + + +def wait_for(proc: subprocess.Popen, expected: str, timeout: float = 120.0) -> Tuple[str, str, str]: + """Block until the expected IPC message type arrives.""" + deadline = time.monotonic() + timeout + while True: + remaining = deadline - time.monotonic() + if remaining <= 0: + raise TimeoutError(f"Timed out waiting for [{expected}]") + parsed = read_ipc(proc, timeout=remaining) + if parsed is None: + raise RuntimeError(f"Subprocess exited before sending [{expected}]") + msg_type, seq, payload = parsed + if msg_type == expected: + return msg_type, seq, payload + log.debug("Ignoring unexpected IPC (%s) while waiting for %s", msg_type, expected) + + +# ────────────────────────────────────────────────────────────────────────────── +# Advantage normalisation (GRPO) +# ────────────────────────────────────────────────────────────────────────────── + +def normalise_rewards(rewards: List[float]) -> List[float]: + """ + Group-relative advantage normalisation: subtract mean, divide by std. + Clipped to [0, 1] so the C++ side always receives values in that range. + + All-equal rewards → uniform 0.5 (no signal, but no NaN either). 
+ """ + if len(rewards) == 0: + return [] + mean = sum(rewards) / len(rewards) + variance = sum((r - mean) ** 2 for r in rewards) / len(rewards) + std = math.sqrt(variance) if variance > 1e-8 else 1.0 + + normalised = [(r - mean) / std for r in rewards] + # Shift to [0,1]: z-scores typically lie in [-3, +3] + clipped = [max(0.0, min(1.0, 0.5 + z / 6.0)) for z in normalised] + return clipped + + +# ────────────────────────────────────────────────────────────────────────────── +# Example prompt / reward providers +# ────────────────────────────────────────────────────────────────────────────── + +# Replace these with your own logic. + +_EXAMPLE_PROMPTS = [ + "Explain the concept of gradient descent in one sentence.", + "What is the capital of France?", + "Write a haiku about machine learning.", + "Describe the difference between SFT and RLHF.", + "What does GRPO stand for?", +] + + +def get_prompt(step: int) -> str: + """Return a prompt for the given training step (0-indexed).""" + return _EXAMPLE_PROMPTS[step % len(_EXAMPLE_PROMPTS)] + + +def score_generations(prompt: str, generations: List[str]) -> List[float]: + """ + Score a list of model generations for the given prompt. + Returns a list of raw reward scores (any numeric range; will be normalised). + + This example uses a trivial heuristic: longer, more varied responses + score higher. Replace with your actual reward model / verifier. 
+ """ + scores = [] + for gen in generations: + words = gen.split() + # Simple heuristics: length + lexical diversity + length_score = min(1.0, len(words) / 50.0) + vocab_score = min(1.0, len(set(words)) / max(1, len(words))) + scores.append(0.6 * length_score + 0.4 * vocab_score) + return scores + + +# ────────────────────────────────────────────────────────────────────────────── +# Main GRPO loop +# ────────────────────────────────────────────────────────────────────────────── + +def run_grpo(args: argparse.Namespace): + # Resolve binary + binary = Path(args.binary) + if not binary.exists(): + log.error("Binary not found: %s", binary) + sys.exit(1) + + # Build command + cmd = [ + str(binary), + "--model", args.model, + "--lora-out", args.lora_out, + "--lora-rank", str(args.rank), + "--lora-alpha", str(args.rank // 2), + "-c", str(args.ctx_size), + "-b", str(args.ctx_size), + "-ub", "512", + "-ngl", str(args.ngl), + "-lr", str(args.lr), + "--seed", str(args.seed), + "--grad-checkpoint","48", + "--shuffle-dataset", + "--grpo-mode", + "--n-gen", str(args.n_gen), + "--n-steps", str(args.n_steps), + "--grpo-temp", str(args.temperature), + "--grpo-max-tokens",str(args.max_tokens), + ] + + if args.lora: + cmd += ["--lora", args.lora] + + if args.save_every > 0: + cmd += ["--save-every", str(args.save_every)] + + log.info("Launching: %s", " ".join(cmd)) + + proc = subprocess.Popen( + cmd, + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=sys.stderr, # C++ debug/timing logs go directly to our stderr + text=True, + bufsize=1, + ) + + try: + _grpo_loop(proc, args) + except KeyboardInterrupt: + log.info("Interrupted — requesting graceful stop") + try: + write_cmd(proc, "STOP") + except Exception: + pass + except Exception as e: + log.error("GRPO loop error: %s", e) + proc.kill() + raise + finally: + try: + if proc.stdin is not None: + proc.stdin.close() + except Exception: + pass + rc = proc.wait(timeout=30) + if rc not in (0, None): + log.warning("Subprocess 
exited with code %d", rc) + + +def _grpo_loop(proc: subprocess.Popen, args: argparse.Namespace): + # ── Wait for READY ────────────────────────────────────────────────────── + log.info("Waiting for subprocess to initialise (model load can take a minute)…") + wait_for(proc, "READY", timeout=300) + log.info("Subprocess ready.") + + current_prompt: str = "" + generations: List[str] = [] + step = 0 + + while True: + parsed = read_ipc(proc, timeout=600) + if parsed is None: + log.info("Subprocess exited (EOF).") + break + + msg_type, seq, payload = parsed + + # ── PROMPT_REQ ────────────────────────────────────────────────────── + if msg_type == "PROMPT_REQ": + step = int(seq) if seq else step + 1 + current_prompt = get_prompt(step - 1) + generations = [] + log.debug("Step %d — sending prompt: %s", step, current_prompt[:60]) + write_cmd(proc, f"PROMPT {escape(current_prompt)}") + + # ── GEN ───────────────────────────────────────────────────────────── + elif msg_type == "GEN": + # seq = "k/n" + parts = seq.split("/") + k = int(parts[0]) + n = int(parts[1]) if len(parts) > 1 else args.n_gen + text = unescape(payload) + generations.append(text) + log.debug(" Generation %d/%d: %s…", k, n, text[:60].replace("\n", "↵")) + + # ── REWARD_REQ ────────────────────────────────────────────────────── + elif msg_type == "REWARD_REQ": + n_expected = int(seq) if seq else len(generations) + if len(generations) != n_expected: + log.warning( + "REWARD_REQ asked for %d rewards but collected %d generations", + n_expected, len(generations), + ) + + raw_rewards = score_generations(current_prompt, generations) + advantages = normalise_rewards(raw_rewards) + + reward_str = " ".join(f"{a:.6f}" for a in advantages) + log.debug(" Rewards (raw): %s", [f"{r:.3f}" for r in raw_rewards]) + log.debug(" Advantages: %s", [f"{a:.3f}" for a in advantages]) + write_cmd(proc, f"REWARD {reward_str}") + + # ── PROGRESS ──────────────────────────────────────────────────────── + elif msg_type == "PROGRESS": + # 
Format: step=X/Y loss=Z epoch=A/B + sm = re.search(r"step=(\d+)(?:/(\d+))?", payload) + lm = re.search(r"loss=([\d.]+)", payload) + step_str = f"{sm.group(1)}/{sm.group(2)}" if sm and sm.group(2) else (sm.group(1) if sm else "?") + loss_str = lm.group(1) if lm else "?" + print(f" step {step_str} loss {loss_str}", flush=True) + + # ── CHECKPOINT ────────────────────────────────────────────────────── + elif msg_type == "CHECKPOINT": + log.info("Checkpoint saved: %s", payload.strip()) + + # ── DONE ──────────────────────────────────────────────────────────── + elif msg_type == "DONE": + m = re.search(r"final_loss=([\d.]+)", payload) + loss = m.group(1) if m else "?" + log.info("Training complete. final_loss=%s", loss) + break + + # ── ERROR ──────────────────────────────────────────────────────────── + elif msg_type == "ERROR": + log.error("C++ process error: %s", payload.strip()) + raise RuntimeError(f"Training failed: {payload.strip()}") + + else: + log.debug("Unknown IPC message: [%s] seq=%r payload=%r", msg_type, seq, payload) + + +# ────────────────────────────────────────────────────────────────────────────── +# CLI +# ────────────────────────────────────────────────────────────────────────────── + +def parse_args() -> argparse.Namespace: + # Default binary: build/bin/ relative to this script's repo root + script_dir = Path(__file__).resolve().parent + repo_root = script_dir.parents[1] # examples/qlora_training → llama.cpp root + default_bin = repo_root / "build" / "bin" / "llama-finetune-qlora" + + p = argparse.ArgumentParser( + description="Minimal GRPO training loop via llama-finetune-qlora --grpo-mode", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + p.add_argument("--model", required=True, help="Base GGUF model path") + p.add_argument("--lora-out", required=True, help="Output adapter GGUF path") + p.add_argument("--lora", default=None, help="Resume from existing adapter GGUF") + p.add_argument("--binary", default=str(default_bin), help="Path 
to llama-finetune-qlora binary") + p.add_argument("--rank", type=int, default=16, help="LoRA rank") + p.add_argument("--n-steps", type=int, default=200, help="Number of GRPO steps") + p.add_argument("--n-gen", type=int, default=8, help="Generations per prompt") + p.add_argument("--lr", type=float, default=1e-4, help="Learning rate") + p.add_argument("--ctx-size", type=int, default=4096, help="Context window") + p.add_argument("--ngl", type=int, default=999, help="GPU layers (-ngl)") + p.add_argument("--temperature", type=float, default=0.8, help="Sampling temperature") + p.add_argument("--max-tokens", type=int, default=512, help="Max tokens per generation") + p.add_argument("--save-every", type=int, default=0, help="Save checkpoint every N steps (0=off)") + p.add_argument("--seed", type=int, default=42, help="RNG seed") + p.add_argument("--verbose", action="store_true", help="Enable DEBUG logging") + return p.parse_args() + + +if __name__ == "__main__": + args = parse_args() + if args.verbose: + logging.getLogger().setLevel(logging.DEBUG) + run_grpo(args) diff --git a/examples/qlora_training/sample_data.jsonl b/examples/qlora_training/sample_data.jsonl new file mode 100644 index 0000000000..982168cfc1 --- /dev/null +++ b/examples/qlora_training/sample_data.jsonl @@ -0,0 +1,7 @@ +{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "What is the capital of France?"}, {"role": "assistant", "content": "The capital of France is Paris."}]} +{"messages": [{"role": "user", "content": "Explain gradient descent in one sentence."}, {"role": "assistant", "content": "Gradient descent iteratively adjusts model parameters in the direction that most reduces the loss function."}]} +{"messages": [{"role": "system", "content": "You are a concise coding assistant."}, {"role": "user", "content": "Write a Python function that returns the factorial of n."}, {"role": "assistant", "content": "```python\ndef factorial(n):\n return 1 if n 
<= 1 else n * factorial(n - 1)\n```"}]} +{"prompt": "What is supervised fine-tuning?", "response": "Supervised fine-tuning (SFT) trains a pre-trained model on labelled examples of the desired behaviour, adjusting weights to minimise cross-entropy loss on the target responses."} +{"prompt": "What is LoRA?", "response": "LoRA (Low-Rank Adaptation) fine-tunes large models by injecting trainable low-rank matrices into frozen weight layers, dramatically reducing the number of trainable parameters."} +{"text": "The transformer architecture, introduced in 'Attention Is All You Need' (Vaswani et al., 2017), relies entirely on self-attention mechanisms and has become the foundation of modern large language models."} +{"text": "QLoRA combines quantization with LoRA: the base model is loaded in 4-bit precision (frozen), while only the small LoRA adapter weights are trained in full precision."} diff --git a/examples/qlora_training/sample_rwsft_data.jsonl b/examples/qlora_training/sample_rwsft_data.jsonl new file mode 100644 index 0000000000..3bf0234451 --- /dev/null +++ b/examples/qlora_training/sample_rwsft_data.jsonl @@ -0,0 +1,6 @@ +{"messages": [{"role": "user", "content": "Explain what a neural network is."}, {"role": "assistant", "content": "A neural network is a computational model loosely inspired by biological neurons, consisting of layers of interconnected nodes that learn to map inputs to outputs through training."}], "reward": 0.9} +{"messages": [{"role": "user", "content": "Explain what a neural network is."}, {"role": "assistant", "content": "It's a thing that does AI stuff with nodes and layers."}], "reward": 0.2} +{"messages": [{"role": "user", "content": "What is backpropagation?"}, {"role": "assistant", "content": "Backpropagation computes gradients of the loss with respect to each parameter by applying the chain rule backwards through the computation graph, enabling gradient descent to update all weights efficiently."}], "reward": 1.0} +{"messages": 
[{"role": "user", "content": "What is backpropagation?"}, {"role": "assistant", "content": "Backpropagation is how neural networks learn by going backwards."}], "reward": 0.3} +{"prompt": "Describe overfitting in machine learning.", "response": "Overfitting occurs when a model learns the training data too closely—capturing noise and idiosyncrasies—resulting in poor generalisation to unseen examples. Regularisation, dropout, and early stopping are common mitigations.", "reward": 0.95} +{"prompt": "Describe overfitting in machine learning.", "response": "Overfitting is when the model is too good at training data but bad at new data.", "reward": 0.4} diff --git a/examples/training/finetune.cpp b/examples/training/finetune.cpp index e20f89488f..88d3db741d 100644 --- a/examples/training/finetune.cpp +++ b/examples/training/finetune.cpp @@ -73,6 +73,7 @@ int main(int argc, char ** argv) { /*get_opt_pars =*/common_opt_lr_pars, /*get_opt_pars_ud =*/&params.lr, /*optimizer_type =*/params.optimizer, + /*grad_checkpoint_interval =*/params.grad_checkpoint_interval, }; llama_opt_init(ctx, model, lopt_params); @@ -83,7 +84,7 @@ int main(int argc, char ** argv) { for (lr.epoch = 0; lr.epoch < lr.epochs; ++lr.epoch) { llama_opt_epoch(ctx, dataset, result_train, result_eval, idata_split, - ggml_opt_epoch_callback_progress_bar, ggml_opt_epoch_callback_progress_bar); + ggml_opt_epoch_callback_progress_bar, ggml_opt_epoch_callback_progress_bar, /*shuffle=*/false); fprintf(stderr, "\n"); ggml_opt_result_reset(result_train); diff --git a/ggml/include/ggml-opt.h b/ggml/include/ggml-opt.h index 1c2ed79b77..cac543c02d 100644 --- a/ggml/include/ggml-opt.h +++ b/ggml/include/ggml-opt.h @@ -89,6 +89,7 @@ extern "C" { float beta2; // second AdamW momentum float eps; // epsilon for numerical stability float wd; // weight decay - 0.0f to disable + float gclip; // element-wise gradient clipping threshold - 0.0f to disable } adamw; struct { float alpha; // learning rate @@ -125,6 +126,13 @@ extern "C"
{ ggml_opt_get_optimizer_params get_opt_pars; // callback for calculating optimizer parameters void * get_opt_pars_ud; // userdata for calculating optimizer parameters + // Gradient checkpointing: keep the output of every Nth forward node alive through + // the backward pass so the allocator cannot reuse its memory for other tensors. + // This trades compute for VRAM — intermediate activations between checkpoints are + // freed and recomputed during the backward pass by the existing graph structure. + // Set to 0 (default) to disable. A value of ~32–64 cuts activation VRAM by ~50%. + int32_t grad_checkpoint_interval; + // only GGML_OPT_OPTIMIZER_TYPE_ADAMW needs m, v momenta per parameter tensor enum ggml_opt_optimizer_type optimizer; }; diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 25f9601e9b..b060c52207 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -503,6 +503,7 @@ extern "C" { GGML_OP_MUL_MAT, GGML_OP_MUL_MAT_ID, GGML_OP_OUT_PROD, + GGML_OP_OUT_PROD_ID, // scattered outer-product for MUL_MAT_ID backward (MoE LoRA) GGML_OP_SCALE, GGML_OP_SET, @@ -1426,6 +1427,21 @@ extern "C" { struct ggml_tensor * a, struct ggml_tensor * b); + // Scattered outer-product for the MUL_MAT_ID backward pass (MoE LoRA gradient). 
+ // + // a: [cols, n_expert_used, n_tokens] F32 — activations + // b: [rows, n_expert_used, n_tokens] F32 — upstream gradient + // ids: [n_expert_used, n_tokens] I32 — expert dispatch indices + // result: [cols, rows, n_expert, 1] F32 + // + // result[:, :, e] += sum_{(i,t): ids[i,t]==e} a[:, i, t] ⊗ b[:, i, t] + GGML_API struct ggml_tensor * ggml_out_prod_id( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * ids, + int64_t n_expert); + // // operations on tensors without backpropagation // diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp index 314cc1088a..e78e3e9b5e 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp @@ -11046,7 +11046,7 @@ static void ggml_compute_forward_opt_step_adamw_f32( GGML_ASSERT(ggml_are_same_shape(src0, src0_grad)); GGML_ASSERT(ggml_are_same_shape(src0, src0_grad_m)); GGML_ASSERT(ggml_are_same_shape(src0, src0_grad_v)); - GGML_ASSERT(ggml_nelements(adamw_params) == 7); + GGML_ASSERT(ggml_nelements(adamw_params) == 8); const int ith = params->ith; const int nth = params->nth; @@ -11072,6 +11072,7 @@ static void ggml_compute_forward_opt_step_adamw_f32( const float wd = adamw_params_ptr[4]; const float beta1h = adamw_params_ptr[5]; const float beta2h = adamw_params_ptr[6]; + const float gclip = adamw_params_ptr[7]; // element-wise gradient clip (0 = disabled) const float keep = 1.f - alpha * wd; for (int ir = ir0; ir < ir1; ++ir) { const int64_t i03 = ir/(ne02*ne01); @@ -11086,8 +11087,10 @@ static void ggml_compute_forward_opt_step_adamw_f32( float * v = (float *) ((char *) src0_grad_v->data + offset); for (int i00 = 0; i00 < ne00; ++i00) { - m[i00] = m[i00]*beta1 + g[i00]*(1.0f - beta1); - v[i00] = v[i00]*beta2 + g[i00]*g[i00]*(1.0f - beta2); + const float gi = (gclip > 0.0f) ? 
fmaxf(-gclip, fminf(gclip, g[i00])) : g[i00]; + + m[i00] = m[i00]*beta1 + gi*(1.0f - beta1); + v[i00] = v[i00]*beta2 + gi*gi*(1.0f - beta2); const float mh = m[i00]*beta1h; const float vh = sqrtf(v[i00]*beta2h) + eps; diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index 5a0be4a472..8188e2acec 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -2680,6 +2680,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg case GGML_OP_OUT_PROD: ggml_cuda_out_prod(ctx, dst); break; + case GGML_OP_OUT_PROD_ID: + ggml_cuda_out_prod_id(ctx, dst); + break; case GGML_OP_SCALE: ggml_cuda_op_scale(ctx, dst); break; @@ -4803,7 +4806,15 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g } } break; case GGML_OP_OUT_PROD: - return op->type == GGML_TYPE_F32 && op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32; + return op->type == GGML_TYPE_F32 + && (op->src[0]->type == GGML_TYPE_F32 || ggml_is_quantized(op->src[0]->type)) + && op->src[1]->type == GGML_TYPE_F32; + case GGML_OP_OUT_PROD_ID: + return op->src[0] != nullptr && op->src[1] != nullptr && op->src[2] != nullptr + && op->type == GGML_TYPE_F32 + && op->src[0]->type == GGML_TYPE_F32 + && op->src[1]->type == GGML_TYPE_F32 + && op->src[2]->type == GGML_TYPE_I32; case GGML_OP_GET_ROWS: { switch (op->src[0]->type) { diff --git a/ggml/src/ggml-cuda/opt-step-adamw.cu b/ggml/src/ggml-cuda/opt-step-adamw.cu index 35154f2996..dd2c1edf3a 100644 --- a/ggml/src/ggml-cuda/opt-step-adamw.cu +++ b/ggml/src/ggml-cuda/opt-step-adamw.cu @@ -20,8 +20,9 @@ static __global__ void opt_step_adamw_f32( const float wd = pars[4]; const float beta1h = pars[5]; const float beta2h = pars[6]; + const float gclip = pars[7]; // element-wise gradient clip (0 = disabled) - const float gi = g[i]; + const float gi = (gclip > 0.0f) ? 
fmaxf(-gclip, fminf(gclip, g[i])) : g[i]; const float gmi = g_m[i]*beta1 + gi*(1.0f - beta1); const float gvi = g_v[i]*beta2 + gi*gi*(1.0f - beta2); @@ -62,7 +63,7 @@ void ggml_cuda_opt_step_adamw(ggml_backend_cuda_context & ctx, ggml_tensor * dst GGML_ASSERT(ggml_are_same_shape(src0, src0_grad)); GGML_ASSERT(ggml_are_same_shape(src0, src0_grad_m)); GGML_ASSERT(ggml_are_same_shape(src0, src0_grad_v)); - GGML_ASSERT(ggml_nelements(adamw_params) == 7); + GGML_ASSERT(ggml_nelements(adamw_params) == 8); float * src0_d = (float *) src0->data; const float * src0_grad_d = (const float *) src0_grad->data; diff --git a/ggml/src/ggml-cuda/out-prod.cu b/ggml/src/ggml-cuda/out-prod.cu index c9b2b699c6..392de34d1c 100644 --- a/ggml/src/ggml-cuda/out-prod.cu +++ b/ggml/src/ggml-cuda/out-prod.cu @@ -1,6 +1,9 @@ #include "out-prod.cuh" +#include "convert.cuh" #include <cstdint> +#include <cstring> +#include <vector> void ggml_cuda_out_prod(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { const ggml_tensor * src0 = dst->src[0]; @@ -8,7 +11,7 @@ void ggml_cuda_out_prod(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { GGML_TENSOR_BINARY_OP_LOCALS - GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(src0->type == GGML_TYPE_F32 || ggml_is_quantized(src0->type)); GGML_ASSERT(src1->type == GGML_TYPE_F32); GGML_ASSERT(dst->type == GGML_TYPE_F32); @@ -22,19 +25,37 @@ void ggml_cuda_out_prod(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { GGML_ASSERT(ne2 == src1->ne[2]); GGML_ASSERT(ne3 == src1->ne[3]); - const float * src0_d = (const float *) src0->data; - const float * src1_d = (const float *) src1->data; - float * dst_d = (float *) dst->data; - cudaStream_t stream = ctx.stream(); cublasHandle_t handle = ctx.cublas_handle(); + // If src0 is quantized, dequantize to a temp F32 buffer on GPU + ggml_cuda_pool_alloc<float> src0_f32_alloc; + const float * src0_d; + int64_t lda; + + if (src0->type != GGML_TYPE_F32) { + const int64_t n_elements = ggml_nelements(src0); + src0_f32_alloc.alloc(ctx.pool(),
n_elements); + + to_fp32_cuda_t to_fp32 = ggml_get_to_fp32_cuda(src0->type); + GGML_ASSERT(to_fp32 != nullptr); + to_fp32(src0->data, src0_f32_alloc.ptr, n_elements, stream); + + src0_d = src0_f32_alloc.ptr; + lda = ne00; // dequantized data is contiguous: stride = ne00 + } else { + src0_d = (const float *) src0->data; + lda = nb01 / sizeof(float); + } + + const float * src1_d = (const float *) src1->data; + float * dst_d = (float *) dst->data; + const float alpha = 1.0f; const float beta = 0.0f; CUBLAS_CHECK(cublasSetStream(handle, stream)); - const int64_t lda = nb01 / sizeof(float); const int64_t ldc = nb1 / sizeof(float); const bool src1_T = ggml_is_transposed(src1); @@ -42,9 +63,9 @@ void ggml_cuda_out_prod(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { const int64_t ldb = (src1_T ? nb10 : nb11) / sizeof(float); GGML_ASSERT( (src1_T ? nb11 : nb10) == sizeof(float)); - // data strides in dimensions 2/3 - const size_t s02 = nb02 / sizeof(float); - const size_t s03 = nb03 / sizeof(float); + // data strides in dimensions 2/3 (for dequantized src0, use element-based strides) + const size_t s02 = (src0->type != GGML_TYPE_F32) ? (ne00 * ne01) : (nb02 / sizeof(float)); + const size_t s03 = (src0->type != GGML_TYPE_F32) ? (ne00 * ne01 * ne02) : (nb03 / sizeof(float)); const size_t s12 = nb12 / sizeof(float); const size_t s13 = nb13 / sizeof(float); const size_t s2 = nb2 / sizeof(float); @@ -66,3 +87,115 @@ void ggml_cuda_out_prod(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { } } } + +// ggml_cuda_out_prod_id +// +// Scattered outer-product for the MUL_MAT_ID backward pass (gradient w.r.t. expert weights). +// +// src0 = a [cols, n_expert_used, n_tokens] F32 — token activations +// src1 = b [rows, n_expert_used, n_tokens] F32 — upstream gradient +// src2 = ids [n_expert_used, n_tokens] I32 — expert dispatch indices +// dst [cols, rows, n_expert, 1] F32 — gradient w.r.t. 
expert weight matrices + // + // dst[:, :, e] += sum_{(i,t): ids[i,t]==e} a[:, i, t] ⊗ b[:, i, t] + // + // Algorithm: + // For each expert e: gather the token columns where ids[i,t]==e into contiguous + // GPU buffers, then use cublasSgemm (beta=1) to accumulate the outer product. + // ids may be CPU-resident (common in backward graphs where they are leaf tensors). +void ggml_cuda_out_prod_id(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + const ggml_tensor * src0 = dst->src[0]; // a [cols, n_exp_used, n_tokens] + const ggml_tensor * src1 = dst->src[1]; // b [rows, n_exp_used, n_tokens] + const ggml_tensor * ids = dst->src[2]; // ids [n_exp_used, n_tokens] i32 + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT(ids->type == GGML_TYPE_I32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + + const int64_t cols = src0->ne[0]; + const int64_t n_exp_used = src0->ne[1]; + const int64_t n_tokens = src0->ne[2]; + const int64_t rows = src1->ne[0]; + const int64_t n_expert = dst->ne[2]; + + cudaStream_t stream = ctx.stream(); + cublasHandle_t handle = ctx.cublas_handle(); + CUBLAS_CHECK(cublasSetStream(handle, stream)); + + // Zero destination tensor before accumulating + CUDA_CHECK(cudaMemsetAsync(dst->data, 0, ggml_nbytes(dst), stream)); + + // Read ids to host — ids may be CPU-resident (backward graph leaf) or GPU-resident + const size_t ids_nbytes = ggml_nbytes(ids); + std::vector<char> ids_host(ids_nbytes); + if (ids->buffer && !ggml_backend_buffer_is_host(ids->buffer)) { + // GPU-resident: copy to host and synchronize so we can inspect the values + CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids->data, ids_nbytes, cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + } else { + memcpy(ids_host.data(), ids->data, ids_nbytes); + } + + // Build per-expert token list: expert_tokens[e] = list of flat indices (iexp*n_tokens+itok) + // whose dispatch id equals e.
+ std::vector> expert_tokens(n_expert); + for (int64_t itok = 0; itok < n_tokens; ++itok) { + for (int64_t iexp = 0; iexp < n_exp_used; ++iexp) { + const int32_t eid = *(const int32_t *)(ids_host.data() + + itok * ids->nb[1] + iexp * ids->nb[0]); + GGML_ASSERT(eid >= 0 && eid < (int32_t)n_expert); + expert_tokens[eid].push_back(iexp * n_tokens + itok); + } + } + + // Strides (in elements, not bytes) + const int64_t a_stride_exp = src0->nb[1] / sizeof(float); // cols + const int64_t a_stride_tok = src0->nb[2] / sizeof(float); // cols * n_exp_used + const int64_t b_stride_exp = src1->nb[1] / sizeof(float); // rows + const int64_t b_stride_tok = src1->nb[2] / sizeof(float); // rows * n_exp_used + const int64_t dst_stride_e = dst->nb[2] / sizeof(float); // cols * rows + + const float alpha_one = 1.0f; + const float beta_acc = 1.0f; // accumulate — dst is already zeroed above + + const float * a_base = (const float *) src0->data; + const float * b_base = (const float *) src1->data; + float * d_base = (float *) dst->data; + + for (int64_t e = 0; e < n_expert; ++e) { + const auto & toks = expert_tokens[e]; + if (toks.empty()) { + continue; + } + + const int64_t ntoks_e = (int64_t) toks.size(); + + // Allocate contiguous gather buffers on GPU: a_e [cols, ntoks_e], b_e [rows, ntoks_e] + ggml_cuda_pool_alloc a_gathered(ctx.pool(), cols * ntoks_e); + ggml_cuda_pool_alloc b_gathered(ctx.pool(), rows * ntoks_e); + + // Gather token vectors from GPU src0/src1 into contiguous buffers + for (int64_t ti = 0; ti < ntoks_e; ++ti) { + const int64_t flat = toks[ti]; + const int64_t iexp = flat / n_tokens; + const int64_t itok = flat % n_tokens; + CUDA_CHECK(cudaMemcpyAsync( + a_gathered.ptr + ti * cols, + a_base + iexp * a_stride_exp + itok * a_stride_tok, + cols * sizeof(float), cudaMemcpyDeviceToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync( + b_gathered.ptr + ti * rows, + b_base + iexp * b_stride_exp + itok * b_stride_tok, + rows * sizeof(float), cudaMemcpyDeviceToDevice, stream)); 
+ } + + // dst[:, :, e] += a_gathered @ b_gathered^T + // cuBLAS column-major: A=[cols, ntoks_e] lda=cols, B=[rows, ntoks_e] ldb=rows + CUBLAS_CHECK(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_T, + (int)cols, (int)rows, (int)ntoks_e, + &alpha_one, a_gathered.ptr, (int)cols, + b_gathered.ptr, (int)rows, + &beta_acc, d_base + e*dst_stride_e, (int)cols)); + } +} diff --git a/ggml/src/ggml-cuda/out-prod.cuh b/ggml/src/ggml-cuda/out-prod.cuh index a0046f5f8f..993307e797 100644 --- a/ggml/src/ggml-cuda/out-prod.cuh +++ b/ggml/src/ggml-cuda/out-prod.cuh @@ -1,3 +1,10 @@ #include "common.cuh" void ggml_cuda_out_prod(ggml_backend_cuda_context & ctx, ggml_tensor * dst); + +// Scattered outer-product for MUL_MAT_ID backward (gradient w.r.t. expert weight matrices). +// src0: activations [cols, n_expert_used, n_tokens] F32 +// src1: grad_output [rows, n_expert_used, n_tokens] F32 +// src2: expert ids [n_expert_used, n_tokens] I32 (may be CPU-resident) +// dst: grad_weight [cols, rows, n_expert, 1] F32 +void ggml_cuda_out_prod_id(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/ggml/src/ggml-opt.cpp b/ggml/src/ggml-opt.cpp index e078ad14a3..8be90c8944 100644 --- a/ggml/src/ggml-opt.cpp +++ b/ggml/src/ggml-opt.cpp @@ -58,10 +58,13 @@ struct ggml_opt_context { std::vector grad_accs; std::vector grad_m; std::vector grad_v; + std::vector bufs_momenta; // per-param moment buffers (one per param node) + std::vector ctxs_momenta; // corresponding ggml contexts (keep alive for tensor metadata) int64_t iter = 1; int32_t opt_period = 1; int32_t opt_i = 0; + int32_t grad_checkpoint_interval = 0; bool loss_per_datapoint = false; ggml_opt_get_optimizer_params get_opt_pars = nullptr; @@ -230,6 +233,7 @@ struct ggml_opt_optimizer_params ggml_opt_get_default_optimizer_params(void * us result.adamw.beta2 = 0.999f; result.adamw.eps = 1e-8f; result.adamw.wd = 0.0f; + result.adamw.gclip = 0.0f; result.sgd.alpha = 1e-3f; result.sgd.wd = 0.0f; @@ -253,9 +257,10 @@ struct 
ggml_opt_params ggml_opt_default_params( /*loss_type =*/ loss_type, /*build_type =*/ GGML_OPT_BUILD_TYPE_OPT, /*opt_period =*/ 1, - /*get_opt_pars =*/ ggml_opt_get_default_optimizer_params, - /*get_opt_pars_ud =*/ nullptr, - /*optimizer =*/ GGML_OPT_OPTIMIZER_TYPE_ADAMW, + /*get_opt_pars =*/ ggml_opt_get_default_optimizer_params, + /*get_opt_pars_ud =*/ nullptr, + /*grad_checkpoint_interval =*/ 0, + /*optimizer =*/ GGML_OPT_OPTIMIZER_TYPE_ADAMW, }; } @@ -475,8 +480,23 @@ static void ggml_opt_build(ggml_opt_context_t opt_ctx) { for (int i = 0; i < n_nodes; ++i) { ggml_tensor * node = opt_ctx->gf->nodes[i]; if (node->flags & GGML_TENSOR_FLAG_PARAM) { - opt_ctx->grad_m[i] = ggml_new_tensor(opt_ctx->ctx_static, GGML_TYPE_F32, GGML_MAX_DIMS, node->ne); - opt_ctx->grad_v[i] = ggml_new_tensor(opt_ctx->ctx_static, GGML_TYPE_F32, GGML_MAX_DIMS, node->ne); + // Allocate moments on the same buffer type as the param tensor so + // the ADAMW op runs on the correct backend (avoids cross-device mismatch + // when some LoRA tensors are on CPU and others on GPU with partial offload). + ggml_backend_buffer_type_t param_buft = node->buffer + ? ggml_backend_buffer_get_type(node->buffer) + : ggml_backend_cpu_buffer_type(); + + // Allocate a tiny context + buffer for this pair of moment tensors. 
+ const size_t sz = 2 * ggml_tensor_overhead(); + struct ggml_init_params mip = { sz, nullptr, true }; + struct ggml_context * mctx = ggml_init(mip); + opt_ctx->grad_m[i] = ggml_new_tensor(mctx, GGML_TYPE_F32, GGML_MAX_DIMS, node->ne); + opt_ctx->grad_v[i] = ggml_new_tensor(mctx, GGML_TYPE_F32, GGML_MAX_DIMS, node->ne); + ggml_backend_buffer_t mbuf = ggml_backend_alloc_ctx_tensors_from_buft(mctx, param_buft); + ggml_backend_buffer_clear(mbuf, 0); + opt_ctx->bufs_momenta.push_back(mbuf); + opt_ctx->ctxs_momenta.push_back(mctx); // keep alive for tensor metadata } else { opt_ctx->grad_m[i] = nullptr; opt_ctx->grad_v[i] = nullptr; @@ -485,6 +505,31 @@ static void ggml_opt_build(ggml_opt_context_t opt_ctx) { } } + // Gradient checkpointing: mark every Nth forward node as OUTPUT so the allocator + // keeps its memory alive through the backward pass. The backward graph already + // contains the forward ops (gb_grad is a superset of gf), so the checkpointed + // activations are naturally available for backward matmuls without recomputation. + // This prevents the allocator from aliasing those buffers to later ops, cutting + // peak activation VRAM at the cost of slightly larger static allocation. + if (opt_ctx->grad_checkpoint_interval > 0) { + const int interval = opt_ctx->grad_checkpoint_interval; + const int n_fwd = opt_ctx->gf->n_nodes; + int ckpt_count = 0; + for (int i = interval - 1; i < n_fwd; i += interval) { + struct ggml_tensor * node = opt_ctx->gf->nodes[i]; + // Only checkpoint F32 compute nodes — skip I32 index tensors and already-output nodes. 
+ if (node->type != GGML_TYPE_F32) continue; + if (node->flags & GGML_TENSOR_FLAG_OUTPUT) continue; + if (node->flags & GGML_TENSOR_FLAG_INPUT) continue; + node->flags |= GGML_TENSOR_FLAG_OUTPUT; + ckpt_count++; + } + if (ckpt_count > 0) { + GGML_LOG_DEBUG("%s: gradient checkpointing: marked %d/%d nodes as persistent (interval=%d)\n", + __func__, ckpt_count, n_fwd, interval); + } + } + // gb_grad == graph backward gradients, forward pass, then backward pass to calculate gradients. opt_ctx->gb_grad = ggml_graph_dup(opt_ctx->ctx_compute, opt_ctx->gf, /*force_grads =*/ true); ggml_build_backward_expand(opt_ctx->ctx_compute, opt_ctx->gb_grad, opt_ctx->grad_accs.data()); @@ -503,7 +548,7 @@ static void ggml_opt_build(ggml_opt_context_t opt_ctx) { // gb_opt == graph backward optimize, forward pass, then backward pass to calculate gradients, then optimizer step. opt_ctx->gb_opt = ggml_graph_dup(opt_ctx->ctx_compute, opt_ctx->gb_grad, /*force_grads =*/ true); - opt_ctx->opt_step_params = ggml_new_tensor_1d(opt_ctx->ctx_cpu, GGML_TYPE_F32, need_momenta ? 7 : 2); + opt_ctx->opt_step_params = ggml_new_tensor_1d(opt_ctx->ctx_cpu, GGML_TYPE_F32, need_momenta ? 
8 : 2); ggml_tensor * adamw_params = opt_ctx->opt_step_params; ggml_set_input(adamw_params); const char * optimizer_name = ggml_opt_optimizer_name(opt_ctx->optimizer); @@ -555,10 +600,11 @@ ggml_opt_context_t ggml_opt_init(struct ggml_opt_params params) { result->build_type_alloc = params.build_type; result->inputs = params.inputs; result->outputs = params.outputs; - result->opt_period = params.opt_period; - result->get_opt_pars = params.get_opt_pars; - result->get_opt_pars_ud = params.get_opt_pars_ud; - result->optimizer = params.optimizer; + result->opt_period = params.opt_period; + result->grad_checkpoint_interval = params.grad_checkpoint_interval; + result->get_opt_pars = params.get_opt_pars; + result->get_opt_pars_ud = params.get_opt_pars_ud; + result->optimizer = params.optimizer; GGML_ASSERT(result->opt_period >= 1); @@ -587,6 +633,12 @@ void ggml_opt_free(ggml_opt_context_t opt_ctx) { } ggml_backend_buffer_free(opt_ctx->buf_static); ggml_backend_buffer_free(opt_ctx->buf_cpu); + for (ggml_backend_buffer_t buf : opt_ctx->bufs_momenta) { + ggml_backend_buffer_free(buf); + } + for (struct ggml_context * ctx : opt_ctx->ctxs_momenta) { + ggml_free(ctx); + } ggml_free(opt_ctx->ctx_static); ggml_free(opt_ctx->ctx_cpu); delete opt_ctx; @@ -726,6 +778,17 @@ void ggml_opt_alloc(ggml_opt_context_t opt_ctx, bool backward) { if (opt_ctx->build_type == GGML_OPT_BUILD_TYPE_OPT && opt_ctx->opt_period > 1 && opt_ctx->opt_i == 0) { ggml_graph_reset(opt_ctx->gb_grad); } + + // For non-static graphs the compute graph is rebuilt every call, so ggml_graph_reset + // is not called and grad_accs may carry over values from the previous accumulation window. + // Explicitly zero them at the start of each gradient-accumulation cycle. 
+ if (!opt_ctx->static_graphs && backward && opt_ctx->opt_i == 0) { + for (struct ggml_tensor * ga : opt_ctx->grad_accs) { + if (ga) { + ggml_set_zero(ga); + } + } + } if (backward) { const int32_t opt_i_next = (opt_ctx->opt_i + 1) % opt_ctx->opt_period; opt_ctx->build_type = opt_i_next == 0 ? GGML_OPT_BUILD_TYPE_OPT : GGML_OPT_BUILD_TYPE_GRAD; @@ -793,6 +856,7 @@ void ggml_opt_eval(ggml_opt_context_t opt_ctx, ggml_opt_result_t result) { GGML_ASSERT(opt_pars.adamw.eps >= 0.0f); GGML_ASSERT(opt_pars.adamw.wd >= 0.0f); GGML_ASSERT(opt_pars.adamw.wd <= 1.0f); + GGML_ASSERT(opt_pars.adamw.gclip >= 0.0f); // beta1, beta2 after applying warmup const float beta1h = 1.0f / (1.0f - powf(opt_pars.adamw.beta1, opt_ctx->iter)); @@ -806,6 +870,7 @@ void ggml_opt_eval(ggml_opt_context_t opt_ctx, ggml_opt_result_t result) { adamw_par_data[4] = opt_pars.adamw.wd; adamw_par_data[5] = beta1h; adamw_par_data[6] = beta2h; + adamw_par_data[7] = opt_pars.adamw.gclip; } break; case GGML_OPT_OPTIMIZER_TYPE_SGD: { GGML_ASSERT(opt_pars.sgd.alpha > 0.0f); diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index e5b83e1447..f85f45b3de 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -984,6 +984,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "MUL_MAT", "MUL_MAT_ID", "OUT_PROD", + "OUT_PROD_ID", "SCALE", "SET", @@ -1057,7 +1058,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "GLU", }; -static_assert(GGML_OP_COUNT == 96, "GGML_OP_COUNT != 96"); +static_assert(GGML_OP_COUNT == 97, "GGML_OP_COUNT != 97"); static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "none", @@ -1094,6 +1095,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "X*Y", "X[i]*Y", "X*Y", + "X_id⊗Y_id", "x*v", "y-\\>view(x)", @@ -1167,7 +1169,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "glu(x)", }; -static_assert(GGML_OP_COUNT == 96, "GGML_OP_COUNT != 96"); +static_assert(GGML_OP_COUNT == 97, "GGML_OP_COUNT != 97"); static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2"); @@ 
-3302,6 +3304,44 @@ struct ggml_tensor * ggml_out_prod( return result; } +// ggml_out_prod_id +// +// Scattered outer-product for the MUL_MAT_ID backward pass. +// +// a: [cols, n_expert_used, n_tokens] F32 — activations (src1 of MUL_MAT_ID) +// b: [rows, n_expert_used, n_tokens] F32 — upstream gradient +// ids: [n_expert_used, n_tokens] I32 — expert dispatch indices (src2 of MUL_MAT_ID) +// result: [cols, rows, n_expert, 1] F32 +// +// result[:, :, e] += sum_{(i,t): ids[i,t]==e} a[:, i, t] ⊗ b[:, i, t] +// +// Computes the gradient w.r.t. the expert weight matrices (src0) of MUL_MAT_ID. +struct ggml_tensor * ggml_out_prod_id( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * ids, + int64_t n_expert) { + GGML_ASSERT(a->type == GGML_TYPE_F32); + GGML_ASSERT(b->type == GGML_TYPE_F32); + GGML_ASSERT(ids->type == GGML_TYPE_I32); + GGML_ASSERT(a->ne[1] == b->ne[1]); // n_expert_used matches + GGML_ASSERT(a->ne[2] == b->ne[2]); // n_tokens matches + GGML_ASSERT(ids->ne[0] == a->ne[1]); // n_expert_used matches ids + GGML_ASSERT(ids->ne[1] == a->ne[2]); // n_tokens matches ids + GGML_ASSERT(n_expert > 0); + + const int64_t ne[4] = { a->ne[0], b->ne[0], n_expert, 1 }; + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); + + result->op = GGML_OP_OUT_PROD_ID; + result->src[0] = a; + result->src[1] = b; + result->src[2] = ids; + + return result; +} + // ggml_scale static struct ggml_tensor * ggml_scale_impl( @@ -3840,12 +3880,17 @@ struct ggml_tensor * ggml_get_rows_back( struct ggml_tensor * a, struct ggml_tensor * b, struct ggml_tensor * c) { - GGML_ASSERT(ggml_is_matrix(a) && ggml_is_vector(b) && b->type == GGML_TYPE_I32); - GGML_ASSERT(ggml_is_matrix(c) && (a->ne[0] == c->ne[0])); + GGML_ASSERT(b->type == GGML_TYPE_I32); + GGML_ASSERT(a->ne[0] == c->ne[0]); + // Support both 2D and 3D: result shape matches c (the source tensor shape) // TODO: implement non F32 return - //struct ggml_tensor * 
result = ggml_new_tensor_2d(ctx, a->type, a->ne[0], b->ne[0]); - struct ggml_tensor * result = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, c->ne[0], c->ne[1]); + struct ggml_tensor * result; + if (c->ne[2] > 1) { + result = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, c->ne[0], c->ne[1], c->ne[2]); + } else { + result = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, c->ne[0], c->ne[1]); + } result->op = GGML_OP_GET_ROWS_BACK; result->src[0] = a; @@ -6064,7 +6109,7 @@ struct ggml_tensor * ggml_opt_step_adamw( GGML_ASSERT(ggml_are_same_shape(a, m)); GGML_ASSERT(ggml_are_same_shape(a, v)); GGML_ASSERT(adamw_params->type == GGML_TYPE_F32); - GGML_ASSERT(ggml_nelements(adamw_params) == 7); + GGML_ASSERT(ggml_nelements(adamw_params) == 8); struct ggml_tensor * result = ggml_view_tensor(ctx, a); @@ -6527,6 +6572,35 @@ static void ggml_compute_backward( grad))); // [m,p,qq,rr] } } break; + case GGML_OP_MUL_MAT_ID: { + // Backward pass for indirect matrix multiplication (MoE). + // + // Forward: dst[rows, n_exp_used, n_tokens] = as[:,:,ids[i,t]] @ b[:,i,t] + // src0 = as [cols, rows, n_expert] — expert weight matrices + // src1 = b [cols, n_exp_used, n_tokens] — token activations + // src2 = ids [n_exp_used, n_tokens] — expert dispatch indices (I32) + // + // Gradient w.r.t. src1 (activations): + // grad_b[:,i,t] = as[:,:,ids[i,t]]^T @ grad[:,i,t] + // → computed via MUL_MAT_ID with transposed as + // + // Gradient w.r.t. src0 (expert weights, only when F32 i.e. LoRA): + // grad_as[:,:,e] += sum_{(i,t): ids[i,t]==e} b[:,i,t] ⊗ grad[:,i,t] + // → computed via OUT_PROD_ID + // + // Quantized src0 is frozen (stop-gradient) — handled in grads_needed below. 
+ if (src0_needs_grads) { + const int64_t n_expert = src0->ne[2]; + struct ggml_tensor * grad_as = ggml_out_prod_id(ctx, src1, grad, src2, n_expert); + ggml_add_or_set(ctx, cgraph, isrc0, grad_as); + } + if (src1_needs_grads) { + // Transpose expert matrices: as [cols, rows, n_expert] → as_T [rows, cols, n_expert] + struct ggml_tensor * as_T = ggml_cont(ctx, ggml_permute(ctx, src0, 1, 0, 2, 3)); + struct ggml_tensor * grad_b = ggml_mul_mat_id(ctx, as_T, grad, src2); + ggml_add_or_set(ctx, cgraph, isrc1, grad_b); + } + } break; case GGML_OP_SCALE: { if (src0_needs_grads) { float s; @@ -6973,6 +7047,35 @@ void ggml_build_backward_expand( ignore_src[1] = true; break; + // MUL_MAT_ID: expert dispatch indices (src2) are integer — no gradient. + // When src0 is quantized the expert weights are frozen, so stop gradient through + // both src0 and src1 (activations have no path to loss without differentiable weights). + case GGML_OP_MUL_MAT_ID: + if (ggml_is_quantized(node->src[0]->type)) { + ignore_src[0] = true; + ignore_src[1] = true; + } + ignore_src[2] = true; // ids: integer tensor + break; + + // SET_ROWS is a KV-cache scatter write. The gradient of the written data flows + // through the attention read path (GET_ROWS backward), not through this node. + case GGML_OP_SET_ROWS: + ignore_src[0] = true; + ignore_src[1] = true; + break; + + // Ops with no backward implementation — stop gradient through all sources so the + // backward graph builder never tries to propagate through them. 
+ case GGML_OP_SSM_CONV: // Mamba causal conv1d + case GGML_OP_SSM_SCAN: // Mamba selective scan + case GGML_OP_FLASH_ATTN_EXT: // use standard attention for training + ignore_src[0] = true; + ignore_src[1] = true; + ignore_src[2] = true; + ignore_src[3] = true; + break; + default: break; } @@ -6988,9 +7091,12 @@ void ggml_build_backward_expand( continue; } - // inplace operations are currently not supported - GGML_ASSERT(!node->view_src || node->op == GGML_OP_CPY || node->op == GGML_OP_VIEW || - node->op == GGML_OP_RESHAPE || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_TRANSPOSE); + // inplace operations are currently not supported — warn and skip instead of crashing + if (node->view_src && node->op != GGML_OP_CPY && node->op != GGML_OP_VIEW && + node->op != GGML_OP_RESHAPE && node->op != GGML_OP_PERMUTE && node->op != GGML_OP_TRANSPOSE) { + GGML_LOG_WARN("%s: skipping unsupported inplace op '%s' in backward graph\n", __func__, ggml_op_name(node->op)); + continue; + } const size_t ihash = ggml_hash_find(&cgraph->visited_hash_set, node); GGML_ASSERT(ihash != GGML_HASHSET_FULL); diff --git a/include/llama.h b/include/llama.h index c6e102abe5..382c66e4c3 100644 --- a/include/llama.h +++ b/include/llama.h @@ -1553,10 +1553,22 @@ extern "C" { void * get_opt_pars_ud; // userdata for calculating optimizer parameters enum ggml_opt_optimizer_type optimizer_type; + + // Gradient checkpointing: mark every Nth forward graph node as persistent so the + // allocator cannot reuse its memory during backward. Reduces peak activation VRAM + // at the cost of ~0 extra compute (activations are kept, not recomputed). + // Set to 0 (default) to disable. Good values: 32–64 nodes ≈ every 1–2 transformer layers. + int32_t grad_checkpoint_interval; }; LLAMA_API void llama_opt_init(struct llama_context * lctx, struct llama_model * model, struct llama_opt_params lopt_params); + // weights: array of floats, one per dataset window (indexed by idata), already normalized to [0,1]. 
+ // n_weights: length of the array. + // Pass NULL/0 to disable (equivalent to all-ones, i.e. standard SFT). + // The pointer must remain valid for the duration of all llama_opt_epoch calls. + LLAMA_API void llama_opt_set_reward_weights(const float * weights, int64_t n_weights); + LLAMA_API void llama_opt_epoch( struct llama_context * lctx, ggml_opt_dataset_t dataset, @@ -1564,7 +1576,8 @@ extern "C" { ggml_opt_result_t result_eval, int64_t idata_split, ggml_opt_epoch_callback callback_train, - ggml_opt_epoch_callback callback_eval); + ggml_opt_epoch_callback callback_eval, + bool shuffle); #ifdef __cplusplus } diff --git a/src/llama-adapter.cpp b/src/llama-adapter.cpp index d6a5800e63..1fee93ba40 100644 --- a/src/llama-adapter.cpp +++ b/src/llama-adapter.cpp @@ -334,16 +334,26 @@ static void llama_adapter_lora_init_impl(llama_model & model, const char * path_ auto * buft = ggml_backend_buffer_get_type(model_tensor->buffer); - // do not load loras to extra buffer types (i.e. bufts for repacking) -> use the CPU in that case + // do not load loras to extra buffer types (i.e. bufts for repacking) + // try device-native buft first (keeps LoRA on GPU), fall back to CPU only as last resort for (auto & ex : buft_extra) { if (ex == buft) { - LLAMA_LOG_WARN("%s: lora for '%s' cannot use buft '%s', fallback to CPU\n", __func__, model_tensor->name, ggml_backend_buft_name(buft)); - - auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU); - if (!cpu_dev) { - throw std::runtime_error(format("%s: no CPU backend found", __func__)); + // try to get the device's native (non-repack) buffer type + auto * dev = ggml_backend_buft_get_device(buft); + auto * native_buft = dev ? 
ggml_backend_dev_buffer_type(dev) : nullptr; + if (native_buft && native_buft != buft) { + LLAMA_LOG_WARN("%s: lora for '%s' cannot use repack buft '%s', using device-native '%s'\n", + __func__, model_tensor->name, ggml_backend_buft_name(buft), ggml_backend_buft_name(native_buft)); + buft = native_buft; + } else { + LLAMA_LOG_WARN("%s: lora for '%s' cannot use buft '%s', fallback to CPU\n", + __func__, model_tensor->name, ggml_backend_buft_name(buft)); + auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU); + if (!cpu_dev) { + throw std::runtime_error(format("%s: no CPU backend found", __func__)); + } + buft = ggml_backend_dev_buffer_type(cpu_dev); } - buft = ggml_backend_dev_buffer_type(cpu_dev); break; } diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 1f7a52d789..74de4d5e53 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -2677,11 +2677,71 @@ void llama_context::opt_init(struct llama_model * model, struct llama_opt_params GGML_ASSERT(model->hparams.n_ctx_train % n_batch == 0); GGML_ASSERT(n_batch % n_ubatch == 0); + // Recreate the scheduler and gf_res_prev with a training-inflated graph size before + // creating opt_ctx, so opt_ctx captures the new (larger) scheduler pointer. + // The backward graph (gb_grad) duplicates gf and adds ~2-3x more nodes+leafs; + // gb_opt adds optimizer step nodes on top. + // + // We measure the actual training forward graph node count at n_ubatch here, + // then multiply by 4 to cover gf + gb_grad + gb_opt. This is exact for any + // model size — no magic constant needed. + { + uint32_t train_fwd_nodes = 0; + + // Build a real training-ubatch forward graph in split-only mode (no buffer realloc) + // so we can count its actual nodes. Fall back to n_tensors formula if it fails. + if (memory) { + auto mctx_tmp = memory->init_full(); + if (mctx_tmp) { + // graph_reserve() uses gf_res_reserve to build the graph, so both + // must be large enough to hold the training forward graph. 
+ // Use 16x n_tensors as a generous temporary cap for the measurement pass. + const uint32_t tmp_cap = std::max(4096u, 16u * model->n_tensors()); + gf_res_prev.reset(new llm_graph_result(tmp_cap)); + gf_res_reserve.reset(new llm_graph_result(tmp_cap)); + // split_only=true: only splits the graph, doesn't reallocate compute buffers + auto * gf_train = graph_reserve(n_ubatch, 1, n_ubatch, mctx_tmp.get(), /*split_only=*/true); + if (gf_train) { + train_fwd_nodes = (uint32_t)ggml_graph_n_nodes(gf_train); + LLAMA_LOG_INFO("%s: measured training graph nodes = %u (n_ubatch=%u)\n", + __func__, train_fwd_nodes, n_ubatch); + } + } + } + + if (train_fwd_nodes == 0) { + // Fallback: use n_tensors formula + train_fwd_nodes = std::max(1024u, 8u * model->n_tensors()); + LLAMA_LOG_WARN("%s: could not measure training graph, using fallback nodes=%u\n", + __func__, train_fwd_nodes); + } + + // gf + gb_grad + gb_opt each need ~train_fwd_nodes; multiply by 4 for safety headroom. + // Multiply by 2 again for the scheduler's n_nodes + n_leafs check. + const int64_t inflated = (int64_t)std::max(train_fwd_nodes, 1024u) * 4; + const int64_t sched_size = inflated * 2; + // Both gf_res_prev and gf_res_reserve are used to build forward graphs + // (graph_reserve uses gf_res_reserve; opt_epoch_iter uses gf_res_prev). + // Both must have capacity for the full backward graph. + gf_res_prev.reset(new llm_graph_result(inflated)); + gf_res_reserve.reset(new llm_graph_result(inflated)); + sched.reset(ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), + sched_size, cparams.pipeline_parallel, cparams.op_offload)); + // Suppress the next sched_reserve() call so that llama_decode() during GRPO inference + // steps does NOT replace the training sched with a smaller inference sched. + // opt_ctx->backend_sched stores a raw pointer to sched.get(); replacing sched while + // opt_ctx is alive would leave that pointer dangling and crash on the next opt_epoch. 
+ sched_need_reserve = false; + LLAMA_LOG_INFO("%s: training graph capacity = %lld (train_fwd_nodes=%u x4)\n", + __func__, (long long)inflated, train_fwd_nodes); + } + ggml_opt_params opt_params = ggml_opt_default_params(sched.get(), GGML_OPT_LOSS_TYPE_CROSS_ENTROPY); - opt_params.opt_period = n_batch / n_ubatch; - opt_params.get_opt_pars = lopt_params.get_opt_pars; - opt_params.get_opt_pars_ud = lopt_params.get_opt_pars_ud; - opt_params.optimizer = lopt_params.optimizer_type; + opt_params.opt_period = n_batch / n_ubatch; + opt_params.get_opt_pars = lopt_params.get_opt_pars; + opt_params.get_opt_pars_ud = lopt_params.get_opt_pars_ud; + opt_params.optimizer = lopt_params.optimizer_type; + opt_params.grad_checkpoint_interval = lopt_params.grad_checkpoint_interval; opt_ctx = ggml_opt_init(opt_params); llama_opt_param_filter param_filter = lopt_params.param_filter; @@ -2716,6 +2776,7 @@ void llama_context::opt_epoch_iter( const std::vector & tokens, const std::vector & labels_sparse, llama_batch & batch, + float reward_scale, ggml_opt_epoch_callback callback, bool train, int64_t idata_in_loop, @@ -2764,6 +2825,8 @@ void llama_context::opt_epoch_iter( }; uint32_t pos_batch = 0; + static bool timings_printed = false; // print per-ubatch timings only for the first window + struct ggml_context * ctx_compute_opt = nullptr; do { const auto & ubatch = mctx->get_ubatch(); @@ -2776,56 +2839,98 @@ void llama_context::opt_epoch_iter( auto * res = gf_res_prev.get(); + const int64_t t0_build = ggml_time_ms(); const auto gparams = graph_params(res, ubatch, mctx.get(), LLM_GRAPH_TYPE_DEFAULT); res->reset(); auto * gf = model.build_graph(gparams); - struct ggml_context * ctx_compute_opt; - { + // Allocate the tensor metadata context once, then reset it each iteration. + // ggml_reset() is much cheaper than ggml_free()+ggml_init() — it just resets the + // allocation pointer without freeing/reallocating the backing memory buffer. 
+ if (!ctx_compute_opt) { const size_t size_gf = ggml_graph_size(gf); - const size_t size_meta = 4*size_gf*ggml_tensor_overhead() + 2*ggml_graph_overhead_custom(size_gf, /*grads = */ true); + const size_t size_meta = 4*size_gf*ggml_tensor_overhead() + 3*ggml_graph_overhead_custom(size_gf, /*grads = */ true); struct ggml_init_params params = { /*.mem_size =*/ size_meta, /*.mem_buffer =*/ nullptr, /*.no_alloc =*/ true, }; ctx_compute_opt = ggml_init(params); + if (!timings_printed) { + LLAMA_LOG_INFO("%s: [timing] graph capacity=%zu n_nodes=%d size_meta=%.1fMB\n", __func__, + size_gf, ggml_graph_n_nodes(gf), (double)size_meta / (1024*1024)); + } + } else { + ggml_reset(ctx_compute_opt); } + + const int64_t t1_alloc = ggml_time_ms(); ggml_opt_prepare_alloc(opt_ctx, ctx_compute_opt, gf, res->get_inp_tokens(), res->get_logits()); ggml_opt_alloc(opt_ctx, train); + const int64_t t2_inputs = ggml_time_ms(); res->set_inputs(&ubatch); { struct ggml_tensor * labels = ggml_opt_labels(opt_ctx); GGML_ASSERT(labels->ne[1] == n_ubatch); ggml_set_zero(labels); - const float onef = 1.0f; for (uint32_t pos_ubatch = 0; pos_ubatch < n_ubatch; ++pos_ubatch) { const uint32_t ilabel = pos_ctx + pos_batch + pos_ubatch; + // -1 sentinel means "masked position" (prompt token, BOS separator, etc). + // Leave the label tensor zeroed at this position → zero cross-entropy + // contribution. Do NOT write anything — ggml_set_zero already handled it. 
+ if (labels_sparse[ilabel] < 0) continue; GGML_ASSERT(labels_sparse[ilabel] < labels->ne[0]); - ggml_backend_tensor_set(labels, &onef, (pos_ubatch*labels->ne[0] + labels_sparse[ilabel])*sizeof(float), sizeof(float)); + ggml_backend_tensor_set(labels, &reward_scale, (pos_ubatch*labels->ne[0] + labels_sparse[ilabel])*sizeof(float), sizeof(float)); } } + + const int64_t t3_eval = ggml_time_ms(); ggml_opt_eval(opt_ctx, result); + + const int64_t t4_done = ggml_time_ms(); + if (!timings_printed) { + LLAMA_LOG_INFO("%s: [timing] build=%" PRId64 "ms alloc=%" PRId64 "ms inputs=%" PRId64 "ms eval=%" PRId64 "ms total=%" PRId64 "ms\n", + __func__, + t1_alloc - t0_build, + t2_inputs - t1_alloc, + t3_eval - t2_inputs, + t4_done - t3_eval, + t4_done - t0_build); + timings_printed = true; + } + if (callback) { callback(train, opt_ctx, dataset, result, idata_in_loop + (pos_ctx + pos_batch)/n_ubatch + 1, ndata_in_loop, t_loop_start); } - ggml_free(ctx_compute_opt); pos_batch += ubatch.n_tokens; } while (mctx->next()); + ggml_free(ctx_compute_opt); } } +// Optional per-window reward weights for reward-weighted SFT. +// Set via llama_opt_set_reward_weights() before calling llama_opt_epoch(). +// Null/0 means all rewards are 1.0 (standard SFT). 
+static thread_local const float * g_reward_weights = nullptr; +static thread_local int64_t g_reward_weights_n = 0; + +void llama_opt_set_reward_weights(const float * weights, int64_t n_weights) { + g_reward_weights = weights; + g_reward_weights_n = n_weights; +} + void llama_context::opt_epoch( ggml_opt_dataset_t dataset, ggml_opt_result_t result_train, ggml_opt_result_t result_eval, int64_t idata_split, ggml_opt_epoch_callback callback_train, - ggml_opt_epoch_callback callback_eval) { + ggml_opt_epoch_callback callback_eval, + bool shuffle) { const uint32_t n_ctx = this->n_ctx(); const uint32_t n_batch = std::min(cparams.n_batch, n_ctx); const uint32_t n_ubatch = std::min(cparams.n_ubatch, n_batch); @@ -2834,6 +2939,10 @@ void llama_context::opt_epoch( GGML_ASSERT(idata_split >= 0); GGML_ASSERT(idata_split <= ndata); + if (shuffle && idata_split > 1) { + ggml_opt_dataset_shuffle(opt_ctx, dataset, idata_split); + } + const uint32_t ubatch_per_ctx = n_ctx / n_ubatch; struct llama_batch batch = llama_batch_init(n_batch, 0, 1); @@ -2847,9 +2956,11 @@ void llama_context::opt_epoch( for (; idata < idata_split; ++idata) { constexpr bool train = true; const int64_t idata_in_loop = idata*ubatch_per_ctx; + const float reward = (g_reward_weights && idata < g_reward_weights_n) + ? 
g_reward_weights[idata] : 1.0f; ggml_opt_dataset_get_batch_host(dataset, tokens.data(), n_ctx*sizeof(llama_token), labels_sparse.data(), idata); - opt_epoch_iter(dataset, result_train, tokens, labels_sparse, batch, + opt_epoch_iter(dataset, result_train, tokens, labels_sparse, batch, reward, callback_train, train, idata_in_loop, ndata_in_loop, t_loop_start); } @@ -2860,7 +2971,7 @@ void llama_context::opt_epoch( const int64_t idata_in_loop = (idata - idata_split)*ubatch_per_ctx; ggml_opt_dataset_get_batch_host(dataset, tokens.data(), n_ctx*sizeof(llama_token), labels_sparse.data(), idata); - opt_epoch_iter(dataset, result_eval, tokens, labels_sparse, batch, + opt_epoch_iter(dataset, result_eval, tokens, labels_sparse, batch, 1.0f, callback_eval, train, idata_in_loop, ndata_in_loop, t_loop_start); } @@ -3622,12 +3733,14 @@ void llama_opt_epoch( ggml_opt_result_t result_eval, int64_t idata_split, ggml_opt_epoch_callback callback_train, - ggml_opt_epoch_callback callback_eval) { + ggml_opt_epoch_callback callback_eval, + bool shuffle) { ctx->opt_epoch( dataset, result_train, result_eval, idata_split, callback_train, - callback_eval); + callback_eval, + shuffle); } diff --git a/src/llama-context.h b/src/llama-context.h index e0d0085c1c..21d0ba6299 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -187,7 +187,8 @@ struct llama_context { ggml_opt_result_t result_eval, int64_t idata_split, ggml_opt_epoch_callback callback_train, - ggml_opt_epoch_callback callback_eval); + ggml_opt_epoch_callback callback_eval, + bool shuffle); void opt_epoch_iter( ggml_opt_dataset_t dataset, @@ -195,6 +196,7 @@ struct llama_context { const std::vector & tokens, const std::vector & labels_sparse, llama_batch & batch, + float reward_scale, ggml_opt_epoch_callback callback, bool train, int64_t idata_in_loop,