This commit is contained in:
Salvatore Rossitto 2026-03-15 23:55:07 +02:00 committed by GitHub
commit 727b1f69dc
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
24 changed files with 2685 additions and 68 deletions

View File

@ -3607,32 +3607,108 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
{ "-lr", "--learning-rate" }, "ALPHA",
string_format("adamw or sgd optimizer alpha (default: %.2g); note: sgd alpha recommended ~10x (no momentum)", (double) params.lr.lr0),
[](common_params & params, const std::string & value) { params.lr.lr0 = std::stof(value); }
).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
).set_examples({ LLAMA_EXAMPLE_FINETUNE, LLAMA_EXAMPLE_FINETUNE_QLORA }));
add_opt(common_arg({ "-lr-min", "--learning-rate-min" }, "ALPHA",
string_format("(if >0) final learning rate after decay (if -decay-epochs is set, default=%.2g)",
(double) params.lr.lr_min),
[](common_params & params, const std::string & value) { params.lr.lr_min = std::stof(value); }
).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
).set_examples({ LLAMA_EXAMPLE_FINETUNE, LLAMA_EXAMPLE_FINETUNE_QLORA }));
add_opt(common_arg(
{"-decay-epochs", "--learning-rate-decay-epochs"}, "ALPHA",
string_format("(if >0) decay learning rate to -lr-min after this many epochs (exponential decay, default=%.2g)", (double) params.lr.decay_epochs),
[](common_params & params, const std::string & value) { params.lr.decay_epochs = std::stof(value); }
).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
).set_examples({ LLAMA_EXAMPLE_FINETUNE, LLAMA_EXAMPLE_FINETUNE_QLORA }));
add_opt(common_arg(
{"-wd", "--weight-decay"}, "WD",
string_format("adamw or sgd optimizer weight decay (0 is off; recommend very small e.g. 1e-9) (default: %.2g).", (double) params.lr.wd),
[](common_params & params, const std::string & value) { params.lr.wd = std::stof(value); }
).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
).set_examples({ LLAMA_EXAMPLE_FINETUNE, LLAMA_EXAMPLE_FINETUNE_QLORA }));
add_opt(common_arg(
{"-val-split", "--val-split"}, "FRACTION",
string_format("fraction of data to use as validation set for training (default: %.2g).", (double) params.val_split),
[](common_params & params, const std::string & value) { params.val_split = std::stof(value); }
).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
).set_examples({ LLAMA_EXAMPLE_FINETUNE, LLAMA_EXAMPLE_FINETUNE_QLORA }));
// qlora flags
add_opt(common_arg(
{"--lora-rank"}, "N",
string_format("LoRA rank r (default: %d)", params.lora_rank),
[](common_params & params, int value) { params.lora_rank = value; }
).set_examples({ LLAMA_EXAMPLE_FINETUNE_QLORA }));
add_opt(common_arg(
{"--lora-alpha"}, "F",
string_format("LoRA alpha (default: %d = use rank value)", (int) params.lora_alpha),
[](common_params & params, const std::string & value) { params.lora_alpha = std::stof(value); }
).set_examples({ LLAMA_EXAMPLE_FINETUNE_QLORA }));
add_opt(common_arg(
{"--lora-targets"}, "SUBSTRINGS",
string_format("comma-separated substrings of tensor names to add LoRA to (default: %s)", params.lora_targets.c_str()),
[](common_params & params, const std::string & value) { params.lora_targets = value; }
).set_examples({ LLAMA_EXAMPLE_FINETUNE_QLORA }));
add_opt(common_arg(
{"--lora-out"}, "FNAME",
string_format("output LoRA adapter GGUF path (default: %s)", params.lora_out.c_str()),
[](common_params & params, const std::string & value) { params.lora_out = value; }
).set_examples({ LLAMA_EXAMPLE_FINETUNE_QLORA }));
add_opt(common_arg(
{"--train-file"}, "FNAME",
"JSONL training dataset (fields: messages|prompt+response|text)",
[](common_params & params, const std::string & value) { params.train_file = value; }
).set_examples({ LLAMA_EXAMPLE_FINETUNE_QLORA }));
add_opt(common_arg(
{"--save-every"}, "N",
"save adapter checkpoint every N dataset windows during training (default: 0 = only at end)",
[](common_params & params, int value) { params.save_every = value; }
).set_examples({ LLAMA_EXAMPLE_FINETUNE_QLORA }));
add_opt(common_arg(
{"--freeze-layers"}, "N",
"freeze first N transformer layers — no LoRA adapters allocated for blk.0..blk.N-1 (default: 0 = train all layers)",
[](common_params & params, int value) { params.lora_freeze_layers = value; }
).set_examples({ LLAMA_EXAMPLE_FINETUNE_QLORA }));
add_opt(common_arg(
{"--grad-checkpoint"}, "N",
"gradient checkpointing interval to reduce peak activation VRAM (0 = disabled, default: 0)",
[](common_params & params, int value) { params.grad_checkpoint_interval = value; }
).set_examples({ LLAMA_EXAMPLE_FINETUNE_QLORA }));
add_opt(common_arg(
{"--train-on-prompt"},
"compute loss on prompt tokens too, not just the response (default: response-only loss)",
[](common_params & params) { params.train_on_prompt = true; }
).set_examples({ LLAMA_EXAMPLE_FINETUNE_QLORA }));
add_opt(common_arg(
{"--shuffle-dataset"},
"shuffle dataset windows at the start of each epoch (default: sequential order)",
[](common_params & params) { params.shuffle_dataset = true; }
).set_examples({ LLAMA_EXAMPLE_FINETUNE_QLORA }));
add_opt(common_arg(
{"--grpo-mode"},
"enable GRPO IPC training loop (prompts and rewards supplied via stdin/stdout)",
[](common_params & params) { params.grpo_mode = true; }
).set_examples({ LLAMA_EXAMPLE_FINETUNE_QLORA }));
add_opt(common_arg(
{"--n-gen"}, "N",
string_format("GRPO: number of generations per prompt (default: %d)", params.grpo_n_gen),
[](common_params & params, int value) { params.grpo_n_gen = value; }
).set_examples({ LLAMA_EXAMPLE_FINETUNE_QLORA }));
add_opt(common_arg(
{"--n-steps"}, "N",
string_format("GRPO: total optimizer steps (default: %d)", params.grpo_n_steps),
[](common_params & params, int value) { params.grpo_n_steps = value; }
).set_examples({ LLAMA_EXAMPLE_FINETUNE_QLORA }));
add_opt(common_arg(
{"--grpo-temp"}, "F",
string_format("GRPO: sampling temperature for rollout generation (default: %.2f)", (double) params.grpo_temperature),
[](common_params & params, const std::string & value) { params.grpo_temperature = std::stof(value); }
).set_examples({ LLAMA_EXAMPLE_FINETUNE_QLORA }));
add_opt(common_arg(
{"--grpo-max-tokens"}, "N",
string_format("GRPO: max tokens per generation (default: %d)", params.grpo_max_tokens),
[](common_params & params, int value) { params.grpo_max_tokens = value; }
).set_examples({ LLAMA_EXAMPLE_FINETUNE_QLORA }));
add_opt(common_arg(
{"-epochs", "--epochs"}, "N",
string_format("optimizer max # of epochs (default: %d)", params.lr.epochs),
[](common_params & params, int epochs) { params.lr.epochs = epochs; }
).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
).set_examples({ LLAMA_EXAMPLE_FINETUNE, LLAMA_EXAMPLE_FINETUNE_QLORA }));
add_opt(common_arg(
{"-opt", "--optimizer"}, "sgd|adamw", "adamw or sgd",
[](common_params & params, const std::string & name) {
@ -3641,7 +3717,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
throw std::invalid_argument("invalid --optimizer, valid options: adamw, sgd");
}
}
).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
).set_examples({ LLAMA_EXAMPLE_FINETUNE, LLAMA_EXAMPLE_FINETUNE_QLORA }));
add_opt(common_arg(
{"--check"},
string_format("check rather than generate results (default: %s)", params.check ? "true" : "false"),

View File

@ -103,6 +103,7 @@ enum llama_example {
LLAMA_EXAMPLE_TTS,
LLAMA_EXAMPLE_DIFFUSION,
LLAMA_EXAMPLE_FINETUNE,
LLAMA_EXAMPLE_FINETUNE_QLORA,
LLAMA_EXAMPLE_FIT_PARAMS,
LLAMA_EXAMPLE_RESULTS,
LLAMA_EXAMPLE_EXPORT_GRAPH_OPS,
@ -518,7 +519,26 @@ struct common_params {
// finetune
struct lr_opt lr;
enum ggml_opt_optimizer_type optimizer = GGML_OPT_OPTIMIZER_TYPE_ADAMW;
float val_split = 0.05f; // fraction of the data used for the validation set
float val_split = 0.05f; // fraction of the data used for the validation set
// qlora fine-tuning
int32_t lora_rank = 16; // LoRA rank (r)
float lora_alpha = 0.0f; // LoRA alpha (0 = use rank value)
std::string lora_targets = "attn_q,attn_output,ffn_gate,ffn_up,ffn_down"; // comma-separated substrings to match trainable tensors
std::string lora_out = "adapter.gguf"; // output adapter GGUF path
std::string train_file = ""; // JSONL training dataset path
int32_t save_every = 0; // save adapter checkpoint every N dataset windows (0 = only at end)
int32_t lora_freeze_layers = 0; // do not apply LoRA to the first N transformer layers
int32_t grad_checkpoint_interval = 0; // gradient checkpointing interval to reduce peak VRAM (0 = disabled)
bool train_on_prompt = false; // include prompt tokens in training loss (default: response tokens only)
bool shuffle_dataset = false; // shuffle dataset windows at the start of each epoch
// grpo training
bool grpo_mode = false; // enable GRPO IPC training loop
int32_t grpo_n_gen = 8; // generations per prompt
int32_t grpo_n_steps = 500; // total GRPO optimizer steps
float grpo_temperature = 0.8f; // sampling temperature for rollouts
int32_t grpo_max_tokens = 512; // max tokens per generation
// embedding
bool embedding = false; // get only sentence embedding

View File

@ -34,6 +34,7 @@ else()
add_subdirectory(speculative-simple)
add_subdirectory(gen-docs)
add_subdirectory(training)
add_subdirectory(qlora_training)
add_subdirectory(diffusion)
if (NOT GGML_BACKEND_DL)
add_subdirectory(convert-llama2c-to-ggml)

View File

@ -0,0 +1,5 @@
set(TARGET llama-finetune-qlora)
add_executable(${TARGET} finetune_qlora.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@ -0,0 +1,331 @@
# llama.cpp — Native QLoRA Training
Native QLoRA + Reward-Weighted SFT training pipeline for quantized GGUF models.
The base model weights remain **frozen** (quantized tensors are skipped by `llama_set_param` because they are not `GGML_TYPE_F32`). Only freshly-allocated F32 LoRA A/B tensors are trained. The saved adapter GGUF is directly compatible with the existing `llama_adapter_lora_init` loader and `llama-export-lora` merge tool.
**Status:** Working. Phase 1 (QLoRA SFT) and Phase 2 (Reward-Weighted SFT) are implemented and functional. Training speed is currently limited by full backprop through quantized weights — see [Known Limitations](#known-limitations).
---
## Build
```bash
cd /mnt/w/llm-trading-arena/unsloth-api/llama.cpp
# First time (CUDA build):
cmake -B build -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=ON -DLLAMA_CURL=OFF
cmake --build build -j$(nproc)
# Incremental rebuild (after code changes):
cmake --build build --target llama-finetune-qlora -j$(nproc)
# If llama-adapter.cpp or llama-context.cpp changed, rebuild all:
cmake --build build -j$(nproc)
```
---
## Phase 1 — QLoRA SFT (`llama-finetune-qlora`)
Trains LoRA adapters on a quantized GGUF model.
### Recommended command (1.7B model, 16 GB card)
```bash
./build/bin/llama-finetune-qlora \
--model ~/qwen3-1.7b-q4_k_m.gguf \
--train-file data/train.jsonl \
--lora-rank 16 --lora-alpha 16 \
-c 4096 -b 4096 -ub 512 \
--save-every 10 \
--lora-out ~/adapter.gguf \
--epochs 3 --seed 42
```
### Recommended command (15B model, 16 GB card, partial offload)
```bash
./build/bin/llama-finetune-qlora \
--model ~/nemotron-15b-q4_k_m.gguf \
--train-file data/train.jsonl \
--lora-rank 16 --lora-alpha 16 \
-ngl 13 -c 14336 -b 14336 -ub 1024 \
--save-every 8 \
--lora-out ~/nemotron-lora.gguf \
--epochs 3 --seed 42
```
### All flags
| Flag | Default | Description |
|---|---|---|
| `--model` | *(required)* | Path to quantized GGUF model |
| `--train-file` | *(required)* | JSONL training dataset |
| `--lora-rank` | `16` | LoRA rank r |
| `--lora-alpha` | `0` (= rank) | LoRA alpha; effective scale = alpha/rank |
| `--lora-targets` | see below | Comma-separated internal tensor name substrings |
| `--lora-out` | `adapter.gguf` | Output adapter GGUF path (supports `~`) |
| `--save-every` | `0` | Save checkpoint every N dataset windows (0 = end only) |
| `--freeze-layers` | `0` | Skip LoRA on first N transformer layers (blk.0..N-1); backward already pruned automatically |
| `--grad-checkpoint` | `0` | Mark every Nth forward node persistent to reduce activation VRAM; good values: 32–64 |
| `--train-on-prompt` | off | Compute loss on prompt tokens too (default: response-only loss) |
| `--shuffle-dataset` | off | Shuffle dataset windows at the start of each epoch |
| `--val-split` | `0.0` | Fraction of data to hold out for validation (e.g. `0.1` = 10%); val loss logged per epoch |
| `-epochs` / `--epochs` | `3` | Training epochs |
| `-c` / `--ctx-size` | `512` | Training context window (tokens) |
| `-b` / `--batch-size` | `2048` | Tokens per `llama_decode` call; set equal to `-c` |
| `-ub` / `--ubatch-size` | `512` | GPU micro-batch tokens; controls VRAM vs. step time |
| `-ngl` | `999` | GPU layers to offload |
| `-lr` / `--learning-rate` | `1e-4` | AdamW learning rate |
| `--seed` | `42` | Random seed for LoRA init |
### VRAM vs. step-time tradeoff
Step time and VRAM both scale linearly with `-ub`:
| Model | `-ub` | VRAM | Step time (approx) |
|---|---|---|---|
| 1.7B Q4_K_M | 512 | ~18 GB | ~120 s (OOM on 16 GB) |
| 1.7B Q4_K_M | 128 | ~6 GB | ~30 s |
| 15B Q4_K_M | 1024 | ~11 GB | ~60 s |
Use `-c` equal to your target sequence length. More context = more windows per sample = more steps per epoch. Reducing `-c` reduces total training time proportionally.
### Default LoRA targets
llama.cpp uses **internal GGUF tensor names**, not HuggingFace names:
| llama.cpp internal | HuggingFace equivalent | Status |
|---|---|---|
| `attn_q` | `q_proj` | ✅ default target, trainable |
| `attn_output` | `o_proj` | ✅ default target, trainable |
| `ffn_gate` | `gate_proj` | ✅ default target, trainable |
| `ffn_up` | `up_proj` | ✅ default target, trainable |
| `ffn_down` | `down_proj` | ✅ default target, trainable |
| `attn_k` | `k_proj` | ❌ not in defaults — zero gradient (KV scatter via SET_ROWS) |
| `attn_v` | `v_proj` | ❌ not in defaults — zero gradient (KV scatter via SET_ROWS) |
| `ssm_in` | `in_proj` | ❌ not in defaults — zero gradient (SSM_SCAN no backward) |
| `ssm_out` | `out_proj` | ❌ not in defaults — zero gradient (SSM_SCAN no backward) |
**MoE models:** Expert tensors (`*_exps`) are excluded regardless of `--lora-targets`. The quantized expert weights are frozen (stop-gradient), but LoRA on the dense FFN layers (`ffn_gate`, `ffn_up`, `ffn_down`) works — backward via `MUL_MAT_ID` + `OUT_PROD_ID`.
### Dataset format (JSONL)
**Chat format** (loss on response only; use `--train-on-prompt` for all tokens):
```json
{"messages": [{"role": "user", "content": "Hello"}, {"role": "assistant", "content": "Hi!"}]}
```
**Prompt/response** (loss on response only):
```json
{"prompt": "What is the capital of France?", "response": "Paris."}
```
**Plain text** (loss on all tokens):
```json
{"text": "The quick brown fox."}
```
**With reward** (Phase 2 — scales gradient by reward):
```json
{"prompt": "...", "response": "...", "reward": 0.85}
```
Rewards are normalized per epoch: clipped to `[-1, 1]`, then min-max scaled to `[0, 1]`. Reward 0 = sample ignored; reward 1 = full gradient.
### Verify and use the adapter
```bash
# Hot-load for inference (no merge needed)
./build/bin/llama-cli --model base.gguf --lora adapter.gguf -p "Hello"
# Merge into base model
./build/bin/llama-export-lora \
--model base.gguf --lora adapter.gguf --output merged.gguf
```
---
## Phase 2 — Reward-Weighted SFT
Built into `llama-finetune-qlora`. When the dataset contains a `reward` or `score` field, the cross-entropy loss for that sample is scaled by the reward before backprop. No extra flags needed — detection is automatic.
---
## Phase 3 — GRPO (Online RL via IPC)
`llama-finetune-qlora --grpo-mode` implements a full GRPO training loop where the Python process owns prompt sampling and reward scoring, and the C++ process owns model state, generation, and gradient updates.
### Quick start
```bash
python3 examples/qlora_training/grpo_example.py \
--model ~/qwen3-1.7b-q4_k_m.gguf \
--lora-out ~/grpo-adapter.gguf \
--rank 16 --n-steps 200 --n-gen 8
```
For verbose output (includes IPC message trace):
```bash
python3 examples/qlora_training/grpo_example.py \
--model ~/qwen3-1.7b-q4_k_m.gguf \
--lora-out ~/grpo-adapter.gguf \
--verbose
```
Resume from a checkpoint:
```bash
python3 examples/qlora_training/grpo_example.py \
--model ~/qwen3-1.7b-q4_k_m.gguf \
--lora ~/grpo-adapter.ckpt50.gguf \
--lora-out ~/grpo-adapter.gguf
```
### GRPO-specific flags
| Flag | Default | Description |
|---|---|---|
| `--grpo-mode` | off | Enable GRPO IPC mode |
| `--n-gen` | `8` | Rollouts per prompt |
| `--n-steps` | `500` | Total GRPO steps |
| `--grpo-temp` | `0.8` | Sampling temperature for rollouts |
| `--grpo-max-tokens` | `512` | Max tokens per generation |
All standard flags (`--lora-rank`, `-lr`, `-c`, `-ngl`, `--save-every`, etc.) work in GRPO mode too. `--train-file` is **not** required in GRPO mode.
### IPC protocol
The protocol is line-based over stdout (C++ → Python) and stdin (Python → C++). All non-protocol C++ output (timing, debug, model logs) goes to **stderr** and never contaminates the protocol channel.
**C++ → Python (stdout):**
| Line | When |
|---|---|
| `[QLORA:READY]` | Process initialised, model loaded |
| `[QLORA:PROMPT_REQ:<step>]` | C++ requests the prompt for step N |
| `[QLORA:GEN:<k>/<n>] <text>` | One generation (newlines escaped as `\n`) |
| `[QLORA:REWARD_REQ:<n>]` | C++ requests N reward scores |
| `[QLORA:PROGRESS] step=X/Y loss=Z epoch=A/B` | After each weight update |
| `[QLORA:CHECKPOINT] <path>` | After saving a checkpoint |
| `[QLORA:DONE] final_loss=X` | Training complete |
| `[QLORA:ERROR] <message>` | Fatal error |
**Python → C++ (stdin):**
| Line | Meaning |
|---|---|
| `PROMPT <escaped_text>` | Send prompt for the most recent `PROMPT_REQ` |
| `REWARD <r1> <r2> … <rN>` | Send N advantage scores in `[0, 1]` range |
| `STOP` | Request graceful shutdown after current step |
**Text encoding:** newlines in generation text are escaped as the two-character sequence `\n`; backslashes are doubled. Use `unescape()` from `grpo_example.py` (or any equivalent) to recover the original text.
### Writing your own driver
`grpo_example.py` contains two functions you replace with your own logic:
```python
def get_prompt(step: int) -> str:
"""Return the training prompt for step N."""
...
def score_generations(prompt: str, generations: List[str]) -> List[float]:
"""Score each generation. Any numeric range — will be normalised."""
...
```
The IPC helpers (`escape`, `unescape`, `parse_ipc`, `read_ipc`, `write_cmd`, `wait_for`, `normalise_rewards`) are standalone and have no external dependencies — copy them into your own project if needed.
### Training loop diagram
```
Python C++ (llama-finetune-qlora --grpo-mode)
│ │
│◄──── [QLORA:READY] ────────────┤ model loaded
│ │
│ ┌─────────────────────────────┤
│ │ for each step: │
│ │ ◄── PROMPT_REQ:N ─────────┤
│ │ ──► PROMPT <text> ────────► generate n_gen rollouts
│ │ ◄── GEN:1/n <text> ──┤
│ │ ◄── GEN:2/n <text> ──┤
│ │ ... │
│ │ ◄── GEN:n/n <text> ──┤
│ │ ◄── REWARD_REQ:n ─────────┤
│ │ (score generations) │
│ │ ──► REWARD a1 a2 … an ────► one backward + AdamW step
│ │ ◄── PROGRESS step=N/M … ──┤
│ └─────────────────────────────┤
│ │
│◄──── [QLORA:DONE] ─────────────┤ adapter saved
```
---
## Known Limitations & Optimization Roadmap
### Current limitations
**1. Full backprop through frozen quantized layers**
Every backward step dequantizes all frozen Q4_K_M weight tensors to compute activation gradients (needed to propagate loss from the output back to each LoRA layer). For a 28-layer 1.7B model at `-ub 512`, this is ~280 dequantizing matmuls per step → step time is 35× slower than inference.
**2. Activation VRAM** *(partially addressed by `--grad-checkpoint`)*
All forward activations are kept in VRAM throughout the backward pass. VRAM ≈ `model + KV + n_layers × hidden × n_ubatch × 10 × 4B + 2 × lora_params × 4B`. Reducing `-ub` reduces VRAM linearly. Use `--grad-checkpoint 48` to prevent the allocator from reusing intermediate activation buffers during backward, which cuts peak activation VRAM at near-zero compute cost.
**3. Full backprop through all layers** *(partially addressed by `--freeze-layers`)*
Gradients propagate through all layers that have LoRA adapters. Use `--freeze-layers N` to skip LoRA allocation for blk.0..N-1 — those layers receive no gradient (the `grads_needed` pruner already skips their backward ops automatically). Only the top (total_layers - N) layers are trained.
### Optimization roadmap
| Priority | Optimization | Expected gain | Status |
|---|---|---|---|
| ✅ Done | **`--freeze-layers N`** — no LoRA on first N layers; backward auto-pruned | Proportional to N/total | Implemented |
| ✅ Done | **`--grad-checkpoint N`** — keep every Nth activation alive through backward | Reduces peak activation VRAM | Implemented |
| ✅ Done | **`--train-on-prompt`** — compute loss on prompt tokens too | Configurable loss target | Implemented |
| ✅ Done | **`--shuffle-dataset`** — shuffle windows each epoch | Better convergence | Implemented |
| ✅ Done | **BOS separators** — insert BOS between concatenated samples | Correct cross-sample boundaries | Implemented |
| ✅ Done | **Per-epoch loss summary** — log train/val loss after each epoch | Observability | Implemented |
| ✅ Done | **`MUL_MAT_ID` backward** — LoRA on MoE dense FFN layers; `OUT_PROD_ID` for scattered outer product | Unlocks Mixtral/Nemotron-MoE | Implemented |
| ✅ Done | **Quantized `OUT_PROD`** — dequantize on GPU + cuBLAS for backward matmul | Full GPU training (no CPU fallback) | Implemented |
| ✅ Done | **Reuse `ctx_compute_opt`** — allocate tensor metadata context once, `ggml_reset()` across ubatches | Eliminate ~0.5 s/step overhead | Implemented |
| ❌ Skip | **Static training graphs** — KV mask shape changes per ubatch (`n_kv` grows); graph topology not static | Would need KV cache redesign | Not feasible |
| Low | **`SSM_SCAN/CONV` backward** — enable LoRA on Mamba SSM layers | Unlocks NemotronH SSM layers | Planned |
| Low | **GELU backward** — implement `ggml_gelu_back` kernel (UNARY + GLU) | Support GPT-2/Phi-style models | Planned (needs new CUDA/CPU kernels) |
---
## Implementation notes (for developers)
### Modified llama.cpp files
| File | Change |
|---|---|
| `ggml/src/ggml.c` | Backward graph fixes: `GET_ROWS` 3D, `SET_ROWS`, `MUL_MAT_ID`, `SSM_SCAN/CONV`, `FLASH_ATTN_EXT` all stop gradient; inplace-op assert → warn+skip |
| `src/llama-context.cpp` | `opt_init`: scheduler and graph sized with inflated capacity before `ggml_opt_init`; `opt_epoch_iter`: per-ubatch timing instrumentation; reward scaling via `g_reward_weights` TLS |
| `src/llama-adapter.cpp` | Repack-buft fallback for LoRA tensors: tries device-native buft before CPU |
| `common/common.h` | Added `save_every`, `lora_freeze_layers`, `grad_checkpoint_interval`, `train_on_prompt`, `shuffle_dataset` fields |
| `common/arg.cpp` | Added `--save-every`, `--freeze-layers`, `--grad-checkpoint`, `--train-on-prompt`, `--shuffle-dataset` arguments |
| `include/llama.h` | Added `llama_opt_set_reward_weights()`; `grad_checkpoint_interval` in `llama_opt_params`; `shuffle` param in `llama_opt_epoch` |
| `ggml/src/ggml-cuda/out-prod.cu` | `OUT_PROD` with quantized src0 (dequantize on GPU + cuBLAS); `OUT_PROD_ID` for MoE backward |
| `ggml/src/ggml-cuda/ggml-cuda.cu` | `supports_op` for quantized `OUT_PROD` and `OUT_PROD_ID`; CPU-resident ids fix in `mul_mat_id` |
| `ggml/include/ggml-opt.h` | Added `grad_checkpoint_interval` to `ggml_opt_params` |
| `ggml/src/ggml-opt.cpp` | Gradient checkpointing: marks every Nth forward node `GGML_TENSOR_FLAG_OUTPUT` before backward build |
### Key invariants
- `params.use_mmap = false` — forced; mmap'd tensors can't have data written back
- `params.flash_attn_type = DISABLED` — no backward impl for flash attention
- `params.warmup = false` — warmup runs inference with PARAM tensors → segfault
- `params.cache_type_k = F32` — training requires F32 KV (or BF16 with `--cache-type-k bf16`)
- LoRA A/B tensors are marked `PARAM` via `ggml_set_param` on the tensors loaded by `llama_adapter_lora_init`, not on the pre-init scratch tensors in `lt.buf`
- The adapter GGUF is pre-saved and loaded via `params.lora_adapters` BEFORE `common_init_from_params` so that `sched_reserve` includes LoRA graph nodes in its sizing
### Why opt_init inflation matters
`ggml_opt_init` captures `sched.get()` at construction time. The backward graph (`gb_grad`, `gb_opt`) is ~35× larger than the forward graph in node count. If the scheduler hash_set is sized only for the forward graph, `ggml_backend_sched_alloc_graph` on the backward graph will overflow it. We recreate `sched` with `inflated = fwd_nodes × 4` slots BEFORE calling `ggml_opt_init`.
### Reward weighting implementation
`llama_opt_set_reward_weights(weights, n)` sets thread-local `g_reward_weights`. In `opt_epoch`, each window reads `g_reward_weights[idata]` and passes it as `reward_scale` to `opt_epoch_iter`. Inside the iter loop, instead of writing `1.0f` for the correct token's label position in the cross-entropy label tensor, it writes `reward_scale`. Since cross-entropy loss = `-mean(label × log(softmax(logit)))`, scaling the label scales both loss and gradient identically.

View File

@ -0,0 +1,64 @@
#!/usr/bin/env python3
"""Quick check of LoRA tensor norms in a GGUF file."""
import sys, struct, numpy as np
def read_gguf(path):
    """Print norm/max/mean stats for the first few F32 tensors in a GGUF file.

    Minimal reader: parses the GGUF header, skips all metadata KV pairs,
    reads the tensor-info table, then loads each F32 tensor's raw data from
    the aligned data section. Non-F32 tensors are listed but not loaded.
    """
    with open(path, 'rb') as f:
        assert f.read(4) == b'GGUF'
        version = struct.unpack('<I', f.read(4))[0]  # GGUF format version (unused, kept for clarity)
        n_tensors, n_kv = struct.unpack('<QQ', f.read(16))

        # Fixed-size scalar value types per the GGUF spec:
        # 0:u8 1:i8 2:u16 3:i16 4:u32 5:i32 6:f32 7:bool 10:u64 11:i64 12:f64
        scalar_sizes = {0: 1, 1: 1, 2: 2, 3: 2, 4: 4, 5: 4, 6: 4, 7: 1, 10: 8, 11: 8, 12: 8}

        def skip_value(vtype):
            """Skip one KV value of the given type; returns False on unknown type."""
            if vtype == 8:  # string: u64 length + bytes
                slen = struct.unpack('<Q', f.read(8))[0]
                f.read(slen)
            elif vtype == 9:  # array: u32 element type + u64 count + elements
                etype, count = struct.unpack('<IQ', f.read(12))
                for _ in range(count):
                    if not skip_value(etype):
                        return False
            elif vtype in scalar_sizes:
                f.read(scalar_sizes[vtype])
            else:
                return False
            return True

        # Skip all KV pairs (we only want tensor stats).
        for _ in range(n_kv):
            klen = struct.unpack('<Q', f.read(8))[0]
            key = f.read(klen).decode()
            vtype = struct.unpack('<I', f.read(4))[0]
            if not skip_value(vtype):
                print(f"unknown kv type {vtype} for key {key}, stopping")
                break

        # Read tensor infos: name, dims, dtype, offset into the data section.
        tensors = []
        for _ in range(n_tensors):
            nlen = struct.unpack('<Q', f.read(8))[0]
            name = f.read(nlen).decode()
            ndims = struct.unpack('<I', f.read(4))[0]
            dims = struct.unpack('<' + 'Q' * ndims, f.read(8 * ndims))
            dtype = struct.unpack('<I', f.read(4))[0]
            offset = struct.unpack('<Q', f.read(8))[0]
            tensors.append((name, dims, dtype, offset))

        # Data section starts at the next 32-byte-aligned file offset.
        pos = f.tell()
        align = 32
        data_start = (pos + align - 1) & ~(align - 1)

        print(f"\nFile: {path}")
        print(f"Tensors: {n_tensors}")
        for name, dims, dtype, offset in tensors[:10]:  # first 10 only
            if dtype != 0:  # only F32 (type 0) data is loaded for stats
                print(f"  {name}: dims={dims} type={dtype} (non-F32, skipping norm)")
                continue
            f.seek(data_start + offset)
            n = 1
            for d in dims:
                n *= d
            data = np.frombuffer(f.read(n * 4), dtype=np.float32)
            print(f"  {name}: dims={dims} norm={np.linalg.norm(data):.4f} max={np.abs(data).max():.4f} mean={np.abs(data).mean():.6f}")
if __name__ == '__main__':
    # Inspect every GGUF path given on the command line; keep going on errors.
    for gguf_path in sys.argv[1:]:
        try:
            read_gguf(gguf_path)
        except Exception as exc:
            print(f"Error reading {gguf_path}: {exc}")

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,399 @@
#!/usr/bin/env python3
"""
grpo_example.py — Minimal GRPO training loop using llama-finetune-qlora --grpo-mode
Demonstrates the IPC protocol between the Python driver and the C++ subprocess.
No external dependencies required — only the Python stdlib.
Usage:
python3 grpo_example.py \
--model /path/to/model-q4_k_m.gguf \
--lora-out /path/to/output-adapter.gguf \
[--lora /path/to/resume-adapter.gguf] \
[--binary /path/to/llama-finetune-qlora] \
[--n-steps 200] \
[--n-gen 8] \
[--rank 16]
IPC Protocol (stdout from C++ process):
[QLORA:READY] process initialised
[QLORA:PROMPT_REQ:<step>] C++ requests a prompt for step N
[QLORA:GEN:<k>/<n>] <text> one generation (newlines escaped as \\n)
[QLORA:REWARD_REQ:<n>] C++ requests N reward scores
[QLORA:PROGRESS] step=X/Y loss=Z epoch=A/B
[QLORA:CHECKPOINT] <path>
[QLORA:DONE] final_loss=X
[QLORA:ERROR] <message>
Python → C++ (stdin):
PROMPT <escaped_text>
REWARD <r1> <r2> ... <rN> (advantages, 0..1 range)
STOP (request graceful shutdown)
"""
import argparse
import logging
import math
import re
import subprocess
import sys
import time
from pathlib import Path
from typing import List, Optional, Tuple
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s [%(levelname)s] %(message)s",
)
log = logging.getLogger("grpo_example")
# ──────────────────────────────────────────────────────────────────────────────
# IPC helpers
# ──────────────────────────────────────────────────────────────────────────────
_IPC_RE = re.compile(r"^\[QLORA:([A-Z_]+)(?::([^\]]*))?\](.*)$")
def escape(text: str) -> str:
    """Escape backslashes first, then CR/LF, so text fits on one IPC line."""
    for raw, rep in (("\\", "\\\\"), ("\n", "\\n"), ("\r", "\\r")):
        text = text.replace(raw, rep)
    return text
def unescape(text: str) -> str:
    """Reverse of escape(): decode \\n, \\r and \\\\; a lone backslash before
    any other character just drops the backslash; a trailing backslash is kept."""
    decoded = {"n": "\n", "r": "\r", "\\": "\\"}
    pieces = []
    idx, length = 0, len(text)
    while idx < length:
        ch = text[idx]
        if ch == "\\" and idx + 1 < length:
            nxt = text[idx + 1]
            pieces.append(decoded.get(nxt, nxt))
            idx += 2
        else:
            pieces.append(ch)
            idx += 1
    return "".join(pieces)
def parse_ipc(line: str) -> Optional[Tuple[str, str, str]]:
    """Split an IPC line into (msg_type, seq, payload).

    Lines that are not IPC messages (model output, stray logs) yield None.
    """
    match = re.match(r"^\[QLORA:([A-Z_]+)(?::([^\]]*))?\](.*)$", line.strip())
    if match is None:
        return None
    msg_type, seq, payload = match.group(1), match.group(2), match.group(3)
    return msg_type, seq or "", payload.strip()
def read_ipc(proc: subprocess.Popen, timeout: float = 120.0) -> Optional[Tuple[str, str, str]]:
    """
    Pull lines off proc.stdout until one parses as an IPC message.

    Non-IPC lines (model output, C++ logs leaked to stdout) are echoed to
    stderr so the user can see them. Returns None on EOF; raises
    TimeoutError if no IPC message arrives within `timeout` seconds.
    """
    assert proc.stdout is not None
    deadline = time.monotonic() + timeout
    while deadline - time.monotonic() > 0:
        raw = proc.stdout.readline()
        if not raw:
            return None  # EOF — subprocess closed its stdout
        stripped = raw.rstrip("\n")
        msg = parse_ipc(stripped)
        if msg is not None:
            return msg
        # Leaked timing/debug output from the C++ side — surface it.
        print(f"  [cpp] {stripped}", file=sys.stderr)
    raise TimeoutError(f"No IPC message within {timeout:.0f}s")
def write_cmd(proc: subprocess.Popen, cmd: str):
    """Send a single command line to the subprocess over stdin and flush."""
    assert proc.stdin is not None
    try:
        proc.stdin.write(f"{cmd}\n")
        proc.stdin.flush()
    except BrokenPipeError:
        raise RuntimeError("C++ subprocess stdin closed — did it crash?")
def wait_for(proc: subprocess.Popen, expected: str, timeout: float = 120.0) -> Tuple[str, str, str]:
    """Block until an IPC message of type `expected` arrives.

    Other IPC messages are logged at debug level and skipped. Raises
    TimeoutError on deadline, RuntimeError if the subprocess exits first.
    """
    deadline = time.monotonic() + timeout
    while True:
        remaining = deadline - time.monotonic()
        if remaining <= 0:
            raise TimeoutError(f"Timed out waiting for [{expected}]")
        msg = read_ipc(proc, timeout=remaining)
        if msg is None:
            raise RuntimeError(f"Subprocess exited before sending [{expected}]")
        if msg[0] == expected:
            return msg
        log.debug("Ignoring unexpected IPC (%s) while waiting for %s", msg[0], expected)
# ──────────────────────────────────────────────────────────────────────────────
# Advantage normalisation (GRPO)
# ──────────────────────────────────────────────────────────────────────────────
def normalise_rewards(rewards: List[float]) -> List[float]:
    """
    Group-relative advantage normalisation (GRPO style).

    Each reward is converted to a z-score against the group mean/std, then
    mapped into [0, 1] via 0.5 + z/6 with clipping, so the C++ side always
    receives bounded values. A degenerate all-equal group maps to a uniform
    0.5 — no learning signal, but no NaNs either.
    """
    count = len(rewards)
    if count == 0:
        return []
    mean = sum(rewards) / count
    variance = sum((r - mean) ** 2 for r in rewards) / count
    # Guard against a zero (or numerically tiny) std to avoid division blowup.
    spread = math.sqrt(variance) if variance > 1e-8 else 1.0
    # z-scores typically lie in [-3, +3]; squash into [0, 1] and clamp tails.
    return [min(1.0, max(0.0, 0.5 + ((r - mean) / spread) / 6.0)) for r in rewards]
# ──────────────────────────────────────────────────────────────────────────────
# Example prompt / reward providers
# ──────────────────────────────────────────────────────────────────────────────
# Replace these with your own logic.
# Demo prompt pool — swap in your own task distribution here.
_EXAMPLE_PROMPTS = [
    "Explain the concept of gradient descent in one sentence.",
    "What is the capital of France?",
    "Write a haiku about machine learning.",
    "Describe the difference between SFT and RLHF.",
    "What does GRPO stand for?",
]


def get_prompt(step: int) -> str:
    """Return the prompt for training step `step` (0-indexed), cycling the pool."""
    idx = step % len(_EXAMPLE_PROMPTS)
    return _EXAMPLE_PROMPTS[idx]
def score_generations(prompt: str, generations: List[str]) -> List[float]:
    """
    Assign a raw reward to each generation for `prompt`.

    Placeholder heuristic: reward blends response length (capped at 50 words)
    with lexical diversity (unique/total word ratio). Swap this out for a
    real reward model or verifier — any numeric range is fine because the
    scores are normalised downstream.
    """
    def _score_one(text: str) -> float:
        tokens = text.split()
        length_part = min(1.0, len(tokens) / 50.0)
        diversity_part = min(1.0, len(set(tokens)) / max(1, len(tokens)))
        return 0.6 * length_part + 0.4 * diversity_part

    return [_score_one(g) for g in generations]
# ──────────────────────────────────────────────────────────────────────────────
# Main GRPO loop
# ──────────────────────────────────────────────────────────────────────────────
def run_grpo(args: argparse.Namespace):
    """
    Launch the llama-finetune-qlora binary in --grpo-mode and drive the GRPO
    training loop over its stdin/stdout IPC channel.

    Exits the process with status 1 if the binary path does not exist.
    Guarantees the subprocess is reaped (or force-killed) before returning,
    even when the loop raises or the user hits Ctrl-C.
    """
    # Resolve binary
    binary = Path(args.binary)
    if not binary.exists():
        log.error("Binary not found: %s", binary)
        sys.exit(1)
    # Build command
    cmd = [
        str(binary),
        "--model", args.model,
        "--lora-out", args.lora_out,
        "--lora-rank", str(args.rank),
        "--lora-alpha", str(args.rank // 2),
        "-c", str(args.ctx_size),
        "-b", str(args.ctx_size),
        "-ub", "512",
        "-ngl", str(args.ngl),
        "-lr", str(args.lr),
        "--seed", str(args.seed),
        "--grad-checkpoint", "48",
        "--shuffle-dataset",
        "--grpo-mode",
        "--n-gen", str(args.n_gen),
        "--n-steps", str(args.n_steps),
        "--grpo-temp", str(args.temperature),
        "--grpo-max-tokens", str(args.max_tokens),
    ]
    if args.lora:
        cmd += ["--lora", args.lora]
    if args.save_every > 0:
        cmd += ["--save-every", str(args.save_every)]
    log.info("Launching: %s", " ".join(cmd))
    proc = subprocess.Popen(
        cmd,
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
        stderr=sys.stderr,  # C++ debug/timing logs go directly to our stderr
        text=True,
        bufsize=1,          # line-buffered so IPC lines arrive promptly
    )
    try:
        _grpo_loop(proc, args)
    except KeyboardInterrupt:
        log.info("Interrupted — requesting graceful stop")
        try:
            write_cmd(proc, "STOP")
        except Exception:
            pass
    except Exception as e:
        log.error("GRPO loop error: %s", e)
        proc.kill()
        raise
    finally:
        try:
            if proc.stdin is not None:
                proc.stdin.close()
        except Exception:
            pass
        # BUG FIX: wait(timeout=...) raises TimeoutExpired rather than returning;
        # previously that exception escaped the finally block, masking the real
        # error and leaking the child process. Kill and reap on timeout instead.
        try:
            rc = proc.wait(timeout=30)
        except subprocess.TimeoutExpired:
            log.warning("Subprocess did not exit within 30s — killing it")
            proc.kill()
            rc = proc.wait()
        # wait() never returns None once the process has been reaped.
        if rc != 0:
            log.warning("Subprocess exited with code %d", rc)
def _grpo_loop(proc: subprocess.Popen, args: argparse.Namespace):
    """
    Core GRPO message-dispatch loop over the subprocess IPC channel.

    Protocol (one message per stdout line from the C++ side):
      READY       — model loaded; awaited once at startup.
      PROMPT_REQ  — C++ asks for the next prompt; answered with PROMPT.
      GEN         — one sampled generation (seq is "k/n"); collected locally.
      REWARD_REQ  — C++ asks for rewards; answered with REWARD advantages.
      PROGRESS    — training progress line (step/loss), echoed to console.
      CHECKPOINT  — adapter checkpoint written; path in payload.
      DONE        — training finished; terminates the loop.
      ERROR       — fatal C++ error; re-raised as RuntimeError.
    """
    # ── Wait for READY ──────────────────────────────────────────────────────
    log.info("Waiting for subprocess to initialise (model load can take a minute)…")
    # Model load can be slow, so the READY handshake gets a generous deadline.
    wait_for(proc, "READY", timeout=300)
    log.info("Subprocess ready.")
    current_prompt: str = ""
    generations: List[str] = []
    step = 0
    while True:
        parsed = read_ipc(proc, timeout=600)
        if parsed is None:
            log.info("Subprocess exited (EOF).")
            break
        msg_type, seq, payload = parsed
        # ── PROMPT_REQ ──────────────────────────────────────────────────────
        if msg_type == "PROMPT_REQ":
            # Trust the C++ step counter when present; otherwise count locally.
            step = int(seq) if seq else step + 1
            current_prompt = get_prompt(step - 1)
            generations = []  # reset the collection window for this prompt
            log.debug("Step %d — sending prompt: %s", step, current_prompt[:60])
            write_cmd(proc, f"PROMPT {escape(current_prompt)}")
        # ── GEN ─────────────────────────────────────────────────────────────
        elif msg_type == "GEN":
            # seq = "k/n"
            parts = seq.split("/")
            k = int(parts[0])
            n = int(parts[1]) if len(parts) > 1 else args.n_gen
            text = unescape(payload)
            generations.append(text)
            log.debug(" Generation %d/%d: %s", k, n, text[:60].replace("\n", ""))
        # ── REWARD_REQ ──────────────────────────────────────────────────────
        elif msg_type == "REWARD_REQ":
            n_expected = int(seq) if seq else len(generations)
            # Mismatch is logged but not fatal — the C++ side decides what to
            # do with a short/long reward list.
            if len(generations) != n_expected:
                log.warning(
                    "REWARD_REQ asked for %d rewards but collected %d generations",
                    n_expected, len(generations),
                )
            raw_rewards = score_generations(current_prompt, generations)
            advantages = normalise_rewards(raw_rewards)
            reward_str = " ".join(f"{a:.6f}" for a in advantages)
            log.debug(" Rewards (raw): %s", [f"{r:.3f}" for r in raw_rewards])
            log.debug(" Advantages: %s", [f"{a:.3f}" for a in advantages])
            write_cmd(proc, f"REWARD {reward_str}")
        # ── PROGRESS ────────────────────────────────────────────────────────
        elif msg_type == "PROGRESS":
            # Format: step=X/Y loss=Z epoch=A/B
            sm = re.search(r"step=(\d+)(?:/(\d+))?", payload)
            lm = re.search(r"loss=([\d.]+)", payload)
            step_str = f"{sm.group(1)}/{sm.group(2)}" if sm and sm.group(2) else (sm.group(1) if sm else "?")
            loss_str = lm.group(1) if lm else "?"
            print(f" step {step_str} loss {loss_str}", flush=True)
        # ── CHECKPOINT ──────────────────────────────────────────────────────
        elif msg_type == "CHECKPOINT":
            log.info("Checkpoint saved: %s", payload.strip())
        # ── DONE ────────────────────────────────────────────────────────────
        elif msg_type == "DONE":
            m = re.search(r"final_loss=([\d.]+)", payload)
            loss = m.group(1) if m else "?"
            log.info("Training complete. final_loss=%s", loss)
            break
        # ── ERROR ────────────────────────────────────────────────────────────
        elif msg_type == "ERROR":
            log.error("C++ process error: %s", payload.strip())
            raise RuntimeError(f"Training failed: {payload.strip()}")
        else:
            # Unknown types are tolerated so the protocol can grow without
            # breaking older driver scripts.
            log.debug("Unknown IPC message: [%s] seq=%r payload=%r", msg_type, seq, payload)
# ──────────────────────────────────────────────────────────────────────────────
# CLI
# ──────────────────────────────────────────────────────────────────────────────
def parse_args() -> argparse.Namespace:
    """Build and parse the CLI; the default binary is resolved relative to the repo root."""
    # Default binary: build/bin/ relative to this script's repo root
    here = Path(__file__).resolve().parent
    repo_root = here.parents[1]  # examples/qlora_training → llama.cpp root
    default_bin = repo_root / "build" / "bin" / "llama-finetune-qlora"
    parser = argparse.ArgumentParser(
        description="Minimal GRPO training loop via llama-finetune-qlora --grpo-mode",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    # Required paths
    parser.add_argument("--model", required=True, help="Base GGUF model path")
    parser.add_argument("--lora-out", required=True, help="Output adapter GGUF path")
    # Optional adapter / binary locations
    parser.add_argument("--lora", default=None, help="Resume from existing adapter GGUF")
    parser.add_argument("--binary", default=str(default_bin), help="Path to llama-finetune-qlora binary")
    # Training hyperparameters
    parser.add_argument("--rank", type=int, default=16, help="LoRA rank")
    parser.add_argument("--n-steps", type=int, default=200, help="Number of GRPO steps")
    parser.add_argument("--n-gen", type=int, default=8, help="Generations per prompt")
    parser.add_argument("--lr", type=float, default=1e-4, help="Learning rate")
    parser.add_argument("--ctx-size", type=int, default=4096, help="Context window")
    parser.add_argument("--ngl", type=int, default=999, help="GPU layers (-ngl)")
    parser.add_argument("--temperature", type=float, default=0.8, help="Sampling temperature")
    parser.add_argument("--max-tokens", type=int, default=512, help="Max tokens per generation")
    parser.add_argument("--save-every", type=int, default=0, help="Save checkpoint every N steps (0=off)")
    parser.add_argument("--seed", type=int, default=42, help="RNG seed")
    parser.add_argument("--verbose", action="store_true", help="Enable DEBUG logging")
    return parser.parse_args()
if __name__ == "__main__":
    args = parse_args()
    # Raise the root logger to DEBUG when requested so log.debug lines show up.
    if args.verbose:
        logging.getLogger().setLevel(logging.DEBUG)
    run_grpo(args)

View File

@ -0,0 +1,7 @@
{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "What is the capital of France?"}, {"role": "assistant", "content": "The capital of France is Paris."}]}
{"messages": [{"role": "user", "content": "Explain gradient descent in one sentence."}, {"role": "assistant", "content": "Gradient descent iteratively adjusts model parameters in the direction that most reduces the loss function."}]}
{"messages": [{"role": "system", "content": "You are a concise coding assistant."}, {"role": "user", "content": "Write a Python function that returns the factorial of n."}, {"role": "assistant", "content": "```python\ndef factorial(n):\n return 1 if n <= 1 else n * factorial(n - 1)\n```"}]}
{"prompt": "What is supervised fine-tuning?", "response": "Supervised fine-tuning (SFT) trains a pre-trained model on labelled examples of the desired behaviour, adjusting weights to minimise cross-entropy loss on the target responses."}
{"prompt": "What is LoRA?", "response": "LoRA (Low-Rank Adaptation) fine-tunes large models by injecting trainable low-rank matrices into frozen weight layers, dramatically reducing the number of trainable parameters."}
{"text": "The transformer architecture, introduced in 'Attention Is All You Need' (Vaswani et al., 2017), relies entirely on self-attention mechanisms and has become the foundation of modern large language models."}
{"text": "QLoRA combines quantization with LoRA: the base model is loaded in 4-bit precision (frozen), while only the small LoRA adapter weights are trained in full precision."}

View File

@ -0,0 +1,6 @@
{"messages": [{"role": "user", "content": "Explain what a neural network is."}, {"role": "assistant", "content": "A neural network is a computational model loosely inspired by biological neurons, consisting of layers of interconnected nodes that learn to map inputs to outputs through training."}], "reward": 0.9}
{"messages": [{"role": "user", "content": "Explain what a neural network is."}, {"role": "assistant", "content": "It's a thing that does AI stuff with nodes and layers."}], "reward": 0.2}
{"messages": [{"role": "user", "content": "What is backpropagation?"}, {"role": "assistant", "content": "Backpropagation computes gradients of the loss with respect to each parameter by applying the chain rule backwards through the computation graph, enabling gradient descent to update all weights efficiently."}], "reward": 1.0}
{"messages": [{"role": "user", "content": "What is backpropagation?"}, {"role": "assistant", "content": "Backpropagation is how neural networks learn by going backwards."}], "reward": 0.3}
{"prompt": "Describe overfitting in machine learning.", "response": "Overfitting occurs when a model learns the training data too closely—capturing noise and idiosyncrasies—resulting in poor generalisation to unseen examples. Regularisation, dropout, and early stopping are common mitigations.", "reward": 0.95}
{"prompt": "Describe overfitting in machine learning.", "response": "Overfitting is when the model is too good at training data but bad at new data.", "reward": 0.4}

View File

@ -73,6 +73,7 @@ int main(int argc, char ** argv) {
/*get_opt_pars =*/common_opt_lr_pars,
/*get_opt_pars_ud =*/&params.lr,
/*optimizer_type =*/params.optimizer,
/*grad_checkpoint_interval =*/params.grad_checkpoint_interval,
};
llama_opt_init(ctx, model, lopt_params);
@ -83,7 +84,7 @@ int main(int argc, char ** argv) {
for (lr.epoch = 0; lr.epoch < lr.epochs; ++lr.epoch) {
llama_opt_epoch(ctx, dataset, result_train, result_eval, idata_split,
ggml_opt_epoch_callback_progress_bar, ggml_opt_epoch_callback_progress_bar);
ggml_opt_epoch_callback_progress_bar, ggml_opt_epoch_callback_progress_bar, /*shuffle=*/false);
fprintf(stderr, "\n");
ggml_opt_result_reset(result_train);

View File

@ -89,6 +89,7 @@ extern "C" {
float beta2; // second AdamW momentum
float eps; // epsilon for numerical stability
float wd; // weight decay - 0.0f to disable
float gclip; // element-wise gradient clipping threshold - 0.0f to disable
} adamw;
struct {
float alpha; // learning rate
@ -125,6 +126,13 @@ extern "C" {
ggml_opt_get_optimizer_params get_opt_pars; // callback for calculating optimizer parameters
void * get_opt_pars_ud; // userdata for calculating optimizer parameters
// Gradient checkpointing: keep the output of every Nth forward node alive through
// the backward pass so the allocator cannot reuse its memory for other tensors.
// This trades compute for VRAM — intermediate activations between checkpoints are
// freed and recomputed during the backward pass by the existing graph structure.
// Set to 0 (default) to disable. A value of ~3264 cuts activation VRAM by ~50%.
int32_t grad_checkpoint_interval;
// only GGML_OPT_OPTIMIZER_TYPE_ADAMW needs m, v momenta per parameter tensor
enum ggml_opt_optimizer_type optimizer;
};

View File

@ -503,6 +503,7 @@ extern "C" {
GGML_OP_MUL_MAT,
GGML_OP_MUL_MAT_ID,
GGML_OP_OUT_PROD,
GGML_OP_OUT_PROD_ID, // scattered outer-product for MUL_MAT_ID backward (MoE LoRA)
GGML_OP_SCALE,
GGML_OP_SET,
@ -1426,6 +1427,21 @@ extern "C" {
struct ggml_tensor * a,
struct ggml_tensor * b);
// Scattered outer-product for the MUL_MAT_ID backward pass (MoE LoRA gradient).
//
// a: [cols, n_expert_used, n_tokens] F32 — activations
// b: [rows, n_expert_used, n_tokens] F32 — upstream gradient
// ids: [n_expert_used, n_tokens] I32 — expert dispatch indices
// result: [cols, rows, n_expert, 1] F32
//
// result[:, :, e] += sum_{(i,t): ids[i,t]==e} a[:, i, t] ⊗ b[:, i, t]
GGML_API struct ggml_tensor * ggml_out_prod_id(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
struct ggml_tensor * ids,
int64_t n_expert);
//
// operations on tensors without backpropagation
//

View File

@ -11046,7 +11046,7 @@ static void ggml_compute_forward_opt_step_adamw_f32(
GGML_ASSERT(ggml_are_same_shape(src0, src0_grad));
GGML_ASSERT(ggml_are_same_shape(src0, src0_grad_m));
GGML_ASSERT(ggml_are_same_shape(src0, src0_grad_v));
GGML_ASSERT(ggml_nelements(adamw_params) == 7);
GGML_ASSERT(ggml_nelements(adamw_params) == 8);
const int ith = params->ith;
const int nth = params->nth;
@ -11072,6 +11072,7 @@ static void ggml_compute_forward_opt_step_adamw_f32(
const float wd = adamw_params_ptr[4];
const float beta1h = adamw_params_ptr[5];
const float beta2h = adamw_params_ptr[6];
const float gclip = adamw_params_ptr[7]; // element-wise gradient clip (0 = disabled)
const float keep = 1.f - alpha * wd;
for (int ir = ir0; ir < ir1; ++ir) {
const int64_t i03 = ir/(ne02*ne01);
@ -11086,8 +11087,10 @@ static void ggml_compute_forward_opt_step_adamw_f32(
float * v = (float *) ((char *) src0_grad_v->data + offset);
for (int i00 = 0; i00 < ne00; ++i00) {
m[i00] = m[i00]*beta1 + g[i00]*(1.0f - beta1);
v[i00] = v[i00]*beta2 + g[i00]*g[i00]*(1.0f - beta2);
const float gi = (gclip > 0.0f) ? fmaxf(-gclip, fminf(gclip, g[i00])) : g[i00];
m[i00] = m[i00]*beta1 + gi*(1.0f - beta1);
v[i00] = v[i00]*beta2 + gi*gi*(1.0f - beta2);
const float mh = m[i00]*beta1h;
const float vh = sqrtf(v[i00]*beta2h) + eps;

View File

@ -2680,6 +2680,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
case GGML_OP_OUT_PROD:
ggml_cuda_out_prod(ctx, dst);
break;
case GGML_OP_OUT_PROD_ID:
ggml_cuda_out_prod_id(ctx, dst);
break;
case GGML_OP_SCALE:
ggml_cuda_op_scale(ctx, dst);
break;
@ -4803,7 +4806,15 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
}
} break;
case GGML_OP_OUT_PROD:
return op->type == GGML_TYPE_F32 && op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32;
return op->type == GGML_TYPE_F32
&& (op->src[0]->type == GGML_TYPE_F32 || ggml_is_quantized(op->src[0]->type))
&& op->src[1]->type == GGML_TYPE_F32;
case GGML_OP_OUT_PROD_ID:
return op->src[0] != nullptr && op->src[1] != nullptr && op->src[2] != nullptr
&& op->type == GGML_TYPE_F32
&& op->src[0]->type == GGML_TYPE_F32
&& op->src[1]->type == GGML_TYPE_F32
&& op->src[2]->type == GGML_TYPE_I32;
case GGML_OP_GET_ROWS:
{
switch (op->src[0]->type) {

View File

@ -20,8 +20,9 @@ static __global__ void opt_step_adamw_f32(
const float wd = pars[4];
const float beta1h = pars[5];
const float beta2h = pars[6];
const float gclip = pars[7]; // element-wise gradient clip (0 = disabled)
const float gi = g[i];
const float gi = (gclip > 0.0f) ? fmaxf(-gclip, fminf(gclip, g[i])) : g[i];
const float gmi = g_m[i]*beta1 + gi*(1.0f - beta1);
const float gvi = g_v[i]*beta2 + gi*gi*(1.0f - beta2);
@ -62,7 +63,7 @@ void ggml_cuda_opt_step_adamw(ggml_backend_cuda_context & ctx, ggml_tensor * dst
GGML_ASSERT(ggml_are_same_shape(src0, src0_grad));
GGML_ASSERT(ggml_are_same_shape(src0, src0_grad_m));
GGML_ASSERT(ggml_are_same_shape(src0, src0_grad_v));
GGML_ASSERT(ggml_nelements(adamw_params) == 7);
GGML_ASSERT(ggml_nelements(adamw_params) == 8);
float * src0_d = (float *) src0->data;
const float * src0_grad_d = (const float *) src0_grad->data;

View File

@ -1,6 +1,9 @@
#include "out-prod.cuh"
#include "convert.cuh"
#include <cstdint>
#include <cstring>
#include <vector>
void ggml_cuda_out_prod(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0];
@ -8,7 +11,7 @@ void ggml_cuda_out_prod(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
GGML_TENSOR_BINARY_OP_LOCALS
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT(src0->type == GGML_TYPE_F32 || ggml_is_quantized(src0->type));
GGML_ASSERT(src1->type == GGML_TYPE_F32);
GGML_ASSERT(dst->type == GGML_TYPE_F32);
@ -22,19 +25,37 @@ void ggml_cuda_out_prod(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
GGML_ASSERT(ne2 == src1->ne[2]);
GGML_ASSERT(ne3 == src1->ne[3]);
const float * src0_d = (const float *) src0->data;
const float * src1_d = (const float *) src1->data;
float * dst_d = (float *) dst->data;
cudaStream_t stream = ctx.stream();
cublasHandle_t handle = ctx.cublas_handle();
// If src0 is quantized, dequantize to a temp F32 buffer on GPU
ggml_cuda_pool_alloc<float> src0_f32_alloc;
const float * src0_d;
int64_t lda;
if (src0->type != GGML_TYPE_F32) {
const int64_t n_elements = ggml_nelements(src0);
src0_f32_alloc.alloc(ctx.pool(), n_elements);
to_fp32_cuda_t to_fp32 = ggml_get_to_fp32_cuda(src0->type);
GGML_ASSERT(to_fp32 != nullptr);
to_fp32(src0->data, src0_f32_alloc.ptr, n_elements, stream);
src0_d = src0_f32_alloc.ptr;
lda = ne00; // dequantized data is contiguous: stride = ne00
} else {
src0_d = (const float *) src0->data;
lda = nb01 / sizeof(float);
}
const float * src1_d = (const float *) src1->data;
float * dst_d = (float *) dst->data;
const float alpha = 1.0f;
const float beta = 0.0f;
CUBLAS_CHECK(cublasSetStream(handle, stream));
const int64_t lda = nb01 / sizeof(float);
const int64_t ldc = nb1 / sizeof(float);
const bool src1_T = ggml_is_transposed(src1);
@ -42,9 +63,9 @@ void ggml_cuda_out_prod(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const int64_t ldb = (src1_T ? nb10 : nb11) / sizeof(float);
GGML_ASSERT( (src1_T ? nb11 : nb10) == sizeof(float));
// data strides in dimensions 2/3
const size_t s02 = nb02 / sizeof(float);
const size_t s03 = nb03 / sizeof(float);
// data strides in dimensions 2/3 (for dequantized src0, use element-based strides)
const size_t s02 = (src0->type != GGML_TYPE_F32) ? (ne00 * ne01) : (nb02 / sizeof(float));
const size_t s03 = (src0->type != GGML_TYPE_F32) ? (ne00 * ne01 * ne02) : (nb03 / sizeof(float));
const size_t s12 = nb12 / sizeof(float);
const size_t s13 = nb13 / sizeof(float);
const size_t s2 = nb2 / sizeof(float);
@ -66,3 +87,115 @@ void ggml_cuda_out_prod(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
}
}
}
// ggml_cuda_out_prod_id
//
// Scattered outer-product for the MUL_MAT_ID backward pass (gradient w.r.t. expert weights).
//
// src0 = a [cols, n_expert_used, n_tokens] F32 — token activations
// src1 = b [rows, n_expert_used, n_tokens] F32 — upstream gradient
// src2 = ids [n_expert_used, n_tokens] I32 — expert dispatch indices
// dst [cols, rows, n_expert, 1] F32 — gradient w.r.t. expert weight matrices
//
// dst[:, :, e] += sum_{(i,t): ids[i,t]==e} a[:, i, t] ⊗ b[:, i, t]
//
// Algorithm:
// For each expert e: gather the token columns where ids[i,t]==e into contiguous
// GPU buffers, then use cublasSgemm (beta=1) to accumulate the outer product.
// ids may be CPU-resident (common in backward graphs where they are leaf tensors).
void ggml_cuda_out_prod_id(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * src0 = dst->src[0]; // a   [cols, n_exp_used, n_tokens] — token activations
    const ggml_tensor * src1 = dst->src[1]; // b   [rows, n_exp_used, n_tokens] — upstream gradient
    const ggml_tensor * ids  = dst->src[2]; // ids [n_exp_used, n_tokens] i32   — expert dispatch indices

    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT(src1->type == GGML_TYPE_F32);
    GGML_ASSERT(ids->type == GGML_TYPE_I32);
    GGML_ASSERT(dst->type == GGML_TYPE_F32);

    const int64_t cols       = src0->ne[0];
    const int64_t n_exp_used = src0->ne[1];
    const int64_t n_tokens   = src0->ne[2];
    const int64_t rows       = src1->ne[0];
    const int64_t n_expert   = dst->ne[2];

    cudaStream_t stream = ctx.stream();
    cublasHandle_t handle = ctx.cublas_handle();
    CUBLAS_CHECK(cublasSetStream(handle, stream));

    // Zero destination tensor before accumulating (the per-expert GEMMs below use beta=1).
    CUDA_CHECK(cudaMemsetAsync(dst->data, 0, ggml_nbytes(dst), stream));

    // Read ids to host — ids may be CPU-resident (backward graph leaf) or GPU-resident
    const size_t ids_nbytes = ggml_nbytes(ids);
    std::vector<char> ids_host(ids_nbytes);
    if (ids->buffer && !ggml_backend_buffer_is_host(ids->buffer)) {
        // GPU-resident: copy to host and synchronize so we can inspect the values.
        // NOTE(review): this stalls the stream once per call — acceptable for a
        // training-time backward pass, but worth knowing when profiling.
        CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids->data, ids_nbytes, cudaMemcpyDeviceToHost, stream));
        CUDA_CHECK(cudaStreamSynchronize(stream));
    } else {
        memcpy(ids_host.data(), ids->data, ids_nbytes);
    }

    // Build per-expert token list: expert_tokens[e] = list of flat indices (iexp*n_tokens+itok)
    // whose dispatch id equals e.
    std::vector<std::vector<int64_t>> expert_tokens(n_expert);
    for (int64_t itok = 0; itok < n_tokens; ++itok) {
        for (int64_t iexp = 0; iexp < n_exp_used; ++iexp) {
            // nb[] strides are in bytes, so index the raw host copy directly.
            const int32_t eid = *(const int32_t *)(ids_host.data()
                + itok * ids->nb[1] + iexp * ids->nb[0]);
            GGML_ASSERT(eid >= 0 && eid < (int32_t)n_expert);
            expert_tokens[eid].push_back(iexp * n_tokens + itok);
        }
    }

    // Strides (in elements, not bytes)
    const int64_t a_stride_exp = src0->nb[1] / sizeof(float); // cols
    const int64_t a_stride_tok = src0->nb[2] / sizeof(float); // cols * n_exp_used
    const int64_t b_stride_exp = src1->nb[1] / sizeof(float); // rows
    const int64_t b_stride_tok = src1->nb[2] / sizeof(float); // rows * n_exp_used
    const int64_t dst_stride_e = dst->nb[2] / sizeof(float);  // cols * rows

    const float alpha_one = 1.0f;
    const float beta_acc = 1.0f; // accumulate — dst is already zeroed above

    const float * a_base = (const float *) src0->data;
    const float * b_base = (const float *) src1->data;
    float * d_base = (float *) dst->data;

    for (int64_t e = 0; e < n_expert; ++e) {
        const auto & toks = expert_tokens[e];
        if (toks.empty()) {
            continue; // no tokens routed to this expert → its gradient slice stays zero
        }
        const int64_t ntoks_e = (int64_t) toks.size();

        // Allocate contiguous gather buffers on GPU: a_e [cols, ntoks_e], b_e [rows, ntoks_e].
        // Pool buffers are recycled per iteration; reuse is safe because all work
        // (gather copies and the GEMM) is enqueued on the same stream, in order.
        ggml_cuda_pool_alloc<float> a_gathered(ctx.pool(), cols * ntoks_e);
        ggml_cuda_pool_alloc<float> b_gathered(ctx.pool(), rows * ntoks_e);

        // Gather token vectors from GPU src0/src1 into contiguous buffers.
        // NOTE(review): one D2D memcpy per (token, expert) pair — for large batches
        // a dedicated gather kernel would cut launch overhead; measure before changing.
        for (int64_t ti = 0; ti < ntoks_e; ++ti) {
            const int64_t flat = toks[ti];
            const int64_t iexp = flat / n_tokens;
            const int64_t itok = flat % n_tokens;
            CUDA_CHECK(cudaMemcpyAsync(
                a_gathered.ptr + ti * cols,
                a_base + iexp * a_stride_exp + itok * a_stride_tok,
                cols * sizeof(float), cudaMemcpyDeviceToDevice, stream));
            CUDA_CHECK(cudaMemcpyAsync(
                b_gathered.ptr + ti * rows,
                b_base + iexp * b_stride_exp + itok * b_stride_tok,
                rows * sizeof(float), cudaMemcpyDeviceToDevice, stream));
        }

        // dst[:, :, e] += a_gathered @ b_gathered^T
        // cuBLAS column-major: A=[cols, ntoks_e] lda=cols, B=[rows, ntoks_e] ldb=rows
        CUBLAS_CHECK(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_T,
                (int)cols, (int)rows, (int)ntoks_e,
                &alpha_one, a_gathered.ptr, (int)cols,
                b_gathered.ptr, (int)rows,
                &beta_acc, d_base + e*dst_stride_e, (int)cols));
    }
}

View File

@ -1,3 +1,10 @@
#include "common.cuh"
void ggml_cuda_out_prod(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
// Scattered outer-product for MUL_MAT_ID backward (gradient w.r.t. expert weight matrices).
// src0: activations [cols, n_expert_used, n_tokens] F32
// src1: grad_output [rows, n_expert_used, n_tokens] F32
// src2: expert ids [n_expert_used, n_tokens] I32 (may be CPU-resident)
// dst: grad_weight [cols, rows, n_expert, 1] F32
void ggml_cuda_out_prod_id(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

View File

@ -58,10 +58,13 @@ struct ggml_opt_context {
std::vector<struct ggml_tensor *> grad_accs;
std::vector<struct ggml_tensor *> grad_m;
std::vector<struct ggml_tensor *> grad_v;
std::vector<ggml_backend_buffer_t> bufs_momenta; // per-param moment buffers (one per param node)
std::vector<struct ggml_context *> ctxs_momenta; // corresponding ggml contexts (keep alive for tensor metadata)
int64_t iter = 1;
int32_t opt_period = 1;
int32_t opt_i = 0;
int32_t grad_checkpoint_interval = 0;
bool loss_per_datapoint = false;
ggml_opt_get_optimizer_params get_opt_pars = nullptr;
@ -230,6 +233,7 @@ struct ggml_opt_optimizer_params ggml_opt_get_default_optimizer_params(void * us
result.adamw.beta2 = 0.999f;
result.adamw.eps = 1e-8f;
result.adamw.wd = 0.0f;
result.adamw.gclip = 0.0f;
result.sgd.alpha = 1e-3f;
result.sgd.wd = 0.0f;
@ -253,9 +257,10 @@ struct ggml_opt_params ggml_opt_default_params(
/*loss_type =*/ loss_type,
/*build_type =*/ GGML_OPT_BUILD_TYPE_OPT,
/*opt_period =*/ 1,
/*get_opt_pars =*/ ggml_opt_get_default_optimizer_params,
/*get_opt_pars_ud =*/ nullptr,
/*optimizer =*/ GGML_OPT_OPTIMIZER_TYPE_ADAMW,
/*get_opt_pars =*/ ggml_opt_get_default_optimizer_params,
/*get_opt_pars_ud =*/ nullptr,
/*grad_checkpoint_interval =*/ 0,
/*optimizer =*/ GGML_OPT_OPTIMIZER_TYPE_ADAMW,
};
}
@ -475,8 +480,23 @@ static void ggml_opt_build(ggml_opt_context_t opt_ctx) {
for (int i = 0; i < n_nodes; ++i) {
ggml_tensor * node = opt_ctx->gf->nodes[i];
if (node->flags & GGML_TENSOR_FLAG_PARAM) {
opt_ctx->grad_m[i] = ggml_new_tensor(opt_ctx->ctx_static, GGML_TYPE_F32, GGML_MAX_DIMS, node->ne);
opt_ctx->grad_v[i] = ggml_new_tensor(opt_ctx->ctx_static, GGML_TYPE_F32, GGML_MAX_DIMS, node->ne);
// Allocate moments on the same buffer type as the param tensor so
// the ADAMW op runs on the correct backend (avoids cross-device mismatch
// when some LoRA tensors are on CPU and others on GPU with partial offload).
ggml_backend_buffer_type_t param_buft = node->buffer
? ggml_backend_buffer_get_type(node->buffer)
: ggml_backend_cpu_buffer_type();
// Allocate a tiny context + buffer for this pair of moment tensors.
const size_t sz = 2 * ggml_tensor_overhead();
struct ggml_init_params mip = { sz, nullptr, true };
struct ggml_context * mctx = ggml_init(mip);
opt_ctx->grad_m[i] = ggml_new_tensor(mctx, GGML_TYPE_F32, GGML_MAX_DIMS, node->ne);
opt_ctx->grad_v[i] = ggml_new_tensor(mctx, GGML_TYPE_F32, GGML_MAX_DIMS, node->ne);
ggml_backend_buffer_t mbuf = ggml_backend_alloc_ctx_tensors_from_buft(mctx, param_buft);
ggml_backend_buffer_clear(mbuf, 0);
opt_ctx->bufs_momenta.push_back(mbuf);
opt_ctx->ctxs_momenta.push_back(mctx); // keep alive for tensor metadata
} else {
opt_ctx->grad_m[i] = nullptr;
opt_ctx->grad_v[i] = nullptr;
@ -485,6 +505,31 @@ static void ggml_opt_build(ggml_opt_context_t opt_ctx) {
}
}
// Gradient checkpointing: mark every Nth forward node as OUTPUT so the allocator
// keeps its memory alive through the backward pass. The backward graph already
// contains the forward ops (gb_grad is a superset of gf), so the checkpointed
// activations are naturally available for backward matmuls without recomputation.
// This prevents the allocator from aliasing those buffers to later ops, cutting
// peak activation VRAM at the cost of slightly larger static allocation.
if (opt_ctx->grad_checkpoint_interval > 0) {
const int interval = opt_ctx->grad_checkpoint_interval;
const int n_fwd = opt_ctx->gf->n_nodes;
int ckpt_count = 0;
for (int i = interval - 1; i < n_fwd; i += interval) {
struct ggml_tensor * node = opt_ctx->gf->nodes[i];
// Only checkpoint F32 compute nodes — skip I32 index tensors and already-output nodes.
if (node->type != GGML_TYPE_F32) continue;
if (node->flags & GGML_TENSOR_FLAG_OUTPUT) continue;
if (node->flags & GGML_TENSOR_FLAG_INPUT) continue;
node->flags |= GGML_TENSOR_FLAG_OUTPUT;
ckpt_count++;
}
if (ckpt_count > 0) {
GGML_LOG_DEBUG("%s: gradient checkpointing: marked %d/%d nodes as persistent (interval=%d)\n",
__func__, ckpt_count, n_fwd, interval);
}
}
// gb_grad == graph backward gradients, forward pass, then backward pass to calculate gradients.
opt_ctx->gb_grad = ggml_graph_dup(opt_ctx->ctx_compute, opt_ctx->gf, /*force_grads =*/ true);
ggml_build_backward_expand(opt_ctx->ctx_compute, opt_ctx->gb_grad, opt_ctx->grad_accs.data());
@ -503,7 +548,7 @@ static void ggml_opt_build(ggml_opt_context_t opt_ctx) {
// gb_opt == graph backward optimize, forward pass, then backward pass to calculate gradients, then optimizer step.
opt_ctx->gb_opt = ggml_graph_dup(opt_ctx->ctx_compute, opt_ctx->gb_grad, /*force_grads =*/ true);
opt_ctx->opt_step_params = ggml_new_tensor_1d(opt_ctx->ctx_cpu, GGML_TYPE_F32, need_momenta ? 7 : 2);
opt_ctx->opt_step_params = ggml_new_tensor_1d(opt_ctx->ctx_cpu, GGML_TYPE_F32, need_momenta ? 8 : 2);
ggml_tensor * adamw_params = opt_ctx->opt_step_params;
ggml_set_input(adamw_params);
const char * optimizer_name = ggml_opt_optimizer_name(opt_ctx->optimizer);
@ -555,10 +600,11 @@ ggml_opt_context_t ggml_opt_init(struct ggml_opt_params params) {
result->build_type_alloc = params.build_type;
result->inputs = params.inputs;
result->outputs = params.outputs;
result->opt_period = params.opt_period;
result->get_opt_pars = params.get_opt_pars;
result->get_opt_pars_ud = params.get_opt_pars_ud;
result->optimizer = params.optimizer;
result->opt_period = params.opt_period;
result->grad_checkpoint_interval = params.grad_checkpoint_interval;
result->get_opt_pars = params.get_opt_pars;
result->get_opt_pars_ud = params.get_opt_pars_ud;
result->optimizer = params.optimizer;
GGML_ASSERT(result->opt_period >= 1);
@ -587,6 +633,12 @@ void ggml_opt_free(ggml_opt_context_t opt_ctx) {
}
ggml_backend_buffer_free(opt_ctx->buf_static);
ggml_backend_buffer_free(opt_ctx->buf_cpu);
for (ggml_backend_buffer_t buf : opt_ctx->bufs_momenta) {
ggml_backend_buffer_free(buf);
}
for (struct ggml_context * ctx : opt_ctx->ctxs_momenta) {
ggml_free(ctx);
}
ggml_free(opt_ctx->ctx_static);
ggml_free(opt_ctx->ctx_cpu);
delete opt_ctx;
@ -726,6 +778,17 @@ void ggml_opt_alloc(ggml_opt_context_t opt_ctx, bool backward) {
if (opt_ctx->build_type == GGML_OPT_BUILD_TYPE_OPT && opt_ctx->opt_period > 1 && opt_ctx->opt_i == 0) {
ggml_graph_reset(opt_ctx->gb_grad);
}
// For non-static graphs the compute graph is rebuilt every call, so ggml_graph_reset
// is not called and grad_accs may carry over values from the previous accumulation window.
// Explicitly zero them at the start of each gradient-accumulation cycle.
if (!opt_ctx->static_graphs && backward && opt_ctx->opt_i == 0) {
for (struct ggml_tensor * ga : opt_ctx->grad_accs) {
if (ga) {
ggml_set_zero(ga);
}
}
}
if (backward) {
const int32_t opt_i_next = (opt_ctx->opt_i + 1) % opt_ctx->opt_period;
opt_ctx->build_type = opt_i_next == 0 ? GGML_OPT_BUILD_TYPE_OPT : GGML_OPT_BUILD_TYPE_GRAD;
@ -793,6 +856,7 @@ void ggml_opt_eval(ggml_opt_context_t opt_ctx, ggml_opt_result_t result) {
GGML_ASSERT(opt_pars.adamw.eps >= 0.0f);
GGML_ASSERT(opt_pars.adamw.wd >= 0.0f);
GGML_ASSERT(opt_pars.adamw.wd <= 1.0f);
GGML_ASSERT(opt_pars.adamw.gclip >= 0.0f);
// beta1, beta2 after applying warmup
const float beta1h = 1.0f / (1.0f - powf(opt_pars.adamw.beta1, opt_ctx->iter));
@ -806,6 +870,7 @@ void ggml_opt_eval(ggml_opt_context_t opt_ctx, ggml_opt_result_t result) {
adamw_par_data[4] = opt_pars.adamw.wd;
adamw_par_data[5] = beta1h;
adamw_par_data[6] = beta2h;
adamw_par_data[7] = opt_pars.adamw.gclip;
} break;
case GGML_OPT_OPTIMIZER_TYPE_SGD: {
GGML_ASSERT(opt_pars.sgd.alpha > 0.0f);

View File

@ -984,6 +984,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"MUL_MAT",
"MUL_MAT_ID",
"OUT_PROD",
"OUT_PROD_ID",
"SCALE",
"SET",
@ -1057,7 +1058,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"GLU",
};
static_assert(GGML_OP_COUNT == 96, "GGML_OP_COUNT != 96");
static_assert(GGML_OP_COUNT == 97, "GGML_OP_COUNT != 97");
static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"none",
@ -1094,6 +1095,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"X*Y",
"X[i]*Y",
"X*Y",
"X_id⊗Y_id",
"x*v",
"y-\\>view(x)",
@ -1167,7 +1169,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"glu(x)",
};
static_assert(GGML_OP_COUNT == 96, "GGML_OP_COUNT != 96");
static_assert(GGML_OP_COUNT == 97, "GGML_OP_COUNT != 97");
static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
@ -3302,6 +3304,44 @@ struct ggml_tensor * ggml_out_prod(
return result;
}
// ggml_out_prod_id
//
// Scattered outer-product for the MUL_MAT_ID backward pass.
//
// a:      [cols, n_expert_used, n_tokens] F32 — activations (src1 of MUL_MAT_ID)
// b:      [rows, n_expert_used, n_tokens] F32 — upstream gradient
// ids:    [n_expert_used, n_tokens]       I32 — expert dispatch indices (src2 of MUL_MAT_ID)
// result: [cols, rows, n_expert, 1]       F32
//
// result[:, :, e] += sum_{(i,t): ids[i,t]==e} a[:, i, t] ⊗ b[:, i, t]
//
// Computes the gradient w.r.t. the expert weight matrices (src0) of MUL_MAT_ID.
struct ggml_tensor * ggml_out_prod_id(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        struct ggml_tensor  * b,
        struct ggml_tensor  * ids,
        int64_t               n_expert) {
    GGML_ASSERT(n_expert > 0);

    // floating-point operands must be F32; dispatch indices are integer
    GGML_ASSERT(a->type   == GGML_TYPE_F32);
    GGML_ASSERT(b->type   == GGML_TYPE_F32);
    GGML_ASSERT(ids->type == GGML_TYPE_I32);

    // a, b and ids must agree on (n_expert_used, n_tokens)
    GGML_ASSERT(a->ne[1]   == b->ne[1]); // n_expert_used matches
    GGML_ASSERT(a->ne[2]   == b->ne[2]); // n_tokens matches
    GGML_ASSERT(ids->ne[0] == a->ne[1]); // n_expert_used matches ids
    GGML_ASSERT(ids->ne[1] == a->ne[2]); // n_tokens matches ids

    // one [cols, rows] gradient slab per expert
    const int64_t ne_dst[4] = { a->ne[0], b->ne[0], n_expert, 1 };
    struct ggml_tensor * dst = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne_dst);

    dst->op     = GGML_OP_OUT_PROD_ID;
    dst->src[0] = a;
    dst->src[1] = b;
    dst->src[2] = ids;

    return dst;
}
// ggml_scale
static struct ggml_tensor * ggml_scale_impl(
@ -3840,12 +3880,17 @@ struct ggml_tensor * ggml_get_rows_back(
struct ggml_tensor * a,
struct ggml_tensor * b,
struct ggml_tensor * c) {
GGML_ASSERT(ggml_is_matrix(a) && ggml_is_vector(b) && b->type == GGML_TYPE_I32);
GGML_ASSERT(ggml_is_matrix(c) && (a->ne[0] == c->ne[0]));
GGML_ASSERT(b->type == GGML_TYPE_I32);
GGML_ASSERT(a->ne[0] == c->ne[0]);
// Support both 2D and 3D: result shape matches c (the source tensor shape)
// TODO: implement non F32 return
//struct ggml_tensor * result = ggml_new_tensor_2d(ctx, a->type, a->ne[0], b->ne[0]);
struct ggml_tensor * result = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, c->ne[0], c->ne[1]);
struct ggml_tensor * result;
if (c->ne[2] > 1) {
result = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, c->ne[0], c->ne[1], c->ne[2]);
} else {
result = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, c->ne[0], c->ne[1]);
}
result->op = GGML_OP_GET_ROWS_BACK;
result->src[0] = a;
@ -6064,7 +6109,7 @@ struct ggml_tensor * ggml_opt_step_adamw(
GGML_ASSERT(ggml_are_same_shape(a, m));
GGML_ASSERT(ggml_are_same_shape(a, v));
GGML_ASSERT(adamw_params->type == GGML_TYPE_F32);
GGML_ASSERT(ggml_nelements(adamw_params) == 7);
GGML_ASSERT(ggml_nelements(adamw_params) == 8);
struct ggml_tensor * result = ggml_view_tensor(ctx, a);
@ -6527,6 +6572,35 @@ static void ggml_compute_backward(
grad))); // [m,p,qq,rr]
}
} break;
case GGML_OP_MUL_MAT_ID: {
// Backward pass for indirect matrix multiplication (MoE).
//
// Forward: dst[rows, n_exp_used, n_tokens] = as[:,:,ids[i,t]] @ b[:,i,t]
// src0 = as [cols, rows, n_expert] — expert weight matrices
// src1 = b [cols, n_exp_used, n_tokens] — token activations
// src2 = ids [n_exp_used, n_tokens] — expert dispatch indices (I32)
//
// Gradient w.r.t. src1 (activations):
// grad_b[:,i,t] = as[:,:,ids[i,t]]^T @ grad[:,i,t]
// → computed via MUL_MAT_ID with transposed as
//
// Gradient w.r.t. src0 (expert weights, only when F32 i.e. LoRA):
// grad_as[:,:,e] += sum_{(i,t): ids[i,t]==e} b[:,i,t] ⊗ grad[:,i,t]
// → computed via OUT_PROD_ID
//
// Quantized src0 is frozen (stop-gradient) — handled in grads_needed below.
if (src0_needs_grads) {
const int64_t n_expert = src0->ne[2];
struct ggml_tensor * grad_as = ggml_out_prod_id(ctx, src1, grad, src2, n_expert);
ggml_add_or_set(ctx, cgraph, isrc0, grad_as);
}
if (src1_needs_grads) {
// Transpose expert matrices: as [cols, rows, n_expert] → as_T [rows, cols, n_expert]
struct ggml_tensor * as_T = ggml_cont(ctx, ggml_permute(ctx, src0, 1, 0, 2, 3));
struct ggml_tensor * grad_b = ggml_mul_mat_id(ctx, as_T, grad, src2);
ggml_add_or_set(ctx, cgraph, isrc1, grad_b);
}
} break;
case GGML_OP_SCALE: {
if (src0_needs_grads) {
float s;
@ -6973,6 +7047,35 @@ void ggml_build_backward_expand(
ignore_src[1] = true;
break;
// MUL_MAT_ID: expert dispatch indices (src2) are integer — no gradient.
// When src0 is quantized the expert weights are frozen, so stop gradient through
// both src0 and src1 (activations have no path to loss without differentiable weights).
case GGML_OP_MUL_MAT_ID:
if (ggml_is_quantized(node->src[0]->type)) {
ignore_src[0] = true;
ignore_src[1] = true;
}
ignore_src[2] = true; // ids: integer tensor
break;
// SET_ROWS is a KV-cache scatter write. The gradient of the written data flows
// through the attention read path (GET_ROWS backward), not through this node.
case GGML_OP_SET_ROWS:
ignore_src[0] = true;
ignore_src[1] = true;
break;
// Ops with no backward implementation — stop gradient through all sources so the
// backward graph builder never tries to propagate through them.
case GGML_OP_SSM_CONV: // Mamba causal conv1d
case GGML_OP_SSM_SCAN: // Mamba selective scan
case GGML_OP_FLASH_ATTN_EXT: // use standard attention for training
ignore_src[0] = true;
ignore_src[1] = true;
ignore_src[2] = true;
ignore_src[3] = true;
break;
default:
break;
}
@ -6988,9 +7091,12 @@ void ggml_build_backward_expand(
continue;
}
// inplace operations are currently not supported
GGML_ASSERT(!node->view_src || node->op == GGML_OP_CPY || node->op == GGML_OP_VIEW ||
node->op == GGML_OP_RESHAPE || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_TRANSPOSE);
// inplace operations are currently not supported — warn and skip instead of crashing
if (node->view_src && node->op != GGML_OP_CPY && node->op != GGML_OP_VIEW &&
node->op != GGML_OP_RESHAPE && node->op != GGML_OP_PERMUTE && node->op != GGML_OP_TRANSPOSE) {
GGML_LOG_WARN("%s: skipping unsupported inplace op '%s' in backward graph\n", __func__, ggml_op_name(node->op));
continue;
}
const size_t ihash = ggml_hash_find(&cgraph->visited_hash_set, node);
GGML_ASSERT(ihash != GGML_HASHSET_FULL);

View File

@ -1553,10 +1553,22 @@ extern "C" {
void * get_opt_pars_ud; // userdata for calculating optimizer parameters
enum ggml_opt_optimizer_type optimizer_type;
// Gradient checkpointing: mark every Nth forward graph node as persistent so the
// allocator cannot reuse its memory during backward. Reduces peak activation VRAM
// at the cost of ~0 extra compute (activations are kept, not recomputed).
// Set to 0 (default) to disable. Good values: 32–64 nodes ≈ every 1–2 transformer layers.
int32_t grad_checkpoint_interval;
};
LLAMA_API void llama_opt_init(struct llama_context * lctx, struct llama_model * model, struct llama_opt_params lopt_params);
// weights: array of floats, one per dataset window (indexed by idata), already normalized to [0,1].
// n_weights: length of the array.
// Pass NULL/0 to disable (equivalent to all-ones, i.e. standard SFT).
// The pointer must remain valid for the duration of all llama_opt_epoch calls.
LLAMA_API void llama_opt_set_reward_weights(const float * weights, int64_t n_weights);
LLAMA_API void llama_opt_epoch(
struct llama_context * lctx,
ggml_opt_dataset_t dataset,
@ -1564,7 +1576,8 @@ extern "C" {
ggml_opt_result_t result_eval,
int64_t idata_split,
ggml_opt_epoch_callback callback_train,
ggml_opt_epoch_callback callback_eval);
ggml_opt_epoch_callback callback_eval,
bool shuffle);
#ifdef __cplusplus
}

View File

@ -334,16 +334,26 @@ static void llama_adapter_lora_init_impl(llama_model & model, const char * path_
auto * buft = ggml_backend_buffer_get_type(model_tensor->buffer);
// do not load loras to extra buffer types (i.e. bufts for repacking) -> use the CPU in that case
// do not load loras to extra buffer types (i.e. bufts for repacking)
// try device-native buft first (keeps LoRA on GPU), fall back to CPU only as last resort
for (auto & ex : buft_extra) {
if (ex == buft) {
LLAMA_LOG_WARN("%s: lora for '%s' cannot use buft '%s', fallback to CPU\n", __func__, model_tensor->name, ggml_backend_buft_name(buft));
auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
if (!cpu_dev) {
throw std::runtime_error(format("%s: no CPU backend found", __func__));
// try to get the device's native (non-repack) buffer type
auto * dev = ggml_backend_buft_get_device(buft);
auto * native_buft = dev ? ggml_backend_dev_buffer_type(dev) : nullptr;
if (native_buft && native_buft != buft) {
LLAMA_LOG_WARN("%s: lora for '%s' cannot use repack buft '%s', using device-native '%s'\n",
__func__, model_tensor->name, ggml_backend_buft_name(buft), ggml_backend_buft_name(native_buft));
buft = native_buft;
} else {
LLAMA_LOG_WARN("%s: lora for '%s' cannot use buft '%s', fallback to CPU\n",
__func__, model_tensor->name, ggml_backend_buft_name(buft));
auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
if (!cpu_dev) {
throw std::runtime_error(format("%s: no CPU backend found", __func__));
}
buft = ggml_backend_dev_buffer_type(cpu_dev);
}
buft = ggml_backend_dev_buffer_type(cpu_dev);
break;
}

View File

@ -2677,11 +2677,71 @@ void llama_context::opt_init(struct llama_model * model, struct llama_opt_params
GGML_ASSERT(model->hparams.n_ctx_train % n_batch == 0);
GGML_ASSERT(n_batch % n_ubatch == 0);
// Recreate the scheduler and gf_res_prev with a training-inflated graph size before
// creating opt_ctx, so opt_ctx captures the new (larger) scheduler pointer.
// The backward graph (gb_grad) duplicates gf and adds ~2-3x more nodes+leafs;
// gb_opt adds optimizer step nodes on top.
//
// We measure the actual training forward graph node count at n_ubatch here,
// then multiply by 4 to cover gf + gb_grad + gb_opt. This is exact for any
// model size — no magic constant needed.
{
uint32_t train_fwd_nodes = 0;
// Build a real training-ubatch forward graph in split-only mode (no buffer realloc)
// so we can count its actual nodes. Fall back to n_tensors formula if it fails.
if (memory) {
auto mctx_tmp = memory->init_full();
if (mctx_tmp) {
// graph_reserve() uses gf_res_reserve to build the graph, so both
// must be large enough to hold the training forward graph.
// Use 16x n_tensors as a generous temporary cap for the measurement pass.
const uint32_t tmp_cap = std::max<uint32_t>(4096u, 16u * model->n_tensors());
gf_res_prev.reset(new llm_graph_result(tmp_cap));
gf_res_reserve.reset(new llm_graph_result(tmp_cap));
// split_only=true: only splits the graph, doesn't reallocate compute buffers
auto * gf_train = graph_reserve(n_ubatch, 1, n_ubatch, mctx_tmp.get(), /*split_only=*/true);
if (gf_train) {
train_fwd_nodes = (uint32_t)ggml_graph_n_nodes(gf_train);
LLAMA_LOG_INFO("%s: measured training graph nodes = %u (n_ubatch=%u)\n",
__func__, train_fwd_nodes, n_ubatch);
}
}
}
if (train_fwd_nodes == 0) {
// Fallback: use n_tensors formula
train_fwd_nodes = std::max<uint32_t>(1024u, 8u * model->n_tensors());
LLAMA_LOG_WARN("%s: could not measure training graph, using fallback nodes=%u\n",
__func__, train_fwd_nodes);
}
// gf + gb_grad + gb_opt each need ~train_fwd_nodes; multiply by 4 for safety headroom.
// Multiply by 2 again for the scheduler's n_nodes + n_leafs check.
const int64_t inflated = (int64_t)std::max<uint32_t>(train_fwd_nodes, 1024u) * 4;
const int64_t sched_size = inflated * 2;
// Both gf_res_prev and gf_res_reserve are used to build forward graphs
// (graph_reserve uses gf_res_reserve; opt_epoch_iter uses gf_res_prev).
// Both must have capacity for the full backward graph.
gf_res_prev.reset(new llm_graph_result(inflated));
gf_res_reserve.reset(new llm_graph_result(inflated));
sched.reset(ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(),
sched_size, cparams.pipeline_parallel, cparams.op_offload));
// Suppress the next sched_reserve() call so that llama_decode() during GRPO inference
// steps does NOT replace the training sched with a smaller inference sched.
// opt_ctx->backend_sched stores a raw pointer to sched.get(); replacing sched while
// opt_ctx is alive would leave that pointer dangling and crash on the next opt_epoch.
sched_need_reserve = false;
LLAMA_LOG_INFO("%s: training graph capacity = %lld (train_fwd_nodes=%u x4)\n",
__func__, (long long)inflated, train_fwd_nodes);
}
ggml_opt_params opt_params = ggml_opt_default_params(sched.get(), GGML_OPT_LOSS_TYPE_CROSS_ENTROPY);
opt_params.opt_period = n_batch / n_ubatch;
opt_params.get_opt_pars = lopt_params.get_opt_pars;
opt_params.get_opt_pars_ud = lopt_params.get_opt_pars_ud;
opt_params.optimizer = lopt_params.optimizer_type;
opt_params.opt_period = n_batch / n_ubatch;
opt_params.get_opt_pars = lopt_params.get_opt_pars;
opt_params.get_opt_pars_ud = lopt_params.get_opt_pars_ud;
opt_params.optimizer = lopt_params.optimizer_type;
opt_params.grad_checkpoint_interval = lopt_params.grad_checkpoint_interval;
opt_ctx = ggml_opt_init(opt_params);
llama_opt_param_filter param_filter = lopt_params.param_filter;
@ -2716,6 +2776,7 @@ void llama_context::opt_epoch_iter(
const std::vector<llama_token> & tokens,
const std::vector<llama_token> & labels_sparse,
llama_batch & batch,
float reward_scale,
ggml_opt_epoch_callback callback,
bool train,
int64_t idata_in_loop,
@ -2764,6 +2825,8 @@ void llama_context::opt_epoch_iter(
};
uint32_t pos_batch = 0;
static bool timings_printed = false; // print per-ubatch timings only for the first window
struct ggml_context * ctx_compute_opt = nullptr;
do {
const auto & ubatch = mctx->get_ubatch();
@ -2776,56 +2839,98 @@ void llama_context::opt_epoch_iter(
auto * res = gf_res_prev.get();
const int64_t t0_build = ggml_time_ms();
const auto gparams = graph_params(res, ubatch, mctx.get(), LLM_GRAPH_TYPE_DEFAULT);
res->reset();
auto * gf = model.build_graph(gparams);
struct ggml_context * ctx_compute_opt;
{
// Allocate the tensor metadata context once, then reset it each iteration.
// ggml_reset() is much cheaper than ggml_free()+ggml_init() — it just resets the
// allocation pointer without freeing/reallocating the backing memory buffer.
if (!ctx_compute_opt) {
const size_t size_gf = ggml_graph_size(gf);
const size_t size_meta = 4*size_gf*ggml_tensor_overhead() + 2*ggml_graph_overhead_custom(size_gf, /*grads = */ true);
const size_t size_meta = 4*size_gf*ggml_tensor_overhead() + 3*ggml_graph_overhead_custom(size_gf, /*grads = */ true);
struct ggml_init_params params = {
/*.mem_size =*/ size_meta,
/*.mem_buffer =*/ nullptr,
/*.no_alloc =*/ true,
};
ctx_compute_opt = ggml_init(params);
if (!timings_printed) {
LLAMA_LOG_INFO("%s: [timing] graph capacity=%zu n_nodes=%d size_meta=%.1fMB\n", __func__,
size_gf, ggml_graph_n_nodes(gf), (double)size_meta / (1024*1024));
}
} else {
ggml_reset(ctx_compute_opt);
}
const int64_t t1_alloc = ggml_time_ms();
ggml_opt_prepare_alloc(opt_ctx, ctx_compute_opt, gf, res->get_inp_tokens(), res->get_logits());
ggml_opt_alloc(opt_ctx, train);
const int64_t t2_inputs = ggml_time_ms();
res->set_inputs(&ubatch);
{
struct ggml_tensor * labels = ggml_opt_labels(opt_ctx);
GGML_ASSERT(labels->ne[1] == n_ubatch);
ggml_set_zero(labels);
const float onef = 1.0f;
for (uint32_t pos_ubatch = 0; pos_ubatch < n_ubatch; ++pos_ubatch) {
const uint32_t ilabel = pos_ctx + pos_batch + pos_ubatch;
// -1 sentinel means "masked position" (prompt token, BOS separator, etc).
// Leave the label tensor zeroed at this position → zero cross-entropy
// contribution. Do NOT write anything — ggml_set_zero already handled it.
if (labels_sparse[ilabel] < 0) continue;
GGML_ASSERT(labels_sparse[ilabel] < labels->ne[0]);
ggml_backend_tensor_set(labels, &onef, (pos_ubatch*labels->ne[0] + labels_sparse[ilabel])*sizeof(float), sizeof(float));
ggml_backend_tensor_set(labels, &reward_scale, (pos_ubatch*labels->ne[0] + labels_sparse[ilabel])*sizeof(float), sizeof(float));
}
}
const int64_t t3_eval = ggml_time_ms();
ggml_opt_eval(opt_ctx, result);
const int64_t t4_done = ggml_time_ms();
if (!timings_printed) {
LLAMA_LOG_INFO("%s: [timing] build=%" PRId64 "ms alloc=%" PRId64 "ms inputs=%" PRId64 "ms eval=%" PRId64 "ms total=%" PRId64 "ms\n",
__func__,
t1_alloc - t0_build,
t2_inputs - t1_alloc,
t3_eval - t2_inputs,
t4_done - t3_eval,
t4_done - t0_build);
timings_printed = true;
}
if (callback) {
callback(train, opt_ctx, dataset, result, idata_in_loop + (pos_ctx + pos_batch)/n_ubatch + 1, ndata_in_loop, t_loop_start);
}
ggml_free(ctx_compute_opt);
pos_batch += ubatch.n_tokens;
} while (mctx->next());
ggml_free(ctx_compute_opt);
}
}
// Optional per-window reward weights for reward-weighted SFT.
// Set via llama_opt_set_reward_weights() before calling llama_opt_epoch().
// Null/0 means all rewards are 1.0 (standard SFT).
// The array is borrowed, not copied — the caller must keep the pointer valid
// for the duration of all llama_opt_epoch() calls that should see it.
// NOTE(review): these are thread_local, so the weights are only visible to
// llama_opt_epoch() calls made on the SAME thread that called the setter —
// confirm this is the intended contract for multi-threaded callers.
static thread_local const float * g_reward_weights = nullptr; // one weight per dataset window, indexed by idata
static thread_local int64_t g_reward_weights_n = 0; // number of entries in g_reward_weights

// Register (or clear, by passing NULL/0) the per-window reward weights used by
// the training loop on the calling thread. Does not take ownership of `weights`.
void llama_opt_set_reward_weights(const float * weights, int64_t n_weights) {
g_reward_weights = weights;
g_reward_weights_n = n_weights;
}
void llama_context::opt_epoch(
ggml_opt_dataset_t dataset,
ggml_opt_result_t result_train,
ggml_opt_result_t result_eval,
int64_t idata_split,
ggml_opt_epoch_callback callback_train,
ggml_opt_epoch_callback callback_eval) {
ggml_opt_epoch_callback callback_eval,
bool shuffle) {
const uint32_t n_ctx = this->n_ctx();
const uint32_t n_batch = std::min(cparams.n_batch, n_ctx);
const uint32_t n_ubatch = std::min(cparams.n_ubatch, n_batch);
@ -2834,6 +2939,10 @@ void llama_context::opt_epoch(
GGML_ASSERT(idata_split >= 0);
GGML_ASSERT(idata_split <= ndata);
if (shuffle && idata_split > 1) {
ggml_opt_dataset_shuffle(opt_ctx, dataset, idata_split);
}
const uint32_t ubatch_per_ctx = n_ctx / n_ubatch;
struct llama_batch batch = llama_batch_init(n_batch, 0, 1);
@ -2847,9 +2956,11 @@ void llama_context::opt_epoch(
for (; idata < idata_split; ++idata) {
constexpr bool train = true;
const int64_t idata_in_loop = idata*ubatch_per_ctx;
const float reward = (g_reward_weights && idata < g_reward_weights_n)
? g_reward_weights[idata] : 1.0f;
ggml_opt_dataset_get_batch_host(dataset, tokens.data(), n_ctx*sizeof(llama_token), labels_sparse.data(), idata);
opt_epoch_iter(dataset, result_train, tokens, labels_sparse, batch,
opt_epoch_iter(dataset, result_train, tokens, labels_sparse, batch, reward,
callback_train, train, idata_in_loop, ndata_in_loop, t_loop_start);
}
@ -2860,7 +2971,7 @@ void llama_context::opt_epoch(
const int64_t idata_in_loop = (idata - idata_split)*ubatch_per_ctx;
ggml_opt_dataset_get_batch_host(dataset, tokens.data(), n_ctx*sizeof(llama_token), labels_sparse.data(), idata);
opt_epoch_iter(dataset, result_eval, tokens, labels_sparse, batch,
opt_epoch_iter(dataset, result_eval, tokens, labels_sparse, batch, 1.0f,
callback_eval, train, idata_in_loop, ndata_in_loop, t_loop_start);
}
@ -3622,12 +3733,14 @@ void llama_opt_epoch(
ggml_opt_result_t result_eval,
int64_t idata_split,
ggml_opt_epoch_callback callback_train,
ggml_opt_epoch_callback callback_eval) {
ggml_opt_epoch_callback callback_eval,
bool shuffle) {
ctx->opt_epoch(
dataset,
result_train,
result_eval,
idata_split,
callback_train,
callback_eval);
callback_eval,
shuffle);
}

View File

@ -187,7 +187,8 @@ struct llama_context {
ggml_opt_result_t result_eval,
int64_t idata_split,
ggml_opt_epoch_callback callback_train,
ggml_opt_epoch_callback callback_eval);
ggml_opt_epoch_callback callback_eval,
bool shuffle);
void opt_epoch_iter(
ggml_opt_dataset_t dataset,
@ -195,6 +196,7 @@ struct llama_context {
const std::vector<llama_token> & tokens,
const std::vector<llama_token> & labels_sparse,
llama_batch & batch,
float reward_scale,
ggml_opt_epoch_callback callback,
bool train,
int64_t idata_in_loop,