Merge 99c2456bbb into 9e2e2198b0
This commit is contained in:
commit
727b1f69dc
|
|
@ -3607,32 +3607,108 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|||
{ "-lr", "--learning-rate" }, "ALPHA",
|
||||
string_format("adamw or sgd optimizer alpha (default: %.2g); note: sgd alpha recommended ~10x (no momentum)", (double) params.lr.lr0),
|
||||
[](common_params & params, const std::string & value) { params.lr.lr0 = std::stof(value); }
|
||||
).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
|
||||
).set_examples({ LLAMA_EXAMPLE_FINETUNE, LLAMA_EXAMPLE_FINETUNE_QLORA }));
|
||||
add_opt(common_arg({ "-lr-min", "--learning-rate-min" }, "ALPHA",
|
||||
string_format("(if >0) final learning rate after decay (if -decay-epochs is set, default=%.2g)",
|
||||
(double) params.lr.lr_min),
|
||||
[](common_params & params, const std::string & value) { params.lr.lr_min = std::stof(value); }
|
||||
).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
|
||||
).set_examples({ LLAMA_EXAMPLE_FINETUNE, LLAMA_EXAMPLE_FINETUNE_QLORA }));
|
||||
add_opt(common_arg(
|
||||
{"-decay-epochs", "--learning-rate-decay-epochs"}, "ALPHA",
|
||||
string_format("(if >0) decay learning rate to -lr-min after this many epochs (exponential decay, default=%.2g)", (double) params.lr.decay_epochs),
|
||||
[](common_params & params, const std::string & value) { params.lr.decay_epochs = std::stof(value); }
|
||||
).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
|
||||
).set_examples({ LLAMA_EXAMPLE_FINETUNE, LLAMA_EXAMPLE_FINETUNE_QLORA }));
|
||||
add_opt(common_arg(
|
||||
{"-wd", "--weight-decay"}, "WD",
|
||||
string_format("adamw or sgd optimizer weight decay (0 is off; recommend very small e.g. 1e-9) (default: %.2g).", (double) params.lr.wd),
|
||||
[](common_params & params, const std::string & value) { params.lr.wd = std::stof(value); }
|
||||
).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
|
||||
).set_examples({ LLAMA_EXAMPLE_FINETUNE, LLAMA_EXAMPLE_FINETUNE_QLORA }));
|
||||
add_opt(common_arg(
|
||||
{"-val-split", "--val-split"}, "FRACTION",
|
||||
string_format("fraction of data to use as validation set for training (default: %.2g).", (double) params.val_split),
|
||||
[](common_params & params, const std::string & value) { params.val_split = std::stof(value); }
|
||||
).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
|
||||
).set_examples({ LLAMA_EXAMPLE_FINETUNE, LLAMA_EXAMPLE_FINETUNE_QLORA }));
|
||||
// qlora flags
|
||||
add_opt(common_arg(
|
||||
{"--lora-rank"}, "N",
|
||||
string_format("LoRA rank r (default: %d)", params.lora_rank),
|
||||
[](common_params & params, int value) { params.lora_rank = value; }
|
||||
).set_examples({ LLAMA_EXAMPLE_FINETUNE_QLORA }));
|
||||
add_opt(common_arg(
|
||||
{"--lora-alpha"}, "F",
|
||||
string_format("LoRA alpha (default: %d = use rank value)", (int) params.lora_alpha),
|
||||
[](common_params & params, const std::string & value) { params.lora_alpha = std::stof(value); }
|
||||
).set_examples({ LLAMA_EXAMPLE_FINETUNE_QLORA }));
|
||||
add_opt(common_arg(
|
||||
{"--lora-targets"}, "SUBSTRINGS",
|
||||
string_format("comma-separated substrings of tensor names to add LoRA to (default: %s)", params.lora_targets.c_str()),
|
||||
[](common_params & params, const std::string & value) { params.lora_targets = value; }
|
||||
).set_examples({ LLAMA_EXAMPLE_FINETUNE_QLORA }));
|
||||
add_opt(common_arg(
|
||||
{"--lora-out"}, "FNAME",
|
||||
string_format("output LoRA adapter GGUF path (default: %s)", params.lora_out.c_str()),
|
||||
[](common_params & params, const std::string & value) { params.lora_out = value; }
|
||||
).set_examples({ LLAMA_EXAMPLE_FINETUNE_QLORA }));
|
||||
add_opt(common_arg(
|
||||
{"--train-file"}, "FNAME",
|
||||
"JSONL training dataset (fields: messages|prompt+response|text)",
|
||||
[](common_params & params, const std::string & value) { params.train_file = value; }
|
||||
).set_examples({ LLAMA_EXAMPLE_FINETUNE_QLORA }));
|
||||
add_opt(common_arg(
|
||||
{"--save-every"}, "N",
|
||||
"save adapter checkpoint every N dataset windows during training (default: 0 = only at end)",
|
||||
[](common_params & params, int value) { params.save_every = value; }
|
||||
).set_examples({ LLAMA_EXAMPLE_FINETUNE_QLORA }));
|
||||
add_opt(common_arg(
|
||||
{"--freeze-layers"}, "N",
|
||||
"freeze first N transformer layers — no LoRA adapters allocated for blk.0..blk.N-1 (default: 0 = train all layers)",
|
||||
[](common_params & params, int value) { params.lora_freeze_layers = value; }
|
||||
).set_examples({ LLAMA_EXAMPLE_FINETUNE_QLORA }));
|
||||
add_opt(common_arg(
|
||||
{"--grad-checkpoint"}, "N",
|
||||
"gradient checkpointing interval to reduce peak activation VRAM (0 = disabled, default: 0)",
|
||||
[](common_params & params, int value) { params.grad_checkpoint_interval = value; }
|
||||
).set_examples({ LLAMA_EXAMPLE_FINETUNE_QLORA }));
|
||||
add_opt(common_arg(
|
||||
{"--train-on-prompt"},
|
||||
"compute loss on prompt tokens too, not just the response (default: response-only loss)",
|
||||
[](common_params & params) { params.train_on_prompt = true; }
|
||||
).set_examples({ LLAMA_EXAMPLE_FINETUNE_QLORA }));
|
||||
add_opt(common_arg(
|
||||
{"--shuffle-dataset"},
|
||||
"shuffle dataset windows at the start of each epoch (default: sequential order)",
|
||||
[](common_params & params) { params.shuffle_dataset = true; }
|
||||
).set_examples({ LLAMA_EXAMPLE_FINETUNE_QLORA }));
|
||||
add_opt(common_arg(
|
||||
{"--grpo-mode"},
|
||||
"enable GRPO IPC training loop (prompts and rewards supplied via stdin/stdout)",
|
||||
[](common_params & params) { params.grpo_mode = true; }
|
||||
).set_examples({ LLAMA_EXAMPLE_FINETUNE_QLORA }));
|
||||
add_opt(common_arg(
|
||||
{"--n-gen"}, "N",
|
||||
string_format("GRPO: number of generations per prompt (default: %d)", params.grpo_n_gen),
|
||||
[](common_params & params, int value) { params.grpo_n_gen = value; }
|
||||
).set_examples({ LLAMA_EXAMPLE_FINETUNE_QLORA }));
|
||||
add_opt(common_arg(
|
||||
{"--n-steps"}, "N",
|
||||
string_format("GRPO: total optimizer steps (default: %d)", params.grpo_n_steps),
|
||||
[](common_params & params, int value) { params.grpo_n_steps = value; }
|
||||
).set_examples({ LLAMA_EXAMPLE_FINETUNE_QLORA }));
|
||||
add_opt(common_arg(
|
||||
{"--grpo-temp"}, "F",
|
||||
string_format("GRPO: sampling temperature for rollout generation (default: %.2f)", (double) params.grpo_temperature),
|
||||
[](common_params & params, const std::string & value) { params.grpo_temperature = std::stof(value); }
|
||||
).set_examples({ LLAMA_EXAMPLE_FINETUNE_QLORA }));
|
||||
add_opt(common_arg(
|
||||
{"--grpo-max-tokens"}, "N",
|
||||
string_format("GRPO: max tokens per generation (default: %d)", params.grpo_max_tokens),
|
||||
[](common_params & params, int value) { params.grpo_max_tokens = value; }
|
||||
).set_examples({ LLAMA_EXAMPLE_FINETUNE_QLORA }));
|
||||
add_opt(common_arg(
|
||||
{"-epochs", "--epochs"}, "N",
|
||||
string_format("optimizer max # of epochs (default: %d)", params.lr.epochs),
|
||||
[](common_params & params, int epochs) { params.lr.epochs = epochs; }
|
||||
).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
|
||||
).set_examples({ LLAMA_EXAMPLE_FINETUNE, LLAMA_EXAMPLE_FINETUNE_QLORA }));
|
||||
add_opt(common_arg(
|
||||
{"-opt", "--optimizer"}, "sgd|adamw", "adamw or sgd",
|
||||
[](common_params & params, const std::string & name) {
|
||||
|
|
@ -3641,7 +3717,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|||
throw std::invalid_argument("invalid --optimizer, valid options: adamw, sgd");
|
||||
}
|
||||
}
|
||||
).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
|
||||
).set_examples({ LLAMA_EXAMPLE_FINETUNE, LLAMA_EXAMPLE_FINETUNE_QLORA }));
|
||||
add_opt(common_arg(
|
||||
{"--check"},
|
||||
string_format("check rather than generate results (default: %s)", params.check ? "true" : "false"),
|
||||
|
|
|
|||
|
|
@ -103,6 +103,7 @@ enum llama_example {
|
|||
LLAMA_EXAMPLE_TTS,
|
||||
LLAMA_EXAMPLE_DIFFUSION,
|
||||
LLAMA_EXAMPLE_FINETUNE,
|
||||
LLAMA_EXAMPLE_FINETUNE_QLORA,
|
||||
LLAMA_EXAMPLE_FIT_PARAMS,
|
||||
LLAMA_EXAMPLE_RESULTS,
|
||||
LLAMA_EXAMPLE_EXPORT_GRAPH_OPS,
|
||||
|
|
@ -518,7 +519,26 @@ struct common_params {
|
|||
// finetune
|
||||
struct lr_opt lr;
|
||||
enum ggml_opt_optimizer_type optimizer = GGML_OPT_OPTIMIZER_TYPE_ADAMW;
|
||||
float val_split = 0.05f; // fraction of the data used for the validation set
|
||||
float val_split = 0.05f; // fraction of the data used for the validation set
|
||||
|
||||
// qlora fine-tuning
|
||||
int32_t lora_rank = 16; // LoRA rank (r)
|
||||
float lora_alpha = 0.0f; // LoRA alpha (0 = use rank value)
|
||||
std::string lora_targets = "attn_q,attn_output,ffn_gate,ffn_up,ffn_down"; // comma-separated substrings to match trainable tensors
|
||||
std::string lora_out = "adapter.gguf"; // output adapter GGUF path
|
||||
std::string train_file = ""; // JSONL training dataset path
|
||||
int32_t save_every = 0; // save checkpoint every N optimizer steps (0 = disabled)
|
||||
int32_t lora_freeze_layers = 0; // do not apply LoRA to the first N transformer layers
|
||||
int32_t grad_checkpoint_interval = 0; // gradient checkpointing interval to reduce peak VRAM (0 = disabled)
|
||||
bool train_on_prompt = false; // include prompt tokens in training loss (default: response tokens only)
|
||||
bool shuffle_dataset = false; // shuffle dataset windows at the start of each epoch
|
||||
|
||||
// grpo training
|
||||
bool grpo_mode = false; // enable GRPO IPC training loop
|
||||
int32_t grpo_n_gen = 8; // generations per prompt
|
||||
int32_t grpo_n_steps = 500; // total GRPO optimizer steps
|
||||
float grpo_temperature = 0.8f; // sampling temperature for rollouts
|
||||
int32_t grpo_max_tokens = 512; // max tokens per generation
|
||||
|
||||
// embedding
|
||||
bool embedding = false; // get only sentence embedding
|
||||
|
|
|
|||
|
|
@ -34,6 +34,7 @@ else()
|
|||
add_subdirectory(speculative-simple)
|
||||
add_subdirectory(gen-docs)
|
||||
add_subdirectory(training)
|
||||
add_subdirectory(qlora_training)
|
||||
add_subdirectory(diffusion)
|
||||
if (NOT GGML_BACKEND_DL)
|
||||
add_subdirectory(convert-llama2c-to-ggml)
|
||||
|
|
|
|||
|
|
@ -0,0 +1,5 @@
|
|||
set(TARGET llama-finetune-qlora)
|
||||
add_executable(${TARGET} finetune_qlora.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
|
@ -0,0 +1,331 @@
|
|||
# llama.cpp — Native QLoRA Training
|
||||
|
||||
Native QLoRA + Reward-Weighted SFT training pipeline for quantized GGUF models.
|
||||
|
||||
The base model weights remain **frozen** (quantized tensors are skipped by `llama_set_param` because they are not `GGML_TYPE_F32`). Only freshly-allocated F32 LoRA A/B tensors are trained. The saved adapter GGUF is directly compatible with the existing `llama_adapter_lora_init` loader and `llama-export-lora` merge tool.
|
||||
|
||||
**Status:** Working. Phase 1 (QLoRA SFT) and Phase 2 (Reward-Weighted SFT) are implemented and functional. Training speed is currently limited by full backprop through quantized weights — see [Known Limitations](#known-limitations).
|
||||
|
||||
---
|
||||
|
||||
## Build
|
||||
|
||||
```bash
|
||||
cd /mnt/w/llm-trading-arena/unsloth-api/llama.cpp
|
||||
|
||||
# First time (CUDA build):
|
||||
cmake -B build -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=ON -DLLAMA_CURL=OFF
|
||||
cmake --build build -j$(nproc)
|
||||
|
||||
# Incremental rebuild (after code changes):
|
||||
cmake --build build --target llama-finetune-qlora -j$(nproc)
|
||||
# If llama-adapter.cpp or llama-context.cpp changed, rebuild all:
|
||||
cmake --build build -j$(nproc)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Phase 1 — QLoRA SFT (`llama-finetune-qlora`)
|
||||
|
||||
Trains LoRA adapters on a quantized GGUF model.
|
||||
|
||||
### Recommended command (1.7B model, 16 GB card)
|
||||
|
||||
```bash
|
||||
./build/bin/llama-finetune-qlora \
|
||||
--model ~/qwen3-1.7b-q4_k_m.gguf \
|
||||
--train-file data/train.jsonl \
|
||||
--lora-rank 16 --lora-alpha 16 \
|
||||
-c 4096 -b 4096 -ub 512 \
|
||||
--save-every 10 \
|
||||
--lora-out ~/adapter.gguf \
|
||||
--epochs 3 --seed 42
|
||||
```
|
||||
|
||||
### Recommended command (15B model, 16 GB card, partial offload)
|
||||
|
||||
```bash
|
||||
./build/bin/llama-finetune-qlora \
|
||||
--model ~/nemotron-15b-q4_k_m.gguf \
|
||||
--train-file data/train.jsonl \
|
||||
--lora-rank 16 --lora-alpha 16 \
|
||||
-ngl 13 -c 14336 -b 14336 -ub 1024 \
|
||||
--save-every 8 \
|
||||
--lora-out ~/nemotron-lora.gguf \
|
||||
--epochs 3 --seed 42
|
||||
```
|
||||
|
||||
### All flags
|
||||
|
||||
| Flag | Default | Description |
|
||||
|---|---|---|
|
||||
| `--model` | *(required)* | Path to quantized GGUF model |
|
||||
| `--train-file` | *(required)* | JSONL training dataset |
|
||||
| `--lora-rank` | `16` | LoRA rank r |
|
||||
| `--lora-alpha` | `0` (= rank) | LoRA alpha; effective scale = alpha/rank |
|
||||
| `--lora-targets` | see below | Comma-separated internal tensor name substrings |
|
||||
| `--lora-out` | `adapter.gguf` | Output adapter GGUF path (supports `~`) |
|
||||
| `--save-every` | `0` | Save checkpoint every N dataset windows (0 = end only) |
|
||||
| `--freeze-layers` | `0` | Skip LoRA on first N transformer layers (blk.0..N-1); backward already pruned automatically |
|
||||
| `--grad-checkpoint` | `0` | Mark every Nth forward node persistent to reduce activation VRAM; good values: 32–64 |
|
||||
| `--train-on-prompt` | off | Compute loss on prompt tokens too (default: response-only loss) |
|
||||
| `--shuffle-dataset` | off | Shuffle dataset windows at the start of each epoch |
|
||||
| `--val-split` | `0.0` | Fraction of data to hold out for validation (e.g. `0.1` = 10%); val loss logged per epoch |
|
||||
| `-epochs` / `--epochs` | `3` | Training epochs |
|
||||
| `-c` / `--ctx-size` | `512` | Training context window (tokens) |
|
||||
| `-b` / `--batch-size` | `2048` | Tokens per `llama_decode` call; set equal to `-c` |
|
||||
| `-ub` / `--ubatch-size` | `512` | GPU micro-batch tokens; controls VRAM vs. step time |
|
||||
| `-ngl` | `999` | GPU layers to offload |
|
||||
| `-lr` / `--learning-rate` | `1e-4` | AdamW learning rate |
|
||||
| `--seed` | `42` | Random seed for LoRA init |
|
||||
|
||||
### VRAM vs. step-time tradeoff
|
||||
|
||||
Step time and VRAM both scale linearly with `-ub`:
|
||||
|
||||
| Model | `-ub` | VRAM | Step time (approx) |
|
||||
|---|---|---|---|
|
||||
| 1.7B Q4_K_M | 512 | ~18 GB | ~120 s (OOM on 16 GB) |
|
||||
| 1.7B Q4_K_M | 128 | ~6 GB | ~30 s |
|
||||
| 15B Q4_K_M | 1024 | ~11 GB | ~60 s |
|
||||
|
||||
Use `-c` equal to your target sequence length. More context = more windows per sample = more steps per epoch. Reducing `-c` reduces total training time proportionally.
|
||||
|
||||
### Default LoRA targets
|
||||
|
||||
llama.cpp uses **internal GGUF tensor names**, not HuggingFace names:
|
||||
|
||||
| llama.cpp internal | HuggingFace equivalent | Status |
|
||||
|---|---|---|
|
||||
| `attn_q` | `q_proj` | ✅ default target, trainable |
|
||||
| `attn_output` | `o_proj` | ✅ default target, trainable |
|
||||
| `ffn_gate` | `gate_proj` | ✅ default target, trainable |
|
||||
| `ffn_up` | `up_proj` | ✅ default target, trainable |
|
||||
| `ffn_down` | `down_proj` | ✅ default target, trainable |
|
||||
| `attn_k` | `k_proj` | ❌ not in defaults — zero gradient (KV scatter via SET_ROWS) |
|
||||
| `attn_v` | `v_proj` | ❌ not in defaults — zero gradient (KV scatter via SET_ROWS) |
|
||||
| `ssm_in` | `in_proj` | ❌ not in defaults — zero gradient (SSM_SCAN no backward) |
|
||||
| `ssm_out` | `out_proj` | ❌ not in defaults — zero gradient (SSM_SCAN no backward) |
|
||||
|
||||
**MoE models:** Expert tensors (`*_exps`) are excluded regardless of `--lora-targets`. The quantized expert weights are frozen (stop-gradient), but LoRA on the dense FFN layers (`ffn_gate`, `ffn_up`, `ffn_down`) works — backward via `MUL_MAT_ID` + `OUT_PROD_ID`.
|
||||
|
||||
### Dataset format (JSONL)
|
||||
|
||||
**Chat format** (loss on response only; use `--train-on-prompt` for all tokens):
|
||||
```json
|
||||
{"messages": [{"role": "user", "content": "Hello"}, {"role": "assistant", "content": "Hi!"}]}
|
||||
```
|
||||
|
||||
**Prompt/response** (loss on response only):
|
||||
```json
|
||||
{"prompt": "What is the capital of France?", "response": "Paris."}
|
||||
```
|
||||
|
||||
**Plain text** (loss on all tokens):
|
||||
```json
|
||||
{"text": "The quick brown fox."}
|
||||
```
|
||||
|
||||
**With reward** (Phase 2 — scales gradient by reward):
|
||||
```json
|
||||
{"prompt": "...", "response": "...", "reward": 0.85}
|
||||
```
|
||||
|
||||
Rewards are normalized per epoch: clipped to `[-1, 1]`, then min-max scaled to `[0, 1]`. Reward 0 = sample ignored; reward 1 = full gradient.
|
||||
|
||||
### Verify and use the adapter
|
||||
|
||||
```bash
|
||||
# Hot-load for inference (no merge needed)
|
||||
./build/bin/llama-cli --model base.gguf --lora adapter.gguf -p "Hello"
|
||||
|
||||
# Merge into base model
|
||||
./build/bin/llama-export-lora \
|
||||
--model base.gguf --lora adapter.gguf --output merged.gguf
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Phase 2 — Reward-Weighted SFT
|
||||
|
||||
Built into `llama-finetune-qlora`. When the dataset contains a `reward` or `score` field, the cross-entropy loss for that sample is scaled by the reward before backprop. No extra flags needed — detection is automatic.
|
||||
|
||||
---
|
||||
|
||||
## Phase 3 — GRPO (Online RL via IPC)
|
||||
|
||||
`llama-finetune-qlora --grpo-mode` implements a full GRPO training loop where the Python process owns prompt sampling and reward scoring, and the C++ process owns model state, generation, and gradient updates.
|
||||
|
||||
### Quick start
|
||||
|
||||
```bash
|
||||
python3 examples/qlora_training/grpo_example.py \
|
||||
--model ~/qwen3-1.7b-q4_k_m.gguf \
|
||||
--lora-out ~/grpo-adapter.gguf \
|
||||
--rank 16 --n-steps 200 --n-gen 8
|
||||
```
|
||||
|
||||
For verbose output (includes IPC message trace):
|
||||
|
||||
```bash
|
||||
python3 examples/qlora_training/grpo_example.py \
|
||||
--model ~/qwen3-1.7b-q4_k_m.gguf \
|
||||
--lora-out ~/grpo-adapter.gguf \
|
||||
--verbose
|
||||
```
|
||||
|
||||
Resume from a checkpoint:
|
||||
|
||||
```bash
|
||||
python3 examples/qlora_training/grpo_example.py \
|
||||
--model ~/qwen3-1.7b-q4_k_m.gguf \
|
||||
--lora ~/grpo-adapter.ckpt50.gguf \
|
||||
--lora-out ~/grpo-adapter.gguf
|
||||
```
|
||||
|
||||
### GRPO-specific flags
|
||||
|
||||
| Flag | Default | Description |
|
||||
|---|---|---|
|
||||
| `--grpo-mode` | off | Enable GRPO IPC mode |
|
||||
| `--n-gen` | `8` | Rollouts per prompt |
|
||||
| `--n-steps` | `500` | Total GRPO steps |
|
||||
| `--grpo-temp` | `0.8` | Sampling temperature for rollouts |
|
||||
| `--grpo-max-tokens` | `512` | Max tokens per generation |
|
||||
|
||||
All standard flags (`--lora-rank`, `-lr`, `-c`, `-ngl`, `--save-every`, etc.) work in GRPO mode too. `--train-file` is **not** required in GRPO mode.
|
||||
|
||||
### IPC protocol
|
||||
|
||||
The protocol is line-based over stdout (C++ → Python) and stdin (Python → C++). All non-protocol C++ output (timing, debug, model logs) goes to **stderr** and never contaminates the protocol channel.
|
||||
|
||||
**C++ → Python (stdout):**
|
||||
|
||||
| Line | When |
|
||||
|---|---|
|
||||
| `[QLORA:READY]` | Process initialised, model loaded |
|
||||
| `[QLORA:PROMPT_REQ:<step>]` | C++ requests the prompt for step N |
|
||||
| `[QLORA:GEN:<k>/<n>] <text>` | One generation (newlines escaped as `\n`) |
|
||||
| `[QLORA:REWARD_REQ:<n>]` | C++ requests N reward scores |
|
||||
| `[QLORA:PROGRESS] step=X/Y loss=Z epoch=A/B` | After each weight update |
|
||||
| `[QLORA:CHECKPOINT] <path>` | After saving a checkpoint |
|
||||
| `[QLORA:DONE] final_loss=X` | Training complete |
|
||||
| `[QLORA:ERROR] <message>` | Fatal error |
|
||||
|
||||
**Python → C++ (stdin):**
|
||||
|
||||
| Line | Meaning |
|
||||
|---|---|
|
||||
| `PROMPT <escaped_text>` | Send prompt for the most recent `PROMPT_REQ` |
|
||||
| `REWARD <r1> <r2> … <rN>` | Send N advantage scores in `[0, 1]` range |
|
||||
| `STOP` | Request graceful shutdown after current step |
|
||||
|
||||
**Text encoding:** newlines in generation text are escaped as the two-character sequence `\n`; backslashes are doubled. Use `unescape()` from `grpo_example.py` (or any equivalent) to recover the original text.
|
||||
|
||||
### Writing your own driver
|
||||
|
||||
`grpo_example.py` contains two functions you replace with your own logic:
|
||||
|
||||
```python
|
||||
def get_prompt(step: int) -> str:
|
||||
"""Return the training prompt for step N."""
|
||||
...
|
||||
|
||||
def score_generations(prompt: str, generations: List[str]) -> List[float]:
|
||||
"""Score each generation. Any numeric range — will be normalised."""
|
||||
...
|
||||
```
|
||||
|
||||
The IPC helpers (`escape`, `unescape`, `parse_ipc`, `read_ipc`, `write_cmd`, `wait_for`, `normalise_rewards`) are standalone and have no external dependencies — copy them into your own project if needed.
|
||||
|
||||
### Training loop diagram
|
||||
|
||||
```
|
||||
Python C++ (llama-finetune-qlora --grpo-mode)
|
||||
│ │
|
||||
│◄──── [QLORA:READY] ────────────┤ model loaded
|
||||
│ │
|
||||
│ ┌─────────────────────────────┤
|
||||
│ │ for each step: │
|
||||
│ │ ◄── PROMPT_REQ:N ─────────┤
|
||||
│ │ ──► PROMPT <text> ────────► generate n_gen rollouts
|
||||
│ │ ◄── GEN:1/n <text> ──┤
|
||||
│ │ ◄── GEN:2/n <text> ──┤
|
||||
│ │ ... │
|
||||
│ │ ◄── GEN:n/n <text> ──┤
|
||||
│ │ ◄── REWARD_REQ:n ─────────┤
|
||||
│ │ (score generations) │
|
||||
│ │ ──► REWARD a1 a2 … an ────► one backward + AdamW step
|
||||
│ │ ◄── PROGRESS step=N/M … ──┤
|
||||
│ └─────────────────────────────┤
|
||||
│ │
|
||||
│◄──── [QLORA:DONE] ─────────────┤ adapter saved
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Known Limitations & Optimization Roadmap
|
||||
|
||||
### Current limitations
|
||||
|
||||
**1. Full backprop through frozen quantized layers**
|
||||
Every backward step dequantizes all frozen Q4_K_M weight tensors to compute activation gradients (needed to propagate loss from the output back to each LoRA layer). For a 28-layer 1.7B model at `-ub 512`, this is ~280 dequantizing matmuls per step → step time is 3–5× slower than inference.
|
||||
|
||||
**2. Activation VRAM** *(partially addressed by `--grad-checkpoint`)*
|
||||
All forward activations are kept in VRAM throughout the backward pass. VRAM ≈ `model + KV + n_layers × hidden × n_ubatch × 10 × 4B + 2 × lora_params × 4B`. Reducing `-ub` reduces VRAM linearly. Use `--grad-checkpoint 48` to prevent the allocator from reusing intermediate activation buffers during backward, which cuts peak activation VRAM at near-zero compute cost.
|
||||
|
||||
**3. Full backprop through all layers** *(partially addressed by `--freeze-layers`)*
|
||||
Gradients propagate through all layers that have LoRA adapters. Use `--freeze-layers N` to skip LoRA allocation for blk.0..N-1 — those layers receive no gradient (the `grads_needed` pruner already skips their backward ops automatically). Only the top (total_layers - N) layers are trained.
|
||||
|
||||
### Optimization roadmap
|
||||
|
||||
| Priority | Optimization | Expected gain | Status |
|
||||
|---|---|---|---|
|
||||
| ✅ Done | **`--freeze-layers N`** — no LoRA on first N layers; backward auto-pruned | Proportional to N/total | Implemented |
|
||||
| ✅ Done | **`--grad-checkpoint N`** — keep every Nth activation alive through backward | Reduces peak activation VRAM | Implemented |
|
||||
| ✅ Done | **`--train-on-prompt`** — compute loss on prompt tokens too | Configurable loss target | Implemented |
|
||||
| ✅ Done | **`--shuffle-dataset`** — shuffle windows each epoch | Better convergence | Implemented |
|
||||
| ✅ Done | **BOS separators** — insert BOS between concatenated samples | Correct cross-sample boundaries | Implemented |
|
||||
| ✅ Done | **Per-epoch loss summary** — log train/val loss after each epoch | Observability | Implemented |
|
||||
| ✅ Done | **`MUL_MAT_ID` backward** — LoRA on MoE dense FFN layers; `OUT_PROD_ID` for scattered outer product | Unlocks Mixtral/Nemotron-MoE | Implemented |
|
||||
| ✅ Done | **Quantized `OUT_PROD`** — dequantize on GPU + cuBLAS for backward matmul | Full GPU training (no CPU fallback) | Implemented |
|
||||
| ✅ Done | **Reuse `ctx_compute_opt`** — allocate tensor metadata context once, `ggml_reset()` across ubatches | Eliminate ~0.5 s/step overhead | Implemented |
|
||||
| ❌ Skip | **Static training graphs** — KV mask shape changes per ubatch (`n_kv` grows); graph topology not static | Would need KV cache redesign | Not feasible |
|
||||
| Low | **`SSM_SCAN/CONV` backward** — enable LoRA on Mamba SSM layers | Unlocks NemotronH SSM layers | Planned |
|
||||
| Low | **GELU backward** — implement `ggml_gelu_back` kernel (UNARY + GLU) | Support GPT-2/Phi-style models | Planned (needs new CUDA/CPU kernels) |
|
||||
|
||||
---
|
||||
|
||||
## Implementation notes (for developers)
|
||||
|
||||
### Modified llama.cpp files
|
||||
|
||||
| File | Change |
|
||||
|---|---|
|
||||
| `ggml/src/ggml.c` | Backward graph fixes: `GET_ROWS` 3D, `SET_ROWS`, `MUL_MAT_ID`, `SSM_SCAN/CONV`, `FLASH_ATTN_EXT` all stop gradient; inplace-op assert → warn+skip |
|
||||
| `src/llama-context.cpp` | `opt_init`: scheduler and graph sized with inflated capacity before `ggml_opt_init`; `opt_epoch_iter`: per-ubatch timing instrumentation; reward scaling via `g_reward_weights` TLS |
|
||||
| `src/llama-adapter.cpp` | Repack-buft fallback for LoRA tensors: tries device-native buft before CPU |
|
||||
| `common/common.h` | Added `save_every`, `lora_freeze_layers`, `grad_checkpoint_interval`, `train_on_prompt`, `shuffle_dataset` fields |
|
||||
| `common/arg.cpp` | Added `--save-every`, `--freeze-layers`, `--grad-checkpoint`, `--train-on-prompt`, `--shuffle-dataset` arguments |
|
||||
| `include/llama.h` | Added `llama_opt_set_reward_weights()`; `grad_checkpoint_interval` in `llama_opt_params`; `shuffle` param in `llama_opt_epoch` |
|
||||
| `ggml/src/ggml-cuda/out-prod.cu` | `OUT_PROD` with quantized src0 (dequantize on GPU + cuBLAS); `OUT_PROD_ID` for MoE backward |
|
||||
| `ggml/src/ggml-cuda/ggml-cuda.cu` | `supports_op` for quantized `OUT_PROD` and `OUT_PROD_ID`; CPU-resident ids fix in `mul_mat_id` |
|
||||
| `ggml/include/ggml-opt.h` | Added `grad_checkpoint_interval` to `ggml_opt_params` |
|
||||
| `ggml/src/ggml-opt.cpp` | Gradient checkpointing: marks every Nth forward node `GGML_TENSOR_FLAG_OUTPUT` before backward build |
|
||||
|
||||
### Key invariants
|
||||
|
||||
- `params.use_mmap = false` — forced; mmap'd tensors can't have data written back
|
||||
- `params.flash_attn_type = DISABLED` — no backward impl for flash attention
|
||||
- `params.warmup = false` — warmup runs inference with PARAM tensors → segfault
|
||||
- `params.cache_type_k = F32` — training requires F32 KV (or BF16 with `--cache-type-k bf16`)
|
||||
- LoRA A/B tensors are marked `PARAM` via `ggml_set_param` on the tensors loaded by `llama_adapter_lora_init`, not on the pre-init scratch tensors in `lt.buf`
|
||||
- The adapter GGUF is pre-saved and loaded via `params.lora_adapters` BEFORE `common_init_from_params` so that `sched_reserve` includes LoRA graph nodes in its sizing
|
||||
|
||||
### Why opt_init inflation matters
|
||||
|
||||
`ggml_opt_init` captures `sched.get()` at construction time. The backward graph (`gb_grad`, `gb_opt`) is ~3–5× larger than the forward graph in node count. If the scheduler hash_set is sized only for the forward graph, `ggml_backend_sched_alloc_graph` on the backward graph will overflow it. We recreate `sched` with `inflated = fwd_nodes × 4` slots BEFORE calling `ggml_opt_init`.
|
||||
|
||||
### Reward weighting implementation
|
||||
|
||||
`llama_opt_set_reward_weights(weights, n)` sets thread-local `g_reward_weights`. In `opt_epoch`, each window reads `g_reward_weights[idata]` and passes it as `reward_scale` to `opt_epoch_iter`. Inside the iter loop, instead of writing `1.0f` for the correct token's label position in the cross-entropy label tensor, it writes `reward_scale`. Since cross-entropy loss = `-mean(label × log(softmax(logit)))`, scaling the label scales both loss and gradient identically.
|
||||
|
|
@ -0,0 +1,64 @@
|
|||
#!/usr/bin/env python3
|
||||
"""Quick check of LoRA tensor norms in a GGUF file."""
|
||||
import sys, struct, numpy as np
|
||||
|
||||
def read_gguf(path):
|
||||
with open(path, 'rb') as f:
|
||||
assert f.read(4) == b'GGUF'
|
||||
version = struct.unpack('<I', f.read(4))[0]
|
||||
n_tensors, n_kv = struct.unpack('<QQ', f.read(16))
|
||||
# skip KV pairs (simplified - just seek past them)
|
||||
# Read tensor infos
|
||||
tensors = []
|
||||
for _ in range(n_kv):
|
||||
# key
|
||||
klen = struct.unpack('<Q', f.read(8))[0]
|
||||
key = f.read(klen).decode()
|
||||
vtype = struct.unpack('<I', f.read(4))[0]
|
||||
# skip value based on type (simplified)
|
||||
if vtype == 8: # string
|
||||
slen = struct.unpack('<Q', f.read(8))[0]; f.read(slen)
|
||||
elif vtype == 6: # float32
|
||||
f.read(4)
|
||||
elif vtype in (0,1,2,3,4,5,10,11,12): # int types
|
||||
sizes = {0:1,1:1,2:2,3:4,4:8,5:1,10:2,11:4,12:8}
|
||||
f.read(sizes.get(vtype,4))
|
||||
elif vtype == 9: # bool
|
||||
f.read(1)
|
||||
else:
|
||||
print(f"unknown kv type {vtype} for key {key}, stopping"); break
|
||||
|
||||
data_offset = None
|
||||
for i in range(n_tensors):
|
||||
nlen = struct.unpack('<Q', f.read(8))[0]
|
||||
name = f.read(nlen).decode()
|
||||
ndims = struct.unpack('<I', f.read(4))[0]
|
||||
dims = struct.unpack('<' + 'Q'*ndims, f.read(8*ndims))
|
||||
dtype = struct.unpack('<I', f.read(4))[0]
|
||||
offset = struct.unpack('<Q', f.read(8))[0]
|
||||
tensors.append((name, dims, dtype, offset))
|
||||
|
||||
# data section starts after alignment
|
||||
pos = f.tell()
|
||||
align = 32
|
||||
data_start = (pos + align - 1) & ~(align - 1)
|
||||
|
||||
print(f"\nFile: {path}")
|
||||
print(f"Tensors: {n_tensors}")
|
||||
|
||||
for name, dims, dtype, offset in tensors[:10]: # first 10
|
||||
if dtype != 0: # only F32 (type 0)
|
||||
print(f" {name}: dims={dims} type={dtype} (non-F32, skipping norm)")
|
||||
continue
|
||||
f.seek(data_start + offset)
|
||||
n = 1
|
||||
for d in dims: n *= d
|
||||
data = np.frombuffer(f.read(n*4), dtype=np.float32)
|
||||
print(f" {name}: dims={dims} norm={np.linalg.norm(data):.4f} max={np.abs(data).max():.4f} mean={np.abs(data).mean():.6f}")
|
||||
|
||||
if __name__ == '__main__':
|
||||
for p in sys.argv[1:]:
|
||||
try:
|
||||
read_gguf(p)
|
||||
except Exception as e:
|
||||
print(f"Error reading {p}: {e}")
|
||||
File diff suppressed because it is too large
Load Diff
|
|
@ -0,0 +1,399 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
grpo_example.py — Minimal GRPO training loop using llama-finetune-qlora --grpo-mode
|
||||
|
||||
Demonstrates the IPC protocol between the Python driver and the C++ subprocess.
|
||||
No external dependencies required — only Python stdlib.
|
||||
|
||||
Usage:
|
||||
python3 grpo_example.py \
|
||||
--model /path/to/model-q4_k_m.gguf \
|
||||
--lora-out /path/to/output-adapter.gguf \
|
||||
[--lora /path/to/resume-adapter.gguf] \
|
||||
[--binary /path/to/llama-finetune-qlora] \
|
||||
[--n-steps 200] \
|
||||
[--n-gen 8] \
|
||||
[--rank 16]
|
||||
|
||||
IPC Protocol (stdout from C++ process):
|
||||
[QLORA:READY] — process initialised
|
||||
[QLORA:PROMPT_REQ:<step>] — C++ requests a prompt for step N
|
||||
[QLORA:GEN:<k>/<n>] <text> — one generation (newlines escaped as \\n)
|
||||
[QLORA:REWARD_REQ:<n>] — C++ requests N reward scores
|
||||
[QLORA:PROGRESS] step=X/Y loss=Z epoch=A/B
|
||||
[QLORA:CHECKPOINT] <path>
|
||||
[QLORA:DONE] final_loss=X
|
||||
[QLORA:ERROR] <message>
|
||||
|
||||
Python → C++ stdin:
|
||||
PROMPT <escaped_text>
|
||||
REWARD <r1> <r2> ... <rN> (advantages, 0..1 range)
|
||||
STOP (request graceful shutdown)
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import logging
|
||||
import math
|
||||
import re
|
||||
import subprocess
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import List, Optional, Tuple
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format="%(asctime)s [%(levelname)s] %(message)s",
|
||||
)
|
||||
log = logging.getLogger("grpo_example")
|
||||
|
||||
# ──────────────────────────────────────────────────────────────────────────────
|
||||
# IPC helpers
|
||||
# ──────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
_IPC_RE = re.compile(r"^\[QLORA:([A-Z_]+)(?::([^\]]*))?\](.*)$")
|
||||
|
||||
|
||||
def escape(text: str) -> str:
    """Escape newlines and backslashes for single-line IPC transport."""
    # Backslash must be escaped first so the later substitutions are unambiguous.
    for raw, cooked in (("\\", "\\\\"), ("\n", "\\n"), ("\r", "\\r")):
        text = text.replace(raw, cooked)
    return text
|
||||
|
||||
|
||||
def unescape(text: str) -> str:
    """Reverse of escape()."""
    # Escape sequences understood on the wire; anything else after a
    # backslash decodes to the character itself (lenient, like the original).
    decode = {"n": "\n", "r": "\r", "\\": "\\"}
    pieces = []
    pos = 0
    end = len(text)
    while pos < end:
        ch = text[pos]
        if ch == "\\" and pos + 1 < end:
            nxt = text[pos + 1]
            pieces.append(decode.get(nxt, nxt))
            pos += 2
        else:
            # Literal character, or a trailing lone backslash.
            pieces.append(ch)
            pos += 1
    return "".join(pieces)
|
||||
|
||||
|
||||
def parse_ipc(line: str) -> Optional[Tuple[str, str, str]]:
    """
    Parse an IPC line into (msg_type, seq, payload).
    Returns None for non-IPC lines (model output, log lines, etc.).
    """
    match = re.match(r"^\[QLORA:([A-Z_]+)(?::([^\]]*))?\](.*)$", line.strip())
    if match is None:
        return None
    msg_type = match.group(1)
    seq = match.group(2) or ""
    payload = match.group(3).strip()
    return msg_type, seq, payload
|
||||
|
||||
|
||||
def read_ipc(proc: subprocess.Popen, timeout: float = 120.0) -> Optional[Tuple[str, str, str]]:
    """
    Read lines from proc.stdout until an IPC message arrives.

    Non-IPC lines (model output, C++ logs leaked to stdout) are echoed to
    stderr so the user can still see them.  Returns None on EOF.
    Raises TimeoutError if nothing arrives within `timeout` seconds.

    NOTE(review): the timeout is checked between lines only — a single
    blocking readline() is not interrupted.
    """
    assert proc.stdout is not None
    deadline = time.monotonic() + timeout
    while deadline > time.monotonic():
        line = proc.stdout.readline()
        if not line:
            return None  # EOF — the subprocess closed its stdout

        line = line.rstrip("\n")
        msg = parse_ipc(line)
        if msg is not None:
            return msg
        # Non-IPC — C++ sometimes leaks timing/debug lines to stdout.
        # Forward them so the user can see what's happening.
        print(f"  [cpp] {line}", file=sys.stderr)
    raise TimeoutError(f"No IPC message within {timeout:.0f}s")
|
||||
|
||||
|
||||
def write_cmd(proc: subprocess.Popen, cmd: str):
    """Write one command line to the subprocess stdin."""
    assert proc.stdin is not None
    try:
        proc.stdin.write(f"{cmd}\n")
        proc.stdin.flush()
    except BrokenPipeError:
        # A closed pipe almost always means the child died mid-step.
        raise RuntimeError("C++ subprocess stdin closed — did it crash?")
|
||||
|
||||
|
||||
def wait_for(proc: subprocess.Popen, expected: str, timeout: float = 120.0) -> Tuple[str, str, str]:
    """Block until the expected IPC message type arrives."""
    deadline = time.monotonic() + timeout
    while True:
        budget = deadline - time.monotonic()
        if budget <= 0:
            raise TimeoutError(f"Timed out waiting for [{expected}]")
        msg = read_ipc(proc, timeout=budget)
        if msg is None:
            raise RuntimeError(f"Subprocess exited before sending [{expected}]")
        msg_type, seq, payload = msg
        if msg_type == expected:
            return msg_type, seq, payload
        # Other message types can legitimately arrive first; keep draining.
        log.debug("Ignoring unexpected IPC (%s) while waiting for %s", msg_type, expected)
|
||||
|
||||
|
||||
# ──────────────────────────────────────────────────────────────────────────────
|
||||
# Advantage normalisation (GRPO)
|
||||
# ──────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
def normalise_rewards(rewards: List[float]) -> List[float]:
    """
    Group-relative advantage normalisation: subtract mean, divide by std.
    Clipped to [0, 1] so the C++ side always receives values in that range.

    All-equal rewards → uniform 0.5 (no signal, but no NaN either).
    """
    if not rewards:
        return []

    count = len(rewards)
    mean = sum(rewards) / count
    variance = sum((r - mean) ** 2 for r in rewards) / count
    # Degenerate (all-equal) groups: divide by 1 so every z-score is 0.
    std = math.sqrt(variance) if variance > 1e-8 else 1.0

    # Map z-scores (typically within [-3, +3]) into [0, 1], clipping the tails.
    return [max(0.0, min(1.0, 0.5 + ((r - mean) / std) / 6.0)) for r in rewards]
|
||||
|
||||
|
||||
# ──────────────────────────────────────────────────────────────────────────────
|
||||
# Example prompt / reward providers
|
||||
# ──────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
# Replace these with your own logic.
|
||||
|
||||
_EXAMPLE_PROMPTS = [
|
||||
"Explain the concept of gradient descent in one sentence.",
|
||||
"What is the capital of France?",
|
||||
"Write a haiku about machine learning.",
|
||||
"Describe the difference between SFT and RLHF.",
|
||||
"What does GRPO stand for?",
|
||||
]
|
||||
|
||||
|
||||
def get_prompt(step: int) -> str:
    """Return a prompt for the given training step (0-indexed), cycling the list."""
    idx = step % len(_EXAMPLE_PROMPTS)
    return _EXAMPLE_PROMPTS[idx]
|
||||
|
||||
|
||||
def score_generations(prompt: str, generations: List[str]) -> List[float]:
    """
    Score a list of model generations for the given prompt.
    Returns a list of raw reward scores (any numeric range; will be normalised).

    This example uses a trivial heuristic: longer, more varied responses
    score higher. Replace with your actual reward model / verifier.
    """
    def _heuristic(gen: str) -> float:
        # Length saturates at 50 words; diversity is the unique-word ratio.
        words = gen.split()
        length_score = min(1.0, len(words) / 50.0)
        vocab_score = min(1.0, len(set(words)) / max(1, len(words)))
        return 0.6 * length_score + 0.4 * vocab_score

    return [_heuristic(g) for g in generations]
|
||||
|
||||
|
||||
# ──────────────────────────────────────────────────────────────────────────────
|
||||
# Main GRPO loop
|
||||
# ──────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
def run_grpo(args: argparse.Namespace):
    """
    Launch llama-finetune-qlora in --grpo-mode and drive the GRPO IPC loop.

    Builds the subprocess command line from `args`, spawns the binary with
    piped stdin/stdout (stderr is passed through), runs `_grpo_loop`, and
    always shuts the child down cleanly in the `finally` block.
    Exits the process with status 1 if the binary path does not exist.
    """
    # Resolve binary
    binary = Path(args.binary)
    if not binary.exists():
        log.error("Binary not found: %s", binary)
        sys.exit(1)

    # Build command
    cmd = [
        str(binary),
        "--model", args.model,
        "--lora-out", args.lora_out,
        "--lora-rank", str(args.rank),
        "--lora-alpha", str(args.rank // 2),
        "-c", str(args.ctx_size),
        "-b", str(args.ctx_size),
        "-ub", "512",
        "-ngl", str(args.ngl),
        "-lr", str(args.lr),
        "--seed", str(args.seed),
        "--grad-checkpoint", "48",
        "--shuffle-dataset",
        "--grpo-mode",
        "--n-gen", str(args.n_gen),
        "--n-steps", str(args.n_steps),
        "--grpo-temp", str(args.temperature),
        "--grpo-max-tokens", str(args.max_tokens),
    ]

    if args.lora:
        cmd += ["--lora", args.lora]

    if args.save_every > 0:
        cmd += ["--save-every", str(args.save_every)]

    log.info("Launching: %s", " ".join(cmd))

    proc = subprocess.Popen(
        cmd,
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
        stderr=sys.stderr,  # C++ debug/timing logs go directly to our stderr
        text=True,
        bufsize=1,          # line-buffered — IPC is line-oriented
    )

    try:
        _grpo_loop(proc, args)
    except KeyboardInterrupt:
        log.info("Interrupted — requesting graceful stop")
        try:
            write_cmd(proc, "STOP")
        except Exception:
            pass
    except Exception as e:
        log.error("GRPO loop error: %s", e)
        proc.kill()
        raise
    finally:
        # Close stdin so the child sees EOF and can exit its read loop.
        try:
            if proc.stdin is not None:
                proc.stdin.close()
        except Exception:
            pass
        # FIX: wait(timeout=...) raises TimeoutExpired if the child hangs;
        # previously this escaped the finally block and left a live child.
        try:
            rc = proc.wait(timeout=30)
        except subprocess.TimeoutExpired:
            log.warning("Subprocess did not exit within 30s — killing it")
            proc.kill()
            rc = proc.wait()
        # FIX: Popen.wait() never returns None, so compare against 0 directly.
        if rc != 0:
            log.warning("Subprocess exited with code %d", rc)
|
||||
|
||||
|
||||
def _grpo_loop(proc: subprocess.Popen, args: argparse.Namespace):
    """Drive the IPC state machine until the C++ process finishes or exits."""
    # ── Wait for READY ──────────────────────────────────────────────────────
    log.info("Waiting for subprocess to initialise (model load can take a minute)…")
    wait_for(proc, "READY", timeout=300)
    log.info("Subprocess ready.")

    current_prompt: str = ""
    generations: List[str] = []
    step = 0

    while True:
        msg = read_ipc(proc, timeout=600)
        if msg is None:
            log.info("Subprocess exited (EOF).")
            break

        msg_type, seq, payload = msg

        if msg_type == "PROMPT_REQ":
            # New step: pick a prompt and reset the generation buffer.
            step = int(seq) if seq else step + 1
            current_prompt = get_prompt(step - 1)
            generations = []
            log.debug("Step %d — sending prompt: %s", step, current_prompt[:60])
            write_cmd(proc, f"PROMPT {escape(current_prompt)}")

        elif msg_type == "GEN":
            # seq is "k/n"; fall back to the configured group size if n is absent.
            parts = seq.split("/")
            gen_idx = int(parts[0])
            gen_total = int(parts[1]) if len(parts) > 1 else args.n_gen
            decoded = unescape(payload)
            generations.append(decoded)
            log.debug("  Generation %d/%d: %s…", gen_idx, gen_total, decoded[:60].replace("\n", "↵"))

        elif msg_type == "REWARD_REQ":
            n_expected = int(seq) if seq else len(generations)
            if len(generations) != n_expected:
                log.warning(
                    "REWARD_REQ asked for %d rewards but collected %d generations",
                    n_expected, len(generations),
                )

            raw_rewards = score_generations(current_prompt, generations)
            advantages = normalise_rewards(raw_rewards)

            reward_str = " ".join(f"{a:.6f}" for a in advantages)
            log.debug("  Rewards (raw): %s", [f"{r:.3f}" for r in raw_rewards])
            log.debug("  Advantages:    %s", [f"{a:.3f}" for a in advantages])
            write_cmd(proc, f"REWARD {reward_str}")

        elif msg_type == "PROGRESS":
            # Payload format: step=X/Y loss=Z epoch=A/B
            sm = re.search(r"step=(\d+)(?:/(\d+))?", payload)
            lm = re.search(r"loss=([\d.]+)", payload)
            step_str = f"{sm.group(1)}/{sm.group(2)}" if sm and sm.group(2) else (sm.group(1) if sm else "?")
            loss_str = lm.group(1) if lm else "?"
            print(f"  step {step_str}  loss {loss_str}", flush=True)

        elif msg_type == "CHECKPOINT":
            log.info("Checkpoint saved: %s", payload.strip())

        elif msg_type == "DONE":
            m = re.search(r"final_loss=([\d.]+)", payload)
            loss = m.group(1) if m else "?"
            log.info("Training complete. final_loss=%s", loss)
            break

        elif msg_type == "ERROR":
            log.error("C++ process error: %s", payload.strip())
            raise RuntimeError(f"Training failed: {payload.strip()}")

        else:
            log.debug("Unknown IPC message: [%s] seq=%r payload=%r", msg_type, seq, payload)
|
||||
|
||||
|
||||
# ──────────────────────────────────────────────────────────────────────────────
|
||||
# CLI
|
||||
# ──────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
def parse_args() -> argparse.Namespace:
    """Parse command-line options for the GRPO driver."""
    # Default binary: build/bin/ relative to this script's repo root
    here = Path(__file__).resolve().parent
    repo_root = here.parents[1]  # examples/qlora_training → llama.cpp root
    default_bin = repo_root / "build" / "bin" / "llama-finetune-qlora"

    parser = argparse.ArgumentParser(
        description="Minimal GRPO training loop via llama-finetune-qlora --grpo-mode",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    # Paths
    parser.add_argument("--model", required=True, help="Base GGUF model path")
    parser.add_argument("--lora-out", required=True, help="Output adapter GGUF path")
    parser.add_argument("--lora", default=None, help="Resume from existing adapter GGUF")
    parser.add_argument("--binary", default=str(default_bin), help="Path to llama-finetune-qlora binary")
    # Training hyper-parameters
    parser.add_argument("--rank", type=int, default=16, help="LoRA rank")
    parser.add_argument("--n-steps", type=int, default=200, help="Number of GRPO steps")
    parser.add_argument("--n-gen", type=int, default=8, help="Generations per prompt")
    parser.add_argument("--lr", type=float, default=1e-4, help="Learning rate")
    parser.add_argument("--ctx-size", type=int, default=4096, help="Context window")
    parser.add_argument("--ngl", type=int, default=999, help="GPU layers (-ngl)")
    # Sampling / checkpointing / misc
    parser.add_argument("--temperature", type=float, default=0.8, help="Sampling temperature")
    parser.add_argument("--max-tokens", type=int, default=512, help="Max tokens per generation")
    parser.add_argument("--save-every", type=int, default=0, help="Save checkpoint every N steps (0=off)")
    parser.add_argument("--seed", type=int, default=42, help="RNG seed")
    parser.add_argument("--verbose", action="store_true", help="Enable DEBUG logging")
    return parser.parse_args()
|
||||
|
||||
|
||||
if __name__ == "__main__":
    cli_args = parse_args()
    # --verbose flips the root logger so log.debug lines become visible.
    if cli_args.verbose:
        logging.getLogger().setLevel(logging.DEBUG)
    run_grpo(cli_args)
|
||||
|
|
@ -0,0 +1,7 @@
|
|||
{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "What is the capital of France?"}, {"role": "assistant", "content": "The capital of France is Paris."}]}
|
||||
{"messages": [{"role": "user", "content": "Explain gradient descent in one sentence."}, {"role": "assistant", "content": "Gradient descent iteratively adjusts model parameters in the direction that most reduces the loss function."}]}
|
||||
{"messages": [{"role": "system", "content": "You are a concise coding assistant."}, {"role": "user", "content": "Write a Python function that returns the factorial of n."}, {"role": "assistant", "content": "```python\ndef factorial(n):\n return 1 if n <= 1 else n * factorial(n - 1)\n```"}]}
|
||||
{"prompt": "What is supervised fine-tuning?", "response": "Supervised fine-tuning (SFT) trains a pre-trained model on labelled examples of the desired behaviour, adjusting weights to minimise cross-entropy loss on the target responses."}
|
||||
{"prompt": "What is LoRA?", "response": "LoRA (Low-Rank Adaptation) fine-tunes large models by injecting trainable low-rank matrices into frozen weight layers, dramatically reducing the number of trainable parameters."}
|
||||
{"text": "The transformer architecture, introduced in 'Attention Is All You Need' (Vaswani et al., 2017), relies entirely on self-attention mechanisms and has become the foundation of modern large language models."}
|
||||
{"text": "QLoRA combines quantization with LoRA: the base model is loaded in 4-bit precision (frozen), while only the small LoRA adapter weights are trained in full precision."}
|
||||
|
|
@ -0,0 +1,6 @@
|
|||
{"messages": [{"role": "user", "content": "Explain what a neural network is."}, {"role": "assistant", "content": "A neural network is a computational model loosely inspired by biological neurons, consisting of layers of interconnected nodes that learn to map inputs to outputs through training."}], "reward": 0.9}
|
||||
{"messages": [{"role": "user", "content": "Explain what a neural network is."}, {"role": "assistant", "content": "It's a thing that does AI stuff with nodes and layers."}], "reward": 0.2}
|
||||
{"messages": [{"role": "user", "content": "What is backpropagation?"}, {"role": "assistant", "content": "Backpropagation computes gradients of the loss with respect to each parameter by applying the chain rule backwards through the computation graph, enabling gradient descent to update all weights efficiently."}], "reward": 1.0}
|
||||
{"messages": [{"role": "user", "content": "What is backpropagation?"}, {"role": "assistant", "content": "Backpropagation is how neural networks learn by going backwards."}], "reward": 0.3}
|
||||
{"prompt": "Describe overfitting in machine learning.", "response": "Overfitting occurs when a model learns the training data too closely—capturing noise and idiosyncrasies—resulting in poor generalisation to unseen examples. Regularisation, dropout, and early stopping are common mitigations.", "reward": 0.95}
|
||||
{"prompt": "Describe overfitting in machine learning.", "response": "Overfitting is when the model is too good at training data but bad at new data.", "reward": 0.4}
|
||||
|
|
@ -73,6 +73,7 @@ int main(int argc, char ** argv) {
|
|||
/*get_opt_pars =*/common_opt_lr_pars,
|
||||
/*get_opt_pars_ud =*/¶ms.lr,
|
||||
/*optimizer_type =*/params.optimizer,
|
||||
/*grad_checkpoint_interval =*/params.grad_checkpoint_interval,
|
||||
};
|
||||
llama_opt_init(ctx, model, lopt_params);
|
||||
|
||||
|
|
@ -83,7 +84,7 @@ int main(int argc, char ** argv) {
|
|||
|
||||
for (lr.epoch = 0; lr.epoch < lr.epochs; ++lr.epoch) {
|
||||
llama_opt_epoch(ctx, dataset, result_train, result_eval, idata_split,
|
||||
ggml_opt_epoch_callback_progress_bar, ggml_opt_epoch_callback_progress_bar);
|
||||
ggml_opt_epoch_callback_progress_bar, ggml_opt_epoch_callback_progress_bar, /*shuffle=*/false);
|
||||
fprintf(stderr, "\n");
|
||||
|
||||
ggml_opt_result_reset(result_train);
|
||||
|
|
|
|||
|
|
@ -89,6 +89,7 @@ extern "C" {
|
|||
float beta2; // second AdamW momentum
|
||||
float eps; // epsilon for numerical stability
|
||||
float wd; // weight decay - 0.0f to disable
|
||||
float gclip; // element-wise gradient clipping threshold - 0.0f to disable
|
||||
} adamw;
|
||||
struct {
|
||||
float alpha; // learning rate
|
||||
|
|
@ -125,6 +126,13 @@ extern "C" {
|
|||
ggml_opt_get_optimizer_params get_opt_pars; // callback for calculating optimizer parameters
|
||||
void * get_opt_pars_ud; // userdata for calculating optimizer parameters
|
||||
|
||||
// Gradient checkpointing: keep the output of every Nth forward node alive through
|
||||
// the backward pass so the allocator cannot reuse its memory for other tensors.
|
||||
// This trades compute for VRAM — intermediate activations between checkpoints are
|
||||
// freed and recomputed during the backward pass by the existing graph structure.
|
||||
// Set to 0 (default) to disable. A value of ~32–64 cuts activation VRAM by ~50%.
|
||||
int32_t grad_checkpoint_interval;
|
||||
|
||||
// only GGML_OPT_OPTIMIZER_TYPE_ADAMW needs m, v momenta per parameter tensor
|
||||
enum ggml_opt_optimizer_type optimizer;
|
||||
};
|
||||
|
|
|
|||
|
|
@ -503,6 +503,7 @@ extern "C" {
|
|||
GGML_OP_MUL_MAT,
|
||||
GGML_OP_MUL_MAT_ID,
|
||||
GGML_OP_OUT_PROD,
|
||||
GGML_OP_OUT_PROD_ID, // scattered outer-product for MUL_MAT_ID backward (MoE LoRA)
|
||||
|
||||
GGML_OP_SCALE,
|
||||
GGML_OP_SET,
|
||||
|
|
@ -1426,6 +1427,21 @@ extern "C" {
|
|||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b);
|
||||
|
||||
// Scattered outer-product for the MUL_MAT_ID backward pass (MoE LoRA gradient).
|
||||
//
|
||||
// a: [cols, n_expert_used, n_tokens] F32 — activations
|
||||
// b: [rows, n_expert_used, n_tokens] F32 — upstream gradient
|
||||
// ids: [n_expert_used, n_tokens] I32 — expert dispatch indices
|
||||
// result: [cols, rows, n_expert, 1] F32
|
||||
//
|
||||
// result[:, :, e] += sum_{(i,t): ids[i,t]==e} a[:, i, t] ⊗ b[:, i, t]
|
||||
GGML_API struct ggml_tensor * ggml_out_prod_id(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b,
|
||||
struct ggml_tensor * ids,
|
||||
int64_t n_expert);
|
||||
|
||||
//
|
||||
// operations on tensors without backpropagation
|
||||
//
|
||||
|
|
|
|||
|
|
@ -11046,7 +11046,7 @@ static void ggml_compute_forward_opt_step_adamw_f32(
|
|||
GGML_ASSERT(ggml_are_same_shape(src0, src0_grad));
|
||||
GGML_ASSERT(ggml_are_same_shape(src0, src0_grad_m));
|
||||
GGML_ASSERT(ggml_are_same_shape(src0, src0_grad_v));
|
||||
GGML_ASSERT(ggml_nelements(adamw_params) == 7);
|
||||
GGML_ASSERT(ggml_nelements(adamw_params) == 8);
|
||||
|
||||
const int ith = params->ith;
|
||||
const int nth = params->nth;
|
||||
|
|
@ -11072,6 +11072,7 @@ static void ggml_compute_forward_opt_step_adamw_f32(
|
|||
const float wd = adamw_params_ptr[4];
|
||||
const float beta1h = adamw_params_ptr[5];
|
||||
const float beta2h = adamw_params_ptr[6];
|
||||
const float gclip = adamw_params_ptr[7]; // element-wise gradient clip (0 = disabled)
|
||||
const float keep = 1.f - alpha * wd;
|
||||
for (int ir = ir0; ir < ir1; ++ir) {
|
||||
const int64_t i03 = ir/(ne02*ne01);
|
||||
|
|
@ -11086,8 +11087,10 @@ static void ggml_compute_forward_opt_step_adamw_f32(
|
|||
float * v = (float *) ((char *) src0_grad_v->data + offset);
|
||||
|
||||
for (int i00 = 0; i00 < ne00; ++i00) {
|
||||
m[i00] = m[i00]*beta1 + g[i00]*(1.0f - beta1);
|
||||
v[i00] = v[i00]*beta2 + g[i00]*g[i00]*(1.0f - beta2);
|
||||
const float gi = (gclip > 0.0f) ? fmaxf(-gclip, fminf(gclip, g[i00])) : g[i00];
|
||||
|
||||
m[i00] = m[i00]*beta1 + gi*(1.0f - beta1);
|
||||
v[i00] = v[i00]*beta2 + gi*gi*(1.0f - beta2);
|
||||
|
||||
const float mh = m[i00]*beta1h;
|
||||
const float vh = sqrtf(v[i00]*beta2h) + eps;
|
||||
|
|
|
|||
|
|
@ -2680,6 +2680,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
|
|||
case GGML_OP_OUT_PROD:
|
||||
ggml_cuda_out_prod(ctx, dst);
|
||||
break;
|
||||
case GGML_OP_OUT_PROD_ID:
|
||||
ggml_cuda_out_prod_id(ctx, dst);
|
||||
break;
|
||||
case GGML_OP_SCALE:
|
||||
ggml_cuda_op_scale(ctx, dst);
|
||||
break;
|
||||
|
|
@ -4803,7 +4806,15 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
|
|||
}
|
||||
} break;
|
||||
case GGML_OP_OUT_PROD:
|
||||
return op->type == GGML_TYPE_F32 && op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32;
|
||||
return op->type == GGML_TYPE_F32
|
||||
&& (op->src[0]->type == GGML_TYPE_F32 || ggml_is_quantized(op->src[0]->type))
|
||||
&& op->src[1]->type == GGML_TYPE_F32;
|
||||
case GGML_OP_OUT_PROD_ID:
|
||||
return op->src[0] != nullptr && op->src[1] != nullptr && op->src[2] != nullptr
|
||||
&& op->type == GGML_TYPE_F32
|
||||
&& op->src[0]->type == GGML_TYPE_F32
|
||||
&& op->src[1]->type == GGML_TYPE_F32
|
||||
&& op->src[2]->type == GGML_TYPE_I32;
|
||||
case GGML_OP_GET_ROWS:
|
||||
{
|
||||
switch (op->src[0]->type) {
|
||||
|
|
|
|||
|
|
@ -20,8 +20,9 @@ static __global__ void opt_step_adamw_f32(
|
|||
const float wd = pars[4];
|
||||
const float beta1h = pars[5];
|
||||
const float beta2h = pars[6];
|
||||
const float gclip = pars[7]; // element-wise gradient clip (0 = disabled)
|
||||
|
||||
const float gi = g[i];
|
||||
const float gi = (gclip > 0.0f) ? fmaxf(-gclip, fminf(gclip, g[i])) : g[i];
|
||||
const float gmi = g_m[i]*beta1 + gi*(1.0f - beta1);
|
||||
const float gvi = g_v[i]*beta2 + gi*gi*(1.0f - beta2);
|
||||
|
||||
|
|
@ -62,7 +63,7 @@ void ggml_cuda_opt_step_adamw(ggml_backend_cuda_context & ctx, ggml_tensor * dst
|
|||
GGML_ASSERT(ggml_are_same_shape(src0, src0_grad));
|
||||
GGML_ASSERT(ggml_are_same_shape(src0, src0_grad_m));
|
||||
GGML_ASSERT(ggml_are_same_shape(src0, src0_grad_v));
|
||||
GGML_ASSERT(ggml_nelements(adamw_params) == 7);
|
||||
GGML_ASSERT(ggml_nelements(adamw_params) == 8);
|
||||
|
||||
float * src0_d = (float *) src0->data;
|
||||
const float * src0_grad_d = (const float *) src0_grad->data;
|
||||
|
|
|
|||
|
|
@ -1,6 +1,9 @@
|
|||
#include "out-prod.cuh"
|
||||
#include "convert.cuh"
|
||||
|
||||
#include <cstdint>
|
||||
#include <cstring>
|
||||
#include <vector>
|
||||
|
||||
void ggml_cuda_out_prod(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
|
|
@ -8,7 +11,7 @@ void ggml_cuda_out_prod(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
|||
|
||||
GGML_TENSOR_BINARY_OP_LOCALS
|
||||
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32 || ggml_is_quantized(src0->type));
|
||||
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
||||
|
||||
|
|
@ -22,19 +25,37 @@ void ggml_cuda_out_prod(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
|||
GGML_ASSERT(ne2 == src1->ne[2]);
|
||||
GGML_ASSERT(ne3 == src1->ne[3]);
|
||||
|
||||
const float * src0_d = (const float *) src0->data;
|
||||
const float * src1_d = (const float *) src1->data;
|
||||
float * dst_d = (float *) dst->data;
|
||||
|
||||
cudaStream_t stream = ctx.stream();
|
||||
cublasHandle_t handle = ctx.cublas_handle();
|
||||
|
||||
// If src0 is quantized, dequantize to a temp F32 buffer on GPU
|
||||
ggml_cuda_pool_alloc<float> src0_f32_alloc;
|
||||
const float * src0_d;
|
||||
int64_t lda;
|
||||
|
||||
if (src0->type != GGML_TYPE_F32) {
|
||||
const int64_t n_elements = ggml_nelements(src0);
|
||||
src0_f32_alloc.alloc(ctx.pool(), n_elements);
|
||||
|
||||
to_fp32_cuda_t to_fp32 = ggml_get_to_fp32_cuda(src0->type);
|
||||
GGML_ASSERT(to_fp32 != nullptr);
|
||||
to_fp32(src0->data, src0_f32_alloc.ptr, n_elements, stream);
|
||||
|
||||
src0_d = src0_f32_alloc.ptr;
|
||||
lda = ne00; // dequantized data is contiguous: stride = ne00
|
||||
} else {
|
||||
src0_d = (const float *) src0->data;
|
||||
lda = nb01 / sizeof(float);
|
||||
}
|
||||
|
||||
const float * src1_d = (const float *) src1->data;
|
||||
float * dst_d = (float *) dst->data;
|
||||
|
||||
const float alpha = 1.0f;
|
||||
const float beta = 0.0f;
|
||||
|
||||
CUBLAS_CHECK(cublasSetStream(handle, stream));
|
||||
|
||||
const int64_t lda = nb01 / sizeof(float);
|
||||
const int64_t ldc = nb1 / sizeof(float);
|
||||
|
||||
const bool src1_T = ggml_is_transposed(src1);
|
||||
|
|
@ -42,9 +63,9 @@ void ggml_cuda_out_prod(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
|||
const int64_t ldb = (src1_T ? nb10 : nb11) / sizeof(float);
|
||||
GGML_ASSERT( (src1_T ? nb11 : nb10) == sizeof(float));
|
||||
|
||||
// data strides in dimensions 2/3
|
||||
const size_t s02 = nb02 / sizeof(float);
|
||||
const size_t s03 = nb03 / sizeof(float);
|
||||
// data strides in dimensions 2/3 (for dequantized src0, use element-based strides)
|
||||
const size_t s02 = (src0->type != GGML_TYPE_F32) ? (ne00 * ne01) : (nb02 / sizeof(float));
|
||||
const size_t s03 = (src0->type != GGML_TYPE_F32) ? (ne00 * ne01 * ne02) : (nb03 / sizeof(float));
|
||||
const size_t s12 = nb12 / sizeof(float);
|
||||
const size_t s13 = nb13 / sizeof(float);
|
||||
const size_t s2 = nb2 / sizeof(float);
|
||||
|
|
@ -66,3 +87,115 @@ void ggml_cuda_out_prod(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ggml_cuda_out_prod_id
//
// Scattered outer-product for the MUL_MAT_ID backward pass (gradient w.r.t. expert weights).
//
// src0 = a   [cols, n_expert_used, n_tokens] F32 — token activations
// src1 = b   [rows, n_expert_used, n_tokens] F32 — upstream gradient
// src2 = ids [n_expert_used, n_tokens]       I32 — expert dispatch indices
// dst        [cols, rows, n_expert, 1]       F32 — gradient w.r.t. expert weight matrices
//
// dst[:, :, e] += sum_{(i,t): ids[i,t]==e} a[:, i, t] ⊗ b[:, i, t]
//
// Algorithm:
//   For each expert e: gather the token columns where ids[i,t]==e into contiguous
//   GPU buffers, then use cublasSgemm (beta=1) to accumulate the outer product.
//   ids may be CPU-resident (common in backward graphs where they are leaf tensors).
void ggml_cuda_out_prod_id(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * src0 = dst->src[0]; // a   [cols, n_exp_used, n_tokens]
    const ggml_tensor * src1 = dst->src[1]; // b   [rows, n_exp_used, n_tokens]
    const ggml_tensor * ids  = dst->src[2]; // ids [n_exp_used, n_tokens] i32

    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT(src1->type == GGML_TYPE_F32);
    GGML_ASSERT(ids->type  == GGML_TYPE_I32);
    GGML_ASSERT(dst->type  == GGML_TYPE_F32);

    const int64_t cols       = src0->ne[0];
    const int64_t n_exp_used = src0->ne[1];
    const int64_t n_tokens   = src0->ne[2];
    const int64_t rows       = src1->ne[0];
    const int64_t n_expert   = dst->ne[2];

    cudaStream_t   stream = ctx.stream();
    cublasHandle_t handle = ctx.cublas_handle();
    CUBLAS_CHECK(cublasSetStream(handle, stream));

    // Zero destination tensor before accumulating (the GEMMs below use beta=1).
    CUDA_CHECK(cudaMemsetAsync(dst->data, 0, ggml_nbytes(dst), stream));

    // Read ids to host — ids may be CPU-resident (backward graph leaf) or GPU-resident.
    // The device→host copy requires a stream sync so the values are valid to inspect.
    const size_t ids_nbytes = ggml_nbytes(ids);
    std::vector<char> ids_host(ids_nbytes);
    if (ids->buffer && !ggml_backend_buffer_is_host(ids->buffer)) {
        // GPU-resident: copy to host and synchronize so we can inspect the values
        CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids->data, ids_nbytes, cudaMemcpyDeviceToHost, stream));
        CUDA_CHECK(cudaStreamSynchronize(stream));
    } else {
        memcpy(ids_host.data(), ids->data, ids_nbytes);
    }

    // Build per-expert token list: expert_tokens[e] = list of flat indices (iexp*n_tokens+itok)
    // whose dispatch id equals e.  The byte strides ids->nb[] remain valid on the
    // host copy because it is a byte-for-byte snapshot of the original buffer.
    std::vector<std::vector<int64_t>> expert_tokens(n_expert);
    for (int64_t itok = 0; itok < n_tokens; ++itok) {
        for (int64_t iexp = 0; iexp < n_exp_used; ++iexp) {
            const int32_t eid = *(const int32_t *)(ids_host.data()
                + itok * ids->nb[1] + iexp * ids->nb[0]);
            GGML_ASSERT(eid >= 0 && eid < (int32_t)n_expert);
            expert_tokens[eid].push_back(iexp * n_tokens + itok);
        }
    }

    // Strides (in elements, not bytes)
    const int64_t a_stride_exp = src0->nb[1] / sizeof(float); // cols
    const int64_t a_stride_tok = src0->nb[2] / sizeof(float); // cols * n_exp_used
    const int64_t b_stride_exp = src1->nb[1] / sizeof(float); // rows
    const int64_t b_stride_tok = src1->nb[2] / sizeof(float); // rows * n_exp_used
    const int64_t dst_stride_e = dst->nb[2] / sizeof(float);  // cols * rows

    const float alpha_one = 1.0f;
    const float beta_acc  = 1.0f; // accumulate — dst is already zeroed above

    const float * a_base = (const float *) src0->data;
    const float * b_base = (const float *) src1->data;
    float       * d_base = (float       *) dst->data;

    for (int64_t e = 0; e < n_expert; ++e) {
        const auto & toks = expert_tokens[e];
        if (toks.empty()) {
            // No tokens routed to this expert this batch: its gradient stays zero.
            continue;
        }

        const int64_t ntoks_e = (int64_t) toks.size();

        // Allocate contiguous gather buffers on GPU: a_e [cols, ntoks_e], b_e [rows, ntoks_e]
        ggml_cuda_pool_alloc<float> a_gathered(ctx.pool(), cols * ntoks_e);
        ggml_cuda_pool_alloc<float> b_gathered(ctx.pool(), rows * ntoks_e);

        // Gather token vectors from GPU src0/src1 into contiguous buffers.
        // NOTE(review): one D2D memcpy per (expert, token) pair — correct but
        // potentially slow for large batches; a gather kernel would batch this.
        for (int64_t ti = 0; ti < ntoks_e; ++ti) {
            const int64_t flat = toks[ti];
            const int64_t iexp = flat / n_tokens;
            const int64_t itok = flat % n_tokens;
            CUDA_CHECK(cudaMemcpyAsync(
                a_gathered.ptr + ti * cols,
                a_base + iexp * a_stride_exp + itok * a_stride_tok,
                cols * sizeof(float), cudaMemcpyDeviceToDevice, stream));
            CUDA_CHECK(cudaMemcpyAsync(
                b_gathered.ptr + ti * rows,
                b_base + iexp * b_stride_exp + itok * b_stride_tok,
                rows * sizeof(float), cudaMemcpyDeviceToDevice, stream));
        }

        // dst[:, :, e] += a_gathered @ b_gathered^T
        // cuBLAS column-major: A=[cols, ntoks_e] lda=cols, B=[rows, ntoks_e] ldb=rows
        CUBLAS_CHECK(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_T,
                (int)cols, (int)rows, (int)ntoks_e,
                &alpha_one, a_gathered.ptr, (int)cols,
                            b_gathered.ptr, (int)rows,
                &beta_acc,  d_base + e*dst_stride_e, (int)cols));
    }
}
|
||||
|
|
|
|||
|
|
@ -1,3 +1,10 @@
|
|||
#include "common.cuh"
|
||||
|
||||
void ggml_cuda_out_prod(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
|
||||
// Scattered outer-product for MUL_MAT_ID backward (gradient w.r.t. expert weight matrices).
|
||||
// src0: activations [cols, n_expert_used, n_tokens] F32
|
||||
// src1: grad_output [rows, n_expert_used, n_tokens] F32
|
||||
// src2: expert ids [n_expert_used, n_tokens] I32 (may be CPU-resident)
|
||||
// dst: grad_weight [cols, rows, n_expert, 1] F32
|
||||
void ggml_cuda_out_prod_id(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
|
|
|
|||
|
|
@ -58,10 +58,13 @@ struct ggml_opt_context {
|
|||
std::vector<struct ggml_tensor *> grad_accs;
|
||||
std::vector<struct ggml_tensor *> grad_m;
|
||||
std::vector<struct ggml_tensor *> grad_v;
|
||||
std::vector<ggml_backend_buffer_t> bufs_momenta; // per-param moment buffers (one per param node)
|
||||
std::vector<struct ggml_context *> ctxs_momenta; // corresponding ggml contexts (keep alive for tensor metadata)
|
||||
|
||||
int64_t iter = 1;
|
||||
int32_t opt_period = 1;
|
||||
int32_t opt_i = 0;
|
||||
int32_t grad_checkpoint_interval = 0;
|
||||
bool loss_per_datapoint = false;
|
||||
|
||||
ggml_opt_get_optimizer_params get_opt_pars = nullptr;
|
||||
|
|
@ -230,6 +233,7 @@ struct ggml_opt_optimizer_params ggml_opt_get_default_optimizer_params(void * us
|
|||
result.adamw.beta2 = 0.999f;
|
||||
result.adamw.eps = 1e-8f;
|
||||
result.adamw.wd = 0.0f;
|
||||
result.adamw.gclip = 0.0f;
|
||||
|
||||
result.sgd.alpha = 1e-3f;
|
||||
result.sgd.wd = 0.0f;
|
||||
|
|
@ -253,9 +257,10 @@ struct ggml_opt_params ggml_opt_default_params(
|
|||
/*loss_type =*/ loss_type,
|
||||
/*build_type =*/ GGML_OPT_BUILD_TYPE_OPT,
|
||||
/*opt_period =*/ 1,
|
||||
/*get_opt_pars =*/ ggml_opt_get_default_optimizer_params,
|
||||
/*get_opt_pars_ud =*/ nullptr,
|
||||
/*optimizer =*/ GGML_OPT_OPTIMIZER_TYPE_ADAMW,
|
||||
/*get_opt_pars =*/ ggml_opt_get_default_optimizer_params,
|
||||
/*get_opt_pars_ud =*/ nullptr,
|
||||
/*grad_checkpoint_interval =*/ 0,
|
||||
/*optimizer =*/ GGML_OPT_OPTIMIZER_TYPE_ADAMW,
|
||||
};
|
||||
}
|
||||
|
||||
|
|
@ -475,8 +480,23 @@ static void ggml_opt_build(ggml_opt_context_t opt_ctx) {
|
|||
for (int i = 0; i < n_nodes; ++i) {
|
||||
ggml_tensor * node = opt_ctx->gf->nodes[i];
|
||||
if (node->flags & GGML_TENSOR_FLAG_PARAM) {
|
||||
opt_ctx->grad_m[i] = ggml_new_tensor(opt_ctx->ctx_static, GGML_TYPE_F32, GGML_MAX_DIMS, node->ne);
|
||||
opt_ctx->grad_v[i] = ggml_new_tensor(opt_ctx->ctx_static, GGML_TYPE_F32, GGML_MAX_DIMS, node->ne);
|
||||
// Allocate moments on the same buffer type as the param tensor so
|
||||
// the ADAMW op runs on the correct backend (avoids cross-device mismatch
|
||||
// when some LoRA tensors are on CPU and others on GPU with partial offload).
|
||||
ggml_backend_buffer_type_t param_buft = node->buffer
|
||||
? ggml_backend_buffer_get_type(node->buffer)
|
||||
: ggml_backend_cpu_buffer_type();
|
||||
|
||||
// Allocate a tiny context + buffer for this pair of moment tensors.
|
||||
const size_t sz = 2 * ggml_tensor_overhead();
|
||||
struct ggml_init_params mip = { sz, nullptr, true };
|
||||
struct ggml_context * mctx = ggml_init(mip);
|
||||
opt_ctx->grad_m[i] = ggml_new_tensor(mctx, GGML_TYPE_F32, GGML_MAX_DIMS, node->ne);
|
||||
opt_ctx->grad_v[i] = ggml_new_tensor(mctx, GGML_TYPE_F32, GGML_MAX_DIMS, node->ne);
|
||||
ggml_backend_buffer_t mbuf = ggml_backend_alloc_ctx_tensors_from_buft(mctx, param_buft);
|
||||
ggml_backend_buffer_clear(mbuf, 0);
|
||||
opt_ctx->bufs_momenta.push_back(mbuf);
|
||||
opt_ctx->ctxs_momenta.push_back(mctx); // keep alive for tensor metadata
|
||||
} else {
|
||||
opt_ctx->grad_m[i] = nullptr;
|
||||
opt_ctx->grad_v[i] = nullptr;
|
||||
|
|
@ -485,6 +505,31 @@ static void ggml_opt_build(ggml_opt_context_t opt_ctx) {
|
|||
}
|
||||
}
|
||||
|
||||
// Gradient checkpointing: mark every Nth forward node as OUTPUT so the allocator
|
||||
// keeps its memory alive through the backward pass. The backward graph already
|
||||
// contains the forward ops (gb_grad is a superset of gf), so the checkpointed
|
||||
// activations are naturally available for backward matmuls without recomputation.
|
||||
// This prevents the allocator from aliasing those buffers to later ops, cutting
|
||||
// peak activation VRAM at the cost of slightly larger static allocation.
|
||||
if (opt_ctx->grad_checkpoint_interval > 0) {
|
||||
const int interval = opt_ctx->grad_checkpoint_interval;
|
||||
const int n_fwd = opt_ctx->gf->n_nodes;
|
||||
int ckpt_count = 0;
|
||||
for (int i = interval - 1; i < n_fwd; i += interval) {
|
||||
struct ggml_tensor * node = opt_ctx->gf->nodes[i];
|
||||
// Only checkpoint F32 compute nodes — skip I32 index tensors and already-output nodes.
|
||||
if (node->type != GGML_TYPE_F32) continue;
|
||||
if (node->flags & GGML_TENSOR_FLAG_OUTPUT) continue;
|
||||
if (node->flags & GGML_TENSOR_FLAG_INPUT) continue;
|
||||
node->flags |= GGML_TENSOR_FLAG_OUTPUT;
|
||||
ckpt_count++;
|
||||
}
|
||||
if (ckpt_count > 0) {
|
||||
GGML_LOG_DEBUG("%s: gradient checkpointing: marked %d/%d nodes as persistent (interval=%d)\n",
|
||||
__func__, ckpt_count, n_fwd, interval);
|
||||
}
|
||||
}
|
||||
|
||||
// gb_grad == graph backward gradients, forward pass, then backward pass to calculate gradients.
|
||||
opt_ctx->gb_grad = ggml_graph_dup(opt_ctx->ctx_compute, opt_ctx->gf, /*force_grads =*/ true);
|
||||
ggml_build_backward_expand(opt_ctx->ctx_compute, opt_ctx->gb_grad, opt_ctx->grad_accs.data());
|
||||
|
|
@ -503,7 +548,7 @@ static void ggml_opt_build(ggml_opt_context_t opt_ctx) {
|
|||
// gb_opt == graph backward optimize, forward pass, then backward pass to calculate gradients, then optimizer step.
|
||||
opt_ctx->gb_opt = ggml_graph_dup(opt_ctx->ctx_compute, opt_ctx->gb_grad, /*force_grads =*/ true);
|
||||
|
||||
opt_ctx->opt_step_params = ggml_new_tensor_1d(opt_ctx->ctx_cpu, GGML_TYPE_F32, need_momenta ? 7 : 2);
|
||||
opt_ctx->opt_step_params = ggml_new_tensor_1d(opt_ctx->ctx_cpu, GGML_TYPE_F32, need_momenta ? 8 : 2);
|
||||
ggml_tensor * adamw_params = opt_ctx->opt_step_params;
|
||||
ggml_set_input(adamw_params);
|
||||
const char * optimizer_name = ggml_opt_optimizer_name(opt_ctx->optimizer);
|
||||
|
|
@ -555,10 +600,11 @@ ggml_opt_context_t ggml_opt_init(struct ggml_opt_params params) {
|
|||
result->build_type_alloc = params.build_type;
|
||||
result->inputs = params.inputs;
|
||||
result->outputs = params.outputs;
|
||||
result->opt_period = params.opt_period;
|
||||
result->get_opt_pars = params.get_opt_pars;
|
||||
result->get_opt_pars_ud = params.get_opt_pars_ud;
|
||||
result->optimizer = params.optimizer;
|
||||
result->opt_period = params.opt_period;
|
||||
result->grad_checkpoint_interval = params.grad_checkpoint_interval;
|
||||
result->get_opt_pars = params.get_opt_pars;
|
||||
result->get_opt_pars_ud = params.get_opt_pars_ud;
|
||||
result->optimizer = params.optimizer;
|
||||
|
||||
GGML_ASSERT(result->opt_period >= 1);
|
||||
|
||||
|
|
@ -587,6 +633,12 @@ void ggml_opt_free(ggml_opt_context_t opt_ctx) {
|
|||
}
|
||||
ggml_backend_buffer_free(opt_ctx->buf_static);
|
||||
ggml_backend_buffer_free(opt_ctx->buf_cpu);
|
||||
for (ggml_backend_buffer_t buf : opt_ctx->bufs_momenta) {
|
||||
ggml_backend_buffer_free(buf);
|
||||
}
|
||||
for (struct ggml_context * ctx : opt_ctx->ctxs_momenta) {
|
||||
ggml_free(ctx);
|
||||
}
|
||||
ggml_free(opt_ctx->ctx_static);
|
||||
ggml_free(opt_ctx->ctx_cpu);
|
||||
delete opt_ctx;
|
||||
|
|
@ -726,6 +778,17 @@ void ggml_opt_alloc(ggml_opt_context_t opt_ctx, bool backward) {
|
|||
if (opt_ctx->build_type == GGML_OPT_BUILD_TYPE_OPT && opt_ctx->opt_period > 1 && opt_ctx->opt_i == 0) {
|
||||
ggml_graph_reset(opt_ctx->gb_grad);
|
||||
}
|
||||
|
||||
// For non-static graphs the compute graph is rebuilt every call, so ggml_graph_reset
|
||||
// is not called and grad_accs may carry over values from the previous accumulation window.
|
||||
// Explicitly zero them at the start of each gradient-accumulation cycle.
|
||||
if (!opt_ctx->static_graphs && backward && opt_ctx->opt_i == 0) {
|
||||
for (struct ggml_tensor * ga : opt_ctx->grad_accs) {
|
||||
if (ga) {
|
||||
ggml_set_zero(ga);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (backward) {
|
||||
const int32_t opt_i_next = (opt_ctx->opt_i + 1) % opt_ctx->opt_period;
|
||||
opt_ctx->build_type = opt_i_next == 0 ? GGML_OPT_BUILD_TYPE_OPT : GGML_OPT_BUILD_TYPE_GRAD;
|
||||
|
|
@ -793,6 +856,7 @@ void ggml_opt_eval(ggml_opt_context_t opt_ctx, ggml_opt_result_t result) {
|
|||
GGML_ASSERT(opt_pars.adamw.eps >= 0.0f);
|
||||
GGML_ASSERT(opt_pars.adamw.wd >= 0.0f);
|
||||
GGML_ASSERT(opt_pars.adamw.wd <= 1.0f);
|
||||
GGML_ASSERT(opt_pars.adamw.gclip >= 0.0f);
|
||||
|
||||
// beta1, beta2 after applying warmup
|
||||
const float beta1h = 1.0f / (1.0f - powf(opt_pars.adamw.beta1, opt_ctx->iter));
|
||||
|
|
@ -806,6 +870,7 @@ void ggml_opt_eval(ggml_opt_context_t opt_ctx, ggml_opt_result_t result) {
|
|||
adamw_par_data[4] = opt_pars.adamw.wd;
|
||||
adamw_par_data[5] = beta1h;
|
||||
adamw_par_data[6] = beta2h;
|
||||
adamw_par_data[7] = opt_pars.adamw.gclip;
|
||||
} break;
|
||||
case GGML_OPT_OPTIMIZER_TYPE_SGD: {
|
||||
GGML_ASSERT(opt_pars.sgd.alpha > 0.0f);
|
||||
|
|
|
|||
126
ggml/src/ggml.c
126
ggml/src/ggml.c
|
|
@ -984,6 +984,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
|
|||
"MUL_MAT",
|
||||
"MUL_MAT_ID",
|
||||
"OUT_PROD",
|
||||
"OUT_PROD_ID",
|
||||
|
||||
"SCALE",
|
||||
"SET",
|
||||
|
|
@ -1057,7 +1058,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
|
|||
"GLU",
|
||||
};
|
||||
|
||||
static_assert(GGML_OP_COUNT == 96, "GGML_OP_COUNT != 96");
|
||||
static_assert(GGML_OP_COUNT == 97, "GGML_OP_COUNT != 97");
|
||||
|
||||
static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
||||
"none",
|
||||
|
|
@ -1094,6 +1095,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
|||
"X*Y",
|
||||
"X[i]*Y",
|
||||
"X*Y",
|
||||
"X_id⊗Y_id",
|
||||
|
||||
"x*v",
|
||||
"y-\\>view(x)",
|
||||
|
|
@ -1167,7 +1169,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
|||
"glu(x)",
|
||||
};
|
||||
|
||||
static_assert(GGML_OP_COUNT == 96, "GGML_OP_COUNT != 96");
|
||||
static_assert(GGML_OP_COUNT == 97, "GGML_OP_COUNT != 97");
|
||||
|
||||
static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
|
||||
|
||||
|
|
@ -3302,6 +3304,44 @@ struct ggml_tensor * ggml_out_prod(
|
|||
return result;
|
||||
}
|
||||
|
||||
// ggml_out_prod_id
|
||||
//
|
||||
// Scattered outer-product for the MUL_MAT_ID backward pass.
|
||||
//
|
||||
// a: [cols, n_expert_used, n_tokens] F32 — activations (src1 of MUL_MAT_ID)
|
||||
// b: [rows, n_expert_used, n_tokens] F32 — upstream gradient
|
||||
// ids: [n_expert_used, n_tokens] I32 — expert dispatch indices (src2 of MUL_MAT_ID)
|
||||
// result: [cols, rows, n_expert, 1] F32
|
||||
//
|
||||
// result[:, :, e] += sum_{(i,t): ids[i,t]==e} a[:, i, t] ⊗ b[:, i, t]
|
||||
//
|
||||
// Computes the gradient w.r.t. the expert weight matrices (src0) of MUL_MAT_ID.
|
||||
struct ggml_tensor * ggml_out_prod_id(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b,
|
||||
struct ggml_tensor * ids,
|
||||
int64_t n_expert) {
|
||||
GGML_ASSERT(a->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(b->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(ids->type == GGML_TYPE_I32);
|
||||
GGML_ASSERT(a->ne[1] == b->ne[1]); // n_expert_used matches
|
||||
GGML_ASSERT(a->ne[2] == b->ne[2]); // n_tokens matches
|
||||
GGML_ASSERT(ids->ne[0] == a->ne[1]); // n_expert_used matches ids
|
||||
GGML_ASSERT(ids->ne[1] == a->ne[2]); // n_tokens matches ids
|
||||
GGML_ASSERT(n_expert > 0);
|
||||
|
||||
const int64_t ne[4] = { a->ne[0], b->ne[0], n_expert, 1 };
|
||||
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
|
||||
|
||||
result->op = GGML_OP_OUT_PROD_ID;
|
||||
result->src[0] = a;
|
||||
result->src[1] = b;
|
||||
result->src[2] = ids;
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
// ggml_scale
|
||||
|
||||
static struct ggml_tensor * ggml_scale_impl(
|
||||
|
|
@ -3840,12 +3880,17 @@ struct ggml_tensor * ggml_get_rows_back(
|
|||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b,
|
||||
struct ggml_tensor * c) {
|
||||
GGML_ASSERT(ggml_is_matrix(a) && ggml_is_vector(b) && b->type == GGML_TYPE_I32);
|
||||
GGML_ASSERT(ggml_is_matrix(c) && (a->ne[0] == c->ne[0]));
|
||||
GGML_ASSERT(b->type == GGML_TYPE_I32);
|
||||
GGML_ASSERT(a->ne[0] == c->ne[0]);
|
||||
|
||||
// Support both 2D and 3D: result shape matches c (the source tensor shape)
|
||||
// TODO: implement non F32 return
|
||||
//struct ggml_tensor * result = ggml_new_tensor_2d(ctx, a->type, a->ne[0], b->ne[0]);
|
||||
struct ggml_tensor * result = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, c->ne[0], c->ne[1]);
|
||||
struct ggml_tensor * result;
|
||||
if (c->ne[2] > 1) {
|
||||
result = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, c->ne[0], c->ne[1], c->ne[2]);
|
||||
} else {
|
||||
result = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, c->ne[0], c->ne[1]);
|
||||
}
|
||||
|
||||
result->op = GGML_OP_GET_ROWS_BACK;
|
||||
result->src[0] = a;
|
||||
|
|
@ -6064,7 +6109,7 @@ struct ggml_tensor * ggml_opt_step_adamw(
|
|||
GGML_ASSERT(ggml_are_same_shape(a, m));
|
||||
GGML_ASSERT(ggml_are_same_shape(a, v));
|
||||
GGML_ASSERT(adamw_params->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(ggml_nelements(adamw_params) == 7);
|
||||
GGML_ASSERT(ggml_nelements(adamw_params) == 8);
|
||||
|
||||
struct ggml_tensor * result = ggml_view_tensor(ctx, a);
|
||||
|
||||
|
|
@ -6527,6 +6572,35 @@ static void ggml_compute_backward(
|
|||
grad))); // [m,p,qq,rr]
|
||||
}
|
||||
} break;
|
||||
case GGML_OP_MUL_MAT_ID: {
|
||||
// Backward pass for indirect matrix multiplication (MoE).
|
||||
//
|
||||
// Forward: dst[rows, n_exp_used, n_tokens] = as[:,:,ids[i,t]] @ b[:,i,t]
|
||||
// src0 = as [cols, rows, n_expert] — expert weight matrices
|
||||
// src1 = b [cols, n_exp_used, n_tokens] — token activations
|
||||
// src2 = ids [n_exp_used, n_tokens] — expert dispatch indices (I32)
|
||||
//
|
||||
// Gradient w.r.t. src1 (activations):
|
||||
// grad_b[:,i,t] = as[:,:,ids[i,t]]^T @ grad[:,i,t]
|
||||
// → computed via MUL_MAT_ID with transposed as
|
||||
//
|
||||
// Gradient w.r.t. src0 (expert weights, only when F32 i.e. LoRA):
|
||||
// grad_as[:,:,e] += sum_{(i,t): ids[i,t]==e} b[:,i,t] ⊗ grad[:,i,t]
|
||||
// → computed via OUT_PROD_ID
|
||||
//
|
||||
// Quantized src0 is frozen (stop-gradient) — handled in grads_needed below.
|
||||
if (src0_needs_grads) {
|
||||
const int64_t n_expert = src0->ne[2];
|
||||
struct ggml_tensor * grad_as = ggml_out_prod_id(ctx, src1, grad, src2, n_expert);
|
||||
ggml_add_or_set(ctx, cgraph, isrc0, grad_as);
|
||||
}
|
||||
if (src1_needs_grads) {
|
||||
// Transpose expert matrices: as [cols, rows, n_expert] → as_T [rows, cols, n_expert]
|
||||
struct ggml_tensor * as_T = ggml_cont(ctx, ggml_permute(ctx, src0, 1, 0, 2, 3));
|
||||
struct ggml_tensor * grad_b = ggml_mul_mat_id(ctx, as_T, grad, src2);
|
||||
ggml_add_or_set(ctx, cgraph, isrc1, grad_b);
|
||||
}
|
||||
} break;
|
||||
case GGML_OP_SCALE: {
|
||||
if (src0_needs_grads) {
|
||||
float s;
|
||||
|
|
@ -6973,6 +7047,35 @@ void ggml_build_backward_expand(
|
|||
ignore_src[1] = true;
|
||||
break;
|
||||
|
||||
// MUL_MAT_ID: expert dispatch indices (src2) are integer — no gradient.
|
||||
// When src0 is quantized the expert weights are frozen, so stop gradient through
|
||||
// both src0 and src1 (activations have no path to loss without differentiable weights).
|
||||
case GGML_OP_MUL_MAT_ID:
|
||||
if (ggml_is_quantized(node->src[0]->type)) {
|
||||
ignore_src[0] = true;
|
||||
ignore_src[1] = true;
|
||||
}
|
||||
ignore_src[2] = true; // ids: integer tensor
|
||||
break;
|
||||
|
||||
// SET_ROWS is a KV-cache scatter write. The gradient of the written data flows
|
||||
// through the attention read path (GET_ROWS backward), not through this node.
|
||||
case GGML_OP_SET_ROWS:
|
||||
ignore_src[0] = true;
|
||||
ignore_src[1] = true;
|
||||
break;
|
||||
|
||||
// Ops with no backward implementation — stop gradient through all sources so the
|
||||
// backward graph builder never tries to propagate through them.
|
||||
case GGML_OP_SSM_CONV: // Mamba causal conv1d
|
||||
case GGML_OP_SSM_SCAN: // Mamba selective scan
|
||||
case GGML_OP_FLASH_ATTN_EXT: // use standard attention for training
|
||||
ignore_src[0] = true;
|
||||
ignore_src[1] = true;
|
||||
ignore_src[2] = true;
|
||||
ignore_src[3] = true;
|
||||
break;
|
||||
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
|
@ -6988,9 +7091,12 @@ void ggml_build_backward_expand(
|
|||
continue;
|
||||
}
|
||||
|
||||
// inplace operations are currently not supported
|
||||
GGML_ASSERT(!node->view_src || node->op == GGML_OP_CPY || node->op == GGML_OP_VIEW ||
|
||||
node->op == GGML_OP_RESHAPE || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_TRANSPOSE);
|
||||
// inplace operations are currently not supported — warn and skip instead of crashing
|
||||
if (node->view_src && node->op != GGML_OP_CPY && node->op != GGML_OP_VIEW &&
|
||||
node->op != GGML_OP_RESHAPE && node->op != GGML_OP_PERMUTE && node->op != GGML_OP_TRANSPOSE) {
|
||||
GGML_LOG_WARN("%s: skipping unsupported inplace op '%s' in backward graph\n", __func__, ggml_op_name(node->op));
|
||||
continue;
|
||||
}
|
||||
|
||||
const size_t ihash = ggml_hash_find(&cgraph->visited_hash_set, node);
|
||||
GGML_ASSERT(ihash != GGML_HASHSET_FULL);
|
||||
|
|
|
|||
|
|
@ -1553,10 +1553,22 @@ extern "C" {
|
|||
void * get_opt_pars_ud; // userdata for calculating optimizer parameters
|
||||
|
||||
enum ggml_opt_optimizer_type optimizer_type;
|
||||
|
||||
// Gradient checkpointing: mark every Nth forward graph node as persistent so the
|
||||
// allocator cannot reuse its memory during backward. Reduces peak activation VRAM
|
||||
// at the cost of ~0 extra compute (activations are kept, not recomputed).
|
||||
// Set to 0 (default) to disable. Good values: 32–64 nodes ≈ every 1–2 transformer layers.
|
||||
int32_t grad_checkpoint_interval;
|
||||
};
|
||||
|
||||
LLAMA_API void llama_opt_init(struct llama_context * lctx, struct llama_model * model, struct llama_opt_params lopt_params);
|
||||
|
||||
// weights: array of floats, one per dataset window (indexed by idata), already normalized to [0,1].
|
||||
// n_weights: length of the array.
|
||||
// Pass NULL/0 to disable (equivalent to all-ones, i.e. standard SFT).
|
||||
// The pointer must remain valid for the duration of all llama_opt_epoch calls.
|
||||
LLAMA_API void llama_opt_set_reward_weights(const float * weights, int64_t n_weights);
|
||||
|
||||
LLAMA_API void llama_opt_epoch(
|
||||
struct llama_context * lctx,
|
||||
ggml_opt_dataset_t dataset,
|
||||
|
|
@ -1564,7 +1576,8 @@ extern "C" {
|
|||
ggml_opt_result_t result_eval,
|
||||
int64_t idata_split,
|
||||
ggml_opt_epoch_callback callback_train,
|
||||
ggml_opt_epoch_callback callback_eval);
|
||||
ggml_opt_epoch_callback callback_eval,
|
||||
bool shuffle);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
|
|
|||
|
|
@ -334,16 +334,26 @@ static void llama_adapter_lora_init_impl(llama_model & model, const char * path_
|
|||
|
||||
auto * buft = ggml_backend_buffer_get_type(model_tensor->buffer);
|
||||
|
||||
// do not load loras to extra buffer types (i.e. bufts for repacking) -> use the CPU in that case
|
||||
// do not load loras to extra buffer types (i.e. bufts for repacking)
|
||||
// try device-native buft first (keeps LoRA on GPU), fall back to CPU only as last resort
|
||||
for (auto & ex : buft_extra) {
|
||||
if (ex == buft) {
|
||||
LLAMA_LOG_WARN("%s: lora for '%s' cannot use buft '%s', fallback to CPU\n", __func__, model_tensor->name, ggml_backend_buft_name(buft));
|
||||
|
||||
auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
|
||||
if (!cpu_dev) {
|
||||
throw std::runtime_error(format("%s: no CPU backend found", __func__));
|
||||
// try to get the device's native (non-repack) buffer type
|
||||
auto * dev = ggml_backend_buft_get_device(buft);
|
||||
auto * native_buft = dev ? ggml_backend_dev_buffer_type(dev) : nullptr;
|
||||
if (native_buft && native_buft != buft) {
|
||||
LLAMA_LOG_WARN("%s: lora for '%s' cannot use repack buft '%s', using device-native '%s'\n",
|
||||
__func__, model_tensor->name, ggml_backend_buft_name(buft), ggml_backend_buft_name(native_buft));
|
||||
buft = native_buft;
|
||||
} else {
|
||||
LLAMA_LOG_WARN("%s: lora for '%s' cannot use buft '%s', fallback to CPU\n",
|
||||
__func__, model_tensor->name, ggml_backend_buft_name(buft));
|
||||
auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
|
||||
if (!cpu_dev) {
|
||||
throw std::runtime_error(format("%s: no CPU backend found", __func__));
|
||||
}
|
||||
buft = ggml_backend_dev_buffer_type(cpu_dev);
|
||||
}
|
||||
buft = ggml_backend_dev_buffer_type(cpu_dev);
|
||||
|
||||
break;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -2677,11 +2677,71 @@ void llama_context::opt_init(struct llama_model * model, struct llama_opt_params
|
|||
GGML_ASSERT(model->hparams.n_ctx_train % n_batch == 0);
|
||||
GGML_ASSERT(n_batch % n_ubatch == 0);
|
||||
|
||||
// Recreate the scheduler and gf_res_prev with a training-inflated graph size before
|
||||
// creating opt_ctx, so opt_ctx captures the new (larger) scheduler pointer.
|
||||
// The backward graph (gb_grad) duplicates gf and adds ~2-3x more nodes+leafs;
|
||||
// gb_opt adds optimizer step nodes on top.
|
||||
//
|
||||
// We measure the actual training forward graph node count at n_ubatch here,
|
||||
// then multiply by 4 to cover gf + gb_grad + gb_opt. This is exact for any
|
||||
// model size — no magic constant needed.
|
||||
{
|
||||
uint32_t train_fwd_nodes = 0;
|
||||
|
||||
// Build a real training-ubatch forward graph in split-only mode (no buffer realloc)
|
||||
// so we can count its actual nodes. Fall back to n_tensors formula if it fails.
|
||||
if (memory) {
|
||||
auto mctx_tmp = memory->init_full();
|
||||
if (mctx_tmp) {
|
||||
// graph_reserve() uses gf_res_reserve to build the graph, so both
|
||||
// must be large enough to hold the training forward graph.
|
||||
// Use 16x n_tensors as a generous temporary cap for the measurement pass.
|
||||
const uint32_t tmp_cap = std::max<uint32_t>(4096u, 16u * model->n_tensors());
|
||||
gf_res_prev.reset(new llm_graph_result(tmp_cap));
|
||||
gf_res_reserve.reset(new llm_graph_result(tmp_cap));
|
||||
// split_only=true: only splits the graph, doesn't reallocate compute buffers
|
||||
auto * gf_train = graph_reserve(n_ubatch, 1, n_ubatch, mctx_tmp.get(), /*split_only=*/true);
|
||||
if (gf_train) {
|
||||
train_fwd_nodes = (uint32_t)ggml_graph_n_nodes(gf_train);
|
||||
LLAMA_LOG_INFO("%s: measured training graph nodes = %u (n_ubatch=%u)\n",
|
||||
__func__, train_fwd_nodes, n_ubatch);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (train_fwd_nodes == 0) {
|
||||
// Fallback: use n_tensors formula
|
||||
train_fwd_nodes = std::max<uint32_t>(1024u, 8u * model->n_tensors());
|
||||
LLAMA_LOG_WARN("%s: could not measure training graph, using fallback nodes=%u\n",
|
||||
__func__, train_fwd_nodes);
|
||||
}
|
||||
|
||||
// gf + gb_grad + gb_opt each need ~train_fwd_nodes; multiply by 4 for safety headroom.
|
||||
// Multiply by 2 again for the scheduler's n_nodes + n_leafs check.
|
||||
const int64_t inflated = (int64_t)std::max<uint32_t>(train_fwd_nodes, 1024u) * 4;
|
||||
const int64_t sched_size = inflated * 2;
|
||||
// Both gf_res_prev and gf_res_reserve are used to build forward graphs
|
||||
// (graph_reserve uses gf_res_reserve; opt_epoch_iter uses gf_res_prev).
|
||||
// Both must have capacity for the full backward graph.
|
||||
gf_res_prev.reset(new llm_graph_result(inflated));
|
||||
gf_res_reserve.reset(new llm_graph_result(inflated));
|
||||
sched.reset(ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(),
|
||||
sched_size, cparams.pipeline_parallel, cparams.op_offload));
|
||||
// Suppress the next sched_reserve() call so that llama_decode() during GRPO inference
|
||||
// steps does NOT replace the training sched with a smaller inference sched.
|
||||
// opt_ctx->backend_sched stores a raw pointer to sched.get(); replacing sched while
|
||||
// opt_ctx is alive would leave that pointer dangling and crash on the next opt_epoch.
|
||||
sched_need_reserve = false;
|
||||
LLAMA_LOG_INFO("%s: training graph capacity = %lld (train_fwd_nodes=%u x4)\n",
|
||||
__func__, (long long)inflated, train_fwd_nodes);
|
||||
}
|
||||
|
||||
ggml_opt_params opt_params = ggml_opt_default_params(sched.get(), GGML_OPT_LOSS_TYPE_CROSS_ENTROPY);
|
||||
opt_params.opt_period = n_batch / n_ubatch;
|
||||
opt_params.get_opt_pars = lopt_params.get_opt_pars;
|
||||
opt_params.get_opt_pars_ud = lopt_params.get_opt_pars_ud;
|
||||
opt_params.optimizer = lopt_params.optimizer_type;
|
||||
opt_params.opt_period = n_batch / n_ubatch;
|
||||
opt_params.get_opt_pars = lopt_params.get_opt_pars;
|
||||
opt_params.get_opt_pars_ud = lopt_params.get_opt_pars_ud;
|
||||
opt_params.optimizer = lopt_params.optimizer_type;
|
||||
opt_params.grad_checkpoint_interval = lopt_params.grad_checkpoint_interval;
|
||||
opt_ctx = ggml_opt_init(opt_params);
|
||||
|
||||
llama_opt_param_filter param_filter = lopt_params.param_filter;
|
||||
|
|
@ -2716,6 +2776,7 @@ void llama_context::opt_epoch_iter(
|
|||
const std::vector<llama_token> & tokens,
|
||||
const std::vector<llama_token> & labels_sparse,
|
||||
llama_batch & batch,
|
||||
float reward_scale,
|
||||
ggml_opt_epoch_callback callback,
|
||||
bool train,
|
||||
int64_t idata_in_loop,
|
||||
|
|
@ -2764,6 +2825,8 @@ void llama_context::opt_epoch_iter(
|
|||
};
|
||||
|
||||
uint32_t pos_batch = 0;
|
||||
static bool timings_printed = false; // print per-ubatch timings only for the first window
|
||||
struct ggml_context * ctx_compute_opt = nullptr;
|
||||
do {
|
||||
const auto & ubatch = mctx->get_ubatch();
|
||||
|
||||
|
|
@ -2776,56 +2839,98 @@ void llama_context::opt_epoch_iter(
|
|||
|
||||
auto * res = gf_res_prev.get();
|
||||
|
||||
const int64_t t0_build = ggml_time_ms();
|
||||
const auto gparams = graph_params(res, ubatch, mctx.get(), LLM_GRAPH_TYPE_DEFAULT);
|
||||
|
||||
res->reset();
|
||||
|
||||
auto * gf = model.build_graph(gparams);
|
||||
|
||||
struct ggml_context * ctx_compute_opt;
|
||||
{
|
||||
// Allocate the tensor metadata context once, then reset it each iteration.
|
||||
// ggml_reset() is much cheaper than ggml_free()+ggml_init() — it just resets the
|
||||
// allocation pointer without freeing/reallocating the backing memory buffer.
|
||||
if (!ctx_compute_opt) {
|
||||
const size_t size_gf = ggml_graph_size(gf);
|
||||
const size_t size_meta = 4*size_gf*ggml_tensor_overhead() + 2*ggml_graph_overhead_custom(size_gf, /*grads = */ true);
|
||||
const size_t size_meta = 4*size_gf*ggml_tensor_overhead() + 3*ggml_graph_overhead_custom(size_gf, /*grads = */ true);
|
||||
struct ggml_init_params params = {
|
||||
/*.mem_size =*/ size_meta,
|
||||
/*.mem_buffer =*/ nullptr,
|
||||
/*.no_alloc =*/ true,
|
||||
};
|
||||
ctx_compute_opt = ggml_init(params);
|
||||
if (!timings_printed) {
|
||||
LLAMA_LOG_INFO("%s: [timing] graph capacity=%zu n_nodes=%d size_meta=%.1fMB\n", __func__,
|
||||
size_gf, ggml_graph_n_nodes(gf), (double)size_meta / (1024*1024));
|
||||
}
|
||||
} else {
|
||||
ggml_reset(ctx_compute_opt);
|
||||
}
|
||||
|
||||
const int64_t t1_alloc = ggml_time_ms();
|
||||
ggml_opt_prepare_alloc(opt_ctx, ctx_compute_opt, gf, res->get_inp_tokens(), res->get_logits());
|
||||
ggml_opt_alloc(opt_ctx, train);
|
||||
|
||||
const int64_t t2_inputs = ggml_time_ms();
|
||||
res->set_inputs(&ubatch);
|
||||
{
|
||||
struct ggml_tensor * labels = ggml_opt_labels(opt_ctx);
|
||||
GGML_ASSERT(labels->ne[1] == n_ubatch);
|
||||
ggml_set_zero(labels);
|
||||
const float onef = 1.0f;
|
||||
for (uint32_t pos_ubatch = 0; pos_ubatch < n_ubatch; ++pos_ubatch) {
|
||||
const uint32_t ilabel = pos_ctx + pos_batch + pos_ubatch;
|
||||
// -1 sentinel means "masked position" (prompt token, BOS separator, etc).
|
||||
// Leave the label tensor zeroed at this position → zero cross-entropy
|
||||
// contribution. Do NOT write anything — ggml_set_zero already handled it.
|
||||
if (labels_sparse[ilabel] < 0) continue;
|
||||
GGML_ASSERT(labels_sparse[ilabel] < labels->ne[0]);
|
||||
ggml_backend_tensor_set(labels, &onef, (pos_ubatch*labels->ne[0] + labels_sparse[ilabel])*sizeof(float), sizeof(float));
|
||||
ggml_backend_tensor_set(labels, &reward_scale, (pos_ubatch*labels->ne[0] + labels_sparse[ilabel])*sizeof(float), sizeof(float));
|
||||
}
|
||||
}
|
||||
|
||||
const int64_t t3_eval = ggml_time_ms();
|
||||
ggml_opt_eval(opt_ctx, result);
|
||||
|
||||
const int64_t t4_done = ggml_time_ms();
|
||||
if (!timings_printed) {
|
||||
LLAMA_LOG_INFO("%s: [timing] build=%" PRId64 "ms alloc=%" PRId64 "ms inputs=%" PRId64 "ms eval=%" PRId64 "ms total=%" PRId64 "ms\n",
|
||||
__func__,
|
||||
t1_alloc - t0_build,
|
||||
t2_inputs - t1_alloc,
|
||||
t3_eval - t2_inputs,
|
||||
t4_done - t3_eval,
|
||||
t4_done - t0_build);
|
||||
timings_printed = true;
|
||||
}
|
||||
|
||||
if (callback) {
|
||||
callback(train, opt_ctx, dataset, result, idata_in_loop + (pos_ctx + pos_batch)/n_ubatch + 1, ndata_in_loop, t_loop_start);
|
||||
}
|
||||
ggml_free(ctx_compute_opt);
|
||||
|
||||
pos_batch += ubatch.n_tokens;
|
||||
} while (mctx->next());
|
||||
ggml_free(ctx_compute_opt);
|
||||
}
|
||||
}
|
||||
|
||||
// Optional per-window reward weights for reward-weighted SFT.
|
||||
// Set via llama_opt_set_reward_weights() before calling llama_opt_epoch().
|
||||
// Null/0 means all rewards are 1.0 (standard SFT).
|
||||
static thread_local const float * g_reward_weights = nullptr;
|
||||
static thread_local int64_t g_reward_weights_n = 0;
|
||||
|
||||
void llama_opt_set_reward_weights(const float * weights, int64_t n_weights) {
|
||||
g_reward_weights = weights;
|
||||
g_reward_weights_n = n_weights;
|
||||
}
|
||||
|
||||
void llama_context::opt_epoch(
|
||||
ggml_opt_dataset_t dataset,
|
||||
ggml_opt_result_t result_train,
|
||||
ggml_opt_result_t result_eval,
|
||||
int64_t idata_split,
|
||||
ggml_opt_epoch_callback callback_train,
|
||||
ggml_opt_epoch_callback callback_eval) {
|
||||
ggml_opt_epoch_callback callback_eval,
|
||||
bool shuffle) {
|
||||
const uint32_t n_ctx = this->n_ctx();
|
||||
const uint32_t n_batch = std::min(cparams.n_batch, n_ctx);
|
||||
const uint32_t n_ubatch = std::min(cparams.n_ubatch, n_batch);
|
||||
|
|
@ -2834,6 +2939,10 @@ void llama_context::opt_epoch(
|
|||
GGML_ASSERT(idata_split >= 0);
|
||||
GGML_ASSERT(idata_split <= ndata);
|
||||
|
||||
if (shuffle && idata_split > 1) {
|
||||
ggml_opt_dataset_shuffle(opt_ctx, dataset, idata_split);
|
||||
}
|
||||
|
||||
const uint32_t ubatch_per_ctx = n_ctx / n_ubatch;
|
||||
|
||||
struct llama_batch batch = llama_batch_init(n_batch, 0, 1);
|
||||
|
|
@ -2847,9 +2956,11 @@ void llama_context::opt_epoch(
|
|||
for (; idata < idata_split; ++idata) {
|
||||
constexpr bool train = true;
|
||||
const int64_t idata_in_loop = idata*ubatch_per_ctx;
|
||||
const float reward = (g_reward_weights && idata < g_reward_weights_n)
|
||||
? g_reward_weights[idata] : 1.0f;
|
||||
|
||||
ggml_opt_dataset_get_batch_host(dataset, tokens.data(), n_ctx*sizeof(llama_token), labels_sparse.data(), idata);
|
||||
opt_epoch_iter(dataset, result_train, tokens, labels_sparse, batch,
|
||||
opt_epoch_iter(dataset, result_train, tokens, labels_sparse, batch, reward,
|
||||
callback_train, train, idata_in_loop, ndata_in_loop, t_loop_start);
|
||||
}
|
||||
|
||||
|
|
@ -2860,7 +2971,7 @@ void llama_context::opt_epoch(
|
|||
const int64_t idata_in_loop = (idata - idata_split)*ubatch_per_ctx;
|
||||
|
||||
ggml_opt_dataset_get_batch_host(dataset, tokens.data(), n_ctx*sizeof(llama_token), labels_sparse.data(), idata);
|
||||
opt_epoch_iter(dataset, result_eval, tokens, labels_sparse, batch,
|
||||
opt_epoch_iter(dataset, result_eval, tokens, labels_sparse, batch, 1.0f,
|
||||
callback_eval, train, idata_in_loop, ndata_in_loop, t_loop_start);
|
||||
}
|
||||
|
||||
|
|
@ -3622,12 +3733,14 @@ void llama_opt_epoch(
|
|||
ggml_opt_result_t result_eval,
|
||||
int64_t idata_split,
|
||||
ggml_opt_epoch_callback callback_train,
|
||||
ggml_opt_epoch_callback callback_eval) {
|
||||
ggml_opt_epoch_callback callback_eval,
|
||||
bool shuffle) {
|
||||
ctx->opt_epoch(
|
||||
dataset,
|
||||
result_train,
|
||||
result_eval,
|
||||
idata_split,
|
||||
callback_train,
|
||||
callback_eval);
|
||||
callback_eval,
|
||||
shuffle);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -187,7 +187,8 @@ struct llama_context {
|
|||
ggml_opt_result_t result_eval,
|
||||
int64_t idata_split,
|
||||
ggml_opt_epoch_callback callback_train,
|
||||
ggml_opt_epoch_callback callback_eval);
|
||||
ggml_opt_epoch_callback callback_eval,
|
||||
bool shuffle);
|
||||
|
||||
void opt_epoch_iter(
|
||||
ggml_opt_dataset_t dataset,
|
||||
|
|
@ -195,6 +196,7 @@ struct llama_context {
|
|||
const std::vector<llama_token> & tokens,
|
||||
const std::vector<llama_token> & labels_sparse,
|
||||
llama_batch & batch,
|
||||
float reward_scale,
|
||||
ggml_opt_epoch_callback callback,
|
||||
bool train,
|
||||
int64_t idata_in_loop,
|
||||
|
|
|
|||
Loading…
Reference in New Issue