// llama.cpp/examples/qlora_training/finetune_qlora.cpp
// QLoRA fine-tuning for quantized GGUF models.
//
// The base model weights stay frozen (quantized tensors are skipped by
// llama_set_param because they are not GGML_TYPE_F32). Only the freshly
// allocated F32 LoRA A/B tensors are trained. After training the adapter
// is saved as a GGUF file that is directly compatible with the existing
// llama_adapter_lora_init() loader and llama-export-lora merge tool.
//
// Usage example:
/* llama-finetune-qlora \
--model model-q4_k_m.gguf \
--train-file train.jsonl \
--lora-rank 16 --lora-alpha 16 \
--lora-out adapter.gguf \
--epochs 3 -c 4096 -b 4096 -ub 512
*/
// Default targets: attn_q, attn_output, ffn_gate, ffn_up, ffn_down
// Override with --lora-targets "comma,separated,substrings"
//
// NOTE: attn_k and attn_v are excluded from defaults. The KV write path uses
// ggml_set_rows (scatter op) — backward cannot propagate gradients through it.
// LoRA K/V would receive zero gradient.
//
// NOTE: ssm_in and ssm_out (Mamba/NemotronH) are excluded from defaults.
// SSM_SCAN/SSM_CONV have no backward implementation — LoRA on these layers
// would receive zero gradient. Adding them wastes memory with no benefit.
//
// NOTE: MoE expert tensors (*_exps) are excluded regardless of --lora-targets.
// The quantized expert weights are frozen (stop-gradient), but LoRA on dense
// FFN layers (ffn_gate, ffn_up, ffn_down) works via MUL_MAT_ID backward.
//
// Target substrings use llama.cpp internal GGUF names (NOT HuggingFace names):
// attn_q = q_proj attn_k = k_proj
// attn_v = v_proj attn_output= o_proj
// ffn_gate = gate_proj ffn_up = up_proj ffn_down = down_proj
// ssm_in = in_proj (Mamba/NemotronH) — zero gradient, not in defaults
// ssm_out = out_proj (Mamba/NemotronH) — zero gradient, not in defaults
#include "arg.h"
#include "chat.h"
#include "common.h"
#include "log.h"
#include "llama.h"
#include "gguf.h"
#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"
// Internal adapter struct — included directly to avoid the temp-GGUF roundtrip
// for wiring trainable LoRA tensors into the compute graph.
#include "../../src/llama-adapter.h"
#include <cerrno>
#include <csignal>
#include <iostream>
#define JSON_ASSERT GGML_ASSERT
#include <nlohmann/json.hpp>
#include <algorithm>
#include <clocale>
#include <cmath>
#include <cstring>
#include <fstream>
#include <random>
#include <sstream>
#include <string>
#include <unordered_map>
#include <vector>
// ---------------------------------------------------------------------------
// Helpers
// ---------------------------------------------------------------------------
// Expand a leading ~/ to the HOME directory (the shell doesn't do this for us
// when a path is passed as a string argument to std::ofstream).
// Replace a leading "~/" with the user's home directory. std::ofstream and
// friends receive paths verbatim, so shell-style tilde expansion has to be
// performed manually here. Paths without the "~/" prefix (and "~" alone)
// are returned unchanged, as is the input when no home dir can be found.
static std::string expand_tilde(const std::string & path) {
    const bool has_tilde_prefix = path.size() >= 2 && path[0] == '~' && path[1] == '/';
    if (!has_tilde_prefix) {
        return path;
    }
    const char * home_dir = getenv("HOME");
    if (home_dir == nullptr) {
        home_dir = getenv("USERPROFILE"); // Windows fallback
    }
    if (home_dir == nullptr) {
        return path; // no home directory known — leave the path untouched
    }
    return std::string(home_dir) + path.substr(1);
}
// Split a comma-separated string into its non-empty fields.
// Empty fields (",,", leading/trailing commas) are silently dropped.
static std::vector<std::string> split_csv(const std::string & s) {
    std::vector<std::string> fields;
    size_t start = 0;
    while (start <= s.size()) {
        size_t comma = s.find(',', start);
        if (comma == std::string::npos) {
            comma = s.size();
        }
        if (comma > start) {
            fields.push_back(s.substr(start, comma - start));
        }
        start = comma + 1;
    }
    return fields;
}
// Tensors whose names contain these substrings use MUL_MAT_ID (sparse MoE expert dispatch)
// which has no backward implementation — exclude them from LoRA targets unconditionally.
static const std::vector<std::string> EXCLUDED_SUBSTRINGS = {
    "_exps", // MoE expert weight stacks (ffn_gate_exps, ffn_up_exps, ffn_down_exps, ffn_gate_up_exps)
};
// True when the tensor name contains any of the hard-excluded substrings above.
static bool tensor_is_excluded(const char * name) {
    const std::string tensor_name(name);
    return std::any_of(EXCLUDED_SUBSTRINGS.begin(), EXCLUDED_SUBSTRINGS.end(),
                       [&tensor_name](const std::string & pattern) {
                           return tensor_name.find(pattern) != std::string::npos;
                       });
}
// Extract the transformer block index from a tensor name of the form "blk.NN.<rest>".
// Returns -1 if the name does not follow this pattern.
static int tensor_layer_index(const char * name) {
    // Per-layer tensors in llama.cpp GGUF files are named "blk.<N>.<suffix>".
    const char * marker = strstr(name, "blk.");
    if (marker == nullptr) {
        return -1;
    }
    const char * digits = marker + 4; // first character after "blk."
    char * parse_end = nullptr;
    const long layer = strtol(digits, &parse_end, 10);
    // Valid only if at least one digit was consumed and the number is
    // terminated by '.' or end-of-string.
    const bool parsed_ok = parse_end != digits && (*parse_end == '.' || *parse_end == '\0');
    return parsed_ok ? (int) layer : -1;
}
// Decide whether a base-model tensor should receive a LoRA adapter:
// it must not be hard-excluded (MoE expert stacks), must not fall inside
// the frozen leading-layer range, and must contain one of the target
// substrings. freeze_layers == 0 disables layer freezing.
static bool tensor_matches_targets(const char * name, const std::vector<std::string> & targets,
                                   int freeze_layers = 0) {
    if (tensor_is_excluded(name)) return false;
    if (freeze_layers > 0) {
        const int layer = tensor_layer_index(name);
        // layer == -1 (not a per-layer tensor) is never frozen here
        if (layer >= 0 && layer < freeze_layers) return false;
    }
    // Build the std::string once instead of once per target (the original
    // constructed a fresh std::string on every loop iteration).
    const std::string n(name);
    for (const auto & t : targets) {
        if (n.find(t) != std::string::npos) return true;
    }
    return false;
}
// ---------------------------------------------------------------------------
// JSONL dataset loading
// ---------------------------------------------------------------------------
struct training_sample {
std::vector<llama_token> tokens; // full token sequence
std::vector<bool> is_label; // true for tokens that contribute to loss
float reward; // reward/score weight (1.0 = neutral, 0.0 = ignore)
};
// Render messages with a minimal ChatML layout. Used only as a fallback
// when the model ships no chat template of its own.
static std::string apply_chatml(const std::vector<common_chat_msg> & msgs) {
    std::string rendered;
    for (const auto & msg : msgs) {
        rendered += "<|im_start|>" + msg.role + "\n";
        // content_parts is a vector of text parts; concatenate them into one string
        std::string body;
        for (const auto & part : msg.content_parts) {
            body += part.text;
        }
        rendered += body + "<|im_end|>\n";
    }
    return rendered;
}
// Load a JSONL training file into tokenized samples.
//
// Accepted per-line formats:
//   {"messages": [...]}                  — chat; rendered with the model's
//                                          chat template (ChatML fallback)
//   {"prompt": "...", "response": "..."} — plain prompt/response pair
//   {"text": "..."}                      — raw text; all tokens get loss
// An optional "reward" (or "score") field weights the sample; default 1.0.
// Only response/assistant tokens are marked is_label=true (loss-bearing);
// prompt tokens carry no loss. Malformed lines are skipped with a warning.
// Returns {} when the file cannot be opened.
static std::vector<training_sample> load_jsonl(
    const std::string & path,
    llama_context * ctx,
    common_chat_templates * tmpls) {
    std::ifstream f(path);
    if (!f.is_open()) {
        LOG_ERR("%s: cannot open %s\n", __func__, path.c_str());
        return {};
    }
    std::vector<training_sample> samples;
    std::string line;
    int lineno = 0;
    while (std::getline(f, line)) {
        ++lineno;
        if (line.empty()) continue;
        nlohmann::json j;
        try { j = nlohmann::json::parse(line); }
        catch (...) {
            LOG_WRN("%s: skipping invalid JSON on line %d\n", __func__, lineno);
            continue;
        }
        // Optional per-sample weight: "reward" takes precedence over "score".
        float reward = 1.0f;
        if (j.contains("reward")) reward = j["reward"].get<float>();
        else if (j.contains("score")) reward = j["score"].get<float>();
        std::string prompt_text;
        std::string response_text;
        if (j.contains("messages")) {
            // chat format — apply template
            std::vector<common_chat_msg> msgs;
            for (const auto & m : j["messages"]) {
                common_chat_msg msg;
                msg.role = m.value("role", "user");
                common_chat_msg_content_part part;
                part.type = "text";
                part.text = m.value("content", "");
                msg.content_parts.push_back(part);
                msgs.push_back(msg);
            }
            // Skip samples where the last assistant turn contains an error marker.
            // These are malformed/failed generations that should not be trained on.
            // NOTE(review): the filtering below is commented out, so this scope
            // currently computes last_assistant_content and discards it.
            {
                std::string last_assistant_content;
                for (int mi = (int)msgs.size() - 1; mi >= 0; --mi) {
                    if (msgs[mi].role == "assistant") {
                        last_assistant_content = msgs[mi].content_parts.empty()
                            ? "" : msgs[mi].content_parts[0].text;
                        break;
                    }
                }
                // // this should be done on the python side...
                // if (last_assistant_content.find("Error:") != std::string::npos ||
                // last_assistant_content.find("error:") != std::string::npos) {
                // LOG_DBG("%s: skipping line %d — assistant response contains error marker\n", __func__, lineno);
                // continue;
                // }
            }
            // Split into prompt (no loss) + last assistant response (loss).
            // Render all messages except the last assistant turn as the prompt
            // (with add_generation_prompt=true so the template adds the assistant
            // prefix), then use the raw last assistant content as response_text.
            // This ensures only the assistant's response tokens get loss, not the
            // user turns or system prompt.
            if (msgs.empty()) continue;
            std::string last_assistant_content;
            std::vector<common_chat_msg> prompt_msgs;
            // Find the last assistant message
            int last_asst_idx = -1;
            for (int mi = (int)msgs.size() - 1; mi >= 0; --mi) {
                if (msgs[mi].role == "assistant") { last_asst_idx = mi; break; }
            }
            if (last_asst_idx < 0) {
                // No assistant turn — skip; nothing to train on
                LOG_DBG("%s: skipping line %d — no assistant turn\n", __func__, lineno);
                continue;
            }
            last_assistant_content = msgs[last_asst_idx].content_parts.empty()
                ? "" : msgs[last_asst_idx].content_parts[0].text;
            for (int mi = 0; mi < last_asst_idx; ++mi) prompt_msgs.push_back(msgs[mi]);
            if (tmpls) {
                common_chat_templates_inputs inp;
                inp.messages = prompt_msgs;
                inp.add_generation_prompt = true;
                prompt_text = common_chat_templates_apply(tmpls, inp).prompt;
                response_text = last_assistant_content;
            } else {
                // Fallback: render everything as ChatML, use full text as response
                // (prompt_text stays empty → every token becomes loss-bearing).
                std::vector<common_chat_msg> all_msgs = prompt_msgs;
                all_msgs.push_back(msgs[last_asst_idx]);
                prompt_text = "";
                response_text = apply_chatml(all_msgs);
            }
        } else if (j.contains("prompt") && j.contains("response")) {
            response_text = j["response"].get<std::string>();
            // // this should be done on the python side...
            // if (response_text.find("Error:") != std::string::npos ||
            // response_text.find("error:") != std::string::npos) {
            // LOG_DBG("%s: skipping line %d — response contains error marker\n", __func__, lineno);
            // continue;
            // }
            prompt_text = j["prompt"].get<std::string>();
        } else if (j.contains("text")) {
            response_text = j["text"].get<std::string>();
        } else {
            LOG_WRN("%s: unknown format on line %d, skipping\n", __func__, lineno);
            continue;
        }
        // Tokenize: prompt (no loss, with BOS/special tokens) + response (loss)
        auto tok_prompt = common_tokenize(ctx, prompt_text, /*add_special=*/true);
        auto tok_response = common_tokenize(ctx, response_text, /*add_special=*/false);
        if (tok_prompt.empty() && tok_response.empty()) continue;
        training_sample s;
        s.reward = reward;
        s.tokens.insert(s.tokens.end(), tok_prompt.begin(), tok_prompt.end());
        s.tokens.insert(s.tokens.end(), tok_response.begin(), tok_response.end());
        s.is_label.resize(s.tokens.size(), false);
        // Only response tokens contribute to the loss
        for (size_t i = tok_prompt.size(); i < s.tokens.size(); ++i) {
            s.is_label[i] = true;
        }
        samples.push_back(std::move(s));
    }
    LOG_INF("%s: loaded %zu samples from %s\n", __func__, samples.size(), path.c_str());
    return samples;
}
// Pack variable-length samples into fixed-context-length windows and create
// an ggml_opt_dataset. Labels for prompt tokens are set to -1 (ignored by
// the loss in the epoch loop).
// window_rewards is filled with one reward weight per window (averaged over
// the sample tokens that fall in that window). If all samples have reward=1.0
// the vector is all-ones and has no effect.
// Returns nullptr when the flattened stream has fewer than n_ctx tokens.
static ggml_opt_dataset_t build_dataset(
    const std::vector<training_sample> & samples,
    int32_t n_ctx,
    std::vector<float> & window_rewards,
    bool train_on_prompt = false,
    llama_token bos_token = -1) {
    // Flatten samples into token/label/reward streams
    std::vector<llama_token> flat_tokens;
    std::vector<int32_t> flat_labels; // -1 = no loss, token_id = loss target
    std::vector<float> flat_rewards; // per-token reward from the source sample
    for (size_t si = 0; si < samples.size(); ++si) {
        const auto & s = samples[si];
        // Insert BOS separator between samples to prevent cross-sample predictions.
        // The first sample already has BOS from tokenization (add_special=true).
        if (si > 0 && bos_token >= 0 && !s.tokens.empty()) {
            flat_tokens .push_back(bos_token);
            flat_labels .push_back(-1); // no loss on separator
            flat_rewards.push_back(s.reward);
        }
        // i + 1 < size(): the last token of a sample has no next-token target
        // and is therefore dropped from the stream.
        for (size_t i = 0; i + 1 < s.tokens.size(); ++i) {
            flat_tokens .push_back(s.tokens[i]);
            if (train_on_prompt) {
                // All positions get correct next-token label (prompt + response)
                flat_labels.push_back((int32_t)s.tokens[i + 1]);
            } else {
                // Only response positions get loss; prompt positions get -1 (sentinel).
                // The sentinel is passed through to labels_sparse; opt_epoch_iter skips
                // writing to the label tensor for those positions, leaving them zeroed →
                // zero cross-entropy contribution. No gradient flows from prompt tokens.
                flat_labels.push_back(s.is_label[i + 1] ? (int32_t)s.tokens[i + 1] : -1);
            }
            flat_rewards.push_back(s.reward);
        }
    }
    if ((int64_t)flat_tokens.size() < n_ctx) {
        LOG_ERR("%s: dataset too small (%zu tokens) for context %d\n",
            __func__, flat_tokens.size(), n_ctx);
        return nullptr;
    }
    // Overlapping windows: consecutive windows share half of their tokens.
    const int64_t stride = n_ctx / 2;
    int64_t ndata = ((int64_t)flat_tokens.size() - n_ctx) / stride;
    if (ndata < 1) ndata = 1; // at least one window when flat_tokens >= n_ctx
    window_rewards.resize(ndata);
    ggml_opt_dataset_t dataset = ggml_opt_dataset_init(
        GGML_TYPE_I32, GGML_TYPE_I32, n_ctx, n_ctx, ndata, 1);
    int32_t * data = (int32_t *) ggml_opt_dataset_data (dataset)->data;
    int32_t * labels = (int32_t *) ggml_opt_dataset_labels(dataset)->data;
    for (int64_t i = 0; i < ndata; ++i) {
        const int64_t off = i * stride;
        float reward_sum = 0.0f;
        for (int32_t j = 0; j < n_ctx; ++j) {
            data [i * n_ctx + j] = flat_tokens[off + j];
            // Pass -1 sentinel through unchanged for masked (prompt) positions.
            // opt_epoch_iter skips these positions (no label tensor write → zero
            // cross-entropy contribution). Do NOT substitute the current token
            // here — that trains the model to predict itself (off-by-one) and
            // causes repetition degeneration.
            labels[i * n_ctx + j] = flat_labels[off + j];
            reward_sum += flat_rewards[off + j];
        }
        // Window weight = mean per-token reward over the window.
        window_rewards[i] = reward_sum / n_ctx;
    }
    // Normalize window rewards to [0, 1].
    // Step 1: clip to [-1, 1] — outliers like 1.3/1.4 would otherwise compress the
    // useful signal range after min-max scaling (a reward=1.0 would map to
    // only 0.83 instead of 1.0 if the max is 1.4).
    // Step 2: min-max scale clipped values → [0, 1].
    // min → 0.0 (window ignored), max → 1.0 (full weight).
    // If all rewards are identical (pure SFT dataset) keep at 1.0.
    for (float & r : window_rewards) {
        r = std::max(-1.0f, std::min(1.0f, r));
    }
    float rmin = *std::min_element(window_rewards.begin(), window_rewards.end());
    float rmax = *std::max_element(window_rewards.begin(), window_rewards.end());
    const float rrange = rmax - rmin;
    if (rrange > 1e-6f) {
        for (float & r : window_rewards) {
            r = (r - rmin) / rrange;
        }
        LOG_INF("%s: reward range [%.4f, %.4f] (after clip to [-1,1]) → normalized to [0, 1]\n", __func__, rmin, rmax);
    } else {
        std::fill(window_rewards.begin(), window_rewards.end(), 1.0f);
    }
    return dataset;
}
// ---------------------------------------------------------------------------
// LoRA tensor allocation
// ---------------------------------------------------------------------------
// Container for the trainable LoRA weights: a dedicated ggml context that
// owns the tensor metadata, the backend buffer that owns the tensor data,
// and a lookup from base-model tensor name to its adapter pair.
struct lora_tensors {
    struct ggml_context * ctx = nullptr;        // owns tensor metadata; freed by the caller
    struct ggml_backend_buffer * buf = nullptr; // backing storage for all A/B tensors
    // map: base tensor name → {lora_a, lora_b}
    std::unordered_map<std::string, std::pair<ggml_tensor*, ggml_tensor*>> ab;
};
// Discover the 2-D weight matrices matching `targets` in the model GGUF and
// allocate one F32 {lora_a, lora_b} pair per match on the CPU backend.
// A is initialized ~ N(0, 1/sqrt(rank)) and B is zeroed, so the freshly
// created adapter is an exact no-op (B·A·x == 0) until training updates it.
// On failure returns a default lora_tensors (ctx == nullptr, empty map).
static lora_tensors alloc_lora_tensors(
    const std::string & model_path,
    const std::vector<std::string> & targets,
    int32_t rank,
    std::mt19937 & rng,
    int32_t freeze_layers = 0) {
    lora_tensors lt;
    // Open the model GGUF to discover tensor names and shapes
    // without needing access to private llama_model internals.
    struct ggml_context * ctx_meta = nullptr;
    struct gguf_init_params gguf_params = { /*.no_alloc=*/true, /*.ctx=*/&ctx_meta };
    struct gguf_context * ctx_gguf = gguf_init_from_file(model_path.c_str(), gguf_params);
    if (!ctx_gguf) {
        LOG_ERR("%s: failed to open model GGUF for tensor discovery: %s\n",
            __func__, model_path.c_str());
        return lt;
    }
    // Collect matching 2-D tensors
    struct tensor_info { std::string name; int64_t ne0, ne1; };
    std::vector<tensor_info> matched;
    for (ggml_tensor * t = ggml_get_first_tensor(ctx_meta);
         t; t = ggml_get_next_tensor(ctx_meta, t)) {
        if (ggml_n_dims(t) < 2) continue; // skip 1-D tensors (norms, biases)
        if (!tensor_matches_targets(t->name, targets, freeze_layers)) continue;
        matched.push_back({t->name, t->ne[0], t->ne[1]});
    }
    gguf_free(ctx_gguf);
    ggml_free(ctx_meta);
    if (matched.empty()) {
        LOG_ERR("%s: no model tensors matched --lora-targets; check spelling\n", __func__);
        return lt;
    }
    if (freeze_layers > 0) {
        LOG_INF("%s: freezing layers blk.0 .. blk.%d (no LoRA allocated; backward already pruned by grads_needed)\n",
            __func__, freeze_layers - 1);
    }
    LOG_INF("%s: allocating LoRA A/B tensors for %zu weight matrices, rank=%d\n",
        __func__, matched.size(), rank);
    // Allocate ggml context for A+B tensors (2 tensors per matched weight)
    const size_t mem = (2 * matched.size() + 16) * ggml_tensor_overhead();
    struct ggml_init_params ip = { mem, nullptr, /*no_alloc=*/true };
    lt.ctx = ggml_init(ip);
    for (const auto & ti : matched) {
        const int64_t in_dim = ti.ne0; // columns (input features)
        const int64_t out_dim = ti.ne1; // rows (output features)
        // lora_a: [in_dim, rank] applied first: a @ x
        // lora_b: [rank, out_dim] applied second: b @ (a @ x)
        // Convention matches llama-adapter.cpp:48-60:
        // a->ne[0] == in_dim, a->ne[1] == rank
        // b->ne[0] == rank, b->ne[1] == out_dim
        ggml_tensor * la = ggml_new_tensor_2d(lt.ctx, GGML_TYPE_F32, in_dim, rank);
        ggml_tensor * lb = ggml_new_tensor_2d(lt.ctx, GGML_TYPE_F32, rank, out_dim);
        ggml_set_name(la, (ti.name + ".lora_a").c_str());
        ggml_set_name(lb, (ti.name + ".lora_b").c_str());
        lt.ab[ti.name] = {la, lb};
    }
    // Allocate backend buffer for all LoRA tensors at once.
    // NOTE(review): the result is not checked for nullptr — a failed backend
    // allocation would leave tensor data pointers null below; confirm upstream
    // sizing guarantees before relying on this in low-memory settings.
    lt.buf = ggml_backend_alloc_ctx_tensors_from_buft(lt.ctx, ggml_backend_cpu_buffer_type());
    // Initialize: A ~ N(0, 1/sqrt(rank)), B = 0
    const float std_a = 1.0f / std::sqrt((float)rank);
    std::normal_distribution<float> dist(0.0f, std_a);
    for (auto & kv : lt.ab) {
        ggml_tensor * la = kv.second.first;
        ggml_tensor * lb = kv.second.second;
        // Fill A
        float * data_a = (float *) la->data;
        for (int64_t i = 0; i < ggml_nelements(la); ++i) data_a[i] = dist(rng);
        // Zero B
        memset(lb->data, 0, ggml_nbytes(lb));
    }
    return lt;
}
// ---------------------------------------------------------------------------
// Param filter: only train lora_a / lora_b tensors
// ---------------------------------------------------------------------------
// llama_opt_init parameter filter: a tensor is trainable iff its name ends
// with ".lora_a" or ".lora_b". Everything else (the quantized base weights)
// stays frozen.
static bool lora_param_filter(const struct ggml_tensor * t, void * /*ud*/) {
    const std::string name(t->name);
    const auto ends_with = [&name](const char * suffix) {
        const size_t slen = strlen(suffix);
        return name.size() > slen &&
               name.compare(name.size() - slen, slen, suffix) == 0;
    };
    return ends_with(".lora_a") || ends_with(".lora_b");
}
// ---------------------------------------------------------------------------
// Save adapter GGUF
// ---------------------------------------------------------------------------
// Return the final path component — the text after the last '/' or '\\'
// separator — or the whole string when it contains no separator.
static std::string basename_from_path(const std::string & p) {
    const size_t sep = p.find_last_of("/\\");
    return sep == std::string::npos ? p : p.substr(sep + 1);
}
// Serialize the trained LoRA A/B tensors to `out_path` as a GGUF adapter
// loadable by llama_adapter_lora_init(). Write strategy: emit a zeroed
// metadata placeholder, stream the tensor data (copied to host memory in
// case it lives on a GPU), then seek back and rewrite the real metadata.
static void save_adapter(
    const lora_tensors & lt,
    const std::string & out_path,
    const std::string & arch,
    float alpha,
    const std::string & base_model_path) {
    // Build output GGUF context
    struct gguf_context * gctx = gguf_init_empty();
    // Metadata required by llama_adapter_lora_init
    gguf_set_val_str(gctx, "general.type", "adapter");
    gguf_set_val_str(gctx, "general.architecture", arch.c_str());
    gguf_set_val_str(gctx, "adapter.type", "lora");
    gguf_set_val_f32(gctx, "adapter.lora.alpha", alpha);
    gguf_set_val_str(gctx, "adapter.base_model", basename_from_path(base_model_path).c_str());
    // Register tensors
    for (const auto & kv : lt.ab) {
        gguf_add_tensor(gctx, kv.second.first); // lora_a
        gguf_add_tensor(gctx, kv.second.second); // lora_b
    }
    // Write: meta placeholder → tensor data → rewrite meta
    const std::string real_path = expand_tilde(out_path);
    std::ofstream fout(real_path, std::ios::binary);
    if (!fout.is_open()) {
        LOG_ERR("%s: cannot open %s for writing\n", __func__, real_path.c_str());
        gguf_free(gctx);
        return;
    }
    // Write meta placeholder
    const size_t meta_size = gguf_get_meta_size(gctx);
    std::vector<char> zeros_buf(meta_size, 0);
    fout.write(zeros_buf.data(), meta_size);
    // Write tensor data — copy to CPU first in case tensors live on GPU.
    // This loop iterates lt.ab in the same (unmutated) order as the
    // registration loop above, so data order matches the GGUF tensor index.
    for (const auto & kv : lt.ab) {
        for (ggml_tensor * t : {kv.second.first, kv.second.second}) {
            const size_t nb = ggml_nbytes(t);
            std::vector<char> cpu_buf(nb);
            ggml_backend_tensor_get(t, cpu_buf.data(), 0, nb);
            fout.write(cpu_buf.data(), nb);
            // GGUF tensors are 32-byte aligned
            const size_t pad = GGML_PAD(nb, 32) - nb;
            if (pad > 0) {
                std::vector<char> pad_buf(pad, 0);
                fout.write(pad_buf.data(), pad);
            }
        }
    }
    // Re-write metadata at offset 0.
    // NOTE(review): stream write errors are never checked — a full disk
    // would silently produce a truncated adapter; consider checking fout.
    std::vector<uint8_t> meta(meta_size);
    gguf_get_meta_data(gctx, meta.data());
    fout.seekp(0);
    fout.write((const char *) meta.data(), meta_size);
    fout.close();
    gguf_free(gctx);
    LOG_INF("%s: adapter saved to %s\n", __func__, real_path.c_str());
}
// ---------------------------------------------------------------------------
// Periodic checkpoint callback
// ---------------------------------------------------------------------------
// State shared with save_every_callback for periodic checkpointing while
// llama_opt_epoch runs. All pointers reference objects owned by the caller
// and must outlive the epoch.
struct save_ctx {
    const lora_tensors * lt;             // trained LoRA tensors to serialize
    const std::string * lora_out;        // base output path for checkpoint files
    const std::string * arch;            // architecture string for GGUF metadata
    const std::string * base_model_path; // recorded as adapter.base_model
    float lora_alpha;                    // adapter.lora.alpha metadata value
    int32_t save_every; // 0 = disabled
    int32_t ubatch_per_ctx; // micro-batches per context window (window arithmetic)
    int64_t last_saved; // last window index at which we saved
};
// TLS pointer set before each epoch so the static callback can access it.
static thread_local save_ctx * g_save_ctx = nullptr;
// Epoch callback passed to llama_opt_epoch: draws the standard progress bar,
// prints the running loss at the end of every context window, and — when
// save_ctx::save_every > 0 — writes a checkpoint adapter every `save_every`
// windows during training. No-op beyond the progress bar when g_save_ctx
// is unset or when evaluating (train == false).
static void save_every_callback(
    bool train,
    ggml_opt_context_t opt_ctx,
    ggml_opt_dataset_t dataset,
    ggml_opt_result_t result,
    int64_t ibatch,
    int64_t ibatch_max,
    int64_t t_start_us) {
    ggml_opt_epoch_callback_progress_bar(train, opt_ctx, dataset, result, ibatch, ibatch_max, t_start_us);
    // Log loss at every window boundary so we can see if/when it diverges.
    if (train && g_save_ctx) {
        const int64_t window = ibatch / g_save_ctx->ubatch_per_ctx;
        const int64_t ubatch_in_window = ibatch % g_save_ctx->ubatch_per_ctx;
        // Last micro-batch of the window → report the accumulated loss.
        if (ubatch_in_window == g_save_ctx->ubatch_per_ctx - 1) {
            double loss = 0.0, loss_unc = 0.0;
            ggml_opt_result_loss(result, &loss, &loss_unc);
            fprintf(stderr, "\n[window %4ld] loss=%.4f ± %.4f\n", (long)window, loss, loss_unc);
        }
    }
    if (!train || !g_save_ctx || g_save_ctx->save_every <= 0) return;
    const int64_t window = ibatch / g_save_ctx->ubatch_per_ctx;
    // The callback fires once per micro-batch; last_saved guards against
    // saving the same window more than once.
    if (window > 0 && window != g_save_ctx->last_saved && window % g_save_ctx->save_every == 0) {
        g_save_ctx->last_saved = window;
        const std::string ckpt = *g_save_ctx->lora_out + ".ckpt" + std::to_string(window) + ".gguf";
        save_adapter(*g_save_ctx->lt, ckpt, *g_save_ctx->arch, g_save_ctx->lora_alpha, *g_save_ctx->base_model_path);
        fprintf(stderr, "\n");
        LOG_INF("save_every_callback: checkpoint saved -> %s (window %ld)\n", ckpt.c_str(), (long)window);
    }
}
// ---------------------------------------------------------------------------
// IPC helpers (stdout protocol, stdin commands)
// ---------------------------------------------------------------------------
// Escape backslashes and line breaks so a string survives the line-oriented
// IPC protocol intact. Mirrors _escape() in gguf_trainer.py.
static std::string ipc_escape(const std::string & s) {
    std::string escaped;
    escaped.reserve(s.size());
    for (const char c : s) {
        switch (c) {
            case '\\': escaped += "\\\\"; break;
            case '\n': escaped += "\\n";  break;
            case '\r': escaped += "\\r";  break;
            default:   escaped += c;      break;
        }
    }
    return escaped;
}
// Write one protocol message plus newline to stdout and flush immediately
// so the Python side sees it without buffering delays.
static void ipc_emit(const char * msg) {
    printf("%s\n", msg);
    fflush(stdout);
}
// Read one command line from stdin into `out`, stripping a trailing '\r'
// (Windows line endings). Returns false on EOF or stream error.
static bool ipc_read_line(std::string & out) {
    out.clear();
    const bool ok = static_cast<bool>(std::getline(std::cin, out));
    if (ok && !out.empty() && out.back() == '\r') {
        out.pop_back(); // drop the CR left by CRLF input
    }
    return ok;
}
// Parse a "REWARD r1 r2 ... rN" command into a float vector.
// Returns an empty vector when the line does not begin with "REWARD "
// followed by at least one character.
static std::vector<float> ipc_parse_rewards(const std::string & line) {
    constexpr const char * prefix = "REWARD ";
    constexpr size_t prefix_len = 7;
    std::vector<float> values;
    if (line.size() <= prefix_len || line.compare(0, prefix_len, prefix) != 0) {
        return values;
    }
    std::istringstream stream(line.substr(prefix_len));
    float value;
    while (stream >> value) {
        values.push_back(value);
    }
    return values;
}
// ---------------------------------------------------------------------------
// Greedy / temperature sampling for GRPO rollout generation
// ---------------------------------------------------------------------------
// Generate one response for `prompt` using greedy decoding (temperature
// <= 0) or softmax temperature sampling. The context's memory/KV state is
// cleared first, so each call is an independent rollout. Generation stops
// at EOS, EOT, <|im_end|> (only after at least one piece was emitted), or
// after max_tokens tokens. Returns the generated text, or "" when the
// prompt tokenizes empty or the prompt decode fails.
// Fix vs. previous revision: removed the unused local `nl`
// (llama_vocab_nl was fetched but its result never read).
static std::string generate_response(
    llama_context * ctx,
    llama_model * model,
    const std::string & prompt,
    int32_t max_tokens,
    float temperature,
    std::mt19937 & rng) {
    const llama_vocab * vocab = llama_model_get_vocab(model);
    auto tokens = common_tokenize(ctx, prompt, /*add_special=*/true);
    if (tokens.empty()) return "";
    // Clear KV cache before each generation (don't carry over previous prompt state)
    llama_memory_clear(llama_get_memory(ctx), true);
    {
        llama_batch batch = llama_batch_get_one(tokens.data(), (int32_t)tokens.size());
        if (llama_decode(ctx, batch) != 0) {
            LOG_ERR("%s: llama_decode failed on prompt\n", __func__);
            return "";
        }
    }
    std::string output;
    const llama_token eos = llama_vocab_eos(vocab);
    // For ChatML models <|im_end|> is the turn-end marker but may not be the
    // vocab EOS token. Look it up by tokenizing the string and taking the
    // first token if it tokenizes to exactly one piece.
    llama_token im_end = -1;
    {
        std::vector<llama_token> im_end_tokens(8);
        static const char im_end_str[] = "<|im_end|>";
        int n = llama_tokenize(vocab, im_end_str, (int32_t)strlen(im_end_str), im_end_tokens.data(), (int32_t)im_end_tokens.size(), /*add_special=*/false, /*parse_special=*/true);
        if (n == 1) im_end = im_end_tokens[0];
    }
    const llama_token eot = llama_vocab_eot(vocab); // may equal eos on some models
    for (int32_t i = 0; i < max_tokens; ++i) {
        // Sample next token — use ith=-1 to always get the LAST output position's
        // logits. llama_get_logits(ctx) returns position 0 which is wrong when the
        // prompt batch has multiple output tokens (training context).
        float * logits = llama_get_logits_ith(ctx, -1);
        if (!logits) {
            LOG_ERR("%s: llama_get_logits_ith(-1) returned NULL\n", __func__);
            break;
        }
        const int32_t n_vocab = llama_vocab_n_tokens(vocab);
        llama_token next_token;
        if (temperature <= 0.0f) {
            // Greedy
            next_token = (llama_token)(std::max_element(logits, logits + n_vocab) - logits);
        } else {
            // Temperature sampling via softmax + categorical draw
            std::vector<float> probs(n_vocab);
            // Subtract the max logit before exp() for numerical stability.
            float max_logit = *std::max_element(logits, logits + n_vocab);
            float sum = 0.0f;
            for (int32_t k = 0; k < n_vocab; ++k) {
                probs[k] = std::exp((logits[k] - max_logit) / temperature);
                sum += probs[k];
            }
            for (float & p : probs) p /= sum;
            std::discrete_distribution<int32_t> dist(probs.begin(), probs.end());
            next_token = dist(rng);
        }
        if (next_token == eos) break;
        if (next_token == eot) break;
        // !output.empty(): a leading <|im_end|> must not end the rollout
        // with an empty response.
        if (im_end >= 0 && next_token == im_end && !output.empty()) break;
        // Decode token to text
        char buf[256] = {};
        llama_token_to_piece(vocab, next_token, buf, sizeof(buf) - 1, 0, true);
        output += buf;
        // Feed token back for next step
        llama_batch batch = llama_batch_get_one(&next_token, 1);
        if (llama_decode(ctx, batch) != 0) break;
    }
    return output;
}
// ---------------------------------------------------------------------------
// GRPO IPC training loop
// ---------------------------------------------------------------------------
// Volatile flag set by SIGINT so the GRPO loop can exit cleanly (and still
// save the final adapter). sig_atomic_t is the only type guaranteed safe
// to write from a signal handler.
static volatile sig_atomic_t g_grpo_stop = 0;
static void grpo_sigint_handler(int) { g_grpo_stop = 1; }
// GRPO-style training loop driven over a line-oriented stdin/stdout IPC
// protocol (see gguf_trainer.py on the Python side). Each step:
//   1. request a prompt              → emits [QLORA:PROMPT_REQ:<step>]
//   2. generate n_gen rollouts       → emits [QLORA:GEN:k/n] <escaped text>
//   3. request one reward per rollout→ emits [QLORA:REWARD_REQ:<n>]
//   4. build a one-step dataset (reward-weighted) and take one optimizer pass
// The loop ends after n_steps, on a "STOP" command, on stdin EOF, or on
// SIGINT. The final adapter is always saved before returning. Returns 0 on
// success, 1 on a protocol error.
static int run_grpo_mode(
    common_params & params,
    llama_model * model,
    llama_context * ctx,
    lora_tensors & lt,
    const std::string & arch,
    float lora_alpha,
    const std::string & base_model_path) {
    const int32_t n_ctx = llama_n_ctx(ctx);
    const int32_t n_gen = params.grpo_n_gen;
    const int32_t n_steps = params.grpo_n_steps;
    const float temp = params.grpo_temperature;
    const int32_t max_tok = params.grpo_max_tokens;
    // Deterministic fallback seed (42) when the user did not set one.
    std::mt19937 rng(params.sampling.seed != LLAMA_DEFAULT_SEED
        ? params.sampling.seed : 42);
    // Initialize optimizer — only .lora_a/.lora_b tensors are trainable.
    struct llama_opt_params lopt_params {
        /*.n_ctx_train =*/0,
        /*.param_filter =*/lora_param_filter,
        /*.param_filter_ud =*/nullptr,
        /*.get_opt_pars =*/common_opt_lr_pars,
        /*.get_opt_pars_ud =*/&params.lr,
        /*.optimizer_type =*/params.optimizer,
        /*.grad_checkpoint_interval =*/params.grad_checkpoint_interval,
    };
    llama_opt_init(ctx, model, lopt_params);
    const llama_token bos = llama_vocab_bos(llama_model_get_vocab(model));
    signal(SIGINT, grpo_sigint_handler);
    // Signal Python that we are ready
    ipc_emit("[QLORA:READY]");
    float last_loss = 0.0f;
    int step = 0;
    while (step < n_steps && !g_grpo_stop) {
        // ── Request prompt ────────────────────────────────────────────────
        {
            char buf[64];
            snprintf(buf, sizeof(buf), "[QLORA:PROMPT_REQ:%d]", step + 1);
            ipc_emit(buf);
        }
        std::string prompt_line;
        if (!ipc_read_line(prompt_line)) break;
        if (prompt_line == "STOP") {
            LOG_INF("grpo: received STOP from Python\n");
            break;
        }
        if (prompt_line.size() < 8 || prompt_line.substr(0, 7) != "PROMPT ") {
            char buf[128];
            snprintf(buf, sizeof(buf), "[QLORA:ERROR] expected PROMPT, got: %.80s", prompt_line.c_str());
            ipc_emit(buf);
            return 1;
        }
        // Unescape the prompt (\\n → \n etc.) — inverse of ipc_escape
        std::string prompt;
        {
            const std::string esc = prompt_line.substr(7);
            prompt.reserve(esc.size());
            for (size_t i = 0; i < esc.size(); ++i) {
                if (esc[i] == '\\' && i + 1 < esc.size()) {
                    char next = esc[i+1];
                    if (next == 'n') { prompt += '\n'; ++i; }
                    else if (next == 'r') { prompt += '\r'; ++i; }
                    else if (next == '\\') { prompt += '\\'; ++i; }
                    else { prompt += esc[i]; }
                } else {
                    prompt += esc[i];
                }
            }
        }
        // ── Generate N responses ──────────────────────────────────────────
        std::vector<std::string> generations(n_gen);
        for (int k = 0; k < n_gen; ++k) {
            generations[k] = generate_response(ctx, model, prompt, max_tok, temp, rng);
            char hdr[64];
            snprintf(hdr, sizeof(hdr), "[QLORA:GEN:%d/%d] ", k + 1, n_gen);
            std::string msg = std::string(hdr) + ipc_escape(generations[k]);
            ipc_emit(msg.c_str());
        }
        // ── Request rewards ───────────────────────────────────────────────
        {
            char buf[64];
            snprintf(buf, sizeof(buf), "[QLORA:REWARD_REQ:%d]", n_gen);
            ipc_emit(buf);
        }
        std::string reward_line;
        if (!ipc_read_line(reward_line)) break;
        if (reward_line == "STOP") {
            LOG_INF("grpo: received STOP from Python\n");
            break;
        }
        std::vector<float> rewards = ipc_parse_rewards(reward_line);
        if ((int32_t)rewards.size() != n_gen) {
            char buf[128];
            snprintf(buf, sizeof(buf), "[QLORA:ERROR] expected %d rewards, got %zu", n_gen, rewards.size());
            ipc_emit(buf);
            return 1;
        }
        // ── Build single-step mini-dataset: prompt+generations with rewards ─
        // Each generation is a separate sample; prompt = no-loss, generation = loss.
        std::vector<training_sample> step_samples;
        step_samples.reserve(n_gen);
        for (int k = 0; k < n_gen; ++k) {
            training_sample s;
            s.reward = rewards[k];
            auto tok_prompt = common_tokenize(ctx, prompt, /*add_special=*/true);
            auto tok_gen = common_tokenize(ctx, generations[k], /*add_special=*/false);
            s.tokens.insert(s.tokens.end(), tok_prompt.begin(), tok_prompt.end());
            s.tokens.insert(s.tokens.end(), tok_gen.begin(), tok_gen.end());
            s.is_label.resize(s.tokens.size(), false);
            for (size_t i = tok_prompt.size(); i < s.tokens.size(); ++i) {
                s.is_label[i] = true;
            }
            step_samples.push_back(std::move(s));
        }
        // Ensure minimum token count for one context window.
        // build_dataset drops the last token per sample during flattening,
        // so we need total raw tokens > n_ctx to guarantee ndata >= 1.
        // NOTE(review): assumes the prompt always tokenizes non-empty — if
        // every sample had zero tokens this duplication loop would not
        // terminate; confirm the PROMPT command can never be empty.
        while (true) {
            size_t total = 0;
            for (const auto & s : step_samples) total += s.tokens.size();
            if ((int64_t)total > n_ctx + (int64_t)step_samples.size()) break;
            step_samples.push_back(step_samples.back());
        }
        std::vector<float> window_rewards;
        ggml_opt_dataset_t step_dataset = build_dataset(
            step_samples, n_ctx, window_rewards, /*train_on_prompt=*/false, bos);
        if (!step_dataset) {
            ipc_emit("[QLORA:ERROR] build_dataset failed for step");
            return 1;
        }
        // Apply reward weights for this step (skip when all weights ≈ 1.0)
        const bool has_rewards = std::any_of(window_rewards.begin(), window_rewards.end(),
            [](float r){ return std::abs(r - 1.0f) > 1e-4f; });
        if (has_rewards) {
            llama_opt_set_reward_weights(window_rewards.data(), (int64_t)window_rewards.size());
        }
        // ── One optimizer step (full dataset = one mini-epoch) ────────────
        const int64_t idata_all = ggml_opt_dataset_ndata(step_dataset);
        ggml_opt_result_t step_result = ggml_opt_result_init();
        llama_opt_epoch(ctx, step_dataset, step_result, nullptr, idata_all,
            nullptr, // no progress bar callback — clean stdout
            nullptr,
            false); // no shuffle for single-step
        double loss = 0.0, loss_unc = 0.0;
        ggml_opt_result_loss(step_result, &loss, &loss_unc);
        last_loss = (float)loss;
        ggml_opt_result_free(step_result);
        ggml_opt_dataset_free(step_dataset);
        // Reset reward weights so they never leak into the next step.
        llama_opt_set_reward_weights(nullptr, 0);
        ++step;
        // ── Emit progress ─────────────────────────────────────────────────
        {
            char buf[128];
            snprintf(buf, sizeof(buf),
                "[QLORA:PROGRESS] step=%d/%d loss=%.4f epoch=1/1",
                step, n_steps, last_loss);
            ipc_emit(buf);
        }
        // ── Optional checkpoint ───────────────────────────────────────────
        if (params.save_every > 0 && step % params.save_every == 0) {
            std::string ckpt = params.lora_out + ".ckpt" + std::to_string(step) + ".gguf";
            save_adapter(lt, ckpt, arch, lora_alpha, base_model_path);
            char buf[512];
            snprintf(buf, sizeof(buf), "[QLORA:CHECKPOINT] %s", ckpt.c_str());
            ipc_emit(buf);
        }
    }
    // Save final adapter (also runs after STOP/EOF/SIGINT)
    save_adapter(lt, params.lora_out, arch, lora_alpha, base_model_path);
    {
        char buf[64];
        snprintf(buf, sizeof(buf), "[QLORA:DONE] final_loss=%.4f", last_loss);
        ipc_emit(buf);
    }
    return 0;
}
// ---------------------------------------------------------------------------
// ---------------------------------------------------------------------------
// Entry point. High-level flow (the numbered "Step" comments below mark the
// order-dependent phases — the ordering is load-bearing, see each step):
//   1. Read the architecture string from GGUF metadata (no model load).
//   2. Allocate fresh LoRA A/B tensors (or reuse a user-supplied --lora
//      adapter for resume) and register the adapter BEFORE context creation.
//   3. Load model + context with the adapter attached so the compute graph
//      is sized to include the LoRA nodes.
//   4. Flag the loaded adapter's live device tensors as trainable.
//   5. Either hand off to GRPO IPC mode or run file-based SFT epochs, then
//      save the trained adapter GGUF.
int main(int argc, char ** argv) {
    // "C" numeric locale so float printing/parsing is locale-independent
    // (GGUF metadata and log output must not use e.g. ',' decimal separators).
    std::setlocale(LC_NUMERIC, "C");
    common_params params;
    params.escape = false;
    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_FINETUNE_QLORA)) {
        return 1;
    }
    // --train-file is only optional in GRPO mode, where samples arrive from
    // Python over stdin/stdout instead of a JSONL file.
    if (!params.grpo_mode && params.train_file.empty()) {
        LOG_ERR("%s: --train-file is required (or use --grpo-mode for IPC training)\n", __func__);
        return 1;
    }
    // Force settings required for training
    // NOTE(review): mmap is disabled and the KV cache forced to F32 —
    // presumably because training needs writable, full-precision buffers;
    // confirm against the llama_opt backend requirements.
    params.use_mmap = false;
    params.cache_type_k = GGML_TYPE_F32;
    params.cache_type_v = GGML_TYPE_F32;
    // Warmup runs inference with PARAM-flagged tensors which causes a segfault;
    // training never benefits from warmup, so disable it unconditionally.
    params.warmup = false;
    // Flash attention has no backward implementation; force standard attention for training.
    params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_DISABLED;
    // Default alpha to the rank when unset, i.e. an effective LoRA scale
    // (alpha / rank) of 1.0.
    const float lora_alpha = (params.lora_alpha > 0.0f)
        ? params.lora_alpha : (float) params.lora_rank;
    // Comma-separated tensor-name substrings selecting which projections get
    // LoRA (llama.cpp internal GGUF names, see file header).
    const auto targets = split_csv(params.lora_targets);
    // --- Step 1: Discover tensor shapes from model GGUF (no model load yet) ---
    std::string arch;
    {
        // Metadata-only open: ctx_meta receives tensor descriptors, no data.
        struct ggml_context * ctx_meta = nullptr;
        struct gguf_init_params gp = { true, &ctx_meta };
        struct gguf_context * ctx_gguf = gguf_init_from_file(params.model.path.c_str(), gp);
        if (!ctx_gguf) { LOG_ERR("failed to open model GGUF\n"); return 1; }
        int kid = gguf_find_key(ctx_gguf, "general.architecture");
        if (kid >= 0) arch = gguf_get_val_str(ctx_gguf, kid);
        gguf_free(ctx_gguf);
        ggml_free(ctx_meta);
    }
    // --- Step 2: Allocate LoRA tensors and save initial adapter GGUF ---
    // If the user already supplied a --lora adapter we reuse it (resume training).
    // Otherwise we allocate fresh tensors (B=0, A=random), write them to a temp
    // .init.gguf so common_init_from_params can load them before context creation
    // (this makes sched_reserve size the graph to include LoRA nodes).
    const bool resume_from_lora = !params.lora_adapters.empty();
    // Fixed seed: LoRA A initialization is reproducible across runs.
    std::mt19937 rng(42);
    lora_tensors lt; // will be populated after context load (Step 4)
    std::string init_adapter_path;
    if (!resume_from_lora) {
        lt = alloc_lora_tensors(params.model.path, targets, params.lora_rank, rng, params.lora_freeze_layers);
        if (lt.ab.empty()) return 1;
        init_adapter_path = params.lora_out + ".init.gguf";
        save_adapter(lt, init_adapter_path, arch, lora_alpha, params.model.path);
        // Register adapter so common_init_from_params loads it before context creation
        common_adapter_lora_info adapter_info;
        adapter_info.path = init_adapter_path;
        adapter_info.scale = 1.0f;
        params.lora_adapters.push_back(adapter_info);
    } else {
        LOG_INF("%s: resuming training from existing LoRA adapter: %s\n",
                __func__, params.lora_adapters.back().path.c_str());
    }
    // --- Step 3: Load model + context (graph sized with LoRA nodes) ---
    common_init();
    llama_backend_init();
    llama_numa_init(params.numa);
    auto llama_init = common_init_from_params(params);
    auto * model = llama_init->model();
    auto * ctx = llama_init->context();
    if (!model) { LOG_ERR("failed to load model\n"); return 1; }
    LOG_INF("%s\n", common_params_get_system_info(params).c_str());
    // Arch fallback if not in GGUF metadata: take the first token of the
    // model description string (truncated at the first space or '/').
    if (arch.empty()) {
        char buf[256] = {};
        llama_model_desc(model, buf, sizeof(buf));
        arch = std::string(buf);
        arch = arch.substr(0, arch.find_first_of(" /"));
    }
    // --- Step 4: Mark the loaded adapter tensors as trainable ---
    // common_init_from_params loaded the adapter; params.lora_adapters[back].ptr
    // points to the live llama_adapter_lora with its own tensor copies in device
    // memory. Mark those tensors trainable so the optimizer graph includes them.
    {
        llama_adapter_lora * loaded = params.lora_adapters.back().ptr;
        if (!loaded) {
            LOG_ERR("%s: adapter was not loaded by common_init_from_params\n", __func__);
            return 1;
        }
        for (auto & kv : loaded->ab_map) {
            ggml_set_param(kv.second.a); // lora_a → trainable
            ggml_set_param(kv.second.b); // lora_b → trainable
        }
        // Point lt.ab at the live device tensors so save_adapter writes
        // the trained weights (not the original init tensors).
        lt.ab.clear();
        for (auto & kv : loaded->ab_map) {
            lt.ab[kv.first] = {kv.second.a, kv.second.b};
        }
    }
    // Remove temp init file when we created it (resume path has no init file)
    if (!resume_from_lora && !init_adapter_path.empty()) {
        std::remove(expand_tilde(init_adapter_path).c_str());
    }
    // --- Step 5: Load dataset ---
    // In GRPO mode the dataset comes from Python via stdin/stdout — skip file loading.
    auto tmpls = common_chat_templates_init(model, "");
    if (params.grpo_mode) {
        int rc = run_grpo_mode(params, model, ctx, lt, arch, lora_alpha, params.model.path);
        // lt.buf/lt.ctx are only non-null when we allocated fresh tensors
        // (non-resume path); the adapter's own copies are owned by llama_init.
        if (lt.buf) ggml_backend_buffer_free(lt.buf);
        if (lt.ctx) ggml_free(lt.ctx);
        llama_backend_free();
        return rc;
    }
    auto samples = load_jsonl(params.train_file, ctx, tmpls.get());
    if (samples.empty()) {
        LOG_ERR("%s: no training samples loaded\n", __func__);
        return 1;
    }
    const int32_t n_ctx = llama_n_ctx(ctx);
    // Per-window reward weights, filled by build_dataset; must stay alive for
    // the whole training run because llama_opt_set_reward_weights keeps a
    // raw pointer into this vector.
    std::vector<float> window_rewards;
    const llama_token bos = llama_vocab_bos(llama_model_get_vocab(model));
    auto dataset = build_dataset(samples, n_ctx, window_rewards, params.train_on_prompt, bos);
    if (!dataset) return 1;
    // Check if any reward deviates from 1.0 — if so, enable reward-weighted SFT
    const bool has_rewards = std::any_of(window_rewards.begin(), window_rewards.end(),
                                         [](float r){ return std::abs(r - 1.0f) > 1e-4f; });
    if (has_rewards) {
        LOG_INF("%s: reward-weighted SFT enabled (found non-uniform rewards in dataset)\n", __func__);
        llama_opt_set_reward_weights(window_rewards.data(), (int64_t)window_rewards.size());
    }
    // Initialize optimizer — our custom param filter restricts training to lora_a/b
    struct llama_opt_params lopt_params {
        /*.n_ctx_train =*/0,
        /*.param_filter =*/lora_param_filter,
        /*.param_filter_ud =*/nullptr,
        /*.get_opt_pars =*/common_opt_lr_pars,
        /*.get_opt_pars_ud =*/&params.lr,
        /*.optimizer_type =*/params.optimizer,
        /*.grad_checkpoint_interval =*/params.grad_checkpoint_interval,
    };
    llama_opt_init(ctx, model, lopt_params);
    // Train/validation split point: float multiply truncates toward zero, so
    // val_split=0 trains on the full dataset.
    const int64_t idata_split = ggml_opt_dataset_ndata(dataset) * (1.0f - params.val_split);
    ggml_opt_result_t result_train = ggml_opt_result_init();
    ggml_opt_result_t result_eval = ggml_opt_result_init();
    const int32_t n_ubatch = llama_n_ubatch(ctx);
    // Micro-batches per context window; guarded against n_ubatch == 0.
    const int32_t ubatch_per_ctx = (n_ubatch > 0) ? (n_ctx / n_ubatch) : 1;
    // Checkpoint bookkeeping shared with the save callback via the global
    // g_save_ctx pointer; sctx must outlive the epoch loop below.
    save_ctx sctx { &lt, &params.lora_out, &arch, &params.model.path, lora_alpha, params.save_every, ubatch_per_ctx, 0 };
    g_save_ctx = &sctx;
    const int64_t total_windows = ggml_opt_dataset_ndata(dataset);
    LOG_INF("%s: starting QLoRA training — rank=%d alpha=%.1f epochs=%d loss=%s\n",
            __func__, params.lora_rank, lora_alpha, params.lr.epochs,
            params.train_on_prompt ? "prompt+response" : "response-only");
    // NOTE(review): stride is logged as n_ctx/2 — assumes build_dataset uses a
    // half-context sliding window; confirm against build_dataset.
    LOG_INF("%s: dataset: %ld windows × %d ubatches = %ld steps per epoch (n_ctx=%d n_ubatch=%d stride=%d)\n",
            __func__, (long)total_windows, ubatch_per_ctx, (long)(idata_split * ubatch_per_ctx),
            n_ctx, n_ubatch, n_ctx / 2);
    if (params.save_every > 0) {
        LOG_INF("%s: will save checkpoint every %d windows → %s.ckptN.gguf\n",
                __func__, params.save_every, params.lora_out.c_str());
    }
    // With --save-every the training callback also writes checkpoints;
    // otherwise just show the stock progress bar.
    ggml_opt_epoch_callback cb_train = (params.save_every > 0)
        ? save_every_callback
        : ggml_opt_epoch_callback_progress_bar;
    for (params.lr.epoch = 0; params.lr.epoch < params.lr.epochs; ++params.lr.epoch) {
        sctx.last_saved = 0; // reset per-epoch window counter
        llama_opt_epoch(ctx, dataset, result_train, result_eval, idata_split,
                        cb_train,
                        ggml_opt_epoch_callback_progress_bar,
                        params.shuffle_dataset);
        // Newline after the progress bar so the summary starts on a fresh line.
        fprintf(stderr, "\n");
        // Per-epoch loss summary
        {
            double train_loss = 0.0, train_unc = 0.0;
            ggml_opt_result_loss(result_train, &train_loss, &train_unc);
            // Only report validation loss when a validation split exists.
            if (idata_split < ggml_opt_dataset_ndata(dataset)) {
                double val_loss = 0.0, val_unc = 0.0;
                ggml_opt_result_loss(result_eval, &val_loss, &val_unc);
                LOG_INF("epoch %d/%d: train_loss=%.4f ± %.4f val_loss=%.4f ± %.4f\n",
                        params.lr.epoch + 1, params.lr.epochs, train_loss, train_unc, val_loss, val_unc);
            } else {
                LOG_INF("epoch %d/%d: train_loss=%.4f ± %.4f\n",
                        params.lr.epoch + 1, params.lr.epochs, train_loss, train_unc);
            }
        }
        // Reset accumulators so each epoch's summary is independent.
        ggml_opt_result_reset(result_train);
        ggml_opt_result_reset(result_eval);
    }
    ggml_opt_result_free(result_train);
    ggml_opt_result_free(result_eval);
    // Clear the reward-weight pointer before window_rewards goes out of scope.
    llama_opt_set_reward_weights(nullptr, 0);
    // Save final trained adapter
    save_adapter(lt, params.lora_out, arch, lora_alpha, params.model.path);
    // Free scratch buffers only when we allocated them (not in resume path)
    if (lt.buf) ggml_backend_buffer_free(lt.buf);
    if (lt.ctx) ggml_free(lt.ctx);
    ggml_opt_dataset_free(dataset);
    llama_backend_free();
    return 0;
}