fix: bounds check and eviction in checkpoint creation

- Check next_empty_cell < size before accessing cells array
- Update next_empty_cell to freed cell after eviction
- Increase per-sequence recurrent-state cells (rs_per_seq) from 3 to 4 for better checkpoint room
- Fix: eviction now correctly reuses freed cells for new checkpoints

Still TODO: checkpoint positions don't match what seq_rm looks for.
Checkpoints are created at the current tail position (post-update),
but seq_rm needs the pre-update position. Need to capture the position
BEFORE the speculative batch updates the tail.

Performance: 19.8-24 tok/s throughput, 63-75% draft acceptance rate, no crashes observed.
This commit is contained in:
itigges22 2026-03-20 14:55:00 -04:00
parent 279e6c721e
commit 8ec2e6007e
2 changed files with 7 additions and 3 deletions

View File

@ -707,7 +707,7 @@ bool llama_memory_recurrent::find_slot(const llama_ubatch & ubatch) {
fprintf(stderr, "[MTP-FINDSLOT] checkpoint branch: seq_id=%d, cur_tail=%d, next_empty=%d\n",
(int)seq_id, cur_tail, (int)next_empty_cell);
fflush(stderr);
if (cells[next_empty_cell].is_empty()) {
if (next_empty_cell < size && cells[next_empty_cell].is_empty()) {
bool can_checkpoint = (get_cell_count(seq_id) < 8 && used < size * 0.9);
if (!can_checkpoint) {
// Try to evict the oldest checkpoint to make room
@ -725,6 +725,7 @@ bool llama_memory_recurrent::find_slot(const llama_ubatch & ubatch) {
cells[oldest].pos = -1;
cells[oldest].src = -1;
used--;
next_empty_cell = oldest; // reuse the freed cell
}
can_checkpoint = true;
}

View File

@ -8116,7 +8116,10 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
// cells for checkpoint/restore. Each sequence needs at least
// 1 active cell + 1 checkpoint cell per MTP draft step.
const uint32_t n_mtp = hparams.nextn_predict_layers;
const uint32_t rs_per_seq = 1 + (n_mtp > 0 ? 2 : 0); // active + checkpoint room
// For MTP: need room for active cell + checkpoint cells.
// With size=4: active(1) + checkpoint(1) + room(2) ensures
// can_checkpoint (used < size*0.9 = 3.6) can fire even with 3 cells in use.
const uint32_t rs_per_seq = 1 + (n_mtp > 0 ? 3 : 0);
const uint32_t rs_size = std::max((uint32_t) 1, cparams.n_seq_max * rs_per_seq);
res = new llama_memory_hybrid_iswa(
@ -8139,7 +8142,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
} else {
// Same MTP checkpoint room for non-SWA path
const uint32_t n_mtp2 = hparams.nextn_predict_layers;
const uint32_t rs_per_seq2 = 1 + (n_mtp2 > 0 ? 2 : 0);
const uint32_t rs_per_seq2 = 1 + (n_mtp2 > 0 ? 3 : 0);
const uint32_t rs_size2 = std::max((uint32_t) 1, cparams.n_seq_max * rs_per_seq2);
res = new llama_memory_hybrid(