From 8ec2e6007e5d6f0f9df37795b15b013d869ab0e4 Mon Sep 17 00:00:00 2001 From: itigges22 Date: Fri, 20 Mar 2026 14:55:00 -0400 Subject: [PATCH] fix: bounds check and eviction in checkpoint creation - Check next_empty_cell < size before accessing cells array - Update next_empty_cell to freed cell after eviction - Increase rs_per_seq from 3 to 4 when MTP is enabled (rs_size = n_seq_max * rs_per_seq) for better checkpoint room - Fix: eviction now correctly reuses freed cells for new checkpoints Still TODO: checkpoint positions don't match what seq_rm looks for. Checkpoints are created at the current tail position (post-update), but seq_rm needs the pre-update position. Need to capture the position BEFORE the speculative batch updates the tail. Performance: 19.8-24 tok/s, 63-75% acceptance, no crashes. --- src/llama-memory-recurrent.cpp | 3 ++- src/llama-model.cpp | 7 +++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/src/llama-memory-recurrent.cpp b/src/llama-memory-recurrent.cpp index aef2b0bdf4..fdcd54a292 100644 --- a/src/llama-memory-recurrent.cpp +++ b/src/llama-memory-recurrent.cpp @@ -707,7 +707,7 @@ bool llama_memory_recurrent::find_slot(const llama_ubatch & ubatch) { fprintf(stderr, "[MTP-FINDSLOT] checkpoint branch: seq_id=%d, cur_tail=%d, next_empty=%d\n", (int)seq_id, cur_tail, (int)next_empty_cell); fflush(stderr); - if (cells[next_empty_cell].is_empty()) { + if (next_empty_cell < size && cells[next_empty_cell].is_empty()) { bool can_checkpoint = (get_cell_count(seq_id) < 8 && used < size * 0.9); if (!can_checkpoint) { // Try to evict the oldest checkpoint to make room @@ -725,6 +725,7 @@ bool llama_memory_recurrent::find_slot(const llama_ubatch & ubatch) { cells[oldest].pos = -1; cells[oldest].src = -1; used--; + next_empty_cell = oldest; // reuse the freed cell } can_checkpoint = true; } diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 027e72cc17..5e1b512318 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -8116,7 +8116,10 @@ llama_memory_i * 
llama_model::create_memory(const llama_memory_params & params, // cells for checkpoint/restore. Each sequence needs at least // 1 active cell + 1 checkpoint cell per MTP draft step. const uint32_t n_mtp = hparams.nextn_predict_layers; - const uint32_t rs_per_seq = 1 + (n_mtp > 0 ? 2 : 0); // active + checkpoint room + // For MTP: need room for active cell + checkpoint cells. + // With size=4: active(1) + checkpoint(1) + room(2) ensures + // can_checkpoint (used < size*0.9 = 3.6) can fire even with 3 cells in use. + const uint32_t rs_per_seq = 1 + (n_mtp > 0 ? 3 : 0); const uint32_t rs_size = std::max((uint32_t) 1, cparams.n_seq_max * rs_per_seq); res = new llama_memory_hybrid_iswa( @@ -8139,7 +8142,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, } else { // Same MTP checkpoint room for non-SWA path const uint32_t n_mtp2 = hparams.nextn_predict_layers; - const uint32_t rs_per_seq2 = 1 + (n_mtp2 > 0 ? 2 : 0); + const uint32_t rs_per_seq2 = 1 + (n_mtp2 > 0 ? 3 : 0); const uint32_t rs_size2 = std::max((uint32_t) 1, cparams.n_seq_max * rs_per_seq2); res = new llama_memory_hybrid(