From 8ec2e6007e5d6f0f9df37795b15b013d869ab0e4 Mon Sep 17 00:00:00 2001
From: itigges22 <jitigges@vt.edu>
Date: Fri, 20 Mar 2026 14:55:00 -0400
Subject: [PATCH] fix: bounds check and eviction in checkpoint creation

- Check next_empty_cell < size before accessing cells array
- Update next_empty_cell to freed cell after eviction
- Increase rs_per_seq from 3 to 4 (rs_size = n_seq_max * rs_per_seq) for more checkpoint room
- Fix: eviction now correctly reuses freed cells for new checkpoints

Still TODO: checkpoint positions don't match what seq_rm looks for.
Checkpoints are created at the current tail position (post-update),
but seq_rm needs the pre-update position. Need to capture the position
BEFORE the speculative batch updates the tail.

Performance: 19.8-24 tok/s, 63-75% acceptance, no crashes.
---
 src/llama-memory-recurrent.cpp | 3 ++-
 src/llama-model.cpp            | 7 +++++--
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/src/llama-memory-recurrent.cpp b/src/llama-memory-recurrent.cpp
index aef2b0bdf4..fdcd54a292 100644
--- a/src/llama-memory-recurrent.cpp
+++ b/src/llama-memory-recurrent.cpp
@@ -707,7 +707,7 @@ bool llama_memory_recurrent::find_slot(const llama_ubatch & ubatch) {
             fprintf(stderr, "[MTP-FINDSLOT] checkpoint branch: seq_id=%d, cur_tail=%d, next_empty=%d\n",
                     (int)seq_id, cur_tail, (int)next_empty_cell);
             fflush(stderr);
-            if (cells[next_empty_cell].is_empty()) {
+            if (next_empty_cell < size && cells[next_empty_cell].is_empty()) {
                 bool can_checkpoint = (get_cell_count(seq_id) < 8 && used < size * 0.9);
                 if (!can_checkpoint) {
                     // Try to evict the oldest checkpoint to make room
@@ -725,6 +725,7 @@ bool llama_memory_recurrent::find_slot(const llama_ubatch & ubatch) {
                             cells[oldest].pos = -1;
                             cells[oldest].src = -1;
                             used--;
+                            next_empty_cell = oldest; // reuse the freed cell
                         }
                         can_checkpoint = true;
                     }
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 027e72cc17..5e1b512318 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -8116,7 +8116,10 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
                         // cells for checkpoint/restore. Each sequence needs at least
                         // 1 active cell + 1 checkpoint cell per MTP draft step.
                         const uint32_t n_mtp = hparams.nextn_predict_layers;
-                        const uint32_t rs_per_seq = 1 + (n_mtp > 0 ? 2 : 0); // active + checkpoint room
+                        // For MTP: need room for active cell + checkpoint cells.
+                        // With rs_per_seq=4 (size=4 for a single sequence): active(1) + checkpoint(1) + room(2),
+                        // so can_checkpoint (used < size*0.9 = 3.6) can still fire with 3 cells in use.
+                        const uint32_t rs_per_seq = 1 + (n_mtp > 0 ? 3 : 0);
                         const uint32_t rs_size = std::max((uint32_t) 1, cparams.n_seq_max * rs_per_seq);
 
                         res = new llama_memory_hybrid_iswa(
@@ -8139,7 +8142,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
                     } else {
                         // Same MTP checkpoint room for non-SWA path
                         const uint32_t n_mtp2 = hparams.nextn_predict_layers;
-                        const uint32_t rs_per_seq2 = 1 + (n_mtp2 > 0 ? 2 : 0);
+                        const uint32_t rs_per_seq2 = 1 + (n_mtp2 > 0 ? 3 : 0);
                         const uint32_t rs_size2 = std::max((uint32_t) 1, cparams.n_seq_max * rs_per_seq2);
 
                         res = new llama_memory_hybrid(