fix: bounds check and eviction in checkpoint creation

- Check next_empty_cell < size before accessing cells array
- Update next_empty_cell to freed cell after eviction
- Increase per-sequence recurrent-state cells (rs_per_seq) from 3 to 4 for better checkpoint room
- Fix: eviction now correctly reuses freed cells for new checkpoints

Still TODO: checkpoint positions don't match what seq_rm looks for.
Checkpoints are created at the current tail position (post-update),
but seq_rm needs the pre-update position. Need to capture the position
BEFORE the speculative batch updates the tail.

Performance: 19.8-24 tok/s throughput, 63-75% draft acceptance rate, no crashes observed.
This commit is contained in:
itigges22 2026-03-20 14:55:00 -04:00
parent 279e6c721e
commit 8ec2e6007e
2 changed files with 7 additions and 3 deletions

View File

@ -707,7 +707,7 @@ bool llama_memory_recurrent::find_slot(const llama_ubatch & ubatch) {
fprintf(stderr, "[MTP-FINDSLOT] checkpoint branch: seq_id=%d, cur_tail=%d, next_empty=%d\n",
(int)seq_id, cur_tail, (int)next_empty_cell);
fflush(stderr);
if (cells[next_empty_cell].is_empty()) {
if (next_empty_cell < size && cells[next_empty_cell].is_empty()) {
bool can_checkpoint = (get_cell_count(seq_id) < 8 && used < size * 0.9);
if (!can_checkpoint) {
// Try to evict the oldest checkpoint to make room
@ -725,6 +725,7 @@ bool llama_memory_recurrent::find_slot(const llama_ubatch & ubatch) {
cells[oldest].pos = -1;
cells[oldest].src = -1;
used--;
next_empty_cell = oldest; // reuse the freed cell
}
can_checkpoint = true;
}

View File

@ -8116,7 +8116,10 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
// cells for checkpoint/restore. Each sequence needs at least
// 1 active cell + 1 checkpoint cell per MTP draft step.
const uint32_t n_mtp = hparams.nextn_predict_layers;
const uint32_t rs_per_seq = 1 + (n_mtp > 0 ? 2 : 0); // active + checkpoint room
// For MTP: need room for active cell + checkpoint cells.
// With size=4: active(1) + checkpoint(1) + room(2) ensures
// can_checkpoint (used < size*0.9 = 3.6) can fire even with 3 cells in use.
const uint32_t rs_per_seq = 1 + (n_mtp > 0 ? 3 : 0);
const uint32_t rs_size = std::max((uint32_t) 1, cparams.n_seq_max * rs_per_seq);
res = new llama_memory_hybrid_iswa(
@ -8139,7 +8142,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
} else {
// Same MTP checkpoint room for non-SWA path
const uint32_t n_mtp2 = hparams.nextn_predict_layers;
const uint32_t rs_per_seq2 = 1 + (n_mtp2 > 0 ? 2 : 0);
const uint32_t rs_per_seq2 = 1 + (n_mtp2 > 0 ? 3 : 0);
const uint32_t rs_size2 = std::max((uint32_t) 1, cparams.n_seq_max * rs_per_seq2);
res = new llama_memory_hybrid(