From 9a04ac4e10768e1786eb0c972da52cdf8168ffcd Mon Sep 17 00:00:00 2001 From: eauchs Date: Thu, 5 Mar 2026 11:02:13 +0100 Subject: [PATCH] fix: replace soft rollback with proper failure in recurrent seq_rm The soft rollback path (cells[tail_id].pos = p0 - 1) only updated position metadata, leaving SSM tensor state (r_l/s_l) reflecting the post-speculative position. This caused silent state corruption and looping on speculative decoding rejection for recurrent/hybrid models (e.g. Qwen3.5 MoE 27B). seq_rm now returns false when no checkpoint exists at p0-1, correctly signaling to the caller that rollback requires re-evaluation. The hybrid memory layer already propagates false correctly. Also add a LLAMA_LOG_DEBUG when the 0.9 cache threshold prevents checkpoint creation, making the behavior visible rather than silent. Co-Authored-By: Claude Sonnet 4.6 --- src/llama-memory-recurrent.cpp | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/src/llama-memory-recurrent.cpp b/src/llama-memory-recurrent.cpp index 4286d6ca2a..a1217a1fc2 100644 --- a/src/llama-memory-recurrent.cpp +++ b/src/llama-memory-recurrent.cpp @@ -175,9 +175,9 @@ bool llama_memory_recurrent::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos if (best_cell >= 0) { tail_id = best_cell; } else { - // if no checkpoint found, we still move the position back (soft rollback) - // only if it's the current sequence's tail - cells[tail_id].pos = p0 - 1; + // no checkpoint found at p0-1: the SSM tensor state cannot be rolled back + // without re-evaluating the sequence. Signal failure to the caller. + return false; } } // invalidate tails which will be cleared @@ -648,10 +648,17 @@ bool llama_memory_recurrent::find_slot(const llama_ubatch & ubatch) { // Copy state data copy_cell(seq_meta.tail, next_empty_cell); - // Keep history of previous states for rollback (up to 8 cells per sequence) + // Keep history of previous states for rollback (up to 8 cells per sequence). 
+ // The 0.9 threshold prevents the checkpoint history from filling the cache. + // When the cache is too full to keep checkpoints, speculative decoding rollback + // will fail (seq_rm returns false) and the caller must re-evaluate. if (get_cell_count(seq_id) < 8 && used < size * 0.9) { // Do not erase seq_id from orig_cell to keep it as a checkpoint } else { + if (used >= size * 0.9) { + LLAMA_LOG_DEBUG("%s: cache too full (used=%u/%u) to keep checkpoint for seq %d; speculative rollback will require re-evaluation\n", + __func__, used, size, seq_id); + } // Erase oldest history point for this sequence int32_t oldest_cell = -1; llama_pos min_pos = std::numeric_limits<llama_pos>::max();