fix: replace soft rollback with proper failure in recurrent seq_rm
The soft rollback path (`cells[tail_id].pos = p0 - 1`) updated only the position metadata, leaving the SSM tensor state (r_l/s_l) still reflecting the post-speculative position. This caused silent state corruption and looping when a speculative-decoding batch was rejected on recurrent/hybrid models (e.g. Qwen3.5 MoE 27B). seq_rm now returns false when no checkpoint exists at p0-1, correctly signaling to the caller that rollback requires re-evaluating the sequence; the hybrid memory layer already propagates the false return correctly. Also add a LLAMA_LOG_DEBUG message when the 0.9 cache-occupancy threshold prevents checkpoint creation, so that behavior is visible rather than silent. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
04e2fb15f3
commit
9a04ac4e10
|
|
@ -175,9 +175,9 @@ bool llama_memory_recurrent::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos
|
||||||
if (best_cell >= 0) {
|
if (best_cell >= 0) {
|
||||||
tail_id = best_cell;
|
tail_id = best_cell;
|
||||||
} else {
|
} else {
|
||||||
// if no checkpoint found, we still move the position back (soft rollback)
|
// no checkpoint found at p0-1: the SSM tensor state cannot be rolled back
|
||||||
// only if it's the current sequence's tail
|
// without re-evaluating the sequence. Signal failure to the caller.
|
||||||
cells[tail_id].pos = p0 - 1;
|
return false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// invalidate tails which will be cleared
|
// invalidate tails which will be cleared
|
||||||
|
|
@ -648,10 +648,17 @@ bool llama_memory_recurrent::find_slot(const llama_ubatch & ubatch) {
|
||||||
// Copy state data
|
// Copy state data
|
||||||
copy_cell(seq_meta.tail, next_empty_cell);
|
copy_cell(seq_meta.tail, next_empty_cell);
|
||||||
|
|
||||||
// Keep history of previous states for rollback (up to 8 cells per sequence)
|
// Keep history of previous states for rollback (up to 8 cells per sequence).
|
||||||
|
// The 0.9 threshold prevents the checkpoint history from filling the cache.
|
||||||
|
// When the cache is too full to keep checkpoints, speculative decoding rollback
|
||||||
|
// will fail (seq_rm returns false) and the caller must re-evaluate.
|
||||||
if (get_cell_count(seq_id) < 8 && used < size * 0.9) {
|
if (get_cell_count(seq_id) < 8 && used < size * 0.9) {
|
||||||
// Do not erase seq_id from orig_cell to keep it as a checkpoint
|
// Do not erase seq_id from orig_cell to keep it as a checkpoint
|
||||||
} else {
|
} else {
|
||||||
|
if (used >= size * 0.9) {
|
||||||
|
LLAMA_LOG_DEBUG("%s: cache too full (used=%u/%u) to keep checkpoint for seq %d; speculative rollback will require re-evaluation\n",
|
||||||
|
__func__, used, size, seq_id);
|
||||||
|
}
|
||||||
// Erase oldest history point for this sequence
|
// Erase oldest history point for this sequence
|
||||||
int32_t oldest_cell = -1;
|
int32_t oldest_cell = -1;
|
||||||
llama_pos min_pos = std::numeric_limits<llama_pos>::max();
|
llama_pos min_pos = std::numeric_limits<llama_pos>::max();
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue