From 4893cc07bba09525d6a1720d0686ed09b5a9b1c8 Mon Sep 17 00:00:00 2001
From: o7si <32285332+o7si@users.noreply.github.com>
Date: Fri, 26 Dec 2025 23:35:29 +0800
Subject: [PATCH] server : fix crash when seq_rm fails for hybrid/recurrent
 models (#18391)

* server : fix crash when seq_rm fails for hybrid/recurrent models

* server : add allow_processing param to clear_slot
---
 tools/server/server-context.cpp | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp
index 94825dc862..1abbf6d6d9 100644
--- a/tools/server/server-context.cpp
+++ b/tools/server/server-context.cpp
@@ -1007,8 +1007,10 @@ private:
         return ret;
     }
 
-    void clear_slot(server_slot & slot) const {
-        GGML_ASSERT(!slot.is_processing());
+    void clear_slot(server_slot & slot, bool allow_processing = false) const {
+        if (!allow_processing) {
+            GGML_ASSERT(!slot.is_processing());
+        }
 
         SLT_WRN(slot, "clearing slot with %zu tokens\n", slot.prompt.tokens.size());
 
@@ -2336,7 +2338,7 @@ private:
                 if (!llama_memory_seq_rm(llama_get_memory(ctx), slot.id, p0, -1)) {
                     SLT_WRN(slot, "failed to truncate tokens with position >= %d - clearing the memory\n", p0);
 
-                    clear_slot(slot);
+                    clear_slot(slot, /*allow_processing=*/true);
 
                     // there is no common part left
                     slot.n_prompt_tokens_cache = 0;