server : fix crash when seq_rm fails for hybrid/recurrent models (#18391)

* server : fix crash when seq_rm fails for hybrid/recurrent models

  Hybrid/recurrent models cannot truncate only part of a sequence, so
  llama_memory_seq_rm can fail; the fallback path then cleared the slot
  while it was still processing, tripping a GGML_ASSERT.

* server : add allow_processing param to clear_slot
Author: o7si
Date: 2025-12-26 23:35:29 +08:00 (committed by GitHub)
parent af3be131c0
commit 4893cc07bb
1 changed file with 5 additions and 3 deletions


@@ -1007,8 +1007,10 @@ private:
         return ret;
     }
-    void clear_slot(server_slot & slot) const {
-        GGML_ASSERT(!slot.is_processing());
+    void clear_slot(server_slot & slot, bool allow_processing = false) const {
+        if (!allow_processing) {
+            GGML_ASSERT(!slot.is_processing());
+        }
         SLT_WRN(slot, "clearing slot with %zu tokens\n", slot.prompt.tokens.size());
@@ -2336,7 +2338,7 @@ private:
         if (!llama_memory_seq_rm(llama_get_memory(ctx), slot.id, p0, -1)) {
             SLT_WRN(slot, "failed to truncate tokens with position >= %d - clearing the memory\n", p0);
-            clear_slot(slot);
+            clear_slot(slot, /*allow_processing=*/true);
             // there is no common part left
             slot.n_prompt_tokens_cache = 0;
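
For context, a minimal, self-contained sketch of the failure mode and the guard this commit adds. It is not the server's real code: Slot, seq_rm_partial, and the token bookkeeping are simplified stand-ins; only clear_slot's signature and assertion guard mirror the diff above.

// Sketch of the pattern in this commit. Before the fix, clearing a slot
// that was still processing tripped the assertion; the allow_processing
// escape hatch lets the recovery path wipe the slot mid-request instead
// of crashing.
#include <cassert>
#include <cstdio>
#include <vector>

struct Slot {
    int id = 0;
    std::vector<int> tokens;   // stand-in for slot.prompt.tokens
    bool processing = false;

    bool is_processing() const { return processing; }
};

// Default keeps the old invariant: only idle slots may be cleared.
// Error-recovery callers opt out explicitly.
void clear_slot(Slot & slot, bool allow_processing = false) {
    if (!allow_processing) {
        assert(!slot.is_processing());
    }
    std::printf("clearing slot %d with %zu tokens\n", slot.id, slot.tokens.size());
    slot.tokens.clear();
}

// Stand-in for llama_memory_seq_rm: hybrid/recurrent memory cannot drop
// only a suffix of a sequence (p0 > 0), so partial removal reports failure.
bool seq_rm_partial(int /*seq_id*/, int p0) {
    return p0 <= 0;
}

int main() {
    Slot slot;
    slot.tokens = {1, 2, 3, 4};
    slot.processing = true;    // a request is mid-flight

    const int p0 = 2;          // try to keep the first two cached tokens
    if (!seq_rm_partial(slot.id, p0)) {
        // Partial truncation failed: fall back to clearing everything.
        // allow_processing=true is what prevents the old assertion crash.
        clear_slot(slot, /*allow_processing=*/true);
    }
    return 0;
}

Defaulting allow_processing to false preserves the original invariant for every existing caller; only the explicit recovery call site relaxes it.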