server: fix infinite retry loop when KV cache is full

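Fixes #20049. When KV cache allocation fails, the server would retry indefinitely if idle slots were cleared but space was still insufficient due to fragmentation. This adds a retry counter: after 10 consecutive failed attempts (or immediately when no idle slot could be cleared and the batch size is still above 1), the batch size is halved, down to a minimum of 1, and the counter is reset. This is intended to prevent infinite loops while maintaining graceful degradation under memory pressure.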
2026-03-02 17:01:20 -06:00 · 2026-03-02 17:01:20 -06:00 · 2104757172
parent 4d828bd1ab
commit 2104757172
2 changed files with 13 additions and 6 deletions
--- a/src/llama-memory-hybrid.cpp
+++ b/src/llama-memory-hybrid.cpp
@ -92,15 +92,14 @@ llama_memory_context_ptr llama_memory_hybrid::init_batch(llama_batch_allocr & ba

        // prepare the recurrent batches first
        if (!mem_recr->prepare(ubatches)) {
-            // TODO: will the recurrent cache be in an undefined context at this point?
-            LLAMA_LOG_ERROR("%s: failed to prepare recurrent ubatches\n", __func__);
+            LLAMA_LOG_DEBUG("%s: failed to prepare recurrent ubatches\n", __func__);
            return std::make_unique<llama_memory_hybrid_context>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
        }

        // prepare the attention cache
        auto heads_attn = mem_attn->prepare(ubatches);
        if (heads_attn.empty()) {
-            LLAMA_LOG_ERROR("%s: failed to prepare attention ubatches\n", __func__);
+            LLAMA_LOG_DEBUG("%s: failed to prepare attention ubatches\n", __func__);
            return std::make_unique<llama_memory_hybrid_context>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
        }

--- a/tools/server/server-context.cpp
+++ b/tools/server/server-context.cpp
@ -2608,6 +2608,8 @@ private:
        }

        int32_t i_next = 0;
+        int32_t n_retries = 0;
+        const int32_t max_retries = 10;

        // process the created batch of tokens
        for (int32_t i = 0; i < batch.n_tokens; i = i_next) {
@ -2667,14 +2669,20 @@ private:
                }

                // retry with half the batch size to try to find a free slot in the KV cache
-                if (!try_clear_idle_slots()) {
-                    n_batch /= 2;
+                bool cleared_slot = try_clear_idle_slots();
+                n_retries++;
+                if (n_retries >= max_retries || (!cleared_slot && n_batch > 1)) {
+                    n_batch = std::max(1, n_batch / 2);
+                    n_retries = 0;
+                    SRV_WRN("reducing batch size to %d after %d retries (cleared_slot = %d)\n", n_batch, n_retries, cleared_slot);
                }

-                SRV_WRN("failed to find free space in the KV cache, retrying with smaller batch size, i = %d, n_batch = %d, ret = %d\n", i, n_batch, ret);
+                SRV_WRN("failed to find free space in the KV cache, retrying (attempt %d/%d), i = %d, n_batch = %d, ret = %d\n", 
+                        n_retries, max_retries, i, n_batch, ret);

                continue; // continue loop of n_batch
            }
+            n_retries = 0;

            // move the head of the batch forward with the number of tokens we just processed
            i_next = i + n_tokens;