From 21047571722de7973701c385b921f6b700423500 Mon Sep 17 00:00:00 2001
From: Samaresh Kumar Singh
Date: Mon, 2 Mar 2026 17:01:20 -0600
Subject: [PATCH 1/2] server: fix infinite retry loop when KV cache is full

Fixes #20049

When KV cache allocation fails, the server would retry indefinitely if
idle slots were cleared but space was still insufficient due to
fragmentation. This adds a retry counter (max 10 attempts) and
adaptively reduces batch size to prevent infinite loops while
maintaining graceful degradation under memory pressure.
---
 src/llama-memory-hybrid.cpp     |  5 ++---
 tools/server/server-context.cpp | 14 +++++++++++---
 2 files changed, 13 insertions(+), 6 deletions(-)

diff --git a/src/llama-memory-hybrid.cpp b/src/llama-memory-hybrid.cpp
index a1b45e4a3c..f821d8b000 100644
--- a/src/llama-memory-hybrid.cpp
+++ b/src/llama-memory-hybrid.cpp
@@ -92,15 +92,14 @@ llama_memory_context_ptr llama_memory_hybrid::init_batch(llama_batch_allocr & ba
 
     // prepare the recurrent batches first
     if (!mem_recr->prepare(ubatches)) {
-        // TODO: will the recurrent cache be in an undefined context at this point?
-        LLAMA_LOG_ERROR("%s: failed to prepare recurrent ubatches\n", __func__);
+        LLAMA_LOG_DEBUG("%s: failed to prepare recurrent ubatches\n", __func__);
         return std::make_unique<llama_memory_hybrid_context>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
     }
 
     // prepare the attention cache
     auto heads_attn = mem_attn->prepare(ubatches);
     if (heads_attn.empty()) {
-        LLAMA_LOG_ERROR("%s: failed to prepare attention ubatches\n", __func__);
+        LLAMA_LOG_DEBUG("%s: failed to prepare attention ubatches\n", __func__);
         return std::make_unique<llama_memory_hybrid_context>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
     }
 
diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp
index aafed49502..0f43c1b77d 100644
--- a/tools/server/server-context.cpp
+++ b/tools/server/server-context.cpp
@@ -2608,6 +2608,8 @@ private:
         }
 
         int32_t i_next = 0;
+        int32_t n_retries = 0;
+        const int32_t max_retries = 10;
 
         // process the created batch of tokens
         for (int32_t i = 0; i < batch.n_tokens; i = i_next) {
@@ -2667,14 +2669,20 @@ private:
                 }
 
                 // retry with half the batch size to try to find a free slot in the KV cache
-                if (!try_clear_idle_slots()) {
-                    n_batch /= 2;
+                bool cleared_slot = try_clear_idle_slots();
+                n_retries++;
+                if (n_retries >= max_retries || (!cleared_slot && n_batch > 1)) {
+                    n_batch = std::max(1, n_batch / 2);
+                    SRV_WRN("reducing batch size to %d after %d retries (cleared_slot = %d)\n", n_batch, n_retries, cleared_slot);
+                    n_retries = 0;
                 }
 
-                SRV_WRN("failed to find free space in the KV cache, retrying with smaller batch size, i = %d, n_batch = %d, ret = %d\n", i, n_batch, ret);
+                SRV_WRN("failed to find free space in the KV cache, retrying (attempt %d/%d), i = %d, n_batch = %d, ret = %d\n", 
+                    n_retries, max_retries, i, n_batch, ret);
 
                 continue; // continue loop of n_batch
             }
 
+            n_retries = 0;
             // move the head of the batch forward with the number of tokens we just processed
             i_next = i + n_tokens;

From de30196b51543d9e2e156249179441c00c0a0bb0 Mon Sep 17 00:00:00 2001
From: Samaresh Kumar Singh
Date: Mon, 2 Mar 2026 17:38:17 -0600
Subject: [PATCH 2/2] fix trailing whitespace

---
 tools/server/server-context.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp
index 0f43c1b77d..7f587da1e5 100644
--- a/tools/server/server-context.cpp
+++ b/tools/server/server-context.cpp
@@ -2677,7 +2677,7 @@ private:
                     n_retries = 0;
                 }
 
-                SRV_WRN("failed to find free space in the KV cache, retrying (attempt %d/%d), i = %d, n_batch = %d, ret = %d\n", 
+                SRV_WRN("failed to find free space in the KV cache, retrying (attempt %d/%d), i = %d, n_batch = %d, ret = %d\n",
                     n_retries, max_retries, i, n_batch, ret);
 
                 continue; // continue loop of n_batch