diff --git a/common/arg.cpp b/common/arg.cpp
index 649216b7f0..2e0f46db51 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -1311,6 +1311,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.kv_unified = value;
}
).set_env("LLAMA_ARG_KV_UNIFIED").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_BATCHED, LLAMA_EXAMPLE_BENCH, LLAMA_EXAMPLE_PARALLEL}));
+ add_opt(common_arg(
+ {"--clear-idle"},
+ {"--no-clear-idle"},
+ "save and clear idle slots on new task (default: enabled, requires unified KV and cache-ram)",
+ [](common_params & params, bool value) {
+ params.clear_idle = value;
+ }
+ ).set_env("LLAMA_ARG_CLEAR_IDLE").set_examples({LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg(
{"--context-shift"},
{"--no-context-shift"},
diff --git a/common/common.h b/common/common.h
index 31a337daa6..020b6a721f 100644
--- a/common/common.h
+++ b/common/common.h
@@ -579,8 +579,9 @@ struct common_params {
int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting
bool cache_prompt = true; // whether to enable prompt caching
- int32_t n_ctx_checkpoints = 32; // max number of context checkpoints per slot
- int32_t checkpoint_every_nt = 8192; // make a checkpoint every n tokens during prefill
+ bool clear_idle = true; // save and clear idle slots upon starting a new task
+ int32_t n_ctx_checkpoints = 32; // max number of context checkpoints per slot
+ int32_t checkpoint_every_nt = 8192; // make a checkpoint every n tokens during prefill
int32_t cache_ram_mib = 8192; // -1 = no limit, 0 - disable, 1 = 1 MiB, etc.
std::string hostname = "127.0.0.1";
diff --git a/tools/cli/README.md b/tools/cli/README.md
index 840976a884..de0b780409 100644
--- a/tools/cli/README.md
+++ b/tools/cli/README.md
@@ -176,8 +176,8 @@
| `-rea, --reasoning [on\|off\|auto]` | Use reasoning/thinking in the chat ('on', 'off', or 'auto', default: 'auto' (detect from template))
(env: LLAMA_ARG_REASONING) |
| `--reasoning-budget N` | token budget for thinking: -1 for unrestricted, 0 for immediate end, N>0 for token budget (default: -1)
(env: LLAMA_ARG_THINK_BUDGET) |
| `--reasoning-budget-message MESSAGE` | message injected before the end-of-thinking tag when reasoning budget is exhausted (default: none)
(env: LLAMA_ARG_THINK_BUDGET_MESSAGE) |
-| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE) |
-| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
+| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE) |
+| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
| `--skip-chat-parsing, --no-skip-chat-parsing` | force a pure content parser, even if a Jinja template is specified; model will output everything in the content section, including any reasoning and/or tool calls (default: disabled)
(env: LLAMA_ARG_SKIP_CHAT_PARSING) |
| `--simple-io` | use basic IO for better compatibility in subprocesses and limited consoles |
| `--draft, --draft-n, --draft-max N` | number of tokens to draft for speculative decoding (default: 16)
(env: LLAMA_ARG_DRAFT_MAX) |
diff --git a/tools/completion/README.md b/tools/completion/README.md
index 25884ed92d..fe1a036a38 100644
--- a/tools/completion/README.md
+++ b/tools/completion/README.md
@@ -255,8 +255,8 @@ llama-completion.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --ignore-eos -n -1
| `-rea, --reasoning [on\|off\|auto]` | Use reasoning/thinking in the chat ('on', 'off', or 'auto', default: 'auto' (detect from template))
(env: LLAMA_ARG_REASONING) |
| `--reasoning-budget N` | token budget for thinking: -1 for unrestricted, 0 for immediate end, N>0 for token budget (default: -1)
(env: LLAMA_ARG_THINK_BUDGET) |
| `--reasoning-budget-message MESSAGE` | message injected before the end-of-thinking tag when reasoning budget is exhausted (default: none)
(env: LLAMA_ARG_THINK_BUDGET_MESSAGE) |
-| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE) |
-| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
+| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE) |
+| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
| `--skip-chat-parsing, --no-skip-chat-parsing` | force a pure content parser, even if a Jinja template is specified; model will output everything in the content section, including any reasoning and/or tool calls (default: disabled)
(env: LLAMA_ARG_SKIP_CHAT_PARSING) |
| `--simple-io` | use basic IO for better compatibility in subprocesses and limited consoles |
diff --git a/tools/server/README.md b/tools/server/README.md
index 1bd8201689..b30309bf3b 100644
--- a/tools/server/README.md
+++ b/tools/server/README.md
@@ -167,6 +167,7 @@ For the full list of features, please refer to [server's changelog](https://gith
| `-cpent, --checkpoint-every-n-tokens N` | create a checkpoint every n tokens during prefill (processing), -1 to disable (default: 8192)
(env: LLAMA_ARG_CHECKPOINT_EVERY_NT) |
| `-cram, --cache-ram N` | set the maximum cache size in MiB (default: 8192, -1 - no limit, 0 - disable)[(more info)](https://github.com/ggml-org/llama.cpp/pull/16391)
(env: LLAMA_ARG_CACHE_RAM) |
| `-kvu, --kv-unified, -no-kvu, --no-kv-unified` | use single unified KV buffer shared across all sequences (default: enabled if number of slots is auto)
(env: LLAMA_ARG_KV_UNIFIED) |
+| `--clear-idle, --no-clear-idle` | save and clear idle slots on new task (default: enabled, requires unified KV and cache-ram)
(env: LLAMA_ARG_CLEAR_IDLE) |
| `--context-shift, --no-context-shift` | whether to use context shift on infinite text generation (default: disabled)
(env: LLAMA_ARG_CONTEXT_SHIFT) |
| `-r, --reverse-prompt PROMPT` | halt generation at PROMPT, return control in interactive mode |
| `-sp, --special` | special tokens output enabled (default: false) |
@@ -221,8 +222,8 @@ For the full list of features, please refer to [server's changelog](https://gith
| `-rea, --reasoning [on\|off\|auto]` | Use reasoning/thinking in the chat ('on', 'off', or 'auto', default: 'auto' (detect from template))
(env: LLAMA_ARG_REASONING) |
| `--reasoning-budget N` | token budget for thinking: -1 for unrestricted, 0 for immediate end, N>0 for token budget (default: -1)
(env: LLAMA_ARG_THINK_BUDGET) |
| `--reasoning-budget-message MESSAGE` | message injected before the end-of-thinking tag when reasoning budget is exhausted (default: none)
(env: LLAMA_ARG_THINK_BUDGET_MESSAGE) |
-| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE) |
-| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
+| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE) |
+| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
| `--skip-chat-parsing, --no-skip-chat-parsing` | force a pure content parser, even if a Jinja template is specified; model will output everything in the content section, including any reasoning and/or tool calls (default: disabled)
(env: LLAMA_ARG_SKIP_CHAT_PARSING) |
| `--prefill-assistant, --no-prefill-assistant` | whether to prefill the assistant's response if the last message is an assistant message (default: prefill enabled)
when this flag is set, if the last message is an assistant message then it will be treated as a full message and not prefilled
(env: LLAMA_ARG_PREFILL_ASSISTANT) |
| `-sps, --slot-prompt-similarity SIMILARITY` | how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.10, 0.0 = disabled) |
diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp
index 6f737d94d0..bd2552f75f 100644
--- a/tools/server/server-context.cpp
+++ b/tools/server/server-context.cpp
@@ -605,6 +605,17 @@ private:
llama_batch_free(batch);
}
+ void slot_save_and_clear(server_slot & slot) {
+ if (slot.prompt.n_tokens() == 0) {
+ return;
+ }
+ SLT_INF(slot, "%s", "saving idle slot to prompt cache\n");
+ SLT_DBG(slot, "%s", "__TEST_TAG_CLEAR_IDLE_SLOT__\n");
+ slot.prompt_save(*prompt_cache);
+ slot.prompt_clear(false);
+ prompt_cache->update();
+ }
+
void handle_sleeping_state(bool new_state) {
GGML_ASSERT(sleeping != new_state);
if (new_state) {
@@ -864,6 +875,19 @@ private:
metrics.init();
+ if (params_base.clear_idle) {
+ if (!params_base.kv_unified) {
+ SRV_WRN("%s: --clear-idle requires --kv-unified, disabling\n", __func__);
+ params_base.clear_idle = false;
+ } else if (params_base.cache_ram_mib == 0) {
+ SRV_WRN("%s: --clear-idle requires --cache-ram, disabling\n", __func__);
+ params_base.clear_idle = false;
+ } else {
+ SRV_INF("%s: idle slots will be saved to prompt cache and cleared upon starting a new task\n", __func__);
+ SRV_DBG("%s", "__TEST_TAG_CLEAR_IDLE_ENABLED__\n");
+ }
+ }
+
// populate webui settings
{
if (!params_base.webui_config_json.empty()) {
@@ -1010,15 +1034,15 @@ private:
// cache prompts only for completion tasks
update_cache = update_cache && task.type == SERVER_TASK_TYPE_COMPLETION;
- // don't update the cache if the slot's context is empty
- update_cache = update_cache && tokens.size() > 0;
-
if (update_cache) {
SRV_WRN("%s", "updating prompt cache\n");
const int64_t t_start = ggml_time_us();
- ret->prompt_save(*prompt_cache);
+ // don't save the slot's state if its context is empty
+ if (tokens.size() > 0) {
+ ret->prompt_save(*prompt_cache);
+ }
if (!ret->prompt_load(*prompt_cache, task.tokens)) {
ret->prompt_clear(false);
@@ -1692,9 +1716,7 @@ private:
const int id_slot = task.id_slot;
const int id_task = task.id;
- server_slot * slot = id_slot != -1
- ? get_slot_by_id(id_slot)
- : get_available_slot(task);
+ server_slot * slot = id_slot != -1 ? get_slot_by_id(id_slot) : get_available_slot(task);
//
// slot scheduling logic
@@ -1731,6 +1753,14 @@ private:
SRV_ERR("failed to launch slot with task, id_task = %d\n", id_task);
break; // drop the task
}
+
+ if (params_base.clear_idle) {
+ for (auto & s : slots) {
+ if (!s.is_processing()) {
+ slot_save_and_clear(s);
+ }
+ }
+ }
} break;
case SERVER_TASK_TYPE_CANCEL:
{
diff --git a/tools/server/server-task.cpp b/tools/server/server-task.cpp
index 3018ac90f8..4cc87bc507 100644
--- a/tools/server/server-task.cpp
+++ b/tools/server/server-task.cpp
@@ -2008,7 +2008,7 @@ server_prompt * server_prompt_cache::alloc(const server_prompt & prompt, size_t
bool server_prompt_cache::load(server_prompt & prompt, const server_tokens & tokens_new, llama_context * ctx, int32_t id_slot) {
const int lcp_best = prompt.tokens.get_common_prefix(tokens_new);
- float f_keep_best = float(lcp_best) / prompt.tokens.size();
+ float f_keep_best = prompt.tokens.size() > 0 ? float(lcp_best) / prompt.tokens.size() : -1.0f; // empty slot: any cache entry wins
float sim_best = float(lcp_best) / tokens_new.size();
SRV_WRN(" - looking for better prompt, base f_keep = %.3f, sim = %.3f\n", f_keep_best, sim_best);
diff --git a/tools/server/tests/unit/test_kv_keep_only_active.py b/tools/server/tests/unit/test_kv_keep_only_active.py
new file mode 100644
index 0000000000..da93d50011
--- /dev/null
+++ b/tools/server/tests/unit/test_kv_keep_only_active.py
@@ -0,0 +1,115 @@
+import os
+import tempfile
+import pytest
+from utils import *
+
+server = ServerPreset.tinyllama2()
+
+class LogReader:
+ def __init__(self, path):
+ self.path = path
+ self.pos = 0
+ def drain(self):
+ with open(self.path) as f:
+ f.seek(self.pos)
+ content = f.read()
+ self.pos = f.tell()
+ return content
+
+@pytest.fixture(autouse=True)
+def create_server():
+ global server
+ server = ServerPreset.tinyllama2()
+ server.n_slots = 2
+ server.n_predict = 4
+ server.temperature = 0.0
+ server.server_slots = True
+ server.cache_ram = 100
+ server.kv_unified = True
+ server.debug = True
+ fd, server.log_path = tempfile.mkstemp(suffix='.log')
+ os.close(fd)
+ yield
+
+
+LONG_PROMPT = (
+ "Once upon a time in a land far away, there lived a brave knight "
+ "who traveled across mountains and rivers to find the legendary "
+ "golden sword hidden deep within the enchanted forest of whispers. "
+ "He met many creatures along the way including dragons and fairies "
+ "and wizards who helped him on his noble quest to save the kingdom."
+)
+
+
+# an idle slot that was cleared at task launch must be restorable from the RAM prompt cache
+def test_clear_and_restore():
+ global server
+ server.start()
+ log = LogReader(server.log_path)
+
+ # verify feature is enabled
+ assert "__TEST_TAG_CLEAR_IDLE_ENABLED__" in log.drain()
+
+ res = server.make_request("POST", "/completion", data={
+ "prompt": LONG_PROMPT,
+ "id_slot": 0,
+ "cache_prompt": True,
+ })
+ assert res.status_code == 200
+ original_prompt_n = res.body["timings"]["prompt_n"]
+
+    # At this task's launch both slots were still empty, so nothing was saved/cleared
+ assert "__TEST_TAG_CLEAR_IDLE_SLOT__" not in log.drain()
+
+ # Launching slot 1 clears idle slot 0
+ res = server.make_request("POST", "/completion", data={
+ "prompt": "The quick brown fox",
+ "id_slot": 1,
+ "cache_prompt": True,
+ })
+ assert res.status_code == 200
+ assert "__TEST_TAG_CLEAR_IDLE_SLOT__" in log.drain()
+
+ # Re-send same prompt — should restore from cache-ram
+ res = server.make_request("POST", "/completion", data={
+ "prompt": LONG_PROMPT,
+ "cache_prompt": True,
+ })
+ assert res.status_code == 200
+ assert "updating prompt cache" in log.drain()
+ assert res.body["timings"]["cache_n"] > 0
+ assert res.body["timings"]["prompt_n"] < original_prompt_n
+
+    # Follow-up reuses slot 0 (kept its KV); slot 1 is already empty, so nothing is cleared
+ res = server.make_request("POST", "/completion", data={
+ "prompt": LONG_PROMPT + " The knight finally reached the castle gates.",
+ "cache_prompt": True,
+ })
+ assert res.status_code == 200
+ assert "__TEST_TAG_CLEAR_IDLE_SLOT__" not in log.drain()
+
+
+def test_disabled_with_flag():
+ global server
+ server.no_clear_idle = True
+ server.start()
+ log = LogReader(server.log_path)
+
+ # Feature should not be enabled
+ assert "__TEST_TAG_CLEAR_IDLE_ENABLED__" not in log.drain()
+
+ res = server.make_request("POST", "/completion", data={
+ "prompt": LONG_PROMPT,
+ "id_slot": 0,
+ "cache_prompt": True,
+ })
+ assert res.status_code == 200
+
+ # Request on different slot — should NOT trigger clearing
+ res = server.make_request("POST", "/completion", data={
+ "prompt": "The quick brown fox",
+ "id_slot": 1,
+ "cache_prompt": True,
+ })
+ assert res.status_code == 200
+ assert "__TEST_TAG_CLEAR_IDLE_SLOT__" not in log.drain()
diff --git a/tools/server/tests/utils.py b/tools/server/tests/utils.py
index a9a7e3c4f3..5ddac5be49 100644
--- a/tools/server/tests/utils.py
+++ b/tools/server/tests/utils.py
@@ -102,6 +102,9 @@ class ServerProcess:
mmproj_url: str | None = None
media_path: str | None = None
sleep_idle_seconds: int | None = None
+ cache_ram: int | None = None
+ no_clear_idle: bool = False
+ log_path: str | None = None
webui_mcp_proxy: bool = False
# session variables
@@ -237,6 +240,10 @@ class ServerProcess:
server_args.extend(["--media-path", self.media_path])
if self.sleep_idle_seconds is not None:
server_args.extend(["--sleep-idle-seconds", self.sleep_idle_seconds])
+ if self.cache_ram is not None:
+ server_args.extend(["--cache-ram", self.cache_ram])
+ if self.no_clear_idle:
+ server_args.append("--no-clear-idle")
if self.webui_mcp_proxy:
server_args.append("--webui-mcp-proxy")
@@ -249,11 +256,16 @@ class ServerProcess:
flags |= subprocess.CREATE_NEW_PROCESS_GROUP
flags |= subprocess.CREATE_NO_WINDOW
+ if self.log_path:
+ self._log = open(self.log_path, "w")
+ else:
+ self._log = sys.stdout
+
self.process = subprocess.Popen(
[str(arg) for arg in [server_path, *server_args]],
creationflags=flags,
- stdout=sys.stdout,
- stderr=sys.stdout,
+ stdout=self._log,
+ stderr=self._log if self._log != sys.stdout else sys.stdout,
env={**os.environ, "LLAMA_CACHE": "tmp"} if "LLAMA_CACHE" not in os.environ else None,
)
server_instances.add(self)
@@ -298,6 +310,8 @@ class ServerProcess:
except Exception as e:
print(f"Error waiting for server: {e}")
self.process = None
+ if hasattr(self, '_log') and self._log != sys.stdout:
+ self._log.close()
def make_request(
self,