diff --git a/common/arg.cpp b/common/arg.cpp index 649216b7f0..2e0f46db51 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -1311,6 +1311,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.kv_unified = value; } ).set_env("LLAMA_ARG_KV_UNIFIED").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_BATCHED, LLAMA_EXAMPLE_BENCH, LLAMA_EXAMPLE_PARALLEL})); + add_opt(common_arg( + {"--clear-idle"}, + {"--no-clear-idle"}, + "save and clear idle slots on new task (default: enabled, requires unified KV and cache-ram)", + [](common_params & params, bool value) { + params.clear_idle = value; + } + ).set_env("LLAMA_ARG_CLEAR_IDLE").set_examples({LLAMA_EXAMPLE_SERVER})); add_opt(common_arg( {"--context-shift"}, {"--no-context-shift"}, diff --git a/common/common.h b/common/common.h index 31a337daa6..020b6a721f 100644 --- a/common/common.h +++ b/common/common.h @@ -579,8 +579,9 @@ struct common_params { int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool) int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting bool cache_prompt = true; // whether to enable prompt caching - int32_t n_ctx_checkpoints = 32; // max number of context checkpoints per slot - int32_t checkpoint_every_nt = 8192; // make a checkpoint every n tokens during prefill + bool clear_idle = true; // save and clear idle slots upon starting a new task + int32_t n_ctx_checkpoints = 32; // max number of context checkpoints per slot + int32_t checkpoint_every_nt = 8192; // make a checkpoint every n tokens during prefill int32_t cache_ram_mib = 8192; // -1 = no limit, 0 - disable, 1 = 1 MiB, etc. std::string hostname = "127.0.0.1"; diff --git a/tools/cli/README.md b/tools/cli/README.md index 840976a884..de0b780409 100644 --- a/tools/cli/README.md +++ b/tools/cli/README.md @@ -176,8 +176,8 @@ | `-rea, --reasoning [on\|off\|auto]` | Use reasoning/thinking in the chat ('on', 'off', or 'auto', default: 'auto' (detect from template))
(env: LLAMA_ARG_REASONING) | | `--reasoning-budget N` | token budget for thinking: -1 for unrestricted, 0 for immediate end, N>0 for token budget (default: -1)
(env: LLAMA_ARG_THINK_BUDGET) | | `--reasoning-budget-message MESSAGE` | message injected before the end-of-thinking tag when reasoning budget is exhausted (default: none)
(env: LLAMA_ARG_THINK_BUDGET_MESSAGE) | -| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE) | -| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) | +| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE) | +| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) | | `--skip-chat-parsing, --no-skip-chat-parsing` | force a pure content parser, even if a Jinja template is specified; model will output everything in the content section, including any reasoning and/or tool calls (default: disabled)
(env: LLAMA_ARG_SKIP_CHAT_PARSING) | | `--simple-io` | use basic IO for better compatibility in subprocesses and limited consoles | | `--draft, --draft-n, --draft-max N` | number of tokens to draft for speculative decoding (default: 16)
(env: LLAMA_ARG_DRAFT_MAX) | diff --git a/tools/completion/README.md b/tools/completion/README.md index 25884ed92d..fe1a036a38 100644 --- a/tools/completion/README.md +++ b/tools/completion/README.md @@ -255,8 +255,8 @@ llama-completion.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --ignore-eos -n -1 | `-rea, --reasoning [on\|off\|auto]` | Use reasoning/thinking in the chat ('on', 'off', or 'auto', default: 'auto' (detect from template))
(env: LLAMA_ARG_REASONING) | | `--reasoning-budget N` | token budget for thinking: -1 for unrestricted, 0 for immediate end, N>0 for token budget (default: -1)
(env: LLAMA_ARG_THINK_BUDGET) | | `--reasoning-budget-message MESSAGE` | message injected before the end-of-thinking tag when reasoning budget is exhausted (default: none)
(env: LLAMA_ARG_THINK_BUDGET_MESSAGE) | -| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE) | -| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) | +| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE) | +| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) | | `--skip-chat-parsing, --no-skip-chat-parsing` | force a pure content parser, even if a Jinja template is specified; model will output everything in the content section, including any reasoning and/or tool calls (default: disabled)
(env: LLAMA_ARG_SKIP_CHAT_PARSING) | | `--simple-io` | use basic IO for better compatibility in subprocesses and limited consoles | diff --git a/tools/server/README.md b/tools/server/README.md index 1bd8201689..b30309bf3b 100644 --- a/tools/server/README.md +++ b/tools/server/README.md @@ -167,6 +167,7 @@ For the full list of features, please refer to [server's changelog](https://gith | `-cpent, --checkpoint-every-n-tokens N` | create a checkpoint every n tokens during prefill (processing), -1 to disable (default: 8192)
(env: LLAMA_ARG_CHECKPOINT_EVERY_NT) | | `-cram, --cache-ram N` | set the maximum cache size in MiB (default: 8192, -1 - no limit, 0 - disable)[(more info)](https://github.com/ggml-org/llama.cpp/pull/16391)
(env: LLAMA_ARG_CACHE_RAM) | | `-kvu, --kv-unified, -no-kvu, --no-kv-unified` | use single unified KV buffer shared across all sequences (default: enabled if number of slots is auto)
(env: LLAMA_ARG_KV_UNIFIED) | +| `--clear-idle, --no-clear-idle` | save and clear idle slots on new task (default: enabled, requires unified KV and cache-ram)
(env: LLAMA_ARG_CLEAR_IDLE) | | `--context-shift, --no-context-shift` | whether to use context shift on infinite text generation (default: disabled)
(env: LLAMA_ARG_CONTEXT_SHIFT) | | `-r, --reverse-prompt PROMPT` | halt generation at PROMPT, return control in interactive mode | | `-sp, --special` | special tokens output enabled (default: false) | @@ -221,8 +222,8 @@ For the full list of features, please refer to [server's changelog](https://gith | `-rea, --reasoning [on\|off\|auto]` | Use reasoning/thinking in the chat ('on', 'off', or 'auto', default: 'auto' (detect from template))
(env: LLAMA_ARG_REASONING) | | `--reasoning-budget N` | token budget for thinking: -1 for unrestricted, 0 for immediate end, N>0 for token budget (default: -1)
(env: LLAMA_ARG_THINK_BUDGET) | | `--reasoning-budget-message MESSAGE` | message injected before the end-of-thinking tag when reasoning budget is exhausted (default: none)
(env: LLAMA_ARG_THINK_BUDGET_MESSAGE) | -| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE) | -| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) | +| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE) | +| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) | | `--skip-chat-parsing, --no-skip-chat-parsing` | force a pure content parser, even if a Jinja template is specified; model will output everything in the content section, including any reasoning and/or tool calls (default: disabled)
(env: LLAMA_ARG_SKIP_CHAT_PARSING) | | `--prefill-assistant, --no-prefill-assistant` | whether to prefill the assistant's response if the last message is an assistant message (default: prefill enabled)
when this flag is set, if the last message is an assistant message then it will be treated as a full message and not prefilled

(env: LLAMA_ARG_PREFILL_ASSISTANT) | | `-sps, --slot-prompt-similarity SIMILARITY` | how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.10, 0.0 = disabled) | diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index 6f737d94d0..bd2552f75f 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -605,6 +605,17 @@ private: llama_batch_free(batch); } + void slot_save_and_clear(server_slot & slot) { + if (slot.prompt.n_tokens() == 0) { + return; + } + SLT_INF(slot, "%s", "saving idle slot to prompt cache\n"); + SLT_DBG(slot, "%s", "__TEST_TAG_CLEAR_IDLE_SLOT__\n"); + slot.prompt_save(*prompt_cache); + slot.prompt_clear(false); + prompt_cache->update(); + } + void handle_sleeping_state(bool new_state) { GGML_ASSERT(sleeping != new_state); if (new_state) { @@ -864,6 +875,19 @@ private: metrics.init(); + if (params_base.clear_idle) { + if (!params_base.kv_unified) { + SRV_WRN("%s: --clear-idle requires --kv-unified, disabling\n", __func__); + params_base.clear_idle = false; + } else if (params_base.cache_ram_mib == 0) { + SRV_WRN("%s: --clear-idle requires --cache-ram, disabling\n", __func__); + params_base.clear_idle = false; + } else { + SRV_INF("%s: idle slots will be saved to prompt cache and cleared upon starting a new task\n", __func__); + SRV_DBG("%s", "__TEST_TAG_CLEAR_IDLE_ENABLED__\n"); + } + } + // populate webui settings { if (!params_base.webui_config_json.empty()) { @@ -1010,15 +1034,15 @@ private: // cache prompts only for completion tasks update_cache = update_cache && task.type == SERVER_TASK_TYPE_COMPLETION; - // don't update the cache if the slot's context is empty - update_cache = update_cache && tokens.size() > 0; - if (update_cache) { SRV_WRN("%s", "updating prompt cache\n"); const int64_t t_start = ggml_time_us(); - ret->prompt_save(*prompt_cache); + // don't save the slot's state if its context is empty + if (tokens.size() > 0) { + ret->prompt_save(*prompt_cache); + } if (!ret->prompt_load(*prompt_cache, task.tokens)) { ret->prompt_clear(false); @@ -1692,9 +1716,7 @@ private: const int id_slot = task.id_slot; const int id_task = task.id; - server_slot * slot = id_slot != -1 - ? get_slot_by_id(id_slot) - : get_available_slot(task); + server_slot * slot = id_slot != -1 ? get_slot_by_id(id_slot) : get_available_slot(task); // // slot scheduling logic @@ -1731,6 +1753,14 @@ private: SRV_ERR("failed to launch slot with task, id_task = %d\n", id_task); break; // drop the task } + + if (params_base.clear_idle) { + for (auto & s : slots) { + if (!s.is_processing()) { + slot_save_and_clear(s); + } + } + } } break; case SERVER_TASK_TYPE_CANCEL: { diff --git a/tools/server/server-task.cpp b/tools/server/server-task.cpp index 3018ac90f8..4cc87bc507 100644 --- a/tools/server/server-task.cpp +++ b/tools/server/server-task.cpp @@ -2008,7 +2008,7 @@ server_prompt * server_prompt_cache::alloc(const server_prompt & prompt, size_t bool server_prompt_cache::load(server_prompt & prompt, const server_tokens & tokens_new, llama_context * ctx, int32_t id_slot) { const int lcp_best = prompt.tokens.get_common_prefix(tokens_new); - float f_keep_best = float(lcp_best) / prompt.tokens.size(); + float f_keep_best = prompt.tokens.size() > 0 ? float(lcp_best) / prompt.tokens.size() : -1.0f; // empty slot: any cache entry wins float sim_best = float(lcp_best) / tokens_new.size(); SRV_WRN(" - looking for better prompt, base f_keep = %.3f, sim = %.3f\n", f_keep_best, sim_best); diff --git a/tools/server/tests/unit/test_kv_keep_only_active.py b/tools/server/tests/unit/test_kv_keep_only_active.py new file mode 100644 index 0000000000..da93d50011 --- /dev/null +++ b/tools/server/tests/unit/test_kv_keep_only_active.py @@ -0,0 +1,115 @@ +import os +import tempfile +import pytest +from utils import * + +server = ServerPreset.tinyllama2() + +class LogReader: + def __init__(self, path): + self.path = path + self.pos = 0 + def drain(self): + with open(self.path) as f: + f.seek(self.pos) + content = f.read() + self.pos = f.tell() + return content + +@pytest.fixture(autouse=True) +def create_server(): + global server + server = ServerPreset.tinyllama2() + server.n_slots = 2 + server.n_predict = 4 + server.temperature = 0.0 + server.server_slots = True + server.cache_ram = 100 + server.kv_unified = True + server.debug = True + fd, server.log_path = tempfile.mkstemp(suffix='.log') + os.close(fd) + yield + + +LONG_PROMPT = ( + "Once upon a time in a land far away, there lived a brave knight " + "who traveled across mountains and rivers to find the legendary " + "golden sword hidden deep within the enchanted forest of whispers. " + "He met many creatures along the way including dragons and fairies " + "and wizards who helped him on his noble quest to save the kingdom." +) + + +# idle slot cleared on launch should restore from cache-ram +def test_clear_and_restore(): + global server + server.start() + log = LogReader(server.log_path) + + # verify feature is enabled + assert "__TEST_TAG_CLEAR_IDLE_ENABLED__" in log.drain() + + res = server.make_request("POST", "/completion", data={ + "prompt": LONG_PROMPT, + "id_slot": 0, + "cache_prompt": True, + }) + assert res.status_code == 200 + original_prompt_n = res.body["timings"]["prompt_n"] + + # Slot 0 is the only slot with KV — should NOT be cleared + assert "__TEST_TAG_CLEAR_IDLE_SLOT__" not in log.drain() + + # Launching slot 1 clears idle slot 0 + res = server.make_request("POST", "/completion", data={ + "prompt": "The quick brown fox", + "id_slot": 1, + "cache_prompt": True, + }) + assert res.status_code == 200 + assert "__TEST_TAG_CLEAR_IDLE_SLOT__" in log.drain() + + # Re-send same prompt — should restore from cache-ram + res = server.make_request("POST", "/completion", data={ + "prompt": LONG_PROMPT, + "cache_prompt": True, + }) + assert res.status_code == 200 + assert "updating prompt cache" in log.drain() + assert res.body["timings"]["cache_n"] > 0 + assert res.body["timings"]["prompt_n"] < original_prompt_n + + # Follow-up — slot 0 kept its KV, no clearing needed + res = server.make_request("POST", "/completion", data={ + "prompt": LONG_PROMPT + " The knight finally reached the castle gates.", + "cache_prompt": True, + }) + assert res.status_code == 200 + assert "__TEST_TAG_CLEAR_IDLE_SLOT__" not in log.drain() + + +def test_disabled_with_flag(): + global server + server.no_clear_idle = True + server.start() + log = LogReader(server.log_path) + + # Feature should not be enabled + assert "__TEST_TAG_CLEAR_IDLE_ENABLED__" not in log.drain() + + res = server.make_request("POST", "/completion", data={ + "prompt": LONG_PROMPT, + "id_slot": 0, + "cache_prompt": True, + }) + assert res.status_code == 200 + + # Request on different slot — should NOT trigger clearing + res = server.make_request("POST", "/completion", data={ + "prompt": "The quick brown fox", + "id_slot": 1, + "cache_prompt": True, + }) + assert res.status_code == 200 + assert "__TEST_TAG_CLEAR_IDLE_SLOT__" not in log.drain() diff --git a/tools/server/tests/utils.py b/tools/server/tests/utils.py index a9a7e3c4f3..5ddac5be49 100644 --- a/tools/server/tests/utils.py +++ b/tools/server/tests/utils.py @@ -102,6 +102,9 @@ class ServerProcess: mmproj_url: str | None = None media_path: str | None = None sleep_idle_seconds: int | None = None + cache_ram: int | None = None + no_clear_idle: bool = False + log_path: str | None = None webui_mcp_proxy: bool = False # session variables @@ -237,6 +240,10 @@ class ServerProcess: server_args.extend(["--media-path", self.media_path]) if self.sleep_idle_seconds is not None: server_args.extend(["--sleep-idle-seconds", self.sleep_idle_seconds]) + if self.cache_ram is not None: + server_args.extend(["--cache-ram", self.cache_ram]) + if self.no_clear_idle: + server_args.append("--no-clear-idle") if self.webui_mcp_proxy: server_args.append("--webui-mcp-proxy") @@ -249,11 +256,16 @@ class ServerProcess: flags |= subprocess.CREATE_NEW_PROCESS_GROUP flags |= subprocess.CREATE_NO_WINDOW + if self.log_path: + self._log = open(self.log_path, "w") + else: + self._log = sys.stdout + self.process = subprocess.Popen( [str(arg) for arg in [server_path, *server_args]], creationflags=flags, - stdout=sys.stdout, - stderr=sys.stdout, + stdout=self._log, + stderr=self._log if self._log != sys.stdout else sys.stdout, env={**os.environ, "LLAMA_CACHE": "tmp"} if "LLAMA_CACHE" not in os.environ else None, ) server_instances.add(self) @@ -298,6 +310,8 @@ class ServerProcess: except Exception as e: print(f"Error waiting for server: {e}") self.process = None + if hasattr(self, '_log') and self._log != sys.stdout: + self._log.close() def make_request( self,