From d9146ed2922592ce6c09f278f6e49de7b5c07ab1 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Wed, 14 Jan 2026 12:49:21 +0200
Subject: [PATCH] server : consolidate slot reset/clear logic

---
 tools/server/server-context.cpp | 51 ++++++++++++++-------------------
 1 file changed, 22 insertions(+), 29 deletions(-)

diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp
index 01ac24200a..d968a94a81 100644
--- a/tools/server/server-context.cpp
+++ b/tools/server/server-context.cpp
@@ -127,6 +127,17 @@ struct server_slot {
         return res;
     }
 
+    void prompt_clear(bool allow_processing) {
+        if (!allow_processing) {
+            GGML_ASSERT(!is_processing());
+        }
+
+        SLT_INF(*this, "clearing prompt with %zu tokens\n", prompt.tokens.size());
+
+        llama_memory_seq_rm(llama_get_memory(ctx), id, -1, -1);
+        prompt.tokens.clear();
+    }
+
     std::vector<common_adapter_lora_info> lora;
     int32_t alora_invocation_start = -1;
 
@@ -176,25 +187,15 @@ struct server_slot {
         n_draft_total = 0;
         n_draft_accepted = 0;
 
+        task_prev = std::move(task);
         task.reset();
-        task_prev.reset();
+
+        llama_set_sampler(ctx, id, nullptr);
 
         // clear alora start
         alora_invocation_start = -1;
     }
 
-    // remove cached prompt + tokens
-    void clear(bool allow_processing) {
-        if (!allow_processing) {
-            GGML_ASSERT(!is_processing());
-        }
-
-        SLT_INF(*this, "clearing slot with %zu tokens\n", prompt.tokens.size());
-
-        llama_memory_seq_rm(llama_get_memory(ctx), id, -1, -1);
-        prompt.tokens.clear();
-    }
-
     void init_sampler() const {
         common_sampler_reset(smpl.get());
 
@@ -321,11 +322,10 @@ struct server_slot {
 
         // do not keep context of the child slots - the parent's context is enough
         if (is_child()) {
-            clear(false);
+            prompt_clear(false);
        }
 
-        task_prev = std::move(task);
-        task.reset();
+        reset();
 
        callback_on_release(id);
    }
@@ -773,6 +773,7 @@ private:
 
         slots.clear();
 
+        // initialize slots
         for (int i = 0; i < params_base.n_parallel; i++) {
             server_slot slot;
 
@@ -1021,7 +1022,7 @@ private:
             ret->prompt_save(*prompt_cache);
 
             if (!ret->prompt_load(*prompt_cache, task.tokens)) {
-                ret->clear(false);
+                ret->prompt_clear(false);
             }
 
             prompt_cache->update();
@@ -1053,7 +1054,7 @@ private:
             if (slot.prompt.n_tokens() > 0) {
                 SRV_WRN("purging slot %d with %zu tokens\n", slot.id, slot.prompt.tokens.size());
 
-                slot.clear(false);
+                slot.prompt_clear(false);
 
                 res = true;
 
@@ -1079,8 +1080,6 @@ private:
     }
 
     bool launch_slot_with_task(server_slot & slot, server_task && task) {
-        slot.reset();
-
         // process per-request lora adapters
         if (!task.params.lora.empty()) {
             auto task_loras = construct_lora_list(task.params.lora);
@@ -1838,7 +1837,7 @@ private:
                     // Erase token cache
                     const size_t n_erased = slot->prompt.tokens.size();
 
-                    slot->clear(false);
+                    slot->prompt_clear(false);
 
                     auto res = std::make_unique<server_task_result_slot_erase>();
                     res->id = task.id;
@@ -2395,7 +2394,7 @@ private:
                 if (!llama_memory_seq_rm(llama_get_memory(ctx), slot.id, p0, -1)) {
                     SLT_WRN(slot, "failed to truncate tokens with position >= %d - clearing the memory\n", p0);
 
-                    slot.clear(true);
+                    slot.prompt_clear(true);
 
                     // there is no common part left
                     slot.n_prompt_tokens_cache = 0;
@@ -2567,12 +2566,6 @@ private:
             llama_set_embeddings(ctx, slot_batched->task->need_embd());
         }
 
-        for (auto & slot : slots) {
-            if (!slot.is_processing() || !slot.smpl) {
-                llama_set_sampler(ctx, slot.id, nullptr);
-            }
-        }
-
         if (batch.n_tokens == 0) {
             SRV_WRN("%s", "no tokens to decode\n");
         }
@@ -2628,7 +2621,7 @@ private:
 
                     // note: it's complicated to keep track of how much of the current batch has been
                     //       processed before the error occurred, so we simply clear the entire context
-                    slot.clear(false);
+                    slot.prompt_clear(false);
                 }
             }
 
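For reference, a minimal standalone sketch (not part of the patch) of the lifecycle the change converges on. The toy_slot/toy_task types below are hypothetical stand-ins for the real server_slot, which additionally calls llama_memory_seq_rm() and llama_set_sampler(): prompt_clear() only drops the cached prompt, reset() clears per-task state while keeping the finished task as task_prev, and release() composes the two.

// Toy model of the consolidated slot lifecycle (hypothetical names).
#include <cassert>
#include <cstdio>
#include <memory>
#include <string>
#include <vector>

struct toy_task {
    std::string prompt;
};

struct toy_slot {
    int  id         = 0;
    bool processing = false;
    bool child      = false;

    std::vector<int> prompt_tokens;      // cached prompt tokens

    std::unique_ptr<toy_task> task;      // current task
    std::unique_ptr<toy_task> task_prev; // previous task, kept after reset()

    bool is_processing() const { return processing; }
    bool is_child()      const { return child; }

    // analogue of prompt_clear(): only the prompt cache is affected
    // (the real code also removes the slot's sequence via llama_memory_seq_rm())
    void prompt_clear(bool allow_processing) {
        if (!allow_processing) {
            assert(!is_processing());
        }
        std::printf("slot %d: clearing prompt with %zu tokens\n", id, prompt_tokens.size());
        prompt_tokens.clear();
    }

    // analogue of reset(): per-task state goes away, the old task is kept
    // (the real code also detaches the sampler via llama_set_sampler(ctx, id, nullptr))
    void reset() {
        task_prev = std::move(task);
        task.reset();
    }

    // analogue of release(): child slots drop their prompt, every slot is reset
    void release() {
        processing = false;
        if (is_child()) {
            prompt_clear(false);
        }
        reset();
    }
};

int main() {
    toy_slot slot;
    slot.task          = std::make_unique<toy_task>();
    slot.task->prompt  = "hello";
    slot.prompt_tokens = {1, 2, 3};

    slot.release(); // parent slot: prompt cache survives, task moves to task_prev

    std::printf("prompt kept: %zu tokens, task_prev set: %d\n",
                slot.prompt_tokens.size(), slot.task_prev != nullptr);
    return 0;
}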