server : consolidate slot reset/clear logic

Georgi Gerganov 2026-01-14 12:49:21 +02:00
parent 3084bfe633
commit d9146ed292
1 changed file with 22 additions and 29 deletions


@@ -127,6 +127,17 @@ struct server_slot {
         return res;
     }
 
+    void prompt_clear(bool allow_processing) {
+        if (!allow_processing) {
+            GGML_ASSERT(!is_processing());
+        }
+
+        SLT_INF(*this, "clearing prompt with %zu tokens\n", prompt.tokens.size());
+
+        llama_memory_seq_rm(llama_get_memory(ctx), id, -1, -1);
+
+        prompt.tokens.clear();
+    }
+
     std::vector<common_adapter_lora_info> lora;
     int32_t alora_invocation_start = -1;
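A minimal, self-contained sketch of the allow_processing contract that the new prompt_clear() enforces, using stand-in types rather than the server's real ones: passing false asserts the slot is idle, while true permits clearing a slot that is still mid-task.

```cpp
// Sketch of prompt_clear()'s allow_processing contract (stand-in types).
#include <cassert>
#include <cstdio>
#include <vector>

struct slot_sketch {
    std::vector<int> tokens;
    bool processing = false;

    bool is_processing() const { return processing; }

    void prompt_clear(bool allow_processing) {
        // callers passing `false` promise the slot is idle
        if (!allow_processing) {
            assert(!is_processing());
        }
        std::printf("clearing prompt with %zu tokens\n", tokens.size());
        tokens.clear();
    }
};

int main() {
    slot_sketch s;
    s.tokens = {1, 2, 3};
    s.prompt_clear(false); // idle slot: the assertion holds
    return 0;
}
```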
@@ -176,25 +187,15 @@ struct server_slot {
         n_draft_total = 0;
         n_draft_accepted = 0;
 
+        task_prev = std::move(task);
         task.reset();
-        task_prev.reset();
+
+        llama_set_sampler(ctx, id, nullptr);
 
         // clear alora start
         alora_invocation_start = -1;
     }
 
-    // remove cached prompt + tokens
-    void clear(bool allow_processing) {
-        if (!allow_processing) {
-            GGML_ASSERT(!is_processing());
-        }
-
-        SLT_INF(*this, "clearing slot with %zu tokens\n", prompt.tokens.size());
-
-        llama_memory_seq_rm(llama_get_memory(ctx), id, -1, -1);
-
-        prompt.tokens.clear();
-    }
 
     void init_sampler() const {
         common_sampler_reset(smpl.get());
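The interesting part of the consolidated reset() is the handover of task into task_prev. A small sketch of that pattern, assuming a unique_ptr-style owner, which matches the std::move/reset() usage in the hunk above:

```cpp
// Why the consolidated reset() moves `task` into `task_prev` instead of
// resetting both: the finished request is kept around for one more turn.
#include <cassert>
#include <memory>
#include <utility>

struct server_task { int id; }; // stand-in for the real task type

int main() {
    std::unique_ptr<server_task> task = std::make_unique<server_task>(server_task{42});
    std::unique_ptr<server_task> task_prev;

    // old: task.reset(); task_prev.reset();  -> both end up empty
    // new: hand the finished task over before clearing
    task_prev = std::move(task); // task is now null, task_prev owns id 42
    task.reset();                // no-op after the move

    assert(task == nullptr);
    assert(task_prev != nullptr && task_prev->id == 42);
    return 0;
}
```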
@@ -321,11 +322,10 @@ struct server_slot {
         // do not keep context of the child slots - the parent's context is enough
         if (is_child()) {
-            clear(false);
+            prompt_clear(false);
         }
 
-        task_prev = std::move(task);
-        task.reset();
+        reset();
 
         callback_on_release(id);
     }
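Taken together, release() now funnels all per-slot cleanup through two calls: prompt_clear(false) for child slots, whose context the parent already covers, and reset() for everything else. A rough outline of the resulting order, with hypothetical stand-ins for the real members and the release callback elided to a comment:

```cpp
// Outline of the release() ordering after this change (stand-in types).
struct slot_outline {
    bool child = false;

    bool is_child() const { return child; }
    void prompt_clear(bool /*allow_processing*/) { /* drop cached prompt */ }
    void reset() { /* move task -> task_prev, detach sampler, ... */ }

    void release() {
        // child slots drop their cached prompt - the parent's context is enough
        if (is_child()) {
            prompt_clear(false);
        }
        // all remaining per-request state is cleared in one place now
        reset();
        // notify the server that the slot is free again (callback elided)
    }
};

int main() {
    slot_outline s;
    s.child = true;
    s.release();
    return 0;
}
```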
@@ -773,6 +773,7 @@ private:
+        slots.clear();
 
         // initialize slots
         for (int i = 0; i < params_base.n_parallel; i++) {
             server_slot slot;
@@ -1021,7 +1022,7 @@ private:
             ret->prompt_save(*prompt_cache);
 
             if (!ret->prompt_load(*prompt_cache, task.tokens)) {
-                ret->clear(false);
+                ret->prompt_clear(false);
             }
 
             prompt_cache->update();
@@ -1053,7 +1054,7 @@ private:
         if (slot.prompt.n_tokens() > 0) {
             SRV_WRN("purging slot %d with %zu tokens\n", slot.id, slot.prompt.tokens.size());
 
-            slot.clear(false);
+            slot.prompt_clear(false);
 
             res = true;
@@ -1079,8 +1080,6 @@ private:
     }
 
     bool launch_slot_with_task(server_slot & slot, server_task && task) {
-        slot.reset();
-
         // process per-request lora adapters
         if (!task.params.lora.empty()) {
             auto task_loras = construct_lora_list(task.params.lora);
@@ -1838,7 +1837,7 @@ private:
                 // Erase token cache
                 const size_t n_erased = slot->prompt.tokens.size();
-                slot->clear(false);
+                slot->prompt_clear(false);
 
                 auto res = std::make_unique<server_task_result_slot_erase>();
                 res->id = task.id;
@@ -2395,7 +2394,7 @@ private:
                 if (!llama_memory_seq_rm(llama_get_memory(ctx), slot.id, p0, -1)) {
                     SLT_WRN(slot, "failed to truncate tokens with position >= %d - clearing the memory\n", p0);
-                    slot.clear(true);
+                    slot.prompt_clear(true);
 
                     // there is no common part left
                     slot.n_prompt_tokens_cache = 0;
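This is the one call site that passes allow_processing = true: the slot is still processing when llama_memory_seq_rm() fails, so the idle assertion must be skipped and the whole cached prompt dropped instead. A sketch of that fallback shape, with try_truncate() as a hypothetical stand-in for the failing backend call:

```cpp
// Truncation fallback: if removing tokens from position p0 onward fails,
// drop the whole cached prompt and start from scratch (stand-in types).
#include <cstdio>
#include <vector>

struct slot_sketch {
    std::vector<int> tokens;
    int n_prompt_tokens_cache = 0;

    void prompt_clear(bool /*allow_processing*/) { tokens.clear(); }
};

// stand-in: returns false when partial removal is not supported
static bool try_truncate(int /*p0*/) { return false; }

int main() {
    slot_sketch slot;
    slot.tokens = {1, 2, 3, 4};
    slot.n_prompt_tokens_cache = 4;

    const int p0 = 2;
    if (!try_truncate(p0)) {
        std::printf("failed to truncate tokens with position >= %d - clearing the memory\n", p0);
        // the slot is still mid-task here, hence allow_processing = true
        slot.prompt_clear(true);
        // there is no common part left
        slot.n_prompt_tokens_cache = 0;
    }
    return 0;
}
```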
@@ -2567,12 +2566,6 @@ private:
             llama_set_embeddings(ctx, slot_batched->task->need_embd());
         }
 
-        for (auto & slot : slots) {
-            if (!slot.is_processing() || !slot.smpl) {
-                llama_set_sampler(ctx, slot.id, nullptr);
-            }
-        }
-
        if (batch.n_tokens == 0) {
            SRV_WRN("%s", "no tokens to decode\n");
        }
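The pre-decode sweep that detached samplers from idle slots is removed: with reset() now calling llama_set_sampler(ctx, id, nullptr) (see the reset() hunk above), the detach happens once at the state transition instead of being re-checked on every batch. A toy illustration of that design move, with set_sampler() as a stand-in for the real call:

```cpp
// Detach the sampler once, when the slot is reset, instead of sweeping
// all slots before every decode (stand-in for llama_set_sampler).
#include <cstdio>

static void set_sampler(int slot_id, const void * smpl) {
    std::printf("slot %d: sampler %s\n", slot_id, smpl ? "attached" : "detached");
}

struct slot_sketch {
    int id = 0;

    void reset() {
        // consolidated: the detach now lives at the state transition,
        // so the old per-batch loop over all slots is no longer needed
        set_sampler(id, nullptr);
    }
};

int main() {
    slot_sketch s;
    s.id = 3;
    s.reset();
    return 0;
}
```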
@@ -2628,7 +2621,7 @@ private:
                 // note: it's complicated to keep track of how much of the current batch has been
                 //       processed before the error occurred, so we simply clear the entire context
-                slot.clear(false);
+                slot.prompt_clear(false);
             }
         }