server : disable similarity slot selection with --cache-idle-slots or --parallel 1

This commit is contained in:
kiwixz 2026-04-18 16:02:00 +02:00
parent 97895129e5
commit e0da25a612
2 changed files with 18 additions and 10 deletions

View File

@ -3166,7 +3166,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_PREFILL_ASSISTANT")); ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_PREFILL_ASSISTANT"));
add_opt(common_arg( add_opt(common_arg(
{"-sps", "--slot-prompt-similarity"}, "SIMILARITY", {"-sps", "--slot-prompt-similarity"}, "SIMILARITY",
string_format("how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity), string_format(
"how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n"
"disabled with --cache-idle-slots and --parallel 1\n",
params.slot_prompt_similarity
),
[](common_params & params, const std::string & value) { [](common_params & params, const std::string & value) {
params.slot_prompt_similarity = std::stof(value); params.slot_prompt_similarity = std::stof(value);
} }

View File

@ -686,9 +686,6 @@ private:
json json_webui_settings = json::object(); json json_webui_settings = json::object();
// Necessary similarity of prompt for slot selection
float slot_prompt_similarity = 0.0f;
std::string model_name; // name of the loaded model, to be used by API std::string model_name; // name of the loaded model, to be used by API
std::set<std::string> model_aliases; // additional names for the model std::set<std::string> model_aliases; // additional names for the model
std::set<std::string> model_tags; // informational tags std::set<std::string> model_tags; // informational tags
@ -853,9 +850,6 @@ private:
} }
} }
// Necessary similarity of prompt for slot selection
slot_prompt_similarity = params_base.slot_prompt_similarity;
// setup slots // setup slots
SRV_INF("initializing slots, n_slots = %d\n", params_base.n_parallel); SRV_INF("initializing slots, n_slots = %d\n", params_base.n_parallel);
@ -999,6 +993,16 @@ private:
} }
} }
// prompt similarity doesn't work with idle slots, rely on the unified cache instead
if (params_base.cache_idle_slots) {
params_base.slot_prompt_similarity = 0.0f;
}
// bypass prompt similarity when we only have one slot
if (params_base.n_parallel == 1) {
params_base.slot_prompt_similarity = 0.0f;
}
// populate webui settings // populate webui settings
{ {
if (!params_base.webui_config_json.empty()) { if (!params_base.webui_config_json.empty()) {
@ -1073,7 +1077,7 @@ private:
bool update_cache = false; bool update_cache = false;
// find the slot that has at least n% prompt similarity // find the slot that has at least n% prompt similarity
if (ret == nullptr && slot_prompt_similarity != 0.0f) { if (ret == nullptr && params_base.slot_prompt_similarity != 0.0f) {
float sim_best = 0; float sim_best = 0;
for (server_slot & slot : slots) { for (server_slot & slot : slots) {
@ -1093,7 +1097,7 @@ private:
const float sim_cur = float(tokens.get_common_prefix(task.tokens)) / task.tokens.size(); const float sim_cur = float(tokens.get_common_prefix(task.tokens)) / task.tokens.size();
// select the current slot if the criteria match // select the current slot if the criteria match
if (sim_cur > sim_best && sim_cur > slot_prompt_similarity) { if (sim_cur > sim_best && sim_cur > params_base.slot_prompt_similarity) {
sim_best = sim_cur; sim_best = sim_cur;
ret = &slot; ret = &slot;
@ -1104,7 +1108,7 @@ private:
const float f_keep = (sim_best*task.tokens.size()) / ret->prompt.tokens.size(); const float f_keep = (sim_best*task.tokens.size()) / ret->prompt.tokens.size();
SLT_INF(*ret, "selected slot by LCP similarity, sim_best = %.3f (> %.3f thold), f_keep = %.3f\n", SLT_INF(*ret, "selected slot by LCP similarity, sim_best = %.3f (> %.3f thold), f_keep = %.3f\n",
sim_best, slot_prompt_similarity, f_keep); sim_best, params_base.slot_prompt_similarity, f_keep);
// if we are about to lose a large portion of the existing context - save it in the prompt cache // if we are about to lose a large portion of the existing context - save it in the prompt cache
if (f_keep < 0.5f) { if (f_keep < 0.5f) {