server : disable similarity slot selection with either --cache-idle-slots or --parallel 1
This commit is contained in:
parent
97895129e5
commit
e0da25a612
|
|
@ -3166,7 +3166,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||||
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_PREFILL_ASSISTANT"));
|
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_PREFILL_ASSISTANT"));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"-sps", "--slot-prompt-similarity"}, "SIMILARITY",
|
{"-sps", "--slot-prompt-similarity"}, "SIMILARITY",
|
||||||
string_format("how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity),
|
string_format(
|
||||||
|
"how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n"
|
||||||
|
"disabled with --cache-idle-slots and --parallel 1\n",
|
||||||
|
params.slot_prompt_similarity
|
||||||
|
),
|
||||||
[](common_params & params, const std::string & value) {
|
[](common_params & params, const std::string & value) {
|
||||||
params.slot_prompt_similarity = std::stof(value);
|
params.slot_prompt_similarity = std::stof(value);
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -686,9 +686,6 @@ private:
|
||||||
|
|
||||||
json json_webui_settings = json::object();
|
json json_webui_settings = json::object();
|
||||||
|
|
||||||
// Necessary similarity of prompt for slot selection
|
|
||||||
float slot_prompt_similarity = 0.0f;
|
|
||||||
|
|
||||||
std::string model_name; // name of the loaded model, to be used by API
|
std::string model_name; // name of the loaded model, to be used by API
|
||||||
std::set<std::string> model_aliases; // additional names for the model
|
std::set<std::string> model_aliases; // additional names for the model
|
||||||
std::set<std::string> model_tags; // informational tags
|
std::set<std::string> model_tags; // informational tags
|
||||||
|
|
@ -853,9 +850,6 @@ private:
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Necessary similarity of prompt for slot selection
|
|
||||||
slot_prompt_similarity = params_base.slot_prompt_similarity;
|
|
||||||
|
|
||||||
// setup slots
|
// setup slots
|
||||||
SRV_INF("initializing slots, n_slots = %d\n", params_base.n_parallel);
|
SRV_INF("initializing slots, n_slots = %d\n", params_base.n_parallel);
|
||||||
|
|
||||||
|
|
@ -999,6 +993,16 @@ private:
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// prompt similarity doesn't work with idle slots, rely on the unified cache instead
|
||||||
|
if (params_base.cache_idle_slots) {
|
||||||
|
params_base.slot_prompt_similarity = 0.0f;
|
||||||
|
}
|
||||||
|
|
||||||
|
// bypass prompt similarity when we only have one slot
|
||||||
|
if (params_base.n_parallel == 1) {
|
||||||
|
params_base.slot_prompt_similarity = 0.0f;
|
||||||
|
}
|
||||||
|
|
||||||
// populate webui settings
|
// populate webui settings
|
||||||
{
|
{
|
||||||
if (!params_base.webui_config_json.empty()) {
|
if (!params_base.webui_config_json.empty()) {
|
||||||
|
|
@ -1073,7 +1077,7 @@ private:
|
||||||
bool update_cache = false;
|
bool update_cache = false;
|
||||||
|
|
||||||
// find the slot that has at least n% prompt similarity
|
// find the slot that has at least n% prompt similarity
|
||||||
if (ret == nullptr && slot_prompt_similarity != 0.0f) {
|
if (ret == nullptr && params_base.slot_prompt_similarity != 0.0f) {
|
||||||
float sim_best = 0;
|
float sim_best = 0;
|
||||||
|
|
||||||
for (server_slot & slot : slots) {
|
for (server_slot & slot : slots) {
|
||||||
|
|
@ -1093,7 +1097,7 @@ private:
|
||||||
const float sim_cur = float(tokens.get_common_prefix(task.tokens)) / task.tokens.size();
|
const float sim_cur = float(tokens.get_common_prefix(task.tokens)) / task.tokens.size();
|
||||||
|
|
||||||
// select the current slot if the criteria match
|
// select the current slot if the criteria match
|
||||||
if (sim_cur > sim_best && sim_cur > slot_prompt_similarity) {
|
if (sim_cur > sim_best && sim_cur > params_base.slot_prompt_similarity) {
|
||||||
sim_best = sim_cur;
|
sim_best = sim_cur;
|
||||||
|
|
||||||
ret = &slot;
|
ret = &slot;
|
||||||
|
|
@ -1104,7 +1108,7 @@ private:
|
||||||
const float f_keep = (sim_best*task.tokens.size()) / ret->prompt.tokens.size();
|
const float f_keep = (sim_best*task.tokens.size()) / ret->prompt.tokens.size();
|
||||||
|
|
||||||
SLT_INF(*ret, "selected slot by LCP similarity, sim_best = %.3f (> %.3f thold), f_keep = %.3f\n",
|
SLT_INF(*ret, "selected slot by LCP similarity, sim_best = %.3f (> %.3f thold), f_keep = %.3f\n",
|
||||||
sim_best, slot_prompt_similarity, f_keep);
|
sim_best, params_base.slot_prompt_similarity, f_keep);
|
||||||
|
|
||||||
// if we are about to lose a large portion of the existing context - save it in the prompt cache
|
// if we are about to lose a large portion of the existing context - save it in the prompt cache
|
||||||
if (f_keep < 0.5f) {
|
if (f_keep < 0.5f) {
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue