server : disable similarity slot selection with --cache-idle-slots and --parallel 1

2026-04-18 16:02:00 +02:00 · 2026-04-18 16:02:00 +02:00 · e0da25a612
parent 97895129e5
commit e0da25a612
2 changed files with 18 additions and 10 deletions
--- a/common/arg.cpp
+++ b/common/arg.cpp
@ -3166,7 +3166,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_PREFILL_ASSISTANT"));
    add_opt(common_arg(
        {"-sps", "--slot-prompt-similarity"}, "SIMILARITY",
-        string_format("how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity),
+        string_format(
            "how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n"
            "disabled with --cache-idle-slots and --parallel 1\n",
            params.slot_prompt_similarity
        ),
        [](common_params & params, const std::string & value) {
            params.slot_prompt_similarity = std::stof(value);
        }
--- a/tools/server/server-context.cpp
+++ b/tools/server/server-context.cpp
@ -686,9 +686,6 @@ private:
    json json_webui_settings = json::object();
    // Necessary similarity of prompt for slot selection
    float slot_prompt_similarity = 0.0f;
    std::string model_name; // name of the loaded model, to be used by API
    std::set<std::string> model_aliases; // additional names for the model
    std::set<std::string> model_tags;    // informational tags
@ -853,9 +850,6 @@ private:
            }
        }
        // Necessary similarity of prompt for slot selection
        slot_prompt_similarity = params_base.slot_prompt_similarity;
        // setup slots
        SRV_INF("initializing slots, n_slots = %d\n", params_base.n_parallel);
@ -999,6 +993,16 @@ private:
            }
        }
        // prompt similarity doesn't work with idle slots, rely on the unified cache instead
        if (params_base.cache_idle_slots) {
            params_base.slot_prompt_similarity = 0.0f;
        }
        // bypass prompt similarity when we only have one slot
        if (params_base.n_parallel == 1) {
            params_base.slot_prompt_similarity = 0.0f;
        }
        // populate webui settings
        {
            if (!params_base.webui_config_json.empty()) {
@ -1073,7 +1077,7 @@ private:
        bool update_cache = false;
        // find the slot that has at least n% prompt similarity
-        if (ret == nullptr && slot_prompt_similarity != 0.0f) {
+        if (ret == nullptr && params_base.slot_prompt_similarity != 0.0f) {
            float sim_best = 0;
            for (server_slot & slot : slots) {
@ -1093,7 +1097,7 @@ private:
                const float sim_cur = float(tokens.get_common_prefix(task.tokens)) / task.tokens.size();
                // select the current slot if the criteria match
-                if (sim_cur > sim_best && sim_cur > slot_prompt_similarity) {
+                if (sim_cur > sim_best && sim_cur > params_base.slot_prompt_similarity) {
                    sim_best = sim_cur;
                    ret = &slot;
@ -1104,7 +1108,7 @@ private:
                const float f_keep = (sim_best*task.tokens.size()) / ret->prompt.tokens.size();
                SLT_INF(*ret, "selected slot by LCP similarity, sim_best = %.3f (> %.3f thold), f_keep = %.3f\n",
-                        sim_best, slot_prompt_similarity, f_keep);
+                        sim_best, params_base.slot_prompt_similarity, f_keep);
                // if we are about to lose a large portion of the existing context - save it in the prompt cache
                if (f_keep < 0.5f) {