server : renamed spec checkpoints option

2026-02-24 22:41:21 +01:00 · 2026-02-24 22:41:21 +01:00 · bd2f7f2d7f
parent e994c4ec1f
commit bd2f7f2d7f
4 changed files with 16 additions and 13 deletions
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -3526,13 +3526,17 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER}));
    add_opt(common_arg(
-        {"--spec-ckpt-num-tries"}, "N",
+        {"--spec-use-checkpoints"}, "[on|off|auto]",
-        string_format("number of tries for speculative decoding with recurrent memory (default: %d)", params.speculative.ckpt_num_tries),
+        string_format("use checkpoints to rewind token history in recurrent models ('on', 'off', or 'auto', default: %s)",
-        [](common_params & params, int value) {
+                        params.speculative.use_checkpoints ? "on" : "off"),
-            if (value < 0 || value > 10) {
+        [](common_params & params, const std::string & value) {
-                throw std::invalid_argument("number of tries must be between 0 and 10 inclusive");
+            if (is_truthy(value) || is_autoy(value)) {
+                params.speculative.use_checkpoints = true;
+            } else if (is_falsey(value)) {
+                params.speculative.use_checkpoints = false;
+            } else {
+                throw std::invalid_argument("invalid value for --spec-use-checkpoints");
+            }
-            params.speculative.ckpt_num_tries = value;
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER}));
    add_opt(common_arg(
--- a/common/common.h
+++ b/common/common.h
@@ -324,7 +324,8 @@ struct common_params_speculative {
    uint16_t ngram_size_n     = 12; // ngram size for lookup
    uint16_t ngram_size_m     = 48; // mgram size for speculative tokens
    uint16_t ngram_min_hits   =  1; // minimum hits at ngram/mgram lookup for mgram to be proposed
-    uint16_t ckpt_num_tries   =  0; // number of tries in case of recurrent memory
+    bool     use_checkpoints  =  false; // use checkpoints to rewind in token history of recurrent models
    std::shared_ptr<common_ngram_mod> ngram_mod;
--- a/common/speculative.cpp
+++ b/common/speculative.cpp
@@ -1136,8 +1136,8 @@ struct common_speculative_session::impl {
            clear_draft();
            return draft;
        }
-        if (params_spec.ckpt_num_tries > 0
+        if (params_spec.use_checkpoints
-                && spec_ckpt_n_denials >= params_spec.ckpt_num_tries) {
+                && spec_ckpt_n_denials > 0) {
            clear_draft();
            return draft;
        }
@@ -1166,7 +1166,7 @@ struct common_speculative_session::impl {
            draft.resize(n_draft_max);
        }
-        bool do_checkpoint = !draft.empty() && params_spec.ckpt_num_tries > 0;
+        bool do_checkpoint = !draft.empty() && params_spec.use_checkpoints;
        if (do_checkpoint && cached_text_tokens.size() > 5) {
            LOG_DBG("draft.size = %zu, n_spec_denials = %d, do_checkpoint = %s, tokens=[..., %d, %d, %d]\n",
                draft.size(), spec_ckpt_n_denials,
@@ -1235,8 +1235,6 @@ struct common_speculative_session::impl {
                    return common_speculative_accept_response(std::move(ids), n_draft, true);
                }
                //spec_ckpt_n_accepted = (spec_ckpt_n_denials < params_spec.ckpt_num_tries) ? (int) (ids.size() - 1) : 0;
                callback.batch_clear();
            }
        }
--- a/tools/server/server-context.cpp
+++ b/tools/server/server-context.cpp
@@ -849,7 +849,7 @@ private:
            slot.prompt.tokens.has_mtmd = mctx != nullptr;
            // try speculative decoding
-            if (can_spec || params_base.speculative.ckpt_num_tries > 0) {
+            if (can_spec || params_base.speculative.use_checkpoints) {
                if (mctx) {
                    SRV_ERR("%s\n", "speculative decoding is not supported with multimodal");
                    return false;