From bd2f7f2d7fe19eef9f751f39488b1b77b5f20b9e Mon Sep 17 00:00:00 2001 From: Sascha Rogmann Date: Tue, 24 Feb 2026 22:41:21 +0100 Subject: [PATCH] server : renamed spec checkpoints option --- common/arg.cpp | 16 ++++++++++------ common/common.h | 3 ++- common/speculative.cpp | 8 +++----- tools/server/server-context.cpp | 2 +- 4 files changed, 16 insertions(+), 13 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index dc5b0ae11e..d8604b707f 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -3526,13 +3526,17 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } ).set_examples({LLAMA_EXAMPLE_SERVER})); add_opt(common_arg( - {"--spec-ckpt-num-tries"}, "N", - string_format("number of tries for speculative decoding with recurrent memory (default: %d)", params.speculative.ckpt_num_tries), - [](common_params & params, int value) { - if (value < 0 || value > 10) { - throw std::invalid_argument("number of tries must be between 0 and 10 inclusive"); + {"--spec-use-checkpoints"}, "[on|off|auto]", + string_format("use checkpoints to rewind token history in recurrent models ('on', 'off', or 'auto', default: %s)", + params.speculative.use_checkpoints ? "on" : "off"), + [](common_params & params, const std::string & value) { + if (is_truthy(value) || is_autoy(value)) { + params.speculative.use_checkpoints = true; + } else if (is_falsey(value)) { + params.speculative.use_checkpoints = false; + } else { + throw std::invalid_argument("invalid value for --spec-use-checkpoints"); } - params.speculative.ckpt_num_tries = value; } ).set_examples({LLAMA_EXAMPLE_SERVER})); add_opt(common_arg( diff --git a/common/common.h b/common/common.h index f8775032b3..eb451748a6 100644 --- a/common/common.h +++ b/common/common.h @@ -324,7 +324,8 @@ struct common_params_speculative { uint16_t ngram_size_n = 12; // ngram size for lookup uint16_t ngram_size_m = 48; // mgram size for speculative tokens uint16_t ngram_min_hits = 1; // minimum hits at ngram/mgram lookup for mgram to be proposed - uint16_t ckpt_num_tries = 0; // number of tries in case of recurrent memory + bool use_checkpoints = false; // use checkpoints to rewind in token history of recurrent models + std::shared_ptr ngram_mod; diff --git a/common/speculative.cpp b/common/speculative.cpp index 356e3bc7a9..581c18419d 100644 --- a/common/speculative.cpp +++ b/common/speculative.cpp @@ -1136,8 +1136,8 @@ struct common_speculative_session::impl { clear_draft(); return draft; } - if (params_spec.ckpt_num_tries > 0 - && spec_ckpt_n_denials >= params_spec.ckpt_num_tries) { + if (params_spec.use_checkpoints + && spec_ckpt_n_denials > 0) { clear_draft(); return draft; } @@ -1166,7 +1166,7 @@ struct common_speculative_session::impl { draft.resize(n_draft_max); } - bool do_checkpoint = !draft.empty() && params_spec.ckpt_num_tries > 0; + bool do_checkpoint = !draft.empty() && params_spec.use_checkpoints; if (do_checkpoint && cached_text_tokens.size() > 5) { LOG_DBG("draft.size = %zu, n_spec_denials = %d, do_checkpoint = %s, tokens=[..., %d, %d, %d]\n", draft.size(), spec_ckpt_n_denials, @@ -1235,8 +1235,6 @@ struct common_speculative_session::impl { return common_speculative_accept_response(std::move(ids), n_draft, true); } - //spec_ckpt_n_accepted = (spec_ckpt_n_denials < params_spec.ckpt_num_tries) ? (int) (ids.size() - 1) : 0; - callback.batch_clear(); } } diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index b045b51139..8343975b19 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -849,7 +849,7 @@ private: slot.prompt.tokens.has_mtmd = mctx != nullptr; // try speculative decoding - if (can_spec || params_base.speculative.ckpt_num_tries > 0) { + if (can_spec || params_base.speculative.use_checkpoints) { if (mctx) { SRV_ERR("%s\n", "speculative decoding is not supported with multimodal"); return false;