server : rename spec checkpoints option (--spec-ckpt-num-tries N -> --spec-use-checkpoints [on|off|auto])

This commit is contained in:
Sascha Rogmann 2026-02-24 22:41:21 +01:00
parent e994c4ec1f
commit bd2f7f2d7f
4 changed files with 16 additions and 13 deletions

View File

@ -3526,13 +3526,17 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
} }
).set_examples({LLAMA_EXAMPLE_SERVER})); ).set_examples({LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg( add_opt(common_arg(
{"--spec-ckpt-num-tries"}, "N", {"--spec-use-checkpoints"}, "[on|off|auto]",
string_format("number of tries for speculative decoding with recurrent memory (default: %d)", params.speculative.ckpt_num_tries), string_format("use checkpoints to rewind token history in recurrent models ('on', 'off', or 'auto', default: %s)",
[](common_params & params, int value) { params.speculative.use_checkpoints ? "on" : "off"),
if (value < 0 || value > 10) { [](common_params & params, const std::string & value) {
throw std::invalid_argument("number of tries must be between 0 and 10 inclusive"); if (is_truthy(value) || is_autoy(value)) {
params.speculative.use_checkpoints = true;
} else if (is_falsey(value)) {
params.speculative.use_checkpoints = false;
} else {
throw std::invalid_argument("invalid value for --spec-use-checkpoints");
} }
params.speculative.ckpt_num_tries = value;
} }
).set_examples({LLAMA_EXAMPLE_SERVER})); ).set_examples({LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg( add_opt(common_arg(

View File

@ -324,7 +324,8 @@ struct common_params_speculative {
uint16_t ngram_size_n = 12; // ngram size for lookup uint16_t ngram_size_n = 12; // ngram size for lookup
uint16_t ngram_size_m = 48; // mgram size for speculative tokens uint16_t ngram_size_m = 48; // mgram size for speculative tokens
uint16_t ngram_min_hits = 1; // minimum hits at ngram/mgram lookup for mgram to be proposed uint16_t ngram_min_hits = 1; // minimum hits at ngram/mgram lookup for mgram to be proposed
uint16_t ckpt_num_tries = 0; // number of tries in case of recurrent memory bool use_checkpoints = false; // use checkpoints to rewind in token history of recurrent models
std::shared_ptr<common_ngram_mod> ngram_mod; std::shared_ptr<common_ngram_mod> ngram_mod;

View File

@ -1136,8 +1136,8 @@ struct common_speculative_session::impl {
clear_draft(); clear_draft();
return draft; return draft;
} }
if (params_spec.ckpt_num_tries > 0 if (params_spec.use_checkpoints
&& spec_ckpt_n_denials >= params_spec.ckpt_num_tries) { && spec_ckpt_n_denials > 0) {
clear_draft(); clear_draft();
return draft; return draft;
} }
@ -1166,7 +1166,7 @@ struct common_speculative_session::impl {
draft.resize(n_draft_max); draft.resize(n_draft_max);
} }
bool do_checkpoint = !draft.empty() && params_spec.ckpt_num_tries > 0; bool do_checkpoint = !draft.empty() && params_spec.use_checkpoints;
if (do_checkpoint && cached_text_tokens.size() > 5) { if (do_checkpoint && cached_text_tokens.size() > 5) {
LOG_DBG("draft.size = %zu, n_spec_denials = %d, do_checkpoint = %s, tokens=[..., %d, %d, %d]\n", LOG_DBG("draft.size = %zu, n_spec_denials = %d, do_checkpoint = %s, tokens=[..., %d, %d, %d]\n",
draft.size(), spec_ckpt_n_denials, draft.size(), spec_ckpt_n_denials,
@ -1235,8 +1235,6 @@ struct common_speculative_session::impl {
return common_speculative_accept_response(std::move(ids), n_draft, true); return common_speculative_accept_response(std::move(ids), n_draft, true);
} }
//spec_ckpt_n_accepted = (spec_ckpt_n_denials < params_spec.ckpt_num_tries) ? (int) (ids.size() - 1) : 0;
callback.batch_clear(); callback.batch_clear();
} }
} }

View File

@ -849,7 +849,7 @@ private:
slot.prompt.tokens.has_mtmd = mctx != nullptr; slot.prompt.tokens.has_mtmd = mctx != nullptr;
// try speculative decoding // try speculative decoding
if (can_spec || params_base.speculative.ckpt_num_tries > 0) { if (can_spec || params_base.speculative.use_checkpoints) {
if (mctx) { if (mctx) {
SRV_ERR("%s\n", "speculative decoding is not supported with multimodal"); SRV_ERR("%s\n", "speculative decoding is not supported with multimodal");
return false; return false;