server : renamed spec checkpoints option

This commit is contained in:
Sascha Rogmann 2026-02-24 22:41:21 +01:00
parent e994c4ec1f
commit bd2f7f2d7f
4 changed files with 16 additions and 13 deletions

View File

@ -3526,13 +3526,17 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
).set_examples({LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg(
{"--spec-ckpt-num-tries"}, "N",
string_format("number of tries for speculative decoding with recurrent memory (default: %d)", params.speculative.ckpt_num_tries),
[](common_params & params, int value) {
if (value < 0 || value > 10) {
throw std::invalid_argument("number of tries must be between 0 and 10 inclusive");
{"--spec-use-checkpoints"}, "[on|off|auto]",
string_format("use checkpoints to rewind token history in recurrent models ('on', 'off', or 'auto', default: %s)",
params.speculative.use_checkpoints ? "on" : "off"),
[](common_params & params, const std::string & value) {
if (is_truthy(value) || is_autoy(value)) {
params.speculative.use_checkpoints = true;
} else if (is_falsey(value)) {
params.speculative.use_checkpoints = false;
} else {
throw std::invalid_argument("invalid value for --spec-use-checkpoints");
}
params.speculative.ckpt_num_tries = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg(

View File

@ -324,7 +324,8 @@ struct common_params_speculative {
uint16_t ngram_size_n = 12; // ngram size for lookup
uint16_t ngram_size_m = 48; // mgram size for speculative tokens
uint16_t ngram_min_hits = 1; // minimum hits at ngram/mgram lookup for mgram to be proposed
uint16_t ckpt_num_tries = 0; // number of tries in case of recurrent memory
bool use_checkpoints = false; // use checkpoints to rewind in token history of recurrent models
std::shared_ptr<common_ngram_mod> ngram_mod;

View File

@ -1136,8 +1136,8 @@ struct common_speculative_session::impl {
clear_draft();
return draft;
}
if (params_spec.ckpt_num_tries > 0
&& spec_ckpt_n_denials >= params_spec.ckpt_num_tries) {
if (params_spec.use_checkpoints
&& spec_ckpt_n_denials > 0) {
clear_draft();
return draft;
}
@ -1166,7 +1166,7 @@ struct common_speculative_session::impl {
draft.resize(n_draft_max);
}
bool do_checkpoint = !draft.empty() && params_spec.ckpt_num_tries > 0;
bool do_checkpoint = !draft.empty() && params_spec.use_checkpoints;
if (do_checkpoint && cached_text_tokens.size() > 5) {
LOG_DBG("draft.size = %zu, n_spec_denials = %d, do_checkpoint = %s, tokens=[..., %d, %d, %d]\n",
draft.size(), spec_ckpt_n_denials,
@ -1235,8 +1235,6 @@ struct common_speculative_session::impl {
return common_speculative_accept_response(std::move(ids), n_draft, true);
}
//spec_ckpt_n_accepted = (spec_ckpt_n_denials < params_spec.ckpt_num_tries) ? (int) (ids.size() - 1) : 0;
callback.batch_clear();
}
}

View File

@ -849,7 +849,7 @@ private:
slot.prompt.tokens.has_mtmd = mctx != nullptr;
// try speculative decoding
if (can_spec || params_base.speculative.ckpt_num_tries > 0) {
if (can_spec || params_base.speculative.use_checkpoints) {
if (mctx) {
SRV_ERR("%s\n", "speculative decoding is not supported with multimodal");
return false;