server : renamed spec checkpoints option
This commit is contained in:
parent
e994c4ec1f
commit
bd2f7f2d7f
|
|
@ -3526,13 +3526,17 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER}));
|
||||
add_opt(common_arg(
|
||||
{"--spec-ckpt-num-tries"}, "N",
|
||||
string_format("number of tries for speculative decoding with recurrent memory (default: %d)", params.speculative.ckpt_num_tries),
|
||||
[](common_params & params, int value) {
|
||||
if (value < 0 || value > 10) {
|
||||
throw std::invalid_argument("number of tries must be between 0 and 10 inclusive");
|
||||
{"--spec-use-checkpoints"}, "[on|off|auto]",
|
||||
string_format("use checkpoints to rewind token history in recurrent models ('on', 'off', or 'auto', default: %s)",
|
||||
params.speculative.use_checkpoints ? "on" : "off"),
|
||||
[](common_params & params, const std::string & value) {
|
||||
if (is_truthy(value) || is_autoy(value)) {
|
||||
params.speculative.use_checkpoints = true;
|
||||
} else if (is_falsey(value)) {
|
||||
params.speculative.use_checkpoints = false;
|
||||
} else {
|
||||
throw std::invalid_argument("invalid value for --spec-use-checkpoints");
|
||||
}
|
||||
params.speculative.ckpt_num_tries = value;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER}));
|
||||
add_opt(common_arg(
|
||||
|
|
|
|||
|
|
@ -324,7 +324,8 @@ struct common_params_speculative {
|
|||
uint16_t ngram_size_n = 12; // ngram size for lookup
|
||||
uint16_t ngram_size_m = 48; // mgram size for speculative tokens
|
||||
uint16_t ngram_min_hits = 1; // minimum hits at ngram/mgram lookup for mgram to be proposed
|
||||
uint16_t ckpt_num_tries = 0; // number of tries in case of recurrent memory
|
||||
bool use_checkpoints = false; // use checkpoints to rewind the token history of recurrent models
|
||||
|
||||
|
||||
std::shared_ptr<common_ngram_mod> ngram_mod;
|
||||
|
||||
|
|
|
|||
|
|
@ -1136,8 +1136,8 @@ struct common_speculative_session::impl {
|
|||
clear_draft();
|
||||
return draft;
|
||||
}
|
||||
if (params_spec.ckpt_num_tries > 0
|
||||
&& spec_ckpt_n_denials >= params_spec.ckpt_num_tries) {
|
||||
if (params_spec.use_checkpoints
|
||||
&& spec_ckpt_n_denials > 0) {
|
||||
clear_draft();
|
||||
return draft;
|
||||
}
|
||||
|
|
@ -1166,7 +1166,7 @@ struct common_speculative_session::impl {
|
|||
draft.resize(n_draft_max);
|
||||
}
|
||||
|
||||
bool do_checkpoint = !draft.empty() && params_spec.ckpt_num_tries > 0;
|
||||
bool do_checkpoint = !draft.empty() && params_spec.use_checkpoints;
|
||||
if (do_checkpoint && cached_text_tokens.size() > 5) {
|
||||
LOG_DBG("draft.size = %zu, n_spec_denials = %d, do_checkpoint = %s, tokens=[..., %d, %d, %d]\n",
|
||||
draft.size(), spec_ckpt_n_denials,
|
||||
|
|
@ -1235,8 +1235,6 @@ struct common_speculative_session::impl {
|
|||
return common_speculative_accept_response(std::move(ids), n_draft, true);
|
||||
}
|
||||
|
||||
//spec_ckpt_n_accepted = (spec_ckpt_n_denials < params_spec.ckpt_num_tries) ? (int) (ids.size() - 1) : 0;
|
||||
|
||||
callback.batch_clear();
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -849,7 +849,7 @@ private:
|
|||
slot.prompt.tokens.has_mtmd = mctx != nullptr;
|
||||
|
||||
// try speculative decoding
|
||||
if (can_spec || params_base.speculative.ckpt_num_tries > 0) {
|
||||
if (can_spec || params_base.speculative.use_checkpoints) {
|
||||
if (mctx) {
|
||||
SRV_ERR("%s\n", "speculative decoding is not supported with multimodal");
|
||||
return false;
|
||||
|
|
|
|||
Loading…
Reference in New Issue