cont : avoid --spec-use-checkpoints argument
This commit is contained in:
parent
63c66f1512
commit
8491e15405
|
|
@ -3538,20 +3538,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||||
params.speculative.ngram_min_hits = value;
|
params.speculative.ngram_min_hits = value;
|
||||||
}
|
}
|
||||||
).set_examples({LLAMA_EXAMPLE_SERVER}));
|
).set_examples({LLAMA_EXAMPLE_SERVER}));
|
||||||
add_opt(common_arg(
|
|
||||||
{"--spec-use-checkpoints"}, "[on|off|auto]",
|
|
||||||
string_format("use checkpoints to rewind token history in recurrent models ('on', 'off', or 'auto', default: %s)",
|
|
||||||
params.speculative.use_checkpoints ? "on" : "off"),
|
|
||||||
[](common_params & params, const std::string & value) {
|
|
||||||
if (is_truthy(value) || is_autoy(value)) {
|
|
||||||
params.speculative.use_checkpoints = true;
|
|
||||||
} else if (is_falsey(value)) {
|
|
||||||
params.speculative.use_checkpoints = false;
|
|
||||||
} else {
|
|
||||||
throw std::invalid_argument("invalid value for --spec-use-checkpoints");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
).set_examples({LLAMA_EXAMPLE_SERVER}));
|
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"-ctkd", "--cache-type-k-draft"}, "TYPE",
|
{"-ctkd", "--cache-type-k-draft"}, "TYPE",
|
||||||
string_format(
|
string_format(
|
||||||
|
|
|
||||||
|
|
@ -326,7 +326,6 @@ struct common_params_speculative {
|
||||||
uint16_t ngram_min_hits = 1; // minimum hits at ngram/mgram lookup for mgram to be proposed
|
uint16_t ngram_min_hits = 1; // minimum hits at ngram/mgram lookup for mgram to be proposed
|
||||||
bool use_checkpoints = false; // use checkpoints to rewind in token history of recurrent models
|
bool use_checkpoints = false; // use checkpoints to rewind in token history of recurrent models
|
||||||
|
|
||||||
|
|
||||||
std::shared_ptr<common_ngram_mod> ngram_mod;
|
std::shared_ptr<common_ngram_mod> ngram_mod;
|
||||||
|
|
||||||
std::string lookup_cache_static; // path of static ngram cache file for lookup decoding // NOLINT
|
std::string lookup_cache_static; // path of static ngram cache file for lookup decoding // NOLINT
|
||||||
|
|
|
||||||
|
|
@ -910,13 +910,13 @@ enum common_speculative_type common_speculative_type_from_name(const std::string
|
||||||
return it->second;
|
return it->second;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool common_speculative_is_compat(llama_context * ctx_tgt) {
|
common_speculative_compat_type common_speculative_is_compat(llama_context * ctx_tgt) {
|
||||||
auto * mem = llama_get_memory(ctx_tgt);
|
auto * mem = llama_get_memory(ctx_tgt);
|
||||||
if (mem == nullptr) {
|
if (mem == nullptr) {
|
||||||
return false;
|
return COMMON_SPECULATIVE_COMPAT_TYPE_NO;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool res = true;
|
common_speculative_compat_type res = COMMON_SPECULATIVE_COMPAT_TYPE_FULL;
|
||||||
|
|
||||||
llama_memory_clear(mem, true);
|
llama_memory_clear(mem, true);
|
||||||
|
|
||||||
|
|
@ -928,14 +928,14 @@ bool common_speculative_is_compat(llama_context * ctx_tgt) {
|
||||||
int ret = llama_decode(ctx_tgt, llama_batch_get_one(tmp.data(), tmp.size()));
|
int ret = llama_decode(ctx_tgt, llama_batch_get_one(tmp.data(), tmp.size()));
|
||||||
if (ret != 0) {
|
if (ret != 0) {
|
||||||
LOG_ERR("%s: llama_decode() failed: %d\n", __func__, ret);
|
LOG_ERR("%s: llama_decode() failed: %d\n", __func__, ret);
|
||||||
res = false;
|
res = COMMON_SPECULATIVE_COMPAT_TYPE_NO;
|
||||||
goto done;
|
goto done;
|
||||||
}
|
}
|
||||||
|
|
||||||
// try to remove the last tokens
|
// try to remove the last tokens
|
||||||
if (!llama_memory_seq_rm(mem, 0, 1, -1)) {
|
if (!llama_memory_seq_rm(mem, 0, 1, -1)) {
|
||||||
LOG_WRN("%s: the target context does not support partial sequence removal\n", __func__);
|
LOG_WRN("%s: the target context does not support partial sequence removal\n", __func__);
|
||||||
res = false;
|
res = COMMON_SPECULATIVE_COMPAT_TYPE_CKPT;
|
||||||
goto done;
|
goto done;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -1262,7 +1262,6 @@ struct common_speculative_session::impl {
|
||||||
if (draft.empty()) {
|
if (draft.empty()) {
|
||||||
// switch to non-draft inference
|
// switch to non-draft inference
|
||||||
LOG_DBG("%s: draft of length 0 after denied checkpoint\n", __func__);
|
LOG_DBG("%s: draft of length 0 after denied checkpoint\n", __func__);
|
||||||
clear_draft();
|
|
||||||
return draft;
|
return draft;
|
||||||
}
|
}
|
||||||
// we use the shortened draft of previous speculation
|
// we use the shortened draft of previous speculation
|
||||||
|
|
|
||||||
|
|
@ -23,9 +23,15 @@ enum common_speculative_type common_speculative_type_from_name(const std::string
|
||||||
// convert type to string
|
// convert type to string
|
||||||
std::string common_speculative_type_to_str(enum common_speculative_type type);
|
std::string common_speculative_type_to_str(enum common_speculative_type type);
|
||||||
|
|
||||||
|
enum common_speculative_compat_type {
|
||||||
|
COMMON_SPECULATIVE_COMPAT_TYPE_NO = 0,
|
||||||
|
COMMON_SPECULATIVE_COMPAT_TYPE_FULL = 1,
|
||||||
|
COMMON_SPECULATIVE_COMPAT_TYPE_CKPT = 2,
|
||||||
|
};
|
||||||
|
|
||||||
// check if the llama_context is compatible for speculative decoding
|
// check if the llama_context is compatible for speculative decoding
|
||||||
// note: clears the memory of the context
|
// note: clears the memory of the context
|
||||||
bool common_speculative_is_compat(llama_context * ctx_tgt);
|
common_speculative_compat_type common_speculative_is_compat(llama_context * ctx_tgt);
|
||||||
|
|
||||||
common_speculative * common_speculative_init(
|
common_speculative * common_speculative_init(
|
||||||
common_params_speculative & params,
|
common_params_speculative & params,
|
||||||
|
|
|
||||||
|
|
@ -860,9 +860,14 @@ private:
|
||||||
|
|
||||||
slots.clear();
|
slots.clear();
|
||||||
|
|
||||||
const bool can_spec = common_speculative_is_compat(ctx);
|
const auto spec_type = common_speculative_is_compat(ctx);
|
||||||
if (!can_spec) {
|
if (spec_type == COMMON_SPECULATIVE_COMPAT_TYPE_NO) {
|
||||||
SRV_WRN("%s", "speculative decoding not supported by this context without checkpoints\n");
|
SRV_WRN("%s", "speculative decoding not supported by this context\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
if (spec_type == COMMON_SPECULATIVE_COMPAT_TYPE_CKPT) {
|
||||||
|
SRV_WRN("%s", "speculative decoding will use checkpoints\n");
|
||||||
|
params_base.speculative.use_checkpoints = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
// initialize slots
|
// initialize slots
|
||||||
|
|
@ -881,7 +886,7 @@ private:
|
||||||
slot.prompt.tokens.has_mtmd = mctx != nullptr;
|
slot.prompt.tokens.has_mtmd = mctx != nullptr;
|
||||||
|
|
||||||
// try speculative decoding
|
// try speculative decoding
|
||||||
if (can_spec || params_base.speculative.use_checkpoints) {
|
if (spec_type != COMMON_SPECULATIVE_COMPAT_TYPE_NO) {
|
||||||
if (mctx) {
|
if (mctx) {
|
||||||
SRV_ERR("%s\n", "speculative decoding is not supported with multimodal");
|
SRV_ERR("%s\n", "speculative decoding is not supported with multimodal");
|
||||||
return false;
|
return false;
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue