From 32d72eee29911bae59686da040341d31691b6143 Mon Sep 17 00:00:00 2001 From: Sascha Rogmann Date: Thu, 5 Feb 2026 22:27:52 +0100 Subject: [PATCH] spec: remove parameter spec-ngram-check-rate --- common/arg.cpp | 10 ---------- common/common.h | 1 - common/ngram-map.cpp | 7 +++---- common/ngram-map.h | 8 +++----- common/speculative.cpp | 14 ++------------ docs/speculative.md | 6 ------ tools/server/server-task.cpp | 4 ---- 7 files changed, 8 insertions(+), 42 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 5fbc9022c0..9c85696ebd 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -3437,16 +3437,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.speculative.ngram_size_m = value; } ).set_examples({LLAMA_EXAMPLE_SERVER})); - add_opt(common_arg( - {"--spec-ngram-check-rate"}, "N", - string_format("ngram check rate for ngram-simple/ngram-map speculative decoding (default: %d)", params.speculative.ngram_check_rate), - [](common_params & params, int value) { - if (value < 1) { - throw std::invalid_argument("ngram check rate must be at least 1"); - } - params.speculative.ngram_check_rate = value; - } - ).set_examples({LLAMA_EXAMPLE_SERVER})); add_opt(common_arg( {"--spec-ngram-min-hits"}, "N", string_format("minimum hits for ngram-map speculative decoding (default: %d)", params.speculative.ngram_min_hits), diff --git a/common/common.h b/common/common.h index 398ebb0960..b284244530 100644 --- a/common/common.h +++ b/common/common.h @@ -269,7 +269,6 @@ struct common_params_speculative { uint16_t ngram_size_n = 12; // ngram size for lookup uint16_t ngram_size_m = 48; // mgram size for speculative tokens - uint16_t ngram_check_rate = 1; // check rate for ngram lookup uint16_t ngram_min_hits = 1; // minimum hits at ngram/mgram lookup for mgram to be proposed std::shared_ptr ngram_mod; diff --git a/common/ngram-map.cpp b/common/ngram-map.cpp index c5b8fc75ed..2b876a6e99 100644 --- a/common/ngram-map.cpp +++ 
b/common/ngram-map.cpp @@ -231,10 +231,9 @@ void common_ngram_map_draft(common_ngram_map & map, GGML_ABORT("%s: cur_len exceeds UINT32_MAX: %zu", __func__, cur_len); } - // Only check every check_rate tokens to save compute - // i.e., perform check if (cur_len - idx_last_check) >= check_rate - if (map.idx_last_check + map.check_rate > cur_len) { - return; + if (map.idx_last_check > cur_len) { + // Should not happen because of common_ngram_map_begin(). + GGML_ABORT("%s: map.idx_last_check > cur_len: %zu > %zu", __func__, map.idx_last_check, cur_len); } map.idx_last_check = cur_len; diff --git a/common/ngram-map.h b/common/ngram-map.h index 9668bd5a7c..6a5be56a33 100644 --- a/common/ngram-map.h +++ b/common/ngram-map.h @@ -24,7 +24,6 @@ struct common_ngram_simple_config { uint16_t size_ngram; // size of n-grams to lookup in self-mode uint16_t size_mgram; // size of m-grams to draft in self-mode - uint16_t check_rate; // check for speculative decoding without draft model for each check_rate token }; // Searches for a n-gram in the history and checks whether a draft sequence should be generated. @@ -66,15 +65,14 @@ struct common_ngram_map { bool key_only; // true if only key n-grams are used, no values. std::vector keys; // key n-grams which occur several times in token-history - uint16_t check_rate; // check for speculative decoding without draft model for each check_rate token uint16_t min_hits; // minimum number of key hits to consider a draft - bool show_key_map_stats = false; // true, if statitics of the key_map should be printed. + bool show_key_map_stats = false; // true, if statistics of the key_map should be printed. 
common_ngram_map(uint16_t sz_key, uint16_t sz_value, bool only_keys, - uint16_t check_rate, uint16_t min_hits) + uint16_t min_hits) : size_key(sz_key), size_value(sz_value), key_only(only_keys), - check_rate(check_rate), min_hits(min_hits) { + min_hits(min_hits) { key_map.resize(COMMON_NGRAM_HASH_MAP_SIZE); // 2^18 hash entries, 0 entries if key_map shouldn't be used } diff --git a/common/speculative.cpp b/common/speculative.cpp index c99b19dbfd..67dd621f8c 100644 --- a/common/speculative.cpp +++ b/common/speculative.cpp @@ -465,8 +465,6 @@ struct common_speculative_state_eagle3 : public common_speculative_state { struct common_speculative_state_ngram_simple : public common_speculative_state { common_ngram_simple_config config; - uint16_t check_id = 0; // used to control the frequency of generating drafts - common_speculative_state_ngram_simple( enum common_speculative_type type, common_ngram_simple_config config) @@ -481,11 +479,6 @@ struct common_speculative_state_ngram_simple : public common_speculative_state { const llama_tokens & prompt_tgt, llama_token id_last, llama_tokens & result) override { - ++check_id; - if (check_id < config.check_rate) { - return; - } - check_id = 0; result = common_ngram_simple_draft(config, prompt_tgt, id_last); GGML_UNUSED(params); @@ -752,10 +745,9 @@ static common_ngram_map get_common_ngram_map(const common_speculative_config & c uint16_t size_key = config.params.ngram_size_n; uint16_t size_value = config.params.ngram_size_m; bool key_only = (config.type == COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K); - uint16_t check_rate = config.params.ngram_check_rate; uint16_t min_hits = config.params.ngram_min_hits; - return common_ngram_map(size_key, size_value, key_only, check_rate, min_hits); + return common_ngram_map(size_key, size_value, key_only, min_hits); } static common_speculative_state_ngram_cache create_state_ngram_cache( @@ -895,12 +887,10 @@ common_speculative * common_speculative_init( uint16_t ngram_size_key = ngram_map.size_key; 
uint16_t mgram_size_value = ngram_map.size_value; - uint16_t check_rate = ngram_map.check_rate; auto config_simple = common_ngram_simple_config { /* .size_ngram = */ ngram_size_key, - /* .size_mgram = */ mgram_size_value, - /* .check_rate = */ check_rate + /* .size_mgram = */ mgram_size_value }; auto state = std::make_unique( /* .type = */ config.type, diff --git a/docs/speculative.md b/docs/speculative.md index 03afab5b41..31856c157a 100644 --- a/docs/speculative.md +++ b/docs/speculative.md @@ -119,8 +119,6 @@ If a draft model is combined with a draftless decoding the draftless decoding ha of lookup n-gram (default: 12) --spec-ngram-size-m N ngram size M for ngram-simple/ngram-map speculative decoding, length of draft m-gram (default: 48) ---spec-ngram-check-rate N ngram check rate for ngram-simple/ngram-map speculative decoding - (default: 1) --spec-ngram-min-hits N minimum hits for ngram-map speculative decoding (default: 1) ``` @@ -153,10 +151,6 @@ Sets the size M of the draft m-gram for n-gram map based speculative decoding. The m-gram size determines how many tokens to draft when a match is found. Larger values can provide more speedup but may reduce acceptance rate. -### `--spec-ngram-check-rate R` - -This option aims at performance if the n-gram lookup in history is to costly. A lookup will be executed at every R tokens (default is 1, every token). - ### `--spec-ngram-min-hits H` This option defines how often a key has to appear in the token history to be used as a draft (default is 1). 
diff --git a/tools/server/server-task.cpp b/tools/server/server-task.cpp index 2d25db63b7..a137427c69 100644 --- a/tools/server/server-task.cpp +++ b/tools/server/server-task.cpp @@ -80,7 +80,6 @@ json task_params::to_json(bool only_metrics) const { {"speculative.type", common_speculative_type_to_str(speculative.type)}, {"speculative.ngram_size_n", speculative.ngram_size_n}, {"speculative.ngram_size_m", speculative.ngram_size_m}, - {"speculative.ngram_c_rate", speculative.ngram_check_rate}, {"speculative.ngram_m_hits", speculative.ngram_min_hits}, {"timings_per_token", timings_per_token}, {"post_sampling_probs", post_sampling_probs}, @@ -144,7 +143,6 @@ json task_params::to_json(bool only_metrics) const { {"speculative.type", common_speculative_type_to_str(speculative.type)}, {"speculative.ngram_size_n", speculative.ngram_size_n}, {"speculative.ngram_size_m", speculative.ngram_size_m}, - {"speculative.ngram_c_rate", speculative.ngram_check_rate}, {"speculative.ngram_m_hits", speculative.ngram_min_hits}, {"timings_per_token", timings_per_token}, {"post_sampling_probs", post_sampling_probs}, @@ -257,12 +255,10 @@ task_params server_task::params_from_json_cmpl( params.speculative.ngram_size_n = json_value(data, "speculative.ngram_size_n", defaults.speculative.ngram_size_n); params.speculative.ngram_size_m = json_value(data, "speculative.ngram_size_m", defaults.speculative.ngram_size_m); - params.speculative.ngram_check_rate = json_value(data, "speculative.ngram_c_rate", defaults.speculative.ngram_check_rate); params.speculative.ngram_min_hits = json_value(data, "speculative.ngram_m_hits", defaults.speculative.ngram_min_hits); params.speculative.ngram_size_n = std::max(std::min(1, (int) params.speculative.ngram_size_n), 1024); params.speculative.ngram_size_m = std::max(std::min(1, (int) params.speculative.ngram_size_m), 1024); - params.speculative.ngram_check_rate = std::max(std::min(1, (int) params.speculative.ngram_check_rate), 1024); 
params.speculative.ngram_min_hits = std::max(std::min(1, (int) params.speculative.ngram_min_hits), 1024); // Use OpenAI API logprobs only if n_probs wasn't provided