From 32d72eee29911bae59686da040341d31691b6143 Mon Sep 17 00:00:00 2001 From: Sascha Rogmann Date: Thu, 5 Feb 2026 22:27:52 +0100 Subject: [PATCH 1/4] spec: remove parameter spec-ngram-check-rate --- common/arg.cpp | 10 ---------- common/common.h | 1 - common/ngram-map.cpp | 7 +++---- common/ngram-map.h | 8 +++----- common/speculative.cpp | 14 ++------------ docs/speculative.md | 6 ------ tools/server/server-task.cpp | 4 ---- 7 files changed, 8 insertions(+), 42 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 5fbc9022c0..9c85696ebd 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -3437,16 +3437,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.speculative.ngram_size_m = value; } ).set_examples({LLAMA_EXAMPLE_SERVER})); - add_opt(common_arg( - {"--spec-ngram-check-rate"}, "N", - string_format("ngram check rate for ngram-simple/ngram-map speculative decoding (default: %d)", params.speculative.ngram_check_rate), - [](common_params & params, int value) { - if (value < 1) { - throw std::invalid_argument("ngram check rate must be at least 1"); - } - params.speculative.ngram_check_rate = value; - } - ).set_examples({LLAMA_EXAMPLE_SERVER})); add_opt(common_arg( {"--spec-ngram-min-hits"}, "N", string_format("minimum hits for ngram-map speculative decoding (default: %d)", params.speculative.ngram_min_hits), diff --git a/common/common.h b/common/common.h index 398ebb0960..b284244530 100644 --- a/common/common.h +++ b/common/common.h @@ -269,7 +269,6 @@ struct common_params_speculative { uint16_t ngram_size_n = 12; // ngram size for lookup uint16_t ngram_size_m = 48; // mgram size for speculative tokens - uint16_t ngram_check_rate = 1; // check rate for ngram lookup uint16_t ngram_min_hits = 1; // minimum hits at ngram/mgram lookup for mgram to be proposed std::shared_ptr ngram_mod; diff --git a/common/ngram-map.cpp b/common/ngram-map.cpp index c5b8fc75ed..2b876a6e99 100644 --- a/common/ngram-map.cpp +++ 
b/common/ngram-map.cpp @@ -231,10 +231,9 @@ void common_ngram_map_draft(common_ngram_map & map, GGML_ABORT("%s: cur_len exceeds UINT32_MAX: %zu", __func__, cur_len); } - // Only check every check_rate tokens to save compute - // i.e., perform check if (cur_len - idx_last_check) >= check_rate - if (map.idx_last_check + map.check_rate > cur_len) { - return; + if (map.idx_last_check > cur_len) { + // Should not happen because of common_ngram_map_begin(). + GGML_ABORT("%s: map.idx_last_check > cur_len: %zu > %zu", __func__, map.idx_last_check, cur_len); } map.idx_last_check = cur_len; diff --git a/common/ngram-map.h b/common/ngram-map.h index 9668bd5a7c..6a5be56a33 100644 --- a/common/ngram-map.h +++ b/common/ngram-map.h @@ -24,7 +24,6 @@ struct common_ngram_simple_config { uint16_t size_ngram; // size of n-grams to lookup in self-mode uint16_t size_mgram; // size of m-grams to draft in self-mode - uint16_t check_rate; // check for speculative decoding without draft model for each check_rate token }; // Searches for a n-gram in the history and checks whether a draft sequence should be generated. @@ -66,15 +65,14 @@ struct common_ngram_map { bool key_only; // true if only key n-grams are used, no values. std::vector keys; // key n-grams which occur several times in token-history - uint16_t check_rate; // check for speculative decoding without draft model for each check_rate token uint16_t min_hits; // minimum number of key hits to consider a draft - bool show_key_map_stats = false; // true, if statitics of the key_map should be printed. + bool show_key_map_stats = true; // true, if statistics of the key_map should be printed. 
common_ngram_map(uint16_t sz_key, uint16_t sz_value, bool only_keys, - uint16_t check_rate, uint16_t min_hits) + uint16_t min_hits) : size_key(sz_key), size_value(sz_value), key_only(only_keys), - check_rate(check_rate), min_hits(min_hits) { + min_hits(min_hits) { key_map.resize(COMMON_NGRAM_HASH_MAP_SIZE); // 2^18 hash entries, 0 entries if key_map shouldn't be used } diff --git a/common/speculative.cpp b/common/speculative.cpp index c99b19dbfd..67dd621f8c 100644 --- a/common/speculative.cpp +++ b/common/speculative.cpp @@ -465,8 +465,6 @@ struct common_speculative_state_eagle3 : public common_speculative_state { struct common_speculative_state_ngram_simple : public common_speculative_state { common_ngram_simple_config config; - uint16_t check_id = 0; // used to control the frequency of generating drafts - common_speculative_state_ngram_simple( enum common_speculative_type type, common_ngram_simple_config config) @@ -481,11 +479,6 @@ struct common_speculative_state_ngram_simple : public common_speculative_state { const llama_tokens & prompt_tgt, llama_token id_last, llama_tokens & result) override { - ++check_id; - if (check_id < config.check_rate) { - return; - } - check_id = 0; result = common_ngram_simple_draft(config, prompt_tgt, id_last); GGML_UNUSED(params); @@ -752,10 +745,9 @@ static common_ngram_map get_common_ngram_map(const common_speculative_config & c uint16_t size_key = config.params.ngram_size_n; uint16_t size_value = config.params.ngram_size_m; bool key_only = (config.type == COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K); - uint16_t check_rate = config.params.ngram_check_rate; uint16_t min_hits = config.params.ngram_min_hits; - return common_ngram_map(size_key, size_value, key_only, check_rate, min_hits); + return common_ngram_map(size_key, size_value, key_only, min_hits); } static common_speculative_state_ngram_cache create_state_ngram_cache( @@ -895,12 +887,10 @@ common_speculative * common_speculative_init( uint16_t ngram_size_key = ngram_map.size_key; 
uint16_t mgram_size_value = ngram_map.size_value; - uint16_t check_rate = ngram_map.check_rate; auto config_simple = common_ngram_simple_config { /* .size_ngram = */ ngram_size_key, - /* .size_mgram = */ mgram_size_value, - /* .check_rate = */ check_rate + /* .size_mgram = */ mgram_size_value }; auto state = std::make_unique( /* .type = */ config.type, diff --git a/docs/speculative.md b/docs/speculative.md index 03afab5b41..31856c157a 100644 --- a/docs/speculative.md +++ b/docs/speculative.md @@ -119,8 +119,6 @@ If a draft model is combined with a draftless decoding the draftless decoding ha of lookup n-gram (default: 12) --spec-ngram-size-m N ngram size M for ngram-simple/ngram-map speculative decoding, length of draft m-gram (default: 48) ---spec-ngram-check-rate N ngram check rate for ngram-simple/ngram-map speculative decoding - (default: 1) --spec-ngram-min-hits N minimum hits for ngram-map speculative decoding (default: 1) ``` @@ -153,10 +151,6 @@ Sets the size M of the draft m-gram for n-gram map based speculative decoding. The m-gram size determines how many tokens to draft when a match is found. Larger values can provide more speedup but may reduce acceptance rate. -### `--spec-ngram-check-rate R` - -This option aims at performance if the n-gram lookup in history is to costly. A lookup will be executed at every R tokens (default is 1, every token). - ### `--spec-ngram-min-hits H` This option defines how often a key has to appear in the token history to be used as a draft (default is 1). 
diff --git a/tools/server/server-task.cpp b/tools/server/server-task.cpp index 2d25db63b7..a137427c69 100644 --- a/tools/server/server-task.cpp +++ b/tools/server/server-task.cpp @@ -80,7 +80,6 @@ json task_params::to_json(bool only_metrics) const { {"speculative.type", common_speculative_type_to_str(speculative.type)}, {"speculative.ngram_size_n", speculative.ngram_size_n}, {"speculative.ngram_size_m", speculative.ngram_size_m}, - {"speculative.ngram_c_rate", speculative.ngram_check_rate}, {"speculative.ngram_m_hits", speculative.ngram_min_hits}, {"timings_per_token", timings_per_token}, {"post_sampling_probs", post_sampling_probs}, @@ -144,7 +143,6 @@ json task_params::to_json(bool only_metrics) const { {"speculative.type", common_speculative_type_to_str(speculative.type)}, {"speculative.ngram_size_n", speculative.ngram_size_n}, {"speculative.ngram_size_m", speculative.ngram_size_m}, - {"speculative.ngram_c_rate", speculative.ngram_check_rate}, {"speculative.ngram_m_hits", speculative.ngram_min_hits}, {"timings_per_token", timings_per_token}, {"post_sampling_probs", post_sampling_probs}, @@ -257,12 +255,10 @@ task_params server_task::params_from_json_cmpl( params.speculative.ngram_size_n = json_value(data, "speculative.ngram_size_n", defaults.speculative.ngram_size_n); params.speculative.ngram_size_m = json_value(data, "speculative.ngram_size_m", defaults.speculative.ngram_size_m); - params.speculative.ngram_check_rate = json_value(data, "speculative.ngram_c_rate", defaults.speculative.ngram_check_rate); params.speculative.ngram_min_hits = json_value(data, "speculative.ngram_m_hits", defaults.speculative.ngram_min_hits); params.speculative.ngram_size_n = std::max(std::min(1, (int) params.speculative.ngram_size_n), 1024); params.speculative.ngram_size_m = std::max(std::min(1, (int) params.speculative.ngram_size_m), 1024); - params.speculative.ngram_check_rate = std::max(std::min(1, (int) params.speculative.ngram_check_rate), 1024); 
params.speculative.ngram_min_hits = std::max(std::min(1, (int) params.speculative.ngram_min_hits), 1024); // Use OpenAI API logprobs only if n_probs wasn't provided From a5c174d971f30619a4033e36c3e573727f4f8c65 Mon Sep 17 00:00:00 2001 From: Sascha Rogmann Date: Thu, 5 Feb 2026 22:41:18 +0100 Subject: [PATCH 2/4] spec : renamed statistics vars --- common/speculative.cpp | 33 ++++++++++++++++----------------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/common/speculative.cpp b/common/speculative.cpp index 67dd621f8c..4edfadc7b2 100644 --- a/common/speculative.cpp +++ b/common/speculative.cpp @@ -113,13 +113,12 @@ static bool common_speculative_are_compatible( struct common_speculative_state { const enum common_speculative_type type; - // TODO: rename to n_call_draft, n_gen_drafts, n_acc_drafts, n_gen_tokens, n_acc_tokens // TODO: add n_call_begin, n_call_accept - size_t drafts_call_count = 0; // number of times this implementation was called. - size_t drafts_generated_count = 0; // number of times a draft or part was generated by this implementation. - size_t drafts_accepted_count = 0; // number of times a draft or part was accepted by the target model. - size_t drafts_generated_tokens = 0; // number of tokens generated by this implementation. - size_t drafts_accepted_tokens = 0; // number of tokens accepted by the target model. + size_t n_call_draft = 0; // number of times this implementation was called. + size_t n_gen_drafts = 0; // number of times a draft or part was generated by this implementation. + size_t n_acc_drafts = 0; // number of times a draft or part was accepted by the target model. + size_t n_gen_tokens = 0; // number of tokens generated by this implementation. + size_t n_acc_tokens = 0; // number of tokens accepted by the target model. // TODO: track performance of most recent calls const bool gen_perf = true; // whether to generate performance stats. 
@@ -967,17 +966,17 @@ llama_tokens common_speculative_draft( { common_time_meas tm(impl->t_draft_us, !impl->gen_perf); impl->draft(params, prompt_tgt, id_last, result); - impl->drafts_call_count++; + impl->n_call_draft++; } if (!result.empty()) { LOG_DBG("%s: called impl %s, hist size = %zu, call_count = %zu, gen = %zu\n", __func__, common_speculative_type_to_str(impl.get()->type).c_str(), prompt_tgt.size(), - impl.get()->drafts_call_count, result.size()); + impl.get()->n_call_draft, result.size()); spec->curr_impl = impl.get(); // set current implementation for stats - impl->drafts_generated_count++; - impl->drafts_generated_tokens += result.size(); + impl->n_gen_drafts++; + impl->n_gen_tokens += result.size(); break; // We have a draft, so break out of the loop and return it. } @@ -998,8 +997,8 @@ void common_speculative_accept(common_speculative * spec, uint16_t n_accepted) { { common_time_meas tm(impl->t_accept_us, !impl->gen_perf); if (n_accepted > 0) { - impl->drafts_accepted_count++; - impl->drafts_accepted_tokens += n_accepted; + impl->n_acc_drafts++; + impl->n_acc_tokens += n_accepted; } impl->accept(n_accepted); @@ -1025,11 +1024,11 @@ void common_speculative_print_stats(const common_speculative * spec) { LOG_INF("statistics %s: #calls = %zu, #gen drafts = %zu, #acc drafts = %zu, #gen tokens = %zu, #acc tokens = %zu%s\n", common_speculative_type_to_str(impl->type).c_str(), - impl->drafts_call_count, - impl->drafts_generated_count, - impl->drafts_accepted_count, - impl->drafts_generated_tokens, - impl->drafts_accepted_tokens, + impl->n_call_draft, + impl->n_gen_drafts, + impl->n_acc_drafts, + impl->n_gen_tokens, + impl->n_acc_tokens, str_perf.c_str()); } } From 4283cfef30d196b59351ebd697e2d4cb82ea2bc8 Mon Sep 17 00:00:00 2001 From: Sascha Rogmann Date: Thu, 5 Feb 2026 23:02:14 +0100 Subject: [PATCH 3/4] spec : add n_call_begin, n_call_accept --- common/speculative.cpp | 12 ++++++++---- docs/speculative.md | 7 ++++++- 2 files changed, 14 insertions(+), 5 
deletions(-) diff --git a/common/speculative.cpp b/common/speculative.cpp index 4edfadc7b2..24b2cf9eb8 100644 --- a/common/speculative.cpp +++ b/common/speculative.cpp @@ -113,8 +113,10 @@ static bool common_speculative_are_compatible( struct common_speculative_state { const enum common_speculative_type type; - // TODO: add n_call_begin, n_call_accept - size_t n_call_draft = 0; // number of times this implementation was called. + size_t n_call_begin = 0; // number of times this implementation was called for refresh. + size_t n_call_draft = 0; // number of times this implementation was called for generation. + size_t n_call_accept = 0; // number of times this implementation was called for accumulation. + size_t n_gen_drafts = 0; // number of times a draft or part was generated by this implementation. size_t n_acc_drafts = 0; // number of times a draft or part was accepted by the target model. size_t n_gen_tokens = 0; // number of tokens generated by this implementation. @@ -950,6 +952,7 @@ void common_speculative_begin(common_speculative * spec, const llama_tokens & pr for (auto & impl : spec->impls) { common_time_meas tm(impl->t_begin_us, !impl->gen_perf); impl->begin(prompt); + impl->n_call_begin++; } } @@ -1002,6 +1005,7 @@ void common_speculative_accept(common_speculative * spec, uint16_t n_accepted) { } impl->accept(n_accepted); + impl->n_call_accept++; } } @@ -1022,9 +1026,9 @@ void common_speculative_print_stats(const common_speculative * spec) { str_perf = ""; } - LOG_INF("statistics %s: #calls = %zu, #gen drafts = %zu, #acc drafts = %zu, #gen tokens = %zu, #acc tokens = %zu%s\n", + LOG_INF("statistics %s: #calls(b,g,a) = %zu %zu %zu, #gen drafts = %zu, #acc drafts = %zu, #gen tokens = %zu, #acc tokens = %zu%s\n", common_speculative_type_to_str(impl->type).c_str(), - impl->n_call_draft, + impl->n_call_begin, impl->n_call_draft, impl->n_call_accept, impl->n_gen_drafts, impl->n_acc_drafts, impl->n_gen_tokens, diff --git a/docs/speculative.md 
b/docs/speculative.md index 31856c157a..29da332875 100644 --- a/docs/speculative.md +++ b/docs/speculative.md @@ -169,7 +169,12 @@ draft acceptance rate = 0.70312 ( 90 accepted / 128 generated) statistics ngram_mod: #calls = 810, #gen drafts = 15, #acc drafts = 15, #gen tokens = 960, #acc tokens = 730, dur(b,g,a) = 0.149, 0.347, 0.005 ms ``` -- `#calls`: number of calls of this implementations +``` +statistics ngram_map_k: #calls(b,g,a) = 6 1690 26, #gen drafts = 26, #acc drafts = 26, #gen tokens = 1248, #acc tokens = 968, dur(b,g,a) = 2.234, 1.427, 0.016 ms +``` + + +- `#calls(b,g,a)`: number of calls of begin (new prompt), generation and accumulation (accept) of this implementation - `#gen drafts`: number of drafts generated by this implementation - `#acc drafts`: number of drafts accepted (partially) by the main model - `#gen tokens`: number of tokens generated by this implementation (including rejected tokens) From 2d9b984293087200c9ca557b1077f20834a7d5f2 Mon Sep 17 00:00:00 2001 From: Sascha Rogmann Date: Thu, 5 Feb 2026 23:07:34 +0100 Subject: [PATCH 4/4] spec : don't enable key-map-stats --- common/ngram-map.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/ngram-map.h b/common/ngram-map.h index 6a5be56a33..41b9530449 100644 --- a/common/ngram-map.h +++ b/common/ngram-map.h @@ -67,7 +67,7 @@ struct common_ngram_map { std::vector keys; // key n-grams which occur several times in token-history uint16_t min_hits; // minimum number of key hits to consider a draft - bool show_key_map_stats = true; // true, if statistics of the key_map should be printed. + bool show_key_map_stats = false; // true, if statistics of the key_map should be printed. common_ngram_map(uint16_t sz_key, uint16_t sz_value, bool only_keys, uint16_t min_hits)