diff --git a/common/ngram-map.cpp b/common/ngram-map.cpp
index 84fd761367..cab231bad7 100644
--- a/common/ngram-map.cpp
+++ b/common/ngram-map.cpp
@@ -7,6 +7,18 @@
 #include
 #include
+// prime number used in the LCG hash function (32 bit); it is near (sqrt(5) - 1)/2 * 2^32.
+#define LCG_FACTOR 2654435761UL
+
+// Compute the LCG hash of an n-gram of size len at offset start.
+static uint32_t common_ngram_map_hash(const llama_tokens & tokens, size_t start, size_t len) {
+    uint32_t hash = 0;
+    for (size_t i = 0; i < len; ++i) {
+        hash = hash * LCG_FACTOR + tokens[start + i];
+    }
+    return hash;
+}
+
 // Print the values of a sublist of `llama_tokens & inp` to a string in the form [v0, v1, v2, ...].
 static std::string common_tokens_to_str(const llama_tokens & inp, size_t start, size_t length) {
     std::ostringstream oss;
@@ -115,6 +127,100 @@ llama_tokens common_ngram_simple_draft(
 // maximum number of counted values of a ngram map value.
 #define COMMON_NGRAM_MAX_VALUE_COUNT 16380
 
+void common_ngram_map_begin(
+        common_ngram_map & map, const llama_tokens & tokens) {
+    size_t size_begin = tokens.size();
+
+    LOG_DBG("%s: begin, idx_last_check=%zu, new begin=%zu, #keys=%zu\n", __func__,
+            map.idx_last_check, size_begin, map.keys.size());
+
+    size_t count_map_entries_upd = 0;
+    if (!map.key_map.empty() && size_begin < map.idx_last_check) {
+        if (map.show_key_map_stats) {
+            // Print statistics of the hash map key_map.
+            size_t count_nonzero = 0;
+            uint32_t min_idx = UINT32_MAX;
+            uint32_t max_idx = 0;
+            for (size_t i = 0; i < map.key_map.size(); ++i) {
+                uint32_t key_idx = map.key_map[i];
+                if (key_idx != 0) {
+                    ++count_nonzero;
+                    if (key_idx < min_idx) min_idx = key_idx;
+                    if (key_idx > max_idx) max_idx = key_idx;
+                }
+            }
+            if (count_nonzero == 0) {
+                min_idx = 0;
+            }
+            LOG_INF("%s: key_map stats: entries=%zu, min_idx=%u, max_idx=%u, key_map_last_idx=%u\n",
+                    __func__, count_nonzero, min_idx, max_idx, map.key_map_last_idx);
+        }
+
+        // Update the map from hash to key index (clear outdated entries).
+        for (size_t i = 0; i < map.key_map.size(); ++i) {
+            uint32_t key_idx = map.key_map[i];
+            if (key_idx >= map.size_last_begin) {
+                map.key_map[i] = 0;
+                count_map_entries_upd++;
+            }
+        }
+        map.key_map_last_idx = (map.size_last_begin > 0) ? map.size_last_begin - 1 : 0;
+    }
+
+    if (size_begin < map.idx_last_check && !map.keys.empty()) {
+        // The next token generation will start at index size_begin.
+        // The tokens between map.size_last_begin and size_begin are no longer valid.
+        //
+        // Refresh map: remove all entries with index >= map.size_last_begin.
+        size_t count_keys = map.keys.size();
+        size_t count_keys_del = 0;
+        size_t count_values_del = 0;
+        for (int32_t i = map.keys.size() - 1; i >= 0; --i) {
+            common_ngram_map_key & key = map.keys[i];
+            if (key.key_idx >= map.size_last_begin) {
+                // Delete the key.
+                LOG_DBG("%s: delete key %d at index %zu (>= size_last_begin=%zu)\n", __func__, i, key.key_idx, map.size_last_begin);
+                map.keys.erase(map.keys.begin() + i);
+                count_keys_del++;
+                continue;
+            }
+            if (map.key_only) {
+                continue;
+            }
+
+            // Check the indices of the values.
+            for (int16_t j = COMMON_NGRAM_MAX_VALUES - 1; j >= 0; --j) {
+                common_ngram_map_value & value = key.values[j];
+                if (value.value_idx >= map.size_last_begin) {
+                    // Delete the value.
+                    count_values_del++;
+
+                    // Move all values after this value to the left.
+                    for (uint16_t k = j; k < COMMON_NGRAM_MAX_VALUES - 1; ++k) {
+                        key.values[k] = key.values[k + 1];
+                    }
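+                    // Shifting keeps the occupied slots contiguous at the front,
+                    // so checking values[0] below is enough to detect an empty key.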
+                    // Clear the last value.
+                    key.values[COMMON_NGRAM_MAX_VALUES - 1].value_idx = 0;
+                    key.values[COMMON_NGRAM_MAX_VALUES - 1].value_num = 0;
+                }
+            }
+            if (key.values[0].value_idx == 0) {
+                // No values left, delete the key.
+                LOG_DBG("%s: delete key %d at index %zu (no values left)\n", __func__, i, key.key_idx);
+                map.keys.erase(map.keys.begin() + i);
+                count_keys_del++;
+            }
+        }
+
+        LOG_INF("%s: refresh map: idx_last_check=%zu, new begin=%zu, #keys_checked=%zu, #keys_del=%zu, #values_del=%zu, #hashes_upd=%zu\n", __func__,
+                map.idx_last_check, size_begin,
+                count_keys, count_keys_del, count_values_del, count_map_entries_upd);
+    }
+
+    map.idx_last_check = (map.size_last_begin > 0) ? map.size_last_begin - 1 : 0;
+    map.size_last_begin = size_begin;
+}
+
 void common_ngram_map_draft(common_ngram_map & map,
         const llama_tokens & inp, llama_token sampled,
         llama_tokens & draft) {
@@ -129,6 +235,10 @@ void common_ngram_map_draft(common_ngram_map & map,
     if (cur_len < static_cast<size_t>(2 * n + m)) {
         return;
     }
+    if (cur_len >= static_cast<size_t>(UINT32_MAX)) {
+        // key_map uses uint32_t instead of size_t.
+        GGML_ABORT("%s: cur_len exceeds UINT32_MAX: %zu", __func__, cur_len);
+    }
 
     // Only check every check_rate tokens to save compute
     // i.e., perform check if (cur_len - idx_last_check) >= check_rate
@@ -147,24 +257,92 @@ void common_ngram_map_draft(common_ngram_map & map,
 
     // search for the key in the map
     size_t match_pos = 0;
-    for (size_t j = cur_len - n - m - 1; j > 0; --j) {
-        bool match = true;
-        for (size_t k = 0; k < n; ++k) {
-            if (inp[j + k] != key_tokens[k]) {
-                match = false;
-                break;
+    if (map.size_last_begin > cur_len) {
+        GGML_ABORT("%s: map.size_last_begin > cur_len: %zu > %zu", __func__, map.size_last_begin, cur_len);
+    }
+    if (!map.key_map.empty()) {
+        // Search for the key in the hash map key_map (n-gram hash -> n-gram index).
+        uint32_t idx_hash = (common_ngram_map_hash(key_tokens, 0, n) % map.key_map.size());
+        uint32_t idx_key = map.key_map[idx_hash];
+        if (idx_key != 0 && idx_key < cur_len - n - m - 1) {
+            // Check if the key matches the n-gram at idx_key (because of possible collisions).
+            bool match = true;
+            for (size_t k = 0; k < n; ++k) {
+                if (inp[idx_key + k] != key_tokens[k]) {
+                    match = false;
+                    break;
+                }
+            }
+            LOG_DBG("%s: key hash %x -> idx_key %u: match %d\n", __func__, idx_hash, idx_key, match ? 1 : 0);
+            if (match) {
+                match_pos = idx_key;
             }
         }
-        if (match) {
-            match_pos = j;
-            break;
+    }
+    if (match_pos == 0 && map.size_last_begin > (size_t) (n + m + 1)) {
+        // Search for the key in [1, map.size_last_begin - n - m - 1], descending.
+        for (size_t j = map.size_last_begin - n - m - 1; j > map.key_map_last_idx; --j) {
+            // Check whether the n-gram at position j matches the key.
+            bool match = true;
+            for (size_t k = 0; k < n; ++k) {
+                if (inp[j + k] != key_tokens[k]) {
+                    match = false;
+                    break;
+                }
+            }
+            if (match) {
+                match_pos = j;
+                break;
+            }
+        }
+    }
+    if (match_pos == 0) {
+        // In case of a reasoning chat, the part after size_last_begin may be deleted/reordered later.
+        //
+        // Search in [size_last_begin, cur_len - n - m - 1], descending.
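+        // A match found here may be short-lived: the next call of
+        // common_ngram_map_begin() can invalidate these positions again.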
+        for (size_t j = cur_len - n - m - 1; j > map.size_last_begin && j > map.key_map_last_idx; --j) {
+            bool match = true;
+            for (size_t k = 0; k < n; ++k) {
+                if (inp[j + k] != key_tokens[k]) {
+                    match = false;
+                    break;
+                }
+            }
+            if (match) {
+                match_pos = j;
+                break;
+            }
         }
     }
 
     if (match_pos > 0) {
-        LOG_INF("%s: cur_len = %zu, n = %d, m = %d, sz_tkns = %zu, sampled = %d, match_pos = %zu\n", __func__,
+        LOG_DBG("%s: cur_len = %zu, n = %d, m = %d, sz_tkns = %zu, sampled = %d, match_pos = %zu\n", __func__,
             cur_len, n, m, key_tokens.size(), sampled, match_pos);
     }
 
+    if (!map.key_map.empty()) {
+        // Add hashes of new n-grams to key_map.
+        //
+        // Use the same order as above.
+        if (map.size_last_begin > (size_t) (n + m + 1)) {
+            for (size_t j = map.size_last_begin - n - m - 1; j > map.key_map_last_idx; --j) {
+                // compute hash and store the index of the n-gram at position j in the map.
+                uint32_t idx_hash = (common_ngram_map_hash(inp, j, n) % map.key_map.size());
+                if (map.key_map[idx_hash] == 0) {
+                    map.key_map[idx_hash] = j; // collisions may occur
+                }
+            }
+        }
+
+        for (size_t j = cur_len - n - m - 1; j > map.size_last_begin && j > map.key_map_last_idx; --j) {
+            // compute hash and store the index of the n-gram at position j in the map.
+            uint32_t idx_hash = (common_ngram_map_hash(inp, j, n) % map.key_map.size());
+            if (map.key_map[idx_hash] == 0) {
+                map.key_map[idx_hash] = j;
+            }
+        }
+        map.key_map_last_idx = std::max(static_cast<uint32_t>(cur_len - n - m - 1), map.key_map_last_idx);
+    }
+
     if (match_pos == 0) {
         return;
     }
@@ -215,8 +393,8 @@ void common_ngram_map_draft(common_ngram_map & map,
         draft.push_back(inp[match_pos + n + i]);
     }
 
-    LOG_INF("%s: key_offset = %zu, key_num = %d, draft.size = %zu\n", __func__,
-            key_offset, curr_key.key_num, draft.size());
+    LOG_DBG("%s: key_idx = %zu, key_offset = %zu, key_num = %d, draft.size = %zu\n", __func__,
+            curr_key.key_idx, key_offset, curr_key.key_num, draft.size());
 
     map.last_draft_created = false;
     map.last_draft_key_idx = key_offset;
@@ -318,7 +496,7 @@ void common_ngram_map_draft(common_ngram_map & map,
         }
     }
 
-    if (sum_occur > 0 && max_occur < 3 * sum_occur) {
+    if (sum_occur > 0 && max_occur < 2 * sum_occur) {
         // The most frequent value is not much more frequent than the other values.
         // We do not use the draft.
         return;
diff --git a/common/ngram-map.h b/common/ngram-map.h
index b365034ac5..c094d513d5 100644
--- a/common/ngram-map.h
+++ b/common/ngram-map.h
@@ -9,6 +9,8 @@
 // 2. ngram_map: lookup of n-grams followed by m-grams in token history using a map.
 //    The map is a vector of key n-grams, and for each key n-gram there is a list of value m-grams.
 //
+// ref: https://github.com/ggml-org/llama.cpp/pull/18471
+//
 #include "llama.h"
 #include "common.h"
@@ -51,10 +53,13 @@ llama_tokens common_ngram_simple_draft(
 // maximum number of m-gram values stored for each key n-gram.
 #define COMMON_NGRAM_MAX_VALUES 4
 
+// number of entries in the (optional, size 0 to disable) map from n-gram hash to n-gram index.
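+//
+// 262144 = 2^18 entries of one uint32_t each (1 MiB); a larger table reduces
+// hash collisions at the cost of memory.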
+#define COMMON_NGRAM_HASH_MAP_SIZE 262144
+
 // statistics of a m-gram after a known n-gram
 struct common_ngram_map_value {
-    size_t value_idx = 0; // index of value m-gram in token-history (0 if unused)
-    uint16_t value_num = 0; // number of occurences of this value m-gram after the key n-gram (0 in an unused values-slot)
+    size_t value_idx = 0;    // index of value m-gram in token-history (0 if unused)
+    uint16_t value_num = 0;  // number of occurrences of this value m-gram after the key n-gram (0 in an unused values-slot)
     int16_t n_accepted = -1; // number of accepted tokens at last draft (-1 if unused)
 };
@@ -74,23 +79,43 @@ struct common_ngram_map {
     bool key_only; // true if only key n-grams are used, no values.
 
-    // first draft: vector only, no map.
     std::vector<common_ngram_map_key> keys; // key n-grams which occur several times in token-history
 
     uint16_t check_rate; // check for speculative decoding without draft model for each check_rate token
     uint16_t min_hits;   // minimum number of key hits to consider a draft
 
+    bool show_key_map_stats = false; // true if statistics of the key_map should be printed.
+
     common_ngram_map(uint16_t sz_key, uint16_t sz_value, bool only_keys, uint16_t check_rate, uint16_t min_hits)
         : size_key(sz_key), size_value(sz_value), key_only(only_keys),
-          check_rate(check_rate), min_hits(min_hits) {}
+          check_rate(check_rate), min_hits(min_hits) {
+        key_map.resize(COMMON_NGRAM_HASH_MAP_SIZE); // 2^18 hash entries, 0 entries if key_map shouldn't be used
+    }
+
+    // In reasoning chats the previous reasoning block will be removed from the context history.
+    // A rebuild of the n-gram map is needed after that.
+
+    size_t size_last_begin = 0; // number of tokens at previous start of generation
 
     bool last_draft_created = false; // true if a draft was created at last call.
-    size_t last_draft_key_idx = 0; // index of last key used for draft generation.
+    size_t last_draft_key_idx = 0; // index of last key used for draft generation (0 = no draft)
     uint16_t last_draft_value_idx = 0; // index of last value used for draft generation.
 
     size_t idx_last_check = 0; // index of last check in context history
+
+    // optional map "hash -> n-gram index" for faster lookup of n-grams; the map is empty if unused.
+    //
+    // uint32_t instead of size_t (the size of current histories is << UINT32_MAX)
+    std::vector<uint32_t> key_map; // key_map[hash] = index of n-gram in context window
+    uint32_t key_map_last_idx = 0; // index of the last n-gram added to key_map
 };
+
+// Initialize the n-gram map with the given token history.
+// map: the ngram map to initialize.
+// tokens: the token history to base the map on.
+void common_ngram_map_begin(
+        common_ngram_map & map,
+        const llama_tokens & tokens);
 
 // Searches for the n-gram in the history and checks whether a draft sequence should be generated.
 // map: the ngram map to search in.
diff --git a/common/speculative.cpp b/common/speculative.cpp
index a1a3b51c13..152aaa48d4 100644
--- a/common/speculative.cpp
+++ b/common/speculative.cpp
@@ -124,9 +124,9 @@ struct common_speculative_state {
     // TODO: track performance of most recent calls
     const bool gen_perf = true; // whether to generate performance stats.
 
-    // TODO: rename to t_draft_us
-    // TODO: add t_begin_us, t_accept_us
-    int64_t gen_duration_us = 0; // total time spent in this implementation in microseconds.
+    int64_t t_begin_us = 0;  // total time spent in refresh (begin) of this implementation in microseconds.
+    int64_t t_draft_us = 0;  // total time spent in generating drafts in this implementation in microseconds.
+    int64_t t_accept_us = 0; // total time spent in processing acceptance in this implementation in microseconds.
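+    // the three durations are reported by common_speculative_print_stats() as "dur(b,g,a)"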
 
     common_speculative_state(enum common_speculative_type type) : type(type) {}
@@ -499,7 +499,7 @@ struct common_speculative_state_ngram_map_k : public common_speculative_state {
         : common_speculative_state(type), map(std::move(map)) {}
 
     void begin(const llama_tokens & prompt) override {
-        GGML_UNUSED(prompt);
+        common_ngram_map_begin(map, prompt);
     }
 
     void draft(
@@ -951,7 +951,12 @@ void common_speculative_begin(common_speculative * spec, const llama_tokens & pr
     }
 
     for (auto & impl : spec->impls) {
+        const int64_t t_start_us = impl->gen_perf ? ggml_time_us() : 0;
+
         impl->begin(prompt);
+
+        const int64_t t_now_us = impl->gen_perf ? ggml_time_us() : 0;
+        impl->t_begin_us += t_now_us - t_start_us; // accumulate duration of this refresh
     }
 }
@@ -973,7 +978,7 @@ llama_tokens common_speculative_draft(
         const int64_t t_now_us = impl->gen_perf ? ggml_time_us() : 0;
 
         impl->drafts_call_count++;
-        impl->gen_duration_us += t_now_us - t_start_us; // accumulate duration for this implementation
+        impl->t_draft_us += t_now_us - t_start_us; // accumulate duration for this implementation
     }
 
     if (!result.empty()) {
@@ -1001,12 +1006,15 @@ void common_speculative_accept(common_speculative * spec, uint16_t n_accepted) {
     GGML_ASSERT(impl);
 
+    const int64_t t_start_us = impl->gen_perf ? ggml_time_us() : 0;
     if (n_accepted > 0) {
         impl->drafts_accepted_count++;
         impl->drafts_accepted_tokens += n_accepted;
     }
     impl->accept(n_accepted);
+
+    const int64_t t_now_us = impl->gen_perf ? ggml_time_us() : 0;
+    impl->t_accept_us += t_now_us - t_start_us; // accumulate duration of this acceptance
 }
 
 void common_speculative_print_stats(const common_speculative * spec) {
@@ -1018,13 +1026,14 @@ void common_speculative_print_stats(const common_speculative * spec) {
         std::string str_perf;
         if (impl->gen_perf) {
             std::ostringstream oss;
-            oss << std::fixed << std::setprecision(3) << impl->gen_duration_us / 1000.0;
-            str_perf = ", dur = " + oss.str() + " ms";
+            oss << std::fixed << std::setprecision(3) << impl->t_begin_us / 1000.0 << ", ";
+            oss << std::fixed << std::setprecision(3) << impl->t_draft_us / 1000.0 << ", ";
+            oss << std::fixed << std::setprecision(3) << impl->t_accept_us / 1000.0;
+            str_perf = ", dur(b,g,a) = " + oss.str() + " ms";
         } else {
             str_perf = "";
        }
 
-        // TODO: report time for begin() and accept()
         LOG_INF("statistics %s: #calls = %zu, #gen drafts = %zu, #acc drafts = %zu, #gen tokens = %zu, #acc tokens = %zu%s\n",
                 common_speculative_type_to_str(impl->type).c_str(),
                 impl->drafts_call_count,
diff --git a/docs/speculative.md b/docs/speculative.md
index 8281eaa2d3..03afab5b41 100644
--- a/docs/speculative.md
+++ b/docs/speculative.md
@@ -6,7 +6,7 @@ llama.cpp supports speculative decoding, a technique that can significantly acce
 
 ## Implementations
 
-The `llama-server` application supports several implementations of speculative decoding:
+The `llama-server` application supports several implementations of speculative decoding. An implementation with a draft model can be combined with an implementation without a draft model.
 
 ### Draft Model (`draft`)
@@ -32,12 +32,21 @@ An example to use this approach can be the rewriting of source code by a LLM.
 
 This implementation looks for the last n-gram in history that matches the current n-gram and creates a draft using the m tokens following the matched n-gram. It is the simplest self-speculative approach with minimal overhead.
 
+```
+llama-server [...] --spec-type ngram-simple --draft-max 64
+```
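+
+For illustration (hypothetical tokens, n = 2, m = 3): if the history ends in `… A B C D E … A B`, the current n-gram `A B` matches its earlier occurrence and the proposed draft is `C D E`.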
+
 #### n-gram Map Key (`ngram-map-k`)
 
 This implementation looks for the current n-gram of size n (called the _key_) in the token history. If the key n-gram is followed by the same m tokens (called the _mgram_) multiple times, it creates a draft using these m tokens. This approach requires a minimum number of occurrences (argument `--spec-ngram-min-hits`, default is 1) before generating drafts.
 
 The number of accepted tokens is stored for each used n-gram.
 
+**Example:**
+```
+llama-server [...] --spec-type ngram-map-k --draft-max 64
+```
+
 #### n-gram Map Key-4-Values (`ngram-map-k4v`)
 
 This experimental implementation looks for the current n-gram of size n (called the _key_) in the token history. For each key, up to four _values_ (n-grams of size m, called _mgrams_) are tracked. An internal statistic counts the occurrences of each mgram after the key n-gram. If one mgram is significantly more frequent than the others, it is used as the draft.
@@ -45,17 +54,65 @@ This experimental implementation looks for the current n-gram of size n (called
 The number of accepted tokens is stored for each used n-gram.
 
 **Example:** Server options to be used if there are a lot of longer repetitions.
-```bash
-llama-server [...] --spec-type ngram-map-k4v --spec-ngram-size-n 8 --spec-ngram-size-m 8 --spec-ngram-min-hits 2
+```
+llama-server [...] --spec-type ngram-map-k4v --spec-ngram-size-n 8 --spec-ngram-size-m 8 --spec-ngram-min-hits 2 --draft-max 64
 ```
 
+### n-gram Mod (`ngram-mod`)
+
+This implementation adds a basic n-gram hasher for speculative decoding:
+
+- For each n-gram, compute a hash using an LCG
+- For each computed hash, store the next token
+- During speculation, iteratively compute the rolling hash of the last n tokens and pick the next token from the storage (see the sketch after the next list)
+
+Some characteristics:
+
+- Lightweight (~16 MB)
+- Constant memory and complexity
+- Can generate variable draft lengths (i.e. m is not fixed)
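+
+A minimal sketch of the idea (illustrative only, not the actual `ngram-mod` code: the names and the pool layout are made up, and the hash is recomputed per step here instead of being rolled incrementally):
+
+```
+#include <cstdint>
+#include <vector>
+
+using llama_token = int32_t;
+
+// LCG-style hash of the n tokens ending at position `end`
+static uint32_t ngram_hash(const std::vector<llama_token> & toks, size_t end, size_t n) {
+    uint32_t h = 0;
+    for (size_t i = end - n; i < end; ++i) {
+        h = h * 2654435761u + (uint32_t) toks[i];
+    }
+    return h;
+}
+
+// remember, for every n-gram seen so far, the token that followed it
+// (pool must be initialized to -1; collisions simply overwrite)
+static void pool_update(std::vector<llama_token> & pool, const std::vector<llama_token> & toks, size_t n) {
+    for (size_t end = n; end < toks.size(); ++end) {
+        pool[ngram_hash(toks, end, n) % pool.size()] = toks[end];
+    }
+}
+
+// variable-length draft: extend the context with the stored follow-up token
+// until the pool has no continuation or the draft limit is reached
+static std::vector<llama_token> pool_draft(const std::vector<llama_token> & pool, std::vector<llama_token> ctx, size_t n, size_t n_draft_max) {
+    std::vector<llama_token> draft;
+    while (draft.size() < n_draft_max && ctx.size() >= n) {
+        const llama_token tok = pool[ngram_hash(ctx, ctx.size(), n) % pool.size()];
+        if (tok < 0) {
+            break;
+        }
+        draft.push_back(tok);
+        ctx.push_back(tok);
+    }
+    return draft;
+}
+```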
+
+Currently, a single hash pool is shared across all server slots, so different requests can benefit from each other.
+
+**Sample usage:**
+
+```
+# notes:
+# - small values of `n` are not recommended
+# - MoEs require long drafts
+# - dense models: can reduce `--draft-min` and `--draft-max`
+
+llama-server ... --spec-type ngram-mod --spec-ngram-size-n 24 --draft-min 48 --draft-max 64
+```
+
+Applications:
+
+- Iterating over a block of text/code (e.g. in llama.vim)
+- Reasoning models (when they have to repeat their thinking in the final answer)
+- Summarization
+
+Example Video:
+
+- See #19164
+
+### Differences between ngram-simple, ngram-map and ngram-mod
+
+- ngram-simple looks for a previous matching n-gram and inserts the following m-gram.
+- ngram-map-k also looks for a previous matching n-gram and inserts the following m-gram, but uses an internal hash map of the n-grams in the current context window.
+- ngram-mod uses a hash pool which is shared across all server slots. The hash pool is a map from n-gram hash to the next token (not to the next m-gram as in ngram-map).
 
 ## Command-Line Options
 
 If a draft model is combined with draftless decoding, the draftless decoding takes precedence.
 
 ```
---spec-type [none|ngram-cache|ngram-simple|ngram-map-k|ngram-map-k4v]
+--draft, --draft-n, --draft-max N number of tokens to draft for speculative decoding (default: 16)
+                                  (env: LLAMA_ARG_DRAFT_MAX)
+--draft-min, --draft-n-min N      minimum number of draft tokens to use for speculative decoding
+                                  (default: 0)
+                                  (env: LLAMA_ARG_DRAFT_MIN)
+[...]
+--spec-type [none|ngram-cache|ngram-simple|ngram-map-k|ngram-map-k4v|ngram-mod]
                                  type of speculative decoding to use when no draft model is provided
                                  (default: none)
 --spec-ngram-size-n N            ngram size N for ngram-simple/ngram-map speculative decoding, length
@@ -78,6 +135,7 @@ Specifies a type of speculative decoding without draft model.
 | `ngram-simple`  | Use simple n-gram pattern matching |
 | `ngram-map-k`   | Use n-gram pattern matching with n-gram-keys |
 | `ngram-map-k4v` | Use n-gram pattern matching with n-gram-keys and up to four m-gram values (experimental) |
+| `ngram-mod`     | Use basic n-gram hasher for speculative decoding with a shared pool |
 
 **Example:** Server-instance used to refactor source code.
 ```bash
@@ -112,9 +170,15 @@ statistics ngram_simple: #calls = 15, #gen drafts = 5, #acc drafts = 5, #gen tok
 statistics draft: #calls = 10, #gen drafts = 10, #acc drafts = 10, #gen tokens = 110, #acc tokens = 98
 ```
 
+```
+draft acceptance rate = 0.70312 ( 90 accepted / 128 generated)
+statistics ngram_mod: #calls = 810, #gen drafts = 15, #acc drafts = 15, #gen tokens = 960, #acc tokens = 730, dur(b,g,a) = 0.149, 0.347, 0.005 ms
+```
+
 - `#calls`: number of calls of this implementation
 - `#gen drafts`: number of drafts generated by this implementation
 - `#acc drafts`: number of drafts accepted (partially) by the main model
 - `#gen tokens`: number of tokens generated by this implementation (including rejected tokens)
 - `#acc tokens`: number of tokens accepted by the main model
+- `dur(b,g,a)`: total durations in ms of begin (new prompt), draft generation, and accept processing
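+
+In the `ngram_mod` line above, 810 calls produced 15 drafts, all of them at least partially accepted: 730 of 960 drafted tokens were accepted, while the implementation itself spent only about half a millisecond in total across the three phases.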