#pragma once // // common/ngram-map.h: structures used to manage a map from n-grams to a list of m-grams // // These structures are used to do a lookup of n-grams followed by m-grams in token history. // // There are two algorithms implemented: // 1. ngram_simple: lookup of n-grams followed by m-grams in token history. // 2. ngram_map: lookup of n-grams followed by m-grams in token history using a map. // The map is a vector of key n-grams, and for each key n-gram there is a list of value m-grams. // #include "llama.h" #include // n-gram simple // // config of n-gram simple. struct common_ngram_simple_config { uint16_t size_ngram; // size of n-grams to lookup in self-mode uint16_t size_mgram; // size of m-grams to draft in self-mode uint16_t check_rate; // check for speculative decoding without draft model for each check_rate token }; // current state (and config) of n-gram simple. struct common_ngram_simple_state { common_ngram_simple_config config; size_t idx_last_check = 0; // index of last check in context history (mutable) common_ngram_simple_state(const common_ngram_simple_config & config) : config(config) {} }; // Searches for a n-gram in the history and checks whether a draft sequence should be generated. // state: the ngram simple state to search in. // inp: the tokens generated so far. // sampled: the token that was just sampled. // draft: vector to store the draft tokens, initially empty. llama_tokens common_ngram_simple_draft( common_ngram_simple_state & state, const llama_tokens & tokens, llama_token sampled); // n-gram map // // maximum number of m-gram values stored for each key n-gram. #define COMMON_NGRAM_MAX_VALUES 4 // statistics of a m-gram after a known n-gram struct common_ngram_map_value { size_t value_idx = 0; // index of value m-gram in token-history (0 if unused) uint16_t value_num = 0; // number of occurences of this value m-gram after the key n-gram (0 in an unused values-slot) int16_t n_accepted = -1; // number of accepted tokens at last draft (-1 if unused) }; // statistics of a n-gram struct common_ngram_map_key { size_t key_idx; // index of key n-gram in token-history size_t stat_idx; // index of last token of stastistics computation (key_num, values) uint16_t key_num; // number of occurences of this key n-gram in token-history common_ngram_map_value values[COMMON_NGRAM_MAX_VALUES]; // some known values after the key }; // map from n-grams to following m-grams in token-history struct common_ngram_map { uint16_t size_key; // size of key n-grams uint16_t size_value; // size of value m-grams bool key_only; // true if only key n-grams are used, no values. // first draft: vector only, no map. std::vector keys; // key n-grams which occur several times in token-history uint16_t check_rate; // check for speculative decoding without draft model for each check_rate token uint16_t min_hits; // minimum number of key hits to consider a draft common_ngram_map(uint16_t sz_key, uint16_t sz_value, bool only_keys, uint16_t check_rate, uint16_t min_hits) : size_key(sz_key), size_value(sz_value), key_only(only_keys), check_rate(check_rate), min_hits(min_hits) {} bool last_draft_created = false; // true if a draft was created at last call. size_t last_draft_key_idx = 0; // index of last key used for draft generation. uint16_t last_draft_value_idx = 0; // index of last value used for draft generation. size_t idx_last_check = 0; // index of last check in context history }; // Searches for the n-gram in the history and checks whether a draft sequence should be generated. // map: the ngram map to search in. // inp: the tokens generated so far. // sampled: the token that was just sampled. // draft: vector to store the draft tokens, initially empty. void common_ngram_map_draft( common_ngram_map & map, const llama_tokens & inp, llama_token sampled, llama_tokens & draft); // Update the statistics of a value after a draft was processed. void common_ngram_map_accept(common_ngram_map & map, uint16_t n_accepted);