From 7f0fe9d1d5b1d78c4c21121065037999587a09e6 Mon Sep 17 00:00:00 2001 From: Saurabh Dash Date: Sun, 15 Feb 2026 18:02:51 +0000 Subject: [PATCH] add some comments for regex --- src/llama-vocab.cpp | 4 ++-- src/unicode.cpp | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index 3468a40d8b..b35cb02ce4 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -424,9 +424,9 @@ struct llm_tokenizer_bpe : llm_tokenizer { break; case LLAMA_VOCAB_PRE_TYPE_TINY_AYA: regex_exprs = { - // digit grouping split from tokenizer.json + // original regex from tokenizer.json: "\\d{1,3}(?=(?:\\d{3})*\\b)" "\\d{1,3}(?=(?:\\d{3})*\\b)", - // main split regex from tokenizer.json + // original regex from tokenizer.json: "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+" "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", }; break; diff --git a/src/unicode.cpp b/src/unicode.cpp index 0f27782846..1475b53b65 100644 --- a/src/unicode.cpp +++ b/src/unicode.cpp @@ -773,6 +773,7 @@ static std::vector unicode_regex_split_custom(const std::string & text, // tiny_aya digit grouping pattern from tokenizer.json: // {"type": "Split", "pattern": {"Regex": "\\d{1,3}(?=(?:\\d{3})*\\b)"}, "behavior": "Isolated"} // Splits digits into groups of 3 from the right (e.g., 1234567 -> 1, 234, 567) + // TODO: Revisit this regex, in case there are any subtle tokenization 
differences with the original regex. bpe_offsets = unicode_regex_split_custom_afmoe(text, offsets); }