From 7f0fe9d1d5b1d78c4c21121065037999587a09e6 Mon Sep 17 00:00:00 2001
From: Saurabh Dash <saurabh@cohere.com>
Date: Sun, 15 Feb 2026 18:02:51 +0000
Subject: [PATCH] add some comments for regex

---
 src/llama-vocab.cpp | 4 ++--
 src/unicode.cpp     | 1 +
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index 3468a40d8b..b35cb02ce4 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -424,9 +424,9 @@ struct llm_tokenizer_bpe : llm_tokenizer {
                 break;
             case LLAMA_VOCAB_PRE_TYPE_TINY_AYA:
                 regex_exprs = {
-                    // digit grouping split from tokenizer.json
+                    // original regex from tokenizer.json: "\\d{1,3}(?=(?:\\d{3})*\\b)"
                     "\\d{1,3}(?=(?:\\d{3})*\\b)",
-                    // main split regex from tokenizer.json
+                    // original regex from tokenizer.json: "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
                     "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
                 };
                 break;
diff --git a/src/unicode.cpp b/src/unicode.cpp
index 0f27782846..1475b53b65 100644
--- a/src/unicode.cpp
+++ b/src/unicode.cpp
@@ -773,6 +773,7 @@ static std::vector<size_t> unicode_regex_split_custom(const std::string & text,
         // tiny_aya digit grouping pattern from tokenizer.json:
         //   {"type": "Split", "pattern": {"Regex": "\\d{1,3}(?=(?:\\d{3})*\\b)"}, "behavior": "Isolated"}
         // Splits digits into groups of 3 from the right (e.g., 1234567 -> 1, 234, 567)
+        // TODO: Revisit this regex, incase there are any subtle tokenization differences with the original regex.
         bpe_offsets = unicode_regex_split_custom_afmoe(text, offsets);
     }