Compare commits


4 Commits

Author SHA1 Message Date
jaime-m-p edf375d26f Restore BOM 2024-05-05 01:58:34 +02:00
jaime-m-p 67832e5554 llama3 custom regex split: fix \s 2024-05-05 01:20:23 +02:00
jaime-m-p 8fd849eb90 Unicode tables: separator, lowercase, uppercase and whitespace 2024-05-05 01:19:20 +02:00
jaime-m-p 69a49ac3a1 Fix merge 2024-05-05 00:42:44 +02:00
10 changed files with 1021 additions and 688 deletions
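The thread connecting these commits: the old `unicode_ranges_whitespace` table was generated from `\p{Z}`, the Unicode *separator* category, not from regex whitespace. The table and `CODEPOINT_TYPE_WHITESPACE` are renamed to `separator`, a true `\s`-based whitespace table is generated, and a `unicode_cpt_is_whitespace()` helper is added so the custom regex splits test `\s` where the regexes say `\s`. A minimal sketch of why the two classes differ, using the same third-party `regex` module as the generator script below:

```python
# Sketch: \p{Z} (separator category) vs \s (regex whitespace), using the
# third-party `regex` module that the generator script relies on.
import regex

# TAB is \s whitespace but not a \p{Z} separator ...
assert regex.match(r'\s', '\t') is not None
assert regex.match(r'\p{Z}', '\t') is None

# ... while NO-BREAK SPACE (U+00A0) is both.
assert regex.match(r'\s', '\u00A0') is not None
assert regex.match(r'\p{Z}', '\u00A0') is not None
```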

View File

@@ -12463,7 +12463,7 @@ struct llm_tokenizer_wpm {
                 continue;
             }
             code = unicode_tolower(code);
-            if (type == CODEPOINT_TYPE_WHITESPACE) {
+            if (type == CODEPOINT_TYPE_SEPARATOR) {
                 code = ' ';
             }
             std::string s = unicode_cpt_to_utf8(code);

Binary file not shown.

View File

@@ -1,31 +1,51 @@
 import regex
+import unicodedata
-def cpt_to_utf8_str(cpt):
-    if cpt <= 0xFF:
-        return bytes([cpt, 0, 0, 0])
-    elif cpt <= 0xFFFF:
-        return bytes([cpt & 0xFF, cpt >> 8, 0, 0])
-    elif cpt <= 0xFFFFFF:
-        return bytes([cpt & 0xFF, (cpt >> 8) & 0xFF, (cpt >> 16) & 0xFF, 0])
-    else:
-        return bytes([cpt & 0xFF, (cpt >> 8) & 0xFF, (cpt >> 16) & 0xFF, cpt >> 24])
+if False:
+    # This code is equivalent to: cpt.to_bytes(4, "little")
+    def cpt_to_utf8_str(cpt):
+        if cpt <= 0xFF:
+            return bytes([cpt, 0, 0, 0])
+        elif cpt <= 0xFFFF:
+            return bytes([cpt & 0xFF, cpt >> 8, 0, 0])
+        elif cpt <= 0xFFFFFF:
+            return bytes([cpt & 0xFF, (cpt >> 8) & 0xFF, (cpt >> 16) & 0xFF, 0])
+        else:
+            return bytes([cpt & 0xFF, (cpt >> 8) & 0xFF, (cpt >> 16) & 0xFF, cpt >> 24])
-def is_match(codepoint, regex_expr):
-    try:
-        res = regex.match(regex_expr, cpt_to_utf8_str(codepoint).decode('utf-32'))
-        return res is not None
-    except Exception:
-        return False
+    # This code is equivalent to: regex_expr_compiled.match(chr(codepoint))
+    def is_match(codepoint, regex_expr):
+        try:
+            res = regex_expr.match(cpt_to_utf8_str(codepoint).decode('utf-32'))
+            return res is not None
+        except Exception:
+            return False
+    # Verify previous statements, using chr() and ord()
+    for codepoint in range(0x110000):
+        temp = cpt_to_utf8_str(codepoint)
+        assert(temp == codepoint.to_bytes(4, "little"))
+        try:
+            char = temp.decode('utf-32')
+            if codepoint == 0xFEFF:  # BOM
+                assert(char == "")  # why?
+                char = "\uFEFF"
+        except UnicodeDecodeError:
+            continue
+        assert(char == chr(codepoint))
+        assert(ord(char) == codepoint)
 def get_matches(regex_expr):
+    regex_expr_compiled = regex.compile(regex_expr)
     unicode_ranges = []
     current_range = None
     for codepoint in range(0x110000):
-        if is_match(codepoint, regex_expr):
+        char = chr(codepoint)
+        if regex_expr_compiled.match(char):
             if current_range is None:
                 current_range = [codepoint, codepoint]
             else:
@@ -40,27 +60,54 @@ def get_matches(regex_expr):
     return unicode_ranges
-def print_cat(cat, ranges):
-    print("const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_{} = {{".format(cat))
-    cnt = 0
-    for start, end in ranges:
-        if cnt % 4 != 0:
-            print(" ", end="")
-        print("{{0x{:08X}, 0x{:08X}}},".format(start, end), end="")
-        if cnt % 4 == 3:
-            print("")
-        cnt += 1
-    if cnt % 4 != 0:
-        print("")
+def print_cat(mode, cat, ranges):
+    if mode == "range":
+        print("const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_{} = {{".format(cat))
+    if mode == "range_value":
+        print("const std::vector<std::tuple<uint32_t, uint32_t, uint32_t>> unicode_ranges_{} = {{".format(cat))
+    if mode == "map":
+        print("const std::map<uint32_t, uint32_t> unicode_map_{} = {{".format(cat))
+    for i, values in enumerate(ranges):
+        end = ",\n" if (i % 4 == 3 or i + 1 == len(ranges)) else ", "
+        values = ["0x%08X" % value for value in values]
+        print("{" + ", ".join(values) + "}", end=end)
     print("};")
     print("")
-print_cat("number", get_matches(r'\p{N}'))
-print_cat("letter", get_matches(r'\p{L}'))
-print_cat("whitespace", get_matches(r'\p{Z}'))
-print_cat("accent_mark", get_matches(r'\p{M}'))
-print_cat("punctuation", get_matches(r'\p{P}'))
-print_cat("symbol", get_matches(r'\p{S}'))
-print_cat("control", get_matches(r'\p{C}'))
+print_cat("range", "number", get_matches(r'\p{N}'))
+print_cat("range", "letter", get_matches(r'\p{L}'))
+print_cat("range", "separator", get_matches(r'\p{Z}'))
+print_cat("range", "accent_mark", get_matches(r'\p{M}'))
+print_cat("range", "punctuation", get_matches(r'\p{P}'))
+print_cat("range", "symbol", get_matches(r'\p{S}'))
+print_cat("range", "control", get_matches(r'\p{C}'))
+print_cat("range", "whitespace", get_matches(r'\s'))
+map_lowercase = []
+map_uppercase = []
+for codepoint in range(0x110000):
+    char = chr(codepoint)
+    lower = ord(char.lower()[0])
+    upper = ord(char.upper()[0])
+    if codepoint != lower:
+        map_lowercase.append((codepoint, lower))
+    if codepoint != upper:
+        map_uppercase.append((codepoint, upper))
+print_cat("map", "lowercase", map_lowercase)
+print_cat("map", "uppercase", map_uppercase)
+inv_map_nfd = {}
+for codepoint in range(0x110000):
+    char = chr(codepoint)
+    norm = ord(unicodedata.normalize('NFD', char)[0])
+    if codepoint != norm:
+        a, b = inv_map_nfd.get(norm, (codepoint, codepoint))
+        inv_map_nfd[norm] = (min(a, codepoint), max(b, codepoint))
+nfd_ranges = [(a, b, nfd) for nfd, (a, b) in inv_map_nfd.items()]
+nfd_ranges = list(sorted(nfd_ranges))
+del inv_map_nfd
+print_cat("range_value", "nfd", nfd_ranges)

View File

@@ -1,117 +0,0 @@
# tests with BPE tokenizer
#
# sample usage:
#
# python3 tests/test-tokenizer-0-bpe.py ~/Data/huggingface/Meta-Llama-3-8B-Instruct/
# python3 tests/test-tokenizer-0-bpe.py ~/Data/huggingface/falcon-7b/
# python3 tests/test-tokenizer-0-bpe.py ~/Data/huggingface/deepseek-coder-6.7b-instruct/
#
import argparse
from transformers import AutoTokenizer
parser = argparse.ArgumentParser()
parser.add_argument("dir_tokenizer", help="directory containing 'tokenizer.model' file")
parser.add_argument("--fname-tok", help="path to a text file to tokenize")
args = parser.parse_args()
dir_tokenizer = args.dir_tokenizer
tokenizer = AutoTokenizer.from_pretrained(dir_tokenizer)
tests = [
"",
" ",
" ",
" ",
"\t",
"\n",
"\n\n",
"\n\n\n",
"\t\n",
"Hello world",
" Hello world",
"Hello World",
" Hello World",
" Hello World!",
"Hello, world!",
" Hello, world!",
" this is 🦙.cpp",
"w048 7tuijk dsdfhu",
"нещо на Български",
"កាន់តែពិសេសអាចខលចេញ",
"🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)",
"Hello",
" Hello",
" Hello",
" Hello",
" Hello",
" Hello\n Hello",
" (",
"\n =",
"' era",
"Hello, y'all! How are you 😁 ?我想在apple工作1314151天",
"3",
"33",
"333",
"3333",
"33333",
"333333",
"3333333",
"33333333",
"333333333",
]
for text in tests:
    print('text: ', text)
    print(tokenizer.encode(text))
    print(tokenizer.decode(tokenizer.encode(text)))
print("\n\ntests for C++:\n")
for text in tests:
    res = tokenizer.encode(text)
    k = text.replace('\n', '\\n')
    k = k.replace('\t', '\\t')
    k = '"' + k + '"'
    print("{ %-24s, { " % k, end='')
    for x in res:
        print("%7d," % x, end='')
    print(" }, },")
print(tokenizer.encode('hello'))
print(tokenizer.encode('world'))
print(tokenizer.encode(' world'))
print(tokenizer.encode('hello world'))
fname_tok = args.fname_tok
if fname_tok:
    print('tokenizing file: ', fname_tok)
    fname_out = fname_tok + '.tok'
    with open(fname_tok, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    s = ''.join(lines)
    res = tokenizer.encode(s)
    # write to file
    with open(fname_out, 'w', encoding='utf-8') as f:
        for x in res:
            # LLaMA v3 for some reason strips the space for these tokens (and others)
            # if x == 662:
            #     f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
            # elif x == 1174:
            #     f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
            # elif x == 2564:
            #     f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
            # elif x == 758:
            #     f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
            # elif x == 949:
            #     f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
            # elif x == 5354:
            #     f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
            # else:
            #     f.write(str(x) + ' \'' + tokenizer.decode(x) + '\'\n')
            f.write(str(x) + ' \'' + tokenizer.decode(x).strip() + '\'\n')
    print('len(res): ', len(res))
    print('len(lines): ', len(lines))
    print('results written to: ', fname_out)

View File

@@ -1,114 +0,0 @@
# tests with SPM tokenizer
#
# sample usage:
#
# python3 tests/test-tokenizer-0-spm.py ~/Data/huggingface/Llama-2-7b-hf/
# python3 tests/test-tokenizer-0-spm.py ~/Data/huggingface/CodeLlama-34b-Instruct-hf/
#
import argparse
from sentencepiece import SentencePieceProcessor
parser = argparse.ArgumentParser()
parser.add_argument("dir_tokenizer", help="directory containing 'tokenizer.model' file")
parser.add_argument("--fname-tok", help="path to a text file to tokenize")
args = parser.parse_args()
dir_tokenizer = args.dir_tokenizer
tokenizer = SentencePieceProcessor(dir_tokenizer + '/tokenizer.model')
tests = [
"",
" ",
" ",
" ",
"\t",
"\n",
"\n\n",
"\n\n\n",
"\t\n",
"Hello world",
" Hello world",
"Hello World",
" Hello World",
" Hello World!",
"Hello, world!",
" Hello, world!",
" this is 🦙.cpp",
"w048 7tuijk dsdfhu",
"нещо на Български",
"កាន់តែពិសេសអាចខលចេញ",
"🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)",
"Hello",
" Hello",
" Hello",
" Hello",
" Hello",
" Hello\n Hello",
" (",
"\n =",
"' era",
"Hello, y'all! How are you 😁 ?我想在apple工作1314151天",
"3",
"33",
"333",
"3333",
"33333",
"333333",
"3333333",
"33333333",
"333333333",
]
for text in tests:
    print('text: ', text)
    print('\nwith bos:')
    print(tokenizer.encode(text, add_bos=True))
    print(tokenizer.decode(tokenizer.encode(text, add_bos=True)))
    print('\nwithout bos:')
    print(tokenizer.encode(text, add_bos=False))
    print(tokenizer.decode(tokenizer.encode(text, add_bos=False)))
print("'" + tokenizer.id_to_piece(15043) + "'") # '_Hello'
print("'" + tokenizer.id_to_piece(29871) + "'") # '_'
print("'" + tokenizer.decode([15043]) + "'") # 'Hello'
print("'" + tokenizer.decode([15043, 15043]) + "'") # 'Hello Hello'
print("'" + tokenizer.decode([29871, 15043]) + "'") # ' Hello'
print("'" + tokenizer.decode([29871, 15043, 29871, 15043]) + "'") # ' Hello Hello'
print("\n\ntests for C++:\n")
for text in tests:
    res = tokenizer.encode(text, add_bos=False)
    k = text.replace('\n', '\\n')
    k = k.replace('\t', '\\t')
    k = '"' + k + '"'
    print("{ %-24s, { " % k, end='')
    for x in res:
        print("%7d," % x, end='')
    print(" }, },")
print(tokenizer.encode('hello'))
print(tokenizer.encode('world'))
print(tokenizer.encode(' world'))
print(tokenizer.encode('hello world'))
fname_tok = args.fname_tok
if fname_tok:
    print('tokenizing file: ', fname_tok)
    fname_out = fname_tok + '.tok'
    with open(fname_tok, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    s = ''.join(lines)
    res = tokenizer.encode(s, add_bos=True)
    # write to file
    with open(fname_out, 'w', encoding='utf-8') as f:
        for x in res:
            f.write(str(x) + ' \'' + tokenizer.decode(x) + '\'\n')
    print('len(res): ', len(res))
    print('len(lines): ', len(lines))
    print('results written to: ', fname_out)

View File

@@ -5,13 +5,16 @@
 # python3 tests/test-tokenizer-0-bpe.py ./models/ggml-vocab-llama-bpe.gguf ~/Data/huggingface/Meta-Llama-3-8B-Instruct/
 #
-import random
+import logging
 import argparse
 import subprocess
+import random
 import cffi
 from transformers import AutoTokenizer, PreTrainedTokenizerBase
+logger = logging.getLogger("test-tokenizer-random-bpe")
class LibLlama:
@@ -141,22 +144,23 @@ def test_custom_texts(model:LibLlamaModel, tokenizer:PreTrainedTokenizerBase):
     ]
     more_tests = [
         '\x1f-a',   # unicode_ranges_control, {0x00001C, 0x00001F}
         '¼-a',      # unicode_ranges_digit, 0x00BC
         '½-a',      # unicode_ranges_digit, 0x00BD
         '¾-a',      # unicode_ranges_digit, 0x00BE
         'a 〇b',    # unicode_ranges_digit, 0x3007
         'Ⅵ-a',     # unicode_ranges_digit, {0x00002150, 0x0000218F} // Number Forms
+        '\uFEFF//', # unicode_ranges_control, 0xFEFF (BOM)
     ]
     for text in tests+more_tests:
         ids1 = model.tokenize(text, parse_special=True)
         ids2 = tokenizer.encode(text)
-        print(repr(text))
+        logger.info(repr(text))
         if ids1 != ids2:
-            print(" TokenIDs:", list(ids1))
-            print(" Expected:", list(ids2))
-            print(" Index:", find_first_mismatch(ids1, ids2) )
+            logger.info(" TokenIDs: " + str(list(ids1)))
+            logger.info(" Expected: " + str(list(ids2)))
+            logger.info(" Index: %d" % find_first_mismatch(ids1, ids2))
             raise Exception()
@@ -171,11 +175,11 @@ def test_random_chars(model:LibLlamaModel, tokenizer:PreTrainedTokenizerBase, iterations=100):
         .-,*/-+ª!"·$%&/()=?¿[]{}<>\\|@#~½¬~;:_
         """))
-    print( "Bruteforce random chars encodings ..." )
+    logger.info("Bruteforce random chars encodings ...")
     rand = random.Random()
     for m in range(iterations):
-        print(m)
+        logger.debug("%d/%d" % (m+1,iterations))
         rand.seed(m)
         text = []
@@ -194,17 +198,17 @@ def test_random_vocab_chars(model:LibLlamaModel, tokenizer:PreTrainedTokenizerBase, iterations=100):
 def test_random_vocab_chars(model:LibLlamaModel, tokenizer:PreTrainedTokenizerBase, iterations=100):
-    print( "Building vocab char list ..." )
+    logger.info("Building vocab char list ...")
     vocab_ids = list(tokenizer.vocab.values())
     vocab_text = tokenizer.decode(vocab_ids)
     vocab_chars = list(set(vocab_text))
     del vocab_ids, vocab_text
-    print( "Bruteforce random text encodings ..." )
+    logger.info("Bruteforce random text encodings ...")
     rand = random.Random()
     for m in range(iterations):
-        print(m)
+        logger.debug("%d/%d" % (m+1,iterations))
         rand.seed(m)
         text = rand.choices(vocab_chars, k=1024)
@@ -212,12 +216,12 @@ def test_random_vocab_chars(model:LibLlamaModel, tokenizer:PreTrainedTokenizerBase, iterations=100):
         ids1 = model.tokenize(text, parse_special=True)
         ids2 = tokenizer.encode(text)
-        assert( ids1 == ids2 )
+        assert(ids1 == ids2)
 def test_random_vocab_tokens(model:LibLlamaModel, tokenizer:PreTrainedTokenizerBase, iterations=100):
-    print( "Building token list ..." )
+    logger.info("Building token list ...")
     space_id = tokenizer.encode(" ")[0]
     vocab_ids = list(tokenizer.vocab.values())
     vocab_ids = list(sorted(vocab_ids + vocab_ids))
@@ -227,17 +231,17 @@ def test_random_vocab_tokens(model:LibLlamaModel, tokenizer:PreTrainedTokenizerBase, iterations=100):
     vocab_tokens = vocab_tokens.split(" ")
     del vocab_ids
-    print( "Checking single token encodings ..." )
+    logger.info("Checking single token encodings ...")
     for token in vocab_tokens:
         ids1 = model.tokenize(token, parse_special=True)
         ids2 = tokenizer.encode(token)
         assert(ids1 == ids2)
-    print( "Bruteforce random text encodings ..." )
+    logger.info("Bruteforce random text encodings ...")
     rand = random.Random()
     for m in range(iterations):
-        print(m)
+        logger.debug("%d/%d" % (m+1,iterations))
         rand.seed(m)
         text = []
@@ -252,18 +256,18 @@ def test_random_vocab_tokens(model:LibLlamaModel, tokenizer:PreTrainedTokenizerBase, iterations=100):
         ids1 = model.tokenize(text, parse_special=True)
         ids2 = tokenizer.encode(text)
-        assert( ids1 == ids2 )
+        assert(ids1 == ids2)
 def test_random_bytes(model:LibLlamaModel, tokenizer:PreTrainedTokenizerBase, iterations=100):
     WHITESPACES = list(" "*20 + "\n"*5 + "\r\n"*5 + "\t"*5)
-    print( "Bruteforce random bytes encodings ..." )
+    logger.info("Bruteforce random bytes encodings ...")
     rand = random.Random()
     for m in range(iterations):
-        print(m)
+        logger.debug("%d/%d" % (m+1,iterations))
         rand.seed(m)
         text = []
@@ -285,8 +289,11 @@ if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("vocab_file", help="path to vocab 'gguf' file")
     parser.add_argument("dir_tokenizer", help="directory containing 'tokenizer.model' file")
+    parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
     args = parser.parse_args()
+    logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
     model = LibLlamaModel(LibLlama(), args.vocab_file, mparams=dict(vocab_only=True), cparams=dict(n_ctx=2048))
     tokenizer = AutoTokenizer.from_pretrained(args.dir_tokenizer)
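The test script's bare `print()` calls move to the `logging` module: per-iteration progress is logged at DEBUG, so it only appears when the new `--verbose` flag selects `logging.DEBUG`. A standalone sketch of that gating:

```python
# Standalone sketch of the logging gate introduced above.
import logging

logging.basicConfig(level=logging.INFO)  # default level; --verbose selects DEBUG
logger = logging.getLogger("test-tokenizer-random-bpe")

logger.debug("1/100")                                 # hidden at INFO level
logger.info("Bruteforce random chars encodings ...")  # always shown
```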

File diff suppressed because it is too large.

View File

@@ -7,6 +7,7 @@
 extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_number;
 extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_letter;
+extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_separator;
 extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_whitespace;
 extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_accent_mark;
 extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_punctuation;

View File

@@ -9,6 +9,7 @@
 #include <stdexcept>
 #include <string>
 #include <unordered_map>
+#include <unordered_set>
 #include <utility>
 #include <vector>
 #include <locale>
@@ -120,9 +121,9 @@ static std::unordered_map<uint32_t, int> unicode_cpt_type_map() {
             cpt_types[i] = CODEPOINT_TYPE_LETTER;
         }
     }
-    for (auto p : unicode_ranges_whitespace) {
+    for (auto p : unicode_ranges_separator) {
         for (auto i = p.first; i <= p.second; ++ i) {
-            cpt_types[i] = CODEPOINT_TYPE_WHITESPACE;
+            cpt_types[i] = CODEPOINT_TYPE_SEPARATOR;
         }
     }
     for (auto p : unicode_ranges_accent_mark) {
@@ -280,6 +281,7 @@ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & t
             }
         }
         char32_t cpt2 = (cpt == ' ' ? _get_cpt(pos+1) : cpt);
+        int cpt2_type = (cpt == ' ' ? _get_cpt_type(pos+1) : cpt_type);
         // regex: <space>?\p{L}+
         if (cpt2_type == CODEPOINT_TYPE_LETTER) {
@@ -300,17 +302,18 @@ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & t
             continue;
         }
         // regex: <space>?[^\s\p{L}\p{N}]+
-        if (cpt2_type != CODEPOINT_TYPE_WHITESPACE && cpt2_type != CODEPOINT_TYPE_LETTER && cpt2_type != CODEPOINT_TYPE_NUMBER && cpt2_type != CODEPOINT_TYPE_UNIDENTIFIED) {
+        if (!unicode_cpt_is_whitespace(cpt2) && cpt2_type != CODEPOINT_TYPE_LETTER && cpt2_type != CODEPOINT_TYPE_NUMBER && cpt2_type != CODEPOINT_TYPE_UNIDENTIFIED) {
             pos += (cpt == ' ');
-            while (cpt2_type != CODEPOINT_TYPE_WHITESPACE && cpt2_type != CODEPOINT_TYPE_LETTER && cpt2_type != CODEPOINT_TYPE_NUMBER && cpt2_type != CODEPOINT_TYPE_UNIDENTIFIED) {
+            while (!unicode_cpt_is_whitespace(cpt2) && cpt2_type != CODEPOINT_TYPE_LETTER && cpt2_type != CODEPOINT_TYPE_NUMBER && cpt2_type != CODEPOINT_TYPE_UNIDENTIFIED) {
                 cpt2_type = _get_cpt_type(++pos);
                 cpt2 = _get_cpt(pos);
             }
             _add_token(pos);
             continue;
         }
         size_t num_whitespaces = 0;
-        while (_get_cpt_type(pos+num_whitespaces) == CODEPOINT_TYPE_WHITESPACE) {
+        while (unicode_cpt_is_whitespace(_get_cpt(pos+num_whitespaces))) {
             num_whitespaces++;
         }
@@ -423,13 +426,14 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
         }
         // regex: <space>?[^\s\p{L}\p{N}]+[\r\n]*
         char32_t cpt2 = (cpt == ' ' ? _get_cpt(pos+1) : cpt);
+        int cpt2_type = (cpt == ' ' ? _get_cpt_type(pos+1) : cpt_type);
-        if (cpt2_type != CODEPOINT_TYPE_WHITESPACE && cpt2_type != CODEPOINT_TYPE_LETTER && cpt2_type != CODEPOINT_TYPE_NUMBER && cpt2_type != CODEPOINT_TYPE_UNIDENTIFIED) {
+        if (!unicode_cpt_is_whitespace(cpt2) && cpt2_type != CODEPOINT_TYPE_LETTER && cpt2_type != CODEPOINT_TYPE_NUMBER && cpt2_type != CODEPOINT_TYPE_UNIDENTIFIED) {
             pos += (cpt == ' ');
-            while (cpt2_type != CODEPOINT_TYPE_WHITESPACE && cpt2_type != CODEPOINT_TYPE_LETTER && cpt2_type != CODEPOINT_TYPE_NUMBER && cpt2_type != CODEPOINT_TYPE_UNIDENTIFIED) {
+            while (!unicode_cpt_is_whitespace(cpt2) && cpt2_type != CODEPOINT_TYPE_LETTER && cpt2_type != CODEPOINT_TYPE_NUMBER && cpt2_type != CODEPOINT_TYPE_UNIDENTIFIED) {
                 cpt2_type = _get_cpt_type(++pos);
                 cpt2 = _get_cpt(pos);
             }
-            char32_t cpt2 = _get_cpt(pos);
             while (cpt2 == '\r' || cpt2 == '\n') {
                 cpt2 = _get_cpt(++pos);
             }
@@ -439,7 +443,7 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
         size_t num_whitespaces = 0;
         size_t last_end_r_or_n = 0;
-        while (_get_cpt_type(pos+num_whitespaces) == CODEPOINT_TYPE_WHITESPACE) {
+        while (unicode_cpt_is_whitespace(_get_cpt(pos+num_whitespaces))) {
            char32_t cpt2 = _get_cpt(pos+num_whitespaces);
             if (cpt2 == '\r' || cpt2 == '\n') {
                 last_end_r_or_n = pos + num_whitespaces + 1;
@@ -621,6 +625,19 @@ int unicode_cpt_type(const std::string & utf8) {
     return unicode_cpt_type(unicode_cpt_from_utf8(utf8, offset));
 }
+bool unicode_cpt_is_whitespace(uint32_t cp) {
+    static const std::unordered_set<uint32_t> is_whitespace = [] {
+        std::unordered_set<uint32_t> is_whitespace;
+        for (auto p : unicode_ranges_whitespace) {
+            for (auto i = p.first; i <= p.second; ++ i) {
+                is_whitespace.insert(i);
+            }
+        }
+        return is_whitespace;
+    }();
+    return (bool)is_whitespace.count(cp);
+}
 std::string unicode_byte_to_utf8(uint8_t byte) {
     static std::unordered_map<uint8_t, std::string> map = unicode_byte_to_utf8_map();
     return map.at(byte);
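`unicode_cpt_is_whitespace()` expands the generated `\s` ranges into an `unordered_set` once, on first use, via an immediately-invoked lambda, then answers membership queries in O(1). The same lookup sketched in Python; the two ranges shown are only an excerpt of the generated table:

```python
# Python sketch of the set-based lookup above (ranges are inclusive; the two
# ranges listed here are an illustrative excerpt, not the full table).
unicode_ranges_whitespace = [(0x0009, 0x000D), (0x001C, 0x0020)]

_whitespace = {cpt for first, last in unicode_ranges_whitespace
                   for cpt in range(first, last + 1)}

def unicode_cpt_is_whitespace(cpt: int) -> bool:
    return cpt in _whitespace

assert unicode_cpt_is_whitespace(0x0009)      # TAB
assert not unicode_cpt_is_whitespace(0x0041)  # 'A'
```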

View File

@@ -7,7 +7,7 @@
 #define CODEPOINT_TYPE_UNIDENTIFIED 0
 #define CODEPOINT_TYPE_NUMBER 1
 #define CODEPOINT_TYPE_LETTER 2
-#define CODEPOINT_TYPE_WHITESPACE 3
+#define CODEPOINT_TYPE_SEPARATOR 3
 #define CODEPOINT_TYPE_ACCENT_MARK 4
 #define CODEPOINT_TYPE_PUNCTUATION 5
 #define CODEPOINT_TYPE_SYMBOL 6
@@ -21,6 +21,8 @@ std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & c
 int unicode_cpt_type(uint32_t cp);
 int unicode_cpt_type(const std::string & utf8);
+bool unicode_cpt_is_whitespace(uint32_t cp);
 std::string unicode_byte_to_utf8(uint8_t byte);
 uint8_t unicode_utf8_to_byte(const std::string & utf8);