Compare commits

4 Commits: 798b576c06 ... edf375d26f
| Author | SHA1 | Date |
|---|---|---|
|  | edf375d26f |  |
|  | 67832e5554 |  |
|  | 8fd849eb90 |  |
|  | 69a49ac3a1 |  |
```diff
@@ -12463,7 +12463,7 @@ struct llm_tokenizer_wpm {
                 continue;
             }
             code = unicode_tolower(code);
-            if (type == CODEPOINT_TYPE_WHITESPACE) {
+            if (type == CODEPOINT_TYPE_SEPARATOR) {
                 code = ' ';
             }
             std::string s = unicode_cpt_to_utf8(code);
```
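The hunk above renames CODEPOINT_TYPE_WHITESPACE to CODEPOINT_TYPE_SEPARATOR because the generated `\p{Z}` ranges cover Unicode separators, not the larger whitespace class `\s`. A minimal sketch (not part of the diff) of the distinction, using the same Python `regex` module as the generator script below:

```python
# Sketch: \p{Z} (separator) and \s (whitespace) are different Unicode classes;
# TAB and LF are whitespace but are control characters, not separators.
import regex

assert regex.match(r'\p{Z}', ' ')       # SPACE is a separator...
assert regex.match(r'\s', ' ')          # ...and also whitespace
assert regex.match(r'\s', '\t')         # TAB is whitespace...
assert not regex.match(r'\p{Z}', '\t')  # ...but not a separator
assert not regex.match(r'\p{Z}', '\n')  # same for LF
```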
Binary file not shown.
```diff
@@ -1,31 +1,51 @@
 import regex
+import unicodedata
 
 
-def cpt_to_utf8_str(cpt):
-    if cpt <= 0xFF:
-        return bytes([cpt, 0, 0, 0])
-    elif cpt <= 0xFFFF:
-        return bytes([cpt & 0xFF, cpt >> 8, 0, 0])
-    elif cpt <= 0xFFFFFF:
-        return bytes([cpt & 0xFF, (cpt >> 8) & 0xFF, (cpt >> 16) & 0xFF, 0])
-    else:
-        return bytes([cpt & 0xFF, (cpt >> 8) & 0xFF, (cpt >> 16) & 0xFF, cpt >> 24])
-
-
-def is_match(codepoint, regex_expr):
-    try:
-        res = regex.match(regex_expr, cpt_to_utf8_str(codepoint).decode('utf-32'))
-        return res is not None
-    except Exception:
-        return False
+if False:
+
+    # This code is equivalent to: cpt.to_bytes(4, "little"))
+    def cpt_to_utf8_str(cpt):
+        if cpt <= 0xFF:
+            return bytes([cpt, 0, 0, 0])
+        elif cpt <= 0xFFFF:
+            return bytes([cpt & 0xFF, cpt >> 8, 0, 0])
+        elif cpt <= 0xFFFFFF:
+            return bytes([cpt & 0xFF, (cpt >> 8) & 0xFF, (cpt >> 16) & 0xFF, 0])
+        else:
+            return bytes([cpt & 0xFF, (cpt >> 8) & 0xFF, (cpt >> 16) & 0xFF, cpt >> 24])
+
+    # This code is equivalent to: regex_expr_compiled.match(chr(codepoint))
+    def is_match(codepoint, regex_expr):
+        try:
+            res = regex_expr.match(cpt_to_utf8_str(codepoint).decode('utf-32'))
+            return res is not None
+        except Exception:
+            return False
+
+    # Verify previous statements, using chr() and ord()
+    for codepoint in range(0x110000):
+        temp = cpt_to_utf8_str(codepoint)
+        assert(temp == codepoint.to_bytes(4, "little"))
+        try:
+            char = temp.decode('utf-32')
+            if codepoint == 0xFEFF:  # BOM
+                assert(char == "")   # why?
+                char = "\uFEFF"
+        except UnicodeDecodeError:
+            continue
+        assert(char == chr(codepoint))
+        assert(ord(char) == codepoint)
 
 
 def get_matches(regex_expr):
+    regex_expr_compiled = regex.compile(regex_expr)
     unicode_ranges = []
     current_range = None
 
     for codepoint in range(0x110000):
-        if is_match(codepoint, regex_expr):
+        char = chr(codepoint)
+        if regex_expr_compiled.match(char):
             if current_range is None:
                 current_range = [codepoint, codepoint]
             else:
```
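The `if False:` block keeps the old helpers only to document their equivalence with the built-ins that replace them. A standalone sketch of that equivalence (assuming a little-endian machine, which the BOM-less decode('utf-32') trick in the original also relies on):

```python
# Sketch: packing a codepoint into 4 little-endian bytes and decoding them as
# UTF-32 round-trips through chr() -- the trick cpt_to_utf8_str() implemented.
codepoint = 0x1F999                            # 🦙
raw = codepoint.to_bytes(4, "little")          # b'\x99\xf9\x01\x00'
assert raw.decode('utf-32') == chr(codepoint)  # BOM-less UTF-32 uses native order
```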
```diff
@@ -40,27 +60,54 @@ def get_matches(regex_expr):
     return unicode_ranges
 
 
-def print_cat(cat, ranges):
-    print("const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_{} = {{".format(cat))
-    cnt = 0
-    for start, end in ranges:
-        if cnt % 4 != 0:
-            print(" ", end="")
-        print("{{0x{:08X}, 0x{:08X}}},".format(start, end), end="")
-        if cnt % 4 == 3:
-            print("")
-        cnt += 1
-
-    if cnt % 4 != 0:
-        print("")
+def print_cat(mode, cat, ranges):
+    if mode == "range":
+        print("const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_{} = {{".format(cat))
+    if mode == "range_value":
+        print("const std::vector<std::tuple<uint32_t, uint32_t, uint32_t>> unicode_ranges_{} = {{".format(cat))
+    if mode == "map":
+        print("const std::map<uint32_t, uint32_t> unicode_map_{} = {{".format(cat))
+    for i, values in enumerate(ranges):
+        end = ",\n" if (i % 4 == 3 or i + 1 == len(ranges)) else ", "
+        values = ["0x%08X" % value for value in values]
+        print("{" + ", ".join(values) + "}", end=end)
     print("};")
     print("")
 
 
-print_cat("number", get_matches(r'\p{N}'))
-print_cat("letter", get_matches(r'\p{L}'))
-print_cat("whitespace", get_matches(r'\p{Z}'))
-print_cat("accent_mark", get_matches(r'\p{M}'))
-print_cat("punctuation", get_matches(r'\p{P}'))
-print_cat("symbol", get_matches(r'\p{S}'))
-print_cat("control", get_matches(r'\p{C}'))
+print_cat("range", "number", get_matches(r'\p{N}'))
+print_cat("range", "letter", get_matches(r'\p{L}'))
+print_cat("range", "separator", get_matches(r'\p{Z}'))
+print_cat("range", "accent_mark", get_matches(r'\p{M}'))
+print_cat("range", "punctuation", get_matches(r'\p{P}'))
+print_cat("range", "symbol", get_matches(r'\p{S}'))
+print_cat("range", "control", get_matches(r'\p{C}'))
+
+print_cat("range", "whitespace", get_matches(r'\s'))
+
+
+map_lowercase = []
+map_uppercase = []
+for codepoint in range(0x110000):
+    char = chr(codepoint)
+    lower = ord(char.lower()[0])
+    upper = ord(char.upper()[0])
+    if codepoint != lower:
+        map_lowercase.append((codepoint, lower))
+    if codepoint != upper:
+        map_uppercase.append((codepoint, upper))
+print_cat("map", "lowercase", map_lowercase)
+print_cat("map", "uppercase", map_uppercase)
+
+
+inv_map_nfd = {}
+for codepoint in range(0x110000):
+    char = chr(codepoint)
+    norm = ord(unicodedata.normalize('NFD', char)[0])
+    if codepoint != norm:
+        a, b = inv_map_nfd.get(norm, (codepoint, codepoint))
+        inv_map_nfd[norm] = (min(a, codepoint), max(b, codepoint))
+nfd_ranges = [(a, b, nfd) for nfd, (a, b) in inv_map_nfd.items()]
+nfd_ranges = list(sorted(nfd_ranges))
+del inv_map_nfd
+print_cat("range_value", "nfd", nfd_ranges)
```
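The new inv_map_nfd loop relies on NFD normalization placing the base character first in a decomposition, so ord(...[0]) picks out the base codepoint. A small sketch (not from the diff) of that behavior:

```python
# Sketch: NFD splits a precomposed character into base + combining marks;
# the base codepoint is what the nfd map stores as its value.
import unicodedata

char = '\u00E1'                            # 'á', LATIN SMALL LETTER A WITH ACUTE
norm = unicodedata.normalize('NFD', char)  # decomposes to 'a' + U+0301
assert norm == 'a\u0301'
assert ord(norm[0]) == ord('a')            # codepoint 0xE1 maps to base 0x61
```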
```diff
@@ -1,117 +0,0 @@
-# tests with BPE tokenizer
-#
-# sample usage:
-#
-#   python3 tests/test-tokenizer-0-bpe.py ~/Data/huggingface/Meta-Llama-3-8B-Instruct/
-#   python3 tests/test-tokenizer-0-bpe.py ~/Data/huggingface/falcon-7b/
-#   python3 tests/test-tokenizer-0-bpe.py ~/Data/huggingface/deepseek-coder-6.7b-instruct/
-#
-
-import argparse
-
-from transformers import AutoTokenizer
-
-parser = argparse.ArgumentParser()
-parser.add_argument("dir_tokenizer", help="directory containing 'tokenizer.model' file")
-parser.add_argument("--fname-tok", help="path to a text file to tokenize")
-args = parser.parse_args()
-
-dir_tokenizer = args.dir_tokenizer
-
-tokenizer = AutoTokenizer.from_pretrained(dir_tokenizer)
-
-tests = [
-    "",
-    " ",
-    "  ",
-    "   ",
-    "\t",
-    "\n",
-    "\n\n",
-    "\n\n\n",
-    "\t\n",
-    "Hello world",
-    " Hello world",
-    "Hello World",
-    " Hello World",
-    " Hello World!",
-    "Hello, world!",
-    " Hello, world!",
-    " this is 🦙.cpp",
-    "w048 7tuijk dsdfhu",
-    "нещо на Български",
-    "កាន់តែពិសេសអាចខលចេញ",
-    "🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)",
-    "Hello",
-    " Hello",
-    "  Hello",
-    "   Hello",
-    "    Hello",
-    "    Hello\n    Hello",
-    " (",
-    "\n =",
-    "' era",
-    "Hello, y'all! How are you 😁 ?我想在apple工作1314151天~",
-    "3",
-    "33",
-    "333",
-    "3333",
-    "33333",
-    "333333",
-    "3333333",
-    "33333333",
-    "333333333",
-]
-
-for text in tests:
-    print('text: ', text)
-    print(tokenizer.encode(text))
-    print(tokenizer.decode(tokenizer.encode(text)))
-
-print("\n\ntests for C++:\n")
-for text in tests:
-    res = tokenizer.encode(text)
-
-    k = text.replace('\n', '\\n')
-    k = k.replace('\t', '\\t')
-    k = '"' + k + '"'
-    print("{ %-24s, { " % k, end='')
-    for x in res:
-        print("%7d," % x, end='')
-    print(" }, },")
-
-print(tokenizer.encode('hello'))
-print(tokenizer.encode('world'))
-print(tokenizer.encode(' world'))
-print(tokenizer.encode('hello world'))
-
-fname_tok = args.fname_tok
-if fname_tok:
-    print('tokenizing file: ', fname_tok)
-    fname_out = fname_tok + '.tok'
-    with open(fname_tok, 'r', encoding='utf-8') as f:
-        lines = f.readlines()
-        s = ''.join(lines)
-        res = tokenizer.encode(s)
-        # write to file
-        with open(fname_out, 'w', encoding='utf-8') as f:
-            for x in res:
-                # LLaMA v3 for some reason strips the space for these tokens (and others)
-                # if x == 662:
-                #     f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
-                # elif x == 1174:
-                #     f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
-                # elif x == 2564:
-                #     f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
-                # elif x == 758:
-                #     f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
-                # elif x == 949:
-                #     f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
-                # elif x == 5354:
-                #     f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
-                # else:
-                #     f.write(str(x) + ' \'' + tokenizer.decode(x) + '\'\n')
-                f.write(str(x) + ' \'' + tokenizer.decode(x).strip() + '\'\n')
-            print('len(res): ', len(res))
-            print('len(lines): ', len(lines))
-    print('results written to: ', fname_out)
```
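The "tests for C++" loop in the deleted script printed ready-to-paste C++ initializer rows. A sketch of that formatting with placeholder token ids (the real ids depend on the tokenizer):

```python
# Sketch: one emitted row; 9906 and 1917 are hypothetical token ids.
k = '"Hello world"'
res = [9906, 1917]
print("{ %-24s, { " % k, end='')
for x in res:
    print("%7d," % x, end='')
print(" }, },")
# -> { "Hello world"           , {    9906,   1917, }, },
```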
```diff
@@ -1,114 +0,0 @@
-# tests with SPM tokenizer
-#
-# sample usage:
-#
-#   python3 tests/test-tokenizer-0-spm.py ~/Data/huggingface/Llama-2-7b-hf/
-#   python3 tests/test-tokenizer-0-spm.py ~/Data/huggingface/CodeLlama-34b-Instruct-hf/
-#
-
-
-import argparse
-
-from sentencepiece import SentencePieceProcessor
-
-parser = argparse.ArgumentParser()
-parser.add_argument("dir_tokenizer", help="directory containing 'tokenizer.model' file")
-parser.add_argument("--fname-tok", help="path to a text file to tokenize")
-args = parser.parse_args()
-
-dir_tokenizer = args.dir_tokenizer
-
-tokenizer = SentencePieceProcessor(dir_tokenizer + '/tokenizer.model')
-
-tests = [
-    "",
-    " ",
-    "  ",
-    "   ",
-    "\t",
-    "\n",
-    "\n\n",
-    "\n\n\n",
-    "\t\n",
-    "Hello world",
-    " Hello world",
-    "Hello World",
-    " Hello World",
-    " Hello World!",
-    "Hello, world!",
-    " Hello, world!",
-    " this is 🦙.cpp",
-    "w048 7tuijk dsdfhu",
-    "нещо на Български",
-    "កាន់តែពិសេសអាចខលចេញ",
-    "🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)",
-    "Hello",
-    " Hello",
-    "  Hello",
-    "   Hello",
-    "    Hello",
-    "    Hello\n    Hello",
-    " (",
-    "\n =",
-    "' era",
-    "Hello, y'all! How are you 😁 ?我想在apple工作1314151天~",
-    "3",
-    "33",
-    "333",
-    "3333",
-    "33333",
-    "333333",
-    "3333333",
-    "33333333",
-    "333333333",
-]
-
-
-for text in tests:
-    print('text: ', text)
-    print('\nwith bos:')
-    print(tokenizer.encode(text, add_bos=True))
-    print(tokenizer.decode(tokenizer.encode(text, add_bos=True)))
-    print('\nwithout bos:')
-    print(tokenizer.encode(text, add_bos=False))
-    print(tokenizer.decode(tokenizer.encode(text, add_bos=False)))
-
-print("'" + tokenizer.id_to_piece(15043) + "'") # '_Hello'
-print("'" + tokenizer.id_to_piece(29871) + "'") # '_'
-print("'" + tokenizer.decode([15043]) + "'") # 'Hello'
-print("'" + tokenizer.decode([15043, 15043]) + "'") # 'Hello Hello'
-print("'" + tokenizer.decode([29871, 15043]) + "'") # ' Hello'
-print("'" + tokenizer.decode([29871, 15043, 29871, 15043]) + "'") # ' Hello Hello'
-
-print("\n\ntests for C++:\n")
-for text in tests:
-    res = tokenizer.encode(text, add_bos=False)
-
-    k = text.replace('\n', '\\n')
-    k = k.replace('\t', '\\t')
-    k = '"' + k + '"'
-    print("{ %-24s, { " % k, end='')
-    for x in res:
-        print("%7d," % x, end='')
-    print(" }, },")
-
-print(tokenizer.encode('hello'))
-print(tokenizer.encode('world'))
-print(tokenizer.encode(' world'))
-print(tokenizer.encode('hello world'))
-
-fname_tok = args.fname_tok
-if fname_tok:
-    print('tokenizing file: ', fname_tok)
-    fname_out = fname_tok + '.tok'
-    with open(fname_tok, 'r', encoding='utf-8') as f:
-        lines = f.readlines()
-        s = ''.join(lines)
-        res = tokenizer.encode(s, add_bos=True)
-        # write to file
-        with open(fname_out, 'w', encoding='utf-8') as f:
-            for x in res:
-                f.write(str(x) + ' \'' + tokenizer.decode(x) + '\'\n')
-            print('len(res): ', len(res))
-            print('len(lines): ', len(lines))
-    print('results written to: ', fname_out)
```
```diff
@@ -5,13 +5,16 @@
 #   python3 tests/test-tokenizer-0-bpe.py ./models/ggml-vocab-llama-bpe.gguf ~/Data/huggingface/Meta-Llama-3-8B-Instruct/
 #
 
-import random
+import logging
+import argparse
+import subprocess
+import random
 
 import cffi
 from transformers import AutoTokenizer, PreTrainedTokenizerBase
 
+logger = logging.getLogger("test-tokenizer-random-bpe")
+
 
 class LibLlama:
```
```diff
@@ -141,22 +144,23 @@ def test_custom_texts(model:LibLlamaModel, tokenizer:PreTrainedTokenizerBase):
     ]
 
     more_tests = [
-        '\x1f-a', # unicode_ranges_control, {0x00001C, 0x00001F}
-        '¼-a', # unicode_ranges_digit, 0x00BC
-        '½-a', # unicode_ranges_digit, 0x00BD
-        '¾-a', # unicode_ranges_digit, 0x00BE
-        'a 〇b', # unicode_ranges_digit, 0x3007
-        'Ⅵ-a', # unicode_ranges_digit, {0x00002150, 0x0000218F} // Number Forms
+        '\x1f-a',    # unicode_ranges_control, {0x00001C, 0x00001F}
+        '¼-a',       # unicode_ranges_digit, 0x00BC
+        '½-a',       # unicode_ranges_digit, 0x00BD
+        '¾-a',       # unicode_ranges_digit, 0x00BE
+        'a 〇b',     # unicode_ranges_digit, 0x3007
+        'Ⅵ-a',      # unicode_ranges_digit, {0x00002150, 0x0000218F} // Number Forms
+        '\uFEFF//',  # unicode_ranges_control, 0xFEFF (BOM)
     ]
 
     for text in tests+more_tests:
         ids1 = model.tokenize(text, parse_special=True)
         ids2 = tokenizer.encode(text)
-        print(repr(text))
+        logger.info(repr(text))
         if ids1 != ids2:
-            print(" TokenIDs:", list(ids1))
-            print(" Expected:", list(ids2))
-            print(" Index:", find_first_mismatch(ids1, ids2) )
+            logger.info(" TokenIDs: " + str(list(ids1)))
+            logger.info(" Expected: " + str(list(ids2)))
+            logger.info(" Index: %d" % find_first_mismatch(ids1, ids2))
             raise Exception()
```
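find_first_mismatch() is referenced here but defined outside the hunk. A hypothetical sketch of such a helper (a reconstruction, not the repository's definition):

```python
# Hypothetical sketch: index of the first differing token id,
# or -1 when the two sequences are identical.
def find_first_mismatch(ids1, ids2):
    for i, (a, b) in enumerate(zip(ids1, ids2)):
        if a != b:
            return i
    if len(ids1) != len(ids2):
        return min(len(ids1), len(ids2))
    return -1
```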
```diff
@@ -171,11 +175,11 @@ def test_random_chars(model:LibLlamaModel, tokenizer:PreTrainedTokenizerBase, it
         .-,*/-+ª!"·$%&/()=?¿[]{}<>\\|@#~½¬~;:_
         """))
 
-    print( "Bruteforce random chars encodings ..." )
+    logger.info("Bruteforce random chars encodings ...")
     rand = random.Random()
     for m in range(iterations):
-        print(m)
+        logger.debug("%d/%d" % (m+1,iterations))
         rand.seed(m)
 
         text = []
```
```diff
@@ -194,17 +198,17 @@ def test_random_chars(model:LibLlamaModel, tokenizer:PreTrainedTokenizerBase, it
 
 def test_random_vocab_chars(model:LibLlamaModel, tokenizer:PreTrainedTokenizerBase, iterations=100):
 
-    print( "Building vocab char list ..." )
+    logger.info("Building vocab char list ...")
     vocab_ids = list(tokenizer.vocab.values())
    vocab_text = tokenizer.decode(vocab_ids)
     vocab_chars = list(set(vocab_text))
     del vocab_ids, vocab_text
 
-    print( "Bruteforce random text encodings ..." )
+    logger.info("Bruteforce random text encodings ...")
     rand = random.Random()
     for m in range(iterations):
-        print(m)
+        logger.debug("%d/%d" % (m+1,iterations))
         rand.seed(m)
 
         text = rand.choices(vocab_chars, k=1024)
```
```diff
@@ -212,12 +216,12 @@ def test_random_vocab_chars(model:LibLlamaModel, tokenizer:PreTrainedTokenizerBa
 
         ids1 = model.tokenize(text, parse_special=True)
         ids2 = tokenizer.encode(text)
-        assert( ids1 == ids2 )
+        assert(ids1 == ids2)
 
 
 def test_random_vocab_tokens(model:LibLlamaModel, tokenizer:PreTrainedTokenizerBase, iterations=100):
 
-    print( "Building token list ..." )
+    logger.info("Building token list ...")
     space_id = tokenizer.encode(" ")[0]
     vocab_ids = list(tokenizer.vocab.values())
     vocab_ids = list(sorted(vocab_ids + vocab_ids))
```
```diff
@@ -227,17 +231,17 @@ def test_random_vocab_tokens(model:LibLlamaModel, tokenizer:PreTrainedTokenizerB
     vocab_tokens = vocab_tokens.split(" ")
     del vocab_ids
 
-    print( "Checking single token encodings ..." )
+    logger.info("Checking single token encodings ...")
     for token in vocab_tokens:
         ids1 = model.tokenize(token, parse_special=True)
         ids2 = tokenizer.encode(token)
         assert(ids1 == ids2)
 
-    print( "Bruteforce random text encodings ..." )
+    logger.info("Bruteforce random text encodings ...")
     rand = random.Random()
     for m in range(iterations):
-        print(m)
+        logger.debug("%d/%d" % (m+1,iterations))
         rand.seed(m)
 
         text = []
```
```diff
@@ -252,18 +256,18 @@ def test_random_vocab_tokens(model:LibLlamaModel, tokenizer:PreTrainedTokenizerB
 
         ids1 = model.tokenize(text, parse_special=True)
         ids2 = tokenizer.encode(text)
-        assert( ids1 == ids2 )
+        assert(ids1 == ids2)
 
 
 def test_random_bytes(model:LibLlamaModel, tokenizer:PreTrainedTokenizerBase, iterations=100):
 
     WHITESPACES = list(" "*20 + "\n"*5 + "\r\n"*5 + "\t"*5)
 
-    print( "Bruteforce random bytes encodings ..." )
+    logger.info("Bruteforce random bytes encodings ...")
     rand = random.Random()
     for m in range(iterations):
-        print(m)
+        logger.debug("%d/%d" % (m+1,iterations))
         rand.seed(m)
 
         text = []
```
```diff
@@ -285,8 +289,11 @@ if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("vocab_file", help="path to vocab 'gguf' file")
     parser.add_argument("dir_tokenizer", help="directory containing 'tokenizer.model' file")
+    parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
     args = parser.parse_args()
 
+    logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
+
     model = LibLlamaModel(LibLlama(), args.vocab_file, mparams=dict(vocab_only=True), cparams=dict(n_ctx=2048))
 
     tokenizer = AutoTokenizer.from_pretrained(args.dir_tokenizer)
```
unicode-data.cpp (1262 changed lines): file diff suppressed because it is too large.
```diff
@@ -7,6 +7,7 @@
 
 extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_number;
 extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_letter;
+extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_separator;
 extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_whitespace;
 extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_accent_mark;
 extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_punctuation;
```
unicode.cpp (35 changed lines):
```diff
@@ -9,6 +9,7 @@
 #include <stdexcept>
 #include <string>
 #include <unordered_map>
 #include <unordered_set>
 #include <utility>
 #include <vector>
+#include <locale>
```
```diff
@@ -120,9 +121,9 @@ static std::unordered_map<uint32_t, int> unicode_cpt_type_map() {
             cpt_types[i] = CODEPOINT_TYPE_LETTER;
         }
     }
-    for (auto p : unicode_ranges_whitespace) {
+    for (auto p : unicode_ranges_separator) {
         for (auto i = p.first; i <= p.second; ++ i) {
-            cpt_types[i] = CODEPOINT_TYPE_WHITESPACE;
+            cpt_types[i] = CODEPOINT_TYPE_SEPARATOR;
         }
     }
     for (auto p : unicode_ranges_accent_mark) {
```
```diff
@@ -280,6 +281,7 @@ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & t
             }
         }
 
+        char32_t cpt2 = (cpt == ' ' ? _get_cpt(pos+1) : cpt);
         int cpt2_type = (cpt == ' ' ? _get_cpt_type(pos+1) : cpt_type);
         // regex: <space>?\p{L}+
         if (cpt2_type == CODEPOINT_TYPE_LETTER) {
```
```diff
@@ -300,17 +302,18 @@ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & t
             continue;
         }
         // regex: <space>?[^\s\p{L}\p{N}]+
-        if (cpt2_type != CODEPOINT_TYPE_WHITESPACE && cpt2_type != CODEPOINT_TYPE_LETTER && cpt2_type != CODEPOINT_TYPE_NUMBER && cpt2_type != CODEPOINT_TYPE_UNIDENTIFIED) {
+        if (!unicode_cpt_is_whitespace(cpt2) && cpt2_type != CODEPOINT_TYPE_LETTER && cpt2_type != CODEPOINT_TYPE_NUMBER && cpt2_type != CODEPOINT_TYPE_UNIDENTIFIED) {
             pos += (cpt == ' ');
-            while (cpt2_type != CODEPOINT_TYPE_WHITESPACE && cpt2_type != CODEPOINT_TYPE_LETTER && cpt2_type != CODEPOINT_TYPE_NUMBER && cpt2_type != CODEPOINT_TYPE_UNIDENTIFIED) {
+            while (!unicode_cpt_is_whitespace(cpt2) && cpt2_type != CODEPOINT_TYPE_LETTER && cpt2_type != CODEPOINT_TYPE_NUMBER && cpt2_type != CODEPOINT_TYPE_UNIDENTIFIED) {
                 cpt2_type = _get_cpt_type(++pos);
+                cpt2 = _get_cpt(pos);
             }
             _add_token(pos);
             continue;
         }
 
         size_t num_whitespaces = 0;
-        while (_get_cpt_type(pos+num_whitespaces) == CODEPOINT_TYPE_WHITESPACE) {
+        while (unicode_cpt_is_whitespace(_get_cpt(pos+num_whitespaces))) {
             num_whitespaces++;
         }
```
```diff
@@ -423,13 +426,14 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
         }
 
         // regex: <space>?[^\s\p{L}\p{N}]+[\r\n]*
+        char32_t cpt2 = (cpt == ' ' ? _get_cpt(pos+1) : cpt);
         int cpt2_type = (cpt == ' ' ? _get_cpt_type(pos+1) : cpt_type);
-        if (cpt2_type != CODEPOINT_TYPE_WHITESPACE && cpt2_type != CODEPOINT_TYPE_LETTER && cpt2_type != CODEPOINT_TYPE_NUMBER && cpt2_type != CODEPOINT_TYPE_UNIDENTIFIED) {
+        if (!unicode_cpt_is_whitespace(cpt2) && cpt2_type != CODEPOINT_TYPE_LETTER && cpt2_type != CODEPOINT_TYPE_NUMBER && cpt2_type != CODEPOINT_TYPE_UNIDENTIFIED) {
             pos += (cpt == ' ');
-            while (cpt2_type != CODEPOINT_TYPE_WHITESPACE && cpt2_type != CODEPOINT_TYPE_LETTER && cpt2_type != CODEPOINT_TYPE_NUMBER && cpt2_type != CODEPOINT_TYPE_UNIDENTIFIED) {
+            while (!unicode_cpt_is_whitespace(cpt2) && cpt2_type != CODEPOINT_TYPE_LETTER && cpt2_type != CODEPOINT_TYPE_NUMBER && cpt2_type != CODEPOINT_TYPE_UNIDENTIFIED) {
                 cpt2_type = _get_cpt_type(++pos);
+                cpt2 = _get_cpt(pos);
             }
-            char32_t cpt2 = _get_cpt(pos);
             while (cpt2 == '\r' || cpt2 == '\n') {
                 cpt2 = _get_cpt(++pos);
             }
```
```diff
@@ -439,7 +443,7 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
 
         size_t num_whitespaces = 0;
         size_t last_end_r_or_n = 0;
-        while (_get_cpt_type(pos+num_whitespaces) == CODEPOINT_TYPE_WHITESPACE) {
+        while (unicode_cpt_is_whitespace(_get_cpt(pos+num_whitespaces))) {
             char32_t cpt2 = _get_cpt(pos+num_whitespaces);
             if (cpt2 == '\r' || cpt2 == '\n') {
                 last_end_r_or_n = pos + num_whitespaces + 1;
```
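Both custom splitters hand-code the pretokenizer fragment `<space>?[^\s\p{L}\p{N}]+` (with a trailing `[\r\n]*` in the llama3 variant), which is why the whitespace test has to switch from CODEPOINT_TYPE_WHITESPACE to unicode_cpt_is_whitespace(). A sketch (not from the diff) checking the intended behavior against the Python regex module:

```python
# Sketch: the regex fragment the hand-written loops above approximate.
import regex

pat = regex.compile(r" ?[^\s\p{L}\p{N}]+[\r\n]*")
assert pat.match("!!!").group() == "!!!"     # a run of punctuation/symbols
assert pat.match(" ...x").group() == " ..."  # optional leading space
assert pat.match("abc") is None              # letters are excluded
assert pat.match("++\n").group() == "++\n"   # llama3 variant eats trailing newlines
```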
```diff
@@ -621,6 +625,19 @@ int unicode_cpt_type(const std::string & utf8) {
     return unicode_cpt_type(unicode_cpt_from_utf8(utf8, offset));
 }
 
+bool unicode_cpt_is_whitespace(uint32_t cp) {
+    static const std::unordered_set<uint32_t> is_whitespace = [] {
+        std::unordered_set<uint32_t> is_whitespace;
+        for (auto p : unicode_ranges_whitespace) {
+            for (auto i = p.first; i <= p.second; ++ i) {
+                is_whitespace.insert(i);
+            }
+        }
+        return is_whitespace;
+    }();
+    return (bool)is_whitespace.count(cp);
+}
+
 std::string unicode_byte_to_utf8(uint8_t byte) {
     static std::unordered_map<uint8_t, std::string> map = unicode_byte_to_utf8_map();
     return map.at(byte);
```
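unicode_cpt_is_whitespace() flattens the generated (first, last) ranges into a hash set once, via a lambda-initialized function-local static, and then answers lookups in O(1). The same idea in Python, with an assumed excerpt of the whitespace table:

```python
# Sketch: flatten inclusive codepoint ranges into a set for O(1) membership.
# These ranges are an assumed excerpt of the generated whitespace table.
unicode_ranges_whitespace = [(0x0009, 0x000D), (0x0020, 0x0020), (0x0085, 0x0085)]

is_whitespace = {cp
                 for first, last in unicode_ranges_whitespace
                 for cp in range(first, last + 1)}

assert 0x000A in is_whitespace      # LF
assert 0x0041 not in is_whitespace  # 'A'
```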
```diff
@@ -7,7 +7,7 @@
 #define CODEPOINT_TYPE_UNIDENTIFIED 0
 #define CODEPOINT_TYPE_NUMBER       1
 #define CODEPOINT_TYPE_LETTER       2
-#define CODEPOINT_TYPE_WHITESPACE   3
+#define CODEPOINT_TYPE_SEPARATOR    3
 #define CODEPOINT_TYPE_ACCENT_MARK  4
 #define CODEPOINT_TYPE_PUNCTUATION  5
 #define CODEPOINT_TYPE_SYMBOL       6
```
```diff
@@ -21,6 +21,8 @@ std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & c
 int unicode_cpt_type(uint32_t cp);
 int unicode_cpt_type(const std::string & utf8);
 
+bool unicode_cpt_is_whitespace(uint32_t cp);
+
 std::string unicode_byte_to_utf8(uint8_t byte);
 uint8_t unicode_utf8_to_byte(const std::string & utf8);
```