189 lines
6.3 KiB
Python
189 lines
6.3 KiB
Python
#!/usr/bin/env python3
|
||
from __future__ import annotations
|
||
|
||
import argparse
|
||
import logging
|
||
import os
|
||
import sys
|
||
from pathlib import Path
|
||
|
||
# Necessary to load the local gguf package
|
||
if "NO_LOCAL_GGUF" not in os.environ and (Path(__file__).parent.parent.parent / 'gguf-py').exists():
|
||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||
|
||
from gguf.huggingface_hub import HFVocabRequest
|
||
|
||
logger = logging.getLogger("gguf-gen-pre")
|
||
|
||
|
||
# NOTE: It's impossible to catch all edge cases.
|
||
# The most naive way to handle this is to have a pre-compiled unicode list of all 1.1 million characters
|
||
# as it's finite and iso standardized.
|
||
# This means we can predict the upper bound and can apply known time complexity solutions to
|
||
# discover the best way to resolve it.
|
||
def test_pre_tok_params() -> list[str]:
    """Return the fixed sample strings used to build pre-tokenizer tests.

    The samples are grouped by theme below and concatenated in a fixed
    order, so every call returns an equal list.
    """
    # Accented Latin text and a spread of non-Latin scripts.
    scripts = [
        "ü, ǖ, ǘ, ǚ, ǜ",  # u with diaeresis and tone marks
        "綠, 女, 怒, 玉, 句",  # han characters
        "ied 4 ½ months",  # digit plus vulgar fraction
        "¡Hola Mundo!",  # spanish
        "Olá Mundo!",  # portuguese
        "Selam Dünya!",  # turkish
        "Salam, dünýä!",  # turkmen
        "Γειά σου Κόσμε!",  # greek
        "हैलो वर्ल्ड!",  # hindi
        "สวัสดีชาวโลก!",  # thai
        "こんにちは世界!",  # japanese
        "你好世界!",  # chinese
        "Hàlo a Shaoghail!",  # gaelic
        "Chào thế giới!",  # vietnamese
        "Привет, мир!",  # russian
        "Здравей свят!",  # bulgarian
        "សួស្តីពិភពលោក!",  # khmer
        "Le rapide renard brun sauta par dessus le chien paresseux.",  # french
    ]

    # Sentences wrapped in leading/trailing whitespace.
    padded_sentences = [
        "\tWil je een kopje thee?\n",  # dutch
        " Te gustaría algo de té ? ",  # spanish
    ]

    # Right-to-left scripts (the original author expected these to fail).
    right_to_left = [
        "העלא וועלט!",  # yiddish
        "سلام دنیا!",  # persian
    ]

    # Empty and whitespace-only inputs.
    newline_runs = ["\n" * count for count in range(1, 4)]
    blanks = [
        "",  # empty string: falsy in Python, contains no symbols
        " ",
        " ",
        " ",
        "\t",
        *newline_runs,
        "\t\n",
    ]

    # "Hello world" with varying case, punctuation and leading space.
    hello_world = [
        "Hello world",
        " Hello world",
        "Hello World",
        " Hello World",
        " Hello World!",
        "Hello, world!",
        " Hello, world!",
    ]

    # Emoji and arbitrary alphanumeric text.
    emoji_and_noise = [
        " this is 🦙.cpp",
        "w048 7tuijk dsdfhu",
        "🚀 (normal) 😶🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)",
    ]

    # A single word with leading whitespace, then stray punctuation.
    hello_and_punctuation = [
        "Hello",
        " Hello",
        " Hello",
        " Hello",
        " Hello",
        " Hello\n Hello",
        " (",
        "\n =",
        "' era",
    ]

    # One sentence mixing Latin, emoji, CJK and digits.
    mixed = [
        "Hello, y'all! How are you 😁 局外人?苹果apple工作work3.14159天God~",
    ]

    # Runs of the digit 3, from one to seven characters long.
    digit_runs = ["3" * width for width in range(1, 8)]

    return (
        scripts
        + padded_sentences
        + right_to_left
        + blanks
        + hello_world
        + emoji_and_noise
        + hello_and_punctuation
        + mixed
        + digit_runs
    )
|
||
|
||
|
||
def test_pre_tok(hf_voc_req: HFVocabRequest) -> None:
    """Write tokenizer test input/output files for each model in the request.

    For every entry in ``hf_voc_req.models`` whose directory exists and whose
    tokenizer loads, two files are written into that directory:

    * ``ggml-vocab-<name>.inp`` - every sample string, each followed by a
      ``__ggml_vocab_test__`` separator line.
    * ``ggml-vocab-<name>.out`` - one line per sample holding its token ids,
      each id preceded by a single space.

    Models whose directory is missing, or whose tokenizer raises ``OSError``
    while loading, are logged and skipped.

    Args:
        hf_voc_req: Request object providing ``model_path`` and ``models``;
            each model is subscripted with the ``"repo"`` key.
    """
    # Imported inside the function, so this script only imports transformers
    # when the tests are actually generated.
    from transformers import AutoTokenizer

    params = test_pre_tok_params()
    for model in hf_voc_req.models:
        # Hoisted into a local: reusing the enclosing quote character inside
        # an f-string replacement field is a SyntaxError before Python 3.12.
        repo = model["repo"]

        # model directory, e.g. 'models/meta-llama/Llama-2-7b-hf'
        path = Path(f"{hf_voc_req.model_path}/{repo}")
        # lower-cased directory name, e.g. 'llama-2-7b-hf'
        # NOTE(review): Path.stem drops everything after the last dot, so a
        # repo such as 'Mistral-7B-v0.1' yields 'mistral-7b-v0'. Using
        # path.name would avoid that, but generate_vocab_script derives the
        # vocab file name the same way - change both together.
        name = path.stem.lower()
        # test inputs, e.g. '<path>/ggml-vocab-llama-2-7b-hf.inp'
        inp = path / f"ggml-vocab-{name}.inp"
        # expected token ids, e.g. '<path>/ggml-vocab-llama-2-7b-hf.out'
        out = path / f"ggml-vocab-{name}.out"
        # extracted tokenizer model (only referenced in the log line below)
        final = path / f"ggml-vocab-{name}.gguf"

        # skip the model if its folder is unavailable
        if not path.exists():
            logger.warning(f"skipped - {repo} not found.")
            continue

        try:  # create the tokenizer
            tokenizer = AutoTokenizer.from_pretrained(path)
        except OSError as e:
            logger.error(f"{repo} not found: {e}")
            continue  # skip this tokenizer model

        with open(inp, "w", encoding="utf-8") as f:
            for test in params:
                f.write(f"{test}")
                f.write("\n__ggml_vocab_test__\n")

        with open(out, "w", encoding="utf-8") as f:
            for test in params:
                encodings = tokenizer.encode(test, add_special_tokens=False)
                # every id is prefixed with a space, including the first one
                f.write("".join(f" {encoding}" for encoding in encodings))
                f.write("\n")

        # NOTE(review): the message points at '<gguf>.*', while the files
        # written above are named '...<name>.inp' / '...<name>.out' - confirm
        # which naming the consumer of these tests expects.
        logger.info(f"Tests for {repo} written in {final}.*")
|
||
|
||
|
||
def generate_vocab_script(hf_voc_req: HFVocabRequest) -> None:
    """Write ``generate-vocab.sh``, a bash script that creates the vocab files.

    The script gets a bash shebang followed by one
    ``python3 convert-hf-to-gguf.py <path> --outfile <vocab> --vocab-only``
    line per entry in ``hf_voc_req.models``. Each command is also logged.
    The file is created in the current working directory.

    Args:
        hf_voc_req: Request object providing ``model_path`` and ``models``;
            each model is subscripted with the ``"repo"`` key.
    """
    # Function-scope import keeps this block self-contained.
    from shlex import quote

    lines = ["#!/usr/bin/env bash\n\n"]

    for model in hf_voc_req.models:
        # Hoisted into a local: reusing the enclosing quote character inside
        # an f-string replacement field is a SyntaxError before Python 3.12.
        repo = model["repo"]
        # get the repo path
        path = Path(f"{hf_voc_req.model_path}/{repo}")
        # set the vocab path
        # NOTE(review): Path.stem drops everything after the last dot of the
        # directory name (e.g. 'Mistral-7B-v0.1' -> 'Mistral-7B-v0');
        # test_pre_tok derives its file names the same way.
        vocab = path / f"ggml-vocab-{path.stem.lower()}.gguf"
        # Quote both paths so whitespace or shell metacharacters cannot split
        # or alter the command; ordinary paths are emitted unchanged.
        tmpline = (
            f"python3 convert-hf-to-gguf.py {quote(str(path))} "
            f"--outfile {quote(str(vocab))} --vocab-only\n"
        )
        lines.append(tmpline)
        logger.info(tmpline.strip())

    shscript = "".join(lines)
    with open("generate-vocab.sh", "w", encoding="utf-8") as f:
        # write() emits the text in one call; writelines() on a str would
        # iterate it character by character.
        f.write(shscript)

    # Report the UTF-8 encoded size: len() of the str counts characters.
    size = len(shscript.encode("utf-8"))
    logger.info(f"Wrote {size} bytes to generate-vocab.sh")
|
||
|
||
|
||
def main() -> None:
    """Parse the command line, fetch the models and run the requested steps.

    Builds an ``HFVocabRequest`` from the model path and auth token, then
    always downloads the models, generates checksums and logs pre-tokenizer
    info. Tokenizer tests are written when ``-t/--gen-vocab-tests`` is given,
    and the vocab shell script when ``-s/--gen-vocab-script`` is given.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("hf_auth_token", help="A huggingface read auth token")
    parser.add_argument(
        "-v", "--verbose", action="store_true", help="Increase output verbosity."
    )
    parser.add_argument(
        "-m", "--model-path", default=None, help="The models storage path. Default is 'models/'."
    )
    parser.add_argument(
        "-t", "--gen-vocab-tests", action="store_true", help="Generate the tokenizer tests. Default is False."
    )
    parser.add_argument(
        "-s", "--gen-vocab-script", action="store_true", help="Generate the gguf vocab files. Default is False."
    )
    args = parser.parse_args()

    logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)

    hf_vocab_req = HFVocabRequest(
        args.model_path, args.hf_auth_token, logger
    )

    hf_vocab_req.download_models()
    hf_vocab_req.generate_checksums()
    hf_vocab_req.log_pre_tokenizer_info()

    # argparse stores "--gen-vocab-tests" under `gen_vocab_tests`; reading
    # `args.gen_tests` raised AttributeError on every run.
    if args.gen_vocab_tests:
        test_pre_tok(hf_vocab_req)

    if args.gen_vocab_script:
        generate_vocab_script(hf_vocab_req)
|
||
|
||
|
||
if __name__ == '__main__':
|
||
main()
|