#!/usr/bin/env python3
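# Usage sketch (the script filename is assumed from the logger name below;
# the flags match get_arguments()):
#   python3 gguf-gen-pre.py <hf-read-token> --model-path models --verbose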

from __future__ import annotations

import argparse
import logging
import os
import sys
from pathlib import Path

# Necessary to load the local gguf package
if "NO_LOCAL_GGUF" not in os.environ and (Path(__file__).parent.parent.parent / 'gguf-py').exists():
    sys.path.insert(0, str(Path(__file__).parent.parent))

from gguf.constants import (
    GPT_PRE_TOKENIZER_DEFAULT,
    HF_TOKENIZER_BPE_FILES,
    HF_TOKENIZER_SPM_FILES,
    MODEL_ARCH,
    MODEL_ARCH_NAMES,
    ModelFileType,
    VocabType,
)
from gguf.huggingface_hub import HFHubModel, HFHubTokenizer

logger = logging.getLogger("gguf-gen-pre")

#
# HuggingFace Model Map
#
# NOTE: All prerequisite model metadata must be defined here.
#
# Defines metadata for each Hugging Face model required during conversion to GGUF.
#
# Field Descriptions
# - `model_repo` (str): The Hugging Face endpoint or local path to the model repository
# - `model_arch` (MODEL_ARCH): Model architecture type
# - `model_parts` (int): Number of parts required to join the model during conversion
# - `model_type` (ModelFileType): File format of the Hugging Face model files
# - `vocab_type` (VocabType): Vocabulary type used by the tokenizer
# - `vocab_pre` (Optional[Tuple[str]]): Tuple of pre-tokenizer pattern strings for this model
# - `vocab_files` (Tuple[str]): Tuple of file names required to extract vocabulary and other metadata
#
# NOTES
# - Possible algorithms are WordLevel, BPE, WordPiece, or Unigram
# - Possible LLaMA tokenizer model types are: None, SPM, BPE, or WPM
HF_MODEL_MAP = (
    # SPM (Sentence Piece Models): Default to Byte Level Pre-tokenization.
    {
        "model_repo": "meta-llama/Llama-2-7b-hf",
        "model_arch": MODEL_ARCH.LLAMA,
        "model_parts": 2,
        "model_type": ModelFileType.SAFETENSORS,
        "vocab_type": VocabType.SPM,
        "vocab_pre": None,
        "vocab_files": HF_TOKENIZER_SPM_FILES,
    },
    {
        "model_repo": "mistralai/Mistral-7B-Instruct-v0.1",
        "model_arch": MODEL_ARCH.LLAMA,
        "model_parts": 2,
        "model_type": ModelFileType.SAFETENSORS,
        "vocab_type": VocabType.SPM,
        "vocab_pre": None,
        "vocab_files": HF_TOKENIZER_SPM_FILES,
    },
    {
        "model_repo": "mistralai/Mistral-7B-Instruct-v0.2",
        "model_arch": MODEL_ARCH.LLAMA,
        "model_parts": 3,
        "model_type": ModelFileType.SAFETENSORS,
        "vocab_type": VocabType.SPM,
        "vocab_pre": None,
        "vocab_files": HF_TOKENIZER_SPM_FILES,
    },
    {  # NOTE: Mistral v0.3 has a 'tokenizer.model.v3' file
        "model_repo": "mistralai/Mistral-7B-Instruct-v0.3",
        "model_arch": MODEL_ARCH.LLAMA,
        "model_parts": 3,
        "model_type": ModelFileType.SAFETENSORS,
        "vocab_type": VocabType.SPM,
        "vocab_pre": None,
        "vocab_files": HF_TOKENIZER_SPM_FILES,
    },
    {
        "model_repo": "mistralai/Mixtral-8x7B-Instruct-v0.1",
        "model_arch": MODEL_ARCH.LLAMA,
        "model_parts": 8,
        "model_type": ModelFileType.SAFETENSORS,
        "vocab_type": VocabType.SPM,
        "vocab_pre": None,
        "vocab_files": HF_TOKENIZER_SPM_FILES,
    },
    {
        "model_repo": "microsoft/Phi-3-mini-4k-instruct",
        "model_arch": MODEL_ARCH.PHI3,
        "model_parts": 2,
        "model_type": ModelFileType.SAFETENSORS,
        "vocab_type": VocabType.SPM,
        "vocab_pre": None,
        "vocab_files": HF_TOKENIZER_SPM_FILES,
    },
    # WPM (Word Piece Models): Default to Byte Level Pre-tokenization.
    # NOTE: BERT Normalization and Pre-tokenization rules differ from Byte Level Pre-tokenization.
    {
        "model_repo": "BAAI/bge-small-en-v1.5",
        "model_arch": MODEL_ARCH.BERT,
        "model_parts": 1,
        "model_type": ModelFileType.BIN,
        "vocab_type": VocabType.WPM,
        "vocab_pre": None,
        "vocab_files": HF_TOKENIZER_BPE_FILES,
    },
    {
        "model_repo": "jinaai/jina-embeddings-v2-base-en",
        "model_arch": MODEL_ARCH.JINA_BERT_V2,
        "model_parts": 1,
        "model_type": ModelFileType.SAFETENSORS,
        "vocab_type": VocabType.WPM,
        "vocab_pre": None,
        "vocab_files": HF_TOKENIZER_BPE_FILES,
    },
    # BPE (Byte Pair Encoding Models): Default is Byte Level Pre-tokenization.
    {
        "model_repo": "meta-llama/Meta-Llama-3-8B",
        "model_arch": MODEL_ARCH.LLAMA,
        "model_parts": 4,
        "model_type": ModelFileType.SAFETENSORS,
        "vocab_type": VocabType.BPE,
        "vocab_pre": None,
        "vocab_files": HF_TOKENIZER_BPE_FILES,
    },
    {
        "model_repo": "tiiuae/falcon-7b",
        "model_arch": MODEL_ARCH.FALCON,
        "model_parts": 2,
        "model_type": ModelFileType.BIN,
        "vocab_type": VocabType.BPE,
        "vocab_pre": None,
        "vocab_files": HF_TOKENIZER_BPE_FILES,
    },
    {
        "model_repo": "deepseek-ai/deepseek-llm-7b-base",
        "model_arch": MODEL_ARCH.LLAMA,
        "model_parts": 2,
        "model_type": ModelFileType.BIN,
        "vocab_type": VocabType.BPE,
        "vocab_pre": None,
        "vocab_files": HF_TOKENIZER_BPE_FILES,
    },
    {
        "model_repo": "deepseek-ai/deepseek-coder-6.7b-base",
        "model_arch": MODEL_ARCH.LLAMA,
        "model_parts": 2,
        "model_type": ModelFileType.SAFETENSORS,
        "vocab_type": VocabType.BPE,
        "vocab_pre": None,
        "vocab_files": HF_TOKENIZER_BPE_FILES,
    },
    {
        "model_repo": "mosaicml/mpt-7b",
        "model_arch": MODEL_ARCH.MPT,
        "model_parts": 2,
        "model_type": ModelFileType.BIN,
        "vocab_type": VocabType.BPE,
        "vocab_pre": None,
        "vocab_files": HF_TOKENIZER_BPE_FILES,
    },
    #
    # BPE: STARCODER
    #
    {
        "model_repo": "bigcode/starcoder2-3b",
        "model_arch": MODEL_ARCH.STARCODER2,
        "model_parts": 1,
        "model_type": ModelFileType.SAFETENSORS,
        "vocab_type": VocabType.BPE,
        "vocab_pre": None,
        "vocab_files": HF_TOKENIZER_BPE_FILES,
    },
    {
        "model_repo": "smallcloudai/Refact-1_6-base",
        "model_arch": MODEL_ARCH.REFACT,
        "model_parts": 1,
        "model_type": ModelFileType.BIN,
        "vocab_type": VocabType.BPE,
        "vocab_pre": None,
        "vocab_files": HF_TOKENIZER_BPE_FILES,
    },
    {
        "model_repo": "CohereForAI/c4ai-command-r-v01",
        "model_arch": MODEL_ARCH.COMMAND_R,
        "model_parts": 15,
        "model_type": ModelFileType.SAFETENSORS,
        "vocab_type": VocabType.BPE,
        "vocab_pre": None,
        "vocab_files": HF_TOKENIZER_BPE_FILES,
    },
    #
    # BPE: QWEN
    #
    {
        "model_repo": "Qwen/Qwen1.5-7B",
        "model_arch": MODEL_ARCH.QWEN2,
        "model_parts": 4,
        "model_type": ModelFileType.SAFETENSORS,
        "vocab_type": VocabType.BPE,
        "vocab_pre": None,
        "vocab_files": HF_TOKENIZER_BPE_FILES,
    },
    {
        "model_repo": "stabilityai/stablelm-2-zephyr-1_6b",
        "model_arch": MODEL_ARCH.STABLELM,
        "model_parts": 1,
        "model_type": ModelFileType.SAFETENSORS,
        "vocab_type": VocabType.BPE,
        "vocab_pre": None,
        "vocab_files": HF_TOKENIZER_BPE_FILES,
    },
    #
    # BPE: GPT-2
    #
    {
        "model_repo": "openai-community/gpt2",
        "model_arch": MODEL_ARCH.GPT2,
        "model_parts": 1,
        "model_type": ModelFileType.SAFETENSORS,
        "vocab_type": VocabType.BPE,
        "vocab_pre": None,
        "vocab_files": HF_TOKENIZER_BPE_FILES,
    },
    {
        "model_repo": "allenai/OLMo-1.7-7B-hf",
        "model_arch": MODEL_ARCH.OLMO,
        "model_parts": 6,
        "model_type": ModelFileType.SAFETENSORS,
        "vocab_type": VocabType.BPE,
        "vocab_pre": None,
        "vocab_files": HF_TOKENIZER_BPE_FILES,
    },
    # {  # NOTE: I don't have access to this model
    #     "model_repo": "databricks/dbrx-base",
    #     "model_arch": MODEL_ARCH.DBRX,
    #     "model_parts": 0,
    #     "model_type": ModelFileType.SAFETENSORS,
    #     "vocab_type": VocabType.BPE,
    #     "vocab_pre": None,
    #     "vocab_files": HF_TOKENIZER_BPE_FILES,
    # },
    {  # NOTE: RoBERTa post processor
        "model_repo": "jinaai/jina-embeddings-v2-base-es",
        "model_arch": MODEL_ARCH.JINA_BERT_V2,
        "model_parts": 1,
        "model_type": ModelFileType.SAFETENSORS,
        "vocab_type": VocabType.BPE,
        "vocab_pre": None,
        "vocab_files": HF_TOKENIZER_BPE_FILES,
    },
    {  # NOTE: RoBERTa post processor
        "model_repo": "jinaai/jina-embeddings-v2-base-de",
        "model_arch": MODEL_ARCH.JINA_BERT_V2,
        "model_parts": 1,
        "model_type": ModelFileType.SAFETENSORS,
        "vocab_type": VocabType.BPE,
        "vocab_pre": None,
        "vocab_files": HF_TOKENIZER_BPE_FILES,
    },
    {  # NOTE: Phi-1 is compatible with GPT-2 arch and vocab
        "model_repo": "microsoft/phi-1",
        "model_arch": MODEL_ARCH.PHI2,
        "model_parts": 1,
        "model_type": ModelFileType.SAFETENSORS,
        "vocab_type": VocabType.BPE,
        "vocab_pre": None,
        "vocab_files": HF_TOKENIZER_BPE_FILES,
    },
    {
        "model_repo": "microsoft/phi-1_5",
        "model_arch": MODEL_ARCH.PHI2,
        "model_parts": 1,
        "model_type": ModelFileType.SAFETENSORS,
        "vocab_type": VocabType.BPE,
        "vocab_pre": None,
        "vocab_files": HF_TOKENIZER_BPE_FILES,
    },
    {
        "model_repo": "microsoft/phi-2",
        "model_arch": MODEL_ARCH.PHI2,
        "model_parts": 2,
        "model_type": ModelFileType.SAFETENSORS,
        "vocab_type": VocabType.BPE,
        "vocab_pre": None,
        "vocab_files": HF_TOKENIZER_BPE_FILES,
    },
)


def get_arguments() -> argparse.Namespace:
    parser = argparse.ArgumentParser()
    parser.add_argument("auth_token", help="A huggingface read auth token")
    parser.add_argument(
        "-v", "--verbose", action="store_true", help="Increase output verbosity."
    )
    parser.add_argument(
        "--model-path",
        default="models",
        help="The models storage path. Default is 'models'.",
    )
    return parser.parse_args()


args = get_arguments()

if args.verbose:
    logging.basicConfig(level=logging.DEBUG)
else:
    logging.basicConfig(level=logging.INFO)

hub_model = HFHubModel(
    auth_token=args.auth_token,
    model_path=args.model_path,
    logger=logger,
)

hub_tokenizer = HFHubTokenizer(
    model_path=args.model_path,
    logger=logger,
)

for model in HF_MODEL_MAP:
    model_repo = model["model_repo"]
    model_arch = model["model_arch"]
    vocab_type = model["vocab_type"]

    print(
        "HUB_REPO:", model_repo,
        "LLAMA_ARCH:", MODEL_ARCH_NAMES[model_arch]
    )

    hub_model.download_all_vocab_files(
        model_repo=model_repo,
        vocab_type=vocab_type,
    )
    # log the downloaded results
    hub_tokenizer.log_tokenizer_json_info(model_repo)

    # extract the normalizer metadata
    normalizer = hub_tokenizer.get_normalizer(model_repo)

    # extract the pre-tokenizer metadata
    pre_tokenizer = hub_tokenizer.get_pre_tokenizer(model_repo)

    # extract the added tokens metadata
    added_tokens = hub_tokenizer.get_added_tokens(model_repo)

    # use the hash to validate the model's vocabulary
    sha256sum = hub_tokenizer.get_tokenizer_json_hash(model_repo)
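    # Illustrative sketch only, not part of the original script: the hash
    # returned above can be re-derived with hashlib to confirm the downloaded
    # tokenizer.json is intact. The on-disk path layout here is an assumption.
    #
    #   import hashlib
    #   tokenizer_json = Path(args.model_path) / model_repo / "tokenizer.json"
    #   assert sha256sum == hashlib.sha256(tokenizer_json.read_bytes()).hexdigest()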