Merge f3bce52986 into db6adb3c88
This commit is contained in: commit 2f25588634
@@ -1144,6 +1144,9 @@ class TextModel(ModelBase):
         if chkhsh == "27949a2493fc4a9f53f5b9b029c82689cfbe5d3a1929bb25e043089e28466de6":
             # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-de
             res = "jina-v2-de"
+        if chkhsh == "c7699093ba4255a91e702aa38a596aa81669f3525dae06c2953267dde580f448":
+            # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-zh
+            res = "jina-v2-zh"
         if chkhsh == "c136ed14d01c2745d4f60a9596ae66800e2b61fa45643e72436041855ad4089d":
             # ref: https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct
             res = "smaug-bpe"

@@ -1296,6 +1299,16 @@ class TextModel(ModelBase):
         special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
         special_vocab.add_to_gguf(self.gguf_writer)

+    def _set_vocab_whitespace(self) -> None:
+        tokens, toktypes, tokpre = self.get_vocab_base()
+        self.gguf_writer.add_tokenizer_model("whitespace")
+        self.gguf_writer.add_tokenizer_pre(tokpre)
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_types(toktypes)
+
+        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
+        special_vocab.add_to_gguf(self.gguf_writer)
+
     def _set_vocab_qwen(self):
         dir_model = self.dir_model
         hparams = self.hparams

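Note: the four writer calls in the new `_set_vocab_whitespace()` helper correspond to the usual GGUF tokenizer metadata keys (key names as defined in gguf-py's `Keys.Tokenizer`). The list below is only for orientation and is not part of the patch:

```python
# GGUF metadata keys written by _set_vocab_whitespace(), for orientation only.
gguf_keys_written = [
    "tokenizer.ggml.model",       # add_tokenizer_model("whitespace")
    "tokenizer.ggml.pre",         # add_tokenizer_pre(tokpre)
    "tokenizer.ggml.tokens",      # add_token_list(tokens)
    "tokenizer.ggml.token_type",  # add_token_types(toktypes)
]
```

Unlike `_set_vocab_gpt2()`, no BPE merges are recorded for this tokenizer model.
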
@@ -7262,7 +7275,17 @@ class JinaBertV2Model(BertModel):
         if tokenizer_class == 'BertTokenizer':
             super().set_vocab()
         elif tokenizer_class == 'RobertaTokenizer':
-            self._set_vocab_gpt2()
+            pre_tokenizer_type = None
+            tokenizer_json_path = self.dir_model / "tokenizer.json"
+            if tokenizer_json_path.is_file():
+                with open(tokenizer_json_path, "r", encoding="utf-8") as f:
+                    tokenizer_json = json.load(f)
+                pre_tokenizer_type = tokenizer_json.get("pre_tokenizer", {}).get("type")
+
+            if pre_tokenizer_type == "Whitespace":
+                self._set_vocab_whitespace()
+            else:
+                self._set_vocab_gpt2()
             self.gguf_writer.add_token_type_count(2)
         else:
             raise NotImplementedError(f'Tokenizer {tokenizer_class} is not supported for JinaBertModel')

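Note: the detection above only inspects the top-level `pre_tokenizer.type` field of `tokenizer.json`. A minimal illustration of the same check on a hand-written fragment (the JSON content here is an assumed example, not taken from any model):

```python
import json

# Hypothetical tokenizer.json fragment; only pre_tokenizer.type matters for the check.
tokenizer_json = json.loads('{"pre_tokenizer": {"type": "Whitespace"}, "model": {"type": "BPE"}}')

pre_tokenizer_type = tokenizer_json.get("pre_tokenizer", {}).get("type")
use_whitespace_vocab = pre_tokenizer_type == "Whitespace"
print(pre_tokenizer_type, use_whitespace_vocab)  # Whitespace True
```
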
@@ -106,6 +106,7 @@ models = [
     {"name": "jina-v2-en", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-en", }, # WPM!
     {"name": "jina-v2-es", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", },
     {"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
+    {"name": "jina-v2-zh", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-zh", },
     {"name": "smaug-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct", },
     {"name": "poro-chat", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Poro-34B-chat", },
     {"name": "jina-v2-code", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-code", },

@@ -254,6 +254,8 @@ class Keys:
         CHAT_TEMPLATE = "tokenizer.chat_template"
         CHAT_TEMPLATE_N = "tokenizer.chat_template.{name}"
         CHAT_TEMPLATES = "tokenizer.chat_templates"
+        # Normalizer constants
+        NORMALIZER_LOWERCASE = "tokenizer.ggml.normalizer.lowercase"
         # FIM/Infill special tokens constants
         FIM_PRE_ID = "tokenizer.ggml.fim_pre_token_id"
         FIM_SUF_ID = "tokenizer.ggml.fim_suf_token_id"

@@ -1075,6 +1075,9 @@ class GGUFWriter:
     def add_eom_token_id(self, id: int) -> None:
         self.add_uint32(Keys.Tokenizer.EOM_ID, id)

+    def add_normalizer_lowercase(self, value: bool) -> None:
+        self.add_bool(Keys.Tokenizer.NORMALIZER_LOWERCASE, value)
+
     def add_classifier_output_labels(self, labels: Sequence[str]) -> None:
         self.add_array(Keys.Classifier.OUTPUT_LABELS.format(arch=self.arch), labels)

@@ -52,6 +52,7 @@ class SpecialVocab:
     add_special_token: dict[str, bool]
     special_token_ids: dict[str, int]
     chat_template: str | Sequence[Mapping[str, str]] | None
+    normalizer_lowercase: bool

     def __init__(
         self, path: str | os.PathLike[str], load_merges: bool = False,

@@ -64,6 +65,7 @@ class SpecialVocab:
         self.load_merges = load_merges
         self.merges = []
         self.chat_template = None
+        self.normalizer_lowercase = False
         if special_token_types is not None:
             self.special_token_types = special_token_types
         else:

@@ -102,6 +104,10 @@ class SpecialVocab:
             if not quiet:
                 logger.info(f'Setting chat_template to {self.chat_template}')
             gw.add_chat_template(self.chat_template)
+        if self.normalizer_lowercase:
+            if not quiet:
+                logger.info('Setting normalizer_lowercase to True')
+            gw.add_normalizer_lowercase(True)

     def _load(self, path: Path) -> None:
         self._try_load_from_tokenizer_json(path)

@@ -146,6 +152,24 @@ class SpecialVocab:
             return
         logger.warning(f'Special token type {typ}, id {tid} out of range, must be under {self.n_vocab} - skipping')

+    def _parse_normalizer(self, normalizer: dict) -> None:
+        # ref: https://huggingface.co/docs/tokenizers/api/normalizers
+        #
+        # Detects lowercase normalization in three possible formats:
+        # 1. Standalone: {"type": "Lowercase"}
+        # 2. BertNormalizer attribute: {"type": "BertNormalizer", "lowercase": true, ...}
+        # 3. Nested in Sequence: {"type": "Sequence", "normalizers": [...]}
+
+        normalizer_type = normalizer.get('type')
+        if normalizer_type == 'Lowercase':
+            self.normalizer_lowercase = True
+        elif normalizer_type == 'BertNormalizer':
+            if normalizer.get('lowercase', True):
+                self.normalizer_lowercase = True
+        elif normalizer_type == 'Sequence':
+            for norm in normalizer.get('normalizers', []):
+                self._parse_normalizer(norm)
+
     def _try_load_from_tokenizer_json(self, path: Path) -> bool:
         tokenizer = None
         tokenizer_file = path / 'tokenizer.json'

@@ -178,6 +202,9 @@ class SpecialVocab:
                         ]
                     else:
                         raise ValueError("Unknown tokenizer merges format")
+            # Parse normalizer configuration
+            if normalizer := tokenizer.get('normalizer'):
+                self._parse_normalizer(normalizer)
             added_tokens = tokenizer.get('added_tokens', {})
         else:
             added_tokens = {}

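Note: `_parse_normalizer()` walks the HF `normalizer` section recursively. A standalone sketch of the same detection logic, runnable on a few sample configs (the sample configs are assumed examples, not taken from a specific model):

```python
def detects_lowercase(normalizer: dict) -> bool:
    # Same three cases as SpecialVocab._parse_normalizer():
    # a standalone Lowercase, a BertNormalizer with lowercase enabled,
    # or either of those nested inside a Sequence.
    ntype = normalizer.get("type")
    if ntype == "Lowercase":
        return True
    if ntype == "BertNormalizer":
        return bool(normalizer.get("lowercase", True))
    if ntype == "Sequence":
        return any(detects_lowercase(n) for n in normalizer.get("normalizers", []))
    return False

print(detects_lowercase({"type": "Lowercase"}))                           # True
print(detects_lowercase({"type": "BertNormalizer", "lowercase": False}))  # False
print(detects_lowercase({"type": "Sequence",
                         "normalizers": [{"type": "NFC"},
                                         {"type": "Lowercase"}]}))        # True
```
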
@@ -291,6 +291,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_TOKENIZER_HF_JSON, "tokenizer.huggingface.json" },
     { LLM_KV_TOKENIZER_RWKV, "tokenizer.rwkv.world" },
     { LLM_KV_TOKENIZER_CHAT_TEMPLATE, "tokenizer.chat_template" },
+    { LLM_KV_TOKENIZER_NORMALIZER_LOWERCASE, "tokenizer.ggml.normalizer.lowercase" },
     { LLM_KV_TOKENIZER_FIM_PRE_ID, "tokenizer.ggml.fim_pre_token_id" },
     { LLM_KV_TOKENIZER_FIM_SUF_ID, "tokenizer.ggml.fim_suf_token_id" },
     { LLM_KV_TOKENIZER_FIM_MID_ID, "tokenizer.ggml.fim_mid_token_id" },

@@ -280,6 +280,7 @@ enum llm_kv {
     LLM_KV_TOKENIZER_HF_JSON,
     LLM_KV_TOKENIZER_RWKV,
     LLM_KV_TOKENIZER_CHAT_TEMPLATE,
+    LLM_KV_TOKENIZER_NORMALIZER_LOWERCASE,
     LLM_KV_TOKENIZER_FIM_PRE_ID,
     LLM_KV_TOKENIZER_FIM_SUF_ID,
     LLM_KV_TOKENIZER_FIM_MID_ID,

@@ -468,6 +468,13 @@ struct llm_tokenizer_bpe : llm_tokenizer {
                     "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?(?:\\p{L}\\p{M}*(?: \\p{L}\\p{M}*)*)+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]?|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+",
                 };
                 break;
+            case LLAMA_VOCAB_PRE_TYPE_JINA_V2_ZH:
+                // ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-zh
+                // whitespace pre-tokenizer
+                regex_exprs = {
+                    "\\S+",
+                };
+                break;
             default:
                 // default regex for BPE tokenization pre-processing
                 regex_exprs = {

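Note: the new `\S+` pre-tokenizer simply keeps each run of non-whitespace characters as one pre-token. A quick illustration in Python (illustrative only, not part of the patch):

```python
import re

text = "Jina embeddings  v2\tbase-zh"
print(re.findall(r"\S+", text))  # ['Jina', 'embeddings', 'v2', 'base-zh']
```
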
@@ -525,7 +532,20 @@ struct llm_tokenizer_bpe_session {

     void tokenize(const std::string & text, std::vector<llama_token> & output) {
         int final_prev_index = -1;
-        const auto word_collection = unicode_regex_split(text, tokenizer.regex_exprs);
+
+        std::string text_normalized;
+        if (vocab.get_apply_lowercase()) {
+            for (uint32_t cpt : unicode_cpts_from_utf8(text)) {
+                text_normalized += unicode_cpt_to_utf8(unicode_tolower(cpt));
+            }
+        } else {
+            text_normalized = text;
+        }
+
+        auto word_collection = unicode_regex_split(text_normalized, tokenizer.regex_exprs);
+        if (vocab.get_use_byte_encoding()) {
+            word_collection = unicode_words_byte_encode(word_collection);
+        }

         symbols_final.clear();

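Note: with `apply_lowercase` set, the session lowercases the input codepoint by codepoint (via `unicode_tolower`) before running the pre-tokenizer regex. A rough Python approximation of the combined flow; here `str.lower()` stands in for the per-codepoint mapping, so edge cases can differ slightly:

```python
import re

def pre_tokenize(text: str, apply_lowercase: bool) -> list[str]:
    # Approximate analogue of the updated tokenize(): optional lowercasing,
    # then regex-based pre-tokenization ("\S+" for the jina-v2-zh preset).
    if apply_lowercase:
        text = "".join(ch.lower() for ch in text)
    return re.findall(r"\S+", text)

print(pre_tokenize("Hello 世界 ÄÖÜ", apply_lowercase=True))  # ['hello', '世界', 'äöü']
```
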
@@ -1598,6 +1618,8 @@ struct llama_vocab::impl {
     bool remove_extra_whitespaces = false;
     bool escape_whitespaces = true;
     bool treat_whitespace_as_suffix = false;
+    bool apply_lowercase = false; // lowercase normalization
+    bool use_byte_encoding = true;

     std::unordered_map<std::string, llama_token> token_to_id;
     std::vector<token_data> id_to_token;

@@ -1747,7 +1769,9 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
             special_mask_id = 103;

             add_sep = true;
-        } else if (tokenizer_model == "gpt2") {
+        } else if (
+                tokenizer_model == "gpt2" ||
+                tokenizer_model == "whitespace") {
             type = LLAMA_VOCAB_TYPE_BPE;

             // read bpe merges and populate bpe ranks

@@ -1782,12 +1806,21 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
            }

            // default special tokens
-           special_bos_id = 11;
-           special_eos_id = 11;
-           special_unk_id = LLAMA_TOKEN_NULL;
-           special_sep_id = LLAMA_TOKEN_NULL;
-           special_pad_id = LLAMA_TOKEN_NULL;
-           special_mask_id = LLAMA_TOKEN_NULL;
+           if (tokenizer_model == "gpt2") {
+               special_bos_id = 11;
+               special_eos_id = 11;
+               special_unk_id = LLAMA_TOKEN_NULL;
+               special_sep_id = LLAMA_TOKEN_NULL;
+               special_pad_id = LLAMA_TOKEN_NULL;
+               special_mask_id = LLAMA_TOKEN_NULL;
+           } else if (tokenizer_model == "whitespace") {
+               special_bos_id = 0; // <s>
+               special_eos_id = 2; // </s>
+               special_unk_id = 3; // <unk>
+               special_sep_id = 2; // </s> (same as eos)
+               special_pad_id = 1; // <pad>
+               special_mask_id = 4; // <mask>
+           }
        } else if (tokenizer_model == "t5") {
            type = LLAMA_VOCAB_TYPE_UGM;

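Note: the fixed special-token ids the loader now assumes for the "whitespace" tokenizer model, collected from the branch above purely for reference:

```python
# Special-token ids assumed when tokenizer.ggml.model == "whitespace"
WHITESPACE_SPECIAL_IDS = {
    "bos":  0,  # <s>
    "pad":  1,  # <pad>
    "eos":  2,  # </s>
    "unk":  3,  # <unk>
    "sep":  2,  # </s> (same as eos)
    "mask": 4,  # <mask>
}
```
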
@@ -2048,6 +2081,13 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                     tokenizer_pre == "solar-open") {
                 pre_type = LLAMA_VOCAB_PRE_TYPE_SOLAR_OPEN;
                 clean_spaces = false;
+            } else if (
+                    tokenizer_pre == "jina-v2-zh") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_JINA_V2_ZH;
+                clean_spaces = true;
+                add_bos = true;
+                add_sep = true;
+                use_byte_encoding = false;
             } else {
                 throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
             }

@@ -2078,8 +2118,9 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
             pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
         }

-        ml.get_key(LLM_KV_TOKENIZER_ADD_PREFIX,      add_space_prefix,         false);
-        ml.get_key(LLM_KV_TOKENIZER_REMOVE_EXTRA_WS, remove_extra_whitespaces, false);
+        ml.get_key(LLM_KV_TOKENIZER_ADD_PREFIX,           add_space_prefix,         false);
+        ml.get_key(LLM_KV_TOKENIZER_REMOVE_EXTRA_WS,      remove_extra_whitespaces, false);
+        ml.get_key(LLM_KV_TOKENIZER_NORMALIZER_LOWERCASE, apply_lowercase,          true);
     }

     const int token_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_LIST).c_str());

@@ -3157,6 +3198,9 @@ int32_t llama_vocab::impl::token_to_piece(llama_token token, char * buf, int32_t
                     return _try_copy(token_text.data(), token_text.size());
                 }
                 if (attr & LLAMA_TOKEN_ATTR_NORMAL) {
+                    if (!use_byte_encoding) {
+                        return _try_copy(token_text.data(), token_text.size());
+                    }
                     std::string result = llama_decode_text(token_text);
                     return _try_copy(result.data(), result.size());
                 }

@@ -3581,6 +3625,14 @@ bool llama_vocab::get_treat_whitespace_as_suffix() const {
     return pimpl->treat_whitespace_as_suffix;
 }

+bool llama_vocab::get_apply_lowercase() const {
+    return pimpl->apply_lowercase;
+}
+
+bool llama_vocab::get_use_byte_encoding() const {
+    return pimpl->use_byte_encoding;
+}
+
 int llama_vocab::max_token_len() const {
     return pimpl->max_token_len;
 }

@@ -54,6 +54,7 @@ enum llama_vocab_pre_type {
     LLAMA_VOCAB_PRE_TYPE_SOLAR_OPEN = 43,
     LLAMA_VOCAB_PRE_TYPE_YOUTU = 44,
     LLAMA_VOCAB_PRE_TYPE_EXAONE_MOE = 45,
+    LLAMA_VOCAB_PRE_TYPE_JINA_V2_ZH = 46,
 };

 struct LLM_KV;

@@ -131,6 +132,8 @@ struct llama_vocab {
     bool get_remove_extra_whitespaces () const;
     bool get_escape_whitespaces () const;
     bool get_treat_whitespace_as_suffix() const;
+    bool get_apply_lowercase () const;
+    bool get_use_byte_encoding () const;

     int max_token_len() const;

@@ -220,23 +220,6 @@ static inline std::wstring unicode_wstring_from_utf8(const std::string & s) {
     return conv.from_bytes(s);
 }

-static std::vector<std::string> unicode_byte_encoding_process(const std::vector<std::string> & bpe_words) {
-    std::vector<std::string> bpe_encoded_words;
-    for (const auto & word : bpe_words) {
-        std::string text_utf;
-        auto utf_word = unicode_cpts_from_utf8(word);
-        for (size_t i = 0; i < utf_word.size(); ++i) {
-            text_utf += unicode_cpt_to_utf8(utf_word[i]);
-        }
-
-        std::string encoded_token;
-        for (char & c : text_utf) {
-            encoded_token += unicode_byte_to_utf8(c);
-        }
-        bpe_encoded_words.emplace_back(encoded_token);
-    }
-    return bpe_encoded_words;
-}

 // GPT2 system regex: 's|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+
 static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & text, const std::vector<size_t> & offsets) {

@@ -933,6 +916,24 @@ bool unicode_cpt_is_han(uint32_t cpt) {
     return false;
 }

+std::vector<std::string> unicode_words_byte_encode(const std::vector<std::string> & bpe_words) {
+    std::vector<std::string> bpe_encoded_words;
+    for (const auto & word : bpe_words) {
+        std::string text_utf;
+        auto utf_word = unicode_cpts_from_utf8(word);
+        for (size_t i = 0; i < utf_word.size(); ++i) {
+            text_utf += unicode_cpt_to_utf8(utf_word[i]);
+        }
+
+        std::string encoded_token;
+        for (char & c : text_utf) {
+            encoded_token += unicode_byte_to_utf8(c);
+        }
+        bpe_encoded_words.emplace_back(encoded_token);
+    }
+    return bpe_encoded_words;
+}
+
 std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs) {
     // unicode categories
     static const std::map<std::string, int> k_ucat_enum = {

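Note: the byte-encoding step applies the usual GPT-2 byte-to-unicode table to each pre-token. For readers unfamiliar with that scheme, here is a self-contained Python sketch of the generic table and per-word mapping; it illustrates the idea only and is not llama.cpp code:

```python
def bytes_to_unicode() -> dict[int, str]:
    # Standard GPT-2 byte encoder: printable/latin-1 bytes map to themselves,
    # the remaining byte values are shifted into unused codepoints >= 256.
    bs = (list(range(ord("!"), ord("~") + 1)) +
          list(range(ord("¡"), ord("¬") + 1)) +
          list(range(ord("®"), ord("ÿ") + 1)))
    cs = bs[:]
    n = 0
    for b in range(256):
        if b not in bs:
            bs.append(b)
            cs.append(256 + n)
            n += 1
    return dict(zip(bs, map(chr, cs)))

def byte_encode_word(word: str) -> str:
    # Rough analogue of byte-encoding a single pre-token:
    # UTF-8 encode, then map every byte through the table.
    table = bytes_to_unicode()
    return "".join(table[b] for b in word.encode("utf-8"))

print(byte_encode_word("héllo"))  # hÃ©llo
```

With `use_byte_encoding == false` (the new whitespace path), this step is skipped and pre-tokens are matched against the vocabulary as raw UTF-8.
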
@@ -1120,5 +1121,5 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
         start += offset;
     }

-    return unicode_byte_encoding_process(bpe_words);
+    return bpe_words;
 }

@@ -108,4 +108,6 @@ uint32_t unicode_tolower(uint32_t cpt);

 bool unicode_cpt_is_han(uint32_t cpt);

+std::vector<std::string> unicode_words_byte_encode(const std::vector<std::string> & bpe_words);
+
 std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs);