vocab: add tokenizer support for jina-embeddings-v2-base-zh
This commit is contained in:
parent
7d587e5544
commit
22e85fcf11
|
|
@ -1138,6 +1138,9 @@ class TextModel(ModelBase):
|
||||||
if chkhsh == "27949a2493fc4a9f53f5b9b029c82689cfbe5d3a1929bb25e043089e28466de6":
|
if chkhsh == "27949a2493fc4a9f53f5b9b029c82689cfbe5d3a1929bb25e043089e28466de6":
|
||||||
# ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-de
|
# ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-de
|
||||||
res = "jina-v2-de"
|
res = "jina-v2-de"
|
||||||
|
if chkhsh == "c7699093ba4255a91e702aa38a596aa81669f3525dae06c2953267dde580f448":
|
||||||
|
# ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-zh
|
||||||
|
res = "jina-v2-zh"
|
||||||
if chkhsh == "c136ed14d01c2745d4f60a9596ae66800e2b61fa45643e72436041855ad4089d":
|
if chkhsh == "c136ed14d01c2745d4f60a9596ae66800e2b61fa45643e72436041855ad4089d":
|
||||||
# ref: https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct
|
# ref: https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct
|
||||||
res = "smaug-bpe"
|
res = "smaug-bpe"
|
||||||
|
|
|
||||||
|
|
@ -106,6 +106,7 @@ models = [
|
||||||
{"name": "jina-v2-en", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-en", }, # WPM!
|
{"name": "jina-v2-en", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-en", }, # WPM!
|
||||||
{"name": "jina-v2-es", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", },
|
{"name": "jina-v2-es", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", },
|
||||||
{"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
|
{"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
|
||||||
|
{"name": "jina-v2-zh", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-zh", },
|
||||||
{"name": "smaug-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct", },
|
{"name": "smaug-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct", },
|
||||||
{"name": "poro-chat", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Poro-34B-chat", },
|
{"name": "poro-chat", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Poro-34B-chat", },
|
||||||
{"name": "jina-v2-code", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-code", },
|
{"name": "jina-v2-code", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-code", },
|
||||||
|
|
|
||||||
|
|
@ -466,6 +466,11 @@ struct llm_tokenizer_bpe : llm_tokenizer {
|
||||||
// original regex from tokenizer.json
|
// original regex from tokenizer.json
|
||||||
// "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?(?:\\p{L}\\p{M}*(?: \\p{L}\\p{M}*)*)+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]?|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+"
|
// "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?(?:\\p{L}\\p{M}*(?: \\p{L}\\p{M}*)*)+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]?|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+"
|
||||||
"(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?(?:\\p{L}\\p{M}*(?: \\p{L}\\p{M}*)*)+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]?|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+",
|
"(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?(?:\\p{L}\\p{M}*(?: \\p{L}\\p{M}*)*)+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]?|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+",
|
||||||
|
case LLAMA_VOCAB_PRE_TYPE_JINA_V2_ZH:
|
||||||
|
// ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-zh
|
||||||
|
// whitespace pre-tokenizer
|
||||||
|
regex_exprs = {
|
||||||
|
"\\S+",
|
||||||
};
|
};
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
|
|
@ -525,7 +530,20 @@ struct llm_tokenizer_bpe_session {
|
||||||
|
|
||||||
void tokenize(const std::string & text, std::vector<llama_token> & output) {
|
void tokenize(const std::string & text, std::vector<llama_token> & output) {
|
||||||
int final_prev_index = -1;
|
int final_prev_index = -1;
|
||||||
const auto word_collection = unicode_regex_split(text, tokenizer.regex_exprs);
|
|
||||||
|
std::string text_normalized;
|
||||||
|
if (vocab.get_apply_lowercase()) {
|
||||||
|
for (uint32_t cpt : unicode_cpts_from_utf8(text)) {
|
||||||
|
text_normalized += unicode_cpt_to_utf8(unicode_tolower(cpt));
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
text_normalized = text;
|
||||||
|
}
|
||||||
|
|
||||||
|
auto word_collection = unicode_regex_split(text_normalized, tokenizer.regex_exprs);
|
||||||
|
if (vocab.get_use_byte_encoding()) {
|
||||||
|
word_collection = unicode_words_byte_encode(word_collection);
|
||||||
|
}
|
||||||
|
|
||||||
symbols_final.clear();
|
symbols_final.clear();
|
||||||
|
|
||||||
|
|
@ -1598,6 +1616,8 @@ struct llama_vocab::impl {
|
||||||
bool remove_extra_whitespaces = false;
|
bool remove_extra_whitespaces = false;
|
||||||
bool escape_whitespaces = true;
|
bool escape_whitespaces = true;
|
||||||
bool treat_whitespace_as_suffix = false;
|
bool treat_whitespace_as_suffix = false;
|
||||||
|
bool apply_lowercase = false; // lowercase normalization
|
||||||
|
bool use_byte_encoding = true; // GPT-2 byte encoding for BPE vocab
|
||||||
|
|
||||||
std::unordered_map<std::string, llama_token> token_to_id;
|
std::unordered_map<std::string, llama_token> token_to_id;
|
||||||
std::vector<token_data> id_to_token;
|
std::vector<token_data> id_to_token;
|
||||||
|
|
@ -2041,6 +2061,14 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
||||||
tokenizer_pre == "solar-open") {
|
tokenizer_pre == "solar-open") {
|
||||||
pre_type = LLAMA_VOCAB_PRE_TYPE_SOLAR_OPEN;
|
pre_type = LLAMA_VOCAB_PRE_TYPE_SOLAR_OPEN;
|
||||||
clean_spaces = false;
|
clean_spaces = false;
|
||||||
|
} else if (
|
||||||
|
tokenizer_pre == "jina-v2-zh") {
|
||||||
|
pre_type = LLAMA_VOCAB_PRE_TYPE_JINA_V2_ZH;
|
||||||
|
clean_spaces = true;
|
||||||
|
add_bos = true;
|
||||||
|
add_sep = true;
|
||||||
|
apply_lowercase = true;
|
||||||
|
use_byte_encoding = false;
|
||||||
} else {
|
} else {
|
||||||
throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
|
throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
|
||||||
}
|
}
|
||||||
|
|
@ -3143,6 +3171,9 @@ int32_t llama_vocab::impl::token_to_piece(llama_token token, char * buf, int32_t
|
||||||
return _try_copy(token_text.data(), token_text.size());
|
return _try_copy(token_text.data(), token_text.size());
|
||||||
}
|
}
|
||||||
if (attr & LLAMA_TOKEN_ATTR_NORMAL) {
|
if (attr & LLAMA_TOKEN_ATTR_NORMAL) {
|
||||||
|
if (!use_byte_encoding) {
|
||||||
|
return _try_copy(token_text.data(), token_text.size());
|
||||||
|
}
|
||||||
std::string result = llama_decode_text(token_text);
|
std::string result = llama_decode_text(token_text);
|
||||||
return _try_copy(result.data(), result.size());
|
return _try_copy(result.data(), result.size());
|
||||||
}
|
}
|
||||||
|
|
@ -3567,6 +3598,14 @@ bool llama_vocab::get_treat_whitespace_as_suffix() const {
|
||||||
return pimpl->treat_whitespace_as_suffix;
|
return pimpl->treat_whitespace_as_suffix;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Returns the apply_lowercase flag held by the pimpl implementation object.
// The BPE session's tokenize() checks this flag and, when it is set, lowercases
// every code point of the input (unicode_tolower) before the regex split.
bool llama_vocab::get_apply_lowercase() const {
|
||||||
|
return pimpl->apply_lowercase;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Returns the use_byte_encoding flag held by the pimpl implementation object.
// When set, the BPE tokenize() path runs unicode_words_byte_encode() on the split words;
// when clear, token_to_piece() copies a NORMAL token's text as-is instead of
// passing it through llama_decode_text().
bool llama_vocab::get_use_byte_encoding() const {
|
||||||
|
return pimpl->use_byte_encoding;
|
||||||
|
}
|
||||||
|
|
||||||
// Accessor: returns the max_token_len value stored in the pimpl implementation object.
int llama_vocab::max_token_len() const {
|
int llama_vocab::max_token_len() const {
|
||||||
return pimpl->max_token_len;
|
return pimpl->max_token_len;
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -54,6 +54,7 @@ enum llama_vocab_pre_type {
|
||||||
LLAMA_VOCAB_PRE_TYPE_SOLAR_OPEN = 43,
|
LLAMA_VOCAB_PRE_TYPE_SOLAR_OPEN = 43,
|
||||||
LLAMA_VOCAB_PRE_TYPE_YOUTU = 44,
|
LLAMA_VOCAB_PRE_TYPE_YOUTU = 44,
|
||||||
LLAMA_VOCAB_PRE_TYPE_EXAONE_MOE = 45,
|
LLAMA_VOCAB_PRE_TYPE_EXAONE_MOE = 45,
|
||||||
|
LLAMA_VOCAB_PRE_TYPE_JINA_V2_ZH = 46,
|
||||||
};
|
};
|
||||||
|
|
||||||
struct LLM_KV;
|
struct LLM_KV;
|
||||||
|
|
@ -131,6 +132,8 @@ struct llama_vocab {
|
||||||
bool get_remove_extra_whitespaces () const;
|
bool get_remove_extra_whitespaces () const;
|
||||||
bool get_escape_whitespaces () const;
|
bool get_escape_whitespaces () const;
|
||||||
bool get_treat_whitespace_as_suffix() const;
|
bool get_treat_whitespace_as_suffix() const;
|
||||||
|
bool get_apply_lowercase () const;
|
||||||
|
bool get_use_byte_encoding () const;
|
||||||
|
|
||||||
int max_token_len() const;
|
int max_token_len() const;
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -220,23 +220,6 @@ static inline std::wstring unicode_wstring_from_utf8(const std::string & s) {
|
||||||
return conv.from_bytes(s);
|
return conv.from_bytes(s);
|
||||||
}
|
}
|
||||||
|
|
||||||
// File-local helper: produces one output string per entry of `bpe_words`, in order.
// Each word is decoded to code points and re-encoded to UTF-8, then every byte of
// that text is passed through unicode_byte_to_utf8() and the results are concatenated.
// NOTE(review): presumably unicode_byte_to_utf8 is the GPT-2 byte-to-unicode mapping - confirm at its definition.
static std::vector<std::string> unicode_byte_encoding_process(const std::vector<std::string> & bpe_words) {
|
|
||||||
std::vector<std::string> bpe_encoded_words;
|
|
||||||
for (const auto & word : bpe_words) {
|
|
||||||
std::string text_utf;
|
|
||||||
auto utf_word = unicode_cpts_from_utf8(word);
|
|
||||||
for (size_t i = 0; i < utf_word.size(); ++i) {
|
|
||||||
text_utf += unicode_cpt_to_utf8(utf_word[i]);
|
|
||||||
}
|
|
||||||
|
|
||||||
std::string encoded_token;
|
|
||||||
for (char & c : text_utf) {
|
|
||||||
encoded_token += unicode_byte_to_utf8(c);
|
|
||||||
}
|
|
||||||
bpe_encoded_words.emplace_back(encoded_token);
|
|
||||||
}
|
|
||||||
return bpe_encoded_words;
|
|
||||||
}
|
|
||||||
|
|
||||||
// GPT2 system regex: 's|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+
|
// GPT2 system regex: 's|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+
|
||||||
static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & text, const std::vector<size_t> & offsets) {
|
static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & text, const std::vector<size_t> & offsets) {
|
||||||
|
|
@ -956,6 +939,24 @@ bool unicode_cpt_is_han(uint32_t cpt) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
std::vector<std::string> unicode_words_byte_encode(const std::vector<std::string> & bpe_words) {
|
||||||
|
std::vector<std::string> bpe_encoded_words;
|
||||||
|
for (const auto & word : bpe_words) {
|
||||||
|
std::string text_utf;
|
||||||
|
auto utf_word = unicode_cpts_from_utf8(word);
|
||||||
|
for (size_t i = 0; i < utf_word.size(); ++i) {
|
||||||
|
text_utf += unicode_cpt_to_utf8(utf_word[i]);
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string encoded_token;
|
||||||
|
for (char & c : text_utf) {
|
||||||
|
encoded_token += unicode_byte_to_utf8(c);
|
||||||
|
}
|
||||||
|
bpe_encoded_words.emplace_back(encoded_token);
|
||||||
|
}
|
||||||
|
return bpe_encoded_words;
|
||||||
|
}
|
||||||
|
|
||||||
std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs) {
|
std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs) {
|
||||||
// unicode categories
|
// unicode categories
|
||||||
static const std::map<std::string, int> k_ucat_enum = {
|
static const std::map<std::string, int> k_ucat_enum = {
|
||||||
|
|
@ -1143,5 +1144,5 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
|
||||||
start += offset;
|
start += offset;
|
||||||
}
|
}
|
||||||
|
|
||||||
return unicode_byte_encoding_process(bpe_words);
|
return bpe_words;
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -108,4 +108,6 @@ uint32_t unicode_tolower(uint32_t cpt);
|
||||||
|
|
||||||
bool unicode_cpt_is_han(uint32_t cpt);
|
bool unicode_cpt_is_han(uint32_t cpt);
|
||||||
|
|
||||||
|
std::vector<std::string> unicode_words_byte_encode(const std::vector<std::string> & bpe_words);
|
||||||
|
|
||||||
std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs);
|
std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs);
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue