Add tokenizer support for jinaai/jina-embeddings-v2-base-zh ("jina-v2-zh").

  * convert_hf_to_gguf.py / convert_hf_to_gguf_update.py: register the tokenizer
    checksum and the model entry.
  * src/llama-vocab.{h,cpp}: new pre-type LLAMA_VOCAB_PRE_TYPE_JINA_V2_ZH (splits on
    "\\S+") and two per-vocab flags, apply_lowercase and use_byte_encoding.
  * src/unicode.{h,cpp}: unicode_regex_split() no longer byte-encodes its result; that
    step is exposed as unicode_words_byte_encode() and is applied by the BPE session
    only when use_byte_encoding is set.

Changes relative to the patch as received:
  * llama-vocab.cpp, hunk at -466: the new case was inserted before the previous case's
    closing "};" / "break;", leaving that initializer list unterminated. The two lines
    are now added ahead of the new case label (hunk counts and later offsets updated).
  * llama-vocab.cpp, tokenize(): the input string is no longer copied when lowercasing
    is disabled; a const reference selects between the lowered buffer and the input.
  * Short comments added on the new helper, its declaration and the new branches.

Review notes (confirm before applying):
  * Template argument lists appear to have been stripped from this copy of the patch
    (for example "std::vector & output"); tokens are kept exactly as received.
  * Line breaks, indentation and blank context lines were reconstructed from a
    whitespace-collapsed copy; apply with whitespace tolerance or re-diff on the tree.
  * The "index" blob hashes are carried over unchanged and do not reflect these edits.

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index be83e3108e..37c350067a 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -1138,6 +1138,9 @@ class TextModel(ModelBase):
         if chkhsh == "27949a2493fc4a9f53f5b9b029c82689cfbe5d3a1929bb25e043089e28466de6":
             # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-de
             res = "jina-v2-de"
+        if chkhsh == "c7699093ba4255a91e702aa38a596aa81669f3525dae06c2953267dde580f448":
+            # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-zh
+            res = "jina-v2-zh"
         if chkhsh == "c136ed14d01c2745d4f60a9596ae66800e2b61fa45643e72436041855ad4089d":
             # ref: https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct
             res = "smaug-bpe"
diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py
index aa9843ea17..f3f4647e8a 100755
--- a/convert_hf_to_gguf_update.py
+++ b/convert_hf_to_gguf_update.py
@@ -106,6 +106,7 @@ models = [
     {"name": "jina-v2-en", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-en", }, # WPM!
     {"name": "jina-v2-es", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", },
     {"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
+    {"name": "jina-v2-zh", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-zh", },
     {"name": "smaug-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct", },
     {"name": "poro-chat", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Poro-34B-chat", },
     {"name": "jina-v2-code", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-code", },
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index a23950d007..61f95bb230 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -466,6 +466,13 @@ struct llm_tokenizer_bpe : llm_tokenizer {
                     // original regex from tokenizer.json
                     // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?(?:\\p{L}\\p{M}*(?: \\p{L}\\p{M}*)*)+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]?|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+"
                     "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?(?:\\p{L}\\p{M}*(?: \\p{L}\\p{M}*)*)+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]?|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+",
+                };
+                break;
+            case LLAMA_VOCAB_PRE_TYPE_JINA_V2_ZH:
+                // ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-zh
+                // whitespace pre-tokenizer
+                regex_exprs = {
+                    "\\S+",
                 };
                 break;
             default:
@@ -525,7 +532,21 @@ struct llm_tokenizer_bpe_session {
 
     void tokenize(const std::string & text, std::vector & output) {
         int final_prev_index = -1;
-        const auto word_collection = unicode_regex_split(text, tokenizer.regex_exprs);
+
+        // lowercase only when the vocab asks for it; otherwise refer to the input directly (no copy)
+        std::string text_lowered;
+        if (vocab.get_apply_lowercase()) {
+            for (uint32_t cpt : unicode_cpts_from_utf8(text)) {
+                text_lowered += unicode_cpt_to_utf8(unicode_tolower(cpt));
+            }
+        }
+        const std::string & text_normalized = vocab.get_apply_lowercase() ? text_lowered : text;
+
+        // unicode_regex_split() returns the split words as-is; byte-encode them only for vocabs that use it
+        auto word_collection = unicode_regex_split(text_normalized, tokenizer.regex_exprs);
+        if (vocab.get_use_byte_encoding()) {
+            word_collection = unicode_words_byte_encode(word_collection);
+        }
 
         symbols_final.clear();
 
@@ -1598,6 +1619,8 @@ struct llama_vocab::impl {
     bool remove_extra_whitespaces = false;
     bool escape_whitespaces = true;
     bool treat_whitespace_as_suffix = false;
+    bool apply_lowercase = false; // lowercase normalization
+    bool use_byte_encoding = true; // GPT-2 byte encoding for BPE vocab
 
     std::unordered_map token_to_id;
     std::vector id_to_token;
@@ -2041,6 +2064,14 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                     tokenizer_pre == "solar-open") {
                 pre_type = LLAMA_VOCAB_PRE_TYPE_SOLAR_OPEN;
                 clean_spaces = false;
+            } else if (
+                    tokenizer_pre == "jina-v2-zh") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_JINA_V2_ZH;
+                clean_spaces = true;
+                add_bos = true;
+                add_sep = true;
+                apply_lowercase = true;
+                use_byte_encoding = false;
             } else {
                 throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
             }
@@ -3143,6 +3174,10 @@ int32_t llama_vocab::impl::token_to_piece(llama_token token, char * buf, int32_t
                     return _try_copy(token_text.data(), token_text.size());
                 }
                 if (attr & LLAMA_TOKEN_ATTR_NORMAL) {
+                    // vocab is not byte-encoded: return the token text as-is instead of byte-decoding it
+                    if (!use_byte_encoding) {
+                        return _try_copy(token_text.data(), token_text.size());
+                    }
                     std::string result = llama_decode_text(token_text);
                     return _try_copy(result.data(), result.size());
                 }
@@ -3567,6 +3602,14 @@ bool llama_vocab::get_treat_whitespace_as_suffix() const {
     return pimpl->treat_whitespace_as_suffix;
 }
 
+bool llama_vocab::get_apply_lowercase() const {
+    return pimpl->apply_lowercase;
+}
+
+bool llama_vocab::get_use_byte_encoding() const {
+    return pimpl->use_byte_encoding;
+}
+
 int llama_vocab::max_token_len() const {
     return pimpl->max_token_len;
 }
diff --git a/src/llama-vocab.h b/src/llama-vocab.h
index 28c3a82b91..120188e13e 100644
--- a/src/llama-vocab.h
+++ b/src/llama-vocab.h
@@ -54,6 +54,7 @@ enum llama_vocab_pre_type {
     LLAMA_VOCAB_PRE_TYPE_SOLAR_OPEN = 43,
     LLAMA_VOCAB_PRE_TYPE_YOUTU = 44,
     LLAMA_VOCAB_PRE_TYPE_EXAONE_MOE = 45,
+    LLAMA_VOCAB_PRE_TYPE_JINA_V2_ZH = 46,
 };
 
 struct LLM_KV;
@@ -131,6 +132,8 @@ struct llama_vocab {
     bool get_remove_extra_whitespaces () const;
     bool get_escape_whitespaces       () const;
     bool get_treat_whitespace_as_suffix() const;
+    bool get_apply_lowercase          () const;
+    bool get_use_byte_encoding        () const;
 
     int max_token_len() const;
 
diff --git a/src/unicode.cpp b/src/unicode.cpp
index b47dcbe619..a2f3a1f12c 100644
--- a/src/unicode.cpp
+++ b/src/unicode.cpp
@@ -220,23 +220,6 @@ static inline std::wstring unicode_wstring_from_utf8(const std::string & s) {
     return conv.from_bytes(s);
 }
 
-static std::vector unicode_byte_encoding_process(const std::vector & bpe_words) {
-    std::vector bpe_encoded_words;
-    for (const auto & word : bpe_words) {
-        std::string text_utf;
-        auto utf_word = unicode_cpts_from_utf8(word);
-        for (size_t i = 0; i < utf_word.size(); ++i) {
-            text_utf += unicode_cpt_to_utf8(utf_word[i]);
-        }
-
-        std::string encoded_token;
-        for (char & c : text_utf) {
-            encoded_token += unicode_byte_to_utf8(c);
-        }
-        bpe_encoded_words.emplace_back(encoded_token);
-    }
-    return bpe_encoded_words;
-}
 
 // GPT2 system regex: 's|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+
 static std::vector unicode_regex_split_custom_gpt2(const std::string & text, const std::vector & offsets) {
@@ -956,6 +939,25 @@ bool unicode_cpt_is_han(uint32_t cpt) {
     return false;
 }
 
+// maps every byte of each word through unicode_byte_to_utf8() (the byte encoding step formerly done inside unicode_regex_split)
+std::vector unicode_words_byte_encode(const std::vector & bpe_words) {
+    std::vector bpe_encoded_words;
+    for (const auto & word : bpe_words) {
+        std::string text_utf;
+        auto utf_word = unicode_cpts_from_utf8(word);
+        for (size_t i = 0; i < utf_word.size(); ++i) {
+            text_utf += unicode_cpt_to_utf8(utf_word[i]);
+        }
+
+        std::string encoded_token;
+        for (char & c : text_utf) {
+            encoded_token += unicode_byte_to_utf8(c);
+        }
+        bpe_encoded_words.emplace_back(encoded_token);
+    }
+    return bpe_encoded_words;
+}
+
 std::vector unicode_regex_split(const std::string & text, const std::vector & regex_exprs) {
     // unicode categories
     static const std::map k_ucat_enum = {
@@ -1143,5 +1145,5 @@ std::vector unicode_regex_split(const std::string & text, const std
         start += offset;
     }
 
-    return unicode_byte_encoding_process(bpe_words);
+    return bpe_words;
 }
diff --git a/src/unicode.h b/src/unicode.h
index 5bd1362ff4..9bf00a8c79 100644
--- a/src/unicode.h
+++ b/src/unicode.h
@@ -108,4 +108,7 @@ uint32_t unicode_tolower(uint32_t cpt);
 
 bool unicode_cpt_is_han(uint32_t cpt);
 
+// byte-encode each word with unicode_byte_to_utf8(); unicode_regex_split() no longer does this itself
+std::vector unicode_words_byte_encode(const std::vector & bpe_words);
+
 std::vector unicode_regex_split(const std::string & text, const std::vector & regex_exprs);