From 22e85fcf1180e05fc63f7e8c1cf6f1a4b6babd4a Mon Sep 17 00:00:00 2001
From: o7si
Date: Sun, 11 Jan 2026 19:43:24 +0800
Subject: [PATCH 1/4] vocab: add tokenizer support for jina-embeddings-v2-base-zh

---
 convert_hf_to_gguf.py        |  3 +++
 convert_hf_to_gguf_update.py |  1 +
 src/llama-vocab.cpp          | 41 +++++++++++++++++++++++++++++++++++-
 src/llama-vocab.h            |  3 +++
 src/unicode.cpp              | 37 ++++++++++++++++++------------------
 src/unicode.h                |  2 ++
 6 files changed, 68 insertions(+), 19 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index be83e3108e..37c350067a 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -1138,6 +1138,9 @@ class TextModel(ModelBase):
         if chkhsh == "27949a2493fc4a9f53f5b9b029c82689cfbe5d3a1929bb25e043089e28466de6":
             # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-de
             res = "jina-v2-de"
+        if chkhsh == "c7699093ba4255a91e702aa38a596aa81669f3525dae06c2953267dde580f448":
+            # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-zh
+            res = "jina-v2-zh"
         if chkhsh == "c136ed14d01c2745d4f60a9596ae66800e2b61fa45643e72436041855ad4089d":
             # ref: https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct
             res = "smaug-bpe"
diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py
index aa9843ea17..f3f4647e8a 100755
--- a/convert_hf_to_gguf_update.py
+++ b/convert_hf_to_gguf_update.py
@@ -106,6 +106,7 @@ models = [
    {"name": "jina-v2-en",   "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-en", },  # WPM!
    {"name": "jina-v2-es",   "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", },
    {"name": "jina-v2-de",   "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
+   {"name": "jina-v2-zh",   "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-zh", },
    {"name": "smaug-bpe",    "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct", },
    {"name": "poro-chat",    "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Poro-34B-chat", },
    {"name": "jina-v2-code", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-code", },
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index a23950d007..61f95bb230 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -466,6 +466,11 @@ struct llm_tokenizer_bpe : llm_tokenizer {
                     // original regex from tokenizer.json
                     // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?(?:\\p{L}\\p{M}*(?: \\p{L}\\p{M}*)*)+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]?|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+"
                     "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?(?:\\p{L}\\p{M}*(?: \\p{L}\\p{M}*)*)+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]?|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+",
+            case LLAMA_VOCAB_PRE_TYPE_JINA_V2_ZH:
+                // ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-zh
+                // whitespace pre-tokenizer
+                regex_exprs = {
+                    "\\S+",
                 };
                 break;
             default:
@@ -525,7 +530,20 @@ struct llm_tokenizer_bpe_session {

     void tokenize(const std::string & text, std::vector<llama_token> & output) {
         int final_prev_index = -1;
-        const auto word_collection = unicode_regex_split(text, tokenizer.regex_exprs);
+
+        std::string text_normalized;
+        if (vocab.get_apply_lowercase()) {
+            for (uint32_t cpt : unicode_cpts_from_utf8(text)) {
+                text_normalized += unicode_cpt_to_utf8(unicode_tolower(cpt));
+            }
+        } else {
+            text_normalized = text;
+        }
+
+        auto word_collection = unicode_regex_split(text_normalized, tokenizer.regex_exprs);
+        if (vocab.get_use_byte_encoding()) {
+            word_collection = unicode_words_byte_encode(word_collection);
+        }

         symbols_final.clear();

@@ -1598,6 +1616,8 @@ struct llama_vocab::impl {
     bool remove_extra_whitespaces   = false;
     bool escape_whitespaces         = true;
     bool treat_whitespace_as_suffix = false;
+    bool apply_lowercase            = false; // lowercase normalization
+    bool use_byte_encoding          = true;  // GPT-2 byte encoding for BPE vocab

     std::unordered_map<std::string, llama_token> token_to_id;
     std::vector<token_data>                      id_to_token;
@@ -2041,6 +2061,14 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                 tokenizer_pre == "solar-open") {
                 pre_type = LLAMA_VOCAB_PRE_TYPE_SOLAR_OPEN;
                 clean_spaces = false;
+            } else if (
+                tokenizer_pre == "jina-v2-zh") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_JINA_V2_ZH;
+                clean_spaces = true;
+                add_bos = true;
+                add_sep = true;
+                apply_lowercase = true;
+                use_byte_encoding = false;
             } else {
                 throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
             }
@@ -3143,6 +3171,9 @@ int32_t llama_vocab::impl::token_to_piece(llama_token token, char * buf, int32_t
             return _try_copy(token_text.data(), token_text.size());
         }
         if (attr & LLAMA_TOKEN_ATTR_NORMAL) {
+            if (!use_byte_encoding) {
+                return _try_copy(token_text.data(), token_text.size());
+            }
             std::string result = llama_decode_text(token_text);
             return _try_copy(result.data(), result.size());
         }
@@ -3567,6 +3598,14 @@ bool llama_vocab::get_treat_whitespace_as_suffix() const {
     return pimpl->treat_whitespace_as_suffix;
 }

+bool llama_vocab::get_apply_lowercase() const {
+    return pimpl->apply_lowercase;
+}
+
+bool llama_vocab::get_use_byte_encoding() const {
+    return pimpl->use_byte_encoding;
+}
+
 int llama_vocab::max_token_len() const {
     return pimpl->max_token_len;
 }
diff --git a/src/llama-vocab.h b/src/llama-vocab.h
index 28c3a82b91..120188e13e 100644
--- a/src/llama-vocab.h
+++ b/src/llama-vocab.h
@@ -54,6 +54,7 @@ enum llama_vocab_pre_type {
     LLAMA_VOCAB_PRE_TYPE_SOLAR_OPEN = 43,
     LLAMA_VOCAB_PRE_TYPE_YOUTU      = 44,
     LLAMA_VOCAB_PRE_TYPE_EXAONE_MOE = 45,
+    LLAMA_VOCAB_PRE_TYPE_JINA_V2_ZH = 46,
 };

 struct LLM_KV;
@@ -131,6 +132,8 @@ struct llama_vocab {
     bool get_remove_extra_whitespaces  () const;
     bool get_escape_whitespaces        () const;
     bool get_treat_whitespace_as_suffix() const;
+    bool get_apply_lowercase           () const;
+    bool get_use_byte_encoding         () const;

     int max_token_len() const;
diff --git a/src/unicode.cpp b/src/unicode.cpp
index b47dcbe619..a2f3a1f12c 100644
--- a/src/unicode.cpp
+++ b/src/unicode.cpp
@@ -220,23 +220,6 @@ static inline std::wstring unicode_wstring_from_utf8(const std::string & s) {
     return conv.from_bytes(s);
 }

-static std::vector<std::string> unicode_byte_encoding_process(const std::vector<std::string> & bpe_words) {
-    std::vector<std::string> bpe_encoded_words;
-    for (const auto & word : bpe_words) {
-        std::string text_utf;
-        auto utf_word = unicode_cpts_from_utf8(word);
-        for (size_t i = 0; i < utf_word.size(); ++i) {
-            text_utf += unicode_cpt_to_utf8(utf_word[i]);
-        }
-
-        std::string encoded_token;
-        for (char & c : text_utf) {
-            encoded_token += unicode_byte_to_utf8(c);
-        }
-        bpe_encoded_words.emplace_back(encoded_token);
-    }
-    return bpe_encoded_words;
-}

 // GPT2 system regex:  's|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+
 static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & text, const std::vector<size_t> & offsets) {
@@ -956,6 +939,24 @@ bool unicode_cpt_is_han(uint32_t cpt) {
     return false;
 }

+std::vector<std::string> unicode_words_byte_encode(const std::vector<std::string> & bpe_words) {
+    std::vector<std::string> bpe_encoded_words;
+    for (const auto & word : bpe_words) {
+        std::string text_utf;
+        auto utf_word = unicode_cpts_from_utf8(word);
+        for (size_t i = 0; i < utf_word.size(); ++i) {
+            text_utf += unicode_cpt_to_utf8(utf_word[i]);
+        }
+
+        std::string encoded_token;
+        for (char & c : text_utf) {
+            encoded_token += unicode_byte_to_utf8(c);
+        }
+        bpe_encoded_words.emplace_back(encoded_token);
+    }
+    return bpe_encoded_words;
+}
+
 std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs) {
     // unicode categories
     static const std::map<std::string, int> k_ucat_enum = {
@@ -1143,5 +1144,5 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
         start += offset;
     }

-    return unicode_byte_encoding_process(bpe_words);
+    return bpe_words;
 }
diff --git a/src/unicode.h b/src/unicode.h
index 5bd1362ff4..9bf00a8c79 100644
--- a/src/unicode.h
+++ b/src/unicode.h
@@ -108,4 +108,6 @@ uint32_t unicode_tolower(uint32_t cpt);

 bool unicode_cpt_is_han(uint32_t cpt);

+std::vector<std::string> unicode_words_byte_encode(const std::vector<std::string> & bpe_words);
+
 std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs);

From ccd55e4ff7a2b30d7421f319e5a6ed4e1509cc0c Mon Sep 17 00:00:00 2001
From: o7si
Date: Wed, 14 Jan 2026 15:27:24 +0800
Subject: [PATCH 2/4] convert: add normalizer.lowercase metadata support

---
 gguf-py/gguf/constants.py   |  2 ++
 gguf-py/gguf/gguf_writer.py |  3 +++
 gguf-py/gguf/vocab.py       | 27 +++++++++++++++++++++++++++
 3 files changed, 32 insertions(+)

diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index 31273b2b5a..813555e336 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -251,6 +251,8 @@ class Keys:
         CHAT_TEMPLATE        = "tokenizer.chat_template"
         CHAT_TEMPLATE_N      = "tokenizer.chat_template.{name}"
         CHAT_TEMPLATES       = "tokenizer.chat_templates"
+        # Normalizer constants
+        NORMALIZER_LOWERCASE = "tokenizer.ggml.normalizer.lowercase"
         # FIM/Infill special tokens constants
         FIM_PRE_ID           = "tokenizer.ggml.fim_pre_token_id"
         FIM_SUF_ID           = "tokenizer.ggml.fim_suf_token_id"
diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py
index 7fbb78866b..b609c9e696 100644
--- a/gguf-py/gguf/gguf_writer.py
+++ b/gguf-py/gguf/gguf_writer.py
@@ -1072,6 +1072,9 @@ class GGUFWriter:
     def add_eom_token_id(self, id: int) -> None:
         self.add_uint32(Keys.Tokenizer.EOM_ID, id)

+    def add_normalizer_lowercase(self, value: bool) -> None:
+        self.add_bool(Keys.Tokenizer.NORMALIZER_LOWERCASE, value)
+
     def add_classifier_output_labels(self, labels: Sequence[str]) -> None:
         self.add_array(Keys.Classifier.OUTPUT_LABELS.format(arch=self.arch), labels)

diff --git a/gguf-py/gguf/vocab.py b/gguf-py/gguf/vocab.py
index 028e5748e4..36fc5cf023 100644
--- a/gguf-py/gguf/vocab.py
+++ b/gguf-py/gguf/vocab.py
@@ -52,6 +52,7 @@ class SpecialVocab:
     add_special_token: dict[str, bool]
     special_token_ids: dict[str, int]
     chat_template: str | Sequence[Mapping[str, str]] | None
+    normalizer_lowercase: bool

     def __init__(
         self, path: str | os.PathLike[str], load_merges: bool = False,
@@ -64,6 +65,7 @@ class SpecialVocab:
         self.load_merges = load_merges
         self.merges = []
         self.chat_template = None
+        self.normalizer_lowercase = False
         if special_token_types is not None:
             self.special_token_types = special_token_types
         else:
@@ -102,6 +104,10 @@ class SpecialVocab:
             if not quiet:
                 logger.info(f'Setting chat_template to {self.chat_template}')
             gw.add_chat_template(self.chat_template)
+        if self.normalizer_lowercase:
+            if not quiet:
+                logger.info('Setting normalizer_lowercase to True')
+            gw.add_normalizer_lowercase(True)

     def _load(self, path: Path) -> None:
         self._try_load_from_tokenizer_json(path)
@@ -146,6 +152,24 @@ class SpecialVocab:
             return
         logger.warning(f'Special token type {typ}, id {tid} out of range, must be under {self.n_vocab} - skipping')

+    def _parse_normalizer(self, normalizer: dict) -> None:
+        # ref: https://huggingface.co/docs/tokenizers/api/normalizers
+        #
+        # Detects lowercase normalization in three possible formats:
+        #   1. Standalone: {"type": "Lowercase"}
+        #   2. BertNormalizer attribute: {"type": "BertNormalizer", "lowercase": true, ...}
+        #   3. Nested in Sequence: {"type": "Sequence", "normalizers": [...]}
+
+        normalizer_type = normalizer.get('type')
+        if normalizer_type == 'Lowercase':
+            self.normalizer_lowercase = True
+        elif normalizer_type == 'BertNormalizer':
+            if normalizer.get('lowercase', False):
+                self.normalizer_lowercase = True
+        elif normalizer_type == 'Sequence':
+            for norm in normalizer.get('normalizers', []):
+                self._parse_normalizer(norm)
+
     def _try_load_from_tokenizer_json(self, path: Path) -> bool:
         tokenizer = None
         tokenizer_file = path / 'tokenizer.json'
@@ -178,6 +202,9 @@ class SpecialVocab:
                 ]
             else:
                 raise ValueError("Unknown tokenizer merges format")
+            # Parse normalizer configuration
+            if normalizer := tokenizer.get('normalizer'):
+                self._parse_normalizer(normalizer)
             added_tokens = tokenizer.get('added_tokens', {})
         else:
             added_tokens = {}

From 0f6138527b9fc28a61c147f9b20d0bc81631238c Mon Sep 17 00:00:00 2001
From: o7si <32285332+o7si@users.noreply.github.com>
Date: Wed, 14 Jan 2026 17:04:51 +0800
Subject: [PATCH 3/4] Update gguf-py/gguf/vocab.py
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Sigbjørn Skjæret
---
 gguf-py/gguf/vocab.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gguf-py/gguf/vocab.py b/gguf-py/gguf/vocab.py
index 36fc5cf023..b554dab7c5 100644
--- a/gguf-py/gguf/vocab.py
+++ b/gguf-py/gguf/vocab.py
@@ -164,7 +164,7 @@ class SpecialVocab:
         if normalizer_type == 'Lowercase':
             self.normalizer_lowercase = True
         elif normalizer_type == 'BertNormalizer':
-            if normalizer.get('lowercase', False):
+            if normalizer.get('lowercase', True):
                 self.normalizer_lowercase = True
         elif normalizer_type == 'Sequence':
             for norm in normalizer.get('normalizers', []):

From f3bce5298620406d19e001c182b493a8c70afc4e Mon Sep 17 00:00:00 2001
From: o7si
Date: Sat, 17 Jan 2026 01:29:10 +0800
Subject: [PATCH 4/4] wip

---
 convert_hf_to_gguf.py | 22 +++++++++++++++++++++-
 src/llama-arch.cpp    |  1 +
 src/llama-arch.h      |  1 +
 src/llama-vocab.cpp   | 35 ++++++++++++++++++++++++-----------
 4 files changed, 47 insertions(+), 12 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 37c350067a..1c71de1a18 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -1293,6 +1293,16 @@ class TextModel(ModelBase):
         special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
         special_vocab.add_to_gguf(self.gguf_writer)

+    def _set_vocab_whitespace(self) -> None:
+        tokens, toktypes, tokpre = self.get_vocab_base()
+        self.gguf_writer.add_tokenizer_model("whitespace")
+        self.gguf_writer.add_tokenizer_pre(tokpre)
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_types(toktypes)
+
+        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
+        special_vocab.add_to_gguf(self.gguf_writer)
+
     def _set_vocab_qwen(self):
         dir_model = self.dir_model
         hparams = self.hparams
@@ -7135,7 +7145,17 @@ class JinaBertV2Model(BertModel):
         if tokenizer_class == 'BertTokenizer':
             super().set_vocab()
         elif tokenizer_class == 'RobertaTokenizer':
-            self._set_vocab_gpt2()
+            pre_tokenizer_type = None
+            tokenizer_json_path = self.dir_model / "tokenizer.json"
+            if tokenizer_json_path.is_file():
+                with open(tokenizer_json_path, "r", encoding="utf-8") as f:
+                    tokenizer_json = json.load(f)
+                pre_tokenizer_type = tokenizer_json.get("pre_tokenizer", {}).get("type")
+
+            if pre_tokenizer_type == "Whitespace":
+                self._set_vocab_whitespace()
+            else:
+                self._set_vocab_gpt2()
             self.gguf_writer.add_token_type_count(2)
         else:
             raise NotImplementedError(f'Tokenizer {tokenizer_class} is not supported for JinaBertModel')
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index a54bc1956a..05306798a9 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -288,6 +288,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_TOKENIZER_HF_JSON,              "tokenizer.huggingface.json"          },
     { LLM_KV_TOKENIZER_RWKV,                 "tokenizer.rwkv.world"                },
     { LLM_KV_TOKENIZER_CHAT_TEMPLATE,        "tokenizer.chat_template"             },
+    { LLM_KV_TOKENIZER_NORMALIZER_LOWERCASE, "tokenizer.ggml.normalizer.lowercase" },
     { LLM_KV_TOKENIZER_FIM_PRE_ID,           "tokenizer.ggml.fim_pre_token_id"     },
     { LLM_KV_TOKENIZER_FIM_SUF_ID,           "tokenizer.ggml.fim_suf_token_id"     },
     { LLM_KV_TOKENIZER_FIM_MID_ID,           "tokenizer.ggml.fim_mid_token_id"     },
diff --git a/src/llama-arch.h b/src/llama-arch.h
index 270d28b16a..dc46fadc59 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -277,6 +277,7 @@ enum llm_kv {
     LLM_KV_TOKENIZER_HF_JSON,
     LLM_KV_TOKENIZER_RWKV,
     LLM_KV_TOKENIZER_CHAT_TEMPLATE,
+    LLM_KV_TOKENIZER_NORMALIZER_LOWERCASE,
     LLM_KV_TOKENIZER_FIM_PRE_ID,
     LLM_KV_TOKENIZER_FIM_SUF_ID,
     LLM_KV_TOKENIZER_FIM_MID_ID,
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index 61f95bb230..6ddd1b6ffa 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -466,6 +466,8 @@ struct llm_tokenizer_bpe : llm_tokenizer {
                     // original regex from tokenizer.json
                     // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?(?:\\p{L}\\p{M}*(?: \\p{L}\\p{M}*)*)+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]?|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+"
                     "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?(?:\\p{L}\\p{M}*(?: \\p{L}\\p{M}*)*)+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]?|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+",
+                };
+                break;
             case LLAMA_VOCAB_PRE_TYPE_JINA_V2_ZH:
                 // ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-zh
                 // whitespace pre-tokenizer
                 regex_exprs = {
                     "\\S+",
@@ -1617,7 +1619,7 @@ struct llama_vocab::impl {
     bool escape_whitespaces         = true;
     bool treat_whitespace_as_suffix = false;
     bool apply_lowercase            = false; // lowercase normalization
-    bool use_byte_encoding          = true;  // GPT-2 byte encoding for BPE vocab
+    bool use_byte_encoding          = true;

     std::unordered_map<std::string, llama_token> token_to_id;
     std::vector<token_data>                      id_to_token;
@@ -1767,7 +1769,9 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
         special_mask_id = 103;
         add_sep = true;

-    } else if (tokenizer_model == "gpt2") {
+    } else if (
+        tokenizer_model == "gpt2" ||
+        tokenizer_model == "whitespace") {
         type = LLAMA_VOCAB_TYPE_BPE;

         // read bpe merges and populate bpe ranks
@@ -1795,12 +1799,21 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
         }

         // default special tokens
-        special_bos_id  = 11;
-        special_eos_id  = 11;
-        special_unk_id  = LLAMA_TOKEN_NULL;
-        special_sep_id  = LLAMA_TOKEN_NULL;
-        special_pad_id  = LLAMA_TOKEN_NULL;
-        special_mask_id = LLAMA_TOKEN_NULL;
+        if (tokenizer_model == "gpt2") {
+            special_bos_id  = 11;
+            special_eos_id  = 11;
+            special_unk_id  = LLAMA_TOKEN_NULL;
+            special_sep_id  = LLAMA_TOKEN_NULL;
+            special_pad_id  = LLAMA_TOKEN_NULL;
+            special_mask_id = LLAMA_TOKEN_NULL;
+        } else if (tokenizer_model == "whitespace") {
+            special_bos_id  = 0; // <s>
+            special_eos_id  = 2; // </s>
+            special_unk_id  = 3; // <unk>
+            special_sep_id  = 2; // </s> (same as eos)
+            special_pad_id  = 1; // <pad>
+            special_mask_id = 4; // <mask>
+        }

     } else if (tokenizer_model == "t5") {
         type = LLAMA_VOCAB_TYPE_UGM;
@@ -2067,7 +2080,6 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                 clean_spaces = true;
                 add_bos = true;
                 add_sep = true;
-                apply_lowercase = true;
                 use_byte_encoding = false;
             } else {
                 throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
@@ -2099,8 +2111,9 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
             pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
         }

         ml.get_key(LLM_KV_TOKENIZER_ADD_PREFIX,      add_space_prefix,         false);
         ml.get_key(LLM_KV_TOKENIZER_REMOVE_EXTRA_WS, remove_extra_whitespaces, false);
+        ml.get_key(LLM_KV_TOKENIZER_NORMALIZER_LOWERCASE, apply_lowercase, false);
     }

     const int token_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_LIST).c_str());
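
For reference, the lowercase detection that patches 2 and 3 converge on can be
exercised in isolation. The sketch below mirrors the final _parse_normalizer
logic, including patch 3's lowercase=True default for BertNormalizer (HF
tokenizers enables lowercasing by default there); the function name
detect_lowercase is illustrative and not part of the diffs:

    def detect_lowercase(normalizer: dict) -> bool:
        ntype = normalizer.get('type')
        if ntype == 'Lowercase':
            return True
        if ntype == 'BertNormalizer':
            # HF tokenizers defaults BertNormalizer's lowercase option to true
            return normalizer.get('lowercase', True)
        if ntype == 'Sequence':
            # recurse into nested normalizers
            return any(detect_lowercase(n) for n in normalizer.get('normalizers', []))
        return False

    # the three shapes enumerated in _parse_normalizer's docstring:
    assert detect_lowercase({'type': 'Lowercase'})
    assert detect_lowercase({'type': 'BertNormalizer'})
    assert not detect_lowercase({'type': 'BertNormalizer', 'lowercase': False})
    assert detect_lowercase({'type': 'Sequence', 'normalizers': [{'type': 'NFC'}, {'type': 'Lowercase'}]})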
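End to end, the jina-v2-zh path added in patch 1 lowercases the input one
codepoint at a time, splits it with the single whitespace regex "\S+", and
skips the GPT-2 byte-encoding pass (use_byte_encoding = false), so BPE merges
run over raw UTF-8 pieces. A rough Python model of that pre-tokenization step,
for illustration only (str.lower() only approximates the per-codepoint
unicode_tolower()):

    import re

    def pre_tokenize_jina_v2_zh(text: str, apply_lowercase: bool = True) -> list[str]:
        # lowercase normalization, applied before the pre-tokenizer regex
        if apply_lowercase:
            text = ''.join(ch.lower() for ch in text)
        # the "\S+" whitespace pre-tokenizer: runs of non-whitespace become words
        return re.findall(r'\S+', text)

    print(pre_tokenize_jina_v2_zh('Hello 世界  TEST'))  # ['hello', '世界', 'test']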