From 22e85fcf1180e05fc63f7e8c1cf6f1a4b6babd4a Mon Sep 17 00:00:00 2001
From: o7si
Date: Sun, 11 Jan 2026 19:43:24 +0800
Subject: [PATCH 1/4] vocab: add tokenizer support for jina-embeddings-v2-base-zh

---
 convert_hf_to_gguf.py        |  3 +++
 convert_hf_to_gguf_update.py |  1 +
 src/llama-vocab.cpp          | 41 +++++++++++++++++++++++++++++++++++-
 src/llama-vocab.h            |  3 +++
 src/unicode.cpp              | 37 ++++++++++++++++++------------------
 src/unicode.h                |  2 ++
 6 files changed, 68 insertions(+), 19 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index be83e3108e..37c350067a 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -1138,6 +1138,9 @@ class TextModel(ModelBase):
         if chkhsh == "27949a2493fc4a9f53f5b9b029c82689cfbe5d3a1929bb25e043089e28466de6":
             # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-de
             res = "jina-v2-de"
+        if chkhsh == "c7699093ba4255a91e702aa38a596aa81669f3525dae06c2953267dde580f448":
+            # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-zh
+            res = "jina-v2-zh"
         if chkhsh == "c136ed14d01c2745d4f60a9596ae66800e2b61fa45643e72436041855ad4089d":
             # ref: https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct
             res = "smaug-bpe"
diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py
index aa9843ea17..f3f4647e8a 100755
--- a/convert_hf_to_gguf_update.py
+++ b/convert_hf_to_gguf_update.py
@@ -106,6 +106,7 @@ models = [
    {"name": "jina-v2-en",   "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-en", },  # WPM!
    {"name": "jina-v2-es",   "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", },
    {"name": "jina-v2-de",   "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
+   {"name": "jina-v2-zh",   "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-zh", },
    {"name": "smaug-bpe",    "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct", },
    {"name": "poro-chat",    "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Poro-34B-chat", },
    {"name": "jina-v2-code", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-code", },
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index a23950d007..61f95bb230 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -466,6 +466,11 @@ struct llm_tokenizer_bpe : llm_tokenizer {
                     // original regex from tokenizer.json
                     // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?(?:\\p{L}\\p{M}*(?: \\p{L}\\p{M}*)*)+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]?|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+"
                     "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?(?:\\p{L}\\p{M}*(?: \\p{L}\\p{M}*)*)+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]?|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+",
+            case LLAMA_VOCAB_PRE_TYPE_JINA_V2_ZH:
+                // ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-zh
+                // whitespace pre-tokenizer
+                regex_exprs = {
+                    "\\S+",
                 };
                 break;
             default:
@@ -525,7 +530,20 @@ struct llm_tokenizer_bpe_session {

     void tokenize(const std::string & text, std::vector<llama_token> & output) {
         int final_prev_index = -1;
-        const auto word_collection = unicode_regex_split(text, tokenizer.regex_exprs);
+
+        std::string text_normalized;
+        if (vocab.get_apply_lowercase()) {
+            for (uint32_t cpt : unicode_cpts_from_utf8(text)) {
+                text_normalized += unicode_cpt_to_utf8(unicode_tolower(cpt));
+            }
+        } else {
+            text_normalized = text;
+        }
+
+        auto word_collection = unicode_regex_split(text_normalized, tokenizer.regex_exprs);
+        if (vocab.get_use_byte_encoding()) {
+            word_collection = unicode_words_byte_encode(word_collection);
+        }

         symbols_final.clear();

@@ -1598,6 +1616,8 @@ struct llama_vocab::impl {
     bool remove_extra_whitespaces   = false;
     bool escape_whitespaces         = true;
     bool treat_whitespace_as_suffix = false;
+    bool apply_lowercase            = false; // lowercase normalization
+    bool use_byte_encoding          = true;  // GPT-2 byte encoding for BPE vocab

     std::unordered_map<std::string, llama_token> token_to_id;
     std::vector<token_data>                      id_to_token;
@@ -2041,6 +2061,14 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                 tokenizer_pre == "solar-open") {
                 pre_type = LLAMA_VOCAB_PRE_TYPE_SOLAR_OPEN;
                 clean_spaces = false;
+            } else if (
+                tokenizer_pre == "jina-v2-zh") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_JINA_V2_ZH;
+                clean_spaces = true;
+                add_bos = true;
+                add_sep = true;
+                apply_lowercase = true;
+                use_byte_encoding = false;
             } else {
                 throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
             }
@@ -3143,6 +3171,9 @@ int32_t llama_vocab::impl::token_to_piece(llama_token token, char * buf, int32_t
             return _try_copy(token_text.data(), token_text.size());
         }
         if (attr & LLAMA_TOKEN_ATTR_NORMAL) {
+            if (!use_byte_encoding) {
+                return _try_copy(token_text.data(), token_text.size());
+            }
             std::string result = llama_decode_text(token_text);
             return _try_copy(result.data(), result.size());
         }
@@ -3567,6 +3598,14 @@ bool llama_vocab::get_treat_whitespace_as_suffix() const {
     return pimpl->treat_whitespace_as_suffix;
 }

+bool llama_vocab::get_apply_lowercase() const {
+    return pimpl->apply_lowercase;
+}
+
+bool llama_vocab::get_use_byte_encoding() const {
+    return pimpl->use_byte_encoding;
+}
+
 int llama_vocab::max_token_len() const {
     return pimpl->max_token_len;
 }
diff --git a/src/llama-vocab.h b/src/llama-vocab.h
index 28c3a82b91..120188e13e 100644
--- a/src/llama-vocab.h
+++ b/src/llama-vocab.h
@@ -54,6 +54,7 @@ enum llama_vocab_pre_type {
     LLAMA_VOCAB_PRE_TYPE_SOLAR_OPEN = 43,
     LLAMA_VOCAB_PRE_TYPE_YOUTU      = 44,
     LLAMA_VOCAB_PRE_TYPE_EXAONE_MOE = 45,
+    LLAMA_VOCAB_PRE_TYPE_JINA_V2_ZH = 46,
 };

 struct LLM_KV;
@@ -131,6 +132,8 @@ struct llama_vocab {
     bool get_remove_extra_whitespaces  () const;
     bool get_escape_whitespaces        () const;
     bool get_treat_whitespace_as_suffix() const;
+    bool get_apply_lowercase           () const;
+    bool get_use_byte_encoding         () const;

     int max_token_len() const;
diff --git a/src/unicode.cpp b/src/unicode.cpp
index b47dcbe619..a2f3a1f12c 100644
--- a/src/unicode.cpp
+++ b/src/unicode.cpp
@@ -220,23 +220,6 @@ static inline std::wstring unicode_wstring_from_utf8(const std::string & s) {
     return conv.from_bytes(s);
 }

-static std::vector<std::string> unicode_byte_encoding_process(const std::vector<std::string> & bpe_words) {
-    std::vector<std::string> bpe_encoded_words;
-    for (const auto & word : bpe_words) {
-        std::string text_utf;
-        auto utf_word = unicode_cpts_from_utf8(word);
-        for (size_t i = 0; i < utf_word.size(); ++i) {
-            text_utf += unicode_cpt_to_utf8(utf_word[i]);
-        }
-
-        std::string encoded_token;
-        for (char & c : text_utf) {
-            encoded_token += unicode_byte_to_utf8(c);
-        }
-        bpe_encoded_words.emplace_back(encoded_token);
-    }
-    return bpe_encoded_words;
-}

 // GPT2 system regex:  's|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+
 static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & text, const std::vector<size_t> & offsets) {
@@ -956,6 +939,24 @@ bool unicode_cpt_is_han(uint32_t cpt) {
     return false;
 }

+std::vector<std::string> unicode_words_byte_encode(const std::vector<std::string> & bpe_words) {
+    std::vector<std::string> bpe_encoded_words;
+    for (const auto & word : bpe_words) {
+        std::string text_utf;
+        auto utf_word = unicode_cpts_from_utf8(word);
+        for (size_t i = 0; i < utf_word.size(); ++i) {
+            text_utf += unicode_cpt_to_utf8(utf_word[i]);
+        }
+
+        std::string encoded_token;
+        for (char & c : text_utf) {
+            encoded_token += unicode_byte_to_utf8(c);
+        }
+        bpe_encoded_words.emplace_back(encoded_token);
+    }
+    return bpe_encoded_words;
+}
+
 std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs) {
     // unicode categories
     static const std::map<std::string, int> k_ucat_enum = {
@@ -1143,5 +1144,5 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
         start += offset;
     }

-    return unicode_byte_encoding_process(bpe_words);
+    return bpe_words;
 }
diff --git a/src/unicode.h b/src/unicode.h
index 5bd1362ff4..9bf00a8c79 100644
--- a/src/unicode.h
+++ b/src/unicode.h
@@ -108,4 +108,6 @@ uint32_t unicode_tolower(uint32_t cpt);

 bool unicode_cpt_is_han(uint32_t cpt);

+std::vector<std::string> unicode_words_byte_encode(const std::vector<std::string> & bpe_words);
+
 std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs);

From ccd55e4ff7a2b30d7421f319e5a6ed4e1509cc0c Mon Sep 17 00:00:00 2001
From: o7si
Date: Wed, 14 Jan 2026 15:27:24 +0800
Subject: [PATCH 2/4] convert: add normalizer.lowercase metadata support

---
 gguf-py/gguf/constants.py   |  2 ++
 gguf-py/gguf/gguf_writer.py |  3 +++
 gguf-py/gguf/vocab.py       | 27 +++++++++++++++++++++++++++
 3 files changed, 32 insertions(+)

diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index 31273b2b5a..813555e336 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -251,6 +251,8 @@ class Keys:
         CHAT_TEMPLATE        = "tokenizer.chat_template"
         CHAT_TEMPLATE_N      = "tokenizer.chat_template.{name}"
         CHAT_TEMPLATES       = "tokenizer.chat_templates"
+        # Normalizer constants
+        NORMALIZER_LOWERCASE = "tokenizer.ggml.normalizer.lowercase"
         # FIM/Infill special tokens constants
         FIM_PRE_ID           = "tokenizer.ggml.fim_pre_token_id"
         FIM_SUF_ID           = "tokenizer.ggml.fim_suf_token_id"
diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py
index 7fbb78866b..b609c9e696 100644
--- a/gguf-py/gguf/gguf_writer.py
+++ b/gguf-py/gguf/gguf_writer.py
@@ -1072,6 +1072,9 @@ class GGUFWriter:
     def add_eom_token_id(self, id: int) -> None:
         self.add_uint32(Keys.Tokenizer.EOM_ID, id)

+    def add_normalizer_lowercase(self, value: bool) -> None:
+        self.add_bool(Keys.Tokenizer.NORMALIZER_LOWERCASE, value)
+
     def add_classifier_output_labels(self, labels: Sequence[str]) -> None:
         self.add_array(Keys.Classifier.OUTPUT_LABELS.format(arch=self.arch), labels)

diff --git a/gguf-py/gguf/vocab.py b/gguf-py/gguf/vocab.py
index 028e5748e4..36fc5cf023 100644
--- a/gguf-py/gguf/vocab.py
+++ b/gguf-py/gguf/vocab.py
@@ -52,6 +52,7 @@ class SpecialVocab:
     add_special_token: dict[str, bool]
     special_token_ids: dict[str, int]
     chat_template: str | Sequence[Mapping[str, str]] | None
+    normalizer_lowercase: bool

     def __init__(
         self, path: str | os.PathLike[str], load_merges: bool = False,
@@ -64,6 +65,7 @@ class SpecialVocab:
         self.load_merges = load_merges
         self.merges = []
         self.chat_template = None
+        self.normalizer_lowercase = False
         if special_token_types is not None:
             self.special_token_types = special_token_types
         else:
@@ -102,6 +104,10 @@ class SpecialVocab:
             if not quiet:
                 logger.info(f'Setting chat_template to {self.chat_template}')
             gw.add_chat_template(self.chat_template)
+        if self.normalizer_lowercase:
+            if not quiet:
+                logger.info('Setting normalizer_lowercase to True')
+            gw.add_normalizer_lowercase(True)

     def _load(self, path: Path) -> None:
         self._try_load_from_tokenizer_json(path)
@@ -146,6 +152,24 @@ class SpecialVocab:
             return
         logger.warning(f'Special token type {typ}, id {tid} out of range, must be under {self.n_vocab} - skipping')

+    def _parse_normalizer(self, normalizer: dict) -> None:
+        # ref: https://huggingface.co/docs/tokenizers/api/normalizers
+        #
+        # Detects lowercase normalization in three possible formats:
+        #   1. Standalone: {"type": "Lowercase"}
+        #   2. BertNormalizer attribute: {"type": "BertNormalizer", "lowercase": true, ...}
+        #   3. Nested in Sequence: {"type": "Sequence", "normalizers": [...]}
+
+        normalizer_type = normalizer.get('type')
+        if normalizer_type == 'Lowercase':
+            self.normalizer_lowercase = True
+        elif normalizer_type == 'BertNormalizer':
+            if normalizer.get('lowercase', False):
+                self.normalizer_lowercase = True
+        elif normalizer_type == 'Sequence':
+            for norm in normalizer.get('normalizers', []):
+                self._parse_normalizer(norm)
+
     def _try_load_from_tokenizer_json(self, path: Path) -> bool:
         tokenizer = None
         tokenizer_file = path / 'tokenizer.json'
@@ -178,6 +202,9 @@ class SpecialVocab:
                 ]
             else:
                 raise ValueError("Unknown tokenizer merges format")
+            # Parse normalizer configuration
+            if normalizer := tokenizer.get('normalizer'):
+                self._parse_normalizer(normalizer)
             added_tokens = tokenizer.get('added_tokens', {})
         else:
             added_tokens = {}

From 0f6138527b9fc28a61c147f9b20d0bc81631238c Mon Sep 17 00:00:00 2001
From: o7si <32285332+o7si@users.noreply.github.com>
Date: Wed, 14 Jan 2026 17:04:51 +0800
Subject: [PATCH 3/4] Update gguf-py/gguf/vocab.py
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Sigbjørn Skjæret
---
 gguf-py/gguf/vocab.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gguf-py/gguf/vocab.py b/gguf-py/gguf/vocab.py
index 36fc5cf023..b554dab7c5 100644
--- a/gguf-py/gguf/vocab.py
+++ b/gguf-py/gguf/vocab.py
@@ -164,7 +164,7 @@ class SpecialVocab:
         if normalizer_type == 'Lowercase':
             self.normalizer_lowercase = True
         elif normalizer_type == 'BertNormalizer':
-            if normalizer.get('lowercase', False):
+            if normalizer.get('lowercase', True):
                 self.normalizer_lowercase = True
         elif normalizer_type == 'Sequence':
             for norm in normalizer.get('normalizers', []):

From f3bce5298620406d19e001c182b493a8c70afc4e Mon Sep 17 00:00:00 2001
From: o7si
Date: Sat, 17 Jan 2026 01:29:10 +0800
Subject: [PATCH 4/4] wip

---
 convert_hf_to_gguf.py | 22 +++++++++++++++++++++-
 src/llama-arch.cpp    |  1 +
 src/llama-arch.h      |  1 +
 src/llama-vocab.cpp   | 35 ++++++++++++++++++++++++-----------
 4 files changed, 47 insertions(+), 12 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 37c350067a..1c71de1a18 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -1293,6 +1293,16 @@ class TextModel(ModelBase):
         special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
         special_vocab.add_to_gguf(self.gguf_writer)

+    def _set_vocab_whitespace(self) -> None:
+        tokens, toktypes, tokpre = self.get_vocab_base()
+        self.gguf_writer.add_tokenizer_model("whitespace")
+        self.gguf_writer.add_tokenizer_pre(tokpre)
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_types(toktypes)
+
+        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
+        special_vocab.add_to_gguf(self.gguf_writer)
+
     def _set_vocab_qwen(self):
         dir_model = self.dir_model
         hparams = self.hparams
@@ -7135,7 +7145,17 @@ class JinaBertV2Model(BertModel):
         if tokenizer_class == 'BertTokenizer':
             super().set_vocab()
         elif tokenizer_class == 'RobertaTokenizer':
-            self._set_vocab_gpt2()
+            pre_tokenizer_type = None
+            tokenizer_json_path = self.dir_model / "tokenizer.json"
+            if tokenizer_json_path.is_file():
+                with open(tokenizer_json_path, "r", encoding="utf-8") as f:
+                    tokenizer_json = json.load(f)
+                pre_tokenizer_type = tokenizer_json.get("pre_tokenizer", {}).get("type")
+
+            if pre_tokenizer_type == "Whitespace":
+                self._set_vocab_whitespace()
+            else:
+                self._set_vocab_gpt2()
             self.gguf_writer.add_token_type_count(2)
         else:
             raise NotImplementedError(f'Tokenizer {tokenizer_class} is not supported for JinaBertModel')
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index a54bc1956a..05306798a9 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -288,6 +288,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_TOKENIZER_HF_JSON,              "tokenizer.huggingface.json"          },
     { LLM_KV_TOKENIZER_RWKV,                 "tokenizer.rwkv.world"                },
     { LLM_KV_TOKENIZER_CHAT_TEMPLATE,        "tokenizer.chat_template"             },
+    { LLM_KV_TOKENIZER_NORMALIZER_LOWERCASE, "tokenizer.ggml.normalizer.lowercase" },
     { LLM_KV_TOKENIZER_FIM_PRE_ID,           "tokenizer.ggml.fim_pre_token_id"     },
     { LLM_KV_TOKENIZER_FIM_SUF_ID,           "tokenizer.ggml.fim_suf_token_id"     },
     { LLM_KV_TOKENIZER_FIM_MID_ID,           "tokenizer.ggml.fim_mid_token_id"     },
diff --git a/src/llama-arch.h b/src/llama-arch.h
index 270d28b16a..dc46fadc59 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -277,6 +277,7 @@ enum llm_kv {
     LLM_KV_TOKENIZER_HF_JSON,
     LLM_KV_TOKENIZER_RWKV,
     LLM_KV_TOKENIZER_CHAT_TEMPLATE,
+    LLM_KV_TOKENIZER_NORMALIZER_LOWERCASE,
     LLM_KV_TOKENIZER_FIM_PRE_ID,
     LLM_KV_TOKENIZER_FIM_SUF_ID,
     LLM_KV_TOKENIZER_FIM_MID_ID,
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index 61f95bb230..6ddd1b6ffa 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -466,6 +466,8 @@ struct llm_tokenizer_bpe : llm_tokenizer {
                     // original regex from tokenizer.json
                     // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?(?:\\p{L}\\p{M}*(?: \\p{L}\\p{M}*)*)+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]?|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+"
                     "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?(?:\\p{L}\\p{M}*(?: \\p{L}\\p{M}*)*)+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]?|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+",
+                };
+                break;
             case LLAMA_VOCAB_PRE_TYPE_JINA_V2_ZH:
                 // ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-zh
                 // whitespace pre-tokenizer
                 regex_exprs = {
                     "\\S+",
@@ -1617,7 +1619,7 @@ struct llama_vocab::impl {
     bool escape_whitespaces         = true;
     bool treat_whitespace_as_suffix = false;
     bool apply_lowercase            = false; // lowercase normalization
-    bool use_byte_encoding          = true;  // GPT-2 byte encoding for BPE vocab
+    bool use_byte_encoding          = true;

     std::unordered_map<std::string, llama_token> token_to_id;
     std::vector<token_data>                      id_to_token;
@@ -1767,7 +1769,9 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
         special_mask_id = 103;
         add_sep = true;

-    } else if (tokenizer_model == "gpt2") {
+    } else if (
+        tokenizer_model == "gpt2" ||
+        tokenizer_model == "whitespace") {
         type = LLAMA_VOCAB_TYPE_BPE;

         // read bpe merges and populate bpe ranks
@@ -1795,12 +1799,21 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
         }

         // default special tokens
-        special_bos_id  = 11;
-        special_eos_id  = 11;
-        special_unk_id  = LLAMA_TOKEN_NULL;
-        special_sep_id  = LLAMA_TOKEN_NULL;
-        special_pad_id  = LLAMA_TOKEN_NULL;
-        special_mask_id = LLAMA_TOKEN_NULL;
+        if (tokenizer_model == "gpt2") {
+            special_bos_id  = 11;
+            special_eos_id  = 11;
+            special_unk_id  = LLAMA_TOKEN_NULL;
+            special_sep_id  = LLAMA_TOKEN_NULL;
+            special_pad_id  = LLAMA_TOKEN_NULL;
+            special_mask_id = LLAMA_TOKEN_NULL;
+        } else if (tokenizer_model == "whitespace") {
+            special_bos_id  = 0; // <s>
+            special_eos_id  = 2; // </s>
+            special_unk_id  = 3; // <unk>
+            special_sep_id  = 2; // </s> (same as eos)
+            special_pad_id  = 1; // <pad>
+            special_mask_id = 4; // <mask>
+        }

     } else if (tokenizer_model == "t5") {
         type = LLAMA_VOCAB_TYPE_UGM;
@@ -2067,7 +2080,6 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                 clean_spaces = true;
                 add_bos = true;
                 add_sep = true;
-                apply_lowercase = true;
                 use_byte_encoding = false;
             } else {
                 throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
@@ -2099,8 +2111,9 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
             pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
         }

         ml.get_key(LLM_KV_TOKENIZER_ADD_PREFIX,      add_space_prefix,         false);
         ml.get_key(LLM_KV_TOKENIZER_REMOVE_EXTRA_WS, remove_extra_whitespaces, false);
+        ml.get_key(LLM_KV_TOKENIZER_NORMALIZER_LOWERCASE, apply_lowercase, false);
     }

     const int token_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_LIST).c_str());
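
For reference, the lowercase detection that patches 2 and 3 converge on can be
exercised in isolation. The sketch below mirrors the final _parse_normalizer
logic, including patch 3's lowercase=True default for BertNormalizer (HF
tokenizers enables lowercasing by default there); the function name
detect_lowercase is illustrative and not part of the diffs:

    def detect_lowercase(normalizer: dict) -> bool:
        ntype = normalizer.get('type')
        if ntype == 'Lowercase':
            return True
        if ntype == 'BertNormalizer':
            # HF tokenizers defaults BertNormalizer's lowercase option to true
            return normalizer.get('lowercase', True)
        if ntype == 'Sequence':
            # recurse into nested normalizers
            return any(detect_lowercase(n) for n in normalizer.get('normalizers', []))
        return False

    # the three shapes enumerated in _parse_normalizer's docstring:
    assert detect_lowercase({'type': 'Lowercase'})
    assert detect_lowercase({'type': 'BertNormalizer'})
    assert not detect_lowercase({'type': 'BertNormalizer', 'lowercase': False})
    assert detect_lowercase({'type': 'Sequence', 'normalizers': [{'type': 'NFC'}, {'type': 'Lowercase'}]})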
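End to end, the jina-v2-zh path added in patch 1 lowercases the input one
codepoint at a time, splits it with the single whitespace regex "\S+", and
skips the GPT-2 byte-encoding pass (use_byte_encoding = false), so BPE merges
run over raw UTF-8 pieces. A rough Python model of that pre-tokenization step,
for illustration only (str.lower() only approximates the per-codepoint
unicode_tolower()):

    import re

    def pre_tokenize_jina_v2_zh(text: str, apply_lowercase: bool = True) -> list[str]:
        # lowercase normalization, applied before the pre-tokenizer regex
        if apply_lowercase:
            text = ''.join(ch.lower() for ch in text)
        # the "\S+" whitespace pre-tokenizer: runs of non-whitespace become words
        return re.findall(r'\S+', text)

    print(pre_tokenize_jina_v2_zh('Hello 世界  TEST'))  # ['hello', '世界', 'test']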