From f3bce5298620406d19e001c182b493a8c70afc4e Mon Sep 17 00:00:00 2001
From: o7si
Date: Sat, 17 Jan 2026 01:29:10 +0800
Subject: [PATCH] wip

---
 convert_hf_to_gguf.py | 22 +++++++++++++++++++++-
 src/llama-arch.cpp    |  1 +
 src/llama-arch.h      |  1 +
 src/llama-vocab.cpp   | 35 ++++++++++++++++++++++++-----------
 4 files changed, 47 insertions(+), 12 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 37c350067a..1c71de1a18 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -1293,6 +1293,16 @@ class TextModel(ModelBase):
         special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
         special_vocab.add_to_gguf(self.gguf_writer)
 
+    def _set_vocab_whitespace(self) -> None:
+        tokens, toktypes, tokpre = self.get_vocab_base()
+        self.gguf_writer.add_tokenizer_model("whitespace")
+        self.gguf_writer.add_tokenizer_pre(tokpre)
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_types(toktypes)
+
+        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
+        special_vocab.add_to_gguf(self.gguf_writer)
+
     def _set_vocab_qwen(self):
         dir_model = self.dir_model
         hparams = self.hparams
@@ -7135,7 +7145,17 @@ class JinaBertV2Model(BertModel):
         if tokenizer_class == 'BertTokenizer':
             super().set_vocab()
         elif tokenizer_class == 'RobertaTokenizer':
-            self._set_vocab_gpt2()
+            pre_tokenizer_type = None
+            tokenizer_json_path = self.dir_model / "tokenizer.json"
+            if tokenizer_json_path.is_file():
+                with open(tokenizer_json_path, "r", encoding="utf-8") as f:
+                    tokenizer_json = json.load(f)
+                pre_tokenizer_type = (tokenizer_json.get("pre_tokenizer") or {}).get("type")
+
+            if pre_tokenizer_type == "Whitespace":
+                self._set_vocab_whitespace()
+            else:
+                self._set_vocab_gpt2()
             self.gguf_writer.add_token_type_count(2)
         else:
             raise NotImplementedError(f'Tokenizer {tokenizer_class} is not supported for JinaBertModel')
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index a54bc1956a..05306798a9 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -288,6 +288,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_TOKENIZER_HF_JSON,               "tokenizer.huggingface.json"          },
     { LLM_KV_TOKENIZER_RWKV,                  "tokenizer.rwkv.world"                },
     { LLM_KV_TOKENIZER_CHAT_TEMPLATE,         "tokenizer.chat_template"             },
+    { LLM_KV_TOKENIZER_NORMALIZER_LOWERCASE,  "tokenizer.ggml.normalizer.lowercase" },
     { LLM_KV_TOKENIZER_FIM_PRE_ID,            "tokenizer.ggml.fim_pre_token_id"     },
     { LLM_KV_TOKENIZER_FIM_SUF_ID,            "tokenizer.ggml.fim_suf_token_id"     },
     { LLM_KV_TOKENIZER_FIM_MID_ID,            "tokenizer.ggml.fim_mid_token_id"     },
diff --git a/src/llama-arch.h b/src/llama-arch.h
index 270d28b16a..dc46fadc59 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -277,6 +277,7 @@ enum llm_kv {
     LLM_KV_TOKENIZER_HF_JSON,
     LLM_KV_TOKENIZER_RWKV,
     LLM_KV_TOKENIZER_CHAT_TEMPLATE,
+    LLM_KV_TOKENIZER_NORMALIZER_LOWERCASE,
     LLM_KV_TOKENIZER_FIM_PRE_ID,
     LLM_KV_TOKENIZER_FIM_SUF_ID,
     LLM_KV_TOKENIZER_FIM_MID_ID,
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index 61f95bb230..6ddd1b6ffa 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -466,6 +466,8 @@ struct llm_tokenizer_bpe : llm_tokenizer {
                     // original regex from tokenizer.json
                     // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?(?:\\p{L}\\p{M}*(?: \\p{L}\\p{M}*)*)+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]?|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+"
                     "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?(?:\\p{L}\\p{M}*(?: \\p{L}\\p{M}*)*)+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]?|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+",
+                };
+                break;
             case LLAMA_VOCAB_PRE_TYPE_JINA_V2_ZH:
                 // ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-zh
                 // whitespace pre-tokenizer
@@ -1617,7 +1619,7 @@ struct llama_vocab::impl {
     bool escape_whitespaces         = true;
     bool treat_whitespace_as_suffix = false;
     bool apply_lowercase            = false; // lowercase normalization
-    bool use_byte_encoding          = true;  // GPT-2 byte encoding for BPE vocab
+    bool use_byte_encoding          = true;
 
     std::unordered_map<std::string, llama_token> token_to_id;
     std::vector<token_data>                      id_to_token;
@@ -1767,7 +1769,9 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
            special_mask_id = 103;
 
            add_sep = true;
-        } else if (tokenizer_model == "gpt2") {
+        } else if (
+            tokenizer_model == "gpt2" ||
+            tokenizer_model == "whitespace") {
            type = LLAMA_VOCAB_TYPE_BPE;
 
            // read bpe merges and populate bpe ranks
@@ -1795,12 +1799,21 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
            }
 
            // default special tokens
-           special_bos_id  = 11;
-           special_eos_id  = 11;
-           special_unk_id  = LLAMA_TOKEN_NULL;
-           special_sep_id  = LLAMA_TOKEN_NULL;
-           special_pad_id  = LLAMA_TOKEN_NULL;
-           special_mask_id = LLAMA_TOKEN_NULL;
+           if (tokenizer_model == "gpt2") {
+               special_bos_id  = 11;
+               special_eos_id  = 11;
+               special_unk_id  = LLAMA_TOKEN_NULL;
+               special_sep_id  = LLAMA_TOKEN_NULL;
+               special_pad_id  = LLAMA_TOKEN_NULL;
+               special_mask_id = LLAMA_TOKEN_NULL;
+           } else if (tokenizer_model == "whitespace") {
+               special_bos_id  = 0; // <s>
+               special_eos_id  = 2; // </s>
+               special_unk_id  = 3; // <unk>
+               special_sep_id  = 2; // </s> (same as eos)
+               special_pad_id  = 1; // <pad>
+               special_mask_id = 4; // <mask>
+           }
        } else if (tokenizer_model == "t5") {
            type = LLAMA_VOCAB_TYPE_UGM;
 
@@ -2067,7 +2080,6 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                clean_spaces      = true;
                add_bos           = true;
                add_sep           = true;
-               apply_lowercase   = true;
                use_byte_encoding = false;
            } else {
                throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
@@ -2099,8 +2111,9 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
            pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
        }
 
-       ml.get_key(LLM_KV_TOKENIZER_ADD_PREFIX,      add_space_prefix,         false);
-       ml.get_key(LLM_KV_TOKENIZER_REMOVE_EXTRA_WS, remove_extra_whitespaces, false);
+       ml.get_key(LLM_KV_TOKENIZER_ADD_PREFIX,           add_space_prefix,         false);
+       ml.get_key(LLM_KV_TOKENIZER_REMOVE_EXTRA_WS,      remove_extra_whitespaces, false);
+       ml.get_key(LLM_KV_TOKENIZER_NORMALIZER_LOWERCASE, apply_lowercase,          false);
    }
 
    const int token_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_LIST).c_str());
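
Note on the converter change: JinaBertV2Model.set_vocab now probes
tokenizer.json for the pre_tokenizer type and only falls back to the GPT-2
vocab path when it is not "Whitespace". The same probe, as a standalone
sketch (the model directory below is a placeholder):

    import json
    from pathlib import Path

    def detect_pre_tokenizer(dir_model: Path) -> str | None:
        """Return the HF pre_tokenizer type, e.g. "Whitespace", or None."""
        tokenizer_json_path = dir_model / "tokenizer.json"
        if not tokenizer_json_path.is_file():
            return None
        with open(tokenizer_json_path, "r", encoding="utf-8") as f:
            tokenizer_json = json.load(f)
        # "pre_tokenizer" can be present but null, so guard before .get("type")
        return (tokenizer_json.get("pre_tokenizer") or {}).get("type")

    # Expected for jina-embeddings-v2-base-zh:
    # detect_pre_tokenizer(Path("jina-embeddings-v2-base-zh")) == "Whitespace"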
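
Note on the new pre-tokenizer case: Hugging Face's "Whitespace"
pre-tokenizer splits on word boundaries using the pattern \w+|[^\w\s]+
(runs of word characters, or runs of punctuation), not on whitespace alone.
The regex llama.cpp ends up using for LLAMA_VOCAB_PRE_TYPE_JINA_V2_ZH is
outside the hunk shown above; this sketch only illustrates the HF behavior:

    import re

    # HF tokenizers' Whitespace pre-tokenizer: word runs or punctuation runs.
    PATTERN = re.compile(r"\w+|[^\w\s]+")

    def whitespace_pretokenize(text: str) -> list[str]:
        return PATTERN.findall(text)

    print(whitespace_pretokenize("Hello, 世界!"))  # ['Hello', ',', '世界', '!']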
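
Note on tokenizer.ggml.normalizer.lowercase: this patch only adds the
reader side (the key is optional at load time, so apply_lowercase keeps its
default when absent); convert_hf_to_gguf.py does not emit the key yet. A
hypothetical writer-side counterpart via gguf-py's generic key/value API
(output path and arch string are placeholders):

    import gguf

    # Hypothetical: emit the new key so llama-vocab.cpp enables lowercase
    # normalization from metadata instead of hard-coding it per pre-type.
    writer = gguf.GGUFWriter("model.gguf", arch="jina-bert-v2")
    writer.add_bool("tokenizer.ggml.normalizer.lowercase", True)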