From f3bce5298620406d19e001c182b493a8c70afc4e Mon Sep 17 00:00:00 2001
From: o7si
Date: Sat, 17 Jan 2026 01:29:10 +0800
Subject: [PATCH] wip

---
 convert_hf_to_gguf.py | 22 +++++++++++++++++++++-
 src/llama-arch.cpp    |  1 +
 src/llama-arch.h      |  1 +
 src/llama-vocab.cpp   | 35 ++++++++++++++++++++++++-----------
 4 files changed, 47 insertions(+), 12 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 37c350067a..1c71de1a18 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -1293,6 +1293,16 @@ class TextModel(ModelBase):
         special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
         special_vocab.add_to_gguf(self.gguf_writer)
 
+    def _set_vocab_whitespace(self) -> None:
+        tokens, toktypes, tokpre = self.get_vocab_base()
+        self.gguf_writer.add_tokenizer_model("whitespace")
+        self.gguf_writer.add_tokenizer_pre(tokpre)
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_types(toktypes)
+
+        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
+        special_vocab.add_to_gguf(self.gguf_writer)
+
     def _set_vocab_qwen(self):
         dir_model = self.dir_model
         hparams = self.hparams
@@ -7135,7 +7145,17 @@ class JinaBertV2Model(BertModel):
         if tokenizer_class == 'BertTokenizer':
             super().set_vocab()
         elif tokenizer_class == 'RobertaTokenizer':
-            self._set_vocab_gpt2()
+            pre_tokenizer_type = None
+            tokenizer_json_path = self.dir_model / "tokenizer.json"
+            if tokenizer_json_path.is_file():
+                with open(tokenizer_json_path, "r", encoding="utf-8") as f:
+                    tokenizer_json = json.load(f)
+                pre_tokenizer_type = (tokenizer_json.get("pre_tokenizer") or {}).get("type")
+
+            if pre_tokenizer_type == "Whitespace":
+                self._set_vocab_whitespace()
+            else:
+                self._set_vocab_gpt2()
             self.gguf_writer.add_token_type_count(2)
         else:
             raise NotImplementedError(f'Tokenizer {tokenizer_class} is not supported for JinaBertModel')
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index a54bc1956a..05306798a9 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -288,6 +288,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_TOKENIZER_HF_JSON,               "tokenizer.huggingface.json"          },
     { LLM_KV_TOKENIZER_RWKV,                  "tokenizer.rwkv.world"                },
     { LLM_KV_TOKENIZER_CHAT_TEMPLATE,         "tokenizer.chat_template"             },
+    { LLM_KV_TOKENIZER_NORMALIZER_LOWERCASE,  "tokenizer.ggml.normalizer.lowercase" },
     { LLM_KV_TOKENIZER_FIM_PRE_ID,            "tokenizer.ggml.fim_pre_token_id"     },
     { LLM_KV_TOKENIZER_FIM_SUF_ID,            "tokenizer.ggml.fim_suf_token_id"     },
     { LLM_KV_TOKENIZER_FIM_MID_ID,            "tokenizer.ggml.fim_mid_token_id"     },
diff --git a/src/llama-arch.h b/src/llama-arch.h
index 270d28b16a..dc46fadc59 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -277,6 +277,7 @@ enum llm_kv {
     LLM_KV_TOKENIZER_HF_JSON,
     LLM_KV_TOKENIZER_RWKV,
     LLM_KV_TOKENIZER_CHAT_TEMPLATE,
+    LLM_KV_TOKENIZER_NORMALIZER_LOWERCASE,
     LLM_KV_TOKENIZER_FIM_PRE_ID,
     LLM_KV_TOKENIZER_FIM_SUF_ID,
     LLM_KV_TOKENIZER_FIM_MID_ID,
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index 61f95bb230..6ddd1b6ffa 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -466,6 +466,8 @@ struct llm_tokenizer_bpe : llm_tokenizer {
                     // original regex from tokenizer.json
                     // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?(?:\\p{L}\\p{M}*(?: \\p{L}\\p{M}*)*)+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]?|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+"
                     "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?(?:\\p{L}\\p{M}*(?: \\p{L}\\p{M}*)*)+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]?|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+",
+                };
+                break;
             case LLAMA_VOCAB_PRE_TYPE_JINA_V2_ZH:
                 // ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-zh
                 // whitespace pre-tokenizer
@@ -1617,7 +1619,7 @@ struct llama_vocab::impl {
     bool escape_whitespaces         = true;
     bool treat_whitespace_as_suffix = false;
     bool apply_lowercase            = false; // lowercase normalization
-    bool use_byte_encoding          = true;  // GPT-2 byte encoding for BPE vocab
+    bool use_byte_encoding          = true;
 
     std::unordered_map<std::string, llama_token> token_to_id;
     std::vector<token_data>                      id_to_token;
@@ -1767,7 +1769,9 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
            special_mask_id = 103;
 
            add_sep = true;
-        } else if (tokenizer_model == "gpt2") {
+        } else if (
+            tokenizer_model == "gpt2" ||
+            tokenizer_model == "whitespace") {
            type = LLAMA_VOCAB_TYPE_BPE;
 
            // read bpe merges and populate bpe ranks
@@ -1795,12 +1799,21 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
            }
 
            // default special tokens
-           special_bos_id  = 11;
-           special_eos_id  = 11;
-           special_unk_id  = LLAMA_TOKEN_NULL;
-           special_sep_id  = LLAMA_TOKEN_NULL;
-           special_pad_id  = LLAMA_TOKEN_NULL;
-           special_mask_id = LLAMA_TOKEN_NULL;
+           if (tokenizer_model == "gpt2") {
+               special_bos_id  = 11;
+               special_eos_id  = 11;
+               special_unk_id  = LLAMA_TOKEN_NULL;
+               special_sep_id  = LLAMA_TOKEN_NULL;
+               special_pad_id  = LLAMA_TOKEN_NULL;
+               special_mask_id = LLAMA_TOKEN_NULL;
+           } else if (tokenizer_model == "whitespace") {
+               special_bos_id  = 0; // <s>
+               special_eos_id  = 2; // </s>
+               special_unk_id  = 3; // <unk>
+               special_sep_id  = 2; // </s> (same as eos)
+               special_pad_id  = 1; // <pad>
+               special_mask_id = 4; // <mask>
+           }
        } else if (tokenizer_model == "t5") {
            type = LLAMA_VOCAB_TYPE_UGM;
 
@@ -2067,7 +2080,6 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                clean_spaces      = true;
                add_bos           = true;
                add_sep           = true;
-               apply_lowercase   = true;
                use_byte_encoding = false;
            } else {
                throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
@@ -2099,8 +2111,9 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
            pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
        }
 
-       ml.get_key(LLM_KV_TOKENIZER_ADD_PREFIX,      add_space_prefix,         false);
-       ml.get_key(LLM_KV_TOKENIZER_REMOVE_EXTRA_WS, remove_extra_whitespaces, false);
+       ml.get_key(LLM_KV_TOKENIZER_ADD_PREFIX,           add_space_prefix,         false);
+       ml.get_key(LLM_KV_TOKENIZER_REMOVE_EXTRA_WS,      remove_extra_whitespaces, false);
+       ml.get_key(LLM_KV_TOKENIZER_NORMALIZER_LOWERCASE, apply_lowercase,          false);
    }
 
    const int token_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_LIST).c_str());
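
Note on the converter change: JinaBertV2Model.set_vocab now probes
tokenizer.json for the pre_tokenizer type and only falls back to the GPT-2
vocab path when it is not "Whitespace". The same probe, as a standalone
sketch (the model directory below is a placeholder):

    import json
    from pathlib import Path

    def detect_pre_tokenizer(dir_model: Path) -> str | None:
        """Return the HF pre_tokenizer type, e.g. "Whitespace", or None."""
        tokenizer_json_path = dir_model / "tokenizer.json"
        if not tokenizer_json_path.is_file():
            return None
        with open(tokenizer_json_path, "r", encoding="utf-8") as f:
            tokenizer_json = json.load(f)
        # "pre_tokenizer" can be present but null, so guard before .get("type")
        return (tokenizer_json.get("pre_tokenizer") or {}).get("type")

    # Expected for jina-embeddings-v2-base-zh:
    # detect_pre_tokenizer(Path("jina-embeddings-v2-base-zh")) == "Whitespace"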
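
Note on the new pre-tokenizer case: Hugging Face's "Whitespace"
pre-tokenizer splits on word boundaries using the pattern \w+|[^\w\s]+
(runs of word characters, or runs of punctuation), not on whitespace alone.
The regex llama.cpp ends up using for LLAMA_VOCAB_PRE_TYPE_JINA_V2_ZH is
outside the hunk shown above; this sketch only illustrates the HF behavior:

    import re

    # HF tokenizers' Whitespace pre-tokenizer: word runs or punctuation runs.
    PATTERN = re.compile(r"\w+|[^\w\s]+")

    def whitespace_pretokenize(text: str) -> list[str]:
        return PATTERN.findall(text)

    print(whitespace_pretokenize("Hello, 世界!"))  # ['Hello', ',', '世界', '!']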
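
Note on tokenizer.ggml.normalizer.lowercase: this patch only adds the
reader side (the key is optional at load time, so apply_lowercase keeps its
default when absent); convert_hf_to_gguf.py does not emit the key yet. A
hypothetical writer-side counterpart via gguf-py's generic key/value API
(output path and arch string are placeholders):

    import gguf

    # Hypothetical: emit the new key so llama-vocab.cpp enables lowercase
    # normalization from metadata instead of hard-coding it per pre-type.
    writer = gguf.GGUFWriter("model.gguf", arch="jina-bert-v2")
    writer.add_bool("tokenizer.ggml.normalizer.lowercase", True)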