From 2b2afade9fb38b8d699ed561d20a259561c00fc3 Mon Sep 17 00:00:00 2001
From: o7si <32285332+o7si@users.noreply.github.com>
Date: Fri, 2 Jan 2026 01:27:07 +0800
Subject: [PATCH] convert : fix encoding of WPM vocab for BERT models (#18500)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* convert: avoid token collision when stripping ## prefix

* convert: use token types for BERT special tokens check

* Update convert_hf_to_gguf.py

Co-authored-by: Sigbjørn Skjæret

---------

Co-authored-by: Sigbjørn Skjæret
---
 convert_hf_to_gguf.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index a1080b15f0..2c961b8f59 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -5287,13 +5287,14 @@ class BertModel(TextModel):
         self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1))
 
         # convert to phantom space vocab
-        def phantom(tok):
-            if tok.startswith("[") and tok.endswith("]"):
+        def phantom(tok, toktype):
+            if toktype == gguf.TokenType.CONTROL:
                 return tok
             if tok.startswith("##"):
                 return tok[2:]
             return "\u2581" + tok
-        tokens = list(map(phantom, tokens))
+        assert len(tokens) == len(toktypes)
+        tokens = list(map(phantom, tokens, toktypes))
 
         # add vocab to gguf
         self.gguf_writer.add_tokenizer_model("bert")