convert: fix encoding of WPM vocab for BERT models (#18500)
* convert: avoid token collision when stripping ## prefix
* convert: use token types for BERT special tokens check
* Update convert_hf_to_gguf.py

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

---------

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
This commit is contained in:
parent
f4f5019254
commit
2b2afade9f
|
|
@@ -5287,13 +5287,14 @@ class BertModel(TextModel):
|
|||
self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1))
|
||||
|
||||
# convert to phantom space vocab
|
||||
def phantom(tok):
|
||||
if tok.startswith("[") and tok.endswith("]"):
|
||||
def phantom(tok, toktype):
|
||||
if toktype == gguf.TokenType.CONTROL:
|
||||
return tok
|
||||
if tok.startswith("##"):
|
||||
return tok[2:]
|
||||
return "\u2581" + tok
|
||||
tokens = list(map(phantom, tokens))
|
||||
assert len(tokens) == len(toktypes)
|
||||
tokens = list(map(phantom, tokens, toktypes))
|
||||
|
||||
# add vocab to gguf
|
||||
self.gguf_writer.add_tokenizer_model("bert")
|
||||
|
|
|
|||
Loading…
Reference in New Issue