requirements : update transformers to 5.5.1 (#21617)

* requirements : update transformers to 5.5.0

This commit updates the transformers dependency to version 5.5.0.

The motivation for this is that transformers 5.5.0 includes support for
Gemma4 and is required to be able to convert Gemma4 models. The older
pin is also causing issues for users of gguf-my-repo.

Refs: https://huggingface.co/spaces/ggml-org/gguf-my-repo/discussions/202

* fix huggingface_hub version

* set version of transformers to 5.5.0

* convert : add ty ignore directives to convert_hf_to_gguf.py

This commit adds `ty: ignore` directives to transformers tokenizer
field/method accesses to avoid type-check errors. There might be better
ways to handle this, and perhaps that can be done in a follow-up commit.

The motivation for this is that it looks like, in transformers 5.5.0,
AutoTokenizer.from_pretrained can return generic tokenizer types or None,
and the type checker now produces an error when the conversion script
accesses fields like tokenizer.vocab.
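
A minimal sketch of the suppression pattern used throughout this diff
(the model id below is hypothetical, and the behavior assumes
transformers 5.5.x as described above):

    from transformers import AutoTokenizer

    # from_pretrained is now annotated to return a generic tokenizer
    # (possibly None), so ty flags attribute access such as .vocab
    tokenizer = AutoTokenizer.from_pretrained("org/model")  # hypothetical id

    # suppress the diagnostic on the offending line only
    vocab_size = len(tokenizer.vocab)  # ty: ignore[unresolved-attribute]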

* convert : add ty ignore to suppress type check errors

* convert : remove incorrect type ignores

* convert : fix remaining python checks

I was running a newer version of ty locally, but I've switched to
version 0.0.26, which is what CI uses, and I was then able to reproduce
the errors. Sorry about the noise.

* update transformers version to 5.5.1
Author: Daniel Bevenius, 2026-04-09 12:36:29 +02:00 (committed by GitHub)
Parent: 4ef9301e4d
Commit: c8ac02fa1b
12 changed files with 108 additions and 108 deletions

View File

@@ -1229,15 +1229,15 @@ class TextModel(ModelBase):
 from transformers import AutoTokenizer
 tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
-vocab_size = self.hparams.get("vocab_size", len(tokenizer.vocab))
-assert max(tokenizer.vocab.values()) < vocab_size
+vocab_size = self.hparams.get("vocab_size", len(tokenizer.vocab)) # ty: ignore[unresolved-attribute]
+assert max(tokenizer.vocab.values()) < vocab_size # ty: ignore[unresolved-attribute]
 tokpre = self.get_vocab_base_pre(tokenizer)
-reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()}
-added_vocab = tokenizer.get_added_vocab()
-added_tokens_decoder = tokenizer.added_tokens_decoder
+reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()} # ty: ignore[unresolved-attribute]
+added_vocab = tokenizer.get_added_vocab() # ty: ignore[unresolved-attribute]
+added_tokens_decoder = tokenizer.added_tokens_decoder # ty: ignore[unresolved-attribute]
 for i in range(vocab_size):
     if i not in reverse_vocab:
@@ -1250,7 +1250,7 @@ class TextModel(ModelBase):
 # To avoid unexpected issues - we make sure to normalize non-normalized tokens
 if not added_tokens_decoder[i].normalized:
     previous_token = token
-    token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False))
+    token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False)) # ty: ignore[unresolved-attribute, invalid-assignment]
     if previous_token != token:
         logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer")
@@ -1583,13 +1583,13 @@ class TextModel(ModelBase):
 from transformers import AutoTokenizer
 tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
 vocab_size = hparams["vocab_size"]
-assert max(tokenizer.get_vocab().values()) < vocab_size
+assert max(tokenizer.get_vocab().values()) < vocab_size # ty: ignore[unresolved-attribute]
 tokpre = self.get_vocab_base_pre(tokenizer)
 merges = []
 vocab = {}
-mergeable_ranks = tokenizer.mergeable_ranks
+mergeable_ranks = tokenizer.mergeable_ranks # ty: ignore[unresolved-attribute]
 for token, rank in mergeable_ranks.items():
     vocab[QwenModel.token_bytes_to_string(token)] = rank
     if len(token) == 1:
@@ -1599,7 +1599,7 @@ class TextModel(ModelBase):
     merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged)))
 # for this kind of tokenizer, added_vocab is not a subset of vocab, so they need to be combined
-added_vocab = tokenizer.special_tokens
+added_vocab = tokenizer.special_tokens # ty: ignore[unresolved-attribute]
 reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **added_vocab}.items()}
 for i in range(vocab_size):
@@ -1622,10 +1622,10 @@ class TextModel(ModelBase):
 special_vocab.merges = merges
 # only add special tokens when they were not already loaded from config.json
 if len(special_vocab.special_token_ids) == 0:
-    special_vocab._set_special_token("bos", tokenizer.special_tokens["<|endoftext|>"])
-    special_vocab._set_special_token("eos", tokenizer.special_tokens["<|endoftext|>"])
+    special_vocab._set_special_token("bos", tokenizer.special_tokens["<|endoftext|>"]) # ty: ignore[unresolved-attribute]
+    special_vocab._set_special_token("eos", tokenizer.special_tokens["<|endoftext|>"]) # ty: ignore[unresolved-attribute]
 # this one is usually not in config.json anyway
-special_vocab._set_special_token("unk", tokenizer.special_tokens["<|endoftext|>"])
+special_vocab._set_special_token("unk", tokenizer.special_tokens["<|endoftext|>"]) # ty: ignore[unresolved-attribute]
 special_vocab.add_to_gguf(self.gguf_writer)

 def _set_vocab_sentencepiece(self, add_to_gguf=True):
@@ -1877,10 +1877,10 @@ class TextModel(ModelBase):
 self.gguf_writer.add_tokenizer_pre(tokpre)
 self.gguf_writer.add_token_list(tokens)
 self.gguf_writer.add_token_types(toktypes)
-special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"])
-special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"])
-special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"])
-special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["<|endoftext|>"])
+special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"]) # ty: ignore[unresolved-attribute]
+special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"]) # ty: ignore[unresolved-attribute]
+special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"]) # ty: ignore[unresolved-attribute]
+special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["<|endoftext|>"]) # ty: ignore[unresolved-attribute]
 special_vocab.add_to_gguf(self.gguf_writer)

 def _set_vocab_glm(self):
@@ -1894,10 +1894,10 @@ class TextModel(ModelBase):
 self.gguf_writer.add_token_types(toktypes)
 # Special tokens
 # Note: Using <|endoftext|> (151329) for eot causes endless generation
-special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["[gMASK]"]) # 151331
-special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"]) # 151336
-special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"]) # 151329
-special_vocab._set_special_token("eom", tokenizer.get_added_vocab()["<|observation|>"]) # 151338
+special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["[gMASK]"]) # ty: ignore[unresolved-attribute] # 151331
+special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"]) # ty: ignore[unresolved-attribute] # 151336
+special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"]) # ty: ignore[unresolved-attribute] # 151329
+special_vocab._set_special_token("eom", tokenizer.get_added_vocab()["<|observation|>"]) # ty: ignore[unresolved-attribute] # 151338
 special_vocab.add_to_gguf(self.gguf_writer)

 def _set_vocab_interns1(self):
@@ -1906,16 +1906,16 @@ class TextModel(ModelBase):
 from transformers import AutoTokenizer
 tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
-vocab = getattr(tokenizer, 'vocab', tokenizer.get_vocab())
+vocab = getattr(tokenizer, 'vocab', tokenizer.get_vocab()) # ty: ignore[unresolved-attribute]
 vocab_size = self.hparams.get("vocab_size", len(vocab))
 assert max(vocab.values()) < vocab_size
 tokpre = self.get_vocab_base_pre(tokenizer)
 reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in vocab.items()}
-added_vocab = tokenizer.get_added_vocab()
-added_tokens_decoder = tokenizer.added_tokens_decoder
+added_vocab = tokenizer.get_added_vocab() # ty: ignore[unresolved-attribute]
+added_tokens_decoder = tokenizer.added_tokens_decoder # ty: ignore[unresolved-attribute]
 for i in range(vocab_size):
     if i not in reverse_vocab:
@@ -1928,7 +1928,7 @@ class TextModel(ModelBase):
 # To avoid unexpected issues - we make sure to normalize non-normalized tokens
 if not added_tokens_decoder[i].normalized:
     previous_token = token
-    token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False))
+    token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False)) # ty: ignore[unresolved-attribute, invalid-assignment]
     if previous_token != token:
         logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer")
@@ -2516,15 +2516,15 @@ class XverseModel(TextModel):
 from transformers import AutoTokenizer
 tokenizer = AutoTokenizer.from_pretrained(dir_model)
-vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
+vocab_size = hparams.get("vocab_size", len(tokenizer.vocab)) # ty: ignore[unresolved-attribute]
 # Since we are checking the maximum index, we need to ensure it's strictly less than vocab_size,
 # because vocab_size is the count of items, and indexes start at 0.
-max_vocab_index = max(tokenizer.get_vocab().values())
+max_vocab_index = max(tokenizer.get_vocab().values()) # ty: ignore[unresolved-attribute]
 if max_vocab_index >= vocab_size:
     raise ValueError("Vocabulary size exceeds expected maximum size.")
-reverse_vocab: dict[int, str] = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()}
-added_vocab = tokenizer.get_added_vocab()
+reverse_vocab: dict[int, str] = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()} # ty: ignore[unresolved-attribute]
+added_vocab = tokenizer.get_added_vocab() # ty: ignore[unresolved-attribute]
 for token_id in range(vocab_size):
     token_text = reverse_vocab[token_id].encode('utf-8')
@@ -2535,7 +2535,7 @@ class XverseModel(TextModel):
 elif re.fullmatch(br"<0x[0-9A-Fa-f]{2}>", token_text):
     toktype = gguf.TokenType.BYTE # special
 elif reverse_vocab[token_id] in added_vocab:
-    if tokenizer.added_tokens_decoder[token_id].special:
+    if tokenizer.added_tokens_decoder[token_id].special: # ty: ignore[unresolved-attribute]
         toktype = gguf.TokenType.CONTROL
     else:
         toktype = gguf.TokenType.USER_DEFINED
@@ -3752,7 +3752,7 @@ class QwenModel(TextModel):
 @staticmethod
 def token_bytes_to_string(b):
-    from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode
+    from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode # ty: ignore[unresolved-import]
     byte_encoder = bytes_to_unicode()
     return ''.join([byte_encoder[ord(char)] for char in b.decode('latin-1')])
@@ -3823,14 +3823,14 @@ class DreamModel(TextModel):
 from transformers import AutoTokenizer
 tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
-vocab_dict = tokenizer.get_vocab()
+vocab_dict = tokenizer.get_vocab() # ty: ignore[unresolved-attribute]
 vocab_size = self.hparams.get("vocab_size", len(vocab_dict))
 assert max(vocab_dict.values()) < vocab_size
 tokpre = self.get_vocab_base_pre(tokenizer)
 reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in vocab_dict.items()}
-added_vocab = tokenizer.get_added_vocab()
+added_vocab = tokenizer.get_added_vocab() # ty: ignore[unresolved-attribute]
 for i in range(vocab_size):
     if i not in reverse_vocab:
@@ -3888,14 +3888,14 @@ class LLaDAModel(TextModel):
 from transformers import AutoTokenizer
 tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
-vocab_dict = tokenizer.get_vocab()
+vocab_dict = tokenizer.get_vocab() # ty: ignore[unresolved-attribute]
 vocab_size = self.hparams.get("vocab_size", len(vocab_dict))
 assert max(vocab_dict.values()) < vocab_size
 tokpre = self.get_vocab_base_pre(tokenizer)
 reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in vocab_dict.items()}
-added_vocab = tokenizer.get_added_vocab()
+added_vocab = tokenizer.get_added_vocab() # ty: ignore[unresolved-attribute]
 for i in range(vocab_size):
     if i not in reverse_vocab:
@@ -4673,9 +4673,9 @@ class Qwen3Model(Qwen2Model):
 self.is_rerank = True
 self.is_tied_embeddings = self.hparams.get("tie_word_embeddings", False)
-self.token_false_id = tokenizer.convert_tokens_to_ids("no")
-self.token_true_id = tokenizer.convert_tokens_to_ids("yes")
-self.sep_token_id = tokenizer.convert_tokens_to_ids("|")
+self.token_false_id = tokenizer.convert_tokens_to_ids("no") # ty: ignore[unresolved-attribute, invalid-assignment]
+self.token_true_id = tokenizer.convert_tokens_to_ids("yes") # ty: ignore[unresolved-attribute, invalid-assignment]
+self.sep_token_id = tokenizer.convert_tokens_to_ids("|") # ty: ignore[unresolved-attribute]
 assert self.token_false_id is not None and self.token_true_id is not None
@@ -5944,7 +5944,7 @@ class KimiLinearModel(TextModel):
 # Build merges list using the approach similar to HunYuanMoE
 merges = []
 vocab = {}
-mergeable_ranks = tokenizer.model._mergeable_ranks
+mergeable_ranks = tokenizer.model._mergeable_ranks # ty: ignore[unresolved-attribute]
 for token, rank in mergeable_ranks.items():
     vocab[QwenModel.token_bytes_to_string(token)] = rank
     if len(token) == 1:
@@ -5954,7 +5954,7 @@ class KimiLinearModel(TextModel):
     merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged)))
 # Build token list
 vocab_size = self.hparams["vocab_size"]
-special_tokens = tokenizer.special_tokens
+special_tokens = tokenizer.special_tokens # ty: ignore[unresolved-attribute]
 reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **special_tokens}.items()}
 tokens: list[str] = []
 toktypes: list[int] = []
@@ -5980,7 +5980,7 @@ class KimiLinearModel(TextModel):
     special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False)
     special_vocab.add_to_gguf(self.gguf_writer)
     # override eos id in config.json with tiktoken eos id
-    self.gguf_writer.add_eos_token_id(tokenizer.eos_id)
+    self.gguf_writer.add_eos_token_id(tokenizer.eos_id) # ty: ignore[unresolved-attribute]
 else:
     raise NotImplementedError(f"Deepseek pre-tokenizer {tokpre!r} is not supported yet!")
@@ -6474,11 +6474,11 @@ class BertModel(TextModel):
     with open(tokenizer_config_path, "r", encoding="utf-8") as fp:
         tokenizer_config_json = json.load(fp)
-    add_prefix = tokenizer.add_prefix_space
-    remove_whitespaces = tokenizer.clean_up_tokenization_spaces
+    add_prefix = tokenizer.add_prefix_space # ty: ignore[unresolved-attribute]
+    remove_whitespaces = tokenizer.clean_up_tokenization_spaces # ty: ignore[unresolved-attribute]
     precompiled_charsmap = b64decode(tokenizer_json["normalizer"]["precompiled_charsmap"])
-    vocab_size = max(self.hparams.get("vocab_size", 0), tokenizer.vocab_size)
+    vocab_size = max(self.hparams.get("vocab_size", 0), tokenizer.vocab_size) # ty: ignore[unresolved-attribute]
 else:
     sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute]
     sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
@@ -6495,7 +6495,7 @@ class BertModel(TextModel):
 tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
 scores: list[float] = [-10000.0] * vocab_size
-toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
+toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size # ty: ignore[invalid-assignment]
 if isinstance(tokenizer, SentencePieceProcessor):
     for token_id in range(tokenizer.vocab_size()):
@@ -6517,20 +6517,20 @@ class BertModel(TextModel):
         scores[token_id] = score
         toktypes[token_id] = toktype
 else:
-    added_vocab = tokenizer.get_added_vocab()
+    added_vocab = tokenizer.get_added_vocab() # ty: ignore[unresolved-attribute]
     unk_token = tokenizer_config_json.get("unk_token")
-    unk_token_id = added_vocab.get(unk_token, tokenizer_json["model"].get("unk_id", 3))
-    for token_id in range(tokenizer.vocab_size):
-        piece = tokenizer._convert_id_to_token(token_id)
-        if (piece := tokenizer._convert_id_to_token(token_id)) is not None:
+    unk_token_id = added_vocab.get(unk_token, tokenizer_json["model"].get("unk_id", 3)) # ty: ignore[no-matching-overload]
+    for token_id in range(tokenizer.vocab_size): # ty: ignore[unresolved-attribute]
+        piece = tokenizer._convert_id_to_token(token_id) # ty: ignore[unresolved-attribute]
+        if (piece := tokenizer._convert_id_to_token(token_id)) is not None: # ty: ignore[unresolved-attribute]
             text = piece.encode("utf-8")
             score = tokenizer_json["model"]["vocab"][token_id][1]
             toktype = SentencePieceTokenTypes.NORMAL
             if token_id == unk_token_id:
                 toktype = SentencePieceTokenTypes.UNKNOWN
-            elif token_id in tokenizer.all_special_ids:
+            elif token_id in tokenizer.all_special_ids: # ty: ignore[unresolved-attribute]
                 toktype = SentencePieceTokenTypes.CONTROL
             elif token_id in added_vocab.values():
                 toktype = SentencePieceTokenTypes.USER_DEFINED
@@ -8839,7 +8839,7 @@ class DeepseekV2Model(TextModel):
 # Build merges list using the approach similar to HunYuanMoE
 merges = []
 vocab = {}
-mergeable_ranks = tokenizer.model._mergeable_ranks
+mergeable_ranks = tokenizer.model._mergeable_ranks # ty: ignore[unresolved-attribute]
 for token, rank in mergeable_ranks.items():
     vocab[QwenModel.token_bytes_to_string(token)] = rank
     if len(token) == 1:
@@ -8850,7 +8850,7 @@ class DeepseekV2Model(TextModel):
 # Build token list
 vocab_size = self.hparams["vocab_size"]
-special_tokens = tokenizer.special_tokens
+special_tokens = tokenizer.special_tokens # ty: ignore[unresolved-attribute]
 reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **special_tokens}.items()}
 tokens: list[str] = []
 toktypes: list[int] = []
@@ -9821,10 +9821,10 @@ class Glm4Model(TextModel):
 self.gguf_writer.add_token_list(tokens)
 self.gguf_writer.add_token_types(toktypes)
 special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
-special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"])
-special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"])
-special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"])
-special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["<|endoftext|>"])
+special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"]) # ty: ignore[unresolved-attribute]
+special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"]) # ty: ignore[unresolved-attribute]
+special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"]) # ty: ignore[unresolved-attribute]
+special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["<|endoftext|>"]) # ty: ignore[unresolved-attribute]
 special_vocab.add_to_gguf(self.gguf_writer)

 def set_gguf_parameters(self):
@@ -10052,12 +10052,12 @@ class ChatGLMModel(TextModel):
 from transformers import AutoTokenizer
 tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
-vocab_size = hparams.get("padded_vocab_size", len(tokenizer.get_vocab()))
-assert max(tokenizer.get_vocab().values()) < vocab_size
+vocab_size = hparams.get("padded_vocab_size", len(tokenizer.get_vocab())) # ty: ignore[unresolved-attribute]
+assert max(tokenizer.get_vocab().values()) < vocab_size # ty: ignore[unresolved-attribute]
 role_special_tokens = ["<|system|>", "<|user|>", "<|assistant|>", "<|observation|>"]
 special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "sop", "eop"] + role_special_tokens
 for token_id in range(vocab_size):
-    piece = tokenizer._convert_id_to_token(token_id)
+    piece = tokenizer._convert_id_to_token(token_id) # ty: ignore[unresolved-attribute]
     if token_id == 0:
         piece = "<unk>"
     elif token_id == 1:
@@ -10065,17 +10065,17 @@ class ChatGLMModel(TextModel):
 elif token_id == 2:
     piece = "<eos>"
-text = piece.encode("utf-8")
+text = piece.encode("utf-8") # ty: ignore[unresolved-attribute]
 score = 0.0
 # Referencing the tokenizer Python implementation(https://huggingface.co/THUDM/chatglm3-6b/blob/main/tokenization_chatglm.py),
 # it is only valid if it is less than tokenizer.tokenizer.sp_model.vocab_size()
-if len(piece) != 0 and token_id < tokenizer.tokenizer.sp_model.vocab_size():
-    score = tokenizer.tokenizer.sp_model.get_score(token_id)
-if token_id >= tokenizer.tokenizer.sp_model.vocab_size():
+if len(piece) != 0 and token_id < tokenizer.tokenizer.sp_model.vocab_size(): # ty: ignore[unresolved-attribute, invalid-argument-type]
+    score = tokenizer.tokenizer.sp_model.get_score(token_id) # ty: ignore[unresolved-attribute]
+if token_id >= tokenizer.tokenizer.sp_model.vocab_size(): # ty: ignore[unresolved-attribute]
     if piece in special_tokens:
         toktype = SentencePieceTokenTypes.CONTROL
-    elif len(piece) == 0:
+    elif len(piece) == 0: # ty: ignore[invalid-argument-type]
         text = f"[PAD{token_id}]".encode("utf-8")
         toktype = SentencePieceTokenTypes.UNUSED
     else:
@@ -10086,13 +10086,13 @@ class ChatGLMModel(TextModel):
         continue
 toktype = SentencePieceTokenTypes.NORMAL
-if tokenizer.tokenizer.sp_model.is_unknown(token_id):
+if tokenizer.tokenizer.sp_model.is_unknown(token_id): # ty: ignore[unresolved-attribute]
     toktype = SentencePieceTokenTypes.UNKNOWN
-elif tokenizer.tokenizer.sp_model.is_control(token_id):
+elif tokenizer.tokenizer.sp_model.is_control(token_id): # ty: ignore[unresolved-attribute]
     toktype = SentencePieceTokenTypes.CONTROL
-elif tokenizer.tokenizer.sp_model.is_unused(token_id):
+elif tokenizer.tokenizer.sp_model.is_unused(token_id): # ty: ignore[unresolved-attribute]
     toktype = SentencePieceTokenTypes.UNUSED
-elif tokenizer.tokenizer.sp_model.is_byte(token_id):
+elif tokenizer.tokenizer.sp_model.is_byte(token_id): # ty: ignore[unresolved-attribute]
     toktype = SentencePieceTokenTypes.BYTE
 tokens.append(text)
@@ -10112,7 +10112,7 @@ class ChatGLMModel(TextModel):
 @staticmethod
 def token_bytes_to_string(b):
-    from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode
+    from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode # ty: ignore[unresolved-import]
     byte_encoder = bytes_to_unicode()
     return ''.join([byte_encoder[ord(char)] for char in b.decode('latin-1')])
@@ -10146,7 +10146,7 @@ class ChatGLMModel(TextModel):
 from transformers import AutoTokenizer
 tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
 vocab_size = hparams.get("padded_vocab_size",hparams["vocab_size"])
-assert max(tokenizer.get_vocab().values()) < vocab_size
+assert max(tokenizer.get_vocab().values()) < vocab_size # ty: ignore[unresolved-attribute]
 tokens, toktypes, tokpre = self.get_vocab_base()
 self.gguf_writer.add_tokenizer_model("gpt2")
@@ -10155,10 +10155,10 @@ class ChatGLMModel(TextModel):
 self.gguf_writer.add_token_types(toktypes)
 special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
 # only add special tokens when they were not already loaded from config.json
-special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"])
-special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"])
+special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"]) # ty: ignore[unresolved-attribute]
+special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"]) # ty: ignore[unresolved-attribute]
 # this one is usually not in config.json anyway
-special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"])
+special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"]) # ty: ignore[unresolved-attribute]
 special_vocab.add_to_gguf(self.gguf_writer)

 def set_gguf_parameters(self):
@@ -11424,7 +11424,7 @@ class HunYuanMoEModel(TextModel):
 # 2. Reverse-engineer the merges list from mergeable_ranks
 merges = []
 vocab = {}
-mergeable_ranks = tokenizer.mergeable_ranks
+mergeable_ranks = tokenizer.mergeable_ranks # ty: ignore[unresolved-attribute]
 for token, rank in mergeable_ranks.items():
     vocab[QwenModel.token_bytes_to_string(token)] = rank
     if len(token) == 1:
@@ -11435,8 +11435,8 @@ class HunYuanMoEModel(TextModel):
 # 3. Generate the tokens and toktypes lists
 vocab_size = self.hparams["vocab_size"]
-assert tokenizer.vocab_size == vocab_size
-special_tokens = tokenizer.special_tokens
+assert tokenizer.vocab_size == vocab_size # ty: ignore[unresolved-attribute]
+special_tokens = tokenizer.special_tokens # ty: ignore[unresolved-attribute]
 reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **special_tokens}.items()}
 tokens: list[str] = []
 toktypes: list[int] = []
@@ -11660,7 +11660,7 @@ class HunYuanModel(TextModel):
 # 2. Reverse-engineer the merges list from mergeable_ranks
 merges = []
 vocab = {}
-mergeable_ranks = tokenizer.mergeable_ranks
+mergeable_ranks = tokenizer.mergeable_ranks # ty: ignore[unresolved-attribute]
 for token, rank in mergeable_ranks.items():
     vocab[QwenModel.token_bytes_to_string(token)] = rank
     if len(token) == 1:
@@ -11671,8 +11671,8 @@ class HunYuanModel(TextModel):
 # 3. Generate the tokens and toktypes lists
 vocab_size = self.hparams["vocab_size"]
-assert tokenizer.vocab_size == vocab_size
-special_tokens = tokenizer.special_tokens
+assert tokenizer.vocab_size == vocab_size # ty: ignore[unresolved-attribute]
+special_tokens = tokenizer.special_tokens # ty: ignore[unresolved-attribute]
 reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **special_tokens}.items()}
 tokens: list[str] = []
 toktypes: list[int] = []
@@ -12820,10 +12820,10 @@ class SolarOpenModel(Glm4MoeModel):
 self.gguf_writer.add_tokenizer_pre(tokpre)
 self.gguf_writer.add_token_list(tokens)
 self.gguf_writer.add_token_types(toktypes)
-special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"])
-special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|endoftext|>"])
-special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<unk>"])
-special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["<|startoftext|>"])
+special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"]) # ty: ignore[unresolved-attribute]
+special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|endoftext|>"]) # ty: ignore[unresolved-attribute]
+special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<unk>"]) # ty: ignore[unresolved-attribute]
+special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["<|startoftext|>"]) # ty: ignore[unresolved-attribute]
 special_vocab.add_to_gguf(self.gguf_writer)

View File

@@ -296,7 +296,7 @@ for model in [*pre_computed_hashes, *all_models]:
 except Exception as e:
     raise OSError(f"Error loading tokenizer for model {name}.") from e
-chktok = tokenizer.encode(CHK_TXT)
+chktok = tokenizer.encode(CHK_TXT) # ty: ignore[unresolved-attribute]
 chkhsh = sha256(str(chktok).encode()).hexdigest()
 logger.info(f"model: {name}")
@@ -468,7 +468,7 @@ for model in models:
 with open(f"models/ggml-vocab-{name}.gguf.out", "w") as f:
     for text in tests:
-        res = tokenizer.encode(text, add_special_tokens=False)
+        res = tokenizer.encode(text, add_special_tokens=False) # ty: ignore[unresolved-attribute]
         for r in res:
             f.write(f" {r}")
         f.write("\n")

View File

@@ -402,7 +402,7 @@ if __name__ == '__main__':
 # the invocation string includes the "<|start_of_turn|>"
 # token, but the adapters themselves were trained to
 # activate _after_ that first token, so we drop it here.
-alora_invocation_tokens = tokenizer(invocation_string)["input_ids"][1:]
+alora_invocation_tokens = tokenizer(invocation_string)["input_ids"][1:] # ty: ignore[call-non-callable]
 if alora_invocation_tokens:
     logger.debug("GGUF KV: %s = %s", gguf.Keys.Adapter.ALORA_INVOCATION_TOKENS, alora_invocation_tokens)
     self.gguf_writer.add_key_value(

View File

@@ -53,10 +53,10 @@ model_name = os.path.basename(model_path)
 print(f"Model name: {model_name}")
 prompt = "Hello world today"
-input_ids = tokenizer(prompt, return_tensors="pt").input_ids
+input_ids = tokenizer(prompt, return_tensors="pt").input_ids # ty: ignore[call-non-callable]
 print(f"Input tokens: {input_ids}")
 print(f"Input text: {repr(prompt)}")
-print(f"Tokenized: {tokenizer.convert_ids_to_tokens(input_ids[0])}")
+print(f"Tokenized: {tokenizer.convert_ids_to_tokens(input_ids[0])}") # ty: ignore[unresolved-attribute]
 with torch.no_grad():
     outputs = model(input_ids, output_hidden_states=True)
@@ -92,7 +92,7 @@ with torch.no_grad():
 # Print embeddings per token in the requested format
 print("\nToken embeddings:")
-tokens = tokenizer.convert_ids_to_tokens(input_ids[0])
+tokens = tokenizer.convert_ids_to_tokens(input_ids[0]) # ty: ignore[unresolved-attribute]
 for i, embedding in enumerate(token_embeddings):
     # Format: show first few values, ..., then last few values
     if len(embedding) > 10:

View File

@@ -207,8 +207,8 @@ def main():
 else:
     model = AutoModel.from_pretrained(args.model_path, trust_remote_code=True)
-encoded = tokenizer(prompt, return_tensors="pt")
-tokens = tokenizer.convert_ids_to_tokens(encoded['input_ids'][0])
+encoded = tokenizer(prompt, return_tensors="pt") # ty: ignore[call-non-callable]
+tokens = tokenizer.convert_ids_to_tokens(encoded['input_ids'][0]) # ty: ignore[unresolved-attribute]
 n_tokens = len(tokens)
 print(f"n_tokens: {n_tokens}");
 print(f"hidden_size: {model.config.hidden_size}")

View File

@@ -543,7 +543,7 @@ class LlamaHfVocab(Vocab):
     cache_dir=base_path,
     local_files_only=True,
 )
-assert self.tokenizer.is_fast # assume tokenizer.json is used
+assert self.tokenizer.is_fast # assume tokenizer.json is used # ty: ignore[unresolved-attribute]
 # Initialize lists and dictionaries for added tokens
 self.added_tokens_list = []
@@ -552,30 +552,30 @@ class LlamaHfVocab(Vocab):
 # Process added tokens
 for tok, tokidx in sorted(
-    self.tokenizer.get_added_vocab().items(), key=lambda x: x[1]
+    self.tokenizer.get_added_vocab().items(), key=lambda x: x[1] # ty: ignore[unresolved-attribute]
 ):
     # Only consider added tokens that are not in the base vocabulary
-    if tokidx >= self.tokenizer.vocab_size:
+    if tokidx >= self.tokenizer.vocab_size: # ty: ignore[unresolved-attribute]
         self.added_tokens_list.append(tok)
         self.added_tokens_dict[tok] = tokidx
         self.added_tokens_ids.add(tokidx)
 # Store special tokens and their IDs
 self.specials = {
-    tok: self.tokenizer.get_vocab()[tok]
-    for tok in self.tokenizer.all_special_tokens
+    tok: self.tokenizer.get_vocab()[tok] # ty: ignore[unresolved-attribute]
+    for tok in self.tokenizer.all_special_tokens # ty: ignore[unresolved-attribute]
 }
-self.special_ids = set(self.tokenizer.all_special_ids)
+self.special_ids = set(self.tokenizer.all_special_ids) # ty: ignore[unresolved-attribute]
 # Set vocabulary sizes
-self.vocab_size_base = self.tokenizer.vocab_size
+self.vocab_size_base = self.tokenizer.vocab_size # ty: ignore[unresolved-attribute]
 self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
 self.fname_tokenizer = fname_tokenizer

 def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
     reverse_vocab = {
-        id: encoded_tok for encoded_tok, id in self.tokenizer.get_vocab().items()
+        id: encoded_tok for encoded_tok, id in self.tokenizer.get_vocab().items() # ty: ignore[unresolved-attribute]
     }
     for token_id in range(self.vocab_size_base):
@@ -616,7 +616,7 @@ class LlamaHfVocab(Vocab):
     yield text.encode("utf-8"), score, toktype

 def has_newline_token(self):
-    return "<0x0A>" in self.tokenizer.vocab or "\n" in self.tokenizer.vocab
+    return "<0x0A>" in self.tokenizer.vocab or "\n" in self.tokenizer.vocab # ty: ignore[unresolved-attribute]

 def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
     yield from self.hf_tokens()

View File

@@ -18,7 +18,7 @@ classifiers = [
 python = ">=3.9"
 numpy = "^1.25.0"
 sentencepiece = ">=0.1.98,<0.3.0"
-transformers = ">=4.35.2,<5.0.0"
+transformers = "==5.5.1"
 protobuf = ">=4.21.0,<5.0.0"
 gguf = { path = "./gguf-py" }
 torch = { version = "^2.2.0", source = "pytorch" }

View File

@@ -1,7 +1,7 @@
 numpy~=1.26.4
 sentencepiece>=0.1.98,<0.3.0
-transformers>=4.57.1,<5.0.0
+transformers==5.5.1
 gguf>=0.1.0
 protobuf>=4.21.0,<5.0.0

View File

@@ -1,6 +1,6 @@
 aiohttp~=3.9.3
 pytest~=8.3.3
-huggingface_hub>=0.34.0,<1.0
+huggingface_hub>=1.5.0,<2.0
 matplotlib~=3.10.0
 numpy~=1.26.4
 openai~=2.14.0

View File

@@ -19,7 +19,7 @@ with open(fname_tok, 'r', encoding='utf-8') as f:
     lines = f.readlines()
     s = ''.join(lines)
 t_start = time.time()
-res = tokenizer.encode(s, add_special_tokens=False)
+res = tokenizer.encode(s, add_special_tokens=False) # ty: ignore[unresolved-attribute]
 t_end = time.time()
 print('\nmain : tokenized in', "{:.3f}".format(1000.0 * (t_end - t_start)), 'ms (py)') # noqa: NP100
 with open(fname_out, 'w', encoding='utf-8') as f:

View File

@@ -128,7 +128,7 @@ class Tokenizer:
 class TokenizerGroundtruth (Tokenizer):
     def __init__(self, dir_tokenizer: str):
-        self.model: PreTrainedTokenizer = AutoTokenizer.from_pretrained(dir_tokenizer)
+        self.model: PreTrainedTokenizer = AutoTokenizer.from_pretrained(dir_tokenizer) # ty: ignore[invalid-assignment]
         # guess BOS and EOS
         ids = self.encode("a")
         assert 1 <= len(ids) <= 3
@@ -142,7 +142,7 @@ class TokenizerGroundtruth (Tokenizer):
 self.vocab = list(sorted(self.vocab))
 # tokens and lists
 self.special_tokens = list(self.model.all_special_tokens)
-self.added_tokens = self.model.batch_decode(self.model.added_tokens_encoder.values(), skip_special_tokens=False)
+self.added_tokens = self.model.batch_decode(list(self.model.added_tokens_encoder.values()), skip_special_tokens=False)
 self.bos_token = self.model.bos_token
 self.eos_token = self.model.eos_token
@@ -150,7 +150,7 @@ class TokenizerGroundtruth (Tokenizer):
     return self.model.encode(text, add_special_tokens=True)

 def decode(self, ids: list[int]) -> str:
-    return self.model.decode(ids, skip_special_tokens=False)
+    return self.model.decode(ids, skip_special_tokens=False) # ty: ignore[invalid-return-type]

 class TokenizerLlamaCpp (Tokenizer):

View File

@@ -1,6 +1,6 @@
 aiohttp~=3.9.3
 pytest~=8.3.3
-huggingface_hub>=0.34.0,<1.0
+huggingface_hub>=1.5.0,<2.0
 numpy~=1.26.4
 openai~=2.14.0
 prometheus-client~=0.20.0