From c8ac02fa1b9f8a154e110b655f7f7e4907796b0c Mon Sep 17 00:00:00 2001
From: Daniel Bevenius
Date: Thu, 9 Apr 2026 12:36:29 +0200
Subject: [PATCH] requirements : update transformers to 5.5.1 (#21617)

* requirements : update transformers to 5.5.0

This commit updates the transformers dependency to version 5.5.0.

The motivation for this is that transformers 5.5.0 includes support for
Gemma4 and is required to be able to convert Gemma4 models. The older
version is also causing issues for users of gguf-my-repo.

Refs: https://huggingface.co/spaces/ggml-org/gguf-my-repo/discussions/202

* fix huggingface_hub version

* set version of transformers to 5.5.0

* convert : add ty ignore directives to convert_hf_to_gguf.py

This commit adds `ty: ignore` directives to accesses of transformers
tokenizer fields/methods to avoid type check errors. There might be
better ways to handle this, and perhaps that can be done in a follow-up
commit.

The motivation for this is that in transformers 5.5.0,
AutoTokenizer.from_pretrained can return generic tokenizer types or
None, and the type checker now produces an error when the conversion
script accesses fields like tokenizer.vocab.

* convert : add ty ignore to suppress type check errors

* convert : remove incorrect type ignores

* convert : fix remaining python checks

I was running a newer version of ty locally, but after switching to
version 0.0.26, which is what CI uses, I was able to reproduce the
errors. Sorry about the noise.

* update transformers version to 5.5.1
---
 convert_hf_to_gguf.py                         | 166 +++++++++---
 convert_hf_to_gguf_update.py                  |   4 +-
 convert_lora_to_gguf.py                       |   2 +-
 .../causal/run-casual-gen-embeddings-org.py   |   6 +-
 .../scripts/utils/semantic_check.py           |   4 +-
 gguf-py/gguf/vocab.py                         |  18 +-
 pyproject.toml                                |   2 +-
 .../requirements-convert_legacy_llama.txt     |   2 +-
 requirements/requirements-tool_bench.txt      |   2 +-
 tests/test-tokenizer-0.py                     |   2 +-
 tests/test-tokenizer-random.py                |   6 +-
 tools/server/tests/requirements.txt           |   2 +-
 12 files changed, 108 insertions(+), 108 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index b5e56f87ca..8d6b0a97a0 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -1229,15 +1229,15 @@ class TextModel(ModelBase):
         from transformers import AutoTokenizer
         tokenizer = AutoTokenizer.from_pretrained(self.dir_model)

-        vocab_size = self.hparams.get("vocab_size", len(tokenizer.vocab))
-        assert max(tokenizer.vocab.values()) < vocab_size
+        vocab_size = self.hparams.get("vocab_size", len(tokenizer.vocab)) # ty: ignore[unresolved-attribute]
+        assert max(tokenizer.vocab.values()) < vocab_size # ty: ignore[unresolved-attribute]

         tokpre = self.get_vocab_base_pre(tokenizer)

-        reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()}
-        added_vocab = tokenizer.get_added_vocab()
+        reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()} # ty: ignore[unresolved-attribute]
+        added_vocab = tokenizer.get_added_vocab() # ty: ignore[unresolved-attribute]

-        added_tokens_decoder = tokenizer.added_tokens_decoder
+        added_tokens_decoder = tokenizer.added_tokens_decoder # ty: ignore[unresolved-attribute]

         for i in range(vocab_size):
             if i not in reverse_vocab:
@@ -1250,7 +1250,7 @@ class TextModel(ModelBase):
                 # To avoid unexpected issues - we make sure to normalize non-normalized tokens
                 if not added_tokens_decoder[i].normalized:
                     previous_token = token
-                    token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False))
+                    token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False)) # ty: ignore[unresolved-attribute, invalid-assignment]
                     if previous_token != token:
                         logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer")
@@ -1583,13 +1583,13 @@ class TextModel(ModelBase):
         from transformers import AutoTokenizer
         tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
         vocab_size = hparams["vocab_size"]
-        assert max(tokenizer.get_vocab().values()) < vocab_size
+        assert max(tokenizer.get_vocab().values()) < vocab_size # ty: ignore[unresolved-attribute]

         tokpre = self.get_vocab_base_pre(tokenizer)

         merges = []
         vocab = {}
-        mergeable_ranks = tokenizer.mergeable_ranks
+        mergeable_ranks = tokenizer.mergeable_ranks # ty: ignore[unresolved-attribute]
         for token, rank in mergeable_ranks.items():
             vocab[QwenModel.token_bytes_to_string(token)] = rank
             if len(token) == 1:
@@ -1599,7 +1599,7 @@ class TextModel(ModelBase):
             merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged)))

         # for this kind of tokenizer, added_vocab is not a subset of vocab, so they need to be combined
-        added_vocab = tokenizer.special_tokens
+        added_vocab = tokenizer.special_tokens # ty: ignore[unresolved-attribute]
         reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **added_vocab}.items()}

         for i in range(vocab_size):
@@ -1622,10 +1622,10 @@ class TextModel(ModelBase):
         special_vocab.merges = merges
         # only add special tokens when they were not already loaded from config.json
         if len(special_vocab.special_token_ids) == 0:
-            special_vocab._set_special_token("bos", tokenizer.special_tokens["<|endoftext|>"])
-            special_vocab._set_special_token("eos", tokenizer.special_tokens["<|endoftext|>"])
+            special_vocab._set_special_token("bos", tokenizer.special_tokens["<|endoftext|>"]) # ty: ignore[unresolved-attribute]
+            special_vocab._set_special_token("eos", tokenizer.special_tokens["<|endoftext|>"]) # ty: ignore[unresolved-attribute]
         # this one is usually not in config.json anyway
-        special_vocab._set_special_token("unk", tokenizer.special_tokens["<|endoftext|>"])
+        special_vocab._set_special_token("unk", tokenizer.special_tokens["<|endoftext|>"]) # ty: ignore[unresolved-attribute]

         special_vocab.add_to_gguf(self.gguf_writer)

     def _set_vocab_sentencepiece(self, add_to_gguf=True):
@@ -1877,10 +1877,10 @@ class TextModel(ModelBase):
         self.gguf_writer.add_tokenizer_pre(tokpre)
         self.gguf_writer.add_token_list(tokens)
         self.gguf_writer.add_token_types(toktypes)
-        special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"])
-        special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"])
-        special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"])
-        special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["<|endoftext|>"])
+        special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"]) # ty: ignore[unresolved-attribute]
+        special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"]) # ty: ignore[unresolved-attribute]
+        special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"]) # ty: ignore[unresolved-attribute]
+        special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["<|endoftext|>"]) # ty: ignore[unresolved-attribute]
         special_vocab.add_to_gguf(self.gguf_writer)

     def _set_vocab_glm(self):
@@ -1894,10 +1894,10 @@ class TextModel(ModelBase):
         self.gguf_writer.add_token_types(toktypes)
         # Special tokens
         # Note: Using <|endoftext|> (151329) for eot causes endless generation
-        special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["[gMASK]"]) # 151331
-        special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"]) # 151336
-        special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"]) # 151329
-        special_vocab._set_special_token("eom", tokenizer.get_added_vocab()["<|observation|>"]) # 151338
+        special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["[gMASK]"]) # ty: ignore[unresolved-attribute] # 151331
+        special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"]) # ty: ignore[unresolved-attribute] # 151336
+        special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"]) # ty: ignore[unresolved-attribute] # 151329
+        special_vocab._set_special_token("eom", tokenizer.get_added_vocab()["<|observation|>"]) # ty: ignore[unresolved-attribute] # 151338
         special_vocab.add_to_gguf(self.gguf_writer)

     def _set_vocab_interns1(self):
@@ -1906,16 +1906,16 @@ class TextModel(ModelBase):
         from transformers import AutoTokenizer
         tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)

-        vocab = getattr(tokenizer, 'vocab', tokenizer.get_vocab())
+        vocab = getattr(tokenizer, 'vocab', tokenizer.get_vocab()) # ty: ignore[unresolved-attribute]
         vocab_size = self.hparams.get("vocab_size", len(vocab))
         assert max(vocab.values()) < vocab_size

         tokpre = self.get_vocab_base_pre(tokenizer)

         reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in vocab.items()}
-        added_vocab = tokenizer.get_added_vocab()
+        added_vocab = tokenizer.get_added_vocab() # ty: ignore[unresolved-attribute]

-        added_tokens_decoder = tokenizer.added_tokens_decoder
+        added_tokens_decoder = tokenizer.added_tokens_decoder # ty: ignore[unresolved-attribute]

         for i in range(vocab_size):
             if i not in reverse_vocab:
@@ -1928,7 +1928,7 @@ class TextModel(ModelBase):
                 # To avoid unexpected issues - we make sure to normalize non-normalized tokens
                 if not added_tokens_decoder[i].normalized:
                     previous_token = token
-                    token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False))
+                    token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False)) # ty: ignore[unresolved-attribute, invalid-assignment]
                     if previous_token != token:
                         logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer")
@@ -2516,15 +2516,15 @@ class XverseModel(TextModel):
         from transformers import AutoTokenizer
         tokenizer = AutoTokenizer.from_pretrained(dir_model)
-        vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
+        vocab_size = hparams.get("vocab_size", len(tokenizer.vocab)) # ty: ignore[unresolved-attribute]
         # Since we are checking the maximum index, we need to ensure it's strictly less than vocab_size,
         # because vocab_size is the count of items, and indexes start at 0.
-        max_vocab_index = max(tokenizer.get_vocab().values())
+        max_vocab_index = max(tokenizer.get_vocab().values()) # ty: ignore[unresolved-attribute]
         if max_vocab_index >= vocab_size:
             raise ValueError("Vocabulary size exceeds expected maximum size.")

-        reverse_vocab: dict[int, str] = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()}
-        added_vocab = tokenizer.get_added_vocab()
+        reverse_vocab: dict[int, str] = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()} # ty: ignore[unresolved-attribute]
+        added_vocab = tokenizer.get_added_vocab() # ty: ignore[unresolved-attribute]

         for token_id in range(vocab_size):
             token_text = reverse_vocab[token_id].encode('utf-8')
@@ -2535,7 +2535,7 @@ class XverseModel(TextModel):
             elif re.fullmatch(br"<0x[0-9A-Fa-f]{2}>", token_text):
                 toktype = gguf.TokenType.BYTE # special
             elif reverse_vocab[token_id] in added_vocab:
-                if tokenizer.added_tokens_decoder[token_id].special:
+                if tokenizer.added_tokens_decoder[token_id].special: # ty: ignore[unresolved-attribute]
                     toktype = gguf.TokenType.CONTROL
                 else:
                     toktype = gguf.TokenType.USER_DEFINED
@@ -3752,7 +3752,7 @@ class QwenModel(TextModel):

     @staticmethod
     def token_bytes_to_string(b):
-        from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode
+        from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode # ty: ignore[unresolved-import]
         byte_encoder = bytes_to_unicode()
         return ''.join([byte_encoder[ord(char)] for char in b.decode('latin-1')])
@@ -3823,14 +3823,14 @@ class DreamModel(TextModel):
         from transformers import AutoTokenizer
         tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)

-        vocab_dict = tokenizer.get_vocab()
+        vocab_dict = tokenizer.get_vocab() # ty: ignore[unresolved-attribute]
         vocab_size = self.hparams.get("vocab_size", len(vocab_dict))
         assert max(vocab_dict.values()) < vocab_size

         tokpre = self.get_vocab_base_pre(tokenizer)

         reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in vocab_dict.items()}
-        added_vocab = tokenizer.get_added_vocab()
+        added_vocab = tokenizer.get_added_vocab() # ty: ignore[unresolved-attribute]

         for i in range(vocab_size):
             if i not in reverse_vocab:
@@ -3888,14 +3888,14 @@ class LLaDAModel(TextModel):
         from transformers import AutoTokenizer
         tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)

-        vocab_dict = tokenizer.get_vocab()
+        vocab_dict = tokenizer.get_vocab() # ty: ignore[unresolved-attribute]
         vocab_size = self.hparams.get("vocab_size", len(vocab_dict))
         assert max(vocab_dict.values()) < vocab_size

         tokpre = self.get_vocab_base_pre(tokenizer)

         reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in vocab_dict.items()}
-        added_vocab = tokenizer.get_added_vocab()
+        added_vocab = tokenizer.get_added_vocab() # ty: ignore[unresolved-attribute]

         for i in range(vocab_size):
             if i not in reverse_vocab:
@@ -4673,9 +4673,9 @@ class Qwen3Model(Qwen2Model):
             self.is_rerank = True
             self.is_tied_embeddings = self.hparams.get("tie_word_embeddings", False)
-            self.token_false_id = tokenizer.convert_tokens_to_ids("no")
-            self.token_true_id = tokenizer.convert_tokens_to_ids("yes")
-            self.sep_token_id = tokenizer.convert_tokens_to_ids("|")
+            self.token_false_id = tokenizer.convert_tokens_to_ids("no") # ty: ignore[unresolved-attribute, invalid-assignment]
+            self.token_true_id = tokenizer.convert_tokens_to_ids("yes") # ty: ignore[unresolved-attribute, invalid-assignment]
+            self.sep_token_id = tokenizer.convert_tokens_to_ids("|") # ty: ignore[unresolved-attribute]

             assert self.token_false_id is not None and self.token_true_id is not None
@@ -5944,7 +5944,7 @@ class KimiLinearModel(TextModel):
             # Build merges list using the approach similar to HunYuanMoE
             merges = []
             vocab = {}
-            mergeable_ranks = tokenizer.model._mergeable_ranks
+            mergeable_ranks = tokenizer.model._mergeable_ranks # ty: ignore[unresolved-attribute]
             for token, rank in mergeable_ranks.items():
                 vocab[QwenModel.token_bytes_to_string(token)] = rank
                 if len(token) == 1:
@@ -5954,7 +5954,7 @@ class KimiLinearModel(TextModel):
                 merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged)))

             # Build token list
             vocab_size = self.hparams["vocab_size"]
-            special_tokens = tokenizer.special_tokens
+            special_tokens = tokenizer.special_tokens # ty: ignore[unresolved-attribute]
             reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **special_tokens}.items()}
             tokens: list[str] = []
             toktypes: list[int] = []
@@ -5980,7 +5980,7 @@ class KimiLinearModel(TextModel):
             special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False)
             special_vocab.add_to_gguf(self.gguf_writer)
             # override eos id in config.json with tiktoken eos id
-            self.gguf_writer.add_eos_token_id(tokenizer.eos_id)
+            self.gguf_writer.add_eos_token_id(tokenizer.eos_id) # ty: ignore[unresolved-attribute]
         else:
             raise NotImplementedError(f"Deepseek pre-tokenizer {tokpre!r} is not supported yet!")
@@ -6474,11 +6474,11 @@ class BertModel(TextModel):
             with open(tokenizer_config_path, "r", encoding="utf-8") as fp:
                 tokenizer_config_json = json.load(fp)

-            add_prefix = tokenizer.add_prefix_space
-            remove_whitespaces = tokenizer.clean_up_tokenization_spaces
+            add_prefix = tokenizer.add_prefix_space # ty: ignore[unresolved-attribute]
+            remove_whitespaces = tokenizer.clean_up_tokenization_spaces # ty: ignore[unresolved-attribute]
             precompiled_charsmap = b64decode(tokenizer_json["normalizer"]["precompiled_charsmap"])

-            vocab_size = max(self.hparams.get("vocab_size", 0), tokenizer.vocab_size)
+            vocab_size = max(self.hparams.get("vocab_size", 0), tokenizer.vocab_size) # ty: ignore[unresolved-attribute]
         else:
             sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute]
             sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
@@ -6495,7 +6495,7 @@ class BertModel(TextModel):

         tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
         scores: list[float] = [-10000.0] * vocab_size
-        toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
+        toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size # ty: ignore[invalid-assignment]

         if isinstance(tokenizer, SentencePieceProcessor):
             for token_id in range(tokenizer.vocab_size()):
@@ -6517,20 +6517,20 @@ class BertModel(TextModel):
                 scores[token_id] = score
                 toktypes[token_id] = toktype
         else:
-            added_vocab = tokenizer.get_added_vocab()
+            added_vocab = tokenizer.get_added_vocab() # ty: ignore[unresolved-attribute]
             unk_token = tokenizer_config_json.get("unk_token")
-            unk_token_id = added_vocab.get(unk_token, tokenizer_json["model"].get("unk_id", 3))
+            unk_token_id = added_vocab.get(unk_token, tokenizer_json["model"].get("unk_id", 3)) # ty: ignore[no-matching-overload]

-            for token_id in range(tokenizer.vocab_size):
-                piece = tokenizer._convert_id_to_token(token_id)
-                if (piece := tokenizer._convert_id_to_token(token_id)) is not None:
+            for token_id in range(tokenizer.vocab_size): # ty: ignore[unresolved-attribute]
+                piece = tokenizer._convert_id_to_token(token_id) # ty: ignore[unresolved-attribute]
+                if (piece := tokenizer._convert_id_to_token(token_id)) is not None: # ty: ignore[unresolved-attribute]
                     text = piece.encode("utf-8")
                     score = tokenizer_json["model"]["vocab"][token_id][1]
                     toktype = SentencePieceTokenTypes.NORMAL
                     if token_id == unk_token_id:
                         toktype = SentencePieceTokenTypes.UNKNOWN
-                    elif token_id in tokenizer.all_special_ids:
+                    elif token_id in tokenizer.all_special_ids: # ty: ignore[unresolved-attribute]
                         toktype = SentencePieceTokenTypes.CONTROL
                     elif token_id in added_vocab.values():
                         toktype = SentencePieceTokenTypes.USER_DEFINED
@@ -8839,7 +8839,7 @@ class DeepseekV2Model(TextModel):
             # Build merges list using the approach similar to HunYuanMoE
             merges = []
             vocab = {}
-            mergeable_ranks = tokenizer.model._mergeable_ranks
+            mergeable_ranks = tokenizer.model._mergeable_ranks # ty: ignore[unresolved-attribute]
             for token, rank in mergeable_ranks.items():
                 vocab[QwenModel.token_bytes_to_string(token)] = rank
                 if len(token) == 1:
@@ -8850,7 +8850,7 @@ class DeepseekV2Model(TextModel):

             # Build token list
             vocab_size = self.hparams["vocab_size"]
-            special_tokens = tokenizer.special_tokens
+            special_tokens = tokenizer.special_tokens # ty: ignore[unresolved-attribute]
             reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **special_tokens}.items()}
             tokens: list[str] = []
             toktypes: list[int] = []
@@ -9821,10 +9821,10 @@ class Glm4Model(TextModel):
         self.gguf_writer.add_token_list(tokens)
         self.gguf_writer.add_token_types(toktypes)
         special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
-        special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"])
-        special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"])
-        special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"])
-        special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["<|endoftext|>"])
+        special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"]) # ty: ignore[unresolved-attribute]
+        special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"]) # ty: ignore[unresolved-attribute]
+        special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"]) # ty: ignore[unresolved-attribute]
+        special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["<|endoftext|>"]) # ty: ignore[unresolved-attribute]
         special_vocab.add_to_gguf(self.gguf_writer)

     def set_gguf_parameters(self):
@@ -10052,12 +10052,12 @@ class ChatGLMModel(TextModel):
         from transformers import AutoTokenizer
         tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
-        vocab_size = hparams.get("padded_vocab_size", len(tokenizer.get_vocab()))
-        assert max(tokenizer.get_vocab().values()) < vocab_size
+        vocab_size = hparams.get("padded_vocab_size", len(tokenizer.get_vocab())) # ty: ignore[unresolved-attribute]
+        assert max(tokenizer.get_vocab().values()) < vocab_size # ty: ignore[unresolved-attribute]
         role_special_tokens = ["<|system|>", "<|user|>", "<|assistant|>", "<|observation|>"]
         special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "sop", "eop"] + role_special_tokens
         for token_id in range(vocab_size):
-            piece = tokenizer._convert_id_to_token(token_id)
+            piece = tokenizer._convert_id_to_token(token_id) # ty: ignore[unresolved-attribute]
             if token_id == 0:
                 piece = "<unk>"
             elif token_id == 1:
@@ -10065,17 +10065,17 @@ class ChatGLMModel(TextModel):
             elif token_id == 2:
                 piece = "<eos>"

-            text = piece.encode("utf-8")
+            text = piece.encode("utf-8") # ty: ignore[unresolved-attribute]
             score = 0.0
             # Referencing the tokenizer Python implementation(https://huggingface.co/THUDM/chatglm3-6b/blob/main/tokenization_chatglm.py),
             # it is only valid if it is less than tokenizer.tokenizer.sp_model.vocab_size()
-            if len(piece) != 0 and token_id < tokenizer.tokenizer.sp_model.vocab_size():
-                score = tokenizer.tokenizer.sp_model.get_score(token_id)
+            if len(piece) != 0 and token_id < tokenizer.tokenizer.sp_model.vocab_size(): # ty: ignore[unresolved-attribute, invalid-argument-type]
+                score = tokenizer.tokenizer.sp_model.get_score(token_id) # ty: ignore[unresolved-attribute]

-            if token_id >= tokenizer.tokenizer.sp_model.vocab_size():
+            if token_id >= tokenizer.tokenizer.sp_model.vocab_size(): # ty: ignore[unresolved-attribute]
                 if piece in special_tokens:
                     toktype = SentencePieceTokenTypes.CONTROL
-                elif len(piece) == 0:
+                elif len(piece) == 0: # ty: ignore[invalid-argument-type]
                     text = f"[PAD{token_id}]".encode("utf-8")
                     toktype = SentencePieceTokenTypes.UNUSED
                 else:
@@ -10086,13 +10086,13 @@ class ChatGLMModel(TextModel):
                 continue

             toktype = SentencePieceTokenTypes.NORMAL
-            if tokenizer.tokenizer.sp_model.is_unknown(token_id):
+            if tokenizer.tokenizer.sp_model.is_unknown(token_id): # ty: ignore[unresolved-attribute]
                 toktype = SentencePieceTokenTypes.UNKNOWN
-            elif tokenizer.tokenizer.sp_model.is_control(token_id):
+            elif tokenizer.tokenizer.sp_model.is_control(token_id): # ty: ignore[unresolved-attribute]
                 toktype = SentencePieceTokenTypes.CONTROL
-            elif tokenizer.tokenizer.sp_model.is_unused(token_id):
+            elif tokenizer.tokenizer.sp_model.is_unused(token_id): # ty: ignore[unresolved-attribute]
                 toktype = SentencePieceTokenTypes.UNUSED
-            elif tokenizer.tokenizer.sp_model.is_byte(token_id):
+            elif tokenizer.tokenizer.sp_model.is_byte(token_id): # ty: ignore[unresolved-attribute]
                 toktype = SentencePieceTokenTypes.BYTE

             tokens.append(text)
@@ -10112,7 +10112,7 @@ class ChatGLMModel(TextModel):

     @staticmethod
     def token_bytes_to_string(b):
-        from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode
+        from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode # ty: ignore[unresolved-import]
         byte_encoder = bytes_to_unicode()
         return ''.join([byte_encoder[ord(char)] for char in b.decode('latin-1')])
@@ -10146,7 +10146,7 @@ class ChatGLMModel(TextModel):
         from transformers import AutoTokenizer
         tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
         vocab_size = hparams.get("padded_vocab_size",hparams["vocab_size"])
-        assert max(tokenizer.get_vocab().values()) < vocab_size
+        assert max(tokenizer.get_vocab().values()) < vocab_size # ty: ignore[unresolved-attribute]

         tokens, toktypes, tokpre = self.get_vocab_base()
         self.gguf_writer.add_tokenizer_model("gpt2")
@@ -10155,10 +10155,10 @@ class ChatGLMModel(TextModel):
         self.gguf_writer.add_token_types(toktypes)
         special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
         # only add special tokens when they were not already loaded from config.json
-        special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"])
-        special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"])
+        special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"]) # ty: ignore[unresolved-attribute]
+        special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"]) # ty: ignore[unresolved-attribute]
         # this one is usually not in config.json anyway
-        special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"])
+        special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"]) # ty: ignore[unresolved-attribute]
         special_vocab.add_to_gguf(self.gguf_writer)

     def set_gguf_parameters(self):
@@ -11424,7 +11424,7 @@ class HunYuanMoEModel(TextModel):
         # 2. Reverse-engineer the merges list from mergeable_ranks
         merges = []
         vocab = {}
-        mergeable_ranks = tokenizer.mergeable_ranks
+        mergeable_ranks = tokenizer.mergeable_ranks # ty: ignore[unresolved-attribute]
         for token, rank in mergeable_ranks.items():
             vocab[QwenModel.token_bytes_to_string(token)] = rank
             if len(token) == 1:
@@ -11435,8 +11435,8 @@ class HunYuanMoEModel(TextModel):

         # 3. Generate the tokens and toktypes lists
         vocab_size = self.hparams["vocab_size"]
-        assert tokenizer.vocab_size == vocab_size
-        special_tokens = tokenizer.special_tokens
+        assert tokenizer.vocab_size == vocab_size # ty: ignore[unresolved-attribute]
+        special_tokens = tokenizer.special_tokens # ty: ignore[unresolved-attribute]
         reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **special_tokens}.items()}
         tokens: list[str] = []
         toktypes: list[int] = []
@@ -11660,7 +11660,7 @@ class HunYuanModel(TextModel):
        # 2. Reverse-engineer the merges list from mergeable_ranks
         merges = []
         vocab = {}
-        mergeable_ranks = tokenizer.mergeable_ranks
+        mergeable_ranks = tokenizer.mergeable_ranks # ty: ignore[unresolved-attribute]
         for token, rank in mergeable_ranks.items():
             vocab[QwenModel.token_bytes_to_string(token)] = rank
             if len(token) == 1:
@@ -11671,8 +11671,8 @@ class HunYuanModel(TextModel):

         # 3. Generate the tokens and toktypes lists
         vocab_size = self.hparams["vocab_size"]
-        assert tokenizer.vocab_size == vocab_size
-        special_tokens = tokenizer.special_tokens
+        assert tokenizer.vocab_size == vocab_size # ty: ignore[unresolved-attribute]
+        special_tokens = tokenizer.special_tokens # ty: ignore[unresolved-attribute]
         reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **special_tokens}.items()}
         tokens: list[str] = []
         toktypes: list[int] = []
@@ -12820,10 +12820,10 @@ class SolarOpenModel(Glm4MoeModel):
         self.gguf_writer.add_tokenizer_pre(tokpre)
         self.gguf_writer.add_token_list(tokens)
         self.gguf_writer.add_token_types(toktypes)
-        special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"])
-        special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|endoftext|>"])
-        special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<unk>"])
-        special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["<|startoftext|>"])
+        special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"]) # ty: ignore[unresolved-attribute]
+        special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|endoftext|>"]) # ty: ignore[unresolved-attribute]
+        special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<unk>"]) # ty: ignore[unresolved-attribute]
+        special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["<|startoftext|>"]) # ty: ignore[unresolved-attribute]
         special_vocab.add_to_gguf(self.gguf_writer)
diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py
index 086f1c2286..d8d10a1012 100755
--- a/convert_hf_to_gguf_update.py
+++ b/convert_hf_to_gguf_update.py
@@ -296,7 +296,7 @@ for model in [*pre_computed_hashes, *all_models]:
     except Exception as e:
         raise OSError(f"Error loading tokenizer for model {name}.") from e

-    chktok = tokenizer.encode(CHK_TXT)
+    chktok = tokenizer.encode(CHK_TXT) # ty: ignore[unresolved-attribute]
     chkhsh = sha256(str(chktok).encode()).hexdigest()

     logger.info(f"model: {name}")
@@ -468,7 +468,7 @@ for model in models:

     with open(f"models/ggml-vocab-{name}.gguf.out", "w") as f:
         for text in tests:
-            res = tokenizer.encode(text, add_special_tokens=False)
+            res = tokenizer.encode(text, add_special_tokens=False) # ty: ignore[unresolved-attribute]
             for r in res:
                 f.write(f" {r}")
             f.write("\n")
diff --git a/convert_lora_to_gguf.py b/convert_lora_to_gguf.py
index ee98d0cf97..d583342056 100755
--- a/convert_lora_to_gguf.py
+++ b/convert_lora_to_gguf.py
@@ -402,7 +402,7 @@ if __name__ == '__main__':
                 # the invocation string includes the "<|start_of_turn|>"
                 # token, but the adapters themselves were trained to
                 # activate _after_ that first token, so we drop it here.
-                alora_invocation_tokens = tokenizer(invocation_string)["input_ids"][1:]
+                alora_invocation_tokens = tokenizer(invocation_string)["input_ids"][1:] # ty: ignore[call-non-callable]
                 if alora_invocation_tokens:
                     logger.debug("GGUF KV: %s = %s", gguf.Keys.Adapter.ALORA_INVOCATION_TOKENS, alora_invocation_tokens)
                     self.gguf_writer.add_key_value(
diff --git a/examples/model-conversion/scripts/causal/run-casual-gen-embeddings-org.py b/examples/model-conversion/scripts/causal/run-casual-gen-embeddings-org.py
index 4ab778fbc7..b94bec4e76 100755
--- a/examples/model-conversion/scripts/causal/run-casual-gen-embeddings-org.py
+++ b/examples/model-conversion/scripts/causal/run-casual-gen-embeddings-org.py
@@ -53,10 +53,10 @@ model_name = os.path.basename(model_path)
 print(f"Model name: {model_name}")

 prompt = "Hello world today"
-input_ids = tokenizer(prompt, return_tensors="pt").input_ids
+input_ids = tokenizer(prompt, return_tensors="pt").input_ids # ty: ignore[call-non-callable]
 print(f"Input tokens: {input_ids}")
 print(f"Input text: {repr(prompt)}")
-print(f"Tokenized: {tokenizer.convert_ids_to_tokens(input_ids[0])}")
+print(f"Tokenized: {tokenizer.convert_ids_to_tokens(input_ids[0])}") # ty: ignore[unresolved-attribute]

 with torch.no_grad():
     outputs = model(input_ids, output_hidden_states=True)
@@ -92,7 +92,7 @@ with torch.no_grad():

     # Print embeddings per token in the requested format
     print("\nToken embeddings:")
-    tokens = tokenizer.convert_ids_to_tokens(input_ids[0])
+    tokens = tokenizer.convert_ids_to_tokens(input_ids[0]) # ty: ignore[unresolved-attribute]
     for i, embedding in enumerate(token_embeddings):
         # Format: show first few values, ..., then last few values
         if len(embedding) > 10:
diff --git a/examples/model-conversion/scripts/utils/semantic_check.py b/examples/model-conversion/scripts/utils/semantic_check.py
index db0d004dab..754ae733da 100644
--- a/examples/model-conversion/scripts/utils/semantic_check.py
+++ b/examples/model-conversion/scripts/utils/semantic_check.py
@@ -207,8 +207,8 @@ def main():
     else:
         model = AutoModel.from_pretrained(args.model_path, trust_remote_code=True)

-    encoded = tokenizer(prompt, return_tensors="pt")
-    tokens = tokenizer.convert_ids_to_tokens(encoded['input_ids'][0])
+    encoded = tokenizer(prompt, return_tensors="pt") # ty: ignore[call-non-callable]
+    tokens = tokenizer.convert_ids_to_tokens(encoded['input_ids'][0]) # ty: ignore[unresolved-attribute]
     n_tokens = len(tokens)

     print(f"n_tokens: {n_tokens}");
     print(f"hidden_size: {model.config.hidden_size}")
diff --git a/gguf-py/gguf/vocab.py b/gguf-py/gguf/vocab.py
index 5cd729dfa8..09a9b7d183 100644
--- a/gguf-py/gguf/vocab.py
+++ b/gguf-py/gguf/vocab.py
@@ -543,7 +543,7 @@ class LlamaHfVocab(Vocab):
             cache_dir=base_path,
             local_files_only=True,
         )
-        assert self.tokenizer.is_fast # assume tokenizer.json is used
+        assert self.tokenizer.is_fast # assume tokenizer.json is used # ty: ignore[unresolved-attribute]

         # Initialize lists and dictionaries for added tokens
         self.added_tokens_list = []
@@ -552,30 +552,30 @@ class LlamaHfVocab(Vocab):

         # Process added tokens
         for tok, tokidx in sorted(
-            self.tokenizer.get_added_vocab().items(), key=lambda x: x[1]
+            self.tokenizer.get_added_vocab().items(), key=lambda x: x[1] # ty: ignore[unresolved-attribute]
         ):
             # Only consider added tokens that are not in the base vocabulary
-            if tokidx >= self.tokenizer.vocab_size:
+            if tokidx >= self.tokenizer.vocab_size: # ty: ignore[unresolved-attribute]
                 self.added_tokens_list.append(tok)
                 self.added_tokens_dict[tok] = tokidx
                 self.added_tokens_ids.add(tokidx)

         # Store special tokens and their IDs
         self.specials = {
-            tok: self.tokenizer.get_vocab()[tok]
-            for tok in self.tokenizer.all_special_tokens
+            tok: self.tokenizer.get_vocab()[tok] # ty: ignore[unresolved-attribute]
+            for tok in self.tokenizer.all_special_tokens # ty: ignore[unresolved-attribute]
         }
-        self.special_ids = set(self.tokenizer.all_special_ids)
+        self.special_ids = set(self.tokenizer.all_special_ids) # ty: ignore[unresolved-attribute]

         # Set vocabulary sizes
-        self.vocab_size_base = self.tokenizer.vocab_size
+        self.vocab_size_base = self.tokenizer.vocab_size # ty: ignore[unresolved-attribute]
         self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)

         self.fname_tokenizer = fname_tokenizer

     def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
         reverse_vocab = {
-            id: encoded_tok for encoded_tok, id in self.tokenizer.get_vocab().items()
+            id: encoded_tok for encoded_tok, id in self.tokenizer.get_vocab().items() # ty: ignore[unresolved-attribute]
         }

         for token_id in range(self.vocab_size_base):
@@ -616,7 +616,7 @@ class LlamaHfVocab(Vocab):
             yield text.encode("utf-8"), score, toktype

     def has_newline_token(self):
-        return "<0x0A>" in self.tokenizer.vocab or "\n" in self.tokenizer.vocab
+        return "<0x0A>" in self.tokenizer.vocab or "\n" in self.tokenizer.vocab # ty: ignore[unresolved-attribute]

     def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
         yield from self.hf_tokens()
diff --git a/pyproject.toml b/pyproject.toml
index 422f53c7c7..35cd067083 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -18,7 +18,7 @@ classifiers = [
 python = ">=3.9"
 numpy = "^1.25.0"
 sentencepiece = ">=0.1.98,<0.3.0"
-transformers = ">=4.35.2,<5.0.0"
+transformers = "==5.5.1"
 protobuf = ">=4.21.0,<5.0.0"
 gguf = { path = "./gguf-py" }
 torch = { version = "^2.2.0", source = "pytorch" }
diff --git a/requirements/requirements-convert_legacy_llama.txt b/requirements/requirements-convert_legacy_llama.txt
index 4898bf7ee2..18d3980106 100644
--- a/requirements/requirements-convert_legacy_llama.txt
+++ b/requirements/requirements-convert_legacy_llama.txt
@@ -1,7 +1,7 @@
 numpy~=1.26.4
 sentencepiece>=0.1.98,<0.3.0
-transformers>=4.57.1,<5.0.0
+transformers==5.5.1
 gguf>=0.1.0
 protobuf>=4.21.0,<5.0.0
diff --git a/requirements/requirements-tool_bench.txt b/requirements/requirements-tool_bench.txt
index 3bb74fb9d0..66c3c12b3e 100644
--- a/requirements/requirements-tool_bench.txt
+++ b/requirements/requirements-tool_bench.txt
@@ -1,6 +1,6 @@
 aiohttp~=3.9.3
 pytest~=8.3.3
-huggingface_hub>=0.34.0,<1.0
+huggingface_hub>=1.5.0,<2.0
 matplotlib~=3.10.0
 numpy~=1.26.4
 openai~=2.14.0
diff --git a/tests/test-tokenizer-0.py b/tests/test-tokenizer-0.py
index cd760d1ce5..4f3f1c8a67 100644
--- a/tests/test-tokenizer-0.py
+++ b/tests/test-tokenizer-0.py
@@ -19,7 +19,7 @@ with open(fname_tok, 'r', encoding='utf-8') as f:
     lines = f.readlines()
     s = ''.join(lines)
     t_start = time.time()
-    res = tokenizer.encode(s, add_special_tokens=False)
+    res = tokenizer.encode(s, add_special_tokens=False) # ty: ignore[unresolved-attribute]
     t_end = time.time()
     print('\nmain : tokenized in', "{:.3f}".format(1000.0 * (t_end - t_start)), 'ms (py)') # noqa: NP100
     with open(fname_out, 'w', encoding='utf-8') as f:
diff --git a/tests/test-tokenizer-random.py b/tests/test-tokenizer-random.py
index 25af4ee63b..8fc476b63c 100644
--- a/tests/test-tokenizer-random.py
+++ b/tests/test-tokenizer-random.py
@@ -128,7 +128,7 @@ class Tokenizer:
 class TokenizerGroundtruth (Tokenizer):

     def __init__(self, dir_tokenizer: str):
-        self.model: PreTrainedTokenizer = AutoTokenizer.from_pretrained(dir_tokenizer)
+        self.model: PreTrainedTokenizer = AutoTokenizer.from_pretrained(dir_tokenizer) # ty: ignore[invalid-assignment]
         # guess BOS and EOS
         ids = self.encode("a")
         assert 1 <= len(ids) <= 3
@@ -142,7 +142,7 @@ class TokenizerGroundtruth (Tokenizer):
         self.vocab = list(sorted(self.vocab))
         # tokens and lists
         self.special_tokens = list(self.model.all_special_tokens)
-        self.added_tokens = self.model.batch_decode(self.model.added_tokens_encoder.values(), skip_special_tokens=False)
+        self.added_tokens = self.model.batch_decode(list(self.model.added_tokens_encoder.values()), skip_special_tokens=False)
         self.bos_token = self.model.bos_token
         self.eos_token = self.model.eos_token
@@ -150,7 +150,7 @@ class TokenizerGroundtruth (Tokenizer):
         return self.model.encode(text, add_special_tokens=True)

     def decode(self, ids: list[int]) -> str:
-        return self.model.decode(ids, skip_special_tokens=False)
+        return self.model.decode(ids, skip_special_tokens=False) # ty: ignore[invalid-return-type]


 class TokenizerLlamaCpp (Tokenizer):
diff --git a/tools/server/tests/requirements.txt b/tools/server/tests/requirements.txt
index ca79d025ed..92d27e2a13 100644
--- a/tools/server/tests/requirements.txt
+++ b/tools/server/tests/requirements.txt
@@ -1,6 +1,6 @@
 aiohttp~=3.9.3
 pytest~=8.3.3
-huggingface_hub>=0.34.0,<1.0
+huggingface_hub>=1.5.0,<2.0
 numpy~=1.26.4
 openai~=2.14.0
 prometheus-client~=0.20.0
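
All of the `ty: ignore[unresolved-attribute]` directives in this patch work
around the single root cause described in the commit message: in transformers
5.5.x, AutoTokenizer.from_pretrained can return generic tokenizer types (or
None), so attribute accesses such as tokenizer.vocab no longer resolve
statically. The following is only a minimal sketch of the diagnostic and of
one possible runtime-narrowing alternative (hinted at by the commit message's
"there might be better ways to handle this"); it is not what the patch itself
does, and the model id below is hypothetical:

    from transformers import AutoTokenizer, PreTrainedTokenizerFast

    tokenizer = AutoTokenizer.from_pretrained("some-org/some-model")

    # ty cannot prove the returned type defines `vocab`, so it reports
    # unresolved-attribute here; the directive suppresses that diagnostic:
    vocab_size = len(tokenizer.vocab)  # ty: ignore[unresolved-attribute]

    # Alternative sketch: narrow the type at runtime (this assumes the model
    # ships a fast tokenizer), after which no ignore directive is needed:
    assert isinstance(tokenizer, PreTrainedTokenizerFast)
    vocab_size = len(tokenizer.get_vocab())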