diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index b5e56f87ca..8d6b0a97a0 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -1229,15 +1229,15 @@ class TextModel(ModelBase):
 
         from transformers import AutoTokenizer
         tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
-        vocab_size = self.hparams.get("vocab_size", len(tokenizer.vocab))
-        assert max(tokenizer.vocab.values()) < vocab_size
+        vocab_size = self.hparams.get("vocab_size", len(tokenizer.vocab))  # ty: ignore[unresolved-attribute]
+        assert max(tokenizer.vocab.values()) < vocab_size  # ty: ignore[unresolved-attribute]
 
         tokpre = self.get_vocab_base_pre(tokenizer)
 
-        reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()}
-        added_vocab = tokenizer.get_added_vocab()
+        reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()}  # ty: ignore[unresolved-attribute]
+        added_vocab = tokenizer.get_added_vocab()  # ty: ignore[unresolved-attribute]
 
-        added_tokens_decoder = tokenizer.added_tokens_decoder
+        added_tokens_decoder = tokenizer.added_tokens_decoder  # ty: ignore[unresolved-attribute]
 
         for i in range(vocab_size):
             if i not in reverse_vocab:
@@ -1250,7 +1250,7 @@ class TextModel(ModelBase):
                     # To avoid unexpected issues - we make sure to normalize non-normalized tokens
                     if not added_tokens_decoder[i].normalized:
                         previous_token = token
-                        token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False))
+                        token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False))  # ty: ignore[unresolved-attribute, invalid-assignment]
                         if previous_token != token:
                             logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer")
 
@@ -1583,13 +1583,13 @@ class TextModel(ModelBase):
         from transformers import AutoTokenizer
         tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
         vocab_size = hparams["vocab_size"]
-        assert max(tokenizer.get_vocab().values()) < vocab_size
+        assert max(tokenizer.get_vocab().values()) < vocab_size  # ty: ignore[unresolved-attribute]
 
         tokpre = self.get_vocab_base_pre(tokenizer)
 
         merges = []
         vocab = {}
-        mergeable_ranks = tokenizer.mergeable_ranks
+        mergeable_ranks = tokenizer.mergeable_ranks  # ty: ignore[unresolved-attribute]
         for token, rank in mergeable_ranks.items():
             vocab[QwenModel.token_bytes_to_string(token)] = rank
             if len(token) == 1:
@@ -1599,7 +1599,7 @@ class TextModel(ModelBase):
             merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged)))
 
         # for this kind of tokenizer, added_vocab is not a subset of vocab, so they need to be combined
-        added_vocab = tokenizer.special_tokens
+        added_vocab = tokenizer.special_tokens  # ty: ignore[unresolved-attribute]
         reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **added_vocab}.items()}
 
         for i in range(vocab_size):
@@ -1622,10 +1622,10 @@ class TextModel(ModelBase):
         special_vocab.merges = merges
         # only add special tokens when they were not already loaded from config.json
         if len(special_vocab.special_token_ids) == 0:
-            special_vocab._set_special_token("bos", tokenizer.special_tokens["<|endoftext|>"])
-            special_vocab._set_special_token("eos", tokenizer.special_tokens["<|endoftext|>"])
+            special_vocab._set_special_token("bos", tokenizer.special_tokens["<|endoftext|>"])  # ty: ignore[unresolved-attribute]
+            special_vocab._set_special_token("eos", tokenizer.special_tokens["<|endoftext|>"])  # ty: ignore[unresolved-attribute]
         # this one is usually not in config.json anyway
-        special_vocab._set_special_token("unk", tokenizer.special_tokens["<|endoftext|>"])
+        special_vocab._set_special_token("unk", tokenizer.special_tokens["<|endoftext|>"])  # ty: ignore[unresolved-attribute]
         special_vocab.add_to_gguf(self.gguf_writer)
 
     def _set_vocab_sentencepiece(self, add_to_gguf=True):
@@ -1877,10 +1877,10 @@ class TextModel(ModelBase):
         self.gguf_writer.add_tokenizer_pre(tokpre)
         self.gguf_writer.add_token_list(tokens)
         self.gguf_writer.add_token_types(toktypes)
-        special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"])
-        special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"])
-        special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"])
-        special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["<|endoftext|>"])
+        special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"])  # ty: ignore[unresolved-attribute]
+        special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"])  # ty: ignore[unresolved-attribute]
+        special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"])  # ty: ignore[unresolved-attribute]
+        special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["<|endoftext|>"])  # ty: ignore[unresolved-attribute]
         special_vocab.add_to_gguf(self.gguf_writer)
 
     def _set_vocab_glm(self):
@@ -1894,10 +1894,10 @@ class TextModel(ModelBase):
         self.gguf_writer.add_token_types(toktypes)
         # Special tokens
         # Note: Using <|endoftext|> (151329) for eot causes endless generation
-        special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["[gMASK]"])  # 151331
-        special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"])  # 151336
-        special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"])  # 151329
-        special_vocab._set_special_token("eom", tokenizer.get_added_vocab()["<|observation|>"])  # 151338
+        special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["[gMASK]"])  # ty: ignore[unresolved-attribute]  # 151331
+        special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"])  # ty: ignore[unresolved-attribute]  # 151336
+        special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"])  # ty: ignore[unresolved-attribute]  # 151329
+        special_vocab._set_special_token("eom", tokenizer.get_added_vocab()["<|observation|>"])  # ty: ignore[unresolved-attribute]  # 151338
         special_vocab.add_to_gguf(self.gguf_writer)
 
     def _set_vocab_interns1(self):
@@ -1906,16 +1906,16 @@ class TextModel(ModelBase):
         from transformers import AutoTokenizer
         tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
 
-        vocab = getattr(tokenizer, 'vocab', tokenizer.get_vocab())
+        vocab = getattr(tokenizer, 'vocab', tokenizer.get_vocab())  # ty: ignore[unresolved-attribute]
         vocab_size = self.hparams.get("vocab_size", len(vocab))
         assert max(vocab.values()) < vocab_size
 
         tokpre = self.get_vocab_base_pre(tokenizer)
 
         reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in vocab.items()}
-        added_vocab = tokenizer.get_added_vocab()
+        added_vocab = tokenizer.get_added_vocab()  # ty: ignore[unresolved-attribute]
 
-        added_tokens_decoder = tokenizer.added_tokens_decoder
+        added_tokens_decoder = tokenizer.added_tokens_decoder  # ty: ignore[unresolved-attribute]
 
         for i in range(vocab_size):
             if i not in reverse_vocab:
@@ -1928,7 +1928,7 @@ class TextModel(ModelBase):
                     # To avoid unexpected issues - we make sure to normalize non-normalized tokens
                     if not added_tokens_decoder[i].normalized:
                         previous_token = token
-                        token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False))
+                        token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False))  # ty: ignore[unresolved-attribute, invalid-assignment]
                         if previous_token != token:
                             logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer")
 
@@ -2516,15 +2516,15 @@ class XverseModel(TextModel):
 
         from transformers import AutoTokenizer
         tokenizer = AutoTokenizer.from_pretrained(dir_model)
-        vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
+        vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))  # ty: ignore[unresolved-attribute]
         # Since we are checking the maximum index, we need to ensure it's strictly less than vocab_size,
         # because vocab_size is the count of items, and indexes start at 0.
-        max_vocab_index = max(tokenizer.get_vocab().values())
+        max_vocab_index = max(tokenizer.get_vocab().values())  # ty: ignore[unresolved-attribute]
         if max_vocab_index >= vocab_size:
             raise ValueError("Vocabulary size exceeds expected maximum size.")
 
-        reverse_vocab: dict[int, str] = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()}
-        added_vocab = tokenizer.get_added_vocab()
+        reverse_vocab: dict[int, str] = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()}  # ty: ignore[unresolved-attribute]
+        added_vocab = tokenizer.get_added_vocab()  # ty: ignore[unresolved-attribute]
 
         for token_id in range(vocab_size):
             token_text = reverse_vocab[token_id].encode('utf-8')
@@ -2535,7 +2535,7 @@ class XverseModel(TextModel):
             elif re.fullmatch(br"<0x[0-9A-Fa-f]{2}>", token_text):
                 toktype = gguf.TokenType.BYTE  # special
             elif reverse_vocab[token_id] in added_vocab:
-                if tokenizer.added_tokens_decoder[token_id].special:
+                if tokenizer.added_tokens_decoder[token_id].special:  # ty: ignore[unresolved-attribute]
                     toktype = gguf.TokenType.CONTROL
                 else:
                     toktype = gguf.TokenType.USER_DEFINED
@@ -3752,7 +3752,7 @@ class QwenModel(TextModel):
 
     @staticmethod
     def token_bytes_to_string(b):
-        from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode
+        from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode  # ty: ignore[unresolved-import]
         byte_encoder = bytes_to_unicode()
         return ''.join([byte_encoder[ord(char)] for char in b.decode('latin-1')])
 
@@ -3823,14 +3823,14 @@ class DreamModel(TextModel):
         from transformers import AutoTokenizer
         tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
 
-        vocab_dict = tokenizer.get_vocab()
+        vocab_dict = tokenizer.get_vocab()  # ty: ignore[unresolved-attribute]
         vocab_size = self.hparams.get("vocab_size", len(vocab_dict))
         assert max(vocab_dict.values()) < vocab_size
 
         tokpre = self.get_vocab_base_pre(tokenizer)
 
         reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in vocab_dict.items()}
-        added_vocab = tokenizer.get_added_vocab()
+        added_vocab = tokenizer.get_added_vocab()  # ty: ignore[unresolved-attribute]
 
         for i in range(vocab_size):
             if i not in reverse_vocab:
@@ -3888,14 +3888,14 @@ class LLaDAModel(TextModel):
         from transformers import AutoTokenizer
         tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
 
-        vocab_dict = tokenizer.get_vocab()
+        vocab_dict = tokenizer.get_vocab()  # ty: ignore[unresolved-attribute]
         vocab_size = self.hparams.get("vocab_size", len(vocab_dict))
         assert max(vocab_dict.values()) < vocab_size
 
         tokpre = self.get_vocab_base_pre(tokenizer)
 
         reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in vocab_dict.items()}
-        added_vocab = tokenizer.get_added_vocab()
+        added_vocab = tokenizer.get_added_vocab()  # ty: ignore[unresolved-attribute]
 
         for i in range(vocab_size):
             if i not in reverse_vocab:
@@ -4673,9 +4673,9 @@ class Qwen3Model(Qwen2Model):
 
             self.is_rerank = True
             self.is_tied_embeddings = self.hparams.get("tie_word_embeddings", False)
-            self.token_false_id = tokenizer.convert_tokens_to_ids("no")
-            self.token_true_id = tokenizer.convert_tokens_to_ids("yes")
-            self.sep_token_id = tokenizer.convert_tokens_to_ids("|")
+            self.token_false_id = tokenizer.convert_tokens_to_ids("no")  # ty: ignore[unresolved-attribute, invalid-assignment]
+            self.token_true_id = tokenizer.convert_tokens_to_ids("yes")  # ty: ignore[unresolved-attribute, invalid-assignment]
+            self.sep_token_id = tokenizer.convert_tokens_to_ids("|")  # ty: ignore[unresolved-attribute]
 
             assert self.token_false_id is not None and self.token_true_id is not None
@@ -5944,7 +5944,7 @@ class KimiLinearModel(TextModel):
             # Build merges list using the approach similar to HunYuanMoE
             merges = []
             vocab = {}
-            mergeable_ranks = tokenizer.model._mergeable_ranks
+            mergeable_ranks = tokenizer.model._mergeable_ranks  # ty: ignore[unresolved-attribute]
             for token, rank in mergeable_ranks.items():
                 vocab[QwenModel.token_bytes_to_string(token)] = rank
                 if len(token) == 1:
@@ -5954,7 +5954,7 @@ class KimiLinearModel(TextModel):
                 merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged)))
             # Build token list
             vocab_size = self.hparams["vocab_size"]
-            special_tokens = tokenizer.special_tokens
+            special_tokens = tokenizer.special_tokens  # ty: ignore[unresolved-attribute]
             reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **special_tokens}.items()}
             tokens: list[str] = []
             toktypes: list[int] = []
@@ -5980,7 +5980,7 @@ class KimiLinearModel(TextModel):
             special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False)
             special_vocab.add_to_gguf(self.gguf_writer)
             # override eos id in config.json with tiktoken eos id
-            self.gguf_writer.add_eos_token_id(tokenizer.eos_id)
+            self.gguf_writer.add_eos_token_id(tokenizer.eos_id)  # ty: ignore[unresolved-attribute]
         else:
             raise NotImplementedError(f"Deepseek pre-tokenizer {tokpre!r} is not supported yet!")
 
@@ -6474,11 +6474,11 @@ class BertModel(TextModel):
             with open(tokenizer_config_path, "r", encoding="utf-8") as fp:
                 tokenizer_config_json = json.load(fp)
 
-            add_prefix = tokenizer.add_prefix_space
-            remove_whitespaces = tokenizer.clean_up_tokenization_spaces
+            add_prefix = tokenizer.add_prefix_space  # ty: ignore[unresolved-attribute]
+            remove_whitespaces = tokenizer.clean_up_tokenization_spaces  # ty: ignore[unresolved-attribute]
             precompiled_charsmap = b64decode(tokenizer_json["normalizer"]["precompiled_charsmap"])
 
-            vocab_size = max(self.hparams.get("vocab_size", 0), tokenizer.vocab_size)
+            vocab_size = max(self.hparams.get("vocab_size", 0), tokenizer.vocab_size)  # ty: ignore[unresolved-attribute]
         else:
             sentencepiece_model = model.ModelProto()  # pyright: ignore[reportAttributeAccessIssue]  # ty: ignore[unresolved-attribute]
             sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
@@ -6495,7 +6495,7 @@ class BertModel(TextModel):
 
         tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
         scores: list[float] = [-10000.0] * vocab_size
-        toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
+        toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size  # ty: ignore[invalid-assignment]
 
         if isinstance(tokenizer, SentencePieceProcessor):
            for token_id in range(tokenizer.vocab_size()):
@@ -6517,20 +6517,20 @@ class BertModel(TextModel):
                 scores[token_id] = score
                 toktypes[token_id] = toktype
         else:
-            added_vocab = tokenizer.get_added_vocab()
+            added_vocab = tokenizer.get_added_vocab()  # ty: ignore[unresolved-attribute]
             unk_token = tokenizer_config_json.get("unk_token")
-            unk_token_id = added_vocab.get(unk_token, tokenizer_json["model"].get("unk_id", 3))
+            unk_token_id = added_vocab.get(unk_token, tokenizer_json["model"].get("unk_id", 3))  # ty: ignore[no-matching-overload]
 
-            for token_id in range(tokenizer.vocab_size):
-                piece = tokenizer._convert_id_to_token(token_id)
-                if (piece := tokenizer._convert_id_to_token(token_id)) is not None:
+            for token_id in range(tokenizer.vocab_size):  # ty: ignore[unresolved-attribute]
+                piece = tokenizer._convert_id_to_token(token_id)  # ty: ignore[unresolved-attribute]
+                if (piece := tokenizer._convert_id_to_token(token_id)) is not None:  # ty: ignore[unresolved-attribute]
                     text = piece.encode("utf-8")
                     score = tokenizer_json["model"]["vocab"][token_id][1]
                     toktype = SentencePieceTokenTypes.NORMAL
 
                     if token_id == unk_token_id:
                         toktype = SentencePieceTokenTypes.UNKNOWN
-                    elif token_id in tokenizer.all_special_ids:
+                    elif token_id in tokenizer.all_special_ids:  # ty: ignore[unresolved-attribute]
                         toktype = SentencePieceTokenTypes.CONTROL
                     elif token_id in added_vocab.values():
                         toktype = SentencePieceTokenTypes.USER_DEFINED
@@ -8839,7 +8839,7 @@ class DeepseekV2Model(TextModel):
             # Build merges list using the approach similar to HunYuanMoE
             merges = []
             vocab = {}
-            mergeable_ranks = tokenizer.model._mergeable_ranks
+            mergeable_ranks = tokenizer.model._mergeable_ranks  # ty: ignore[unresolved-attribute]
             for token, rank in mergeable_ranks.items():
                 vocab[QwenModel.token_bytes_to_string(token)] = rank
                 if len(token) == 1:
@@ -8850,7 +8850,7 @@ class DeepseekV2Model(TextModel):
 
             # Build token list
             vocab_size = self.hparams["vocab_size"]
-            special_tokens = tokenizer.special_tokens
+            special_tokens = tokenizer.special_tokens  # ty: ignore[unresolved-attribute]
             reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **special_tokens}.items()}
             tokens: list[str] = []
             toktypes: list[int] = []
@@ -9821,10 +9821,10 @@ class Glm4Model(TextModel):
         self.gguf_writer.add_token_list(tokens)
         self.gguf_writer.add_token_types(toktypes)
         special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
-        special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"])
-        special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"])
-        special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"])
-        special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["<|endoftext|>"])
+        special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"])  # ty: ignore[unresolved-attribute]
+        special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"])  # ty: ignore[unresolved-attribute]
+        special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"])  # ty: ignore[unresolved-attribute]
+        special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["<|endoftext|>"])  # ty: ignore[unresolved-attribute]
         special_vocab.add_to_gguf(self.gguf_writer)
 
     def set_gguf_parameters(self):
@@ -10052,12 +10052,12 @@ class ChatGLMModel(TextModel):
         from transformers import AutoTokenizer
         tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
-        vocab_size = hparams.get("padded_vocab_size", len(tokenizer.get_vocab()))
-        assert max(tokenizer.get_vocab().values()) < vocab_size
+        vocab_size = hparams.get("padded_vocab_size", len(tokenizer.get_vocab()))  # ty: ignore[unresolved-attribute]
+        assert max(tokenizer.get_vocab().values()) < vocab_size  # ty: ignore[unresolved-attribute]
 
         role_special_tokens = ["<|system|>", "<|user|>", "<|assistant|>", "<|observation|>"]
         special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "sop", "eop"] + role_special_tokens
         for token_id in range(vocab_size):
-            piece = tokenizer._convert_id_to_token(token_id)
+            piece = tokenizer._convert_id_to_token(token_id)  # ty: ignore[unresolved-attribute]
             if token_id == 0:
                 piece = "<unk>"
             elif token_id == 1:
@@ -10065,17 +10065,17 @@ class ChatGLMModel(TextModel):
             elif token_id == 2:
                 piece = "<eos>"
 
-            text = piece.encode("utf-8")
+            text = piece.encode("utf-8")  # ty: ignore[unresolved-attribute]
             score = 0.0
             # Referencing the tokenizer Python implementation(https://huggingface.co/THUDM/chatglm3-6b/blob/main/tokenization_chatglm.py),
             # it is only valid if it is less than tokenizer.tokenizer.sp_model.vocab_size()
-            if len(piece) != 0 and token_id < tokenizer.tokenizer.sp_model.vocab_size():
-                score = tokenizer.tokenizer.sp_model.get_score(token_id)
+            if len(piece) != 0 and token_id < tokenizer.tokenizer.sp_model.vocab_size():  # ty: ignore[unresolved-attribute, invalid-argument-type]
+                score = tokenizer.tokenizer.sp_model.get_score(token_id)  # ty: ignore[unresolved-attribute]
 
-            if token_id >= tokenizer.tokenizer.sp_model.vocab_size():
+            if token_id >= tokenizer.tokenizer.sp_model.vocab_size():  # ty: ignore[unresolved-attribute]
                 if piece in special_tokens:
                     toktype = SentencePieceTokenTypes.CONTROL
-                elif len(piece) == 0:
+                elif len(piece) == 0:  # ty: ignore[invalid-argument-type]
                     text = f"[PAD{token_id}]".encode("utf-8")
                     toktype = SentencePieceTokenTypes.UNUSED
                 else:
@@ -10086,13 +10086,13 @@ class ChatGLMModel(TextModel):
                 continue
 
             toktype = SentencePieceTokenTypes.NORMAL
-            if tokenizer.tokenizer.sp_model.is_unknown(token_id):
+            if tokenizer.tokenizer.sp_model.is_unknown(token_id):  # ty: ignore[unresolved-attribute]
                 toktype = SentencePieceTokenTypes.UNKNOWN
-            elif tokenizer.tokenizer.sp_model.is_control(token_id):
+            elif tokenizer.tokenizer.sp_model.is_control(token_id):  # ty: ignore[unresolved-attribute]
                 toktype = SentencePieceTokenTypes.CONTROL
-            elif tokenizer.tokenizer.sp_model.is_unused(token_id):
+            elif tokenizer.tokenizer.sp_model.is_unused(token_id):  # ty: ignore[unresolved-attribute]
                 toktype = SentencePieceTokenTypes.UNUSED
-            elif tokenizer.tokenizer.sp_model.is_byte(token_id):
+            elif tokenizer.tokenizer.sp_model.is_byte(token_id):  # ty: ignore[unresolved-attribute]
                 toktype = SentencePieceTokenTypes.BYTE
 
             tokens.append(text)
@@ -10112,7 +10112,7 @@ class ChatGLMModel(TextModel):
 
     @staticmethod
     def token_bytes_to_string(b):
-        from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode
+        from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode  # ty: ignore[unresolved-import]
         byte_encoder = bytes_to_unicode()
         return ''.join([byte_encoder[ord(char)] for char in b.decode('latin-1')])
 
@@ -10146,7 +10146,7 @@ class ChatGLMModel(TextModel):
         from transformers import AutoTokenizer
         tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
         vocab_size = hparams.get("padded_vocab_size",hparams["vocab_size"])
-        assert max(tokenizer.get_vocab().values()) < vocab_size
+        assert max(tokenizer.get_vocab().values()) < vocab_size  # ty: ignore[unresolved-attribute]
 
         tokens, toktypes, tokpre = self.get_vocab_base()
         self.gguf_writer.add_tokenizer_model("gpt2")
@@ -10155,10 +10155,10 @@ class ChatGLMModel(TextModel):
         self.gguf_writer.add_token_types(toktypes)
         special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
         # only add special tokens when they were not already loaded from config.json
-        special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"])
-        special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"])
+        special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"])  # ty: ignore[unresolved-attribute]
+        special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"])  # ty: ignore[unresolved-attribute]
         # this one is usually not in config.json anyway
-        special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"])
+        special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"])  # ty: ignore[unresolved-attribute]
         special_vocab.add_to_gguf(self.gguf_writer)
 
     def set_gguf_parameters(self):
@@ -11424,7 +11424,7 @@ class HunYuanMoEModel(TextModel):
         # 2. Reverse-engineer the merges list from mergeable_ranks
         merges = []
         vocab = {}
-        mergeable_ranks = tokenizer.mergeable_ranks
+        mergeable_ranks = tokenizer.mergeable_ranks  # ty: ignore[unresolved-attribute]
         for token, rank in mergeable_ranks.items():
             vocab[QwenModel.token_bytes_to_string(token)] = rank
             if len(token) == 1:
@@ -11435,8 +11435,8 @@ class HunYuanMoEModel(TextModel):
 
         # 3. Generate the tokens and toktypes lists
         vocab_size = self.hparams["vocab_size"]
-        assert tokenizer.vocab_size == vocab_size
-        special_tokens = tokenizer.special_tokens
+        assert tokenizer.vocab_size == vocab_size  # ty: ignore[unresolved-attribute]
+        special_tokens = tokenizer.special_tokens  # ty: ignore[unresolved-attribute]
         reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **special_tokens}.items()}
         tokens: list[str] = []
         toktypes: list[int] = []
@@ -11660,7 +11660,7 @@ class HunYuanModel(TextModel):
         # 2. Reverse-engineer the merges list from mergeable_ranks
         merges = []
         vocab = {}
-        mergeable_ranks = tokenizer.mergeable_ranks
+        mergeable_ranks = tokenizer.mergeable_ranks  # ty: ignore[unresolved-attribute]
         for token, rank in mergeable_ranks.items():
             vocab[QwenModel.token_bytes_to_string(token)] = rank
             if len(token) == 1:
@@ -11671,8 +11671,8 @@ class HunYuanModel(TextModel):
 
         # 3. Generate the tokens and toktypes lists
         vocab_size = self.hparams["vocab_size"]
-        assert tokenizer.vocab_size == vocab_size
-        special_tokens = tokenizer.special_tokens
+        assert tokenizer.vocab_size == vocab_size  # ty: ignore[unresolved-attribute]
+        special_tokens = tokenizer.special_tokens  # ty: ignore[unresolved-attribute]
         reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **special_tokens}.items()}
         tokens: list[str] = []
         toktypes: list[int] = []
@@ -12820,10 +12820,10 @@ class SolarOpenModel(Glm4MoeModel):
 
         self.gguf_writer.add_tokenizer_pre(tokpre)
         self.gguf_writer.add_token_list(tokens)
         self.gguf_writer.add_token_types(toktypes)
-        special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"])
-        special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|endoftext|>"])
-        special_vocab._set_special_token("unk", tokenizer.get_added_vocab()[""])
-        special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["<|startoftext|>"])
+        special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"])  # ty: ignore[unresolved-attribute]
+        special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|endoftext|>"])  # ty: ignore[unresolved-attribute]
+        special_vocab._set_special_token("unk", tokenizer.get_added_vocab()[""])  # ty: ignore[unresolved-attribute]
+        special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["<|startoftext|>"])  # ty: ignore[unresolved-attribute]
         special_vocab.add_to_gguf(self.gguf_writer)
 
diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py
index 086f1c2286..d8d10a1012 100755
--- a/convert_hf_to_gguf_update.py
+++ b/convert_hf_to_gguf_update.py
@@ -296,7 +296,7 @@ for model in [*pre_computed_hashes, *all_models]:
     except Exception as e:
         raise OSError(f"Error loading tokenizer for model {name}.") from e
 
-    chktok = tokenizer.encode(CHK_TXT)
+    chktok = tokenizer.encode(CHK_TXT)  # ty: ignore[unresolved-attribute]
     chkhsh = sha256(str(chktok).encode()).hexdigest()
 
     logger.info(f"model: {name}")
@@ -468,7 +468,7 @@ for model in models:
 
     with open(f"models/ggml-vocab-{name}.gguf.out", "w") as f:
         for text in tests:
-            res = tokenizer.encode(text, add_special_tokens=False)
+            res = tokenizer.encode(text, add_special_tokens=False)  # ty: ignore[unresolved-attribute]
             for r in res:
                 f.write(f" {r}")
             f.write("\n")
diff --git a/convert_lora_to_gguf.py b/convert_lora_to_gguf.py
index ee98d0cf97..d583342056 100755
--- a/convert_lora_to_gguf.py
+++ b/convert_lora_to_gguf.py
@@ -402,7 +402,7 @@ if __name__ == '__main__':
                 # the invocation string includes the "<|start_of_turn|>"
                 # token, but the adapters themselves were trained to
                 # activate _after_ that first token, so we drop it here.
-                alora_invocation_tokens = tokenizer(invocation_string)["input_ids"][1:]
+                alora_invocation_tokens = tokenizer(invocation_string)["input_ids"][1:]  # ty: ignore[call-non-callable]
                 if alora_invocation_tokens:
                     logger.debug("GGUF KV: %s = %s", gguf.Keys.Adapter.ALORA_INVOCATION_TOKENS, alora_invocation_tokens)
                     self.gguf_writer.add_key_value(
diff --git a/examples/model-conversion/scripts/causal/run-casual-gen-embeddings-org.py b/examples/model-conversion/scripts/causal/run-casual-gen-embeddings-org.py
index 4ab778fbc7..b94bec4e76 100755
--- a/examples/model-conversion/scripts/causal/run-casual-gen-embeddings-org.py
+++ b/examples/model-conversion/scripts/causal/run-casual-gen-embeddings-org.py
@@ -53,10 +53,10 @@ model_name = os.path.basename(model_path)
 print(f"Model name: {model_name}")
 
 prompt = "Hello world today"
-input_ids = tokenizer(prompt, return_tensors="pt").input_ids
+input_ids = tokenizer(prompt, return_tensors="pt").input_ids  # ty: ignore[call-non-callable]
 print(f"Input tokens: {input_ids}")
 print(f"Input text: {repr(prompt)}")
-print(f"Tokenized: {tokenizer.convert_ids_to_tokens(input_ids[0])}")
+print(f"Tokenized: {tokenizer.convert_ids_to_tokens(input_ids[0])}")  # ty: ignore[unresolved-attribute]
 
 with torch.no_grad():
     outputs = model(input_ids, output_hidden_states=True)
@@ -92,7 +92,7 @@ with torch.no_grad():
 
     # Print embeddings per token in the requested format
     print("\nToken embeddings:")
-    tokens = tokenizer.convert_ids_to_tokens(input_ids[0])
+    tokens = tokenizer.convert_ids_to_tokens(input_ids[0])  # ty: ignore[unresolved-attribute]
     for i, embedding in enumerate(token_embeddings):
         # Format: show first few values, ..., then last few values
         if len(embedding) > 10:
diff --git a/examples/model-conversion/scripts/utils/semantic_check.py b/examples/model-conversion/scripts/utils/semantic_check.py
index db0d004dab..754ae733da 100644
--- a/examples/model-conversion/scripts/utils/semantic_check.py
+++ b/examples/model-conversion/scripts/utils/semantic_check.py
@@ -207,8 +207,8 @@ def main():
     else:
         model = AutoModel.from_pretrained(args.model_path, trust_remote_code=True)
 
-    encoded = tokenizer(prompt, return_tensors="pt")
-    tokens = tokenizer.convert_ids_to_tokens(encoded['input_ids'][0])
+    encoded = tokenizer(prompt, return_tensors="pt")  # ty: ignore[call-non-callable]
+    tokens = tokenizer.convert_ids_to_tokens(encoded['input_ids'][0])  # ty: ignore[unresolved-attribute]
     n_tokens = len(tokens)
     print(f"n_tokens: {n_tokens}");
     print(f"hidden_size: {model.config.hidden_size}")
diff --git a/gguf-py/gguf/vocab.py b/gguf-py/gguf/vocab.py
index 5cd729dfa8..09a9b7d183 100644
--- a/gguf-py/gguf/vocab.py
+++ b/gguf-py/gguf/vocab.py
@@ -543,7 +543,7 @@ class LlamaHfVocab(Vocab):
             cache_dir=base_path,
             local_files_only=True,
         )
-        assert self.tokenizer.is_fast  # assume tokenizer.json is used
+        assert self.tokenizer.is_fast  # assume tokenizer.json is used  # ty: ignore[unresolved-attribute]
 
         # Initialize lists and dictionaries for added tokens
         self.added_tokens_list = []
@@ -552,30 +552,30 @@ class LlamaHfVocab(Vocab):
 
         # Process added tokens
         for tok, tokidx in sorted(
-            self.tokenizer.get_added_vocab().items(), key=lambda x: x[1]
+            self.tokenizer.get_added_vocab().items(), key=lambda x: x[1]  # ty: ignore[unresolved-attribute]
         ):
             # Only consider added tokens that are not in the base vocabulary
-            if tokidx >= self.tokenizer.vocab_size:
+            if tokidx >= self.tokenizer.vocab_size:  # ty: ignore[unresolved-attribute]
                 self.added_tokens_list.append(tok)
                 self.added_tokens_dict[tok] = tokidx
                 self.added_tokens_ids.add(tokidx)
 
         # Store special tokens and their IDs
         self.specials = {
-            tok: self.tokenizer.get_vocab()[tok]
-            for tok in self.tokenizer.all_special_tokens
+            tok: self.tokenizer.get_vocab()[tok]  # ty: ignore[unresolved-attribute]
+            for tok in self.tokenizer.all_special_tokens  # ty: ignore[unresolved-attribute]
         }
-        self.special_ids = set(self.tokenizer.all_special_ids)
+        self.special_ids = set(self.tokenizer.all_special_ids)  # ty: ignore[unresolved-attribute]
 
         # Set vocabulary sizes
-        self.vocab_size_base = self.tokenizer.vocab_size
+        self.vocab_size_base = self.tokenizer.vocab_size  # ty: ignore[unresolved-attribute]
         self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
 
         self.fname_tokenizer = fname_tokenizer
 
     def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
         reverse_vocab = {
-            id: encoded_tok for encoded_tok, id in self.tokenizer.get_vocab().items()
+            id: encoded_tok for encoded_tok, id in self.tokenizer.get_vocab().items()  # ty: ignore[unresolved-attribute]
         }
 
         for token_id in range(self.vocab_size_base):
@@ -616,7 +616,7 @@ class LlamaHfVocab(Vocab):
         yield text.encode("utf-8"), score, toktype
 
     def has_newline_token(self):
-        return "<0x0A>" in self.tokenizer.vocab or "\n" in self.tokenizer.vocab
+        return "<0x0A>" in self.tokenizer.vocab or "\n" in self.tokenizer.vocab  # ty: ignore[unresolved-attribute]
 
     def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
         yield from self.hf_tokens()
diff --git a/pyproject.toml b/pyproject.toml
index 422f53c7c7..35cd067083 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -18,7 +18,7 @@ classifiers = [
 python = ">=3.9"
 numpy = "^1.25.0"
 sentencepiece = ">=0.1.98,<0.3.0"
-transformers = ">=4.35.2,<5.0.0"
+transformers = "==5.5.1"
 protobuf = ">=4.21.0,<5.0.0"
 gguf = { path = "./gguf-py" }
 torch = { version = "^2.2.0", source = "pytorch" }
diff --git a/requirements/requirements-convert_legacy_llama.txt b/requirements/requirements-convert_legacy_llama.txt
index 4898bf7ee2..18d3980106 100644
--- a/requirements/requirements-convert_legacy_llama.txt
+++ b/requirements/requirements-convert_legacy_llama.txt
@@ -1,7 +1,7 @@
 numpy~=1.26.4
 sentencepiece>=0.1.98,<0.3.0
-transformers>=4.57.1,<5.0.0
+transformers==5.5.1
 gguf>=0.1.0
 protobuf>=4.21.0,<5.0.0
 
diff --git a/requirements/requirements-tool_bench.txt b/requirements/requirements-tool_bench.txt
index 3bb74fb9d0..66c3c12b3e 100644
--- a/requirements/requirements-tool_bench.txt
+++ b/requirements/requirements-tool_bench.txt
@@ -1,6 +1,6 @@
 aiohttp~=3.9.3
 pytest~=8.3.3
-huggingface_hub>=0.34.0,<1.0
+huggingface_hub>=1.5.0,<2.0
 matplotlib~=3.10.0
 numpy~=1.26.4
 openai~=2.14.0
diff --git a/tests/test-tokenizer-0.py b/tests/test-tokenizer-0.py
index cd760d1ce5..4f3f1c8a67 100644
--- a/tests/test-tokenizer-0.py
+++ b/tests/test-tokenizer-0.py
@@ -19,7 +19,7 @@ with open(fname_tok, 'r', encoding='utf-8') as f:
     lines = f.readlines()
     s = ''.join(lines)
     t_start = time.time()
-    res = tokenizer.encode(s, add_special_tokens=False)
+    res = tokenizer.encode(s, add_special_tokens=False)  # ty: ignore[unresolved-attribute]
     t_end = time.time()
     print('\nmain : tokenized in', "{:.3f}".format(1000.0 * (t_end - t_start)), 'ms (py)') # noqa: NP100
     with open(fname_out, 'w', encoding='utf-8') as f:
diff --git a/tests/test-tokenizer-random.py b/tests/test-tokenizer-random.py
index 25af4ee63b..8fc476b63c 100644
--- a/tests/test-tokenizer-random.py
+++ b/tests/test-tokenizer-random.py
@@ -128,7 +128,7 @@ class Tokenizer:
 class TokenizerGroundtruth (Tokenizer):
 
     def __init__(self, dir_tokenizer: str):
-        self.model: PreTrainedTokenizer = AutoTokenizer.from_pretrained(dir_tokenizer)
+        self.model: PreTrainedTokenizer = AutoTokenizer.from_pretrained(dir_tokenizer)  # ty: ignore[invalid-assignment]
         # guess BOS and EOS
         ids = self.encode("a")
         assert 1 <= len(ids) <= 3
@@ -142,7 +142,7 @@ class TokenizerGroundtruth (Tokenizer):
         self.vocab = list(sorted(self.vocab))
         # tokens and lists
         self.special_tokens = list(self.model.all_special_tokens)
-        self.added_tokens = self.model.batch_decode(self.model.added_tokens_encoder.values(), skip_special_tokens=False)
+        self.added_tokens = self.model.batch_decode(list(self.model.added_tokens_encoder.values()), skip_special_tokens=False)
         self.bos_token = self.model.bos_token
         self.eos_token = self.model.eos_token
 
@@ -150,7 +150,7 @@ class TokenizerGroundtruth (Tokenizer):
         return self.model.encode(text, add_special_tokens=True)
 
     def decode(self, ids: list[int]) -> str:
-        return self.model.decode(ids, skip_special_tokens=False)
+        return self.model.decode(ids, skip_special_tokens=False)  # ty: ignore[invalid-return-type]
 
 
 class TokenizerLlamaCpp (Tokenizer):
diff --git a/tools/server/tests/requirements.txt b/tools/server/tests/requirements.txt
index ca79d025ed..92d27e2a13 100644
--- a/tools/server/tests/requirements.txt
+++ b/tools/server/tests/requirements.txt
@@ -1,6 +1,6 @@
 aiohttp~=3.9.3
 pytest~=8.3.3
-huggingface_hub>=0.34.0,<1.0
+huggingface_hub>=1.5.0,<2.0
 numpy~=1.26.4
 openai~=2.14.0
 prometheus-client~=0.20.0