convert : support latest mistral-common (fix conversion with --mistral-format) (#17712)
* fix convert_hf_to_gguf.py failing with --mistral-format when using later mistral-common versions.
* use get_one_valid_tokenizer_file from mistral-common if available, and fall back to the old logic otherwise.
* use the file name instead of the file path for get_one_valid_tokenizer_file.
* fix --mistral-format tokenizer file lookup failing for tokenizers in subdirectories.
* move the get_one_valid_tokenizer_file import to avoid a nested try-except.
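For context, a minimal sketch of the selection strategy the bullets describe, assuming only what the diff below shows: get_one_valid_tokenizer_file takes the list of candidate file paths and returns the tokenizer file to load, and it is absent from older mistral-common releases. The pick_tokenizer_file helper name and the simplified filename filter are illustrative stand-ins (the committed code keeps this logic in MistralVocab.__init__ and uses _filter_valid_tokenizer_files for the fallback):

# Illustrative sketch only, not the committed code.
from pathlib import Path

try:
    # Newer mistral-common releases ship this helper.
    from mistral_common.tokens.tokenizers.utils import get_one_valid_tokenizer_file
except ImportError:
    # Older releases do not; keep the converter's own selection logic as a fallback.
    get_one_valid_tokenizer_file = None


def pick_tokenizer_file(base_path: Path) -> Path:
    # Hypothetical helper for illustration.
    all_files = [f.as_posix() for f in base_path.glob("**/*") if f.is_file()]
    if get_one_valid_tokenizer_file is not None:
        # Let mistral-common choose the tokenizer file, even inside subdirectories.
        return Path(get_one_valid_tokenizer_file(all_files))
    # Simplified stand-in for _filter_valid_tokenizer_files: prefer tekken.json,
    # otherwise take the last candidate in sorted order (the "versioned" file).
    names = sorted(
        Path(f).name for f in all_files
        if Path(f).name == "tekken.json" or Path(f).name.startswith("tokenizer.model")
    )
    if not names:
        raise ValueError(f"No tokenizer file found in the directory: {base_path}")
    return base_path / ("tekken.json" if "tekken.json" in names else names[-1])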
This commit is contained in:
parent e9f9483464
commit 424c579455
@@ -31,6 +31,14 @@ except ImportError:
 else:
     _mistral_common_installed = True
 
+try:
+    from mistral_common.tokens.tokenizers.utils import (  # pyright: ignore[reportMissingImports]
+        get_one_valid_tokenizer_file,
+    )
+except ImportError:
+    # We still want the conversion to work with older mistral-common versions.
+    get_one_valid_tokenizer_file = None
+
 import gguf
 
@@ -673,24 +681,30 @@ class MistralVocab(Vocab):
 
         # Find the tokenizer files
         all_files = [f.as_posix() for f in base_path.glob("**/*") if f.is_file()]
-        valid_tokenizer_files = _filter_valid_tokenizer_files(all_files)
 
-        if len(valid_tokenizer_files) == 0:
-            raise ValueError(f"No tokenizer file found in the directory: {base_path}")
-        # If there are multiple tokenizer files, we use tekken.json if it exists, otherwise the versioned one.
-        if len(valid_tokenizer_files) > 1:
-            if "tekken.json" in valid_tokenizer_files:
-                tokenizer_file = "tekken.json"
-            else:
-                tokenizer_file = sorted(valid_tokenizer_files)[-1]
-            logger.warning(
-                f"Multiple tokenizer files found in {base_path}. Using {tokenizer_file}"
-            )
+        if get_one_valid_tokenizer_file is not None:
+            tokenizer_file_path = get_one_valid_tokenizer_file(all_files)
         else:
-            tokenizer_file = valid_tokenizer_files[0]
+            valid_tokenizer_files = _filter_valid_tokenizer_files(all_files)
+
+            if len(valid_tokenizer_files) == 0:
+                raise ValueError(f"No tokenizer file found in the directory: {base_path}")
+            # If there are multiple tokenizer files, we use tekken.json if it exists, otherwise the versioned one.
+            if len(valid_tokenizer_files) > 1:
+                if "tekken.json" in valid_tokenizer_files:
+                    tokenizer_file = "tekken.json"
+                else:
+                    tokenizer_file = sorted(valid_tokenizer_files)[-1]
+                logger.warning(
+                    f"Multiple tokenizer files found in {base_path}. Using {tokenizer_file}"
+                )
+            else:
+                tokenizer_file = valid_tokenizer_files[0]
+
+            tokenizer_file_path = base_path / tokenizer_file
 
         self.tokenizer = MistralTokenizer.from_file(
-            base_path / tokenizer_file
+            tokenizer_file_path
         ).instruct_tokenizer.tokenizer
         self.tokenizer_type = (
             MistralTokenizerType.tekken
@@ -698,7 +712,7 @@ class MistralVocab(Vocab):
             else MistralTokenizerType.spm
         )
         self.vocab_size = self.tokenizer.n_words
-        self.fname_tokenizer = base_path / tokenizer_file
+        self.fname_tokenizer = tokenizer_file_path
         self._name = (
             "mistral-" + self.tokenizer_type.value + "-" + self.tokenizer.version
         )