From ccd55e4ff7a2b30d7421f319e5a6ed4e1509cc0c Mon Sep 17 00:00:00 2001
From: o7si
Date: Wed, 14 Jan 2026 15:27:24 +0800
Subject: [PATCH] convert: add normalizer.lowercase metadata support

---
Notes (review only; this area between the three-dash line and the first
"diff --git" is not part of the commit message or of the change itself):

  * constants.py: adds Keys.Tokenizer.NORMALIZER_LOWERCASE, the key string
    "tokenizer.ggml.normalizer.lowercase".
  * gguf_writer.py: adds GGUFWriter.add_normalizer_lowercase(value), which
    stores that key through add_bool().
  * vocab.py: SpecialVocab gains a normalizer_lowercase flag, initialised to
    False in __init__. While tokenizer.json is loaded, a truthy "normalizer"
    entry is handed to _parse_normalizer(), which sets the flag when it sees
    {"type": "Lowercase"}, a "BertNormalizer" whose "lowercase" is truthy,
    or either of those nested (recursively) inside a "Sequence". Nothing in
    this patch resets the flag to False once it has been set.
  * The key is only written when the flag is True (the hunk at -102 calls
    gw.add_normalizer_lowercase(True)); when it is False no key is emitted.
  * NOTE(review): line breaks and indentation in this file were reconstructed
    from a whitespace-collapsed copy. Runs of spaces inside a line (for
    example column-aligned "=" signs in constants.py) may have been collapsed,
    so context lines might not match the target tree byte-for-byte. Confirm
    against the upstream commit, or apply with
    `git apply --ignore-whitespace`.
  * NOTE(review): the "From:" header carries no e-mail address in this copy;
    confirm the author identity before running git-am.

 gguf-py/gguf/constants.py   |  2 ++
 gguf-py/gguf/gguf_writer.py |  3 +++
 gguf-py/gguf/vocab.py       | 27 +++++++++++++++++++++++++++
 3 files changed, 32 insertions(+)

diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index 31273b2b5a..813555e336 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -251,6 +251,8 @@ class Keys:
         CHAT_TEMPLATE = "tokenizer.chat_template"
         CHAT_TEMPLATE_N = "tokenizer.chat_template.{name}"
         CHAT_TEMPLATES = "tokenizer.chat_templates"
+        # Normalizer constants
+        NORMALIZER_LOWERCASE = "tokenizer.ggml.normalizer.lowercase"
         # FIM/Infill special tokens constants
         FIM_PRE_ID = "tokenizer.ggml.fim_pre_token_id"
         FIM_SUF_ID = "tokenizer.ggml.fim_suf_token_id"
diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py
index 7fbb78866b..b609c9e696 100644
--- a/gguf-py/gguf/gguf_writer.py
+++ b/gguf-py/gguf/gguf_writer.py
@@ -1072,6 +1072,9 @@ class GGUFWriter:
     def add_eom_token_id(self, id: int) -> None:
         self.add_uint32(Keys.Tokenizer.EOM_ID, id)
 
+    def add_normalizer_lowercase(self, value: bool) -> None:
+        self.add_bool(Keys.Tokenizer.NORMALIZER_LOWERCASE, value)
+
     def add_classifier_output_labels(self, labels: Sequence[str]) -> None:
         self.add_array(Keys.Classifier.OUTPUT_LABELS.format(arch=self.arch), labels)
 
diff --git a/gguf-py/gguf/vocab.py b/gguf-py/gguf/vocab.py
index 028e5748e4..36fc5cf023 100644
--- a/gguf-py/gguf/vocab.py
+++ b/gguf-py/gguf/vocab.py
@@ -52,6 +52,7 @@ class SpecialVocab:
     add_special_token: dict[str, bool]
     special_token_ids: dict[str, int]
     chat_template: str | Sequence[Mapping[str, str]] | None
+    normalizer_lowercase: bool
 
     def __init__(
         self, path: str | os.PathLike[str], load_merges: bool = False,
@@ -64,6 +65,7 @@ class SpecialVocab:
         self.load_merges = load_merges
         self.merges = []
         self.chat_template = None
+        self.normalizer_lowercase = False
         if special_token_types is not None:
             self.special_token_types = special_token_types
         else:
@@ -102,6 +104,10 @@ class SpecialVocab:
             if not quiet:
                 logger.info(f'Setting chat_template to {self.chat_template}')
             gw.add_chat_template(self.chat_template)
+        if self.normalizer_lowercase:
+            if not quiet:
+                logger.info('Setting normalizer_lowercase to True')
+            gw.add_normalizer_lowercase(True)
 
     def _load(self, path: Path) -> None:
         self._try_load_from_tokenizer_json(path)
@@ -146,6 +152,24 @@ class SpecialVocab:
             return
         logger.warning(f'Special token type {typ}, id {tid} out of range, must be under {self.n_vocab} - skipping')
 
+    def _parse_normalizer(self, normalizer: dict) -> None:
+        # ref: https://huggingface.co/docs/tokenizers/api/normalizers
+        #
+        # Detects lowercase normalization in three possible formats:
+        # 1. Standalone: {"type": "Lowercase"}
+        # 2. BertNormalizer attribute: {"type": "BertNormalizer", "lowercase": true, ...}
+        # 3. Nested in Sequence: {"type": "Sequence", "normalizers": [...]}
+
+        normalizer_type = normalizer.get('type')
+        if normalizer_type == 'Lowercase':
+            self.normalizer_lowercase = True
+        elif normalizer_type == 'BertNormalizer':
+            if normalizer.get('lowercase', False):
+                self.normalizer_lowercase = True
+        elif normalizer_type == 'Sequence':
+            for norm in normalizer.get('normalizers', []):
+                self._parse_normalizer(norm)
+
     def _try_load_from_tokenizer_json(self, path: Path) -> bool:
         tokenizer = None
         tokenizer_file = path / 'tokenizer.json'
@@ -178,6 +202,9 @@ class SpecialVocab:
                         ]
                     else:
                         raise ValueError("Unknown tokenizer merges format")
+            # Parse normalizer configuration
+            if normalizer := tokenizer.get('normalizer'):
+                self._parse_normalizer(normalizer)
             added_tokens = tokenizer.get('added_tokens', {})
         else:
             added_tokens = {}