fix some tokenizer regex edge cases

2026-02-11 23:32:18 +00:00 · 2026-02-11 23:32:18 +00:00 · ce36cfea9f
parent 13267737b5
commit ce36cfea9f
2 changed files with 8 additions and 1 deletions
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -1123,7 +1123,7 @@ class TextModel(ModelBase):
        if chkhsh == "9c2227e4dd922002fb81bde4fc02b0483ca4f12911410dee2255e4987644e3f8":
            # ref: https://huggingface.co/CohereForAI/c4ai-command-r-v01
            res = "command-r"
-        if chkhsh == "9e47301af09841d8b5fa0bba1851507ea4259f2f0ce47df39d296836c6dce0fc":
+        if chkhsh == "d772b220ace2baec124bed8cfafce0ead7d6c38a4b65ef11261cf9d5d62246d1":
            # ref: https://huggingface.co/CohereLabs/tiny-aya-base
            res = "tiny_aya"
        if chkhsh == "e636dc30a262dcc0d8c323492e32ae2b70728f4df7dfe9737d9f920a282b8aea":
@@ -7269,6 +7269,8 @@ class Cohere2Model(TextModel):
    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        # Cohere2 runtime in llama.cpp expects no bias tensors; skip them at conversion time.
        if name.endswith(".bias"):
+            if torch.any(data_torch != 0):
+                raise ValueError(f"Bias tensor {name!r} is not zero.")
            logger.debug(f"Skipping bias tensor {name!r} for Cohere2 conversion.")
            return

--- a/src/unicode.cpp
+++ b/src/unicode.cpp
@@ -796,6 +796,11 @@ static std::vector<size_t> unicode_regex_split_custom(const std::string & text,
    } else if (regex_expr == "\\p{AFMoE_digits}") {
        // AFMOE digit pattern - use custom implementation for proper splitting
        bpe_offsets = unicode_regex_split_custom_afmoe(text, offsets);
+    } else if (regex_expr == "\\d{1,3}(?=(?:\\d{3})*\\b)") {
+        // tiny_aya digit grouping pattern from tokenizer.json:
+        //   {"type": "Split", "pattern": {"Regex": "\\d{1,3}(?=(?:\\d{3})*\\b)"}, "behavior": "Isolated"}
+        // Splits digits into groups of 3 from the right (e.g., 1234567 -> 1, 234, 567)
+        bpe_offsets = unicode_regex_split_custom_afmoe(text, offsets);
    }

    return bpe_offsets;