Fix model loading regex error

This commit is contained in:
Lasse Lauwerys 2026-01-29 19:48:39 +01:00
parent 50e8962f79
commit 0c4093cc5b
1 changed file with 19 additions and 32 deletions

View File

@ -497,19 +497,26 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
return bpe_offsets;
}
// use std::wregex to split the text
static std::vector<size_t> unicode_regex_split_stl(const std::wstring & wtext, const std::wstring & regex_expr, const std::vector<size_t> & offsets) {
std::wregex expr(regex_expr, std::regex_constants::optimize | std::regex_constants::nosubs);
template<typename CharT>
static std::vector<size_t> unicode_regex_split_stl(const std::basic_string<CharT> & text, const std::basic_string<CharT> & regex, const std::vector<size_t> & offsets) {
#ifdef _MSC_VER // https://github.com/ggml-org/llama.cpp/issues/17830
// MSVC's std::regex has stack limitations with complex patterns
constexpr auto regex_flags = std::regex_constants::ECMAScript;
#else
// Prevents catastrophic backtracking on repetitive input
constexpr auto regex_flags = std::regex_constants::optimize | std::regex_constants::nosubs;
#endif
std::basic_regex<CharT> expr(regex, regex_flags);
std::vector<size_t> bpe_offsets; // store the offset of each word
bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size
size_t start = 0;
for (auto offset : offsets) {
std::wcregex_iterator it(wtext.data() + start, wtext.data() + start + offset, expr);
std::wcregex_iterator end;
std::regex_iterator<const CharT *> it(text.data() + start, text.data() + start + offset, expr);
std::regex_iterator<const CharT *> end;
int64_t start_idx = 0;
while (it != end) {
std::wcmatch match = *it;
std::match_results<const CharT *> match = *it;
if (match.position() > start_idx) {
bpe_offsets.emplace_back(match.position() - start_idx);
}
@ -527,34 +534,14 @@ static std::vector<size_t> unicode_regex_split_stl(const std::wstring & wtext, c
return bpe_offsets;
}
// use std::wregex to split the text
static std::vector<size_t> unicode_regex_split_stl(const std::wstring & wtext, const std::wstring & regex_expr, const std::vector<size_t> & offsets) {
    // Thin wrapper: forwards to the character-type-generic implementation
    // instantiated for wchar_t, returning the resulting word offsets unchanged.
    auto bpe_offsets = unicode_regex_split_stl<wchar_t>(wtext, regex_expr, offsets);
    return bpe_offsets;
}
// use std::regex to split the text
//
// Thin wrapper over the character-type-generic implementation, instantiated
// for plain char. Splits `text` (narrow string) by `regex_expr` within each
// of the pre-existing segment lengths in `offsets`, returning the lengths of
// the resulting sub-segments.
//
// NOTE: the old hand-written std::regex body that previously lived here was
// unreachable dead code sitting in front of the delegating return; it has
// been removed so only the shared template implementation remains.
static std::vector<size_t> unicode_regex_split_stl(const std::string & text, const std::string & regex_expr, const std::vector<size_t> & offsets) {
    return unicode_regex_split_stl<char>(text, regex_expr, offsets);
}
// K2 system regex patterns (from tokenization_kimi.py):