From 0c4093cc5bdd0062b017a46017538fffa3ab8d5d Mon Sep 17 00:00:00 2001
From: Lasse Lauwerys <lauweryslasse@gmail.com>
Date: Thu, 29 Jan 2026 19:48:39 +0100
Subject: [PATCH] Fix model loading regex error

---
 src/unicode.cpp | 51 ++++++++++++++++++-------------------------------
 1 file changed, 19 insertions(+), 32 deletions(-)
diff --git a/src/unicode.cpp b/src/unicode.cpp
index b47dcbe619..706e45b217 100644
--- a/src/unicode.cpp
+++ b/src/unicode.cpp
@@ -497,19 +497,26 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
     return bpe_offsets;
 }
 
-// use std::wregex to split the text
-static std::vector<size_t> unicode_regex_split_stl(const std::wstring & wtext, const std::wstring & regex_expr, const std::vector<size_t> & offsets) {
-    std::wregex expr(regex_expr, std::regex_constants::optimize | std::regex_constants::nosubs);
+template<typename CharT>
+static std::vector<size_t> unicode_regex_split_stl(const std::basic_string<CharT> & text, const std::basic_string<CharT> & regex, const std::vector<size_t> & offsets) {
+#ifdef _MSC_VER // https://github.com/ggml-org/llama.cpp/issues/17830
+    // MSVC's std::regex has stack limitations with complex patterns: use plain ECMAScript flags (no optimize | nosubs)
+    constexpr auto regex_flags = std::regex_constants::ECMAScript;
+#else
+    // optimize favors match speed over construction cost; nosubs disables sub-match capture
+    constexpr auto regex_flags = std::regex_constants::optimize | std::regex_constants::nosubs;
+#endif
+    std::basic_regex<CharT> expr(regex, regex_flags);
     std::vector<size_t> bpe_offsets; // store the offset of each word
     bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size
     size_t start = 0;
     for (auto offset : offsets) {
-        std::wcregex_iterator it(wtext.data() + start, wtext.data() + start + offset, expr);
-        std::wcregex_iterator end;
+        std::regex_iterator<const CharT *> it(text.data() + start, text.data() + start + offset, expr);
+        std::regex_iterator<const CharT *> end;
 
         int64_t start_idx = 0;
         while (it != end) {
-            std::wcmatch match = *it;
+            std::match_results<const CharT *> match = *it;
             if (match.position() > start_idx) {
                 bpe_offsets.emplace_back(match.position() - start_idx);
             }
@@ -527,34 +534,14 @@ static std::vector<size_t> unicode_regex_split_stl(const std::wstring & wtext, c
     return bpe_offsets;
 }
 
+// use std::wregex to split the text
+static std::vector<size_t> unicode_regex_split_stl(const std::wstring & wtext, const std::wstring & regex_expr, const std::vector<size_t> & offsets) {
+    return unicode_regex_split_stl<wchar_t>(wtext, regex_expr, offsets);
+}
+
 // use std::regex to split the text
 static std::vector<size_t> unicode_regex_split_stl(const std::string & text, const std::string & regex_expr, const std::vector<size_t> & offsets) {
-    std::regex expr(regex_expr, std::regex_constants::optimize | std::regex_constants::nosubs);
-    std::vector<size_t> bpe_offsets; // store the offset of each word
-    bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size
-    size_t start = 0;
-    for (auto offset : offsets) {
-        std::cregex_iterator it(text.data() + start, text.data() + start + offset, expr);
-        std::cregex_iterator end;
-
-        int64_t start_idx = 0;
-        while (it != end) {
-            std::cmatch match = *it;
-            if (match.position() > start_idx) {
-                bpe_offsets.emplace_back(match.position() - start_idx);
-            }
-            bpe_offsets.emplace_back(match.length());
-            start_idx = match.position() + match.length();
-            ++it;
-        }
-
-        if (start_idx < (int64_t) offset) {
-            bpe_offsets.emplace_back(offset - start_idx);
-        }
-        start += offset;
-    }
-
-    return bpe_offsets;
+    return unicode_regex_split_stl<char>(text, regex_expr, offsets);
 }
 
 // K2 system regex patterns (from tokenization_kimi.py):