From 0c4093cc5bdd0062b017a46017538fffa3ab8d5d Mon Sep 17 00:00:00 2001 From: Lasse Lauwerys Date: Thu, 29 Jan 2026 19:48:39 +0100 Subject: [PATCH] Fix model loading regex error --- src/unicode.cpp | 51 ++++++++++++++++++------------------------------- 1 file changed, 19 insertions(+), 32 deletions(-) diff --git a/src/unicode.cpp b/src/unicode.cpp index b47dcbe619..706e45b217 100644 --- a/src/unicode.cpp +++ b/src/unicode.cpp @@ -497,19 +497,26 @@ static std::vector unicode_regex_split_custom_llama3(const std::string & return bpe_offsets; } -// use std::wregex to split the text -static std::vector unicode_regex_split_stl(const std::wstring & wtext, const std::wstring & regex_expr, const std::vector & offsets) { - std::wregex expr(regex_expr, std::regex_constants::optimize | std::regex_constants::nosubs); +template +static std::vector unicode_regex_split_stl(const std::basic_string & text, const std::basic_string & regex, const std::vector & offsets) { +#ifdef _MSC_VER // https://github.com/ggml-org/llama.cpp/issues/17830 + // MSVC's std::regex has stack limitations with complex patterns + constexpr auto regex_flags = std::regex_constants::ECMAScript; +#else + // Prevents catastrophic backtracking on repetitive input + constexpr auto regex_flags = std::regex_constants::optimize | std::regex_constants::nosubs; +#endif + std::basic_regex expr(regex, regex_flags); std::vector bpe_offsets; // store the offset of each word bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size size_t start = 0; for (auto offset : offsets) { - std::wcregex_iterator it(wtext.data() + start, wtext.data() + start + offset, expr); - std::wcregex_iterator end; + std::regex_iterator it(text.data() + start, text.data() + start + offset, expr); + std::regex_iterator end; int64_t start_idx = 0; while (it != end) { - std::wcmatch match = *it; + std::match_results match = *it; if (match.position() > start_idx) { bpe_offsets.emplace_back(match.position() - start_idx); } @@ -527,34 +534,14 @@ static std::vector unicode_regex_split_stl(const std::wstring & wtext, c return bpe_offsets; } +// use std::wregex to split the text +static std::vector unicode_regex_split_stl(const std::wstring & wtext, const std::wstring & regex_expr, const std::vector & offsets) { + return unicode_regex_split_stl(wtext, regex_expr, offsets); +} + // use std::regex to split the text static std::vector unicode_regex_split_stl(const std::string & text, const std::string & regex_expr, const std::vector & offsets) { - std::regex expr(regex_expr, std::regex_constants::optimize | std::regex_constants::nosubs); - std::vector bpe_offsets; // store the offset of each word - bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size - size_t start = 0; - for (auto offset : offsets) { - std::cregex_iterator it(text.data() + start, text.data() + start + offset, expr); - std::cregex_iterator end; - - int64_t start_idx = 0; - while (it != end) { - std::cmatch match = *it; - if (match.position() > start_idx) { - bpe_offsets.emplace_back(match.position() - start_idx); - } - bpe_offsets.emplace_back(match.length()); - start_idx = match.position() + match.length(); - ++it; - } - - if (start_idx < (int64_t) offset) { - bpe_offsets.emplace_back(offset - start_idx); - } - start += offset; - } - - return bpe_offsets; + return unicode_regex_split_stl(text, regex_expr, offsets); } // K2 system regex patterns (from tokenization_kimi.py):