Fix model loading regex error

This commit is contained in:
Lasse Lauwerys 2026-01-29 19:48:39 +01:00
parent 50e8962f79
commit 0c4093cc5b
1 changed file with 19 additions and 32 deletions

View File

@ -497,19 +497,26 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
return bpe_offsets;
}
// use std::wregex to split the text
static std::vector<size_t> unicode_regex_split_stl(const std::wstring & wtext, const std::wstring & regex_expr, const std::vector<size_t> & offsets) {
std::wregex expr(regex_expr, std::regex_constants::optimize | std::regex_constants::nosubs);
template<typename CharT>
static std::vector<size_t> unicode_regex_split_stl(const std::basic_string<CharT> & text, const std::basic_string<CharT> & regex, const std::vector<size_t> & offsets) {
#ifdef _MSC_VER // https://github.com/ggml-org/llama.cpp/issues/17830
// MSVC's std::regex has stack limitations with complex patterns
constexpr auto regex_flags = std::regex_constants::ECMAScript;
#else
// Prevents catastrophic backtracking on repetitive input
constexpr auto regex_flags = std::regex_constants::optimize | std::regex_constants::nosubs;
#endif
std::basic_regex<CharT> expr(regex, regex_flags);
std::vector<size_t> bpe_offsets; // store the offset of each word
bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size
size_t start = 0;
for (auto offset : offsets) {
std::wcregex_iterator it(wtext.data() + start, wtext.data() + start + offset, expr);
std::wcregex_iterator end;
std::regex_iterator<const CharT *> it(text.data() + start, text.data() + start + offset, expr);
std::regex_iterator<const CharT *> end;
int64_t start_idx = 0;
while (it != end) {
std::wcmatch match = *it;
std::match_results<const CharT *> match = *it;
if (match.position() > start_idx) {
bpe_offsets.emplace_back(match.position() - start_idx);
}
@ -527,34 +534,14 @@ static std::vector<size_t> unicode_regex_split_stl(const std::wstring & wtext, c
return bpe_offsets;
}
// use std::wregex to split the text
static std::vector<size_t> unicode_regex_split_stl(const std::wstring & wtext, const std::wstring & regex_expr, const std::vector<size_t> & offsets) {
    // Thin wrapper: forwards to the character-type-generic implementation
    // instantiated for wchar_t, returning the resulting word offsets unchanged.
    auto bpe_offsets = unicode_regex_split_stl<wchar_t>(wtext, regex_expr, offsets);
    return bpe_offsets;
}
// use std::regex to split the text
//
// Thin wrapper over the character-type-generic implementation, instantiated
// for plain char. Splits `text` (narrow string) by `regex_expr` within each
// of the pre-existing segment lengths in `offsets`, returning the lengths of
// the resulting sub-segments.
//
// NOTE: the old hand-written std::regex body that previously lived here was
// unreachable dead code sitting in front of the delegating return; it has
// been removed so only the shared template implementation remains.
static std::vector<size_t> unicode_regex_split_stl(const std::string & text, const std::string & regex_expr, const std::vector<size_t> & offsets) {
    return unicode_regex_split_stl<char>(text, regex_expr, offsets);
}
// K2 system regex patterns (from tokenization_kimi.py):