diff --git a/src/unicode.cpp b/src/unicode.cpp index b47dcbe619..adfc489d1f 100644 --- a/src/unicode.cpp +++ b/src/unicode.cpp @@ -497,49 +497,26 @@ static std::vector unicode_regex_split_custom_llama3(const std::string & return bpe_offsets; } -// use std::wregex to split the text -static std::vector unicode_regex_split_stl(const std::wstring & wtext, const std::wstring & regex_expr, const std::vector & offsets) { - std::wregex expr(regex_expr, std::regex_constants::optimize | std::regex_constants::nosubs); +template +static std::vector unicode_regex_split_stl(const std::basic_string & text, const std::basic_string & regex, const std::vector & offsets) { + using BidirIt = typename std::basic_string::const_iterator; +#ifdef _MSC_VER + // Bypass bug in MSVC: https://github.com/ggml-org/llama.cpp/issues/17830 + constexpr auto regex_flags = std::regex_constants::ECMAScript; +#else + constexpr auto regex_flags = std::regex_constants::optimize | std::regex_constants::nosubs; +#endif + std::basic_regex expr(regex, regex_flags); std::vector bpe_offsets; // store the offset of each word bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size size_t start = 0; for (auto offset : offsets) { - std::wcregex_iterator it(wtext.data() + start, wtext.data() + start + offset, expr); - std::wcregex_iterator end; + std::regex_iterator it(text.begin() + start, text.begin() + start + offset, expr); + std::regex_iterator end; int64_t start_idx = 0; while (it != end) { - std::wcmatch match = *it; - if (match.position() > start_idx) { - bpe_offsets.emplace_back(match.position() - start_idx); - } - bpe_offsets.emplace_back(match.length()); - start_idx = match.position() + match.length(); - ++it; - } - - if (start_idx < (int64_t) offset) { - bpe_offsets.emplace_back(offset - start_idx); - } - start += offset; - } - - return bpe_offsets; -} - -// use std::regex to split the text -static std::vector unicode_regex_split_stl(const std::string & text, const std::string & regex_expr, const std::vector & offsets) { - std::regex expr(regex_expr, std::regex_constants::optimize | std::regex_constants::nosubs); - std::vector bpe_offsets; // store the offset of each word - bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size - size_t start = 0; - for (auto offset : offsets) { - std::cregex_iterator it(text.data() + start, text.data() + start + offset, expr); - std::cregex_iterator end; - - int64_t start_idx = 0; - while (it != end) { - std::cmatch match = *it; + std::match_results match = *it; if (match.position() > start_idx) { bpe_offsets.emplace_back(match.position() - start_idx); }