unicode : MSVC regex fix (#19340)
* Fix model loading regex error * Change comments * Use const_iterator and remove specializations --------- Co-authored-by: Alde Rojas <hello@alde.dev>
This commit is contained in:
parent
3688c4f504
commit
06bf3796f4
|
|
@ -497,49 +497,26 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
|
|||
return bpe_offsets;
|
||||
}
|
||||
|
||||
// use std::wregex to split the text
|
||||
static std::vector<size_t> unicode_regex_split_stl(const std::wstring & wtext, const std::wstring & regex_expr, const std::vector<size_t> & offsets) {
|
||||
std::wregex expr(regex_expr, std::regex_constants::optimize | std::regex_constants::nosubs);
|
||||
template <typename CharT>
|
||||
static std::vector<size_t> unicode_regex_split_stl(const std::basic_string<CharT> & text, const std::basic_string<CharT> & regex, const std::vector<size_t> & offsets) {
|
||||
using BidirIt = typename std::basic_string<CharT>::const_iterator;
|
||||
#ifdef _MSC_VER
|
||||
// Bypass bug in MSVC: https://github.com/ggml-org/llama.cpp/issues/17830
|
||||
constexpr auto regex_flags = std::regex_constants::ECMAScript;
|
||||
#else
|
||||
constexpr auto regex_flags = std::regex_constants::optimize | std::regex_constants::nosubs;
|
||||
#endif
|
||||
std::basic_regex<CharT> expr(regex, regex_flags);
|
||||
std::vector<size_t> bpe_offsets; // store the offset of each word
|
||||
bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size
|
||||
size_t start = 0;
|
||||
for (auto offset : offsets) {
|
||||
std::wcregex_iterator it(wtext.data() + start, wtext.data() + start + offset, expr);
|
||||
std::wcregex_iterator end;
|
||||
std::regex_iterator<BidirIt> it(text.begin() + start, text.begin() + start + offset, expr);
|
||||
std::regex_iterator<BidirIt> end;
|
||||
|
||||
int64_t start_idx = 0;
|
||||
while (it != end) {
|
||||
std::wcmatch match = *it;
|
||||
if (match.position() > start_idx) {
|
||||
bpe_offsets.emplace_back(match.position() - start_idx);
|
||||
}
|
||||
bpe_offsets.emplace_back(match.length());
|
||||
start_idx = match.position() + match.length();
|
||||
++it;
|
||||
}
|
||||
|
||||
if (start_idx < (int64_t) offset) {
|
||||
bpe_offsets.emplace_back(offset - start_idx);
|
||||
}
|
||||
start += offset;
|
||||
}
|
||||
|
||||
return bpe_offsets;
|
||||
}
|
||||
|
||||
// use std::regex to split the text
|
||||
static std::vector<size_t> unicode_regex_split_stl(const std::string & text, const std::string & regex_expr, const std::vector<size_t> & offsets) {
|
||||
std::regex expr(regex_expr, std::regex_constants::optimize | std::regex_constants::nosubs);
|
||||
std::vector<size_t> bpe_offsets; // store the offset of each word
|
||||
bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size
|
||||
size_t start = 0;
|
||||
for (auto offset : offsets) {
|
||||
std::cregex_iterator it(text.data() + start, text.data() + start + offset, expr);
|
||||
std::cregex_iterator end;
|
||||
|
||||
int64_t start_idx = 0;
|
||||
while (it != end) {
|
||||
std::cmatch match = *it;
|
||||
std::match_results<BidirIt> match = *it;
|
||||
if (match.position() > start_idx) {
|
||||
bpe_offsets.emplace_back(match.position() - start_idx);
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue