From ba4853ef7ae738d8690c503f24212c3665b375d8 Mon Sep 17 00:00:00 2001
From: Alde Rojas
Date: Fri, 6 Feb 2026 00:17:18 -0600
Subject: [PATCH] Use const_iterator and remove specializations

---
 src/unicode.cpp | 22 +++++++---------------
 1 file changed, 7 insertions(+), 15 deletions(-)

diff --git a/src/unicode.cpp b/src/unicode.cpp
index e65e05d95a..adfc489d1f 100644
--- a/src/unicode.cpp
+++ b/src/unicode.cpp
@@ -497,9 +497,11 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
     return bpe_offsets;
 }
 
-template <typename Char, typename Iter>
+template <typename Char>
 static std::vector<size_t> unicode_regex_split_stl(const std::basic_string<Char> & text, const std::basic_string<Char> & regex, const std::vector<size_t> & offsets) {
-#ifdef _MSC_VER // Bypass bug in MSVC: https://github.com/ggml-org/llama.cpp/issues/17830
+    using BidirIt = typename std::basic_string<Char>::const_iterator;
+#ifdef _MSC_VER
+    // Bypass bug in MSVC: https://github.com/ggml-org/llama.cpp/issues/17830
     constexpr auto regex_flags = std::regex_constants::ECMAScript;
 #else
     constexpr auto regex_flags = std::regex_constants::optimize | std::regex_constants::nosubs;
@@ -509,12 +511,12 @@ static std::vector<size_t> unicode_regex_split_stl(const std::basic_string<Char
     bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size
     size_t start = 0;
     for (auto offset : offsets) {
-        std::regex_iterator<Iter> it(text.data() + start, text.data() + start + offset, expr);
-        std::regex_iterator<Iter> end;
+        std::regex_iterator<BidirIt> it(text.begin() + start, text.begin() + start + offset, expr);
+        std::regex_iterator<BidirIt> end;
 
         int64_t start_idx = 0;
         while (it != end) {
-            std::match_results<Iter> match = *it;
+            std::match_results<BidirIt> match = *it;
             if (match.position() > start_idx) {
                 bpe_offsets.emplace_back(match.position() - start_idx);
             }
@@ -532,16 +534,6 @@ static std::vector<size_t> unicode_regex_split_stl(const std::basic_string<Char
     return bpe_offsets;
 }
 
-// use std::wregex to split the text
-static std::vector<size_t> unicode_regex_split_stl(const std::wstring & wtext, const std::wstring & regex_expr, const std::vector<size_t> & offsets) {
-    return unicode_regex_split_stl<wchar_t, const wchar_t *>(wtext, regex_expr, offsets);
-}
-
-// use std::regex to split the text
-static std::vector<size_t> unicode_regex_split_stl(const std::string & text, const std::string & regex_expr, const std::vector<size_t> & offsets) {
-    return unicode_regex_split_stl<char, const char *>(text, regex_expr, offsets);
-}
-
 // K2 system regex patterns (from tokenization_kimi.py):
 // [\p{Han}]+|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]*[\p{Ll}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]+[\p{Ll}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+
 static std::vector<size_t> unicode_regex_split_custom_kimi_k2(const std::string & text, const std::vector<size_t> & offsets) {
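
Note (not part of the patch): the sketch below is a minimal standalone illustration of the technique the patch adopts: templating the splitter on the character type and driving std::regex_iterator with std::basic_string<Char>::const_iterator, so one definition serves both std::string and std::wstring with no per-type forwarding overloads. The name regex_split_lengths and its exact output format are illustrative assumptions, not code from unicode.cpp.

// Standalone sketch: split text into alternating "unmatched gap" / "match"
// span lengths, the same shape unicode_regex_split_stl() produces.
#include <cstdint>
#include <cstdio>
#include <regex>
#include <string>
#include <vector>

template <typename Char>
static std::vector<size_t> regex_split_lengths(const std::basic_string<Char> & text, const std::basic_string<Char> & pattern) {
    // Bidirectional iterator type derived from the string itself,
    // instead of hard-coding const Char * and calling text.data().
    using BidirIt = typename std::basic_string<Char>::const_iterator;

    const std::basic_regex<Char> expr(pattern);
    std::vector<size_t> lengths;

    std::regex_iterator<BidirIt> it(text.begin(), text.end(), expr);
    std::regex_iterator<BidirIt> end;

    int64_t start_idx = 0;
    while (it != end) {
        const std::match_results<BidirIt> match = *it;
        if (match.position() > start_idx) {
            lengths.emplace_back(match.position() - start_idx); // unmatched text before the match
        }
        lengths.emplace_back(match.length());                   // the match itself
        start_idx = match.position() + match.length();
        ++it;
    }
    if (start_idx < (int64_t) text.size()) {
        lengths.emplace_back(text.size() - start_idx);           // trailing unmatched text
    }
    return lengths;
}

int main() {
    // One template handles narrow and wide strings; no separate overloads needed.
    const std::vector<size_t> a = regex_split_lengths<char>("hello   world", std::string("\\s+"));
    const std::vector<size_t> b = regex_split_lengths<wchar_t>(L"hello   world", std::wstring(L"\\s+"));
    for (size_t n : a) { std::printf("%zu ", n); }   // prints: 5 3 5
    std::printf("\n(%zu wide spans)\n", b.size());
    return 0;
}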