From 1be97831e44a6335aca9c3f4f3edbb0e35bea98f Mon Sep 17 00:00:00 2001
From: Pascal <admin@serveurperso.com>
Date: Fri, 5 Dec 2025 12:52:23 +0100
Subject: [PATCH] fix: prevent segfault in tokenizer on highly repetitive input
 (#17786)

Add nosubs|optimize flags to std::regex constructors to prevent
catastrophic backtracking when processing prompts with repeated
identical characters (e.g., 'A' * 10000).

The nosubs flag disables subgroup capture, significantly reducing
memory usage and backtracking on uniform token sequences.
---
 src/unicode.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/unicode.cpp b/src/unicode.cpp
index 77ba4fc46b..bb44edfadd 100644
--- a/src/unicode.cpp
+++ b/src/unicode.cpp
@@ -499,7 +499,7 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
 
 // use std::wregex to split the text
 static std::vector<size_t> unicode_regex_split_stl(const std::wstring & wtext, const std::wstring & regex_expr, const std::vector<size_t> & offsets) {
-    std::wregex expr(regex_expr);
+    std::wregex expr(regex_expr, std::regex_constants::optimize | std::regex_constants::nosubs);
     std::vector<size_t> bpe_offsets; // store the offset of each word
     bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size
     size_t start = 0;
@@ -529,7 +529,7 @@ static std::vector<size_t> unicode_regex_split_stl(const std::wstring & wtext, c
 
 // use std::regex to split the text
 static std::vector<size_t> unicode_regex_split_stl(const std::string & text, const std::string & regex_expr, const std::vector<size_t> & offsets) {
-    std::regex expr(regex_expr);
+    std::regex expr(regex_expr, std::regex_constants::optimize | std::regex_constants::nosubs);
     std::vector<size_t> bpe_offsets; // store the offset of each word
     bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size
     size_t start = 0;