Skip to content

Commit 3635601

Browse files
<regex>: Avoid stack growth in simple loops (#5939)
Co-authored-by: Stephan T. Lavavej <[email protected]>
1 parent df177d7 commit 3635601

File tree

3 files changed

+161
-32
lines changed

3 files changed

+161
-32
lines changed

benchmarks/src/regex_match.cpp

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -30,6 +30,7 @@ BENCHMARK_CAPTURE(bm_match_sequence_of_as, "a*", "a*")->Apply(common_args);
3030
BENCHMARK_CAPTURE(bm_match_sequence_of_as, "a*?", "a*?")->Apply(common_args);
3131
BENCHMARK_CAPTURE(bm_match_sequence_of_as, "(?:a)*", "(?:a)*")->Apply(common_args);
3232
BENCHMARK_CAPTURE(bm_match_sequence_of_as, "(a)*", "(a)*")->Apply(common_args);
33+
BENCHMARK_CAPTURE(bm_match_sequence_of_as, "(a)*?", "(a)*?")->Apply(common_args);
3334
BENCHMARK_CAPTURE(bm_match_sequence_of_as, "(?:b|a)*", "(?:b|a)*")->Apply(common_args);
3435
BENCHMARK_CAPTURE(bm_match_sequence_of_as, "(b|a)*", "(b|a)*")->Apply(common_args);
3536
BENCHMARK_CAPTURE(bm_match_sequence_of_as, "(a)(?:b|a)*", "(a)(?:b|a)*")->Apply(common_args);

stl/inc/regex

Lines changed: 129 additions & 32 deletions
Original file line number | Diff line number | Diff line change
@@ -1569,10 +1569,12 @@ public:
15691569
_Node_end_rep& operator=(const _Node_end_rep&) = delete;
15701570
};
15711571

1572-
struct _Loop_vals_v2_t { // storage for loop administration
1572+
template <class _Diff>
1573+
struct _Loop_vals_v3_t { // storage for loop administration
15731574
size_t _Loop_frame_idx = 0;
15741575
int _Loop_idx = 0;
15751576
unsigned int _Group_first = 0;
1577+
_Diff _Loop_length{};
15761578
};
15771579

15781580
class _Node_rep : public _Node_base { // node that marks the beginning of a repetition
@@ -1677,7 +1679,9 @@ enum class _Rx_unwind_ops {
16771679
_Disjunction_eval_alt_always,
16781680
_Do_nothing,
16791681
_Loop_simple_nongreedy,
1680-
_Loop_simple_greedy,
1682+
_Loop_simple_greedy_firstrep,
1683+
_Loop_simple_greedy_intermediaterep,
1684+
_Loop_simple_greedy_lastrep,
16811685
_Loop_nongreedy,
16821686
_Loop_greedy,
16831687
_Loop_restore_vals,
@@ -1812,7 +1816,7 @@ public:
18121816
private:
18131817
_Tgt_state_t<_It> _Tgt_state;
18141818
_Tgt_state_t<_It> _Res;
1815-
vector<_Loop_vals_v2_t> _Loop_vals;
1819+
vector<_Loop_vals_v3_t<_Iter_diff_t<_It>>> _Loop_vals;
18161820
vector<_Rx_state_frame_t<_It>> _Frames;
18171821
size_t _Frames_count;
18181822

@@ -1824,7 +1828,7 @@ private:
18241828
void _Increase_complexity_count();
18251829

18261830
void _Prepare_rep(_Node_rep*);
1827-
bool _Find_first_inner_capture_group(_Node_base*, _Loop_vals_v2_t*);
1831+
bool _Find_first_inner_capture_group(_Node_base*, _Loop_vals_v3_t<_Iter_diff_t<_It>>*);
18281832
void _Reset_capture_groups(unsigned int _First);
18291833
_It _Do_class(_Node_base*, _It);
18301834
bool _Match_pat(_Node_base*);
@@ -2321,6 +2325,8 @@ template <class _BidIt, class _Alloc, class _Elem, class _RxTraits, class _It>
23212325
bool _Regex_match1(_It _First, _It _Last, match_results<_BidIt, _Alloc>* _Matches,
23222326
const basic_regex<_Elem, _RxTraits>& _Re, regex_constants::match_flag_type _Flgs,
23232327
bool _Full) { // try to match regular expression to target text
2328+
static_assert(_Is_ranges_bidi_iter_v<_It>,
2329+
"regex_match requires bidirectional iterators or stronger. See N5014 [re.alg.match]/1.");
23242330
if (_Re._Empty()) {
23252331
return false;
23262332
}
@@ -2389,6 +2395,8 @@ _NODISCARD bool regex_match(const basic_string<_Elem, _StTraits, _StAlloc>& _Str
23892395
template <class _BidIt, class _Alloc, class _Elem, class _RxTraits, class _It>
23902396
bool _Regex_search2(_It _First, _It _Last, match_results<_BidIt, _Alloc>* _Matches,
23912397
const basic_regex<_Elem, _RxTraits>& _Re, regex_constants::match_flag_type _Flgs, _It _Org) {
2398+
static_assert(_Is_ranges_bidi_iter_v<_It>,
2399+
"regex_search requires bidirectional iterators or stronger. See N5014 [re.alg.search]/1.");
23922400
// search for regular expression match in target text
23932401
if (_Re._Empty()) {
23942402
return false;
@@ -2491,6 +2499,8 @@ _NODISCARD bool regex_search(const basic_string<_Elem, _StTraits, _StAlloc>& _St
24912499
template <class _OutIt, class _BidIt, class _RxTraits, class _Elem, class _Traits, class _Alloc>
24922500
_OutIt _Regex_replace1(_OutIt _Result, _BidIt _First, _BidIt _Last, const basic_regex<_Elem, _RxTraits>& _Re,
24932501
const basic_string<_Elem, _Traits, _Alloc>& _Fmt, regex_constants::match_flag_type _Flgs) {
2502+
static_assert(_Is_ranges_bidi_iter_v<_BidIt>,
2503+
"regex_replace requires bidirectional iterators or stronger. See N5014 [re.alg.replace].");
24942504
// search and replace
24952505
match_results<_BidIt> _Matches;
24962506
_BidIt _Pos = _First;
@@ -3422,7 +3432,7 @@ void _Matcher3<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Increase_complexity_coun
34223432

34233433
template <class _BidIt, class _Elem, class _RxTraits, class _It, class _Alloc>
34243434
void _Matcher3<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Prepare_rep(_Node_rep* _Node) {
3425-
_Loop_vals_v2_t* _Psav = &_Loop_vals[_Node->_Loop_number];
3435+
const auto _Psav = &_Loop_vals[_Node->_Loop_number];
34263436

34273437
// Determine first capture group in repetition for later capture group reset, if not done so previously.
34283438
// No capture group reset is performed for POSIX regexes,
@@ -3436,7 +3446,7 @@ void _Matcher3<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Prepare_rep(_Node_rep* _
34363446

34373447
template <class _BidIt, class _Elem, class _RxTraits, class _It, class _Alloc>
34383448
bool _Matcher3<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Find_first_inner_capture_group(
3439-
_Node_base* _Nx, _Loop_vals_v2_t* _Loop_state) {
3449+
_Node_base* _Nx, _Loop_vals_v3_t<_Iter_diff_t<_It>>* _Loop_state) {
34403450
if (0 < _Max_stack_count && --_Max_stack_count <= 0) {
34413451
_Xregex_error(regex_constants::error_stack);
34423452
}
@@ -3491,8 +3501,8 @@ bool _Matcher3<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Find_first_inner_capture
34913501

34923502
case _N_rep:
34933503
{
3494-
_Node_rep* _Inner_rep = static_cast<_Node_rep*>(_Nx);
3495-
_Loop_vals_v2_t* _Inner_loop_state = &_Loop_vals[_Inner_rep->_Loop_number];
3504+
const auto _Inner_rep = static_cast<_Node_rep*>(_Nx);
3505+
const auto _Inner_loop_state = &_Loop_vals[_Inner_rep->_Loop_number];
34963506
if (_Find_first_inner_capture_group(_Inner_rep->_Next, _Inner_loop_state)) {
34973507
_Loop_state->_Group_first = _Inner_loop_state->_Group_first;
34983508
_Found_group = true;
@@ -4078,15 +4088,16 @@ bool _Matcher3<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Match_pat(_Node_base* _N
40784088
auto& _Sav = _Loop_vals[_Node->_Loop_number];
40794089

40804090
if (_Node->_Simple_loop == 1) {
4081-
_Sav._Loop_frame_idx = _Push_frame(_Rx_unwind_ops::_Do_nothing, nullptr);
4091+
_Sav._Loop_frame_idx = _Push_frame(_Rx_unwind_ops::_Do_nothing, _Node);
40824092
_Increase_complexity_count();
40834093
if (_Node->_Min > 0 || (_Greedy && !_Longest && _Node->_Max != 0)) { // try a rep first
40844094
_Sav._Loop_idx = 1;
40854095
// _Next is already assigned correctly for matching a rep
40864096

40874097
// set up stack unwinding for greedy matching if no rep is allowed
40884098
if (_Node->_Min == 0) {
4089-
_Push_frame(_Rx_unwind_ops::_Loop_simple_greedy, _Node);
4099+
auto& _Frame = _Frames[_Sav._Loop_frame_idx];
4100+
_Frame._Code = _Rx_unwind_ops::_Loop_simple_greedy_firstrep;
40904101
}
40914102
} else { // try tail first
40924103
_Sav._Loop_idx = 0;
@@ -4136,37 +4147,79 @@ bool _Matcher3<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Match_pat(_Node_base* _N
41364147
auto& _Sav = _Loop_vals[_Nr->_Loop_number];
41374148
bool _Greedy = (_Nr->_Flags & _Fl_greedy) != 0;
41384149
if (_Nr->_Simple_loop != 0) {
4139-
if (_Sav._Loop_idx == 1
4140-
&& _Tgt_state._Cur == _Frames[_Sav._Loop_frame_idx]._Pos) { // initial match empty
4141-
// loop is branchless, so it will only ever match empty strings
4142-
// -> we only try tail for POSIX or if minimum number of reps is non-zero
4143-
// _Next is already assigned correctly for matching tail
4144-
4145-
if (!(_Sflags & regex_constants::_Any_posix) && _Nr->_Min == 0) {
4146-
_Failed = true;
4150+
if (_Sav._Loop_idx == 1) {
4151+
auto& _Base_frame = _Frames[_Sav._Loop_frame_idx];
4152+
_Sav._Loop_length = _STD distance(_Base_frame._Pos, _Tgt_state._Cur);
4153+
4154+
if (_Sav._Loop_length == _Iter_diff_t<_It>{}) { // initial match empty
4155+
// loop is branchless, so it will only ever match empty strings
4156+
// -> we only try tail for POSIX or if minimum number of reps is non-zero
4157+
// _Next is already assigned correctly for matching tail
4158+
if (!(_Sflags & regex_constants::_Any_posix) && _Nr->_Min == 0) {
4159+
_Failed = true;
4160+
} else {
4161+
_Increase_complexity_count();
4162+
}
4163+
break;
41474164
}
4148-
} else if (_Sav._Loop_idx < _Nr->_Min) { // at least one more rep to reach minimum
4165+
4166+
// allocate stack frame holding loop-specific unwinding opcode for second rep and beyond
4167+
auto _New_frame_code = _Base_frame._Code == _Rx_unwind_ops::_Loop_simple_greedy_firstrep
4168+
? _Rx_unwind_ops::_Loop_simple_greedy_lastrep
4169+
: _Rx_unwind_ops::_Do_nothing;
4170+
auto _New_frame_idx = _Push_frame(_New_frame_code, _Nr);
4171+
_Frames[_New_frame_idx]._Loop_frame_idx_sav = _Sav._Loop_frame_idx;
4172+
_Sav._Loop_frame_idx = _New_frame_idx;
4173+
} else { // discard stack frames for capturing group changes generated by this rep
4174+
_Frames_count = _Sav._Loop_frame_idx + 1U;
4175+
}
4176+
4177+
if (_Sav._Loop_idx < _Nr->_Min) { // at least one more rep to reach minimum
41494178
_Next = _Nr->_Next;
41504179
++_Sav._Loop_idx;
4151-
} else if (_Greedy && !_Longest && _Sav._Loop_idx != _Nr->_Max) { // one more rep to try next
4152-
// set up stack unwinding for greedy matching
4153-
_Push_frame(_Rx_unwind_ops::_Loop_simple_greedy, _Nr);
4180+
} else if (_Greedy && !_Longest) { // greedy matching
4181+
auto& _Frame = _Frames[_Sav._Loop_frame_idx];
4182+
if (_Frame._Code == _Rx_unwind_ops::_Do_nothing) { // min reps reached
4183+
_Frame._Code = _Rx_unwind_ops::_Loop_simple_greedy_lastrep;
4184+
// set iterator in base frame to start of prior rep
4185+
// (so to start of rep before reaching min reps)
4186+
auto& _Before_unwind_pos = _Frames[_Frame._Loop_frame_idx_sav]._Pos;
4187+
_Before_unwind_pos = _Tgt_state._Cur;
4188+
_STD advance(_Before_unwind_pos, -_Sav._Loop_length);
4189+
} else {
4190+
_STL_INTERNAL_CHECK(_Frame._Code == _Rx_unwind_ops::_Loop_simple_greedy_lastrep);
4191+
}
4192+
_Frame._Pos = _Tgt_state._Cur;
41544193

4155-
_Next = _Nr->_Next;
4156-
if (_Sav._Loop_idx < INT_MAX) { // avoid overflowing _Loop_idx
4157-
++_Sav._Loop_idx;
4194+
if (_Sav._Loop_idx != _Nr->_Max) { // try one more rep
4195+
_Next = _Nr->_Next;
4196+
if (_Sav._Loop_idx < INT_MAX) { // avoid overflowing _Loop_idx
4197+
++_Sav._Loop_idx;
4198+
}
4199+
} else { // try tail
4200+
_STD advance(_Frame._Pos, -_Sav._Loop_length);
4201+
if (_Frame._Pos != _Frames[_Frame._Loop_frame_idx_sav]._Pos) {
4202+
// capturing groups must be shifted when backtracking from tail
4203+
_Frame._Code = _Rx_unwind_ops::_Loop_simple_greedy_intermediaterep;
4204+
} else {
4205+
--_Frames_count;
4206+
}
4207+
// _Next is already assigned correctly for matching tail
41584208
}
4159-
} else { // non-greedy matching or greedy matching with maximum reached
4209+
} else { // non-greedy matching
41604210
// set up stack unwinding for non-greedy matching if one more rep is allowed
41614211
if (_Sav._Loop_idx != _Nr->_Max) {
4162-
_Push_frame(_Rx_unwind_ops::_Loop_simple_nongreedy, _Nr);
4212+
auto& _Frame = _Frames[_Sav._Loop_frame_idx];
4213+
_Frame._Pos = _Tgt_state._Cur;
4214+
_Frame._Code = _Rx_unwind_ops::_Loop_simple_nongreedy;
4215+
_Frame._Node = _Nr;
4216+
} else {
4217+
--_Frames_count;
41634218
}
41644219
// _Next is already assigned correctly for matching tail
41654220
}
41664221

4167-
if (!_Failed) {
4168-
_Increase_complexity_count();
4169-
}
4222+
_Increase_complexity_count();
41704223
} else {
41714224
const bool _Progress = _Frames[_Sav._Loop_frame_idx]._Pos != _Tgt_state._Cur;
41724225
if (_Sav._Loop_idx < _Nr->_Min) { // try another required match
@@ -4327,8 +4380,8 @@ bool _Matcher3<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Match_pat(_Node_base* _N
43274380
}
43284381
break;
43294382

4330-
case _Rx_unwind_ops::_Loop_simple_greedy:
4331-
// try tail if matching one more rep failed
4383+
case _Rx_unwind_ops::_Loop_simple_greedy_firstrep:
4384+
// try tail after backtracking from first rep
43324385
if (_Failed) {
43334386
auto _Node = static_cast<_Node_rep*>(_Frame._Node);
43344387

@@ -4339,6 +4392,50 @@ bool _Matcher3<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Match_pat(_Node_base* _N
43394392
}
43404393
break;
43414394

4395+
case _Rx_unwind_ops::_Loop_simple_greedy_intermediaterep:
4396+
// shift capturing groups, set up unwinding for the prior rep, and try tail
4397+
// when backtracking between the second and the last attempted rep
4398+
if (_Failed) {
4399+
auto _Node = static_cast<_Node_rep*>(_Frame._Node);
4400+
4401+
// adjust capturing group begin and end iterators by rep length
4402+
auto& _Sav = _Loop_vals[_Node->_Loop_number];
4403+
for (auto _Capture_frame_idx = _Frame._Loop_frame_idx_sav + 1U; _Capture_frame_idx != _Frames_count;
4404+
++_Capture_frame_idx) {
4405+
const auto& _Capture_frame = _Frames[_Capture_frame_idx];
4406+
_STL_INTERNAL_CHECK(_Capture_frame._Code == _Rx_unwind_ops::_Capture_restore_begin
4407+
|| _Capture_frame._Code == _Rx_unwind_ops::_Capture_restore_matched_end
4408+
|| _Capture_frame._Code == _Rx_unwind_ops::_Capture_restore_unmatched_end);
4409+
auto& _Grp = _Tgt_state._Grps[_Capture_frame._Capture_idx];
4410+
_STD advance(
4411+
_Capture_frame._Code == _Rx_unwind_ops::_Capture_restore_begin ? _Grp._Begin : _Grp._End,
4412+
-_Sav._Loop_length);
4413+
}
4414+
}
4415+
_FALLTHROUGH;
4416+
4417+
case _Rx_unwind_ops::_Loop_simple_greedy_lastrep:
4418+
// set up unwinding for the prior rep and try tail
4419+
// when backtracking from last attempted rep
4420+
if (_Failed) {
4421+
auto _Node = static_cast<_Node_rep*>(_Frame._Node);
4422+
4423+
_Increase_complexity_count();
4424+
_Nx = _Node->_End_rep->_Next;
4425+
_Tgt_state._Cur = _Frame._Pos;
4426+
_Failed = false;
4427+
4428+
auto& _Sav = _Loop_vals[_Node->_Loop_number];
4429+
_STD advance(_Frame._Pos, -_Sav._Loop_length);
4430+
4431+
// set up unwinding if prior rep is not first or minimum rep
4432+
if (_Frames[_Frame._Loop_frame_idx_sav]._Pos != _Frame._Pos) {
4433+
_Frame._Code = _Rx_unwind_ops::_Loop_simple_greedy_intermediaterep;
4434+
++_Frames_count;
4435+
}
4436+
}
4437+
break;
4438+
43424439
case _Rx_unwind_ops::_Loop_greedy:
43434440
// try tail if matching one more rep failed
43444441
if (_Failed) {

tests/std/tests/VSO_0000000_regex_use/test.cpp

Lines changed: 31 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -2361,6 +2361,36 @@ void test_gh_5918() {
23612361
g_regexTester.should_match("ababa", R"((?:(a)(?:|b\1b)){2})");
23622362
}
23632363

2364+
void test_gh_5939() {
2365+
// GH-5939: Avoid stack growth in simple loops
2366+
// This PR manipulates the stack while processing simple loops to avoid growing it.
2367+
// The following tests verify that backtracking from such loops still works
2368+
// and matches capturing groups even with these modifications to the stack.
2369+
g_regexTester.should_match("abcdd", R"(([abc])*?abcd\1d)");
2370+
2371+
g_regexTester.should_match("abb", R"((a)*ab\1b)");
2372+
g_regexTester.should_match("abb", R"((a){0,1}ab\1b)");
2373+
g_regexTester.should_not_match("abb", R"((a){1,1}ab\1b)");
2374+
g_regexTester.should_not_match("abb", R"((a){1,2}ab\1b)");
2375+
g_regexTester.should_match("aabab", R"((a){1,2}ab\1b)");
2376+
g_regexTester.should_match("abcdab", R"((?:([abc])([abc]))*cd\1\2)");
2377+
g_regexTester.should_match("abcdab", R"((?:([abc])([abc])){0,1}cd\1\2)");
2378+
g_regexTester.should_match("abbacdba", R"((?:([abc])([abc]))*cd\1\2)");
2379+
g_regexTester.should_match("abbacdab", R"((?:([abc])([abc]))*bacd\1\2)");
2380+
g_regexTester.should_match("abbacd", R"((?:([abc])([abc]))*abbacd\1\2)");
2381+
g_regexTester.should_match("abbacdba", R"((?:([abc])([abc]))+cd\1\2)");
2382+
g_regexTester.should_match("abbacdab", R"((?:([abc])([abc]))+bacd\1\2)");
2383+
g_regexTester.should_match("abbacdab", R"((?:([abc])([abc])){0,2}bacd\1\2)");
2384+
g_regexTester.should_match("abbacdab", R"((?:([abc])([abc])){1,2}bacd\1\2)");
2385+
g_regexTester.should_not_match("abbacdab", R"((?:([abc])([abc]))+abbacd\1\2)");
2386+
g_regexTester.should_match("abbacdba", R"((?:([abc])([abc])){2,}cd\1\2)");
2387+
g_regexTester.should_not_match("abbacdab", R"((?:([abc])([abc])){2,}bacd\1\2)");
2388+
g_regexTester.should_not_match("abbacdab", R"((?:([abc])([abc])){2,}abbacd\1\2)");
2389+
g_regexTester.should_match("abcbbacdba", R"((?:([abc])([abc])){2,}cd\1\2)");
2390+
g_regexTester.should_match("abcbbacdcb", R"((?:([abc])([abc])){2,}bacd\1\2)");
2391+
g_regexTester.should_not_match("abcbbacdab", R"((?:([abc])([abc])){2,}abbacd\1\2)");
2392+
}
2393+
23642394
int main() {
23652395
test_dev10_449367_case_insensitivity_should_work();
23662396
test_dev11_462743_regex_collate_should_not_disable_regex_icase();
@@ -2420,6 +2450,7 @@ int main() {
24202450
test_gh_5798();
24212451
test_gh_5865();
24222452
test_gh_5918();
2453+
test_gh_5939();
24232454

24242455
return g_regexTester.result();
24252456
}

0 commit comments

Comments
 (0)