Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions benchmarks/src/regex_match.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ BENCHMARK_CAPTURE(bm_match_sequence_of_as, "a*", "a*")->Apply(common_args);
// Register bm_match_sequence_of_as once per loop pattern. In each registration the capture label and
// the pattern argument are the same string. The patterns cover greedy and non-greedy loops, with
// capturing and non-capturing groups, and with and without an alternation inside the loop.
// NOTE(review): presumably the benchmark matches each pattern against a run of 'a' characters -
// confirm against the definition of bm_match_sequence_of_as.
BENCHMARK_CAPTURE(bm_match_sequence_of_as, "a*?", "a*?")->Apply(common_args);
BENCHMARK_CAPTURE(bm_match_sequence_of_as, "(?:a)*", "(?:a)*")->Apply(common_args);
BENCHMARK_CAPTURE(bm_match_sequence_of_as, "(a)*", "(a)*")->Apply(common_args);
BENCHMARK_CAPTURE(bm_match_sequence_of_as, "(a)*?", "(a)*?")->Apply(common_args);
BENCHMARK_CAPTURE(bm_match_sequence_of_as, "(?:b|a)*", "(?:b|a)*")->Apply(common_args);
BENCHMARK_CAPTURE(bm_match_sequence_of_as, "(b|a)*", "(b|a)*")->Apply(common_args);
BENCHMARK_CAPTURE(bm_match_sequence_of_as, "(a)(?:b|a)*", "(a)(?:b|a)*")->Apply(common_args);
Expand Down
161 changes: 129 additions & 32 deletions stl/inc/regex
Original file line number Diff line number Diff line change
Expand Up @@ -1569,10 +1569,12 @@ public:
_Node_end_rep& operator=(const _Node_end_rep&) = delete;
};

struct _Loop_vals_v2_t { // storage for loop administration
// Per-loop bookkeeping used by the matcher; entries are looked up by a repetition node's _Loop_number.
// _Diff is the difference type of the iterators being matched (the matcher uses _Iter_diff_t<_It>).
template <class _Diff>
struct _Loop_vals_v3_t { // storage for loop administration
    // index into the matcher's frame stack of the frame currently associated with this loop
    size_t _Loop_frame_idx{};
    // number of the repetition being attempted; stays 0 while the tail is tried before any repetition
    int _Loop_idx{};
    // first capture group found inside the repetition, recorded for later capture group reset
    unsigned int _Group_first{};
    // simple loops only: iterator distance covered by the first repetition
    _Diff _Loop_length{};
};

class _Node_rep : public _Node_base { // node that marks the beginning of a repetition
Expand Down Expand Up @@ -1677,7 +1679,9 @@ enum class _Rx_unwind_ops {
_Disjunction_eval_alt_always,
_Do_nothing,
_Loop_simple_nongreedy,
_Loop_simple_greedy,
_Loop_simple_greedy_firstrep,
_Loop_simple_greedy_intermediaterep,
_Loop_simple_greedy_lastrep,
_Loop_nongreedy,
_Loop_greedy,
_Loop_restore_vals,
Expand Down Expand Up @@ -1812,7 +1816,7 @@ public:
private:
_Tgt_state_t<_It> _Tgt_state;
_Tgt_state_t<_It> _Res;
vector<_Loop_vals_v2_t> _Loop_vals;
vector<_Loop_vals_v3_t<_Iter_diff_t<_It>>> _Loop_vals;
vector<_Rx_state_frame_t<_It>> _Frames;
size_t _Frames_count;

Expand All @@ -1824,7 +1828,7 @@ private:
void _Increase_complexity_count();

void _Prepare_rep(_Node_rep*);
bool _Find_first_inner_capture_group(_Node_base*, _Loop_vals_v2_t*);
bool _Find_first_inner_capture_group(_Node_base*, _Loop_vals_v3_t<_Iter_diff_t<_It>>*);
void _Reset_capture_groups(unsigned int _First);
_It _Do_class(_Node_base*, _It);
bool _Match_pat(_Node_base*);
Expand Down Expand Up @@ -2321,6 +2325,8 @@ template <class _BidIt, class _Alloc, class _Elem, class _RxTraits, class _It>
bool _Regex_match1(_It _First, _It _Last, match_results<_BidIt, _Alloc>* _Matches,
const basic_regex<_Elem, _RxTraits>& _Re, regex_constants::match_flag_type _Flgs,
bool _Full) { // try to match regular expression to target text
static_assert(_Is_ranges_bidi_iter_v<_It>,
"regex_match requires bidirectional iterators or stronger. See N5014 [re.alg.match]/1.");
if (_Re._Empty()) {
return false;
}
Expand Down Expand Up @@ -2389,6 +2395,8 @@ _NODISCARD bool regex_match(const basic_string<_Elem, _StTraits, _StAlloc>& _Str
template <class _BidIt, class _Alloc, class _Elem, class _RxTraits, class _It>
bool _Regex_search2(_It _First, _It _Last, match_results<_BidIt, _Alloc>* _Matches,
const basic_regex<_Elem, _RxTraits>& _Re, regex_constants::match_flag_type _Flgs, _It _Org) {
static_assert(_Is_ranges_bidi_iter_v<_It>,
"regex_search requires bidirectional iterators or stronger. See N5014 [re.alg.search]/1.");
// search for regular expression match in target text
if (_Re._Empty()) {
return false;
Expand Down Expand Up @@ -2491,6 +2499,8 @@ _NODISCARD bool regex_search(const basic_string<_Elem, _StTraits, _StAlloc>& _St
template <class _OutIt, class _BidIt, class _RxTraits, class _Elem, class _Traits, class _Alloc>
_OutIt _Regex_replace1(_OutIt _Result, _BidIt _First, _BidIt _Last, const basic_regex<_Elem, _RxTraits>& _Re,
const basic_string<_Elem, _Traits, _Alloc>& _Fmt, regex_constants::match_flag_type _Flgs) {
static_assert(_Is_ranges_bidi_iter_v<_BidIt>,
"regex_replace requires bidirectional iterators or stronger. See N5014 [re.alg.replace].");
// search and replace
match_results<_BidIt> _Matches;
_BidIt _Pos = _First;
Expand Down Expand Up @@ -3422,7 +3432,7 @@ void _Matcher3<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Increase_complexity_coun

template <class _BidIt, class _Elem, class _RxTraits, class _It, class _Alloc>
void _Matcher3<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Prepare_rep(_Node_rep* _Node) {
_Loop_vals_v2_t* _Psav = &_Loop_vals[_Node->_Loop_number];
const auto _Psav = &_Loop_vals[_Node->_Loop_number];

// Determine first capture group in repetition for later capture group reset, if not done so previously.
// No capture group reset is performed for POSIX regexes,
Expand All @@ -3436,7 +3446,7 @@ void _Matcher3<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Prepare_rep(_Node_rep* _

template <class _BidIt, class _Elem, class _RxTraits, class _It, class _Alloc>
bool _Matcher3<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Find_first_inner_capture_group(
_Node_base* _Nx, _Loop_vals_v2_t* _Loop_state) {
_Node_base* _Nx, _Loop_vals_v3_t<_Iter_diff_t<_It>>* _Loop_state) {
if (0 < _Max_stack_count && --_Max_stack_count <= 0) {
_Xregex_error(regex_constants::error_stack);
}
Expand Down Expand Up @@ -3491,8 +3501,8 @@ bool _Matcher3<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Find_first_inner_capture

case _N_rep:
{
_Node_rep* _Inner_rep = static_cast<_Node_rep*>(_Nx);
_Loop_vals_v2_t* _Inner_loop_state = &_Loop_vals[_Inner_rep->_Loop_number];
const auto _Inner_rep = static_cast<_Node_rep*>(_Nx);
const auto _Inner_loop_state = &_Loop_vals[_Inner_rep->_Loop_number];
if (_Find_first_inner_capture_group(_Inner_rep->_Next, _Inner_loop_state)) {
_Loop_state->_Group_first = _Inner_loop_state->_Group_first;
_Found_group = true;
Expand Down Expand Up @@ -4078,15 +4088,16 @@ bool _Matcher3<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Match_pat(_Node_base* _N
auto& _Sav = _Loop_vals[_Node->_Loop_number];

if (_Node->_Simple_loop == 1) {
_Sav._Loop_frame_idx = _Push_frame(_Rx_unwind_ops::_Do_nothing, nullptr);
_Sav._Loop_frame_idx = _Push_frame(_Rx_unwind_ops::_Do_nothing, _Node);
_Increase_complexity_count();
if (_Node->_Min > 0 || (_Greedy && !_Longest && _Node->_Max != 0)) { // try a rep first
_Sav._Loop_idx = 1;
// _Next is already assigned correctly for matching a rep

// set up stack unwinding for greedy matching if no rep is allowed
if (_Node->_Min == 0) {
_Push_frame(_Rx_unwind_ops::_Loop_simple_greedy, _Node);
auto& _Frame = _Frames[_Sav._Loop_frame_idx];
_Frame._Code = _Rx_unwind_ops::_Loop_simple_greedy_firstrep;
}
} else { // try tail first
_Sav._Loop_idx = 0;
Expand Down Expand Up @@ -4136,37 +4147,79 @@ bool _Matcher3<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Match_pat(_Node_base* _N
auto& _Sav = _Loop_vals[_Nr->_Loop_number];
bool _Greedy = (_Nr->_Flags & _Fl_greedy) != 0;
if (_Nr->_Simple_loop != 0) {
if (_Sav._Loop_idx == 1
&& _Tgt_state._Cur == _Frames[_Sav._Loop_frame_idx]._Pos) { // initial match empty
// loop is branchless, so it will only ever match empty strings
// -> we only try tail for POSIX or if minimum number of reps is non-zero
// _Next is already assigned correctly for matching tail

if (!(_Sflags & regex_constants::_Any_posix) && _Nr->_Min == 0) {
_Failed = true;
if (_Sav._Loop_idx == 1) {
auto& _Base_frame = _Frames[_Sav._Loop_frame_idx];
_Sav._Loop_length = _STD distance(_Base_frame._Pos, _Tgt_state._Cur);

if (_Sav._Loop_length == _Iter_diff_t<_It>{}) { // initial match empty
// loop is branchless, so it will only ever match empty strings
// -> we only try tail for POSIX or if minimum number of reps is non-zero
// _Next is already assigned correctly for matching tail
if (!(_Sflags & regex_constants::_Any_posix) && _Nr->_Min == 0) {
_Failed = true;
} else {
_Increase_complexity_count();
}
break;
}
} else if (_Sav._Loop_idx < _Nr->_Min) { // at least one more rep to reach minimum

// allocate stack frame holding loop-specific unwinding opcode for second rep and beyond
auto _New_frame_code = _Base_frame._Code == _Rx_unwind_ops::_Loop_simple_greedy_firstrep
? _Rx_unwind_ops::_Loop_simple_greedy_lastrep
: _Rx_unwind_ops::_Do_nothing;
auto _New_frame_idx = _Push_frame(_New_frame_code, _Nr);
_Frames[_New_frame_idx]._Loop_frame_idx_sav = _Sav._Loop_frame_idx;
_Sav._Loop_frame_idx = _New_frame_idx;
} else { // discard stack frames for capturing group changes generated by this rep
_Frames_count = _Sav._Loop_frame_idx + 1U;
}

if (_Sav._Loop_idx < _Nr->_Min) { // at least one more rep to reach minimum
_Next = _Nr->_Next;
++_Sav._Loop_idx;
} else if (_Greedy && !_Longest && _Sav._Loop_idx != _Nr->_Max) { // one more rep to try next
// set up stack unwinding for greedy matching
_Push_frame(_Rx_unwind_ops::_Loop_simple_greedy, _Nr);
} else if (_Greedy && !_Longest) { // greedy matching
auto& _Frame = _Frames[_Sav._Loop_frame_idx];
if (_Frame._Code == _Rx_unwind_ops::_Do_nothing) { // min reps reached
_Frame._Code = _Rx_unwind_ops::_Loop_simple_greedy_lastrep;
// set iterator in base frame to start of prior rep
// (so to start of rep before reaching min reps)
auto& _Before_unwind_pos = _Frames[_Frame._Loop_frame_idx_sav]._Pos;
_Before_unwind_pos = _Tgt_state._Cur;
_STD advance(_Before_unwind_pos, -_Sav._Loop_length);
} else {
_STL_INTERNAL_CHECK(_Frame._Code == _Rx_unwind_ops::_Loop_simple_greedy_lastrep);
}
_Frame._Pos = _Tgt_state._Cur;

_Next = _Nr->_Next;
if (_Sav._Loop_idx < INT_MAX) { // avoid overflowing _Loop_idx
++_Sav._Loop_idx;
if (_Sav._Loop_idx != _Nr->_Max) { // try one more rep
_Next = _Nr->_Next;
if (_Sav._Loop_idx < INT_MAX) { // avoid overflowing _Loop_idx
++_Sav._Loop_idx;
}
} else { // try tail
_STD advance(_Frame._Pos, -_Sav._Loop_length);
if (_Frame._Pos != _Frames[_Frame._Loop_frame_idx_sav]._Pos) {
// capturing groups must be shifted when backtracking from tail
_Frame._Code = _Rx_unwind_ops::_Loop_simple_greedy_intermediaterep;
} else {
--_Frames_count;
}
// _Next is already assigned correctly for matching tail
}
} else { // non-greedy matching or greedy matching with maximum reached
} else { // non-greedy matching
// set up stack unwinding for non-greedy matching if one more rep is allowed
if (_Sav._Loop_idx != _Nr->_Max) {
_Push_frame(_Rx_unwind_ops::_Loop_simple_nongreedy, _Nr);
auto& _Frame = _Frames[_Sav._Loop_frame_idx];
_Frame._Pos = _Tgt_state._Cur;
_Frame._Code = _Rx_unwind_ops::_Loop_simple_nongreedy;
_Frame._Node = _Nr;
} else {
--_Frames_count;
}
// _Next is already assigned correctly for matching tail
}

if (!_Failed) {
_Increase_complexity_count();
}
_Increase_complexity_count();
} else {
const bool _Progress = _Frames[_Sav._Loop_frame_idx]._Pos != _Tgt_state._Cur;
if (_Sav._Loop_idx < _Nr->_Min) { // try another required match
Expand Down Expand Up @@ -4327,8 +4380,8 @@ bool _Matcher3<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Match_pat(_Node_base* _N
}
break;

case _Rx_unwind_ops::_Loop_simple_greedy:
// try tail if matching one more rep failed
case _Rx_unwind_ops::_Loop_simple_greedy_firstrep:
// try tail after backtracking from first rep
if (_Failed) {
auto _Node = static_cast<_Node_rep*>(_Frame._Node);

Expand All @@ -4339,6 +4392,50 @@ bool _Matcher3<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Match_pat(_Node_base* _N
}
break;

case _Rx_unwind_ops::_Loop_simple_greedy_intermediaterep:
// shift capturing groups, set up unwinding prior rep and try tail
// when backtracking between the second and the last attempted rep
if (_Failed) {
auto _Node = static_cast<_Node_rep*>(_Frame._Node);

// adjust capturing group begin and end iterators by rep length
auto& _Sav = _Loop_vals[_Node->_Loop_number];
for (auto _Capture_frame_idx = _Frame._Loop_frame_idx_sav + 1U; _Capture_frame_idx != _Frames_count;
++_Capture_frame_idx) {
const auto& _Capture_frame = _Frames[_Capture_frame_idx];
_STL_INTERNAL_CHECK(_Capture_frame._Code == _Rx_unwind_ops::_Capture_restore_begin
|| _Capture_frame._Code == _Rx_unwind_ops::_Capture_restore_matched_end
|| _Capture_frame._Code == _Rx_unwind_ops::_Capture_restore_unmatched_end);
auto& _Grp = _Tgt_state._Grps[_Capture_frame._Capture_idx];
_STD advance(
_Capture_frame._Code == _Rx_unwind_ops::_Capture_restore_begin ? _Grp._Begin : _Grp._End,
-_Sav._Loop_length);
}
}
_FALLTHROUGH;

case _Rx_unwind_ops::_Loop_simple_greedy_lastrep:
// set up unwinding prior rep and try tail
// when backtracking from last attempted rep
if (_Failed) {
auto _Node = static_cast<_Node_rep*>(_Frame._Node);

_Increase_complexity_count();
_Nx = _Node->_End_rep->_Next;
_Tgt_state._Cur = _Frame._Pos;
_Failed = false;

auto& _Sav = _Loop_vals[_Node->_Loop_number];
_STD advance(_Frame._Pos, -_Sav._Loop_length);

// set up unwinding if prior rep is not first or minimum rep
if (_Frames[_Frame._Loop_frame_idx_sav]._Pos != _Frame._Pos) {
_Frame._Code = _Rx_unwind_ops::_Loop_simple_greedy_intermediaterep;
++_Frames_count;
}
}
break;

case _Rx_unwind_ops::_Loop_greedy:
// try tail if matching one more rep failed
if (_Failed) {
Expand Down
31 changes: 31 additions & 0 deletions tests/std/tests/VSO_0000000_regex_use/test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2361,6 +2361,36 @@ void test_gh_5918() {
g_regexTester.should_match("ababa", R"((?:(a)(?:|b\1b)){2})");
}

void test_gh_5939() {
    // GH-5939: Avoid stack growth in simple loops
    // The matcher rewrites its stack while processing simple loops instead of growing it.
    // Every case below has to backtrack out of such a loop and then satisfy a backreference,
    // so it verifies that capturing groups are still matched correctly after backtracking.
    struct test_case {
        bool expect_match;
        const char* subject;
        const char* pattern;
    };

    static const test_case cases[] = {
        // non-greedy simple loop
        {true, "abcdd", R"(([abc])*?abcd\1d)"},

        // greedy simple loops
        {true, "abb", R"((a)*ab\1b)"},
        {true, "abb", R"((a){0,1}ab\1b)"},
        {false, "abb", R"((a){1,1}ab\1b)"},
        {false, "abb", R"((a){1,2}ab\1b)"},
        {true, "aabab", R"((a){1,2}ab\1b)"},
        {true, "abcdab", R"((?:([abc])([abc]))*cd\1\2)"},
        {true, "abcdab", R"((?:([abc])([abc])){0,1}cd\1\2)"},
        {true, "abbacdba", R"((?:([abc])([abc]))*cd\1\2)"},
        {true, "abbacdab", R"((?:([abc])([abc]))*bacd\1\2)"},
        {true, "abbacd", R"((?:([abc])([abc]))*abbacd\1\2)"},
        {true, "abbacdba", R"((?:([abc])([abc]))+cd\1\2)"},
        {true, "abbacdab", R"((?:([abc])([abc]))+bacd\1\2)"},
        {true, "abbacdab", R"((?:([abc])([abc])){0,2}bacd\1\2)"},
        {true, "abbacdab", R"((?:([abc])([abc])){1,2}bacd\1\2)"},
        {false, "abbacdab", R"((?:([abc])([abc]))+abbacd\1\2)"},
        {true, "abbacdba", R"((?:([abc])([abc])){2,}cd\1\2)"},
        {false, "abbacdab", R"((?:([abc])([abc])){2,}bacd\1\2)"},
        {false, "abbacdab", R"((?:([abc])([abc])){2,}abbacd\1\2)"},
        {true, "abcbbacdba", R"((?:([abc])([abc])){2,}cd\1\2)"},
        {true, "abcbbacdcb", R"((?:([abc])([abc])){2,}bacd\1\2)"},
        {false, "abcbbacdab", R"((?:([abc])([abc])){2,}abbacd\1\2)"},
    };

    // Run the cases in table order so the tester sees the same sequence of calls.
    for (const auto& tc : cases) {
        if (tc.expect_match) {
            g_regexTester.should_match(tc.subject, tc.pattern);
        } else {
            g_regexTester.should_not_match(tc.subject, tc.pattern);
        }
    }
}

int main() {
test_dev10_449367_case_insensitivity_should_work();
test_dev11_462743_regex_collate_should_not_disable_regex_icase();
Expand Down Expand Up @@ -2420,6 +2450,7 @@ int main() {
test_gh_5798();
test_gh_5865();
test_gh_5918();
test_gh_5939();

return g_regexTester.result();
}