1 // class template regex -*- C++ -*- 2 3 // Copyright (C) 2013-2021 Free Software Foundation, Inc. 4 // 5 // This file is part of the GNU ISO C++ Library. This library is free 6 // software; you can redistribute it and/or modify it under the 7 // terms of the GNU General Public License as published by the 8 // Free Software Foundation; either version 3, or (at your option) 9 // any later version. 10 11 // This library is distributed in the hope that it will be useful, 12 // but WITHOUT ANY WARRANTY; without even the implied warranty of 13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 // GNU General Public License for more details. 15 16 // Under Section 7 of GPL version 3, you are granted additional 17 // permissions described in the GCC Runtime Library Exception, version 18 // 3.1, as published by the Free Software Foundation. 19 20 // You should have received a copy of the GNU General Public License and 21 // a copy of the GCC Runtime Library Exception along with this program; 22 // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see 23 // <http://www.gnu.org/licenses/>. 24 25 /** 26 * @file bits/regex_compiler.tcc 27 * This is an internal header file, included by other library headers. 28 * Do not attempt to use it directly. @headername{regex} 29 */ 30 31 // FIXME make comments doxygen format. 32 33 /* 34 // This compiler refers to "Regular Expression Matching Can Be Simple And Fast" 35 // (http://swtch.com/~rsc/regexp/regexp1.html), 36 // but doesn't strictly follow it. 37 // 38 // When compiling, states are *chained* instead of tree- or graph-constructed. 39 // It's more like structured programs: there's if statement and loop statement. 40 // 41 // For alternative structure (say "a|b"), aka "if statement", two branches 42 // should be constructed. However, these two shall merge to an "end_tag" at 43 // the end of this operator: 44 // 45 // branch1 46 // / \ 47 // => begin_tag end_tag => 48 // \ / 49 // branch2 50 // 51 // This is the difference between this implementation and that in Russ's 52 // article. 53 // 54 // That's why we introduced dummy node here ------ "end_tag" is a dummy node. 55 // All dummy nodes will be eliminated at the end of compilation. 56 */ 57 58 namespace std _GLIBCXX_VISIBILITY(default) 59 { 60 _GLIBCXX_BEGIN_NAMESPACE_VERSION 61 62 namespace __detail 63 { 64 template<typename _TraitsT> 65 _Compiler<_TraitsT>:: _Compiler(const _CharT * __b,const _CharT * __e,const typename _TraitsT::locale_type & __loc,_FlagT __flags)66 _Compiler(const _CharT* __b, const _CharT* __e, 67 const typename _TraitsT::locale_type& __loc, _FlagT __flags) 68 : _M_flags(_S_validate(__flags)), 69 _M_scanner(__b, __e, _M_flags, __loc), 70 _M_nfa(make_shared<_RegexT>(__loc, _M_flags)), 71 _M_traits(_M_nfa->_M_traits), 72 _M_ctype(std::use_facet<_CtypeT>(__loc)) 73 { 74 _StateSeqT __r(*_M_nfa, _M_nfa->_M_start()); 75 __r._M_append(_M_nfa->_M_insert_subexpr_begin()); 76 this->_M_disjunction(); 77 if (!_M_match_token(_ScannerT::_S_token_eof)) 78 __throw_regex_error(regex_constants::error_paren); 79 __r._M_append(_M_pop()); 80 __glibcxx_assert(_M_stack.empty()); 81 __r._M_append(_M_nfa->_M_insert_subexpr_end()); 82 __r._M_append(_M_nfa->_M_insert_accept()); 83 _M_nfa->_M_eliminate_dummy(); 84 } 85 86 template<typename _TraitsT> 87 void 88 _Compiler<_TraitsT>:: _M_disjunction()89 _M_disjunction() 90 { 91 this->_M_alternative(); 92 while (_M_match_token(_ScannerT::_S_token_or)) 93 { 94 _StateSeqT __alt1 = _M_pop(); 95 this->_M_alternative(); 96 _StateSeqT __alt2 = _M_pop(); 97 auto __end = _M_nfa->_M_insert_dummy(); 98 __alt1._M_append(__end); 99 __alt2._M_append(__end); 100 // __alt2 is state._M_next, __alt1 is state._M_alt. The executor 101 // executes _M_alt before _M_next, as well as executing left 102 // alternative before right one. 103 _M_stack.push(_StateSeqT(*_M_nfa, 104 _M_nfa->_M_insert_alt( 105 __alt2._M_start, __alt1._M_start, false), 106 __end)); 107 } 108 } 109 110 template<typename _TraitsT> 111 void 112 _Compiler<_TraitsT>:: _M_alternative()113 _M_alternative() 114 { 115 if (this->_M_term()) 116 { 117 _StateSeqT __re = _M_pop(); 118 this->_M_alternative(); 119 __re._M_append(_M_pop()); 120 _M_stack.push(__re); 121 } 122 else 123 _M_stack.push(_StateSeqT(*_M_nfa, _M_nfa->_M_insert_dummy())); 124 } 125 126 template<typename _TraitsT> 127 bool 128 _Compiler<_TraitsT>:: _M_term()129 _M_term() 130 { 131 if (this->_M_assertion()) 132 return true; 133 if (this->_M_atom()) 134 { 135 while (this->_M_quantifier()) 136 ; 137 return true; 138 } 139 return false; 140 } 141 142 template<typename _TraitsT> 143 bool 144 _Compiler<_TraitsT>:: _M_assertion()145 _M_assertion() 146 { 147 if (_M_match_token(_ScannerT::_S_token_line_begin)) 148 _M_stack.push(_StateSeqT(*_M_nfa, _M_nfa->_M_insert_line_begin())); 149 else if (_M_match_token(_ScannerT::_S_token_line_end)) 150 _M_stack.push(_StateSeqT(*_M_nfa, _M_nfa->_M_insert_line_end())); 151 else if (_M_match_token(_ScannerT::_S_token_word_bound)) 152 // _M_value[0] == 'n' means it's negative, say "not word boundary". 153 _M_stack.push(_StateSeqT(*_M_nfa, _M_nfa-> 154 _M_insert_word_bound(_M_value[0] == 'n'))); 155 else if (_M_match_token(_ScannerT::_S_token_subexpr_lookahead_begin)) 156 { 157 auto __neg = _M_value[0] == 'n'; 158 this->_M_disjunction(); 159 if (!_M_match_token(_ScannerT::_S_token_subexpr_end)) 160 __throw_regex_error(regex_constants::error_paren, 161 "Parenthesis is not closed."); 162 auto __tmp = _M_pop(); 163 __tmp._M_append(_M_nfa->_M_insert_accept()); 164 _M_stack.push( 165 _StateSeqT( 166 *_M_nfa, 167 _M_nfa->_M_insert_lookahead(__tmp._M_start, __neg))); 168 } 169 else 170 return false; 171 return true; 172 } 173 174 template<typename _TraitsT> 175 bool 176 _Compiler<_TraitsT>:: _M_quantifier()177 _M_quantifier() 178 { 179 bool __neg = (_M_flags & regex_constants::ECMAScript); 180 auto __init = [this, &__neg]() 181 { 182 if (_M_stack.empty()) 183 __throw_regex_error(regex_constants::error_badrepeat, 184 "Nothing to repeat before a quantifier."); 185 __neg = __neg && _M_match_token(_ScannerT::_S_token_opt); 186 }; 187 if (_M_match_token(_ScannerT::_S_token_closure0)) 188 { 189 __init(); 190 auto __e = _M_pop(); 191 _StateSeqT __r(*_M_nfa, 192 _M_nfa->_M_insert_repeat(_S_invalid_state_id, 193 __e._M_start, __neg)); 194 __e._M_append(__r); 195 _M_stack.push(__r); 196 } 197 else if (_M_match_token(_ScannerT::_S_token_closure1)) 198 { 199 __init(); 200 auto __e = _M_pop(); 201 __e._M_append(_M_nfa->_M_insert_repeat(_S_invalid_state_id, 202 __e._M_start, __neg)); 203 _M_stack.push(__e); 204 } 205 else if (_M_match_token(_ScannerT::_S_token_opt)) 206 { 207 __init(); 208 auto __e = _M_pop(); 209 auto __end = _M_nfa->_M_insert_dummy(); 210 _StateSeqT __r(*_M_nfa, 211 _M_nfa->_M_insert_repeat(_S_invalid_state_id, 212 __e._M_start, __neg)); 213 __e._M_append(__end); 214 __r._M_append(__end); 215 _M_stack.push(__r); 216 } 217 else if (_M_match_token(_ScannerT::_S_token_interval_begin)) 218 { 219 if (_M_stack.empty()) 220 __throw_regex_error(regex_constants::error_badrepeat, 221 "Nothing to repeat before a quantifier."); 222 if (!_M_match_token(_ScannerT::_S_token_dup_count)) 223 __throw_regex_error(regex_constants::error_badbrace, 224 "Unexpected token in brace expression."); 225 _StateSeqT __r(_M_pop()); 226 _StateSeqT __e(*_M_nfa, _M_nfa->_M_insert_dummy()); 227 long __min_rep = _M_cur_int_value(10); 228 bool __infi = false; 229 long __n = 0; 230 231 // {3 232 if (_M_match_token(_ScannerT::_S_token_comma)) 233 { 234 if (_M_match_token(_ScannerT::_S_token_dup_count)) // {3,7} 235 __n = _M_cur_int_value(10) - __min_rep; 236 else 237 __infi = true; 238 } 239 if (!_M_match_token(_ScannerT::_S_token_interval_end)) 240 __throw_regex_error(regex_constants::error_brace, 241 "Unexpected end of brace expression."); 242 243 __neg = __neg && _M_match_token(_ScannerT::_S_token_opt); 244 245 for (long __i = 0; __i < __min_rep; ++__i) 246 __e._M_append(__r._M_clone()); 247 248 if (__infi) 249 { 250 auto __tmp = __r._M_clone(); 251 _StateSeqT __s(*_M_nfa, 252 _M_nfa->_M_insert_repeat(_S_invalid_state_id, 253 __tmp._M_start, __neg)); 254 __tmp._M_append(__s); 255 __e._M_append(__s); 256 } 257 else 258 { 259 if (__n < 0) 260 __throw_regex_error(regex_constants::error_badbrace, 261 "Invalid range in brace expression."); 262 auto __end = _M_nfa->_M_insert_dummy(); 263 // _M_alt is the "match more" branch, and _M_next is the 264 // "match less" one. Switch _M_alt and _M_next of all created 265 // nodes. This is a hack but IMO works well. 266 std::stack<_StateIdT> __stack; 267 for (long __i = 0; __i < __n; ++__i) 268 { 269 auto __tmp = __r._M_clone(); 270 auto __alt = _M_nfa->_M_insert_repeat(__tmp._M_start, 271 __end, __neg); 272 __stack.push(__alt); 273 __e._M_append(_StateSeqT(*_M_nfa, __alt, __tmp._M_end)); 274 } 275 __e._M_append(__end); 276 while (!__stack.empty()) 277 { 278 auto& __tmp = (*_M_nfa)[__stack.top()]; 279 __stack.pop(); 280 std::swap(__tmp._M_next, __tmp._M_alt); 281 } 282 } 283 _M_stack.push(__e); 284 } 285 else 286 return false; 287 return true; 288 } 289 290 #define __INSERT_REGEX_MATCHER(__func, ...)\ 291 do {\ 292 if (!(_M_flags & regex_constants::icase))\ 293 if (!(_M_flags & regex_constants::collate))\ 294 __func<false, false>(__VA_ARGS__);\ 295 else\ 296 __func<false, true>(__VA_ARGS__);\ 297 else\ 298 if (!(_M_flags & regex_constants::collate))\ 299 __func<true, false>(__VA_ARGS__);\ 300 else\ 301 __func<true, true>(__VA_ARGS__);\ 302 } while (false) 303 304 template<typename _TraitsT> 305 bool 306 _Compiler<_TraitsT>:: _M_atom()307 _M_atom() 308 { 309 if (_M_match_token(_ScannerT::_S_token_anychar)) 310 { 311 if (!(_M_flags & regex_constants::ECMAScript)) 312 __INSERT_REGEX_MATCHER(_M_insert_any_matcher_posix); 313 else 314 __INSERT_REGEX_MATCHER(_M_insert_any_matcher_ecma); 315 } 316 else if (_M_try_char()) 317 __INSERT_REGEX_MATCHER(_M_insert_char_matcher); 318 else if (_M_match_token(_ScannerT::_S_token_backref)) 319 _M_stack.push(_StateSeqT(*_M_nfa, _M_nfa-> 320 _M_insert_backref(_M_cur_int_value(10)))); 321 else if (_M_match_token(_ScannerT::_S_token_quoted_class)) 322 __INSERT_REGEX_MATCHER(_M_insert_character_class_matcher); 323 else if (_M_match_token(_ScannerT::_S_token_subexpr_no_group_begin)) 324 { 325 _StateSeqT __r(*_M_nfa, _M_nfa->_M_insert_dummy()); 326 this->_M_disjunction(); 327 if (!_M_match_token(_ScannerT::_S_token_subexpr_end)) 328 __throw_regex_error(regex_constants::error_paren, 329 "Parenthesis is not closed."); 330 __r._M_append(_M_pop()); 331 _M_stack.push(__r); 332 } 333 else if (_M_match_token(_ScannerT::_S_token_subexpr_begin)) 334 { 335 _StateSeqT __r(*_M_nfa, _M_nfa->_M_insert_subexpr_begin()); 336 this->_M_disjunction(); 337 if (!_M_match_token(_ScannerT::_S_token_subexpr_end)) 338 __throw_regex_error(regex_constants::error_paren, 339 "Parenthesis is not closed."); 340 __r._M_append(_M_pop()); 341 __r._M_append(_M_nfa->_M_insert_subexpr_end()); 342 _M_stack.push(__r); 343 } 344 else if (!_M_bracket_expression()) 345 return false; 346 return true; 347 } 348 349 template<typename _TraitsT> 350 bool 351 _Compiler<_TraitsT>:: _M_bracket_expression()352 _M_bracket_expression() 353 { 354 bool __neg = 355 _M_match_token(_ScannerT::_S_token_bracket_neg_begin); 356 if (!(__neg || _M_match_token(_ScannerT::_S_token_bracket_begin))) 357 return false; 358 __INSERT_REGEX_MATCHER(_M_insert_bracket_matcher, __neg); 359 return true; 360 } 361 #undef __INSERT_REGEX_MATCHER 362 363 template<typename _TraitsT> 364 template<bool __icase, bool __collate> 365 void 366 _Compiler<_TraitsT>:: _M_insert_any_matcher_ecma()367 _M_insert_any_matcher_ecma() 368 { 369 _M_stack.push(_StateSeqT(*_M_nfa, 370 _M_nfa->_M_insert_matcher 371 (_AnyMatcher<_TraitsT, true, __icase, __collate> 372 (_M_traits)))); 373 } 374 375 template<typename _TraitsT> 376 template<bool __icase, bool __collate> 377 void 378 _Compiler<_TraitsT>:: _M_insert_any_matcher_posix()379 _M_insert_any_matcher_posix() 380 { 381 _M_stack.push(_StateSeqT(*_M_nfa, 382 _M_nfa->_M_insert_matcher 383 (_AnyMatcher<_TraitsT, false, __icase, __collate> 384 (_M_traits)))); 385 } 386 387 template<typename _TraitsT> 388 template<bool __icase, bool __collate> 389 void 390 _Compiler<_TraitsT>:: _M_insert_char_matcher()391 _M_insert_char_matcher() 392 { 393 _M_stack.push(_StateSeqT(*_M_nfa, 394 _M_nfa->_M_insert_matcher 395 (_CharMatcher<_TraitsT, __icase, __collate> 396 (_M_value[0], _M_traits)))); 397 } 398 399 template<typename _TraitsT> 400 template<bool __icase, bool __collate> 401 void 402 _Compiler<_TraitsT>:: _M_insert_character_class_matcher()403 _M_insert_character_class_matcher() 404 { 405 __glibcxx_assert(_M_value.size() == 1); 406 _BracketMatcher<_TraitsT, __icase, __collate> __matcher 407 (_M_ctype.is(_CtypeT::upper, _M_value[0]), _M_traits); 408 __matcher._M_add_character_class(_M_value, false); 409 __matcher._M_ready(); 410 _M_stack.push(_StateSeqT(*_M_nfa, 411 _M_nfa->_M_insert_matcher(std::move(__matcher)))); 412 } 413 414 template<typename _TraitsT> 415 template<bool __icase, bool __collate> 416 void 417 _Compiler<_TraitsT>:: _M_insert_bracket_matcher(bool __neg)418 _M_insert_bracket_matcher(bool __neg) 419 { 420 _BracketMatcher<_TraitsT, __icase, __collate> __matcher(__neg, _M_traits); 421 pair<bool, _CharT> __last_char; // Optional<_CharT> 422 __last_char.first = false; 423 if (!(_M_flags & regex_constants::ECMAScript)) 424 { 425 if (_M_try_char()) 426 { 427 __last_char.first = true; 428 __last_char.second = _M_value[0]; 429 } 430 else if (_M_match_token(_ScannerT::_S_token_bracket_dash)) 431 { 432 __last_char.first = true; 433 __last_char.second = '-'; 434 } 435 } 436 while (_M_expression_term(__last_char, __matcher)) 437 ; 438 if (__last_char.first) 439 __matcher._M_add_char(__last_char.second); 440 __matcher._M_ready(); 441 _M_stack.push(_StateSeqT( 442 *_M_nfa, 443 _M_nfa->_M_insert_matcher(std::move(__matcher)))); 444 } 445 446 template<typename _TraitsT> 447 template<bool __icase, bool __collate> 448 bool 449 _Compiler<_TraitsT>:: _M_expression_term(pair<bool,_CharT> & __last_char,_BracketMatcher<_TraitsT,__icase,__collate> & __matcher)450 _M_expression_term(pair<bool, _CharT>& __last_char, 451 _BracketMatcher<_TraitsT, __icase, __collate>& __matcher) 452 { 453 if (_M_match_token(_ScannerT::_S_token_bracket_end)) 454 return false; 455 456 const auto __push_char = [&](_CharT __ch) 457 { 458 if (__last_char.first) 459 __matcher._M_add_char(__last_char.second); 460 else 461 __last_char.first = true; 462 __last_char.second = __ch; 463 }; 464 const auto __flush = [&] 465 { 466 if (__last_char.first) 467 { 468 __matcher._M_add_char(__last_char.second); 469 __last_char.first = false; 470 } 471 }; 472 473 if (_M_match_token(_ScannerT::_S_token_collsymbol)) 474 { 475 auto __symbol = __matcher._M_add_collate_element(_M_value); 476 if (__symbol.size() == 1) 477 __push_char(__symbol[0]); 478 else 479 __flush(); 480 } 481 else if (_M_match_token(_ScannerT::_S_token_equiv_class_name)) 482 { 483 __flush(); 484 __matcher._M_add_equivalence_class(_M_value); 485 } 486 else if (_M_match_token(_ScannerT::_S_token_char_class_name)) 487 { 488 __flush(); 489 __matcher._M_add_character_class(_M_value, false); 490 } 491 else if (_M_try_char()) 492 __push_char(_M_value[0]); 493 // POSIX doesn't allow '-' as a start-range char (say [a-z--0]), 494 // except when the '-' is the first or last character in the bracket 495 // expression ([--0]). ECMAScript treats all '-' after a range as a 496 // normal character. Also see above, where _M_expression_term gets called. 497 // 498 // As a result, POSIX rejects [-----], but ECMAScript doesn't. 499 // Boost (1.57.0) always uses POSIX style even in its ECMAScript syntax. 500 // Clang (3.5) always uses ECMAScript style even in its POSIX syntax. 501 // 502 // It turns out that no one reads BNFs ;) 503 else if (_M_match_token(_ScannerT::_S_token_bracket_dash)) 504 { 505 if (!__last_char.first) 506 { 507 if (!(_M_flags & regex_constants::ECMAScript)) 508 { 509 if (_M_match_token(_ScannerT::_S_token_bracket_end)) 510 { 511 __push_char('-'); 512 return false; 513 } 514 __throw_regex_error( 515 regex_constants::error_range, 516 "Unexpected dash in bracket expression. For POSIX syntax, " 517 "a dash is not treated literally only when it is at " 518 "beginning or end."); 519 } 520 __push_char('-'); 521 } 522 else 523 { 524 if (_M_try_char()) 525 { 526 __matcher._M_make_range(__last_char.second, _M_value[0]); 527 __last_char.first = false; 528 } 529 else if (_M_match_token(_ScannerT::_S_token_bracket_dash)) 530 { 531 __matcher._M_make_range(__last_char.second, '-'); 532 __last_char.first = false; 533 } 534 else 535 { 536 if (_M_scanner._M_get_token() 537 != _ScannerT::_S_token_bracket_end) 538 __throw_regex_error( 539 regex_constants::error_range, 540 "Character is expected after a dash."); 541 __push_char('-'); 542 } 543 } 544 } 545 else if (_M_match_token(_ScannerT::_S_token_quoted_class)) 546 { 547 __flush(); 548 __matcher._M_add_character_class(_M_value, 549 _M_ctype.is(_CtypeT::upper, 550 _M_value[0])); 551 } 552 else 553 __throw_regex_error(regex_constants::error_brack, 554 "Unexpected character in bracket expression."); 555 556 return true; 557 } 558 559 template<typename _TraitsT> 560 bool 561 _Compiler<_TraitsT>:: _M_try_char()562 _M_try_char() 563 { 564 bool __is_char = false; 565 if (_M_match_token(_ScannerT::_S_token_oct_num)) 566 { 567 __is_char = true; 568 _M_value.assign(1, _M_cur_int_value(8)); 569 } 570 else if (_M_match_token(_ScannerT::_S_token_hex_num)) 571 { 572 __is_char = true; 573 _M_value.assign(1, _M_cur_int_value(16)); 574 } 575 else if (_M_match_token(_ScannerT::_S_token_ord_char)) 576 __is_char = true; 577 return __is_char; 578 } 579 580 template<typename _TraitsT> 581 bool 582 _Compiler<_TraitsT>:: _M_match_token(_TokenT token)583 _M_match_token(_TokenT token) 584 { 585 if (token == _M_scanner._M_get_token()) 586 { 587 _M_value = _M_scanner._M_get_value(); 588 _M_scanner._M_advance(); 589 return true; 590 } 591 return false; 592 } 593 594 template<typename _TraitsT> 595 int 596 _Compiler<_TraitsT>:: _M_cur_int_value(int __radix)597 _M_cur_int_value(int __radix) 598 { 599 long __v = 0; 600 for (typename _StringT::size_type __i = 0; 601 __i < _M_value.length(); ++__i) 602 __v =__v * __radix + _M_traits.value(_M_value[__i], __radix); 603 return __v; 604 } 605 606 template<typename _TraitsT, bool __icase, bool __collate> 607 bool 608 _BracketMatcher<_TraitsT, __icase, __collate>:: _M_apply(_CharT __ch,false_type) const609 _M_apply(_CharT __ch, false_type) const 610 { 611 return [this, __ch] 612 { 613 if (std::binary_search(_M_char_set.begin(), _M_char_set.end(), 614 _M_translator._M_translate(__ch))) 615 return true; 616 auto __s = _M_translator._M_transform(__ch); 617 for (auto& __it : _M_range_set) 618 if (_M_translator._M_match_range(__it.first, __it.second, __s)) 619 return true; 620 if (_M_traits.isctype(__ch, _M_class_set)) 621 return true; 622 if (std::find(_M_equiv_set.begin(), _M_equiv_set.end(), 623 _M_traits.transform_primary(&__ch, &__ch+1)) 624 != _M_equiv_set.end()) 625 return true; 626 for (auto& __it : _M_neg_class_set) 627 if (!_M_traits.isctype(__ch, __it)) 628 return true; 629 return false; 630 }() ^ _M_is_non_matching; 631 } 632 } // namespace __detail 633 634 _GLIBCXX_END_NAMESPACE_VERSION 635 } // namespace 636