1 // class template regex -*- C++ -*- 2 3 // Copyright (C) 2013-2020 Free Software Foundation, Inc. 4 // 5 // This file is part of the GNU ISO C++ Library. This library is free 6 // software; you can redistribute it and/or modify it under the 7 // terms of the GNU General Public License as published by the 8 // Free Software Foundation; either version 3, or (at your option) 9 // any later version. 10 11 // This library is distributed in the hope that it will be useful, 12 // but WITHOUT ANY WARRANTY; without even the implied warranty of 13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 // GNU General Public License for more details. 15 16 // Under Section 7 of GPL version 3, you are granted additional 17 // permissions described in the GCC Runtime Library Exception, version 18 // 3.1, as published by the Free Software Foundation. 19 20 // You should have received a copy of the GNU General Public License and 21 // a copy of the GCC Runtime Library Exception along with this program; 22 // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see 23 // <http://www.gnu.org/licenses/>. 24 25 /** 26 * @file bits/regex_compiler.tcc 27 * This is an internal header file, included by other library headers. 28 * Do not attempt to use it directly. @headername{regex} 29 */ 30 31 // FIXME make comments doxygen format. 32 33 /* 34 // This compiler refers to "Regular Expression Matching Can Be Simple And Fast" 35 // (http://swtch.com/~rsc/regexp/regexp1.html), 36 // but doesn't strictly follow it. 37 // 38 // When compiling, states are *chained* instead of tree- or graph-constructed. 39 // It's more like structured programs: there's if statement and loop statement. 40 // 41 // For alternative structure (say "a|b"), aka "if statement", two branches 42 // should be constructed. However, these two shall merge to an "end_tag" at 43 // the end of this operator: 44 // 45 // branch1 46 // / \ 47 // => begin_tag end_tag => 48 // \ / 49 // branch2 50 // 51 // This is the difference between this implementation and that in Russ's 52 // article. 53 // 54 // That's why we introduced dummy node here ------ "end_tag" is a dummy node. 55 // All dummy nodes will be eliminated at the end of compilation. 56 */ 57 58 namespace std _GLIBCXX_VISIBILITY(default) 59 { 60 _GLIBCXX_BEGIN_NAMESPACE_VERSION 61 62 namespace __detail 63 { 64 template<typename _TraitsT> 65 _Compiler<_TraitsT>:: _Compiler(_IterT __b,_IterT __e,const typename _TraitsT::locale_type & __loc,_FlagT __flags)66 _Compiler(_IterT __b, _IterT __e, 67 const typename _TraitsT::locale_type& __loc, _FlagT __flags) 68 : _M_flags(_S_validate(__flags)), 69 _M_scanner(__b, __e, _M_flags, __loc), 70 _M_nfa(make_shared<_RegexT>(__loc, _M_flags)), 71 _M_traits(_M_nfa->_M_traits), 72 _M_ctype(std::use_facet<_CtypeT>(__loc)) 73 { 74 _StateSeqT __r(*_M_nfa, _M_nfa->_M_start()); 75 __r._M_append(_M_nfa->_M_insert_subexpr_begin()); 76 this->_M_disjunction(); 77 if (!_M_match_token(_ScannerT::_S_token_eof)) 78 __throw_regex_error(regex_constants::error_paren); 79 __r._M_append(_M_pop()); 80 __glibcxx_assert(_M_stack.empty()); 81 __r._M_append(_M_nfa->_M_insert_subexpr_end()); 82 __r._M_append(_M_nfa->_M_insert_accept()); 83 _M_nfa->_M_eliminate_dummy(); 84 } 85 86 template<typename _TraitsT> 87 void 88 _Compiler<_TraitsT>:: _M_disjunction()89 _M_disjunction() 90 { 91 this->_M_alternative(); 92 while (_M_match_token(_ScannerT::_S_token_or)) 93 { 94 _StateSeqT __alt1 = _M_pop(); 95 this->_M_alternative(); 96 _StateSeqT __alt2 = _M_pop(); 97 auto __end = _M_nfa->_M_insert_dummy(); 98 __alt1._M_append(__end); 99 __alt2._M_append(__end); 100 // __alt2 is state._M_next, __alt1 is state._M_alt. The executor 101 // executes _M_alt before _M_next, as well as executing left 102 // alternative before right one. 103 _M_stack.push(_StateSeqT(*_M_nfa, 104 _M_nfa->_M_insert_alt( 105 __alt2._M_start, __alt1._M_start, false), 106 __end)); 107 } 108 } 109 110 template<typename _TraitsT> 111 void 112 _Compiler<_TraitsT>:: _M_alternative()113 _M_alternative() 114 { 115 if (this->_M_term()) 116 { 117 _StateSeqT __re = _M_pop(); 118 this->_M_alternative(); 119 __re._M_append(_M_pop()); 120 _M_stack.push(__re); 121 } 122 else 123 _M_stack.push(_StateSeqT(*_M_nfa, _M_nfa->_M_insert_dummy())); 124 } 125 126 template<typename _TraitsT> 127 bool 128 _Compiler<_TraitsT>:: _M_term()129 _M_term() 130 { 131 if (this->_M_assertion()) 132 return true; 133 if (this->_M_atom()) 134 { 135 while (this->_M_quantifier()) 136 ; 137 return true; 138 } 139 return false; 140 } 141 142 template<typename _TraitsT> 143 bool 144 _Compiler<_TraitsT>:: _M_assertion()145 _M_assertion() 146 { 147 if (_M_match_token(_ScannerT::_S_token_line_begin)) 148 _M_stack.push(_StateSeqT(*_M_nfa, _M_nfa->_M_insert_line_begin())); 149 else if (_M_match_token(_ScannerT::_S_token_line_end)) 150 _M_stack.push(_StateSeqT(*_M_nfa, _M_nfa->_M_insert_line_end())); 151 else if (_M_match_token(_ScannerT::_S_token_word_bound)) 152 // _M_value[0] == 'n' means it's negative, say "not word boundary". 153 _M_stack.push(_StateSeqT(*_M_nfa, _M_nfa-> 154 _M_insert_word_bound(_M_value[0] == 'n'))); 155 else if (_M_match_token(_ScannerT::_S_token_subexpr_lookahead_begin)) 156 { 157 auto __neg = _M_value[0] == 'n'; 158 this->_M_disjunction(); 159 if (!_M_match_token(_ScannerT::_S_token_subexpr_end)) 160 __throw_regex_error(regex_constants::error_paren, 161 "Parenthesis is not closed."); 162 auto __tmp = _M_pop(); 163 __tmp._M_append(_M_nfa->_M_insert_accept()); 164 _M_stack.push( 165 _StateSeqT( 166 *_M_nfa, 167 _M_nfa->_M_insert_lookahead(__tmp._M_start, __neg))); 168 } 169 else 170 return false; 171 return true; 172 } 173 174 template<typename _TraitsT> 175 bool 176 _Compiler<_TraitsT>:: _M_quantifier()177 _M_quantifier() 178 { 179 bool __neg = (_M_flags & regex_constants::ECMAScript); 180 auto __init = [this, &__neg]() 181 { 182 if (_M_stack.empty()) 183 __throw_regex_error(regex_constants::error_badrepeat, 184 "Nothing to repeat before a quantifier."); 185 __neg = __neg && _M_match_token(_ScannerT::_S_token_opt); 186 }; 187 if (_M_match_token(_ScannerT::_S_token_closure0)) 188 { 189 __init(); 190 auto __e = _M_pop(); 191 _StateSeqT __r(*_M_nfa, 192 _M_nfa->_M_insert_repeat(_S_invalid_state_id, 193 __e._M_start, __neg)); 194 __e._M_append(__r); 195 _M_stack.push(__r); 196 } 197 else if (_M_match_token(_ScannerT::_S_token_closure1)) 198 { 199 __init(); 200 auto __e = _M_pop(); 201 __e._M_append(_M_nfa->_M_insert_repeat(_S_invalid_state_id, 202 __e._M_start, __neg)); 203 _M_stack.push(__e); 204 } 205 else if (_M_match_token(_ScannerT::_S_token_opt)) 206 { 207 __init(); 208 auto __e = _M_pop(); 209 auto __end = _M_nfa->_M_insert_dummy(); 210 _StateSeqT __r(*_M_nfa, 211 _M_nfa->_M_insert_repeat(_S_invalid_state_id, 212 __e._M_start, __neg)); 213 __e._M_append(__end); 214 __r._M_append(__end); 215 _M_stack.push(__r); 216 } 217 else if (_M_match_token(_ScannerT::_S_token_interval_begin)) 218 { 219 if (_M_stack.empty()) 220 __throw_regex_error(regex_constants::error_badrepeat, 221 "Nothing to repeat before a quantifier."); 222 if (!_M_match_token(_ScannerT::_S_token_dup_count)) 223 __throw_regex_error(regex_constants::error_badbrace, 224 "Unexpected token in brace expression."); 225 _StateSeqT __r(_M_pop()); 226 _StateSeqT __e(*_M_nfa, _M_nfa->_M_insert_dummy()); 227 long __min_rep = _M_cur_int_value(10); 228 bool __infi = false; 229 long __n; 230 231 // {3 232 if (_M_match_token(_ScannerT::_S_token_comma)) 233 if (_M_match_token(_ScannerT::_S_token_dup_count)) // {3,7} 234 __n = _M_cur_int_value(10) - __min_rep; 235 else 236 __infi = true; 237 else 238 __n = 0; 239 if (!_M_match_token(_ScannerT::_S_token_interval_end)) 240 __throw_regex_error(regex_constants::error_brace, 241 "Unexpected end of brace expression."); 242 243 __neg = __neg && _M_match_token(_ScannerT::_S_token_opt); 244 245 for (long __i = 0; __i < __min_rep; ++__i) 246 __e._M_append(__r._M_clone()); 247 248 if (__infi) 249 { 250 auto __tmp = __r._M_clone(); 251 _StateSeqT __s(*_M_nfa, 252 _M_nfa->_M_insert_repeat(_S_invalid_state_id, 253 __tmp._M_start, __neg)); 254 __tmp._M_append(__s); 255 __e._M_append(__s); 256 } 257 else 258 { 259 if (__n < 0) 260 __throw_regex_error(regex_constants::error_badbrace, 261 "Invalid range in brace expression."); 262 auto __end = _M_nfa->_M_insert_dummy(); 263 // _M_alt is the "match more" branch, and _M_next is the 264 // "match less" one. Switch _M_alt and _M_next of all created 265 // nodes. This is a hack but IMO works well. 266 std::stack<_StateIdT> __stack; 267 for (long __i = 0; __i < __n; ++__i) 268 { 269 auto __tmp = __r._M_clone(); 270 auto __alt = _M_nfa->_M_insert_repeat(__tmp._M_start, 271 __end, __neg); 272 __stack.push(__alt); 273 __e._M_append(_StateSeqT(*_M_nfa, __alt, __tmp._M_end)); 274 } 275 __e._M_append(__end); 276 while (!__stack.empty()) 277 { 278 auto& __tmp = (*_M_nfa)[__stack.top()]; 279 __stack.pop(); 280 std::swap(__tmp._M_next, __tmp._M_alt); 281 } 282 } 283 _M_stack.push(__e); 284 } 285 else 286 return false; 287 return true; 288 } 289 290 #define __INSERT_REGEX_MATCHER(__func, ...)\ 291 do {\ 292 if (!(_M_flags & regex_constants::icase))\ 293 if (!(_M_flags & regex_constants::collate))\ 294 __func<false, false>(__VA_ARGS__);\ 295 else\ 296 __func<false, true>(__VA_ARGS__);\ 297 else\ 298 if (!(_M_flags & regex_constants::collate))\ 299 __func<true, false>(__VA_ARGS__);\ 300 else\ 301 __func<true, true>(__VA_ARGS__);\ 302 } while (false) 303 304 template<typename _TraitsT> 305 bool 306 _Compiler<_TraitsT>:: _M_atom()307 _M_atom() 308 { 309 if (_M_match_token(_ScannerT::_S_token_anychar)) 310 { 311 if (!(_M_flags & regex_constants::ECMAScript)) 312 __INSERT_REGEX_MATCHER(_M_insert_any_matcher_posix); 313 else 314 __INSERT_REGEX_MATCHER(_M_insert_any_matcher_ecma); 315 } 316 else if (_M_try_char()) 317 __INSERT_REGEX_MATCHER(_M_insert_char_matcher); 318 else if (_M_match_token(_ScannerT::_S_token_backref)) 319 _M_stack.push(_StateSeqT(*_M_nfa, _M_nfa-> 320 _M_insert_backref(_M_cur_int_value(10)))); 321 else if (_M_match_token(_ScannerT::_S_token_quoted_class)) 322 __INSERT_REGEX_MATCHER(_M_insert_character_class_matcher); 323 else if (_M_match_token(_ScannerT::_S_token_subexpr_no_group_begin)) 324 { 325 _StateSeqT __r(*_M_nfa, _M_nfa->_M_insert_dummy()); 326 this->_M_disjunction(); 327 if (!_M_match_token(_ScannerT::_S_token_subexpr_end)) 328 __throw_regex_error(regex_constants::error_paren, 329 "Parenthesis is not closed."); 330 __r._M_append(_M_pop()); 331 _M_stack.push(__r); 332 } 333 else if (_M_match_token(_ScannerT::_S_token_subexpr_begin)) 334 { 335 _StateSeqT __r(*_M_nfa, _M_nfa->_M_insert_subexpr_begin()); 336 this->_M_disjunction(); 337 if (!_M_match_token(_ScannerT::_S_token_subexpr_end)) 338 __throw_regex_error(regex_constants::error_paren, 339 "Parenthesis is not closed."); 340 __r._M_append(_M_pop()); 341 __r._M_append(_M_nfa->_M_insert_subexpr_end()); 342 _M_stack.push(__r); 343 } 344 else if (!_M_bracket_expression()) 345 return false; 346 return true; 347 } 348 349 template<typename _TraitsT> 350 bool 351 _Compiler<_TraitsT>:: _M_bracket_expression()352 _M_bracket_expression() 353 { 354 bool __neg = 355 _M_match_token(_ScannerT::_S_token_bracket_neg_begin); 356 if (!(__neg || _M_match_token(_ScannerT::_S_token_bracket_begin))) 357 return false; 358 __INSERT_REGEX_MATCHER(_M_insert_bracket_matcher, __neg); 359 return true; 360 } 361 #undef __INSERT_REGEX_MATCHER 362 363 template<typename _TraitsT> 364 template<bool __icase, bool __collate> 365 void 366 _Compiler<_TraitsT>:: _M_insert_any_matcher_ecma()367 _M_insert_any_matcher_ecma() 368 { 369 _M_stack.push(_StateSeqT(*_M_nfa, 370 _M_nfa->_M_insert_matcher 371 (_AnyMatcher<_TraitsT, true, __icase, __collate> 372 (_M_traits)))); 373 } 374 375 template<typename _TraitsT> 376 template<bool __icase, bool __collate> 377 void 378 _Compiler<_TraitsT>:: _M_insert_any_matcher_posix()379 _M_insert_any_matcher_posix() 380 { 381 _M_stack.push(_StateSeqT(*_M_nfa, 382 _M_nfa->_M_insert_matcher 383 (_AnyMatcher<_TraitsT, false, __icase, __collate> 384 (_M_traits)))); 385 } 386 387 template<typename _TraitsT> 388 template<bool __icase, bool __collate> 389 void 390 _Compiler<_TraitsT>:: _M_insert_char_matcher()391 _M_insert_char_matcher() 392 { 393 _M_stack.push(_StateSeqT(*_M_nfa, 394 _M_nfa->_M_insert_matcher 395 (_CharMatcher<_TraitsT, __icase, __collate> 396 (_M_value[0], _M_traits)))); 397 } 398 399 template<typename _TraitsT> 400 template<bool __icase, bool __collate> 401 void 402 _Compiler<_TraitsT>:: _M_insert_character_class_matcher()403 _M_insert_character_class_matcher() 404 { 405 __glibcxx_assert(_M_value.size() == 1); 406 _BracketMatcher<__icase, __collate> __matcher 407 (_M_ctype.is(_CtypeT::upper, _M_value[0]), _M_traits); 408 __matcher._M_add_character_class(_M_value, false); 409 __matcher._M_ready(); 410 _M_stack.push(_StateSeqT(*_M_nfa, 411 _M_nfa->_M_insert_matcher(std::move(__matcher)))); 412 } 413 414 template<typename _TraitsT> 415 template<bool __icase, bool __collate> 416 void 417 _Compiler<_TraitsT>:: _M_insert_bracket_matcher(bool __neg)418 _M_insert_bracket_matcher(bool __neg) 419 { 420 _BracketMatcher<__icase, __collate> __matcher(__neg, _M_traits); 421 _BracketState __last_char; 422 if (_M_try_char()) 423 __last_char.set(_M_value[0]); 424 else if (_M_match_token(_ScannerT::_S_token_bracket_dash)) 425 // Dash as first character is a normal character. 426 __last_char.set('-'); 427 while (_M_expression_term(__last_char, __matcher)) 428 ; 429 if (__last_char._M_is_char()) 430 __matcher._M_add_char(__last_char.get()); 431 __matcher._M_ready(); 432 _M_stack.push(_StateSeqT( 433 *_M_nfa, 434 _M_nfa->_M_insert_matcher(std::move(__matcher)))); 435 } 436 437 template<typename _TraitsT> 438 template<bool __icase, bool __collate> 439 bool 440 _Compiler<_TraitsT>:: _M_expression_term(_BracketState & __last_char,_BracketMatcher<__icase,__collate> & __matcher)441 _M_expression_term(_BracketState& __last_char, 442 _BracketMatcher<__icase, __collate>& __matcher) 443 { 444 if (_M_match_token(_ScannerT::_S_token_bracket_end)) 445 return false; 446 447 // Add any previously cached char into the matcher and update cache. 448 const auto __push_char = [&](_CharT __ch) 449 { 450 if (__last_char._M_is_char()) 451 __matcher._M_add_char(__last_char.get()); 452 __last_char.set(__ch); 453 }; 454 // Add any previously cached char into the matcher and update cache. 455 const auto __push_class = [&] 456 { 457 if (__last_char._M_is_char()) 458 __matcher._M_add_char(__last_char.get()); 459 // We don't cache anything here, just record that the last thing 460 // processed was a character class (or similar). 461 __last_char.reset(_BracketState::_Type::_Class); 462 }; 463 464 if (_M_match_token(_ScannerT::_S_token_collsymbol)) 465 { 466 auto __symbol = __matcher._M_add_collate_element(_M_value); 467 if (__symbol.size() == 1) 468 __push_char(__symbol[0]); 469 else 470 __push_class(); 471 } 472 else if (_M_match_token(_ScannerT::_S_token_equiv_class_name)) 473 { 474 __push_class(); 475 __matcher._M_add_equivalence_class(_M_value); 476 } 477 else if (_M_match_token(_ScannerT::_S_token_char_class_name)) 478 { 479 __push_class(); 480 __matcher._M_add_character_class(_M_value, false); 481 } 482 else if (_M_try_char()) 483 __push_char(_M_value[0]); 484 // POSIX doesn't allow '-' as a start-range char (say [a-z--0]), 485 // except when the '-' is the first or last character in the bracket 486 // expression ([--0]). ECMAScript treats all '-' after a range as a 487 // normal character. Also see above, where _M_expression_term gets called. 488 // 489 // As a result, POSIX rejects [-----], but ECMAScript doesn't. 490 // Boost (1.57.0) always uses POSIX style even in its ECMAScript syntax. 491 // Clang (3.5) always uses ECMAScript style even in its POSIX syntax. 492 // 493 // It turns out that no one reads BNFs ;) 494 else if (_M_match_token(_ScannerT::_S_token_bracket_dash)) 495 { 496 if (_M_match_token(_ScannerT::_S_token_bracket_end)) 497 { 498 // For "-]" the dash is a literal character. 499 __push_char('-'); 500 return false; 501 } 502 else if (__last_char._M_is_class()) 503 { 504 // "\\w-" is invalid, start of range must be a single char. 505 __throw_regex_error(regex_constants::error_range, 506 "Invalid start of range in bracket expression."); 507 } 508 else if (__last_char._M_is_char()) 509 { 510 if (_M_try_char()) 511 { 512 // "x-y" 513 __matcher._M_make_range(__last_char.get(), _M_value[0]); 514 __last_char.reset(); 515 } 516 else if (_M_match_token(_ScannerT::_S_token_bracket_dash)) 517 { 518 // "x--" 519 __matcher._M_make_range(__last_char.get(), '-'); 520 __last_char.reset(); 521 } 522 else 523 __throw_regex_error(regex_constants::error_range, 524 "Invalid end of range in bracket expression."); 525 } 526 else if (_M_flags & regex_constants::ECMAScript) 527 { 528 // A dash that is not part of an existing range. Might be the 529 // start of a new range, or might just be a literal '-' char. 530 // Only ECMAScript allows that in the middle of a bracket expr. 531 __push_char('-'); 532 } 533 else 534 __throw_regex_error(regex_constants::error_range, 535 "Invalid dash in bracket expression."); 536 } 537 else if (_M_match_token(_ScannerT::_S_token_quoted_class)) 538 { 539 __push_class(); 540 __matcher._M_add_character_class(_M_value, 541 _M_ctype.is(_CtypeT::upper, 542 _M_value[0])); 543 } 544 else 545 __throw_regex_error(regex_constants::error_brack, 546 "Unexpected character in bracket expression."); 547 548 return true; 549 } 550 551 template<typename _TraitsT> 552 bool 553 _Compiler<_TraitsT>:: _M_try_char()554 _M_try_char() 555 { 556 bool __is_char = false; 557 if (_M_match_token(_ScannerT::_S_token_oct_num)) 558 { 559 __is_char = true; 560 _M_value.assign(1, _M_cur_int_value(8)); 561 } 562 else if (_M_match_token(_ScannerT::_S_token_hex_num)) 563 { 564 __is_char = true; 565 _M_value.assign(1, _M_cur_int_value(16)); 566 } 567 else if (_M_match_token(_ScannerT::_S_token_ord_char)) 568 __is_char = true; 569 return __is_char; 570 } 571 572 template<typename _TraitsT> 573 bool 574 _Compiler<_TraitsT>:: _M_match_token(_TokenT __token)575 _M_match_token(_TokenT __token) 576 { 577 if (__token == _M_scanner._M_get_token()) 578 { 579 _M_value = _M_scanner._M_get_value(); 580 _M_scanner._M_advance(); 581 return true; 582 } 583 return false; 584 } 585 586 template<typename _TraitsT> 587 int 588 _Compiler<_TraitsT>:: _M_cur_int_value(int __radix)589 _M_cur_int_value(int __radix) 590 { 591 int __v = 0; 592 for (_CharT __c : _M_value) 593 if (__builtin_mul_overflow(__v, __radix, &__v) 594 || __builtin_add_overflow(__v, _M_traits.value(__c, __radix), &__v)) 595 std::__throw_regex_error(regex_constants::error_backref, 596 "invalid back reference"); 597 return __v; 598 } 599 600 template<typename _TraitsT, bool __icase, bool __collate> 601 bool 602 _BracketMatcher<_TraitsT, __icase, __collate>:: _M_apply(_CharT __ch,false_type) const603 _M_apply(_CharT __ch, false_type) const 604 { 605 return [this, __ch] 606 { 607 if (std::binary_search(_M_char_set.begin(), _M_char_set.end(), 608 _M_translator._M_translate(__ch))) 609 return true; 610 auto __s = _M_translator._M_transform(__ch); 611 for (auto& __it : _M_range_set) 612 if (_M_translator._M_match_range(__it.first, __it.second, __s)) 613 return true; 614 if (_M_traits.isctype(__ch, _M_class_set)) 615 return true; 616 if (std::find(_M_equiv_set.begin(), _M_equiv_set.end(), 617 _M_traits.transform_primary(&__ch, &__ch+1)) 618 != _M_equiv_set.end()) 619 return true; 620 for (auto& __it : _M_neg_class_set) 621 if (!_M_traits.isctype(__ch, __it)) 622 return true; 623 return false; 624 }() ^ _M_is_non_matching; 625 } 626 } // namespace __detail 627 628 _GLIBCXX_END_NAMESPACE_VERSION 629 } // namespace 630