1 // class template regex -*- C++ -*- 2 3 // Copyright (C) 2013-2018 Free Software Foundation, Inc. 4 // 5 // This file is part of the GNU ISO C++ Library. This library is free 6 // software; you can redistribute it and/or modify it under the 7 // terms of the GNU General Public License as published by the 8 // Free Software Foundation; either version 3, or (at your option) 9 // any later version. 10 11 // This library is distributed in the hope that it will be useful, 12 // but WITHOUT ANY WARRANTY; without even the implied warranty of 13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 // GNU General Public License for more details. 15 16 // Under Section 7 of GPL version 3, you are granted additional 17 // permissions described in the GCC Runtime Library Exception, version 18 // 3.1, as published by the Free Software Foundation. 19 20 // You should have received a copy of the GNU General Public License and 21 // a copy of the GCC Runtime Library Exception along with this program; 22 // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see 23 // <http://www.gnu.org/licenses/>. 24 25 /** 26 * @file bits/regex_compiler.tcc 27 * This is an internal header file, included by other library headers. 28 * Do not attempt to use it directly. @headername{regex} 29 */ 30 31 // FIXME make comments doxygen format. 32 33 /* 34 // This compiler refers to "Regular Expression Matching Can Be Simple And Fast" 35 // (http://swtch.com/~rsc/regexp/regexp1.html), 36 // but doesn't strictly follow it. 37 // 38 // When compiling, states are *chained* instead of tree- or graph-constructed. 39 // It's more like structured programs: there's if statement and loop statement. 40 // 41 // For alternative structure (say "a|b"), aka "if statement", two branches 42 // should be constructed. However, these two shall merge to an "end_tag" at 43 // the end of this operator: 44 // 45 // branch1 46 // / \ 47 // => begin_tag end_tag => 48 // \ / 49 // branch2 50 // 51 // This is the difference between this implementation and that in Russ's 52 // article. 53 // 54 // That's why we introduced dummy node here ------ "end_tag" is a dummy node. 55 // All dummy nodes will be eliminated at the end of compilation. 56 */ 57 58 namespace std _GLIBCXX_VISIBILITY(default) 59 { 60 _GLIBCXX_BEGIN_NAMESPACE_VERSION 61 62 namespace __detail 63 { 64 template<typename _TraitsT> 65 _Compiler<_TraitsT>:: _Compiler(_IterT __b,_IterT __e,const typename _TraitsT::locale_type & __loc,_FlagT __flags)66 _Compiler(_IterT __b, _IterT __e, 67 const typename _TraitsT::locale_type& __loc, _FlagT __flags) 68 : _M_flags((__flags 69 & (regex_constants::ECMAScript 70 | regex_constants::basic 71 | regex_constants::extended 72 | regex_constants::grep 73 | regex_constants::egrep 74 | regex_constants::awk)) 75 ? __flags 76 : __flags | regex_constants::ECMAScript), 77 _M_scanner(__b, __e, _M_flags, __loc), 78 _M_nfa(make_shared<_RegexT>(__loc, _M_flags)), 79 _M_traits(_M_nfa->_M_traits), 80 _M_ctype(std::use_facet<_CtypeT>(__loc)) 81 { 82 _StateSeqT __r(*_M_nfa, _M_nfa->_M_start()); 83 __r._M_append(_M_nfa->_M_insert_subexpr_begin()); 84 this->_M_disjunction(); 85 if (!_M_match_token(_ScannerT::_S_token_eof)) 86 __throw_regex_error(regex_constants::error_paren); 87 __r._M_append(_M_pop()); 88 __glibcxx_assert(_M_stack.empty()); 89 __r._M_append(_M_nfa->_M_insert_subexpr_end()); 90 __r._M_append(_M_nfa->_M_insert_accept()); 91 _M_nfa->_M_eliminate_dummy(); 92 } 93 94 template<typename _TraitsT> 95 void 96 _Compiler<_TraitsT>:: _M_disjunction()97 _M_disjunction() 98 { 99 this->_M_alternative(); 100 while (_M_match_token(_ScannerT::_S_token_or)) 101 { 102 _StateSeqT __alt1 = _M_pop(); 103 this->_M_alternative(); 104 _StateSeqT __alt2 = _M_pop(); 105 auto __end = _M_nfa->_M_insert_dummy(); 106 __alt1._M_append(__end); 107 __alt2._M_append(__end); 108 // __alt2 is state._M_next, __alt1 is state._M_alt. The executor 109 // executes _M_alt before _M_next, as well as executing left 110 // alternative before right one. 111 _M_stack.push(_StateSeqT(*_M_nfa, 112 _M_nfa->_M_insert_alt( 113 __alt2._M_start, __alt1._M_start, false), 114 __end)); 115 } 116 } 117 118 template<typename _TraitsT> 119 void 120 _Compiler<_TraitsT>:: _M_alternative()121 _M_alternative() 122 { 123 if (this->_M_term()) 124 { 125 _StateSeqT __re = _M_pop(); 126 this->_M_alternative(); 127 __re._M_append(_M_pop()); 128 _M_stack.push(__re); 129 } 130 else 131 _M_stack.push(_StateSeqT(*_M_nfa, _M_nfa->_M_insert_dummy())); 132 } 133 134 template<typename _TraitsT> 135 bool 136 _Compiler<_TraitsT>:: _M_term()137 _M_term() 138 { 139 if (this->_M_assertion()) 140 return true; 141 if (this->_M_atom()) 142 { 143 while (this->_M_quantifier()); 144 return true; 145 } 146 return false; 147 } 148 149 template<typename _TraitsT> 150 bool 151 _Compiler<_TraitsT>:: _M_assertion()152 _M_assertion() 153 { 154 if (_M_match_token(_ScannerT::_S_token_line_begin)) 155 _M_stack.push(_StateSeqT(*_M_nfa, _M_nfa->_M_insert_line_begin())); 156 else if (_M_match_token(_ScannerT::_S_token_line_end)) 157 _M_stack.push(_StateSeqT(*_M_nfa, _M_nfa->_M_insert_line_end())); 158 else if (_M_match_token(_ScannerT::_S_token_word_bound)) 159 // _M_value[0] == 'n' means it's negative, say "not word boundary". 160 _M_stack.push(_StateSeqT(*_M_nfa, _M_nfa-> 161 _M_insert_word_bound(_M_value[0] == 'n'))); 162 else if (_M_match_token(_ScannerT::_S_token_subexpr_lookahead_begin)) 163 { 164 auto __neg = _M_value[0] == 'n'; 165 this->_M_disjunction(); 166 if (!_M_match_token(_ScannerT::_S_token_subexpr_end)) 167 __throw_regex_error(regex_constants::error_paren, 168 "Parenthesis is not closed."); 169 auto __tmp = _M_pop(); 170 __tmp._M_append(_M_nfa->_M_insert_accept()); 171 _M_stack.push( 172 _StateSeqT( 173 *_M_nfa, 174 _M_nfa->_M_insert_lookahead(__tmp._M_start, __neg))); 175 } 176 else 177 return false; 178 return true; 179 } 180 181 template<typename _TraitsT> 182 bool 183 _Compiler<_TraitsT>:: _M_quantifier()184 _M_quantifier() 185 { 186 bool __neg = (_M_flags & regex_constants::ECMAScript); 187 auto __init = [this, &__neg]() 188 { 189 if (_M_stack.empty()) 190 __throw_regex_error(regex_constants::error_badrepeat, 191 "Nothing to repeat before a quantifier."); 192 __neg = __neg && _M_match_token(_ScannerT::_S_token_opt); 193 }; 194 if (_M_match_token(_ScannerT::_S_token_closure0)) 195 { 196 __init(); 197 auto __e = _M_pop(); 198 _StateSeqT __r(*_M_nfa, 199 _M_nfa->_M_insert_repeat(_S_invalid_state_id, 200 __e._M_start, __neg)); 201 __e._M_append(__r); 202 _M_stack.push(__r); 203 } 204 else if (_M_match_token(_ScannerT::_S_token_closure1)) 205 { 206 __init(); 207 auto __e = _M_pop(); 208 __e._M_append(_M_nfa->_M_insert_repeat(_S_invalid_state_id, 209 __e._M_start, __neg)); 210 _M_stack.push(__e); 211 } 212 else if (_M_match_token(_ScannerT::_S_token_opt)) 213 { 214 __init(); 215 auto __e = _M_pop(); 216 auto __end = _M_nfa->_M_insert_dummy(); 217 _StateSeqT __r(*_M_nfa, 218 _M_nfa->_M_insert_repeat(_S_invalid_state_id, 219 __e._M_start, __neg)); 220 __e._M_append(__end); 221 __r._M_append(__end); 222 _M_stack.push(__r); 223 } 224 else if (_M_match_token(_ScannerT::_S_token_interval_begin)) 225 { 226 if (_M_stack.empty()) 227 __throw_regex_error(regex_constants::error_badrepeat, 228 "Nothing to repeat before a quantifier."); 229 if (!_M_match_token(_ScannerT::_S_token_dup_count)) 230 __throw_regex_error(regex_constants::error_badbrace, 231 "Unexpected token in brace expression."); 232 _StateSeqT __r(_M_pop()); 233 _StateSeqT __e(*_M_nfa, _M_nfa->_M_insert_dummy()); 234 long __min_rep = _M_cur_int_value(10); 235 bool __infi = false; 236 long __n; 237 238 // {3 239 if (_M_match_token(_ScannerT::_S_token_comma)) 240 if (_M_match_token(_ScannerT::_S_token_dup_count)) // {3,7} 241 __n = _M_cur_int_value(10) - __min_rep; 242 else 243 __infi = true; 244 else 245 __n = 0; 246 if (!_M_match_token(_ScannerT::_S_token_interval_end)) 247 __throw_regex_error(regex_constants::error_brace, 248 "Unexpected end of brace expression."); 249 250 __neg = __neg && _M_match_token(_ScannerT::_S_token_opt); 251 252 for (long __i = 0; __i < __min_rep; ++__i) 253 __e._M_append(__r._M_clone()); 254 255 if (__infi) 256 { 257 auto __tmp = __r._M_clone(); 258 _StateSeqT __s(*_M_nfa, 259 _M_nfa->_M_insert_repeat(_S_invalid_state_id, 260 __tmp._M_start, __neg)); 261 __tmp._M_append(__s); 262 __e._M_append(__s); 263 } 264 else 265 { 266 if (__n < 0) 267 __throw_regex_error(regex_constants::error_badbrace, 268 "Invalid range in brace expression."); 269 auto __end = _M_nfa->_M_insert_dummy(); 270 // _M_alt is the "match more" branch, and _M_next is the 271 // "match less" one. Switch _M_alt and _M_next of all created 272 // nodes. This is a hack but IMO works well. 273 std::stack<_StateIdT> __stack; 274 for (long __i = 0; __i < __n; ++__i) 275 { 276 auto __tmp = __r._M_clone(); 277 auto __alt = _M_nfa->_M_insert_repeat(__tmp._M_start, 278 __end, __neg); 279 __stack.push(__alt); 280 __e._M_append(_StateSeqT(*_M_nfa, __alt, __tmp._M_end)); 281 } 282 __e._M_append(__end); 283 while (!__stack.empty()) 284 { 285 auto& __tmp = (*_M_nfa)[__stack.top()]; 286 __stack.pop(); 287 std::swap(__tmp._M_next, __tmp._M_alt); 288 } 289 } 290 _M_stack.push(__e); 291 } 292 else 293 return false; 294 return true; 295 } 296 297 #define __INSERT_REGEX_MATCHER(__func, ...)\ 298 do {\ 299 if (!(_M_flags & regex_constants::icase))\ 300 if (!(_M_flags & regex_constants::collate))\ 301 __func<false, false>(__VA_ARGS__);\ 302 else\ 303 __func<false, true>(__VA_ARGS__);\ 304 else\ 305 if (!(_M_flags & regex_constants::collate))\ 306 __func<true, false>(__VA_ARGS__);\ 307 else\ 308 __func<true, true>(__VA_ARGS__);\ 309 } while (false) 310 311 template<typename _TraitsT> 312 bool 313 _Compiler<_TraitsT>:: _M_atom()314 _M_atom() 315 { 316 if (_M_match_token(_ScannerT::_S_token_anychar)) 317 { 318 if (!(_M_flags & regex_constants::ECMAScript)) 319 __INSERT_REGEX_MATCHER(_M_insert_any_matcher_posix); 320 else 321 __INSERT_REGEX_MATCHER(_M_insert_any_matcher_ecma); 322 } 323 else if (_M_try_char()) 324 __INSERT_REGEX_MATCHER(_M_insert_char_matcher); 325 else if (_M_match_token(_ScannerT::_S_token_backref)) 326 _M_stack.push(_StateSeqT(*_M_nfa, _M_nfa-> 327 _M_insert_backref(_M_cur_int_value(10)))); 328 else if (_M_match_token(_ScannerT::_S_token_quoted_class)) 329 __INSERT_REGEX_MATCHER(_M_insert_character_class_matcher); 330 else if (_M_match_token(_ScannerT::_S_token_subexpr_no_group_begin)) 331 { 332 _StateSeqT __r(*_M_nfa, _M_nfa->_M_insert_dummy()); 333 this->_M_disjunction(); 334 if (!_M_match_token(_ScannerT::_S_token_subexpr_end)) 335 __throw_regex_error(regex_constants::error_paren, 336 "Parenthesis is not closed."); 337 __r._M_append(_M_pop()); 338 _M_stack.push(__r); 339 } 340 else if (_M_match_token(_ScannerT::_S_token_subexpr_begin)) 341 { 342 _StateSeqT __r(*_M_nfa, _M_nfa->_M_insert_subexpr_begin()); 343 this->_M_disjunction(); 344 if (!_M_match_token(_ScannerT::_S_token_subexpr_end)) 345 __throw_regex_error(regex_constants::error_paren, 346 "Parenthesis is not closed."); 347 __r._M_append(_M_pop()); 348 __r._M_append(_M_nfa->_M_insert_subexpr_end()); 349 _M_stack.push(__r); 350 } 351 else if (!_M_bracket_expression()) 352 return false; 353 return true; 354 } 355 356 template<typename _TraitsT> 357 bool 358 _Compiler<_TraitsT>:: _M_bracket_expression()359 _M_bracket_expression() 360 { 361 bool __neg = 362 _M_match_token(_ScannerT::_S_token_bracket_neg_begin); 363 if (!(__neg || _M_match_token(_ScannerT::_S_token_bracket_begin))) 364 return false; 365 __INSERT_REGEX_MATCHER(_M_insert_bracket_matcher, __neg); 366 return true; 367 } 368 #undef __INSERT_REGEX_MATCHER 369 370 template<typename _TraitsT> 371 template<bool __icase, bool __collate> 372 void 373 _Compiler<_TraitsT>:: _M_insert_any_matcher_ecma()374 _M_insert_any_matcher_ecma() 375 { 376 _M_stack.push(_StateSeqT(*_M_nfa, 377 _M_nfa->_M_insert_matcher 378 (_AnyMatcher<_TraitsT, true, __icase, __collate> 379 (_M_traits)))); 380 } 381 382 template<typename _TraitsT> 383 template<bool __icase, bool __collate> 384 void 385 _Compiler<_TraitsT>:: _M_insert_any_matcher_posix()386 _M_insert_any_matcher_posix() 387 { 388 _M_stack.push(_StateSeqT(*_M_nfa, 389 _M_nfa->_M_insert_matcher 390 (_AnyMatcher<_TraitsT, false, __icase, __collate> 391 (_M_traits)))); 392 } 393 394 template<typename _TraitsT> 395 template<bool __icase, bool __collate> 396 void 397 _Compiler<_TraitsT>:: _M_insert_char_matcher()398 _M_insert_char_matcher() 399 { 400 _M_stack.push(_StateSeqT(*_M_nfa, 401 _M_nfa->_M_insert_matcher 402 (_CharMatcher<_TraitsT, __icase, __collate> 403 (_M_value[0], _M_traits)))); 404 } 405 406 template<typename _TraitsT> 407 template<bool __icase, bool __collate> 408 void 409 _Compiler<_TraitsT>:: _M_insert_character_class_matcher()410 _M_insert_character_class_matcher() 411 { 412 __glibcxx_assert(_M_value.size() == 1); 413 _BracketMatcher<_TraitsT, __icase, __collate> __matcher 414 (_M_ctype.is(_CtypeT::upper, _M_value[0]), _M_traits); 415 __matcher._M_add_character_class(_M_value, false); 416 __matcher._M_ready(); 417 _M_stack.push(_StateSeqT(*_M_nfa, 418 _M_nfa->_M_insert_matcher(std::move(__matcher)))); 419 } 420 421 template<typename _TraitsT> 422 template<bool __icase, bool __collate> 423 void 424 _Compiler<_TraitsT>:: _M_insert_bracket_matcher(bool __neg)425 _M_insert_bracket_matcher(bool __neg) 426 { 427 _BracketMatcher<_TraitsT, __icase, __collate> __matcher(__neg, _M_traits); 428 pair<bool, _CharT> __last_char; // Optional<_CharT> 429 __last_char.first = false; 430 if (!(_M_flags & regex_constants::ECMAScript)) 431 { 432 if (_M_try_char()) 433 { 434 __last_char.first = true; 435 __last_char.second = _M_value[0]; 436 } 437 else if (_M_match_token(_ScannerT::_S_token_bracket_dash)) 438 { 439 __last_char.first = true; 440 __last_char.second = '-'; 441 } 442 } 443 while (_M_expression_term(__last_char, __matcher)); 444 if (__last_char.first) 445 __matcher._M_add_char(__last_char.second); 446 __matcher._M_ready(); 447 _M_stack.push(_StateSeqT( 448 *_M_nfa, 449 _M_nfa->_M_insert_matcher(std::move(__matcher)))); 450 } 451 452 template<typename _TraitsT> 453 template<bool __icase, bool __collate> 454 bool 455 _Compiler<_TraitsT>:: _M_expression_term(pair<bool,_CharT> & __last_char,_BracketMatcher<_TraitsT,__icase,__collate> & __matcher)456 _M_expression_term(pair<bool, _CharT>& __last_char, 457 _BracketMatcher<_TraitsT, __icase, __collate>& __matcher) 458 { 459 if (_M_match_token(_ScannerT::_S_token_bracket_end)) 460 return false; 461 462 const auto __push_char = [&](_CharT __ch) 463 { 464 if (__last_char.first) 465 __matcher._M_add_char(__last_char.second); 466 else 467 __last_char.first = true; 468 __last_char.second = __ch; 469 }; 470 const auto __flush = [&] 471 { 472 if (__last_char.first) 473 { 474 __matcher._M_add_char(__last_char.second); 475 __last_char.first = false; 476 } 477 }; 478 479 if (_M_match_token(_ScannerT::_S_token_collsymbol)) 480 { 481 auto __symbol = __matcher._M_add_collate_element(_M_value); 482 if (__symbol.size() == 1) 483 __push_char(__symbol[0]); 484 else 485 __flush(); 486 } 487 else if (_M_match_token(_ScannerT::_S_token_equiv_class_name)) 488 { 489 __flush(); 490 __matcher._M_add_equivalence_class(_M_value); 491 } 492 else if (_M_match_token(_ScannerT::_S_token_char_class_name)) 493 { 494 __flush(); 495 __matcher._M_add_character_class(_M_value, false); 496 } 497 else if (_M_try_char()) 498 __push_char(_M_value[0]); 499 // POSIX doesn't allow '-' as a start-range char (say [a-z--0]), 500 // except when the '-' is the first or last character in the bracket 501 // expression ([--0]). ECMAScript treats all '-' after a range as a 502 // normal character. Also see above, where _M_expression_term gets called. 503 // 504 // As a result, POSIX rejects [-----], but ECMAScript doesn't. 505 // Boost (1.57.0) always uses POSIX style even in its ECMAScript syntax. 506 // Clang (3.5) always uses ECMAScript style even in its POSIX syntax. 507 // 508 // It turns out that no one reads BNFs ;) 509 else if (_M_match_token(_ScannerT::_S_token_bracket_dash)) 510 { 511 if (!__last_char.first) 512 { 513 if (!(_M_flags & regex_constants::ECMAScript)) 514 { 515 if (_M_match_token(_ScannerT::_S_token_bracket_end)) 516 { 517 __push_char('-'); 518 return false; 519 } 520 __throw_regex_error( 521 regex_constants::error_range, 522 "Unexpected dash in bracket expression. For POSIX syntax, " 523 "a dash is not treated literally only when it is at " 524 "beginning or end."); 525 } 526 __push_char('-'); 527 } 528 else 529 { 530 if (_M_try_char()) 531 { 532 __matcher._M_make_range(__last_char.second, _M_value[0]); 533 __last_char.first = false; 534 } 535 else if (_M_match_token(_ScannerT::_S_token_bracket_dash)) 536 { 537 __matcher._M_make_range(__last_char.second, '-'); 538 __last_char.first = false; 539 } 540 else 541 { 542 if (_M_scanner._M_get_token() 543 != _ScannerT::_S_token_bracket_end) 544 __throw_regex_error( 545 regex_constants::error_range, 546 "Character is expected after a dash."); 547 __push_char('-'); 548 } 549 } 550 } 551 else if (_M_match_token(_ScannerT::_S_token_quoted_class)) 552 { 553 __flush(); 554 __matcher._M_add_character_class(_M_value, 555 _M_ctype.is(_CtypeT::upper, 556 _M_value[0])); 557 } 558 else 559 __throw_regex_error(regex_constants::error_brack, 560 "Unexpected character in bracket expression."); 561 562 return true; 563 } 564 565 template<typename _TraitsT> 566 bool 567 _Compiler<_TraitsT>:: _M_try_char()568 _M_try_char() 569 { 570 bool __is_char = false; 571 if (_M_match_token(_ScannerT::_S_token_oct_num)) 572 { 573 __is_char = true; 574 _M_value.assign(1, _M_cur_int_value(8)); 575 } 576 else if (_M_match_token(_ScannerT::_S_token_hex_num)) 577 { 578 __is_char = true; 579 _M_value.assign(1, _M_cur_int_value(16)); 580 } 581 else if (_M_match_token(_ScannerT::_S_token_ord_char)) 582 __is_char = true; 583 return __is_char; 584 } 585 586 template<typename _TraitsT> 587 bool 588 _Compiler<_TraitsT>:: _M_match_token(_TokenT token)589 _M_match_token(_TokenT token) 590 { 591 if (token == _M_scanner._M_get_token()) 592 { 593 _M_value = _M_scanner._M_get_value(); 594 _M_scanner._M_advance(); 595 return true; 596 } 597 return false; 598 } 599 600 template<typename _TraitsT> 601 int 602 _Compiler<_TraitsT>:: _M_cur_int_value(int __radix)603 _M_cur_int_value(int __radix) 604 { 605 long __v = 0; 606 for (typename _StringT::size_type __i = 0; 607 __i < _M_value.length(); ++__i) 608 __v =__v * __radix + _M_traits.value(_M_value[__i], __radix); 609 return __v; 610 } 611 612 template<typename _TraitsT, bool __icase, bool __collate> 613 bool 614 _BracketMatcher<_TraitsT, __icase, __collate>:: _M_apply(_CharT __ch,false_type) const615 _M_apply(_CharT __ch, false_type) const 616 { 617 return [this, __ch] 618 { 619 if (std::binary_search(_M_char_set.begin(), _M_char_set.end(), 620 _M_translator._M_translate(__ch))) 621 return true; 622 auto __s = _M_translator._M_transform(__ch); 623 for (auto& __it : _M_range_set) 624 if (_M_translator._M_match_range(__it.first, __it.second, __s)) 625 return true; 626 if (_M_traits.isctype(__ch, _M_class_set)) 627 return true; 628 if (std::find(_M_equiv_set.begin(), _M_equiv_set.end(), 629 _M_traits.transform_primary(&__ch, &__ch+1)) 630 != _M_equiv_set.end()) 631 return true; 632 for (auto& __it : _M_neg_class_set) 633 if (!_M_traits.isctype(__ch, __it)) 634 return true; 635 return false; 636 }() ^ _M_is_non_matching; 637 } 638 } // namespace __detail 639 640 _GLIBCXX_END_NAMESPACE_VERSION 641 } // namespace 642