1 // class template regex -*- C++ -*- 2 3 // Copyright (C) 2010, 2011 Free Software Foundation, Inc. 4 // 5 // This file is part of the GNU ISO C++ Library. This library is free 6 // software; you can redistribute it and/or modify it under the 7 // terms of the GNU General Public License as published by the 8 // Free Software Foundation; either version 3, or (at your option) 9 // any later version. 10 11 // This library is distributed in the hope that it will be useful, 12 // but WITHOUT ANY WARRANTY; without even the implied warranty of 13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 // GNU General Public License for more details. 15 16 // Under Section 7 of GPL version 3, you are granted additional 17 // permissions described in the GCC Runtime Library Exception, version 18 // 3.1, as published by the Free Software Foundation. 19 20 // You should have received a copy of the GNU General Public License and 21 // a copy of the GCC Runtime Library Exception along with this program; 22 // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see 23 // <http://www.gnu.org/licenses/>. 24 25 /** 26 * @file bits/regex_compiler.h 27 * This is an internal header file, included by other library headers. 28 * Do not attempt to use it directly. @headername{regex} 29 */ 30 31 namespace std _GLIBCXX_VISIBILITY(default) 32 { 33 namespace __regex 34 { 35 _GLIBCXX_BEGIN_NAMESPACE_VERSION 36 37 struct _Scanner_base 38 { 39 typedef unsigned int _StateT; 40 41 static constexpr _StateT _S_state_at_start = 1 << 0; 42 static constexpr _StateT _S_state_in_brace = 1 << 2; 43 static constexpr _StateT _S_state_in_bracket = 1 << 3; 44 45 virtual ~_Scanner_base() { }; 46 }; 47 48 // 49 // @brief Scans an input range for regex tokens. 50 // 51 // The %_Scanner class interprets the regular expression pattern in the input 52 // range passed to its constructor as a sequence of parse tokens passed to 53 // the regular expression compiler. The sequence of tokens provided depends 54 // on the flag settings passed to the constructor: different regular 55 // expression grammars will interpret the same input pattern in 56 // syntactically different ways. 57 // 58 template<typename _InputIterator> 59 class _Scanner: public _Scanner_base 60 { 61 public: 62 typedef _InputIterator _IteratorT; 63 typedef typename std::iterator_traits<_IteratorT>::value_type _CharT; 64 typedef std::basic_string<_CharT> _StringT; 65 typedef regex_constants::syntax_option_type _FlagT; 66 typedef const std::ctype<_CharT> _CtypeT; 67 68 // Token types returned from the scanner. 69 enum _TokenT 70 { 71 _S_token_anychar, 72 _S_token_backref, 73 _S_token_bracket_begin, 74 _S_token_bracket_end, 75 _S_token_inverse_class, 76 _S_token_char_class_name, 77 _S_token_closure0, 78 _S_token_closure1, 79 _S_token_collelem_multi, 80 _S_token_collelem_single, 81 _S_token_collsymbol, 82 _S_token_comma, 83 _S_token_dash, 84 _S_token_dup_count, 85 _S_token_eof, 86 _S_token_equiv_class_name, 87 _S_token_interval_begin, 88 _S_token_interval_end, 89 _S_token_line_begin, 90 _S_token_line_end, 91 _S_token_opt, 92 _S_token_or, 93 _S_token_ord_char, 94 _S_token_quoted_char, 95 _S_token_subexpr_begin, 96 _S_token_subexpr_end, 97 _S_token_word_begin, 98 _S_token_word_end, 99 _S_token_unknown 100 }; 101 102 public: 103 _Scanner(_IteratorT __begin, _IteratorT __end, _FlagT __flags, 104 std::locale __loc) 105 : _M_current(__begin) , _M_end(__end) , _M_flags(__flags), 106 _M_ctype(std::use_facet<_CtypeT>(__loc)), _M_state(_S_state_at_start) 107 { _M_advance(); } 108 109 void 110 _M_advance(); 111 112 _TokenT 113 _M_token() const 114 { return _M_curToken; } 115 116 const _StringT& 117 _M_value() const 118 { return _M_curValue; } 119 120 #ifdef _GLIBCXX_DEBUG 121 std::ostream& 122 _M_print(std::ostream&); 123 #endif 124 125 private: 126 void 127 _M_eat_escape(); 128 129 void 130 _M_scan_in_brace(); 131 132 void 133 _M_scan_in_bracket(); 134 135 void 136 _M_eat_charclass(); 137 138 void 139 _M_eat_equivclass(); 140 141 void 142 _M_eat_collsymbol(); 143 144 private: 145 _IteratorT _M_current; 146 _IteratorT _M_end; 147 _FlagT _M_flags; 148 _CtypeT& _M_ctype; 149 _TokenT _M_curToken; 150 _StringT _M_curValue; 151 _StateT _M_state; 152 }; 153 154 template<typename _InputIterator> 155 void 156 _Scanner<_InputIterator>:: 157 _M_advance() 158 { 159 if (_M_current == _M_end) 160 { 161 _M_curToken = _S_token_eof; 162 return; 163 } 164 165 _CharT __c = *_M_current; 166 if (_M_state & _S_state_in_bracket) 167 { 168 _M_scan_in_bracket(); 169 return; 170 } 171 if (_M_state & _S_state_in_brace) 172 { 173 _M_scan_in_brace(); 174 return; 175 } 176 #if 0 177 // TODO: re-enable line anchors when _M_assertion is implemented. 178 // See PR libstdc++/47724 179 else if (_M_state & _S_state_at_start && __c == _M_ctype.widen('^')) 180 { 181 _M_curToken = _S_token_line_begin; 182 ++_M_current; 183 return; 184 } 185 else if (__c == _M_ctype.widen('$')) 186 { 187 _M_curToken = _S_token_line_end; 188 ++_M_current; 189 return; 190 } 191 #endif 192 else if (__c == _M_ctype.widen('.')) 193 { 194 _M_curToken = _S_token_anychar; 195 ++_M_current; 196 return; 197 } 198 else if (__c == _M_ctype.widen('*')) 199 { 200 _M_curToken = _S_token_closure0; 201 ++_M_current; 202 return; 203 } 204 else if (__c == _M_ctype.widen('+')) 205 { 206 _M_curToken = _S_token_closure1; 207 ++_M_current; 208 return; 209 } 210 else if (__c == _M_ctype.widen('|')) 211 { 212 _M_curToken = _S_token_or; 213 ++_M_current; 214 return; 215 } 216 else if (__c == _M_ctype.widen('[')) 217 { 218 _M_curToken = _S_token_bracket_begin; 219 _M_state |= (_S_state_in_bracket | _S_state_at_start); 220 ++_M_current; 221 return; 222 } 223 else if (__c == _M_ctype.widen('\\')) 224 { 225 _M_eat_escape(); 226 return; 227 } 228 else if (!(_M_flags & (regex_constants::basic | regex_constants::grep))) 229 { 230 if (__c == _M_ctype.widen('(')) 231 { 232 _M_curToken = _S_token_subexpr_begin; 233 ++_M_current; 234 return; 235 } 236 else if (__c == _M_ctype.widen(')')) 237 { 238 _M_curToken = _S_token_subexpr_end; 239 ++_M_current; 240 return; 241 } 242 else if (__c == _M_ctype.widen('{')) 243 { 244 _M_curToken = _S_token_interval_begin; 245 _M_state |= _S_state_in_brace; 246 ++_M_current; 247 return; 248 } 249 } 250 251 _M_curToken = _S_token_ord_char; 252 _M_curValue.assign(1, __c); 253 ++_M_current; 254 } 255 256 257 template<typename _InputIterator> 258 void 259 _Scanner<_InputIterator>:: 260 _M_scan_in_brace() 261 { 262 if (_M_ctype.is(_CtypeT::digit, *_M_current)) 263 { 264 _M_curToken = _S_token_dup_count; 265 _M_curValue.assign(1, *_M_current); 266 ++_M_current; 267 while (_M_current != _M_end 268 && _M_ctype.is(_CtypeT::digit, *_M_current)) 269 { 270 _M_curValue += *_M_current; 271 ++_M_current; 272 } 273 return; 274 } 275 else if (*_M_current == _M_ctype.widen(',')) 276 { 277 _M_curToken = _S_token_comma; 278 ++_M_current; 279 return; 280 } 281 if (_M_flags & (regex_constants::basic | regex_constants::grep)) 282 { 283 if (*_M_current == _M_ctype.widen('\\')) 284 _M_eat_escape(); 285 } 286 else 287 { 288 if (*_M_current == _M_ctype.widen('}')) 289 { 290 _M_curToken = _S_token_interval_end; 291 _M_state &= ~_S_state_in_brace; 292 ++_M_current; 293 return; 294 } 295 } 296 } 297 298 template<typename _InputIterator> 299 void 300 _Scanner<_InputIterator>:: 301 _M_scan_in_bracket() 302 { 303 if (_M_state & _S_state_at_start && *_M_current == _M_ctype.widen('^')) 304 { 305 _M_curToken = _S_token_inverse_class; 306 _M_state &= ~_S_state_at_start; 307 ++_M_current; 308 return; 309 } 310 else if (*_M_current == _M_ctype.widen('[')) 311 { 312 ++_M_current; 313 if (_M_current == _M_end) 314 { 315 _M_curToken = _S_token_eof; 316 return; 317 } 318 319 if (*_M_current == _M_ctype.widen('.')) 320 { 321 _M_curToken = _S_token_collsymbol; 322 _M_eat_collsymbol(); 323 return; 324 } 325 else if (*_M_current == _M_ctype.widen(':')) 326 { 327 _M_curToken = _S_token_char_class_name; 328 _M_eat_charclass(); 329 return; 330 } 331 else if (*_M_current == _M_ctype.widen('=')) 332 { 333 _M_curToken = _S_token_equiv_class_name; 334 _M_eat_equivclass(); 335 return; 336 } 337 } 338 else if (*_M_current == _M_ctype.widen('-')) 339 { 340 _M_curToken = _S_token_dash; 341 ++_M_current; 342 return; 343 } 344 else if (*_M_current == _M_ctype.widen(']')) 345 { 346 if (!(_M_flags & regex_constants::ECMAScript) 347 || !(_M_state & _S_state_at_start)) 348 { 349 // special case: only if _not_ chr first after 350 // '[' or '[^' and if not ECMAscript 351 _M_curToken = _S_token_bracket_end; 352 ++_M_current; 353 return; 354 } 355 } 356 _M_curToken = _S_token_collelem_single; 357 _M_curValue.assign(1, *_M_current); 358 ++_M_current; 359 } 360 361 template<typename _InputIterator> 362 void 363 _Scanner<_InputIterator>:: 364 _M_eat_escape() 365 { 366 ++_M_current; 367 if (_M_current == _M_end) 368 { 369 _M_curToken = _S_token_eof; 370 return; 371 } 372 _CharT __c = *_M_current; 373 ++_M_current; 374 375 if (__c == _M_ctype.widen('(')) 376 { 377 if (!(_M_flags & (regex_constants::basic | regex_constants::grep))) 378 { 379 _M_curToken = _S_token_ord_char; 380 _M_curValue.assign(1, __c); 381 } 382 else 383 _M_curToken = _S_token_subexpr_begin; 384 } 385 else if (__c == _M_ctype.widen(')')) 386 { 387 if (!(_M_flags & (regex_constants::basic | regex_constants::grep))) 388 { 389 _M_curToken = _S_token_ord_char; 390 _M_curValue.assign(1, __c); 391 } 392 else 393 _M_curToken = _S_token_subexpr_end; 394 } 395 else if (__c == _M_ctype.widen('{')) 396 { 397 if (!(_M_flags & (regex_constants::basic | regex_constants::grep))) 398 { 399 _M_curToken = _S_token_ord_char; 400 _M_curValue.assign(1, __c); 401 } 402 else 403 { 404 _M_curToken = _S_token_interval_begin; 405 _M_state |= _S_state_in_brace; 406 } 407 } 408 else if (__c == _M_ctype.widen('}')) 409 { 410 if (!(_M_flags & (regex_constants::basic | regex_constants::grep))) 411 { 412 _M_curToken = _S_token_ord_char; 413 _M_curValue.assign(1, __c); 414 } 415 else 416 { 417 if (!(_M_state && _S_state_in_brace)) 418 __throw_regex_error(regex_constants::error_badbrace); 419 _M_state &= ~_S_state_in_brace; 420 _M_curToken = _S_token_interval_end; 421 } 422 } 423 else if (__c == _M_ctype.widen('x')) 424 { 425 ++_M_current; 426 if (_M_current == _M_end) 427 { 428 _M_curToken = _S_token_eof; 429 return; 430 } 431 if (_M_ctype.is(_CtypeT::digit, *_M_current)) 432 { 433 _M_curValue.assign(1, *_M_current); 434 ++_M_current; 435 if (_M_current == _M_end) 436 { 437 _M_curToken = _S_token_eof; 438 return; 439 } 440 if (_M_ctype.is(_CtypeT::digit, *_M_current)) 441 { 442 _M_curValue += *_M_current; 443 ++_M_current; 444 return; 445 } 446 } 447 } 448 else if (__c == _M_ctype.widen('^') 449 || __c == _M_ctype.widen('.') 450 || __c == _M_ctype.widen('*') 451 || __c == _M_ctype.widen('$') 452 || __c == _M_ctype.widen('\\')) 453 { 454 _M_curToken = _S_token_ord_char; 455 _M_curValue.assign(1, __c); 456 } 457 else if (_M_ctype.is(_CtypeT::digit, __c)) 458 { 459 _M_curToken = _S_token_backref; 460 _M_curValue.assign(1, __c); 461 } 462 else 463 __throw_regex_error(regex_constants::error_escape); 464 } 465 466 467 // Eats a character class or throwns an exception. 468 // current point to ':' delimiter on entry, char after ']' on return 469 template<typename _InputIterator> 470 void 471 _Scanner<_InputIterator>:: 472 _M_eat_charclass() 473 { 474 ++_M_current; // skip ':' 475 if (_M_current == _M_end) 476 __throw_regex_error(regex_constants::error_ctype); 477 for (_M_curValue.clear(); 478 _M_current != _M_end && *_M_current != _M_ctype.widen(':'); 479 ++_M_current) 480 _M_curValue += *_M_current; 481 if (_M_current == _M_end) 482 __throw_regex_error(regex_constants::error_ctype); 483 ++_M_current; // skip ':' 484 if (*_M_current != _M_ctype.widen(']')) 485 __throw_regex_error(regex_constants::error_ctype); 486 ++_M_current; // skip ']' 487 } 488 489 490 template<typename _InputIterator> 491 void 492 _Scanner<_InputIterator>:: 493 _M_eat_equivclass() 494 { 495 ++_M_current; // skip '=' 496 if (_M_current == _M_end) 497 __throw_regex_error(regex_constants::error_collate); 498 for (_M_curValue.clear(); 499 _M_current != _M_end && *_M_current != _M_ctype.widen('='); 500 ++_M_current) 501 _M_curValue += *_M_current; 502 if (_M_current == _M_end) 503 __throw_regex_error(regex_constants::error_collate); 504 ++_M_current; // skip '=' 505 if (*_M_current != _M_ctype.widen(']')) 506 __throw_regex_error(regex_constants::error_collate); 507 ++_M_current; // skip ']' 508 } 509 510 511 template<typename _InputIterator> 512 void 513 _Scanner<_InputIterator>:: 514 _M_eat_collsymbol() 515 { 516 ++_M_current; // skip '.' 517 if (_M_current == _M_end) 518 __throw_regex_error(regex_constants::error_collate); 519 for (_M_curValue.clear(); 520 _M_current != _M_end && *_M_current != _M_ctype.widen('.'); 521 ++_M_current) 522 _M_curValue += *_M_current; 523 if (_M_current == _M_end) 524 __throw_regex_error(regex_constants::error_collate); 525 ++_M_current; // skip '.' 526 if (*_M_current != _M_ctype.widen(']')) 527 __throw_regex_error(regex_constants::error_collate); 528 ++_M_current; // skip ']' 529 } 530 531 #ifdef _GLIBCXX_DEBUG 532 template<typename _InputIterator> 533 std::ostream& 534 _Scanner<_InputIterator>:: 535 _M_print(std::ostream& ostr) 536 { 537 switch (_M_curToken) 538 { 539 case _S_token_anychar: 540 ostr << "any-character\n"; 541 break; 542 case _S_token_backref: 543 ostr << "backref\n"; 544 break; 545 case _S_token_bracket_begin: 546 ostr << "bracket-begin\n"; 547 break; 548 case _S_token_bracket_end: 549 ostr << "bracket-end\n"; 550 break; 551 case _S_token_char_class_name: 552 ostr << "char-class-name \"" << _M_curValue << "\"\n"; 553 break; 554 case _S_token_closure0: 555 ostr << "closure0\n"; 556 break; 557 case _S_token_closure1: 558 ostr << "closure1\n"; 559 break; 560 case _S_token_collelem_multi: 561 ostr << "coll-elem-multi \"" << _M_curValue << "\"\n"; 562 break; 563 case _S_token_collelem_single: 564 ostr << "coll-elem-single \"" << _M_curValue << "\"\n"; 565 break; 566 case _S_token_collsymbol: 567 ostr << "collsymbol \"" << _M_curValue << "\"\n"; 568 break; 569 case _S_token_comma: 570 ostr << "comma\n"; 571 break; 572 case _S_token_dash: 573 ostr << "dash\n"; 574 break; 575 case _S_token_dup_count: 576 ostr << "dup count: " << _M_curValue << "\n"; 577 break; 578 case _S_token_eof: 579 ostr << "EOF\n"; 580 break; 581 case _S_token_equiv_class_name: 582 ostr << "equiv-class-name \"" << _M_curValue << "\"\n"; 583 break; 584 case _S_token_interval_begin: 585 ostr << "interval begin\n"; 586 break; 587 case _S_token_interval_end: 588 ostr << "interval end\n"; 589 break; 590 case _S_token_line_begin: 591 ostr << "line begin\n"; 592 break; 593 case _S_token_line_end: 594 ostr << "line end\n"; 595 break; 596 case _S_token_opt: 597 ostr << "opt\n"; 598 break; 599 case _S_token_or: 600 ostr << "or\n"; 601 break; 602 case _S_token_ord_char: 603 ostr << "ordinary character: \"" << _M_value() << "\"\n"; 604 break; 605 case _S_token_quoted_char: 606 ostr << "quoted char\n"; 607 break; 608 case _S_token_subexpr_begin: 609 ostr << "subexpr begin\n"; 610 break; 611 case _S_token_subexpr_end: 612 ostr << "subexpr end\n"; 613 break; 614 case _S_token_word_begin: 615 ostr << "word begin\n"; 616 break; 617 case _S_token_word_end: 618 ostr << "word end\n"; 619 break; 620 case _S_token_unknown: 621 ostr << "-- unknown token --\n"; 622 break; 623 } 624 return ostr; 625 } 626 #endif 627 628 // Builds an NFA from an input iterator interval. 629 template<typename _InIter, typename _TraitsT> 630 class _Compiler 631 { 632 public: 633 typedef _InIter _IterT; 634 typedef typename std::iterator_traits<_InIter>::value_type _CharT; 635 typedef std::basic_string<_CharT> _StringT; 636 typedef regex_constants::syntax_option_type _FlagT; 637 638 public: 639 _Compiler(const _InIter& __b, const _InIter& __e, 640 _TraitsT& __traits, _FlagT __flags); 641 642 const _Nfa& 643 _M_nfa() const 644 { return _M_state_store; } 645 646 private: 647 typedef _Scanner<_InIter> _ScannerT; 648 typedef typename _ScannerT::_TokenT _TokenT; 649 typedef std::stack<_StateSeq, std::vector<_StateSeq> > _StackT; 650 typedef _RangeMatcher<_InIter, _TraitsT> _RMatcherT; 651 652 // accepts a specific token or returns false. 653 bool 654 _M_match_token(_TokenT __token); 655 656 void 657 _M_disjunction(); 658 659 bool 660 _M_alternative(); 661 662 bool 663 _M_term(); 664 665 bool 666 _M_assertion(); 667 668 bool 669 _M_quantifier(); 670 671 bool 672 _M_atom(); 673 674 bool 675 _M_bracket_expression(); 676 677 bool 678 _M_bracket_list(_RMatcherT& __matcher); 679 680 bool 681 _M_follow_list(_RMatcherT& __matcher); 682 683 bool 684 _M_follow_list2(_RMatcherT& __matcher); 685 686 bool 687 _M_expression_term(_RMatcherT& __matcher); 688 689 bool 690 _M_range_expression(_RMatcherT& __matcher); 691 692 bool 693 _M_start_range(_RMatcherT& __matcher); 694 695 bool 696 _M_collating_symbol(_RMatcherT& __matcher); 697 698 bool 699 _M_equivalence_class(_RMatcherT& __matcher); 700 701 bool 702 _M_character_class(_RMatcherT& __matcher); 703 704 int 705 _M_cur_int_value(int __radix); 706 707 private: 708 _TraitsT& _M_traits; 709 _ScannerT _M_scanner; 710 _StringT _M_cur_value; 711 _Nfa _M_state_store; 712 _StackT _M_stack; 713 }; 714 715 template<typename _InIter, typename _TraitsT> 716 _Compiler<_InIter, _TraitsT>:: 717 _Compiler(const _InIter& __b, const _InIter& __e, _TraitsT& __traits, 718 _Compiler<_InIter, _TraitsT>::_FlagT __flags) 719 : _M_traits(__traits), _M_scanner(__b, __e, __flags, _M_traits.getloc()), 720 _M_state_store(__flags) 721 { 722 typedef _StartTagger<_InIter, _TraitsT> _Start; 723 typedef _EndTagger<_InIter, _TraitsT> _End; 724 725 _StateSeq __r(_M_state_store, 726 _M_state_store._M_insert_subexpr_begin(_Start(0))); 727 _M_disjunction(); 728 if (!_M_stack.empty()) 729 { 730 __r._M_append(_M_stack.top()); 731 _M_stack.pop(); 732 } 733 __r._M_append(_M_state_store._M_insert_subexpr_end(0, _End(0))); 734 __r._M_append(_M_state_store._M_insert_accept()); 735 } 736 737 template<typename _InIter, typename _TraitsT> 738 bool 739 _Compiler<_InIter, _TraitsT>:: 740 _M_match_token(_Compiler<_InIter, _TraitsT>::_TokenT token) 741 { 742 if (token == _M_scanner._M_token()) 743 { 744 _M_cur_value = _M_scanner._M_value(); 745 _M_scanner._M_advance(); 746 return true; 747 } 748 return false; 749 } 750 751 template<typename _InIter, typename _TraitsT> 752 void 753 _Compiler<_InIter, _TraitsT>:: 754 _M_disjunction() 755 { 756 this->_M_alternative(); 757 if (_M_match_token(_ScannerT::_S_token_or)) 758 { 759 _StateSeq __alt1 = _M_stack.top(); _M_stack.pop(); 760 this->_M_disjunction(); 761 _StateSeq __alt2 = _M_stack.top(); _M_stack.pop(); 762 _M_stack.push(_StateSeq(__alt1, __alt2)); 763 } 764 } 765 766 template<typename _InIter, typename _TraitsT> 767 bool 768 _Compiler<_InIter, _TraitsT>:: 769 _M_alternative() 770 { 771 if (this->_M_term()) 772 { 773 _StateSeq __re = _M_stack.top(); _M_stack.pop(); 774 this->_M_alternative(); 775 if (!_M_stack.empty()) 776 { 777 __re._M_append(_M_stack.top()); 778 _M_stack.pop(); 779 } 780 _M_stack.push(__re); 781 return true; 782 } 783 return false; 784 } 785 786 template<typename _InIter, typename _TraitsT> 787 bool 788 _Compiler<_InIter, _TraitsT>:: 789 _M_term() 790 { 791 if (this->_M_assertion()) 792 return true; 793 if (this->_M_atom()) 794 { 795 this->_M_quantifier(); 796 return true; 797 } 798 return false; 799 } 800 801 template<typename _InIter, typename _TraitsT> 802 bool 803 _Compiler<_InIter, _TraitsT>:: 804 _M_assertion() 805 { 806 if (_M_match_token(_ScannerT::_S_token_line_begin)) 807 { 808 // __m.push(_Matcher::_S_opcode_line_begin); 809 return true; 810 } 811 if (_M_match_token(_ScannerT::_S_token_line_end)) 812 { 813 // __m.push(_Matcher::_S_opcode_line_end); 814 return true; 815 } 816 if (_M_match_token(_ScannerT::_S_token_word_begin)) 817 { 818 // __m.push(_Matcher::_S_opcode_word_begin); 819 return true; 820 } 821 if (_M_match_token(_ScannerT::_S_token_word_end)) 822 { 823 // __m.push(_Matcher::_S_opcode_word_end); 824 return true; 825 } 826 return false; 827 } 828 829 template<typename _InIter, typename _TraitsT> 830 bool 831 _Compiler<_InIter, _TraitsT>:: 832 _M_quantifier() 833 { 834 if (_M_match_token(_ScannerT::_S_token_closure0)) 835 { 836 if (_M_stack.empty()) 837 __throw_regex_error(regex_constants::error_badrepeat); 838 _StateSeq __r(_M_stack.top(), -1); 839 __r._M_append(__r._M_front()); 840 _M_stack.pop(); 841 _M_stack.push(__r); 842 return true; 843 } 844 if (_M_match_token(_ScannerT::_S_token_closure1)) 845 { 846 if (_M_stack.empty()) 847 __throw_regex_error(regex_constants::error_badrepeat); 848 _StateSeq __r(_M_state_store, 849 _M_state_store. 850 _M_insert_alt(_S_invalid_state_id, 851 _M_stack.top()._M_front())); 852 _M_stack.top()._M_append(__r); 853 return true; 854 } 855 if (_M_match_token(_ScannerT::_S_token_opt)) 856 { 857 if (_M_stack.empty()) 858 __throw_regex_error(regex_constants::error_badrepeat); 859 _StateSeq __r(_M_stack.top(), -1); 860 _M_stack.pop(); 861 _M_stack.push(__r); 862 return true; 863 } 864 if (_M_match_token(_ScannerT::_S_token_interval_begin)) 865 { 866 if (_M_stack.empty()) 867 __throw_regex_error(regex_constants::error_badrepeat); 868 if (!_M_match_token(_ScannerT::_S_token_dup_count)) 869 __throw_regex_error(regex_constants::error_badbrace); 870 _StateSeq __r(_M_stack.top()); 871 int __min_rep = _M_cur_int_value(10); 872 for (int __i = 1; __i < __min_rep; ++__i) 873 _M_stack.top()._M_append(__r._M_clone()); 874 if (_M_match_token(_ScannerT::_S_token_comma)) 875 if (_M_match_token(_ScannerT::_S_token_dup_count)) 876 { 877 int __n = _M_cur_int_value(10) - __min_rep; 878 if (__n < 0) 879 __throw_regex_error(regex_constants::error_badbrace); 880 for (int __i = 0; __i < __n; ++__i) 881 { 882 _StateSeq __r(_M_state_store, 883 _M_state_store. 884 _M_insert_alt(_S_invalid_state_id, 885 _M_stack.top()._M_front())); 886 _M_stack.top()._M_append(__r); 887 } 888 } 889 else 890 { 891 _StateSeq __r(_M_stack.top(), -1); 892 __r._M_push_back(__r._M_front()); 893 _M_stack.pop(); 894 _M_stack.push(__r); 895 } 896 if (!_M_match_token(_ScannerT::_S_token_interval_end)) 897 __throw_regex_error(regex_constants::error_brace); 898 return true; 899 } 900 return false; 901 } 902 903 template<typename _InIter, typename _TraitsT> 904 bool 905 _Compiler<_InIter, _TraitsT>:: 906 _M_atom() 907 { 908 typedef _CharMatcher<_InIter, _TraitsT> _CMatcher; 909 typedef _StartTagger<_InIter, _TraitsT> _Start; 910 typedef _EndTagger<_InIter, _TraitsT> _End; 911 912 if (_M_match_token(_ScannerT::_S_token_anychar)) 913 { 914 _M_stack.push(_StateSeq(_M_state_store, 915 _M_state_store._M_insert_matcher 916 (_AnyMatcher))); 917 return true; 918 } 919 if (_M_match_token(_ScannerT::_S_token_ord_char)) 920 { 921 _M_stack.push(_StateSeq(_M_state_store, 922 _M_state_store._M_insert_matcher 923 (_CMatcher(_M_cur_value[0], _M_traits)))); 924 return true; 925 } 926 if (_M_match_token(_ScannerT::_S_token_quoted_char)) 927 { 928 // note that in the ECMA grammar, this case covers backrefs. 929 _M_stack.push(_StateSeq(_M_state_store, 930 _M_state_store._M_insert_matcher 931 (_CMatcher(_M_cur_value[0], _M_traits)))); 932 return true; 933 } 934 if (_M_match_token(_ScannerT::_S_token_backref)) 935 { 936 // __m.push(_Matcher::_S_opcode_ordchar, _M_cur_value); 937 return true; 938 } 939 if (_M_match_token(_ScannerT::_S_token_subexpr_begin)) 940 { 941 int __mark = _M_state_store._M_sub_count(); 942 _StateSeq __r(_M_state_store, 943 _M_state_store. 944 _M_insert_subexpr_begin(_Start(__mark))); 945 this->_M_disjunction(); 946 if (!_M_match_token(_ScannerT::_S_token_subexpr_end)) 947 __throw_regex_error(regex_constants::error_paren); 948 if (!_M_stack.empty()) 949 { 950 __r._M_append(_M_stack.top()); 951 _M_stack.pop(); 952 } 953 __r._M_append(_M_state_store._M_insert_subexpr_end 954 (__mark, _End(__mark))); 955 _M_stack.push(__r); 956 return true; 957 } 958 return _M_bracket_expression(); 959 } 960 961 template<typename _InIter, typename _TraitsT> 962 bool 963 _Compiler<_InIter, _TraitsT>:: 964 _M_bracket_expression() 965 { 966 if (_M_match_token(_ScannerT::_S_token_bracket_begin)) 967 { 968 _RMatcherT __matcher(_M_match_token(_ScannerT::_S_token_line_begin), 969 _M_traits); 970 if (!_M_bracket_list(__matcher) 971 || !_M_match_token(_ScannerT::_S_token_bracket_end)) 972 __throw_regex_error(regex_constants::error_brack); 973 _M_stack.push(_StateSeq(_M_state_store, 974 _M_state_store._M_insert_matcher(__matcher))); 975 return true; 976 } 977 return false; 978 } 979 980 // If the dash is the last character in the bracket expression, it is not 981 // special. 982 template<typename _InIter, typename _TraitsT> 983 bool 984 _Compiler<_InIter, _TraitsT>:: 985 _M_bracket_list(_RMatcherT& __matcher) 986 { 987 if (_M_follow_list(__matcher)) 988 { 989 if (_M_match_token(_ScannerT::_S_token_dash)) 990 __matcher._M_add_char(_M_cur_value[0]); 991 return true; 992 } 993 return false; 994 } 995 996 template<typename _InIter, typename _TraitsT> 997 bool 998 _Compiler<_InIter, _TraitsT>:: 999 _M_follow_list(_RMatcherT& __matcher) 1000 { return _M_expression_term(__matcher) && _M_follow_list2(__matcher); } 1001 1002 template<typename _InIter, typename _TraitsT> 1003 bool 1004 _Compiler<_InIter, _TraitsT>:: 1005 _M_follow_list2(_RMatcherT& __matcher) 1006 { 1007 if (_M_expression_term(__matcher)) 1008 return _M_follow_list2(__matcher); 1009 return true; 1010 } 1011 1012 template<typename _InIter, typename _TraitsT> 1013 bool 1014 _Compiler<_InIter, _TraitsT>:: 1015 _M_expression_term(_RMatcherT& __matcher) 1016 { 1017 return (_M_collating_symbol(__matcher) 1018 || _M_character_class(__matcher) 1019 || _M_equivalence_class(__matcher) 1020 || (_M_start_range(__matcher) 1021 && _M_range_expression(__matcher))); 1022 } 1023 1024 template<typename _InIter, typename _TraitsT> 1025 bool 1026 _Compiler<_InIter, _TraitsT>:: 1027 _M_range_expression(_RMatcherT& __matcher) 1028 { 1029 if (!_M_collating_symbol(__matcher)) 1030 if (!_M_match_token(_ScannerT::_S_token_dash)) 1031 __throw_regex_error(regex_constants::error_range); 1032 __matcher._M_make_range(); 1033 return true; 1034 } 1035 1036 template<typename _InIter, typename _TraitsT> 1037 bool 1038 _Compiler<_InIter, _TraitsT>:: 1039 _M_start_range(_RMatcherT& __matcher) 1040 { return _M_match_token(_ScannerT::_S_token_dash); } 1041 1042 template<typename _InIter, typename _TraitsT> 1043 bool 1044 _Compiler<_InIter, _TraitsT>:: 1045 _M_collating_symbol(_RMatcherT& __matcher) 1046 { 1047 if (_M_match_token(_ScannerT::_S_token_collelem_single)) 1048 { 1049 __matcher._M_add_char(_M_cur_value[0]); 1050 return true; 1051 } 1052 if (_M_match_token(_ScannerT::_S_token_collsymbol)) 1053 { 1054 __matcher._M_add_collating_element(_M_cur_value); 1055 return true; 1056 } 1057 return false; 1058 } 1059 1060 template<typename _InIter, typename _TraitsT> 1061 bool 1062 _Compiler<_InIter, _TraitsT>:: 1063 _M_equivalence_class(_RMatcherT& __matcher) 1064 { 1065 if (_M_match_token(_ScannerT::_S_token_equiv_class_name)) 1066 { 1067 __matcher._M_add_equivalence_class(_M_cur_value); 1068 return true; 1069 } 1070 return false; 1071 } 1072 1073 template<typename _InIter, typename _TraitsT> 1074 bool 1075 _Compiler<_InIter, _TraitsT>:: 1076 _M_character_class(_RMatcherT& __matcher) 1077 { 1078 if (_M_match_token(_ScannerT::_S_token_char_class_name)) 1079 { 1080 __matcher._M_add_character_class(_M_cur_value); 1081 return true; 1082 } 1083 return false; 1084 } 1085 1086 template<typename _InIter, typename _TraitsT> 1087 int 1088 _Compiler<_InIter, _TraitsT>:: 1089 _M_cur_int_value(int __radix) 1090 { 1091 int __v = 0; 1092 for (typename _StringT::size_type __i = 0; 1093 __i < _M_cur_value.length(); ++__i) 1094 __v =__v * __radix + _M_traits.value(_M_cur_value[__i], __radix); 1095 return __v; 1096 } 1097 1098 template<typename _InIter, typename _TraitsT> 1099 _AutomatonPtr 1100 __compile(const _InIter& __b, const _InIter& __e, _TraitsT& __t, 1101 regex_constants::syntax_option_type __f) 1102 { return _AutomatonPtr(new _Nfa(_Compiler<_InIter, _TraitsT>(__b, __e, __t, 1103 __f)._M_nfa())); } 1104 1105 _GLIBCXX_END_NAMESPACE_VERSION 1106 } // namespace __regex 1107 } // namespace std 1108 1109 /* vim: set ts=8 sw=2 sts=2: */ 1110