1 // class template regex -*- C++ -*- 2 3 // Copyright (C) 2013-2020 Free Software Foundation, Inc. 4 // 5 // This file is part of the GNU ISO C++ Library. This library is free 6 // software; you can redistribute it and/or modify it under the 7 // terms of the GNU General Public License as published by the 8 // Free Software Foundation; either version 3, or (at your option) 9 // any later version. 10 11 // This library is distributed in the hope that it will be useful, 12 // but WITHOUT ANY WARRANTY; without even the implied warranty of 13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 // GNU General Public License for more details. 15 16 // Under Section 7 of GPL version 3, you are granted additional 17 // permissions described in the GCC Runtime Library Exception, version 18 // 3.1, as published by the Free Software Foundation. 19 20 // You should have received a copy of the GNU General Public License and 21 // a copy of the GCC Runtime Library Exception along with this program; 22 // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see 23 // <http://www.gnu.org/licenses/>. 24 25 /** 26 * @file bits/regex_scanner.tcc 27 * This is an internal header file, included by other library headers. 28 * Do not attempt to use it directly. @headername{regex} 29 */ 30 31 // FIXME make comments doxygen format. 32 33 // N3376 specified 6 regex styles: ECMAScript, basic, extended, grep, egrep 34 // and awk 35 // 1) grep is basic except '\n' is treated as '|' 36 // 2) egrep is extended except '\n' is treated as '|' 37 // 3) awk is extended except special escaping rules, and there's no 38 // back-reference. 39 // 40 // References: 41 // 42 // ECMAScript: ECMA-262 15.10 43 // 44 // basic, extended: 45 // http://pubs.opengroup.org/onlinepubs/009695399/basedefs/xbd_chap09.html 46 // 47 // awk: http://pubs.opengroup.org/onlinepubs/000095399/utilities/awk.html 48 49 namespace std _GLIBCXX_VISIBILITY(default) 50 { 51 _GLIBCXX_BEGIN_NAMESPACE_VERSION 52 53 namespace __detail 54 { 55 template<typename _CharT> 56 _Scanner<_CharT>:: _Scanner(typename _Scanner::_IterT __begin,typename _Scanner::_IterT __end,_FlagT __flags,std::locale __loc)57 _Scanner(typename _Scanner::_IterT __begin, 58 typename _Scanner::_IterT __end, 59 _FlagT __flags, std::locale __loc) 60 : _ScannerBase(__flags), 61 _M_current(__begin), _M_end(__end), 62 _M_ctype(std::use_facet<_CtypeT>(__loc)), 63 _M_eat_escape(_M_is_ecma() 64 ? &_Scanner::_M_eat_escape_ecma 65 : &_Scanner::_M_eat_escape_posix) 66 { _M_advance(); } 67 68 template<typename _CharT> 69 void 70 _Scanner<_CharT>:: _M_advance()71 _M_advance() 72 { 73 if (_M_current == _M_end) 74 { 75 _M_token = _S_token_eof; 76 return; 77 } 78 79 if (_M_state == _S_state_normal) 80 _M_scan_normal(); 81 else if (_M_state == _S_state_in_bracket) 82 _M_scan_in_bracket(); 83 else if (_M_state == _S_state_in_brace) 84 _M_scan_in_brace(); 85 else 86 { 87 __glibcxx_assert(false); 88 } 89 } 90 91 // Differences between styles: 92 // 1) "\(", "\)", "\{" in basic. It's not escaping. 93 // 2) "(?:", "(?=", "(?!" in ECMAScript. 94 template<typename _CharT> 95 void 96 _Scanner<_CharT>:: _M_scan_normal()97 _M_scan_normal() 98 { 99 auto __c = *_M_current++; 100 101 if (std::strchr(_M_spec_char, _M_ctype.narrow(__c, ' ')) == nullptr) 102 { 103 _M_token = _S_token_ord_char; 104 _M_value.assign(1, __c); 105 return; 106 } 107 if (__c == '\\') 108 { 109 if (_M_current == _M_end) 110 __throw_regex_error( 111 regex_constants::error_escape, 112 "Unexpected end of regex when escaping."); 113 114 if (!_M_is_basic() 115 || (*_M_current != '(' 116 && *_M_current != ')' 117 && *_M_current != '{')) 118 { 119 (this->*_M_eat_escape)(); 120 return; 121 } 122 __c = *_M_current++; 123 } 124 if (__c == '(') 125 { 126 if (_M_is_ecma() && *_M_current == '?') 127 { 128 if (++_M_current == _M_end) 129 __throw_regex_error( 130 regex_constants::error_paren, 131 "Unexpected end of regex when in an open parenthesis."); 132 133 if (*_M_current == ':') 134 { 135 ++_M_current; 136 _M_token = _S_token_subexpr_no_group_begin; 137 } 138 else if (*_M_current == '=') 139 { 140 ++_M_current; 141 _M_token = _S_token_subexpr_lookahead_begin; 142 _M_value.assign(1, 'p'); 143 } 144 else if (*_M_current == '!') 145 { 146 ++_M_current; 147 _M_token = _S_token_subexpr_lookahead_begin; 148 _M_value.assign(1, 'n'); 149 } 150 else 151 __throw_regex_error( 152 regex_constants::error_paren, 153 "Invalid special open parenthesis."); 154 } 155 else if (_M_flags & regex_constants::nosubs) 156 _M_token = _S_token_subexpr_no_group_begin; 157 else 158 _M_token = _S_token_subexpr_begin; 159 } 160 else if (__c == ')') 161 _M_token = _S_token_subexpr_end; 162 else if (__c == '[') 163 { 164 _M_state = _S_state_in_bracket; 165 _M_at_bracket_start = true; 166 if (_M_current != _M_end && *_M_current == '^') 167 { 168 _M_token = _S_token_bracket_neg_begin; 169 ++_M_current; 170 } 171 else 172 _M_token = _S_token_bracket_begin; 173 } 174 else if (__c == '{') 175 { 176 _M_state = _S_state_in_brace; 177 _M_token = _S_token_interval_begin; 178 } 179 else if (__builtin_expect(__c == _CharT(0), false)) 180 { 181 if (!_M_is_ecma()) 182 { 183 __throw_regex_error(regex_constants::_S_null, 184 "Unexpected null character in regular expression"); 185 } 186 _M_token = _S_token_ord_char; 187 _M_value.assign(1, __c); 188 } 189 else if (__c != ']' && __c != '}') 190 { 191 auto __it = _M_token_tbl; 192 auto __narrowc = _M_ctype.narrow(__c, '\0'); 193 for (; __it->first != '\0'; ++__it) 194 if (__it->first == __narrowc) 195 { 196 _M_token = __it->second; 197 return; 198 } 199 __glibcxx_assert(false); 200 } 201 else 202 { 203 _M_token = _S_token_ord_char; 204 _M_value.assign(1, __c); 205 } 206 } 207 208 // Differences between styles: 209 // 1) different semantics of "[]" and "[^]". 210 // 2) Escaping in bracket expr. 211 template<typename _CharT> 212 void 213 _Scanner<_CharT>:: _M_scan_in_bracket()214 _M_scan_in_bracket() 215 { 216 if (_M_current == _M_end) 217 __throw_regex_error( 218 regex_constants::error_brack, 219 "Unexpected end of regex when in bracket expression."); 220 221 auto __c = *_M_current++; 222 223 if (__c == '-') 224 _M_token = _S_token_bracket_dash; 225 else if (__c == '[') 226 { 227 if (_M_current == _M_end) 228 __throw_regex_error(regex_constants::error_brack, 229 "Unexpected character class open bracket."); 230 231 if (*_M_current == '.') 232 { 233 _M_token = _S_token_collsymbol; 234 _M_eat_class(*_M_current++); 235 } 236 else if (*_M_current == ':') 237 { 238 _M_token = _S_token_char_class_name; 239 _M_eat_class(*_M_current++); 240 } 241 else if (*_M_current == '=') 242 { 243 _M_token = _S_token_equiv_class_name; 244 _M_eat_class(*_M_current++); 245 } 246 else 247 { 248 _M_token = _S_token_ord_char; 249 _M_value.assign(1, __c); 250 } 251 } 252 // In POSIX, when encountering "[]" or "[^]", the ']' is interpreted 253 // literally. So "[]]" and "[^]]" are valid regexes. See the testcases 254 // `*/empty_range.cc`. 255 else if (__c == ']' && (_M_is_ecma() || !_M_at_bracket_start)) 256 { 257 _M_token = _S_token_bracket_end; 258 _M_state = _S_state_normal; 259 } 260 // ECMAScript and awk permits escaping in bracket. 261 else if (__c == '\\' && (_M_is_ecma() || _M_is_awk())) 262 (this->*_M_eat_escape)(); 263 else 264 { 265 _M_token = _S_token_ord_char; 266 _M_value.assign(1, __c); 267 } 268 _M_at_bracket_start = false; 269 } 270 271 // Differences between styles: 272 // 1) "\}" in basic style. 273 template<typename _CharT> 274 void 275 _Scanner<_CharT>:: _M_scan_in_brace()276 _M_scan_in_brace() 277 { 278 if (_M_current == _M_end) 279 __throw_regex_error( 280 regex_constants::error_brace, 281 "Unexpected end of regex when in brace expression."); 282 283 auto __c = *_M_current++; 284 285 if (_M_ctype.is(_CtypeT::digit, __c)) 286 { 287 _M_token = _S_token_dup_count; 288 _M_value.assign(1, __c); 289 while (_M_current != _M_end 290 && _M_ctype.is(_CtypeT::digit, *_M_current)) 291 _M_value += *_M_current++; 292 } 293 else if (__c == ',') 294 _M_token = _S_token_comma; 295 // basic use \}. 296 else if (_M_is_basic()) 297 { 298 if (__c == '\\' && _M_current != _M_end && *_M_current == '}') 299 { 300 _M_state = _S_state_normal; 301 _M_token = _S_token_interval_end; 302 ++_M_current; 303 } 304 else 305 __throw_regex_error(regex_constants::error_badbrace, 306 "Unexpected character in brace expression."); 307 } 308 else if (__c == '}') 309 { 310 _M_state = _S_state_normal; 311 _M_token = _S_token_interval_end; 312 } 313 else 314 __throw_regex_error(regex_constants::error_badbrace, 315 "Unexpected character in brace expression."); 316 } 317 318 template<typename _CharT> 319 void 320 _Scanner<_CharT>:: _M_eat_escape_ecma()321 _M_eat_escape_ecma() 322 { 323 if (_M_current == _M_end) 324 __throw_regex_error(regex_constants::error_escape, 325 "Unexpected end of regex when escaping."); 326 327 auto __c = *_M_current++; 328 auto __pos = _M_find_escape(_M_ctype.narrow(__c, '\0')); 329 330 if (__pos != nullptr && (__c != 'b' || _M_state == _S_state_in_bracket)) 331 { 332 _M_token = _S_token_ord_char; 333 _M_value.assign(1, *__pos); 334 } 335 else if (__c == 'b') 336 { 337 _M_token = _S_token_word_bound; 338 _M_value.assign(1, 'p'); 339 } 340 else if (__c == 'B') 341 { 342 _M_token = _S_token_word_bound; 343 _M_value.assign(1, 'n'); 344 } 345 // N3376 28.13 346 else if (__c == 'd' 347 || __c == 'D' 348 || __c == 's' 349 || __c == 'S' 350 || __c == 'w' 351 || __c == 'W') 352 { 353 _M_token = _S_token_quoted_class; 354 _M_value.assign(1, __c); 355 } 356 else if (__c == 'c') 357 { 358 if (_M_current == _M_end) 359 __throw_regex_error( 360 regex_constants::error_escape, 361 "Unexpected end of regex when reading control code."); 362 _M_token = _S_token_ord_char; 363 _M_value.assign(1, *_M_current++); 364 } 365 else if (__c == 'x' || __c == 'u') 366 { 367 _M_value.erase(); 368 for (int __i = 0; __i < (__c == 'x' ? 2 : 4); __i++) 369 { 370 if (_M_current == _M_end 371 || !_M_ctype.is(_CtypeT::xdigit, *_M_current)) 372 __throw_regex_error( 373 regex_constants::error_escape, 374 "Unexpected end of regex when ascii character."); 375 _M_value += *_M_current++; 376 } 377 _M_token = _S_token_hex_num; 378 } 379 // ECMAScript recognizes multi-digit back-references. 380 else if (_M_ctype.is(_CtypeT::digit, __c)) 381 { 382 _M_value.assign(1, __c); 383 while (_M_current != _M_end 384 && _M_ctype.is(_CtypeT::digit, *_M_current)) 385 _M_value += *_M_current++; 386 _M_token = _S_token_backref; 387 } 388 else 389 { 390 _M_token = _S_token_ord_char; 391 _M_value.assign(1, __c); 392 } 393 } 394 395 // Differences between styles: 396 // 1) Extended doesn't support backref, but basic does. 397 template<typename _CharT> 398 void 399 _Scanner<_CharT>:: _M_eat_escape_posix()400 _M_eat_escape_posix() 401 { 402 if (_M_current == _M_end) 403 __throw_regex_error(regex_constants::error_escape, 404 "Unexpected end of regex when escaping."); 405 406 auto __c = *_M_current; 407 auto __pos = std::strchr(_M_spec_char, _M_ctype.narrow(__c, '\0')); 408 409 if (__pos != nullptr && *__pos != '\0') 410 { 411 _M_token = _S_token_ord_char; 412 _M_value.assign(1, __c); 413 } 414 // We MUST judge awk before handling backrefs. There's no backref in awk. 415 else if (_M_is_awk()) 416 { 417 _M_eat_escape_awk(); 418 return; 419 } 420 else if (_M_is_basic() && _M_ctype.is(_CtypeT::digit, __c) && __c != '0') 421 { 422 _M_token = _S_token_backref; 423 _M_value.assign(1, __c); 424 } 425 else 426 { 427 #ifdef __STRICT_ANSI__ 428 // POSIX says it is undefined to escape ordinary characters 429 __throw_regex_error(regex_constants::error_escape, 430 "Unexpected escape character."); 431 #else 432 _M_token = _S_token_ord_char; 433 _M_value.assign(1, __c); 434 #endif 435 } 436 ++_M_current; 437 } 438 439 template<typename _CharT> 440 void 441 _Scanner<_CharT>:: _M_eat_escape_awk()442 _M_eat_escape_awk() 443 { 444 auto __c = *_M_current++; 445 auto __pos = _M_find_escape(_M_ctype.narrow(__c, '\0')); 446 447 if (__pos != nullptr) 448 { 449 _M_token = _S_token_ord_char; 450 _M_value.assign(1, *__pos); 451 } 452 // \ddd for oct representation 453 else if (_M_ctype.is(_CtypeT::digit, __c) 454 && __c != '8' 455 && __c != '9') 456 { 457 _M_value.assign(1, __c); 458 for (int __i = 0; 459 __i < 2 460 && _M_current != _M_end 461 && _M_ctype.is(_CtypeT::digit, *_M_current) 462 && *_M_current != '8' 463 && *_M_current != '9'; 464 __i++) 465 _M_value += *_M_current++; 466 _M_token = _S_token_oct_num; 467 return; 468 } 469 else 470 __throw_regex_error(regex_constants::error_escape, 471 "Unexpected escape character."); 472 } 473 474 // Eats a character class or throws an exception. 475 // __ch could be ':', '.' or '=', _M_current is the char after ']' when 476 // returning. 477 template<typename _CharT> 478 void 479 _Scanner<_CharT>:: _M_eat_class(char __ch)480 _M_eat_class(char __ch) 481 { 482 for (_M_value.clear(); _M_current != _M_end && *_M_current != __ch;) 483 _M_value += *_M_current++; 484 if (_M_current == _M_end 485 || *_M_current++ != __ch 486 || _M_current == _M_end // skip __ch 487 || *_M_current++ != ']') // skip ']' 488 { 489 if (__ch == ':') 490 __throw_regex_error(regex_constants::error_ctype, 491 "Unexpected end of character class."); 492 else 493 __throw_regex_error(regex_constants::error_collate, 494 "Unexpected end of character class."); 495 } 496 } 497 498 #ifdef _GLIBCXX_DEBUG 499 template<typename _CharT> 500 std::ostream& 501 _Scanner<_CharT>:: _M_print(std::ostream & ostr)502 _M_print(std::ostream& ostr) 503 { 504 switch (_M_token) 505 { 506 case _S_token_anychar: 507 ostr << "any-character\n"; 508 break; 509 case _S_token_backref: 510 ostr << "backref\n"; 511 break; 512 case _S_token_bracket_begin: 513 ostr << "bracket-begin\n"; 514 break; 515 case _S_token_bracket_neg_begin: 516 ostr << "bracket-neg-begin\n"; 517 break; 518 case _S_token_bracket_end: 519 ostr << "bracket-end\n"; 520 break; 521 case _S_token_char_class_name: 522 ostr << "char-class-name \"" << _M_value << "\"\n"; 523 break; 524 case _S_token_closure0: 525 ostr << "closure0\n"; 526 break; 527 case _S_token_closure1: 528 ostr << "closure1\n"; 529 break; 530 case _S_token_collsymbol: 531 ostr << "collsymbol \"" << _M_value << "\"\n"; 532 break; 533 case _S_token_comma: 534 ostr << "comma\n"; 535 break; 536 case _S_token_dup_count: 537 ostr << "dup count: " << _M_value << "\n"; 538 break; 539 case _S_token_eof: 540 ostr << "EOF\n"; 541 break; 542 case _S_token_equiv_class_name: 543 ostr << "equiv-class-name \"" << _M_value << "\"\n"; 544 break; 545 case _S_token_interval_begin: 546 ostr << "interval begin\n"; 547 break; 548 case _S_token_interval_end: 549 ostr << "interval end\n"; 550 break; 551 case _S_token_line_begin: 552 ostr << "line begin\n"; 553 break; 554 case _S_token_line_end: 555 ostr << "line end\n"; 556 break; 557 case _S_token_opt: 558 ostr << "opt\n"; 559 break; 560 case _S_token_or: 561 ostr << "or\n"; 562 break; 563 case _S_token_ord_char: 564 ostr << "ordinary character: \"" << _M_value << "\"\n"; 565 break; 566 case _S_token_subexpr_begin: 567 ostr << "subexpr begin\n"; 568 break; 569 case _S_token_subexpr_no_group_begin: 570 ostr << "no grouping subexpr begin\n"; 571 break; 572 case _S_token_subexpr_lookahead_begin: 573 ostr << "lookahead subexpr begin\n"; 574 break; 575 case _S_token_subexpr_end: 576 ostr << "subexpr end\n"; 577 break; 578 case _S_token_unknown: 579 ostr << "-- unknown token --\n"; 580 break; 581 case _S_token_oct_num: 582 ostr << "oct number " << _M_value << "\n"; 583 break; 584 case _S_token_hex_num: 585 ostr << "hex number " << _M_value << "\n"; 586 break; 587 case _S_token_quoted_class: 588 ostr << "quoted class " << "\\" << _M_value << "\n"; 589 break; 590 default: 591 _GLIBCXX_DEBUG_ASSERT(false); 592 } 593 return ostr; 594 } 595 #endif 596 597 } // namespace __detail 598 _GLIBCXX_END_NAMESPACE_VERSION 599 } // namespace 600