1 // rules.hpp 2 // Copyright (c) 2007-2009 Ben Hanson (http://www.benhanson.net/) 3 // 4 // Distributed under the Boost Software License, Version 1.0. (See accompanying 5 // file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) 6 #ifndef BOOST_SPIRIT_SUPPORT_DETAIL_LEXER_RULES_HPP 7 #define BOOST_SPIRIT_SUPPORT_DETAIL_LEXER_RULES_HPP 8 9 #include "consts.hpp" 10 #include <deque> 11 #include <locale> 12 #include <map> 13 #include "runtime_error.hpp" 14 #include <set> 15 #include "size_t.hpp" 16 #include <sstream> 17 #include <string> 18 #include <vector> 19 20 namespace boost 21 { 22 namespace lexer 23 { 24 namespace detail 25 { 26 // return name of initial state 27 template <typename CharT> 28 struct strings; 29 30 template <> 31 struct strings<char> equivsetboost::lexer::detail::equivset32 { 33 static const char *initial () 34 { 35 return "INITIAL"; 36 } 37 38 static const char *dot () 39 { 40 return "."; 41 } 42 43 static const char *all_states () 44 { 45 return "*"; 46 } 47 48 static const char *char_name () 49 { 50 return "char"; 51 } 52 53 static const char *char_prefix () 54 { 55 return ""; 56 } 57 }; intersectboost::lexer::detail::equivset58 59 template <> 60 struct strings<wchar_t> 61 { 62 static const wchar_t *initial () 63 { 64 return L"INITIAL"; 65 } 66 67 static const wchar_t *dot () 68 { 69 return L"."; 70 } 71 72 static const wchar_t *all_states () 73 { 74 return L"*"; 75 } 76 77 static const char *char_name () 78 { 79 return "wchar_t"; 80 } 81 82 static const char *char_prefix () 83 { 84 return "L"; 85 } 86 }; 87 } 88 89 template<typename CharT> 90 class basic_rules 91 { 92 public: 93 typedef std::vector<std::size_t> id_vector; 94 typedef std::deque<id_vector> id_vector_deque; 95 typedef std::basic_string<CharT> string; 96 typedef std::deque<string> string_deque; 97 typedef std::deque<string_deque> string_deque_deque; 98 typedef std::set<string> string_set; 99 typedef std::pair<string, string> string_pair; 100 typedef std::deque<string_pair> string_pair_deque; 101 typedef std::map<string, std::size_t> string_size_t_map; 102 typedef std::pair<string, std::size_t> string_size_t_pair; 103 104 basic_rules (const regex_flags flags_ = dot_not_newline, intersect_indexesboost::lexer::detail::equivset105 std::size_t (*counter_ptr_) () = 0) : 106 _flags (flags_), 107 _counter (0), 108 _counter_ptr (counter_ptr_) 109 { 110 add_state (initial ()); 111 } 112 113 void clear () 114 { 115 _statemap.clear (); 116 _macrodeque.clear (); 117 _macroset.clear (); 118 _regexes.clear (); 119 _ids.clear (); 120 _unique_ids.clear (); 121 _states.clear (); 122 _flags = dot_not_newline; 123 _locale = std::locale (); 124 add_state (initial ()); 125 } 126 127 void clear (const CharT *state_name_) 128 { 129 std::size_t state_ = state (state_name_); 130 131 if (state_ != npos) 132 { 133 _regexes[state_].clear (); 134 _ids[state_].clear (); 135 _unique_ids[state_].clear (); 136 _states[state_].clear (); 137 } 138 } 139 140 void flags (const regex_flags flags_) 141 { 142 _flags = flags_; 143 } 144 145 regex_flags flags () const 146 { 147 return _flags; 148 } 149 150 std::size_t next_unique_id () 151 { 152 return _counter_ptr ? _counter_ptr () : _counter++; 153 } 154 155 std::locale imbue (std::locale &locale_) 156 { 157 std::locale loc_ = _locale; 158 159 _locale = locale_; 160 return loc_; 161 } 162 163 const std::locale &locale () const 164 { 165 return _locale; 166 } 167 168 std::size_t state (const CharT *name_) const 169 { 170 std::size_t state_ = npos; 171 typename string_size_t_map::const_iterator iter_ = 172 _statemap.find (name_); 173 174 if (iter_ != _statemap.end ()) 175 { 176 state_ = iter_->second; 177 } 178 179 return state_; 180 } 181 182 const CharT *state (const std::size_t index_) const 183 { 184 if (index_ == 0) 185 { 186 return initial (); 187 } 188 else 189 { 190 const std::size_t vec_index_ = index_ - 1; 191 192 if (vec_index_ > _lexer_state_names.size () - 1) 193 { 194 return 0; 195 } 196 else 197 { 198 return _lexer_state_names[vec_index_].c_str (); 199 } 200 } 201 } 202 203 std::size_t add_state (const CharT *name_) 204 { 205 validate (name_); 206 207 if (_statemap.insert (string_size_t_pair (name_, 208 _statemap.size ())).second) 209 { 210 _regexes.push_back (string_deque ()); 211 _ids.push_back (id_vector ()); 212 _unique_ids.push_back (id_vector ()); 213 _states.push_back (id_vector ()); 214 215 if (string (name_) != initial ()) 216 { 217 _lexer_state_names.push_back (name_); 218 } 219 } 220 221 // Initial is not stored, so no need to - 1. 222 return _lexer_state_names.size (); 223 } 224 225 void add_macro (const CharT *name_, const CharT *regex_) 226 { 227 add_macro (name_, string (regex_)); 228 } 229 230 void add_macro (const CharT *name_, const CharT *regex_start_, 231 const CharT *regex_end_) 232 { 233 add_macro (name_, string (regex_start_, regex_end_)); 234 } 235 236 void add_macro (const CharT *name_, const string ®ex_) 237 { 238 validate (name_); 239 240 typename string_set::const_iterator iter_ = _macroset.find (name_); 241 242 if (iter_ == _macroset.end ()) 243 { 244 _macrodeque.push_back (string_pair (name_, regex_)); 245 _macroset.insert (name_); 246 } 247 else 248 { 249 std::basic_stringstream<CharT> ss_; 250 std::ostringstream os_; 251 252 os_ << "Attempt to redefine MACRO '"; 253 254 while (*name_) 255 { 256 os_ << ss_.narrow (*name_++, static_cast<CharT> (' ')); 257 } 258 259 os_ << "'."; 260 throw runtime_error (os_.str ()); 261 } 262 } 263 264 void add_macros (const basic_rules<CharT> &rules_) 265 { 266 const string_pair_deque ¯os_ = rules_.macrodeque (); 267 typename string_pair_deque::const_iterator macro_iter_ = 268 macros_.begin (); 269 typename string_pair_deque::const_iterator macro_end_ = 270 macros_.end (); 271 272 for (; macro_iter_ != macro_end_; ++macro_iter_) 273 { 274 add_macro (macro_iter_->first.c_str (), 275 macro_iter_->second.c_str ()); 276 } 277 } 278 279 void merge_macros (const basic_rules<CharT> &rules_) 280 { 281 const string_pair_deque ¯os_ = rules_.macrodeque (); 282 typename string_pair_deque::const_iterator macro_iter_ = 283 macros_.begin (); 284 typename string_pair_deque::const_iterator macro_end_ = 285 macros_.end (); 286 typename string_set::const_iterator macro_dest_iter_; 287 typename string_set::const_iterator macro_dest_end_ = _macroset.end (); 288 289 for (; macro_iter_ != macro_end_; ++macro_iter_) 290 { 291 macro_dest_iter_ = _macroset.find (macro_iter_->first); 292 293 if (macro_dest_iter_ == macro_dest_end_) 294 { 295 add_macro (macro_iter_->first.c_str (), 296 macro_iter_->second.c_str ()); 297 } 298 } 299 } 300 301 std::size_t add (const CharT *regex_, const std::size_t id_) 302 { 303 return add (string (regex_), id_); 304 } 305 306 std::size_t add (const CharT *regex_start_, const CharT *regex_end_, 307 const std::size_t id_) 308 { 309 return add (string (regex_start_, regex_end_), id_); 310 } 311 312 std::size_t add (const string ®ex_, const std::size_t id_) 313 { 314 const std::size_t counter_ = next_unique_id (); 315 316 check_for_invalid_id (id_); 317 _regexes.front ().push_back (regex_); 318 _ids.front ().push_back (id_); 319 _unique_ids.front ().push_back (counter_); 320 _states.front ().push_back (0); 321 return counter_; 322 } 323 324 std::size_t add (const CharT *curr_state_, const CharT *regex_, 325 const CharT *new_state_) 326 { 327 return add (curr_state_, string (regex_), new_state_); 328 } 329 330 std::size_t add (const CharT *curr_state_, const CharT *regex_start_, 331 const CharT *regex_end_, const CharT *new_state_) 332 { 333 return add (curr_state_, string (regex_start_, regex_end_), 334 new_state_); 335 } 336 337 std::size_t add (const CharT *curr_state_, const string ®ex_, 338 const CharT *new_state_) 339 { 340 return add (curr_state_, regex_, 0, new_state_, false); 341 } 342 343 std::size_t add (const CharT *curr_state_, const CharT *regex_, 344 const std::size_t id_, const CharT *new_state_) 345 { 346 return add (curr_state_, string (regex_), id_, new_state_); 347 } 348 349 std::size_t add (const CharT *curr_state_, const CharT *regex_start_, 350 const CharT *regex_end_, const std::size_t id_, 351 const CharT *new_state_) 352 { 353 return add (curr_state_, string (regex_start_, regex_end_), id_, 354 new_state_); 355 } 356 357 std::size_t add (const CharT *curr_state_, const string ®ex_, 358 const std::size_t id_, const CharT *new_state_) 359 { 360 return add (curr_state_, regex_, id_, new_state_, true); 361 } 362 363 void add (const CharT *source_, const basic_rules<CharT> &rules_, 364 const CharT *dest_, const CharT *to_ = detail::strings<CharT>::dot ()) 365 { 366 const bool star_ = *source_ == '*' && *(source_ + 1) == 0; 367 const bool dest_dot_ = *dest_ == '.' && *(dest_ + 1) == 0; 368 const bool to_dot_ = *to_ == '.' && *(to_ + 1) == 0; 369 std::size_t state_ = 0; 370 const string_deque_deque &all_regexes_ = rules_.regexes (); 371 const id_vector_deque &all_ids_ = rules_.ids (); 372 const id_vector_deque &all_unique_ids_ = rules_.unique_ids (); 373 const id_vector_deque &all_states_ = rules_.states (); 374 typename string_deque::const_iterator regex_iter_; 375 typename string_deque::const_iterator regex_end_; 376 typename id_vector::const_iterator id_iter_; 377 typename id_vector::const_iterator uid_iter_; 378 typename id_vector::const_iterator state_iter_; 379 380 if (star_) 381 { 382 typename string_deque_deque::const_iterator all_regexes_iter_ = 383 all_regexes_.begin (); 384 typename string_deque_deque::const_iterator all_regexes_end_ = 385 all_regexes_.end (); 386 typename id_vector_deque::const_iterator all_ids_iter_ = 387 all_ids_.begin (); 388 typename id_vector_deque::const_iterator all_uids_iter_ = 389 all_unique_ids_.begin (); 390 typename id_vector_deque::const_iterator all_states_iter_ = 391 all_states_.begin (); 392 393 for (; all_regexes_iter_ != all_regexes_end_; 394 ++state_, ++all_regexes_iter_, ++all_ids_iter_, 395 ++all_uids_iter_, ++all_states_iter_) 396 { 397 regex_iter_ = all_regexes_iter_->begin (); 398 regex_end_ = all_regexes_iter_->end (); 399 id_iter_ = all_ids_iter_->begin (); 400 uid_iter_ = all_uids_iter_->begin (); 401 state_iter_ = all_states_iter_->begin (); 402 403 for (; regex_iter_ != regex_end_; ++regex_iter_, ++id_iter_, 404 ++uid_iter_, ++state_iter_) 405 { 406 // If ..._dot_ then lookup state name from rules_; otherwise 407 // pass name through. 408 add (dest_dot_ ? rules_.state (state_) : dest_, *regex_iter_, 409 *id_iter_, to_dot_ ? rules_.state (*state_iter_) : to_, true, 410 *uid_iter_); 411 } 412 } 413 } 414 else 415 { 416 const CharT *start_ = source_; 417 string state_name_; 418 419 while (*source_) 420 { 421 while (*source_ && *source_ != ',') 422 { 423 ++source_; 424 } 425 426 state_name_.assign (start_, source_); 427 428 if (*source_) 429 { 430 ++source_; 431 start_ = source_; 432 } 433 434 state_ = rules_.state (state_name_.c_str ()); 435 436 if (state_ == npos) 437 { 438 std::basic_stringstream<CharT> ss_; 439 std::ostringstream os_; 440 441 os_ << "Unknown state name '"; 442 source_ = state_name_.c_str (); 443 444 while (*source_) 445 { 446 os_ << ss_.narrow (*source_++, ' '); 447 } 448 449 os_ << "'."; 450 throw runtime_error (os_.str ()); 451 } 452 453 regex_iter_ = all_regexes_[state_].begin (); 454 regex_end_ = all_regexes_[state_].end (); 455 id_iter_ = all_ids_[state_].begin (); 456 uid_iter_ = all_unique_ids_[state_].begin (); 457 state_iter_ = all_states_[state_].begin (); 458 459 for (; regex_iter_ != regex_end_; ++regex_iter_, ++id_iter_, 460 ++uid_iter_, ++state_iter_) 461 { 462 // If ..._dot_ then lookup state name from rules_; otherwise 463 // pass name through. 464 add (dest_dot_ ? state_name_.c_str () : dest_, *regex_iter_, 465 *id_iter_, to_dot_ ? rules_.state (*state_iter_) : to_, true, 466 *uid_iter_); 467 } 468 } 469 } 470 } 471 /* 472 void add (const CharT *curr_state_, const basic_rules<CharT> &rules_) 473 { 474 const string_deque_deque ®exes_ = rules_.regexes (); 475 const id_vector_deque &ids_ = rules_.ids (); 476 const id_vector_deque &unique_ids_ = rules_.unique_ids (); 477 typename string_deque_deque::const_iterator state_regex_iter_ = 478 regexes_.begin (); 479 typename string_deque_deque::const_iterator state_regex_end_ = 480 regexes_.end (); 481 typename id_vector_deque::const_iterator state_id_iter_ = 482 ids_.begin (); 483 typename id_vector_deque::const_iterator state_uid_iter_ = 484 unique_ids_.begin (); 485 typename string_deque::const_iterator regex_iter_; 486 typename string_deque::const_iterator regex_end_; 487 typename id_vector::const_iterator id_iter_; 488 typename id_vector::const_iterator uid_iter_; 489 490 for (; state_regex_iter_ != state_regex_end_; ++state_regex_iter_) 491 { 492 regex_iter_ = state_regex_iter_->begin (); 493 regex_end_ = state_regex_iter_->end (); 494 id_iter_ = state_id_iter_->begin (); 495 uid_iter_ = state_uid_iter_->begin (); 496 497 for (; regex_iter_ != regex_end_; ++regex_iter_, ++id_iter_, 498 ++uid_iter_) 499 { 500 add (curr_state_, *regex_iter_, *id_iter_, curr_state_, true, 501 *uid_iter_); 502 } 503 } 504 } 505 */ 506 const string_size_t_map &statemap () const 507 { 508 return _statemap; 509 } 510 511 const string_pair_deque ¯odeque () const 512 { 513 return _macrodeque; 514 } 515 516 const string_deque_deque ®exes () const 517 { 518 return _regexes; 519 } 520 521 const id_vector_deque &ids () const 522 { 523 return _ids; 524 } 525 526 const id_vector_deque &unique_ids () const 527 { 528 return _unique_ids; 529 } 530 531 const id_vector_deque &states () const 532 { 533 return _states; 534 } 535 536 bool empty () const 537 { 538 typename string_deque_deque::const_iterator iter_ = _regexes.begin (); 539 typename string_deque_deque::const_iterator end_ = _regexes.end (); 540 bool empty_ = true; 541 542 for (; iter_ != end_; ++iter_) 543 { 544 if (!iter_->empty ()) 545 { 546 empty_ = false; 547 break; 548 } 549 } 550 551 return empty_; 552 } 553 554 static const CharT *initial () 555 { 556 return detail::strings<CharT>::initial (); 557 } 558 559 static const CharT *all_states () 560 { 561 return detail::strings<CharT>::all_states (); 562 } 563 564 static const CharT *dot () 565 { 566 return detail::strings<CharT>::dot (); 567 } 568 569 private: 570 string_size_t_map _statemap; 571 string_pair_deque _macrodeque; 572 string_set _macroset; 573 string_deque_deque _regexes; 574 id_vector_deque _ids; 575 id_vector_deque _unique_ids; 576 id_vector_deque _states; 577 regex_flags _flags; 578 std::size_t _counter; 579 std::size_t (*_counter_ptr) (); 580 std::locale _locale; 581 string_deque _lexer_state_names; 582 583 std::size_t add (const CharT *curr_state_, const string ®ex_, 584 const std::size_t id_, const CharT *new_state_, const bool check_, 585 const std::size_t uid_ = npos) 586 { 587 const bool star_ = *curr_state_ == '*' && *(curr_state_ + 1) == 0; 588 const bool dot_ = *new_state_ == '.' && *(new_state_ + 1) == 0; 589 590 if (check_) 591 { 592 check_for_invalid_id (id_); 593 } 594 595 if (!dot_) 596 { 597 validate (new_state_); 598 } 599 600 std::size_t new_ = string::npos; 601 typename string_size_t_map::const_iterator iter_; 602 typename string_size_t_map::const_iterator end_ = _statemap.end (); 603 id_vector states_; 604 605 if (!dot_) 606 { 607 iter_ = _statemap.find (new_state_); 608 609 if (iter_ == end_) 610 { 611 std::basic_stringstream<CharT> ss_; 612 std::ostringstream os_; 613 614 os_ << "Unknown state name '"; 615 616 while (*new_state_) 617 { 618 os_ << ss_.narrow (*new_state_++, ' '); 619 } 620 621 os_ << "'."; 622 throw runtime_error (os_.str ()); 623 } 624 625 new_ = iter_->second; 626 } 627 628 if (star_) 629 { 630 const std::size_t size_ = _statemap.size (); 631 632 for (std::size_t i_ = 0; i_ < size_; ++i_) 633 { 634 states_.push_back (i_); 635 } 636 } 637 else 638 { 639 const CharT *start_ = curr_state_; 640 string state_; 641 642 while (*curr_state_) 643 { 644 while (*curr_state_ && *curr_state_ != ',') 645 { 646 ++curr_state_; 647 } 648 649 state_.assign (start_, curr_state_); 650 651 if (*curr_state_) 652 { 653 ++curr_state_; 654 start_ = curr_state_; 655 } 656 657 validate (state_.c_str ()); 658 iter_ = _statemap.find (state_.c_str ()); 659 660 if (iter_ == end_) 661 { 662 std::basic_stringstream<CharT> ss_; 663 std::ostringstream os_; 664 665 os_ << "Unknown state name '"; 666 curr_state_ = state_.c_str (); 667 668 while (*curr_state_) 669 { 670 os_ << ss_.narrow (*curr_state_++, ' '); 671 } 672 673 os_ << "'."; 674 throw runtime_error (os_.str ()); 675 } 676 677 states_.push_back (iter_->second); 678 } 679 } 680 681 std::size_t first_counter_ = npos; 682 683 for (std::size_t i_ = 0, size_ = states_.size (); i_ < size_; ++i_) 684 { 685 const std::size_t curr_ = states_[i_]; 686 687 _regexes[curr_].push_back (regex_); 688 _ids[curr_].push_back (id_); 689 690 if (uid_ == npos) 691 { 692 const std::size_t counter_ = next_unique_id (); 693 694 if (first_counter_ == npos) 695 { 696 first_counter_ = counter_; 697 } 698 699 _unique_ids[curr_].push_back (counter_); 700 } 701 else 702 { 703 if (first_counter_ == npos) 704 { 705 first_counter_ = uid_; 706 } 707 708 _unique_ids[curr_].push_back (uid_); 709 } 710 711 _states[curr_].push_back (dot_ ? curr_ : new_); 712 } 713 714 return first_counter_; 715 } 716 717 void validate (const CharT *name_) const 718 { 719 const CharT *start_ = name_; 720 721 if (*name_ != '_' && !(*name_ >= 'A' && *name_ <= 'Z') && 722 !(*name_ >= 'a' && *name_ <= 'z')) 723 { 724 std::basic_stringstream<CharT> ss_; 725 std::ostringstream os_; 726 727 os_ << "Invalid name '"; 728 729 while (*name_) 730 { 731 os_ << ss_.narrow (*name_++, ' '); 732 } 733 734 os_ << "'."; 735 throw runtime_error (os_.str ()); 736 } 737 else if (*name_) 738 { 739 ++name_; 740 } 741 742 while (*name_) 743 { 744 if (*name_ != '_' && *name_ != '-' && 745 !(*name_ >= 'A' && *name_ <= 'Z') && 746 !(*name_ >= 'a' && *name_ <= 'z') && 747 !(*name_ >= '0' && *name_ <= '9')) 748 { 749 std::basic_stringstream<CharT> ss_; 750 std::ostringstream os_; 751 752 os_ << "Invalid name '"; 753 name_ = start_; 754 755 while (*name_) 756 { 757 os_ << ss_.narrow (*name_++, ' '); 758 } 759 760 os_ << "'."; 761 throw runtime_error (os_.str ()); 762 } 763 764 ++name_; 765 } 766 767 if (name_ - start_ > static_cast<std::ptrdiff_t>(max_macro_len)) 768 { 769 std::basic_stringstream<CharT> ss_; 770 std::ostringstream os_; 771 772 os_ << "Name '"; 773 name_ = start_; 774 775 while (*name_) 776 { 777 os_ << ss_.narrow (*name_++, ' '); 778 } 779 780 os_ << "' too long."; 781 throw runtime_error (os_.str ()); 782 } 783 } 784 785 void check_for_invalid_id (const std::size_t id_) const 786 { 787 switch (id_) 788 { 789 case 0: 790 throw runtime_error ("id 0 is reserved for EOF."); 791 case npos: 792 throw runtime_error ("id npos is reserved for the " 793 "UNKNOWN token."); 794 default: 795 // OK 796 break; 797 } 798 } 799 }; 800 801 typedef basic_rules<char> rules; 802 typedef basic_rules<wchar_t> wrules; 803 } 804 } 805 806 #endif 807