1 // generator.hpp 2 // Copyright (c) 2007-2009 Ben Hanson (http://www.benhanson.net/) 3 // 4 // Distributed under the Boost Software License, Version 1.0. (See accompanying 5 // file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) 6 #ifndef BOOST_LEXER_GENERATOR_HPP 7 #define BOOST_LEXER_GENERATOR_HPP 8 9 #include "char_traits.hpp" 10 // memcmp() 11 #include <cstring> 12 #include "partition/charset.hpp" 13 #include "partition/equivset.hpp" 14 #include <memory> 15 #include <limits> 16 #include "parser/tree/node.hpp" 17 #include "parser/parser.hpp" 18 #include "containers/ptr_list.hpp" 19 #include "rules.hpp" 20 #include "state_machine.hpp" 21 22 namespace boost 23 { 24 namespace lexer 25 { 26 template<typename CharT, typename Traits = char_traits<CharT> > 27 class basic_generator 28 { 29 public: 30 typedef typename detail::internals::size_t_vector size_t_vector; 31 typedef basic_rules<CharT> rules; 32 build(const rules & rules_,basic_state_machine<CharT> & state_machine_)33 static void build (const rules &rules_, 34 basic_state_machine<CharT> &state_machine_) 35 { 36 std::size_t index_ = 0; 37 std::size_t size_ = rules_.statemap ().size (); 38 node_ptr_vector node_ptr_vector_; 39 detail::internals &internals_ = const_cast<detail::internals &> 40 (state_machine_.data ()); 41 bool seen_BOL_assertion_ = false; 42 bool seen_EOL_assertion_ = false; 43 44 state_machine_.clear (); 45 46 for (; index_ < size_; ++index_) 47 { 48 internals_._lookup->push_back (static_cast<size_t_vector *>(0)); 49 internals_._lookup->back () = new size_t_vector; 50 internals_._dfa_alphabet.push_back (0); 51 internals_._dfa->push_back (static_cast<size_t_vector *>(0)); 52 internals_._dfa->back () = new size_t_vector; 53 } 54 55 for (index_ = 0, size_ = internals_._lookup->size (); 56 index_ < size_; ++index_) 57 { 58 internals_._lookup[index_]->resize (sizeof (CharT) == 1 ? 59 num_chars : num_wchar_ts, dead_state_index); 60 61 if (!rules_.regexes ()[index_].empty ()) 62 { 63 // vector mapping token indexes to partitioned token index sets 64 index_set_vector set_mapping_; 65 // syntax tree 66 detail::node *root_ = build_tree (rules_, index_, 67 node_ptr_vector_, internals_, set_mapping_); 68 69 build_dfa (root_, set_mapping_, 70 internals_._dfa_alphabet[index_], 71 *internals_._dfa[index_]); 72 73 if (internals_._seen_BOL_assertion) 74 { 75 seen_BOL_assertion_ = true; 76 } 77 78 if (internals_._seen_EOL_assertion) 79 { 80 seen_EOL_assertion_ = true; 81 } 82 83 internals_._seen_BOL_assertion = false; 84 internals_._seen_EOL_assertion = false; 85 } 86 } 87 88 internals_._seen_BOL_assertion = seen_BOL_assertion_; 89 internals_._seen_EOL_assertion = seen_EOL_assertion_; 90 } 91 minimise(basic_state_machine<CharT> & state_machine_)92 static void minimise (basic_state_machine<CharT> &state_machine_) 93 { 94 detail::internals &internals_ = const_cast<detail::internals &> 95 (state_machine_.data ()); 96 const std::size_t machines_ = internals_._dfa->size (); 97 98 for (std::size_t i_ = 0; i_ < machines_; ++i_) 99 { 100 const std::size_t dfa_alphabet_ = internals_._dfa_alphabet[i_]; 101 size_t_vector *dfa_ = internals_._dfa[i_]; 102 103 if (dfa_alphabet_ != 0) 104 { 105 std::size_t size_ = 0; 106 107 do 108 { 109 size_ = dfa_->size (); 110 minimise_dfa (dfa_alphabet_, *dfa_, size_); 111 } while (dfa_->size () != size_); 112 } 113 } 114 } 115 116 protected: 117 typedef detail::basic_charset<CharT> charset; 118 typedef detail::ptr_list<charset> charset_list; 119 typedef std::auto_ptr<charset> charset_ptr; 120 typedef detail::equivset equivset; 121 typedef detail::ptr_list<equivset> equivset_list; 122 typedef std::auto_ptr<equivset> equivset_ptr; 123 typedef typename charset::index_set index_set; 124 typedef std::vector<index_set> index_set_vector; 125 typedef detail::basic_parser<CharT> parser; 126 typedef typename parser::node_ptr_vector node_ptr_vector; 127 typedef std::set<const detail::node *> node_set; 128 typedef detail::ptr_vector<node_set> node_set_vector; 129 typedef std::vector<const detail::node *> node_vector; 130 typedef detail::ptr_vector<node_vector> node_vector_vector; 131 typedef typename parser::string string; 132 typedef std::pair<string, string> string_pair; 133 typedef typename parser::tokeniser::string_token string_token; 134 typedef std::deque<string_pair> macro_deque; 135 typedef std::pair<string, const detail::node *> macro_pair; 136 typedef typename parser::macro_map::iterator macro_iter; 137 typedef std::pair<macro_iter, bool> macro_iter_pair; 138 typedef typename parser::tokeniser::token_map token_map; 139 build_tree(const rules & rules_,const std::size_t state_,node_ptr_vector & node_ptr_vector_,detail::internals & internals_,index_set_vector & set_mapping_)140 static detail::node *build_tree (const rules &rules_, 141 const std::size_t state_, node_ptr_vector &node_ptr_vector_, 142 detail::internals &internals_, index_set_vector &set_mapping_) 143 { 144 size_t_vector *lookup_ = internals_._lookup[state_]; 145 const typename rules::string_deque_deque ®exes_ = 146 rules_.regexes (); 147 const typename rules::id_vector_deque &ids_ = rules_.ids (); 148 const typename rules::id_vector_deque &unique_ids_ = 149 rules_.unique_ids (); 150 const typename rules::id_vector_deque &states_ = rules_.states (); 151 typename rules::string_deque::const_iterator regex_iter_ = 152 regexes_[state_].begin (); 153 typename rules::string_deque::const_iterator regex_iter_end_ = 154 regexes_[state_].end (); 155 typename rules::id_vector::const_iterator ids_iter_ = 156 ids_[state_].begin (); 157 typename rules::id_vector::const_iterator unique_ids_iter_ = 158 unique_ids_[state_].begin (); 159 typename rules::id_vector::const_iterator states_iter_ = 160 states_[state_].begin (); 161 const typename rules::string ®ex_ = *regex_iter_; 162 // map of regex charset tokens (strings) to index 163 token_map token_map_; 164 const typename rules::string_pair_deque ¯odeque_ = 165 rules_.macrodeque (); 166 typename parser::macro_map macromap_; 167 typename detail::node::node_vector tree_vector_; 168 169 build_macros (token_map_, macrodeque_, macromap_, 170 rules_.flags (), rules_.locale (), node_ptr_vector_, 171 internals_._seen_BOL_assertion, internals_._seen_EOL_assertion); 172 173 detail::node *root_ = parser::parse (regex_.c_str (), 174 regex_.c_str () + regex_.size (), *ids_iter_, *unique_ids_iter_, 175 *states_iter_, rules_.flags (), rules_.locale (), node_ptr_vector_, 176 macromap_, token_map_, internals_._seen_BOL_assertion, 177 internals_._seen_EOL_assertion); 178 179 ++regex_iter_; 180 ++ids_iter_; 181 ++unique_ids_iter_; 182 ++states_iter_; 183 tree_vector_.push_back (root_); 184 185 // build syntax trees 186 while (regex_iter_ != regex_iter_end_) 187 { 188 // re-declare var, otherwise we perform an assignment..! 189 const typename rules::string ®ex2_ = *regex_iter_; 190 191 root_ = parser::parse (regex2_.c_str (), 192 regex2_.c_str () + regex2_.size (), *ids_iter_, 193 *unique_ids_iter_, *states_iter_, rules_.flags (), 194 rules_.locale (), node_ptr_vector_, macromap_, token_map_, 195 internals_._seen_BOL_assertion, 196 internals_._seen_EOL_assertion); 197 tree_vector_.push_back (root_); 198 ++regex_iter_; 199 ++ids_iter_; 200 ++unique_ids_iter_; 201 ++states_iter_; 202 } 203 204 if (internals_._seen_BOL_assertion) 205 { 206 // Fixup BOLs 207 typename detail::node::node_vector::iterator iter_ = 208 tree_vector_.begin (); 209 typename detail::node::node_vector::iterator end_ = 210 tree_vector_.end (); 211 212 for (; iter_ != end_; ++iter_) 213 { 214 fixup_bol (*iter_, node_ptr_vector_); 215 } 216 } 217 218 // join trees 219 { 220 typename detail::node::node_vector::iterator iter_ = 221 tree_vector_.begin (); 222 typename detail::node::node_vector::iterator end_ = 223 tree_vector_.end (); 224 225 if (iter_ != end_) 226 { 227 root_ = *iter_; 228 ++iter_; 229 } 230 231 for (; iter_ != end_; ++iter_) 232 { 233 node_ptr_vector_->push_back (static_cast<detail::selection_node *>(0)); 234 node_ptr_vector_->back () = new detail::selection_node 235 (root_, *iter_); 236 root_ = node_ptr_vector_->back (); 237 } 238 } 239 240 // partitioned token list 241 charset_list token_list_; 242 243 set_mapping_.resize (token_map_.size ()); 244 partition_tokens (token_map_, token_list_); 245 246 typename charset_list::list::const_iterator iter_ = 247 token_list_->begin (); 248 typename charset_list::list::const_iterator end_ = 249 token_list_->end (); 250 std::size_t index_ = 0; 251 252 for (; iter_ != end_; ++iter_, ++index_) 253 { 254 const charset *cs_ = *iter_; 255 typename charset::index_set::const_iterator set_iter_ = 256 cs_->_index_set.begin (); 257 typename charset::index_set::const_iterator set_end_ = 258 cs_->_index_set.end (); 259 260 fill_lookup (cs_->_token, lookup_, index_); 261 262 for (; set_iter_ != set_end_; ++set_iter_) 263 { 264 set_mapping_[*set_iter_].insert (index_); 265 } 266 } 267 268 internals_._dfa_alphabet[state_] = token_list_->size () + dfa_offset; 269 return root_; 270 } 271 build_macros(token_map & token_map_,const macro_deque & macrodeque_,typename parser::macro_map & macromap_,const regex_flags flags_,const std::locale & locale_,node_ptr_vector & node_ptr_vector_,bool & seen_BOL_assertion_,bool & seen_EOL_assertion_)272 static void build_macros (token_map &token_map_, 273 const macro_deque ¯odeque_, 274 typename parser::macro_map ¯omap_, const regex_flags flags_, 275 const std::locale &locale_, node_ptr_vector &node_ptr_vector_, 276 bool &seen_BOL_assertion_, bool &seen_EOL_assertion_) 277 { 278 for (typename macro_deque::const_iterator iter_ = 279 macrodeque_.begin (), end_ = macrodeque_.end (); 280 iter_ != end_; ++iter_) 281 { 282 const typename rules::string &name_ = iter_->first; 283 const typename rules::string ®ex_ = iter_->second; 284 detail::node *node_ = parser::parse (regex_.c_str (), 285 regex_.c_str () + regex_.size (), 0, 0, 0, flags_, 286 locale_, node_ptr_vector_, macromap_, token_map_, 287 seen_BOL_assertion_, seen_EOL_assertion_); 288 macro_iter_pair map_iter_ = macromap_. 289 insert (macro_pair (name_, static_cast<const detail::node *> 290 (0))); 291 292 map_iter_.first->second = node_; 293 } 294 } 295 build_dfa(detail::node * root_,const index_set_vector & set_mapping_,const std::size_t dfa_alphabet_,size_t_vector & dfa_)296 static void build_dfa (detail::node *root_, 297 const index_set_vector &set_mapping_, const std::size_t dfa_alphabet_, 298 size_t_vector &dfa_) 299 { 300 typename detail::node::node_vector *followpos_ = 301 &root_->firstpos (); 302 node_set_vector seen_sets_; 303 node_vector_vector seen_vectors_; 304 size_t_vector hash_vector_; 305 306 // 'jam' state 307 dfa_.resize (dfa_alphabet_, 0); 308 closure (followpos_, seen_sets_, seen_vectors_, 309 hash_vector_, dfa_alphabet_, dfa_); 310 311 std::size_t *ptr_ = 0; 312 313 for (std::size_t index_ = 0; index_ < seen_vectors_->size (); ++index_) 314 { 315 equivset_list equiv_list_; 316 317 build_equiv_list (seen_vectors_[index_], set_mapping_, equiv_list_); 318 319 for (typename equivset_list::list::const_iterator iter_ = 320 equiv_list_->begin (), end_ = equiv_list_->end (); 321 iter_ != end_; ++iter_) 322 { 323 equivset *equivset_ = *iter_; 324 const std::size_t transition_ = closure 325 (&equivset_->_followpos, seen_sets_, seen_vectors_, 326 hash_vector_, dfa_alphabet_, dfa_); 327 328 if (transition_ != npos) 329 { 330 ptr_ = &dfa_.front () + ((index_ + 1) * dfa_alphabet_); 331 332 // Prune abstemious transitions from end states. 333 if (*ptr_ && !equivset_->_greedy) continue; 334 335 for (typename detail::equivset::index_vector::const_iterator 336 equiv_iter_ = equivset_->_index_vector.begin (), 337 equiv_end_ = equivset_->_index_vector.end (); 338 equiv_iter_ != equiv_end_; ++equiv_iter_) 339 { 340 const std::size_t equiv_index_ = *equiv_iter_; 341 342 if (equiv_index_ == bol_token) 343 { 344 if (ptr_[eol_index] == 0) 345 { 346 ptr_[bol_index] = transition_; 347 } 348 } 349 else if (equiv_index_ == eol_token) 350 { 351 if (ptr_[bol_index] == 0) 352 { 353 ptr_[eol_index] = transition_; 354 } 355 } 356 else 357 { 358 ptr_[equiv_index_ + dfa_offset] = transition_; 359 } 360 } 361 } 362 } 363 } 364 } 365 closure(typename detail::node::node_vector * followpos_,node_set_vector & seen_sets_,node_vector_vector & seen_vectors_,size_t_vector & hash_vector_,const std::size_t size_,size_t_vector & dfa_)366 static std::size_t closure (typename detail::node::node_vector *followpos_, 367 node_set_vector &seen_sets_, node_vector_vector &seen_vectors_, 368 size_t_vector &hash_vector_, const std::size_t size_, 369 size_t_vector &dfa_) 370 { 371 bool end_state_ = false; 372 std::size_t id_ = 0; 373 std::size_t unique_id_ = npos; 374 std::size_t state_ = 0; 375 std::size_t hash_ = 0; 376 377 if (followpos_->empty ()) return npos; 378 379 std::size_t index_ = 0; 380 std::auto_ptr<node_set> set_ptr_ (new node_set); 381 std::auto_ptr<node_vector> vector_ptr_ (new node_vector); 382 383 for (typename detail::node::node_vector::const_iterator iter_ = 384 followpos_->begin (), end_ = followpos_->end (); 385 iter_ != end_; ++iter_) 386 { 387 closure_ex (*iter_, end_state_, id_, unique_id_, state_, 388 set_ptr_.get (), vector_ptr_.get (), hash_); 389 } 390 391 bool found_ = false; 392 typename size_t_vector::const_iterator hash_iter_ = 393 hash_vector_.begin (); 394 typename size_t_vector::const_iterator hash_end_ = 395 hash_vector_.end (); 396 typename node_set_vector::vector::const_iterator set_iter_ = 397 seen_sets_->begin (); 398 399 for (; hash_iter_ != hash_end_; ++hash_iter_, ++set_iter_) 400 { 401 found_ = *hash_iter_ == hash_ && *(*set_iter_) == *set_ptr_; 402 ++index_; 403 404 if (found_) break; 405 } 406 407 if (!found_) 408 { 409 seen_sets_->push_back (static_cast<node_set *>(0)); 410 seen_sets_->back () = set_ptr_.release (); 411 seen_vectors_->push_back (static_cast<node_vector *>(0)); 412 seen_vectors_->back () = vector_ptr_.release (); 413 hash_vector_.push_back (hash_); 414 // State 0 is the jam state... 415 index_ = seen_sets_->size (); 416 417 const std::size_t old_size_ = dfa_.size (); 418 419 dfa_.resize (old_size_ + size_, 0); 420 421 if (end_state_) 422 { 423 dfa_[old_size_] |= end_state; 424 dfa_[old_size_ + id_index] = id_; 425 dfa_[old_size_ + unique_id_index] = unique_id_; 426 dfa_[old_size_ + state_index] = state_; 427 } 428 } 429 430 return index_; 431 } 432 closure_ex(detail::node * node_,bool & end_state_,std::size_t & id_,std::size_t & unique_id_,std::size_t & state_,node_set * set_ptr_,node_vector * vector_ptr_,std::size_t & hash_)433 static void closure_ex (detail::node *node_, bool &end_state_, 434 std::size_t &id_, std::size_t &unique_id_, std::size_t &state_, 435 node_set *set_ptr_, node_vector *vector_ptr_, std::size_t &hash_) 436 { 437 const bool temp_end_state_ = node_->end_state (); 438 439 if (temp_end_state_) 440 { 441 if (!end_state_) 442 { 443 end_state_ = true; 444 id_ = node_->id (); 445 unique_id_ = node_->unique_id (); 446 state_ = node_->lexer_state (); 447 } 448 } 449 450 if (set_ptr_->insert (node_).second) 451 { 452 vector_ptr_->push_back (node_); 453 hash_ += reinterpret_cast<std::size_t> (node_); 454 } 455 } 456 partition_tokens(const token_map & map_,charset_list & lhs_)457 static void partition_tokens (const token_map &map_, 458 charset_list &lhs_) 459 { 460 charset_list rhs_; 461 462 fill_rhs_list (map_, rhs_); 463 464 if (!rhs_->empty ()) 465 { 466 typename charset_list::list::iterator iter_; 467 typename charset_list::list::iterator end_; 468 charset_ptr overlap_ (new charset); 469 470 lhs_->push_back (static_cast<charset *>(0)); 471 lhs_->back () = rhs_->front (); 472 rhs_->pop_front (); 473 474 while (!rhs_->empty ()) 475 { 476 charset_ptr r_ (rhs_->front ()); 477 478 rhs_->pop_front (); 479 iter_ = lhs_->begin (); 480 end_ = lhs_->end (); 481 482 while (!r_->empty () && iter_ != end_) 483 { 484 typename charset_list::list::iterator l_iter_ = iter_; 485 486 (*l_iter_)->intersect (*r_.get (), *overlap_.get ()); 487 488 if (overlap_->empty ()) 489 { 490 ++iter_; 491 } 492 else if ((*l_iter_)->empty ()) 493 { 494 delete *l_iter_; 495 *l_iter_ = overlap_.release (); 496 497 // VC++ 6 Hack: 498 charset_ptr temp_overlap_ (new charset); 499 500 overlap_ = temp_overlap_; 501 ++iter_; 502 } 503 else if (r_->empty ()) 504 { 505 delete r_.release (); 506 r_ = overlap_; 507 508 // VC++ 6 Hack: 509 charset_ptr temp_overlap_ (new charset); 510 511 overlap_ = temp_overlap_; 512 break; 513 } 514 else 515 { 516 iter_ = lhs_->insert (++iter_, 517 static_cast<charset *>(0)); 518 *iter_ = overlap_.release (); 519 520 // VC++ 6 Hack: 521 charset_ptr temp_overlap_ (new charset); 522 523 overlap_ = temp_overlap_; 524 ++iter_; 525 end_ = lhs_->end (); 526 } 527 } 528 529 if (!r_->empty ()) 530 { 531 lhs_->push_back (static_cast<charset *>(0)); 532 lhs_->back () = r_.release (); 533 } 534 } 535 } 536 } 537 fill_rhs_list(const token_map & map_,charset_list & list_)538 static void fill_rhs_list (const token_map &map_, 539 charset_list &list_) 540 { 541 typename parser::tokeniser::token_map::const_iterator iter_ = 542 map_.begin (); 543 typename parser::tokeniser::token_map::const_iterator end_ = 544 map_.end (); 545 546 for (; iter_ != end_; ++iter_) 547 { 548 list_->push_back (static_cast<charset *>(0)); 549 list_->back () = new charset (iter_->first, iter_->second); 550 } 551 } 552 fill_lookup(const string_token & token_,size_t_vector * lookup_,const std::size_t index_)553 static void fill_lookup (const string_token &token_, 554 size_t_vector *lookup_, const std::size_t index_) 555 { 556 const CharT *curr_ = token_._charset.c_str (); 557 const CharT *chars_end_ = curr_ + token_._charset.size (); 558 std::size_t *ptr_ = &lookup_->front (); 559 const std::size_t max_ = sizeof (CharT) == 1 ? 560 num_chars : num_wchar_ts; 561 562 if (token_._negated) 563 { 564 // $$$ FIXME JDG July 2014 $$$ 565 // this code is problematic on platforms where wchar_t is signed 566 // with min generating negative numbers. This crashes with BAD_ACCESS 567 // because of the vector index below: 568 // ptr_[static_cast<typename Traits::index_type>(curr_char_)] 569 CharT curr_char_ = 0; // (std::numeric_limits<CharT>::min)(); 570 std::size_t i_ = 0; 571 572 while (curr_ < chars_end_) 573 { 574 while (*curr_ > curr_char_) 575 { 576 ptr_[static_cast<typename Traits::index_type> 577 (curr_char_)] = index_ + dfa_offset; 578 ++curr_char_; 579 ++i_; 580 } 581 582 ++curr_char_; 583 ++curr_; 584 ++i_; 585 } 586 587 for (; i_ < max_; ++i_) 588 { 589 ptr_[static_cast<typename Traits::index_type>(curr_char_)] = 590 index_ + dfa_offset; 591 ++curr_char_; 592 } 593 } 594 else 595 { 596 while (curr_ < chars_end_) 597 { 598 ptr_[static_cast<typename Traits::index_type>(*curr_)] = 599 index_ + dfa_offset; 600 ++curr_; 601 } 602 } 603 } 604 build_equiv_list(const node_vector * vector_,const index_set_vector & set_mapping_,equivset_list & lhs_)605 static void build_equiv_list (const node_vector *vector_, 606 const index_set_vector &set_mapping_, equivset_list &lhs_) 607 { 608 equivset_list rhs_; 609 610 fill_rhs_list (vector_, set_mapping_, rhs_); 611 612 if (!rhs_->empty ()) 613 { 614 typename equivset_list::list::iterator iter_; 615 typename equivset_list::list::iterator end_; 616 equivset_ptr overlap_ (new equivset); 617 618 lhs_->push_back (static_cast<equivset *>(0)); 619 lhs_->back () = rhs_->front (); 620 rhs_->pop_front (); 621 622 while (!rhs_->empty ()) 623 { 624 equivset_ptr r_ (rhs_->front ()); 625 626 rhs_->pop_front (); 627 iter_ = lhs_->begin (); 628 end_ = lhs_->end (); 629 630 while (!r_->empty () && iter_ != end_) 631 { 632 typename equivset_list::list::iterator l_iter_ = iter_; 633 634 (*l_iter_)->intersect (*r_.get (), *overlap_.get ()); 635 636 if (overlap_->empty ()) 637 { 638 ++iter_; 639 } 640 else if ((*l_iter_)->empty ()) 641 { 642 delete *l_iter_; 643 *l_iter_ = overlap_.release (); 644 645 // VC++ 6 Hack: 646 equivset_ptr temp_overlap_ (new equivset); 647 648 overlap_ = temp_overlap_; 649 ++iter_; 650 } 651 else if (r_->empty ()) 652 { 653 delete r_.release (); 654 r_ = overlap_; 655 656 // VC++ 6 Hack: 657 equivset_ptr temp_overlap_ (new equivset); 658 659 overlap_ = temp_overlap_; 660 break; 661 } 662 else 663 { 664 iter_ = lhs_->insert (++iter_, 665 static_cast<equivset *>(0)); 666 *iter_ = overlap_.release (); 667 668 // VC++ 6 Hack: 669 equivset_ptr temp_overlap_ (new equivset); 670 671 overlap_ = temp_overlap_; 672 ++iter_; 673 end_ = lhs_->end (); 674 } 675 } 676 677 if (!r_->empty ()) 678 { 679 lhs_->push_back (static_cast<equivset *>(0)); 680 lhs_->back () = r_.release (); 681 } 682 } 683 } 684 } 685 fill_rhs_list(const node_vector * vector_,const index_set_vector & set_mapping_,equivset_list & list_)686 static void fill_rhs_list (const node_vector *vector_, 687 const index_set_vector &set_mapping_, equivset_list &list_) 688 { 689 typename node_vector::const_iterator iter_ = 690 vector_->begin (); 691 typename node_vector::const_iterator end_ = 692 vector_->end (); 693 694 for (; iter_ != end_; ++iter_) 695 { 696 const detail::node *node_ = *iter_; 697 698 if (!node_->end_state ()) 699 { 700 const std::size_t token_ = node_->token (); 701 702 if (token_ != null_token) 703 { 704 list_->push_back (static_cast<equivset *>(0)); 705 706 if (token_ == bol_token || token_ == eol_token) 707 { 708 std::set<std::size_t> index_set_; 709 710 index_set_.insert (token_); 711 list_->back () = new equivset (index_set_, 712 node_->greedy (), token_, node_->followpos ()); 713 } 714 else 715 { 716 list_->back () = new equivset (set_mapping_[token_], 717 node_->greedy (), token_, node_->followpos ()); 718 } 719 } 720 } 721 } 722 } 723 fixup_bol(detail::node * & root_,node_ptr_vector & node_ptr_vector_)724 static void fixup_bol (detail::node * &root_, 725 node_ptr_vector &node_ptr_vector_) 726 { 727 typename detail::node::node_vector *first_ = &root_->firstpos (); 728 bool found_ = false; 729 typename detail::node::node_vector::const_iterator iter_ = 730 first_->begin (); 731 typename detail::node::node_vector::const_iterator end_ = 732 first_->end (); 733 734 for (; iter_ != end_; ++iter_) 735 { 736 const detail::node *node_ = *iter_; 737 738 found_ = !node_->end_state () && node_->token () == bol_token; 739 740 if (found_) break; 741 } 742 743 if (!found_) 744 { 745 node_ptr_vector_->push_back (static_cast<detail::leaf_node *>(0)); 746 node_ptr_vector_->back () = new detail::leaf_node 747 (bol_token, true); 748 749 detail::node *lhs_ = node_ptr_vector_->back (); 750 751 node_ptr_vector_->push_back (static_cast<detail::leaf_node *>(0)); 752 node_ptr_vector_->back () = new detail::leaf_node 753 (null_token, true); 754 755 detail::node *rhs_ = node_ptr_vector_->back (); 756 757 node_ptr_vector_->push_back 758 (static_cast<detail::selection_node *>(0)); 759 node_ptr_vector_->back () = 760 new detail::selection_node (lhs_, rhs_); 761 lhs_ = node_ptr_vector_->back (); 762 763 node_ptr_vector_->push_back 764 (static_cast<detail::sequence_node *>(0)); 765 node_ptr_vector_->back () = 766 new detail::sequence_node (lhs_, root_); 767 root_ = node_ptr_vector_->back (); 768 } 769 } 770 minimise_dfa(const std::size_t dfa_alphabet_,size_t_vector & dfa_,std::size_t size_)771 static void minimise_dfa (const std::size_t dfa_alphabet_, 772 size_t_vector &dfa_, std::size_t size_) 773 { 774 const std::size_t *first_ = &dfa_.front (); 775 const std::size_t *second_ = 0; 776 const std::size_t *end_ = first_ + size_; 777 std::size_t index_ = 1; 778 std::size_t new_index_ = 1; 779 std::size_t curr_index_ = 0; 780 index_set index_set_; 781 size_t_vector lookup_; 782 std::size_t *lookup_ptr_ = 0; 783 784 lookup_.resize (size_ / dfa_alphabet_, null_token); 785 lookup_ptr_ = &lookup_.front (); 786 *lookup_ptr_ = 0; 787 // Only one 'jam' state, so skip it. 788 first_ += dfa_alphabet_; 789 790 for (; first_ < end_; first_ += dfa_alphabet_, ++index_) 791 { 792 for (second_ = first_ + dfa_alphabet_, curr_index_ = index_ + 1; 793 second_ < end_; second_ += dfa_alphabet_, ++curr_index_) 794 { 795 if (index_set_.find (curr_index_) != index_set_.end ()) 796 { 797 continue; 798 } 799 800 // Some systems have memcmp in namespace std. 801 using namespace std; 802 803 if (memcmp (first_, second_, sizeof (std::size_t) * 804 dfa_alphabet_) == 0) 805 { 806 index_set_.insert (curr_index_); 807 lookup_ptr_[curr_index_] = new_index_; 808 } 809 } 810 811 if (lookup_ptr_[index_] == null_token) 812 { 813 lookup_ptr_[index_] = new_index_; 814 ++new_index_; 815 } 816 } 817 818 if (!index_set_.empty ()) 819 { 820 const std::size_t *front_ = &dfa_.front (); 821 size_t_vector new_dfa_ (front_, front_ + dfa_alphabet_); 822 typename index_set::iterator set_end_ = 823 index_set_.end (); 824 const std::size_t *ptr_ = front_ + dfa_alphabet_; 825 std::size_t *new_ptr_ = 0; 826 827 new_dfa_.resize (size_ - index_set_.size () * dfa_alphabet_, 0); 828 new_ptr_ = &new_dfa_.front () + dfa_alphabet_; 829 size_ /= dfa_alphabet_; 830 831 for (index_ = 1; index_ < size_; ++index_) 832 { 833 if (index_set_.find (index_) != set_end_) 834 { 835 ptr_ += dfa_alphabet_; 836 continue; 837 } 838 839 new_ptr_[end_state_index] = ptr_[end_state_index]; 840 new_ptr_[id_index] = ptr_[id_index]; 841 new_ptr_[unique_id_index] = ptr_[unique_id_index]; 842 new_ptr_[state_index] = ptr_[state_index]; 843 new_ptr_[bol_index] = lookup_ptr_[ptr_[bol_index]]; 844 new_ptr_[eol_index] = lookup_ptr_[ptr_[eol_index]]; 845 new_ptr_ += dfa_offset; 846 ptr_ += dfa_offset; 847 848 for (std::size_t i_ = dfa_offset; i_ < dfa_alphabet_; ++i_) 849 { 850 *new_ptr_++ = lookup_ptr_[*ptr_++]; 851 } 852 } 853 854 dfa_.swap (new_dfa_); 855 } 856 } 857 }; 858 859 typedef basic_generator<char> generator; 860 typedef basic_generator<wchar_t> wgenerator; 861 } 862 } 863 864 #endif 865