1 // generator.hpp 2 // Copyright (c) 2007-2009 Ben Hanson (http://www.benhanson.net/) 3 // 4 // Distributed under the Boost Software License, Version 1.0. (See accompanying 5 // file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) 6 #ifndef BOOST_SPIRIT_SUPPORT_DETAIL_LEXER_GENERATOR_HPP 7 #define BOOST_SPIRIT_SUPPORT_DETAIL_LEXER_GENERATOR_HPP 8 9 #include "char_traits.hpp" 10 // memcmp() 11 #include <cstring> 12 #include "partition/charset.hpp" 13 #include "partition/equivset.hpp" 14 #include <memory> 15 #include <limits> 16 #include "parser/tree/node.hpp" 17 #include "parser/parser.hpp" 18 #include "containers/ptr_list.hpp" 19 #include <boost/move/unique_ptr.hpp> 20 #include "rules.hpp" 21 #include "state_machine.hpp" 22 23 namespace boost 24 { 25 namespace lexer 26 { 27 template<typename CharT, typename Traits = char_traits<CharT> > 28 class basic_generator 29 { 30 public: 31 typedef typename detail::internals::size_t_vector size_t_vector; 32 typedef basic_rules<CharT> rules; 33 build(const rules & rules_,basic_state_machine<CharT> & state_machine_)34 static void build (const rules &rules_, 35 basic_state_machine<CharT> &state_machine_) 36 { 37 std::size_t index_ = 0; 38 std::size_t size_ = rules_.statemap ().size (); 39 node_ptr_vector node_ptr_vector_; 40 detail::internals &internals_ = const_cast<detail::internals &> 41 (state_machine_.data ()); 42 bool seen_BOL_assertion_ = false; 43 bool seen_EOL_assertion_ = false; 44 45 state_machine_.clear (); 46 47 for (; index_ < size_; ++index_) 48 { 49 internals_._lookup->push_back (static_cast<size_t_vector *>(0)); 50 internals_._lookup->back () = new size_t_vector; 51 internals_._dfa_alphabet.push_back (0); 52 internals_._dfa->push_back (static_cast<size_t_vector *>(0)); 53 internals_._dfa->back () = new size_t_vector; 54 } 55 56 for (index_ = 0, size_ = internals_._lookup->size (); 57 index_ < size_; ++index_) 58 { 59 internals_._lookup[index_]->resize (sizeof (CharT) == 1 ? 60 num_chars : num_wchar_ts, dead_state_index); 61 62 if (!rules_.regexes ()[index_].empty ()) 63 { 64 // vector mapping token indexes to partitioned token index sets 65 index_set_vector set_mapping_; 66 // syntax tree 67 detail::node *root_ = build_tree (rules_, index_, 68 node_ptr_vector_, internals_, set_mapping_); 69 70 build_dfa (root_, set_mapping_, 71 internals_._dfa_alphabet[index_], 72 *internals_._dfa[index_]); 73 74 if (internals_._seen_BOL_assertion) 75 { 76 seen_BOL_assertion_ = true; 77 } 78 79 if (internals_._seen_EOL_assertion) 80 { 81 seen_EOL_assertion_ = true; 82 } 83 84 internals_._seen_BOL_assertion = false; 85 internals_._seen_EOL_assertion = false; 86 } 87 } 88 89 internals_._seen_BOL_assertion = seen_BOL_assertion_; 90 internals_._seen_EOL_assertion = seen_EOL_assertion_; 91 } 92 minimise(basic_state_machine<CharT> & state_machine_)93 static void minimise (basic_state_machine<CharT> &state_machine_) 94 { 95 detail::internals &internals_ = const_cast<detail::internals &> 96 (state_machine_.data ()); 97 const std::size_t machines_ = internals_._dfa->size (); 98 99 for (std::size_t i_ = 0; i_ < machines_; ++i_) 100 { 101 const std::size_t dfa_alphabet_ = internals_._dfa_alphabet[i_]; 102 size_t_vector *dfa_ = internals_._dfa[i_]; 103 104 if (dfa_alphabet_ != 0) 105 { 106 std::size_t size_ = 0; 107 108 do 109 { 110 size_ = dfa_->size (); 111 minimise_dfa (dfa_alphabet_, *dfa_, size_); 112 } while (dfa_->size () != size_); 113 } 114 } 115 } 116 117 protected: 118 typedef detail::basic_charset<CharT> charset; 119 typedef detail::ptr_list<charset> charset_list; 120 typedef boost::movelib::unique_ptr<charset> charset_ptr; 121 typedef detail::equivset equivset; 122 typedef detail::ptr_list<equivset> equivset_list; 123 typedef boost::movelib::unique_ptr<equivset> equivset_ptr; 124 typedef typename charset::index_set index_set; 125 typedef std::vector<index_set> index_set_vector; 126 typedef detail::basic_parser<CharT> parser; 127 typedef typename parser::node_ptr_vector node_ptr_vector; 128 typedef std::set<const detail::node *> node_set; 129 typedef detail::ptr_vector<node_set> node_set_vector; 130 typedef std::vector<const detail::node *> node_vector; 131 typedef detail::ptr_vector<node_vector> node_vector_vector; 132 typedef typename parser::string string; 133 typedef std::pair<string, string> string_pair; 134 typedef typename parser::tokeniser::string_token string_token; 135 typedef std::deque<string_pair> macro_deque; 136 typedef std::pair<string, const detail::node *> macro_pair; 137 typedef typename parser::macro_map::iterator macro_iter; 138 typedef std::pair<macro_iter, bool> macro_iter_pair; 139 typedef typename parser::tokeniser::token_map token_map; 140 build_tree(const rules & rules_,const std::size_t state_,node_ptr_vector & node_ptr_vector_,detail::internals & internals_,index_set_vector & set_mapping_)141 static detail::node *build_tree (const rules &rules_, 142 const std::size_t state_, node_ptr_vector &node_ptr_vector_, 143 detail::internals &internals_, index_set_vector &set_mapping_) 144 { 145 size_t_vector *lookup_ = internals_._lookup[state_]; 146 const typename rules::string_deque_deque ®exes_ = 147 rules_.regexes (); 148 const typename rules::id_vector_deque &ids_ = rules_.ids (); 149 const typename rules::id_vector_deque &unique_ids_ = 150 rules_.unique_ids (); 151 const typename rules::id_vector_deque &states_ = rules_.states (); 152 typename rules::string_deque::const_iterator regex_iter_ = 153 regexes_[state_].begin (); 154 typename rules::string_deque::const_iterator regex_iter_end_ = 155 regexes_[state_].end (); 156 typename rules::id_vector::const_iterator ids_iter_ = 157 ids_[state_].begin (); 158 typename rules::id_vector::const_iterator unique_ids_iter_ = 159 unique_ids_[state_].begin (); 160 typename rules::id_vector::const_iterator states_iter_ = 161 states_[state_].begin (); 162 const typename rules::string ®ex_ = *regex_iter_; 163 // map of regex charset tokens (strings) to index 164 token_map token_map_; 165 const typename rules::string_pair_deque ¯odeque_ = 166 rules_.macrodeque (); 167 typename parser::macro_map macromap_; 168 typename detail::node::node_vector tree_vector_; 169 170 build_macros (token_map_, macrodeque_, macromap_, 171 rules_.flags (), rules_.locale (), node_ptr_vector_, 172 internals_._seen_BOL_assertion, internals_._seen_EOL_assertion); 173 174 detail::node *root_ = parser::parse (regex_.c_str (), 175 regex_.c_str () + regex_.size (), *ids_iter_, *unique_ids_iter_, 176 *states_iter_, rules_.flags (), rules_.locale (), node_ptr_vector_, 177 macromap_, token_map_, internals_._seen_BOL_assertion, 178 internals_._seen_EOL_assertion); 179 180 ++regex_iter_; 181 ++ids_iter_; 182 ++unique_ids_iter_; 183 ++states_iter_; 184 tree_vector_.push_back (root_); 185 186 // build syntax trees 187 while (regex_iter_ != regex_iter_end_) 188 { 189 // re-declare var, otherwise we perform an assignment..! 190 const typename rules::string ®ex2_ = *regex_iter_; 191 192 root_ = parser::parse (regex2_.c_str (), 193 regex2_.c_str () + regex2_.size (), *ids_iter_, 194 *unique_ids_iter_, *states_iter_, rules_.flags (), 195 rules_.locale (), node_ptr_vector_, macromap_, token_map_, 196 internals_._seen_BOL_assertion, 197 internals_._seen_EOL_assertion); 198 tree_vector_.push_back (root_); 199 ++regex_iter_; 200 ++ids_iter_; 201 ++unique_ids_iter_; 202 ++states_iter_; 203 } 204 205 if (internals_._seen_BOL_assertion) 206 { 207 // Fixup BOLs 208 typename detail::node::node_vector::iterator iter_ = 209 tree_vector_.begin (); 210 typename detail::node::node_vector::iterator end_ = 211 tree_vector_.end (); 212 213 for (; iter_ != end_; ++iter_) 214 { 215 fixup_bol (*iter_, node_ptr_vector_); 216 } 217 } 218 219 // join trees 220 { 221 typename detail::node::node_vector::iterator iter_ = 222 tree_vector_.begin (); 223 typename detail::node::node_vector::iterator end_ = 224 tree_vector_.end (); 225 226 if (iter_ != end_) 227 { 228 root_ = *iter_; 229 ++iter_; 230 } 231 232 for (; iter_ != end_; ++iter_) 233 { 234 node_ptr_vector_->push_back (static_cast<detail::selection_node *>(0)); 235 node_ptr_vector_->back () = new detail::selection_node 236 (root_, *iter_); 237 root_ = node_ptr_vector_->back (); 238 } 239 } 240 241 // partitioned token list 242 charset_list token_list_; 243 244 set_mapping_.resize (token_map_.size ()); 245 partition_tokens (token_map_, token_list_); 246 247 typename charset_list::list::const_iterator iter_ = 248 token_list_->begin (); 249 typename charset_list::list::const_iterator end_ = 250 token_list_->end (); 251 std::size_t index_ = 0; 252 253 for (; iter_ != end_; ++iter_, ++index_) 254 { 255 const charset *cs_ = *iter_; 256 typename charset::index_set::const_iterator set_iter_ = 257 cs_->_index_set.begin (); 258 typename charset::index_set::const_iterator set_end_ = 259 cs_->_index_set.end (); 260 261 fill_lookup (cs_->_token, lookup_, index_); 262 263 for (; set_iter_ != set_end_; ++set_iter_) 264 { 265 set_mapping_[*set_iter_].insert (index_); 266 } 267 } 268 269 internals_._dfa_alphabet[state_] = token_list_->size () + dfa_offset; 270 return root_; 271 } 272 build_macros(token_map & token_map_,const macro_deque & macrodeque_,typename parser::macro_map & macromap_,const regex_flags flags_,const std::locale & locale_,node_ptr_vector & node_ptr_vector_,bool & seen_BOL_assertion_,bool & seen_EOL_assertion_)273 static void build_macros (token_map &token_map_, 274 const macro_deque ¯odeque_, 275 typename parser::macro_map ¯omap_, const regex_flags flags_, 276 const std::locale &locale_, node_ptr_vector &node_ptr_vector_, 277 bool &seen_BOL_assertion_, bool &seen_EOL_assertion_) 278 { 279 for (typename macro_deque::const_iterator iter_ = 280 macrodeque_.begin (), end_ = macrodeque_.end (); 281 iter_ != end_; ++iter_) 282 { 283 const typename rules::string &name_ = iter_->first; 284 const typename rules::string ®ex_ = iter_->second; 285 detail::node *node_ = parser::parse (regex_.c_str (), 286 regex_.c_str () + regex_.size (), 0, 0, 0, flags_, 287 locale_, node_ptr_vector_, macromap_, token_map_, 288 seen_BOL_assertion_, seen_EOL_assertion_); 289 macro_iter_pair map_iter_ = macromap_. 290 insert (macro_pair (name_, static_cast<const detail::node *> 291 (0))); 292 293 map_iter_.first->second = node_; 294 } 295 } 296 build_dfa(detail::node * root_,const index_set_vector & set_mapping_,const std::size_t dfa_alphabet_,size_t_vector & dfa_)297 static void build_dfa (detail::node *root_, 298 const index_set_vector &set_mapping_, const std::size_t dfa_alphabet_, 299 size_t_vector &dfa_) 300 { 301 typename detail::node::node_vector *followpos_ = 302 &root_->firstpos (); 303 node_set_vector seen_sets_; 304 node_vector_vector seen_vectors_; 305 size_t_vector hash_vector_; 306 307 // 'jam' state 308 dfa_.resize (dfa_alphabet_, 0); 309 closure (followpos_, seen_sets_, seen_vectors_, 310 hash_vector_, dfa_alphabet_, dfa_); 311 312 std::size_t *ptr_ = 0; 313 314 for (std::size_t index_ = 0; index_ < seen_vectors_->size (); ++index_) 315 { 316 equivset_list equiv_list_; 317 318 build_equiv_list (seen_vectors_[index_], set_mapping_, equiv_list_); 319 320 for (typename equivset_list::list::const_iterator iter_ = 321 equiv_list_->begin (), end_ = equiv_list_->end (); 322 iter_ != end_; ++iter_) 323 { 324 equivset *equivset_ = *iter_; 325 const std::size_t transition_ = closure 326 (&equivset_->_followpos, seen_sets_, seen_vectors_, 327 hash_vector_, dfa_alphabet_, dfa_); 328 329 if (transition_ != npos) 330 { 331 ptr_ = &dfa_.front () + ((index_ + 1) * dfa_alphabet_); 332 333 // Prune abstemious transitions from end states. 334 if (*ptr_ && !equivset_->_greedy) continue; 335 336 for (typename detail::equivset::index_vector::const_iterator 337 equiv_iter_ = equivset_->_index_vector.begin (), 338 equiv_end_ = equivset_->_index_vector.end (); 339 equiv_iter_ != equiv_end_; ++equiv_iter_) 340 { 341 const std::size_t equiv_index_ = *equiv_iter_; 342 343 if (equiv_index_ == bol_token) 344 { 345 if (ptr_[eol_index] == 0) 346 { 347 ptr_[bol_index] = transition_; 348 } 349 } 350 else if (equiv_index_ == eol_token) 351 { 352 if (ptr_[bol_index] == 0) 353 { 354 ptr_[eol_index] = transition_; 355 } 356 } 357 else 358 { 359 ptr_[equiv_index_ + dfa_offset] = transition_; 360 } 361 } 362 } 363 } 364 } 365 } 366 closure(typename detail::node::node_vector * followpos_,node_set_vector & seen_sets_,node_vector_vector & seen_vectors_,size_t_vector & hash_vector_,const std::size_t size_,size_t_vector & dfa_)367 static std::size_t closure (typename detail::node::node_vector *followpos_, 368 node_set_vector &seen_sets_, node_vector_vector &seen_vectors_, 369 size_t_vector &hash_vector_, const std::size_t size_, 370 size_t_vector &dfa_) 371 { 372 bool end_state_ = false; 373 std::size_t id_ = 0; 374 std::size_t unique_id_ = npos; 375 std::size_t state_ = 0; 376 std::size_t hash_ = 0; 377 378 if (followpos_->empty ()) return npos; 379 380 std::size_t index_ = 0; 381 boost::movelib::unique_ptr<node_set> set_ptr_ (new node_set); 382 boost::movelib::unique_ptr<node_vector> vector_ptr_ (new node_vector); 383 384 for (typename detail::node::node_vector::const_iterator iter_ = 385 followpos_->begin (), end_ = followpos_->end (); 386 iter_ != end_; ++iter_) 387 { 388 closure_ex (*iter_, end_state_, id_, unique_id_, state_, 389 set_ptr_.get (), vector_ptr_.get (), hash_); 390 } 391 392 bool found_ = false; 393 typename size_t_vector::const_iterator hash_iter_ = 394 hash_vector_.begin (); 395 typename size_t_vector::const_iterator hash_end_ = 396 hash_vector_.end (); 397 typename node_set_vector::vector::const_iterator set_iter_ = 398 seen_sets_->begin (); 399 400 for (; hash_iter_ != hash_end_; ++hash_iter_, ++set_iter_) 401 { 402 found_ = *hash_iter_ == hash_ && *(*set_iter_) == *set_ptr_; 403 ++index_; 404 405 if (found_) break; 406 } 407 408 if (!found_) 409 { 410 seen_sets_->push_back (static_cast<node_set *>(0)); 411 seen_sets_->back () = set_ptr_.release (); 412 seen_vectors_->push_back (static_cast<node_vector *>(0)); 413 seen_vectors_->back () = vector_ptr_.release (); 414 hash_vector_.push_back (hash_); 415 // State 0 is the jam state... 416 index_ = seen_sets_->size (); 417 418 const std::size_t old_size_ = dfa_.size (); 419 420 dfa_.resize (old_size_ + size_, 0); 421 422 if (end_state_) 423 { 424 dfa_[old_size_] |= end_state; 425 dfa_[old_size_ + id_index] = id_; 426 dfa_[old_size_ + unique_id_index] = unique_id_; 427 dfa_[old_size_ + state_index] = state_; 428 } 429 } 430 431 return index_; 432 } 433 closure_ex(detail::node * node_,bool & end_state_,std::size_t & id_,std::size_t & unique_id_,std::size_t & state_,node_set * set_ptr_,node_vector * vector_ptr_,std::size_t & hash_)434 static void closure_ex (detail::node *node_, bool &end_state_, 435 std::size_t &id_, std::size_t &unique_id_, std::size_t &state_, 436 node_set *set_ptr_, node_vector *vector_ptr_, std::size_t &hash_) 437 { 438 const bool temp_end_state_ = node_->end_state (); 439 440 if (temp_end_state_) 441 { 442 if (!end_state_) 443 { 444 end_state_ = true; 445 id_ = node_->id (); 446 unique_id_ = node_->unique_id (); 447 state_ = node_->lexer_state (); 448 } 449 } 450 451 if (set_ptr_->insert (node_).second) 452 { 453 vector_ptr_->push_back (node_); 454 hash_ += reinterpret_cast<std::size_t> (node_); 455 } 456 } 457 partition_tokens(const token_map & map_,charset_list & lhs_)458 static void partition_tokens (const token_map &map_, 459 charset_list &lhs_) 460 { 461 charset_list rhs_; 462 463 fill_rhs_list (map_, rhs_); 464 465 if (!rhs_->empty ()) 466 { 467 typename charset_list::list::iterator iter_; 468 typename charset_list::list::iterator end_; 469 charset_ptr overlap_ (new charset); 470 471 lhs_->push_back (static_cast<charset *>(0)); 472 lhs_->back () = rhs_->front (); 473 rhs_->pop_front (); 474 475 while (!rhs_->empty ()) 476 { 477 charset_ptr r_ (rhs_->front ()); 478 479 rhs_->pop_front (); 480 iter_ = lhs_->begin (); 481 end_ = lhs_->end (); 482 483 while (!r_->empty () && iter_ != end_) 484 { 485 typename charset_list::list::iterator l_iter_ = iter_; 486 487 (*l_iter_)->intersect (*r_.get (), *overlap_.get ()); 488 489 if (overlap_->empty ()) 490 { 491 ++iter_; 492 } 493 else if ((*l_iter_)->empty ()) 494 { 495 delete *l_iter_; 496 *l_iter_ = overlap_.release (); 497 498 overlap_.reset (new charset); 499 ++iter_; 500 } 501 else if (r_->empty ()) 502 { 503 overlap_.swap (r_); 504 overlap_.reset (new charset); 505 break; 506 } 507 else 508 { 509 iter_ = lhs_->insert (++iter_, 510 static_cast<charset *>(0)); 511 *iter_ = overlap_.release (); 512 513 overlap_.reset(new charset); 514 ++iter_; 515 end_ = lhs_->end (); 516 } 517 } 518 519 if (!r_->empty ()) 520 { 521 lhs_->push_back (static_cast<charset *>(0)); 522 lhs_->back () = r_.release (); 523 } 524 } 525 } 526 } 527 fill_rhs_list(const token_map & map_,charset_list & list_)528 static void fill_rhs_list (const token_map &map_, 529 charset_list &list_) 530 { 531 typename parser::tokeniser::token_map::const_iterator iter_ = 532 map_.begin (); 533 typename parser::tokeniser::token_map::const_iterator end_ = 534 map_.end (); 535 536 for (; iter_ != end_; ++iter_) 537 { 538 list_->push_back (static_cast<charset *>(0)); 539 list_->back () = new charset (iter_->first, iter_->second); 540 } 541 } 542 fill_lookup(const string_token & token_,size_t_vector * lookup_,const std::size_t index_)543 static void fill_lookup (const string_token &token_, 544 size_t_vector *lookup_, const std::size_t index_) 545 { 546 const CharT *curr_ = token_._charset.c_str (); 547 const CharT *chars_end_ = curr_ + token_._charset.size (); 548 std::size_t *ptr_ = &lookup_->front (); 549 const std::size_t max_ = sizeof (CharT) == 1 ? 550 num_chars : num_wchar_ts; 551 552 if (token_._negated) 553 { 554 // $$$ FIXME JDG July 2014 $$$ 555 // this code is problematic on platforms where wchar_t is signed 556 // with min generating negative numbers. This crashes with BAD_ACCESS 557 // because of the vector index below: 558 // ptr_[static_cast<typename Traits::index_type>(curr_char_)] 559 CharT curr_char_ = 0; // (std::numeric_limits<CharT>::min)(); 560 std::size_t i_ = 0; 561 562 while (curr_ < chars_end_) 563 { 564 while (*curr_ > curr_char_) 565 { 566 ptr_[static_cast<typename Traits::index_type> 567 (curr_char_)] = index_ + dfa_offset; 568 ++curr_char_; 569 ++i_; 570 } 571 572 ++curr_char_; 573 ++curr_; 574 ++i_; 575 } 576 577 for (; i_ < max_; ++i_) 578 { 579 ptr_[static_cast<typename Traits::index_type>(curr_char_)] = 580 index_ + dfa_offset; 581 ++curr_char_; 582 } 583 } 584 else 585 { 586 while (curr_ < chars_end_) 587 { 588 ptr_[static_cast<typename Traits::index_type>(*curr_)] = 589 index_ + dfa_offset; 590 ++curr_; 591 } 592 } 593 } 594 build_equiv_list(const node_vector * vector_,const index_set_vector & set_mapping_,equivset_list & lhs_)595 static void build_equiv_list (const node_vector *vector_, 596 const index_set_vector &set_mapping_, equivset_list &lhs_) 597 { 598 equivset_list rhs_; 599 600 fill_rhs_list (vector_, set_mapping_, rhs_); 601 602 if (!rhs_->empty ()) 603 { 604 typename equivset_list::list::iterator iter_; 605 typename equivset_list::list::iterator end_; 606 equivset_ptr overlap_ (new equivset); 607 608 lhs_->push_back (static_cast<equivset *>(0)); 609 lhs_->back () = rhs_->front (); 610 rhs_->pop_front (); 611 612 while (!rhs_->empty ()) 613 { 614 equivset_ptr r_ (rhs_->front ()); 615 616 rhs_->pop_front (); 617 iter_ = lhs_->begin (); 618 end_ = lhs_->end (); 619 620 while (!r_->empty () && iter_ != end_) 621 { 622 typename equivset_list::list::iterator l_iter_ = iter_; 623 624 (*l_iter_)->intersect (*r_.get (), *overlap_.get ()); 625 626 if (overlap_->empty ()) 627 { 628 ++iter_; 629 } 630 else if ((*l_iter_)->empty ()) 631 { 632 delete *l_iter_; 633 *l_iter_ = overlap_.release (); 634 635 overlap_.reset (new equivset); 636 ++iter_; 637 } 638 else if (r_->empty ()) 639 { 640 overlap_.swap (r_); 641 overlap_.reset (new equivset); 642 break; 643 } 644 else 645 { 646 iter_ = lhs_->insert (++iter_, 647 static_cast<equivset *>(0)); 648 *iter_ = overlap_.release (); 649 650 overlap_.reset (new equivset); 651 ++iter_; 652 end_ = lhs_->end (); 653 } 654 } 655 656 if (!r_->empty ()) 657 { 658 lhs_->push_back (static_cast<equivset *>(0)); 659 lhs_->back () = r_.release (); 660 } 661 } 662 } 663 } 664 fill_rhs_list(const node_vector * vector_,const index_set_vector & set_mapping_,equivset_list & list_)665 static void fill_rhs_list (const node_vector *vector_, 666 const index_set_vector &set_mapping_, equivset_list &list_) 667 { 668 typename node_vector::const_iterator iter_ = 669 vector_->begin (); 670 typename node_vector::const_iterator end_ = 671 vector_->end (); 672 673 for (; iter_ != end_; ++iter_) 674 { 675 const detail::node *node_ = *iter_; 676 677 if (!node_->end_state ()) 678 { 679 const std::size_t token_ = node_->token (); 680 681 if (token_ != null_token) 682 { 683 list_->push_back (static_cast<equivset *>(0)); 684 685 if (token_ == bol_token || token_ == eol_token) 686 { 687 std::set<std::size_t> index_set_; 688 689 index_set_.insert (token_); 690 list_->back () = new equivset (index_set_, 691 node_->greedy (), token_, node_->followpos ()); 692 } 693 else 694 { 695 list_->back () = new equivset (set_mapping_[token_], 696 node_->greedy (), token_, node_->followpos ()); 697 } 698 } 699 } 700 } 701 } 702 fixup_bol(detail::node * & root_,node_ptr_vector & node_ptr_vector_)703 static void fixup_bol (detail::node * &root_, 704 node_ptr_vector &node_ptr_vector_) 705 { 706 typename detail::node::node_vector *first_ = &root_->firstpos (); 707 bool found_ = false; 708 typename detail::node::node_vector::const_iterator iter_ = 709 first_->begin (); 710 typename detail::node::node_vector::const_iterator end_ = 711 first_->end (); 712 713 for (; iter_ != end_; ++iter_) 714 { 715 const detail::node *node_ = *iter_; 716 717 found_ = !node_->end_state () && node_->token () == bol_token; 718 719 if (found_) break; 720 } 721 722 if (!found_) 723 { 724 node_ptr_vector_->push_back (static_cast<detail::leaf_node *>(0)); 725 node_ptr_vector_->back () = new detail::leaf_node 726 (bol_token, true); 727 728 detail::node *lhs_ = node_ptr_vector_->back (); 729 730 node_ptr_vector_->push_back (static_cast<detail::leaf_node *>(0)); 731 node_ptr_vector_->back () = new detail::leaf_node 732 (null_token, true); 733 734 detail::node *rhs_ = node_ptr_vector_->back (); 735 736 node_ptr_vector_->push_back 737 (static_cast<detail::selection_node *>(0)); 738 node_ptr_vector_->back () = 739 new detail::selection_node (lhs_, rhs_); 740 lhs_ = node_ptr_vector_->back (); 741 742 node_ptr_vector_->push_back 743 (static_cast<detail::sequence_node *>(0)); 744 node_ptr_vector_->back () = 745 new detail::sequence_node (lhs_, root_); 746 root_ = node_ptr_vector_->back (); 747 } 748 } 749 minimise_dfa(const std::size_t dfa_alphabet_,size_t_vector & dfa_,std::size_t size_)750 static void minimise_dfa (const std::size_t dfa_alphabet_, 751 size_t_vector &dfa_, std::size_t size_) 752 { 753 const std::size_t *first_ = &dfa_.front (); 754 const std::size_t *second_ = 0; 755 const std::size_t *end_ = first_ + size_; 756 std::size_t index_ = 1; 757 std::size_t new_index_ = 1; 758 std::size_t curr_index_ = 0; 759 index_set index_set_; 760 size_t_vector lookup_; 761 std::size_t *lookup_ptr_ = 0; 762 763 lookup_.resize (size_ / dfa_alphabet_, null_token); 764 lookup_ptr_ = &lookup_.front (); 765 *lookup_ptr_ = 0; 766 // Only one 'jam' state, so skip it. 767 first_ += dfa_alphabet_; 768 769 for (; first_ < end_; first_ += dfa_alphabet_, ++index_) 770 { 771 for (second_ = first_ + dfa_alphabet_, curr_index_ = index_ + 1; 772 second_ < end_; second_ += dfa_alphabet_, ++curr_index_) 773 { 774 if (index_set_.find (curr_index_) != index_set_.end ()) 775 { 776 continue; 777 } 778 779 // Some systems have memcmp in namespace std. 780 using namespace std; 781 782 if (memcmp (first_, second_, sizeof (std::size_t) * 783 dfa_alphabet_) == 0) 784 { 785 index_set_.insert (curr_index_); 786 lookup_ptr_[curr_index_] = new_index_; 787 } 788 } 789 790 if (lookup_ptr_[index_] == null_token) 791 { 792 lookup_ptr_[index_] = new_index_; 793 ++new_index_; 794 } 795 } 796 797 if (!index_set_.empty ()) 798 { 799 const std::size_t *front_ = &dfa_.front (); 800 size_t_vector new_dfa_ (front_, front_ + dfa_alphabet_); 801 typename index_set::iterator set_end_ = 802 index_set_.end (); 803 const std::size_t *ptr_ = front_ + dfa_alphabet_; 804 std::size_t *new_ptr_ = 0; 805 806 new_dfa_.resize (size_ - index_set_.size () * dfa_alphabet_, 0); 807 new_ptr_ = &new_dfa_.front () + dfa_alphabet_; 808 size_ /= dfa_alphabet_; 809 810 for (index_ = 1; index_ < size_; ++index_) 811 { 812 if (index_set_.find (index_) != set_end_) 813 { 814 ptr_ += dfa_alphabet_; 815 continue; 816 } 817 818 new_ptr_[end_state_index] = ptr_[end_state_index]; 819 new_ptr_[id_index] = ptr_[id_index]; 820 new_ptr_[unique_id_index] = ptr_[unique_id_index]; 821 new_ptr_[state_index] = ptr_[state_index]; 822 new_ptr_[bol_index] = lookup_ptr_[ptr_[bol_index]]; 823 new_ptr_[eol_index] = lookup_ptr_[ptr_[eol_index]]; 824 new_ptr_ += dfa_offset; 825 ptr_ += dfa_offset; 826 827 for (std::size_t i_ = dfa_offset; i_ < dfa_alphabet_; ++i_) 828 { 829 *new_ptr_++ = lookup_ptr_[*ptr_++]; 830 } 831 } 832 833 dfa_.swap (new_dfa_); 834 } 835 } 836 }; 837 838 typedef basic_generator<char> generator; 839 typedef basic_generator<wchar_t> wgenerator; 840 } 841 } 842 843 #endif 844