1 // generator.hpp
2 // Copyright (c) 2007-2009 Ben Hanson (http://www.benhanson.net/)
3 //
4 // Distributed under the Boost Software License, Version 1.0. (See accompanying
5 // file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
6 #ifndef BOOST_LEXER_GENERATOR_HPP
7 #define BOOST_LEXER_GENERATOR_HPP
8 
9 #include "char_traits.hpp"
10 // memcmp()
11 #include <cstring>
12 #include "partition/charset.hpp"
13 #include "partition/equivset.hpp"
14 #include <memory>
15 #include <limits>
16 #include "parser/tree/node.hpp"
17 #include "parser/parser.hpp"
18 #include "containers/ptr_list.hpp"
19 #include "rules.hpp"
20 #include "state_machine.hpp"
21 
22 namespace boost
23 {
24 namespace lexer
25 {
26 template<typename CharT, typename Traits = char_traits<CharT> >
27 class basic_generator
28 {
29 public:
30     typedef typename detail::internals::size_t_vector size_t_vector;
31     typedef basic_rules<CharT> rules;
32 
build(const rules & rules_,basic_state_machine<CharT> & state_machine_)33     static void build (const rules &rules_,
34         basic_state_machine<CharT> &state_machine_)
35     {
36         std::size_t index_ = 0;
37         std::size_t size_ = rules_.statemap ().size ();
38         node_ptr_vector node_ptr_vector_;
39         detail::internals &internals_ = const_cast<detail::internals &>
40             (state_machine_.data ());
41         bool seen_BOL_assertion_ = false;
42         bool seen_EOL_assertion_ = false;
43 
44         state_machine_.clear ();
45 
46         for (; index_ < size_; ++index_)
47         {
48             internals_._lookup->push_back (static_cast<size_t_vector *>(0));
49             internals_._lookup->back () = new size_t_vector;
50             internals_._dfa_alphabet.push_back (0);
51             internals_._dfa->push_back (static_cast<size_t_vector *>(0));
52             internals_._dfa->back () = new size_t_vector;
53         }
54 
55         for (index_ = 0, size_ = internals_._lookup->size ();
56             index_ < size_; ++index_)
57         {
58             internals_._lookup[index_]->resize (sizeof (CharT) == 1 ?
59                 num_chars : num_wchar_ts, dead_state_index);
60 
61             if (!rules_.regexes ()[index_].empty ())
62             {
63                 // vector mapping token indexes to partitioned token index sets
64                 index_set_vector set_mapping_;
65                 // syntax tree
66                 detail::node *root_ = build_tree (rules_, index_,
67                     node_ptr_vector_, internals_, set_mapping_);
68 
69                 build_dfa (root_, set_mapping_,
70                     internals_._dfa_alphabet[index_],
71                     *internals_._dfa[index_]);
72 
73                 if (internals_._seen_BOL_assertion)
74                 {
75                     seen_BOL_assertion_ = true;
76                 }
77 
78                 if (internals_._seen_EOL_assertion)
79                 {
80                     seen_EOL_assertion_ = true;
81                 }
82 
83                 internals_._seen_BOL_assertion = false;
84                 internals_._seen_EOL_assertion = false;
85             }
86         }
87 
88         internals_._seen_BOL_assertion = seen_BOL_assertion_;
89         internals_._seen_EOL_assertion = seen_EOL_assertion_;
90     }
91 
minimise(basic_state_machine<CharT> & state_machine_)92     static void minimise (basic_state_machine<CharT> &state_machine_)
93     {
94         detail::internals &internals_ = const_cast<detail::internals &>
95             (state_machine_.data ());
96         const std::size_t machines_ = internals_._dfa->size ();
97 
98         for (std::size_t i_ = 0; i_ < machines_; ++i_)
99         {
100             const std::size_t dfa_alphabet_ = internals_._dfa_alphabet[i_];
101             size_t_vector *dfa_ = internals_._dfa[i_];
102 
103             if (dfa_alphabet_ != 0)
104             {
105                 std::size_t size_ = 0;
106 
107                 do
108                 {
109                     size_ = dfa_->size ();
110                     minimise_dfa (dfa_alphabet_, *dfa_, size_);
111                 } while (dfa_->size () != size_);
112             }
113         }
114     }
115 
116 protected:
117     typedef detail::basic_charset<CharT> charset;
118     typedef detail::ptr_list<charset> charset_list;
119     typedef std::auto_ptr<charset> charset_ptr;
120     typedef detail::equivset equivset;
121     typedef detail::ptr_list<equivset> equivset_list;
122     typedef std::auto_ptr<equivset> equivset_ptr;
123     typedef typename charset::index_set index_set;
124     typedef std::vector<index_set> index_set_vector;
125     typedef detail::basic_parser<CharT> parser;
126     typedef typename parser::node_ptr_vector node_ptr_vector;
127     typedef std::set<const detail::node *> node_set;
128     typedef detail::ptr_vector<node_set> node_set_vector;
129     typedef std::vector<const detail::node *> node_vector;
130     typedef detail::ptr_vector<node_vector> node_vector_vector;
131     typedef typename parser::string string;
132     typedef std::pair<string, string> string_pair;
133     typedef typename parser::tokeniser::string_token string_token;
134     typedef std::deque<string_pair> macro_deque;
135     typedef std::pair<string, const detail::node *> macro_pair;
136     typedef typename parser::macro_map::iterator macro_iter;
137     typedef std::pair<macro_iter, bool> macro_iter_pair;
138     typedef typename parser::tokeniser::token_map token_map;
139 
build_tree(const rules & rules_,const std::size_t state_,node_ptr_vector & node_ptr_vector_,detail::internals & internals_,index_set_vector & set_mapping_)140     static detail::node *build_tree (const rules &rules_,
141         const std::size_t state_, node_ptr_vector &node_ptr_vector_,
142         detail::internals &internals_, index_set_vector &set_mapping_)
143     {
144         size_t_vector *lookup_ = internals_._lookup[state_];
145         const typename rules::string_deque_deque &regexes_ =
146             rules_.regexes ();
147         const typename rules::id_vector_deque &ids_ = rules_.ids ();
148         const typename rules::id_vector_deque &unique_ids_ =
149             rules_.unique_ids ();
150         const typename rules::id_vector_deque &states_ = rules_.states ();
151         typename rules::string_deque::const_iterator regex_iter_ =
152             regexes_[state_].begin ();
153         typename rules::string_deque::const_iterator regex_iter_end_ =
154             regexes_[state_].end ();
155         typename rules::id_vector::const_iterator ids_iter_ =
156             ids_[state_].begin ();
157         typename rules::id_vector::const_iterator unique_ids_iter_ =
158             unique_ids_[state_].begin ();
159         typename rules::id_vector::const_iterator states_iter_ =
160             states_[state_].begin ();
161         const typename rules::string &regex_ = *regex_iter_;
162         // map of regex charset tokens (strings) to index
163         token_map token_map_;
164         const typename rules::string_pair_deque &macrodeque_ =
165             rules_.macrodeque ();
166         typename parser::macro_map macromap_;
167         typename detail::node::node_vector tree_vector_;
168 
169         build_macros (token_map_, macrodeque_, macromap_,
170             rules_.flags (), rules_.locale (), node_ptr_vector_,
171             internals_._seen_BOL_assertion, internals_._seen_EOL_assertion);
172 
173         detail::node *root_ = parser::parse (regex_.c_str (),
174             regex_.c_str () + regex_.size (), *ids_iter_, *unique_ids_iter_,
175             *states_iter_, rules_.flags (), rules_.locale (), node_ptr_vector_,
176             macromap_, token_map_, internals_._seen_BOL_assertion,
177             internals_._seen_EOL_assertion);
178 
179         ++regex_iter_;
180         ++ids_iter_;
181         ++unique_ids_iter_;
182         ++states_iter_;
183         tree_vector_.push_back (root_);
184 
185         // build syntax trees
186         while (regex_iter_ != regex_iter_end_)
187         {
188             // re-declare var, otherwise we perform an assignment..!
189             const typename rules::string &regex2_ = *regex_iter_;
190 
191             root_ = parser::parse (regex2_.c_str (),
192                 regex2_.c_str () + regex2_.size (), *ids_iter_,
193                 *unique_ids_iter_, *states_iter_, rules_.flags (),
194                 rules_.locale (), node_ptr_vector_, macromap_, token_map_,
195                 internals_._seen_BOL_assertion,
196                 internals_._seen_EOL_assertion);
197             tree_vector_.push_back (root_);
198             ++regex_iter_;
199             ++ids_iter_;
200             ++unique_ids_iter_;
201             ++states_iter_;
202         }
203 
204         if (internals_._seen_BOL_assertion)
205         {
206             // Fixup BOLs
207             typename detail::node::node_vector::iterator iter_ =
208                 tree_vector_.begin ();
209             typename detail::node::node_vector::iterator end_ =
210                 tree_vector_.end ();
211 
212             for (; iter_ != end_; ++iter_)
213             {
214                 fixup_bol (*iter_, node_ptr_vector_);
215             }
216         }
217 
218         // join trees
219         {
220             typename detail::node::node_vector::iterator iter_ =
221                 tree_vector_.begin ();
222             typename detail::node::node_vector::iterator end_ =
223                 tree_vector_.end ();
224 
225             if (iter_ != end_)
226             {
227                 root_ = *iter_;
228                 ++iter_;
229             }
230 
231             for (; iter_ != end_; ++iter_)
232             {
233                 node_ptr_vector_->push_back (static_cast<detail::selection_node *>(0));
234                 node_ptr_vector_->back () = new detail::selection_node
235                     (root_, *iter_);
236                 root_ = node_ptr_vector_->back ();
237             }
238         }
239 
240         // partitioned token list
241         charset_list token_list_;
242 
243         set_mapping_.resize (token_map_.size ());
244         partition_tokens (token_map_, token_list_);
245 
246         typename charset_list::list::const_iterator iter_ =
247             token_list_->begin ();
248         typename charset_list::list::const_iterator end_ =
249             token_list_->end ();
250         std::size_t index_ = 0;
251 
252         for (; iter_ != end_; ++iter_, ++index_)
253         {
254             const charset *cs_ = *iter_;
255             typename charset::index_set::const_iterator set_iter_ =
256                 cs_->_index_set.begin ();
257             typename charset::index_set::const_iterator set_end_ =
258                 cs_->_index_set.end ();
259 
260             fill_lookup (cs_->_token, lookup_, index_);
261 
262             for (; set_iter_ != set_end_; ++set_iter_)
263             {
264                 set_mapping_[*set_iter_].insert (index_);
265             }
266         }
267 
268         internals_._dfa_alphabet[state_] = token_list_->size () + dfa_offset;
269         return root_;
270     }
271 
build_macros(token_map & token_map_,const macro_deque & macrodeque_,typename parser::macro_map & macromap_,const regex_flags flags_,const std::locale & locale_,node_ptr_vector & node_ptr_vector_,bool & seen_BOL_assertion_,bool & seen_EOL_assertion_)272     static void build_macros (token_map &token_map_,
273         const macro_deque &macrodeque_,
274         typename parser::macro_map &macromap_, const regex_flags flags_,
275         const std::locale &locale_, node_ptr_vector &node_ptr_vector_,
276         bool &seen_BOL_assertion_, bool &seen_EOL_assertion_)
277     {
278         for (typename macro_deque::const_iterator iter_ =
279             macrodeque_.begin (), end_ = macrodeque_.end ();
280             iter_ != end_; ++iter_)
281         {
282             const typename rules::string &name_ = iter_->first;
283             const typename rules::string &regex_ = iter_->second;
284             detail::node *node_ = parser::parse (regex_.c_str (),
285                 regex_.c_str () + regex_.size (), 0, 0, 0, flags_,
286                 locale_, node_ptr_vector_, macromap_, token_map_,
287                 seen_BOL_assertion_, seen_EOL_assertion_);
288             macro_iter_pair map_iter_ = macromap_.
289                 insert (macro_pair (name_, static_cast<const detail::node *>
290                 (0)));
291 
292             map_iter_.first->second = node_;
293         }
294     }
295 
build_dfa(detail::node * root_,const index_set_vector & set_mapping_,const std::size_t dfa_alphabet_,size_t_vector & dfa_)296     static void build_dfa (detail::node *root_,
297         const index_set_vector &set_mapping_, const std::size_t dfa_alphabet_,
298         size_t_vector &dfa_)
299     {
300         typename detail::node::node_vector *followpos_ =
301             &root_->firstpos ();
302         node_set_vector seen_sets_;
303         node_vector_vector seen_vectors_;
304         size_t_vector hash_vector_;
305 
306         // 'jam' state
307         dfa_.resize (dfa_alphabet_, 0);
308         closure (followpos_, seen_sets_, seen_vectors_,
309             hash_vector_, dfa_alphabet_, dfa_);
310 
311         std::size_t *ptr_ = 0;
312 
313         for (std::size_t index_ = 0; index_ < seen_vectors_->size (); ++index_)
314         {
315             equivset_list equiv_list_;
316 
317             build_equiv_list (seen_vectors_[index_], set_mapping_, equiv_list_);
318 
319             for (typename equivset_list::list::const_iterator iter_ =
320                 equiv_list_->begin (), end_ = equiv_list_->end ();
321                 iter_ != end_; ++iter_)
322             {
323                 equivset *equivset_ = *iter_;
324                 const std::size_t transition_ = closure
325                     (&equivset_->_followpos, seen_sets_, seen_vectors_,
326                     hash_vector_, dfa_alphabet_, dfa_);
327 
328                 if (transition_ != npos)
329                 {
330                     ptr_ = &dfa_.front () + ((index_ + 1) * dfa_alphabet_);
331 
332                     // Prune abstemious transitions from end states.
333                     if (*ptr_ && !equivset_->_greedy) continue;
334 
335                     for (typename detail::equivset::index_vector::const_iterator
336                         equiv_iter_ = equivset_->_index_vector.begin (),
337                         equiv_end_ = equivset_->_index_vector.end ();
338                         equiv_iter_ != equiv_end_; ++equiv_iter_)
339                     {
340                         const std::size_t equiv_index_ = *equiv_iter_;
341 
342                         if (equiv_index_ == bol_token)
343                         {
344                             if (ptr_[eol_index] == 0)
345                             {
346                                 ptr_[bol_index] = transition_;
347                             }
348                         }
349                         else if (equiv_index_ == eol_token)
350                         {
351                             if (ptr_[bol_index] == 0)
352                             {
353                                 ptr_[eol_index] = transition_;
354                             }
355                         }
356                         else
357                         {
358                             ptr_[equiv_index_ + dfa_offset] = transition_;
359                         }
360                     }
361                 }
362             }
363         }
364     }
365 
closure(typename detail::node::node_vector * followpos_,node_set_vector & seen_sets_,node_vector_vector & seen_vectors_,size_t_vector & hash_vector_,const std::size_t size_,size_t_vector & dfa_)366     static std::size_t closure (typename detail::node::node_vector *followpos_,
367         node_set_vector &seen_sets_, node_vector_vector &seen_vectors_,
368         size_t_vector &hash_vector_, const std::size_t size_,
369         size_t_vector &dfa_)
370     {
371         bool end_state_ = false;
372         std::size_t id_ = 0;
373         std::size_t unique_id_ = npos;
374         std::size_t state_ = 0;
375         std::size_t hash_ = 0;
376 
377         if (followpos_->empty ()) return npos;
378 
379         std::size_t index_ = 0;
380         std::auto_ptr<node_set> set_ptr_ (new node_set);
381         std::auto_ptr<node_vector> vector_ptr_ (new node_vector);
382 
383         for (typename detail::node::node_vector::const_iterator iter_ =
384             followpos_->begin (), end_ = followpos_->end ();
385             iter_ != end_; ++iter_)
386         {
387             closure_ex (*iter_, end_state_, id_, unique_id_, state_,
388                 set_ptr_.get (), vector_ptr_.get (), hash_);
389         }
390 
391         bool found_ = false;
392         typename size_t_vector::const_iterator hash_iter_ =
393             hash_vector_.begin ();
394         typename size_t_vector::const_iterator hash_end_ =
395             hash_vector_.end ();
396         typename node_set_vector::vector::const_iterator set_iter_ =
397             seen_sets_->begin ();
398 
399         for (; hash_iter_ != hash_end_; ++hash_iter_, ++set_iter_)
400         {
401             found_ = *hash_iter_ == hash_ && *(*set_iter_) == *set_ptr_;
402             ++index_;
403 
404             if (found_) break;
405         }
406 
407         if (!found_)
408         {
409             seen_sets_->push_back (static_cast<node_set *>(0));
410             seen_sets_->back () = set_ptr_.release ();
411             seen_vectors_->push_back (static_cast<node_vector *>(0));
412             seen_vectors_->back () = vector_ptr_.release ();
413             hash_vector_.push_back (hash_);
414             // State 0 is the jam state...
415             index_ = seen_sets_->size ();
416 
417             const std::size_t old_size_ = dfa_.size ();
418 
419             dfa_.resize (old_size_ + size_, 0);
420 
421             if (end_state_)
422             {
423                 dfa_[old_size_] |= end_state;
424                 dfa_[old_size_ + id_index] = id_;
425                 dfa_[old_size_ + unique_id_index] = unique_id_;
426                 dfa_[old_size_ + state_index] = state_;
427             }
428         }
429 
430         return index_;
431     }
432 
closure_ex(detail::node * node_,bool & end_state_,std::size_t & id_,std::size_t & unique_id_,std::size_t & state_,node_set * set_ptr_,node_vector * vector_ptr_,std::size_t & hash_)433     static void closure_ex (detail::node *node_, bool &end_state_,
434         std::size_t &id_, std::size_t &unique_id_, std::size_t &state_,
435         node_set *set_ptr_, node_vector *vector_ptr_, std::size_t &hash_)
436     {
437         const bool temp_end_state_ = node_->end_state ();
438 
439         if (temp_end_state_)
440         {
441             if (!end_state_)
442             {
443                 end_state_ = true;
444                 id_ = node_->id ();
445                 unique_id_ = node_->unique_id ();
446                 state_ = node_->lexer_state ();
447             }
448         }
449 
450         if (set_ptr_->insert (node_).second)
451         {
452             vector_ptr_->push_back (node_);
453             hash_ += reinterpret_cast<std::size_t> (node_);
454         }
455     }
456 
partition_tokens(const token_map & map_,charset_list & lhs_)457     static void partition_tokens (const token_map &map_,
458         charset_list &lhs_)
459     {
460         charset_list rhs_;
461 
462         fill_rhs_list (map_, rhs_);
463 
464         if (!rhs_->empty ())
465         {
466             typename charset_list::list::iterator iter_;
467             typename charset_list::list::iterator end_;
468             charset_ptr overlap_ (new charset);
469 
470             lhs_->push_back (static_cast<charset *>(0));
471             lhs_->back () = rhs_->front ();
472             rhs_->pop_front ();
473 
474             while (!rhs_->empty ())
475             {
476                 charset_ptr r_ (rhs_->front ());
477 
478                 rhs_->pop_front ();
479                 iter_ = lhs_->begin ();
480                 end_ = lhs_->end ();
481 
482                 while (!r_->empty () && iter_ != end_)
483                 {
484                     typename charset_list::list::iterator l_iter_ = iter_;
485 
486                     (*l_iter_)->intersect (*r_.get (), *overlap_.get ());
487 
488                     if (overlap_->empty ())
489                     {
490                         ++iter_;
491                     }
492                     else if ((*l_iter_)->empty ())
493                     {
494                         delete *l_iter_;
495                         *l_iter_ = overlap_.release ();
496 
497                         // VC++ 6 Hack:
498                         charset_ptr temp_overlap_ (new charset);
499 
500                         overlap_ = temp_overlap_;
501                         ++iter_;
502                     }
503                     else if (r_->empty ())
504                     {
505                         delete r_.release ();
506                         r_ = overlap_;
507 
508                         // VC++ 6 Hack:
509                         charset_ptr temp_overlap_ (new charset);
510 
511                         overlap_ = temp_overlap_;
512                         break;
513                     }
514                     else
515                     {
516                         iter_ = lhs_->insert (++iter_,
517                             static_cast<charset *>(0));
518                         *iter_ = overlap_.release ();
519 
520                         // VC++ 6 Hack:
521                         charset_ptr temp_overlap_ (new charset);
522 
523                         overlap_ = temp_overlap_;
524                         ++iter_;
525                         end_ = lhs_->end ();
526                     }
527                 }
528 
529                 if (!r_->empty ())
530                 {
531                     lhs_->push_back (static_cast<charset *>(0));
532                     lhs_->back () = r_.release ();
533                 }
534             }
535         }
536     }
537 
fill_rhs_list(const token_map & map_,charset_list & list_)538     static void fill_rhs_list (const token_map &map_,
539         charset_list &list_)
540     {
541         typename parser::tokeniser::token_map::const_iterator iter_ =
542             map_.begin ();
543         typename parser::tokeniser::token_map::const_iterator end_ =
544             map_.end ();
545 
546         for (; iter_ != end_; ++iter_)
547         {
548             list_->push_back (static_cast<charset *>(0));
549             list_->back () = new charset (iter_->first, iter_->second);
550         }
551     }
552 
fill_lookup(const string_token & token_,size_t_vector * lookup_,const std::size_t index_)553     static void fill_lookup (const string_token &token_,
554         size_t_vector *lookup_, const std::size_t index_)
555     {
556         const CharT *curr_ = token_._charset.c_str ();
557         const CharT *chars_end_ = curr_ + token_._charset.size ();
558         std::size_t *ptr_ = &lookup_->front ();
559         const std::size_t max_ = sizeof (CharT) == 1 ?
560             num_chars : num_wchar_ts;
561 
562         if (token_._negated)
563         {
564             // $$$ FIXME JDG July 2014 $$$
565             // this code is problematic on platforms where wchar_t is signed
566             // with min generating negative numbers. This crashes with BAD_ACCESS
567             // because of the vector index below:
568             //  ptr_[static_cast<typename Traits::index_type>(curr_char_)]
569             CharT curr_char_ = 0; // (std::numeric_limits<CharT>::min)();
570             std::size_t i_ = 0;
571 
572             while (curr_ < chars_end_)
573             {
574                 while (*curr_ > curr_char_)
575                 {
576                     ptr_[static_cast<typename Traits::index_type>
577                         (curr_char_)] = index_ + dfa_offset;
578                     ++curr_char_;
579                     ++i_;
580                 }
581 
582                 ++curr_char_;
583                 ++curr_;
584                 ++i_;
585             }
586 
587             for (; i_ < max_; ++i_)
588             {
589                 ptr_[static_cast<typename Traits::index_type>(curr_char_)] =
590                     index_ + dfa_offset;
591                 ++curr_char_;
592             }
593         }
594         else
595         {
596             while (curr_ < chars_end_)
597             {
598                 ptr_[static_cast<typename Traits::index_type>(*curr_)] =
599                     index_ + dfa_offset;
600                 ++curr_;
601             }
602         }
603     }
604 
build_equiv_list(const node_vector * vector_,const index_set_vector & set_mapping_,equivset_list & lhs_)605     static void build_equiv_list (const node_vector *vector_,
606         const index_set_vector &set_mapping_, equivset_list &lhs_)
607     {
608         equivset_list rhs_;
609 
610         fill_rhs_list (vector_, set_mapping_, rhs_);
611 
612         if (!rhs_->empty ())
613         {
614             typename equivset_list::list::iterator iter_;
615             typename equivset_list::list::iterator end_;
616             equivset_ptr overlap_ (new equivset);
617 
618             lhs_->push_back (static_cast<equivset *>(0));
619             lhs_->back () = rhs_->front ();
620             rhs_->pop_front ();
621 
622             while (!rhs_->empty ())
623             {
624                 equivset_ptr r_ (rhs_->front ());
625 
626                 rhs_->pop_front ();
627                 iter_ = lhs_->begin ();
628                 end_ = lhs_->end ();
629 
630                 while (!r_->empty () && iter_ != end_)
631                 {
632                     typename equivset_list::list::iterator l_iter_ = iter_;
633 
634                     (*l_iter_)->intersect (*r_.get (), *overlap_.get ());
635 
636                     if (overlap_->empty ())
637                     {
638                         ++iter_;
639                     }
640                     else if ((*l_iter_)->empty ())
641                     {
642                         delete *l_iter_;
643                         *l_iter_ = overlap_.release ();
644 
645                         // VC++ 6 Hack:
646                         equivset_ptr temp_overlap_ (new equivset);
647 
648                         overlap_ = temp_overlap_;
649                         ++iter_;
650                     }
651                     else if (r_->empty ())
652                     {
653                         delete r_.release ();
654                         r_ = overlap_;
655 
656                         // VC++ 6 Hack:
657                         equivset_ptr temp_overlap_ (new equivset);
658 
659                         overlap_ = temp_overlap_;
660                         break;
661                     }
662                     else
663                     {
664                         iter_ = lhs_->insert (++iter_,
665                             static_cast<equivset *>(0));
666                         *iter_ = overlap_.release ();
667 
668                         // VC++ 6 Hack:
669                         equivset_ptr temp_overlap_ (new equivset);
670 
671                         overlap_ = temp_overlap_;
672                         ++iter_;
673                         end_ = lhs_->end ();
674                     }
675                 }
676 
677                 if (!r_->empty ())
678                 {
679                     lhs_->push_back (static_cast<equivset *>(0));
680                     lhs_->back () = r_.release ();
681                 }
682             }
683         }
684     }
685 
fill_rhs_list(const node_vector * vector_,const index_set_vector & set_mapping_,equivset_list & list_)686     static void fill_rhs_list (const node_vector *vector_,
687         const index_set_vector &set_mapping_, equivset_list &list_)
688     {
689         typename node_vector::const_iterator iter_ =
690             vector_->begin ();
691         typename node_vector::const_iterator end_ =
692             vector_->end ();
693 
694         for (; iter_ != end_; ++iter_)
695         {
696             const detail::node *node_ = *iter_;
697 
698             if (!node_->end_state ())
699             {
700                 const std::size_t token_ = node_->token ();
701 
702                 if (token_ != null_token)
703                 {
704                     list_->push_back (static_cast<equivset *>(0));
705 
706                     if (token_ == bol_token || token_ == eol_token)
707                     {
708                         std::set<std::size_t> index_set_;
709 
710                         index_set_.insert (token_);
711                         list_->back () = new equivset (index_set_,
712                             node_->greedy (), token_, node_->followpos ());
713                     }
714                     else
715                     {
716                         list_->back () = new equivset (set_mapping_[token_],
717                             node_->greedy (), token_, node_->followpos ());
718                     }
719                 }
720             }
721         }
722     }
723 
fixup_bol(detail::node * & root_,node_ptr_vector & node_ptr_vector_)724     static void fixup_bol (detail::node * &root_,
725         node_ptr_vector &node_ptr_vector_)
726     {
727         typename detail::node::node_vector *first_ = &root_->firstpos ();
728         bool found_ = false;
729         typename detail::node::node_vector::const_iterator iter_ =
730             first_->begin ();
731         typename detail::node::node_vector::const_iterator end_ =
732             first_->end ();
733 
734         for (; iter_ != end_; ++iter_)
735         {
736             const detail::node *node_ = *iter_;
737 
738             found_ = !node_->end_state () && node_->token () == bol_token;
739 
740             if (found_) break;
741         }
742 
743         if (!found_)
744         {
745             node_ptr_vector_->push_back (static_cast<detail::leaf_node *>(0));
746             node_ptr_vector_->back () = new detail::leaf_node
747                 (bol_token, true);
748 
749             detail::node *lhs_ = node_ptr_vector_->back ();
750 
751             node_ptr_vector_->push_back (static_cast<detail::leaf_node *>(0));
752             node_ptr_vector_->back () = new detail::leaf_node
753                 (null_token, true);
754 
755             detail::node *rhs_ = node_ptr_vector_->back ();
756 
757             node_ptr_vector_->push_back
758                 (static_cast<detail::selection_node *>(0));
759             node_ptr_vector_->back () =
760                 new detail::selection_node (lhs_, rhs_);
761             lhs_ = node_ptr_vector_->back ();
762 
763             node_ptr_vector_->push_back
764                 (static_cast<detail::sequence_node *>(0));
765             node_ptr_vector_->back () =
766                 new detail::sequence_node (lhs_, root_);
767             root_ = node_ptr_vector_->back ();
768         }
769     }
770 
minimise_dfa(const std::size_t dfa_alphabet_,size_t_vector & dfa_,std::size_t size_)771     static void minimise_dfa (const std::size_t dfa_alphabet_,
772         size_t_vector &dfa_, std::size_t size_)
773     {
774         const std::size_t *first_ = &dfa_.front ();
775         const std::size_t *second_ = 0;
776         const std::size_t *end_ = first_ + size_;
777         std::size_t index_ = 1;
778         std::size_t new_index_ = 1;
779         std::size_t curr_index_ = 0;
780         index_set index_set_;
781         size_t_vector lookup_;
782         std::size_t *lookup_ptr_ = 0;
783 
784         lookup_.resize (size_ / dfa_alphabet_, null_token);
785         lookup_ptr_ = &lookup_.front ();
786         *lookup_ptr_ = 0;
787         // Only one 'jam' state, so skip it.
788         first_ += dfa_alphabet_;
789 
790         for (; first_ < end_; first_ += dfa_alphabet_, ++index_)
791         {
792             for (second_ = first_ + dfa_alphabet_, curr_index_ = index_ + 1;
793                 second_ < end_; second_ += dfa_alphabet_, ++curr_index_)
794             {
795                 if (index_set_.find (curr_index_) != index_set_.end ())
796                 {
797                     continue;
798                 }
799 
800                 // Some systems have memcmp in namespace std.
801                 using namespace std;
802 
803                 if (memcmp (first_, second_, sizeof (std::size_t) *
804                     dfa_alphabet_) == 0)
805                 {
806                     index_set_.insert (curr_index_);
807                     lookup_ptr_[curr_index_] = new_index_;
808                 }
809             }
810 
811             if (lookup_ptr_[index_] == null_token)
812             {
813                 lookup_ptr_[index_] = new_index_;
814                 ++new_index_;
815             }
816         }
817 
818         if (!index_set_.empty ())
819         {
820             const std::size_t *front_ = &dfa_.front ();
821             size_t_vector new_dfa_ (front_, front_ + dfa_alphabet_);
822             typename index_set::iterator set_end_ =
823                 index_set_.end ();
824             const std::size_t *ptr_ = front_ + dfa_alphabet_;
825             std::size_t *new_ptr_ = 0;
826 
827             new_dfa_.resize (size_ - index_set_.size () * dfa_alphabet_, 0);
828             new_ptr_ = &new_dfa_.front () + dfa_alphabet_;
829             size_ /= dfa_alphabet_;
830 
831             for (index_ = 1; index_ < size_; ++index_)
832             {
833                 if (index_set_.find (index_) != set_end_)
834                 {
835                     ptr_ += dfa_alphabet_;
836                     continue;
837                 }
838 
839                 new_ptr_[end_state_index] = ptr_[end_state_index];
840                 new_ptr_[id_index] = ptr_[id_index];
841                 new_ptr_[unique_id_index] = ptr_[unique_id_index];
842                 new_ptr_[state_index] = ptr_[state_index];
843                 new_ptr_[bol_index] = lookup_ptr_[ptr_[bol_index]];
844                 new_ptr_[eol_index] = lookup_ptr_[ptr_[eol_index]];
845                 new_ptr_ += dfa_offset;
846                 ptr_ += dfa_offset;
847 
848                 for (std::size_t i_ = dfa_offset; i_ < dfa_alphabet_; ++i_)
849                 {
850                     *new_ptr_++ = lookup_ptr_[*ptr_++];
851                 }
852             }
853 
854             dfa_.swap (new_dfa_);
855         }
856     }
857 };
858 
859 typedef basic_generator<char> generator;
860 typedef basic_generator<wchar_t> wgenerator;
861 }
862 }
863 
864 #endif
865