1 // generator.hpp
2 // Copyright (c) 2007-2009 Ben Hanson (http://www.benhanson.net/)
3 //
4 // Distributed under the Boost Software License, Version 1.0. (See accompanying
5 // file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
6 #ifndef BOOST_SPIRIT_SUPPORT_DETAIL_LEXER_GENERATOR_HPP
7 #define BOOST_SPIRIT_SUPPORT_DETAIL_LEXER_GENERATOR_HPP
8 
9 #include "char_traits.hpp"
10 // memcmp()
11 #include <cstring>
12 #include "partition/charset.hpp"
13 #include "partition/equivset.hpp"
14 #include <memory>
15 #include <limits>
16 #include "parser/tree/node.hpp"
17 #include "parser/parser.hpp"
18 #include "containers/ptr_list.hpp"
19 #include <boost/move/unique_ptr.hpp>
20 #include "rules.hpp"
21 #include "state_machine.hpp"
22 
23 namespace boost
24 {
25 namespace lexer
26 {
27 template<typename CharT, typename Traits = char_traits<CharT> >
28 class basic_generator
29 {
30 public:
31     typedef typename detail::internals::size_t_vector size_t_vector;
32     typedef basic_rules<CharT> rules;
33 
build(const rules & rules_,basic_state_machine<CharT> & state_machine_)34     static void build (const rules &rules_,
35         basic_state_machine<CharT> &state_machine_)
36     {
37         std::size_t index_ = 0;
38         std::size_t size_ = rules_.statemap ().size ();
39         node_ptr_vector node_ptr_vector_;
40         detail::internals &internals_ = const_cast<detail::internals &>
41             (state_machine_.data ());
42         bool seen_BOL_assertion_ = false;
43         bool seen_EOL_assertion_ = false;
44 
45         state_machine_.clear ();
46 
47         for (; index_ < size_; ++index_)
48         {
49             internals_._lookup->push_back (static_cast<size_t_vector *>(0));
50             internals_._lookup->back () = new size_t_vector;
51             internals_._dfa_alphabet.push_back (0);
52             internals_._dfa->push_back (static_cast<size_t_vector *>(0));
53             internals_._dfa->back () = new size_t_vector;
54         }
55 
56         for (index_ = 0, size_ = internals_._lookup->size ();
57             index_ < size_; ++index_)
58         {
59             internals_._lookup[index_]->resize (sizeof (CharT) == 1 ?
60                 num_chars : num_wchar_ts, dead_state_index);
61 
62             if (!rules_.regexes ()[index_].empty ())
63             {
64                 // vector mapping token indexes to partitioned token index sets
65                 index_set_vector set_mapping_;
66                 // syntax tree
67                 detail::node *root_ = build_tree (rules_, index_,
68                     node_ptr_vector_, internals_, set_mapping_);
69 
70                 build_dfa (root_, set_mapping_,
71                     internals_._dfa_alphabet[index_],
72                     *internals_._dfa[index_]);
73 
74                 if (internals_._seen_BOL_assertion)
75                 {
76                     seen_BOL_assertion_ = true;
77                 }
78 
79                 if (internals_._seen_EOL_assertion)
80                 {
81                     seen_EOL_assertion_ = true;
82                 }
83 
84                 internals_._seen_BOL_assertion = false;
85                 internals_._seen_EOL_assertion = false;
86             }
87         }
88 
89         internals_._seen_BOL_assertion = seen_BOL_assertion_;
90         internals_._seen_EOL_assertion = seen_EOL_assertion_;
91     }
92 
minimise(basic_state_machine<CharT> & state_machine_)93     static void minimise (basic_state_machine<CharT> &state_machine_)
94     {
95         detail::internals &internals_ = const_cast<detail::internals &>
96             (state_machine_.data ());
97         const std::size_t machines_ = internals_._dfa->size ();
98 
99         for (std::size_t i_ = 0; i_ < machines_; ++i_)
100         {
101             const std::size_t dfa_alphabet_ = internals_._dfa_alphabet[i_];
102             size_t_vector *dfa_ = internals_._dfa[i_];
103 
104             if (dfa_alphabet_ != 0)
105             {
106                 std::size_t size_ = 0;
107 
108                 do
109                 {
110                     size_ = dfa_->size ();
111                     minimise_dfa (dfa_alphabet_, *dfa_, size_);
112                 } while (dfa_->size () != size_);
113             }
114         }
115     }
116 
117 protected:
118     typedef detail::basic_charset<CharT> charset;
119     typedef detail::ptr_list<charset> charset_list;
120     typedef boost::movelib::unique_ptr<charset> charset_ptr;
121     typedef detail::equivset equivset;
122     typedef detail::ptr_list<equivset> equivset_list;
123     typedef boost::movelib::unique_ptr<equivset> equivset_ptr;
124     typedef typename charset::index_set index_set;
125     typedef std::vector<index_set> index_set_vector;
126     typedef detail::basic_parser<CharT> parser;
127     typedef typename parser::node_ptr_vector node_ptr_vector;
128     typedef std::set<const detail::node *> node_set;
129     typedef detail::ptr_vector<node_set> node_set_vector;
130     typedef std::vector<const detail::node *> node_vector;
131     typedef detail::ptr_vector<node_vector> node_vector_vector;
132     typedef typename parser::string string;
133     typedef std::pair<string, string> string_pair;
134     typedef typename parser::tokeniser::string_token string_token;
135     typedef std::deque<string_pair> macro_deque;
136     typedef std::pair<string, const detail::node *> macro_pair;
137     typedef typename parser::macro_map::iterator macro_iter;
138     typedef std::pair<macro_iter, bool> macro_iter_pair;
139     typedef typename parser::tokeniser::token_map token_map;
140 
build_tree(const rules & rules_,const std::size_t state_,node_ptr_vector & node_ptr_vector_,detail::internals & internals_,index_set_vector & set_mapping_)141     static detail::node *build_tree (const rules &rules_,
142         const std::size_t state_, node_ptr_vector &node_ptr_vector_,
143         detail::internals &internals_, index_set_vector &set_mapping_)
144     {
145         size_t_vector *lookup_ = internals_._lookup[state_];
146         const typename rules::string_deque_deque &regexes_ =
147             rules_.regexes ();
148         const typename rules::id_vector_deque &ids_ = rules_.ids ();
149         const typename rules::id_vector_deque &unique_ids_ =
150             rules_.unique_ids ();
151         const typename rules::id_vector_deque &states_ = rules_.states ();
152         typename rules::string_deque::const_iterator regex_iter_ =
153             regexes_[state_].begin ();
154         typename rules::string_deque::const_iterator regex_iter_end_ =
155             regexes_[state_].end ();
156         typename rules::id_vector::const_iterator ids_iter_ =
157             ids_[state_].begin ();
158         typename rules::id_vector::const_iterator unique_ids_iter_ =
159             unique_ids_[state_].begin ();
160         typename rules::id_vector::const_iterator states_iter_ =
161             states_[state_].begin ();
162         const typename rules::string &regex_ = *regex_iter_;
163         // map of regex charset tokens (strings) to index
164         token_map token_map_;
165         const typename rules::string_pair_deque &macrodeque_ =
166             rules_.macrodeque ();
167         typename parser::macro_map macromap_;
168         typename detail::node::node_vector tree_vector_;
169 
170         build_macros (token_map_, macrodeque_, macromap_,
171             rules_.flags (), rules_.locale (), node_ptr_vector_,
172             internals_._seen_BOL_assertion, internals_._seen_EOL_assertion);
173 
174         detail::node *root_ = parser::parse (regex_.c_str (),
175             regex_.c_str () + regex_.size (), *ids_iter_, *unique_ids_iter_,
176             *states_iter_, rules_.flags (), rules_.locale (), node_ptr_vector_,
177             macromap_, token_map_, internals_._seen_BOL_assertion,
178             internals_._seen_EOL_assertion);
179 
180         ++regex_iter_;
181         ++ids_iter_;
182         ++unique_ids_iter_;
183         ++states_iter_;
184         tree_vector_.push_back (root_);
185 
186         // build syntax trees
187         while (regex_iter_ != regex_iter_end_)
188         {
189             // re-declare var, otherwise we perform an assignment..!
190             const typename rules::string &regex2_ = *regex_iter_;
191 
192             root_ = parser::parse (regex2_.c_str (),
193                 regex2_.c_str () + regex2_.size (), *ids_iter_,
194                 *unique_ids_iter_, *states_iter_, rules_.flags (),
195                 rules_.locale (), node_ptr_vector_, macromap_, token_map_,
196                 internals_._seen_BOL_assertion,
197                 internals_._seen_EOL_assertion);
198             tree_vector_.push_back (root_);
199             ++regex_iter_;
200             ++ids_iter_;
201             ++unique_ids_iter_;
202             ++states_iter_;
203         }
204 
205         if (internals_._seen_BOL_assertion)
206         {
207             // Fixup BOLs
208             typename detail::node::node_vector::iterator iter_ =
209                 tree_vector_.begin ();
210             typename detail::node::node_vector::iterator end_ =
211                 tree_vector_.end ();
212 
213             for (; iter_ != end_; ++iter_)
214             {
215                 fixup_bol (*iter_, node_ptr_vector_);
216             }
217         }
218 
219         // join trees
220         {
221             typename detail::node::node_vector::iterator iter_ =
222                 tree_vector_.begin ();
223             typename detail::node::node_vector::iterator end_ =
224                 tree_vector_.end ();
225 
226             if (iter_ != end_)
227             {
228                 root_ = *iter_;
229                 ++iter_;
230             }
231 
232             for (; iter_ != end_; ++iter_)
233             {
234                 node_ptr_vector_->push_back (static_cast<detail::selection_node *>(0));
235                 node_ptr_vector_->back () = new detail::selection_node
236                     (root_, *iter_);
237                 root_ = node_ptr_vector_->back ();
238             }
239         }
240 
241         // partitioned token list
242         charset_list token_list_;
243 
244         set_mapping_.resize (token_map_.size ());
245         partition_tokens (token_map_, token_list_);
246 
247         typename charset_list::list::const_iterator iter_ =
248             token_list_->begin ();
249         typename charset_list::list::const_iterator end_ =
250             token_list_->end ();
251         std::size_t index_ = 0;
252 
253         for (; iter_ != end_; ++iter_, ++index_)
254         {
255             const charset *cs_ = *iter_;
256             typename charset::index_set::const_iterator set_iter_ =
257                 cs_->_index_set.begin ();
258             typename charset::index_set::const_iterator set_end_ =
259                 cs_->_index_set.end ();
260 
261             fill_lookup (cs_->_token, lookup_, index_);
262 
263             for (; set_iter_ != set_end_; ++set_iter_)
264             {
265                 set_mapping_[*set_iter_].insert (index_);
266             }
267         }
268 
269         internals_._dfa_alphabet[state_] = token_list_->size () + dfa_offset;
270         return root_;
271     }
272 
build_macros(token_map & token_map_,const macro_deque & macrodeque_,typename parser::macro_map & macromap_,const regex_flags flags_,const std::locale & locale_,node_ptr_vector & node_ptr_vector_,bool & seen_BOL_assertion_,bool & seen_EOL_assertion_)273     static void build_macros (token_map &token_map_,
274         const macro_deque &macrodeque_,
275         typename parser::macro_map &macromap_, const regex_flags flags_,
276         const std::locale &locale_, node_ptr_vector &node_ptr_vector_,
277         bool &seen_BOL_assertion_, bool &seen_EOL_assertion_)
278     {
279         for (typename macro_deque::const_iterator iter_ =
280             macrodeque_.begin (), end_ = macrodeque_.end ();
281             iter_ != end_; ++iter_)
282         {
283             const typename rules::string &name_ = iter_->first;
284             const typename rules::string &regex_ = iter_->second;
285             detail::node *node_ = parser::parse (regex_.c_str (),
286                 regex_.c_str () + regex_.size (), 0, 0, 0, flags_,
287                 locale_, node_ptr_vector_, macromap_, token_map_,
288                 seen_BOL_assertion_, seen_EOL_assertion_);
289             macro_iter_pair map_iter_ = macromap_.
290                 insert (macro_pair (name_, static_cast<const detail::node *>
291                 (0)));
292 
293             map_iter_.first->second = node_;
294         }
295     }
296 
build_dfa(detail::node * root_,const index_set_vector & set_mapping_,const std::size_t dfa_alphabet_,size_t_vector & dfa_)297     static void build_dfa (detail::node *root_,
298         const index_set_vector &set_mapping_, const std::size_t dfa_alphabet_,
299         size_t_vector &dfa_)
300     {
301         typename detail::node::node_vector *followpos_ =
302             &root_->firstpos ();
303         node_set_vector seen_sets_;
304         node_vector_vector seen_vectors_;
305         size_t_vector hash_vector_;
306 
307         // 'jam' state
308         dfa_.resize (dfa_alphabet_, 0);
309         closure (followpos_, seen_sets_, seen_vectors_,
310             hash_vector_, dfa_alphabet_, dfa_);
311 
312         std::size_t *ptr_ = 0;
313 
314         for (std::size_t index_ = 0; index_ < seen_vectors_->size (); ++index_)
315         {
316             equivset_list equiv_list_;
317 
318             build_equiv_list (seen_vectors_[index_], set_mapping_, equiv_list_);
319 
320             for (typename equivset_list::list::const_iterator iter_ =
321                 equiv_list_->begin (), end_ = equiv_list_->end ();
322                 iter_ != end_; ++iter_)
323             {
324                 equivset *equivset_ = *iter_;
325                 const std::size_t transition_ = closure
326                     (&equivset_->_followpos, seen_sets_, seen_vectors_,
327                     hash_vector_, dfa_alphabet_, dfa_);
328 
329                 if (transition_ != npos)
330                 {
331                     ptr_ = &dfa_.front () + ((index_ + 1) * dfa_alphabet_);
332 
333                     // Prune abstemious transitions from end states.
334                     if (*ptr_ && !equivset_->_greedy) continue;
335 
336                     for (typename detail::equivset::index_vector::const_iterator
337                         equiv_iter_ = equivset_->_index_vector.begin (),
338                         equiv_end_ = equivset_->_index_vector.end ();
339                         equiv_iter_ != equiv_end_; ++equiv_iter_)
340                     {
341                         const std::size_t equiv_index_ = *equiv_iter_;
342 
343                         if (equiv_index_ == bol_token)
344                         {
345                             if (ptr_[eol_index] == 0)
346                             {
347                                 ptr_[bol_index] = transition_;
348                             }
349                         }
350                         else if (equiv_index_ == eol_token)
351                         {
352                             if (ptr_[bol_index] == 0)
353                             {
354                                 ptr_[eol_index] = transition_;
355                             }
356                         }
357                         else
358                         {
359                             ptr_[equiv_index_ + dfa_offset] = transition_;
360                         }
361                     }
362                 }
363             }
364         }
365     }
366 
closure(typename detail::node::node_vector * followpos_,node_set_vector & seen_sets_,node_vector_vector & seen_vectors_,size_t_vector & hash_vector_,const std::size_t size_,size_t_vector & dfa_)367     static std::size_t closure (typename detail::node::node_vector *followpos_,
368         node_set_vector &seen_sets_, node_vector_vector &seen_vectors_,
369         size_t_vector &hash_vector_, const std::size_t size_,
370         size_t_vector &dfa_)
371     {
372         bool end_state_ = false;
373         std::size_t id_ = 0;
374         std::size_t unique_id_ = npos;
375         std::size_t state_ = 0;
376         std::size_t hash_ = 0;
377 
378         if (followpos_->empty ()) return npos;
379 
380         std::size_t index_ = 0;
381         boost::movelib::unique_ptr<node_set> set_ptr_ (new node_set);
382         boost::movelib::unique_ptr<node_vector> vector_ptr_ (new node_vector);
383 
384         for (typename detail::node::node_vector::const_iterator iter_ =
385             followpos_->begin (), end_ = followpos_->end ();
386             iter_ != end_; ++iter_)
387         {
388             closure_ex (*iter_, end_state_, id_, unique_id_, state_,
389                 set_ptr_.get (), vector_ptr_.get (), hash_);
390         }
391 
392         bool found_ = false;
393         typename size_t_vector::const_iterator hash_iter_ =
394             hash_vector_.begin ();
395         typename size_t_vector::const_iterator hash_end_ =
396             hash_vector_.end ();
397         typename node_set_vector::vector::const_iterator set_iter_ =
398             seen_sets_->begin ();
399 
400         for (; hash_iter_ != hash_end_; ++hash_iter_, ++set_iter_)
401         {
402             found_ = *hash_iter_ == hash_ && *(*set_iter_) == *set_ptr_;
403             ++index_;
404 
405             if (found_) break;
406         }
407 
408         if (!found_)
409         {
410             seen_sets_->push_back (static_cast<node_set *>(0));
411             seen_sets_->back () = set_ptr_.release ();
412             seen_vectors_->push_back (static_cast<node_vector *>(0));
413             seen_vectors_->back () = vector_ptr_.release ();
414             hash_vector_.push_back (hash_);
415             // State 0 is the jam state...
416             index_ = seen_sets_->size ();
417 
418             const std::size_t old_size_ = dfa_.size ();
419 
420             dfa_.resize (old_size_ + size_, 0);
421 
422             if (end_state_)
423             {
424                 dfa_[old_size_] |= end_state;
425                 dfa_[old_size_ + id_index] = id_;
426                 dfa_[old_size_ + unique_id_index] = unique_id_;
427                 dfa_[old_size_ + state_index] = state_;
428             }
429         }
430 
431         return index_;
432     }
433 
closure_ex(detail::node * node_,bool & end_state_,std::size_t & id_,std::size_t & unique_id_,std::size_t & state_,node_set * set_ptr_,node_vector * vector_ptr_,std::size_t & hash_)434     static void closure_ex (detail::node *node_, bool &end_state_,
435         std::size_t &id_, std::size_t &unique_id_, std::size_t &state_,
436         node_set *set_ptr_, node_vector *vector_ptr_, std::size_t &hash_)
437     {
438         const bool temp_end_state_ = node_->end_state ();
439 
440         if (temp_end_state_)
441         {
442             if (!end_state_)
443             {
444                 end_state_ = true;
445                 id_ = node_->id ();
446                 unique_id_ = node_->unique_id ();
447                 state_ = node_->lexer_state ();
448             }
449         }
450 
451         if (set_ptr_->insert (node_).second)
452         {
453             vector_ptr_->push_back (node_);
454             hash_ += reinterpret_cast<std::size_t> (node_);
455         }
456     }
457 
partition_tokens(const token_map & map_,charset_list & lhs_)458     static void partition_tokens (const token_map &map_,
459         charset_list &lhs_)
460     {
461         charset_list rhs_;
462 
463         fill_rhs_list (map_, rhs_);
464 
465         if (!rhs_->empty ())
466         {
467             typename charset_list::list::iterator iter_;
468             typename charset_list::list::iterator end_;
469             charset_ptr overlap_ (new charset);
470 
471             lhs_->push_back (static_cast<charset *>(0));
472             lhs_->back () = rhs_->front ();
473             rhs_->pop_front ();
474 
475             while (!rhs_->empty ())
476             {
477                 charset_ptr r_ (rhs_->front ());
478 
479                 rhs_->pop_front ();
480                 iter_ = lhs_->begin ();
481                 end_ = lhs_->end ();
482 
483                 while (!r_->empty () && iter_ != end_)
484                 {
485                     typename charset_list::list::iterator l_iter_ = iter_;
486 
487                     (*l_iter_)->intersect (*r_.get (), *overlap_.get ());
488 
489                     if (overlap_->empty ())
490                     {
491                         ++iter_;
492                     }
493                     else if ((*l_iter_)->empty ())
494                     {
495                         delete *l_iter_;
496                         *l_iter_ = overlap_.release ();
497 
498                         overlap_.reset (new charset);
499                         ++iter_;
500                     }
501                     else if (r_->empty ())
502                     {
503                         overlap_.swap (r_);
504                         overlap_.reset (new charset);
505                         break;
506                     }
507                     else
508                     {
509                         iter_ = lhs_->insert (++iter_,
510                             static_cast<charset *>(0));
511                         *iter_ = overlap_.release ();
512 
513                         overlap_.reset(new charset);
514                         ++iter_;
515                         end_ = lhs_->end ();
516                     }
517                 }
518 
519                 if (!r_->empty ())
520                 {
521                     lhs_->push_back (static_cast<charset *>(0));
522                     lhs_->back () = r_.release ();
523                 }
524             }
525         }
526     }
527 
fill_rhs_list(const token_map & map_,charset_list & list_)528     static void fill_rhs_list (const token_map &map_,
529         charset_list &list_)
530     {
531         typename parser::tokeniser::token_map::const_iterator iter_ =
532             map_.begin ();
533         typename parser::tokeniser::token_map::const_iterator end_ =
534             map_.end ();
535 
536         for (; iter_ != end_; ++iter_)
537         {
538             list_->push_back (static_cast<charset *>(0));
539             list_->back () = new charset (iter_->first, iter_->second);
540         }
541     }
542 
fill_lookup(const string_token & token_,size_t_vector * lookup_,const std::size_t index_)543     static void fill_lookup (const string_token &token_,
544         size_t_vector *lookup_, const std::size_t index_)
545     {
546         const CharT *curr_ = token_._charset.c_str ();
547         const CharT *chars_end_ = curr_ + token_._charset.size ();
548         std::size_t *ptr_ = &lookup_->front ();
549         const std::size_t max_ = sizeof (CharT) == 1 ?
550             num_chars : num_wchar_ts;
551 
552         if (token_._negated)
553         {
554             // $$$ FIXME JDG July 2014 $$$
555             // this code is problematic on platforms where wchar_t is signed
556             // with min generating negative numbers. This crashes with BAD_ACCESS
557             // because of the vector index below:
558             //  ptr_[static_cast<typename Traits::index_type>(curr_char_)]
559             CharT curr_char_ = 0; // (std::numeric_limits<CharT>::min)();
560             std::size_t i_ = 0;
561 
562             while (curr_ < chars_end_)
563             {
564                 while (*curr_ > curr_char_)
565                 {
566                     ptr_[static_cast<typename Traits::index_type>
567                         (curr_char_)] = index_ + dfa_offset;
568                     ++curr_char_;
569                     ++i_;
570                 }
571 
572                 ++curr_char_;
573                 ++curr_;
574                 ++i_;
575             }
576 
577             for (; i_ < max_; ++i_)
578             {
579                 ptr_[static_cast<typename Traits::index_type>(curr_char_)] =
580                     index_ + dfa_offset;
581                 ++curr_char_;
582             }
583         }
584         else
585         {
586             while (curr_ < chars_end_)
587             {
588                 ptr_[static_cast<typename Traits::index_type>(*curr_)] =
589                     index_ + dfa_offset;
590                 ++curr_;
591             }
592         }
593     }
594 
build_equiv_list(const node_vector * vector_,const index_set_vector & set_mapping_,equivset_list & lhs_)595     static void build_equiv_list (const node_vector *vector_,
596         const index_set_vector &set_mapping_, equivset_list &lhs_)
597     {
598         equivset_list rhs_;
599 
600         fill_rhs_list (vector_, set_mapping_, rhs_);
601 
602         if (!rhs_->empty ())
603         {
604             typename equivset_list::list::iterator iter_;
605             typename equivset_list::list::iterator end_;
606             equivset_ptr overlap_ (new equivset);
607 
608             lhs_->push_back (static_cast<equivset *>(0));
609             lhs_->back () = rhs_->front ();
610             rhs_->pop_front ();
611 
612             while (!rhs_->empty ())
613             {
614                 equivset_ptr r_ (rhs_->front ());
615 
616                 rhs_->pop_front ();
617                 iter_ = lhs_->begin ();
618                 end_ = lhs_->end ();
619 
620                 while (!r_->empty () && iter_ != end_)
621                 {
622                     typename equivset_list::list::iterator l_iter_ = iter_;
623 
624                     (*l_iter_)->intersect (*r_.get (), *overlap_.get ());
625 
626                     if (overlap_->empty ())
627                     {
628                         ++iter_;
629                     }
630                     else if ((*l_iter_)->empty ())
631                     {
632                         delete *l_iter_;
633                         *l_iter_ = overlap_.release ();
634 
635                         overlap_.reset (new equivset);
636                         ++iter_;
637                     }
638                     else if (r_->empty ())
639                     {
640                         overlap_.swap (r_);
641                         overlap_.reset (new equivset);
642                         break;
643                     }
644                     else
645                     {
646                         iter_ = lhs_->insert (++iter_,
647                             static_cast<equivset *>(0));
648                         *iter_ = overlap_.release ();
649 
650                         overlap_.reset (new equivset);
651                         ++iter_;
652                         end_ = lhs_->end ();
653                     }
654                 }
655 
656                 if (!r_->empty ())
657                 {
658                     lhs_->push_back (static_cast<equivset *>(0));
659                     lhs_->back () = r_.release ();
660                 }
661             }
662         }
663     }
664 
fill_rhs_list(const node_vector * vector_,const index_set_vector & set_mapping_,equivset_list & list_)665     static void fill_rhs_list (const node_vector *vector_,
666         const index_set_vector &set_mapping_, equivset_list &list_)
667     {
668         typename node_vector::const_iterator iter_ =
669             vector_->begin ();
670         typename node_vector::const_iterator end_ =
671             vector_->end ();
672 
673         for (; iter_ != end_; ++iter_)
674         {
675             const detail::node *node_ = *iter_;
676 
677             if (!node_->end_state ())
678             {
679                 const std::size_t token_ = node_->token ();
680 
681                 if (token_ != null_token)
682                 {
683                     list_->push_back (static_cast<equivset *>(0));
684 
685                     if (token_ == bol_token || token_ == eol_token)
686                     {
687                         std::set<std::size_t> index_set_;
688 
689                         index_set_.insert (token_);
690                         list_->back () = new equivset (index_set_,
691                             node_->greedy (), token_, node_->followpos ());
692                     }
693                     else
694                     {
695                         list_->back () = new equivset (set_mapping_[token_],
696                             node_->greedy (), token_, node_->followpos ());
697                     }
698                 }
699             }
700         }
701     }
702 
fixup_bol(detail::node * & root_,node_ptr_vector & node_ptr_vector_)703     static void fixup_bol (detail::node * &root_,
704         node_ptr_vector &node_ptr_vector_)
705     {
706         typename detail::node::node_vector *first_ = &root_->firstpos ();
707         bool found_ = false;
708         typename detail::node::node_vector::const_iterator iter_ =
709             first_->begin ();
710         typename detail::node::node_vector::const_iterator end_ =
711             first_->end ();
712 
713         for (; iter_ != end_; ++iter_)
714         {
715             const detail::node *node_ = *iter_;
716 
717             found_ = !node_->end_state () && node_->token () == bol_token;
718 
719             if (found_) break;
720         }
721 
722         if (!found_)
723         {
724             node_ptr_vector_->push_back (static_cast<detail::leaf_node *>(0));
725             node_ptr_vector_->back () = new detail::leaf_node
726                 (bol_token, true);
727 
728             detail::node *lhs_ = node_ptr_vector_->back ();
729 
730             node_ptr_vector_->push_back (static_cast<detail::leaf_node *>(0));
731             node_ptr_vector_->back () = new detail::leaf_node
732                 (null_token, true);
733 
734             detail::node *rhs_ = node_ptr_vector_->back ();
735 
736             node_ptr_vector_->push_back
737                 (static_cast<detail::selection_node *>(0));
738             node_ptr_vector_->back () =
739                 new detail::selection_node (lhs_, rhs_);
740             lhs_ = node_ptr_vector_->back ();
741 
742             node_ptr_vector_->push_back
743                 (static_cast<detail::sequence_node *>(0));
744             node_ptr_vector_->back () =
745                 new detail::sequence_node (lhs_, root_);
746             root_ = node_ptr_vector_->back ();
747         }
748     }
749 
minimise_dfa(const std::size_t dfa_alphabet_,size_t_vector & dfa_,std::size_t size_)750     static void minimise_dfa (const std::size_t dfa_alphabet_,
751         size_t_vector &dfa_, std::size_t size_)
752     {
753         const std::size_t *first_ = &dfa_.front ();
754         const std::size_t *second_ = 0;
755         const std::size_t *end_ = first_ + size_;
756         std::size_t index_ = 1;
757         std::size_t new_index_ = 1;
758         std::size_t curr_index_ = 0;
759         index_set index_set_;
760         size_t_vector lookup_;
761         std::size_t *lookup_ptr_ = 0;
762 
763         lookup_.resize (size_ / dfa_alphabet_, null_token);
764         lookup_ptr_ = &lookup_.front ();
765         *lookup_ptr_ = 0;
766         // Only one 'jam' state, so skip it.
767         first_ += dfa_alphabet_;
768 
769         for (; first_ < end_; first_ += dfa_alphabet_, ++index_)
770         {
771             for (second_ = first_ + dfa_alphabet_, curr_index_ = index_ + 1;
772                 second_ < end_; second_ += dfa_alphabet_, ++curr_index_)
773             {
774                 if (index_set_.find (curr_index_) != index_set_.end ())
775                 {
776                     continue;
777                 }
778 
779                 // Some systems have memcmp in namespace std.
780                 using namespace std;
781 
782                 if (memcmp (first_, second_, sizeof (std::size_t) *
783                     dfa_alphabet_) == 0)
784                 {
785                     index_set_.insert (curr_index_);
786                     lookup_ptr_[curr_index_] = new_index_;
787                 }
788             }
789 
790             if (lookup_ptr_[index_] == null_token)
791             {
792                 lookup_ptr_[index_] = new_index_;
793                 ++new_index_;
794             }
795         }
796 
797         if (!index_set_.empty ())
798         {
799             const std::size_t *front_ = &dfa_.front ();
800             size_t_vector new_dfa_ (front_, front_ + dfa_alphabet_);
801             typename index_set::iterator set_end_ =
802                 index_set_.end ();
803             const std::size_t *ptr_ = front_ + dfa_alphabet_;
804             std::size_t *new_ptr_ = 0;
805 
806             new_dfa_.resize (size_ - index_set_.size () * dfa_alphabet_, 0);
807             new_ptr_ = &new_dfa_.front () + dfa_alphabet_;
808             size_ /= dfa_alphabet_;
809 
810             for (index_ = 1; index_ < size_; ++index_)
811             {
812                 if (index_set_.find (index_) != set_end_)
813                 {
814                     ptr_ += dfa_alphabet_;
815                     continue;
816                 }
817 
818                 new_ptr_[end_state_index] = ptr_[end_state_index];
819                 new_ptr_[id_index] = ptr_[id_index];
820                 new_ptr_[unique_id_index] = ptr_[unique_id_index];
821                 new_ptr_[state_index] = ptr_[state_index];
822                 new_ptr_[bol_index] = lookup_ptr_[ptr_[bol_index]];
823                 new_ptr_[eol_index] = lookup_ptr_[ptr_[eol_index]];
824                 new_ptr_ += dfa_offset;
825                 ptr_ += dfa_offset;
826 
827                 for (std::size_t i_ = dfa_offset; i_ < dfa_alphabet_; ++i_)
828                 {
829                     *new_ptr_++ = lookup_ptr_[*ptr_++];
830                 }
831             }
832 
833             dfa_.swap (new_dfa_);
834         }
835     }
836 };
837 
838 typedef basic_generator<char> generator;
839 typedef basic_generator<wchar_t> wgenerator;
840 }
841 }
842 
843 #endif
844