1 // generate_re2c.hpp
2 // Copyright (c) 2009 Ben Hanson (http://www.benhanson.net/)
3 //
4 // Distributed under the Boost Software License, Version 1.0. (See accompanying
5 // file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
6 #ifndef LEXERTL_GENERATE_RE2C_HPP
7 #define LEXERTL_GENERATE_RE2C_HPP
8 
9 #include "char_traits.hpp"
10 #include "consts.hpp"
11 #include "internals.hpp"
12 #include <iostream>
13 #include "runtime_error.hpp"
14 #include "size_t.hpp"
15 #include "state_machine.hpp"
16 #include <vector>
17 
18 namespace boost
19 {
20 namespace lexer
21 {
22 // check whether state0_0 is referenced from any of the other states
23 template <typename Char>
need_label0_0(boost::lexer::basic_state_machine<Char> const & sm_)24 bool need_label0_0(boost::lexer::basic_state_machine<Char> const &sm_)
25 {
26     typedef typename boost::lexer::basic_state_machine<Char>::iterator
27         iterator_type;
28     iterator_type iter_ = sm_.begin();
29     std::size_t states_ = iter_->states;
30 
31     for (std::size_t state_ = 0; state_ < states_; ++state_)
32     {
33         if (0 == iter_->bol_index || 0 == iter_->eol_index)
34         {
35             return true;
36         }
37 
38         std::size_t const transitions_ = iter_->transitions;
39         for (std::size_t t_ = 0; t_ < transitions_; ++t_)
40         {
41             if (0 == iter_->goto_state)
42             {
43                 return true;
44             }
45             ++iter_;
46         }
47         if (transitions_ == 0) ++iter_;
48     }
49     return false;
50 }
51 
52 template<typename CharT>
generate_re2c(const basic_state_machine<CharT> & state_machine_,std::ostream & os_,const bool use_pointers_=false,const bool skip_unknown_=true,const bool optimise_parameters_=true,const char * name_="next_token")53 void generate_re2c (const basic_state_machine<CharT> &state_machine_,
54     std::ostream &os_, const bool use_pointers_ = false,
55     const bool skip_unknown_ = true, const bool optimise_parameters_ = true,
56     const char *name_ = "next_token")
57 {
58     typedef typename boost::lexer::basic_string_token<CharT> string_token;
59     const detail::internals &sm_ = state_machine_.data ();
60 
61     if (sm_._lookup->size () == 0)
62     {
63         throw runtime_error ("Cannot generate code from an empty "
64             "state machine");
65     }
66 
67     std::string upper_name_ (__DATE__);
68     const std::size_t lookups_ = sm_._lookup->front ()->size ();
69     typename boost::lexer::basic_state_machine<CharT>::iterator iter_ =
70         state_machine_.begin();
71     typename boost::lexer::basic_state_machine<CharT>::iterator end_ =
72         state_machine_.end();
73     const std::size_t dfas_ = sm_._dfa->size ();
74     std::string::size_type pos_ = upper_name_.find (' ');
75     const char *iterator_ = 0;
76 
77     if (use_pointers_)
78     {
79         if (lookups_ == 256)
80         {
81             iterator_ = "const char *";
82         }
83         else
84         {
85             iterator_ = "const wchar_t *";
86         }
87     }
88     else
89     {
90         iterator_ = "Iterator &";
91     }
92 
93     while (pos_ != std::string::npos)
94     {
95         upper_name_.replace (pos_, 1, "_");
96         pos_ = upper_name_.find (' ', pos_);
97     }
98 
99     upper_name_ += '_';
100     upper_name_ +=  __TIME__;
101 
102     pos_ = upper_name_.find (':');
103 
104     while (pos_ != std::string::npos)
105     {
106         upper_name_.erase (pos_, 1);
107         pos_ = upper_name_.find (':', pos_);
108     }
109 
110     upper_name_ = '_' + upper_name_;
111     upper_name_ = name_ + upper_name_;
112     std::transform (upper_name_.begin (), upper_name_.end (),
113         upper_name_.begin (), ::toupper);
114     os_ << "#ifndef " << upper_name_ + '\n';
115     os_ << "#define " << upper_name_ + '\n';
116     os_ << "// Copyright (c) 2008-2009 Ben Hanson\n";
117     os_ << "//\n";
118     os_ << "// Distributed under the Boost Software License, "
119         "Version 1.0. (See accompanying\n";
120     os_ << "// file licence_1_0.txt or copy at "
121         "http://www.boost.org/LICENSE_1_0.txt)\n\n";
122     os_ << "// Auto-generated by boost::lexer\n";
123     os_ << "template<typename Iterator>\n";
124     os_ << "std::size_t " << name_  << " (";
125 
126     if (dfas_ > 1 || !optimise_parameters_)
127     {
128         os_ << "std::size_t &start_state_, ";
129     }
130 
131     if (use_pointers_)
132     {
133         os_ << iterator_ << " &";
134     }
135     else
136     {
137         os_ << iterator_;
138     }
139 
140     os_ << "start_token_, ";
141 
142     if (use_pointers_)
143     {
144         os_ << iterator_ << " const ";
145     }
146     else
147     {
148         os_ << "const " << iterator_;
149     }
150 
151     os_ << "end_, \n";
152     os_ << "    std::size_t &unique_id_";
153 
154     if (sm_._seen_BOL_assertion || !optimise_parameters_)
155     {
156         os_ << ", bool &beg_of_line_";
157     }
158 
159     os_ << ")\n";
160     os_ << "{\n";
161     os_ << "    static const std::size_t npos = static_cast"
162         "<std::size_t>(~0);\n";
163     os_ << "\n    if (start_token_ == end_)\n";
164     os_ << "    {\n";
165     os_ << "        unique_id_ = npos;\n";
166     os_ << "        return 0;\n";
167     os_ << "    }\n\n";
168 
169     if (dfas_ > 1)
170     {
171         os_ << "again:\n";
172     }
173 
174     os_ << "    Iterator curr_ = start_token_;\n";
175     os_ << "    bool end_state_ = false;\n";
176     os_ << "    std::size_t id_ = npos;\n";
177     os_ << "    std::size_t uid_ = npos;\n";
178 
179     if (dfas_ > 1)
180     {
181         os_ << "    std::size_t end_start_state_ = start_state_;\n";
182     }
183 
184     if (sm_._seen_BOL_assertion)
185     {
186         os_ << "    bool bol_ = beg_of_line_;\n";
187         os_ << "    bool end_bol_ = bol_;\n";
188     }
189 
190     os_ << "    Iterator end_token_ = start_token_;\n";
191     os_ << '\n';
192 
193     if (dfas_ > 1)
194     {
195         os_ << "    switch (start_state_)\n";
196         os_ << "    {\n";
197 
198         for (std::size_t i_ = 0; i_ < dfas_; ++i_)
199         {
200             os_ << "    case " << i_ << ":\n";
201             os_ << "        goto " << i_ << "_0;\n";
202             os_ << "        // Not needed, but to prevent warnings\n";
203             os_ << "        break;\n";
204         }
205 
206         os_ << "    default:\n";
207         os_ << "        throw std::runtime_error (\"Invalid start state!\")\n";
208         os_ << "        break;\n";
209         os_ << "    }\n\n";
210     }
211 
212     os_ << "    ";
213 
214     if (lookups_ == 256)
215     {
216         os_ << "char";
217     }
218     else
219     {
220         os_ << "wchar_t";
221     }
222 
223     os_ << " ch_ = 0;\n\n";
224 
225     bool need_state0_0_label = need_label0_0(state_machine_);
226 
227     for (std::size_t dfa_ = 0; dfa_ < dfas_; ++dfa_)
228     {
229         const std::size_t states_ = iter_->states;
230 
231         for (std::size_t state_ = 0; state_ < states_; ++state_)
232         {
233             const std::size_t transitions_ = iter_->transitions;
234             std::size_t t_ = 0;
235 
236             if (dfas_ > 1 || dfa_ != 0 || state_ != 0 || need_state0_0_label)
237             {
238                 os_ << "state" << dfa_ << '_' << state_ << ":\n";
239             }
240 
241             if (iter_->end_state)
242             {
243                 os_ << "    end_state_ = true;\n";
244                 os_ << "    id_ = " << iter_->id << ";\n";
245                 os_ << "    uid_ = " << iter_->unique_id << ";\n";
246                 os_ << "    end_token_ = curr_;\n";
247 
248                 if (dfas_ > 1)
249                 {
250                     os_ << "    end_start_state_ = " << iter_->goto_dfa <<
251                         ";\n";
252                 }
253 
254                 if (sm_._seen_BOL_assertion)
255                 {
256                     os_ << "    end_bol_ = bol_;\n";
257                 }
258 
259                 if (transitions_) os_ << '\n';
260             }
261 
262             if (t_ < transitions_ || iter_->bol_index != boost::lexer::npos ||
263                 iter_->eol_index != boost::lexer::npos)
264             {
265                 os_ << "    if (curr_ == end_) goto end;\n\n";
266                 os_ << "    ch_ = *curr_;\n";
267 
268                 if (iter_->bol_index != boost::lexer::npos)
269                 {
270                     os_ << "\n    if (bol_) goto state" << dfa_ << '_' <<
271                         iter_->bol_index << ";\n\n";
272                 }
273 
274                 if (iter_->eol_index != boost::lexer::npos)
275                 {
276                     os_ << "\n    if (ch_ == '\n') goto state" << dfa_ << '_' <<
277                         iter_->eol_index << ";\n\n";
278                 }
279 
280                 os_ << "    ++curr_;\n";
281             }
282 
283             for (; t_ < transitions_; ++t_)
284             {
285                 const char *ptr_ = iter_->token._charset.c_str();
286                 const char *end_ = ptr_ + iter_->token._charset.size();
287                 char start_char_ = 0;
288                 char curr_char_ = 0;
289                 bool range_ = false;
290                 bool first_char_ = true;
291 
292                 os_ << "\n    if (";
293 
294                 while (ptr_ != end_)
295                 {
296                     curr_char_ = *ptr_++;
297 
298                     if (*ptr_ == curr_char_ + 1)
299                     {
300                         if (!range_)
301                         {
302                             start_char_ = curr_char_;
303                         }
304 
305                         range_ = true;
306                     }
307                     else
308                     {
309                         if (!first_char_)
310                         {
311                             if (iter_->token._negated)
312                             {
313                                 os_ << " && ";
314                             }
315                             else
316                             {
317                                 os_ << " || ";
318                             }
319                         }
320 
321                         first_char_ = false;
322 
323                         if (range_)
324                         {
325                             typename string_token::string temp_;
326 
327                             if (iter_->token._negated)
328                             {
329                                 os_ << "!";
330                             }
331 
332                             string_token::escape_char (start_char_, temp_);
333                             os_ << "(ch_ >= '" << temp_;
334                             temp_.clear ();
335                             string_token::escape_char (curr_char_, temp_);
336                             os_ << "' && ch_ <= '" << temp_ << "')";
337                             range_ = false;
338                         }
339                         else
340                         {
341                             typename string_token::string temp_;
342 
343                             os_ << "ch_ ";
344 
345                             if (iter_->token._negated)
346                             {
347                                 os_ << "!=";
348                             }
349                             else
350                             {
351                                 os_ << "==";
352                             }
353 
354                             string_token::escape_char (curr_char_, temp_);
355                             os_ << " '" << temp_ << "'";
356                         }
357                     }
358                 }
359 
360                 os_ << ") goto state" << dfa_ << '_' << iter_->goto_state <<
361                     ";\n\n";
362                 ++iter_;
363             }
364 
365             if (!(dfa_ == dfas_ - 1 && state_ == states_ - 1))
366             {
367                 os_ << "    goto end;\n";
368             }
369 
370             if (transitions_ == 0) ++iter_;
371         }
372     }
373 
374     os_ << "end:\n";
375     os_ << "    if (end_state_)\n";
376     os_ << "    {\n";
377     os_ << "        // return longest match\n";
378 
379     if (dfas_ > 1)
380     {
381         os_ << "        start_state_ = end_start_state_;\n";
382     }
383 
384     if (sm_._seen_BOL_assertion && dfas_ < 2)
385     {
386         os_ << "        beg_of_line_ = end_bol_;\n";
387     }
388 
389     os_ << "        start_token_ = end_token_;\n";
390 
391     if (dfas_ > 1)
392     {
393         os_ << '\n';
394         os_ << "        if (id_ == 0)\n";
395         os_ << "        {\n";
396 
397         if (sm_._seen_BOL_assertion)
398         {
399             os_ << "            bol_ = end_bol_;\n";
400         }
401 
402         os_ << "            goto again;\n";
403         os_ << "        }\n";
404 
405         if (sm_._seen_BOL_assertion)
406         {
407             os_ << "        else\n";
408             os_ << "        {\n";
409             os_ << "            beg_of_line_ = end_bol_;\n";
410             os_ << "        }\n";
411         }
412     }
413 
414     os_ << "    }\n";
415     os_ << "    else\n";
416     os_ << "    {\n";
417 
418     if (sm_._seen_BOL_assertion)
419     {
420         os_ << "        beg_of_line_ = *start_token_ == '\\n';\n";
421     }
422 
423     if (skip_unknown_)
424     {
425         os_ << "        // No match causes char to be skipped\n";
426         os_ << "        ++start_token_;\n";
427     }
428 
429     os_ << "        id_ = npos;\n";
430     os_ << "        uid_ = npos;\n";
431     os_ << "    }\n";
432     os_ << '\n';
433     os_ << "    unique_id_ = uid_;\n";
434     os_ << "    return id_;\n";
435     os_ << "}\n";
436     os_ << "\n#endif\n";
437 }
438 }
439 }
440 #endif
441