1 // generate_cpp.hpp
2 // Copyright (c) 2008-2009 Ben Hanson (http://www.benhanson.net/)
3 //
4 // Distributed under the Boost Software License, Version 1.0. (See accompanying
5 // file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
6 #ifndef BOOST_LEXER_GENERATE_CPP_HPP
7 #define BOOST_LEXER_GENERATE_CPP_HPP
8 
9 #include "char_traits.hpp"
10 #include "consts.hpp"
11 #include "internals.hpp"
12 #include <iostream>
13 #include <boost/detail/iterator.hpp>
14 #include "runtime_error.hpp"
15 #include "size_t.hpp"
16 #include "state_machine.hpp"
17 #include <vector>
18 
19 namespace boost
20 {
21 namespace lexer
22 {
23 template<typename CharT>
generate_cpp(const basic_state_machine<CharT> & state_machine_,std::ostream & os_,const bool use_pointers_=false,const bool skip_unknown_=true,const bool optimise_parameters_=true,const char * name_="next_token")24 void generate_cpp (const basic_state_machine<CharT> &state_machine_,
25     std::ostream &os_, const bool use_pointers_ = false,
26     const bool skip_unknown_ = true, const bool optimise_parameters_ = true,
27     const char *name_ = "next_token")
28 {
29     const detail::internals &sm_ = state_machine_.data ();
30 
31     if (sm_._lookup->size () == 0)
32     {
33         throw runtime_error ("Cannot generate code from an empty "
34             "state machine");
35     }
36 
37     std::string upper_name_ (__DATE__);
38     const std::size_t lookups_ = sm_._lookup->front ()->size ();
39     const std::size_t dfas_ = sm_._dfa->size ();
40     std::string::size_type pos_ = upper_name_.find (' ');
41     const char *iterator_ = 0;
42 
43     if (use_pointers_)
44     {
45         if (lookups_ == 256)
46         {
47             iterator_ = "const char *";
48         }
49         else
50         {
51             iterator_ = "const wchar_t *";
52         }
53     }
54     else
55     {
56         iterator_ = "Iterator &";
57     }
58 
59     while (pos_ != std::string::npos)
60     {
61         upper_name_.replace (pos_, 1, "_");
62         pos_ = upper_name_.find (' ', pos_);
63     }
64 
65     upper_name_ += '_';
66     upper_name_ +=  __TIME__;
67 
68     pos_ = upper_name_.find (':');
69 
70     while (pos_ != std::string::npos)
71     {
72         upper_name_.erase (pos_, 1);
73         pos_ = upper_name_.find (':', pos_);
74     }
75 
76     upper_name_ = '_' + upper_name_;
77     upper_name_ = name_ + upper_name_;
78     std::transform (upper_name_.begin (), upper_name_.end (),
79         upper_name_.begin (), ::toupper);
80     os_ << "#ifndef " << upper_name_ + '\n';
81     os_ << "#define " << upper_name_ + '\n';
82     os_ << "// Copyright (c) 2008-2009 Ben Hanson\n";
83     os_ << "//\n";
84     os_ << "// Distributed under the Boost Software License, "
85         "Version 1.0. (See accompanying\n";
86     os_ << "// file licence_1_0.txt or copy at "
87         "http://www.boost.org/LICENSE_1_0.txt)\n\n";
88     os_ << "// Auto-generated by boost::lexer\n";
89     os_ << "template<typename Iterator>\n";
90     os_ << "std::size_t " << name_  << " (";
91 
92     if (dfas_ > 1 || !optimise_parameters_)
93     {
94         os_ << "std::size_t &start_state_, ";
95     }
96 
97     if (use_pointers_)
98     {
99         os_ << iterator_ << " &";
100     }
101     else
102     {
103         os_ << iterator_;
104     }
105 
106     os_ << "start_token_, ";
107 
108     if (use_pointers_)
109     {
110         os_ << iterator_ << " const ";
111     }
112     else
113     {
114         os_ << "const " << iterator_;
115     }
116 
117     os_ << "end_, \n";
118     os_ << "    std::size_t &unique_id_";
119 
120     if (sm_._seen_BOL_assertion || !optimise_parameters_)
121     {
122         os_ << ", bool &beg_of_line_";
123     }
124 
125     os_ << ")\n";
126     os_ << "{\n";
127     os_ << "    enum {end_state_index, id_index, unique_id_index, state_index, bol_index,\n";
128     os_ << "        eol_index, dead_state_index, dfa_offset};\n";
129     os_ << "    static const std::size_t npos = static_cast"
130         "<std::size_t>(~0);\n";
131 
132     if (dfas_ > 1)
133     {
134         std::size_t state_ = 0;
135 
136         for (; state_ < dfas_; ++state_)
137         {
138             std::size_t i_ = 0;
139             std::size_t j_ = 1;
140             std::size_t count_ = lookups_ / 8;
141             const std::size_t *lookup_ = &sm_._lookup[state_]->front ();
142             const std::size_t *dfa_ = &sm_._dfa[state_]->front ();
143 
144             os_ << "    static const std::size_t lookup" << state_ << "_[" <<
145                 lookups_ << "] = {";
146 
147             for (; i_ < count_; ++i_)
148             {
149                 const std::size_t index_ = i_ * 8;
150 
151                 os_ << lookup_[index_];
152 
153                 for (; j_ < 8; ++j_)
154                 {
155                     os_ << ", " << lookup_[index_ + j_];
156                 }
157 
158                 if (i_ < count_ - 1)
159                 {
160                     os_ << "," << std::endl << "        ";
161                 }
162 
163                 j_ = 1;
164             }
165 
166             os_ << "};\n";
167             count_ = sm_._dfa[state_]->size ();
168             os_ << "    static const std::size_t dfa" << state_ << "_[" <<
169                 count_ << "] = {";
170             count_ /= 8;
171 
172             for (i_ = 0; i_ < count_; ++i_)
173             {
174                 const std::size_t index_ = i_ * 8;
175 
176                 os_ << dfa_[index_];
177 
178                 for (j_ = 1; j_ < 8; ++j_)
179                 {
180                     os_ << ", " << dfa_[index_ + j_];
181                 }
182 
183                 if (i_ < count_ - 1)
184                 {
185                     os_ << "," << std::endl << "        ";
186                 }
187             }
188 
189             const std::size_t mod_ = sm_._dfa[state_]->size () % 8;
190 
191             if (mod_)
192             {
193                 const std::size_t index_ = count_ * 8;
194 
195                 if (count_)
196                 {
197                     os_ << ",\n        ";
198                 }
199 
200                 os_ << dfa_[index_];
201 
202                 for (j_ = 1; j_ < mod_; ++j_)
203                 {
204                     os_ << ", " << dfa_[index_ + j_];
205                 }
206             }
207 
208             os_ << "};\n";
209         }
210 
211         std::size_t count_ = sm_._dfa_alphabet.size ();
212         std::size_t i_ = 1;
213 
214         os_ << "    static const std::size_t *lookup_arr_[" << count_ <<
215             "] = {";
216         os_ << "lookup0_";
217 
218         for (i_ = 1; i_ < count_; ++i_)
219         {
220             os_ << ", " << "lookup" << i_ << "_";
221         }
222 
223         os_ << "};\n";
224         os_ << "    static const std::size_t dfa_alphabet_arr_[" << count_ <<
225             "] = {";
226         os_ << sm_._dfa_alphabet.front ();
227 
228         for (i_ = 1; i_ < count_; ++i_)
229         {
230             os_ << ", " << sm_._dfa_alphabet[i_];
231         }
232 
233         os_ << "};\n";
234         os_ << "    static const std::size_t *dfa_arr_[" << count_ <<
235             "] = {";
236         os_ << "dfa0_";
237 
238         for (i_ = 1; i_ < count_; ++i_)
239         {
240             os_ << ", " << "dfa" << i_ << "_";
241         }
242 
243         os_ << "};\n";
244     }
245     else
246     {
247         const std::size_t *lookup_ = &sm_._lookup->front ()->front ();
248         const std::size_t *dfa_ = &sm_._dfa->front ()->front ();
249         std::size_t i_ = 0;
250         std::size_t j_ = 1;
251         std::size_t count_ = lookups_ / 8;
252 
253         os_ << "    static const std::size_t lookup_[";
254         os_ << sm_._lookup->front ()->size () << "] = {";
255 
256         for (; i_ < count_; ++i_)
257         {
258             const std::size_t index_ = i_ * 8;
259 
260             os_ << lookup_[index_];
261 
262             for (; j_ < 8; ++j_)
263             {
264                 os_ << ", " << lookup_[index_ + j_];
265             }
266 
267             if (i_ < count_ - 1)
268             {
269                 os_ << "," << std::endl << "        ";
270             }
271 
272             j_ = 1;
273         }
274 
275         os_ << "};\n";
276         os_ << "    static const std::size_t dfa_alphabet_ = " <<
277             sm_._dfa_alphabet.front () << ";\n";
278         os_ << "    static const std::size_t dfa_[" <<
279             sm_._dfa->front ()->size () << "] = {";
280         count_ = sm_._dfa->front ()->size () / 8;
281 
282         for (i_ = 0; i_ < count_; ++i_)
283         {
284             const std::size_t index_ = i_ * 8;
285 
286             os_ << dfa_[index_];
287 
288             for (j_ = 1; j_ < 8; ++j_)
289             {
290                 os_ << ", " << dfa_[index_ + j_];
291             }
292 
293             if (i_ < count_ - 1)
294             {
295                 os_ << "," << std::endl << "        ";
296             }
297         }
298 
299         const std::size_t mod_ = sm_._dfa->front ()->size () % 8;
300 
301         if (mod_)
302         {
303             const std::size_t index_ = count_ * 8;
304 
305             if (count_)
306             {
307                 os_ << ",\n        ";
308             }
309 
310             os_ << dfa_[index_];
311 
312             for (j_ = 1; j_ < mod_; ++j_)
313             {
314                 os_ << ", " << dfa_[index_ + j_];
315             }
316         }
317 
318         os_ << "};\n";
319     }
320 
321     os_ << "\n    if (start_token_ == end_)\n";
322     os_ << "    {\n";
323     os_ << "        unique_id_ = npos;\n";
324     os_ << "        return 0;\n";
325     os_ << "    }\n\n";
326 
327     if (dfas_ > 1)
328     {
329         os_ << "again:\n";
330         os_ << "    const std::size_t * lookup_ = "
331             "lookup_arr_[start_state_];\n";
332         os_ << "    std::size_t dfa_alphabet_ = "
333             "dfa_alphabet_arr_[start_state_];\n";
334         os_ << "    const std::size_t *dfa_ = dfa_arr_[start_state_];\n";
335     }
336 
337     os_ << "    const std::size_t *ptr_ = dfa_ + dfa_alphabet_;\n";
338     os_ << "    Iterator curr_ = start_token_;\n";
339     os_ << "    bool end_state_ = *ptr_ != 0;\n";
340     os_ << "    std::size_t id_ = *(ptr_ + id_index);\n";
341     os_ << "    std::size_t uid_ = *(ptr_ + unique_id_index);\n";
342 
343     if (dfas_ > 1)
344     {
345         os_ << "    std::size_t end_start_state_ = start_state_;\n";
346     }
347 
348     if (sm_._seen_BOL_assertion)
349     {
350         os_ << "    bool bol_ = beg_of_line_;\n";
351         os_ << "    bool end_bol_ = bol_;\n";
352     }
353 
354     os_ << "    Iterator end_token_ = start_token_;\n";
355     os_ << '\n';
356     os_ << "    while (curr_ != end_)\n";
357     os_ << "    {\n";
358 
359     if (sm_._seen_BOL_assertion)
360     {
361         os_ << "        const std::size_t BOL_state_ = ptr_[bol_index];\n";
362     }
363 
364     if (sm_._seen_EOL_assertion)
365     {
366         os_ << "        const std::size_t EOL_state_ = ptr_[eol_index];\n";
367     }
368 
369     if (sm_._seen_BOL_assertion || sm_._seen_EOL_assertion)
370     {
371         os_ << '\n';
372     }
373 
374     if (sm_._seen_BOL_assertion)
375     {
376         os_ << "        if (BOL_state_ && bol_)\n";
377         os_ << "        {\n";
378         os_ << "            ptr_ = &dfa_[BOL_state_ * dfa_alphabet_];\n";
379         os_ << "        }\n";
380     }
381 
382     if (sm_._seen_EOL_assertion)
383     {
384         os_ << "        ";
385 
386         if (sm_._seen_BOL_assertion)
387         {
388             os_ << "else ";
389         }
390 
391         os_ << "if (EOL_state_ && *curr_ == '\\n')\n";
392         os_ << "        {\n";
393         os_ << "            ptr_ = &dfa_[EOL_state_ * dfa_alphabet_];\n";
394         os_ << "        }\n";
395     }
396 
397     std::string tab_ (sm_._seen_BOL_assertion || sm_._seen_EOL_assertion ? "    " : "");
398 
399     if (sm_._seen_BOL_assertion || sm_._seen_EOL_assertion)
400     {
401         os_ << "        else\n";
402         os_ << "        {\n";
403     }
404 
405     if (sm_._seen_BOL_assertion)
406     {
407         os_ << "            ";
408 
409         if (lookups_ == 256)
410         {
411             os_ << "char";
412         }
413         else
414         {
415             os_ << "wchar_t";
416         }
417 
418         os_ << " prev_char_ = *curr_++;\n\n";
419         os_ << "            bol_ = prev_char_ == '\\n';\n\n";
420     }
421 
422     os_ << tab_;
423     os_ << "        const std::size_t state_ =\n";
424     os_ << tab_;
425     os_ << "            ptr_[lookup_[";
426 
427     if (lookups_ == 256)
428     {
429         os_ << "static_cast<unsigned char>(";
430     }
431 
432     if (sm_._seen_BOL_assertion)
433     {
434         os_ << "prev_char";
435     }
436     else
437     {
438         os_ << "*curr_++";
439     }
440 
441 
442     if (lookups_ == 256)
443     {
444         os_ << ')';
445     }
446 
447     os_ << "]];\n\n";
448 
449     os_ << tab_;
450     os_ << "        if (state_ == 0) break;\n\n";
451     os_ << tab_;
452     os_ << "        ptr_ = &dfa_[state_ * dfa_alphabet_];\n";
453 
454     if (sm_._seen_BOL_assertion || sm_._seen_EOL_assertion)
455     {
456         os_ << "        }\n";
457     }
458 
459     os_ << '\n';
460     os_ << "        if (*ptr_)\n";
461     os_ << "        {\n";
462     os_ << "            end_state_ = true;\n";
463     os_ << "            id_ = *(ptr_ + id_index);\n";
464     os_ << "            uid_ = *(ptr_ + unique_id_index);\n";
465 
466     if (dfas_ > 1)
467     {
468         os_ << "            end_start_state_ = *(ptr_ + state_index);\n";
469     }
470 
471     if (sm_._seen_BOL_assertion)
472     {
473         os_ << "            end_bol_ = bol_;\n";
474     }
475 
476     os_ << "            end_token_ = curr_;\n";
477     os_ << "        }\n";
478     os_ << "    }\n";
479     os_ << '\n';
480 
481     if (sm_._seen_EOL_assertion)
482     {
483         os_ << "    const std::size_t EOL_state_ = ptr_[eol_index];\n";
484         os_ << '\n';
485         os_ << "    if (EOL_state_ && curr_ == end_)\n";
486         os_ << "    {\n";
487         os_ << "        ptr_ = &dfa_[EOL_state_ * dfa_alphabet_];\n";
488         os_ << '\n';
489         os_ << "        if (*ptr_)\n";
490         os_ << "        {\n";
491         os_ << "            end_state_ = true;\n";
492         os_ << "            id_ = *(ptr_ + id_index);\n";
493         os_ << "            uid_ = *(ptr_ + unique_id_index);\n";
494 
495         if (dfas_ > 1)
496         {
497             os_ << "            end_start_state_ = *(ptr_ + state_index);\n";
498         }
499 
500         if (sm_._seen_BOL_assertion)
501         {
502             os_ << "            end_bol_ = bol_;\n";
503         }
504 
505         os_ << "            end_token_ = curr_;\n";
506         os_ << "        }\n";
507         os_ << "    }\n";
508         os_ << '\n';
509     }
510 
511     os_ << "    if (end_state_)\n";
512     os_ << "    {\n";
513     os_ << "        // return longest match\n";
514 
515     if (dfas_ > 1)
516     {
517         os_ << "        start_state_ = end_start_state_;\n";
518     }
519 
520     if (sm_._seen_BOL_assertion && dfas_ < 2)
521     {
522         os_ << "        beg_of_line_ = end_bol_;\n";
523     }
524 
525     os_ << "        start_token_ = end_token_;\n";
526 
527     if (dfas_ > 1)
528     {
529         os_ << '\n';
530         os_ << "        if (id_ == 0)\n";
531         os_ << "        {\n";
532 
533         if (sm_._seen_BOL_assertion)
534         {
535             os_ << "            bol_ = end_bol_;\n";
536         }
537 
538         os_ << "            goto again;\n";
539         os_ << "        }\n";
540 
541         if (sm_._seen_BOL_assertion)
542         {
543             os_ << "        else\n";
544             os_ << "        {\n";
545             os_ << "            beg_of_line_ = end_bol_;\n";
546             os_ << "        }\n";
547         }
548     }
549 
550     os_ << "    }\n";
551     os_ << "    else\n";
552     os_ << "    {\n";
553 
554     if (sm_._seen_BOL_assertion)
555     {
556         os_ << "        beg_of_line_ = *start_token_ == '\\n';\n";
557     }
558 
559     if (skip_unknown_)
560     {
561         os_ << "        // No match causes char to be skipped\n";
562         os_ << "        ++start_token_;\n";
563     }
564 
565     os_ << "        id_ = npos;\n";
566     os_ << "        uid_ = npos;\n";
567     os_ << "    }\n";
568     os_ << '\n';
569     os_ << "    unique_id_ = uid_;\n";
570     os_ << "    return id_;\n";
571     os_ << "}\n";
572     os_ << "\n#endif\n";
573 }
574 }
575 }
576 
577 #endif
578