1 // generate_cpp.hpp
2 // Copyright (c) 2008-2009 Ben Hanson (http://www.benhanson.net/)
3 //
4 // Distributed under the Boost Software License, Version 1.0. (See accompanying
5 // file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
6 #ifndef BOOST_SPIRIT_SUPPORT_DETAIL_LEXER_GENERATE_CPP_HPP
7 #define BOOST_SPIRIT_SUPPORT_DETAIL_LEXER_GENERATE_CPP_HPP
8 
9 #include "char_traits.hpp"
10 #include "consts.hpp"
11 #include "internals.hpp"
12 #include <iostream>
13 #include "runtime_error.hpp"
14 #include "size_t.hpp"
15 #include "state_machine.hpp"
16 #include <vector>
17 
18 namespace boost
19 {
20 namespace lexer
21 {
22 template<typename CharT>
generate_cpp(const basic_state_machine<CharT> & state_machine_,std::ostream & os_,const bool use_pointers_=false,const bool skip_unknown_=true,const bool optimise_parameters_=true,const char * name_="next_token")23 void generate_cpp (const basic_state_machine<CharT> &state_machine_,
24     std::ostream &os_, const bool use_pointers_ = false,
25     const bool skip_unknown_ = true, const bool optimise_parameters_ = true,
26     const char *name_ = "next_token")
27 {
28     const detail::internals &sm_ = state_machine_.data ();
29 
30     if (sm_._lookup->size () == 0)
31     {
32         throw runtime_error ("Cannot generate code from an empty "
33             "state machine");
34     }
35 
36     std::string upper_name_ (__DATE__);
37     const std::size_t lookups_ = sm_._lookup->front ()->size ();
38     const std::size_t dfas_ = sm_._dfa->size ();
39     std::string::size_type pos_ = upper_name_.find (' ');
40     const char *iterator_ = 0;
41 
42     if (use_pointers_)
43     {
44         if (lookups_ == 256)
45         {
46             iterator_ = "const char *";
47         }
48         else
49         {
50             iterator_ = "const wchar_t *";
51         }
52     }
53     else
54     {
55         iterator_ = "Iterator &";
56     }
57 
58     while (pos_ != std::string::npos)
59     {
60         upper_name_.replace (pos_, 1, "_");
61         pos_ = upper_name_.find (' ', pos_);
62     }
63 
64     upper_name_ += '_';
65     upper_name_ +=  __TIME__;
66 
67     pos_ = upper_name_.find (':');
68 
69     while (pos_ != std::string::npos)
70     {
71         upper_name_.erase (pos_, 1);
72         pos_ = upper_name_.find (':', pos_);
73     }
74 
75     upper_name_ = '_' + upper_name_;
76     upper_name_ = name_ + upper_name_;
77     std::transform (upper_name_.begin (), upper_name_.end (),
78         upper_name_.begin (), ::toupper);
79     os_ << "#ifndef " << upper_name_ + '\n';
80     os_ << "#define " << upper_name_ + '\n';
81     os_ << "// Copyright (c) 2008-2009 Ben Hanson\n";
82     os_ << "//\n";
83     os_ << "// Distributed under the Boost Software License, "
84         "Version 1.0. (See accompanying\n";
85     os_ << "// file licence_1_0.txt or copy at "
86         "http://www.boost.org/LICENSE_1_0.txt)\n\n";
87     os_ << "// Auto-generated by boost::lexer\n";
88     os_ << "template<typename Iterator>\n";
89     os_ << "std::size_t " << name_  << " (";
90 
91     if (dfas_ > 1 || !optimise_parameters_)
92     {
93         os_ << "std::size_t &start_state_, ";
94     }
95 
96     if (use_pointers_)
97     {
98         os_ << iterator_ << " &";
99     }
100     else
101     {
102         os_ << iterator_;
103     }
104 
105     os_ << "start_token_, ";
106 
107     if (use_pointers_)
108     {
109         os_ << iterator_ << " const ";
110     }
111     else
112     {
113         os_ << "const " << iterator_;
114     }
115 
116     os_ << "end_, \n";
117     os_ << "    std::size_t &unique_id_";
118 
119     if (sm_._seen_BOL_assertion || !optimise_parameters_)
120     {
121         os_ << ", bool &beg_of_line_";
122     }
123 
124     os_ << ")\n";
125     os_ << "{\n";
126     os_ << "    enum {end_state_index, id_index, unique_id_index, state_index, bol_index,\n";
127     os_ << "        eol_index, dead_state_index, dfa_offset};\n";
128     os_ << "    static const std::size_t npos = static_cast"
129         "<std::size_t>(~0);\n";
130 
131     if (dfas_ > 1)
132     {
133         std::size_t state_ = 0;
134 
135         for (; state_ < dfas_; ++state_)
136         {
137             std::size_t i_ = 0;
138             std::size_t j_ = 1;
139             std::size_t count_ = lookups_ / 8;
140             const std::size_t *lookup_ = &sm_._lookup[state_]->front ();
141             const std::size_t *dfa_ = &sm_._dfa[state_]->front ();
142 
143             os_ << "    static const std::size_t lookup" << state_ << "_[" <<
144                 lookups_ << "] = {";
145 
146             for (; i_ < count_; ++i_)
147             {
148                 const std::size_t index_ = i_ * 8;
149 
150                 os_ << lookup_[index_];
151 
152                 for (; j_ < 8; ++j_)
153                 {
154                     os_ << ", " << lookup_[index_ + j_];
155                 }
156 
157                 if (i_ < count_ - 1)
158                 {
159                     os_ << "," << std::endl << "        ";
160                 }
161 
162                 j_ = 1;
163             }
164 
165             os_ << "};\n";
166             count_ = sm_._dfa[state_]->size ();
167             os_ << "    static const std::size_t dfa" << state_ << "_[" <<
168                 count_ << "] = {";
169             count_ /= 8;
170 
171             for (i_ = 0; i_ < count_; ++i_)
172             {
173                 const std::size_t index_ = i_ * 8;
174 
175                 os_ << dfa_[index_];
176 
177                 for (j_ = 1; j_ < 8; ++j_)
178                 {
179                     os_ << ", " << dfa_[index_ + j_];
180                 }
181 
182                 if (i_ < count_ - 1)
183                 {
184                     os_ << "," << std::endl << "        ";
185                 }
186             }
187 
188             const std::size_t mod_ = sm_._dfa[state_]->size () % 8;
189 
190             if (mod_)
191             {
192                 const std::size_t index_ = count_ * 8;
193 
194                 if (count_)
195                 {
196                     os_ << ",\n        ";
197                 }
198 
199                 os_ << dfa_[index_];
200 
201                 for (j_ = 1; j_ < mod_; ++j_)
202                 {
203                     os_ << ", " << dfa_[index_ + j_];
204                 }
205             }
206 
207             os_ << "};\n";
208         }
209 
210         std::size_t count_ = sm_._dfa_alphabet.size ();
211         std::size_t i_ = 1;
212 
213         os_ << "    static const std::size_t *lookup_arr_[" << count_ <<
214             "] = {";
215         os_ << "lookup0_";
216 
217         for (i_ = 1; i_ < count_; ++i_)
218         {
219             os_ << ", " << "lookup" << i_ << "_";
220         }
221 
222         os_ << "};\n";
223         os_ << "    static const std::size_t dfa_alphabet_arr_[" << count_ <<
224             "] = {";
225         os_ << sm_._dfa_alphabet.front ();
226 
227         for (i_ = 1; i_ < count_; ++i_)
228         {
229             os_ << ", " << sm_._dfa_alphabet[i_];
230         }
231 
232         os_ << "};\n";
233         os_ << "    static const std::size_t *dfa_arr_[" << count_ <<
234             "] = {";
235         os_ << "dfa0_";
236 
237         for (i_ = 1; i_ < count_; ++i_)
238         {
239             os_ << ", " << "dfa" << i_ << "_";
240         }
241 
242         os_ << "};\n";
243     }
244     else
245     {
246         const std::size_t *lookup_ = &sm_._lookup->front ()->front ();
247         const std::size_t *dfa_ = &sm_._dfa->front ()->front ();
248         std::size_t i_ = 0;
249         std::size_t j_ = 1;
250         std::size_t count_ = lookups_ / 8;
251 
252         os_ << "    static const std::size_t lookup_[";
253         os_ << sm_._lookup->front ()->size () << "] = {";
254 
255         for (; i_ < count_; ++i_)
256         {
257             const std::size_t index_ = i_ * 8;
258 
259             os_ << lookup_[index_];
260 
261             for (; j_ < 8; ++j_)
262             {
263                 os_ << ", " << lookup_[index_ + j_];
264             }
265 
266             if (i_ < count_ - 1)
267             {
268                 os_ << "," << std::endl << "        ";
269             }
270 
271             j_ = 1;
272         }
273 
274         os_ << "};\n";
275         os_ << "    static const std::size_t dfa_alphabet_ = " <<
276             sm_._dfa_alphabet.front () << ";\n";
277         os_ << "    static const std::size_t dfa_[" <<
278             sm_._dfa->front ()->size () << "] = {";
279         count_ = sm_._dfa->front ()->size () / 8;
280 
281         for (i_ = 0; i_ < count_; ++i_)
282         {
283             const std::size_t index_ = i_ * 8;
284 
285             os_ << dfa_[index_];
286 
287             for (j_ = 1; j_ < 8; ++j_)
288             {
289                 os_ << ", " << dfa_[index_ + j_];
290             }
291 
292             if (i_ < count_ - 1)
293             {
294                 os_ << "," << std::endl << "        ";
295             }
296         }
297 
298         const std::size_t mod_ = sm_._dfa->front ()->size () % 8;
299 
300         if (mod_)
301         {
302             const std::size_t index_ = count_ * 8;
303 
304             if (count_)
305             {
306                 os_ << ",\n        ";
307             }
308 
309             os_ << dfa_[index_];
310 
311             for (j_ = 1; j_ < mod_; ++j_)
312             {
313                 os_ << ", " << dfa_[index_ + j_];
314             }
315         }
316 
317         os_ << "};\n";
318     }
319 
320     os_ << "\n    if (start_token_ == end_)\n";
321     os_ << "    {\n";
322     os_ << "        unique_id_ = npos;\n";
323     os_ << "        return 0;\n";
324     os_ << "    }\n\n";
325 
326     if (dfas_ > 1)
327     {
328         os_ << "again:\n";
329         os_ << "    const std::size_t * lookup_ = "
330             "lookup_arr_[start_state_];\n";
331         os_ << "    std::size_t dfa_alphabet_ = "
332             "dfa_alphabet_arr_[start_state_];\n";
333         os_ << "    const std::size_t *dfa_ = dfa_arr_[start_state_];\n";
334     }
335 
336     os_ << "    const std::size_t *ptr_ = dfa_ + dfa_alphabet_;\n";
337     os_ << "    Iterator curr_ = start_token_;\n";
338     os_ << "    bool end_state_ = *ptr_ != 0;\n";
339     os_ << "    std::size_t id_ = *(ptr_ + id_index);\n";
340     os_ << "    std::size_t uid_ = *(ptr_ + unique_id_index);\n";
341 
342     if (dfas_ > 1)
343     {
344         os_ << "    std::size_t end_start_state_ = start_state_;\n";
345     }
346 
347     if (sm_._seen_BOL_assertion)
348     {
349         os_ << "    bool bol_ = beg_of_line_;\n";
350         os_ << "    bool end_bol_ = bol_;\n";
351     }
352 
353     os_ << "    Iterator end_token_ = start_token_;\n";
354     os_ << '\n';
355     os_ << "    while (curr_ != end_)\n";
356     os_ << "    {\n";
357 
358     if (sm_._seen_BOL_assertion)
359     {
360         os_ << "        const std::size_t BOL_state_ = ptr_[bol_index];\n";
361     }
362 
363     if (sm_._seen_EOL_assertion)
364     {
365         os_ << "        const std::size_t EOL_state_ = ptr_[eol_index];\n";
366     }
367 
368     if (sm_._seen_BOL_assertion || sm_._seen_EOL_assertion)
369     {
370         os_ << '\n';
371     }
372 
373     if (sm_._seen_BOL_assertion)
374     {
375         os_ << "        if (BOL_state_ && bol_)\n";
376         os_ << "        {\n";
377         os_ << "            ptr_ = &dfa_[BOL_state_ * dfa_alphabet_];\n";
378         os_ << "        }\n";
379     }
380 
381     if (sm_._seen_EOL_assertion)
382     {
383         os_ << "        ";
384 
385         if (sm_._seen_BOL_assertion)
386         {
387             os_ << "else ";
388         }
389 
390         os_ << "if (EOL_state_ && *curr_ == '\\n')\n";
391         os_ << "        {\n";
392         os_ << "            ptr_ = &dfa_[EOL_state_ * dfa_alphabet_];\n";
393         os_ << "        }\n";
394     }
395 
396     std::string tab_ (sm_._seen_BOL_assertion || sm_._seen_EOL_assertion ? "    " : "");
397 
398     if (sm_._seen_BOL_assertion || sm_._seen_EOL_assertion)
399     {
400         os_ << "        else\n";
401         os_ << "        {\n";
402     }
403 
404     if (sm_._seen_BOL_assertion)
405     {
406         os_ << "            ";
407 
408         if (lookups_ == 256)
409         {
410             os_ << "char";
411         }
412         else
413         {
414             os_ << "wchar_t";
415         }
416 
417         os_ << " prev_char_ = *curr_++;\n\n";
418         os_ << "            bol_ = prev_char_ == '\\n';\n\n";
419     }
420 
421     os_ << tab_;
422     os_ << "        const std::size_t state_ =\n";
423     os_ << tab_;
424     os_ << "            ptr_[lookup_[";
425 
426     if (lookups_ == 256)
427     {
428         os_ << "static_cast<unsigned char>(";
429     }
430 
431     if (sm_._seen_BOL_assertion)
432     {
433         os_ << "prev_char";
434     }
435     else
436     {
437         os_ << "*curr_++";
438     }
439 
440 
441     if (lookups_ == 256)
442     {
443         os_ << ')';
444     }
445 
446     os_ << "]];\n\n";
447 
448     os_ << tab_;
449     os_ << "        if (state_ == 0) break;\n\n";
450     os_ << tab_;
451     os_ << "        ptr_ = &dfa_[state_ * dfa_alphabet_];\n";
452 
453     if (sm_._seen_BOL_assertion || sm_._seen_EOL_assertion)
454     {
455         os_ << "        }\n";
456     }
457 
458     os_ << '\n';
459     os_ << "        if (*ptr_)\n";
460     os_ << "        {\n";
461     os_ << "            end_state_ = true;\n";
462     os_ << "            id_ = *(ptr_ + id_index);\n";
463     os_ << "            uid_ = *(ptr_ + unique_id_index);\n";
464 
465     if (dfas_ > 1)
466     {
467         os_ << "            end_start_state_ = *(ptr_ + state_index);\n";
468     }
469 
470     if (sm_._seen_BOL_assertion)
471     {
472         os_ << "            end_bol_ = bol_;\n";
473     }
474 
475     os_ << "            end_token_ = curr_;\n";
476     os_ << "        }\n";
477     os_ << "    }\n";
478     os_ << '\n';
479 
480     if (sm_._seen_EOL_assertion)
481     {
482         os_ << "    const std::size_t EOL_state_ = ptr_[eol_index];\n";
483         os_ << '\n';
484         os_ << "    if (EOL_state_ && curr_ == end_)\n";
485         os_ << "    {\n";
486         os_ << "        ptr_ = &dfa_[EOL_state_ * dfa_alphabet_];\n";
487         os_ << '\n';
488         os_ << "        if (*ptr_)\n";
489         os_ << "        {\n";
490         os_ << "            end_state_ = true;\n";
491         os_ << "            id_ = *(ptr_ + id_index);\n";
492         os_ << "            uid_ = *(ptr_ + unique_id_index);\n";
493 
494         if (dfas_ > 1)
495         {
496             os_ << "            end_start_state_ = *(ptr_ + state_index);\n";
497         }
498 
499         if (sm_._seen_BOL_assertion)
500         {
501             os_ << "            end_bol_ = bol_;\n";
502         }
503 
504         os_ << "            end_token_ = curr_;\n";
505         os_ << "        }\n";
506         os_ << "    }\n";
507         os_ << '\n';
508     }
509 
510     os_ << "    if (end_state_)\n";
511     os_ << "    {\n";
512     os_ << "        // return longest match\n";
513 
514     if (dfas_ > 1)
515     {
516         os_ << "        start_state_ = end_start_state_;\n";
517     }
518 
519     if (sm_._seen_BOL_assertion && dfas_ < 2)
520     {
521         os_ << "        beg_of_line_ = end_bol_;\n";
522     }
523 
524     os_ << "        start_token_ = end_token_;\n";
525 
526     if (dfas_ > 1)
527     {
528         os_ << '\n';
529         os_ << "        if (id_ == 0)\n";
530         os_ << "        {\n";
531 
532         if (sm_._seen_BOL_assertion)
533         {
534             os_ << "            bol_ = end_bol_;\n";
535         }
536 
537         os_ << "            goto again;\n";
538         os_ << "        }\n";
539 
540         if (sm_._seen_BOL_assertion)
541         {
542             os_ << "        else\n";
543             os_ << "        {\n";
544             os_ << "            beg_of_line_ = end_bol_;\n";
545             os_ << "        }\n";
546         }
547     }
548 
549     os_ << "    }\n";
550     os_ << "    else\n";
551     os_ << "    {\n";
552 
553     if (sm_._seen_BOL_assertion)
554     {
555         os_ << "        beg_of_line_ = *start_token_ == '\\n';\n";
556     }
557 
558     if (skip_unknown_)
559     {
560         os_ << "        // No match causes char to be skipped\n";
561         os_ << "        ++start_token_;\n";
562     }
563 
564     os_ << "        id_ = npos;\n";
565     os_ << "        uid_ = npos;\n";
566     os_ << "    }\n";
567     os_ << '\n';
568     os_ << "    unique_id_ = uid_;\n";
569     os_ << "    return id_;\n";
570     os_ << "}\n";
571     os_ << "\n#endif\n";
572 }
573 }
574 }
575 
576 #endif
577