1 /*=============================================================================
2     Boost.Wave: A Standard compliant C++ preprocessor library
3     http://www.boost.org/
4 
5     Copyright (c) 2001-2010 Hartmut Kaiser. Distributed under the Boost
6     Software License, Version 1.0. (See accompanying file
7     LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
8 =============================================================================*/
9 
10 #if !defined(BOOST_WAVE_LEXERTL_LEXER_HPP_INCLUDED)
11 #define BOOST_WAVE_LEXERTL_LEXER_HPP_INCLUDED
12 
13 #include <fstream>
14 
15 #include <boost/iterator/iterator_traits.hpp>
16 
17 #include <boost/wave/wave_config.hpp>
18 #include <boost/wave/language_support.hpp>
19 #include <boost/wave/token_ids.hpp>
20 #include <boost/wave/util/time_conversion_helper.hpp>
21 
22 #include <boost/wave/cpplexer/validate_universal_char.hpp>
23 #include <boost/wave/cpplexer/convert_trigraphs.hpp>
24 #include <boost/wave/cpplexer/cpplexer_exceptions.hpp>
25 #if BOOST_WAVE_SUPPORT_PRAGMA_ONCE != 0
26 #include <boost/wave/cpplexer/detect_include_guards.hpp>
27 #endif
28 
29 #include "wave_lexertl_config.hpp"
30 #include "../lexertl_iterator.hpp"
31 
32 #if BOOST_WAVE_LEXERTL_USE_STATIC_TABLES != 0
33 #include "wave_lexertl_tables.hpp"
34 #else
35 #include <boost/spirit/home/support/detail/lexer/generator.hpp>
36 #include <boost/spirit/home/support/detail/lexer/rules.hpp>
37 #include <boost/spirit/home/support/detail/lexer/state_machine.hpp>
38 #include <boost/spirit/home/support/detail/lexer/consts.hpp>
39 //#include "lexertl/examples/serialise.hpp>
40 // #if BOOST_WAVE_LEXERTL_GENERATE_CPP_CODE != 0
41 // #include "lexertl/examples/cpp_code.hpp"
42 // #endif
43 #endif
44 
45 ///////////////////////////////////////////////////////////////////////////////
46 namespace boost { namespace wave { namespace cpplexer { namespace lexertl
47 {
48 
#if BOOST_WAVE_LEXERTL_USE_STATIC_TABLES == 0
///////////////////////////////////////////////////////////////////////////////
//  The following numbers are the array sizes of the token regex tables, which
//  we need to specify to make the CW compiler happy (at least up to V9.5).
//  Every size includes the terminating sentinel entry of its table and has to
//  follow each conditionally compiled entry of that table, otherwise the
//  table definition ends up with more initializers than array elements.
#if BOOST_WAVE_SUPPORT_MS_EXTENSIONS != 0
#define INIT_DATA_SIZE              176
#else
#define INIT_DATA_SIZE              159
#endif
//  init_data_cpp carries one additional entry (T_IMPORT) if the 'import'
//  keyword is supported
#if BOOST_WAVE_SUPPORT_IMPORT_KEYWORD != 0
#define INIT_DATA_CPP_SIZE          16
#else
#define INIT_DATA_CPP_SIZE          15
#endif
#define INIT_DATA_PP_NUMBER_SIZE    2
#define INIT_MACRO_DATA_SIZE        27
#endif // #if BOOST_WAVE_LEXERTL_USE_STATIC_TABLES == 0

//  this is just a hack to have a unique token id not otherwise used by Wave
#define T_ANYCTRL   T_LAST_TOKEN_ID
65 
66 ///////////////////////////////////////////////////////////////////////////////
67 namespace lexer
68 {
69 
70 ///////////////////////////////////////////////////////////////////////////////
71 //  this is the wrapper for the lexertl lexer library
72 template <typename Iterator, typename Position>
73 class lexertl
74 {
75 private:
76     typedef BOOST_WAVE_STRINGTYPE string_type;
77     typedef typename boost::detail::iterator_traits<Iterator>::value_type
78         char_type;
79 
80 public:
81     wave::token_id next_token(Iterator &first, Iterator const &last,
82         string_type& token_value);
83 
84 #if BOOST_WAVE_LEXERTL_USE_STATIC_TABLES != 0
lexertl()85     lexertl() {}
init_dfa(wave::language_support lang,Position const & pos,bool force_reinit=false)86     void init_dfa(wave::language_support lang, Position const& pos,
87         bool force_reinit = false) {}
is_initialized() const88     bool is_initialized() const { return true; }
89 #else
lexertl()90     lexertl() : has_compiled_dfa_(false) {}
91     bool init_dfa(wave::language_support lang, Position const& pos,
92         bool force_reinit = false);
is_initialized() const93     bool is_initialized() const { return has_compiled_dfa_; }
94 
95 // get time of last compilation
get_compilation_time()96     static std::time_t get_compilation_time()
97         { return compilation_time.get_time(); }
98 
99     bool load (std::istream& instrm);
100     bool save (std::ostream& outstrm);
101 
102 private:
103     boost::lexer::state_machine state_machine_;
104     bool has_compiled_dfa_;
105 
106 // initialization data (regular expressions for the token definitions)
107     struct lexer_macro_data {
108         char_type const *name;          // macro name
109         char_type const *macro;         // associated macro definition
110     };
111     static lexer_macro_data const init_macro_data[INIT_MACRO_DATA_SIZE];    // macro patterns
112 
113     struct lexer_data {
114         token_id tokenid;               // token data
115         char_type const *tokenregex;    // associated token to match
116     };
117     static lexer_data const init_data[INIT_DATA_SIZE];              // common patterns
118     static lexer_data const init_data_cpp[INIT_DATA_CPP_SIZE];      // C++ only patterns
119     static lexer_data const init_data_pp_number[INIT_DATA_PP_NUMBER_SIZE];  // pp-number only patterns
120 
121 // helper for calculation of the time of last compilation
122     static boost::wave::util::time_conversion_helper compilation_time;
123 #endif // #if BOOST_WAVE_LEXERTL_USE_STATIC_TABLES == 0
124 };
125 
126 #if BOOST_WAVE_LEXERTL_USE_STATIC_TABLES == 0
127 ///////////////////////////////////////////////////////////////////////////////
128 // get time of last compilation of this file
129 template <typename IteratorT, typename PositionT>
130 boost::wave::util::time_conversion_helper
131     lexertl<IteratorT, PositionT>::compilation_time(__DATE__ " " __TIME__);
132 
133 ///////////////////////////////////////////////////////////////////////////////
134 // token regex definitions
135 
//  Helpers for writing down the token data and macro definitions below. All
//  of them rely on the concatenation of adjacent string literals:
//    MACRO_DATA / TOKEN_DATA   brace initializer for one table entry
//    OR                        the regex alternation operator
//    TRI(c)                    'c' prefixed with a reference to the TRI macro
//    Q(c)                      'c' prefixed with a (regex) escaping backslash
#define MACRO_DATA(name, macro) { name, macro }
#define TOKEN_DATA(id, regex)   { id, regex }
#define OR                      "|"
#define TRI(c)                  "{TRI}" c
#define Q(c)                    "\\" c
142 
143 // lexertl macro definitions
144 template <typename Iterator, typename Position>
145 typename lexertl<Iterator, Position>::lexer_macro_data const
146 lexertl<Iterator, Position>::init_macro_data[INIT_MACRO_DATA_SIZE] =
147 {
148     MACRO_DATA("ANY", "[\t\v\f\r\n\\040-\\377]"),
149     MACRO_DATA("ANYCTRL", "[\\000-\\037]"),
150     MACRO_DATA("TRI", "\\?\\?"),
151     MACRO_DATA("BLANK", "[ \t\v\f]"),
152     MACRO_DATA("CCOMMENT", "\\/\\*[^*]*\\*+([^/*][^*]*\\*+)*\\/"),
153     MACRO_DATA("PPSPACE", "(" "{BLANK}" OR "{CCOMMENT}" ")*"),
154     MACRO_DATA("OCTALDIGIT", "[0-7]"),
155     MACRO_DATA("DIGIT", "[0-9]"),
156     MACRO_DATA("HEXDIGIT", "[0-9a-fA-F]"),
157     MACRO_DATA("OPTSIGN", "[-+]?"),
158     MACRO_DATA("EXPSTART", "[eE][-+]"),
159     MACRO_DATA("EXPONENT", "([eE]{OPTSIGN}{DIGIT}+)"),
160     MACRO_DATA("NONDIGIT", "[a-zA-Z_]"),
161     MACRO_DATA("INTEGER", "(" "(0x|0X){HEXDIGIT}+" OR "0{OCTALDIGIT}*" OR "[1-9]{DIGIT}*" ")"),
162     MACRO_DATA("INTEGER_SUFFIX", "(" "[uU][lL]?" OR "[lL][uU]?" ")"),
163 #if BOOST_WAVE_SUPPORT_MS_EXTENSIONS != 0
164     MACRO_DATA("LONGINTEGER_SUFFIX", "([uU]([lL][lL])|([lL][lL])[uU]?|i64)"),
165 #else
166     MACRO_DATA("LONGINTEGER_SUFFIX", "([uU]([lL][lL])|([lL][lL])[uU]?)"),
167 #endif
168     MACRO_DATA("FLOAT_SUFFIX", "(" "[fF][lL]?" OR "[lL][fF]?" ")"),
169     MACRO_DATA("CHAR_SPEC", "L?"),
170     MACRO_DATA("BACKSLASH", "(" Q("\\") OR TRI(Q("/")) ")"),
171     MACRO_DATA("ESCAPESEQ", "{BACKSLASH}([abfnrtv?'\"]|{BACKSLASH}|x{HEXDIGIT}+|{OCTALDIGIT}{1,3})"),
172     MACRO_DATA("HEXQUAD", "{HEXDIGIT}{4}"),
173     MACRO_DATA("UNIVERSALCHAR", "{BACKSLASH}(u{HEXQUAD}|U{HEXQUAD}{2})"),
174     MACRO_DATA("POUNDDEF", "(" "#" OR TRI("=") OR Q("%:") ")"),
175     MACRO_DATA("NEWLINEDEF", "(" "\\n" OR "\\r" OR "\\r\\n" ")"),
176 #if BOOST_WAVE_SUPPORT_INCLUDE_NEXT != 0
177     MACRO_DATA("INCLUDEDEF", "(include|include_next)"),
178 #else
179     MACRO_DATA("INCLUDEDEF", "include"),
180 #endif
181     MACRO_DATA("PP_NUMBERDEF", "\\.?{DIGIT}({DIGIT}|{NONDIGIT}|{EXPSTART}|\\.)*"),
182     MACRO_DATA(NULL, NULL)      // should be the last entry
183 };
184 
185 // common C++/C99 token definitions
186 template <typename Iterator, typename Position>
187 typename lexertl<Iterator, Position>::lexer_data const
188 lexertl<Iterator, Position>::init_data[INIT_DATA_SIZE] =
189 {
190     TOKEN_DATA(T_AND, "&"),
191     TOKEN_DATA(T_ANDAND, "&&"),
192     TOKEN_DATA(T_ASSIGN, "="),
193     TOKEN_DATA(T_ANDASSIGN, "&="),
194     TOKEN_DATA(T_OR, Q("|")),
195     TOKEN_DATA(T_OR_TRIGRAPH, "{TRI}!"),
196     TOKEN_DATA(T_ORASSIGN, Q("|=")),
197     TOKEN_DATA(T_ORASSIGN_TRIGRAPH, "{TRI}!="),
198     TOKEN_DATA(T_XOR, Q("^")),
199     TOKEN_DATA(T_XOR_TRIGRAPH, "{TRI}'"),
200     TOKEN_DATA(T_XORASSIGN, Q("^=")),
201     TOKEN_DATA(T_XORASSIGN_TRIGRAPH, "{TRI}'="),
202     TOKEN_DATA(T_COMMA, ","),
203     TOKEN_DATA(T_COLON, ":"),
204     TOKEN_DATA(T_DIVIDEASSIGN, Q("/=")),
205     TOKEN_DATA(T_DIVIDE, Q("/")),
206     TOKEN_DATA(T_DOT, Q(".")),
207     TOKEN_DATA(T_ELLIPSIS, Q(".") "{3}"),
208     TOKEN_DATA(T_EQUAL, "=="),
209     TOKEN_DATA(T_GREATER, ">"),
210     TOKEN_DATA(T_GREATEREQUAL, ">="),
211     TOKEN_DATA(T_LEFTBRACE, Q("{")),
212     TOKEN_DATA(T_LEFTBRACE_ALT, "<" Q("%")),
213     TOKEN_DATA(T_LEFTBRACE_TRIGRAPH, "{TRI}<"),
214     TOKEN_DATA(T_LESS, "<"),
215     TOKEN_DATA(T_LESSEQUAL, "<="),
216     TOKEN_DATA(T_LEFTPAREN, Q("(")),
217     TOKEN_DATA(T_LEFTBRACKET, Q("[")),
218     TOKEN_DATA(T_LEFTBRACKET_ALT, "<:"),
219     TOKEN_DATA(T_LEFTBRACKET_TRIGRAPH, "{TRI}" Q("(")),
220     TOKEN_DATA(T_MINUS, Q("-")),
221     TOKEN_DATA(T_MINUSASSIGN, Q("-=")),
222     TOKEN_DATA(T_MINUSMINUS, Q("-") "{2}"),
223     TOKEN_DATA(T_PERCENT, Q("%")),
224     TOKEN_DATA(T_PERCENTASSIGN, Q("%=")),
225     TOKEN_DATA(T_NOT, "!"),
226     TOKEN_DATA(T_NOTEQUAL, "!="),
227     TOKEN_DATA(T_OROR, Q("|") "{2}"),
228     TOKEN_DATA(T_OROR_TRIGRAPH, "{TRI}!\\||\\|{TRI}!|{TRI}!{TRI}!"),
229     TOKEN_DATA(T_PLUS, Q("+")),
230     TOKEN_DATA(T_PLUSASSIGN, Q("+=")),
231     TOKEN_DATA(T_PLUSPLUS, Q("+") "{2}"),
232     TOKEN_DATA(T_ARROW, Q("->")),
233     TOKEN_DATA(T_QUESTION_MARK, Q("?")),
234     TOKEN_DATA(T_RIGHTBRACE, Q("}")),
235     TOKEN_DATA(T_RIGHTBRACE_ALT, Q("%>")),
236     TOKEN_DATA(T_RIGHTBRACE_TRIGRAPH, "{TRI}>"),
237     TOKEN_DATA(T_RIGHTPAREN, Q(")")),
238     TOKEN_DATA(T_RIGHTBRACKET, Q("]")),
239     TOKEN_DATA(T_RIGHTBRACKET_ALT, ":>"),
240     TOKEN_DATA(T_RIGHTBRACKET_TRIGRAPH, "{TRI}" Q(")")),
241     TOKEN_DATA(T_SEMICOLON, ";"),
242     TOKEN_DATA(T_SHIFTLEFT, "<<"),
243     TOKEN_DATA(T_SHIFTLEFTASSIGN, "<<="),
244     TOKEN_DATA(T_SHIFTRIGHT, ">>"),
245     TOKEN_DATA(T_SHIFTRIGHTASSIGN, ">>="),
246     TOKEN_DATA(T_STAR, Q("*")),
247     TOKEN_DATA(T_COMPL, Q("~")),
248     TOKEN_DATA(T_COMPL_TRIGRAPH, "{TRI}-"),
249     TOKEN_DATA(T_STARASSIGN, Q("*=")),
250     TOKEN_DATA(T_ASM, "asm"),
251     TOKEN_DATA(T_AUTO, "auto"),
252     TOKEN_DATA(T_BOOL, "bool"),
253     TOKEN_DATA(T_FALSE, "false"),
254     TOKEN_DATA(T_TRUE, "true"),
255     TOKEN_DATA(T_BREAK, "break"),
256     TOKEN_DATA(T_CASE, "case"),
257     TOKEN_DATA(T_CATCH, "catch"),
258     TOKEN_DATA(T_CHAR, "char"),
259     TOKEN_DATA(T_CLASS, "class"),
260     TOKEN_DATA(T_CONST, "const"),
261     TOKEN_DATA(T_CONSTCAST, "const_cast"),
262     TOKEN_DATA(T_CONTINUE, "continue"),
263     TOKEN_DATA(T_DEFAULT, "default"),
264     TOKEN_DATA(T_DELETE, "delete"),
265     TOKEN_DATA(T_DO, "do"),
266     TOKEN_DATA(T_DOUBLE, "double"),
267     TOKEN_DATA(T_DYNAMICCAST, "dynamic_cast"),
268     TOKEN_DATA(T_ELSE, "else"),
269     TOKEN_DATA(T_ENUM, "enum"),
270     TOKEN_DATA(T_EXPLICIT, "explicit"),
271     TOKEN_DATA(T_EXPORT, "export"),
272     TOKEN_DATA(T_EXTERN, "extern"),
273     TOKEN_DATA(T_FLOAT, "float"),
274     TOKEN_DATA(T_FOR, "for"),
275     TOKEN_DATA(T_FRIEND, "friend"),
276     TOKEN_DATA(T_GOTO, "goto"),
277     TOKEN_DATA(T_IF, "if"),
278     TOKEN_DATA(T_INLINE, "inline"),
279     TOKEN_DATA(T_INT, "int"),
280     TOKEN_DATA(T_LONG, "long"),
281     TOKEN_DATA(T_MUTABLE, "mutable"),
282     TOKEN_DATA(T_NAMESPACE, "namespace"),
283     TOKEN_DATA(T_NEW, "new"),
284     TOKEN_DATA(T_OPERATOR, "operator"),
285     TOKEN_DATA(T_PRIVATE, "private"),
286     TOKEN_DATA(T_PROTECTED, "protected"),
287     TOKEN_DATA(T_PUBLIC, "public"),
288     TOKEN_DATA(T_REGISTER, "register"),
289     TOKEN_DATA(T_REINTERPRETCAST, "reinterpret_cast"),
290     TOKEN_DATA(T_RETURN, "return"),
291     TOKEN_DATA(T_SHORT, "short"),
292     TOKEN_DATA(T_SIGNED, "signed"),
293     TOKEN_DATA(T_SIZEOF, "sizeof"),
294     TOKEN_DATA(T_STATIC, "static"),
295     TOKEN_DATA(T_STATICCAST, "static_cast"),
296     TOKEN_DATA(T_STRUCT, "struct"),
297     TOKEN_DATA(T_SWITCH, "switch"),
298     TOKEN_DATA(T_TEMPLATE, "template"),
299     TOKEN_DATA(T_THIS, "this"),
300     TOKEN_DATA(T_THROW, "throw"),
301     TOKEN_DATA(T_TRY, "try"),
302     TOKEN_DATA(T_TYPEDEF, "typedef"),
303     TOKEN_DATA(T_TYPEID, "typeid"),
304     TOKEN_DATA(T_TYPENAME, "typename"),
305     TOKEN_DATA(T_UNION, "union"),
306     TOKEN_DATA(T_UNSIGNED, "unsigned"),
307     TOKEN_DATA(T_USING, "using"),
308     TOKEN_DATA(T_VIRTUAL, "virtual"),
309     TOKEN_DATA(T_VOID, "void"),
310     TOKEN_DATA(T_VOLATILE, "volatile"),
311     TOKEN_DATA(T_WCHART, "wchar_t"),
312     TOKEN_DATA(T_WHILE, "while"),
313     TOKEN_DATA(T_PP_DEFINE, "{POUNDDEF}{PPSPACE}define"),
314     TOKEN_DATA(T_PP_IF, "{POUNDDEF}{PPSPACE}if"),
315     TOKEN_DATA(T_PP_IFDEF, "{POUNDDEF}{PPSPACE}ifdef"),
316     TOKEN_DATA(T_PP_IFNDEF, "{POUNDDEF}{PPSPACE}ifndef"),
317     TOKEN_DATA(T_PP_ELSE, "{POUNDDEF}{PPSPACE}else"),
318     TOKEN_DATA(T_PP_ELIF, "{POUNDDEF}{PPSPACE}elif"),
319     TOKEN_DATA(T_PP_ENDIF, "{POUNDDEF}{PPSPACE}endif"),
320     TOKEN_DATA(T_PP_ERROR, "{POUNDDEF}{PPSPACE}error"),
321     TOKEN_DATA(T_PP_QHEADER, "{POUNDDEF}{PPSPACE}{INCLUDEDEF}{PPSPACE}" Q("\"") "[^\\n\\r\"]+" Q("\"")),
322     TOKEN_DATA(T_PP_HHEADER, "{POUNDDEF}{PPSPACE}{INCLUDEDEF}{PPSPACE}" "<" "[^\\n\\r>]+" ">"),
323     TOKEN_DATA(T_PP_INCLUDE, "{POUNDDEF}{PPSPACE}{INCLUDEDEF}{PPSPACE}"),
324     TOKEN_DATA(T_PP_LINE, "{POUNDDEF}{PPSPACE}line"),
325     TOKEN_DATA(T_PP_PRAGMA, "{POUNDDEF}{PPSPACE}pragma"),
326     TOKEN_DATA(T_PP_UNDEF, "{POUNDDEF}{PPSPACE}undef"),
327     TOKEN_DATA(T_PP_WARNING, "{POUNDDEF}{PPSPACE}warning"),
328 #if BOOST_WAVE_SUPPORT_MS_EXTENSIONS != 0
329     TOKEN_DATA(T_MSEXT_INT8, "__int8"),
330     TOKEN_DATA(T_MSEXT_INT16, "__int16"),
331     TOKEN_DATA(T_MSEXT_INT32, "__int32"),
332     TOKEN_DATA(T_MSEXT_INT64, "__int64"),
333     TOKEN_DATA(T_MSEXT_BASED, "_?" "_based"),
334     TOKEN_DATA(T_MSEXT_DECLSPEC, "_?" "_declspec"),
335     TOKEN_DATA(T_MSEXT_CDECL, "_?" "_cdecl"),
336     TOKEN_DATA(T_MSEXT_FASTCALL, "_?" "_fastcall"),
337     TOKEN_DATA(T_MSEXT_STDCALL, "_?" "_stdcall"),
338     TOKEN_DATA(T_MSEXT_TRY , "__try"),
339     TOKEN_DATA(T_MSEXT_EXCEPT, "__except"),
340     TOKEN_DATA(T_MSEXT_FINALLY, "__finally"),
341     TOKEN_DATA(T_MSEXT_LEAVE, "__leave"),
342     TOKEN_DATA(T_MSEXT_INLINE, "_?" "_inline"),
343     TOKEN_DATA(T_MSEXT_ASM, "_?" "_asm"),
344     TOKEN_DATA(T_MSEXT_PP_REGION, "{POUNDDEF}{PPSPACE}region"),
345     TOKEN_DATA(T_MSEXT_PP_ENDREGION, "{POUNDDEF}{PPSPACE}endregion"),
346 #endif // BOOST_WAVE_SUPPORT_MS_EXTENSIONS != 0
347     TOKEN_DATA(T_LONGINTLIT, "{INTEGER}{LONGINTEGER_SUFFIX}"),
348     TOKEN_DATA(T_INTLIT, "{INTEGER}{INTEGER_SUFFIX}?"),
349     TOKEN_DATA(T_FLOATLIT,
350         "(" "{DIGIT}*" Q(".") "{DIGIT}+" OR "{DIGIT}+" Q(".") "){EXPONENT}?{FLOAT_SUFFIX}?" OR
351         "{DIGIT}+{EXPONENT}{FLOAT_SUFFIX}?"),
352 #if BOOST_WAVE_USE_STRICT_LEXER != 0
353     TOKEN_DATA(T_IDENTIFIER,
354         "(" "{NONDIGIT}" OR "{UNIVERSALCHAR}" ")"
355         "(" "{NONDIGIT}" OR "{DIGIT}" OR "{UNIVERSALCHAR}" ")*"),
356 #else
357     TOKEN_DATA(T_IDENTIFIER,
358         "(" "{NONDIGIT}" OR Q("$") OR "{UNIVERSALCHAR}" ")"
359         "(" "{NONDIGIT}" OR Q("$") OR "{DIGIT}" OR "{UNIVERSALCHAR}" ")*"),
360 #endif
361     TOKEN_DATA(T_CCOMMENT, "{CCOMMENT}"),
362     TOKEN_DATA(T_CPPCOMMENT, Q("/") Q("/[^\\n\\r]*") "{NEWLINEDEF}" ),
363     TOKEN_DATA(T_CHARLIT,
364         "{CHAR_SPEC}" "'" "({ESCAPESEQ}|[^\\n\\r']|{UNIVERSALCHAR})+" "'"),
365     TOKEN_DATA(T_STRINGLIT,
366         "{CHAR_SPEC}" Q("\"") "({ESCAPESEQ}|[^\\n\\r\"]|{UNIVERSALCHAR})*" Q("\"")),
367     TOKEN_DATA(T_SPACE, "{BLANK}+"),
368     TOKEN_DATA(T_CONTLINE, Q("\\") "\\n"),
369     TOKEN_DATA(T_NEWLINE, "{NEWLINEDEF}"),
370     TOKEN_DATA(T_POUND_POUND, "##"),
371     TOKEN_DATA(T_POUND_POUND_ALT, Q("%:") Q("%:")),
372     TOKEN_DATA(T_POUND_POUND_TRIGRAPH, "({TRI}=){2}"),
373     TOKEN_DATA(T_POUND, "#"),
374     TOKEN_DATA(T_POUND_ALT, Q("%:")),
375     TOKEN_DATA(T_POUND_TRIGRAPH, "{TRI}="),
376     TOKEN_DATA(T_ANY_TRIGRAPH, "{TRI}\\/"),
377     TOKEN_DATA(T_ANY, "{ANY}"),
378     TOKEN_DATA(T_ANYCTRL, "{ANYCTRL}"),   // this should be the last recognized token
379     { token_id(0) }               // this should be the last entry
380 };
381 
382 // C++ only token definitions
383 template <typename Iterator, typename Position>
384 typename lexertl<Iterator, Position>::lexer_data const
385 lexertl<Iterator, Position>::init_data_cpp[INIT_DATA_CPP_SIZE] =
386 {
387     TOKEN_DATA(T_AND_ALT, "bitand"),
388     TOKEN_DATA(T_ANDASSIGN_ALT, "and_eq"),
389     TOKEN_DATA(T_ANDAND_ALT, "and"),
390     TOKEN_DATA(T_OR_ALT, "bitor"),
391     TOKEN_DATA(T_ORASSIGN_ALT, "or_eq"),
392     TOKEN_DATA(T_OROR_ALT, "or"),
393     TOKEN_DATA(T_XORASSIGN_ALT, "xor_eq"),
394     TOKEN_DATA(T_XOR_ALT, "xor"),
395     TOKEN_DATA(T_NOTEQUAL_ALT, "not_eq"),
396     TOKEN_DATA(T_NOT_ALT, "not"),
397     TOKEN_DATA(T_COMPL_ALT, "compl"),
398 #if BOOST_WAVE_SUPPORT_IMPORT_KEYWORD != 0
399     TOKEN_DATA(T_IMPORT, "import"),
400 #endif
401     TOKEN_DATA(T_ARROWSTAR, Q("->") Q("*")),
402     TOKEN_DATA(T_DOTSTAR, Q(".") Q("*")),
403     TOKEN_DATA(T_COLON_COLON, "::"),
404     { token_id(0) }       // this should be the last entry
405 };
406 
407 // pp-number specific token definitions
408 template <typename Iterator, typename Position>
409 typename lexertl<Iterator, Position>::lexer_data const
410 lexertl<Iterator, Position>::init_data_pp_number[INIT_DATA_PP_NUMBER_SIZE] =
411 {
412     TOKEN_DATA(T_PP_NUMBER, "{PP_NUMBERDEF}"),
413     { token_id(0) }       // this should be the last entry
414 };
415 
// the table helper macros are not needed beyond this point
#undef Q
#undef TRI
#undef OR
#undef TOKEN_DATA
#undef MACRO_DATA
421 
422 ///////////////////////////////////////////////////////////////////////////////
423 // initialize lexertl lexer from C++ token regex's
424 template <typename Iterator, typename Position>
425 inline bool
init_dfa(wave::language_support lang,Position const & pos,bool force_reinit)426 lexertl<Iterator, Position>::init_dfa(wave::language_support lang,
427     Position const& pos, bool force_reinit)
428 {
429     if (has_compiled_dfa_)
430         return true;
431 
432 std::ifstream dfa_in("wave_lexertl_lexer.dfa", std::ios::in|std::ios::binary);
433 
434     if (force_reinit || !dfa_in.is_open() || !load (dfa_in))
435     {
436         dfa_in.close();
437 
438         state_machine_.clear();
439 
440     // register macro definitions
441         boost::lexer::rules rules;
442         for (int k = 0; NULL != init_macro_data[k].name; ++k) {
443             rules.add_macro(init_macro_data[k].name, init_macro_data[k].macro);
444         }
445 
446     // if pp-numbers should be preferred, insert the corresponding rule first
447         if (wave::need_prefer_pp_numbers(lang)) {
448             for (int j = 0; 0 != init_data_pp_number[j].tokenid; ++j) {
449                 rules.add(init_data_pp_number[j].tokenregex,
450                     init_data_pp_number[j].tokenid);
451             }
452         }
453 
454     // if in C99 mode, some of the keywords are not valid
455         if (!wave::need_c99(lang)) {
456             for (int j = 0; 0 != init_data_cpp[j].tokenid; ++j) {
457                 rules.add(init_data_cpp[j].tokenregex,
458                     init_data_cpp[j].tokenid);
459             }
460         }
461 
462         for (int i = 0; 0 != init_data[i].tokenid; ++i) {
463             rules.add(init_data[i].tokenregex, init_data[i].tokenid);
464         }
465 
466     // generate minimized DFA
467         try {
468             boost::lexer::generator::build (rules, state_machine_);
469             boost::lexer::generator::minimise (state_machine_);
470         }
471         catch (std::runtime_error const& e) {
472             string_type msg("lexertl initialization error: ");
473             msg += e.what();
474             BOOST_WAVE_LEXER_THROW(wave::cpplexer::lexing_exception,
475                 unexpected_error, msg.c_str(),
476                 pos.get_line(), pos.get_column(), pos.get_file().c_str());
477             return false;
478         }
479 
480     std::ofstream dfa_out ("wave_lexertl_lexer.dfa",
481         std::ios::out|std::ios::binary|std::ios::trunc);
482 
483         if (dfa_out.is_open())
484             save (dfa_out);
485     }
486 
487     has_compiled_dfa_ = true;
488     return true;
489 }
490 #endif // BOOST_WAVE_LEXERTL_USE_STATIC_TABLES == 0
491 
492 ///////////////////////////////////////////////////////////////////////////////
493 // return next token from the input stream
494 template <typename Iterator, typename Position>
495 inline wave::token_id
next_token(Iterator & first,Iterator const & last,string_type & token_value)496 lexertl<Iterator, Position>::next_token(Iterator &first, Iterator const &last,
497     string_type& token_value)
498 {
499 #if BOOST_WAVE_LEXERTL_USE_STATIC_TABLES == 0
500     size_t const* const lookup = &state_machine_.data()._lookup[0]->front ();
501     size_t const dfa_alphabet = state_machine_.data()._dfa_alphabet[0];
502 
503     size_t const* dfa = &state_machine_.data()._dfa[0]->front();
504     size_t const* ptr = dfa + dfa_alphabet + boost::lexer::dfa_offset;
505 #else
506     const std::size_t *ptr = dfa + dfa_offset;
507 #endif // BOOST_WAVE_LEXERTL_USE_STATIC_TABLES == 0
508 
509     Iterator curr = first;
510     Iterator end_token = first;
511     bool end_state = (*ptr != 0);
512     size_t id = *(ptr + 1);
513 
514     while (curr != last) {
515         size_t const state = ptr[lookup[int(*curr)]];
516         if (0 == state)
517             break;
518         ++curr;
519 
520 #if BOOST_WAVE_LEXERTL_USE_STATIC_TABLES == 0
521         ptr = &dfa[state * (dfa_alphabet + boost::lexer::dfa_offset)];
522 #else
523         ptr = &dfa[state * dfa_offset];
524 #endif // BOOST_WAVE_LEXERTL_USE_STATIC_TABLES == 0
525 
526         if (0 != *ptr) {
527             end_state = true;
528             id = *(ptr + 1);
529             end_token = curr;
530         }
531     }
532 
533     if (end_state) {
534         if (T_ANY == id) {
535             id = TOKEN_FROM_ID(*first, UnknownTokenType);
536         }
537 
538         // return longest match
539         string_type str(first, end_token);
540         token_value.swap(str);
541         first = end_token;
542         return wave::token_id(id);
543     }
544     return T_EOF;
545 }
546 
547 #if BOOST_WAVE_LEXERTL_USE_STATIC_TABLES == 0
548 ///////////////////////////////////////////////////////////////////////////////
549 //  load the DFA tables to/from a stream
550 template <typename Iterator, typename Position>
551 inline bool
load(std::istream & instrm)552 lexertl<Iterator, Position>::load (std::istream& instrm)
553 {
554 // #if !defined(BOOST_WAVE_LEXERTL_GENERATE_CPP_CODE)
555 //     std::size_t version = 0;
556 //     boost::lexer::serialise::load_as_binary(instrm, state_machine_, version);
557 //     if (version != (std::size_t)get_compilation_time())
558 //         return false;       // too new for us
559 //     return instrm.good();
560 // #else
561     return false;   // always create the dfa when generating the C++ code
562 // #endif
563 }
564 
565 ///////////////////////////////////////////////////////////////////////////////
566 //  save the DFA tables to/from a stream
567 template <typename Iterator, typename Position>
568 inline bool
save(std::ostream & outstrm)569 lexertl<Iterator, Position>::save (std::ostream& outstrm)
570 {
571 // #if defined(BOOST_WAVE_LEXERTL_GENERATE_CPP_CODE)
572 //     cpp_code::generate(state_machine_, outstrm);
573 // #else
574 //     boost::lexer::serialise::save_as_binary(state_machine_, outstrm,
575 //         (std::size_t)get_compilation_time());
576 // #endif
577     return outstrm.good();
578 }
579 #endif // #if BOOST_WAVE_LEXERTL_USE_STATIC_TABLES == 0
580 
581 ///////////////////////////////////////////////////////////////////////////////
582 }   // namespace lexer
583 
584 ///////////////////////////////////////////////////////////////////////////////
585 template <typename Iterator, typename Position = wave::util::file_position_type>
586 class lexertl_functor
587 :   public lexertl_input_interface<wave::cpplexer::lex_token<Position> >
588 {
589 public:
590     typedef wave::util::position_iterator<Iterator, Position> iterator_type;
591     typedef typename boost::detail::iterator_traits<Iterator>::value_type
592         char_type;
593     typedef BOOST_WAVE_STRINGTYPE string_type;
594     typedef wave::cpplexer::lex_token<Position> token_type;
595 
lexertl_functor(Iterator const & first_,Iterator const & last_,Position const & pos_,wave::language_support language)596     lexertl_functor(Iterator const &first_, Iterator const &last_,
597             Position const &pos_, wave::language_support language)
598     :   first(first_, last_, pos_), language(language), at_eof(false)
599     {
600         lexer_.init_dfa(language, pos_);
601     }
~lexertl_functor()602     ~lexertl_functor() {}
603 
604 // get the next token from the input stream
get(token_type & result)605     token_type& get(token_type& result)
606     {
607         if (lexer_.is_initialized() && !at_eof) {
608             do {
609             // generate and return the next token
610             string_type token_val;
611             Position pos = first.get_position();   // begin of token position
612             wave::token_id id = lexer_.next_token(first, last, token_val);
613 
614                 if (T_CONTLINE != id) {
615                 //  The cast should avoid spurious warnings about missing case labels
616                 //  for the other token ids's.
617                     switch (static_cast<unsigned int>(id)) {
618                     case T_IDENTIFIER:
619                     // test identifier characters for validity (throws if
620                     // invalid chars found)
621                         if (!wave::need_no_character_validation(language)) {
622                             using wave::cpplexer::impl::validate_identifier_name;
623                             validate_identifier_name(token_val,
624                                 pos.get_line(), pos.get_column(), pos.get_file());
625                         }
626                         break;
627 
628                     case T_STRINGLIT:
629                     case T_CHARLIT:
630                     // test literal characters for validity (throws if invalid
631                     // chars found)
632                         if (wave::need_convert_trigraphs(language)) {
633                             using wave::cpplexer::impl::convert_trigraphs;
634                             token_val = convert_trigraphs(token_val);
635                         }
636                         if (!wave::need_no_character_validation(language)) {
637                             using wave::cpplexer::impl::validate_literal;
638                             validate_literal(token_val,
639                                 pos.get_line(), pos.get_column(), pos.get_file());
640                         }
641                         break;
642 
643                     case T_LONGINTLIT:  // supported in C99 and long_long mode
644                         if (!wave::need_long_long(language)) {
645                         // syntax error: not allowed in C++ mode
646                             BOOST_WAVE_LEXER_THROW(
647                                 wave::cpplexer::lexing_exception,
648                                 invalid_long_long_literal, token_val.c_str(),
649                                 pos.get_line(), pos.get_column(),
650                                 pos.get_file().c_str());
651                         }
652                         break;
653 
654 #if BOOST_WAVE_SUPPORT_INCLUDE_NEXT != 0
655                     case T_PP_HHEADER:
656                     case T_PP_QHEADER:
657                     case T_PP_INCLUDE:
658                     // convert to the corresponding ..._next token, if appropriate
659                         {
660                         // Skip '#' and whitespace and see whether we find an
661                         // 'include_next' here.
662                             typename string_type::size_type start = token_val.find("include");
663                             if (0 == token_val.compare(start, 12, "include_next", 12))
664                                 id = token_id(id | AltTokenType);
665                         }
666                         break;
667 #endif // BOOST_WAVE_SUPPORT_INCLUDE_NEXT != 0
668 
669                     case T_EOF:
670                     // T_EOF is returned as a valid token, the next call will
671                     // return T_EOI, i.e. the actual end of input
672                         at_eof = true;
673                         token_val.clear();
674                         break;
675 
676                     case T_OR_TRIGRAPH:
677                     case T_XOR_TRIGRAPH:
678                     case T_LEFTBRACE_TRIGRAPH:
679                     case T_RIGHTBRACE_TRIGRAPH:
680                     case T_LEFTBRACKET_TRIGRAPH:
681                     case T_RIGHTBRACKET_TRIGRAPH:
682                     case T_COMPL_TRIGRAPH:
683                     case T_POUND_TRIGRAPH:
684                     case T_ANY_TRIGRAPH:
685                         if (wave::need_convert_trigraphs(language))
686                         {
687                             using wave::cpplexer::impl::convert_trigraph;
688                             token_val = convert_trigraph(token_val);
689                         }
690                         break;
691 
692                     case T_ANYCTRL:
693                         // matched some unexpected character
694                         {
695                             // 21 is the max required size for a 64 bit integer
696                             // represented as a string
697                             char buffer[22];
698                             string_type msg("invalid character in input stream: '0x");
699 
700                             // for some systems sprintf is in namespace std
701                             using namespace std;
702                             sprintf(buffer, "%02x'", token_val[0]);
703                             msg += buffer;
704                             BOOST_WAVE_LEXER_THROW(
705                                 wave::cpplexer::lexing_exception,
706                                 generic_lexing_error,
707                                 msg.c_str(), pos.get_line(), pos.get_column(),
708                                 pos.get_file().c_str());
709                         }
710                         break;
711                     }
712 
713                     result = token_type(id, token_val, pos);
714 #if BOOST_WAVE_SUPPORT_PRAGMA_ONCE != 0
715                     return guards.detect_guard(result);
716 #else
717                     return result;
718 #endif
719                 }
720             } while (true);     // skip the T_CONTLINE token
721         }
722         return result = token_type();           // return T_EOI
723     }
724 
set_position(Position const & pos)725     void set_position(Position const &pos)
726     {
727         // set position has to change the file name and line number only
728         first.get_position().set_file(pos.get_file());
729         first.get_position().set_line(pos.get_line());
730     }
731 
732 #if BOOST_WAVE_SUPPORT_PRAGMA_ONCE != 0
has_include_guards(std::string & guard_name) const733     bool has_include_guards(std::string& guard_name) const
734         { return guards.detected(guard_name); }
735 #endif
736 
737 private:
738     iterator_type first;
739     iterator_type last;
740 
741     wave::language_support language;
742     bool at_eof;
743 #if BOOST_WAVE_SUPPORT_PRAGMA_ONCE != 0
744     include_guards<token_type> guards;
745 #endif
746 
747     static lexer::lexertl<iterator_type, Position> lexer_;
748 };
749 
750 template <typename Iterator, typename Position>
751 lexer::lexertl<
752     typename lexertl_functor<Iterator, Position>::iterator_type, Position>
753         lexertl_functor<Iterator, Position>::lexer_;
754 
// the table sizes and the helper token id are internal to this file
#undef T_ANYCTRL
#undef INIT_MACRO_DATA_SIZE
#undef INIT_DATA_PP_NUMBER_SIZE
#undef INIT_DATA_CPP_SIZE
#undef INIT_DATA_SIZE
760 
761 ///////////////////////////////////////////////////////////////////////////////
762 //
763 //  The new_lexer_gen<>::new_lexer function (declared in lexertl_interface.hpp)
764 //  should be defined inline, if the lex_functor shouldn't be instantiated
765 //  separately from the lex_iterator.
766 //
767 //  Separate (explicit) instantiation helps to reduce compilation time.
768 //
769 ///////////////////////////////////////////////////////////////////////////////
770 
//  new_lexer() is declared 'inline' unless the lexer is instantiated
//  separately.
#if BOOST_WAVE_SEPARATE_LEXER_INSTANTIATION == 0
#define BOOST_WAVE_FLEX_NEW_LEXER_INLINE inline
#else
#define BOOST_WAVE_FLEX_NEW_LEXER_INLINE
#endif
776 
777 ///////////////////////////////////////////////////////////////////////////////
778 //
779 //  The 'new_lexer' function allows the opaque generation of a new lexer object.
780 //  It is coupled to the iterator type to allow to decouple the lexer/iterator
781 //  configurations at compile time.
782 //
//  This function is declared inside the lexertl_interface.hpp file, which is
//  included by both the source file calling the lexer and the source file
//  instantiating the lexertl_functor. But it is defined here, so it will be
//  instantiated only while compiling the source file which instantiates the
//  lexertl_functor. While the lexertl_interface.hpp file may be included
//  everywhere, this file (lexertl_lexer.hpp) should be included only once.
//  This decouples the lexer interface from the lexer implementation and
//  reduces compilation time.
791 //
792 ///////////////////////////////////////////////////////////////////////////////
793 
794 template <typename Iterator, typename Position>
795 BOOST_WAVE_FLEX_NEW_LEXER_INLINE
796 wave::cpplexer::lex_input_interface<wave::cpplexer::lex_token<Position> > *
new_lexer(Iterator const & first,Iterator const & last,Position const & pos,wave::language_support language)797 new_lexer_gen<Iterator, Position>::new_lexer(Iterator const &first,
798     Iterator const &last, Position const &pos, wave::language_support language)
799 {
800     return new lexertl_functor<Iterator, Position>(first, last, pos, language);
801 }
802 
803 #undef BOOST_WAVE_FLEX_NEW_LEXER_INLINE
804 
805 ///////////////////////////////////////////////////////////////////////////////
806 }}}}   // namespace boost::wave::cpplexer::lexertl
807 
808 #endif // !defined(BOOST_WAVE_LEXERTL_LEXER_HPP_INCLUDED)
809 
810