1 /*=============================================================================
2 Boost.Wave: A Standard compliant C++ preprocessor library
3 http://www.boost.org/
4
5 Copyright (c) 2001-2010 Hartmut Kaiser. Distributed under the Boost
6 Software License, Version 1.0. (See accompanying file
7 LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
8 =============================================================================*/
9
10 #if !defined(BOOST_WAVE_LEXERTL_LEXER_HPP_INCLUDED)
11 #define BOOST_WAVE_LEXERTL_LEXER_HPP_INCLUDED
12
13 #include <fstream>
14
15 #include <boost/iterator/iterator_traits.hpp>
16
17 #include <boost/wave/wave_config.hpp>
18 #include <boost/wave/language_support.hpp>
19 #include <boost/wave/token_ids.hpp>
20 #include <boost/wave/util/time_conversion_helper.hpp>
21
22 #include <boost/wave/cpplexer/validate_universal_char.hpp>
23 #include <boost/wave/cpplexer/convert_trigraphs.hpp>
24 #include <boost/wave/cpplexer/cpplexer_exceptions.hpp>
25 #if BOOST_WAVE_SUPPORT_PRAGMA_ONCE != 0
26 #include <boost/wave/cpplexer/detect_include_guards.hpp>
27 #endif
28
29 #include "wave_lexertl_config.hpp"
30 #include "../lexertl_iterator.hpp"
31
32 #if BOOST_WAVE_LEXERTL_USE_STATIC_TABLES != 0
33 #include "wave_lexertl_tables.hpp"
34 #else
35 #include <boost/spirit/home/support/detail/lexer/generator.hpp>
36 #include <boost/spirit/home/support/detail/lexer/rules.hpp>
37 #include <boost/spirit/home/support/detail/lexer/state_machine.hpp>
38 #include <boost/spirit/home/support/detail/lexer/consts.hpp>
39 //#include "lexertl/examples/serialise.hpp>
40 // #if BOOST_WAVE_LEXERTL_GENERATE_CPP_CODE != 0
41 // #include "lexertl/examples/cpp_code.hpp"
42 // #endif
43 #endif
44
45 ///////////////////////////////////////////////////////////////////////////////
46 namespace boost { namespace wave { namespace cpplexer { namespace lexertl
47 {
48
#if BOOST_WAVE_LEXERTL_USE_STATIC_TABLES == 0
///////////////////////////////////////////////////////////////////////////////
// The following numbers are the array sizes of the token regex's which we
// need to specify to make the CW compiler happy (at least up to V9.5).
//
// Every size includes the terminating sentinel entry of its table and has to
// follow the conditionally compiled entries of that table, otherwise the
// initializer list does not fit the declared array.
#if BOOST_WAVE_SUPPORT_MS_EXTENSIONS != 0
#define INIT_DATA_SIZE 176
#else
#define INIT_DATA_SIZE 159
#endif
// the C++ only table gains one entry (T_IMPORT) if the import keyword is
// supported
#if BOOST_WAVE_SUPPORT_IMPORT_KEYWORD != 0
#define INIT_DATA_CPP_SIZE 16
#else
#define INIT_DATA_CPP_SIZE 15
#endif
#define INIT_DATA_PP_NUMBER_SIZE 2
#define INIT_MACRO_DATA_SIZE 27
#endif // #if BOOST_WAVE_LEXERTL_USE_STATIC_TABLES == 0
62
63 // this is just a hack to have a unique token id not otherwise used by Wave
64 #define T_ANYCTRL T_LAST_TOKEN_ID
65
66 ///////////////////////////////////////////////////////////////////////////////
67 namespace lexer
68 {
69
70 ///////////////////////////////////////////////////////////////////////////////
71 // this is the wrapper for the lexertl lexer library
72 template <typename Iterator, typename Position>
73 class lexertl
74 {
75 private:
76 typedef BOOST_WAVE_STRINGTYPE string_type;
77 typedef typename boost::detail::iterator_traits<Iterator>::value_type
78 char_type;
79
80 public:
81 wave::token_id next_token(Iterator &first, Iterator const &last,
82 string_type& token_value);
83
84 #if BOOST_WAVE_LEXERTL_USE_STATIC_TABLES != 0
lexertl()85 lexertl() {}
init_dfa(wave::language_support lang,Position const & pos,bool force_reinit=false)86 void init_dfa(wave::language_support lang, Position const& pos,
87 bool force_reinit = false) {}
is_initialized() const88 bool is_initialized() const { return true; }
89 #else
lexertl()90 lexertl() : has_compiled_dfa_(false) {}
91 bool init_dfa(wave::language_support lang, Position const& pos,
92 bool force_reinit = false);
is_initialized() const93 bool is_initialized() const { return has_compiled_dfa_; }
94
95 // get time of last compilation
get_compilation_time()96 static std::time_t get_compilation_time()
97 { return compilation_time.get_time(); }
98
99 bool load (std::istream& instrm);
100 bool save (std::ostream& outstrm);
101
102 private:
103 boost::lexer::state_machine state_machine_;
104 bool has_compiled_dfa_;
105
106 // initialization data (regular expressions for the token definitions)
107 struct lexer_macro_data {
108 char_type const *name; // macro name
109 char_type const *macro; // associated macro definition
110 };
111 static lexer_macro_data const init_macro_data[INIT_MACRO_DATA_SIZE]; // macro patterns
112
113 struct lexer_data {
114 token_id tokenid; // token data
115 char_type const *tokenregex; // associated token to match
116 };
117 static lexer_data const init_data[INIT_DATA_SIZE]; // common patterns
118 static lexer_data const init_data_cpp[INIT_DATA_CPP_SIZE]; // C++ only patterns
119 static lexer_data const init_data_pp_number[INIT_DATA_PP_NUMBER_SIZE]; // pp-number only patterns
120
121 // helper for calculation of the time of last compilation
122 static boost::wave::util::time_conversion_helper compilation_time;
123 #endif // #if BOOST_WAVE_LEXERTL_USE_STATIC_TABLES == 0
124 };
125
126 #if BOOST_WAVE_LEXERTL_USE_STATIC_TABLES == 0
127 ///////////////////////////////////////////////////////////////////////////////
128 // get time of last compilation of this file
129 template <typename IteratorT, typename PositionT>
130 boost::wave::util::time_conversion_helper
131 lexertl<IteratorT, PositionT>::compilation_time(__DATE__ " " __TIME__);
132
133 ///////////////////////////////////////////////////////////////////////////////
134 // token regex definitions
135
// helper for initializing token data and macro definitions
//
// All helpers rely on the concatenation of adjacent string literals, which
// allows to compose a regular expression from several pieces.

// Q(c): prefix the literal c with a backslash (as seen by the regex engine)
#define Q(c) "\\" c
// TRI(c): the literal c preceded by a reference to the regex macro {TRI}
#define TRI(c) "{TRI}" c
// OR: the regex alternation operator
#define OR "|"
// MACRO_DATA: initializer for a lexer_macro_data entry (name, definition)
#define MACRO_DATA(name, macro) { name, macro }
// TOKEN_DATA: initializer for a lexer_data entry (token id, regex)
#define TOKEN_DATA(id, regex) { id, regex }
142
// lexertl macro definitions
//
// Every entry associates a macro name with a regular expression. The token
// definitions (and later macros) refer to a macro as {NAME}. The table is
// terminated by an entry with a NULL name, which is what init_dfa() looks for
// while registering the macros.
template <typename Iterator, typename Position>
typename lexertl<Iterator, Position>::lexer_macro_data const
lexertl<Iterator, Position>::init_macro_data[INIT_MACRO_DATA_SIZE] =
{
    // character classes
    MACRO_DATA("ANY", "[\t\v\f\r\n\\040-\\377]"),
    MACRO_DATA("ANYCTRL", "[\\000-\\037]"),
    MACRO_DATA("TRI", "\\?\\?"),                // trigraph introducer '??'
    MACRO_DATA("BLANK", "[ \t\v\f]"),
    MACRO_DATA("CCOMMENT", "\\/\\*[^*]*\\*+([^/*][^*]*\\*+)*\\/"),
    // white space allowed inside of a preprocessing directive
    MACRO_DATA("PPSPACE", "(" "{BLANK}" OR "{CCOMMENT}" ")*"),
    // building blocks for numeric literals
    MACRO_DATA("OCTALDIGIT", "[0-7]"),
    MACRO_DATA("DIGIT", "[0-9]"),
    MACRO_DATA("HEXDIGIT", "[0-9a-fA-F]"),
    MACRO_DATA("OPTSIGN", "[-+]?"),
    MACRO_DATA("EXPSTART", "[eE][-+]"),
    MACRO_DATA("EXPONENT", "([eE]{OPTSIGN}{DIGIT}+)"),
    MACRO_DATA("NONDIGIT", "[a-zA-Z_]"),
    MACRO_DATA("INTEGER", "(" "(0x|0X){HEXDIGIT}+" OR "0{OCTALDIGIT}*" OR "[1-9]{DIGIT}*" ")"),
    MACRO_DATA("INTEGER_SUFFIX", "(" "[uU][lL]?" OR "[lL][uU]?" ")"),
    // the MS extensions additionally allow for the i64 suffix
#if BOOST_WAVE_SUPPORT_MS_EXTENSIONS != 0
    MACRO_DATA("LONGINTEGER_SUFFIX", "([uU]([lL][lL])|([lL][lL])[uU]?|i64)"),
#else
    MACRO_DATA("LONGINTEGER_SUFFIX", "([uU]([lL][lL])|([lL][lL])[uU]?)"),
#endif
    MACRO_DATA("FLOAT_SUFFIX", "(" "[fF][lL]?" OR "[lL][fF]?" ")"),
    // building blocks for character and string literals
    MACRO_DATA("CHAR_SPEC", "L?"),
    // a backslash, either spelled directly or as the trigraph '??/'
    MACRO_DATA("BACKSLASH", "(" Q("\\") OR TRI(Q("/")) ")"),
    MACRO_DATA("ESCAPESEQ", "{BACKSLASH}([abfnrtv?'\"]|{BACKSLASH}|x{HEXDIGIT}+|{OCTALDIGIT}{1,3})"),
    MACRO_DATA("HEXQUAD", "{HEXDIGIT}{4}"),
    MACRO_DATA("UNIVERSALCHAR", "{BACKSLASH}(u{HEXQUAD}|U{HEXQUAD}{2})"),
    // the pound sign: '#', the trigraph '??=' or the digraph '%:'
    MACRO_DATA("POUNDDEF", "(" "#" OR TRI("=") OR Q("%:") ")"),
    MACRO_DATA("NEWLINEDEF", "(" "\\n" OR "\\r" OR "\\r\\n" ")"),
#if BOOST_WAVE_SUPPORT_INCLUDE_NEXT != 0
    MACRO_DATA("INCLUDEDEF", "(include|include_next)"),
#else
    MACRO_DATA("INCLUDEDEF", "include"),
#endif
    MACRO_DATA("PP_NUMBERDEF", "\\.?{DIGIT}({DIGIT}|{NONDIGIT}|{EXPSTART}|\\.)*"),
    MACRO_DATA(NULL, NULL) // should be the last entry
};
184
// common C++/C99 token definitions
//
// Every entry associates a token id with the regular expression matching that
// token. init_dfa() adds these rules in array order, after the (optional)
// pp-number and C++ only rules. The table is terminated by an entry with the
// token id 0.
// NOTE(review): the order of the entries presumably determines which rule
// wins if several rules match the same text - confirm against lexertl before
// reordering anything.
template <typename Iterator, typename Position>
typename lexertl<Iterator, Position>::lexer_data const
lexertl<Iterator, Position>::init_data[INIT_DATA_SIZE] =
{
    // operators and punctuators (including alternative and trigraph forms)
    TOKEN_DATA(T_AND, "&"),
    TOKEN_DATA(T_ANDAND, "&&"),
    TOKEN_DATA(T_ASSIGN, "="),
    TOKEN_DATA(T_ANDASSIGN, "&="),
    TOKEN_DATA(T_OR, Q("|")),
    TOKEN_DATA(T_OR_TRIGRAPH, "{TRI}!"),
    TOKEN_DATA(T_ORASSIGN, Q("|=")),
    TOKEN_DATA(T_ORASSIGN_TRIGRAPH, "{TRI}!="),
    TOKEN_DATA(T_XOR, Q("^")),
    TOKEN_DATA(T_XOR_TRIGRAPH, "{TRI}'"),
    TOKEN_DATA(T_XORASSIGN, Q("^=")),
    TOKEN_DATA(T_XORASSIGN_TRIGRAPH, "{TRI}'="),
    TOKEN_DATA(T_COMMA, ","),
    TOKEN_DATA(T_COLON, ":"),
    TOKEN_DATA(T_DIVIDEASSIGN, Q("/=")),
    TOKEN_DATA(T_DIVIDE, Q("/")),
    TOKEN_DATA(T_DOT, Q(".")),
    TOKEN_DATA(T_ELLIPSIS, Q(".") "{3}"),
    TOKEN_DATA(T_EQUAL, "=="),
    TOKEN_DATA(T_GREATER, ">"),
    TOKEN_DATA(T_GREATEREQUAL, ">="),
    TOKEN_DATA(T_LEFTBRACE, Q("{")),
    TOKEN_DATA(T_LEFTBRACE_ALT, "<" Q("%")),
    TOKEN_DATA(T_LEFTBRACE_TRIGRAPH, "{TRI}<"),
    TOKEN_DATA(T_LESS, "<"),
    TOKEN_DATA(T_LESSEQUAL, "<="),
    TOKEN_DATA(T_LEFTPAREN, Q("(")),
    TOKEN_DATA(T_LEFTBRACKET, Q("[")),
    TOKEN_DATA(T_LEFTBRACKET_ALT, "<:"),
    TOKEN_DATA(T_LEFTBRACKET_TRIGRAPH, "{TRI}" Q("(")),
    TOKEN_DATA(T_MINUS, Q("-")),
    TOKEN_DATA(T_MINUSASSIGN, Q("-=")),
    TOKEN_DATA(T_MINUSMINUS, Q("-") "{2}"),
    TOKEN_DATA(T_PERCENT, Q("%")),
    TOKEN_DATA(T_PERCENTASSIGN, Q("%=")),
    TOKEN_DATA(T_NOT, "!"),
    TOKEN_DATA(T_NOTEQUAL, "!="),
    TOKEN_DATA(T_OROR, Q("|") "{2}"),
    TOKEN_DATA(T_OROR_TRIGRAPH, "{TRI}!\\||\\|{TRI}!|{TRI}!{TRI}!"),
    TOKEN_DATA(T_PLUS, Q("+")),
    TOKEN_DATA(T_PLUSASSIGN, Q("+=")),
    TOKEN_DATA(T_PLUSPLUS, Q("+") "{2}"),
    TOKEN_DATA(T_ARROW, Q("->")),
    TOKEN_DATA(T_QUESTION_MARK, Q("?")),
    TOKEN_DATA(T_RIGHTBRACE, Q("}")),
    TOKEN_DATA(T_RIGHTBRACE_ALT, Q("%>")),
    TOKEN_DATA(T_RIGHTBRACE_TRIGRAPH, "{TRI}>"),
    TOKEN_DATA(T_RIGHTPAREN, Q(")")),
    TOKEN_DATA(T_RIGHTBRACKET, Q("]")),
    TOKEN_DATA(T_RIGHTBRACKET_ALT, ":>"),
    TOKEN_DATA(T_RIGHTBRACKET_TRIGRAPH, "{TRI}" Q(")")),
    TOKEN_DATA(T_SEMICOLON, ";"),
    TOKEN_DATA(T_SHIFTLEFT, "<<"),
    TOKEN_DATA(T_SHIFTLEFTASSIGN, "<<="),
    TOKEN_DATA(T_SHIFTRIGHT, ">>"),
    TOKEN_DATA(T_SHIFTRIGHTASSIGN, ">>="),
    TOKEN_DATA(T_STAR, Q("*")),
    TOKEN_DATA(T_COMPL, Q("~")),
    TOKEN_DATA(T_COMPL_TRIGRAPH, "{TRI}-"),
    TOKEN_DATA(T_STARASSIGN, Q("*=")),
    // keywords
    TOKEN_DATA(T_ASM, "asm"),
    TOKEN_DATA(T_AUTO, "auto"),
    TOKEN_DATA(T_BOOL, "bool"),
    TOKEN_DATA(T_FALSE, "false"),
    TOKEN_DATA(T_TRUE, "true"),
    TOKEN_DATA(T_BREAK, "break"),
    TOKEN_DATA(T_CASE, "case"),
    TOKEN_DATA(T_CATCH, "catch"),
    TOKEN_DATA(T_CHAR, "char"),
    TOKEN_DATA(T_CLASS, "class"),
    TOKEN_DATA(T_CONST, "const"),
    TOKEN_DATA(T_CONSTCAST, "const_cast"),
    TOKEN_DATA(T_CONTINUE, "continue"),
    TOKEN_DATA(T_DEFAULT, "default"),
    TOKEN_DATA(T_DELETE, "delete"),
    TOKEN_DATA(T_DO, "do"),
    TOKEN_DATA(T_DOUBLE, "double"),
    TOKEN_DATA(T_DYNAMICCAST, "dynamic_cast"),
    TOKEN_DATA(T_ELSE, "else"),
    TOKEN_DATA(T_ENUM, "enum"),
    TOKEN_DATA(T_EXPLICIT, "explicit"),
    TOKEN_DATA(T_EXPORT, "export"),
    TOKEN_DATA(T_EXTERN, "extern"),
    TOKEN_DATA(T_FLOAT, "float"),
    TOKEN_DATA(T_FOR, "for"),
    TOKEN_DATA(T_FRIEND, "friend"),
    TOKEN_DATA(T_GOTO, "goto"),
    TOKEN_DATA(T_IF, "if"),
    TOKEN_DATA(T_INLINE, "inline"),
    TOKEN_DATA(T_INT, "int"),
    TOKEN_DATA(T_LONG, "long"),
    TOKEN_DATA(T_MUTABLE, "mutable"),
    TOKEN_DATA(T_NAMESPACE, "namespace"),
    TOKEN_DATA(T_NEW, "new"),
    TOKEN_DATA(T_OPERATOR, "operator"),
    TOKEN_DATA(T_PRIVATE, "private"),
    TOKEN_DATA(T_PROTECTED, "protected"),
    TOKEN_DATA(T_PUBLIC, "public"),
    TOKEN_DATA(T_REGISTER, "register"),
    TOKEN_DATA(T_REINTERPRETCAST, "reinterpret_cast"),
    TOKEN_DATA(T_RETURN, "return"),
    TOKEN_DATA(T_SHORT, "short"),
    TOKEN_DATA(T_SIGNED, "signed"),
    TOKEN_DATA(T_SIZEOF, "sizeof"),
    TOKEN_DATA(T_STATIC, "static"),
    TOKEN_DATA(T_STATICCAST, "static_cast"),
    TOKEN_DATA(T_STRUCT, "struct"),
    TOKEN_DATA(T_SWITCH, "switch"),
    TOKEN_DATA(T_TEMPLATE, "template"),
    TOKEN_DATA(T_THIS, "this"),
    TOKEN_DATA(T_THROW, "throw"),
    TOKEN_DATA(T_TRY, "try"),
    TOKEN_DATA(T_TYPEDEF, "typedef"),
    TOKEN_DATA(T_TYPEID, "typeid"),
    TOKEN_DATA(T_TYPENAME, "typename"),
    TOKEN_DATA(T_UNION, "union"),
    TOKEN_DATA(T_UNSIGNED, "unsigned"),
    TOKEN_DATA(T_USING, "using"),
    TOKEN_DATA(T_VIRTUAL, "virtual"),
    TOKEN_DATA(T_VOID, "void"),
    TOKEN_DATA(T_VOLATILE, "volatile"),
    TOKEN_DATA(T_WCHART, "wchar_t"),
    TOKEN_DATA(T_WHILE, "while"),
    // preprocessing directives: pound sign, optional white space, name
    TOKEN_DATA(T_PP_DEFINE, "{POUNDDEF}{PPSPACE}define"),
    TOKEN_DATA(T_PP_IF, "{POUNDDEF}{PPSPACE}if"),
    TOKEN_DATA(T_PP_IFDEF, "{POUNDDEF}{PPSPACE}ifdef"),
    TOKEN_DATA(T_PP_IFNDEF, "{POUNDDEF}{PPSPACE}ifndef"),
    TOKEN_DATA(T_PP_ELSE, "{POUNDDEF}{PPSPACE}else"),
    TOKEN_DATA(T_PP_ELIF, "{POUNDDEF}{PPSPACE}elif"),
    TOKEN_DATA(T_PP_ENDIF, "{POUNDDEF}{PPSPACE}endif"),
    TOKEN_DATA(T_PP_ERROR, "{POUNDDEF}{PPSPACE}error"),
    // include directives: "file" form, <file> form, anything else
    TOKEN_DATA(T_PP_QHEADER, "{POUNDDEF}{PPSPACE}{INCLUDEDEF}{PPSPACE}" Q("\"") "[^\\n\\r\"]+" Q("\"")),
    TOKEN_DATA(T_PP_HHEADER, "{POUNDDEF}{PPSPACE}{INCLUDEDEF}{PPSPACE}" "<" "[^\\n\\r>]+" ">"),
    TOKEN_DATA(T_PP_INCLUDE, "{POUNDDEF}{PPSPACE}{INCLUDEDEF}{PPSPACE}"),
    TOKEN_DATA(T_PP_LINE, "{POUNDDEF}{PPSPACE}line"),
    TOKEN_DATA(T_PP_PRAGMA, "{POUNDDEF}{PPSPACE}pragma"),
    TOKEN_DATA(T_PP_UNDEF, "{POUNDDEF}{PPSPACE}undef"),
    TOKEN_DATA(T_PP_WARNING, "{POUNDDEF}{PPSPACE}warning"),
    // MS specific keywords and directives (17 entries, see INIT_DATA_SIZE)
#if BOOST_WAVE_SUPPORT_MS_EXTENSIONS != 0
    TOKEN_DATA(T_MSEXT_INT8, "__int8"),
    TOKEN_DATA(T_MSEXT_INT16, "__int16"),
    TOKEN_DATA(T_MSEXT_INT32, "__int32"),
    TOKEN_DATA(T_MSEXT_INT64, "__int64"),
    TOKEN_DATA(T_MSEXT_BASED, "_?" "_based"),
    TOKEN_DATA(T_MSEXT_DECLSPEC, "_?" "_declspec"),
    TOKEN_DATA(T_MSEXT_CDECL, "_?" "_cdecl"),
    TOKEN_DATA(T_MSEXT_FASTCALL, "_?" "_fastcall"),
    TOKEN_DATA(T_MSEXT_STDCALL, "_?" "_stdcall"),
    TOKEN_DATA(T_MSEXT_TRY , "__try"),
    TOKEN_DATA(T_MSEXT_EXCEPT, "__except"),
    TOKEN_DATA(T_MSEXT_FINALLY, "__finally"),
    TOKEN_DATA(T_MSEXT_LEAVE, "__leave"),
    TOKEN_DATA(T_MSEXT_INLINE, "_?" "_inline"),
    TOKEN_DATA(T_MSEXT_ASM, "_?" "_asm"),
    TOKEN_DATA(T_MSEXT_PP_REGION, "{POUNDDEF}{PPSPACE}region"),
    TOKEN_DATA(T_MSEXT_PP_ENDREGION, "{POUNDDEF}{PPSPACE}endregion"),
#endif // BOOST_WAVE_SUPPORT_MS_EXTENSIONS != 0
    // numeric literals
    TOKEN_DATA(T_LONGINTLIT, "{INTEGER}{LONGINTEGER_SUFFIX}"),
    TOKEN_DATA(T_INTLIT, "{INTEGER}{INTEGER_SUFFIX}?"),
    TOKEN_DATA(T_FLOATLIT,
        "(" "{DIGIT}*" Q(".") "{DIGIT}+" OR "{DIGIT}+" Q(".") "){EXPONENT}?{FLOAT_SUFFIX}?" OR
        "{DIGIT}+{EXPONENT}{FLOAT_SUFFIX}?"),
    // identifiers; the non-strict lexer additionally accepts '$'
#if BOOST_WAVE_USE_STRICT_LEXER != 0
    TOKEN_DATA(T_IDENTIFIER,
        "(" "{NONDIGIT}" OR "{UNIVERSALCHAR}" ")"
        "(" "{NONDIGIT}" OR "{DIGIT}" OR "{UNIVERSALCHAR}" ")*"),
#else
    TOKEN_DATA(T_IDENTIFIER,
        "(" "{NONDIGIT}" OR Q("$") OR "{UNIVERSALCHAR}" ")"
        "(" "{NONDIGIT}" OR Q("$") OR "{DIGIT}" OR "{UNIVERSALCHAR}" ")*"),
#endif
    // comments, character/string literals and white space
    TOKEN_DATA(T_CCOMMENT, "{CCOMMENT}"),
    // a C++ comment includes the newline terminating it
    TOKEN_DATA(T_CPPCOMMENT, Q("/") Q("/[^\\n\\r]*") "{NEWLINEDEF}" ),
    TOKEN_DATA(T_CHARLIT,
        "{CHAR_SPEC}" "'" "({ESCAPESEQ}|[^\\n\\r']|{UNIVERSALCHAR})+" "'"),
    TOKEN_DATA(T_STRINGLIT,
        "{CHAR_SPEC}" Q("\"") "({ESCAPESEQ}|[^\\n\\r\"]|{UNIVERSALCHAR})*" Q("\"")),
    TOKEN_DATA(T_SPACE, "{BLANK}+"),
    // line continuation: a backslash immediately followed by '\n'
    TOKEN_DATA(T_CONTLINE, Q("\\") "\\n"),
    TOKEN_DATA(T_NEWLINE, "{NEWLINEDEF}"),
    // pound signs outside of a recognized directive
    TOKEN_DATA(T_POUND_POUND, "##"),
    TOKEN_DATA(T_POUND_POUND_ALT, Q("%:") Q("%:")),
    TOKEN_DATA(T_POUND_POUND_TRIGRAPH, "({TRI}=){2}"),
    TOKEN_DATA(T_POUND, "#"),
    TOKEN_DATA(T_POUND_ALT, Q("%:")),
    TOKEN_DATA(T_POUND_TRIGRAPH, "{TRI}="),
    // catch-all rules for everything not matched above
    TOKEN_DATA(T_ANY_TRIGRAPH, "{TRI}\\/"),
    TOKEN_DATA(T_ANY, "{ANY}"),
    TOKEN_DATA(T_ANYCTRL, "{ANYCTRL}"), // this should be the last recognized token
    { token_id(0) } // this should be the last entry
};
381
// C++ only token definitions
//
// init_dfa() adds these rules only if the language is not C99, and it adds
// them before the common rules of init_data. The table is terminated by an
// entry with the token id 0; INIT_DATA_CPP_SIZE has to account for all
// entries including this terminator.
template <typename Iterator, typename Position>
typename lexertl<Iterator, Position>::lexer_data const
lexertl<Iterator, Position>::init_data_cpp[INIT_DATA_CPP_SIZE] =
{
    // alternative operator representations
    TOKEN_DATA(T_AND_ALT, "bitand"),
    TOKEN_DATA(T_ANDASSIGN_ALT, "and_eq"),
    TOKEN_DATA(T_ANDAND_ALT, "and"),
    TOKEN_DATA(T_OR_ALT, "bitor"),
    TOKEN_DATA(T_ORASSIGN_ALT, "or_eq"),
    TOKEN_DATA(T_OROR_ALT, "or"),
    TOKEN_DATA(T_XORASSIGN_ALT, "xor_eq"),
    TOKEN_DATA(T_XOR_ALT, "xor"),
    TOKEN_DATA(T_NOTEQUAL_ALT, "not_eq"),
    TOKEN_DATA(T_NOT_ALT, "not"),
    TOKEN_DATA(T_COMPL_ALT, "compl"),
    // optional entry: adds one element to the table if enabled
#if BOOST_WAVE_SUPPORT_IMPORT_KEYWORD != 0
    TOKEN_DATA(T_IMPORT, "import"),
#endif
    // operators existing in C++ only
    TOKEN_DATA(T_ARROWSTAR, Q("->") Q("*")),
    TOKEN_DATA(T_DOTSTAR, Q(".") Q("*")),
    TOKEN_DATA(T_COLON_COLON, "::"),
    { token_id(0) } // this should be the last entry
};
406
// pp-number specific token definitions
//
// init_dfa() adds this rule first, and only if pp-numbers should be preferred
// for the given language (wave::need_prefer_pp_numbers). The table is
// terminated by an entry with the token id 0.
template <typename Iterator, typename Position>
typename lexertl<Iterator, Position>::lexer_data const
lexertl<Iterator, Position>::init_data_pp_number[INIT_DATA_PP_NUMBER_SIZE] =
{
    TOKEN_DATA(T_PP_NUMBER, "{PP_NUMBERDEF}"),
    { token_id(0) } // this should be the last entry
};
415
416 #undef MACRO_DATA
417 #undef TOKEN_DATA
418 #undef OR
419 #undef TRI
420 #undef Q
421
422 ///////////////////////////////////////////////////////////////////////////////
423 // initialize lexertl lexer from C++ token regex's
424 template <typename Iterator, typename Position>
425 inline bool
init_dfa(wave::language_support lang,Position const & pos,bool force_reinit)426 lexertl<Iterator, Position>::init_dfa(wave::language_support lang,
427 Position const& pos, bool force_reinit)
428 {
429 if (has_compiled_dfa_)
430 return true;
431
432 std::ifstream dfa_in("wave_lexertl_lexer.dfa", std::ios::in|std::ios::binary);
433
434 if (force_reinit || !dfa_in.is_open() || !load (dfa_in))
435 {
436 dfa_in.close();
437
438 state_machine_.clear();
439
440 // register macro definitions
441 boost::lexer::rules rules;
442 for (int k = 0; NULL != init_macro_data[k].name; ++k) {
443 rules.add_macro(init_macro_data[k].name, init_macro_data[k].macro);
444 }
445
446 // if pp-numbers should be preferred, insert the corresponding rule first
447 if (wave::need_prefer_pp_numbers(lang)) {
448 for (int j = 0; 0 != init_data_pp_number[j].tokenid; ++j) {
449 rules.add(init_data_pp_number[j].tokenregex,
450 init_data_pp_number[j].tokenid);
451 }
452 }
453
454 // if in C99 mode, some of the keywords are not valid
455 if (!wave::need_c99(lang)) {
456 for (int j = 0; 0 != init_data_cpp[j].tokenid; ++j) {
457 rules.add(init_data_cpp[j].tokenregex,
458 init_data_cpp[j].tokenid);
459 }
460 }
461
462 for (int i = 0; 0 != init_data[i].tokenid; ++i) {
463 rules.add(init_data[i].tokenregex, init_data[i].tokenid);
464 }
465
466 // generate minimized DFA
467 try {
468 boost::lexer::generator::build (rules, state_machine_);
469 boost::lexer::generator::minimise (state_machine_);
470 }
471 catch (std::runtime_error const& e) {
472 string_type msg("lexertl initialization error: ");
473 msg += e.what();
474 BOOST_WAVE_LEXER_THROW(wave::cpplexer::lexing_exception,
475 unexpected_error, msg.c_str(),
476 pos.get_line(), pos.get_column(), pos.get_file().c_str());
477 return false;
478 }
479
480 std::ofstream dfa_out ("wave_lexertl_lexer.dfa",
481 std::ios::out|std::ios::binary|std::ios::trunc);
482
483 if (dfa_out.is_open())
484 save (dfa_out);
485 }
486
487 has_compiled_dfa_ = true;
488 return true;
489 }
490 #endif // BOOST_WAVE_LEXERTL_USE_STATIC_TABLES == 0
491
492 ///////////////////////////////////////////////////////////////////////////////
493 // return next token from the input stream
494 template <typename Iterator, typename Position>
495 inline wave::token_id
next_token(Iterator & first,Iterator const & last,string_type & token_value)496 lexertl<Iterator, Position>::next_token(Iterator &first, Iterator const &last,
497 string_type& token_value)
498 {
499 #if BOOST_WAVE_LEXERTL_USE_STATIC_TABLES == 0
500 size_t const* const lookup = &state_machine_.data()._lookup[0]->front ();
501 size_t const dfa_alphabet = state_machine_.data()._dfa_alphabet[0];
502
503 size_t const* dfa = &state_machine_.data()._dfa[0]->front();
504 size_t const* ptr = dfa + dfa_alphabet + boost::lexer::dfa_offset;
505 #else
506 const std::size_t *ptr = dfa + dfa_offset;
507 #endif // BOOST_WAVE_LEXERTL_USE_STATIC_TABLES == 0
508
509 Iterator curr = first;
510 Iterator end_token = first;
511 bool end_state = (*ptr != 0);
512 size_t id = *(ptr + 1);
513
514 while (curr != last) {
515 size_t const state = ptr[lookup[int(*curr)]];
516 if (0 == state)
517 break;
518 ++curr;
519
520 #if BOOST_WAVE_LEXERTL_USE_STATIC_TABLES == 0
521 ptr = &dfa[state * (dfa_alphabet + boost::lexer::dfa_offset)];
522 #else
523 ptr = &dfa[state * dfa_offset];
524 #endif // BOOST_WAVE_LEXERTL_USE_STATIC_TABLES == 0
525
526 if (0 != *ptr) {
527 end_state = true;
528 id = *(ptr + 1);
529 end_token = curr;
530 }
531 }
532
533 if (end_state) {
534 if (T_ANY == id) {
535 id = TOKEN_FROM_ID(*first, UnknownTokenType);
536 }
537
538 // return longest match
539 string_type str(first, end_token);
540 token_value.swap(str);
541 first = end_token;
542 return wave::token_id(id);
543 }
544 return T_EOF;
545 }
546
547 #if BOOST_WAVE_LEXERTL_USE_STATIC_TABLES == 0
548 ///////////////////////////////////////////////////////////////////////////////
549 // load the DFA tables to/from a stream
550 template <typename Iterator, typename Position>
551 inline bool
load(std::istream & instrm)552 lexertl<Iterator, Position>::load (std::istream& instrm)
553 {
554 // #if !defined(BOOST_WAVE_LEXERTL_GENERATE_CPP_CODE)
555 // std::size_t version = 0;
556 // boost::lexer::serialise::load_as_binary(instrm, state_machine_, version);
557 // if (version != (std::size_t)get_compilation_time())
558 // return false; // too new for us
559 // return instrm.good();
560 // #else
561 return false; // always create the dfa when generating the C++ code
562 // #endif
563 }
564
565 ///////////////////////////////////////////////////////////////////////////////
566 // save the DFA tables to/from a stream
567 template <typename Iterator, typename Position>
568 inline bool
save(std::ostream & outstrm)569 lexertl<Iterator, Position>::save (std::ostream& outstrm)
570 {
571 // #if defined(BOOST_WAVE_LEXERTL_GENERATE_CPP_CODE)
572 // cpp_code::generate(state_machine_, outstrm);
573 // #else
574 // boost::lexer::serialise::save_as_binary(state_machine_, outstrm,
575 // (std::size_t)get_compilation_time());
576 // #endif
577 return outstrm.good();
578 }
579 #endif // #if BOOST_WAVE_LEXERTL_USE_STATIC_TABLES == 0
580
581 ///////////////////////////////////////////////////////////////////////////////
582 } // namespace lexer
583
584 ///////////////////////////////////////////////////////////////////////////////
585 template <typename Iterator, typename Position = wave::util::file_position_type>
586 class lexertl_functor
587 : public lexertl_input_interface<wave::cpplexer::lex_token<Position> >
588 {
589 public:
590 typedef wave::util::position_iterator<Iterator, Position> iterator_type;
591 typedef typename boost::detail::iterator_traits<Iterator>::value_type
592 char_type;
593 typedef BOOST_WAVE_STRINGTYPE string_type;
594 typedef wave::cpplexer::lex_token<Position> token_type;
595
lexertl_functor(Iterator const & first_,Iterator const & last_,Position const & pos_,wave::language_support language)596 lexertl_functor(Iterator const &first_, Iterator const &last_,
597 Position const &pos_, wave::language_support language)
598 : first(first_, last_, pos_), language(language), at_eof(false)
599 {
600 lexer_.init_dfa(language, pos_);
601 }
~lexertl_functor()602 ~lexertl_functor() {}
603
604 // get the next token from the input stream
get(token_type & result)605 token_type& get(token_type& result)
606 {
607 if (lexer_.is_initialized() && !at_eof) {
608 do {
609 // generate and return the next token
610 string_type token_val;
611 Position pos = first.get_position(); // begin of token position
612 wave::token_id id = lexer_.next_token(first, last, token_val);
613
614 if (T_CONTLINE != id) {
615 // The cast should avoid spurious warnings about missing case labels
616 // for the other token ids's.
617 switch (static_cast<unsigned int>(id)) {
618 case T_IDENTIFIER:
619 // test identifier characters for validity (throws if
620 // invalid chars found)
621 if (!wave::need_no_character_validation(language)) {
622 using wave::cpplexer::impl::validate_identifier_name;
623 validate_identifier_name(token_val,
624 pos.get_line(), pos.get_column(), pos.get_file());
625 }
626 break;
627
628 case T_STRINGLIT:
629 case T_CHARLIT:
630 // test literal characters for validity (throws if invalid
631 // chars found)
632 if (wave::need_convert_trigraphs(language)) {
633 using wave::cpplexer::impl::convert_trigraphs;
634 token_val = convert_trigraphs(token_val);
635 }
636 if (!wave::need_no_character_validation(language)) {
637 using wave::cpplexer::impl::validate_literal;
638 validate_literal(token_val,
639 pos.get_line(), pos.get_column(), pos.get_file());
640 }
641 break;
642
643 case T_LONGINTLIT: // supported in C99 and long_long mode
644 if (!wave::need_long_long(language)) {
645 // syntax error: not allowed in C++ mode
646 BOOST_WAVE_LEXER_THROW(
647 wave::cpplexer::lexing_exception,
648 invalid_long_long_literal, token_val.c_str(),
649 pos.get_line(), pos.get_column(),
650 pos.get_file().c_str());
651 }
652 break;
653
654 #if BOOST_WAVE_SUPPORT_INCLUDE_NEXT != 0
655 case T_PP_HHEADER:
656 case T_PP_QHEADER:
657 case T_PP_INCLUDE:
658 // convert to the corresponding ..._next token, if appropriate
659 {
660 // Skip '#' and whitespace and see whether we find an
661 // 'include_next' here.
662 typename string_type::size_type start = token_val.find("include");
663 if (0 == token_val.compare(start, 12, "include_next", 12))
664 id = token_id(id | AltTokenType);
665 }
666 break;
667 #endif // BOOST_WAVE_SUPPORT_INCLUDE_NEXT != 0
668
669 case T_EOF:
670 // T_EOF is returned as a valid token, the next call will
671 // return T_EOI, i.e. the actual end of input
672 at_eof = true;
673 token_val.clear();
674 break;
675
676 case T_OR_TRIGRAPH:
677 case T_XOR_TRIGRAPH:
678 case T_LEFTBRACE_TRIGRAPH:
679 case T_RIGHTBRACE_TRIGRAPH:
680 case T_LEFTBRACKET_TRIGRAPH:
681 case T_RIGHTBRACKET_TRIGRAPH:
682 case T_COMPL_TRIGRAPH:
683 case T_POUND_TRIGRAPH:
684 case T_ANY_TRIGRAPH:
685 if (wave::need_convert_trigraphs(language))
686 {
687 using wave::cpplexer::impl::convert_trigraph;
688 token_val = convert_trigraph(token_val);
689 }
690 break;
691
692 case T_ANYCTRL:
693 // matched some unexpected character
694 {
695 // 21 is the max required size for a 64 bit integer
696 // represented as a string
697 char buffer[22];
698 string_type msg("invalid character in input stream: '0x");
699
700 // for some systems sprintf is in namespace std
701 using namespace std;
702 sprintf(buffer, "%02x'", token_val[0]);
703 msg += buffer;
704 BOOST_WAVE_LEXER_THROW(
705 wave::cpplexer::lexing_exception,
706 generic_lexing_error,
707 msg.c_str(), pos.get_line(), pos.get_column(),
708 pos.get_file().c_str());
709 }
710 break;
711 }
712
713 result = token_type(id, token_val, pos);
714 #if BOOST_WAVE_SUPPORT_PRAGMA_ONCE != 0
715 return guards.detect_guard(result);
716 #else
717 return result;
718 #endif
719 }
720 } while (true); // skip the T_CONTLINE token
721 }
722 return result = token_type(); // return T_EOI
723 }
724
set_position(Position const & pos)725 void set_position(Position const &pos)
726 {
727 // set position has to change the file name and line number only
728 first.get_position().set_file(pos.get_file());
729 first.get_position().set_line(pos.get_line());
730 }
731
732 #if BOOST_WAVE_SUPPORT_PRAGMA_ONCE != 0
has_include_guards(std::string & guard_name) const733 bool has_include_guards(std::string& guard_name) const
734 { return guards.detected(guard_name); }
735 #endif
736
737 private:
738 iterator_type first;
739 iterator_type last;
740
741 wave::language_support language;
742 bool at_eof;
743 #if BOOST_WAVE_SUPPORT_PRAGMA_ONCE != 0
744 include_guards<token_type> guards;
745 #endif
746
747 static lexer::lexertl<iterator_type, Position> lexer_;
748 };
749
// Definition of the static lexer object, which is shared by all
// lexertl_functor instances created for the same template arguments.
template <typename Iterator, typename Position>
lexer::lexertl<
    typename lexertl_functor<Iterator, Position>::iterator_type, Position>
        lexertl_functor<Iterator, Position>::lexer_;
754
755 #undef INIT_DATA_SIZE
756 #undef INIT_DATA_CPP_SIZE
757 #undef INIT_DATA_PP_NUMBER_SIZE
758 #undef INIT_MACRO_DATA_SIZE
759 #undef T_ANYCTRL
760
761 ///////////////////////////////////////////////////////////////////////////////
762 //
763 // The new_lexer_gen<>::new_lexer function (declared in lexertl_interface.hpp)
764 // should be defined inline, if the lex_functor shouldn't be instantiated
765 // separately from the lex_iterator.
766 //
767 // Separate (explicit) instantiation helps to reduce compilation time.
768 //
769 ///////////////////////////////////////////////////////////////////////////////
770
771 #if BOOST_WAVE_SEPARATE_LEXER_INSTANTIATION != 0
772 #define BOOST_WAVE_FLEX_NEW_LEXER_INLINE
773 #else
774 #define BOOST_WAVE_FLEX_NEW_LEXER_INLINE inline
775 #endif
776
777 ///////////////////////////////////////////////////////////////////////////////
778 //
779 // The 'new_lexer' function allows the opaque generation of a new lexer object.
780 // It is coupled to the iterator type to allow to decouple the lexer/iterator
781 // configurations at compile time.
782 //
//  This function is declared inside the lexertl_interface.hpp file, which is
//  referenced by the source file calling the lexer and the source file, which
//  instantiates the lexertl_functor. But it is defined here, so it will be
//  instantiated only while compiling the source file, which instantiates the
//  lexertl_functor. While the lexertl_interface.hpp file may be included
//  everywhere, this file should be included only once. This allows
//  to decouple the lexer interface from the lexer implementation and reduces
//  compilation time.
791 //
792 ///////////////////////////////////////////////////////////////////////////////
793
794 template <typename Iterator, typename Position>
795 BOOST_WAVE_FLEX_NEW_LEXER_INLINE
796 wave::cpplexer::lex_input_interface<wave::cpplexer::lex_token<Position> > *
new_lexer(Iterator const & first,Iterator const & last,Position const & pos,wave::language_support language)797 new_lexer_gen<Iterator, Position>::new_lexer(Iterator const &first,
798 Iterator const &last, Position const &pos, wave::language_support language)
799 {
800 return new lexertl_functor<Iterator, Position>(first, last, pos, language);
801 }
802
803 #undef BOOST_WAVE_FLEX_NEW_LEXER_INLINE
804
805 ///////////////////////////////////////////////////////////////////////////////
806 }}}} // namespace boost::wave::cpplexer::lexertl
807
808 #endif // !defined(BOOST_WAVE_LEXERTL_LEXER_HPP_INCLUDED)
809
810