//  Copyright (c) 2001-2011 Hartmut Kaiser
//
//  Distributed under the Boost Software License, Version 1.0. (See accompanying
//  file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)

#if !defined(BOOST_SPIRIT_LEX_LEXER_MAR_17_2007_0139PM)
#define BOOST_SPIRIT_LEX_LEXER_MAR_17_2007_0139PM

#if defined(_MSC_VER)
#pragma once
#endif

#include <iosfwd>

#include <boost/spirit/home/support/detail/lexer/generator.hpp>
#include <boost/spirit/home/support/detail/lexer/rules.hpp>
#include <boost/spirit/home/support/detail/lexer/consts.hpp>
#include <boost/spirit/home/support/unused.hpp>

#include <boost/spirit/home/lex/lexer/lexertl/token.hpp>
#include <boost/spirit/home/lex/lexer/lexertl/functor.hpp>
#include <boost/spirit/home/lex/lexer/lexertl/functor_data.hpp>
#include <boost/spirit/home/lex/lexer/lexertl/iterator.hpp>
#if defined(BOOST_SPIRIT_LEXERTL_DEBUG)
#include <boost/spirit/home/support/detail/lexer/debug.hpp>
#endif

#include <boost/foreach.hpp>

#include <iterator> // for std::iterator_traits

namespace boost { namespace spirit { namespace lex { namespace lexertl
{
    ///////////////////////////////////////////////////////////////////////////
    namespace detail
    {
        ///////////////////////////////////////////////////////////////////////
        //  The must_escape function checks if the given character value needs
        //  to be preceded by a backslash character to disable its special
        //  meaning in the context of a regular expression
        ///////////////////////////////////////////////////////////////////////
        template <typename Char>
        inline bool must_escape(Char c)
        {
            // FIXME: more needed?
            switch (c) {
            case '+': case '/': case '*': case '?':
            case '|':
            case '(': case ')':
            case '[': case ']':
            case '{': case '}':
            case '.':
            case '^': case '$':
            case '\\':
            case '"':
                return true;

            default:
                break;
            }
            return false;
        }

        ///////////////////////////////////////////////////////////////////////
        //  The escape function returns the string representation of the given
        //  character value, possibly escaped with a backslash character, so
        //  that it can be safely used in a regular expression definition.
        ///////////////////////////////////////////////////////////////////////
        template <typename Char>
        inline std::basic_string<Char> escape(Char ch)
        {
            std::basic_string<Char> result(1, ch);
            if (detail::must_escape(ch))
            {
                typedef typename std::basic_string<Char>::size_type size_type;
                result.insert((size_type)0, 1, '\\');
            }
            return result;
        }
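
        //  For instance, escape('+') yields the two-character string "\+",
        //  while a character without special meaning, such as 'a', is
        //  returned unchanged as "a".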

        ///////////////////////////////////////////////////////////////////////
        //  The map_flags function translates the given Spirit match_flags
        //  values into the corresponding lexertl regex_flags.
        ///////////////////////////////////////////////////////////////////////
        inline boost::lexer::regex_flags map_flags(unsigned int flags)
        {
            unsigned int retval = boost::lexer::none;
            if (flags & match_flags::match_not_dot_newline)
                retval |= boost::lexer::dot_not_newline;
            if (flags & match_flags::match_icase)
                retval |= boost::lexer::icase;

            return boost::lexer::regex_flags(retval);
        }
    }

    ///////////////////////////////////////////////////////////////////////////
    template <typename Lexer, typename F>
    bool generate_static(Lexer const&
      , std::basic_ostream<typename Lexer::char_type>&
      , typename Lexer::char_type const*, F);

    ///////////////////////////////////////////////////////////////////////////
    //
    //  Every lexer type to be used as a lexer for Spirit has to conform to
    //  the following public interface:
    //
    //    typedefs:
    //        iterator_type   The type of the iterator exposed by this lexer.
    //        token_type      The type of the tokens returned from the exposed
    //                        iterators.
    //
    //    functions:
    //        default constructor
    //                        Since lexers are instantiated as base classes
    //                        only, it might be a good idea to make this
    //                        constructor protected.
    //        begin, end      Return a pair of iterators which, when
    //                        dereferenced, yield the sequence of tokens
    //                        recognized in the input stream passed as the
    //                        parameters to the begin() function.
    //        add_token       Should add the definition of a token to be
    //                        recognized by this lexer.
    //        clear           Should delete all current token definitions
    //                        associated with the given state of this lexer
    //                        object.
    //
    //    template parameters:
    //        Iterator        The type of the iterator used to access the
    //                        underlying character stream.
    //        Token           The type of the tokens to be returned from the
    //                        exposed token iterator.
    //        Functor         The type of the InputPolicy to use to instantiate
    //                        the multi_pass iterator type to be used as the
    //                        token iterator (returned from begin()/end()).
    //
    ///////////////////////////////////////////////////////////////////////////

    ///////////////////////////////////////////////////////////////////////////
    //
    //  The lexer class is an implementation of a Spirit.Lex lexer on
    //  top of Ben Hanson's lexertl library as outlined above (for more
    //  information about lexertl see http://www.benhanson.net/lexertl.html).
    //
    //  This class is supposed to be used as the first and only template
    //  parameter while instantiating instances of a lex::lexer class.
    //
    ///////////////////////////////////////////////////////////////////////////
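    //
    //  A minimal usage sketch (illustrative only, not part of this header;
    //  the token definition class word_count_tokens and the token id ID_WORD
    //  are assumptions made for the example):
    //
    //      template <typename Lexer>
    //      struct word_count_tokens : lex::lexer<Lexer>
    //      {
    //          word_count_tokens()
    //          {
    //              // associate a token definition with the initial state
    //              this->self.add("[^ \t\n]+", ID_WORD);
    //          }
    //      };
    //
    //      typedef lexertl::token<char const*> token_type;
    //      typedef lexertl::lexer<token_type> lexer_type;
    //      word_count_tokens<lexer_type> word_count;    // the lexer object
    //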
    template <typename Token = token<>
      , typename Iterator = typename Token::iterator_type
      , typename Functor = functor<Token, lexertl::detail::data, Iterator> >
    class lexer
    {
    private:
        struct dummy { void true_() {} };
        typedef void (dummy::*safe_bool)();

        static std::size_t const all_states_id = static_cast<std::size_t>(-2);

    public:
        operator safe_bool() const
            { return initialized_dfa_ ? &dummy::true_ : 0; }

        typedef typename std::iterator_traits<Iterator>::value_type char_type;
        typedef std::basic_string<char_type> string_type;

        typedef boost::lexer::basic_rules<char_type> basic_rules_type;

        //  Every lexer type to be used as a lexer for Spirit has to conform to
        //  a public interface.
        typedef Token token_type;
        typedef typename Token::id_type id_type;
        typedef iterator<Functor> iterator_type;

    private:
        // this type is purely used for the iterator_type construction below
        struct iterator_data_type
        {
            typedef typename Functor::semantic_actions_type semantic_actions_type;

            iterator_data_type(
                    boost::lexer::basic_state_machine<char_type> const& sm
                  , boost::lexer::basic_rules<char_type> const& rules
                  , semantic_actions_type const& actions)
              : state_machine_(sm), rules_(rules), actions_(actions)
            {}

            boost::lexer::basic_state_machine<char_type> const& state_machine_;
            boost::lexer::basic_rules<char_type> const& rules_;
            semantic_actions_type const& actions_;

            // silence MSVC warning C4512: assignment operator could not be generated
            BOOST_DELETED_FUNCTION(iterator_data_type& operator= (iterator_data_type const&))
        };

    public:
        //  Return the start iterator usable for iterating over the generated
        //  tokens.
        iterator_type begin(Iterator& first, Iterator const& last
          , char_type const* initial_state = 0) const
        {
            if (!init_dfa())    // never minimize DFA for dynamic lexers
                return iterator_type();

            iterator_data_type iterator_data(state_machine_, rules_, actions_);
            return iterator_type(iterator_data, first, last, initial_state);
        }

        //  Return the end iterator usable to stop iterating over the generated
        //  tokens.
        iterator_type end() const
        {
            return iterator_type();
        }
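
        //  A sketch of how these iterators are typically used (lexer_type and
        //  my_lexer are assumptions made for this example; token_is_valid()
        //  refers to the Spirit.Lex helper of that name):
        //
        //      std::string str("some input");
        //      char const* first = str.c_str();
        //      char const* last = &first[str.size()];
        //
        //      lexer_type::iterator_type iter = my_lexer.begin(first, last);
        //      lexer_type::iterator_type end = my_lexer.end();
        //      while (iter != end && token_is_valid(*iter))
        //          ++iter;           // each *iter is a token_type instance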

    protected:
        //  Lexer instances can be created by means of a derived class only.
        lexer(unsigned int flags)
          : flags_(detail::map_flags(flags))
          , rules_(flags_)
          , initialized_dfa_(false)
        {}

    public:
        // interface for token definition management
        std::size_t add_token(char_type const* state, char_type tokendef,
            std::size_t token_id, char_type const* targetstate)
        {
            add_state(state);
            initialized_dfa_ = false;
            if (state == all_states())
                return rules_.add(state, detail::escape(tokendef), token_id, rules_.dot());

            if (0 == targetstate)
                targetstate = state;
            else
                add_state(targetstate);
            return rules_.add(state, detail::escape(tokendef), token_id, targetstate);
        }
        std::size_t add_token(char_type const* state, string_type const& tokendef,
            std::size_t token_id, char_type const* targetstate)
        {
            add_state(state);
            initialized_dfa_ = false;
            if (state == all_states())
                return rules_.add(state, tokendef, token_id, rules_.dot());

            if (0 == targetstate)
                targetstate = state;
            else
                add_state(targetstate);
            return rules_.add(state, tokendef, token_id, targetstate);
        }

        // interface for pattern definition management
        void add_pattern (char_type const* state, string_type const& name,
            string_type const& patterndef)
        {
            add_state(state);
            rules_.add_macro(name.c_str(), patterndef);
            initialized_dfa_ = false;
        }
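
        //  For illustration, a dynamic token/pattern setup might look like
        //  the following (lxr and ID_INT are assumptions made for the
        //  example; normally these functions are invoked indirectly through
        //  lex::lexer<> and token_def<>):
        //
        //      lxr.add_pattern("INITIAL", "DIGIT", "[0-9]");
        //      lxr.add_token("INITIAL", "{DIGIT}+", ID_INT, 0);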

        boost::lexer::rules const& get_rules() const { return rules_; }

        void clear(char_type const* state)
        {
            std::size_t s = rules_.state(state);
            if (boost::lexer::npos != s)
                rules_.clear(state);
            initialized_dfa_ = false;
        }
        std::size_t add_state(char_type const* state)
        {
            if (state == all_states())
                return all_states_id;

            std::size_t stateid = rules_.state(state);
            if (boost::lexer::npos == stateid) {
                stateid = rules_.add_state(state);
                initialized_dfa_ = false;
            }
            return stateid;
        }
        string_type initial_state() const
        {
            return string_type(rules_.initial());
        }
        string_type all_states() const
        {
            return string_type(rules_.all_states());
        }

        //  Register a semantic action with the given id
        template <typename F>
        void add_action(std::size_t unique_id, std::size_t state, F act)
        {
            // If you see an error here stating add_action is not a member of
            // fusion::unused_type then you probably have semantic actions
            // attached to at least one token in the lexer definition without
            // using the lex::lexertl::actor_lexer<> as its base class.
            typedef typename Functor::wrap_action_type wrapper_type;
            if (state == all_states_id) {
                // add the action to all known states
                typedef typename
                    basic_rules_type::string_size_t_map::value_type
                state_type;

                std::size_t states = rules_.statemap().size();
                BOOST_FOREACH(state_type const& s, rules_.statemap()) {
                    for (std::size_t j = 0; j < states; ++j)
                        actions_.add_action(unique_id + j, s.second, wrapper_type::call(act));
                }
            }
            else {
                actions_.add_action(unique_id, state, wrapper_type::call(act));
            }
        }
//         template <typename F>
//         void add_action(std::size_t unique_id, char_type const* state, F act)
//         {
//             typedef typename Functor::wrap_action_type wrapper_type;
//             actions_.add_action(unique_id, add_state(state), wrapper_type::call(act));
//         }

        // We do not minimize the state machine by default anymore because
        // Ben said: "If you can afford to generate a lexer at runtime, there
        //            is little point in calling minimise."
        // Go figure.
        bool init_dfa(bool minimize = false) const
        {
            if (!initialized_dfa_) {
                state_machine_.clear();
                typedef boost::lexer::basic_generator<char_type> generator;
                generator::build (rules_, state_machine_);
                if (minimize)
                    generator::minimise (state_machine_);

#if defined(BOOST_SPIRIT_LEXERTL_DEBUG)
                boost::lexer::debug::dump(state_machine_, std::cerr);
#endif
                initialized_dfa_ = true;

//                 // release memory held by rules description
//                 basic_rules_type rules;
//                 rules.init_state_info(rules_);        // preserve states
//                 std::swap(rules, rules_);
            }
            return true;
        }

    private:
        // lexertl specific data
        mutable boost::lexer::basic_state_machine<char_type> state_machine_;
        boost::lexer::regex_flags flags_;
        /*mutable*/ basic_rules_type rules_;

        typename Functor::semantic_actions_type actions_;
        mutable bool initialized_dfa_;

        // generator functions must be able to access members directly
        template <typename Lexer, typename F>
        friend bool generate_static(Lexer const&
          , std::basic_ostream<typename Lexer::char_type>&
          , typename Lexer::char_type const*, F);
    };

    ///////////////////////////////////////////////////////////////////////////
    //
    //  The actor_lexer class is another implementation of a Spirit.Lex
    //  lexer on top of Ben Hanson's lexertl library as outlined above (for
    //  more information about lexertl see
    //  http://www.benhanson.net/lexertl.html).
    //
    //  The only difference from the lexer class above is that token_def
    //  definitions may have semantic (lexer) actions attached while being
    //  defined:
    //
    //      int w;
    //      token_def word = "[^ \t\n]+";
    //      self = word[++ref(w)];        // see example: word_count_lexer
    //
    //  This class is supposed to be used as the first and only template
    //  parameter while instantiating instances of a lex::lexer class.
    //
    ///////////////////////////////////////////////////////////////////////////
    template <typename Token = token<>
      , typename Iterator = typename Token::iterator_type
      , typename Functor = functor<Token, lexertl::detail::data, Iterator, mpl::true_> >
    class actor_lexer : public lexer<Token, Iterator, Functor>
    {
    protected:
        //  Lexer instances can be created by means of a derived class only.
        actor_lexer(unsigned int flags)
          : lexer<Token, Iterator, Functor>(flags) {}
    };

}}}}

#endif