1 //  Copyright (c) 2001-2011 Hartmut Kaiser
2 //
3 //  Distributed under the Boost Software License, Version 1.0. (See accompanying
4 //  file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
5 
6 #if !defined(BOOST_SPIRIT_LEX_LEXER_MAR_17_2007_0139PM)
7 #define BOOST_SPIRIT_LEX_LEXER_MAR_17_2007_0139PM
8 
9 #if defined(_MSC_VER)
10 #pragma once
11 #endif
12 
13 #include <iosfwd>
14 
15 #include <boost/spirit/home/support/detail/lexer/generator.hpp>
16 #include <boost/spirit/home/support/detail/lexer/rules.hpp>
17 #include <boost/spirit/home/support/detail/lexer/consts.hpp>
18 #include <boost/spirit/home/support/unused.hpp>
19 
20 #include <boost/spirit/home/lex/lexer/lexertl/token.hpp>
21 #include <boost/spirit/home/lex/lexer/lexertl/functor.hpp>
22 #include <boost/spirit/home/lex/lexer/lexertl/functor_data.hpp>
23 #include <boost/spirit/home/lex/lexer/lexertl/iterator.hpp>
24 #if defined(BOOST_SPIRIT_LEXERTL_DEBUG)
25 #include <boost/spirit/home/support/detail/lexer/debug.hpp>
26 #endif
27 
28 #include <boost/foreach.hpp>
29 
30 namespace boost { namespace spirit { namespace lex { namespace lexertl
31 {
32     ///////////////////////////////////////////////////////////////////////////
33     namespace detail
34     {
35         ///////////////////////////////////////////////////////////////////////
36         //  The must_escape function checks if the given character value needs
37         //  to be preceded by a backslash character to disable its special
38         //  meaning in the context of a regular expression
39         ///////////////////////////////////////////////////////////////////////
40         template <typename Char>
must_escape(Char c)41         inline bool must_escape(Char c)
42         {
43             // FIXME: more needed?
44             switch (c) {
45             case '+': case '/': case '*': case '?':
46             case '|':
47             case '(': case ')':
48             case '[': case ']':
49             case '{': case '}':
50             case '.':
51             case '^': case '$':
52             case '\\':
53             case '"':
54                 return true;
55 
56             default:
57                 break;
58             }
59             return false;
60         }
61 
62         ///////////////////////////////////////////////////////////////////////
63         //  The escape function returns the string representation of the given
64         //  character value, possibly escaped with a backslash character, to
65         //  allow it being safely used in a regular expression definition.
66         ///////////////////////////////////////////////////////////////////////
67         template <typename Char>
escape(Char ch)68         inline std::basic_string<Char> escape(Char ch)
69         {
70             std::basic_string<Char> result(1, ch);
71             if (detail::must_escape(ch))
72             {
73                 typedef typename std::basic_string<Char>::size_type size_type;
74                 result.insert((size_type)0, 1, '\\');
75             }
76             return result;
77         }
78 
79         ///////////////////////////////////////////////////////////////////////
80         //
81         ///////////////////////////////////////////////////////////////////////
map_flags(unsigned int flags)82         inline boost::lexer::regex_flags map_flags(unsigned int flags)
83         {
84             unsigned int retval = boost::lexer::none;
85             if (flags & match_flags::match_not_dot_newline)
86                 retval |= boost::lexer::dot_not_newline;
87             if (flags & match_flags::match_icase)
88                 retval |= boost::lexer::icase;
89 
90             return boost::lexer::regex_flags(retval);
91         }
92     }
93 
94     ///////////////////////////////////////////////////////////////////////////
95     template <typename Lexer, typename F>
96     bool generate_static(Lexer const&
97       , std::basic_ostream<typename Lexer::char_type>&
98       , typename Lexer::char_type const*, F);
99 
100     ///////////////////////////////////////////////////////////////////////////
101     //
102     //  Every lexer type to be used as a lexer for Spirit has to conform to
103     //  the following public interface:
104     //
105     //    typedefs:
106     //        iterator_type   The type of the iterator exposed by this lexer.
107     //        token_type      The type of the tokens returned from the exposed
108     //                        iterators.
109     //
110     //    functions:
111     //        default constructor
112     //                        Since lexers are instantiated as base classes
113     //                        only it might be a good idea to make this
114     //                        constructor protected.
115     //        begin, end      Return a pair of iterators, when dereferenced
116     //                        returning the sequence of tokens recognized in
117     //                        the input stream given as the parameters to the
118     //                        begin() function.
119     //        add_token       Should add the definition of a token to be
120     //                        recognized by this lexer.
121     //        clear           Should delete all current token definitions
122     //                        associated with the given state of this lexer
123     //                        object.
124     //
125     //    template parameters:
126     //        Iterator        The type of the iterator used to access the
127     //                        underlying character stream.
128     //        Token           The type of the tokens to be returned from the
129     //                        exposed token iterator.
130     //        Functor         The type of the InputPolicy to use to instantiate
131     //                        the multi_pass iterator type to be used as the
132     //                        token iterator (returned from begin()/end()).
133     //
134     ///////////////////////////////////////////////////////////////////////////
135 
136     ///////////////////////////////////////////////////////////////////////////
137     //
    //  The lexer class is an implementation of a Spirit.Lex lexer on
139     //  top of Ben Hanson's lexertl library as outlined above (For more
140     //  information about lexertl go here: http://www.benhanson.net/lexertl.html).
141     //
142     //  This class is supposed to be used as the first and only template
143     //  parameter while instantiating instances of a lex::lexer class.
144     //
145     ///////////////////////////////////////////////////////////////////////////
146     template <typename Token = token<>
147       , typename Iterator = typename Token::iterator_type
148       , typename Functor = functor<Token, lexertl::detail::data, Iterator> >
149     class lexer
150     {
151     private:
true_boost::spirit::lex::lexertl::lexer::dummy152         struct dummy { void true_() {} };
153         typedef void (dummy::*safe_bool)();
154 
155         static std::size_t const all_states_id = static_cast<std::size_t>(-2);
156 
157     public:
operator safe_bool() const158         operator safe_bool() const
159             { return initialized_dfa_ ? &dummy::true_ : 0; }
160 
161         typedef typename boost::detail::iterator_traits<Iterator>::value_type
162             char_type;
163         typedef std::basic_string<char_type> string_type;
164 
165         typedef boost::lexer::basic_rules<char_type> basic_rules_type;
166 
167         //  Every lexer type to be used as a lexer for Spirit has to conform to
168         //  a public interface .
169         typedef Token token_type;
170         typedef typename Token::id_type id_type;
171         typedef iterator<Functor> iterator_type;
172 
173     private:
174         // this type is purely used for the iterator_type construction below
175         struct iterator_data_type
176         {
177             typedef typename Functor::semantic_actions_type semantic_actions_type;
178 
iterator_data_typeboost::spirit::lex::lexertl::lexer::iterator_data_type179             iterator_data_type(
180                     boost::lexer::basic_state_machine<char_type> const& sm
181                   , boost::lexer::basic_rules<char_type> const& rules
182                   , semantic_actions_type const& actions)
183               : state_machine_(sm), rules_(rules), actions_(actions)
184             {}
185 
186             boost::lexer::basic_state_machine<char_type> const& state_machine_;
187             boost::lexer::basic_rules<char_type> const& rules_;
188             semantic_actions_type const& actions_;
189 
190         private:
191             // silence MSVC warning C4512: assignment operator could not be generated
192             iterator_data_type& operator= (iterator_data_type const&);
193         };
194 
195     public:
196         //  Return the start iterator usable for iterating over the generated
197         //  tokens.
begin(Iterator & first,Iterator const & last,char_type const * initial_state=0) const198         iterator_type begin(Iterator& first, Iterator const& last
199           , char_type const* initial_state = 0) const
200         {
201             if (!init_dfa())    // never minimize DFA for dynamic lexers
202                 return iterator_type();
203 
204             iterator_data_type iterator_data(state_machine_, rules_, actions_);
205             return iterator_type(iterator_data, first, last, initial_state);
206         }
207 
208         //  Return the end iterator usable to stop iterating over the generated
209         //  tokens.
end() const210         iterator_type end() const
211         {
212             return iterator_type();
213         }
214 
215     protected:
216         //  Lexer instances can be created by means of a derived class only.
lexer(unsigned int flags)217         lexer(unsigned int flags)
218           : flags_(detail::map_flags(flags))
219           , rules_(flags_)
220           , initialized_dfa_(false)
221         {}
222 
223     public:
224         // interface for token definition management
add_token(char_type const * state,char_type tokendef,std::size_t token_id,char_type const * targetstate)225         std::size_t add_token(char_type const* state, char_type tokendef,
226             std::size_t token_id, char_type const* targetstate)
227         {
228             add_state(state);
229             initialized_dfa_ = false;
230             if (state == all_states())
231                 return rules_.add(state, detail::escape(tokendef), token_id, rules_.dot());
232 
233             if (0 == targetstate)
234                 targetstate = state;
235             else
236                 add_state(targetstate);
237             return rules_.add(state, detail::escape(tokendef), token_id, targetstate);
238         }
add_token(char_type const * state,string_type const & tokendef,std::size_t token_id,char_type const * targetstate)239         std::size_t add_token(char_type const* state, string_type const& tokendef,
240             std::size_t token_id, char_type const* targetstate)
241         {
242             add_state(state);
243             initialized_dfa_ = false;
244             if (state == all_states())
245                 return rules_.add(state, tokendef, token_id, rules_.dot());
246 
247             if (0 == targetstate)
248                 targetstate = state;
249             else
250                 add_state(targetstate);
251             return rules_.add(state, tokendef, token_id, targetstate);
252         }
253 
254         // interface for pattern definition management
add_pattern(char_type const * state,string_type const & name,string_type const & patterndef)255         void add_pattern (char_type const* state, string_type const& name,
256             string_type const& patterndef)
257         {
258             add_state(state);
259             rules_.add_macro(name.c_str(), patterndef);
260             initialized_dfa_ = false;
261         }
262 
get_rules() const263         boost::lexer::rules const& get_rules() const { return rules_; }
264 
clear(char_type const * state)265         void clear(char_type const* state)
266         {
267             std::size_t s = rules_.state(state);
268             if (boost::lexer::npos != s)
269                 rules_.clear(state);
270             initialized_dfa_ = false;
271         }
add_state(char_type const * state)272         std::size_t add_state(char_type const* state)
273         {
274             if (state == all_states())
275                 return all_states_id;
276 
277             std::size_t stateid = rules_.state(state);
278             if (boost::lexer::npos == stateid) {
279                 stateid = rules_.add_state(state);
280                 initialized_dfa_ = false;
281             }
282             return stateid;
283         }
initial_state() const284         string_type initial_state() const
285         {
286             return string_type(rules_.initial());
287         }
all_states() const288         string_type all_states() const
289         {
290             return string_type(rules_.all_states());
291         }
292 
293         //  Register a semantic action with the given id
294         template <typename F>
add_action(std::size_t unique_id,std::size_t state,F act)295         void add_action(std::size_t unique_id, std::size_t state, F act)
296         {
297             // If you see an error here stating add_action is not a member of
298             // fusion::unused_type then you are probably having semantic actions
299             // attached to at least one token in the lexer definition without
300             // using the lex::lexertl::actor_lexer<> as its base class.
301             typedef typename Functor::wrap_action_type wrapper_type;
302             if (state == all_states_id) {
303                 // add the action to all known states
304                 typedef typename
305                     basic_rules_type::string_size_t_map::value_type
306                 state_type;
307 
308                 std::size_t states = rules_.statemap().size();
309                 BOOST_FOREACH(state_type const& s, rules_.statemap()) {
310                     for (std::size_t j = 0; j < states; ++j)
311                         actions_.add_action(unique_id + j, s.second, wrapper_type::call(act));
312                 }
313             }
314             else {
315                 actions_.add_action(unique_id, state, wrapper_type::call(act));
316             }
317         }
318 //         template <typename F>
319 //         void add_action(std::size_t unique_id, char_type const* state, F act)
320 //         {
321 //             typedef typename Functor::wrap_action_type wrapper_type;
322 //             actions_.add_action(unique_id, add_state(state), wrapper_type::call(act));
323 //         }
324 
325         // We do not minimize the state machine by default anymore because
326         // Ben said: "If you can afford to generate a lexer at runtime, there
327         //            is little point in calling minimise."
328         // Go figure.
init_dfa(bool minimize=false) const329         bool init_dfa(bool minimize = false) const
330         {
331             if (!initialized_dfa_) {
332                 state_machine_.clear();
333                 typedef boost::lexer::basic_generator<char_type> generator;
334                 generator::build (rules_, state_machine_);
335                 if (minimize)
336                     generator::minimise (state_machine_);
337 
338 #if defined(BOOST_SPIRIT_LEXERTL_DEBUG)
339                 boost::lexer::debug::dump(state_machine_, std::cerr);
340 #endif
341                 initialized_dfa_ = true;
342 
343 //                 // release memory held by rules description
344 //                 basic_rules_type rules;
345 //                 rules.init_state_info(rules_);        // preserve states
346 //                 std::swap(rules, rules_);
347             }
348             return true;
349         }
350 
351     private:
352         // lexertl specific data
353         mutable boost::lexer::basic_state_machine<char_type> state_machine_;
354         boost::lexer::regex_flags flags_;
355         /*mutable*/ basic_rules_type rules_;
356 
357         typename Functor::semantic_actions_type actions_;
358         mutable bool initialized_dfa_;
359 
360         // generator functions must be able to access members directly
361         template <typename Lexer, typename F>
362         friend bool generate_static(Lexer const&
363           , std::basic_ostream<typename Lexer::char_type>&
364           , typename Lexer::char_type const*, F);
365     };
366 
367     ///////////////////////////////////////////////////////////////////////////
368     //
369     //  The actor_lexer class is another implementation of a Spirit.Lex
370     //  lexer on top of Ben Hanson's lexertl library as outlined above (For
371     //  more information about lexertl go here:
372     //  http://www.benhanson.net/lexertl.html).
373     //
374     //  The only difference to the lexer class above is that token_def
375     //  definitions may have semantic (lexer) actions attached while being
376     //  defined:
377     //
378     //      int w;
379     //      token_def word = "[^ \t\n]+";
380     //      self = word[++ref(w)];        // see example: word_count_lexer
381     //
382     //  This class is supposed to be used as the first and only template
383     //  parameter while instantiating instances of a lex::lexer class.
384     //
385     ///////////////////////////////////////////////////////////////////////////
386     template <typename Token = token<>
387       , typename Iterator = typename Token::iterator_type
388       , typename Functor = functor<Token, lexertl::detail::data, Iterator, mpl::true_> >
389     class actor_lexer : public lexer<Token, Iterator, Functor>
390     {
391     protected:
392         //  Lexer instances can be created by means of a derived class only.
actor_lexer(unsigned int flags)393         actor_lexer(unsigned int flags)
394           : lexer<Token, Iterator, Functor>(flags) {}
395     };
396 
397 }}}}
398 
399 #endif
400