1 // Copyright (c) 2001-2011 Hartmut Kaiser 2 // 3 // Distributed under the Boost Software License, Version 1.0. (See accompanying 4 // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) 5 6 #if !defined(BOOST_SPIRIT_LEX_LEXER_MAR_17_2007_0139PM) 7 #define BOOST_SPIRIT_LEX_LEXER_MAR_17_2007_0139PM 8 9 #if defined(_MSC_VER) 10 #pragma once 11 #endif 12 13 #include <iosfwd> 14 15 #include <boost/spirit/home/support/detail/lexer/generator.hpp> 16 #include <boost/spirit/home/support/detail/lexer/rules.hpp> 17 #include <boost/spirit/home/support/detail/lexer/consts.hpp> 18 #include <boost/spirit/home/support/unused.hpp> 19 20 #include <boost/spirit/home/lex/lexer/lexertl/token.hpp> 21 #include <boost/spirit/home/lex/lexer/lexertl/functor.hpp> 22 #include <boost/spirit/home/lex/lexer/lexertl/functor_data.hpp> 23 #include <boost/spirit/home/lex/lexer/lexertl/iterator.hpp> 24 #if defined(BOOST_SPIRIT_LEXERTL_DEBUG) 25 #include <boost/spirit/home/support/detail/lexer/debug.hpp> 26 #endif 27 28 #include <boost/foreach.hpp> 29 30 namespace boost { namespace spirit { namespace lex { namespace lexertl 31 { 32 /////////////////////////////////////////////////////////////////////////// 33 namespace detail 34 { 35 /////////////////////////////////////////////////////////////////////// 36 // The must_escape function checks if the given character value needs 37 // to be preceded by a backslash character to disable its special 38 // meaning in the context of a regular expression 39 /////////////////////////////////////////////////////////////////////// 40 template <typename Char> must_escape(Char c)41 inline bool must_escape(Char c) 42 { 43 // FIXME: more needed? 44 switch (c) { 45 case '+': case '/': case '*': case '?': 46 case '|': 47 case '(': case ')': 48 case '[': case ']': 49 case '{': case '}': 50 case '.': 51 case '^': case '$': 52 case '\\': 53 case '"': 54 return true; 55 56 default: 57 break; 58 } 59 return false; 60 } 61 62 /////////////////////////////////////////////////////////////////////// 63 // The escape function returns the string representation of the given 64 // character value, possibly escaped with a backslash character, to 65 // allow it being safely used in a regular expression definition. 66 /////////////////////////////////////////////////////////////////////// 67 template <typename Char> escape(Char ch)68 inline std::basic_string<Char> escape(Char ch) 69 { 70 std::basic_string<Char> result(1, ch); 71 if (detail::must_escape(ch)) 72 { 73 typedef typename std::basic_string<Char>::size_type size_type; 74 result.insert((size_type)0, 1, '\\'); 75 } 76 return result; 77 } 78 79 /////////////////////////////////////////////////////////////////////// 80 // 81 /////////////////////////////////////////////////////////////////////// map_flags(unsigned int flags)82 inline boost::lexer::regex_flags map_flags(unsigned int flags) 83 { 84 unsigned int retval = boost::lexer::none; 85 if (flags & match_flags::match_not_dot_newline) 86 retval |= boost::lexer::dot_not_newline; 87 if (flags & match_flags::match_icase) 88 retval |= boost::lexer::icase; 89 90 return boost::lexer::regex_flags(retval); 91 } 92 } 93 94 /////////////////////////////////////////////////////////////////////////// 95 template <typename Lexer, typename F> 96 bool generate_static(Lexer const& 97 , std::basic_ostream<typename Lexer::char_type>& 98 , typename Lexer::char_type const*, F); 99 100 /////////////////////////////////////////////////////////////////////////// 101 // 102 // Every lexer type to be used as a lexer for Spirit has to conform to 103 // the following public interface: 104 // 105 // typedefs: 106 // iterator_type The type of the iterator exposed by this lexer. 107 // token_type The type of the tokens returned from the exposed 108 // iterators. 109 // 110 // functions: 111 // default constructor 112 // Since lexers are instantiated as base classes 113 // only it might be a good idea to make this 114 // constructor protected. 115 // begin, end Return a pair of iterators, when dereferenced 116 // returning the sequence of tokens recognized in 117 // the input stream given as the parameters to the 118 // begin() function. 119 // add_token Should add the definition of a token to be 120 // recognized by this lexer. 121 // clear Should delete all current token definitions 122 // associated with the given state of this lexer 123 // object. 124 // 125 // template parameters: 126 // Iterator The type of the iterator used to access the 127 // underlying character stream. 128 // Token The type of the tokens to be returned from the 129 // exposed token iterator. 130 // Functor The type of the InputPolicy to use to instantiate 131 // the multi_pass iterator type to be used as the 132 // token iterator (returned from begin()/end()). 133 // 134 /////////////////////////////////////////////////////////////////////////// 135 136 /////////////////////////////////////////////////////////////////////////// 137 // 138 // The lexer class is a implementation of a Spirit.Lex lexer on 139 // top of Ben Hanson's lexertl library as outlined above (For more 140 // information about lexertl go here: http://www.benhanson.net/lexertl.html). 141 // 142 // This class is supposed to be used as the first and only template 143 // parameter while instantiating instances of a lex::lexer class. 144 // 145 /////////////////////////////////////////////////////////////////////////// 146 template <typename Token = token<> 147 , typename Iterator = typename Token::iterator_type 148 , typename Functor = functor<Token, lexertl::detail::data, Iterator> > 149 class lexer 150 { 151 private: true_boost::spirit::lex::lexertl::lexer::dummy152 struct dummy { void true_() {} }; 153 typedef void (dummy::*safe_bool)(); 154 155 static std::size_t const all_states_id = static_cast<std::size_t>(-2); 156 157 public: operator safe_bool() const158 operator safe_bool() const 159 { return initialized_dfa_ ? &dummy::true_ : 0; } 160 161 typedef typename boost::detail::iterator_traits<Iterator>::value_type 162 char_type; 163 typedef std::basic_string<char_type> string_type; 164 165 typedef boost::lexer::basic_rules<char_type> basic_rules_type; 166 167 // Every lexer type to be used as a lexer for Spirit has to conform to 168 // a public interface . 169 typedef Token token_type; 170 typedef typename Token::id_type id_type; 171 typedef iterator<Functor> iterator_type; 172 173 private: 174 // this type is purely used for the iterator_type construction below 175 struct iterator_data_type 176 { 177 typedef typename Functor::semantic_actions_type semantic_actions_type; 178 iterator_data_typeboost::spirit::lex::lexertl::lexer::iterator_data_type179 iterator_data_type( 180 boost::lexer::basic_state_machine<char_type> const& sm 181 , boost::lexer::basic_rules<char_type> const& rules 182 , semantic_actions_type const& actions) 183 : state_machine_(sm), rules_(rules), actions_(actions) 184 {} 185 186 boost::lexer::basic_state_machine<char_type> const& state_machine_; 187 boost::lexer::basic_rules<char_type> const& rules_; 188 semantic_actions_type const& actions_; 189 190 private: 191 // silence MSVC warning C4512: assignment operator could not be generated 192 iterator_data_type& operator= (iterator_data_type const&); 193 }; 194 195 public: 196 // Return the start iterator usable for iterating over the generated 197 // tokens. begin(Iterator & first,Iterator const & last,char_type const * initial_state=0) const198 iterator_type begin(Iterator& first, Iterator const& last 199 , char_type const* initial_state = 0) const 200 { 201 if (!init_dfa()) // never minimize DFA for dynamic lexers 202 return iterator_type(); 203 204 iterator_data_type iterator_data(state_machine_, rules_, actions_); 205 return iterator_type(iterator_data, first, last, initial_state); 206 } 207 208 // Return the end iterator usable to stop iterating over the generated 209 // tokens. end() const210 iterator_type end() const 211 { 212 return iterator_type(); 213 } 214 215 protected: 216 // Lexer instances can be created by means of a derived class only. lexer(unsigned int flags)217 lexer(unsigned int flags) 218 : flags_(detail::map_flags(flags)) 219 , rules_(flags_) 220 , initialized_dfa_(false) 221 {} 222 223 public: 224 // interface for token definition management add_token(char_type const * state,char_type tokendef,std::size_t token_id,char_type const * targetstate)225 std::size_t add_token(char_type const* state, char_type tokendef, 226 std::size_t token_id, char_type const* targetstate) 227 { 228 add_state(state); 229 initialized_dfa_ = false; 230 if (state == all_states()) 231 return rules_.add(state, detail::escape(tokendef), token_id, rules_.dot()); 232 233 if (0 == targetstate) 234 targetstate = state; 235 else 236 add_state(targetstate); 237 return rules_.add(state, detail::escape(tokendef), token_id, targetstate); 238 } add_token(char_type const * state,string_type const & tokendef,std::size_t token_id,char_type const * targetstate)239 std::size_t add_token(char_type const* state, string_type const& tokendef, 240 std::size_t token_id, char_type const* targetstate) 241 { 242 add_state(state); 243 initialized_dfa_ = false; 244 if (state == all_states()) 245 return rules_.add(state, tokendef, token_id, rules_.dot()); 246 247 if (0 == targetstate) 248 targetstate = state; 249 else 250 add_state(targetstate); 251 return rules_.add(state, tokendef, token_id, targetstate); 252 } 253 254 // interface for pattern definition management add_pattern(char_type const * state,string_type const & name,string_type const & patterndef)255 void add_pattern (char_type const* state, string_type const& name, 256 string_type const& patterndef) 257 { 258 add_state(state); 259 rules_.add_macro(name.c_str(), patterndef); 260 initialized_dfa_ = false; 261 } 262 get_rules() const263 boost::lexer::rules const& get_rules() const { return rules_; } 264 clear(char_type const * state)265 void clear(char_type const* state) 266 { 267 std::size_t s = rules_.state(state); 268 if (boost::lexer::npos != s) 269 rules_.clear(state); 270 initialized_dfa_ = false; 271 } add_state(char_type const * state)272 std::size_t add_state(char_type const* state) 273 { 274 if (state == all_states()) 275 return all_states_id; 276 277 std::size_t stateid = rules_.state(state); 278 if (boost::lexer::npos == stateid) { 279 stateid = rules_.add_state(state); 280 initialized_dfa_ = false; 281 } 282 return stateid; 283 } initial_state() const284 string_type initial_state() const 285 { 286 return string_type(rules_.initial()); 287 } all_states() const288 string_type all_states() const 289 { 290 return string_type(rules_.all_states()); 291 } 292 293 // Register a semantic action with the given id 294 template <typename F> add_action(std::size_t unique_id,std::size_t state,F act)295 void add_action(std::size_t unique_id, std::size_t state, F act) 296 { 297 // If you see an error here stating add_action is not a member of 298 // fusion::unused_type then you are probably having semantic actions 299 // attached to at least one token in the lexer definition without 300 // using the lex::lexertl::actor_lexer<> as its base class. 301 typedef typename Functor::wrap_action_type wrapper_type; 302 if (state == all_states_id) { 303 // add the action to all known states 304 typedef typename 305 basic_rules_type::string_size_t_map::value_type 306 state_type; 307 308 std::size_t states = rules_.statemap().size(); 309 BOOST_FOREACH(state_type const& s, rules_.statemap()) { 310 for (std::size_t j = 0; j < states; ++j) 311 actions_.add_action(unique_id + j, s.second, wrapper_type::call(act)); 312 } 313 } 314 else { 315 actions_.add_action(unique_id, state, wrapper_type::call(act)); 316 } 317 } 318 // template <typename F> 319 // void add_action(std::size_t unique_id, char_type const* state, F act) 320 // { 321 // typedef typename Functor::wrap_action_type wrapper_type; 322 // actions_.add_action(unique_id, add_state(state), wrapper_type::call(act)); 323 // } 324 325 // We do not minimize the state machine by default anymore because 326 // Ben said: "If you can afford to generate a lexer at runtime, there 327 // is little point in calling minimise." 328 // Go figure. init_dfa(bool minimize=false) const329 bool init_dfa(bool minimize = false) const 330 { 331 if (!initialized_dfa_) { 332 state_machine_.clear(); 333 typedef boost::lexer::basic_generator<char_type> generator; 334 generator::build (rules_, state_machine_); 335 if (minimize) 336 generator::minimise (state_machine_); 337 338 #if defined(BOOST_SPIRIT_LEXERTL_DEBUG) 339 boost::lexer::debug::dump(state_machine_, std::cerr); 340 #endif 341 initialized_dfa_ = true; 342 343 // // release memory held by rules description 344 // basic_rules_type rules; 345 // rules.init_state_info(rules_); // preserve states 346 // std::swap(rules, rules_); 347 } 348 return true; 349 } 350 351 private: 352 // lexertl specific data 353 mutable boost::lexer::basic_state_machine<char_type> state_machine_; 354 boost::lexer::regex_flags flags_; 355 /*mutable*/ basic_rules_type rules_; 356 357 typename Functor::semantic_actions_type actions_; 358 mutable bool initialized_dfa_; 359 360 // generator functions must be able to access members directly 361 template <typename Lexer, typename F> 362 friend bool generate_static(Lexer const& 363 , std::basic_ostream<typename Lexer::char_type>& 364 , typename Lexer::char_type const*, F); 365 }; 366 367 /////////////////////////////////////////////////////////////////////////// 368 // 369 // The actor_lexer class is another implementation of a Spirit.Lex 370 // lexer on top of Ben Hanson's lexertl library as outlined above (For 371 // more information about lexertl go here: 372 // http://www.benhanson.net/lexertl.html). 373 // 374 // The only difference to the lexer class above is that token_def 375 // definitions may have semantic (lexer) actions attached while being 376 // defined: 377 // 378 // int w; 379 // token_def word = "[^ \t\n]+"; 380 // self = word[++ref(w)]; // see example: word_count_lexer 381 // 382 // This class is supposed to be used as the first and only template 383 // parameter while instantiating instances of a lex::lexer class. 384 // 385 /////////////////////////////////////////////////////////////////////////// 386 template <typename Token = token<> 387 , typename Iterator = typename Token::iterator_type 388 , typename Functor = functor<Token, lexertl::detail::data, Iterator, mpl::true_> > 389 class actor_lexer : public lexer<Token, Iterator, Functor> 390 { 391 protected: 392 // Lexer instances can be created by means of a derived class only. actor_lexer(unsigned int flags)393 actor_lexer(unsigned int flags) 394 : lexer<Token, Iterator, Functor>(flags) {} 395 }; 396 397 }}}} 398 399 #endif 400