1 // Copyright (c) 2001-2011 Hartmut Kaiser 2 // 3 // Distributed under the Boost Software License, Version 1.0. (See accompanying 4 // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) 5 6 #if !defined(BOOST_SPIRIT_LEX_LEXER_MAR_17_2007_0139PM) 7 #define BOOST_SPIRIT_LEX_LEXER_MAR_17_2007_0139PM 8 9 #if defined(_MSC_VER) 10 #pragma once 11 #endif 12 13 #include <iosfwd> 14 15 #include <boost/spirit/home/support/detail/lexer/generator.hpp> 16 #include <boost/spirit/home/support/detail/lexer/rules.hpp> 17 #include <boost/spirit/home/support/detail/lexer/consts.hpp> 18 #include <boost/spirit/home/support/unused.hpp> 19 20 #include <boost/spirit/home/lex/lexer/lexertl/token.hpp> 21 #include <boost/spirit/home/lex/lexer/lexertl/functor.hpp> 22 #include <boost/spirit/home/lex/lexer/lexertl/functor_data.hpp> 23 #include <boost/spirit/home/lex/lexer/lexertl/iterator.hpp> 24 #if defined(BOOST_SPIRIT_LEXERTL_DEBUG) 25 #include <boost/spirit/home/support/detail/lexer/debug.hpp> 26 #endif 27 28 #include <boost/foreach.hpp> 29 30 #include <iterator> // for std::iterator_traits 31 32 namespace boost { namespace spirit { namespace lex { namespace lexertl 33 { 34 /////////////////////////////////////////////////////////////////////////// 35 namespace detail 36 { 37 /////////////////////////////////////////////////////////////////////// 38 // The must_escape function checks if the given character value needs 39 // to be preceded by a backslash character to disable its special 40 // meaning in the context of a regular expression 41 /////////////////////////////////////////////////////////////////////// 42 template <typename Char> must_escape(Char c)43 inline bool must_escape(Char c) 44 { 45 // FIXME: more needed? 
46 switch (c) { 47 case '+': case '/': case '*': case '?': 48 case '|': 49 case '(': case ')': 50 case '[': case ']': 51 case '{': case '}': 52 case '.': 53 case '^': case '$': 54 case '\\': 55 case '"': 56 return true; 57 58 default: 59 break; 60 } 61 return false; 62 } 63 64 /////////////////////////////////////////////////////////////////////// 65 // The escape function returns the string representation of the given 66 // character value, possibly escaped with a backslash character, to 67 // allow it being safely used in a regular expression definition. 68 /////////////////////////////////////////////////////////////////////// 69 template <typename Char> escape(Char ch)70 inline std::basic_string<Char> escape(Char ch) 71 { 72 std::basic_string<Char> result(1, ch); 73 if (detail::must_escape(ch)) 74 { 75 typedef typename std::basic_string<Char>::size_type size_type; 76 result.insert((size_type)0, 1, '\\'); 77 } 78 return result; 79 } 80 81 /////////////////////////////////////////////////////////////////////// 82 // 83 /////////////////////////////////////////////////////////////////////// map_flags(unsigned int flags)84 inline boost::lexer::regex_flags map_flags(unsigned int flags) 85 { 86 unsigned int retval = boost::lexer::none; 87 if (flags & match_flags::match_not_dot_newline) 88 retval |= boost::lexer::dot_not_newline; 89 if (flags & match_flags::match_icase) 90 retval |= boost::lexer::icase; 91 92 return boost::lexer::regex_flags(retval); 93 } 94 } 95 96 /////////////////////////////////////////////////////////////////////////// 97 template <typename Lexer, typename F> 98 bool generate_static(Lexer const& 99 , std::basic_ostream<typename Lexer::char_type>& 100 , typename Lexer::char_type const*, F); 101 102 /////////////////////////////////////////////////////////////////////////// 103 // 104 // Every lexer type to be used as a lexer for Spirit has to conform to 105 // the following public interface: 106 // 107 // typedefs: 108 // iterator_type The type of 
//                        the iterator exposed by this lexer.
//        token_type      The type of the tokens returned from the exposed
//                        iterators.
//
//    functions:
//        default constructor
//                        Since lexers are instantiated as base classes
//                        only it might be a good idea to make this
//                        constructor protected.
//        begin, end      Return a pair of iterators, when dereferenced
//                        returning the sequence of tokens recognized in
//                        the input stream given as the parameters to the
//                        begin() function.
//        add_token       Should add the definition of a token to be
//                        recognized by this lexer.
//        clear           Should delete all current token definitions
//                        associated with the given state of this lexer
//                        object.
//
//    template parameters:
//        Iterator        The type of the iterator used to access the
//                        underlying character stream.
//        Token           The type of the tokens to be returned from the
//                        exposed token iterator.
//        Functor         The type of the InputPolicy to use to instantiate
//                        the multi_pass iterator type to be used as the
//                        token iterator (returned from begin()/end()).
//
///////////////////////////////////////////////////////////////////////////

///////////////////////////////////////////////////////////////////////////
//
//  The lexer class is an implementation of a Spirit.Lex lexer on
//  top of Ben Hanson's lexertl library as outlined above (For more
//  information about lexertl go here: http://www.benhanson.net/lexertl.html).
//
//  This class is supposed to be used as the first and only template
//  parameter while instantiating instances of a lex::lexer class.
146 // 147 /////////////////////////////////////////////////////////////////////////// 148 template <typename Token = token<> 149 , typename Iterator = typename Token::iterator_type 150 , typename Functor = functor<Token, lexertl::detail::data, Iterator> > 151 class lexer 152 { 153 private: true_boost::spirit::lex::lexertl::lexer::dummy154 struct dummy { void true_() {} }; 155 typedef void (dummy::*safe_bool)(); 156 157 static std::size_t const all_states_id = static_cast<std::size_t>(-2); 158 159 public: operator safe_bool() const160 operator safe_bool() const 161 { return initialized_dfa_ ? &dummy::true_ : 0; } 162 163 typedef typename std::iterator_traits<Iterator>::value_type char_type; 164 typedef std::basic_string<char_type> string_type; 165 166 typedef boost::lexer::basic_rules<char_type> basic_rules_type; 167 168 // Every lexer type to be used as a lexer for Spirit has to conform to 169 // a public interface . 170 typedef Token token_type; 171 typedef typename Token::id_type id_type; 172 typedef iterator<Functor> iterator_type; 173 174 private: 175 // this type is purely used for the iterator_type construction below 176 struct iterator_data_type 177 { 178 typedef typename Functor::semantic_actions_type semantic_actions_type; 179 iterator_data_typeboost::spirit::lex::lexertl::lexer::iterator_data_type180 iterator_data_type( 181 boost::lexer::basic_state_machine<char_type> const& sm 182 , boost::lexer::basic_rules<char_type> const& rules 183 , semantic_actions_type const& actions) 184 : state_machine_(sm), rules_(rules), actions_(actions) 185 {} 186 187 boost::lexer::basic_state_machine<char_type> const& state_machine_; 188 boost::lexer::basic_rules<char_type> const& rules_; 189 semantic_actions_type const& actions_; 190 191 // silence MSVC warning C4512: assignment operator could not be generated 192 BOOST_DELETED_FUNCTION(iterator_data_type& operator= (iterator_data_type const&)) 193 }; 194 195 public: 196 // Return the start iterator usable for iterating 
over the generated 197 // tokens. begin(Iterator & first,Iterator const & last,char_type const * initial_state=0) const198 iterator_type begin(Iterator& first, Iterator const& last 199 , char_type const* initial_state = 0) const 200 { 201 if (!init_dfa()) // never minimize DFA for dynamic lexers 202 return iterator_type(); 203 204 iterator_data_type iterator_data(state_machine_, rules_, actions_); 205 return iterator_type(iterator_data, first, last, initial_state); 206 } 207 208 // Return the end iterator usable to stop iterating over the generated 209 // tokens. end() const210 iterator_type end() const 211 { 212 return iterator_type(); 213 } 214 215 protected: 216 // Lexer instances can be created by means of a derived class only. lexer(unsigned int flags)217 lexer(unsigned int flags) 218 : flags_(detail::map_flags(flags)) 219 , rules_(flags_) 220 , initialized_dfa_(false) 221 {} 222 223 public: 224 // interface for token definition management add_token(char_type const * state,char_type tokendef,std::size_t token_id,char_type const * targetstate)225 std::size_t add_token(char_type const* state, char_type tokendef, 226 std::size_t token_id, char_type const* targetstate) 227 { 228 add_state(state); 229 initialized_dfa_ = false; 230 if (state == all_states()) 231 return rules_.add(state, detail::escape(tokendef), token_id, rules_.dot()); 232 233 if (0 == targetstate) 234 targetstate = state; 235 else 236 add_state(targetstate); 237 return rules_.add(state, detail::escape(tokendef), token_id, targetstate); 238 } add_token(char_type const * state,string_type const & tokendef,std::size_t token_id,char_type const * targetstate)239 std::size_t add_token(char_type const* state, string_type const& tokendef, 240 std::size_t token_id, char_type const* targetstate) 241 { 242 add_state(state); 243 initialized_dfa_ = false; 244 if (state == all_states()) 245 return rules_.add(state, tokendef, token_id, rules_.dot()); 246 247 if (0 == targetstate) 248 targetstate = state; 249 
else 250 add_state(targetstate); 251 return rules_.add(state, tokendef, token_id, targetstate); 252 } 253 254 // interface for pattern definition management add_pattern(char_type const * state,string_type const & name,string_type const & patterndef)255 void add_pattern (char_type const* state, string_type const& name, 256 string_type const& patterndef) 257 { 258 add_state(state); 259 rules_.add_macro(name.c_str(), patterndef); 260 initialized_dfa_ = false; 261 } 262 get_rules() const263 boost::lexer::rules const& get_rules() const { return rules_; } 264 clear(char_type const * state)265 void clear(char_type const* state) 266 { 267 std::size_t s = rules_.state(state); 268 if (boost::lexer::npos != s) 269 rules_.clear(state); 270 initialized_dfa_ = false; 271 } add_state(char_type const * state)272 std::size_t add_state(char_type const* state) 273 { 274 if (state == all_states()) 275 return all_states_id; 276 277 std::size_t stateid = rules_.state(state); 278 if (boost::lexer::npos == stateid) { 279 stateid = rules_.add_state(state); 280 initialized_dfa_ = false; 281 } 282 return stateid; 283 } initial_state() const284 string_type initial_state() const 285 { 286 return string_type(rules_.initial()); 287 } all_states() const288 string_type all_states() const 289 { 290 return string_type(rules_.all_states()); 291 } 292 293 // Register a semantic action with the given id 294 template <typename F> add_action(std::size_t unique_id,std::size_t state,F act)295 void add_action(std::size_t unique_id, std::size_t state, F act) 296 { 297 // If you see an error here stating add_action is not a member of 298 // fusion::unused_type then you are probably having semantic actions 299 // attached to at least one token in the lexer definition without 300 // using the lex::lexertl::actor_lexer<> as its base class. 
301 typedef typename Functor::wrap_action_type wrapper_type; 302 if (state == all_states_id) { 303 // add the action to all known states 304 typedef typename 305 basic_rules_type::string_size_t_map::value_type 306 state_type; 307 308 std::size_t states = rules_.statemap().size(); 309 BOOST_FOREACH(state_type const& s, rules_.statemap()) { 310 for (std::size_t j = 0; j < states; ++j) 311 actions_.add_action(unique_id + j, s.second, wrapper_type::call(act)); 312 } 313 } 314 else { 315 actions_.add_action(unique_id, state, wrapper_type::call(act)); 316 } 317 } 318 // template <typename F> 319 // void add_action(std::size_t unique_id, char_type const* state, F act) 320 // { 321 // typedef typename Functor::wrap_action_type wrapper_type; 322 // actions_.add_action(unique_id, add_state(state), wrapper_type::call(act)); 323 // } 324 325 // We do not minimize the state machine by default anymore because 326 // Ben said: "If you can afford to generate a lexer at runtime, there 327 // is little point in calling minimise." 328 // Go figure. 
init_dfa(bool minimize=false) const329 bool init_dfa(bool minimize = false) const 330 { 331 if (!initialized_dfa_) { 332 state_machine_.clear(); 333 typedef boost::lexer::basic_generator<char_type> generator; 334 generator::build (rules_, state_machine_); 335 if (minimize) 336 generator::minimise (state_machine_); 337 338 #if defined(BOOST_SPIRIT_LEXERTL_DEBUG) 339 boost::lexer::debug::dump(state_machine_, std::cerr); 340 #endif 341 initialized_dfa_ = true; 342 343 // // release memory held by rules description 344 // basic_rules_type rules; 345 // rules.init_state_info(rules_); // preserve states 346 // std::swap(rules, rules_); 347 } 348 return true; 349 } 350 351 private: 352 // lexertl specific data 353 mutable boost::lexer::basic_state_machine<char_type> state_machine_; 354 boost::lexer::regex_flags flags_; 355 /*mutable*/ basic_rules_type rules_; 356 357 typename Functor::semantic_actions_type actions_; 358 mutable bool initialized_dfa_; 359 360 // generator functions must be able to access members directly 361 template <typename Lexer, typename F> 362 friend bool generate_static(Lexer const& 363 , std::basic_ostream<typename Lexer::char_type>& 364 , typename Lexer::char_type const*, F); 365 }; 366 367 /////////////////////////////////////////////////////////////////////////// 368 // 369 // The actor_lexer class is another implementation of a Spirit.Lex 370 // lexer on top of Ben Hanson's lexertl library as outlined above (For 371 // more information about lexertl go here: 372 // http://www.benhanson.net/lexertl.html). 373 // 374 // The only difference to the lexer class above is that token_def 375 // definitions may have semantic (lexer) actions attached while being 376 // defined: 377 // 378 // int w; 379 // token_def word = "[^ \t\n]+"; 380 // self = word[++ref(w)]; // see example: word_count_lexer 381 // 382 // This class is supposed to be used as the first and only template 383 // parameter while instantiating instances of a lex::lexer class. 
384 // 385 /////////////////////////////////////////////////////////////////////////// 386 template <typename Token = token<> 387 , typename Iterator = typename Token::iterator_type 388 , typename Functor = functor<Token, lexertl::detail::data, Iterator, mpl::true_> > 389 class actor_lexer : public lexer<Token, Iterator, Functor> 390 { 391 protected: 392 // Lexer instances can be created by means of a derived class only. actor_lexer(unsigned int flags)393 actor_lexer(unsigned int flags) 394 : lexer<Token, Iterator, Functor>(flags) {} 395 }; 396 397 }}}} 398 399 #endif 400