1 // parser.hpp 2 // Copyright (c) 2007-2009 Ben Hanson (http://www.benhanson.net/) 3 // 4 // Distributed under the Boost Software License, Version 1.0. (See accompanying 5 // file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) 6 #ifndef BOOST_LEXER_PARSER_HPP 7 #define BOOST_LEXER_PARSER_HPP 8 9 #include <boost/assert.hpp> 10 #include "tree/end_node.hpp" 11 #include "tree/iteration_node.hpp" 12 #include "tree/leaf_node.hpp" 13 #include "../runtime_error.hpp" 14 #include "tree/selection_node.hpp" 15 #include "tree/sequence_node.hpp" 16 #include "../size_t.hpp" 17 #include "tokeniser/re_tokeniser.hpp" 18 19 namespace boost 20 { 21 namespace lexer 22 { 23 namespace detail 24 { 25 template<typename CharT> 26 class basic_parser 27 { 28 public: 29 typedef basic_re_tokeniser<CharT> tokeniser; 30 typedef typename tokeniser::string string; 31 typedef std::map<string, const node *> macro_map; 32 typedef node::node_ptr_vector node_ptr_vector; 33 typedef typename tokeniser::num_token token; 34 35 /* 36 General principles of regex parsing: 37 - Every regex is a sequence of sub-regexes. 38 - Regexes consist of operands and operators 39 - All operators decompose to sequence, selection ('|') and iteration ('*') 40 - Regex tokens are stored on the stack. 41 - When a complete sequence of regex tokens is on the stack it is processed. 42 43 Grammar: 44 45 <REGEX> -> <OREXP> 46 <OREXP> -> <SEQUENCE> | <OREXP>'|'<SEQUENCE> 47 <SEQUENCE> -> <SUB> 48 <SUB> -> <EXPRESSION> | <SUB><EXPRESSION> 49 <EXPRESSION> -> <REPEAT> 50 <REPEAT> -> charset | macro | '('<REGEX>')' | <REPEAT><DUPLICATE> 51 <DUPLICATE> -> '?' | '*' | '+' | '{n[,[m]]}' 52 */ parse(const CharT * start_,const CharT * const end_,const std::size_t id_,const std::size_t unique_id_,const std::size_t dfa_state_,const regex_flags flags_,const std::locale & locale_,node_ptr_vector & node_ptr_vector_,const macro_map & macromap_,typename tokeniser::token_map & map_,bool & seen_BOL_assertion_,bool & seen_EOL_assertion_)53 static node *parse (const CharT *start_, const CharT * const end_, 54 const std::size_t id_, const std::size_t unique_id_, 55 const std::size_t dfa_state_, const regex_flags flags_, 56 const std::locale &locale_, node_ptr_vector &node_ptr_vector_, 57 const macro_map ¯omap_, typename tokeniser::token_map &map_, 58 bool &seen_BOL_assertion_, bool &seen_EOL_assertion_) 59 { 60 node *root_ = 0; 61 state state_ (start_, end_, flags_, locale_); 62 token lhs_token_; 63 token rhs_token_; 64 token_stack token_stack_; 65 tree_node_stack tree_node_stack_; 66 char action_ = 0; 67 68 token_stack_.push (rhs_token_); 69 tokeniser::next (state_, map_, rhs_token_); 70 71 do 72 { 73 lhs_token_ = token_stack_.top (); 74 action_ = lhs_token_.precedence (rhs_token_._type); 75 76 switch (action_) 77 { 78 case '<': 79 case '=': 80 token_stack_.push (rhs_token_); 81 tokeniser::next (state_, map_, rhs_token_); 82 break; 83 case '>': 84 reduce (token_stack_, macromap_, node_ptr_vector_, 85 tree_node_stack_); 86 break; 87 default: 88 std::ostringstream ss_; 89 90 ss_ << "A syntax error occurred: '" << 91 lhs_token_.precedence_string () << 92 "' against '" << rhs_token_.precedence_string () << 93 "' at index " << state_.index () << "."; 94 throw runtime_error (ss_.str ().c_str ()); 95 break; 96 } 97 } while (!token_stack_.empty ()); 98 99 if (tree_node_stack_.empty ()) 100 { 101 throw runtime_error ("Empty rules are not allowed."); 102 } 103 104 BOOST_ASSERT(tree_node_stack_.size () == 1); 105 106 node *lhs_node_ = tree_node_stack_.top (); 107 108 tree_node_stack_.pop (); 109 110 if (id_ == 0) 111 { 112 // Macros have no end state... 113 root_ = lhs_node_; 114 } 115 else 116 { 117 node_ptr_vector_->push_back (static_cast<end_node *>(0)); 118 119 node *rhs_node_ = new end_node (id_, unique_id_, dfa_state_); 120 121 node_ptr_vector_->back () = rhs_node_; 122 node_ptr_vector_->push_back (static_cast<sequence_node *>(0)); 123 node_ptr_vector_->back () = new sequence_node 124 (lhs_node_, rhs_node_); 125 root_ = node_ptr_vector_->back (); 126 } 127 128 // Done this way as bug in VC++ 6 prevents |= operator working 129 // properly! 130 if (state_._seen_BOL_assertion) seen_BOL_assertion_ = true; 131 132 if (state_._seen_EOL_assertion) seen_EOL_assertion_ = true; 133 134 return root_; 135 } 136 137 private: 138 typedef typename tokeniser::state state; 139 typedef std::stack<token> token_stack; 140 typedef node::node_stack tree_node_stack; 141 reduce(token_stack & token_stack_,const macro_map & macromap_,node_ptr_vector & node_vector_ptr_,tree_node_stack & tree_node_stack_)142 static void reduce (token_stack &token_stack_, 143 const macro_map ¯omap_, node_ptr_vector &node_vector_ptr_, 144 tree_node_stack &tree_node_stack_) 145 { 146 typename tokeniser::num_token lhs_; 147 typename tokeniser::num_token rhs_; 148 token_stack handle_; 149 char action_ = 0; 150 151 do 152 { 153 rhs_ = token_stack_.top (); 154 token_stack_.pop (); 155 handle_.push (rhs_); 156 157 if (!token_stack_.empty ()) 158 { 159 lhs_ = token_stack_.top (); 160 action_ = lhs_.precedence (rhs_._type); 161 } 162 } while (!token_stack_.empty () && action_ == '='); 163 164 BOOST_ASSERT(token_stack_.empty () || action_ == '<'); 165 166 switch (rhs_._type) 167 { 168 case token::BEGIN: 169 // finished processing so exit 170 break; 171 case token::REGEX: 172 // finished parsing, nothing to do 173 break; 174 case token::OREXP: 175 orexp (handle_, token_stack_, node_vector_ptr_, tree_node_stack_); 176 break; 177 case token::SEQUENCE: 178 token_stack_.push (token::OREXP); 179 break; 180 case token::SUB: 181 sub (handle_, token_stack_, node_vector_ptr_, tree_node_stack_); 182 break; 183 case token::EXPRESSION: 184 token_stack_.push (token::SUB); 185 break; 186 case token::REPEAT: 187 repeat (handle_, token_stack_); 188 break; 189 case token::CHARSET: 190 charset (handle_, token_stack_, node_vector_ptr_, 191 tree_node_stack_); 192 break; 193 case token::MACRO: 194 macro (handle_, token_stack_, macromap_, node_vector_ptr_, 195 tree_node_stack_); 196 break; 197 case token::OPENPAREN: 198 openparen (handle_, token_stack_); 199 break; 200 case token::OPT: 201 case token::AOPT: 202 optional (rhs_._type == token::OPT, node_vector_ptr_, 203 tree_node_stack_); 204 token_stack_.push (token::DUP); 205 break; 206 case token::ZEROORMORE: 207 case token::AZEROORMORE: 208 zero_or_more (rhs_._type == token::ZEROORMORE, node_vector_ptr_, 209 tree_node_stack_); 210 token_stack_.push (token::DUP); 211 break; 212 case token::ONEORMORE: 213 case token::AONEORMORE: 214 one_or_more (rhs_._type == token::ONEORMORE, node_vector_ptr_, 215 tree_node_stack_); 216 token_stack_.push (token::DUP); 217 break; 218 case token::REPEATN: 219 case token::AREPEATN: 220 repeatn (rhs_._type == token::REPEATN, handle_.top (), 221 node_vector_ptr_, tree_node_stack_); 222 token_stack_.push (token::DUP); 223 break; 224 default: 225 throw runtime_error 226 ("Internal error regex_parser::reduce"); 227 break; 228 } 229 } 230 orexp(token_stack & handle_,token_stack & token_stack_,node_ptr_vector & node_ptr_vector_,tree_node_stack & tree_node_stack_)231 static void orexp (token_stack &handle_, token_stack &token_stack_, 232 node_ptr_vector &node_ptr_vector_, tree_node_stack &tree_node_stack_) 233 { 234 BOOST_ASSERT(handle_.top ()._type == token::OREXP && 235 (handle_.size () == 1 || handle_.size () == 3)); 236 237 if (handle_.size () == 1) 238 { 239 token_stack_.push (token::REGEX); 240 } 241 else 242 { 243 handle_.pop (); 244 BOOST_ASSERT(handle_.top ()._type == token::OR); 245 handle_.pop (); 246 BOOST_ASSERT(handle_.top ()._type == token::SEQUENCE); 247 perform_or (node_ptr_vector_, tree_node_stack_); 248 token_stack_.push (token::OREXP); 249 } 250 } 251 sub(token_stack & handle_,token_stack & token_stack_,node_ptr_vector & node_ptr_vector_,tree_node_stack & tree_node_stack_)252 static void sub (token_stack &handle_, token_stack &token_stack_, 253 node_ptr_vector &node_ptr_vector_, tree_node_stack &tree_node_stack_) 254 { 255 BOOST_ASSERT(handle_.top ()._type == token::SUB && 256 (handle_.size () == 1 || handle_.size () == 2)); 257 258 if (handle_.size () == 1) 259 { 260 token_stack_.push (token::SEQUENCE); 261 } 262 else 263 { 264 handle_.pop (); 265 BOOST_ASSERT(handle_.top ()._type == token::EXPRESSION); 266 // perform join 267 sequence (node_ptr_vector_, tree_node_stack_); 268 token_stack_.push (token::SUB); 269 } 270 } 271 repeat(token_stack & handle_,token_stack & token_stack_)272 static void repeat (token_stack &handle_, token_stack &token_stack_) 273 { 274 BOOST_ASSERT(handle_.top ()._type == token::REPEAT && 275 handle_.size () >= 1 && handle_.size () <= 3); 276 277 if (handle_.size () == 1) 278 { 279 token_stack_.push (token::EXPRESSION); 280 } 281 else 282 { 283 handle_.pop (); 284 BOOST_ASSERT(handle_.top ()._type == token::DUP); 285 token_stack_.push (token::REPEAT); 286 } 287 } 288 charset(token_stack & handle_,token_stack & token_stack_,node_ptr_vector & node_ptr_vector_,tree_node_stack & tree_node_stack_)289 static void charset (token_stack &handle_, token_stack &token_stack_, 290 node_ptr_vector &node_ptr_vector_, tree_node_stack &tree_node_stack_) 291 { 292 BOOST_ASSERT(handle_.top ()._type == token::CHARSET && 293 handle_.size () == 1); 294 // store charset 295 node_ptr_vector_->push_back (static_cast<leaf_node *>(0)); 296 297 const size_t id_ = handle_.top ()._id; 298 299 node_ptr_vector_->back () = new leaf_node (id_, true); 300 tree_node_stack_.push (node_ptr_vector_->back ()); 301 token_stack_.push (token::REPEAT); 302 } 303 macro(token_stack & handle_,token_stack & token_stack_,const macro_map & macromap_,node_ptr_vector & node_ptr_vector_,tree_node_stack & tree_node_stack_)304 static void macro (token_stack &handle_, token_stack &token_stack_, 305 const macro_map ¯omap_, node_ptr_vector &node_ptr_vector_, 306 tree_node_stack &tree_node_stack_) 307 { 308 token &top_ = handle_.top (); 309 310 BOOST_ASSERT(top_._type == token::MACRO && handle_.size () == 1); 311 312 typename macro_map::const_iterator iter_ = 313 macromap_.find (top_._macro); 314 315 if (iter_ == macromap_.end ()) 316 { 317 const CharT *name_ = top_._macro; 318 std::basic_stringstream<CharT> ss_; 319 std::ostringstream os_; 320 321 os_ << "Unknown MACRO name '"; 322 323 while (*name_) 324 { 325 os_ << ss_.narrow (*name_++, ' '); 326 } 327 328 os_ << "'."; 329 throw runtime_error (os_.str ()); 330 } 331 332 tree_node_stack_.push (iter_->second->copy (node_ptr_vector_)); 333 token_stack_.push (token::REPEAT); 334 } 335 openparen(token_stack & handle_,token_stack & token_stack_)336 static void openparen (token_stack &handle_, token_stack &token_stack_) 337 { 338 BOOST_ASSERT(handle_.top ()._type == token::OPENPAREN && 339 handle_.size () == 3); 340 handle_.pop (); 341 BOOST_ASSERT(handle_.top ()._type == token::REGEX); 342 handle_.pop (); 343 BOOST_ASSERT(handle_.top ()._type == token::CLOSEPAREN); 344 token_stack_.push (token::REPEAT); 345 } 346 perform_or(node_ptr_vector & node_ptr_vector_,tree_node_stack & tree_node_stack_)347 static void perform_or (node_ptr_vector &node_ptr_vector_, 348 tree_node_stack &tree_node_stack_) 349 { 350 // perform or 351 node *rhs_ = tree_node_stack_.top (); 352 353 tree_node_stack_.pop (); 354 355 node *lhs_ = tree_node_stack_.top (); 356 357 node_ptr_vector_->push_back (static_cast<selection_node *>(0)); 358 node_ptr_vector_->back () = new selection_node (lhs_, rhs_); 359 tree_node_stack_.top () = node_ptr_vector_->back (); 360 } 361 sequence(node_ptr_vector & node_ptr_vector_,tree_node_stack & tree_node_stack_)362 static void sequence (node_ptr_vector &node_ptr_vector_, 363 tree_node_stack &tree_node_stack_) 364 { 365 node *rhs_ = tree_node_stack_.top (); 366 367 tree_node_stack_.pop (); 368 369 node *lhs_ = tree_node_stack_.top (); 370 371 node_ptr_vector_->push_back (static_cast<sequence_node *>(0)); 372 node_ptr_vector_->back () = new sequence_node (lhs_, rhs_); 373 tree_node_stack_.top () = node_ptr_vector_->back (); 374 } 375 optional(const bool greedy_,node_ptr_vector & node_ptr_vector_,tree_node_stack & tree_node_stack_)376 static void optional (const bool greedy_, 377 node_ptr_vector &node_ptr_vector_, tree_node_stack &tree_node_stack_) 378 { 379 // perform ? 380 node *lhs_ = tree_node_stack_.top (); 381 // You don't know if lhs_ is a leaf_node, so get firstpos. 382 node::node_vector &firstpos_ = lhs_->firstpos (); 383 384 for (node::node_vector::iterator iter_ = firstpos_.begin (), 385 end_ = firstpos_.end (); iter_ != end_; ++iter_) 386 { 387 // These are leaf_nodes! 388 (*iter_)->greedy (greedy_); 389 } 390 391 node_ptr_vector_->push_back (static_cast<leaf_node *>(0)); 392 393 node *rhs_ = new leaf_node (null_token, greedy_); 394 395 node_ptr_vector_->back () = rhs_; 396 node_ptr_vector_->push_back (static_cast<selection_node *>(0)); 397 node_ptr_vector_->back () = new selection_node (lhs_, rhs_); 398 tree_node_stack_.top () = node_ptr_vector_->back (); 399 } 400 zero_or_more(const bool greedy_,node_ptr_vector & node_ptr_vector_,tree_node_stack & tree_node_stack_)401 static void zero_or_more (const bool greedy_, 402 node_ptr_vector &node_ptr_vector_, tree_node_stack &tree_node_stack_) 403 { 404 // perform * 405 node *ptr_ = tree_node_stack_.top (); 406 407 node_ptr_vector_->push_back (static_cast<iteration_node *>(0)); 408 node_ptr_vector_->back () = new iteration_node (ptr_, greedy_); 409 tree_node_stack_.top () = node_ptr_vector_->back (); 410 } 411 one_or_more(const bool greedy_,node_ptr_vector & node_ptr_vector_,tree_node_stack & tree_node_stack_)412 static void one_or_more (const bool greedy_, 413 node_ptr_vector &node_ptr_vector_, tree_node_stack &tree_node_stack_) 414 { 415 // perform + 416 node *lhs_ = tree_node_stack_.top (); 417 node *copy_ = lhs_->copy (node_ptr_vector_); 418 419 node_ptr_vector_->push_back (static_cast<iteration_node *>(0)); 420 421 node *rhs_ = new iteration_node (copy_, greedy_); 422 423 node_ptr_vector_->back () = rhs_; 424 node_ptr_vector_->push_back (static_cast<sequence_node *>(0)); 425 node_ptr_vector_->back () = new sequence_node (lhs_, rhs_); 426 tree_node_stack_.top () = node_ptr_vector_->back (); 427 } 428 429 // This is one of the most mind bending routines in this code... repeatn(const bool greedy_,const token & token_,node_ptr_vector & node_ptr_vector_,tree_node_stack & tree_node_stack_)430 static void repeatn (const bool greedy_, const token &token_, 431 node_ptr_vector &node_ptr_vector_, tree_node_stack &tree_node_stack_) 432 { 433 // perform {n[,[m]]} 434 // Semantic checks have already been performed. 435 // {0,} = * 436 // {0,1} = ? 437 // {1,} = + 438 // therefore we do not check for these cases. 439 if (!(token_._min == 1 && !token_._comma)) 440 { 441 const std::size_t top_ = token_._min > 0 ? 442 token_._min : token_._max; 443 444 if (token_._min == 0) 445 { 446 optional (greedy_, node_ptr_vector_, tree_node_stack_); 447 } 448 449 node *prev_ = tree_node_stack_.top ()->copy (node_ptr_vector_); 450 node *curr_ = 0; 451 452 for (std::size_t i_ = 2; i_ < top_; ++i_) 453 { 454 curr_ = prev_->copy (node_ptr_vector_); 455 tree_node_stack_.push (static_cast<node *>(0)); 456 tree_node_stack_.top () = prev_; 457 sequence (node_ptr_vector_, tree_node_stack_); 458 prev_ = curr_; 459 } 460 461 if (token_._comma && token_._min > 0) 462 { 463 if (token_._min > 1) 464 { 465 curr_ = prev_->copy (node_ptr_vector_); 466 tree_node_stack_.push (static_cast<node *>(0)); 467 tree_node_stack_.top () = prev_; 468 sequence (node_ptr_vector_, tree_node_stack_); 469 prev_ = curr_; 470 } 471 472 if (token_._comma && token_._max) 473 { 474 tree_node_stack_.push (static_cast<node *>(0)); 475 tree_node_stack_.top () = prev_; 476 optional (greedy_, node_ptr_vector_, tree_node_stack_); 477 prev_ = tree_node_stack_.top (); 478 tree_node_stack_.pop (); 479 480 const std::size_t count_ = token_._max - token_._min; 481 482 for (std::size_t i_ = 1; i_ < count_; ++i_) 483 { 484 curr_ = prev_->copy (node_ptr_vector_); 485 tree_node_stack_.push (static_cast<node *>(0)); 486 tree_node_stack_.top () = prev_; 487 sequence (node_ptr_vector_, tree_node_stack_); 488 prev_ = curr_; 489 } 490 } 491 else 492 { 493 tree_node_stack_.push (static_cast<node *>(0)); 494 tree_node_stack_.top () = prev_; 495 zero_or_more (greedy_, node_ptr_vector_, tree_node_stack_); 496 prev_ = tree_node_stack_.top (); 497 tree_node_stack_.pop (); 498 } 499 } 500 501 tree_node_stack_.push (static_cast<node *>(0)); 502 tree_node_stack_.top () = prev_; 503 sequence (node_ptr_vector_, tree_node_stack_); 504 } 505 } 506 }; 507 } 508 } 509 } 510 511 #endif 512