1 // tokeniser.hpp 2 // Copyright (c) 2007-2009 Ben Hanson (http://www.benhanson.net/) 3 // 4 // Distributed under the Boost Software License, Version 1.0. (See accompanying 5 // file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) 6 #ifndef BOOST_LEXER_RE_TOKENISER_HPP 7 #define BOOST_LEXER_RE_TOKENISER_HPP 8 9 // memcpy() 10 #include <cstring> 11 #include <map> 12 #include "num_token.hpp" 13 #include "../../runtime_error.hpp" 14 #include "../../size_t.hpp" 15 #include <sstream> 16 #include "../../string_token.hpp" 17 #include "re_tokeniser_helper.hpp" 18 19 namespace boost 20 { 21 namespace lexer 22 { 23 namespace detail 24 { 25 template<typename CharT> 26 class basic_re_tokeniser 27 { 28 public: 29 typedef basic_num_token<CharT> num_token; 30 typedef basic_re_tokeniser_state<CharT> state; 31 typedef basic_string_token<CharT> string_token; 32 typedef typename string_token::string string; 33 typedef std::map<string_token, std::size_t> token_map; 34 typedef std::pair<string_token, std::size_t> token_pair; 35 next(state & state_,token_map & map_,num_token & token_)36 static void next (state &state_, token_map &map_, num_token &token_) 37 { 38 CharT ch_ = 0; 39 bool eos_ = state_.next (ch_); 40 41 token_.min_max (0, false, 0); 42 43 while (!eos_ && ch_ == '"') 44 { 45 state_._in_string ^= 1; 46 eos_ = state_.next (ch_); 47 } 48 49 if (eos_) 50 { 51 if (state_._in_string) 52 { 53 throw runtime_error ("Unexpected end of regex " 54 "(missing '\"')."); 55 } 56 57 if (state_._paren_count) 58 { 59 throw runtime_error ("Unexpected end of regex " 60 "(missing ')')."); 61 } 62 63 token_.set (num_token::END, null_token); 64 } 65 else 66 { 67 if (ch_ == '\\') 68 { 69 // Even if we are in a string, respect escape sequences... 70 escape (state_, map_, token_); 71 } 72 else if (state_._in_string) 73 { 74 // All other meta characters lose their special meaning 75 // inside a string. 76 create_charset_token (string (1, ch_), false, map_, token_); 77 } 78 else 79 { 80 // Not an escape sequence and not inside a string, so 81 // check for meta characters. 82 switch (ch_) 83 { 84 case '(': 85 token_.set (num_token::OPENPAREN, null_token); 86 ++state_._paren_count; 87 read_options (state_); 88 break; 89 case ')': 90 --state_._paren_count; 91 92 if (state_._paren_count < 0) 93 { 94 std::ostringstream ss_; 95 96 ss_ << "Number of open parenthesis < 0 at index " << 97 state_.index () - 1 << '.'; 98 throw runtime_error (ss_.str ().c_str ()); 99 } 100 101 token_.set (num_token::CLOSEPAREN, null_token); 102 103 if (!state_._flags_stack.empty ()) 104 { 105 state_._flags = state_._flags_stack.top (); 106 state_._flags_stack.pop (); 107 } 108 break; 109 case '?': 110 if (!state_.eos () && *state_._curr == '?') 111 { 112 token_.set (num_token::AOPT, null_token); 113 state_.increment (); 114 } 115 else 116 { 117 token_.set (num_token::OPT, null_token); 118 } 119 120 break; 121 case '*': 122 if (!state_.eos () && *state_._curr == '?') 123 { 124 token_.set (num_token::AZEROORMORE, null_token); 125 state_.increment (); 126 } 127 else 128 { 129 token_.set (num_token::ZEROORMORE, null_token); 130 } 131 132 break; 133 case '+': 134 if (!state_.eos () && *state_._curr == '?') 135 { 136 token_.set (num_token::AONEORMORE, null_token); 137 state_.increment (); 138 } 139 else 140 { 141 token_.set (num_token::ONEORMORE, null_token); 142 } 143 144 break; 145 case '{': 146 open_curly (state_, token_); 147 break; 148 case '|': 149 token_.set (num_token::OR, null_token); 150 break; 151 case '^': 152 if (state_._curr - 1 == state_._start) 153 { 154 token_.set (num_token::CHARSET, bol_token); 155 state_._seen_BOL_assertion = true; 156 } 157 else 158 { 159 create_charset_token (string (1, ch_), false, 160 map_, token_); 161 } 162 163 break; 164 case '$': 165 if (state_._curr == state_._end) 166 { 167 token_.set (num_token::CHARSET, eol_token); 168 state_._seen_EOL_assertion = true; 169 } 170 else 171 { 172 create_charset_token (string (1, ch_), false, 173 map_, token_); 174 } 175 176 break; 177 case '.': 178 { 179 string dot_; 180 181 if (state_._flags & dot_not_newline) 182 { 183 dot_ = '\n'; 184 } 185 186 create_charset_token (dot_, true, map_, token_); 187 break; 188 } 189 case '[': 190 { 191 charset (state_, map_, token_); 192 break; 193 } 194 case '/': 195 throw runtime_error("Lookahead ('/') is not supported yet."); 196 break; 197 default: 198 if ((state_._flags & icase) && 199 (std::isupper (ch_, state_._locale) || 200 std::islower (ch_, state_._locale))) 201 { 202 CharT upper_ = std::toupper (ch_, state_._locale); 203 CharT lower_ = std::tolower (ch_, state_._locale); 204 205 string str_ (1, upper_); 206 207 str_ += lower_; 208 create_charset_token (str_, false, map_, token_); 209 } 210 else 211 { 212 create_charset_token (string (1, ch_), false, 213 map_, token_); 214 } 215 216 break; 217 } 218 } 219 } 220 } 221 222 private: 223 typedef basic_re_tokeniser_helper<CharT> tokeniser_helper; 224 read_options(state & state_)225 static void read_options (state &state_) 226 { 227 if (!state_.eos () && *state_._curr == '?') 228 { 229 CharT ch_ = 0; 230 bool eos_ = false; 231 bool negate_ = false; 232 233 state_.increment (); 234 eos_ = state_.next (ch_); 235 state_._flags_stack.push (state_._flags); 236 237 while (!eos_ && ch_ != ':') 238 { 239 switch (ch_) 240 { 241 case '-': 242 negate_ ^= 1; 243 break; 244 case 'i': 245 if (negate_) 246 { 247 state_._flags = static_cast<regex_flags> 248 (state_._flags & ~icase); 249 } 250 else 251 { 252 state_._flags = static_cast<regex_flags> 253 (state_._flags | icase); 254 } 255 256 negate_ = false; 257 break; 258 case 's': 259 if (negate_) 260 { 261 state_._flags = static_cast<regex_flags> 262 (state_._flags | dot_not_newline); 263 } 264 else 265 { 266 state_._flags = static_cast<regex_flags> 267 (state_._flags & ~dot_not_newline); 268 } 269 270 negate_ = false; 271 break; 272 default: 273 { 274 std::ostringstream ss_; 275 276 ss_ << "Unknown option at index " << 277 state_.index () - 1 << '.'; 278 throw runtime_error (ss_.str ().c_str ()); 279 } 280 } 281 282 eos_ = state_.next (ch_); 283 } 284 285 // End of string handler will handle early termination 286 } 287 else if (!state_._flags_stack.empty ()) 288 { 289 state_._flags_stack.push (state_._flags); 290 } 291 } 292 escape(state & state_,token_map & map_,num_token & token_)293 static void escape (state &state_, token_map &map_, num_token &token_) 294 { 295 CharT ch_ = 0; 296 std::size_t str_len_ = 0; 297 const CharT *str_ = tokeniser_helper::escape_sequence (state_, 298 ch_, str_len_); 299 300 if (str_) 301 { 302 state state2_ (str_ + 1, str_ + str_len_, state_._flags, 303 state_._locale); 304 305 charset (state2_, map_, token_); 306 } 307 else 308 { 309 create_charset_token (string (1, ch_), false, map_, token_); 310 } 311 } 312 charset(state & state_,token_map & map_,num_token & token_)313 static void charset (state &state_, token_map &map_, num_token &token_) 314 { 315 string chars_; 316 bool negated_ = false; 317 318 tokeniser_helper::charset (state_, chars_, negated_); 319 create_charset_token (chars_, negated_, map_, token_); 320 } 321 create_charset_token(const string & charset_,const bool negated_,token_map & map_,num_token & token_)322 static void create_charset_token (const string &charset_, 323 const bool negated_, token_map &map_, num_token &token_) 324 { 325 std::size_t id_ = null_token; 326 string_token stok_ (negated_, charset_); 327 328 stok_.remove_duplicates (); 329 stok_.normalise (); 330 331 typename token_map::const_iterator iter_ = map_.find (stok_); 332 333 if (iter_ == map_.end ()) 334 { 335 id_ = map_.size (); 336 map_.insert (token_pair (stok_, id_)); 337 } 338 else 339 { 340 id_ = iter_->second; 341 } 342 343 token_.set (num_token::CHARSET, id_); 344 } 345 open_curly(state & state_,num_token & token_)346 static void open_curly (state &state_, num_token &token_) 347 { 348 if (state_.eos ()) 349 { 350 throw runtime_error ("Unexpected end of regex " 351 "(missing '}')."); 352 } 353 else if (*state_._curr >= '0' && *state_._curr <= '9') 354 { 355 repeat_n (state_, token_); 356 357 if (!state_.eos () && *state_._curr == '?') 358 { 359 token_._type = num_token::AREPEATN; 360 state_.increment (); 361 } 362 } 363 else 364 { 365 macro (state_, token_); 366 } 367 } 368 369 // SYNTAX: 370 // {n[,[n]]} 371 // SEMANTIC RULES: 372 // {0} - INVALID (throw exception) 373 // {0,} = * 374 // {0,0} - INVALID (throw exception) 375 // {0,1} = ? 376 // {1,} = + 377 // {min,max} where min == max - {min} 378 // {min,max} where max < min - INVALID (throw exception) repeat_n(state & state_,num_token & token_)379 static void repeat_n (state &state_, num_token &token_) 380 { 381 CharT ch_ = 0; 382 bool eos_ = state_.next (ch_); 383 384 while (!eos_ && ch_ >= '0' && ch_ <= '9') 385 { 386 token_._min *= 10; 387 token_._min += ch_ - '0'; 388 eos_ = state_.next (ch_); 389 } 390 391 if (eos_) 392 { 393 throw runtime_error ("Unexpected end of regex " 394 "(missing '}')."); 395 } 396 397 bool min_max_ = false; 398 bool repeatn_ = true; 399 400 token_._comma = ch_ == ','; 401 402 if (token_._comma) 403 { 404 eos_ = state_.next (ch_); 405 406 if (eos_) 407 { 408 throw runtime_error ("Unexpected end of regex " 409 "(missing '}')."); 410 } 411 412 if (ch_ == '}') 413 { 414 // Small optimisation: Check for '*' equivalency. 415 if (token_._min == 0) 416 { 417 token_.set (num_token::ZEROORMORE, null_token); 418 repeatn_ = false; 419 } 420 // Small optimisation: Check for '+' equivalency. 421 else if (token_._min == 1) 422 { 423 token_.set (num_token::ONEORMORE, null_token); 424 repeatn_ = false; 425 } 426 } 427 else 428 { 429 if (ch_ < '0' || ch_ > '9') 430 { 431 std::ostringstream ss_; 432 433 ss_ << "Missing '}' at index " << 434 state_.index () - 1 << '.'; 435 throw runtime_error (ss_.str ().c_str ()); 436 } 437 438 min_max_ = true; 439 440 do 441 { 442 token_._max *= 10; 443 token_._max += ch_ - '0'; 444 eos_ = state_.next (ch_); 445 } while (!eos_ && ch_ >= '0' && ch_ <= '9'); 446 447 if (eos_) 448 { 449 throw runtime_error ("Unexpected end of regex " 450 "(missing '}')."); 451 } 452 453 // Small optimisation: Check for '?' equivalency. 454 if (token_._min == 0 && token_._max == 1) 455 { 456 token_.set (num_token::OPT, null_token); 457 repeatn_ = false; 458 } 459 // Small optimisation: if min == max, then min. 460 else if (token_._min == token_._max) 461 { 462 token_._comma = false; 463 min_max_ = false; 464 token_._max = 0; 465 } 466 } 467 } 468 469 if (ch_ != '}') 470 { 471 std::ostringstream ss_; 472 473 ss_ << "Missing '}' at index " << state_.index () - 1 << '.'; 474 throw runtime_error (ss_.str ().c_str ()); 475 } 476 477 if (repeatn_) 478 { 479 // SEMANTIC VALIDATION follows: 480 // NOTE: {0,} has already become * 481 // therefore we don't check for a comma. 482 if (token_._min == 0 && token_._max == 0) 483 { 484 std::ostringstream ss_; 485 486 ss_ << "Cannot have exactly zero repeats preceding index " << 487 state_.index () << '.'; 488 throw runtime_error (ss_.str ().c_str ()); 489 } 490 491 if (min_max_ && token_._max < token_._min) 492 { 493 std::ostringstream ss_; 494 495 ss_ << "Max less than min preceding index " << 496 state_.index () << '.'; 497 throw runtime_error (ss_.str ().c_str ()); 498 } 499 500 token_.set (num_token::REPEATN, null_token); 501 } 502 } 503 macro(state & state_,num_token & token_)504 static void macro (state &state_, num_token &token_) 505 { 506 CharT ch_ = 0; 507 bool eos_ = false; 508 const CharT *start_ = state_._curr; 509 510 state_.next (ch_); 511 512 if (ch_ != '_' && !(ch_ >= 'A' && ch_ <= 'Z') && 513 !(ch_ >= 'a' && ch_ <= 'z')) 514 { 515 std::ostringstream ss_; 516 517 ss_ << "Invalid MACRO name at index " << 518 state_.index () - 1 << '.'; 519 throw runtime_error (ss_.str ().c_str ()); 520 } 521 522 do 523 { 524 eos_ = state_.next (ch_); 525 526 if (eos_) 527 { 528 throw runtime_error ("Unexpected end of regex " 529 "(missing '}')."); 530 } 531 } while (ch_ == '_' || ch_ == '-' || (ch_ >= 'A' && ch_ <= 'Z') || 532 (ch_ >= 'a' && ch_ <= 'z') || (ch_ >= '0' && ch_ <= '9')); 533 534 if (ch_ != '}') 535 { 536 std::ostringstream ss_; 537 538 ss_ << "Missing '}' at index " << state_.index () - 1 << '.'; 539 throw runtime_error (ss_.str ().c_str ()); 540 } 541 542 std::size_t len_ = state_._curr - 1 - start_; 543 544 if (len_ > max_macro_len) 545 { 546 std::basic_stringstream<CharT> ss_; 547 std::ostringstream os_; 548 549 os_ << "MACRO name '"; 550 551 while (len_) 552 { 553 os_ << ss_.narrow (*start_++, ' '); 554 --len_; 555 } 556 557 os_ << "' too long."; 558 throw runtime_error (os_.str ()); 559 } 560 561 token_.set (num_token::MACRO, null_token); 562 563 // Some systems have memcpy in namespace std. 564 using namespace std; 565 566 memcpy (token_._macro, start_, len_ * sizeof (CharT)); 567 token_._macro[len_] = 0; 568 } 569 }; 570 } 571 } 572 } 573 574 #endif 575