1 // input.hpp 2 // Copyright (c) 2008-2009 Ben Hanson (http://www.benhanson.net/) 3 // 4 // Distributed under the Boost Software License, Version 1.0. (See accompanying 5 // file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) 6 #ifndef BOOST_LEXER_INPUT 7 #define BOOST_LEXER_INPUT 8 9 #include "char_traits.hpp" 10 #include <boost/detail/iterator.hpp> 11 #include "size_t.hpp" 12 #include "state_machine.hpp" 13 14 namespace boost 15 { 16 namespace lexer 17 { 18 template<typename FwdIter, typename Traits = 19 char_traits<typename boost::detail::iterator_traits<FwdIter>::value_type> > 20 class basic_input 21 { 22 public: 23 class iterator 24 { 25 public: 26 friend class basic_input; 27 28 struct data 29 { 30 std::size_t id; 31 std::size_t unique_id; 32 FwdIter start; 33 FwdIter end; 34 bool bol; 35 std::size_t state; 36 37 // Construct in end() state. databoost::lexer::basic_input::iterator::data38 data () : 39 id (0), 40 unique_id (npos), 41 bol (false), 42 state (npos) 43 { 44 } 45 operator ==boost::lexer::basic_input::iterator::data46 bool operator == (const data &rhs_) const 47 { 48 return id == rhs_.id && unique_id == rhs_.unique_id && 49 start == rhs_.start && end == rhs_.end && 50 bol == rhs_.bol && state == rhs_.state; 51 } 52 }; 53 iterator()54 iterator () : 55 _input (0) 56 { 57 } 58 operator ==(const iterator & rhs_) const59 bool operator == (const iterator &rhs_) const 60 { 61 return _data == rhs_._data; 62 } 63 operator !=(const iterator & rhs_) const64 bool operator != (const iterator &rhs_) const 65 { 66 return !(*this == rhs_); 67 } 68 operator *()69 data &operator * () 70 { 71 return _data; 72 } 73 operator ->()74 data *operator -> () 75 { 76 return &_data; 77 } 78 79 // Let compiler generate operator = (). 80 81 // prefix version operator ++()82 iterator &operator ++ () 83 { 84 next_token (); 85 return *this; 86 } 87 88 // postfix version operator ++(int)89 iterator operator ++ (int) 90 { 91 iterator iter_ = *this; 92 93 next_token (); 94 return iter_; 95 } 96 97 private: 98 // Not owner (obviously!) 99 const basic_input *_input; 100 data _data; 101 next_token()102 void next_token () 103 { 104 const detail::internals &internals_ = 105 _input->_state_machine->data (); 106 107 _data.start = _data.end; 108 109 if (internals_._dfa->size () == 1) 110 { 111 if (internals_._seen_BOL_assertion || 112 internals_._seen_EOL_assertion) 113 { 114 _data.id = next 115 (&internals_._lookup->front ()->front (), 116 internals_._dfa_alphabet.front (), 117 &internals_._dfa->front ()->front (), 118 _data.bol, _data.end, _input->_end, _data.unique_id); 119 } 120 else 121 { 122 _data.id = next (&internals_._lookup->front ()->front (), 123 internals_._dfa_alphabet.front (), &internals_. 124 _dfa->front ()->front (), _data.end, _input->_end, 125 _data.unique_id); 126 } 127 } 128 else 129 { 130 if (internals_._seen_BOL_assertion || 131 internals_._seen_EOL_assertion) 132 { 133 _data.id = next (internals_, _data.state, 134 _data.bol, _data.end, _input->_end, _data.unique_id); 135 } 136 else 137 { 138 _data.id = next (internals_, _data.state, 139 _data.end, _input->_end, _data.unique_id); 140 } 141 } 142 143 if (_data.end == _input->_end && _data.start == _data.end) 144 { 145 // Ensure current state matches that returned by end(). 146 _data.state = npos; 147 } 148 } 149 next(const detail::internals & internals_,std::size_t & start_state_,bool bol_,FwdIter & start_token_,const FwdIter & end_,std::size_t & unique_id_)150 std::size_t next (const detail::internals &internals_, 151 std::size_t &start_state_, bool bol_, 152 FwdIter &start_token_, const FwdIter &end_, 153 std::size_t &unique_id_) 154 { 155 if (start_token_ == end_) 156 { 157 unique_id_ = npos; 158 return 0; 159 } 160 161 again: 162 const std::size_t * lookup_ = &internals_._lookup[start_state_]-> 163 front (); 164 std::size_t dfa_alphabet_ = internals_._dfa_alphabet[start_state_]; 165 const std::size_t *dfa_ = &internals_._dfa[start_state_]->front (); 166 const std::size_t *ptr_ = dfa_ + dfa_alphabet_; 167 FwdIter curr_ = start_token_; 168 bool end_state_ = *ptr_ != 0; 169 std::size_t id_ = *(ptr_ + id_index); 170 std::size_t uid_ = *(ptr_ + unique_id_index); 171 std::size_t end_start_state_ = start_state_; 172 bool end_bol_ = bol_; 173 FwdIter end_token_ = start_token_; 174 175 while (curr_ != end_) 176 { 177 const std::size_t BOL_state_ = ptr_[bol_index]; 178 const std::size_t EOL_state_ = ptr_[eol_index]; 179 180 if (BOL_state_ && bol_) 181 { 182 ptr_ = &dfa_[BOL_state_ * dfa_alphabet_]; 183 } 184 else if (EOL_state_ && *curr_ == '\n') 185 { 186 ptr_ = &dfa_[EOL_state_ * dfa_alphabet_]; 187 } 188 else 189 { 190 typename Traits::char_type prev_char_ = *curr_++; 191 192 bol_ = prev_char_ == '\n'; 193 194 const std::size_t state_ = 195 ptr_[lookup_[static_cast<typename Traits::index_type> 196 (prev_char_)]]; 197 198 if (state_ == 0) 199 { 200 break; 201 } 202 203 ptr_ = &dfa_[state_ * dfa_alphabet_]; 204 } 205 206 if (*ptr_) 207 { 208 end_state_ = true; 209 id_ = *(ptr_ + id_index); 210 uid_ = *(ptr_ + unique_id_index); 211 end_start_state_ = *(ptr_ + state_index); 212 end_bol_ = bol_; 213 end_token_ = curr_; 214 } 215 } 216 217 const std::size_t EOL_state_ = ptr_[eol_index]; 218 219 if (EOL_state_ && curr_ == end_) 220 { 221 ptr_ = &dfa_[EOL_state_ * dfa_alphabet_]; 222 223 if (*ptr_) 224 { 225 end_state_ = true; 226 id_ = *(ptr_ + id_index); 227 uid_ = *(ptr_ + unique_id_index); 228 end_start_state_ = *(ptr_ + state_index); 229 end_bol_ = bol_; 230 end_token_ = curr_; 231 } 232 } 233 234 if (end_state_) 235 { 236 // return longest match 237 start_state_ = end_start_state_; 238 start_token_ = end_token_; 239 240 if (id_ == 0) 241 { 242 bol_ = end_bol_; 243 goto again; 244 } 245 else 246 { 247 _data.bol = end_bol_; 248 } 249 } 250 else 251 { 252 // No match causes char to be skipped 253 _data.bol = *start_token_ == '\n'; 254 ++start_token_; 255 id_ = npos; 256 uid_ = npos; 257 } 258 259 unique_id_ = uid_; 260 return id_; 261 } 262 next(const detail::internals & internals_,std::size_t & start_state_,FwdIter & start_token_,FwdIter const & end_,std::size_t & unique_id_)263 std::size_t next (const detail::internals &internals_, 264 std::size_t &start_state_, FwdIter &start_token_, 265 FwdIter const &end_, std::size_t &unique_id_) 266 { 267 if (start_token_ == end_) 268 { 269 unique_id_ = npos; 270 return 0; 271 } 272 273 again: 274 const std::size_t * lookup_ = &internals_._lookup[start_state_]-> 275 front (); 276 std::size_t dfa_alphabet_ = internals_._dfa_alphabet[start_state_]; 277 const std::size_t *dfa_ = &internals_._dfa[start_state_]->front (); 278 const std::size_t *ptr_ = dfa_ + dfa_alphabet_; 279 FwdIter curr_ = start_token_; 280 bool end_state_ = *ptr_ != 0; 281 std::size_t id_ = *(ptr_ + id_index); 282 std::size_t uid_ = *(ptr_ + unique_id_index); 283 std::size_t end_start_state_ = start_state_; 284 FwdIter end_token_ = start_token_; 285 286 while (curr_ != end_) 287 { 288 const std::size_t state_ = ptr_[lookup_[static_cast 289 <typename Traits::index_type>(*curr_++)]]; 290 291 if (state_ == 0) 292 { 293 break; 294 } 295 296 ptr_ = &dfa_[state_ * dfa_alphabet_]; 297 298 if (*ptr_) 299 { 300 end_state_ = true; 301 id_ = *(ptr_ + id_index); 302 uid_ = *(ptr_ + unique_id_index); 303 end_start_state_ = *(ptr_ + state_index); 304 end_token_ = curr_; 305 } 306 } 307 308 if (end_state_) 309 { 310 // return longest match 311 start_state_ = end_start_state_; 312 start_token_ = end_token_; 313 314 if (id_ == 0) goto again; 315 } 316 else 317 { 318 // No match causes char to be skipped 319 ++start_token_; 320 id_ = npos; 321 uid_ = npos; 322 } 323 324 unique_id_ = uid_; 325 return id_; 326 } 327 next(const std::size_t * const lookup_,const std::size_t dfa_alphabet_,const std::size_t * const dfa_,bool bol_,FwdIter & start_token_,FwdIter const & end_,std::size_t & unique_id_)328 std::size_t next (const std::size_t * const lookup_, 329 const std::size_t dfa_alphabet_, const std::size_t * const dfa_, 330 bool bol_, FwdIter &start_token_, FwdIter const &end_, 331 std::size_t &unique_id_) 332 { 333 if (start_token_ == end_) 334 { 335 unique_id_ = npos; 336 return 0; 337 } 338 339 const std::size_t *ptr_ = dfa_ + dfa_alphabet_; 340 FwdIter curr_ = start_token_; 341 bool end_state_ = *ptr_ != 0; 342 std::size_t id_ = *(ptr_ + id_index); 343 std::size_t uid_ = *(ptr_ + unique_id_index); 344 bool end_bol_ = bol_; 345 FwdIter end_token_ = start_token_; 346 347 while (curr_ != end_) 348 { 349 const std::size_t BOL_state_ = ptr_[bol_index]; 350 const std::size_t EOL_state_ = ptr_[eol_index]; 351 352 if (BOL_state_ && bol_) 353 { 354 ptr_ = &dfa_[BOL_state_ * dfa_alphabet_]; 355 } 356 else if (EOL_state_ && *curr_ == '\n') 357 { 358 ptr_ = &dfa_[EOL_state_ * dfa_alphabet_]; 359 } 360 else 361 { 362 typename Traits::char_type prev_char_ = *curr_++; 363 364 bol_ = prev_char_ == '\n'; 365 366 const std::size_t state_ = 367 ptr_[lookup_[static_cast<typename Traits::index_type> 368 (prev_char_)]]; 369 370 if (state_ == 0) 371 { 372 break; 373 } 374 375 ptr_ = &dfa_[state_ * dfa_alphabet_]; 376 } 377 378 if (*ptr_) 379 { 380 end_state_ = true; 381 id_ = *(ptr_ + id_index); 382 uid_ = *(ptr_ + unique_id_index); 383 end_bol_ = bol_; 384 end_token_ = curr_; 385 } 386 } 387 388 const std::size_t EOL_state_ = ptr_[eol_index]; 389 390 if (EOL_state_ && curr_ == end_) 391 { 392 ptr_ = &dfa_[EOL_state_ * dfa_alphabet_]; 393 394 if (*ptr_) 395 { 396 end_state_ = true; 397 id_ = *(ptr_ + id_index); 398 uid_ = *(ptr_ + unique_id_index); 399 end_bol_ = bol_; 400 end_token_ = curr_; 401 } 402 } 403 404 if (end_state_) 405 { 406 // return longest match 407 _data.bol = end_bol_; 408 start_token_ = end_token_; 409 } 410 else 411 { 412 // No match causes char to be skipped 413 _data.bol = *start_token_ == '\n'; 414 ++start_token_; 415 id_ = npos; 416 uid_ = npos; 417 } 418 419 unique_id_ = uid_; 420 return id_; 421 } 422 next(const std::size_t * const lookup_,const std::size_t dfa_alphabet_,const std::size_t * const dfa_,FwdIter & start_token_,FwdIter const & end_,std::size_t & unique_id_)423 std::size_t next (const std::size_t * const lookup_, 424 const std::size_t dfa_alphabet_, const std::size_t * const dfa_, 425 FwdIter &start_token_, FwdIter const &end_, 426 std::size_t &unique_id_) 427 { 428 if (start_token_ == end_) 429 { 430 unique_id_ = npos; 431 return 0; 432 } 433 434 const std::size_t *ptr_ = dfa_ + dfa_alphabet_; 435 FwdIter curr_ = start_token_; 436 bool end_state_ = *ptr_ != 0; 437 std::size_t id_ = *(ptr_ + id_index); 438 std::size_t uid_ = *(ptr_ + unique_id_index); 439 FwdIter end_token_ = start_token_; 440 441 while (curr_ != end_) 442 { 443 const std::size_t state_ = ptr_[lookup_[static_cast 444 <typename Traits::index_type>(*curr_++)]]; 445 446 if (state_ == 0) 447 { 448 break; 449 } 450 451 ptr_ = &dfa_[state_ * dfa_alphabet_]; 452 453 if (*ptr_) 454 { 455 end_state_ = true; 456 id_ = *(ptr_ + id_index); 457 uid_ = *(ptr_ + unique_id_index); 458 end_token_ = curr_; 459 } 460 } 461 462 if (end_state_) 463 { 464 // return longest match 465 start_token_ = end_token_; 466 } 467 else 468 { 469 // No match causes char to be skipped 470 ++start_token_; 471 id_ = npos; 472 uid_ = npos; 473 } 474 475 unique_id_ = uid_; 476 return id_; 477 } 478 }; 479 480 friend class iterator; 481 482 // Make it explict that we are NOT taking a copy of state_machine_! basic_input(const basic_state_machine<typename Traits::char_type> * state_machine_,const FwdIter & begin_,const FwdIter & end_)483 basic_input (const basic_state_machine<typename Traits::char_type> 484 *state_machine_, const FwdIter &begin_, const FwdIter &end_) : 485 _state_machine (state_machine_), 486 _begin (begin_), 487 _end (end_) 488 { 489 } 490 begin() const491 iterator begin () const 492 { 493 iterator iter_; 494 495 iter_._input = this; 496 // Over-ride default of 0 (EOI) 497 iter_._data.id = npos; 498 iter_._data.start = _begin; 499 iter_._data.end = _begin; 500 iter_._data.bol = _state_machine->data ()._seen_BOL_assertion; 501 iter_._data.state = 0; 502 ++iter_; 503 return iter_; 504 } 505 end() const506 iterator end () const 507 { 508 iterator iter_; 509 510 iter_._input = this; 511 iter_._data.start = _end; 512 iter_._data.end = _end; 513 return iter_; 514 } 515 516 private: 517 const basic_state_machine<typename Traits::char_type> *_state_machine; 518 FwdIter _begin; 519 FwdIter _end; 520 }; 521 522 typedef basic_input<std::string::iterator> iter_input; 523 typedef basic_input<std::basic_string<wchar_t>::iterator> iter_winput; 524 typedef basic_input<const char *> ptr_input; 525 typedef basic_input<const wchar_t *> ptr_winput; 526 } 527 } 528 529 #endif 530