1 // file_input.hpp 2 // Copyright (c) 2008-2009 Ben Hanson (http://www.benhanson.net/) 3 // 4 // Distributed under the Boost Software License, Version 1.0. (See accompanying 5 // file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) 6 #ifndef BOOST_LEXER_FILE_INPUT 7 #define BOOST_LEXER_FILE_INPUT 8 9 #include "char_traits.hpp" 10 // memcpy 11 #include <cstring> 12 #include <fstream> 13 #include "size_t.hpp" 14 #include "state_machine.hpp" 15 16 namespace boost 17 { 18 namespace lexer 19 { 20 template<typename CharT, typename Traits = char_traits<CharT> > 21 class basic_file_input 22 { 23 public: 24 class iterator 25 { 26 public: 27 friend class basic_file_input; 28 29 struct data 30 { 31 std::size_t id; 32 std::size_t unique_id; 33 const CharT *start; 34 const CharT *end; 35 std::size_t state; 36 37 // Construct in end() state. databoost::lexer::basic_file_input::iterator::data38 data () : 39 id (0), 40 unique_id (npos), 41 state (npos) 42 { 43 } 44 operator ==boost::lexer::basic_file_input::iterator::data45 bool operator == (const data &rhs_) const 46 { 47 return id == rhs_.id && unique_id == rhs_.unique_id && 48 start == rhs_.start && end == rhs_.end && 49 state == rhs_.state; 50 } 51 }; 52 iterator()53 iterator () : 54 _input (0) 55 { 56 } 57 operator ==(const iterator & rhs_) const58 bool operator == (const iterator &rhs_) const 59 { 60 return _data == rhs_._data; 61 } 62 operator !=(const iterator & rhs_) const63 bool operator != (const iterator &rhs_) const 64 { 65 return !(*this == rhs_); 66 } 67 operator *()68 data &operator * () 69 { 70 return _data; 71 } 72 operator ->()73 data *operator -> () 74 { 75 return &_data; 76 } 77 78 // Let compiler generate operator = (). 79 80 // prefix version operator ++()81 iterator &operator ++ () 82 { 83 next_token (); 84 return *this; 85 } 86 87 // postfix version operator ++(int)88 iterator operator ++ (int) 89 { 90 iterator iter_ = *this; 91 92 next_token (); 93 return iter_; 94 } 95 next_token()96 void next_token () 97 { 98 const detail::internals &internals_ = 99 _input->_state_machine->data (); 100 101 _data.start = _data.end; 102 103 if (internals_._dfa->size () == 1) 104 { 105 _data.id = _input->next (&internals_._lookup->front ()-> 106 front (), internals_._dfa_alphabet.front (), 107 &internals_._dfa->front ()->front (), _data.start, 108 _data.end, _data.unique_id); 109 } 110 else 111 { 112 _data.id = _input->next (internals_, _data.state, _data.start, 113 _data.end, _data.unique_id); 114 } 115 116 if (_data.id == 0) 117 { 118 _data.start = 0; 119 _data.end = 0; 120 // Ensure current state matches that returned by end(). 121 _data.state = npos; 122 } 123 } 124 125 private: 126 // Not owner (obviously!) 127 basic_file_input *_input; 128 data _data; 129 }; 130 131 friend class iterator; 132 133 // Make it explict that we are NOT taking a copy of state_machine_! basic_file_input(const basic_state_machine<CharT> * state_machine_,std::basic_ifstream<CharT> * is_,const std::streamsize buffer_size_=4096,const std::streamsize buffer_increment_=1024)134 basic_file_input (const basic_state_machine<CharT> *state_machine_, 135 std::basic_ifstream<CharT> *is_, 136 const std::streamsize buffer_size_ = 4096, 137 const std::streamsize buffer_increment_ = 1024) : 138 _state_machine (state_machine_), 139 _stream (is_), 140 _buffer_size (buffer_size_), 141 _buffer_increment (buffer_increment_), 142 _buffer (_buffer_size, '!') 143 { 144 _start_buffer = &_buffer.front (); 145 _end_buffer = _start_buffer + _buffer.size (); 146 _start_token = _end_buffer; 147 _end_token = _end_buffer; 148 } 149 begin()150 iterator begin () 151 { 152 iterator iter_; 153 154 iter_._input = this; 155 // Over-ride default of 0 (EOF) 156 iter_._data.id = npos; 157 iter_._data.start = 0; 158 iter_._data.end = 0; 159 iter_._data.state = 0; 160 ++iter_; 161 return iter_; 162 } 163 end()164 iterator end () 165 { 166 iterator iter_; 167 168 iter_._input = this; 169 iter_._data.start = 0; 170 iter_._data.end = 0; 171 return iter_; 172 } 173 flush()174 void flush () 175 { 176 // This temporary is mandatory, otherwise the 177 // pointer calculations won't work! 178 const CharT *temp_ = _end_buffer; 179 180 _start_token = _end_token = _end_buffer; 181 reload_buffer (temp_, true, _end_token); 182 } 183 184 private: 185 typedef std::basic_istream<CharT> istream; 186 typedef std::vector<CharT> buffer; 187 188 const basic_state_machine<CharT> *_state_machine; 189 const std::streamsize _buffer_size; 190 const std::streamsize _buffer_increment; 191 192 buffer _buffer; 193 CharT *_start_buffer; 194 istream *_stream; 195 const CharT *_start_token; 196 const CharT *_end_token; 197 CharT *_end_buffer; 198 next(const detail::internals & internals_,std::size_t & start_state_,const CharT * & start_,const CharT * & end_,std::size_t & unique_id_)199 std::size_t next (const detail::internals &internals_, 200 std::size_t &start_state_, const CharT * &start_, const CharT * &end_, 201 std::size_t &unique_id_) 202 { 203 _start_token = _end_token; 204 205 again: 206 const std::size_t * lookup_ = &internals_._lookup[start_state_]-> 207 front (); 208 std::size_t dfa_alphabet_ = internals_._dfa_alphabet[start_state_]; 209 const std::size_t *dfa_ = &internals_._dfa[start_state_]->front (); 210 const std::size_t *ptr_ = dfa_ + dfa_alphabet_; 211 const CharT *curr_ = _start_token; 212 bool end_state_ = *ptr_ != 0; 213 std::size_t id_ = *(ptr_ + id_index); 214 std::size_t uid_ = *(ptr_ + unique_id_index); 215 const CharT *end_token_ = curr_; 216 217 for (;;) 218 { 219 if (curr_ >= _end_buffer) 220 { 221 if (!reload_buffer (curr_, end_state_, end_token_)) 222 { 223 // EOF 224 break; 225 } 226 } 227 228 const std::size_t BOL_state_ = ptr_[bol_index]; 229 const std::size_t EOL_state_ = ptr_[eol_index]; 230 231 if (BOL_state_ && (_start_token == _start_buffer || 232 *(_start_token - 1) == '\n')) 233 { 234 ptr_ = &dfa_[BOL_state_ * dfa_alphabet_]; 235 } 236 else if (EOL_state_ && *curr_ == '\n') 237 { 238 ptr_ = &dfa_[EOL_state_ * dfa_alphabet_]; 239 } 240 else 241 { 242 const std::size_t state_ = 243 ptr_[lookup_[static_cast<typename Traits::index_type> 244 (*curr_++)]]; 245 246 if (state_ == 0) 247 { 248 break; 249 } 250 251 ptr_ = &dfa_[state_ * dfa_alphabet_]; 252 } 253 254 if (*ptr_) 255 { 256 end_state_ = true; 257 id_ = *(ptr_ + id_index); 258 uid_ = *(ptr_ + unique_id_index); 259 start_state_ = *(ptr_ + state_index); 260 end_token_ = curr_; 261 } 262 } 263 264 if (_start_token >= _end_buffer) 265 { 266 // No more tokens... 267 unique_id_ = npos; 268 return 0; 269 } 270 271 const std::size_t EOL_state_ = ptr_[eol_index]; 272 273 if (EOL_state_ && curr_ == end_) 274 { 275 ptr_ = &dfa_[EOL_state_ * dfa_alphabet_]; 276 277 if (*ptr_) 278 { 279 end_state_ = true; 280 id_ = *(ptr_ + id_index); 281 uid_ = *(ptr_ + unique_id_index); 282 start_state_ = *(ptr_ + state_index); 283 end_token_ = curr_; 284 } 285 } 286 287 if (end_state_) 288 { 289 // return longest match 290 _end_token = end_token_; 291 292 if (id_ == 0) goto again; 293 } 294 else 295 { 296 // No match causes char to be skipped 297 _end_token = _start_token + 1; 298 id_ = npos; 299 uid_ = npos; 300 } 301 302 start_ = _start_token; 303 end_ = _end_token; 304 unique_id_ = uid_; 305 return id_; 306 } 307 next(const std::size_t * const lookup_,const std::size_t dfa_alphabet_,const std::size_t * const dfa_,const CharT * & start_,const CharT * & end_,std::size_t & unique_id_)308 std::size_t next (const std::size_t * const lookup_, 309 const std::size_t dfa_alphabet_, const std::size_t * const dfa_, 310 const CharT * &start_, const CharT * &end_, std::size_t &unique_id_) 311 { 312 _start_token = _end_token; 313 314 const std::size_t *ptr_ = dfa_ + dfa_alphabet_; 315 const CharT *curr_ = _start_token; 316 bool end_state_ = *ptr_ != 0; 317 std::size_t id_ = *(ptr_ + id_index); 318 std::size_t uid_ = *(ptr_ + unique_id_index); 319 const CharT *end_token_ = curr_; 320 321 for (;;) 322 { 323 if (curr_ >= _end_buffer) 324 { 325 if (!reload_buffer (curr_, end_state_, end_token_)) 326 { 327 // EOF 328 break; 329 } 330 } 331 332 const std::size_t BOL_state_ = ptr_[bol_index]; 333 const std::size_t EOL_state_ = ptr_[eol_index]; 334 335 if (BOL_state_ && (_start_token == _start_buffer || 336 *(_start_token - 1) == '\n')) 337 { 338 ptr_ = &dfa_[BOL_state_ * dfa_alphabet_]; 339 } 340 else if (EOL_state_ && *curr_ == '\n') 341 { 342 ptr_ = &dfa_[EOL_state_ * dfa_alphabet_]; 343 } 344 else 345 { 346 const std::size_t state_ = 347 ptr_[lookup_[static_cast<typename Traits::index_type> 348 (*curr_++)]]; 349 350 if (state_ == 0) 351 { 352 break; 353 } 354 355 ptr_ = &dfa_[state_ * dfa_alphabet_]; 356 } 357 358 if (*ptr_) 359 { 360 end_state_ = true; 361 id_ = *(ptr_ + id_index); 362 uid_ = *(ptr_ + unique_id_index); 363 end_token_ = curr_; 364 } 365 } 366 367 if (_start_token >= _end_buffer) 368 { 369 // No more tokens... 370 unique_id_ = npos; 371 return 0; 372 } 373 374 const std::size_t EOL_state_ = ptr_[eol_index]; 375 376 if (EOL_state_ && curr_ == end_) 377 { 378 ptr_ = &dfa_[EOL_state_ * dfa_alphabet_]; 379 380 if (*ptr_) 381 { 382 end_state_ = true; 383 id_ = *(ptr_ + id_index); 384 uid_ = *(ptr_ + unique_id_index); 385 end_token_ = curr_; 386 } 387 } 388 389 if (end_state_) 390 { 391 // return longest match 392 _end_token = end_token_; 393 } 394 else 395 { 396 // No match causes char to be skipped 397 _end_token = _start_token + 1; 398 id_ = npos; 399 uid_ = npos; 400 } 401 402 start_ = _start_token; 403 end_ = _end_token; 404 unique_id_ = uid_; 405 return id_; 406 } 407 reload_buffer(const CharT * & curr_,const bool end_state_,const CharT * & end_token_)408 bool reload_buffer (const CharT * &curr_, const bool end_state_, 409 const CharT * &end_token_) 410 { 411 bool success_ = !_stream->eof (); 412 413 if (success_) 414 { 415 const CharT *old_start_token_ = _start_token; 416 std::size_t old_size_ = _buffer.size (); 417 std::size_t count_ = 0; 418 419 if (_start_token - 1 == _start_buffer) 420 { 421 // Run out of buffer space, so increase. 422 _buffer.resize (old_size_ + _buffer_increment, '!'); 423 _start_buffer = &_buffer.front (); 424 _start_token = _start_buffer + 1; 425 _stream->read (_start_buffer + old_size_, 426 _buffer_increment); 427 count_ = _stream->gcount (); 428 _end_buffer = _start_buffer + old_size_ + count_; 429 } 430 else if (_start_token < _end_buffer) 431 { 432 const std::size_t len_ = _end_buffer - _start_token; 433 // Some systems have memcpy in namespace std. 434 using namespace std; 435 436 memcpy (_start_buffer, _start_token - 1, (len_ + 1) * 437 sizeof (CharT)); 438 _stream->read (_start_buffer + len_ + 1, 439 static_cast<std::streamsize> (_buffer.size () - len_ - 1)); 440 count_ = _stream->gcount (); 441 _start_token = _start_buffer + 1; 442 _end_buffer = _start_buffer + len_ + 1 + count_; 443 } 444 else 445 { 446 _stream->read (_start_buffer, static_cast<std::streamsize> 447 (_buffer.size ())); 448 count_ = _stream->gcount (); 449 _start_token = _start_buffer; 450 _end_buffer = _start_buffer + count_; 451 } 452 453 if (end_state_) 454 { 455 end_token_ = _start_token + 456 (end_token_ - old_start_token_); 457 } 458 459 curr_ = _start_token + (curr_ - old_start_token_); 460 } 461 462 return success_; 463 } 464 465 // Disallow copying of buffer 466 basic_file_input (const basic_file_input &); 467 const basic_file_input &operator = (const basic_file_input &); 468 }; 469 470 typedef basic_file_input<char> file_input; 471 typedef basic_file_input<wchar_t> wfile_input; 472 } 473 } 474 475 #endif 476