1 // file_input.hpp
2 // Copyright (c) 2008-2009 Ben Hanson (http://www.benhanson.net/)
3 //
4 // Distributed under the Boost Software License, Version 1.0. (See accompanying
5 // file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
6 #ifndef BOOST_LEXER_FILE_INPUT
7 #define BOOST_LEXER_FILE_INPUT
8 
9 #include "char_traits.hpp"
10 // memcpy
11 #include <cstring>
12 #include <fstream>
13 #include "size_t.hpp"
14 #include "state_machine.hpp"
15 
16 namespace boost
17 {
18 namespace lexer
19 {
20 template<typename CharT, typename Traits = char_traits<CharT> >
21 class basic_file_input
22 {
23 public:
24     class iterator
25     {
26     public:
27         friend class basic_file_input;
28 
29         struct data
30         {
31             std::size_t id;
32             std::size_t unique_id;
33             const CharT *start;
34             const CharT *end;
35             std::size_t state;
36 
37             // Construct in end() state.
databoost::lexer::basic_file_input::iterator::data38             data () :
39                 id (0),
40                 unique_id (npos),
41                 state (npos)
42             {
43             }
44 
operator ==boost::lexer::basic_file_input::iterator::data45             bool operator == (const data &rhs_) const
46             {
47                 return id == rhs_.id && unique_id == rhs_.unique_id &&
48                     start == rhs_.start && end == rhs_.end &&
49                     state == rhs_.state;
50             }
51         };
52 
iterator()53         iterator () :
54             _input (0)
55         {
56         }
57 
operator ==(const iterator & rhs_) const58         bool operator == (const iterator &rhs_) const
59         {
60             return _data == rhs_._data;
61         }
62 
operator !=(const iterator & rhs_) const63         bool operator != (const iterator &rhs_) const
64         {
65             return !(*this == rhs_);
66         }
67 
operator *()68         data &operator * ()
69         {
70             return _data;
71         }
72 
operator ->()73         data *operator -> ()
74         {
75             return &_data;
76         }
77 
78         // Let compiler generate operator = ().
79 
80         // prefix version
operator ++()81         iterator &operator ++ ()
82         {
83             next_token ();
84             return *this;
85         }
86 
87         // postfix version
operator ++(int)88         iterator operator ++ (int)
89         {
90             iterator iter_ = *this;
91 
92             next_token ();
93             return iter_;
94         }
95 
next_token()96         void next_token ()
97         {
98             const detail::internals &internals_ =
99                 _input->_state_machine->data ();
100 
101             _data.start = _data.end;
102 
103             if (internals_._dfa->size () == 1)
104             {
105                 _data.id = _input->next (&internals_._lookup->front ()->
106                     front (), internals_._dfa_alphabet.front (),
107                     &internals_._dfa->front ()->front (), _data.start,
108                     _data.end, _data.unique_id);
109             }
110             else
111             {
112                 _data.id = _input->next (internals_, _data.state, _data.start,
113                     _data.end, _data.unique_id);
114             }
115 
116             if (_data.id == 0)
117             {
118                 _data.start = 0;
119                 _data.end = 0;
120                 // Ensure current state matches that returned by end().
121                 _data.state = npos;
122             }
123         }
124 
125     private:
126         // Not owner (obviously!)
127         basic_file_input *_input;
128         data _data;
129     };
130 
131     friend class iterator;
132 
133     // Make it explict that we are NOT taking a copy of state_machine_!
basic_file_input(const basic_state_machine<CharT> * state_machine_,std::basic_ifstream<CharT> * is_,const std::streamsize buffer_size_=4096,const std::streamsize buffer_increment_=1024)134     basic_file_input (const basic_state_machine<CharT> *state_machine_,
135         std::basic_ifstream<CharT> *is_,
136         const std::streamsize buffer_size_ = 4096,
137         const std::streamsize buffer_increment_ = 1024) :
138         _state_machine (state_machine_),
139         _stream (is_),
140         _buffer_size (buffer_size_),
141         _buffer_increment (buffer_increment_),
142         _buffer (_buffer_size, '!')
143     {
144         _start_buffer = &_buffer.front ();
145         _end_buffer = _start_buffer + _buffer.size ();
146         _start_token = _end_buffer;
147         _end_token = _end_buffer;
148     }
149 
begin()150     iterator begin ()
151     {
152         iterator iter_;
153 
154         iter_._input = this;
155         // Over-ride default of 0 (EOF)
156         iter_._data.id = npos;
157         iter_._data.start = 0;
158         iter_._data.end = 0;
159         iter_._data.state = 0;
160         ++iter_;
161         return iter_;
162     }
163 
end()164     iterator end ()
165     {
166         iterator iter_;
167 
168         iter_._input = this;
169         iter_._data.start = 0;
170         iter_._data.end = 0;
171         return iter_;
172     }
173 
flush()174     void flush ()
175     {
176         // This temporary is mandatory, otherwise the
177         // pointer calculations won't work!
178         const CharT *temp_ = _end_buffer;
179 
180         _start_token = _end_token = _end_buffer;
181         reload_buffer (temp_, true, _end_token);
182     }
183 
184 private:
185     typedef std::basic_istream<CharT> istream;
186     typedef std::vector<CharT> buffer;
187 
188     const basic_state_machine<CharT> *_state_machine;
189     const std::streamsize _buffer_size;
190     const std::streamsize _buffer_increment;
191 
192     buffer _buffer;
193     CharT *_start_buffer;
194     istream *_stream;
195     const CharT *_start_token;
196     const CharT *_end_token;
197     CharT *_end_buffer;
198 
next(const detail::internals & internals_,std::size_t & start_state_,const CharT * & start_,const CharT * & end_,std::size_t & unique_id_)199     std::size_t next (const detail::internals &internals_,
200         std::size_t &start_state_, const CharT * &start_, const CharT * &end_,
201         std::size_t &unique_id_)
202     {
203         _start_token = _end_token;
204 
205 again:
206         const std::size_t * lookup_ = &internals_._lookup[start_state_]->
207             front ();
208         std::size_t dfa_alphabet_ = internals_._dfa_alphabet[start_state_];
209         const std::size_t *dfa_ = &internals_._dfa[start_state_]->front ();
210         const std::size_t *ptr_ = dfa_ + dfa_alphabet_;
211         const CharT *curr_ = _start_token;
212         bool end_state_ = *ptr_ != 0;
213         std::size_t id_ = *(ptr_ + id_index);
214         std::size_t uid_ = *(ptr_ + unique_id_index);
215         const CharT *end_token_ = curr_;
216 
217         for (;;)
218         {
219             if (curr_ >= _end_buffer)
220             {
221                 if (!reload_buffer (curr_, end_state_, end_token_))
222                 {
223                     // EOF
224                     break;
225                 }
226             }
227 
228             const std::size_t BOL_state_ = ptr_[bol_index];
229             const std::size_t EOL_state_ = ptr_[eol_index];
230 
231             if (BOL_state_ && (_start_token == _start_buffer ||
232                 *(_start_token - 1) == '\n'))
233             {
234                 ptr_ = &dfa_[BOL_state_ * dfa_alphabet_];
235             }
236             else if (EOL_state_ && *curr_ == '\n')
237             {
238                 ptr_ = &dfa_[EOL_state_ * dfa_alphabet_];
239             }
240             else
241             {
242                 const std::size_t state_ =
243                     ptr_[lookup_[static_cast<typename Traits::index_type>
244                         (*curr_++)]];
245 
246                 if (state_ == 0)
247                 {
248                     break;
249                 }
250 
251                 ptr_ = &dfa_[state_ * dfa_alphabet_];
252             }
253 
254             if (*ptr_)
255             {
256                 end_state_ = true;
257                 id_ = *(ptr_ + id_index);
258                 uid_ = *(ptr_ + unique_id_index);
259                 start_state_ = *(ptr_ + state_index);
260                 end_token_ = curr_;
261             }
262         }
263 
264         if (_start_token >= _end_buffer)
265         {
266             // No more tokens...
267             unique_id_ = npos;
268             return 0;
269         }
270 
271         const std::size_t EOL_state_ = ptr_[eol_index];
272 
273         if (EOL_state_ && curr_ == end_)
274         {
275             ptr_ = &dfa_[EOL_state_ * dfa_alphabet_];
276 
277             if (*ptr_)
278             {
279                 end_state_ = true;
280                 id_ = *(ptr_ + id_index);
281                 uid_ = *(ptr_ + unique_id_index);
282                 start_state_ = *(ptr_ + state_index);
283                 end_token_ = curr_;
284             }
285         }
286 
287         if (end_state_)
288         {
289             // return longest match
290             _end_token = end_token_;
291 
292             if (id_ == 0) goto again;
293         }
294         else
295         {
296             // No match causes char to be skipped
297             _end_token = _start_token + 1;
298             id_ = npos;
299             uid_ = npos;
300         }
301 
302         start_ = _start_token;
303         end_ = _end_token;
304         unique_id_ = uid_;
305         return id_;
306     }
307 
next(const std::size_t * const lookup_,const std::size_t dfa_alphabet_,const std::size_t * const dfa_,const CharT * & start_,const CharT * & end_,std::size_t & unique_id_)308     std::size_t next (const std::size_t * const lookup_,
309         const std::size_t dfa_alphabet_, const std::size_t * const dfa_,
310         const CharT * &start_, const CharT * &end_, std::size_t &unique_id_)
311     {
312         _start_token = _end_token;
313 
314         const std::size_t *ptr_ = dfa_ + dfa_alphabet_;
315         const CharT *curr_ = _start_token;
316         bool end_state_ = *ptr_ != 0;
317         std::size_t id_ = *(ptr_ + id_index);
318         std::size_t uid_ = *(ptr_ + unique_id_index);
319         const CharT *end_token_ = curr_;
320 
321         for (;;)
322         {
323             if (curr_ >= _end_buffer)
324             {
325                 if (!reload_buffer (curr_, end_state_, end_token_))
326                 {
327                     // EOF
328                     break;
329                 }
330             }
331 
332             const std::size_t BOL_state_ = ptr_[bol_index];
333             const std::size_t EOL_state_ = ptr_[eol_index];
334 
335             if (BOL_state_ && (_start_token == _start_buffer ||
336                 *(_start_token - 1) == '\n'))
337             {
338                 ptr_ = &dfa_[BOL_state_ * dfa_alphabet_];
339             }
340             else if (EOL_state_ && *curr_ == '\n')
341             {
342                 ptr_ = &dfa_[EOL_state_ * dfa_alphabet_];
343             }
344             else
345             {
346                 const std::size_t state_ =
347                     ptr_[lookup_[static_cast<typename Traits::index_type>
348                         (*curr_++)]];
349 
350                 if (state_ == 0)
351                 {
352                     break;
353                 }
354 
355                 ptr_ = &dfa_[state_ * dfa_alphabet_];
356             }
357 
358             if (*ptr_)
359             {
360                 end_state_ = true;
361                 id_ = *(ptr_ + id_index);
362                 uid_ = *(ptr_ + unique_id_index);
363                 end_token_ = curr_;
364             }
365         }
366 
367         if (_start_token >= _end_buffer)
368         {
369             // No more tokens...
370             unique_id_ = npos;
371             return 0;
372         }
373 
374         const std::size_t EOL_state_ = ptr_[eol_index];
375 
376         if (EOL_state_ && curr_ == end_)
377         {
378             ptr_ = &dfa_[EOL_state_ * dfa_alphabet_];
379 
380             if (*ptr_)
381             {
382                 end_state_ = true;
383                 id_ = *(ptr_ + id_index);
384                 uid_ = *(ptr_ + unique_id_index);
385                 end_token_ = curr_;
386             }
387         }
388 
389         if (end_state_)
390         {
391             // return longest match
392             _end_token = end_token_;
393         }
394         else
395         {
396             // No match causes char to be skipped
397             _end_token = _start_token + 1;
398             id_ = npos;
399             uid_ = npos;
400         }
401 
402         start_ = _start_token;
403         end_ = _end_token;
404         unique_id_ = uid_;
405         return id_;
406     }
407 
reload_buffer(const CharT * & curr_,const bool end_state_,const CharT * & end_token_)408     bool reload_buffer (const CharT * &curr_, const bool end_state_,
409         const CharT * &end_token_)
410     {
411         bool success_ = !_stream->eof ();
412 
413         if (success_)
414         {
415             const CharT *old_start_token_ = _start_token;
416             std::size_t old_size_ = _buffer.size ();
417             std::size_t count_ = 0;
418 
419             if (_start_token - 1 == _start_buffer)
420             {
421                 // Run out of buffer space, so increase.
422                 _buffer.resize (old_size_ + _buffer_increment, '!');
423                 _start_buffer = &_buffer.front ();
424                 _start_token = _start_buffer + 1;
425                 _stream->read (_start_buffer + old_size_,
426                     _buffer_increment);
427                 count_ = _stream->gcount ();
428                 _end_buffer = _start_buffer + old_size_ + count_;
429             }
430             else if (_start_token < _end_buffer)
431             {
432                 const std::size_t len_ = _end_buffer - _start_token;
433                 // Some systems have memcpy in namespace std.
434                 using namespace std;
435 
436                 memcpy (_start_buffer, _start_token - 1, (len_ + 1) *
437                     sizeof (CharT));
438                 _stream->read (_start_buffer + len_ + 1,
439                     static_cast<std::streamsize> (_buffer.size () - len_ - 1));
440                 count_ = _stream->gcount ();
441                 _start_token = _start_buffer + 1;
442                 _end_buffer = _start_buffer + len_ + 1 + count_;
443             }
444             else
445             {
446                 _stream->read (_start_buffer, static_cast<std::streamsize>
447                     (_buffer.size ()));
448                 count_ = _stream->gcount ();
449                 _start_token = _start_buffer;
450                 _end_buffer = _start_buffer + count_;
451             }
452 
453             if (end_state_)
454             {
455                 end_token_ = _start_token +
456                     (end_token_ - old_start_token_);
457             }
458 
459             curr_ = _start_token + (curr_ - old_start_token_);
460         }
461 
462         return success_;
463     }
464 
465     // Disallow copying of buffer
466     basic_file_input (const basic_file_input &);
467     const basic_file_input &operator = (const basic_file_input &);
468 };
469 
470 typedef basic_file_input<char> file_input;
471 typedef basic_file_input<wchar_t> wfile_input;
472 }
473 }
474 
475 #endif
476