1 // input.hpp
2 // Copyright (c) 2008-2009 Ben Hanson (http://www.benhanson.net/)
3 //
4 // Distributed under the Boost Software License, Version 1.0. (See accompanying
5 // file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
6 #ifndef BOOST_LEXER_INPUT
7 #define BOOST_LEXER_INPUT
8 
9 #include "char_traits.hpp"
10 #include <boost/detail/iterator.hpp>
11 #include "size_t.hpp"
12 #include "state_machine.hpp"
13 
14 namespace boost
15 {
16 namespace lexer
17 {
18 template<typename FwdIter, typename Traits =
19     char_traits<typename boost::detail::iterator_traits<FwdIter>::value_type> >
20 class basic_input
21 {
22 public:
23     class iterator
24     {
25     public:
26         friend class basic_input;
27 
28         struct data
29         {
30             std::size_t id;
31             std::size_t unique_id;
32             FwdIter start;
33             FwdIter end;
34             bool bol;
35             std::size_t state;
36 
37             // Construct in end() state.
databoost::lexer::basic_input::iterator::data38             data () :
39                 id (0),
40                 unique_id (npos),
41                 bol (false),
42                 state (npos)
43             {
44             }
45 
operator ==boost::lexer::basic_input::iterator::data46             bool operator == (const data &rhs_) const
47             {
48                 return id == rhs_.id && unique_id == rhs_.unique_id &&
49                     start == rhs_.start && end == rhs_.end &&
50                     bol == rhs_.bol && state == rhs_.state;
51             }
52         };
53 
iterator()54         iterator () :
55             _input (0)
56         {
57         }
58 
operator ==(const iterator & rhs_) const59         bool operator == (const iterator &rhs_) const
60         {
61             return _data == rhs_._data;
62         }
63 
operator !=(const iterator & rhs_) const64         bool operator != (const iterator &rhs_) const
65         {
66             return !(*this == rhs_);
67         }
68 
operator *()69         data &operator * ()
70         {
71             return _data;
72         }
73 
operator ->()74         data *operator -> ()
75         {
76             return &_data;
77         }
78 
79         // Let compiler generate operator = ().
80 
81         // prefix version
operator ++()82         iterator &operator ++ ()
83         {
84             next_token ();
85             return *this;
86         }
87 
88         // postfix version
operator ++(int)89         iterator operator ++ (int)
90         {
91             iterator iter_ = *this;
92 
93             next_token ();
94             return iter_;
95         }
96 
97     private:
98         // Not owner (obviously!)
99         const basic_input *_input;
100         data _data;
101 
next_token()102         void next_token ()
103         {
104             const detail::internals &internals_ =
105                 _input->_state_machine->data ();
106 
107             _data.start = _data.end;
108 
109             if (internals_._dfa->size () == 1)
110             {
111                 if (internals_._seen_BOL_assertion ||
112                     internals_._seen_EOL_assertion)
113                 {
114                     _data.id = next
115                         (&internals_._lookup->front ()->front (),
116                         internals_._dfa_alphabet.front (),
117                         &internals_._dfa->front ()->front (),
118                         _data.bol, _data.end, _input->_end, _data.unique_id);
119                 }
120                 else
121                 {
122                     _data.id = next (&internals_._lookup->front ()->front (),
123                         internals_._dfa_alphabet.front (), &internals_.
124                         _dfa->front ()->front (), _data.end, _input->_end,
125                         _data.unique_id);
126                 }
127             }
128             else
129             {
130                 if (internals_._seen_BOL_assertion ||
131                     internals_._seen_EOL_assertion)
132                 {
133                     _data.id = next (internals_, _data.state,
134                         _data.bol, _data.end, _input->_end, _data.unique_id);
135                 }
136                 else
137                 {
138                     _data.id = next (internals_, _data.state,
139                         _data.end, _input->_end, _data.unique_id);
140                 }
141             }
142 
143             if (_data.end == _input->_end && _data.start == _data.end)
144             {
145                 // Ensure current state matches that returned by end().
146                 _data.state = npos;
147             }
148         }
149 
next(const detail::internals & internals_,std::size_t & start_state_,bool bol_,FwdIter & start_token_,const FwdIter & end_,std::size_t & unique_id_)150         std::size_t next (const detail::internals &internals_,
151             std::size_t &start_state_, bool bol_,
152             FwdIter &start_token_, const FwdIter &end_,
153             std::size_t &unique_id_)
154         {
155             if (start_token_ == end_)
156             {
157                 unique_id_ = npos;
158                 return 0;
159             }
160 
161         again:
162             const std::size_t * lookup_ = &internals_._lookup[start_state_]->
163                 front ();
164             std::size_t dfa_alphabet_ = internals_._dfa_alphabet[start_state_];
165             const std::size_t *dfa_ = &internals_._dfa[start_state_]->front ();
166             const std::size_t *ptr_ = dfa_ + dfa_alphabet_;
167             FwdIter curr_ = start_token_;
168             bool end_state_ = *ptr_ != 0;
169             std::size_t id_ = *(ptr_ + id_index);
170             std::size_t uid_ = *(ptr_ + unique_id_index);
171             std::size_t end_start_state_ = start_state_;
172             bool end_bol_ = bol_;
173             FwdIter end_token_ = start_token_;
174 
175             while (curr_ != end_)
176             {
177                 const std::size_t BOL_state_ = ptr_[bol_index];
178                 const std::size_t EOL_state_ = ptr_[eol_index];
179 
180                 if (BOL_state_ && bol_)
181                 {
182                     ptr_ = &dfa_[BOL_state_ * dfa_alphabet_];
183                 }
184                 else if (EOL_state_ && *curr_ == '\n')
185                 {
186                     ptr_ = &dfa_[EOL_state_ * dfa_alphabet_];
187                 }
188                 else
189                 {
190                     typename Traits::char_type prev_char_ = *curr_++;
191 
192                     bol_ = prev_char_ == '\n';
193 
194                     const std::size_t state_ =
195                         ptr_[lookup_[static_cast<typename Traits::index_type>
196                         (prev_char_)]];
197 
198                     if (state_ == 0)
199                     {
200                         break;
201                     }
202 
203                     ptr_ = &dfa_[state_ * dfa_alphabet_];
204                 }
205 
206                 if (*ptr_)
207                 {
208                     end_state_ = true;
209                     id_ = *(ptr_ + id_index);
210                     uid_ = *(ptr_ + unique_id_index);
211                     end_start_state_ = *(ptr_ + state_index);
212                     end_bol_ = bol_;
213                     end_token_ = curr_;
214                 }
215             }
216 
217             const std::size_t EOL_state_ = ptr_[eol_index];
218 
219             if (EOL_state_ && curr_ == end_)
220             {
221                 ptr_ = &dfa_[EOL_state_ * dfa_alphabet_];
222 
223                 if (*ptr_)
224                 {
225                     end_state_ = true;
226                     id_ = *(ptr_ + id_index);
227                     uid_ = *(ptr_ + unique_id_index);
228                     end_start_state_ = *(ptr_ + state_index);
229                     end_bol_ = bol_;
230                     end_token_ = curr_;
231                 }
232             }
233 
234             if (end_state_)
235             {
236                 // return longest match
237                 start_state_ = end_start_state_;
238                 start_token_ = end_token_;
239 
240                 if (id_ == 0)
241                 {
242                     bol_ = end_bol_;
243                     goto again;
244                 }
245                 else
246                 {
247                     _data.bol = end_bol_;
248                 }
249             }
250             else
251             {
252                 // No match causes char to be skipped
253                 _data.bol = *start_token_ == '\n';
254                 ++start_token_;
255                 id_ = npos;
256                 uid_ = npos;
257             }
258 
259             unique_id_ = uid_;
260             return id_;
261         }
262 
next(const detail::internals & internals_,std::size_t & start_state_,FwdIter & start_token_,FwdIter const & end_,std::size_t & unique_id_)263         std::size_t next (const detail::internals &internals_,
264             std::size_t &start_state_, FwdIter &start_token_,
265             FwdIter const &end_, std::size_t &unique_id_)
266         {
267             if (start_token_ == end_)
268             {
269                 unique_id_ = npos;
270                 return 0;
271             }
272 
273         again:
274             const std::size_t * lookup_ = &internals_._lookup[start_state_]->
275                 front ();
276             std::size_t dfa_alphabet_ = internals_._dfa_alphabet[start_state_];
277             const std::size_t *dfa_ = &internals_._dfa[start_state_]->front ();
278             const std::size_t *ptr_ = dfa_ + dfa_alphabet_;
279             FwdIter curr_ = start_token_;
280             bool end_state_ = *ptr_ != 0;
281             std::size_t id_ = *(ptr_ + id_index);
282             std::size_t uid_ = *(ptr_ + unique_id_index);
283             std::size_t end_start_state_ = start_state_;
284             FwdIter end_token_ = start_token_;
285 
286             while (curr_ != end_)
287             {
288                 const std::size_t state_ = ptr_[lookup_[static_cast
289                     <typename Traits::index_type>(*curr_++)]];
290 
291                 if (state_ == 0)
292                 {
293                     break;
294                 }
295 
296                 ptr_ = &dfa_[state_ * dfa_alphabet_];
297 
298                 if (*ptr_)
299                 {
300                     end_state_ = true;
301                     id_ = *(ptr_ + id_index);
302                     uid_ = *(ptr_ + unique_id_index);
303                     end_start_state_ = *(ptr_ + state_index);
304                     end_token_ = curr_;
305                 }
306             }
307 
308             if (end_state_)
309             {
310                 // return longest match
311                 start_state_ = end_start_state_;
312                 start_token_ = end_token_;
313 
314                 if (id_ == 0) goto again;
315             }
316             else
317             {
318                 // No match causes char to be skipped
319                 ++start_token_;
320                 id_ = npos;
321                 uid_ = npos;
322             }
323 
324             unique_id_ = uid_;
325             return id_;
326         }
327 
next(const std::size_t * const lookup_,const std::size_t dfa_alphabet_,const std::size_t * const dfa_,bool bol_,FwdIter & start_token_,FwdIter const & end_,std::size_t & unique_id_)328         std::size_t next (const std::size_t * const lookup_,
329             const std::size_t dfa_alphabet_, const std::size_t * const dfa_,
330             bool bol_, FwdIter &start_token_, FwdIter const &end_,
331             std::size_t &unique_id_)
332         {
333             if (start_token_ == end_)
334             {
335                 unique_id_ = npos;
336                 return 0;
337             }
338 
339             const std::size_t *ptr_ = dfa_ + dfa_alphabet_;
340             FwdIter curr_ = start_token_;
341             bool end_state_ = *ptr_ != 0;
342             std::size_t id_ = *(ptr_ + id_index);
343             std::size_t uid_ = *(ptr_ + unique_id_index);
344             bool end_bol_ = bol_;
345             FwdIter end_token_ = start_token_;
346 
347             while (curr_ != end_)
348             {
349                 const std::size_t BOL_state_ = ptr_[bol_index];
350                 const std::size_t EOL_state_ = ptr_[eol_index];
351 
352                 if (BOL_state_ && bol_)
353                 {
354                     ptr_ = &dfa_[BOL_state_ * dfa_alphabet_];
355                 }
356                 else if (EOL_state_ && *curr_ == '\n')
357                 {
358                     ptr_ = &dfa_[EOL_state_ * dfa_alphabet_];
359                 }
360                 else
361                 {
362                     typename Traits::char_type prev_char_ = *curr_++;
363 
364                     bol_ = prev_char_ == '\n';
365 
366                     const std::size_t state_ =
367                         ptr_[lookup_[static_cast<typename Traits::index_type>
368                         (prev_char_)]];
369 
370                     if (state_ == 0)
371                     {
372                         break;
373                     }
374 
375                     ptr_ = &dfa_[state_ * dfa_alphabet_];
376                 }
377 
378                 if (*ptr_)
379                 {
380                     end_state_ = true;
381                     id_ = *(ptr_ + id_index);
382                     uid_ = *(ptr_ + unique_id_index);
383                     end_bol_ = bol_;
384                     end_token_ = curr_;
385                 }
386             }
387 
388             const std::size_t EOL_state_ = ptr_[eol_index];
389 
390             if (EOL_state_ && curr_ == end_)
391             {
392                 ptr_ = &dfa_[EOL_state_ * dfa_alphabet_];
393 
394                 if (*ptr_)
395                 {
396                     end_state_ = true;
397                     id_ = *(ptr_ + id_index);
398                     uid_ = *(ptr_ + unique_id_index);
399                     end_bol_ = bol_;
400                     end_token_ = curr_;
401                 }
402             }
403 
404             if (end_state_)
405             {
406                 // return longest match
407                 _data.bol = end_bol_;
408                 start_token_ = end_token_;
409             }
410             else
411             {
412                 // No match causes char to be skipped
413                 _data.bol = *start_token_ == '\n';
414                 ++start_token_;
415                 id_ = npos;
416                 uid_ = npos;
417             }
418 
419             unique_id_ = uid_;
420             return id_;
421         }
422 
next(const std::size_t * const lookup_,const std::size_t dfa_alphabet_,const std::size_t * const dfa_,FwdIter & start_token_,FwdIter const & end_,std::size_t & unique_id_)423         std::size_t next (const std::size_t * const lookup_,
424             const std::size_t dfa_alphabet_, const std::size_t * const dfa_,
425             FwdIter &start_token_, FwdIter const &end_,
426             std::size_t &unique_id_)
427         {
428             if (start_token_ == end_)
429             {
430                 unique_id_ = npos;
431                 return 0;
432             }
433 
434             const std::size_t *ptr_ = dfa_ + dfa_alphabet_;
435             FwdIter curr_ = start_token_;
436             bool end_state_ = *ptr_ != 0;
437             std::size_t id_ = *(ptr_ + id_index);
438             std::size_t uid_ = *(ptr_ + unique_id_index);
439             FwdIter end_token_ = start_token_;
440 
441             while (curr_ != end_)
442             {
443                 const std::size_t state_ = ptr_[lookup_[static_cast
444                     <typename Traits::index_type>(*curr_++)]];
445 
446                 if (state_ == 0)
447                 {
448                     break;
449                 }
450 
451                 ptr_ = &dfa_[state_ * dfa_alphabet_];
452 
453                 if (*ptr_)
454                 {
455                     end_state_ = true;
456                     id_ = *(ptr_ + id_index);
457                     uid_ = *(ptr_ + unique_id_index);
458                     end_token_ = curr_;
459                 }
460             }
461 
462             if (end_state_)
463             {
464                 // return longest match
465                 start_token_ = end_token_;
466             }
467             else
468             {
469                 // No match causes char to be skipped
470                 ++start_token_;
471                 id_ = npos;
472                 uid_ = npos;
473             }
474 
475             unique_id_ = uid_;
476             return id_;
477         }
478     };
479 
480     friend class iterator;
481 
482     // Make it explict that we are NOT taking a copy of state_machine_!
basic_input(const basic_state_machine<typename Traits::char_type> * state_machine_,const FwdIter & begin_,const FwdIter & end_)483     basic_input (const basic_state_machine<typename Traits::char_type>
484         *state_machine_, const FwdIter &begin_, const FwdIter &end_) :
485         _state_machine (state_machine_),
486         _begin (begin_),
487         _end (end_)
488     {
489     }
490 
begin() const491     iterator begin () const
492     {
493         iterator iter_;
494 
495         iter_._input = this;
496         // Over-ride default of 0 (EOI)
497         iter_._data.id = npos;
498         iter_._data.start = _begin;
499         iter_._data.end = _begin;
500         iter_._data.bol = _state_machine->data ()._seen_BOL_assertion;
501         iter_._data.state = 0;
502         ++iter_;
503         return iter_;
504     }
505 
end() const506     iterator end () const
507     {
508         iterator iter_;
509 
510         iter_._input = this;
511         iter_._data.start = _end;
512         iter_._data.end = _end;
513         return iter_;
514     }
515 
516 private:
517     const basic_state_machine<typename Traits::char_type> *_state_machine;
518     FwdIter _begin;
519     FwdIter _end;
520 };
521 
522 typedef basic_input<std::string::iterator> iter_input;
523 typedef basic_input<std::basic_string<wchar_t>::iterator> iter_winput;
524 typedef basic_input<const char *> ptr_input;
525 typedef basic_input<const wchar_t *> ptr_winput;
526 }
527 }
528 
529 #endif
530