1  ///////////////////////////////////////////////////////////////////////////////
2 /// \file regex_token_iterator.hpp
/// Contains the definition of regex_token_iterator, an STL-compatible iterator
/// for tokenizing a string using a regular expression.
5 //
6 //  Copyright 2008 Eric Niebler. Distributed under the Boost
7 //  Software License, Version 1.0. (See accompanying file
8 //  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
9 
10 #ifndef BOOST_XPRESSIVE_REGEX_TOKEN_ITERATOR_HPP_EAN_10_04_2005
11 #define BOOST_XPRESSIVE_REGEX_TOKEN_ITERATOR_HPP_EAN_10_04_2005
12 
13 // MS compatible compilers support #pragma once
14 #if defined(_MSC_VER)
15 # pragma once
16 #endif
17 
18 #include <vector>
19 #include <boost/assert.hpp>
20 #include <boost/mpl/assert.hpp>
21 #include <boost/type_traits/is_same.hpp>
22 #include <boost/type_traits/is_convertible.hpp>
23 #include <boost/xpressive/regex_iterator.hpp>
24 
25 namespace boost { namespace xpressive { namespace detail
26 {
27 
28 //////////////////////////////////////////////////////////////////////////
29 // regex_token_iterator_impl
30 //
31 template<typename BidiIter>
32 struct regex_token_iterator_impl
33   : counted_base<regex_token_iterator_impl<BidiIter> >
34 {
35     typedef sub_match<BidiIter> value_type;
36 
regex_token_iterator_implboost::xpressive::detail::regex_token_iterator_impl37     regex_token_iterator_impl
38     (
39         BidiIter begin
40       , BidiIter cur
41       , BidiIter end
42       , BidiIter next_search
43       , basic_regex<BidiIter> const &rex
44       , regex_constants::match_flag_type flags = regex_constants::match_default
45       , std::vector<int> subs = std::vector<int>(1, 0)
46       , int n = -2
47       , bool not_null = false
48     )
49       : iter_(begin, cur, end, next_search, rex, flags, not_null)
50       , result_()
51       , n_((-2 == n) ? (int)subs.size() - 1 : n)
52       , subs_()
53     {
54         BOOST_ASSERT(0 != subs.size());
55         this->subs_.swap(subs);
56     }
57 
nextboost::xpressive::detail::regex_token_iterator_impl58     bool next()
59     {
60         if(-1 != this->n_)
61         {
62             BidiIter cur = this->iter_.state_.cur_;
63             if(0 != (++this->n_ %= (int)this->subs_.size()) || this->iter_.next())
64             {
65                 this->result_ = (-1 == this->subs_[ this->n_ ])
66                     ? this->iter_.what_.prefix()
67                     : this->iter_.what_[ this->subs_[ this->n_ ] ];
68                 return true;
69             }
70             else if(-1 == this->subs_[ this->n_-- ] && cur != this->iter_.state_.end_)
71             {
72                 this->result_ = value_type(cur, this->iter_.state_.end_, true);
73                 return true;
74             }
75         }
76 
77         return false;
78     }
79 
equal_toboost::xpressive::detail::regex_token_iterator_impl80     bool equal_to(regex_token_iterator_impl<BidiIter> const &that) const
81     {
82         return this->iter_.equal_to(that.iter_) && this->n_ == that.n_;
83     }
84 
85     regex_iterator_impl<BidiIter> iter_;
86     value_type result_;
87     int n_;
88     std::vector<int> subs_;
89 };
90 
// Identity overload: a plain integer already is a mark number.
inline int get_mark_number(int i)
{
    int const mark_number = i;
    return mark_number;
}
95 
// Wraps a single sub-match index in a one-element vector.
inline std::vector<int> to_vector(int subs)
{
    std::vector<int> single(1, subs);
    return single;
}
100 
// A vector of int is already in the required form; hand back the very same
// object by reference rather than copying it.
inline std::vector<int> const &to_vector(std::vector<int> const &subs)
{
    std::vector<int> const &same = subs;
    return same;
}
105 
106 template<typename Int, std::size_t Size>
to_vector(Int const (& sub_matches)[Size])107 inline std::vector<int> to_vector(Int const (&sub_matches)[ Size ])
108 {
109     // so that people can specify sub-match indices inline with
110     // string literals, like "\1\2\3", leave off the trailing '\0'
111     std::size_t const size = Size - is_same<Int, char>::value;
112     std::vector<int> vect(size);
113     for(std::size_t i = 0; i < size; ++i)
114     {
115         vect[i] = get_mark_number(sub_matches[i]);
116     }
117     return vect;
118 }
119 
120 template<typename Int>
to_vector(std::vector<Int> const & sub_matches)121 inline std::vector<int> to_vector(std::vector<Int> const &sub_matches)
122 {
123     BOOST_MPL_ASSERT((is_convertible<Int, int>));
124     return std::vector<int>(sub_matches.begin(), sub_matches.end());
125 }
126 
127 } // namespace detail
128 
129 //////////////////////////////////////////////////////////////////////////
130 // regex_token_iterator
131 //
132 template<typename BidiIter>
133 struct regex_token_iterator
134 {
135     typedef basic_regex<BidiIter> regex_type;
136     typedef typename iterator_value<BidiIter>::type char_type;
137     typedef sub_match<BidiIter> value_type;
138     typedef std::ptrdiff_t difference_type;
139     typedef value_type const *pointer;
140     typedef value_type const &reference;
141     typedef std::forward_iterator_tag iterator_category;
142 
143     /// INTERNAL ONLY
144     typedef detail::regex_token_iterator_impl<BidiIter> impl_type_;
145 
146     /// \post \c *this is the end of sequence iterator.
regex_token_iteratorboost::xpressive::regex_token_iterator147     regex_token_iterator()
148       : impl_()
149     {
150     }
151 
152     /// \param begin The beginning of the character range to search.
153     /// \param end The end of the character range to search.
154     /// \param rex The regex pattern to search for.
155     /// \pre \c [begin,end) is a valid range.
regex_token_iteratorboost::xpressive::regex_token_iterator156     regex_token_iterator
157     (
158         BidiIter begin
159       , BidiIter end
160       , basic_regex<BidiIter> const &rex
161     )
162       : impl_()
163     {
164         if(0 != rex.regex_id())
165         {
166             this->impl_ = new impl_type_(begin, begin, end, begin, rex);
167             this->next_();
168         }
169     }
170 
171     /// \param begin The beginning of the character range to search.
172     /// \param end The end of the character range to search.
173     /// \param rex The regex pattern to search for.
174     /// \param args A let() expression with argument bindings for semantic actions.
175     /// \pre \c [begin,end) is a valid range.
176     template<typename LetExpr>
regex_token_iteratorboost::xpressive::regex_token_iterator177     regex_token_iterator
178     (
179         BidiIter begin
180       , BidiIter end
181       , basic_regex<BidiIter> const &rex
182       , detail::let_<LetExpr> const &args
183     )
184       : impl_()
185     {
186         if(0 != rex.regex_id())
187         {
188             this->impl_ = new impl_type_(begin, begin, end, begin, rex);
189             detail::bind_args(args, this->impl_->iter_.what_);
190             this->next_();
191         }
192     }
193 
194     /// \param begin The beginning of the character range to search.
195     /// \param end The end of the character range to search.
196     /// \param rex The regex pattern to search for.
197     /// \param subs A range of integers designating sub-matches to be treated as tokens.
198     /// \param flags Optional match flags, used to control how the expression is matched against the sequence. (See match_flag_type.)
199     /// \pre \c [begin,end) is a valid range.
200     /// \pre \c subs is either an integer greater or equal to -1,
201     ///     or else an array or non-empty \c std::vector\<\> of such integers.
202     template<typename Subs>
regex_token_iteratorboost::xpressive::regex_token_iterator203     regex_token_iterator
204     (
205         BidiIter begin
206       , BidiIter end
207       , basic_regex<BidiIter> const &rex
208       , Subs const &subs
209       , regex_constants::match_flag_type flags = regex_constants::match_default
210     )
211       : impl_()
212     {
213         if(0 != rex.regex_id())
214         {
215             this->impl_ = new impl_type_(begin, begin, end, begin, rex, flags, detail::to_vector(subs));
216             this->next_();
217         }
218     }
219 
220     /// \param begin The beginning of the character range to search.
221     /// \param end The end of the character range to search.
222     /// \param rex The regex pattern to search for.
223     /// \param subs A range of integers designating sub-matches to be treated as tokens.
224     /// \param args A let() expression with argument bindings for semantic actions.
225     /// \param flags Optional match flags, used to control how the expression is matched against the sequence. (See match_flag_type.)
226     /// \pre \c [begin,end) is a valid range.
227     /// \pre \c subs is either an integer greater or equal to -1,
228     ///     or else an array or non-empty \c std::vector\<\> of such integers.
229     template<typename Subs, typename LetExpr>
regex_token_iteratorboost::xpressive::regex_token_iterator230     regex_token_iterator
231     (
232         BidiIter begin
233       , BidiIter end
234       , basic_regex<BidiIter> const &rex
235       , Subs const &subs
236       , detail::let_<LetExpr> const &args
237       , regex_constants::match_flag_type flags = regex_constants::match_default
238     )
239       : impl_()
240     {
241         if(0 != rex.regex_id())
242         {
243             this->impl_ = new impl_type_(begin, begin, end, begin, rex, flags, detail::to_vector(subs));
244             detail::bind_args(args, this->impl_->iter_.what_);
245             this->next_();
246         }
247     }
248 
249     /// \post <tt>*this == that</tt>
regex_token_iteratorboost::xpressive::regex_token_iterator250     regex_token_iterator(regex_token_iterator<BidiIter> const &that)
251       : impl_(that.impl_) // COW
252     {
253     }
254 
255     /// \post <tt>*this == that</tt>
operator =boost::xpressive::regex_token_iterator256     regex_token_iterator<BidiIter> &operator =(regex_token_iterator<BidiIter> const &that)
257     {
258         this->impl_ = that.impl_; // COW
259         return *this;
260     }
261 
operator ==(regex_token_iterator<BidiIter> const & left,regex_token_iterator<BidiIter> const & right)262     friend bool operator ==(regex_token_iterator<BidiIter> const &left, regex_token_iterator<BidiIter> const &right)
263     {
264         if(!left.impl_ || !right.impl_)
265         {
266             return !left.impl_ && !right.impl_;
267         }
268 
269         return left.impl_->equal_to(*right.impl_);
270     }
271 
operator !=(regex_token_iterator<BidiIter> const & left,regex_token_iterator<BidiIter> const & right)272     friend bool operator !=(regex_token_iterator<BidiIter> const &left, regex_token_iterator<BidiIter> const &right)
273     {
274         return !(left == right);
275     }
276 
operator *boost::xpressive::regex_token_iterator277     value_type const &operator *() const
278     {
279         return this->impl_->result_;
280     }
281 
operator ->boost::xpressive::regex_token_iterator282     value_type const *operator ->() const
283     {
284         return &this->impl_->result_;
285     }
286 
287     /// If N == -1 then sets *this equal to the end of sequence iterator.
288     /// Otherwise if N+1 \< subs.size(), then increments N and sets result equal to
289     /// ((subs[N] == -1) ? value_type(what.prefix().str()) : value_type(what[subs[N]].str())).
290     /// Otherwise if what.prefix().first != what[0].second and if the element match_prev_avail is
291     /// not set in flags then sets it. Then locates the next match as if by calling
292     /// regex_search(what[0].second, end, what, *pre, flags), with the following variation:
293     /// in the event that the previous match found was of zero length (what[0].length() == 0)
294     /// then attempts to find a non-zero length match starting at what[0].second, only if that
295     /// fails and provided what[0].second != suffix().second does it look for a (possibly zero
296     /// length) match starting from what[0].second + 1.  If such a match is found then sets N
297     /// equal to zero, and sets result equal to
298     /// ((subs[N] == -1) ? value_type(what.prefix().str()) : value_type(what[subs[N]].str())).
299     /// Otherwise if no further matches were found, then let last_end be the endpoint of the last
300     /// match that was found. Then if last_end != end and subs[0] == -1 sets N equal to -1 and
301     /// sets result equal to value_type(last_end, end). Otherwise sets *this equal to the end
302     /// of sequence iterator.
operator ++boost::xpressive::regex_token_iterator303     regex_token_iterator<BidiIter> &operator ++()
304     {
305         this->fork_(); // un-share the implementation
306         this->next_();
307         return *this;
308     }
309 
operator ++boost::xpressive::regex_token_iterator310     regex_token_iterator<BidiIter> operator ++(int)
311     {
312         regex_token_iterator<BidiIter> tmp(*this);
313         ++*this;
314         return tmp;
315     }
316 
317 private:
318 
319     /// INTERNAL ONLY
fork_boost::xpressive::regex_token_iterator320     void fork_()
321     {
322         if(1 != this->impl_->use_count())
323         {
324             intrusive_ptr<impl_type_> clone = new impl_type_
325             (
326                 this->impl_->iter_.state_.begin_
327               , this->impl_->iter_.state_.cur_
328               , this->impl_->iter_.state_.end_
329               , this->impl_->iter_.state_.next_search_
330               , this->impl_->iter_.rex_
331               , this->impl_->iter_.flags_
332               , this->impl_->subs_
333               , this->impl_->n_
334               , this->impl_->iter_.not_null_
335             );
336 
337             // only copy the match_results struct if we have to. Note: if the next call
338             // to impl_->next() will return false or call regex_search, we don't need to
339             // copy the match_results struct.
340             if(-1 != this->impl_->n_ && this->impl_->n_ + 1 != static_cast<int>(this->impl_->subs_.size()))
341             {
342                 // BUGBUG This is expensive -- it causes the sequence_stack to be cleared.
343                 // Find a better way
344                 clone->iter_.what_ = this->impl_->iter_.what_;
345             }
346             else
347             {
348                 // At the very least, copy the action args
349                 detail::core_access<BidiIter>::get_action_args(clone->iter_.what_)
350                     = detail::core_access<BidiIter>::get_action_args(this->impl_->iter_.what_);
351             }
352 
353             this->impl_.swap(clone);
354         }
355     }
356 
357     /// INTERNAL ONLY
next_boost::xpressive::regex_token_iterator358     void next_()
359     {
360         BOOST_ASSERT(this->impl_ && 1 == this->impl_->use_count());
361         if(!this->impl_->next())
362         {
363             this->impl_ = 0;
364         }
365     }
366 
367     intrusive_ptr<impl_type_> impl_;
368 };
369 
370 }} // namespace boost::xpressive
371 
372 #endif
373