1 // string_token.hpp
2 // Copyright (c) 2007-2009 Ben Hanson (http://www.benhanson.net/)
3 //
4 // Distributed under the Boost Software License, Version 1.0. (See accompanying
5 // file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
6 #ifndef BOOST_LEXER_STRING_TOKEN_HPP
7 #define BOOST_LEXER_STRING_TOKEN_HPP
8 
9 #include <algorithm>
10 #include "size_t.hpp"
11 #include "consts.hpp" // num_chars, num_wchar_ts
12 #include <string>
13 #include <limits>
14 
15 namespace boost
16 {
17 namespace lexer
18 {
19 template<typename CharT>
20 struct basic_string_token
21 {
22     typedef std::basic_string<CharT> string;
23 
24     bool _negated;
25     string _charset;
26 
basic_string_tokenboost::lexer::basic_string_token27     basic_string_token () :
28         _negated (false)
29     {
30     }
31 
basic_string_tokenboost::lexer::basic_string_token32     basic_string_token (const bool negated_, const string &charset_) :
33         _negated (negated_),
34         _charset (charset_)
35     {
36     }
37 
remove_duplicatesboost::lexer::basic_string_token38     void remove_duplicates ()
39     {
40         const CharT *start_ = _charset.c_str ();
41         const CharT *end_ = start_ + _charset.size ();
42 
43         // Optimisation for very large charsets:
44         // sorting via pointers is much quicker than
45         // via iterators...
46         std::sort (const_cast<CharT *> (start_), const_cast<CharT *> (end_));
47         _charset.erase (std::unique (_charset.begin (), _charset.end ()),
48             _charset.end ());
49     }
50 
normaliseboost::lexer::basic_string_token51     void normalise ()
52     {
53         const std::size_t max_chars_ = sizeof (CharT) == 1 ?
54             num_chars : num_wchar_ts;
55 
56         if (_charset.length () == max_chars_)
57         {
58             _negated = !_negated;
59             _charset.clear ();
60         }
61         else if (_charset.length () > max_chars_ / 2)
62         {
63             negate ();
64         }
65     }
66 
negateboost::lexer::basic_string_token67     void negate ()
68     {
69         const std::size_t max_chars_ = sizeof (CharT) == 1 ?
70             num_chars : num_wchar_ts;
71         CharT curr_char_ = (std::numeric_limits<CharT>::min)();
72         string temp_;
73         const CharT *curr_ = _charset.c_str ();
74         const CharT *chars_end_ = curr_ + _charset.size ();
75 
76         _negated = !_negated;
77         temp_.resize (max_chars_ - _charset.size ());
78 
79         CharT *ptr_ = const_cast<CharT *> (temp_.c_str ());
80         std::size_t i_ = 0;
81 
82         while (curr_ < chars_end_)
83         {
84             while (*curr_ > curr_char_)
85             {
86                 *ptr_ = curr_char_;
87                 ++ptr_;
88                 ++curr_char_;
89                 ++i_;
90             }
91 
92             ++curr_char_;
93             ++curr_;
94             ++i_;
95         }
96 
97         for (; i_ < max_chars_; ++i_)
98         {
99             *ptr_ = curr_char_;
100             ++ptr_;
101             ++curr_char_;
102         }
103 
104         _charset = temp_;
105     }
106 
operator <boost::lexer::basic_string_token107     bool operator < (const basic_string_token &rhs_) const
108     {
109         return _negated < rhs_._negated ||
110             (_negated == rhs_._negated && _charset < rhs_._charset);
111     }
112 
emptyboost::lexer::basic_string_token113     bool empty () const
114     {
115         return _charset.empty () && !_negated;
116     }
117 
anyboost::lexer::basic_string_token118     bool any () const
119     {
120         return _charset.empty () && _negated;
121     }
122 
clearboost::lexer::basic_string_token123     void clear ()
124     {
125         _negated = false;
126         _charset.clear ();
127     }
128 
intersectboost::lexer::basic_string_token129     void intersect (basic_string_token &rhs_, basic_string_token &overlap_)
130     {
131         if ((any () && rhs_.any ()) || (_negated == rhs_._negated &&
132             !any () && !rhs_.any ()))
133         {
134             intersect_same_types (rhs_, overlap_);
135         }
136         else
137         {
138             intersect_diff_types (rhs_, overlap_);
139         }
140     }
141 
escape_charboost::lexer::basic_string_token142     static void escape_char (const CharT ch_, string &out_)
143     {
144         switch (ch_)
145         {
146             case '\0':
147                 out_ += '\\';
148                 out_ += '0';
149                 break;
150             case '\a':
151                 out_ += '\\';
152                 out_ += 'a';
153                 break;
154             case '\b':
155                 out_ += '\\';
156                 out_ += 'b';
157                 break;
158             case 27:
159                 out_ += '\\';
160                 out_ += 'x';
161                 out_ += '1';
162                 out_ += 'b';
163                 break;
164             case '\f':
165                 out_ += '\\';
166                 out_ += 'f';
167                 break;
168             case '\n':
169                 out_ += '\\';
170                 out_ += 'n';
171                 break;
172             case '\r':
173                 out_ += '\\';
174                 out_ += 'r';
175                 break;
176             case '\t':
177                 out_ += '\\';
178                 out_ += 't';
179                 break;
180             case '\v':
181                 out_ += '\\';
182                 out_ += 'v';
183                 break;
184             case '\\':
185                 out_ += '\\';
186                 out_ += '\\';
187                 break;
188             case '"':
189                 out_ += '\\';
190                 out_ += '"';
191                 break;
192             case '\'':
193                 out_ += '\\';
194                 out_ += '\'';
195                 break;
196             default:
197             {
198                 if (ch_ < 32 && ch_ >= 0)
199                 {
200                     std::basic_stringstream<CharT> ss_;
201 
202                     out_ += '\\';
203                     out_ += 'x';
204                     ss_ << std::hex <<
205                         static_cast<std::size_t> (ch_);
206                     out_ += ss_.str ();
207                 }
208                 else
209                 {
210                     out_ += ch_;
211                 }
212 
213                 break;
214             }
215         }
216     }
217 
218 private:
intersect_same_typesboost::lexer::basic_string_token219     void intersect_same_types (basic_string_token &rhs_,
220         basic_string_token &overlap_)
221     {
222         if (any ())
223         {
224             clear ();
225             overlap_._negated = true;
226             rhs_.clear ();
227         }
228         else
229         {
230             typename string::iterator iter_ = _charset.begin ();
231             typename string::iterator end_ = _charset.end ();
232             typename string::iterator rhs_iter_ = rhs_._charset.begin ();
233             typename string::iterator rhs_end_ = rhs_._charset.end ();
234 
235             overlap_._negated = _negated;
236 
237             while (iter_ != end_ && rhs_iter_ != rhs_end_)
238             {
239                 if (*iter_ < *rhs_iter_)
240                 {
241                     ++iter_;
242                 }
243                 else if (*iter_ > *rhs_iter_)
244                 {
245                     ++rhs_iter_;
246                 }
247                 else
248                 {
249                     overlap_._charset += *iter_;
250                     iter_ = _charset.erase (iter_);
251                     end_ = _charset.end ();
252                     rhs_iter_ = rhs_._charset.erase (rhs_iter_);
253                     rhs_end_ = rhs_._charset.end ();
254                 }
255             }
256 
257             if (_negated)
258             {
259                 // duplicates already merged, so safe to merge
260                 // using std lib.
261 
262                 // src, dest
263                 merge (_charset, overlap_._charset);
264                 // duplicates already merged, so safe to merge
265                 // using std lib.
266 
267                 // src, dest
268                 merge (rhs_._charset, overlap_._charset);
269                 _negated = false;
270                 rhs_._negated = false;
271                 std::swap (_charset, rhs_._charset);
272                 normalise ();
273                 overlap_.normalise ();
274                 rhs_.normalise ();
275             }
276             else if (!overlap_._charset.empty ())
277             {
278                 normalise ();
279                 overlap_.normalise ();
280                 rhs_.normalise ();
281             }
282         }
283     }
284 
intersect_diff_typesboost::lexer::basic_string_token285     void intersect_diff_types (basic_string_token &rhs_,
286         basic_string_token &overlap_)
287     {
288         if (any ())
289         {
290             intersect_any (rhs_, overlap_);
291         }
292         else if (_negated)
293         {
294             intersect_negated (rhs_, overlap_);
295         }
296         else // _negated == false
297         {
298             intersect_charset (rhs_, overlap_);
299         }
300     }
301 
intersect_anyboost::lexer::basic_string_token302     void intersect_any (basic_string_token &rhs_, basic_string_token &overlap_)
303     {
304         if (rhs_._negated)
305         {
306             rhs_.intersect_negated (*this, overlap_);
307         }
308         else // rhs._negated == false
309         {
310             rhs_.intersect_charset (*this, overlap_);
311         }
312     }
313 
intersect_negatedboost::lexer::basic_string_token314     void intersect_negated (basic_string_token &rhs_,
315         basic_string_token &overlap_)
316     {
317         if (rhs_.any ())
318         {
319             overlap_._negated = true;
320             overlap_._charset = _charset;
321             rhs_._negated = false;
322             rhs_._charset = _charset;
323             clear ();
324         }
325         else // rhs._negated == false
326         {
327             rhs_.intersect_charset (*this, overlap_);
328         }
329     }
330 
intersect_charsetboost::lexer::basic_string_token331     void intersect_charset (basic_string_token &rhs_,
332         basic_string_token &overlap_)
333     {
334         if (rhs_.any ())
335         {
336             overlap_._charset = _charset;
337             rhs_._negated = true;
338             rhs_._charset = _charset;
339             clear ();
340         }
341         else // rhs_._negated == true
342         {
343             typename string::iterator iter_ = _charset.begin ();
344             typename string::iterator end_ = _charset.end ();
345             typename string::iterator rhs_iter_ = rhs_._charset.begin ();
346             typename string::iterator rhs_end_ = rhs_._charset.end ();
347 
348             while (iter_ != end_ && rhs_iter_ != rhs_end_)
349             {
350                 if (*iter_ < *rhs_iter_)
351                 {
352                     overlap_._charset += *iter_;
353                     rhs_iter_ = rhs_._charset.insert (rhs_iter_, *iter_);
354                     ++rhs_iter_;
355                     rhs_end_ = rhs_._charset.end ();
356                     iter_ = _charset.erase (iter_);
357                     end_ = _charset.end ();
358                 }
359                 else if (*iter_ > *rhs_iter_)
360                 {
361                     ++rhs_iter_;
362                 }
363                 else
364                 {
365                     ++iter_;
366                     ++rhs_iter_;
367                 }
368             }
369 
370             if (iter_ != end_)
371             {
372                 // nothing bigger in rhs_ than iter_,
373                 // so safe to merge using std lib.
374                 string temp_ (iter_, end_);
375 
376                 // src, dest
377                 merge (temp_, overlap_._charset);
378                 _charset.erase (iter_, end_);
379             }
380 
381             if (!overlap_._charset.empty ())
382             {
383                 merge (overlap_._charset, rhs_._charset);
384                 // possible duplicates, so check for any and erase.
385                 rhs_._charset.erase (std::unique (rhs_._charset.begin (),
386                     rhs_._charset.end ()), rhs_._charset.end ());
387                 normalise ();
388                 overlap_.normalise ();
389                 rhs_.normalise ();
390             }
391         }
392     }
393 
mergeboost::lexer::basic_string_token394     void merge (string &src_, string &dest_)
395     {
396         string tmp_ (src_.size () + dest_.size (), 0);
397 
398         std::merge (src_.begin (), src_.end (), dest_.begin (), dest_.end (),
399             tmp_.begin ());
400         dest_ = tmp_;
401     }
402 };
403 }
404 }
405 
406 #endif
407