1 // tokeniser.hpp
2 // Copyright (c) 2007-2009 Ben Hanson (http://www.benhanson.net/)
3 //
4 // Distributed under the Boost Software License, Version 1.0. (See accompanying
5 // file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
6 #ifndef BOOST_LEXER_RE_TOKENISER_HPP
7 #define BOOST_LEXER_RE_TOKENISER_HPP
8 
9 // memcpy()
10 #include <cstring>
11 #include <map>
12 #include "num_token.hpp"
13 #include "../../runtime_error.hpp"
14 #include "../../size_t.hpp"
15 #include <sstream>
16 #include "../../string_token.hpp"
17 #include "re_tokeniser_helper.hpp"
18 
19 namespace boost
20 {
21 namespace lexer
22 {
23 namespace detail
24 {
25 template<typename CharT>
26 class basic_re_tokeniser
27 {
28 public:
29     typedef basic_num_token<CharT> num_token;
30     typedef basic_re_tokeniser_state<CharT> state;
31     typedef basic_string_token<CharT> string_token;
32     typedef typename string_token::string string;
33     typedef std::map<string_token, std::size_t> token_map;
34     typedef std::pair<string_token, std::size_t> token_pair;
35 
next(state & state_,token_map & map_,num_token & token_)36     static void next (state &state_, token_map &map_, num_token &token_)
37     {
38         CharT ch_ = 0;
39         bool eos_ = state_.next (ch_);
40 
41         token_.min_max (0, false, 0);
42 
43         while (!eos_ && ch_ == '"')
44         {
45             state_._in_string ^= 1;
46             eos_ = state_.next (ch_);
47         }
48 
49         if (eos_)
50         {
51             if (state_._in_string)
52             {
53                 throw runtime_error ("Unexpected end of regex "
54                     "(missing '\"').");
55             }
56 
57             if (state_._paren_count)
58             {
59                 throw runtime_error ("Unexpected end of regex "
60                     "(missing ')').");
61             }
62 
63             token_.set (num_token::END, null_token);
64         }
65         else
66         {
67             if (ch_ == '\\')
68             {
69                 // Even if we are in a string, respect escape sequences...
70                 escape (state_, map_, token_);
71             }
72             else if (state_._in_string)
73             {
74                 // All other meta characters lose their special meaning
75                 // inside a string.
76                 create_charset_token (string (1, ch_), false, map_, token_);
77             }
78             else
79             {
80                 // Not an escape sequence and not inside a string, so
81                 // check for meta characters.
82                 switch (ch_)
83                 {
84                 case '(':
85                     token_.set (num_token::OPENPAREN, null_token);
86                     ++state_._paren_count;
87                     read_options (state_);
88                     break;
89                 case ')':
90                     --state_._paren_count;
91 
92                     if (state_._paren_count < 0)
93                     {
94                         std::ostringstream ss_;
95 
96                         ss_ << "Number of open parenthesis < 0 at index " <<
97                             state_.index () - 1 << '.';
98                         throw runtime_error (ss_.str ().c_str ());
99                     }
100 
101                     token_.set (num_token::CLOSEPAREN, null_token);
102 
103                     if (!state_._flags_stack.empty ())
104                     {
105                         state_._flags = state_._flags_stack.top ();
106                         state_._flags_stack.pop ();
107                     }
108                     break;
109                 case '?':
110                     if (!state_.eos () && *state_._curr == '?')
111                     {
112                         token_.set (num_token::AOPT, null_token);
113                         state_.increment ();
114                     }
115                     else
116                     {
117                         token_.set (num_token::OPT, null_token);
118                     }
119 
120                     break;
121                 case '*':
122                     if (!state_.eos () && *state_._curr == '?')
123                     {
124                         token_.set (num_token::AZEROORMORE, null_token);
125                         state_.increment ();
126                     }
127                     else
128                     {
129                         token_.set (num_token::ZEROORMORE, null_token);
130                     }
131 
132                     break;
133                 case '+':
134                     if (!state_.eos () && *state_._curr == '?')
135                     {
136                         token_.set (num_token::AONEORMORE, null_token);
137                         state_.increment ();
138                     }
139                     else
140                     {
141                         token_.set (num_token::ONEORMORE, null_token);
142                     }
143 
144                     break;
145                 case '{':
146                     open_curly (state_, token_);
147                     break;
148                 case '|':
149                     token_.set (num_token::OR, null_token);
150                     break;
151                 case '^':
152                     if (state_._curr - 1 == state_._start)
153                     {
154                         token_.set (num_token::CHARSET, bol_token);
155                         state_._seen_BOL_assertion = true;
156                     }
157                     else
158                     {
159                         create_charset_token (string (1, ch_), false,
160                             map_, token_);
161                     }
162 
163                     break;
164                 case '$':
165                     if (state_._curr == state_._end)
166                     {
167                         token_.set (num_token::CHARSET, eol_token);
168                         state_._seen_EOL_assertion = true;
169                     }
170                     else
171                     {
172                         create_charset_token (string (1, ch_), false,
173                             map_, token_);
174                     }
175 
176                     break;
177                 case '.':
178                 {
179                     string dot_;
180 
181                     if (state_._flags & dot_not_newline)
182                     {
183                         dot_ = '\n';
184                     }
185 
186                     create_charset_token (dot_, true, map_, token_);
187                     break;
188                 }
189                 case '[':
190                 {
191                     charset (state_, map_, token_);
192                     break;
193                 }
194                 case '/':
195                     throw runtime_error("Lookahead ('/') is not supported yet.");
196                     break;
197                 default:
198                     if ((state_._flags & icase) &&
199                         (std::isupper (ch_, state_._locale) ||
200                         std::islower (ch_, state_._locale)))
201                     {
202                         CharT upper_ = std::toupper (ch_, state_._locale);
203                         CharT lower_ = std::tolower (ch_, state_._locale);
204 
205                         string str_ (1, upper_);
206 
207                         str_ += lower_;
208                         create_charset_token (str_, false, map_, token_);
209                     }
210                     else
211                     {
212                         create_charset_token (string (1, ch_), false,
213                             map_, token_);
214                     }
215 
216                     break;
217                 }
218             }
219         }
220     }
221 
222 private:
223     typedef basic_re_tokeniser_helper<CharT> tokeniser_helper;
224 
read_options(state & state_)225     static void read_options (state &state_)
226     {
227         if (!state_.eos () && *state_._curr == '?')
228         {
229             CharT ch_ = 0;
230             bool eos_ = false;
231             bool negate_ = false;
232 
233             state_.increment ();
234             eos_ = state_.next (ch_);
235             state_._flags_stack.push (state_._flags);
236 
237             while (!eos_ && ch_ != ':')
238             {
239                 switch (ch_)
240                 {
241                 case '-':
242                     negate_ ^= 1;
243                     break;
244                 case 'i':
245                     if (negate_)
246                     {
247                         state_._flags = static_cast<regex_flags>
248                             (state_._flags & ~icase);
249                     }
250                     else
251                     {
252                         state_._flags = static_cast<regex_flags>
253                             (state_._flags | icase);
254                     }
255 
256                     negate_ = false;
257                     break;
258                 case 's':
259                     if (negate_)
260                     {
261                         state_._flags = static_cast<regex_flags>
262                             (state_._flags | dot_not_newline);
263                     }
264                     else
265                     {
266                         state_._flags = static_cast<regex_flags>
267                             (state_._flags & ~dot_not_newline);
268                     }
269 
270                     negate_ = false;
271                     break;
272                 default:
273                 {
274                     std::ostringstream ss_;
275 
276                     ss_ << "Unknown option at index " <<
277                         state_.index () - 1 << '.';
278                     throw runtime_error (ss_.str ().c_str ());
279                 }
280                 }
281 
282                 eos_ = state_.next (ch_);
283             }
284 
285             // End of string handler will handle early termination
286         }
287         else if (!state_._flags_stack.empty ())
288         {
289             state_._flags_stack.push (state_._flags);
290         }
291     }
292 
escape(state & state_,token_map & map_,num_token & token_)293     static void escape (state &state_, token_map &map_, num_token &token_)
294     {
295         CharT ch_ = 0;
296         std::size_t str_len_ = 0;
297         const CharT *str_ = tokeniser_helper::escape_sequence (state_,
298             ch_, str_len_);
299 
300         if (str_)
301         {
302             state state2_ (str_ + 1, str_ + str_len_, state_._flags,
303                 state_._locale);
304 
305             charset (state2_, map_, token_);
306         }
307         else
308         {
309             create_charset_token (string (1, ch_), false, map_, token_);
310         }
311     }
312 
charset(state & state_,token_map & map_,num_token & token_)313     static void charset (state &state_, token_map &map_, num_token &token_)
314     {
315         string chars_;
316         bool negated_ = false;
317 
318         tokeniser_helper::charset (state_, chars_, negated_);
319         create_charset_token (chars_, negated_, map_, token_);
320     }
321 
create_charset_token(const string & charset_,const bool negated_,token_map & map_,num_token & token_)322     static void create_charset_token (const string &charset_,
323         const bool negated_, token_map &map_, num_token &token_)
324     {
325         std::size_t id_ = null_token;
326         string_token stok_ (negated_, charset_);
327 
328         stok_.remove_duplicates ();
329         stok_.normalise ();
330 
331         typename token_map::const_iterator iter_ = map_.find (stok_);
332 
333         if (iter_ == map_.end ())
334         {
335             id_ = map_.size ();
336             map_.insert (token_pair (stok_, id_));
337         }
338         else
339         {
340             id_ = iter_->second;
341         }
342 
343         token_.set (num_token::CHARSET, id_);
344     }
345 
open_curly(state & state_,num_token & token_)346     static void open_curly (state &state_, num_token &token_)
347     {
348         if (state_.eos ())
349         {
350             throw runtime_error ("Unexpected end of regex "
351                 "(missing '}').");
352         }
353         else if (*state_._curr >= '0' && *state_._curr <= '9')
354         {
355             repeat_n (state_, token_);
356 
357             if (!state_.eos () && *state_._curr == '?')
358             {
359                 token_._type = num_token::AREPEATN;
360                 state_.increment ();
361             }
362         }
363         else
364         {
365             macro (state_, token_);
366         }
367     }
368 
369     // SYNTAX:
370     //   {n[,[n]]}
371     // SEMANTIC RULES:
372     //   {0} - INVALID (throw exception)
373     //   {0,} = *
374     //   {0,0} - INVALID (throw exception)
375     //   {0,1} = ?
376     //   {1,} = +
377     //   {min,max} where min == max - {min}
378     //   {min,max} where max < min - INVALID (throw exception)
repeat_n(state & state_,num_token & token_)379     static void repeat_n (state &state_, num_token &token_)
380     {
381         CharT ch_ = 0;
382         bool eos_ = state_.next (ch_);
383 
384         while (!eos_ && ch_ >= '0' && ch_ <= '9')
385         {
386             token_._min *= 10;
387             token_._min += ch_ - '0';
388             eos_ = state_.next (ch_);
389         }
390 
391         if (eos_)
392         {
393             throw runtime_error ("Unexpected end of regex "
394                 "(missing '}').");
395         }
396 
397         bool min_max_ = false;
398         bool repeatn_ = true;
399 
400         token_._comma = ch_ == ',';
401 
402         if (token_._comma)
403         {
404             eos_ = state_.next (ch_);
405 
406             if (eos_)
407             {
408                 throw runtime_error ("Unexpected end of regex "
409                     "(missing '}').");
410             }
411 
412             if (ch_ == '}')
413             {
414                 // Small optimisation: Check for '*' equivalency.
415                 if (token_._min == 0)
416                 {
417                     token_.set (num_token::ZEROORMORE, null_token);
418                     repeatn_ = false;
419                 }
420                 // Small optimisation: Check for '+' equivalency.
421                 else if (token_._min == 1)
422                 {
423                     token_.set (num_token::ONEORMORE, null_token);
424                     repeatn_ = false;
425                 }
426             }
427             else
428             {
429                 if (ch_ < '0' || ch_ > '9')
430                 {
431                     std::ostringstream ss_;
432 
433                     ss_ << "Missing '}' at index " <<
434                         state_.index () - 1 << '.';
435                     throw runtime_error (ss_.str ().c_str ());
436                 }
437 
438                 min_max_ = true;
439 
440                 do
441                 {
442                     token_._max *= 10;
443                     token_._max += ch_ - '0';
444                     eos_ = state_.next (ch_);
445                 } while (!eos_ && ch_ >= '0' && ch_ <= '9');
446 
447                 if (eos_)
448                 {
449                     throw runtime_error ("Unexpected end of regex "
450                         "(missing '}').");
451                 }
452 
453                 // Small optimisation: Check for '?' equivalency.
454                 if (token_._min == 0 && token_._max == 1)
455                 {
456                     token_.set (num_token::OPT, null_token);
457                     repeatn_ = false;
458                 }
459                 // Small optimisation: if min == max, then min.
460                 else if (token_._min == token_._max)
461                 {
462                     token_._comma = false;
463                     min_max_ = false;
464                     token_._max = 0;
465                 }
466             }
467         }
468 
469         if (ch_ != '}')
470         {
471             std::ostringstream ss_;
472 
473             ss_ << "Missing '}' at index " << state_.index () - 1 << '.';
474             throw runtime_error (ss_.str ().c_str ());
475         }
476 
477         if (repeatn_)
478         {
479             // SEMANTIC VALIDATION follows:
480             // NOTE: {0,} has already become *
481             // therefore we don't check for a comma.
482             if (token_._min == 0 && token_._max == 0)
483             {
484                 std::ostringstream ss_;
485 
486                 ss_ << "Cannot have exactly zero repeats preceding index " <<
487                     state_.index () << '.';
488                 throw runtime_error (ss_.str ().c_str ());
489             }
490 
491             if (min_max_ && token_._max < token_._min)
492             {
493                 std::ostringstream ss_;
494 
495                 ss_ << "Max less than min preceding index " <<
496                     state_.index () << '.';
497                 throw runtime_error (ss_.str ().c_str ());
498             }
499 
500             token_.set (num_token::REPEATN, null_token);
501         }
502     }
503 
macro(state & state_,num_token & token_)504     static void macro (state &state_, num_token &token_)
505     {
506         CharT ch_ = 0;
507         bool eos_ = false;
508         const CharT *start_ = state_._curr;
509 
510         state_.next (ch_);
511 
512         if (ch_ != '_' && !(ch_ >= 'A' && ch_ <= 'Z') &&
513             !(ch_ >= 'a' && ch_ <= 'z'))
514         {
515             std::ostringstream ss_;
516 
517             ss_ << "Invalid MACRO name at index " <<
518                 state_.index () - 1 << '.';
519             throw runtime_error (ss_.str ().c_str ());
520         }
521 
522         do
523         {
524             eos_ = state_.next (ch_);
525 
526             if (eos_)
527             {
528                 throw runtime_error ("Unexpected end of regex "
529                     "(missing '}').");
530             }
531         } while (ch_ == '_' || ch_ == '-' || (ch_ >= 'A' && ch_ <= 'Z') ||
532             (ch_ >= 'a' && ch_ <= 'z') || (ch_ >= '0' && ch_ <= '9'));
533 
534         if (ch_ != '}')
535         {
536             std::ostringstream ss_;
537 
538             ss_ << "Missing '}' at index " << state_.index () - 1 << '.';
539             throw runtime_error (ss_.str ().c_str ());
540         }
541 
542         std::size_t len_ = state_._curr - 1 - start_;
543 
544         if (len_ > max_macro_len)
545         {
546             std::basic_stringstream<CharT> ss_;
547             std::ostringstream os_;
548 
549             os_ << "MACRO name '";
550 
551             while (len_)
552             {
553                 os_ << ss_.narrow (*start_++, ' ');
554                 --len_;
555             }
556 
557             os_ << "' too long.";
558             throw runtime_error (os_.str ());
559         }
560 
561         token_.set (num_token::MACRO, null_token);
562 
563         // Some systems have memcpy in namespace std.
564         using namespace std;
565 
566         memcpy (token_._macro, start_, len_ * sizeof (CharT));
567         token_._macro[len_] = 0;
568     }
569 };
570 }
571 }
572 }
573 
574 #endif
575