1 // parser.hpp
2 // Copyright (c) 2007-2009 Ben Hanson (http://www.benhanson.net/)
3 //
4 // Distributed under the Boost Software License, Version 1.0. (See accompanying
5 // file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
6 #ifndef BOOST_LEXER_PARSER_HPP
7 #define BOOST_LEXER_PARSER_HPP
8 
9 #include <boost/assert.hpp>
10 #include "tree/end_node.hpp"
11 #include "tree/iteration_node.hpp"
12 #include "tree/leaf_node.hpp"
13 #include "../runtime_error.hpp"
14 #include "tree/selection_node.hpp"
15 #include "tree/sequence_node.hpp"
16 #include "../size_t.hpp"
17 #include "tokeniser/re_tokeniser.hpp"
18 
19 namespace boost
20 {
21 namespace lexer
22 {
23 namespace detail
24 {
25 template<typename CharT>
26 class basic_parser
27 {
28 public:
29     typedef basic_re_tokeniser<CharT> tokeniser;
30     typedef typename tokeniser::string string;
31     typedef std::map<string, const node *> macro_map;
32     typedef node::node_ptr_vector node_ptr_vector;
33     typedef typename tokeniser::num_token token;
34 
35 /*
36     General principles of regex parsing:
37     - Every regex is a sequence of sub-regexes.
38     - Regexes consist of operands and operators
39     - All operators decompose to sequence, selection ('|') and iteration ('*')
40     - Regex tokens are stored on the stack.
41     - When a complete sequence of regex tokens is on the stack it is processed.
42 
43 Grammar:
44 
45 <REGEX>      -> <OREXP>
46 <OREXP>      -> <SEQUENCE> | <OREXP>'|'<SEQUENCE>
47 <SEQUENCE>   -> <SUB>
48 <SUB>        -> <EXPRESSION> | <SUB><EXPRESSION>
49 <EXPRESSION> -> <REPEAT>
50 <REPEAT>     -> charset | macro | '('<REGEX>')' | <REPEAT><DUPLICATE>
51 <DUPLICATE>  -> '?' | '*' | '+' | '{n[,[m]]}'
52 */
parse(const CharT * start_,const CharT * const end_,const std::size_t id_,const std::size_t unique_id_,const std::size_t dfa_state_,const regex_flags flags_,const std::locale & locale_,node_ptr_vector & node_ptr_vector_,const macro_map & macromap_,typename tokeniser::token_map & map_,bool & seen_BOL_assertion_,bool & seen_EOL_assertion_)53     static node *parse (const CharT *start_, const CharT * const end_,
54         const std::size_t id_, const std::size_t unique_id_,
55         const std::size_t dfa_state_, const regex_flags flags_,
56         const std::locale &locale_, node_ptr_vector &node_ptr_vector_,
57         const macro_map &macromap_, typename tokeniser::token_map &map_,
58         bool &seen_BOL_assertion_, bool &seen_EOL_assertion_)
59     {
60         node *root_ = 0;
61         state state_ (start_, end_, flags_, locale_);
62         token lhs_token_;
63         token rhs_token_;
64         token_stack token_stack_;
65         tree_node_stack tree_node_stack_;
66         char action_ = 0;
67 
68         token_stack_.push (rhs_token_);
69         tokeniser::next (state_, map_, rhs_token_);
70 
71         do
72         {
73             lhs_token_ = token_stack_.top ();
74             action_ = lhs_token_.precedence (rhs_token_._type);
75 
76             switch (action_)
77             {
78             case '<':
79             case '=':
80                 token_stack_.push (rhs_token_);
81                 tokeniser::next (state_, map_, rhs_token_);
82                 break;
83             case '>':
84                 reduce (token_stack_, macromap_, node_ptr_vector_,
85                     tree_node_stack_);
86                 break;
87             default:
88                 std::ostringstream ss_;
89 
90                 ss_ << "A syntax error occurred: '" <<
91                     lhs_token_.precedence_string () <<
92                     "' against '" << rhs_token_.precedence_string () <<
93                     "' at index " << state_.index () << ".";
94                 throw runtime_error (ss_.str ().c_str ());
95                 break;
96             }
97         } while (!token_stack_.empty ());
98 
99         if (tree_node_stack_.empty ())
100         {
101             throw runtime_error ("Empty rules are not allowed.");
102         }
103 
104         BOOST_ASSERT(tree_node_stack_.size () == 1);
105 
106         node *lhs_node_ = tree_node_stack_.top ();
107 
108         tree_node_stack_.pop ();
109 
110         if (id_ == 0)
111         {
112             // Macros have no end state...
113             root_ = lhs_node_;
114         }
115         else
116         {
117             node_ptr_vector_->push_back (static_cast<end_node *>(0));
118 
119             node *rhs_node_ = new end_node (id_, unique_id_, dfa_state_);
120 
121             node_ptr_vector_->back () = rhs_node_;
122             node_ptr_vector_->push_back (static_cast<sequence_node *>(0));
123             node_ptr_vector_->back () = new sequence_node
124                 (lhs_node_, rhs_node_);
125             root_ = node_ptr_vector_->back ();
126         }
127 
128         // Done this way as bug in VC++ 6 prevents |= operator working
129         // properly!
130         if (state_._seen_BOL_assertion) seen_BOL_assertion_ = true;
131 
132         if (state_._seen_EOL_assertion) seen_EOL_assertion_ = true;
133 
134         return root_;
135     }
136 
137 private:
138     typedef typename tokeniser::state state;
139     typedef std::stack<token> token_stack;
140     typedef node::node_stack tree_node_stack;
141 
reduce(token_stack & token_stack_,const macro_map & macromap_,node_ptr_vector & node_vector_ptr_,tree_node_stack & tree_node_stack_)142     static void reduce (token_stack &token_stack_,
143         const macro_map &macromap_, node_ptr_vector &node_vector_ptr_,
144         tree_node_stack &tree_node_stack_)
145     {
146         typename tokeniser::num_token lhs_;
147         typename tokeniser::num_token rhs_;
148         token_stack handle_;
149         char action_ = 0;
150 
151         do
152         {
153             rhs_ = token_stack_.top ();
154             token_stack_.pop ();
155             handle_.push (rhs_);
156 
157             if (!token_stack_.empty ())
158             {
159                 lhs_ = token_stack_.top ();
160                 action_ = lhs_.precedence (rhs_._type);
161             }
162         } while (!token_stack_.empty () && action_ == '=');
163 
164         BOOST_ASSERT(token_stack_.empty () || action_ == '<');
165 
166         switch (rhs_._type)
167         {
168         case token::BEGIN:
169             // finished processing so exit
170             break;
171         case token::REGEX:
172             // finished parsing, nothing to do
173             break;
174         case token::OREXP:
175             orexp (handle_, token_stack_, node_vector_ptr_, tree_node_stack_);
176             break;
177         case token::SEQUENCE:
178             token_stack_.push (token::OREXP);
179             break;
180         case token::SUB:
181             sub (handle_, token_stack_, node_vector_ptr_, tree_node_stack_);
182             break;
183         case token::EXPRESSION:
184             token_stack_.push (token::SUB);
185             break;
186         case token::REPEAT:
187             repeat (handle_, token_stack_);
188             break;
189         case token::CHARSET:
190             charset (handle_, token_stack_, node_vector_ptr_,
191                 tree_node_stack_);
192             break;
193         case token::MACRO:
194             macro (handle_, token_stack_, macromap_, node_vector_ptr_,
195                 tree_node_stack_);
196             break;
197         case token::OPENPAREN:
198             openparen (handle_, token_stack_);
199             break;
200         case token::OPT:
201         case token::AOPT:
202             optional (rhs_._type == token::OPT, node_vector_ptr_,
203                 tree_node_stack_);
204             token_stack_.push (token::DUP);
205             break;
206         case token::ZEROORMORE:
207         case token::AZEROORMORE:
208             zero_or_more (rhs_._type == token::ZEROORMORE, node_vector_ptr_,
209                 tree_node_stack_);
210             token_stack_.push (token::DUP);
211             break;
212         case token::ONEORMORE:
213         case token::AONEORMORE:
214             one_or_more (rhs_._type == token::ONEORMORE, node_vector_ptr_,
215                 tree_node_stack_);
216             token_stack_.push (token::DUP);
217             break;
218         case token::REPEATN:
219         case token::AREPEATN:
220             repeatn (rhs_._type == token::REPEATN, handle_.top (),
221                 node_vector_ptr_, tree_node_stack_);
222             token_stack_.push (token::DUP);
223             break;
224         default:
225             throw runtime_error
226                 ("Internal error regex_parser::reduce");
227             break;
228         }
229     }
230 
orexp(token_stack & handle_,token_stack & token_stack_,node_ptr_vector & node_ptr_vector_,tree_node_stack & tree_node_stack_)231     static void orexp (token_stack &handle_, token_stack &token_stack_,
232         node_ptr_vector &node_ptr_vector_, tree_node_stack &tree_node_stack_)
233     {
234         BOOST_ASSERT(handle_.top ()._type == token::OREXP &&
235             (handle_.size () == 1 || handle_.size () == 3));
236 
237         if (handle_.size () == 1)
238         {
239             token_stack_.push (token::REGEX);
240         }
241         else
242         {
243             handle_.pop ();
244             BOOST_ASSERT(handle_.top ()._type == token::OR);
245             handle_.pop ();
246             BOOST_ASSERT(handle_.top ()._type == token::SEQUENCE);
247             perform_or (node_ptr_vector_, tree_node_stack_);
248             token_stack_.push (token::OREXP);
249         }
250     }
251 
sub(token_stack & handle_,token_stack & token_stack_,node_ptr_vector & node_ptr_vector_,tree_node_stack & tree_node_stack_)252     static void sub (token_stack &handle_, token_stack &token_stack_,
253         node_ptr_vector &node_ptr_vector_, tree_node_stack &tree_node_stack_)
254     {
255         BOOST_ASSERT(handle_.top ()._type == token::SUB &&
256             (handle_.size () == 1 || handle_.size () == 2));
257 
258         if (handle_.size () == 1)
259         {
260             token_stack_.push (token::SEQUENCE);
261         }
262         else
263         {
264             handle_.pop ();
265             BOOST_ASSERT(handle_.top ()._type == token::EXPRESSION);
266             // perform join
267             sequence (node_ptr_vector_, tree_node_stack_);
268             token_stack_.push (token::SUB);
269         }
270     }
271 
repeat(token_stack & handle_,token_stack & token_stack_)272     static void repeat (token_stack &handle_, token_stack &token_stack_)
273     {
274         BOOST_ASSERT(handle_.top ()._type == token::REPEAT &&
275             handle_.size () >= 1 && handle_.size () <= 3);
276 
277         if (handle_.size () == 1)
278         {
279             token_stack_.push (token::EXPRESSION);
280         }
281         else
282         {
283             handle_.pop ();
284             BOOST_ASSERT(handle_.top ()._type == token::DUP);
285             token_stack_.push (token::REPEAT);
286         }
287     }
288 
charset(token_stack & handle_,token_stack & token_stack_,node_ptr_vector & node_ptr_vector_,tree_node_stack & tree_node_stack_)289     static void charset (token_stack &handle_, token_stack &token_stack_,
290         node_ptr_vector &node_ptr_vector_, tree_node_stack &tree_node_stack_)
291     {
292         BOOST_ASSERT(handle_.top ()._type == token::CHARSET &&
293             handle_.size () == 1);
294         // store charset
295         node_ptr_vector_->push_back (static_cast<leaf_node *>(0));
296 
297         const size_t id_ = handle_.top ()._id;
298 
299         node_ptr_vector_->back () = new leaf_node (id_, true);
300         tree_node_stack_.push (node_ptr_vector_->back ());
301         token_stack_.push (token::REPEAT);
302     }
303 
macro(token_stack & handle_,token_stack & token_stack_,const macro_map & macromap_,node_ptr_vector & node_ptr_vector_,tree_node_stack & tree_node_stack_)304     static void macro (token_stack &handle_, token_stack &token_stack_,
305         const macro_map &macromap_, node_ptr_vector &node_ptr_vector_,
306         tree_node_stack &tree_node_stack_)
307     {
308         token &top_ = handle_.top ();
309 
310         BOOST_ASSERT(top_._type == token::MACRO && handle_.size () == 1);
311 
312         typename macro_map::const_iterator iter_ =
313             macromap_.find (top_._macro);
314 
315         if (iter_ == macromap_.end ())
316         {
317             const CharT *name_ = top_._macro;
318             std::basic_stringstream<CharT> ss_;
319             std::ostringstream os_;
320 
321             os_ << "Unknown MACRO name '";
322 
323             while (*name_)
324             {
325                 os_ << ss_.narrow (*name_++, ' ');
326             }
327 
328             os_ << "'.";
329             throw runtime_error (os_.str ());
330         }
331 
332         tree_node_stack_.push (iter_->second->copy (node_ptr_vector_));
333         token_stack_.push (token::REPEAT);
334     }
335 
openparen(token_stack & handle_,token_stack & token_stack_)336     static void openparen (token_stack &handle_, token_stack &token_stack_)
337     {
338         BOOST_ASSERT(handle_.top ()._type == token::OPENPAREN &&
339             handle_.size () == 3);
340         handle_.pop ();
341         BOOST_ASSERT(handle_.top ()._type == token::REGEX);
342         handle_.pop ();
343         BOOST_ASSERT(handle_.top ()._type == token::CLOSEPAREN);
344         token_stack_.push (token::REPEAT);
345     }
346 
perform_or(node_ptr_vector & node_ptr_vector_,tree_node_stack & tree_node_stack_)347     static void perform_or (node_ptr_vector &node_ptr_vector_,
348         tree_node_stack &tree_node_stack_)
349     {
350         // perform or
351         node *rhs_ = tree_node_stack_.top ();
352 
353         tree_node_stack_.pop ();
354 
355         node *lhs_ = tree_node_stack_.top ();
356 
357         node_ptr_vector_->push_back (static_cast<selection_node *>(0));
358         node_ptr_vector_->back () = new selection_node (lhs_, rhs_);
359         tree_node_stack_.top () = node_ptr_vector_->back ();
360     }
361 
sequence(node_ptr_vector & node_ptr_vector_,tree_node_stack & tree_node_stack_)362     static void sequence (node_ptr_vector &node_ptr_vector_,
363         tree_node_stack &tree_node_stack_)
364     {
365         node *rhs_ = tree_node_stack_.top ();
366 
367         tree_node_stack_.pop ();
368 
369         node *lhs_ = tree_node_stack_.top ();
370 
371         node_ptr_vector_->push_back (static_cast<sequence_node *>(0));
372         node_ptr_vector_->back () = new sequence_node (lhs_, rhs_);
373         tree_node_stack_.top () = node_ptr_vector_->back ();
374     }
375 
optional(const bool greedy_,node_ptr_vector & node_ptr_vector_,tree_node_stack & tree_node_stack_)376     static void optional (const bool greedy_,
377         node_ptr_vector &node_ptr_vector_, tree_node_stack &tree_node_stack_)
378     {
379         // perform ?
380         node *lhs_ = tree_node_stack_.top ();
381         // You don't know if lhs_ is a leaf_node, so get firstpos.
382         node::node_vector &firstpos_ = lhs_->firstpos ();
383 
384         for (node::node_vector::iterator iter_ = firstpos_.begin (),
385             end_ = firstpos_.end (); iter_ != end_; ++iter_)
386         {
387             // These are leaf_nodes!
388             (*iter_)->greedy (greedy_);
389         }
390 
391         node_ptr_vector_->push_back (static_cast<leaf_node *>(0));
392 
393         node *rhs_ = new leaf_node (null_token, greedy_);
394 
395         node_ptr_vector_->back () = rhs_;
396         node_ptr_vector_->push_back (static_cast<selection_node *>(0));
397         node_ptr_vector_->back () = new selection_node (lhs_, rhs_);
398         tree_node_stack_.top () = node_ptr_vector_->back ();
399     }
400 
zero_or_more(const bool greedy_,node_ptr_vector & node_ptr_vector_,tree_node_stack & tree_node_stack_)401     static void zero_or_more (const bool greedy_,
402         node_ptr_vector &node_ptr_vector_, tree_node_stack &tree_node_stack_)
403     {
404         // perform *
405         node *ptr_ = tree_node_stack_.top ();
406 
407         node_ptr_vector_->push_back (static_cast<iteration_node *>(0));
408         node_ptr_vector_->back () = new iteration_node (ptr_, greedy_);
409         tree_node_stack_.top () = node_ptr_vector_->back ();
410     }
411 
one_or_more(const bool greedy_,node_ptr_vector & node_ptr_vector_,tree_node_stack & tree_node_stack_)412     static void one_or_more (const bool greedy_,
413         node_ptr_vector &node_ptr_vector_, tree_node_stack &tree_node_stack_)
414     {
415         // perform +
416         node *lhs_ = tree_node_stack_.top ();
417         node *copy_ = lhs_->copy (node_ptr_vector_);
418 
419         node_ptr_vector_->push_back (static_cast<iteration_node *>(0));
420 
421         node *rhs_ = new iteration_node (copy_, greedy_);
422 
423         node_ptr_vector_->back () = rhs_;
424         node_ptr_vector_->push_back (static_cast<sequence_node *>(0));
425         node_ptr_vector_->back () = new sequence_node (lhs_, rhs_);
426         tree_node_stack_.top () = node_ptr_vector_->back ();
427     }
428 
429     // This is one of the most mind bending routines in this code...
repeatn(const bool greedy_,const token & token_,node_ptr_vector & node_ptr_vector_,tree_node_stack & tree_node_stack_)430     static void repeatn (const bool greedy_, const token &token_,
431         node_ptr_vector &node_ptr_vector_, tree_node_stack &tree_node_stack_)
432     {
433         // perform {n[,[m]]}
434         // Semantic checks have already been performed.
435         // {0,}  = *
436         // {0,1} = ?
437         // {1,}  = +
438         // therefore we do not check for these cases.
439         if (!(token_._min == 1 && !token_._comma))
440         {
441             const std::size_t top_ = token_._min > 0 ?
442                 token_._min : token_._max;
443 
444             if (token_._min == 0)
445             {
446                 optional (greedy_, node_ptr_vector_, tree_node_stack_);
447             }
448 
449             node *prev_ = tree_node_stack_.top ()->copy (node_ptr_vector_);
450             node *curr_ = 0;
451 
452             for (std::size_t i_ = 2; i_ < top_; ++i_)
453             {
454                 curr_ = prev_->copy (node_ptr_vector_);
455                 tree_node_stack_.push (static_cast<node *>(0));
456                 tree_node_stack_.top () = prev_;
457                 sequence (node_ptr_vector_, tree_node_stack_);
458                 prev_ = curr_;
459             }
460 
461             if (token_._comma && token_._min > 0)
462             {
463                 if (token_._min > 1)
464                 {
465                     curr_ = prev_->copy (node_ptr_vector_);
466                     tree_node_stack_.push (static_cast<node *>(0));
467                     tree_node_stack_.top () = prev_;
468                     sequence (node_ptr_vector_, tree_node_stack_);
469                     prev_ = curr_;
470                 }
471 
472                 if (token_._comma && token_._max)
473                 {
474                     tree_node_stack_.push (static_cast<node *>(0));
475                     tree_node_stack_.top () = prev_;
476                     optional (greedy_, node_ptr_vector_, tree_node_stack_);
477                     prev_ = tree_node_stack_.top ();
478                     tree_node_stack_.pop ();
479 
480                     const std::size_t count_ = token_._max - token_._min;
481 
482                     for (std::size_t i_ = 1; i_ < count_; ++i_)
483                     {
484                         curr_ = prev_->copy (node_ptr_vector_);
485                         tree_node_stack_.push (static_cast<node *>(0));
486                         tree_node_stack_.top () = prev_;
487                         sequence (node_ptr_vector_, tree_node_stack_);
488                         prev_ = curr_;
489                     }
490                 }
491                 else
492                 {
493                     tree_node_stack_.push (static_cast<node *>(0));
494                     tree_node_stack_.top () = prev_;
495                     zero_or_more (greedy_, node_ptr_vector_, tree_node_stack_);
496                     prev_ = tree_node_stack_.top ();
497                     tree_node_stack_.pop ();
498                 }
499             }
500 
501             tree_node_stack_.push (static_cast<node *>(0));
502             tree_node_stack_.top () = prev_;
503             sequence (node_ptr_vector_, tree_node_stack_);
504         }
505     }
506 };
507 }
508 }
509 }
510 
511 #endif
512