1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*-
2  * vim: set ts=8 sts=4 et sw=4 tw=99: */
3 
4 // Copyright 2012 the V8 project authors. All rights reserved.
5 // Redistribution and use in source and binary forms, with or without
6 // modification, are permitted provided that the following conditions are
7 // met:
8 //
9 //     * Redistributions of source code must retain the above copyright
10 //       notice, this list of conditions and the following disclaimer.
11 //     * Redistributions in binary form must reproduce the above
12 //       copyright notice, this list of conditions and the following
13 //       disclaimer in the documentation and/or other materials provided
14 //       with the distribution.
15 //     * Neither the name of Google Inc. nor the names of its
16 //       contributors may be used to endorse or promote products derived
17 //       from this software without specific prior written permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 
31 #ifndef V8_REGEXP_AST_H_
32 #define V8_REGEXP_AST_H_
33 
34 // Prevent msvc build failures as indicated in bug 1205328
35 #ifdef min
36 # undef min
37 #endif
38 #ifdef max
39 # undef max
40 #endif
41 
42 #include "irregexp/RegExpEngine.h"
43 
44 namespace js {
45 namespace irregexp {
46 
47 class RegExpCompiler;
48 class RegExpNode;
49 
50 class RegExpVisitor
51 {
52   public:
~RegExpVisitor()53     virtual ~RegExpVisitor() { }
54 #define MAKE_CASE(Name)                                         \
55     virtual void* Visit##Name(RegExp##Name*, void* data) = 0;
56     FOR_EACH_REG_EXP_TREE_TYPE(MAKE_CASE)
57 #undef MAKE_CASE
58 };
59 
60 class RegExpTree
61 {
62   public:
63     static const int kInfinity = INT32_MAX;
~RegExpTree()64     virtual ~RegExpTree() {}
65     virtual RegExpNode* ToNode(RegExpCompiler* compiler,
66                                RegExpNode* on_success) = 0;
IsTextElement()67     virtual bool IsTextElement() { return false; }
IsAnchoredAtStart()68     virtual bool IsAnchoredAtStart() { return false; }
IsAnchoredAtEnd()69     virtual bool IsAnchoredAtEnd() { return false; }
70     virtual int min_match() = 0;
71     virtual int max_match() = 0;
72     // Returns the interval of registers used for captures within this
73     // expression.
CaptureRegisters()74     virtual Interval CaptureRegisters() { return Interval::Empty(); }
AppendToText(RegExpText * text)75     virtual void AppendToText(RegExpText* text) {
76         MOZ_CRASH("Bad call");
77     }
78 #define MAKE_ASTYPE(Name)                                               \
79     virtual RegExp##Name* As##Name();                                   \
80     virtual bool Is##Name();
81     FOR_EACH_REG_EXP_TREE_TYPE(MAKE_ASTYPE)
82 #undef MAKE_ASTYPE
83 };
84 
85 typedef Vector<RegExpTree*, 1, LifoAllocPolicy<Infallible> > RegExpTreeVector;
86 
87 class RegExpDisjunction : public RegExpTree
88 {
89   public:
90     explicit RegExpDisjunction(RegExpTreeVector* alternatives);
91     virtual void* Accept(RegExpVisitor* visitor, void* data);
92     virtual RegExpNode* ToNode(RegExpCompiler* compiler,
93                                RegExpNode* on_success);
94     virtual RegExpDisjunction* AsDisjunction();
95     virtual Interval CaptureRegisters();
96     virtual bool IsDisjunction();
97     virtual bool IsAnchoredAtStart();
98     virtual bool IsAnchoredAtEnd();
min_match()99     virtual int min_match() { return min_match_; }
max_match()100     virtual int max_match() { return max_match_; }
101 
alternatives()102     const RegExpTreeVector& alternatives() { return *alternatives_; }
103 
104   private:
105     RegExpTreeVector* alternatives_;
106     int min_match_;
107     int max_match_;
108 };
109 
110 class RegExpAlternative : public RegExpTree
111 {
112   public:
113     explicit RegExpAlternative(RegExpTreeVector* nodes);
114     virtual void* Accept(RegExpVisitor* visitor, void* data);
115     virtual RegExpNode* ToNode(RegExpCompiler* compiler,
116                                RegExpNode* on_success);
117     virtual RegExpAlternative* AsAlternative();
118     virtual Interval CaptureRegisters();
119     virtual bool IsAlternative();
120     virtual bool IsAnchoredAtStart();
121     virtual bool IsAnchoredAtEnd();
min_match()122     virtual int min_match() { return min_match_; }
max_match()123     virtual int max_match() { return max_match_; }
124 
nodes()125     const RegExpTreeVector& nodes() { return *nodes_; }
126 
127   private:
128     RegExpTreeVector* nodes_;
129     int min_match_;
130     int max_match_;
131 };
132 
133 class RegExpAssertion : public RegExpTree {
134  public:
135   enum AssertionType {
136     START_OF_LINE,
137     START_OF_INPUT,
138     END_OF_LINE,
139     END_OF_INPUT,
140     BOUNDARY,
141     NON_BOUNDARY
142   };
RegExpAssertion(AssertionType type)143   explicit RegExpAssertion(AssertionType type) : assertion_type_(type) { }
144   virtual void* Accept(RegExpVisitor* visitor, void* data);
145   virtual RegExpNode* ToNode(RegExpCompiler* compiler,
146                              RegExpNode* on_success);
147   virtual RegExpAssertion* AsAssertion();
148   virtual bool IsAssertion();
149   virtual bool IsAnchoredAtStart();
150   virtual bool IsAnchoredAtEnd();
min_match()151   virtual int min_match() { return 0; }
max_match()152   virtual int max_match() { return 0; }
assertion_type()153   AssertionType assertion_type() { return assertion_type_; }
154  private:
155   AssertionType assertion_type_;
156 };
157 
158 class CharacterSet
159 {
160   public:
CharacterSet(char16_t standard_set_type)161     explicit CharacterSet(char16_t standard_set_type)
162       : ranges_(nullptr),
163         standard_set_type_(standard_set_type)
164     {}
CharacterSet(CharacterRangeVector * ranges)165     explicit CharacterSet(CharacterRangeVector* ranges)
166       : ranges_(ranges),
167         standard_set_type_(0)
168     {}
169 
170     CharacterRangeVector& ranges(LifoAlloc* alloc);
standard_set_type()171     char16_t standard_set_type() { return standard_set_type_; }
set_standard_set_type(char16_t special_set_type)172     void set_standard_set_type(char16_t special_set_type) {
173         standard_set_type_ = special_set_type;
174     }
is_standard()175     bool is_standard() { return standard_set_type_ != 0; }
176     void Canonicalize();
177 
178   private:
179     CharacterRangeVector* ranges_;
180 
181     // If non-zero, the value represents a standard set (e.g., all whitespace
182     // characters) without having to expand the ranges.
183     char16_t standard_set_type_;
184 };
185 
186 class RegExpCharacterClass : public RegExpTree
187 {
188   public:
RegExpCharacterClass(CharacterRangeVector * ranges,bool is_negated)189     RegExpCharacterClass(CharacterRangeVector* ranges, bool is_negated)
190       : set_(ranges),
191         is_negated_(is_negated)
192     {}
193 
RegExpCharacterClass(char16_t type)194     explicit RegExpCharacterClass(char16_t type)
195       : set_(type),
196         is_negated_(false)
197     {}
198 
199     virtual void* Accept(RegExpVisitor* visitor, void* data);
200     virtual RegExpNode* ToNode(RegExpCompiler* compiler,
201                                RegExpNode* on_success);
202     virtual RegExpCharacterClass* AsCharacterClass();
203     virtual bool IsCharacterClass();
IsTextElement()204     virtual bool IsTextElement() { return true; }
min_match()205     virtual int min_match() { return 1; }
max_match()206     virtual int max_match() { return 1; }
207     virtual void AppendToText(RegExpText* text);
208 
character_set()209     CharacterSet character_set() { return set_; }
210 
211     // TODO(lrn): Remove need for complex version if is_standard that
212     // recognizes a mangled standard set and just do { return set_.is_special(); }
213     bool is_standard(LifoAlloc* alloc);
214 
215     // Returns a value representing the standard character set if is_standard()
216     // returns true.
217     // Currently used values are:
218     // s : unicode whitespace
219     // S : unicode non-whitespace
220     // w : ASCII word character (digit, letter, underscore)
221     // W : non-ASCII word character
222     // d : ASCII digit
223     // D : non-ASCII digit
224     // . : non-unicode non-newline
225     // * : All characters
standard_type()226     char16_t standard_type() { return set_.standard_set_type(); }
227 
ranges(LifoAlloc * alloc)228     CharacterRangeVector& ranges(LifoAlloc* alloc) { return set_.ranges(alloc); }
is_negated()229     bool is_negated() { return is_negated_; }
230 
231   private:
232     CharacterSet set_;
233     bool is_negated_;
234 };
235 
236 typedef Vector<char16_t, 10, LifoAllocPolicy<Infallible> > CharacterVector;
237 
238 class RegExpAtom : public RegExpTree
239 {
240   public:
RegExpAtom(CharacterVector * data)241     explicit RegExpAtom(CharacterVector* data)
242       : data_(data)
243     {}
244 
245     virtual void* Accept(RegExpVisitor* visitor, void* data);
246     virtual RegExpNode* ToNode(RegExpCompiler* compiler,
247                                RegExpNode* on_success);
248     virtual RegExpAtom* AsAtom();
249     virtual bool IsAtom();
IsTextElement()250     virtual bool IsTextElement() { return true; }
min_match()251     virtual int min_match() { return data_->length(); }
max_match()252     virtual int max_match() { return data_->length(); }
253     virtual void AppendToText(RegExpText* text);
254 
data()255     const CharacterVector& data() { return *data_; }
length()256     int length() { return data_->length(); }
257 
258   private:
259     CharacterVector* data_;
260 };
261 
262 class RegExpText : public RegExpTree
263 {
264   public:
RegExpText(LifoAlloc * alloc)265     explicit RegExpText(LifoAlloc* alloc)
266       : elements_(*alloc), length_(0)
267     {}
268 
269     virtual void* Accept(RegExpVisitor* visitor, void* data);
270     virtual RegExpNode* ToNode(RegExpCompiler* compiler,
271                                RegExpNode* on_success);
272     virtual RegExpText* AsText();
273     virtual bool IsText();
IsTextElement()274     virtual bool IsTextElement() { return true; }
min_match()275     virtual int min_match() { return length_; }
max_match()276     virtual int max_match() { return length_; }
277     virtual void AppendToText(RegExpText* text);
278 
AddElement(TextElement elm)279     void AddElement(TextElement elm)  {
280         elements_.append(elm);
281         length_ += elm.length();
282     }
elements()283     const TextElementVector& elements() { return elements_; }
284 
285   private:
286     TextElementVector elements_;
287     int length_;
288 };
289 
290 class RegExpQuantifier : public RegExpTree
291 {
292   public:
293     enum QuantifierType { GREEDY, NON_GREEDY, POSSESSIVE };
RegExpQuantifier(int min,int max,QuantifierType type,RegExpTree * body)294     RegExpQuantifier(int min, int max, QuantifierType type, RegExpTree* body)
295       : body_(body),
296         min_(min),
297         max_(max),
298         min_match_(min * body->min_match()),
299         quantifier_type_(type)
300     {
301         if (max > 0 && body->max_match() > kInfinity / max) {
302             max_match_ = kInfinity;
303         } else {
304             max_match_ = max * body->max_match();
305         }
306     }
307 
308     virtual void* Accept(RegExpVisitor* visitor, void* data);
309     virtual RegExpNode* ToNode(RegExpCompiler* compiler,
310                                RegExpNode* on_success);
311     static RegExpNode* ToNode(int min,
312                               int max,
313                               bool is_greedy,
314                               RegExpTree* body,
315                               RegExpCompiler* compiler,
316                               RegExpNode* on_success,
317                               bool not_at_start = false);
318     virtual RegExpQuantifier* AsQuantifier();
319     virtual Interval CaptureRegisters();
320     virtual bool IsQuantifier();
min_match()321     virtual int min_match() { return min_match_; }
max_match()322     virtual int max_match() { return max_match_; }
min()323     int min() { return min_; }
max()324     int max() { return max_; }
is_possessive()325     bool is_possessive() { return quantifier_type_ == POSSESSIVE; }
is_non_greedy()326     bool is_non_greedy() { return quantifier_type_ == NON_GREEDY; }
is_greedy()327     bool is_greedy() { return quantifier_type_ == GREEDY; }
body()328     RegExpTree* body() { return body_; }
329 
330   private:
331     RegExpTree* body_;
332     int min_;
333     int max_;
334     int min_match_;
335     int max_match_;
336     QuantifierType quantifier_type_;
337 };
338 
339 class RegExpCapture : public RegExpTree
340 {
341   public:
RegExpCapture(RegExpTree * body,int index)342     explicit RegExpCapture(RegExpTree* body, int index)
343       : body_(body), index_(index)
344     {}
345 
346     virtual void* Accept(RegExpVisitor* visitor, void* data);
347     virtual RegExpNode* ToNode(RegExpCompiler* compiler,
348                                RegExpNode* on_success);
349     static RegExpNode* ToNode(RegExpTree* body,
350                               int index,
351                               RegExpCompiler* compiler,
352                               RegExpNode* on_success);
353     virtual RegExpCapture* AsCapture();
354     virtual bool IsAnchoredAtStart();
355     virtual bool IsAnchoredAtEnd();
356     virtual Interval CaptureRegisters();
357     virtual bool IsCapture();
min_match()358     virtual int min_match() { return body_->min_match(); }
max_match()359     virtual int max_match() { return body_->max_match(); }
body()360     RegExpTree* body() { return body_; }
index()361     int index() { return index_; }
StartRegister(int index)362     static int StartRegister(int index) { return index * 2; }
EndRegister(int index)363     static int EndRegister(int index) { return index * 2 + 1; }
364 
365   private:
366     RegExpTree* body_;
367     int index_;
368 };
369 
370 class RegExpLookahead : public RegExpTree
371 {
372   public:
RegExpLookahead(RegExpTree * body,bool is_positive,int capture_count,int capture_from)373     RegExpLookahead(RegExpTree* body,
374                     bool is_positive,
375                     int capture_count,
376                     int capture_from)
377       : body_(body),
378         is_positive_(is_positive),
379         capture_count_(capture_count),
380         capture_from_(capture_from)
381     {}
382 
383     virtual void* Accept(RegExpVisitor* visitor, void* data);
384     virtual RegExpNode* ToNode(RegExpCompiler* compiler,
385                                RegExpNode* on_success);
386     virtual RegExpLookahead* AsLookahead();
387     virtual Interval CaptureRegisters();
388     virtual bool IsLookahead();
389     virtual bool IsAnchoredAtStart();
min_match()390     virtual int min_match() { return 0; }
max_match()391     virtual int max_match() { return 0; }
body()392     RegExpTree* body() { return body_; }
is_positive()393     bool is_positive() { return is_positive_; }
capture_count()394     int capture_count() { return capture_count_; }
capture_from()395     int capture_from() { return capture_from_; }
396 
397   private:
398     RegExpTree* body_;
399     bool is_positive_;
400     int capture_count_;
401     int capture_from_;
402 };
403 
404 typedef Vector<RegExpCapture*, 1, LifoAllocPolicy<Infallible> > RegExpCaptureVector;
405 
406 class RegExpBackReference : public RegExpTree
407 {
408   public:
RegExpBackReference(RegExpCapture * capture)409     explicit RegExpBackReference(RegExpCapture* capture)
410       : capture_(capture)
411     {}
412 
413     virtual void* Accept(RegExpVisitor* visitor, void* data);
414     virtual RegExpNode* ToNode(RegExpCompiler* compiler,
415                                RegExpNode* on_success);
416     virtual RegExpBackReference* AsBackReference();
417     virtual bool IsBackReference();
min_match()418     virtual int min_match() { return 0; }
max_match()419     virtual int max_match() { return capture_->max_match(); }
index()420     int index() { return capture_->index(); }
capture()421     RegExpCapture* capture() { return capture_; }
422   private:
423     RegExpCapture* capture_;
424 };
425 
426 class RegExpEmpty : public RegExpTree
427 {
428   public:
RegExpEmpty()429     RegExpEmpty()
430     {}
431 
432     virtual void* Accept(RegExpVisitor* visitor, void* data);
433     virtual RegExpNode* ToNode(RegExpCompiler* compiler,
434                                RegExpNode* on_success);
435     virtual RegExpEmpty* AsEmpty();
436     virtual bool IsEmpty();
min_match()437     virtual int min_match() { return 0; }
max_match()438     virtual int max_match() { return 0; }
GetInstance()439     static RegExpEmpty* GetInstance() {
440         static RegExpEmpty instance;
441         return &instance;
442     }
443 };
444 
445 } } // namespace js::irregexp
446 
447 #endif  // V8_REGEXP_AST_H_
448