1 /*
2  * Copyright (C) 2009 Apple Inc. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  * 1. Redistributions of source code must retain the above copyright
8  *    notice, this list of conditions and the following disclaimer.
9  * 2. Redistributions in binary form must reproduce the above copyright
10  *    notice, this list of conditions and the following disclaimer in the
11  *    documentation and/or other materials provided with the distribution.
12  *
13  * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
14  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
17  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
18  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
19  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
20  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
21  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24  */
25 
26 #ifndef RegexPattern_h
27 #define RegexPattern_h
28 
29 #include <wtf/Platform.h>
30 
31 #if ENABLE(YARR)
32 
33 #include <wtf/Vector.h>
34 #include <wtf/unicode/Unicode.h>
35 
36 
37 namespace JSC { namespace Yarr {
38 
39 #define RegexStackSpaceForBackTrackInfoPatternCharacter 1 // Only for !fixed quantifiers.
40 #define RegexStackSpaceForBackTrackInfoCharacterClass 1 // Only for !fixed quantifiers.
41 #define RegexStackSpaceForBackTrackInfoBackReference 2
42 #define RegexStackSpaceForBackTrackInfoAlternative 1 // One per alternative.
43 #define RegexStackSpaceForBackTrackInfoParentheticalAssertion 1
44 #define RegexStackSpaceForBackTrackInfoParenthesesOnce 1 // Only for !fixed quantifiers.
45 #define RegexStackSpaceForBackTrackInfoParentheses 4
46 
47 struct PatternDisjunction;
48 
49 struct CharacterRange {
50     UChar begin;
51     UChar end;
52 
CharacterRangeCharacterRange53     CharacterRange(UChar begin, UChar end)
54         : begin(begin)
55         , end(end)
56     {
57     }
58 };
59 
60 struct CharacterClass : FastAllocBase {
61     Vector<UChar> m_matches;
62     Vector<CharacterRange> m_ranges;
63     Vector<UChar> m_matchesUnicode;
64     Vector<CharacterRange> m_rangesUnicode;
65 };
66 
67 enum QuantifierType {
68     QuantifierFixedCount,
69     QuantifierGreedy,
70     QuantifierNonGreedy,
71 };
72 
73 struct PatternTerm {
74     enum Type {
75         TypeAssertionBOL,
76         TypeAssertionEOL,
77         TypeAssertionWordBoundary,
78         TypePatternCharacter,
79         TypeCharacterClass,
80         TypeBackReference,
81         TypeForwardReference,
82         TypeParenthesesSubpattern,
83         TypeParentheticalAssertion,
84     } type;
85     bool invertOrCapture;
86     union {
87         UChar patternCharacter;
88         CharacterClass* characterClass;
89         unsigned subpatternId;
90         struct {
91             PatternDisjunction* disjunction;
92             unsigned subpatternId;
93             unsigned lastSubpatternId;
94             bool isCopy;
95         } parentheses;
96     };
97     QuantifierType quantityType;
98     unsigned quantityCount;
99     int inputPosition;
100     unsigned frameLocation;
101 
PatternTermPatternTerm102     PatternTerm(UChar ch)
103         : type(PatternTerm::TypePatternCharacter)
104     {
105         patternCharacter = ch;
106         quantityType = QuantifierFixedCount;
107         quantityCount = 1;
108     }
109 
PatternTermPatternTerm110     PatternTerm(CharacterClass* charClass, bool invert)
111         : type(PatternTerm::TypeCharacterClass)
112         , invertOrCapture(invert)
113     {
114         characterClass = charClass;
115         quantityType = QuantifierFixedCount;
116         quantityCount = 1;
117     }
118 
PatternTermPatternTerm119     PatternTerm(Type type, unsigned subpatternId, PatternDisjunction* disjunction, bool invertOrCapture)
120         : type(type)
121         , invertOrCapture(invertOrCapture)
122     {
123         parentheses.disjunction = disjunction;
124         parentheses.subpatternId = subpatternId;
125         parentheses.isCopy = false;
126         quantityType = QuantifierFixedCount;
127         quantityCount = 1;
128     }
129 
130     PatternTerm(Type type, bool invert = false)
typePatternTerm131         : type(type)
132         , invertOrCapture(invert)
133     {
134         quantityType = QuantifierFixedCount;
135         quantityCount = 1;
136     }
137 
PatternTermPatternTerm138     PatternTerm(unsigned spatternId)
139         : type(TypeBackReference)
140         , invertOrCapture(false)
141     {
142         subpatternId = spatternId;
143         quantityType = QuantifierFixedCount;
144         quantityCount = 1;
145     }
146 
ForwardReferencePatternTerm147     static PatternTerm ForwardReference()
148     {
149         return PatternTerm(TypeForwardReference);
150     }
151 
BOLPatternTerm152     static PatternTerm BOL()
153     {
154         return PatternTerm(TypeAssertionBOL);
155     }
156 
EOLPatternTerm157     static PatternTerm EOL()
158     {
159         return PatternTerm(TypeAssertionEOL);
160     }
161 
WordBoundaryPatternTerm162     static PatternTerm WordBoundary(bool invert)
163     {
164         return PatternTerm(TypeAssertionWordBoundary, invert);
165     }
166 
invertPatternTerm167     bool invert()
168     {
169         return invertOrCapture;
170     }
171 
capturePatternTerm172     bool capture()
173     {
174         return invertOrCapture;
175     }
176 
quantifyPatternTerm177     void quantify(unsigned count, QuantifierType type)
178     {
179         quantityCount = count;
180         quantityType = type;
181     }
182 };
183 
184 struct PatternAlternative : FastAllocBase {
PatternAlternativePatternAlternative185     PatternAlternative(PatternDisjunction* disjunction)
186         : m_parent(disjunction)
187     {
188     }
189 
lastTermPatternAlternative190     PatternTerm& lastTerm()
191     {
192         ASSERT(m_terms.size());
193         return m_terms[m_terms.size() - 1];
194     }
195 
removeLastTermPatternAlternative196     void removeLastTerm()
197     {
198         ASSERT(m_terms.size());
199         m_terms.shrink(m_terms.size() - 1);
200     }
201 
202     Vector<PatternTerm> m_terms;
203     PatternDisjunction* m_parent;
204     unsigned m_minimumSize;
205     bool m_hasFixedSize;
206 };
207 
208 struct PatternDisjunction : FastAllocBase {
209     PatternDisjunction(PatternAlternative* parent = 0)
m_parentPatternDisjunction210         : m_parent(parent)
211     {
212     }
213 
~PatternDisjunctionPatternDisjunction214     ~PatternDisjunction()
215     {
216         deleteAllValues(m_alternatives);
217     }
218 
addNewAlternativePatternDisjunction219     PatternAlternative* addNewAlternative()
220     {
221         PatternAlternative* alternative = new PatternAlternative(this);
222         m_alternatives.append(alternative);
223         return alternative;
224     }
225 
226     Vector<PatternAlternative*> m_alternatives;
227     PatternAlternative* m_parent;
228     unsigned m_minimumSize;
229     unsigned m_callFrameSize;
230     bool m_hasFixedSize;
231 };
232 
233 // You probably don't want to be calling these functions directly
234 // (please to be calling newlineCharacterClass() et al on your
235 // friendly neighborhood RegexPattern instance to get nicely
236 // cached copies).
237 CharacterClass* newlineCreate();
238 CharacterClass* digitsCreate();
239 CharacterClass* spacesCreate();
240 CharacterClass* wordcharCreate();
241 CharacterClass* nondigitsCreate();
242 CharacterClass* nonspacesCreate();
243 CharacterClass* nonwordcharCreate();
244 
245 struct RegexPattern {
RegexPatternRegexPattern246     RegexPattern(bool ignoreCase, bool multiline)
247         : m_ignoreCase(ignoreCase)
248         , m_multiline(multiline)
249         , m_numSubpatterns(0)
250         , m_maxBackReference(0)
251         , newlineCached(0)
252         , digitsCached(0)
253         , spacesCached(0)
254         , wordcharCached(0)
255         , nondigitsCached(0)
256         , nonspacesCached(0)
257         , nonwordcharCached(0)
258     {
259     }
260 
~RegexPatternRegexPattern261     ~RegexPattern()
262     {
263         deleteAllValues(m_disjunctions);
264         deleteAllValues(m_userCharacterClasses);
265     }
266 
resetRegexPattern267     void reset()
268     {
269         m_numSubpatterns = 0;
270         m_maxBackReference = 0;
271 
272         newlineCached = 0;
273         digitsCached = 0;
274         spacesCached = 0;
275         wordcharCached = 0;
276         nondigitsCached = 0;
277         nonspacesCached = 0;
278         nonwordcharCached = 0;
279 
280         deleteAllValues(m_disjunctions);
281         m_disjunctions.clear();
282         deleteAllValues(m_userCharacterClasses);
283         m_userCharacterClasses.clear();
284     }
285 
containsIllegalBackReferenceRegexPattern286     bool containsIllegalBackReference()
287     {
288         return m_maxBackReference > m_numSubpatterns;
289     }
290 
newlineCharacterClassRegexPattern291     CharacterClass* newlineCharacterClass()
292     {
293         if (!newlineCached)
294             m_userCharacterClasses.append(newlineCached = newlineCreate());
295         return newlineCached;
296     }
digitsCharacterClassRegexPattern297     CharacterClass* digitsCharacterClass()
298     {
299         if (!digitsCached)
300             m_userCharacterClasses.append(digitsCached = digitsCreate());
301         return digitsCached;
302     }
spacesCharacterClassRegexPattern303     CharacterClass* spacesCharacterClass()
304     {
305         if (!spacesCached)
306             m_userCharacterClasses.append(spacesCached = spacesCreate());
307         return spacesCached;
308     }
wordcharCharacterClassRegexPattern309     CharacterClass* wordcharCharacterClass()
310     {
311         if (!wordcharCached)
312             m_userCharacterClasses.append(wordcharCached = wordcharCreate());
313         return wordcharCached;
314     }
nondigitsCharacterClassRegexPattern315     CharacterClass* nondigitsCharacterClass()
316     {
317         if (!nondigitsCached)
318             m_userCharacterClasses.append(nondigitsCached = nondigitsCreate());
319         return nondigitsCached;
320     }
nonspacesCharacterClassRegexPattern321     CharacterClass* nonspacesCharacterClass()
322     {
323         if (!nonspacesCached)
324             m_userCharacterClasses.append(nonspacesCached = nonspacesCreate());
325         return nonspacesCached;
326     }
nonwordcharCharacterClassRegexPattern327     CharacterClass* nonwordcharCharacterClass()
328     {
329         if (!nonwordcharCached)
330             m_userCharacterClasses.append(nonwordcharCached = nonwordcharCreate());
331         return nonwordcharCached;
332     }
333 
334     bool m_ignoreCase;
335     bool m_multiline;
336     unsigned m_numSubpatterns;
337     unsigned m_maxBackReference;
338     PatternDisjunction* m_body;
339     Vector<PatternDisjunction*, 4> m_disjunctions;
340     Vector<CharacterClass*> m_userCharacterClasses;
341 
342 private:
343     CharacterClass* newlineCached;
344     CharacterClass* digitsCached;
345     CharacterClass* spacesCached;
346     CharacterClass* wordcharCached;
347     CharacterClass* nondigitsCached;
348     CharacterClass* nonspacesCached;
349     CharacterClass* nonwordcharCached;
350 };
351 
352 } } // namespace JSC::Yarr
353 
354 #endif
355 
356 #endif // RegexPattern_h
357