1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 //
4 //  file:  regexcmp.cpp
5 //
6 //  Copyright (C) 2002-2016 International Business Machines Corporation and others.
7 //  All Rights Reserved.
8 //
9 //  This file contains the ICU regular expression compiler, which is responsible
10 //  for processing a regular expression pattern into the compiled form that
11 //  is used by the match finding engine.
12 //
13 
14 #include "unicode/utypes.h"
15 
16 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
17 
18 #include "unicode/ustring.h"
19 #include "unicode/unistr.h"
20 #include "unicode/uniset.h"
21 #include "unicode/uchar.h"
22 #include "unicode/uchriter.h"
23 #include "unicode/parsepos.h"
24 #include "unicode/parseerr.h"
25 #include "unicode/regex.h"
26 #include "unicode/utf.h"
27 #include "unicode/utf16.h"
28 #include "patternprops.h"
29 #include "putilimp.h"
30 #include "cmemory.h"
31 #include "cstr.h"
32 #include "cstring.h"
33 #include "uvectr32.h"
34 #include "uvectr64.h"
35 #include "uassert.h"
36 #include "uinvchar.h"
37 
38 #include "regeximp.h"
39 #include "regexcst.h"   // Contains state table for the regex pattern parser.
40                         //   generated by a Perl script.
41 #include "regexcmp.h"
42 #include "regexst.h"
43 #include "regextxt.h"
44 
45 
46 
47 U_NAMESPACE_BEGIN
48 
49 
50 //------------------------------------------------------------------------------
51 //
52 //  Constructor.
53 //
54 //------------------------------------------------------------------------------
RegexCompile(RegexPattern * rxp,UErrorCode & status)55 RegexCompile::RegexCompile(RegexPattern *rxp, UErrorCode &status) :
56    fParenStack(status), fSetStack(status), fSetOpStack(status)
57 {
58     // Lazy init of all shared global sets (needed for init()'s empty text)
59     RegexStaticSets::initGlobals(&status);
60 
61     fStatus           = &status;
62 
63     fRXPat            = rxp;
64     fScanIndex        = 0;
65     fLastChar         = -1;
66     fPeekChar         = -1;
67     fLineNum          = 1;
68     fCharNum          = 0;
69     fQuoteMode        = FALSE;
70     fInBackslashQuote = FALSE;
71     fModeFlags        = fRXPat->fFlags | 0x80000000;
72     fEOLComments      = TRUE;
73 
74     fMatchOpenParen   = -1;
75     fMatchCloseParen  = -1;
76     fCaptureName      = NULL;
77     fLastSetLiteral   = U_SENTINEL;
78 
79     if (U_SUCCESS(status) && U_FAILURE(rxp->fDeferredStatus)) {
80         status = rxp->fDeferredStatus;
81     }
82 }
83 
84 static const UChar      chAmp       = 0x26;      // '&'
85 static const UChar      chDash      = 0x2d;      // '-'
86 
87 
88 //------------------------------------------------------------------------------
89 //
90 //  Destructor
91 //
92 //------------------------------------------------------------------------------
~RegexCompile()93 RegexCompile::~RegexCompile() {
94     delete fCaptureName;         // Normally will be NULL, but can exist if pattern
95                                  //   compilation stops with a syntax error.
96 }
97 
addCategory(UnicodeSet * set,int32_t value,UErrorCode & ec)98 static inline void addCategory(UnicodeSet *set, int32_t value, UErrorCode& ec) {
99     set->addAll(UnicodeSet().applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, value, ec));
100 }
101 
102 //------------------------------------------------------------------------------
103 //
104 //  Compile regex pattern.   The state machine for rexexp pattern parsing is here.
105 //                           The state tables are hand-written in the file regexcst.txt,
106 //                           and converted to the form used here by a perl
107 //                           script regexcst.pl
108 //
109 //------------------------------------------------------------------------------
compile(const UnicodeString & pat,UParseError & pp,UErrorCode & e)110 void    RegexCompile::compile(
111                          const UnicodeString &pat,   // Source pat to be compiled.
112                          UParseError &pp,            // Error position info
113                          UErrorCode &e)              // Error Code
114 {
115     fRXPat->fPatternString = new UnicodeString(pat);
116     UText patternText = UTEXT_INITIALIZER;
117     utext_openConstUnicodeString(&patternText, fRXPat->fPatternString, &e);
118 
119     if (U_SUCCESS(e)) {
120         compile(&patternText, pp, e);
121         utext_close(&patternText);
122     }
123 }
124 
125 //
126 //   compile, UText mode
127 //     All the work is actually done here.
128 //
compile(UText * pat,UParseError & pp,UErrorCode & e)129 void    RegexCompile::compile(
130                          UText *pat,                 // Source pat to be compiled.
131                          UParseError &pp,            // Error position info
132                          UErrorCode &e)              // Error Code
133 {
134     fStatus             = &e;
135     fParseErr           = &pp;
136     fStackPtr           = 0;
137     fStack[fStackPtr]   = 0;
138 
139     if (U_FAILURE(*fStatus)) {
140         return;
141     }
142 
143     // There should be no pattern stuff in the RegexPattern object.  They can not be reused.
144     U_ASSERT(fRXPat->fPattern == NULL || utext_nativeLength(fRXPat->fPattern) == 0);
145 
146     // Prepare the RegexPattern object to receive the compiled pattern.
147     fRXPat->fPattern        = utext_clone(fRXPat->fPattern, pat, FALSE, TRUE, fStatus);
148     if (U_FAILURE(*fStatus)) {
149         return;
150     }
151     fRXPat->fStaticSets     = RegexStaticSets::gStaticSets->fPropSets;
152     fRXPat->fStaticSets8    = RegexStaticSets::gStaticSets->fPropSets8;
153 
154 
155     // Initialize the pattern scanning state machine
156     fPatternLength = utext_nativeLength(pat);
157     uint16_t                state = 1;
158     const RegexTableEl      *tableEl;
159 
160     // UREGEX_LITERAL force entire pattern to be treated as a literal string.
161     if (fModeFlags & UREGEX_LITERAL) {
162         fQuoteMode = TRUE;
163     }
164 
165     nextChar(fC);                        // Fetch the first char from the pattern string.
166 
167     //
168     // Main loop for the regex pattern parsing state machine.
169     //   Runs once per state transition.
170     //   Each time through optionally performs, depending on the state table,
171     //      - an advance to the the next pattern char
172     //      - an action to be performed.
173     //      - pushing or popping a state to/from the local state return stack.
174     //   file regexcst.txt is the source for the state table.  The logic behind
175     //     recongizing the pattern syntax is there, not here.
176     //
177     for (;;) {
178         //  Bail out if anything has gone wrong.
179         //  Regex pattern parsing stops on the first error encountered.
180         if (U_FAILURE(*fStatus)) {
181             break;
182         }
183 
184         U_ASSERT(state != 0);
185 
186         // Find the state table element that matches the input char from the pattern, or the
187         //    class of the input character.  Start with the first table row for this
188         //    state, then linearly scan forward until we find a row that matches the
189         //    character.  The last row for each state always matches all characters, so
190         //    the search will stop there, if not before.
191         //
192         tableEl = &gRuleParseStateTable[state];
193         REGEX_SCAN_DEBUG_PRINTF(("char, line, col = (\'%c\', %d, %d)    state=%s ",
194             fC.fChar, fLineNum, fCharNum, RegexStateNames[state]));
195 
196         for (;;) {    // loop through table rows belonging to this state, looking for one
197                       //   that matches the current input char.
198             REGEX_SCAN_DEBUG_PRINTF(("."));
199             if (tableEl->fCharClass < 127 && fC.fQuoted == FALSE &&   tableEl->fCharClass == fC.fChar) {
200                 // Table row specified an individual character, not a set, and
201                 //   the input character is not quoted, and
202                 //   the input character matched it.
203                 break;
204             }
205             if (tableEl->fCharClass == 255) {
206                 // Table row specified default, match anything character class.
207                 break;
208             }
209             if (tableEl->fCharClass == 254 && fC.fQuoted)  {
210                 // Table row specified "quoted" and the char was quoted.
211                 break;
212             }
213             if (tableEl->fCharClass == 253 && fC.fChar == (UChar32)-1)  {
214                 // Table row specified eof and we hit eof on the input.
215                 break;
216             }
217 
218             if (tableEl->fCharClass >= 128 && tableEl->fCharClass < 240 &&   // Table specs a char class &&
219                 fC.fQuoted == FALSE &&                                       //   char is not escaped &&
220                 fC.fChar != (UChar32)-1) {                                   //   char is not EOF
221                 U_ASSERT(tableEl->fCharClass <= 137);
222                 if (RegexStaticSets::gStaticSets->fRuleSets[tableEl->fCharClass-128].contains(fC.fChar)) {
223                     // Table row specified a character class, or set of characters,
224                     //   and the current char matches it.
225                     break;
226                 }
227             }
228 
229             // No match on this row, advance to the next  row for this state,
230             tableEl++;
231         }
232         REGEX_SCAN_DEBUG_PRINTF(("\n"));
233 
234         //
235         // We've found the row of the state table that matches the current input
236         //   character from the rules string.
237         // Perform any action specified  by this row in the state table.
238         if (doParseActions(tableEl->fAction) == FALSE) {
239             // Break out of the state machine loop if the
240             //   the action signalled some kind of error, or
241             //   the action was to exit, occurs on normal end-of-rules-input.
242             break;
243         }
244 
245         if (tableEl->fPushState != 0) {
246             fStackPtr++;
247             if (fStackPtr >= kStackSize) {
248                 error(U_REGEX_INTERNAL_ERROR);
249                 REGEX_SCAN_DEBUG_PRINTF(("RegexCompile::parse() - state stack overflow.\n"));
250                 fStackPtr--;
251             }
252             fStack[fStackPtr] = tableEl->fPushState;
253         }
254 
255         //
256         //  NextChar.  This is where characters are actually fetched from the pattern.
257         //             Happens under control of the 'n' tag in the state table.
258         //
259         if (tableEl->fNextChar) {
260             nextChar(fC);
261         }
262 
263         // Get the next state from the table entry, or from the
264         //   state stack if the next state was specified as "pop".
265         if (tableEl->fNextState != 255) {
266             state = tableEl->fNextState;
267         } else {
268             state = fStack[fStackPtr];
269             fStackPtr--;
270             if (fStackPtr < 0) {
271                 // state stack underflow
272                 // This will occur if the user pattern has mis-matched parentheses,
273                 //   with extra close parens.
274                 //
275                 fStackPtr++;
276                 error(U_REGEX_MISMATCHED_PAREN);
277             }
278         }
279 
280     }
281 
282     if (U_FAILURE(*fStatus)) {
283         // Bail out if the pattern had errors.
284         //   Set stack cleanup:  a successful compile would have left it empty,
285         //   but errors can leave temporary sets hanging around.
286         while (!fSetStack.empty()) {
287             delete (UnicodeSet *)fSetStack.pop();
288         }
289         return;
290     }
291 
292     //
293     // The pattern has now been read and processed, and the compiled code generated.
294     //
295 
296     //
297     // The pattern's fFrameSize so far has accumulated the requirements for
298     //   storage for capture parentheses, counters, etc. that are encountered
299     //   in the pattern.  Add space for the two variables that are always
300     //   present in the saved state:  the input string position (int64_t) and
301     //   the position in the compiled pattern.
302     //
303     allocateStackData(RESTACKFRAME_HDRCOUNT);
304 
305     //
306     // Optimization pass 1: NOPs, back-references, and case-folding
307     //
308     stripNOPs();
309 
310     //
311     // Get bounds for the minimum and maximum length of a string that this
312     //   pattern can match.  Used to avoid looking for matches in strings that
313     //   are too short.
314     //
315     fRXPat->fMinMatchLen = minMatchLength(3, fRXPat->fCompiledPat->size()-1);
316 
317     //
318     // Optimization pass 2: match start type
319     //
320     matchStartType();
321 
322     //
323     // Set up fast latin-1 range sets
324     //
325     int32_t numSets = fRXPat->fSets->size();
326     fRXPat->fSets8 = new Regex8BitSet[numSets];
327     // Null pointer check.
328     if (fRXPat->fSets8 == NULL) {
329         e = *fStatus = U_MEMORY_ALLOCATION_ERROR;
330         return;
331     }
332     int32_t i;
333     for (i=0; i<numSets; i++) {
334         UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(i);
335         fRXPat->fSets8[i].init(s);
336     }
337 
338 }
339 
340 
341 
342 
343 
344 //------------------------------------------------------------------------------
345 //
346 //  doParseAction        Do some action during regex pattern parsing.
347 //                       Called by the parse state machine.
348 //
349 //                       Generation of the match engine PCode happens here, or
350 //                       in functions called from the parse actions defined here.
351 //
352 //
353 //------------------------------------------------------------------------------
doParseActions(int32_t action)354 UBool RegexCompile::doParseActions(int32_t action)
355 {
356     UBool   returnVal = TRUE;
357 
358     switch ((Regex_PatternParseAction)action) {
359 
360     case doPatStart:
361         // Start of pattern compiles to:
362         //0   SAVE   2        Fall back to position of FAIL
363         //1   jmp    3
364         //2   FAIL            Stop if we ever reach here.
365         //3   NOP             Dummy, so start of pattern looks the same as
366         //                    the start of an ( grouping.
367         //4   NOP             Resreved, will be replaced by a save if there are
368         //                    OR | operators at the top level
369         appendOp(URX_STATE_SAVE, 2);
370         appendOp(URX_JMP,  3);
371         appendOp(URX_FAIL, 0);
372 
373         // Standard open nonCapture paren action emits the two NOPs and
374         //   sets up the paren stack frame.
375         doParseActions(doOpenNonCaptureParen);
376         break;
377 
378     case doPatFinish:
379         // We've scanned to the end of the pattern
380         //  The end of pattern compiles to:
381         //        URX_END
382         //    which will stop the runtime match engine.
383         //  Encountering end of pattern also behaves like a close paren,
384         //   and forces fixups of the State Save at the beginning of the compiled pattern
385         //   and of any OR operations at the top level.
386         //
387         handleCloseParen();
388         if (fParenStack.size() > 0) {
389             // Missing close paren in pattern.
390             error(U_REGEX_MISMATCHED_PAREN);
391         }
392 
393         // add the END operation to the compiled pattern.
394         appendOp(URX_END, 0);
395 
396         // Terminate the pattern compilation state machine.
397         returnVal = FALSE;
398         break;
399 
400 
401 
402     case doOrOperator:
403         // Scanning a '|', as in (A|B)
404         {
405             // Generate code for any pending literals preceding the '|'
406             fixLiterals(FALSE);
407 
408             // Insert a SAVE operation at the start of the pattern section preceding
409             //   this OR at this level.  This SAVE will branch the match forward
410             //   to the right hand side of the OR in the event that the left hand
411             //   side fails to match and backtracks.  Locate the position for the
412             //   save from the location on the top of the parentheses stack.
413             int32_t savePosition = fParenStack.popi();
414             int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(savePosition);
415             U_ASSERT(URX_TYPE(op) == URX_NOP);  // original contents of reserved location
416             op = buildOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+1);
417             fRXPat->fCompiledPat->setElementAt(op, savePosition);
418 
419             // Append an JMP operation into the compiled pattern.  The operand for
420             //  the JMP will eventually be the location following the ')' for the
421             //  group.  This will be patched in later, when the ')' is encountered.
422             appendOp(URX_JMP, 0);
423 
424             // Push the position of the newly added JMP op onto the parentheses stack.
425             // This registers if for fixup when this block's close paren is encountered.
426             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);
427 
428             // Append a NOP to the compiled pattern.  This is the slot reserved
429             //   for a SAVE in the event that there is yet another '|' following
430             //   this one.
431             appendOp(URX_NOP, 0);
432             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);
433         }
434         break;
435 
436 
437     case doBeginNamedCapture:
438         // Scanning (?<letter.
439         //   The first letter of the name will come through again under doConinueNamedCapture.
440         fCaptureName = new UnicodeString();
441         if (fCaptureName == NULL) {
442             error(U_MEMORY_ALLOCATION_ERROR);
443         }
444         break;
445 
446     case  doContinueNamedCapture:
447         fCaptureName->append(fC.fChar);
448         break;
449 
450     case doBadNamedCapture:
451         error(U_REGEX_INVALID_CAPTURE_GROUP_NAME);
452         break;
453 
454     case doOpenCaptureParen:
455         // Open Capturing Paren, possibly named.
456         //   Compile to a
457         //      - NOP, which later may be replaced by a save-state if the
458         //         parenthesized group gets a * quantifier, followed by
459         //      - START_CAPTURE  n    where n is stack frame offset to the capture group variables.
460         //      - NOP, which may later be replaced by a save-state if there
461         //             is an '|' alternation within the parens.
462         //
463         //    Each capture group gets three slots in the save stack frame:
464         //         0: Capture Group start position (in input string being matched.)
465         //         1: Capture Group end position.
466         //         2: Start of Match-in-progress.
467         //    The first two locations are for a completed capture group, and are
468         //     referred to by back references and the like.
469         //    The third location stores the capture start position when an START_CAPTURE is
470         //      encountered.  This will be promoted to a completed capture when (and if) the corresponding
471         //      END_CAPTURE is encountered.
472         {
473             fixLiterals();
474             appendOp(URX_NOP, 0);
475             int32_t  varsLoc = allocateStackData(3);    // Reserve three slots in match stack frame.
476             appendOp(URX_START_CAPTURE, varsLoc);
477             appendOp(URX_NOP, 0);
478 
479             // On the Parentheses stack, start a new frame and add the postions
480             //   of the two NOPs.  Depending on what follows in the pattern, the
481             //   NOPs may be changed to SAVE_STATE or JMP ops, with a target
482             //   address of the end of the parenthesized group.
483             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
484             fParenStack.push(capturing, *fStatus);                        // Frame type.
485             fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus);   // The first  NOP location
486             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP loc
487 
488             // Save the mapping from group number to stack frame variable position.
489             fRXPat->fGroupMap->addElement(varsLoc, *fStatus);
490 
491             // If this is a named capture group, add the name->group number mapping.
492             if (fCaptureName != NULL) {
493                 int32_t groupNumber = fRXPat->fGroupMap->size();
494                 int32_t previousMapping = uhash_puti(fRXPat->fNamedCaptureMap, fCaptureName, groupNumber, fStatus);
495                 fCaptureName = NULL;    // hash table takes ownership of the name (key) string.
496                 if (previousMapping > 0 && U_SUCCESS(*fStatus)) {
497                     error(U_REGEX_INVALID_CAPTURE_GROUP_NAME);
498                 }
499             }
500         }
501         break;
502 
503     case doOpenNonCaptureParen:
504         // Open non-caputuring (grouping only) Paren.
505         //   Compile to a
506         //      - NOP, which later may be replaced by a save-state if the
507         //         parenthesized group gets a * quantifier, followed by
508         //      - NOP, which may later be replaced by a save-state if there
509         //             is an '|' alternation within the parens.
510         {
511             fixLiterals();
512             appendOp(URX_NOP, 0);
513             appendOp(URX_NOP, 0);
514 
515             // On the Parentheses stack, start a new frame and add the postions
516             //   of the two NOPs.
517             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
518             fParenStack.push(plain,      *fStatus);                       // Begin a new frame.
519             fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first  NOP location
520             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP loc
521         }
522          break;
523 
524 
525     case doOpenAtomicParen:
526         // Open Atomic Paren.  (?>
527         //   Compile to a
528         //      - NOP, which later may be replaced if the parenthesized group
529         //         has a quantifier, followed by
530         //      - STO_SP  save state stack position, so it can be restored at the ")"
531         //      - NOP, which may later be replaced by a save-state if there
532         //             is an '|' alternation within the parens.
533         {
534             fixLiterals();
535             appendOp(URX_NOP, 0);
536             int32_t  varLoc = allocateData(1);    // Reserve a data location for saving the state stack ptr.
537             appendOp(URX_STO_SP, varLoc);
538             appendOp(URX_NOP, 0);
539 
540             // On the Parentheses stack, start a new frame and add the postions
541             //   of the two NOPs.  Depending on what follows in the pattern, the
542             //   NOPs may be changed to SAVE_STATE or JMP ops, with a target
543             //   address of the end of the parenthesized group.
544             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
545             fParenStack.push(atomic, *fStatus);                           // Frame type.
546             fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus);   // The first NOP
547             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP
548         }
549         break;
550 
551 
552     case doOpenLookAhead:
553         // Positive Look-ahead   (?=  stuff  )
554         //
555         //   Note:   Addition of transparent input regions, with the need to
556         //           restore the original regions when failing out of a lookahead
557         //           block, complicated this sequence.  Some conbined opcodes
558         //           might make sense - or might not, lookahead aren't that common.
559         //
560         //      Caution:  min match length optimization knows about this
561         //               sequence; don't change without making updates there too.
562         //
563         // Compiles to
564         //    1    START_LA     dataLoc     Saves SP, Input Pos
565         //    2.   STATE_SAVE   4            on failure of lookahead, goto 4
566         //    3    JMP          6           continue ...
567         //
568         //    4.   LA_END                   Look Ahead failed.  Restore regions.
569         //    5.   BACKTRACK                and back track again.
570         //
571         //    6.   NOP              reserved for use by quantifiers on the block.
572         //                          Look-ahead can't have quantifiers, but paren stack
573         //                             compile time conventions require the slot anyhow.
574         //    7.   NOP              may be replaced if there is are '|' ops in the block.
575         //    8.     code for parenthesized stuff.
576         //    9.   LA_END
577         //
578         //  Two data slots are reserved, for saving the stack ptr and the input position.
579         {
580             fixLiterals();
581             int32_t dataLoc = allocateData(2);
582             appendOp(URX_LA_START, dataLoc);
583             appendOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+ 2);
584             appendOp(URX_JMP, fRXPat->fCompiledPat->size()+ 3);
585             appendOp(URX_LA_END, dataLoc);
586             appendOp(URX_BACKTRACK, 0);
587             appendOp(URX_NOP, 0);
588             appendOp(URX_NOP, 0);
589 
590             // On the Parentheses stack, start a new frame and add the postions
591             //   of the NOPs.
592             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
593             fParenStack.push(lookAhead, *fStatus);                        // Frame type.
594             fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first  NOP location
595             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP location
596         }
597         break;
598 
599     case doOpenLookAheadNeg:
600         // Negated Lookahead.   (?! stuff )
601         // Compiles to
602         //    1.    START_LA    dataloc
603         //    2.    SAVE_STATE  7         // Fail within look-ahead block restores to this state,
604         //                                //   which continues with the match.
605         //    3.    NOP                   // Std. Open Paren sequence, for possible '|'
606         //    4.       code for parenthesized stuff.
607         //    5.    END_LA                // Cut back stack, remove saved state from step 2.
608         //    6.    BACKTRACK             // code in block succeeded, so neg. lookahead fails.
609         //    7.    END_LA                // Restore match region, in case look-ahead was using
610         //                                        an alternate (transparent) region.
611         {
612             fixLiterals();
613             int32_t dataLoc = allocateData(2);
614             appendOp(URX_LA_START, dataLoc);
615             appendOp(URX_STATE_SAVE, 0);    // dest address will be patched later.
616             appendOp(URX_NOP, 0);
617 
618             // On the Parentheses stack, start a new frame and add the postions
619             //   of the StateSave and NOP.
620             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
621             fParenStack.push(negLookAhead, *fStatus);                    // Frame type
622             fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The STATE_SAVE location
623             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP location
624 
625             // Instructions #5 - #7 will be added when the ')' is encountered.
626         }
627         break;
628 
629     case doOpenLookBehind:
630         {
631             //   Compile a (?<= look-behind open paren.
632             //
633             //          Compiles to
634             //              0       URX_LB_START     dataLoc
635             //              1       URX_LB_CONT      dataLoc
636             //              2                        MinMatchLen
637             //              3                        MaxMatchLen
638             //              4       URX_NOP          Standard '(' boilerplate.
639             //              5       URX_NOP          Reserved slot for use with '|' ops within (block).
640             //              6         <code for LookBehind expression>
641             //              7       URX_LB_END       dataLoc    # Check match len, restore input  len
642             //              8       URX_LA_END       dataLoc    # Restore stack, input pos
643             //
644             //          Allocate a block of matcher data, to contain (when running a match)
645             //              0:    Stack ptr on entry
646             //              1:    Input Index on entry
647             //              2:    Start index of match current match attempt.
648             //              3:    Original Input String len.
649 
650             // Generate match code for any pending literals.
651             fixLiterals();
652 
653             // Allocate data space
654             int32_t dataLoc = allocateData(4);
655 
656             // Emit URX_LB_START
657             appendOp(URX_LB_START, dataLoc);
658 
659             // Emit URX_LB_CONT
660             appendOp(URX_LB_CONT, dataLoc);
661             appendOp(URX_RESERVED_OP, 0);    // MinMatchLength.  To be filled later.
662             appendOp(URX_RESERVED_OP, 0);    // MaxMatchLength.  To be filled later.
663 
664             // Emit the NOPs
665             appendOp(URX_NOP, 0);
666             appendOp(URX_NOP, 0);
667 
668             // On the Parentheses stack, start a new frame and add the postions
669             //   of the URX_LB_CONT and the NOP.
670             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
671             fParenStack.push(lookBehind, *fStatus);                       // Frame type
672             fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first NOP location
673             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The 2nd   NOP location
674 
675             // The final two instructions will be added when the ')' is encountered.
676         }
677 
678         break;
679 
680     case doOpenLookBehindNeg:
681         {
682             //   Compile a (?<! negated look-behind open paren.
683             //
684             //          Compiles to
685             //              0       URX_LB_START     dataLoc    # Save entry stack, input len
686             //              1       URX_LBN_CONT     dataLoc    # Iterate possible match positions
687             //              2                        MinMatchLen
688             //              3                        MaxMatchLen
689             //              4                        continueLoc (9)
690             //              5       URX_NOP          Standard '(' boilerplate.
691             //              6       URX_NOP          Reserved slot for use with '|' ops within (block).
692             //              7         <code for LookBehind expression>
693             //              8       URX_LBN_END      dataLoc    # Check match len, cause a FAIL
694             //              9       ...
695             //
696             //          Allocate a block of matcher data, to contain (when running a match)
697             //              0:    Stack ptr on entry
698             //              1:    Input Index on entry
699             //              2:    Start index of match current match attempt.
700             //              3:    Original Input String len.
701 
702             // Generate match code for any pending literals.
703             fixLiterals();
704 
705             // Allocate data space
706             int32_t dataLoc = allocateData(4);
707 
708             // Emit URX_LB_START
709             appendOp(URX_LB_START, dataLoc);
710 
711             // Emit URX_LBN_CONT
712             appendOp(URX_LBN_CONT, dataLoc);
713             appendOp(URX_RESERVED_OP, 0);    // MinMatchLength.  To be filled later.
714             appendOp(URX_RESERVED_OP, 0);    // MaxMatchLength.  To be filled later.
715             appendOp(URX_RESERVED_OP, 0);    // Continue Loc.    To be filled later.
716 
717             // Emit the NOPs
718             appendOp(URX_NOP, 0);
719             appendOp(URX_NOP, 0);
720 
721             // On the Parentheses stack, start a new frame and add the postions
722             //   of the URX_LB_CONT and the NOP.
723             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
724             fParenStack.push(lookBehindN, *fStatus);                      // Frame type
725             fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first NOP location
726             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The 2nd   NOP location
727 
728             // The final two instructions will be added when the ')' is encountered.
729         }
730         break;
731 
732     case doConditionalExpr:
733         // Conditionals such as (?(1)a:b)
734     case doPerlInline:
735         // Perl inline-condtionals.  (?{perl code}a|b) We're not perl, no way to do them.
736         error(U_REGEX_UNIMPLEMENTED);
737         break;
738 
739 
740     case doCloseParen:
741         handleCloseParen();
742         if (fParenStack.size() <= 0) {
743             //  Extra close paren, or missing open paren.
744             error(U_REGEX_MISMATCHED_PAREN);
745         }
746         break;
747 
748     case doNOP:
749         break;
750 
751 
752     case doBadOpenParenType:
753     case doRuleError:
754         error(U_REGEX_RULE_SYNTAX);
755         break;
756 
757 
758     case doMismatchedParenErr:
759         error(U_REGEX_MISMATCHED_PAREN);
760         break;
761 
762     case doPlus:
763         //  Normal '+'  compiles to
764         //     1.   stuff to be repeated  (already built)
765         //     2.   jmp-sav 1
766         //     3.   ...
767         //
768         //  Or, if the item to be repeated can match a zero length string,
769         //     1.   STO_INP_LOC  data-loc
770         //     2.      body of stuff to be repeated
771         //     3.   JMP_SAV_X    2
772         //     4.   ...
773 
774         //
775         //  Or, if the item to be repeated is simple
776         //     1.   Item to be repeated.
777         //     2.   LOOP_SR_I    set number  (assuming repeated item is a set ref)
778         //     3.   LOOP_C       stack location
779         {
780             int32_t  topLoc = blockTopLoc(FALSE);        // location of item #1
781             int32_t  frameLoc;
782 
783             // Check for simple constructs, which may get special optimized code.
784             if (topLoc == fRXPat->fCompiledPat->size() - 1) {
785                 int32_t repeatedOp = (int32_t)fRXPat->fCompiledPat->elementAti(topLoc);
786 
787                 if (URX_TYPE(repeatedOp) == URX_SETREF) {
788                     // Emit optimized code for [char set]+
789                     appendOp(URX_LOOP_SR_I, URX_VAL(repeatedOp));
790                     frameLoc = allocateStackData(1);
791                     appendOp(URX_LOOP_C, frameLoc);
792                     break;
793                 }
794 
795                 if (URX_TYPE(repeatedOp) == URX_DOTANY ||
796                     URX_TYPE(repeatedOp) == URX_DOTANY_ALL ||
797                     URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) {
798                     // Emit Optimized code for .+ operations.
799                     int32_t loopOpI = buildOp(URX_LOOP_DOT_I, 0);
800                     if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) {
801                         // URX_LOOP_DOT_I operand is a flag indicating ". matches any" mode.
802                         loopOpI |= 1;
803                     }
804                     if (fModeFlags & UREGEX_UNIX_LINES) {
805                         loopOpI |= 2;
806                     }
807                     appendOp(loopOpI);
808                     frameLoc = allocateStackData(1);
809                     appendOp(URX_LOOP_C, frameLoc);
810                     break;
811                 }
812 
813             }
814 
815             // General case.
816 
817             // Check for minimum match length of zero, which requires
818             //    extra loop-breaking code.
819             if (minMatchLength(topLoc, fRXPat->fCompiledPat->size()-1) == 0) {
820                 // Zero length match is possible.
821                 // Emit the code sequence that can handle it.
822                 insertOp(topLoc);
823                 frameLoc = allocateStackData(1);
824 
825                 int32_t op = buildOp(URX_STO_INP_LOC, frameLoc);
826                 fRXPat->fCompiledPat->setElementAt(op, topLoc);
827 
828                 appendOp(URX_JMP_SAV_X, topLoc+1);
829             } else {
830                 // Simpler code when the repeated body must match something non-empty
831                 appendOp(URX_JMP_SAV, topLoc);
832             }
833         }
834         break;
835 
836     case doNGPlus:
837         //  Non-greedy '+?'  compiles to
838         //     1.   stuff to be repeated  (already built)
839         //     2.   state-save  1
840         //     3.   ...
841         {
842             int32_t topLoc      = blockTopLoc(FALSE);
843             appendOp(URX_STATE_SAVE, topLoc);
844         }
845         break;
846 
847 
848     case doOpt:
849         // Normal (greedy) ? quantifier.
850         //  Compiles to
851         //     1. state save 3
852         //     2.    body of optional block
853         //     3. ...
854         // Insert the state save into the compiled pattern, and we're done.
855         {
856             int32_t   saveStateLoc = blockTopLoc(TRUE);
857             int32_t   saveStateOp  = buildOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size());
858             fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc);
859         }
860         break;
861 
862     case doNGOpt:
863         // Non-greedy ?? quantifier
864         //   compiles to
865         //    1.  jmp   4
866         //    2.     body of optional block
867         //    3   jmp   5
868         //    4.  state save 2
869         //    5    ...
870         //  This code is less than ideal, with two jmps instead of one, because we can only
871         //  insert one instruction at the top of the block being iterated.
872         {
873             int32_t  jmp1_loc = blockTopLoc(TRUE);
874             int32_t  jmp2_loc = fRXPat->fCompiledPat->size();
875 
876             int32_t  jmp1_op  = buildOp(URX_JMP, jmp2_loc+1);
877             fRXPat->fCompiledPat->setElementAt(jmp1_op, jmp1_loc);
878 
879             appendOp(URX_JMP, jmp2_loc+2);
880 
881             appendOp(URX_STATE_SAVE, jmp1_loc+1);
882         }
883         break;
884 
885 
886     case doStar:
887         // Normal (greedy) * quantifier.
888         // Compiles to
889         //       1.   STATE_SAVE   4
890         //       2.      body of stuff being iterated over
891         //       3.   JMP_SAV      2
892         //       4.   ...
893         //
894         // Or, if the body is a simple [Set],
895         //       1.   LOOP_SR_I    set number
896         //       2.   LOOP_C       stack location
897         //       ...
898         //
899         // Or if this is a .*
900         //       1.   LOOP_DOT_I    (. matches all mode flag)
901         //       2.   LOOP_C        stack location
902         //
903         // Or, if the body can match a zero-length string, to inhibit infinite loops,
904         //       1.   STATE_SAVE   5
905         //       2.   STO_INP_LOC  data-loc
906         //       3.      body of stuff
907         //       4.   JMP_SAV_X    2
908         //       5.   ...
909         {
910             // location of item #1, the STATE_SAVE
911             int32_t   topLoc = blockTopLoc(FALSE);
912             int32_t   dataLoc = -1;
913 
914             // Check for simple *, where the construct being repeated
915             //   compiled to single opcode, and might be optimizable.
916             if (topLoc == fRXPat->fCompiledPat->size() - 1) {
917                 int32_t repeatedOp = (int32_t)fRXPat->fCompiledPat->elementAti(topLoc);
918 
919                 if (URX_TYPE(repeatedOp) == URX_SETREF) {
920                     // Emit optimized code for a [char set]*
921                     int32_t loopOpI = buildOp(URX_LOOP_SR_I, URX_VAL(repeatedOp));
922                     fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc);
923                     dataLoc = allocateStackData(1);
924                     appendOp(URX_LOOP_C, dataLoc);
925                     break;
926                 }
927 
928                 if (URX_TYPE(repeatedOp) == URX_DOTANY ||
929                     URX_TYPE(repeatedOp) == URX_DOTANY_ALL ||
930                     URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) {
931                     // Emit Optimized code for .* operations.
932                     int32_t loopOpI = buildOp(URX_LOOP_DOT_I, 0);
933                     if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) {
934                         // URX_LOOP_DOT_I operand is a flag indicating . matches any mode.
935                         loopOpI |= 1;
936                     }
937                     if ((fModeFlags & UREGEX_UNIX_LINES) != 0) {
938                         loopOpI |= 2;
939                     }
940                     fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc);
941                     dataLoc = allocateStackData(1);
942                     appendOp(URX_LOOP_C, dataLoc);
943                     break;
944                 }
945             }
946 
947             // Emit general case code for this *
948             // The optimizations did not apply.
949 
950             int32_t   saveStateLoc = blockTopLoc(TRUE);
951             int32_t   jmpOp        = buildOp(URX_JMP_SAV, saveStateLoc+1);
952 
953             // Check for minimum match length of zero, which requires
954             //    extra loop-breaking code.
955             if (minMatchLength(saveStateLoc, fRXPat->fCompiledPat->size()-1) == 0) {
956                 insertOp(saveStateLoc);
957                 dataLoc = allocateStackData(1);
958 
959                 int32_t op = buildOp(URX_STO_INP_LOC, dataLoc);
960                 fRXPat->fCompiledPat->setElementAt(op, saveStateLoc+1);
961                 jmpOp      = buildOp(URX_JMP_SAV_X, saveStateLoc+2);
962             }
963 
964             // Locate the position in the compiled pattern where the match will continue
965             //   after completing the *.   (4 or 5 in the comment above)
966             int32_t continueLoc = fRXPat->fCompiledPat->size()+1;
967 
968             // Put together the save state op and store it into the compiled code.
969             int32_t saveStateOp = buildOp(URX_STATE_SAVE, continueLoc);
970             fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc);
971 
972             // Append the URX_JMP_SAV or URX_JMPX operation to the compiled pattern.
973             appendOp(jmpOp);
974         }
975         break;
976 
977     case doNGStar:
978         // Non-greedy *? quantifier
979         // compiles to
980         //     1.   JMP    3
981         //     2.      body of stuff being iterated over
982         //     3.   STATE_SAVE  2
983         //     4    ...
984         {
985             int32_t     jmpLoc  = blockTopLoc(TRUE);                   // loc  1.
986             int32_t     saveLoc = fRXPat->fCompiledPat->size();        // loc  3.
987             int32_t     jmpOp   = buildOp(URX_JMP, saveLoc);
988             fRXPat->fCompiledPat->setElementAt(jmpOp, jmpLoc);
989             appendOp(URX_STATE_SAVE, jmpLoc+1);
990         }
991         break;
992 
993 
994     case doIntervalInit:
995         // The '{' opening an interval quantifier was just scanned.
996         // Init the counter varaiables that will accumulate the values as the digits
997         //    are scanned.
998         fIntervalLow = 0;
999         fIntervalUpper = -1;
1000         break;
1001 
1002     case doIntevalLowerDigit:
1003         // Scanned a digit from the lower value of an {lower,upper} interval
1004         {
1005             int32_t digitValue = u_charDigitValue(fC.fChar);
1006             U_ASSERT(digitValue >= 0);
1007             int64_t val = (int64_t)fIntervalLow*10 + digitValue;
1008             if (val > INT32_MAX) {
1009                 error(U_REGEX_NUMBER_TOO_BIG);
1010             } else {
1011                 fIntervalLow = (int32_t)val;
1012             }
1013         }
1014         break;
1015 
1016     case doIntervalUpperDigit:
1017         // Scanned a digit from the upper value of an {lower,upper} interval
1018         {
1019             if (fIntervalUpper < 0) {
1020                 fIntervalUpper = 0;
1021             }
1022             int32_t digitValue = u_charDigitValue(fC.fChar);
1023             U_ASSERT(digitValue >= 0);
1024             int64_t val = (int64_t)fIntervalUpper*10 + digitValue;
1025             if (val > INT32_MAX) {
1026                 error(U_REGEX_NUMBER_TOO_BIG);
1027             } else {
1028                 fIntervalUpper = (int32_t)val;
1029             }
1030         }
1031         break;
1032 
1033     case doIntervalSame:
1034         // Scanned a single value interval like {27}.  Upper = Lower.
1035         fIntervalUpper = fIntervalLow;
1036         break;
1037 
1038     case doInterval:
1039         // Finished scanning a normal {lower,upper} interval.  Generate the code for it.
1040         if (compileInlineInterval() == FALSE) {
1041             compileInterval(URX_CTR_INIT, URX_CTR_LOOP);
1042         }
1043         break;
1044 
1045     case doPossessiveInterval:
1046         // Finished scanning a Possessive {lower,upper}+ interval.  Generate the code for it.
1047         {
1048             // Remember the loc for the top of the block being looped over.
1049             //   (Can not reserve a slot in the compiled pattern at this time, because
1050             //    compileInterval needs to reserve also, and blockTopLoc can only reserve
1051             //    once per block.)
1052             int32_t topLoc = blockTopLoc(FALSE);
1053 
1054             // Produce normal looping code.
1055             compileInterval(URX_CTR_INIT, URX_CTR_LOOP);
1056 
1057             // Surround the just-emitted normal looping code with a STO_SP ... LD_SP
1058             //  just as if the loop was inclosed in atomic parentheses.
1059 
1060             // First the STO_SP before the start of the loop
1061             insertOp(topLoc);
1062 
1063             int32_t  varLoc = allocateData(1);   // Reserve a data location for saving the
1064             int32_t  op     = buildOp(URX_STO_SP, varLoc);
1065             fRXPat->fCompiledPat->setElementAt(op, topLoc);
1066 
1067             int32_t loopOp = (int32_t)fRXPat->fCompiledPat->popi();
1068             U_ASSERT(URX_TYPE(loopOp) == URX_CTR_LOOP && URX_VAL(loopOp) == topLoc);
1069             loopOp++;     // point LoopOp after the just-inserted STO_SP
1070             fRXPat->fCompiledPat->push(loopOp, *fStatus);
1071 
1072             // Then the LD_SP after the end of the loop
1073             appendOp(URX_LD_SP, varLoc);
1074         }
1075 
1076         break;
1077 
1078     case doNGInterval:
1079         // Finished scanning a non-greedy {lower,upper}? interval.  Generate the code for it.
1080         compileInterval(URX_CTR_INIT_NG, URX_CTR_LOOP_NG);
1081         break;
1082 
1083     case doIntervalError:
1084         error(U_REGEX_BAD_INTERVAL);
1085         break;
1086 
1087     case doLiteralChar:
1088         // We've just scanned a "normal" character from the pattern,
1089         literalChar(fC.fChar);
1090         break;
1091 
1092 
1093     case doEscapedLiteralChar:
1094         // We've just scanned an backslashed escaped character with  no
1095         //   special meaning.  It represents itself.
1096         if ((fModeFlags & UREGEX_ERROR_ON_UNKNOWN_ESCAPES) != 0 &&
1097             ((fC.fChar >= 0x41 && fC.fChar<= 0x5A) ||     // in [A-Z]
1098             (fC.fChar >= 0x61 && fC.fChar <= 0x7a))) {   // in [a-z]
1099                error(U_REGEX_BAD_ESCAPE_SEQUENCE);
1100              }
1101         literalChar(fC.fChar);
1102         break;
1103 
1104 
1105     case doDotAny:
1106         // scanned a ".",  match any single character.
1107         {
1108             fixLiterals(FALSE);
1109             if (fModeFlags & UREGEX_DOTALL) {
1110                 appendOp(URX_DOTANY_ALL, 0);
1111             } else if (fModeFlags & UREGEX_UNIX_LINES) {
1112                 appendOp(URX_DOTANY_UNIX, 0);
1113             } else {
1114                 appendOp(URX_DOTANY, 0);
1115             }
1116         }
1117         break;
1118 
1119     case doCaret:
1120         {
1121             fixLiterals(FALSE);
1122             if (       (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
1123                 appendOp(URX_CARET, 0);
1124             } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
1125                 appendOp(URX_CARET_M, 0);
1126             } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
1127                 appendOp(URX_CARET, 0);   // Only testing true start of input.
1128             } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
1129                 appendOp(URX_CARET_M_UNIX, 0);
1130             }
1131         }
1132         break;
1133 
1134     case doDollar:
1135         {
1136             fixLiterals(FALSE);
1137             if (       (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
1138                 appendOp(URX_DOLLAR, 0);
1139             } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
1140                 appendOp(URX_DOLLAR_M, 0);
1141             } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
1142                 appendOp(URX_DOLLAR_D, 0);
1143             } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
1144                 appendOp(URX_DOLLAR_MD, 0);
1145             }
1146         }
1147         break;
1148 
1149     case doBackslashA:
1150         fixLiterals(FALSE);
1151         appendOp(URX_CARET, 0);
1152         break;
1153 
1154     case doBackslashB:
1155         {
1156             #if  UCONFIG_NO_BREAK_ITERATION==1
1157             if (fModeFlags & UREGEX_UWORD) {
1158                 error(U_UNSUPPORTED_ERROR);
1159             }
1160             #endif
1161             fixLiterals(FALSE);
1162             int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BACKSLASH_B;
1163             appendOp(op, 1);
1164         }
1165         break;
1166 
1167     case doBackslashb:
1168         {
1169             #if  UCONFIG_NO_BREAK_ITERATION==1
1170             if (fModeFlags & UREGEX_UWORD) {
1171                 error(U_UNSUPPORTED_ERROR);
1172             }
1173             #endif
1174             fixLiterals(FALSE);
1175             int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BACKSLASH_B;
1176             appendOp(op, 0);
1177         }
1178         break;
1179 
1180     case doBackslashD:
1181         fixLiterals(FALSE);
1182         appendOp(URX_BACKSLASH_D, 1);
1183         break;
1184 
1185     case doBackslashd:
1186         fixLiterals(FALSE);
1187         appendOp(URX_BACKSLASH_D, 0);
1188         break;
1189 
1190     case doBackslashG:
1191         fixLiterals(FALSE);
1192         appendOp(URX_BACKSLASH_G, 0);
1193         break;
1194 
1195     case doBackslashH:
1196         fixLiterals(FALSE);
1197         appendOp(URX_BACKSLASH_H, 1);
1198         break;
1199 
1200     case doBackslashh:
1201         fixLiterals(FALSE);
1202         appendOp(URX_BACKSLASH_H, 0);
1203         break;
1204 
1205     case doBackslashR:
1206         fixLiterals(FALSE);
1207         appendOp(URX_BACKSLASH_R, 0);
1208         break;
1209 
1210     case doBackslashS:
1211         fixLiterals(FALSE);
1212         appendOp(URX_STAT_SETREF_N, URX_ISSPACE_SET);
1213         break;
1214 
1215     case doBackslashs:
1216         fixLiterals(FALSE);
1217         appendOp(URX_STATIC_SETREF, URX_ISSPACE_SET);
1218         break;
1219 
1220     case doBackslashV:
1221         fixLiterals(FALSE);
1222         appendOp(URX_BACKSLASH_V, 1);
1223         break;
1224 
1225     case doBackslashv:
1226         fixLiterals(FALSE);
1227         appendOp(URX_BACKSLASH_V, 0);
1228         break;
1229 
1230     case doBackslashW:
1231         fixLiterals(FALSE);
1232         appendOp(URX_STAT_SETREF_N, URX_ISWORD_SET);
1233         break;
1234 
1235     case doBackslashw:
1236         fixLiterals(FALSE);
1237         appendOp(URX_STATIC_SETREF, URX_ISWORD_SET);
1238         break;
1239 
1240     case doBackslashX:
1241         fixLiterals(FALSE);
1242         appendOp(URX_BACKSLASH_X, 0);
1243         break;
1244 
1245 
1246     case doBackslashZ:
1247         fixLiterals(FALSE);
1248         appendOp(URX_DOLLAR, 0);
1249         break;
1250 
1251     case doBackslashz:
1252         fixLiterals(FALSE);
1253         appendOp(URX_BACKSLASH_Z, 0);
1254         break;
1255 
1256     case doEscapeError:
1257         error(U_REGEX_BAD_ESCAPE_SEQUENCE);
1258         break;
1259 
1260     case doExit:
1261         fixLiterals(FALSE);
1262         returnVal = FALSE;
1263         break;
1264 
1265     case doProperty:
1266         {
1267             fixLiterals(FALSE);
1268             UnicodeSet *theSet = scanProp();
1269             compileSet(theSet);
1270         }
1271         break;
1272 
1273     case doNamedChar:
1274         {
1275             UChar32 c = scanNamedChar();
1276             literalChar(c);
1277         }
1278         break;
1279 
1280 
1281     case doBackRef:
1282         // BackReference.  Somewhat unusual in that the front-end can not completely parse
1283         //                 the regular expression, because the number of digits to be consumed
1284         //                 depends on the number of capture groups that have been defined.  So
1285         //                 we have to do it here instead.
1286         {
1287             int32_t  numCaptureGroups = fRXPat->fGroupMap->size();
1288             int32_t  groupNum = 0;
1289             UChar32  c        = fC.fChar;
1290 
1291             for (;;) {
1292                 // Loop once per digit, for max allowed number of digits in a back reference.
1293                 int32_t digit = u_charDigitValue(c);
1294                 groupNum = groupNum * 10 + digit;
1295                 if (groupNum >= numCaptureGroups) {
1296                     break;
1297                 }
1298                 c = peekCharLL();
1299                 if (RegexStaticSets::gStaticSets->fRuleDigitsAlias->contains(c) == FALSE) {
1300                     break;
1301                 }
1302                 nextCharLL();
1303             }
1304 
1305             // Scan of the back reference in the source regexp is complete.  Now generate
1306             //  the compiled code for it.
1307             // Because capture groups can be forward-referenced by back-references,
1308             //  we fill the operand with the capture group number.  At the end
1309             //  of compilation, it will be changed to the variable's location.
1310             U_ASSERT(groupNum > 0);  // Shouldn't happen.  '\0' begins an octal escape sequence,
1311                                      //    and shouldn't enter this code path at all.
1312             fixLiterals(FALSE);
1313             if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1314                 appendOp(URX_BACKREF_I, groupNum);
1315             } else {
1316                 appendOp(URX_BACKREF, groupNum);
1317             }
1318         }
1319         break;
1320 
1321     case doBeginNamedBackRef:
1322         U_ASSERT(fCaptureName == NULL);
1323         fCaptureName = new UnicodeString;
1324         if (fCaptureName == NULL) {
1325             error(U_MEMORY_ALLOCATION_ERROR);
1326         }
1327         break;
1328 
1329     case doContinueNamedBackRef:
1330         fCaptureName->append(fC.fChar);
1331         break;
1332 
1333     case doCompleteNamedBackRef:
1334         {
1335         int32_t groupNumber = uhash_geti(fRXPat->fNamedCaptureMap, fCaptureName);
1336         if (groupNumber == 0) {
1337             // Group name has not been defined.
1338             //   Could be a forward reference. If we choose to support them at some
1339             //   future time, extra mechanism will be required at this point.
1340             error(U_REGEX_INVALID_CAPTURE_GROUP_NAME);
1341         } else {
1342             // Given the number, handle identically to a \n numbered back reference.
1343             // See comments above, under doBackRef
1344             fixLiterals(FALSE);
1345             if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1346                 appendOp(URX_BACKREF_I, groupNumber);
1347             } else {
1348                 appendOp(URX_BACKREF, groupNumber);
1349             }
1350         }
1351         delete fCaptureName;
1352         fCaptureName = NULL;
1353         break;
1354         }
1355 
1356     case doPossessivePlus:
1357         // Possessive ++ quantifier.
1358         // Compiles to
1359         //       1.   STO_SP
1360         //       2.      body of stuff being iterated over
1361         //       3.   STATE_SAVE 5
1362         //       4.   JMP        2
1363         //       5.   LD_SP
1364         //       6.   ...
1365         //
1366         //  Note:  TODO:  This is pretty inefficient.  A mass of saved state is built up
1367         //                then unconditionally discarded.  Perhaps introduce a new opcode.  Ticket 6056
1368         //
1369         {
1370             // Emit the STO_SP
1371             int32_t   topLoc = blockTopLoc(TRUE);
1372             int32_t   stoLoc = allocateData(1);  // Reserve the data location for storing save stack ptr.
1373             int32_t   op     = buildOp(URX_STO_SP, stoLoc);
1374             fRXPat->fCompiledPat->setElementAt(op, topLoc);
1375 
1376             // Emit the STATE_SAVE
1377             appendOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+2);
1378 
1379             // Emit the JMP
1380             appendOp(URX_JMP, topLoc+1);
1381 
1382             // Emit the LD_SP
1383             appendOp(URX_LD_SP, stoLoc);
1384         }
1385         break;
1386 
1387     case doPossessiveStar:
1388         // Possessive *+ quantifier.
1389         // Compiles to
1390         //       1.   STO_SP       loc
1391         //       2.   STATE_SAVE   5
1392         //       3.      body of stuff being iterated over
1393         //       4.   JMP          2
1394         //       5.   LD_SP        loc
1395         //       6    ...
1396         // TODO:  do something to cut back the state stack each time through the loop.
1397         {
1398             // Reserve two slots at the top of the block.
1399             int32_t   topLoc = blockTopLoc(TRUE);
1400             insertOp(topLoc);
1401 
1402             // emit   STO_SP     loc
1403             int32_t   stoLoc = allocateData(1);    // Reserve the data location for storing save stack ptr.
1404             int32_t   op     = buildOp(URX_STO_SP, stoLoc);
1405             fRXPat->fCompiledPat->setElementAt(op, topLoc);
1406 
1407             // Emit the SAVE_STATE   5
1408             int32_t L7 = fRXPat->fCompiledPat->size()+1;
1409             op = buildOp(URX_STATE_SAVE, L7);
1410             fRXPat->fCompiledPat->setElementAt(op, topLoc+1);
1411 
1412             // Append the JMP operation.
1413             appendOp(URX_JMP, topLoc+1);
1414 
1415             // Emit the LD_SP       loc
1416             appendOp(URX_LD_SP, stoLoc);
1417         }
1418         break;
1419 
1420     case doPossessiveOpt:
1421         // Possessive  ?+ quantifier.
1422         //  Compiles to
1423         //     1. STO_SP      loc
1424         //     2. SAVE_STATE  5
1425         //     3.    body of optional block
1426         //     4. LD_SP       loc
1427         //     5. ...
1428         //
1429         {
1430             // Reserve two slots at the top of the block.
1431             int32_t   topLoc = blockTopLoc(TRUE);
1432             insertOp(topLoc);
1433 
1434             // Emit the STO_SP
1435             int32_t   stoLoc = allocateData(1);   // Reserve the data location for storing save stack ptr.
1436             int32_t   op     = buildOp(URX_STO_SP, stoLoc);
1437             fRXPat->fCompiledPat->setElementAt(op, topLoc);
1438 
1439             // Emit the SAVE_STATE
1440             int32_t   continueLoc = fRXPat->fCompiledPat->size()+1;
1441             op = buildOp(URX_STATE_SAVE, continueLoc);
1442             fRXPat->fCompiledPat->setElementAt(op, topLoc+1);
1443 
1444             // Emit the LD_SP
1445             appendOp(URX_LD_SP, stoLoc);
1446         }
1447         break;
1448 
1449 
1450     case doBeginMatchMode:
1451         fNewModeFlags = fModeFlags;
1452         fSetModeFlag  = TRUE;
1453         break;
1454 
1455     case doMatchMode:   //  (?i)    and similar
1456         {
1457             int32_t  bit = 0;
1458             switch (fC.fChar) {
1459             case 0x69: /* 'i' */   bit = UREGEX_CASE_INSENSITIVE; break;
1460             case 0x64: /* 'd' */   bit = UREGEX_UNIX_LINES;       break;
1461             case 0x6d: /* 'm' */   bit = UREGEX_MULTILINE;        break;
1462             case 0x73: /* 's' */   bit = UREGEX_DOTALL;           break;
1463             case 0x75: /* 'u' */   bit = 0; /* Unicode casing */  break;
1464             case 0x77: /* 'w' */   bit = UREGEX_UWORD;            break;
1465             case 0x78: /* 'x' */   bit = UREGEX_COMMENTS;         break;
1466             case 0x2d: /* '-' */   fSetModeFlag = FALSE;          break;
1467             default:
1468                 UPRV_UNREACHABLE;   // Should never happen.  Other chars are filtered out
1469                                    // by the scanner.
1470             }
1471             if (fSetModeFlag) {
1472                 fNewModeFlags |= bit;
1473             } else {
1474                 fNewModeFlags &= ~bit;
1475             }
1476         }
1477         break;
1478 
1479     case doSetMatchMode:
1480         // Emit code to match any pending literals, using the not-yet changed match mode.
1481         fixLiterals();
1482 
1483         // We've got a (?i) or similar.  The match mode is being changed, but
1484         //   the change is not scoped to a parenthesized block.
1485         U_ASSERT(fNewModeFlags < 0);
1486         fModeFlags = fNewModeFlags;
1487 
1488         break;
1489 
1490 
1491     case doMatchModeParen:
1492         // We've got a (?i: or similar.  Begin a parenthesized block, save old
1493         //   mode flags so they can be restored at the close of the block.
1494         //
1495         //   Compile to a
1496         //      - NOP, which later may be replaced by a save-state if the
1497         //         parenthesized group gets a * quantifier, followed by
1498         //      - NOP, which may later be replaced by a save-state if there
1499         //             is an '|' alternation within the parens.
1500         {
1501             fixLiterals(FALSE);
1502             appendOp(URX_NOP, 0);
1503             appendOp(URX_NOP, 0);
1504 
1505             // On the Parentheses stack, start a new frame and add the postions
1506             //   of the two NOPs (a normal non-capturing () frame, except for the
1507             //   saving of the orignal mode flags.)
1508             fParenStack.push(fModeFlags, *fStatus);
1509             fParenStack.push(flags, *fStatus);                            // Frame Marker
1510             fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first NOP
1511             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP
1512 
1513             // Set the current mode flags to the new values.
1514             U_ASSERT(fNewModeFlags < 0);
1515             fModeFlags = fNewModeFlags;
1516         }
1517         break;
1518 
1519     case doBadModeFlag:
1520         error(U_REGEX_INVALID_FLAG);
1521         break;
1522 
1523     case doSuppressComments:
1524         // We have just scanned a '(?'.  We now need to prevent the character scanner from
1525         // treating a '#' as a to-the-end-of-line comment.
1526         //   (This Perl compatibility just gets uglier and uglier to do...)
1527         fEOLComments = FALSE;
1528         break;
1529 
1530 
1531     case doSetAddAmp:
1532         {
1533           UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1534           set->add(chAmp);
1535         }
1536         break;
1537 
1538     case doSetAddDash:
1539         {
1540           UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1541           set->add(chDash);
1542         }
1543         break;
1544 
1545      case doSetBackslash_s:
1546         {
1547          UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1548          set->addAll(*RegexStaticSets::gStaticSets->fPropSets[URX_ISSPACE_SET]);
1549          break;
1550         }
1551 
1552      case doSetBackslash_S:
1553         {
1554             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1555             UnicodeSet SSet(*RegexStaticSets::gStaticSets->fPropSets[URX_ISSPACE_SET]);
1556             SSet.complement();
1557             set->addAll(SSet);
1558             break;
1559         }
1560 
1561     case doSetBackslash_d:
1562         {
1563             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1564             // TODO - make a static set, ticket 6058.
1565             addCategory(set, U_GC_ND_MASK, *fStatus);
1566             break;
1567         }
1568 
1569     case doSetBackslash_D:
1570         {
1571             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1572             UnicodeSet digits;
1573             // TODO - make a static set, ticket 6058.
1574             digits.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ND_MASK, *fStatus);
1575             digits.complement();
1576             set->addAll(digits);
1577             break;
1578         }
1579 
1580     case doSetBackslash_h:
1581         {
1582             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1583             UnicodeSet h;
1584             h.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus);
1585             h.add((UChar32)9);   // Tab
1586             set->addAll(h);
1587             break;
1588         }
1589 
1590     case doSetBackslash_H:
1591         {
1592             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1593             UnicodeSet h;
1594             h.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus);
1595             h.add((UChar32)9);   // Tab
1596             h.complement();
1597             set->addAll(h);
1598             break;
1599         }
1600 
1601     case doSetBackslash_v:
1602         {
1603             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1604             set->add((UChar32)0x0a, (UChar32)0x0d);  // add range
1605             set->add((UChar32)0x85);
1606             set->add((UChar32)0x2028, (UChar32)0x2029);
1607             break;
1608         }
1609 
1610     case doSetBackslash_V:
1611         {
1612             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1613             UnicodeSet v;
1614             v.add((UChar32)0x0a, (UChar32)0x0d);  // add range
1615             v.add((UChar32)0x85);
1616             v.add((UChar32)0x2028, (UChar32)0x2029);
1617             v.complement();
1618             set->addAll(v);
1619             break;
1620         }
1621 
1622     case doSetBackslash_w:
1623         {
1624             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1625             set->addAll(*RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET]);
1626             break;
1627         }
1628 
1629     case doSetBackslash_W:
1630         {
1631             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1632             UnicodeSet SSet(*RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET]);
1633             SSet.complement();
1634             set->addAll(SSet);
1635             break;
1636         }
1637 
1638     case doSetBegin:
1639         fixLiterals(FALSE);
1640         fSetStack.push(new UnicodeSet(), *fStatus);
1641         fSetOpStack.push(setStart, *fStatus);
1642         if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
1643             fSetOpStack.push(setCaseClose, *fStatus);
1644         }
1645         break;
1646 
1647     case doSetBeginDifference1:
1648         //  We have scanned something like [[abc]-[
1649         //  Set up a new UnicodeSet for the set beginning with the just-scanned '['
1650         //  Push a Difference operator, which will cause the new set to be subtracted from what
1651         //    went before once it is created.
1652         setPushOp(setDifference1);
1653         fSetOpStack.push(setStart, *fStatus);
1654         if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
1655             fSetOpStack.push(setCaseClose, *fStatus);
1656         }
1657         break;
1658 
1659     case doSetBeginIntersection1:
1660         //  We have scanned something like  [[abc]&[
1661         //   Need both the '&' operator and the open '[' operator.
1662         setPushOp(setIntersection1);
1663         fSetOpStack.push(setStart, *fStatus);
1664         if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
1665             fSetOpStack.push(setCaseClose, *fStatus);
1666         }
1667         break;
1668 
1669     case doSetBeginUnion:
1670         //  We have scanned something like  [[abc][
1671         //     Need to handle the union operation explicitly [[abc] | [
1672         setPushOp(setUnion);
1673         fSetOpStack.push(setStart, *fStatus);
1674         if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
1675             fSetOpStack.push(setCaseClose, *fStatus);
1676         }
1677         break;
1678 
1679     case doSetDifference2:
1680         // We have scanned something like [abc--
1681         //   Consider this to unambiguously be a set difference operator.
1682         setPushOp(setDifference2);
1683         break;
1684 
1685     case doSetEnd:
1686         // Have encountered the ']' that closes a set.
1687         //    Force the evaluation of any pending operations within this set,
1688         //    leave the completed set on the top of the set stack.
1689         setEval(setEnd);
1690         U_ASSERT(fSetOpStack.peeki()==setStart);
1691         fSetOpStack.popi();
1692         break;
1693 
1694     case doSetFinish:
1695         {
1696         // Finished a complete set expression, including all nested sets.
1697         //   The close bracket has already triggered clearing out pending set operators,
1698         //    the operator stack should be empty and the operand stack should have just
1699         //    one entry, the result set.
1700         U_ASSERT(fSetOpStack.empty());
1701         UnicodeSet *theSet = (UnicodeSet *)fSetStack.pop();
1702         U_ASSERT(fSetStack.empty());
1703         compileSet(theSet);
1704         break;
1705         }
1706 
1707     case doSetIntersection2:
1708         // Have scanned something like [abc&&
1709         setPushOp(setIntersection2);
1710         break;
1711 
1712     case doSetLiteral:
1713         // Union the just-scanned literal character into the set being built.
1714         //    This operation is the highest precedence set operation, so we can always do
1715         //    it immediately, without waiting to see what follows.  It is necessary to perform
1716         //    any pending '-' or '&' operation first, because these have the same precedence
1717         //    as union-ing in a literal'
1718         {
1719             setEval(setUnion);
1720             UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1721             s->add(fC.fChar);
1722             fLastSetLiteral = fC.fChar;
1723             break;
1724         }
1725 
1726     case doSetLiteralEscaped:
1727         // A back-slash escaped literal character was encountered.
1728         // Processing is the same as with setLiteral, above, with the addition of
1729         //  the optional check for errors on escaped ASCII letters.
1730         {
1731             if ((fModeFlags & UREGEX_ERROR_ON_UNKNOWN_ESCAPES) != 0 &&
1732                 ((fC.fChar >= 0x41 && fC.fChar<= 0x5A) ||     // in [A-Z]
1733                  (fC.fChar >= 0x61 && fC.fChar <= 0x7a))) {   // in [a-z]
1734                 error(U_REGEX_BAD_ESCAPE_SEQUENCE);
1735             }
1736             setEval(setUnion);
1737             UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1738             s->add(fC.fChar);
1739             fLastSetLiteral = fC.fChar;
1740             break;
1741         }
1742 
1743         case doSetNamedChar:
1744         // Scanning a \N{UNICODE CHARACTER NAME}
1745         //  Aside from the source of the character, the processing is identical to doSetLiteral,
1746         //    above.
1747         {
1748             UChar32  c = scanNamedChar();
1749             setEval(setUnion);
1750             UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1751             s->add(c);
1752             fLastSetLiteral = c;
1753             break;
1754         }
1755 
1756     case doSetNamedRange:
1757         // We have scanned literal-\N{CHAR NAME}.  Add the range to the set.
1758         // The left character is already in the set, and is saved in fLastSetLiteral.
1759         // The right side needs to be picked up, the scan is at the 'N'.
1760         // Lower Limit > Upper limit being an error matches both Java
1761         //        and ICU UnicodeSet behavior.
1762         {
1763             UChar32  c = scanNamedChar();
1764             if (U_SUCCESS(*fStatus) && (fLastSetLiteral == U_SENTINEL || fLastSetLiteral > c)) {
1765                 error(U_REGEX_INVALID_RANGE);
1766             }
1767             UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1768             s->add(fLastSetLiteral, c);
1769             fLastSetLiteral = c;
1770             break;
1771         }
1772 
1773 
1774     case  doSetNegate:
1775         // Scanned a '^' at the start of a set.
1776         // Push the negation operator onto the set op stack.
1777         // A twist for case-insensitive matching:
1778         //   the case closure operation must happen _before_ negation.
1779         //   But the case closure operation will already be on the stack if it's required.
1780         //   This requires checking for case closure, and swapping the stack order
1781         //    if it is present.
1782         {
1783             int32_t  tosOp = fSetOpStack.peeki();
1784             if (tosOp == setCaseClose) {
1785                 fSetOpStack.popi();
1786                 fSetOpStack.push(setNegation, *fStatus);
1787                 fSetOpStack.push(setCaseClose, *fStatus);
1788             } else {
1789                 fSetOpStack.push(setNegation, *fStatus);
1790             }
1791         }
1792         break;
1793 
1794     case doSetNoCloseError:
1795         error(U_REGEX_MISSING_CLOSE_BRACKET);
1796         break;
1797 
1798     case doSetOpError:
1799         error(U_REGEX_RULE_SYNTAX);   //  -- or && at the end of a set.  Illegal.
1800         break;
1801 
1802     case doSetPosixProp:
1803         {
1804             UnicodeSet *s = scanPosixProp();
1805             if (s != NULL) {
1806                 UnicodeSet *tos = (UnicodeSet *)fSetStack.peek();
1807                 tos->addAll(*s);
1808                 delete s;
1809             }  // else error.  scanProp() reported the error status already.
1810         }
1811         break;
1812 
1813     case doSetProp:
1814         //  Scanned a \p \P within [brackets].
1815         {
1816             UnicodeSet *s = scanProp();
1817             if (s != NULL) {
1818                 UnicodeSet *tos = (UnicodeSet *)fSetStack.peek();
1819                 tos->addAll(*s);
1820                 delete s;
1821             }  // else error.  scanProp() reported the error status already.
1822         }
1823         break;
1824 
1825 
1826     case doSetRange:
1827         // We have scanned literal-literal.  Add the range to the set.
1828         // The left character is already in the set, and is saved in fLastSetLiteral.
1829         // The right side is the current character.
1830         // Lower Limit > Upper limit being an error matches both Java
1831         //        and ICU UnicodeSet behavior.
1832         {
1833 
1834         if (fLastSetLiteral == U_SENTINEL || fLastSetLiteral > fC.fChar) {
1835             error(U_REGEX_INVALID_RANGE);
1836         }
1837         UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1838         s->add(fLastSetLiteral, fC.fChar);
1839         break;
1840         }
1841 
1842     default:
1843         UPRV_UNREACHABLE;
1844     }
1845 
1846     if (U_FAILURE(*fStatus)) {
1847         returnVal = FALSE;
1848     }
1849 
1850     return returnVal;
1851 }
1852 
1853 
1854 
1855 //------------------------------------------------------------------------------
1856 //
1857 //   literalChar           We've encountered a literal character from the pattern,
1858 //                             or an escape sequence that reduces to a character.
1859 //                         Add it to the string containing all literal chars/strings from
1860 //                             the pattern.
1861 //
1862 //------------------------------------------------------------------------------
literalChar(UChar32 c)1863 void RegexCompile::literalChar(UChar32 c)  {
1864     fLiteralChars.append(c);
1865 }
1866 
1867 
1868 //------------------------------------------------------------------------------
1869 //
1870 //    fixLiterals           When compiling something that can follow a literal
1871 //                          string in a pattern, emit the code to match the
1872 //                          accumulated literal string.
1873 //
1874 //                          Optionally, split the last char of the string off into
1875 //                          a single "ONE_CHAR" operation, so that quantifiers can
1876 //                          apply to that char alone.  Example:   abc*
1877 //                          The * must apply to the 'c' only.
1878 //
1879 //------------------------------------------------------------------------------
fixLiterals(UBool split)1880 void    RegexCompile::fixLiterals(UBool split) {
1881 
1882     // If no literal characters have been scanned but not yet had code generated
1883     //   for them, nothing needs to be done.
1884     if (fLiteralChars.length() == 0) {
1885         return;
1886     }
1887 
1888     int32_t indexOfLastCodePoint = fLiteralChars.moveIndex32(fLiteralChars.length(), -1);
1889     UChar32 lastCodePoint = fLiteralChars.char32At(indexOfLastCodePoint);
1890 
1891     // Split:  We need to  ensure that the last item in the compiled pattern
1892     //     refers only to the last literal scanned in the pattern, so that
1893     //     quantifiers (*, +, etc.) affect only it, and not a longer string.
1894     //     Split before case folding for case insensitive matches.
1895 
1896     if (split) {
1897         fLiteralChars.truncate(indexOfLastCodePoint);
1898         fixLiterals(FALSE);   // Recursive call, emit code to match the first part of the string.
1899                               //  Note that the truncated literal string may be empty, in which case
1900                               //  nothing will be emitted.
1901 
1902         literalChar(lastCodePoint);  // Re-add the last code point as if it were a new literal.
1903         fixLiterals(FALSE);          // Second recursive call, code for the final code point.
1904         return;
1905     }
1906 
1907     // If we are doing case-insensitive matching, case fold the string.  This may expand
1908     //   the string, e.g. the German sharp-s turns into "ss"
1909     if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1910         fLiteralChars.foldCase();
1911         indexOfLastCodePoint = fLiteralChars.moveIndex32(fLiteralChars.length(), -1);
1912         lastCodePoint = fLiteralChars.char32At(indexOfLastCodePoint);
1913     }
1914 
1915     if (indexOfLastCodePoint == 0) {
1916         // Single character, emit a URX_ONECHAR op to match it.
1917         if ((fModeFlags & UREGEX_CASE_INSENSITIVE) &&
1918                  u_hasBinaryProperty(lastCodePoint, UCHAR_CASE_SENSITIVE)) {
1919             appendOp(URX_ONECHAR_I, lastCodePoint);
1920         } else {
1921             appendOp(URX_ONECHAR, lastCodePoint);
1922         }
1923     } else {
1924         // Two or more chars, emit a URX_STRING to match them.
1925         if (fLiteralChars.length() > 0x00ffffff || fRXPat->fLiteralText.length() > 0x00ffffff) {
1926             error(U_REGEX_PATTERN_TOO_BIG);
1927         }
1928         if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1929             appendOp(URX_STRING_I, fRXPat->fLiteralText.length());
1930         } else {
1931             // TODO here:  add optimization to split case sensitive strings of length two
1932             //             into two single char ops, for efficiency.
1933             appendOp(URX_STRING, fRXPat->fLiteralText.length());
1934         }
1935         appendOp(URX_STRING_LEN, fLiteralChars.length());
1936 
1937         // Add this string into the accumulated strings of the compiled pattern.
1938         fRXPat->fLiteralText.append(fLiteralChars);
1939     }
1940 
1941     fLiteralChars.remove();
1942 }
1943 
1944 
buildOp(int32_t type,int32_t val)1945 int32_t RegexCompile::buildOp(int32_t type, int32_t val) {
1946     if (U_FAILURE(*fStatus)) {
1947         return 0;
1948     }
1949     if (type < 0 || type > 255) {
1950         UPRV_UNREACHABLE;
1951     }
1952     if (val > 0x00ffffff) {
1953         UPRV_UNREACHABLE;
1954     }
1955     if (val < 0) {
1956         if (!(type == URX_RESERVED_OP_N || type == URX_RESERVED_OP)) {
1957             UPRV_UNREACHABLE;
1958         }
1959         if (URX_TYPE(val) != 0xff) {
1960             UPRV_UNREACHABLE;
1961         }
1962         type = URX_RESERVED_OP_N;
1963     }
1964     return (type << 24) | val;
1965 }
1966 
1967 
1968 //------------------------------------------------------------------------------
1969 //
1970 //   appendOp()             Append a new instruction onto the compiled pattern
1971 //                          Includes error checking, limiting the size of the
1972 //                          pattern to lengths that can be represented in the
1973 //                          24 bit operand field of an instruction.
1974 //
1975 //------------------------------------------------------------------------------
appendOp(int32_t op)1976 void RegexCompile::appendOp(int32_t op) {
1977     if (U_FAILURE(*fStatus)) {
1978         return;
1979     }
1980     fRXPat->fCompiledPat->addElement(op, *fStatus);
1981     if ((fRXPat->fCompiledPat->size() > 0x00fffff0) && U_SUCCESS(*fStatus)) {
1982         error(U_REGEX_PATTERN_TOO_BIG);
1983     }
1984 }
1985 
appendOp(int32_t type,int32_t val)1986 void RegexCompile::appendOp(int32_t type, int32_t val) {
1987     appendOp(buildOp(type, val));
1988 }
1989 
1990 
1991 //------------------------------------------------------------------------------
1992 //
1993 //   insertOp()             Insert a slot for a new opcode into the already
1994 //                          compiled pattern code.
1995 //
1996 //                          Fill the slot with a NOP.  Our caller will replace it
1997 //                          with what they really wanted.
1998 //
1999 //------------------------------------------------------------------------------
insertOp(int32_t where)2000 void   RegexCompile::insertOp(int32_t where) {
2001     UVector64 *code = fRXPat->fCompiledPat;
2002     U_ASSERT(where>0 && where < code->size());
2003 
2004     int32_t  nop = buildOp(URX_NOP, 0);
2005     code->insertElementAt(nop, where, *fStatus);
2006 
2007     // Walk through the pattern, looking for any ops with targets that
2008     //  were moved down by the insert.  Fix them.
2009     int32_t loc;
2010     for (loc=0; loc<code->size(); loc++) {
2011         int32_t op = (int32_t)code->elementAti(loc);
2012         int32_t opType = URX_TYPE(op);
2013         int32_t opValue = URX_VAL(op);
2014         if ((opType == URX_JMP         ||
2015             opType == URX_JMPX         ||
2016             opType == URX_STATE_SAVE   ||
2017             opType == URX_CTR_LOOP     ||
2018             opType == URX_CTR_LOOP_NG  ||
2019             opType == URX_JMP_SAV      ||
2020             opType == URX_JMP_SAV_X    ||
2021             opType == URX_RELOC_OPRND)    && opValue > where) {
2022             // Target location for this opcode is after the insertion point and
2023             //   needs to be incremented to adjust for the insertion.
2024             opValue++;
2025             op = buildOp(opType, opValue);
2026             code->setElementAt(op, loc);
2027         }
2028     }
2029 
2030     // Now fix up the parentheses stack.  All positive values in it are locations in
2031     //  the compiled pattern.   (Negative values are frame boundaries, and don't need fixing.)
2032     for (loc=0; loc<fParenStack.size(); loc++) {
2033         int32_t x = fParenStack.elementAti(loc);
2034         U_ASSERT(x < code->size());
2035         if (x>where) {
2036             x++;
2037             fParenStack.setElementAt(x, loc);
2038         }
2039     }
2040 
2041     if (fMatchCloseParen > where) {
2042         fMatchCloseParen++;
2043     }
2044     if (fMatchOpenParen > where) {
2045         fMatchOpenParen++;
2046     }
2047 }
2048 
2049 
2050 //------------------------------------------------------------------------------
2051 //
2052 //   allocateData()        Allocate storage in the matcher's static data area.
2053 //                         Return the index for the newly allocated data.
2054 //                         The storage won't actually exist until we are running a match
2055 //                         operation, but the storage indexes are inserted into various
2056 //                         opcodes while compiling the pattern.
2057 //
2058 //------------------------------------------------------------------------------
allocateData(int32_t size)2059 int32_t RegexCompile::allocateData(int32_t size) {
2060     if (U_FAILURE(*fStatus)) {
2061         return 0;
2062     }
2063     if (size <= 0 || size > 0x100 || fRXPat->fDataSize < 0) {
2064         error(U_REGEX_INTERNAL_ERROR);
2065         return 0;
2066     }
2067     int32_t dataIndex = fRXPat->fDataSize;
2068     fRXPat->fDataSize += size;
2069     if (fRXPat->fDataSize >= 0x00fffff0) {
2070         error(U_REGEX_INTERNAL_ERROR);
2071     }
2072     return dataIndex;
2073 }
2074 
2075 
2076 //------------------------------------------------------------------------------
2077 //
2078 //   allocateStackData()   Allocate space in the back-tracking stack frame.
2079 //                         Return the index for the newly allocated data.
2080 //                         The frame indexes are inserted into various
2081 //                         opcodes while compiling the pattern, meaning that frame
2082 //                         size must be restricted to the size that will fit
2083 //                         as an operand (24 bits).
2084 //
2085 //------------------------------------------------------------------------------
allocateStackData(int32_t size)2086 int32_t RegexCompile::allocateStackData(int32_t size) {
2087     if (U_FAILURE(*fStatus)) {
2088         return 0;
2089     }
2090     if (size <= 0 || size > 0x100 || fRXPat->fFrameSize < 0) {
2091         error(U_REGEX_INTERNAL_ERROR);
2092         return 0;
2093     }
2094     int32_t dataIndex = fRXPat->fFrameSize;
2095     fRXPat->fFrameSize += size;
2096     if (fRXPat->fFrameSize >= 0x00fffff0) {
2097         error(U_REGEX_PATTERN_TOO_BIG);
2098     }
2099     return dataIndex;
2100 }
2101 
2102 
2103 //------------------------------------------------------------------------------
2104 //
2105 //   blockTopLoc()          Find or create a location in the compiled pattern
2106 //                          at the start of the operation or block that has
2107 //                          just been compiled.  Needed when a quantifier (* or
2108 //                          whatever) appears, and we need to add an operation
2109 //                          at the start of the thing being quantified.
2110 //
2111 //                          (Parenthesized Blocks) have a slot with a NOP that
2112 //                          is reserved for this purpose.  .* or similar don't
2113 //                          and a slot needs to be added.
2114 //
2115 //       parameter reserveLoc   :  TRUE -  ensure that there is space to add an opcode
2116 //                                         at the returned location.
2117 //                                 FALSE - just return the address,
2118 //                                         do not reserve a location there.
2119 //
2120 //------------------------------------------------------------------------------
blockTopLoc(UBool reserveLoc)2121 int32_t   RegexCompile::blockTopLoc(UBool reserveLoc) {
2122     int32_t   theLoc;
2123     fixLiterals(TRUE);  // Emit code for any pending literals.
2124                         //   If last item was a string, emit separate op for the its last char.
2125     if (fRXPat->fCompiledPat->size() == fMatchCloseParen)
2126     {
2127         // The item just processed is a parenthesized block.
2128         theLoc = fMatchOpenParen;   // A slot is already reserved for us.
2129         U_ASSERT(theLoc > 0);
2130         U_ASSERT(URX_TYPE(((uint32_t)fRXPat->fCompiledPat->elementAti(theLoc))) == URX_NOP);
2131     }
2132     else {
2133         // Item just compiled is a single thing, a ".", or a single char, a string or a set reference.
2134         // No slot for STATE_SAVE was pre-reserved in the compiled code.
2135         // We need to make space now.
2136         theLoc = fRXPat->fCompiledPat->size()-1;
2137         int32_t opAtTheLoc = (int32_t)fRXPat->fCompiledPat->elementAti(theLoc);
2138         if (URX_TYPE(opAtTheLoc) == URX_STRING_LEN) {
2139             // Strings take two opcode, we want the position of the first one.
2140             // We can have a string at this point if a single character case-folded to two.
2141             theLoc--;
2142         }
2143         if (reserveLoc) {
2144             int32_t  nop = buildOp(URX_NOP, 0);
2145             fRXPat->fCompiledPat->insertElementAt(nop, theLoc, *fStatus);
2146         }
2147     }
2148     return theLoc;
2149 }
2150 
2151 
2152 
2153 //------------------------------------------------------------------------------
2154 //
2155 //    handleCloseParen      When compiling a close paren, we need to go back
2156 //                          and fix up any JMP or SAVE operations within the
2157 //                          parenthesized block that need to target the end
2158 //                          of the block.  The locations of these are kept on
2159 //                          the paretheses stack.
2160 //
2161 //                          This function is called both when encountering a
2162 //                          real ) and at the end of the pattern.
2163 //
2164 //------------------------------------------------------------------------------
handleCloseParen()2165 void  RegexCompile::handleCloseParen() {
2166     int32_t   patIdx;
2167     int32_t   patOp;
2168     if (fParenStack.size() <= 0) {
2169         error(U_REGEX_MISMATCHED_PAREN);
2170         return;
2171     }
2172 
2173     // Emit code for any pending literals.
2174     fixLiterals(FALSE);
2175 
2176     // Fixup any operations within the just-closed parenthesized group
2177     //    that need to reference the end of the (block).
2178     //    (The first one popped from the stack is an unused slot for
2179     //     alternation (OR) state save, but applying the fixup to it does no harm.)
2180     for (;;) {
2181         patIdx = fParenStack.popi();
2182         if (patIdx < 0) {
2183             // value < 0 flags the start of the frame on the paren stack.
2184             break;
2185         }
2186         U_ASSERT(patIdx>0 && patIdx <= fRXPat->fCompiledPat->size());
2187         patOp = (int32_t)fRXPat->fCompiledPat->elementAti(patIdx);
2188         U_ASSERT(URX_VAL(patOp) == 0);          // Branch target for JMP should not be set.
2189         patOp |= fRXPat->fCompiledPat->size();  // Set it now.
2190         fRXPat->fCompiledPat->setElementAt(patOp, patIdx);
2191         fMatchOpenParen     = patIdx;
2192     }
2193 
2194     //  At the close of any parenthesized block, restore the match mode flags  to
2195     //  the value they had at the open paren.  Saved value is
2196     //  at the top of the paren stack.
2197     fModeFlags = fParenStack.popi();
2198     U_ASSERT(fModeFlags < 0);
2199 
2200     // DO any additional fixups, depending on the specific kind of
2201     // parentesized grouping this is
2202 
2203     switch (patIdx) {
2204     case plain:
2205     case flags:
2206         // No additional fixups required.
2207         //   (Grouping-only parentheses)
2208         break;
2209     case capturing:
2210         // Capturing Parentheses.
2211         //   Insert a End Capture op into the pattern.
2212         //   The frame offset of the variables for this cg is obtained from the
2213         //       start capture op and put it into the end-capture op.
2214         {
2215             int32_t   captureOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen+1);
2216             U_ASSERT(URX_TYPE(captureOp) == URX_START_CAPTURE);
2217 
2218             int32_t   frameVarLocation = URX_VAL(captureOp);
2219             appendOp(URX_END_CAPTURE, frameVarLocation);
2220         }
2221         break;
2222     case atomic:
2223         // Atomic Parenthesis.
2224         //   Insert a LD_SP operation to restore the state stack to the position
2225         //   it was when the atomic parens were entered.
2226         {
2227             int32_t   stoOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen+1);
2228             U_ASSERT(URX_TYPE(stoOp) == URX_STO_SP);
2229             int32_t   stoLoc = URX_VAL(stoOp);
2230             appendOp(URX_LD_SP, stoLoc);
2231         }
2232         break;
2233 
2234     case lookAhead:
2235         {
2236             int32_t  startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-5);
2237             U_ASSERT(URX_TYPE(startOp) == URX_LA_START);
2238             int32_t dataLoc  = URX_VAL(startOp);
2239             appendOp(URX_LA_END, dataLoc);
2240         }
2241         break;
2242 
2243     case negLookAhead:
2244         {
2245             // See comment at doOpenLookAheadNeg
2246             int32_t  startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-1);
2247             U_ASSERT(URX_TYPE(startOp) == URX_LA_START);
2248             int32_t dataLoc  = URX_VAL(startOp);
2249             appendOp(URX_LA_END, dataLoc);
2250             appendOp(URX_BACKTRACK, 0);
2251             appendOp(URX_LA_END, dataLoc);
2252 
2253             // Patch the URX_SAVE near the top of the block.
2254             // The destination of the SAVE is the final LA_END that was just added.
2255             int32_t saveOp   = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen);
2256             U_ASSERT(URX_TYPE(saveOp) == URX_STATE_SAVE);
2257             int32_t dest     = fRXPat->fCompiledPat->size()-1;
2258             saveOp           = buildOp(URX_STATE_SAVE, dest);
2259             fRXPat->fCompiledPat->setElementAt(saveOp, fMatchOpenParen);
2260         }
2261         break;
2262 
2263     case lookBehind:
2264         {
2265             // See comment at doOpenLookBehind.
2266 
2267             // Append the URX_LB_END and URX_LA_END to the compiled pattern.
2268             int32_t  startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-4);
2269             U_ASSERT(URX_TYPE(startOp) == URX_LB_START);
2270             int32_t dataLoc  = URX_VAL(startOp);
2271             appendOp(URX_LB_END, dataLoc);
2272             appendOp(URX_LA_END, dataLoc);
2273 
2274             // Determine the min and max bounds for the length of the
2275             //  string that the pattern can match.
2276             //  An unbounded upper limit is an error.
2277             int32_t patEnd   = fRXPat->fCompiledPat->size() - 1;
2278             int32_t minML    = minMatchLength(fMatchOpenParen, patEnd);
2279             int32_t maxML    = maxMatchLength(fMatchOpenParen, patEnd);
2280             if (URX_TYPE(maxML) != 0) {
2281                 error(U_REGEX_LOOK_BEHIND_LIMIT);
2282                 break;
2283             }
2284             if (maxML == INT32_MAX) {
2285                 error(U_REGEX_LOOK_BEHIND_LIMIT);
2286                 break;
2287             }
2288             if (minML == INT32_MAX && maxML == 0) {
2289                 // This condition happens when no match is possible, such as with a
2290                 // [set] expression containing no elements.
2291                 // In principle, the generated code to evaluate the expression could be deleted,
2292                 // but it's probably not worth the complication.
2293                 minML = 0;
2294             }
2295             U_ASSERT(minML <= maxML);
2296 
2297             // Insert the min and max match len bounds into the URX_LB_CONT op that
2298             //  appears at the top of the look-behind block, at location fMatchOpenParen+1
2299             fRXPat->fCompiledPat->setElementAt(minML,  fMatchOpenParen-2);
2300             fRXPat->fCompiledPat->setElementAt(maxML,  fMatchOpenParen-1);
2301 
2302         }
2303         break;
2304 
2305 
2306 
2307     case lookBehindN:
2308         {
2309             // See comment at doOpenLookBehindNeg.
2310 
2311             // Append the URX_LBN_END to the compiled pattern.
2312             int32_t  startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-5);
2313             U_ASSERT(URX_TYPE(startOp) == URX_LB_START);
2314             int32_t dataLoc  = URX_VAL(startOp);
2315             appendOp(URX_LBN_END, dataLoc);
2316 
2317             // Determine the min and max bounds for the length of the
2318             //  string that the pattern can match.
2319             //  An unbounded upper limit is an error.
2320             int32_t patEnd   = fRXPat->fCompiledPat->size() - 1;
2321             int32_t minML    = minMatchLength(fMatchOpenParen, patEnd);
2322             int32_t maxML    = maxMatchLength(fMatchOpenParen, patEnd);
2323             if (URX_TYPE(maxML) != 0) {
2324                 error(U_REGEX_LOOK_BEHIND_LIMIT);
2325                 break;
2326             }
2327             if (maxML == INT32_MAX) {
2328                 error(U_REGEX_LOOK_BEHIND_LIMIT);
2329                 break;
2330             }
2331             if (minML == INT32_MAX && maxML == 0) {
2332                 // This condition happens when no match is possible, such as with a
2333                 // [set] expression containing no elements.
2334                 // In principle, the generated code to evaluate the expression could be deleted,
2335                 // but it's probably not worth the complication.
2336                 minML = 0;
2337             }
2338 
2339             U_ASSERT(minML <= maxML);
2340 
2341             // Insert the min and max match len bounds into the URX_LB_CONT op that
2342             //  appears at the top of the look-behind block, at location fMatchOpenParen+1
2343             fRXPat->fCompiledPat->setElementAt(minML,  fMatchOpenParen-3);
2344             fRXPat->fCompiledPat->setElementAt(maxML,  fMatchOpenParen-2);
2345 
2346             // Insert the pattern location to continue at after a successful match
2347             //  as the last operand of the URX_LBN_CONT
2348             int32_t op = buildOp(URX_RELOC_OPRND, fRXPat->fCompiledPat->size());
2349             fRXPat->fCompiledPat->setElementAt(op,  fMatchOpenParen-1);
2350         }
2351         break;
2352 
2353 
2354 
2355     default:
2356         UPRV_UNREACHABLE;
2357     }
2358 
2359     // remember the next location in the compiled pattern.
2360     // The compilation of Quantifiers will look at this to see whether its looping
2361     //   over a parenthesized block or a single item
2362     fMatchCloseParen = fRXPat->fCompiledPat->size();
2363 }
2364 
2365 
2366 
2367 //------------------------------------------------------------------------------
2368 //
2369 //   compileSet       Compile the pattern operations for a reference to a
2370 //                    UnicodeSet.
2371 //
2372 //------------------------------------------------------------------------------
compileSet(UnicodeSet * theSet)2373 void        RegexCompile::compileSet(UnicodeSet *theSet)
2374 {
2375     if (theSet == NULL) {
2376         return;
2377     }
2378     //  Remove any strings from the set.
2379     //  There shoudn't be any, but just in case.
2380     //     (Case Closure can add them; if we had a simple case closure avaialble that
2381     //      ignored strings, that would be better.)
2382     theSet->removeAllStrings();
2383     int32_t  setSize = theSet->size();
2384 
2385     switch (setSize) {
2386     case 0:
2387         {
2388             // Set of no elements.   Always fails to match.
2389             appendOp(URX_BACKTRACK, 0);
2390             delete theSet;
2391         }
2392         break;
2393 
2394     case 1:
2395         {
2396             // The set contains only a single code point.  Put it into
2397             //   the compiled pattern as a single char operation rather
2398             //   than a set, and discard the set itself.
2399             literalChar(theSet->charAt(0));
2400             delete theSet;
2401         }
2402         break;
2403 
2404     default:
2405         {
2406             //  The set contains two or more chars.  (the normal case)
2407             //  Put it into the compiled pattern as a set.
2408             int32_t setNumber = fRXPat->fSets->size();
2409             fRXPat->fSets->addElement(theSet, *fStatus);
2410             appendOp(URX_SETREF, setNumber);
2411         }
2412     }
2413 }
2414 
2415 
2416 //------------------------------------------------------------------------------
2417 //
2418 //   compileInterval    Generate the code for a {min, max} style interval quantifier.
2419 //                      Except for the specific opcodes used, the code is the same
2420 //                      for all three types (greedy, non-greedy, possessive) of
2421 //                      intervals.  The opcodes are supplied as parameters.
2422 //                      (There are two sets of opcodes - greedy & possessive use the
2423 //                      same ones, while non-greedy has it's own.)
2424 //
2425 //                      The code for interval loops has this form:
2426 //                         0  CTR_INIT   counter loc (in stack frame)
2427 //                         1             5  patt address of CTR_LOOP at bottom of block
2428 //                         2             min count
2429 //                         3             max count   (-1 for unbounded)
2430 //                         4  ...        block to be iterated over
2431 //                         5  CTR_LOOP
2432 //
2433 //                       In
2434 //------------------------------------------------------------------------------
compileInterval(int32_t InitOp,int32_t LoopOp)2435 void        RegexCompile::compileInterval(int32_t InitOp,  int32_t LoopOp)
2436 {
2437     // The CTR_INIT op at the top of the block with the {n,m} quantifier takes
2438     //   four slots in the compiled code.  Reserve them.
2439     int32_t   topOfBlock = blockTopLoc(TRUE);
2440     insertOp(topOfBlock);
2441     insertOp(topOfBlock);
2442     insertOp(topOfBlock);
2443 
2444     // The operands for the CTR_INIT opcode include the index in the matcher data
2445     //   of the counter.  Allocate it now. There are two data items
2446     //        counterLoc   -->  Loop counter
2447     //               +1    -->  Input index (for breaking non-progressing loops)
2448     //                          (Only present if unbounded upper limit on loop)
2449     int32_t   dataSize = fIntervalUpper < 0 ? 2 : 1;
2450     int32_t   counterLoc = allocateStackData(dataSize);
2451 
2452     int32_t   op = buildOp(InitOp, counterLoc);
2453     fRXPat->fCompiledPat->setElementAt(op, topOfBlock);
2454 
2455     // The second operand of CTR_INIT is the location following the end of the loop.
2456     //   Must put in as a URX_RELOC_OPRND so that the value will be adjusted if the
2457     //   compilation of something later on causes the code to grow and the target
2458     //   position to move.
2459     int32_t loopEnd = fRXPat->fCompiledPat->size();
2460     op = buildOp(URX_RELOC_OPRND, loopEnd);
2461     fRXPat->fCompiledPat->setElementAt(op, topOfBlock+1);
2462 
2463     // Followed by the min and max counts.
2464     fRXPat->fCompiledPat->setElementAt(fIntervalLow, topOfBlock+2);
2465     fRXPat->fCompiledPat->setElementAt(fIntervalUpper, topOfBlock+3);
2466 
2467     // Apend the CTR_LOOP op.  The operand is the location of the CTR_INIT op.
2468     //   Goes at end of the block being looped over, so just append to the code so far.
2469     appendOp(LoopOp, topOfBlock);
2470 
2471     if ((fIntervalLow & 0xff000000) != 0 ||
2472         (fIntervalUpper > 0 && (fIntervalUpper & 0xff000000) != 0)) {
2473             error(U_REGEX_NUMBER_TOO_BIG);
2474         }
2475 
2476     if (fIntervalLow > fIntervalUpper && fIntervalUpper != -1) {
2477         error(U_REGEX_MAX_LT_MIN);
2478     }
2479 }
2480 
2481 
2482 
compileInlineInterval()2483 UBool RegexCompile::compileInlineInterval() {
2484     if (fIntervalUpper > 10 || fIntervalUpper < fIntervalLow) {
2485         // Too big to inline.  Fail, which will cause looping code to be generated.
2486         //   (Upper < Lower picks up unbounded upper and errors, both.)
2487         return FALSE;
2488     }
2489 
2490     int32_t   topOfBlock = blockTopLoc(FALSE);
2491     if (fIntervalUpper == 0) {
2492         // Pathological case.  Attempt no matches, as if the block doesn't exist.
2493         // Discard the generated code for the block.
2494         // If the block included parens, discard the info pertaining to them as well.
2495         fRXPat->fCompiledPat->setSize(topOfBlock);
2496         if (fMatchOpenParen >= topOfBlock) {
2497             fMatchOpenParen = -1;
2498         }
2499         if (fMatchCloseParen >= topOfBlock) {
2500             fMatchCloseParen = -1;
2501         }
2502         return TRUE;
2503     }
2504 
2505     if (topOfBlock != fRXPat->fCompiledPat->size()-1 && fIntervalUpper != 1) {
2506         // The thing being repeated is not a single op, but some
2507         //   more complex block.  Do it as a loop, not inlines.
2508         //   Note that things "repeated" a max of once are handled as inline, because
2509         //     the one copy of the code already generated is just fine.
2510         return FALSE;
2511     }
2512 
2513     // Pick up the opcode that is to be repeated
2514     //
2515     int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(topOfBlock);
2516 
2517     // Compute the pattern location where the inline sequence
2518     //   will end, and set up the state save op that will be needed.
2519     //
2520     int32_t endOfSequenceLoc = fRXPat->fCompiledPat->size()-1
2521                                 + fIntervalUpper + (fIntervalUpper-fIntervalLow);
2522     int32_t saveOp = buildOp(URX_STATE_SAVE, endOfSequenceLoc);
2523     if (fIntervalLow == 0) {
2524         insertOp(topOfBlock);
2525         fRXPat->fCompiledPat->setElementAt(saveOp, topOfBlock);
2526     }
2527 
2528 
2529 
2530     //  Loop, emitting the op for the thing being repeated each time.
2531     //    Loop starts at 1 because one instance of the op already exists in the pattern,
2532     //    it was put there when it was originally encountered.
2533     int32_t i;
2534     for (i=1; i<fIntervalUpper; i++ ) {
2535         if (i >= fIntervalLow) {
2536             appendOp(saveOp);
2537         }
2538         appendOp(op);
2539     }
2540     return TRUE;
2541 }
2542 
2543 
2544 
2545 //------------------------------------------------------------------------------
2546 //
2547 //   caseInsensitiveStart  given a single code point from a pattern string, determine the
2548 //                         set of characters that could potentially begin a case-insensitive
2549 //                         match of a string beginning with that character, using full Unicode
2550 //                         case insensitive matching.
2551 //
2552 //          This is used in optimizing find().
2553 //
2554 //          closeOver(USET_CASE_INSENSITIVE) does most of what is needed, but
2555 //          misses cases like this:
2556 //             A string from the pattern begins with 'ss' (although all we know
2557 //                 in this context is that it begins with 's')
2558 //             The pattern could match a string beginning with a German sharp-s
2559 //
2560 //           To the ordinary case closure for a character c, we add all other
2561 //           characters cx where the case closure of cx incudes a string form that begins
2562 //           with the original character c.
2563 //
2564 //           This function could be made smarter. The full pattern string is available
2565 //           and it would be possible to verify that the extra characters being added
2566 //           to the starting set fully match, rather than having just a first-char of the
2567 //           folded form match.
2568 //
2569 //------------------------------------------------------------------------------
findCaseInsensitiveStarters(UChar32 c,UnicodeSet * starterChars)2570 void  RegexCompile::findCaseInsensitiveStarters(UChar32 c, UnicodeSet *starterChars) {
2571 
2572 // Machine Generated below.
2573 // It may need updating with new versions of Unicode.
2574 // Intltest test RegexTest::TestCaseInsensitiveStarters will fail if an update is needed.
2575 // The update tool is here: svn+ssh://source.icu-project.org/repos/icu/tools/trunk/unicode/c/genregexcasing
2576 
2577 // Machine Generated Data. Do not hand edit.
2578     static const UChar32 RECaseFixCodePoints[] = {
2579         0x61, 0x66, 0x68, 0x69, 0x6a, 0x73, 0x74, 0x77, 0x79, 0x2bc,
2580         0x3ac, 0x3ae, 0x3b1, 0x3b7, 0x3b9, 0x3c1, 0x3c5, 0x3c9, 0x3ce, 0x565,
2581         0x574, 0x57e, 0x1f00, 0x1f01, 0x1f02, 0x1f03, 0x1f04, 0x1f05, 0x1f06, 0x1f07,
2582         0x1f20, 0x1f21, 0x1f22, 0x1f23, 0x1f24, 0x1f25, 0x1f26, 0x1f27, 0x1f60, 0x1f61,
2583         0x1f62, 0x1f63, 0x1f64, 0x1f65, 0x1f66, 0x1f67, 0x1f70, 0x1f74, 0x1f7c, 0x110000};
2584 
2585     static const int16_t RECaseFixStringOffsets[] = {
2586         0x0, 0x1, 0x6, 0x7, 0x8, 0x9, 0xd, 0xe, 0xf, 0x10,
2587         0x11, 0x12, 0x13, 0x17, 0x1b, 0x20, 0x21, 0x2a, 0x2e, 0x2f,
2588         0x30, 0x34, 0x35, 0x37, 0x39, 0x3b, 0x3d, 0x3f, 0x41, 0x43,
2589         0x45, 0x47, 0x49, 0x4b, 0x4d, 0x4f, 0x51, 0x53, 0x55, 0x57,
2590         0x59, 0x5b, 0x5d, 0x5f, 0x61, 0x63, 0x65, 0x66, 0x67, 0};
2591 
2592     static const int16_t RECaseFixCounts[] = {
2593         0x1, 0x5, 0x1, 0x1, 0x1, 0x4, 0x1, 0x1, 0x1, 0x1,
2594         0x1, 0x1, 0x4, 0x4, 0x5, 0x1, 0x9, 0x4, 0x1, 0x1,
2595         0x4, 0x1, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2,
2596         0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2,
2597         0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x1, 0x1, 0x1, 0};
2598 
2599     static const UChar RECaseFixData[] = {
2600         0x1e9a, 0xfb00, 0xfb01, 0xfb02, 0xfb03, 0xfb04, 0x1e96, 0x130, 0x1f0, 0xdf,
2601         0x1e9e, 0xfb05, 0xfb06, 0x1e97, 0x1e98, 0x1e99, 0x149, 0x1fb4, 0x1fc4, 0x1fb3,
2602         0x1fb6, 0x1fb7, 0x1fbc, 0x1fc3, 0x1fc6, 0x1fc7, 0x1fcc, 0x390, 0x1fd2, 0x1fd3,
2603         0x1fd6, 0x1fd7, 0x1fe4, 0x3b0, 0x1f50, 0x1f52, 0x1f54, 0x1f56, 0x1fe2, 0x1fe3,
2604         0x1fe6, 0x1fe7, 0x1ff3, 0x1ff6, 0x1ff7, 0x1ffc, 0x1ff4, 0x587, 0xfb13, 0xfb14,
2605         0xfb15, 0xfb17, 0xfb16, 0x1f80, 0x1f88, 0x1f81, 0x1f89, 0x1f82, 0x1f8a, 0x1f83,
2606         0x1f8b, 0x1f84, 0x1f8c, 0x1f85, 0x1f8d, 0x1f86, 0x1f8e, 0x1f87, 0x1f8f, 0x1f90,
2607         0x1f98, 0x1f91, 0x1f99, 0x1f92, 0x1f9a, 0x1f93, 0x1f9b, 0x1f94, 0x1f9c, 0x1f95,
2608         0x1f9d, 0x1f96, 0x1f9e, 0x1f97, 0x1f9f, 0x1fa0, 0x1fa8, 0x1fa1, 0x1fa9, 0x1fa2,
2609         0x1faa, 0x1fa3, 0x1fab, 0x1fa4, 0x1fac, 0x1fa5, 0x1fad, 0x1fa6, 0x1fae, 0x1fa7,
2610         0x1faf, 0x1fb2, 0x1fc2, 0x1ff2, 0};
2611 
2612 // End of machine generated data.
2613 
2614     if (c < UCHAR_MIN_VALUE || c > UCHAR_MAX_VALUE) {
2615         // This function should never be called with an invalid input character.
2616         UPRV_UNREACHABLE;
2617     } else if (u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) {
2618         UChar32 caseFoldedC  = u_foldCase(c, U_FOLD_CASE_DEFAULT);
2619         starterChars->set(caseFoldedC, caseFoldedC);
2620 
2621         int32_t i;
2622         for (i=0; RECaseFixCodePoints[i]<c ; i++) {
2623             // Simple linear search through the sorted list of interesting code points.
2624         }
2625 
2626         if (RECaseFixCodePoints[i] == c) {
2627             int32_t dataIndex = RECaseFixStringOffsets[i];
2628             int32_t numCharsToAdd = RECaseFixCounts[i];
2629             UChar32 cpToAdd = 0;
2630             for (int32_t j=0; j<numCharsToAdd; j++) {
2631                 U16_NEXT_UNSAFE(RECaseFixData, dataIndex, cpToAdd);
2632                 starterChars->add(cpToAdd);
2633             }
2634         }
2635 
2636         starterChars->closeOver(USET_CASE_INSENSITIVE);
2637         starterChars->removeAllStrings();
2638     } else {
2639         // Not a cased character. Just return it alone.
2640         starterChars->set(c, c);
2641     }
2642 }
2643 
2644 
2645 // Increment with overflow check.
2646 // val and delta will both be positive.
2647 
safeIncrement(int32_t val,int32_t delta)2648 static int32_t safeIncrement(int32_t val, int32_t delta) {
2649     if (INT32_MAX - val > delta) {
2650         return val + delta;
2651     } else {
2652         return INT32_MAX;
2653     }
2654 }
2655 
2656 
2657 //------------------------------------------------------------------------------
2658 //
2659 //   matchStartType    Determine how a match can start.
2660 //                     Used to optimize find() operations.
2661 //
2662 //                     Operation is very similar to minMatchLength().  Walk the compiled
2663 //                     pattern, keeping an on-going minimum-match-length.  For any
2664 //                     op where the min match coming in is zero, add that ops possible
2665 //                     starting matches to the possible starts for the overall pattern.
2666 //
2667 //------------------------------------------------------------------------------
matchStartType()2668 void   RegexCompile::matchStartType() {
2669     if (U_FAILURE(*fStatus)) {
2670         return;
2671     }
2672 
2673 
2674     int32_t    loc;                    // Location in the pattern of the current op being processed.
2675     int32_t    op;                     // The op being processed
2676     int32_t    opType;                 // The opcode type of the op
2677     int32_t    currentLen = 0;         // Minimum length of a match to this point (loc) in the pattern
2678     int32_t    numInitialStrings = 0;  // Number of strings encountered that could match at start.
2679 
2680     UBool      atStart = TRUE;         // True if no part of the pattern yet encountered
2681                                        //   could have advanced the position in a match.
2682                                        //   (Maximum match length so far == 0)
2683 
2684     // forwardedLength is a vector holding minimum-match-length values that
2685     //   are propagated forward in the pattern by JMP or STATE_SAVE operations.
2686     //   It must be one longer than the pattern being checked because some  ops
2687     //   will jmp to a end-of-block+1 location from within a block, and we must
2688     //   count those when checking the block.
2689     int32_t end = fRXPat->fCompiledPat->size();
2690     UVector32  forwardedLength(end+1, *fStatus);
2691     forwardedLength.setSize(end+1);
2692     for (loc=3; loc<end; loc++) {
2693         forwardedLength.setElementAt(INT32_MAX, loc);
2694     }
2695 
2696     for (loc = 3; loc<end; loc++) {
2697         op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
2698         opType = URX_TYPE(op);
2699 
2700         // The loop is advancing linearly through the pattern.
2701         // If the op we are now at was the destination of a branch in the pattern,
2702         // and that path has a shorter minimum length than the current accumulated value,
2703         // replace the current accumulated value.
2704         if (forwardedLength.elementAti(loc) < currentLen) {
2705             currentLen = forwardedLength.elementAti(loc);
2706             U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);
2707         }
2708 
2709         switch (opType) {
2710             // Ops that don't change the total length matched
2711         case URX_RESERVED_OP:
2712         case URX_END:
2713         case URX_FAIL:
2714         case URX_STRING_LEN:
2715         case URX_NOP:
2716         case URX_START_CAPTURE:
2717         case URX_END_CAPTURE:
2718         case URX_BACKSLASH_B:
2719         case URX_BACKSLASH_BU:
2720         case URX_BACKSLASH_G:
2721         case URX_BACKSLASH_Z:
2722         case URX_DOLLAR:
2723         case URX_DOLLAR_M:
2724         case URX_DOLLAR_D:
2725         case URX_DOLLAR_MD:
2726         case URX_RELOC_OPRND:
2727         case URX_STO_INP_LOC:
2728         case URX_BACKREF:         // BackRef.  Must assume that it might be a zero length match
2729         case URX_BACKREF_I:
2730 
2731         case URX_STO_SP:          // Setup for atomic or possessive blocks.  Doesn't change what can match.
2732         case URX_LD_SP:
2733             break;
2734 
2735         case URX_CARET:
2736             if (atStart) {
2737                 fRXPat->fStartType = START_START;
2738             }
2739             break;
2740 
2741         case URX_CARET_M:
2742         case URX_CARET_M_UNIX:
2743             if (atStart) {
2744                 fRXPat->fStartType = START_LINE;
2745             }
2746             break;
2747 
2748         case URX_ONECHAR:
2749             if (currentLen == 0) {
2750                 // This character could appear at the start of a match.
2751                 //   Add it to the set of possible starting characters.
2752                 fRXPat->fInitialChars->add(URX_VAL(op));
2753                 numInitialStrings += 2;
2754             }
2755             currentLen = safeIncrement(currentLen, 1);
2756             atStart = FALSE;
2757             break;
2758 
2759 
2760         case URX_SETREF:
2761             if (currentLen == 0) {
2762                 int32_t  sn = URX_VAL(op);
2763                 U_ASSERT(sn > 0 && sn < fRXPat->fSets->size());
2764                 const UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(sn);
2765                 fRXPat->fInitialChars->addAll(*s);
2766                 numInitialStrings += 2;
2767             }
2768             currentLen = safeIncrement(currentLen, 1);
2769             atStart = FALSE;
2770             break;
2771 
2772         case URX_LOOP_SR_I:
2773             // [Set]*, like a SETREF, above, in what it can match,
2774             //  but may not match at all, so currentLen is not incremented.
2775             if (currentLen == 0) {
2776                 int32_t  sn = URX_VAL(op);
2777                 U_ASSERT(sn > 0 && sn < fRXPat->fSets->size());
2778                 const UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(sn);
2779                 fRXPat->fInitialChars->addAll(*s);
2780                 numInitialStrings += 2;
2781             }
2782             atStart = FALSE;
2783             break;
2784 
2785         case URX_LOOP_DOT_I:
2786             if (currentLen == 0) {
2787                 // .* at the start of a pattern.
2788                 //    Any character can begin the match.
2789                 fRXPat->fInitialChars->clear();
2790                 fRXPat->fInitialChars->complement();
2791                 numInitialStrings += 2;
2792             }
2793             atStart = FALSE;
2794             break;
2795 
2796 
2797         case URX_STATIC_SETREF:
2798             if (currentLen == 0) {
2799                 int32_t  sn = URX_VAL(op);
2800                 U_ASSERT(sn>0 && sn<URX_LAST_SET);
2801                 const UnicodeSet *s = fRXPat->fStaticSets[sn];
2802                 fRXPat->fInitialChars->addAll(*s);
2803                 numInitialStrings += 2;
2804             }
2805             currentLen = safeIncrement(currentLen, 1);
2806             atStart = FALSE;
2807             break;
2808 
2809 
2810 
2811         case URX_STAT_SETREF_N:
2812             if (currentLen == 0) {
2813                 int32_t  sn = URX_VAL(op);
2814                 const UnicodeSet *s = fRXPat->fStaticSets[sn];
2815                 UnicodeSet sc(*s);
2816                 sc.complement();
2817                 fRXPat->fInitialChars->addAll(sc);
2818                 numInitialStrings += 2;
2819             }
2820             currentLen = safeIncrement(currentLen, 1);
2821             atStart = FALSE;
2822             break;
2823 
2824 
2825 
2826         case URX_BACKSLASH_D:
2827             // Digit Char
2828              if (currentLen == 0) {
2829                  UnicodeSet s;
2830                  s.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ND_MASK, *fStatus);
2831                  if (URX_VAL(op) != 0) {
2832                      s.complement();
2833                  }
2834                  fRXPat->fInitialChars->addAll(s);
2835                  numInitialStrings += 2;
2836             }
2837             currentLen = safeIncrement(currentLen, 1);
2838             atStart = FALSE;
2839             break;
2840 
2841 
2842         case URX_BACKSLASH_H:
2843             // Horiz white space
2844             if (currentLen == 0) {
2845                 UnicodeSet s;
2846                 s.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus);
2847                 s.add((UChar32)9);   // Tab
2848                 if (URX_VAL(op) != 0) {
2849                     s.complement();
2850                 }
2851                 fRXPat->fInitialChars->addAll(s);
2852                 numInitialStrings += 2;
2853             }
2854             currentLen = safeIncrement(currentLen, 1);
2855             atStart = FALSE;
2856             break;
2857 
2858 
2859         case URX_BACKSLASH_R:       // Any line ending sequence
2860         case URX_BACKSLASH_V:       // Any line ending code point, with optional negation
2861             if (currentLen == 0) {
2862                 UnicodeSet s;
2863                 s.add((UChar32)0x0a, (UChar32)0x0d);  // add range
2864                 s.add((UChar32)0x85);
2865                 s.add((UChar32)0x2028, (UChar32)0x2029);
2866                 if (URX_VAL(op) != 0) {
2867                      // Complement option applies to URX_BACKSLASH_V only.
2868                      s.complement();
2869                 }
2870                 fRXPat->fInitialChars->addAll(s);
2871                 numInitialStrings += 2;
2872             }
2873             currentLen = safeIncrement(currentLen, 1);
2874             atStart = FALSE;
2875             break;
2876 
2877 
2878 
2879         case URX_ONECHAR_I:
2880             // Case Insensitive Single Character.
2881             if (currentLen == 0) {
2882                 UChar32  c = URX_VAL(op);
2883                 if (u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) {
2884                     UnicodeSet starters(c, c);
2885                     starters.closeOver(USET_CASE_INSENSITIVE);
2886                     // findCaseInsensitiveStarters(c, &starters);
2887                     //   For ONECHAR_I, no need to worry about text chars that expand on folding into strings.
2888                     //   The expanded folding can't match the pattern.
2889                     fRXPat->fInitialChars->addAll(starters);
2890                 } else {
2891                     // Char has no case variants.  Just add it as-is to the
2892                     //   set of possible starting chars.
2893                     fRXPat->fInitialChars->add(c);
2894                 }
2895                 numInitialStrings += 2;
2896             }
2897             currentLen = safeIncrement(currentLen, 1);
2898             atStart = FALSE;
2899             break;
2900 
2901 
2902         case URX_BACKSLASH_X:   // Grahpeme Cluster.  Minimum is 1, max unbounded.
2903         case URX_DOTANY_ALL:    // . matches one or two.
2904         case URX_DOTANY:
2905         case URX_DOTANY_UNIX:
2906             if (currentLen == 0) {
2907                 // These constructs are all bad news when they appear at the start
2908                 //   of a match.  Any character can begin the match.
2909                 fRXPat->fInitialChars->clear();
2910                 fRXPat->fInitialChars->complement();
2911                 numInitialStrings += 2;
2912             }
2913             currentLen = safeIncrement(currentLen, 1);
2914             atStart = FALSE;
2915             break;
2916 
2917 
2918         case URX_JMPX:
2919             loc++;             // Except for extra operand on URX_JMPX, same as URX_JMP.
2920             U_FALLTHROUGH;
2921         case URX_JMP:
2922             {
2923                 int32_t  jmpDest = URX_VAL(op);
2924                 if (jmpDest < loc) {
2925                     // Loop of some kind.  Can safely ignore, the worst that will happen
2926                     //  is that we understate the true minimum length
2927                     currentLen = forwardedLength.elementAti(loc+1);
2928 
2929                 } else {
2930                     // Forward jump.  Propagate the current min length to the target loc of the jump.
2931                     U_ASSERT(jmpDest <= end+1);
2932                     if (forwardedLength.elementAti(jmpDest) > currentLen) {
2933                         forwardedLength.setElementAt(currentLen, jmpDest);
2934                     }
2935                 }
2936             }
2937             atStart = FALSE;
2938             break;
2939 
2940         case URX_JMP_SAV:
2941         case URX_JMP_SAV_X:
2942             // Combo of state save to the next loc, + jmp backwards.
2943             //   Net effect on min. length computation is nothing.
2944             atStart = FALSE;
2945             break;
2946 
2947         case URX_BACKTRACK:
2948             // Fails are kind of like a branch, except that the min length was
2949             //   propagated already, by the state save.
2950             currentLen = forwardedLength.elementAti(loc+1);
2951             atStart = FALSE;
2952             break;
2953 
2954 
2955         case URX_STATE_SAVE:
2956             {
2957                 // State Save, for forward jumps, propagate the current minimum.
2958                 //             of the state save.
2959                 int32_t  jmpDest = URX_VAL(op);
2960                 if (jmpDest > loc) {
2961                     if (currentLen < forwardedLength.elementAti(jmpDest)) {
2962                         forwardedLength.setElementAt(currentLen, jmpDest);
2963                     }
2964                 }
2965             }
2966             atStart = FALSE;
2967             break;
2968 
2969 
2970 
2971 
2972         case URX_STRING:
2973             {
2974                 loc++;
2975                 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
2976                 int32_t stringLen   = URX_VAL(stringLenOp);
2977                 U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN);
2978                 U_ASSERT(stringLenOp >= 2);
2979                 if (currentLen == 0) {
2980                     // Add the starting character of this string to the set of possible starting
2981                     //   characters for this pattern.
2982                     int32_t stringStartIdx = URX_VAL(op);
2983                     UChar32  c = fRXPat->fLiteralText.char32At(stringStartIdx);
2984                     fRXPat->fInitialChars->add(c);
2985 
2986                     // Remember this string.  After the entire pattern has been checked,
2987                     //  if nothing else is identified that can start a match, we'll use it.
2988                     numInitialStrings++;
2989                     fRXPat->fInitialStringIdx = stringStartIdx;
2990                     fRXPat->fInitialStringLen = stringLen;
2991                 }
2992 
2993                 currentLen = safeIncrement(currentLen, stringLen);
2994                 atStart = FALSE;
2995             }
2996             break;
2997 
2998         case URX_STRING_I:
2999             {
3000                 // Case-insensitive string.  Unlike exact-match strings, we won't
3001                 //   attempt a string search for possible match positions.  But we
3002                 //   do update the set of possible starting characters.
3003                 loc++;
3004                 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3005                 int32_t stringLen   = URX_VAL(stringLenOp);
3006                 U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN);
3007                 U_ASSERT(stringLenOp >= 2);
3008                 if (currentLen == 0) {
3009                     // Add the starting character of this string to the set of possible starting
3010                     //   characters for this pattern.
3011                     int32_t stringStartIdx = URX_VAL(op);
3012                     UChar32  c = fRXPat->fLiteralText.char32At(stringStartIdx);
3013                     UnicodeSet s;
3014                     findCaseInsensitiveStarters(c, &s);
3015                     fRXPat->fInitialChars->addAll(s);
3016                     numInitialStrings += 2;  // Matching on an initial string not possible.
3017                 }
3018                 currentLen = safeIncrement(currentLen, stringLen);
3019                 atStart = FALSE;
3020             }
3021             break;
3022 
3023         case URX_CTR_INIT:
3024         case URX_CTR_INIT_NG:
3025             {
3026                 // Loop Init Ops.  These don't change the min length, but they are 4 word ops
3027                 //   so location must be updated accordingly.
3028                 // Loop Init Ops.
3029                 //   If the min loop count == 0
3030                 //      move loc forwards to the end of the loop, skipping over the body.
3031                 //   If the min count is > 0,
3032                 //      continue normal processing of the body of the loop.
3033                 int32_t loopEndLoc   = (int32_t)fRXPat->fCompiledPat->elementAti(loc+1);
3034                         loopEndLoc   = URX_VAL(loopEndLoc);
3035                 int32_t minLoopCount = (int32_t)fRXPat->fCompiledPat->elementAti(loc+2);
3036                 if (minLoopCount == 0) {
3037                     // Min Loop Count of 0, treat like a forward branch and
3038                     //   move the current minimum length up to the target
3039                     //   (end of loop) location.
3040                     U_ASSERT(loopEndLoc <= end+1);
3041                     if (forwardedLength.elementAti(loopEndLoc) > currentLen) {
3042                         forwardedLength.setElementAt(currentLen, loopEndLoc);
3043                     }
3044                 }
3045                 loc+=3;  // Skips over operands of CTR_INIT
3046             }
3047             atStart = FALSE;
3048             break;
3049 
3050 
3051         case URX_CTR_LOOP:
3052         case URX_CTR_LOOP_NG:
3053             // Loop ops.
3054             //  The jump is conditional, backwards only.
3055             atStart = FALSE;
3056             break;
3057 
3058         case URX_LOOP_C:
3059             // More loop ops.  These state-save to themselves.
3060             //   don't change the minimum match
3061             atStart = FALSE;
3062             break;
3063 
3064 
3065         case URX_LA_START:
3066         case URX_LB_START:
3067             {
3068                 // Look-around.  Scan forward until the matching look-ahead end,
3069                 //   without processing the look-around block.  This is overly pessimistic.
3070 
3071                 // Keep track of the nesting depth of look-around blocks.  Boilerplate code for
3072                 //   lookahead contains two LA_END instructions, so count goes up by two
3073                 //   for each LA_START.
3074                 int32_t  depth = (opType == URX_LA_START? 2: 1);
3075                 for (;;) {
3076                     loc++;
3077                     op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3078                     if (URX_TYPE(op) == URX_LA_START) {
3079                         depth+=2;
3080                     }
3081                     if (URX_TYPE(op) == URX_LB_START) {
3082                         depth++;
3083                     }
3084                     if (URX_TYPE(op) == URX_LA_END || URX_TYPE(op)==URX_LBN_END) {
3085                         depth--;
3086                         if (depth == 0) {
3087                             break;
3088                         }
3089                     }
3090                     if (URX_TYPE(op) == URX_STATE_SAVE) {
3091                         // Need this because neg lookahead blocks will FAIL to outside
3092                         //   of the block.
3093                         int32_t  jmpDest = URX_VAL(op);
3094                         if (jmpDest > loc) {
3095                             if (currentLen < forwardedLength.elementAti(jmpDest)) {
3096                                 forwardedLength.setElementAt(currentLen, jmpDest);
3097                             }
3098                         }
3099                     }
3100                     U_ASSERT(loc <= end);
3101                 }
3102             }
3103             break;
3104 
3105         case URX_LA_END:
3106         case URX_LB_CONT:
3107         case URX_LB_END:
3108         case URX_LBN_CONT:
3109         case URX_LBN_END:
3110             UPRV_UNREACHABLE;     // Shouldn't get here.  These ops should be
3111                                  //  consumed by the scan in URX_LA_START and LB_START
3112         default:
3113             UPRV_UNREACHABLE;
3114             }
3115 
3116         }
3117 
3118 
3119     // We have finished walking through the ops.  Check whether some forward jump
3120     //   propagated a shorter length to location end+1.
3121     if (forwardedLength.elementAti(end+1) < currentLen) {
3122         currentLen = forwardedLength.elementAti(end+1);
3123     }
3124 
3125 
3126     fRXPat->fInitialChars8->init(fRXPat->fInitialChars);
3127 
3128 
3129     // Sort out what we should check for when looking for candidate match start positions.
3130     // In order of preference,
3131     //     1.   Start of input text buffer.
3132     //     2.   A literal string.
3133     //     3.   Start of line in multi-line mode.
3134     //     4.   A single literal character.
3135     //     5.   A character from a set of characters.
3136     //
3137     if (fRXPat->fStartType == START_START) {
3138         // Match only at the start of an input text string.
3139         //    start type is already set.  We're done.
3140     } else if (numInitialStrings == 1 && fRXPat->fMinMatchLen > 0) {
3141         // Match beginning only with a literal string.
3142         UChar32  c = fRXPat->fLiteralText.char32At(fRXPat->fInitialStringIdx);
3143         U_ASSERT(fRXPat->fInitialChars->contains(c));
3144         fRXPat->fStartType   = START_STRING;
3145         fRXPat->fInitialChar = c;
3146     } else if (fRXPat->fStartType == START_LINE) {
3147         // Match at start of line in Multi-Line mode.
3148         // Nothing to do here; everything is already set.
3149     } else if (fRXPat->fMinMatchLen == 0) {
3150         // Zero length match possible.  We could start anywhere.
3151         fRXPat->fStartType = START_NO_INFO;
3152     } else if (fRXPat->fInitialChars->size() == 1) {
3153         // All matches begin with the same char.
3154         fRXPat->fStartType   = START_CHAR;
3155         fRXPat->fInitialChar = fRXPat->fInitialChars->charAt(0);
3156         U_ASSERT(fRXPat->fInitialChar != (UChar32)-1);
3157     } else if (fRXPat->fInitialChars->contains((UChar32)0, (UChar32)0x10ffff) == FALSE &&
3158         fRXPat->fMinMatchLen > 0) {
3159         // Matches start with a set of character smaller than the set of all chars.
3160         fRXPat->fStartType = START_SET;
3161     } else {
3162         // Matches can start with anything
3163         fRXPat->fStartType = START_NO_INFO;
3164     }
3165 
3166     return;
3167 }
3168 
3169 
3170 
3171 //------------------------------------------------------------------------------
3172 //
3173 //   minMatchLength    Calculate the length of the shortest string that could
3174 //                     match the specified pattern.
3175 //                     Length is in 16 bit code units, not code points.
3176 //
3177 //                     The calculated length may not be exact.  The returned
3178 //                     value may be shorter than the actual minimum; it must
3179 //                     never be longer.
3180 //
3181 //                     start and end are the range of p-code operations to be
3182 //                     examined.  The endpoints are included in the range.
3183 //
3184 //------------------------------------------------------------------------------
minMatchLength(int32_t start,int32_t end)3185 int32_t   RegexCompile::minMatchLength(int32_t start, int32_t end) {
3186     if (U_FAILURE(*fStatus)) {
3187         return 0;
3188     }
3189 
3190     U_ASSERT(start <= end);
3191     U_ASSERT(end < fRXPat->fCompiledPat->size());
3192 
3193 
3194     int32_t    loc;
3195     int32_t    op;
3196     int32_t    opType;
3197     int32_t    currentLen = 0;
3198 
3199 
3200     // forwardedLength is a vector holding minimum-match-length values that
3201     //   are propagated forward in the pattern by JMP or STATE_SAVE operations.
3202     //   It must be one longer than the pattern being checked because some  ops
3203     //   will jmp to a end-of-block+1 location from within a block, and we must
3204     //   count those when checking the block.
3205     UVector32  forwardedLength(end+2, *fStatus);
3206     forwardedLength.setSize(end+2);
3207     for (loc=start; loc<=end+1; loc++) {
3208         forwardedLength.setElementAt(INT32_MAX, loc);
3209     }
3210 
3211     for (loc = start; loc<=end; loc++) {
3212         op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3213         opType = URX_TYPE(op);
3214 
3215         // The loop is advancing linearly through the pattern.
3216         // If the op we are now at was the destination of a branch in the pattern,
3217         // and that path has a shorter minimum length than the current accumulated value,
3218         // replace the current accumulated value.
3219         // U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);  // MinLength == INT32_MAX for some
3220                                                                //   no-match-possible cases.
3221         if (forwardedLength.elementAti(loc) < currentLen) {
3222             currentLen = forwardedLength.elementAti(loc);
3223             U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);
3224         }
3225 
3226         switch (opType) {
3227             // Ops that don't change the total length matched
3228         case URX_RESERVED_OP:
3229         case URX_END:
3230         case URX_STRING_LEN:
3231         case URX_NOP:
3232         case URX_START_CAPTURE:
3233         case URX_END_CAPTURE:
3234         case URX_BACKSLASH_B:
3235         case URX_BACKSLASH_BU:
3236         case URX_BACKSLASH_G:
3237         case URX_BACKSLASH_Z:
3238         case URX_CARET:
3239         case URX_DOLLAR:
3240         case URX_DOLLAR_M:
3241         case URX_DOLLAR_D:
3242         case URX_DOLLAR_MD:
3243         case URX_RELOC_OPRND:
3244         case URX_STO_INP_LOC:
3245         case URX_CARET_M:
3246         case URX_CARET_M_UNIX:
3247         case URX_BACKREF:         // BackRef.  Must assume that it might be a zero length match
3248         case URX_BACKREF_I:
3249 
3250         case URX_STO_SP:          // Setup for atomic or possessive blocks.  Doesn't change what can match.
3251         case URX_LD_SP:
3252 
3253         case URX_JMP_SAV:
3254         case URX_JMP_SAV_X:
3255             break;
3256 
3257 
3258             // Ops that match a minimum of one character (one or two 16 bit code units.)
3259             //
3260         case URX_ONECHAR:
3261         case URX_STATIC_SETREF:
3262         case URX_STAT_SETREF_N:
3263         case URX_SETREF:
3264         case URX_BACKSLASH_D:
3265         case URX_BACKSLASH_H:
3266         case URX_BACKSLASH_R:
3267         case URX_BACKSLASH_V:
3268         case URX_ONECHAR_I:
3269         case URX_BACKSLASH_X:   // Grahpeme Cluster.  Minimum is 1, max unbounded.
3270         case URX_DOTANY_ALL:    // . matches one or two.
3271         case URX_DOTANY:
3272         case URX_DOTANY_UNIX:
3273             currentLen = safeIncrement(currentLen, 1);
3274             break;
3275 
3276 
3277         case URX_JMPX:
3278             loc++;              // URX_JMPX has an extra operand, ignored here,
3279                                 //   otherwise processed identically to URX_JMP.
3280             U_FALLTHROUGH;
3281         case URX_JMP:
3282             {
3283                 int32_t  jmpDest = URX_VAL(op);
3284                 if (jmpDest < loc) {
3285                     // Loop of some kind.  Can safely ignore, the worst that will happen
3286                     //  is that we understate the true minimum length
3287                     currentLen = forwardedLength.elementAti(loc+1);
3288                 } else {
3289                     // Forward jump.  Propagate the current min length to the target loc of the jump.
3290                     U_ASSERT(jmpDest <= end+1);
3291                     if (forwardedLength.elementAti(jmpDest) > currentLen) {
3292                         forwardedLength.setElementAt(currentLen, jmpDest);
3293                     }
3294                 }
3295             }
3296             break;
3297 
3298         case URX_BACKTRACK:
3299             {
3300                 // Back-tracks are kind of like a branch, except that the min length was
3301                 //   propagated already, by the state save.
3302                 currentLen = forwardedLength.elementAti(loc+1);
3303             }
3304             break;
3305 
3306 
3307         case URX_STATE_SAVE:
3308             {
3309                 // State Save, for forward jumps, propagate the current minimum.
3310                 //             of the state save.
3311                 int32_t  jmpDest = URX_VAL(op);
3312                 if (jmpDest > loc) {
3313                     if (currentLen < forwardedLength.elementAti(jmpDest)) {
3314                         forwardedLength.setElementAt(currentLen, jmpDest);
3315                     }
3316                 }
3317             }
3318             break;
3319 
3320 
3321         case URX_STRING:
3322             {
3323                 loc++;
3324                 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3325                 currentLen = safeIncrement(currentLen, URX_VAL(stringLenOp));
3326             }
3327             break;
3328 
3329 
3330         case URX_STRING_I:
3331             {
3332                 loc++;
3333                 // TODO: with full case folding, matching input text may be shorter than
3334                 //       the string we have here.  More smarts could put some bounds on it.
3335                 //       Assume a min length of one for now.  A min length of zero causes
3336                 //        optimization failures for a pattern like "string"+
3337                 // currentLen += URX_VAL(stringLenOp);
3338                 currentLen = safeIncrement(currentLen, 1);
3339             }
3340             break;
3341 
3342         case URX_CTR_INIT:
3343         case URX_CTR_INIT_NG:
3344             {
3345                 // Loop Init Ops.
3346                 //   If the min loop count == 0
3347                 //      move loc forwards to the end of the loop, skipping over the body.
3348                 //   If the min count is > 0,
3349                 //      continue normal processing of the body of the loop.
3350                 int32_t loopEndLoc   = (int32_t)fRXPat->fCompiledPat->elementAti(loc+1);
3351                         loopEndLoc   = URX_VAL(loopEndLoc);
3352                 int32_t minLoopCount = (int32_t)fRXPat->fCompiledPat->elementAti(loc+2);
3353                 if (minLoopCount == 0) {
3354                     loc = loopEndLoc;
3355                 } else {
3356                     loc+=3;  // Skips over operands of CTR_INIT
3357                 }
3358             }
3359             break;
3360 
3361 
3362         case URX_CTR_LOOP:
3363         case URX_CTR_LOOP_NG:
3364             // Loop ops.
3365             //  The jump is conditional, backwards only.
3366             break;
3367 
3368         case URX_LOOP_SR_I:
3369         case URX_LOOP_DOT_I:
3370         case URX_LOOP_C:
3371             // More loop ops.  These state-save to themselves.
3372             //   don't change the minimum match - could match nothing at all.
3373             break;
3374 
3375 
3376         case URX_LA_START:
3377         case URX_LB_START:
3378             {
3379                 // Look-around.  Scan forward until the matching look-ahead end,
3380                 //   without processing the look-around block.  This is overly pessimistic for look-ahead,
3381                 //   it assumes that the look-ahead match might be zero-length.
3382                 //   TODO:  Positive lookahead could recursively do the block, then continue
3383                 //          with the longer of the block or the value coming in.  Ticket 6060
3384                 int32_t  depth = (opType == URX_LA_START? 2: 1);;
3385                 for (;;) {
3386                     loc++;
3387                     op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3388                     if (URX_TYPE(op) == URX_LA_START) {
3389                         // The boilerplate for look-ahead includes two LA_END insturctions,
3390                         //    Depth will be decremented by each one when it is seen.
3391                         depth += 2;
3392                     }
3393                     if (URX_TYPE(op) == URX_LB_START) {
3394                         depth++;
3395                     }
3396                     if (URX_TYPE(op) == URX_LA_END) {
3397                         depth--;
3398                         if (depth == 0) {
3399                             break;
3400                         }
3401                     }
3402                     if (URX_TYPE(op)==URX_LBN_END) {
3403                         depth--;
3404                         if (depth == 0) {
3405                             break;
3406                         }
3407                     }
3408                     if (URX_TYPE(op) == URX_STATE_SAVE) {
3409                         // Need this because neg lookahead blocks will FAIL to outside
3410                         //   of the block.
3411                         int32_t  jmpDest = URX_VAL(op);
3412                         if (jmpDest > loc) {
3413                             if (currentLen < forwardedLength.elementAti(jmpDest)) {
3414                                 forwardedLength.setElementAt(currentLen, jmpDest);
3415                             }
3416                         }
3417                     }
3418                     U_ASSERT(loc <= end);
3419                 }
3420             }
3421             break;
3422 
3423         case URX_LA_END:
3424         case URX_LB_CONT:
3425         case URX_LB_END:
3426         case URX_LBN_CONT:
3427         case URX_LBN_END:
3428             // Only come here if the matching URX_LA_START or URX_LB_START was not in the
3429             //   range being sized, which happens when measuring size of look-behind blocks.
3430             break;
3431 
3432         default:
3433             UPRV_UNREACHABLE;
3434             }
3435 
3436         }
3437 
3438     // We have finished walking through the ops.  Check whether some forward jump
3439     //   propagated a shorter length to location end+1.
3440     if (forwardedLength.elementAti(end+1) < currentLen) {
3441         currentLen = forwardedLength.elementAti(end+1);
3442         U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);
3443     }
3444 
3445     return currentLen;
3446 }
3447 
3448 //------------------------------------------------------------------------------
3449 //
3450 //   maxMatchLength    Calculate the length of the longest string that could
3451 //                     match the specified pattern.
3452 //                     Length is in 16 bit code units, not code points.
3453 //
3454 //                     The calculated length may not be exact.  The returned
3455 //                     value may be longer than the actual maximum; it must
3456 //                     never be shorter.
3457 //
3458 //------------------------------------------------------------------------------
maxMatchLength(int32_t start,int32_t end)3459 int32_t   RegexCompile::maxMatchLength(int32_t start, int32_t end) {
3460     if (U_FAILURE(*fStatus)) {
3461         return 0;
3462     }
3463     U_ASSERT(start <= end);
3464     U_ASSERT(end < fRXPat->fCompiledPat->size());
3465 
3466 
3467     int32_t    loc;
3468     int32_t    op;
3469     int32_t    opType;
3470     int32_t    currentLen = 0;
3471     UVector32  forwardedLength(end+1, *fStatus);
3472     forwardedLength.setSize(end+1);
3473 
3474     for (loc=start; loc<=end; loc++) {
3475         forwardedLength.setElementAt(0, loc);
3476     }
3477 
3478     for (loc = start; loc<=end; loc++) {
3479         op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3480         opType = URX_TYPE(op);
3481 
3482         // The loop is advancing linearly through the pattern.
3483         // If the op we are now at was the destination of a branch in the pattern,
3484         // and that path has a longer maximum length than the current accumulated value,
3485         // replace the current accumulated value.
3486         if (forwardedLength.elementAti(loc) > currentLen) {
3487             currentLen = forwardedLength.elementAti(loc);
3488         }
3489 
3490         switch (opType) {
3491             // Ops that don't change the total length matched
3492         case URX_RESERVED_OP:
3493         case URX_END:
3494         case URX_STRING_LEN:
3495         case URX_NOP:
3496         case URX_START_CAPTURE:
3497         case URX_END_CAPTURE:
3498         case URX_BACKSLASH_B:
3499         case URX_BACKSLASH_BU:
3500         case URX_BACKSLASH_G:
3501         case URX_BACKSLASH_Z:
3502         case URX_CARET:
3503         case URX_DOLLAR:
3504         case URX_DOLLAR_M:
3505         case URX_DOLLAR_D:
3506         case URX_DOLLAR_MD:
3507         case URX_RELOC_OPRND:
3508         case URX_STO_INP_LOC:
3509         case URX_CARET_M:
3510         case URX_CARET_M_UNIX:
3511 
3512         case URX_STO_SP:          // Setup for atomic or possessive blocks.  Doesn't change what can match.
3513         case URX_LD_SP:
3514 
3515         case URX_LB_END:
3516         case URX_LB_CONT:
3517         case URX_LBN_CONT:
3518         case URX_LBN_END:
3519             break;
3520 
3521 
3522             // Ops that increase that cause an unbounded increase in the length
3523             //   of a matched string, or that increase it a hard to characterize way.
3524             //   Call the max length unbounded, and stop further checking.
3525         case URX_BACKREF:         // BackRef.  Must assume that it might be a zero length match
3526         case URX_BACKREF_I:
3527         case URX_BACKSLASH_X:   // Grahpeme Cluster.  Minimum is 1, max unbounded.
3528             currentLen = INT32_MAX;
3529             break;
3530 
3531 
3532             // Ops that match a max of one character (possibly two 16 bit code units.)
3533             //
3534         case URX_STATIC_SETREF:
3535         case URX_STAT_SETREF_N:
3536         case URX_SETREF:
3537         case URX_BACKSLASH_D:
3538         case URX_BACKSLASH_H:
3539         case URX_BACKSLASH_R:
3540         case URX_BACKSLASH_V:
3541         case URX_ONECHAR_I:
3542         case URX_DOTANY_ALL:
3543         case URX_DOTANY:
3544         case URX_DOTANY_UNIX:
3545             currentLen = safeIncrement(currentLen, 2);
3546             break;
3547 
3548             // Single literal character.  Increase current max length by one or two,
3549             //       depending on whether the char is in the supplementary range.
3550         case URX_ONECHAR:
3551             currentLen = safeIncrement(currentLen, 1);
3552             if (URX_VAL(op) > 0x10000) {
3553                 currentLen = safeIncrement(currentLen, 1);
3554             }
3555             break;
3556 
3557             // Jumps.
3558             //
3559         case URX_JMP:
3560         case URX_JMPX:
3561         case URX_JMP_SAV:
3562         case URX_JMP_SAV_X:
3563             {
3564                 int32_t  jmpDest = URX_VAL(op);
3565                 if (jmpDest < loc) {
3566                     // Loop of some kind.  Max match length is unbounded.
3567                     currentLen = INT32_MAX;
3568                 } else {
3569                     // Forward jump.  Propagate the current min length to the target loc of the jump.
3570                     if (forwardedLength.elementAti(jmpDest) < currentLen) {
3571                         forwardedLength.setElementAt(currentLen, jmpDest);
3572                     }
3573                     currentLen = 0;
3574                 }
3575             }
3576             break;
3577 
3578         case URX_BACKTRACK:
3579             // back-tracks are kind of like a branch, except that the max length was
3580             //   propagated already, by the state save.
3581             currentLen = forwardedLength.elementAti(loc+1);
3582             break;
3583 
3584 
3585         case URX_STATE_SAVE:
3586             {
3587                 // State Save, for forward jumps, propagate the current minimum.
3588                 //               of the state save.
3589                 //             For backwards jumps, they create a loop, maximum
3590                 //               match length is unbounded.
3591                 int32_t  jmpDest = URX_VAL(op);
3592                 if (jmpDest > loc) {
3593                     if (currentLen > forwardedLength.elementAti(jmpDest)) {
3594                         forwardedLength.setElementAt(currentLen, jmpDest);
3595                     }
3596                 } else {
3597                     currentLen = INT32_MAX;
3598                 }
3599             }
3600             break;
3601 
3602 
3603 
3604 
3605         case URX_STRING:
3606             {
3607                 loc++;
3608                 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3609                 currentLen = safeIncrement(currentLen, URX_VAL(stringLenOp));
3610                 break;
3611             }
3612 
3613         case URX_STRING_I:
3614             // TODO:  This code assumes that any user string that matches will be no longer
3615             //        than our compiled string, with case insensitive matching.
3616             //        Our compiled string has been case-folded already.
3617             //
3618             //        Any matching user string will have no more code points than our
3619             //        compiled (folded) string.  Folding may add code points, but
3620             //        not remove them.
3621             //
3622             //        There is a potential problem if a supplemental code point
3623             //        case-folds to a BMP code point.  In this case our compiled string
3624             //        could be shorter (in code units) than a matching user string.
3625             //
3626             //        At this time (Unicode 6.1) there are no such characters, and this case
3627             //        is not being handled.  A test, intltest regex/Bug9283, will fail if
3628             //        any problematic characters are added to Unicode.
3629             //
3630             //        If this happens, we can make a set of the BMP chars that the
3631             //        troublesome supplementals fold to, scan our string, and bump the
3632             //        currentLen one extra for each that is found.
3633             //
3634             {
3635                 loc++;
3636                 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3637                 currentLen = safeIncrement(currentLen, URX_VAL(stringLenOp));
3638             }
3639             break;
3640 
3641         case URX_CTR_INIT:
3642         case URX_CTR_INIT_NG:
3643             // For Loops, recursively call this function on the pattern for the loop body,
3644             //   then multiply the result by the maximum loop count.
3645             {
3646                 int32_t  loopEndLoc = URX_VAL(fRXPat->fCompiledPat->elementAti(loc+1));
3647                 if (loopEndLoc == loc+4) {
3648                     // Loop has an empty body. No affect on max match length.
3649                     // Continue processing with code after the loop end.
3650                     loc = loopEndLoc;
3651                     break;
3652                 }
3653 
3654                 int32_t maxLoopCount = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(loc+3));
3655                 if (maxLoopCount == -1) {
3656                     // Unbounded Loop. No upper bound on match length.
3657                     currentLen = INT32_MAX;
3658                     break;
3659                 }
3660 
3661                 U_ASSERT(loopEndLoc >= loc+4);
3662                 int64_t blockLen = maxMatchLength(loc+4, loopEndLoc-1);  // Recursive call.
3663                 int64_t updatedLen = (int64_t)currentLen + blockLen * maxLoopCount;
3664                 if (updatedLen >= INT32_MAX) {
3665                     currentLen = INT32_MAX;
3666                     break;
3667                 }
3668                 currentLen = (int32_t)updatedLen;
3669                 loc = loopEndLoc;
3670                 break;
3671             }
3672 
3673         case URX_CTR_LOOP:
3674         case URX_CTR_LOOP_NG:
3675             // These opcodes will be skipped over by code for URX_CRT_INIT.
3676             // We shouldn't encounter them here.
3677             UPRV_UNREACHABLE;
3678 
3679         case URX_LOOP_SR_I:
3680         case URX_LOOP_DOT_I:
3681         case URX_LOOP_C:
3682             // For anything to do with loops, make the match length unbounded.
3683             currentLen = INT32_MAX;
3684             break;
3685 
3686 
3687 
3688         case URX_LA_START:
3689         case URX_LA_END:
3690             // Look-ahead.  Just ignore, treat the look-ahead block as if
3691             // it were normal pattern.  Gives a too-long match length,
3692             //  but good enough for now.
3693             break;
3694 
3695             // End of look-ahead ops should always be consumed by the processing at
3696             //  the URX_LA_START op.
3697             // UPRV_UNREACHABLE;
3698 
3699         case URX_LB_START:
3700             {
3701                 // Look-behind.  Scan forward until the matching look-around end,
3702                 //   without processing the look-behind block.
3703                 int32_t  depth = 0;
3704                 for (;;) {
3705                     loc++;
3706                     op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3707                     if (URX_TYPE(op) == URX_LA_START || URX_TYPE(op) == URX_LB_START) {
3708                         depth++;
3709                     }
3710                     if (URX_TYPE(op) == URX_LA_END || URX_TYPE(op)==URX_LBN_END) {
3711                         if (depth == 0) {
3712                             break;
3713                         }
3714                         depth--;
3715                     }
3716                     U_ASSERT(loc < end);
3717                 }
3718             }
3719             break;
3720 
3721         default:
3722             UPRV_UNREACHABLE;
3723         }
3724 
3725 
3726         if (currentLen == INT32_MAX) {
3727             //  The maximum length is unbounded.
3728             //  Stop further processing of the pattern.
3729             break;
3730         }
3731 
3732     }
3733     return currentLen;
3734 
3735 }
3736 
3737 
3738 //------------------------------------------------------------------------------
3739 //
3740 //   stripNOPs    Remove any NOP operations from the compiled pattern code.
3741 //                Extra NOPs are inserted for some constructs during the initial
3742 //                code generation to provide locations that may be patched later.
3743 //                Many end up unneeded, and are removed by this function.
3744 //
3745 //                In order to minimize the number of passes through the pattern,
3746 //                back-reference fixup is also performed here (adjusting
3747 //                back-reference operands to point to the correct frame offsets).
3748 //
3749 //------------------------------------------------------------------------------
stripNOPs()3750 void RegexCompile::stripNOPs() {
3751 
3752     if (U_FAILURE(*fStatus)) {
3753         return;
3754     }
3755 
3756     int32_t    end = fRXPat->fCompiledPat->size();
3757     UVector32  deltas(end, *fStatus);
3758 
3759     // Make a first pass over the code, computing the amount that things
3760     //   will be offset at each location in the original code.
3761     int32_t   loc;
3762     int32_t   d = 0;
3763     for (loc=0; loc<end; loc++) {
3764         deltas.addElement(d, *fStatus);
3765         int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3766         if (URX_TYPE(op) == URX_NOP) {
3767             d++;
3768         }
3769     }
3770 
3771     UnicodeString caseStringBuffer;
3772 
3773     // Make a second pass over the code, removing the NOPs by moving following
3774     //  code up, and patching operands that refer to code locations that
3775     //  are being moved.  The array of offsets from the first step is used
3776     //  to compute the new operand values.
3777     int32_t src;
3778     int32_t dst = 0;
3779     for (src=0; src<end; src++) {
3780         int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(src);
3781         int32_t opType = URX_TYPE(op);
3782         switch (opType) {
3783         case URX_NOP:
3784             break;
3785 
3786         case URX_STATE_SAVE:
3787         case URX_JMP:
3788         case URX_CTR_LOOP:
3789         case URX_CTR_LOOP_NG:
3790         case URX_RELOC_OPRND:
3791         case URX_JMPX:
3792         case URX_JMP_SAV:
3793         case URX_JMP_SAV_X:
3794             // These are instructions with operands that refer to code locations.
3795             {
3796                 int32_t  operandAddress = URX_VAL(op);
3797                 U_ASSERT(operandAddress>=0 && operandAddress<deltas.size());
3798                 int32_t fixedOperandAddress = operandAddress - deltas.elementAti(operandAddress);
3799                 op = buildOp(opType, fixedOperandAddress);
3800                 fRXPat->fCompiledPat->setElementAt(op, dst);
3801                 dst++;
3802                 break;
3803             }
3804 
3805         case URX_BACKREF:
3806         case URX_BACKREF_I:
3807             {
3808                 int32_t where = URX_VAL(op);
3809                 if (where > fRXPat->fGroupMap->size()) {
3810                     error(U_REGEX_INVALID_BACK_REF);
3811                     break;
3812                 }
3813                 where = fRXPat->fGroupMap->elementAti(where-1);
3814                 op    = buildOp(opType, where);
3815                 fRXPat->fCompiledPat->setElementAt(op, dst);
3816                 dst++;
3817 
3818                 fRXPat->fNeedsAltInput = TRUE;
3819                 break;
3820             }
3821         case URX_RESERVED_OP:
3822         case URX_RESERVED_OP_N:
3823         case URX_BACKTRACK:
3824         case URX_END:
3825         case URX_ONECHAR:
3826         case URX_STRING:
3827         case URX_STRING_LEN:
3828         case URX_START_CAPTURE:
3829         case URX_END_CAPTURE:
3830         case URX_STATIC_SETREF:
3831         case URX_STAT_SETREF_N:
3832         case URX_SETREF:
3833         case URX_DOTANY:
3834         case URX_FAIL:
3835         case URX_BACKSLASH_B:
3836         case URX_BACKSLASH_BU:
3837         case URX_BACKSLASH_G:
3838         case URX_BACKSLASH_X:
3839         case URX_BACKSLASH_Z:
3840         case URX_DOTANY_ALL:
3841         case URX_BACKSLASH_D:
3842         case URX_CARET:
3843         case URX_DOLLAR:
3844         case URX_CTR_INIT:
3845         case URX_CTR_INIT_NG:
3846         case URX_DOTANY_UNIX:
3847         case URX_STO_SP:
3848         case URX_LD_SP:
3849         case URX_STO_INP_LOC:
3850         case URX_LA_START:
3851         case URX_LA_END:
3852         case URX_ONECHAR_I:
3853         case URX_STRING_I:
3854         case URX_DOLLAR_M:
3855         case URX_CARET_M:
3856         case URX_CARET_M_UNIX:
3857         case URX_LB_START:
3858         case URX_LB_CONT:
3859         case URX_LB_END:
3860         case URX_LBN_CONT:
3861         case URX_LBN_END:
3862         case URX_LOOP_SR_I:
3863         case URX_LOOP_DOT_I:
3864         case URX_LOOP_C:
3865         case URX_DOLLAR_D:
3866         case URX_DOLLAR_MD:
3867         case URX_BACKSLASH_H:
3868         case URX_BACKSLASH_R:
3869         case URX_BACKSLASH_V:
3870             // These instructions are unaltered by the relocation.
3871             fRXPat->fCompiledPat->setElementAt(op, dst);
3872             dst++;
3873             break;
3874 
3875         default:
3876             // Some op is unaccounted for.
3877             UPRV_UNREACHABLE;
3878         }
3879     }
3880 
3881     fRXPat->fCompiledPat->setSize(dst);
3882 }
3883 
3884 
3885 
3886 
3887 //------------------------------------------------------------------------------
3888 //
3889 //  Error         Report a rule parse error.
3890 //                Only report it if no previous error has been recorded.
3891 //
3892 //------------------------------------------------------------------------------
error(UErrorCode e)3893 void RegexCompile::error(UErrorCode e) {
3894     if (U_SUCCESS(*fStatus) || e == U_MEMORY_ALLOCATION_ERROR) {
3895         *fStatus = e;
3896         // Hmm. fParseErr (UParseError) line & offset fields are int32_t in public
3897         // API (see common/unicode/parseerr.h), while fLineNum and fCharNum are
3898         // int64_t. If the values of the latter are out of range for the former,
3899         // set them to the appropriate "field not supported" values.
3900         if (fLineNum > 0x7FFFFFFF) {
3901             fParseErr->line   = 0;
3902             fParseErr->offset = -1;
3903         } else if (fCharNum > 0x7FFFFFFF) {
3904             fParseErr->line   = (int32_t)fLineNum;
3905             fParseErr->offset = -1;
3906         } else {
3907             fParseErr->line   = (int32_t)fLineNum;
3908             fParseErr->offset = (int32_t)fCharNum;
3909         }
3910 
3911         UErrorCode status = U_ZERO_ERROR; // throwaway status for extracting context
3912 
3913         // Fill in the context.
3914         //   Note: extractBetween() pins supplied indicies to the string bounds.
3915         uprv_memset(fParseErr->preContext,  0, sizeof(fParseErr->preContext));
3916         uprv_memset(fParseErr->postContext, 0, sizeof(fParseErr->postContext));
3917         utext_extract(fRXPat->fPattern, fScanIndex-U_PARSE_CONTEXT_LEN+1, fScanIndex, fParseErr->preContext, U_PARSE_CONTEXT_LEN, &status);
3918         utext_extract(fRXPat->fPattern, fScanIndex, fScanIndex+U_PARSE_CONTEXT_LEN-1, fParseErr->postContext, U_PARSE_CONTEXT_LEN, &status);
3919     }
3920 }
3921 
3922 
3923 //
3924 //  Assorted Unicode character constants.
3925 //     Numeric because there is no portable way to enter them as literals.
3926 //     (Think EBCDIC).
3927 //
3928 static const UChar      chCR        = 0x0d;      // New lines, for terminating comments.
3929 static const UChar      chLF        = 0x0a;      // Line Feed
3930 static const UChar      chPound     = 0x23;      // '#', introduces a comment.
3931 static const UChar      chDigit0    = 0x30;      // '0'
3932 static const UChar      chDigit7    = 0x37;      // '9'
3933 static const UChar      chColon     = 0x3A;      // ':'
3934 static const UChar      chE         = 0x45;      // 'E'
3935 static const UChar      chQ         = 0x51;      // 'Q'
3936 //static const UChar      chN         = 0x4E;      // 'N'
3937 static const UChar      chP         = 0x50;      // 'P'
3938 static const UChar      chBackSlash = 0x5c;      // '\'  introduces a char escape
3939 //static const UChar      chLBracket  = 0x5b;      // '['
3940 static const UChar      chRBracket  = 0x5d;      // ']'
3941 static const UChar      chUp        = 0x5e;      // '^'
3942 static const UChar      chLowerP    = 0x70;
3943 static const UChar      chLBrace    = 0x7b;      // '{'
3944 static const UChar      chRBrace    = 0x7d;      // '}'
3945 static const UChar      chNEL       = 0x85;      //    NEL newline variant
3946 static const UChar      chLS        = 0x2028;    //    Unicode Line Separator
3947 
3948 
3949 //------------------------------------------------------------------------------
3950 //
3951 //  nextCharLL    Low Level Next Char from the regex pattern.
3952 //                Get a char from the string, keep track of input position
3953 //                     for error reporting.
3954 //
3955 //------------------------------------------------------------------------------
nextCharLL()3956 UChar32  RegexCompile::nextCharLL() {
3957     UChar32       ch;
3958 
3959     if (fPeekChar != -1) {
3960         ch = fPeekChar;
3961         fPeekChar = -1;
3962         return ch;
3963     }
3964 
3965     // assume we're already in the right place
3966     ch = UTEXT_NEXT32(fRXPat->fPattern);
3967     if (ch == U_SENTINEL) {
3968         return ch;
3969     }
3970 
3971     if (ch == chCR ||
3972         ch == chNEL ||
3973         ch == chLS   ||
3974         (ch == chLF && fLastChar != chCR)) {
3975         // Character is starting a new line.  Bump up the line number, and
3976         //  reset the column to 0.
3977         fLineNum++;
3978         fCharNum=0;
3979     }
3980     else {
3981         // Character is not starting a new line.  Except in the case of a
3982         //   LF following a CR, increment the column position.
3983         if (ch != chLF) {
3984             fCharNum++;
3985         }
3986     }
3987     fLastChar = ch;
3988     return ch;
3989 }
3990 
3991 //------------------------------------------------------------------------------
3992 //
3993 //   peekCharLL    Low Level Character Scanning, sneak a peek at the next
3994 //                 character without actually getting it.
3995 //
3996 //------------------------------------------------------------------------------
peekCharLL()3997 UChar32  RegexCompile::peekCharLL() {
3998     if (fPeekChar == -1) {
3999         fPeekChar = nextCharLL();
4000     }
4001     return fPeekChar;
4002 }
4003 
4004 
4005 //------------------------------------------------------------------------------
4006 //
4007 //   nextChar     for pattern scanning.  At this level, we handle stripping
4008 //                out comments and processing some backslash character escapes.
4009 //                The rest of the pattern grammar is handled at the next level up.
4010 //
4011 //------------------------------------------------------------------------------
nextChar(RegexPatternChar & c)4012 void RegexCompile::nextChar(RegexPatternChar &c) {
4013   tailRecursion:
4014     fScanIndex = UTEXT_GETNATIVEINDEX(fRXPat->fPattern);
4015     c.fChar    = nextCharLL();
4016     c.fQuoted  = FALSE;
4017 
4018     if (fQuoteMode) {
4019         c.fQuoted = TRUE;
4020         if ((c.fChar==chBackSlash && peekCharLL()==chE && ((fModeFlags & UREGEX_LITERAL) == 0)) ||
4021             c.fChar == (UChar32)-1) {
4022             fQuoteMode = FALSE;  //  Exit quote mode,
4023             nextCharLL();        // discard the E
4024             // nextChar(c);      // recurse to get the real next char
4025             goto tailRecursion;  // Note: fuzz testing produced testcases that
4026                                  //       resulted in stack overflow here.
4027         }
4028     }
4029     else if (fInBackslashQuote) {
4030         // The current character immediately follows a '\'
4031         // Don't check for any further escapes, just return it as-is.
4032         // Don't set c.fQuoted, because that would prevent the state machine from
4033         //    dispatching on the character.
4034         fInBackslashQuote = FALSE;
4035     }
4036     else
4037     {
4038         // We are not in a \Q quoted region \E of the source.
4039         //
4040         if (fModeFlags & UREGEX_COMMENTS) {
4041             //
4042             // We are in free-spacing and comments mode.
4043             //  Scan through any white space and comments, until we
4044             //  reach a significant character or the end of inut.
4045             for (;;) {
4046                 if (c.fChar == (UChar32)-1) {
4047                     break;     // End of Input
4048                 }
4049                 if  (c.fChar == chPound && fEOLComments == TRUE) {
4050                     // Start of a comment.  Consume the rest of it, until EOF or a new line
4051                     for (;;) {
4052                         c.fChar = nextCharLL();
4053                         if (c.fChar == (UChar32)-1 ||  // EOF
4054                             c.fChar == chCR        ||
4055                             c.fChar == chLF        ||
4056                             c.fChar == chNEL       ||
4057                             c.fChar == chLS)       {
4058                             break;
4059                         }
4060                     }
4061                 }
4062                 // TODO:  check what Java & Perl do with non-ASCII white spaces.  Ticket 6061.
4063                 if (PatternProps::isWhiteSpace(c.fChar) == FALSE) {
4064                     break;
4065                 }
4066                 c.fChar = nextCharLL();
4067             }
4068         }
4069 
4070         //
4071         //  check for backslash escaped characters.
4072         //
4073         if (c.fChar == chBackSlash) {
4074             int64_t pos = UTEXT_GETNATIVEINDEX(fRXPat->fPattern);
4075             if (RegexStaticSets::gStaticSets->fUnescapeCharSet.contains(peekCharLL())) {
4076                 //
4077                 // A '\' sequence that is handled by ICU's standard unescapeAt function.
4078                 //   Includes \uxxxx, \n, \r, many others.
4079                 //   Return the single equivalent character.
4080                 //
4081                 nextCharLL();                 // get & discard the peeked char.
4082                 c.fQuoted = TRUE;
4083 
4084                 if (UTEXT_FULL_TEXT_IN_CHUNK(fRXPat->fPattern, fPatternLength)) {
4085                     int32_t endIndex = (int32_t)pos;
4086                     c.fChar = u_unescapeAt(uregex_ucstr_unescape_charAt, &endIndex, (int32_t)fPatternLength, (void *)fRXPat->fPattern->chunkContents);
4087 
4088                     if (endIndex == pos) {
4089                         error(U_REGEX_BAD_ESCAPE_SEQUENCE);
4090                     }
4091                     fCharNum += endIndex - pos;
4092                     UTEXT_SETNATIVEINDEX(fRXPat->fPattern, endIndex);
4093                 } else {
4094                     int32_t offset = 0;
4095                     struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(fRXPat->fPattern);
4096 
4097                     UTEXT_SETNATIVEINDEX(fRXPat->fPattern, pos);
4098                     c.fChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context);
4099 
4100                     if (offset == 0) {
4101                         error(U_REGEX_BAD_ESCAPE_SEQUENCE);
4102                     } else if (context.lastOffset == offset) {
4103                         UTEXT_PREVIOUS32(fRXPat->fPattern);
4104                     } else if (context.lastOffset != offset-1) {
4105                         utext_moveIndex32(fRXPat->fPattern, offset - context.lastOffset - 1);
4106                     }
4107                     fCharNum += offset;
4108                 }
4109             }
4110             else if (peekCharLL() == chDigit0) {
4111                 //  Octal Escape, using Java Regexp Conventions
4112                 //    which are \0 followed by 1-3 octal digits.
4113                 //    Different from ICU Unescape handling of Octal, which does not
4114                 //    require the leading 0.
4115                 //  Java also has the convention of only consuming 2 octal digits if
4116                 //    the three digit number would be > 0xff
4117                 //
4118                 c.fChar = 0;
4119                 nextCharLL();    // Consume the initial 0.
4120                 int index;
4121                 for (index=0; index<3; index++) {
4122                     int32_t ch = peekCharLL();
4123                     if (ch<chDigit0 || ch>chDigit7) {
4124                         if (index==0) {
4125                            // \0 is not followed by any octal digits.
4126                            error(U_REGEX_BAD_ESCAPE_SEQUENCE);
4127                         }
4128                         break;
4129                     }
4130                     c.fChar <<= 3;
4131                     c.fChar += ch&7;
4132                     if (c.fChar <= 255) {
4133                         nextCharLL();
4134                     } else {
4135                         // The last digit made the number too big.  Forget we saw it.
4136                         c.fChar >>= 3;
4137                     }
4138                 }
4139                 c.fQuoted = TRUE;
4140             }
4141             else if (peekCharLL() == chQ) {
4142                 //  "\Q"  enter quote mode, which will continue until "\E"
4143                 fQuoteMode = TRUE;
4144                 nextCharLL();        // discard the 'Q'.
4145                 // nextChar(c);      // recurse to get the real next char.
4146                 goto tailRecursion;  // Note: fuzz testing produced test cases that
4147                 //                            resulted in stack overflow here.
4148             }
4149             else
4150             {
4151                 // We are in a '\' escape that will be handled by the state table scanner.
4152                 // Just return the backslash, but remember that the following char is to
4153                 //  be taken literally.
4154                 fInBackslashQuote = TRUE;
4155             }
4156         }
4157     }
4158 
4159     // re-enable # to end-of-line comments, in case they were disabled.
4160     // They are disabled by the parser upon seeing '(?', but this lasts for
4161     //  the fetching of the next character only.
4162     fEOLComments = TRUE;
4163 
4164     // putc(c.fChar, stdout);
4165 }
4166 
4167 
4168 
4169 //------------------------------------------------------------------------------
4170 //
4171 //  scanNamedChar
4172 //            Get a UChar32 from a \N{UNICODE CHARACTER NAME} in the pattern.
4173 //
4174 //             The scan position will be at the 'N'.  On return
4175 //             the scan position should be just after the '}'
4176 //
4177 //             Return the UChar32
4178 //
4179 //------------------------------------------------------------------------------
scanNamedChar()4180 UChar32  RegexCompile::scanNamedChar() {
4181     if (U_FAILURE(*fStatus)) {
4182         return 0;
4183     }
4184 
4185     nextChar(fC);
4186     if (fC.fChar != chLBrace) {
4187         error(U_REGEX_PROPERTY_SYNTAX);
4188         return 0;
4189     }
4190 
4191     UnicodeString  charName;
4192     for (;;) {
4193         nextChar(fC);
4194         if (fC.fChar == chRBrace) {
4195             break;
4196         }
4197         if (fC.fChar == -1) {
4198             error(U_REGEX_PROPERTY_SYNTAX);
4199             return 0;
4200         }
4201         charName.append(fC.fChar);
4202     }
4203 
4204     char name[100];
4205     if (!uprv_isInvariantUString(charName.getBuffer(), charName.length()) ||
4206          (uint32_t)charName.length()>=sizeof(name)) {
4207         // All Unicode character names have only invariant characters.
4208         // The API to get a character, given a name, accepts only char *, forcing us to convert,
4209         //   which requires this error check
4210         error(U_REGEX_PROPERTY_SYNTAX);
4211         return 0;
4212     }
4213     charName.extract(0, charName.length(), name, sizeof(name), US_INV);
4214 
4215     UChar32  theChar = u_charFromName(U_UNICODE_CHAR_NAME, name, fStatus);
4216     if (U_FAILURE(*fStatus)) {
4217         error(U_REGEX_PROPERTY_SYNTAX);
4218     }
4219 
4220     nextChar(fC);      // Continue overall regex pattern processing with char after the '}'
4221     return theChar;
4222 }
4223 
4224 //------------------------------------------------------------------------------
4225 //
4226 //  scanProp   Construct a UnicodeSet from the text at the current scan
4227 //             position, which will be of the form \p{whaterver}
4228 //
4229 //             The scan position will be at the 'p' or 'P'.  On return
4230 //             the scan position should be just after the '}'
4231 //
4232 //             Return a UnicodeSet, constructed from the \P pattern,
4233 //             or NULL if the pattern is invalid.
4234 //
4235 //------------------------------------------------------------------------------
scanProp()4236 UnicodeSet *RegexCompile::scanProp() {
4237     UnicodeSet    *uset = NULL;
4238 
4239     if (U_FAILURE(*fStatus)) {
4240         return NULL;
4241     }
4242     (void)chLowerP;   // Suppress compiler unused variable warning.
4243     U_ASSERT(fC.fChar == chLowerP || fC.fChar == chP);
4244     UBool negated = (fC.fChar == chP);
4245 
4246     UnicodeString propertyName;
4247     nextChar(fC);
4248     if (fC.fChar != chLBrace) {
4249         error(U_REGEX_PROPERTY_SYNTAX);
4250         return NULL;
4251     }
4252     for (;;) {
4253         nextChar(fC);
4254         if (fC.fChar == chRBrace) {
4255             break;
4256         }
4257         if (fC.fChar == -1) {
4258             // Hit the end of the input string without finding the closing '}'
4259             error(U_REGEX_PROPERTY_SYNTAX);
4260             return NULL;
4261         }
4262         propertyName.append(fC.fChar);
4263     }
4264     uset = createSetForProperty(propertyName, negated);
4265     nextChar(fC);    // Move input scan to position following the closing '}'
4266     return uset;
4267 }
4268 
4269 //------------------------------------------------------------------------------
4270 //
4271 //  scanPosixProp   Construct a UnicodeSet from the text at the current scan
4272 //             position, which is expected be of the form [:property expression:]
4273 //
4274 //             The scan position will be at the opening ':'.  On return
4275 //             the scan position must be on the closing ']'
4276 //
4277 //             Return a UnicodeSet constructed from the pattern,
4278 //             or NULL if this is not a valid POSIX-style set expression.
4279 //             If not a property expression, restore the initial scan position
4280 //                (to the opening ':')
4281 //
4282 //               Note:  the opening '[:' is not sufficient to guarantee that
4283 //                      this is a [:property:] expression.
4284 //                      [:'+=,] is a perfectly good ordinary set expression that
4285 //                              happens to include ':' as one of its characters.
4286 //
4287 //------------------------------------------------------------------------------
scanPosixProp()4288 UnicodeSet *RegexCompile::scanPosixProp() {
4289     UnicodeSet    *uset = NULL;
4290 
4291     if (U_FAILURE(*fStatus)) {
4292         return NULL;
4293     }
4294 
4295     U_ASSERT(fC.fChar == chColon);
4296 
4297     // Save the scanner state.
4298     // TODO:  move this into the scanner, with the state encapsulated in some way.  Ticket 6062
4299     int64_t     savedScanIndex        = fScanIndex;
4300     int64_t     savedNextIndex        = UTEXT_GETNATIVEINDEX(fRXPat->fPattern);
4301     UBool       savedQuoteMode        = fQuoteMode;
4302     UBool       savedInBackslashQuote = fInBackslashQuote;
4303     UBool       savedEOLComments      = fEOLComments;
4304     int64_t     savedLineNum          = fLineNum;
4305     int64_t     savedCharNum          = fCharNum;
4306     UChar32     savedLastChar         = fLastChar;
4307     UChar32     savedPeekChar         = fPeekChar;
4308     RegexPatternChar savedfC          = fC;
4309 
4310     // Scan for a closing ].   A little tricky because there are some perverse
4311     //   edge cases possible.  "[:abc\Qdef:] \E]"  is a valid non-property expression,
4312     //   ending on the second closing ].
4313 
4314     UnicodeString propName;
4315     UBool         negated  = FALSE;
4316 
4317     // Check for and consume the '^' in a negated POSIX property, e.g.  [:^Letter:]
4318     nextChar(fC);
4319     if (fC.fChar == chUp) {
4320        negated = TRUE;
4321        nextChar(fC);
4322     }
4323 
4324     // Scan for the closing ":]", collecting the property name along the way.
4325     UBool  sawPropSetTerminator = FALSE;
4326     for (;;) {
4327         propName.append(fC.fChar);
4328         nextChar(fC);
4329         if (fC.fQuoted || fC.fChar == -1) {
4330             // Escaped characters or end of input - either says this isn't a [:Property:]
4331             break;
4332         }
4333         if (fC.fChar == chColon) {
4334             nextChar(fC);
4335             if (fC.fChar == chRBracket) {
4336                 sawPropSetTerminator = TRUE;
4337             }
4338             break;
4339         }
4340     }
4341 
4342     if (sawPropSetTerminator) {
4343         uset = createSetForProperty(propName, negated);
4344     }
4345     else
4346     {
4347         // No closing ":]".
4348         //  Restore the original scan position.
4349         //  The main scanner will retry the input as a normal set expression,
4350         //    not a [:Property:] expression.
4351         fScanIndex        = savedScanIndex;
4352         fQuoteMode        = savedQuoteMode;
4353         fInBackslashQuote = savedInBackslashQuote;
4354         fEOLComments      = savedEOLComments;
4355         fLineNum          = savedLineNum;
4356         fCharNum          = savedCharNum;
4357         fLastChar         = savedLastChar;
4358         fPeekChar         = savedPeekChar;
4359         fC                = savedfC;
4360         UTEXT_SETNATIVEINDEX(fRXPat->fPattern, savedNextIndex);
4361     }
4362     return uset;
4363 }
4364 
addIdentifierIgnorable(UnicodeSet * set,UErrorCode & ec)4365 static inline void addIdentifierIgnorable(UnicodeSet *set, UErrorCode& ec) {
4366     set->add(0, 8).add(0x0e, 0x1b).add(0x7f, 0x9f);
4367     addCategory(set, U_GC_CF_MASK, ec);
4368 }
4369 
4370 //
4371 //  Create a Unicode Set from a Unicode Property expression.
4372 //     This is common code underlying both \p{...} ane [:...:] expressions.
4373 //     Includes trying the Java "properties" that aren't supported as
4374 //     normal ICU UnicodeSet properties
4375 //
createSetForProperty(const UnicodeString & propName,UBool negated)4376 UnicodeSet *RegexCompile::createSetForProperty(const UnicodeString &propName, UBool negated) {
4377 
4378     if (U_FAILURE(*fStatus)) {
4379         return nullptr;
4380     }
4381     LocalPointer<UnicodeSet> set;
4382     UErrorCode status = U_ZERO_ERROR;
4383 
4384     do {      // non-loop, exists to allow breaks from the block.
4385         //
4386         //  First try the property as we received it
4387         //
4388         UnicodeString   setExpr;
4389         uint32_t        usetFlags = 0;
4390         setExpr.append(u"[\\p{", -1);
4391         setExpr.append(propName);
4392         setExpr.append(u"}]", -1);
4393         if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
4394             usetFlags |= USET_CASE_INSENSITIVE;
4395         }
4396         set.adoptInsteadAndCheckErrorCode(new UnicodeSet(setExpr, usetFlags, NULL, status), status);
4397         if (U_SUCCESS(status) || status == U_MEMORY_ALLOCATION_ERROR) {
4398             break;
4399         }
4400 
4401         //
4402         //  The incoming property wasn't directly recognized by ICU.
4403 
4404         //  Check [:word:] and [:all:]. These are not recognized as a properties by ICU UnicodeSet.
4405         //     Java accepts 'word' with mixed case.
4406         //     Java accepts 'all' only in all lower case.
4407 
4408         status = U_ZERO_ERROR;
4409         if (propName.caseCompare(u"word", -1, 0) == 0) {
4410             set.adoptInsteadAndCheckErrorCode(new UnicodeSet(*(fRXPat->fStaticSets[URX_ISWORD_SET])), status);
4411             break;
4412         }
4413         if (propName.compare(u"all", -1) == 0) {
4414             set.adoptInsteadAndCheckErrorCode(new UnicodeSet(0, 0x10ffff), status);
4415             break;
4416         }
4417 
4418 
4419         //    Do Java InBlock expressions
4420         //
4421         UnicodeString mPropName = propName;
4422         if (mPropName.startsWith(u"In", 2) && mPropName.length() >= 3) {
4423             status = U_ZERO_ERROR;
4424             set.adoptInsteadAndCheckErrorCode(new UnicodeSet(), status);
4425             if (U_FAILURE(status)) {
4426                 break;
4427             }
4428             UnicodeString blockName(mPropName, 2);  // Property with the leading "In" removed.
4429             set->applyPropertyAlias(UnicodeString(u"Block"), blockName, status);
4430             break;
4431         }
4432 
4433         //  Check for the Java form "IsBooleanPropertyValue", which we will recast
4434         //  as "BooleanPropertyValue". The property value can be either a
4435         //  a General Category or a Script Name.
4436 
4437         if (propName.startsWith(u"Is", 2) && propName.length()>=3) {
4438             mPropName.remove(0, 2);      // Strip the "Is"
4439             if (mPropName.indexOf(u'=') >= 0) {
4440                 // Reject any "Is..." property expression containing an '=', that is,
4441                 // any non-binary property expression.
4442                 status = U_REGEX_PROPERTY_SYNTAX;
4443                 break;
4444             }
4445 
4446             if (mPropName.caseCompare(u"assigned", -1, 0) == 0) {
4447                 mPropName.setTo(u"unassigned", -1);
4448                 negated = !negated;
4449             } else if (mPropName.caseCompare(u"TitleCase", -1, 0) == 0) {
4450                 mPropName.setTo(u"Titlecase_Letter", -1);
4451             }
4452 
4453             mPropName.insert(0, u"[\\p{", -1);
4454             mPropName.append(u"}]", -1);
4455             set.adoptInsteadAndCheckErrorCode(new UnicodeSet(mPropName, *fStatus), status);
4456 
4457             if (U_SUCCESS(status) && !set->isEmpty() && (usetFlags & USET_CASE_INSENSITIVE)) {
4458                 set->closeOver(USET_CASE_INSENSITIVE);
4459             }
4460             break;
4461 
4462         }
4463 
4464         if (propName.startsWith(u"java", -1)) {
4465             status = U_ZERO_ERROR;
4466             set.adoptInsteadAndCheckErrorCode(new UnicodeSet(), status);
4467             if (U_FAILURE(status)) {
4468                 break;
4469             }
4470             //
4471             //  Try the various Java specific properties.
4472             //   These all begin with "java"
4473             //
4474             if (propName.compare(u"javaDefined", -1) == 0) {
4475                 addCategory(set.getAlias(), U_GC_CN_MASK, status);
4476                 set->complement();
4477             }
4478             else if (propName.compare(u"javaDigit", -1) == 0) {
4479                 addCategory(set.getAlias(), U_GC_ND_MASK, status);
4480             }
4481             else if (propName.compare(u"javaIdentifierIgnorable", -1) == 0) {
4482                 addIdentifierIgnorable(set.getAlias(), status);
4483             }
4484             else if (propName.compare(u"javaISOControl", -1) == 0) {
4485                 set->add(0, 0x1F).add(0x7F, 0x9F);
4486             }
4487             else if (propName.compare(u"javaJavaIdentifierPart", -1) == 0) {
4488                 addCategory(set.getAlias(), U_GC_L_MASK, status);
4489                 addCategory(set.getAlias(), U_GC_SC_MASK, status);
4490                 addCategory(set.getAlias(), U_GC_PC_MASK, status);
4491                 addCategory(set.getAlias(), U_GC_ND_MASK, status);
4492                 addCategory(set.getAlias(), U_GC_NL_MASK, status);
4493                 addCategory(set.getAlias(), U_GC_MC_MASK, status);
4494                 addCategory(set.getAlias(), U_GC_MN_MASK, status);
4495                 addIdentifierIgnorable(set.getAlias(), status);
4496             }
4497             else if (propName.compare(u"javaJavaIdentifierStart", -1) == 0) {
4498                 addCategory(set.getAlias(), U_GC_L_MASK, status);
4499                 addCategory(set.getAlias(), U_GC_NL_MASK, status);
4500                 addCategory(set.getAlias(), U_GC_SC_MASK, status);
4501                 addCategory(set.getAlias(), U_GC_PC_MASK, status);
4502             }
4503             else if (propName.compare(u"javaLetter", -1) == 0) {
4504                 addCategory(set.getAlias(), U_GC_L_MASK, status);
4505             }
4506             else if (propName.compare(u"javaLetterOrDigit", -1) == 0) {
4507                 addCategory(set.getAlias(), U_GC_L_MASK, status);
4508                 addCategory(set.getAlias(), U_GC_ND_MASK, status);
4509             }
4510             else if (propName.compare(u"javaLowerCase", -1) == 0) {
4511                 addCategory(set.getAlias(), U_GC_LL_MASK, status);
4512             }
4513             else if (propName.compare(u"javaMirrored", -1) == 0) {
4514                 set->applyIntPropertyValue(UCHAR_BIDI_MIRRORED, 1, status);
4515             }
4516             else if (propName.compare(u"javaSpaceChar", -1) == 0) {
4517                 addCategory(set.getAlias(), U_GC_Z_MASK, status);
4518             }
4519             else if (propName.compare(u"javaSupplementaryCodePoint", -1) == 0) {
4520                 set->add(0x10000, UnicodeSet::MAX_VALUE);
4521             }
4522             else if (propName.compare(u"javaTitleCase", -1) == 0) {
4523                 addCategory(set.getAlias(), U_GC_LT_MASK, status);
4524             }
4525             else if (propName.compare(u"javaUnicodeIdentifierStart", -1) == 0) {
4526                 addCategory(set.getAlias(), U_GC_L_MASK, status);
4527                 addCategory(set.getAlias(), U_GC_NL_MASK, status);
4528             }
4529             else if (propName.compare(u"javaUnicodeIdentifierPart", -1) == 0) {
4530                 addCategory(set.getAlias(), U_GC_L_MASK, status);
4531                 addCategory(set.getAlias(), U_GC_PC_MASK, status);
4532                 addCategory(set.getAlias(), U_GC_ND_MASK, status);
4533                 addCategory(set.getAlias(), U_GC_NL_MASK, status);
4534                 addCategory(set.getAlias(), U_GC_MC_MASK, status);
4535                 addCategory(set.getAlias(), U_GC_MN_MASK, status);
4536                 addIdentifierIgnorable(set.getAlias(), status);
4537             }
4538             else if (propName.compare(u"javaUpperCase", -1) == 0) {
4539                 addCategory(set.getAlias(), U_GC_LU_MASK, status);
4540             }
4541             else if (propName.compare(u"javaValidCodePoint", -1) == 0) {
4542                 set->add(0, UnicodeSet::MAX_VALUE);
4543             }
4544             else if (propName.compare(u"javaWhitespace", -1) == 0) {
4545                 addCategory(set.getAlias(), U_GC_Z_MASK, status);
4546                 set->removeAll(UnicodeSet().add(0xa0).add(0x2007).add(0x202f));
4547                 set->add(9, 0x0d).add(0x1c, 0x1f);
4548             } else {
4549                 status = U_REGEX_PROPERTY_SYNTAX;
4550             }
4551 
4552             if (U_SUCCESS(status) && !set->isEmpty() && (usetFlags & USET_CASE_INSENSITIVE)) {
4553                 set->closeOver(USET_CASE_INSENSITIVE);
4554             }
4555             break;
4556         }
4557 
4558         // Unrecognized property. ICU didn't like it as it was, and none of the Java compatibility
4559         // extensions matched it.
4560         status = U_REGEX_PROPERTY_SYNTAX;
4561     } while (false);   // End of do loop block. Code above breaks out of the block on success or hard failure.
4562 
4563     if (U_SUCCESS(status)) {
4564         U_ASSERT(set.isValid());
4565         if (negated) {
4566             set->complement();
4567         }
4568         return set.orphan();
4569     } else {
4570         if (status == U_ILLEGAL_ARGUMENT_ERROR) {
4571             status = U_REGEX_PROPERTY_SYNTAX;
4572         }
4573         error(status);
4574         return nullptr;
4575     }
4576 }
4577 
4578 
4579 //
4580 //  SetEval   Part of the evaluation of [set expressions].
4581 //            Perform any pending (stacked) operations with precedence
4582 //            equal or greater to that of the next operator encountered
4583 //            in the expression.
4584 //
setEval(int32_t nextOp)4585 void RegexCompile::setEval(int32_t nextOp) {
4586     UnicodeSet *rightOperand = NULL;
4587     UnicodeSet *leftOperand  = NULL;
4588     for (;;) {
4589         U_ASSERT(fSetOpStack.empty()==FALSE);
4590         int32_t pendingSetOperation = fSetOpStack.peeki();
4591         if ((pendingSetOperation&0xffff0000) < (nextOp&0xffff0000)) {
4592             break;
4593         }
4594         fSetOpStack.popi();
4595         U_ASSERT(fSetStack.empty() == FALSE);
4596         rightOperand = (UnicodeSet *)fSetStack.peek();
4597         switch (pendingSetOperation) {
4598             case setNegation:
4599                 rightOperand->complement();
4600                 break;
4601             case setCaseClose:
4602                 // TODO: need a simple close function.  Ticket 6065
4603                 rightOperand->closeOver(USET_CASE_INSENSITIVE);
4604                 rightOperand->removeAllStrings();
4605                 break;
4606             case setDifference1:
4607             case setDifference2:
4608                 fSetStack.pop();
4609                 leftOperand = (UnicodeSet *)fSetStack.peek();
4610                 leftOperand->removeAll(*rightOperand);
4611                 delete rightOperand;
4612                 break;
4613             case setIntersection1:
4614             case setIntersection2:
4615                 fSetStack.pop();
4616                 leftOperand = (UnicodeSet *)fSetStack.peek();
4617                 leftOperand->retainAll(*rightOperand);
4618                 delete rightOperand;
4619                 break;
4620             case setUnion:
4621                 fSetStack.pop();
4622                 leftOperand = (UnicodeSet *)fSetStack.peek();
4623                 leftOperand->addAll(*rightOperand);
4624                 delete rightOperand;
4625                 break;
4626             default:
4627                 UPRV_UNREACHABLE;
4628             }
4629         }
4630     }
4631 
setPushOp(int32_t op)4632 void RegexCompile::setPushOp(int32_t op) {
4633     setEval(op);
4634     fSetOpStack.push(op, *fStatus);
4635     fSetStack.push(new UnicodeSet(), *fStatus);
4636 }
4637 
4638 U_NAMESPACE_END
4639 #endif  // !UCONFIG_NO_REGULAR_EXPRESSIONS
4640